def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    tknzr = TweetTokenizer()
    tagger = PerceptronTagger()
    fout = 'embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname)
    fname, delimiter, ndim = ('embeddings/smiley_tweets_embedding_{}'.format(input_fname), ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    tagdict = tagger.tagdict
    tagidx = {}
    nRows = len(word2vec)
    nCols = len(tagdict)
    print nRows, ':', nCols

    counter = 0
    for tag in tagdict.keys():
        tagidx[tag] = counter
        counter += 1

    exp_wemb = {}
    for word in word2vec.keys():
        exp_wemb[word] = np.zeros(nCols)

    print tagidx

    train = "semeval/task-B-train-plus-dev.tsv.gz"
    test = "semeval/task-B-test2014-twitter.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    test15 = "semeval/task-B-test2015-twitter.tsv.gz"
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    it = 0
    files = [train, test, dev, test15, smiley_pos]
    for filen in files:
        for tweet in gzip.open(filen, 'rb'):
            tweet = tknzr.tokenize(tweet.decode('utf-8'))
            tags = _pos_tag(tweet, None, tagger)
            for (word, tag) in tags:
                if word in exp_wemb and tag in tagidx:
                    idx = tagidx[tag]
                    exp_wemb[word][idx] = 1
            if (it % 10) == 0:
                print 'Progress:', it
            it += 1

    f = open(fout, 'wb')
    for word in exp_wemb:
        f.write(word)
        tags = exp_wemb[word]
        for i in np.nditer(tags):
            f.write(' {}'.format(i))
        f.write("\n")
def main():
    HOME_DIR = "semeval_parsed"
    np.random.seed(123)
    input_fname = '200M'
    embedding = 'custom'
    type = '200M'
    ndim = 52

    data_dir = HOME_DIR + '_' + input_fname
    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))

    tknr = TweetTokenizer()
    alphabet = cPickle.load(open(fname_vocab))
    words = alphabet.keys()
    tok_words = {}
    words = []
    for word, idx in alphabet.iteritems():
        tok_word = tknr.tokenize(word.decode('utf-8'))
        tok_words[idx] = tok_word
        words.extend(tok_word)
    print len(tok_words)
    print len(words)
    print "Vocab size", len(alphabet)

    fname, delimiter, ndim = ('embeddings/updated_embeddings_custom_200M'.format(type, str(ndim)), ' ', ndim)
    word2vec = load_glove_vec(fname, words, delimiter, ndim)
    print 'len', len(word2vec)
    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')
    for idx, tok_word in tok_words.iteritems():
        isrand = 1
        word_vec = np.zeros(ndim)
        for tok in tok_word:
            if tok in word2vec:
                word_vec += word2vec[tok]
                isrand = 0
        if isrand:
            word_vec = np.random.uniform(-0.25, 0.25, ndim)
            random_words_count += 1
        vocab_emb[idx] = word_vec.astype(np.float32) / len(tok_word)

    print "Using zero vector as random"
    print 'random_words_count', random_words_count

    svd = TruncatedSVD(n_components=5)
    vocab_emb = svd.fit_transform(vocab_emb).astype(np.float32)
    print vocab_emb.shape

    fname = 'embeddings/smiley_tweets_embedding_{}'.format('topic')
    outfile = os.path.join(data_dir, 'emb_{}.npy'.format(os.path.basename(fname)))
    print outfile
    np.save(outfile, vocab_emb)
def cosineSimilarity(text_a, text_b):
    # Tokenize sentences
    tknzr = TweetTokenizer()
    word_list_a = tknzr.tokenize(text_a)
    word_list_b = tknzr.tokenize(text_b)

    keys = list(set(word_list_a + word_list_b))
    vector_size = len(keys)
    vector_a = [0] * vector_size
    vector_b = [0] * vector_size

    for i in range(vector_size):
        vector_a[i] = word_list_a.count(keys[i])
        vector_b[i] = word_list_b.count(keys[i])

    return dot(vector_a, vector_b) / (norm(vector_a) * norm(vector_b))
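# Hedged usage sketch for cosineSimilarity above, assuming `dot` and `norm` come
# from numpy as imported here (the original snippet's imports are not shown).
from numpy import dot
from numpy.linalg import norm
from nltk.tokenize import TweetTokenizer

score = cosineSimilarity("I love this movie", "I really love this film")
print(score)  # bag-of-words cosine in [0, 1]; identical texts give 1.0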
def main():
    # x, y = load_dataset("datasets/sentiment_uci/yelp_labelled.txt")
    x, y = load_datasets(["../datasets/sentiment_uci/yelp_labelled.txt"])

    stopwords = set()
    with open('../stopwords.txt', 'r') as f:
        for w in f:
            stopwords.add(w.strip())

    tok = TweetTokenizer()
    stemmer = EnglishStemmer()
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 use_idf=True,
                                 binary=True,
                                 preprocessor=stemmer.stem,
                                 tokenizer=tok.tokenize,
                                 ngram_range=(1, 2))

    accu_p = np.zeros(shape=(2,))
    accu_r = np.zeros(shape=(2,))
    accu_f = np.zeros(shape=(2,))
    accu_a = 0.0
    folds = 10

    for train_idx, test_idx in StratifiedKFold(y=y, n_folds=folds, shuffle=True):
        train_x, train_y = x[train_idx], y[train_idx]
        test_x, test_y = x[test_idx], y[test_idx]

        cls = tree.DecisionTreeClassifier()

        # train
        train_x = vectorizer.fit_transform(train_x).toarray()
        cls.fit(train_x, train_y)

        # test
        test_x = vectorizer.transform(test_x).toarray()
        pred_y = cls.predict(test_x)

        # evaluate
        p, r, f, _ = precision_recall_fscore_support(test_y, pred_y)
        a = accuracy_score(test_y, pred_y)
        accu_p += p
        accu_r += r
        accu_f += f
        accu_a += a

        print("Evaluating classifier:")
        print("\tAccuracy: {}".format(a))
        print("\tPrecision[0]: {}".format(p[0]))
        print("\tPrecision[1]: {}".format(p[1]))
        print("\tRecall[0]: {}".format(r[0]))
        print("\tRecall[1]: {}".format(r[1]))
        print("\tF1-score[0]: {}".format(f[0]))
        print("\tF1-score[1]: {}".format(f[1]))

    print("Average evaluation")
    print("\tAccuracy: {}".format(accu_a / folds))
    print("\tPrecision[0]: {}".format(accu_p[0] / folds))
    print("\tPrecision[1]: {}".format(accu_p[1] / folds))
    print("\tRecall[0]: {}".format(accu_r[0] / folds))
    print("\tRecall[1]: {}".format(accu_r[1] / folds))
    print("\tF1-score[0]: {}".format(accu_f[0] / folds))
    print("\tF1-score[1]: {}".format(accu_f[1] / folds))
def tokenize_with(kwargs):
    tokenizer = TweetTokenizer(**kwargs)

    def tweet_tokenizer(data):
        return [' '.join(tokenizer.tokenize(tweet)) for tweet in data]

    return tweet_tokenizer
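# Minimal usage sketch for tokenize_with above; the kwargs shown are standard
# TweetTokenizer options (assumed setup, not from the original source).
from nltk.tokenize import TweetTokenizer

tweet_tok = tokenize_with({'preserve_case': False, 'reduce_len': True})
print(tweet_tok(["Sooooo HAPPY today!!! :D"]))
# e.g. ['sooo happy today ! ! ! :D'] - repeated letters reduced, case lowered, emoticon kept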
class NltkTweetTokenizer(Tokenizer):

    def __init__(self) -> None:
        super().__init__()
        self._base_tokenizer = TweetTokenizer()

    def tokenize_text(self, text: str) -> List[str]:
        return self._base_tokenizer.tokenize(text)
class SpaceSeparatedWordsMixIn(AbstractLanguage, metaclass=abc.ABCMeta):
    """Language in which words are separated by spaces."""

    def __init__(self):
        super().__init__()
        self.__tokenizer = TweetTokenizer(preserve_case=False)

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Splits a sentence into words using spaces (for Latin languages)."""
        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence is None.")
            return []

        # Normalize apostrophe so that "it’s" and "it's" get treated identically
        sentence = sentence.replace("’", "'")

        tokens = self.__tokenizer.tokenize(text=sentence)

        def is_word(token_: str) -> bool:
            """Returns True if token looks like a word."""
            if re.match(pattern=r'\w', string=token_, flags=re.UNICODE):
                return True
            else:
                return False

        # TweetTokenizer leaves punctuation in-place
        tokens = [token for token in tokens if is_word(token)]

        return tokens
class Tokeniser(BaseEstimator, TransformerMixin):

    def __init__(self, return_flags=False):
        self.tokeniser = TweetTokenizer()
        self.return_flags = return_flags

    def fit(self, *args, **kwargs):
        return self

    def tokenise(self, sequence):
        flag = ""
        ix = 0
        tokens, positions = [], []
        for t in self.tokeniser.tokenize(sequence):
            ix = sequence.find(t, ix)
            if len(t) == 1 and ord(t) >= 127462:  # this is the code for 🇦
                if not self.return_flags:
                    continue
                if flag:
                    tokens.append(flag + t)
                    positions.append(ix - 1)
                    flag = ""
                else:
                    flag = t
            else:
                tokens.append(t)
                positions.append(ix)
            ix += 1
        return tokens, positions

    def transform(self, x, y=None):
        return [self.tokenise(sequence) for sequence in x]
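# Hedged usage sketch for the Tokeniser transformer above (plain-text input only;
# the flag-emoji branch only triggers with return_flags=True and regional-indicator tokens).
tok = Tokeniser()
tokens, positions = tok.tokenise("Hello @world :)")
print(tokens)     # e.g. ['Hello', '@world', ':)']
print(positions)  # character offset of each token in the input string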
class MySentences(object):

    def __init__(self, files):
        self.files = files
        self.tknzr = TweetTokenizer()

    def max_reached(self, language_tags):
        all_max = True
        for lang in max_for_lang.keys():
            for sent in ['positive', 'negative']:
                tag = '{}_{}'.format(lang, sent)
                curr_is_max = language_tags[tag] >= max_for_lang[lang]
                all_max &= curr_is_max
        return all_max

    def __iter__(self):
        language_tags = defaultdict(lambda: 0)
        for fname in self.files:
            for line in open(fname, 'rb'):
                if self.max_reached(language_tags):
                    return
                splits = line.split('\t')
                lang_tag = splits[0].strip()
                sent_tag = splits[4].strip()
                tag = '{}_{}'.format(lang_tag, sent_tag)
                if language_tags[tag] < max_for_lang[lang_tag]:
                    language_tags[tag] += 1
                    tweet = line.split('\t')[-1]
                    tweet = preprocess_tweet(tweet)
                    tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                    yield filter(lambda word: ' ' not in word, tweet)
def boolenModel(self, freq, onlyfiles):
    self.comboBox_4.clear()
    self.comboBox_4.addItem(' ')
    requete = self.plainTextEdit_2.toPlainText()
    requete = requete.lower()
    req = TweetTokenizer().tokenize(requete)
    for file in onlyfiles:
        reqtemp = []
        for mot in req:
            mot = mot.lower()
            if mot in ['and', 'or', '(', ')', 'not']:
                reqtemp.append(mot)
                reqtemp.append(' ')
            else:
                listfile = self.indexmotSimple(mot)
                if file in listfile:
                    reqtemp.append('1')
                    reqtemp.append(' ')
                else:
                    reqtemp.append('0')
                    reqtemp.append(' ')
        evaluation = eval(''.join(reqtemp))
        if evaluation == 1:
            self.comboBox_4.addItem(file)
class PartsOfSpeechExtractor(BaseEstimator, TransformerMixin):

    IGNORE_TAGS = ['PUNCT', 'CCONJ']

    _vectorizer = None
    _tokenizer = TweetTokenizer(reduce_len=True)
    _pos_helper = PartsOfSpeechHelper()

    def __init__(self):
        pass

    def transform(self, data, y=None):
        result = []
        for tweet in data:
            result.append(self.pos_tag(tweet))

        if self._vectorizer is None:
            self._vectorizer = DictVectorizer(sparse=False)
            self._vectorizer.fit(result)

        return self._vectorizer.transform(result)

    def pos_tag(self, tweet):
        tokens = self._tokenizer.tokenize(tweet)
        pos_tweet = self._pos_helper.pos_tag(tokens)
        return Counter([t for w, t in pos_tweet if t not in self.IGNORE_TAGS])

    def fit(self, df, y=None):
        return self
def run(dataset, hyperparameters, metrics, fname=None):
    # # Load Resources
    word2vec = None
    if hyperparameters['model'] != 'rand':
        word2vec = load_word2vec()

    # # Load Dataset
    df = load_dataset(dataset[0], **dataset[1])

    # # Preprocess
    df['clean_tweets'] = df.tweet.apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)

    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df.tokens, df.label)

    # # Train
    clf = NeuralNetClassifier(module=TextCNN,
                              corpus=df.tokens,
                              word_vectors=word2vec,
                              metrics=metrics,
                              **hyperparameters)
    clf.fit(X_train, y_train, validation_data=(X_dev, y_dev))

    # # Predict
    y_pred = clf.predict(X_test)

    # # Evaluate
    pprint(
        dict(dataset=dataset,
             hyperparameters=hyperparameters,
             scores={
                 scorer: get_score_func(scorer)(y_test, y_pred)
                 for scorer in metrics
             }))

    # # Save to file
    X_test['pred'] = y_pred
    X_test.to_excel(scratch_path('predictions_%s.xlsx' % fname))
def word2vec_feature_from_tweets(self, glove_input_file, embedd_dim, name='default'):
    # --- load saved features if they exist ---
    features_path = 'saved_objects/features/train/embedding_features-' + name + '.pkl'
    if os.path.exists(features_path):
        file = open(features_path, 'rb')
        return pickle.load(file)

    # --- otherwise generate embedding features ---
    word2vec = KeyedVectors.load_word2vec_format(glove_input_file,
                                                 unicode_errors='ignore',
                                                 binary=False)

    # get tf-idf weight for each word required in embedding features
    _, _, tfidf_scores = self.tfidf_from_tweets()
    tfidf = dict(zip(tfidf_scores.get_feature_names(), tfidf_scores.idf_))

    # --- weighted-average tweet2vec ---
    def build_average_Word2vec(tokens, size):
        vec = np.zeros(size)
        count = 0.
        for word in tokens:
            try:
                vec += word2vec[word] * tfidf[word]
                count += 1.
            except KeyError:
                continue
        if count != 0:
            vec /= count
        return vec

    tokenizer = TweetTokenizer()
    embedd_table = {}
    for _, row in self.norm_df.iterrows():  # self.norm_test_df.iterrows()
        tweet2vec = build_average_Word2vec(tokenizer.tokenize(row['norm_tweets']),
                                           size=embedd_dim)
        embedd_table[row['tweet_id']] = tweet2vec

    # --- save embedding features to disk ---
    file = open(features_path, 'wb')
    pickle.dump(embedd_table, file)
    file.close()
    return embedd_table
def preprocess_tweet(self, tweet):
    """Pre-process a tweet and/or profile description.

    The following pre-processing operations are done on the text:
    - Replace emojis like: "Python is :thumbs_up:"
    - Replace repeated character sequences of length 3 or greater with sequences of length 3
    - Lowercase
    - Replace all URLs and username mentions with the following tags:
        URL         <URLURL>
        @Username   <UsernameMention>

    Args:
        tweet: String

    Returns:
        The pre-processed tweet as String

    IMPROVEMENTS TO MAKE:
    - Instead of tokenizing and detokenizing, which is messy, the strings should be
      directly replaced using regex.
    """
    replaced_urls = []  # Create an empty list
    replaced_mentions = []  # Create an empty list

    # Replace emojis
    tweet = emoji.demojize(tweet)

    # Tokenize using NLTK
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)

    # Iterate over tokens
    for index, token in enumerate(tokens):
        # Replace URLs
        if token[0:4] == "http":
            replaced_urls.append(token)
            tokens[index] = "<URLURL>"
            # ↳ *tokens[index]* will directly modify *tokens*, whereas any changes to *token* will be lost.

        # Replace mentions (Twitter handles; usernames)
        elif token[0] == "@" and len(token) > 1:
            # ↳ Skip the single '@' tokens
            replaced_mentions.append(token)
            tokens[index] = "<UsernameMention>"

    # Detokenize using NLTK's Treebank Word Detokenizer
    detokenizer = TreebankWordDetokenizer()
    processed_tweet = detokenizer.detokenize(tokens)

    # *replaced_urls* and *replaced_mentions* will contain all of the replaced URLs and mentions of the input string.
    return processed_tweet
def main():
    x, y = load_datasets(["../datasets/sentiment_uci/yelp_labelled.txt"])

    stopwords = set()
    with open('../stopwords.txt', 'r') as f:
        for w in f:
            stopwords.add(w.strip())

    tok = TweetTokenizer()
    x = [remove_stopwords(tok.tokenize(s.lower()), stopwords) for s in x]
    x = np.array(x)

    accumulate = dict()
    folds = 10
    for train_idx, test_idx in StratifiedKFold(y=y, n_folds=folds, shuffle=True):
        train_x, train_y = x[train_idx], y[train_idx]
        test_x, test_y = x[test_idx], y[test_idx]
        train_docs = [(sent, label) for sent, label in zip(train_x, train_y)]
        test_docs = [(sent, label) for sent, label in zip(test_x, test_y)]

        cls = SentimentAnalyzer()

        # train
        words_with_neg = cls.all_words([mark_negation(a) for a in train_x])
        unigram_feats = cls.unigram_word_feats(words_with_neg)
        bigram_feats = cls.bigram_collocation_feats(train_x)
        cls.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats, handle_negation=True)
        cls.add_feat_extractor(extract_bigram_feats, bigrams=bigram_feats)
        training_set = cls.apply_features(train_docs, labeled=True)
        cls.train(PositiveNaiveBayesClassifier.train, training_set)

        # test & evaluate
        test_set = cls.apply_features(test_docs)
        for key, value in sorted(cls.evaluate(test_set).items()):
            print('\t{0}: {1}'.format(key, value))
            accumulate.setdefault(key, 0.0)
            accumulate[key] += value

    print("Averages")
    for key, value in sorted(accumulate.items()):
        print('\tAverage {0}: {1}'.format(key, value / folds))
def remove_common_words(data, proportion):
    """Removes the top words of a sample by a given proportion.

    Parameters
    ----------
    data: np.array
        Corpus of text where each phrase is a separate array.
    proportion: float
        The proportion of words that you would like removed.

    Returns
    -------
    top_words_removed: np.array
        Returns the corpus back with the top words removed.
    """
    tokenizer = TweetTokenizer()

    # tokenize the data
    tokenized_data = []
    for s in data:
        try:
            tokenized_data.append(tokenizer.tokenize(s))
        except TypeError:
            pass

    # flatten and remove punctuation
    tokens = [word.lower() for phrase in tokenized_data for word in phrase]
    tokens = [word for word in tokens if word not in set(string.punctuation)]

    # count token occurrences
    token_counts = Counter(tokens)

    # find the number for removal
    n_top = round(len(token_counts.keys()) * proportion)
    top_tokens = [t[0] for t in token_counts.most_common(n_top)]

    top_words_removed = []
    for phrase in tokenized_data:
        top_words_removed.append(" ".join(
            [word for word in phrase if word.lower() not in top_tokens]))
    top_words_removed = np.array(top_words_removed)

    return top_words_removed
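# Hypothetical usage sketch for remove_common_words above; the corpus below is
# illustrative, not from the original source.
import string
from collections import Counter

import numpy as np
from nltk.tokenize import TweetTokenizer

corpus = np.array([
    "the cat sat on the mat",
    "the dog sat on the log",
    "a bird flew over the mat",
])
print(remove_common_words(corpus, 0.2))  # phrases with roughly the top 20% of the vocabulary removed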
def __init__(self, reduce_len=True, preserve_case=False, stopwords=[]):
    """Initialize a Preprocessor object.

    arguments:
    reduce_len: Whether repeated occurrences of letters in words should be
        shortened to at most 3 letters, e.g. hellooooooo -> hellooo.
    preserve_case: Whether the case of words should be preserved.
    stopwords: List of words that should be filtered out of the tokenized tweets.
    """
    self.tokenizer = TweetTokenizer(reduce_len=reduce_len, preserve_case=preserve_case)
    self.stopwords = stopwords
    self.url_token = '<url>'
    self.user_token = '<user>'
    self.email_token = '<email>'
    self.tag_token = '<tag>'
    self.number_token = '<number>'
def tokenize_2(self, text, ngrams_sizes=(3, 2), remove_stopwords=True):
    tknzr = TweetTokenizer()
    text = text.lower()
    if ngrams_sizes:
        for i in ngrams_sizes:
            # join ngrams with '_'
            tokens = tknzr.tokenize(text)
            ngs = ngrams(tokens, i)
            for ng in ngs:
                phrs = "_".join(ng)
                if phrs in self.dictionary:
                    text = text.replace(" ".join(ng), phrs)
    tokens = tknzr.tokenize(text)
    if remove_stopwords:
        tokens = [t for t in tokens if t not in self.stopwords]
    return tokens
def tokenize_tweets(filename, dest_folder):
    basename = os.path.basename(filename)
    dest = os.path.join(dest_folder, basename + '.tok')
    print("processing %s" % basename)
    tknzr = TweetTokenizer()
    with codecs.open(dest, 'w', "utf-8") as out_fs:
        with open(filename, 'r', encoding="utf-8") as in_fs:
            for line in in_fs:
def load_data(fname, pos):
    tid, tweets, sentiments = [], [], []
    tknzr = TweetTokenizer(reduce_len=True)
    n_not_available = 0
    with open(fname) as f:
        for line in f:
            splits = line.split('\t')
            tweet = splits[pos + 1]
            sentiment = convertSentiment(splits[pos])

            tid.append(splits[0])
            tweet = pts.preprocess_tweet(tweet)
            tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
            tweets.append(tweet_tok)
            sentiments.append(int(sentiment))
    return tid, tweets, sentiments
def cargar_twitts():
    direc = "data_set/"
    files = os.listdir(direc)
    archivos = [direc + twitt for twitt in files]
    twitts = []
    for a in archivos:
        fp = open(a, "r")
        lineas = fp.readlines()[1:]
        for x in lineas:
            palabras = ''.join(
                [c for c in x.split(' ')[1] if c not in non_words])
            tt = TweetTokenizer()
            twitt = tt.tokenize(palabras)
            twitts.append([twitt, x.split(' ')[2]])
        fp.close()
    return twitts
def analyze(self, text):
    s = 0
    li = TweetTokenizer().tokenize(text)
    for w in li:
        w = w.lower()
        if w in d:
            s = s + d[w]
    return s
def lineNormalization(self, line):
    tknzr = TweetTokenizer()
    norm = w2vAndGramsConverter()
    line = re.sub(r"#\s+", "", line)
    tmp = line.split(" ")
    st = ""
    for t in tmp:
        # reStart = time.time()
        links = re.findall(
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            t)
        # print("reg time spend: " + str(time.time() - reStart))
        if len(links) > 0:
            # etStart = time.time()
            dom = tldextract.extract(links[0]).domain
            # print("first ext time spend: " + str(time.time() - etStart))
            if dom == 'bit':
                if self.linkHash.get(links[0]) is None:
                    try:
                        # exreStart = time.time()
                        t = tldextract.extract(
                            self.sess.head(links[0]).headers['location']).domain
                        # print("request and extract time spend:" + str(time.time() - exreStart))
                    except:
                        t = "invaildURL"
                    self.linkHash[links[0]] = t
                else:
                    t = self.linkHash.get(links[0])
            else:
                t = dom
        st += t + " "
    line = st
    line = norm.normalizeSentence(line)
    line = self.removePuncu(line)
    line = line.lower()
    tokens = tknzr.tokenize(line)
    return tokens
def load_twetts():
    direc = "data_set_eng/"
    files = os.listdir(direc)
    files = [direc + twitt for twitt in files]
    twetts = []
    for a in files:
        fp = open(a, "r")
        lines = fp.readlines()[1:]
        for x in lines:
            words = ''.join(
                [c for c in x.split('\t')[1] if c not in non_words])
            words = clean_text(words)
            tt = TweetTokenizer()
            twitt = tt.tokenize(words)
            twetts.append([twitt, x.split('\t')[2]])
        fp.close()
    return twetts
def tokenize_alexa(filename, dest_folder):
    basename = os.path.basename(filename)
    dest = os.path.join(dest_folder, basename + '.tok')
    print("processing %s" % basename)
    tknzr = TweetTokenizer()
    with codecs.open(dest, 'w', "utf-8") as out_fs:
        with open(filename, 'r', encoding="utf-8") as in_fs:
            for line in in_fs:
                try:
                    service_desc = line  # .strip().split('\t')
                except:
                    print("could not parse line.")
                    continue
                tweet = tknzr.tokenize(service_desc)
                if not 6 < len(tweet) < 110:
                    continue
                tweet = preprocess_tweet(' '.join(tweet))
                out_fs.write(tweet + '\t' + '\n')
def featureVecotrize(batch):
    # global debug_procces_words
    tknzr = TweetTokenizer()
    vect = [0] * (feature_len - 6)
    tokenized_batch = tknzr.tokenize(batch)
    sent_text = nltk.sent_tokenize(batch)
    # ordered_dict_listed = tuple(OrderedDict(analyze_text.unigrams_dict).keys())
    ordered_dict_listed = tuple(OrderedDict(top_500_in_dict).keys())
    len_sum = 0
    num_of_words = 0
    shell_nouns_count = 0
    references_count = 0
    function_words_count = 0

    for word in tokenized_batch:
        if word in ordered_dict_listed:
            vect[ordered_dict_listed.index(word)] += 1 / len(all_ages_batched[0])
        if word in SHELL_NOUNS:
            shell_nouns_count += 1
        if word in REFERENCES:
            references_count += 1
        if word in FUNCTION_WORDS:
            function_words_count += 1
        if len(word) < 2 or (len(word) == 1 and "." in word or "," in word):
            continue
        else:
            num_of_words += 1
            len_sum += len(word)

    avg_sent = sum([len(sent.replace(",", " ").split()) for sent in sent_text]) / len(sent_text)
    batch_wo_punc = batch.replace(",", "")
    unique_words = len(set(batch_wo_punc.split()))
    avg_word = len_sum / num_of_words

    vect.append(avg_sent)
    vect.append(avg_word)
    vect.append(shell_nouns_count / len(all_ages_batched[0]))
    vect.append(references_count / len(all_ages_batched[0]))
    vect.append(function_words_count / len(all_ages_batched[0]))
    vect.append(unique_words / len(all_ages_batched[0]))
    # print(avg_sent, avg_word, shell_nouns_count, references_count, function_words_count, unique_words, file=debug_file)
    return vect
def tweetLength(self, line):
    # normalize the line
    w2vLib = w2vAndGramsConverter()
    line = w2vLib.normalizeSentence(line)

    # tokenize sentence
    tnz = TweetTokenizer()
    tokens = tnz.tokenize(line)

    if len(tokens) <= 10:
        return 1
    elif len(tokens) <= 20:
        return 2
    elif len(tokens) <= 30:
        return 3
    elif len(tokens) <= 40:
        return 4
    else:
        return 5
def get_sentence_embeddings(sentences, phrase=True, ngram='bigrams', model='concat_wiki_twitter'):
    """Returns a numpy matrix of embeddings for one of the published models. It
    handles tokenization and can be given raw sentences.
    Arguments:
        - ngram: 'unigrams' or 'bigrams'
        - model: 'wiki', 'twitter', or 'concat_wiki_twitter'
        - sentences: a list of raw sentences ['Once upon a time', 'This is another sentence.', ...]
    """
    wiki_embeddings = None
    twitter_embbedings = None
    tokenized_sentences_NLTK_tweets = None
    tokenized_sentences_SNLP = None

    if model == "wiki" or model == 'concat_wiki_twitter':
        tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
        # print("sentences", sentences)
        # print(tknzr)
        s = ' <delimiter> '.join(sentences)  # just a trick to make things faster
        # print("S", s)
        tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
        assert (len(tokenized_sentences_SNLP) == len(sentences))

        if phrase:
            temp = extract_keyphrase_candidates(str(tokenized_sentences_SNLP))
            # print("Temp = ", temp)
            temp1 = list([' '.join(x) for x in temp if type(x) == list])
            temp2 = list([x for x in temp if type(x) == str])
            tokenized_sentences_SNLP = [*temp1, *temp2]
            # print("Sentences = ", sentences, "\n")

        if ngram == 'unigrams':
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(
                tokenized_sentences_SNLP, MODEL_WIKI_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(
                tokenized_sentences_SNLP, MODEL_WIKI_BIGRAMS, FASTTEXT_EXEC_PATH)

    if model == "twitter" or model == 'concat_wiki_twitter':
        tknzr = TweetTokenizer()
        tokenized_sentences_NLTK_tweets = tokenize_sentences(tknzr, sentences)
        if ngram == 'unigrams':
            twitter_embbedings = get_embeddings_for_preprocessed_sentences(
                tokenized_sentences_NLTK_tweets, MODEL_TWITTER_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            twitter_embbedings = get_embeddings_for_preprocessed_sentences(
                tokenized_sentences_NLTK_tweets, MODEL_TWITTER_BIGRAMS, FASTTEXT_EXEC_PATH)

    if model == "twitter":
        return twitter_embbedings
    elif model == "wiki":
        if phrase:
            return tokenized_sentences_SNLP, wiki_embeddings
        else:
            return wiki_embeddings
    elif model == "concat_wiki_twitter":
        return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)
    sys.exit(-1)
def clean_text(phrase):
    # remove hyperlinks (and anything after them on the same line)
    phrase = re.sub(r'https?:\/\/.*[\r\n]*', '', phrase)

    # remove hashtags and mentions entirely
    phrase = re.sub(r'#[\w\.\-]+', '', phrase)
    phrase = re.sub(r'@[\w\.\-]+', '', phrase)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    phrase_tokens = tokenizer.tokenize(phrase)

    phrase_clean = []
    for word in phrase_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation and
                word not in emoticons):
            stem_word = stemmer.stem(word)  # stemming word
            phrase_clean.append(stem_word)

    return phrase_clean
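# Hedged usage sketch for clean_text above: it relies on module-level globals
# (stopwords_english, emoticons, stemmer), so plausible definitions are supplied
# here purely for illustration.
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

stopwords_english = stopwords.words('english')
emoticons = {':)', ':(', ':D', ':-('}
stemmer = PorterStemmer()

print(clean_text("Loving the new #python release! https://example.com :)"))
# e.g. ['love', 'new', 'releas'] - URL, hashtag, stopwords and punctuation dropped, words stemmed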
class PartsOfSpeechPatternExtractor(BaseEstimator, TransformerMixin):

    POS_PATTERNS = [('NOUN', 'ADJ'),
                    ('NOUN', 'NOUN'),
                    ('ADJ', 'NOUN'),
                    ('VERB', 'NOUN'),
                    ('AUX', 'NOUN'),
                    ('NOUN', 'PRON', 'NOUN'),
                    ('VERB', 'PRON', 'NOUN'),
                    ('AUX', 'PRON', 'NOUN')]

    IGNORE_TAGS = ['PUNCT']

    _vectorizer = None
    _tokenizer = TweetTokenizer(reduce_len=True)
    _processor = Preprocessor(stemming=True)
    _pos_helper = PartsOfSpeechHelper()

    def __init__(self):
        pass

    def transform(self, data, y=None):
        result = []
        for tweet in data:
            result.append(self.get_patterns(tweet))

        if self._vectorizer is None:
            self._vectorizer = DictVectorizer(sparse=False)
            self._vectorizer.fit(result)

        return self._vectorizer.transform(result)

    def get_patterns(self, tweet):
        result = []
        tokens = self._tokenizer.tokenize(tweet)
        pos_tags = self._pos_helper.pos_tag(tokens)

        if len(pos_tags) > 1:
            pos_tags = [p for p in pos_tags if p[1] not in self.IGNORE_TAGS]
            words, tags = zip(*pos_tags)

            for pattern in self.POS_PATTERNS:
                found = self.find_sublist(list(pattern), list(tags))
                for i, j in found:
                    # Added patterns instead of tokens
                    result.append('_'.join(list(pattern)))
                    # result.append(self._processor.preprocess(' '.join(words[i:j])))

        return Counter(result)

    def fit(self, df, y=None):
        return self

    def find_sublist(self, sl, l):
        results = []
        sll = len(sl)
        for ind in (i for i, e in enumerate(l) if e == sl[0]):
            if l[ind:ind + sll] == sl:
                results.append((ind, ind + sll))
        return results
def processTweet(tweet):
    _stopwords = set(
        stopwords.words('english') + list(punctuation) +
        ['AT_USER', 'URL', 'RT', 'rt', 'at_user', 'url'])

    tweet = tweet.lower()  # convert text to lower-case
    tweet = expandContractions(tweet)  # expand the contractions to remove the stop words
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # remove URLs
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag

    # tokenize the tweet and remove the handles
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    tweet = tokenizer.tokenize(tweet)

    # remove the stop words from the tokenized tweet
    tweet = [word for word in tweet if word not in _stopwords]

    # perform lemmatization on the words, which helps to find the root of the word.
    # The lemmatization is not performed for NER as the lexicons are derived from Twitter.
    # tweet = [WordNetLemmatizer().lemmatize(w, get_wordnet_pos(w)) for w in tweet]

    return tweet
class MySentences(object):

    def __init__(self, files):
        self.files = files
        self.tknzr = TweetTokenizer()

    def __iter__(self):
        for fname in self.files:
            for line in gzip.open(fname, 'rb'):
                tweet = preprocess_tweet(line)
                tweet = self.tknzr.tokenize(tweet.decode('utf-8'))
                yield filter(lambda word: ' ' not in word, tweet)
class LexiconExtractor(BaseEstimator, TransformerMixin):

    NGRAM_LENGTH = 3
    REVERSE_WORDS = ['no', 'ni', 'tampoc', 'ningun']

    _tokenizer = TweetTokenizer()
    _preprocessor = Preprocessor(twitter_features=Preprocessor.REMOVE, stemming=True)

    def __init__(self):
        self._neg_words = self.file_to_list('lexicon/negative_words.txt')
        self._pos_words = self.file_to_list('lexicon/positive_words.txt')

    def transform(self, data, y=None):
        result = []
        for tweet in data:
            tweet = self._preprocessor.preprocess(tweet)
            result.append(self.count_polarity_words(tweet))
        return preprocessing.normalize(result)

    def count_polarity_words(self, text):
        num_pos_words = 0
        num_neg_words = 0

        list_ngrams = list(
            ngrams(self._tokenizer.tokenize(text), self.NGRAM_LENGTH, pad_left=True))

        for ngram in list_ngrams:
            pre_words = ngram[:self.NGRAM_LENGTH - 1]
            word = ngram[self.NGRAM_LENGTH - 1]

            if word in self._pos_words:
                if any(w in pre_words for w in self.REVERSE_WORDS):
                    num_neg_words += 1
                else:
                    num_pos_words += 1
            elif word in self._neg_words:
                if any(w in pre_words for w in self.REVERSE_WORDS):
                    num_pos_words += 1
                else:
                    num_neg_words += 1

        return [num_pos_words, num_neg_words]

    def fit(self, df, y=None):
        return self

    def file_to_list(self, filename):
        return io.open(filename).read().splitlines()
def tokenize_tweets(filename, dest_folder):
    basename = os.path.basename(filename)
    dest = os.path.join(dest_folder, basename + '.tok')
    print("processing %s" % basename)
    tknzr = TweetTokenizer()
    with codecs.open(dest, 'w', "utf-8") as out_fs:
        with open(filename, 'r', encoding="utf-8") as in_fs:
            for line in in_fs:
                try:
                    language, id, timestamp, username, tweet = line.strip().split('\t')
                except:
                    print("could not parse line.")
                    continue
                if language != 'en':
                    continue
                tweet = tknzr.tokenize(tweet)
                if not 6 < len(tweet) < 110:
                    continue
                tweet = preprocess_tweet(' '.join(tweet))
                filter(lambda word: ' ' not in word, tweet)
                out_fs.write(id + '\t' + timestamp + '\t' + username + '\t' + tweet + '\n')
def load_data(fname):
    tid, topics, tweets, sentiments = [], [], [], []
    tknzr = TweetTokenizer(reduce_len=True)
    n_not_available = 0
    with open(fname) as f:
        for line in f:
            splits = line.split('\t')
            tweet = splits[3]
            sentiment = convertSentiment(splits[2])
            if tweet != "Not Available\n":
                tid.append(splits[0])
                topic = pts.preprocess_tweet(splits[1])
                topic_tok = tknzr.tokenize(topic.decode('utf-8'))
                topics.append(splits[1])
                tweet = pts.preprocess_tweet(tweet)
                tweet_tok = tknzr.tokenize(tweet.decode('utf-8'))
                tweets.append(tweet_tok)
                sentiments.append(int(sentiment))
            else:
                n_not_available += 1

    print "Number of not available tweets:", n_not_available
    return tid, topics, tweets, sentiments
def __init__(self):
    super().__init__()
    self.__tokenizer = TweetTokenizer(preserve_case=False)
from __future__ import print_function
import nltk
# nltk.download()
from nltk import TweetTokenizer
import csv
import random
from collections import defaultdict

tokenizer = TweetTokenizer()

csvfile = open('trainingandtestdata/testdata.manual.2009.06.14.csv', 'rb')
reader = csv.reader(csvfile, delimiter=',')

rownum = 0
sentiments = []
tokens = [[]]
for row in reader:
    colnum = 0
    for col in row:
        if colnum == 0:
            sentiments.insert(rownum, int(col))
        if colnum == 5:
            raw = col  # .read().decode('utf8')
            tokens.insert(rownum, tokenizer.tokenize(raw))
            ## print("tokens contents:", end='')
            ## for word in tokens[rownum]:
            ##     print(word, end=" ")
            ## print()
        colnum += 1
    rownum += 1
csvfile.close()

# Divide into training and test data - randomly allocate 4/5 to training and 1/5 to test
def __init__(self, files):
    self.files = files
    self.tknzr = TweetTokenizer()