def demo_liu_hu_lexicon(sentence):
    """Classify *sentence* as 'Positive'/'Negative'/'Neutral' by counting
    tokens found in the Liu & Hu opinion lexicon (majority vote)."""
    tokenizer = treebank.TreebankWordTokenizer()
    # Hoist the lexicons into sets once: opinion_lexicon.positive()/negative()
    # re-read the corpus word lists, so calling them per token made the loop
    # O(tokens * lexicon size).
    positive = set(opinion_lexicon.positive())
    negative = set(opinion_lexicon.negative())
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
    for word in tokenized_sent:
        if word in positive:
            pos_words += 1
        elif word in negative:
            neg_words += 1
    # (Unused plot axis `x` and polarity list `y` from the original dropped.)
    if pos_words > neg_words:
        return 'Positive'
    elif pos_words < neg_words:
        return 'Negative'
    return 'Neutral'  # pos_words == neg_words
def predictOpinionAbandoned(self, text):
    """
    Predict whether the given text expresses a positive, negative or
    neutral opinion, using the instance's word lexicons.

    :param text: Text that possibly expresses an opinion
    :return: 1: Positive. 0: Neutral. -1: Negative
    """
    tokenizer = treebank.TreebankWordTokenizer()
    # Counts start at 1 (add-one smoothing) so the ratio tests below can
    # never divide by zero.
    pos_words = 1
    neg_words = 1
    for word in (w.lower() for w in tokenizer.tokenize(text)):
        if word in self.pos_lexicon:
            pos_words += 1
        elif word in self.neg_lexicon:
            neg_words += 1
    # (Unused per-token polarity list `y` removed; stale ':param targets:'
    # docstring entry removed — no such parameter exists.)
    if pos_words / neg_words > self.RATIO:
        print("Support.")
        return 1
    elif neg_words / pos_words > self.RATIO:
        print("Oppose.")
        return -1
    else:
        print("Neutral")
        return 0
def dlll_pos_neg_ratio(text):
    """Score *text* with the Liu & Hu lexicon.

    Returns a tuple (label, ratio): label is 'Positive'/'Negative'/'Neutral'
    by majority count, and ratio is the mean per-token polarity in [-1, 1].
    """
    tokenizer = treebank.TreebankWordTokenizer()
    # Hoisted: the lexicon readers re-scan their word lists on every call.
    positive = set(opinion_lexicon.positive())
    negative = set(opinion_lexicon.negative())
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(text)]
    if not tokenized_sent:
        # Guard: empty input previously raised ZeroDivisionError on len(y).
        print(0, 0)
        return ("Neutral", 0.0)
    pos_words = 0
    neg_words = 0
    y = []  # per-token polarity, used for the mean ratio
    for word in tokenized_sent:
        if word in positive:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral
    print(pos_words, neg_words)
    ratio = sum(y) / len(y)
    if pos_words > neg_words:
        return ("Positive", ratio)
    elif pos_words < neg_words:
        return ("Negative", ratio)
    return ("Neutral", ratio)
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral
    words in the sentence and classifies it depending on which polarity is
    more represented. Words that do not appear in the lexicon are considered
    as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence
        polarity (plotting call currently commented out below).
    :return: tuple (pos_words, neg_words) of lexicon-hit counts.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    # Hoist the lexicons into sets once: calling .positive()/.negative()
    # inside the loop re-reads the corpus per token (O(tokens * lexicon)).
    positive = set(opinion_lexicon.positive())
    negative = set(opinion_lexicon.negative())
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []  # per-token polarity for the plot
    for word in tokenized_sent:
        if word in positive:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral
    # if plot == True:
    #     _show_plot(x, y, x_labels=tokenized_sent,
    #                y_labels=['Negative', 'Neutral', 'Positive'])
    return pos_words, neg_words
def get_nltk_sentiment(sentence, method):
    """Score *sentence* with NLTK.

    :param method: 'vader' → return the VADER compound score (float in
        [-1, 1]); 'liu' → return 'Positive'/'Negative'/'Neutral' from the
        Liu & Hu lexicon, with ties broken to 'Positive' when the sentence
        contains a PERSON named entity.
    """
    if method == 'vader':
        sa = sentiment.vader.SentimentIntensityAnalyzer()
        output = sa.polarity_scores(str(sentence))
        return output['compound']
    elif method == 'liu':
        wordType = ''
        # NER pass: a PERSON chunk anywhere in the sentence biases ties.
        if "PERSON" in str(ne_chunk(pos_tag(word_tokenize(sentence)))):
            wordType = 'tag'
        tokenizer = treebank.TreebankWordTokenizer()
        # Hoisted: the lexicon readers re-scan their word lists per call.
        positive = set(opinion_lexicon.positive())
        negative = set(opinion_lexicon.negative())
        pos_words = 0
        neg_words = 0
        for word in (w.lower() for w in tokenizer.tokenize(sentence)):
            if word in positive:
                pos_words += 1
            elif word in negative:
                neg_words += 1
        if pos_words > neg_words:
            return 'Positive'
        elif pos_words < neg_words:
            return 'Negative'
        # Tie: a detected PERSON entity breaks toward 'Positive'.
        if wordType == 'tag':
            return 'Positive'
        return 'Neutral'
def negopinion(sentence):
    """Count how many tokens of *sentence* (lowercased) appear in the
    Liu & Hu negative opinion lexicon."""
    tokenizer = treebank.TreebankWordTokenizer()
    # One corpus read + O(1) membership instead of re-reading per token.
    negative = set(opinion_lexicon.negative())
    return sum(1 for word in tokenizer.tokenize(sentence)
               if word.lower() in negative)
def posopinion(sentence):
    """Count how many tokens of *sentence* (lowercased) appear in the
    Liu & Hu positive opinion lexicon."""
    tokenizer = treebank.TreebankWordTokenizer()
    # One corpus read + O(1) membership instead of re-reading per token.
    positive = set(opinion_lexicon.positive())
    return sum(1 for word in tokenizer.tokenize(sentence)
               if word.lower() in positive)
def run(self):
    """Task body: tokenize every train/test question, fit an n-gram
    CountVectorizer over all of them, then persist per-question sparse
    matrices and an elementwise diff/product feature matrix."""
    self.output().makedirs()
    # NOTE(review): 'tokenzier' is misspelled but kept as-is — helper
    # methods (e.g. vectorize_question) presumably read this attribute.
    self.tokenzier = treebank.TreebankWordTokenizer()
    self.stemmer = snowball.SnowballStemmer('english')
    self.vectorizer = CountVectorizer(ngram_range=(1, self.ngram_max),
                                      min_df=self.ngram_min_df)
    train_data = rf_dataset.Dataset().load('train', fold=None, as_df=True)
    test_data = rf_dataset.Dataset().load('test', fold=None, as_df=True)
    # Order matters: all q1s first, then all q2s, so the matrix can be
    # split back at the halfway point below.
    all_questions = np.concatenate([
        train_data.question1_clean.values, test_data.question1_clean.values,
        train_data.question2_clean.values, test_data.question2_clean.values
    ])
    print(colors.lightblue | 'Tokenizing')
    all_tokens = multiprocessing.Pool(4).map(self.vectorize_question,
                                             all_questions)
    print(colors.lightblue | 'Finished tokenizing, now fitting')
    transformed_tokens = self.vectorizer.fit_transform(all_tokens)
    print(colors.lightblue | colors.bold | 'Gosh that takes a long time')
    # CSR gives efficient row slicing for the splits that follow.
    transformed_tokens = transformed_tokens.tocsr()
    halfpt = transformed_tokens.shape[0] // 2
    assert halfpt == train_data.shape[0] + test_data.shape[0]
    q1s = transformed_tokens[:halfpt]
    q2s = transformed_tokens[halfpt:]
    train_q1s = q1s[:train_data.shape[0]]
    train_q2s = q2s[:train_data.shape[0]]
    test_q1s = q1s[train_data.shape[0]:]
    test_q2s = q2s[train_data.shape[0]:]
    nose.tools.assert_equal(test_q1s.shape[0], test_data.shape[0])
    nose.tools.assert_equal(test_q2s.shape[0], test_data.shape[0])
    nose.tools.assert_equal(train_q1s.shape[0], train_data.shape[0])
    nose.tools.assert_equal(train_q2s.shape[0], train_data.shape[0])
    self.write_mat_to(self.make_path('train_q1.pkl'), train_q1s)
    self.write_mat_to(self.make_path('train_q2.pkl'), train_q2s)
    self.write_mat_to(self.make_path('test_q1.pkl'), test_q1s)
    self.write_mat_to(self.make_path('test_q2.pkl'), test_q2s)
    # Pairwise features: |q1 - q2| concatenated with q1 * q2 (elementwise).
    diffs = sp.hstack([np.abs(q1s - q2s), q1s.multiply(q2s)]).tocsr()
    train_vecs = diffs[:train_data.shape[0]]
    test_vecs = diffs[train_data.shape[0]:]
    nose.tools.assert_equal(train_vecs.shape[0], train_data.shape[0])
    nose.tools.assert_equal(test_vecs.shape[0], test_data.shape[0])
    self.write_mat_to(self.make_path('train_mat.pkl'), train_vecs)
    self.write_mat_to(self.make_path('test_mat.pkl'), test_vecs)
    # Touch the output marker to signal task completion.
    with self.output().open('w'):
        pass
def idf_embedding(data):
    """Fit and return a TfidfVectorizer over every conversation's
    title/utterance texts.

    The title is appended once per utterance (not once per conversation),
    so title terms are weighted by conversation length, as before.
    """
    word_splitter = treebank.TreebankWordTokenizer()
    corpus = [text
              for conv in data
              for utt in conv.utterances
              for text in (conv.title, utt.utterance)]
    tfidf = TfidfVectorizer(tokenizer=word_splitter.tokenize,
                            stop_words='english')
    tfidf.fit(corpus)
    return tfidf
def getPositiveWords(sentence):
    """Return the tokens of *sentence* (lowercased, in order, with repeats)
    that appear in the Liu & Hu positive opinion lexicon."""
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    # Hoisted: opinion_lexicon.positive() re-reads the corpus each call,
    # so testing membership against it per token was O(tokens * lexicon).
    positive = set(opinion_lexicon.positive())
    return [word
            for word in (w.lower() for w in tokenizer.tokenize(sentence))
            if word in positive]
def run(self):
    """Fit a TF-IDF vectorizer on the train split, transform the
    valid/merge/test splits with the same vocabulary, and write every
    matrix as MatrixMarket under cache/tfidf/."""
    self.output().makedirs()
    tqdm.pandas(tqdm)
    # NOTE(review): 'tokenzier' is misspelled but kept as-is — the
    # fit/transform helpers presumably read this attribute name.
    self.tokenzier = treebank.TreebankWordTokenizer()
    self.stemmer = snowball.SnowballStemmer('english')
    self.vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=50)
    train, merge, valid = Dataset().load()
    logging.info('Vectorizing train')
    train_mat, q1, q2 = self.fit(train)
    # Remember train column counts so later transforms can be checked for
    # vocabulary consistency.
    train_cols = train_mat.shape[1]
    train_q1_cols, train_q2_cols = q1.shape[1], q2.shape[1]
    scipy.io.mmwrite('cache/tfidf/train.mtx', train_mat)
    scipy.io.mmwrite('cache/tfidf/train_q1.mtx', q1)
    scipy.io.mmwrite('cache/tfidf/train_q2.mtx', q2)
    del train, train_mat
    logging.info('Vectorizing valid')
    valid_mat, q1, q2 = self.transform(valid)
    assert valid_mat.shape[1] == train_cols
    assert q1.shape[1] == train_q1_cols and q2.shape[1] == train_q2_cols
    scipy.io.mmwrite('cache/tfidf/valid.mtx', valid_mat)
    scipy.io.mmwrite('cache/tfidf/valid_q1.mtx', q1)
    scipy.io.mmwrite('cache/tfidf/valid_q2.mtx', q2)
    del valid, valid_mat
    logging.info('Vectorizing merge')
    merge_mat, q1, q2 = self.transform(merge)
    assert merge_mat.shape[1] == train_cols
    assert q1.shape[1] == train_q1_cols and q2.shape[1] == train_q2_cols
    scipy.io.mmwrite('cache/tfidf/merge.mtx', merge_mat)
    scipy.io.mmwrite('cache/tfidf/merge_q1.mtx', q1)
    scipy.io.mmwrite('cache/tfidf/merge_q2.mtx', q2)
    del merge, merge_mat
    logging.info('Vectorizing test')
    test = Dataset().load_test()
    test_mat, q1, q2 = self.transform(test)
    assert test_mat.shape[1] == train_cols
    assert q1.shape[1] == train_q1_cols and q2.shape[1] == train_q2_cols
    scipy.io.mmwrite('cache/tfidf/test.mtx', test_mat)
    scipy.io.mmwrite('cache/tfidf/test_q1.mtx', q1)
    scipy.io.mmwrite('cache/tfidf/test_q2.mtx', q2)
    # Round-trip check: the written test matrix loads with the same width.
    assert self.load_dataset('test').shape[1] == train_cols
    # Touch the output marker to signal task completion.
    with self.output().open('w') as f:
        pass
def combine_sentimental(conversations):
    """Build a per-utterance feature matrix over all conversations.

    Each row holds [thank, exclamation-mark, feedback] flags followed by
    the sentiment scores and opinion-lexicon counts for that utterance.
    Prints a carriage-return progress line per conversation processed.
    """
    analyzer = SentimentIntensityAnalyzer()
    tokenizer = treebank.TreebankWordTokenizer()
    total = len(conversations)
    rows = []
    for done, conversation in enumerate(conversations, start=1):
        for utterance in conversation.utterances:
            row = [thank(utterance), e_mark(utterance), feedback(utterance)]
            row.extend(sentiment_score(analyzer, utterance))
            row.extend(opinion_lex(tokenizer, utterance))
            rows.append(row)
        print('\r>>>> {}/{} done...'.format(done, total), end='')
    return np.asarray(rows)
def acisWordAnalysis(sentence):
    """Classify *sentence* as "Positive"/"Negative"/"Neutral" by counting
    Liu & Hu lexicon hits (majority vote)."""
    tokenizer = treebank.TreebankWordTokenizer()
    # Hoisted: the lexicon readers re-scan their word lists on every call,
    # so the original per-token membership tests were O(tokens * lexicon).
    positive = set(opinion_lexicon.positive())
    negative = set(opinion_lexicon.negative())
    pos_words = 0
    neg_words = 0
    for word in (w.lower() for w in tokenizer.tokenize(sentence)):
        if word in positive:
            pos_words += 1
        elif word in negative:
            neg_words += 1
    if pos_words > neg_words:
        return "Positive"
    elif pos_words < neg_words:
        return "Negative"
    return "Neutral"  # pos_words == neg_words
def run(self):
    """Compute word2vec-based distances for every cleaned question pair
    across the combined train+test data, in parallel."""
    # NOTE(review): 'tokenzier' is misspelled but kept as-is — presumably
    # read by self.vectorize or other methods.
    self.tokenzier = treebank.TreebankWordTokenizer()
    self.kvecs = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)
    train_data = rf_dataset.Dataset().load_all(
        'train', as_df=True)[['question1_clean', 'question2_clean']]
    test_data = rf_dataset.Dataset().load_all(
        'test', as_df=True)[['question1_clean', 'question2_clean']]
    all_data = pandas.concat([train_data, test_data], 0)
    distances = list(
        tqdm(multiprocessing.Pool().imap(self.vectorize,
                                         zip(all_data['question1_clean'],
                                             all_data['question2_clean']),
                                         chunksize=50_000),
             total=all_data.shape[0],
             desc='vectorizing the words'))
    # NOTE(review): `distances` is never used or persisted in the visible
    # span — the method likely continues beyond this chunk; confirm.
def treebank_tokenizer(self, review):
    """Tokenize one review's content and POS-tag it.

    Feature modes 1/2 run each lowercased token through process_word;
    modes 2/3 additionally expand the tags with n-grams.
    Returns (tokens, tags).
    """
    content = self.data[review]['Content']
    raw_tokens = treebank.TreebankWordTokenizer().tokenize(content)
    if self.features in (1, 2):
        tokens = [process_word(tok.lower()) for tok in raw_tokens]
    else:
        tokens = [tok.lower() for tok in raw_tokens]
    tags = nltk.pos_tag(tokens)
    if self.features in (2, 3):
        tags = self.ngrams(tokens, tags)
    return tokens, tags
def simple_sentiment(text):
    """Classify *text* as 'Positive'/'Negative'/'Neutral' by counting
    Liu & Hu lexicon hits (majority vote)."""
    tokenizer = treebank.TreebankWordTokenizer()
    # Hoisted: calling opinion_lexicon.positive()/negative() per token
    # re-reads the corpus every time (O(tokens * lexicon)).
    positive = set(opinion_lexicon.positive())
    negative = set(opinion_lexicon.negative())
    pos_words = 0
    neg_words = 0
    for word in (w.lower() for w in tokenizer.tokenize(text)):
        if word in positive:
            pos_words += 1
        elif word in negative:
            neg_words += 1
    if pos_words > neg_words:
        return 'Positive'
    elif pos_words < neg_words:
        return 'Negative'
    return 'Neutral'  # pos_words == neg_words
def run(self):
    """Build fixed-length keras token sequences for all question pairs and
    a word2vec embedding matrix, saved as compressed .npz files."""
    self.output().makedirs()
    kvecs = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)
    train_dataset = rf_dataset.Dataset().load_all('train', as_df=True)
    test_dataset = rf_dataset.Dataset().load_all('test', as_df=True)
    # NOTE(review): 'tokenzier' is misspelled and unused within this
    # method — possibly read elsewhere on the task.
    self.tokenzier = treebank.TreebankWordTokenizer()
    # Order matters for the slicing below: train q1, train q2, test q1,
    # test q2.
    all_words = pandas.concat([
        train_dataset.question1_clean.str.lower(),
        train_dataset.question2_clean.str.lower(),
        test_dataset.question1_clean.str.lower(),
        test_dataset.question2_clean.str.lower(),
    ])
    tokenizer = Tokenizer(num_words=250_000)
    tokenizer.fit_on_texts(all_words)
    all_seqs = tokenizer.texts_to_sequences(all_words)
    all_padded_seqs = pad_sequences(all_seqs, 32)  # pad/truncate to 32 ids
    # First 2*train rows are the train q1s then q2s; the rest are test.
    train_seqs = all_padded_seqs[:train_dataset.shape[0] * 2]
    test_seqs = all_padded_seqs[train_dataset.shape[0] * 2:]
    nose.tools.assert_equal(test_seqs.shape[0], test_dataset.shape[0] * 2)
    train_q1 = train_seqs[:train_dataset.shape[0]]
    train_q2 = train_seqs[train_dataset.shape[0]:]
    test_q1 = test_seqs[:test_dataset.shape[0]]
    test_q2 = test_seqs[test_dataset.shape[0]:]
    np.savez_compressed(self.make_path('train.npz'), q1=train_q1, q2=train_q2)
    np.savez_compressed(self.make_path('test.npz'), q1=test_q1, q2=test_q2)
    # Words absent from word2vec keep all-zero embedding rows.
    embedding_matrix = np.zeros((250_000, 300))
    for word, ix in tokenizer.word_index.items():
        if word in kvecs:
            embedding_matrix[ix, :] = kvecs[word]
    np.savez_compressed(self.make_path('embedding.npz'),
                        data=embedding_matrix)
    # Touch the output marker to signal task completion.
    with self.output().open('w'):
        pass
def __init__(self, lemmatize=lambda x: x, boundaries=False, **kwargs):
    """Configure tokenization and lemmatization.

    Falls back to whitespace splitting if NLTK is unavailable. If
    'preLemmatized' is passed in kwargs, its value is taken as a separator
    and the lemma is the text after the last separator in each token.

    :param lemmatize: callable mapping a token list to lemmata
        (identity by default).
    :param boundaries: stored flag, consumed elsewhere.
    """
    try:
        import nltk.tokenize.treebank as tb
        self.tokenize = tb.TreebankWordTokenizer().tokenize
    except ImportError:
        # print() form works under both Python 2 and 3; the original
        # Python-2-only print statement was a syntax error on Python 3.
        print("Could not import NLTK tokenizer. Tokenizing on space instead")
        self.tokenize = lambda x: x.split(" ")
    self.lemmatize = lemmatize
    self.boundaries = boundaries
    if 'preLemmatized' in kwargs:
        def get_lemma(x):
            # Lemma is whatever follows the final separator.
            return x.split(kwargs['preLemmatized'])[-1]

        def get_lemmata(xs):
            return [get_lemma(x) for x in xs]

        self.lemmatize = get_lemmata
    self._set_parameters(**kwargs)
def getPolarity_lex(sentence):
    """
    Polarity of the sentence via the Liu & Hu opinion lexicon.

    Counts positive and negative lexicon hits and inverts the majority
    sentiment when any negation word appears anywhere in the sentence.

    :return: 'pos', 'neg' or 'neutral'.
    """
    tokenizer = treebank.TreebankWordTokenizer()
    # Hoisted: the lexicon readers re-scan their word lists on every call.
    positive = set(opinion_lexicon.positive())
    negative = set(opinion_lexicon.negative())
    # NOTE(review): the original list also contained '' which can never
    # equal a token — dropped. Set gives O(1) membership.
    negation_words = {'no', 'not', 'never', 'none', 'hardly', 'rarely',
                      'scarcely'}
    pos_words = 0
    neg_words = 0
    is_negated = False
    for word in (w.lower() for w in tokenizer.tokenize(sentence)):
        if word in positive:
            pos_words += 1
        elif word in negative:
            neg_words += 1
        if word in negation_words:
            is_negated = True
    # (Unused plot variables x/y from the original removed.)
    if pos_words == neg_words:
        return 'neutral'
    majority_positive = pos_words > neg_words
    if is_negated:
        # Any negation word flips the lexicon majority.
        majority_positive = not majority_positive
    return 'pos' if majority_positive else 'neg'
def run(self):
    """Fit a count vectorizer on the train split, transform the
    valid/merge/test splits, and write all matrices as MatrixMarket files
    under cache/count/."""
    self.output().makedirs()
    tqdm.pandas(tqdm)
    # NOTE(review): 'tokenzier' is misspelled but kept as-is — the
    # fit/transform helpers presumably read this attribute name.
    self.tokenzier = treebank.TreebankWordTokenizer()
    self.stemmer = snowball.SnowballStemmer('english')
    self.vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=50)
    train, merge, valid = Dataset().load()
    logging.info('Vectorizing train')
    train_mat, q1, q2 = self.fit(train)
    scipy.io.mmwrite('cache/count/train.mtx', train_mat)
    scipy.io.mmwrite('cache/count/train_q1.mtx', q1)
    scipy.io.mmwrite('cache/count/train_q2.mtx', q2)
    del train, train_mat
    logging.info('Vectorizing valid')
    valid_mat, q1, q2 = self.transform(valid)
    scipy.io.mmwrite('cache/count/valid.mtx', valid_mat)
    scipy.io.mmwrite('cache/count/valid_q1.mtx', q1)
    scipy.io.mmwrite('cache/count/valid_q2.mtx', q2)
    del valid, valid_mat
    logging.info('Vectorizing merge')
    merge_mat, q1, q2 = self.transform(merge)
    scipy.io.mmwrite('cache/count/merge.mtx', merge_mat)
    scipy.io.mmwrite('cache/count/merge_q1.mtx', q1)
    scipy.io.mmwrite('cache/count/merge_q2.mtx', q2)
    del merge, merge_mat
    logging.info('Vectorizing test')
    test = Dataset().load_test()
    test_mat, q1, q2 = self.transform(test)
    scipy.io.mmwrite('cache/count/test.mtx', test_mat)
    scipy.io.mmwrite('cache/count/test_q1.mtx', q1)
    scipy.io.mmwrite('cache/count/test_q2.mtx', q2)
    # Touch the output marker to signal task completion.
    with self.output().open('w') as f:
        pass
def liu_hu_opinion_lexicon(sentence: str) -> str:
    '''
    Modified version of the Liu Hu opinion lexicon algorithm for sentiment
    analysis on sentences.
    Reference: https://www.nltk.org/_modules/nltk/sentiment/util.html#demo_liu_hu_lexicon
    The function has been modified to return the values instead of printing.

    Returns:
    --------
    Sentiment of a sentence, classified as 'Positive', 'Negative' or 'Neutral'
    '''
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    # Hoisted: opinion_lexicon.positive()/negative() re-read the corpus on
    # every call, so per-token membership was O(tokens * lexicon size).
    positive = set(opinion_lexicon.positive())
    negative = set(opinion_lexicon.negative())
    pos_words, neg_words = 0, 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
    for word in tokenized_sent:
        if word in positive:
            pos_words += 1
        elif word in negative:
            neg_words += 1
    # (Unused per-token polarity list `y` removed.)
    if pos_words > neg_words:
        return 'Positive'
    elif pos_words < neg_words:
        return 'Negative'
    return 'Neutral'  # pos_words == neg_words
def tokenize_tree():
    """Tokenize the EDU text of every wsj*.dis RST tree under
    FLAGS.rst_path and write a pretty-printed *.out.dis.tok next to it.
    (Python 2 source: uses `print >> logs`.)"""
    tokenizer = treebank.TreebankWordTokenizer()

    def tokenize_edu(edu_node):
        # In-place visitor: replace the node's text with its tokenized
        # string form (parentheses converted to -LRB-/-RRB- style).
        if edu_node.text:
            edu_node.text = tokenizer.tokenize(edu_node.text,
                                               convert_parentheses=True,
                                               return_str=True)

    for rstf in glob.glob(FLAGS.rst_path + "/*.dis"):
        if rstf.endswith("dis"):  # always true given the glob; kept as-is
            # e.g. ".../wsj_0601.out.dis" -> "wsj_0601"
            basename = rstf.rsplit("/", 1)[1].split(".")[0]
            if basename.startswith("wsj"):
                print >> logs, "Tokenizing", basename
                # Flatten the .dis file to one line for the parser.
                rstlines = " ".join(
                    [line.strip() for line in open(rstf).readlines()])
                rstt = RSTTree.parse(rstlines)
                rstt.postorder_visit(tokenize_edu)
                tgtfile = FLAGS.rst_path + "/" + basename + ".out.dis.tok"
                prettystr = rstt.pretty_str() + "\n"
                open(tgtfile, "w").write(prettystr)
def tokenize_by_treebank_word(text):
    """Split *text* into Penn Treebank-style word tokens."""
    return treebank.TreebankWordTokenizer().tokenize(text)
def run(self):
    """Tokenize all question pairs, compute independent and dependent
    pairwise distance features, persist them as msgpack, and write the
    feature importances of a quick ExtraTrees probe to the output."""
    global _independent_transformers
    # NOTE(review): 'tokenzier' is misspelled but kept as-is — presumably
    # read by self.tokenize.
    self.tokenzier = treebank.TreebankWordTokenizer()
    self.stemmer = snowball.SnowballStemmer('english')
    train_data = rf_dataset.Dataset().load_all(
        'train', as_df=True)[['question1_clean', 'question2_clean']]
    test_data = rf_dataset.Dataset().load_all(
        'test', as_df=True)[['question1_clean', 'question2_clean']]
    all_data = pandas.concat([train_data, test_data], 0)
    all_q1 = list(all_data['question1_clean'])
    all_t1 = list(
        tqdm(multiprocessing.Pool().imap(self.tokenize, all_q1,
                                         chunksize=5000),
             total=len(all_q1),
             desc='Tokenizing: 1'))
    all_q2 = list(all_data['question2_clean'])
    all_t2 = list(
        tqdm(multiprocessing.Pool().imap(self.tokenize, all_q2,
                                         chunksize=5000),
             total=len(all_q2),
             desc='Tokenizing: 2'))
    # Stateless (independent) per-pair distances, computed in parallel.
    all_indep_dists = list(
        tqdm(multiprocessing.Pool().imap(transform,
                                         zip(all_q1, all_q2, all_t1, all_t2),
                                         chunksize=5000),
             total=len(all_q1),
             desc='Computing distances'))
    all_df = pandas.DataFrame(all_indep_dists)
    print('Loading dependent transforms')
    # Transforms that carry heavyweight state (models), run serially.
    dependent_transformers = {
        'word_mover': WordMoverDistance(),
        'sentiment': SentimentDifference()
    }
    print('Finished loading!')
    for name, fn in dependent_transformers.items():
        dist = [
            fn(q1, q2, t1, t2)
            for q1, q2, t1, t2 in tqdm(zip(all_q1, all_q2, all_t1, all_t2),
                                       total=len(all_q1),
                                       desc=name)
        ]
        # Dict-valued transforms expand to one prefixed column per key.
        if isinstance(dist[0], dict):
            frame = pandas.DataFrame.from_dict(dist, orient='columns')
            for col in frame:
                all_df[name + '_' + col] = frame[col]
        else:
            all_df[name] = dist
    self.output().makedirs()
    # First train_data.shape[0] rows are train pairs; the rest are test.
    train_dists = all_df.iloc[:train_data.shape[0]]
    test_dists = all_df.iloc[train_data.shape[0]:]
    train_dists.to_msgpack(_train_loc)
    test_dists.to_msgpack(_test_loc)
    # Sanity probe: clip extreme values and see which distance features an
    # ExtraTrees classifier finds informative for duplicate detection.
    little_cls = ensemble.ExtraTreesClassifier(n_estimators=200, n_jobs=-1)
    little_cls.fit(
        train_dists.clip(-10000, 10000).values,
        rf_dataset.Dataset().load_all('train',
                                      as_df=True).is_duplicate.values)
    print(
        pandas.Series(little_cls.feature_importances_,
                      train_dists.columns).sort_values())
    with self.output().open('w') as f:
        f.write(
            str(
                pandas.Series(little_cls.feature_importances_,
                              train_dists.columns).sort_values()))
        f.write("\n")
def sentiment(body):
    """Sum per-word scores from the word_score table over the tag-stripped
    text of *body*; words missing from the table contribute 0."""
    from nltk.tokenize import treebank
    stripped = Markup(body).striptags()
    tokenizer = treebank.TreebankWordTokenizer()
    # Generator expression: no need to materialize a list inside sum().
    return sum(word_score.get(word.lower(), 0)
               for word in tokenizer.tokenize(stripped))
# --- Script fragment: Liu & Hu style word counting over s1_comments. ---
# NOTE(review): this chunk is truncated — the final `else:` has no body in
# the visible span (presumably y.append(0) follows).
print(s1_comments)
s1_comments = s1_comments.lower()
words = s1_comments.split()
# NOTE(review): `l` is not defined anywhere in this chunk — likely meant
# s1_comments; `letters_only` is also never used afterwards. Confirm.
letters_only = re.sub("[^a-zA-Z]", " ", l)
stops = set(stopwords.words("english"))
meaningful_words = [w for w in words if not w in stops]
print(meaningful_words)
sentence = s1_comments
sentence = ''.join(s1_comments)  # NOTE(review): no-op when s1_comments is a str
tokenizer = treebank.TreebankWordTokenizer()
pos_words = 0
neg_words = 0
tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
x = list(range(len(tokenized_sent)))  # x axis for the plot
y = []
for word in tokenized_sent:
    if word in opinion_lexicon.positive():
        pos_words += 1
        y.append(1)  # positive
    elif word in opinion_lexicon.negative():
        neg_words += 1
        y.append(-1)  # negative
    else:
def neuopinion(sentence):
    """Return the total number of word tokens in *sentence* (serves as the
    denominator alongside posopinion/negopinion counts)."""
    splitter = treebank.TreebankWordTokenizer()
    return len(splitter.tokenize(sentence))
def __init__(self):
    """Pre-load the Liu & Hu opinion lexicons into sets (O(1) membership
    during later scoring) and build a Treebank tokenizer."""
    # Positive and negative word inventories, cached once per instance.
    self.pos, self.neg = (set(opinion_lexicon.positive()),
                          set(opinion_lexicon.negative()))
    self.tok = treebank.TreebankWordTokenizer()