def sentiment(movie):
    """Classify a movie as positive (1) or not (0) from its dialogue lines.

    Every line in ``movie.lines`` is lower-cased, split into word and
    punctuation tokens, and each token is Porter-stemmed before being looked
    up in the NLTK opinion lexicon.

    :param movie: object exposing ``lines``, an iterable of items that each
        have a ``content`` string attribute.
    :returns: ``1`` if more tokens matched the positive lexicon than the
        negative one, otherwise ``0`` (a tie yields ``0``).
    """
    stemmer = PorterStemmer()
    # Build the lookup sets once instead of once per token.
    negative_words = set(opinion_lexicon.negative())
    positive_words = set(opinion_lexicon.positive())

    pos_count = 0
    neg_count = 0
    for line in movie.lines:
        for token in re.findall(r"[\w']+|[.,!?;]", line.content.lower()):
            # NOTE(review): the lexicon stores surface forms, so stems such
            # as "happi" will not match; confirm stemming is really wanted.
            word = stemmer.stem(token)
            if word in negative_words:
                neg_count += 1
            # Fixed: this branch previously re-tested the negative lexicon,
            # so the positive counter could never increase.
            elif word in positive_words:
                pos_count += 1

    return 1 if pos_count > neg_count else 0
def get_nltk_sentiment(sentence, method):
    """Score the sentiment of ``sentence`` with the chosen NLTK method.

    :param sentence: text to analyse.
    :param method: ``'vader'`` for the VADER compound score, or ``'liu'`` for
        a Liu & Hu opinion-lexicon word count.
    :returns: for ``'vader'`` the ``compound`` polarity score; for ``'liu'``
        one of ``'Positive'``, ``'Negative'`` or ``'Neutral'``; ``None`` for
        any other ``method``.
    """
    if method == 'vader':
        analyzer = sentiment.vader.SentimentIntensityAnalyzer()
        return analyzer.polarity_scores(str(sentence))['compound']

    if method != 'liu':
        # Unknown methods yield None, exactly as the original fall-through did.
        return None

    # A tie between positive and negative words is resolved as 'Positive'
    # when the named-entity chunker tags a PERSON in the sentence.
    mentions_person = "PERSON" in str(ne_chunk(pos_tag(word_tokenize(sentence))))

    tokenizer = treebank.TreebankWordTokenizer()
    # Sets give O(1) membership; testing against the corpus view directly
    # re-scans the whole lexicon for every token.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    pos_words = 0
    neg_words = 0
    for token in tokenizer.tokenize(sentence):
        word = token.lower()
        if word in positive_lexicon:
            pos_words += 1
        elif word in negative_lexicon:
            neg_words += 1

    if pos_words > neg_words:
        return 'Positive'
    if pos_words < neg_words:
        return 'Negative'
    return 'Positive' if mentions_person else 'Neutral'
def dlll_pos_neg_ratio(text):
    """Label ``text`` by opinion-word majority and report its polarity ratio.

    The text is tokenised with the Treebank tokenizer and lower-cased; each
    token scores +1 (positive lexicon), -1 (negative lexicon) or 0.

    :param text: text to classify.
    :returns: ``(label, ratio)`` where ``label`` is ``'Positive'``,
        ``'Negative'`` or ``'Neutral'`` and ``ratio`` is the mean token score
        (``0.0`` when the text yields no tokens).
    """
    tokenizer = treebank.TreebankWordTokenizer()
    # Build the lookup sets once; the corpus views are linear to search.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    tokens = [word.lower() for word in tokenizer.tokenize(text)]
    pos_words = 0
    neg_words = 0
    for word in tokens:
        if word in positive_lexicon:
            pos_words += 1
        elif word in negative_lexicon:
            neg_words += 1

    print(pos_words, neg_words)

    # Mean of the +1/-1/0 scores; guard the empty-text case, which used to
    # raise ZeroDivisionError.
    ratio = (pos_words - neg_words) / len(tokens) if tokens else 0.0

    if pos_words > neg_words:
        return ("Positive", ratio)
    if pos_words < neg_words:
        return ("Negative", ratio)
    return ("Neutral", ratio)
def compute_pos_neg_scores(clean_content):
    """Count positive/negative lexicon words per note and pickle the result.

    For note ``i`` the stored value is a two-element float array
    ``[positive_count, negative_count]``. The mapping is written to
    ``pn_lex_score.pkl`` in the current working directory.

    :param clean_content: pre-processed confessions
    :type clean_content: list
    :returns: None
    """
    assert isinstance(clean_content, list)

    # nltk opinion lexicon, materialised as sets for O(1) membership tests
    pos_lex = set(opinion_lexicon.positive())
    neg_lex = set(opinion_lexicon.negative())

    pn_lex_score = defaultdict(int)
    for i, note in tqdm(enumerate(clean_content)):
        score = np.array([0., 0.])
        # Raw string: "\w" in a plain literal is an invalid escape sequence.
        for word in re.sub(r"[^\w]", " ", note).split():
            if word in pos_lex:
                score[0] += 1.
            elif word in neg_lex:
                score[1] += 1.
        pn_lex_score[i] = score

    # Context manager guarantees the file is closed even if pickling fails.
    with open('pn_lex_score.pkl', 'wb') as output:
        pickle.dump(pn_lex_score, output)

    return None
def get_opinion_features(words):
    """
    Build the opinion-lexicon indicator features.

    Every entry of the NLTK negative and positive opinion lexicons becomes a
    key whose value is 1 when that entry is contained in ``words`` and 0
    otherwise.

    If the opinion lexicon has not been downloaded yet, run:
        * import nltk
        * nltk.download('opinion_lexicon')

    :param words: tokens to test the lexicon entries against
    :return: feature_vectors: a dictionary with one 0/1 value per lexicon entry
    """
    lexicons = (opinion_lexicon.negative(), opinion_lexicon.positive())

    feature_vectors = {}
    # Negative entries are inserted first, then positive ones.
    for lexicon in lexicons:
        for entry in lexicon:
            feature_vectors[entry] = 1 if entry in words else 0
    return feature_vectors
def demo_liu_hu_lexicon(sentence):
    """Return the smoothed positive/negative opinion-word ratio of a sentence.

    Both counters start at 0.1 so the ratio is defined even when no negative
    word is present.

    :param sentence: iterable of tokens; they are compared to the lexicon as
        given (no tokenising or lower-casing happens here, so a plain string
        would be scanned character by character).
    :returns: ``(ratio, pos_words, neg_words)`` where both counts include the
        0.1 smoothing offset.
    """
    from nltk.corpus import opinion_lexicon

    # Build the lookup sets once; the corpus views are linear to search.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    pos_words = 0.1
    neg_words = 0.1
    for word in sentence:
        if word in positive_lexicon:
            pos_words += 1
        elif word in negative_lexicon:
            neg_words += 1

    p_n_rat = pos_words / neg_words
    return p_n_rat, pos_words, neg_words
class Liu_Hu_Sentiment:
    """Liu & Hu lexicon sentiment scorer that adds one column to a corpus.

    The score of a document is ``100 * (positive - negative) / tokens``,
    where the token count is clamped to at least 1.
    """

    # Lexicons are loaded once, when the class body is executed.
    positive = set(opinion_lexicon.positive())
    negative = set(opinion_lexicon.negative())

    sentiments = ('sentiment',)
    name = 'Liu Hu'

    def __init__(self):
        super().__init__()

    def transform(self, corpus, copy=True):
        """Append the sentiment score column to ``corpus``.

        :param corpus: corpus providing ``documents``, ``copy()`` and
            ``extend_attributes()``.
        :param copy: when true, extend and return a copy instead of mutating
            the corpus that was passed in.
        :returns: the corpus carrying the new attribute(s).
        """
        tokenize = WordPunctTokenizer()
        tokenized_docs = tokenize(corpus.documents)

        scores = []
        for doc in tokenized_docs:
            n_positive = sum(1 for token in doc if token in self.positive)
            n_negative = sum(1 for token in doc if token in self.negative)
            doc_length = max(len(doc), 1)  # avoid dividing by zero
            scores.append([100 * (n_positive - n_negative) / doc_length])

        X = np.array(scores).reshape((-1, len(self.sentiments)))

        # set compute values
        shared_cv = SharedTransform(self)
        cv = [
            VectorizationComputeValue(shared_cv, column)
            for column in self.sentiments
        ]

        target = corpus.copy() if copy else corpus
        target.extend_attributes(X, self.sentiments, compute_values=cv)
        return target
def get_lexicon():
    """Return the opinion lexicon keyed by class label.

    This lexicon must be used for sentiment classification (opinion):
    Opinion Lexicon (or Sentiment Lexicon) (Hu and Liu, KDD-2004).

    :returns: dict mapping ``0`` to the negative words and ``1`` to the
        positive words.
    """
    positives, negatives = opinion_lexicon.positive(), opinion_lexicon.negative()
    return {0: negatives, 1: positives}
def analyze(string):
    """Report negative words in ``string`` and print a "positivised" rewrite.

    The text is split on whitespace. Each word found in the NLTK negative
    opinion lexicon is passed to ``make_positive``; the returned word replaces
    it in the rewritten text. The summary dictionary is printed, not
    returned.

    Printed keys: ``negative_score`` (percentage of words that are negative,
    ``0.0`` for empty input), ``original_text``, ``positive_text``,
    ``all_negative_words``, ``count``, ``num_words`` and
    ``replaced_negative_words``.

    :param string: text to analyse.
    :returns: None
    """
    words = string.split()
    num_words = len(words)
    # Set membership is O(1); the corpus view would be scanned per word.
    negative_lexicon = set(opinion_lexicon.negative())

    count = 0
    replaced_negative_words = []
    all_negative_words = []
    positive_result = []
    for item in words:
        if item not in negative_lexicon:
            positive_result.append(item)
            continue

        count += 1
        # Fixed: record the negative word itself; the replacement word used
        # to be stored under "all_negative_words".
        all_negative_words.append(item)
        new_word = make_positive(item)
        if new_word != item:  # i.e. the word was changed successfully
            replaced_negative_words.append(item)
        positive_result.append(new_word)

    result = {}
    # Guard empty input, which used to raise ZeroDivisionError.
    result['negative_score'] = (count / num_words) * 100 if num_words else 0.0
    result['original_text'] = string
    result['positive_text'] = " ".join(positive_result)
    result['all_negative_words'] = set(all_negative_words)
    result['count'] = count
    result['num_words'] = num_words
    result['replaced_negative_words'] = set(replaced_negative_words)
    print(result)
def get_senti_lexicon():
    """Build a polarity lexicon from NLTK's opinion lexicon and local lists.

    The returned sets are the intersection of the NLTK opinion lexicon with
    the Hu-Liu word lists stored under ``<parent of cwd>/lexicons``.

    :returns: ``{0: positive_words, 1: negative_words}``.
    """
    # opinion_lexicon
    from nltk.corpus import opinion_lexicon
    opinion_pos = opinion_lexicon.positive()
    opinion_neg = opinion_lexicon.negative()

    # vader_lexicon
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    vader_scores = SentimentIntensityAnalyzer().lexicon
    # threshold may need tuning?
    # NOTE(review): the VADER, finance and Harvard sets below are loaded but
    # do not contribute to the returned lexicon.
    vader_pos = {term for term in vader_scores if vader_scores[term] >= 0.5}
    vader_neg = {term for term in vader_scores if vader_scores[term] <= -0.5}

    lexicon_path = '/'.join(os.getcwd().split('/')[:-1])

    def load(relative_path):
        # Read one word list located relative to the parent of the cwd.
        return get_lexicon(lexicon_path + relative_path)

    # finance lexicon
    finance_pos = load('/lexicons/finance_pos.txt')
    finance_neg = load('/lexicons/finance_neg.txt')
    # hu-liu lexicon
    hu_liu_pos = load('/lexicons/hu_liu_pos.txt')
    hu_liu_neg = load('/lexicons/hu_liu_neg.txt')
    # harvard lexicon
    harvard_neg = load('/lexicons/harvard_neg.txt')

    pos_lexicon = set(opinion_pos) & hu_liu_pos
    neg_lexicon = set(opinion_neg) & hu_liu_neg
    return {0: pos_lexicon, 1: neg_lexicon}
def get_senti_lexicon():
    """Merge several sentiment word lists into positive/negative lexicons.

    Sources: NLTK opinion lexicon, VADER entries with a score of at least
    0.5 in magnitude, and the finance, Hu-Liu and Harvard lists read from
    ``../lexicons``.

    :returns: ``(pos_lexicon, neg_lexicon, senti_lexicon)`` where the last
        item is the union of the first two.
    """
    # opinion_lexicon
    from nltk.corpus import opinion_lexicon
    opinion_pos = opinion_lexicon.positive()
    opinion_neg = opinion_lexicon.negative()

    # vader_lexicon
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    vader_scores = SentimentIntensityAnalyzer().lexicon
    # threshold may need tuning?
    vader_pos = {term for term in vader_scores if vader_scores[term] >= 0.5}
    vader_neg = {term for term in vader_scores if vader_scores[term] <= -0.5}

    # finance lexicon
    finance_pos = get_lexicon('../lexicons/finance_pos.txt')
    finance_neg = get_lexicon('../lexicons/finance_neg.txt')
    # hu-liu lexicon
    hu_liu_pos = get_lexicon('../lexicons/hu_liu_pos.txt')
    hu_liu_neg = get_lexicon('../lexicons/hu_liu_neg.txt')
    # harvard lexicon (negative list only)
    harvard_neg = get_lexicon('../lexicons/harvard_neg.txt')

    pos_lexicon = set(opinion_pos) | vader_pos | finance_pos | hu_liu_pos
    neg_lexicon = (
        set(opinion_neg) | vader_neg | finance_neg | hu_liu_neg | harvard_neg
    )
    return pos_lexicon, neg_lexicon, pos_lexicon | neg_lexicon
def _get_pos_neg_words_count(self, text):
    """Count distinct tokens of ``text`` found in each opinion lexicon.

    :param text: text to tokenise with ``word_tokenize``.
    :returns: ``[positive_count, negative_count]``; every distinct token is
        counted at most once per lexicon.
    """
    unique_tokens = set(word_tokenize(text))
    pos_opinion_count = len(unique_tokens.intersection(opinion_lexicon.positive()))
    neg_opinion_count = len(unique_tokens.intersection(opinion_lexicon.negative()))
    return [pos_opinion_count, neg_opinion_count]
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Count positive and negative words using the Liu and Hu opinion lexicon.

    The sentence is tokenised with the Treebank tokenizer and lower-cased.
    Words that do not appear in the lexicon are considered neutral and are
    not counted.

    :param sentence: a sentence whose opinion words are to be counted.
    :param plot: accepted for backward compatibility; currently ignored.
    :returns: ``(pos_words, neg_words)``.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    # Build the lookup sets once; the corpus views are linear to search.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    pos_words = 0
    neg_words = 0
    for token in tokenizer.tokenize(sentence):
        word = token.lower()
        if word in positive_lexicon:
            pos_words += 1
        elif word in negative_lexicon:
            neg_words += 1

    return pos_words, neg_words
def classifier(self, sentence):
    """Classify ``sentence`` by majority of Liu & Hu opinion words.

    The sentence is tokenised with the Treebank tokenizer and lower-cased
    before lookup.

    :param sentence: text to classify.
    :returns: ``'Positive'``, ``'Negative'`` or ``'Neutral'`` (tie, including
        the case where no opinion word is present).
    """
    tokenizer = treebank.TreebankWordTokenizer()
    # Build the lookup sets once; the corpus views are linear to search.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    pos_words = 0
    neg_words = 0
    for token in tokenizer.tokenize(sentence):
        word = token.lower()
        if word in positive_lexicon:
            pos_words += 1
        elif word in negative_lexicon:
            neg_words += 1

    if pos_words > neg_words:
        return 'Positive'
    if pos_words < neg_words:
        return 'Negative'
    return 'Neutral'
def extract_features(corpus):
    """Compute a feature vector for every utterance except each dialog's last.

    For each dialog a TF-IDF cosine-similarity matrix is built over all of
    its utterance texts; content, structural and sentimental features are
    then concatenated per utterance.

    :param corpus: iterable of dialogs; each dialog is a sequence whose items
        hold the utterance text at index 0 and the user id at index 1.
    :returns: dict mapping ``"<dialog_index>_<utterance_index>"`` to the
        concatenated feature list.
    """
    analyser = SentimentIntensityAnalyzer()
    pos_opinion_words = set(opinion_lexicon.positive())
    neg_opinion_words = set(opinion_lexicon.negative())

    feature_dict = {}
    for dialog_index, dialog in enumerate(corpus):
        texts = [turn[0] for turn in dialog]
        similarity_matrix = cosine_similarity(
            TfidfVectorizer().fit_transform(texts))
        starter_user_id = dialog[0][1]

        # The final utterance of a dialog gets no feature entry.
        for utterance_index, utt_info in enumerate(dialog[:-1]):
            utterance = utt_info[0]
            words = word_tokenize(utterance)

            content = extract_content_features(
                similarity_matrix, utterance_index, utterance)
            structural = extract_structural_features(
                utterance_index, dialog, utt_info, starter_user_id, words)
            sentimental = extract_sentimental_features(
                utterance, utt_info, words, analyser,
                pos_opinion_words, neg_opinion_words)

            key = "{}_{}".format(dialog_index, utterance_index)
            feature_dict[key] = content + structural + sentimental

    return feature_dict
def demo_liu_hu_lexicon(sentence):
    """Classify ``sentence`` by majority of Liu & Hu opinion words.

    The sentence is tokenised with the Treebank tokenizer and lower-cased
    before lookup; words outside the lexicon are neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :returns: ``'Positive'``, ``'Negative'`` or ``'Neutral'`` (tie).
    """
    tokenizer = treebank.TreebankWordTokenizer()
    # Build the lookup sets once; the corpus views are linear to search.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    pos_words = 0
    neg_words = 0
    for token in tokenizer.tokenize(sentence):
        word = token.lower()
        if word in positive_lexicon:
            pos_words += 1
        elif word in negative_lexicon:
            neg_words += 1

    if pos_words > neg_words:
        return 'Positive'
    if pos_words < neg_words:
        return 'Negative'
    return 'Neutral'
def evaluate_sentence(sentence: str) -> bool:
    """Decide whether ``sentence`` is positive using the opinion lexicon.

    The sentence is split on whitespace and every word is lemmatised; the
    lexicon entries are lemmatised the same way. The sentence is positive
    when it contains at least as many distinct positive lemmas as distinct
    negative lemmas (a tie counts as positive).

    :param sentence: text to evaluate.
    :returns: ``True`` for positive, ``False`` for negative.
    """
    lemmatizer = WordNetLemmatizer()

    # Fixed: the lemmatised words used to be concatenated without any
    # separator and then substring-matched, so lexicon entries matched across
    # word boundaries and inside longer words. Whole-word comparison avoids
    # those spurious hits.
    sentence_lemmas = {lemmatizer.lemmatize(word) for word in sentence.split()}

    positive_lemmas = {
        lemmatizer.lemmatize(word) for word in opinion_lexicon.positive()
    }
    negative_lemmas = {
        lemmatizer.lemmatize(word) for word in opinion_lexicon.negative()
    }

    positive_count = len(sentence_lemmas & positive_lemmas)
    negative_count = len(sentence_lemmas & negative_lemmas)
    return positive_count >= negative_count
def demo_liu_hu_lexicon(sentence):
    """
    Slightly modified Liu and Hu opinion-lexicon sentiment classifier.

    Counts the positive and negative words in the sentence and classifies it
    by whichever polarity is more represented. Words that do not appear in
    the lexicon are considered neutral.

    :param sentence: iterable of tokens; they are compared to the lexicon as
        given (no tokenising or lower-casing happens here).
    :returns: ``'Positive'``, ``'Negative'`` or ``'Neutral'`` (tie).
    """
    from nltk.corpus import opinion_lexicon

    # Build the lookup sets once; the corpus views are linear to search.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    pos_words = 0
    neg_words = 0
    for word in sentence:
        if word in positive_lexicon:
            pos_words += 1
        elif word in negative_lexicon:
            neg_words += 1

    if pos_words > neg_words:
        return 'Positive'
    if pos_words < neg_words:
        return 'Negative'
    return 'Neutral'
def gen_word_cloud(self):
    """Render separate word clouds for positive and negative words.

    Word frequencies are taken from the joined ``self.tdf.filt_text`` column;
    each word is assigned a polarity from the NLTK opinion lexicon (positive
    takes precedence when a word is in both lists). The images are written to
    ``output/pos_wordcloud.png`` and ``output/neg_wordcloud.png``.

    :returns: None
    """
    raw_text = ' '.join(self.tdf.filt_text.to_list())
    wc = wordcloud.WordCloud().process_text(raw_text)

    # Sets make each polarity lookup O(1); testing against the corpus views
    # re-scanned the entire lexicon for every word.
    positive_lexicon = set(ol.positive())
    negative_lexicon = set(ol.negative())

    # assign positive/negative connotations to words from nltk corpus
    print(len(wc.keys()))
    pos_dict = {}
    neg_dict = {}
    for h, (word, frequency) in enumerate(wc.items()):
        if h % 10 == 0:
            print(h)  # progress output, kept from the original
        if word in positive_lexicon:
            pos_dict[word] = frequency
        elif word in negative_lexicon:
            neg_dict[word] = frequency

    wordcloud.WordCloud(
        width=800, height=400).generate_from_frequencies(pos_dict).recolor(
            colormap='Greens').to_file('output/pos_wordcloud.png')
    wordcloud.WordCloud(
        width=800, height=400).generate_from_frequencies(neg_dict).recolor(
            colormap='Reds').to_file('output/neg_wordcloud.png')
def predict(self, X, binary=None):
    """Predict sentiment labels for the reviews in ``X``.

    When ``self.classes_`` does not have an integer dtype the call is
    delegated to ``self._predict``. Otherwise a Hu and Liu lexicon classifier
    is used: each review is split on single spaces and the number of lexicon
    entries present in it is compared per polarity.

    :param X: iterable of review strings.
    :param binary: used only while ``self.binary`` is still ``None``; it then
        becomes the stored mode (``False`` when omitted).
    :returns: integer array. Binary mode: 1 (positive) or 0. Otherwise:
        2 (positive), 0 (negative) or 1 (tie).
    """
    if str(self.classes_.dtype)[:3] != 'int':
        return self._predict(X, binary)

    # Set membership replaces a linear scan of every review per lexicon entry.
    tokenised_reviews = [set(review.split(" ")) for review in X]

    if self.binary is None:
        self.binary = False if binary is None else binary
    # Equality (not truthiness) is kept so non-bool flags behave as before.
    binary_mode = self.binary == True  # noqa: E712

    # implement sentiment analyser using lexicon from Hu and Liu
    # Load each lexicon once instead of once per review.
    positive_lexicon = list(opinion_lexicon.positive())
    negative_lexicon = list(opinion_lexicon.negative())

    predicted_sentiment = []
    for review_tokens in tokenised_reviews:
        pos_words = sum(token in review_tokens for token in positive_lexicon)
        neg_words = sum(token in review_tokens for token in negative_lexicon)

        if binary_mode:
            label = 1 if pos_words > neg_words else 0
        elif pos_words > neg_words:
            label = 2
        elif pos_words < neg_words:
            label = 0
        else:
            label = 1
        predicted_sentiment.append(label)

    return np.array(predicted_sentiment).astype(int)
def prepare_lexicon(process=True, dim=250, save=False):
    """Return the most frequent positive and negative opinion words.

    :param process: when false, read the word list back from the file
        ``senti.lexicon``; when true, rebuild it from the datasets.
    :param dim: maximum number of words kept per polarity.
    :param save: when true (and ``process`` is true), write the selected
        words to ``senti.lexicon``, one per line.
    :returns: list of words, positive ones first, then negative ones.
    """
    if not process:
        with open('senti.lexicon', 'r') as f:
            return [line.strip() for line in f]

    data = DatasetManager().prepare_datasets()
    nega = set(opinion_lexicon.negative())
    posi = set(opinion_lexicon.positive())

    # Frequency of each lexicon word across both datasets.
    lexicon_dic = dict.fromkeys(opinion_lexicon.words(), 0)
    for source in ('vader', 'sentiment140'):
        for tokens in data[source]['text']:
            for w in tokens:
                if w in lexicon_dic:
                    lexicon_dic[w] += 1

    negatives = []
    positives = []
    for w, _ in Counter(lexicon_dic).most_common(4000):
        # A word present in both lists is treated as negative.
        if w in nega:
            negatives.append(w)
        elif w in posi:
            positives.append(w)

    selected = positives[:dim] + negatives[:dim]
    if save:
        with open('senti.lexicon', 'w') as f:
            f.writelines(word + '\n' for word in selected)
    return selected
def prepare_lexicon(corpus, embedding, num=250, extra=False):
    """Select frequent opinion words covered by the embedding, with weights.

    Lexicon words are restricted to ``embedding.vocab``, counted over
    ``corpus`` and ranked by frequency. The top ``num`` positive and top
    ``num`` negative words are returned together with weights equal to each
    word's share of its polarity's total count.

    :param corpus: iterable of tokenised sentences (iterables of words).
    :param embedding: object exposing an iterable ``vocab``.
    :param num: maximum number of words kept per polarity.
    :param extra: when true, append the ``Extra_Lexicon`` words found in the
        vocabulary, each with weight 1.0.
    :returns: ``(words, weights)``; positive words first, then negative ones,
        then the extra words if requested.
    """
    vocabulary = set(embedding.vocab)
    neg = set(opinion_lexicon.negative())
    pos = set(opinion_lexicon.positive())

    lexicon_dic = {w: 0 for w in opinion_lexicon.words() if w in vocabulary}
    for sent in corpus:
        for w in sent:
            if w in lexicon_dic:
                lexicon_dic[w] += 1

    N, N_count = [], []
    P, P_count = [], []
    for word, count in Counter(lexicon_dic).most_common(5000):
        # A word present in both lists is treated as negative.
        if word in neg:
            N.append(word)
            N_count.append(count)
        elif word in pos:
            P.append(word)
            P_count.append(count)

    def normalise(counts):
        # Share of each count in the total. An all-zero total (e.g. an empty
        # corpus) used to raise ZeroDivisionError; it now yields 0.0 weights.
        total = sum(counts)
        if not total:
            return [0.0 for _ in counts]
        return [c * 1.0 / total for c in counts]

    Senti_L = P[:num] + N[:num]
    Senti_W = normalise(P_count[:num]) + normalise(N_count[:num])

    if extra:
        Extra_L = [w for w in Extra_Lexicon if w in vocabulary]
        Extra_W = [1.0 for _ in Extra_L]
        return Senti_L + Extra_L, Senti_W + Extra_W
    return Senti_L, Senti_W
def __init__(self):
    """Load the opinion lexicons and create the tweet enricher.

    The synonym lexicons start out empty.
    """
    self.RATIO = 1.2
    self.pos_lexicon, self.neg_lexicon = (
        opinion_lexicon.positive(),
        opinion_lexicon.negative(),
    )
    self.neg_synonyms_lexicon, self.pos_synonyms_lexicon = [], []
    self.enricher = TW.TweetEnricher()
def pos_neg_fraction_with_negation(text):
    """
    Compute the fraction of positive and negative words in a text, treating
    a negated word (token ending in "_NEG") as having the opposite polarity.

    :param text: input text
    :return: a pair (positive fraction, negative fraction); both are 0.0
        when the text produces no tokens
    """
    # Already known positive and negative words
    positive_words = set(opinion_lexicon.positive())
    negative_words = set(opinion_lexicon.negative())

    # Positive words plus negated negative words
    all_positive_words = positive_words | {
        tag + "_NEG" for tag in negative_words}
    # Negative words plus negated positive words
    all_negative_words = negative_words | {
        tag + "_NEG" for tag in positive_words}

    tokens = tokenize_with_negation(text)
    count_all = len(tokens)
    if count_all == 0:
        # avoid division by zero
        return 0., 0.

    # The two tallies are independent: a token may count towards both.
    count_pos = sum(1 for token in tokens if token in all_positive_words)
    count_neg = sum(1 for token in tokens if token in all_negative_words)
    return count_pos / count_all, count_neg / count_all
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Polarity score of a sentence using the Liu and Hu opinion lexicon.

    The sentence is tokenised with the Treebank tokenizer and lower-cased.
    Words that do not appear in the lexicon are considered neutral and do not
    affect the score.

    :param sentence: a sentence whose polarity has to be scored.
    :param plot: accepted for backward compatibility; currently ignored.
    :returns: ``(pos - neg) / (pos + neg)`` as a float, or ``0`` when the
        sentence contains no opinion word.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    # Build the lookup sets once; the corpus views are linear to search.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    pos_words = 0
    neg_words = 0
    for token in tokenizer.tokenize(sentence):
        word = token.lower()
        if word in positive_lexicon:
            pos_words += 1
        elif word in negative_lexicon:
            neg_words += 1

    opinion_total = pos_words + neg_words
    if opinion_total > 0:
        return (pos_words - neg_words) / float(opinion_total)
    return 0
def negopinion(sentence):
    """Count the negative opinion words in ``sentence``.

    The sentence is tokenised with the Treebank tokenizer and lower-cased
    before being compared with the NLTK negative opinion lexicon.

    :param sentence: text to scan.
    :returns: number of tokens (repeats included) found in the lexicon.
    """
    tokenizer = treebank.TreebankWordTokenizer()
    # Build the lookup set once; the corpus view is linear to search.
    negative_lexicon = set(opinion_lexicon.negative())
    return sum(
        1
        for token in tokenizer.tokenize(sentence)
        if token.lower() in negative_lexicon
    )
def opinion_lex(tokenizer, utterance):
    """Count positive and negative opinion words in an utterance.

    :param tokenizer: object with a ``tokenize(text)`` method.
    :param utterance: object whose ``utterance`` attribute holds the text.
    :returns: ``(pos, neg)``; tokens are compared as produced by the
        tokenizer, and a token found in both lexicons counts towards both.
    """
    # Build the lookup sets once instead of scanning both corpus views for
    # every token.
    positive_lexicon = set(opinion_lexicon.positive())
    negative_lexicon = set(opinion_lexicon.negative())

    pos = 0
    neg = 0
    for word in tokenizer.tokenize(utterance.utterance):
        pos += word in positive_lexicon
        neg += word in negative_lexicon
    return pos, neg
def __init__(self):
    """
    constructor

    Interactively choose the test data: either two local files with one
    sentence per line, or the first ten positive and negative sentences of
    the NLTK ``sentence_polarity`` corpus. The lexicon attributes are only
    set when the corpus option is chosen.
    """
    affirmative = ('y', 'yes')
    self.positive_sentences = []
    self.negative_sentences = []

    response1 = input(
        'Would you want to test sentiment with a local text data? (Y/N) ')
    if response1.lower() in affirmative:
        positive_file = input(
            'Input the path for the positive sentiment data: ')
        negative_file = input(
            'Input the path for the negative sentiment data: ')

        # A path that does not exist leaves the matching list empty.
        if os.path.exists(positive_file):
            # read positive sentences
            with open(positive_file, "r") as reader:
                raw_lines = reader.readlines()
            self.positive_sentences = [sent.rstrip() for sent in raw_lines]
        if os.path.exists(negative_file):
            # read negative sentences
            with open(negative_file, "r") as reader:
                raw_lines = reader.readlines()
            self.negative_sentences = [sent.rstrip() for sent in raw_lines]
        return

    # use 5331 positive sentences and 5331 negative sentences as testing data
    # since this requires a huge amount of lexica, so this part is not implemented
    response2 = input(
        'Would you want to test sentiment with data in sentence_polarity? (Y/N) ')
    if response2.lower() not in affirmative:
        return

    # negative words
    self.negative_lexica = opinion_lexicon.negative()
    self.negative_lexica_size = len(self.negative_lexica)
    # positive words
    self.positive_lexica = opinion_lexicon.positive()
    self.positive_lexica_size = len(self.positive_lexica)
    # sentence sentiment categories
    self.senti_categories = sentence_polarity.categories()

    # negative sentiment sentences (the first 10 only)
    first_negative = sentence_polarity.sents(categories=['neg'])[:10]
    self.negative_sentences = [' '.join(sent) for sent in first_negative]
    self.negative_sentences_size = len(self.negative_sentences)

    # positive sentiment sentences (the first 10 only)
    first_positive = sentence_polarity.sents(categories=['pos'])[:10]
    self.positive_sentences = [' '.join(sent) for sent in first_positive]
    self.positive_sentences_size = len(self.positive_sentences)
def __init__(self):
    """Load the tokenizer, lexicons, word lists and n-gram vectorizers.

    The word lists are read from ``../Data/Lists`` relative to the current
    working directory, one entry per line.
    """

    def read_list(path, lower=False):
        # Read one entry per line, dropping the trailing newline. The context
        # manager closes the file; the original left every handle open.
        with open(path) as handle:
            entries = [line.rstrip('\n') for line in handle]
        return [entry.lower() for entry in entries] if lower else entries

    self.tokenizer = TweetTokenizer()
    self.stemmer = PorterStemmer()
    self.stopset = set(stopwords.words('english'))
    self.negative_opinions = opinion_lexicon.negative()
    self.positive_opinions = opinion_lexicon.positive()

    self.brexit_keywords = read_list('../Data/Lists/BrexitKeywords')
    self.vulgar_words = read_list('../Data/Lists/VulgarWordsList', lower=True)
    self.twitter_jargons = read_list(
        '../Data/Lists/TwitterSlangsAndAbbreviations')
    self.web_abbreviations = read_list(
        '../Data/Lists/WebAcronymns', lower=True)
    self.emoticons_list = read_list('../Data/Lists/EmojiList')
    self.pos_emoticons_list = read_list('../Data/Lists/PositiveEmojiList')
    self.neg_emoticons_list = read_list('../Data/Lists/NegativeEmojiList')
    self.first_person_pronouns = read_list(
        '../Data/Lists/FirstPersonPronouns')
    self.speech_act_verbs = read_list('../Data/Lists/StemmedSpeechActVerbs')
    self.trusted_domains = read_list('../Data/Lists/TrustedDomains')

    self.verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    self.n_gram_count_matrix = {}

    # Uni- to tri-gram counts; stop words also cover web abbreviations,
    # punctuation and a few Twitter-specific tokens.
    self.vectorizer = CountVectorizer(
        ngram_range=(1, 3),
        tokenizer=self.tokenizer.tokenize,
        stop_words=list(self.stopset) + self.web_abbreviations +
        list(string.punctuation) +
        ["…", "...", "..", ")", "(", "-->", "->", ">>", "#", "RT", "@"])
    self.vectorizer_unigram = CountVectorizer(
        ngram_range=(1, 1),
        tokenizer=self.tokenizer.tokenize,
        stop_words=list(self.stopset) + self.web_abbreviations +
        list(string.punctuation))

    self.positive_ops = [x.lower() for x in self.positive_opinions]
    self.negative_ops = [x.lower() for x in self.negative_opinions]
def __init__(self):
    """Set up the VADER analyser, the opinion lexicons and file logging.

    Logging is configured through ``logging.basicConfig`` to write INFO
    records to ``feature.log``.
    """
    super().__init__()
    self.sentiment_analyzer = SentimentIntensityAnalyzer()

    negative_words = opinion_lexicon.negative()
    self.negative_lexicon = list(negative_words)
    positive_words = opinion_lexicon.positive()
    self.positive_lexicon = list(positive_words)

    log_settings = {
        "filename": "feature.log",
        "filemode": "w+",
        "level": logging.INFO,
    }
    logging.basicConfig(**log_settings)
    self.logger = logging.getLogger("info")
def __init__(self):
    """Load the opinion lexicons, their sizes and the polarity categories."""
    # negative words
    negative_lexica = opinion_lexicon.negative()
    self.negative_lexica = negative_lexica
    self.negative_lexica_size = len(negative_lexica)

    # positive words
    positive_lexica = opinion_lexicon.positive()
    self.positive_lexica = positive_lexica
    self.positive_lexica_size = len(positive_lexica)

    # sentence sentiment categories
    self.senti_categories = sentence_polarity.categories()
def count_words(self, sentence, positive=False):
    """Count the opinion words of one polarity in ``sentence``.

    :param sentence: text to tokenise with ``word_tokenize``.
    :param positive: use the positive lexicon when true, otherwise the
        negative one.
    :returns: number of tokens (repeats included) found in that lexicon.
    """
    load_lexicon = opinion_lexicon.positive if positive else opinion_lexicon.negative
    lex = set(load_lexicon())
    return sum(1 for token in word_tokenize(sentence) if token in lex)
def getNegativeWords(sentence):
    """Return the negative opinion words that occur in ``sentence``.

    The sentence is tokenised with the Treebank tokenizer and lower-cased
    before lookup.

    :param sentence: text to scan.
    :returns: list of matching lower-cased tokens in order of appearance,
        repeats included.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    # Build the lookup set once; the corpus view is linear to search.
    negative_lexicon = set(opinion_lexicon.negative())

    lowered = (token.lower() for token in tokenizer.tokenize(sentence))
    return [word for word in lowered if word in negative_lexicon]
def wordSentenceContainsOpinionatedWords(review_spacy):
    """Tell whether any token of the review is an opinion-lexicon word.

    :param review_spacy: iterable of tokens exposing their text as ``orth_``.
    :returns: ``1`` if at least one token text is in the positive or negative
        opinion lexicon, otherwise ``0``.
    """
    # Build the combined lookup set once instead of scanning both corpus
    # views for every token.
    opinion_words = set(opinion_lexicon.positive())
    opinion_words.update(opinion_lexicon.negative())

    # Fixed: the negative test used to compare the token object itself (not
    # its text) with the lexicon strings, so negative words never matched.
    for word in review_spacy:
        if word.orth_ in opinion_words:
            return 1
    return 0
# NOTE(review): Python 2 script fragment (uses print statements). `ol`,
# `words`, `text`, `pos_sentiment`, `neg_sentiment` and `TextBlob` are not
# defined in this excerpt; presumably they come from elsewhere in the file.
name = 'Prabhat Saini'


def getpersonaldetails() :
    """Return the hard-coded profile as (name, sex, dob, personality)."""
    name = 'Prabhat Saini'
    sex = 'Male'
    dob = '30th November 1991'
    personality = 'ENTP'
    return (name, sex, dob, personality)


def getdetails() :
    """Unpack the profile tuple returned by getpersonaldetails()."""
    # NOTE(review): the original indentation was lost, so it is unclear
    # whether the statements below this function belong to its body.
    name, sex, dob, personality = getpersonaldetails()


##check for positive terms in the text
# Materialise both opinion lexicons as plain lists.
pos_words = [word for word in ol.positive()]
##print pos_words
neg_words = [word for word in ol.negative()]
##print neg_words

##create positive and negative word indices and use as dictionary
# The two tests are independent, so a word may be recorded in both mappings.
for word in words :
    if word in pos_words :
        pos_sentiment[word] = 'positive'
    if word in neg_words :
        neg_sentiment[word] = 'negative'

##print words
print pos_sentiment
print neg_sentiment

##proceed to sentiment analysis using textblob naivebayes
text_blob_pattern = TextBlob(text)
wordCount=0 BOW = set() #these are maps of word with corresponding counts BOW1 = {} negBOW = {} posBOW = {} positiveWords={} negativeWords={} XMap={'A':'B'} #Map of X for all the documents stemmer = PorterStemmer() weights=[] positiveWords={} negativeWords={} negatives=opinion_lexicon.negative() positives=opinion_lexicon.positive() unit_step = lambda x: 0 if x < 0 else 1 class Perceptron: stopwords = nltk.corpus.stopwords.words('english') class TrainSplit: """Represents a set of training/testing data. self.train is a list of Examples, as is self.test. """ def __init__(self): self.train = [] self.test = [] class Example: """Represents a document with a label. klass is 'pos' or 'neg' by convention. words is a list of strings.