def voc_sentiword():
    """Construct the vocabulary based on words with the biggest opinion score in Sentiword.

    Scans every SentiWordNet synset, keeps the first (word, POS) entry whose
    positive or negative score reaches ``threshold``, then hands the entries
    to ``select_voc`` ordered by their strongest polarity, highest first.

    Returns
    -------
    None
        The vocabulary is constructed via ``select_voc``.
    """
    scores = {}
    for senti in swn.all_senti_synsets():
        parts = str(senti).split('.')
        # Entries whose string form carries too many dot-separated pieces
        # (e.g. abbreviations inside the lemma) are skipped.
        if len(parts) >= 6:
            continue
        word = parts[0][1:]
        # When POS tagging is disabled, collapse every tag to 'x'.
        tag = parts[1] if pos_bool else 'x'
        entry = word + '.' + tag
        pos, neg = senti.pos_score(), senti.neg_score()
        # Only the first sense of each (word, tag) pair is recorded.
        if entry not in scores and (pos >= threshold or neg >= threshold):
            scores[entry] = (pos, neg)
    # Ascending stable sort followed by reverse() keeps the original
    # ordering among tied scores.
    ranked = sorted(scores.items(), key=lambda kv: max(kv[1]))
    ranked.reverse()
    select_voc(ranked)
def swn_lexicon(self):
    """Build a word-level sentiment lexicon from SentiWordNet and dump it as JSON.

    Every synset's scores are grouped under their "lemma.pos" prefix (the
    two-digit sense number is dropped), the positive/negative scores of all
    senses are averaged, and only entries with a non-zero mean score are
    written -- sorted by key -- to ``self.setup.file_swn``.
    """
    lexicon_data = {}
    temp_lexicon = {}
    # Collect the scores of every sense under its "lemma.pos" key.
    for synset in swn.all_senti_synsets():
        # str(synset) looks like "<lemma.pos.NN: PosScore=.. NegScore=..>";
        # strip the leading '<', the score text after ':', and the ".NN".
        synset_full = str(synset).strip('<').split(':')[0][:-3]
        # NOTE(review): the original also derived a POS tag and mapped the
        # satellite-adjective tag 's' to 'a', but never used the result, so
        # keys still keep the raw tag. That dead code is removed here;
        # output is unchanged. Confirm whether merging 's' into 'a' was the
        # actual intent.
        if synset_full not in temp_lexicon:
            temp_lexicon[synset_full] = {
                'positive': [synset.pos_score()],
                'negative': [synset.neg_score()]
            }
        else:
            temp_lexicon[synset_full]['positive'].append(
                synset.pos_score())
            temp_lexicon[synset_full]['negative'].append(
                synset.neg_score())
    # Keep only words that carry some sentiment on average.
    for word in temp_lexicon:
        if mean(temp_lexicon[word]['positive']) > 0 or mean(
                temp_lexicon[word]['negative']) > 0:
            lexicon_data[word] = {
                'positive': mean(temp_lexicon[word]['positive']),
                'negative': mean(temp_lexicon[word]['negative'])
            }
    with open(self.setup.file_swn, 'w') as f:
        # Emit entries sorted by word so the file is deterministic.
        lexicon = {word: lexicon_data[word] for word in sorted(lexicon_data)}
        json.dump(lexicon, f)
"""
Python file to understand properties of libraries.
"""
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import nltk
import enchant
import logging
from datetime import datetime

# Wall-clock start, presumably for timing the whole exploration run.
startTime = datetime.now()
# Folder holding Bing Liu's opinion-lexicon text files.
BING_LIU_DATA_PATH = 'data/bingliu_lexicon'

#Does sentiwordnet have words with '_' or multi_words?
# Collect every SentiWordNet lemma containing an underscore (WordNet joins
# multi-word expressions with '_' in synset names). Python 2 script.
swn_all_words = swn.all_senti_synsets()
swn_words = []
print "\nSWN"
for word in swn_all_words:
    # synset name is "lemma.pos.NN"; keep only the lemma part.
    word_name = word.synset.name().split('.')[0]
    if '_' in word_name:
        swn_words.append(word_name)
print str(len(swn_words)) + str(swn_words[:10])

#What about Bing Liu?
logging.info(__name__ + " - " + "\nBing Liu")
words = []
with open(BING_LIU_DATA_PATH + "/positive-words.txt", 'r') as bing_pos_file:
    for line in bing_pos_file:
        # NOTE(review): the chunk ends here -- the processing of `w`
        # presumably continues past this view.
        w = str(line)
import re
import math
import nltk
# Fetch the NLTK resources this module depends on (no-ops once cached).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('sentiwordnet')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn
from rake_nltk import Metric, Rake
import tfidf

# Pull the first synset to force the lazy SentiWordNet corpus to load
# eagerly at import time rather than on first real use.
next(swn.all_senti_synsets())

# NOTE(review): `logging` and `redis` are used below but no import for
# them is visible in this chunk -- confirm they are imported elsewhere
# in the file.
logging.basicConfig(filename='process.log', filemode='a', level='INFO',
                    format='%(asctime)s - %(levelname)s - %(message)s')
redis_client = redis.StrictRedis(host='localhost', port=6379, db=0)


def compute_term_frequency(word_dict, bow):
    """Return the term frequency of each word: raw count divided by the
    bag-of-words size.

    word_dict -- mapping of word -> raw occurrence count
    bow       -- the tokenized document; its length normalises the counts
    """
    tf_dict = {}
    bow_count = len(bow)
    for word, count in word_dict.items():
        tf_dict[word] = count/float(bow_count)
    return tf_dict


# NOTE(review): this definition is cut off at the end of the chunk;
# its body continues beyond this view.
def compute_inverse_data_frequency(doc_list):
# generate LDA model from the (externally built) corpus and dictionary.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=4,
                                           id2word=dictionary, passes=20)
print("LDA............")
topics = ldamodel.print_topics(num_topics=3, num_words=5)
for topic in topics:
    print(type(topic))
    print(topic)

print("LSA.................")
#id2word = gensim.corpora.Dictionary.load_from_text("c:\lda_test.txt")
lsi = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary)
from nltk.corpus import sentiwordnet as swn
topics = lsi.print_topics(5)
for topic in topics:
    print(topic[1])
    # NOTE(review): topic[1] is a whole topic formula string, not a single
    # word, so this lookup almost certainly yields no synsets -- confirm
    # whether individual topic terms were meant to be looked up instead.
    print(swn.senti_synsets(topic[1]))
    print("----------------------------------------")

#print(list(swn.senti_synsets('slow')))
# BUG FIX: swn.senti_synsets() returns an iterable of SentiSynset objects,
# not a single synset, so calling .neg_score() on the raw result raised
# AttributeError; score the first sense instead.
happy = list(swn.senti_synsets('happy'))
if happy:
    print(happy[0].neg_score())
# NOTE(review): `all` shadows the builtin of the same name; kept because
# later (unseen) code in this file may reference it.
all = swn.all_senti_synsets()
#print(all)
def extract_new_concepts(): """ Extracts new concepts using SentiWordNet(SWN) and Bing Liu's Opinion lexicon. Also adding few manually picked up concepts. Arguments: None Returns: List of new concepts """ startTime = datetime.now() current_concepts = [key for (key, value) in senticnet.iteritems()] logging.info("Currently Available Concepts: (sample)") logging.info(str(current_concepts[:10])) bing_negative_words = [] bing_positive_words = [] swn_negative_words = [] swn_positive_words = [] new_neg_words, new_pos_words = [], [] #Section 1: code to extract concepts from SWN. #Call preprocess for every word encountered. logging.info("Extracting from SWN") swn_all_words = swn.all_senti_synsets() i, j = 0, 0 for word in swn_all_words: """if i >=5 and j>=5: break""" word_name = word.synset.name().split('.')[0] if word.pos_score() > word.neg_score(): w = preprocess(word_name) if w and w is not '': swn_positive_words.append(w) #i+=1 else: w = preprocess(word_name) if w and w is not '': swn_negative_words.append(w) #j+=1 #include only if they are not available in knowledge base of senticnet logging.info("Checking SenticNet...") # Running time O(n^2). Better solution below. """ for x in xrange(len(swn_positive_words)): if swn_positive_words[x] not in current_concepts: new_pos_words.append(swn_positive_words[x]) for x in xrange(len(swn_negative_words)): if swn_negative_words[x] not in current_concepts: new_neg_words.append(swn_negative_words[x]) """ #Running time O(n*logn) logging.info("Positive Words") new_pos_words = list(set(swn_positive_words)-set(current_concepts)) logging.info("Negative Words") new_neg_words = list(set(swn_negative_words)-set(current_concepts)) print "Sample SWN: \tTotal Length: ", len(new_pos_words), len(new_neg_words) print new_pos_words[:10] print new_neg_words[:10] #Section 2: code to extract concepts from Bing Liu's Opinion lexicon. 
logging.info("Extracting from Bing Liu") i=0 with open(BING_LIU_DATA_PATH + "/positive-words.txt", 'r') as bing_pos_file: for line in bing_pos_file: if i==1: break w = preprocess(line) if w is not '': bing_positive_words.append(w) i+=1 i=0 with open(BING_LIU_DATA_PATH + "/negative-words.txt", 'r') as bing_neg_file: for line in bing_neg_file: if i==1: break w = preprocess(line) if w is not '': bing_negative_words.append(w) i+=1 #include only if they are not available in knowledge base of senticnet logging.info("Checking SenticNet...") # Running time O(n^2). Better solution below. """ for x in xrange(len(bing_positive_words)): if bing_positive_words[x] not in current_concepts: new_pos_words.append(bing_positive_words[x]) for x in xrange(len(bing_negative_words)): if bing_negative_words[x] not in current_concepts: new_neg_words.append(bing_negative_words[x]) """ #unique concepts #Running time O(n*logn) logging.info("Positive Words") bing_new_pos_words = list(set(bing_positive_words)-set(current_concepts)) logging.info("Negative Words") bing_new_neg_words = list(set(bing_negative_words)-set(current_concepts)) """ print "Sample Bing Liu: Length: ", len(bing_new_pos_words), len(bing_new_neg_words) print bing_new_pos_words print bing_new_neg_words """ new_pos_words+=bing_new_pos_words new_neg_words+=bing_new_neg_words #store them in file. with open(OUTPUT_BASE_PATH + '/new_positive_words.txt', 'w+') as out_posi_file: for word in new_pos_words: out_posi_file.write("%s\n" %word) with open(OUTPUT_BASE_PATH + '/new_negative_words.txt', 'w+') as out_neg_file: for word in new_neg_words: out_neg_file.write("%s\n" %word) #startTime = datetime.now() logging.error("Time to execute extract_new_concepts.extract_new_concepts(): {0}".format(datetime.now() - startTime))
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

pos_words = []
neg_words = []
# Single pass over SentiWordNet, routing each synset by its dominant
# polarity. The original iterated the whole corpus twice (once per file);
# the per-file output is identical either way. "a+" preserves the original
# append-mode behaviour: re-runs extend the files rather than replace them,
# and the context managers guarantee both files are closed.
with open("pos_dictionary_name.txt", "a+") as pos_file, \
        open("neg_dictionary_name.txt", "a+") as neg_file:
    for ss in swn.all_senti_synsets():
        # name() is "lemma.pos.NN"; [:-5] drops the trailing ".p.NN"
        # (assumes a one-char POS tag and a two-digit sense number, as
        # WordNet names it).
        if ss.pos_score() > ss.neg_score():
            pos_words.append(ss.synset)
            pos_file.write(ss.synset.name()[:-5])
            pos_file.write("\n")
        elif ss.neg_score() > ss.pos_score():
            neg_words.append(ss.synset)
            neg_file.write(ss.synset.name()[:-5])
            neg_file.write("\n")
#!usr/bin/env python3
# -*- coding: utf-8 -*-
'get subject-score > 0.5 words from sentiwordnet'
from nltk.corpus import sentiwordnet as swn
import json

positive = {}
negative = {}
# Walk every SentiWordNet entry and keep the strongly polar ones (> 0.5).
for synset in swn.all_senti_synsets():
    # Slice the synset label out of its repr text (same [13:-7] window the
    # original relied on).
    label = repr(synset)[13:-7]
    if synset.pos_score() > 0.5:
        positive[label] = synset.pos_score()
    if synset.neg_score() > 0.5:
        negative[label] = synset.neg_score()

# Dump each polarity table as JSON text.
with open('basic_pos_words.txt', 'w', encoding='utf8') as handle:
    handle.write(json.dumps(positive))
with open('basic_neg_words.txt', 'w', encoding='utf8') as handle:
    handle.write(json.dumps(negative))
print('complete.')