def findScoreWord(word, dType, swn_filename='SentiWordNet_3.0.0_20130122.txt'):
    """Average SentiWordNet objectivity score over the tokens of a phrase.

    word         -- phrase to score; punctuation is replaced by spaces and the
                    text is lower-cased before tokenizing.
    dType        -- part-of-speech constant (module-level NN, ADJ, VB or ADV)
                    selecting the SentiWordNet POS category to query.
    swn_filename -- path to the SentiWordNet data file (generalized from the
                    previously hard-coded constant; default unchanged).

    Returns the mean obj_score of the first synset of each token, or 0 when
    there are no tokens or dType is not one of the known constants.
    """
    swn = SentiWordNetCorpusReader(swn_filename)
    # Strip punctuation so tokens split cleanly, then normalize case.
    word = re.sub('[%s]' % re.escape(string.punctuation), ' ', word)
    word = word.lower()
    tokens = word.split()
    if not tokens:
        return 0
    # Map module-level POS constants to SentiWordNet POS letters.
    pos_map = {NN: 'n', ADJ: 'a', VB: 'v', ADV: 'r'}
    pos = pos_map.get(dType)
    if pos is None:
        # Unknown POS: the original fell through to a NameError swallowed by
        # a bare except and returned 0; keep that result, explicitly.
        return 0
    wS = 0
    for w in tokens:
        synsets = swn.senti_synsets(w, pos)
        # Only skip tokens with no synsets/scores; the original bare except
        # hid every other error as well.
        try:
            wS += synsets[0].obj_score
        except (IndexError, AttributeError):
            continue
    return wS / len(tokens)
def __init__(self, traing_data_fileP1='mood_traing_p1.dat', traing_data_fileP2='mood_traing.dat', data_file='tweets_raw.dat'):
    # Wire up the two-phase mood classifiers, the language detector and the
    # SentiWordNet reader used to pre-label raw tweets.
    if self.sentiwordnet:
        print "using sentiwordnet dictionary"
    else:
        print "not using sentiwordnet dictionary"
    self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
    self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)
    self.langClassifier = LangDetect(supportedLangs)
    self.training_data_p1 = MoodDetectTrainData()
    self.training_data_p2 = MoodDetectTrainData()
    # Open once to count the rows, then reopen so classification starts
    # reading from the beginning of the file again.
    self.tweetsFile = open(
        os.path.join(os.curdir, os.path.normpath('../data/' + data_file)),
        'rb')
    self.countRows(self.tweetsFile)
    self.tweetsFile = open(
        os.path.join(os.curdir, os.path.normpath('../data/' + data_file)),
        'rb')
    # Per-language caps on how many tweets feed the training set.
    self.limit['en'] = 300000
    self.limit['default'] = 10000
    self.count = 0
    swn_filename = '../dict/sentiwordnet/' + conf.SENTIWORDNET_DICT_FILENAME
    self.swn = SentiWordNetCorpusReader(swn_filename)
def __init__(self):
    """Load the MultiWordNet database and the SentiWordNet reader, then pick
    a sentence splitter: Tanl when TANL_EMAIL is configured, the simple one
    otherwise."""
    base = os.path.join("data", "mwnet.db")
    self.mwnet = MWNet(base)
    self.swn = SentiWordNetCorpusReader(os.path.join("data", "SentiWordNet_3.0.0.txt"))
    self.splitter = TanlSplitter() if os.getenv('TANL_EMAIL') else SimpleSplitter()
def __init__(self,traing_data_fileP1='mood_traing_p1.dat',traing_data_fileP2='mood_traing.dat',data_file='tweets_raw.dat'):
    # Wire up the two-phase mood classifiers, the language detector and the
    # SentiWordNet reader used to pre-label raw tweets.
    if self.sentiwordnet:
        print "using sentiwordnet dictionary"
    else:
        print "not using sentiwordnet dictionary"
    self.clsP1 = MoodDetectTrainer(data_file = traing_data_fileP1)
    self.clsP2 = MoodDetectTrainer(data_file = traing_data_fileP2)
    self.langClassifier = LangDetect(supportedLangs)
    self.training_data_p1 = MoodDetectTrainData()
    self.training_data_p2 = MoodDetectTrainData()
    # Open once to count the rows, then reopen so classification starts
    # reading from the beginning of the file again.
    self.tweetsFile = open(os.path.join(os.curdir, os.path.normpath('../data/' + data_file)) ,'rb')
    self.countRows(self.tweetsFile)
    self.tweetsFile = open(os.path.join(os.curdir , os.path.normpath('../data/' + data_file)) ,'rb')
    # Per-language caps on how many tweets feed the training set.
    self.limit['en'] = 300000
    self.limit['default'] = 10000
    self.count = 0
    swn_filename = '../dict/sentiwordnet/' + conf.SENTIWORDNET_DICT_FILENAME
    self.swn = SentiWordNetCorpusReader(swn_filename)
class SentiWordNetLexicon(): def __init__(self): SWN_FILENAME = "lexicon\SentiWordNet_3.0.0_20130122.txt" self.swn= SentiWordNetCorpusReader(SWN_FILENAME) def get_values(self, word, context=None, pos_tag=None): """ Perform lookup in SentiWordNet """ # entry = swn.senti_synset("breakdown.n.03") entries = None for w in word.split(' '): entries = self.swn.senti_synsets(w) if entries != None: break if entries is None or len(entries)==0: return None if len(entries)==1 or pos_tag is None: return [entries[0].pos_score, entries[0].neg_score, entries[0].obj_score] elif len(entries)>1: #Find out which word to chose, if there are several classes print "Several entries ",entries for entry in entries: if entry.synset.pos()==TYPECRAFT_SENTIWORDNET[pos_tag]: print "Found matching entry: ", entry return [entry.pos_score, entry.neg_score, entry.obj_score] return [entries[0].pos_score, entries[0].neg_score, entries[0].obj_score] return None
class Analyzer(object):
    # Combines MultiWordNet (lemma -> English synsets + translations) with
    # SentiWordNet scores to annotate each word of a sentence.

    def __init__(self):
        self.mwnet = MWNet(os.path.join("data", "mwnet.db"))
        self.swn = SentiWordNetCorpusReader(os.path.join("data", "SentiWordNet_3.0.0.txt"))
        # Use the Tanl splitter only when its service is configured.
        if os.getenv('TANL_EMAIL'):
            self.splitter = TanlSplitter()
        else:
            self.splitter = SimpleSplitter()

    def analyze_sentence(self, sentence):
        # Returns {word: {indices, lemma, features, synsets, scores}} where
        # scores are averaged positive/negative/objective values.
        scores = []
        result = {}
        for word, lemma, tag, wn_type, indices in self.splitter.iter_words(sentence):
            # Here we can also impose the type to be an ADJ, NAME, or VERB
            synsets = self.mwnet.get_english_synsets(lemma, wn_type)
            if not synsets:
                continue
            synsets_dict = {}
            found = False
            for syn in synsets:
                for translation in self.mwnet.get_translation(syn):
                    senti_synsets = self.swn.senti_synsets(translation, wn_type)
                    if not senti_synsets:
                        continue
                    synsets_dict[syn] = map(lambda x: [x.synset.name, x.pos_score, x.neg_score, x.obj_score], senti_synsets)
                    scores.extend(map(lambda x: (x.pos_score, x.neg_score, x.obj_score), senti_synsets))
                # NOTE(review): 'found' is never set to True, so this break is
                # dead code — confirm whether a first hit was meant to
                # short-circuit the synset loop.
                if found:
                    break
            # NOTE(review): 'scores' accumulates across ALL words of the
            # sentence, so each word's averages include earlier words'
            # scores — verify this is intentional.
            positive = map(lambda x: x[0], scores)
            negative = map(lambda x: x[1], scores)
            objective = map(lambda x: x[2], scores)
            if len(positive) > 0:
                pscore = sum(positive) * 1.0 / len(positive)
                nscore = sum(negative) * 1.0 / len(positive)
                oscore = sum(objective) * 1.0 / len(positive)
                result[word] = {
                    'indices': indices,
                    'lemma': lemma,
                    'features': tag,
                    'synsets': synsets_dict,
                    'scores': {
                        'positive': pscore,
                        'negative': nscore,
                        'objective': oscore,
                    },
                }
        return result
def __getSentiWords__(self,location=config.SENTI_WORDNET_FILE): from sentiwordnet import SentiWordNetCorpusReader, SentiSynset swn = SentiWordNetCorpusReader(location) w = {} for senti_synset in swn.all_senti_synsets(): score = senti_synset.pos_score + senti_synset.neg_score #if totalScore > 0 : # score = abs(senti_synset.pos_score-senti_synset.neg_score)/totalScore #else : # continue; if score > 0 : word = senti_synset.synset.name.split('.')[0] try: if w[word]>=score: continue; except KeyError, e: pass w[word] = score
def scores(preProData, emot, sentifile='SentiWordNet_3.0.0_20130122.txt'):
    # Score each preprocessed tweet with SentiWordNet, applying simple
    # negation/modifier handling, then add per-tweet emoticon scores.
    # preProData -- iterable of tokenized tweets
    # emot       -- per-tweet (pos, neg) emoticon scores aligned with preProData
    # sentifile  -- path to the SentiWordNet data file
    # Returns a list of (positive, negative) tuples, one per tweet.
    swn = SentiWordNetCorpusReader(sentifile)
    res = list()
    bar = 0.0  # progress counter
    nm = NegMod()
    for tweet, emo in zip(preProData, emot):
        print bar / float(len(preProData))  # progress as a fraction
        tweetneg = 0.0
        tweetpos = 0.0
        c = 0  # index of the current word within the tweet
        for word in tweet:
            try:
                # Map the word to its first WordNet synset, then look that
                # synset up in SentiWordNet. The result is indexed below;
                # presumably temp[1]/temp[2] are the two polarity scores —
                # TODO confirm against the senti_synset return shape.
                w = str(wn.synsets(word)[0].name())
                temp = swn.senti_synset(w)
                plop = 0.0
                plopp = 0.0
                # Negation and modifiers (look at the previous token).
                if c != 0:
                    if nm.neg_it(tweet[c-1]):  # negation: swap the scores and stop
                        tweetpos = temp[2]
                        tweetneg = temp[1]
                        break
                    if nm.mod_multiply(tweet[c-1]):  # modifier doubles the weight
                        plop = temp[1]*2
                        plopp = temp[2]*2
                    else:
                        plop = temp[1]
                        plopp = temp[2]
                else:
                    plop = temp[1]
                    plopp = temp[2]
                tweetpos = tweetpos + plop
                tweetneg = tweetneg + plopp
            except:
                # NOTE(review): bare except silently skips words with no
                # synsets, but also hides any other error.
                pass
            c = c + 1
        # Add emoticon feeling on top of the word scores.
        tweetpos = tweetpos + emo[0]
        tweetneg = tweetneg + emo[1]
        res.append((tweetpos, tweetneg))
        bar = bar + 1.0
    return res
def findScoreWord(word, dType):
    # Average signed SentiWordNet sentiment over the tokens of a phrase.
    # word  -- phrase; punctuation is stripped and it is lower-cased.
    # dType -- POS constant (module-level NN/ADJ/VB/ADV).
    # Returns the mean signed score, or 0 when no tokens remain.
    swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
    swn = SentiWordNetCorpusReader(swn_filename)
    word = re.sub('[%s]' % re.escape(string.punctuation), ' ', word)
    word = word.lower()
    #print word
    wS = 0
    for w in word.split():
        #print w
        if dType == NN:
            test = swn.senti_synsets(w, 'n')
        elif dType == ADJ:
            test = swn.senti_synsets(w, 'a')
        elif dType == VB:
            test = swn.senti_synsets(w, 'v')
        elif dType == ADV:
            test = swn.senti_synsets(w, 'r')
        try:
            # Clearly-positive words add their pos_score, clearly-negative
            # words subtract their neg_score.
            if test[0].pos_score < 0.1:
                wS += -test[0].neg_score
            elif test[0].neg_score < 0.1:
                wS += test[0].pos_score
            else:
                # NOTE(review): this assignment OVERWRITES the running sum
                # while the branches above use += — likely a bug; confirm
                # intent before changing.
                wS = test[0].pos_score
        except:
            # Bare except: tokens without synsets (or any other failure,
            # including an unmatched dType leaving 'test' unbound) are skipped.
            continue
    #print word, wS
    if len(word.split()) == 0:
        return 0
    return wS/len(word.split())
def __init__(self, traing_data_fileP1='mood_traing_p1.dat', traing_data_fileP2='mood_traing.dat', data_file='tweets_raw.dat'):
    """Wire up both training phases, the language detector and the
    SentiWordNet reader, and prime the raw-tweet input file."""
    self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
    self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)
    self.langClassifier = LangDetect(supportedLangs)
    self.training_data_p1 = MoodDetectTrainData()
    self.training_data_p2 = MoodDetectTrainData()
    # First pass counts the rows; reopening rewinds the stream so
    # classification starts from the top again.
    tweets_path = os.path.join(self.dataDir, data_file)
    self.tweetsFile = open(tweets_path, 'rb')
    self.countRows(self.tweetsFile)
    self.tweetsFile = open(tweets_path, 'rb')
    self.limit['en'] = 150000
    self.limit['default'] = 10000
    self.count = 0
    self.swn = SentiWordNetCorpusReader('../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt')
class SentiWordnetTagger():
    """Maps a (word, pos) pair to its SentiWordNet synset id and polarity."""

    def __init__(self):
        # Load the bundled SentiWordNet 3.0 dump.
        self.swn = SentiWordNetCorpusReader('../files/SentiWordNet_3.0.0_20120206.txt')

    def tag(self, word, pos):
        """Return (synset_offset, polarity) with polarity 'p', 'n' or 'z'
        (neutral); (None, 'z') when pos is missing, None when the word has
        no synsets for that pos."""
        if pos is None:
            return (None, 'z')
        synsets = self.swn.senti_synsets(word, pos)
        if not synsets:
            return None
        # assumes the list is ranked and gets the first as the most frequent
        best = synsets[0]
        polarity_gap = best.pos_score - best.neg_score
        if polarity_gap > 0:
            tag = 'p'
        elif polarity_gap < 0:
            tag = 'n'
        else:
            tag = 'z'
        # offset is the synset id
        return (best.offset, tag)
def __init__(self,traing_data_fileP1='mood_traing_p1.dat',traing_data_fileP2='mood_traing.dat',data_file='tweets_raw.dat'):
    """Set up the two-phase trainers, the language detector, the per-language
    training caps and the SentiWordNet reader."""
    self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
    self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)
    self.langClassifier = LangDetect(supportedLangs)
    self.training_data_p1 = MoodDetectTrainData()
    self.training_data_p2 = MoodDetectTrainData()
    # Count the rows first, then reopen to rewind for classification.
    raw_path = os.path.join(self.dataDir, data_file)
    self.tweetsFile = open(raw_path, 'rb')
    self.countRows(self.tweetsFile)
    self.tweetsFile = open(raw_path, 'rb')
    self.limit['en'] = 150000
    self.limit['default'] = 10000
    self.count = 0
    self.swn = SentiWordNetCorpusReader('../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt')
def __init__(self):
    # Load the SentiWordNet corpus shipped in the lexicon directory.
    self.swn = SentiWordNetCorpusReader("lexicon\SentiWordNet_3.0.0_20130122.txt")
import sys
# Make the locally-installed sentiwordnet module importable.
sys.path.append("/home/sgolbeck/nltk_data/corpora/sentiwordnet")
print sys.path
from sentiwordnet import SentiWordNetCorpusReader, SentiSynset
dir1 = "/home/sgolbeck/nltk_data/corpora/sentiwordnet/"
swn_filename = dir1+"SentiWordNet_3.0.0.txt"
#swn_filename = "SentiWordNet_3.0.0_20100705.txt"
swn = SentiWordNetCorpusReader(swn_filename)
# Sample lookup: all senti-synsets for "bad".
swn_bad = swn.senti_synsets('bad')
#######################################################
# pattern.en experiments: WordNet lookups and polarity weights.
from pattern.en import wordnet
print wordnet.synsets("kill",pos="VB")[0].weight
from pattern.en import ADJECTIVE
pattern_bad = wordnet.synsets('bad', ADJECTIVE)[0]
#######################################################
# Parse a sample sentence and pull the POS tag of the first token.
from pattern.en import parse
pattern_bad_parse = parse('he is a bad man of crime that dances violently')
pattern_bad_parse = pattern_bad_parse.split()
print pattern_bad_parse
pattern_bad_parse_word = pattern_bad_parse[0][3]
#######################################################
# NLTK tokenize + POS-tag experiment.
import nltk
text = nltk.word_tokenize("And now for something completely different")
#requires that 'maxent_treebank_pos_tagger' has been downloaded
text_tagged = nltk.pos_tag(text)
#######################################################
class RawClassifier(object):
    # Two-phase tweet mood classifier: phase 1 labels raw pickled tweets via
    # the keyword dictionaries and trains clsP1; phase 2 re-trains after
    # pruning n-gram features clsP1 is not confident about.
    statsData = {}  # per-language {'n': count, 'p': count} of labelled tweets
    dataDir = "/home/toni/git/financial-twitter-sentiment-analyzer/tracker/data"
    limit = {}      # per-language cap on training tweets
    skip = 0        # number of leading tweets to skip
    p2_f_limit = 0.75  # min clsP1 confidence for a feature to survive phase 2

    def __init__(self, traing_data_fileP1='mood_traing_p1.dat', traing_data_fileP2='mood_traing.dat', data_file='tweets_raw.dat'):
        self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)
        self.langClassifier = LangDetect(supportedLangs)
        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()
        # Open once to count rows, then reopen to rewind for classification.
        self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')
        self.countRows(self.tweetsFile)
        self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')
        self.limit['en'] = 150000
        self.limit['default'] = 10000
        self.count = 0
        swn_filename = '../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt'
        self.swn = SentiWordNetCorpusReader(swn_filename)

    def classifyP1(self, stripSmiles=False):
        # Phase 1: label all raw tweets and train the first classifier.
        self.classifiyRaw(self.tweetsFile, stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print "done training P1"
        print self.statsData

    def classifyP2(self):
        """ remove noisy n-grams """
        _st = {'tf': 0, 'df': 0}  # total / dropped feature counters
        for feutures, label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()
            for f, v in feutures.items():
                # Keep a feature only if the phase-1 classifier is confident
                # about it in isolation.
                prob = self.clsP1.classifier.prob_classify({
                    f: v,
                    'x_lang': lang
                })
                _st['tf'] += 1
                if max(prob.prob('n'), prob.prob('p')) <= self.p2_f_limit:
                    del feuturesP2[f]
                    _st['df'] += 1
            # Require at least 3 surviving features per training row.
            if len(feuturesP2) >= 3:
                feuturesP2['x_lang'] = lang
                self.training_data_p2.append((feuturesP2, label))
            else:
                pass
        print 'p2_length:', len(self.training_data_p2), ' p1_lenght:', len(
            self.training_data_p1)
        print 'st:', _st
        print "deleting p1 set"
        # Free the phase-1 data before training phase 2.
        del self.training_data_p1
        del self.clsP1
        print "Done deleting p1 set"
        self.clsP2.train(self.training_data_p2)

    def stripSmiles(self, text):
        # Remove common emoticons so they do not leak into the n-grams.
        emos = [
            ':)', ':-)', ';-)', ': )', ':d', '=)', ':p', ';)', '<3', ':(',
            ':-(', ': ('
        ]
        for item in emos:
            text = text.replace(item, "")
        return text

    def stats(self, lang, mood):
        # Track per-language label counts. Returns 1 when the tweet may be
        # used, 0 when the language's cap has been reached.
        if not self.statsData.has_key(lang):
            self.statsData[lang] = {'n': 0, 'p': 0}
        if self.limit.has_key(lang):
            limit = self.limit[lang]
        else:
            limit = self.limit['default']
        if self.statsData[lang][mood] >= limit:
            return 0
        else:
            self.statsData[lang][mood] += 1
            return 1

    def checkWithSentiwordnet(self, text):
        # Debug helper: print the first SentiWordNet synset of each token.
        tokens = nltk.word_tokenize(text)
        for token in tokens:
            synsets = self.swn.senti_synsets(token)
            if len(synsets) > 0:
                synset = self.swn.senti_synset(str(synsets[0]))
                print synset

    def checkKeyWords(self, text):
        # 'p'/'n' from the keyword dictionaries, 'x' when undecided.
        count = self.containsPositiveWord(text) + self.containsNegativeWord(
            text)
        if count > 0:
            return 'p'
        if count < 0:
            return 'n'
        return 'x'

    def containsPositiveWord(self, text):
        # Number of positive dictionary entries occurring in the text.
        count = 0
        for item in dictionary.positive:
            if item in text:
                count += 1
                #print 'p:',item
        return count

    def containsNegativeWord(self, text):
        # Negated count of negative dictionary entries occurring in the text.
        count = 0
        for item in dictionary.negative:
            if item in text:
                #print 'n:', item
                count -= 1
        return count

    def classifiyRaw(self, file, stripSmiles):
        # Stream pickled tweets, label them via the keyword dictionaries and
        # add the usable ones to the phase-1 training data.
        while True:
            try:
                tweet = cPickle.load(file)
            except EOFError:
                print "done classify"
                break
            except:
                print "error"
                pass
            if self.skip > 0:
                print "skip"
                self.skip -= 1
                continue
            if tweet:
                text = unicode(tweet.get('text'))
                # Skip retweets.
                if text.lower().find('rt ') != -1:
                    print 'rt'
                    continue
                mood = self.checkKeyWords(text)
                if mood == 'x':
                    continue
                lang = self.langClassifier.detect(text)
                if stripSmiles:
                    text = self.stripSmiles(text)
                sres = self.stats(lang[0], mood)
                if sres == 0:  # language cap reached
                    print 'limit reached for ', lang[0]
                    continue
                if sres == -1:
                    print "done for %s" % mood
                    break
                if self.count and self.count % 100 == 0:
                    print "classified %d tweets" % (self.count)
                self.count += 1
                self.checkWithSentiwordnet(text)
                self.training_data_p1.addRow(text, mood, lang[0])

    def countRows(self, file):
        # Count the pickled tweets in the file (and the corrupt entries).
        rows = 0
        breakes = 0
        while True:
            try:
                tweet = cPickle.load(file)
                rows += 1
            except EOFError:
                break
            except:
                breakes += 1
        print 'tweets:', rows, ' breakes:', breakes
class RawClassifier(object):
    # Two-phase tweet mood classifier. Phase 1 labels raw tweets (via
    # SentiWordNet or the financial keyword dictionaries, per config) and
    # trains clsP1; phase 2 re-trains after pruning low-confidence features.
    statsData = {}  # per-language {'n': count, 'p': count} of labelled tweets
    limit = {}      # per-language cap on training tweets
    skip = 0        # number of leading tweets to skip
    p2_f_limit = 0.6  # min clsP1 confidence for a feature to survive phase 2
    sentiwordnet = conf.USE_SENTIWORDNET_DICT  # label via SentiWordNet when set

    def __init__(self, traing_data_fileP1='mood_traing_p1.dat', traing_data_fileP2='mood_traing.dat', data_file='tweets_raw.dat'):
        if self.sentiwordnet:
            print "using sentiwordnet dictionary"
        else:
            print "not using sentiwordnet dictionary"
        self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)
        self.langClassifier = LangDetect(supportedLangs)
        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()
        # Open once to count rows, then reopen to rewind for classification.
        self.tweetsFile = open(
            os.path.join(os.curdir, os.path.normpath('../data/' + data_file)),
            'rb')
        self.countRows(self.tweetsFile)
        self.tweetsFile = open(
            os.path.join(os.curdir, os.path.normpath('../data/' + data_file)),
            'rb')
        self.limit['en'] = 300000
        self.limit['default'] = 10000
        self.count = 0
        swn_filename = '../dict/sentiwordnet/' + conf.SENTIWORDNET_DICT_FILENAME
        self.swn = SentiWordNetCorpusReader(swn_filename)

    def classifyP1(self, stripSmiles=False):
        # Phase 1: label all raw tweets and train the first classifier.
        self.classifiyRaw(self.tweetsFile, stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print "done training P1"
        print self.statsData

    def classifyP2(self):
        """ remove noisy n-grams """
        _st = {'tf': 0, 'df': 0}  # total / dropped feature counters
        for feutures, label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()
            for f, v in feutures.items():
                # Keep a feature only if the phase-1 classifier is confident
                # about it in isolation.
                prob = self.clsP1.classifier.prob_classify({f: v, 'x_lang': lang})
                _st['tf'] += 1
                if max(prob.prob('n'), prob.prob('p')) <= self.p2_f_limit:
                    del feuturesP2[f]
                    _st['df'] += 1
            # Require at least 3 surviving features per training row.
            if len(feuturesP2) >= 3:
                feuturesP2['x_lang'] = lang
                self.training_data_p2.append((feuturesP2, label))
            else:
                pass
        print 'p2_length:', len(self.training_data_p2), ' p1_lenght:', len(self.training_data_p1)
        print 'st:', _st
        print "deleting p1 set"
        # Free the phase-1 data before training phase 2.
        del self.training_data_p1
        del self.clsP1
        print "Done deleting p1 set"
        self.clsP2.train(self.training_data_p2)

    def stripSmiles(self, text):
        # Remove common emoticons so they do not leak into the n-grams.
        emos = [
            ':)', ':-)', ';-)', ': )', ':d', '=)', ':p', ';)', '<3', ':(',
            ':-(', ': ('
        ]
        for item in emos:
            text = text.replace(item, "")
        return text

    def stats(self, lang, mood):
        # Track per-language label counts. Returns 1 when the tweet may be
        # used, 0 when the language's cap has been reached.
        if not self.statsData.has_key(lang):
            self.statsData[lang] = {'n': 0, 'p': 0}
        if self.limit.has_key(lang):
            limit = self.limit[lang]
        else:
            limit = self.limit['default']
        if self.statsData[lang][mood] >= limit:
            return 0
        else:
            self.statsData[lang][mood] += 1
            return 1

    # CHECK WITH SENTIWORDNET
    def checkWithSentiwordnet(self, text):
        # Sum (pos_score - neg_score) of the first synset of every token and
        # map the total to a 'p'/'n'/'x' label.
        count = 0
        tokens = nltk.word_tokenize(text)
        #TODO more languages
        #tokens = [w for w in tokens if not w in nltk.corpus.stopwords.words('english')]
        if len(tokens) > 0:
            for token in tokens:
                synsets = self.swn.senti_synsets(token)
                if len(synsets) > 0:
                    # TODO: this need not be the right lemma. Check the category.
                    lemma = synsets[0]
                    count = count + lemma.pos_score - lemma.neg_score
        #print count, " points for tokens :", tokens
        # NOTE(review): 'x' is only returned when count == 0.5 exactly; a
        # neutral text (count == 0) is labelled 'n'. The second test was
        # probably meant to be "count < -0.5" — confirm before changing.
        if count > 0.5:
            return 'p'
        if count < 0.5:
            return 'n'
        return 'x'

    # CHECK WITH FINANCIAL DICTIONARIES
    def checkWithFinancialDict(self, text):
        # 'p'/'n' from the financial keyword dictionaries, 'x' when undecided.
        count = self.containsPositiveWord(text) + self.containsNegativeWord(
            text)
        if count > 0:
            return 'p'
        if count < 0:
            return 'n'
        return 'x'

    def containsPositiveWord(self, text):
        # Number of positive dictionary entries occurring in the text.
        count = 0
        for item in dictionary.positive:
            if item in text:
                count += 1
                #print 'p:',item
        return count

    def containsNegativeWord(self, text):
        # Negated count of negative dictionary entries occurring in the text.
        count = 0
        for item in dictionary.negative:
            if item in text:
                #print 'n:', item
                count -= 1
        return count

    def classifiyRaw(self, file, stripSmiles):
        # Stream pickled tweets, keep only English ones, label them with the
        # configured dictionary and add them to the phase-1 training data.
        while True:
            try:
                tweet = cPickle.load(file)
            except EOFError:
                print "done classify"
                break
            except:
                print "error"
                pass
            if self.skip > 0:
                print "skip"
                self.skip -= 1
                continue
            if tweet:
                text = unicode(tweet.text)
                # Skip retweets.
                if text.lower().find('rt ') != -1:
                    print 'rt'
                    continue
                lang = self.langClassifier.detect(text)
                # TODO more languages
                if lang[0] != 'en':
                    continue
                if stripSmiles:
                    text = self.stripSmiles(text)
                if self.sentiwordnet:
                    mood = self.checkWithSentiwordnet(text)
                else:
                    mood = self.checkWithFinancialDict(text)
                if mood == 'x':
                    continue
                sres = self.stats(lang[0], mood)
                if sres == 0:  # language cap reached
                    print 'limit reached for ', lang[0]
                    continue
                if sres == -1:
                    print "done for %s" % mood
                    break
                if self.count and self.count % 100 == 0:
                    print "classified %d tweets" % (self.count)
                self.count += 1
                self.training_data_p1.addRow(text, mood, lang[0])

    def countRows(self, file):
        # Count the pickled tweets in the file (and the corrupt entries).
        rows = 0
        breakes = 0
        while True:
            try:
                tweet = cPickle.load(file)
                rows += 1
            except EOFError:
                break
            except:
                breakes += 1
        print 'tweets:', rows, ' breakes:', breakes
# Replace slang tokens in each tokenized tweet with their expansion from
# dico_slang (assumes 'words' is a list of token lists — TODO confirm).
words_noslang = []
for w in words:
    for k, v in dico_slang.items():
        if k in w:
            #print(k)
            # Drop every occurrence of the slang token, then append the
            # expansion (NOTE(review): this moves the word to the end,
            # losing its original position).
            w = list(filter(lambda x: x != k, w))
            w.append(v)
            #print(w)
    words_noslang.append(w)
print(words_noslang)
tag = nltk.pos_tag
tag_list_tot = []
# POS-tag every cleaned tweet; N is presumably the number of tweets —
# TODO confirm N == len(words_noslang).
tag_list = [tag(words_noslang[i]) for i in range(N)]
print(tag_list)
for i in range(N):
    tag_list_tot.extend(tag(words_noslang[i]))
# Count POS tags over all tweets, then total the verb tags.
tot_pos = [tag_list_tot[i][1] for i in range(len(tag_list_tot))]
count_VB = Counter(tot_pos)
print(count_VB)
verb_tag = ['VB', 'VBZ', 'VBP', 'VBG', 'VBN', 'VBD']
sum_vb = 0
for k, v in count_VB.items():
    if k in verb_tag:
        sum_vb += v
print('Le nombre total de POS verbes sur l\'ensemble des tweets est: ', sum_vb)
# Sanity check: load SentiWordNet and look up a known synset.
swn_filename = '/home/audrey/Audrey/Cours/INF344/TP/TP_sentiment/SentiWordNet_3.0.0_20130122.txt'
swn = SentiWordNetCorpusReader(swn_filename)
swn.senti_synset('breakdown.n.03')
# -*- coding: cp1252 -*- import nltk, enchant,re,math from nltk.corpus import wordnet as wn from sentiwordnet import SentiWordNetCorpusReader, SentiSynset swn_filename = 'SentiWordNet_3.0.0_20130122.txt' swn = SentiWordNetCorpusReader(swn_filename) dictionary = enchant.Dict("en_US") emotipatternhappy = re.compile("(:-\))|(:\))|(:o\))|(:])|( :3 )|(:c\))|(:>)|(=])|(8\))|(=\))|(:})|(:\^\))|(:'-\))|(:'\))") emotipatternveryhappy = re.compile("(:-D)|( :D )|(8-D)|( 8D )|(x-D)|( xD )|(X-D)|( XD )|(=-D)|( =D )|(=-3)|( =3 )|(B\^D)|(:-\)\))|(:\)\))") emotipatternsad = re.compile("(>:\[)|(:-\()|(:\()|(:-c)|( :c )|(:-<)|(:<)|(:-\[)|(:\[)|(:{)|(:'\()|(:'-\()") emotipatternverysad = re.compile("(D:<)|( D: )|( D8 )|(D;)|(D=)|( DX )|(v\.v)|(D-':)|(:-\|\|)|(>:\()") def calculate_sentiment(tweet,followers): #### Emoticon #### emotipos = 0.3 * len(emotipatternhappy.findall(tweet)) emotipos = emotipos + 0.5 * len(emotipatternveryhappy.findall(tweet)) emotineg = 0.3 * len(emotipatternsad.findall(tweet)) emotineg = emotineg + 0.5 * len(emotipatternverysad.findall(tweet)) emotiscore = emotipos - emotineg if emotiscore > 1: emotiscore = 1 if emotiscore < -1: emotiscore = -1 #### Hash Score & SWN Score #### #remove punctuation as this will mess up pos tagging tweet = re.sub('[\$\£\+\'(..)(...)\?!\(\)\[\]":;-]\&','',tweet)
class RawClassifier(object):
    # Two-phase tweet mood classifier: phase 1 labels raw pickled tweets via
    # the keyword dictionaries and trains clsP1; phase 2 re-trains after
    # pruning n-gram features clsP1 is not confident about.
    statsData = {}  # per-language {'n': count, 'p': count} of labelled tweets
    dataDir = "/home/toni/git/financial-twitter-sentiment-analyzer/tracker/data"
    limit = {}      # per-language cap on training tweets
    skip = 0        # number of leading tweets to skip
    p2_f_limit = 0.75  # min clsP1 confidence for a feature to survive phase 2

    def __init__(self,traing_data_fileP1='mood_traing_p1.dat',traing_data_fileP2='mood_traing.dat',data_file='tweets_raw.dat'):
        self.clsP1 = MoodDetectTrainer(data_file = traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file = traing_data_fileP2)
        self.langClassifier = LangDetect(supportedLangs)
        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()
        # Open once to count rows, then reopen to rewind for classification.
        self.tweetsFile = open(os.path.join(self.dataDir,data_file),'rb')
        self.countRows(self.tweetsFile)
        self.tweetsFile = open(os.path.join(self.dataDir,data_file),'rb')
        self.limit['en'] = 150000
        self.limit['default'] = 10000
        self.count = 0
        swn_filename = '../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt'
        self.swn = SentiWordNetCorpusReader(swn_filename)

    def classifyP1(self,stripSmiles=False):
        # Phase 1: label all raw tweets and train the first classifier.
        self.classifiyRaw(self.tweetsFile,stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print "done training P1"
        print self.statsData

    def classifyP2(self):
        """ remove noisy n-grams """
        _st={'tf':0,'df':0}  # total / dropped feature counters
        for feutures,label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()
            for f,v in feutures.items():
                # Keep a feature only if the phase-1 classifier is confident
                # about it in isolation.
                prob = self.clsP1.classifier.prob_classify({f:v,'x_lang':lang})
                _st['tf']+=1
                if max(prob.prob('n'),prob.prob('p')) <= self.p2_f_limit:
                    del feuturesP2[f]
                    _st['df']+=1
            # Require at least 3 surviving features per training row.
            if len(feuturesP2) >= 3:
                feuturesP2['x_lang']=lang
                self.training_data_p2.append((feuturesP2,label))
            else:
                pass
        print 'p2_length:' , len(self.training_data_p2), ' p1_lenght:' , len(self.training_data_p1)
        print 'st:' , _st
        print "deleting p1 set"
        # Free the phase-1 data before training phase 2.
        del self.training_data_p1
        del self.clsP1
        print "Done deleting p1 set"
        self.clsP2.train(self.training_data_p2)

    def stripSmiles(self,text):
        # Remove common emoticons so they do not leak into the n-grams.
        emos = [':)',':-)',';-)',': )',':d','=)',':p',';)','<3',':(',':-(',': (']
        for item in emos:
            text = text.replace(item,"")
        return text

    def stats(self,lang,mood):
        # Track per-language label counts. Returns 1 when the tweet may be
        # used, 0 when the language's cap has been reached.
        if not self.statsData.has_key(lang):
            self.statsData[lang] = {'n':0,'p':0}
        if self.limit.has_key(lang):
            limit = self.limit[lang]
        else:
            limit = self.limit['default']
        if self.statsData[lang][mood] >= limit:
            return 0
        else:
            self.statsData[lang][mood]+=1
            return 1

    def checkWithSentiwordnet(self, text):
        # Debug helper: print the first SentiWordNet synset of each token.
        tokens = nltk.word_tokenize(text)
        for token in tokens:
            synsets = self.swn.senti_synsets(token)
            if len(synsets) > 0:
                synset = self.swn.senti_synset(str(synsets[0]))
                print synset

    def checkKeyWords(self,text):
        # 'p'/'n' from the keyword dictionaries, 'x' when undecided.
        count = self.containsPositiveWord(text) + self.containsNegativeWord(text);
        if count > 0:
            return 'p'
        if count < 0:
            return 'n'
        return 'x'

    def containsPositiveWord(self,text):
        # Number of positive dictionary entries occurring in the text.
        count = 0
        for item in dictionary.positive:
            if item in text:
                count += 1
                #print 'p:',item
        return count

    def containsNegativeWord(self,text):
        # Negated count of negative dictionary entries occurring in the text.
        count = 0
        for item in dictionary.negative:
            if item in text:
                #print 'n:', item
                count -= 1
        return count

    def classifiyRaw(self,file,stripSmiles):
        # Stream pickled tweets, label them via the keyword dictionaries and
        # add the usable ones to the phase-1 training data.
        while True:
            try:
                tweet = cPickle.load(file)
            except EOFError:
                print "done classify"
                break
            except:
                print "error"
                pass
            if self.skip > 0:
                print "skip"
                self.skip -= 1
                continue
            if tweet:
                text = unicode(tweet.get('text'))
                # Skip retweets.
                if text.lower().find('rt ') != -1:
                    print 'rt'
                    continue
                mood = self.checkKeyWords(text)
                if mood == 'x':
                    continue
                lang = self.langClassifier.detect(text)
                if stripSmiles:
                    text = self.stripSmiles(text)
                sres = self.stats(lang[0], mood)
                if sres == 0:  # language cap reached
                    print 'limit reached for ' , lang[0]
                    continue
                if sres == -1:
                    print "done for %s" % mood
                    break
                if self.count and self.count % 100 == 0:
                    print "classified %d tweets" % (self.count)
                self.count += 1
                self.checkWithSentiwordnet(text)
                self.training_data_p1.addRow(text, mood, lang[0])

    def countRows(self,file):
        # Count the pickled tweets in the file (and the corrupt entries).
        rows = 0
        breakes = 0
        while True:
            try:
                tweet = cPickle.load(file)
                rows +=1
            except EOFError:
                break
            except:
                breakes +=1
        print 'tweets:',rows,' breakes:',breakes
def sentimentAnalysis(filename, outputFile):
    # For each input line ("<id> <text>"), compute the ratio of objective vs
    # subjective words (per the first SentiWordNet synset of each word) and
    # write "<id>, ratioObj, ratioSub" rows to a CSV file.
    # filename   -- UTF-8 input file, one record per line
    # outputFile -- path of the CSV file to create
    swn_filename = 'SentiWordNet_3.0.0_20100705.txt'
    swn = SentiWordNetCorpusReader(swn_filename)
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    with codecs.open(filename, 'r', 'utf-8') as f:
        data1 = f.readlines()
        f.close()  # NOTE(review): redundant — the with-block already closes f
    # NOTE(review): this output handle is never explicitly closed.
    writer = csv.writer(open(outputFile, 'wb'))
    # Penn Treebank tag groups; 'tag' lists categories skipped outright.
    tag = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'TO', 'UH', 'PDT', 'SYM', 'RP']
    noun = ['NN', 'NNS', 'NP', 'NPS']
    adj = ['JJ', 'JJR', 'JJS']
    pronoun = ['PP', 'PP$', 'WP', 'WP$']
    verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adverb = ['RB', 'RBR', 'RBS', 'WRB']
    for text in data1:
        # First whitespace-separated token is the record id; the rest is text.
        count = re.split("\s+", text, 1)[0]
        if len(re.split("\s+", text, 1)) > 1:
            text = re.split("\s+", text, 1)[1]
        Tex = regex.sub(u'', text)
        words = word_tokenize(Tex.lower())
        word = nltk.pos_tag(words)
        objCount = 0
        subCount = 0
        for w in word:
            if not w[1] in tag:
                #print(w)
                # Map the Treebank tag to a SentiWordNet POS letter.
                if w[1] in noun:
                    pos_Char = 'n'
                elif w[1] in adj:
                    pos_Char = 'a'
                elif w[1] in pronoun:
                    # NOTE(review): 'p' is not a WordNet POS letter; the
                    # lookup below will fail for pronouns and be swallowed
                    # by the except.
                    pos_Char = 'p'
                elif w[1] in verb:
                    pos_Char = 'v'
                elif w[1] in adverb:
                    pos_Char = 'r'
                else:
                    pos_Char = 'none'
                if pos_Char == 'none':
                    try:
                        s = swn.senti_synsets(w[0])
                        scores = list(s)[0]
                        # Objective when obj_score dominates, subjective when
                        # pos+neg crosses 0.5; otherwise the word is ignored.
                        if scores.obj_score > 0.5:
                            objCount += 1
                        elif scores.pos_score + scores.neg_score > 0.5:
                            subCount += 1
                    except:
                        print('Unexpected word')
                else:
                    try:
                        s = swn.senti_synsets(w[0], pos_Char)
                        scores = list(s)[0]
                        if scores.obj_score > 0.5:
                            objCount += 1
                        elif scores.pos_score + scores.neg_score > 0.5:
                            subCount += 1
                    except:
                        print('Unexpected word')
        if objCount + subCount > 0:
            ratioObj = float(objCount) / (objCount + subCount)
            ratioSub = float(subCount) / (objCount + subCount)
        else:
            ratioObj = 0.0
            ratioSub = 0.0
        writer.writerow([count, ratioObj, ratioSub])
def __init__(self):
    # Point the reader at the bundled SentiWordNet 3.0 dump.
    self.swn = SentiWordNetCorpusReader('../files/SentiWordNet_3.0.0_20120206.txt')
class polarsentenceSample:
    # Lightweight record for a sentence plus its polarity weights.
    ssen = ''        # the sentence text
    weightP = 0.000  # positive weight
    weightN = 0.000  # negative weight
    index = 0        # position of the sentence in the document

def myS (myData) :
    # Sign function: -1.0 for negative input, +1.0 otherwise.
    if myData < 0.000 :
        return -1.00
    else :
        return 1.00

# Shared resources: SentiWordNet reader, lemmatizer and English stopwords.
swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
swn = SentiWordNetCorpusReader(swn_filename)
lmtzr = WordNetLemmatizer()
lemmaList = set()
engStopWords = set(stopwords.words('english'))
#############
# set test size, input directory and output directory
##############
testSize = 3  #no of sents. to be appeared in the summary and anti summary
testSentSize = 12  #maximum no of sents expected in the document
windowRange = 4  #window size
inPath = "/Users/fahmida/Desktop/randomnessTesting/dataSetDUC2004"
outPath = "/Users/fahmida/Desktop/randomnessTesting/textRankDUC2004L"
outPath2 = "/Users/fahmida/Desktop/randomnessTesting/polarityRankDUC2004L/positive"
outPath3 = "/Users/fahmida/Desktop/randomnessTesting/polarityRankDUC2004L/negative"
class RawClassifier(object):
    # Two-phase tweet mood classifier. Phase 1 labels raw tweets (via
    # SentiWordNet or the financial keyword dictionaries, per config) and
    # trains clsP1; phase 2 re-trains after pruning low-confidence features.
    statsData = {}  # per-language {'n': count, 'p': count} of labelled tweets
    limit = {}      # per-language cap on training tweets
    skip = 0        # number of leading tweets to skip
    p2_f_limit = 0.6  # min clsP1 confidence for a feature to survive phase 2
    sentiwordnet = conf.USE_SENTIWORDNET_DICT  # label via SentiWordNet when set

    def __init__(self,traing_data_fileP1='mood_traing_p1.dat',traing_data_fileP2='mood_traing.dat',data_file='tweets_raw.dat'):
        if self.sentiwordnet:
            print "using sentiwordnet dictionary"
        else:
            print "not using sentiwordnet dictionary"
        self.clsP1 = MoodDetectTrainer(data_file = traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file = traing_data_fileP2)
        self.langClassifier = LangDetect(supportedLangs)
        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()
        # Open once to count rows, then reopen to rewind for classification.
        self.tweetsFile = open(os.path.join(os.curdir, os.path.normpath('../data/' + data_file)) ,'rb')
        self.countRows(self.tweetsFile)
        self.tweetsFile = open(os.path.join(os.curdir , os.path.normpath('../data/' + data_file)) ,'rb')
        self.limit['en'] = 300000
        self.limit['default'] = 10000
        self.count = 0
        swn_filename = '../dict/sentiwordnet/' + conf.SENTIWORDNET_DICT_FILENAME
        self.swn = SentiWordNetCorpusReader(swn_filename)

    def classifyP1(self,stripSmiles=False):
        # Phase 1: label all raw tweets and train the first classifier.
        self.classifiyRaw(self.tweetsFile,stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print "done training P1"
        print self.statsData

    def classifyP2(self):
        """ remove noisy n-grams """
        _st={'tf':0,'df':0}  # total / dropped feature counters
        for feutures,label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()
            for f,v in feutures.items():
                # Keep a feature only if the phase-1 classifier is confident
                # about it in isolation.
                prob = self.clsP1.classifier.prob_classify({f:v,'x_lang':lang})
                _st['tf']+=1
                if max(prob.prob('n'),prob.prob('p')) <= self.p2_f_limit:
                    del feuturesP2[f]
                    _st['df']+=1
            # Require at least 3 surviving features per training row.
            if len(feuturesP2) >= 3:
                feuturesP2['x_lang']=lang
                self.training_data_p2.append((feuturesP2,label))
            else:
                pass
        print 'p2_length:' , len(self.training_data_p2), ' p1_lenght:' , len(self.training_data_p1)
        print 'st:' , _st
        print "deleting p1 set"
        # Free the phase-1 data before training phase 2.
        del self.training_data_p1
        del self.clsP1
        print "Done deleting p1 set"
        self.clsP2.train(self.training_data_p2)

    def stripSmiles(self,text):
        # Remove common emoticons so they do not leak into the n-grams.
        emos = [':)',':-)',';-)',': )',':d','=)',':p',';)','<3',':(',':-(',': (']
        for item in emos:
            text = text.replace(item,"")
        return text

    def stats(self,lang,mood):
        # Track per-language label counts. Returns 1 when the tweet may be
        # used, 0 when the language's cap has been reached.
        if not self.statsData.has_key(lang):
            self.statsData[lang] = {'n':0,'p':0}
        if self.limit.has_key(lang):
            limit = self.limit[lang]
        else:
            limit = self.limit['default']
        if self.statsData[lang][mood] >= limit:
            return 0
        else:
            self.statsData[lang][mood]+=1
            return 1

    # CHECK WITH SENTIWORDNET
    def checkWithSentiwordnet(self, text):
        # Sum (pos_score - neg_score) of the first synset of every token and
        # map the total to a 'p'/'n'/'x' label.
        count = 0
        tokens = nltk.word_tokenize(text)
        #TODO more languages
        #tokens = [w for w in tokens if not w in nltk.corpus.stopwords.words('english')]
        if len(tokens) > 0:
            for token in tokens:
                synsets = self.swn.senti_synsets(token)
                if len(synsets) > 0:
                    # TODO: this need not be the right lemma. Check the category.
                    lemma = synsets[0]
                    count = count + lemma.pos_score - lemma.neg_score
        #print count, " points for tokens :", tokens
        # NOTE(review): 'x' is only returned when count == 0.5 exactly; a
        # neutral text (count == 0) is labelled 'n'. The second test was
        # probably meant to be "count < -0.5" — confirm before changing.
        if count > 0.5:
            return 'p'
        if count < 0.5:
            return 'n'
        return 'x'

    # CHECK WITH FINANCIAL DICTIONARIES
    def checkWithFinancialDict(self,text):
        # 'p'/'n' from the financial keyword dictionaries, 'x' when undecided.
        count = self.containsPositiveWord(text) + self.containsNegativeWord(text);
        if count > 0:
            return 'p'
        if count < 0:
            return 'n'
        return 'x'

    def containsPositiveWord(self,text):
        # Number of positive dictionary entries occurring in the text.
        count = 0
        for item in dictionary.positive:
            if item in text:
                count += 1
                #print 'p:',item
        return count

    def containsNegativeWord(self,text):
        # Negated count of negative dictionary entries occurring in the text.
        count = 0
        for item in dictionary.negative:
            if item in text:
                #print 'n:', item
                count -= 1
        return count

    def classifiyRaw(self,file,stripSmiles):
        # Stream pickled tweets, keep only English ones, label them with the
        # configured dictionary and add them to the phase-1 training data.
        while True:
            try:
                tweet = cPickle.load(file)
            except EOFError:
                print "done classify"
                break
            except:
                print "error"
                pass
            if self.skip > 0:
                print "skip"
                self.skip -= 1
                continue
            if tweet:
                text = unicode(tweet.get('text'))
                # Skip retweets.
                if text.lower().find('rt ') != -1:
                    print 'rt'
                    continue
                lang = self.langClassifier.detect(text)
                # TODO more languages
                if lang[0] != 'en':
                    continue
                if stripSmiles:
                    text = self.stripSmiles(text)
                if self.sentiwordnet:
                    mood = self.checkWithSentiwordnet(text)
                else:
                    mood = self.checkWithFinancialDict(text)
                if mood == 'x':
                    continue
                sres = self.stats(lang[0], mood)
                if sres == 0:  # language cap reached
                    print 'limit reached for ' , lang[0]
                    continue
                if sres == -1:
                    print "done for %s" % mood
                    break
                if self.count and self.count % 100 == 0:
                    print "classified %d tweets" % (self.count)
                self.count += 1
                self.training_data_p1.addRow(text, mood, lang[0])

    def countRows(self,file):
        # Count the pickled tweets in the file (and the corrupt entries).
        rows = 0
        breakes = 0
        while True:
            try:
                tweet = cPickle.load(file)
                rows +=1
            except EOFError:
                break
            except:
                breakes +=1
        print 'tweets:',rows,' breakes:',breakes
class polarsentenceSample:
    """One sentence of a document together with its polarity weights."""
    ssen = ''           # the sentence text
    weightP = 0.000     # accumulated positive weight
    weightN = 0.000     # accumulated negative weight
    index = 0           # position of the sentence within the document


def myS(myData):
    """Return the sign of *myData* as a float: -1.0 if negative, else 1.0."""
    return -1.00 if myData < 0.000 else 1.00


# Shared NLP resources, created once at import time.
swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
swn = SentiWordNetCorpusReader(swn_filename)
lmtzr = WordNetLemmatizer()
lemmaList = set()
engStopWords = set(stopwords.words('english'))

#############
# set test size, input directory and output directory
##############
testSize = 3        # no of sents. to be appeared in the summary and anti summary
testSentSize = 12   # maximum no of sents expected in the document
windowRange = 4     # window size
inPath = "/Users/fahmida/Desktop/randomnessTesting/dataSetDUC2004"
outPath = "/Users/fahmida/Desktop/randomnessTesting/textRankDUC2004L"
outPath2 = "/Users/fahmida/Desktop/randomnessTesting/polarityRankDUC2004L/positive"
outPath3 = "/Users/fahmida/Desktop/randomnessTesting/polarityRankDUC2004L/negative"
# Build a dict mapping ticker -> list of row indices from the second column,
# which holds a string like "[, 3, 17, 42, ]".
tick_indices={}
for i in range(len(DF_tick_indices)):
    # split on ', ' and drop the first/last fragments (the bracket residue)
    temp=DF_tick_indices.ix[i,1].split(', ')[1:-1]
    # FIX: was "[temp1.append(int(x)) for x in temp]" -- a list comprehension
    # run purely for its side effects; build the list directly instead
    # (identical resulting list, no throwaway list of Nones)
    temp1=[int(x) for x in temp]
    tick_indices[DF_tick_indices.ix[i,0]]=temp1
##################################################################################################
##################################################################################################
##################################################################################################
#manually setup sentiwordnet
# NOTE(review): the bare "cd" lines below are IPython shell magics -- this
# chunk only runs inside an interactive IPython session, not as a plain script
cd /home/sgolbeck/nltk_data/corpora/sentiwordnet
from sentiwordnet import SentiWordNetCorpusReader, SentiSynset
swn_filename = "SentiWordNet_3.0.0.txt"
#swn_filename = "SentiWordNet_3.0.0_20100705.txt"
swn = SentiWordNetCorpusReader(swn_filename)
# smoke-test the reader on one word
swn.senti_synsets('slow')
cd /home/sgolbeck/workspace/PythonExercises/twitter/Tweets
##################################################################################################
##################################################################################################
##################################################################################################
#create a time window for September
left_window=datetime(2014,9,1,0,0,0)
right_window=datetime(2014,10,1,0,0,0)
#test if date_object is in the window (column 11 presumably holds the tweet
#timestamp -- verify against the loader)
cond_l=(DF_tick[11]>=left_window)
cond_r=(DF_tick[11]<right_window)
cond=[cond_l[i] and cond_r[i] for i in DF_tick.index]
#select only those rows within the window
DF_sept=DF_tick[cond]
def sentimentAnalysis(filename, outputFile,
                      swn_filename='SentiWordNet_3.0.0_20100705.txt'):
    """Compute per-line objectivity/subjectivity ratios for a text file.

    Each line of *filename* is expected to start with an identifier token
    followed by the text.  Every word is POS-tagged, looked up in
    SentiWordNet, and counted as objective (obj_score > 0.5) or subjective
    (pos_score + neg_score > 0.5).  One CSV row
    [identifier, ratioObj, ratioSub] is written to *outputFile* per line.

    The SentiWordNet data file can be overridden via *swn_filename*; the
    former hard-coded value is kept as the default, so existing callers
    are unaffected.
    """
    swn = SentiWordNetCorpusReader(swn_filename)
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    # FIX: the original also called f.close() inside the "with" block --
    # redundant, the context manager already closes the file
    with codecs.open(filename, 'r', 'utf-8') as f:
        data1 = f.readlines()
    # FIX: keep a handle on the output file so it can be closed at the end
    # (the original leaked it: csv.writer(open(...)) discards the object)
    outF = open(outputFile, 'wb')
    writer = csv.writer(outF)
    # POS tags skipped entirely (conjunctions, numbers, particles, ...)
    tag = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'TO', 'UH', 'PDT', 'SYM', 'RP']
    noun = ['NN', 'NNS', 'NP', 'NPS']
    adj = ['JJ', 'JJR', 'JJS']
    pronoun = ['PP', 'PP$', 'WP', 'WP$']
    verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adverb = ['RB', 'RBR', 'RBS', 'WRB']
    for text in data1:
        # first whitespace-separated token is the line identifier
        count = re.split("\s+", text, 1)[0]
        if len(re.split("\s+", text, 1)) > 1:
            text = re.split("\s+", text, 1)[1]
        Tex = regex.sub(u'', text)
        words = word_tokenize(Tex.lower())
        word = nltk.pos_tag(words)
        objCount = 0
        subCount = 0
        for w in word:
            if not w[1] in tag:
                if w[1] in noun:
                    pos_Char = 'n'
                elif w[1] in adj:
                    pos_Char = 'a'
                elif w[1] in pronoun:
                    # NOTE(review): 'p' is not a valid SentiWordNet POS tag,
                    # so pronoun lookups always fail and are swallowed by the
                    # except below -- confirm whether pronouns should simply
                    # be skipped like the tags in `tag`
                    pos_Char = 'p'
                elif w[1] in verb:
                    pos_Char = 'v'
                elif w[1] in adverb:
                    pos_Char = 'r'
                else:
                    pos_Char = 'none'
                # FIX: the two lookup branches were duplicated except for the
                # POS argument; merged into one try block, behavior unchanged
                try:
                    if pos_Char == 'none':
                        s = swn.senti_synsets(w[0])
                    else:
                        s = swn.senti_synsets(w[0], pos_Char)
                    scores = list(s)[0]
                    if scores.obj_score > 0.5:
                        objCount += 1
                    elif scores.pos_score + scores.neg_score > 0.5:
                        subCount += 1
                except:
                    # word missing from SentiWordNet (or empty synset list)
                    print('Unexpected word')
        if objCount + subCount > 0:
            ratioObj = float(objCount) / (objCount + subCount)
            ratioSub = float(subCount) / (objCount + subCount)
        else:
            ratioObj = 0.0
            ratioSub = 0.0
        writer.writerow([count, ratioObj, ratioSub])
    outF.close()