def lexicon_tweet(tweet):
    """Label a tweet's sentiment using SWN3 lexicon document scores.

    The tweet is scored with ``BasicDocSentiScore`` against a freshly
    loaded ``SWN3Lexicon`` (untagged input, ``a``/``v``/``n`` enabled,
    ``r`` disabled, negation handling on).

    Args:
        tweet: the text passed to ``classify_document``.

    Returns:
        One of ``'irrelevant'`` (both scores are zero), ``'neutral'``
        (scores differ by less than 0.05), ``'positive'`` or
        ``'negative'``.
    """
    lexicon = sentlex.SWN3Lexicon()
    scorer = sentlex.sentanalysis.BasicDocSentiScore()
    scorer.classify_document(
        tweet,
        tagged=False,
        L=lexicon,
        a=True,
        v=True,
        n=True,
        r=False,
        negation=True,
        verbose=False,
    )

    scores = scorer.resultdata
    pos_score = scores['resultpos']
    neg_score = scores['resultneg']

    # No lexicon hit on either side: nothing to judge.
    if pos_score == 0 and neg_score == 0:
        return 'irrelevant'
    # Scores too close together to call a polarity.
    if abs(pos_score - neg_score) < 0.05:
        return 'neutral'
    return 'positive' if pos_score > neg_score else 'negative'
def setUp(self):
    """Prepare the lexicons used by the test.

    Stores a ``UICLexicon`` in ``self.L1``, a ``SWN3Lexicon`` in
    ``self.L2``, and a ``CompositeLexicon`` containing both (with
    frequencies compiled) in ``self.L``.
    """
    self.L1 = sentlex.UICLexicon()
    self.L2 = sentlex.SWN3Lexicon()
    self.L = composite = sentlex.CompositeLexicon()
    # Registration order matches the attribute order: L1 first, then L2.
    for component in (self.L1, self.L2):
        composite.add_lexicon(component)
    composite.compile_frequency()
def comp():
    """Return a CompositeLexicon built from the UIC and SWN3 lexicons.

    Both member lexicons are added (UIC first, then SWN3) and
    ``compile_frequency`` is called before the composite is returned.
    """
    members = (sentlex.UICLexicon(), sentlex.SWN3Lexicon())
    combined = sentlex.CompositeLexicon()
    for member in members:
        combined.add_lexicon(member)
    combined.compile_frequency()
    return combined
def runTest(self):
    """Check SWN3 word frequencies against recorded baseline values.

    Loads ``SWN3Lexicon``, compiles its frequency table and verifies
    ``get_freq`` for a handful of words.

    The comparison is tolerant to 8 decimal places (the precision shown
    in the failure message) instead of exact float equality, so harmless
    last-bit differences in floating-point arithmetic do not fail the
    test.
    """
    L = sentlex.SWN3Lexicon()
    L.compile_frequency()
    baseline = [
        ('bad', 0.0005451764705882353),
        ('good', 0.002610137254901961),
        ('the', 0.029449176470588236),
        ('want', 0.0027591764705882354),
    ]
    for word, expected in baseline:
        # Look the frequency up once and reuse it for the message.
        actual = L.get_freq(word)
        self.assertAlmostEqual(
            actual,
            expected,
            places=8,
            msg='Incorrect freq found for %s (%.8f <> %.8f)'
                % (word, expected, actual),
        )
from itertools import chain import nltk from sklearn.metrics import classification_report, confusion_matrix from sklearn.preprocessing import LabelBinarizer import sklearn import pycrfsuite from loadTuples import load, load2, load3 from sklearn import svm from evalt import * from collections import Counter import sentlex test_sents = load3("test") #print train_sents #print "sent =" +str(len(train_sents)) SWN = sentlex.SWN3Lexicon() f = open("PredictedTags.pkl", 'rb') Eventpredicted = pickle.load(f) f.close() global wordCnt wordCnt = -1 def word2features(sent, i): """get the feautes corresponding to a word in a sentence at a particular position Args: sent: the sentence whose word is to be considered i: the position of the word in the sentence
def transform(self, documents):
    """Build a matrix of hand-crafted surface and sentiment features.

    Args:
        documents: raw comment strings. The collection is iterated more
            than once, so it must be re-iterable (e.g. a list).

    Returns:
        A numpy array with one row per document and twelve columns, in
        this order: word count, character count, number of tokens
        accepted by the en_US dictionary, number of ``self.__matcher``
        matches, ``!`` count, all-uppercase word count, ``@`` count,
        ratio of ``fakeinsult`` occurrences to words, ``fakeinsult``
        count, ratio of uppercase words to words, ratio of dictionary
        tokens to words, and mean positive lexicon score.

    NOTE: a document with no whitespace-separated words has a zero
    denominator in the three ratio columns, which yields nan/inf under
    NumPy division.
    """
    import enchant
    import sentlex
    from feature_extraction import tokenize_document

    d = enchant.Dict("en_US")
    swn = sentlex.SWN3Lexicon()
    tokenized_documents = [tokenize_document(document)
                           for document in documents]

    # Lexicon lookup keyed by the first two letters of the Penn Treebank
    # tag (RB* adverbs, NN* nouns, JJ* adjectives, VB* verbs).
    scorers = {
        'RB': swn.getadverb,
        'NN': swn.getnoun,
        'JJ': swn.getadjective,
        'VB': swn.getverb,
    }

    n_words = []
    n_chars = []
    all_caps = []  # number of uppercase words
    n_bad = []
    exclamation = []
    addressing = []
    n_dwords = [sum(1 for word in document if d.check(word))
                for document in tokenized_documents]
    sent_pos = []
    # NOTE: sent_neg is collected but is not part of the returned matrix;
    # adding it would change the column layout callers rely on.
    sent_neg = []
    n_you_re = []

    for comment in documents:
        words = comment.split()
        n_words.append(len(words))
        n_chars.append(len(comment))
        all_caps.append(np.sum([w.isupper() for w in words]))
        n_bad.append(comment.count('fakeinsult'))
        exclamation.append(comment.count("!"))
        addressing.append(comment.count("@"))

        doc = nlp(comment)
        count = 0.
        pos_sum = 0.
        neg_sum = 0.
        for token in doc:
            if token.text == 'fakeinsult':
                # The placeholder insult token counts as fully negative.
                neg_sum += 1.
                count += 1.
                continue
            # Use the fine-grained tag: the coarse ``pos_`` attribute
            # holds universal tags (ADV, NOUN, ...) that never start with
            # the Penn prefixes, so the lexicon was never consulted.
            # NOTE(review): assumes ``nlp`` is a spaCy pipeline - confirm.
            scorer = scorers.get(token.tag_[:2])
            if scorer is None:
                continue
            sentiment = scorer(token.text)
            pos_sum += sentiment[0]
            neg_sum += sentiment[1]
            count += 1.

        # Average over the tokens that contributed a score.
        if count != 0:
            pos_sum /= count
            neg_sum /= count
        sent_neg.append(neg_sum)
        sent_pos.append(pos_sum)

        matches = self.__matcher(doc)
        n_you_re.append(len(matches))

    # Builtin ``float`` replaces the ``np.float`` alias, which no longer
    # exists in current NumPy releases.
    word_totals = np.array(n_words, dtype=float)
    allcaps_ratio = np.array(all_caps) / word_totals
    bad_ratio = np.array(n_bad) / word_totals
    dic_ratio = np.array(n_dwords) / word_totals

    return np.array([n_words, n_chars, n_dwords, n_you_re, exclamation,
                     all_caps, addressing, bad_ratio, n_bad,
                     allcaps_ratio, dic_ratio, sent_pos]).T
def transform(self, documents):
    """Build a matrix of hand-crafted surface and sentiment features.

    Args:
        documents: raw comment strings. The collection is iterated more
            than once, so it must be re-iterable (e.g. a list).

    Returns:
        A numpy array with one row per document and thirteen columns, in
        this order: word count, character count, number of tokens
        accepted by the en_US dictionary, number of ``self.__you_re``
        regex matches, number of ``self.__you`` regex matches, ``!``
        count, all-uppercase word count, ``@`` count, ratio of bad-word
        occurrences to words, bad-word occurrence count, ratio of
        uppercase words to words, ratio of dictionary tokens to words,
        and mean positive lexicon score.

    NOTE: a document with no whitespace-separated words has a zero
    denominator in the three ratio columns, which yields nan/inf under
    NumPy division.
    """
    import enchant
    import re
    import sentlex
    from pattern.en import tag as tagger

    d = enchant.Dict("en_US")
    SWN = sentlex.SWN3Lexicon()
    from feature_extraction import tokenize_document

    tokenized_documents = [tokenize_document(document)
                           for document in documents]

    n_words = [len(c.split()) for c in documents]
    n_chars = [len(c) for c in documents]
    n_dwords = [sum(1 for word in document if d.check(word))
                for document in tokenized_documents]

    # Lexicon lookup keyed by the first two letters of the tag
    # (RB* adverbs, NN* nouns, JJ* adjectives, VB* verbs).
    scorers = {
        'RB': SWN.getadverb,
        'NN': SWN.getnoun,
        'JJ': SWN.getadjective,
        'VB': SWN.getverb,
    }

    sent_pos = []
    # NOTE: sent_neg is collected but is not part of the returned matrix;
    # adding it would change the column layout callers rely on.
    sent_neg = []
    for comment in documents:
        count = 0.
        pos_sum = 0.
        neg_sum = 0.
        for word, tag in tagger(comment.lower()):
            if word == 'fakeinsult':
                # The placeholder insult token counts as fully negative.
                neg_sum += 1.
                count += 1.
                continue
            scorer = scorers.get(tag[:2])
            if scorer is None:
                continue
            sentiment = scorer(word)
            pos_sum += sentiment[0]
            neg_sum += sentiment[1]
            count += 1.

        # Average over the tokens that contributed a score.
        if count != 0:
            pos_sum /= count
            neg_sum /= count
        sent_neg.append(neg_sum)
        sent_pos.append(pos_sum)

    n_you_re = [len(re.findall(self.__you_re, document))
                for document in documents]
    n_you = [len(re.findall(self.__you, document))
             for document in documents]

    # number of uppercase words
    allcaps = [np.sum([w.isupper() for w in comment.split()])
               for comment in documents]

    # number of bad-word occurrences (substring counts, case-insensitive)
    n_bad = [np.sum([c.lower().count(w) for w in self.__badwords])
             for c in documents]

    exclamation = [c.count("!") for c in documents]
    addressing = [c.count("@") for c in documents]

    # Builtin ``float`` replaces the ``np.float`` alias, which no longer
    # exists in current NumPy releases.
    word_totals = np.array(n_words, dtype=float)
    allcaps_ratio = np.array(allcaps) / word_totals
    bad_ratio = np.array(n_bad) / word_totals
    dic_ratio = np.array(n_dwords) / word_totals

    return np.array([
        n_words, n_chars, n_dwords, n_you_re, n_you, exclamation, allcaps,
        addressing, bad_ratio, n_bad, allcaps_ratio, dic_ratio, sent_pos
    ]).T
def swn3():
    """Return a SWN3Lexicon with its frequency table compiled."""
    lexicon = sentlex.SWN3Lexicon()
    lexicon.compile_frequency()
    return lexicon