def _get_sentiments(self, d):
    """Return sentiment and part-of-speech features for one document.

    The document is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag`` (or taken from the POS cache when ``d`` is already a
    key in it). Every noun, adjective, verb and adverb is looked up in the
    lexicon returned by ``load_sent_word_net()`` under the key
    ``"<type>/<word>"``; a hit supplies a ``(positive, negative)`` pair,
    any other token contributes ``(0, 0)``.

    Parameters
    ----------
    d : str
        The raw document text.

    Returns
    -------
    list
        Seven values, in this order: ``1 - avg_pos - avg_neg``, the mean
        positive score, the mean negative score, and the fractions of
        tokens tagged as nouns, adjectives, verbs and adverbs.
        A document without any token yields ``[1.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0]`` instead of raising ``ZeroDivisionError``.
    """
    sent = tuple(nltk.word_tokenize(d))
    l = len(sent)
    if l == 0:
        # Nothing to average and nothing to divide by: report a document
        # with no sentiment and no tagged words.
        return [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

    sent_word_net = load_sent_word_net()

    # NOTE(review): the cache is re-read on every call and entries added
    # below are never written back by this function, so it only helps for
    # documents already present in the file.
    poscache_filename = "poscache.json"
    try:
        # Context manager so the file handle is closed even if parsing fails.
        with open(poscache_filename, "r") as cache_file:
            poscache = json.load(cache_file)
    except IOError:
        poscache = {}

    # pos_tag tags tokens with part of speech (noun, verb etc)
    if poscache is not None:
        if d in poscache:
            tagged = poscache[d]
        else:
            poscache[d] = tagged = nltk.pos_tag(sent)
    else:
        tagged = nltk.pos_tag(sent)

    # Penn Treebank tag prefix -> word type used in the lexicon keys.
    # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    pos_prefixes = (("NN", "n"), ("JJ", "a"), ("VB", "v"), ("RB", "r"))
    # Float counters so the ratios below are true divisions on Python 2 too.
    counts = {"n": 0., "a": 0., "v": 0., "r": 0.}

    pos_vals = []
    neg_vals = []
    for w, t in tagged:
        p, n = 0, 0
        for prefix, sent_pos_type in pos_prefixes:
            if t.startswith(prefix):
                counts[sent_pos_type] += 1
                sent_word = "%s/%s" % (sent_pos_type, w)
                if sent_word in sent_word_net:
                    p, n = sent_word_net[sent_word]
                break
        # Every token is recorded, so the means run over the whole document.
        pos_vals.append(p)
        neg_vals.append(n)

    avg_pos_val = np.mean(pos_vals)
    avg_neg_val = np.mean(neg_vals)

    return [1 - avg_pos_val - avg_neg_val, avg_pos_val, avg_neg_val,
            counts["n"] / l, counts["a"] / l, counts["v"] / l,
            counts["r"] / l]
from sklearn.metrics import precision_recall_curve, roc_curve, auc from sklearn.cross_validation import ShuffleSplit from sklearn.pipeline import Pipeline from utils import plot_pr from utils import load_sanders_data from utils import tweak_labels from utils import log_false_positives from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.naive_bayes import MultinomialNB from utils import load_sent_word_net sent_word_net = load_sent_word_net() phase = "03" emo_repl = { # positive emoticons "<3": " good ", ":d": " good ", # :D in lower case ":dd": " good ", # :DD in lower case "8)": " good ", ":-)": " good ", ":)": " good ", ";)": " good ", "(-:": " good ", "(:": " good ",
from utils import plot_pr from utils import load_sanders_data from utils import tweak_labels from utils import log_false_positives from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.pipeline import Pipeline, FeatureUnion from sklearn.grid_search import GridSearchCV from sklearn.metrics import f1_score from sklearn.base import BaseEstimator from sklearn.naive_bayes import MultinomialNB from utils import load_sent_word_net sent_word_net = load_sent_word_net() import json poscache_filename = "poscache.json" try: poscache = json.load(open(poscache_filename, "r")) except IOError: poscache = {} class StructCounter(BaseEstimator): def get_feature_names(self): return np.array([ 'sent_neut', 'sent_pos', 'sent_neg', 'nouns', 'adjectives', 'verbs', 'adverbs', 'allcaps', 'exclamation', 'question',
r"\bu\b": "you", r"\bhaha\b": "ha", r"\bhahaha\b": "ha", r"\bdon't\b": "do not", r"\bdoesn't\b": "does not", r"\bdidn't\b": "did not", r"\bhasn't\b": "has not", r"\bhaven't\b": "have not", r"\bhadn't\b": "had not", r"\bwon't\b": "will not", r"\bwouldn't\b": "would not", r"\bcan't\b": "can not", r"\bcannot\b": "can not", } sent_words = load_sent_word_net() def create_ngram_model(params=None): def preprocessor(tweet): tweet = tweet.lower() global emoticons_reversed for emoticon in emoticons_reversed: tweet = tweet.replace(emoticon, emoticons_replacements[emoticon]) for regex, replacement in regex_replace.iteritems(): tweet = re.sub(regex, replacement, tweet) return tweet