def _get_sentiments(self, d):
        """Compute sentiment and part-of-speech ratio features for one text.

        The text is tokenized and POS-tagged with NLTK.  Every token whose
        tag starts with ``NN``, ``JJ``, ``VB`` or ``RB`` is counted as a
        noun, adjective, verb or adverb and looked up in the table returned
        by ``load_sent_word_net()`` under the key ``"<n|a|v|r>/<token>"``;
        a hit supplies that token's (positive, negative) score pair, any
        other token contributes ``(0, 0)``.

        Parameters
        ----------
        d : str
            The text to analyse.

        Returns
        -------
        list
            Seven values: ``[1 - avg_pos - avg_neg, avg_pos, avg_neg,
            nouns, adjectives, verbs, adverbs]`` where the averages run over
            all tagged tokens and the last four entries are counts divided
            by the number of tokens.  A text without any tokens yields
            ``[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]`` instead of failing with
            a division by zero.
        """
        sent_word_net = load_sent_word_net()

        poscache_filename = "poscache.json"
        try:
            # The context manager closes the handle even if parsing fails.
            with open(poscache_filename, "r") as cache_file:
                poscache = json.load(cache_file)
        except IOError:
            poscache = {}
        # NOTE(review): the cache is re-read on every call and entries added
        # below are never written back within this method -- confirm whether
        # persisting it is handled elsewhere.

        # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        sent = tuple(nltk.word_tokenize(d))
        if not sent:
            # Nothing to average or to divide by: report a fully neutral
            # text with zero part-of-speech ratios.
            return [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

        # pos_tag tags tokens with their part of speech (noun, verb etc.).
        # json.load may legitimately return None for a file containing
        # `null`, hence the explicit check.
        if poscache is not None:
            if d in poscache:
                tagged = poscache[d]
            else:
                poscache[d] = tagged = nltk.pos_tag(sent)
        else:
            tagged = nltk.pos_tag(sent)

        # Penn Treebank tag prefix -> part-of-speech letter used in the
        # lookup key.
        tag_prefixes = (("NN", "n"), ("JJ", "a"), ("VB", "v"), ("RB", "r"))
        # Float counters keep the ratios below in true division on Python 2.
        counts = {"n": 0., "a": 0., "v": 0., "r": 0.}

        pos_vals = []
        neg_vals = []

        for w, t in tagged:
            p, n = 0, 0
            sent_pos_type = None
            for prefix, pos_type in tag_prefixes:
                if t.startswith(prefix):
                    sent_pos_type = pos_type
                    break

            if sent_pos_type is not None:
                counts[sent_pos_type] += 1
                sent_word = "%s/%s" % (sent_pos_type, w)

                if sent_word in sent_word_net:
                    p, n = sent_word_net[sent_word]

            pos_vals.append(p)
            neg_vals.append(n)

        n_tokens = len(sent)
        avg_pos_val = np.mean(pos_vals)
        avg_neg_val = np.mean(neg_vals)
        return [1 - avg_pos_val - avg_neg_val, avg_pos_val, avg_neg_val,
                counts["n"] / n_tokens, counts["a"] / n_tokens,
                counts["v"] / n_tokens, counts["r"] / n_tokens]
Beispiel #2
0
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit
from sklearn.pipeline import Pipeline

from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels
from utils import log_false_positives

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB

from utils import load_sent_word_net

# Evaluated once, at import time, via the project helper imported from utils.
# NOTE(review): presumably a sentiment lexicon keyed by word -- confirm
# against utils.load_sent_word_net.
sent_word_net = load_sent_word_net()

# NOTE(review): the use of this value is not visible in this excerpt; it
# looks like a run/phase identifier -- confirm where it is consumed.
phase = "03"

emo_repl = {
    # positive emoticons
    "<3": " good ",
    ":d": " good ", # :D in lower case
    ":dd": " good ", # :DD in lower case
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",
Beispiel #3
0
from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels
from utils import log_false_positives

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator

from sklearn.naive_bayes import MultinomialNB

from utils import load_sent_word_net

# Evaluated once, at import time, via the project helper imported from utils.
# NOTE(review): presumably a sentiment lexicon keyed by word -- confirm
# against utils.load_sent_word_net.
sent_word_net = load_sent_word_net()

import json

# Load the part-of-speech tag cache from disk, falling back to an empty
# cache when the file cannot be opened.
poscache_filename = "poscache.json"
try:
    # The context manager guarantees the file handle is closed, including
    # when json.load raises on malformed content.
    with open(poscache_filename, "r") as poscache_file:
        poscache = json.load(poscache_file)
except IOError:
    # No readable cache file: start with an empty cache.
    poscache = {}


class StructCounter(BaseEstimator):
    def get_feature_names(self):
        return np.array([
            'sent_neut', 'sent_pos', 'sent_neg', 'nouns', 'adjectives',
            'verbs', 'adverbs', 'allcaps', 'exclamation', 'question',
Beispiel #4
0
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
}

# Evaluated once, at import time; the result is kept in a module-level name.
# NOTE(review): presumably a sentiment lexicon -- confirm against the
# definition of load_sent_word_net.
sent_words = load_sent_word_net()


def create_ngram_model(params=None):
    def preprocessor(tweet):
        """Return a normalized copy of ``tweet``.

        The text is lower-cased first, so every later substitution is
        matched against lower-case text.  Each entry of
        ``emoticons_reversed`` is then replaced by its value in
        ``emoticons_replacements``, and finally every pattern/replacement
        pair of ``regex_replace`` is applied with ``re.sub``.
        """
        global emoticons_reversed

        tweet = tweet.lower()

        # Plain substring replacement, in the iteration order of
        # emoticons_reversed.
        for emoticon in emoticons_reversed:
            tweet = tweet.replace(emoticon, emoticons_replacements[emoticon])

        # dict.items() exists on both Python 2 and 3; iteritems() is
        # Python-2-only and raises AttributeError on Python 3.
        for regex, replacement in regex_replace.items():
            tweet = re.sub(regex, replacement, tweet)

        return tweet