Example #1
def add_term_frequencies(self, tweet, counts):
    """Add one count for every token occurrence in the tweet."""
    for t in tweet_preprocessing(tweet):
        counts[t] = counts.get(t, 0) + 1
    return counts
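A minimal driver sketch for this accumulator, assuming a stand-in tokenizer (the project's real tweet_preprocessing, which normalizes handles and emoji, is not shown in these examples):

def tweet_preprocessing(tweet):
    # stand-in tokenizer, for illustration only
    return tweet.lower().split()

def add_term_frequencies(tweet, counts):
    for t in tweet_preprocessing(tweet):
        counts[t] = counts.get(t, 0) + 1
    return counts

counts = {}
for tw in ['free my guy', 'free free my city']:
    counts = add_term_frequencies(tw, counts)
print(counts)  # {'free': 3, 'my': 2, 'guy': 1, 'city': 1}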
Example #2
def add_doc_frequencies(self, tweet, counts):
    """Add at most one count per distinct token in the tweet."""
    already_seen = []
    for t in tweet_preprocessing(tweet):
        if t not in already_seen:
            counts[t] = counts.get(t, 0) + 1
            already_seen.append(t)
    return counts
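The only difference from Example #1 is that each tweet contributes at most one count per token. A quick contrast, reusing the stand-ins from the sketch above (a set replaces the already_seen list but yields the same counts):

def add_doc_frequencies(tweet, counts):
    for t in set(tweet_preprocessing(tweet)):
        counts[t] = counts.get(t, 0) + 1
    return counts

tf, df = {}, {}
for tw in ['gone gone gone', 'gone home']:
    tf = add_term_frequencies(tw, tf)
    df = add_doc_frequencies(tw, df)
print(tf)  # {'gone': 4, 'home': 1}
print(df)  # {'gone': 2, 'home': 1} -- capped at one per tweet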
Example #3
    def predict_distant_label(self, tweet):
        """Distantly label a tweet by counting indicator-term hits; ties
        break in the order aggress > loss > other. Returns the tweet text
        with the matched indicators stripped, plus the label, or None if
        no indicator matches."""
        tokens = tweet_preprocessing(tweet)
        aggress_likelihood = len([t for t in tokens if t in self.aggress_indicators])
        loss_likelihood = len([t for t in tokens if t in self.loss_indicators])
        other_likelihood = len([t for t in tokens if t in self.other_indicators])

        if aggress_likelihood > 0 and aggress_likelihood >= loss_likelihood \
                and aggress_likelihood >= other_likelihood:
            return ' '.join([t for t in tokens
                             if t not in self.aggress_indicators]), 'aggress'
        elif loss_likelihood > 0 and loss_likelihood >= other_likelihood:
            return ' '.join([t for t in tokens
                             if t not in self.loss_indicators]), 'loss'
        elif other_likelihood > 0:
            return ' '.join([t for t in tokens
                             if t not in self.other_indicators]), 'other'
        else:
            return None
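A self-contained restatement of the same decision rule, with made-up indicator sets, shows how ties break toward aggress and how matched indicators are stripped from the returned text:

def distant_label(tokens, aggress, loss, other):
    a = sum(1 for t in tokens if t in aggress)
    l = sum(1 for t in tokens if t in loss)
    o = sum(1 for t in tokens if t in other)
    if a > 0 and a >= l and a >= o:
        return ' '.join(t for t in tokens if t not in aggress), 'aggress'
    if l > 0 and l >= o:
        return ' '.join(t for t in tokens if t not in loss), 'loss'
    if o > 0:
        return ' '.join(t for t in tokens if t not in other), 'other'
    return None

print(distant_label('rip lil bro miss you'.split(),
                    {'opps'}, {'rip', 'miss'}, {'lol'}))
# ('lil bro you', 'loss')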
Example #4
    def importance(self, tweets, num_indicators):
        no_fly = ['�e_', '“USER_HANDLE:']  # tokens we don't want to use as indicators

        aggress, loss = get_label_sets()
        aggress_tf, loss_tf, other_tf = {}, {}, {}
        aggress_df, loss_df, other_df = {}, {}, {}
        vocab = set()

        #get vocab
        for t in tweets:
            vocab.update(tweet_preprocessing(t[0]))

        # initialize tf, df scores
        for v in vocab:
            aggress_tf[v], loss_tf[v], other_tf[v] = 0.0, 0.0, 0.0
            aggress_df[v], loss_df[v], other_df[v] = 0.0, 0.0, 0.0

        # get term, doc counts for vocab on each label
        for tweet in tweets:
            found = False
            for l in aggress:
                if l in tweet[1].lower():
                    found = True
                    aggress_tf = self.add_term_frequencies(tweet[0], aggress_tf)
                    aggress_df = self.add_doc_frequencies(tweet[0], aggress_df)
                    break
            if not found:
                for l in loss:
                    if l in tweet[1].lower():
                        found = True
                        loss_tf = self.add_term_frequencies(tweet[0], loss_tf)
                        loss_df = self.add_doc_frequencies(tweet[0], loss_df)
                        break
            if not found:
                other_tf = self.add_term_frequencies(tweet[0], other_tf)
                other_df = self.add_doc_frequencies(tweet[0], other_df)

        # normalization totals (fall back to 1 to avoid division by zero
        # when a class is empty)
        aggress_total_tf = sum([aggress_tf[v] for v in vocab]) or 1
        loss_total_tf = sum([loss_tf[v] for v in vocab]) or 1
        other_total_tf = sum([other_tf[v] for v in vocab]) or 1

        aggress_total_df = sum([aggress_df[v] for v in vocab]) or 1
        loss_total_df = sum([loss_df[v] for v in vocab]) or 1
        other_total_df = sum([other_df[v] for v in vocab]) or 1

        # calculate aggress, loss, other importance scores:
        # score = (norm. tf_in * norm. df_in) - (norm. tf_out * norm. df_out)

        aggress_scores, loss_scores, other_scores = {}, {}, {}
        for v in vocab:
            # aggress importance score
            aggress_scores[v] = ((aggress_tf[v] / aggress_total_tf)
                                 * (aggress_df[v] / aggress_total_df))
            aggress_scores[v] -= (((loss_tf[v] + other_tf[v])
                                   / (loss_total_tf + other_total_tf))
                                  * ((loss_df[v] + other_df[v])
                                     / (loss_total_df + other_total_df)))

            # loss importance score
            loss_scores[v] = ((loss_tf[v] / loss_total_tf)
                              * (loss_df[v] / loss_total_df))
            loss_scores[v] -= (((aggress_tf[v] + other_tf[v])
                                / (aggress_total_tf + other_total_tf))
                               * ((aggress_df[v] + other_df[v])
                                  / (aggress_total_df + other_total_df)))

            # other importance score
            other_scores[v] = ((other_tf[v] / other_total_tf)
                               * (other_df[v] / other_total_df))
            other_scores[v] -= (((loss_tf[v] + aggress_tf[v])
                                 / (loss_total_tf + aggress_total_tf))
                                * ((loss_df[v] + aggress_df[v])
                                   / (loss_total_df + aggress_total_df)))

        # choose indicators by descending importance score
        sorted_aggress = [k for k, _ in sorted(aggress_scores.items(),
                                               key=operator.itemgetter(1),
                                               reverse=True) if k not in no_fly]
        sorted_loss = [k for k, _ in sorted(loss_scores.items(),
                                            key=operator.itemgetter(1),
                                            reverse=True) if k not in no_fly]
        sorted_other = [k for k, _ in sorted(other_scores.items(),
                                             key=operator.itemgetter(1),
                                             reverse=True) if k not in no_fly]

        sorted_aggress, sorted_loss, sorted_other = self.no_conficting_indicators(
            num_indicators, sorted_aggress, sorted_loss, sorted_other)

        for s in sorted_aggress[:num_indicators]:
            print 'aggress ' + str(s)  #TESTING
        for s in sorted_loss[:num_indicators]:
            print 'loss ' + str(s)  #TESTING
        for s in sorted_other[:num_indicators]:
            print 'other ' + str(s)  #TESTING

        return (sorted_aggress[:num_indicators],
                sorted_loss[:num_indicators],
                sorted_other[:num_indicators])
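The per-term score above reduces to the product of the normalized in-class tf and df, minus the same product over the other two classes pooled. A compact restatement with made-up counts (all numbers hypothetical):

def importance_score(tf_in, tf_in_total, df_in, df_in_total,
                     tf_out, tf_out_total, df_out, df_out_total):
    # in-class weight minus pooled out-of-class weight
    return (float(tf_in) / tf_in_total) * (float(df_in) / df_in_total) \
         - (float(tf_out) / tf_out_total) * (float(df_out) / df_out_total)

# a term with 30 occurrences across 25 aggress tweets, but only
# 5 occurrences across 5 tweets of the other two classes:
print(importance_score(30, 1000, 25, 400, 5, 2000, 5, 800))
# 0.03 * 0.0625 - 0.0025 * 0.00625 ~= 0.00186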
Example #5
import csv
import re  # used by the commented-out emoji-testing block below

# EmotionScorer, tweet_preprocessing, and get_label_sets come from the
# project's own modules and are not shown in these examples.
scorer = EmotionScorer(sw=True)
total_words_scored = 0
total_words = 0

data_file = 'data/gakirah/gakirah_aggress_loss.csv'
emoji_data_file = 'data/emotion/emoji_lex.txt'
f = open(data_file, 'rU')
reader = csv.DictReader(f)

# FOR EMOTION ERROR ANALYSIS
count = 1
for row in reader:
    if count > 50: break  # only inspect the first 50 tweets
    tweet = row['CONTENT']
    print str(count) + ' ' + tweet
    tokens = tweet_preprocessing(tweet)
    count += 1
    for t in tokens:
        res = scorer.min_max_score(t.lower())
        print t + ' ' + str(res)
    raw_input()  # pause for manual inspection between tweets
    print
f.close()

'''
# FOR EMOJI TESTING
emojis = {}
f = open(emoji_data_file, 'r')

for line in f:
    if len(line.strip()) < 1: continue
    tokens = line.split(' : ')
    emojis[tokens[0].lower()] = tokens[1]
f.close()

# reopen the tweet CSV (f pointed at the emoji lexicon and was exhausted)
f = open(data_file, 'rU')
reader = csv.DictReader(f)

aggress, loss = get_label_sets()

w = open(out_file, 'wb')  # out_file: output path for the filtered CSV (defined elsewhere)
writer = csv.DictWriter(w,
                        ['AUTHOR', 'CONTENT', 'LABEL', 'DATE', 'URL', 'DESC'])

writer.writeheader()
for row in reader:
    label = row['LABEL'].lower()
    d = {
        'AUTHOR': row['AUTHOR'],
        'CONTENT': row['CONTENT'],
        'LABEL': label,
        'DATE': row['DATE'],
        'URL': row['URL'],
        'DESC': row['DESC']
    }

    tokens = tweet_preprocessing(row['CONTENT'].lower())
    good_tokens = []
    for t in tokens:
        # keep tokens that contain a letter and are neither an
        # R.I.P. variant nor the user-handle placeholder
        if any(c.isalpha() for c in t) \
                and not re.match('(r|b).?i.?p.?', t) \
                and t != 'USER_HANDLE':
            good_tokens.append(t)
    if len(good_tokens) > 0: writer.writerow(d)

f.close()
w.close()
'''
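For reference, the R.I.P. filter in the commented-out block is a prefix match, so it also swallows tokens that merely start with the pattern. A quick check (demo values only):

import re
for t in ['rip', 'r.i.p.', 'ripley', 'grip']:
    print('%s %s' % (t, bool(re.match('(r|b).?i.?p.?', t))))
# rip True / r.i.p. True / ripley True / grip False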