def add_term_frequencies(self, tweet, counts):
    """Add each token's occurrence count from one tweet to the running term-frequency dict."""
    tokens = tweet_preprocessing(tweet)
    for t in tokens:
        counts[t] = counts.get(t, 0) + 1
    return counts
def add_doc_frequencies(self, tweet, counts):
    """Increment the document-frequency count once per distinct token in the tweet."""
    tokens = tweet_preprocessing(tweet)
    for t in set(tokens):
        counts[t] = counts.get(t, 0) + 1
    return counts
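#A minimal sketch of how the two counters above differ (hypothetical values;
#assumes tweet_preprocessing simply lowercases and tokenizes on whitespace):
#for the tweet "free my guy free him", add_term_frequencies yields
#{'free': 2, 'my': 1, 'guy': 1, 'him': 1}, while add_doc_frequencies
#credits each distinct token once: {'free': 1, 'my': 1, 'guy': 1, 'him': 1}.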
def predict_distant_label(self, tweet):
    """Distantly label a tweet as 'aggress', 'loss', or 'other' by counting
    indicator tokens. Returns (tweet with indicator tokens removed, label),
    or None if no indicators match. Ties break in the order
    aggress > loss > other."""
    tokens = tweet_preprocessing(tweet)
    aggress_likelihood = len([t for t in tokens if t in self.aggress_indicators])
    loss_likelihood = len([t for t in tokens if t in self.loss_indicators])
    other_likelihood = len([t for t in tokens if t in self.other_indicators])
    if aggress_likelihood > 0 and aggress_likelihood >= loss_likelihood \
            and aggress_likelihood >= other_likelihood:
        return ' '.join([t for t in tokens
                         if t not in self.aggress_indicators]), 'aggress'
    elif loss_likelihood > 0 and loss_likelihood >= other_likelihood:
        return ' '.join([t for t in tokens
                         if t not in self.loss_indicators]), 'loss'
    elif other_likelihood > 0:
        return ' '.join([t for t in tokens
                         if t not in self.other_indicators]), 'other'
    else:
        return None
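#Usage sketch (hypothetical instance and indicator lists, for illustration only):
#  labeler.aggress_indicators = ['smoke', 'opp']
#  labeler.loss_indicators = ['rip', 'miss']
#  labeler.other_indicators = ['bored']
#  labeler.predict_distant_label('rip lil bro miss u')
#  -> ('lil bro u', 'loss')   #the two loss indicators win; they are stripped
#                             #from the returned text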
def importance(self, tweets, num_indicators):
    #things we don't want to use as indicators
    no_fly = ['�e_', '“USER_HANDLE:']
    aggress, loss = get_label_sets()
    aggress_tf, loss_tf, other_tf = {}, {}, {}
    aggress_df, loss_df, other_df = {}, {}, {}
    vocab = set()

    #get vocab
    for t in tweets:
        vocab.update(tweet_preprocessing(t[0]))

    #initialize tf, df scores
    for v in vocab:
        aggress_tf[v], loss_tf[v], other_tf[v] = 0.0, 0.0, 0.0
        aggress_df[v], loss_df[v], other_df[v] = 0.0, 0.0, 0.0

    #get term, doc counts for vocab on each label
    for tweet in tweets:
        found = False
        for l in aggress:
            if l in tweet[1].lower() and not found:
                found = True
                aggress_tf = self.add_term_frequencies(tweet[0], aggress_tf)
                aggress_df = self.add_doc_frequencies(tweet[0], aggress_df)
        for l in loss:
            if l in tweet[1].lower() and not found:
                found = True
                loss_tf = self.add_term_frequencies(tweet[0], loss_tf)
                loss_df = self.add_doc_frequencies(tweet[0], loss_df)
        if not found:
            other_tf = self.add_term_frequencies(tweet[0], other_tf)
            other_df = self.add_doc_frequencies(tweet[0], other_df)

    #normalization counts (fall back to 1 when a class is empty,
    #so the divisions below can't raise ZeroDivisionError)
    aggress_total_tf = sum([aggress_tf[v] for v in vocab]) or 1
    loss_total_tf = sum([loss_tf[v] for v in vocab]) or 1
    other_total_tf = sum([other_tf[v] for v in vocab]) or 1
    aggress_total_df = sum([aggress_df[v] for v in vocab]) or 1
    loss_total_df = sum([loss_df[v] for v in vocab]) or 1
    other_total_df = sum([other_df[v] for v in vocab]) or 1

    #calculate aggress, loss, other importance score:
    #(normalized tf_pos * normalized df_pos)
    #  - (normalized tf_neg * normalized df_neg),
    #where "neg" pools the counts of the other two classes
    aggress_scores, loss_scores, other_scores = {}, {}, {}
    for v in vocab:
        #aggress importance score
        aggress_scores[v] = (aggress_tf[v] / aggress_total_tf) * \
            (aggress_df[v] / aggress_total_df)
        aggress_scores[v] -= ((loss_tf[v] + other_tf[v]) /
                              (loss_total_tf + other_total_tf)) * \
                             ((loss_df[v] + other_df[v]) /
                              (loss_total_df + other_total_df))
        #loss importance score
        loss_scores[v] = (loss_tf[v] / loss_total_tf) * \
            (loss_df[v] / loss_total_df)
        loss_scores[v] -= ((aggress_tf[v] + other_tf[v]) /
                           (aggress_total_tf + other_total_tf)) * \
                          ((aggress_df[v] + other_df[v]) /
                           (aggress_total_df + other_total_df))
        #other importance score
        other_scores[v] = (other_tf[v] / other_total_tf) * \
            (other_df[v] / other_total_df)
        other_scores[v] -= ((loss_tf[v] + aggress_tf[v]) /
                            (loss_total_tf + aggress_total_tf)) * \
                           ((loss_df[v] + aggress_df[v]) /
                            (loss_total_df + aggress_total_df))

    #choose indicators based on importance score
    sorted_aggress = [k[0] for k in sorted(aggress_scores.items(),
                                           key=operator.itemgetter(1),
                                           reverse=True) if k[0] not in no_fly]
    sorted_loss = [k[0] for k in sorted(loss_scores.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True) if k[0] not in no_fly]
    sorted_other = [k[0] for k in sorted(other_scores.items(),
                                         key=operator.itemgetter(1),
                                         reverse=True) if k[0] not in no_fly]
    #resolve indicators that rank highly for more than one label
    sorted_aggress, sorted_loss, sorted_other = self.no_conficting_indicators(
        num_indicators, sorted_aggress, sorted_loss, sorted_other)

    for s in sorted_aggress[:num_indicators]:
        print 'aggress ' + str(s)  #TESTING
    for s in sorted_loss[:num_indicators]:
        print 'loss ' + str(s)  #TESTING
    for s in sorted_other[:num_indicators]:
        print 'other ' + str(s)  #TESTING

    return (sorted_aggress[:num_indicators], sorted_loss[:num_indicators],
            sorted_other[:num_indicators])
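#Worked toy example of the importance score above (hypothetical counts):
#suppose token v has tf 3 and df 2 in the aggress class (totals 100 and 10),
#and pooled tf 1 and df 1 over loss+other (totals 200 and 20). Then
#  aggress_scores[v] = (3/100.0) * (2/10.0) - (1/200.0) * (1/20.0)
#                    = 0.006 - 0.00025 = 0.00575
#so v ranks high as an aggress indicator: frequent and widespread in
#aggress tweets, rare elsewhere.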
scorer = EmotionScorer(sw=True)
total_words_scored = 0
total_words = 0
data_file = 'data/gakirah/gakirah_aggress_loss.csv'
emoji_data_file = 'data/emotion/emoji_lex.txt'
f = open(data_file, 'rU')
reader = csv.DictReader(f)

#FOR EMOTION ERROR ANALYSIS
count = 1
for row in reader:
    if count > 50:
        break
    tweet = row['CONTENT']
    print str(count) + ' ' + tweet
    tokens = tweet_preprocessing(tweet)
    count += 1
    for t in tokens:
        res = scorer.min_max_score(t.lower())
        print t + ' ' + str(res)
    cont = raw_input()  #pause between tweets during manual inspection
    print ''
f.close()

#FOR EMOJI TESTING
emojis = {}
f = open(emoji_data_file, 'r')
for line in f:
    if len(line.strip()) < 1:
        continue
    tokens = line.split(' : ')
    emojis[tokens[0].lower()] = tokens[1].strip()  #strip the trailing newline
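#The loop above assumes each line of emoji_lex.txt has the form
#'<emoji> : <description>' (an assumption inferred from the split on ' : ');
#e.g. a line such as '\U0001f52b : gun' would yield
#emojis['\U0001f52b'] = 'gun'.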
#assumes f (the input CSV handle) and out_file were opened/defined above
reader = csv.DictReader(f)
aggress, loss = get_label_sets()
w = open(out_file, 'wb')
writer = csv.DictWriter(w, ['AUTHOR', 'CONTENT', 'LABEL', 'DATE', 'URL',
                            'DESC'])
writer.writeheader()
for row in reader:
    label = row['LABEL'].lower()
    d = {
        'AUTHOR': row['AUTHOR'],
        'CONTENT': row['CONTENT'],
        'LABEL': label,
        'DATE': row['DATE'],
        'URL': row['URL'],
        'DESC': row['DESC']
    }
    tokens = tweet_preprocessing(row['CONTENT'].lower())
    good_tokens = []
    for t in tokens:
        #keep tokens that contain at least one letter, aren't an
        #r.i.p./b.i.p. variant, and aren't the user-handle placeholder
        if any(c.isalpha() for c in t) and not re.match(
                '(r|b).?i.?p.?', t) and t != 'USER_HANDLE':
            good_tokens.append(t)
    #only write out tweets that still have at least one contentful token
    if len(good_tokens) > 0:
        writer.writerow(d)
f.close()
w.close()
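#A quick check of the r.i.p. filter above (hypothetical tokens, Python 2 shell):
#  >>> import re
#  >>> [t for t in ['rip', 'r.i.p', 'bip', 'ripple', 'trip']
#  ...  if re.match('(r|b).?i.?p.?', t)]
#  ['rip', 'r.i.p', 'bip', 'ripple']
#re.match anchors at the start of the token, so 'trip' survives the filter
#while 'ripple' is (perhaps surprisingly) dropped.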