def test_multiple_topics_with_stop_words(self):
    article = u"the banana a banana an banana or orange and orange"
    topics = get_topics(article)
    self.assertEqual(len(topics), 2)
    self.assertTrue("banana" in topics)
    self.assertTrue("orange" in topics)
    self.assertFalse("the" in topics)
def get_synonyms(word):
    # Clean input word (lowercase, stemming, etc.)
    # res = topics.get_topics(word)
    cleanedWord = word
    # Load trained model. !-CHANGE PATH TO PATH OF MODEL-!
    model = word2vec.load(
        "/Users/thiagolobo/Desktop/irproject/repo_new/stalkr/stalkr/vectors.bin"
    )
    try:
        # Get similar words from the model.
        indexes, metrics = model.cosine(cleanedWord)
        synonyms = model.vocab[indexes]
        # Clean data: join the synonyms and run them through the topic extractor.
        string = ""
        for elem in synonyms:
            string = string + " " + elem
        cleanedSynonyms = topics.get_topics(string)
        # Remove every duplicate of the search word itself.
        wordList = list(cleanedSynonyms)
        while cleanedWord in wordList:
            wordList.remove(cleanedWord)
    except KeyError:
        # Return an empty list if the word was not present in the model.
        wordList = []
    return wordList
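# A minimal usage sketch for get_synonyms (hypothetical words; this assumes the
# word2vec model at the path above exists and contains "king" in its vocabulary,
# neither of which this repo guarantees):
for synonym in get_synonyms("king"):
    print(synonym)
# An out-of-vocabulary word yields an empty list because the KeyError is caught:
assert get_synonyms("zzz-not-in-vocab") == []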
def test_punctuation(self):
    article = u"the cat was on the mat."
    topics = get_topics(article)
    self.assertEqual(len(topics), 2)
    self.assertTrue("cat" in topics)
    self.assertTrue("mat" in topics)
    self.assertFalse("." in topics)
def tokenize(query):
    tokens = topics.get_topics(query)
    tokenized = {}
    for token in tokens:
        if token in tokenized:
            tokenized[token] += 1
        else:
            tokenized[token] = 1
    return tokenized
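# The counting loop above is equivalent to collections.Counter; a sketch of the
# same function using the standard library (a drop-in alternative, not how the
# repo actually implements it):
from collections import Counter

def tokenize_with_counter(query):
    # Counter is a dict subclass mapping each token to its frequency.
    return Counter(topics.get_topics(query))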
def list_topics(area_id):
    if users.login_id() == 0:
        return render_template("index.html", login="******")
    else:
        area_name = areas.get_areaname(area_id)
        tlist = topics.get_topics(area_id)
        return render_template("topics.html", area_id=area_id,
                               area_name=area_name, topics=tlist)
def __init__(self, grouped_tweets_csv_filename, plot_year):
    self.grouped_tweets_csv_filename = grouped_tweets_csv_filename
    self.year = plot_year
    self.topics = t.get_topics()
    self.female_topics = [
        'womens_rights', 'education', 'healthcare', 'familycare'
    ]
    self.male_topics = [
        'military', 'economics', 'foreign_affairs', 'policing'
    ]
    self.party_and_gender_colors = t.get_colors('gender_and_party')
    self.gender_colors = t.get_colors('gender')
    self.party_colors = t.get_colors('party')
def main():
    print('Loading corpus...')
    chapters = load_corpus()
    print('Finding named entities...')
    entities_list = get_named_entities(chapters)
    entities_list = filter_entities(entities_list)
    print('Processing corpus...')
    chapters_processed = preprocess(chapters)
    print('Creating TermDocumentMatrix...')
    vocabulary, vector_space = get_termDocumentMatrix(chapters_processed)
    print('Computing entities relationships...')
    entity_matrix = get_entity_matrix(entities_list, vocabulary, vector_space)
    print('Topic extraction...')
    topics = get_topics(chapters_processed)
    print('Entities graph...')
    draw_entities_relation(entity_matrix, topics)
def get(self):
    cypher = self.db.cypher
    query = self.get_argument('q')
    alpha = float(self.get_argument('a', 0.5))
    prtype = self.get_argument('t', 'rankb')
    limit = int(self.get_argument('l', 50))
    users = recommend(query, alpha=alpha, pr_type=prtype, limit=limit)
    tokens = get_topics(query)
    rawtokens = query.split(" ")
    # Get synonyms for all tokens present in the supplied query and flatten
    # the result into a single list.
    allsyn = sum([get_synonyms(token) for token in rawtokens], [])
    # Quote all synonyms.
    allsyn = ["'" + s + "'" for s in allsyn]
    # Only keep the synonyms that exist in the database.
    cursor = self.db.cypher.execute("MATCH (w:Word) WHERE w.name IN [" +
                                    ",".join(allsyn) + "] RETURN w.name")
    synonyms = [w["w.name"] for w in cursor]
    res = {'users': users, 'tokens': tokens, 'synonyms': synonyms}
    self.write(res)
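# The handler above splices quoted strings into its Cypher IN-list, which is
# injection-prone. A self-contained sketch of a parameterized alternative,
# assuming a py2neo-2.x-style Graph whose cypher.execute accepts keyword
# parameters (the URL is a placeholder; verify against your driver's API):
from py2neo import Graph

def filter_known_words(words):
    # Pass the list as a query parameter instead of concatenating quoted names.
    graph = Graph("http://localhost:7474/db/data/")
    cursor = graph.cypher.execute(
        "MATCH (w:Word) WHERE w.name IN {names} RETURN w.name",
        names=words)
    return [row["w.name"] for row in cursor]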
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import topics as t

topics = t.get_topics()
topic_colors = t.get_colors('topics')
topic_gendered_colors = t.get_colors('topics_gendered')


def replace_party(party):
    if party == 'Republican Party':
        return 'Republican'
    elif party == 'Democratic Party':
        return 'Democratic'
    else:
        return party


def date_to_month_and_year(date_string):
    date_string = standardize_date_string(date_string)
    month, day, year = date_string.split('/')
    month_and_year = month + '/' + year
    return month_and_year


def standardize_date_string(date_string):
    ret = ''
    split_string = date_string.split('/')
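# The snippet above cuts off inside standardize_date_string. A hypothetical
# completion, assuming (from its use in date_to_month_and_year) that it returns
# a 'MM/DD/YYYY' string with zero-padded components -- a guess, not the repo's
# actual implementation:
def standardize_date_string_sketch(date_string):
    month, day, year = date_string.split('/')
    # zfill pads single-digit components, e.g. '1/5/2018' -> '01/05/2018'.
    return month.zfill(2) + '/' + day.zfill(2) + '/' + year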
"""Test topic matching and the like""" import os import unittest import lib import topics os.chdir(lib.get_project_dir()) topic_list = topics.get_topics() class TestMatching(unittest.TestCase): def test_matching(self): title_list = "the quick brown fox".split() for word in title_list: self.assertTrue(topics.matching(word, title_list)) for word in title_list: self.assertTrue(topics.matching(word[1:-1], title_list)) self.assertTrue(topics.matching("", [])) self.assertTrue(topics.matching("unicode", ["unicode?"])) if __name__ == "__main__": unittest.main()
def main():
    only_max_topics = False  # True: keep only the max topic(s); False: keep every topic above the threshold.
    skip_officials = False  # True to exclude official twitter accounts from the analysis.
    threshold_percent = 0.1
    input_csv = 'C:\\Users\\Eleanor\\Dropbox\\Honors Research\\Tweets\\TwitterAPIWrapper\\tweets_2018.csv'
    # input_csv = 'sample_tweets.csv'
    output_csv = '2018_categorized_tweets_all_2.csv'
    topics = t.get_topics()
    start = time.time()

    print('reading')
    tweets_df = pd.read_csv(input_csv)
    print('cleaning')
    clean_data(tweets_df)
    if skip_officials:
        tweets_df = tweets_df[tweets_df['twitter_type'] != 'official_twitter']
    tweets_df['category'] = ''
    tweets_df['category_word_matches'] = ''
    tweets_df['category_topic_percent'] = 0
    for topic in topics:
        tweets_df[topic] = 0

    print('processing')
    for index, row in tweets_df.iterrows():
        if index % 1000 == 0:
            print(index)
        counter = {topic: 0 for topic in topics.keys()}
        counter_words = {topic: [] for topic in topics.keys()}
        for topic in topics:
            for phrase in topics[topic]:
                if ' ' in phrase:
                    joined_clean_text = ' '.join(row['cleaned_text'])
                    if phrase in joined_clean_text:
                        # Multi-word phrases must also pass the stricter
                        # really_in check before they count as a match.
                        if not really_in(joined_clean_text, phrase):
                            continue
                        counter[topic] += 1
                        counter_words[topic].append(phrase)
                else:
                    if phrase in row['cleaned_text']:
                        counter[topic] += 1
                        counter_words[topic].append(phrase)
        if only_max_topics:
            max_topic = max(counter, key=lambda val: counter[val])
            max_topic_phrases = counter_words[max_topic]
            max_topic_percent = (counter[max_topic] / len(row['cleaned_text'])
                                 if len(row['cleaned_text']) != 0 else 0)
        else:
            max_topic = ''
            max_topic_phrases = []
            # Seed just above the threshold so every topic at or beyond it is kept.
            max_topic_percent = 1.0001 * threshold_percent
        if max_topic_percent <= threshold_percent:
            continue
        for topic in topics.keys():
            if topic == max_topic:
                continue
            topic_percent = (counter[topic] / len(row['cleaned_text'])
                             if len(row['cleaned_text']) != 0 else 0)
            if topic_percent >= max_topic_percent:
                max_topic += ',' + topic
                max_topic_phrases += counter_words[topic]
        if max_topic == '':
            continue
        elif max_topic[0] == ',':
            max_topic = max_topic[1:]
        tweets_df.loc[index, ['category']] = max_topic
        tweets_df.loc[index, ['category_word_matches']] = str(max_topic_phrases)
        tweets_df.loc[index, ['category_topic_percent']] = max_topic_percent
        for topic in max_topic.split(','):
            tweets_df.loc[index, [topic.strip()]] = 1

    print('writing')
    tweets_df.to_csv(output_csv)
    print(time.time() - start)
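# really_in is called above but not defined in this snippet. A hypothetical
# implementation, assuming its job is to require that a multi-word phrase
# occurs on word boundaries rather than inside longer tokens (an assumption
# about intent, not the repo's actual code):
import re

def really_in_sketch(text, phrase):
    # \b anchors keep 'gun control' from matching inside 'shogun controller'.
    return re.search(r'\b' + re.escape(phrase) + r'\b', text) is not None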
def group(categorized_tweets_csv_filename, candidates_csv_filename,
          output_filename):
    topics = get_topics()
    categorized_tweets_df = pd.read_csv(categorized_tweets_csv_filename)
    cols = ['candidate_id', 'tweet_favorites', 'tweet_retweets', 'category']
    new_tweets_df = pd.DataFrame(columns=cols)
    for index, row in categorized_tweets_df.iterrows():
        if index % 500 == 0:
            print(index)
        for topic in topics:
            if row[topic] == 1:
                new_row = {
                    'candidate_id': row['candidate_id'],
                    'tweet_favorites': row['tweet_favorites'],
                    'tweet_retweets': row['tweet_retweets'],
                    'category': topic
                }
                new_tweets_df = new_tweets_df.append(new_row,
                                                     ignore_index=True)
    new_tweets_df['tweet_favorites'] = new_tweets_df['tweet_favorites'].astype(int)
    new_tweets_df['tweet_retweets'] = new_tweets_df['tweet_retweets'].astype(int)
    new_tweets_df = new_tweets_df.groupby(
        ['candidate_id', 'category']).agg(['count', 'mean', 'sum'])
    new_tweets_df.columns = new_tweets_df.columns.map('{0[0]}_{0[1]}'.format)
    new_tweets_df = new_tweets_df.reset_index()
    new_tweets_df.drop(['tweet_retweets_count'], axis=1, inplace=True)
    new_tweets_df.columns = [
        'candidate_id', 'category', 'count', 'tweet_favorites_mean',
        'tweet_favorites_sum', 'tweet_retweets_mean', 'tweet_retweets_sum'
    ]

    cols = ['candidate_id']
    for topic in topics:
        cols.append(topic + '_count')
        cols.append(topic + '_average_favorites')
        cols.append(topic + '_sum_favorites')
        cols.append(topic + '_average_retweets')
        cols.append(topic + '_sum_retweets')
    candidate_category_info_df = pd.DataFrame(columns=cols)

    current_cand_id = -1
    current_row = {}
    for index, row in new_tweets_df.iterrows():
        if index % 500 == 0:
            print(index)
        if row['candidate_id'] != current_cand_id:
            # Zero-fill any topics the previous candidate never tweeted about,
            # then flush that candidate's row.
            for topic in topics:
                if topic + '_count' not in current_row:
                    current_row[topic + '_count'] = 0
                    current_row[topic + '_average_favorites'] = 0
                    current_row[topic + '_sum_favorites'] = 0
                    current_row[topic + '_average_retweets'] = 0
                    current_row[topic + '_sum_retweets'] = 0
            if current_cand_id != -1:
                candidate_category_info_df = candidate_category_info_df.append(
                    current_row, ignore_index=True)
            current_row = {'candidate_id': row['candidate_id']}
            current_cand_id = row['candidate_id']
        category = row['category']
        current_row[category + '_count'] = row['count']
        current_row[category + '_average_favorites'] = row['tweet_favorites_mean']
        current_row[category + '_sum_favorites'] = row['tweet_favorites_sum']
        current_row[category + '_average_retweets'] = row['tweet_retweets_mean']
        current_row[category + '_sum_retweets'] = row['tweet_retweets_sum']
    # Flush the final candidate's row, which the loop above only does on an
    # id change (the original dropped the last candidate).
    if current_cand_id != -1:
        for topic in topics:
            if topic + '_count' not in current_row:
                current_row[topic + '_count'] = 0
                current_row[topic + '_average_favorites'] = 0
                current_row[topic + '_sum_favorites'] = 0
                current_row[topic + '_average_retweets'] = 0
                current_row[topic + '_sum_retweets'] = 0
        candidate_category_info_df = candidate_category_info_df.append(
            current_row, ignore_index=True)

    engagement_df = CandidateGrouper.get_engagement_df(categorized_tweets_df)
    candidates_df = pd.read_csv(candidates_csv_filename)
    candidates_df = candidates_df[['candidate_id', 'party', 'gender']]
    candidate_category_info_df = candidate_category_info_df.merge(
        engagement_df, on='candidate_id')
    candidate_category_info_df = candidate_category_info_df.merge(
        candidates_df, on='candidate_id')

    # This part calculates masculine/feminine engagement.
    feminine_topics = [
        'womens_rights', 'education', 'healthcare', 'familycare'
    ]
    masculine_topics = [
        'military', 'economics', 'foreign_affairs', 'policing'
    ]
    candidate_category_info_df['masculine_count'] = sum(
        candidate_category_info_df['{0}_count'.format(topic)]
        for topic in masculine_topics)
    candidate_category_info_df['feminine_count'] = sum(
        candidate_category_info_df['{0}_count'.format(topic)]
        for topic in feminine_topics)
    candidate_category_info_df['masculine_total_engagement'] = sum(
        candidate_category_info_df['{0}_sum_favorites'.format(topic)]
        for topic in masculine_topics) + sum(
            candidate_category_info_df['{0}_sum_retweets'.format(topic)]
            for topic in masculine_topics)
    candidate_category_info_df['feminine_total_engagement'] = sum(
        candidate_category_info_df['{0}_sum_favorites'.format(topic)]
        for topic in feminine_topics) + sum(
            candidate_category_info_df['{0}_sum_retweets'.format(topic)]
            for topic in feminine_topics)
    candidate_category_info_df['masculine_average_engagement'] = (
        candidate_category_info_df['masculine_total_engagement'] /
        candidate_category_info_df['masculine_count'])
    candidate_category_info_df['feminine_average_engagement'] = (
        candidate_category_info_df['feminine_total_engagement'] /
        candidate_category_info_df['feminine_count'])
    candidate_category_info_df.to_csv(output_filename)
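# A hypothetical invocation of group(); the first filename matches the
# categorizer's output above, the other two are placeholders. If group lives on
# CandidateGrouper as a staticmethod, prefix the call accordingly:
group('2018_categorized_tweets_all_2.csv',
      'candidates.csv',
      'candidate_category_info.csv')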
def __init__(self, grouped_tweets_csv_filename):
    self.grouped_tweets_csv_filename = grouped_tweets_csv_filename
    self.topics = t.get_topics()
    self.topic_colors = t.get_colors('topics')
    self.last_ys = None
def process_text(data):
    tokens = topics.get_topics(data)
    return tokens
def test_topics_without_stop_words(self):
    article = u"banana banana banana"
    topics = get_topics(article)
    self.assertEqual(len(topics), 1)
    self.assertTrue("banana" in topics)
def test_multiple_topics_without_stop_words(self):
    article = u"banana banana banana orange orange"
    topics = get_topics(article)
    self.assertEqual(len(topics), 2)
    self.assertTrue("banana" in topics)
    self.assertTrue("orange" in topics)