def produce_plots(user_name: str, thresh, i, path=DEFAULT_PATH):
    """Clean *user_name*'s friend list (global + local pass), build a social
    graph from the survivors, cluster it, and log the iteration number."""
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    user_getter = dao_module.get_user_getter()

    seed_id = user_getter.get_user_by_screen_name(user_name).id

    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)

    # Global cleaning first (activity thresholds), then local cleaning
    # parameterized by the caller-supplied following threshold.
    global_clean = friends_cleaner.clean_friends_global(
        seed_id, init_user_friends,
        tweet_threshold=50, follower_threshold=50,
        friend_threshold=0, bot_threshold=0)
    clean_list, removed_list = friends_cleaner.clean_friends_local(
        seed_id, global_clean, local_following=thresh)
    clean_list = [str(uid) for uid in clean_list]

    init_user_dict = get_local_neighbourhood_user_dict(
        seed_id, clean_list, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(
        seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
        seed_id, local_neighbourhood, remove_unconnected_nodes=True)
    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})
    log.info(f"Iteration: {i}")
def download_tweets(user_name: str, path=DEFAULT_PATH):
    """Stream tweets for the seed user and their cleaned friends
    (global thresholds 50/50, local-following threshold 10)."""
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    user_getter = dao_module.get_user_getter()
    user_tweet_downloader = process_module.get_user_tweet_downloader()

    seed_id = user_getter.get_user_by_screen_name(user_name).id

    # Full user friend list
    all_friend_ids = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(all_friend_ids)
    global_clean = friends_cleaner.clean_friends_global(
        seed_id, all_friend_ids,
        tweet_threshold=50, follower_threshold=50, bot_threshold=0)
    clean_list10, removed_list10 = friends_cleaner.clean_friends_local(
        seed_id, global_clean, local_following=10)

    # Include the seed user themselves in the download list.
    clean_list10.append(seed_id)
    user_tweet_downloader.stream_tweets_by_user_list(clean_list10)
def produce_plots(seed_id: str, user_name: str, threshold: int, i, type, path=DEFAULT_PATH):
    """Hard-threshold clean the seed user's friends, cluster the resulting
    social graph, and write the clusters to file.

    type selects which hard thresholds apply:
      0 -> "default"       (both t1 and t2 = threshold)
      1 -> "follower_only"  (t1 = 0, t2 = threshold)
      2 -> "tweet_only"     (t1 = threshold, t2 = 0)

    Raises:
        ValueError: if *type* is not 0, 1 or 2.
    """
    threshold = int(threshold)
    if type == 0:
        type_str = "default"
        t1 = threshold
        t2 = threshold
    elif type == 1:
        type_str = "follower_only"
        t1 = 0
        t2 = threshold
    elif type == 2:
        type_str = "tweet_only"
        t1 = threshold
        t2 = 0
    else:
        # BUG FIX: previously an unknown type fell through silently and the
        # function later crashed with a NameError on t1/t2; fail fast instead.
        raise ValueError(
            "type must be 0 (default), 1 (follower_only) or 2 (tweet_only)")

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()

    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)
    clean_list = friends_cleaner.clean_friends_from_list_hard_thresh(
        seed_id, init_user_friends, t1, t2)
    clean_list = [str(id) for id in clean_list]

    init_user_dict = get_local_neighbourhood_user_dict(
        seed_id, clean_list, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(
        seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
        seed_id, local_neighbourhood, remove_unconnected_nodes=False)
    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})
    write_clusters_to_file(user_name, clusters, i, threshold, type_str)
def produce_plots(user_name: str, path=DEFAULT_PATH):
    """Plot the local follower distribution for *user_name*'s friends via
    DataCleaningDistributions (other plot types are left disabled below)."""
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    friends_getter = dao_module.get_user_friend_getter()
    user_getter = dao_module.get_user_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()

    log.info("Getting seed user id")
    seed_id = str(user_getter.get_user_by_screen_name(user_name).get_id())

    plotter = DataCleaningDistributions(friends_getter, user_getter, friends_cleaner)
    log.info("Starting to plot")
    # Alternative plots, intentionally disabled:
    #plotter.tweet_plot(seed_id)
    #plotter.follower_plot(seed_id)
    #plotter.follower_ratio_plot(seed_id)
    #plotter.local_friends_plot(seed_id)
    #plotter.local_friends_cutoff_plots(seed_id, 60)
    #plotter.local_friends_cutoff_plots(seed_id, 100)
    #plotter.local_friends_cutoff_plots(seed_id, 120)
    #plotter.local_friends_cutoff_plots(seed_id, 180)
    #plotter.global_attributes_of_deleted_users(seed_id, 40, 50)
    #plotter.local_friends_set_similarity(seed_id, 30)
    #for i in range(1, 4):
    #    plotter.local_friends_set_kept(seed_id, 10*i)
    plotter.local_follower_distribution(seed_id, 50, 15)
def check_following(user_name: str, path=DEFAULT_PATH):
    """For a few hand-picked news/ML accounts, log which members of the seed
    user's cleaned friend cluster those accounts follow."""
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    user_getter = dao_module.get_user_getter()
    ranking_getter = dao_module.get_ranking_getter()
    cleaned_friends_getter = dao_module.get_cleaned_user_friend_getter()

    seed_id = user_getter.get_user_by_screen_name(user_name).id
    cluster = cleaned_friends_getter.get_user_friends_ids(seed_id)
    cluster.append(seed_id)

    # Reference accounts to check against the cluster.
    news = ['nytimes', 'kylegriffin1', 'propublica', 'TheAtlantic',
            'brianstelter', 'NewYorker']
    ml = ['mer__edith', 'timnitGebru', 'merbroussard', 'rajiinio']

    for name in news:
        user_id = user_getter.get_user_by_screen_name(name).id
        friends = user_friend_getter.get_user_friends_ids(str(user_id))
        intersection = set(friends).intersection(cluster)
        log.info(name)
        log.info([user_getter.get_user_by_id(uid).screen_name
                  for uid in intersection])
def process_user_tweets(id: str, path=DEFAULT_PATH):
    """Run the tweet processor over all tweets of the given user id."""
    injector = Injector.get_injector_from_file(path)
    tweet_processor = injector.get_process_module().get_tweet_processor()
    tweet_processor.process_tweets_by_user_id(id)
def process_dates(path=DEFAULT_PATH):
    """Convert stored tweet date strings into proper date objects."""
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()
    user_tweet_getter = dao_module.get_user_tweet_getter()
    user_tweet_getter.convert_dates()
def produce_plots(seed_id: str, user_name: str, path=DEFAULT_PATH):
    """Cluster the seed user's cleaned friends and, for every cluster of at
    least 5 users, plot pairwise scatter plots of production / consumption /
    follower utility scores (linear and log-log) and write the raw scores
    to file."""
    threshold = 60  # percent threshold passed to the friends cleaner
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    cluster_word_frequency_processor = process_module.get_cluster_word_frequency_processor()
    tweet_processor = process_module.get_tweet_processor()
    production_ranker = process_module.get_ranker()
    consumption_ranker = process_module.get_ranker(type="Consumption")
    follower_ranker = process_module.get_ranker(type="Follower")

    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)
    clean_list = friends_cleaner.clean_friends_from_list(
        seed_id, init_user_friends, percent_threshold=threshold)
    clean_list = [str(id) for id in clean_list]

    init_user_dict = get_local_neighbourhood_user_dict(
        seed_id, clean_list, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(
        seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
        seed_id, local_neighbourhood)
    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})

    count = 1
    for cluster in clusters:
        # Clusters with fewer than 5 users are too small to plot meaningfully.
        if len(cluster.users) < 5:
            continue

        prod_ranking, prod_scores = production_ranker.rank(seed_id, cluster)
        cons_ranking, cons_scores = consumption_ranker.rank(seed_id, cluster)
        foll_ranking, foll_scores = follower_ranker.rank(seed_id, cluster)

        cluster_wf_vector = cluster_word_frequency_processor.process_cluster_word_frequency_vector(cluster.users)
        wf_dict = cluster_wf_vector.get_words_dict()
        sorted_words = list(sorted(wf_dict, key=wf_dict.get, reverse=True))
        # BUG FIX: the original called sorted_words.remove("rt") and
        # .remove("like") unconditionally, which raises ValueError whenever
        # the word is absent from this cluster's vocabulary.
        for stop_word in ("rt", "like"):
            if stop_word in sorted_words:
                sorted_words.remove(stop_word)
        # Slicing already clamps to the list length.
        top_words = sorted_words[:10]

        file_prefix = user_name + '_' + str(count)
        scatter_plot_from_scores(user_name, prod_scores, cons_scores, count,
                                 top_words, file_prefix + "prod_cons")
        scatter_plot_from_scores(user_name, prod_scores, cons_scores, count,
                                 top_words, file_prefix + "prod_cons",
                                 use_log_log_scale=True)
        scatter_plot_from_scores(user_name, prod_scores, foll_scores, count,
                                 top_words, file_prefix + "prod_foll",
                                 type1='Production Utility',
                                 type2='Follower Utility')
        scatter_plot_from_scores(user_name, prod_scores, foll_scores, count,
                                 top_words, file_prefix + "prod_foll",
                                 use_log_log_scale=True,
                                 type1='Production Utility',
                                 type2='Follower Utility')
        scatter_plot_from_scores(user_name, cons_scores, foll_scores, count,
                                 top_words, file_prefix + "cons_foll",
                                 type1='Consumption Utility',
                                 type2='Follower Utility')
        scatter_plot_from_scores(user_name, cons_scores, foll_scores, count,
                                 top_words, file_prefix + "cons_foll",
                                 use_log_log_scale=True,
                                 type1='Consumption Utility',
                                 type2='Follower Utility')
        write_scores_to_file({"production": prod_scores,
                              "consumption": cons_scores,
                              "follower": foll_scores}, user_name, count)
        count += 1
def get_cluster_word_frequency(ids: List[str], path=DEFAULT_PATH):
    """Compute absolute and relative word-frequency vectors for a cluster of user ids."""
    injector = Injector.get_injector_from_file(path)
    processor = injector.get_process_module().get_cluster_word_frequency_processor()
    processor.process_cluster_word_frequency_vector(ids)
    processor.process_relative_cluster_word_frequency(ids)
def get_user_word_frequency(id, path=DEFAULT_PATH):
    """Compute absolute and relative word-frequency vectors for a single user."""
    injector = Injector.get_injector_from_file(path)
    processor = injector.get_process_module().get_user_word_frequency_processor()
    processor.process_user_word_frequency_vector(id)
    processor.process_relative_user_word_frequency(id)
def download_user(name: str, path=DEFAULT_PATH):
    """Download a single user by screen name, logging start and finish."""
    injector = Injector.get_injector_from_file(path)
    user_downloader = injector.get_process_module().get_user_downloader()
    log.info("Starting Download user with name: %s" % (name))
    user_downloader.download_user_by_screen_name(name)
    log.info("Done downloading user: %s" % (name))
def download_user_friends(name: str, saturated=False, path=DEFAULT_PATH):
    """Download a user's friends: full user objects when *saturated*,
    otherwise just their ids."""
    injector = Injector.get_injector_from_file(path)
    downloader = injector.get_process_module().get_user_friend_downloader()
    if saturated:
        downloader.download_friends_users_by_screen_name(name)
    else:
        downloader.download_friends_ids_by_screen_name(name)
def detect_core(name: str, path=DEFAULT_PATH):
    """Run core detection for the given screen name.

    Any failure is logged and the process exits — this is a top-level CLI
    entry point, so the broad catch acts as the program boundary.
    """
    # BUG FIX: the parameter was annotated ``int`` but it is a screen name
    # passed to detect_core_by_screen_name (str everywhere else in this file).
    try:
        injector = Injector.get_injector_from_file(path)
        process_module = injector.get_process_module()
        core_detector = process_module.get_core_detector()
        core_detector.detect_core_by_screen_name(name)
    except Exception as e:
        log.exception(e)
        exit()
def process_local_neighbourhood_tweets(id: str, path=DEFAULT_PATH):
    """Fetch the stored local neighbourhood for *id* and process its tweets."""
    injector = Injector.get_injector_from_file(path)
    dao_module = injector.get_dao_module()
    process_module = injector.get_process_module()

    local_neighbourhood_getter = dao_module.get_local_neighbourhood_getter()
    tweet_processor = process_module.get_tweet_processor()

    neighbourhood = local_neighbourhood_getter.get_local_neighbourhood(id)
    tweet_processor.process_tweets_by_local_neighbourhood(neighbourhood)
def ranking(user_name: str, thresh, path=DEFAULT_PATH):
    """Set up rankers and DAO getters and resolve the seed user's id.

    NOTE(review): the resolved id and rankers are never used after this —
    the function appears to be an unfinished stub; behavior preserved as-is.
    """
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    local_followers_ranker = process_module.get_ranker("LocalFollowers")
    consumption_ranker = process_module.get_ranker("Consumption")
    production_ranker = process_module.get_ranker()
    ranking_getter = dao_module.get_ranking_getter()
    user_getter = dao_module.get_user_getter()

    user_id = user_getter.get_user_by_screen_name(user_name).id
def rank_cluster(seed_id: str, params=None, path=DEFAULT_PATH):
    """Run the Consumption ranker over every stored cluster for *seed_id*."""
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    cluster_getter = dao_module.get_cluster_getter()
    ranker = process_module.get_ranker('Consumption')

    clusters, _ = cluster_getter.get_clusters(seed_id)
    for cluster in clusters:
        ranker.rank(seed_id, cluster)
def download_user_friends_tweets(id: str, path=DEFAULT_PATH):
    """Stream tweets for the given user and all of their friends."""
    injector = Injector.get_injector_from_file(path)
    dao_module = injector.get_dao_module()
    process_module = injector.get_process_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    user_tweet_downloader = process_module.get_user_tweet_downloader()

    log.info("Getting user friends for " + str(id))
    # FIX: renamed local ``list`` -> ``user_ids`` (it shadowed the builtin).
    user_ids = [id] + user_friend_getter.get_user_friends_ids(id)
    log.info("Beginning to download tweets for user " + str(id))
    # user_tweet_downloader.download_user_tweets_by_user_list(user_ids)
    user_tweet_downloader.stream_tweets_by_user_list(user_ids)
def produce_plots(user_name: str, path=DEFAULT_PATH):
    """Plot the local-friends distribution for *user_name* via
    DataCleaningDistributions (other plots intentionally disabled)."""
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    friends_getter = dao_module.get_user_friend_getter()
    user_getter = dao_module.get_user_getter()

    log.info("Getting seed user id")
    seed_id = str(user_getter.get_user_by_screen_name(user_name).get_id())

    plotter = DataCleaningDistributions(friends_getter, user_getter)
    log.info("Starting to plot")
    # Disabled alternatives:
    #plotter.tweet_plot(seed_id)
    #plotter.follower_plot(seed_id)
    #plotter.follower_ratio_plot(seed_id)
    plotter.local_friends_plot(seed_id)
def ranking(user_name: str, path=DEFAULT_PATH):
    """Compare the top-20 consumption-ranked users (hard-coded snapshot)
    against the stored production ranking for the seed user's cluster."""
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    local_followers_ranker = process_module.get_ranker("LocalFollowers")
    consumption_ranker = process_module.get_ranker("Consumption")
    production_ranker = process_module.get_ranker()
    user_getter = dao_module.get_user_getter()
    ranking_getter = dao_module.get_ranking_getter()
    cluster_getter = dao_module.get_cluster_getter()

    seed_id = user_getter.get_user_by_screen_name(user_name).id
    clusters, _ = cluster_getter.get_clusters(seed_id, params={"graph_type": "union"})
    production_ranking = ranking_getter.get_ranking(seed_id)
    cluster = clusters[1].users

    log.info('Scoring Consumption...')
    # Live scoring disabled; a cached snapshot of the top-20 ids is used.
    #consumption = consumption_ranker.score_users(cluster)
    #ranked_consumption = list(sorted(consumption, key=consumption.get, reverse=True))[:20]
    ranked_consumption = ['109117316', '1203825050476072960', '359831209',
                          '1294363908694827010', '2492917412',
                          '1291153576455266304', '929791330519322624',
                          '2999992556', '254201259', '810582380', '66999485',
                          '918511183', '161455525', '432957426', '6466252',
                          '166479009', '748528569064710145', '382376904',
                          '24223629', '2311193425']

    log.info('Scoring Production...')
    ranked_production = production_ranking.get_top_20_user_ids()

    consumption_users = [user_getter.get_user_by_id(str(uid)).screen_name
                         for uid in ranked_consumption]
    production_users = [user_getter.get_user_by_id(str(uid)).screen_name
                        for uid in ranked_production]
    log.info(consumption_users)
    log.info(production_users)
    log.info(len(set(consumption_users).intersection(production_users)))
    log.info(jaccard_similarity(ranked_consumption, ranked_production))
def get_tweets(name: str, path=DEFAULT_PATH):
    """Inspect the oldest time-restricted tweet for *name*: log it, and if
    its created_at is still a raw Twitter date string, parse it into a
    datetime before dumping the tweet's attributes."""
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_getter = dao_module.get_user_getter()
    user_id = user_getter.get_user_by_screen_name(name).id

    twitter_getter = dao_module.get_twitter_getter()
    # tweets = twitter_getter.get_tweets_by_user_id(user_id)
    user_tweet_getter = dao_module.get_user_tweet_getter()
    tweets = sorted(
        user_tweet_getter.get_tweets_by_user_id_time_restricted(str(user_id)),
        key=lambda x: x.created_at)
    log.info(len(tweets))
    log.info(tweets[0].text)

    tweet = tweets[0]
    date = tweet.created_at
    log.info(date)
    # FIX: use isinstance instead of the exact-type comparison
    # ``type(date) != datetime`` (idiomatic, and tolerant of subclasses).
    if not isinstance(date, datetime):
        # Twitter's classic date format, e.g. "Wed Oct 10 20:19:24 +0000 2018"
        proper_date = datetime.strptime(date, '%a %b %d %H:%M:%S +0000 %Y')
        tweet.created_at = proper_date
    log.info(tweet.__dict__)
def produce_plots(user_name: str, thresh, i, path=DEFAULT_PATH):
    """Load a previously written clusters file, take the largest cluster
    (minus the seed), re-cluster its social graph, and write the result."""
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    user_getter = dao_module.get_user_getter()

    seed_id = user_getter.get_user_by_screen_name(user_name).id

    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)

    type = 'local_and_global'
    # NOTE: '_global_50' '/' is implicit string-literal concatenation.
    filename_start = ("./dc2_exp/" + str(type) + '/clusters_local_'
                      + str(thresh) + '_global_50' '/' + str(user_name)
                      + '_clusters_')
    filename = filename_start + str(i) + '.json'
    with open(filename, 'r') as file:
        user_lists = json.load(file)

    # Pick the largest cluster in the file.
    count = len(user_lists)
    max_cluster = user_lists[0]
    for j in range(1, count):
        if len(user_lists[j]) > len(max_cluster):
            max_cluster = user_lists[j]
    max_cluster.remove(str(seed_id))
    log.info("Num users in max cluster is " + str(len(max_cluster)))

    init_user_dict = get_local_neighbourhood_user_dict(
        seed_id, max_cluster, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(
        seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
        seed_id, local_neighbourhood, remove_unconnected_nodes=True)
    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})
    log.info("Iteration: " + str(i))
    write_clusters_to_file(user_name, clusters, i, thresh,
                           "local_and_global_of_cluster")
def ranking(user_name: str, thresh, path=DEFAULT_PATH):
    """Score every cluster in the iteration-0 clusters file with the
    consumption, production and local-followers rankers and write each
    ranking to file."""
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    local_followers_ranker = process_module.get_ranker("LocalFollowers")
    consumption_ranker = process_module.get_ranker("Consumption")
    production_ranker = process_module.get_ranker()

    type = 'local_and_global'
    # NOTE: '_global_50' '/' is implicit string-literal concatenation.
    filename = ("./dc2_exp/" + str(type) + '/clusters_local_' + str(thresh)
                + '_global_50' '/' + str(user_name) + '_clusters_0.json')
    with open(filename, 'r') as file:
        user_lists = json.load(file)

    count = len(user_lists)
    for i in range(count):
        # Going through each cluster
        cluster = user_lists[i]

        log.info('Scoring Consumption...')
        consumption = consumption_ranker.score_users(cluster)
        ranked_consumption = list(sorted(consumption, key=consumption.get,
                                         reverse=True))
        write_ranking_to_file(user_name, ranked_consumption, i+1, thresh,
                              'consumption')

        log.info('Scoring Production...')
        production = production_ranker.score_users(cluster)
        ranked_production = list(sorted(production, key=production.get,
                                        reverse=True))
        write_ranking_to_file(user_name, ranked_production, i+1, thresh,
                              'production')

        log.info('Scoring Local Followers...')
        local_followers = local_followers_ranker.score_users(cluster)
        log.info(local_followers)
        ranked_followers = list(sorted(local_followers,
                                       key=local_followers.get, reverse=True))
        write_ranking_to_file(user_name, ranked_followers, i+1, thresh,
                              'followers')
def produce_plots(user_name, path=DEFAULT_PATH):
    """Compare clusters across iterations against a fixed reference run.

    Cleans the seed user's friends at local thresholds 10 and 15, then for 20
    iteration files of the threshold-15 run matches each reference cluster to
    its most-similar cluster (Jaccard) and plots (a) how much of each removed
    set was actually cleaned out and (b) how much of each cluster survived.
    Reads JSON cluster files from ./dc2_exp/ and shows matplotlib figures.
    """
    #series = ['5', '10', '15']
    # series = ['0', '200', '400', '600', '800', '1000', '1200', '1400', '1600', '1800', '2000']
    labels = []
    series_means = {}
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()
    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    user_getter = dao_module.get_user_getter()
    seed_id = user_getter.get_user_by_screen_name(user_name).id
    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)
    global_clean = friends_cleaner.clean_friends_global(seed_id, init_user_friends, tweet_threshold=50, follower_threshold=50, bot_threshold=0)
    clean_list10, removed_list10 = friends_cleaner.clean_friends_local(
        seed_id, global_clean, local_following=10)
    clean_list15, removed_list15 = friends_cleaner.clean_friends_local(
        seed_id, global_clean, local_following=15)
    lst10 = [str(user) for user in clean_list10]
    lst15 = [str(user) for user in clean_list15]
    # Users kept at local threshold 10 but cleaned out at threshold 15.
    user_difference = list(set(lst10) - set(lst15))
    # fig = plt.figure()
    fig, axes = plt.subplots(1, 3)
    fig.suptitle(
        'Proportion of Users Removed from Cluster that are Actually Cleaned out for '
        + str(user_name)
        + " with Global Threshold 50 and Comparing Local Threshold 10 to 15")
    titles = ['Cluster 1', 'Cluster 2', 'Cluster 3']
    type = 'local_and_global'
    prefix = "./dc2_exp/"
    # Reference cluster files (NOTE(review): hard-coded to david_madras runs).
    d_repr = './dc2_exp/local_and_global/clusters_local_10.0_global_50/david_madras_clusters_0.json'
    d2_repr = './dc2_exp/default/clusters_80/david_madras_clusters_8.json'
    repr_lst = []
    with open(d_repr) as file:
        user_lists = json.load(file)
    #assert len(user_lists) == 3, "Nope!"
    # The three reference clusters to match against.
    repr1 = user_lists[0]
    repr2 = user_lists[1]
    repr3 = user_lists[2]
    repr3_seed_removed = repr3[:]
    repr3_seed_removed.remove(str(seed_id))
    title1 = titles[0]
    title2 = titles[1]
    title3 = titles[2]
    filename_list = glob.glob(prefix + str(type) + '/clusters_local_15.0_global_50' '/' + str(user_name) + '_clusters_*.json')
    counts1 = []
    counts2 = []
    counts3 = []
    delete_counts1 = []
    delete_counts2 = []
    delete_counts3 = []
    subset_counts1 = []
    subset_counts2 = []
    iterations = []
    ax1 = axes[0]
    ax2 = axes[1]
    ax3 = axes[2]
    j = 0
    # NOTE: '_global_50' '/' is implicit string-literal concatenation.
    filename_start = prefix + str(
        type) + '/clusters_local_15.0_global_50' '/' + str(
            user_name) + '_clusters_'
    for k in range(20):
        filename = filename_start + str(k) + '.json'
        with open(filename, 'r') as file:
            user_lists = json.load(file)
        count = len(user_lists)
        # Jaccard similarity of every cluster in this iteration against
        # each of the three reference clusters.
        sims1 = []
        sims2 = []
        sims3 = []
        for i in range(count):
            sims1.append(jaccard_similarity(user_lists[i], repr1))
            sims2.append(jaccard_similarity(user_lists[i], repr2))
            sims3.append(jaccard_similarity(user_lists[i], repr3))
        index1 = sims1.index(max(sims1))
        index2 = sims2.index(max(sims2))
        index3 = sims3.index(max(sims3))
        #if index1 == index2 and index2 == index3 and index3 == index1:
        # Skip iterations where the matching is ambiguous (two reference
        # clusters map to the same candidate) or degenerate (one cluster).
        if index1 == index2 or count == 1:
            log.info('does not work for ' + filename + ', ' + str(j))
        else:
            max_sim = [max(sims1), max(sims2), max(sims3)]
            max_sim.sort(reverse=True)
            cluster1 = user_lists[index1]
            cluster2 = user_lists[index2]
            cluster3 = user_lists[index3]
            d1 = jaccard_similarity(repr1, cluster1)
            # Of the users the cluster lost, how many were actually cleaned?
            difference1 = list(set(repr1) - set(cluster1))
            delete_counts1.append(overlap(difference1, user_difference))
            subset_counts1.append(overlap(cluster1, repr1))
            d2 = jaccard_similarity(repr2, cluster2)
            difference2 = list(set(repr2) - set(cluster2))
            delete_counts2.append(overlap(difference2, user_difference))
            subset_counts2.append(overlap(cluster2, repr2))
            d3 = jaccard_similarity(repr3, cluster3)
            # NOTE(review): cluster 3 uses the whole seed-removed reference,
            # not the removed-set difference like clusters 1 and 2 — confirm
            # this asymmetry is intentional.
            delete_counts3.append(
                overlap(repr3_seed_removed, user_difference))
            remaining3 = list(
                set(repr3_seed_removed) - set(user_difference))
            log.info(remaining3)
            counts1.append(d1)
            counts2.append(d2)
            counts3.append(d3)
            iterations.append(j)
            j += 1
    # First figure: cleaned-out overlap per iteration, one axis per cluster.
    ax1.bar(iterations, delete_counts1)
    ax2.bar(iterations, delete_counts2)
    ax3.bar(iterations, delete_counts3)
    # Add some text for labels, title and custom x-axis tick labels, etc.
    for ax in [ax1, ax2, ax3]:
        #for ax in [ax2, ax2]:
        ax.set_ylabel('Overlap Similarity')
        ax.set_xlabel('Iteration Number')
    ax1.set_title(title1, fontsize=10)
    ax2.set_title(title2, fontsize=10)
    ax3.set_title(title3, fontsize=10)
    plt.show()
    # Second figure: how much of each remaining cluster was in the previous one.
    fig, axes = plt.subplots(1, 2)
    fig.suptitle(
        'Proportion of Users in Remaining Cluster that were in Previous Cluster '
        + str(user_name)
        + " with Global Threshold 50 and Comparing Local Threshold 10 to 15")
    ax1 = axes[0]
    ax2 = axes[1]
    ax1.bar(iterations, subset_counts1)
    ax2.bar(iterations, subset_counts2)
    for ax in [ax1, ax2]:
        #for ax in [ax2, ax2]:
        ax.set_ylabel('Overlap Similarity')
        ax.set_xlabel('Iteration Number')
    ax1.set_title(title1, fontsize=10)
    ax2.set_title(title2, fontsize=10)
    # NOTE(review): ax3 belongs to the first (already shown) figure —
    # setting its title here looks like a copy-paste leftover.
    ax3.set_title(title3, fontsize=10)
    plt.show()
    plot_removed(user_difference, repr3, user_name)
def produce_plots(user_name: str, thresh, iteration, path=DEFAULT_PATH):
    """Iteratively refine the seed user's local neighbourhood, cluster it,
    and log production/consumption rankings for the largest clusters.

    For k = 1..6 the Jaccard-similarity cutoff is scaled by 0.1*k: each
    user's friend list is restricted to mutual follows, then trimmed to the
    friends whose friend-list similarity exceeds a fraction of the mean of
    the top-10 similarities. The refined neighbourhood is re-clustered and
    the top-5 clusters are ranked and logged.
    """
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()
    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    user_getter = dao_module.get_user_getter()
    user_tweet_getter = dao_module.get_user_tweet_getter()
    clean_user_friend_getter = dao_module.get_cleaned_user_friend_getter()
    local_neighbourhood_getter = dao_module.get_local_neighbourhood_getter()
    prod_ranker = process_module.get_ranker()
    con_ranker = process_module.get_ranker("Consumption")
    seed_id = user_getter.get_user_by_screen_name(user_name).id
    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)
    # Earlier cleaning pipeline, kept for reference:
    # user = user_getter.get_user_by_id(str(seed_id))
    # follower_thresh = 0.1 * user.followers_count
    # friend_thresh = 0.1 * user.friends_count
    # tweet_thresh = 0.1 * len(user_tweet_getter.get_tweets_by_user_id_time_restricted(str(seed_id)))
    # global_clean = friends_cleaner.clean_friends_global(seed_id,
    #     tweet_threshold=tweet_thresh, follower_threshold=follower_thresh, friend_threshold=friend_thresh)
    # clean_list, removed_list = friends_cleaner.clean_friends_local(seed_id, global_clean, local_following=thresh)
    # clean_list = [str(id) for id in clean_list]
    clean_list = clean_user_friend_getter.get_user_friends_ids(str(seed_id))
    # social_graph = social_graph_constructor.construct_social_graph(seed_id, is_union=False)
    # following_counts = {}
    # for user_id in clean_list:
    #     friends = user_friend_getter.get_user_friends_ids(str(user_id))
    #     following_counts[user_id] = len(set(friends).intersection(clean_list))
    # sorted_users = list(sorted(following_counts, key=following_counts.get, reverse=True))
    # print([following_counts[user] for user in sorted_users])
    local_neighbourhood = local_neighbourhood_getter.get_local_neighbourhood(seed_id)
    # Refined Friends Method
    for k in range(1, 7):
        log.info("Refining Friends List:")
        user_list = local_neighbourhood.get_user_id_list()
        friends_map = {}
        # Debug probe for one specific account id.
        print('1012256833816363008' in user_list)
        for user in user_list:
            friends_list = []
            friends = local_neighbourhood.get_user_friends(user)
            # print(len(friends))
            for friend in friends:
                # Keep only mutual follows within the neighbourhood.
                if user in local_neighbourhood.get_user_friends(str(friend)):
                    friends_list.append(str(friend))
                # For the seed user, also accept follows recorded in the
                # full friend table (NOTE(review): may append duplicates).
                if user == str(seed_id):
                    if int(user) in user_friend_getter.get_user_friends_ids(str(friend)):
                        friends_list.append(str(friend))
            # print(len(friends_list))
            friends_map[str(user)] = friends_list
            # Debug probe for one specific account id.
            if user == "254201259":
                print(len(friends_list))
        log.info("Refining by Jaccard Similarity:")
        for user in [str(id) for id in user_list]:
            friends_list = friends_map[user]
            similarities = {}
            for friend in friends_list:
                sim = jaccard_similarity(friends_list, friends_map[str(friend)])
                similarities[friend] = sim
            sorted_users = sorted(similarities, key=similarities.get, reverse=True)
            top_sum = 0
            for top_user in sorted_users[:10]:
                top_sum += similarities[top_user]
            # Cutoff scales with k: fraction of the mean top-10 similarity.
            # NOTE(review): this overwrites the ``thresh`` parameter.
            if len(sorted_users) >= 10:
                thresh = 0.1 * k * (top_sum / 10)
            elif len(sorted_users) == 0:
                thresh = 0
            else:
                thresh = 0.1 * k * (top_sum / len(sorted_users))
            # Can do more efficiently using binary search
            index = len(sorted_users)
            for i in range(len(sorted_users)):
                # NOTE(review): this rebinds the outer loop variable ``user``;
                # the friends_map assignment below then keys on the last
                # examined candidate rather than the user being refined —
                # looks like a bug, confirm intent before relying on this.
                user = sorted_users[i]
                if similarities[user] < thresh:
                    index = i
                    break
            friends_map[user] = sorted_users[:index]
        log.info("Thresh: " + str(0.1*k))
        log.info("Setting Local Neighborhood:")
        refined_local_neighborhood = LocalNeighbourhood(str(seed_id), None, friends_map)
        social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
            seed_id, refined_local_neighborhood, is_union=False)
        log.info("Clustering:")
        clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, None)
        # log.info("Iteration: " + str(iteration))
        log.info(len(clusters))
        # Rank and log the five largest clusters for this refinement level.
        cluster_sizes = {}
        for i in range(len(clusters)):
            cluster_sizes[i] = len(clusters[i].users)
        sorted_indices = sorted(cluster_sizes, key=cluster_sizes.get, reverse=True)
        for index in sorted_indices[:5]:
            cluster = clusters[index]
            prod_ranking, prod = prod_ranker.rank(str(seed_id), cluster)
            con_ranking, con = con_ranker.rank(str(seed_id), cluster)
            ranked_prod = prod_ranking.get_all_ranked_user_ids()
            ranked_con = con_ranking.get_all_ranked_user_ids()
            log.info("Cluster Size: " + str(len(cluster.users)))
            log.info("Ranked by Production: ")
            log.info([user_getter.get_user_by_id(str(id)).screen_name for id in ranked_prod])
            log.info("Ranked by Consumption: ")
            log.info([user_getter.get_user_by_id(str(id)).screen_name for id in ranked_con])
def ranking_distribution(user_name: str, thresh, path=DEFAULT_PATH):
    """Plot the score distributions (consumption, production, local
    followers) for every cluster in the iteration-0 clusters file at the
    given local threshold, then compare the top users across rankers.

    Reads ./dc2_exp/local_and_global/clusters_local_<thresh>_global_50/
    <user_name>_clusters_0.json and shows one bar chart per ranker per
    cluster via matplotlib.
    """
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()
    production_ranker = process_module.get_ranker()
    consumption_ranker = process_module.get_ranker("Consumption")
    local_followers_ranker = process_module.get_ranker("LocalFollowers")
    relative_production_ranker = process_module.get_ranker(
        "RelativeProduction")
    user_getter = dao_module.get_user_getter()
    friends_getter = dao_module.get_user_friend_getter()
    tweet_getter = dao_module.get_user_tweet_getter()
    clusterer = process_module.get_clusterer()
    seed_id = user_getter.get_user_by_screen_name(user_name).id
    type = 'local_and_global'
    # NOTE: '_global_50' '/' is implicit string-literal concatenation.
    filename = "./dc2_exp/" + str(type) + '/clusters_local_' + str(
        thresh) + '_global_50' '/' + str(user_name) + '_clusters_0.json'
    with open(filename, 'r') as file:
        user_lists = json.load(file)
    count = len(user_lists)
    cluster1 = user_lists[0]
    # Alternative diagnostics, intentionally disabled:
    #similarity_retweets_matrix(user_name, thresh, 1, tweet_getter, cluster1)
    #similarity_matrix(user_name, thresh, 1, 'p', 'p', friends_getter, cluster1)
    #similarity_graph(user_name, seed_id, thresh, 1, friends_getter, cluster1, clusterer)
    for i in range(count):
        cluster = user_lists[i]
        log.info(len(cluster))
        log.info('Scoring Consumption...')
        consumption = consumption_ranker.score_users(cluster)
        ranked_consumption = list(
            sorted(consumption, key=consumption.get, reverse=True))
        # Scores in ranked order, for bar-chart heights.
        consumptions = [consumption[user] for user in ranked_consumption]
        log.info('Scoring Production...')
        production = production_ranker.score_users(cluster)
        ranked_production = list(
            sorted(production, key=production.get, reverse=True))
        productions = [production[user] for user in ranked_production]
        log.info('Scoring Local Followers...')
        local_followers = local_followers_ranker.score_users(cluster)
        log.info(local_followers)
        ranked_followers = list(
            sorted(local_followers, key=local_followers.get, reverse=True))
        followers = [local_followers[user] for user in ranked_followers]
        log.info('Scoring Relative Production...')
        relative_production = relative_production_ranker.score_users(cluster)
        ranked_relative_production = list(
            sorted(relative_production, key=relative_production.get,
                   reverse=True))
        titles = [
            'Distribution of Consumption at Local Threshold ' + str(thresh) +
            ' for Cluster ' + str(i + 1),
            'Distribution of Production at Local Threshold ' + str(thresh) +
            ' for Cluster ' + str(i + 1),
            'Distribution of Local Followers at Local Threshold ' +
            str(thresh) + ' for Cluster ' + str(i + 1)
        ]
        # One bar chart per ranker, shown sequentially.
        title = titles[0]
        plt.bar(ranked_consumption, consumptions)
        plt.ylabel('Consumption Utility')
        plt.xlabel('Users in Cluster')
        plt.title(title)
        plt.show()
        title = titles[1]
        plt.bar(ranked_production, productions)
        plt.ylabel('Production Utility')
        plt.xlabel('Users in Cluster')
        plt.title(title)
        plt.show()
        title = titles[2]
        plt.bar(ranked_followers, followers)
        plt.ylabel('Local Followers')
        plt.xlabel('Users in Cluster')
        plt.title(title)
        plt.show()
        compare_top_users(ranked_consumption, ranked_production,
                          ranked_followers, ranked_relative_production,
                          i + 1, thresh, user_getter)