Example #1
def produce_plots(user_name: str, thresh, i, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    user_getter = dao_module.get_user_getter()

    seed_id = user_getter.get_user_by_screen_name(user_name).id
    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)
    global_clean = friends_cleaner.clean_friends_global(seed_id,
                                                        init_user_friends,
                                                        tweet_threshold=50,
                                                        follower_threshold=50,
                                                        friend_threshold=0,
                                                        bot_threshold=0)
    clean_list, removed_list = friends_cleaner.clean_friends_local(
        seed_id, global_clean, local_following=thresh)
    clean_list = [str(id) for id in clean_list]

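    # Build the local neighbourhood from the cleaned friend list, then construct and cluster its social graph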
    init_user_dict = get_local_neighbourhood_user_dict(seed_id, clean_list,
                                                       user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(seed_id=seed_id,
                                             params=None,
                                             users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(
        seed_id, local_neighbourhood, remove_unconnected_nodes=True)
    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})
    log.info("Iteration: " + str(i))
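Example #2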
def download_tweets(user_name: str, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    user_getter = dao_module.get_user_getter()
    user_tweet_downloader = process_module.get_user_tweet_downloader()

    seed_id = user_getter.get_user_by_screen_name(user_name).id
    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)
    global_clean = friends_cleaner.clean_friends_global(seed_id,
                                                        init_user_friends,
                                                        tweet_threshold=50,
                                                        follower_threshold=50,
                                                        bot_threshold=0)
    clean_list10, removed_list10 = friends_cleaner.clean_friends_local(
        seed_id, global_clean, local_following=10)
    clean_list10.append(seed_id)

    user_tweet_downloader.stream_tweets_by_user_list(clean_list10)
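Example #3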
def produce_plots(seed_id: str, user_name: str, threshold: int, i, type, path=DEFAULT_PATH):
    threshold = int(threshold)
    if type == 0:
        type_str = "default"
        t1 = threshold
        t2 = threshold
    elif type == 1:
        type_str = "follower_only"
        t1 = 0
        t2 = threshold
    elif type == 2:
        type_str = "tweet_only"
        t1 = threshold
        t2 = 0
    else:
        # any other value would leave type_str/t1/t2 undefined further down
        raise ValueError("type must be 0 (default), 1 (follower_only) or 2 (tweet_only)")

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()

    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)
    clean_list = friends_cleaner.clean_friends_from_list_hard_thresh(seed_id, init_user_friends, t1, t2)
    clean_list = [str(id) for id in clean_list]
    init_user_dict = get_local_neighbourhood_user_dict(seed_id, clean_list, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(seed_id, local_neighbourhood, remove_unconnected_nodes=False)

    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})
    write_clusters_to_file(user_name, clusters, i, threshold, type_str)
Example #4
def produce_plots(user_name: str, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    friends_getter = dao_module.get_user_friend_getter()
    user_getter = dao_module.get_user_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()

    log.info("Getting seed user id")
    seed_id = str(user_getter.get_user_by_screen_name(user_name).get_id())

    plotter = DataCleaningDistributions(friends_getter, user_getter,
                                        friends_cleaner)

    log.info("Starting to plot")
    #plotter.tweet_plot(seed_id)
    #plotter.follower_plot(seed_id)
    #plotter.follower_ratio_plot(seed_id)
    #plotter.local_friends_plot(seed_id)

    #plotter.local_friends_cutoff_plots(seed_id, 60)
    #plotter.local_friends_cutoff_plots(seed_id, 100)
    #plotter.local_friends_cutoff_plots(seed_id, 120)
    #plotter.local_friends_cutoff_plots(seed_id, 180)

    #plotter.global_attributes_of_deleted_users(seed_id, 40, 50)

    #plotter.local_friends_set_similarity(seed_id, 30)
    #for i in range(1, 4):
    #plotter.local_friends_set_kept(seed_id, 10*i)

    plotter.local_follower_distribution(seed_id, 50, 15)
Example #5
def check_following(user_name: str, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    user_getter = dao_module.get_user_getter()
    ranking_getter = dao_module.get_ranking_getter()
    cleaned_friends_getter = dao_module.get_cleaned_user_friend_getter()

    seed_id = user_getter.get_user_by_screen_name(user_name).id

    cluster = cleaned_friends_getter.get_user_friends_ids(seed_id)
    cluster.append(seed_id)
    news = ['nytimes', 'kylegriffin1', 'propublica', 'TheAtlantic', 'brianstelter',
            'NewYorker']

    ml = ['mer__edith', 'timnitGebru', 'merbroussard', 'rajiinio']

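    # For each news account, log which of its friends also appear in the seed user's cleaned cluster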
    for name in news:
        user_id = user_getter.get_user_by_screen_name(name).id
        friends = user_friend_getter.get_user_friends_ids(str(user_id))
        intersection = set(friends).intersection(cluster)
        log.info(name)
        log.info([user_getter.get_user_by_id(id).screen_name for id in intersection])
Example #6
def process_user_tweets(id: str, path=DEFAULT_PATH):
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()

    tweet_processor = process_module.get_tweet_processor()

    tweet_processor.process_tweets_by_user_id(id)
Example #7
def process_dates(path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()
    user_tweet_getter = dao_module.get_user_tweet_getter()
    user_tweet_getter.convert_dates()
Example #8
def produce_plots(seed_id: str, user_name: str, path=DEFAULT_PATH):
    threshold = 60

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    cluster_word_frequency_processor = process_module.get_cluster_word_frequency_processor()

    tweet_processor = process_module.get_tweet_processor()

    production_ranker = process_module.get_ranker()
    consumption_ranker = process_module.get_ranker(type="Consumption")
    follower_ranker = process_module.get_ranker(type="Follower")

    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)
    clean_list = friends_cleaner.clean_friends_from_list(seed_id, init_user_friends, percent_threshold=threshold)
    clean_list = [str(id) for id in clean_list]
    init_user_dict = get_local_neighbourhood_user_dict(seed_id, clean_list, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(seed_id, local_neighbourhood)
    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})

    count = 1
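    # Rank each sufficiently large cluster and compare production, consumption and follower utilities in scatter plots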
    for cluster in clusters:
        if len(cluster.users) < 5:
            continue

        prod_ranking, prod_scores = production_ranker.rank(seed_id, cluster)
        cons_ranking, cons_scores = consumption_ranker.rank(seed_id, cluster)
        foll_ranking, foll_scores = follower_ranker.rank(seed_id, cluster)

        cluster_wf_vector = cluster_word_frequency_processor.process_cluster_word_frequency_vector(cluster.users)

        wf_dict = cluster_wf_vector.get_words_dict()
        sorted_words = list(sorted(wf_dict, key=wf_dict.get, reverse=True))
        # "rt" and "like" dominate most word-frequency vectors; drop them if present
        if "rt" in sorted_words:
            sorted_words.remove("rt")
        if "like" in sorted_words:
            sorted_words.remove("like")
        top_words = sorted_words[:10]

        file_prefix = user_name + '_' + str(count)

        scatter_plot_from_scores(user_name, prod_scores, cons_scores, count, top_words, file_prefix + "prod_cons")
        scatter_plot_from_scores(user_name, prod_scores, cons_scores, count, top_words, file_prefix + "prod_cons", use_log_log_scale=True)

        scatter_plot_from_scores(user_name, prod_scores, foll_scores, count, top_words, file_prefix + "prod_foll", type1='Production Utility', type2='Follower Utility')
        scatter_plot_from_scores(user_name, prod_scores, foll_scores, count, top_words, file_prefix + "prod_foll", use_log_log_scale=True, type1='Production Utility', type2='Follower Utility')

        scatter_plot_from_scores(user_name, cons_scores, foll_scores, count, top_words, file_prefix + "cons_foll", type1='Consumption Utility', type2='Follower Utility')
        scatter_plot_from_scores(user_name, cons_scores, foll_scores, count, top_words, file_prefix + "cons_foll", use_log_log_scale=True, type1='Consumption Utility', type2='Follower Utility')

        write_scores_to_file({"production": prod_scores, "consumption": cons_scores, "follower": foll_scores}, user_name, count)
        count += 1
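Example #9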
def get_cluster_word_frequency(ids: List[str], path=DEFAULT_PATH):
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()

    cluster_word_frequency_processor = process_module.get_cluster_word_frequency_processor()

    cluster_word_frequency_processor.process_cluster_word_frequency_vector(ids)
    cluster_word_frequency_processor.process_relative_cluster_word_frequency(ids)
Example #10
def get_user_word_frequency(id, path=DEFAULT_PATH):
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()

    user_word_frequency_processor = process_module.get_user_word_frequency_processor()

    user_word_frequency_processor.process_user_word_frequency_vector(id)
    user_word_frequency_processor.process_relative_user_word_frequency(id)
Example #11
def download_user(name: str, path=DEFAULT_PATH):
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()

    user_downloader = process_module.get_user_downloader()

    log.info("Starting Download user with name: %s" % (name))
    user_downloader.download_user_by_screen_name(name)
    log.info("Done downloading user: %s" % (name))
Example #12
def download_user_friends(name: str, saturated=False, path=DEFAULT_PATH):
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()

    user_friend_downloader = process_module.get_user_friend_downloader()

    if saturated:
        user_friend_downloader.download_friends_users_by_screen_name(name)
    else:
        user_friend_downloader.download_friends_ids_by_screen_name(name)
Example #13
def detect_core(name: str, path=DEFAULT_PATH):
    try:
        injector = Injector.get_injector_from_file(path)
        process_module = injector.get_process_module()

        core_detector = process_module.get_core_detector()
        core_detector.detect_core_by_screen_name(name)
    except Exception as e:
        log.exception(e)
        exit()
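Example #14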
def process_local_neighbourhood_tweets(id: str, path=DEFAULT_PATH):
    injector = Injector.get_injector_from_file(path)

    dao_module = injector.get_dao_module()
    process_module = injector.get_process_module()

    local_neighbourhood_getter = dao_module.get_local_neighbourhood_getter()
    tweet_processor = process_module.get_tweet_processor()

    local_neighbourhood = local_neighbourhood_getter.get_local_neighbourhood(id)
    tweet_processor.process_tweets_by_local_neighbourhood(local_neighbourhood)
Example #15
def ranking(user_name: str, thresh, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    local_followers_ranker = process_module.get_ranker("LocalFollowers")
    consumption_ranker = process_module.get_ranker("Consumption")
    production_ranker = process_module.get_ranker()
    ranking_getter = dao_module.get_ranking_getter()
    user_getter = dao_module.get_user_getter()
    user_id = user_getter.get_user_by_screen_name(user_name).id
Example #16
def rank_cluster(seed_id: str, params=None, path=DEFAULT_PATH):
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    cluster_getter = dao_module.get_cluster_getter()
    ranker = process_module.get_ranker('Consumption')

    clusters, _ = cluster_getter.get_clusters(seed_id)

    for cluster in clusters:
        ranker.rank(seed_id, cluster)
Example #17
def download_user_friends_tweets(id: str, path=DEFAULT_PATH):
    injector = Injector.get_injector_from_file(path)

    dao_module = injector.get_dao_module()
    process_module = injector.get_process_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    user_tweet_downloader = process_module.get_user_tweet_downloader()

    log.info("Getting user friends for " + str(id))
    user_ids = [id] + user_friend_getter.get_user_friends_ids(id)

    log.info("Beginning to download tweets for user " + str(id))
    # user_tweet_downloader.download_user_tweets_by_user_list(user_ids)
    user_tweet_downloader.stream_tweets_by_user_list(user_ids)
Example #18
def produce_plots(user_name: str, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    friends_getter = dao_module.get_user_friend_getter()
    user_getter = dao_module.get_user_getter()

    log.info("Getting seed user id")
    seed_id = str(user_getter.get_user_by_screen_name(user_name).get_id())

    plotter = DataCleaningDistributions(friends_getter, user_getter)

    log.info("Starting to plot")
    #plotter.tweet_plot(seed_id)
    #plotter.follower_plot(seed_id)
    #plotter.follower_ratio_plot(seed_id)
    plotter.local_friends_plot(seed_id)
Example #19
def ranking(user_name: str, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    local_followers_ranker = process_module.get_ranker("LocalFollowers")
    consumption_ranker = process_module.get_ranker("Consumption")
    production_ranker = process_module.get_ranker()

    user_getter = dao_module.get_user_getter()
    ranking_getter = dao_module.get_ranking_getter()

    cluster_getter = dao_module.get_cluster_getter()
    seed_id = user_getter.get_user_by_screen_name(user_name).id

    clusters, _ = cluster_getter.get_clusters(seed_id, params={"graph_type": "union"})

    production_ranking = ranking_getter.get_ranking(seed_id)
    cluster = clusters[1].users

    log.info('Scoring Consumption...')
    #consumption = consumption_ranker.score_users(cluster)
    #ranked_consumption = list(sorted(consumption, key=consumption.get, reverse=True))[:20]

    ranked_consumption = ['109117316', '1203825050476072960', '359831209', '1294363908694827010', '2492917412',
     '1291153576455266304', '929791330519322624', '2999992556', '254201259', '810582380', '66999485', '918511183', '161455525',
     '432957426', '6466252', '166479009', '748528569064710145', '382376904', '24223629', '2311193425']

    log.info('Scoring Production...')

    ranked_production = production_ranking.get_top_20_user_ids()
    consumption_users = [user_getter.get_user_by_id(str(id)).screen_name for id in ranked_consumption]
    production_users = [user_getter.get_user_by_id(str(id)).screen_name for id in ranked_production]
    log.info(consumption_users)
    log.info(production_users)
    log.info(len(set(consumption_users).intersection(production_users)))
    log.info(jaccard_similarity(ranked_consumption, ranked_production))
Example #20
def get_tweets(name: str, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_getter = dao_module.get_user_getter()
    user_id = user_getter.get_user_by_screen_name(name).id
    twitter_getter = dao_module.get_twitter_getter()
    # tweets = twitter_getter.get_tweets_by_user_id(user_id)
    user_tweet_getter = dao_module.get_user_tweet_getter()
    tweets = sorted(user_tweet_getter.get_tweets_by_user_id_time_restricted(str(user_id)),
                    key=lambda x: x.created_at)
    log.info(len(tweets))
    log.info(tweets[0].text)
    tweet = tweets[0]
    date = tweet.created_at
    log.info(date)

    if not isinstance(date, datetime):
        proper_date = datetime.strptime(date, '%a %b %d %H:%M:%S +0000 %Y')
        tweet.created_at = proper_date
    log.info(tweet.__dict__)
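Example #21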
def produce_plots(user_name: str, thresh, i, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    user_getter = dao_module.get_user_getter()

    seed_id = user_getter.get_user_by_screen_name(user_name).id
    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)

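    # Load the clusters written by a previous run, keep the largest one, and re-cluster its local neighbourhood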
    type = 'local_and_global'
    filename_start = "./dc2_exp/" + str(type) + '/clusters_local_' + str(thresh) + '_global_50' '/' + str(user_name) + '_clusters_'
    filename = filename_start + str(i) + '.json'
    with open(filename, 'r') as file:
        user_lists = json.load(file)
        count = len(user_lists)
    max_cluster = user_lists[0]
    for j in range(1, count):
        if len(user_lists[j]) > len(max_cluster):
            max_cluster = user_lists[j]
    max_cluster.remove(str(seed_id))
    log.info("Num users in max cluster is " + str(len(max_cluster)))
    init_user_dict = get_local_neighbourhood_user_dict(seed_id, max_cluster, user_friend_getter)
    local_neighbourhood = LocalNeighbourhood(seed_id=seed_id, params=None, users=init_user_dict)
    social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(seed_id, local_neighbourhood, remove_unconnected_nodes=True)
    clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, {})

    log.info("Iteration: " + str(i))
    write_clusters_to_file(user_name, clusters, i, thresh, "local_and_global_of_cluster")
Example #22
def ranking(user_name: str, thresh, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    local_followers_ranker = process_module.get_ranker("LocalFollowers")
    consumption_ranker = process_module.get_ranker("Consumption")
    production_ranker = process_module.get_ranker()

    type = 'local_and_global'
    filename = "./dc2_exp/" + str(type) + '/clusters_local_' + str(thresh) + '_global_50' '/' + str(user_name) + '_clusters_0.json'
    with open(filename, 'r') as file:
        user_lists = json.load(file)
        count = len(user_lists)

    for i in range(count): # Going through each cluster
        cluster = user_lists[i]
        log.info('Scoring Consumption...')
        consumption = consumption_ranker.score_users(cluster)
        ranked_consumption = list(sorted(consumption, key=consumption.get, reverse=True))

        write_ranking_to_file(user_name, ranked_consumption, i+1, thresh, 'consumption')

        log.info('Scoring Production...')
        production = production_ranker.score_users(cluster)
        ranked_production = list(sorted(production, key=production.get, reverse=True))

        write_ranking_to_file(user_name, ranked_production, i+1, thresh, 'production')

        log.info('Scoring Local Followers...')
        local_followers = local_followers_ranker.score_users(cluster)
        log.info(local_followers)
        ranked_followers = list(sorted(local_followers, key=local_followers.get, reverse=True))

        write_ranking_to_file(user_name, ranked_followers, i+1, thresh, 'followers')
Example #23
def produce_plots(user_name, path=DEFAULT_PATH):
    #series = ['5', '10', '15']
    # series = ['0', '200', '400', '600', '800', '1000', '1200', '1400', '1600', '1800', '2000']
    labels = []
    series_means = {}
    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    user_getter = dao_module.get_user_getter()

    seed_id = user_getter.get_user_by_screen_name(user_name).id
    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)
    global_clean = friends_cleaner.clean_friends_global(seed_id,
                                                        init_user_friends,
                                                        tweet_threshold=50,
                                                        follower_threshold=50,
                                                        bot_threshold=0)
    clean_list10, removed_list10 = friends_cleaner.clean_friends_local(
        seed_id, global_clean, local_following=10)
    clean_list15, removed_list15 = friends_cleaner.clean_friends_local(
        seed_id, global_clean, local_following=15)

    lst10 = [str(user) for user in clean_list10]
    lst15 = [str(user) for user in clean_list15]
    user_difference = list(set(lst10) - set(lst15))

    # fig = plt.figure()

    fig, axes = plt.subplots(1, 3)
    fig.suptitle(
        'Proportion of Users Removed from Cluster that are Actually Cleaned out for '
        + str(user_name) +
        " with Global Threshold 50 and Comparing Local Threshold 10 to 15")

    titles = ['Cluster 1', 'Cluster 2', 'Cluster 3']
    type = 'local_and_global'
    prefix = "./dc2_exp/"

    d_repr = './dc2_exp/local_and_global/clusters_local_10.0_global_50/david_madras_clusters_0.json'
    d2_repr = './dc2_exp/default/clusters_80/david_madras_clusters_8.json'

    repr_lst = []
    with open(d_repr) as file:
        user_lists = json.load(file)
        #assert len(user_lists) == 3, "Nope!"

        repr1 = user_lists[0]
        repr2 = user_lists[1]
        repr3 = user_lists[2]

    repr3_seed_removed = repr3[:]
    repr3_seed_removed.remove(str(seed_id))

    title1 = titles[0]
    title2 = titles[1]
    title3 = titles[2]

    filename_list = glob.glob(prefix + str(type) +
                              '/clusters_local_15.0_global_50'
                              '/' + str(user_name) + '_clusters_*.json')
    counts1 = []
    counts2 = []
    counts3 = []

    delete_counts1 = []
    delete_counts2 = []
    delete_counts3 = []

    subset_counts1 = []
    subset_counts2 = []

    iterations = []

    ax1 = axes[0]
    ax2 = axes[1]
    ax3 = axes[2]
    j = 0
    filename_start = prefix + str(
        type) + '/clusters_local_15.0_global_50' '/' + str(
            user_name) + '_clusters_'
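    # Match each iteration's clusters against the three reference clusters by Jaccard similarity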
    for k in range(20):
        filename = filename_start + str(k) + '.json'
        with open(filename, 'r') as file:
            user_lists = json.load(file)
            count = len(user_lists)
            sims1 = []
            sims2 = []
            sims3 = []
            for i in range(count):
                sims1.append(jaccard_similarity(user_lists[i], repr1))
                sims2.append(jaccard_similarity(user_lists[i], repr2))
                sims3.append(jaccard_similarity(user_lists[i], repr3))

            index1 = sims1.index(max(sims1))
            index2 = sims2.index(max(sims2))
            index3 = sims3.index(max(sims3))

            #if index1 == index2 and index2 == index3 and index3 == index1:
            if index1 == index2 or count == 1:
                log.info('does not work for ' + filename + ', ' + str(j))
            else:
                max_sim = [max(sims1), max(sims2), max(sims3)]
                max_sim.sort(reverse=True)

                cluster1 = user_lists[index1]
                cluster2 = user_lists[index2]
                cluster3 = user_lists[index3]

                d1 = jaccard_similarity(repr1, cluster1)
                difference1 = list(set(repr1) - set(cluster1))
                delete_counts1.append(overlap(difference1, user_difference))
                subset_counts1.append(overlap(cluster1, repr1))

                d2 = jaccard_similarity(repr2, cluster2)
                difference2 = list(set(repr2) - set(cluster2))
                delete_counts2.append(overlap(difference2, user_difference))
                subset_counts2.append(overlap(cluster2, repr2))

                d3 = jaccard_similarity(repr3, cluster3)
                delete_counts3.append(
                    overlap(repr3_seed_removed, user_difference))

                remaining3 = list(
                    set(repr3_seed_removed) - set(user_difference))
                log.info(remaining3)

                counts1.append(d1)
                counts2.append(d2)
                counts3.append(d3)
                iterations.append(j)
        j += 1

    ax1.bar(iterations, delete_counts1)
    ax2.bar(iterations, delete_counts2)
    ax3.bar(iterations, delete_counts3)

    # Add some text for labels, title and custom x-axis tick labels, etc.

    for ax in [ax1, ax2, ax3]:
        #for ax in [ax2, ax2]:
        ax.set_ylabel('Overlap Similarity')
        ax.set_xlabel('Iteration Number')

    ax1.set_title(title1, fontsize=10)
    ax2.set_title(title2, fontsize=10)
    ax3.set_title(title3, fontsize=10)

    plt.show()

    fig, axes = plt.subplots(1, 2)
    fig.suptitle(
        'Proportion of Users in Remaining Cluster that were in Previous Cluster '
        + str(user_name) +
        " with Global Threshold 50 and Comparing Local Threshold 10 to 15")
    ax1 = axes[0]
    ax2 = axes[1]
    ax1.bar(iterations, subset_counts1)
    ax2.bar(iterations, subset_counts2)

    for ax in [ax1, ax2]:
        #for ax in [ax2, ax2]:
        ax.set_ylabel('Overlap Similarity')
        ax.set_xlabel('Iteration Number')

    ax1.set_title(title1, fontsize=10)
    ax2.set_title(title2, fontsize=10)
    plt.show()

    plot_removed(user_difference, repr3, user_name)
Example #24
def produce_plots(user_name: str, thresh, iteration, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    user_friend_getter = dao_module.get_user_friend_getter()
    friends_cleaner = process_module.get_extended_friends_cleaner()
    social_graph_constructor = process_module.get_social_graph_constructor()
    clusterer = process_module.get_clusterer()
    user_getter = dao_module.get_user_getter()
    user_tweet_getter = dao_module.get_user_tweet_getter()
    clean_user_friend_getter = dao_module.get_cleaned_user_friend_getter()
    local_neighbourhood_getter = dao_module.get_local_neighbourhood_getter()
    prod_ranker = process_module.get_ranker()
    con_ranker = process_module.get_ranker("Consumption")

    seed_id = user_getter.get_user_by_screen_name(user_name).id
    # Full user friend list
    init_user_friends = user_friend_getter.get_user_friends_ids(seed_id)
    # tweet_processor.process_tweets_by_user_list(init_user_friends)

    # user = user_getter.get_user_by_id(str(seed_id))
    # follower_thresh = 0.1 * user.followers_count
    # friend_thresh = 0.1 * user.friends_count
    # tweet_thresh = 0.1 * len(user_tweet_getter.get_tweets_by_user_id_time_restricted(str(seed_id)))
    # global_clean = friends_cleaner.clean_friends_global(seed_id,
    #             tweet_threshold=tweet_thresh, follower_threshold=follower_thresh, friend_threshold=friend_thresh)
    # clean_list, removed_list = friends_cleaner.clean_friends_local(seed_id, global_clean, local_following=thresh)
    # clean_list = [str(id) for id in clean_list]

    clean_list = clean_user_friend_getter.get_user_friends_ids(str(seed_id))
    # social_graph = social_graph_constructor.construct_social_graph(seed_id, is_union=False)
    # following_counts = {}
    # for user_id in clean_list:
    #     friends = user_friend_getter.get_user_friends_ids(str(user_id))
    #     following_counts[user_id] = len(set(friends).intersection(clean_list))
    # sorted_users = list(sorted(following_counts, key=following_counts.get, reverse=True))
    # print([following_counts[user] for user in sorted_users])

    local_neighbourhood = local_neighbourhood_getter.get_local_neighbourhood(seed_id)

    # Refined Friends Method
    for k in range(1, 7):
        log.info("Refining Friends List:")
        user_list = local_neighbourhood.get_user_id_list()
        friends_map = {}
        print('1012256833816363008' in user_list)
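        # Keep a friend only when the follow is mutual; for the seed user, mutuality is checked against the friend's full friend list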
        for user in user_list:
            friends_list = []
            friends = local_neighbourhood.get_user_friends(user)
            # print(len(friends))
            for friend in friends:
                if user in local_neighbourhood.get_user_friends(str(friend)):
                    friends_list.append(str(friend))
                if user == str(seed_id):
                    if int(user) in user_friend_getter.get_user_friends_ids(str(friend)):
                        friends_list.append(str(friend))
            # print(len(friends_list))
            friends_map[str(user)] = friends_list
            if user == "254201259":
                print(len(friends_list))

        log.info("Refining by Jaccard Similarity:")
        for user in [str(id) for id in user_list]:
            friends_list = friends_map[user]
            similarities = {}
            for friend in friends_list:
                sim = jaccard_similarity(friends_list, friends_map[str(friend)])
                similarities[friend] = sim
            sorted_users = sorted(similarities, key=similarities.get, reverse=True)
            top_sum = 0
            for top_user in sorted_users[:10]:
                top_sum += similarities[top_user]
            if len(sorted_users) >= 10:
                thresh = 0.1 * k * (top_sum / 10)
            elif len(sorted_users) == 0:
                thresh = 0
            else:
                thresh = 0.1 * k * (top_sum / len(sorted_users))
            # Can do more efficiently using binary search
            index = len(sorted_users)
            for i in range(len(sorted_users)):
                # use the loop index rather than rebinding `user`, so the outer user's entry is updated below
                if similarities[sorted_users[i]] < thresh:
                    index = i
                    break
            friends_map[user] = sorted_users[:index]

        log.info("Thresh: " + str(0.1*k))
        log.info("Setting Local Neighborhood:")
        refined_local_neighborhood = LocalNeighbourhood(str(seed_id), None, friends_map)
        social_graph = social_graph_constructor.construct_social_graph_from_local_neighbourhood(seed_id, refined_local_neighborhood, is_union=False)
        log.info("Clustering:")
        clusters = clusterer.cluster_by_social_graph(seed_id, social_graph, None)
        # log.info("Iteration: " + str(iteration))
        log.info(len(clusters))
        cluster_sizes = {}
        for i in range(len(clusters)):
            cluster_sizes[i] = len(clusters[i].users)
        sorted_indices = sorted(cluster_sizes, key=cluster_sizes.get, reverse=True)
        for index in sorted_indices[:5]:
            cluster = clusters[index]
            prod_ranking, prod = prod_ranker.rank(str(seed_id), cluster)
            con_ranking, con = con_ranker.rank(str(seed_id), cluster)
            ranked_prod = prod_ranking.get_all_ranked_user_ids()
            ranked_con = con_ranking.get_all_ranked_user_ids()

            log.info("Cluster Size: " + str(len(cluster.users)))
            log.info("Ranked by Production: ")
            log.info([user_getter.get_user_by_id(str(id)).screen_name for id in ranked_prod])
            log.info("Ranked by Consumption: ")
            log.info([user_getter.get_user_by_id(str(id)).screen_name for id in ranked_con])
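Example #25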
def ranking_distribution(user_name: str, thresh, path=DEFAULT_PATH):

    injector = Injector.get_injector_from_file(path)
    process_module = injector.get_process_module()
    dao_module = injector.get_dao_module()

    production_ranker = process_module.get_ranker()
    consumption_ranker = process_module.get_ranker("Consumption")
    local_followers_ranker = process_module.get_ranker("LocalFollowers")
    relative_production_ranker = process_module.get_ranker(
        "RelativeProduction")

    user_getter = dao_module.get_user_getter()
    friends_getter = dao_module.get_user_friend_getter()
    tweet_getter = dao_module.get_user_tweet_getter()
    clusterer = process_module.get_clusterer()

    seed_id = user_getter.get_user_by_screen_name(user_name).id
    type = 'local_and_global'
    filename = "./dc2_exp/" + str(type) + '/clusters_local_' + str(
        thresh) + '_global_50' '/' + str(user_name) + '_clusters_0.json'
    with open(filename, 'r') as file:
        user_lists = json.load(file)
        count = len(user_lists)

    cluster1 = user_lists[0]
    #similarity_retweets_matrix(user_name, thresh, 1, tweet_getter, cluster1)
    #similarity_matrix(user_name, thresh, 1, 'p', 'p', friends_getter, cluster1)
    #similarity_graph(user_name, seed_id, thresh, 1, friends_getter, cluster1, clusterer)

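    # For every cluster, score users with each ranker and plot the score distributions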
    for i in range(count):

        cluster = user_lists[i]
        log.info(len(cluster))

        log.info('Scoring Consumption...')
        consumption = consumption_ranker.score_users(cluster)
        ranked_consumption = list(
            sorted(consumption, key=consumption.get, reverse=True))
        consumptions = [consumption[user] for user in ranked_consumption]

        log.info('Scoring Production...')
        production = production_ranker.score_users(cluster)
        ranked_production = list(
            sorted(production, key=production.get, reverse=True))
        productions = [production[user] for user in ranked_production]

        log.info('Scoring Local Followers...')
        local_followers = local_followers_ranker.score_users(cluster)
        log.info(local_followers)
        ranked_followers = list(
            sorted(local_followers, key=local_followers.get, reverse=True))
        followers = [local_followers[user] for user in ranked_followers]

        log.info('Scoring Relative Production...')
        relative_production = relative_production_ranker.score_users(cluster)
        ranked_relative_production = list(
            sorted(relative_production,
                   key=relative_production.get,
                   reverse=True))

        titles = [
            'Distribution of Consumption at Local Threshold ' + str(thresh) +
            ' for Cluster ' + str(i + 1),
            'Distribution of Production at Local Threshold ' + str(thresh) +
            ' for Cluster ' + str(i + 1),
            'Distribution of Local Followers at Local Threshold ' +
            str(thresh) + ' for Cluster ' + str(i + 1)
        ]

        title = titles[0]
        plt.bar(ranked_consumption, consumptions)
        plt.ylabel('Consumption Utility')
        plt.xlabel('Users in Cluster')
        plt.title(title)
        plt.show()

        title = titles[1]
        plt.bar(ranked_production, productions)
        plt.ylabel('Production Utility')
        plt.xlabel('Users in Cluster')
        plt.title(title)
        plt.show()

        title = titles[2]
        plt.bar(ranked_followers, followers)
        plt.ylabel('Local Followers')
        plt.xlabel('Users in Cluster')
        plt.title(title)
        plt.show()

        compare_top_users(ranked_consumption, ranked_production,
                          ranked_followers, ranked_relative_production, i + 1,
                          thresh, user_getter)