Example #1
def similarity_stats(population_name=None, friends=None):
    def calc(friends):
        # Score every unordered pair of friends and accumulate the scores.
        friends_info = [get_FriendInfo(f) for f in friends]
        similarity_stats = Stats()
        for i in range(len(friends)):
            f1 = friends_info[i]
            for j in range(i+1, len(friends)):
                f2 = friends_info[j]
                sim = f1.similarity_with(f2)
                similarity_stats.update(sim['score'])
        return similarity_stats

    if not friends:
        friends = get_friends()  # default to the full friend list

    if population_name:
        # Cache results on disk, keyed by population name.
        out = os.path.join(outdir, config.population_similarity_stats_filename(population_name))
        try:
            with open(out, 'r') as f:
                return Stats(_dict=json.load(f))  # reuse the cached stats
        except IOError:
            # No cached copy yet: compute, write the cache, and return.
            stats = calc(friends)
            with open(out, 'w') as f:
                f.write(json.dumps(stats.__dict__, indent=4, sort_keys=True))
            return stats
    else:
        return calc(friends)
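
Both examples are excerpted from a larger module and assume its module-level imports (os, json, matplotlib.pyplot as plt) and project helpers (get_friends, get_FriendInfo, friend_word_counts, population_word_counts, config, outdir). The Stats class is also not shown; below is a minimal sketch of the interface the examples rely on (update, multi_update, n, avg, std_dev, data, row_display, and a _dict keyword for rehydrating from cached JSON). It is inferred purely from the call sites, not taken from the original source:

import math

class Stats(object):
    # Minimal running-statistics holder inferred from the call sites above.
    def __init__(self, _dict=None):
        self.data = []
        if _dict:
            self.__dict__.update(_dict)  # rehydrate from a cached JSON dict

    def update(self, value):
        self.data.append(value)          # record one observation

    def multi_update(self, values):
        self.data.extend(values)         # record many observations at once

    def n(self):
        return len(self.data)

    def avg(self):
        return sum(self.data) / float(len(self.data)) if self.data else 0.0

    def std_dev(self):
        # Sample standard deviation; 0.0 when there are too few observations.
        if len(self.data) < 2:
            return 0.0
        mean = self.avg()
        return math.sqrt(sum((x - mean) ** 2 for x in self.data) / (len(self.data) - 1))

    def row_display(self):
        return 'n=%d avg=%.2f std_dev=%.2f' % (self.n(), self.avg(), self.std_dev())

Because instances serialize via stats.__dict__, this sketch keeps all state in plain JSON-friendly attributes, which is what the open/json.dumps round trip in Example #1 requires.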
Example #2
def analyze():
    pop_wcs = population_word_counts(population_name='everyone')
    pop_sim_stats = similarity_stats(population_name='everyone')
    friend_wcs = {wc.friend_id: wc for wc in friend_word_counts()}

    group_sims = Stats()
    group_words = []
    for word, d in pop_wcs.word_counts():
        if d['stats'].n() > 1: # more than 1 person used this word
            cutoff = d['stats'].avg() + 2*d['stats'].std_dev()
            highs = []
            for friend_id in d['ids']:  # find the friends who used this word unusually often
                if friend_wcs[friend_id].counts[word] > cutoff:
                    highs.append(friend_id)
            if len(highs) > 1:  # how similar are these frequent users to each other?
                sim_stats = similarity_stats(
                    population_name='word_' + word,
                    friends=[{'id': i, 'name': friend_wcs[i].friend_name} for i in highs])
                group_words.append((word, sim_stats.avg()))
                group_sims.multi_update(sim_stats.data)

    print('EVERYONE')
    print(pop_sim_stats.row_display())

    print('\nGROUPS')
    print(group_sims.row_display())

    group_words = sorted(group_words, key=lambda gw: gw[1], reverse=True)
    print('Words for which frequent users were most similar')
    for gw in group_words[:10]:
        print(gw)

    with open(os.path.join(outdir, config.population_similarity_stats_filename('freq_word_users')), 'w') as f:
        f.write(json.dumps(group_sims.__dict__, indent=4, sort_keys=True))

    # bar chart for all friend pairs
    plt.title('Similarity of all friend pairs')
    n, bins, patches = plt.hist(pop_sim_stats.data, bins=20, range=[0,50])
    plt.xlabel('Similarity')
    plt.ylabel('Number of Friend Pairs')
    plt.show()

    # pie chart for all friend pairs
    N = float(sum(n))
    fracs = [x/N for x in n]
    bucket_size = bins[1] - bins[0]
    labels = [str(b) + '-' + str(b + bucket_size) for b in bins[:-1]]
    print(len(labels), len(fracs))  # sanity check: one label per wedge
    plt.title('Similarity of all friend pairs')
    plt.pie(fracs, labels=labels)
    plt.show()

    # bar chart for word-sharing friend pairs
    plt.title('Similarity of pairs of friends who use the same word frequently')
    n, bins, patches = plt.hist(group_sims.data, bins=20, range=[0,50])
    plt.xlabel('Similarity')
    plt.ylabel('Number of Friend Pairs')
    plt.show()

    # pie chart for word-sharing friend pairs
    N = float(sum(n))
    fracs = [x/N for x in n]
    bucket_size = bins[1] - bins[0]
    labels = [str(b) + '-' + str(b + bucket_size) for b in bins[:-1]]
    print(len(labels), len(fracs))  # sanity check: one label per wedge
    plt.title('Similarity of pairs of friends who use the same word frequently')
    plt.pie(fracs, labels=labels)
    plt.show()
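
A note on the chart pairs in Example #2: each pie chart reuses the per-bin counts (n) and bin edges (bins) returned by the immediately preceding plt.hist call, so the wedges are exactly the histogram's bars re-expressed as fractions of all pairs. A standalone sketch of that conversion, using made-up scores in place of the real similarity data:

import matplotlib.pyplot as plt

scores = [3, 7, 7, 12, 18, 22, 22, 31, 40, 44]  # made-up similarity scores

# plt.hist returns the per-bin counts (n) and the bin edges (bins) it drew.
n, bins, patches = plt.hist(scores, bins=20, range=[0, 50])
plt.close()  # discard the histogram figure; only the counts are reused

total = float(sum(n))
fracs = [count / total for count in n]           # fraction of pairs per bucket
bucket_size = bins[1] - bins[0]
labels = [str(b) + '-' + str(b + bucket_size) for b in bins[:-1]]

plt.title('Similarity of all pairs (pie form)')
plt.pie(fracs, labels=labels)
plt.show()

Since there is one edge more than there are bins, dropping the last edge (bins[:-1]) yields exactly one "low-high" label per wedge, which is what the len(labels)/len(fracs) sanity check in the example verifies.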