def similarity_stats(population_name=None, friends=None):
    """Compute pairwise similarity statistics over a set of friends.

    population_name: optional cache label. When given, the result is cached
        as JSON in outdir under
        config.population_similarity_stats_filename(population_name); a
        cached file is loaded instead of recomputing.
    friends: optional list of friend records; falsy (None/empty) falls back
        to get_friends().
    Returns a Stats object updated with the similarity score of every
    unordered pair of friends.
    """
    def calc(people):
        from itertools import combinations
        # Score each unordered pair exactly once (was an index double loop).
        infos = [get_FriendInfo(p) for p in people]
        pair_stats = Stats()
        for a, b in combinations(infos, 2):
            pair_stats.update(a.similarity_with(b)['score'])
        return pair_stats

    if not friends:
        friends = get_friends()

    # No cache label: compute directly.
    if not population_name:
        return calc(friends)

    out = os.path.join(outdir,
                       config.population_similarity_stats_filename(population_name))
    try:
        # Cache hit: rebuild the Stats object from the saved dict.
        with open(out, 'r') as f:
            return Stats(_dict=json.loads(f.read()))
    except IOError:
        # Cache miss (file absent/unreadable): compute, then persist.
        stats = calc(friends)
        with open(out, 'w') as f:
            f.write(json.dumps(stats.__dict__, indent=4, sort_keys=True))
        return stats
def analyze(): friends = get_friends() pop_wcs = population_word_counts(population_name='everyone') pop_sim_stats = similarity_stats(population_name='everyone') friend_wcs = {wc.friend_id: wc for wc in friend_word_counts()} pop_cutoff = pop_sim_stats.avg() + 2*pop_sim_stats.std_dev() group_sims = Stats() group_words = [] for word, d in pop_wcs.word_counts(): if d['stats'].n() > 1: # more than 1 person used this word cutoff = d['stats'].avg() + 2*d['stats'].std_dev() highs = [] for friend_id in d['ids']: # of the people who used it a lot if friend_wcs[friend_id].counts[word] > cutoff: highs.append(friend_id) if len(highs) > 1: # how similar are they? sim_stats = similarity_stats(population_name='word_'+word, friends = [{'id':i, 'name':friend_wcs[i].friend_name} for i in highs]) group_words.append((word, sim_stats.avg())) group_sims.multi_update(sim_stats.data) print 'EVERYONE' print pop_sim_stats.row_display() print '\nGROUPS' print group_sims.row_display() group_words = sorted(group_words, key=lambda gw: gw[1], reverse=True) print 'Words for which frequenty users were most similar' n = min(len(group_words), 10) for i in range(n): print group_words[i] with open(os.path.join(config.population_similarity_stats_filename('freq_word_users')), 'w') as f: f.write(json.dumps(group_sims.__dict__, indent=4, sort_keys=True)) # bar chart for all friend pairs plt.title('Similarity of all friend pairs') n, bins, patches = plt.hist(pop_sim_stats.data, bins=20, range=[0,50]) plt.xlabel('Similarity') plt.ylabel('Number of Friend Pairs') plt.show() # pie chart for all friend pairs N = float(sum(n)) fracs = [x/N for x in n] bucket_size = bins[1] - bins[0] labels = [str(b)+'-'+str(b+bucket_size) for b in bins[0:-1]] print len(labels), len(fracs) plt.title('Similarity of all friend pairs') plt.pie(fracs, labels=labels) plt.show() # bar chart for word-sharing friend pairs plt.title('Similarity of pairs of friends who use the same word frequently') n, bins, patches = 
plt.hist(group_sims.data, bins=20, range=[0,50]) plt.xlabel('Similarity') plt.ylabel('Number of Friend Pairs') plt.show() # pie chart for all friend pairs N = float(sum(n)) fracs = [x/N for x in n] bucket_size = bins[1] - bins[0] labels = [str(b)+'-'+str(b+bucket_size) for b in bins[0:-1]] print len(labels), len(fracs) plt.title('Similarity of pairs of friends who use the same word frequently') plt.pie(fracs, labels=labels) plt.show()