def pr_bin_dist(self): st = Stat() final_pr = st.user_pagerank()[self.log.end_date] log_pr = self.cal_log_pr(final_pr.values()) per = self.get_bins(70, log_pr) rep = self.get_rep(log_pr, per) count = np.zeros(len(rep)) cdf = np.zeros(len(rep)) for pr in final_pr.values(): g = self.get_bin_group(self.cal_log_pr(pr), per) count[g] += 1 for i in range(len(count)): if i == 0: cdf[i] = count[i] cdf[i] = count[i] + cdf[i - 1] print cdf count = self.cal_log_pr(count) cdf = self.cal_log_pr(cdf) count, rep_cleaned = self.clean_lists(count, rep) xp = np.linspace(np.min(rep_cleaned), np.max(rep_cleaned), 100) print np.polyfit(rep_cleaned, count, 1) cc = np.poly1d(np.polyfit(rep_cleaned, count, 1)) plt.plot(rep_cleaned, count, '.', xp, cc(xp), '-') plt.show() cdf, rep_cleaned = self.clean_lists(cdf, rep) print np.polyfit(rep_cleaned, 1 - cdf, 1) cc = np.poly1d(np.polyfit(rep_cleaned, 1 - cdf, 1)) plt.plot(rep_cleaned, 1 - cdf, '.', xp, cc(xp), '-') plt.show()
def pr_distr(self): st = Stat() pr = st.pagerank()[self.log.end_date] graph = st.graphs[self.log.end_date] pr_list = [] for v in graph.vertices(): pr_list.append(float("{0:.5f}".format(pr[v]))) count = Counter(pr_list) x = [] y = [] for key in sorted(count.iterkeys()): x.append(key) y.append(count[key]) plt.plot(x, y, 'r.') plt.ylim([0, 200]) plt.show() plt.plot(np.log(x), np.log(y), 'r.') plt.ylim([0, np.log(200)]) plt.show() cdf = [y[0]] for i in range(1, len(y)): cdf.append(cdf[i - 1] + y[i]) plt.plot(x, cdf, 'r.') plt.show() plt.plot(np.log(x), np.log(cdf), 'r.') plt.show() plt.plot(np.log(x), 1 - np.log(np.array(cdf)), 'r.') plt.show()
def join_date_final_pagerank(self): x = self.log.join_date_since_inception() st = Stat() pr = st.pagerank()[self.log.end_date] graph = st.graphs[self.log.end_date] #print pr , graph id = graph.vertex_properties['id'] join_since = [] ult_pgr = [] for v in graph.vertices(): if v.out_degree() > 1: ult_pgr.append(pr[v]) join_since.append(x[id[v]]) # # join_since = join_since[1:] # ult_pgr = ult_pgr[1:] #plt.plot(join_since , ult_pgr , 'r.') #data = [join_since , ult_pgr] data = [] for i in sorted(list(set(join_since))): d = [] for j in range(len(join_since)): if join_since[j] == i: d.append(ult_pgr[i]) data.append(d) plt.boxplot(data) plt.xlabel('Day of joining ') plt.ylabel('Value of page rank') plt.show() # stat tests demo thr = max(ult_pgr) - min(ult_pgr) thr /= 2 first = [] second = [] for i in range(len(ult_pgr)): if ult_pgr[i] > thr: first.append(join_since[i]) else: second.append(join_since[i]) # join = [] # ult = [] # for i in range(len(ult_pgr)): # #if ult_pgr[i] > 0.0001: # join.append(join_since[i]) # ult.append(ult_pgr[i]) # print len(join) # print len(join_since) # plt.plot(join, ult, 'r.') # plt.show() print(stats.ttest_ind(first, second, equal_var=False)) print(stats.pearsonr(join_since, ult_pgr)) print(stats.spearmanr(join_since, ult_pgr))
def matthew_effect(self): st = Stat() final_pr = st.user_pagerank()[self.log.end_date] log_pr = self.cal_log_pr(final_pr.values()) per = self.get_bins(70, log_pr) rep = self.get_rep(log_pr, per) print rep pr = st.user_pagerank() date = self.log.start_date pch = np.zeros(len(rep)) pcount = np.zeros(len(rep)) nch = np.zeros(len(rep)) ncount = np.zeros(len(rep)) change = np.zeros(len(rep)) count = np.zeros(len(rep)) rel_prob = np.zeros(len(rep)) while date < self.log.end_date: for user in pr[date]: g = self.get_bin_group(self.cal_log_pr(pr[date][user]), per) if user not in pr[date + timedelta(7)]: nch[g] += pr[date][user] ncount[g] += 1 change[g] += pr[date][user] count[g] += 1 continue ch = pr[date + timedelta(7)][user] - pr[date][user] if ch >= 0: pch[g] += ch pcount[g] += 1 if ch < 0: nch[g] += abs(ch) ncount[g] += 1 change[g] += pr[date][user] count[g] += 1 date += timedelta(7) for i in range(len(pch)): pch[i] /= pcount[i] nch[i] /= ncount[i] change[i] /= count[i] nch = self.cal_log_pr(nch) pch = self.cal_log_pr(pch) change = self.cal_log_pr(change) print np.polyfit(rep, nch, 1) print np.polyfit(rep, pch, 1) print np.polyfit(rep, change, 1) pp = np.poly1d(np.polyfit(rep, pch, 1)) nn = np.poly1d(np.polyfit(rep, nch, 1)) changep = np.poly1d(np.polyfit(rep, change, 1)) xp = np.linspace(np.min(rep), np.max(rep), 100) plt.plot(rep, pch, '.', xp, pp(xp), '-') plt.show() plt.plot(rep, nch, '.', xp, nn(xp), '-') plt.show() plt.plot(rep, change, '.', xp, changep(xp), '-') plt.show()
def ten_top(self): x = self.log.join_date_since_inception() st = Stat() pr = st.pagerank()[self.log.end_date] graph = st.graphs[self.log.end_date] id = graph.vertex_properties['id'] id_invert = {} for v in graph.vertices(): id_invert[id[v]] = int(v) print id_invert pr_dict = {} for v in graph.vertices(): pr_dict[int(id[v])] = pr[v] sorted_pr = sorted(pr_dict.values()) sorted_id = sorted(pr_dict, key=pr_dict.get) samples_pr = sorted_pr[20:30] samples_id = sorted_id[20:30] K = len(samples_id) samples_join = [] for idx in samples_id: samples_join.append(x[str(idx)]) print samples_pr print samples_join print samples_id pr = st.pagerank() date = self.log.start_date pr_ev = [] date_idx = [] for i in range(K): pr_ev.append([]) while (date < self.log.end_date): p = pr[date] for i in range(K): try: pr_ev[i].append(p[str(id_invert[str(samples_id[i])])]) except Exception: pr_ev[i].append(0) date += timedelta(14) date_idx.append((date - self.log.start_date).days) for i in range(K): plt.plot(date_idx, pr_ev[i]) #print pr_ev[i] #plt.ylim([0,0.0002]) plt.show()
def __create_cumulative_design_matrix__(self , k = 5): kgram_list , kgram_count = self.create_k_grams(k) n = len(kgram_count.keys()) X = np.zeros((n , len(kgram_list))) y = np.zeros(n) st = Stat() pr = st.user_pagerank()[self.log.end_date] sorted_ids = sorted(kgram_count.keys() , key = lambda x: int(x)) for i , user in enumerate(sorted_ids): for j, gram in enumerate(kgram_list): if user not in pr: print user continue X[i,j] = kgram_count[user][gram] y[i]= pr[user] return X , y
def join_final_date_avg_pagerank(self): x = self.log.join_date_since_inception() st = Stat() pr = st.pagerank()[self.log.end_date] graph = st.graphs[self.log.end_date] id = graph.vertex_properties['id'] join_since = [] ult_pgr = [] for v in graph.vertices(): if v.out_degree() > 0: ult_pgr.append(pr[v]) join_since.append(x[id[v]]) avg_pr = [] for i in range(1, 217): s = [] for j in range(len(join_since)): if i == j: s.append(ult_pgr[j]) avg_pr.append(np.average(s)) print(stats.pearsonr([i for i in range(len(avg_pr))], avg_pr)) print(stats.spearmanr([i for i in range(len(avg_pr))], avg_pr)) plt.plot([i for i in range(1, 217)], avg_pr, 'r.') plt.show() avg_pr = [] i = 0 while i < 217: s = [] for j in range(len(join_since)): if j >= i and j < i + 7: s.append(ult_pgr[j]) avg_pr.append(np.average(s)) i += 7 plt.plot([i for i in range(len(avg_pr))], avg_pr, 'r.') plt.show() print(stats.pearsonr([i for i in range(len(avg_pr))], avg_pr)) print(stats.spearmanr([i for i in range(len(avg_pr))], avg_pr))
def pr_msg_received(self, type='r'): spcorr = {} corr = {} st = Stat() rank = st.get_users_rank() seq = self.log.get_user_date_seq() for date in seq.iterkeys(): weekly_rank = [] weekly_r_count = [] for user in seq[date].iterkeys(): if date != self.log.end_date: if user in rank[date]: weekly_rank.append(rank[date].index(user) + 1) weekly_r_count.append( Counter(seq[date + timedelta(7)][user])[type]) if weekly_r_count != [] and weekly_rank != []: corr[date] = stats.pearsonr(weekly_rank, weekly_r_count) spcorr[date] = stats.spearmanr(weekly_rank, weekly_r_count) # x = [] # y = [] # for date in sorted(corr.iterkeys()): # x.append((date - self.log.start_date).days) # y.append(corr[date][0]) # # plt.plot(x,y , 'r.') # plt.show() x = [] y = [] for date in sorted(spcorr.iterkeys()): x.append((date - self.log.start_date).days) y.append(spcorr[date][0]) # plt.plot(x, y, 'r.') # plt.show() return x, y
def deck_pagerank(self): x = self.log.join_date_since_inception() st = Stat() pr = st.pagerank()[self.log.end_date] graph = st.graphs[self.log.end_date] id = graph.vertex_properties['id'] pr_dict = {} for v in graph.vertices(): pr_dict[int(id[v])] = pr[v] sorted_pr = sorted(pr_dict.values()) sorted_id = sorted(pr_dict, key=pr_dict.get) samples = [] sample_day_joined = [] samples_ids = [] i = 0 while (i < len(sorted_id)): idx = int(np.random.uniform() * 20) it = 0 while (sorted_pr[idx + i] < 0.0001 and it < 20): idx = int(np.random.uniform() * 20) it += 1 samples.append(sorted_pr[idx + i]) sample_day_joined.append(x[str(sorted_id[idx + i])]) samples_ids.append(idx + i) i += 20 print(sample_day_joined) print(samples) print(samples_ids) plt.plot([i for i in range(len(samples))], sample_day_joined, 'r.') plt.show()
def freq_pr_corr_weekly(self): st = Stat() rank = st.get_users_rank() seq = Sequence() kgram_list, kgram_count = seq.create_weekly_sequences() scorr = {} pcorr = {} for kgram in kgram_list: scorr[kgram] = {} pcorr[kgram] = {} date = self.log.start_date + timedelta(7) while date <= self.log.end_date: lrank = [] lfreq = [] for i, user in enumerate(rank[date]): if user in kgram_count[date - timedelta(7)]: if kgram in kgram_count[date - timedelta(7)][user]: lfreq.append( kgram_count[date - timedelta(7)][user][kgram]) if user in rank[date - timedelta(7)]: prank = rank[date - timedelta(7)].index(user) lrank.append(i - prank) else: lrank.append(i + 1) if lrank is not [] and lfreq is not []: scorr[kgram][date] = stats.spearmanr(lfreq, lrank) pcorr[kgram][date] = stats.pearsonr(lfreq, lrank) date += timedelta(7) print scorr print pcorr for kgram in kgram_list: score = [] dates = [] for date in scorr[kgram]: if type(scorr[kgram][date][0]) == np.float64 \ and scorr[kgram][date][0] is not np.nan and \ scorr[kgram][date][1] < 0.001: dates.append((date - self.log.start_date).days) score.append(scorr[kgram][date][0]) if len(score) > 0: print(kgram) print score plt.plot(dates, score, '.') plt.show() for kgram in kgram_list: score = [] dates = [] for date in pcorr[kgram]: if type(pcorr[kgram][date][0]) == np.float64 \ and pcorr[kgram][date][0] is not np.nan and \ pcorr[kgram][date][1] < 0.001: dates.append((date - self.log.start_date).days) score.append(pcorr[kgram][date][0]) if len(score) > 0: print(kgram) plt.plot(dates, score, '.') plt.show()
def top_ten_change_in_pr(self): x = self.log.join_date_since_inception() st = Stat() pr = st.pagerank()[self.log.end_date] graph = st.graphs[self.log.end_date] id = graph.vertex_properties['id'] id_invert = {} for v in graph.vertices(): id_invert[id[v]] = int(v) print id_invert pr_dict = {} for v in graph.vertices(): pr_dict[int(id[v])] = pr[v] sorted_pr = sorted(pr_dict.values()) sorted_id = sorted(pr_dict, key=pr_dict.get) K = 10 samples_pr = sorted_pr[:] samples_id = sorted_id[:] # samples_pr = sorted_pr[-K:] # samples_id = sorted_id[-K:] # samples_pr = sorted_pr[1800 - K: 1800] # samples_id = sorted_id[1800 - K: 1800] # samples_pr = sorted_pr[1000 - K: 1000] # samples_id = sorted_id[1000 - K: 1000] # samples_pr = sorted_pr[800 - K: 800] # samples_id = sorted_id[800 - K: 800] K = len(samples_id) samples_join = [] for idx in samples_id: samples_join.append(x[str(idx)]) print samples_pr print samples_join print samples_id rank = st.get_users_rank() date_idx = [] pr_ev = [] pr_inc = [] pr_dec = [] for i in range(K): pr_ev.append([]) pr_inc.append([]) pr_dec.append([]) date = self.log.start_date while date <= self.log.end_date - timedelta(7): date_idx.append((date - self.log.start_date).days) for i in range(K): try: change = -rank[date].index(str(samples_id[i])) + \ rank[date + timedelta(7)].index(str(samples_id[i])) pr_ev[i].append(change) if change > 0: pr_inc[i].append(change) pr_dec[i].append(0) else: pr_dec[i].append(change) pr_inc[i].append(0) except: pr_ev[i].append(0) pr_dec[i].append(0) pr_inc[i].append(0) date += timedelta(7) for i in range(K): l = [] date_idx_temp = [] for j in range(len(pr_ev[i])): l.append(pr_ev[i][j]) date_idx_temp.append(date_idx[j]) plt.plot(date_idx_temp, l) plt.show() interval = timedelta(7) event_snapshot = self.log.get_event_snapshot('week', 'event') date = self.log.start_date data = [] while date <= self.log.end_date - timedelta(7): data.append(len(event_snapshot[date])) date += interval freq = self.log.event_frequency('msg', 'week') prch = np.average(pr_ev, axis=0) prchp = np.average(pr_inc, axis=0) prchn = np.average(pr_dec, axis=0) print(stats.spearmanr(prch, data)) print(stats.pearsonr(prch, data)) print(stats.spearmanr(prchp, data)) print(stats.pearsonr(prchp, data)) print(stats.spearmanr(prchn, data)) print(stats.pearsonr(prchn, data)) prchp = prchp / np.array(freq[1:]) prchn = prchn / np.array(freq[1:]) plt.plot([i for i in range(len(prch))], prch) plt.show() plt.plot([i for i in range(len(prchp))], prchp) plt.show() plt.plot([i for i in range(len(prchp))], np.abs(prchn)) plt.show()
def pr_new_msgs_received(self): spcorr = {} corr = {} user_new_msg = {} new_weekly_msgs = self.log.get_new_messages_received() st = Stat() rank = st.get_users_rank() event_frequency = self.log.event_frequency('msg', 'week') for date in rank: wrank = [] wnr = [] for i, user in enumerate(rank[date]): if i + 1 not in user_new_msg: user_new_msg[i + 1] = 0 wrank.append(i + 1) if date in new_weekly_msgs: if user in new_weekly_msgs[date]: wnr.append(len(new_weekly_msgs[date][user])) user_new_msg[i + 1] += len(new_weekly_msgs[date][user]) else: wnr.append(0) if len(wnr) == len(wrank): spcorr[date] = stats.spearmanr(wnr, wrank) corr[date] = stats.pearsonr(wnr, wrank) # # x = [] # y = [] # for date in sorted(corr.iterkeys()): # x.append((date - self.log.start_date).days) # y.append(corr[date][0] ) # # plt.plot(x, y, 'r.') # plt.show() x = [] y = [] for date in sorted(spcorr.iterkeys()): if spcorr[date][0] < 0.5: x.append((date - self.log.start_date).days) y.append(spcorr[date][0]) new, = plt.plot(x, y, 'g') plt.xlabel('Day from inception') plt.ylabel('Spearman correlation') plt.title('Spearman correlation between page rank and number ' +\ 'of\n new messages received through out time') plt.margins(0.05) # x, y = self.pr_msg_received() # total, = plt.plot(x,y, 'r') plt.show() x, y = [], [] for user in sorted(user_new_msg.keys(), key=lambda x: int(x)): x.append(user) y.append(user_new_msg[user]) print np.max(y) plt.hist(y, x, normed=False) plt.title( 'Total number of new messages received by each rank through out time' ) plt.xlabel('Pagerank') plt.ylabel('Number of new messages received') plt.margins(0.05) plt.show()
def ten_top_rank(self): x = self.log.join_date_since_inception() st = Stat() pr = st.pagerank()[self.log.end_date] graph = st.graphs[self.log.end_date] id = graph.vertex_properties['id'] id_invert = {} for v in graph.vertices(): id_invert[id[v]] = int(v) print id_invert pr_dict = {} for v in graph.vertices(): pr_dict[int(id[v])] = pr[v] sorted_pr = sorted(pr_dict.values()) sorted_id = sorted(pr_dict, key=pr_dict.get) K = 20 # samples_pr = sorted_pr[:] # samples_id = sorted_id[:] # samples_pr = sorted_pr[-K:] # samples_id = sorted_id[-K:] samples_pr = sorted_pr[1000 - K:1000] samples_id = sorted_id[1000 - K:1000] # samples_pr = sorted_pr[800 - K: 800] # samples_id = sorted_id[800 - K: 800] K = len(samples_id) samples_join = [] for idx in samples_id: samples_join.append(x[str(idx)]) print samples_pr print samples_join print samples_id rank = st.get_users_rank() date_idx = [] pr_ev = [] for i in range(K): pr_ev.append([]) date = self.log.start_date while date < self.log.end_date: date_idx.append((date - self.log.start_date).days) for i in range(K): try: pr_ev[i].append(rank[date].index(str(samples_id[i])) + 1) except: pr_ev[i].append(0) date += timedelta(1) for i in range(K): l = [] date_idx_temp = [] for j in range(len(pr_ev[i])): if pr_ev[i][j] != 0: l.append(pr_ev[i][j]) date_idx_temp.append(date_idx[j]) plt.plot(date_idx_temp, l) plt.show()