def ten_top(self):
    """Plot how the PageRank of a fixed sample of users evolves over time.

    Users are ordered by ascending PageRank in the graph stored for
    ``self.log.end_date`` and the (up to) ten users at positions 20-29 of
    that ordering are sampled. Their final PageRank, join value and id are
    printed. Their PageRank is then read from every 14th snapshot between
    ``self.log.start_date`` (inclusive) and ``self.log.end_date``
    (exclusive) and drawn as one line per user.

    NOTE(review): ``join_date_since_inception()`` is assumed to return a
    mapping keyed by the string form of the user id -- confirm.
    """
    join_day = self.log.join_date_since_inception()
    st = Stat()
    # One call serves both the final snapshot and the per-date history.
    pr_by_date = st.pagerank()
    final_pr = pr_by_date[self.log.end_date]
    graph = st.graphs[self.log.end_date]
    user_id = graph.vertex_properties['id']

    # Map the stored id property of each vertex to its integer vertex index.
    vertex_of = {user_id[v]: int(v) for v in graph.vertices()}
    print(vertex_of)

    final_by_user = {int(user_id[v]): final_pr[v] for v in graph.vertices()}
    sorted_pr = sorted(final_by_user.values())
    sorted_id = sorted(final_by_user, key=final_by_user.get)
    samples_pr = sorted_pr[20:30]
    samples_id = sorted_id[20:30]
    samples_join = [join_day[str(uid)] for uid in samples_id]
    print(samples_pr)
    print(samples_join)
    print(samples_id)

    history = [[] for _ in samples_id]
    date_idx = []
    date = self.log.start_date
    while date < self.log.end_date:
        snapshot = pr_by_date[date]
        # Record the offset of the snapshot being sampled *before* advancing
        # the date, so each x-coordinate matches the data plotted at it.
        date_idx.append((date - self.log.start_date).days)
        for series, uid in zip(history, samples_id):
            try:
                series.append(snapshot[str(vertex_of[str(uid)])])
            except Exception:
                # Best effort: any failed lookup is plotted as 0.
                series.append(0)
        date += timedelta(14)

    for series in history:
        plt.plot(date_idx, series)
    plt.show()
def join_date_final_pagerank(self):
    """Relate each user's join value to their final PageRank.

    Only vertices of the ``self.log.end_date`` graph with out-degree > 1
    are considered. The method

    1. draws a box plot of final PageRank grouped by join value (groups in
       ascending order of join value), and
    2. prints a Welch t-test comparing the join values of users above and
       not above a PageRank threshold, followed by the Pearson and Spearman
       correlations between join value and final PageRank.

    NOTE(review): the join values are labelled 'Day of joining' on the plot
    and are assumed to be numeric -- confirm against
    ``join_date_since_inception()``.
    """
    join_day = self.log.join_date_since_inception()
    st = Stat()
    final_pr = st.pagerank()[self.log.end_date]
    graph = st.graphs[self.log.end_date]
    user_id = graph.vertex_properties['id']

    join_since = []
    ult_pgr = []
    for v in graph.vertices():
        if v.out_degree() > 1:
            ult_pgr.append(final_pr[v])
            join_since.append(join_day[user_id[v]])

    # Group PageRank values by join value in one pass. Each user's own
    # PageRank goes into the group (the value must be taken at the user's
    # position in the list, not at the join value itself).
    by_day = {}
    for day, value in zip(join_since, ult_pgr):
        by_day.setdefault(day, []).append(value)
    data = [by_day[day] for day in sorted(by_day)]

    plt.boxplot(data)
    plt.xlabel('Day of joining ')
    plt.ylabel('Value of page rank')
    plt.show()

    # Statistical tests.
    # NOTE(review): the threshold is half the *range* of the values, not the
    # midpoint between min and max -- confirm this is intended.
    thr = max(ult_pgr) - min(ult_pgr)
    thr /= 2
    first = []
    second = []
    for day, value in zip(join_since, ult_pgr):
        if value > thr:
            first.append(day)
        else:
            second.append(day)

    print(stats.ttest_ind(first, second, equal_var=False))
    print(stats.pearsonr(join_since, ult_pgr))
    print(stats.spearmanr(join_since, ult_pgr))
def pr_distr(self):
    """Plot the distribution of final PageRank values.

    PageRank values of the ``self.log.end_date`` graph are rounded to five
    decimals and counted. Five figures are shown in turn: the histogram,
    its log-log version, the running (unnormalised) cumulative count, the
    log-log cumulative count, and ``1 - log(cumulative count)`` against
    ``log(value)``.
    """
    stat = Stat()
    final_pr = stat.pagerank()[self.log.end_date]
    final_graph = stat.graphs[self.log.end_date]

    # Round through string formatting so equal-looking values share a bin.
    rounded = [
        float("{0:.5f}".format(final_pr[vertex]))
        for vertex in final_graph.vertices()
    ]
    histogram = Counter(rounded)
    values = sorted(histogram)
    freqs = [histogram[value] for value in values]

    plt.plot(values, freqs, 'r.')
    plt.ylim([0, 200])
    plt.show()

    plt.plot(np.log(values), np.log(freqs), 'r.')
    plt.ylim([0, np.log(200)])
    plt.show()

    # Running total of the counts, in ascending order of value.
    cumulative = [freqs[0]]
    for freq in freqs[1:]:
        cumulative.append(cumulative[-1] + freq)

    plt.plot(values, cumulative, 'r.')
    plt.show()

    plt.plot(np.log(values), np.log(cumulative), 'r.')
    plt.show()

    plt.plot(np.log(values), 1 - np.log(np.array(cumulative)), 'r.')
    plt.show()
def join_final_date_avg_pagerank(self, num_days=217, window=7):
    """Correlate join value with the average final PageRank of its users.

    Only vertices of the ``self.log.end_date`` graph with out-degree > 0
    are considered. Two aggregations are plotted and, for each, the Pearson
    and Spearman correlations are printed:

    * per join value, for values ``1 <= day < num_days``;
    * per window of ``window`` consecutive join values, for windows starting
      at ``0, window, 2 * window, ...`` below ``num_days``.

    Join values / windows without any user are skipped, because averaging
    an empty group yields NaN and would make the correlations NaN too.

    Args:
        num_days: exclusive upper bound of the join values examined.
        window: width of each bucket in the second aggregation.

    NOTE(review): join values are assumed to be numeric day offsets --
    confirm against ``join_date_since_inception()``.
    """
    join_day = self.log.join_date_since_inception()
    st = Stat()
    final_pr = st.pagerank()[self.log.end_date]
    graph = st.graphs[self.log.end_date]
    user_id = graph.vertex_properties['id']

    # Final PageRank values grouped by the join value of their user.
    by_day = {}
    for v in graph.vertices():
        if v.out_degree() > 0:
            by_day.setdefault(join_day[user_id[v]], []).append(final_pr[v])

    def report(xs, ys):
        # Correlations need at least two points.
        if len(xs) < 2:
            print('not enough data points for a correlation')
            return
        print(stats.pearsonr(xs, ys))
        print(stats.spearmanr(xs, ys))

    # Average per individual join value.
    days = [day for day in sorted(by_day) if 1 <= day < num_days]
    avg_pr = [np.average(by_day[day]) for day in days]
    report(days, avg_pr)
    plt.plot(days, avg_pr, 'r.')
    plt.show()

    # Average per window of consecutive join values.
    windows = []
    avg_pr = []
    for index, start in enumerate(range(0, num_days, window)):
        bucket = []
        for day in sorted(by_day):
            if start <= day < start + window:
                bucket.extend(by_day[day])
        if bucket:
            windows.append(index)
            avg_pr.append(np.average(bucket))
    plt.plot(windows, avg_pr, 'r.')
    plt.show()
    report(windows, avg_pr)
def deck_pagerank(self):
    """Sample one user from every deck of 20 in the PageRank ordering.

    Users of the ``self.log.end_date`` graph are ordered by ascending final
    PageRank and cut into consecutive decks of 20 (the last deck may be
    shorter). From each deck one position is drawn uniformly at random; the
    draw is repeated up to 20 times while the drawn PageRank is below
    0.0001. The sampled join values, PageRank values and positions are
    printed and the join values are plotted in deck order.

    NOTE(review): ``join_date_since_inception()`` is assumed to return a
    mapping keyed by the string form of the user id -- confirm.
    """
    deck_size = 20
    max_retries = 20
    min_pr = 0.0001

    join_day = self.log.join_date_since_inception()
    st = Stat()
    final_pr = st.pagerank()[self.log.end_date]
    graph = st.graphs[self.log.end_date]
    user_id = graph.vertex_properties['id']

    pr_by_user = {int(user_id[v]): final_pr[v] for v in graph.vertices()}
    sorted_pr = sorted(pr_by_user.values())
    sorted_id = sorted(pr_by_user, key=pr_by_user.get)
    total = len(sorted_id)

    samples = []
    sample_day_joined = []
    samples_ids = []
    for start in range(0, total, deck_size):
        # Bound the offset by the real deck length so the final, possibly
        # partial, deck cannot index past the end of the ordering.
        deck_len = min(deck_size, total - start)
        pick = start + int(np.random.uniform() * deck_len)
        retries = 0
        while sorted_pr[pick] < min_pr and retries < max_retries:
            pick = start + int(np.random.uniform() * deck_len)
            retries += 1
        samples.append(sorted_pr[pick])
        sample_day_joined.append(join_day[str(sorted_id[pick])])
        samples_ids.append(pick)

    print(sample_day_joined)
    print(samples)
    print(samples_ids)
    plt.plot(list(range(len(samples))), sample_day_joined, 'r.')
    plt.show()
def top_ten_change_in_pr(self):
    """Analyse week-over-week changes in users' rank position.

    Despite the name, *all* users of the ``self.log.end_date`` graph are
    sampled (ordered by ascending final PageRank). For every 7-day step
    from ``self.log.start_date`` while a full following week exists, the
    change of each user's index in ``st.get_users_rank()`` between the
    start and the end of the week is recorded; any failed lookup counts as
    a change of 0.

    Output:
      * one line plot per user of the weekly change;
      * Spearman/Pearson correlations of the average change, the average
        positive change and the average non-positive change against the
        number of events in each week;
      * plots of the average change and of the positive / absolute
        non-positive averages divided by ``freq[1:]`` from
        ``self.log.event_frequency('msg', 'week')``.

    NOTE(review): ``freq[1:]`` is assumed to have one entry per analysed
    week -- confirm against ``event_frequency``.
    """
    join_day = self.log.join_date_since_inception()
    st = Stat()
    final_pr = st.pagerank()[self.log.end_date]
    graph = st.graphs[self.log.end_date]
    user_id = graph.vertex_properties['id']

    vertex_of = {user_id[v]: int(v) for v in graph.vertices()}
    print(vertex_of)

    pr_by_user = {int(user_id[v]): final_pr[v] for v in graph.vertices()}
    samples_pr = sorted(pr_by_user.values())
    samples_id = sorted(pr_by_user, key=pr_by_user.get)
    samples_join = [join_day[str(uid)] for uid in samples_id]
    print(samples_pr)
    print(samples_join)
    print(samples_id)

    rank = st.get_users_rank()
    week = timedelta(7)
    last_start = self.log.end_date - week

    date_idx = []
    pr_ev = [[] for _ in samples_id]   # signed change per user per week
    pr_inc = [[] for _ in samples_id]  # positive part of the change
    pr_dec = [[] for _ in samples_id]  # non-positive part of the change

    date = self.log.start_date
    while date <= last_start:
        date_idx.append((date - self.log.start_date).days)
        for i, uid in enumerate(samples_id):
            key = str(uid)
            try:
                change = rank[date + week].index(key) - rank[date].index(key)
            except Exception:
                # Best effort: a missing date or an unranked user counts as
                # "no change". Unlike a bare ``except`` this lets
                # KeyboardInterrupt / SystemExit propagate.
                change = 0
            pr_ev[i].append(change)
            pr_inc[i].append(change if change > 0 else 0)
            pr_dec[i].append(change if change < 0 else 0)
        date += week

    for series in pr_ev:
        plt.plot(date_idx, series)
    plt.show()

    # Number of events in each analysed week.
    event_snapshot = self.log.get_event_snapshot('week', 'event')
    data = []
    date = self.log.start_date
    while date <= last_start:
        data.append(len(event_snapshot[date]))
        date += week

    freq = self.log.event_frequency('msg', 'week')

    # Average across users, one value per week.
    prch = np.average(pr_ev, axis=0)
    prchp = np.average(pr_inc, axis=0)
    prchn = np.average(pr_dec, axis=0)
    print(stats.spearmanr(prch, data))
    print(stats.pearsonr(prch, data))
    print(stats.spearmanr(prchp, data))
    print(stats.pearsonr(prchp, data))
    print(stats.spearmanr(prchn, data))
    print(stats.pearsonr(prchn, data))

    weekly_freq = np.array(freq[1:])
    prchp = prchp / weekly_freq
    prchn = prchn / weekly_freq

    plt.plot(list(range(len(prch))), prch)
    plt.show()
    plt.plot(list(range(len(prchp))), prchp)
    plt.show()
    plt.plot(list(range(len(prchp))), np.abs(prchn))
    plt.show()
def ten_top_rank(self):
    """Plot the daily rank position of a fixed sample of users.

    Users of the ``self.log.end_date`` graph are ordered by ascending final
    PageRank and the (up to) 20 users at positions 980-999 of that ordering
    are sampled. For every day from ``self.log.start_date`` (inclusive) to
    ``self.log.end_date`` (exclusive) the 1-based index of each user in
    ``st.get_users_rank()[date]`` is recorded. Days on which the lookup
    fails are left out of that user's line.

    NOTE(review): ``join_date_since_inception()`` is assumed to return a
    mapping keyed by the string form of the user id -- confirm.
    """
    sample_size = 20
    upper = 1000

    join_day = self.log.join_date_since_inception()
    st = Stat()
    final_pr = st.pagerank()[self.log.end_date]
    graph = st.graphs[self.log.end_date]
    user_id = graph.vertex_properties['id']

    vertex_of = {user_id[v]: int(v) for v in graph.vertices()}
    print(vertex_of)

    pr_by_user = {int(user_id[v]): final_pr[v] for v in graph.vertices()}
    sorted_pr = sorted(pr_by_user.values())
    sorted_id = sorted(pr_by_user, key=pr_by_user.get)
    samples_pr = sorted_pr[upper - sample_size:upper]
    samples_id = sorted_id[upper - sample_size:upper]
    samples_join = [join_day[str(uid)] for uid in samples_id]
    print(samples_pr)
    print(samples_join)
    print(samples_id)

    rank = st.get_users_rank()
    date_idx = []
    positions = [[] for _ in samples_id]
    date = self.log.start_date
    while date < self.log.end_date:
        date_idx.append((date - self.log.start_date).days)
        for series, uid in zip(positions, samples_id):
            try:
                series.append(rank[date].index(str(uid)) + 1)
            except Exception:
                # 0 marks "not ranked on this day"; real positions are >= 1.
                # Unlike a bare ``except`` this lets KeyboardInterrupt /
                # SystemExit propagate.
                series.append(0)
        date += timedelta(1)

    for series in positions:
        ranked = [(day, pos) for day, pos in zip(date_idx, series) if pos != 0]
        plt.plot([day for day, _ in ranked], [pos for _, pos in ranked])
    plt.show()