def matthew_effect(self):
    """Fit and plot per-bin weekly PageRank change against the bin values.

    Bins come from ``self.get_bins(70, ...)`` applied to the
    ``self.cal_log_pr``-transformed PageRank values at ``self.log.end_date``;
    ``self.get_rep`` supplies one x value per bin. The PageRank snapshots
    are then walked in 7-day steps from ``self.log.start_date`` (exclusive
    of ``self.log.end_date``) and three per-bin sums are accumulated:

    * ``pch`` / ``pcount`` -- non-negative week-over-week differences;
    * ``nch`` / ``ncount`` -- magnitudes of negative differences, plus the
      full current score of any user absent from the next snapshot;
    * ``change`` / ``count`` -- the current score of every user seen.

    Each per-bin mean is passed through ``self.cal_log_pr``, fitted with a
    degree-1 polynomial against the bin x values, printed, and shown as a
    scatter plot with the fitted line.

    Side effects: prints the bin x values and three coefficient arrays
    (order: ``nch``, ``pch``, ``change``) and opens three matplotlib
    windows (order: ``pch``, ``nch``, ``change``). Returns ``None``.
    """
    # Fetch the snapshots once and reuse them for both the final-date
    # binning and the weekly walk.
    pr = Stat().user_pagerank()
    final_pr = pr[self.log.end_date]
    log_pr = self.cal_log_pr(final_pr.values())
    per = self.get_bins(70, log_pr)
    rep = self.get_rep(log_pr, per)
    print(rep)

    n_bins = len(rep)
    pch = np.zeros(n_bins)
    pcount = np.zeros(n_bins)
    nch = np.zeros(n_bins)
    ncount = np.zeros(n_bins)
    change = np.zeros(n_bins)
    count = np.zeros(n_bins)

    step = timedelta(7)
    date = self.log.start_date
    while date < self.log.end_date:
        current = pr[date]
        following = pr[date + step]
        for user in current:
            score = current[user]
            g = self.get_bin_group(self.cal_log_pr(score), per)
            if user not in following:
                # A user missing next week is counted as losing the
                # whole current score.
                nch[g] += score
                ncount[g] += 1
            else:
                ch = following[user] - score
                if ch >= 0:
                    pch[g] += ch
                    pcount[g] += 1
                elif ch < 0:
                    nch[g] += abs(ch)
                    ncount[g] += 1
            # NOTE(review): this adds the current score, not the
            # week-over-week delta, for every user. It may have been meant
            # to be abs(ch); confirm intent. The source formatting was
            # ambiguous about whether these two lines sit at loop level.
            change[g] += score
            count[g] += 1
        date += step

    # Per-bin means. A bin with a zero counter yields nan here (numpy
    # emits a RuntimeWarning), which then flows into the fits below.
    pch /= pcount
    nch /= ncount
    change /= count

    nch = self.cal_log_pr(nch)
    pch = self.cal_log_pr(pch)
    change = self.cal_log_pr(change)

    # Fit each series once and reuse the coefficients for print and plot.
    nch_fit = np.polyfit(rep, nch, 1)
    pch_fit = np.polyfit(rep, pch, 1)
    change_fit = np.polyfit(rep, change, 1)
    print(nch_fit)
    print(pch_fit)
    print(change_fit)

    xp = np.linspace(np.min(rep), np.max(rep), 100)
    for values, coeffs in ((pch, pch_fit), (nch, nch_fit), (change, change_fit)):
        plt.plot(rep, values, '.', xp, np.poly1d(coeffs)(xp), '-')
        plt.show()
def pr_bin_dist(self):
    """Plot the binned distribution of final-date PageRank with linear fits.

    PageRank values at ``self.log.end_date`` are transformed with
    ``self.cal_log_pr`` and binned via ``self.get_bins(70, ...)``;
    ``self.get_rep`` supplies one x value per bin. The number of users per
    bin and its running total are computed, both are passed through
    ``self.cal_log_pr``, filtered with ``self.clean_lists``, fitted with a
    degree-1 polynomial, and plotted with the fitted line.

    Side effects: prints the running total and the two coefficient arrays,
    and opens two matplotlib windows. Returns ``None``.
    """
    final_pr = Stat().user_pagerank()[self.log.end_date]
    log_pr = self.cal_log_pr(final_pr.values())
    per = self.get_bins(70, log_pr)
    rep = self.get_rep(log_pr, per)

    count = np.zeros(len(rep))
    for pr in final_pr.values():
        g = self.get_bin_group(self.cal_log_pr(pr), per)
        count[g] += 1

    # Running total of the per-bin counts. np.cumsum avoids the wrap-around
    # read of cdf[-1] at index 0 that a manual loop would need to guard
    # against (it doubles the value when there is a single bin).
    cdf = np.cumsum(count)
    print(cdf)

    count = self.cal_log_pr(count)
    cdf = self.cal_log_pr(cdf)

    # First plot: transformed per-bin counts.
    count, rep_cleaned = self.clean_lists(count, rep)
    xp = np.linspace(np.min(rep_cleaned), np.max(rep_cleaned), 100)
    count_fit = np.polyfit(rep_cleaned, count, 1)
    print(count_fit)
    plt.plot(rep_cleaned, count, '.', xp, np.poly1d(count_fit)(xp), '-')
    plt.show()

    # Second plot: one minus the transformed running total.
    # NOTE(review): cdf here is an unnormalised cumulative count after
    # cal_log_pr, so ``1 - cdf`` is not a complementary CDF in the usual
    # sense -- confirm this is the intended quantity.
    cdf, rep_cleaned = self.clean_lists(cdf, rep)
    tail = 1 - cdf
    # Recompute the line's x range from this plot's own cleaned x values,
    # which may differ from those of the first plot.
    xp = np.linspace(np.min(rep_cleaned), np.max(rep_cleaned), 100)
    tail_fit = np.polyfit(rep_cleaned, tail, 1)
    print(tail_fit)
    plt.plot(rep_cleaned, tail, '.', xp, np.poly1d(tail_fit)(xp), '-')
    plt.show()
def __create_cumulative_design_matrix__(self, k=5):
    """Build a k-gram count matrix and a PageRank target vector.

    ``self.create_k_grams(k)`` supplies the list of k-grams (columns) and
    a mapping ``user -> {gram: count}`` (rows). Rows are ordered by the
    integer value of the user id. The target for each row is that user's
    PageRank at ``self.log.end_date``.

    A user with no PageRank entry is printed once and skipped, so that
    user's row in ``X`` and entry in ``y`` stay all-zero.

    Args:
        k: k-gram length forwarded to ``self.create_k_grams``.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_users, n_grams)`` and ``y``
        has shape ``(n_users,)``.
    """
    kgram_list, kgram_count = self.create_k_grams(k)
    n = len(kgram_count)
    X = np.zeros((n, len(kgram_list)))
    y = np.zeros(n)

    pr = Stat().user_pagerank()[self.log.end_date]

    sorted_ids = sorted(kgram_count, key=int)
    for i, user in enumerate(sorted_ids):
        # Check membership once per user, not once per gram, so a missing
        # user is reported a single time.
        if user not in pr:
            print(user)
            continue
        gram_counts = kgram_count[user]
        X[i] = [gram_counts[gram] for gram in kgram_list]
        y[i] = pr[user]
    return X, y