Beispiel #1
0
    def ten_top(self):
        """Plot the PageRank history of ten users sampled from the final ranking.

        Users are ordered by their PageRank on ``self.log.end_date``
        (ascending) and the ones at positions 20-29 of that ordering are
        selected.  Their PageRank is then sampled every 14 days from
        ``self.log.start_date`` up to (but excluding) ``self.log.end_date``
        and one line per user is drawn.  Whenever the lookup for a user fails
        on a given date, 0 is recorded for that sample.

        The selected PageRank values, join days and user ids are printed
        before plotting.
        """
        x = self.log.join_date_since_inception()
        st = Stat()
        start_date = self.log.start_date
        end_date = self.log.end_date

        # Compute the per-date PageRank once and reuse it for both the final
        # ranking and the history below.
        pr_by_date = st.pagerank()
        pr = pr_by_date[end_date]
        graph = st.graphs[end_date]
        vertex_id = graph.vertex_properties['id']

        # Map user id -> vertex index, and user id (as int) -> final PageRank.
        id_invert = {}
        pr_dict = {}
        for v in graph.vertices():
            id_invert[vertex_id[v]] = int(v)
            pr_dict[int(vertex_id[v])] = pr[v]
        print(id_invert)

        sorted_pr = sorted(pr_dict.values())
        sorted_id = sorted(pr_dict, key=pr_dict.get)

        samples_pr = sorted_pr[20:30]
        samples_id = sorted_id[20:30]
        samples_join = [x[str(idx)] for idx in samples_id]

        print(samples_pr)
        print(samples_join)
        print(samples_id)

        step = timedelta(14)
        date = start_date
        date_idx = []
        pr_ev = [[] for _ in samples_id]
        while date < end_date:
            p = pr_by_date[date]
            # Record the day offset of the snapshot being read *before*
            # advancing, so x-values line up with the samples they belong to.
            date_idx.append((date - start_date).days)
            for series, uid in zip(pr_ev, samples_id):
                try:
                    # NOTE(review): the key is the stringified vertex index
                    # taken from the end-date graph; confirm that the per-date
                    # PageRank container is keyed that way, otherwise every
                    # sample silently falls back to 0.
                    series.append(p[str(id_invert[str(uid)])])
                except Exception:
                    series.append(0)
            date += step

        for series in pr_ev:
            plt.plot(date_idx, series)
        plt.show()
Beispiel #2
0
    def join_date_final_pagerank(self):
        """Relate each user's join day to their PageRank on the final date.

        Only vertices with an out-degree greater than 1 in the graph of
        ``self.log.end_date`` are considered.  The method

        * draws one box plot per distinct join day (ascending) showing the
          final PageRank values of the users who joined on that day, and
        * prints a Welch t-test on the join days of the users above versus
          not above a PageRank threshold, followed by the Pearson and
          Spearman correlations between join day and final PageRank.
        """
        x = self.log.join_date_since_inception()
        st = Stat()
        pr = st.pagerank()[self.log.end_date]
        graph = st.graphs[self.log.end_date]
        vertex_id = graph.vertex_properties['id']

        join_since = []
        ult_pgr = []
        for v in graph.vertices():
            if v.out_degree() > 1:
                ult_pgr.append(pr[v])
                join_since.append(x[vertex_id[v]])

        # Group the PageRank values by join day in a single pass.  Each value
        # is paired with its own join day (the earlier code indexed the
        # PageRank list with the join-day value instead of the user index).
        by_day = {}
        for day, value in zip(join_since, ult_pgr):
            by_day.setdefault(day, []).append(value)
        data = [by_day[day] for day in sorted(by_day)]
        plt.boxplot(data)

        plt.xlabel('Day of joining ')
        plt.ylabel('Value of page rank')
        plt.show()

        # stat tests demo
        # NOTE(review): the threshold is half of the PageRank range, not the
        # midpoint between min and max - confirm this is what is wanted.
        thr = max(ult_pgr) - min(ult_pgr)
        thr /= 2
        first = []
        second = []
        for day, value in zip(join_since, ult_pgr):
            if value > thr:
                first.append(day)
            else:
                second.append(day)

        print(stats.ttest_ind(first, second, equal_var=False))
        print(stats.pearsonr(join_since, ult_pgr))
        print(stats.spearmanr(join_since, ult_pgr))
Beispiel #3
0
    def pr_distr(self):
        """Plot the distribution of PageRank values on the final date.

        PageRank values of all vertices in the graph of ``self.log.end_date``
        are rounded to five decimals and counted.  Five figures are shown in
        sequence: the histogram (linear and log-log), the cumulative counts
        (linear and log-log) and a final ``1 - log(cumulative count)`` plot
        over ``log(value)``.
        """
        st = Stat()
        pr = st.pagerank()[self.log.end_date]
        graph = st.graphs[self.log.end_date]

        # Round to five decimals so that near-identical scores share a bin.
        pr_list = [float("{0:.5f}".format(pr[v])) for v in graph.vertices()]
        count = Counter(pr_list)

        # Iterating the Counter yields its keys on both Python 2 and 3
        # (``iterkeys`` only exists on Python 2).
        x = sorted(count)
        y = [count[key] for key in x]

        plt.plot(x, y, 'r.')
        plt.ylim([0, 200])
        plt.show()

        plt.plot(np.log(x), np.log(y), 'r.')
        plt.ylim([0, np.log(200)])
        plt.show()

        # Running total of the counts (un-normalised cumulative distribution).
        cdf = []
        running = 0
        for c in y:
            running += c
            cdf.append(running)

        plt.plot(x, cdf, 'r.')
        plt.show()

        plt.plot(np.log(x), np.log(cdf), 'r.')
        plt.show()

        # NOTE(review): this plots 1 - log(count-based CDF); if a
        # complementary CDF was intended the counts would need normalising
        # first - confirm.
        plt.plot(np.log(x), 1 - np.log(np.array(cdf)), 'r.')
        plt.show()
Beispiel #4
0
    def join_final_date_avg_pagerank(self, num_days=217, window=7):
        """Correlate join day with the average final PageRank of its joiners.

        Only vertices with an out-degree greater than 0 in the graph of
        ``self.log.end_date`` are considered.  Two views are produced:

        * per day: for every day ``1 .. num_days - 1`` the mean final PageRank
          of the users who joined on that day; Pearson and Spearman
          correlations are printed, then the means are plotted;
        * per window: the same mean over consecutive windows of ``window``
          days starting at day 0; the means are plotted, then the
          correlations are printed.

        Days or windows in which nobody joined are left out, because the mean
        of an empty group is NaN and would turn both correlations into NaN.

        Args:
            num_days: Exclusive upper bound of the join days examined.
            window: Width, in days, of the bins used for the second view.
        """
        x = self.log.join_date_since_inception()
        st = Stat()
        pr = st.pagerank()[self.log.end_date]
        graph = st.graphs[self.log.end_date]
        vertex_id = graph.vertex_properties['id']

        join_since = []
        ult_pgr = []
        for v in graph.vertices():
            if v.out_degree() > 0:
                ult_pgr.append(pr[v])
                join_since.append(x[vertex_id[v]])

        # --- daily averages -------------------------------------------------
        days = []
        avg_pr = []
        for day in range(1, num_days):
            # Select by the user's join day (the earlier code compared the
            # day with the list index, so it never grouped by join day).
            values = [p for d, p in zip(join_since, ult_pgr) if d == day]
            if values:
                days.append(day)
                avg_pr.append(np.average(values))

        print(stats.pearsonr(days, avg_pr))
        print(stats.spearmanr(days, avg_pr))

        plt.plot(days, avg_pr, 'r.')
        plt.show()

        # --- windowed averages ----------------------------------------------
        window_idx = []
        avg_pr = []
        for k, lo in enumerate(range(0, num_days, window)):
            hi = lo + window
            values = [p for d, p in zip(join_since, ult_pgr) if lo <= d < hi]
            if values:
                window_idx.append(k)
                avg_pr.append(np.average(values))

        plt.plot(window_idx, avg_pr, 'r.')
        plt.show()

        print(stats.pearsonr(window_idx, avg_pr))
        print(stats.spearmanr(window_idx, avg_pr))
Beispiel #5
0
    def deck_pagerank(self, deck_size=20, min_pr=0.0001, max_retries=20):
        """Sample one user per PageRank "deck" and plot when they joined.

        Users are ordered by their PageRank on ``self.log.end_date``
        (ascending) and cut into consecutive decks of ``deck_size`` users.
        From every deck one user is drawn uniformly at random; while the drawn
        user's PageRank is below ``min_pr`` the draw is repeated, at most
        ``max_retries`` times.  The join days, PageRank values and ranking
        positions of the drawn users are printed, and the join days are
        plotted in deck order.

        Args:
            deck_size: Number of consecutive users per deck.
            min_pr: PageRank below which a draw is retried.
            max_retries: Maximum number of re-draws per deck.
        """
        x = self.log.join_date_since_inception()
        st = Stat()
        pr = st.pagerank()[self.log.end_date]
        graph = st.graphs[self.log.end_date]
        vertex_id = graph.vertex_properties['id']

        pr_dict = {}
        for v in graph.vertices():
            pr_dict[int(vertex_id[v])] = pr[v]

        sorted_pr = sorted(pr_dict.values())
        sorted_id = sorted(pr_dict, key=pr_dict.get)
        total = len(sorted_id)

        samples = []
        sample_day_joined = []
        samples_ids = []

        for start in range(0, total, deck_size):
            # The last deck may hold fewer than ``deck_size`` users; restrict
            # the draw to the users that exist so the index stays in range.
            span = min(deck_size, total - start)
            pos = start + int(np.random.uniform() * span)
            tries = 0
            while sorted_pr[pos] < min_pr and tries < max_retries:
                pos = start + int(np.random.uniform() * span)
                tries += 1
            samples.append(sorted_pr[pos])
            sample_day_joined.append(x[str(sorted_id[pos])])
            # Position in the sorted ranking, not the user id.
            samples_ids.append(pos)

        print(sample_day_joined)
        print(samples)
        print(samples_ids)

        plt.plot(list(range(len(samples))), sample_day_joined, 'r.')
        plt.show()
Beispiel #6
0
    def top_ten_change_in_pr(self):
        """Plot weekly rank changes of all users and relate them to activity.

        For every user in the graph of ``self.log.end_date`` and every week
        starting at ``self.log.start_date``, the change of the user's position
        in ``st.get_users_rank()`` between the start of the week and seven
        days later is computed (new position minus old position).  When a
        position cannot be determined the change counts as 0.

        Output:

        * one line per user showing the weekly change;
        * Spearman and Pearson correlations between the weekly number of
          events and the mean change, the mean positive change and the mean
          negative change across users;
        * plots of the mean change, and of the mean positive / absolute mean
          negative change divided by the weekly message frequency.

        The vertex-index map, the sampled PageRank values, join days and user
        ids are printed first.
        """
        x = self.log.join_date_since_inception()
        st = Stat()
        pr = st.pagerank()[self.log.end_date]
        graph = st.graphs[self.log.end_date]
        vertex_id = graph.vertex_properties['id']
        id_invert = {}
        for v in graph.vertices():
            id_invert[vertex_id[v]] = int(v)
        print(id_invert)

        pr_dict = {}
        for v in graph.vertices():
            pr_dict[int(vertex_id[v])] = pr[v]

        sorted_pr = sorted(pr_dict.values())
        sorted_id = sorted(pr_dict, key=pr_dict.get)

        # Despite the method name, every user is sampled.
        samples_pr = sorted_pr[:]
        samples_id = sorted_id[:]
        K = len(samples_id)

        samples_join = [x[str(idx)] for idx in samples_id]

        print(samples_pr)
        print(samples_join)
        print(samples_id)

        rank = st.get_users_rank()
        week = timedelta(7)
        start_date = self.log.start_date
        last_start = self.log.end_date - week

        date_idx = []
        pr_ev = [[] for _ in range(K)]
        pr_inc = [[] for _ in range(K)]
        pr_dec = [[] for _ in range(K)]

        date = start_date
        while date <= last_start:
            date_idx.append((date - start_date).days)
            for i in range(K):
                uid = str(samples_id[i])
                try:
                    change = (rank[date + week].index(uid)
                              - rank[date].index(uid))
                except Exception:
                    # User not ranked on one of the two dates (or the date is
                    # missing): treat as "no change".  ``Exception`` rather
                    # than a bare ``except`` so Ctrl-C still interrupts.
                    change = 0
                pr_ev[i].append(change)
                pr_inc[i].append(max(change, 0))
                pr_dec[i].append(min(change, 0))
            date += week

        for series in pr_ev:
            # Every series has exactly one entry per recorded week.
            plt.plot(date_idx, series)
        plt.show()

        event_snapshot = self.log.get_event_snapshot('week', 'event')
        data = []
        date = start_date
        while date <= last_start:
            data.append(len(event_snapshot[date]))
            date += week

        freq = self.log.event_frequency('msg', 'week')

        prch = np.average(pr_ev, axis=0)
        prchp = np.average(pr_inc, axis=0)
        prchn = np.average(pr_dec, axis=0)
        print(stats.spearmanr(prch, data))
        print(stats.pearsonr(prch, data))
        print(stats.spearmanr(prchp, data))
        print(stats.pearsonr(prchp, data))
        print(stats.spearmanr(prchn, data))
        print(stats.pearsonr(prchn, data))
        # NOTE(review): assumes ``freq[1:]`` has one non-zero entry per
        # plotted week - confirm against ``event_frequency``.
        prchp = prchp / np.array(freq[1:])
        prchn = prchn / np.array(freq[1:])
        plt.plot(list(range(len(prch))), prch)
        plt.show()
        plt.plot(list(range(len(prchp))), prchp)
        plt.show()
        plt.plot(list(range(len(prchp))), np.abs(prchn))
        plt.show()
Beispiel #7
0
    def ten_top_rank(self):
        """Plot the daily rank of twenty users sampled from the final ranking.

        Users are ordered by their PageRank on ``self.log.end_date``
        (ascending) and the 20 users at positions 980-999 of that ordering are
        selected.  For every day from ``self.log.start_date`` up to (but
        excluding) ``self.log.end_date`` the 1-based position of each selected
        user in ``st.get_users_rank()`` is looked up; days on which the
        position cannot be determined are omitted from that user's line.

        The vertex-index map, the sampled PageRank values, join days and user
        ids are printed first.
        """
        x = self.log.join_date_since_inception()
        st = Stat()
        pr = st.pagerank()[self.log.end_date]
        graph = st.graphs[self.log.end_date]
        vertex_id = graph.vertex_properties['id']
        id_invert = {}
        for v in graph.vertices():
            id_invert[vertex_id[v]] = int(v)
        print(id_invert)

        pr_dict = {}
        for v in graph.vertices():
            pr_dict[int(vertex_id[v])] = pr[v]

        sorted_pr = sorted(pr_dict.values())
        sorted_id = sorted(pr_dict, key=pr_dict.get)

        K = 20
        samples_pr = sorted_pr[1000 - K:1000]
        samples_id = sorted_id[1000 - K:1000]

        # The slice is shorter than requested when fewer than 1000 users exist.
        K = len(samples_id)

        samples_join = [x[str(idx)] for idx in samples_id]

        print(samples_pr)
        print(samples_join)
        print(samples_id)

        rank = st.get_users_rank()
        start_date = self.log.start_date
        end_date = self.log.end_date
        one_day = timedelta(1)

        date_idx = []
        pr_ev = [[] for _ in range(K)]
        date = start_date
        while date < end_date:
            date_idx.append((date - start_date).days)
            for i in range(K):
                try:
                    pr_ev[i].append(rank[date].index(str(samples_id[i])) + 1)
                except Exception:
                    # 0 marks "not ranked on this day"; real ranks start at 1.
                    # ``Exception`` rather than a bare ``except`` so Ctrl-C
                    # still interrupts.
                    pr_ev[i].append(0)
            date += one_day

        for series in pr_ev:
            # Drop the "not ranked" markers so they do not distort the line.
            days = [d for d, r in zip(date_idx, series) if r != 0]
            ranks = [r for r in series if r != 0]
            plt.plot(days, ranks)
        plt.show()