def stat_pid_rid_time():
    """
    Statistics for pid_rid_time structure
    :rtype : None
    :return: None
    """
    rid_pid_set = load_rid_pid_set()
    rid_label = load_rid_label()
    pid_rid_time_list = load_pid_rid_time_list()

    spammed_pid = set()
    rids = set()
    for pid, rid_time_list in pid_rid_time_list.items():
        for rid, ts in rid_time_list:
            rids.add(rid)
            if rid_label.get(rid) == 1:
                spammed_pid.add(pid)
    rid_num_review_dist = [len(rid_pid_set[rid]) for rid in list(rids)]
    singletons = [num for num in rid_num_review_dist if num == 1]
    print 'The total number of products:', len(pid_rid_time_list.keys())
    print 'The number of spammed products:', len(spammed_pid)
    print 'The total number of users:', len(rids)
    print 'The number of singleton:', len(singletons)
    plt.hist(rid_num_review_dist, bins=100, range=(0, 50), normed=True, cumulative=True, histtype='step')
    plt.show()
def early_movement():
    """
    Q1: how early a spammer would begin to act

    For every reviewer labelled 1 (pos) or 0 (neg), collect the 0-based
    position of each of their reviews inside the per-product review list
    (presumably ordered by time -- confirm against the loader), average the
    positions per reviewer, and plot the two cumulative histograms.

    :return: None
    """
    pid_rid_time_list = load_pid_rid_time_list()
    rid_label = load_rid_label()
    rid_orders_pos = {}
    rid_orders_neg = {}
    # Iterate the lists directly: indexing ``keys()[i]`` rebuilt the whole key
    # list on every iteration (quadratic under Python 2).
    for rid_time_list in pid_rid_time_list.values():
        for order, entry in enumerate(rid_time_list):
            rid = entry[0]
            label = rid_label.get(rid)  # looked up once per review
            if label == 1:
                rid_orders_pos.setdefault(rid, []).append(order)
            elif label == 0:
                rid_orders_neg.setdefault(rid, []).append(order)

    # Mean review position per reviewer, one value per reviewer.
    ave_order_pos = [np.mean(orders) for orders in rid_orders_pos.values()]
    ave_order_neg = [np.mean(orders) for orders in rid_orders_neg.values()]

    plt.hist([ave_order_pos, ave_order_neg],
             bins=100, normed=True, cumulative=True,
             histtype='step', label=['pos', 'neg'], color=['red', 'blue'])
    plt.legend()
    plt.show()
def pid_time_series_day(pid, pid_review_list):
    """
    Create time series for a specified product, period: day

    Draws five stacked plots: cumulative average rating ("reputation"), its
    day-to-day change, reviews per day, reviews per day written by reviewers
    labelled 1, and the per-day count of each rating value 1..5.

    :param pid: key into ``pid_review_list``; also used as the figure title
    :param pid_review_list: mapping from pid to a sequence of
        (rid, rating, timestamp) triples, where timestamp is a
        '%Y-%m-%d' string
    :return: None (the figure is displayed with ``plt.show()``)
    """
    rid_label = load_rid_label()
    reviews = pid_review_list[pid]
    rids, ratings, timestamps = zip(*reviews)
    # Anything not labelled exactly 1 (including unlabelled reviewers) counts as 0.
    labels = [1 if rid_label.get(rid) == 1 else 0 for rid in rids]
    df = pd.DataFrame({'rid': rids, 'rating': ratings, 'label': labels},
                      index=[dt.datetime.strptime(ts, '%Y-%m-%d') for ts in timestamps])
    df = df.sort_index(ascending=True)  # sort by timestamp
    ts_rating = df['rating']  # get 'rating' time series
    ts_label = df['label']  # get 'label' time series
    ts_rating_day_sum = ts_rating.groupby(level=0).sum()  # sum of rating each day
    ts_rating_day_count = ts_rating.groupby(level=0).count()  # number of reviews each day
    # where() blanks out non-matching ratings, so count() yields the number of
    # reviews with rating i on each day.
    df_rating_dist = pd.concat([ts_rating.where(ts_rating == i).groupby(level=0).count() for i in range(1, 6)],
                               keys=['1', '2', '3', '4', '5'], axis=1)  # distribution of ratings each day
    ts_spammer_day = ts_label.groupby(level=0).sum()  # number of spammers each day
    ts_repu = ts_rating_day_sum.cumsum() / ts_rating_day_count.cumsum()  # reputation each day
    # NOTE: pd.datetools and the fill_method argument are legacy pandas API;
    # they are kept as-is to match the pandas version this file targets.
    ts_repu = ts_repu.resample(pd.datetools.Day(), fill_method='pad')
    ts_repu_change_rate = ts_repu - ts_repu.shift(1)
    ts_spammer_day = ts_spammer_day.resample(pd.datetools.Day())
    ts_rating_day_count = ts_rating_day_count.resample(pd.datetools.Day())

    # draw
    fig, axes = plt.subplots(nrows=5, ncols=1)
    # Use a figure-level title: plt.title() would set the title of the
    # current axes only, which the title= of a later plot call overwrites.
    fig.suptitle(pid)
    ts_repu.plot(ax=axes[0], title='Reputation')
    ts_repu_change_rate.plot(ax=axes[1], title='Reputation Change Rate')
    ts_rating_day_count.plot(ax=axes[2], kind='bar', title='Review Count', xticks=[])
    ts_spammer_day.plot(ax=axes[3], kind='bar', title='Spam Review Count', xticks=[])
    df_rating_dist.plot(ax=axes[4], kind='area', title='Rating Count Distribution', xticks=[])
    plt.show()
def print_rid_pids(rid, rid_pids, pid_rids):
    rid_label = load_rid_label()
    if rid_pids.get(rid) != None:
        pids = rid_pids[rid]
        for pid in pids:
            if pid_rids.get(pid) != None:
                num_spammer = len([rid for rid in pid_rids[pid] if rid_label.get(rid) == 1])
                if num_spammer > 0:
                    print pid
def print_pid_rid_time(pid_rid_time, min_rid_c, max_rid_c, min_spammer_c, max_spammer_c):
    pid_brand = load_pid_brand()
    rid_label = load_rid_label()
    for pid, rid_time in pid_rid_time.items():
        if max_rid_c >= len(rid_time) >= min_rid_c:
            n_spammer = 0
            for rid, time in rid_time:
                if rid_label.get(rid) == 1:
                    n_spammer += 1
            if max_spammer_c >= n_spammer >= min_spammer_c:
                print pid, pid_brand.get(pid), len(rid_time), n_spammer
def analyze_rid_date_spamicity():
    rid_label = load_rid_label()
    pos = [item[0] for item in rid_label.items() if item[1] == 1]
    neg = [item[0] for item in rid_label.items() if item[1] == 0]

    rid_date_spamicity = cPickle.load(open(RID_SPAMICITY_AMAZON_CN, 'rb'))
    rid_history = cPickle.load(open(RID_HISTORY_AMAZON_CN, 'rb'))
    rid_date_rank = cPickle.load(open(RID_RANK_AMAZON_CN, 'rb'))

    pos_spamicity = set()
    neg_spamicity = set()
    for rid, date_spamicity in rid_date_spamicity.items():
        if rid_label.get(rid) == 1:     # spammers
            pos_spamicity.add(rid)
        elif rid_label.get(rid) == 0:   # non-spammers
            neg_spamicity.add(rid)
    print 'pos in date spamicity:', len(pos_spamicity)
    print 'neg in date spamicity:', len(neg_spamicity)

    pos_history = set()
    neg_history = set()
    for rid, history in rid_history.items():
        if rid_label.get(rid) == 1:     # spammers
            pos_history.add(rid)
        elif rid_label.get(rid) == 0:   # non-spammers
            neg_history.add(rid)
    print 'pos in date history:', len(pos_history)
    print 'neg in date history:', len(neg_history)

    print 'pos in data:', len(pos)
    print 'neg in data:', len(neg)

    for thres in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]:
        tp = set()
        fp = set()
        for rid, date_rank in rid_date_rank.items():
            if rid_label.get(rid) == 1:     # spammers
                for date, rank in date_rank:
                    if rank <= thres:
                        tp.add(rid)
            elif rid_label.get(rid) == 0:   # non-spammers
                for date, rank in date_rank:
                    if rank <= thres:
                        fp.add(rid)

        print thres, len(tp), len(fp), 1. * len(tp) / (len(tp) + len(fp)), 1. * len(tp) / len(pos_spamicity)
    def __init__(self, k):
        """
        Load the reviewer labels and the simulated review stream, and start
        with empty per-reviewer / per-product state.

        :param k:
            the k latest record to be updated, or
            the k most similar record to be updated
        :return: None
        """
        # External inputs.
        self.rid_label = load_rid_label()
        self.review_stream = simulate_review_stream()

        # Scalar settings (sigma and alpha are fixed constants here).
        self.k, self.sigma, self.alpha = k, 1e-3, 0.85

        # Per-reviewer state, initially empty.
        self.rid_spamicity = {}
        self.rid_history = {}
        self.rid_most_sim = {}
        self.rid_inherent = {}

        # Per-product state, initially empty.
        self.pid_spamicity = {}
        self.pid_history = {}
def select_spammed_product_for_early_detection():
    """
    Select suitable products for early detection experiments
    Condition: the product has been reviewed by at least 3 spammers (1515)
    :return:
    """
    rid_label = load_rid_label()
    pid_rid_time_list = load_pid_rid_time_list()

    selected_product = set()

    for pid, rid_time_list in pid_rid_time_list.items():
        num_spammer = 0
        for rid, ts in rid_time_list:
            if rid_label.get(rid) == 1:
                num_spammer += 1
        if num_spammer >= 3:
            selected_product.add(pid)
            print pid + '\t' + str(len(rid_time_list)) + '\t' + str(num_spammer)
    print len(selected_product)