コード例 #1
0
def early_movement():
    """
    Q1: how early a spammer would begin to act.

    For every reviewer, collect the index at which each of their entries
    appears in a product's review list (presumably chronological -- confirm
    against load_pid_rid_time_list), average those indices per reviewer, and
    plot a cumulative, normalised step histogram of the averages: reviewers
    labelled 1 are drawn as 'pos' (red), reviewers labelled 0 as 'neg' (blue).
    Reviewers with any other label, or without a label, are ignored.

    :return: None
    """
    pid_rid_time_list = load_pid_rid_time_list()
    rid_label = load_rid_label()
    rid_orders_pos = {}
    rid_orders_neg = {}
    # Walk the per-product lists directly: indexing ``keys()`` inside the loop
    # rebuilt the key list on every iteration and is not possible at all on
    # dict views.
    for rid_time_list in pid_rid_time_list.values():
        for order, entry in enumerate(rid_time_list):
            rid = entry[0]
            label = rid_label.get(rid)
            if label == 1:
                rid_orders_pos.setdefault(rid, []).append(order)
            elif label == 0:
                rid_orders_neg.setdefault(rid, []).append(order)
    # One average position per reviewer; every stored list has at least one
    # element, so the mean is always defined.
    ave_order_pos = [np.mean(orders) for orders in rid_orders_pos.values()]
    ave_order_neg = [np.mean(orders) for orders in rid_orders_neg.values()]
    # NOTE(review): ``normed`` is the older spelling of matplotlib's
    # ``density`` option; kept as written -- confirm it is still accepted by
    # the installed matplotlib before upgrading.
    plt.hist([ave_order_pos, ave_order_neg],
             bins=100, normed=True, cumulative=True,
             histtype='step', label=['pos', 'neg'], color=['red', 'blue'])
    plt.legend()
    plt.show()
コード例 #2
0
def stat_pid_rid_time():
    """
    Statistics for pid_rid_time structure.

    Prints four counts: the number of products, the number of products that
    have at least one entry from a reviewer labelled 1, the number of distinct
    reviewers seen, and the number of those reviewers whose rid_pid_set entry
    has exactly one element. Then shows a cumulative, normalised step
    histogram of the per-reviewer rid_pid_set sizes.

    :rtype : None
    :return: None
    """
    rid_pid_set = load_rid_pid_set()
    rid_label = load_rid_label()
    pid_rid_time_list = load_pid_rid_time_list()

    spammed_pid = set()
    rids = set()
    for pid, rid_time_list in pid_rid_time_list.items():
        for rid, _ts in rid_time_list:
            rids.add(rid)
            if rid_label.get(rid) == 1:
                spammed_pid.add(pid)
    # Size of each reviewer's rid_pid_set entry; a reviewer missing from
    # rid_pid_set raises KeyError here.
    rid_num_review_dist = [len(rid_pid_set[rid]) for rid in rids]
    num_singletons = sum(1 for num in rid_num_review_dist if num == 1)
    # Each line is formatted into a single string so that the output is the
    # same whether ``print`` is a statement or a function.
    print('The total number of products: %d' % len(pid_rid_time_list))
    print('The number of spammed products: %d' % len(spammed_pid))
    print('The total number of users: %d' % len(rids))
    print('The number of singleton: %d' % num_singletons)
    # NOTE(review): ``normed`` is the older spelling of matplotlib's
    # ``density`` option; kept as written -- confirm it is still accepted by
    # the installed matplotlib before upgrading.
    plt.hist(rid_num_review_dist, bins=100, range=(0, 50), normed=True, cumulative=True, histtype='step')
    plt.show()
コード例 #3
0
def select_spammed_product_for_early_detection(min_spammers=3):
    """
    Select suitable products for early detection experiments.

    Condition: the product has been reviewed by at least 3 spammers (1515)
    (the 1515 is presumably the number of matching products on the original
    dataset -- confirm).

    For every selected product one tab-separated line is printed: product id,
    length of its review list, number of spammer entries. The number of
    selected products is printed last.

    Note that the count is per entry in the product's review list, not per
    distinct reviewer: a reviewer labelled 1 who appears twice in the list is
    counted twice.

    :param min_spammers: minimum number of entries by reviewers labelled 1
        that a product needs in order to be selected (default 3)
    :return: None
    """
    rid_label = load_rid_label()
    pid_rid_time_list = load_pid_rid_time_list()

    selected_product = set()

    for pid, rid_time_list in pid_rid_time_list.items():
        num_spammer = sum(1 for rid, _ts in rid_time_list
                          if rid_label.get(rid) == 1)
        if num_spammer >= min_spammers:
            selected_product.add(pid)
            # Single-argument print: same output as a statement or a function.
            print(pid + '\t' + str(len(rid_time_list)) + '\t' + str(num_spammer))
    print(len(selected_product))
コード例 #4
0
__author__ = 'xuch0007'

from dataset.amazon_cn.utility import load_pid_rid_time_list

####################################################################################
# This module evaluates the utility of a feature in terms of timeliness
####################################################################################




def product_timeline(pid, pid_rid_time_list):
    """
    For a product, inspect how early can we decide whether it has been involved in opinion spam.

    Currently this only prints the first ten entries of the product's review
    list.

    Example: B004VDBR04, B0046HAHBU, B0046HAHA6, B003R50Q9K, B0046HAH9W, B001MYL1GK, B001MYL1GA, B001MYL1FQ
    :param pid: product id
    :param pid_rid_time_list: mapping from product id to that product's review
        list; looking up an unknown pid raises the mapping's lookup error
        (KeyError for a dict)
    :return: None
    """
    rid_time_list = pid_rid_time_list[pid]
    # Single-argument print: same output as a statement or a function.
    print(rid_time_list[:10])


if __name__ == '__main__':
    # Script entry point: load the product-id -> review-list structure and
    # inspect one example product.
    product_timeline('B004VDBR04', load_pid_rid_time_list())