def early_movement():
    """
    Q1: how early a spammer would begin to act.

    For every product, each reviewer's 0-based position inside that product's
    (reviewer, time) list is recorded (the list is presumably time-ordered --
    confirm against the loader). Positions are grouped per reviewer and split
    by label: label 1 goes to the 'pos' group, label 0 to the 'neg' group;
    reviewers with any other (or no) label are ignored. The per-reviewer mean
    position of both groups is then drawn as two cumulative step histograms.

    :return: None
    """
    pid_rid_time_list = load_pid_rid_time_list()
    rid_label = load_rid_label()

    rid_orders_pos = {}
    rid_orders_neg = {}
    # Iterate the per-product lists directly. Indexing ``keys()[i]`` rebuilt
    # the key list on every iteration under Python 2 and is a TypeError under
    # Python 3, where dict views cannot be indexed.
    for rid_time_list in pid_rid_time_list.values():
        for order, entry in enumerate(rid_time_list):
            rid = entry[0]
            label = rid_label.get(rid)
            if label == 1:
                rid_orders_pos.setdefault(rid, []).append(order)
            elif label == 0:
                rid_orders_neg.setdefault(rid, []).append(order)

    # One value per reviewer: the mean of all positions that reviewer occupied.
    ave_order_pos = [np.mean(orders) for orders in rid_orders_pos.values()]
    ave_order_neg = [np.mean(orders) for orders in rid_orders_neg.values()]

    # NOTE(review): newer matplotlib releases renamed ``normed`` to
    # ``density``; the original keyword is kept so behaviour is unchanged on
    # the matplotlib version this file targets -- revisit when upgrading.
    plt.hist([ave_order_pos, ave_order_neg], bins=100, normed=True,
             cumulative=True, histtype='step', label=['pos', 'neg'],
             color=['red', 'blue'])
    plt.legend()
    plt.show()
def stat_pid_rid_time():
    """
    Statistics for pid_rid_time structure.

    Prints the number of products, the number of products having at least
    one reviewer labelled 1, the number of distinct reviewers, and how many
    of those reviewers have exactly one entry in ``rid_pid_set``. Finally it
    plots a cumulative step histogram of the per-reviewer entry counts,
    restricted to the range 0..50.

    :rtype : None
    :return: None
    """
    rid_pid_set = load_rid_pid_set()
    rid_label = load_rid_label()
    pid_rid_time_list = load_pid_rid_time_list()

    spammed_pid = set()
    rids = set()
    for pid, rid_time_list in pid_rid_time_list.items():
        for rid, _ in rid_time_list:
            rids.add(rid)
            if rid_label.get(rid) == 1:
                spammed_pid.add(pid)

    # Size of each reviewer's entry in rid_pid_set (presumably the set of
    # products that reviewer has reviewed -- confirm against the loader).
    rid_num_review_dist = [len(rid_pid_set[rid]) for rid in rids]
    num_singletons = sum(1 for num in rid_num_review_dist if num == 1)

    # Single-argument print(...) emits the same text under Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print('The total number of products: %d' % len(pid_rid_time_list))
    print('The number of spammed products: %d' % len(spammed_pid))
    print('The total number of users: %d' % len(rids))
    print('The number of singleton: %d' % num_singletons)

    # NOTE(review): newer matplotlib releases renamed ``normed`` to
    # ``density``; the original keyword is kept so behaviour is unchanged on
    # the matplotlib version this file targets -- revisit when upgrading.
    plt.hist(rid_num_review_dist, bins=100, range=(0, 50), normed=True,
             cumulative=True, histtype='step')
    plt.show()
def select_spammed_product_for_early_detection(min_spammers=3):
    """
    Select suitable products for early detection experiments.

    Condition: the product has been reviewed by at least ``min_spammers``
    reviewers whose label is 1. The original author's note "(1515)" next to
    the default threshold of 3 presumably records how many products
    qualified on their dataset -- unverified.

    For every selected product one tab-separated line is printed:
    product id, total number of entries in its reviewer list, number of
    label-1 reviewers. The count of selected products is printed last.

    :param min_spammers: minimum number of label-1 reviewers a product needs
                         to be selected (default 3, the original constant)
    :return: None
    """
    rid_label = load_rid_label()
    pid_rid_time_list = load_pid_rid_time_list()

    selected_product = set()
    for pid, rid_time_list in pid_rid_time_list.items():
        num_spammer = sum(1 for rid, _ in rid_time_list
                          if rid_label.get(rid) == 1)
        if num_spammer >= min_spammers:
            selected_product.add(pid)
            # Single-argument print(...) emits the same text under Python 2
            # and 3; the former print statement was Python-2-only syntax.
            print(pid + '\t' + str(len(rid_time_list)) + '\t' + str(num_spammer))

    print(len(selected_product))
__author__ = 'xuch0007'

from dataset.amazon_cn.utility import load_pid_rid_time_list

####################################################################################
# This module evaluates the utility of a feature in terms of timeliness.
####################################################################################


def product_timeline(pid, pid_rid_time_list):
    """
    For a product, inspect how early we can decide whether it has been
    involved in opinion spam.

    Currently this only prints the first 10 entries of the product's list.

    Example: B004VDBR04, B0046HAHBU, B0046HAHA6, B003R50Q9K, B0046HAH9W,
             B001MYL1GK, B001MYL1GA, B001MYL1FQ

    :param pid: product id
    :param pid_rid_time_list: mapping indexed by product id; each value is a
                              sliceable sequence (presumably (reviewer id,
                              time) pairs -- confirm against the loader)
    :return: None
    """
    rid_time_list = pid_rid_time_list[pid]
    # Single-argument print(...) emits the same text under Python 2 and 3;
    # the former print statement was Python-2-only syntax.
    print(rid_time_list[:10])


if __name__ == '__main__':
    pid_rid_time_list = load_pid_rid_time_list()
    product_timeline('B004VDBR04', pid_rid_time_list)