def stat_pid_rid_time(): """ Statistics for pid_rid_time structure :rtype : None :return: None """ rid_pid_set = load_rid_pid_set() rid_label = load_rid_label() pid_rid_time_list = load_pid_rid_time_list() spammed_pid = set() rids = set() for pid, rid_time_list in pid_rid_time_list.items(): for rid, ts in rid_time_list: rids.add(rid) if rid_label.get(rid) == 1: spammed_pid.add(pid) rid_num_review_dist = [len(rid_pid_set[rid]) for rid in list(rids)] singletons = [num for num in rid_num_review_dist if num == 1] print 'The total number of products:', len(pid_rid_time_list.keys()) print 'The number of spammed products:', len(spammed_pid) print 'The total number of users:', len(rids) print 'The number of singleton:', len(singletons) plt.hist(rid_num_review_dist, bins=100, range=(0, 50), normed=True, cumulative=True, histtype='step') plt.show()
def early_movement():
    """
    Q1: how early a spammer would begin to act

    For every product, the 0-based position of each review inside that
    product's review list is recorded under the reviewer who wrote it. The
    positions are then averaged per reviewer, and the cumulative
    distributions of those averages are plotted for reviewers labelled 1
    ('pos') and reviewers labelled 0 ('neg'). Reviewers without a 0/1 label
    are ignored.

    NOTE(review): the position is only meaningful as "how early" if each
    product's list is ordered by time -- confirm against the loader.

    :return: None
    """
    pid_rid_time_list = load_pid_rid_time_list()
    rid_label = load_rid_label()

    rid_orders_pos = {}
    rid_orders_neg = {}
    # Iterate the values directly: the product id itself is never needed, and
    # indexing a fresh ``keys()`` list on every step was quadratic.
    for rid_time_list in pid_rid_time_list.values():
        for order, entry in enumerate(rid_time_list):
            rid = entry[0]
            label = rid_label.get(rid)
            if label == 1:
                rid_orders_pos.setdefault(rid, []).append(order)
            elif label == 0:
                rid_orders_neg.setdefault(rid, []).append(order)

    ave_order_pos = [np.mean(orders) for orders in rid_orders_pos.values()]
    ave_order_neg = [np.mean(orders) for orders in rid_orders_neg.values()]

    plt.hist([ave_order_pos, ave_order_neg], bins=100, normed=True,
             cumulative=True, histtype='step', label=['pos', 'neg'],
             color=['red', 'blue'])
    plt.legend()
    plt.show()
def pid_time_series_day(pid, pid_review_list):
    """
    Create time series for a specified product, period: day

    Builds per-day series from the product's reviews and shows them in a
    five-row figure: reputation, reputation change rate, review count,
    spam review count and the distribution of ratings.

    :param pid: key of the product inside ``pid_review_list``; also used as
        the figure title
    :param pid_review_list: mapping pid -> sequence of
        ``(rid, rating, timestamp)`` triples, where ``timestamp`` is a
        ``'%Y-%m-%d'`` string
    :return: None
    :raises KeyError: if ``pid`` is not in ``pid_review_list``
    :raises ValueError: if the product has no reviews or a timestamp does
        not match ``'%Y-%m-%d'``
    """
    rid_label = load_rid_label()
    reviews = pid_review_list[pid]
    rids, ratings, timestamps = zip(*reviews)
    # Anything that is not explicitly labelled 1 counts as a non-spammer here.
    labels = [1 if rid_label.get(rid) == 1 else 0 for rid in rids]
    df = pd.DataFrame({'rid': rids, 'rating': ratings, 'label': labels},
                      index=[dt.datetime.strptime(ts, '%Y-%m-%d') for ts in timestamps])
    df = df.sort_index(ascending=True)  # sort by timestamp
    ts_rating = df['rating']  # get 'rating' time series
    ts_label = df['label']  # get 'label' time series

    ts_rating_day_sum = ts_rating.groupby(level=0).sum()  # sum of rating each day
    ts_rating_day_count = ts_rating.groupby(level=0).count()  # number of reviews each day
    df_rating_dist = pd.concat(
        [ts_rating.where(ts_rating == i).groupby(level=0).count() for i in range(1, 6)],
        keys=['1', '2', '3', '4', '5'], axis=1)  # distribution of ratings each day
    ts_spammer_day = ts_label.groupby(level=0).sum()  # number of spammers each day

    # Reputation = running mean of all ratings received up to and including each day.
    ts_repu = ts_rating_day_sum.cumsum() / ts_rating_day_count.cumsum()  # reputation each day
    # Days without reviews carry the previous reputation forward.
    ts_repu = ts_repu.resample(pd.datetools.Day(), fill_method='pad')
    ts_repu_change_rate = ts_repu - ts_repu.shift(1)
    ts_spammer_day = ts_spammer_day.resample(pd.datetools.Day())
    ts_rating_day_count = ts_rating_day_count.resample(pd.datetools.Day())

    # draw
    fig, axes = plt.subplots(nrows=5, ncols=1)
    # Title the whole figure: plt.title() would only set the title of the last
    # subplot, which the 'Rating Count Distribution' plot below replaces.
    fig.suptitle(pid)
    ts_repu.plot(ax=axes[0], title='Reputation')
    ts_repu_change_rate.plot(ax=axes[1], title='Reputation Change Rate')
    ts_rating_day_count.plot(ax=axes[2], kind='bar', title='Review Count', xticks=[])
    ts_spammer_day.plot(ax=axes[3], kind='bar', title='Spam Review Count', xticks=[])
    df_rating_dist.plot(ax=axes[4], kind='area', title='Rating Count Distribution', xticks=[])
    plt.show()
def print_rid_pids(rid, rid_pids, pid_rids):
    """
    Print each product of a reviewer that has at least one labelled spammer.

    :param rid: reviewer id to look up in ``rid_pids``
    :param rid_pids: mapping rid -> iterable of pids
    :param pid_rids: mapping pid -> iterable of rids
    :return: None; nothing is printed when ``rid`` has no entry in
        ``rid_pids``, and products missing from ``pid_rids`` are skipped
    """
    rid_label = load_rid_label()

    pids = rid_pids.get(rid)
    if pids is None:
        return

    for pid in pids:
        reviewers = pid_rids.get(pid)
        if reviewers is None:
            continue
        # Only existence matters, so stop at the first reviewer labelled 1.
        # The loop variable is deliberately not called ``rid`` so that it
        # cannot shadow the parameter.
        if any(rid_label.get(reviewer) == 1 for reviewer in reviewers):
            print(pid)
def print_pid_rid_time(pid_rid_time, min_rid_c, max_rid_c, min_spammer_c, max_spammer_c): pid_brand = load_pid_brand() rid_label = load_rid_label() for pid, rid_time in pid_rid_time.items(): if max_rid_c >= len(rid_time) >= min_rid_c: n_spammer = 0 for rid, time in rid_time: if rid_label.get(rid) == 1: n_spammer += 1 if max_spammer_c >= n_spammer >= min_spammer_c: print pid, pid_brand.get(pid), len(rid_time), n_spammer
def analyze_rid_date_spamicity(): rid_label = load_rid_label() pos = [item[0] for item in rid_label.items() if item[1] == 1] neg = [item[0] for item in rid_label.items() if item[1] == 0] rid_date_spamicity = cPickle.load(open(RID_SPAMICITY_AMAZON_CN, 'rb')) rid_history = cPickle.load(open(RID_HISTORY_AMAZON_CN, 'rb')) rid_date_rank = cPickle.load(open(RID_RANK_AMAZON_CN, 'rb')) pos_spamicity = set() neg_spamicity = set() for rid, date_spamicity in rid_date_spamicity.items(): if rid_label.get(rid) == 1: # spammers pos_spamicity.add(rid) elif rid_label.get(rid) == 0: # non-spammers neg_spamicity.add(rid) print 'pos in date spamicity:', len(pos_spamicity) print 'neg in date spamicity:', len(neg_spamicity) pos_history = set() neg_history = set() for rid, history in rid_history.items(): if rid_label.get(rid) == 1: # spammers pos_history.add(rid) elif rid_label.get(rid) == 0: # non-spammers neg_history.add(rid) print 'pos in date history:', len(pos_history) print 'neg in date history:', len(neg_history) print 'pos in data:', len(pos) print 'neg in data:', len(neg) for thres in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]: tp = set() fp = set() for rid, date_rank in rid_date_rank.items(): if rid_label.get(rid) == 1: # spammers for date, rank in date_rank: if rank <= thres: tp.add(rid) elif rid_label.get(rid) == 0: # non-spammers for date, rank in date_rank: if rank <= thres: fp.add(rid) print thres, len(tp), len(fp), 1. * len(tp) / (len(tp) + len(fp)), 1. * len(tp) / len(pos_spamicity)
def __init__(self, k):
    """
    Load the reviewer labels and the simulated review stream, and start
    with empty per-reviewer / per-product bookkeeping.

    :param k: the k latest record to be updated, or the k most similar
        record to be updated
    :return: None
    """
    # Inputs, fetched in the same order as before.
    self.rid_label = load_rid_label()
    self.review_stream = simulate_review_stream()

    # Numeric settings; how sigma and alpha are used is defined by the
    # methods that read them.
    self.k = k
    self.alpha = 0.85
    self.sigma = 1e-3

    # Containers that start empty (presumably keyed by reviewer id).
    self.rid_spamicity = {}
    self.rid_history = {}
    self.rid_most_sim = {}
    self.rid_inherent = {}

    # Containers that start empty (presumably keyed by product id).
    self.pid_spamicity = {}
    self.pid_history = {}
def select_spammed_product_for_early_detection():
    """
    Select suitable products for early detection experiments

    Condition: the product has been reviewed by at least 3 spammers (1515)

    For every selected product a tab-separated line is printed (pid, number
    of reviews, number of spammers), followed by the total number of
    selected products.

    :return: None
    """
    labels = load_rid_label()
    entries_by_pid = load_pid_rid_time_list()

    chosen = set()
    for product, entries in entries_by_pid.items():
        spammer_total = sum(1 for reviewer, _ in entries if labels.get(reviewer) == 1)
        if spammer_total < 3:
            continue
        chosen.add(product)
        print('\t'.join([product, str(len(entries)), str(spammer_total)]))

    print(len(chosen))