def show_distribution():
    """Print summary statistics for every table returned by ``load_data``.

    ``load_data()`` is unpacked into exactly four objects, and the result of
    calling ``describe()`` on each of them is printed in the order in which
    they were returned.
    """
    first, second, third, fourth = load_data()
    for table in (first, second, third, fourth):
        print(table.describe())
def preprocessing_24_6():
    """Build and save the "24_6" feature matrix and label vector.

    Rows with ``day < 25`` (``register_day < 25`` for the registration
    table) form the observation window. For every user registered in that
    window, ``vec`` is called with the user's registration row and the
    user's rows from the three windowed logs to produce one feature vector.
    The label is 1 when the user's id appears in the launch log with
    ``day >= 25`` and 0 otherwise.

    Side effects:
        Saves the features to ``../original_data/x_24_6`` and the labels to
        ``../original_data/y_24_6`` with ``np.save`` and prints both shapes.

    Returns:
        None.
    """
    # NOTE(review): the original comments described the observation window as
    # "days 1 to 23" and the label window as "the last 7 days"; the filters
    # below are what the code actually applies (day < 25 / day >= 25).
    cutoff_day = 25

    r, l, c, a = load_data()

    # Observation window: everything strictly before the cutoff day.
    x_l = l[l.day < cutoff_day]
    x_a = a[a.day < cutoff_day]
    x_r = r[r.register_day < cutoff_day]
    x_c = c[c.day < cutoff_day]

    # Users that launched the app on or after the cutoff day. A set is built
    # once so each membership test in the loop is O(1) instead of a linear
    # scan over an array.
    active_users = set(l.loc[l.day >= cutoff_day, 'user_id'].tolist())

    # `.values` is used instead of `get_values()`, which was deprecated in
    # pandas 0.25 and removed in pandas 1.0.
    author_id = list(a['author_id'].values)

    x, y = [], []
    for index in tqdm(x_r.index):
        row = x_r.loc[index]
        user_id = row['user_id']
        v = vec(row.values,
                x_l.loc[x_l.user_id == user_id].values,
                x_c.loc[x_c.user_id == user_id].values,
                x_a.loc[x_a.user_id == user_id].values,
                author_id,
                25)  # passed through unchanged; presumably the same cutoff - confirm against vec
        # A user counts as active when they launched the app in the label
        # window. Checking the launch log alone is sufficient; the other two
        # logs do not need to be consulted.
        is_active = 1 if user_id in active_users else 0
        x.append(v)
        y.append(is_active)

    x = np.array(x)
    y = np.array(y)
    np.save('../original_data/x_24_6', x)
    np.save('../original_data/y_24_6', y)
    print('x.shape:', x.shape)
    print('y.shape:', y.shape)
:return: 最大连续数 """ # 注意,默认输入的l是已经经过从小到大排序的 # 列表为空时,返回0 if len(l) == 0: return 0 cons = np.ones(len(l), dtype=np.int64) for i in range(len(l) - 1): if l[i + 1] == (l[i] + 1): cons[i + 1] = cons[i] + 1 return cons.max() if __name__ == '__main__': test_user_index = [1, 2, 3, 4, 5, 6, 7, 8] r, l, c, a = load_data() result = [] author_id = list(a['author_id'].get_values()) for index in test_user_index: r_temp = r.loc[index].get_values() l_temp = l[l.user_id == r.loc[index]['user_id']].get_values() c_temp = c[c.user_id == r.loc[index]['user_id']].get_values() a_temp = a[a.user_id == r.loc[index]['user_id']].get_values() result.append(vec(r_temp, l_temp, c_temp, a_temp, author_id, 24)) print('result.shape:', result[0].shape)