Exemple #1
0
def show_distribution():
    """Print ``describe()`` summaries for the four data sets from ``load_data``.

    ``load_data()`` is unpacked into exactly four objects and the result of
    calling ``describe()`` on each one is printed, in the order returned.

    Note: this function previously carried commented-out prints for the
    number of registered users, the number of distinct ``video_id`` and
    ``author_id`` values, and per-value row counts of ``page`` and
    ``action_type`` in the fourth data set; only the ``describe()`` output
    is produced now.
    """
    first, second, third, fourth = load_data()

    for data_set in (first, second, third, fourth):
        print(data_set.describe())
Exemple #2
0
def preprocessing_24_6():
    """Build the "24_6" feature matrix and label vector and save them to disk.

    Rows from before day 25 (``day < 25``, or ``register_day < 25`` for the
    register table) form the feature window. For every user registered in
    that window, ``vec`` builds one feature vector from that user's rows in
    each log. The label is 1 when the user's ``user_id`` appears in the
    launch log on day 25 or later, otherwise 0.

    Side effects:
        Writes ``../original_data/x_24_6`` and ``../original_data/y_24_6``
        with ``np.save`` and prints the shapes of both arrays.
    """
    split_day = 25  # first day of the label window; earlier days are features

    r, l, c, a = load_data()

    # Feature window: everything strictly before ``split_day``.
    # (An older note here said "days 1 to 23", which did not match the filter.)
    x_l = l[l.day < split_day]
    x_a = a[a.day < split_day]
    x_r = r[r.register_day < split_day]
    x_c = c[c.day < split_day]

    # Users seen in the launch log during the label window. Stored as a set so
    # the per-user membership test in the loop is O(1) rather than an array scan.
    last_week_l_u = set(
        l[l.day >= split_day]['user_id'].drop_duplicates().values)

    # ``.values`` replaces ``get_values()``, which was removed in pandas 1.0.
    author_id = list(a['author_id'].values)

    x, y = [], []

    for index in tqdm(x_r.index):
        row = x_r.loc[index]
        user_id = row['user_id']
        # NOTE(review): the trailing 25 is presumably the same day boundary as
        # ``split_day`` -- confirm against ``vec`` before tying them together.
        v = vec(row.values,
                x_l.loc[x_l.user_id == user_id].values,
                x_c.loc[x_c.user_id == user_id].values,
                x_a.loc[x_a.user_id == user_id].values, author_id, 25)

        # A user counts as active if they launched the app during the label
        # window. Checking the launch log alone is enough; the other two logs
        # do not need to be consulted.
        is_active = 1 if user_id in last_week_l_u else 0

        x.append(v)
        y.append(is_active)

    x = np.array(x)
    y = np.array(y)
    np.save('../original_data/x_24_6', x)
    np.save('../original_data/y_24_6', y)
    print('x.shape:', x.shape)
    print('y.shape:', y.shape)
    :return: 最大连续数
    """

    # 注意,默认输入的l是已经经过从小到大排序的

    # 列表为空时,返回0
    if len(l) == 0:
        return 0

    cons = np.ones(len(l), dtype=np.int64)
    for i in range(len(l) - 1):
        if l[i + 1] == (l[i] + 1):
            cons[i + 1] = cons[i] + 1
    return cons.max()


if __name__ == '__main__':
    # Smoke test: build feature vectors for a handful of register-table rows
    # (selected by index label) and print the shape of the first one.
    test_user_index = [1, 2, 3, 4, 5, 6, 7, 8]
    r, l, c, a = load_data()

    # ``.values`` replaces ``get_values()``, which was removed in pandas 1.0.
    author_id = list(a['author_id'].values)

    result = []
    for index in test_user_index:
        # Look the row and its user id up once instead of once per log.
        row = r.loc[index]
        user_id = row['user_id']

        r_temp = row.values
        l_temp = l[l.user_id == user_id].values
        c_temp = c[c.user_id == user_id].values
        a_temp = a[a.user_id == user_id].values

        result.append(vec(r_temp, l_temp, c_temp, a_temp, author_id, 24))

    print('result.shape:', result[0].shape)