def xgb_pre(train_x,
            train_y,
            test_x,
            num_round=500,
            params=None,
            test_y=None,
            if_save_imp=True):
    """Train an XGBoost binary classifier and score test_x.

    Parameters
    ----------
    train_x, train_y : training features and 0/1 labels.
    test_x : features to score.
    num_round : number of boosting rounds.
    params : xgb training params; defaults to the tuned dict below.
    test_y : optional test labels; when given, training is monitored on
        (train, test) with the custom ``evalerror`` metric.
    if_save_imp : when True, dump feature importances to Data/Temp/.

    Returns
    -------
    pd.Series of predicted scores for test_x.
    """
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dtest = xgb.DMatrix(test_x, label=test_y)
    if params is None:
        params = {
            'objective': 'binary:logistic',
            # 'objective': 'rank:pairwise',
            'eta': 0.01,
            'max_depth': 5,
            'colsample_bytree': 0.8,
            'subsample': 0.8,
            'min_child_weight': 16,
            'tree_method': 'exact',
            # 'gamma': 0.1,
            # 'scale_pos_weight': 10,
            # 'max_delta_step': 0.7,
            # 'eval_metric': 'auc',
        }
    if test_y is None:
        # No held-out labels: plain training, nothing to monitor.
        bst = xgb.train(params, dtrain, num_boost_round=num_round)
    else:
        # Build the watchlist only on the branch that actually uses it.
        watchlist = [(dtrain, 'train'), (dtest, 'test')]
        bst = xgb.train(params, dtrain, num_boost_round=num_round,
                        evals=watchlist, feval=evalerror)

    if if_save_imp:
        # Persist split-count (fscore) importances for offline inspection.
        imp_dict = bst.get_fscore(fmap='')
        imp = pd.DataFrame({
            'column': list(imp_dict.keys()),
            'importance': list(imp_dict.values())
        })
        com.save_csv(imp.sort_values(by='importance'),
                     com.get_project_path('Data/Temp/'),
                     'xgb-val_importance.csv')
    pre_label = pd.Series(bst.predict(dtest))
    return pre_label
# Ejemplo n.º 2
def run():
    """Clean the raw competition logs: derive hour/week/day_rank from the
    timestamp, decode the geohash columns to lat/lon, and save the sorted
    cleaned tables (plus a 10k-row sample for quick code tests)."""
    log_df = pd.read_csv(com.get_project_path('Data/Csv/OriData/csv_data_all.csv'))
    item_df = pd.read_csv(com.get_project_path('Data/Csv/OriData/tianchi_fresh_comp_train_item.csv'), header=0, names=['item_id', 'item_geo', 'item_cate'])
    # Uncomment the next line when testing the code
    # log_df = pd.read_csv(com.get_project_path('Data/Csv/OriData/csv_data_all_h1w.csv'))

    # -- timestamps: keep only the date in `time`, split out hour / weekday /
    #    dense day index --
    log_df['time'] = pd.to_datetime(log_df['time'], format='%Y%m%d %H')
    log_df['hour'] = log_df['time'].dt.hour
    log_df['time'] = log_df['time'].dt.normalize()
    log_df['week'] = log_df['time'].apply(lambda ts: ts.weekday() + 1)
    log_df['day_rank'] = log_df['time'].rank(method='dense').apply(int)
    # del log_df['time']

    # -- geohash -> (lat, lon); bad/missing values fall back to sentinel
    #    coordinates via get_lat_lon's `inplace` argument --
    item_df['item_geo'] = item_df['item_geo'].replace('input_data_is_error', '').fillna('').apply(gh64.decode)
    item_df['item_geo_lat'] = item_df['item_geo'].apply(lambda g: get_lat_lon(g, 0, inplace=-90))
    item_df['item_geo_lon'] = item_df['item_geo'].apply(lambda g: get_lat_lon(g, 1, inplace=180))
    del item_df['item_geo']
    log_df['user_geo'] = log_df['user_geo'].replace('input_data_is_error', '').fillna('').apply(gh64.decode)
    log_df['user_geo_lat'] = log_df['user_geo'].apply(lambda g: get_lat_lon(g, 0, inplace=90))
    log_df['user_geo_lon'] = log_df['user_geo'].apply(lambda g: get_lat_lon(g, 1, inplace=-180))
    del log_df['user_geo']

    # -- persist the cleaned tables, sorted for reproducible diffs --
    order_keys = ['user_id', 'day_rank', 'item_id', 'beh_type']
    com.save_csv(log_df.sort_values(by=order_keys), com.get_project_path('Data/Csv/ClnData/'), 'csv_data_all.csv')
    com.save_csv(item_df.sort_values(by=['item_id', 'item_cate']), com.get_project_path('Data/Csv/ClnData/'), 'csv_data_item.csv')
    com.save_csv(log_df[log_df['item_id'].isin(item_df['item_id'])].sort_values(by=order_keys),
                 com.get_project_path('Data/Csv/ClnData/'), 'csv_data_p.csv')

    # -- keep a 10k-row sample for fast smoke tests --
    log_df.head(10000).sort_values(by=['user_id', 'day_rank', 'item_id']).to_csv(com.get_project_path('Data/Csv/ClnData/csv_data_all_h1w.csv'), index=None)
def run():
    """Train xgb on the day-31 feature table, score the day-32 candidate
    set, and write the top-scored (user_id, item_id) pairs as the
    submission file."""
    data_all = pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_all.csv'))

    x_train = pd.read_csv(com.get_project_path(FEATURE_PATH + 'fea_all_label31_dur31_sl3.csv'))
    x_train['ui_id'] = sp.get_ui_id(x_train)
    x_test = pd.read_csv(com.get_project_path(FEATURE_PATH + 'fea_all_label32_dur31_sl3_p.csv'))
    x_test['ui_id'] = sp.get_ui_id(x_test)

    # Label = whether the (user, item) pair appears among the day-31 positives.
    positives = sp.get_csv_label(data_all, 31)
    positives['ui_id'] = sp.get_ui_id(positives)
    y_train = x_train['ui_id'].isin(positives['ui_id']).replace({True: 1, False: 0})

    print('特征数量: ' + str(len(x_train.columns) - 3))
    print('训练集数量: ' + str(len(x_train)))
    # ########### model ############ #
    id_cols = ['user_id', 'item_id', 'ui_id']
    scores = xgb_pre(x_train.drop(id_cols, axis=1), y_train,
                     x_test.drop(id_cols, axis=1))

    # Threshold at the 501st-highest score; keep everything >= it.
    cut = scores.sort_values(ascending=False).iloc[500]
    x_test['label'] = (scores >= cut).replace({True: 1, False: 0})
    picked = x_test[x_test['label'] == 1].loc[:, ['user_id', 'item_id']]
    save_name = '_A_02_xgb_202001032331.csv'
    com.save_csv(picked.loc[:, ['user_id', 'item_id']], com.get_project_path(RESULT_PATH), save_name)
# Ejemplo n.º 4
def get_feature(data_all,
                data_p,
                label_day_rank,
                duration=7,
                p_only=True,
                data_item=None,
                save=False):
    """Build the combined user + item + user-x-item feature table for one
    label day.

    Each of the three get_*_feature helpers is run with save=True so its
    intermediate table lands on disk; the results are then left-joined
    onto the candidate (user_id, item_id) pairs from the ui table.

    Returns the saved CSV path when ``save`` is True, otherwise the
    DataFrame itself.
    """
    # Part 1: user-level features
    fea_user_path = get_user_feature(data_all=data_all,
                                     data_p=data_p,
                                     data_item=data_item,
                                     label_day_rank=label_day_rank,
                                     duration=duration,
                                     p_only=p_only,
                                     save=True)
    # fea_user_path = com.get_project_path(FEATURE_PATH) + get_save_name(label_day_rank, duration, p_only, index='user')

    # Part 2: item-level features
    fea_item_path = get_item_feature(data_all=data_all,
                                     data_p=data_p,
                                     data_item=data_item,
                                     label_day_rank=label_day_rank,
                                     duration=duration,
                                     p_only=p_only,
                                     save=True)
    # fea_item_path = com.get_project_path(FEATURE_PATH) + get_save_name(label_day_rank, duration, p_only, index='item')

    # Part 3: user-x-item interaction features
    fea_ui_path = get_ui_feature(data_all=data_all,
                                 data_p=data_p,
                                 data_item=data_item,
                                 label_day_rank=label_day_rank,
                                 duration=duration,
                                 p_only=p_only,
                                 save=True)
    # fea_ui_path = com.get_project_path(FEATURE_PATH) + get_save_name(label_day_rank, duration, p_only, index='ui')

    # Combine: the ui table defines the candidate (user_id, item_id) pairs.
    # Read it ONCE and reuse it (the original re-read the same CSV twice).
    fea_ui = pd.read_csv(fea_ui_path)
    data_fea = fea_ui.loc[:, ['user_id', 'item_id']]
    data_fea = pd.merge(data_fea,
                        pd.read_csv(fea_user_path),
                        on='user_id',
                        how='left')
    data_fea = pd.merge(data_fea,
                        pd.read_csv(fea_item_path),
                        on='item_id',
                        how='left')
    data_fea = pd.merge(data_fea,
                        fea_ui,
                        on=['user_id', 'item_id'],
                        how='left')

    if save is True:
        save_name = get_save_name(label_day_rank,
                                  duration,
                                  p_only,
                                  index='all')
        com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
        return com.get_project_path(FEATURE_PATH) + save_name
    else:
        return data_fea
# Ejemplo n.º 5
def get_ui_feature(data_all,
                   data_p,
                   label_day_rank,
                   duration=7,
                   p_only=True,
                   data_item=None,
                   save=False):
    """Build user-x-item interaction features over the window
    [label_day_rank - duration, label_day_rank - 1].

    ``data_all`` is the full behaviour log and ``data_p`` the log restricted
    to the candidate item subset; candidate (user, item) pairs are drawn
    from whichever of the two ``p_only`` selects. ``data_item`` is unused
    here but kept for signature parity with the other get_*_feature
    helpers.

    Returns the saved CSV path when ``save`` is True, otherwise the feature
    DataFrame (user_id/item_id plus feature columns).
    """
    # Restrict both logs to the feature window; the label day is excluded.
    data_all = data_all[(data_all['day_rank'] >= label_day_rank - duration)
                        & (data_all['day_rank'] <= label_day_rank - 1)]
    data_p = data_p[(data_p['day_rank'] >= label_day_rank - duration)
                    & (data_p['day_rank'] <= label_day_rank - 1)]

    # Candidate pairs: beh_type 1 events within the last SET_LENGTH days.
    if p_only is True:
        data_fea = data_p[
            (data_p['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_p['beh_type'] == 1
               )].loc[:, ['user_id', 'item_id', 'item_cate', 'ui_id', 'uc_id'
                          ]].drop_duplicates()
    else:
        data_fea = data_all[
            (data_all['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_all['beh_type'] == 1
               )].loc[:, ['user_id', 'item_id', 'item_cate', 'ui_id', 'uc_id'
                          ]].drop_duplicates()

    # Count of each beh_type on this (user, item) pair exactly 1/2/3 days
    # before the label day.
    for ago_time in [1, 2, 3]:
        for beh_type in [1, 2, 3, 4]:
            fea_name = 'beh_type_' + str(beh_type) + '_count&ui_id&' + str(
                ago_time) + '_day_ago'
            feature = com.pivot_table_plus(
                data_all[(data_all['beh_type'] == beh_type)
                         & (data_all['day_rank'] == label_day_rank -
                            ago_time)],
                index='ui_id',
                values='user_id',
                aggfunc='count',
                new_name=fea_name)
            data_fea = pd.merge(data_fea, feature, on='ui_id', how='left')
            data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
            print('# -- ' + fea_name + ' complete -- #')

    # Whether the user ever performed beh_type 4 / beh_type 2 on this item.
    data_fea['beh_type_4_if&ui_id'] = (data_fea['ui_id'].isin(
        data_all[data_all['beh_type'] == 4]['ui_id'])).replace({
            True: 1,
            False: 0
        })
    data_fea['beh_type_2_if&ui_id'] = (data_fea['ui_id'].isin(
        data_all[data_all['beh_type'] == 2]['ui_id'])).replace({
            True: 1,
            False: 0
        })

    # Ordinal code of the user's LAST interaction with this item:
    # beh_type 1 -> 2, 2 -> 1.5, 3 -> 4, 4 -> 1; 0 when the pair has no
    # interaction inside the window.
    fea_name = 'beh_type_?_last&ui_id'
    feature = data_all.copy()
    feature['tmp'] = feature['day_rank'] * 100 + feature['hour']
    feature['rank'] = feature.groupby('ui_id')['tmp'].rank(ascending=0)
    feature = feature[feature['rank'] == 1]
    data_fea = pd.merge(data_fea,
                        feature.loc[:, ['ui_id', 'beh_type']],
                        on='ui_id',
                        how='left')
    data_fea[fea_name] = data_fea['beh_type'].replace({
        1: 2,
        2: 1.5,
        3: 4,
        4: 1
    }).fillna(0)
    del data_fea['beh_type']

    # Is this item the user's very last interaction in data_all / data_p.
    for data_index in ['data_all', 'data_p']:
        if data_index == 'data_all': data = data_all
        else: data = data_p
        fea_name = 'is_last&ui_id&' + data_index
        feature = data.loc[:, ['user_id', 'ui_id', 'day_rank', 'hour'
                               ]].sort_values(
                                   by=['user_id', 'day_rank', 'hour'],
                                   ascending=[0, 0,
                                              0]).drop_duplicates('user_id')
        data_fea[fea_name] = (data_fea['ui_id'].isin(
            feature['ui_id'])).replace({
                True: 1,
                False: 0
            })
        print('# -- ' + fea_name + ' complete -- #')

    # "N-th from last interaction" rank of the pair in data_all / data_p.
    # NOTE(review): the rank is grouped by ui_id, so the row kept after the
    # sort is always rank 1 for every observed pair; the intent looks like a
    # per-user rank — confirm before relying on this feature.
    for data_index in ['data_all', 'data_p']:
        if data_index == 'data_all': data = data_all
        else: data = data_p
        fea_name = 'last_?&ui_id&' + data_index
        feature = data.loc[:, ['ui_id', 'day_rank', 'hour']]
        feature['tmp'] = feature['day_rank'] * 100 + feature['hour']
        feature['rank'] = feature.groupby('ui_id')['tmp'].rank(method='dense',
                                                               ascending=1)
        feature = feature.sort_values(by=['rank', 'ui_id'],
                                      ascending=[True, True]).drop_duplicates(
                                          ['ui_id']).loc[:, ['ui_id', 'rank']]
        data_fea = pd.merge(data_fea, feature, on='ui_id',
                            how='left').fillna(max(feature['rank'] + 1))
        data_fea = data_fea.rename(columns={'rank': fea_name})
        print('# -- ' + fea_name + ' complete -- #')

    # Hours from the user's latest beh_type event on this item / category
    # to the start of the label day.
    for beh_type in [1, 2, 3, 4]:
        for id_index in ['ui_id', 'uc_id']:
            fea_name = 'beh_type_' + str(
                beh_type) + '_latest_to_now_hour&' + id_index
            feature = data_all[(
                data_all['beh_type'] == beh_type
            )].loc[:, [id_index, 'day_rank', 'hour']].sort_values(
                by=['day_rank',
                    'hour'], ascending=[0, 0]).drop_duplicates(id_index)
            # BUG FIX: was `label_day_rank - label_day_rank` (always 0),
            # which dropped the day component entirely; use the row's own
            # day_rank, mirroring get_user_feature.
            feature[fea_name] = feature['day_rank'].apply(
                lambda a: label_day_rank - a)
            feature[fea_name] = (feature[fea_name] * 24) + (24 -
                                                            feature['hour'])
            data_fea = pd.merge(data_fea,
                                feature.loc[:, [id_index, fea_name]],
                                how='left',
                                on=id_index)
            data_fea[fea_name] = data_fea[fea_name].fillna(
                24 * duration).astype(int)
            print('# -- ' + fea_name + ' complete -- #')

    # Drop helper id columns; keep only user_id/item_id plus features.
    del data_fea['uc_id']
    del data_fea['ui_id']
    del data_fea['item_cate']
    if save is True:
        save_name = get_save_name(label_day_rank, duration, p_only, index='ui')
        com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
        return com.get_project_path(FEATURE_PATH) + save_name
    else:
        return data_fea
# Ejemplo n.º 6
def get_item_feature(data_all,
                     data_p,
                     label_day_rank,
                     duration=7,
                     p_only=True,
                     data_item=None,
                     save=False):
    """Build item-level features over [label_day_rank - duration,
    label_day_rank - 1].

    data_all is the full behaviour log, data_p the log restricted to the
    candidate item subset; candidate items come from whichever one p_only
    selects. data_item supplies item geo columns for the geo-count feature.
    Returns the saved CSV path when save is True, otherwise the DataFrame.
    """
    # Restrict both logs to the feature window; the label day is excluded.
    data_all = data_all[(data_all['day_rank'] >= label_day_rank - duration)
                        & (data_all['day_rank'] <= label_day_rank - 1)]
    data_p = data_p[(data_p['day_rank'] >= label_day_rank - duration)
                    & (data_p['day_rank'] <= label_day_rank - 1)]

    # Candidate items: beh_type 1 events within the last SET_LENGTH days.
    if p_only is True:
        data_fea = data_p[(data_p['day_rank'] >= label_day_rank - SET_LENGTH) &
                          (data_p['beh_type'] == 1
                           )].loc[:,
                                  ['item_id', 'item_cate']].drop_duplicates()
    else:
        data_fea = data_all[
            (data_all['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_all['beh_type'] == 1
               )].loc[:, ['item_id', 'item_cate']].drop_duplicates()

    # Count of each beh_type on the item / item category over the whole
    # window and over the last 1/2/3 days.
    for item_index in ['item_id', 'item_cate']:
        for duration_time in [duration, 1, 2, 3]:
            for beh_type in [1, 2, 3, 4]:
                fea_name = 'beh_type_' + str(beh_type) + '_count&' + str(
                    item_index) + '&latest_' + str(duration_time)
                feature = com.pivot_table_plus(
                    data_all[(data_all['beh_type'] == beh_type)
                             & (data_all['day_rank'] >= label_day_rank -
                                duration_time)],
                    index=item_index,
                    values='user_id',
                    aggfunc='count',
                    new_name=fea_name)
                data_fea = pd.merge(data_fea,
                                    feature,
                                    on=item_index,
                                    how='left')
                data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
                print('# -- ' + fea_name + ' complete -- #')

    # Dense rank (both directions) of the item / category beh_type 2 and 4
    # counts within data_all and data_p.
    for data_index in ['data_all', 'data_p']:
        for item_index in ['item_id', 'item_cate']:
            for beh_type in [2, 4]:
                for ascending in [0, 1]:
                    if data_index == 'data_all': data = data_all
                    else: data = data_p
                    fea_name = 'count_rank' + str(ascending) + '&' + str(
                        item_index) + '&beh_type_' + str(beh_type) + '&' + str(
                            data_index)
                    feature = com.pivot_table_plus(
                        data[(data['beh_type'] == beh_type)],
                        index=item_index,
                        values='user_id',
                        aggfunc='count',
                        new_name='tmp')
                    data_fea = pd.merge(data_fea,
                                        feature.loc[:, [item_index, 'tmp']],
                                        on=item_index,
                                        how='left')
                    # Items with no events rank as count 0.
                    data_fea['tmp'] = data_fea['tmp'].fillna(0)
                    data_fea[fea_name] = data_fea['tmp'].rank(
                        ascending=ascending, method='dense')
                    print('# -- ' + fea_name + ' complete -- #')
                    del data_fea['tmp']

    # Number of DISTINCT users that performed each beh_type on the item /
    # category.
    for item_index in ['item_id', 'item_cate']:
        for beh_type in [1, 2, 3, 4]:
            fea_name = 'user_count&' + item_index + '&' + 'beh_type_' + str(
                beh_type)
            feature = data_all[
                (data_all[item_index].isin(data_fea[item_index]))
                & (data_all['beh_type'] == beh_type)]
            feature = com.pivot_table_plus(
                feature,
                index=item_index,
                values='user_id',
                aggfunc=com.count_with_drop_duplicates_for_series,
                new_name=fea_name)
            data_fea = pd.merge(data_fea, feature, on=item_index, how='left')
            data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
            print('# -- ' + fea_name + ' complete -- #')

    # Conversion rate of the item / category: beh_type 4 count over
    # beh_type 1 count across the full window.
    # NOTE(review): fillna(0) only covers 0/0 (NaN); x/0 yields inf and is
    # left in place — confirm this is intended downstream.
    data_fea['item_id_ctr'] = 1. * data_fea[
        'beh_type_4_count&item_id&latest_' +
        str(duration)] / data_fea['beh_type_1_count&item_id&latest_' +
                                  str(duration)]
    data_fea['item_id_ctr'] = data_fea['item_id_ctr'].fillna(0)
    data_fea['item_cate_ctr'] = 1. * data_fea[
        'beh_type_4_count&item_cate&latest_' +
        str(duration)] / data_fea['beh_type_1_count&item_cate&latest_' +
                                  str(duration)]
    data_fea['item_cate_ctr'] = data_fea['item_cate_ctr'].fillna(0)

    # Number of distinct geo latitudes for the item / category
    # (-90 is the missing-geo sentinel set during cleaning).
    for item_index in ['item_id', 'item_cate']:
        fea_name = 'geo_count&' + item_index
        feature = data_item[data_item['item_geo_lat'] != -90]
        feature = com.pivot_table_plus(
            feature, item_index, 'item_geo_lat',
            com.count_with_drop_duplicates_for_series, fea_name)
        data_fea = pd.merge(data_fea, feature, on=item_index, how='left')
        data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)

    # item_cate was only needed as a join key for category features.
    del data_fea['item_cate']
    if save is True:
        save_name = get_save_name(label_day_rank,
                                  duration,
                                  p_only,
                                  index='item')
        com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
        return com.get_project_path(FEATURE_PATH) + save_name
    else:
        return data_fea
# Ejemplo n.º 7
def get_user_feature(data_all,
                     data_p,
                     label_day_rank,
                     duration=7,
                     p_only=True,
                     data_item=None,
                     save=False):
    """Build user-level features over [label_day_rank - duration,
    label_day_rank - 1].

    data_all is the full behaviour log, data_p the log restricted to the
    candidate item subset; candidate users come from whichever one p_only
    selects. data_item is unused here; kept for signature parity with the
    other get_*_feature helpers. Returns the saved CSV path when save is
    True, otherwise the DataFrame.
    """
    # Restrict both logs to the feature window; the label day is excluded.
    data_all = data_all[(data_all['day_rank'] >= label_day_rank - duration)
                        & (data_all['day_rank'] <= label_day_rank - 1)]
    data_p = data_p[(data_p['day_rank'] >= label_day_rank - duration)
                    & (data_p['day_rank'] <= label_day_rank - 1)]

    # Candidate users: beh_type 1 events within the last SET_LENGTH days.
    if p_only is True:
        data_fea = data_p[
            (data_p['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_p['beh_type'] == 1)].loc[:, ['user_id']].drop_duplicates()
    else:
        data_fea = data_all[
            (data_all['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_all['beh_type'] == 1)].loc[:,
                                               ['user_id']].drop_duplicates()

    # Count of each beh_type per user in data_all / data_p, over the whole
    # window and over the last 1/2/3 days.
    for data_index in ['data_all', 'data_p']:
        for duration_time in [duration, 1, 2, 3]:
            for beh_type in [1, 2, 3, 4]:
                if data_index == 'data_all': data = data_all
                else: data = data_p
                fea_name = 'beh_type_' + str(
                    beh_type) + '_count&user&latest_' + str(
                        duration_time) + '&' + str(data_index)
                feature = com.pivot_table_plus(
                    data[(data['beh_type'] == beh_type) &
                         (data['day_rank'] >= label_day_rank - duration_time)],
                    index='user_id',
                    values='beh_type',
                    aggfunc='count',
                    new_name=fea_name)
                data_fea = pd.merge(data_fea,
                                    feature,
                                    on='user_id',
                                    how='left')
                data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
                print('# -- ' + fea_name + ' complete -- #')

    # Number of DISTINCT items the user performed each beh_type on, in
    # data_all / data_p.
    for data_index in ['data_all', 'data_p']:
        for beh_type in [1, 2, 3, 4]:
            if data_index == 'data_all': data = data_all
            else: data = data_p
            fea_name = 'item_count&' + 'user&' + 'beh_type_' + str(
                beh_type) + '&' + str(data_index)
            feature = com.pivot_table_plus(
                data[(data['beh_type'] == beh_type)],
                index='user_id',
                values='item_id',
                aggfunc=com.count_with_drop_duplicates_for_series,
                new_name=fea_name)
            data_fea = pd.merge(data_fea, feature, on='user_id', how='left')
            data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
            print('# -- ' + fea_name + ' complete -- #')

    # User conversion rate in data_all / data_p: beh_type 4 count over
    # beh_type 1 count across the full window.
    # NOTE(review): fillna(0) only covers 0/0 (NaN); x/0 yields inf and is
    # left in place — confirm this is intended downstream.
    data_fea['user_ctr&data_all'] = 1. * data_fea[
        'beh_type_4_count&user&latest_' + str(duration) +
        '&data_all'] / data_fea['beh_type_1_count&user&latest_' +
                                str(duration) + '&data_all']
    data_fea['user_ctr&data_all'] = data_fea['user_ctr&data_all'].fillna(0)
    data_fea['user_ctr&data_p'] = 1. * data_fea[
        'beh_type_4_count&user&latest_' + str(duration) +
        '&data_p'] / data_fea['beh_type_1_count&user&latest_' + str(duration) +
                              '&data_p']
    data_fea['user_ctr&data_p'] = data_fea['user_ctr&data_p'].fillna(0)

    # Hours from the user's latest interaction to the start of the label
    # day, one feature per beh_type.
    # NOTE(review): `feature` is the user's latest event of ANY beh_type
    # (the beh_type filter is missing inside the loop), so all four
    # features get the same value — confirm whether a
    # data_all['beh_type'] == beh_type filter was intended.
    for beh_type in [1, 2, 3, 4]:
        fea_name = 'beh_type_' + str(beh_type) + '_latest_to_now_hour&user_id'
        feature = data_all.loc[:, ['user_id', 'day_rank', 'hour']].sort_values(
            by=['user_id', 'day_rank',
                'hour'], ascending=[0, 0, 0]).drop_duplicates('user_id')
        feature[fea_name] = feature['day_rank'].apply(
            lambda a: label_day_rank - a)
        feature[fea_name] = (feature[fea_name] * 24) + (24 - feature['hour'])
        data_fea = pd.merge(data_fea,
                            feature.loc[:, ['user_id', fea_name]],
                            how='left',
                            on='user_id')
        # Users with no events default to the window's full span in hours.
        data_fea[fea_name] = data_fea[fea_name].fillna(24 *
                                                       duration).astype(int)
        print('# -- ' + fea_name + ' complete -- #')

    # Number of distinct geo latitudes per user
    # (90 is the missing-geo sentinel set during cleaning).
    fea_name = 'geo_count&user_id'
    feature = data_all[data_all['user_geo_lat'] != 90]
    feature = com.pivot_table_plus(feature, 'user_id', 'user_geo_lat',
                                   com.count_with_drop_duplicates_for_series,
                                   fea_name)
    data_fea = pd.merge(data_fea, feature, on='user_id', how='left')
    data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
    print('# -- ' + fea_name + ' complete -- #')

    if save is True:
        save_name = get_save_name(label_day_rank,
                                  duration,
                                  p_only,
                                  index='user')
        com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
        return com.get_project_path(FEATURE_PATH) + save_name
    else:
        return data_fea