Example #1
0
def gen_fea(base_tr_path=None, base_te_path=None):
    """Extract features from the combined train+test log, align them to the
    pre-built sample rows on ID_NAMES, and hand both frames to output_fea.

    base_tr_path / base_te_path are accepted but unused; paths are hard-coded.
    """
    train_df = loader.load_df('../input/train.ftr')
    test_df = loader.load_df('../input/test.ftr')
    combined = pd.concat([train_df, test_df])

    # (previously disabled) combined = filter_acts_after_last_clk(combined)

    features = feat_extract(combined)
    loader.save_df(features, '../feature/df_feat.ftr')

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')

    # Left-join so every sample row is kept even when it has no features.
    tr_out = tr_sample[ID_NAMES].merge(features, on=ID_NAMES, how='left')
    te_out = te_sample[ID_NAMES].merge(features, on=ID_NAMES, how='left')

    print(tr_out.shape, te_out.shape)
    print(tr_out.head())
    print(te_out.head())
    print(tr_out.columns)

    output_fea(tr_out, te_out)
Example #2
0
def gen_sample(ori, des):
    """Keep only 'clickout item' rows from *ori*, explode the impressions and
    prices lists into one row per candidate item, and save to *des*."""
    df = loader.load_df(ori)
    print(df.shape)
    clickouts = df[df.action_type == 'clickout item']
    print(clickouts.shape)
    exploded = explode(clickouts, ['impressions', 'prices'])
    print(exploded.shape)
    loader.save_df(exploded, des)
Example #3
0
def gen_tr_feat():
    """Label the sampled train rows (target=1 where the clicked reference
    equals the impression) and keep only each session's last step."""
    df = loader.load_df('../input/sample_train.ftr')
    df['reference'] = df['reference'].astype('int')
    df['target'] = (df['reference'] == df['impressions']).astype(int)
    df.drop(['reference', 'action_type'], axis=1, inplace=True)
    # One (session_id, step) pair per session: the last recorded step.
    last_steps = (df[['session_id', 'step']]
                  .drop_duplicates(subset='session_id', keep='last')
                  .reset_index(drop=True))
    df = (last_steps.merge(df, on=['session_id', 'step'], how='left')
          .reset_index(drop=True))
    loader.save_df(df, '../input/tr.ftr')
Example #4
0
def merge_fea(tr_list, te_list):
    """Merge per-feature files on ID_NAMES and persist the combined frames."""
    merged_tr = loader.merge_fea(tr_list, primary_keys=ID_NAMES)
    merged_te = loader.merge_fea(te_list, primary_keys=ID_NAMES)

    print(merged_tr.head())
    print(merged_te.head())

    loader.save_df(merged_tr, tr_out_path)
    loader.save_df(merged_te, te_out_path)
Example #5
0
def merge_fea(tr_list, te_list):
    """Merge feature files on ID_NAMES, normalise the impressions dtype,
    and write the train/test outputs."""
    tr = loader.merge_fea(tr_list, primary_keys=ID_NAMES)
    te = loader.merge_fea(te_list, primary_keys=ID_NAMES)

    # impressions must be an integer item id for downstream joins.
    for frame in (tr, te):
        frame['impressions'] = frame['impressions'].astype('int')

    print(tr.head())
    print(te.head())
    print(tr[ID_NAMES].head())

    loader.save_df(tr, tr_out_path)
    loader.save_df(te, te_out_path)
def filter_useless_data():
    """Clean the raw train/test logs in place: drop train sessions repeated
    in test, invalid references, and actions after the last clickout."""
    tr = loader.load_df('../input/train.ftr')
    te = loader.load_df('../input/test.ftr')

    tr = remove_repeated_session_in_tr(tr, te)

    tr = remove_invalid_reference(tr)
    te = remove_invalid_reference(te)

    tr = remove_acts_after_last_clk(tr, is_te=False)
    te = remove_acts_after_last_clk(te, is_te=True)

    # Overwrite the inputs with the filtered versions.
    loader.save_df(tr, '../input/train.ftr')
    loader.save_df(te, '../input/test.ftr')
Example #7
0
def merge_val(file_path, sub_name, fold_num):
    """Concatenate the per-fold CV prediction csvs into one feather file."""
    available = os.listdir(file_path)

    names = ['{}_cv_{}.csv'.format(sub_name, i) for i in range(1, fold_num + 1)]
    print(names)

    frames = []
    for name in names:
        assert name in available, '{} not exist'.format(name)
        frames.append(loader.load_df('{}/{}'.format(file_path, name)))

    merged = pd.concat(frames)
    print(merged.head())
    print(merged.describe())
    loader.save_df(merged, '{}/{}_cv.ftr'.format(file_path, sub_name))
Example #8
0
def sub_convert(df_path, pred_path, out_path1, out_path2):
    """Collapse row-level predictions into a top-3 comma-joined paper_id list
    per description, aligned with the test descriptions, saved to out_path1.

    NOTE(review): out_path2 is accepted but never used here — confirm whether
    a second output file was intended.
    """
    te_data = loader.load_df(df_path)
    df_pred = loader.load_df(pred_path)

    # Order candidates by score within each description (both keys sorted
    # descending, matching the original implementation).
    ranked = df_pred.sort_values(['description_id', 'target'],
                                 ascending=False)
    ranked = df_pred[['description_id']].drop_duplicates() \
            .merge(ranked, on=['description_id'], how='left')
    ranked['rank'] = ranked.groupby('description_id').cumcount().values
    top3 = ranked[ranked['rank'] < 3]
    top3 = top3.groupby(['description_id'])['paper_id'] \
            .apply(lambda s: ','.join(s)).reset_index()

    # Re-align to the test description order; unmatched rows stay NaN.
    result = te_data[['description_id']].merge(top3,
                                               on=['description_id'],
                                               how='left')
    loader.save_df(result, out_path1)
Example #9
0
def output_fea(tr, te):
    """Print previews and persist the train/test feature frames.

    The column re-ordering / primary-key filtering below is intentionally
    disabled; the frames are written exactly as received.
    """
    # Feature re-ordering to keep output order stable (disabled):
    # primary_keys = ['session_id', 'impressions']
    # fea_cols = []
    # required_cols = primary_keys + fea_cols
    # tr = tr[required_cols]
    # te = te[required_cols]

    print(tr.head())
    print(te.head())

    loader.save_df(tr, tr_fea_out_path)
    loader.save_df(te, te_fea_out_path)
Example #10
0
def merge_sub(file_path, sub_name, fold_num):
    """Average per-fold submission csvs into one feather file."""
    available = os.listdir(file_path)

    names = ['{}_{}.csv'.format(sub_name, i) for i in range(1, fold_num + 1)]
    print(names)

    df = pd.DataFrame()
    for idx, name in enumerate(names):
        assert name in available, '{} not exist'.format(name)
        full_path = '{}/{}'.format(file_path, name)
        if idx == 0:
            # First fold supplies the ids and the initial prediction column.
            df = loader.load_df(full_path)
        else:
            # Accumulate predictions; rows are assumed aligned across folds.
            df[TARGET_NAME] += loader.load_df(full_path)[TARGET_NAME]

    df[TARGET_NAME] /= fold_num
    print(df.head())
    print(df.describe())
    loader.save_df(df, '{}/{}.ftr'.format(file_path, sub_name))
Example #11
0
def merge_fea(tr_list, te_list):
    """Merge feature files, cast impressions to int, and attach CV fold ids
    from the train sample (test rows get cv=0)."""
    tr = loader.merge_fea_v2(tr_list, primary_keys=ID_NAMES)
    te = loader.merge_fea_v2(te_list, primary_keys=ID_NAMES)

    for frame in (tr, te):
        frame['impressions'] = frame['impressions'].astype('int')

    # CV fold assignment lives in the sample file; join it onto train only.
    folds = loader.load_df('../feature/tr_s0_0.ftr')
    folds = folds[ID_NAMES + ['cv']]

    tr = tr.merge(folds, on=ID_NAMES, how='left')
    te['cv'] = 0

    print(tr.head())
    print(te.head())
    print(tr[ID_NAMES].head())

    loader.save_df(tr, tr_out_path)
    loader.save_df(te, te_out_path)
Example #12
0
def gen_fea(base_tr_path=None, base_te_path=None):
    # Build features over the combined train+test log, align them to the
    # pre-sampled rows, and hand both frames to output_fea.
    # NOTE: base_tr_path/base_te_path are accepted but unused; all paths are
    # hard-coded below.

    tr = loader.load_df('../input/train.ftr')
    te = loader.load_df('../input/test.ftr')
    df_base = pd.concat([tr, te])

    df_feat = feat_extract(df_base)
    loader.save_df(df_feat, '../feature/df_feat.ftr')

    tr_sample = loader.load_df('../feature/tr_s0_0.ftr')
    te_sample = loader.load_df('../feature/te_s0_0.ftr')

    # NOTE(review): joins on 'session_id' only, unlike the sibling gen_fea
    # variant that joins on the full ID_NAMES key — presumably df_feat is
    # session-level here; confirm, otherwise this join duplicates rows.
    tr = tr_sample[ID_NAMES].merge(df_feat, on='session_id', how='left')
    te = te_sample[ID_NAMES].merge(df_feat, on='session_id', how='left')

    print(tr.shape, te.shape)
    print(tr.head())
    print(te.head())
    print(tr.columns)

    output_fea(tr, te)
def gen_samples(paper, tr_desc_path, tr_recall_path, fea_out_path):
    """Join recalled (description, paper) candidates with paper content and
    query text, compute pairwise features, and return the numeric frame."""
    desc = loader.load_df(tr_desc_path)
    samples = loader.load_df(tr_recall_path)

    samples = samples.merge(paper, on=['paper_id'], how='left')
    samples = samples.merge(desc[['description_id', 'quer_key', 'quer_all']],
                            on=['description_id'],
                            how='left')

    print(samples.columns)
    print(samples.head())

    feats = multi_process_feat(samples)
    loader.save_df(feats, fea_out_path)

    samples = samples.merge(feats, on=ID_NAMES, how='left')
    # Drop raw text/object columns but keep the join keys.
    drop_cols = [
        col for col in samples.columns
        if samples[col].dtype == 'O' and col not in ID_NAMES
    ]
    print('tr del cols', drop_cols)
    return samples.drop(drop_cols, axis=1)
Example #14
0
    df = loader.load_df(in_path)
    df = topk_lines(df, k)
    df['sim_score'] = df['sim_score'].astype('float')
    df.rename(columns={'sim_score': 'corp_sim_score'}, inplace=True)
    return df


if __name__ == "__main__":

    ts = time.time()
    tr_path = '../../feat/tr_tfidf_30.ftr'
    te_path = '../../feat/te_tfidf_30.ftr'

    # CV fold assignment per description id.
    cv = loader.load_df('../../input/cv_ids_0109.csv')[['description_id', 'cv']]

    tr = process(tr_path, k=50)
    tr = tr.merge(cv, on=['description_id'], how='left')

    te = process(te_path, k=50)
    te['cv'] = 0  # test rows carry a dummy fold id

    loader.save_df(tr, '../../feat/tr_samples_30-50.ftr')
    loader.save_df(te, '../../feat/te_samples_30-50.ftr')
    print('all completed: {}, cost {}s'.format(datetime.now(),
                                               np.round(time.time() - ts, 2)))





def output_fea(tr, te):
    """Show a preview of each split, then write both feature files."""
    for frame in (tr, te):
        print(frame.head())

    loader.save_df(tr, tr_fea_out_path)
    loader.save_df(te, te_fea_out_path)

# add vec-sim features

if __name__ == "__main__":

    ts = time.time()
    tqdm.pandas()
    print('start time: %s' % datetime.now())

    # Paper text: strip the 'no_content' placeholder from abstracts and build
    # a combined corpus column from abstract + title + keywords.
    paper = loader.load_df('../../input/paper_input_final.ftr')
    paper['abst'] = paper['abst'].apply(lambda s: s.replace('no_content', ''))
    # NOTE(review): Series.replace(';', ' ') only replaces values that are
    # exactly ';' — confirm str.replace wasn't intended for the separators.
    paper['corp'] = paper['abst'] + ' ' + paper['titl'] + ' ' + paper[
        'keywords'].fillna('').replace(';', ' ')

    tr_desc_path = '../../input/tr_input_final.ftr'
    te_desc_path = '../../input/te_input_final.ftr'

    tr_recall_path = '../../feat/tr_s0_30-50.ftr'
    te_recall_path = '../../feat/te_s0_30-50.ftr'

    tr = gen_samples(paper, tr_desc_path, tr_recall_path, tr_fea_out_path)
    print(tr.columns)
    print([col for col in tr.columns if tr[col].dtype == 'O'])
    loader.save_df(tr, tr_out_path)

    te = gen_samples(paper, te_desc_path, te_recall_path, te_fea_out_path)
    print(te.columns)
    loader.save_df(te, te_out_path)
    print('all completed: {}, cost {}s'.format(datetime.now(),
                                               np.round(time.time() - ts, 2)))
Example #17
0
def gen_tr_click():
    """Save each train session's final reference (the clicked item)."""
    df = loader.load_df('../input/sample_train.ftr')
    clicks = (df[['session_id', 'reference']]
              .drop_duplicates(subset='session_id', keep='last')
              .reset_index(drop=True))
    print(clicks.shape)
    loader.save_df(clicks, '../input/tr_click.ftr')
Example #18
0
def get_te_feat():
    # Build the test feature rows: keep only rows whose reference is missing
    # (the clickouts to predict), drop label columns, and save.
    df = loader.load_df('../input/sample_test.ftr')
    df = df[pd.isnull(df['reference'])].reset_index(drop=True)
    print(df.shape)
    df.drop(['reference','action_type'],axis=1,inplace=True)
    loader.save_df(df,'../input/te.ftr')
    # NOTE(review): everything below duplicates filter_useless_data() and
    # re-filters/overwrites the raw train/test files — confirm this tail is
    # intentional and not a copy/paste artifact.
    tr = loader.load_df('../input/train.ftr')
    te = loader.load_df('../input/test.ftr')

    tr = remove_repeated_session_in_tr(tr, te)

    tr = remove_invalid_reference(tr)
    te = remove_invalid_reference(te)

    tr = remove_acts_after_last_clk(tr, is_te=False)
    te = remove_acts_after_last_clk(te, is_te=True)

    loader.save_df(tr, '../input/train.ftr')
    loader.save_df(te, '../input/test.ftr')


if __name__ == "__main__":

    print('start time: %s' % datetime.now())

    # Convert the raw csv inputs to feather for faster subsequent loads,
    # then run the cleaning pass over train/test.
    frames = {name: loader.load_df('../../../input/{}.csv'.format(name))
              for name in ('train', 'test', 'item_metadata')}
    for name, frame in frames.items():
        loader.save_df(frame, '../input/{}.ftr'.format(name))

    filter_useless_data()

    print('all completed: %s' % datetime.now())
Example #20
0
        'prices_div_active_items-session_id_by_prices_median-v2', 'price_rank',
        'act_pre1', 'lastest_item-impr_rank',
        'impr_rank_sub_impressions_by_impr_rank_median', 'price_div',
        'session_act_sum',
        'prices_div_active_items-session_id_by_prices_median',
        'impressions_by_hist_interaction item info_sum',
        'impressions_by_hist_interaction item image_sum',
        'impressions_by_hist_clickout item_sum', 'session_id_by_prices_count',
        'impressions_target', 'impressions_active_ratio',
        'price_div_impr_rank_1_price', 'impressions_target_sub_session_median',
        'impressions_target_sub_session_max', 'session_hist_clickout item',
        'device', 'impr_rank_sub_session_id_by_impr_rank_median',
        'session_hist_interaction item image',
        'impr_rank_1_impressions_target',
        'impr_rank_sub_session_id_by_impr_rank_max',
        'impr_rank_sub_session_id_by_impr_rank_min', 'current_filters'
    ]

    tr = tr[cols]
    te = te[cols]

    tr.columns = ['session_id', 'impressions'] + \
            ['m2_{}'.format(c) for c in tr.columns.tolist()[2:]]
    te.columns = ['session_id', 'impressions'] + \
            ['m2_{}'.format(c) for c in te.columns.tolist()[2:]]

    loader.save_df(tr, '../../../feat/m2_tr_top30_fea.ftr')
    loader.save_df(te, '../../../feat/m2_te_top30_fea.ftr')

    print('all completed: %s' % datetime.now())
        tr = loader.load_df(input_root_path + 'tr_input_final.ftr')
        tr = tr[~pd.isnull(tr['description_id'])]

        #         tr = tr.head(1000)
        tr_desc, tr_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in tr['quer_all'].tolist()], \
                tr['description_id'].tolist()
        print('gen tf completed, cost {}s'.format(np.round(
            time.time() - ts, 2)))

        tr_samples = gen_samples(tr, tr_desc, tr_desc_ids, \
                corpus_list, paper_ids_list, k=50)
        tr_samples = tr.rename(columns={'paper_id': 'target_paper_id'}) \
                .merge(tr_samples, on='description_id', how='left')
        tr_samples.loc[tr_samples['target_paper_id'] == tr_samples['paper_id'],
                       'target'] = 1
        loader.save_df(tr_samples[out_cols], train_out_path)
        print('recall succ {} from {}'.format(tr_samples['target'].sum(),
                                              tr.shape[0]))
        print(tr.shape, tr_samples.shape)

    if sys.argv[1] in ['te']:
        # for te ins
        te = loader.load_df(input_root_path + 'te_input_final.ftr')
        te = te[~pd.isnull(te['description_id'])]

        #         te = te.head(1000)
        te_desc, te_desc_ids = [dictionary.doc2bow(line.split(' ')) for line in te['quer_all'].tolist()], \
                te['description_id'].tolist()
        print('gen tf completed, cost {}s'.format(np.round(
            time.time() - ts, 2)))