Example #1
0
    def load_data(tt):
        """Build the ranked feature frame for the dataset split ``tt``.

        The classification target table and the per-session/item feature
        tables are combined with ``merge_all`` (left join on ``session_id``
        and ``impressions``).  Tables keyed by ``item_id`` have that column
        renamed to ``impressions`` first so the join keys line up.  The
        result is then passed through ``rank_similarity_inside_session`` and
        ``rank_feat_inside_session`` (the latter for the win-ratio columns).

        ``tt`` is the key used to index the table/feature stores
        (presumably ``'train'`` or ``'test'`` -- confirm with the caller).
        """
        as_impressions = {'item_id': 'impressions'}

        frames = [
            t_tr_te_classify[tt].load().rename(columns=as_impressions),
            f_m2_top30[tt].load(),
            f_m3_top30[tt].load(),
            f_top30[tt].load(),
        ]
        # Session/item stores are keyed by item_id; align them with the
        # 'impressions' key before joining.
        for store in (f_si_sim, f_si_cmp, f_si_win):
            frames.append(store[tt].load().rename(columns=as_impressions))

        merged = merge_all(frames,
                           on=['session_id', 'impressions'],
                           how='left')
        merged = rank_similarity_inside_session(merged)

        win_ratio_cols = [
            'item_id_impression_prev_item_win_ratio',
            'item_id_impression_first_item_win_ratio',
            'item_id_interaction_last_item_win_ratio',
            'item_id_interaction_most_item_win_ratio',
        ]
        return rank_feat_inside_session(merged, win_ratio_cols)
Example #2
0
def similarity_pair():
    """Attach item-to-anchor similarity scores to every (session, item) pair.

    For both the 'train' and 'test' splits, each anchor column of the pair
    table is left-joined against the similarity table ``t_sim`` on
    ``(item_id, anchor) == (item_id, item_id_anchor)``.  The similarity
    columns are prefixed with the anchor column name, the per-anchor frames
    are combined with ``merge_all`` on ``session_id``/``item_id`` and the
    result is saved to ``f_si_sim[tt]``.  Nothing is returned.
    """
    # Anchor columns in use.  'item_id_impression_next' and
    # 'item_id_interaction_first' are intentionally not part of this list.
    anchor_cols = (
        'item_id_impression_prev',
        'item_id_impression_first',
        'item_id_interaction_last',
        'item_id_interaction_most',
    )

    sim_cols = [
        'item_meta_cos',
        'co_appearence_interaction_count',
        'co_appearence_impression_count',
        'similarity_wv_impression',
        'similarity_wv_interaction',
    ]

    sim_table = t_sim.load(columns=['item_id', 'item_id_anchor'] + sim_cols)

    for tt in ('train', 'test'):
        pairs = t_tr_te_pair[tt].load()

        per_anchor = []
        for anchor in anchor_cols:
            joined = pairs[['session_id', 'item_id', anchor]].merge(
                sim_table,
                left_on=['item_id', anchor],
                right_on=['item_id', 'item_id_anchor'],
                how='left',
            )
            # Prefix every similarity column with the anchor it refers to.
            prefixed = {name: '%s_%s' % (anchor, name) for name in sim_cols}
            joined = joined.rename(columns=prefixed)
            # The join keys on the anchor side are no longer needed.
            joined = joined.drop(['item_id_anchor', anchor], axis=1)
            per_anchor.append(joined)

        combined = merge_all(per_anchor,
                             on=['session_id', 'item_id'],
                             how='left')
        f_si_sim[tt].save(combined)
Example #3
0
    def load_data(tt):
        """Build the similarity-ranked feature frame for the split ``tt``.

        Joins the classification target table, the top-100 feature table and
        the session/item similarity table with ``merge_all`` (left join on
        ``session_id`` and ``impressions``), then returns the output of
        ``rank_similarity_inside_session``.  Tables keyed by ``item_id`` have
        that column renamed to ``impressions`` before joining.

        ``tt`` indexes the table/feature stores (presumably ``'train'`` or
        ``'test'`` -- confirm with the caller).
        """
        as_impressions = {'item_id': 'impressions'}

        candidates = t_tr_te_classify[tt].load().rename(columns=as_impressions)
        top100 = f_top100[tt].load()
        similarity = f_si_sim[tt].load().rename(columns=as_impressions)

        merged = merge_all(
            [candidates, top100, similarity],
            on=['session_id', 'impressions'],
            how='left',
        )
        return rank_similarity_inside_session(merged)
Example #4
0
def similarity_all():
    """Combine all pairwise similarity tables into ``t_sim``.

    The item pair table and each individual similarity table are loaded in
    order, joined with ``merge_all`` (left join on ``item_id`` and
    ``item_id_anchor``) and the combined frame is saved to ``t_sim``.
    Nothing is returned.
    """
    # The pair table comes first so that it drives the left joins.
    sources = (
        t_pair,
        t_sim_item_meta_cos,
        t_sim_co_imp,
        t_sim_co_int,
        t_sim_wv_imp,
        t_sim_wv_int,
    )
    frames = [source.load() for source in sources]

    combined = merge_all(frames,
                         on=['item_id', 'item_id_anchor'],
                         how='left')
    t_sim.save(combined)
Example #5
0
def concat_lgb_20190522_3(tt):
    """Assemble the ``names_lgb_20190522_3`` feature set for the split ``tt``.

    Session-level, item-level and session/item-level feature stores are each
    outer-joined among themselves with ``merge_all`` and then left-joined
    onto the classification target table.  Three price-ratio columns
    (``<col>_div`` = session interaction price statistic divided by
    ``price``) are appended before the frame is returned.

    Every store is asked for the same column list (join keys plus all
    feature names); presumably the loader skips names a store does not
    contain -- confirm against the store implementation.
    """
    from feat_names import names_lgb_20190522_3 as feats

    wanted = ['session_id', 'item_id'] + feats
    base = t_tr_te_classify[tt].load()

    # Features keyed by session only.
    session_stores = (
        f_sess_basic,
        f_sess_imp,
        f_sess_int,
        f_sess_imp_eq,
        f_sess_int_price,
        f_sess_le,
    )
    session_feats = merge_all(
        [store[tt].load(columns=wanted) for store in session_stores],
        on='session_id', how='outer')

    # Features keyed by item only; these stores are not split by tt.
    item_stores = (
        f_item_expo_cnt,
        f_item_int_cnt,
        f_item_meta_gte_50000,
    )
    item_feats = merge_all(
        [store.load(columns=wanted) for store in item_stores],
        on='item_id', how='outer')

    # Features keyed by the (session, item) pair.
    pair_stores = (
        f_si_basic,
        f_si_diff_last_int,
        f_si_first_last,
        f_si_int,
        f_si_diff_imp_price,
    )
    pair_feats = merge_all(
        [store[tt].load(columns=wanted) for store in pair_stores],
        on=['session_id', 'item_id'], how='outer')

    result = base.merge(session_feats, on='session_id', how='left')
    result = result.merge(item_feats, on='item_id', how='left')
    result = result.merge(pair_feats, on=['session_id', 'item_id'], how='left')

    # Express the session's interaction price statistics relative to the
    # price of the candidate row.
    price_stats = (
        'session_interaction_item_price_min',
        'session_interaction_item_price_max',
        'session_interaction_item_price_mean',
    )
    for stat in price_stats:
        result['%s_div' % stat] = result[stat] / result['price']

    return result
Example #6
0
def merge_te_target(dd_te, dd_target):
    """Look up ``item_win_ratio`` for every anchor column of ``dd_target``.

    For each anchor column, ``dd_target`` is left-joined against ``dd_te``
    on ``(item_id, anchor) == (item_id, item_id_anchor)`` and the matched
    ``item_win_ratio`` is renamed to ``<anchor>_item_win_ratio``.  The
    per-anchor frames are combined with ``merge_all`` on
    ``session_id``/``item_id``.

    Parameters:
        dd_te: frame providing ``item_id``, ``item_id_anchor`` and
            ``item_win_ratio``.
        dd_target: frame providing ``session_id``, ``item_id`` and the four
            anchor columns.

    Returns:
        The frame produced by ``merge_all`` holding the join keys and one
        win-ratio column per anchor.
    """
    anchor_cols = (
        'item_id_impression_prev',
        'item_id_impression_first',
        'item_id_interaction_last',
        'item_id_interaction_most',
    )

    # Only these columns of dd_te take part in the join.
    win_ratios = dd_te[['item_id', 'item_id_anchor', 'item_win_ratio']]

    def _win_ratio_for(anchor):
        # Join on the (item, anchor item) pair, then keep just the keys and
        # the anchor-specific win ratio.
        joined = dd_target[['session_id', 'item_id', anchor]].merge(
            win_ratios,
            left_on=['item_id', anchor],
            right_on=['item_id', 'item_id_anchor'],
            how='left',
        )
        joined = joined.rename(
            columns={'item_win_ratio': '%s_item_win_ratio' % anchor})
        return joined.drop(['item_id_anchor', anchor], axis=1)

    per_anchor = [_win_ratio_for(anchor) for anchor in anchor_cols]
    return merge_all(per_anchor, on=['session_id', 'item_id'], how='left')
Example #7
0
def compare_pair():
    """Compute candidate-vs-anchor feature ratios for each (session, item).

    For the 'train' and 'test' splits, the ``m1_*`` columns of the top-100
    feature store are loaded and renamed to plain names keyed by
    ``item_id``.  They are inner-joined onto the pair table once for the
    candidate item and once per anchor column for the anchor item.  For each
    compared feature the ratio candidate / anchor is stored as
    ``<anchor>_<feature>_div`` with ``+inf`` replaced by ``-1``.  The
    per-anchor frames are outer-joined with ``merge_all`` and saved to
    ``f_si_cmp[tt]``.  Nothing is returned.
    """
    # Anchor columns in use.  'item_id_impression_next' and
    # 'item_id_interaction_first' are intentionally not part of this list.
    anchor_cols = (
        'item_id_impression_prev',
        'item_id_impression_first',
        'item_id_interaction_last',
        'item_id_interaction_most',
    )

    compared = [
        'price', 'price_rank', 'item_rank',
        'session_item_interaction_count'
    ]

    for tt in ('train', 'test'):
        stored_names = ['session_id', 'impressions'] + \
            ['m1_%s' % name for name in compared]

        feats = f_top100[tt].load(columns=stored_names)
        # Positional rename: relies on the loader returning the columns in
        # the requested order.
        feats.columns = ['session_id', 'item_id'] + compared

        pairs = t_tr_te_pair[tt].load()
        # Candidate-side feature values.
        pairs = pd.merge(pairs, feats, on=['session_id', 'item_id'])

        per_anchor = []
        for anchor in anchor_cols:
            # Re-key the feature table by the anchor column and suffix its
            # values so they can sit next to the candidate's values.
            to_anchor = {name: '%s_anchor' % name for name in compared}
            to_anchor['item_id'] = anchor
            joined = pd.merge(pairs, feats.rename(columns=to_anchor),
                              on=['session_id', anchor])

            ratio_cols = []
            for name in compared:
                ratio_col = '%s_%s_div' % (anchor, name)
                ratio = joined[name] / joined[to_anchor[name]]
                joined[ratio_col] = ratio.replace(np.inf, -1)
                ratio_cols.append(ratio_col)

            per_anchor.append(joined[['session_id', 'item_id'] + ratio_cols])

        combined = merge_all(per_anchor,
                             on=['session_id', 'item_id'],
                             how='outer')
        f_si_cmp[tt].save(combined)
Example #8
0
def session_item_first_last():
    """Flag the first and last referenced item of every session.

    For the 'train' and 'test' splits, rows whose ``reference`` is a
    non-null, all-digit string are kept and the reference is cast to an
    integer ``item_id``.  The first and the last ``item_id`` per session (in
    stored row order) are marked with ``session_item_first = 1`` and
    ``session_item_last = 1`` respectively, outer-joined with ``merge_all``
    and saved to ``f_si_first_last[tt]``.  Nothing is returned.
    """
    for tt in ('train', 'test'):
        steps = t_tr_te_flt_step[tt].load(
            columns=['session_id', 'reference'])

        # Keep only references that look like item ids.
        reference = steps['reference']
        is_item = (~reference.isnull()) & (reference.str.isdigit())

        items = steps[is_item].rename(columns={'reference': 'item_id'})
        items['item_id'] = items['item_id'].astype(int)

        item_by_session = items.groupby('session_id')['item_id']

        first_item = item_by_session.first().reset_index()
        first_item['session_item_first'] = 1

        last_item = item_by_session.last().reset_index()
        last_item['session_item_last'] = 1

        flags = merge_all([first_item, last_item],
                          on=['session_id', 'item_id'], how='outer')
        f_si_first_last[tt].save(flags)
Example #9
0
def session_item_diff_last_interaction_index():
    """Compute each candidate's row offset from the last-interacted item.

    For the 'train' and 'test' splits, every row of the classification
    target table gets a running ``index``.  For each "last interaction"
    column, the row of the target table that holds that item is located,
    and every candidate row of the same session receives:

    * ``<col>_index_diff``        -- candidate index minus anchor index
    * ``<col>_index_diff_gt_0``   -- 1 if the difference is > 0, else 0
    * ``<col>_index_diff_gte_0``  -- 1 if the difference is >= 0, else 0

    Sessions whose last-interaction item is null or not among the target
    rows drop out of that column's frame (inner joins).  The per-column
    frames are outer-joined with ``merge_all`` and saved to
    ``f_si_diff_last_int[tt]``.  Nothing is returned.
    """
    last_item_cols = [
        'session_last_interaction_item',
        'session_last_interaction_item_image_item',
        'session_last_interaction_item_info_item',
        'session_last_interaction_item_rating_item',
        'session_last_interaction_item_deals_item',
        'session_last_search_for_item_item',
        'session_last_clickout_item_item',
    ]

    for tt in ('train', 'test'):
        target = t_tr_te_classify[tt].load()
        target['index'] = np.arange(target.shape[0])
        positions = target[['session_id', 'item_id', 'index']]

        last_int = t_tr_te_sess_last_int[tt].load()

        feature_frames = []
        for col in last_item_cols:
            anchor_idx = col + '_index'
            diff = col + '_index_diff'
            diff_gt = diff + '_gt_0'
            diff_gte = diff + '_gte_0'

            # Sessions that actually have a last item of this kind.
            anchors = last_int[['session_id', col]].rename(
                columns={col: 'item_id'})
            anchors = anchors[~anchors['item_id'].isnull()]
            anchors['item_id'] = anchors['item_id'].astype(int)

            # Row index of the anchor item inside the target table.
            anchors = pd.merge(positions, anchors,
                               on=['session_id', 'item_id'], how='inner')
            anchors = anchors.rename(columns={'index': anchor_idx})
            # Each session must contribute at most one anchor row.
            assert anchors.shape[0] == anchors.session_id.unique().shape[0]

            # Spread the anchor index over all candidates of the session.
            offsets = pd.merge(positions, anchors[['session_id', anchor_idx]],
                               on='session_id', how='inner')
            offsets[diff] = offsets['index'] - offsets[anchor_idx]
            offsets[diff_gt] = (offsets[diff] > 0) * 1
            offsets[diff_gte] = (offsets[diff] >= 0) * 1

            feature_frames.append(
                offsets[['session_id', 'item_id', diff, diff_gt, diff_gte]])

        combined = merge_all(feature_frames,
                             on=['session_id', 'item_id'], how='outer')
        f_si_diff_last_int[tt].save(combined)
Example #10
0
def session_item_interaction():
    """Build per-(session, item) interaction counts and first/last positions.

    For the 'train' and 'test' splits, rows whose ``reference`` is a
    non-null, all-digit string are kept and cast to an integer ``item_id``.
    For the pseudo action ``'interaction'`` (all kept rows) and for each
    concrete action type, the following are computed per (session, item):

    * ``session_item_<action>_count``
    * ``session_item_first_<action>_{step,timestamp}``
    * ``session_item_last_<action>_{step,timestamp}``

    After joining the target's ``step``/``timestamp`` per session, a
    ``*_diff_target`` column (target value minus feature value, float32) is
    added for every first/last column.  The result is saved to
    ``f_si_int[tt]``; nothing is returned.
    """
    action_types = [
        'interaction',
        'interaction item image',
        'interaction item info',
        'interaction item rating',
        'interaction item deals',
        'search for item',
        'clickout item',
    ]

    for tt in ('train', 'test'):
        # The target table is used as loaded (no de-duplication per
        # session); several target rows for one session would repeat the
        # feature rows in the left join below.
        target = t_tr_te_target[tt].load(
            columns=['session_id', 'step', 'timestamp'])

        events = t_tr_te_flt[tt].load(
            columns=['session_id', 'reference', 'action_type',
                     'step', 'timestamp'])

        # Keep only references that look like item ids.
        is_item = (~events['reference'].isnull()) & \
            (events['reference'].str.isdigit())
        events = events[is_item].rename(columns={'reference': 'item_id'})
        events['item_id'] = events['item_id'].astype(int)

        parts = []
        for action in action_types:
            tag = action.replace(' ', '_')

            # 'interaction' means "any action"; otherwise filter by type.
            if action == 'interaction':
                subset = events
            else:
                subset = events[events['action_type'] == action]

            counts = subset.groupby(
                ['session_id', 'item_id'])['action_type'].count()
            counts.name = 'session_item_%s_count' % tag
            parts.append(counts.reset_index())

            positions = subset[
                ['session_id', 'item_id', 'step', 'timestamp']
            ].groupby(['session_id', 'item_id'])

            first_seen = positions.first().reset_index().rename(columns={
                'step': 'session_item_first_%s_step' % tag,
                'timestamp': 'session_item_first_%s_timestamp' % tag,
            })
            parts.append(first_seen)

            last_seen = positions.last().reset_index().rename(columns={
                'step': 'session_item_last_%s_step' % tag,
                'timestamp': 'session_item_last_%s_timestamp' % tag,
            })
            parts.append(last_seen)

        features = merge_all(parts, on=['session_id', 'item_id'], how='outer')
        features = pd.merge(features, target, on='session_id', how='left')

        # Distance (in steps / time) from each first/last event to the
        # target row of the session.
        for action in action_types:
            tag = action.replace(' ', '_')
            for which in ('first', 'last'):
                for unit in ('step', 'timestamp'):
                    source = 'session_item_%s_%s_%s' % (which, tag, unit)
                    delta = features[unit] - features[source]
                    features['%s_diff_target' % source] = \
                        delta.astype('float32')

        # NOTE(review): this overwrites the 'session_item_interaction_count'
        # produced by the 'interaction' pass above, and the sum leaves out
        # the 'interaction item rating' and 'interaction item deals' counts.
        # Unclear whether that omission is intended -- confirm before
        # changing, since downstream features depend on the current values.
        summed = [
            'session_item_interaction_item_image_count',
            'session_item_interaction_item_info_count',
            'session_item_search_for_item_count',
            'session_item_clickout_item_count',
        ]
        features['session_item_interaction_count'] = \
            features[summed].sum(axis=1)

        # The target's own step/timestamp were only needed for the diffs.
        features = features.drop(['step', 'timestamp'], axis=1)
        f_si_int[tt].save(features)