def load_data(tt):
    """Build the merged, session-ranked feature frame for one split.

    The classify table and the session-item tables have their ``item_id``
    column renamed to ``impressions`` so that every frame can be
    left-joined on ``['session_id', 'impressions']``.  The merged frame is
    then passed through ``rank_similarity_inside_session`` and
    ``rank_feat_inside_session`` (the latter for the win-ratio columns).

    NOTE(review): a later ``def load_data`` in this file rebinds the same
    name; confirm which definition callers are expected to get.

    :param tt: key into the table registries (other functions in this file
        iterate over ``'train'`` and ``'test'``).
    :return: whatever ``rank_feat_inside_session`` returns for the merged
        frame.
    """
    as_impressions = {'item_id': 'impressions'}

    # Load order is kept as-is; the first frame is the left side of the join.
    frames = [
        t_tr_te_classify[tt].load().rename(columns=as_impressions),
        f_m2_top30[tt].load(),
        f_m3_top30[tt].load(),
        f_top30[tt].load(),
        f_si_sim[tt].load().rename(columns=as_impressions),
        f_si_cmp[tt].load().rename(columns=as_impressions),
        f_si_win[tt].load().rename(columns=as_impressions),
    ]
    merged = merge_all(frames, on=['session_id', 'impressions'], how='left')
    merged = rank_similarity_inside_session(merged)

    win_ratio_cols = [
        'item_id_impression_prev_item_win_ratio',
        'item_id_impression_first_item_win_ratio',
        'item_id_interaction_last_item_win_ratio',
        'item_id_interaction_most_item_win_ratio',
    ]
    return rank_feat_inside_session(merged, win_ratio_cols)
def similarity_pair():
    """Attach item-to-anchor similarity features and save them per split.

    For both ``'train'`` and ``'test'``, each anchor column of the pair
    table is left-joined against the similarity table on
    ``(item_id, anchor)``.  The similarity columns are prefixed with the
    anchor column name, the per-anchor frames are merged on
    ``['session_id', 'item_id']`` and the result is saved to ``f_si_sim``.
    """
    # 'item_id_impression_next' and 'item_id_interaction_first' are
    # currently disabled as anchors.
    anchor_cols = [
        'item_id_impression_prev',
        'item_id_impression_first',
        'item_id_interaction_last',
        'item_id_interaction_most',
    ]
    sim_cols = [
        'item_meta_cos',
        'co_appearence_interaction_count',
        'co_appearence_impression_count',
        'similarity_wv_impression',
        'similarity_wv_interaction',
    ]
    join_keys = ['session_id', 'item_id']

    similarities = t_sim.load(columns=['item_id', 'item_id_anchor'] + sim_cols)

    for tt in ['train', 'test']:
        pairs = t_tr_te_pair[tt].load()
        per_anchor = []
        for anchor in anchor_cols:
            prefixed = {name: '%s_%s' % (anchor, name) for name in sim_cols}
            joined = pd.merge(
                pairs[['session_id', 'item_id', anchor]],
                similarities,
                left_on=['item_id', anchor],
                right_on=['item_id', 'item_id_anchor'],
                how='left',
            )
            joined = joined.rename(columns=prefixed)
            # Only the prefixed similarity columns and the join keys survive.
            joined = joined.drop(['item_id_anchor', anchor], axis=1)
            per_anchor.append(joined)

        features = merge_all(per_anchor, on=join_keys, how='left')
        f_si_sim[tt].save(features)
def load_data(tt):
    """Build the merged feature frame for one split from the top-100 table.

    The classify table and the session-item similarity table have
    ``item_id`` renamed to ``impressions``; all frames are left-joined on
    ``['session_id', 'impressions']`` and the result is passed through
    ``rank_similarity_inside_session``.

    NOTE(review): an earlier ``def load_data`` exists in this file; this
    later definition is the one bound to the name after import.

    :param tt: key into the table registries (other functions in this file
        iterate over ``'train'`` and ``'test'``).
    :return: whatever ``rank_similarity_inside_session`` returns.
    """
    as_impressions = {'item_id': 'impressions'}
    frames = [
        t_tr_te_classify[tt].load().rename(columns=as_impressions),
        f_top100[tt].load(),
        f_si_sim[tt].load().rename(columns=as_impressions),
    ]
    merged = merge_all(frames, on=['session_id', 'impressions'], how='left')
    return rank_similarity_inside_session(merged)
def similarity_all():
    """Combine every pairwise similarity table into ``t_sim``.

    ``t_pair`` is the left-most frame; each similarity table is left-joined
    onto it on ``['item_id', 'item_id_anchor']`` and the merged frame is
    saved to ``t_sim``.
    """
    sources = (
        t_pair,
        t_sim_item_meta_cos,
        t_sim_co_imp,
        t_sim_co_int,
        t_sim_wv_imp,
        t_sim_wv_int,
    )
    frames = [source.load() for source in sources]
    combined = merge_all(frames, on=['item_id', 'item_id_anchor'], how='left')
    t_sim.save(combined)
def concat_lgb_20190522_3(tt):
    """Assemble the ``lgb_20190522_3`` feature frame for one split.

    Session-level, item-level and session-item-level feature tables are
    each outer-merged within their group, then left-joined onto the
    classify table.  Finally three ``*_div`` columns are added: the
    session interaction price min/max/mean divided by ``price``.

    Every feature table is loaded with the same column request
    (``session_id``, ``item_id`` plus the feature-name list).
    NOTE(review): this presumably relies on ``load`` ignoring requested
    columns a table does not have - confirm against the loader.

    :param tt: key into the per-split table registries.
    :return: the classify frame with all selected features attached.
    """
    from feat_names import names_lgb_20190522_3 as feats

    wanted = ['session_id', 'item_id'] + feats
    base = t_tr_te_classify[tt].load()

    session_tables = (
        f_sess_basic,
        f_sess_imp,
        f_sess_int,
        f_sess_imp_eq,
        f_sess_int_price,
        f_sess_le,
    )
    session_feats = merge_all(
        [table[tt].load(columns=wanted) for table in session_tables],
        on='session_id', how='outer')

    # Item-level tables are not split by train/test.
    item_tables = (
        f_item_expo_cnt,
        f_item_int_cnt,
        f_item_meta_gte_50000,
    )
    item_feats = merge_all(
        [table.load(columns=wanted) for table in item_tables],
        on='item_id', how='outer')

    session_item_tables = (
        f_si_basic,
        f_si_diff_last_int,
        f_si_first_last,
        f_si_int,
        f_si_diff_imp_price,
    )
    session_item_feats = merge_all(
        [table[tt].load(columns=wanted) for table in session_item_tables],
        on=['session_id', 'item_id'], how='outer')

    result = base.merge(session_feats, on='session_id', how='left')
    result = result.merge(item_feats, on='item_id', how='left')
    result = result.merge(session_item_feats,
                          on=['session_id', 'item_id'], how='left')

    price_stats = (
        'session_interaction_item_price_min',
        'session_interaction_item_price_max',
        'session_interaction_item_price_mean',
    )
    for stat in price_stats:
        result['%s_div' % stat] = result[stat] / result['price']
    return result
def merge_te_target(dd_te, dd_target):
    """Look up ``item_win_ratio`` for each (item, anchor) pair of a target.

    For every anchor column, ``dd_target`` is left-joined with ``dd_te`` on
    ``item_id`` and anchor == ``item_id_anchor``; the matched
    ``item_win_ratio`` is renamed to ``<anchor>_item_win_ratio``.  The
    per-anchor frames are then merged on ``['session_id', 'item_id']``.

    :param dd_te: frame with ``item_id``, ``item_id_anchor`` and
        ``item_win_ratio`` columns.
    :param dd_target: frame with ``session_id``, ``item_id`` and the four
        anchor columns.
    :return: the frame produced by ``merge_all`` over the per-anchor joins.
    """
    anchor_cols = (
        'item_id_impression_prev',
        'item_id_impression_first',
        'item_id_interaction_last',
        'item_id_interaction_most',
    )
    # The right-hand side of the join is the same for every anchor.
    win_ratios = dd_te[['item_id', 'item_id_anchor', 'item_win_ratio']]

    per_anchor = []
    for anchor in anchor_cols:
        joined = pd.merge(
            dd_target[['session_id', 'item_id', anchor]],
            win_ratios,
            left_on=['item_id', anchor],
            right_on=['item_id', 'item_id_anchor'],
            how='left',
        )
        joined = joined.rename(
            columns={'item_win_ratio': '%s_item_win_ratio' % anchor})
        joined = joined.drop(['item_id_anchor', anchor], axis=1)
        per_anchor.append(joined)

    return merge_all(per_anchor, on=['session_id', 'item_id'], how='left')
def compare_pair():
    """Save item-vs-anchor ratio features for the train and test splits.

    For each split, the ``m1_*`` comparison columns of ``f_top100`` are
    loaded and renamed to their bare names (``impressions`` becomes
    ``item_id``).  They are inner-joined onto the pair table twice: once
    for the candidate item and once for each anchor item.  For every
    anchor column and comparison column the feature
    ``<anchor>_<column>_div`` is ``item value / anchor value``, with
    ``+inf`` (division by zero) replaced by ``-1``.  The per-anchor frames
    are outer-merged on ``['session_id', 'item_id']`` and saved to
    ``f_si_cmp``.
    """
    # 'item_id_impression_next' and 'item_id_interaction_first' are
    # currently disabled as anchors.
    cols_pair = [
        'item_id_impression_prev',
        'item_id_impression_first',
        'item_id_interaction_last',
        'item_id_interaction_most',
    ]
    cols_compare = [
        'price',
        'price_rank',
        'item_rank',
        'session_item_interaction_count',
    ]
    key_cols = ['session_id', 'item_id']

    # Both mappings are independent of the split and of the anchor column.
    source_cols = ['session_id', 'impressions'] + \
        ['m1_%s' % c for c in cols_compare]
    source_to_plain = {'m1_%s' % c: c for c in cols_compare}
    source_to_plain['impressions'] = 'item_id'
    plain_to_anchor = {c: '%s_anchor' % c for c in cols_compare}

    for tt in ['train', 'test']:
        # Rename by name rather than by position so the result does not
        # depend on the order in which the loader returns the columns.
        df = f_top100[tt].load(columns=source_cols)
        df = df.rename(columns=source_to_plain)

        df_pair = t_tr_te_pair[tt].load()
        df_pair = pd.merge(df_pair, df, on=key_cols)

        dd_lst = []
        for col in cols_pair:
            # View of df keyed by the anchor item instead of the candidate.
            anchor_rename = dict(plain_to_anchor, item_id=col)
            dd_c = pd.merge(df_pair, df.rename(columns=anchor_rename),
                            on=['session_id', col])

            feat_cols = []
            for cc in cols_compare:
                col_n = '%s_%s_div' % (col, cc)
                ratio = dd_c[cc] / dd_c[plain_to_anchor[cc]]
                dd_c[col_n] = ratio.replace(np.inf, -1)
                feat_cols.append(col_n)
            dd_lst.append(dd_c[key_cols + feat_cols])

        dd = merge_all(dd_lst, on=key_cols, how='outer')
        f_si_cmp[tt].save(dd)
def session_item_first_last():
    """Flag the first and last referenced item of every session.

    For both splits, rows whose ``reference`` is non-null and all digits
    are kept and the reference is cast to an integer ``item_id``.  The
    first and last ``item_id`` per session get ``session_item_first = 1``
    and ``session_item_last = 1`` respectively; the two frames are
    outer-merged on ``['session_id', 'item_id']`` and saved to
    ``f_si_first_last``.
    """
    for tt in ['train', 'test']:
        steps = t_tr_te_flt_step[tt].load(columns=['session_id', 'reference'])

        has_reference = ~steps['reference'].isnull()
        is_item_reference = steps['reference'].str.isdigit()
        items = steps[has_reference & is_item_reference]
        items = items.rename(columns={'reference': 'item_id'})
        items['item_id'] = items['item_id'].astype(int)

        item_by_session = items.groupby('session_id')['item_id']

        first_items = item_by_session.first().reset_index()
        first_items['session_item_first'] = 1

        last_items = item_by_session.last().reset_index()
        last_items['session_item_last'] = 1

        flags = merge_all([first_items, last_items],
                          on=['session_id', 'item_id'], how='outer')
        f_si_first_last[tt].save(flags)
def session_item_diff_last_interaction_index():
    """Save row-index distances between each item and the last-interacted item.

    For both splits every row of the classify table gets a running
    ``index`` (0..n-1).  For each "last interaction" column, the row index
    of that item inside its session is located and
    ``<col>_index_diff = index - <col>_index`` is computed for every row of
    the same session, together with ``*_gt_0`` / ``*_gte_0`` 0/1 flags.
    The per-column frames are outer-merged on ``['session_id', 'item_id']``
    and saved to ``f_si_diff_last_int``.

    Sessions whose last-interaction item is null, or does not appear in the
    classify table, produce no rows for that column (inner joins).

    :raises AssertionError: if a last-interaction item matches more than
        one classify row within a session.
    """
    last_item_cols = [
        'session_last_interaction_item',
        'session_last_interaction_item_image_item',
        'session_last_interaction_item_info_item',
        'session_last_interaction_item_rating_item',
        'session_last_interaction_item_deals_item',
        'session_last_search_for_item_item',
        'session_last_clickout_item_item',
    ]
    join_keys = ['session_id', 'item_id']

    for tt in ['train', 'test']:
        target = t_tr_te_classify[tt].load()
        target['index'] = np.arange(target.shape[0])
        positions = target[['session_id', 'item_id', 'index']]

        last_items = t_tr_te_sess_last_int[tt].load()

        per_column = []
        for col in last_item_cols:
            anchor_index = '%s_index' % col
            diff = '%s_index_diff' % col
            diff_gt = '%s_gt_0' % diff
            diff_gte = '%s_gte_0' % diff

            # Row position of the last-interacted item inside the target.
            anchor = last_items[['session_id', col]].rename(
                columns={col: 'item_id'})
            anchor = anchor.dropna(subset=['item_id'])
            anchor['item_id'] = anchor['item_id'].astype(int)
            anchor = pd.merge(positions, anchor, on=join_keys, how='inner')
            anchor = anchor.rename(columns={'index': anchor_index})
            assert anchor.shape[0] == anchor.session_id.unique().shape[0]

            # Broadcast the anchor position to every row of the session.
            feats = pd.merge(positions, anchor[['session_id', anchor_index]],
                             on='session_id', how='inner')
            feats[diff] = feats['index'] - feats[anchor_index]
            feats[diff_gt] = (feats[diff] > 0) * 1
            feats[diff_gte] = (feats[diff] >= 0) * 1

            per_column.append(feats[join_keys + [diff, diff_gt, diff_gte]])

        df_feat = merge_all(per_column, on=join_keys, how='outer')
        f_si_diff_last_int[tt].save(df_feat)
def session_item_interaction():
    """Save per-(session, item) interaction counts and first/last positions.

    For both splits, rows whose ``reference`` is non-null and all digits
    are kept, with the reference cast to an integer ``item_id``.  For the
    aggregate ``'interaction'`` pseudo-action (all kept rows) and for each
    concrete action type, three frames are built per (session, item):

    * ``session_item_<action>_count``,
    * ``session_item_first_<action>_step`` / ``_timestamp``,
    * ``session_item_last_<action>_step`` / ``_timestamp``.

    They are outer-merged on ``['session_id', 'item_id']``, the target
    ``step`` / ``timestamp`` are left-joined by session, and a float32
    ``*_diff_target`` column (target value minus feature value) is added
    for every first/last column.  The target ``step`` / ``timestamp``
    columns are dropped before saving to ``f_si_int``.

    NOTE(review): the final assignment overwrites the
    ``session_item_interaction_count`` column already produced for the
    aggregate action with the sum of only the image, info, search and
    clickout counts (rating and deals are not included) - confirm that
    this is intended.
    """
    action_types = [
        'interaction',
        'interaction item image',
        'interaction item info',
        'interaction item rating',
        'interaction item deals',
        'search for item',
        'clickout item',
    ]
    join_keys = ['session_id', 'item_id']
    position_cols = ['step', 'timestamp']

    for tt in ['train', 'test']:
        target = t_tr_te_target[tt].load(
            columns=['session_id', 'step', 'timestamp'])
        # De-duplicating the target per session is currently disabled:
        # target = target.drop_duplicates('session_id', keep='last')

        events = t_tr_te_flt[tt].load(
            columns=['session_id', 'reference', 'action_type',
                     'step', 'timestamp'])
        has_reference = ~events['reference'].isnull()
        is_item_reference = events['reference'].str.isdigit()
        events = events[has_reference & is_item_reference]
        events = events.rename(columns={'reference': 'item_id'})
        events['item_id'] = events['item_id'].astype(int)

        pieces = []
        for action in action_types:
            slug = action.replace(' ', '_')
            if action == 'interaction':
                # Aggregate over every action type.
                subset = events
            else:
                subset = events[events['action_type'] == action]

            counts = subset.groupby(join_keys)['action_type'].count()
            pieces.append(
                counts.reset_index(name='session_item_%s_count' % slug))

            by_pair = subset[join_keys + position_cols].groupby(join_keys)
            for which, frame in (('first', by_pair.first()),
                                 ('last', by_pair.last())):
                renamed = frame.reset_index().rename(columns={
                    'step': 'session_item_%s_%s_step' % (which, slug),
                    'timestamp':
                        'session_item_%s_%s_timestamp' % (which, slug),
                })
                pieces.append(renamed)

        df_feat = merge_all(pieces, on=join_keys, how='outer')
        df_feat = df_feat.merge(target, on='session_id', how='left')

        # Distance from each first/last occurrence to the target row.
        for action in action_types:
            slug = action.replace(' ', '_')
            for which in ['first', 'last']:
                for position in position_cols:
                    source = 'session_item_%s_%s_%s' % (which, slug, position)
                    delta = df_feat[position] - df_feat[source]
                    df_feat['%s_diff_target' % source] = \
                        delta.astype('float32')

        summed_counts = [
            'session_item_interaction_item_image_count',
            'session_item_interaction_item_info_count',
            'session_item_search_for_item_count',
            'session_item_clickout_item_count',
        ]
        df_feat['session_item_interaction_count'] = \
            df_feat[summed_counts].sum(axis=1)

        df_feat = df_feat.drop(position_cols, axis=1)
        f_si_int[tt].save(df_feat)