def extract_feature(self):
    """Build the 'last_action_involving_impression' feature.

    For every impression shown in each session's last clickout, report the
    action type of the most recent interaction the user had with that item
    before the final part of the session, or 'no_action' if there was none.

    Returns a DataFrame with columns:
    user_id, session_id, item_id, last_action_involving_impression.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # remove last clks and last part of session
    new_df = remove_last_part_of_clk_sessions(df)
    new_df = new_df.drop(find(new_df))
    # keep only interactions whose reference is an item id (numeric string)
    no_last_clks_numeric = new_df[
        new_df.reference.str.isnumeric() == True][[
            'user_id', 'session_id', 'action_type', 'reference'
        ]]
    # we want to make it fast, avoid any loops...
    # simply drop duplicates and mantain last occurence
    # of the tuple user-session-item :D
    last_actions = no_last_clks_numeric.drop_duplicates(
        ['user_id', 'session_id', 'reference'], keep='last')
    last_actions = last_actions.rename(
        columns={
            'reference': 'item_id',
            'action_type': 'last_action_involving_impression'
        })
    last_actions.item_id = last_actions.item_id.astype(int)
    # get last clickouts and expand (one row per impressed item)
    last_clk = df.loc[find(df)]
    clk_expanded = expand_impressions(last_clk)[[
        'user_id', 'session_id', 'item_id'
    ]]
    # now simply merge and fill NaNs with 'no_action' as in the original feature
    feature = pd.merge(clk_expanded,
                       last_actions,
                       how='left',
                       on=['user_id', 'session_id', 'item_id'])
    # left-merge leaves NaN for impressions never interacted with
    feature.last_action_involving_impression = feature.last_action_involving_impression.astype(
        object).fillna('no_action')
    return feature
def remove_last_part_of_clk_sessions(df):
    """
    This function takes a dataframe and removes the interactions that
    occur after the last clickout of each session.

    Relies on `find(df)` returning the row index of the last clickout of
    every session; rows of those sessions with a positional index greater
    than their session's last-clickout index are dropped.
    """
    # canonical ordering so that "after the clickout" == "greater row index"
    df = df.sort_values(
        by=['user_id', 'session_id', 'timestamp', 'step']).reset_index(
            drop=True)
    last_indices = find(df)
    last_clks = df.loc[last_indices]
    clks_sessions = last_clks.session_id.unique().tolist()
    clks_users = last_clks.user_id.unique().tolist()
    # restrict to sessions that actually contain a last clickout
    df_last_clks_sess_only = df[(df.session_id.isin(clks_sessions))
                                & (df.user_id.isin(clks_users))][[
                                    'user_id', 'session_id', 'action_type'
                                ]]
    # one row per session, used to attach that session's last-clickout index;
    # NOTE(review): assumes sorted(last_indices) lines up 1:1 with the
    # first-occurrence order of the sessions — holds after the sort above
    df_last_clks_sess_only_no_dupl = df_last_clks_sess_only.drop_duplicates(
        ['user_id', 'session_id'])
    df_last_clks_sess_only_no_dupl['last_index'] = sorted(last_indices)
    df_last_clks_sess_only_no_dupl = df_last_clks_sess_only_no_dupl.drop(
        'action_type', 1)
    # broadcast last_index back onto every row of its session, keeping the
    # original row labels so they can be dropped from df
    merged = pd.merge(df_last_clks_sess_only,
                      df_last_clks_sess_only_no_dupl,
                      how='left',
                      on=['user_id', 'session_id'
                          ]).set_index(df_last_clks_sess_only.index)
    indices_to_remove = []
    for t in tqdm(zip(merged.index, merged.last_index)):
        # row comes after its session's last clickout -> remove it
        if t[0] > t[1]:
            indices_to_remove.append(t[0])
    return df.drop(indices_to_remove)
def extract_feature(self):
    """Count, for every accomodation in the catalogue, how many
    non-final clickouts reference it across train+test.

    Returns a DataFrame with columns:
    item_id, top_pop_interaction_clickout_per_impression.
    Items never clicked keep a count of 0.
    """
    # start from the full accomodation catalogue with a zeroed counter,
    # so items that are never clicked still appear in the output
    catalogue = ImpressionFeature(self.mode).read_feature()
    catalogue = catalogue.drop(['properties'], axis=1)
    catalogue['popularity'] = 0
    click_counts = dict(
        zip(catalogue.item_id.values, catalogue.popularity.values))

    full_df = pd.concat([
        data.train_df(mode=self.mode, cluster=self.cluster),
        data.test_df(mode=self.mode, cluster=self.cluster)
    ])
    # discard each session's last clickout, then keep the remaining
    # clickouts that actually carry a reference
    without_last = full_df.drop(find(full_df))
    clickouts = without_last[(without_last.action_type == 'clickout item')
                             & ~(without_last.reference.isnull())]
    for ref in clickouts.reference.values:
        click_counts[int(ref)] += 1

    return pd.DataFrame(
        list(click_counts.items()),
        columns=['item_id', 'top_pop_interaction_clickout_per_impression'])
def extract_feature(self):
    """Load the RNN model's scores from the stacking directory and attach
    user_id/session_id to each scored impression.

    Reads npy dumps matching 'rnn*' under <stacking>/<mode>/test, keeps only
    rows whose 'index' is a last-clickout index, and expands them via
    assign_score. Returns columns: user_id, session_id, item_id, score_rnn.
    """
    self.current_directory = Path(__file__).absolute().parent
    self.data_dir = self.current_directory.joinpath(
        '..', '..', 'stacking', self.mode)
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_indices = find(df)
    # extract scores
    # NOTE(review): if several 'rnn*' files match, only the last one
    # globbed survives the loop — presumably exactly one file is expected
    self.train_dir = self.data_dir.joinpath('test')
    for file in glob.glob(str(self.train_dir) + '/rnn*'):
        rnn = np.load(file)
        rnn = pd.DataFrame(
            rnn, columns=['index', 'item_recommendations', 'scores'])
        rnn = rnn.astype({'index': int})
    # keep only scores that refer to a last clickout
    rnn = rnn[rnn['index'].isin(last_indices)]
    rnn_idx = list(rnn['index'])
    print(f'Rnn indices are : {len(set(rnn_idx))}')
    print(f'Last indices are : {len((last_indices))}')
    common = set(rnn_idx) & set(last_indices)
    print(f'In common : {common}') if False else print(f'In common : {len(common)}')
    t = assign_score(rnn, 'rnn')
    t = t.sort_values(by='index')
    # attach user/session identifiers via the original row index
    df['index'] = df.index.values
    df = df[['user_id', 'session_id', 'index']]
    df = pd.merge(t, df, how='left', on=['index'])
    num_idx = len(set(df['index'].values))
    print(num_idx)
    return df[['user_id', 'session_id', 'item_id', 'score_rnn']]
def save_folds(df, user_session_df, train_index, test_index, count, mode):
    """Materialize one cross-validation fold on disk.

    Splits `df` into train/test by the user_session keys selected through
    `train_index`/`test_index` of `user_session_df`, masks the reference of
    each test session's last clickout (so it becomes a prediction target),
    and saves csv files plus the numpy index arrays under
    dataset/preprocessed/fold_<count>/<mode>.

    Args:
        df: full interaction dataframe, must contain a 'user_session' column.
        user_session_df: dataframe mapping positions to 'user_session' keys.
        train_index, test_index: label indexers into user_session_df.
        count: fold number (used in the output path).
        mode: dataset mode (used in the output path).
    """
    u_s_train = list(
        user_session_df.loc[train_index]['user_session'].values)
    u_s_test = list(user_session_df.loc[test_index]['user_session'].values)

    path = 'dataset/preprocessed/{}/{}'.format('fold_' + str(count), mode)
    check_folder(path)

    train = df[df['user_session'].isin(u_s_train)]
    train = train.drop(['user_session'], axis=1)
    train.to_csv(os.path.join(path, 'train.csv'))
    train_indices = train.index.values
    np.save(os.path.join(path, 'train_indices'), train_indices)

    test = df[df['user_session'].isin(u_s_test)]
    target_indices = sorted(find(test))
    # FIX: .at only supports scalar label access; assigning through a list
    # of labels requires .loc (with .at this raises in current pandas)
    test.loc[target_indices, 'reference'] = np.nan
    test = test.drop(['user_session'], axis=1)
    test.to_csv(os.path.join(path, 'test.csv'))
    test_indices = test.index.values
    np.save(os.path.join(path, 'test_indices'), test_indices)

    np.save(os.path.join(path, 'target_indices'), target_indices)
    print(f'Train shape : {train.shape} , Test shape : {test.shape}')
    print(f'Last clickout indices : {len(target_indices)}')
def extract_feature(self):
    """Build the 'personalized_popularity' feature: for each impression of a
    session's last clickout, the number of clickouts that item received
    globally, minus 1 when the item is the one actually clicked in this
    session (so the target click does not count itself).

    Returns columns: user_id, session_id, item_id, personalized_popularity.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'reference',
                            'action_type', 'impressions']]
    # all clickouts with a numeric reference (includes the last ones)
    reference_rows = df[(df.reference.str.isnumeric() == True)
                        & (df.action_type == 'clickout item')]
    # global click count per item
    df_item_clicks = (
        reference_rows
        .groupby(["reference"])
        .size()
        .reset_index(name="n_interactions_per_item")
    )
    df_item_clicks = df_item_clicks.rename(columns={'reference': 'item_id'})
    df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
    #df_item_clicks
    clk_expanded = expand_impressions(clickout_rows)
    final_feature = pd.merge(clk_expanded, df_item_clicks, how='left',
                             on=['item_id']).fillna(0)
    final_feature.n_interactions_per_item = final_feature.n_interactions_per_item.astype(int)
    final_feature = final_feature.drop(['index'], axis=1)
    final_feature.reference = final_feature.reference.astype(int)
    new_column = []
    # subtract the session's own click from its clicked item's count
    for t in zip(final_feature.item_id, final_feature.reference,
                 final_feature.n_interactions_per_item):
        if t[0] == t[1]:
            new_column.append(int(t[2] - 1))
        else:
            new_column.append(int(t[2]))
    final_feature['personalized_popularity'] = new_column
    final_feature_reduced = final_feature[[
        'user_id', 'session_id', 'item_id', 'personalized_popularity'
    ]]
    return final_feature_reduced
def extract_feature(self):
    """Build the 'city_similarity' feature: similarity (cosine or euclidean,
    per self.metric) between the aggregated property vector of the city a
    clickout happened in and the property vector of each impressed item.

    Returns columns: user_id, session_id, item_id, city_similarity.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get all the cities
    cities = df['city'].unique().tolist()
    # get clickout rows (WITHOUT last clk)
    last_indices = find(df)
    df_non_last_clk = df.drop(last_indices)
    df_clickout = df_non_last_clk[
        (df_non_last_clk['action_type'] == 'clickout item')][['reference', 'city']]
    df_clickout = df_clickout.rename(columns={'reference': 'item_id'})
    df_clickout = df_clickout.dropna()  # remove NaNs, that should not be there anyways
    df_clickout.item_id = df_clickout.item_id.astype(int)
    # open impressions df
    # FIX: was hard-coded mode='small', which silently used the small
    # dataset's accomodations regardless of self.mode
    o = ImpressionFeature(mode=self.mode)
    df_accomodations = o.read_feature(True)
    # star properties are excluded from the similarity space
    df_accomodations = df_accomodations.drop([
        'properties1 Star', 'properties2 Star', 'properties3 Star',
        'properties4 Star', 'properties5 Star'
    ], 1)
    # get all clicks properties
    df_clicks_properties = pd.merge(df_clickout, df_accomodations,
                                    how='left', on=['item_id'])
    df_clicks_properties = df_clicks_properties.sort_values(by=['city'])
    df_clicks_properties = df_clicks_properties.drop('item_id', 1)
    # sum all properties per city
    grouped_by_city = df_clicks_properties.groupby('city').sum()
    # create df with city:array_of_features
    df_city_features = pd.DataFrame(columns=['city', 'properties_array'])
    df_city_features.city = grouped_by_city.index
    df_city_features.properties_array = grouped_by_city.values.tolist()
    # now take last clk df
    clickout_rows = df.loc[last_indices,
                           ['user_id', 'session_id', 'city', 'action_type',
                            'impressions']][df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows)
    clk_expanded_wt_city_feat = pd.merge(clk_expanded, df_city_features,
                                         how='left', on=['city'])
    # create df with item:array_of_features
    array = df_accomodations.drop(['item_id'], axis=1).values
    df_item_features = pd.DataFrame(columns=['item_id', 'features_array'])
    df_item_features['item_id'] = df_accomodations['item_id'].values
    df_item_features['features_array'] = list(array)
    final_feature = pd.merge(clk_expanded_wt_city_feat, df_item_features,
                             how='left', on=['item_id'])
    # cities with no click history get a zero vector
    # (152 = number of property columns — TODO confirm against the feature file)
    for n in tqdm(final_feature[
            final_feature['properties_array'].isnull()].index.tolist()):
        final_feature.at[n, 'properties_array'] = [0] * 152
    # cast list to numpy array to use the cosine (it's written for doubles)
    final_feature.properties_array = final_feature.properties_array.progress_apply(
        lambda x: np.asarray(x))
    # create new column
    new_col = []
    if self.metric == 'cosine':
        shrink = 0  # TRY ME
        for t in tqdm(zip(final_feature.properties_array,
                          final_feature.features_array)):
            new_col.append(
                cosine_similarity(t[0].astype(np.double),
                                  t[1].astype(np.double), shrink))
    if self.metric == 'euclidean':
        for t in tqdm(zip(final_feature.properties_array,
                          final_feature.features_array)):
            new_col.append(np.linalg.norm(t[0] - t[1]))
    # final feature
    new_feature = final_feature[['user_id', 'session_id', 'item_id']]
    new_feature['city_similarity'] = new_col
    return new_feature
def extract_feature(self):
    """Count, for every clicked item, how many times each sorting filter
    was active when the item was clicked (non-final clickouts only).

    Clickouts with none of the explicit sorting filters are attributed to
    'Sort by Popularity' (the trivago default ordering). Items never
    clicked are appended with all-zero counts so the feature covers the
    whole accomodation catalogue.
    """
    list_of_sorting_filters_wout_pop = [
        'Sort by Price', 'Sort by Distance', 'Sort by Rating', 'Best Value',
        'Focus on Rating', 'Focus on Distance'
    ]
    list_of_sorting_filters = [
        'Sort by Price', 'Sort by Distance', 'Sort by Rating', 'Best Value',
        'Focus on Rating', 'Focus on Distance', 'Sort by Popularity'
    ]

    def mask_sorting(x):
        # no explicit sorting filter active -> the default ordering was used
        if np.isin(x, list_of_sorting_filters_wout_pop).any():
            return x
        else:
            return ['Sort by Popularity']

    start = time.time()
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    indices_last_clks = find(df)
    # all clickouts except each session's last one
    d = df[df.action_type == 'clickout item'].drop(indices_last_clks)
    d_splitted = d.current_filters.progress_apply(
        lambda x: str(x).split('|'))
    md = d_splitted.progress_apply(mask_sorting)
    df_f = df.loc[md.index]
    df_ref = df_f.reference
    # per-item dict of filter -> count, initialized to zeros
    dict_ref_to_filters = dict(
        zip(df_ref.unique(),
            [dict(zip(list_of_sorting_filters,
                      np.zeros(len(list_of_sorting_filters))))
             for i in range(len(df_ref.unique()))]))
    for index, row in tqdm(df_f.iterrows(), total=df_f.shape[0]):
        for i in md.loc[index]:
            if i in list_of_sorting_filters:
                dict_ref_to_filters[row.reference][i] += 1
    df_feature = pd.DataFrame.from_dict(dict_ref_to_filters, orient='index')
    df_feature = df_feature.astype(int).reset_index().rename(
        index=str, columns={"index": "item_id"})
    # extend with never-clicked items so every accomodation gets a row
    set_of_not_clicked_items = set(data.accomodations_df().item_id) - set(
        df_feature.item_id)
    extension = pd.DataFrame(data=sorted(
        [i for i in set_of_not_clicked_items]), columns=['item_id'])
    extd = df_feature.append(extension, ignore_index=True, sort=True)
    f = extd.fillna(0).reset_index().drop(columns=['index'])
    # move item_id (last column after the sorted append) back to the front
    feature = f[np.insert(f.columns[:-1].values, 0,
                          f.columns[-1])].astype(int)
    _time = time.time() - start
    elapsed = time.strftime('%Mm %Ss', time.gmtime(_time))
    print(f"elapsed in: {elapsed}")
    return feature
def extract_feature(self):
    """Build the 'adj_perc_click_appeared' feature: for each impression of a
    session's last clickout, the percentage of times the item was clicked
    out of the times it appeared in impressions — adjusted by removing the
    current session's own click/appearance from the counts.

    Returns columns: user_id, session_id, item_id, adj_perc_click_appeared.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get ALL clickouts
    reference_rows = df[(df.reference.str.isnumeric() == True)
                        & (df.action_type == 'clickout item')][[
                            'user_id', 'session_id', 'reference', 'impressions'
                        ]]
    # get last clickout
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'reference', 'impressions']]
    clk_expanded = expand_impressions(clickout_rows)
    # count, over ALL clickouts, how many times each item appeared in impressions
    impression_lists = reference_rows.impressions.str.split('|').tolist()
    big_list = [x for l in impression_lists for x in l]
    c = dict(Counter(big_list))
    df_times_in_impressions = pd.DataFrame.from_dict(
        c, orient='index', columns=['number_of_times_in_impr'])
    df_times_in_impressions['item_id'] = df_times_in_impressions.index.astype(int)
    df_times_in_impressions = df_times_in_impressions.reindex(
        columns=['item_id', 'number_of_times_in_impr'])
    feature_times_per_imp = pd.merge(clk_expanded, df_times_in_impressions,
                                     how='left', on=['item_id']).fillna(0)
    feature_times_per_imp.number_of_times_in_impr = feature_times_per_imp.number_of_times_in_impr.astype(int)
    feature_times_per_imp = feature_times_per_imp[[
        'user_id', 'session_id', 'item_id', 'number_of_times_in_impr'
    ]]
    # count how many times each item was clicked
    df_item_clicks = (
        reference_rows
        .groupby(["reference"])
        .size()
        .reset_index(name="n_clickouts")
    )
    df_item_clicks = df_item_clicks.rename(columns={'reference': 'item_id'})
    df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
    merged = pd.merge(df_times_in_impressions, df_item_clicks,
                      how='left', on=['item_id']).fillna(0)
    merged.n_clickouts = merged.n_clickouts.astype(int)
    final_feature = pd.merge(clk_expanded, merged, how='left',
                             on=['item_id']).fillna(0)
    new_col = []
    final_feature.reference = final_feature.reference.astype(int)
    final_feature.item_id = final_feature.item_id.astype(int)
    for t in tqdm(zip(final_feature.reference, final_feature.item_id,
                      final_feature.number_of_times_in_impr,
                      final_feature.n_clickouts)):
        if t[0] == t[1]:
            # same reference: subtract 1 from both the click count and the
            # impression count (exclude this session's own clickout)
            if t[2] != 1:
                new_col.append(round(((t[3] - 1) * 100) / (t[2] - 1), 5))
            else:
                new_col.append(0)
        else:
            if 0 not in [t[2], t[3]] and t[2] != 1:
                new_col.append(round(((t[3]) * 100) / (t[2] - 1), 5))
            else:
                new_col.append(0)
    final_feature['adj_perc_click_appeared'] = new_col
    final_feature = final_feature[[
        'user_id', 'session_id', 'item_id', 'adj_perc_click_appeared'
    ]]
    return final_feature
def extract_feature(self):
    """Build the 'n_clicks_after_first_pos' feature: for each impression of a
    session's last clickout, the number of times that item was clicked while
    NOT being in the first impression position, decremented by one when the
    current session's own click was itself after position 1.

    Returns columns: user_id, session_id, item_id, n_clicks_after_first_pos.
    """

    def get_pos(item, rec):
        """1-based position of each item inside its impression list, -1 if absent."""
        res = np.empty(item.shape)
        for i in tqdm(range(len(item))):
            if str(item[i]) in rec[i]:
                res[i] = rec[i].index(str(item[i])) + 1
            else:
                res[i] = -1
        return res.astype(int)

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_clickout_indices = find(df)
    # FIX: was df[mask1][mask2] with a full-length boolean mask applied to a
    # subset — pandas chained boolean indexing with unaligned indexes is
    # deprecated/raising; combine the two conditions with & instead
    all_clk_rows = df[(df.reference.str.isnumeric() == True)
                      & (df.action_type == 'clickout item')]
    all_clk_rows = all_clk_rows[['user_id', 'session_id', 'reference', 'impressions']]
    all_clk_rows.impressions = all_clk_rows.impressions.str.split('|')
    pos_col = get_pos(all_clk_rows.reference.values,
                      all_clk_rows.impressions.values)
    all_clk_rows = all_clk_rows.drop('impressions', 1)
    all_clk_rows['position'] = pos_col
    # keep only the clicks that happened after the first position
    all_clk_rows_after_1 = all_clk_rows[all_clk_rows.position > 1]
    df_clicks_after_1 = (
        all_clk_rows_after_1
        .groupby(["reference"])
        .size()
        .reset_index(name="n_clicks_per_item")
    )
    df_clicks_after_1.reference = df_clicks_after_1.reference.astype(int)
    df_clicks_after_1 = df_clicks_after_1.rename(columns={'reference': 'item_id'})
    # expand the last clickouts and compute each impression's position
    last_clk_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'reference', 'impressions']]
    last_clk_rows['imp_list'] = last_clk_rows.impressions.str.split('|')
    clk_expanded = expand_impressions(last_clk_rows)
    clk_expanded = clk_expanded.drop('index', 1)
    pos_col = get_pos(clk_expanded.item_id.values, clk_expanded.imp_list.values)
    clk_expanded['position'] = pos_col
    clk_expanded = clk_expanded.drop('imp_list', 1)
    merged = pd.merge(clk_expanded, df_clicks_after_1,
                      how='left', on='item_id').fillna(0)
    new_col = []
    merged.item_id = merged.item_id.astype(int)
    merged.reference = merged.reference.astype(int)
    # exclude the current session's own after-first-position click
    for t in tqdm(zip(merged.item_id, merged.reference, merged.position,
                      merged.n_clicks_per_item)):
        if t[0] == t[1] and t[2] > 1:
            new_col.append(int(t[3] - 1))
        else:
            new_col.append(int(t[3]))
    merged['n_clicks_after_first_pos'] = new_col
    feature = merged[['user_id', 'session_id', 'item_id', 'n_clicks_after_first_pos']]
    return feature
def _fit(self, mode):
    """
    train, test and target indices are just sessions which have:
    - no num ref
    - more than 1 step

    Scans backwards from each last-clickout row, collecting the rows of the
    same user/session. A session is kept only if the scan leaves the session
    (i.e. no numeric reference was found on the way) AND it spans at least
    2 rows. The kept row indices are intersected with the train/test/target
    index sets and stored on self.
    """

    def RepresentsInt(s):
        # numeric reference == the row refers to a concrete item id
        try:
            int(s)
            return True
        except ValueError:
            return False

    train = data.train_df(mode)
    train_index = train.index.values
    test = data.test_df(mode)
    test_index = test.index.values
    tgt_indices = data.target_indices(mode)

    df = pd.concat([train, test])
    del train
    del test
    lst_clk_indices = sorted(find(df))

    to_return = []
    for idx in lst_clk_indices:
        usr_sess_indices = []
        try:
            a_user = df.at[idx, 'user_id']
            a_sess = df.at[idx, 'session_id']
            usr_sess_indices.append(idx)
        except:
            # row label missing from df — skip this clickout
            continue
        j = idx - 1
        while j >= 0:
            try:
                new_user = df.at[j, 'user_id']
                new_sess = df.at[j, 'session_id']
                if new_user == a_user and new_sess == a_sess:
                    usr_sess_indices.append(j)
                    reference = df.at[j, 'reference']
                    # a numeric reference disqualifies the whole session:
                    # break WITHOUT adding it to to_return
                    if RepresentsInt(reference):
                        break
                    j -= 1
                else:
                    # left the session: keep it only if it has >= 2 rows
                    if idx - j >= 2:
                        to_return += usr_sess_indices
                    break
            except:
                # missing label inside the scan range — skip it
                j -= 1
    self.train_indices = sorted(list(set(train_index) & set(to_return)))
    self.test_indices = sorted(list(set(test_index) & set(to_return)))
    self.target_indices = sorted(list(set(tgt_indices) & set(to_return)))
def extract_feature(self):
    """Build the 'percentage_of_total_plat_inter' feature: for each
    impression of a session's last clickout, the share (in %) of all
    numeric-reference interactions on that platform that involved the item.

    Last clickouts are excluded from the counts. Returns columns:
    user_id, session_id, item_id, percentage_of_total_plat_inter.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices, [
        'user_id', 'session_id', 'platform', 'action_type', 'impressions'
    ]]
    last_clk_removed_df = df.drop(last_clickout_indices)
    # interactions whose reference is an item id
    reference_rows = last_clk_removed_df[(
        last_clk_removed_df.reference.str.isnumeric() == True)]
    # interactions per (item, platform)
    df_item_clicks = (reference_rows.groupby(
        ["reference",
         "platform"]).size().reset_index(name="n_interactions_per_item"))
    df_item_clicks = df_item_clicks.rename(
        columns={'reference': 'item_id'})
    df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
    # total interactions per platform
    df_city_clicks = (
        reference_rows.groupby('platform').size().reset_index(
            name="n_interactions_per_plat"))
    final_df = pd.merge(df_item_clicks,
                        df_city_clicks,
                        how='left',
                        on=['platform']).fillna(0)
    final_df['percentage_of_total_plat_inter'] = 0.0
    for t in zip(final_df.index, final_df.n_interactions_per_item,
                 final_df.n_interactions_per_plat):
        percentage_of_total_plat_inter = round((t[1] * 100.0) / t[2], 2)
        final_df.at[
            t[0],
            'percentage_of_total_plat_inter'] = percentage_of_total_plat_inter
    feature = final_df[[
        'platform', 'item_id', 'percentage_of_total_plat_inter'
    ]]
    clk_expanded = expand_impressions(clickout_rows)
    # items never interacted with on the platform get 0
    feature = pd.merge(clk_expanded,
                       feature,
                       how='left',
                       on=['platform', 'item_id']).fillna(0)
    feature = feature[[
        'user_id', 'session_id', 'item_id', 'percentage_of_total_plat_inter'
    ]]
    return feature
def extract_feature(self):
    """Compute, for each session's last clickout, the mean of the prices
    shown alongside its impressions.

    Returns a DataFrame with columns:
    user_id, session_id, mean_price_clickout.
    """
    train_df = data.train_df(mode=self.mode, cluster=self.cluster)
    test_df = data.test_df(mode=self.mode, cluster=self.cluster)
    full = pd.concat([train_df, test_df])
    clickout_idxs = sorted(find(full))

    def _mean_price(row_idx):
        # prices come as a '|'-separated string of integers
        values = [int(p) for p in full.at[row_idx, 'prices'].split('|')]
        return sum(values) / len(values)

    feature = full.loc[clickout_idxs, ['user_id', 'session_id']]
    feature['mean_price_clickout'] = [_mean_price(i) for i in tqdm(clickout_idxs)]
    return feature.reset_index(drop=True)
def extract_feature(self):
    """Build the 'perc_inter_item_image' feature: for each impression of a
    session's last clickout, the percentage of the session's rows that are
    'interaction item image' actions (same value for every impression of a
    session).

    Returns columns: user_id, session_id, item_id, perc_inter_item_image.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # preprocess needed
    df = df.sort_values(
        by=['user_id', 'session_id', 'timestamp', 'step']).reset_index(
            drop=True)
    df = remove_last_part_of_clk_sessions(df)
    # compute number of interactions per session
    df_int = df[df.action_type == 'interaction item image'][[
        'user_id', 'session_id', 'timestamp', 'step', 'action_type'
    ]]
    feature = (df_int.groupby(
        ['user_id',
         'session_id']).size().reset_index(name='num_inter_item_image'))
    # compute session length
    sess_size = (df.groupby(['user_id', 'session_id'
                             ]).size().reset_index(name='session_length'))
    # get clk rows and expand
    # NOTE(review): the second [] applies a full-length boolean mask to the
    # subset — pandas reindexes it with a warning; consider filtering
    # action_type after the .loc instead
    clickout_rows = df.loc[
        find(df), ['user_id', 'session_id', 'action_type', 'impressions']][
            df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows).drop(
        ['index', 'action_type'], 1)
    # merge; sessions without image interactions get 0
    final_feature = pd.merge(clk_expanded,
                             feature,
                             how='left',
                             on=['user_id', 'session_id']).fillna(0)
    final_feature.num_inter_item_image = final_feature.num_inter_item_image.astype(
        int)
    final_feature = pd.merge(final_feature,
                             sess_size,
                             how='left',
                             on=['user_id', 'session_id']).fillna(0)
    final_feature.session_length = final_feature.session_length.astype(int)
    # compute the percentage
    perc = []
    for t in tqdm(
            zip(final_feature.num_inter_item_image,
                final_feature.session_length)):
        perc.append((t[0] * 100) / t[1])
    final_feature['perc_inter_item_image'] = perc
    return final_feature[[
        'user_id', 'session_id', 'item_id', 'perc_inter_item_image'
    ]]
def extract_feature(self):
    """Build the 'perc_not_numeric' feature: for each impression of a
    session's last clickout, the percentage of the session's rows whose
    reference is NOT an item id (same value for every impression of a
    session).

    Returns columns: user_id, session_id, item_id, perc_not_numeric.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # preprocess needed
    df = df.sort_values(
        by=['user_id', 'session_id', 'timestamp', 'step']).reset_index(
            drop=True)
    df = remove_last_part_of_clk_sessions(df)
    # count per session the rows with a non-numeric reference (incl. NaN)
    sess_not_numeric_interactions = (
        df[df.reference.str.isnumeric() != True][[
            'user_id', 'session_id', 'timestamp', 'step'
        ]].groupby([
            'user_id', 'session_id'
        ]).size().reset_index(name='num_not_numeric_interactions'))
    # session length
    sess_size = (df.groupby(['user_id', 'session_id'
                             ]).size().reset_index(name='session_length'))
    # NOTE(review): full-length boolean mask applied to the .loc subset —
    # pandas reindexes it with a warning
    clickout_rows = df.loc[
        find(df), ['user_id', 'session_id', 'action_type', 'impressions']][
            df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows).drop('index', 1)
    feature = pd.merge(clk_expanded,
                       sess_not_numeric_interactions,
                       how='left',
                       on=['user_id', 'session_id']).fillna(0)
    feature.num_not_numeric_interactions = feature.num_not_numeric_interactions.astype(
        int)
    feature = pd.merge(feature,
                       sess_size,
                       how='left',
                       on=['user_id', 'session_id']).fillna(0)
    feature.session_length = feature.session_length.astype(int)
    # compute the percentage
    perc = []
    for t in tqdm(
            zip(feature.num_not_numeric_interactions,
                feature.session_length)):
        perc.append((t[0] * 100) / t[1])
    feature['perc_not_numeric'] = perc
    return feature[[
        'user_id', 'session_id', 'item_id', 'perc_not_numeric'
    ]]
def extract_feature(self):
    """For every interaction row, compute the time gap (in timestamp units)
    between the row and its session's last clickout; 0 when the session has
    no last clickout.

    Returns a DataFrame with columns: index, diff — where 'index' is the
    row's position after the sort/merge.
    NOTE(review): pd.merge resets the row index, so 'index' matches the
    original df labels only if df had a default RangeIndex — confirm.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    print('Sorting...')
    df = df.sort_values(['user_id', 'session_id', 'timestamp', 'step'])
    # find indices of last clickouts
    print('Finding last clickouts...')
    last_clickout_indices = find(df)
    # get only last clickout rows to use the timestamp column
    print('Getting only last clk dataframe...')
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'timestamp']]
    clickout_rows = clickout_rows.rename(
        columns={'timestamp': 'clk_timestamp'})
    # add the timestamp of last clk for each session as column
    print('Getting tmp...')
    tmp_df = df[[
        'user_id', 'session_id', 'step', 'action_type', 'timestamp'
    ]]
    # sessions without a last clickout get clk_timestamp == 0
    tmp_df = pd.merge(tmp_df,
                      clickout_rows,
                      how='left',
                      on=['user_id', 'session_id']).fillna(0)
    tmp_df.clk_timestamp = tmp_df.clk_timestamp.astype(int)

    # subtracts the timestamps, puts 0 if there is no clickout in the session
    def func(t, t_clko):
        res = np.empty(len(t))
        for i in tqdm(range(len(t))):
            if t_clko[i] == 0:
                res[i] = 0
            else:
                res[i] = t_clko[i] - t[i]
        return res

    print('Subtracting timestamps...')
    tmp_df['diff'] = func(tmp_df.timestamp.values, tmp_df.clk_timestamp)
    tmp_df['diff'] = tmp_df['diff'].astype(int)
    tmp_df['index'] = tmp_df.index
    feature = tmp_df[['index', 'diff']]
    return feature
def extract_feature(self):
    """Build the 'num_times_item_impressed' feature: for each impression of
    a session's last clickout, how many times the item appeared in the
    impression lists of all OTHER (non-final) numeric-reference clickouts.

    Returns columns: user_id, session_id, item_id, num_times_item_impressed.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_clickout_indices = find(df)
    last_clk_removed_df = df.drop(last_clickout_indices)
    reference_rows = last_clk_removed_df[
        (last_clk_removed_df.reference.str.isnumeric() == True)
        & (last_clk_removed_df.action_type == 'clickout item')][[
            'user_id', 'session_id', 'reference', 'impressions'
        ]]
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'impressions']]
    clk_expanded = expand_impressions(clickout_rows)
    impression_lists = reference_rows.impressions.str.split('|').tolist()
    big_list = [x for l in impression_lists
                for x in l]  # flatten multi dim list in 1-dim list :)
    c = dict(Counter(
        big_list))  # count occurrence of each item_id in the impressions
    df_times_in_impressions = pd.DataFrame.from_dict(
        c, orient='index', columns=['num_times_item_impressed'])
    df_times_in_impressions[
        'item_id'] = df_times_in_impressions.index.astype(int)
    df_times_in_impressions = df_times_in_impressions.reindex(
        columns=['item_id', 'num_times_item_impressed'])
    df_times_in_impressions = df_times_in_impressions.sort_values(
        by=['item_id']).reset_index(drop=True)
    # items never impressed elsewhere get 0
    feature = pd.merge(clk_expanded,
                       df_times_in_impressions,
                       how='left',
                       on=['item_id']).fillna(0)
    feature.num_times_item_impressed = feature.num_times_item_impressed.astype(
        int)
    return feature[[
        'user_id', 'session_id', 'item_id', 'num_times_item_impressed'
    ]]
def extract_feature(self):
    """Load the XGBoost model's scores from the stacking directory (both
    train and test dumps) and attach user_id/session_id to each scored
    impression.

    Returns columns: user_id, session_id, item_id, score_xgboost.
    """
    self.current_directory = Path(__file__).absolute().parent
    self.data_dir = self.current_directory.joinpath('..', '..', 'stacking',
                                                    self.mode)
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_indices = find(df)
    # extract test scores
    # NOTE(review): if several files match the glob, only the last one
    # survives each loop — presumably exactly one file per folder
    self.train_dir = self.data_dir.joinpath('test')
    for file in glob.glob(str(self.train_dir) + '/xgboost*'):
        test_xgb = np.load(file)
        test_xgb = pd.DataFrame(
            test_xgb, columns=['index', 'item_recommendations', 'scores'])
        test_xgb = test_xgb.astype({'index': int})
    # extract train scores
    self.train_dir = self.data_dir.joinpath('train')
    for file in glob.glob(str(self.train_dir) + '/xgboost*'):
        train_xgb = np.load(file)
        train_xgb = pd.DataFrame(
            train_xgb, columns=['index', 'item_recommendations', 'scores'])
        train_xgb = train_xgb.astype({'index': int})
    xgb = pd.concat([train_xgb, test_xgb])
    # xgb_idx = list(xgb['index'])
    # print(f'Xgb indices are : {len(set(xgb_idx))}')
    # print(f'Last indices are : {len((last_indices))}')
    # common = set(xgb_idx) & set(last_indices)
    # print(f'In common : {len(common)}')
    # keep only scores that refer to a last clickout
    xgb = xgb[xgb['index'].isin(last_indices)]
    xgb_idx = list(xgb['index'])
    t = assign_score(xgb, 'xgboost')
    t = t.sort_values(by='index')
    # attach user/session identifiers via the original row index
    df['index'] = df.index.values
    df = df[['user_id', 'session_id', 'index']]
    df = pd.merge(t, df, how='left', on=['index'])
    return df[['user_id', 'session_id', 'item_id', 'score_xgboost']]
def extract_feature(self):
    """Build the 'impression_position_in_percentage' feature: each
    impression's 1-based position within its clickout's impression list,
    expressed as a percentage of the list length.

    Returns columns:
    user_id, session_id, item_id, impression_position_in_percentage.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get clickout rows
    # NOTE(review): full-length boolean mask applied to the .loc subset —
    # pandas reindexes it with a warning
    clickout_rows = df.loc[find(df),
                           ['user_id', 'session_id', 'impressions']][
                               df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows).drop(['index'], 1)
    # get position: expand_impressions keeps impression order, so a running
    # counter per (user, session) yields the 1-based position
    new_col = []
    curr_u = clk_expanded.loc[0, 'user_id']
    curr_s = clk_expanded.loc[0, 'session_id']
    pos = 0
    for t in tqdm(zip(clk_expanded.user_id, clk_expanded.session_id)):
        if t[0] == curr_u and t[1] == curr_s:
            pos += 1
        else:
            # new session begins: restart the counter
            pos = 1
            curr_u = t[0]
            curr_s = t[1]
        new_col.append(pos)
    clk_expanded['position'] = new_col
    # get impression count for each session
    imp_count = (clk_expanded.groupby(
        ['user_id',
         'session_id']).size().reset_index(name='num_impressions'))
    # merge and compute percentage
    feature = pd.merge(clk_expanded,
                       imp_count,
                       how='left',
                       on=['user_id', 'session_id']).fillna(0)
    pos_perc = []
    for t in tqdm(zip(feature.position, feature.num_impressions)):
        pos_perc.append((t[0] * 100) / t[1])
    feature['impression_position_in_percentage'] = pos_perc
    return feature[[
        'user_id', 'session_id', 'item_id',
        'impression_position_in_percentage'
    ]]
def extract_feature(self):
    """Compute per-session timing statistics up to the last clickout:
    the mean time between consecutive steps and its standard deviation
    (the 'frenzy factor'). Sessions with a single row get -1 for both.

    Returns columns: user_id, session_id, mean_time_per_step, frenzy_factor.
    """
    tr = data.train_df(mode=self.mode, cluster=self.cluster)
    te = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([tr, te])
    idxs = sorted(find(df))
    means = []
    stds = []
    for i in tqdm(idxs):
        a_user = df.at[i, 'user_id']
        a_sess = df.at[i, 'session_id']
        a_time = df.at[i, 'timestamp']
        # walk backwards collecting the deltas between consecutive rows
        # of the same session
        j = i - 1
        diffs = []
        while j >= 0:
            try:
                new_user = df.at[j, 'user_id']
                new_sess = df.at[j, 'session_id']
                new_time = df.at[j, 'timestamp']
                if new_user == a_user and new_sess == a_sess:
                    diffs.append(a_time - new_time)
                else:
                    break
                j -= 1
                a_time = new_time
            except:
                # missing row label — skip it and keep scanning
                j -= 1
        if len(diffs) > 0:
            np_diffs = np.array(diffs)
            means.append(np.mean(np_diffs))
            stds.append(np.std(np_diffs))
        else:
            # single-row session: no deltas available
            means.append(-1)
            stds.append(-1)
    total = df.loc[idxs, ['user_id', 'session_id']]
    total['mean_time_per_step'] = means
    total['frenzy_factor'] = stds
    return total.reset_index(drop=True)
def extract_feature(self):
    """Build the 'stars' feature: the star rating (1-5) of each impression
    of a session's last clickout, taken from the accomodation properties.

    Items with no star property get -1 (via the '0' default column then the
    replace below); items missing from the accomodations file get 1 from
    the fillna. Returns columns: user_id, session_id, item_id, stars.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices,
                           ['user_id', 'session_id', 'impressions']]
    clk_expanded = expand_impressions(clickout_rows)
    o = ImpressionFeature(mode=self.mode)
    f = o.read_feature(True)  # get the accomodation's df
    feature_stars = f[[
        'item_id', 'properties1 Star', 'properties2 Star',
        'properties3 Star', 'properties4 Star', 'properties5 Star'
    ]]
    # remap the name
    feature_stars = feature_stars.rename(
        columns={
            'properties1 Star': '1',
            'properties2 Star': '2',
            'properties3 Star': '3',
            'properties4 Star': '4',
            'properties5 Star': '5'
        })
    # set default 0 Stars for those for which the feature is missing
    feature_stars['0'] = pd.Series(np.ones(len(feature_stars),
                                           dtype=np.uint8),
                                   index=feature_stars.index)
    # idxmax over ['5'..'0'] picks the highest star flag that is set,
    # falling back to the always-on '0' column
    feature_stars['stars'] = feature_stars[['5', '4', '3', '2', '1',
                                            '0']].idxmax(axis=1)
    feature_stars_restricted = feature_stars[['item_id', 'stars']]
    final_feature = pd.merge(clk_expanded,
                             feature_stars_restricted,
                             how='left',
                             on=['item_id']).fillna(1)
    final_feature['stars'] = final_feature['stars'].astype(int)
    # unknown star rating is encoded as -1
    final_feature['stars'] = final_feature['stars'].replace(0, -1)
    return final_feature[['user_id', 'session_id', 'item_id', 'stars']]
def retrieve_real_test_indices(mode, cluster):
    """Collect all row indices of the test set belonging to sessions that
    contain a last clickout.

    For every last-clickout row, the scan expands in both directions until
    the user/session changes, skipping missing row labels. Returns the
    sorted list of collected indices.
    """
    test = data.test_df(mode, cluster)
    lower = test.index.values[0]
    upper = test.index.values[-1]

    def _collect(start, step, in_bounds, user, sess):
        # walk away from the clickout, gathering rows of the same session;
        # missing labels are skipped, a different session ends the scan
        found = []
        j = start
        while in_bounds(j):
            try:
                same = (test.at[j, 'user_id'] == user
                        and test.at[j, 'session_id'] == sess)
                if same:
                    found.append(j)
                    j += step
                else:
                    break
            except:
                j += step
        return found

    gathered = []
    for idx in sorted(find(test)):
        user = test.at[idx, 'user_id']
        sess = test.at[idx, 'session_id']
        gathered.append(idx)
        gathered += _collect(idx - 1, -1, lambda j: j >= lower, user, sess)
        gathered += _collect(idx + 1, +1, lambda j: j <= upper, user, sess)
    return sorted(gathered)
def extract_feature(self):
    """
    Price/quality score for every impression of each session's last
    clickout: (1.5 * rating + stars) / price, where missing stars and
    ratings are filled with the per-session mean of the known values.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get clk rows
    last_clickout_indices = find(df)
    clickout_rows = df.loc[
        last_clickout_indices,
        ['user_id', 'session_id', 'impressions', 'prices']]
    clk_expanded = expand_impressions(clickout_rows).drop('index', 1)
    # open item metadata in one hot
    o = ImpressionFeature(mode=self.mode)
    df_accomodations = o.read_feature(True)
    # get the stars
    feature_stars = df_accomodations[[
        'item_id', 'properties1 Star', 'properties2 Star',
        'properties3 Star', 'properties4 Star', 'properties5 Star'
    ]]
    # remap the name
    feature_stars = feature_stars.rename(
        columns={
            'properties1 Star': '1',
            'properties2 Star': '2',
            'properties3 Star': '3',
            'properties4 Star': '4',
            'properties5 Star': '5'
        })
    # set default 0 Stars for those for which the feature is missing
    feature_stars['0'] = pd.Series(np.ones(len(feature_stars),
                                           dtype=np.uint8),
                                   index=feature_stars.index)
    # idxmax scans '5'..'0' in order, so the highest star flag wins and the
    # always-on '0' column is only picked when no star property is set
    feature_stars['stars'] = feature_stars[['5', '4', '3', '2', '1',
                                            '0']].idxmax(axis=1)
    feature_stars_restricted = feature_stars[['item_id', 'stars']]
    f_stars = pd.merge(clk_expanded,
                       feature_stars_restricted,
                       how='left',
                       on=['item_id'])
    f_stars['stars'] = f_stars['stars'].astype(int)
    # get the ratings
    f_ratings = df_accomodations[[
        'item_id',
        'propertiesExcellent Rating',
        'propertiesVery Good Rating',
        'propertiesGood Rating',
        'propertiesSatisfactory Rating',
    ]]
    f_ratings['propertiesNo Rating'] = pd.Series(np.ones(len(f_ratings),
                                                         dtype=np.uint8),
                                                 index=f_ratings.index)
    # The rating flags are presumably cumulative (an Excellent item also
    # carries the lower tiers) — TODO confirm. Under that assumption the
    # row-wise first difference below isolates the highest tier: columns
    # of the diff are labelled 5 (Excellent) down to 1 (No Rating).
    df = f_ratings.iloc[:, 1:]
    df['fake'] = pd.Series(np.zeros(len(df), dtype=np.uint8),
                           index=df.index)
    cols = df.columns.tolist()
    # move the zero 'fake' column to the front so diff() keeps Excellent
    cols = [cols[-1]] + cols[:-1]
    df = df.reindex(columns=cols)
    dff = df.diff(axis=1).drop(['fake'], axis=1)
    dff = dff.astype(int)
    dff.columns = [5, 4, 3, 2, 1]
    f_ratings = f_ratings.drop(f_ratings.columns[1:], axis=1)
    # rating == 1 therefore means "No Rating" (see the fill-in step below)
    f_ratings['rating'] = dff.idxmax(axis=1)
    f_ratings = pd.merge(f_ratings,
                         feature_stars_restricted,
                         how='left',
                         on=['item_id'])
    df_clk_rat_star = pd.merge(clk_expanded,
                               f_ratings,
                               how='left',
                               on='item_id')
    # expand prices: rows come one-per-impression in session order, so
    # `pos` is the position of the row inside the current (user, session)
    df_clk_rat_star.prices = df_clk_rat_star.prices.str.split('|')
    curr_user = '******'
    curr_sess = '_'
    pos = 0
    price_expanded = []
    for t in tqdm(
            zip(df_clk_rat_star.user_id, df_clk_rat_star.session_id,
                df_clk_rat_star.prices)):
        # check if in session
        if curr_user != t[0] or curr_sess != t[1]:
            pos = 0
            curr_user = t[0]
            curr_sess = t[1]
        else:
            pos += 1
        price_expanded.append(t[2][pos])
    df_clk_rat_star['price'] = price_expanded
    df_clk_rat_star = df_clk_rat_star.drop(['prices'], 1)
    df_clk_rat_star.stars = df_clk_rat_star.stars.astype(int)
    # fills missing stars values with the mean
    avg = df_clk_rat_star[['user_id', 'session_id', 'stars']]
    avg = avg.loc[avg.stars != 0]  # the mean must be computed on non-zero values only
    avg = pd.DataFrame(
        avg.groupby(['user_id', 'session_id'])['stars'].progress_apply(
            lambda x: int(x.sum() / x.size))).fillna(0)
    avg = avg.rename(columns={'stars': 'stars_avg'})
    # NOTE(review): 'stars' is no longer a column of `avg` after the rename,
    # so this attribute assignment likely creates a plain attribute rather
    # than a column — confirm intended
    avg.stars = avg.stars_avg.astype(int)
    no_stars = df_clk_rat_star.loc[df_clk_rat_star.stars == 0,
                                   ['user_id', 'session_id', 'item_id']]
    stars_filled = pd.merge(no_stars, avg, how='left',
                            on=['user_id', 'session_id']).fillna(0)
    # NOTE(review): same attribute-vs-column concern as above
    stars_filled.stars = stars_filled.stars_avg.astype(int)
    df_clk_rat_star_filled = pd.merge(
        df_clk_rat_star,
        stars_filled,
        how='left',
        on=['user_id', 'session_id', 'item_id'])
    # overwrite zero stars with the session average
    for t in zip(df_clk_rat_star_filled.stars,
                 df_clk_rat_star_filled.stars_avg,
                 df_clk_rat_star_filled.index):
        if t[0] == 0:
            df_clk_rat_star_filled.at[t[2], 'stars'] = t[1]
    df_clk_rat_star_filled = df_clk_rat_star_filled.drop('stars_avg', 1)
    # now fill missing values for rating (rating == 1 means "No Rating")
    avg = df_clk_rat_star_filled[['user_id', 'session_id', 'rating']]
    avg.rating = avg.rating.astype(int)
    avg = avg.loc[avg.rating != 1]  # the mean must be computed on rated items only
    avg = pd.DataFrame(
        avg.groupby(['user_id', 'session_id'])['rating'].progress_apply(
            lambda x: int(x.sum() / x.size))).fillna(0)
    avg = avg.rename(columns={'rating': 'rating_avg'})
    # NOTE(review): attribute-vs-column concern, as for stars above
    avg.rating = avg.rating_avg.astype(int)
    no_rat = df_clk_rat_star.loc[df_clk_rat_star.rating == 1,
                                 ['user_id', 'session_id', 'item_id']]
    rat_filled = pd.merge(no_rat, avg, how='left',
                          on=['user_id', 'session_id']).fillna(0)
    rat_filled.rating = rat_filled.rating_avg.astype(int)
    df_clk_rat_star_rat_filled = pd.merge(
        df_clk_rat_star_filled,
        rat_filled,
        how='left',
        on=['user_id', 'session_id', 'item_id'])
    # overwrite "No Rating" entries with the session average
    for t in zip(df_clk_rat_star_rat_filled.rating,
                 df_clk_rat_star_rat_filled.rating_avg,
                 df_clk_rat_star_rat_filled.index):
        if t[0] == 1:
            df_clk_rat_star_rat_filled.at[t[2], 'rating'] = t[1]
    df_clk_rat_star_rat_filled = df_clk_rat_star_rat_filled.drop(
        'rating_avg', 1)
    # add feature column
    new_col = []
    df_clk_rat_star_rat_filled.rating = df_clk_rat_star_rat_filled.rating.astype(
        int)
    df_clk_rat_star_rat_filled.stars = df_clk_rat_star_rat_filled.stars.astype(
        int)
    df_clk_rat_star_rat_filled.price = df_clk_rat_star_rat_filled.price.astype(
        int)
    # price/quality: rating weighted 1.5x against stars, divided by price
    for t in tqdm(
            zip(df_clk_rat_star_rat_filled.rating,
                df_clk_rat_star_rat_filled.stars,
                df_clk_rat_star_rat_filled.price)):
        new_col.append((1.5 * t[0] + t[1]) / t[2])
    df_clk_rat_star_rat_filled['price_quality'] = new_col
    final_feature = df_clk_rat_star_rat_filled[[
        'user_id', 'session_id', 'item_id', 'price_quality'
    ]]
    return final_feature
def extract_feature(self):
    """
    For each item, counts how many times each sorting filter was active
    during numeric-reference interactions (each session's last clickout
    excluded), after propagating `current_filters` forward inside sessions.
    Items never interacted with are appended with all-zero counts.
    """
    def extend_session_current_filters(y):
        # Forward-fill `current_filters` inside one session: rows whose
        # action does not reset the filters inherit the previous row's value.
        x = y
        cf = x.current_filters
        if len(cf.dropna()) == 0:
            return x
        ind = cf.dropna().head(1).index.values[0]  # index of the first cf different from NaN
        while ind < cf.tail(1).index.values[0]:  # a while is needed to reach the end of the session
            Nan_ind_found = False
            nan_ind = ind
            while Nan_ind_found == False:  # find the first action that resets the filters
                if nan_ind == cf.tail(1).index.values[0]:
                    return x
                try:
                    if x.loc[nan_ind + 1].action_type in [
                            'interaction item image',
                            'interaction item deals',
                            'interaction item info',
                            'interaction item rating',
                            'change of sort order'
                    ]:
                        nan_ind = nan_ind + 1
                        Nan_ind_found = True
                except:
                    print(x)
                # NOTE(review): this `else` belongs to the try, so on a
                # successful lookup nan_ind advances once more even when the
                # branch above already advanced it — confirm the double step
                # is intended
                else:
                    nan_ind += 1
            # now nan_ind is the index of the first action that resets the filters
            Nan_ind_last_found = False
            not_nan_ind = nan_ind
            # scan and overwrite cf values until the first index that does not reset cf: not_nan_ind
            while Nan_ind_last_found == False:
                cf.loc[not_nan_ind] = cf.loc[not_nan_ind - 1]
                if not_nan_ind == cf.tail(1).index.values[0]:
                    x.current_filters = cf
                    return x
                if x.loc[not_nan_ind + 1].action_type in [
                        'search for poi', 'search for destination',
                        'search for item', 'filter selection',
                        'clickout item'
                ]:
                    not_nan_ind = not_nan_ind + 1
                    Nan_ind_last_found = True
                else:
                    not_nan_ind += 1
            # now not_nan_ind is the first index that does not reset cf (it
            # corresponds to ind): start over and continue until the end of
            # the session is reached
            ind = not_nan_ind
        x.current_filters = cf
        return x

    list_of_sorting_filters_wout_pop = [
        'Sort by Price', 'Sort by Distance', 'Sort by Rating',
        'Best Value', 'Focus on Rating', 'Focus on Distance'
    ]
    list_of_sorting_filters = [
        'Sort by Price', 'Sort by Distance', 'Sort by Rating',
        'Best Value', 'Focus on Rating', 'Focus on Distance',
        'Sort by Popularity'
    ]

    def mask_sorting(x):
        # default to popularity sorting when no explicit sorting filter is on
        if np.isin(x, list_of_sorting_filters_wout_pop).any():
            return x
        else:
            return ['Sort by Popularity']

    start = time.time()
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # extend current_filters
    # NOTE(review): the result of this groupby-apply is discarded and the
    # helper mutates per-group objects, so `df` may be left unchanged —
    # confirm whether the extension is meant to be written back
    df.groupby(['user_id',
                'session_id']).progress_apply(extend_session_current_filters)
    indices_last_clks = find(df)
    d = df.drop(indices_last_clks)
    reference_rows = d[d.reference.astype(str).str.isnumeric()]
    d_splitted = reference_rows.current_filters.progress_apply(
        lambda x: str(x).split('|'))
    md = d_splitted.progress_apply(mask_sorting)
    df_f = df.loc[md.index]
    df_ref = df_f.reference
    # one zeroed counter dict per distinct reference (item)
    dict_ref_to_filters = dict(
        zip(df_ref.unique(),
            [dict(zip(list_of_sorting_filters,
                      np.zeros(len(list_of_sorting_filters)))) \
             for i in range(len(df_ref.unique()))]))
    for index, row in tqdm(df_f.iterrows(), total=df_f.shape[0]):
        for i in md.loc[index]:
            if i in list_of_sorting_filters:
                dict_ref_to_filters[row.reference][i] += 1
    df_feature = pd.DataFrame.from_dict(dict_ref_to_filters,
                                        orient='index')
    df_feature = df_feature.astype(int).reset_index().rename(
        index=str, columns={"index": "item_id"})
    # append an all-zero row for every accomodation never interacted with
    set_of_not_clicked_items = set(data.accomodations_df().item_id) - set(
        df_feature.item_id)
    extension = pd.DataFrame(
        data=sorted([i for i in set_of_not_clicked_items]),
        columns=['item_id'])
    extd = df_feature.append(extension, ignore_index=True, sort=True)
    f = extd.fillna(0).reset_index().drop(columns=['index'])
    # move the item_id column (sorted last by the append) back to the front
    feature = f[np.insert(f.columns[:-1].values, 0,
                          f.columns[-1])].astype(int)
    _time = time.time() - start
    elapsed = time.strftime('%Mm %Ss', time.gmtime(_time))
    print(f"elapsed in: {elapsed}")
    return feature
def extract_feature(self):
    """
    For every impression of each session's final clickout, the percentage
    of times that item was clicked over the times it appeared in an
    impression list, computed on non-final clickouts only. Items never
    seen in past impressions get 0.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    full_df = pd.concat([train, test])

    last_clk_idxs = find(full_df)
    history = full_df.drop(last_clk_idxs)
    # clickouts with a numeric reference, excluding each session's last one
    past_clickouts = history[(history.reference.str.isnumeric() == True)
                             & (history.action_type == 'clickout item')][[
                                 'user_id', 'session_id', 'reference',
                                 'impressions'
                             ]]

    # occurrences of every accomodation across all impression lists
    occurrences = Counter(
        item
        for imp_list in past_clickouts.impressions.str.split('|').tolist()
        for item in imp_list)
    appearance_df = pd.DataFrame.from_dict(
        dict(occurrences),
        orient='index',
        columns=['number_of_times_in_impr'])
    appearance_df['item_id'] = appearance_df.index.astype(int)
    appearance_df = appearance_df.reindex(
        columns=['item_id', 'number_of_times_in_impr'])

    # number of clickouts each accomodation actually received
    click_counts = past_clickouts.groupby(
        ['reference']).size().reset_index(name='n_clickouts')
    click_counts = click_counts.rename(columns={'reference': 'item_id'})
    click_counts['item_id'] = click_counts['item_id'].astype(int)

    # join the two counts; items never clicked get 0 clickouts
    stats = pd.merge(appearance_df,
                     click_counts,
                     how='left',
                     on=['item_id']).fillna(0)
    stats.n_clickouts = stats.n_clickouts.astype(int)
    stats['perc_click_appeared'] = round(
        (stats.n_clickouts * 100) / (stats.number_of_times_in_impr), 2)
    per_item = stats[['item_id', 'perc_click_appeared']]

    # attach the per-item ratio to every impression of the last clickouts
    last_clks = full_df.loc[last_clk_idxs,
                            ['user_id', 'session_id', 'impressions']]
    expanded = expand_impressions(last_clks)
    merged = pd.merge(expanded, per_item, how='left',
                      on=['item_id']).fillna(0)
    return merged[[
        'user_id', 'session_id', 'item_id', 'perc_click_appeared'
    ]]
def extract_feature(self):
    """
    Adjusted share of a city's clickouts received by each impression of a
    session's last clickout, computed leave-one-out: the clicked item's
    own click is removed from both numerator and denominator.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # get last clickout rows
    last_clickout_indices = find(df)
    clickout_rows = df.loc[
        last_clickout_indices,
        ['user_id', 'session_id', 'city', 'reference', 'impressions']][
            df.action_type == 'clickout item']
    # get reference rows WITH last clickout
    reference_rows = df[(df.reference.str.isnumeric() == True)
                        & (df.action_type == 'clickout item')]
    # compute popularity WITH last clickout
    df_item_clicks = (reference_rows.groupby(
        ["reference",
         "city"]).size().reset_index(name="n_interactions_per_item"))
    df_item_clicks = df_item_clicks.rename(
        columns={'reference': 'item_id'})
    df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
    df_city_clicks = (reference_rows.groupby('city').size().reset_index(
        name="n_interactions_per_city"))
    # merge clickout rows expanded with the popularity dataframes
    merged_df = pd.merge(df_item_clicks,
                         df_city_clicks,
                         how='left',
                         on=['city']).fillna(0)
    clk_expanded = expand_impressions(clickout_rows)
    feature = pd.merge(clk_expanded,
                       merged_df,
                       how='left',
                       on=['item_id', 'city']).fillna(0)
    # compute the percentage of clicks per city
    new_col = []
    feature.reference = feature.reference.astype(int)
    feature.item_id = feature.item_id.astype(int)
    # t = (clicked reference, impression item, item clicks, city clicks)
    for t in tqdm(
            zip(feature.reference, feature.item_id,
                feature.n_interactions_per_item,
                feature.n_interactions_per_city)):
        if t[0] == t[1]:  # this impression is the clicked one:
            # discount its own click from both counts
            if t[3] != 1:
                percentage_of_total_city_clk = round(
                    ((t[2] - 1) * 100.0) / (t[3] - 1), 5)
            else:
                percentage_of_total_city_clk = 0
        else:  # not the clicked one
            if 0 not in [t[2], t[3]] and t[3] != 1:
                percentage_of_total_city_clk = round(
                    (t[2] * 100.0) / (t[3] - 1),
                    5)  # still remove the clicked item's click from the city total
            else:
                percentage_of_total_city_clk = 0
        new_col.append(percentage_of_total_city_clk)
    feature['adj_percentage_of_total_city_clk'] = new_col
    feature.adj_percentage_of_total_city_clk = feature.adj_percentage_of_total_city_clk.astype(
        float)
    final_feature_reduced = feature[[
        'user_id', 'session_id', 'item_id',
        'adj_percentage_of_total_city_clk'
    ]]
    return final_feature_reduced
def extract_feature(self):
    """
    Similarity between each impression's one-hot property vector and the
    average property vector of items clicked on the same platform
    (star properties excluded).
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # first step: get all the platforms
    platforms = sorted(df.platform.unique().tolist())
    # create df that for each plat will hold the feature vector
    df_plat_feature = pd.DataFrame(
        columns=['platform', 'properties_array'])
    df_plat_feature['platform'] = platforms
    # NOTE(review): last_indices is only used below to SELECT the last
    # clickout rows; df_clickout still INCLUDES the last clickouts even
    # though the original comment said they are removed — confirm intended
    last_indices = find(df)
    df_clickout = df[(df.reference.str.isnumeric() == True)
                     & (df['action_type'] == 'clickout item')][[
                         'reference', 'platform'
                     ]]
    df_clickout = df_clickout.rename(columns={'reference': 'item_id'})
    df_clickout.item_id = df_clickout.item_id.astype(int)
    # get the item metadata in one hot
    o = ImpressionFeature(mode=self.mode)
    df_accomodations = o.read_feature(True)
    # star columns are excluded from the similarity
    df_accomodations = df_accomodations.drop([
        'properties1 Star', 'properties2 Star', 'properties3 Star',
        'properties4 Star', 'properties5 Star'
    ], 1)
    # merge clickouts dataframe with the metadata
    df_clicks_properties = pd.merge(df_clickout,
                                    df_accomodations,
                                    how='left',
                                    on=['item_id'])
    # extract the one hot encoded features into per-row numpy arrays
    array = df_accomodations.drop(['item_id'], axis=1).values
    # for each item append the features as numpy array
    df_item_features = pd.DataFrame(
        columns=['item_id', 'features_array'])
    df_item_features['item_id'] = df_accomodations['item_id'].values
    df_item_features['features_array'] = list(array)
    # for each column compute the sum of all the clickout-rows' features
    new_col = []  # which will hold the platform feature vectors
    for p in tqdm(platforms):
        df_clicks_properties_per_plat = df_clicks_properties[
            df_clicks_properties.platform == p]
        df_clicks_properties_per_plat = df_clicks_properties_per_plat.drop(
            ['item_id', 'platform'], axis=1)
        df_sum = df_clicks_properties_per_plat.sum()
        # this if is needed because some platforms never appear in the
        # clickouts; for those a vector of zeros is used
        if df_clicks_properties_per_plat.shape[0] != 0:
            # average the one-hot sums over the platform's clickouts
            df_sum = df_sum.apply(
                lambda x: x / df_clicks_properties_per_plat.shape[0])
            plat_feature = df_sum.values
        else:
            plat_feature = np.asarray(
                [0] * df_clicks_properties_per_plat.shape[1])
        new_col.append(plat_feature)
    df_plat_feature['properties_array'] = new_col
    # now take the last clickout rows and expand on the impression list
    clickout_rows = df.loc[last_indices, [
        'user_id', 'session_id', 'platform', 'action_type', 'impressions'
    ]][df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows)
    clk_expanded = clk_expanded.drop(['index', 'action_type'], axis=1)
    # for each impression, add the feature vector of the platform and the feature vector of the impression
    clk_expanded_wt_plat_feat = pd.merge(clk_expanded,
                                         df_plat_feature,
                                         how='left',
                                         on=['platform'])
    final_feature = pd.merge(clk_expanded_wt_plat_feat,
                             df_item_features,
                             how='left',
                             on=['item_id'])
    # compute the similarity between the impression's feature vector and the plat feature vector
    new_col = []
    if self.metric == 'cosine':
        shrink = 5  # TRY ME
        for t in tqdm(
                zip(final_feature.properties_array,
                    final_feature.features_array)):
            new_col.append(
                cosine_similarity(t[0].astype(np.double),
                                  t[1].astype(np.double), shrink))
    # NOTE(review): if self.metric is not 'cosine', new_col stays empty and
    # the column assignment below will fail — confirm the supported metrics
    final_feature = final_feature[['user_id', 'session_id', 'item_id']]
    final_feature['adj_platform_features_similarity'] = new_col
    return final_feature
def extract_feature(self):
    """
    Cosine similarity between each impression's one-hot properties and its
    platform's click profile, where the platform profile is normalised by
    the global (all-platform) per-property sums.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    platforms = df['platform'].unique().tolist()
    df_plat_feature = pd.DataFrame(columns=['platform', 'properties_array'])
    df_plat_feature['platform'] = platforms
    # only clickouts that are NOT a session's last one feed the profile
    last_indices = find(df)
    df_non_last_clk = df.drop(last_indices)
    df_clickout = df_non_last_clk[(df_non_last_clk['action_type'] == 'clickout item')][['reference', 'platform']]
    df_clickout = df_clickout.rename(columns={'reference': 'item_id'})
    df_clickout = df_clickout.dropna()  # remove NaNs
    df_clickout.item_id = df_clickout.item_id.astype(int)
    o = ImpressionFeature(mode=self.mode)
    df_accomodations = o.read_feature(True)
    # star columns are excluded from the similarity
    df_accomodations = df_accomodations.drop(['properties1 Star', 'properties2 Star', 'properties3 Star', 'properties4 Star', 'properties5 Star'], 1)
    df_clicks_properties = pd.merge(df_clickout, df_accomodations, how='left', on=['item_id'])
    # per-item one-hot property vectors
    array = df_accomodations.drop(['item_id'], axis=1).values
    df_item_features = pd.DataFrame(columns=['item_id', 'features_array'])
    df_item_features['item_id'] = df_accomodations['item_id'].values
    df_item_features['features_array'] = list(array)
    # per-platform raw property sums over that platform's clickouts
    new_col = []
    for p in tqdm(platforms):
        df_clicks_properties_per_plat = df_clicks_properties[df_clicks_properties.platform == p]
        df_clicks_properties_per_plat = df_clicks_properties_per_plat.drop(['item_id', 'platform'], axis=1)
        df_sum = df_clicks_properties_per_plat.sum()
        if df_clicks_properties_per_plat.shape[0] != 0:  # the platform appears at least once
            plat_feature = df_sum.values
        else:
            plat_feature = np.asarray([0] * df_clicks_properties_per_plat.shape[1])
        new_col.append(plat_feature)
    df_plat_feature['properties_array'] = new_col
    # global property sums, replicated on every row for the normalisation
    global_sum = df_clicks_properties.drop(['item_id', 'platform'], 1)
    global_sum = global_sum.sum().tolist()
    df_plat_feature['global_properties'] = df_plat_feature.apply(lambda x: global_sum, axis=1)
    # element-wise divide each platform vector by the global sums
    # NOTE(review): a property with a zero global sum produces a division
    # by zero here (error or inf/NaN depending on dtype) — verify every
    # property occurs at least once in the clicked items
    properties_globally_normalized = []
    for t in tqdm(zip(df_plat_feature.properties_array, df_plat_feature.global_properties)):
        properties_globally_normalized.append(np.asarray([x / y for x, y in zip(t[0], t[1])]))
    df_plat_feature['properties_globally_normalized'] = properties_globally_normalized
    df_plat_feature = df_plat_feature.drop(['properties_array', 'global_properties'], 1)
    # now take the usual last-clickout dataframe
    last_clickout_indices = find(df)
    clickout_rows = df.loc[last_clickout_indices, ['user_id', 'session_id', 'platform', 'action_type', 'impressions']][df.action_type == 'clickout item']
    clk_expanded = expand_impressions(clickout_rows)
    clk_expanded = clk_expanded.drop(['index', 'action_type'], axis=1)
    clk_expanded_wt_plat_feat = pd.merge(clk_expanded, df_plat_feature, how='left', on=['platform']).astype(object)
    clk_expanded_wt_plat_feat.item_id = clk_expanded_wt_plat_feat.item_id.astype(int)
    final_feature = pd.merge(clk_expanded_wt_plat_feat, df_item_features, how='left', on=['item_id'])
    # shrunk cosine similarity between platform profile and item vector
    new_col = []
    shrink = 0  # TRY ME
    for t in tqdm(zip(final_feature.properties_globally_normalized, final_feature.features_array)):
        new_col.append(cosine_similarity(t[0].astype(np.double), t[1].astype(np.double), shrink))
    new_feature = final_feature[['user_id', 'session_id', 'item_id']]
    new_feature['platform_similarity_normalized'] = new_col
    return new_feature
def extract_feature(self):
    """
    Day-of-week and moment-of-day of each session's last clickout, with
    timestamps shifted towards the platform's local timezone.
    """
    def func(x):
        def extract_daytime(timestamp, platform):
            res = np.empty(len(timestamp), dtype='datetime64[s]')
            unique_platforms = x['platform'].unique()
            # which entry of pytz.country_timezones to pick for countries
            # spanning several timezones; every other platform uses entry 0
            dict_row_platform = {
                'AU': 3,
                'CA': 11,
                'RU': 1,
                'BR': 1,
                'US': 17
            }
            list_of_common_platforms = [
                i for i in unique_platforms
                if i not in dict_row_platform.keys()
            ]
            for i in list_of_common_platforms:
                dict_row_platform[i] = 0
            # remap trivago platform codes to ISO country codes
            dict_row_platform['GB'] = dict_row_platform.pop('UK')
            dict_row_platform['ET'] = dict_row_platform.pop('AA')
            # southern-hemisphere platforms get ambiguous=True when the
            # local time is DST-ambiguous, the others get False
            austral_emisphere = [
                'AU', 'MX', 'CL', 'AR', 'ID', 'NZ', 'EC', 'BR'
            ]
            for i in tqdm(range(len(timestamp))):
                ts = timestamp[i]
                p = platform[i]
                if p == 'UK':
                    p = 'GB'
                elif p == 'AA':
                    p = 'ET'
                if p in austral_emisphere:
                    bool_amb = True
                else:
                    bool_amb = False
                zone = pytz.country_timezones(p)[dict_row_platform[p]]
                # NOTE(review): tz_localize treats the naive UTC timestamp
                # as wall time in `zone`, and storing the aware timestamp
                # into a datetime64 array converts back to UTC — the net
                # shift may be the inverse of the intended UTC->local
                # conversion; verify against a known platform/time pair
                timeznd = pd.to_datetime(ts).tz_localize(
                    zone, ambiguous=np.array(bool_amb))
                res[i] = timeznd
            return res

        return extract_daytime(
            pd.to_datetime(x['timestamp'], unit='s',
                           origin='unix').values,
            x['platform'].values).astype('datetime64[s]')

    def get_moment_in_the_day(x):
        # buckets: N/M/A/E (presumably night/morning/afternoon/evening)
        if (0 <= x.hour) & (x.hour < 8):
            return 'N'
        elif (8 <= x.hour) & (x.hour < 13):
            return 'M'
        elif (13 <= x.hour) & (x.hour < 19):
            return 'A'
        elif (19 <= x.hour) & (x.hour < 24):
            return 'E'

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    # keep only the last-clickout rows
    df_indices = find(pd.concat([train, test]))
    df = pd.concat([train, test]).loc[df_indices]
    df['day'] = func(df)
    df['moment'] = df['day'].progress_apply(
        lambda x: get_moment_in_the_day(x))
    df['day'] = df['day'].progress_apply(
        lambda x: pd.to_datetime(x).dayofweek)
    # drop everything except the identifiers and the two new columns
    return df.drop(columns=[
        'action_type', 'reference', 'impressions', 'prices', 'city',
        'device', 'step', 'current_filters', 'timestamp', 'platform',
        'frequence'
    ])
def extract_feature(self):
    """
    Total time (seconds) spent interacting with each impression of a
    session's last clickout, accumulated from the timestamp gaps of the
    numeric-reference interactions preceding the clickout.
    """
    def convert_and_add_pos(df):
        # Expand the impression lists to one row per item and attach each
        # item's 1-based position inside its impression list.
        df_t = expand_impressions(df)
        df['index'] = df.index
        df = pd.merge(df_t,
                      df,
                      how='left',
                      on=['index', 'user_id', 'session_id', 'action_type'],
                      suffixes=('', '_y'))
        df = df.drop('time_per_impression_y', axis=1)
        df['item_pos'] = df.apply(
            lambda x: (x['impression_list'].index(str(x['item_id']))) + 1,
            axis=1)
        df = df.drop(['impression_list', 'index'], axis=1)
        return df

    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    df = df.sort_values(['user_id', 'session_id', 'timestamp',
                         'step']).reset_index(drop=True)
    # dwell time of a row = timestamp gap to the next row
    # NOTE(review): the last row of each session inherits the gap to the
    # next session's first row — verify this is acceptable noise
    df['time_per_impression'] = df['timestamp'].shift(-1) - df['timestamp']
    last_clickout_indices = find(df)
    clickout_rows = df.loc[
        last_clickout_indices,
        ['user_id', 'session_id', 'action_type', 'impressions']][
            df.action_type == 'clickout item']
    clickout_rows['impression_list'] = clickout_rows.impressions.str.split(
        '|')
    # one 25-slot accumulator per clickout (impression lists are assumed
    # to hold at most 25 items)
    clickout_rows['time_per_impression'] = [
        [0] * 25 for x in range(len(clickout_rows.index))
    ]
    last_clk_removed_df = df.drop(last_clickout_indices)
    reference_rows = last_clk_removed_df[
        last_clk_removed_df.reference.astype(str).str.isnumeric()]
    reference_rows = reference_rows.drop('action_type', axis=1)
    # keep only interactions of users/sessions that have a last clickout
    reference_rows = reference_rows[
        reference_rows.user_id.isin(clickout_rows.user_id)
        & reference_rows.session_id.isin(clickout_rows.session_id)]
    # single forward pass: rows are index-ordered, so j only advances to
    # the first clickout at or after the current interaction row
    j = 0
    clickout_indices = clickout_rows.index.values
    clickout_user = clickout_rows.at[clickout_indices[j], 'user_id']
    clickout_session = clickout_rows.at[clickout_indices[j], 'session_id']
    for t in tqdm(
            zip(reference_rows.index, reference_rows.time_per_impression,
                reference_rows.user_id, reference_rows.session_id,
                reference_rows.reference)):
        if t[0] >= clickout_indices[-1]:
            break
        # find the next clickout index
        while t[0] > clickout_indices[j]:
            j += 1
            clickout_user = clickout_rows.at[clickout_indices[j],
                                             'user_id']
            clickout_session = clickout_rows.at[clickout_indices[j],
                                                'session_id']
        # check if row and next_clickout are in the same session
        if t[2] == clickout_user and t[3] == clickout_session:
            try:
                # accumulate the dwell time on the interacted item's slot
                ref_idx = clickout_rows.at[clickout_indices[j],
                                           'impression_list'].index(t[4])
                feature_list = clickout_rows.at[clickout_indices[j],
                                                'time_per_impression']
                feature_list[ref_idx] += t[1]
            except:
                # the interacted item is not in the impression list: skip
                pass
    final_df = convert_and_add_pos(clickout_rows)
    # pick, for each expanded row, the accumulated time of its own slot
    final_df['impression_time'] = final_df.apply(
        lambda x: list(x['time_per_impression'])[int(x['item_pos']) - 1],
        axis=1)
    final_df = final_df[[
        'user_id', 'session_id', 'item_id', 'impression_time'
    ]]
    final_df['impression_time'] = final_df['impression_time'].astype(int)
    return final_df