def extract_feature(self):
    """
    Build, for every last clickout of the full dataframe, 25 binary columns
    (impr_c0 .. impr_c24) that multi-hot encode how many impressions were shown:
    impr_ci == 1 iff the clickout carried more than i impressions.

    Returns a dataframe indexed by the original df index:
    user_id | session_id | impr_c0 ... impr_c24
    """
    tqdm.pandas()
    df = data.full_df()
    # sort so sessions are contiguous; reset_index keeps the original index in
    # the 'index' column so it can be restored at the end
    df = df.sort_values(['user_id', 'session_id', 'timestamp', 'step']).reset_index()
    # find the last clickout rows
    last_clickout_idxs = find_last_clickout_indices(df)
    clickout_rows = df.loc[last_clickout_idxs, ['user_id', 'session_id', 'impressions', 'index']]
    # number of pipe-separated impressions of each clickout
    clickout_rows['impressions_count'] = clickout_rows.impressions.str.split('|').str.len()
    clickout_rows = clickout_rows.drop('impressions', axis=1)
    # multi-hot the counts: row i gets ones in its first `c` of 25 slots
    # (25 assumed to be the maximum impressions per clickout — TODO confirm)
    one_hot_counts = np.zeros((clickout_rows.shape[0], 25), dtype=np.int8)
    for i, c in tqdm(enumerate(clickout_rows.impressions_count.values)):
        one_hot_counts[i, 0:c] = 1
    # add to the clickouts, one column per slot
    for i in range(25):
        clickout_rows['impr_c{}'.format(i)] = one_hot_counts[:, i]
    return clickout_rows.drop('impressions_count', axis=1).set_index('index')
def get_label(df): """ Return a dataframe with: index | user_id | session_id | label """ # find the last clickout rows idxs = find_last_clickout_indices(df) res_df = df[['user_id', 'session_id', 'reference', 'impressions']].loc[idxs] # remove the test sessions with reference NaN res_df = res_df.dropna(subset=['reference']).astype( {'reference': 'int'}) # create impressions list res_df['impressions_list'] = res_df['impressions'].str.split( '|').apply(lambda x: list(map(int, x))) res_df.drop('impressions', axis=1, inplace=True) label_series = np.zeros(res_df.shape[0], dtype='int8') # iterate over the rows k = 0 for row in tqdm( zip(res_df['reference'], res_df['impressions_list'])): ref = row[0] impress = row[1] if ref in impress: label_series[k] = impress.index(ref) k += 1 # add the new column res_df['label'] = label_series return res_df.drop(['reference', 'impressions_list'], axis=1)
def retrieve_pd_dataframe_score(df):
    """
    For every impression of every last clickout, score the impression's one-hot
    accomodation features against the session's user profile vector (sum of the
    features of the items interacted with — see create_user_feature_dict).

    Three user-vector variants are scored: raw, clipped to {0,1}, and
    thresholded at the mean of its positive entries. For each variant three
    scores are computed: cosine similarity, manhattan distance and an
    unnormalized dot product (labelled 'jaccard_no_norm').

    Returns a dataframe: user_id | session_id | item_id | the 9 score columns.
    """
    icm = data.accomodations_one_hot().sort_index()
    sess_user_dict = create_user_feature_dict(df)
    idxs_clicks = find_last_clickout_indices(df)
    scores = []
    # iterate on the index of the target clicks and create for each iteration a
    # tuple to be appended on the final list
    print('computing the distances...')
    for idx in tqdm(idxs_clicks):
        # retrieve the user, session and impressions of the click
        user = df.at[idx, 'user_id']
        sess = df.at[idx, 'session_id']
        impressions = list(map(int, df.at[idx, 'impressions'].split('|')))
        # if the user-session pair isn't in the dictionary there were no numeric
        # actions in the session, so fall back to a zero vector
        us_tuple = (user, sess)
        if us_tuple in sess_user_dict:
            user_feature_vec = sess_user_dict[(user, sess)]
        else:
            user_feature_vec = np.zeros(icm.shape[1])
        # features of the impressed items, one row per impression
        features_imp = icm.loc[impressions].values
        # clipped variant: accumulated counts squashed to presence/absence
        clipped_user_feature_vec = np.clip(user_feature_vec, 0, 1)
        # thresholded variant: entries below the mean of the positive entries
        # are zeroed ('treshold' spelling kept from the codebase)
        tresholded_user_feature_vec = user_feature_vec.copy()
        if np.sum(user_feature_vec) > 0:
            treshold_limit = np.mean(user_feature_vec[user_feature_vec > 0])
            tresholded_user_feature_vec[tresholded_user_feature_vec < treshold_limit] = 0
        # compute the scores between each user-vector variant and the impressions
        _scores_manhattan = manhattan_distances(user_feature_vec.reshape(1, -1), features_imp)
        _scores_cosine = cosine_similarity(user_feature_vec.reshape(1, -1), features_imp)
        _scores_jaccard_no_norm = np.dot(user_feature_vec, features_imp.T)
        _scores_manhattan_clip = manhattan_distances(clipped_user_feature_vec.reshape(1, -1), features_imp)
        _scores_cosine_clip = cosine_similarity(clipped_user_feature_vec.reshape(1, -1), features_imp)
        _scores_jaccard_no_norm_clip = np.dot(clipped_user_feature_vec, features_imp.T)
        _scores_manhattan_tr = manhattan_distances(tresholded_user_feature_vec.reshape(1, -1), features_imp)
        _scores_cosine_tr = cosine_similarity(tresholded_user_feature_vec.reshape(1, -1), features_imp)
        _scores_jaccard_no_norm_tr = np.dot(tresholded_user_feature_vec, features_imp.T)
        # create and append a tuple on the final list, one per impression
        for i in range(len(impressions)):
            scores.append((user, sess, impressions[i],
                           _scores_cosine[0][i], _scores_manhattan[0][i], _scores_jaccard_no_norm[i],
                           _scores_cosine_clip[0][i], _scores_manhattan_clip[0][i], _scores_jaccard_no_norm_clip[i],
                           _scores_cosine_tr[0][i], _scores_manhattan_tr[0][i], _scores_jaccard_no_norm_tr[i]))
    # note: the 'manhatthan' typo is kept — the column names are part of the
    # persisted feature and are read elsewhere
    return pd.DataFrame(scores, columns=['user_id', 'session_id', 'item_id',
                                         'scores_cosine', 'scores_manhatthan', 'scores_jaccard_no_norm',
                                         'scores_cosine_clip', 'scores_manhatthan_clip', 'scores_jaccard_no_norm_clip',
                                         'scores_cosine_tr', 'scores_manhatthan_tr', 'scores_jaccard_no_norm_tr'])
def create_user_feature_dict(df):
    """
    Build, for each (user_id, session_id) pair, a profile vector obtained by
    summing the one-hot accomodation features of every distinct numeric
    reference the user interacted with during the session (the last clickout
    rows are excluded, since the profile must describe what happened BEFORE
    the click to predict).

    Returns a dict: (user_id, session_id) -> 1-D numpy feature vector.
    """
    # drop the last clickout rows
    idxs_clicks = find_last_clickout_indices(df)
    df = df.drop(idxs_clicks)
    # retrieve the icm (one-hot accomodation features, indexed by item id)
    icm = data.accomodations_one_hot()
    # filter on the columns of interest, keeping only rows with a numeric
    # reference; drop_duplicates so each (user, session, item) counts once
    temp_df = df[['user_id', 'session_id', 'reference']].dropna()
    temp_df = temp_df[temp_df['reference'].str.isnumeric()]
    temp_df = temp_df.drop_duplicates()
    # accumulate the item features per (user, session) key
    user_session_dict = {}
    rows = zip(temp_df['user_id'], temp_df['session_id'], temp_df['reference'])
    for user, session, reference in tqdm(rows, total=len(temp_df)):
        # .copy() is essential: icm.loc[...].values can be a view on the icm's
        # underlying buffer, and the in-place += below must not mutate the
        # shared matrix (which would corrupt later lookups of the same item)
        user_items = icm.loc[int(reference)].values.copy()
        key = (user, session)
        if key in user_session_dict:
            user_session_dict[key] += user_items
        else:
            user_session_dict[key] = user_items
    return user_session_dict
def train_indices(mode='local', cluster='no_cluster'):
    """Return the set of last-clickout indices usable for training: every last
    clickout of the concatenated train+test dataframes, minus the target ones."""
    full = pd.concat([data.train_df(mode=mode, cluster=cluster),
                      data.test_df(mode=mode, cluster=cluster)])
    last_clickouts = find_last_clickout_indices(full)
    targets = data.target_indices(mode=mode, cluster=cluster)
    return set(last_clickouts).difference(targets)
def extract_feature(self):
    """
    Per-session timing features, one row per last clickout:
    user_id | session_id | elapsed_last_action_click_log | variance_last_action | std_last_action
    (the intermediate step/timestamp columns are dropped before returning).
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    idxs_click = find_last_clickout_indices(df)
    temp = df[['user_id', 'session_id', 'step', 'timestamp']]
    # parallel lists, one entry per last clickout
    session_id_l = []
    length_step_l = []
    length_timestamp_l = []
    timestamp_last_action_l = []
    final_timestamp_l = []
    user_id_l = []
    for i in tqdm(idxs_click):
        user_id = temp.at[i, 'user_id']
        session_id = temp.at[i, 'session_id']
        step = temp.at[i, 'step']
        f_timestamp = temp.at[i, 'timestamp']
        # first timestamp of the session: assumes the session's rows are
        # contiguous and index-aligned with step, so step 1 sits at
        # i - (step - 1) — TODO confirm this invariant holds after pd.concat
        i_timestamp = temp.at[i - (step - 1), 'timestamp']
        # timestamp of the action right before the clickout (the clickout's own
        # timestamp when the session has a single step)
        if step > 1:
            timestamp_last_action = temp.at[i - 1, 'timestamp']
        else:
            timestamp_last_action = f_timestamp
        user_id_l.append(user_id)
        session_id_l.append(session_id)
        length_step_l.append(int(step))
        length_timestamp_l.append(int(f_timestamp - i_timestamp))
        timestamp_last_action_l.append(int(timestamp_last_action))
        final_timestamp_l.append(int(f_timestamp))
    final_df = pd.DataFrame({
        'user_id': user_id_l,
        'session_id': session_id_l,
        'length_step': length_step_l,
        'length_timestamp': length_timestamp_l,
        'timestamp_last_action': timestamp_last_action_l,
        'final_timestamp': final_timestamp_l
    })
    # average time per action in the session
    final_df['mean_time_action'] = final_df['length_timestamp'] / final_df['length_step']
    # time between the last action and the final clickout (plus its log)
    final_df['elapsed_last_action_click'] = final_df['final_timestamp'] - final_df['timestamp_last_action']
    final_df['elapsed_last_action_click_log'] = np.log(final_df['elapsed_last_action_click'] + 1)
    # squared / absolute deviation of the final gap from the session mean
    final_df['variance_last_action'] = (final_df['elapsed_last_action_click'] - final_df['mean_time_action'])**2
    final_df['std_last_action'] = abs(final_df['elapsed_last_action_click'] - final_df['mean_time_action'])
    # drop the intermediates, keeping only the derived features
    final_df.drop(['timestamp_last_action', 'final_timestamp', 'mean_time_action',
                   'length_step', 'length_timestamp', 'elapsed_last_action_click'], axis=1, inplace=True)
    return final_df
def extract_feature(self):
    """
    For every impression of every last clickout, describe how the user
    interacted with that impression earlier in the session:
    num_interactions_impr | step_from_last_interaction | timestamp_from_last_interaction | last_action_type_with_impr
    Impressions never interacted with get -1 / 'None' sentinels.
    (num_interactions_impr and last_action_type_with_impr are dropped before
    returning.)
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # fillna so that .str.isnumeric() below never sees NaN
    temp = df.fillna('0')
    idxs_click = sorted(find_last_clickout_indices(temp))
    idxs_numeric_reference = temp[temp['reference'].str.isnumeric() == True].index
    count = 0
    last_click = idxs_click[0]
    # per-session accumulator: item id -> stats of the interactions so far
    impr_features = {}
    # one dict per expanded impression row, in clickout order
    impr_feature = []
    for i in tqdm(sorted(idxs_numeric_reference)):
        if i == last_click:
            # reached the session's last clickout: emit one dict per impression
            impressions = list(map(int, temp.at[i, 'impressions'].split('|')))
            click_timestamp = temp.at[i, 'timestamp']
            click_step = temp.at[i, 'step']
            for impr in impressions:
                if impr not in impr_features:
                    # never interacted within the session: sentinel values
                    impr_feature.append({'num_interactions_impr': 0, 'step_from_last_interaction': -1,
                                         'timestamp_from_last_interaction': -1,
                                         'last_action_type_with_impr': 'None'})
                else:
                    # turn the absolute step/timestamp of the last interaction
                    # into distances from the clickout
                    impr_features[impr]['timestamp_from_last_interaction'] = click_timestamp - impr_features[impr][
                        'timestamp_from_last_interaction']
                    impr_features[impr]['step_from_last_interaction'] = click_step - impr_features[impr][
                        'step_from_last_interaction']
                    impr_feature.append(impr_features[impr])
            # reset the accumulator for the next session
            impr_features = {}
            count += 1
            if count < len(idxs_click):
                last_click = idxs_click[count]
            continue
        # interaction row: record/refresh the stats of the referenced item.
        # NOTE(review): values are read from `df` here but from `temp` above;
        # they only differ on NaN rows, which isnumeric() already filtered out
        ref = int(temp.at[i, 'reference'])
        if ref in impr_features:
            impr_features[ref]['num_interactions_impr'] += 1
            impr_features[ref]['step_from_last_interaction'] = df.at[i, 'step']
            impr_features[ref]['timestamp_from_last_interaction'] = df.at[i, 'timestamp']
            impr_features[ref]['last_action_type_with_impr'] = df.at[i, 'action_type']
        else:
            impr_features[ref] = {'num_interactions_impr': 1, 'step_from_last_interaction': df.at[i, 'step'],
                                  'timestamp_from_last_interaction': df.at[i, 'timestamp'],
                                  'last_action_type_with_impr': df.at[i, 'action_type']}
    # one row per (clickout, impression); aligns positionally with impr_feature
    final_df = expand_impressions(temp[['user_id', 'session_id', 'impressions']].loc[idxs_click])
    print(len(final_df))
    print(len(impr_feature))
    final_df['dict'] = impr_feature
    # explode the dicts into columns
    features_df = pd.DataFrame(final_df.progress_apply(lambda x: tuple(x['dict'].values()), axis=1).tolist(),
                               columns=list(final_df.iloc[0].dict.keys()))
    final_df_ = pd.concat([final_df, features_df], axis=1).drop('dict', axis=1)
    final_df_ = final_df_.drop(['num_interactions_impr', 'last_action_type_with_impr'], axis=1)
    return final_df_
def extract_feature(self):
    """
    For every impression of every last clickout, compute the rank its price
    occupies when the impressions are sorted by ascending price.
    Returns one row per impression:
    user_id | session_id | item_id | price | impression_pos_price
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    idxs_click = find_last_clickout_indices(df)
    df = df.loc[idxs_click][['user_id', 'session_id', 'impressions', 'prices']]
    impression_price_position_list = []
    fraction_pos_price_list = []
    for i in tqdm(df.index):
        impr = list(map(int, df.at[i, 'impressions'].split('|')))
        prices = list(map(int, df.at[i, 'prices'].split('|')))
        # 1-based display positions of the impressions
        impression_position = np.arange(len(impr)) + 1
        # sort the (price, position, item) triples by ascending price
        couples = zip(prices, impression_position, impr)
        couples = sorted(couples, key=lambda a: a[0])
        prices_ordered, position, impressions_ordered = zip(*couples)
        # invert the permutation: price_pos ends up holding, for each display
        # position, the price-rank of the impression shown there
        _, price_pos = list(zip(*sorted(list(zip(position, impression_position)), key=lambda a: a[0])))
        # display position / price rank — computed but not returned
        fraction_pos_price = list(impression_position / price_pos)
        fraction_pos_price_list.append(np.array(fraction_pos_price))
        impression_price_position_list.append(np.array(price_pos))
    df['impression_pos_price'] = impression_price_position_list
    # expand the per-clickout lists into one row per impression
    df['impressions'] = df['impressions'].str.split('|')
    df['prices'] = df['prices'].str.split('|')
    final_df = pd.DataFrame({
        col: np.repeat(df[col], df['impressions'].str.len())
        for col in df.columns.drop(['impressions', 'prices'])
    }).assign(**{
        'item_id': np.concatenate(df['impressions'].values),
        'price': np.concatenate(df['prices'].values),
        # overwrites the repeated array column with the flattened per-item ranks
        'impression_pos_price': np.concatenate(df['impression_pos_price'].values)
    })
    final_df['item_id'] = pd.to_numeric(final_df['item_id'])
    final_df['impression_pos_price'] = pd.to_numeric(final_df['impression_pos_price'])
    final_df['price'] = pd.to_numeric(final_df['price'])
    return final_df
def get_class_to_sessions_dict(train):
    """Map each impression position (0..24) to the list of session ids whose
    last clickout clicked the impression shown at that position. Sessions whose
    reference is not among their impressions are skipped."""
    last_clickouts = train.loc[find_last_clickout_indices(train)]
    class_to_sessions = {position: [] for position in range(25)}
    rows = zip(last_clickouts.session_id, last_clickouts.reference, last_clickouts.impressions)
    for session_id, reference, impressions in tqdm(rows):
        impression_ids = [int(imp) for imp in impressions.split("|")]
        clicked = int(reference)
        if clicked in impression_ids:
            class_to_sessions[impression_ids.index(clicked)].append(session_id)
    return class_to_sessions
def extract_feature(self):
    """Return one row per last clickout with the device the session used:
    user_id | session_id | session_device."""
    df = pd.concat([data.train_df(mode=self.mode, cluster=self.cluster),
                    data.test_df(mode=self.mode, cluster=self.cluster)])
    last_clickout_idxs = find_last_clickout_indices(df)
    rows = [(df.at[idx, 'user_id'], df.at[idx, 'session_id'], df.at[idx, 'device'])
            for idx in last_clickout_idxs]
    return pd.DataFrame(rows, columns=['user_id', 'session_id', 'session_device'])
def merge_features_tf_cv(mode, cluster, features_array):
    """
    Expand the last clickouts of train+test into one row per impression and
    join every feature in `features_array` onto them.

    Each item of features_array is a feature class, or a (feature class,
    one_hot flag) tuple; it is instantiated and read via read_feature().

    Returns (click_df, context_features_id): the joined dataframe sorted back
    to the original clickout/impression order, and the array of column ids
    (as strings) belonging to session-wise (non per-item) features.
    """
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df
    # retrieve the indeces of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)
    # filter on the found indeces obtaining only the rows of a last clickout
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()
    # expand the impression as rows
    print('expand the impression')
    click_df = expand_impressions(click_df)[['user_id', 'session_id', 'item_id', 'index']]
    # dummy_step preserves the impression order within each clickout across merges
    click_df['dummy_step'] = np.arange(len(click_df))
    # do the join
    print('join with the features')
    print(f'train_shape: {click_df.shape}\n')
    context_features_id = []
    for f in features_array:
        if type(f) == tuple:
            feature = f[0](mode=mode, cluster='no_cluster').read_feature(one_hot=f[1])
        else:
            feature = f(mode=mode, cluster='no_cluster').read_feature(one_hot=True)
        print(f'columns of the feature:\n {feature.columns}')
        print(f'NaN values are: {feature.isnull().values.sum()}')
        # fill NaNs with 0
        # NOTE(review): merge_features_tf fills NaNs with -1 instead — confirm
        # this inconsistency is intentional
        feature.fillna(0, inplace=True)
        # check if it is a feature of the impression: session-wise features
        # (no item_id column) contribute "context" column ids
        if 'item_id' not in feature.columns:
            for i in range(click_df.shape[1] - 6 + 1, click_df.shape[1] - 6 + 1 + feature.shape[1] - 2, 1):
                context_features_id.append(str(i))
            print(f'session features names:{context_features_id}')
        print(f'shape of feature: {feature.shape}')
        print(f'len of feature:{len(feature)}')
        click_df = click_df.merge(feature)
        print(f'train_shape: {click_df.shape}\n ')
    print('sorting by index and step...')
    # sort the dataframe back to the original clickout/impression order
    click_df.sort_values(['index', 'dummy_step'], inplace=True)
    click_df.drop('dummy_step', axis=1, inplace=True)
    print('after join')
    return click_df, np.array(context_features_id)
def extract_feature(self):
    """
    Binary label per (last clickout, impression) pair: 1 iff the impression is
    the clicked reference. Returns: user_id | session_id | item_id | label
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    if self.mode in ['small', 'local']:
        # in local modes the test clickout references are masked: restore them
        # so the label can be computed
        print('reinserting clickout')
        test = test.groupby(['session_id', 'user_id']).progress_apply(_reinsert_clickout)
    df = pd.concat([train, test])
    idxs_click = find_last_clickout_indices(df)
    df = df.loc[idxs_click][['user_id', 'session_id', 'reference', 'impressions']]
    df = expand_impressions(df)
    # label 1 where the expanded impression equals the clicked reference;
    # astype('float') lets NaN references compare False instead of raising
    df['label'] = (df['item_id'] == df['reference'].astype('float')) * 1
    df.drop(['index', 'reference'], axis=1, inplace=True)
    print(df)
    return df
def extract_feature(self):
    """
    Session-level label: 1 if the session's last clickout clicked the FIRST
    impression, 0 otherwise, NaN when the reference is missing (test rows).
    Returns: user_id | session_id | label
    """
    df = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    if self.mode in ['small', 'local']:
        # in local modes the test clickout references are masked: restore them
        # so the label can be computed
        print('reinserting clickout')
        test = test.groupby(['session_id', 'user_id']).progress_apply(_reinsert_clickout)
    df = pd.concat([df, test])
    idxs_click = find_last_clickout_indices(df)
    labels = list()
    # keep only the last clickout rows
    df = df[df.index.isin(idxs_click)]
    for t in tqdm(zip(df.reference, df.impressions)):
        # a float reference means NaN (masked clickout of a test session)
        if type(t[0]) != float:
            reference = int(t[0])
            impressions = list(map(int, t[1].split("|")))
            if reference in impressions and impressions.index(reference) == 0:
                labels.append(1)
            else:
                labels.append(0)
        else:
            labels.append(np.nan)
    df = df[["user_id", "session_id"]]
    df["label"] = labels
    print(len(df))
    return df
def extract_feature(self):
    """Expand the last clickouts into one row per impression and attach each
    impression's 1-based position inside its clickout's impressions list.
    Returns: user_id | session_id | item_id | impression_position."""
    full = pd.concat([data.train_df(mode=self.mode, cluster=self.cluster),
                      data.test_df(mode=self.mode, cluster=self.cluster)])
    clickouts = full.loc[find_last_clickout_indices(full)][['user_id', 'session_id', 'impressions']]
    expanded = expand_impressions(clickouts)
    # running counter that restarts whenever the session id changes: expanded
    # rows of one clickout are contiguous, so this numbers them 1..n
    positions = []
    previous_session = ''
    position = 1
    for row_idx in tqdm(expanded.index):
        current_session = expanded.at[row_idx, 'session_id']
        if current_session != previous_session:
            previous_session = current_session
            position = 1
        positions.append(position)
        position += 1
    expanded['impression_position'] = positions
    expanded['impression_position'] = pd.to_numeric(expanded['impression_position'])
    expanded.drop('index', axis=1, inplace=True)
    return expanded
def extract_feature(self):
    """
    Price profile of each session, one row per last clickout: statistics of
    the clickout's own price list, of the prices of the items the user
    interacted with (any action type), and of the prices of previous
    in-session clickouts, plus a poor/rich user class obtained comparing the
    interaction mean price with the clickout mean price.
    Sessions without prior interactions get -1 / 'None' sentinels.
    """
    # item_id -> mean price over all impressions, used to price interaction
    # rows that carry no price of their own
    price_dict_df = ImpressionsAveragePrice().read_feature().set_index('item_id')
    price_dict = price_dict_df.to_dict('index')
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # fillna so that .str.isnumeric() below never sees NaN
    temp = df.fillna('0')
    idxs_click = sorted(find_last_clickout_indices(temp))
    idxs_numeric_reference = temp[temp['reference'].str.isnumeric() == True].index
    count = 0
    last_click = idxs_click[0]
    features = []
    # per-session accumulators, reset after every last clickout
    prices_interacted = []
    impression_interacted = {}
    prices_clickout_interacted = []
    for i in tqdm(sorted(idxs_numeric_reference)):
        if i == last_click:
            # reached the session's last clickout: aggregate the accumulators
            prices_click = sorted(list(map(int, temp.at[i, 'prices'].split('|'))))
            mean_price_click = np.mean(np.array(prices_click))
            max_price_click = prices_click[-1]
            min_price_click = prices_click[0]
            var_prices_click = np.var(np.array(prices_click))
            support_interaction = len(prices_interacted)
            if support_interaction == 0:
                # no interactions before the click: sentinel values
                last_price_interacted = -1
                mean_price_interacted = -1
                min_price_interacted = -1
                max_price_interacted = -1
                user_class = 'None'
                var_price_interacted = -1
                distance_max_price_from_mean = -1
            else:
                last_price_interacted = prices_interacted[-1]
                prices_interacted = sorted(prices_interacted)
                mean_price_interacted = np.mean(np.array(prices_interacted))
                min_price_interacted = prices_interacted[0]
                max_price_interacted = prices_interacted[-1]
                user_class = 'poor' if mean_price_interacted < mean_price_click else 'rich'
                var_price_interacted = np.var(np.array(prices_interacted))
                distance_max_price_from_mean = max_price_interacted - mean_price_click
            support_interaction_clickout = len(prices_clickout_interacted)
            if support_interaction_clickout == 0:
                # no previous clickouts in the session: sentinel values
                # (the scalar assignment to prices_clickout_interacted is
                # harmless — the accumulator is reset to [] below)
                last_price_clickout_interacted = -1
                prices_clickout_interacted = -1
                mean_price_clickout_interacted = -1
                min_price_clickout_interacted = -1
                max_price_clickout_interacted = -1
                user_click_class = 'None'
                var_prices_click_interacted = -1
                distance_max_price_clickout_from_mean = -1
            else:
                last_price_clickout_interacted = prices_clickout_interacted[-1]
                prices_clickout_interacted = sorted(prices_clickout_interacted)
                mean_price_clickout_interacted = np.mean(np.array(prices_clickout_interacted))
                min_price_clickout_interacted = prices_clickout_interacted[0]
                max_price_clickout_interacted = prices_clickout_interacted[-1]
                user_click_class = 'poor' if mean_price_clickout_interacted < mean_price_click else 'rich'
                var_prices_click_interacted = np.var(np.array(prices_clickout_interacted))
                distance_max_price_clickout_from_mean = max_price_clickout_interacted - mean_price_click
            # mean_price_click itself is only used for comparisons, not emitted
            features_dict = {
                'max_price_click': max_price_click,
                'min_price_click': min_price_click,
                'var_prices_click': var_prices_click,
                'support_interaction': support_interaction,
                'last_price_interacted': last_price_interacted,
                'mean_price_interacted': mean_price_interacted,
                'min_price_interacted': min_price_interacted,
                'max_price_interacted': max_price_interacted,
                'user_class': user_class,
                'var_price_interacted': var_price_interacted,
                'distance_max_price_from_mean': distance_max_price_from_mean,
                'support_interaction_clickout': support_interaction_clickout,
                'last_price_clickout_interacted': last_price_clickout_interacted,
                'mean_price_clickout_interacted': mean_price_clickout_interacted,
                'min_price_clickout_interacted': min_price_clickout_interacted,
                'max_price_clickout_interacted': max_price_clickout_interacted,
                'user_click_class': user_click_class,
                'var_prices_click_interacted': var_prices_click_interacted,
                'distance_max_price_clickout_from_mean': distance_max_price_clickout_from_mean
            }
            features.append(features_dict)
            count += 1
            # reset the accumulators for the next session
            prices_interacted = []
            impression_interacted = {}
            prices_clickout_interacted = []
            if count < len(idxs_click):
                last_click = idxs_click[count]
            continue
        ref = int(temp.at[i, 'reference'])
        action_type = temp.at[i, 'action_type']
        if action_type == 'clickout item':
            # previous clickout: take the price of the clicked impression
            prices = list(map(int, temp.at[i, 'prices'].split('|')))
            impressions = list(map(int, temp.at[i, 'impressions'].split('|')))
            # NOTE(review): raises ValueError if the clicked ref is not among
            # the impressions — confirm that can never happen here
            idx = impressions.index(ref)
            prices_clickout_interacted.append(prices[idx])
            if ref not in impression_interacted:
                impression_interacted[ref] = 1
                prices_interacted.append(prices[idx])
        else:
            if ref not in impression_interacted:
                impression_interacted[ref] = 1
                if ref in price_dict:
                    # non-clickout interactions carry no price: use the item's
                    # mean impression price instead
                    prices_interacted.append(price_dict[ref]['prices_mean'])
                else:
                    pass
    final_df = temp[['user_id', 'session_id']].loc[idxs_click]
    final_df['dict'] = features
    # explode the dicts into columns
    features_df = pd.DataFrame(final_df.progress_apply(
        lambda x: tuple(x['dict'].values()), axis=1).tolist(),
        columns=list(final_df.iloc[0].dict.keys()))
    final_df_ = pd.merge(
        final_df.drop('dict', axis=1).reset_index(drop=True).reset_index(),
        features_df.reset_index()).drop('index', axis=1)
    return final_df_
def extract_feature(self):
    """
    Positional statistics of the impressions the user interacted with before
    the session's last clickout: mean/min/max/first/last 1-based position in
    the clickout's impressions list, interaction count and percentage of
    distinct impressions interacted with. Sessions without interactions get
    -1 sentinels. mean/max/last positions are computed but dropped before
    returning.
    """
    train = data.train_df(mode=self.mode, cluster=self.cluster)
    test = data.test_df(mode=self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    # fillna so that .str.isnumeric() below never sees NaN
    temp = df.fillna('0')
    idxs_click = sorted(find_last_clickout_indices(temp))
    idxs_numeric_reference = temp[temp['reference'].str.isnumeric() == True].index
    count = 0
    last_click = idxs_click[0]
    # per-session accumulator: step -> referenced item id
    sess_features_dict = {}
    sess_feature = []
    for i in tqdm(sorted(idxs_numeric_reference)):
        # user/session owning the upcoming last clickout; used to discard rows
        # belonging to other sessions
        n_user = df.at[last_click, 'user_id']
        n_sess = df.at[last_click, 'session_id']
        if i == last_click:
            impressions = list(map(int, temp.at[i, 'impressions'].split('|')))
            impressions_len = len(impressions)
            num_interacted_impressions = 0
            impression_interacted = set()
            positions_interacted = []
            # the session's interactions ordered by step
            tuples = sorted(list(sess_features_dict.items()), key=lambda t: t[0])
            min_pos = 26
            max_pos = -1
            first_pos = None
            last_pos = None
            mean_pos_interacted = -1
            for t in tuples:
                try:
                    # 1-based position of the interacted item in the impressions
                    index = impressions.index(t[1]) + 1
                    if first_pos is None:
                        first_pos = index
                    # last_pos tracks the position of the most recent match
                    last_pos = index
                    if index < min_pos:
                        min_pos = index
                    if index > max_pos:
                        max_pos = index
                    num_interacted_impressions += 1
                    impression_interacted.add(t[1])
                    positions_interacted.append(index)
                except ValueError:
                    # interacted item not among the impressions: skip it
                    pass
            if impressions_len > 0:
                percentage_interacted_impression = len(impression_interacted) / impressions_len
            else:
                # NOTE(review): an empty impressions list yields 100% — confirm
                percentage_interacted_impression = 1
            if num_interacted_impressions == 0:
                # no interaction matched an impression: sentinel values
                min_pos = -1
                max_pos = -1
                first_pos = -1
                last_pos = -1
            if num_interacted_impressions > 0:
                mean_pos_interacted = sum(positions_interacted) / len(positions_interacted)
            f_d = {
                'mean_pos_interacted': mean_pos_interacted,
                'min_pos_interacted': min_pos,
                'max_pos_interacted': max_pos,
                'first_pos_interacted': first_pos,
                'last_pos_interacted': last_pos,
                'num_interacted_impressions': num_interacted_impressions,
                'percentage_interacted_impressions': percentage_interacted_impression
            }
            sess_feature.append(f_d)
            # reset the accumulator for the next session
            sess_features_dict = {}
            count += 1
            if count < len(idxs_click):
                last_click = idxs_click[count]
            continue
        # record the interaction only if the row belongs to the session of the
        # upcoming clickout
        if (temp.at[i, 'user_id'] == n_user) and (temp.at[i, 'session_id'] == n_sess):
            ref = int(temp.at[i, 'reference'])
            step_interaction = temp.at[i, 'step']
            sess_features_dict[step_interaction] = ref
    final_df = temp[['user_id', 'session_id']].loc[idxs_click]
    final_df['dict'] = sess_feature
    # explode the dicts into columns
    features_df = pd.DataFrame(final_df.progress_apply(
        lambda x: tuple(x['dict'].values()), axis=1).tolist(),
        columns=list(final_df.iloc[0].dict.keys()))
    final_df_ = pd.merge(
        final_df.drop('dict', axis=1).reset_index(drop=True).reset_index(),
        features_df.reset_index()).drop('index', axis=1)
    return final_df_.drop(['mean_pos_interacted', 'max_pos_interacted', 'last_pos_interacted'], axis=1)
def merge_features_tf(mode, cluster, features_array, stacking_scores_path):
    """
    Split the last clickouts of train+test into a train part and a
    validation/test part (the target indices), expand both into one row per
    impression, and join every feature in `features_array` (plus optional
    stacking score csv files) onto them.

    Returns (train_df, validation_test_df, context_features_id): the two joined
    dataframes sorted back to the original order, and the array of column ids
    (as strings) belonging to session-wise (non per-item) features.
    """
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df
    # retrieve the indeces of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)
    # filter on the found indeces obtaining only the rows of a last clickout
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()
    print('retrieve vali_idxs')
    # if the mode is full we don't have the validation; if the mode is small or
    # local the validation is performed on the target indices
    vali_test_idxs = data.target_indices(mode, cluster)
    # construct the validation/test df
    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]
    all_idxs = click_df.index.values
    # the train rows are the remaining last clickouts
    print('construct train df')
    train_idxs = np.setdiff1d(all_idxs, vali_test_idxs, assume_unique=True)
    train_df = click_df.loc[train_idxs]
    # expand the impression as rows; dummy_step preserves the impression order
    # within each clickout across merges
    print('expand the impression')
    train_df = expand_impressions(train_df)[['user_id', 'session_id', 'item_id', 'index']]
    train_df['dummy_step'] = np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[['user_id', 'session_id', 'item_id', 'index']]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))
    # do the join
    print('join with the features')
    print(f'train_shape: {train_df.shape}\n vali_test_shape: {validation_test_df.shape}')
    context_features_id = []
    for f in features_array:
        if type(f) == tuple:
            feature = f[0](mode=mode, cluster='no_cluster').read_feature(one_hot=f[1])
        else:
            feature = f(mode=mode, cluster='no_cluster').read_feature(one_hot=True)
        print(f'columns of the feature:\n {feature.columns}')
        print(f'NaN values are: {feature.isnull().values.sum()}')
        # if there are NaNs fill them with -1
        feature.fillna(-1, inplace=True)
        # check if it is a feature of the impression: session-wise features
        # (no item_id column) contribute "context" column ids
        if 'item_id' not in feature.columns:
            for i in range(train_df.shape[1] - 6 + 1, train_df.shape[1] - 6 + 1 + feature.shape[1] - 2, 1):
                context_features_id.append(str(i))
            print(f'session features names:{context_features_id}')
        print(f'shape of feature: {feature.shape}')
        print(f'len of feature:{len(feature)}')
        train_df = train_df.merge(feature)
        validation_test_df = validation_test_df.merge(feature)
        print(f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}')
    # NOTE(review): a single-element path list is skipped by > 1 — confirm this
    # is intended (it looks like it should be len(stacking_scores_path) > 0)
    if len(stacking_scores_path) > 1:
        for path in stacking_scores_path:
            score = pd.read_csv(path)
            # keep only the join keys and the score columns
            cols = [c for c in score.columns if c in ['user_id', 'session_id', 'item_id'] or 'score' in c]
            score = score[cols]
            # deduplicate on the join key, keeping the last occurrence
            score = score.groupby(['user_id', 'session_id', 'item_id'], as_index=False).last()
            train_df = train_df.merge(score, on=['user_id', 'session_id', 'item_id'], how='left')
            validation_test_df = validation_test_df.merge(score, on=['user_id', 'session_id', 'item_id'], how='left')
            print(f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}')
        # the left joins can introduce NaNs for missing scores
        train_df.fillna(0, inplace=True)
        validation_test_df.fillna(0, inplace=True)
    print('sorting by index and step...')
    # sort the dataframes back to the original clickout/impression order
    train_df.sort_values(['index', 'dummy_step'], inplace=True)
    train_df.drop('dummy_step', axis=1, inplace=True)
    validation_test_df.sort_values(['index', 'dummy_step'], inplace=True)
    validation_test_df.drop('dummy_step', axis=1, inplace=True)
    print('after join')
    return train_df, validation_test_df, np.array(context_features_id)
def extract_feature(self):
    """
    For every interaction with a numeric reference (last clickouts excluded),
    compute the position the interacted item's price occupies in the
    ascending-sorted price list of the session's next clickout; -1 when the
    item is not among those impressions or the next clickout belongs to a
    different session. Returns a dataframe indexed by the original df index
    with the single column price_pos.
    """
    tqdm.pandas()
    df = data.full_df()
    # reset index to correct access: the original index is kept in the
    # 'index' column and restored at the end
    df = df.sort_values(['user_id', 'session_id', 'timestamp', 'step']).reset_index()
    # find the last clickout rows
    last_clickout_idxs = find_last_clickout_indices(df)
    clickout_rows = df.loc[last_clickout_idxs, ['user_id', 'session_id', 'action_type', 'impressions', 'prices']]
    # cast the impressions and the prices to lists
    clickout_rows['impression_list'] = clickout_rows.impressions.str.split('|').apply(lambda x: list(map(int, x)))
    clickout_rows['price_list'] = clickout_rows.prices.str.split('|').apply(lambda x: list(map(int, x)))
    clickout_rows = clickout_rows.drop('impressions', axis=1)
    # order the prices lists ascending
    clickout_rows['sorted_price_list'] = clickout_rows.price_list.apply(lambda x: sorted(x))
    clickout_rows = clickout_rows.drop('prices', axis=1)
    # find the interactions with numeric reference and not last clickouts
    reference_rows = df[['user_id', 'session_id', 'reference', 'action_type', 'index']]
    reference_rows = reference_rows[df.reference.str.isnumeric() == True].astype({'reference': 'int'})
    # skip last clickouts
    reference_rows = reference_rows.loc[~reference_rows.index.isin(last_clickout_idxs)]
    reference_rows = reference_rows.drop('action_type', axis=1)
    # -1 sentinel: reference not priced in the next clickout
    ref_pos_series = np.ones(reference_rows.shape[0], dtype=int) * (-1)
    # iterate reference_rows and clickout_rows in lockstep, advancing j to the
    # first clickout at or after the current interaction's index
    j = 0
    clickout_indices = clickout_rows.index.values
    ckidx = clickout_indices[j]
    next_clickout_user_id = clickout_rows.at[ckidx, 'user_id']
    next_clickout_sess_id = clickout_rows.at[ckidx, 'session_id']
    k = 0
    for row in tqdm(zip(reference_rows.index, reference_rows.user_id,
                        reference_rows.session_id, reference_rows.reference)):
        idx = row[0]
        # if the current index is over the last clickout, break
        if idx >= clickout_indices[-1]:
            break
        # find the next clickout index
        while idx > clickout_indices[j]:
            j += 1
            ckidx = clickout_indices[j]
            next_clickout_user_id = clickout_rows.at[ckidx, 'user_id']
            next_clickout_sess_id = clickout_rows.at[ckidx, 'session_id']
        next_clickout_impress = clickout_rows.at[ckidx, 'impression_list']
        next_clickout_prices = clickout_rows.at[ckidx, 'price_list']
        next_clickout_sortedprices = clickout_rows.at[ckidx, 'sorted_price_list']
        # check if row and next_clickout are in the same session
        if row[1] == next_clickout_user_id and row[2] == next_clickout_sess_id:
            try:
                ref_idx = next_clickout_impress.index(row[3])
                ref_price = int(next_clickout_prices[ref_idx])
                ref_pos_series[k] = next_clickout_sortedprices.index(ref_price)
            except:
                # reference not among the impressions: keep the -1 sentinel
                pass
        k += 1
    reference_rows['price_pos'] = ref_pos_series
    return reference_rows.drop(['user_id', 'session_id', 'reference'], axis=1).set_index('index')
def extract_feature(self):
    """
    For every last clickout of a session, compute over all numeric
    interactions preceding it:
      - the mean price of the interacted items (looked up in the price list
        of the first session clickout that contains the item), and
      - the mean 1-based position of those prices in the sorted price list.
    Both default to -1 when the session has no resolvable interactions.

    Returns:
        pd.DataFrame with columns user_id | session_id |
        mean_cheap_pos_interacted | mean_price_interacted, one row per last
        clickout, index reset.
    """
    train = data.train_df(self.mode, cluster=self.cluster)
    test = data.test_df(self.mode, cluster=self.cluster)
    df = pd.concat([train, test])
    idxs_click = sorted(find_last_clickout_indices(df))

    # for every last clickout index, retrieve the list of all the clickouts
    # for that session (impressions, prices in impression order, prices sorted)
    list_impres = []
    list_prices_impres_wise = []
    list_prices_orderd_wise = []
    for i in tqdm(idxs_click):
        a_user = df.at[i, 'user_id']
        a_sess = df.at[i, 'session_id']
        impres = [list(map(int, df.at[i, 'impressions'].split('|')))]
        prices = list(map(int, df.at[i, 'prices'].split('|')))
        prices_impres_wise = [prices]
        prices_orderd_wise = [sorted(prices)]
        # walk backwards through the session collecting earlier clickouts
        j = i - 1
        while j >= 0:
            try:
                n_user = df.at[j, 'user_id']
                n_sess = df.at[j, 'session_id']
                if a_sess == n_sess and a_user == n_user:
                    if df.at[j, 'action_type'] == 'clickout item':
                        impres.append(
                            list(map(int, df.at[j, 'impressions'].split('|'))))
                        prices = list(map(int, df.at[j, 'prices'].split('|')))
                        prices_impres_wise.append(prices)
                        prices_orderd_wise.append(sorted(prices))
                else:
                    break
                j -= 1
            except Exception:
                # best-effort: a missing index label (KeyError after concat)
                # or malformed row just skips one position backwards.
                # Narrowed from a bare except so Ctrl-C still works.
                j -= 1
        list_impres.append(impres)
        list_prices_impres_wise.append(prices_impres_wise)
        list_prices_orderd_wise.append(prices_orderd_wise)

    # then build the feature: average price / average sorted-price position
    # of the items interacted with before each last clickout
    list_mean_prices_interacted = []
    list_mean_pos_interacted = []
    count = 0
    for i in tqdm(idxs_click):
        prices_interacted = []
        pos_interacted = []
        a_user = df.at[i, 'user_id']
        a_sess = df.at[i, 'session_id']
        impres = list_impres[count]
        prices_impres_wise = list_prices_impres_wise[count]
        prices_orderd_wise = list_prices_orderd_wise[count]
        j = i - 1
        while j >= 0:
            try:
                n_user = df.at[j, 'user_id']
                n_sess = df.at[j, 'session_id']
                if a_sess == n_sess and a_user == n_user:
                    n_ref = df.at[j, 'reference']
                    if n_ref.isdigit():
                        n_ref = int(n_ref)
                        # scan the session clickouts until one contains the
                        # reference; if none does, the IndexError below is
                        # caught and the row is skipped
                        count_clickouts = 0
                        while True:
                            elem_impres = impres[count_clickouts]
                            elem_prices_impres_wise = prices_impres_wise[count_clickouts]
                            elem_prices_orderd_wise = prices_orderd_wise[count_clickouts]
                            if n_ref in elem_impres:
                                price_reference = elem_prices_impres_wise[
                                    elem_impres.index(n_ref)]
                                prices_interacted.append(price_reference)
                                # 1-based position in the sorted price list
                                pos_interacted.append(
                                    elem_prices_orderd_wise.index(price_reference) + 1)
                                break
                            else:
                                count_clickouts += 1
                    j -= 1
                else:
                    break
            except Exception:
                # best-effort: NaN reference (AttributeError on .isdigit()),
                # reference absent from every clickout (IndexError), or a
                # missing index label (KeyError) — skip one row backwards.
                j -= 1
        if len(prices_interacted) > 0:
            list_mean_prices_interacted.append(
                sum(prices_interacted) / len(prices_interacted))
        else:
            list_mean_prices_interacted.append(-1)
        if len(pos_interacted) > 0:
            list_mean_pos_interacted.append(
                sum(pos_interacted) / len(pos_interacted))
        else:
            list_mean_pos_interacted.append(-1)
        count += 1

    final_df = df[['user_id', 'session_id']].loc[idxs_click]
    final_df['mean_cheap_pos_interacted'] = list_mean_pos_interacted
    final_df['mean_price_interacted'] = list_mean_prices_interacted
    return final_df.reset_index(drop=True)
def merge_features_lgb(mode, cluster, features_array):
    """
    Build the LightGBM train and validation/test dataframes: take the last
    clickout of every session, split train vs target rows, expand each
    clickout into one row per impression, and inner-join every feature in
    features_array.

    Args:
        mode: dataset mode passed to data.train_df/test_df/target_indices.
        cluster: dataset cluster for the same calls (features are always
            read with cluster='no_cluster').
        features_array: iterable of feature classes; each is instantiated
            with (mode, cluster='no_cluster') and read via
            read_feature(one_hot=False).

    Returns:
        (train_df, validation_test_df), both sorted by the original row
        'index' and, within it, by the impression order.
    """
    # load the full_df
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)
    full_df = pd.concat([train_df, test_df])
    del train_df, test_df

    # retrieve the indeces of the last clickouts
    print('find_last_click_idxs')
    last_click_idxs = find_last_clickout_indices(full_df)

    # filter on the found indeces obtaining only the rows of a last clickout
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()

    print('retrieve vali_idxs')
    # if the mode is full we don't have the validation; if the mode is small
    # or local the validation is performed on the target indices
    vali_test_idxs = data.target_indices(mode, cluster)

    # construct the validation train and test df_base
    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]
    all_idxs = click_df.index.values

    # find the differences
    print('construct train df')
    train_idxs = np.setdiff1d(all_idxs, vali_test_idxs, assume_unique=True)
    train_df = click_df.loc[train_idxs]

    # expand the impression as rows; dummy_step preserves impression order
    # across the later sort
    print('expand the impression')
    train_df = expand_impressions(train_df)[[
        'user_id', 'session_id', 'item_id', 'index'
    ]]
    train_df['dummy_step'] = np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[[
        'user_id', 'session_id', 'item_id', 'index'
    ]]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))

    # do the join
    print('join with the features')
    print(
        f'train_shape: {train_df.shape}\n vali_test_shape: {validation_test_df.shape}'
    )
    time_joins = 0
    for f in features_array:
        _feature = f(mode=mode, cluster='no_cluster')
        feature = _feature.read_feature(one_hot=False)
        print(f'shape of feature: {feature.shape}\n')
        print(f'len of feature:{len(feature)}\n')
        start = time()
        train_df = train_df.merge(feature)
        validation_test_df = validation_test_df.merge(feature)
        # measure once: the original sampled time() twice, so the printed
        # and accumulated durations disagreed (and included print latency)
        elapsed = time() - start
        print(f'time to do the join: {elapsed}')
        time_joins += elapsed
        print(
            f'train_shape: {train_df.shape}\n vali_shape: {validation_test_df.shape}'
        )
    print(f'total time to do joins: {time_joins}')

    print('sorting by index and step...')
    # sort the dataframes
    train_df.sort_values(['index', 'dummy_step'], inplace=True)
    train_df.drop('dummy_step', axis=1, inplace=True)
    validation_test_df.sort_values(['index', 'dummy_step'], inplace=True)
    validation_test_df.drop('dummy_step', axis=1, inplace=True)
    print('after join')
    return train_df, validation_test_df
def merge_features(mode, cluster, features_array, onehot=True, merge_kind='inner',
                   create_not_existing_features=True, multithread=False):
    """
    Prepare the per-impression train and validation/test dataframes and
    delegate the feature joins to the single- or multi-threaded merger.

    Args:
        mode, cluster: dataset selectors for data.train_df/test_df/target_indices.
        features_array: features forwarded to the merge helper.
        onehot, merge_kind, create_not_existing_features: forwarded to the
            merge helper unchanged.
        multithread: when True use actual_merge_multithread, otherwise
            actual_merge_one_thread.

    Returns:
        (train_df, validation_test_df, train_idxs, vali_test_idxs), the two
        dataframes sorted by original row 'index' and impression order.
    """
    # full dataframe = train + test for the requested mode/cluster
    full_df = pd.concat([data.train_df(mode, cluster), data.test_df(mode, cluster)])

    # ascending indices of every session's last clickout
    print('find_last_click_idxs')
    last_click_idxs = sorted(find_last_clickout_indices(full_df))

    # restrict to the last-clickout rows only
    print('filter full on last click idxs')
    click_df = full_df.loc[last_click_idxs].copy()

    print('retrieve vali_idxs')
    # the validation/test rows are the target indices of this mode/cluster
    vali_test_idxs = data.target_indices(mode, cluster)

    print('construct test and vali df')
    validation_test_df = click_df.loc[vali_test_idxs]

    print('construct train df')
    # train rows = last clickouts that are not target rows
    train_idxs = np.setdiff1d(click_df.index.values, vali_test_idxs, assume_unique=True)
    train_df = click_df.loc[train_idxs]

    # one row per (clickout, impression); dummy_step records the impression
    # order so it survives the final sort
    print('expand the impression')
    base_columns = ['user_id', 'session_id', 'item_id', 'index']
    train_df = expand_impressions(train_df)[base_columns]
    train_df['dummy_step'] = np.arange(len(train_df))
    validation_test_df = expand_impressions(validation_test_df)[base_columns]
    validation_test_df['dummy_step'] = np.arange(len(validation_test_df))

    # pick the merge strategy and join all the features
    merger = actual_merge_multithread if multithread else actual_merge_one_thread
    train_df, validation_test_df = merger(train_df, validation_test_df, features_array,
                                          mode, cluster, create_not_existing_features,
                                          merge_kind, onehot)

    print('sorting by index and step...')
    # restore clickout order, then impression order, and drop the helper column
    for frame in (train_df, validation_test_df):
        frame.sort_values(['index', 'dummy_step'], inplace=True)
        frame.drop('dummy_step', axis=1, inplace=True)
    print('after join')

    return train_df, validation_test_df, train_idxs, vali_test_idxs