import os

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import manhattan_distances, cosine_similarity

# Project-local helpers are assumed importable from elsewhere in the repository:
# data, sess2vec, sparsedf, datasetconfig, find_last_clickout_indices, find, check_folder, L2Norm.


def create_user_feature_dict(df):
    # drop the last clickout of each session: it is the interaction to predict
    idxs_clicks = find_last_clickout_indices(df)
    df = df.drop(idxs_clicks)

    # retrieve the icm
    icm = data.accomodations_one_hot()

    # filter on the columns of interest
    temp_df = df[['user_id', 'session_id', 'reference']].dropna()

    # keep only the rows with a numeric reference
    temp_df = temp_df[temp_df['reference'].str.isnumeric()]
    temp_df = temp_df.drop_duplicates()

    # retrieve the user-session pairs and the items associated to them
    cols = temp_df.to_dict('list')
    users_sess = list(zip(cols['user_id'], cols['session_id']))
    items_idxs = list(map(int, cols['reference']))

    # create a dictionary with ('user_id', 'session_id') tuples as keys and, as value, the array
    # that represents the user as an accomodation, summing up the features of all the
    # accomodations he has interacted with during the session
    user_session_dict = {}
    for user_session, item in tqdm(zip(users_sess, items_idxs), total=len(users_sess)):
        user_items = icm.loc[item].values
        if user_session in user_session_dict:
            user_session_dict[user_session] += user_items
        else:
            user_session_dict[user_session] = user_items
    return user_session_dict
def retrieve_pd_dataframe_score(df):
    icm = data.accomodations_one_hot().sort_index()
    sess_user_dict = create_user_feature_dict(df)
    idxs_clicks = find_last_clickout_indices(df)
    scores = []

    # iterate on the indices of the target clickouts and, for each of them, append one tuple per impression
    print('computing the distances...')
    for idx in tqdm(idxs_clicks):
        # retrieve the user, session and impressions of the clickout
        user = df.at[idx, 'user_id']
        sess = df.at[idx, 'session_id']
        impressions = list(map(int, df.at[idx, 'impressions'].split('|')))

        # retrieve the feature vector of the user-session pair; if the pair isn't in the dictionary,
        # the session had no numeric actions, so fall back to a zero vector
        us_tuple = (user, sess)
        if us_tuple in sess_user_dict:
            user_feature_vec = sess_user_dict[us_tuple]
        else:
            user_feature_vec = np.zeros(icm.shape[1])

        # retrieve the features of the impressed items
        features_imp = icm.loc[impressions].values

        # create the CLIPPED and THRESHOLDED variants of the user vector
        clipped_user_feature_vec = np.clip(user_feature_vec, 0, 1)
        tresholded_user_feature_vec = user_feature_vec.copy()
        if np.sum(user_feature_vec) > 0:
            treshold_limit = np.mean(user_feature_vec[user_feature_vec > 0])
            tresholded_user_feature_vec[tresholded_user_feature_vec < treshold_limit] = 0

        # compute the distances between each user vector variant and the impression features
        _scores_manhattan = manhattan_distances(user_feature_vec.reshape(1, -1), features_imp)
        _scores_cosine = cosine_similarity(user_feature_vec.reshape(1, -1), features_imp)
        _scores_jaccard_no_norm = np.dot(user_feature_vec, features_imp.T)

        _scores_manhattan_clip = manhattan_distances(clipped_user_feature_vec.reshape(1, -1), features_imp)
        _scores_cosine_clip = cosine_similarity(clipped_user_feature_vec.reshape(1, -1), features_imp)
        _scores_jaccard_no_norm_clip = np.dot(clipped_user_feature_vec, features_imp.T)

        _scores_manhattan_tr = manhattan_distances(tresholded_user_feature_vec.reshape(1, -1), features_imp)
        _scores_cosine_tr = cosine_similarity(tresholded_user_feature_vec.reshape(1, -1), features_imp)
        _scores_jaccard_no_norm_tr = np.dot(tresholded_user_feature_vec, features_imp.T)

        # append one tuple per impression to the final list
        for i in range(len(impressions)):
            scores.append((user, sess, impressions[i],
                           _scores_cosine[0][i], _scores_manhattan[0][i], _scores_jaccard_no_norm[i],
                           _scores_cosine_clip[0][i], _scores_manhattan_clip[0][i], _scores_jaccard_no_norm_clip[i],
                           _scores_cosine_tr[0][i], _scores_manhattan_tr[0][i], _scores_jaccard_no_norm_tr[i]))

    return pd.DataFrame(scores, columns=['user_id', 'session_id', 'item_id',
                                         'scores_cosine', 'scores_manhatthan', 'scores_jaccard_no_norm',
                                         'scores_cosine_clip', 'scores_manhatthan_clip', 'scores_jaccard_no_norm_clip',
                                         'scores_cosine_tr', 'scores_manhatthan_tr', 'scores_jaccard_no_norm_tr'])
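# Usage sketch (hypothetical, not part of the repository): build the similarity-score dataframe
# starting from the small preprocessed train split; data.train_df('small') mirrors the
# commented-out call that appears in extract_feature below.
def _example_compute_scores():
    df = data.train_df('small')
    scores_df = retrieve_pd_dataframe_score(df)
    print(scores_df.head())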
def merge_reference_features(df, pad_sessions_length):
    res_df = df.copy()
    # set the non-numeric references to 0 and cast to int
    res_df.loc[res_df.reference.str.isnumeric() != True, 'reference'] = 0
    res_df = res_df.astype({'reference': 'int'})
    # join with the accomodations one-hot features
    res_df = res_df.merge(data.accomodations_one_hot(), how='left', left_on='reference', right_index=True)
    # set to 0 the features of the non-joined rows
    features_cols = data.accomodations_one_hot().columns
    col_start = list(res_df.columns).index(features_cols[0])
    col_end = list(res_df.columns).index(features_cols[-1])
    res_df.loc[:, features_cols] = res_df.loc[:, features_cols].fillna(0)
    # remove the item features of the last row of each padded session
    # (col_end + 1 because iloc slicing is end-exclusive)
    # TO-DO: the clickout may not be the last interaction of the session
    res_df.iloc[np.arange(-1, len(res_df), pad_sessions_length)[1:], col_start:col_end + 1] = 0
    return res_df
def add_accomodations_features(df, path_to_save, logic='skip', row_indices=[]):
    """
    Add the one-hot features to the dataframe rows whose 'reference' matches an accomodation and
    save the resulting dataframe. It is possible to specify a list of rows to skip (logic='skip'),
    or to join only some rows (logic='subset').
    The joined dataframe is streamed to path_to_save in chunks, rather than returned.
    """
    # save the reference series, then set the reference to NaN to skip the join on those rows
    join_data = dict()
    join_data['backup_reference_series'] = df.reference.values.copy()
    if len(row_indices) > 0:
        if logic == 'skip':
            # set to NaN the rows to be skipped
            df.loc[row_indices, 'reference'] = np.nan
        if logic == 'subset':
            # set to NaN all the rows, except for the specified ones
            backup_serie = df.loc[row_indices].reference.copy()
            df.reference = np.nan
            df.loc[row_indices, 'reference'] = backup_serie

    # cast the reference column to numeric, discarding the string values
    df.reference = pd.to_numeric(df.reference, errors='coerce')    #.astype('Int64')

    # one-hot encoding of the accomodations features
    attributes_df = data.accomodations_one_hot()

    # accomodations features columns
    #features_columns = attributes_df.columns
    # with open(one_hot_accomodations_features_path, 'w') as f:
    #     pickle.dump(features_columns, f)

    #original_columns = set(df.columns)

    # add the 'no-reference' column
    #df['no_reference'] = (~df.reference.fillna('').str.isnumeric()) * 1
    # after_one_hot_columns = set(df.columns)
    # one_hot_columns = after_one_hot_columns.difference(original_columns)
    # one_hot_columns = list(one_hot_columns.union(set(features_columns)))

    def post_join(chunk_df, data):
        # reset the original references
        #chunk_df.loc[:,'reference'] = data['backup_reference_series'][data['$i1']:data['$i2']]
        return chunk_df.drop('reference', axis=1)

    sparsedf.left_join_in_chunks(df, attributes_df, left_on='reference', right_on=None, right_index=True,
                                 post_join_fn=post_join, data=join_data, path_to_save=path_to_save)
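# Usage sketch (hypothetical): stream-join the accomodations features on every row except the
# last clickout of each session, the same way create_dataset_for_regression uses it further down.
def _example_join_features(df, out_path):
    last_clickout_indices = list(find_last_clickout_indices(df))
    add_accomodations_features(df.copy(), out_path, logic='skip', row_indices=last_clickout_indices)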
def recommend_batch(self, target_indices):
    X, indices = self.dataset.load_Xtest()

    # predict the references
    predictions = self.model.predict(X)

    # flatten the predictions to 2 dimensions and the indices to 1 dimension
    predictions = predictions.reshape((-1, predictions.shape[-1]))
    indices = indices.flatten()

    # take only the target predictions; target_indices are assumed to be sorted ascending,
    # so after sort_index() the k-th prediction row matches target_indices[k]
    pred_df = pd.DataFrame(predictions)
    pred_df['orig_index'] = indices
    pred_df = pred_df.set_index('orig_index')
    predictions = pred_df.loc[target_indices].sort_index().values
    del pred_df

    assert len(predictions) == len(target_indices)

    full_df = data.full_df()
    accomodations1hot_df = data.accomodations_one_hot()

    result_predictions = []
    for k, index in tqdm(enumerate(target_indices)):
        # get the impressions of the clickout to predict
        impr = list(map(int, full_df.loc[index]['impressions'].split('|')))
        # get the true labels from the accomodations one-hot
        true_labels = accomodations1hot_df.loc[impr].values
        # build a list of (impression, l2norm distance) pairs
        prediction_impressions_distances = [(impr[j], L2Norm(true_labels[j] - predictions[k]))
                                            for j in range(len(impr))]
        # sort the list by l2norm (a smaller distance is better)
        prediction_impressions_distances.sort(key=lambda tup: tup[1])
        # keep only the impression ids, now reranked
        ordered_impressions = list(map(lambda x: x[0], prediction_impressions_distances))
        # append the pair (index, reranked impressions)
        result_predictions.append((index, ordered_impressions))

    print('predictions created!')
    return result_predictions
def get_user_favorite_filters(full_df, users):
    """
    Build, for every given user, a one-hot encoded structure that sums up all the properties
    of the hotels clicked by that user (e.g. property: '3 Stars').
    """
    # get the clickouts of the given users
    train_df = full_df[full_df["user_id"].isin(users)]
    train_df = train_df[(train_df["action_type"] == "clickout item") &
                        (pd.to_numeric(train_df['reference'], errors='coerce').notnull())]
    train_df.drop(["session_id", "timestamp", "step", "action_type", "platform", "city",
                   "device", "current_filters", "impressions", "prices"], axis=1, inplace=True)

    # merge the hotel metadata and drop the join columns
    metadata_one_hot = data.accomodations_one_hot().reset_index()
    train_df['reference'] = train_df['reference'].astype(int)
    metadata_one_hot['item_id'] = metadata_one_hot['item_id'].astype(int)
    train_df = pd.merge(train_df, metadata_one_hot, how='outer', left_on='reference', right_on='item_id')
    train_df = train_df.drop(["reference", "item_id"], axis=1)

    print("Finished binarizing, now summing to get the users' favorite hotel properties...")
    # after the drops, 'user_id' is the only non-feature column left, so sum columns[1:]
    out_df = train_df.groupby('user_id')[train_df.columns[1:]].sum()
    return out_df
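# Usage sketch (hypothetical): sum the one-hot properties of the hotels clicked by every user
# that appears in the full dataframe.
def _example_favorite_filters():
    full_df = data.full_df()
    users = full_df['user_id'].unique()
    return get_user_favorite_filters(full_df, users)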
def extract_feature(self):
    tqdm.pandas()

    df = data.full_df()
    #df = data.train_df('small')
    accom_df = data.accomodations_one_hot()

    # filters of the 'change-of-sort order' kind have a particular meaning
    # and must be handled in a separate feature
    change_sort_filters = set(['sort by price', 'sort by distance', 'sort by rating',
                               'sort by popularity', 'focus on rating', 'focus on distance', 'best value'])

    # find the clickout rows
    clickouts = df[(df.action_type == 'clickout item')]     # & df.current_filters.notnull()]
    clickouts = clickouts[['user_id', 'session_id', 'current_filters', 'impressions']]
    # split the filters and the impressions
    clickouts['filters_list'] = clickouts['current_filters'].str.lower().str.split('|').fillna('')
    clickouts['impress_list'] = clickouts['impressions'].str.split('|')
    clickouts = clickouts.drop(['impressions', 'current_filters'], axis=1)
    # cast the impressions to int
    clickouts['impress_list'] = clickouts['impress_list'].apply(lambda x: list(map(int, x)))
    # create the binarizer with the same classes as the accomodations one-hot
    mlb = MultiLabelBinarizer(classes=accom_df.columns.str.lower())

    # iterate over the clickouts and one-hot the filters
    print('Total interactions:', clickouts.shape[0])
    result = np.zeros((clickouts.shape[0], 25), dtype='float')      # at most 25 impressions per clickout
    k = 0
    for idx in tqdm(clickouts.index):
        filters = clickouts.at[idx, 'filters_list']
        impressions = clickouts.at[idx, 'impress_list']
        # do not consider the change-of-sort filters
        filters = set(filters).difference(change_sort_filters)
        # fix some wrong filter names
        if 'gay friendly' in filters:
            filters.remove('gay friendly')
            filters.add('gay-friendly')
        if 'internet (rooms)' in filters:
            filters.remove('internet (rooms)')
            filters.add('free wifi (rooms)')
        filters = list(filters)
        filters_len = len(filters)
        if filters_len > 0:
            # one-hot the filters
            filters_one_hot = mlb.fit_transform([filters])[0]
            # take the one-hot of the impressions tags
            impressions_features_one_hot = accom_df.loc[impressions].values
            # fraction of the active filters satisfied by each impression
            satisfaction_percentage = np.sum(np.bitwise_and(filters_one_hot, impressions_features_one_hot),
                                             axis=1) / filters_len
            result[k, 0:len(impressions)] = satisfaction_percentage
        else:
            # there are only change-of-sort filters
            result[k, 0:len(impressions)] = 1
        k += 1

    result = result.round(4)
    # add the 25 new columns, one per impression position
    for i in range(25):
        clickouts['satisf_perc_{}'.format(i)] = result[:, i]
    return clickouts.drop(['user_id', 'session_id', 'filters_list', 'impress_list'], axis=1)
def extract_feature(self):
    tqdm.pandas()

    tr = data.train_df(self.mode, cluster=self.cluster)
    te = data.test_df(self.mode, cluster=self.cluster)
    df = pd.concat([tr, te])
    accom_df = data.accomodations_one_hot()

    # filters of the 'change-of-sort order' kind have a particular meaning
    # and must be handled in a separate feature
    change_sort_filters = set(['sort by price', 'sort by distance', 'sort by rating',
                               'sort by popularity', 'focus on rating', 'focus on distance', 'best value'])

    # find the last clickout rows
    last_clk = find(df)
    clickouts = df.loc[last_clk]
    clickouts = clickouts[['user_id', 'session_id', 'current_filters', 'impressions']]
    # split the filters and the impressions
    clickouts['filters_list'] = clickouts['current_filters'].str.lower().str.split('|').fillna('')
    clickouts['impress_list'] = clickouts['impressions'].str.split('|')
    clickouts = clickouts.drop(['impressions', 'current_filters'], axis=1)
    # cast the impressions to int
    clickouts['impress_list'] = clickouts['impress_list'].apply(lambda x: list(map(int, x)))
    # create the binarizer with the same classes as the accomodations one-hot
    mlb = MultiLabelBinarizer(classes=accom_df.columns.str.lower())

    # iterate over the clickouts and one-hot the filters
    print('Total interactions:', clickouts.shape[0])
    satisfaction_percentage = []
    for idx in tqdm(clickouts.index):
        filters = clickouts.at[idx, 'filters_list']
        impressions = clickouts.at[idx, 'impress_list']
        # do not consider the change-of-sort filters
        filters = set(filters).difference(change_sort_filters)
        # fix some wrong filter names
        if 'gay friendly' in filters:
            filters.remove('gay friendly')
            filters.add('gay-friendly')
        if 'internet (rooms)' in filters:
            filters.remove('internet (rooms)')
            filters.add('free wifi (rooms)')
        filters = list(filters)
        filters_len = len(filters)
        if filters_len > 0:
            # one-hot the filters
            filters_one_hot = mlb.fit_transform([filters])[0]
            # take the one-hot of the impressions tags
            impressions_features_one_hot = accom_df.loc[impressions].values
            impression_satisfaction = list(np.sum(np.bitwise_and(filters_one_hot, impressions_features_one_hot),
                                                  axis=1) / filters_len)
            satisfaction_percentage.append(impression_satisfaction)
        else:
            # there are only change-of-sort filters
            satisfaction_percentage.append(list(np.ones(len(impressions))))

    clickouts['satisfaction_percentage'] = satisfaction_percentage
    clickouts = clickouts.drop(['filters_list'], axis=1)

    # explode the per-clickout lists into one row per (clickout, impression) pair
    expanded = pd.DataFrame({col: np.repeat(clickouts[col].values, clickouts.satisfaction_percentage.str.len())
                             for col in clickouts.columns.drop(['impress_list', 'satisfaction_percentage'])}) \
        .assign(**{'item_id': np.concatenate(clickouts.impress_list.values),
                   'satisfaction_percentage': np.concatenate(clickouts.satisfaction_percentage.values)})
    return expanded
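# Minimal worked example (made-up vectors) of the satisfaction percentage computed by the two
# extract_feature variants above: the fraction of the active filters that each impression satisfies.
def _example_satisfaction_percentage():
    filters_one_hot = np.array([1, 0, 1, 0])         # 2 active filters
    impressions_one_hot = np.array([[1, 1, 1, 0],    # satisfies both filters -> 1.0
                                    [1, 0, 0, 1]])   # satisfies one filter   -> 0.5
    filters_len = filters_one_hot.sum()
    return np.sum(np.bitwise_and(filters_one_hot, impressions_one_hot), axis=1) / filters_len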
def create_dataset_for_regression(mode, cluster, pad_sessions_length, add_item_features=True, save_X_Y=True):
    """
    pad_sessions_length (int): final length of sessions after padding/truncating
    add_item_features (bool): whether to add the one-hot accomodations features to the training data
    save_X_Y (bool): whether to save the train data into 2 separate files (X_train, Y_train) or in a unique file (train_vec)
    """
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)

    path = f'dataset/preprocessed/{cluster}/{mode}/dataset_regression'
    check_folder(path)

    devices_classes = ['mobile', 'desktop', 'tablet']
    actions_classes = ['show_impression', 'clickout item', 'interaction item rating', 'interaction item info',
                       'interaction item image', 'interaction item deals', 'change of sort order',
                       'filter selection', 'search for item', 'search for destination', 'search for poi']

    ## ======== TRAIN ======== ##
    # add the impressions as new interactions
    print('Adding impressions as new actions...')
    train_df, final_new_index = sess2vec.add_impressions_as_new_actions(train_df)
    print('Done!\n')

    # pad the sessions
    if pad_sessions_length > 0:
        print('Padding/truncating sessions...')
        train_df = sess2vec.pad_sessions(train_df, max_session_length=pad_sessions_length)
        print('Done!\n')

    if save_X_Y:
        print('Getting the last clickout of each session...')
        train_clickouts_df = sess2vec.get_last_clickout(train_df, index_name='index', rename_index='orig_index')
        train_clickouts_indices = train_clickouts_df.orig_index.values
        train_clickouts_indices.sort()
        print('Done!\n')

    # add the one-hot of the device
    print('Adding one-hot columns of device...', end=' ', flush=True)
    train_df = sess2vec.one_hot_df_column(train_df, 'device', classes=devices_classes)
    print('Done!\n')

    # add the one-hot of the action-type
    print('Adding one-hot columns of action_type...', end=' ', flush=True)
    train_df = sess2vec.one_hot_df_column(train_df, 'action_type', classes=actions_classes)
    print('Done!\n')

    TRAIN_LEN = train_df.shape[0]
    TRAIN_NAME = ''

    if save_X_Y:
        # set the columns to be placed in the labels file
        Y_COLUMNS = ['user_id', 'session_id', 'timestamp', 'step', 'reference']

        # join the accomodations one-hot features
        X_train_path = os.path.join(path, 'X_train.csv')
        if add_item_features:
            print('Joining the accomodations features...')
            sess2vec.add_accomodations_features(train_df.copy(), X_train_path, logic='skip',
                                                row_indices=train_clickouts_indices)
        else:
            # set the last clickouts to NaN and save the X dataframe
            backup_ref_serie = train_df.reference.values.copy()
            train_df.loc[train_clickouts_indices, 'reference'] = np.nan
            train_df.to_csv(X_train_path, index_label='orig_index', float_format='%.4f')
            train_df.reference = backup_ref_serie
            del backup_ref_serie

        Y_train_path = os.path.join(path, 'Y_train.csv')
        train_df = train_df[Y_COLUMNS]
        if add_item_features:
            sess2vec.add_accomodations_features(train_df.copy(), Y_train_path, logic='subset',
                                                row_indices=train_clickouts_indices)
        else:
            # set all clickouts to NaN except for the last clickouts and save the Y dataframe
            backup_ref_serie = train_df.loc[train_clickouts_indices].reference.copy()
            train_df.reference = np.nan
            train_df.loc[train_clickouts_indices, 'reference'] = backup_ref_serie
            train_df.to_csv(Y_train_path, index_label='orig_index', float_format='%.4f')

        # clean ram
        del train_clickouts_df
        del train_clickouts_indices
    else:
        TRAIN_NAME = 'train_vec.csv'
        train_path = os.path.join(path, TRAIN_NAME)
        train_df.to_csv(train_path, index_label='orig_index', float_format='%.4f')

    del train_df

    ## ======== TEST ======== ##
    print('Adding impressions as new actions...')
    test_df, _ = sess2vec.add_impressions_as_new_actions(test_df, final_new_index)
    print('Done!\n')

    # pad the sessions
    if pad_sessions_length > 0:
        print('Padding/truncating sessions...')
        test_df = sess2vec.pad_sessions(test_df, max_session_length=pad_sessions_length)
        print('Done!\n')

    print('Getting the last clickout of each session...')
    test_clickouts_df = sess2vec.get_last_clickout(test_df, index_name='index', rename_index='orig_index')
    test_clickouts_indices = test_clickouts_df.orig_index.values
    test_clickouts_indices.sort()
    print('Done!\n')

    # add the one-hot of the device
    print('Adding one-hot columns of device...', end=' ', flush=True)
    test_df = sess2vec.one_hot_df_column(test_df, 'device', classes=devices_classes)
    print('Done!\n')

    # add the one-hot of the action-type
    print('Adding one-hot columns of action_type...', end=' ', flush=True)
    test_df = sess2vec.one_hot_df_column(test_df, 'action_type', classes=actions_classes)
    print('Done!\n')

    TEST_LEN = test_df.shape[0]

    # join the accomodations one-hot features
    X_test_path = os.path.join(path, 'X_test.csv')
    if add_item_features:
        print('Joining the accomodations features...')
        sess2vec.add_accomodations_features(test_df.copy(), X_test_path, logic='skip',
                                            row_indices=test_clickouts_indices)
    else:
        # set the last clickouts to NaN and save the X dataframe
        backup_ref_serie = test_df.reference.values.copy()
        test_df.loc[test_clickouts_indices, 'reference'] = np.nan
        test_df.to_csv(X_test_path, index_label='orig_index', float_format='%.4f')
        #test_df.reference = backup_ref_serie
        del backup_ref_serie

    ## ======== CONFIG ======== ##
    # save the dataset config file that stores the dataset length and the list of sparse columns
    features_cols = list(data.accomodations_one_hot().columns) if add_item_features else []
    x_sparse_cols = devices_classes + actions_classes + features_cols
    datasetconfig.save_config(path, mode, cluster, TRAIN_LEN, TEST_LEN, train_name=TRAIN_NAME,
                              rows_per_sample=pad_sessions_length,
                              X_sparse_cols=x_sparse_cols, Y_sparse_cols=features_cols)
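# Usage sketch (the mode/cluster strings are assumptions, not taken from the repository):
# build the padded regression dataset with the accomodations features joined in and the
# train data split into X_train.csv / Y_train.csv.
def _example_build_regression_dataset():
    create_dataset_for_regression('local', 'no_cluster', pad_sessions_length=70,
                                  add_item_features=True, save_X_Y=True)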