def _create_csvs():
    print('creating CSVs...')

    # create no_cluster/full
    path = 'dataset/preprocessed/no_cluster'
    full = data.full_df()
    train_len = data.read_config()[data.TRAIN_LEN_KEY]

    train = full.iloc[0:train_len]
    test = full.iloc[train_len:len(full)]
    target_indices = get_target_indices(test)

    check_folder('dataset/preprocessed/no_cluster/full')
    train.to_csv(os.path.join(path, 'full/train.csv'))
    test.to_csv(os.path.join(path, 'full/test.csv'))
    np.save(os.path.join(path, 'full/train_indices'), train.index)
    np.save(os.path.join(path, 'full/test_indices'), test.index)
    np.save(os.path.join(path, 'full/target_indices'), target_indices)

    no_of_rows_in_small = int(input('How many rows do you want in small.csv? '))
    train_small = get_small_dataset(train, maximum_rows=no_of_rows_in_small)
    check_folder('dataset/preprocessed/no_cluster/small')
    split(train_small, os.path.join(path, 'small'))

    check_folder('dataset/preprocessed/no_cluster/local')
    split(train, os.path.join(path, 'local'))

    # create item_metadata in the preprocessed folder
    original_item_metadata = data.accomodations_original_df()
    original_item_metadata.to_csv(data.ITEMS_PATH)

    # append the missing accommodations to the item metadata
    append_missing_accomodations('full')
def score_fn(model, X, y):
    global i
    print(i)
    i += 1
    t0 = time.time()
    target_indices = data.target_indices('local')
    full_impressions = data.full_df()
    scores = list(model.xg.predict(X))
    final_predictions = []
    count = 0
    for index in tqdm(target_indices):
        impressions = list(map(int, full_impressions.loc[index]['impressions'].split('|')))
        predictions = scores[count:count + len(impressions)]
        couples = list(zip(predictions, impressions))
        couples.sort(key=lambda x: x[0], reverse=True)
        _, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        count += len(impressions)
    mrr = model.compute_MRR(final_predictions)
    print('Done in', time.time() - t0)
    print()
    return mrr
def extract_feature(self):
    tqdm.pandas()

    df = data.full_df()

    # find the clickout interactions
    res_df = df[['user_id', 'session_id', 'prices']]
    res_df = res_df[df.action_type == 'clickout item']

    # expand the prices into a vector
    expanded_prices = res_df.prices.str.split('|', expand=True).fillna(0).astype('int')

    # log-scale and min-max normalize
    log_prices = np.log(expanded_prices + 1)
    max_price = max(np.max(log_prices))
    min_price = min(np.min(log_prices))
    log_prices = (log_prices - min_price) / (max_price - min_price)

    # add the prices to the resulting df
    for i in range(25):
        res_df['price_{}'.format(i)] = log_prices.loc[:, i]

    return res_df.drop(['user_id', 'session_id', 'prices'], axis=1)
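
# A minimal, self-contained sketch (not repo code) of the log + min-max scaling
# applied above, on a synthetic '|'-separated prices column. All names here are
# illustrative.
import numpy as np
import pandas as pd

_demo = pd.DataFrame({'prices': ['100|250|80', '60|900']})
_expanded = _demo.prices.str.split('|', expand=True).fillna(0).astype('int')
_log = np.log(_expanded + 1)
_scaled = (_log - _log.values.min()) / (_log.values.max() - _log.values.min())
print(_scaled)  # all values in [0, 1]; 0 for the padded missing price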
def extract_feature(self):
    tqdm.pandas()

    df = data.full_df()
    df = df.sort_values(['user_id', 'session_id', 'timestamp', 'step']).reset_index()

    # find the last clickout rows
    last_clickout_idxs = find_last_clickout_indices(df)
    clickout_rows = df.loc[last_clickout_idxs, ['user_id', 'session_id', 'impressions', 'index']]
    clickout_rows['impressions_count'] = clickout_rows.impressions.str.split('|').str.len()
    clickout_rows = clickout_rows.drop('impressions', axis=1)

    # multi-hot the counts
    one_hot_counts = np.zeros((clickout_rows.shape[0], 25), dtype=np.int8)
    for i, c in tqdm(enumerate(clickout_rows.impressions_count.values)):
        one_hot_counts[i, 0:c] = 1

    # add to the clickouts
    for i in range(25):
        clickout_rows['impr_c{}'.format(i)] = one_hot_counts[:, i]

    return clickout_rows.drop('impressions_count', axis=1).set_index('index')
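
# Quick illustrative check (not repo code) of the multi-hot encoding used above:
# a count c becomes a row with the first c positions set to 1.
import numpy as np

_counts = np.array([2, 5, 1])
_multi_hot = np.zeros((len(_counts), 25), dtype=np.int8)
for _i, _c in enumerate(_counts):
    _multi_hot[_i, 0:_c] = 1
print(_multi_hot[:, :6])
# [[1 1 0 0 0 0]
#  [1 1 1 1 1 0]
#  [1 0 0 0 0 0]]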
def get_scores_cv(self, x, groups, test_indices):
    """ Return the scores for a fold """
    x_val = x[test_indices]
    indices = x_val[:, :, 0][:, self.dataset.rows_per_sample - 1]
    predictions = self.model.predict(x_val[:, :, 1:])

    # take the target rows
    res_df = data.full_df()[['user_id', 'session_id', 'impressions']].loc[indices].copy()
    res_df['impressions'] = res_df['impressions'].str.split('|')

    # add the scores as a new column
    res_df['scores'] = list(predictions)

    # trim the scores to the real number of impressions
    # (otherwise all rows have the fixed number of scores, i.e. 25)
    res_df['length'] = res_df['impressions'].str.len()
    res_df['scores'] = res_df.apply(lambda x: x['scores'][:x['length']], axis=1)
    res_df.drop('length', axis=1, inplace=True)

    # expand the df to have a row for each item_id - score pair
    res_df = pd.DataFrame({
        col: np.repeat(res_df[col], res_df['scores'].str.len())
        for col in res_df.columns.drop(['impressions', 'scores'])
    }).assign(**{
        'item_id': np.concatenate(res_df['impressions'].values),
        'score': np.concatenate(res_df['scores'].values),
    })
    return res_df
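
# Self-contained sketch (illustrative only) of the expansion pattern above:
# one input row holding lists of impressions/scores becomes one output row per
# (item_id, score) pair, via np.repeat on the scalar columns.
import numpy as np
import pandas as pd

_df = pd.DataFrame({'session_id': ['s1', 's2'],
                    'impressions': [['a', 'b'], ['c']],
                    'scores': [[0.9, 0.1], [0.5]]})
_expanded = pd.DataFrame({
    'session_id': np.repeat(_df['session_id'], _df['scores'].str.len())
}).assign(item_id=np.concatenate(_df['impressions'].values),
          score=np.concatenate(_df['scores'].values))
print(_expanded)  # 3 rows: (s1, a, 0.9), (s1, b, 0.1), (s2, c, 0.5)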
def create_feature(self):
    # load the dataset and the indices
    train, train_indices = self.dataset.load_Xtrain(return_indices=True)
    test, test_indices = self.dataset.load_Xtest()

    # make the predictions
    train_test = np.concatenate([train, test])
    del train
    del test
    predictions = self.model.predict(train_test).flatten()

    # build the feature df
    concat_indices = np.concatenate([train_indices, test_indices])
    del train_indices
    del test_indices
    users_sessions = data.full_df().loc[concat_indices]
    feature_df = pd.DataFrame({'user_id': users_sessions['user_id'],
                               'session_id': users_sessions['session_id'],
                               'rnn_binary_preds': predictions},
                              index=concat_indices)

    path = 'dataset/preprocessed/no_cluster/{}/feature/rnn_binary_preds/features.csv'.format(self.mode)
    check_folder(path)
    feature_df.to_csv(path)
    return feature_df
def get_r_hat(self):
    """
    Return the r_hat matrix as R^ = R•S or R^ = S•R,
    depending on the matrix multiplication order.
    """
    R = self.urm
    full_df = data.full_df()
    target_indices_urm = []
    for ind in self.target_indices:
        if self.type == 'user':
            target_indices_urm.append(self.dict_row[full_df.loc[ind]['user_id']])
        elif self.type == 'session':
            target_indices_urm.append(
                self.dict_row[tuple(full_df.loc[ind][['user_id', 'session_id']])])
    if self._matrix_mul_order == 'inverse':
        return self._sim_matrix.tocsr()[target_indices_urm].dot(R)
    else:
        return R[target_indices_urm].dot(self._sim_matrix)
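
# Tiny self-contained sketch (not repo code) of the two multiplication orders:
# with an item-item similarity S, the estimated ratings for a subset of rows
# are R[rows] @ S; with a user-user similarity ('inverse' order), S[rows] @ R.
import numpy as np
import scipy.sparse as sps

_R = sps.csr_matrix(np.array([[1, 0, 2],
                              [0, 3, 0]]))   # 2 users/sessions x 3 items
_S = sps.csr_matrix(np.eye(3))               # identity item-item similarity
print(_R[[0]].dot(_S).toarray())             # [[1. 0. 2.]]: scores for row 0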
def recommend_batch(self):
    # load the full df
    print('loading df_full')
    full_df = data.full_df()
    icm = data.icm().tocsr()  # sim.normalization.bm25(data.icm().tocsr(), axis=1)

    predictions_batch = []
    self.scores_batch = []
    count = 0
    predicted_count = 0
    skipped_count = 0
    for index in tqdm(self.target_indices):
        # get the impressions of the clickout to predict
        impr = list(map(int, full_df.loc[index]['impressions'].split('|')))

        # get the row indices of the icm to predict
        icm_rows = []
        for i in impr:
            icm_rows.append(self.dict_col[i])

        # sort the impressions by their icm row index
        temp = list(zip(impr, icm_rows))
        temp.sort(key=lambda tup: tup[1])
        list_impr_icmrows = list(zip(*temp))
        impr_sorted = list(list_impr_icmrows[0])
        icm_rows = list(list_impr_icmrows[1])

        icm_filtered = icm[icm_rows]
        r_hat_row = self.user_features_matrix[count] * icm_filtered.T

        # sort the impressions by score, descending
        l = list(zip(impr_sorted, r_hat_row.todense().tolist()[0]))
        l.sort(key=lambda tup: tup[1], reverse=True)
        count += 1

        if l[0][1] == 0:
            # no score available: skip this prediction
            skipped_count += 1
            self.scores_batch.append((index, [], []))
            continue
        else:
            predicted_count += 1
            p = [e[0] for e in l]
            print(f'impr: {impr}\n rec: {p}')
            predictions_batch.append((index, p))
            scores = [e[1] for e in l]
            self.scores_batch.append((index, p, scores))
            print(scores)

    print(f'predicted percentage: {predicted_count / len(self.target_indices)}\n'
          f'skipped percentage: {skipped_count / len(self.target_indices)}')
    print('prediction created !!!')
    return predictions_batch
def fit(self):
    urm = data.urm(self.mode, self.cluster, self.type, self.urm_name)
    icm = data.icm().tocsr()

    # compute the target indices of the urm
    full_df = data.full_df()
    target_indices_urm = []
    if self.type == 'user':
        for ind in self.target_indices:
            target_indices_urm.append(self.dict_row[full_df.loc[ind]['user_id']])
    elif self.type == 'session':
        for ind in self.target_indices:
            target_indices_urm.append(self.dict_row[tuple(
                full_df.loc[ind][['user_id', 'session_id']])])

    self.user_features_matrix = sps.normalize(urm[target_indices_urm] * icm,
                                              norm='l2', axis=0)
def _reinsert_clickout(df):
    # take the rows of the missing clickout
    clickout_rows_df = df[(df['action_type'] == 'clickout item') & df['reference'].isnull()]
    # check if one exists
    if len(clickout_rows_df) > 0:
        # retrieve the missing clickout from the full_df
        missing_click = data.full_df().loc[clickout_rows_df.index[0]]['reference']
        # reinsert the clickout into the df
        df.at[clickout_rows_df.index[0], 'reference'] = missing_click
    return df
def get_scores_cv(self, x, groups, test_indices):
    """ Return the scores for a fold """
    x_val = x[test_indices]
    indices = x_val[:, :, 0][:, self.dataset.rows_per_sample - 1]
    predictions = self.model.predict(x_val[:, :, 1:])

    # take the target rows
    res_df = data.full_df()[['user_id', 'session_id']].loc[indices].copy()

    # add the scores as a new column
    res_df['scores'] = predictions
    return res_df
def extract_feature(self):
    tqdm.pandas()

    df = data.full_df()
    df = df.sort_index()

    # find the clickout rows
    clickout_rows = df[['user_id', 'session_id', 'action_type', 'impressions']][
        df.action_type == 'clickout item']
    clickout_rows['impressions_count'] = clickout_rows.impressions.str.split('|').str.len()

    # prepare the resulting dataframe
    res_df = df[['user_id', 'session_id']].copy()
    res_df['impressions_count'] = 0

    # iterate over the sorted rows and the clickout_rows
    j = 0
    clickout_indices = clickout_rows.index.values
    ck_idx = clickout_indices[0]
    next_clickout_user_id = clickout_rows.at[ck_idx, 'user_id']
    next_clickout_sess_id = clickout_rows.at[ck_idx, 'session_id']
    for idx, row in tqdm(res_df.iterrows()):
        # if the current index is past the last clickout, break
        if idx > clickout_indices[-1]:
            break
        # find the next clickout index
        while idx > clickout_indices[j]:
            j += 1
            ck_idx = clickout_indices[j]
            next_clickout_user_id = clickout_rows.at[ck_idx, 'user_id']
            next_clickout_sess_id = clickout_rows.at[ck_idx, 'session_id']

        # check if the row and the next clickout are in the same session
        if row.user_id == next_clickout_user_id and row.session_id == next_clickout_sess_id:
            res_df.at[idx, 'impressions_count'] = clickout_rows.at[ck_idx, 'impressions_count']

    # create the 25 one-hot categories
    one_hot_counts = np.zeros((res_df.shape[0], 25), dtype=np.int8)
    for i, c in enumerate(res_df.impressions_count.values):
        one_hot_counts[i, 0:c] = 1
    for i in range(25):
        res_df['impr_c{}'.format(i)] = one_hot_counts[:, i]

    return res_df.drop(['user_id', 'session_id', 'impressions_count'], axis=1)
def get_y_true(clickout_indices):
    df = data.full_df().loc[clickout_indices]

    def add_label(row):
        impress = list(map(int, row['impressions'].split('|')))
        ref = row['reference']
        if ref in impress:
            # the label is 1 only when the clicked reference is the first impression
            return 1 if impress[0] == ref else 0
        else:
            return 0

    df = df.astype({'reference': 'int'})
    df['label'] = df.progress_apply(add_label, axis=1)
    return df['label']
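
# Worked micro-example (illustrative, not repo code) of the labeling rule above.
_impressions = [8, 3, 5]
for _ref in (8, 5, 99):
    print(1 if (_ref in _impressions and _impressions[0] == _ref) else 0)
# 1 (clicked the first impression), 0 (clicked but not first), 0 (not in the list)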
def get_scores_batch(self):
    final_predictions = []
    full_df = data.full_df()
    count = 0
    for index in tqdm(self.target_indices):
        impr = list(map(int, full_df.loc[index]['impressions'].split('|')))
        pred = self.predictions[count][0:len(impr)]
        couples = list(zip(pred, impr))
        # sort by score, descending
        couples.sort(key=lambda x: x[0], reverse=True)
        scores, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr), list(scores)))
        count += 1
    return final_predictions
def extract_feature(self):
    tqdm.pandas()

    df = data.full_df()

    # count the popularity
    pop_df = df[(df.action_type == 'clickout item') & (df.reference.str.isnumeric() == True)] \
        [['reference', 'frequence']].astype('int').groupby('reference').sum()
    cnt = pop_df.to_dict()['frequence']

    # find the clickout rows
    clickout_rows = df[df.action_type == 'clickout item'][['reference', 'impressions']]
    clickout_rows = clickout_rows.fillna(-1).astype({'reference': 'int'})
    clickout_rows['impressions'] = clickout_rows.apply(
        lambda x: list(map(int, x.impressions.split('|'))), axis=1)

    # build the resulting matrix
    matrix = np.zeros((clickout_rows.shape[0], 25), dtype=int)
    i = 0
    for impr in tqdm(clickout_rows.impressions):
        for j, item in enumerate(impr):
            # decrease all popularities by 1 to exclude the current clickout
            popularity = cnt[item] - 1 if item in cnt else 0
            matrix[i, j] = popularity
        i += 1

    # log-scale and min-max normalize
    min_pop = np.log((pop_df['frequence'] - 1).clip(0).min() + 1)
    max_pop = np.log((pop_df['frequence'] - 1).clip(0).max() + 1)
    matrix = (np.log(matrix + 1) - min_pop) / (max_pop - min_pop)

    # add the columns to the resulting dataframe
    for i in range(25):
        clickout_rows['impr_pop{}'.format(i)] = matrix[:, i]

    return clickout_rows.drop(['reference', 'impressions'], axis=1)
def compute_MRR(self, predictions):
    """
    Compute the MRR only on the sessions where the clickout
    is not on the first impression.

    :param predictions: list of tuples (target_index, ordered recommendations)
    :return: the MRR score
    """
    if self.mode == 'full':
        train_df = data.full_df()
    else:
        train_df = data.train_df('full')

    target_indices, recs = zip(*predictions)
    target_indices = list(target_indices)
    correct_clickouts = train_df.loc[target_indices].reference.values
    impressions = train_df.loc[target_indices].impressions.values
    len_rec = len(recs)
    count = 0

    RR = 0
    print('Calculating MRR (hoping for a 0.99)')
    for i in tqdm(range(len_rec)):
        if correct_clickouts[i] not in impressions[i].split('|'):
            print(f'Reference {correct_clickouts[i]} not in impressions')
            continue
        if impressions[i].split('|').index(correct_clickouts[i]) != 0 or not self.class_weights:
            correct_clickout = int(correct_clickouts[i])
            if correct_clickout in recs[i]:
                rank_pos = recs[i].index(correct_clickout) + 1
                if rank_pos <= 25:
                    RR += 1 / rank_pos
            count += 1
        else:
            print('skipping because:')
            print(impressions[i])
            print(correct_clickouts[i])

    MRR = RR / count
    print(f'MRR: {MRR}')
    return MRR
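
# Worked example (illustrative) of the reciprocal-rank accounting above: the
# clicked item at rank 1 contributes 1, at rank 2 contributes 1/2, so over two
# sessions MRR = (1 + 0.5) / 2 = 0.75.
_recs = [[11, 22, 33], [44, 55]]
_clicked = [11, 55]
_rr = [1 / (r.index(c) + 1) for r, c in zip(_recs, _clicked)]
print(sum(_rr) / len(_rr))  # 0.75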
def recommend_batch(self, target_indices):
    X, indices = self.dataset.load_Xtest()

    # predict the references
    predictions = self.model.predict(X)

    # flatten the predictions and the indices to be 2-dimensional
    predictions = predictions.reshape((-1, predictions.shape[-1]))
    indices = indices.flatten()

    # take only the target predictions
    pred_df = pd.DataFrame(predictions)
    pred_df['orig_index'] = indices
    pred_df = pred_df.set_index('orig_index')
    predictions = pred_df.loc[target_indices].sort_index().values
    del pred_df
    assert len(predictions) == len(target_indices)

    full_df = data.full_df()
    accomodations1hot_df = data.accomodations_one_hot()

    result_predictions = []
    for k, index in tqdm(enumerate(target_indices)):
        # get the impressions of the clickout to predict
        impr = list(map(int, full_df.loc[index]['impressions'].split('|')))
        # get the true labels from the accommodations one-hot
        true_labels = accomodations1hot_df.loc[impr].values
        # build a list of (impression, L2-norm distance)
        prediction_impressions_distances = [
            (impr[j], L2Norm(true_labels[j] - predictions[k])) for j in range(len(impr))
        ]
        # order the list by the L2 norm (a smaller distance is better)
        prediction_impressions_distances.sort(key=lambda tup: tup[1])
        # keep only the impression ids
        ordered_impressions = list(map(lambda x: x[0], prediction_impressions_distances))
        # append the pair (index, reranked impressions)
        result_predictions.append((index, ordered_impressions))

    print('prediction created !!!')
    return result_predictions
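
# Minimal sketch (not repo code) of the re-ranking idea above: impressions are
# ordered by the L2 distance between their one-hot property vectors and the
# predicted vector; a smaller distance ranks higher. Names are illustrative.
import numpy as np

_pred = np.array([1.0, 0.0, 0.8])
_item_vectors = {101: np.array([1, 0, 1]), 102: np.array([0, 1, 0])}
_ranked = sorted(_item_vectors,
                 key=lambda i: np.linalg.norm(_item_vectors[i] - _pred))
print(_ranked)  # [101, 102]: item 101 is closest to the prediction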
def __init__(self, mode, cluster, urm_name, factors=100, regularization=0.01,
             iterations=10, alpha=25):
    os.environ['MKL_NUM_THREADS'] = '1'
    name = 'ALS urm_name: {}\n factors: {}\n regularization: {}\n ' \
           'iterations: {}\n alpha: {}'.format(urm_name, factors, regularization,
                                               iterations, alpha)
    super(AlternatingLeastSquare, self).__init__(mode, cluster, name)

    self.factors = int(factors)
    self.regularization = regularization
    self.iterations = int(iterations)
    self.alpha = int(alpha)

    self.target_indices = data.target_indices(mode, cluster)
    self.dict_row = data.dictionary_row(mode, cluster)
    self.target_indices_urm = []
    full_df = data.full_df()
    for ind in self.target_indices:
        self.target_indices_urm.append(self.dict_row[tuple(
            full_df.loc[ind][['session_id', 'user_id']])])

    self.urm = data.urm(mode=mode, cluster=cluster, urm_name=urm_name)
    self.user_vecs = None
    self.item_vecs = None
    self._model = None

    self.fixed_params_dict = {
        'mode': mode,
        'urm_name': urm_name,
        'cluster': cluster
    }
    self.hyperparameters_dict = {
        'factors': (50, 200),
        'regularization': (0, 1),
        'iterations': (1, 250),
        'alpha': (15, 45)
    }
def _merge_sessions():
    print('Merging similar sessions (same user_id and city)')
    print('Loading full_df')
    full_df = data.full_df()
    print('Sorting, grouping, and other awesome things')
    grouped = full_df.sort_values(['user_id', 'timestamp'],
                                  ascending=[True, True]).groupby(['user_id', 'city'])
    new_col = np.array(['' for _ in range(len(full_df))], dtype=object)
    print("Now I'm really merging...")
    for name, g in tqdm(grouped):
        # assign the session_id of the earliest interaction to the whole group
        s_id = g.iloc[0]['session_id']
        new_col[g.index.values] = s_id
    print('Writing on the df')
    full_df['unified_session_id'] = pd.Series(new_col)
    print('Saving the new df to file')
    with open(data.FULL_PATH, 'w', encoding='utf-8') as f:
        full_df.to_csv(f)
    data.refresh_full_df()
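
# Minimal sketch (illustrative, not repo code) of the unification above: every
# interaction of a (user_id, city) group gets the session_id of the group's
# earliest row.
import pandas as pd

_df = pd.DataFrame({'user_id': ['u1', 'u1', 'u1'],
                    'city': ['Rome', 'Rome', 'Paris'],
                    'timestamp': [2, 1, 3],
                    'session_id': ['s2', 's1', 's3']})
_first = _df.sort_values('timestamp').groupby(['user_id', 'city'])['session_id'] \
            .transform('first')
print(_first.sort_index().tolist())  # ['s1', 's1', 's3']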
def extract_feature(self):
    df = data.full_df()

    # find the clickout rows
    clickout_rows = df[df.prices.notnull()][['impressions', 'prices']]

    # cast the impressions and the prices to lists
    clickout_rows['impressions'] = clickout_rows.impressions.str.split('|')
    clickout_rows['prices'] = clickout_rows.prices.str.split('|')
    clickout_rows = pd.DataFrame({col: np.concatenate(clickout_rows[col].values)
                                  for col in clickout_rows.columns}).astype('int')

    # compute the mean and the standard deviation
    res_df = clickout_rows.groupby('impressions').agg(['mean', 'std']).reset_index()
    res_df.columns = ['_'.join(x) for x in res_df.columns.ravel()]
    res_df = res_df.rename(columns={'impressions_': 'item_id'})
    res_df['prices_std'] = res_df['prices_std'].fillna(0)
    return res_df
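
# Self-contained sketch (illustrative) of the mean/std aggregation and the
# multi-index column flattening used above.
import pandas as pd

_df = pd.DataFrame({'impressions': [1, 1, 2], 'prices': [100, 120, 80]})
_stats = _df.groupby('impressions').agg(['mean', 'std']).reset_index()
_stats.columns = ['_'.join(x) for x in _stats.columns.ravel()]
_stats = _stats.rename(columns={'impressions_': 'item_id'})
print(_stats)  # columns: item_id, prices_mean, prices_std (NaN for single rows)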
def extract_feature(self):
    tqdm.pandas()

    df = data.full_df()
    df = df.sort_index()

    # find the clickout rows
    clickout_rows = df[['user_id', 'session_id', 'action_type', 'impressions', 'prices']][
        df.action_type == 'clickout item']
    # cast the impressions and the prices to lists
    clickout_rows['impression_list'] = clickout_rows.impressions.str.split('|')
    clickout_rows['price_list'] = clickout_rows.prices.str.split('|').apply(
        lambda x: list(map(int, x)))
    # order the prices
    clickout_rows['sorted_price_list'] = clickout_rows.price_list.apply(lambda x: sorted(x))
    clickout_rows = clickout_rows.drop('prices', axis=1)

    # find the interactions with a numeric reference
    reference_rows = df[['user_id', 'session_id', 'reference', 'action_type']]
    reference_rows = reference_rows[df.reference.str.isnumeric() &
                                    (df.action_type != 'clickout item')]
    reference_rows = reference_rows.drop('action_type', axis=1)
    reference_rows['price_pos'] = -1
    reference_rows = reference_rows.sort_index()

    # iterate over the sorted reference_rows and clickout_rows
    j = 0
    clickout_indices = clickout_rows.index.values
    for idx, row in tqdm(reference_rows.iterrows()):
        # if the current index is past the last clickout, break
        if idx >= clickout_indices[-1]:
            break
        # find the next clickout index
        while idx > clickout_indices[j]:
            j += 1
        next_clickout = clickout_rows.loc[clickout_indices[j]]

        # check if the row and the next clickout are in the same session
        if row.user_id == next_clickout.user_id and row.session_id == next_clickout.session_id:
            try:
                ref_idx = next_clickout.impression_list.index(row.reference)
                ref_price = int(next_clickout.price_list[ref_idx])
                reference_rows.at[idx, 'price_pos'] = \
                    next_clickout.sorted_price_list.index(ref_price)
            except (ValueError, IndexError):
                # the reference may not appear in the impressions list
                pass

    return reference_rows.drop('reference', axis=1)
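
# Worked micro-example (illustrative) of the price position computed above:
# the interacted item's price is looked up in the sorted price list.
_price_list = [120, 80, 95]
_sorted_prices = sorted(_price_list)      # [80, 95, 120]
_ref_price = 95                           # price of the interacted item
print(_sorted_prices.index(_ref_price))   # 1: it is the second-cheapest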
def __init__(self, mode='local', learning_rate=0.3, min_child_weight=1, n_estimators=300,
             max_depth=3, subsample=1, colsample_bytree=1, reg_lambda=1.0, reg_alpha=0):
    name = 'gbdt_hybrid'
    cluster = 'no_cluster'
    super(Gbdt_Hybrid, self).__init__(mode, cluster, name)

    self.current_directory = Path(__file__).absolute().parent
    self.data_directory = self.current_directory.joinpath('..', '..', 'submissions/hybrid')
    #self.gt_csv = self.data_directory.joinpath('ground_truth.csv')
    self.mode = mode
    self.full = data.full_df()

    self.local_target_indices = data.target_indices(mode='local', cluster='no_cluster')
    self.full_target_indices = data.target_indices(mode='full', cluster='no_cluster')

    directory = self.data_directory.joinpath('local')
    full_dir = self.data_directory.joinpath('full')

    self.xg = xgb.XGBRanker(learning_rate=learning_rate,
                            min_child_weight=min_child_weight,
                            max_depth=math.ceil(max_depth),
                            n_estimators=math.ceil(n_estimators),
                            subsample=subsample,
                            colsample_bytree=colsample_bytree,
                            reg_lambda=reg_lambda,
                            reg_alpha=reg_alpha,
                            n_jobs=-1,
                            objective='rank:ndcg')
    self.cv_path = self.data_directory.joinpath('cross_validation')
def extract_feature(self):
    df = data.full_df()

    # count the numeric references (skipping NaNs in the test set)
    res_df = df[(df.action_type == 'clickout item') & (df.reference.str.isnumeric() == True)]
    res_df = res_df[['reference', 'frequence']].astype('int').groupby('reference').sum()
    res_df['frequence'] -= 1
    res_df = res_df[res_df['frequence'] > 0]

    # log-scale and min-max normalize
    min_pop = np.log(res_df['frequence'].values.min() + 1)
    max_pop = np.log(res_df['frequence'].values.max() + 1)
    res_df['frequence'] = (np.log(res_df['frequence'].values + 1) - min_pop) / (max_pop - min_pop)

    res_df = res_df.reset_index()
    return res_df.rename(columns={'reference': 'item_id',
                                  'frequence': 'glob_clickout_popularity'})
def recommend_batch(self, target_indices):
    X, indices = self.dataset.load_Xtest()

    # predict the references
    predictions = self.model.predict(X)

    # take only the last index for each session (target row) and flatten
    #predictions = predictions.reshape((-1, predictions.shape[-1]))
    #indices = indices[:,-1].flatten()

    # take only the target predictions
    pred_df = pd.DataFrame(predictions)
    pred_df['orig_index'] = indices
    pred_df = pred_df.set_index('orig_index')
    predictions = pred_df.loc[target_indices]
    del pred_df
    assert len(predictions) == len(target_indices)

    full_df = data.full_df()

    result_predictions = []
    for index in tqdm(target_indices):
        # get the impressions of the clickout to predict
        impr = list(map(int, full_df.loc[index]['impressions'].split('|')))
        # build a list of (impression, score)
        prediction_impressions_distances = [(impr[j], predictions.at[index, j])
                                            for j in range(len(impr))]
        # order the list by score (greater is better)
        prediction_impressions_distances.sort(key=lambda tup: tup[1], reverse=True)
        # keep only the impression ids
        ordered_impressions = list(map(lambda x: x[0], prediction_impressions_distances))
        # append the pair (index, reranked impressions)
        result_predictions.append((index, ordered_impressions))

    print('prediction created !!!')
    return result_predictions
def get_scores_batch(self, scores_type='test'):
    assert scores_type in ['train', 'test']
    if scores_type == 'test':
        X, indices = self.dataset.load_Xtest()
    else:
        X, indices = self.dataset.load_Xtrain(return_indices=True)

    predictions = self.model.predict(X)

    full_df = data.full_df()
    result_predictions = []
    for i, index in tqdm(enumerate(indices)):
        # get the impressions of the clickout to predict
        impr = list(map(int, full_df.loc[index]['impressions'].split('|')))
        scores = predictions[i]
        # append the triple (index, impressions, scores)
        result_predictions.append((index, impr, scores))

    return result_predictions
def recommend_batch(self):
    X_test, _, _, _ = data.dataset_xgboost_test(mode=self.mode,
                                                cluster=self.cluster,
                                                kind=self.kind)
    target_indices = data.target_indices(self.mode, self.cluster)
    full_impressions = data.full_df()
    print('data for test ready')

    scores = list(self.xg.predict(X_test))
    final_predictions = []
    count = 0
    for index in tqdm(target_indices):
        impressions = list(map(int, full_impressions.loc[index]['impressions'].split('|')))
        predictions = scores[count:count + len(impressions)]
        couples = list(zip(predictions, impressions))
        # sort by score, descending
        couples.sort(key=lambda x: x[0], reverse=True)
        _, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        count += len(impressions)
    return final_predictions
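
# Illustrative sketch (not repo code) of the score slicing done above: the
# ranker emits scores contiguously per clickout, so each session consumes the
# next len(impressions) entries and sorts its impressions by score, descending.
_scores = [0.2, 0.9, 0.4, 0.7, 0.1]
_impressions_per_session = [[10, 20, 30], [40, 50]]
_count = 0
for _impr in _impressions_per_session:
    _sess_scores = _scores[_count:_count + len(_impr)]
    _ranked = [i for _, i in sorted(zip(_sess_scores, _impr), reverse=True)]
    print(_ranked)  # [20, 30, 10], then [40, 50]
    _count += len(_impr)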
def extract_feature(self):
    tqdm.pandas()

    df = data.full_df()

    # mapping and encoding
    change_sort_filters = set(['sort by price', 'best value', 'sort by rating',
                               'focus on rating', 'sort by popularity'])
    sof_classes = ['sort_rating', 'sort_pop', 'sort_price']
    mapping = {
        'sort by price':      [0, 0, 1],
        'best value':         [0, 1, 1],
        'sort by rating':     [1, 0, 0],
        'focus on rating':    [1, 1, 0],
        'sort by popularity': [0, 1, 0],
    }

    rows = df[(df.action_type == 'clickout item') & df.current_filters.notnull()]
    rows = rows[['current_filters']]

    # keep only the sort filters and re-cast to list
    rows['filters_list'] = rows['current_filters'].str.lower().str.split('|') \
        .progress_apply(lambda x: list(set(x) & change_sort_filters))
    rows = rows.drop(['current_filters'], axis=1)
    rows = rows[rows['filters_list'].str.len() > 0]
    rows['filters_list'] = rows['filters_list'].apply(lambda x: x[0])

    # iterate over the interactions
    print('Total interactions:', rows.shape[0])
    matrix = np.zeros((rows.shape[0], len(sof_classes)), dtype='int8')
    k = 0
    for fl in tqdm(rows['filters_list'].values):
        matrix[k, :] = mapping[fl]
        k += 1

    # add the 3 new columns
    for i, col_name in enumerate(sof_classes):
        rows[col_name] = matrix[:, i]

    return rows.drop('filters_list', axis=1)
def recommend_batch(self):
    final_predictions = []
    scores_batch = []
    full_df = data.full_df()
    count = 0
    for index in tqdm(self.target_indices):
        impr = list(map(int, full_df.loc[index]['impressions'].split('|')))
        pred = self.predictions[count][0:len(impr)]
        couples = list(zip(pred, impr))
        # sort by score, descending
        couples.sort(key=lambda x: x[0], reverse=True)
        scores, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        scores_batch.append((index, list(sorted_impr), list(scores)))
        count += 1

    if self.mode != 'small':
        cf.check_folder('scores')
        np.save(f'scores/{self.name}', np.array(scores_batch))

    return final_predictions
def __init__(self, mode, cluster, dataset_name, pred_name, predict_train=False):
    """
    The dataset name is used to load the predictions created by the
    tensorflow ranking class.
    :param dataset_name: dataset name passed to the CreateDataset() method
    """
    name = 'tf_ranking'
    super(TensorflowRankig, self).__init__(mode=mode, cluster=cluster, name=name)
    self.dataset_name = dataset_name

    # the path where the predictions are stored
    _BASE_PATH = f'dataset/preprocessed/tf_ranking/{cluster}/{mode}/{self.dataset_name}'
    _PREDICTION_PATH = f'{_BASE_PATH}/{pred_name}.npy'

    # check if the predictions have been made
    exists_path_predictions = os.path.isfile(_PREDICTION_PATH)
    if not exists_path_predictions:
        print(f'the predictions for the\ndataset: {self.dataset_name}\n mode: {mode}\n '
              f'cluster: {cluster}\nhave not been made')
        exit(0)

    self.predictions = np.load(_PREDICTION_PATH)
    print('predictions loaded')

    if not predict_train:
        self.target_indices = data.target_indices(mode, cluster)
    else:
        self.target_indices = sorted(find_last_clickouts(data.full_df()))
def save(self, mode='full', add_unused_clickouts_to_test=True):
    """
    Makes use of fit to create the dataset for a specific cluster. In particular,
    it takes care of creating a folder at the same level as base_split with the
    specified name and the folder structure inside.
    """
    print('Creating {} cluster...'.format(mode), end=' ', flush=True)
    self._fit(mode)

    # create the cluster root folder
    path = f'dataset/preprocessed/{self.name}'
    check_folder(path)

    # create the full and local folders
    full_path = os.path.join(path, mode)
    check_folder(full_path)

    train = data.train_df(mode).loc[self.train_indices]
    train.to_csv(os.path.join(full_path, 'train.csv'))
    del train

    # in case some target_indices are specified, do not leave out the clickouts
    # that are not to be predicted
    if add_unused_clickouts_to_test and len(self.target_indices) > 0:
        indices_from_full = list(set(self.test_indices) - set(self.target_indices))
        indices_from_test = self.target_indices
        test = pd.concat([data.test_df(mode).loc[indices_from_test],
                          data.full_df().loc[indices_from_full]])
    else:
        test = data.test_df(mode).loc[self.test_indices]
    test.to_csv(os.path.join(full_path, 'test.csv'))

    if len(self.target_indices) > 0:
        np.save(os.path.join(full_path, 'target_indices'), self.target_indices)
    else:
        trgt_indices = preprocess.get_target_indices(test)
        np.save(os.path.join(full_path, 'target_indices'), trgt_indices)
    del test
    print('Done!')
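
# Side note (illustrative): in the condition above, 'and' is required rather
# than '&', because '&' binds tighter than '>' and would turn the check into a
# bitwise AND with the list length.
print(True & 2 > 0)    # False: parsed as (True & 2) > 0, and 1 & 2 == 0
print(True and 2 > 0)  # True: the intended boolean conjunction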