def _fit(self, mode):
    self.test_indices = data.test_indices(mode)
    self.train_indices = data.train_indices(mode)
    total_target_set = set(
        data.target_indices(mode=mode, cluster=data.SPLIT_USED))
    covered_target_list = []
    for i in self.clusters:
        covered_target_list += list(data.target_indices(mode, cluster=i))
    covered_target_set = set(covered_target_list)
    self.target_indices = list(total_target_set - covered_target_set)
def fit(self):
    """
    Fit the model on the data.
    Builds a dict dict_scores mapping each target index to a dict of
    impression scores: for every recommender matrix and for every
    (target_index, impressions, scores) triple, the score of each impression
    is accumulated, weighted by the corresponding recommender weight.
    """
    self.dict_scores = {}
    target_indices = data.target_indices(self.mode, self.cluster)
    # Initialize a dict of impression scores for each target index
    for s_target in target_indices:
        self.dict_scores[s_target] = {}
    for i in range(len(self.impression_scores_matrices)):
        print("Getting scores from recommender number {} ...".format(i))
        self.matrices_array = self.impression_scores_matrices[i].copy()
        self._normalization(self.normalization_mode)
        for k in tqdm(range(len(self.impression_scores_matrices[i]))):
            triple = self.impression_scores_matrices[i][k]
            # Update the scores, multiplied by the corresponding weight
            for j in range(len(triple[2])):
                if triple[1][j] in self.dict_scores[triple[0]]:
                    self.dict_scores[triple[0]][triple[1][j]] += \
                        self.normalized_matrices_array[k][j] * self.weights_array[i]
                else:
                    self.dict_scores[triple[0]][triple[1][j]] = \
                        self.normalized_matrices_array[k][j] * self.weights_array[i]
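# A minimal standalone sketch of the weighted accumulation performed in
# fit() above; the triples and weights below are illustrative, not repo data.
def _weighted_scores_sketch():
    triples_a = [(42, [10, 11], [0.9, 0.1])]  # (target_index, impressions, scores)
    triples_b = [(42, [11, 10], [0.8, 0.2])]
    weights = [0.7, 0.3]
    scores = {42: {}}
    for w, triples in zip(weights, [triples_a, triples_b]):
        for idx, imprs, vals in triples:
            for impr, val in zip(imprs, vals):
                scores[idx][impr] = scores[idx].get(impr, 0.0) + w * val
    return scores  # {42: {10: 0.69, 11: 0.31}} (up to float rounding)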
def preprocess_cv(mode='full'):
    def save_folds(df, user_session_df, train_index, test_index, count, mode):
        u_s_train = list(
            user_session_df.loc[train_index]['user_session'].values)
        u_s_test = list(user_session_df.loc[test_index]['user_session'].values)

        path = 'dataset/preprocessed/{}/{}'.format('fold_' + str(count), mode)
        check_folder(path)

        train = df[df['user_session'].isin(u_s_train)]
        train = train.drop(['user_session'], axis=1)
        train.to_csv(os.path.join(path, 'train.csv'))
        train_indices = train.index.values
        np.save(os.path.join(path, 'train_indices'), train_indices)

        test = df[df['user_session'].isin(u_s_test)]
        target_indices = sorted(find(test))
        # .loc (not .at): we are assigning to a list of row labels
        test.loc[target_indices, 'reference'] = np.nan
        test = test.drop(['user_session'], axis=1)
        test.to_csv(os.path.join(path, 'test.csv'))
        test_indices = test.index.values
        np.save(os.path.join(path, 'test_indices'), test_indices)
        np.save(os.path.join(path, 'target_indices'), target_indices)

        print(f'Train shape : {train.shape} , Test shape : {test.shape}')
        print(f'Last clickout indices : {len(target_indices)}')

    train_df = data.train_df(mode=mode, cluster='no_cluster')
    train_df['user_session'] = train_df['user_id'].values + '_' + train_df['session_id'].values
    test_df = data.test_df(mode=mode, cluster='no_cluster')
    test_df['user_session'] = test_df['user_id'].values + '_' + test_df['session_id'].values
    df = pd.concat([train_df, test_df])

    # Extract the user_sessions referring to target_indices
    target_indices = data.target_indices(mode=mode, cluster='no_cluster')
    test_target_u_s = test_df.loc[target_indices].drop_duplicates(
        'user_session')['user_session'].to_list()
    print(f'Number of user_session in target_indices : {len(test_target_u_s)}')

    # Remove those sessions from df
    df = df[~df['user_session'].isin(test_target_u_s)]
    user_session_df = df.drop_duplicates('user_session')
    user_session_df = user_session_df.reset_index(drop=True)
    print(f'Number of user_session NOT in target_indices : {user_session_df.shape[0]}')

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for i, (train_index, test_index) in enumerate(kf.split(user_session_df)):
        print(f'train indices : {len(train_index)}, test indices : {len(test_index)}')
        save_folds(df, user_session_df, train_index, test_index, i, mode)
def recommend_batch(self):
    svm_filename = 'svmlight_test.txt'
    _path = self.data_dir.joinpath(svm_filename)
    X_test, y_test = load_svmlight_file(str(_path))

    target_indices = data.target_indices(self.mode, self.cluster)
    target_indices.sort()
    test = data.test_df('small', 'no_cluster')
    print('data for test ready')

    scores = list(self.model.predict(X_test))
    final_predictions = []
    count = 0
    for index in tqdm(target_indices):
        impressions = list(map(int, test.loc[index]['impressions'].split('|')))
        predictions = scores[count:count + len(impressions)]
        couples = list(zip(predictions, impressions))
        couples.sort(key=lambda x: x[0], reverse=True)
        _, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        count += len(impressions)
    return final_predictions
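# The slice-and-sort pattern above recurs in every recommend_batch of this
# repo: the model emits one score per impression row, in target-index order,
# so each session's scores form a contiguous slice. A standalone sketch with
# toy values:
def _rank_impressions_sketch():
    scores = [0.2, 0.9, 0.5]       # flat scores for one session's impressions
    impressions = [101, 102, 103]  # illustrative item ids
    couples = sorted(zip(scores, impressions), key=lambda x: x[0], reverse=True)
    _, sorted_impr = zip(*couples)
    return list(sorted_impr)       # [102, 103, 101]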
def supersampling(mode):
    print("Supersampling for mode: {}".format(mode))
    train = data.train_df(mode)
    class_to_sessions = get_class_to_sessions_dict(train)
    session_to_indices = get_session_to_indices_dict(train)
    sessions_to_be_resampled = resample_session(class_to_sessions.copy(), train)
    new = duplicate_sessions(sessions_to_be_resampled.copy(), train, session_to_indices)

    test = data.test_df(mode)
    max_test_index = max(test.index)
    max_train_index = max(train.index)
    max_index = max(max_test_index, max_train_index)
    new.index += max_index + 1

    new = pd.concat([train, new])
    train_len = len(new)
    old_starting_index = test.index[0]
    new = pd.concat([new, test])
    print("Supersampling ended for mode={}, saving df".format(mode))

    # iloc's stop is exclusive: the first train_len rows are the supersampled train
    new_train = new.iloc[:train_len]
    new_test = new.iloc[train_len:]
    # new_starting_index = new_test.index[0]
    # offset = new_starting_index - old_starting_index
    # target_indices += offset
    target_indices = data.target_indices(mode, "no_cluster")
    np.save(path + "/" + mode + "/target_indices", target_indices)
    new_train.to_csv(path + "/" + mode + "/train.csv", index=True)
    new_test.to_csv(path + "/" + mode + "/test.csv", index=True)
def score_fn(model, X, y):
    global i
    print(i)
    i += 1
    t0 = time.time()
    target_indices = data.target_indices('local')
    full_impressions = data.full_df()
    scores = list(model.xg.predict(X))
    final_predictions = []
    count = 0
    for index in tqdm(target_indices):
        impressions = list(
            map(int, full_impressions.loc[index]['impressions'].split('|')))
        predictions = scores[count:count + len(impressions)]
        couples = list(zip(predictions, impressions))
        couples.sort(key=lambda x: x[0], reverse=True)
        _, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        count += len(impressions)
    mrr = model.compute_MRR(final_predictions)
    print('Done in', time.time() - t0)
    print()
    return mrr
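# compute_MRR is provided by the recommender base class; for reference, a
# minimal sketch of the metric itself, assuming a dict mapping each target
# index to the clicked item id (illustrative, not the repo's API):
def _mrr_sketch(final_predictions, ground_truth):
    rr_sum = 0.0
    for index, ranked_items in final_predictions:
        clicked = ground_truth[index]
        if clicked in ranked_items:
            # reciprocal rank of the clicked item (1-based position)
            rr_sum += 1.0 / (ranked_items.index(clicked) + 1)
    return rr_sum / len(final_predictions)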
def recommend_batch(self):
    test = pd.read_csv(str(self.data_directory.joinpath('..', 'gbdt_test.csv')))
    target_indices = data.target_indices(mode='full')
    #target_indices = sorted(target_indices)

    # Take the indices in common between test and target_indices
    test_idx = set(test['index'].values)
    print(f'Length test_idx : {len(test_idx)}')
    print(f'Length target_indices : {len(target_indices)}')
    target_indices = set(target_indices) & test_idx
    print(f'Length indices in common: {len(target_indices)}')
    target_indices = sorted(target_indices)
    #target_indices = sorted(test_idx)

    X_test = test.sort_values(by=['index', 'impression_position'])
    X_test = X_test.drop(['item_id', 'label', 'index'], axis=1)
    X_test = X_test.astype(np.float64)

    # full_impressions = data.full_df()
    full_impressions = self.full
    print('data for test ready')
    scores = list(self.xg.predict(X_test))
    final_predictions = []
    count = 0
    for index in tqdm(target_indices):
        impressions = list(
            map(int, full_impressions.loc[index]['impressions'].split('|')))
        predictions = scores[count:count + len(impressions)]
        couples = list(zip(predictions, impressions))
        couples.sort(key=lambda x: x[0], reverse=True)
        _, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        count += len(impressions)
    return final_predictions
def fit(self):
    """
    Create the list of recommendation tuples, ordering the impressions of
    each target session by ascending price (cheapest first).
    """
    df_test = data.test_df(self.mode)
    target_indices = data.target_indices(self.mode, self.cluster)
    df_test_target = df_test[df_test.index.isin(target_indices)]

    # Initializing the list of recs
    recs_tuples = []
    for i in tqdm(df_test_target.index):
        impressions = df_test_target.at[i, "impressions"]
        impressions = list(map(int, impressions.split('|')))
        prices = df_test_target.at[i, "prices"]
        prices = list(map(int, prices.split('|')))
        temp_dict = {}
        for j in range(len(impressions)):
            temp_dict[impressions[j]] = prices[j]
        ordered_recs = sorted(temp_dict, key=temp_dict.__getitem__)
        recs_tuples.append((i, ordered_recs))
    self.recs_batch = recs_tuples
    return recs_tuples
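# The dict-based sort above is equivalent to sorting (impression, price)
# pairs by price; a standalone sketch with illustrative values:
def _cheapest_first_sketch():
    impressions = [7, 8, 9]
    prices = [120, 80, 95]
    # sorted by price ascending -> [8, 9, 7]
    return [impr for impr, _ in sorted(zip(impressions, prices), key=lambda t: t[1])]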
def train_indices(mode='local', cluster='no_cluster'):
    df_train = data.train_df(mode=mode, cluster=cluster)
    df_test = data.test_df(mode=mode, cluster=cluster)
    target_indices = data.target_indices(mode=mode, cluster=cluster)
    df = pd.concat([df_train, df_test])
    idx = find_last_clickout_indices(df)
    train_idx = set(idx) - set(target_indices)
    return train_idx
def _fit(self, mode):
    """
    Has target indices equal to the 10% of the sessions with:
        - no numeric reference
        - more than 1 step
    but we still train on all of them ;)
    """
    def RepresentsInt(s):
        try:
            int(s)
            return True
        except ValueError:
            return False

    train = data.train_df('small')
    test = data.test_df('small')
    tgt_indices = data.target_indices('small')

    real_test_to_keep = []
    for idx in tgt_indices:
        usr_sess_indices = []
        theres_int = False
        a_user = test.at[idx, 'user_id']
        a_sess = test.at[idx, 'session_id']
        usr_sess_indices.append(idx)
        j = idx - 1
        pos_moved = 0
        while j >= 0:
            try:
                new_user = test.at[j, 'user_id']
                new_sess = test.at[j, 'session_id']
                if new_user == a_user and new_sess == a_sess:
                    usr_sess_indices.append(j)
                    reference = test.at[j, 'reference']
                    if RepresentsInt(reference):
                        theres_int = True
                    j -= 1
                    pos_moved += 1
                else:
                    if not (pos_moved == 0 or theres_int):
                        real_test_to_keep += sorted(usr_sess_indices)
                    break
            except KeyError:
                # j is not a valid label: either we walked past the start of
                # the test index, or we hit a hole in the index
                if j < test.index.values[0]:
                    if not (pos_moved == 0 or theres_int):
                        real_test_to_keep += sorted(usr_sess_indices)
                    break
                else:
                    j -= 1

    self.train_indices = train.index.values
    real_test_indices = retrieve_real_test_indices(mode, 'no_cluster')
    all_test_indices = data.test_df(mode).index.values
    self.test_indices = sorted(
        list(set(all_test_indices) - set(real_test_indices)) + real_test_to_keep)
    self.target_indices = sorted(list(set(self.test_indices) & set(tgt_indices)))
def __init__(self, filepaths, cluster):
    self.filepaths = filepaths
    self.cluster = cluster
    self.target_sessions = list(
        data.test_df("full", "no_cluster")
            .iloc[data.target_indices("full", "no_cluster")].session_id.values)
    # TODO: check if filepaths exist
    self.absolute_path = 'submission/'
def __init__(self, mode='local', learning_rate=0.3, min_child_weight=1,
             n_estimators=300, max_depth=3, subsample=1, colsample_bytree=1,
             reg_lambda=1.0, reg_alpha=0):
    name = 'gbdt_hybrid'
    cluster = 'no_cluster'
    super(Gbdt_Hybrid, self).__init__(mode, cluster, name)

    self.current_directory = Path(__file__).absolute().parent
    self.data_directory = self.current_directory.joinpath('..', '..', 'submissions/hybrid')
    #self.gt_csv = self.data_directory.joinpath('ground_truth.csv')
    self.mode = mode
    self.full = data.full_df()
    self.local_target_indices = data.target_indices(mode='local', cluster='no_cluster')
    self.full_target_indices = data.target_indices(mode='full', cluster='no_cluster')

    directory = self.data_directory.joinpath('local')
    full_dir = self.data_directory.joinpath('full')

    self.xg = xgb.XGBRanker(
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        max_depth=math.ceil(max_depth),
        n_estimators=math.ceil(n_estimators),
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        n_jobs=-1,
        objective='rank:ndcg')
    self.cv_path = self.data_directory.joinpath('cross_validation')
def get_preprocessed_dataset(self, mode):
    """
    Apply preprocessing steps to the dataset:
    - add an id to identify groups
    - keep only useful columns
    - join any one-hot-encoded categorical features
    :param mode: 'train' or 'test'
    :return: the preprocessed dataframe
    """
    if mode == 'train':
        classification_df = data.dataset_catboost_train(self.mode, self.cluster).copy()
    elif mode == 'test':
        classification_df = data.dataset_catboost_test(self.mode, self.cluster).copy()
    else:
        print('Wrong mode given in get_preprocessed_dataset!')
        return

    if len(self.features_to_drop) > 0:
        classification_df.drop(self.features_to_drop, axis=1, inplace=True)

    print('Length of dataset {} is {}, number of features is {}'.format(
        mode, classification_df.shape[0], classification_df.shape[1]))

    target_indices = data.target_indices(mode=self.mode, cluster=self.cluster)
    if mode == 'test':
        self.test_df = data.test_df(self.mode, self.cluster)
        sessi_target = self.test_df.loc[target_indices].session_id.values
        self.dict_session_id = dict(zip(sessi_target, target_indices))
        classification_df['id'] = classification_df.apply(
            lambda row: self.dict_session_id.get(row.session_id), axis=1)
    else:
        # Create a unique id for each user_id / session_id pair
        classification_df = classification_df.sort_values(by=['user_id', 'session_id'])
        classification_df = classification_df.assign(
            id=(classification_df['user_id'] + '_' +
                classification_df['session_id']).astype('category').cat.codes)

    if self.features_to_one_hot is not None:
        for f in self.features_to_one_hot:
            one_hot = pd.get_dummies(classification_df[f])
            classification_df = classification_df.drop([f], axis=1)
            classification_df = classification_df.join(one_hot)

    return classification_df
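# A minimal sketch of the group-id trick used above: concatenating the two
# key columns and taking categorical codes yields one integer per
# user/session pair (the data below is illustrative).
def _group_id_sketch():
    import pandas as pd
    df = pd.DataFrame({'user_id': ['u1', 'u1', 'u2'],
                       'session_id': ['s1', 's1', 's9']})
    df['id'] = (df['user_id'] + '_' + df['session_id']).astype('category').cat.codes
    return df  # rows of the same user/session share the same id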
def _fit(self, mode):
    """
    Train, test and target indices are just the sessions which have:
        - no numeric reference
        - more than 1 step
    """
    def RepresentsInt(s):
        try:
            int(s)
            return True
        except ValueError:
            return False

    train = data.train_df(mode)
    train_index = train.index.values
    test = data.test_df(mode)
    test_index = test.index.values
    tgt_indices = data.target_indices(mode)

    df = pd.concat([train, test])
    del train
    del test

    lst_clk_indices = sorted(find(df))
    to_return = []
    for idx in lst_clk_indices:
        usr_sess_indices = []
        try:
            a_user = df.at[idx, 'user_id']
            a_sess = df.at[idx, 'session_id']
            usr_sess_indices.append(idx)
        except KeyError:
            continue
        j = idx - 1
        while j >= 0:
            try:
                new_user = df.at[j, 'user_id']
                new_sess = df.at[j, 'session_id']
                if new_user == a_user and new_sess == a_sess:
                    usr_sess_indices.append(j)
                    reference = df.at[j, 'reference']
                    if RepresentsInt(reference):
                        break
                    j -= 1
                else:
                    # session boundary reached: keep it only if it has more than 1 step
                    if idx - j >= 2:
                        to_return += usr_sess_indices
                    break
            except KeyError:
                j -= 1

    self.train_indices = sorted(list(set(train_index) & set(to_return)))
    self.test_indices = sorted(list(set(test_index) & set(to_return)))
    self.target_indices = sorted(list(set(tgt_indices) & set(to_return)))
def fit(self, recs):
    # `recs` is assumed to be an iterable of (target_index, ranked_impressions)
    # tuples coming from the underlying recommenders; the original code
    # referenced an undefined `recs` and an undefined loop variable `i`
    all_recs = {}
    self.recs = []
    for e in recs:
        if e[0] in all_recs:
            raise Exception('Recommendation ids must not overlap')
        all_recs[e[0]] = e[1]
    target_ids = data.target_indices(self.mode, self.cluster)
    for i in target_ids:
        if i not in all_recs:
            raise Exception('Recommendations must cover all the target ids of the current mode')
        self.recs.append((i, all_recs[i]))
def fit(self): df_test = data.test_df(self.mode, cluster=self.cluster) print("{}: creating grouped sessions with interaction lists".format(self.name)) session_groups = self.get_groupby_sessions_references(data.test_df(self.mode, cluster=self.cluster)) # Getting target sessions target_indices = data.target_indices(self.mode, self.cluster) df_test_target = df_test[df_test.index.isin(target_indices)] # I must reason with session_ids since i'm interested in getting last interactions of same session df_test_target = df_test_target.set_index("session_id") # then i create a dictionary for re-mapping session into indices if len(df_test_target.index) != len(target_indices): print("Indices not same lenght of sessions, go get some coffee...") return self.dictionary_indices = dict(zip(df_test_target.index, target_indices)) list_sessions = session_groups.index recs_tuples = [] print("{}: fitting the model".format(self.name)) for i in tqdm(df_test_target.index): # Check if it is a session without interactions if i not in list_sessions: recs_tuples.append((self.dictionary_indices.get(i), [])) else: # Get interacted element of session with no duplicates interacted_elements = np.asarray(session_groups.at[i, "sequence"]) interacted_elements = np.asarray(self._set_no_reordering(x for x in interacted_elements)) impressions = np.asarray(df_test_target.at[i, "impressions"].split("|")) # First i want to be sure the impressions contains all the interacted elements (if not, they must be cutted off from relevant items) mask_only_in_impression = np.in1d(interacted_elements, impressions, assume_unique=True) interacted_elements = interacted_elements[mask_only_in_impression] # I append the last interacted elements as first (so I invert the order of relevant_elements!) real_recommended = np.flipud(interacted_elements) real_recommended = real_recommended.astype(np.int) recs_tuples.append( (self.dictionary_indices.get(i), list(real_recommended)[:self.k_first_only_to_recommend])) self.recs_batch = recs_tuples
def recommend_batch(self):
    test = data.test_df(self.mode, self.cluster)
    target_indices = data.target_indices(self.mode, self.cluster)
    target_indices.sort()

    if self.include_test:
        test_with_weights = self.test_with_weights
    else:
        dataset_test = self.get_preprocessed_dataset(mode='test')
        print('data for test ready')

        test_feat_df = dataset_test.drop(
            ['user_id', 'session_id', 'item_id', 'label', 'id'], axis=1)

        if list(test_feat_df.columns.values) != list(self.train_features):
            print('Training columns are different from test columns! Check')
            print(self.train_features)
            print(test_feat_df.columns.values)
            exit(0)

        X_test = test_feat_df.values
        group_id = dataset_test.id.values
        test_with_weights = Pool(data=X_test,
                                 label=None,
                                 group_id=group_id,
                                 cat_features=self.categorical_features)

    scores = self.ctb.predict(test_with_weights)

    self.predictions = []
    self.scores_batch = []
    count = 0
    for index in tqdm(target_indices):
        impressions = list(map(int, test.loc[index]['impressions'].split('|')))
        predictions = scores[count:count + len(impressions)]
        couples = list(zip(predictions, impressions))
        couples.sort(key=lambda x: x[0], reverse=True)
        if len(couples) > 0:
            scores_impr, sorted_impr = zip(*couples)
            count += len(impressions)
            self.predictions.append((index, list(sorted_impr)))
            self.scores_batch.append((index, list(sorted_impr), scores_impr))
    return self.predictions
def __init__(self, matrix, _type, mode='full', cluster='no_cluster', name='distancebased',
             urm_name='urm_clickout', k=100, distance='cosine', shrink=0, threshold=0,
             implicit=False, alpha=0.5, beta=0.5, l=0.5, c=0.5, urm=None,
             matrix_mul_order='standard'):
    super(DistanceBasedRecommender, self).__init__(mode=mode, cluster=cluster, name=name)
    self.type = _type
    self.urm_name = urm_name
    self._sim_matrix = None
    self.target_indices = data.target_indices(mode, cluster)
    # 'standard' if you want R•R', or 'inverse' if you want to compute S•R
    self._matrix_mul_order = matrix_mul_order
    self.dict_row = data.dictionary_row(mode=self.mode, cluster=self.cluster,
                                        urm_name=self.urm_name, type=self.type)
    self.dict_col = data.dictionary_col(mode=self.mode, cluster=self.cluster,
                                        urm_name=self.urm_name, type=self.type)
    self.matrix = matrix
    self.k = int(k)
    self.distance = distance
    self.shrink = shrink
    self.threshold = threshold
    self.implicit = implicit
    self.alpha = alpha
    self.beta = beta
    self.l = l
    self.c = c
    self.urm = urm
    self.scores_batch = None
def fit(self):
    """
    Create the list of recommendation tuples, keeping the impressions of
    each target session in their original shown order.
    """
    df_test = data.test_df(self.mode)
    target_indices = data.target_indices(self.mode, self.cluster)
    df_test_target = df_test[df_test.index.isin(target_indices)]

    # Initializing the list of recs
    recs_tuples = []
    for i in tqdm(df_test_target.index):
        impressions = df_test_target.at[i, "impressions"]
        impressions = list(map(int, impressions.split('|')))
        recs_tuples.append((i, impressions))
    self.recs_batch = recs_tuples
def train():
    mode = menu.mode_selection()

    # fit the model
    model = interactive_model(mode)
    model.fit(epochs=10000, early_stopping_patience=25,
              early_stopping_on='val_mrr', mode='max')
    print('\nFit completed!')

    # recommend
    target_indices = data.target_indices(mode, 'cluster_recurrent')
    print('Recommending...')
    recommendations = model.recommend_batch(target_indices)
    print('Recommendation count: ', len(recommendations))
    mrr = model.compute_MRR(recommendations)
    model.save(folderpath='saved_models/',
               suffix='_{}'.format(round(mrr, 5)).replace('.', ''))
def __init__(self, mode, urm_name, _type, cluster='no_cluster'):
    name = 'content_based_remastered'
    super(ContentBased, self).__init__(mode, cluster, name)
    self.type = _type
    self.urm_name = urm_name
    self.dict_row = data.dictionary_row(mode=mode, urm_name=urm_name,
                                        cluster=cluster, type=self.type)
    self.dict_col = data.dictionary_col(mode=mode, urm_name=urm_name,
                                        cluster=cluster, type=self.type)
    self.user_features_matrix = None
    self.scores_batch = None
    # load target indices
    self.target_indices = data.target_indices(mode, cluster)
def __init__(self, mode, cluster, urm_name, factors=100, regularization=0.01,
             iterations=10, alpha=25):
    os.environ['MKL_NUM_THREADS'] = '1'
    name = 'ALS urm_name: {}\n factors: {}\n regularization: {}\n ' \
           'iterations: {}\n alpha: {}'.format(urm_name, factors, regularization,
                                               iterations, alpha)
    super(AlternatingLeastSquare, self).__init__(mode, cluster, name)
    self.factors = int(factors)
    self.regularization = regularization
    self.iterations = int(iterations)
    self.alpha = int(alpha)

    self.target_indices = data.target_indices(mode, cluster)
    self.dict_row = data.dictionary_row(mode, cluster)
    self.target_indices_urm = []
    for ind in self.target_indices:
        self.target_indices_urm.append(
            self.dict_row[tuple(data.full_df().loc[ind][['session_id', 'user_id']])])

    self.urm = data.urm(mode=mode, cluster=cluster, urm_name=urm_name)
    self.user_vecs = None
    self.item_vecs = None
    self._model = None

    self.fixed_params_dict = {
        'mode': mode,
        'urm_name': urm_name,
        'cluster': cluster
    }

    self.hyperparameters_dict = {
        'factors': (50, 200),
        'regularization': (0, 1),
        'iterations': (1, 250),
        'alpha': (15, 45)
    }
def load_Xtest(self):
    test_df = data.dataset_catboost_test(mode=self.mode, cluster=self.cluster)
    target_indices = data.target_indices(self.mode, self.cluster)
    sessi_target = data.test_df(
        self.mode, self.cluster).loc[target_indices].session_id.values
    dict_session_trg_idx = dict(zip(sessi_target, target_indices))
    test_df['id'] = test_df.apply(
        lambda row: dict_session_trg_idx.get(row.session_id), axis=1)
    print('data for test ready')
    test_feat_df = test_df.drop(['user_id', 'session_id', 'item_id'], axis=1)
    return test_feat_df
def recommend_batch(self):
    X_test, _, _, _ = data.dataset_xgboost_test(mode=self.mode, cluster=self.cluster,
                                                kind=self.kind)
    target_indices = data.target_indices(self.mode, self.cluster)
    full_impressions = data.full_df()
    print('data for test ready')
    scores = list(self.xg.predict(X_test))
    final_predictions = []
    count = 0
    for index in tqdm(target_indices):
        impressions = list(
            map(int, full_impressions.loc[index]['impressions'].split('|')))
        predictions = scores[count:count + len(impressions)]
        couples = list(zip(predictions, impressions))
        couples.sort(key=lambda x: x[0], reverse=True)
        _, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        count += len(impressions)
    return final_predictions
def __init__(self, mode, cluster, dataset_name, pred_name, predict_train=False):
    """
    The dataset name is used to load the predictions created by the
    TensorFlow ranking class.
    :param dataset_name: dataset name passed to the CreateDataset() method
    """
    name = 'tf_ranking'
    super(TensorflowRankig, self).__init__(mode=mode, cluster=cluster, name=name)
    self.dataset_name = dataset_name

    # the path where the predictions are stored
    _BASE_PATH = f'dataset/preprocessed/tf_ranking/{cluster}/{mode}/{self.dataset_name}'
    _PREDICTION_PATH = f'{_BASE_PATH}/{pred_name}.npy'

    # check if the predictions have been made
    exists_path_predictions = os.path.isfile(_PREDICTION_PATH)
    if not exists_path_predictions:
        print(f'the predictions for the\ndataset: {self.dataset_name}\n'
              f'mode: {mode}\ncluster: {cluster}\nhave not been made')
        exit(0)

    self.predictions = np.load(_PREDICTION_PATH)
    print('predictions loaded')

    if not predict_train:
        self.target_indices = data.target_indices(mode, cluster)
    else:
        self.target_indices = sorted(find_last_clickouts(data.full_df()))
def recommend_batch(self):
    print('loading target indices')
    target_indices = data.target_indices(mode=self.mode, cluster=self.cluster)
    print('done\n')
    full_impressions = data.full_df()
    print('retrieving predictions')
    scores = self.model.predict(self.x_vali)
    final_predictions = []
    count = 0
    for index in tqdm(target_indices):
        impressions = list(
            map(int, full_impressions.loc[index]['impressions'].split('|')))
        predictions = scores[count:count + len(impressions)]
        couples = list(zip(predictions, impressions))
        couples.sort(key=lambda x: x[0], reverse=True)
        _, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        count += len(impressions)
    return final_predictions
def fit(self):
    df_test = data.test_df(self.mode, cluster=self.cluster)

    # Getting target sessions
    target_indices = data.target_indices(self.mode, self.cluster)
    df_test_target = df_test[df_test.index.isin(target_indices)]

    # note: the original counters were mislabeled in the prints; the names
    # below reflect what each branch actually counts
    rotated = 0
    oneshot = 0
    untouched = 0
    recs_tuples = []
    print("Fitting...")
    for index, row in tqdm(df_test_target.iterrows()):
        impressions = list(map(int, row["impressions"].split("|")))
        if int(row.step) == 1:
            # The clickout is the first interaction of the session:
            # a one-shot session, keep the impressions as they are
            oneshot += 1
            recs_tuples.append((index, impressions))
        else:
            previous_row = df_test.loc[index - 1]
            last_interaction_ref = previous_row["reference"]
            if type(last_interaction_ref) == str and last_interaction_ref.isdigit():
                last_interaction_ref = int(last_interaction_ref)
            if last_interaction_ref and last_interaction_ref in impressions:
                # Rotate the impressions so the last interacted item comes first
                i = impressions.index(last_interaction_ref)
                sorted_impressions = impressions[i:] + impressions[:i]
                rotated += 1
                recs_tuples.append((index, sorted_impressions))
            else:
                untouched += 1
                recs_tuples.append((index, impressions))

    print("{} % of sessions resorted on the last interaction".format(
        round(rotated / len(df_test_target) * 100, 2)))
    print("{} % of one-shot sessions".format(
        round(oneshot / len(df_test_target) * 100, 2)))
    print("{} % of untouched sessions".format(
        round(untouched / len(df_test_target) * 100, 2)))
    self.recs = recs_tuples
    print("Fitting completed!")
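# A standalone sketch of the rotation above: the impression list is rotated
# so the last clicked item leads, preserving the relative order of the rest
# (values are illustrative).
def _rotate_impressions_sketch():
    impressions = [5, 6, 7, 8]
    last_clicked = 7
    i = impressions.index(last_clicked)
    return impressions[i:] + impressions[:i]  # [7, 8, 5, 6]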
def submission():
    mode = 'full'
    model = interactive_model(mode)

    sub_suffix = input('Insert submission suffix: ')

    checkpoint_path = menu.checkpoint_selection(checkpoints_dir='saved_models')
    print('Loading {}...'.format(checkpoint_path), end='\r', flush=True)
    model.load(checkpoint_path)
    print('Done!', flush=True)

    # recommend
    target_indices = data.target_indices(mode, 'cluster_recurrent')
    print('Recommending...')
    recommendations = model.recommend_batch(target_indices)

    # create and send the sub
    sub_name = f'{model.name}_{sub_suffix}'
    sub_path = out.create_sub(recommendations, submission_name=sub_name)
    print('Done')
    sub.send(sub_path, username='******', password='******')
import utils.menu as menu
from recommenders.recurrent.RNNClassificationRecommender import RNNClassificationRecommender
from utils.dataset import SequenceDatasetForClassification
# `data` and `out` are used below but were not imported; assuming the repo's
# top-level modules
import data
import out

if __name__ == "__main__":
    mode = menu.mode_selection()

    dataset = SequenceDatasetForClassification(
        f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_classification')

    model = RNNClassificationRecommender(dataset,
                                         cell_type='gru',
                                         num_recurrent_units=128,
                                         num_recurrent_layers=3,
                                         num_dense_layers=2,
                                         class_weights=[])

    model.load('/Users/federico/Desktop/rnn_GRU_3layers_128units_2dense_class_06316.h5')
    #model.load('gru.h5')

    target_indices = data.target_indices(mode, 'cluster_recurrent')
    recommendations = model.recommend_batch(target_indices)

    if mode != 'full':
        model.compute_MRR(recommendations)

    out.create_sub(recommendations, submission_name=model.name + '_06316')
def build_user_prop(mode, cluster='no_cluster'):
    def func(x):
        y = x[(x['action_type'] == 'clickout item')]
        # features
        features = {
            'avg price': -1,
            'avg cheap position': -1,
            'avg time per step': 0,
            'session avg length': 0,
            'session avg steps': 0,
            'session num': 0,
            'mobile perc': 0,
            'tablet perc': 0,
            'desktop perc': 0,
            'filters_during_session': '',
            'num change of sort order session': 0,
            'num clickout item session': 0,
            'num filter selection session': 0,
            'num interaction item deals session': 0,
            'num interaction item image session': 0,
            'num interaction item info session': 0,
            'num interaction item rating session': 0,
            'num search for destination session': 0,
            'num search for item session': 0,
            'num search for poi session': 0
        }

        # Compute the avg length of the sessions in seconds
        # (note: not considering sessions ending at the last clickout!)
        session_grouped = x.groupby("session_id")
        get_length_sum = 0
        step_count = 0
        for name, group in session_grouped:
            tail_temp = group.tail(1)
            get_length_sum += int(tail_temp['timestamp'].values[0]) - int(
                group.head(1)['timestamp'].values[0])
            step_count += int(tail_temp['step'].values[0])

        # Compute the avg steps in a session
        # (note: not considering sessions ending at the last clickout!)
        user_sessions = set(x['session_id'].values)
        avg_steps = round(step_count / len(user_sessions), 2)
        features['session avg steps'] = avg_steps
        features['session num'] = len(user_sessions)
        avg_length = round(get_length_sum / len(user_sessions), 2)
        features['session avg length'] = avg_length
        features['avg time per step'] = round(avg_length / avg_steps, 2)

        # Count the types of non-numeric actions performed by this user in the past
        actions = list(x['action_type'].values)
        for ind in range(len(actions)):
            if ('num ' + actions[ind] + ' session') in features:
                features['num ' + actions[ind] + ' session'] += 1

        # Remove duplicates:
        x.drop(['timestamp', 'step'], axis=1, inplace=True)
        x = x.drop_duplicates()

        if len(y) > 0:
            # Build a record of the interacted prices of items, only when available
            # (.notnull() instead of the original `!= None`, which never filters NaN):
            impressions_prices_available = y[y['impressions'].notnull()][[
                "impressions", "prices"]].drop_duplicates()
            # [13, 43, 4352, 543, 345, 3523] impressions
            # [45, 34, 54, 54, 56, 54] prices
            # -> [(13,45), (43,34), ...]
            # Then create a dict: {13: 45, 43: 34, ...}
            tuples_impr_prices = []
            tuples_impr_price_pos_asc = []
            for i in impressions_prices_available.index:
                impr = impressions_prices_available.at[i, 'impressions'].split('|')
                prices = impressions_prices_available.at[i, 'prices'].split('|')
                tuples_impr_prices += list(zip(impr, prices))
                # sort in place (the original called sorted() and discarded the result)
                tuples_impr_prices.sort(key=lambda t: float(t[1]))
                tuples_impr_price_pos_asc += list(
                    zip(impr, list(range(1, len(tuples_impr_prices) + 1))))
            tuples_impr_prices = list(set(tuples_impr_prices))
            dict_impr_price = dict(tuples_impr_prices)

            # Create a dict giving the position of each impression with respect
            # to the cheapest item
            tuples_impr_price_pos_asc = list(set(tuples_impr_price_pos_asc))
            dict_impr_price_pos = dict(tuples_impr_price_pos_asc)

            sum_price = 0
            sum_pos_price = 0
            count_interacted = 0
            # IMPORTANT: impressions and clickouts are considered distinctly.
            # If an impression is also clicked, its price counts double
            df_only_numeric = x[pd.to_numeric(x['reference'], errors='coerce').notnull()][[
                "reference", "impressions", "action_type"]].drop_duplicates()

            # Not considering the last clickout of the train sessions
            clks_num_reference = df_only_numeric[df_only_numeric['action_type'] ==
                                                 'clickout item']
            if len(clks_num_reference) == len(y):  # is it a train session?
                idx_last_clk = y.tail(1).index.values[0]
                df_only_numeric = df_only_numeric.drop(idx_last_clk)

            for idx, row in df_only_numeric.iterrows():
                reference = row.reference
                if reference in dict_impr_price.keys():
                    if row.action_type == "clickout item":
                        sum_price += int(dict_impr_price[reference]) * 2
                        sum_pos_price += int(dict_impr_price_pos[reference]) * 2
                        count_interacted += 2
                    else:
                        sum_price += int(dict_impr_price[reference])
                        sum_pos_price += int(dict_impr_price_pos[reference])
                        count_interacted += 1

            if count_interacted > 0:
                features['avg price'] = round(sum_price / count_interacted, 2)
                features['avg cheap position'] = round(sum_pos_price / count_interacted, 2)
            else:
                features['avg price'] = -1
                features['avg cheap position'] = -1

            # Device percentage features
            tot_clks = len(y)
            features['mobile perc'] = round(y[y.device == "mobile"].shape[0] / tot_clks, 2)
            features['tablet perc'] = round(y[y.device == "tablet"].shape[0] / tot_clks, 2)
            features['desktop perc'] = round(y[y.device == "desktop"].shape[0] / tot_clks, 2)

            # Get the filters used during past clickouts (except the clickout
            # to predict!); they will be one-hot encoded later
            y_filters = y[y.current_filters.notnull() & y.reference.notnull()]
            for i in y_filters.index:
                features['filters_during_session'] += str(
                    y_filters.at[i, 'current_filters']) + "|"
            x_activating_filters = x[(x.action_type == "filter selection")]
            for i in x_activating_filters.index:
                features['filters_during_session'] += str(
                    x_activating_filters.at[i, 'reference']) + "|"

        return pd.DataFrame.from_records([features])

    def construct_features(df):
        dataset = df.groupby(['user_id']).progress_apply(func)
        one_hot = dataset['filters_during_session'].astype(str).str.get_dummies()
        missing = poss_filters - set(one_hot.columns)
        to_drop = set(one_hot.columns) - poss_filters
        for e in missing:
            one_hot[e] = 0
        for e in to_drop:
            one_hot = one_hot.drop([e], axis=1)
        dataset = dataset.drop(['filters_during_session'], axis=1)
        dataset = dataset.join(one_hot)
        return dataset

    def get_user_favorite_filters(full_df, users):
        """
        Build a structure that, for every user in the train set, gives a
        one-hot encoding of all the properties of the hotels clicked by that
        user (e.g. property: 3 Stars).
        """
        # get the train clickouts and merge the metadata of the hotels
        train_df = full_df[full_df["user_id"].isin(users)]
        train_df = train_df[(train_df["action_type"] == "clickout item") & (
            pd.to_numeric(train_df['reference'], errors='coerce').notnull())]
        train_df.drop([
            "session_id", "timestamp", "step", "action_type", "platform",
            "city", "device", "current_filters", "impressions", "prices"
        ], axis=1, inplace=True)

        # Merge & eliminate columns
        metadata_one_hot = data.accomodations_one_hot().reset_index()
        train_df['reference'] = train_df['reference'].astype(int)
        metadata_one_hot['item_id'] = metadata_one_hot['item_id'].astype(int)
        train_df = pd.merge(train_df, metadata_one_hot, how='outer',
                            left_on='reference', right_on='item_id')
        train_df = train_df.drop(["reference", "item_id"], axis=1)
        print("Finished binarizing, now summing and getting the user's "
              "favorite hotel properties...")
        out_df = train_df.groupby('user_id')[train_df.columns[2:]].sum()
        return out_df

    # Start computing the dataset
    train = data.train_df(mode=mode, cluster=cluster)
    test = data.test_df(mode=mode, cluster=cluster)
    target_indices = data.target_indices(mode=mode, cluster=cluster)
    target_user_id = test.loc[target_indices]['user_id'].values
    full = pd.concat([train, test])
    del train
    del test

    poss_filters = []
    for f in full[~full['current_filters'].isnull()]['current_filters'].values:
        poss_filters += f.split('|')
    poss_filters = set(poss_filters)

    user_fav_filters = get_user_favorite_filters(full, target_user_id)
    # Add a suffix in order to distinguish hotel properties from user filters
    user_fav_filters.columns = [str(col) + '_hotel' for col in user_fav_filters.columns]
    user_fav_filters.reset_index(inplace=True)

    # Remove duplicates before processing
    full.drop_duplicates(subset=["user_id", "session_id", "action_type", "reference"],
                         inplace=True)

    # build in chunks
    count_chunk = 0
    chunk_size = 10000000000
    print("{}: Started chunk processing".format("Build Dataset Classification"))
    groups = full.groupby(np.arange(len(full)) // chunk_size)
    for idxs, gr in groups:
        features = construct_features(gr)
        features.reset_index(inplace=True)
        outcome = pd.merge(features, user_fav_filters, how='outer',
                           left_on="user_id", right_on="user_id")
        outcome.drop(["level_1", outcome.columns.values[-1]], axis=1, inplace=True)
        # Cast the floats to int
        outcome.iloc[:, -user_fav_filters.shape[1]:] = \
            outcome.iloc[:, -user_fav_filters.shape[1]:].fillna(0).astype(int)
        if count_chunk == 0:
            outcome.to_csv('dataset/preprocessed/{}/{}/user_properties.csv'.format(
                cluster, mode))
        else:
            with open('dataset/preprocessed/{}/{}/user_properties.csv'.format(
                    cluster, mode), 'a') as f:
                outcome.to_csv(f, header=False)
        count_chunk += 1
        print('chunk {} over {} completed'.format(count_chunk, len(groups)))
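# A minimal sketch of how the pipe-separated filter strings collected above
# become one-hot columns (pandas' Series.str.get_dummies splits on '|' by
# default); the values are illustrative:
def _filters_one_hot_sketch():
    import pandas as pd
    s = pd.Series(['Sort by Price|3 Star', '3 Star', ''])
    return s.str.get_dummies()  # columns: '3 Star', 'Sort by Price'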