def read_feature(self, one_hot=False, create_not_existing_features=True):
    """
    Read a feature from disk and return it as a DataFrame.
    If one_hot=False, the feature is returned exactly as it was saved.
    If one_hot=True, the categorical columns listed in self.columns_to_onehot
    are one-hot encoded before the DataFrame is returned.
    """
    path = 'dataset/preprocessed/{}/{}/feature/{}/features.csv'.format(
        self.cluster, self.mode, self.name)
    if not os.path.exists(path):
        if create_not_existing_features:
            choice = 'y'
            print('Missing feature: creating')
        else:
            choice = yesno_choice(
                'feature \'{}\' does not exist. want to create?'.format(self.name))
        if choice == 'y':
            self.save_feature()
        else:
            return

    index_col = 0 if self.save_index else None
    df = pd.read_csv(path, index_col=index_col)
    #df = df.drop('Unnamed: 0', axis=1)
    print('{} feature read'.format(self.name))

    # then proceed with one-hot encoding of the categorical columns
    if one_hot:
        for t in self.columns_to_onehot:
            col = df[t[0]]
            one_hot_prefix = t[2] if len(t) == 3 else t[0]
            if t[1] == 'single':
                # plain one-hot: one dummy column per distinct value
                oh = pd.get_dummies(col, prefix=one_hot_prefix)
            elif t[1] == 'binary':
                # compact binary encoding of the categories
                ce = BinaryEncoder(cols=t[0])
                oh = ce.fit_transform(col)
            else:
                # multi-label column: values are '|'-separated lists of labels
                mid = col.apply(lambda x: x.split('|') if isinstance(x, str) else x)
                mid.fillna(value='', inplace=True)
                mlb = MultiLabelBinarizer()
                oh = mlb.fit_transform(mid)
                oh = pd.DataFrame(oh, columns=mlb.classes_)
                oh = oh.astype(np.uint8)
                oh = oh.add_prefix(one_hot_prefix)

            # replace the original column with its encoded version
            df = df.drop([t[0]], axis=1)
            df = pd.concat([df, oh], axis=1)

        print('{} onehot completed'.format(self.name))

    df = self.post_loading(df)
    return df
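# Minimal, self-contained sketch (assumption: NOT part of the repo) of the three encodings
# read_feature(one_hot=True) applies. columns_to_onehot entries are tuples of the form
# (column_name, kind[, prefix]): 'single' -> pd.get_dummies, 'binary' -> BinaryEncoder
# (presumably category_encoders), anything else -> '|'-split + MultiLabelBinarizer.
# The toy column names below are made up.
import numpy as np
import pandas as pd
from category_encoders import BinaryEncoder
from sklearn.preprocessing import MultiLabelBinarizer

toy = pd.DataFrame({'device': ['mobile', 'desktop'],
                    'properties': ['wifi|pool', 'wifi']})

# kind == 'single': one dummy column per distinct value
print(pd.get_dummies(toy['device'], prefix='device'))

# kind == 'binary': roughly log2(#categories) columns instead of one per value
print(BinaryEncoder(cols=['device']).fit_transform(toy[['device']]))

# any other kind: split the '|'-separated labels and binarize each one
labels = toy['properties'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
print(pd.DataFrame(mlb.fit_transform(labels), columns=mlb.classes_)
        .astype(np.uint8).add_prefix('properties'))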
def fit(self):
    check_folder('models')
    if self.ask_to_load:
        if os.path.isfile('models/{}.model'.format(self.name)):
            if yesno_choice('the exact same model has already been created. want to load?') == 'y':
                self.xg.load_model('models/{}.model'.format(self.name))
                return

    if self.class_weights:
        X_train, y_train, group, _, weights, _ = data.dataset_xgboost_train(
            mode=self.mode, cluster=self.cluster,
            class_weights=self.class_weights, kind=self.kind)
    else:
        X_train, y_train, group, _, _ = data.dataset_xgboost_train(
            mode=self.mode, cluster=self.cluster,
            class_weights=self.class_weights, kind=self.kind)
    print('data for train ready')

    if self.class_weights:
        self.xg.fit(X_train, y_train, group, sample_weight=weights)
    elif self.weights_position:
        # load the precomputed per-position sample weights
        bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(
            self.cluster, self.mode, self.kind)
        w = np.load(os.path.join(bp, 'weights_position.npy'))
        print(w.size)
        print(group.shape)
        self.xg.fit(X_train, y_train, group, sample_weight=w)
    elif self.log_weights:
        # load the precomputed logarithmic sample weights
        bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(
            self.cluster, self.mode, self.kind)
        w = np.load(os.path.join(bp, 'log_weights.npy'))
        print(w.size)
        print(group.shape)
        self.xg.fit(X_train, y_train, group, sample_weight=w)
    else:
        self.xg.fit(X_train, y_train, group)

    print('fit done')
    self.xg.save_model('models/{}.model'.format(self.name))
    print('model saved')
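# Hedged sketch (assumption: NOT repo code, and assuming self.xg is an xgboost.XGBRanker)
# of the learning-to-rank "group" interface the fit() above relies on: group holds how many
# consecutive rows of X_train/y_train belong to each query (here, each session's impression
# list), and its entries must sum to len(X_train). Toy data below is made up.
import numpy as np
import xgboost as xgb

X = np.random.rand(6, 3)           # 6 impressions, 3 features
y = np.array([1, 0, 0, 0, 1, 0])   # 1 = clicked impression, 0 = not clicked
group = np.array([3, 3])           # two sessions with 3 impressions each

ranker = xgb.XGBRanker(objective='rank:pairwise', n_estimators=10)
ranker.fit(X, y, group=group)
print(ranker.predict(X))           # higher score = ranked higher within its session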
def save_feature(self, overwrite_if_exists=None):
    """
    overwrite_if_exists: if True, overwrite without asking;
                         if False, never overwrite;
                         if None, ask before overwriting.
    """
    path = 'dataset/preprocessed/{}/{}/feature/{}/features.csv'.format(
        self.cluster, self.mode, self.name)
    if os.path.exists(path):
        if overwrite_if_exists is None:
            choice = yesno_choice(
                'The feature \'{}\' already exists. Want to recreate?'.format(self.name))
            if choice == 'n':
                return
        elif not overwrite_if_exists:
            return
    df = self.extract_feature()
    check_folder(path)
    df.to_csv(path, index=self.save_index)
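# Hedged usage sketch (the subclass name is made up, not repo code): a concrete feature
# only needs to implement extract_feature() and set the attributes used above
# (name, mode, cluster, save_index, columns_to_onehot); the base class then handles
# persistence through save_feature() / read_feature().
#
#   feat = MyImpressionFeature(mode='small', cluster='no_cluster')
#   feat.save_feature(overwrite_if_exists=True)   # recreate without asking
#   df = feat.read_feature(one_hot=True)          # reload with categoricals encoded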
def interactive_model(mode):
    cell_type = menu.single_choice(
        'Choose a network architecture:',
        ['LSTM', 'GRU', 'default architecture'],
        [lambda: 'LSTM', lambda: 'GRU', lambda: 'auto'])
    print()
    if cell_type == 'auto':
        cell_type = 'GRU'
        rec_layers = 1
        dense_layers = 2
        units = 4
        weights = True
    else:
        rec_layers = int(input('Insert number of recurrent layers: '))
        units = int(input('Insert number of units per layer: '))
        dense_layers = int(input('Insert number of dense layers: '))
        weights = menu.yesno_choice('Do you want to use sample weights?',
                                    lambda: True, lambda: None)

    # tb_path = menu.yesno_choice('Do you want to enable Tensorboard?',
    #                             lambda: 'recommenders/tensorboard', lambda: None)

    pad = menu.single_choice('Which dataset?', ['Padded 6', 'Padded 12'],
                             [lambda: 6, lambda: 12])
    dataset = SequenceDatasetForClassification(
        f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_classification_p{pad}')

    if weights is not None:
        weights = dataset.get_sample_weights()

    model = RNNClassificationRecommender(
        dataset,
        use_generator=False,
        cell_type=cell_type,
        input_shape=(dataset.rows_per_sample, 168),
        num_recurrent_layers=rec_layers,
        num_recurrent_units=units,
        optimizer='adam',
        num_dense_layers=dense_layers,
        # class_weights=weights,
        sample_weights=weights)

    return model
    train_len = len(new)
    old_starting_index = test.index[0]
    new = pd.concat([new, test])
    print("Supersampling ended for mode={}, saving df".format(mode))
    new_train = new.iloc[:train_len]   # keep every original train row
    new_test = new.iloc[train_len:]
    # new_starting_index = new_test.index[0]
    # offset = new_starting_index - old_starting_index
    # target_indices += offset
    target_indices = data.target_indices(mode, "no_cluster")
    np.save(path + "/" + mode + "/target_indices", target_indices)
    new_train.to_csv(path + "/" + mode + "/train.csv", index=True)
    new_test.to_csv(path + "/" + mode + "/test.csv", index=True)


if __name__ == '__main__':
    if already_existing():
        answer = menu.yesno_choice(
            title='Another supersampling detected, do you want to recreate it?',
            callback_yes=lambda: True,
            callback_no=lambda: False)
        if answer:
            rmtree(path)
        else:
            exit(0)

    os.mkdir(path)
    modes = ["small", "local", "full"]
    for m in modes:
        os.mkdir(path + "/" + m)
        supersampling(m)
        TimePerImpression,
        TimesUserInteractedWithImpression,
        TimingFromLastInteractionImpression,
        TopPopInteractionClickoutPerImpression,
        TopPopPerImpression,
        User2Item,
        # UserFeature
    ]

    assert features_array[0] == ImpressionLabel, 'first feature must be the label!'

    choice = menu.yesno_choice('want the scores?')
    if choice == 'y':
        base_path_stacking = 'scores_stacking'
        stacking_scores_path = ['xgboost_nobias.csv.gz',
                                'catboost_rank.csv.gz',
                                'rnn_GRU_2layers_64units_2dense_class_nobias_05952.csv.gz',
                                'scores_pairwise_soft_zero_one_loss.csv.gz']
        stacking_scores_path = [f'{base_path_stacking}/{a}' for a in stacking_scores_path]
    else:
        stacking_scores_path = []

    mode = menu.mode_selection()
    cluster = menu.cluster_selection()
    dataset_name = input('insert dataset name\n')

    choice = menu.single_choice('select mode', ['normal', 'cv'])
    if choice == 'cv':
def create_full_df(custom_preprocess_function):
    """
    Save the dataframe containing train.csv and test.csv contiguously, with reset indices.
    Also save the config file containing the number of rows of the original train.csv
    (max_train_idx): it is used to know which indices refer to train rows
    (idx < max_train_idx) and which to test rows (idx >= max_train_idx).
    Pass a custom preprocess function to personalize the original train and test
    dataframes from which the creation of the full df starts.
    """
    train_df, test_df = custom_preprocess_function(
        data.original_train_df().reset_index(drop=True),
        data.original_test_df().reset_index(drop=True))
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    len_original_train = train_df.shape[0]

    compressed = menu.yesno_choice(
        title='Do you want the compressed version? (no for the original full)',
        callback_yes=lambda: True,
        callback_no=lambda: False)

    # TRAIN: FIXING DUPLICATED SESSION_ID <-> STEP PAIRS
    print('Fixing wrong duplicated steps in train...')
    train_df = reset_step_for_duplicated_sessions(train_df)

    # TRAIN: MERGING DUPLICATES
    if compressed:
        train_df = merge_duplicates(train_df)
    else:
        train_df["frequence"] = 1

    len_train = train_df.shape[0]
    train_df.to_csv(data.FULL_PATH)
    del train_df

    # save config file
    data.save_config(data.TRAIN_LEN_KEY, len_train)

    # TEST
    with open(data.FULL_PATH, 'a', encoding='utf-8') as f:
        # restore the index by summing the length of the original train
        # (to be the same as without merging)
        test_df.index += len_original_train

        # TEST: FIXING DUPLICATED SESSION_ID <-> STEP PAIRS
        print('Fixing wrong duplicated steps in test...')
        test_df = reset_step_for_duplicated_sessions(test_df)

        # TEST: MERGING DUPLICATES
        if compressed:
            test_df = merge_duplicates(test_df)
        else:
            test_df["frequence"] = 1

        # TEST: DELETING UNINFORMATIVE INTERACTIONS
        mask = (test_df["action_type"] != "clickout item") & (test_df["reference"].isnull())
        test_df = test_df.drop(test_df[mask].index)

        test_df.to_csv(f, header=False)
def preprocess():
    """
    Preprocess menu

    NOTE: it is required to have the original CSV files in the folder dataset/original
    """

    def _create_csvs():
        print('creating CSV...')

        # create no_cluster/full
        path = 'dataset/preprocessed/no_cluster'
        full = data.full_df()
        train_len = data.read_config()[data.TRAIN_LEN_KEY]

        train = full.iloc[0:train_len]
        test = full.iloc[train_len:len(full)]
        target_indices = get_target_indices(test)

        check_folder('dataset/preprocessed/no_cluster/full')
        train.to_csv(os.path.join(path, 'full/train.csv'))
        test.to_csv(os.path.join(path, 'full/test.csv'))
        np.save(os.path.join(path, 'full/train_indices'), train.index)
        np.save(os.path.join(path, 'full/test_indices'), test.index)
        np.save(os.path.join(path, 'full/target_indices'), target_indices)

        no_of_rows_in_small = int(input('How many rows do you want in small.csv? '))
        train_small = get_small_dataset(train, maximum_rows=no_of_rows_in_small)
        check_folder('dataset/preprocessed/no_cluster/small')
        split(train_small, os.path.join(path, 'small'))

        check_folder('dataset/preprocessed/no_cluster/local')
        split(train, os.path.join(path, 'local'))

        # create item_metadata in the preprocessed folder
        original_item_metadata = data.accomodations_original_df()
        original_item_metadata.to_csv(data.ITEMS_PATH)

        # append missing accomodations to item metadata
        append_missing_accomodations('full')

    def _preprocess_item_metadata():
        # interactively enable the preprocessing functions
        labels = ['Remove \'From n stars\' attributes']
        pre_processing_f = [remove_from_stars_features]
        menu_title = ('Choose the preprocessing function(s) to apply to the accomodations.\n'
                      'Press numbers to enable/disable the options, press X to confirm.')
        activated_prefns = menu.options(pre_processing_f, labels,
                                        title=menu_title, custom_exit_label='Confirm')
        # preprocess the accomodations dataframe
        preprocess_accomodations_df(activated_prefns)

    def _create_urm_session_aware():
        """
        NOTE: CHANGE THE PARAMETERS OF THE SEQUENCE-AWARE URM HERE !!!!
        """
        create_urm.urm_session_aware(mode, cluster, time_weight='lin')

    def _create_urm_clickout():
        """
        NOTE: CHANGE THE PARAMETERS OF THE CLICKOUT-ONLY URM HERE !!!!
        """
        create_urm.urm(mode, cluster, clickout_score=5, impressions_score=1)

    def _merge_sessions():
        print("Merging similar sessions (same user_id and city)")
        print("Loading full_df")
        full_df = data.full_df()
        print("Sorting, grouping, and other awesome things")
        grouped = full_df.sort_values(["user_id", "timestamp"],
                                      ascending=[True, True]).groupby(["user_id", "city"])
        new_col = np.array(["" for _ in range(len(full_df))], dtype=object)
        print("Now I'm really merging...")
        for name, g in tqdm(grouped):
            s_id = g.iloc[0]["session_id"]
            new_col[g.index.values] = s_id
        print("Writing on the df")
        full_df["unified_session_id"] = pd.Series(new_col)
        print("Saving new df to file")
        with open(data.FULL_PATH, 'w', encoding='utf-8') as f:
            full_df.to_csv(f)
        data.refresh_full_df()

    print("Hello buddy... Copenhagen is waiting...")
    print()

    # create full_df.csv
    # pick your custom preprocessing function
    # original:
    # funct = no_custom_preprocess_function
    # unroll:
    funct = unroll_custom_preprocess_function
    check_folder(data.FULL_PATH)
    if os.path.isfile(data.FULL_PATH):
        menu.yesno_choice('An old full dataframe has been found. Do you want to delete it and create it again?',
                          callback_yes=(lambda: create_full_df(funct)))
    else:
        print('The full dataframe (index master) is missing. Creating it...', end=' ', flush=True)
        create_full_df(funct)
        print('Done!')

    # merge similar sessions
    menu.yesno_choice(
        title='Do you want to merge similar sessions (adding unified_session_id)?',
        callback_yes=_merge_sessions)

    # create CSV files
    menu.yesno_choice(title='Do you want to create the CSV files?',
                      callback_yes=_create_csvs)

    # preprocess item_metadata
    menu.yesno_choice(title='Do you want to preprocess the item metadata?',
                      callback_yes=_preprocess_item_metadata)

    # create ICM
    menu.yesno_choice(title='Do you want to create the ICM matrix files?',
                      callback_yes=create_icm.create_ICM)

    # create URM
    lbls = ['Create URM from LOCAL dataset', 'Create URM from FULL dataset',
            'Create URM from SMALL dataset', 'Skip URM creation']
    callbacks = [lambda: 'local', lambda: 'full', lambda: 'small', lambda: 0]
    res = menu.single_choice(title='What do you want to do?', labels=lbls, callbacks=callbacks)

    if res is None:
        exit(0)

    if res != 0:
        # initialize the train and test dataframes
        mode = res

        # get the cluster
        print('for which cluster do you want to create the URM ???')
        cluster = input()
        callbacks = [_create_urm_session_aware, _create_urm_clickout]
        menu.single_choice(title='Which URM do you want to create, buddy?',
                           labels=['Sequence-aware URM', 'Clickout URM'],
                           callbacks=callbacks)

    return
if __name__ == "__main__": mode = menu.mode_selection() #cluster_name = 'cluster_recurrent' cluster = menu.single_choice( 'Which cluster?', ['cluster recurrent', 'cluster len <= 6', 'cluster len > 6'], callbacks=[ lambda: ClusterRecurrent, lambda: ClusterUpToLen6, lambda: ClusterOverLen6 ]) c = cluster() # create the cluster cluster_choice = menu.yesno_choice('Do you want to create the cluster?', lambda: True, lambda: False) if cluster_choice: c.save(mode) print() only_test = False if mode != 'small': only_test = menu.yesno_choice( 'Do you want to create only the test dataset?', lambda: True, lambda: False) binary_dataset = menu.single_choice('Which dataset?', ['Standard', 'Binary'], [False, True]) sess_length = int( input(
        [lambda: 'LSTM', lambda: 'GRU', lambda: 'auto'])
    print()

    if cell_type == 'auto':
        cell_type = 'GRU'
        epochs = 1
        rec_layers = 1
        dense_layers = 2
        units = 4
        tb_path = None
    else:
        epochs = int(input('Insert number of epochs: '))
        rec_layers = int(input('Insert number of recurrent layers: '))
        units = int(input('Insert number of units per layer: '))
        dense_layers = int(input('Insert number of dense layers: '))
        tb_path = menu.yesno_choice('Do you want to enable Tensorboard?',
                                    lambda: 'recommenders/tensorboard', lambda: None)

    dataset = SequenceDatasetForRegression(
        f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_regression')

    model = RNNRegressionRecommender(dataset,
                                     use_generator=False,
                                     cell_type=cell_type,
                                     num_recurrent_layers=rec_layers,
                                     num_recurrent_units=units,
                                     num_dense_layers=dense_layers)
    model.fit(epochs=epochs)

    print('\nFit completed!')