def train():
    mode = menu.mode_selection()

    # build the model
    opt = menu.single_choice('Optimizer?', ['Adam', 'RMSProp'], ['adam', 'rmsprop'])
    lr = menu.single_choice('Learning rate?', ['e-3', 'e-4', 'e-5'], [1e-3, 1e-4, 1e-5])
    if opt == 'adam':
        optim = keras.optimizers.Adam(lr=lr)
    else:
        optim = keras.optimizers.RMSprop(lr=lr)
    model = interactive_model(mode, optim=optim)

    # fit the model
    model.fit(epochs=10000)
    print('\nFit completed!')

    best_accuracy = np.max(model.history.history['val_acc'])
    model.save(folderpath='saved_models/',
               suffix='_{}'.format(round(best_accuracy, 5)).replace('.', ''))

    # evaluate
    report = model.evaluate()
    bot.send_message(report, account='parro')
    print('Opt: {}'.format(opt))
    print('Lr: {}'.format(lr))
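
# The utils.menu helpers are project-internal. A minimal sketch of the
# single_choice contract that train() relies on: show the labels, return the
# parallel value, and invoke it when it is a callback. This is an assumption
# for illustration, not the project's actual implementation:
def _single_choice_sketch(title, labels, values=None):
    print(title)
    for i, label in enumerate(labels):
        print('{}) {}'.format(i + 1, label))
    idx = int(input('> ')) - 1
    # fall back to the label itself when no parallel values are given
    result = values[idx] if values is not None else labels[idx]
    # callbacks (e.g. the lambdas used elsewhere) are called before returning
    return result() if callable(result) else result
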
def interactive_model(mode, optim='adam'):
    cell_type = menu.single_choice(
        'Choose a network architecture:',
        ['LSTM', 'GRU', 'default architecture'],
        [lambda: 'LSTM', lambda: 'GRU', lambda: 'auto'])
    print()
    if cell_type == 'auto':
        cell_type = 'GRU'
        rec_layers = 1
        dense_layers = 2
        units = 4
        weights = True
    else:
        rec_layers = int(input('Insert number of recurrent layers: '))
        units = int(input('Insert number of units per layer: '))
        dense_layers = int(input('Insert number of dense layers: '))
        weights = menu.yesno_choice('Do you want to use sample weights?', lambda: True, lambda: None)

    #tb_path = menu.yesno_choice('Do you want to enable Tensorboard?', lambda: 'recommenders/tensorboard', lambda: None)

    pad = menu.single_choice('Which dataset?', ['Padded 6', 'Padded 12'], [lambda: 6, lambda: 12])
    dataset = SequenceDatasetForClassification(
        f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_classification_p{pad}')

    if weights is not None:
        weights = dataset.get_sample_weights()

    model = RNNClassificationRecommender(
        dataset,
        use_generator=False,
        cell_type=cell_type,
        input_shape=(dataset.rows_per_sample, 168),
        num_recurrent_layers=rec_layers,
        num_recurrent_units=units,
        optimizer=optim,  # accept the optimizer chosen in train()
        num_dense_layers=dense_layers,
        #class_weights=weights,
        sample_weights=weights)

    return model
def create_feature():
    mode = 'full'
    model = interactive_model(mode)

    model_checkpoints = os.listdir('saved_models')
    checkpoint_path = menu.single_choice('Choose the model checkpoint:', model_checkpoints)
    checkpoint_path = os.path.join('saved_models', checkpoint_path)

    print('Loading {}...'.format(checkpoint_path), end='\r', flush=True)
    model.load(checkpoint_path)
    print('Done!', flush=True)

    print('Creating feature for {}...'.format(mode))
    model.create_feature()
def interactive_model(mode, optim='adam'):
    pad = menu.single_choice('Which dataset?', ['Padded 6', 'Padded 12'], [lambda: 6, lambda: 12])
    dataset = SequenceDatasetForBinaryClassification(
        f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_binary_classification_p{pad}')

    weights = dataset.get_class_weights()

    model = RNNBinaryClassificator(
        dataset,
        input_shape=(dataset.rows_per_sample, 168),
        cell_type='gru',
        num_recurrent_layers=2,
        num_recurrent_units=64,
        num_dense_layers=2,
        class_weights=weights,
        optimizer=optim)

    return model
    TopPopPerImpression,
    User2Item,
    #UserFeature
]

assert features_array[0] == ImpressionLabel, 'first feature must be the label!'

choice = menu.yesno_choice('Do you want the scores?')
if choice == 'y':
    base_path_stacking = 'scores_stacking'
    stacking_scores_path = ['xgboost_nobias.csv.gz',
                            'catboost_rank.csv.gz',
                            'rnn_GRU_2layers_64units_2dense_class_nobias_05952.csv.gz',
                            'scores_pairwise_soft_zero_one_loss.csv.gz']
    stacking_scores_path = [f'{base_path_stacking}/{a}' for a in stacking_scores_path]
else:
    stacking_scores_path = []

mode = menu.mode_selection()
cluster = menu.cluster_selection()
dataset_name = input('Insert the dataset name: ')
choice = menu.single_choice('Select mode:', ['normal', 'cv'])
if choice == 'cv':
    create_dataset_cv(mode, cluster, features_array, dataset_name, k=5)
else:
    create_dataset(mode, cluster, features_array, dataset_name, stacking_scores_path)
activity = menu.single_choice('What do you want to do?',
                              ['Train', 'Create feature'],
                              [train, create_feature])
        #'sample_weights': dataset.get_sample_weights()
    }
    fit_params = {'epochs': int(input('Insert number of epochs: '))}

    kfscorer = KFoldScorer(model_class=RNNClassificationRecommender,
                           init_params=init_params,
                           k=5)
    kfscorer.fit_predict(dataset, multithreading=True, fit_params=fit_params)


def scores_bin():
    dataset = DatasetScoresBinaryClassification(
        'dataset/preprocessed/cluster_recurrent/small/dataset_binary_classification_p6')

    init_params = {
        'dataset': dataset,
        'input_shape': (6, 168),
        'cell_type': 'gru',
        'num_recurrent_layers': 2,
        'num_recurrent_units': 64,
        'num_dense_layers': 2,
        'optimizer': 'adam',
        #'class_weights': dataset.get_class_weights(),
        'sample_weights': dataset.get_sample_weights()
    }
    fit_params = {'epochs': int(input('Insert number of epochs: '))}

    kfscorer = KFoldScorer(model_class=RNNBinaryClassificator,
                           init_params=init_params,
                           k=5)
    kfscorer.fit_predict(dataset, multithreading=True, fit_params=fit_params)


menu.single_choice('Which model?', ['RNN', 'RNN binary'], [scores_rnn, scores_bin])
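
# KFoldScorer.fit_predict presumably produces out-of-fold scores: each model is
# fit on k-1 folds and scores only the held-out fold, so every training row is
# scored by a model that never saw it. A generic sketch of that pattern with
# sklearn, assuming array-like X and y (an illustration, not the project's
# KFoldScorer):
def _out_of_fold_scores_sketch(model_class, init_params, fit_params, X, y, k=5):
    import numpy as np
    from sklearn.model_selection import KFold

    scores = np.zeros(len(X))
    for train_idx, val_idx in KFold(n_splits=k, shuffle=True).split(X):
        model = model_class(**init_params)
        model.fit(X[train_idx], y[train_idx], **fit_params)
        # score only the held-out rows
        scores[val_idx] = model.predict(X[val_idx]).ravel()
    return scores
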
            result_predictions.append((index, ordered_impressions))

        print('Predictions created!')
        return result_predictions

    def get_scores_batch(self):
        return None


if __name__ == "__main__":
    import utils.menu as menu

    mode = menu.mode_selection()
    cell_type = menu.single_choice(
        'Choose a network architecture:',
        ['LSTM', 'GRU', 'default architecture'],
        [lambda: 'LSTM', lambda: 'GRU', lambda: 'auto'])
    print()
    if cell_type == 'auto':
        cell_type = 'GRU'
        epochs = 1
        rec_layers = 1
        dense_layers = 2
        units = 4
        tb_path = None
    else:
        epochs = int(input('Insert number of epochs: '))
        rec_layers = int(input('Insert number of recurrent layers: '))
        units = int(input('Insert number of units per layer: '))
        dense_layers = int(input('Insert number of dense layers: '))
        tb_path = menu.yesno_choice('Do you want to enable Tensorboard?',
                                    lambda: 'recommenders/tensorboard', lambda: None)
def preprocess():
    """
    Preprocess menu

    NOTE: it is required to have the original CSV files in the folder dataset/original
    """

    def _create_csvs():
        print('creating CSV...')

        # create no_cluster/full
        path = 'dataset/preprocessed/no_cluster'
        full = data.full_df()
        train_len = data.read_config()[data.TRAIN_LEN_KEY]

        train = full.iloc[0:train_len]
        test = full.iloc[train_len:len(full)]
        target_indices = get_target_indices(test)

        check_folder('dataset/preprocessed/no_cluster/full')
        train.to_csv(os.path.join(path, 'full/train.csv'))
        test.to_csv(os.path.join(path, 'full/test.csv'))
        np.save(os.path.join(path, 'full/train_indices'), train.index)
        np.save(os.path.join(path, 'full/test_indices'), test.index)
        np.save(os.path.join(path, 'full/target_indices'), target_indices)

        no_of_rows_in_small = int(input('How many rows do you want in small.csv? '))
        train_small = get_small_dataset(train, maximum_rows=no_of_rows_in_small)
        check_folder('dataset/preprocessed/no_cluster/small')
        split(train_small, os.path.join(path, 'small'))

        check_folder('dataset/preprocessed/no_cluster/local')
        split(train, os.path.join(path, 'local'))

        # create item_metadata in the preprocess folder
        original_item_metadata = data.accomodations_original_df()
        original_item_metadata.to_csv(data.ITEMS_PATH)

        # append missing accomodations to item metadata
        append_missing_accomodations('full')

    def _preprocess_item_metadata():
        # interactively enable the preprocessing functions
        labels = ['Remove \'From n stars\' attributes']
        pre_processing_f = [remove_from_stars_features]
        menu_title = ('Choose the preprocessing function(s) to apply to the accomodations.\n'
                      'Press numbers to enable/disable the options, press X to confirm.')
        activated_prefns = menu.options(pre_processing_f, labels, title=menu_title,
                                        custom_exit_label='Confirm')

        # preprocess the accomodations dataframe
        preprocess_accomodations_df(activated_prefns)

    def _create_urm_session_aware():
        """
        NOTE: change the parameters of the sequence-aware URM here!
        """
        create_urm.urm_session_aware(mode, cluster, time_weight='lin')

    def _create_urm_clickout():
        """
        NOTE: change the parameters of the clickout-only URM here!
        """
        create_urm.urm(mode, cluster, clickout_score=5, impressions_score=1)

    def _merge_sessions():
        print('Merging similar sessions (same user_id and city)')
        print('Loading full_df')
        full_df = data.full_df()

        print('Sorting and grouping')
        grouped = full_df.sort_values(['user_id', 'timestamp'], ascending=[True, True]) \
                         .groupby(['user_id', 'city'])
        new_col = np.array(['' for _ in range(len(full_df))], dtype=object)

        print('Merging...')
        for name, g in tqdm(grouped):
            # assign the first session_id of the group to all of its rows
            s_id = g.iloc[0]['session_id']
            new_col[g.index.values] = s_id

        print('Writing on the df')
        full_df['unified_session_id'] = pd.Series(new_col)

        print('Saving the new df to file')
        with open(data.FULL_PATH, 'w', encoding='utf-8') as f:
            full_df.to_csv(f)
        data.refresh_full_df()
        print('Done!')

    print()

    # create full_df.csv
    # pick your custom preprocess function
    # funct = no_custom_preprocess_function    # original
    funct = unroll_custom_preprocess_function  # unroll

    check_folder(data.FULL_PATH)
    if os.path.isfile(data.FULL_PATH):
        menu.yesno_choice('An old full dataframe has been found. Do you want to delete it and create it again?',
                          callback_yes=(lambda: create_full_df(funct)))
    else:
        print('The full dataframe (index master) is missing. Creating it...', end=' ', flush=True)
        create_full_df(funct)
        print('Done!')

    # merge similar sessions
    menu.yesno_choice(
        title='Do you want to merge similar sessions (adding unified_session_id)?',
        callback_yes=_merge_sessions)

    # create CSV files
    menu.yesno_choice(title='Do you want to create the CSV files?', callback_yes=_create_csvs)

    # preprocess item_metadata
    menu.yesno_choice(title='Do you want to preprocess the item metadata?',
                      callback_yes=_preprocess_item_metadata)

    # create ICM
    menu.yesno_choice(title='Do you want to create the ICM matrix files?',
                      callback_yes=create_icm.create_ICM)

    # create URM
    lbls = ['Create URM from LOCAL dataset', 'Create URM from FULL dataset',
            'Create URM from SMALL dataset', 'Skip URM creation']
    callbacks = [lambda: 'local', lambda: 'full', lambda: 'small', lambda: 0]
    res = menu.single_choice(title='What do you want to do?', labels=lbls, callbacks=callbacks)

    if res is None:
        exit(0)

    if res != 0:
        # initialize the train and test dataframes
        mode = res

        # get the cluster
        cluster = input('For which cluster do you want to create the URM? ')

        callbacks = [_create_urm_session_aware, _create_urm_clickout]
        menu.single_choice(title='Which URM do you want to create?',
                           labels=['Sequence-aware URM', 'Clickout URM'],
                           callbacks=callbacks)

    return
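
# The row-by-row loop in _merge_sessions can also be written as a vectorized
# pandas transform that assigns each (user_id, city) group's first session_id
# to all of its rows in one shot. A sketch of the equivalent (assignment aligns
# on the original index, so the sort does not disturb the row order):
def _merge_sessions_vectorized(full_df):
    full_df['unified_session_id'] = (
        full_df.sort_values(['user_id', 'timestamp'])
               .groupby(['user_id', 'city'])['session_id']
               .transform('first')
    )
    return full_df
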
        mode, cluster, TRAIN_LEN, TEST_LEN,
        rows_per_sample=pad_sessions_length)
        #X_sparse_cols=x_sparse_cols, Y_sparse_cols=ref_classes)


if __name__ == "__main__":
    mode = menu.mode_selection()
    #cluster_name = 'cluster_recurrent'
    cluster = menu.single_choice(
        'Which cluster?',
        ['cluster recurrent', 'cluster len <= 6', 'cluster len > 6'],
        callbacks=[lambda: ClusterRecurrent, lambda: ClusterUpToLen6, lambda: ClusterOverLen6])
    c = cluster()

    # create the cluster
    cluster_choice = menu.yesno_choice('Do you want to create the cluster?', lambda: True, lambda: False)
    if cluster_choice:
        c.save(mode)
    print()

    only_test = False
    if mode != 'small':
        only_test = menu.yesno_choice('Do you want to create only the test dataset?',
                                      lambda: True, lambda: False)
from recommenders.lightGBM import lightGBM
from skopt import gp_minimize
from skopt import dummy_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args

import data
from utils.menu import single_choice


class OptimizerWrapper:

    def __init__(self, recommender_class, mode, cluster, dataset_name):
        self.space, self.objective = recommender_class.get_optimize_params(mode, cluster, dataset_name)

    def optimize_bayesian(self):
        best_param = gp_minimize(self.objective, self.space, n_random_starts=10, n_calls=100)
        print(best_param)

    def optimize_random(self):
        best_param = dummy_minimize(self.objective, self.space, n_calls=1000)
        print(best_param)


if __name__ == '__main__':
    opt_technique = single_choice('Optimization technique?', ['bayesian', 'random'])
    mode = single_choice('Insert mode:', ['local', 'small'])
    cluster = single_choice('Insert cluster:', ['no_cluster'])
    dataset_name = input('Insert the dataset name: ')

    opt = OptimizerWrapper(lightGBM, mode=mode, cluster=cluster, dataset_name=dataset_name)
    if opt_technique == 'bayesian':
        opt.optimize_bayesian()
    else:
        opt.optimize_random()
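
# For reference, a sketch of the (space, objective) contract that
# get_optimize_params must satisfy. The hyperparameter names and the
# _train_and_validate helper are hypothetical, not lightGBM's actual ones:
def _get_optimize_params_sketch(mode, cluster, dataset_name):
    space = [
        Real(0.01, 0.3, name='learning_rate', prior='log-uniform'),
        Integer(16, 255, name='num_leaves'),
        Real(0.1, 1.0, name='colsample_bytree'),
    ]

    # use_named_args maps the flat list that skopt passes into keyword args
    @use_named_args(space)
    def objective(**params):
        # train/validate with these params (hypothetical helper) and return
        # a value to MINIMIZE, hence the negated MRR
        mrr = _train_and_validate(mode, cluster, dataset_name, **params)
        return -mrr

    return space, objective
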
    # save the dataset config file that stores the dataset length and the list of sparse columns
    features_cols = list(data.accomodations_one_hot().columns) if add_item_features else []
    x_sparse_cols = devices_classes + actions_classes + features_cols
    datasetconfig.save_config(path, mode, cluster, TRAIN_LEN, TEST_LEN,
                              train_name=TRAIN_NAME,
                              rows_per_sample=pad_sessions_length,
                              X_sparse_cols=x_sparse_cols,
                              Y_sparse_cols=features_cols)


if __name__ == "__main__":
    mode = menu.mode_selection()
    #cluster_name = 'cluster_recurrent'
    cluster = menu.single_choice(
        'Which cluster?',
        ['cluster recurrent', 'cluster recurrent len <= 6'],
        callbacks=[lambda: ClusterRecurrent, lambda: ClusterRecurrentUpToLen6])
    c = cluster()

    # create the cluster
    cluster_choice = menu.yesno_choice('Do you want to create the cluster?', lambda: True, lambda: False)
    if cluster_choice:
        print('Creating the cluster...')
        c.save(mode)
    print()

    only_test = False
    if mode != 'small':
        only_test = menu.yesno_choice('Do you want to create only the test dataset?',
                                      lambda: True, lambda: False)

    sess_length = int(input('Insert the desired session length, -1 to not pad/truncate the sessions: '))
    username='******', password='******')


def scores():
    mode = 'full'
    model = interactive_model(mode)

    checkpoint_path = menu.checkpoint_selection(checkpoints_dir='saved_models')

    print('Loading {}...'.format(checkpoint_path), end='\r', flush=True)
    model.load(checkpoint_path)
    print('Done!', flush=True)

    # get the scores for train and test and save them
    scores_folder = 'scores'
    check_folder(scores_folder)

    for st in ['train', 'test']:
        print('Building scores for {} {}...'.format(st, mode))
        scores = model.get_scores_batch(scores_type=st)

        print('Saving scores for {} {}...'.format(st, mode))
        scores_filename = '{}_scores_{}'.format(model.name, st)
        np.save(os.path.join(scores_folder, scores_filename), np.array(scores))


activity = menu.single_choice('What do you want to do?',
                              ['Train', 'Submission', 'Scores'],
                              [train, submission, scores])
def _mrr(preds, dtrain):
    # xgboost custom-metric signature: (predictions, DMatrix)
    labels = dtrain.get_label()
    l = memoryview(np.array(labels, dtype=np.int32))
    p = memoryview(np.array(preds, dtype=np.float32))
    g = memoryview(np.array(_group_t, dtype=np.int32))
    mrr = mrr_cython(l, p, g, len(_group_t))
    # negated because xgboost minimizes the evaluation metric
    return 'MRR', -mrr


if __name__ == '__main__':
    from utils.menu import mode_selection
    from utils.menu import cluster_selection
    from utils.menu import single_choice
    from utils.menu import options

    modality = single_choice('Smart evaluate or normal recommender?',
                             ['smart evaluate', 'normal recommender'])
    if modality == 'normal recommender':
        kind = input('Pick the kind: ')
        mode = mode_selection()
        cluster = cluster_selection()
        sel = options(['evaluate', 'export the sub', 'export the scores'],
                      ['evaluate', 'export the sub', 'export the scores'],
                      'What do you want to do after model fitting and the recommendations?')

        model = XGBoostWrapper(mode=mode, cluster=cluster, kind=kind)
        if 'export the sub' in sel and 'export the scores' in sel:
            model.run(export_sub=True, export_scores=True)
        elif 'export the sub' in sel and 'export the scores' not in sel:
            model.run(export_sub=True, export_scores=False)
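
# Conceptually, mrr_cython computes the mean reciprocal rank over query groups.
# A plain-NumPy equivalent for reference (a sketch, not the project's kernel):
def _mrr_numpy_sketch(labels, preds, group_sizes):
    # labels: 1 for the clicked impression, 0 otherwise; preds: model scores;
    # group_sizes: impressions per query group, in the same row order
    rr_sum, offset = 0.0, 0
    for size in group_sizes:
        order = np.argsort(-preds[offset:offset + size])
        positives = np.where(labels[offset:offset + size][order] == 1)[0]
        if len(positives) > 0:
            rr_sum += 1.0 / (positives[0] + 1)  # best score has rank 1
        offset += size
    return rr_sum / len(group_sizes)
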
    ##PlatformReferencePercentageOfInteractions,
    PriceQuality,
    # RefPopAfterFirstPosition,
    SessionActionNumRefDiffFromImpressions,
    SessionDevice,
    SessionFilterActiveWhenClickout,
    SessionLength,
    # SessionNumClickouts,
    # SessionNumFilterSel,
    # SessionNumInterItemImage,
    # SessionNumNotNumeric,
    SessionSortOrderWhenClickout,
    StatisticsTimeFromLastAction,
    TimePerImpression,
    TimesUserInteractedWithImpression,
    TimingFromLastInteractionImpression,
    TopPopInteractionClickoutPerImpression,
    TopPopPerImpression,
    User2Item,
    #UserFeature
]

mode = single_choice('Select mode:', ['full', 'local', 'small'])
cluster = single_choice('Select cluster:', ['no_cluster'])
#dataset_name = single_choice('Select dataset name:', ['prova', 'dataset1', 'dataset2', 'old'])
dataset_name = input('Insert the dataset name: ')

create_lightGBM_dataset(mode=mode, cluster=cluster,
                        features_array=features_array,
                        dataset_name=dataset_name)
# build the output dir path once, so the timestamp is consistent
_OUTPUT_DIR = f'{_BASE_PATH}/output_dir_{datetime.datetime.now().strftime("%Y-%m-%d_%H:%M")}'
cf.check_folder(_OUTPUT_DIR)
flags_dict['output_dir'] = _OUTPUT_DIR

# let the user choose the params
train_batch_size = int(input('Insert train batch size: '))
learning_rate = float(input('Insert learning rate: '))
dropout_rate = float(input('Insert dropout rate: '))
hidden_layer_dims = input('Insert hidden layer dims as numbers separated by spaces: ').split(' ')
loss = menu.single_choice('Select the loss:', [
    'pairwise_hinge_loss',
    'pairwise_logistic_loss',
    'pairwise_soft_zero_one_loss',
    'softmax_loss',
    'sigmoid_cross_entropy_loss',
    'mean_squared_loss',
    'list_mle_loss',
    'approx_ndcg_loss'
])
group_size = int(input('Insert the group_size: '))

# update the flags dict
flags_dict['train_batch_size'] = train_batch_size
flags_dict['learning_rate'] = learning_rate
flags_dict['dropout_rate'] = dropout_rate
flags_dict['hidden_layer_dims'] = hidden_layer_dims
flags_dict['group_size'] = group_size
flags_dict['loss'] = loss

if _MODE == 'full':
from numpy.linalg import norm as L2Norm
from sklearn.utils import shuffle
from sklearn.metrics import classification_report

import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, GRU, Embedding, Dropout, TimeDistributed
from keras.callbacks import EarlyStopping


if __name__ == "__main__":
    import utils.menu as menu

    tqdm.pandas()

    mode = menu.mode_selection()
    opt = menu.single_choice('Optimizer?', ['Adam', 'RMSProp'], ['adam', 'rmsprop'])
    lr = menu.single_choice('Learning rate?', ['e-3', 'e-4', 'e-5'], [1e-3, 1e-4, 1e-5])
    pad = menu.single_choice('Which dataset?', ['Padded 6', 'Padded 12'], [lambda: 6, lambda: 12])
    dataset = SequenceDatasetForBinaryClassification(
        f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_binary_classification_p{pad}')

    print('Loading data...')
    x, y = dataset.load_Xtrain(), dataset.load_Ytrain()
    x, y = shuffle(x, y)
    print()

    # fraction of positive samples (class balance)
    perc = np.sum(y) / len(y)
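
    # What follows downstream is presumably a recurrent binary classifier built
    # from the layers imported above; a minimal sketch with this keras 2.x API
    # (layer sizes and the validation split are illustrative assumptions, not
    # the project's exact architecture):
    model = Sequential()
    model.add(GRU(64, input_shape=(dataset.rows_per_sample, 168), return_sequences=True))
    model.add(GRU(64))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # wire in the optimizer and learning rate chosen from the menus above
    optimizer = keras.optimizers.Adam(lr=lr) if opt == 'adam' else keras.optimizers.RMSprop(lr=lr)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(x, y, validation_split=0.15, epochs=100, callbacks=[early_stopping])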