def validate(self, iterations):
    """Evaluate `iterations` randomly sampled hyperparameter configurations.

    For every iteration a value is drawn with ``self.sample`` for each entry
    of ``self.hyperparameters_dict``, a new instance of the reference
    object's class is built from the fixed plus sampled parameters, and the
    result of its ``evaluate()`` is written to ``self.writer``, sent through
    HERA to ``self.user`` and printed.

    :param iterations: number of configurations to evaluate
    :return: None
    """
    self.fixed_params_dict, self.hyperparameters_dict = \
        self.reference_object.get_params()

    for _ in range(iterations):
        # draw one value for every tunable hyperparameter
        sampled_params = {
            key: self.sample(value)
            for key, value in self.hyperparameters_dict.items()
        }
        params_dict = {**self.fixed_params_dict, **sampled_params}
        print(params_dict)

        model = self.reference_object.__class__(**params_dict)
        score = model.evaluate()

        # assign it again in case a fixed parameter has changed
        self.fixed_params_dict, self.hyperparameters_dict = model.get_params()
        params_dict = {**self.fixed_params_dict, **sampled_params}

        # identity test: `!= None` would go through a user-defined __ne__
        if self.automatic_export is not None:
            self.automatic_export.check_if_export(score, params_dict)

        self.writer.write(
            'params: {}\n MRR is: {}\n\n'.format(params_dict, score))

        # same text goes to the telegram channel and to stdout
        report = 'name: {} params: {}\n MRR is: {}\n\n'.format(
            model.name, params_dict, score)
        HERA.send_message(report, self.user)
        print(report)
def train():
    """Interactively build, fit, save and evaluate a model.

    The mode, optimizer and learning rate are asked through ``menu``; the
    model is fitted, saved under ``saved_models/`` with a suffix derived from
    the best ``val_acc`` in its history, and its evaluation report is sent
    through ``bot``.
    """
    mode = menu.mode_selection()

    # build the model
    opt = menu.single_choice('Optimizer?', ['Adam', 'RMSProp'],
                             ['adam', 'rmsprop'])
    lr = menu.single_choice('Learning rate?', ['e-3', 'e-4', 'e-5'],
                            [1e-3, 1e-4, 1e-5])
    optimizer_cls = (keras.optimizers.Adam
                     if opt == 'adam' else keras.optimizers.RMSprop)
    model = interactive_model(mode, optim=optimizer_cls(lr=lr))

    # fit the model
    model.fit(epochs=10000)
    print('\nFit completed!')

    # the best validation accuracy (dots removed) becomes the file suffix
    best_accuracy = np.max(model.history.history['val_acc'])
    accuracy_tag = '_{}'.format(round(best_accuracy, 5)).replace('.', '')
    model.save(folderpath='saved_models/', suffix=accuracy_tag)

    # evaluate
    report = model.evaluate()
    bot.send_message(report, account='parro')

    print('Opt: {}'.format(opt))
    print('Lr: {}'.format(lr))
def export(self, obj, params_dict, mode, mrr):
    """Instantiate `obj` in the given mode and run it with exports enabled.

    :param obj: callable invoked as ``obj(**params_dict)``
    :param params_dict: keyword arguments for `obj`; NOTE: it is modified in
        place, its ``'mode'`` entry is set to `mode`
    :param mode: value stored under ``params_dict['mode']``
    :param mrr: score reported in the notification message
    :return: None
    """
    params_dict['mode'] = mode
    instance = obj(**params_dict)

    # notify before starting the export run
    notice = (
        'EXPORTING sub and scores algo {} with score {} in mode {} with params {}'
        .format(instance.name, mrr, mode, params_dict))
    HERA.send_message(notice, self.user)

    instance.run(export_sub=True, export_scores=True)
def _hera_callback(param): iteration_num = param[2] if iteration_num % param[1]['print_every'] == 0: message = f'PARAMS:\n' for k in param[1]: message += f'{k}: {param[1][k]}\n' Hera.send_message( f'ITERATION_NUM: {iteration_num}\n {message}\n MRR: {param[5][0][2]}', account='edo')
def train_and_test():
    """Train a groupwise ranking estimator and export train/test predictions.

    Data paths and hyperparameters are read from ``FLAGS``. After training,
    predictions for the test and the train data are saved as ``.npy`` files
    under ``FLAGS.save_path`` and a ``TensorflowRankig`` model is run for
    each saved prediction file, with HERA messages before and after.
    """
    features, labels = load_libsvm_data(FLAGS.train_path, FLAGS.list_size)
    train_input_fn, train_hook = get_train_inputs(
        features, labels, FLAGS.train_batch_size)
    features_test, labels_test = load_libsvm_data(FLAGS.test_path,
                                                  FLAGS.list_size)

    def _train_op_fn(loss):
        """Build the Adagrad train op used by the ranking head."""
        return tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            learning_rate=FLAGS.learning_rate,
            optimizer="Adagrad")

    def _predict_all(x, y):
        """Collect every prediction of the estimator for (x, y) in an array."""
        return np.array(
            list(estimator.predict(lambda: batch_inputs(x, y, 128))))

    # the lambda weight depends on the configured loss
    if FLAGS.loss == 'list_mle_loss':
        lambda_weight = tfr.losses.create_p_list_mle_lambda_weight(
            list_size=25)
    elif FLAGS.loss == 'approx_ndcg_loss':
        lambda_weight = tfr.losses.create_ndcg_lambda_weight(topn=25)
    else:
        lambda_weight = tfr.losses.create_reciprocal_rank_lambda_weight(
            topn=25)

    loss_fn = tfr.losses.make_loss_fn(FLAGS.loss, lambda_weight=lambda_weight)
    ranking_head = tfr.head.create_ranking_head(
        loss_fn=loss_fn,
        eval_metric_fns=get_eval_metric_fns(),
        train_op_fn=_train_op_fn)

    model_fn = tfr.model.make_groupwise_ranking_fn(
        group_score_fn=make_score_fn(),
        group_size=FLAGS.group_size,
        transform_fn=tfr.feature.make_identity_transform_fn(
            FLAGS.train_context_features_id),
        ranking_head=ranking_head)
    estimator = tf.estimator.Estimator(model_fn=model_fn)

    estimator.train(train_input_fn, hooks=[train_hook],
                    steps=FLAGS.num_train_steps)

    # predict also for the train to get the scores for the stacking
    pred_train = _predict_all(features, labels)
    pred = _predict_all(features_test, labels_test)

    # the train file name is the test file name with a 'train_' prefix
    pred_name = (
        f'predictions_{FLAGS.loss}_learning_rate_{FLAGS.learning_rate}'
        f'_train_batch_size_{FLAGS.train_batch_size}_'
        f'hidden_layers_dim_{FLAGS.hidden_layer_dims}'
        f'_num_train_steps_{FLAGS.num_train_steps}'
        f'_dropout_{FLAGS.dropout_rate}_{FLAGS.group_size}')
    pred_name_train = f'train_{pred_name}'

    np.save(f'{FLAGS.save_path}/{pred_name}', pred)
    np.save(f'{FLAGS.save_path}/{pred_name_train}', pred_train)

    for name in (pred_name, pred_name_train):
        HERA.send_message(
            f'EXPORTING A SUB... mode:{FLAGS.mode}, name:{name}')
        model = TensorflowRankig(mode=FLAGS.mode,
                                 cluster='no_cluster',
                                 dataset_name=FLAGS.dataset_name,
                                 pred_name=name)
        model.name = f'tf_ranking_{name}'
        model.run()
        HERA.send_message(f'EXPORTED... mode:{FLAGS.mode}, name:{name}')
def callbak(obj):
    """Track the best score seen so far and report improvements.

    The candidate score is ``-obj[6][1][1]``. When it exceeds the module
    level ``_best_so_far`` the global is updated and the new best is printed;
    a HERA message is also sent when the new best is above 0.6765.

    NOTE(review): the nesting of the final ``print`` was reconstructed from a
    flattened source; it is assumed to run on every improvement - confirm.
    """
    global _best_so_far

    candidate = -obj[6][1][1]
    if not candidate > _best_so_far:
        return

    _best_so_far = candidate
    if _best_so_far > 0.6765:
        HERA.send_message(
            'xgboost {} iteration {} mrr is {}'.format(
                _kind, obj.iteration, _best_so_far), 'teo')
    print('xgboost iteration {} mrr is {}'.format(obj.iteration,
                                                  _best_so_far))
def create_dataset(mode, cluster, features_array, dataset_name,
                   stacking_scores_path):
    """Build and save the tf_ranking dataset for the given mode and cluster.

    The merged features are obtained from ``merge_features_tf``; the context
    feature ids are saved as ``context_features_id.npy`` and the train and
    test frames are handed to ``parse_dataset``, all under
    ``dataset/preprocessed/tf_ranking/<cluster>/<mode>/<dataset_name>``.
    """
    save_dir = f'dataset/preprocessed/tf_ranking/{cluster}/{mode}/{dataset_name}'
    cf.check_folder(save_dir)

    train_df, vali_test_df, context_features_id = merge_features_tf(
        mode, cluster, features_array, stacking_scores_path)

    # save context features id
    print(f'saving context feature id to: {save_dir}/context_features_id.npy')
    np.save(f'{save_dir}/context_features_id', context_features_id)

    for frame, split in ((train_df, 'train'), (vali_test_df, 'test')):
        parse_dataset(frame, save_dir, split)

    Hera.send_message('tf ranking dataset saved !')
    print('PROCEDURE ENDED CORRECTLY')
def create_sub(estimator, checkpoint_path, eval_result, batch_size=128,
               patience=0.001):
    """Export predictions when the evaluated MRR improves in local mode.

    NOTE(review): ``self`` is not a parameter here; it is read as a free
    variable, presumably from an enclosing method - confirm.

    Nothing happens unless ``self.mode == 'local'`` and
    ``eval_result['metric/mrr']`` exceeds ``self.min_mrr + patience``. In
    that case ``self.min_mrr`` is raised to the new value, test and train
    predictions are saved under ``self.save_path`` and a
    ``TensorflowRankig`` model is run for each saved file.

    :param estimator: object whose ``predict`` yields the predictions
    :param checkpoint_path: not used by this function
    :param eval_result: mapping with ``'metric/mrr'`` and ``'global_step'``
    :param batch_size: batch size passed to ``batch_inputs``
    :param patience: minimum improvement over ``self.min_mrr``
    """
    if self.mode != 'local':
        return

    eval_result_f = eval_result['metric/mrr']
    global_step = eval_result['global_step']
    if not eval_result_f > self.min_mrr + patience:
        return

    # set as new threshold the new mrr
    self.min_mrr = eval_result_f

    def _predict_all(x, y):
        """Collect every prediction of the estimator for (x, y) in an array."""
        return np.array(
            list(estimator.predict(lambda: batch_inputs(x, y, batch_size))))

    # predict the test first, then the train
    pred = _predict_all(self.test_x, self.test_y)
    pred_train = _predict_all(self.x, self.y)

    # the train file name is the test file name with a 'train_' prefix
    params = self.params
    pred_name = (
        'predictions_{}_learning_rate_{}_train_batch_size_{}'
        '_hidden_layers_dim_{}_num_train_steps_{}'
        '_dropout_{}_global_steps_{}_{}_mrr_{}').format(
            params['loss'], params['learning_rate'],
            params['train_batch_size'], params['hidden_layer_dims'],
            params['num_train_steps'], params['dropout_rate'], global_step,
            params['group_size'], eval_result_f)
    pred_name_train = 'train_' + pred_name

    np.save(f'{self.save_path}/{pred_name_train}', pred_train)
    np.save(f'{self.save_path}/{pred_name}', pred)

    for name in (pred_name, pred_name_train):
        HERA.send_message(
            f'EXPORTING A SUB... {eval_result_f} mode:{self.mode}, name:{name}'
        )
        model = TensorflowRankig(mode=self.mode,
                                 cluster='no_cluster',
                                 dataset_name=self.dataset_name,
                                 pred_name=name)
        model.name = f'tf_ranking_{name}'
        model.run()
        HERA.send_message(
            f'EXPORTED... {eval_result_f} mode:{self.mode}, name:{name}')
def evaluate(self, send_MRR_on_telegram=False):
    """Fit the classifier and return a textual classification report.

    The report is the output of ``classification_report`` followed by an
    accuracy line; it is printed and, when requested, sent through HERA.

    :param send_MRR_on_telegram: when true, the report is also sent via HERA
    :return: the report string
    """
    self.fit()
    print(self.xgb.feature_importances_)

    Y_test, Y_pred = self.recommend_batch()
    summary = classification_report(Y_test, Y_pred)
    accuracy_pct = accuracy_score(Y_test, Y_pred) * 100
    report = summary + "\n Accuracy: {} %".format(accuracy_pct)
    print(report)

    if send_MRR_on_telegram:
        notice = (
            'evaluating classifier {} on {}.\n Classification report is: \n {}\n\n'
            .format(self.name, self.mode, report))
        HERA.send_message(notice)

    return report
def _validate_step(self, **hyperparams):
    """Evaluate one hyperparameter configuration and log its score.

    A new instance of the reference object's class is created from the fixed
    parameters overridden by `hyperparams`, evaluated, and released before
    the result is written to ``self.writer`` and sent through HERA.

    :param hyperparams: keyword arguments merged over
        ``self.fixed_params_dict`` (previously named ``dict``, which shadowed
        the builtin; callers are unaffected because it is a ``**`` parameter)
    :return: the value returned by the model's ``evaluate()``
    """
    # initialize the recommender
    params_dict = {**self.fixed_params_dict, **hyperparams}
    model = self.reference_object.__class__(**params_dict)
    score = model.evaluate()

    # drop the model before the next step to release its memory
    del model
    gc.collect()

    # same text goes to the log file and to the telegram channel
    report = 'params: {}\n MRR is: {}\n\n'.format(params_dict, score)
    self.writer.write(report)
    HERA.send_message(report)

    return score
def get_mrr(arg_list):
    """Validate a lightGBM model for one parameter vector; return ``-mrr``.

    `arg_list` must unpack to exactly seven values, in this order:
    learning_rate, num_leaves, min_split_gain, min_child_weight,
    min_child_samples, bagging_freq, feature_fraction. The model is created
    with the module level ``mode``, ``cluster`` and ``dataset_name``. The
    validation result is sent through Hera and returned negated.
    """
    (learning_rate, num_leaves, min_split_gain, min_child_weight,
     min_child_samples, bagging_freq, feature_fraction) = arg_list

    params_dict = dict(
        boosting_type='gbdt',
        num_leaves=num_leaves,
        max_depth=-1,
        n_estimators=5000,
        learning_rate=learning_rate,
        subsample_for_bin=200000,
        class_weights=None,
        min_split_gain=min_split_gain,
        min_child_weight=min_child_weight,
        min_child_samples=min_child_samples,
        bagging_freq=bagging_freq,
        feature_fraction=feature_fraction,
        subsample=1,
        subsample_freq=0,
        colsample_bytree=1,
        reg_alpha=0.0,
        reg_lambda=0.0,
        random_state=None,
        n_jobs=-1,
        silent=False,
        importance_type='split',
        metric='None',
        print_every=10000,
    )

    lgb = lightGBM(mode=mode,
                   cluster=cluster,
                   dataset_name=dataset_name,
                   params_dict=params_dict)
    mrr = lgb.validate()
    best_it = lgb.model._Booster.best_iteration

    summary = (
        f'MRR: {mrr}\n'
        f'params:\n'
        f'num_iteration:{best_it}, learning_rate:{learning_rate}, num_leaves:{num_leaves}, '
        f'min_split_gain: {min_split_gain}, min_child_weight: {min_child_weight}, min_child_samples: {min_child_samples}'
    )
    Hera.send_message(summary)

    # negated result of validate()
    return -mrr
def iterations_validation(self, max_trees, range_step=25, mode='auto'):
    """Report the MRR obtained with different limits on the number of trees.

    The model is fitted first if ``self.ctb`` is None. In ``'auto'`` mode the
    limits ``max_trees, max_trees - range_step, ...`` are evaluated while
    they stay strictly positive. In any other mode the limit is asked on
    stdin in an endless loop (stop it by interrupting the process).

    The previous auto mode built ``max_trees`` candidate limits, most of them
    zero or negative; only positive limits are evaluated now.

    :param max_trees: largest tree limit to evaluate (auto mode)
    :param range_step: distance between consecutive limits; must be positive,
        ``range`` raises ValueError for 0
    :param mode: ``'auto'`` for the automatic sweep, anything else for the
        interactive loop
    :return: None
    """
    if self.ctb is None:
        self.fit()

    test_df = self.get_preprocessed_dataset(mode='test')
    test_df.drop(['user_id', 'session_id', 'item_id'], inplace=True, axis=1)

    def _evaluate_current_limit(trees):
        """Predict with the current tree limit and send the resulting MRR."""
        self.predictions = []
        self.scores_batch = []
        test_df.groupby('id', as_index=False).progress_apply(self.func)
        # the first collected prediction is skipped, as before
        # NOTE(review): presumably the apply visits the first group twice - confirm
        MRR = self.compute_MRR(self.predictions[1:])
        HERA.send_message(
            'evaluating recommender {} on {}. Iterations used {}\n MRR is: {}\n\n'
            .format(self.name, self.cluster, trees, MRR))

    if mode == 'auto':
        # descending sweep over strictly positive limits only
        for trees in range(max_trees, 0, -range_step):
            self.set_limit_trees(trees)
            _evaluate_current_limit(trees)
    else:
        while True:
            # keep asking until the input parses as an integer
            while True:
                trees = input("How many iterations?")
                try:
                    self.set_limit_trees(int(trees))
                    break
                except ValueError:
                    pass
            _evaluate_current_limit(trees)
def create_lightGBM_dataset(mode, cluster, features_array, dataset_name):
    """Build and save the train and validation data for lightGBM.

    The frames come from ``merge_features_lgb``. For each of them the feature
    matrix (hdf), the labels, the group sizes (npy) and the
    user/session/item triples (csv) are written under
    ``dataset/preprocessed/lightGBM/<cluster>/<mode>/<dataset_name>``,
    together with a ``Features.txt`` listing `features_array`.
    """

    def _create_groups(df):
        """Return the number of rows of each (user_id, session_id) group.

        :param df: frame with ``user_id`` and ``session_id`` columns
        :return: array of group lengths, in order of first appearance
        """
        keys = df[['user_id', 'session_id']]
        grouped = keys.groupby(['user_id', 'session_id'], sort=False)
        return grouped.apply(lambda x: len(x)).values

    def _save_dataset(base_path, mode, df):
        """Write features, labels, groups and id triples of `df` to disk."""
        assert mode in ['train', 'vali'], 'the mode has to be train or vali'

        print('reducing memory usage...')
        df = reduce_mem_usage(df)
        check_folder(base_path)

        # features: everything except the identifiers and the label
        non_feature_cols = ['index', 'user_id', 'session_id', 'item_id',
                            'label']
        x = df.drop(non_feature_cols, axis=1)
        x.to_hdf(f'{base_path}/x_{mode}.hdf',
                 key='df',
                 index=False,
                 format='table')
        print(f'x_{mode} saved at: {base_path}/x_{mode}.hdf')

        y = df['label'].values
        np.save(f'{base_path}/y_{mode}', y)
        print(f'y_{mode} saved at: {base_path}/y_{mode}.npy')

        groups = _create_groups(df)
        np.save(f'{base_path}/groups_{mode}', groups)
        print(f'groups_{mode} saved at: {base_path}/groups_{mode}.npy')

        user_session_item = df[['user_id', 'session_id', 'item_id']]
        user_session_item.to_csv(f'{base_path}/user_session_item_{mode}.csv',
                                 index=False)
        print(
            f'user_session_item_{mode} saved at: {base_path}/user_session_item_{mode}.csv'
        )

    # base save path
    _BASE_PATH = f'dataset/preprocessed/lightGBM/{cluster}/{mode}/{dataset_name}'

    # retrieve the TRAIN and VALIDATION/TEST data
    train_df, validation_df = merge_features_lgb(mode, cluster,
                                                 features_array)

    print('saving features names...')
    check_folder(f"{_BASE_PATH}")
    feature_names = [str(fn) for fn in features_array]
    with open(f"{_BASE_PATH}/Features.txt", "w+") as text_file:
        text_file.write(str(feature_names))

    Hera.send_message('SAVING TRAIN LIGHTGBM...')
    _save_dataset(_BASE_PATH, 'train', train_df)

    Hera.send_message('SAVING VALI LIGHTGBM...')
    _save_dataset(_BASE_PATH, 'vali', validation_df)

    Hera.send_message('PROCEDURE ENDED CORRECTLY')
def get_scores_cv(k):
    """Collect the scores of `k` folds plus the full dataset and save them.

    For folds ``1..k`` the ``usi.csv`` of the fold is loaded and the
    flattened output of ``train_cv`` is stored in a ``score_tf`` column. The
    same is done for the full dataset; all frames are concatenated and
    written as a gzip-compressed csv under ``flags_dict['save_path']``.

    :param k: number of folds
    :return: None
    """
    collected = []

    for fold in range(1, k + 1):
        HERA.send_message(f'fold_{fold} start')
        base_path = '{}/fold_{}'.format(flags_dict['save_path'], fold)

        # load usi
        fold_usi = pd.read_csv(f'{base_path}/usi.csv')
        fold_pred = np.array(train_cv(base_path))

        # create the df of the scores and keep it
        fold_usi['score_tf'] = fold_pred.flatten()
        collected.append(fold_usi)
        HERA.send_message(f'fold_{fold} end')

    full_path = 'dataset/preprocessed/tf_ranking/no_cluster/full/{}'.format(
        flags_dict['dataset_name'])

    HERA.send_message('retrieving the score for full')
    # retrieve the full scores, then load the usi of the full
    full_pred = train_cv(full_path)
    full_usi = pd.read_csv(f'{full_path}/usi.csv')
    full_usi['score_tf'] = full_pred.flatten()
    collected.append(full_usi)

    # concat all the scores and save them
    final_scores = pd.concat(collected)
    save_path = flags_dict['save_path']
    _loss = flags_dict['loss']
    final_scores.to_csv(f'{save_path}/scores_{_loss}.csv.gz',
                        compression='gzip',
                        index=False)

    HERA.send_message('SCORES SAVED SUCCESFULLY')
def train_and_test():
    """Train a groupwise ranking estimator and export train/test predictions.

    Paths and hyperparameters are read from ``flags_dict``. After training,
    predictions for the test and the train data are saved as ``.npy`` files
    under ``flags_dict['save_path']`` and a ``TensorflowRankig`` model is run
    for each saved prediction file, with HERA messages before and after.
    """
    path = flags_dict['save_path']
    features, labels = load_data(path, 'train')
    train_input_fn, train_hook = get_train_inputs(
        features, labels, flags_dict['train_batch_size'])
    features_test, labels_test = load_data(path, 'test')

    def _train_op_fn(loss):
        """Build the Adagrad train op used by the ranking head."""
        return tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            learning_rate=flags_dict['learning_rate'],
            optimizer="Adagrad")

    def _predict_all(x, y):
        """Collect every prediction of the estimator for (x, y) in an array."""
        return np.array(
            list(estimator.predict(lambda: batch_inputs(x, y, 128))))

    # the lambda weight depends on the configured loss
    loss_name = flags_dict['loss']
    if loss_name == 'list_mle_loss':
        lambda_weight = tfr.losses.create_p_list_mle_lambda_weight(
            list_size=25)
    elif loss_name == 'approx_ndcg_loss':
        lambda_weight = tfr.losses.create_ndcg_lambda_weight(topn=25)
    else:
        lambda_weight = tfr.losses.create_reciprocal_rank_lambda_weight(
            topn=25)

    loss_fn = tfr.losses.make_loss_fn(flags_dict['loss'],
                                      lambda_weight=lambda_weight)
    ranking_head = tfr.head.create_ranking_head(
        loss_fn=loss_fn,
        eval_metric_fns=get_eval_metric_fns(),
        train_op_fn=_train_op_fn)

    model_fn = tfr.model.make_groupwise_ranking_fn(
        group_score_fn=make_score_fn(),
        group_size=flags_dict['group_size'],
        transform_fn=tfr.feature.make_identity_transform_fn(
            flags_dict['train_context_features_id']),
        ranking_head=ranking_head)
    estimator = tf.estimator.Estimator(model_fn=model_fn)

    estimator.train(train_input_fn,
                    hooks=[train_hook],
                    steps=flags_dict['num_train_steps'])

    # predict also for the train to get the scores for the stacking
    pred_train = _predict_all(features, labels)
    pred = _predict_all(features_test, labels_test)

    # the train file name is the test file name with a 'train_' prefix
    pred_name = (
        'predictions_{}_learning_rate_{}_train_batch_size_{}_'
        'hidden_layers_dim_{}_num_train_steps_{}_dropout_{}_group_size_{}'
    ).format(flags_dict['loss'], flags_dict['learning_rate'],
             flags_dict['train_batch_size'], flags_dict['hidden_layer_dims'],
             flags_dict['num_train_steps'], flags_dict['dropout_rate'],
             flags_dict['group_size'])
    pred_name_train = 'train_' + pred_name

    np.save('{}/{}'.format(flags_dict['save_path'], pred_name), pred)
    np.save('{}/{}'.format(flags_dict['save_path'], pred_name_train),
            pred_train)

    for name in (pred_name, pred_name_train):
        HERA.send_message('EXPORTING A SUB... mode:{}, name:{}'.format(
            flags_dict['mode'], name))
        model = TensorflowRankig(mode=flags_dict['mode'],
                                 cluster='no_cluster',
                                 dataset_name=flags_dict['dataset_name'],
                                 pred_name=name)
        model.name = f'tf_ranking_{name}'
        model.run()
        HERA.send_message('EXPORTED... mode:{}, name:{}'.format(
            flags_dict['mode'], name))
def export(self, estimator, export_path, checkpoint_path, eval_result,
           is_the_final_export):
    """Report an evaluated checkpoint and keep it when it is good enough.

    The step and MRR of `eval_result` are sent through HERA together with
    the main hyperparameters. When ``self._shouldKeep`` accepts the
    checkpoint it is kept, a submission is possibly created (see the nested
    ``create_sub``) and older checkpoints are pruned; otherwise the
    checkpoint is only logged as skipped.

    :param estimator: estimator used to compute the exported predictions
    :param export_path: not used by this method
    :param checkpoint_path: path of the checkpoint being evaluated
    :param eval_result: mapping with ``'metric/mrr'`` and ``'global_step'``
    :param is_the_final_export: not used by this method
    :return: None
    """

    def batch_inputs(features, labels, batch_size):
        """Return a batched dataset built from (features, labels) slices."""
        dataset = tf.data.Dataset.from_tensor_slices((features, labels))
        return dataset.batch(batch_size)

    def create_sub(estimator, checkpoint_path, eval_result, batch_size=128,
                   patience=0.001):
        """Export predictions when the MRR improves in local mode.

        Nothing happens unless ``self.mode == 'local'`` and the evaluated MRR
        exceeds ``self.min_mrr + patience``.
        """
        if self.mode != 'local':
            return

        eval_result_f = eval_result['metric/mrr']
        global_step = eval_result['global_step']
        if not eval_result_f > self.min_mrr + patience:
            return

        # set as new threshold the new mrr
        self.min_mrr = eval_result_f

        def _predict_all(x, y):
            """Collect every prediction for (x, y) in an array."""
            return np.array(
                list(
                    estimator.predict(
                        lambda: batch_inputs(x, y, batch_size))))

        # predict the test first, then the train
        pred = _predict_all(self.test_x, self.test_y)
        pred_train = _predict_all(self.x, self.y)

        # the train file name is the test file name with a 'train_' prefix
        params = self.params
        pred_name = (
            'predictions_{}_learning_rate_{}_train_batch_size_{}'
            '_hidden_layers_dim_{}_num_train_steps_{}'
            '_dropout_{}_global_steps_{}_{}_mrr_{}').format(
                params['loss'], params['learning_rate'],
                params['train_batch_size'], params['hidden_layer_dims'],
                params['num_train_steps'], params['dropout_rate'],
                global_step, params['group_size'], eval_result_f)
        pred_name_train = 'train_' + pred_name

        np.save(f'{self.save_path}/{pred_name_train}', pred_train)
        np.save(f'{self.save_path}/{pred_name}', pred)

        for name in (pred_name, pred_name_train):
            HERA.send_message(
                f'EXPORTING A SUB... {eval_result_f} mode:{self.mode}, name:{name}'
            )
            model = TensorflowRankig(mode=self.mode,
                                     cluster='no_cluster',
                                     dataset_name=self.dataset_name,
                                     pred_name=name)
            model.name = f'tf_ranking_{name}'
            model.run()
            HERA.send_message(
                f'EXPORTED... {eval_result_f} mode:{self.mode}, name:{name}'
            )

    self._log('export checkpoint {}'.format(checkpoint_path))

    step = eval_result['global_step']
    score = eval_result['metric/mrr']
    checkpoint = Checkpoint(path=checkpoint_path, score=score)

    # every evaluated checkpoint is reported, whether it is kept or not
    status = ('mode: {}\n step:{}\nTFRANKING mrr is: {}\n dropout:{}\n'
              'learning_rate:{}\n train_batch_size:{}\n'
              'hidden_layer_dims:{}\n loss:{}\n group_size:{}').format(
                  self.mode, step, score, self.params['dropout_rate'],
                  self.params['learning_rate'],
                  self.params['train_batch_size'],
                  self.params['hidden_layer_dims'], self.params['loss'],
                  self.params['group_size'])
    HERA.send_message(status)

    if not self._shouldKeep(checkpoint):
        self._log('skipping checkpoint {}'.format(checkpoint.path))
        return

    self._keepCheckpoint(checkpoint)
    create_sub(estimator, checkpoint_path, eval_result)
    self._pruneCheckpoints(checkpoint)