def objective(latent_factors, regularization, alpha):
    """Bayesian-optimization objective: average MAP@10 of ALS over several
    train/test splits, negated because the optimizer minimizes.

    The parameter names must match the search space defined above.

    :param latent_factors: number of ALS latent factors being tuned
    :param regularization: ALS regularization strength being tuned
    :param alpha: ALS confidence scaling being tuned
    :return: negative average MAP@10 across the splits
    """
    average_map = 0.0
    n_tests = 3  # number of tests (on different data splits)
    seeds = [1234, 12, 34]  # seeds defining each split
    for i in range(n_tests):
        URM_train, URM_test = splitter.split_train_test(urm, testing=0.15, seed=seeds[i])
        # Keep only users with an interaction count between 0 and 5 (excluded).
        URM_test = n_interaction_interval(URM_test, 0, 5)
        evaluator_test = EvaluatorHoldout(URM_test, [10])
        rec = ALS(URM_train)  # can be used also with other recommenders
        # Pass the parameters we are tuning.
        rec.fit(latent_factors=latent_factors,
                regularization=regularization,
                iterations=100,
                alpha=alpha)
        results_run_dict, results_run_string = evaluator_test.evaluateRecommender(rec)
        average_map += results_run_dict[10]['MAP']
        # BUGFIX: report the running average over the splits evaluated so far;
        # the original divided the partial sum by the total n_tests, printing a
        # misleadingly low value on all but the last iteration.
        print(f"\nlatent_factors: {latent_factors}, regularization: {regularization}"
              f"\navg MAP: {average_map / (i + 1)}\n\n")
    # Return the (negated) average MAP among the different tests,
    # to avoid overfitting on a specific data split.
    return -average_map / n_tests
def single_test(urm_train, urm_test, urm_valid, x_tick):
    """Fit HybridNorm3 on the train split (beta taken from the global
    `best_alpha`) and return its MAP@10 on the test split."""
    test_evaluator = EvaluatorHoldout(urm_test, cutoff_list=[10])
    model = HybridNorm3Recommender(urm_train)
    model.fit(beta=best_alpha)
    metrics, _ = test_evaluator.evaluateRecommender(model)
    return metrics[10]['MAP']
def search_hyperparameter_to_recommenders(urm_train_split: csr_matrix,
                                          urm_validation_split: csr_matrix,
                                          urm_test_split: csr_matrix,
                                          urm_impressions: csr_matrix,
                                          recommender: Type[BaseRecommender]):
    """Run the collaborative hyper-parameter search for one recommender class.

    :param urm_train_split: implicit train interactions (CSR)
    :param urm_validation_split: implicit validation interactions (CSR)
    :param urm_test_split: implicit test interactions (CSR)
    :param urm_impressions: impressions matrix forwarded to the search (CSR)
    :param recommender: recommender class to tune
    :raises ValueError: if any input matrix is not CSR
    """
    URM_train = urm_train_split.copy()
    URM_validation = urm_validation_split.copy()
    URM_test = urm_test_split.copy()
    URM_impressions = urm_impressions.copy()

    if any(not isspmatrix_csr(split)
           for split in [URM_train, URM_validation, URM_test, URM_impressions]):
        raise ValueError("The matrices are not all CSR matrices.")

    assert_implicit_data([URM_train, URM_validation, URM_test])
    assert_disjoint_matrices([URM_train, URM_validation, URM_test])

    # BUGFIX: the original tested `recommender_class.RECOMMENDER_NAME`, but no
    # `recommender_class` name exists in this scope (the parameter is
    # `recommender`) -> NameError at runtime unless a same-named global existed.
    if recommender.RECOMMENDER_NAME == Random.RECOMMENDER_NAME:
        # The Random recommender is cheap to evaluate: no parallel workers needed.
        evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10], parallel=False)
        evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[5, 10, 20], parallel=False)
    else:
        evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10],
                                                parallel=True, num_workers=NUM_WORKERS)
        evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[5, 10, 20],
                                          parallel=True, num_workers=NUM_WORKERS)

    runParameterSearch_Collaborative_partial = partial(
        runParameterSearch_Collaborative,
        URM_train=URM_train,
        # Final retraining uses train + validation interactions.
        URM_train_last_test=URM_train + URM_validation,
        metric_to_optimize=METRIC_TO_OPTIMIZE,
        evaluator_validation_earlystopping=evaluator_validation,
        evaluator_validation=evaluator_validation,
        evaluator_test=evaluator_test,
        output_folder_path=EXPERIMENTS_FOLDER_PATH,
        parallelizeKNN=False,
        allow_weighting=True,
        resume_from_saved=True,
        n_cases=NUM_CASES,
        n_random_starts=NUM_RANDOM_STARTS,
        URM_impressions=URM_impressions)

    try:
        runParameterSearch_Collaborative_partial(recommender)
    except Exception as e:
        logging.exception(f"On recommender {recommender} Exception {e}")
def ablation_study(arguments):
    """Feature-matching-coefficient ablation for GANMF.

    For each requested dataset/mode, retrains GANMF with the best saved
    hyper-parameters while sweeping `recon_coefficient` over [0, 1], then plots
    every metric against the coefficient (one figure per cutoff).

    :param arguments: CLI-style argument list; may contain '--run-all',
        dataset names from `name_datasets`, and/or a mode ('user' / 'item').
    """
    study_path = 'ablation_study'
    if not os.path.exists(study_path):
        os.makedirs(study_path, exist_ok=False)

    exp_path = 'experiments'
    datasets = []
    modes = ['user', 'item']
    run_all = False

    # Select datasets/modes from the CLI arguments.
    if '--run-all' in arguments:
        datasets = all_datasets
        run_all = True
    for arg in arguments:
        if arg in name_datasets and not run_all:
            datasets.append(all_datasets[name_datasets.index(arg)])
        if arg in modes:
            modes = [arg]

    cutoffs = [5, 10, 20, 50]
    marker = itertools.cycle(['o', '^', 's', 'p', '1', 'D', 'P', '*'])

    for m in modes:
        for d in datasets:
            # plotting_data[cutoff][metric] -> list of values, one per coefficient.
            plotting_data = {c: {m: [] for m in metrics} for c in cutoffs}
            best_params = load_best_params(exp_path,
                                           d if isinstance(d, str) else d.DATASET_NAME,
                                           'GANMF', m)
            range_coeff = np.arange(0, 1.1, 0.2)
            for coeff in range_coeff:
                best_params['recon_coefficient'] = coeff
                URM_train, URM_test, _, _, _ = load_URMs(d, dataset_kwargs)
                set_seed(seed)
                test_evaluator = EvaluatorHoldout(URM_test, cutoffs, exclude_seen=True)
                model = GANMF(URM_train, mode=m, seed=seed, is_experiment=True)
                model.fit(validation_set=None, sample_every=None,
                          validation_evaluator=None, **best_params)
                result_dict, result_str = test_evaluator.evaluateRecommender(model)
                # BUGFIX: removed `plotting_data[coeff] = {}` — it inserted a
                # stray float key into the cutoff-indexed dict and was never read.
                for c in cutoffs:
                    for met in metrics:
                        plotting_data[c][met].append(result_dict[c][met])

            dname = d if isinstance(d, str) else d.DATASET_NAME
            substudy_path = os.path.join(study_path, dname + '_GANMF_' + m)
            if not os.path.exists(substudy_path):
                os.makedirs(substudy_path, exist_ok=False)

            # One figure per cutoff, all metrics on the same axes.
            for c in cutoffs:
                fig, ax = plt.subplots(figsize=(20, 10))
                ax.set_xlabel('Feature Matching Coefficient')
                for met in metrics:
                    ax.plot(range_coeff, plotting_data[c][met], label=met, marker=next(marker))
                ax.legend(loc='best', fontsize='x-large')
                fig.savefig(os.path.join(substudy_path, str(c) + '_feature_matching_effect.png'),
                            bbox_inches='tight')
def read_data_split_and_search():
    """Tune the collaborative baselines on Movielens10M with Bayesian search.

    For every algorithm the search object persists:
    - a .txt log of all the explored cases and their recommendation quality
    - a _best_model file, reloadable via recommender.load_model()
    - a _best_parameter dictionary, usable as recommender.fit(**params)
    - a _best_result_validation dictionary for the best validation solution
    - a _best_result_test dictionary for that solution on the test set
    """
    data_reader = Movielens10MReader()
    loaded_dataset = data_reader.load_data()

    # 80/20 train-test, then another 80/20 train-validation split.
    URM_train, URM_test = split_train_in_two_percentage_global_sample(
        loaded_dataset.get_URM_all(), train_percentage=0.80)
    URM_train, URM_validation = split_train_in_two_percentage_global_sample(
        URM_train, train_percentage=0.80)

    output_folder_path = "result_experiments/"
    # Create the output directory on first run.
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    algorithms = [
        Random,
        TopPop,
        P3alphaRecommender,
        RP3betaRecommender,
        ItemKNNCFRecommender,
        UserKNNCFRecommender,
        MatrixFactorization_BPR_Cython,
        MatrixFactorization_FunkSVD_Cython,
        PureSVDRecommender,
        SLIM_BPR_Cython,
        SLIMElasticNetRecommender,
    ]

    from Base.Evaluation.Evaluator import EvaluatorHoldout
    evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[5])
    evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[5, 10])

    search_one_algorithm = partial(
        runParameterSearch_Collaborative,
        URM_train=URM_train,
        metric_to_optimize="MAP",
        n_cases=10,
        evaluator_validation_earlystopping=evaluator_validation,
        evaluator_validation=evaluator_validation,
        evaluator_test=evaluator_test,
        output_folder_path=output_folder_path,
        similarity_type_list=["cosine"],
        parallelizeKNN=False)

    # One worker per CPU core; recycle each worker after a single task.
    pool = multiprocessing.Pool(processes=int(multiprocessing.cpu_count()),
                                maxtasksperchild=1)
    pool.map(search_one_algorithm, algorithms)
def get_precision(learning_rate, num_epoch, URM_train, URM_test):
    """Train SLIM-BPR with the given learning rate and epoch count, then
    return its PRECISION@10 measured on URM_test."""
    model = SLIM_BPR_Cython(URM_train, recompile_cython=False)
    model.fit(epochs=num_epoch,
              batch_size=1,
              sgd_mode='sgd',
              learning_rate=learning_rate,
              positive_threshold_BPR=1)
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=[10])
    metrics, _ = evaluator.evaluateRecommender(model)
    return metrics[10]['PRECISION']
def single_test(urm_train, urm_test, urm_valid):
    """Fit a content-based UserKNN (fixed, pre-tuned hyper-parameters; UCM
    taken from the global `ucm_all`) and return its MAP@10 on the test split."""
    evaluator_valid = EvaluatorHoldout(urm_valid, cutoff_list=[10])
    evaluator_test = EvaluatorHoldout(urm_test, cutoff_list=[10])

    model = UserKNNCBFRecommender(urm_train, ucm_all)
    tuned_params = dict(shrink=1777,
                        topK=1998,
                        similarity='tversky',
                        feature_weighting='BM25',
                        tversky_alpha=0.1604953616,
                        tversky_beta=0.9862348646)
    model.fit(**tuned_params)

    # Validation variant kept for reference:
    # result, str_result = evaluator_valid.evaluateRecommender(model)
    result, _ = evaluator_test.evaluateRecommender(model)
    return result[10]['MAP']
def single_test(urm_train, urm_test, urm_valid, x_tick):
    """Sweep the `beta` weight of HybridNorm3 over the values in `x_tick`
    and return the validation MAP@10 obtained for each value, in order."""
    valid_evaluator = EvaluatorHoldout(urm_valid, cutoff_list=[10], verbose=False)
    model = HybridNorm3Recommender(urm_train)
    map_per_beta = []
    for alpha in tqdm(x_tick):
        model.fit(beta=alpha)
        metrics, _ = valid_evaluator.evaluateRecommender(model)
        map_per_beta.append(metrics[10]["MAP"])
    return map_per_beta
def __init__(self, recommender_class, URM_train, k=5, seed=1666, level=None, evaluator_test=None, verbose=True):
    """Set up a k-fold cross-validated search for `recommender_class`.

    Builds k train/test fold pairs from the interactions of URM_train by
    masking entries of the CSR `data` array, plus one MAP@10 evaluator per
    test fold.

    :param recommender_class: recommender class to be searched
    :param URM_train: full train URM (CSR); copied, the original is untouched
    :param k: number of folds
    :param seed: random state for the KFold shuffling
    :param level: stored as-is; semantics not visible from this block
    :param evaluator_test: optional external test evaluator
    :param verbose: verbosity flag
    """
    super(SearchAbstractClass, self).__init__()
    self.recommender_class = recommender_class
    self.URM_train = URM_train.copy()
    self.k = k
    self.seed = seed
    self.verbose = verbose
    self.log_file = None
    self.level = level
    self.results_test_best = {}
    self.parameter_dictionary_best = {}
    self.URM_list = []
    # Added only because I suspect the object needs to stay referenced somewhere.
    self.URM_test_list = []
    self.evaluator_list = []
    # k-fold: split the *interactions* (entries of `data`), not the users.
    kf = KFold(n_splits=k, shuffle=True, random_state=self.seed)
    shape = URM_train.shape
    indptr = URM_train.indptr
    indices = URM_train.indices
    data = URM_train.data
    for train_index, test_index in kf.split(data):
        # Start from all-ones masks sharing the original sparsity structure,
        # then zero out the complementary interaction sets.
        data_train = np.ones(data.shape)
        data_test = np.ones(data.shape)
        data_train[test_index] = 0
        data_test[train_index] = 0
        kf_train = sps.csr_matrix((data_train, indices, indptr), shape=shape).copy()
        kf_test = sps.csr_matrix((data_test, indices, indptr), shape=shape).copy()
        # Drop the zeroed entries so the folds are truly disjoint sparse matrices.
        kf_train.eliminate_zeros()
        kf_test.eliminate_zeros()
        self.URM_list.append(kf_train)
        self.URM_test_list.append(kf_test)
        self.evaluator_list.append(EvaluatorHoldout(kf_test, cutoff_list=[10]))
    if evaluator_test is None:
        self.evaluator_test = None
    else:
        self.evaluator_test = evaluator_test
def parallel_fit_and_eval_job(recommender, data: DataObject, epochs, num_factors, learning_rate, sgd,
                              negative_interactions_quota, init_mean, init_std_dev, user_reg, item_reg,
                              bias_reg, positive_reg, negative_reg):
    """Fit `recommender` with early stopping on the test URM, then return its
    MAP@10 for every user segment in `data.urm_train_users_by_type`, followed
    by the MAP@10 for the target users."""
    early_stop_evaluator = EvaluatorHoldout(data.urm_test, [10], minRatingsPerUser=1,
                                            exclude_seen=True, verbose=True)

    # Training: fixed batch size / bias usage, everything else from the arguments.
    recommender.fit(epochs=epochs,
                    batch_size=1000,
                    num_factors=num_factors,
                    positive_threshold_BPR=None,
                    learning_rate=learning_rate,
                    use_bias=True,
                    sgd_mode=sgd,
                    negative_interactions_quota=negative_interactions_quota,
                    init_mean=init_mean,
                    init_std_dev=init_std_dev,
                    user_reg=user_reg,
                    item_reg=item_reg,
                    bias_reg=bias_reg,
                    positive_reg=positive_reg,
                    negative_reg=negative_reg,
                    validation_every_n=3,
                    epochs_min=1,
                    stop_on_validation=True,
                    validation_metric="MAP",
                    lower_validations_allowed=2,
                    evaluator_object=early_stop_evaluator,
                    random_seed=None)

    # Evaluation: one MAP per user segment, then one for the target users.
    segment_maps = []
    for n, users, description in data.urm_train_users_by_type:
        _eval, segment_map = MyEvaluator.evaluate_algorithm(data.urm_test, users, recommender,
                                                            at=10, remove_top=0)
        segment_maps.append(segment_map)

    users = data.ids_target_users
    _eval, target_map = MyEvaluator.evaluate_algorithm(data.urm_test, users, recommender,
                                                       at=10, remove_top=0)
    segment_maps.append(target_map)
    return segment_maps
def evaluate(urm, ICM):
    """Split `urm` into train/validation/test, fit the Hybrid recommender on
    the train part, and print its evaluation report on both holdout sets."""
    URM_train, URM_val, URM_test = splitter.split(urm, testing=0.1, validation=0.2)
    validation_evaluator = EvaluatorHoldout(URM_val, [10])
    test_evaluator = EvaluatorHoldout(URM_test, [10])

    model = Hybrid(URM_train, ICM)
    model.fit()

    # Report on validation first, then on test.
    for evaluator in (validation_evaluator, test_evaluator):
        _, report = evaluator.evaluateRecommender(model)
        print(report)
def search_param(alpha, beta, topK):
    """Optimization objective over (alpha, beta, topK).

    Fits every recommender in the global `my_input` on its split, returns the
    mean validation MAP@10, and tracks the best validation/test scores seen so
    far in the global bookkeeping dict `vec` (mutated in place).

    NOTE(review): this function depends on the globals `my_input`, `vec` and
    `optimizer`; the indentation below was reconstructed from a collapsed
    source — the test-evaluation branch is assumed to run only when a new
    validation maximum is found. Confirm against the original file.
    """
    res = []
    # Each entry of my_input is (urm_valid, recommender, urm_test) — inferred
    # from the indices used here and in the test loop below; verify.
    for current in my_input:
        recommender = current[1]
        urm_valid = current[0]
        evaluator_valid = EvaluatorHoldout(urm_valid, cutoff_list=[10])
        # recommender.fit(alpha=alpha, beta=beta, gamma=gamma, phi=phi, psi=psi, li=li, mi=mi)
        recommender.fit(alpha=alpha, beta=beta, topK=int(topK))
        result_valid, str_result = evaluator_valid.evaluateRecommender(recommender)
        res.append(result_valid[10]['MAP'])
    print('Il max valid è il n: {} con : {}'.format(vec['n_valid'], optimizer.max))
    print('Il max test è il n : {} con test : {}'.format(vec['n_test'], vec['max_test']))
    res = np.array(res)
    print('Il Map corrente è : {}'.format(res.mean()))
    if res.mean() > vec['max_valid']:
        # New best validation score: record it and re-evaluate on the test splits.
        vec['n_valid'] = vec['n']
        vec['max_valid'] = res.mean()
        print('new max valid found')
        res_test = []
        for current in my_input:
            recommender = current[1]
            urm_test = current[2]
            evaluator_test = EvaluatorHoldout(urm_test, cutoff_list=[10])
            # recommender.fit(alpha=alpha, beta=beta, gamma=gamma, phi=phi, psi=psi, li=li, mi=mi)
            recommender.fit(alpha=alpha, beta=beta, topK=int(topK))
            result_test, str_result = evaluator_test.evaluateRecommender(recommender)
            res_test.append(result_test[10]['MAP'])
        res_test = np.array(res_test)
        if res_test.mean() > vec['max_test']:
            print('un nuovo max è stato trovato')
            vec['max_test'] = res_test.mean()
            vec['n_test'] = vec['n']
    # Iteration counter for the optimizer trace.
    vec['n'] += 1
    return res.mean()
def read_data_split_and_search(dataset_variant, train_interactions, flag_baselines_tune=False, flag_DL_article_default=False, flag_DL_tune=False, flag_print_results=False):
    """Load the CiteULike split, optionally run the CollaborativeDL article
    configuration, and optionally print the LaTeX result tables.

    :param dataset_variant: CiteULike variant identifier
    :param train_interactions: number of train interactions per user; value 1
        triggers the cold-user-safe split handling below
    :param flag_baselines_tune: unused in this block (kept for interface parity)
    :param flag_DL_article_default: run the DL algorithm with article hyper-parameters
    :param flag_DL_tune: unused in this block (kept for interface parity)
    :param flag_print_results: generate the LaTeX result/time tables
    """
    # Using dataReader from CollaborativeVAE_our_interface as they use the same data in the same way
    from Conferences.KDD.CollaborativeVAE_our_interface.Citeulike.CiteulikeReader import CiteulikeReader
    result_folder_path = "result_experiments/{}/{}_citeulike_{}_{}/".format(
        CONFERENCE_NAME, ALGORITHM_NAME, dataset_variant, train_interactions)
    result_folder_path_CollaborativeVAE = "result_experiments/{}/{}_citeulike_{}_{}/".format(
        CONFERENCE_NAME, "CollaborativeVAE", dataset_variant, train_interactions)
    dataset = CiteulikeReader(result_folder_path_CollaborativeVAE,
                              dataset_variant=dataset_variant,
                              train_interactions=train_interactions)
    URM_train = dataset.URM_DICT["URM_train"].copy()
    URM_validation = dataset.URM_DICT["URM_validation"].copy()
    URM_test = dataset.URM_DICT["URM_test"].copy()
    # Ensure IMPLICIT data
    assert_implicit_data([URM_train, URM_validation, URM_test])
    # Due to the sparsity of the dataset, choosing an evaluation as subset of the train
    # while keeping validation interactions in the train set.
    if train_interactions == 1:
        # In this case the train data will contain validation data to avoid cold users
        assert_disjoint_matrices([URM_train, URM_test])
        assert_disjoint_matrices([URM_validation, URM_test])
        exclude_seen_validation = False
        URM_train_last_test = URM_train
    else:
        assert_disjoint_matrices([URM_train, URM_validation, URM_test])
        exclude_seen_validation = True
        URM_train_last_test = URM_train + URM_validation
    assert_implicit_data([URM_train_last_test])
    # If directory does not exist, create
    if not os.path.exists(result_folder_path):
        os.makedirs(result_folder_path)
    from Base.Evaluation.Evaluator import EvaluatorHoldout
    evaluator_validation = EvaluatorHoldout(
        URM_validation, cutoff_list=[150], exclude_seen=exclude_seen_validation)
    evaluator_test = EvaluatorHoldout(
        URM_test,
        cutoff_list=[50, 100, 150, 200, 250, 300])
    ################################################################################################
    ######
    ######      DL ALGORITHM
    ######
    if flag_DL_article_default:
        try:
            # Hyper-parameters as reported in the original article.
            collaborativeDL_article_hyperparameters = {
                "para_lv": 10,
                "para_lu": 1,
                "para_ln": 1e3,
                "batch_size": 128,
                "epoch_sdae": 200,
                "epoch_dae": 200,
            }
            parameterSearch = SearchSingleCase(
                CollaborativeDL_Matlab_RecommenderWrapper,
                evaluator_validation=evaluator_validation,
                evaluator_test=evaluator_test)
            recommender_input_args = SearchInputRecommenderArgs(
                CONSTRUCTOR_POSITIONAL_ARGS=[
                    URM_train, dataset.ICM_DICT["ICM_tokens_TFIDF"]
                ],
                FIT_KEYWORD_ARGS={})
            # Same arguments, but the final model is trained on train + validation.
            recommender_input_args_last_test = recommender_input_args.copy()
            recommender_input_args_last_test.CONSTRUCTOR_POSITIONAL_ARGS[0] = URM_train_last_test
            parameterSearch.search(
                recommender_input_args,
                recommender_input_args_last_test=recommender_input_args_last_test,
                fit_hyperparameters_values=collaborativeDL_article_hyperparameters,
                output_folder_path=result_folder_path,
                resume_from_saved=True,
                output_file_name_root=CollaborativeDL_Matlab_RecommenderWrapper.RECOMMENDER_NAME)
        except Exception as e:
            print("On recommender {} Exception {}".format(
                CollaborativeDL_Matlab_RecommenderWrapper, str(e)))
            traceback.print_exc()
    ################################################################################################
    ######
    ######      PRINT RESULTS
    ######
    if flag_print_results:
        # Users with at least one test interaction.
        n_test_users = np.sum(np.ediff1d(URM_test.indptr) >= 1)
        ICM_names_to_report_list = list(dataset.ICM_DICT.keys())
        dataset_name = "{}_{}".format(dataset_variant, train_interactions)
        file_name = "{}..//{}_{}_".format(result_folder_path, ALGORITHM_NAME, dataset_name)
        result_loader = ResultFolderLoader(
            result_folder_path,
            base_algorithm_list=None,
            other_algorithm_list=[CollaborativeDL_Matlab_RecommenderWrapper],
            KNN_similarity_list=KNN_similarity_to_report_list,
            ICM_names_list=ICM_names_to_report_list,
            UCM_names_list=None)
        result_loader.generate_latex_results(
            file_name + "{}_latex_results.txt".format("article_metrics"),
            metrics_list=["RECALL"],
            cutoffs_list=[50, 100, 150, 200, 250, 300],
            table_title=None,
            highlight_best=True)
        result_loader.generate_latex_results(
            file_name + "{}_latex_results.txt".format("all_metrics"),
            metrics_list=[
                "PRECISION", "RECALL", "MAP_MIN_DEN", "MRR", "NDCG", "F1",
                "HIT_RATE", "ARHR_ALL_HITS", "NOVELTY",
                "DIVERSITY_MEAN_INTER_LIST", "DIVERSITY_HERFINDAHL",
                "COVERAGE_ITEM", "DIVERSITY_GINI", "SHANNON_ENTROPY"
            ],
            cutoffs_list=[150],
            table_title=None,
            highlight_best=True)
        result_loader.generate_latex_time_statistics(
            file_name + "{}_latex_results.txt".format("time"),
            n_evaluation_users=n_test_users,
            table_title=None)
# idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part] recommended_items = np.argsort(-pred_val, axis=1).ravel()[:k] is_relevant = np.in1d(recommended_items, pos_items_array, assume_unique=True) # his_recall = Recall_at_k_batch(pred_val, pos_items_sparse, k=20)[0] # my_recall = recall(is_relevant, pos_items_array) his_ndcg = NDCG_binary_at_k_batch(pred_val, pos_items_sparse, k=100)[0] my_ndcg = ndcg(recommended_items, pos_items_array) if not np.allclose(my_ndcg, his_ndcg, atol=0.0001): pass n100_list = np.concatenate(n100_list) r20_list = np.concatenate(r20_list) r50_list = np.concatenate(r50_list) print("Test NDCG@100=%.5f (%.5f)" % (np.mean(n100_list), np.std(n100_list) / np.sqrt(len(n100_list)))) print("Test Recall@20=%.5f (%.5f)" % (np.mean(r20_list), np.std(r20_list) / np.sqrt(len(r20_list)))) print("Test Recall@50=%.5f (%.5f)" % (np.mean(r50_list), np.std(r50_list) / np.sqrt(len(r50_list)))) from Base.Evaluation.Evaluator import EvaluatorHoldout evaluator = EvaluatorHoldout(test_data_te, cutoff_list=[20, 50, 100]) results_dict, results_run_string = evaluator.evaluateRecommender(recommender) print(results_run_string)
def run_recommender(recommender_class):
    """Smoke-test one recommender class end to end: fit, evaluate with both
    evaluators, save, reload and re-evaluate. Progress is appended to the
    global `log_file`; failures are logged and swallowed, never raised."""
    temp_save_file_folder = "./result_experiments/__temp_model/"
    if not os.path.isdir(temp_save_file_folder):
        os.makedirs(temp_save_file_folder)
    try:
        reader = Movielens1MReader()
        data_splitter = DataSplitter_leave_k_out(reader, k_out_value=2)
        data_splitter.load_data()
        URM_train, URM_validation, URM_test = data_splitter.get_holdout_split()

        write_log_string(log_file, "On Recommender {}\n".format(recommender_class))

        instance = recommender_class(URM_train)
        # Early-stopping models need a bounded epoch count for a quick smoke test.
        fit_params = {"epochs": 15} if isinstance(instance, Incremental_Training_Early_Stopping) else {}
        instance.fit(**fit_params)
        write_log_string(log_file, "Fit OK, ")

        evaluator = EvaluatorHoldout(URM_test, [5], exclude_seen=True)
        _, results_run_string = evaluator.evaluateRecommender(instance)
        write_log_string(log_file, "EvaluatorHoldout OK, ")

        evaluator = EvaluatorNegativeItemSample(URM_test, URM_train, [5], exclude_seen=True)
        _, _ = evaluator.evaluateRecommender(instance)
        write_log_string(log_file, "EvaluatorNegativeItemSample OK, ")

        instance.saveModel(temp_save_file_folder, file_name="temp_model")
        write_log_string(log_file, "saveModel OK, ")

        # Reload into a fresh instance and make sure it still evaluates.
        instance = recommender_class(URM_train)
        instance.loadModel(temp_save_file_folder, file_name="temp_model")
        evaluator = EvaluatorHoldout(URM_test, [5], exclude_seen=True)
        _, results_run_string_2 = evaluator.evaluateRecommender(instance)
        write_log_string(log_file, "loadModel OK, ")

        shutil.rmtree(temp_save_file_folder, ignore_errors=True)

        write_log_string(log_file, " PASS\n")
        write_log_string(log_file, results_run_string + "\n\n")
    except Exception as e:
        print("On Recommender {} Exception {}".format(recommender_class, str(e)))
        log_file.write("On Recommender {} Exception {}\n\n\n".format(recommender_class, str(e)))
        log_file.flush()
        traceback.print_exc()
def read_data_split_and_search():
    """Tune a set of collaborative baselines on Movielens1M via skopt search.

    For every algorithm the search object persists:
    - a .txt log of all the explored cases and their recommendation quality
    - a _best_model file, reloadable via recommender.load_model()
    - a _best_parameter dictionary, usable as recommender.fit(**params)
    - a _best_result_validation dictionary for the best validation solution
    - a _best_result_test dictionary for that solution on the test set
    """
    from Data_manager.Movielens1M.Movielens1MReader import Movielens1MReader
    from Data_manager.DataSplitter_k_fold_stratified import DataSplitter_Warm_k_fold

    reader = Movielens1MReader()
    data_splitter = DataSplitter_Warm_k_fold(reader)
    data_splitter.load_data()
    URM_train, URM_validation, URM_test = data_splitter.get_holdout_split()

    output_folder_path = "result_experiments/SKOPT_prova/"
    # Create the output directory on first run.
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    algorithms = [
        Random,
        TopPop,
        P3alphaRecommender,
        RP3betaRecommender,
        ItemKNNCFRecommender,
        UserKNNCFRecommender,
        # MatrixFactorization_BPR_Cython,
        # MatrixFactorization_FunkSVD_Cython,
        # PureSVDRecommender,
        # SLIM_BPR_Cython,
        # SLIMElasticNetRecommender
    ]

    from Base.Evaluation.Evaluator import EvaluatorHoldout
    evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[5])
    evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[5, 10])

    search_one_algorithm = partial(
        runParameterSearch_Collaborative,
        URM_train=URM_train,
        metric_to_optimize="MAP",
        n_cases=8,
        evaluator_validation_earlystopping=evaluator_validation,
        evaluator_validation=evaluator_validation,
        evaluator_test=evaluator_test,
        output_folder_path=output_folder_path)

    from Utils.PoolWithSubprocess import PoolWithSubprocess
    # Parallel variant kept for reference:
    # pool = PoolWithSubprocess(processes=int(multiprocessing.cpu_count()), maxtasksperchild=1)
    # resultList = pool.map(search_one_algorithm, algorithms)
    # pool.close()
    # pool.join()

    # Sequential run: one failing algorithm must not stop the others.
    for recommender_class in algorithms:
        try:
            search_one_algorithm(recommender_class)
        except Exception as e:
            print("On recommender {} Exception {}".format(
                recommender_class, str(e)))
            traceback.print_exc()
def read_data_split_and_search():
    """
    This function provides a simple example on how to tune parameters of a given algorithm

    The BayesianSearch object will save:
        - A .txt file with all the cases explored and the recommendation quality
        - A _best_model file which contains the trained model and can be loaded with recommender.load_model()
        - A _best_parameter file which contains a dictionary with all the fit parameters, it can be passed to recommender.fit(**_best_parameter)
        - A _best_result_validation file which contains a dictionary with the results of the best solution on the validation
        - A _best_result_test file which contains a dictionary with the results, on the test set, of the best solution chosen using the validation set
    """
    parser = DataParser()
    URM_all = parser.get_URM_all()
    ICM_obj = parser.get_ICM_all()

    URM_train, URM_test = split_train_in_two_percentage_global_sample(
        URM_all, train_percentage=0.80)
    URM_train, URM_validation = split_train_in_two_percentage_global_sample(
        URM_train, train_percentage=0.85)

    # 26-10-2020 > OPTIMIZATION ON THE RANGE [200, +INF)
    # Recommenders already considered (the fastest up to now):
    #   PureSVD, ItemKNNCBF, ItemKNNCF, UserKNNCF, P3A, RP3beta
    f_range = (200, -1)
    URM_validation = parser.filter_URM_test_by_range(URM_train, URM_validation, f_range)
    URM_test = parser.filter_URM_test_by_range(URM_train, URM_test, f_range)

    output_folder_path = "result_experiments_v2/" + "range_" + str(
        f_range[0]) + "-" + str(f_range[1]) + "/"
    # If directory does not exist, create
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    # Disabled algorithm lists kept for reference:
    # collaborative_algorithm_list = [EASE_R_Recommender, PipeHybrid001, Random, TopPop,
    #     P3alphaRecommender, RP3betaRecommender, ItemKNNCFRecommender, UserKNNCFRecommender,
    #     MatrixFactorization_BPR_Cython, MatrixFactorization_FunkSVD_Cython,
    #     PureSVDRecommender, NMFRecommender, PureSVDItemRecommender, SLIM_BPR_Cython,
    #     SLIMElasticNetRecommender, IALSRecommender, MF_MSE_PyTorch, MergedHybrid000]
    # content_algorithm_list = [ItemKNNCBFRecommender]

    algorithm_in_sequence = [(ItemKNNCFRecommender, 'CF'),
                             (UserKNNCFRecommender, 'CF'),
                             (P3alphaRecommender, 'CF'),
                             (RP3betaRecommender, 'CF'),
                             (PureSVDRecommender, 'CF'),
                             (ItemKNNCBFRecommender, 'CBF')]

    from Base.Evaluation.Evaluator import EvaluatorHoldout
    evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
    evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

    # BUGFIX: the loop variable was named `type`, shadowing the builtin.
    for algo, algo_type in algorithm_in_sequence:
        print(F"OPTIMIZING {algo.RECOMMENDER_NAME} - {algo_type}")
        if algo_type == 'CF':
            collaborative_algorithm_list = [algo]
            runParameterSearch_Collaborative_partial = partial(
                runParameterSearch_Collaborative,
                URM_train=URM_train,
                ICM_train=ICM_obj,
                metric_to_optimize="MAP",
                n_cases=50,
                # BUGFIX: was `50 * 0.3` (a float); the search expects an
                # integer number of random starts (30% of n_cases).
                n_random_starts=15,
                evaluator_validation_earlystopping=evaluator_validation,
                evaluator_validation=evaluator_validation,
                evaluator_test=evaluator_test,
                output_folder_path=output_folder_path,
                allow_weighting=False,  # LOOOK AT HEREEEEEEEEEEEEEEEEE
                parallelizeKNN=False)
            pool = multiprocessing.Pool(processes=int(
                multiprocessing.cpu_count()), maxtasksperchild=1)
            pool.map(runParameterSearch_Collaborative_partial,
                     collaborative_algorithm_list)
        elif algo_type == 'CBF':
            content_algorithm_list = [algo]
            runParameterSearch_Content_partial = partial(
                runParameterSearch_Content,
                URM_train=URM_train,
                ICM_object=ICM_obj,
                ICM_name='BookFeatures',
                n_cases=50,
                # BUGFIX: was the float `50 * 0.3` (see above).
                n_random_starts=15,
                evaluator_validation=evaluator_validation,
                evaluator_test=evaluator_test,
                metric_to_optimize="MAP",
                parallelizeKNN=False,
                allow_weighting=True,
                # similarity_type_list=['cosine']
            )
            pool = multiprocessing.Pool(processes=int(
                multiprocessing.cpu_count()), maxtasksperchild=1)
            pool.map(runParameterSearch_Content_partial, content_algorithm_list)
def __init__(self, recommender_class, dataset, fit_param_names=None, metric='MAP', method='bayesian', at=5, verbose=True, seed=1234):
    """Set up a hyper-parameter tuner for `recommender_class` on `dataset`.

    Loads the URM splits, builds validation / early-stopping / test evaluators,
    prepares a log directory (with a copy of the driving source files), and
    pre-builds the early-stopping keyword dictionaries.

    :param recommender_class: recommender class to tune
    :param dataset: dataset name (str) or dataset class with DATASET_NAME
    :param fit_param_names: names of the fit parameters being tuned
        (BUGFIX: was the mutable default `[]`; now None-sentinel)
    :param metric: metric optimized and monitored for early stopping
    :param method: search method identifier (e.g. 'bayesian')
    :param at: main cutoff for validation/early-stopping evaluation
    :param verbose: verbosity flag
    :param seed: seed for reproducibility of splits and weight init
    """
    # Seed for reproducibility of results and consistent initialization of
    # weights/splitting of dataset.
    set_seed(seed)
    self.recommender_class = recommender_class
    self.dataset = dataset
    self.dataset_name = self.dataset if isinstance(self.dataset, str) else self.dataset.DATASET_NAME
    # BUGFIX: avoid the shared mutable default argument.
    self.fit_param_names = [] if fit_param_names is None else fit_param_names
    self.metric = metric
    self.method = method
    self.at = at
    self.verbose = verbose
    self.seed = seed
    self.isGAN = False

    # Reader-based dataset loading kept for reference:
    # if isinstance(self.dataset, str) and self.dataset in Movielens.urls.keys():
    #     self.reader = Movielens(version=self.dataset, **dataset_kwargs)
    # else:
    #     self.reader = self.dataset(**dataset_kwargs)
    # self.logsdir = os.path.join('experiments', self.recommender_class.RECOMMENDER_NAME + '_' + self.reader.DATASET_NAME)

    self.logsdir = os.path.join(
        'experiments',
        self.recommender_class.RECOMMENDER_NAME + '_' + train_mode + '_' + self.dataset_name)
    if not os.path.exists(self.logsdir):
        os.makedirs(self.logsdir, exist_ok=False)

    # with open(os.path.join(self.logsdir, 'dataset_config.txt'), 'w') as f:
    #     json.dump(self.reader.config, f, indent=4)

    # Snapshot the driver module and the recommender module for reproducibility.
    codesdir = os.path.join(self.logsdir, 'code')
    os.makedirs(codesdir, exist_ok=True)
    shutil.copy(os.path.abspath(sys.modules[self.__module__].__file__), codesdir)
    shutil.copy(os.path.abspath(sys.modules[self.recommender_class.__module__].__file__), codesdir)

    # Reader-based splitting kept for reference:
    # self.URM_train, self.URM_test, self.URM_validation = self.reader.split_urm(split_ratio=[0.6, 0.2, 0.2], save_local=False, verbose=False)
    # self.URM_for_train, _, self.URM_validation = self.reader.split_urm(
    #     self.URM_train.tocoo(), split_ratio=[0.75, 0, 0.25], save_local=False, verbose=False)
    # self.URM_train_small, _, self.URM_early_stop = self.reader.split_urm(
    #     self.URM_for_train.tocoo(), split_ratio=[0.85, 0, 0.15], save_local=False, verbose=False)
    # del self.URM_for_train

    self.URM_train, self.URM_test, self.URM_validation, self.URM_train_small, self.URM_early_stop = load_URMs(
        dataset, dataset_kwargs)
    self.evaluator_validation = EvaluatorHoldout(self.URM_validation, [self.at], exclude_seen=True)
    self.evaluator_earlystop = EvaluatorHoldout(self.URM_early_stop, [self.at], exclude_seen=True)
    self.evaluatorTest = EvaluatorHoldout(self.URM_test, [self.at, 10, 20, 50],
                                          exclude_seen=True, minRatingsPerUser=2)
    self.fit_params = {}

    # Recommenders living in the `gans` package get the GAN-style early stopping.
    modules = getattr(self.recommender_class, '__module__', None)
    if modules and modules.split('.')[0] == gans.__name__:
        self.isGAN = True

    # Early stopping (Maurizio's framework) for the baseline recommenders.
    self.early_stopping_parameters = {
        'epochs_min': 0,
        'validation_every_n': 5,
        'stop_on_validation': True,
        'validation_metric': self.metric,
        'lower_validations_allowed': 5,
        'evaluator_object': self.evaluator_earlystop
    }

    # Early stopping for the GAN-based recommenders.
    self.my_early_stopping = {
        'allow_worse': 5,
        'freq': 5,
        'validation_evaluator': self.evaluator_earlystop,
        'validation_set': None,
        'sample_every': None,
    }
pyplot.xlabel('Sorted Item')
pyplot.show()
user_activity = np.ediff1d(URM_all.indptr)
user_activity = np.sort(user_activity)
pyplot.plot(user_activity, 'ro')
pyplot.ylabel('Num Interactions ')
pyplot.xlabel('Sorted User')
pyplot.show()'''

# NOTE(review): everything above, up to the closing ''', is the tail of a
# disabled (triple-quoted) plotting block opened earlier in the file.

#np.random.seed(1234)

# Holdout splits: 90% of interactions go to training.
URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.90)
ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.9)

# MAP@10 evaluator on the held-out interactions, also used for early stopping below.
evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)

# Stack URM (users x items) on top of ICM.T (features x items): rows are
# users + features, columns are items.
URM_ICM_train = sps.vstack([URM_train, ICM_all.T])
URM_ICM_train = URM_ICM_train.tocsr()

# Horizontal variant: ICM (items x features) next to URM.T (items x users).
URM_ICM_train2 = sps.hstack([ICM_all, URM_train.T])
URM_ICM_train2 = URM_ICM_train2.tocsr()

# Keyword arguments forwarded to recommenders that support early stopping.
earlystopping_keywargs = {
    "validation_every_n": 10,
    "stop_on_validation": True,
    "evaluator_object": evaluator_validation,
    "lower_validations_allowed": 5,
    "validation_metric": "MAP",
}
# NOTE(review): this fragment starts mid-plot; the item-activity plot it
# finishes is built earlier in the file.
pyplot.xlabel('Sorted Item')
pyplot.show()

# Long-tail plot of per-user interaction counts (sorted ascending).
user_activity = np.ediff1d(URM_all.indptr)
user_activity = np.sort(user_activity)
pyplot.plot(user_activity, 'ro')
pyplot.ylabel('Num Interactions ')
pyplot.xlabel('Sorted User')
pyplot.show()

# Fix the split for reproducibility.
np.random.seed(1234)

# 80/20 holdout splits for interactions and item features.
URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)
ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.8)

# MAP@10 evaluator on the held-out interactions, also used for early stopping.
evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)

# Keyword arguments forwarded to recommenders that support early stopping.
earlystopping_keywargs = {
    "validation_every_n": 10,
    "stop_on_validation": True,
    "evaluator_object": evaluator_validation,
    "lower_validations_allowed": 5,
    "validation_metric": "MAP",
}

# MAP 0.057, kaggle MAP 0.054
recommender1 = SLIM_BPR_Cython(URM_train, recompile_cython=False)
# Load pre-trained weights instead of re-fitting (folder/file names as saved).
recommender1.load_model('SavedModels', 'SLIM_BPR_Cyrhon')
#recommender1.fit(**{"topK": 865, "epochs": 1000, "symmetric": False, "sgd_mode": "adagrad", "lambda_i": 0.01,
#                    "lambda_j": 1e-05, "learning_rate": 0.0001})
def single_test(i):
    """Fit the i-th recommender on its split and return its MAP at cutoff 10.

    Relies on module-level globals: `n_recommender`, `n_urm_test`,
    `alpha`, `beta` and `topK` (the hyperparameters under test).
    """
    rec = n_recommender[i]
    rec.fit(alpha=alpha, beta=beta, topK=int(topK))
    metrics, _ = EvaluatorHoldout(n_urm_test[i], cutoff_list=[10]).evaluateRecommender(rec)
    return metrics[10]['MAP']
# Project imports (recommenders and feature-weighting helpers).
from FeatureWeighting.User_CFW_D_Similarity_Linalg import User_CFW_D_Similarity_Linalg
from Hybrid.HybridNorm3Recommender import HybridNorm3Recommender
from MatrixFactorization.ALSRecommender import ALSRecommender
from MatrixFactorization.BPRRecommender import BPRRecommender
import similaripy as sim

data = DataManager()

# NOTE(review): this first assignment is dead code — urm_train is
# immediately overwritten by the leave-k-out split below.
urm_train = data.get_urm()
urm_train, urm_test = split_train_leave_k_out_user_wise(data.get_urm(), temperature='normal')
urm_train, urm_valid = split_train_leave_k_out_user_wise(urm_train, temperature='valid2')

# "Warm" variants — presumably users filtered by interaction count
# (threshold=10); confirm semantics in DataManager.create_test_warm_users.
urm_train_warm = data.create_test_warm_users(urm_train, threshold=10)
urm_test_warm = data.create_test_warm_users(urm_test, threshold=10)

evaluator_test_warm = EvaluatorHoldout(urm_test_warm, cutoff_list=[10])

# User-based CF trained on the full train URM...
recommender = UserKNNCFRecommender(urm_train)
recommender.fit(shrink=2, topK=600, normalize=True)

# ...and the same model trained only on the warm-user URM, for comparison.
recommender_warm = UserKNNCFRecommender(urm_train_warm)
recommender_warm.fit(shrink=2, topK=500, normalize=True)

# Both models are evaluated on the SAME warm test split (MAP@10).
result, str_result = evaluator_test_warm.evaluateRecommender(recommender)
print('The Map of test of urm normal is : {}'.format(result[10]['MAP']))
result, str_result = evaluator_test_warm.evaluateRecommender(recommender_warm)
print('The Map of test of urm warm is : {}'.format(result[10]['MAP']))
from KNN.UserKNNCBFRecommender import UserKNNCBFRecommender
from KNN.UserKNNCFRecommender import UserKNNCFRecommender
import numpy as np
import scipy.sparse as sps
from FeatureWeighting.User_CFW_D_Similarity_Linalg import User_CFW_D_Similarity_Linalg
from Hybrid.HybridGen2Recommender import HybridGen2Recommender
from Hybrid.HybridNormRecommender import HybridNormRecommender
from Hybrid.HybridNorm1Recommender import HybridNorm1Recommender
from Hybrid.HybridNorm2Recommender import HybridNorm2Recommender

Data = DataManager()

# Leave-k-out splits: the test split is carved from the full URM, then a
# validation split is carved from the remaining training data.
urm_train, urm_test = split_train_leave_k_out_user_wise(Data.get_urm(), threshold=10, temperature='normal')
urm_train, urm_valid = split_train_leave_k_out_user_wise(urm_train, threshold=10, temperature='valid')

evaluator_valid = EvaluatorHoldout(urm_valid, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(urm_test, cutoff_list=[10])

# NOTE: the recommender CLASS (not an instance) is handed to the Bayesian
# search, which instantiates it for every evaluated configuration.
recommender = HybridNorm1Recommender

# recommender_3 = UserKNNCFRecommender(urm_train)
# recommender_3.fit(shrink=2, topK=600, normalize=True)
# w_sparse = recommender_3.W_sparse

parameterSearch = SearchBayesianSkopt(recommender,
                                      evaluator_validation=evaluator_valid,
                                      evaluator_test=evaluator_test)

# earlystopping_keywargs = {"validation_every_n": 5,
#                           "stop_on_validation": True,
#                           "evaluator_object": evaluator_valid,
def main(arguments):
    """Re-train the selected recommender with its best saved hyperparameters
    on each requested dataset and write the test-set metrics to disk.

    Parameters
    ----------
    arguments: list of str
        CLI-style tokens: dataset names, 'user'/'item' train mode,
        a recommender name, and optionally '--run-all'.
    """
    test_results_path = 'test_results'
    if not os.path.exists(test_results_path):
        os.makedirs(test_results_path, exist_ok=False)
    exp_path = 'experiments'

    datasets = []
    run_all = False
    train_mode = ['user', 'item']
    cutoffs = [5, 10, 20, 50]
    recommender = None

    # Map CLI names to recommender classes.
    dict_rec_classes = {}
    dict_rec_classes['TopPop'] = TopPop
    dict_rec_classes['Random'] = Random
    dict_rec_classes['PureSVD'] = PureSVDRecommender
    dict_rec_classes['BPR'] = MatrixFactorization_BPR_Cython
    dict_rec_classes['ALS'] = IALSRecommender
    dict_rec_classes['NMF'] = NMFRecommender
    dict_rec_classes['GANMF'] = GANMF
    dict_rec_classes['CFGAN'] = CFGAN
    dict_rec_classes['DisGANMF'] = DisGANMF
    dict_rec_classes['SLIMBPR'] = SLIM_BPR_Cython
    dict_rec_classes['fullGANMF'] = fullGANMF
    dict_rec_classes['DeepGANMF'] = DeepGANMF

    if '--run-all' in arguments:
        datasets = all_datasets
        run_all = True

    # Scan the CLI tokens: datasets accumulate, the first recommender wins,
    # a 'user'/'item' token narrows the train mode to that single value.
    for arg in arguments:
        if arg in name_datasets and not run_all:
            datasets.append(all_datasets[name_datasets.index(arg)])
        if arg in ['user', 'item']:
            train_mode = [arg]
        if arg in all_recommenders and recommender is None:
            recommender = arg

    # Non-GAN models have no user/item training mode.
    if recommender not in ['GANMF', 'DisGANMF', 'CFGAN', 'fullGANMF', 'DeepGANMF']:
        train_mode = ['']

    for d in datasets:
        dname = d if isinstance(d, str) else d.DATASET_NAME
        for mode in train_mode:
            # fullGANMF reuses the hyperparameters tuned for GANMF.
            if recommender == 'fullGANMF':
                best_params = load_best_params(exp_path, dname, 'GANMF', mode)
            else:
                best_params = load_best_params(exp_path, dname, dict_rec_classes[recommender].RECOMMENDER_NAME, mode)

            # Re-seed before every run for reproducibility.
            set_seed(seed)
            URM_train, URM_test, _, _, _ = load_URMs(d, dataset_kwargs)
            test_evaluator = EvaluatorHoldout(URM_test, cutoffs, exclude_seen=True)

            if recommender in ['GANMF', 'DisGANMF', 'CFGAN', 'fullGANMF', 'DeepGANMF']:
                model = dict_rec_classes[recommender](URM_train, mode=mode, seed=seed, is_experiment=True)
                model.fit(validation_set=None, sample_every=None, validation_evaluator=None, **best_params)
            else:
                model = dict_rec_classes[recommender](URM_train)
                model.fit(**best_params)

            results_dict, results_str = test_evaluator.evaluateRecommender(model)

            save_path = os.path.join(test_results_path, model.RECOMMENDER_NAME + '_' + mode + '_' + dname)
            if not os.path.exists(save_path):
                os.makedirs(save_path, exist_ok=False)
                with open(os.path.join(save_path, 'test_results.txt'), 'a') as f:
                    f.write(results_str)
            else:
                # Directory already exists: write results only if no previous
                # results file is present (existing results are kept as-is).
                results_filename = os.path.join(save_path, 'test_results.txt')
                if not os.path.exists(results_filename):
                    with open(results_filename, 'a') as f:
                        f.write(results_str)
def read_data_split_and_search(dataset_name,
                               flag_baselines_tune = False,
                               flag_DL_article_default = False,
                               flag_MF_baselines_tune = False,
                               flag_DL_tune = False,
                               flag_print_results = False):
    """Load the cold-user split for `dataset_name`, tune the collaborative
    baselines / MF models / the Mult-VAE wrapper according to the flags,
    and optionally generate the LaTeX result tables."""

    from Conferences.WWW.MultiVAE_our_interface.Movielens20M.Movielens20MReader import Movielens20MReader
    from Conferences.WWW.MultiVAE_our_interface.NetflixPrize.NetflixPrizeReader import NetflixPrizeReader

    split_type = "cold_user"
    result_folder_path = "result_experiments/{}/{}_{}_{}/".format(CONFERENCE_NAME, ALGORITHM_NAME, dataset_name, split_type)

    if dataset_name == "movielens20m":
        dataset = Movielens20MReader(result_folder_path, split_type = split_type)
    elif dataset_name == "netflixPrize":
        dataset = NetflixPrizeReader(result_folder_path)

    # If directory does not exist, create
    if not os.path.exists(result_folder_path):
        os.makedirs(result_folder_path)

    metric_to_optimize = "NDCG"
    n_cases = 50
    n_random_starts = 15

    if split_type == "cold_user":
        collaborative_algorithm_list = [
            Random,
            TopPop,
            # UserKNNCFRecommender,
            ItemKNNCFRecommender,
            P3alphaRecommender,
            RP3betaRecommender,
            # PureSVDRecommender,
            # IALSRecommender,
            # NMFRecommender,
            # MatrixFactorization_BPR_Cython,
            # MatrixFactorization_FunkSVD_Cython,
            EASE_R_Recommender,
            SLIM_BPR_Cython,
            SLIMElasticNetRecommender,
        ]

    URM_train = dataset.URM_DICT["URM_train"].copy()
    URM_train_all = dataset.URM_DICT["URM_train_all"].copy()
    URM_validation = dataset.URM_DICT["URM_validation"].copy()
    URM_test = dataset.URM_DICT["URM_test"].copy()

    # Ensure IMPLICIT data and DISJOINT sets
    assert_implicit_data([URM_train, URM_train_all, URM_validation, URM_test])
    assert_disjoint_matrices([URM_train, URM_validation, URM_test])
    assert_disjoint_matrices([URM_train_all, URM_validation, URM_test])

    from Base.Evaluation.Evaluator import EvaluatorHoldout

    evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[100])
    evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[20, 50, 100])

    # Wrap both evaluators so that the cold-user profiles come from URM_train_all.
    evaluator_validation = EvaluatorUserSubsetWrapper(evaluator_validation, URM_train_all)
    evaluator_test = EvaluatorUserSubsetWrapper(evaluator_test, URM_train_all)

    runParameterSearch_Collaborative_partial = partial(runParameterSearch_Collaborative,
                                                       URM_train = URM_train,
                                                       URM_train_last_test = URM_train + URM_validation,
                                                       metric_to_optimize = metric_to_optimize,
                                                       evaluator_validation_earlystopping = evaluator_validation,
                                                       evaluator_validation = evaluator_validation,
                                                       evaluator_test = evaluator_test,
                                                       output_folder_path = result_folder_path,
                                                       parallelizeKNN = False,
                                                       allow_weighting = True,
                                                       resume_from_saved = True,
                                                       n_cases = n_cases,
                                                       n_random_starts = n_random_starts)

    if flag_baselines_tune:
        for recommender_class in collaborative_algorithm_list:
            try:
                runParameterSearch_Collaborative_partial(recommender_class)
            except Exception as e:
                print("On recommender {} Exception {}".format(recommender_class, str(e)))
                traceback.print_exc()

    ################################################################################################
    ###### Matrix Factorization Cold users

    collaborative_MF_algorithm_list = [
        PureSVDRecommender,
        IALSRecommender,
        NMFRecommender,
        MatrixFactorization_BPR_Cython,
        MatrixFactorization_FunkSVD_Cython,
    ]

    runParameterSearch_cold_user_MF_partial = partial(runParameterSearch_cold_user_MF,
                                                      URM_train = URM_train,
                                                      URM_train_last_test = URM_train + URM_validation,
                                                      metric_to_optimize = metric_to_optimize,
                                                      evaluator_validation_earlystopping = evaluator_validation,
                                                      evaluator_validation = evaluator_validation,
                                                      evaluator_test = evaluator_test,
                                                      output_folder_path = result_folder_path,
                                                      resume_from_saved = True,
                                                      n_cases = n_cases,
                                                      n_random_starts = n_random_starts)

    if flag_MF_baselines_tune:
        for recommender_class in collaborative_MF_algorithm_list:
            try:
                runParameterSearch_cold_user_MF_partial(recommender_class)
            except Exception as e:
                print("On recommender {} Exception {}".format(recommender_class, str(e)))
                traceback.print_exc()

    ################################################################################################
    ######
    ###### DL ALGORITHM
    ######

    if flag_DL_article_default:
        try:
            # Epoch counts taken from the original article, per dataset.
            if dataset_name == "movielens20m":
                epochs = 100
            elif dataset_name == "netflixPrize":
                epochs = 200

            multiVAE_article_hyperparameters = {
                "epochs": epochs,
                "batch_size": 500,
                "total_anneal_steps": 200000,
                "p_dims": None,
            }

            multiVAE_earlystopping_hyperparameters = {
                "validation_every_n": 5,
                "stop_on_validation": True,
                "evaluator_object": evaluator_validation,
                "lower_validations_allowed": 5,
                "validation_metric": metric_to_optimize,
            }

            parameterSearch = SearchSingleCase(Mult_VAE_RecommenderWrapper,
                                               evaluator_validation=evaluator_validation,
                                               evaluator_test=evaluator_test)

            recommender_input_args = SearchInputRecommenderArgs(
                CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],
                FIT_KEYWORD_ARGS = multiVAE_earlystopping_hyperparameters)

            # Same args, but the final model is trained on train+validation.
            recommender_input_args_last_test = recommender_input_args.copy()
            recommender_input_args_last_test.CONSTRUCTOR_POSITIONAL_ARGS[0] = URM_train + URM_validation

            parameterSearch.search(recommender_input_args,
                                   recommender_input_args_last_test = recommender_input_args_last_test,
                                   fit_hyperparameters_values=multiVAE_article_hyperparameters,
                                   output_folder_path = result_folder_path,
                                   resume_from_saved = True,
                                   output_file_name_root = Mult_VAE_RecommenderWrapper.RECOMMENDER_NAME)

        except Exception as e:
            print("On recommender {} Exception {}".format(Mult_VAE_RecommenderWrapper, str(e)))
            traceback.print_exc()

    ################################################################################################
    ######
    ###### PRINT RESULTS
    ######

    if flag_print_results:
        # Number of users with at least one test interaction.
        n_test_users = np.sum(np.ediff1d(URM_test.indptr)>=1)
        file_name = "{}..//{}_{}_".format(result_folder_path, ALGORITHM_NAME, dataset_name)

        result_loader = ResultFolderLoader(result_folder_path,
                                           base_algorithm_list = None,
                                           other_algorithm_list = [Mult_VAE_RecommenderWrapper],
                                           KNN_similarity_list = KNN_similarity_to_report_list,
                                           ICM_names_list = None,
                                           UCM_names_list = None)

        result_loader.generate_latex_results(file_name + "{}_latex_results.txt".format("article_metrics"),
                                             metrics_list = ["RECALL", "NDCG"],
                                             cutoffs_list = [20, 50, 100],
                                             table_title = None,
                                             highlight_best = True)

        result_loader.generate_latex_results(file_name + "{}_latex_results.txt".format("all_metrics"),
                                             metrics_list = ["PRECISION", "RECALL", "MAP_MIN_DEN", "MRR", "NDCG", "F1", "HIT_RATE", "ARHR_ALL_HITS",
                                                             "NOVELTY", "DIVERSITY_MEAN_INTER_LIST", "DIVERSITY_HERFINDAHL", "COVERAGE_ITEM", "DIVERSITY_GINI", "SHANNON_ENTROPY"],
                                             cutoffs_list = [50],
                                             table_title = None,
                                             highlight_best = True)

        result_loader.generate_latex_time_statistics(file_name + "{}_latex_results.txt".format("time"),
                                                     n_evaluation_users=n_test_users,
                                                     table_title = None)
# The load_data function will split the data and save it in the desired folder. # Once the split is saved, further calls to the load_data will load the splitted data ensuring you always use the same split dataSplitter.load_data( save_folder_path="result_experiments/usage_example/data/") # We can access the three URMs with this function and the ICMs (if present in the data Reader) URM_train, URM_validation, URM_test = dataSplitter.get_holdout_split() ICM_dict = dataSplitter.get_loaded_ICM_dict() # Now that we have the split, we can create the evaluators. # The constructor of the evaluator allows you to specify the evaluation conditions (data, recommendation list length, # excluding already seen items). Whenever you want to evaluate a model, use the evaluateRecommender function of the evaluator object evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[5], exclude_seen=False) evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[5, 10, 20], exclude_seen=False) # We now fit and evaluate a non personalized algorithm recommender = TopPop(URM_train) recommender.fit() results_dict, results_run_string = evaluator_validation.evaluateRecommender( recommender) print("Result of TopPop is:\n" + results_run_string) # We now fit and evaluate a personalized algorithm passing some hyperparameters to the fit functions recommender = P3alphaRecommender(URM_train)
class RecSysExp:
    """Hyperparameter-search experiment for one recommender on one dataset:
    Bayesian (Gaussian Process) or random search on a validation split,
    then re-training on the full training data and evaluation on test."""

    def __init__(self, recommender_class, dataset, fit_param_names=[], metric='MAP',
                 method='bayesian', at=5, verbose=True, seed=1234):
        # NOTE(review): `fit_param_names=[]` is a mutable default argument and
        # is mutated in tune() — safe only while every instance is constructed
        # with an explicit list.
        # Seed for reproducibility of results and consistent initialization of
        # weights/splitting of dataset.
        set_seed(seed)
        self.recommender_class = recommender_class
        self.dataset = dataset
        self.dataset_name = self.dataset if isinstance(self.dataset, str) else self.dataset.DATASET_NAME
        self.fit_param_names = fit_param_names  # names of hyperparameters forwarded to fit()
        self.metric = metric                    # validation metric to optimize
        self.method = method                    # 'bayesian' -> gp_minimize, otherwise dummy_minimize
        self.at = at                            # main cutoff for evaluation
        self.verbose = verbose
        self.seed = seed
        self.isGAN = False

        # if isinstance(self.dataset, str) and self.dataset in Movielens.urls.keys():
        #     self.reader = Movielens(version=self.dataset, **dataset_kwargs)
        # else:
        #     self.reader = self.dataset(**dataset_kwargs)
        # self.logsdir = os.path.join('experiments', self.recommender_class.RECOMMENDER_NAME + '_' + self.reader.DATASET_NAME)

        self.logsdir = os.path.join('experiments', self.recommender_class.RECOMMENDER_NAME + '_' + train_mode + '_' + self.dataset_name)
        if not os.path.exists(self.logsdir):
            os.makedirs(self.logsdir, exist_ok=False)

        # with open(os.path.join(self.logsdir, 'dataset_config.txt'), 'w') as f:
        #     json.dump(self.reader.config, f, indent=4)

        # Snapshot the experiment and recommender code next to the logs
        # for reproducibility.
        codesdir = os.path.join(self.logsdir, 'code')
        os.makedirs(codesdir, exist_ok=True)
        shutil.copy(os.path.abspath(sys.modules[self.__module__].__file__), codesdir)
        shutil.copy(os.path.abspath(sys.modules[self.recommender_class.__module__].__file__), codesdir)

        # self.URM_train, self.URM_test, self.URM_validation = self.reader.split_urm(split_ratio=[0.6, 0.2, 0.2], save_local=False, verbose=False)
        # self.URM_train = self.reader.get_URM_train()
        # self.URM_test = self.reader.get_URM_test()
        # self.URM_for_train, _, self.URM_validation = self.reader.split_urm(
        #     self.URM_train.tocoo(), split_ratio=[0.75, 0, 0.25], save_local=False, verbose=False)
        # self.URM_train_small, _, self.URM_early_stop = self.reader.split_urm(self.URM_for_train.tocoo(), split_ratio=[0.85, 0, 0.15],
        #     save_local=False, verbose=False)
        # del self.URM_for_train
        self.URM_train, self.URM_test, self.URM_validation, self.URM_train_small, self.URM_early_stop = load_URMs(
            dataset, dataset_kwargs)

        self.evaluator_validation = EvaluatorHoldout(self.URM_validation, [self.at], exclude_seen=True)
        self.evaluator_earlystop = EvaluatorHoldout(self.URM_early_stop, [self.at], exclude_seen=True)
        self.evaluatorTest = EvaluatorHoldout(self.URM_test, [self.at, 10, 20, 50], exclude_seen=True, minRatingsPerUser=2)

        self.fit_params = {}

        # A recommender is GAN-based iff its module lives in the `gans` package.
        modules = getattr(self.recommender_class, '__module__', None)
        if modules and modules.split('.')[0] == gans.__name__:
            self.isGAN = True

        # EARLY STOPPING from Maurizio's framework for baselines
        self.early_stopping_parameters = {
            'epochs_min': 0,
            'validation_every_n': 5,
            'stop_on_validation': True,
            'validation_metric': self.metric,
            'lower_validations_allowed': 5,
            'evaluator_object': self.evaluator_earlystop
        }

        # EARLY STOPPING for GAN-based recommenders
        self.my_early_stopping = {
            'allow_worse': 5,
            'freq': 5,
            'validation_evaluator': self.evaluator_earlystop,
            'validation_set': None,
            'sample_every': None,
        }

    def build_fit_params(self, params):
        # Map the flat list of searched values back onto named fit parameters;
        # 'epochs' is kept only for algorithms that support early stopping.
        for i, val in enumerate(params):
            param_name = self.dimension_names[i]
            if param_name in self.fit_param_names:
                self.fit_params[param_name] = val
            elif param_name == 'epochs' and self.recommender_class in early_stopping_algos:
                self.fit_params[param_name] = val

    def save_best_params(self, additional_params=None):
        # Persist the current best fit parameters (plus e.g. the best epoch).
        d = dict(self.fit_params)
        if additional_params is not None:
            d.update(additional_params)
        with open(os.path.join(self.logsdir, 'best_params.pkl'), 'wb') as f:
            pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)

    def load_best_params(self):
        # Load the parameters saved by save_best_params().
        with open(os.path.join(self.logsdir, 'best_params.pkl'), 'rb') as f:
            return pickle.load(f)

    def obj_func(self, params):
        """
        Black-box objective function.

        Parameters
        ----------
        params: list
            Ranges of hyperparameters to consider. List of skopt.space.space.Dimension.

        Returns
        -------
        obj_func_value: float
            Value of the objective function as denoted by the experiment metric
            (negated, since the optimizer minimizes).
        """
        # print('Optimizing for', self.reader.DATASET_NAME)
        print('Optimizing', self.recommender_class.RECOMMENDER_NAME, 'for', self.dataset_name)

        # Split the parameters into build_params and fit_params
        self.build_fit_params(params)

        # Create the model and fit it.
        try:
            if self.isGAN:
                model = self.recommender_class(self.URM_train_small, mode=train_mode, seed=seed, is_experiment=True)
                model.logsdir = self.logsdir
                fit_early_params = dict(self.fit_params)
                fit_early_params.update(self.my_early_stopping)
                last_epoch = model.fit(**fit_early_params)
                # Save the right number of epochs that produces the current model
                if last_epoch != self.fit_params['epochs']:
                    self.fit_params['epochs'] = last_epoch - \
                        self.my_early_stopping['allow_worse'] * self.my_early_stopping['freq']
            else:
                model = self.recommender_class(self.URM_train_small)
                if self.recommender_class in early_stopping_algos:
                    fit_early_params = dict(self.fit_params)
                    fit_early_params.update(self.early_stopping_parameters)
                    model.fit(**fit_early_params)
                else:
                    model.fit(**self.fit_params)
            results_dic, results_run_string = self.evaluator_validation.evaluateRecommender(model)
            fitness = -results_dic[self.at][self.metric]
        except tf.errors.ResourceExhaustedError:
            # GPU out-of-memory: score this configuration as neutral
            # instead of crashing the whole search.
            return 0

        try:
            if fitness < self.best_res:
                self.best_res = fitness
                self.save_best_params(additional_params=dict(epochs=model.epochs_best) if self.recommender_class in early_stopping_algos else None)
        except AttributeError:
            # First evaluation: self.best_res does not exist yet.
            self.best_res = fitness
            self.save_best_params(additional_params=model.get_early_stopping_final_epochs_dict() if self.recommender_class in early_stopping_algos else None)

        # Append this trial's parameters and metrics to the running log.
        with open(os.path.join(self.logsdir, 'results.txt'), 'a') as f:
            d = self.fit_params
            if self.recommender_class in early_stopping_algos:
                d.update(model.get_early_stopping_final_epochs_dict())
            d_str = json.dumps(d)
            f.write(d_str)
            f.write('\n')
            f.write(results_run_string)
            f.write('\n\n')

        return fitness

    def tune(self, params, evals=10, init_config=None, seed=None):
        """
        Runs the hyperparameter search using Gaussian Process as surrogate model
        or Random Search, saves the results of the trials and prints the best
        found parameters.

        Parameters
        ----------
        params: list
            List of skopt.space.space.Dimensions to be searched.

        evals: int
            Number of evaluations to perform.

        init_config: list, default None
            An initial parameter configuration for seeding the Gaussian Process.

        seed: int, default None
            Seed for random_state of `gp_minimize` or `dummy_minimize`.
            Set to a fixed integer for reproducibility.
        """
        msg = 'Started ' + self.recommender_class.RECOMMENDER_NAME + ' ' + self.dataset_name
        subprocess.run(['telegram-send', msg])

        # Shape of the URM_test CSR matrix (users x items).
        U, I = self.URM_test.shape

        # Model-specific architecture dimensions are appended to the search
        # space, capped at 1024 nodes or 75% of the item count.
        if self.recommender_class == GANMF:
            params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='emb_dim', dtype=int))
            self.fit_param_names.append('emb_dim')

        if self.recommender_class == CFGAN or self.recommender_class == DeepGANMF:
            params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='d_nodes', dtype=int))
            params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='g_nodes', dtype=int))
            self.fit_param_names.append('d_nodes')
            self.fit_param_names.append('g_nodes')

        if self.recommender_class == DisGANMF:
            params.append(Integer(4, int(I * 0.75) if I <= 1024 else 1024, name='d_nodes', dtype=int))
            self.fit_param_names.append('d_nodes')

        self.dimension_names = [p.name for p in params]

        '''
        Need to make sure that the max. value of `num_factors` parameters
        must be lower than the max(U, I)
        '''
        try:
            idx = self.dimension_names.index('num_factors')
            maxval = params[idx].bounds[1]
            if maxval > min(U, I):
                params[idx] = Integer(1, min(U, I), name='num_factors', dtype=int)
        except ValueError:
            # No 'num_factors' dimension in the search space.
            pass

        if len(params) > 0:
            # Check if there is already a checkpoint for this experiment
            checkpoint_path = os.path.join(self.logsdir, 'checkpoint.pkl')
            checkpoint_exists = True if os.path.exists(checkpoint_path) else False
            checkpoint_saver = CheckpointSaver(os.path.join(self.logsdir, 'checkpoint.pkl'), compress=3)

            if seed is None:
                seed = self.seed

            t_start = int(time.time())

            if checkpoint_exists:
                # Resume the search from the saved optimizer state.
                previous_run = skopt.load(checkpoint_path)
                if self.method == 'bayesian':
                    results = gp_minimize(self.obj_func, params, n_calls=evals - len(previous_run.func_vals),
                                          x0=previous_run.x_iters, y0=previous_run.func_vals, n_random_starts=0,
                                          random_state=seed, verbose=True, callback=[checkpoint_saver])
                else:
                    results = dummy_minimize(self.obj_func, params, n_calls=evals - len(previous_run.func_vals),
                                             x0=previous_run.x_iters, y0=previous_run.func_vals,
                                             random_state=seed, verbose=True, callback=[checkpoint_saver])
            else:
                # Hyperparameter optimization from scratch.
                if self.method == 'bayesian':
                    results = gp_minimize(self.obj_func, params, n_calls=evals, random_state=seed,
                                          verbose=True, callback=[checkpoint_saver])
                else:
                    results = dummy_minimize(self.obj_func, params, n_calls=evals, random_state=seed,
                                             verbose=True, callback=[checkpoint_saver])

            t_end = int(time.time())

            # Save best parameters of this experiment
            # best_params = dict(zip(self.dimension_names, results.x))
            # with open(os.path.join(self.logsdir, 'best_params.pkl'), 'wb') as f:
            #     pickle.dump(best_params, f, pickle.HIGHEST_PROTOCOL)
            best_params = self.load_best_params()

            with open(os.path.join(self.logsdir, 'results.txt'), 'a') as f:
                f.write('Experiment ran for {}\n'.format(str(datetime.timedelta(seconds=t_end - t_start))))
                f.write('Best {} score: {}. Best result found at: {}\n'.format(self.metric, results.fun, best_params))

            if self.recommender_class in [IALSRecommender, MatrixFactorization_BPR_Cython]:
                self.dimension_names.append('epochs')
            self.build_fit_params(best_params.values())

        # Retrain with all training data
        set_seed(seed)
        if self.isGAN:
            model = self.recommender_class(self.URM_train, mode=train_mode, is_experiment=True)
            model.logsdir = self.logsdir
            model.fit(**self.fit_params)
            # load_models(model, save_dir='best_model', all_in_folder=True)
        else:
            model = self.recommender_class(self.URM_train)
            model.fit(**self.fit_params)
            # model.loadModel(os.path.join(self.logsdir, 'best_model'))

        _, results_run_string = self.evaluatorTest.evaluateRecommender(model)
        print('\n\nResults on test set:')
        print(results_run_string)
        print('\n\n')
        with open(os.path.join(self.logsdir, 'result_test.txt'), 'w') as f:
            f.write(results_run_string)

        msg = 'Finished ' + self.recommender_class.RECOMMENDER_NAME + ' ' + self.dataset_name
        subprocess.run(['telegram-send', msg])
def read_data_split_and_search_SpectralCF(dataset_name, cold_start=False, cold_items=None,
                                          isKNN_multiprocess=True, isKNN_tune=True,
                                          isSpectralCF_train_default=True, isSpectralCF_tune=True,
                                          print_results=True):
    """Load `dataset_name`, tune the KNN collaborative baselines, train or
    tune SpectralCF, and optionally print the LaTeX result tables.

    `cold_start` selects the cold-item variant of the split (supported only
    by the 'movielens1m_ours' dataset)."""

    if dataset_name == "movielens1m_original":
        assert(cold_start is not True)
        dataset = Movielens1MReader(type="original")
    elif dataset_name == "movielens1m_ours":
        dataset = Movielens1MReader(type="ours", cold_start=cold_start, cold_items=cold_items)
    elif dataset_name == "hetrec":
        assert (cold_start is not True)
        dataset = MovielensHetrec2011Reader()
    elif dataset_name == "amazon_instant_video":
        assert (cold_start is not True)
        dataset = AmazonInstantVideoReader()

    if not cold_start:
        output_folder_path = "result_experiments/{}/{}_{}/".format(CONFERENCE_NAME, ALGORITHM_NAME, dataset_name)
    else:
        output_folder_path = "result_experiments/{}/{}_cold_{}_{}/".format(CONFERENCE_NAME, ALGORITHM_NAME, cold_items, dataset_name)

    URM_train = dataset.URM_train.copy()
    URM_validation = dataset.URM_validation.copy()
    URM_test = dataset.URM_test.copy()

    # Ensure IMPLICIT data and DISJOINT sets
    assert_implicit_data([URM_train, URM_validation, URM_test])
    assert_disjoint_matrices([URM_train, URM_validation, URM_test])

    # If directory does not exist, create
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    algorithm_dataset_string = "{}_{}_".format(ALGORITHM_NAME, dataset_name)

    plot_popularity_bias([URM_train + URM_validation, URM_test],
                         ["URM train", "URM test"],
                         output_folder_path + algorithm_dataset_string + "popularity_plot")

    save_popularity_statistics([URM_train + URM_validation, URM_test],
                               ["URM train", "URM test"],
                               output_folder_path + algorithm_dataset_string + "popularity_statistics")

    metric_to_optimize = "RECALL"

    from Base.Evaluation.Evaluator import EvaluatorHoldout

    # Cold-start experiments are reported at cutoff 20 only.
    if not cold_start:
        cutoff_list_validation = [50]
        cutoff_list_test = [20, 30, 40, 50, 60, 70, 80, 90, 100]
    else:
        cutoff_list_validation = [20]
        cutoff_list_test = [20]

    evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=cutoff_list_validation)
    evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list_test)

    ################################################################################################
    ###### KNN CF

    if isKNN_tune:
        collaborative_algorithm_list = [
            Random,
            TopPop,
            UserKNNCFRecommender,
            ItemKNNCFRecommender,
            P3alphaRecommender,
            RP3betaRecommender,
        ]

        runParameterSearch_Collaborative_partial = partial(runParameterSearch_Collaborative,
                                                           URM_train = URM_train,
                                                           metric_to_optimize = metric_to_optimize,
                                                           evaluator_validation_earlystopping = evaluator_validation,
                                                           evaluator_validation = evaluator_validation,
                                                           evaluator_test = evaluator_test,
                                                           output_folder_path = output_folder_path,
                                                           parallelizeKNN = False,
                                                           allow_weighting = True,
                                                           n_cases = 35)

        if isKNN_multiprocess:
            # One search per algorithm, fanned out across all cores.
            pool = multiprocessing.Pool(processes=int(multiprocessing.cpu_count()), maxtasksperchild=1)
            resultList = pool.map(runParameterSearch_Collaborative_partial, collaborative_algorithm_list)
            pool.close()
            pool.join()
        else:
            for recommender_class in collaborative_algorithm_list:
                try:
                    runParameterSearch_Collaborative_partial(recommender_class)
                except Exception as e:
                    print("On recommender {} Exception {}".format(recommender_class, str(e)))
                    traceback.print_exc()

    ################################################################################################
    ###### SpectralCF

    if isSpectralCF_train_default:
        # Train SpectralCF once with the hyperparameters reported in the article.
        try:
            spectralCF_article_parameters = {
                "epochs": 1000,
                "batch_size": 1024,
                "embedding_size": 16,
                "decay": 0.001,
                "k": 3,
                "learning_rate": 1e-3,
            }

            spectralCF_earlystopping_parameters = {
                "validation_every_n": 5,
                "stop_on_validation": True,
                "lower_validations_allowed": 20,
                "evaluator_object": evaluator_validation,
                "validation_metric": metric_to_optimize,
                "epochs_min": 400,
            }

            parameterSearch = SearchSingleCase(SpectralCF_RecommenderWrapper,
                                               evaluator_validation=evaluator_validation,
                                               evaluator_test=evaluator_test)

            recommender_parameters = SearchInputRecommenderParameters(
                CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],
                FIT_KEYWORD_ARGS = spectralCF_earlystopping_parameters)

            parameterSearch.search(recommender_parameters,
                                   fit_parameters_values = spectralCF_article_parameters,
                                   output_folder_path = output_folder_path,
                                   output_file_name_root = SpectralCF_RecommenderWrapper.RECOMMENDER_NAME + "_article_default")

        except Exception as e:
            print("On recommender {} Exception {}".format(SpectralCF_RecommenderWrapper, str(e)))
            traceback.print_exc()

    elif isSpectralCF_tune:
        # Otherwise run the full hyperparameter search for SpectralCF.
        try:
            spectralCF_earlystopping_parameters = {
                "validation_every_n": 5,
                "stop_on_validation": True,
                "lower_validations_allowed": 20,
                "evaluator_object": evaluator_validation,
                "validation_metric": metric_to_optimize,
                "epochs_min": 400,
                "epochs": 2000
            }

            runParameterSearch_SpectralCF(SpectralCF_RecommenderWrapper,
                                          URM_train = URM_train,
                                          earlystopping_parameters = spectralCF_earlystopping_parameters,
                                          metric_to_optimize = metric_to_optimize,
                                          evaluator_validation = evaluator_validation,
                                          evaluator_test = evaluator_test,
                                          output_folder_path = output_folder_path,
                                          n_cases = 35,
                                          output_file_name_root = SpectralCF_RecommenderWrapper.RECOMMENDER_NAME)

        except Exception as e:
            print("On recommender {} Exception {}".format(SpectralCF_RecommenderWrapper, str(e)))
            traceback.print_exc()

    ################################################################################################
    ###### print results

    if print_results:
        # Users with at least one interaction in each evaluation split.
        n_validation_users = np.sum(np.ediff1d(URM_validation.indptr)>=1)
        n_test_users = np.sum(np.ediff1d(URM_test.indptr)>=1)

        if not cold_start:
            results_file_root_name = ALGORITHM_NAME
        else:
            results_file_root_name = "{}_cold_{}".format(ALGORITHM_NAME, cold_items)

        print_time_statistics_latex_table(result_folder_path = output_folder_path,
                                          dataset_name = dataset_name,
                                          results_file_prefix_name = results_file_root_name,
                                          other_algorithm_list = [SpectralCF_RecommenderWrapper],
                                          n_validation_users = n_validation_users,
                                          n_test_users = n_test_users,
                                          n_decimals = 2)

        if cold_start:
            cutoffs_to_report_list = [20]
        else:
            cutoffs_to_report_list = [20, 40, 60, 80, 100]

        print_results_latex_table(result_folder_path = output_folder_path,
                                  results_file_prefix_name = results_file_root_name,
                                  dataset_name = dataset_name,
                                  metrics_to_report_list = ["RECALL", "MAP"],
                                  cutoffs_to_report_list = cutoffs_to_report_list,
                                  other_algorithm_list = [SpectralCF_RecommenderWrapper])
# Hold out 20% of the interactions for evaluating the hybrid.
URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)

# Base models trained on the TRAIN split; their item-item similarity
# matrices (W_sparse) feed the similarity hybrid below.
itemCF_recommender = ItemKNNCFRecommender(URM_train)
itemCF_recommender.fit(**itemCFParam)
slim_recommender = SLIM_BPR_Cython(URM_train, recompile_cython=False)
slim_recommender.fit(**slimParam)
p3_recommender = P3alphaRecommender(URM_train)
p3_recommender.fit(**p3Param)

# Hybrid = weighted combination (alpha1..alpha3) of the three similarities.
recommender1 = SimilarityHybridRecommender(URM_train, itemCF_recommender.W_sparse,
                                           slim_recommender.W_sparse, p3_recommender.W_sparse)
recommender1.fit(topK=100, alpha1=alpha1, alpha2=alpha2, alpha3=alpha3)

evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10])
eval_res = evaluator_validation.evaluateRecommender(recommender1)
MAP = eval_res[0][10]['MAP']
print("The MAP in one test is: ", MAP)

# Re-train everything on ALL the data for the final model.
itemCF_recommender = ItemKNNCFRecommender(URM_all)
itemCF_recommender.fit(**itemCFParam)
slim_recommender = SLIM_BPR_Cython(URM_all, recompile_cython=False)
slim_recommender.fit(**slimParam)
p3_recommender = P3alphaRecommender(URM_all)
p3_recommender.fit(**p3Param)
recommender1 = SimilarityHybridRecommender(URM_all, itemCF_recommender.W_sparse,
                                           slim_recommender.W_sparse, p3_recommender.W_sparse)
recommender1.fit(topK=100, alpha1=alpha1, alpha2=alpha2, alpha3=alpha3)
ItemKNNCFRecommender, P3alphaRecommender, RP3betaRecommender, SLIM_BPR_Cython, SLIMElasticNetRecommender, MatrixFactorization_BPR_Cython, MatrixFactorization_FunkSVD_Cython, MatrixFactorization_AsySVD_Cython, PureSVDRecommender, IALSRecommender, ] from Base.Evaluation.Evaluator import EvaluatorHoldout evaluator = EvaluatorHoldout(URM_test, [5, 20], exclude_seen=True) output_root_path = "./result_experiments/" # If directory does not exist, create if not os.path.exists(output_root_path): os.makedirs(output_root_path) logFile = open(output_root_path + "result_all_algorithms.txt", "a") for recommender_class in recommender_list: try: