def __init__(self):
    """Load the Movielens100K split used for the MCRec (KDD) experiments.

    First attempts to load a previously saved split from
    ``pre_splitted_path``; if not found (FileNotFoundError), rebuilds it from
    the original authors' data files, derives train/validation/test/negative
    URMs plus a genre ICM, and caches everything with ``save_data_dict`` so
    subsequent runs take the fast path.
    """
    super(Movielens100KReader, self).__init__()

    pre_splitted_path = "Data_manager_split_datasets/Movielens100K/KDD/MCRec_our_interface/"
    pre_splitted_filename = "splitted_data"

    # Folder shipped with the original MCRec source code
    original_data_path = "Conferences/KDD/MCRec_github/data/"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Movielens100KReader: Attempting to load pre-splitted data")

        # Restore every cached attribute (URM_train, URM_test, ...) onto self
        for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:
        # Cache miss: rebuild the split from the original authors' data
        print("Movielens100KReader: Pre-splitted data not found, building new one")

        print("Movielens100KReader: loading URM")

        from Conferences.KDD.MCRec_github.code.Dataset import Dataset

        dataset = 'ml-100k'
        dataset = Dataset(original_data_path + dataset)

        URM_train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives

        # Dataset adds 1 to user and item id, removing it to restore 0 indexing
        URM_train = sps.coo_matrix(URM_train)
        URM_train.row -= 1
        URM_train.col -= 1

        # Binarize interactions: implicit feedback, all nonzeros become 1
        self.URM_train = sps.csr_matrix((np.ones_like(URM_train.data), (URM_train.row, URM_train.col)))

        num_users, num_items = self.URM_train.shape

        # Build sparse matrices from lists
        URM_test_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
        URM_test_negative_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)

        # testRatings rows are [user_id, item_id, ...]; testNegatives rows are
        # the sampled negative items for the same user index
        for user_index in range(len(testRatings)):

            user_id = testRatings[user_index][0]

            current_user_test_items = testRatings[user_index][1:]
            current_user_test_negative_items = testNegatives[user_index]

            # Same 1-based -> 0-based correction as for the train matrix
            current_user_test_items = np.array(current_user_test_items) - 1
            current_user_test_negative_items = np.array(current_user_test_negative_items) - 1

            URM_test_builder.add_single_row(user_id - 1, current_user_test_items, 1.0)
            URM_test_negative_builder.add_single_row(user_id - 1, current_user_test_negative_items, 1.0)

        # the test data has repeated data, apparently
        self.URM_test = URM_test_builder.get_SparseMatrix()
        self.URM_test_negative = URM_test_negative_builder.get_SparseMatrix()

        # Split validation from train as 10%
        from Data_manager.split_functions.split_train_validation import split_train_validation_percentage_user_wise

        self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(self.URM_train, train_percentage=0.9)

        # Load features: extract the genre ICM from the original ml-100k zip
        data_reader = Movielens100KReader_DataManager()
        data_reader.load_data()

        zipFile_path = data_reader.DATASET_SPLIT_ROOT_FOLDER + data_reader.DATASET_SUBFOLDER

        dataFile = zipfile.ZipFile(zipFile_path + "ml-100k.zip")

        ICM_path = dataFile.extract("ml-100k/u.item", path=zipFile_path + "decompressed/")

        ICM_genre = self._loadICM(ICM_path)
        ICM_genre = ICM_genre.get_SparseMatrix()

        # Remove the temporary extraction folder; ignore_errors keeps this best-effort
        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        self.ICM_dict = {"ICM_genre": ICM_genre}

        # Persist the split so future constructions hit the fast path above
        data_dict = {
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_validation": self.URM_validation,
            "URM_test_negative": self.URM_test_negative,
            "ICM_dict": self.ICM_dict,
        }

        save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

        print("Movielens100KReader: loading complete")
# --- MCRec original training-script fragment (truncated in this view) ---
# Hyperparameters matching the MCRec article defaults.
layers = [512, 256, 128, 64]
reg_layes = [0, 0, 0, 0]
learning_rate = 0.001
epochs = 30
batch_size = 256
num_negatives = 4
learner = 'adam'
verbose = 1
out = 0
evaluation_threads = 1
topK = 10
print('num_negatives = ', num_negatives)
t1 = time()

# NOTE(review): `dataset` is read here before any assignment visible in this
# chunk — presumably a dataset-name string defined earlier; verify upstream.
dataset = Dataset('../data/' + dataset)
trainMatrix, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
train = dataset.train
user_item_map = dataset.user_item_map
item_user_map = dataset.item_user_map

# Pre-computed meta-path instances (user-movie-type-movie, user-movie-user-movie, ...)
path_umtm = dataset.path_umtm
path_umum = dataset.path_umum
path_umtmum = dataset.path_umtmum
path_uuum = dataset.path_uuum
user_feature, item_feature, type_feature = dataset.user_feature, dataset.item_feature, dataset.type_feature
num_users, num_items = trainMatrix.shape[0], trainMatrix.shape[1]

path_nums = [
    dataset.umtm_path_num, dataset.umum_path_num, dataset.umtmum_path_num,
    dataset.uuum_path_num
]
# NOTE(review): this list literal is truncated at the end of this chunk.
timestamps = [
def read_data_split_and_search(dataset_name,
                               flag_baselines_tune=False,
                               flag_DL_article_default=False,
                               flag_DL_tune=False,
                               flag_print_results=False):
    """Run the full MCRec reproducibility experiment for one dataset.

    Loads the pre-built data split, then depending on the flags:
    tunes the collaborative/content/hybrid baselines, runs the MCRec
    article configuration, and/or generates the LaTeX result tables.

    :param dataset_name: only "movielens100k" is supported here.
    :param flag_baselines_tune: tune baseline recommenders via Bayesian search.
    :param flag_DL_article_default: run MCRec with the article hyperparameters.
    :param flag_DL_tune: accepted but unused in this function body.
    :param flag_print_results: write LaTeX result/time tables from saved runs.
    """

    from Conferences.KDD.MCRec_our_interface.Movielens100K.Movielens100KReader import Movielens100KReader

    result_folder_path = "result_experiments/{}/{}_{}/".format(
        CONFERENCE_NAME, ALGORITHM_NAME, dataset_name)

    if dataset_name == "movielens100k":
        dataset = Movielens100KReader(result_folder_path)

    URM_train = dataset.URM_DICT["URM_train"].copy()
    URM_validation = dataset.URM_DICT["URM_validation"].copy()
    URM_test = dataset.URM_DICT["URM_test"].copy()
    URM_test_negative = dataset.URM_DICT["URM_test_negative"].copy()

    # Ensure IMPLICIT data and DISJOINT sets
    assert_implicit_data(
        [URM_train, URM_validation, URM_test, URM_test_negative])
    assert_disjoint_matrices(
        [URM_train, URM_validation, URM_test, URM_test_negative])

    # If directory does not exist, create
    if not os.path.exists(result_folder_path):
        os.makedirs(result_folder_path)

    algorithm_dataset_string = "{}_{}_".format(ALGORITHM_NAME, dataset_name)

    plot_popularity_bias([URM_train + URM_validation, URM_test],
                         ["URM train", "URM test"], result_folder_path +
                         algorithm_dataset_string + "popularity_plot")

    save_popularity_statistics([URM_train + URM_validation, URM_test],
                               ["URM train", "URM test"],
                               result_folder_path + algorithm_dataset_string +
                               "popularity_statistics")

    from Base.Evaluation.Evaluator import EvaluatorNegativeItemSample

    # Evaluation follows the article protocol: rank the positive item against
    # the per-user sampled negatives, cutoff 10.
    evaluator_validation = EvaluatorNegativeItemSample(URM_validation,
                                                      URM_test_negative,
                                                      cutoff_list=[10])
    evaluator_test = EvaluatorNegativeItemSample(URM_test,
                                                URM_test_negative,
                                                cutoff_list=[10])

    collaborative_algorithm_list = [
        Random,
        TopPop,
        UserKNNCFRecommender,
        ItemKNNCFRecommender,
        P3alphaRecommender,
        RP3betaRecommender,
        PureSVDRecommender,
        NMFRecommender,
        IALSRecommender,
        MatrixFactorization_BPR_Cython,
        MatrixFactorization_FunkSVD_Cython,
        EASE_R_Recommender,
        SLIM_BPR_Cython,
        SLIMElasticNetRecommender,
    ]

    metric_to_optimize = "PRECISION"
    n_cases = 50
    n_random_starts = 15

    # Pre-bind all arguments common to every collaborative baseline search
    runParameterSearch_Collaborative_partial = partial(
        runParameterSearch_Collaborative,
        URM_train=URM_train,
        URM_train_last_test=URM_train + URM_validation,
        metric_to_optimize=metric_to_optimize,
        evaluator_validation_earlystopping=evaluator_validation,
        evaluator_validation=evaluator_validation,
        evaluator_test=evaluator_test,
        output_folder_path=result_folder_path,
        parallelizeKNN=False,
        allow_weighting=True,
        resume_from_saved=True,
        n_cases=n_cases,
        n_random_starts=n_random_starts)

    if flag_baselines_tune:

        for recommender_class in collaborative_algorithm_list:
            try:
                runParameterSearch_Collaborative_partial(recommender_class)
            except Exception as e:
                # Best-effort: log and continue so one failing baseline does
                # not abort the whole tuning campaign
                print("On recommender {} Exception {}".format(
                    recommender_class, str(e)))
                traceback.print_exc()

        ################################################################################################
        ###### Content Baselines

        for ICM_name, ICM_object in dataset.ICM_DICT.items():

            try:
                runParameterSearch_Content(
                    ItemKNNCBFRecommender,
                    URM_train=URM_train,
                    URM_train_last_test=URM_train + URM_validation,
                    metric_to_optimize=metric_to_optimize,
                    evaluator_validation=evaluator_validation,
                    evaluator_test=evaluator_test,
                    output_folder_path=result_folder_path,
                    parallelizeKNN=False,
                    allow_weighting=True,
                    resume_from_saved=True,
                    ICM_name=ICM_name,
                    ICM_object=ICM_object.copy(),
                    n_cases=n_cases,
                    n_random_starts=n_random_starts)

            except Exception as e:
                print("On CBF recommender for ICM {} Exception {}".format(
                    ICM_name, str(e)))
                traceback.print_exc()

        ################################################################################################
        ###### Hybrid

        for ICM_name, ICM_object in dataset.ICM_DICT.items():

            try:
                runParameterSearch_Hybrid(
                    ItemKNN_CFCBF_Hybrid_Recommender,
                    URM_train=URM_train,
                    URM_train_last_test=URM_train + URM_validation,
                    metric_to_optimize=metric_to_optimize,
                    evaluator_validation=evaluator_validation,
                    evaluator_test=evaluator_test,
                    output_folder_path=result_folder_path,
                    parallelizeKNN=False,
                    allow_weighting=True,
                    resume_from_saved=True,
                    ICM_name=ICM_name,
                    ICM_object=ICM_object.copy(),
                    n_cases=n_cases,
                    n_random_starts=n_random_starts)

            except Exception as e:
                print("On recommender {} Exception {}".format(
                    ItemKNN_CFCBF_Hybrid_Recommender, str(e)))
                traceback.print_exc()

    ################################################################################################
    ######
    ######      DL ALGORITHM
    ######

    if flag_DL_article_default:

        if dataset_name == "movielens100k":
            """
            The code provided by the original authors of MCRec can be used only for the original data.
            Here I am passing to the Wrapper the URM_train matrix that is only required for its shape,
            the train will be done using the preprocessed data the original authors provided
            """
            from Conferences.KDD.MCRec_github.code.Dataset import Dataset
            original_dataset_reader = Dataset(
                'Conferences/KDD/MCRec_github/data/' + 'ml-100k')

        # Hyperparameters as reported in the MCRec article
        MCRec_article_hyperparameters = {
            "epochs": 200,
            "latent_dim": 128,
            "reg_latent": 0,
            "layers": [512, 256, 128, 64],
            "reg_layes": [0, 0, 0, 0],
            "learning_rate": 1e-3,
            "batch_size": 256,
            "num_negatives": 4,
        }

        MCRec_earlystopping_hyperparameters = {
            "validation_every_n": 5,
            "stop_on_validation": True,
            "evaluator_object": evaluator_validation,
            "lower_validations_allowed": 5,
            "validation_metric": metric_to_optimize
        }

        parameterSearch = SearchSingleCase(
            MCRecML100k_RecommenderWrapper,
            evaluator_validation=evaluator_validation,
            evaluator_test=evaluator_test)

        recommender_input_args = SearchInputRecommenderArgs(
            CONSTRUCTOR_POSITIONAL_ARGS=[
                URM_train, original_dataset_reader
            ],
            FIT_KEYWORD_ARGS=MCRec_earlystopping_hyperparameters)

        # For the final evaluation the model is retrained on train+validation
        recommender_input_args_last_test = recommender_input_args.copy()
        recommender_input_args_last_test.CONSTRUCTOR_POSITIONAL_ARGS[
            0] = URM_train + URM_validation

        parameterSearch.search(
            recommender_input_args,
            recommender_input_args_last_test=
            recommender_input_args_last_test,
            fit_hyperparameters_values=MCRec_article_hyperparameters,
            output_folder_path=result_folder_path,
            resume_from_saved=True,
            output_file_name_root=MCRecML100k_RecommenderWrapper.
            RECOMMENDER_NAME)

    ################################################################################################
    ######
    ######      PRINT RESULTS
    ######

    if flag_print_results:

        # Users with at least one test interaction
        n_test_users = np.sum(np.ediff1d(URM_test.indptr) >= 1)

        file_name = "{}..//{}_{}_".format(result_folder_path, ALGORITHM_NAME,
                                          dataset_name)

        ICM_names_to_report_list = list(dataset.ICM_DICT.keys())

        result_loader = ResultFolderLoader(
            result_folder_path,
            base_algorithm_list=None,
            other_algorithm_list=[MCRecML100k_RecommenderWrapper],
            KNN_similarity_list=KNN_similarity_to_report_list,
            ICM_names_list=ICM_names_to_report_list,
            UCM_names_list=None)

        result_loader.generate_latex_results(
            file_name + "{}_latex_results.txt".format("article_metrics"),
            metrics_list=["PRECISION", "RECALL", "NDCG"],
            cutoffs_list=[10],
            table_title=None,
            highlight_best=True)

        result_loader.generate_latex_results(
            file_name + "{}_latex_results.txt".format("all_metrics"),
            metrics_list=[
                "PRECISION", "RECALL", "MAP", "MRR", "NDCG", "F1", "HIT_RATE",
                "ARHR", "NOVELTY", "DIVERSITY_MEAN_INTER_LIST",
                "DIVERSITY_HERFINDAHL", "COVERAGE_ITEM", "DIVERSITY_GINI",
                "SHANNON_ENTROPY"
            ],
            cutoffs_list=[10],
            table_title=None,
            highlight_best=True)

        result_loader.generate_latex_time_statistics(
            file_name + "{}_latex_results.txt".format("time"),
            n_evaluation_users=n_test_users,
            table_title=None)
def fit(self,
        latent_dim=128,
        reg_latent=0,
        layers=[512, 256, 128, 64],
        reg_layes=[0, 0, 0, 0],
        learning_rate=0.001,
        epochs=30,
        batch_size=256,
        num_negatives=4,
        **earlystopping_kwargs):
    """Build and train the MCRec model on the authors' preprocessed ml-100k data.

    :param latent_dim: embedding size of the latent factors.
    :param reg_latent: L2 regularization on the latent embeddings.
    :param layers: sizes of the dense layers (NOTE(review): mutable default
        argument — shared across calls if mutated; kept for interface parity).
    :param reg_layes: per-layer regularization ("layes" typo kept — callers
        pass this keyword by name).
    :param learning_rate: Adam learning rate.
    :param epochs: maximum training epochs (early stopping may end sooner).
    :param batch_size: training batch size.
    :param num_negatives: negative samples per positive interaction.
    :param earlystopping_kwargs: forwarded to ``_train_with_early_stopping``.
    """
    self.latent_dim = latent_dim
    self.reg_latent = reg_latent
    self.layers = layers
    self.reg_layes = reg_layes
    self.learning_rate = learning_rate
    self.epochs = epochs
    self.batch_size = batch_size
    self.num_negatives = num_negatives

    dataset = 'ml-100k'

    t1 = time()
    # Load the meta-path data shipped with the original MCRec source code
    dataset = Dataset('Conferences/KDD/MCRec_github/data/' + dataset)
    trainMatrix, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives

    # # Replace train data with the train split passed as parameter
    # self._load_rating_file_as_map(self.URM_train, dataset)

    self._train = dataset.train
    self._user_item_map = dataset.user_item_map
    item_user_map = dataset.item_user_map

    # Pre-computed meta-path instances (user-movie-type-movie, etc.)
    path_umtm = dataset.path_umtm
    path_umum = dataset.path_umum
    path_umtmum = dataset.path_umtmum
    path_uuum = dataset.path_uuum
    user_feature, item_feature, type_feature = dataset.user_feature, dataset.item_feature, dataset.type_feature

    # Dataset adds 1 to user and item id, so n_users and n_items will be +1 greater than the correct value
    self.n_users, self.n_items = trainMatrix.shape

    self.path_nums = [
        dataset.umtm_path_num, dataset.umum_path_num, dataset.umtmum_path_num,
        dataset.uuum_path_num
    ]
    self.timestamps = [
        dataset.umtm_timestamp, dataset.umum_timestamp,
        dataset.umtmum_timestamp, dataset.uuum_timestamp
    ]
    self.length = dataset.fea_size

    # NOTE(review): this string literal was split across lines by extraction;
    # reconstructed as a single literal.
    print(
        "Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d" %
        (time() - t1, self.n_users, self.n_items, len(
            self._train), len(testRatings)))
    print('path nums = ', self.path_nums)
    print('timestamps = ', self.timestamps)

    print("MCRec_RecommenderWrapper: building model")
    self.model = get_model(self.n_users, self.n_items, self.path_nums,
                           self.timestamps, self.length, self.layers,
                           self.reg_layes, self.latent_dim, self.reg_latent)

    # Binary cross-entropy over positive vs sampled-negative interactions
    self.model.compile(optimizer=Adam(lr=learning_rate, decay=1e-4),
                       loss='binary_crossentropy')
    # model.compile(optimizer = Nadam(),
    #               loss = 'binary_crossentropy')

    # Check Init performance
    # t1 = time()
    # print("MCRec_RecommenderWrapper: evaluate_model")
    # (ps, rs, ndcgs) = evaluate_model(self.model, user_feature, item_feature, type_feature, self.n_users, self.n_items, path_umtm, path_umum, path_umtmum, path_uuum, path_nums, timestamps, length, testRatings, testNegatives, topK, evaluation_threads)
    # p, r, ndcg = np.array(ps).mean(), np.array(rs).mean(), np.array(ndcgs).mean()
    # print('Init: Precision = %.4f, Recall = %.4f, NDCG = %.4f [%.1f]' %(p, r, ndcg, time()-t1))
    # best_p = -1
    # p_list, r_list, ndcg_list = [], [], []
    # print('Begin training....')

    # Originally these were global variables
    self._testRatings = testRatings
    self._testNegatives = testNegatives
    self._path_umtm = path_umtm
    self._path_umum = path_umum
    self._path_umtmum = path_umtmum
    self._path_uuum = path_uuum
    self._path_nums = self.path_nums
    self._timestamps = self.timestamps
    self._length = self.length
    self._user_feature = user_feature
    self._item_feature = item_feature
    self._type_feature = type_feature
    self._features = [user_feature, item_feature, type_feature]

    # Keep a snapshot of the best weights seen during early stopping
    self._best_model = clone_model(self.model)
    self._best_model.set_weights(self.model.get_weights())

    self._train_with_early_stopping(epochs,
                                    algorithm_name=self.RECOMMENDER_NAME,
                                    **earlystopping_kwargs)

    print("MCRec_RecommenderWrapper: Tranining complete")

    # Restore the best model found by early stopping as the final model
    self.model = clone_model(self._best_model)
    self.model.set_weights(self._best_model.get_weights())