def main():
    # Data loading
    root_data_path = os.path.join(get_project_root_path(), "data/")
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=K_OUT, use_validation_set=False,
                                               allow_cold_users=ALLOW_COLD_USERS, force_new_split=True,
                                               seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    ICM_all, _ = get_ICM_train_new(data_reader)
    UCM_all, _ = get_UCM_train_new(data_reader)

    # Ignoring users
    ignore_users = get_ignore_users(URM_train, data_reader.get_original_user_id_to_index_mapper(),
                                    lower_threshold=LOWER_THRESHOLD, upper_threshold=UPPER_THRESHOLD,
                                    ignore_non_target_users=IGNORE_NON_TARGET_USERS)
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=[CUTOFF], ignore_users=ignore_users)

    # Model evaluation
    model = get_model(URM_train, ICM_all, UCM_all)
    print(evaluator.evaluateRecommender(model))
def fit(self, num_models=5, hyper_parameters_range=None):
    if hyper_parameters_range is None:
        hyper_parameters_range = {}

    # One RNG seed per bagged model so each hyper-parameter draw is reproducible
    np.random.seed(get_split_seed())
    seeds = np.random.randint(low=0, high=2**32 - 1, size=num_models)

    for i in tqdm(range(num_models), desc="Fitting bagging models"):
        recommender_kwargs = self.recommender_constr_kwargs.copy()

        URM_bootstrap = self.URM_train
        if self.do_bootstrap:
            URM_bootstrap, added_user = get_user_bootstrap(self.URM_train)
            # Keep the UCM aligned with the bootstrapped URM by appending the
            # rows of the resampled users
            for name, value in recommender_kwargs.items():
                if name == "UCM_train":
                    UCM_object = recommender_kwargs[name]
                    recommender_kwargs[name] = sps.vstack([UCM_object, UCM_object[added_user, :]],
                                                          format="csr")

        # Sample one hyper-parameter configuration for this model
        parameters = {}
        for parameter_name, parameter_range in hyper_parameters_range.items():
            parameters[parameter_name] = parameter_range.rvs(random_state=seeds[i])

        block_print()
        recommender_object = self.recommender_class(URM_bootstrap, **recommender_kwargs)
        recommender_object.fit(**parameters)
        enable_print()

        self.models.append(recommender_object)
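# Usage sketch for the fit above: hyper_parameters_range values must expose a
# scipy.stats-style rvs(random_state=...) method, so frozen distributions can be
# passed in directly. A minimal, self-contained illustration of how one
# configuration is sampled per bagged model (the parameter names are assumptions):
from scipy.stats import randint, uniform

hp_range = {"topK": randint(low=10, high=800),
            "shrink": uniform(loc=0, scale=100)}
seed_i = 42  # stands in for seeds[i] above
sampled = {name: dist.rvs(random_state=seed_i) for name, dist in hp_range.items()}
print(sampled)  # one reproducible draw, e.g. {'topK': ..., 'shrink': ...}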
def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--reader_path", default=get_root_data_path(),
                        help="path to the root of data files")
    parser.add_argument("-r", "--recommender_name", required=True,
                        help="recommender name should be one of: {}".format(list(RECOMMENDER_CLASS_DICT.keys())))
    parser.add_argument("-n", "--n_cases", default=N_CASES, type=int,
                        help="number of cases for hyper parameter tuning")
    parser.add_argument("-nr", "--n_random_starts", default=N_RANDOM_STARTS, type=int,
                        help="number of random starts for hyper parameter tuning")
    parser.add_argument("-p", "--parallelize", default=1, type=str2bool,
                        help="1 to parallelize the search, 0 otherwise")
    parser.add_argument("-ut", "--upper_threshold", default=MAX_UPPER_THRESHOLD, type=int,
                        help="upper threshold (included) of user profile length to validate")
    parser.add_argument("-lt", "--lower_threshold", default=MIN_LOWER_THRESHOLD, type=int,
                        help="lower threshold (included) of user profile length to validate")
    parser.add_argument("-acu", "--allow_cold_users", default=0, type=str2bool,
                        help="1 to allow cold users, 0 otherwise")
    parser.add_argument("-ent", "--exclude_non_target", default=1, type=str2bool,
                        help="1 to exclude non-target users, 0 otherwise")
    parser.add_argument("--seed", default=get_split_seed(), type=int,
                        help="seed for the experiment")
    return parser.parse_args()
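# The parsers above pass type=str2bool; that helper lives in the project's
# utilities and its exact body is not shown here. A typical sketch of such a
# converter (an assumption, not the repo's verbatim code):
import argparse

def str2bool(value):
    # Accept bools untouched and map common truthy/falsy CLI strings
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if value.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, got {!r}".format(value))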
def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("-nrs", "--n_random_starts", default=N_RANDOM_STARTS, type=int,
                        help="number of random starts")
    parser.add_argument("-l", "--reader_path", default="../../data/",
                        help="path to the root of data files")
    parser.add_argument("-r", "--recommender_name", required=True,
                        help="recommender name should be one of: {}".format(list(RECOMMENDER_CLASS_DICT.keys())))
    parser.add_argument("-n", "--n_cases", default=N_CASES, type=int,
                        help="number of cases for hyperparameter tuning")
    parser.add_argument("-d", "--discretize", default=False, type=str2bool,
                        help="if true, it will discretize the ICMs")
    parser.add_argument("--seed", default=get_split_seed(), type=int,
                        help="seed used in splitting the dataset")
    parser.add_argument("-foh", "--focus_on_high", default=0, type=int,
                        help="focus the tuning only on users with profile "
                             "lengths larger than the one specified here")
    parser.add_argument("-eu", "--exclude_users", default=False, type=str2bool,
                        help="1 to exclude cold users, 0 otherwise")
    parser.add_argument("-fol", "--focus_on_low", default=0, type=int,
                        help="focus the tuning only on users with profile "
                             "lengths smaller than the one specified here")
    return parser.parse_args()
def fit(self, num_factors=100, n_oversamples=10, n_iter=4, feature_weighting="none",
        random_seed=get_split_seed()):
    self._print("Computing SVD decomposition...")

    self.URM_train = apply_feature_weighting(self.URM_train, feature_weighting)
    U, Sigma, VT = randomized_svd(self.URM_train,
                                  n_oversamples=n_oversamples,
                                  n_iter=n_iter,
                                  n_components=num_factors,
                                  random_state=random_seed)

    # Fold the singular values into the item factors: URM ~ USER_factors * ITEM_factors^T
    s_Vt = sps.diags(Sigma) * VT
    self.USER_factors = U
    self.ITEM_factors = s_Vt.T

    self._print("Computing SVD decomposition... Done!")
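# Sketch of what the factors computed above represent: randomized_svd gives
# URM_train ~ U * diag(Sigma) * VT, so a user's estimated ratings are the dot
# product of their row of USER_factors with ITEM_factors. Self-contained toy run:
import numpy as np
import scipy.sparse as sps
from sklearn.utils.extmath import randomized_svd

URM_toy = sps.random(50, 40, density=0.10, format="csr", random_state=1234)
U, Sigma, VT = randomized_svd(URM_toy, n_components=5, n_oversamples=10,
                              n_iter=4, random_state=1234)
USER_factors = U
ITEM_factors = (sps.diags(Sigma) * VT).T
scores_user_0 = USER_factors[0].dot(ITEM_factors.T)  # estimated ratings for user 0
print(scores_user_0.shape)  # (40,)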
from course_lib.Base.Evaluation.Evaluator import EvaluatorHoldout
from src.data_management.New_DataSplitter_leave_k_out import New_DataSplitter_leave_k_out
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.data_management.dataframe_preprocesser import get_preprocessed_dataframe
from src.tuning.holdout_validation.run_parameter_search_advanced_top_pop import run_parameter_search_advanced_top_pop
from src.utils.general_utility_functions import get_split_seed

if __name__ == '__main__':
    # Data loading
    data_reader = RecSys2019Reader("../../data/")
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=3, use_validation_set=False,
                                               force_new_split=True, seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    mapper = data_reader.get_original_user_id_to_index_mapper()
    df = get_preprocessed_dataframe("../../data/", keep_warm_only=True)

    # Setting evaluator
    # warm_users_mask = np.ediff1d(URM_train.tocsr().indptr) > 0
    # warm_users = np.arange(URM_train.shape[0])[warm_users_mask]
    # ignore_users = warm_users
    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list)

    # HP tuning
    print("Start tuning...")
    version_path = "../../report/hp_tuning/advanced_top_pop/"
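# The commented-out mask above derives each user's profile length from the CSR
# indptr array; a self-contained illustration of that trick:
import numpy as np
import scipy.sparse as sps

URM_toy = sps.csr_matrix(np.array([[1, 0, 1],
                                   [0, 0, 0],
                                   [0, 1, 0]]))
profile_lengths = np.ediff1d(URM_toy.indptr)                   # [2, 0, 1]
warm_users = np.arange(URM_toy.shape[0])[profile_lengths > 0]
print(warm_users)  # [0 2] -- user 1 has an empty profile and would be ignored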
def fit(self, topK=-1, num_models=5, hyper_parameters_range=None, seed=get_split_seed()):
    self.topK = topK
    super().fit(num_models, hyper_parameters_range, seed=seed)
if SAVE_ON_FILE:  # assumed guard flag: the enclosing condition is cut off in this fragment
    output_file_name = output_folder_path + "results.txt"
    try:
        if not os.path.exists(output_folder_path):
            os.mkdir(output_folder_path)
    except FileNotFoundError as e:
        os.makedirs(output_folder_path)
    f = open(output_file_name, "w")
else:
    f = None

# Data loading
root_data_path = "../../data/"
data_reader = RecSys2019Reader(root_data_path)
data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=1, use_validation_set=False,
                                           force_new_split=True, seed=get_split_seed())
data_reader.load_data()
URM_train, URM_test = data_reader.get_holdout_split()
ICM_all = get_ICM_train(data_reader)
UCM_all = get_UCM_train(data_reader)

UCM_age = data_reader.get_UCM_from_name("UCM_age")
age_feature_to_id_mapper = data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name("UCM_age")
age_demographic = get_user_demographic(UCM_age, age_feature_to_id_mapper, binned=True)

ICM_subclass = data_reader.get_ICM_from_name("ICM_sub_class")
subclass_feature_to_id_mapper = data_reader.dataReader_object.get_ICM_feature_to_index_mapper_from_name(
    "ICM_sub_class")
subclass_content_dict = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=True)
subclass_content = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=False)
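# Design note on the directory creation above: the try/except ladder can be
# collapsed into a single standard-library call (plain os, no repo API involved):
import os

os.makedirs("path/to/output_folder", exist_ok=True)  # creates intermediate dirs, no error if present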
                              global_normalization=False)
    all_models = _get_all_models(URM_train=URM_train, ICM_train=ICM_train, UCM_train=UCM_train)
    for model_name, model_object in all_models.items():
        model.add_fitted_model(model_name, model_object)
    print("The models added in the hybrid are: {}".format(list(all_models.keys())))
    return model


if __name__ == '__main__':
    set_env_variables()
    seeds = get_seed_lists(N_FOLDS, get_split_seed())

    # --------- DATA LOADING SECTION --------- #
    URM_train_list = []
    ICM_train_list = []
    UCM_train_list = []
    evaluator_list = []
    model_list = []

    for fold_idx in range(N_FOLDS):
        # Read and split data
        data_reader = read_split_load_data(K_OUT, ALLOW_COLD_USERS, seeds[fold_idx])
        URM_train, URM_test = data_reader.get_holdout_split()
        ICM_train, item_feature2range = get_ICM_train_new(data_reader)
        UCM_train, user_feature2range = get_UCM_train_new(data_reader)
def main():
    set_env_variables()
    args = get_arguments()
    seeds = get_seed_lists(args.n_folds, get_split_seed())

    # --------- DATA LOADING SECTION --------- #
    URM_train_list = []
    ICM_train_list = []
    UCM_train_list = []
    evaluator_list = []

    for fold_idx in range(args.n_folds):
        # Read and split data
        data_reader = read_split_load_data(K_OUT, args.allow_cold_users, seeds[fold_idx])
        URM_train, URM_test = data_reader.get_holdout_split()
        ICM_train, item_feature2range = get_ICM_train_new(data_reader)
        UCM_train, user_feature2range = get_UCM_train_new(data_reader)

        # Ignore users and setting evaluator
        ignore_users = get_ignore_users(URM_train, data_reader.get_original_user_id_to_index_mapper(),
                                        args.lower_threshold, args.upper_threshold,
                                        ignore_non_target_users=args.exclude_non_target)

        # Ignore users by age
        # UCM_age = data_reader.get_UCM_from_name("UCM_age")
        # age_feature_to_id_mapper = data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name("UCM_age")
        # age_demographic = get_user_demographic(UCM_age, age_feature_to_id_mapper, binned=True)
        # ignore_users = np.unique(np.concatenate((ignore_users, get_ignore_users_age(age_demographic, AGE_TO_KEEP))))

        URM_train_list.append(URM_train)
        ICM_train_list.append(ICM_train)
        UCM_train_list.append(UCM_train)

        evaluator = EvaluatorHoldout(URM_test, cutoff_list=[CUTOFF], ignore_users=np.unique(ignore_users))
        evaluator_list.append(evaluator)

    # --------- HYPER PARAMETERS TUNING SECTION --------- #
    print("Start tuning...")

    hp_tuning_path = "../../../report/hp_tuning/" + args.recommender_name + "/"
    date_string = datetime.now().strftime('%b%d_%H-%M-%S_k1_lt_{}/'.format(args.lower_threshold))
    output_folder_path = hp_tuning_path + date_string

    if args.recommender_name in COLLABORATIVE_RECOMMENDER_CLASS_DICT.keys():
        run_cv_parameter_search(URM_train_list=URM_train_list,
                                recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                evaluator_validation_list=evaluator_list,
                                metric_to_optimize="MAP",
                                output_folder_path=output_folder_path,
                                parallelize_search=args.parallelize,
                                n_jobs=args.n_jobs,
                                n_cases=args.n_cases,
                                n_random_starts=args.n_random_starts)
    elif args.recommender_name in CONTENT_RECOMMENDER_CLASS_DICT.keys():
        run_cv_parameter_search(URM_train_list=URM_train_list,
                                ICM_train_list=ICM_train_list,
                                ICM_name="ICM_all",
                                recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                evaluator_validation_list=evaluator_list,
                                metric_to_optimize="MAP",
                                output_folder_path=output_folder_path,
                                parallelize_search=args.parallelize,
                                n_jobs=args.n_jobs,
                                n_cases=args.n_cases,
                                n_random_starts=args.n_random_starts)
    elif args.recommender_name in DEMOGRAPHIC_RECOMMENDER_CLASS_DICT.keys():
        run_cv_parameter_search(URM_train_list=URM_train_list,
                                UCM_train_list=UCM_train_list,
                                UCM_name="UCM_all",
                                recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                evaluator_validation_list=evaluator_list,
                                metric_to_optimize="MAP",
                                output_folder_path=output_folder_path,
                                parallelize_search=args.parallelize,
                                n_jobs=args.n_jobs,
                                n_cases=args.n_cases,
                                n_random_starts=args.n_random_starts)
    elif args.recommender_name in SIDE_INFO_CLASS_DICT:
        temp_list = []
        for i, URM in enumerate(URM_train_list):
            temp = sps.vstack([URM, ICM_train_list[i].T], format="csr")
            # temp = TF_IDF(temp).tocsr()
            temp_list.append(temp)
        run_cv_parameter_search(URM_train_list=temp_list,
                                recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                evaluator_validation_list=evaluator_list,
                                metric_to_optimize="MAP",
                                output_folder_path=output_folder_path,
                                parallelize_search=args.parallelize,
                                n_jobs=args.n_jobs,
                                n_cases=args.n_cases,
                                n_random_starts=args.n_random_starts)

    print("...tuning ended")
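# The SIDE_INFO branch above stacks the transposed ICM under the URM so that
# item features contribute as additional pseudo-users; a toy illustration of
# the shapes involved (toy matrices, not the challenge data):
import scipy.sparse as sps

URM_toy = sps.random(100, 20, density=0.10, format="csr", random_state=0)  # users x items
ICM_toy = sps.random(20, 5, density=0.30, format="csr", random_state=0)    # items x features
augmented = sps.vstack([URM_toy, ICM_toy.T], format="csr")
print(augmented.shape)  # (105, 20): 5 feature rows appended as pseudo-users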
try:
    if not os.path.exists(output_folder_path):
        os.mkdir(output_folder_path)
except FileNotFoundError as e:
    os.makedirs(output_folder_path)
f = open(output_file_name, "w")

# Data loading
root_data_path = "../../data/"
data_reader = RecSys2019Reader(root_data_path)
data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=3, use_validation_set=False,
                                           force_new_split=True, seed=get_split_seed())
data_reader.load_data()
URM_train, URM_test = data_reader.get_holdout_split()
ICM_all, _ = get_ICM_train_new(data_reader)
UCM_all = get_UCM_train(data_reader)

dataframe_path = "../../resources/boosting_dataframe/"
train_df = pd.read_csv(dataframe_path + "train_df_100_advanced_lt_20.csv")
valid_df = pd.read_csv(dataframe_path + "valid_df_30_advanced_lt_20.csv")

train_df = preprocess_dataframe_after_reading(train_df)
train_df_with_labels = train_df.copy()
train_df = train_df.drop(columns=["label"], inplace=False)
valid_df = preprocess_dataframe_after_reading(valid_df)

print("Retrieving training labels...", end="")
y_train, non_zero_count, total = get_label_array(data_frame=train_df, URM_train=URM_train)
print("Done")
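# get_label_array is a project helper; conceptually it looks up each
# (user, item) pair of the dataframe in the URM and returns binary labels.
# Minimal sketch under that assumption (toy data, guessed column names):
import numpy as np
import pandas as pd
import scipy.sparse as sps

URM_toy = sps.csr_matrix(np.array([[1, 0],
                                   [0, 1]]))
df_toy = pd.DataFrame({"user_id": [0, 0, 1], "item_id": [0, 1, 1]})
labels = np.asarray(URM_toy[df_toy["user_id"].to_numpy(),
                            df_toy["item_id"].to_numpy()]).ravel()
print(labels)  # [1 0 1]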