Example #1
def main():
    # Data loading
    root_data_path = os.path.join(get_project_root_path(), "data/")
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(
        data_reader,
        k_out_value=K_OUT,
        use_validation_set=False,
        allow_cold_users=ALLOW_COLD_USERS,
        force_new_split=True,
        seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    ICM_all, _ = get_ICM_train_new(data_reader)
    UCM_all, _ = get_UCM_train_new(data_reader)

    # Ignoring users
    ignore_users = get_ignore_users(
        URM_train,
        data_reader.get_original_user_id_to_index_mapper(),
        lower_threshold=LOWER_THRESHOLD,
        upper_threshold=UPPER_THRESHOLD,
        ignore_non_target_users=IGNORE_NON_TARGET_USERS)
    evaluator = EvaluatorHoldout(URM_test,
                                 cutoff_list=[CUTOFF],
                                 ignore_users=ignore_users)

    # Model evaluation
    model = get_model(URM_train, ICM_all, UCM_all)
    print(evaluator.evaluateRecommender(model))
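
The evaluator above skips the users returned by get_ignore_users. A minimal sketch of the profile-length part of such a filter, assuming the thresholds are inclusive bounds on the number of training interactions per user (the helper name is an assumption):

import numpy as np

def users_outside_profile_range(URM_train, lower_threshold, upper_threshold):
    # Interactions per user = row lengths of the CSR matrix
    profile_lengths = np.ediff1d(URM_train.tocsr().indptr)
    mask = (profile_lengths < lower_threshold) | (profile_lengths > upper_threshold)
    return np.arange(URM_train.shape[0])[mask]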
Example #2
    def fit(self, num_models=5, hyper_parameters_range=None):
        if hyper_parameters_range is None:
            hyper_parameters_range = {}

        np.random.seed(get_split_seed())
        seeds = np.random.randint(low=0, high=2**32 - 1, size=num_models)

        for i in tqdm(range(num_models), desc="Fitting bagging models"):
            recommender_kwargs = self.recommender_constr_kwargs.copy()
            URM_bootstrap = self.URM_train
            if self.do_bootstrap:
                # Bootstrap the users of the URM; keep the UCM consistent by
                # stacking the UCM rows of the users added by the resampling
                URM_bootstrap, added_users = get_user_bootstrap(self.URM_train)
                if "UCM_train" in recommender_kwargs:
                    UCM_object = recommender_kwargs["UCM_train"]
                    recommender_kwargs["UCM_train"] = sps.vstack(
                        [UCM_object, UCM_object[added_users, :]],
                        format="csr")

            # Draw one hyper-parameter configuration for this model; each value
            # of hyper_parameters_range must expose .rvs() (e.g. a scipy.stats
            # frozen distribution)
            parameters = {
                name: parameter_range.rvs(random_state=seeds[i])
                for name, parameter_range in hyper_parameters_range.items()
            }

            block_print()
            recommender_object = self.recommender_class(
                URM_bootstrap, **recommender_kwargs)
            recommender_object.fit(**parameters)
            enable_print()

            self.models.append(recommender_object)
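
Each value in hyper_parameters_range must expose an .rvs(random_state=...) method, which scipy.stats frozen distributions provide. A minimal usage sketch (the instance name and the parameter names are made up):

from scipy.stats import randint, uniform

bagging_recommender.fit(
    num_models=10,
    hyper_parameters_range={
        "topK": randint(low=10, high=500),     # integer-valued parameter
        "shrink": randint(low=0, high=1000),
        "alpha": uniform(loc=0.0, scale=1.0),  # continuous parameter in [0, 1]
    })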
Example #3
def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("-l",
                        "--reader_path",
                        default=get_root_data_path(),
                        help="path to the root of data files")
    parser.add_argument("-r",
                        "--recommender_name",
                        required=True,
                        help="recommender names should be one of: {}".format(
                            list(RECOMMENDER_CLASS_DICT.keys())))
    parser.add_argument("-n",
                        "--n_cases",
                        default=N_CASES,
                        type=int,
                        help="number of cases for hyper parameter tuning")
    parser.add_argument(
        "-nr",
        "--n_random_starts",
        default=N_RANDOM_STARTS,
        type=int,
        help="number of random starts for hyper parameter tuning")
    parser.add_argument("-p",
                        "--parallelize",
                        default=1,
                        type=str2bool,
                        help="1 to parallelize the search, 0 otherwise")
    parser.add_argument(
        "-ut",
        "--upper_threshold",
        default=MAX_UPPER_THRESHOLD,
        type=int,
        help="Upper threshold (included) of user profile length to validate")
    parser.add_argument(
        "-lt",
        "--lower_threshold",
        default=MIN_LOWER_THRESHOLD,
        type=int,
        help="Lower threshold (included) of user profile length to validate")
    parser.add_argument("-acu",
                        "--allow_cold_users",
                        default=0,
                        type=str2bool,
                        help="1 to allow cold users,"
                        " 0 otherwise")
    parser.add_argument("-ent",
                        "--exclude_non_target",
                        default=1,
                        type=str2bool,
                        help="1 to exclude non-target users, 0 otherwise")
    parser.add_argument("--seed",
                        default=get_split_seed(),
                        help="seed for the experiment",
                        type=int)

    return parser.parse_args()
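
str2bool is used as an argparse type here and in the next example but is not shown; a minimal sketch of such a helper, assuming the usual truthy/falsy-string convention:

import argparse

def str2bool(value):
    # Map common truthy/falsy strings to a bool for argparse's `type=`
    if isinstance(value, bool):
        return value
    if value.lower() in ("1", "true", "t", "yes", "y"):
        return True
    if value.lower() in ("0", "false", "f", "no", "n"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected, got {!r}".format(value))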
Example #4
def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("-nrs", "--n_random_starts", default=N_RANDOM_STARTS, help="Number of random starts")
    parser.add_argument("-l", "--reader_path", default="../../data/", help="path to the root of data files")
    parser.add_argument("-r", "--recommender_name", required=True,
                        help="recommender names should be one of: {}".format(list(RECOMMENDER_CLASS_DICT.keys())))
    parser.add_argument("-n", "--n_cases", default=N_CASES, help="number of cases for hyperparameter tuning")
    parser.add_argument("-d", "--discretize", default=False, help="if true, it will discretize the ICMs")
    parser.add_argument("--seed", default=get_split_seed(), help="seed used in splitting the dataset")
    parser.add_argument("-foh", "--focus_on_high", default=0, help="focus the tuning only on users with profile"
                                                                   "lengths larger than the one specified here")
    parser.add_argument("-eu", "--exclude_users", default=False, help="1 to exclude cold users, 0 otherwise")
    parser.add_argument("-fol", "--focus_on_low", default=0, help="focus the tuning only on users with profile"
                                                                  "lengths smaller than the one specified here")

    return parser.parse_args()
Example #5
    def fit(self,
            num_factors=100,
            n_oversamples=10,
            n_iter=4,
            feature_weighting="none",
            random_seed=get_split_seed()):
        self._print("Computing SVD decomposition...")

        self.URM_train = apply_feature_weighting(self.URM_train,
                                                 feature_weighting)

        U, Sigma, VT = randomized_svd(self.URM_train,
                                      n_oversamples=n_oversamples,
                                      n_iter=n_iter,
                                      n_components=num_factors,
                                      random_state=random_seed)

        s_Vt = sps.diags(Sigma) * VT

        self.USER_factors = U
        self.ITEM_factors = s_Vt.T

        self._print("Computing SVD decomposition... Done!")
Example #6
from course_lib.Base.Evaluation.Evaluator import EvaluatorHoldout
from src.data_management.New_DataSplitter_leave_k_out import New_DataSplitter_leave_k_out
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.data_management.dataframe_preprocesser import get_preprocessed_dataframe
from src.tuning.holdout_validation.run_parameter_search_advanced_top_pop import run_parameter_search_advanced_top_pop
from src.utils.general_utility_functions import get_split_seed

if __name__ == '__main__':
    # Data loading
    data_reader = RecSys2019Reader("../../data/")
    data_reader = New_DataSplitter_leave_k_out(data_reader,
                                               k_out_value=3,
                                               use_validation_set=False,
                                               force_new_split=True,
                                               seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    mapper = data_reader.get_original_user_id_to_index_mapper()
    df = get_preprocessed_dataframe("../../data/", keep_warm_only=True)

    # Setting evaluator
    # warm_users_mask = np.ediff1d(URM_train.tocsr().indptr) > 0
    # warm_users = np.arange(URM_train.shape[0])[warm_users_mask]
    # ignore_users = warm_users
    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list)

    # HP tuning
    print("Start tuning...")
    version_path = "../../report/hp_tuning/advanced_top_pop/"
Example #7
    def fit(self, topK=-1, num_models=5, hyper_parameters_range=None, seed=get_split_seed()):
        self.topK = topK
        super().fit(num_models, hyper_parameters_range, seed=seed)
Example #8
        output_file_name = output_folder_path + "results.txt"
        # Create the output folder, including any missing parent directories
        os.makedirs(output_folder_path, exist_ok=True)

        f = open(output_file_name, "w")
    else:
        f = None

    # Data loading
    root_data_path = "../../data/"
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=1, use_validation_set=False,
                                               force_new_split=True, seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    ICM_all = get_ICM_train(data_reader)
    UCM_all = get_UCM_train(data_reader)

    UCM_age = data_reader.get_UCM_from_name("UCM_age")
    age_feature_to_id_mapper = data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name("UCM_age")
    age_demographic = get_user_demographic(UCM_age, age_feature_to_id_mapper, binned=True)

    ICM_subclass = data_reader.get_ICM_from_name("ICM_sub_class")
    subclass_feature_to_id_mapper = data_reader.dataReader_object.get_ICM_feature_to_index_mapper_from_name(
        "ICM_sub_class")
    subclass_content_dict = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=True)
    subclass_content = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=False,
                                             global_normalization=False)

Example #9
    all_models = _get_all_models(URM_train=URM_train,
                                 ICM_train=ICM_train,
                                 UCM_train=UCM_train)
    for model_name, model_object in all_models.items():
        model.add_fitted_model(model_name, model_object)
    print("The models added in the hybrid are: {}".format(
        list(all_models.keys())))

    return model


if __name__ == '__main__':
    set_env_variables()
    seeds = get_seed_lists(N_FOLDS, get_split_seed())

    # --------- DATA LOADING SECTION --------- #
    URM_train_list = []
    ICM_train_list = []
    UCM_train_list = []
    evaluator_list = []
    model_list = []
    for fold_idx in range(N_FOLDS):
        # Read and split data
        data_reader = read_split_load_data(K_OUT, ALLOW_COLD_USERS,
                                           seeds[fold_idx])
        URM_train, URM_test = data_reader.get_holdout_split()
        ICM_train, item_feature2range = get_ICM_train_new(data_reader)
        UCM_train, user_feature2range = get_UCM_train_new(data_reader)
Example #10
def main():
    set_env_variables()
    args = get_arguments()
    seeds = get_seed_lists(args.n_folds, get_split_seed())

    # --------- DATA LOADING SECTION --------- #
    URM_train_list = []
    ICM_train_list = []
    UCM_train_list = []
    evaluator_list = []
    for fold_idx in range(args.n_folds):
        # Read and split data
        data_reader = read_split_load_data(K_OUT, args.allow_cold_users, seeds[fold_idx])
        URM_train, URM_test = data_reader.get_holdout_split()
        ICM_train, item_feature2range = get_ICM_train_new(data_reader)
        UCM_train, user_feature2range = get_UCM_train_new(data_reader)

        # Ignore users and setting evaluator
        ignore_users = get_ignore_users(URM_train, data_reader.get_original_user_id_to_index_mapper(),
                                        args.lower_threshold, args.upper_threshold,
                                        ignore_non_target_users=args.exclude_non_target)

        # Ignore users by age
        # UCM_age = data_reader.get_UCM_from_name("UCM_age")
        # age_feature_to_id_mapper = data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name("UCM_age")
        # age_demographic = get_user_demographic(UCM_age, age_feature_to_id_mapper, binned=True)
        # ignore_users = np.unique(np.concatenate((ignore_users, get_ignore_users_age(age_demographic, AGE_TO_KEEP))))

        URM_train_list.append(URM_train)
        ICM_train_list.append(ICM_train)
        UCM_train_list.append(UCM_train)

        evaluator = EvaluatorHoldout(URM_test, cutoff_list=[CUTOFF], ignore_users=np.unique(ignore_users))
        evaluator_list.append(evaluator)

    # --------- HYPER PARAMETERS TUNING SECTION --------- #
    print("Start tuning...")

    hp_tuning_path = "../../../report/hp_tuning/" + args.recommender_name + "/"
    date_string = datetime.now().strftime('%b%d_%H-%M-%S_k1_lt_{}/'.format(args.lower_threshold))
    output_folder_path = hp_tuning_path + date_string

    if args.recommender_name in COLLABORATIVE_RECOMMENDER_CLASS_DICT.keys():
        run_cv_parameter_search(URM_train_list=URM_train_list,
                                recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                evaluator_validation_list=evaluator_list,
                                metric_to_optimize="MAP", output_folder_path=output_folder_path,
                                parallelize_search=args.parallelize, n_jobs=args.n_jobs,
                                n_cases=args.n_cases, n_random_starts=args.n_random_starts)
    elif args.recommender_name in CONTENT_RECOMMENDER_CLASS_DICT.keys():
        run_cv_parameter_search(URM_train_list=URM_train_list, ICM_train_list=ICM_train_list, ICM_name="ICM_all",
                                recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                evaluator_validation_list=evaluator_list,
                                metric_to_optimize="MAP", output_folder_path=output_folder_path,
                                parallelize_search=args.parallelize, n_jobs=args.n_jobs,
                                n_cases=args.n_cases, n_random_starts=args.n_random_starts)
    elif args.recommender_name in DEMOGRAPHIC_RECOMMENDER_CLASS_DICT.keys():
        run_cv_parameter_search(URM_train_list=URM_train_list, UCM_train_list=UCM_train_list, UCM_name="UCM_all",
                                recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                evaluator_validation_list=evaluator_list,
                                metric_to_optimize="MAP", output_folder_path=output_folder_path,
                                parallelize_search=args.parallelize, n_jobs=args.n_jobs,
                                n_cases=args.n_cases, n_random_starts=args.n_random_starts)
    elif args.recommender_name in SIDE_INFO_CLASS_DICT:
        temp_list = []
        for i, URM in enumerate(URM_train_list):
            # Stack the transposed ICM under the URM so that item features
            # contribute to the similarity as additional pseudo-users
            temp = sps.vstack([URM, ICM_train_list[i].T], format="csr")
            # temp = TF_IDF(temp).tocsr()
            temp_list.append(temp)

        run_cv_parameter_search(URM_train_list=temp_list,
                                recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                evaluator_validation_list=evaluator_list, metric_to_optimize="MAP",
                                output_folder_path=output_folder_path, parallelize_search=args.parallelize,
                                n_jobs=args.n_jobs, n_cases=args.n_cases, n_random_starts=args.n_random_starts)

    print("...tuning ended")
Example #11
    # Create the output folder, including any missing parent directories
    os.makedirs(output_folder_path, exist_ok=True)

    f = open(output_file_name, "w")

    # Data loading
    root_data_path = "../../data/"
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(data_reader,
                                               k_out_value=3,
                                               use_validation_set=False,
                                               force_new_split=True,
                                               seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    ICM_all, _ = get_ICM_train_new(data_reader)
    UCM_all = get_UCM_train(data_reader)
    dataframe_path = "../../resources/boosting_dataframe/"
    train_df = pd.read_csv(dataframe_path + "train_df_100_advanced_lt_20.csv")
    valid_df = pd.read_csv(dataframe_path + "valid_df_30_advanced_lt_20.csv")
    train_df = preprocess_dataframe_after_reading(train_df)
    train_df_with_labels = train_df.copy()
    train_df = train_df.drop(columns=["label"], inplace=False)
    valid_df = preprocess_dataframe_after_reading(valid_df)
    print("Retrieving training labels...", end="")
    y_train, non_zero_count, total = get_label_array(data_frame=train_df,
                                                     URM_train=URM_train)
    print("Done")