Example 1
def main():
    # Data loading
    root_data_path = os.path.join(get_project_root_path(), "data/")
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(
        data_reader,
        k_out_value=K_OUT,
        use_validation_set=False,
        allow_cold_users=ALLOW_COLD_USERS,
        force_new_split=True,
        seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    ICM_all, _ = get_ICM_train_new(data_reader)
    UCM_all, _ = get_UCM_train_new(data_reader)

    # Ignoring users
    ignore_users = get_ignore_users(
        URM_train,
        data_reader.get_original_user_id_to_index_mapper(),
        lower_threshold=LOWER_THRESHOLD,
        upper_threshold=UPPER_THRESHOLD,
        ignore_non_target_users=IGNORE_NON_TARGET_USERS)
    evaluator = EvaluatorHoldout(URM_test,
                                 cutoff_list=[CUTOFF],
                                 ignore_users=ignore_users)

    # Model evaluation
    model = get_model(URM_train, ICM_all, UCM_all)
    print(evaluator.evaluateRecommender(model))
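Example 1 relies on several module-level constants (K_OUT, ALLOW_COLD_USERS, LOWER_THRESHOLD, UPPER_THRESHOLD, IGNORE_NON_TARGET_USERS, CUTOFF) that are defined elsewhere. A minimal, hypothetical configuration consistent with how they are used might be:

# Hypothetical constants assumed by Example 1; all values are illustrative.
K_OUT = 1                        # leave-1-out holdout split
ALLOW_COLD_USERS = False         # drop users with no training interactions
LOWER_THRESHOLD = 0              # minimum profile length to keep a user
UPPER_THRESHOLD = 2 ** 16 - 1    # maximum profile length to keep a user
IGNORE_NON_TARGET_USERS = True   # evaluate only users in the target list
CUTOFF = 10                      # length of the recommendation list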
Example 2
def get_ignore_users(URM_train,
                     original_user_id_to_index_mapper,
                     lower_threshold,
                     upper_threshold,
                     ignore_non_target_users=True):
    data_path = os.path.join(get_project_root_path(), "data/")
    # Start from an empty integer array so np.concatenate keeps indices integer-typed
    ignore_users = np.array([], dtype=np.int64)
    users_outside = get_users_outside_profile_len(
        URM_train,
        lower_threshold=lower_threshold,
        upper_threshold=upper_threshold)
    if len(users_outside) > 0:
        print("Excluding users with profile length outside ({}, {})".format(
            lower_threshold, upper_threshold))
        ignore_users = np.concatenate([ignore_users, users_outside])
    if ignore_non_target_users:
        print("Excluding non-target users...")
        original_target_users = read_target_users(
            os.path.join(data_path, "data_target_users_test.csv"))
        target_users = get_index_target_users(
            original_target_users, original_user_id_to_index_mapper)
        non_target_users = np.setdiff1d(np.arange(URM_train.shape[0]),
                                        target_users,
                                        assume_unique=True)
        ignore_users = np.concatenate([ignore_users, non_target_users])
    return np.unique(ignore_users)
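The helper get_users_outside_profile_len is not shown. Based on how profile lengths are computed elsewhere in these examples (np.ediff1d over the CSR indptr, as in Example 5), a plausible sketch is the following; the open-interval boundary handling mirrors the print message above but is an assumption:

import numpy as np

def get_users_outside_profile_len(URM_train, lower_threshold, upper_threshold):
    # Hypothetical sketch: profile length = number of interactions per user row
    profile_lengths = np.ediff1d(URM_train.tocsr().indptr)
    outside_mask = (profile_lengths <= lower_threshold) | \
                   (profile_lengths >= upper_threshold)
    return np.arange(URM_train.shape[0])[outside_mask]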
Example 3
def read_split_load_data(k_out, allow_cold_users, seed):
    root_data_path = os.path.join(get_project_root_path(), "data/")
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(
        data_reader,
        k_out_value=k_out,
        use_validation_set=False,
        allow_cold_users=allow_cold_users,
        force_new_split=True,
        seed=seed)
    data_reader.load_data()
    return data_reader
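A hypothetical usage of this helper, combined with the holdout accessor from Example 1 (the argument values are illustrative):

data_reader = read_split_load_data(k_out=1, allow_cold_users=False, seed=42)
URM_train, URM_test = data_reader.get_holdout_split()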
Example 4
    def __init__(self,
                 URM_train,
                 train_svm_file_path,
                 approximate_recommender: BaseRecommender,
                 ICM_train=None,
                 UCM_train=None,
                 item_feature_fields=None,
                 user_feature_fields=None,
                 valid_svm_file_path=None,
                 max_items_to_predict=1000,
                 model_filename="model.out",
                 model_type="ffm",
                 temp_relative_folder="temp/",
                 verbose=True):
        self.ICM_train = ICM_train
        self.UCM_train = UCM_train

        # Field layout: users take field 0, items field 1; item and user
        # feature fields are shifted so that every field id stays distinct.
        user_fields = np.full(shape=URM_train.shape[0], fill_value=0)
        item_fields = np.full(shape=URM_train.shape[1], fill_value=1)
        field_list = [user_fields, item_fields]
        if item_feature_fields is not None:
            item_feature_fields = item_feature_fields + 2
            field_list.append(item_feature_fields)
        if user_feature_fields is not None:
            # Shifting user feature fields assumes item_feature_fields is
            # also provided, as in Example 5
            user_feature_fields = user_feature_fields + np.max(
                item_feature_fields) + 1
            field_list.append(user_feature_fields)
        self.fields = np.concatenate(field_list)

        self.approximate_recommender = approximate_recommender
        self.max_items_to_predict = max_items_to_predict

        # Set path of temp folder and model_path
        root_path = get_project_root_path()
        fm_data_path = os.path.join(root_path, "resources", "ffm_data")
        self.temp_folder = os.path.join(fm_data_path, temp_relative_folder)
        self.model_folder = os.path.join(fm_data_path, "model")
        self.model_path = os.path.join(self.model_folder, model_filename)

        if model_type == "ffm":
            self.model = xl.create_ffm()
        elif model_type == "fm":
            self.model = xl.create_fm()
        else:
            raise ValueError(
                "Unknown model_type '{}': choose between 'ffm' and 'fm'".format(
                    model_type))
        self.model.setTrain(train_svm_file_path)
        if valid_svm_file_path is not None:
            self.model.setValidate(valid_svm_file_path)

        super().__init__(URM_train, verbose)
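For context, fitting this recommender would eventually hand control to the xlearn model configured above. A minimal, hypothetical sketch of that training step, using xlearn's documented API (all hyperparameter values are illustrative):

import xlearn as xl

model = xl.create_ffm()
model.setTrain("train.txt")            # libffm-format file, as set in __init__
param = {"task": "binary",             # 0/1 implicit-feedback labels
         "lr": 0.1, "lambda": 0.002,   # learning rate and L2 penalty
         "epoch": 200, "k": 100}       # epochs and latent factors
model.fit(param, "model.out")          # trains and writes the model to disk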
Example 5
    ICM_all, item_feature_fields = get_ICM_with_fields(data_reader)

    # Build UCMs: do not change the order of ICMs and UCMs
    UCM_all, user_feature_fields = get_UCM_with_fields(data_reader)

    cold_users_mask = np.ediff1d(URM_train.tocsr().indptr) == 0
    cold_users = np.arange(URM_train.shape[0])[cold_users_mask]

    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test,
                                 cutoff_list=cutoff_list,
                                 ignore_users=cold_users)

    best_model = new_best_models.ItemCBF_CF.get_model(URM_train, ICM_all)
    best_model.fit()
    ffm_data_path = os.path.join(get_project_root_path(), "resources",
                                 "ffm_data")
    model = FieldAwareFMRecommender(
        URM_train,
        model_type="fm",
        train_svm_file_path=os.path.join(
            ffm_data_path, "users_25_item_20_train_uncompressed.txt"),
        valid_svm_file_path=os.path.join(
            ffm_data_path, "users_25_item_20_valid_uncompressed.txt"),
        approximate_recommender=best_model,
        ICM_train=ICM_all,
        UCM_train=UCM_all,
        item_feature_fields=item_feature_fields,
        user_feature_fields=user_feature_fields,
        max_items_to_predict=20)
    #model.load_model(os.path.join(ffm_data_path, "model"), "model_row_4.out")
        ignore_users = get_ignore_users(
            URM_train,
            data_reader.get_original_user_id_to_index_mapper(),
            lower_threshold=LOWER_THRESHOLD,
            upper_threshold=UPPER_THRESHOLD,
            ignore_non_target_users=IGNORE_NON_TARGET_USERS)
        evaluator = EvaluatorHoldout(URM_test,
                                     cutoff_list=[CUTOFF],
                                     ignore_users=ignore_users)

        URM_train_list.append(URM_train)
        evaluator_list.append(evaluator)

    # Setting evaluator
    evaluator = EvaluatorCrossValidationKeepKOut(URM_train_list,
                                                 evaluator_list,
                                                 cutoff=CUTOFF)
    results = evaluator.crossevaluateRecommender(recommender_class,
                                                 model_parameters)

    # Writing on file cross validation results
    date_string = datetime.now().strftime('%b%d_%H-%M-%S')
    cross_valid_path = os.path.join(get_project_root_path(),
                                    "report/cross_validation/")
    file_path = os.path.join(
        cross_valid_path,
        "cross_valid_{}_{}.txt".format(model_name, date_string))
    write_results_on_file(file_path, recommender_class.RECOMMENDER_NAME,
                          model_parameters, num_folds, seed_list, results)
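write_results_on_file is not shown; a plausible, hypothetical implementation consistent with the call above:

def write_results_on_file(file_path, recommender_name, model_parameters,
                          num_folds, seed_list, results):
    # Hypothetical report writer: records the setup and per-fold results
    with open(file_path, "w") as f:
        f.write("Recommender: {}\n".format(recommender_name))
        f.write("Parameters: {}\n".format(model_parameters))
        f.write("Folds: {}  Seeds: {}\n".format(num_folds, seed_list))
        f.write("Results:\n{}\n".format(results))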
def run_parameter_search_mf_collaborative(
        recommender_class,
        URM_train,
        UCM_train=None,
        UCM_name="NO_UCM",
        ICM_train=None,
        ICM_name="NO_ICM",
        URM_train_last_test=None,
        metric_to_optimize="PRECISION",
        evaluator_validation=None,
        evaluator_test=None,
        evaluator_validation_earlystopping=None,
        output_folder_path="result_experiments/",
        parallelize_search=True,
        n_cases=35,
        n_random_starts=5,
        resume_from_saved=False,
        save_model="best",
        approximate_recommender=None):
    # If directory does not exist, create
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    earlystopping_keywargs = {
        "validation_every_n": 5,
        "stop_on_validation": True,
        "evaluator_object": evaluator_validation_earlystopping,
        "lower_validations_allowed": 5,
        "validation_metric": metric_to_optimize,
    }

    URM_train = URM_train.copy()

    if URM_train_last_test is not None:
        URM_train_last_test = URM_train_last_test.copy()

    try:

        output_file_name_root = recommender_class.RECOMMENDER_NAME + "_" + ICM_name + "_" + UCM_name

        parameterSearch = SearchBayesianSkopt(
            recommender_class,
            evaluator_validation=evaluator_validation,
            evaluator_test=evaluator_test)

        recommender_input_args = SearchInputRecommenderArgs(
            CONSTRUCTOR_POSITIONAL_ARGS=[URM_train],
            CONSTRUCTOR_KEYWORD_ARGS={},
            FIT_POSITIONAL_ARGS=[],
            FIT_KEYWORD_ARGS={})
        hyperparameters_range_dictionary = {}

        if recommender_class is ImplicitALSRecommender:
            hyperparameters_range_dictionary["num_factors"] = Integer(300, 550)
            hyperparameters_range_dictionary["regularization"] = Real(
                low=1e-2, high=200, prior='log-uniform')
            hyperparameters_range_dictionary["epochs"] = Categorical([50])
            hyperparameters_range_dictionary[
                "confidence_scaling"] = Categorical(["linear"])
            hyperparameters_range_dictionary["alpha"] = Real(
                low=1e-2, high=1e2, prior='log-uniform')

        if recommender_class is MF_BPR_Recommender:
            hyperparameters_range_dictionary["num_factors"] = Categorical(
                [600])
            hyperparameters_range_dictionary["regularization"] = Real(
                low=1e-4, high=1e-1, prior='log-uniform')
            hyperparameters_range_dictionary["learning_rate"] = Real(
                low=1e-2, high=1e-1, prior='log-uniform')
            hyperparameters_range_dictionary["epochs"] = Categorical([300])

        if recommender_class is FunkSVDRecommender:
            hyperparameters_range_dictionary["num_factors"] = Integer(50, 400)
            hyperparameters_range_dictionary["regularization"] = Real(
                low=1e-8, high=1e-1, prior='log-uniform')
            hyperparameters_range_dictionary["learning_rate"] = Real(
                low=1e-6, high=1e-1, prior='log-uniform')
            hyperparameters_range_dictionary["epochs"] = Categorical([300])

        if recommender_class is LogisticMFRecommender:
            hyperparameters_range_dictionary["num_factors"] = Integer(20, 400)
            hyperparameters_range_dictionary["regularization"] = Real(
                low=1e-5, high=1e1, prior='log-uniform')
            hyperparameters_range_dictionary["learning_rate"] = Real(
                low=1e-2, high=1e-1, prior='log-uniform')
            hyperparameters_range_dictionary["epochs"] = Categorical([300])

        if recommender_class is LightFMRecommender:
            recommender_input_args.CONSTRUCTOR_KEYWORD_ARGS[
                'UCM_train'] = UCM_train
            recommender_input_args.CONSTRUCTOR_KEYWORD_ARGS[
                'ICM_train'] = ICM_train

            hyperparameters_range_dictionary['no_components'] = Categorical(
                [100])
            hyperparameters_range_dictionary['epochs'] = Categorical([100])

            run_light_fm_search(parameterSearch,
                                recommender_input_args,
                                hyperparameters_range_dictionary,
                                URM_train_last_test=URM_train_last_test,
                                parallelize_search=parallelize_search,
                                n_cases=n_cases,
                                n_random_starts=n_random_starts,
                                output_folder_path=output_folder_path,
                                output_file_name_root=output_file_name_root,
                                metric_to_optimize=metric_to_optimize,
                                save_model=save_model)

        if recommender_class is FieldAwareFMRecommender:
            if approximate_recommender is None:
                raise ValueError(
                    "approximate_recommender must be set for FieldAwareFMRecommender")
            root_path = get_project_root_path()
            train_svm_file_path = os.path.join(root_path, "resources",
                                               "fm_data",
                                               "URM_ICM_UCM_uncompressed.txt")
            recommender_input_args.CONSTRUCTOR_KEYWORD_ARGS[
                'train_svm_file_path'] = train_svm_file_path
            recommender_input_args.CONSTRUCTOR_KEYWORD_ARGS[
                'approximate_recommender'] = approximate_recommender
            recommender_input_args.CONSTRUCTOR_KEYWORD_ARGS[
                'UCM_train'] = UCM_train
            recommender_input_args.CONSTRUCTOR_KEYWORD_ARGS[
                'ICM_train'] = ICM_train

            hyperparameters_range_dictionary['epochs'] = Categorical([200])
            hyperparameters_range_dictionary['latent_factors'] = Integer(
                low=20, high=500)
            hyperparameters_range_dictionary['regularization'] = Real(
                low=10e-7, high=10e-1, prior="log-uniform")
            hyperparameters_range_dictionary['learning_rate'] = Real(
                low=10e-3, high=10e-1, prior="log-uniform")

        if URM_train_last_test is not None:
            recommender_input_args_last_test = recommender_input_args.copy()
            recommender_input_args_last_test.CONSTRUCTOR_POSITIONAL_ARGS[
                0] = URM_train_last_test
        else:
            recommender_input_args_last_test = None

        # Final step, run after the hyperparameter range has been defined for each algorithm
        parameterSearch.search(
            recommender_input_args,
            parameter_search_space=hyperparameters_range_dictionary,
            n_cases=n_cases,
            n_random_starts=n_random_starts,
            resume_from_saved=resume_from_saved,
            save_model=save_model,
            output_folder_path=output_folder_path,
            output_file_name_root=output_file_name_root,
            metric_to_optimize=metric_to_optimize,
            recommender_input_args_last_test=recommender_input_args_last_test)

    except Exception as e:

        print("On recommender {} Exception {}".format(recommender_class,
                                                      str(e)))
        traceback.print_exc()

        # Append the failure to an error log inside the output folder
        with open(os.path.join(output_folder_path, "ErrorLog.txt"),
                  "a") as error_file:
            error_file.write("On recommender {} Exception {}\n".format(
                recommender_class, str(e)))
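A hypothetical invocation of the search above, reusing an evaluator built as in Example 1 (the recommender class and metric are illustrative choices):

run_parameter_search_mf_collaborative(
    ImplicitALSRecommender,
    URM_train,
    metric_to_optimize="MAP",
    evaluator_validation=evaluator,
    output_folder_path="result_experiments/",
    n_cases=35,
    n_random_starts=5)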
    fields = np.concatenate(
        [user_fields, item_fields, item_feature_fields, user_feature_fields])

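    # Negative sampling: draw 10 unobserved (user, item) pairs per positive
    # interaction so the factorization machine sees both label classes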
    positive_URM = URM_train
    negative_URM = sample_negative_interactions_uniformly(
        negative_sample_size=len(positive_URM.data) * 10, URM=positive_URM)

    URM_positive_FM_matrix = convert_URM_to_FM(positive_URM)
    URM_negative_FM_matrix = convert_URM_to_FM(negative_URM)

    URM_FM_matrix = sps.vstack(
        [URM_positive_FM_matrix, URM_negative_FM_matrix], format='csr')
    URM_FM_matrix = add_ICM_info(URM_FM_matrix, ICM_all, URM_train.shape[0])
    URM_FM_matrix = add_UCM_info(URM_FM_matrix, UCM_all, 0)

    root_path = get_project_root_path()
    fm_data_path = os.path.join(root_path, "resources", "ffm_data")

    # Prepare train sparse matrix and labels for dumping to file
    FM_sps_matrix = URM_FM_matrix.copy()
    # Label positives 1 and negatives 0 (np.int is removed in recent NumPy,
    # and the list round-trip was redundant)
    labels = np.concatenate([
        np.ones(shape=URM_positive_FM_matrix.shape[0], dtype=int),
        np.zeros(shape=URM_negative_FM_matrix.shape[0], dtype=int)
    ])

    random_state = 69420
    x_train, x_valid, y_train, y_valid = train_test_split(
        FM_sps_matrix,
        labels,
        shuffle=True,
        test_size=0.1,