Code Example #1
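Data reader for the Movielens100K split used by the MCRec wrapper: it loads a cached pre-split dataset if one exists, otherwise it rebuilds the train/validation/test URMs and the genre ICM from the original authors' data and caches them. The method body assumes the module-level imports of the original file (os, shutil, zipfile, numpy as np, scipy.sparse as sps, plus the repo's load_data_dict/save_data_dict and IncrementalSparseMatrix helpers).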
    def __init__(self):

        super(Movielens100KReader, self).__init__()


        pre_splitted_path = "Data_manager_split_datasets/Movielens100K/KDD/MCRec_our_interface/"

        pre_splitted_filename = "splitted_data"

        original_data_path = "Conferences/KDD/MCRec_github/data/"

        # Create the directory if it does not exist
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Movielens100KReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
                 self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("Movielens100KReader: Pre-splitted data not found, building new one")

            print("Movielens100KReader: loading URM")


            from Conferences.KDD.MCRec_github.code.Dataset import Dataset

            dataset_name = 'ml-100k'

            dataset = Dataset(original_data_path + dataset_name)
            URM_train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives

            # Dataset adds 1 to user and item ids; remove it to restore 0-based indexing
            URM_train = sps.coo_matrix(URM_train)
            URM_train.row -= 1
            URM_train.col -= 1

            self.URM_train = sps.csr_matrix((np.ones_like(URM_train.data), (URM_train.row, URM_train.col)))


            num_users, num_items = self.URM_train.shape



            # Build sparse matrices from lists
            URM_test_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
            URM_test_negative_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)


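            # Each testRatings entry appears to be [user_id, test_item_ids...] and the matching
            # testNegatives entry lists that user's sampled negative items (both 1-indexed)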
            for user_index in range(len(testRatings)):

                user_id = testRatings[user_index][0]
                current_user_test_items = testRatings[user_index][1:]
                current_user_test_negative_items = testNegatives[user_index]

                current_user_test_items = np.array(current_user_test_items) - 1
                current_user_test_negative_items = np.array(current_user_test_negative_items) - 1

                URM_test_builder.add_single_row(user_id - 1, current_user_test_items, 1.0)
                URM_test_negative_builder.add_single_row(user_id - 1, current_user_test_negative_items, 1.0)



            # The test data apparently contains duplicate entries
            self.URM_test = URM_test_builder.get_SparseMatrix()

            self.URM_test_negative = URM_test_negative_builder.get_SparseMatrix()


            # Hold out 10% of the train data, user-wise, as the validation split
            from Data_manager.split_functions.split_train_validation import split_train_validation_percentage_user_wise

            self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(self.URM_train, train_percentage=0.9)


            # Load features

            data_reader = Movielens100KReader_DataManager()
            data_reader.load_data()

            zipFile_path = data_reader.DATASET_SPLIT_ROOT_FOLDER + data_reader.DATASET_SUBFOLDER
            dataFile = zipfile.ZipFile(zipFile_path + "ml-100k.zip")

            ICM_path = dataFile.extract("ml-100k/u.item", path=zipFile_path + "decompressed/")

            ICM_genre = self._loadICM(ICM_path)
            ICM_genre = ICM_genre.get_SparseMatrix()

            shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

            self.ICM_dict = {"ICM_genre": ICM_genre}


            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
                "URM_test_negative": self.URM_test_negative,
                "ICM_dict": self.ICM_dict,

            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("Movielens100KReader: loading complete")
Code Example #2
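Hyperparameter setup and data loading from the original MCRec training script. The snippet assumes earlier context not shown here: from time import time, the Dataset class import, and a dataset variable holding the dataset name (e.g. 'ml-100k').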
    layers = [512, 256, 128, 64]
    reg_layes = [0, 0, 0, 0]  # 'reg_layes' [sic] is the original code's misspelling of reg_layers, kept for consistency
    learning_rate = 0.001
    epochs = 30
    batch_size = 256
    num_negatives = 4
    learner = 'adam'
    verbose = 1
    out = 0
    evaluation_threads = 1
    topK = 10

    print('num_negatives = ', num_negatives)

    t1 = time()
    dataset = Dataset('../data/' + dataset)
    trainMatrix, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
    train = dataset.train
    user_item_map = dataset.user_item_map
    item_user_map = dataset.item_user_map
    path_umtm = dataset.path_umtm
    path_umum = dataset.path_umum
    path_umtmum = dataset.path_umtmum
    path_uuum = dataset.path_uuum
    user_feature, item_feature, type_feature = dataset.user_feature, dataset.item_feature, dataset.type_feature
    num_users, num_items = trainMatrix.shape[0], trainMatrix.shape[1]
    path_nums = [
        dataset.umtm_path_num, dataset.umum_path_num, dataset.umtmum_path_num,
        dataset.uuum_path_num
    ]
    timestamps = [
        dataset.umtm_timestamp, dataset.umum_timestamp,
        dataset.umtmum_timestamp, dataset.uuum_timestamp
    ]
Code Example #3
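Experiment runner that loads the split, verifies it is implicit and disjoint, tunes the baseline recommenders, trains MCRec with the article hyperparameters, and exports the result tables.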
def read_data_split_and_search(dataset_name,
                               flag_baselines_tune=False,
                               flag_DL_article_default=False,
                               flag_DL_tune=False,
                               flag_print_results=False):

    from Conferences.KDD.MCRec_our_interface.Movielens100K.Movielens100KReader import Movielens100KReader

    result_folder_path = "result_experiments/{}/{}_{}/".format(
        CONFERENCE_NAME, ALGORITHM_NAME, dataset_name)

    if dataset_name == "movielens100k":
        dataset = Movielens100KReader(result_folder_path)
    else:
        # Avoid a NameError below if an unsupported dataset name is passed
        raise ValueError("Dataset name not supported: {}".format(dataset_name))

    URM_train = dataset.URM_DICT["URM_train"].copy()
    URM_validation = dataset.URM_DICT["URM_validation"].copy()
    URM_test = dataset.URM_DICT["URM_test"].copy()
    URM_test_negative = dataset.URM_DICT["URM_test_negative"].copy()

    # Ensure IMPLICIT data and DISJOINT sets
    assert_implicit_data(
        [URM_train, URM_validation, URM_test, URM_test_negative])
    assert_disjoint_matrices(
        [URM_train, URM_validation, URM_test, URM_test_negative])

    # Create the directory if it does not exist
    if not os.path.exists(result_folder_path):
        os.makedirs(result_folder_path)

    algorithm_dataset_string = "{}_{}_".format(ALGORITHM_NAME, dataset_name)

    plot_popularity_bias([URM_train + URM_validation, URM_test],
                         ["URM train", "URM test"], result_folder_path +
                         algorithm_dataset_string + "popularity_plot")

    save_popularity_statistics([URM_train + URM_validation, URM_test],
                               ["URM train", "URM test"],
                               result_folder_path + algorithm_dataset_string +
                               "popularity_statistics")

    from Base.Evaluation.Evaluator import EvaluatorNegativeItemSample

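    # Evaluation ranks each user's test items against the fixed negative item samples
    # shipped with the original split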
    evaluator_validation = EvaluatorNegativeItemSample(URM_validation,
                                                       URM_test_negative,
                                                       cutoff_list=[10])
    evaluator_test = EvaluatorNegativeItemSample(URM_test,
                                                 URM_test_negative,
                                                 cutoff_list=[10])

    collaborative_algorithm_list = [
        Random,
        TopPop,
        UserKNNCFRecommender,
        ItemKNNCFRecommender,
        P3alphaRecommender,
        RP3betaRecommender,
        PureSVDRecommender,
        NMFRecommender,
        IALSRecommender,
        MatrixFactorization_BPR_Cython,
        MatrixFactorization_FunkSVD_Cython,
        EASE_R_Recommender,
        SLIM_BPR_Cython,
        SLIMElasticNetRecommender,
    ]

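    # Hyperparameter search budget: 50 configurations per model, the first 15 drawn at random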
    metric_to_optimize = "PRECISION"
    n_cases = 50
    n_random_starts = 15

    runParameterSearch_Collaborative_partial = partial(
        runParameterSearch_Collaborative,
        URM_train=URM_train,
        URM_train_last_test=URM_train + URM_validation,
        metric_to_optimize=metric_to_optimize,
        evaluator_validation_earlystopping=evaluator_validation,
        evaluator_validation=evaluator_validation,
        evaluator_test=evaluator_test,
        output_folder_path=result_folder_path,
        parallelizeKNN=False,
        allow_weighting=True,
        resume_from_saved=True,
        n_cases=n_cases,
        n_random_starts=n_random_starts)

    if flag_baselines_tune:

        for recommender_class in collaborative_algorithm_list:
            try:
                runParameterSearch_Collaborative_partial(recommender_class)
            except Exception as e:
                print("On recommender {} Exception {}".format(
                    recommender_class, str(e)))
                traceback.print_exc()

        ################################################################################################
        ###### Content Baselines

        for ICM_name, ICM_object in dataset.ICM_DICT.items():

            try:

                runParameterSearch_Content(
                    ItemKNNCBFRecommender,
                    URM_train=URM_train,
                    URM_train_last_test=URM_train + URM_validation,
                    metric_to_optimize=metric_to_optimize,
                    evaluator_validation=evaluator_validation,
                    evaluator_test=evaluator_test,
                    output_folder_path=result_folder_path,
                    parallelizeKNN=False,
                    allow_weighting=True,
                    resume_from_saved=True,
                    ICM_name=ICM_name,
                    ICM_object=ICM_object.copy(),
                    n_cases=n_cases,
                    n_random_starts=n_random_starts)

            except Exception as e:

                print("On CBF recommender for ICM {} Exception {}".format(
                    ICM_name, str(e)))
                traceback.print_exc()

        ################################################################################################
        ###### Hybrid

        for ICM_name, ICM_object in dataset.ICM_DICT.items():

            try:

                runParameterSearch_Hybrid(
                    ItemKNN_CFCBF_Hybrid_Recommender,
                    URM_train=URM_train,
                    URM_train_last_test=URM_train + URM_validation,
                    metric_to_optimize=metric_to_optimize,
                    evaluator_validation=evaluator_validation,
                    evaluator_test=evaluator_test,
                    output_folder_path=result_folder_path,
                    parallelizeKNN=False,
                    allow_weighting=True,
                    resume_from_saved=True,
                    ICM_name=ICM_name,
                    ICM_object=ICM_object.copy(),
                    n_cases=n_cases,
                    n_random_starts=n_random_starts)

            except Exception as e:

                print("On recommender {} Exception {}".format(
                    ItemKNN_CFCBF_Hybrid_Recommender, str(e)))
                traceback.print_exc()

    ################################################################################################
    ######
    ######      DL ALGORITHM
    ######

    if flag_DL_article_default:

        if dataset_name == "movielens100k":
            """
            The code provided by the original authors of MCRec can be used only for the original data.
            Here I am passing to the Wrapper the URM_train matrix that is only required for its shape,
            the train will be done using the preprocessed data the original authors provided
            """
            from Conferences.KDD.MCRec_github.code.Dataset import Dataset

            original_dataset_reader = Dataset(
                'Conferences/KDD/MCRec_github/data/' + 'ml-100k')

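            # Hyperparameters from the original MCRec code (cf. Code Example #2); the higher epoch
            # cap is presumably because early stopping selects the best epoch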
            MCRec_article_hyperparameters = {
                "epochs": 200,
                "latent_dim": 128,
                "reg_latent": 0,
                "layers": [512, 256, 128, 64],
                "reg_layes": [0, 0, 0, 0],
                "learning_rate": 1e-3,
                "batch_size": 256,
                "num_negatives": 4,
            }

            MCRec_earlystopping_hyperparameters = {
                "validation_every_n": 5,
                "stop_on_validation": True,
                "evaluator_object": evaluator_validation,
                "lower_validations_allowed": 5,
                "validation_metric": metric_to_optimize
            }

            parameterSearch = SearchSingleCase(
                MCRecML100k_RecommenderWrapper,
                evaluator_validation=evaluator_validation,
                evaluator_test=evaluator_test)

            recommender_input_args = SearchInputRecommenderArgs(
                CONSTRUCTOR_POSITIONAL_ARGS=[
                    URM_train, original_dataset_reader
                ],
                FIT_KEYWORD_ARGS=MCRec_earlystopping_hyperparameters)

            recommender_input_args_last_test = recommender_input_args.copy()
            recommender_input_args_last_test.CONSTRUCTOR_POSITIONAL_ARGS[
                0] = URM_train + URM_validation

            parameterSearch.search(
                recommender_input_args,
                recommender_input_args_last_test=recommender_input_args_last_test,
                fit_hyperparameters_values=MCRec_article_hyperparameters,
                output_folder_path=result_folder_path,
                resume_from_saved=True,
                output_file_name_root=MCRecML100k_RecommenderWrapper.RECOMMENDER_NAME)

    ################################################################################################
    ######
    ######      PRINT RESULTS
    ######

    if flag_print_results:

        n_test_users = np.sum(np.ediff1d(URM_test.indptr) >= 1)
        file_name = "{}..//{}_{}_".format(result_folder_path, ALGORITHM_NAME,
                                          dataset_name)

        ICM_names_to_report_list = list(dataset.ICM_DICT.keys())

        result_loader = ResultFolderLoader(
            result_folder_path,
            base_algorithm_list=None,
            other_algorithm_list=[MCRecML100k_RecommenderWrapper],
            KNN_similarity_list=KNN_similarity_to_report_list,
            ICM_names_list=ICM_names_to_report_list,
            UCM_names_list=None)

        result_loader.generate_latex_results(
            file_name + "{}_latex_results.txt".format("article_metrics"),
            metrics_list=["PRECISION", "RECALL", "NDCG"],
            cutoffs_list=[10],
            table_title=None,
            highlight_best=True)

        result_loader.generate_latex_results(
            file_name + "{}_latex_results.txt".format("all_metrics"),
            metrics_list=[
                "PRECISION", "RECALL", "MAP", "MRR", "NDCG", "F1", "HIT_RATE",
                "ARHR", "NOVELTY", "DIVERSITY_MEAN_INTER_LIST",
                "DIVERSITY_HERFINDAHL", "COVERAGE_ITEM", "DIVERSITY_GINI",
                "SHANNON_ENTROPY"
            ],
            cutoffs_list=[10],
            table_title=None,
            highlight_best=True)

        result_loader.generate_latex_time_statistics(
            file_name + "{}_latex_results.txt".format("time"),
            n_evaluation_users=n_test_users,
            table_title=None)
Code Example #4
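fit() method of the MCRec recommender wrapper: it reloads the original authors' preprocessed ml-100k data and meta-path files, builds the Keras model, and trains it with the framework's early stopping.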
    def fit(self,
            latent_dim=128,
            reg_latent=0,
            layers=[512, 256, 128, 64],
            reg_layes=[0, 0, 0, 0],  # [sic] the original code's spelling of reg_layers
            learning_rate=0.001,
            epochs=30,
            batch_size=256,
            num_negatives=4,
            **earlystopping_kwargs):

        self.latent_dim = latent_dim
        self.reg_latent = reg_latent
        self.layers = layers
        self.reg_layes = reg_layes
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.num_negatives = num_negatives

        dataset_name = 'ml-100k'

        t1 = time()
        dataset = Dataset('Conferences/KDD/MCRec_github/data/' + dataset_name)
        trainMatrix, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives

        # # Replace train data with the train split passed as parameter
        # self._load_rating_file_as_map(self.URM_train, dataset)

        self._train = dataset.train
        self._user_item_map = dataset.user_item_map
        item_user_map = dataset.item_user_map
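        # Pre-computed meta-path instances shipped with the original data; the names appear to
        # encode the meta-path, e.g. umtm = user-movie-type-movie, umum = user-movie-user-movie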
        path_umtm = dataset.path_umtm
        path_umum = dataset.path_umum
        path_umtmum = dataset.path_umtmum
        path_uuum = dataset.path_uuum

        user_feature, item_feature, type_feature = dataset.user_feature, dataset.item_feature, dataset.type_feature

        # Dataset adds 1 to user and item ids, so n_users and n_items are one greater than the true counts
        self.n_users, self.n_items = trainMatrix.shape

        self.path_nums = [
            dataset.umtm_path_num, dataset.umum_path_num,
            dataset.umtmum_path_num, dataset.uuum_path_num
        ]
        self.timestamps = [
            dataset.umtm_timestamp, dataset.umum_timestamp,
            dataset.umtmum_timestamp, dataset.uuum_timestamp
        ]
        self.length = dataset.fea_size

        print(
            "Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d"
            % (time() - t1, self.n_users, self.n_items, len(
                self._train), len(testRatings)))
        print('path nums = ', self.path_nums)
        print('timestamps = ', self.timestamps)

        print("MCRec_RecommenderWrapper: building model")
        self.model = get_model(self.n_users, self.n_items, self.path_nums,
                               self.timestamps, self.length, self.layers,
                               self.reg_layes, self.latent_dim,
                               self.reg_latent)

        self.model.compile(optimizer=Adam(lr=learning_rate, decay=1e-4),
                           loss='binary_crossentropy')
        # model.compile(optimizer = Nadam(),
        #              loss = 'binary_crossentropy')

        # Check Init performance
        # t1 = time()
        # print("MCRec_RecommenderWrapper: evaluate_model")
        # (ps, rs, ndcgs) = evaluate_model(self.model, user_feature, item_feature, type_feature, self.n_users, self.n_items, path_umtm, path_umum, path_umtmum, path_uuum, path_nums, timestamps, length, testRatings, testNegatives, topK, evaluation_threads)
        # p, r, ndcg = np.array(ps).mean(), np.array(rs).mean(), np.array(ndcgs).mean()
        # print('Init: Precision = %.4f, Recall = %.4f, NDCG = %.4f [%.1f]' %(p, r, ndcg, time()-t1))

        # best_p = -1
        # p_list, r_list, ndcg_list = [], [], []
        # print('Begin training....')

        # Originally these were global variables
        self._testRatings = testRatings
        self._testNegatives = testNegatives
        self._path_umtm = path_umtm
        self._path_umum = path_umum
        self._path_umtmum = path_umtmum
        self._path_uuum = path_uuum
        self._path_nums = self.path_nums
        self._timestamps = self.timestamps
        self._length = self.length
        self._user_feature = user_feature
        self._item_feature = item_feature
        self._type_feature = type_feature
        self._features = [user_feature, item_feature, type_feature]

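        # Snapshot the freshly compiled model; the early-stopping loop is expected to overwrite
        # _best_model whenever validation improves, and the best weights are restored after training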
        self._best_model = clone_model(self.model)
        self._best_model.set_weights(self.model.get_weights())

        self._train_with_early_stopping(epochs,
                                        algorithm_name=self.RECOMMENDER_NAME,
                                        **earlystopping_kwargs)

        print("MCRec_RecommenderWrapper: Tranining complete")

        self.model = clone_model(self._best_model)
        self.model.set_weights(self._best_model.get_weights())