Example #1
    def __init__(self, path):
        '''
        Constructor
        '''
        trainMatrix = self.load_rating_file_as_matrix(path + ".train.rating")
        testRatings = self.load_rating_file_as_matrix(path + ".test.rating")
        testNegatives = self.load_negative_file(path + ".test.negative")
        assert len(testRatings) == len(testNegatives)

        self.num_users, self.num_items = trainMatrix.shape

        from Base.Recommender_utils import reshapeSparse

        self.URM_train = trainMatrix.tocsr()
        self.URM_test = testRatings.tocsr()

        shape = (max(self.URM_train.shape[0], self.URM_test.shape[0]),
                 max(self.URM_train.shape[1], self.URM_test.shape[1]))

        self.URM_train = reshapeSparse(self.URM_train, shape)
        self.URM_test = reshapeSparse(self.URM_test, shape)

        URM_test_negatives_builder = IncrementalSparseMatrix(n_rows=shape[0],
                                                             n_cols=shape[1])

        for user_index in range(len(testNegatives)):
            user_test_items = testNegatives[user_index]

            URM_test_negatives_builder.add_single_row(user_index,
                                                      user_test_items,
                                                      data=1.0)

        self.URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
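
This example, like most of the ones below, relies on reshapeSparse from Base.Recommender_utils to align the train and test matrices to a common shape. The snippet below is only a minimal sketch of what such a helper is assumed to do (zero-pad a sparse matrix up to a larger target shape); the framework's actual implementation may differ.

import scipy.sparse as sps

def reshapeSparse_sketch(sparse_matrix, new_shape):
    # Pad a sparse matrix with empty rows/columns so it matches new_shape.
    sparse_matrix = sps.coo_matrix(sparse_matrix)
    assert new_shape[0] >= sparse_matrix.shape[0] and new_shape[1] >= sparse_matrix.shape[1], \
        "reshapeSparse_sketch: target shape must be >= the original shape"
    return sps.csr_matrix((sparse_matrix.data, (sparse_matrix.row, sparse_matrix.col)),
                          shape=new_shape)
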
Example #2
def split_train_validation_test_VAE_CF(URM_dataframe, n_heldout_users):

    split_dir = "./result_experiments/__Temp_MultiVAE_Splitter/"

    split_train_validation_test_VAE_CF_original(URM_dataframe, split_dir, n_heldout_users)
    train_data, vad_data_tr, vad_data_te, test_data_tr, test_data_te, n_items = load_data_VAE_CF(split_dir)

    # Remove temp files
    shutil.rmtree(split_dir, ignore_errors=True)


    from Base.Recommender_utils import reshapeSparse

    URM_train_only = train_data.copy()
    URM_train_all = sps.vstack([train_data, vad_data_tr, test_data_tr])

    URM_train_all_shape = URM_train_all.shape

    ## OFFSET all row indices
    n_train_users = train_data.shape[0]

    URM_validation = offset_sparse_matrix_row(vad_data_te, n_train_users)

    n_train_and_validation_users = URM_validation.shape[0]

    URM_validation = reshapeSparse(URM_validation, URM_train_all_shape)

    URM_test = offset_sparse_matrix_row(test_data_te, n_train_and_validation_users)
    URM_test = reshapeSparse(URM_test, URM_train_all_shape)


    URM_train_only = sps.csr_matrix(URM_train_only)
    URM_train_all = sps.csr_matrix(URM_train_all)
    URM_validation = sps.csr_matrix(URM_validation)
    URM_test = sps.csr_matrix(URM_test)


    return URM_train_only, URM_train_all, URM_validation, URM_test
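
Example #2 uses offset_sparse_matrix_row to shift the row (user) indices of the validation and test matrices so that the held-out users land below the training users in the stacked URM. A possible implementation is sketched below under that assumption; the real helper may differ.

import scipy.sparse as sps

def offset_sparse_matrix_row_sketch(sparse_matrix, offset):
    # Shift every row index by `offset`, growing the matrix accordingly.
    sparse_matrix = sps.coo_matrix(sparse_matrix)
    new_shape = (sparse_matrix.shape[0] + offset, sparse_matrix.shape[1])
    return sps.csr_matrix((sparse_matrix.data,
                           (sparse_matrix.row + offset, sparse_matrix.col)),
                          shape=new_shape)
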
Example #3
    def __init__(self):

        super(NetflixPrizeReader, self).__init__()

        pre_splitted_path = "Data_manager_split_datasets/NetflixPrize/WWW/MultiVAE_our_interface/"

        pre_splitted_filename = "splitted_data"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("NetflixPrizeReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
                 self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("NetflixPrizeReader: Pre-splitted data not found, building new one")

            data_reader = NetflixPrizeReader_DataManager()
            data_reader.load_data()

            URM_all = data_reader.get_URM_all()

            # binarize the data (only keep ratings >= 4)
            URM_all.data = URM_all.data >= 4.0
            URM_all.eliminate_zeros()


            URM_all = sps.coo_matrix(URM_all)

            dict_for_dataframe = {"userId": URM_all.row,
                                  "movieId": URM_all.col,
                                  "rating": URM_all.data
                                }

            URM_all_dataframe = pd.DataFrame(data = dict_for_dataframe)


            self.URM_train, self.URM_train_all, self.URM_validation, self.URM_test = split_train_validation_test_VAE_CF(URM_all_dataframe,
                                                                                                                         n_heldout_users = 40000)


            n_rows = max(self.URM_train.shape[0], self.URM_train_all.shape[0], self.URM_validation.shape[0], self.URM_test.shape[0])
            n_cols = max(self.URM_train.shape[1], self.URM_train_all.shape[1], self.URM_validation.shape[1], self.URM_test.shape[1])

            newShape = (n_rows, n_cols)

            self.URM_test = reshapeSparse(self.URM_test, newShape)
            self.URM_train = reshapeSparse(self.URM_train, newShape)
            self.URM_train_all = reshapeSparse(self.URM_train_all, newShape)
            self.URM_validation = reshapeSparse(self.URM_validation, newShape)



            data_dict = {
                "URM_train": self.URM_train,
                "URM_train_all": self.URM_train_all,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,

            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)




            print("NetflixPrizeReader: Dataset loaded")
Example #4
    def __init__(self, pre_splitted_path):

        super(Movielens1MReader, self).__init__()

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Dataset_Movielens1M: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_Movielens1M: Pre-splitted data not found, building new one"
            )

            # Ensure file is loaded as matrix
            Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

            dataset = Dataset_github("Conferences/WWW/NeuMF_github/Data/ml-1m")

            URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

            URM_train_original = URM_train_original.tocsr()
            URM_test = URM_test.tocsr()

            from Base.Recommender_utils import reshapeSparse

            shape = (max(URM_train_original.shape[0], URM_test.shape[0]),
                     max(URM_train_original.shape[1], URM_test.shape[1]))

            URM_train_original = reshapeSparse(URM_train_original, shape)
            URM_test = reshapeSparse(URM_test, shape)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=shape[0], n_cols=shape[1])

            for user_index in range(len(dataset.testNegatives)):

                user_test_items = dataset.testNegatives[user_index]

                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()

            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                URM_train_original.copy())

            self.URM_DICT = {
                "URM_train_original": URM_train_original,
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_test_negative": URM_test_negative,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("Dataset_Movielens1M: Dataset loaded")
Example #5
    def __init__(self, split_type="cold_user"):

        super(Movielens20MReader, self).__init__()

        assert split_type in ["cold_user", "warm_user"]

        pre_splitted_path = "Data_manager_split_datasets/Movielens20M/WWW/MultiVAE_our_interface/"

        pre_splitted_filename = "splitted_data" + "_" + split_type

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Movielens20MReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("Movielens20MReader: Pre-splitted data not found, building new one")

            data_reader = Movielens20MReader_DataManager()
            data_reader.load_data()

            URM_all = data_reader.get_URM_all()

            # binarize the data (only keep ratings >= 4)
            URM_all.data = URM_all.data >= 4.0
            URM_all.eliminate_zeros()

            if split_type == "cold_user":

                URM_all = sps.coo_matrix(URM_all)

                dict_for_dataframe = {"userId": URM_all.row,
                                      "movieId": URM_all.col,
                                      "rating": URM_all.data
                                      }

                URM_all_dataframe = pd.DataFrame(data=dict_for_dataframe)

                self.URM_train, self.URM_train_all, self.URM_validation, self.URM_test = split_train_validation_test_VAE_CF(
                    URM_all_dataframe,
                    n_heldout_users=10000)

                n_rows = max(self.URM_train.shape[0], self.URM_train_all.shape[0], self.URM_validation.shape[0],
                             self.URM_test.shape[0])
                n_cols = max(self.URM_train.shape[1], self.URM_train_all.shape[1], self.URM_validation.shape[1],
                             self.URM_test.shape[1])

                newShape = (n_rows, n_cols)

                self.URM_test = reshapeSparse(self.URM_test, newShape)
                self.URM_train = reshapeSparse(self.URM_train, newShape)
                self.URM_train_all = reshapeSparse(self.URM_train_all, newShape)
                self.URM_validation = reshapeSparse(self.URM_validation, newShape)

                data_dict = {
                    "URM_train": self.URM_train,
                    "URM_train_all": self.URM_train_all,
                    "URM_test": self.URM_test,
                    "URM_validation": self.URM_validation,

                }



            elif split_type == "warm_user":

                URM_all = sps.csr_matrix(URM_all)
                users_to_keep = np.ediff1d(URM_all.indptr) >= 4
                URM_all = URM_all[users_to_keep, :]

                URM_all = sps.csc_matrix(URM_all)
                items_to_keep = np.ediff1d(URM_all.indptr) >= 1
                URM_all = URM_all[:, items_to_keep]

                URM_all = sps.csr_matrix(URM_all)

                self.URM_train, self.URM_validation, self.URM_test, _ = split_train_validation_test_negative_leave_one_out_user_wise(
                    URM_all)

                data_dict = {
                    "URM_train": self.URM_train,
                    "URM_test": self.URM_test,
                    "URM_validation": self.URM_validation

                }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("Movielens20MReader: Dataset loaded")
Example #6
        output_directory + "split/")

from Base.Recommender_utils import reshapeSparse

URM_train_all = sps.vstack([train_data, vad_data_tr, test_data_tr])

URM_train_all_shape = URM_train_all.shape

## OFFSET all row indices
n_train_users = train_data.shape[0]

URM_validation = offset_sparse_matrix_row(vad_data_te, n_train_users)

n_train_and_validation_users = URM_validation.shape[0]

URM_validation = reshapeSparse(URM_validation, URM_train_all_shape)
URM_test = offset_sparse_matrix_row(test_data_te, n_train_and_validation_users)

##############################################################################################################################################################
##### Set up training hyperparameters


N = train_data.shape[0]
idxlist = list(range(N))

# training batch size
batch_size = 500
batches_per_epoch = int(np.ceil(float(N) / batch_size))

N_vad = vad_data_tr.shape[0]
idxlist_vad = list(range(N_vad))
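
The idxlist / batch_size variables set up above are typically consumed by a mini-batch loop over users during training. The sketch below shows one plausible shape of that loop; it reuses N, idxlist, batch_size, batches_per_epoch and train_data from the snippet above and is not taken verbatim from the original training script.

import numpy as np

np.random.shuffle(idxlist)   # new user order each epoch

for batch_index in range(batches_per_epoch):
    start = batch_index * batch_size
    end = min(start + batch_size, N)
    batch_users = idxlist[start:end]
    X_batch = train_data[batch_users]   # sparse slice of the training URM
    # ... feed X_batch to the model here ...
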
Example #7
    def __init__(self,
                 pre_splitted_path,
                 dataset_variant="a",
                 train_interactions=1):

        super(CiteulikeReader, self).__init__()

        assert dataset_variant in [
            "a", "t"
        ], "CiteulikeReader: dataset_variant must be either 'a' or 't'"
        assert train_interactions in [
            1, 10, "all"
        ], "CiteulikeReader: train_interactions must be: 1, 10 or 'all'"

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        original_data_path = "Conferences/KDD/CollaborativeVAE_github/data/citeulike-{}/".format(
            dataset_variant)

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("CiteulikeReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "CiteulikeReader: Pre-splitted data not found, building new one"
            )

            print("CiteulikeReader: loading URM")

            if train_interactions == "all":
                train_interactions_file_suffix = 10
            else:
                train_interactions_file_suffix = train_interactions

            URM_train_builder = self._load_data_file(
                original_data_path +
                "cf-train-{}-users.dat".format(train_interactions_file_suffix))
            URM_test_builder = self._load_data_file(
                original_data_path +
                "cf-test-{}-users.dat".format(train_interactions_file_suffix))

            URM_test = URM_test_builder.get_SparseMatrix()
            URM_train = URM_train_builder.get_SparseMatrix()

            if dataset_variant == "a":
                ICM_tokens_TFIDF = scipy.io.loadmat(original_data_path +
                                                    "mult_nor.mat")['X']
            else:
                # Variant "t" uses a different file format and is transposed
                ICM_tokens_TFIDF = h5py.File(original_data_path +
                                             "mult_nor.mat").get('X')
                ICM_tokens_TFIDF = sps.csr_matrix(ICM_tokens_TFIDF).T

            ICM_tokens_TFIDF = sps.csr_matrix(ICM_tokens_TFIDF)

            ICM_tokens_bool = ICM_tokens_TFIDF.copy()
            ICM_tokens_bool.data = np.ones_like(ICM_tokens_bool.data)

            n_rows = max(URM_test.shape[0], URM_train.shape[0])
            n_cols = max(URM_test.shape[1], URM_train.shape[1],
                         ICM_tokens_TFIDF.shape[0])

            newShape = (n_rows, n_cols)

            URM_test = reshapeSparse(URM_test, newShape)
            URM_train = reshapeSparse(URM_train, newShape)

            if train_interactions == "all":

                URM_train += URM_test

                URM_train, URM_test = split_train_validation_percentage_random_holdout(
                    URM_train, train_percentage=0.8)
                URM_train, URM_validation = split_train_validation_percentage_random_holdout(
                    URM_train.copy(), train_percentage=0.8)

            elif train_interactions == 10:
                # If train interactions == 10 the train will NOT contain the validation data
                URM_train, URM_validation = split_train_validation_percentage_random_holdout(
                    URM_train.copy(), train_percentage=0.8)

            else:
                # If train interactions == 1 the train WILL contain the validation data
                _, URM_validation = split_train_validation_percentage_random_holdout(
                    URM_train.copy(), train_percentage=0.8)

            self.ICM_DICT = {
                "ICM_tokens_TFIDF": ICM_tokens_TFIDF,
                "ICM_tokens_bool": ICM_tokens_bool,
            }

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

            print("CiteulikeReader: loading complete")
Example #8
    def __init__(self, dataset_variant="a", train_interactions=1):

        super(CiteulikeReader, self).__init__()

        assert dataset_variant in [
            "a", "t"
        ], "CiteulikeReader: dataset_variant must be either 'a' or 't'"
        assert train_interactions in [
            1, 10, "all"
        ], "CiteulikeReader: train_interactions must be: 1, 10 or 'all'"

        pre_splitted_path = "Data_manager_split_datasets/CiteULike/KDD/CollaborativeVAE_our_interface/"

        pre_splitted_filename = "splitted_data_citeulike-{}-{}-items".format(
            dataset_variant, train_interactions)

        original_data_path = "Conferences/KDD/CollaborativeVAE_github/data/citeulike-{}/".format(
            dataset_variant)

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("CiteulikeReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "CiteulikeReader: Pre-splitted data not found, building new one"
            )

            print("CiteulikeReader: loading URM")

            if train_interactions == "all":
                train_interactions_file_suffix = 10
            else:
                train_interactions_file_suffix = train_interactions

            URM_train_builder = self._load_data_file(
                original_data_path +
                "cf-train-{}-users.dat".format(train_interactions_file_suffix))
            URM_test_builder = self._load_data_file(
                original_data_path +
                "cf-test-{}-users.dat".format(train_interactions_file_suffix))

            self.URM_test = URM_test_builder.get_SparseMatrix()
            self.URM_train = URM_train_builder.get_SparseMatrix()

            if dataset_variant == "a":
                self.ICM_title_abstract = scipy.io.loadmat(original_data_path +
                                                           "mult_nor.mat")['X']
            else:
                # Variant "t" uses a different file format and is transposed
                self.ICM_title_abstract = h5py.File(original_data_path +
                                                    "mult_nor.mat").get('X')
                self.ICM_title_abstract = sps.csr_matrix(
                    self.ICM_title_abstract).T

            self.ICM_title_abstract = sps.csr_matrix(self.ICM_title_abstract)

            n_rows = max(self.URM_test.shape[0], self.URM_train.shape[0])
            n_cols = max(self.URM_test.shape[1], self.URM_train.shape[1],
                         self.ICM_title_abstract.shape[0])

            newShape = (n_rows, n_cols)

            self.URM_test = reshapeSparse(self.URM_test, newShape)
            self.URM_train = reshapeSparse(self.URM_train, newShape)

            if train_interactions == "all":

                self.URM_train += self.URM_test

                self.URM_train, self.URM_test = split_train_validation_percentage_random_holdout(
                    self.URM_train, train_percentage=0.8)
                self.URM_train, self.URM_validation = split_train_validation_percentage_random_holdout(
                    self.URM_train, train_percentage=0.8)

            else:

                self.URM_train, self.URM_validation = split_train_validation_percentage_random_holdout(
                    self.URM_train, train_percentage=0.8)

            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
                "ICM_title_abstract": self.ICM_title_abstract
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("CiteulikeReader: loading complete")
Example #9
    def __init__(self, pre_splitted_path, type="original"):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            if type == "original":

                # Ensure file is loaded as matrix
                Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

                dataset = Dataset_github(
                    "Conferences/IJCAI/DELF_original/Data/ml-1m")

                URM_train, URM_validation, URM_test, testNegatives = dataset.trainMatrix, dataset.validRatings, \
                                                                     dataset.testRatings, dataset.testNegatives

                URM_train = URM_train.tocsr()
                URM_validation = URM_validation.tocsr()
                URM_test = URM_test.tocsr()
                URM_timestamp = "no"

                from Base.Recommender_utils import reshapeSparse

                shape = (max(URM_train.shape[0], URM_validation.shape[0],
                             URM_test.shape[0]),
                         max(URM_train.shape[1], URM_validation.shape[1],
                             URM_test.shape[1]))

                URM_train = reshapeSparse(URM_train, shape)
                URM_validation = reshapeSparse(URM_validation, shape)
                URM_test = reshapeSparse(URM_test, shape)

                URM_test_negatives_builder = IncrementalSparseMatrix(
                    n_rows=shape[0], n_cols=shape[1])

                for user_index in range(len(dataset.testNegatives)):

                    user_test_items = dataset.testNegatives[user_index]

                    URM_test_negatives_builder.add_single_row(user_index,
                                                              user_test_items,
                                                              data=1.0)

                URM_test_negative = URM_test_negatives_builder.get_SparseMatrix(
                )

            elif type == "ours":

                # create the split from the ORIGINAL full dataset, leave-one-out time-wise
                data_reader = Movielens1MReader_DataManager()
                loaded_dataset = data_reader.load_data()

                URM_all = loaded_dataset.get_URM_from_name("URM_all")
                URM_timestamp = loaded_dataset.get_URM_from_name(
                    "URM_timestamp")

                # make rating implicit
                URM_all.data = np.ones_like(URM_all.data)

                URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp(
                    URM_all, URM_timestamp, negative_items_per_positive=99)

            else:
                assert False

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
                "URM_timestamp": URM_timestamp,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        print_stat_datareader(self)
Example #10
    def _load_from_original_file(self):
        # Load data from original

        self._print("Loading original data")

        zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            dataFile = zipfile.ZipFile(
                zipFile_path + "neural_factorization_machine-master.zip")

        except (FileNotFoundError, zipfile.BadZipFile):

            self._print("Unable to fild data zip file. Downloading...")

            download_from_URL(self.DATASET_URL, zipFile_path,
                              "neural_factorization_machine-master.zip")

            dataFile = zipfile.ZipFile(
                zipFile_path + "neural_factorization_machine-master.zip")

        inner_path_in_zip = "neural_factorization_machine-master/data/frappe/"

        URM_train_path = dataFile.extract(inner_path_in_zip +
                                          "frappe.train.libfm",
                                          path=zipFile_path + "decompressed/")
        URM_test_path = dataFile.extract(inner_path_in_zip +
                                         "frappe.test.libfm",
                                         path=zipFile_path + "decompressed/")
        URM_validation_path = dataFile.extract(
            inner_path_in_zip + "frappe.validation.libfm",
            path=zipFile_path + "decompressed/")

        tmp_URM_train, item_original_ID_to_index, user_original_ID_to_index = self._loadURM(
            URM_train_path,
            item_original_ID_to_index=None,
            user_original_ID_to_index=None)

        tmp_URM_test, item_original_ID_to_index, user_original_ID_to_index = self._loadURM(
            URM_test_path,
            item_original_ID_to_index=item_original_ID_to_index,
            user_original_ID_to_index=user_original_ID_to_index)

        tmp_URM_validation, item_original_ID_to_index, user_original_ID_to_index = self._loadURM(
            URM_validation_path,
            item_original_ID_to_index=item_original_ID_to_index,
            user_original_ID_to_index=user_original_ID_to_index)

        shape = (len(user_original_ID_to_index),
                 len(item_original_ID_to_index))

        tmp_URM_train = reshapeSparse(tmp_URM_train, shape)
        tmp_URM_test = reshapeSparse(tmp_URM_test, shape)
        tmp_URM_validation = reshapeSparse(tmp_URM_validation, shape)

        URM_occurrence = tmp_URM_train + tmp_URM_test + tmp_URM_validation

        URM_all = URM_occurrence.copy()
        URM_all.data = np.ones_like(URM_all.data)

        loaded_URM_dict = {
            "URM_all": URM_all,
            "URM_occurrence": URM_occurrence
        }

        loaded_dataset = Dataset(
            dataset_name=self._get_dataset_name(),
            URM_dictionary=loaded_URM_dict,
            ICM_dictionary=None,
            ICM_feature_mapper_dictionary=None,
            UCM_dictionary=None,
            UCM_feature_mapper_dictionary=None,
            user_original_ID_to_index=user_original_ID_to_index,
            item_original_ID_to_index=item_original_ID_to_index,
            is_implicit=self.IS_IMPLICIT,
        )

        self._print("cleaning temporary files")

        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        self._print("loading complete")

        return loaded_dataset
Example #11
    def __init__(self):

        super(PinterestICCVReader, self).__init__()

        pre_splitted_path = "Data_manager_split_datasets/PinterestICCV/WWW/NeuMF_our_interface/"

        pre_splitted_filename = "splitted_data"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Dataset_Pinterest: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_Pinterest: Pre-splitted data not found, building new one"
            )

            # Ensure file is loaded as matrix
            Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

            dataset = Dataset_github(
                "Conferences/WWW/NeuMF_github/Data/pinterest-20")

            self.URM_train_original, self.URM_test = dataset.trainMatrix, dataset.testRatings

            self.URM_train_original = self.URM_train_original.tocsr()
            self.URM_test = self.URM_test.tocsr()

            from Base.Recommender_utils import reshapeSparse

            shape = (max(self.URM_train_original.shape[0],
                         self.URM_test.shape[0]),
                     max(self.URM_train_original.shape[1],
                         self.URM_test.shape[1]))

            self.URM_train_original = reshapeSparse(self.URM_train_original,
                                                    shape)
            self.URM_test = reshapeSparse(self.URM_test, shape)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=shape[0], n_cols=shape[1])

            for user_index in range(len(dataset.testNegatives)):

                user_test_items = dataset.testNegatives[user_index]

                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            self.URM_test_negative = URM_test_negatives_builder.get_SparseMatrix(
            )

            self.URM_train, self.URM_validation = split_train_validation_leave_one_out_user_wise(
                self.URM_train_original.copy())

            data_dict = {
                "URM_train_original": self.URM_train_original,
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_test_negative": self.URM_test_negative,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

        print("Dataset_Pinterest: Dataset loaded")

        print("N_items {}, n_users {}".format(self.URM_train.shape[1],
                                              self.URM_train.shape[0]))
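
split_train_validation_leave_one_out_user_wise (used here and in Example #4) is assumed to move one randomly chosen interaction per user from train into validation. A minimal sketch under that assumption; the framework's actual helper may differ in its details:

import numpy as np
import scipy.sparse as sps

def leave_one_out_user_wise_sketch(URM_train, seed=42):
    URM_train = sps.csr_matrix(URM_train, copy=True)
    URM_validation = sps.lil_matrix(URM_train.shape)
    rng = np.random.default_rng(seed)

    for user_id in range(URM_train.shape[0]):
        start, end = URM_train.indptr[user_id], URM_train.indptr[user_id + 1]
        if end == start:
            continue   # user has no interactions to hold out
        chosen = rng.integers(start, end)
        URM_validation[user_id, URM_train.indices[chosen]] = URM_train.data[chosen]
        URM_train.data[chosen] = 0.0   # remove the held-out interaction from train

    URM_train.eliminate_zeros()
    return sps.csr_matrix(URM_train), sps.csr_matrix(URM_validation)
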