Example #1
    def __init__(self):

        test_percentage = 0.2
        validation_percentage = 0.2

        pre_splitted_path = "Data_manager_split_datasets/MovielensHetrec2011/RecSys/SpectralCF_our_interface/"

        pre_splitted_filename = "splitted_data"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print(
                "Dataset_MovielensHetrec2011: Attempting to load pre-splitted data"
            )

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_MovielensHetrec2011: Pre-splitted data not found, building new one"
            )

            data_reader = MovielensHetrec2011Reader_DataManager()
            data_reader.load_data()

            URM_all = data_reader.get_URM_all()

            # keep only ratings equal to 5
            URM_all.data = URM_all.data == 5
            URM_all.eliminate_zeros()

            # create train - test - validation
            URM_train_original, self.URM_test = split_train_validation_percentage_user_wise(
                URM_all, train_percentage=1 - test_percentage, verbose=False)

            self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
                URM_train_original,
                train_percentage=1 - validation_percentage,
                verbose=False)

            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

        print("Dataset_MovielensHetrec2011: Dataset loaded")

        ut.print_stat_datareader(self)
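
Every reader in this listing uses the same cache-or-build pattern: try to load a previously saved split, and rebuild it only when load_data_dict raises FileNotFoundError. A minimal sketch of what such helpers could look like, assuming a single pickle file per split (the repository's actual save_data_dict/load_data_dict may store the matrices differently):

    import os
    import pickle

    def save_data_dict(data_dict, folder, filename):
        # Persist all split matrices into a single pickle file
        with open(os.path.join(folder, filename + ".pkl"), "wb") as file:
            pickle.dump(data_dict, file)

    def load_data_dict(folder, filename):
        # Raises FileNotFoundError when the split has never been built,
        # which the readers in this listing catch to trigger a rebuild
        with open(os.path.join(folder, filename + ".pkl"), "rb") as file:
            return pickle.load(file)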
Example #2
    def __init__(self):
        super(PinterestICCVReader, self).__init__()

        pre_splitted_path = "Data_manager_split_datasets/PinterestICCV/SIGIR/CMN_our_interface/"

        pre_splitted_filename = "splitted_data"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("PinterestICCVReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "PinterestICCVReader: Pre-splitted data not found, building new one"
            )

            print("PinterestICCVReader: loading URM")

            # data_reader = PinterestICCVReader()
            # data_reader.load_data()
            #
            # URM_all = data_reader.get_URM_all()
            #
            # self.URM_train, self.URM_validation, self.URM_test, self.URM_negative = split_train_validation_test_negative_leave_one_out_user_wise(URM_all, negative_items_per_positive=100)

            dataset = Dataset_NeuralCollaborativeFiltering(
                "Conferences/WWW/NeuMF_github/Data/pinterest-20")

            self.URM_train_original, self.URM_test, self.URM_test_negative = \
                dataset.URM_train, dataset.URM_test, dataset.URM_test_negative

            self.URM_train, self.URM_validation = split_train_validation_percentage_random_holdout(
                self.URM_train_original.copy(), train_percentage=0.8)

            data_dict = {
                "URM_train_original": self.URM_train_original,
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_test_negative": self.URM_test_negative,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("PinterestICCVReader: loading complete")
Example #3
    def __init__(self):

        super(CiteULikeReader, self).__init__()

        pre_splitted_path = "Data_manager_split_datasets/CiteULike/SIGIR/CMN_our_interface/"

        pre_splitted_filename = "splitted_data"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("CiteULikeReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "CiteULikeReader: Pre-splitted data not found, building new one"
            )

            print("CiteULikeReader: loading URM")

            filename = "Conferences/SIGIR/CMN_github/data/citeulike-a.npz"

            self.URM_train_original, self.URM_test, self.URM_test_negative = self.build_sparse_matrix(
                filename)

            self.URM_train, self.URM_validation = split_train_validation_leave_one_out_user_wise(
                self.URM_train_original.copy())

            data_dict = {
                "URM_train_original": self.URM_train_original,
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_test_negative": self.URM_test_negative,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

        print("N_items {}, n_users {}".format(self.URM_train.shape[1],
                                              self.URM_train.shape[0]))

        print("CiteULikeReader: Dataset loaded")
Example #4
    def __init__(self):
        super(EpinionsReader, self).__init__()


        pre_splitted_path = "Data_manager_split_datasets/Epinions/SIGIR/CMN_our_interface/"

        pre_splitted_filename = "splitted_data"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("EpinionsReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("EpinionsReader: Pre-splitted data not found, building new one")

            print("EpinionsReader: loading URM")


            data_reader = EpinionsReader_DataManager()
            data_reader.load_data()

            URM_all = data_reader.get_URM_all()
            URM_all.data = np.ones_like(URM_all.data)

            self.URM_train, self.URM_validation, self.URM_test, self.URM_test_negative = \
                split_train_validation_test_negative_leave_one_out_user_wise(
                    URM_all, negative_items_per_positive=100)

            # Compatibility with the other two datasets
            self.URM_train_original = self.URM_train + self.URM_validation

            data_dict = {
                "URM_train_original": self.URM_train_original,
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_test_negative": self.URM_test_negative,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)


            print("EpinionsReader: loading complete")
Example #5
    def __init__(self):

        super(Movielens100KReader, self).__init__()


        pre_splitted_path = "Data_manager_split_datasets/Movielens100K/KDD/MCRec_our_interface/"

        pre_splitted_filename = "splitted_data"

        original_data_path = "Conferences/KDD/MCRec_github/data/"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Movielens100KReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("Movielens100KReader: Pre-splitted data not found, building new one")

            print("Movielens100KReader: loading URM")


            from Conferences.KDD.MCRec_github.code.Dataset import Dataset

            dataset_name = 'ml-100k'

            dataset = Dataset(original_data_path + dataset_name)
            URM_train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives

            # Dataset stores 1-indexed user and item ids; shift them to restore 0-indexing
            URM_train = sps.coo_matrix(URM_train)
            URM_train.row -= 1
            URM_train.col -= 1

            self.URM_train = sps.csr_matrix((np.ones_like(URM_train.data), (URM_train.row, URM_train.col)))


            num_users, num_items = self.URM_train.shape



            # Build sparse matrices from lists
            URM_test_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
            URM_test_negative_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)


            for user_index in range(len(testRatings)):

                user_id = testRatings[user_index][0]
                current_user_test_items = testRatings[user_index][1:]
                current_user_test_negative_items = testNegatives[user_index]

                current_user_test_items = np.array(current_user_test_items) - 1
                current_user_test_negative_items = np.array(current_user_test_negative_items) - 1

                URM_test_builder.add_single_row(user_id - 1, current_user_test_items, 1.0)
                URM_test_negative_builder.add_single_row(user_id - 1, current_user_test_negative_items, 1.0)



            # apparently the test data contains duplicate entries
            self.URM_test = URM_test_builder.get_SparseMatrix()

            self.URM_test_negative = URM_test_negative_builder.get_SparseMatrix()


            # Split 10% of the train data off as validation
            from Data_manager.split_functions.split_train_validation import split_train_validation_percentage_user_wise

            self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(self.URM_train, train_percentage=0.9)


            # Load features

            data_reader = Movielens100KReader_DataManager()
            data_reader.load_data()

            zipFile_path = data_reader.DATASET_SPLIT_ROOT_FOLDER + data_reader.DATASET_SUBFOLDER
            dataFile = zipfile.ZipFile(zipFile_path + "ml-100k.zip")

            ICM_path = dataFile.extract("ml-100k/u.item", path=zipFile_path + "decompressed/")

            ICM_genre = self._loadICM(ICM_path)
            ICM_genre = ICM_genre.get_SparseMatrix()

            shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

            self.ICM_dict = {"ICM_genre": ICM_genre}


            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
                "URM_test_negative": self.URM_test_negative,
                "ICM_dict": self.ICM_dict,

            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("Movielens100KReader: loading complete")
Example #6
    def __init__(self):

        super(NetflixPrizeReader, self).__init__()

        pre_splitted_path = "Data_manager_split_datasets/NetflixPrize/WWW/MultiVAE_our_interface/"

        pre_splitted_filename = "splitted_data"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("NetflixPrizeReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("NetflixPrizeReader: Pre-splitted data not found, building new one")

            data_reader = NetflixPrizeReader_DataManager()
            data_reader.load_data()

            URM_all = data_reader.get_URM_all()

            # binarize the data (only keep ratings >= 4)
            URM_all.data = URM_all.data >= 4.0
            URM_all.eliminate_zeros()


            URM_all = sps.coo_matrix(URM_all)

            dict_for_dataframe = {"userId": URM_all.row,
                                  "movieId": URM_all.col,
                                  "rating": URM_all.data
                                }

            URM_all_dataframe = pd.DataFrame(data = dict_for_dataframe)


            self.URM_train, self.URM_train_all, self.URM_validation, self.URM_test = \
                split_train_validation_test_VAE_CF(URM_all_dataframe, n_heldout_users=40000)


            n_rows = max(self.URM_train.shape[0], self.URM_train_all.shape[0], self.URM_validation.shape[0], self.URM_test.shape[0])
            n_cols = max(self.URM_train.shape[1], self.URM_train_all.shape[1], self.URM_validation.shape[1], self.URM_test.shape[1])

            newShape = (n_rows, n_cols)

            # Pad all four splits to the common shape
            self.URM_train = reshapeSparse(self.URM_train, newShape)
            self.URM_train_all = reshapeSparse(self.URM_train_all, newShape)
            self.URM_validation = reshapeSparse(self.URM_validation, newShape)
            self.URM_test = reshapeSparse(self.URM_test, newShape)



            data_dict = {
                "URM_train": self.URM_train,
                "URM_train_all": self.URM_train_all,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,

            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)




            print("NetflixPrizeReader: Dataset loaded")
Example #7
    def __init__(self):

        super(Movielens1MReader, self).__init__()

        pre_splitted_path = "Data_manager_split_datasets/Movielens1M/WWW/NeuMF_our_interface/"

        pre_splitted_filename = "splitted_data"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Dataset_Movielens1M: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_Movielens1M: Pre-splitted data not found, building new one"
            )

            # Monkey-patch the Dataset class so the test ratings file
            # is loaded as a sparse matrix rather than as a list
            Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

            dataset = Dataset_github("Conferences/WWW/NeuMF_github/Data/ml-1m")

            self.URM_train_original, self.URM_test = dataset.trainMatrix, dataset.testRatings

            self.URM_train_original = self.URM_train_original.tocsr()
            self.URM_test = self.URM_test.tocsr()

            from Base.Recommender_utils import reshapeSparse

            shape = (max(self.URM_train_original.shape[0],
                         self.URM_test.shape[0]),
                     max(self.URM_train_original.shape[1],
                         self.URM_test.shape[1]))

            self.URM_train_original = reshapeSparse(self.URM_train_original,
                                                    shape)
            self.URM_test = reshapeSparse(self.URM_test, shape)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=shape[0], n_cols=shape[1])

            for user_index in range(len(dataset.testNegatives)):

                user_test_items = dataset.testNegatives[user_index]

                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            self.URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()

            self.URM_train, self.URM_validation = split_train_validation_leave_one_out_user_wise(
                self.URM_train_original.copy())

            data_dict = {
                "URM_train_original": self.URM_train_original,
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_test_negative": self.URM_test_negative,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

        print("Dataset_Movielens1M: Dataset loaded")

        print("N_items {}, n_users {}".format(self.URM_train.shape[1],
                                              self.URM_train.shape[0]))
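
IncrementalSparseMatrix accumulates rows before producing a csr matrix, which is how the URM_test_negative matrix above is assembled. A stripped-down stand-in covering only the two calls used here (the real class offers more, e.g. the automatic id mappers used in Example #9):

    import scipy.sparse as sps

    class IncrementalSparseMatrixSketch:
        def __init__(self, n_rows, n_cols):
            self._shape = (n_rows, n_cols)
            self._rows, self._cols, self._data = [], [], []

        def add_single_row(self, row_index, col_indices, data=1.0):
            # Append one row's worth of (row, col, value) triplets
            self._rows.extend([row_index] * len(col_indices))
            self._cols.extend(col_indices)
            self._data.extend([data] * len(col_indices))

        def get_SparseMatrix(self):
            return sps.csr_matrix(
                (self._data, (self._rows, self._cols)), shape=self._shape)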
Example #8
    def __init__(self, split_type="cold_user"):

        super(Movielens20MReader, self).__init__()

        assert split_type in ["cold_user", "warm_user"]

        pre_splitted_path = "Data_manager_split_datasets/Movielens20M/WWW/MultiVAE_our_interface/"

        pre_splitted_filename = "splitted_data" + "_" + split_type

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Movielens20MReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("Movielens20MReader: Pre-splitted data not found, building new one")

            data_reader = Movielens20MReader_DataManager()
            data_reader.load_data()

            URM_all = data_reader.get_URM_all()

            # binarize the data (only keep ratings >= 4)
            URM_all.data = URM_all.data >= 4.0
            URM_all.eliminate_zeros()

            if split_type == "cold_user":

                URM_all = sps.coo_matrix(URM_all)

                dict_for_dataframe = {"userId": URM_all.row,
                                      "movieId": URM_all.col,
                                      "rating": URM_all.data
                                      }

                URM_all_dataframe = pd.DataFrame(data=dict_for_dataframe)

                self.URM_train, self.URM_train_all, self.URM_validation, self.URM_test = split_train_validation_test_VAE_CF(
                    URM_all_dataframe,
                    n_heldout_users=10000)

                n_rows = max(self.URM_train.shape[0], self.URM_train_all.shape[0], self.URM_validation.shape[0],
                             self.URM_test.shape[0])
                n_cols = max(self.URM_train.shape[1], self.URM_train_all.shape[1], self.URM_validation.shape[1],
                             self.URM_test.shape[1])

                newShape = (n_rows, n_cols)

                # Pad all four splits to the common shape
                self.URM_train = reshapeSparse(self.URM_train, newShape)
                self.URM_train_all = reshapeSparse(self.URM_train_all, newShape)
                self.URM_validation = reshapeSparse(self.URM_validation, newShape)
                self.URM_test = reshapeSparse(self.URM_test, newShape)

                data_dict = {
                    "URM_train": self.URM_train,
                    "URM_train_all": self.URM_train_all,
                    "URM_test": self.URM_test,
                    "URM_validation": self.URM_validation,

                }



            elif split_type == "warm_user":

                URM_all = sps.csr_matrix(URM_all)
                users_to_keep = np.ediff1d(URM_all.indptr) >= 4
                URM_all = URM_all[users_to_keep, :]

                URM_all = sps.csc_matrix(URM_all)
                items_to_keep = np.ediff1d(URM_all.indptr) >= 1
                URM_all = URM_all[:, items_to_keep]

                URM_all = sps.csr_matrix(URM_all)

                self.URM_train, self.URM_validation, self.URM_test, _ = split_train_validation_test_negative_leave_one_out_user_wise(
                    URM_all)

                data_dict = {
                    "URM_train": self.URM_train,
                    "URM_test": self.URM_test,
                    "URM_validation": self.URM_validation,
                }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("Movielens20MReader: Dataset loaded")
Example #9
    def __init__(self):

        test_percentage = 0.2
        validation_percentage = 0.2

        pre_splitted_path = "Data_manager_split_datasets/AmazonInstantVideo/RecSys/SpectralCF_our_interface/"
        pre_splitted_filename = "splitted_data"

        ratings_file_name = "ratings_Amazon_Instant_Video.csv"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print(
                "Dataset_AmazonInstantVideo: Attempting to load pre-splitted data"
            )

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_AmazonInstantVideo: Pre-splitted data not found, building new one"
            )

            folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

            downloadFromURL(self.DATASET_URL, folder_path, ratings_file_name)

            # read Amazon Instant Video
            df = pd.read_csv(folder_path + ratings_file_name,
                             sep=',',
                             header=None,
                             names=['user', 'item', 'rating',
                                    'timestamp'])[['user', 'item', 'rating']]

            # build the full URM from the ratings dataframe
            URM_train_builder = IncrementalSparseMatrix(
                auto_create_col_mapper=True, auto_create_row_mapper=True)
            URM_train_builder.add_data_lists(df['user'].values,
                                             df['item'].values,
                                             df['rating'].values)
            URM_all = URM_train_builder.get_SparseMatrix()

            # keep only ratings equal to 5
            URM_all.data = URM_all.data == 5
            URM_all.eliminate_zeros()

            # keep only users with at least 5 ratings
            URM_all = ut.filter_urm(URM_all,
                                    user_min_number_ratings=5,
                                    item_min_number_ratings=1)

            # create train - test - validation

            URM_train_original, self.URM_test = split_train_validation_percentage_user_wise(
                URM_all, train_percentage=1 - test_percentage, verbose=False)

            self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
                URM_train_original,
                train_percentage=1 - validation_percentage,
                verbose=False)

            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

        print("Dataset_AmazonInstantVideo: Dataset loaded")

        ut.print_stat_datareader(self)
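
auto_create_row_mapper and auto_create_col_mapper let the builder translate raw user and item ids into consecutive matrix indices. A compact equivalent using pandas.factorize, which returns both the index arrays and the reverse id mappings (illustrative data, not the Amazon file format):

    import pandas as pd
    import scipy.sparse as sps

    df = pd.DataFrame({"user": ["u1", "u1", "u9"],
                       "item": ["iA", "iB", "iA"],
                       "rating": [5.0, 3.0, 5.0]})

    # factorize maps arbitrary ids to 0..n-1 and returns the reverse mapping
    user_indices, user_ids = pd.factorize(df["user"])
    item_indices, item_ids = pd.factorize(df["item"])

    URM_all = sps.csr_matrix((df["rating"].values, (user_indices, item_indices)),
                             shape=(len(user_ids), len(item_ids)))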
Example #10
    def __init__(self, type="original", cold_start=False, cold_items=None):

        assert type in ["original", "ours"]

        # path of the pre-existing Movielens1M split from the SpectralCF repository
        movielens_splitted_path = "Conferences/RecSys/SpectralCF_github/data/ml-1m/"

        pre_splitted_path = "Data_manager_split_datasets/Movielens1M/RecSys/SpectralCF_our_interface/"

        mode = 1  # cold-start mode used by the original authors

        if cold_start:
            assert (isinstance(cold_items, int) and cold_items > 0)
            pre_splitted_filename = "splitted_data_{}_cold_start_{}_mode_{}".format(
                type, cold_items, mode)
        else:
            pre_splitted_filename = "splitted_data_{}".format(type)

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_Movielens1M: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_Movielens1M: Pre-splitted data not found, building new one"
            )

            if type == "original":
                assert (cold_start is False)

                # use the SpectralCF class to read data
                data_generator = Data(
                    train_file=movielens_splitted_path + 'train_users.dat',
                    test_file=movielens_splitted_path + 'test_users.dat',
                    batch_size=BATCH_SIZE)

                # convert train into csr
                full_train_matrix = sps.csr_matrix(data_generator.R)
                URM_train_original = full_train_matrix

                # convert test into csr
                test_set = data_generator.test_set
                uids, items = [], []
                for uid in test_set.keys():
                    uids += np.full(len(test_set[uid]), uid).tolist()
                    items += test_set[uid]
                test_matrix = sps.csr_matrix(
                    (np.ones(len(items)), (uids, items)),
                    shape=(full_train_matrix.shape))

                if not cold_start:
                    self.URM_test = test_matrix

                    # create validation
                    self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
                        URM_train_original,
                        train_percentage=0.9,
                        verbose=False)

                else:
                    # unreachable: cold_start is asserted False for type "original" above
                    print('nothing')

            elif type == "ours":

                data_reader = Movielens1MReader_DataManager()
                data_reader.load_data()

                URM_all = data_reader.get_URM_all()

                # keep only ratings equal to 5
                URM_all.data = URM_all.data == 5
                URM_all.eliminate_zeros()

                if not cold_start:
                    URM_train, self.URM_test = split_train_validation_percentage_user_wise(
                        URM_all, train_percentage=0.8, verbose=False)

                    self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
                        URM_train, train_percentage=0.9, verbose=False)

                else:

                    if mode == 1:  # the original authors' mode: cold start on the full dataset
                        self.URM_train, URM_test = split_train_validation_cold_start_user_wise(
                            URM_all,
                            full_train_percentage=0.0,
                            cold_items=cold_items,
                            verbose=False)

                        self.URM_test, self.URM_validation = split_train_validation_percentage_user_wise(
                            URM_test, train_percentage=0.9, verbose=False)

                    elif mode == 2:  # cold start only for some users
                        URM_train, self.URM_test = split_train_validation_cold_start_user_wise(
                            URM_all,
                            full_train_percentage=0.8,
                            cold_items=cold_items,
                            verbose=False)

                        self.URM_train, self.URM_validation = split_train_validation_cold_start_user_wise(
                            URM_train,
                            full_train_percentage=0.9,
                            cold_items=cold_items,
                            verbose=False)

            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

        print("Dataset_Movielens1M: Dataset loaded")

        ut.print_stat_datareader(self)
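
split_train_validation_percentage_user_wise splits each user's own interactions by the given percentage, so every user keeps data on both sides of the split. A sketch under that assumption:

    import numpy as np
    import scipy.sparse as sps

    def split_user_wise(URM, train_percentage=0.8, seed=42):
        URM = sps.csr_matrix(URM)
        rng = np.random.default_rng(seed)
        train_mask = np.zeros(URM.nnz, dtype=bool)

        # Draw the train/test assignment independently within each user's row
        for user_id in range(URM.shape[0]):
            start, stop = URM.indptr[user_id], URM.indptr[user_id + 1]
            train_mask[start:stop] = rng.random(stop - start) < train_percentage

        URM_coo = sps.coo_matrix(URM)  # same entry order as the csr arrays
        URM_train = sps.csr_matrix((URM_coo.data[train_mask],
                                    (URM_coo.row[train_mask], URM_coo.col[train_mask])),
                                   shape=URM.shape)
        URM_test = sps.csr_matrix((URM_coo.data[~train_mask],
                                   (URM_coo.row[~train_mask], URM_coo.col[~train_mask])),
                                  shape=URM.shape)
        return URM_train, URM_test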
Example #11
    def __init__(self, dataset_variant="a", train_interactions=1):

        super(CiteulikeReader, self).__init__()

        assert dataset_variant in [
            "a", "t"
        ], "CiteulikeReader: dataset_variant must be either 'a' or 't'"
        assert train_interactions in [
            1, 10, "all"
        ], "CiteulikeReader: train_interactions must be: 1, 10 or 'all'"

        pre_splitted_path = "Data_manager_split_datasets/CiteULike/KDD/CollaborativeVAE_our_interface/"

        pre_splitted_filename = "splitted_data_citeulike-{}-{}-items".format(
            dataset_variant, train_interactions)

        original_data_path = "Conferences/KDD/CollaborativeVAE_github/data/citeulike-{}/".format(
            dataset_variant)

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("CiteulikeReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "CiteulikeReader: Pre-splitted data not found, building new one"
            )

            print("CiteulikeReader: loading URM")

            if train_interactions == "all":
                train_interactions_file_suffix = 10
            else:
                train_interactions_file_suffix = train_interactions

            URM_train_builder = self._load_data_file(
                original_data_path +
                "cf-train-{}-users.dat".format(train_interactions_file_suffix))
            URM_test_builder = self._load_data_file(
                original_data_path +
                "cf-test-{}-users.dat".format(train_interactions_file_suffix))

            self.URM_test = URM_test_builder.get_SparseMatrix()
            self.URM_train = URM_train_builder.get_SparseMatrix()

            if dataset_variant == "a":
                self.ICM_title_abstract = scipy.io.loadmat(original_data_path +
                                                           "mult_nor.mat")['X']
            else:
                # Variant "t" uses a different file format and is transposed
                self.ICM_title_abstract = h5py.File(original_data_path +
                                                    "mult_nor.mat").get('X')
                self.ICM_title_abstract = sps.csr_matrix(
                    self.ICM_title_abstract).T

            self.ICM_title_abstract = sps.csr_matrix(self.ICM_title_abstract)

            n_rows = max(self.URM_test.shape[0], self.URM_train.shape[0])
            n_cols = max(self.URM_test.shape[1], self.URM_train.shape[1],
                         self.ICM_title_abstract.shape[0])

            newShape = (n_rows, n_cols)

            self.URM_test = reshapeSparse(self.URM_test, newShape)
            self.URM_train = reshapeSparse(self.URM_train, newShape)

            if train_interactions == "all":

                self.URM_train += self.URM_test

                self.URM_train, self.URM_test = split_train_validation_percentage_random_holdout(
                    self.URM_train, train_percentage=0.8)
                self.URM_train, self.URM_validation = split_train_validation_percentage_random_holdout(
                    self.URM_train, train_percentage=0.8)

            else:

                self.URM_train, self.URM_validation = split_train_validation_percentage_random_holdout(
                    self.URM_train, train_percentage=0.8)

            data_dict = {
                "URM_train": self.URM_train,
                "URM_test": self.URM_test,
                "URM_validation": self.URM_validation,
                "ICM_title_abstract": self.ICM_title_abstract
            }

            save_data_dict(data_dict, pre_splitted_path, pre_splitted_filename)

            print("CiteulikeReader: loading complete")