    def __init__(self, pre_splitted_path):

        test_percentage = 0.2
        validation_percentage = 0.2

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"
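        # The split is cached on disk: first try to load a previously saved split,
        # otherwise build it from the raw data and persist it for later runs.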

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print(
                "Dataset_MovielensHetrec2011: Attempting to load pre-splitted data"
            )

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_MovielensHetrec2011: Pre-splitted data not found, building new one"
            )

            data_reader = MovielensHetrec2011Reader_DataManager()
            loaded_dataset = data_reader.load_data()

            URM_all = loaded_dataset.get_URM_all()

            # Keep only interactions with rating 5: mask the data, then drop the explicit zeros
            URM_all.data = URM_all.data == 5
            URM_all.eliminate_zeros()

            # create train - test - validation
            URM_train_original, URM_test = split_train_validation_percentage_user_wise(
                URM_all, train_percentage=1 - test_percentage, verbose=False)

            URM_train, URM_validation = split_train_validation_percentage_user_wise(
                URM_train_original,
                train_percentage=1 - validation_percentage,
                verbose=False)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("Dataset_MovielensHetrec2011: Dataset loaded")

        ut.print_stat_datareader(self)
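    # A minimal usage sketch (the class name and data path are assumptions, not part
    # of the snippet above): instantiate the reader once, then access the cached
    # split through its URM_DICT attribute.
    #
    #     dataset = Dataset_MovielensHetrec2011("Data_manager_split_datasets/MovielensHetrec2011/")
    #     URM_train = dataset.URM_DICT["URM_train"]
    #     URM_test = dataset.URM_DICT["URM_test"]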
    def __init__(self, pre_splitted_path):
        super(PinterestICCVReader, self).__init__()

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("PinterestICCVReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "PinterestICCVReader: Pre-splitted data not found, building new one"
            )

            print("PinterestICCVReader: loading URM")

            # data_reader = PinterestICCVReader()
            # loaded_dataset = data_reader.load_data()
            #
            # URM_all = loaded_dataset.get_URM_all()
            #
            # URM_train, URM_validation, URM_test, URM_negative = split_train_validation_test_negative_leave_one_out_user_wise(URM_all, negative_items_per_positive=100)

            dataset = Dataset_NeuralCollaborativeFiltering(
                "Conferences/WWW/NeuMF_github/Data/pinterest-20")

            URM_train_original, URM_test, URM_test_negative = dataset.URM_train, dataset.URM_test, dataset.URM_test_negative

            URM_train, URM_validation = split_train_validation_percentage_random_holdout(
                URM_train_original.copy(), train_percentage=0.8)
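            # URM_train_original keeps the full training data of the original split,
            # while URM_train / URM_validation are a further 80/20 random holdout of it
            # used for model selection.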

            self.URM_DICT = {
                "URM_train_original": URM_train_original,
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_test_negative": URM_test_negative,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

            print("PinterestICCVReader: loading complete")
    def __init__(self, pre_splitted_path):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:
            print("Dataset_{}: Pre-splitted data not found, building new one".format(self.DATASET_NAME))

            # Create the split from the full dataset, with a time-wise leave-one-out on the ORIGINAL full dataset
            data_reader = AmazonMusicReader_DataManager()
            loaded_dataset = data_reader.load_data()

            URM_all = loaded_dataset.get_URM_from_name("URM_all")
            URM_timestamp = loaded_dataset.get_URM_from_name("URM_timestamp")

            # make data implicit
            URM_all.data = np.ones_like(URM_all.data)

            # Remove users with fewer than 20 interactions, then drop items left with no interactions
            URM_all = filter_urm(URM_all, user_min_number_ratings=20, item_min_number_ratings=0)
            URM_all = filter_urm(URM_all, user_min_number_ratings=0, item_min_number_ratings=1)
            URM_timestamp = filter_urm(URM_timestamp, user_min_number_ratings=20, item_min_number_ratings=0)
            URM_timestamp = filter_urm(URM_timestamp, user_min_number_ratings=0, item_min_number_ratings=1)
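            # Time-wise split: the most recent interactions of each user go to validation
            # and test, with 99 sampled negative items per positive.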

            URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=99)


            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
                "URM_timestamp": URM_timestamp,
            }


            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        print_stat_datareader(self)
    def __init__(self, pre_splitted_path):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_FilmTrust: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_FilmTrust: Pre-splitted data not found, building new one"
            )

            data_reader = FilmTrustReader_DataManager()
            loaded_dataset = data_reader.load_data()

            URM_all = loaded_dataset.get_URM_all()

            URM_all.eliminate_zeros()

            URM_all.data = np.ones_like(URM_all.data)

            URM_train, URM_test = split_train_validation_percentage_random_holdout(
                URM_all, train_percentage=0.8)

            URM_train, URM_validation = split_train_validation_percentage_random_holdout(
                URM_train, train_percentage=0.9)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("FilmTrust: Dataset loaded")

        ut.print_stat_datareader(self)
    def __init__(self, pre_splitted_path):
        super(EpinionsReader, self).__init__()

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("EpinionsReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
                 self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("EpinionsReader: Pre-splitted data not found, building new one")

            print("EpinionsReader: loading URM")


            data_reader = EpinionsReader_DataManager()
            loaded_dataset = data_reader.load_data()

            URM_all = loaded_dataset.get_URM_all()
            URM_all.data = np.ones_like(URM_all.data)
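            # Per-user leave-one-out split with 100 sampled negative items per positive.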

            URM_train, URM_validation, URM_test, URM_test_negative = split_train_validation_test_negative_leave_one_out_user_wise(URM_all, negative_items_per_positive=100)

            # Compatibility with the other two datasets
            URM_train_original = URM_train + URM_validation

            self.URM_DICT = {
                "URM_train_original": URM_train_original,
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_test_negative": URM_test_negative,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)


            print("EpinionsReader: loading complete")
    def __init__(self, pre_splitted_path):
        super(CiteULikeReader, self).__init__()

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("CiteULikeReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "CiteULikeReader: Pre-splitted data not found, building new one"
            )

            print("CiteULikeReader: loading URM")

            filename = "Conferences/SIGIR/CMN_github/data/citeulike-a.npz"

            URM_train_original, URM_test, URM_test_negative = self.build_sparse_matrix(
                filename)

            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                URM_train_original.copy())

            self.URM_DICT = {
                "URM_train_original": URM_train_original,
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_test_negative": URM_test_negative,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("CiteULikeReader: Dataset loaded")
    def __init__(self, pre_splitted_path):

        super(Movielens1MReader, self).__init__()

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Dataset_Movielens1M: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_Movielens1M: Pre-splitted data not found, building new one"
            )

            # Monkey-patch the original Dataset class so that the test ratings file
            # is loaded as a sparse matrix instead of a list
            Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

            dataset = Dataset_github("Conferences/WWW/NeuMF_github/Data/ml-1m")

            URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

            URM_train_original = URM_train_original.tocsr()
            URM_test = URM_test.tocsr()

            from Base.Recommender_utils import reshapeSparse

            shape = (max(URM_train_original.shape[0], URM_test.shape[0]),
                     max(URM_train_original.shape[1], URM_test.shape[1]))

            URM_train_original = reshapeSparse(URM_train_original, shape)
            URM_test = reshapeSparse(URM_test, shape)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=shape[0], n_cols=shape[1])
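            # dataset.testNegatives holds, for each user, the list of sampled negative
            # items; pack them into a sparse matrix with one row per user.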

            for user_index in range(len(dataset.testNegatives)):

                user_test_items = dataset.testNegatives[user_index]

                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()

            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                URM_train_original.copy())

            self.URM_DICT = {
                "URM_train_original": URM_train_original,
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_test_negative": URM_test_negative,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("Dataset_Movielens1M: Dataset loaded")
    def __init__(self,
                 pre_splitted_path,
                 dataset_variant="a",
                 train_interactions=1):

        super(CiteulikeReader, self).__init__()

        assert dataset_variant in [
            "a", "t"
        ], "CiteulikeReader: dataset_variant must be either 'a' or 't'"
        assert train_interactions in [
            1, 10, "all"
        ], "CiteulikeReader: train_interactions must be: 1, 10 or 'all'"

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        original_data_path = "Conferences/KDD/CollaborativeVAE_github/data/citeulike-{}/".format(
            dataset_variant)

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("CiteulikeReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "CiteulikeReader: Pre-splitted data not found, building new one"
            )

            print("CiteulikeReader: loading URM")

            if train_interactions == "all":
                train_interactions_file_suffix = 10
            else:
                train_interactions_file_suffix = train_interactions

            URM_train_builder = self._load_data_file(
                original_data_path +
                "cf-train-{}-users.dat".format(train_interactions_file_suffix))
            URM_test_builder = self._load_data_file(
                original_data_path +
                "cf-test-{}-users.dat".format(train_interactions_file_suffix))

            URM_test = URM_test_builder.get_SparseMatrix()
            URM_train = URM_train_builder.get_SparseMatrix()

            if dataset_variant == "a":
                ICM_tokens_TFIDF = scipy.io.loadmat(original_data_path +
                                                    "mult_nor.mat")['X']
            else:
                # Variant "t" uses a different file format and is transposed
                ICM_tokens_TFIDF = h5py.File(original_data_path +
                                             "mult_nor.mat").get('X')
                ICM_tokens_TFIDF = sps.csr_matrix(ICM_tokens_TFIDF).T

            ICM_tokens_TFIDF = sps.csr_matrix(ICM_tokens_TFIDF)
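            # Keep both the TF-IDF weighted item-token matrix and a binarized
            # (token presence) version of it.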

            ICM_tokens_bool = ICM_tokens_TFIDF.copy()
            ICM_tokens_bool.data = np.ones_like(ICM_tokens_bool.data)

            n_rows = max(URM_test.shape[0], URM_train.shape[0])
            n_cols = max(URM_test.shape[1], URM_train.shape[1],
                         ICM_tokens_TFIDF.shape[0])

            newShape = (n_rows, n_cols)

            URM_test = reshapeSparse(URM_test, newShape)
            URM_train = reshapeSparse(URM_train, newShape)

            if train_interactions == "all":

                URM_train += URM_test

                URM_train, URM_test = split_train_validation_percentage_random_holdout(
                    URM_train, train_percentage=0.8)
                URM_train, URM_validation = split_train_validation_percentage_random_holdout(
                    URM_train.copy(), train_percentage=0.8)

            elif train_interactions == 10:
                # If train interactions == 10 the train will NOT contain the validation data
                URM_train, URM_validation = split_train_validation_percentage_random_holdout(
                    URM_train.copy(), train_percentage=0.8)

            else:
                # If train interactions == 1 the train WILL contain the validation data
                _, URM_validation = split_train_validation_percentage_random_holdout(
                    URM_train.copy(), train_percentage=0.8)

            self.ICM_DICT = {
                "ICM_tokens_TFIDF": ICM_tokens_TFIDF,
                "ICM_tokens_bool": ICM_tokens_bool,
            }

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

            print("CiteulikeReader: loading complete")
    def __init__(self, pre_splitted_path, original=True):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            compressed_file_folder = "Conferences/IJCAI/ConvNCF_github/Data/"
            decompressed_file_folder = "Data_manager_split_datasets/Yelp/"

            # compressed_file = tarfile.open(compressed_file_folder + "yelp.test.negative.gz", "r:gz")
            # compressed_file.extract("yelp.test.negative", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "yelp.test.rating.gz", "r:gz")
            # compressed_file.extract("yelp.test.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "yelp.train.rating.gz", "r:gz")
            # compressed_file.extract("yelp.train.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()

            # if original:

            # Monkey-patch the original Dataset class so that the test ratings file
            # is parsed as a sparse matrix (using the training-file loader) instead of a list
            Dataset_github.load_rating_file_as_list = Dataset_github.load_training_file_as_matrix

            try:
                dataset = Dataset_github(compressed_file_folder + "yelp")

            except FileNotFoundError as exc:

                print(
                    "Dataset_{}: Uncompressed files not found, please manually decompress the *.gz files in this folder: '{}'"
                    .format(self.DATASET_NAME, compressed_file_folder))

                raise exc

            URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

            n_users = max(URM_train_original.shape[0], URM_test.shape[0])
            n_items = max(URM_train_original.shape[1], URM_test.shape[1])

            URM_train_original = sps.csr_matrix(URM_train_original,
                                                shape=(n_users, n_items))
            URM_test = sps.csr_matrix(URM_test, shape=(n_users, n_items))

            URM_train_original.data = np.ones_like(URM_train_original.data)
            URM_test.data = np.ones_like(URM_test.data)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users, n_cols=n_items)

            n_negative_samples = 999
            for user_index in range(len(dataset.testNegatives)):
                user_test_items = dataset.testNegatives[user_index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead {}".format(
                            user_index, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                URM_train_original.copy(), verbose=False)

            #
            # else:
            #     data_reader = YelpReader_DataManager()
            #     loaded_dataset = data_reader.load_data()
            #
            #     URM_all = loaded_dataset.get_URM_all()
            #
            #     URM_timestamp = URM_all.copy()
            #
            #     URM_all.data = np.ones_like(URM_all.data)
            #
            #     URM_train, URM_validation, URM_test, URM_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=999)
            #     URM_train = URM_train + URM_validation
            #     URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train, verbose=False)

            shutil.rmtree(decompressed_file_folder + "decompressed/",
                          ignore_errors=True)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
    def __init__(self, pre_splitted_path):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            compressed_file_folder = "Conferences/IJCAI/ConvNCF_github/Data/"
            decompressed_file_folder = "Data_manager_split_datasets/Gowalla/"

            # compressed_file = tarfile.open(compressed_file_folder + "gowalla.test.negative.gz", "r:gz")
            # compressed_file.extract("yelp.test.negative", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "gowalla.test.rating.gz", "r:gz")
            # compressed_file.extract("yelp.test.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()
            #
            # compressed_file = tarfile.open(compressed_file_folder + "gowalla.train.rating.gz", "r:gz")
            # compressed_file.extract("yelp.train.rating", path=decompressed_file_folder + "decompressed/")
            # compressed_file.close()

            # if original:

            # Monkey-patch the original Dataset class so that the test ratings file
            # is parsed as a sparse matrix (using the training-file loader) instead of a list
            Dataset_github.load_rating_file_as_list = Dataset_github.load_training_file_as_matrix

            try:
                dataset = Dataset_github(compressed_file_folder + "gowalla")

            except FileNotFoundError as exc:

                print(
                    "Dataset_{}: Gowalla files not found, please download them and put them in this folder '{}', url: {}"
                    .format(self.DATASET_NAME, compressed_file_folder,
                            self.DATASET_URL))
                print(
                    "Dataset_{}: Uncompressed files not found, please manually decompress the *.gz files in this folder: '{}'"
                    .format(self.DATASET_NAME, compressed_file_folder))

                raise exc

            URM_train_original, URM_test = dataset.trainMatrix, dataset.testRatings

            n_users = max(URM_train_original.shape[0], URM_test.shape[0])
            n_items = max(URM_train_original.shape[1], URM_test.shape[1])

            URM_train_original = sps.csr_matrix(URM_train_original,
                                                shape=(n_users, n_items))
            URM_test = sps.csr_matrix(URM_test, shape=(n_users, n_items))

            URM_train_original.data = np.ones_like(URM_train_original.data)
            URM_test.data = np.ones_like(URM_test.data)

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users, n_cols=n_items)

            n_negative_samples = 999
            for user_index in range(len(dataset.testNegatives)):
                user_test_items = dataset.testNegatives[user_index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead {}".format(
                            user_index, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(user_index,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix(
            ).tocsr()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                URM_train_original.copy(), verbose=False)

            #
            #
            # # NOT USED
            # # elif not time_split: #create from full dataset with random leave one out from LINKED dateset in the article since timestamp is not present.
            # #
            # #     data_reader = GowallaGithubReader_DataManager()
            # #     loaded_dataset = data_reader.load_data()
            # #
            # #     URM_all = loaded_dataset.get_URM_all()
            # #
            # #     URM_all.eliminate_zeros()
            # #
            # #     URM_all.data = np.ones_like(URM_all.data)
            # #
            # #     #use this function 2 time because the order could change slightly the number of final interactions
            # #     #with this order we get the same number of interactions as in the paper
            # #     URM_all = filter_urm(URM_all, user_min_number_ratings=0, item_min_number_ratings=10)
            # #     URM_all = filter_urm(URM_all, user_min_number_ratings=2, item_min_number_ratings=0)
            # #
            # #     URM_train, URM_validation, URM_test, URM_negative = split_train_validation_test_negative_leave_one_out_user_wise(URM_all, negative_items_per_positive=999,
            # #                                                                                                                                          at_least_n_train_items_test=0, at_least_n_train_items_validation=0,
            # #                                                                                                                                          verbose=True)
            # #     URM_timestamp = sps.csc_matrix(([],([],[])), shape=URM_train.shape)
            #
            # else: # create from full dataset with leave out one time wise from ORIGINAL full dateset
            #     data_reader = GowallaReader_DataManager()
            #     loaded_dataset = data_reader.load_data()
            #
            #     URM_all = loaded_dataset.get_URM_all()
            #
            #     # use this function 2 time because the order could change slightly the number of final interactions
            #     # with this order we get the same number of interactions as in the paper
            #     URM_all = filter_urm(URM_all, user_min_number_ratings=0, item_min_number_ratings=10)
            #     URM_all = filter_urm(URM_all, user_min_number_ratings=2, item_min_number_ratings=0)
            #
            #     URM_timestamp = URM_all.copy()
            #     URM_all.data = np.ones_like(URM_all.data)
            #
            #     URM_train, URM_validation, URM_test, URM_negative = split_data_on_timestamp(URM_all, URM_timestamp, negative_items_per_positive=999)
            #     URM_train = URM_train + URM_validation
            #     URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(URM_train, verbose=False)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
    def __init__(self,
                 pre_splitted_path,
                 type="original",
                 cold_start=False,
                 cold_items=None):

        assert type in ["original", "ours"]

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # Cold-start split mode used by the original authors (mode 1: cold start on the full dataset)
        mode = 1

        # Path of the pre-existing Movielens1M split shipped with the SpectralCF code
        movielens_splitted_path = "Conferences/RecSys/SpectralCF_github/data/ml-1m/"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_Movielens1M: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Dataset_Movielens1M: Pre-splitted data not found, building new one"
            )

            if type == "original":
                assert (cold_start is False)

                # use the SpectralCF class to read data
                data_generator = Data(
                    train_file=movielens_splitted_path + 'train_users.dat',
                    test_file=movielens_splitted_path + 'test_users.dat',
                    batch_size=BATCH_SIZE)

                # convert train into csr
                full_train_matrix = sps.csr_matrix(data_generator.R)
                URM_train_original = full_train_matrix

                # convert test into csr
                test_set = data_generator.test_set
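                # test_set maps each user id to the list of its held-out test items;
                # flatten it into coordinate lists to build a sparse test matrix.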
                uids, items = [], []
                for uid in test_set.keys():
                    uids += np.full(len(test_set[uid]), uid).tolist()
                    items += test_set[uid]
                test_matrix = sps.csr_matrix(
                    (np.ones(len(items)), (uids, items)),
                    shape=(full_train_matrix.shape))

                if not cold_start:
                    URM_test = test_matrix

                    # create validation
                    URM_train, URM_validation = split_train_validation_percentage_user_wise(
                        URM_train_original,
                        train_percentage=0.9,
                        verbose=False)

                else:
                    # Unreachable: cold_start is asserted to be False when type == "original"
                    print('nothing')

            elif type == "ours":

                data_reader = Movielens1MReader_DataManager()
                loaded_dataset = data_reader.load_data()

                URM_all = loaded_dataset.get_URM_all()

                URM_all.data = URM_all.data == 5
                URM_all.eliminate_zeros()

                if not cold_start:
                    URM_train, URM_test = split_train_validation_percentage_user_wise(
                        URM_all, train_percentage=0.8, verbose=False)

                    URM_train, URM_validation = split_train_validation_percentage_user_wise(
                        URM_train, train_percentage=0.9, verbose=False)

                else:

                    if mode == 1:  # their mode, cold start for full dataset
                        URM_train, URM_test = split_train_validation_cold_start_user_wise(
                            URM_all,
                            full_train_percentage=0.0,
                            cold_items=cold_items,
                            verbose=False)

                        URM_test, URM_validation = split_train_validation_percentage_user_wise(
                            URM_test, train_percentage=0.9, verbose=False)

                    if mode == 2:  # cold start only for some users
                        URM_train, URM_test = split_train_validation_cold_start_user_wise(
                            URM_all,
                            full_train_percentage=0.8,
                            cold_items=cold_items,
                            verbose=False)

                        URM_train, URM_validation = split_train_validation_cold_start_user_wise(
                            URM_train,
                            full_train_percentage=0.9,
                            cold_items=cold_items,
                            verbose=False)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("Dataset_Movielens1M: Dataset loaded")

        ut.print_stat_datareader(self)
    def __init__(self, pre_splitted_path, split_type = "cold_user"):
        super(Movielens20MReader, self).__init__()

        assert split_type in ["cold_user", "warm_user"]

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Movielens20MReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
                 self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("Movielens20MReader: Pre-splitted data not found, building new one")

            data_reader = Movielens20MReader_DataManager()
            loaded_dataset = data_reader.load_data()

            URM_all = loaded_dataset.get_URM_all()

            # binarize the data (only keep ratings >= 4)
            URM_all.data = URM_all.data >= 4.0
            URM_all.eliminate_zeros()


            if split_type == "cold_user":

                URM_all = sps.coo_matrix(URM_all)

                dict_for_dataframe = {"userId": URM_all.row,
                                      "movieId": URM_all.col,
                                      "rating": URM_all.data
                                      }

                URM_all_dataframe = pd.DataFrame(data=dict_for_dataframe)


                URM_train, URM_train_all, URM_validation, URM_test = split_train_validation_test_VAE_CF(
                    URM_all_dataframe, n_heldout_users=10000)


                n_rows = max(URM_train.shape[0], URM_train_all.shape[0], URM_validation.shape[0], URM_test.shape[0])
                n_cols = max(URM_train.shape[1], URM_train_all.shape[1], URM_validation.shape[1], URM_test.shape[1])

                newShape = (n_rows, n_cols)

                URM_test = reshapeSparse(URM_test, newShape)
                URM_train = reshapeSparse(URM_train, newShape)
                URM_train_all = reshapeSparse(URM_train_all, newShape)
                URM_validation = reshapeSparse(URM_validation, newShape)


                self.URM_DICT = {
                    "URM_train": URM_train,
                    "URM_train_all": URM_train_all,
                    "URM_test": URM_test,
                    "URM_validation": URM_validation,
                }



            elif split_type == "warm_user":


                URM_all = sps.csr_matrix(URM_all)
                users_to_keep = np.ediff1d(URM_all.indptr) >= 4
                URM_all = URM_all[users_to_keep,:]

                URM_all = sps.csc_matrix(URM_all)
                items_to_keep = np.ediff1d(URM_all.indptr) >= 1
                URM_all = URM_all[:,items_to_keep]


                URM_all = sps.csr_matrix(URM_all)

                URM_train, URM_validation, URM_test, _ = split_train_validation_test_negative_leave_one_out_user_wise(URM_all)


                self.URM_DICT = {
                    "URM_train": URM_train,
                    "URM_test": URM_test,
                    "URM_validation": URM_validation,
                }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)





            print("Movielens20MReader: Dataset loaded")
    def __init__(self, pre_splitted_path, original=True):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            if original:

                URM_path = 'Conferences/IJCAI/DMF_original/data_www/Amazon_ratings_Digital_Music_pruned.txt'
                #
                # dataFile = open(URM_path, "r")
                #
                # # textData = dataFile.readlines()
                # dataFile.close()
                #
                # u_map = {}
                # discarded = 0
                # for line in tqdm(textData):
                #     line = line.split(' ')
                #     u, i, rating, new_time = int(line[0]), int(line[1]), float(line[2]), int(line[3])
                #
                #     # convert u id and i id in integer starting from 0 and initialize u_map
                #     if u not in u_map:
                #         u_map[u] = {}
                #
                #     if i not in u_map[u]:
                #         u_map[u][i] = [rating, new_time]
                #     else:  # rating already exist, keep the most recent timestamp
                #         discarded += 1
                #         current_time = u_map[u][i][1]
                #         if new_time > current_time:
                #             u_map[u][i] = [rating, new_time]
                #
                # print('Merged {} interactions, kept the most recent timestamps'.format(discarded))
                #
                # UTM_builder = IncrementalSparseMatrix()
                # URM_builder = IncrementalSparseMatrix()
                #
                # for u in u_map:
                #     items, ratings, timestamps = [], [], []
                #     for i in u_map[u]:
                #         items.append(i)
                #         timestamps.append(u_map[u][i][1])
                #         ratings.append(u_map[u][i][0])
                #     UTM_builder.add_data_lists(row_list_to_add=np.full(len(items), int(u)), col_list_to_add=items, data_list_to_add=timestamps)
                #     URM_builder.add_data_lists(row_list_to_add=np.full(len(items), int(u)), col_list_to_add=items, data_list_to_add=ratings)
                #

                URM_rating_builder = IncrementalSparseMatrix(
                    auto_create_col_mapper=True, auto_create_row_mapper=True)
                URM_timestamp_builder = IncrementalSparseMatrix(
                    auto_create_col_mapper=True, auto_create_row_mapper=True)

                # URM_duplicate_assert_builder = IncrementalSparseMatrix( auto_create_col_mapper = True, auto_create_row_mapper = True)

                df_original = pd.read_csv(filepath_or_buffer=URM_path,
                                          sep=" ",
                                          header=None,
                                          dtype={
                                              0: int,
                                              1: int,
                                              2: float,
                                              3: int
                                          })

                df_original.columns = [
                    'userId', 'itemId', 'rating', 'timestamp'
                ]

                userId_list = df_original['userId'].values
                itemId_list = df_original['itemId'].values
                rating_list = df_original['rating'].values
                timestamp_list = df_original['timestamp'].values

                URM_rating_builder.add_data_lists(userId_list, itemId_list,
                                                  rating_list)
                URM_timestamp_builder.add_data_lists(userId_list, itemId_list,
                                                     timestamp_list)

                # URM_duplicate_assert_builder.add_data_lists(userId_list, itemId_list, np.ones_like(rating_list))
                # URM_duplicate_assert = URM_duplicate_assert_builder.get_SparseMatrix()
                #
                # assert np.all(URM_duplicate_assert.data == 1.0), "Duplicates detected"

                # Check if duplicates exist
                num_unique_user_item_ids = df_original.drop_duplicates(
                    ['userId', 'itemId'], keep='first', inplace=False).shape[0]
                assert num_unique_user_item_ids == len(
                    userId_list), "Duplicate (user, item) values found"

                URM_timestamp = URM_timestamp_builder.get_SparseMatrix()
                URM_all = URM_rating_builder.get_SparseMatrix()

                URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp(
                    URM_all, URM_timestamp, negative_items_per_positive=99)

                # We want the validation to be sampled at random, not as the last interaction
                URM_train = URM_train + URM_validation
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_train, verbose=False)

            else:
                # Create the split from the full dataset, with a time-wise leave-one-out on the ORIGINAL full dataset
                data_reader = AmazonMusicReader_DataManager()
                loaded_dataset = data_reader.load_data()

                URM_all = loaded_dataset.get_URM_from_name("URM_all")
                URM_timestamp = loaded_dataset.get_URM_from_name(
                    "URM_timestamp")

                # Apply the filter twice because the filtering order can slightly change the number of final interactions
                URM_all = filter_urm(URM_all,
                                     user_min_number_ratings=1,
                                     item_min_number_ratings=5)
                URM_all = filter_urm(URM_all,
                                     user_min_number_ratings=20,
                                     item_min_number_ratings=1)
                URM_timestamp = filter_urm(URM_timestamp,
                                           user_min_number_ratings=1,
                                           item_min_number_ratings=5)
                URM_timestamp = filter_urm(URM_timestamp,
                                           user_min_number_ratings=20,
                                           item_min_number_ratings=1)


                URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp(
                    URM_all, URM_timestamp, negative_items_per_positive=99)

                # We want the validation to be sampled at random, not as the last interaction
                URM_train = URM_train + URM_validation
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_train, verbose=False)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
                "URM_timestamp": URM_timestamp,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        print_stat_datareader(self)
    def __init__(self, pre_splitted_path):

        super(Movielens100KReader, self).__init__()

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        original_data_path = "Conferences/KDD/MCRec_github/data/"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Movielens100KReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
                 self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("Movielens100KReader: Pre-splitted data not found, building new one")

            print("Movielens100KReader: loading URM")


            from Conferences.KDD.MCRec_github.code.Dataset import Dataset

            dataset = 'ml-100k'

            dataset = Dataset(original_data_path + dataset)
            URM_train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives

            # The Dataset class adds 1 to user and item ids; subtract it to restore 0-based indexing
            URM_train = sps.coo_matrix(URM_train)
            URM_train.row -= 1
            URM_train.col -= 1
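            # Rebuild as CSR with implicit (all-ones) interaction values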

            URM_train = sps.csr_matrix((np.ones_like(URM_train.data), (URM_train.row, URM_train.col)))


            num_users, num_items = URM_train.shape



            # Build sparse matrices from lists
            URM_test_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)
            URM_test_negative_builder = IncrementalSparseMatrix(n_rows=num_users, n_cols=num_items)


            for user_index in range(len(testRatings)):

                user_id = testRatings[user_index][0]
                current_user_test_items = testRatings[user_index][1:]
                current_user_test_negative_items = testNegatives[user_index]

                current_user_test_items = np.array(current_user_test_items) -1
                current_user_test_negative_items = np.array(current_user_test_negative_items) -1

                URM_test_builder.add_single_row(user_id -1, current_user_test_items, 1.0)
                URM_test_negative_builder.add_single_row(user_id -1, current_user_test_negative_items, 1.0)



            # the test data has repeated data, apparently
            URM_test = URM_test_builder.get_SparseMatrix()

            URM_test_negative = URM_test_negative_builder.get_SparseMatrix()


            # Split validation from train as 10%
            from Data_manager.split_functions.split_train_validation import split_train_validation_percentage_user_wise

            URM_train, URM_validation = split_train_validation_percentage_user_wise(URM_train, train_percentage=0.9)


            # Load features

            data_reader = Movielens100KReader_DataManager()
            loaded_dataset = data_reader.load_data()

            zipFile_path = data_reader.DATASET_SPLIT_ROOT_FOLDER + data_reader.DATASET_SUBFOLDER
            dataFile = zipfile.ZipFile(zipFile_path + "ml-100k.zip")

            ICM_path = dataFile.extract("ml-100k/u.item", path=zipFile_path + "decompressed/")

            ICM_genre = self._loadICM(ICM_path)
            ICM_genre = ICM_genre.get_SparseMatrix()

            shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

            self.ICM_DICT = {
                "ICM_genre": ICM_genre
            }

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_test_negative": URM_test_negative,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)

            print("Movielens100KReader: loading complete")
    def __init__(self, pre_splitted_path, type='original'):
        assert type in ["original", "ours"]

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            from Conferences.IJCAI.CoupledCF_original import LoadMovieDataCnn as DatareaderOriginal
            path = "Conferences/IJCAI/CoupledCF_original/ml-1m/"

            n_users, gender, age, occupation = DatareaderOriginal.load_user_attributes(
                path=path, split=True)
            n_items, items_genres_mat = DatareaderOriginal.load_itemGenres_as_matrix(
                path=path)
            ratings = DatareaderOriginal.load_rating_train_as_matrix(path=path)
            testRatings = DatareaderOriginal.load_rating_file_as_list(
                path=path)
            testNegatives = DatareaderOriginal.load_negative_file(path=path)

            URM_all = ratings.tocsr()

            UCM_gender = gender.tocsr()
            UCM_age = age.tocsr()
            UCM_occupation = occupation.tocsr()
            UCM_all = sps.hstack((UCM_gender, UCM_age, UCM_occupation)).tocsr()

            ICM_all = sps.csr_matrix(items_genres_mat)

            testRatings = np.array(testRatings).T
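            # testRatings now holds the test (user, item) pairs as parallel arrays;
            # build a sparse test matrix (ids start from 1, hence the +1 in the shape).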
            URM_test_builder = IncrementalSparseMatrix(n_rows=n_users + 1,
                                                       n_cols=n_items + 1)
            URM_test_builder.add_data_lists(testRatings[0], testRatings[1],
                                            np.ones(len(testRatings[0])))

            URM_test = URM_test_builder.get_SparseMatrix()

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users + 1, n_cols=n_items + 1)

            # Careful: testNegatives is indexed from 0, but entry 0 refers to user id 1 (user ids start from 1)
            n_negative_samples = 99
            for index in range(len(testNegatives)):
                user_test_items = testNegatives[index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead {}".format(
                            index + 1, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(index + 1,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

            if type == 'original':
                # Keep the authors' original URM_test as-is
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_all.copy(), verbose=False)

            else:  # redo the split
                URM_full = URM_all + URM_test
                URM_temp, URM_test = split_train_validation_leave_one_out_user_wise(
                    URM_full.copy(), verbose=False)
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_temp.copy(), verbose=False)

            self.ICM_DICT = {
                "UCM_gender": UCM_gender,
                "UCM_occupation": UCM_occupation,
                "UCM_age": UCM_age,
                "UCM_all": UCM_all,
                "ICM_all": ICM_all,
            }

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
    def __init__(self, pre_splitted_path):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:
            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            # Create the split from the full dataset, with a time-wise leave-one-out on the ORIGINAL full dataset
            data_reader = Movielens100KReader_DataManager()
            loaded_dataset = data_reader.load_data()

            URM_all = loaded_dataset.get_URM_from_name("URM_all")
            URM_timestamp = loaded_dataset.get_URM_from_name("URM_timestamp")

            # Apply the filter twice because the filtering order can slightly change the number of final interactions
            URM_all = filter_urm(URM_all,
                                 user_min_number_ratings=0,
                                 item_min_number_ratings=5)
            URM_all = filter_urm(URM_all,
                                 user_min_number_ratings=20,
                                 item_min_number_ratings=0)
            URM_timestamp = filter_urm(URM_timestamp,
                                       user_min_number_ratings=0,
                                       item_min_number_ratings=5)
            URM_timestamp = filter_urm(URM_timestamp,
                                       user_min_number_ratings=20,
                                       item_min_number_ratings=0)


            URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp(
                URM_all, URM_timestamp, negative_items_per_positive=99)

            # We want the validation to be sampled at random, not as the last interaction
            URM_train = URM_train + URM_validation
            URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                URM_train, verbose=False)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
                "URM_timestamp": URM_timestamp,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        print_stat_datareader(self)
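split_data_on_timestamp is called above but not defined in this snippet. The sketch below illustrates only the core of a time-wise leave-one-out split (each user's most recent interaction goes to the test set); it is an assumption about the helper's behaviour and omits the negative sampling and validation parts.

import numpy as np
import scipy.sparse as sps

def last_interaction_per_user_sketch(URM_all, URM_timestamp):
    # For every user, move the interaction with the largest timestamp to the test set.
    URM_all = sps.csr_matrix(URM_all)
    URM_timestamp = sps.csr_matrix(URM_timestamp)
    URM_train = URM_all.copy().tolil()
    rows, cols = [], []
    for user_id in range(URM_all.shape[0]):
        start, end = URM_timestamp.indptr[user_id], URM_timestamp.indptr[user_id + 1]
        if start == end:
            continue
        user_items = URM_timestamp.indices[start:end]
        user_times = URM_timestamp.data[start:end]
        last_item = int(user_items[np.argmax(user_times)])
        URM_train[user_id, last_item] = 0.0
        rows.append(user_id)
        cols.append(last_item)
    URM_test = sps.csr_matrix((np.ones(len(rows)), (rows, cols)), shape=URM_all.shape)
    URM_train = sps.csr_matrix(URM_train)
    URM_train.eliminate_zeros()
    return URM_train, URM_test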
    def __init__(self, pre_splitted_path, type='original'):
        assert type in ["original", "ours"]

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            from Conferences.IJCAI.CoupledCF_original import LoadTafengDataCnn as DatareaderOriginal
            path = "Conferences/IJCAI/CoupledCF_original/tafeng/"

            n_users, user_attributes_mat = DatareaderOriginal.load_user_attributes(
                path=path)
            n_items, items_genres_mat = DatareaderOriginal.load_itemGenres_as_matrix(
                path=path)
            ratings = DatareaderOriginal.load_rating_train_as_matrix(path=path)
            testRatings = DatareaderOriginal.load_rating_file_as_list(
                path=path)
            testNegatives = DatareaderOriginal.load_negative_file(path=path)

            URM_all = ratings.tocsr()

            UCM_all = sps.csc_matrix(user_attributes_mat)
            UCM_age = UCM_all[:, 0:11].tocsr()
            UCM_region = UCM_all[:, 11:19].tocsr()
            UCM_all = UCM_all.tocsr()

            # original ICM columns: 0 -> category, 2 -> asset (0-1), 1 -> price (0-1)
            ICM_original = sps.csc_matrix(items_genres_mat)

            # the category column is expanded into a one-hot matrix rather than
            # kept as a single integer column (see the sketch after this example)
            ICM_sub_class = ICM_original[:, 0:1].tocsr()
            n_items_icm = ICM_sub_class.shape[0]
            rows, cols, data = [], [], []
            for idx in range(n_items_icm):
                # the sliced ICM has a single column, so each row holds at most one value
                data_vect = ICM_sub_class.data[
                    ICM_sub_class.indptr[idx]:ICM_sub_class.indptr[idx + 1]]
                if len(data_vect) == 0:
                    # category value 0 is stored implicitly (no explicit entry) in a CSR matrix
                    cols.append(0)
                else:
                    cols.append(int(data_vect[0]))
                rows.append(idx)
                data.append(1.0)

            ICM_sub_class = sps.csr_matrix((data, (rows, cols)))
            ICM_asset = ICM_original[:, 1:2].tocsr()
            ICM_price = ICM_original[:, 2:3].tocsr()

            ICM_original = ICM_original.tocsc()
            ICM_all = sps.hstack((ICM_sub_class, ICM_asset, ICM_price))

            testRatings = np.array(testRatings).T
            URM_test_builder = IncrementalSparseMatrix(n_rows=n_users + 1,
                                                       n_cols=n_items + 1)
            URM_test_builder.add_data_lists(testRatings[0], testRatings[1],
                                            np.ones(len(testRatings[0])))
            URM_test = URM_test_builder.get_SparseMatrix()

            URM_test_negatives_builder = IncrementalSparseMatrix(
                n_rows=n_users + 1, n_cols=n_items + 1)

            # careful here: the test negatives start from index 0 but they refer to user index 1 (user indices start from 1)
            n_negative_samples = 99
            for index in range(len(testNegatives)):
                user_test_items = testNegatives[index]
                if len(user_test_items) != n_negative_samples:
                    print(
                        "user id: {} has {} negative items instead {}".format(
                            index + 1, len(user_test_items),
                            n_negative_samples))
                URM_test_negatives_builder.add_single_row(index + 1,
                                                          user_test_items,
                                                          data=1.0)

            URM_test_negative = URM_test_negatives_builder.get_SparseMatrix()
            URM_test_negative.data = np.ones_like(URM_test_negative.data)

            if type == 'original':
                # keep the original test split as provided
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_all.copy(), verbose=False)
            else:  # redo the split
                URM_full = URM_all + URM_test
                URM_temp, URM_test = split_train_validation_leave_one_out_user_wise(
                    URM_full.copy(), verbose=False)
                URM_train, URM_validation = split_train_validation_leave_one_out_user_wise(
                    URM_temp.copy(), verbose=False)

            self.ICM_DICT = {
                "UCM_age": UCM_age,
                "UCM_region": UCM_region,
                "UCM_all": UCM_all,
                "ICM_all": ICM_all,
                "ICM_original": ICM_original,
                "ICM_sub_class": ICM_sub_class,
                "ICM_asset": ICM_asset,
                "ICM_price": ICM_price,
            }

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        ut.print_stat_datareader(self)
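The loop above expands the single category column into a one-hot ICM. An equivalent, vectorized construction is sketched below; it assumes the column stores non-negative integer category IDs and, as in the loop, treats missing CSR entries as category 0.

import numpy as np
import scipy.sparse as sps

def one_hot_from_single_column_sketch(ICM_column):
    # ICM_column: sparse matrix with a single column of integer category IDs.
    categories = np.asarray(ICM_column.todense()).ravel().astype(int)
    n_items = categories.shape[0]
    rows = np.arange(n_items)
    data = np.ones(n_items)
    return sps.csr_matrix((data, (rows, categories)),
                          shape=(n_items, categories.max() + 1))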
    def __init__(self, data_folder, dataset_name):

        self.DATASET_NAME = dataset_name
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(data_folder):
            os.makedirs(data_folder)

        data_folder_full = os.path.join(data_folder, "full")
        data_folder_validation = os.path.join(data_folder, "validation")

        if not os.path.exists(data_folder_full):
            os.makedirs(data_folder_full)

        if not os.path.exists(data_folder_validation):
            os.makedirs(data_folder_validation)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            self.CFM_data_class_validation = LoadData(data_folder_validation,
                                                      "")
            self.CFM_data_class_full = LoadData(data_folder_full, "")

            for attrib_name, attrib_object in load_data_dict_zip(
                    data_folder, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            original_data_path = 'CNN_on_embeddings/IJCAI/CFM_github/Data/'

            original_train_filepath = os.path.join(original_data_path,
                                                   self.DATASET_NAME,
                                                   'train.csv')
            original_test_filepath = os.path.join(original_data_path,
                                                  self.DATASET_NAME,
                                                  'test.csv')

            # Split train data in train-validation and copy original test data
            copyfile(original_train_filepath,
                     os.path.join(data_folder_full, 'train.csv'))
            copyfile(original_test_filepath,
                     os.path.join(data_folder_full, 'test.csv'))
            copyfile(original_test_filepath,
                     os.path.join(data_folder_validation, 'test.csv'))

            split_train_validation_CFM_format(
                original_train_filepath, new_folder=data_folder_validation)

            self.CFM_data_class_validation = LoadData(data_folder_validation,
                                                      "")
            self.CFM_data_class_full = LoadData(data_folder_full, "")

            URM_shape = (self.CFM_data_class_validation.user_bind_M,
                         self.CFM_data_class_validation.item_bind_M)

            self.URM_DICT = {
                "URM_train_tuning_only":
                dict_to_sparse_matrix(
                    self.CFM_data_class_validation.user_positive_list,
                    shape=URM_shape),
                "URM_validation_tuning_only":
                dict_to_sparse_matrix(
                    self.CFM_data_class_validation.user_positive_list_valid,
                    shape=URM_shape),
                "URM_test_tuning_only":
                dict_to_sparse_matrix(
                    self.CFM_data_class_validation.user_positive_list_test,
                    shape=URM_shape),
                "URM_train_full":
                dict_to_sparse_matrix(
                    self.CFM_data_class_full.user_positive_list,
                    shape=URM_shape),
                "URM_test_full":
                dict_to_sparse_matrix(
                    self.CFM_data_class_full.user_positive_list_test,
                    shape=URM_shape),
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, data_folder,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))
Example n. 19
    def __init__(self, pre_splitted_path):
        super(NetflixPrizeReader, self).__init__()

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("NetflixPrizeReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "NetflixPrizeReader: Pre-splitted data not found, building new one"
            )

            data_reader = NetflixPrizeReader_DataManager()
            loaded_dataset = data_reader.load_data()

            URM_all = loaded_dataset.get_URM_all()

            # binarize the data (only keep ratings >= 4)
            URM_all.data = URM_all.data >= 4.0
            URM_all.eliminate_zeros()

            URM_all = sps.coo_matrix(URM_all)

            dict_for_dataframe = {
                "userId": URM_all.row,
                "movieId": URM_all.col,
                "rating": URM_all.data
            }

            URM_all_dataframe = pd.DataFrame(data=dict_for_dataframe)

            URM_train, URM_train_all, URM_validation, URM_test = split_train_validation_test_VAE_CF(
                URM_all_dataframe, n_heldout_users=40000)

            n_rows = max(URM_train.shape[0], URM_train_all.shape[0],
                         URM_validation.shape[0], URM_test.shape[0])
            n_cols = max(URM_train.shape[1], URM_train_all.shape[1],
                         URM_validation.shape[1], URM_test.shape[1])

            newShape = (n_rows, n_cols)

            URM_test = reshapeSparse(URM_test, newShape)
            URM_train = reshapeSparse(URM_train, newShape)
            URM_train_all = reshapeSparse(URM_train_all, newShape)
            URM_validation = reshapeSparse(URM_validation, newShape)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_train_all": URM_train_all,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

            print("NetflixPrizeReader: Dataset loaded")
Example n. 20
    def __init__(self, pre_splitted_path, type="original"):

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_{}: Attempting to load pre-splitted data".format(
                self.DATASET_NAME))

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print("Dataset_{}: Pre-splitted data not found, building new one".
                  format(self.DATASET_NAME))

            if type == "original":

                # Monkey-patch the original reader so that rating files are loaded as sparse matrices instead of lists
                Dataset_github.load_rating_file_as_list = Dataset_github.load_rating_file_as_matrix

                dataset = Dataset_github(
                    "Conferences/IJCAI/DELF_original/Data/ml-1m")

                URM_train, URM_validation, URM_test, testNegatives = dataset.trainMatrix, dataset.validRatings, \
                                                                     dataset.testRatings, dataset.testNegatives

                URM_train = URM_train.tocsr()
                URM_validation = URM_validation.tocsr()
                URM_test = URM_test.tocsr()
                URM_timestamp = "no"  # placeholder: no timestamp information is available for the original split

                from Base.Recommender_utils import reshapeSparse

                shape = (max(URM_train.shape[0], URM_validation.shape[0],
                             URM_test.shape[0]),
                         max(URM_train.shape[1], URM_validation.shape[1],
                             URM_test.shape[1]))

                URM_train = reshapeSparse(URM_train, shape)
                URM_validation = reshapeSparse(URM_validation, shape)
                URM_test = reshapeSparse(URM_test, shape)

                URM_test_negatives_builder = IncrementalSparseMatrix(
                    n_rows=shape[0], n_cols=shape[1])

                for user_index in range(len(dataset.testNegatives)):

                    user_test_items = dataset.testNegatives[user_index]

                    URM_test_negatives_builder.add_single_row(user_index,
                                                              user_test_items,
                                                              data=1.0)

                URM_test_negative = URM_test_negatives_builder.get_SparseMatrix(
                )

            elif type == "ours":

                # create the split from the ORIGINAL full dataset, holding out one interaction per user time-wise (leave-one-out)
                data_reader = Movielens1MReader_DataManager()
                loaded_dataset = data_reader.load_data()

                URM_all = loaded_dataset.get_URM_from_name("URM_all")
                URM_timestamp = loaded_dataset.get_URM_from_name(
                    "URM_timestamp")

                # make rating implicit
                URM_all.data = np.ones_like(URM_all.data)

                URM_train, URM_validation, URM_test, URM_test_negative = split_data_on_timestamp(
                    URM_all, URM_timestamp, negative_items_per_positive=99)

            else:
                assert False, "Dataset type not recognized: '{}'".format(type)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
                "URM_test_negative": URM_test_negative,
                "URM_timestamp": URM_timestamp,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path,
                               pre_splitted_filename)

        print("{}: Dataset loaded".format(self.DATASET_NAME))

        print_stat_datareader(self)
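Both branches above rely on 99 sampled negative items per positive test interaction. The sketch below shows one way such negatives could be drawn (items the user never interacted with); it is an assumption about the negative-sampling step, not the actual helper, and it presumes the catalogue is much larger than any user profile.

import numpy as np
import scipy.sparse as sps

def sample_test_negatives_sketch(URM_all, URM_test, negative_items_per_positive=99, seed=1234):
    # For every user with a test interaction, sample unseen items uniformly at random.
    URM_all = sps.csr_matrix(URM_all)
    URM_test = sps.csr_matrix(URM_test)
    rng = np.random.default_rng(seed)
    n_users, n_items = URM_all.shape
    rows, cols = [], []
    for user_id in range(n_users):
        if URM_test.indptr[user_id] == URM_test.indptr[user_id + 1]:
            continue
        seen = set(URM_all.indices[URM_all.indptr[user_id]:URM_all.indptr[user_id + 1]])
        negatives = set()
        while len(negatives) < negative_items_per_positive:
            candidate = int(rng.integers(0, n_items))
            if candidate not in seen:
                negatives.add(candidate)
        rows.extend([user_id] * len(negatives))
        cols.extend(sorted(negatives))
    data = np.ones(len(rows))
    return sps.csr_matrix((data, (rows, cols)), shape=(n_users, n_items))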
    def __init__(self, pre_splitted_path):

        test_percentage = 0.2
        validation_percentage = 0.2

        pre_splitted_path += "data_split/"
        pre_splitted_filename = "splitted_data_"

        ratings_file_name = "ratings_Amazon_Instant_Video.csv"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:
            print("Dataset_AmazonInstantVideo: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)


        except FileNotFoundError:

            print("Dataset_AmazonInstantVideo: Pre-splitted data not found, building new one")

            folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

            download_from_URL(self.DATASET_URL, folder_path, ratings_file_name)

            # read Amazon Instant Video
            df = pd.read_csv(folder_path + ratings_file_name, sep=',', header=None, names=['user', 'item', 'rating', 'timestamp'])[
                ['user', 'item', 'rating']]

            URM_train_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)
            URM_train_builder.add_data_lists(df['user'].values, df['item'].values, df['rating'].values)
            URM_all = URM_train_builder.get_SparseMatrix()

            # keep only ratings equal to 5
            URM_all.data = URM_all.data == 5
            URM_all.eliminate_zeros()

            # keep only users with at least 5 ratings
            URM_all = ut.filter_urm(URM_all, user_min_number_ratings=5, item_min_number_ratings=1)

            # create train - test - validation

            URM_train_original, URM_test = split_train_validation_percentage_user_wise(URM_all, train_percentage=1-test_percentage, verbose=False)

            URM_train, URM_validation = split_train_validation_percentage_user_wise(URM_train_original, train_percentage=1-validation_percentage, verbose=False)

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_test": URM_test,
                "URM_validation": URM_validation,
            }

            save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)



        print("Dataset_AmazonInstantVideo: Dataset loaded")

        ut.print_stat_datareader(self)
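ut.filter_urm is used above to keep only users with at least 5 ratings. The user-side filtering could be sketched as follows; the real helper also filters items by their interaction count and may differ in detail.

import numpy as np
import scipy.sparse as sps

def filter_users_by_interactions_sketch(URM_all, user_min_number_ratings=5):
    # Zero out the rows of users with fewer interactions than required.
    URM_all = sps.csr_matrix(URM_all)
    interactions_per_user = np.ediff1d(URM_all.indptr)
    users_to_keep = interactions_per_user >= user_min_number_ratings
    URM_filtered = sps.csr_matrix(sps.diags(users_to_keep.astype(float)).dot(URM_all))
    URM_filtered.eliminate_zeros()
    return URM_filtered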
Example n. 22
    def __init__(self, pre_splitted_path):
        super(Movielens1MReader, self).__init__()

        pre_splitted_path += "movielens1m_data_split/"
        pre_splitted_filename = "splitted_data_"

        original_data_path = "Conferences/CIKM/NCR_github/data/"

        # If directory does not exist, create
        if not os.path.exists(pre_splitted_path):
            os.makedirs(pre_splitted_path)

        try:

            print("Movielens1MReader: Attempting to load pre-splitted data")

            for attrib_name, attrib_object in load_data_dict_zip(
                    pre_splitted_path, pre_splitted_filename).items():
                self.__setattr__(attrib_name, attrib_object)

        except FileNotFoundError:

            print(
                "Movielens1MReader: Pre-splitted data not found, building new one"
            )

            dataset = DataSet()
            dataset.loadClicks(original_data_path + 'ml1m.txt', 10, 10)
            train_data, validation_data, test_data, negative_data = dataset.trainMatrix, dataset.validRatings, dataset.testRatings, dataset.testNegatives

            URM_train = train_data.tocsr()

            user_list = [pair[0] for pair in validation_data]
            item_list = [pair[1] for pair in validation_data]
            interactions = list(np.ones(len(item_list)))
            URM_validation = sps.coo_matrix(
                (interactions, (user_list, item_list)),
                shape=(dataset.nUsers, dataset.nItems),
                dtype=np.int32)
            URM_validation = URM_validation.tocsr()

            user_list = [pair[0] for pair in test_data]
            item_list = [pair[1] for pair in test_data]
            interactions = list(np.ones(len(item_list)))
            URM_test = sps.coo_matrix((interactions, (user_list, item_list)),
                                      shape=(dataset.nUsers, dataset.nItems),
                                      dtype=np.int32)
            URM_test = URM_test.tocsr()

            user_list = np.concatenate([
                np.full(len(items), i) for i, items in enumerate(negative_data)
            ])
            item_list = [item for items in negative_data for item in items]
            interactions = list(np.ones(len(item_list)))
            URM_negative = sps.coo_matrix(
                (interactions, (user_list, item_list)),
                shape=(dataset.nUsers, dataset.nItems),
                dtype=np.int32)
            URM_negative = URM_negative.tocsr()

            self.URM_DICT = {
                "URM_train": URM_train,
                "URM_validation": URM_validation,
                "URM_test": URM_test,
                "URM_negative": URM_negative,
            }

            save_data_dict_zip(self.URM_DICT, {}, pre_splitted_path,
                               pre_splitted_filename)

            print("Movielens1MReader: loading complete")