def _load_from_original_file(self):
        # Load data from original

        print("Movielens1MReader: Loading original data")

        zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")

        except (FileNotFoundError, zipfile.BadZipFile):

            print("Movielens1MReader: Unable to find data zip file. Downloading...")
            downloadFromURL(self.DATASET_URL, zipFile_path, "ml-1m.zip")
            dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")


        URM_path = dataFile.extract("ml-1m/ratings.dat", path=zipFile_path + "decompressed/")

        URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(URM_path, separator="::")

        print("Movielens1MReader: cleaning temporary files")

        import shutil

        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("Movielens1MReader: loading complete")

        return Dataset(self.get_dataset_name(), URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
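
# The readers in this listing share an "open the local archive, otherwise
# download it" pattern. A minimal, self-contained sketch of that pattern using
# only the standard library (the framework's downloadFromURL may add progress
# reporting and error handling beyond this):
import os
import urllib.request
import zipfile

def open_or_download_zip(folder, file_name, url):
    os.makedirs(folder, exist_ok=True)
    zip_path = os.path.join(folder, file_name)
    try:
        return zipfile.ZipFile(zip_path)
    except (FileNotFoundError, zipfile.BadZipFile):
        print("Zip file missing or damaged, downloading from {}".format(url))
        urllib.request.urlretrieve(url, zip_path)
        return zipfile.ZipFile(zip_path)
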
    def _load_from_original_file(self):
        # Load data from original

        self.zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
        self.decompressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            self.dataFile = zipfile.ZipFile(self.zip_file_folder +
                                            "netflix-prize-data.zip")

        except (FileNotFoundError, zipfile.BadZipFile):

            print("NetflixPrizeReader: Unable to find data zip file.")
            print(
                "NetflixPrizeReader: Automatic download not available, please ensure the ZIP data file is in folder {}."
                .format(self.zip_file_folder))
            print("NetflixPrizeReader: Data can be downloaded here: {}".format(
                self.DATASET_URL))

            # If directory does not exist, create
            if not os.path.exists(self.zip_file_folder):
                os.makedirs(self.zip_file_folder)

            raise FileNotFoundError("Automatic download not available.")

        URM_all, item_mapper, user_mapper = self._loadURM()

        print("NetflixPrizeReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       })
Example #3
    def _load_from_original_file(self):

        # Load data from original

        print("YelpReader: Loading original data")

        compressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
        decompressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            compressed_file = tarfile.open(
                compressed_file_folder + "yelp_dataset.tar", "r")
            compressed_file.extract("yelp_academic_dataset_review.json",
                                    path=decompressed_file_folder +
                                    "decompressed/")
            compressed_file.close()

        except (FileNotFoundError, tarfile.ReadError, tarfile.ExtractError):

            print("YelpReader: Unable to fild or decompress tar.gz file.")
            print(
                "YelpReader: Automatic download not available, please ensure the ZIP data file is in folder {}."
                .format(compressed_file_folder))
            print("YelpReader: Data can be downloaded here: {}".format(
                self.DATASET_URL))

            # If directory does not exist, create
            if not os.path.exists(compressed_file_folder):
                os.makedirs(compressed_file_folder)

            raise FileNotFoundError("Automatic download not available.")

        URM_path = decompressed_file_folder + "decompressed/yelp_academic_dataset_review.json"

        print("YelpReader: loading URM")
        URM_all_builder = self._loadURM(URM_path,
                                        if_new_user="******",
                                        if_new_item="add")

        URM_all = URM_all_builder.get_SparseMatrix()

        item_mapper = URM_all_builder.get_column_token_to_id_mapper()
        user_mapper = URM_all_builder.get_row_token_to_id_mapper()

        print("YelpReader: cleaning temporary files")

        import shutil

        shutil.rmtree(decompressed_file_folder + "decompressed/",
                      ignore_errors=True)

        print("YelpReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       })
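
# _loadURM is defined elsewhere in YelpReader. As an illustration only, here is
# a sketch of how a user x item rating matrix could be built from the review
# dump (one JSON object per line; user_id, business_id and stars are the
# assumed field names) with plain dict mappers and scipy:
import json
import scipy.sparse as sps

def build_urm_from_json_lines(path):
    user_mapper, item_mapper = {}, {}
    rows, cols, data = [], [], []
    with open(path, "r", encoding="utf-8") as json_file:
        for line in json_file:
            review = json.loads(line)
            rows.append(user_mapper.setdefault(review["user_id"], len(user_mapper)))
            cols.append(item_mapper.setdefault(review["business_id"], len(item_mapper)))
            data.append(float(review["stars"]))
    URM_all = sps.csr_matrix((data, (rows, cols)),
                             shape=(len(user_mapper), len(item_mapper)))
    return URM_all, user_mapper, item_mapper
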
    def _load_from_original_file(self):
        # Load data from original

        print("LastFMHetrec2011Reader: Loading original data")

        folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            dataFile = zipfile.ZipFile(folder_path +
                                       "hetrec2011-lastfm-2k.zip")

        except (FileNotFoundError, zipfile.BadZipFile):

            print(
                "LastFMHetrec2011Reader: Unable to find or extract data zip file. Downloading..."
            )
            downloadFromURL(self.DATASET_URL, folder_path,
                            "hetrec2011-lastfm-2k.zip")
            dataFile = zipfile.ZipFile(folder_path +
                                       "hetrec2011-lastfm-2k.zip")

        URM_path = dataFile.extract("user_artists.dat",
                                    path=folder_path + "decompressed")
        tags_path = dataFile.extract("user_taggedartists-timestamps.dat",
                                     path=folder_path + "decompressed")

        print("LastFMHetrec2011Reader: loading URM")
        URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(
            URM_path, separator="\t", header=True)

        print("LastFMHetrec2011Reader: loading tags")
        ICM_tags, feature_mapper, _ = self._loadICM_tags(tags_path,
                                                         item_mapper,
                                                         header=True,
                                                         separator='\t',
                                                         if_new_item="ignore")

        print("LastFMHetrec2011Reader: cleaning temporary files")

        import shutil

        shutil.rmtree(folder_path + "decompressed", ignore_errors=True)

        print("LastFMHetrec2011Reader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       },
                       ICM_dict={"ICM_all": ICM_tags},
                       ICM_mappers_dict={
                           "ICM_all":
                           (item_mapper.copy(), feature_mapper.copy())
                       })
Example #5
    def _load_from_original_file(self):
        # Load data from original

        print("EpinionsReader: Loading original data")

        folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        compressed_file_path = folder_path + "ratings_data.txt.bz2"
        decompressed_file_path = folder_path + "ratings_data.txt"

        try:

            # Probe for the decompressed file; a missing file raises FileNotFoundError
            with open(decompressed_file_path, "r"):
                pass

        except FileNotFoundError:

            print(
                "EpinionsReader: Unable to find decompressed data file. Decompressing..."
            )

            try:

                compressed_file = bz2.open(compressed_file_path, "rb")

            except Exception:

                print(
                    "EpinionsReader: Unable to find or open compressed data file. Downloading..."
                )
                downloadFromURL(self.DATASET_URL, folder_path,
                                "ratings_data.txt.bz2")
                compressed_file = bz2.open(compressed_file_path, "rb")

            decompressed_file = open(decompressed_file_path, "w")
            self._save_BZ2_in_text_file(compressed_file, decompressed_file)
            decompressed_file.close()

        print("EpinionsReader: loading URM")

        URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(
            decompressed_file_path, separator=" ", header=True)

        print("EpinionsReader: cleaning temporary files")

        import os

        os.remove(decompressed_file_path)

        print("EpinionsReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       })
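
# _save_BZ2_in_text_file is part of the reader. A minimal sketch of the same
# idea, streaming the .bz2 archive into a plain-text copy with the standard
# library (chunked by shutil.copyfileobj to keep memory bounded):
import bz2
import shutil

def decompress_bz2_to_text(compressed_path, decompressed_path):
    with bz2.open(compressed_path, "rt", encoding="utf-8") as source_file, \
            open(decompressed_path, "w", encoding="utf-8") as target_file:
        shutil.copyfileobj(source_file, target_file)
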
    def _load_from_original_file(self):
        # Load data from original

        print("TVAudienceReader: Loading original data")

        compressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
        decompressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        zipFile_name = "tv-audience-dataset.zip"

        try:

            dataFile = zipfile.ZipFile(compressed_zip_file_folder +
                                       zipFile_name)

            interactions_path = dataFile.extract(
                "tv-audience-dataset/tv-audience-dataset.csv",
                path=decompressed_zip_file_folder + "decompressed/")

        except (FileNotFoundError, zipfile.BadZipFile):

            print("TVAudienceReader: Unable to find or extract data zip file.")
            print(
                "TVAudienceReader: Automatic download not available, please ensure the ZIP data file is in folder {}."
                .format(compressed_zip_file_folder))
            print(
                "TVAudienceReader: Data zip file not found or damaged. You may download the data from: {}"
                .format(self.DATASET_URL))

            # If directory does not exist, create
            if not os.path.exists(compressed_zip_file_folder):
                os.makedirs(compressed_zip_file_folder)

            raise FileNotFoundError("Automatic download not available.")

        print("TVAudienceReader: Loading Interactions")
        URM_all, item_mapper, user_mapper = self._load_interactions(
            interactions_path, if_new_user="add", if_new_item="add")

        print("TVAudienceReader: cleaning temporary files")

        import shutil

        shutil.rmtree(decompressed_zip_file_folder + "decompressed/",
                      ignore_errors=True)

        print("TVAudienceReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       })
    def _load_from_original_file(self):
        # Load data from original

        print("BrightkiteReader: Loading original data")

        folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            compressed_file = gzip.open(
                folder_path + "loc-brightkite_edges.txt.gz", 'rb')

        except FileNotFoundError:

            print(
                "BrightkiteReader: Unable to find or extract data zip file. Downloading..."
            )

            downloadFromURL(self.DATASET_URL, folder_path,
                            "loc-brightkite_edges.txt.gz")

            compressed_file = gzip.open(
                folder_path + "loc-brightkite_edges.txt.gz", 'rb')

        URM_path = folder_path + "loc-brightkite_edges.txt"

        decompressed_file = open(URM_path, "w")

        self._save_GZ_in_text_file(compressed_file, decompressed_file)

        decompressed_file.close()

        print("BrightkiteReader: loading URM")
        URM_all, item_mapper, user_mapper = self._loadURM(URM_path,
                                                          separator="\t",
                                                          header=False)

        print("BrightkiteReader: cleaning temporary files")

        import os

        os.remove(URM_path)

        print("BrightkiteReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       })
    def _load_from_original_file_all_amazon_datasets(self, URM_path, metadata_path=None, reviews_path=None):

        print("AmazonReviewDataReader: Loading original data")

        print("AmazonReviewDataReader: loading URM")
        URM_all, item_original_ID_to_index, user_original_ID_to_index = load_CSV_into_SparseBuilder(URM_path, separator=",", header = False)
        urm = {"URM_all": URM_all}
        urm_mappers = {"URM_all": (user_original_ID_to_index, item_original_ID_to_index)}

        icm = {}
        icm_mappers = {}
        if metadata_path is not None:
            print("AmazonReviewDataReader: loading metadata")
            ICM_metadata, feature_mapper, item_mapper = self._loadMetadata(metadata_path, item_original_ID_to_index, if_new_item="ignore")
            ICM_metadata, _, feature_mapper = removeFeatures(ICM_metadata, minOccurrence=5, maxPercOccurrence=0.30,
                                                             reconcile_mapper=feature_mapper)
            icm["ICM_metadata"] = ICM_metadata
            icm_mappers["ICM_metadata"] = (item_mapper.copy(), feature_mapper.copy())

        if reviews_path is not None:
            print("AmazonReviewDataReader: loading reviews")
            ICM_reviews, feature_mapper, item_mapper = self._loadReviews(reviews_path, item_original_ID_to_index, if_new_item="ignore")
            ICM_reviews, _, feature_mapper = removeFeatures(ICM_reviews, minOccurrence=5, maxPercOccurrence=0.30,
                                                            reconcile_mapper=feature_mapper)
            icm["ICM_reviews"] = ICM_reviews
            icm_mappers["ICM_reviews"] = (item_mapper.copy(), feature_mapper.copy())

        if len(icm) > 0:
            ICM_names = list(icm.keys())
            ICM_all, ICM_all_mapper = icm[ICM_names[0]], icm_mappers[ICM_names[0]]
            for key in ICM_names[1:]:
                ICM_all, ICM_all_mapper = self._merge_ICM(ICM_all, icm[key], ICM_all_mapper, icm_mappers[key])
            icm["ICM_all"] = ICM_all
            icm_mappers["ICM_all"] = ICM_all_mapper

        # Clean temp files
        print("AmazonReviewDataReader: cleaning temporary files")

        if metadata_path is not None:
            os.remove(metadata_path)

        if reviews_path is not None:
            os.remove(reviews_path)

        print("AmazonReviewDataReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict=urm, URM_mappers_dict=urm_mappers,
                       ICM_dict=icm, ICM_mappers_dict=icm_mappers)
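
# _merge_ICM belongs to the framework. Assuming the two ICMs share the same
# item (row) mapping and their feature mappers are plain token-to-index dicts
# with disjoint tokens, a minimal equivalent is to stack the feature columns
# horizontally and offset the second mapper:
import scipy.sparse as sps

def merge_icm(ICM_a, ICM_b, feature_mapper_a, feature_mapper_b):
    ICM_all = sps.hstack([ICM_a, ICM_b], format="csr")
    merged_feature_mapper = dict(feature_mapper_a)
    offset = ICM_a.shape[1]
    for token, index in feature_mapper_b.items():
        merged_feature_mapper[token] = index + offset
    return ICM_all, merged_feature_mapper
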
    def load_split(self,
                   datareader,
                   save_folder_path=None,
                   postprocessings=None):

        if save_folder_path is None:
            tmp_save_folder_path = datareader.get_complete_default_save_path(
                postprocessings)
        else:
            tmp_save_folder_path = save_folder_path + os.sep

        try:

            datalist = self._get_dataset_names_in_split()
            for i in datalist:
                if not datareader.all_files_available(
                        tmp_save_folder_path + self.get_name() + os.sep,
                        filename_suffix="_{}".format(i)):
                    raise Exception

            datasets = []
            for i in datalist:
                urm, urm_mappers, icm, icm_mappers, ucm, ucm_mappers = datareader.load_from_saved_sparse_matrix(
                    tmp_save_folder_path + self.get_name() + os.sep,
                    filename_suffix="_{}".format(i))
                datasets.append(
                    Dataset(datareader.get_dataset_name(),
                            base_folder=datareader.get_default_save_path(),
                            postprocessings=postprocessings,
                            URM_dict=urm,
                            URM_mappers_dict=urm_mappers,
                            ICM_dict=icm,
                            ICM_mappers_dict=icm_mappers,
                            UCM_dict=ucm,
                            UCM_mappers_dict=ucm_mappers))

            return datasets

        except Exception:

            print(
                "DataSplitter: Preloaded data not found or corrupted, reading from original files..."
            )
            dataset = datareader.load_data(save_folder_path=save_folder_path,
                                           postprocessings=postprocessings)
            return self.split(dataset)
    def _load_from_original_file(self):

        print("BookCrossingReader: Ratings are in range 1-10, value -1 refers to an implicit rating")
        print("BookCrossingReader: ICM contains the author, publisher, year and tokens from the title")

        print("BookCrossingReader: Loading original data")

        folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            dataFile = zipfile.ZipFile(folder_path + "BX-CSV-Dump.zip")

        except (FileNotFoundError, zipfile.BadZipFile):

            print("BookCrossingReader: Unable to find or extract data zip file. Downloading...")

            downloadFromURL(self.DATASET_URL, folder_path, "BX-CSV-Dump.zip")

            dataFile = zipfile.ZipFile(folder_path + "BX-CSV-Dump.zip")

        URM_path = dataFile.extract("BX-Book-Ratings.csv", path=folder_path + "decompressed")
        ICM_path = dataFile.extract("BX-Books.csv", path=folder_path + "decompressed")

        print("BookCrossingReader: loading ICM")
        ICM_all, feature_mapper, item_mapper = self._loadICM(ICM_path, separator=';', header=True, if_new_item="add")

        ICM_all, _, feature_mapper = removeFeatures(ICM_all, minOccurrence=5, maxPercOccurrence=0.30, reconcile_mapper=feature_mapper)

        print("BookCrossingReader: loading URM")
        URM_all, _, user_mapper = self._loadURM(URM_path, item_mapper, separator=";", header=True, if_new_user="add", if_new_item="ignore")

        print("BookCrossingReader: cleaning temporary files")

        import shutil

        shutil.rmtree(folder_path + "decompressed", ignore_errors=True)

        print("BookCrossingReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all}, URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
                       ICM_dict={"ICM_all": ICM_all}, ICM_mappers_dict={"ICM_all": (item_mapper.copy(), feature_mapper.copy())})
Example #11
    def apply(self, dataset):

        new_URM_dict = {}
        for URM_name in dataset.get_URM_names():
            new_URM_dict[URM_name] = dataset.get_URM(URM_name)
            mask = np.ones(new_URM_dict[URM_name].data.size, dtype=np.bool_)
            mask[new_URM_dict[URM_name].data >=
                 self.min_rating_threshold] = False
            new_URM_dict[URM_name].data[mask] = 0.0
            new_URM_dict[URM_name].eliminate_zeros()
            new_URM_dict[URM_name].data[:] = 1.0

        return Dataset(dataset.get_name(),
                       base_folder=dataset.get_base_folder(),
                       postprocessings=dataset.get_postprocessings() + [self],
                       URM_dict=new_URM_dict,
                       URM_mappers_dict=dataset.get_URM_mappers_dict(),
                       ICM_dict=dataset.get_ICM_dict(),
                       ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                       UCM_dict=dataset.get_UCM_dict(),
                       UCM_mappers_dict=dataset.get_UCM_mappers_dict())
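
# The masking above keeps only ratings at or above min_rating_threshold and
# then binarizes what is left. The same transformation on a bare scipy CSR
# matrix, as a self-contained sketch:
import numpy as np
import scipy.sparse as sps

def binarize_above_threshold(URM, min_rating_threshold):
    URM = URM.tocsr(copy=True)
    URM.data[URM.data < min_rating_threshold] = 0.0  # drop low ratings
    URM.eliminate_zeros()
    URM.data[:] = 1.0  # keep a binary, implicit-feedback matrix
    return URM

# Example: ratings 1-5, keep only ratings >= 4 as positive interactions.
URM = sps.csr_matrix(np.array([[5.0, 2.0, 0.0], [0.0, 3.0, 4.0]]))
print(binarize_above_threshold(URM, 4).toarray())
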
Example #12
    def _load_from_original_file(self):

        print("PinterestReader: Loading original data")

        zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:
            dataFile = zipfile.ZipFile(zipFile_path + "pinterest-20.zip")

        except (FileNotFoundError, zipfile.BadZipFile):
            print("PinterestReader: Unable to find data zip file.")
            print("PinterestReader: Automatic download not available, please ensure the compressed data file"
                  " is in folder {}.".format(zipFile_path))
            print("PinterestReader: Data can be downloaded here: {}".format(
                self.DATASET_URL))

        URM_train_path = dataFile.extract(
            "pinterest-20.train.rating.txt", path=zipFile_path + "decompressed/")
        URM_test_path = dataFile.extract(
            "pinterest-20.test.rating.txt", path=zipFile_path + "decompressed/")
        trainMatrix = self.load_rating_file_as_matrix(URM_train_path)
        testRatings = self.load_rating_file_as_matrix(URM_test_path)

        from recsys_framework.utils.common import reshapeSparse

        URM_train = trainMatrix.tocsr()
        URM_test = testRatings.tocsr()

        shape = (max(URM_train.shape[0], URM_test.shape[0]),
                 max(URM_train.shape[1], URM_test.shape[1]))

        URM_train = reshapeSparse(URM_train, shape)
        URM_test = reshapeSparse(URM_test, shape)

        mapper_users = {str(i+1): i for i in range(URM_train.shape[0])}
        mapper_items = {str(i+1): i for i in range(URM_train.shape[1])}

        return Dataset('Pinterest', URM_dict={"URM_all": URM_train+URM_test}, URM_mappers_dict={"URM_all": (mapper_users.copy(), mapper_items.copy())})
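
# reshapeSparse comes from the framework utilities. A minimal stand-in that
# pads a sparse matrix with empty rows/columns up to a larger target shape
# (no shrinking) could look like this:
import scipy.sparse as sps

def reshape_sparse(matrix, new_shape):
    matrix = matrix.tocoo()
    assert new_shape[0] >= matrix.shape[0] and new_shape[1] >= matrix.shape[1]
    return sps.csr_matrix((matrix.data, (matrix.row, matrix.col)), shape=new_shape)
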
Example #13
    def load_split(self, datareader, save_folder_path=None, postprocessings=None):

        tmp_save_folder_path = save_folder_path
        if tmp_save_folder_path is None:
            tmp_save_folder_path = datareader.get_complete_default_save_path(postprocessings)

        try:

            datalist = self._get_dataset_names_in_split()
            for i in range(self.n_folds):
                for d in datalist:
                    if not datareader.all_files_available(tmp_save_folder_path + self.get_name() + os.sep,
                                                          filename_suffix="_{}_{}".format(i, d)):
                        raise Exception
            r = []
            for i in range(self.n_folds):

                datasets = []
                for d in datalist:
                    urm, urm_mappers, icm, icm_mappers, ucm, ucm_mappers = datareader.load_from_saved_sparse_matrix(
                        tmp_save_folder_path + self.get_name() + os.sep, filename_suffix="_{}_{}".format(i, d))
                    datasets.append(Dataset(datareader.get_dataset_name(),
                                            base_folder=datareader.get_default_save_path(),
                                            postprocessings=postprocessings,
                                            URM_dict=urm, URM_mappers_dict=urm_mappers,
                                            ICM_dict=icm, ICM_mappers_dict=icm_mappers,
                                            UCM_dict=ucm, UCM_mappers_dict=ucm_mappers))

                # With KFold, validation is intrinsic, so we surely have only train and test datasets
                r.append((datasets[0], datasets[1]))
            
            return r
        except Exception:

            print("DataSplitterKFold: Preloaded data not found or corrupted, reading from original files...")
            dataset = datareader.load_data(save_folder_path=save_folder_path, postprocessings=postprocessings)
            return self.split(dataset)
    def _load_from_original_file(self):
        # Load data from original

        print("SpotifySkipPredictionReader: Loading original data")

        compressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
        decompressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            compressed_train_set_file = tarfile.open(
                compressed_file_folder + "20181113_training_set.tar.gz",
                "r:gz")

        except (FileNotFoundError, tarfile.ReadError, tarfile.ExtractError):

            print("SpotifySkipPredictionReader: Unable to fild data zip file.")
            print(
                "SpotifySkipPredictionReader: Automatic download not available, please ensure the compressed data file is in folder {}."
                .format(compressed_file_folder))
            print(
                "SpotifySkipPredictionReader: Data can be downloaded here: {}".
                format(self.DATASET_URL))

            # If directory does not exist, create
            if not os.path.exists(compressed_file_folder):
                os.makedirs(compressed_file_folder)

            raise FileNotFoundError("Automatic download not available.")

        #session_id,session_position,session_length,track_id_clean,skip_1,skip_2,skip_3,not_skipped,context_switch,no_pause_before_play,short_pause_before_play,long_pause_before_play,hist_user_behavior_n_seekfwd,hist_user_behavior_n_seekback,hist_user_behavior_is_shuffle,hour_of_day,date,premium,context_type,hist_user_behavior_reason_start,hist_user_behavior_reason_end

        URM_builder = IncrementalSparseMatrix_FilterIDs(
            preinitialized_col_mapper=None,
            on_new_col="add",
            preinitialized_row_mapper=None,
            on_new_row="add",
            dtype=np.bool_)

        # If directory does not exist, create

        sps_blocks_path = decompressed_file_folder + "sps_blocks/"
        if not os.path.exists(sps_blocks_path):
            os.makedirs(sps_blocks_path)

        next_file = ""
        file_counter = 0
        interaction_counter = 0

        while next_file is not None:

            next_file = compressed_train_set_file.next()

            if file_counter <= 650:
                if next_file.isfile():
                    file_counter += 1
                    print("Skipping file {}: '{}'".format(
                        file_counter, next_file.path))
                continue

            if next_file is not None and next_file.isfile():

                print("Extracting: '{}'".format(next_file.path))

                compressed_train_set_file.extractall(
                    path=decompressed_file_folder + "decompressed/",
                    members=[next_file])
                decompressed_file_path = decompressed_file_folder + "decompressed/" + next_file.path
                self._load_URM_events(URM_builder, decompressed_file_path)
                file_counter += 1

                print("Loaded {}/660 files, {:.2E} interactions".format(
                    file_counter, interaction_counter + URM_builder.get_nnz()))

                os.remove(decompressed_file_path)

            if file_counter % 50 == 0 or next_file is None:

                URM_all = URM_builder.get_SparseMatrix()

                print("Saving {}".format(sps_blocks_path +
                                         "URM_file_{}".format(file_counter)))

                sps.save_npz(
                    sps_blocks_path + "URM_file_{}".format(file_counter),
                    URM_all)
                item_mapper = URM_builder.get_row_token_to_id_mapper()
                user_mapper = URM_builder.get_column_token_to_id_mapper()
                interaction_counter += URM_builder.get_nnz()

                URM_builder = IncrementalSparseMatrix_FilterIDs(
                    preinitialized_col_mapper=item_mapper,
                    on_new_col="add",
                    preinitialized_row_mapper=user_mapper,
                    on_new_row="add",
                    dtype=np.bool_)

        compressed_train_set_file.close()

        print("ThirtyMusicReader: cleaning temporary files")

        import shutil

        shutil.rmtree(decompressed_file_folder + "decompressed/",
                      ignore_errors=True)

        print("ThirtyMusicReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       })
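
# The loop above flushes a partial URM to disk every 50 files with
# sps.save_npz, re-seeding the builder with the previous mappers so the blocks
# stay index-consistent. Under that assumption, the saved blocks could later
# be recombined with a sketch like this (duplicate interactions simply add up):
import glob
import scipy.sparse as sps

def combine_saved_urm_blocks(sps_blocks_path):
    URM_all = None
    for file_path in sorted(glob.glob(sps_blocks_path + "URM_file_*.npz")):
        block = sps.load_npz(file_path)
        if URM_all is None:
            URM_all = block
            continue
        # Pad both matrices to a common shape before summing.
        shape = (max(URM_all.shape[0], block.shape[0]),
                 max(URM_all.shape[1], block.shape[1]))
        URM_all.resize(*shape)
        block.resize(*shape)
        URM_all = URM_all + block
    return URM_all
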
Example #15
    def split(self, dataset):

        super(WarmItemsKFold, self).split(dataset)

        # I can do the kfold of a slice of the initial URM!
        if self.percentage_initial_data_to_split < 1.0:
            h = Holdout(train_perc=self.percentage_initial_data_to_split, test_perc=1-self.percentage_initial_data_to_split)
            dataset = h.split(dataset)[0]

        folds = []
        URM = dataset.get_URM().tocoo()
        split_belonging = np.random.choice(self.n_folds, URM.data.size, replace=True)

        for i in range(self.n_folds):

            urm = {}
            urm_mappers = {}
            mask = split_belonging == i
            for URM_name in dataset.get_URM_names():
                URM = dataset.get_URM(URM_name).tocoo()
                # Sort nnz values by row and column indices, in order to remain consistent in the splits of different URMs
                row, col, data = zip(*sorted(zip(URM.row, URM.col, URM.data), key=lambda x: (x[0], x[1])))
                urm[URM_name] = sps.csr_matrix((np.array(data)[mask], (np.array(row)[mask], np.array(col)[mask])),
                                               shape=URM.shape)
                urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)

            folds.append(
                Dataset(dataset.get_name(), base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=urm, URM_mappers_dict=urm_mappers,
                        ICM_dict=dataset.get_ICM_dict(), ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(), UCM_mappers_dict=dataset.get_UCM_mappers_dict()
                        )
            )

        r = []
        for i in range(self.n_folds):
            urm = {}
            urm_mappers = {}
            for URM_name in folds[i].get_URM_names():
                # Keep i-th fold as test and merge the others as train
                urm[URM_name] = folds[(i + 1) % self.n_folds].get_URM(URM_name)
                urm_mappers[URM_name] = folds[(i + 1) % self.n_folds].get_URM_mapper(URM_name)
                for j in range(2, self.n_folds):
                    urm[URM_name] += folds[(i + j) % self.n_folds].get_URM(URM_name)

            train = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                            postprocessings=folds[i].get_postprocessings(),
                            URM_dict=urm, URM_mappers_dict=urm_mappers,
                            ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                            UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            urm = {}
            test_urm = folds[i].get_URM()
            test_urm.sort_indices()
            mask = test_urm.data <= self.test_rating_threshold
            for URM_name in folds[i].get_URM_names():
                urm[URM_name] = folds[i].get_URM(URM_name)
                urm[URM_name].sort_indices()
                urm[URM_name].data[mask] = 0.0
                urm[URM_name].eliminate_zeros()

            test = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                           postprocessings=folds[i].get_postprocessings(),
                           URM_dict=urm, URM_mappers_dict=folds[i].get_URM_mappers_dict(),
                           ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                           UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            if not self.allow_cold_users:
                users_to_remove = np.arange(train.n_users)[np.ediff1d(train.get_URM().tocsr().indptr) <= 0]
                train.remove_users(users_to_remove)
                test.remove_users(users_to_remove)

            r.append((train, test))
        return r
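
# The heart of the warm-item K-fold above is assigning each nonzero entry of
# the URM to a random fold and materializing every fold as its own sparse
# matrix with the original shape. A compact, self-contained sketch of that
# step (the seed is only for reproducibility of the example):
import numpy as np
import scipy.sparse as sps

def split_interactions_into_folds(URM, n_folds, seed=None):
    URM = URM.tocoo()
    rng = np.random.default_rng(seed)
    fold_of_entry = rng.integers(0, n_folds, size=URM.nnz)
    folds = []
    for fold_index in range(n_folds):
        mask = fold_of_entry == fold_index
        folds.append(sps.csr_matrix(
            (URM.data[mask], (URM.row[mask], URM.col[mask])), shape=URM.shape))
    return folds
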
Example #16
    def _load_from_original_file(self):
        # Load data from original

        print("Movielens20MReader: Loading original data")

        zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")

        except (FileNotFoundError, zipfile.BadZipFile):

            print(
                "Movielens20MReader: Unable to fild data zip file. Downloading..."
            )
            downloadFromURL(self.DATASET_URL, zipFile_path, "ml-20m.zip")
            dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")

        genres_path = dataFile.extract("ml-20m/movies.csv",
                                       path=zipFile_path + "decompressed/")
        tags_path = dataFile.extract("ml-20m/tags.csv",
                                     path=zipFile_path + "decompressed/")
        URM_path = dataFile.extract("ml-20m/ratings.csv",
                                    path=zipFile_path + "decompressed/")

        print("Movielens20MReader: loading genres")
        ICM_genres, genres_mapper, item_mapper = self._loadICM_genres(
            genres_path, header=True, separator=',', genresSeparator="|")

        print("Movielens20MReader: loading tags")
        ICM_tags, tags_mapper, _ = self._loadICM_tags(tags_path,
                                                      item_mapper,
                                                      header=True,
                                                      separator=',',
                                                      if_new_item="ignore")

        print("Movielens20MReader: loading URM")
        URM_all, _, user_mapper = self._loadURM(URM_path,
                                                item_mapper,
                                                separator=",",
                                                header=True,
                                                if_new_user="******",
                                                if_new_item="ignore")

        ICM_all, feature_mapper = self._merge_ICM(ICM_genres, ICM_tags,
                                                  genres_mapper, tags_mapper)

        print("Movielens20MReader: cleaning temporary files")

        import shutil

        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("Movielens20MReader: saving URM and ICM")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       },
                       ICM_dict={
                           "ICM_genres": ICM_genres,
                           "ICM_tags": ICM_tags,
                           "ICM_all": ICM_all
                       },
                       ICM_mappers_dict={
                           "ICM_genres":
                           (item_mapper.copy(), genres_mapper.copy()),
                           "ICM_tags":
                           (item_mapper.copy(), tags_mapper.copy()),
                           "ICM_all":
                           (item_mapper.copy(), feature_mapper.copy())
                       })
Example #17
    def split(self, dataset):

        super(ColdItemsKFold, self).split(dataset)

        folds = []
        split_belonging = np.random.choice(self.n_folds, dataset.n_items, replace=True)

        for i in range(self.n_folds):

            urm = {}
            urm_mappers = {}
            mask = split_belonging != i
            for URM_name in dataset.get_URM_names():
                URM = dataset.get_URM(URM_name).tocsc(copy=True)
                # Zero out the columns of items assigned to other folds, keeping only this fold's items
                for j in np.arange(URM.shape[1])[mask].tolist():
                    URM.data[URM.indptr[j]:URM.indptr[j + 1]] = 0.0
                URM.eliminate_zeros()
                urm[URM_name] = URM.tocsr()
                urm_mappers[URM_name] = dataset.get_URM_mapper(URM_name)

            folds.append(
                Dataset(dataset.get_name(), base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=urm, URM_mappers_dict=urm_mappers,
                        ICM_dict=dataset.get_ICM_dict(), ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(), UCM_mappers_dict=dataset.get_UCM_mappers_dict()
                        )
            )

        r = []
        for i in range(self.n_folds):
            urm = {}
            urm_mappers = {}
            for URM_name in folds[i].get_URM_names():
                # Keep i-th fold as test and merge the others as train
                urm[URM_name] = folds[(i + 1) % self.n_folds].get_URM(URM_name)
                urm_mappers[URM_name] = folds[(i + 1) % self.n_folds].get_URM_mapper(URM_name)
                for j in range(2, self.n_folds):
                    urm[URM_name] += folds[(i + j) % self.n_folds].get_URM(URM_name)

            train = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                            postprocessings=folds[i].get_postprocessings(),
                            URM_dict=urm, URM_mappers_dict=urm_mappers,
                            ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                            UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            urm = {}
            test_urm = folds[i].get_URM()
            test_urm.sort_indices()
            mask = test_urm.data <= self.test_rating_threshold
            for URM_name in folds[i].get_URM_names():
                urm[URM_name] = folds[i].get_URM(URM_name)
                urm[URM_name].sort_indices()
                urm[URM_name].data[mask] = 0.0
                urm[URM_name].eliminate_zeros()

            test = Dataset(folds[i].get_name(), base_folder=folds[i].get_base_folder(),
                           postprocessings=folds[i].get_postprocessings(),
                           URM_dict=urm, URM_mappers_dict=folds[i].get_URM_mappers_dict(),
                           ICM_dict=folds[i].get_ICM_dict(), ICM_mappers_dict=folds[i].get_ICM_mappers_dict(),
                           UCM_dict=folds[i].get_UCM_dict(), UCM_mappers_dict=folds[i].get_UCM_mappers_dict())

            if not self.allow_cold_users:
                users_to_remove = np.arange(train.n_users)[np.ediff1d(train.get_URM().tocsr().indptr) <= 0]
                train.remove_users(users_to_remove)
                test.remove_users(users_to_remove)

            r.append((train, test))

        return r
    def _load_from_original_file(self):

        print("TheMoviesDatasetReader: Loading original data")

        compressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
        decompressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        zipFile_name = "the-movies-dataset.zip"

        try:

            dataFile = zipfile.ZipFile(compressed_zip_file_folder +
                                       zipFile_name)

            credits_path = dataFile.extract("credits.csv",
                                            path=decompressed_zip_file_folder +
                                            "decompressed/")
            metadata_path = dataFile.extract(
                "movies_metadata.csv",
                path=decompressed_zip_file_folder + "decompressed/")
            movielens_tmdb_id_map_path = dataFile.extract(
                "links.csv",
                path=decompressed_zip_file_folder + "decompressed/")

            URM_path = dataFile.extract("ratings.csv",
                                        path=decompressed_zip_file_folder +
                                        "decompressed/")

        except (FileNotFoundError, zipfile.BadZipFile):

            print(
                "TheMoviesDatasetReader: Unable to find or extract data zip file."
            )
            print(
                "TheMoviesDatasetReader: Automatic download not available, please ensure the ZIP data file is in folder {}."
                .format(compressed_zip_file_folder))
            print(
                "TheMoviesDatasetReader: Data zip file not found or damaged. You may download the data from: {}"
                .format(self.DATASET_URL))

            # If directory does not exist, create
            if not os.path.exists(compressed_zip_file_folder):
                os.makedirs(compressed_zip_file_folder)

            raise FileNotFoundError("Automatic download not available.")

        self.item_original_ID_to_title = {}
        self.item_index_to_title = {}

        print("TheMoviesDatasetReader: Loading ICM_credits")
        ICM_credits, ICM_credits_mapper, item_mapper = self._loadICM_credits(
            credits_path, header=True, if_new_item="add")

        print("TheMoviesDatasetReader: Loading ICM_metadata")
        ICM_metadata, ICM_metadata_mapper, item_mapper = self._loadICM_metadata(
            metadata_path, item_mapper, header=True, if_new_item="add")

        ICM_credits, _, ICM_credits_mapper = removeFeatures(
            ICM_credits,
            minOccurrence=5,
            maxPercOccurrence=0.30,
            reconcile_mapper=ICM_credits_mapper)

        ICM_metadata, _, ICM_metadata_mapper = removeFeatures(
            ICM_metadata,
            minOccurrence=5,
            maxPercOccurrence=0.30,
            reconcile_mapper=ICM_metadata_mapper)

        n_items = ICM_metadata.shape[0]

        ICM_credits = reshapeSparse(ICM_credits,
                                    (n_items, ICM_credits.shape[1]))

        # IMPORTANT: ICM uses TMDB indices, URM uses movielens indices
        # Load index mapper
        movielens_id_to_tmdb, tmdb_to_movielens_id = self._load_item_id_mapping(
            movielens_tmdb_id_map_path, header=True)

        # Modify saved mapper to accept movielens id instead of tmdb
        item_mapper = self._replace_tmdb_id_with_movielens(
            tmdb_to_movielens_id, item_mapper)

        print("TheMoviesDatasetReader: Loading URM")
        URM_all, _, user_mapper = self._load_URM(URM_path,
                                                 item_mapper,
                                                 header=True,
                                                 separator=",",
                                                 if_new_user="******",
                                                 if_new_item="ignore")

        # Reconcile URM and ICM
        # Keep only items having ICM entries, remove all the others
        n_items = ICM_credits.shape[0]
        URM_all = URM_all[:, 0:n_items]

        # URM is already clean

        ICM_all, ICM_all_mapper = self._merge_ICM(ICM_credits, ICM_metadata,
                                                  ICM_credits_mapper,
                                                  ICM_metadata_mapper)

        print("TheMoviesDatasetReader: cleaning temporary files")

        import shutil

        shutil.rmtree(decompressed_zip_file_folder + "decompressed/",
                      ignore_errors=True)

        print("TheMoviesDatasetReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       },
                       ICM_dict={
                           "ICM_credits": ICM_credits,
                           "ICM_metadata": ICM_metadata,
                           "ICM_all": ICM_all
                       },
                       ICM_mappers_dict={
                           "ICM_credits":
                           (item_mapper.copy(), ICM_credits_mapper.copy()),
                           "ICM_metadata":
                           (item_mapper.copy(), ICM_metadata_mapper.copy()),
                           "ICM_all":
                           (item_mapper.copy(), ICM_all_mapper.copy())
                       })
    def _load_from_original_file(self):
        # Load data from original

        print("ThirtyMusicReader: Loading original data")

        compressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
        decompressed_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        credits_path = "entities/albums.idomaar"
        persons_path = "entities/persons.idomaar"
        playlist_path = "entities/playlist.idomaar"
        tags_path = "entities/tags.idomaar"
        tracks_path = "entities/tracks.idomaar"
        users_path = "entities/users.idomaar"

        events_path = "relations/events.idomaar"
        love_path = "relations/love.idomaar"
        sessions_path = "relations/sessions.idomaar"

        try:

            compressed_file = tarfile.open(
                compressed_file_folder + "ThirtyMusic.tar.gz", "r:gz")
            compressed_file.extract(tracks_path,
                                    path=decompressed_file_folder +
                                    "decompressed/")
            compressed_file.extract(events_path,
                                    path=decompressed_file_folder +
                                    "decompressed/")
            compressed_file.close()

        except (FileNotFoundError, tarfile.ReadError, tarfile.ExtractError):

            print("ThirtyMusicReader: Unable to fild data zip file.")
            print(
                "ThirtyMusicReader: Automatic download not available, please ensure the compressed data file is in folder {}."
                .format(compressed_file_folder))
            print("ThirtyMusicReader: Data can be downloaded here: {}".format(
                self.DATASET_URL))

            # If directory does not exist, create
            if not os.path.exists(compressed_file_folder):
                os.makedirs(compressed_file_folder)

            raise FileNotFoundError("Automatic download not available.")

        tracks_path = decompressed_file_folder + "decompressed/" + tracks_path
        events_path = decompressed_file_folder + "decompressed/" + events_path

        print("ThirtyMusicReader: loading ICM_tracks")
        ICM_all, feature_mapper, item_mapper = self._load_ICM_tracks(
            tracks_path, if_new_item="add")

        print("ThirtyMusicReader: loading URM_events")
        URM_all, _, user_mapper = self._load_URM_events(events_path,
                                                        item_mapper,
                                                        if_new_user="******",
                                                        if_new_item="ignore")

        print("ThirtyMusicReader: cleaning temporary files")

        import shutil

        shutil.rmtree(decompressed_file_folder + "decompressed/",
                      ignore_errors=True)

        print("ThirtyMusicReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       },
                       ICM_dict={"ICM_all": ICM_all},
                       ICM_mappers_dict={
                           "ICM_all":
                           (item_mapper.copy(), feature_mapper.copy())
                       })
Example #20
    def _load_from_original_file(self):
        # Load data from original

        print("MovielensHetrecReader: Loading original data")

        zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            dataFile = zipfile.ZipFile(zipFile_path +
                                       "hetrec2011-movielens-2k-v2.zip")

        except (FileNotFoundError, zipfile.BadZipFile):

            print(
                "MovielensHetrecReader: Unable to fild data zip file. Downloading..."
            )
            downloadFromURL(self.DATASET_URL, zipFile_path,
                            "hetrec2011-movielens-2k-v2.zip")
            dataFile = zipfile.ZipFile(zipFile_path +
                                       "hetrec2011-movielens-2k-v2.zip")

        movies_path = dataFile.extract("movies.dat",
                                       path=zipFile_path + "decompressed/")
        genres_path = dataFile.extract("movie_genres.dat",
                                       path=zipFile_path + "decompressed/")
        directors_path = dataFile.extract("movie_directors.dat",
                                          path=zipFile_path + "decompressed/")
        actors_path = dataFile.extract("movie_actors.dat",
                                       path=zipFile_path + "decompressed/")
        countries_path = dataFile.extract("movie_countries.dat",
                                          path=zipFile_path + "decompressed/")
        locations_path = dataFile.extract("movie_locations.dat",
                                          path=zipFile_path + "decompressed/")
        URM_path = dataFile.extract("user_ratedmovies.dat",
                                    path=zipFile_path + "decompressed/")

        print("MovielensHetrecReader: loading years")
        ICM_years, years_mapper, item_mapper = self._load_tsv(
            movies_path,
            None,
            feature_columns=[5],
            header=True,
            if_new_item="add")

        print("MovielensHetrecReader: loading genres")
        ICM_genres, genres_mapper, _ = self._load_tsv(genres_path,
                                                      item_mapper,
                                                      header=True,
                                                      if_new_item="ignore")
        ICM_all, feature_mapper = self._merge_ICM(ICM_genres, ICM_years,
                                                  genres_mapper, years_mapper)

        print("MovielensHetrecReader: loading directors")
        ICM_directors, directors_mapper, _ = self._load_tsv(
            directors_path, item_mapper, header=True, if_new_item="ignore")
        ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_directors,
                                                  feature_mapper,
                                                  directors_mapper)

        print("MovielensHetrecReader: loading actors")
        ICM_actors, actors_mapper, _ = self._load_tsv(actors_path,
                                                      item_mapper,
                                                      header=True,
                                                      if_new_item="ignore")
        ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_actors,
                                                  feature_mapper,
                                                  actors_mapper)

        print("MovielensHetrecReader: loading countries")
        ICM_countries, countries_mapper, _ = self._load_tsv(
            countries_path, item_mapper, header=True, if_new_item="ignore")
        ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_countries,
                                                  feature_mapper,
                                                  countries_mapper)

        print("MovielensHetrecReader: loading locations")
        ICM_locations, locations_mapper, _ = self._load_tsv(
            locations_path,
            item_mapper,
            feature_columns=[1, 2, 3],
            header=True,
            if_new_item="ignore")
        ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_locations,
                                                  feature_mapper,
                                                  locations_mapper)

        print("MovielensHetrecReader: loading URM")
        URM_all, _, user_mapper = self._loadURM(URM_path,
                                                item_mapper,
                                                separator="\t",
                                                header=True,
                                                if_new_user="******",
                                                if_new_item="ignore")

        print("MovielensHetrecReader: cleaning temporary files")

        import shutil

        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("MovielensHetrecReader: saving URM and ICM")

        return Dataset(
            self.get_dataset_name(),
            URM_dict={"URM_all": URM_all},
            URM_mappers_dict={
                "URM_all": (user_mapper.copy(), item_mapper.copy())
            },
            ICM_dict={
                "ICM_genres": ICM_genres,
                "ICM_years": ICM_years,
                "ICM_all": ICM_all,
                "ICM_directors": ICM_directors,
                "ICM_actors": ICM_actors,
                "ICM_countries": ICM_countries,
                "ICM_locations": ICM_locations,
            },
            ICM_mappers_dict={
                "ICM_genres": (item_mapper.copy(), genres_mapper.copy()),
                "ICM_years": (item_mapper.copy(), years_mapper.copy()),
                "ICM_directors": (item_mapper.copy(), directors_mapper.copy()),
                "ICM_actors": (item_mapper.copy(), actors_mapper.copy()),
                "ICM_countries": (item_mapper.copy(), countries_mapper.copy()),
                "ICM_locations": (item_mapper.copy(), locations_mapper.copy()),
                "ICM_all": (item_mapper.copy(), feature_mapper.copy())
            })
Example #21
    def split(self, dataset):

        super(ColdItemsHoldout, self).split(dataset)

        n_users, n_items = dataset.n_users, dataset.n_items
        URM_train, URM_test, URM_validation = {}, {}, {}

        items_split = np.random.choice(
            3,
            n_items,
            replace=True,
            p=[self.train_perc, self.validation_perc, self.test_perc])
        train_items = np.arange(n_items)[items_split == 0]
        validation_items = np.arange(n_items)[items_split == 1]
        test_items = np.arange(n_items)[items_split == 2]

        # Keep track of users to remove (e.g. users left without train interactions)
        users_to_remove = []

        for URM_name in dataset.get_URM_names():

            URM = dataset.get_URM(URM_name)
            URM = sps.csr_matrix(URM)

            URM_train_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            URM_test_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            if self.with_validation:
                URM_validation_builder = IncrementalSparseMatrix(
                    auto_create_row_mapper=False,
                    n_rows=n_users,
                    auto_create_col_mapper=False,
                    n_cols=n_items)

            for user_id in range(n_users):

                start_user_position = URM.indptr[user_id]
                end_user_position = URM.indptr[user_id + 1]

                user_interaction_items = URM.indices[
                    start_user_position:end_user_position]
                user_interaction_data = URM.data[
                    start_user_position:end_user_position]

                # Test interactions
                indices = np.in1d(user_interaction_items,
                                  test_items,
                                  assume_unique=True)
                user_interaction_items_test = user_interaction_items[indices]
                user_interaction_data_test = user_interaction_data[indices]

                # Remove from test interactions below a given threshold
                mask = user_interaction_data_test > self.test_rating_threshold
                user_interaction_items_test = user_interaction_items_test[mask]
                user_interaction_data_test = user_interaction_data_test[mask]

                URM_test_builder.add_data_lists(
                    [user_id] * len(user_interaction_data_test),
                    user_interaction_items_test, user_interaction_data_test)

                # Validation interactions
                if self.with_validation:
                    indices = np.in1d(user_interaction_items,
                                      validation_items,
                                      assume_unique=True)
                    user_interaction_items_validation = user_interaction_items[
                        indices]
                    user_interaction_data_validation = user_interaction_data[
                        indices]

                    # Drop validation interactions whose rating is at or below the threshold
                    mask = user_interaction_data_validation > self.test_rating_threshold
                    user_interaction_items_validation = user_interaction_items_validation[
                        mask]
                    user_interaction_data_validation = user_interaction_data_validation[
                        mask]

                    URM_validation_builder.add_data_lists(
                        [user_id] * len(user_interaction_data_validation),
                        user_interaction_items_validation,
                        user_interaction_data_validation)

                    #if len(user_interaction_items_validation) <= 0:
                    #    users_to_remove.append(user_id)

                # Train interactions
                indices = np.in1d(user_interaction_items,
                                  train_items,
                                  assume_unique=True)
                user_interaction_items_train = user_interaction_items[indices]
                user_interaction_data_train = user_interaction_data[indices]

                URM_train_builder.add_data_lists(
                    [user_id] * len(user_interaction_items_train),
                    user_interaction_items_train, user_interaction_data_train)

                #if len(user_interaction_items_test) <= 0:
                #    users_to_remove.append(user_id)

                if not self.allow_cold_users and len(
                        user_interaction_items_train) <= 0:
                    users_to_remove.append(user_id)

            URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
            URM_test[URM_name] = URM_test_builder.get_SparseMatrix()

            if self.with_validation:
                URM_validation[
                    URM_name] = URM_validation_builder.get_SparseMatrix()

        train = Dataset(dataset.get_name(),
                        base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=URM_train,
                        URM_mappers_dict=dataset.get_URM_mappers_dict(),
                        ICM_dict=dataset.get_ICM_dict(),
                        ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(),
                        UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        train.remove_users(users_to_remove)

        test = Dataset(dataset.get_name(),
                       base_folder=dataset.get_base_folder(),
                       postprocessings=dataset.get_postprocessings(),
                       URM_dict=URM_test,
                       URM_mappers_dict=dataset.get_URM_mappers_dict(),
                       ICM_dict=dataset.get_ICM_dict(),
                       ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                       UCM_dict=dataset.get_UCM_dict(),
                       UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        test.remove_users(users_to_remove)

        if self.with_validation:
            validation = Dataset(
                dataset.get_name(),
                base_folder=dataset.get_base_folder(),
                postprocessings=dataset.get_postprocessings(),
                URM_dict=URM_validation,
                URM_mappers_dict=dataset.get_URM_mappers_dict(),
                ICM_dict=dataset.get_ICM_dict(),
                ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                UCM_dict=dataset.get_UCM_dict(),
                UCM_mappers_dict=dataset.get_UCM_mappers_dict())
            validation.remove_users(users_to_remove)
            return train, test, validation
        else:
            return train, test
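A minimal usage sketch for the splitter above. The constructor arguments mirror the attributes read inside split (train_perc, validation_perc, test_perc, test_rating_threshold, allow_cold_users, with_validation); the exact signature and import path are assumptions, not taken from this file.

# Hypothetical construction; argument names mirror the self.* attributes used above.
splitter = ColdItemsHoldout(train_perc=0.8, validation_perc=0.1, test_perc=0.1,
                            test_rating_threshold=3.0, allow_cold_users=False,
                            with_validation=True)

# Items (columns) are partitioned, so every test/validation item is unseen in train.
train, test, validation = splitter.split(dataset)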
Example #22
    def split(self, dataset):

        super(Holdout, self).split(dataset)

        URM = sps.csr_matrix(dataset.get_URM())

        n_users, n_items = dataset.n_users, dataset.n_items
        user_indices = []
        URM_train, URM_test, URM_validation = {}, {}, {}

        # Decide a priori how each user's interactions are randomly assigned to the splits
        users_to_remove = []
        for user_id in range(n_users):
            assignment = np.random.choice(
                3,
                URM.indptr[user_id + 1] - URM.indptr[user_id],
                replace=True,
                p=[self.train_perc, self.validation_perc, self.test_perc])
            assignments = [assignment == i for i in range(3)]
            #if assignments[2].sum() <= 0:
            #No interactions in test
            #    users_to_remove.append(user_id)
            #if self.with_validation and assignments[1].sum() <= 0:
            #No interactions in validation
            #    users_to_remove.append(user_id)
            if not self.allow_cold_users and assignments[0].sum() <= 0:
                #No interactions in train
                users_to_remove.append(user_id)
            user_indices.append(assignments)

        for URM_name in dataset.get_URM_names():

            URM = dataset.get_URM(URM_name)
            URM = sps.csr_matrix(URM)

            URM_train_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            URM_test_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            if self.with_validation:
                URM_validation_builder = IncrementalSparseMatrix(
                    auto_create_row_mapper=False,
                    n_rows=n_users,
                    auto_create_col_mapper=False,
                    n_cols=n_items)

            users_to_remove_index = 0
            for user_id in range(n_users):

                if users_to_remove_index < len(
                        users_to_remove
                ) and user_id == users_to_remove[users_to_remove_index]:
                    users_to_remove_index += 1
                    continue

                indices = user_indices[user_id]

                start_user_position = URM.indptr[user_id]
                end_user_position = URM.indptr[user_id + 1]

                user_interaction_items = URM.indices[
                    start_user_position:end_user_position]
                user_interaction_data = URM.data[
                    start_user_position:end_user_position]

                # Test interactions
                user_interaction_items_test = user_interaction_items[
                    indices[2]]
                user_interaction_data_test = user_interaction_data[indices[2]]

                mask = user_interaction_data_test > self.test_rating_threshold
                user_interaction_items_test = user_interaction_items_test[mask]
                user_interaction_data_test = user_interaction_data_test[mask]

                URM_test_builder.add_data_lists(
                    [user_id] * len(user_interaction_data_test),
                    user_interaction_items_test, user_interaction_data_test)

                # Validation interactions
                if self.with_validation:
                    user_interaction_items_validation = user_interaction_items[
                        indices[1]]
                    user_interaction_data_validation = user_interaction_data[
                        indices[1]]

                    # Drop validation interactions whose rating is at or below the threshold
                    mask = user_interaction_data_validation > self.test_rating_threshold
                    user_interaction_items_validation = user_interaction_items_validation[
                        mask]
                    user_interaction_data_validation = user_interaction_data_validation[
                        mask]

                    URM_validation_builder.add_data_lists(
                        [user_id] * len(user_interaction_data_validation),
                        user_interaction_items_validation,
                        user_interaction_data_validation)

                # Train interactions
                user_interaction_items_train = user_interaction_items[
                    indices[0]]
                user_interaction_data_train = user_interaction_data[indices[0]]

                URM_train_builder.add_data_lists(
                    [user_id] * len(user_interaction_items_train),
                    user_interaction_items_train, user_interaction_data_train)

            URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
            URM_test[URM_name] = URM_test_builder.get_SparseMatrix()

            if self.with_validation:
                URM_validation[
                    URM_name] = URM_validation_builder.get_SparseMatrix()

        train = Dataset(dataset.get_name(),
                        base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=URM_train,
                        URM_mappers_dict=dataset.get_URM_mappers_dict(),
                        ICM_dict=dataset.get_ICM_dict(),
                        ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(),
                        UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        train.remove_users(users_to_remove)

        test = Dataset(dataset.get_name(),
                       base_folder=dataset.get_base_folder(),
                       postprocessings=dataset.get_postprocessings(),
                       URM_dict=URM_test,
                       URM_mappers_dict=dataset.get_URM_mappers_dict(),
                       ICM_dict=dataset.get_ICM_dict(),
                       ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                       UCM_dict=dataset.get_UCM_dict(),
                       UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        test.remove_users(users_to_remove)

        if self.with_validation:
            validation = Dataset(
                dataset.get_name(),
                base_folder=dataset.get_base_folder(),
                postprocessings=dataset.get_postprocessings(),
                URM_dict=URM_validation,
                URM_mappers_dict=dataset.get_URM_mappers_dict(),
                ICM_dict=dataset.get_ICM_dict(),
                ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                UCM_dict=dataset.get_UCM_dict(),
                UCM_mappers_dict=dataset.get_UCM_mappers_dict())
            validation.remove_users(users_to_remove)
            return train, test, validation
        else:
            return train, test
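Unlike the cold-item split, the holdout above assigns each individual interaction of a user to train (0), validation (1) or test (2) with np.random.choice. A self-contained toy run of that assignment step, with made-up percentages:

import numpy as np

np.random.seed(42)
train_perc, validation_perc, test_perc = 0.8, 0.1, 0.1

n_user_interactions = 10  # number of interactions stored in one user's CSR row
assignment = np.random.choice(3, n_user_interactions, replace=True,
                              p=[train_perc, validation_perc, test_perc])

# Boolean masks per split, exactly as built in the per-user loop above
assignments = [assignment == i for i in range(3)]
print("train:", assignments[0].sum(),
      "validation:", assignments[1].sum(),
      "test:", assignments[2].sum())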
Example #23
    def _load_from_original_file(self):

        print("NetflixEnhancedReader: Loading original data")

        compressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
        decompressed_zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:

            dataFile = zipfile.ZipFile(compressed_zip_file_folder +
                                       "NetflixEnhancedData.zip")

            URM_matfile_path = dataFile.extract(
                "urm.mat", path=decompressed_zip_file_folder + "decompressed/")
            titles_matfile_path = dataFile.extract(
                "titles.mat",
                path=decompressed_zip_file_folder + "decompressed/")
            ICM_matfile_path = dataFile.extract(
                "icm.mat", path=decompressed_zip_file_folder + "decompressed/")

        except (FileNotFoundError, zipfile.BadZipFile):

            print(
                "NetflixEnhancedReader: Unable to find or extract data zip file.")
            print(
                "NetflixEnhancedReader: Automatic download not available, please ensure the ZIP data file is in folder {}."
                .format(compressed_zip_file_folder))
            print("NetflixEnhancedReader: Data can be downloaded here: {}".format(
                self.DATASET_URL))

            # If directory does not exist, create
            if not os.path.exists(compressed_zip_file_folder):
                os.makedirs(compressed_zip_file_folder)

            raise FileNotFoundError("Automatic download not available.")

        URM_matfile = sio.loadmat(URM_matfile_path)

        URM_all = URM_matfile["urm"]
        usercache_urm = URM_matfile["usercache_urm"]
        itemcache_urm = URM_matfile["itemcache_urm"]

        user_mapper = {}
        item_mapper = {}

        for item_id in range(URM_all.shape[1]):
            item_mapper[item_id] = item_id

        for user_id in range(URM_all.shape[0]):
            user_mapper[user_id] = user_id

        titles_matfile = sio.loadmat(titles_matfile_path)

        titles_list = titles_matfile["titles"]

        ICM_matfile = sio.loadmat(ICM_matfile_path)

        ICM_all = ICM_matfile["icm"]
        ICM_all = sps.csr_matrix(ICM_all.T)

        ICM_dictionary = ICM_matfile["dictionary"]
        itemcache_icm = ICM_matfile["itemcache_icm"]
        stemTypes = ICM_dictionary["stemTypes"][0][0]
        stems = ICM_dictionary["stems"][0][0]

        # Split ICM_tags and ICM_editorial
        is_tag_mask = np.zeros(len(stems), dtype=bool)

        ICM_all_mapper, ICM_tags_mapper, ICM_editorial_mapper = {}, {}, {}

        for current_stem_index in range(len(stems)):
            current_stem_type = stemTypes[current_stem_index]
            current_stem_type_string = current_stem_type[0][0]

            token = stems[current_stem_index][0][0]

            if token in ICM_all_mapper:
                print(
                    "Duplicate token {} already exists at position {}".format(
                        token, ICM_all_mapper[token]))

            else:
                ICM_all_mapper[token] = current_stem_index
                if "KeywordsArray" in current_stem_type_string:
                    is_tag_mask[current_stem_index] = True
                    ICM_tags_mapper[token] = len(ICM_tags_mapper)
                else:
                    ICM_editorial_mapper[token] = len(ICM_editorial_mapper)

        ICM_tags = ICM_all[:, is_tag_mask]

        is_editorial_mask = np.logical_not(is_tag_mask)
        ICM_editorial = ICM_all[:, is_editorial_mask]

        # Remove features taking into account the filtered ICM
        ICM_all, _, ICM_all_mapper = removeFeatures(
            ICM_all,
            minOccurrence=5,
            maxPercOccurrence=0.30,
            reconcile_mapper=ICM_all_mapper)
        ICM_tags, _, ICM_tags_mapper = removeFeatures(
            ICM_tags,
            minOccurrence=5,
            maxPercOccurrence=0.30,
            reconcile_mapper=ICM_tags_mapper)
        ICM_editorial, _, ICM_editorial_mapper = removeFeatures(
            ICM_editorial,
            minOccurrence=5,
            maxPercOccurrence=0.30,
            reconcile_mapper=ICM_editorial_mapper)

        print("NetflixEnhancedReader: cleaning temporary files")

        import shutil

        shutil.rmtree(decompressed_zip_file_folder + "decompressed",
                      ignore_errors=True)

        print("NetflixEnhancedReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={
                           "URM_all": (user_mapper.copy(), item_mapper.copy())
                       },
                       ICM_dict={
                           "ICM_editorial": ICM_editorial,
                           "ICM_tags": ICM_tags,
                           "ICM_all": ICM_all
                       },
                       ICM_mappers_dict={
                           "ICM_editorial":
                           (item_mapper.copy(), ICM_editorial_mapper.copy()),
                           "ICM_tags":
                           (item_mapper.copy(), ICM_tags_mapper.copy()),
                           "ICM_all":
                           (item_mapper.copy(), ICM_all_mapper.copy())
                       })
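The tag/editorial separation above reduces to selecting columns of a sparse ICM with a boolean mask. A self-contained toy example of that operation (the matrix values and the mask are made up):

import numpy as np
import scipy.sparse as sps

# 3 items x 4 features; in this toy case columns 1 and 3 are "tag" features
ICM_all = sps.csr_matrix(np.array([[1, 0, 1, 1],
                                   [0, 1, 0, 0],
                                   [1, 1, 0, 1]]))

is_tag_mask = np.array([False, True, False, True])

ICM_tags = ICM_all[:, is_tag_mask]
ICM_editorial = ICM_all[:, np.logical_not(is_tag_mask)]

print(ICM_tags.shape, ICM_editorial.shape)  # (3, 2) (3, 2)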
Example #24
    def split(self, dataset):

        super(LeaveKOut, self).split(dataset)

        URM = sps.csr_matrix(dataset.get_URM())
        URM.sort_indices()

        split_number = 2
        if self.with_validation:
            split_number += 1

        # Require at least self.k_value interactions for each split, plus one for train
        min_user_interactions = split_number * (self.k_value - 1) + 1

        users_to_preserve = np.arange(URM.shape[0])
        if not self.allow_cold_users:
            urm_threshold = URM.copy()
            urm_threshold.data[
                urm_threshold.data <= self.test_rating_threshold] = 0
            urm_threshold.eliminate_zeros()

            user_interactions = np.ediff1d(urm_threshold.tocsr().indptr)
            users_to_preserve = users_to_preserve[
                user_interactions >= min_user_interactions]

            print(
                "DataSplitterLeaveKOut: Removing {} of {} users because they have fewer than the {} interactions required for {} splits"
                .format(URM.shape[0] - len(users_to_preserve), URM.shape[0],
                        min_user_interactions, split_number))
        users_to_remove = np.setdiff1d(np.arange(URM.shape[0]),
                                       users_to_preserve)

        n_users, n_items = URM.shape
        user_indices = []
        URM_train, URM_test, URM_validation = {}, {}, {}

        # Decide a priori which of each user's interactions go to test and validation
        for user_id in users_to_preserve.tolist():
            user_profile = URM.data[URM.indptr[user_id]:
                                    URM.indptr[user_id + 1]] > self.test_rating_threshold
            test_and_val = np.random.permutation(
                np.arange(URM.indptr[user_id + 1] -
                          URM.indptr[user_id])[user_profile])

            limit = self.k_value
            if self.with_validation:
                limit = self.k_value * 2

            # Train, Test and Validation
            user_indices.append((np.setdiff1d(np.arange(len(user_profile)),
                                              test_and_val[:limit]),
                                 test_and_val[:self.k_value],
                                 test_and_val[self.k_value:limit]))

        for URM_name in dataset.get_URM_names():

            URM = dataset.get_URM(URM_name).tocsr()
            URM.sort_indices()

            URM_train_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            URM_test_builder = IncrementalSparseMatrix(
                auto_create_row_mapper=False,
                n_rows=n_users,
                auto_create_col_mapper=False,
                n_cols=n_items)

            if self.with_validation:
                URM_validation_builder = IncrementalSparseMatrix(
                    auto_create_row_mapper=False,
                    n_rows=n_users,
                    auto_create_col_mapper=False,
                    n_cols=n_items)

            for i, user_id in enumerate(users_to_preserve.tolist()):
                start_user_position = URM.indptr[user_id]
                end_user_position = URM.indptr[user_id + 1]

                indices = user_indices[i]
                user_interaction_items = URM.indices[
                    start_user_position:end_user_position]
                user_interaction_data = URM.data[
                    start_user_position:end_user_position]

                # Test interactions
                user_interaction_items_test = user_interaction_items[
                    indices[1]]
                user_interaction_data_test = user_interaction_data[indices[1]]

                URM_test_builder.add_data_lists([user_id] * self.k_value,
                                                user_interaction_items_test,
                                                user_interaction_data_test)

                train_start = self.k_value
                # Validation interactions
                if self.with_validation:
                    user_interaction_items_validation = user_interaction_items[
                        indices[2]]
                    user_interaction_data_validation = user_interaction_data[
                        indices[2]]

                    URM_validation_builder.add_data_lists(
                        [user_id] * self.k_value,
                        user_interaction_items_validation,
                        user_interaction_data_validation)
                    train_start = self.k_value * 2

                # Train interactions
                user_interaction_items_train = user_interaction_items[
                    indices[0]]
                user_interaction_data_train = user_interaction_data[indices[0]]

                URM_train_builder.add_data_lists(
                    [user_id] * len(user_interaction_items_train),
                    user_interaction_items_train, user_interaction_data_train)

            URM_train[URM_name] = URM_train_builder.get_SparseMatrix()
            URM_test[URM_name] = URM_test_builder.get_SparseMatrix()

            if self.with_validation:
                URM_validation[
                    URM_name] = URM_validation_builder.get_SparseMatrix()

        train = Dataset(dataset.get_name(),
                        base_folder=dataset.get_base_folder(),
                        postprocessings=dataset.get_postprocessings(),
                        URM_dict=URM_train,
                        URM_mappers_dict=dataset.get_URM_mappers_dict(),
                        ICM_dict=dataset.get_ICM_dict(),
                        ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                        UCM_dict=dataset.get_UCM_dict(),
                        UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        train.remove_users(users_to_remove)

        test = Dataset(dataset.get_name(),
                       base_folder=dataset.get_base_folder(),
                       postprocessings=dataset.get_postprocessings(),
                       URM_dict=URM_test,
                       URM_mappers_dict=dataset.get_URM_mappers_dict(),
                       ICM_dict=dataset.get_ICM_dict(),
                       ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                       UCM_dict=dataset.get_UCM_dict(),
                       UCM_mappers_dict=dataset.get_UCM_mappers_dict())
        test.remove_users(users_to_remove)

        if self.with_validation:
            validation = Dataset(
                dataset.get_name(),
                base_folder=dataset.get_base_folder(),
                postprocessings=dataset.get_postprocessings(),
                URM_dict=URM_validation,
                URM_mappers_dict=dataset.get_URM_mappers_dict(),
                ICM_dict=dataset.get_ICM_dict(),
                ICM_mappers_dict=dataset.get_ICM_mappers_dict(),
                UCM_dict=dataset.get_UCM_dict(),
                UCM_mappers_dict=dataset.get_UCM_mappers_dict())
            validation.remove_users(users_to_remove)
            return train, test, validation
        else:
            return train, test
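The per-user selection in the leave-k-out above can be isolated into a few NumPy calls: keep only positions whose rating passes the threshold, permute them, take the first k_value for test and the next k_value for validation, and leave everything else in train. A self-contained toy run with k_value = 2 (the ratings are made up):

import numpy as np

np.random.seed(0)
k_value = 2
test_rating_threshold = 3

# Ratings stored for one user in the CSR row
user_interaction_data = np.array([5, 2, 4, 4, 1, 5, 3])

# Only positions above the threshold are eligible for test/validation
user_profile = user_interaction_data > test_rating_threshold
eligible_positions = np.arange(len(user_profile))[user_profile]

test_and_val = np.random.permutation(eligible_positions)

test_positions = test_and_val[:k_value]
validation_positions = test_and_val[k_value:k_value * 2]
train_positions = np.setdiff1d(np.arange(len(user_profile)),
                               test_and_val[:k_value * 2])

print("test:", test_positions,
      "validation:", validation_positions,
      "train:", train_positions)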