예제 #1
0
    def _load_from_original_file(self):
        """Load the Movielens 100K interactions from the original zip archive.

        Opens ``ml-100k.zip`` inside
        ``DATASET_SPLIT_ROOT_FOLDER + DATASET_SUBFOLDER``; when the archive is
        missing or is not a valid zip file it is fetched with
        ``downloadFromURL(self.DATASET_URL, ...)`` and opened again.  The
        member ``ml-100k/u.data`` is extracted to a temporary ``decompressed/``
        subfolder and parsed (tab separated, no header) with
        ``load_CSV_into_SparseBuilder``.

        Sets ``self.URM_all``, ``self.item_original_ID_to_index`` and
        ``self.user_original_ID_to_index``.

        The archive handle is closed as soon as extraction is done and the
        temporary folder is removed even when parsing raises.
        """
        import shutil

        print("Movielens100KReader: Loading original data")

        zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
        archive_path = zipFile_path + "ml-100k.zip"

        try:
            dataFile = zipfile.ZipFile(archive_path)
        except (FileNotFoundError, zipfile.BadZipFile):
            print(
                "Movielens100KReader: Unable to fild data zip file. Downloading..."
            )
            downloadFromURL(self.DATASET_URL, zipFile_path, "ml-100k.zip")
            dataFile = zipfile.ZipFile(archive_path)

        try:
            # The context manager closes the archive once the member is on disk.
            with dataFile:
                URM_path = dataFile.extract("ml-100k/u.data",
                                            path=zipFile_path + "decompressed/")

            (self.URM_all,
             self.item_original_ID_to_index,
             self.user_original_ID_to_index) = load_CSV_into_SparseBuilder(
                URM_path, separator="\t", header=False)
        finally:
            # Runs on failure too, so extracted files never linger on disk.
            print("Movielens100KReader: cleaning temporary files")
            shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("Movielens100KReader: loading complete")
    def _load_from_original_file(self):
        """Load the Movielens 10M URM and ICMs from the original zip archive.

        Opens ``ml-10m.zip`` inside
        ``DATASET_SPLIT_ROOT_FOLDER + DATASET_SUBFOLDER``; when the archive is
        missing or is not a valid zip file it is fetched with
        ``downloadFromURL(self.DATASET_URL, ...)`` and opened again.  The
        movies, tags and ratings files are extracted to a temporary
        ``decompressed/`` subfolder and parsed in that order, because the item
        mapper produced by the genre loader is passed on to the tag loader and
        to the URM loader.

        Results are stored in ``self._LOADED_ICM_DICT`` (``ICM_genres``,
        ``ICM_tags``, ``ICM_all``), ``self._LOADED_ICM_MAPPER_DICT`` (same
        keys), ``self._LOADED_URM_DICT`` (``URM_all``, ``URM_timestamp``) and
        ``self._LOADED_GLOBAL_MAPPER_DICT`` (user and item ID mappers).

        The archive handle is closed as soon as extraction is done and the
        temporary folder is removed even when parsing raises.
        """
        zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
        archive_path = zipFile_path + "ml-10m.zip"
        decompressed_path = zipFile_path + "decompressed/"

        try:
            dataFile = zipfile.ZipFile(archive_path)
        except (FileNotFoundError, zipfile.BadZipFile):
            print("Movielens10MReader: Unable to fild data zip file. Downloading...")
            downloadFromURL(self.DATASET_URL, zipFile_path, "ml-10m.zip")
            dataFile = zipfile.ZipFile(archive_path)

        try:
            # The context manager closes the archive once all members are on disk.
            with dataFile:
                genres_path = dataFile.extract("ml-10M100K/movies.dat", path=decompressed_path)
                tags_path = dataFile.extract("ml-10M100K/tags.dat", path=decompressed_path)
                URM_path = dataFile.extract("ml-10M100K/ratings.dat", path=decompressed_path)

            print("Movielens10MReader: loading genres")
            ICM_genres, tokenToFeatureMapper_ICM_genres, self.item_original_ID_to_index = _loadICM_genres(
                genres_path, header=True, separator='::', genresSeparator="|")

            self._LOADED_ICM_DICT["ICM_genres"] = ICM_genres
            self._LOADED_ICM_MAPPER_DICT["ICM_genres"] = tokenToFeatureMapper_ICM_genres

            print("Movielens10MReader: loading tags")
            # The third return value of the tag loader is discarded; the item
            # mapper built by the genre loader is passed in and kept as is.
            ICM_tags, tokenToFeatureMapper_ICM_tags, _ = _loadICM_tags(
                tags_path, header=True, separator='::', if_new_item="ignore",
                item_original_ID_to_index=self.item_original_ID_to_index)

            self._LOADED_ICM_DICT["ICM_tags"] = ICM_tags
            self._LOADED_ICM_MAPPER_DICT["ICM_tags"] = tokenToFeatureMapper_ICM_tags

            print("Movielens10MReader: loading URM")
            URM_all, self.item_original_ID_to_index, self.user_original_ID_to_index, URM_timestamp = _loadURM_preinitialized_item_id(
                URM_path, separator="::",
                header=False, if_new_user="******", if_new_item="ignore",
                item_original_ID_to_index=self.item_original_ID_to_index)

            self._LOADED_URM_DICT["URM_all"] = URM_all
            self._LOADED_URM_DICT["URM_timestamp"] = URM_timestamp
            self._LOADED_GLOBAL_MAPPER_DICT["user_original_ID_to_index"] = self.user_original_ID_to_index
            self._LOADED_GLOBAL_MAPPER_DICT["item_original_ID_to_index"] = self.item_original_ID_to_index

            ICM_all, tokenToFeatureMapper_ICM_all = merge_ICM(ICM_genres, ICM_tags,
                                                              tokenToFeatureMapper_ICM_genres,
                                                              tokenToFeatureMapper_ICM_tags)

            self._LOADED_ICM_DICT["ICM_all"] = ICM_all
            self._LOADED_ICM_MAPPER_DICT["ICM_all"] = tokenToFeatureMapper_ICM_all
        finally:
            # Runs on failure too, so extracted files never linger on disk.
            print("Movielens10MReader: cleaning temporary files")
            shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("Movielens10MReader: loading complete")
예제 #3
0
    def _load_from_original_file(self):
        """Load the Movielens 1M URM, ICM and UCM from the original zip archive.

        Opens ``ml-1m.zip`` inside
        ``DATASET_SPLIT_ROOT_FOLDER + DATASET_SUBFOLDER``; when the archive is
        missing or is not a valid zip file it is fetched with
        ``downloadFromURL(self.DATASET_URL, ...)`` and opened again.  The
        movies, users and ratings files are extracted to a temporary
        ``decompressed/`` subfolder.  Genres and users are parsed first because
        the item and user mappers they produce are handed to the URM loader.

        Sets ``self.ICM_genres``, ``self.tokenToFeatureMapper_ICM_genres``,
        ``self.UCM_all``, ``self.tokenToFeatureMapper_UCM_all``,
        ``self.URM_all``, ``self.item_original_ID_to_index`` and
        ``self.user_original_ID_to_index``.

        The archive handle is closed as soon as extraction is done and the
        temporary folder is removed even when parsing raises.
        """
        print("Movielens1MReader: Loading original data")

        zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
        archive_path = zipFile_path + "ml-1m.zip"
        decompressed_path = zipFile_path + "decompressed/"

        try:
            dataFile = zipfile.ZipFile(archive_path)
        except (FileNotFoundError, zipfile.BadZipFile):
            print(
                "Movielens1MReader: Unable to fild data zip file. Downloading..."
            )
            downloadFromURL(self.DATASET_URL, zipFile_path, "ml-1m.zip")
            dataFile = zipfile.ZipFile(archive_path)

        try:
            # The context manager closes the archive once all members are on disk.
            with dataFile:
                ICM_genre_path = dataFile.extract("ml-1m/movies.dat",
                                                  path=decompressed_path)
                UCM_path = dataFile.extract("ml-1m/users.dat",
                                            path=decompressed_path)
                URM_path = dataFile.extract("ml-1m/ratings.dat",
                                            path=decompressed_path)

            # Start from empty mappers so the attributes exist even if a
            # loader below fails before assigning them.
            self.tokenToFeatureMapper_ICM_genres = {}
            self.tokenToFeatureMapper_UCM_all = {}

            print("Movielens1MReader: loading genres")
            (self.ICM_genres,
             self.tokenToFeatureMapper_ICM_genres,
             self.item_original_ID_to_index) = _loadICM_genres(
                ICM_genre_path, header=True, separator='::', genresSeparator="|")

            print("Movielens1MReader: loading UCM")
            (self.UCM_all,
             self.tokenToFeatureMapper_UCM_all,
             self.user_original_ID_to_index) = _loadUCM(
                UCM_path, header=True, separator='::')

            print("Movielens1MReader: loading URM")
            # NOTE(review): header=True is passed for ratings.dat; confirm the
            # loader's handling of this flag matches the actual file layout.
            (self.URM_all,
             self.item_original_ID_to_index,
             self.user_original_ID_to_index) = _loadURM_preinitialized_item_id(
                URM_path,
                separator="::",
                header=True,
                if_new_user="******",
                if_new_item="ignore",
                item_original_ID_to_index=self.item_original_ID_to_index,
                user_original_ID_to_index=self.user_original_ID_to_index)
        finally:
            # Runs on failure too, so extracted files never linger on disk.
            print("Movielens1MReader: cleaning temporary files")
            shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("Movielens1MReader: loading complete")
예제 #4
0
    def _load_from_original_file(self):
        """Load the Epinions ratings from the original bz2-compressed file.

        Looks for ``ratings_data.txt`` inside
        ``DATASET_SPLIT_ROOT_FOLDER + DATASET_SUBFOLDER``.  When it is missing,
        ``ratings_data.txt.bz2`` is opened (and first fetched with
        ``downloadFromURL(self.DATASET_URL, ...)`` if it cannot be opened) and
        written out as text through ``self._save_BZ2_in_text_file``.  The text
        file is then parsed (space separated, with header) by
        ``load_CSV_into_SparseBuilder``.

        Sets ``self.URM_all``, ``self.item_original_ID_to_index`` and
        ``self.user_original_ID_to_index``.

        Every file handle is closed before the decompressed file is deleted,
        and the decompressed file is deleted even when decompression or
        parsing raises, so a partially written file is never reused.
        """
        import os

        print("EpinionsReader: Loading original data")

        folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

        compressed_file_path = folder_path + "ratings_data.txt.bz2"
        decompressed_file_path = folder_path + "ratings_data.txt"

        # Probe for an existing decompressed file.  The handle is released
        # immediately: an open handle would block os.remove() on Windows.
        try:
            with open(decompressed_file_path, "r"):
                needs_decompression = False
        except FileNotFoundError:
            needs_decompression = True

        try:
            if needs_decompression:
                print(
                    "EpinionsReader: Unable to find decompressed data file. Decompressing..."
                )

                try:
                    compressed_file = bz2.open(compressed_file_path, "rb")
                except OSError:
                    print(
                        "EpinionsReader: Unable to find or open compressed data file. Downloading..."
                    )
                    downloadFromURL(self.DATASET_URL, folder_path,
                                    "ratings_data.txt.bz2")
                    compressed_file = bz2.open(compressed_file_path, "rb")

                # Both handles are closed on exit, on success and on error.
                with compressed_file, open(decompressed_file_path, "w") as decompressed_file:
                    self._save_BZ2_in_text_file(compressed_file, decompressed_file)

            print("EpinionsReader: loading URM")

            (self.URM_all,
             self.item_original_ID_to_index,
             self.user_original_ID_to_index) = load_CSV_into_SparseBuilder(
                decompressed_file_path, separator=" ", header=True)
        finally:
            print("EpinionsReader: cleaning temporary files")
            # The file may not exist when the download or decompression failed.
            if os.path.exists(decompressed_file_path):
                os.remove(decompressed_file_path)

        print("EpinionsReader: loading complete")
    def _load_from_original_file(self):
        """Load the Movielens 20M URM and genre ICM from the original zip archive.

        Opens ``ml-20m.zip`` inside
        ``DATASET_SPLIT_ROOT_FOLDER + DATASET_SUBFOLDER``; when the archive is
        missing or is not a valid zip file it is fetched with
        ``downloadFromURL(self.DATASET_URL, ...)`` and opened again.  The
        movies and ratings CSV files are extracted to a temporary
        ``decompressed/`` subfolder and parsed with ``self._loadICM_genres``
        and ``self._loadURM``.

        Sets ``self.ICM_genre``, ``self.tokenToFeatureMapper_ICM_genre``,
        ``self.item_original_ID_to_index``, ``self.URM_all`` and
        ``self.user_original_ID_to_index``.

        The archive handle is closed as soon as extraction is done and the
        temporary folder is removed even when parsing raises.
        """
        import shutil

        print("Movielens20MReader: Loading original data")

        zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
        archive_path = zipFile_path + "ml-20m.zip"
        decompressed_path = zipFile_path + "decompressed/"

        try:
            dataFile = zipfile.ZipFile(archive_path)
        except (FileNotFoundError, zipfile.BadZipFile):
            print("Movielens20MReader: Unable to fild data zip file. Downloading...")
            downloadFromURL(self.DATASET_URL, zipFile_path, "ml-20m.zip")
            dataFile = zipfile.ZipFile(archive_path)

        try:
            # Only the genre and rating files are consumed below, so the tags
            # file is not extracted.  The archive is closed on leaving the block.
            with dataFile:
                genres_path = dataFile.extract("ml-20m/movies.csv", path=decompressed_path)
                URM_path = dataFile.extract("ml-20m/ratings.csv", path=decompressed_path)

            # Start from an empty mapper so the attribute exists even if the
            # genre loader fails before assigning it.
            self.tokenToFeatureMapper_ICM_genre = {}

            print("Movielens20MReader: loading genres")
            (self.ICM_genre,
             self.tokenToFeatureMapper_ICM_genre,
             self.item_original_ID_to_index) = self._loadICM_genres(
                genres_path, header=True, separator=',', genresSeparator="|")

            print("Movielens20MReader: loading URM")
            # The second return value of the URM loader is discarded, so the
            # item mapper built by the genre loader is the one that is kept.
            self.URM_all, _, self.user_original_ID_to_index = self._loadURM(
                URM_path, separator=",", header=True,
                if_new_user="******", if_new_item="ignore")
        finally:
            # Runs on failure too, so extracted files never linger on disk.
            print("Movielens20MReader: cleaning temporary files")
            shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("Movielens20MReader: saving URM and ICM")
예제 #6
0
    def __init__(self):
        """Load the Amazon Instant Video train/validation/test split.

        A previously saved split is read with ``load_data_dict`` and each
        entry is set as an attribute on ``self``.  When that raises
        ``FileNotFoundError`` the ratings CSV is downloaded, turned into a
        sparse URM, filtered, split twice user-wise (first test, then
        validation, 20% each) and saved with ``save_data_dict``.

        Sets ``self.URM_train``, ``self.URM_validation`` and ``self.URM_test``
        when the split is rebuilt; when loaded from disk the attributes are
        whatever the saved dictionary contains.
        """
        split_folder = "Data_manager_split_datasets/AmazonInstantVideo/RecSys/SpectralCF_our_interface/"
        split_name = "splitted_data"
        ratings_csv = "ratings_Amazon_Instant_Video.csv"

        # Share of interactions held out at each of the two split steps.
        test_share = 0.2
        validation_share = 0.2

        # Make sure the folder holding the saved split exists.
        if not os.path.exists(split_folder):
            os.makedirs(split_folder)

        try:
            print(
                "Dataset_AmazonInstantVideo: Attempting to load pre-splitted data"
            )

            saved_split = load_data_dict(split_folder, split_name)
            for key, value in saved_split.items():
                setattr(self, key, value)

        except FileNotFoundError:

            print(
                "Dataset_AmazonInstantVideo: Pre-splitted data not found, building new one"
            )

            source_folder = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
            downloadFromURL(self.DATASET_URL, source_folder, ratings_csv)

            # The CSV has no header row; the timestamp column is read and dropped.
            ratings = pd.read_csv(source_folder + ratings_csv,
                                  sep=',',
                                  header=None,
                                  names=['user', 'item', 'rating', 'timestamp'])
            ratings = ratings[['user', 'item', 'rating']]

            # Row/column mappers are created on the fly from the raw IDs.
            builder = IncrementalSparseMatrix(auto_create_col_mapper=True,
                                              auto_create_row_mapper=True)
            builder.add_data_lists(ratings['user'].values,
                                   ratings['item'].values,
                                   ratings['rating'].values)
            URM_all = builder.get_SparseMatrix()

            # Keep only ratings equal to 5: every other entry becomes False
            # and is then dropped from the sparse structure.
            URM_all.data = (URM_all.data == 5)
            URM_all.eliminate_zeros()

            # Keep only users with at least 5 ratings.
            URM_all = ut.filter_urm(URM_all,
                                    user_min_number_ratings=5,
                                    item_min_number_ratings=1)

            # First carve out the test set, then split the remainder into
            # train and validation.
            URM_train_original, self.URM_test = split_train_validation_percentage_user_wise(
                URM_all, train_percentage=1 - test_share, verbose=False)

            self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
                URM_train_original,
                train_percentage=1 - validation_share,
                verbose=False)

            save_data_dict(
                {
                    "URM_train": self.URM_train,
                    "URM_test": self.URM_test,
                    "URM_validation": self.URM_validation,
                },
                split_folder,
                split_name)

        print("Dataset_AmazonInstantVideo: Dataset loaded")

        ut.print_stat_datareader(self)
    def _load_from_original_file(self):
        """Load the RecSys 2019 challenge URM and sub-class ICM from the zip archive.

        Opens ``recommender-system-2019-challenge-polimi.zip`` inside
        ``DATASET_SPLIT_ROOT_FOLDER + DATASET_SUBFOLDER``; when the archive is
        missing or is not a valid zip file it is fetched with
        ``downloadFromURL(self.DATASET_URL, ...)`` under that same file name
        and opened again, so the next run finds it.  The training interactions
        and the sub-class ICM are extracted to a temporary ``decompressed/``
        subfolder.  The ICM is parsed first because the item mapper it
        produces is handed to the URM loader.

        Results are stored in ``self._LOADED_ICM_DICT["ICM_classes"]``,
        ``self._LOADED_ICM_MAPPER_DICT["ICM_classes"]``,
        ``self._LOADED_URM_DICT["URM_all"]`` and
        ``self._LOADED_GLOBAL_MAPPER_DICT`` (user and item ID mappers).

        The archive handle is closed as soon as extraction is done and the
        temporary folder is removed even when parsing raises.
        """
        zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
        archive_name = "recommender-system-2019-challenge-polimi.zip"
        decompressed_path = zipFile_path + "decompressed/"

        try:
            dataFile = zipfile.ZipFile(zipFile_path + archive_name)
        except (FileNotFoundError, zipfile.BadZipFile):
            print(
                "RecSys2019Reader: Unable to find data zip file. Downloading..."
            )
            # Store the download under the name that is looked up above;
            # otherwise every run would miss the archive and download again.
            downloadFromURL(self.DATASET_URL, zipFile_path, archive_name)
            dataFile = zipfile.ZipFile(zipFile_path + archive_name)

        try:
            # The context manager closes the archive once the members are on disk.
            with dataFile:
                URM_path = dataFile.extract("data_train.csv",
                                            path=decompressed_path)
                ICM_sub_class = dataFile.extract("data_ICM_sub_class.csv",
                                                 path=decompressed_path)

            # TODO: data_ICM_asset.csv and data_ICM_price.csv are not loaded
            # yet.  Extract and parse them here, store them in
            # _LOADED_ICM_DICT / _LOADED_ICM_MAPPER_DICT and combine the ICMs
            # with merge_ICM into "ICM_all".

            print("RecSys2019Reader: loading ICM class")
            ICM_classes, tokenToFeatureMapper_ICM_classes, self.item_original_ID_to_index = _loadICM_assets(
                ICM_sub_class, header=True, separator=',', genresSeparator="|")
            self._LOADED_ICM_DICT["ICM_classes"] = ICM_classes
            self._LOADED_ICM_MAPPER_DICT[
                "ICM_classes"] = tokenToFeatureMapper_ICM_classes

            print("RecSys2019Reader: loading URM")
            URM_all, self.item_original_ID_to_index, self.user_original_ID_to_index = _loadURM_preinitialized_item_id(
                URM_path,
                separator=",",
                header=False,
                if_new_user="******",
                if_new_item="add",
                item_original_ID_to_index=self.item_original_ID_to_index)
            self._LOADED_URM_DICT["URM_all"] = URM_all
            self._LOADED_GLOBAL_MAPPER_DICT[
                "user_original_ID_to_index"] = self.user_original_ID_to_index
            self._LOADED_GLOBAL_MAPPER_DICT[
                "item_original_ID_to_index"] = self.item_original_ID_to_index

            print(URM_all.shape)
        finally:
            # Runs on failure too, so extracted files never linger on disk.
            print("RecSys2019Reader: cleaning temporary files")
            shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("RecSys2019Reader: loading complete")