コード例 #1
0
    def _load_from_original_file(self):
        """Load the Movielens 1M dataset from the original zip archive,
        downloading it first if it is not available locally.

        :return: Dataset containing URM_all and its (user, item) mappers.
        """

        print("Movielens1MReader: Loading original data")

        zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:
            dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")
        except (FileNotFoundError, zipfile.BadZipFile):
            # Archive missing or corrupted: download a fresh copy and retry.
            print("Movielens1MReader: Unable to find data zip file. Downloading...")
            downloadFromURL(self.DATASET_URL, zipFile_path, "ml-1m.zip")
            dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")

        try:
            URM_path = dataFile.extract("ml-1m/ratings.dat", path=zipFile_path + "decompressed/")
        finally:
            # BUGFIX: the ZipFile handle was previously never closed (resource leak).
            dataFile.close()

        URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(URM_path, separator="::")

        print("Movielens1MReader: cleaning temporary files")

        import shutil
        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("Movielens1MReader: loading complete")

        return Dataset(self.get_dataset_name(), URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
コード例 #2
0
    def _load_from_original_file(self):
        """Load the LastFM-HetRec-2011 dataset from the original zip archive,
        downloading it first if it is not available locally.

        :return: Dataset with URM_all and the tag-based ICM (exposed as ICM_all).
        """

        print("LastFMHetrec2011Reader: Loading original data")

        folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:
            dataFile = zipfile.ZipFile(folder_path + "hetrec2011-lastfm-2k.zip")
        except (FileNotFoundError, zipfile.BadZipFile):
            # Archive missing or corrupted: download a fresh copy and retry.
            print(
                "LastFMHetrec2011Reader: Unable to find or extract data zip file. Downloading..."
            )
            downloadFromURL(self.DATASET_URL, folder_path, "hetrec2011-lastfm-2k.zip")
            dataFile = zipfile.ZipFile(folder_path + "hetrec2011-lastfm-2k.zip")

        try:
            URM_path = dataFile.extract("user_artists.dat",
                                        path=folder_path + "decompressed")
            tags_path = dataFile.extract("user_taggedartists-timestamps.dat",
                                         path=folder_path + "decompressed")
        finally:
            # BUGFIX: the ZipFile handle was previously never closed (resource leak).
            dataFile.close()

        print("LastFMHetrec2011Reader: loading URM")
        URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(
            URM_path, separator="\t", header=True)

        print("LastFMHetrec2011Reader: loading tags")
        # Tags for items not present in the URM are discarded (if_new_item="ignore").
        ICM_tags, feature_mapper, _ = self._loadICM_tags(
            tags_path, item_mapper, header=True, separator='\t', if_new_item="ignore")

        print("LastFMHetrec2011Reader: cleaning temporary files")

        import shutil
        shutil.rmtree(folder_path + "decompressed", ignore_errors=True)

        print("LastFMHetrec2011Reader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
                       ICM_dict={"ICM_all": ICM_tags},
                       ICM_mappers_dict={"ICM_all": (item_mapper.copy(), feature_mapper.copy())})
コード例 #3
0
    def _load_from_original_file(self):
        """Load the Epinions dataset, decompressing (and if necessary
        downloading) the bz2 ratings file first.

        :return: Dataset containing URM_all and its (user, item) mappers.
        """

        print("EpinionsReader: Loading original data")

        folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        compressed_file_path = folder_path + "ratings_data.txt.bz2"
        decompressed_file_path = folder_path + "ratings_data.txt"

        try:
            # Probe for an existing decompressed file.
            # BUGFIX: close the probe handle (it was previously left open,
            # leaking a file descriptor).
            open(decompressed_file_path, "r").close()
        except FileNotFoundError:
            print(
                "EpinionsReader: Unable to find decompressed data file. Decompressing..."
            )

            try:
                compressed_file = bz2.open(compressed_file_path, "rb")
            except Exception:
                # Compressed file missing or unreadable: download then retry.
                print(
                    "EpinionsReader: Unable to find or open compressed data file. Downloading..."
                )
                downloadFromURL(self.DATASET_URL, folder_path, "ratings_data.txt.bz2")
                compressed_file = bz2.open(compressed_file_path, "rb")

            # BUGFIX: close both files even on error (the compressed handle
            # was previously never closed).
            with compressed_file, open(decompressed_file_path, "w") as decompressed_file:
                self._save_BZ2_in_text_file(compressed_file, decompressed_file)

        print("EpinionsReader: loading URM")

        URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(
            decompressed_file_path, separator=" ", header=True)

        print("EpinionsReader: cleaning temporary files")

        import os
        os.remove(decompressed_file_path)

        print("EpinionsReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
コード例 #4
0
    def _load_from_original_file(self):
        """Load the Brightkite dataset from the original gzip file,
        downloading it first if it is not available locally.

        :return: Dataset containing URM_all and its (user, item) mappers.
        """

        print("BrightkiteReader: Loading original data")

        folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:
            compressed_file = gzip.open(folder_path + "loc-brightkite_edges.txt.gz", 'rb')
        except FileNotFoundError:
            # File missing: download a fresh copy and retry.
            print(
                "BrightkiteReader: Unable to find or extract data zip file. Downloading..."
            )
            downloadFromURL(self.DATASET_URL, folder_path, "loc-brightkite_edges.txt.gz")
            compressed_file = gzip.open(folder_path + "loc-brightkite_edges.txt.gz", 'rb')

        URM_path = folder_path + "loc-brightkite_edges.txt"

        # BUGFIX: close both handles even on error (the compressed-file handle
        # was previously never closed).
        with compressed_file, open(URM_path, "w") as decompressed_file:
            self._save_GZ_in_text_file(compressed_file, decompressed_file)

        print("BrightkiteReader: loading URM")
        URM_all, item_mapper, user_mapper = self._loadURM(URM_path,
                                                          separator="\t",
                                                          header=False)

        print("BrightkiteReader: cleaning temporary files")

        import os
        os.remove(URM_path)

        print("BrightkiteReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
コード例 #5
0
    def _load_from_original_file(self):
        """Load the BookCrossing dataset from the original zip archive,
        downloading it first if it is not available locally.

        Ratings are in range 1-10; value -1 marks an implicit rating. The ICM
        contains the author, publisher, year and tokens from the title.

        :return: Dataset with URM_all and ICM_all.
        """

        print("BookCrossingReader: Ratings are in range 1-10, value -1 refers to an implicit rating")
        print("BookCrossingReader: ICM contains the author, publisher, year and tokens from the title")

        print("BookCrossingReader: Loading original data")

        folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:
            dataFile = zipfile.ZipFile(folder_path + "BX-CSV-Dump.zip")
        except (FileNotFoundError, zipfile.BadZipFile):
            # Archive missing or corrupted: download a fresh copy and retry.
            print("BookCrossingReader: Unable to find or extract data zip file. Downloading...")
            downloadFromURL(self.DATASET_URL, folder_path, "BX-CSV-Dump.zip")
            dataFile = zipfile.ZipFile(folder_path + "BX-CSV-Dump.zip")

        try:
            URM_path = dataFile.extract("BX-Book-Ratings.csv", path=folder_path + "decompressed")
            ICM_path = dataFile.extract("BX-Books.csv", path=folder_path + "decompressed")
        finally:
            # BUGFIX: the ZipFile handle was previously never closed (resource leak).
            dataFile.close()

        print("BookCrossingReader: loading ICM")
        ICM_all, feature_mapper, item_mapper = self._loadICM(ICM_path, separator=';', header=True, if_new_item="add")

        # Drop features occurring fewer than 5 times or in more than 30% of items.
        ICM_all, _, feature_mapper = removeFeatures(ICM_all, minOccurrence=5, maxPercOccurrence=0.30,
                                                    reconcile_mapper=feature_mapper)

        print("BookCrossingReader: loading URM")
        # NOTE(review): if_new_user was corrupted to "******" in the source;
        # "add" is the reconstructed value -- confirm against the original project.
        URM_all, _, user_mapper = self._loadURM(URM_path, item_mapper, separator=";", header=True,
                                                if_new_user="add", if_new_item="ignore")

        print("BookCrossingReader: cleaning temporary files")

        import shutil
        shutil.rmtree(folder_path + "decompressed", ignore_errors=True)

        print("BookCrossingReader: loading complete")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
                       ICM_dict={"ICM_all": ICM_all},
                       ICM_mappers_dict={"ICM_all": (item_mapper.copy(), feature_mapper.copy())})
コード例 #6
0
    def _get_ICM_metadata_path(self, data_folder, compressed_file_name, decompressed_file_name, file_url):
        """
        Metadata files are .csv
        :param data_folder:
        :param file_name:
        :param file_url:
        :return:
        """

        try:

            open(data_folder + decompressed_file_name, "r")

        except FileNotFoundError:

            print("AmazonReviewDataReader: Decompressing metadata file...")

            try:

                decompressed_file = open(data_folder + decompressed_file_name, "wb")

                compressed_file = gzip.open(data_folder + compressed_file_name, "rb")
                decompressed_file.write(compressed_file.read())

                compressed_file.close()
                decompressed_file.close()

            except (FileNotFoundError, Exception):

                print("AmazonReviewDataReader: Unable to find or decompress compressed file. Downloading...")

                downloadFromURL(file_url, data_folder, compressed_file_name)

                decompressed_file = open(data_folder + decompressed_file_name, "wb")

                compressed_file = gzip.open(data_folder + compressed_file_name, "rb")
                decompressed_file.write(compressed_file.read())

                compressed_file.close()
                decompressed_file.close()

        return data_folder + decompressed_file_name
コード例 #7
0
    def _get_URM_review_path(self, data_folder, file_name, file_url):
        """
        Metadata files are .csv
        :param data_folder:
        :param file_name:
        :param file_url:
        :return:
        """

        try:

            open(data_folder + file_name, "r")

        except FileNotFoundError:

            print("AmazonReviewDataReader: Unable to find or open review file. Downloading...")

            downloadFromURL(file_url, data_folder, file_name)

        return data_folder + file_name
コード例 #8
0
    def _load_from_original_file(self):
        """Load the Movielens 20M dataset from the original zip archive,
        downloading it first if it is not available locally.

        :return: Dataset with URM_all, ICM_genres, ICM_tags and the merged ICM_all.
        """

        print("Movielens20MReader: Loading original data")

        zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:
            dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")
        except (FileNotFoundError, zipfile.BadZipFile):
            # Archive missing or corrupted: download a fresh copy and retry.
            # BUGFIX: error-message typo "fild" corrected to "find".
            print(
                "Movielens20MReader: Unable to find data zip file. Downloading..."
            )
            downloadFromURL(self.DATASET_URL, zipFile_path, "ml-20m.zip")
            dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")

        try:
            genres_path = dataFile.extract("ml-20m/movies.csv", path=zipFile_path + "decompressed/")
            tags_path = dataFile.extract("ml-20m/tags.csv", path=zipFile_path + "decompressed/")
            URM_path = dataFile.extract("ml-20m/ratings.csv", path=zipFile_path + "decompressed/")
        finally:
            # BUGFIX: the ZipFile handle was previously never closed (resource leak).
            dataFile.close()

        print("Movielens20MReader: loading genres")
        ICM_genres, genres_mapper, item_mapper = self._loadICM_genres(
            genres_path, header=True, separator=',', genresSeparator="|")

        print("Movielens20MReader: loading tags")
        # Tags for items not seen while loading genres are discarded.
        ICM_tags, tags_mapper, _ = self._loadICM_tags(
            tags_path, item_mapper, header=True, separator=',', if_new_item="ignore")

        print("Movielens20MReader: loading URM")
        # NOTE(review): if_new_user was corrupted to "******" in the source;
        # "add" is the reconstructed value -- confirm against the original project.
        URM_all, _, user_mapper = self._loadURM(
            URM_path, item_mapper, separator=",", header=True,
            if_new_user="add", if_new_item="ignore")

        ICM_all, feature_mapper = self._merge_ICM(ICM_genres, ICM_tags,
                                                  genres_mapper, tags_mapper)

        print("Movielens20MReader: cleaning temporary files")

        import shutil
        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("Movielens20MReader: saving URM and ICM")

        return Dataset(self.get_dataset_name(),
                       URM_dict={"URM_all": URM_all},
                       URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
                       ICM_dict={
                           "ICM_genres": ICM_genres,
                           "ICM_tags": ICM_tags,
                           "ICM_all": ICM_all
                       },
                       ICM_mappers_dict={
                           "ICM_genres": (item_mapper.copy(), genres_mapper.copy()),
                           "ICM_tags": (item_mapper.copy(), tags_mapper.copy()),
                           "ICM_all": (item_mapper.copy(), feature_mapper.copy())
                       })
コード例 #9
0
    def _load_from_original_file(self):
        """Load the Movielens HetRec 2011 dataset from the original zip
        archive, downloading it first if it is not available locally.

        Builds URM_all plus per-feature ICMs (years, genres, directors,
        actors, countries, locations) and a cumulatively merged ICM_all.

        :return: Dataset with URM_all and all the ICMs listed above.
        """

        print("MovielensHetrecReader: Loading original data")

        zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

        try:
            dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")
        except (FileNotFoundError, zipfile.BadZipFile):
            # Archive missing or corrupted: download a fresh copy and retry.
            # BUGFIX: error-message typo "fild" corrected to "find".
            print(
                "MovielensHetrecReader: Unable to find data zip file. Downloading..."
            )
            downloadFromURL(self.DATASET_URL, zipFile_path, "hetrec2011-movielens-2k-v2.zip")
            dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")

        try:
            movies_path = dataFile.extract("movies.dat", path=zipFile_path + "decompressed/")
            genres_path = dataFile.extract("movie_genres.dat", path=zipFile_path + "decompressed/")
            directors_path = dataFile.extract("movie_directors.dat", path=zipFile_path + "decompressed/")
            actors_path = dataFile.extract("movie_actors.dat", path=zipFile_path + "decompressed/")
            countries_path = dataFile.extract("movie_countries.dat", path=zipFile_path + "decompressed/")
            locations_path = dataFile.extract("movie_locations.dat", path=zipFile_path + "decompressed/")
            URM_path = dataFile.extract("user_ratedmovies.dat", path=zipFile_path + "decompressed/")
        finally:
            # BUGFIX: the ZipFile handle was previously never closed (resource leak).
            dataFile.close()

        print("MovielensHetrecReader: loading years")
        # feature_columns=[5]: presumably the year column of movies.dat -- per
        # the log message; this pass also builds the item mapper (if_new_item="add").
        ICM_years, years_mapper, item_mapper = self._load_tsv(
            movies_path, None, feature_columns=[5], header=True, if_new_item="add")

        print("MovielensHetrecReader: loading genres")
        ICM_genres, genres_mapper, _ = self._load_tsv(
            genres_path, item_mapper, header=True, if_new_item="ignore")
        ICM_all, feature_mapper = self._merge_ICM(ICM_genres, ICM_years,
                                                  genres_mapper, years_mapper)

        print("MovielensHetrecReader: loading directors")
        ICM_directors, directors_mapper, _ = self._load_tsv(
            directors_path, item_mapper, header=True, if_new_item="ignore")
        ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_directors,
                                                  feature_mapper, directors_mapper)

        print("MovielensHetrecReader: loading actors")
        ICM_actors, actors_mapper, _ = self._load_tsv(
            actors_path, item_mapper, header=True, if_new_item="ignore")
        ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_actors,
                                                  feature_mapper, actors_mapper)

        print("MovielensHetrecReader: loading countries")
        ICM_countries, countries_mapper, _ = self._load_tsv(
            countries_path, item_mapper, header=True, if_new_item="ignore")
        ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_countries,
                                                  feature_mapper, countries_mapper)

        print("MovielensHetrecReader: loading locations")
        # The locations file contributes three feature columns.
        ICM_locations, locations_mapper, _ = self._load_tsv(
            locations_path, item_mapper, feature_columns=[1, 2, 3],
            header=True, if_new_item="ignore")
        ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_locations,
                                                  feature_mapper, locations_mapper)

        print("MovielensHetrecReader: loading URM")
        # NOTE(review): if_new_user was corrupted to "******" in the source;
        # "add" is the reconstructed value -- confirm against the original project.
        URM_all, _, user_mapper = self._loadURM(
            URM_path, item_mapper, separator="\t", header=True,
            if_new_user="add", if_new_item="ignore")

        print("MovielensHetrecReader: cleaning temporary files")

        import shutil
        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        print("MovielensHetrecReader: saving URM and ICM")

        return Dataset(
            self.get_dataset_name(),
            URM_dict={"URM_all": URM_all},
            URM_mappers_dict={
                "URM_all": (user_mapper.copy(), item_mapper.copy())
            },
            ICM_dict={
                "ICM_genres": ICM_genres,
                "ICM_years": ICM_years,
                "ICM_all": ICM_all,
                "ICM_directors": ICM_directors,
                "ICM_actors": ICM_actors,
                "ICM_countries": ICM_countries,
                "ICM_locations": ICM_locations,
            },
            ICM_mappers_dict={
                "ICM_genres": (item_mapper.copy(), genres_mapper.copy()),
                "ICM_years": (item_mapper.copy(), years_mapper.copy()),
                "ICM_directors": (item_mapper.copy(), directors_mapper.copy()),
                "ICM_actors": (item_mapper.copy(), actors_mapper.copy()),
                "ICM_countries": (item_mapper.copy(), countries_mapper.copy()),
                "ICM_locations": (item_mapper.copy(), locations_mapper.copy()),
                "ICM_all": (item_mapper.copy(), feature_mapper.copy())
            })