def _load_from_original_file(self):
    """Load the Movielens 1M ratings from the original zip archive.

    Downloads the archive when it is not already present, extracts the
    ratings file, builds the URM and returns a Dataset wrapping it.

    :return: Dataset with "URM_all" and its (user, item) mappers
    """
    print("Movielens1MReader: Loading original data")

    zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        print("Movielens1MReader: Unable to find data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, zipFile_path, "ml-1m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")

    # Fix: close the ZipFile once extraction is done (handle was leaked before)
    try:
        URM_path = dataFile.extract("ml-1m/ratings.dat",
                                    path=zipFile_path + "decompressed/")
    finally:
        dataFile.close()

    URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(URM_path, separator="::")

    print("Movielens1MReader: cleaning temporary files")

    import shutil
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    print("Movielens1MReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
def _load_from_original_file(self):
    """Load the LastFM Hetrec 2011 data from the original zip archive.

    Downloads the archive when missing, extracts the user-artist URM and
    the tag assignments, builds "URM_all" and "ICM_all" (tags) and returns
    the resulting Dataset.

    :return: Dataset with "URM_all", "ICM_all" and their mappers
    """
    print("LastFMHetrec2011Reader: Loading original data")

    folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(folder_path + "hetrec2011-lastfm-2k.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        print("LastFMHetrec2011Reader: Unable to find or extract data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, folder_path, "hetrec2011-lastfm-2k.zip")
        dataFile = zipfile.ZipFile(folder_path + "hetrec2011-lastfm-2k.zip")

    # Fix: close the ZipFile once extraction is done (handle was leaked before)
    try:
        URM_path = dataFile.extract("user_artists.dat",
                                    path=folder_path + "decompressed")
        tags_path = dataFile.extract("user_taggedartists-timestamps.dat",
                                     path=folder_path + "decompressed")
    finally:
        dataFile.close()

    print("LastFMHetrec2011Reader: loading URM")
    URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(URM_path,
                                                                    separator="\t",
                                                                    header=True)

    print("LastFMHetrec2011Reader: loading tags")
    ICM_tags, feature_mapper, _ = self._loadICM_tags(tags_path, item_mapper,
                                                     header=True, separator='\t',
                                                     if_new_item="ignore")

    print("LastFMHetrec2011Reader: cleaning temporary files")

    import shutil
    shutil.rmtree(folder_path + "decompressed", ignore_errors=True)

    print("LastFMHetrec2011Reader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
                   ICM_dict={"ICM_all": ICM_tags},
                   ICM_mappers_dict={"ICM_all": (item_mapper.copy(), feature_mapper.copy())})
def _load_from_original_file(self):
    """Load the Epinions ratings from the original bz2-compressed file.

    If the decompressed ratings file is missing, the compressed one is
    opened (and downloaded first when absent), decompressed to disk,
    loaded into a URM, and the temporary decompressed file is removed.

    :return: Dataset with "URM_all" and its (user, item) mappers
    """
    print("EpinionsReader: Loading original data")

    import os

    folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
    compressed_file_path = folder_path + "ratings_data.txt.bz2"
    decompressed_file_path = folder_path + "ratings_data.txt"

    # Fix: probe for the file without leaking an open handle
    # (was: open(decompressed_file_path, "r") with the handle never closed)
    if not os.path.isfile(decompressed_file_path):
        print("EpinionsReader: Unable to find decompressed data file. Decompressing...")

        try:
            compressed_file = bz2.open(compressed_file_path, "rb")
        except OSError:
            # Narrowed from bare Exception: bz2.open failures on a missing or
            # unreadable file are OSError subclasses.
            print("EpinionsReader: Unable to find or open compressed data file. Downloading...")
            downloadFromURL(self.DATASET_URL, folder_path, "ratings_data.txt.bz2")
            compressed_file = bz2.open(compressed_file_path, "rb")

        # Fix: close both handles (compressed_file was leaked before)
        with compressed_file, open(decompressed_file_path, "w") as decompressed_file:
            self._save_BZ2_in_text_file(compressed_file, decompressed_file)

    print("EpinionsReader: loading URM")
    URM_all, item_mapper, user_mapper = load_CSV_into_SparseBuilder(decompressed_file_path,
                                                                    separator=" ",
                                                                    header=True)

    print("EpinionsReader: cleaning temporary files")
    os.remove(decompressed_file_path)

    print("EpinionsReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
def _load_from_original_file(self):
    """Load the Brightkite edge list from the original gzip archive.

    Downloads the compressed edge file when missing, decompresses it to a
    temporary text file, loads the URM from it and removes the temporary
    file afterwards.

    :return: Dataset with "URM_all" and its (user, item) mappers
    """
    print("BrightkiteReader: Loading original data")

    folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        compressed_file = gzip.open(folder_path + "loc-brightkite_edges.txt.gz", 'rb')
    except FileNotFoundError:
        print("BrightkiteReader: Unable to find or extract data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, folder_path, "loc-brightkite_edges.txt.gz")
        compressed_file = gzip.open(folder_path + "loc-brightkite_edges.txt.gz", 'rb')

    URM_path = folder_path + "loc-brightkite_edges.txt"

    # Fix: close both handles (compressed_file was leaked before)
    with compressed_file, open(URM_path, "w") as decompressed_file:
        self._save_GZ_in_text_file(compressed_file, decompressed_file)

    print("BrightkiteReader: loading URM")
    URM_all, item_mapper, user_mapper = self._loadURM(URM_path, separator="\t", header=False)

    print("BrightkiteReader: cleaning temporary files")

    import os
    os.remove(URM_path)

    print("BrightkiteReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())})
def _load_from_original_file(self):
    """Load the BookCrossing URM and ICM from the original zip archive.

    Ratings are in range 1-10; value -1 denotes an implicit rating.
    The ICM contains the author, publisher, year and tokens from the
    title; rare and overly common features are removed.

    :return: Dataset with "URM_all", "ICM_all" and their mappers
    """
    print("BookCrossingReader: Ratings are in range 1-10, value -1 refers to an implicit rating")
    print("BookCrossingReader: ICM contains the author, publisher, year and tokens from the title")
    print("BookCrossingReader: Loading original data")

    folder_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(folder_path + "BX-CSV-Dump.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        print("BookCrossingReader: Unable to find or extract data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, folder_path, "BX-CSV-Dump.zip")
        dataFile = zipfile.ZipFile(folder_path + "BX-CSV-Dump.zip")

    # Fix: close the ZipFile once extraction is done (handle was leaked before)
    try:
        URM_path = dataFile.extract("BX-Book-Ratings.csv", path=folder_path + "decompressed")
        ICM_path = dataFile.extract("BX-Books.csv", path=folder_path + "decompressed")
    finally:
        dataFile.close()

    print("BookCrossingReader: loading ICM")
    ICM_all, feature_mapper, item_mapper = self._loadICM(ICM_path, separator=';',
                                                         header=True, if_new_item="add")

    ICM_all, _, feature_mapper = removeFeatures(ICM_all, minOccurrence=5,
                                                maxPercOccurrence=0.30,
                                                reconcile_mapper=feature_mapper)

    print("BookCrossingReader: loading URM")
    # NOTE(review): the original literal here was masked ("******"); "add" is the
    # if_new_user mode consistent with this framework's readers -- confirm against VCS.
    URM_all, _, user_mapper = self._loadURM(URM_path, item_mapper, separator=";",
                                            header=True, if_new_user="add",
                                            if_new_item="ignore")

    print("BookCrossingReader: cleaning temporary files")

    import shutil
    shutil.rmtree(folder_path + "decompressed", ignore_errors=True)

    print("BookCrossingReader: loading complete")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
                   ICM_dict={"ICM_all": ICM_all},
                   ICM_mappers_dict={"ICM_all": (item_mapper.copy(), feature_mapper.copy())})
def _get_ICM_metadata_path(self, data_folder, compressed_file_name, decompressed_file_name, file_url): """ Metadata files are .csv :param data_folder: :param file_name: :param file_url: :return: """ try: open(data_folder + decompressed_file_name, "r") except FileNotFoundError: print("AmazonReviewDataReader: Decompressing metadata file...") try: decompressed_file = open(data_folder + decompressed_file_name, "wb") compressed_file = gzip.open(data_folder + compressed_file_name, "rb") decompressed_file.write(compressed_file.read()) compressed_file.close() decompressed_file.close() except (FileNotFoundError, Exception): print("AmazonReviewDataReader: Unable to find or decompress compressed file. Downloading...") downloadFromURL(file_url, data_folder, compressed_file_name) decompressed_file = open(data_folder + decompressed_file_name, "wb") compressed_file = gzip.open(data_folder + compressed_file_name, "rb") decompressed_file.write(compressed_file.read()) compressed_file.close() decompressed_file.close() return data_folder + decompressed_file_name
def _get_URM_review_path(self, data_folder, file_name, file_url): """ Metadata files are .csv :param data_folder: :param file_name: :param file_url: :return: """ try: open(data_folder + file_name, "r") except FileNotFoundError: print("AmazonReviewDataReader: Unable to find or open review file. Downloading...") downloadFromURL(file_url, data_folder, file_name) return data_folder + file_name
def _load_from_original_file(self):
    """Load the Movielens 20M data from the original zip archive.

    Downloads the archive when missing, extracts movies/tags/ratings,
    builds "URM_all" plus genre, tag and merged ICMs, and returns the
    resulting Dataset.

    :return: Dataset with "URM_all", "ICM_genres", "ICM_tags", "ICM_all"
    """
    print("Movielens20MReader: Loading original data")

    zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        # Fix: corrected typo "fild" -> "find" in the log message
        print("Movielens20MReader: Unable to find data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, zipFile_path, "ml-20m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")

    # Fix: close the ZipFile once extraction is done (handle was leaked before)
    try:
        genres_path = dataFile.extract("ml-20m/movies.csv", path=zipFile_path + "decompressed/")
        tags_path = dataFile.extract("ml-20m/tags.csv", path=zipFile_path + "decompressed/")
        URM_path = dataFile.extract("ml-20m/ratings.csv", path=zipFile_path + "decompressed/")
    finally:
        dataFile.close()

    print("Movielens20MReader: loading genres")
    ICM_genres, genres_mapper, item_mapper = self._loadICM_genres(genres_path, header=True,
                                                                  separator=',', genresSeparator="|")

    print("Movielens20MReader: loading tags")
    ICM_tags, tags_mapper, _ = self._loadICM_tags(tags_path, item_mapper, header=True,
                                                  separator=',', if_new_item="ignore")

    print("Movielens20MReader: loading URM")
    # NOTE(review): the original literal here was masked ("******"); "add" is the
    # if_new_user mode consistent with this framework's readers -- confirm against VCS.
    URM_all, _, user_mapper = self._loadURM(URM_path, item_mapper, separator=",",
                                            header=True, if_new_user="add",
                                            if_new_item="ignore")

    ICM_all, feature_mapper = self._merge_ICM(ICM_genres, ICM_tags, genres_mapper, tags_mapper)

    print("Movielens20MReader: cleaning temporary files")

    import shutil
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    print("Movielens20MReader: saving URM and ICM")

    return Dataset(self.get_dataset_name(),
                   URM_dict={"URM_all": URM_all},
                   URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
                   ICM_dict={"ICM_genres": ICM_genres,
                             "ICM_tags": ICM_tags,
                             "ICM_all": ICM_all},
                   ICM_mappers_dict={"ICM_genres": (item_mapper.copy(), genres_mapper.copy()),
                                     "ICM_tags": (item_mapper.copy(), tags_mapper.copy()),
                                     "ICM_all": (item_mapper.copy(), feature_mapper.copy())})
def _load_from_original_file(self):
    """Load the Movielens Hetrec 2011 data from the original zip archive.

    Downloads the archive when missing, extracts the movie metadata and
    ratings files, builds "URM_all" plus per-attribute ICMs (years, genres,
    directors, actors, countries, locations) and their progressive merge
    into "ICM_all", and returns the resulting Dataset.

    :return: Dataset with "URM_all" and all ICM matrices/mappers
    """
    print("MovielensHetrecReader: Loading original data")

    zipFile_path = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        # Fix: corrected typo "fild" -> "find" in the log message
        print("MovielensHetrecReader: Unable to find data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, zipFile_path, "hetrec2011-movielens-2k-v2.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")

    # Fix: close the ZipFile once extraction is done (handle was leaked before)
    try:
        movies_path = dataFile.extract("movies.dat", path=zipFile_path + "decompressed/")
        genres_path = dataFile.extract("movie_genres.dat", path=zipFile_path + "decompressed/")
        directors_path = dataFile.extract("movie_directors.dat", path=zipFile_path + "decompressed/")
        actors_path = dataFile.extract("movie_actors.dat", path=zipFile_path + "decompressed/")
        countries_path = dataFile.extract("movie_countries.dat", path=zipFile_path + "decompressed/")
        locations_path = dataFile.extract("movie_locations.dat", path=zipFile_path + "decompressed/")
        URM_path = dataFile.extract("user_ratedmovies.dat", path=zipFile_path + "decompressed/")
    finally:
        dataFile.close()

    print("MovielensHetrecReader: loading years")
    # Column 5 of movies.dat holds the release year
    ICM_years, years_mapper, item_mapper = self._load_tsv(movies_path, None,
                                                          feature_columns=[5],
                                                          header=True, if_new_item="add")

    print("MovielensHetrecReader: loading genres")
    ICM_genres, genres_mapper, _ = self._load_tsv(genres_path, item_mapper,
                                                  header=True, if_new_item="ignore")

    # Progressively merge each attribute ICM into ICM_all
    ICM_all, feature_mapper = self._merge_ICM(ICM_genres, ICM_years,
                                              genres_mapper, years_mapper)

    print("MovielensHetrecReader: loading directors")
    ICM_directors, directors_mapper, _ = self._load_tsv(directors_path, item_mapper,
                                                        header=True, if_new_item="ignore")

    ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_directors,
                                              feature_mapper, directors_mapper)

    print("MovielensHetrecReader: loading actors")
    ICM_actors, actors_mapper, _ = self._load_tsv(actors_path, item_mapper,
                                                  header=True, if_new_item="ignore")

    ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_actors,
                                              feature_mapper, actors_mapper)

    print("MovielensHetrecReader: loading countries")
    ICM_countries, countries_mapper, _ = self._load_tsv(countries_path, item_mapper,
                                                        header=True, if_new_item="ignore")

    ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_countries,
                                              feature_mapper, countries_mapper)

    print("MovielensHetrecReader: loading locations")
    # Columns 1-3 of movie_locations.dat hold the location hierarchy
    ICM_locations, locations_mapper, _ = self._load_tsv(locations_path, item_mapper,
                                                        feature_columns=[1, 2, 3],
                                                        header=True, if_new_item="ignore")

    ICM_all, feature_mapper = self._merge_ICM(ICM_all, ICM_locations,
                                              feature_mapper, locations_mapper)

    print("MovielensHetrecReader: loading URM")
    # NOTE(review): the original literal here was masked ("******"); "add" is the
    # if_new_user mode consistent with this framework's readers -- confirm against VCS.
    URM_all, _, user_mapper = self._loadURM(URM_path, item_mapper, separator="\t",
                                            header=True, if_new_user="add",
                                            if_new_item="ignore")

    print("MovielensHetrecReader: cleaning temporary files")

    import shutil
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    print("MovielensHetrecReader: saving URM and ICM")

    return Dataset(
        self.get_dataset_name(),
        URM_dict={"URM_all": URM_all},
        URM_mappers_dict={"URM_all": (user_mapper.copy(), item_mapper.copy())},
        ICM_dict={
            "ICM_genres": ICM_genres,
            "ICM_years": ICM_years,
            "ICM_all": ICM_all,
            "ICM_directors": ICM_directors,
            "ICM_actors": ICM_actors,
            "ICM_countries": ICM_countries,
            "ICM_locations": ICM_locations,
        },
        ICM_mappers_dict={
            "ICM_genres": (item_mapper.copy(), genres_mapper.copy()),
            "ICM_years": (item_mapper.copy(), years_mapper.copy()),
            "ICM_directors": (item_mapper.copy(), directors_mapper.copy()),
            "ICM_actors": (item_mapper.copy(), actors_mapper.copy()),
            "ICM_countries": (item_mapper.copy(), countries_mapper.copy()),
            "ICM_locations": (item_mapper.copy(), locations_mapper.copy()),
            "ICM_all": (item_mapper.copy(), feature_mapper.copy())
        })