def _load_from_original_file(self):
    """Load the Movielens 100K interactions from the original zip archive.

    Downloads the archive if it is missing or corrupted, extracts ``u.data``,
    builds the URM and the ID-to-index mappers, then removes the temporary
    extraction folder.
    """
    # Load data from original
    print("Movielens100KReader: Loading original data")

    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-100k.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        # Archive missing or unreadable: fetch a fresh copy and retry once.
        # Fixed typo in the log message ("fild" -> "find").
        print("Movielens100KReader: Unable to find data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, zipFile_path, "ml-100k.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-100k.zip")

    URM_path = dataFile.extract("ml-100k/u.data",
                                path=zipFile_path + "decompressed/")

    # u.data is tab-separated with no header: user \t item \t rating \t timestamp
    self.URM_all, self.item_original_ID_to_index, self.user_original_ID_to_index = load_CSV_into_SparseBuilder(
        URM_path, separator="\t", header=False)

    print("Movielens100KReader: cleaning temporary files")

    import shutil
    # ignore_errors: cleanup is best-effort, the data is already in memory
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    print("Movielens100KReader: loading complete")
def _load_from_original_file(self):
    """Load the Movielens 10M dataset (URM with timestamps, genre and tag ICMs).

    Downloads the archive if missing, extracts movies/tags/ratings, populates
    the reader's ``_LOADED_*`` dictionaries, merges genre and tag ICMs into
    ``ICM_all``, then removes the temporary extraction folder.
    """
    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-10m.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        # Archive missing or unreadable: fetch a fresh copy and retry once.
        # Fixed typo in the log message ("fild" -> "find").
        print("Movielens10MReader: Unable to find data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, zipFile_path, "ml-10m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-10m.zip")

    genres_path = dataFile.extract("ml-10M100K/movies.dat", path=zipFile_path + "decompressed/")
    tags_path = dataFile.extract("ml-10M100K/tags.dat", path=zipFile_path + "decompressed/")
    URM_path = dataFile.extract("ml-10M100K/ratings.dat", path=zipFile_path + "decompressed/")

    print("Movielens10MReader: loading genres")
    # Genre loading also establishes the item ID -> index mapper reused below.
    ICM_genres, tokenToFeatureMapper_ICM_genres, self.item_original_ID_to_index = _loadICM_genres(
        genres_path, header=True, separator='::', genresSeparator="|")

    self._LOADED_ICM_DICT["ICM_genres"] = ICM_genres
    self._LOADED_ICM_MAPPER_DICT["ICM_genres"] = tokenToFeatureMapper_ICM_genres

    print("Movielens10MReader: loading tags")
    # if_new_item="ignore": tags for items absent from movies.dat are dropped.
    ICM_tags, tokenToFeatureMapper_ICM_tags, _ = _loadICM_tags(
        tags_path, header=True, separator='::', if_new_item="ignore",
        item_original_ID_to_index=self.item_original_ID_to_index)

    self._LOADED_ICM_DICT["ICM_tags"] = ICM_tags
    self._LOADED_ICM_MAPPER_DICT["ICM_tags"] = tokenToFeatureMapper_ICM_tags

    print("Movielens10MReader: loading URM")
    # NOTE(review): if_new_user value looks like a placeholder ("******") —
    # confirm the intended policy (presumably "add") against _loadURM_preinitialized_item_id.
    URM_all, self.item_original_ID_to_index, self.user_original_ID_to_index, URM_timestamp = _loadURM_preinitialized_item_id(
        URM_path, separator="::", header=False,
        if_new_user="******", if_new_item="ignore",
        item_original_ID_to_index=self.item_original_ID_to_index)

    self._LOADED_URM_DICT["URM_all"] = URM_all
    self._LOADED_URM_DICT["URM_timestamp"] = URM_timestamp

    self._LOADED_GLOBAL_MAPPER_DICT["user_original_ID_to_index"] = self.user_original_ID_to_index
    self._LOADED_GLOBAL_MAPPER_DICT["item_original_ID_to_index"] = self.item_original_ID_to_index

    # Combine genre and tag features into a single item content matrix.
    ICM_all, tokenToFeatureMapper_ICM_all = merge_ICM(ICM_genres, ICM_tags,
                                                      tokenToFeatureMapper_ICM_genres,
                                                      tokenToFeatureMapper_ICM_tags)

    self._LOADED_ICM_DICT["ICM_all"] = ICM_all
    self._LOADED_ICM_MAPPER_DICT["ICM_all"] = tokenToFeatureMapper_ICM_all

    print("Movielens10MReader: cleaning temporary files")

    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    print("Movielens10MReader: loading complete")
def _load_from_original_file(self):
    """Load the Movielens 1M dataset: genre ICM, user UCM and the URM.

    Downloads the archive if missing, extracts movies/users/ratings, builds
    the matrices and the ID mappers, then removes the temporary extraction
    folder.
    """
    # Load data from original
    print("Movielens1MReader: Loading original data")

    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        # Archive missing or unreadable: fetch a fresh copy and retry once.
        # Fixed typo in the log message ("fild" -> "find").
        print("Movielens1MReader: Unable to find data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, zipFile_path, "ml-1m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")

    ICM_genre_path = dataFile.extract("ml-1m/movies.dat", path=zipFile_path + "decompressed/")
    UCM_path = dataFile.extract("ml-1m/users.dat", path=zipFile_path + "decompressed/")
    URM_path = dataFile.extract("ml-1m/ratings.dat", path=zipFile_path + "decompressed/")

    self.tokenToFeatureMapper_ICM_genres = {}
    self.tokenToFeatureMapper_UCM_all = {}

    print("Movielens1MReader: loading genres")
    # Genre loading also establishes the item ID -> index mapper reused below.
    self.ICM_genres, self.tokenToFeatureMapper_ICM_genres, self.item_original_ID_to_index = _loadICM_genres(
        ICM_genre_path, header=True, separator='::', genresSeparator="|")

    print("Movielens1MReader: loading UCM")
    # UCM loading establishes the user ID -> index mapper reused below.
    self.UCM_all, self.tokenToFeatureMapper_UCM_all, self.user_original_ID_to_index = _loadUCM(
        UCM_path, header=True, separator='::')

    print("Movielens1MReader: loading URM")
    # NOTE(review): if_new_user value looks like a placeholder ("******") —
    # confirm the intended policy against _loadURM_preinitialized_item_id.
    self.URM_all, self.item_original_ID_to_index, self.user_original_ID_to_index = _loadURM_preinitialized_item_id(
        URM_path, separator="::", header=True,
        if_new_user="******", if_new_item="ignore",
        item_original_ID_to_index=self.item_original_ID_to_index,
        user_original_ID_to_index=self.user_original_ID_to_index)

    print("Movielens1MReader: cleaning temporary files")

    # ignore_errors: cleanup is best-effort, the data is already in memory
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    print("Movielens1MReader: loading complete")
def _load_from_original_file(self):
    """Load the Epinions ratings from the bz2-compressed original file.

    If the decompressed text file is absent, decompress it (downloading the
    bz2 archive first when needed), then build the URM and delete the
    temporary decompressed file.
    """
    # Load data from original
    print("EpinionsReader: Loading original data")

    import os

    folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    compressed_file_path = folder_path + "ratings_data.txt.bz2"
    decompressed_file_path = folder_path + "ratings_data.txt"

    # BUG FIX: the original probed for the file with an un-closed
    # open(decompressed_file_path, "r"), leaking a file handle. An existence
    # check has the same effect without opening anything.
    if not os.path.isfile(decompressed_file_path):
        print("EpinionsReader: Unable to find decompressed data file. Decompressing...")

        try:
            compressed_file = bz2.open(compressed_file_path, "rb")
        except Exception:
            print("EpinionsReader: Unable to find or open compressed data file. Downloading...")
            downloadFromURL(self.DATASET_URL, folder_path, "ratings_data.txt.bz2")
            compressed_file = bz2.open(compressed_file_path, "rb")

        # BUG FIX: both handles are now closed deterministically via `with`
        # (the compressed handle was never closed in the original).
        with compressed_file, open(decompressed_file_path, "w") as decompressed_file:
            self._save_BZ2_in_text_file(compressed_file, decompressed_file)

    print("EpinionsReader: loading URM")

    # File is space-separated with a header row: user item rating
    self.URM_all, self.item_original_ID_to_index, self.user_original_ID_to_index = load_CSV_into_SparseBuilder(
        decompressed_file_path, separator=" ", header=True)

    print("EpinionsReader: cleaning temporary files")

    os.remove(decompressed_file_path)

    print("EpinionsReader: loading complete")
def _load_from_original_file(self):
    """Load the Movielens 20M dataset: genre ICM and the URM.

    Downloads the archive if missing, extracts movies/tags/ratings CSVs,
    builds the matrices and mappers, then removes the temporary extraction
    folder. The tags file is extracted but not loaded here.
    """
    # Load data from original
    print("Movielens20MReader: Loading original data")

    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        # Archive missing or unreadable: fetch a fresh copy and retry once.
        # Fixed typo in the log message ("fild" -> "find").
        print("Movielens20MReader: Unable to find data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, zipFile_path, "ml-20m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")

    genres_path = dataFile.extract("ml-20m/movies.csv", path=zipFile_path + "decompressed/")
    tags_path = dataFile.extract("ml-20m/tags.csv", path=zipFile_path + "decompressed/")
    URM_path = dataFile.extract("ml-20m/ratings.csv", path=zipFile_path + "decompressed/")

    self.tokenToFeatureMapper_ICM_genre = {}

    print("Movielens20MReader: loading genres")
    # Genre loading also establishes the item ID -> index mapper reused below.
    self.ICM_genre, self.tokenToFeatureMapper_ICM_genre, self.item_original_ID_to_index = self._loadICM_genres(
        genres_path, header=True, separator=',', genresSeparator="|")

    print("Movielens20MReader: loading URM")
    # NOTE(review): if_new_user value looks like a placeholder ("******") —
    # confirm the intended policy against self._loadURM.
    self.URM_all, _, self.user_original_ID_to_index = self._loadURM(
        URM_path, separator=",", header=True,
        if_new_user="******", if_new_item="ignore")

    print("Movielens20MReader: cleaning temporary files")

    import shutil
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    # NOTE(review): this final message mentions saving, but no save happens in
    # this method — presumably the caller persists the data; verify.
    print("Movielens20MReader: saving URM and ICM")
def __init__(self):
    """Prepare the Amazon Instant Video train/validation/test split.

    Tries to load a previously saved split from disk; if none exists, the
    raw ratings CSV is downloaded, filtered (only 5-star ratings, users
    with at least 5 of them), split user-wise into train/validation/test,
    and the result is persisted for future runs.
    """
    test_percentage = 0.2
    validation_percentage = 0.2

    pre_splitted_path = "Data_manager_split_datasets/AmazonInstantVideo/RecSys/SpectralCF_our_interface/"
    pre_splitted_filename = "splitted_data"
    ratings_file_name = "ratings_Amazon_Instant_Video.csv"

    # If directory does not exist, create
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_AmazonInstantVideo: Attempting to load pre-splitted data")

        saved_attributes = load_data_dict(pre_splitted_path, pre_splitted_filename)
        for attribute_name, attribute_value in saved_attributes.items():
            setattr(self, attribute_name, attribute_value)

    except FileNotFoundError:
        # No cached split on disk: build everything from the raw ratings.
        print("Dataset_AmazonInstantVideo: Pre-splitted data not found, building new one")

        folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
        downloadFromURL(self.DATASET_URL, folder_path, ratings_file_name)

        # read Amazon Instant Video
        ratings = pd.read_csv(folder_path + ratings_file_name,
                              sep=',',
                              header=None,
                              names=['user', 'item', 'rating', 'timestamp'])
        ratings = ratings[['user', 'item', 'rating']]

        # keep only ratings = 5
        matrix_builder = IncrementalSparseMatrix(auto_create_col_mapper=True,
                                                 auto_create_row_mapper=True)
        matrix_builder.add_data_lists(ratings['user'].values,
                                      ratings['item'].values,
                                      ratings['rating'].values)

        URM_all = matrix_builder.get_SparseMatrix()
        URM_all.data = URM_all.data == 5
        URM_all.eliminate_zeros()

        # keep only users with at least 5 ratings
        URM_all = ut.filter_urm(URM_all,
                                user_min_number_ratings=5,
                                item_min_number_ratings=1)

        # create train - test - validation
        URM_train_original, self.URM_test = split_train_validation_percentage_user_wise(
            URM_all, train_percentage=1 - test_percentage, verbose=False)

        self.URM_train, self.URM_validation = split_train_validation_percentage_user_wise(
            URM_train_original, train_percentage=1 - validation_percentage, verbose=False)

        # Persist the split so the next construction is a plain load.
        split_dict = {
            "URM_train": self.URM_train,
            "URM_test": self.URM_test,
            "URM_validation": self.URM_validation,
        }
        save_data_dict(split_dict, pre_splitted_path, pre_splitted_filename)

    print("Dataset_AmazonInstantVideo: Dataset loaded")

    ut.print_stat_datareader(self)
def _load_from_original_file(self):
    """Load the RecSys Challenge 2019 (Polimi) dataset: sub-class ICM and URM.

    Downloads the challenge archive if missing, extracts the URM and ICM
    CSVs, builds the matrices and mappers, then removes the temporary
    extraction folder. Asset and price ICMs are extracted but not loaded.
    """
    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    zip_file_name = "recommender-system-2019-challenge-polimi.zip"

    try:
        dataFile = zipfile.ZipFile(zipFile_path + zip_file_name)
    except (FileNotFoundError, zipfile.BadZipFile):
        # BUG FIX: the fallback previously downloaded and reopened
        # "ml-10m.zip" (copy-paste from the Movielens10M reader), so recovery
        # could never produce the challenge data. It now fetches the same
        # archive it tried to open, and the log names this reader.
        print("RecSys2019Reader: Unable to find data zip file. Downloading...")
        downloadFromURL(self.DATASET_URL, zipFile_path, zip_file_name)
        dataFile = zipfile.ZipFile(zipFile_path + zip_file_name)

    URM_path = dataFile.extract("data_train.csv", path=zipFile_path + "decompressed/")
    # Asset and price ICMs are extracted for completeness but not loaded here.
    ICM_asset = dataFile.extract("data_ICM_asset.csv", path=zipFile_path + "decompressed/")
    ICM_price = dataFile.extract("data_ICM_price.csv", path=zipFile_path + "decompressed/")
    ICM_sub_class = dataFile.extract("data_ICM_sub_class.csv", path=zipFile_path + "decompressed/")

    print("RecSys2019Reader: loading ICM class")
    # Sub-class loading also establishes the item ID -> index mapper reused below.
    ICM_classes, tokenToFeatureMapper_ICM_classes, self.item_original_ID_to_index = _loadICM_assets(
        ICM_sub_class, header=True, separator=',', genresSeparator="|")

    self._LOADED_ICM_DICT["ICM_classes"] = ICM_classes
    self._LOADED_ICM_MAPPER_DICT["ICM_classes"] = tokenToFeatureMapper_ICM_classes

    print("RecSys2019Reader: loading URM")
    # NOTE(review): if_new_user value looks like a placeholder ("******") —
    # confirm the intended policy against _loadURM_preinitialized_item_id.
    URM_all, self.item_original_ID_to_index, self.user_original_ID_to_index = _loadURM_preinitialized_item_id(
        URM_path, separator=",", header=False,
        if_new_user="******", if_new_item="add",
        item_original_ID_to_index=self.item_original_ID_to_index)

    self._LOADED_URM_DICT["URM_all"] = URM_all

    self._LOADED_GLOBAL_MAPPER_DICT["user_original_ID_to_index"] = self.user_original_ID_to_index
    self._LOADED_GLOBAL_MAPPER_DICT["item_original_ID_to_index"] = self.item_original_ID_to_index

    print(URM_all.shape)

    print("RecSys2019Reader: cleaning temporary files")

    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    print("RecSys2019Reader: loading complete")