def _load_from_original_file(self):

    URM_all = self._originalReader.URM_DICT['URM_train'] + \
              self._originalReader.URM_DICT['URM_validation'] + \
              self._originalReader.URM_DICT['URM_test']

    n_users, n_items = URM_all.shape

    loaded_URM_dict = {"URM_all": URM_all,
                       "URM_test_negative": self._originalReader.URM_DICT['URM_test_negative']}

    loaded_ICM_dict = {"ICM_all": self._originalReader.ICM_DICT["ICM_all"]}
    loaded_ICM_mapper_dict = {"ICM_all": {i: i for i in range(self._originalReader.ICM_DICT["ICM_all"].shape[1])}}

    loaded_UCM_dict = {"UCM_all": self._originalReader.ICM_DICT["UCM_all"]}
    loaded_UCM_mapper_dict = {"UCM_all": {i: i for i in range(self._originalReader.ICM_DICT["UCM_all"].shape[1])}}

    user_original_ID_to_index = {i: i for i in range(n_users)}
    item_original_ID_to_index = {i: i for i in range(n_items)}

    loaded_dataset = Dataset(dataset_name=self._get_dataset_name(),
                             URM_dictionary=loaded_URM_dict,
                             ICM_dictionary=loaded_ICM_dict,
                             ICM_feature_mapper_dictionary=loaded_ICM_mapper_dict,
                             UCM_dictionary=loaded_UCM_dict,
                             UCM_feature_mapper_dictionary=loaded_UCM_mapper_dict,
                             user_original_ID_to_index=user_original_ID_to_index,
                             item_original_ID_to_index=item_original_ID_to_index,
                             is_implicit=self.IS_IMPLICIT,
                             )

    return loaded_dataset
def _load_from_original_file(self): # Load data from original self._print("Loading original data") folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER try: compressed_file = gzip.open(folder_path + self.ZIP_NAME, ) except FileNotFoundError: self._print("Unable to find data zip file. Downloading...") download_from_URL(self.DATASET_URL, folder_path, self.ZIP_NAME) compressed_file = gzip.open(folder_path + self.ZIP_NAME) URM_path = folder_path + self.FILE_RATINGS_PATH decompressed_file = open(URM_path, "w") self._save_GZ_in_text_file(compressed_file, decompressed_file) decompressed_file.close() self._print("loading URM") URM_all, item_original_ID_to_index, user_original_ID_to_index = load_CSV_into_SparseBuilder( URM_path, header=False, separator="\t", remove_duplicates=True, custom_user_item_rating_columns=[0, 4, 2]) # URM_all contains the coordinates in textual format URM_all.data = np.ones_like(URM_all.data) loaded_URM_dict = {"URM_all": URM_all} loaded_dataset = Dataset( dataset_name=self._get_dataset_name(), URM_dictionary=loaded_URM_dict, ICM_dictionary=None, ICM_feature_mapper_dictionary=None, UCM_dictionary=None, UCM_feature_mapper_dictionary=None, user_original_ID_to_index=user_original_ID_to_index, item_original_ID_to_index=item_original_ID_to_index, is_implicit=self.IS_IMPLICIT, ) self._print("cleaning temporary files") os.remove(URM_path) self._print("loading complete") return loaded_dataset
def _load_from_original_file(self):
    # Load data from original

    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")

    except (FileNotFoundError, zipfile.BadZipFile):
        self._print("Unable to find data zip file. Downloading...")
        download_from_URL(self.DATASET_URL, zipFile_path, "hetrec2011-movielens-2k-v2.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")

    URM_path = dataFile.extract("user_ratedmovies.dat", path=zipFile_path + "decompressed/")

    URM_all, item_original_ID_to_index, user_original_ID_to_index = load_CSV_into_SparseBuilder(
        URM_path, separator="\t", header=True, custom_user_item_rating_columns=[0, 1, 2])

    loaded_URM_dict = {"URM_all": URM_all}

    loaded_dataset = Dataset(dataset_name=self._get_dataset_name(),
                             URM_dictionary=loaded_URM_dict,
                             ICM_dictionary=None,
                             ICM_feature_mapper_dictionary=None,
                             UCM_dictionary=None,
                             UCM_feature_mapper_dictionary=None,
                             user_original_ID_to_index=user_original_ID_to_index,
                             item_original_ID_to_index=item_original_ID_to_index,
                             is_implicit=self.IS_IMPLICIT,
                             )

    self._print("cleaning temporary files")

    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    self._print("loading complete")

    return loaded_dataset
def _load_from_original_file(self):
    # Load data from original

    self.zip_file_folder = self.DATASET_OFFLINE_ROOT_FOLDER + self.DATASET_SUBFOLDER
    self.decompressed_zip_file_folder = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        self.dataFile = zipfile.ZipFile(self.zip_file_folder + "netflix-prize-data.zip")

    except (FileNotFoundError, zipfile.BadZipFile):
        self._print("Unable to find data zip file.")
        self._print("Automatic download not available, please ensure the ZIP data file is in folder {}.".format(self.zip_file_folder))
        self._print("Data can be downloaded here: {}".format(self.DATASET_URL))

        # If directory does not exist, create
        if not os.path.exists(self.zip_file_folder):
            os.makedirs(self.zip_file_folder)

        raise FileNotFoundError("Automatic download not available.")

    URM_all, self.item_original_ID_to_index, self.user_original_ID_to_index = self._loadURM()

    loaded_URM_dict = {"URM_all": URM_all}

    loaded_dataset = Dataset(dataset_name=self._get_dataset_name(),
                             URM_dictionary=loaded_URM_dict,
                             ICM_dictionary=None,
                             ICM_feature_mapper_dictionary=None,
                             UCM_dictionary=None,
                             UCM_feature_mapper_dictionary=None,
                             user_original_ID_to_index=self.user_original_ID_to_index,
                             item_original_ID_to_index=self.item_original_ID_to_index,
                             is_implicit=self.IS_IMPLICIT,
                             )

    self._print("loading complete")

    return loaded_dataset
def _load_from_original_file(self): # Load data from original self._print("Loading original data") zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER try: dataFile = zipfile.ZipFile(zipFile_path + "filmtrust.zip") except (FileNotFoundError, zipfile.BadZipFile): print("FilmTrust: Unable to fild data zip file. Downloading...") download_from_URL(self.DATASET_URL, zipFile_path, "filmtrust.zip") dataFile = zipfile.ZipFile(zipFile_path + "filmtrust.zip") URM_path = dataFile.extract("ratings.txt", path=zipFile_path + "decompressed/") URM_all, item_original_ID_to_index, user_original_ID_to_index = load_CSV_into_SparseBuilder( URM_path, separator=" ", header=False, remove_duplicates=True) loaded_URM_dict = {"URM_all": URM_all} loaded_dataset = Dataset( dataset_name=self._get_dataset_name(), URM_dictionary=loaded_URM_dict, ICM_dictionary=None, ICM_feature_mapper_dictionary=None, UCM_dictionary=None, UCM_feature_mapper_dictionary=None, user_original_ID_to_index=user_original_ID_to_index, item_original_ID_to_index=item_original_ID_to_index, is_implicit=self.IS_IMPLICIT, ) self._print("cleaning temporary files") shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True) self._print("loading complete") return loaded_dataset
def _load_from_original_file(self): # Load data from original zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER try: dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip") except (FileNotFoundError, zipfile.BadZipFile): self._print("Unable to fild data zip file. Downloading...") download_from_URL(self.DATASET_URL, zipFile_path, "ml-1m.zip") dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip") ICM_genre_path = dataFile.extract("ml-1m/movies.dat", path=zipFile_path + "decompressed/") UCM_path = dataFile.extract("ml-1m/users.dat", path=zipFile_path + "decompressed/") URM_path = dataFile.extract("ml-1m/ratings.dat", path=zipFile_path + "decompressed/") self._print("loading genres") ICM_genres, tokenToFeatureMapper_ICM_genres, item_original_ID_to_index = _loadICM_genres( ICM_genre_path, header=True, separator='::', genresSeparator="|") self._print("loading UCM") UCM_all, tokenToFeatureMapper_UCM_all, user_original_ID_to_index = _loadUCM( UCM_path, header=True, separator='::') self._print("loading URM") URM_all, item_original_ID_to_index, user_original_ID_to_index, URM_timestamp = _loadURM_preinitialized_item_id( URM_path, separator="::", header=False, if_new_user="******", if_new_item="ignore", item_original_ID_to_index=item_original_ID_to_index, user_original_ID_to_index=user_original_ID_to_index) loaded_URM_dict = {"URM_all": URM_all, "URM_timestamp": URM_timestamp} loaded_ICM_dict = {"ICM_genres": ICM_genres} loaded_ICM_mapper_dict = { "ICM_genres": tokenToFeatureMapper_ICM_genres } loaded_UCM_dict = {"UCM_all": UCM_all} loaded_UCM_mapper_dict = {"UCM_all": tokenToFeatureMapper_UCM_all} loaded_dataset = Dataset( dataset_name=self._get_dataset_name(), URM_dictionary=loaded_URM_dict, ICM_dictionary=loaded_ICM_dict, ICM_feature_mapper_dictionary=loaded_ICM_mapper_dict, UCM_dictionary=loaded_UCM_dict, UCM_feature_mapper_dictionary=loaded_UCM_mapper_dict, user_original_ID_to_index=user_original_ID_to_index, item_original_ID_to_index=item_original_ID_to_index, is_implicit=self.IS_IMPLICIT, ) self._print("cleaning temporary files") shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True) self._print("loading complete") return loaded_dataset
def generate_Dataset(self, dataset_name, is_implicit):

    assert not self.__Dataset_finalized, \
        "Dataset mappers have already been generated, adding new data is forbidden"

    self.__Dataset_finalized = True

    # Generate ID to index mappers
    self._generate_global_mappers()
    self._generate_ICM_UCM_mappers()

    URM_DICT_sparse = {}
    ICM_DICT_sparse = {}
    UCM_DICT_sparse = {}

    on_new_ID = "ignore"

    for URM_name, URM_dataframe in self.URM_DICT.items():
        URM_sparse_builder = IncrementalSparseMatrix_FilterIDs(
            preinitialized_col_mapper=self.item_original_ID_to_index,
            preinitialized_row_mapper=self.user_original_ID_to_index,
            on_new_col=on_new_ID,
            on_new_row=on_new_ID)

        URM_sparse_builder.add_data_lists(URM_dataframe["UserID"].values,
                                          URM_dataframe["ItemID"].values,
                                          URM_dataframe["Data"].values)

        URM_DICT_sparse[URM_name] = URM_sparse_builder.get_SparseMatrix()

    for ICM_name, ICM_dataframe in self.ICM_DICT.items():
        feature_ID_to_index = self.ICM_mapper_DICT[ICM_name]

        ICM_sparse_builder = IncrementalSparseMatrix_FilterIDs(
            preinitialized_col_mapper=feature_ID_to_index,
            preinitialized_row_mapper=self.item_original_ID_to_index,
            on_new_col=on_new_ID,
            on_new_row=on_new_ID)

        ICM_sparse_builder.add_data_lists(ICM_dataframe["ItemID"].values,
                                          ICM_dataframe["FeatureID"].values,
                                          ICM_dataframe["Data"].values)

        ICM_DICT_sparse[ICM_name] = ICM_sparse_builder.get_SparseMatrix()

    for UCM_name, UCM_dataframe in self.UCM_DICT.items():
        feature_ID_to_index = self.UCM_mapper_DICT[UCM_name]

        UCM_sparse_builder = IncrementalSparseMatrix_FilterIDs(
            preinitialized_col_mapper=feature_ID_to_index,
            preinitialized_row_mapper=self.user_original_ID_to_index,
            on_new_col=on_new_ID,
            on_new_row=on_new_ID)

        UCM_sparse_builder.add_data_lists(UCM_dataframe["UserID"].values,
                                          UCM_dataframe["FeatureID"].values,
                                          UCM_dataframe["Data"].values)

        UCM_DICT_sparse[UCM_name] = UCM_sparse_builder.get_SparseMatrix()

    loaded_dataset = Dataset(dataset_name=dataset_name,
                             URM_dictionary=URM_DICT_sparse,
                             ICM_dictionary=ICM_DICT_sparse,
                             ICM_feature_mapper_dictionary=self.ICM_mapper_DICT,
                             UCM_dictionary=UCM_DICT_sparse,
                             UCM_feature_mapper_dictionary=self.UCM_mapper_DICT,
                             user_original_ID_to_index=self.user_original_ID_to_index,
                             item_original_ID_to_index=self.item_original_ID_to_index,
                             is_implicit=is_implicit,
                             )

    return loaded_dataset
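# Hedged usage sketch (not part of the original sources): generate_Dataset() above expects
# the interaction dataframes to already be stored in self.URM_DICT with the column names
# "UserID", "ItemID" and "Data" ("FeatureID" instead of "ItemID"/"UserID" pairs for
# ICM/UCM dataframes). The builder class name DatasetMapperManager and the add_URM()
# helper used below are assumptions and may differ from the actual API.
def _example_generate_dataset_sketch():
    import pandas as pd

    URM_dataframe = pd.DataFrame({"UserID": ["u1", "u1", "u2"],
                                  "ItemID": ["i1", "i2", "i2"],
                                  "Data": [1.0, 1.0, 1.0]})

    manager = DatasetMapperManager()             # assumed builder class exposing generate_Dataset()
    manager.add_URM(URM_dataframe, "URM_all")    # assumed helper that populates URM_DICT
    return manager.generate_Dataset(dataset_name="ExampleDataset", is_implicit=True)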
def load_data(self, save_folder_path=None):
    """
    :param save_folder_path:    path in which to save the loaded dataset
                                None    use default "dataset_name/original/"
                                False   do not save
    :return:
    """

    # Use default e.g., "dataset_name/original/"
    if save_folder_path is None:
        save_folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self._get_dataset_name_root() + self._get_dataset_name_data_subfolder()

    # If save_folder_path contains any path try to load a previously built split from it
    if save_folder_path is not False and not self.reload_from_original_data:

        try:
            loaded_dataset = Dataset()
            loaded_dataset.load_data(save_folder_path)

            self._print("Verifying data consistency...")
            loaded_dataset.verify_data_consistency()
            self._print("Verifying data consistency... Passed!")

            loaded_dataset.print_statistics()
            return loaded_dataset

        except FileNotFoundError:
            self._print("Preloaded data not found, reading from original files...")

        except Exception:
            self._print("Reading split from {} caused the following exception...".format(save_folder_path))
            traceback.print_exc()
            raise Exception("{}: Exception while reading split".format(self._get_dataset_name()))

    self._print("Loading original data")
    loaded_dataset = self._load_from_original_file()

    self._print("Verifying data consistency...")
    loaded_dataset.verify_data_consistency()
    self._print("Verifying data consistency... Passed!")

    if save_folder_path is not False:

        # If directory does not exist, create
        if not os.path.exists(save_folder_path):
            self._print("Creating folder '{}'".format(save_folder_path))
            os.makedirs(save_folder_path)

        else:
            self._print("Found already existing folder '{}'".format(save_folder_path))

        loaded_dataset.save_data(save_folder_path)

        self._print("Saving complete!")

    loaded_dataset.print_statistics()
    return loaded_dataset
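# Hedged usage sketch (not part of the original sources): load_data() above returns a
# Dataset object, either reloaded from a previously saved folder or rebuilt from the
# original files and then saved. The concrete reader class name Movielens1MReader and
# the Dataset accessor get_URM_all() are assumptions about the surrounding framework.
def _example_load_data_sketch():
    data_reader = Movielens1MReader()           # assumed concrete DataReader subclass
    loaded_dataset = data_reader.load_data()    # None -> default folder "dataset_name/original/"

    URM_all = loaded_dataset.get_URM_all()      # assumed accessor returning the sparse URM
    print("URM_all shape: {}".format(URM_all.shape))
    return loaded_dataset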
def _load_from_original_file_all_amazon_datasets(self, URM_path, metadata_path=None, reviews_path=None):
    # Load data from original

    self._print("loading URM")
    URM_all, URM_timestamp, self.item_original_ID_to_index, self.user_original_ID_to_index = load_CSV_into_SparseBuilder(
        URM_path, separator=",", header=False, timestamp=True)

    loaded_URM_dict = {"URM_all": URM_all, "URM_timestamp": URM_timestamp}

    loaded_ICM_dict = {}
    loaded_ICM_mapper_dict = {}

    if metadata_path is not None:
        self._print("loading metadata")
        ICM_metadata, tokenToFeatureMapper_ICM_metadata, _ = self._loadMetadata(metadata_path, if_new_item="ignore")

        ICM_metadata, _, tokenToFeatureMapper_ICM_metadata = remove_features(
            ICM_metadata, min_occurrence=5, max_percentage_occurrence=0.30,
            reconcile_mapper=tokenToFeatureMapper_ICM_metadata)

        loaded_ICM_dict["ICM_metadata"] = ICM_metadata
        loaded_ICM_mapper_dict["ICM_metadata"] = tokenToFeatureMapper_ICM_metadata

    if reviews_path is not None:
        self._print("loading reviews")
        ICM_reviews, tokenToFeatureMapper_ICM_reviews, _ = self._loadReviews(reviews_path, if_new_item="ignore")

        ICM_reviews, _, tokenToFeatureMapper_ICM_reviews = remove_features(
            ICM_reviews, min_occurrence=5, max_percentage_occurrence=0.30,
            reconcile_mapper=tokenToFeatureMapper_ICM_reviews)

        loaded_ICM_dict["ICM_reviews"] = ICM_reviews
        loaded_ICM_mapper_dict["ICM_reviews"] = tokenToFeatureMapper_ICM_reviews

    loaded_dataset = Dataset(dataset_name=self._get_dataset_name(),
                             URM_dictionary=loaded_URM_dict,
                             ICM_dictionary=loaded_ICM_dict,
                             ICM_feature_mapper_dictionary=loaded_ICM_mapper_dict,
                             UCM_dictionary=None,
                             UCM_feature_mapper_dictionary=None,
                             user_original_ID_to_index=self.user_original_ID_to_index,
                             item_original_ID_to_index=self.item_original_ID_to_index,
                             is_implicit=self.IS_IMPLICIT,
                             )

    # Clean temp files
    self._print("cleaning temporary files")

    if metadata_path is not None:
        os.remove(metadata_path)

    if reviews_path is not None:
        os.remove(reviews_path)

    self._print("loading complete")

    return loaded_dataset
def _load_from_original_file(self): # Load data from original self._print("Loading original data") zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER try: dataFile = zipfile.ZipFile( zipFile_path + "neural_factorization_machine-master.zip") except (FileNotFoundError, zipfile.BadZipFile): self._print("Unable to fild data zip file. Downloading...") download_from_URL(self.DATASET_URL, zipFile_path, "neural_factorization_machine-master.zip") dataFile = zipfile.ZipFile( zipFile_path + "neural_factorization_machine-master.zip") inner_path_in_zip = "neural_factorization_machine-master/data/frappe/" URM_train_path = dataFile.extract(inner_path_in_zip + "frappe.train.libfm", path=zipFile_path + "decompressed/") URM_test_path = dataFile.extract(inner_path_in_zip + "frappe.test.libfm", path=zipFile_path + "decompressed/") URM_validation_path = dataFile.extract( inner_path_in_zip + "frappe.validation.libfm", path=zipFile_path + "decompressed/") tmp_URM_train, item_original_ID_to_index, user_original_ID_to_index = self._loadURM( URM_train_path, item_original_ID_to_index=None, user_original_ID_to_index=None) tmp_URM_test, item_original_ID_to_index, user_original_ID_to_index = self._loadURM( URM_test_path, item_original_ID_to_index=item_original_ID_to_index, user_original_ID_to_index=user_original_ID_to_index) tmp_URM_validation, item_original_ID_to_index, user_original_ID_to_index = self._loadURM( URM_validation_path, item_original_ID_to_index=item_original_ID_to_index, user_original_ID_to_index=user_original_ID_to_index) shape = (len(user_original_ID_to_index), len(item_original_ID_to_index)) tmp_URM_train = reshapeSparse(tmp_URM_train, shape) tmp_URM_test = reshapeSparse(tmp_URM_test, shape) tmp_URM_validation = reshapeSparse(tmp_URM_validation, shape) URM_occurrence = tmp_URM_train + tmp_URM_test + tmp_URM_validation URM_all = URM_occurrence.copy() URM_all.data = np.ones_like(URM_all.data) loaded_URM_dict = { "URM_all": URM_all, "URM_occurrence": URM_occurrence } loaded_dataset = Dataset( dataset_name=self._get_dataset_name(), URM_dictionary=loaded_URM_dict, ICM_dictionary=None, ICM_feature_mapper_dictionary=None, UCM_dictionary=None, UCM_feature_mapper_dictionary=None, user_original_ID_to_index=user_original_ID_to_index, item_original_ID_to_index=item_original_ID_to_index, is_implicit=self.IS_IMPLICIT, ) self._print("cleaning temporary files") shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True) self._print("loading complete") return loaded_dataset
def _load_from_original_file(self): # Load data from original zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER try: dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip") except (FileNotFoundError, zipfile.BadZipFile): print("Movielens20MReader: Unable to fild data zip file. Downloading...") download_from_URL(self.DATASET_URL, zipFile_path, "ml-20m.zip") dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip") genres_path = dataFile.extract("ml-20m/movies.csv", path=zipFile_path + "decompressed/") tags_path = dataFile.extract("ml-20m/tags.csv", path=zipFile_path + "decompressed/") URM_path = dataFile.extract("ml-20m/ratings.csv", path=zipFile_path + "decompressed/") self._print("loading genres") ICM_genres, tokenToFeatureMapper_ICM_genres, item_original_ID_to_index = _loadICM_genres(genres_path, header=True, separator=',', genresSeparator="|") self._print("loading tags") ICM_tags, tokenToFeatureMapper_ICM_tags, _ = _loadICM_tags(tags_path, header=True, separator=',', if_new_item = "ignore", item_original_ID_to_index = item_original_ID_to_index) self._print("loading URM") URM_all, item_original_ID_to_index, user_original_ID_to_index, URM_timestamp = _loadURM_preinitialized_item_id(URM_path, separator=",", header = True, if_new_user = "******", if_new_item = "ignore", item_original_ID_to_index = item_original_ID_to_index) ICM_all, tokenToFeatureMapper_ICM_all = merge_ICM(ICM_genres, ICM_tags, tokenToFeatureMapper_ICM_genres, tokenToFeatureMapper_ICM_tags) loaded_URM_dict = {"URM_all": URM_all, "URM_timestamp": URM_timestamp} loaded_ICM_dict = {"ICM_genres": ICM_genres, "ICM_tags": ICM_tags, "ICM_all": ICM_all} loaded_ICM_mapper_dict = {"ICM_genres": tokenToFeatureMapper_ICM_genres, "ICM_tags": tokenToFeatureMapper_ICM_tags, "ICM_all": tokenToFeatureMapper_ICM_all} loaded_dataset = Dataset(dataset_name = self._get_dataset_name(), URM_dictionary = loaded_URM_dict, ICM_dictionary = loaded_ICM_dict, ICM_feature_mapper_dictionary = loaded_ICM_mapper_dict, UCM_dictionary = None, UCM_feature_mapper_dictionary = None, user_original_ID_to_index= user_original_ID_to_index, item_original_ID_to_index= item_original_ID_to_index, is_implicit = self.IS_IMPLICIT, ) self._print("cleaning temporary files") shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True) self._print("saving URM and ICM") return loaded_dataset