def _load_from_original_file(self):
    # Load data from original zip file
    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-10m.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        self._print("Unable to find data zip file. Downloading...")
        download_from_URL(self.DATASET_URL, zipFile_path, "ml-10m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-10m.zip")

    ICM_genre_path = dataFile.extract("ml-10M100K/movies.dat", path=zipFile_path + "decompressed/")
    ICM_tags_path = dataFile.extract("ml-10M100K/tags.dat", path=zipFile_path + "decompressed/")
    URM_path = dataFile.extract("ml-10M100K/ratings.dat", path=zipFile_path + "decompressed/")

    self._print("Loading Item Features Genres")
    ICM_genres_dataframe, ICM_years_dataframe = _loadICM_genres_years(ICM_genre_path, header=None,
                                                                      separator='::', genresSeparator="|")

    self._print("Loading Item Features Tags")
    ICM_tags_dataframe = _loadICM_tags(ICM_tags_path, header=None, separator='::')

    ICM_all_dataframe = pd.concat([ICM_genres_dataframe, ICM_tags_dataframe])

    self._print("Loading Interactions")
    URM_all_dataframe, URM_timestamp_dataframe = _loadURM(URM_path, header=None, separator='::')

    dataset_manager = DatasetMapperManager()
    dataset_manager.add_URM(URM_all_dataframe, "URM_all")
    dataset_manager.add_URM(URM_timestamp_dataframe, "URM_timestamp")
    dataset_manager.add_ICM(ICM_genres_dataframe, "ICM_genres")
    dataset_manager.add_ICM(ICM_years_dataframe, "ICM_year")
    dataset_manager.add_ICM(ICM_tags_dataframe, "ICM_tags")
    dataset_manager.add_ICM(ICM_all_dataframe, "ICM_all")

    loaded_dataset = dataset_manager.generate_Dataset(dataset_name=self._get_dataset_name(),
                                                      is_implicit=self.IS_IMPLICIT)

    self._print("Cleaning Temporary Files")
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    self._print("Loading Complete")
    return loaded_dataset
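# _loadURM is used by several readers in this file but is not defined here. The
# following is a minimal sketch of what it plausibly does for the "::"-separated
# MovieLens ratings files. The column layout (UserID::MovieID::Rating::Timestamp)
# is the documented MovieLens format, but the helper's body and exact return
# convention are assumptions inferred from how its results are passed to
# DatasetMapperManager.add_URM.
import pandas as pd

def _loadURM_sketch(URM_path, header=None, separator='::'):
    # Multi-character separators require the python parsing engine
    URM_all_dataframe = pd.read_csv(filepath_or_buffer=URM_path, sep=separator,
                                    header=header, engine='python',
                                    dtype={0: str, 1: str, 2: float, 3: int})
    URM_all_dataframe.columns = ["UserID", "ItemID", "Data", "Timestamp"]

    # Same (UserID, ItemID) pairs with the timestamp in the "Data" column,
    # so the timestamps can be added as a separate URM
    URM_timestamp_dataframe = URM_all_dataframe[["UserID", "ItemID", "Timestamp"]].copy()
    URM_timestamp_dataframe.columns = ["UserID", "ItemID", "Data"]

    URM_all_dataframe = URM_all_dataframe[["UserID", "ItemID", "Data"]]

    return URM_all_dataframe, URM_timestamp_dataframe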
def _load_from_original_file(self): # Load data from original self._print("Loading original data") folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER try: compressed_file = gzip.open(folder_path + self.ZIP_NAME, ) except FileNotFoundError: self._print("Unable to find data zip file. Downloading...") download_from_URL(self.DATASET_URL, folder_path, self.ZIP_NAME) compressed_file = gzip.open(folder_path + self.ZIP_NAME) URM_path = folder_path + self.FILE_RATINGS_PATH decompressed_file = open(URM_path, "w") self._save_GZ_in_text_file(compressed_file, decompressed_file) decompressed_file.close() self._print("loading URM") URM_all, item_original_ID_to_index, user_original_ID_to_index = load_CSV_into_SparseBuilder( URM_path, header=False, separator="\t", remove_duplicates=True, custom_user_item_rating_columns=[0, 4, 2]) # URM_all contains the coordinates in textual format URM_all.data = np.ones_like(URM_all.data) loaded_URM_dict = {"URM_all": URM_all} loaded_dataset = Dataset( dataset_name=self._get_dataset_name(), URM_dictionary=loaded_URM_dict, ICM_dictionary=None, ICM_feature_mapper_dictionary=None, UCM_dictionary=None, UCM_feature_mapper_dictionary=None, user_original_ID_to_index=user_original_ID_to_index, item_original_ID_to_index=item_original_ID_to_index, is_implicit=self.IS_IMPLICIT, ) self._print("cleaning temporary files") os.remove(URM_path) self._print("loading complete") return loaded_dataset
def _load_from_original_file(self):
    # Load data from original zip file
    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        self._print("Unable to find data zip file. Downloading...")
        download_from_URL(self.DATASET_URL, zipFile_path, "ml-1m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")

    ICM_genre_path = dataFile.extract("ml-1m/movies.dat", path=zipFile_path + "decompressed/")
    UCM_path = dataFile.extract("ml-1m/users.dat", path=zipFile_path + "decompressed/")
    URM_path = dataFile.extract("ml-1m/ratings.dat", path=zipFile_path + "decompressed/")

    self._print("Loading Interactions")
    URM_all_dataframe, URM_timestamp_dataframe = _loadURM(URM_path, header=None, separator='::')

    self._print("Loading Item Features genres")
    ICM_genres_dataframe = _loadICM_genres(ICM_genre_path, header=None, separator='::', genresSeparator="|")

    self._print("Loading User Features")
    UCM_dataframe = pd.read_csv(filepath_or_buffer=UCM_path, sep="::", header=None,
                                dtype={0: str, 1: str, 2: str, 3: str, 4: str})
    UCM_dataframe.columns = ["UserID", "gender", "age_group", "occupation", "zip_code"]

    # For each user, build the list of feature tokens, e.g., "gender_F", "age_group_1"
    UCM_list = [[feature_name + "_" + str(UCM_dataframe[feature_name][index])
                 for feature_name in ["gender", "age_group", "occupation", "zip_code"]]
                for index in range(len(UCM_dataframe))]

    # Reshape into a long-format dataframe with one (FeatureID, UserID) pair per row
    UCM_dataframe = pd.DataFrame(UCM_list, index=UCM_dataframe["UserID"]).stack()
    UCM_dataframe = UCM_dataframe.reset_index()[[0, 'UserID']]
    UCM_dataframe.columns = ['FeatureID', 'UserID']
    UCM_dataframe["Data"] = 1

    dataset_manager = DatasetMapperManager()
    dataset_manager.add_URM(URM_all_dataframe, "URM_all")
    dataset_manager.add_URM(URM_timestamp_dataframe, "URM_timestamp")
    dataset_manager.add_ICM(ICM_genres_dataframe, "ICM_genres")
    dataset_manager.add_UCM(UCM_dataframe, "UCM_all")

    loaded_dataset = dataset_manager.generate_Dataset(dataset_name=self._get_dataset_name(),
                                                      is_implicit=self.IS_IMPLICIT)

    self._print("cleaning temporary files")
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    self._print("Loading Complete")
    return loaded_dataset
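# The wide-to-long reshape above is compact but easy to misread. A standalone
# toy example (hypothetical data, not taken from the dataset) showing what
# pd.DataFrame(...).stack() followed by reset_index() produces:
import pandas as pd

toy_UCM_list = [["gender_F", "age_group_1"],
                ["gender_M", "age_group_56"]]
toy = pd.DataFrame(toy_UCM_list, index=pd.Index(["1", "2"], name="UserID")).stack()
toy = toy.reset_index()[[0, "UserID"]]
toy.columns = ["FeatureID", "UserID"]
toy["Data"] = 1
print(toy)
#       FeatureID UserID  Data
# 0      gender_F      1     1
# 1   age_group_1      1     1
# 2      gender_M      2     1
# 3  age_group_56      2     1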
def _load_from_original_file(self):
    # Load data from original zip file
    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        self._print("Unable to find data zip file. Downloading...")
        download_from_URL(self.DATASET_URL, zipFile_path, "hetrec2011-movielens-2k-v2.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")

    URM_path = dataFile.extract("user_ratedmovies.dat", path=zipFile_path + "decompressed/")

    URM_all, item_original_ID_to_index, user_original_ID_to_index = load_CSV_into_SparseBuilder(URM_path,
                                                                                                separator="\t",
                                                                                                header=True,
                                                                                                custom_user_item_rating_columns=[0, 1, 2])

    loaded_URM_dict = {"URM_all": URM_all}

    loaded_dataset = Dataset(dataset_name=self._get_dataset_name(),
                             URM_dictionary=loaded_URM_dict,
                             ICM_dictionary=None,
                             ICM_feature_mapper_dictionary=None,
                             UCM_dictionary=None,
                             UCM_feature_mapper_dictionary=None,
                             user_original_ID_to_index=user_original_ID_to_index,
                             item_original_ID_to_index=item_original_ID_to_index,
                             is_implicit=self.IS_IMPLICIT,
                             )

    self._print("cleaning temporary files")
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    self._print("loading complete")
    return loaded_dataset
def _get_ICM_metadata_path(self, data_folder, compressed_file_name, decompressed_file_name, file_url):
    """
    Metadata files are .csv, compressed as .gz
    :param data_folder:
    :param compressed_file_name:
    :param decompressed_file_name:
    :param file_url:
    :return: path of the decompressed metadata file
    """

    try:
        open(data_folder + decompressed_file_name, "r")

    except FileNotFoundError:
        self._print("Decompressing metadata file...")

        try:
            decompressed_file = open(data_folder + decompressed_file_name, "wb")
            compressed_file = gzip.open(data_folder + compressed_file_name, "rb")
            decompressed_file.write(compressed_file.read())
            compressed_file.close()
            decompressed_file.close()

        except Exception:
            # The compressed file is missing or corrupted: download it again
            self._print("Unable to find or decompress compressed file. Downloading...")
            download_from_URL(file_url, data_folder, compressed_file_name)

            decompressed_file = open(data_folder + decompressed_file_name, "wb")
            compressed_file = gzip.open(data_folder + compressed_file_name, "rb")
            decompressed_file.write(compressed_file.read())
            compressed_file.close()
            decompressed_file.close()

    return data_folder + decompressed_file_name
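# The write(compressed_file.read()) pattern above loads the whole decompressed
# payload into memory and appears twice. For reference, the standard-library
# idiom for the same operation streams it in chunks; this standalone sketch
# uses illustrative parameter names, not names from this module:
import gzip
import shutil

def decompress_gz_sketch(compressed_path, decompressed_path):
    with gzip.open(compressed_path, "rb") as compressed_file, \
            open(decompressed_path, "wb") as decompressed_file:
        # copyfileobj streams in fixed-size chunks instead of one big read()
        shutil.copyfileobj(compressed_file, decompressed_file)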
def _load_from_original_file(self):
    # Load data from original zip file
    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        self._print("Unable to find data zip file. Downloading...")
        download_from_URL(self.DATASET_URL, zipFile_path, "hetrec2011-movielens-2k-v2.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "hetrec2011-movielens-2k-v2.zip")

    URM_path = dataFile.extract("user_ratedmovies.dat", path=zipFile_path + "decompressed/")

    self._print("Loading Interactions")
    URM_all_dataframe = pd.read_csv(filepath_or_buffer=URM_path, sep="\t", header=0,
                                    dtype={0: str, 1: str, 2: float}, usecols=[0, 1, 2])
    URM_all_dataframe.columns = ["UserID", "ItemID", "Data"]

    dataset_manager = DatasetMapperManager()
    dataset_manager.add_URM(URM_all_dataframe, "URM_all")

    loaded_dataset = dataset_manager.generate_Dataset(dataset_name=self._get_dataset_name(),
                                                      is_implicit=self.IS_IMPLICIT)

    self._print("cleaning temporary files")
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    self._print("Loading Complete")
    return loaded_dataset
def _load_from_original_file(self): # Load data from original self._print("Loading original data") zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER try: dataFile = zipfile.ZipFile(zipFile_path + "filmtrust.zip") except (FileNotFoundError, zipfile.BadZipFile): print("FilmTrust: Unable to fild data zip file. Downloading...") download_from_URL(self.DATASET_URL, zipFile_path, "filmtrust.zip") dataFile = zipfile.ZipFile(zipFile_path + "filmtrust.zip") URM_path = dataFile.extract("ratings.txt", path=zipFile_path + "decompressed/") URM_all, item_original_ID_to_index, user_original_ID_to_index = load_CSV_into_SparseBuilder( URM_path, separator=" ", header=False, remove_duplicates=True) loaded_URM_dict = {"URM_all": URM_all} loaded_dataset = Dataset( dataset_name=self._get_dataset_name(), URM_dictionary=loaded_URM_dict, ICM_dictionary=None, ICM_feature_mapper_dictionary=None, UCM_dictionary=None, UCM_feature_mapper_dictionary=None, user_original_ID_to_index=user_original_ID_to_index, item_original_ID_to_index=item_original_ID_to_index, is_implicit=self.IS_IMPLICIT, ) self._print("cleaning temporary files") shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True) self._print("loading complete") return loaded_dataset
def _get_URM_review_path(self, data_folder, file_name, file_url):
    """
    Review files are .csv
    :param data_folder:
    :param file_name:
    :param file_url:
    :return: path of the review file
    """

    try:
        open(data_folder + file_name, "r")

    except FileNotFoundError:
        self._print("Unable to find or open review file. Downloading...")
        download_from_URL(file_url, data_folder, file_name)

    return data_folder + file_name
def _load_from_original_file(self):
    # Load data from original zip file
    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-100k.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        self._print("Unable to find data zip file. Downloading...")
        download_from_URL(self.DATASET_URL, zipFile_path, "ml-100k.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-100k.zip")

    URM_path = dataFile.extract("ml-100k/u.data", path=zipFile_path + "decompressed/")

    self._print("Loading Interactions")
    URM_all_dataframe, URM_timestamp_dataframe = _loadURM(URM_path, header=None, separator='\t')

    dataset_manager = DatasetMapperManager()
    dataset_manager.add_URM(URM_all_dataframe, "URM_all")
    dataset_manager.add_URM(URM_timestamp_dataframe, "URM_timestamp")

    loaded_dataset = dataset_manager.generate_Dataset(dataset_name=self._get_dataset_name(),
                                                      is_implicit=self.IS_IMPLICIT)

    self._print("Cleaning Temporary Files")
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    self._print("Loading Complete")
    return loaded_dataset
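# A hypothetical usage sketch of one of these readers, kept as comments because
# it depends on the surrounding framework. The concrete class name, the
# load_data() entry point and the get_URM_all() accessor are assumptions
# inferred from the method names used above:
# reader = Movielens100KReader()
# loaded_dataset = reader.load_data()
# URM_all = loaded_dataset.get_URM_all()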
def _load_from_original_file(self):
    # Load data from original zip file
    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        self._print("Unable to find data zip file. Downloading...")
        download_from_URL(self.DATASET_URL, zipFile_path, "ml-1m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")

    ICM_genre_path = dataFile.extract("ml-1m/movies.dat", path=zipFile_path + "decompressed/")
    UCM_path = dataFile.extract("ml-1m/users.dat", path=zipFile_path + "decompressed/")
    URM_path = dataFile.extract("ml-1m/ratings.dat", path=zipFile_path + "decompressed/")

    self._print("loading genres")
    ICM_genres, tokenToFeatureMapper_ICM_genres, item_original_ID_to_index = _loadICM_genres(ICM_genre_path,
                                                                                             header=True,
                                                                                             separator='::',
                                                                                             genresSeparator="|")

    self._print("loading UCM")
    UCM_all, tokenToFeatureMapper_UCM_all, user_original_ID_to_index = _loadUCM(UCM_path, header=True, separator='::')

    self._print("loading URM")
    URM_all, item_original_ID_to_index, user_original_ID_to_index, URM_timestamp = _loadURM_preinitialized_item_id(URM_path,
                                                                                                                   separator="::",
                                                                                                                   header=False,
                                                                                                                   if_new_user="add",
                                                                                                                   if_new_item="ignore",
                                                                                                                   item_original_ID_to_index=item_original_ID_to_index,
                                                                                                                   user_original_ID_to_index=user_original_ID_to_index)

    loaded_URM_dict = {"URM_all": URM_all, "URM_timestamp": URM_timestamp}
    loaded_ICM_dict = {"ICM_genres": ICM_genres}
    loaded_ICM_mapper_dict = {"ICM_genres": tokenToFeatureMapper_ICM_genres}
    loaded_UCM_dict = {"UCM_all": UCM_all}
    loaded_UCM_mapper_dict = {"UCM_all": tokenToFeatureMapper_UCM_all}

    loaded_dataset = Dataset(dataset_name=self._get_dataset_name(),
                             URM_dictionary=loaded_URM_dict,
                             ICM_dictionary=loaded_ICM_dict,
                             ICM_feature_mapper_dictionary=loaded_ICM_mapper_dict,
                             UCM_dictionary=loaded_UCM_dict,
                             UCM_feature_mapper_dictionary=loaded_UCM_mapper_dict,
                             user_original_ID_to_index=user_original_ID_to_index,
                             item_original_ID_to_index=item_original_ID_to_index,
                             is_implicit=self.IS_IMPLICIT,
                             )

    self._print("cleaning temporary files")
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    self._print("loading complete")
    return loaded_dataset
def __init__(self, pre_splitted_path):

    test_percentage = 0.2
    validation_percentage = 0.2

    pre_splitted_path += "data_split/"
    pre_splitted_filename = "splitted_data_"
    ratings_file_name = "ratings_Amazon_Instant_Video.csv"

    # If directory does not exist, create it
    if not os.path.exists(pre_splitted_path):
        os.makedirs(pre_splitted_path)

    try:
        print("Dataset_AmazonInstantVideo: Attempting to load pre-splitted data")

        for attrib_name, attrib_object in load_data_dict_zip(pre_splitted_path, pre_splitted_filename).items():
            self.__setattr__(attrib_name, attrib_object)

    except FileNotFoundError:
        print("Dataset_AmazonInstantVideo: Pre-splitted data not found, building new one")

        folder_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
        download_from_URL(self.DATASET_URL, folder_path, ratings_file_name)

        # Read the Amazon Instant Video ratings, dropping the timestamp column
        df = pd.read_csv(folder_path + ratings_file_name, sep=',', header=None,
                         names=['user', 'item', 'rating', 'timestamp'])[['user', 'item', 'rating']]

        URM_train_builder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)
        URM_train_builder.add_data_lists(df['user'].values, df['item'].values, df['rating'].values)

        URM_all = URM_train_builder.get_SparseMatrix()

        # Keep only ratings equal to 5: the data becomes True/False and
        # eliminate_zeros() drops the False entries
        URM_all.data = URM_all.data == 5
        URM_all.eliminate_zeros()

        # Keep only users with at least 5 ratings
        URM_all = ut.filter_urm(URM_all, user_min_number_ratings=5, item_min_number_ratings=1)

        # Create train - test - validation splits
        URM_train_original, URM_test = split_train_validation_percentage_user_wise(URM_all,
                                                                                   train_percentage=1 - test_percentage,
                                                                                   verbose=False)

        URM_train, URM_validation = split_train_validation_percentage_user_wise(URM_train_original,
                                                                                train_percentage=1 - validation_percentage,
                                                                                verbose=False)

        self.URM_DICT = {
            "URM_train": URM_train,
            "URM_test": URM_test,
            "URM_validation": URM_validation,
        }

        # No item features for this dataset; an empty dictionary is assumed
        # here so that the save call below has a valid ICM argument
        self.ICM_DICT = {}

        save_data_dict_zip(self.URM_DICT, self.ICM_DICT, pre_splitted_path, pre_splitted_filename)

    print("Dataset_AmazonInstantVideo: Dataset loaded")

    ut.print_stat_datareader(self)
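# The rating == 5 binarization above relies on a scipy.sparse detail that is
# easy to miss: assigning a boolean array to .data and calling eliminate_zeros()
# drops the False entries. A self-contained toy example of the same idiom:
import numpy as np
import scipy.sparse as sps

toy_URM = sps.csr_matrix(np.array([[5.0, 3.0, 0.0],
                                   [0.0, 5.0, 1.0]]))
toy_URM.data = toy_URM.data == 5      # True where the rating is 5
toy_URM.eliminate_zeros()             # False counts as zero and is removed
print(toy_URM.toarray())
# [[ True False False]
#  [False  True False]]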
def _load_from_original_file(self): # Load data from original self._print("Loading original data") zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER try: dataFile = zipfile.ZipFile( zipFile_path + "neural_factorization_machine-master.zip") except (FileNotFoundError, zipfile.BadZipFile): self._print("Unable to fild data zip file. Downloading...") download_from_URL(self.DATASET_URL, zipFile_path, "neural_factorization_machine-master.zip") dataFile = zipfile.ZipFile( zipFile_path + "neural_factorization_machine-master.zip") inner_path_in_zip = "neural_factorization_machine-master/data/frappe/" URM_train_path = dataFile.extract(inner_path_in_zip + "frappe.train.libfm", path=zipFile_path + "decompressed/") URM_test_path = dataFile.extract(inner_path_in_zip + "frappe.test.libfm", path=zipFile_path + "decompressed/") URM_validation_path = dataFile.extract( inner_path_in_zip + "frappe.validation.libfm", path=zipFile_path + "decompressed/") tmp_URM_train, item_original_ID_to_index, user_original_ID_to_index = self._loadURM( URM_train_path, item_original_ID_to_index=None, user_original_ID_to_index=None) tmp_URM_test, item_original_ID_to_index, user_original_ID_to_index = self._loadURM( URM_test_path, item_original_ID_to_index=item_original_ID_to_index, user_original_ID_to_index=user_original_ID_to_index) tmp_URM_validation, item_original_ID_to_index, user_original_ID_to_index = self._loadURM( URM_validation_path, item_original_ID_to_index=item_original_ID_to_index, user_original_ID_to_index=user_original_ID_to_index) shape = (len(user_original_ID_to_index), len(item_original_ID_to_index)) tmp_URM_train = reshapeSparse(tmp_URM_train, shape) tmp_URM_test = reshapeSparse(tmp_URM_test, shape) tmp_URM_validation = reshapeSparse(tmp_URM_validation, shape) URM_occurrence = tmp_URM_train + tmp_URM_test + tmp_URM_validation URM_all = URM_occurrence.copy() URM_all.data = np.ones_like(URM_all.data) loaded_URM_dict = { "URM_all": URM_all, "URM_occurrence": URM_occurrence } loaded_dataset = Dataset( dataset_name=self._get_dataset_name(), URM_dictionary=loaded_URM_dict, ICM_dictionary=None, ICM_feature_mapper_dictionary=None, UCM_dictionary=None, UCM_feature_mapper_dictionary=None, user_original_ID_to_index=user_original_ID_to_index, item_original_ID_to_index=item_original_ID_to_index, is_implicit=self.IS_IMPLICIT, ) self._print("cleaning temporary files") shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True) self._print("loading complete") return loaded_dataset
def _load_from_original_file(self):
    # Load data from original zip file
    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")
    except (FileNotFoundError, zipfile.BadZipFile):
        self._print("Unable to find data zip file. Downloading...")
        download_from_URL(self.DATASET_URL, zipFile_path, "ml-20m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-20m.zip")

    genres_path = dataFile.extract("ml-20m/movies.csv", path=zipFile_path + "decompressed/")
    tags_path = dataFile.extract("ml-20m/tags.csv", path=zipFile_path + "decompressed/")
    URM_path = dataFile.extract("ml-20m/ratings.csv", path=zipFile_path + "decompressed/")

    self._print("loading genres")
    ICM_genres, tokenToFeatureMapper_ICM_genres, item_original_ID_to_index = _loadICM_genres(genres_path,
                                                                                             header=True,
                                                                                             separator=',',
                                                                                             genresSeparator="|")

    self._print("loading tags")
    ICM_tags, tokenToFeatureMapper_ICM_tags, _ = _loadICM_tags(tags_path, header=True, separator=',',
                                                               if_new_item="ignore",
                                                               item_original_ID_to_index=item_original_ID_to_index)

    self._print("loading URM")
    URM_all, item_original_ID_to_index, user_original_ID_to_index, URM_timestamp = _loadURM_preinitialized_item_id(URM_path,
                                                                                                                   separator=",",
                                                                                                                   header=True,
                                                                                                                   if_new_user="add",
                                                                                                                   if_new_item="ignore",
                                                                                                                   item_original_ID_to_index=item_original_ID_to_index)

    ICM_all, tokenToFeatureMapper_ICM_all = merge_ICM(ICM_genres, ICM_tags,
                                                      tokenToFeatureMapper_ICM_genres,
                                                      tokenToFeatureMapper_ICM_tags)

    loaded_URM_dict = {"URM_all": URM_all, "URM_timestamp": URM_timestamp}
    loaded_ICM_dict = {"ICM_genres": ICM_genres, "ICM_tags": ICM_tags, "ICM_all": ICM_all}
    loaded_ICM_mapper_dict = {"ICM_genres": tokenToFeatureMapper_ICM_genres,
                              "ICM_tags": tokenToFeatureMapper_ICM_tags,
                              "ICM_all": tokenToFeatureMapper_ICM_all}

    loaded_dataset = Dataset(dataset_name=self._get_dataset_name(),
                             URM_dictionary=loaded_URM_dict,
                             ICM_dictionary=loaded_ICM_dict,
                             ICM_feature_mapper_dictionary=loaded_ICM_mapper_dict,
                             UCM_dictionary=None,
                             UCM_feature_mapper_dictionary=None,
                             user_original_ID_to_index=user_original_ID_to_index,
                             item_original_ID_to_index=item_original_ID_to_index,
                             is_implicit=self.IS_IMPLICIT,
                             )

    self._print("cleaning temporary files")
    shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    self._print("loading complete")
    return loaded_dataset
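# merge_ICM is called above but defined elsewhere. A plausible minimal sketch of
# what merging two ICMs involves: stack them column-wise and shift the second
# feature mapper's indices past the first ICM's columns. The signature matches
# the call above; the body is an assumption:
import scipy.sparse as sps

def merge_ICM_sketch(ICM1, ICM2, mapper_ICM1, mapper_ICM2):
    ICM_all = sps.hstack([ICM1, ICM2], format='csr')

    mapper_ICM_all = dict(mapper_ICM1)
    n_features_ICM1 = ICM1.shape[1]

    for feature_token, feature_index in mapper_ICM2.items():
        # Features of the second ICM now live in columns offset by the first
        mapper_ICM_all[feature_token] = feature_index + n_features_ICM1

    return ICM_all, mapper_ICM_all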