def _load_from_original_file(self):
    """Build the dataset from the original ``ml-1m.zip`` archive.

    The archive is looked up under
    ``DATASET_SPLIT_ROOT_FOLDER + DATASET_SUBFOLDER``. If it is missing or
    is not a valid zip file it is downloaded from ``DATASET_URL`` and opened
    again. ``movies.dat``, ``users.dat`` and ``ratings.dat`` are extracted
    into a ``decompressed`` sub-folder, parsed, and registered with a
    ``DatasetMapperManager`` as ``URM_all``, ``URM_timestamp``,
    ``ICM_genres`` and ``UCM_all``.

    The archive handle is closed as soon as extraction is done, and the
    ``decompressed`` folder is removed even when loading fails.

    Returns:
        The object produced by ``DatasetMapperManager.generate_Dataset``.

    Raises:
        Any exception raised while opening the freshly downloaded archive,
        or while parsing the extracted files, is propagated to the caller.
    """
    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
    decompressed_path = zipFile_path + "decompressed/"

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")

    except (FileNotFoundError, zipfile.BadZipFile):
        self._print("Unable to find data zip file. Downloading...")
        download_from_URL(self.DATASET_URL, zipFile_path, "ml-1m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")

    try:
        # The context manager closes the archive handle once the three
        # members have been extracted; only the extracted paths are needed.
        with dataFile:
            ICM_genre_path = dataFile.extract("ml-1m/movies.dat", path=decompressed_path)
            UCM_path = dataFile.extract("ml-1m/users.dat", path=decompressed_path)
            URM_path = dataFile.extract("ml-1m/ratings.dat", path=decompressed_path)

        self._print("Loading Interactions")
        URM_all_dataframe, URM_timestamp_dataframe = _loadURM(URM_path, header=None, separator='::')

        self._print("Loading Item Features genres")
        ICM_genres_dataframe = _loadICM_genres(ICM_genre_path, header=None, separator='::', genresSeparator="|")

        self._print("Loading User Features")
        # A multi-character separator is handled by pandas' Python parsing
        # engine; requesting it explicitly avoids the fallback ParserWarning.
        UCM_dataframe = pd.read_csv(filepath_or_buffer=UCM_path, sep="::", header=None,
                                    dtype={0: str, 1: str, 2: str, 3: str, 4: str},
                                    engine="python")
        UCM_dataframe.columns = ["UserID", "gender", "age_group", "occupation", "zip_code"]

        # For each user a list of "<feature name>_<value>" tokens, in row order
        feature_columns = ["gender", "age_group", "occupation", "zip_code"]
        UCM_list = [
            [feature_name + "_" + str(feature_value)
             for feature_name, feature_value in zip(feature_columns, user_row)]
            for user_row in UCM_dataframe[feature_columns].itertuples(index=False, name=None)
        ]

        # One row per (user, feature token) pair, each with a constant weight of 1
        UCM_dataframe = pd.DataFrame(UCM_list, index=UCM_dataframe["UserID"]).stack()
        UCM_dataframe = UCM_dataframe.reset_index()[[0, 'UserID']]
        UCM_dataframe.columns = ['FeatureID', 'UserID']
        UCM_dataframe["Data"] = 1

        dataset_manager = DatasetMapperManager()
        dataset_manager.add_URM(URM_all_dataframe, "URM_all")
        dataset_manager.add_URM(URM_timestamp_dataframe, "URM_timestamp")
        dataset_manager.add_ICM(ICM_genres_dataframe, "ICM_genres")
        dataset_manager.add_UCM(UCM_dataframe, "UCM_all")

        loaded_dataset = dataset_manager.generate_Dataset(dataset_name=self._get_dataset_name(),
                                                          is_implicit=self.IS_IMPLICIT)

    finally:
        # Remove the extracted files whether or not loading succeeded
        self._print("cleaning temporary files")
        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    self._print("Loading Complete")

    return loaded_dataset
def _load_from_original_file(self):
    """Build the dataset from the original ``ml-10m.zip`` archive.

    The archive is looked up under
    ``DATASET_SPLIT_ROOT_FOLDER + DATASET_SUBFOLDER``. If it is missing or
    is not a valid zip file it is downloaded from ``DATASET_URL`` and opened
    again. ``movies.dat``, ``tags.dat`` and ``ratings.dat`` are extracted
    into a ``decompressed`` sub-folder, parsed, and registered with a
    ``DatasetMapperManager`` as ``URM_all``, ``URM_timestamp``,
    ``ICM_genres``, ``ICM_year``, ``ICM_tags`` and ``ICM_all`` (the
    concatenation of the genre and tag features).

    The archive handle is closed as soon as extraction is done, and the
    ``decompressed`` folder is removed even when loading fails.

    Returns:
        The object produced by ``DatasetMapperManager.generate_Dataset``.

    Raises:
        Any exception raised while opening the freshly downloaded archive,
        or while parsing the extracted files, is propagated to the caller.
    """
    zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
    decompressed_path = zipFile_path + "decompressed/"

    try:
        dataFile = zipfile.ZipFile(zipFile_path + "ml-10m.zip")

    except (FileNotFoundError, zipfile.BadZipFile):
        self._print("Unable to find data zip file. Downloading...")
        download_from_URL(self.DATASET_URL, zipFile_path, "ml-10m.zip")
        dataFile = zipfile.ZipFile(zipFile_path + "ml-10m.zip")

    try:
        # The context manager closes the archive handle once the three
        # members have been extracted; only the extracted paths are needed.
        with dataFile:
            ICM_genre_path = dataFile.extract("ml-10M100K/movies.dat", path=decompressed_path)
            ICM_tags_path = dataFile.extract("ml-10M100K/tags.dat", path=decompressed_path)
            URM_path = dataFile.extract("ml-10M100K/ratings.dat", path=decompressed_path)

        self._print("Loading Item Features Genres")
        ICM_genres_dataframe, ICM_years_dataframe = _loadICM_genres_years(
            ICM_genre_path, header=None, separator='::', genresSeparator="|")

        self._print("Loading Item Features Tags")
        ICM_tags_dataframe = _loadICM_tags(ICM_tags_path, header=None, separator='::')

        # Combined item features: genres followed by tags
        ICM_all_dataframe = pd.concat([ICM_genres_dataframe, ICM_tags_dataframe])

        self._print("Loading Interactions")
        URM_all_dataframe, URM_timestamp_dataframe = _loadURM(URM_path, header=None, separator='::')

        dataset_manager = DatasetMapperManager()
        dataset_manager.add_URM(URM_all_dataframe, "URM_all")
        dataset_manager.add_URM(URM_timestamp_dataframe, "URM_timestamp")
        dataset_manager.add_ICM(ICM_genres_dataframe, "ICM_genres")
        dataset_manager.add_ICM(ICM_years_dataframe, "ICM_year")
        dataset_manager.add_ICM(ICM_tags_dataframe, "ICM_tags")
        dataset_manager.add_ICM(ICM_all_dataframe, "ICM_all")

        loaded_dataset = dataset_manager.generate_Dataset(
            dataset_name=self._get_dataset_name(),
            is_implicit=self.IS_IMPLICIT)

    finally:
        # Remove the extracted files whether or not loading succeeded
        self._print("Cleaning Temporary Files")
        shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

    self._print("Loading Complete")

    return loaded_dataset
def _load_from_original_file(self):
    """Build the dataset from the original interaction and item-feature CSVs.

    Both files are read from hard-coded relative paths (presumably resolved
    against the current working directory -- confirm against ``_loadURM`` /
    ``_loadICM``) with ``header=None`` and a comma separator. The results are
    registered with a ``DatasetMapperManager`` as ``URM_all`` and ``ICM_all``.

    Returns:
        The object produced by ``DatasetMapperManager.generate_Dataset``.
    """
    interactions_path = '../../data/data_train.csv'
    item_features_path = '../../data/data_ICM_title_abstract.csv'

    self._print("Loading Interactions")
    interactions = _loadURM(interactions_path, header=None, separator=',')
    item_features = _loadICM(item_features_path, header=None, separator=',')

    manager = DatasetMapperManager()
    manager.add_URM(interactions, "URM_all")
    manager.add_ICM(item_features, "ICM_all")

    return manager.generate_Dataset(
        dataset_name=self._get_dataset_name(),
        is_implicit=self.IS_IMPLICIT,
    )
def _load_from_original_file(self):
    """Build the dataset from the CSV files under ``Data_manager/BookData``.

    The interaction file and the item-feature file are both read with
    ``header=0`` and a comma separator, then registered with a
    ``DatasetMapperManager`` as ``URM_all`` and ``ICM_all``.

    Returns:
        The object produced by ``DatasetMapperManager.generate_Dataset``.
    """
    interactions_path = "Data_manager/BookData/data_train.csv"
    item_features_path = "Data_manager/BookData/data_ICM_title_abstract.csv"

    self._print("Loading Interactions")
    interactions = _loadURM(interactions_path, header=0, separator=",")

    self._print("Loading Item Features")
    item_features = _loadICM(item_features_path, header=0, separator=",")

    manager = DatasetMapperManager()
    manager.add_URM(interactions, "URM_all")
    manager.add_ICM(item_features, "ICM_all")

    dataset = manager.generate_Dataset(
        dataset_name=self._get_dataset_name(),
        is_implicit=self.IS_IMPLICIT,
    )

    self._print("Loading Complete")
    return dataset