def _load_from_original_file(self):
        """Build the dataset from the original ``ml-1m.zip`` archive.

        The archive is looked up in
        ``DATASET_SPLIT_ROOT_FOLDER + DATASET_SUBFOLDER``. If it is missing or is
        not a valid zip file, ``download_from_URL`` is invoked with
        ``DATASET_URL`` and the archive is opened again. The movies, users and
        ratings files are extracted into a temporary ``decompressed`` folder,
        parsed, and registered with a ``DatasetMapperManager``. The temporary
        folder is removed before returning, also when loading fails.

        :return: the object returned by ``DatasetMapperManager.generate_Dataset``.
        """
        zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
        decompressed_path = zipFile_path + "decompressed/"

        try:
            dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")
        except (FileNotFoundError, zipfile.BadZipFile):
            self._print("Unable to find data zip file. Downloading...")
            download_from_URL(self.DATASET_URL, zipFile_path, "ml-1m.zip")
            dataFile = zipfile.ZipFile(zipFile_path + "ml-1m.zip")

        try:
            # The archive is only needed for extraction: close the handle right
            # after, instead of leaking it until garbage collection.
            with dataFile:
                ICM_genre_path = dataFile.extract("ml-1m/movies.dat", path=decompressed_path)
                UCM_path = dataFile.extract("ml-1m/users.dat", path=decompressed_path)
                URM_path = dataFile.extract("ml-1m/ratings.dat", path=decompressed_path)

            self._print("Loading Interactions")
            URM_all_dataframe, URM_timestamp_dataframe = _loadURM(URM_path, header=None, separator='::')

            self._print("Loading Item Features genres")
            ICM_genres_dataframe = _loadICM_genres(ICM_genre_path, header=None, separator='::', genresSeparator="|")

            self._print("Loading User Features")
            feature_names = ["gender", "age_group", "occupation", "zip_code"]
            # A multi-character separator is only supported by the python
            # engine; requesting it explicitly avoids the fallback ParserWarning.
            UCM_dataframe = pd.read_csv(filepath_or_buffer=UCM_path, sep="::", header=None,
                                        dtype={0: str, 1: str, 2: str, 3: str, 4: str},
                                        engine="python")
            UCM_dataframe.columns = ["UserID"] + feature_names

            # For each user a list of "<feature name>_<value>" tokens, built in
            # one pass over the rows rather than one cell lookup at a time.
            UCM_list = [
                [name + "_" + str(value) for name, value in zip(feature_names, row)]
                for row in UCM_dataframe[feature_names].itertuples(index=False, name=None)
            ]

            # Reshape to long format: one (FeatureID, UserID) row per token.
            UCM_dataframe = pd.DataFrame(UCM_list, index=UCM_dataframe["UserID"]).stack()
            UCM_dataframe = UCM_dataframe.reset_index()[[0, 'UserID']]
            UCM_dataframe.columns = ['FeatureID', 'UserID']
            UCM_dataframe["Data"] = 1

            dataset_manager = DatasetMapperManager()
            dataset_manager.add_URM(URM_all_dataframe, "URM_all")
            dataset_manager.add_URM(URM_timestamp_dataframe, "URM_timestamp")
            dataset_manager.add_ICM(ICM_genres_dataframe, "ICM_genres")
            dataset_manager.add_UCM(UCM_dataframe, "UCM_all")

            loaded_dataset = dataset_manager.generate_Dataset(dataset_name=self._get_dataset_name(),
                                                              is_implicit=self.IS_IMPLICIT)
        finally:
            # Remove the extracted files whether or not loading succeeded.
            self._print("cleaning temporary files")
            shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        self._print("Loading Complete")

        return loaded_dataset
Esempio n. 2
0
    def _load_from_original_file(self):
        """Build the dataset from the original ``ml-10m.zip`` archive.

        The archive is looked up in
        ``DATASET_SPLIT_ROOT_FOLDER + DATASET_SUBFOLDER``. If it is missing or is
        not a valid zip file, ``download_from_URL`` is invoked with
        ``DATASET_URL`` and the archive is opened again. The movies, tags and
        ratings files are extracted into a temporary ``decompressed`` folder,
        parsed, and registered with a ``DatasetMapperManager``. The temporary
        folder is removed before returning, also when loading fails.

        :return: the object returned by ``DatasetMapperManager.generate_Dataset``.
        """
        zipFile_path = self.DATASET_SPLIT_ROOT_FOLDER + self.DATASET_SUBFOLDER
        decompressed_path = zipFile_path + "decompressed/"

        try:
            dataFile = zipfile.ZipFile(zipFile_path + "ml-10m.zip")
        except (FileNotFoundError, zipfile.BadZipFile):
            self._print("Unable to find data zip file. Downloading...")
            download_from_URL(self.DATASET_URL, zipFile_path, "ml-10m.zip")
            dataFile = zipfile.ZipFile(zipFile_path + "ml-10m.zip")

        try:
            # The archive is only needed for extraction: close the handle right
            # after, instead of leaking it until garbage collection.
            with dataFile:
                ICM_genre_path = dataFile.extract("ml-10M100K/movies.dat",
                                                  path=decompressed_path)
                ICM_tags_path = dataFile.extract("ml-10M100K/tags.dat",
                                                 path=decompressed_path)
                URM_path = dataFile.extract("ml-10M100K/ratings.dat",
                                            path=decompressed_path)

            self._print("Loading Item Features Genres")
            ICM_genres_dataframe, ICM_years_dataframe = _loadICM_genres_years(
                ICM_genre_path, header=None, separator='::', genresSeparator="|")

            self._print("Loading Item Features Tags")
            ICM_tags_dataframe = _loadICM_tags(ICM_tags_path,
                                               header=None,
                                               separator='::')

            # NOTE(review): ICM_all combines genres and tags only; the year
            # features are registered separately below - confirm this is intended.
            ICM_all_dataframe = pd.concat(
                [ICM_genres_dataframe, ICM_tags_dataframe])

            self._print("Loading Interactions")
            URM_all_dataframe, URM_timestamp_dataframe = _loadURM(URM_path,
                                                                  header=None,
                                                                  separator='::')

            dataset_manager = DatasetMapperManager()
            dataset_manager.add_URM(URM_all_dataframe, "URM_all")
            dataset_manager.add_URM(URM_timestamp_dataframe, "URM_timestamp")
            dataset_manager.add_ICM(ICM_genres_dataframe, "ICM_genres")
            dataset_manager.add_ICM(ICM_years_dataframe, "ICM_year")
            dataset_manager.add_ICM(ICM_tags_dataframe, "ICM_tags")
            dataset_manager.add_ICM(ICM_all_dataframe, "ICM_all")

            loaded_dataset = dataset_manager.generate_Dataset(
                dataset_name=self._get_dataset_name(),
                is_implicit=self.IS_IMPLICIT)
        finally:
            # Remove the extracted files whether or not loading succeeded.
            self._print("Cleaning Temporary Files")
            shutil.rmtree(zipFile_path + "decompressed", ignore_errors=True)

        self._print("Loading Complete")

        return loaded_dataset
Esempio n. 3
0
    def _load_from_original_file(self):
        """Build the dataset from the interaction and item-feature CSV files.

        Both paths are relative, so they are resolved against the current
        working directory. The two tables are read with ``_loadURM`` and
        ``_loadICM`` (comma separator, ``header=None``) and registered with a
        ``DatasetMapperManager`` as ``"URM_all"`` and ``"ICM_all"``.

        :return: the object returned by ``DatasetMapperManager.generate_Dataset``.
        """
        interactions_csv = '../../data/data_train.csv'
        item_features_csv = '../../data/data_ICM_title_abstract.csv'

        self._print("Loading Interactions")
        interactions = _loadURM(interactions_csv, header=None, separator=',')
        item_features = _loadICM(item_features_csv, header=None, separator=',')

        mapper = DatasetMapperManager()
        mapper.add_URM(interactions, "URM_all")
        mapper.add_ICM(item_features, "ICM_all")

        return mapper.generate_Dataset(dataset_name=self._get_dataset_name(),
                                       is_implicit=self.IS_IMPLICIT)
Esempio n. 4
0
    def _load_from_original_file(self):
        """Build the dataset from the CSV files under ``Data_manager/BookData``.

        Both paths are relative, so they are resolved against the current
        working directory. The interactions and item features are read with
        ``_loadURM`` and ``_loadICM`` (comma separator, ``header=0``) and
        registered with a ``DatasetMapperManager`` as ``"URM_all"`` and
        ``"ICM_all"``.

        :return: the object returned by ``DatasetMapperManager.generate_Dataset``.
        """
        interactions_csv = "Data_manager/BookData/data_train.csv"
        item_features_csv = "Data_manager/BookData/data_ICM_title_abstract.csv"

        self._print("Loading Interactions")
        interactions = _loadURM(interactions_csv, header=0, separator=",")

        self._print("Loading Item Features")
        item_features = _loadICM(item_features_csv, header=0, separator=",")

        mapper = DatasetMapperManager()
        mapper.add_URM(interactions, "URM_all")
        mapper.add_ICM(item_features, "ICM_all")

        dataset = mapper.generate_Dataset(dataset_name=self._get_dataset_name(),
                                          is_implicit=self.IS_IMPLICIT)

        self._print("Loading Complete")
        return dataset