Example #1
    def load_interaction(self):
        """Load the user-item interaction.

        Load the interaction from the processed file (the raw file needs to be
        preprocessed before loading).
        """
        processed_file_path = os.path.join(
            self.processed_path, f"{self.dataset_name}_interaction.npz")
        if not os.path.exists(processed_file_path):
            try:
                # Build the processed interaction file from the raw download.
                self.preprocess()
            except FileNotFoundError:
                # The raw archive is missing or corrupted: delete it, re-download,
                # and preprocess again.
                print("Original file is broken, re-downloading it")
                raw_file_path = os.path.join(self.raw_path,
                                             f"{self.dataset_name}.zip")
                os.remove(raw_file_path)
                self.download()
                self.preprocess()
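        # Load the processed interactions into a DataFrame and print per-column
        # count / nunique statistics as a psql-style table.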
        data = get_dataframe_from_npz(processed_file_path)
        print("-" * 80)
        print("Loaded raw interaction statistics")
        print(
            tabulate(
                data.agg(["count", "nunique"]),
                headers=data.columns,
                tablefmt="psql",
                disable_numparse=True,
            ))
        print("-" * 80)
        return data
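Both examples read the processed interactions back with get_dataframe_from_npz, which is not shown above. As a rough idea of the round trip only, here is a minimal sketch of a save/load pair; the helper names and the column-per-array layout are assumptions, not the project's actual implementation.

import numpy as np
import pandas as pd


def save_dataframe_as_npz(data, path):
    # Store each DataFrame column as a named array in a compressed .npz file.
    np.savez_compressed(path, **{col: data[col].values for col in data.columns})


def get_dataframe_from_npz(path):
    # Rebuild the DataFrame from the named arrays stored in the .npz file.
    with np.load(path, allow_pickle=True) as npz:
        return pd.DataFrame({key: npz[key] for key in npz.files})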
Example #2
    def load_interaction(self):
        """Load the user-item interaction

        Load the interaction from the processed file(Need to preprocess the raw file before loading)
        """
        processed_file_path = os.path.join(
            self.processed_path, f"{self.dataset_name}_interaction.npz")
        if not os.path.exists(processed_file_path):
            self.preprocess()
        data = get_dataframe_from_npz(processed_file_path)
        print("-" * 80)
        print("raw interaction statistics")
        print(
            tabulate(
                data.agg(["count", "nunique"]),
                headers=data.columns,
                tablefmt="psql",
                disable_numparse=True,
            ))
        print("-" * 80)
        return data
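The statistics table printed by both methods can be reproduced in isolation. Below is a small self-contained sketch with toy interaction data; the column names are assumptions used purely for illustration.

import pandas as pd
from tabulate import tabulate

# Toy interaction frame standing in for the loaded .npz data.
data = pd.DataFrame({
    "col_user": [1, 1, 2, 3],
    "col_item": [10, 20, 20, 30],
    "col_rating": [5.0, 3.0, 4.0, 1.0],
})

# "count" is the number of non-null entries and "nunique" the number of
# distinct values per column; tabulate renders the result as a psql-style table.
print(tabulate(
    data.agg(["count", "nunique"]),
    headers=data.columns,
    tablefmt="psql",
    disable_numparse=True,
))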
Example #3
def load_split_data(path, n_test=10):
    """Load split DataFrame from a specified path.

    Args:
        path (str): Path of the split data.
        n_test (int): Number of validation and testing sets.
                If n_test == 0, the original (no negative items) validation and test
                sets are loaded and returned as single DataFrames instead of lists.

    Returns:
        (DataFrame, list(DataFrame), list(DataFrame)): DataFrame of the training interactions,
        list of DataFrames of the validation interactions,
        and list of DataFrames of the testing interactions.
    """
    train_file = os.path.join(path, "train.npz")
    train_data = get_dataframe_from_npz(train_file)
    print("-" * 80)
    print("Loaded training set statistics")
    print(
        tabulate(
            train_data.agg(["count", "nunique"]),
            headers=train_data.columns,
            tablefmt="psql",
            disable_numparse=True,
        )
    )
    if not n_test:
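        # n_test == 0: load the single original validation/test split (no sampled negatives).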
        valid_df = get_dataframe_from_npz(os.path.join(path, "valid.npz"))
        test_df = get_dataframe_from_npz(os.path.join(path, "test.npz"))
        print("Loaded validation set statistics")
        print(
            tabulate(
                valid_df.agg(["count", "nunique"]),
                headers=valid_df.columns,
                tablefmt="psql",
                disable_numparse=True,
            )
        )
        print("Loaded testing set statistics")
        print(
            tabulate(
                test_df.agg(["count", "nunique"]),
                headers=test_df.columns,
                tablefmt="psql",
                disable_numparse=True,
            )
        )
        print("-" * 80)
        return train_data, valid_df, test_df

    valid_data_li = []
    test_data_li = []
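    # Load n_test validation/test splits (each with sampled negative items) and
    # print statistics for the first split only.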
    for i in range(n_test):
        valid_df = get_dataframe_from_npz(os.path.join(path, f"valid_{i}.npz"))
        valid_data_li.append(valid_df)
        if i == 0:
            print(f"valid_data_{i} statistics")
            print(
                tabulate(
                    valid_df.agg(["count", "nunique"]),
                    headers=valid_df.columns,
                    tablefmt="psql",
                    disable_numparse=True,
                )
            )
        test_df = get_dataframe_from_npz(os.path.join(path, f"test_{i}.npz"))
        test_data_li.append(test_df)
        if i == 0:
            print(f"test_data_{i} statistics")
            print(
                tabulate(
                    test_df.agg(["count", "nunique"]),
                    headers=test_df.columns,
                    tablefmt="psql",
                    disable_numparse=True,
                )
            )
    print("-" * 80)
    return train_data, valid_data_li, test_data_li
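A minimal usage sketch of load_split_data, assuming the split files (train.npz and either valid.npz/test.npz or valid_0.npz ... test_9.npz) already exist under the given directory; the directory path is illustrative only.

split_dir = "./datasets/ml_100k/leave_one_out"  # assumed directory layout

# One validation/test DataFrame per negative-sampling run.
train_df, valid_dfs, test_dfs = load_split_data(split_dir, n_test=10)
print(len(valid_dfs), len(test_dfs))  # 10 10

# With n_test=0 the original splits are returned as single DataFrames.
train_df, valid_df, test_df = load_split_data(split_dir, n_test=0)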