Example 1
    def test_delimited_str_ids(self):

        output_dir = self.make_directory_tree("test_delimited_str_ids")

        tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"),
                          header=None,
                          sep=" ")

        tmp[0] = tmp[0].map(str) + "_test"
        tmp[1] = tmp[1].map(str) + "_test"
        tmp[2] = tmp[2].map(str) + "_test"

        tmp.to_csv(Path(TMP_TEST_DIR) / Path("str_train_edges.txt"),
                   header=False,
                   sep=" ",
                   index=False)

        converter = SparkEdgeListConverter(output_dir=output_dir,
                                           train_edges=Path(TMP_TEST_DIR) /
                                           Path("str_train_edges.txt"),
                                           delim=" ")

        converter.convert()

        expected_stats = DatasetConfig()
        expected_stats.dataset_dir = str(output_dir)
        expected_stats.num_edges = 1000
        expected_stats.num_nodes = 100
        expected_stats.num_relations = 10
        expected_stats.num_train = 1000

        validate_output_dir(output_dir=output_dir,
                            expected_stats=expected_stats,
                            dtype=np.int32,
                            remap_ids=True)
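
A minimal sketch of the remapping this test exercises, using plain pandas rather than the converter's internals (the frame contents here are hypothetical): with remap_ids=True, arbitrary string IDs are mapped to dense integer indices before the edges are written out.

import pandas as pd

# Hypothetical miniature edge list with string IDs, as built above.
edges = pd.DataFrame({"src": ["a_test", "b_test", "a_test"],
                      "rel": ["r0_test", "r0_test", "r1_test"],
                      "dst": ["b_test", "c_test", "c_test"]})

# Build dense ID maps over all node and relation values seen.
node_ids = pd.concat([edges["src"], edges["dst"]]).unique()
node_map = {nid: i for i, nid in enumerate(node_ids)}
rel_map = {rid: i for i, rid in enumerate(edges["rel"].unique())}

edges["src"] = edges["src"].map(node_map)
edges["dst"] = edges["dst"].map(node_map)
edges["rel"] = edges["rel"].map(rel_map)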
Example 2
    def test_header(self):

        output_dir = self.make_directory_tree("test_header")

        tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"),
                          header=None,
                          sep=" ")
        tmp.to_csv(Path(TMP_TEST_DIR) / Path("header_train_edges.txt"),
                   header=["src", "rel", "dst"],
                   sep=" ",
                   index=False)

        converter = SparkEdgeListConverter(output_dir=output_dir,
                                           train_edges=Path(TMP_TEST_DIR) /
                                           Path("header_train_edges.txt"),
                                           delim=" ",
                                           header_length=1)

        converter.convert()

        expected_stats = DatasetConfig()
        expected_stats.dataset_dir = str(output_dir)
        expected_stats.num_edges = 1000
        expected_stats.num_nodes = 100
        expected_stats.num_relations = 10
        expected_stats.num_train = 1000

        validate_output_dir(output_dir=output_dir,
                            expected_stats=expected_stats,
                            dtype=np.int32,
                            remap_ids=True)
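
header_length=1 tells the converter how many leading rows to discard before parsing. A pandas equivalent of that skip, assuming the file written above:

import pandas as pd
from pathlib import Path

# Skip the single "src rel dst" header row and parse the rest as edges.
edges = pd.read_csv(Path("header_train_edges.txt"), sep=" ",
                    header=None, skiprows=1)
assert edges.shape[1] == 3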
Example 3
    def test_partitions(self):
        output_dir = self.make_directory_tree("test_partitions")

        converter = SparkEdgeListConverter(output_dir=output_dir,
                                           train_edges=Path(TMP_TEST_DIR) /
                                           Path("train_edges.txt"),
                                           delim=" ",
                                           num_partitions=10)

        converter.convert()

        expected_stats = DatasetConfig()
        expected_stats.dataset_dir = str(output_dir)
        expected_stats.num_edges = 1000
        expected_stats.num_nodes = 100
        expected_stats.num_relations = 10
        expected_stats.num_train = 1000

        validate_partitioned_output_dir(output_dir=output_dir,
                                        expected_stats=expected_stats,
                                        dtype=np.int32,
                                        num_partitions=10)

        converter = SparkEdgeListConverter(output_dir=output_dir,
                                           train_edges=Path(TMP_TEST_DIR) /
                                           Path("train_edges.txt"),
                                           delim=" ",
                                           num_partitions=100)

        converter.convert()

        validate_partitioned_output_dir(output_dir=output_dir,
                                        expected_stats=expected_stats,
                                        dtype=np.int32,
                                        num_partitions=100)
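
For context, a hedged sketch of how the per-bucket offsets produced by a partitioned conversion can be consumed; the file name is illustrative, and each line is assumed to hold one bucket's edge count:

import numpy as np

with open("train_edge_buckets.txt") as f:
    bucket_sizes = [int(line) for line in f]

# Cumulative sums turn counts into slice boundaries into the edge list:
# bucket i occupies rows offsets[i]:offsets[i + 1].
offsets = np.concatenate([[0], np.cumsum(bucket_sizes)])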
Example 4
    def test_no_remap(self):

        output_dir = Path(TMP_TEST_DIR) / Path("test_no_remap")
        output_dir.mkdir()

        converter = TorchEdgeListConverter(
            output_dir=output_dir,
            train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"),
            delim=" ",
            remap_ids=False,
            num_nodes=100,
            num_rels=10
        )

        converter.convert()

        expected_stats = DatasetConfig()
        expected_stats.dataset_dir = str(output_dir)
        expected_stats.num_edges = 1000
        expected_stats.num_nodes = 100
        expected_stats.num_relations = 10
        expected_stats.num_train = 1000

        validate_output_dir(output_dir=output_dir,
                            expected_stats=expected_stats,
                            dtype=np.int32,
                            remap_ids=False)
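
Because remap_ids=False skips the ID-mapping pass, the converter cannot infer num_nodes or num_rels and they must be passed explicitly. A quick sanity check one might run on such pre-mapped input (illustrative, not part of the converter):

import pandas as pd

edges = pd.read_csv("train_edges.txt", header=None, sep=" ")

# Pre-mapped IDs must already be dense integers within the declared ranges.
assert edges[[0, 2]].to_numpy().max() < 100   # num_nodes
assert edges[1].max() < 10                    # num_rels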
Example 5
    def test_delim(self):

        output_dir = Path(TMP_TEST_DIR) / Path("test_delim")
        output_dir.mkdir()

        tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")
        tmp.to_csv(Path(TMP_TEST_DIR) / Path("delim_train_edges.txt"), header=False, sep=",", index=False)

        converter = TorchEdgeListConverter(
            output_dir=output_dir,
            train_edges=Path(TMP_TEST_DIR) / Path("delim_train_edges.txt"),
            delim=","
        )

        converter.convert()

        expected_stats = DatasetConfig()
        expected_stats.dataset_dir = str(output_dir)
        expected_stats.num_edges = 1000
        expected_stats.num_nodes = 100
        expected_stats.num_relations = 10
        expected_stats.num_train = 1000

        validate_output_dir(output_dir=output_dir,
                            expected_stats=expected_stats,
                            dtype=np.int32,
                            remap_ids=True)
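
The delimiter only affects parsing, not the parsed content; a small sketch that checks the round-trip this test sets up (paths relative to the test directory):

import pandas as pd

space = pd.read_csv("train_edges.txt", header=None, sep=" ")
space.to_csv("delim_train_edges.txt", header=False, sep=",", index=False)
comma = pd.read_csv("delim_train_edges.txt", header=None, sep=",")

# Same frame either way; only the on-disk separator differs.
assert space.equals(comma)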
Example 6
    def test_splits(self):
        output_dir = Path(TMP_TEST_DIR) / Path("test_splits")
        output_dir.mkdir()

        converter = TorchEdgeListConverter(
            output_dir=output_dir,
            train_edges=Path(TMP_TEST_DIR) / Path("train_edges.txt"),
            delim=" ",
            splits=[.9, .05, .05]
        )

        converter.convert()

        expected_stats = DatasetConfig()
        expected_stats.dataset_dir = str(output_dir)
        expected_stats.num_edges = 900
        expected_stats.num_nodes = 100
        expected_stats.num_relations = 10
        expected_stats.num_train = 900
        expected_stats.num_valid = 50
        expected_stats.num_test = 50

        validate_output_dir(output_dir=output_dir,
                            expected_stats=expected_stats,
                            dtype=np.int32,
                            remap_ids=True)
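
Conceptually, splits=[.9, .05, .05] shuffles the edge list and carves it into train/valid/test slices by fraction, which is why the expected stats above show 900/50/50. A plain-torch sketch of that behavior (not the converter's actual code):

import torch

edges = torch.randint(0, 100, (1000, 3))  # stand-in for the real edge list
n_train = int(0.9 * edges.size(0))   # 900
n_valid = int(0.05 * edges.size(0))  # 50

perm = torch.randperm(edges.size(0))
train = edges[perm[:n_train]]
valid = edges[perm[n_train:n_train + n_valid]]
test = edges[perm[n_train + n_valid:]]  # remaining 50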
Example 7
    def test_pytorch_defaults(self):
        output_dir = Path(TMP_TEST_DIR) / Path("test_pytorch_defaults")
        output_dir.mkdir()

        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path("train_edges.txt"), header=None, sep=" ")

        train_edges = torch.tensor(train_edges_df.to_numpy())

        converter = TorchEdgeListConverter(
            output_dir=output_dir,
            train_edges=train_edges,
            format="pytorch"
        )

        converter.convert()

        expected_stats = DatasetConfig()
        expected_stats.dataset_dir = str(output_dir)
        expected_stats.num_edges = 1000
        expected_stats.num_nodes = 100
        expected_stats.num_relations = 10
        expected_stats.num_train = 1000

        validate_output_dir(output_dir=output_dir,
                            expected_stats=expected_stats,
                            dtype=np.int32,
                            remap_ids=True)
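
With format="pytorch" the converter takes an in-memory tensor instead of a file path; each row is one (src, rel, dst) triple, matching the column order used throughout these tests. The shape contract, illustrated:

import torch

train_edges = torch.randint(0, 100, (1000, 3))
assert train_edges.dim() == 2 and train_edges.size(1) == 3
src, rel, dst = train_edges[:, 0], train_edges[:, 1], train_edges[:, 2]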
Example 8
    def write_to_binary(self,
                        train_edges_tens,
                        valid_edges_tens,
                        test_edges_tens,
                        num_nodes,
                        num_rels,
                        num_partitions,
                        train_edges_offsets=None,
                        valid_edges_offsets=None,
                        test_edges_offsets=None):

        dataset_stats = DatasetConfig()
        dataset_stats.dataset_dir = str(
            Path(self.output_dir).absolute()) + "/"

        dataset_stats.num_edges = train_edges_tens.size(0)
        dataset_stats.num_train = train_edges_tens.size(0)

        if valid_edges_tens is not None:
            dataset_stats.num_valid = valid_edges_tens.size(0)
        if test_edges_tens is not None:
            dataset_stats.num_test = test_edges_tens.size(0)

        dataset_stats.num_nodes = num_nodes
        dataset_stats.num_relations = num_rels

        with open(self.output_dir / Path("dataset.yaml"), "w") as f:
            print("Dataset statistics written to: {}".format(
                self.output_dir / Path("dataset.yaml")))
            f.write(OmegaConf.to_yaml(dataset_stats))

        # Raw dump of the tensor's underlying buffer; dtype and shape are
        # implicit, so readers must know them from the dataset config.
        with open(self.output_dir / Path(PathConstants.train_edges_path),
                  "wb") as f:
            f.write(bytes(train_edges_tens.numpy()))

        if valid_edges_tens is not None:
            with open(self.output_dir / Path(PathConstants.valid_edges_path),
                      "wb") as f:
                f.write(bytes(valid_edges_tens.numpy()))

        if test_edges_tens is not None:
            with open(self.output_dir / Path(PathConstants.test_edges_path),
                      "wb") as f:
                f.write(bytes(test_edges_tens.numpy()))

        if num_partitions > 1:
            # One edge-bucket size per line; readers take cumulative sums
            # to recover slice offsets into the edge files.
            with open(
                    self.output_dir /
                    Path(PathConstants.train_edge_buckets_path), "w") as f:
                f.writelines([str(o) + "\n" for o in train_edges_offsets])

            if valid_edges_offsets is not None:
                with open(
                        self.output_dir /
                        Path(PathConstants.valid_edge_buckets_path), "w") as f:
                    f.writelines([str(o) + "\n" for o in valid_edges_offsets])

            if test_edges_offsets is not None:
                with open(
                        self.output_dir /
                        Path(PathConstants.test_edge_buckets_path), "w") as f:
                    f.writelines([str(o) + "\n" for o in test_edges_offsets])

        return dataset_stats
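
Since write_to_binary dumps the raw tensor bytes with no framing, reading a file back is a frombuffer plus reshape. A sketch, assuming int32 edges and an illustrative file name (the real path comes from PathConstants.train_edges_path):

import numpy as np

with open("train_edges.bin", "rb") as f:
    flat = np.frombuffer(f.read(), dtype=np.int32)

edges = flat.reshape(-1, 3)  # one row per (src, rel, dst) triple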
Example 9
    def write_to_csv(self, train_edges_df, valid_edges_df, test_edges_df,
                     nodes_df, rels_df, num_partitions):

        dataset_stats = DatasetConfig()
        dataset_stats.dataset_dir = str(Path(self.output_dir).absolute())

        # Record the edge count from the bookkeeping index column, then drop
        # that column so it is not written out with the edges.
        dataset_stats.num_edges = get_df_count(train_edges_df, EDGES_INDEX_COL)
        train_edges_df = train_edges_df.drop(EDGES_INDEX_COL)
        dataset_stats.num_train = dataset_stats.num_edges

        if valid_edges_df is not None:
            dataset_stats.num_valid = get_df_count(valid_edges_df,
                                                   EDGES_INDEX_COL)
            valid_edges_df = valid_edges_df.drop(EDGES_INDEX_COL)
        if test_edges_df is not None:
            dataset_stats.num_test = get_df_count(test_edges_df,
                                                  EDGES_INDEX_COL)
            test_edges_df = test_edges_df.drop(EDGES_INDEX_COL)

        dataset_stats.num_nodes = get_df_count(nodes_df, INDEX_COL)

        if rels_df is None:
            dataset_stats.num_relations = 1
        else:
            dataset_stats.num_relations = get_df_count(rels_df, REL_INDEX_COL)

        with open(self.output_dir / Path("dataset.yaml"), "w") as f:
            print("Dataset statistics written to: {}".format(
                self.output_dir / Path("dataset.yaml")))
            f.write(OmegaConf.to_yaml(dataset_stats))

        write_df_to_csv(
            nodes_df, self.output_dir / Path(PathConstants.node_mapping_path))

        if rels_df is not None:
            write_df_to_csv(
                rels_df,
                self.output_dir / Path(PathConstants.relation_mapping_path))

        if num_partitions > 1:
            offsets = write_partitioned_df_to_csv(
                train_edges_df, num_partitions,
                self.output_dir / Path(PathConstants.train_edges_path))

            with open(
                    self.output_dir /
                    Path(PathConstants.train_edge_buckets_path), "w") as f:
                f.writelines([str(o) + "\n" for o in offsets])

            if self.partitioned_evaluation:
                if valid_edges_df is not None:
                    offsets = write_partitioned_df_to_csv(
                        valid_edges_df, num_partitions,
                        self.output_dir / Path(PathConstants.valid_edges_path))

                    with open(
                            self.output_dir /
                            Path(PathConstants.valid_edge_buckets_path),
                            "w") as f:
                        f.writelines([str(o) + "\n" for o in offsets])

                if test_edges_df is not None:
                    offsets = write_partitioned_df_to_csv(
                        test_edges_df, num_partitions,
                        self.output_dir / Path(PathConstants.test_edges_path))
                    with open(
                            self.output_dir /
                            Path(PathConstants.test_edge_buckets_path),
                            "w") as f:
                        f.writelines([str(o) + "\n" for o in offsets])

            else:
                if valid_edges_df is not None:
                    write_df_to_csv(
                        valid_edges_df,
                        self.output_dir / Path(PathConstants.valid_edges_path))

                if test_edges_df is not None:
                    write_df_to_csv(
                        test_edges_df,
                        self.output_dir / Path(PathConstants.test_edges_path))

        else:
            write_df_to_csv(
                train_edges_df,
                self.output_dir / Path(PathConstants.train_edges_path))

            if valid_edges_df is not None:
                write_df_to_csv(
                    valid_edges_df,
                    self.output_dir / Path(PathConstants.valid_edges_path))

            if test_edges_df is not None:
                write_df_to_csv(
                    test_edges_df,
                    self.output_dir / Path(PathConstants.test_edges_path))

        return dataset_stats
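
For orientation, a hedged sketch of how a partitioned writer like write_partitioned_df_to_csv might derive its bucket counts: dense node IDs are divided into equal ranges, and every edge falls into one (src-bucket, dst-bucket) cell. All names and numbers here are illustrative, not the project's actual implementation.

import numpy as np

edges = np.random.randint(0, 100, (1000, 3))
num_nodes, num_partitions = 100, 10
part_size = (num_nodes + num_partitions - 1) // num_partitions  # ceil div

src_bucket = edges[:, 0] // part_size
dst_bucket = edges[:, 2] // part_size

# One count per (src_bucket, dst_bucket) cell; the offsets file written
# above stores one such count per line.
counts = np.bincount(src_bucket * num_partitions + dst_bucket,
                     minlength=num_partitions ** 2)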