Exemple #1
0
    def test_splits(self):
        """Convert ``train_edges.txt`` with a 90/5/5 split and check the output.

        The converter is pointed at a fresh ``test_splits`` directory under
        ``TMP_TEST_DIR``; the resulting files are then validated against the
        statistics expected for that split.
        """
        out_dir = Path(TMP_TEST_DIR) / "test_splits"
        out_dir.mkdir()

        TorchEdgeListConverter(
            output_dir=out_dir,
            train_edges=Path(TMP_TEST_DIR) / "train_edges.txt",
            delim=" ",
            splits=[0.9, 0.05, 0.05],
        ).convert()

        # Statistics the converter is expected to record for this input.
        stats = DatasetConfig()
        stats.dataset_dir = str(out_dir)
        stats.num_nodes = 100
        stats.num_relations = 10
        stats.num_edges = 900
        stats.num_train = 900
        stats.num_valid = 50
        stats.num_test = 50

        validate_output_dir(
            output_dir=out_dir,
            expected_stats=stats,
            dtype=np.int32,
            remap_ids=True,
        )
Exemple #2
0
    def write_to_binary(self,
                        train_edges_tens,
                        valid_edges_tens,
                        test_edges_tens,
                        num_nodes,
                        num_rels,
                        num_partitions,
                        train_edges_offsets=None,
                        valid_edges_offsets=None,
                        test_edges_offsets=None):
        """Write the edge splits as binary files plus a ``dataset.yaml`` summary.

        All files are created under ``self.output_dir``.

        Args:
            train_edges_tens: Training edges; must support ``.size(0)`` and
                ``.numpy()``.
            valid_edges_tens: Validation edges, or ``None`` to skip the split.
            test_edges_tens: Test edges, or ``None`` to skip the split.
            num_nodes: Node count recorded in the statistics.
            num_rels: Relation count recorded in the statistics.
            num_partitions: When greater than 1, the per-split offset lists
                are also written, one entry per line.
            train_edges_offsets: Offsets for the training split; required
                when ``num_partitions > 1``.
            valid_edges_offsets: Offsets for the validation split (optional).
            test_edges_offsets: Offsets for the test split (optional).

        Returns:
            The populated ``DatasetConfig``.

        Raises:
            ValueError: If ``num_partitions > 1`` but ``train_edges_offsets``
                is ``None``.
        """
        # Fail before touching the filesystem so a missing offsets list
        # cannot leave a half-written dataset directory behind.
        if num_partitions > 1 and train_edges_offsets is None:
            raise ValueError(
                "train_edges_offsets is required when num_partitions > 1")

        out_dir = Path(self.output_dir)

        def _write_edges(edges_tens, rel_path):
            # Dump the tensor's NumPy data as raw bytes (C order).
            with open(out_dir / Path(rel_path), "wb") as f:
                f.write(edges_tens.numpy().tobytes())

        def _write_offsets(offsets, rel_path):
            # One offset per line, in iteration order.
            with open(out_dir / Path(rel_path), "w") as f:
                f.writelines(str(o) + "\n" for o in offsets)

        dataset_stats = DatasetConfig()
        # NOTE(review): the trailing "/" is kept for backward compatibility;
        # write_to_csv stores the same path without it -- confirm which form
        # downstream readers expect.
        dataset_stats.dataset_dir = str(out_dir.absolute()) + "/"

        # Only the training edges are counted in num_edges.
        dataset_stats.num_edges = train_edges_tens.size(0)
        dataset_stats.num_train = train_edges_tens.size(0)

        if valid_edges_tens is not None:
            dataset_stats.num_valid = valid_edges_tens.size(0)
        if test_edges_tens is not None:
            dataset_stats.num_test = test_edges_tens.size(0)

        dataset_stats.num_nodes = num_nodes
        dataset_stats.num_relations = num_rels

        stats_path = out_dir / "dataset.yaml"
        with open(stats_path, "w") as f:
            f.write(OmegaConf.to_yaml(dataset_stats))
        print("Dataset statistics written to: {}".format(str(stats_path)))

        _write_edges(train_edges_tens, PathConstants.train_edges_path)
        if valid_edges_tens is not None:
            _write_edges(valid_edges_tens, PathConstants.valid_edges_path)
        if test_edges_tens is not None:
            _write_edges(test_edges_tens, PathConstants.test_edges_path)

        if num_partitions > 1:
            _write_offsets(train_edges_offsets,
                           PathConstants.train_edge_buckets_path)
            if valid_edges_offsets is not None:
                _write_offsets(valid_edges_offsets,
                               PathConstants.valid_edge_buckets_path)
            if test_edges_offsets is not None:
                _write_offsets(test_edges_offsets,
                               PathConstants.test_edge_buckets_path)

        return dataset_stats
Exemple #3
0
    def write_to_csv(self, train_edges_df, valid_edges_df, test_edges_df,
                     nodes_df, rels_df, num_partitions):
        """Write the edge splits and ID mappings as CSV plus ``dataset.yaml``.

        All files are created under ``self.output_dir``.

        Args:
            train_edges_df: Training edges; must support ``.drop(column)``.
            valid_edges_df: Validation edges, or ``None`` to skip the split.
            test_edges_df: Test edges, or ``None`` to skip the split.
            nodes_df: Node mapping, always written.
            rels_df: Relation mapping, or ``None``; when ``None`` the
                relation count is recorded as 1 and no mapping is written.
            num_partitions: When greater than 1 the training edges are
                written partitioned together with their bucket offsets.
                Validation/test edges are partitioned too only if
                ``self.partitioned_evaluation`` is truthy.

        Returns:
            The populated ``DatasetConfig``.
        """
        out_dir = Path(self.output_dir)

        def _write_split(edges_df, edges_path, buckets_path, partitioned):
            # Write one split, either as a plain CSV or partitioned with its
            # bucket offsets (one per line). A missing split is skipped.
            if edges_df is None:
                return
            target = out_dir / Path(edges_path)
            if not partitioned:
                write_df_to_csv(edges_df, target)
                return
            offsets = write_partitioned_df_to_csv(edges_df, num_partitions,
                                                  target)
            with open(out_dir / Path(buckets_path), "w") as f:
                f.writelines(str(o) + "\n" for o in offsets)

        dataset_stats = DatasetConfig()
        dataset_stats.dataset_dir = str(out_dir.absolute())

        # Each split is counted via its index column, which is then dropped
        # so it does not end up in the written file. Only the training edges
        # contribute to num_edges.
        dataset_stats.num_edges = get_df_count(train_edges_df, EDGES_INDEX_COL)
        dataset_stats.num_train = dataset_stats.num_edges
        train_edges_df = train_edges_df.drop(EDGES_INDEX_COL)

        if valid_edges_df is not None:
            dataset_stats.num_valid = get_df_count(valid_edges_df,
                                                   EDGES_INDEX_COL)
            valid_edges_df = valid_edges_df.drop(EDGES_INDEX_COL)
        if test_edges_df is not None:
            dataset_stats.num_test = get_df_count(test_edges_df,
                                                  EDGES_INDEX_COL)
            test_edges_df = test_edges_df.drop(EDGES_INDEX_COL)

        dataset_stats.num_nodes = get_df_count(nodes_df, INDEX_COL)
        if rels_df is None:
            dataset_stats.num_relations = 1
        else:
            dataset_stats.num_relations = get_df_count(rels_df, REL_INDEX_COL)

        stats_path = out_dir / "dataset.yaml"
        with open(stats_path, "w") as f:
            f.write(OmegaConf.to_yaml(dataset_stats))
        print("Dataset statistics written to: {}".format(str(stats_path)))

        write_df_to_csv(nodes_df,
                        out_dir / Path(PathConstants.node_mapping_path))
        if rels_df is not None:
            write_df_to_csv(
                rels_df, out_dir / Path(PathConstants.relation_mapping_path))

        # The evaluation flag is only consulted when partitioning is active
        # (short-circuit), matching where the attribute is actually needed.
        partition_train = num_partitions > 1
        partition_eval = partition_train and self.partitioned_evaluation

        _write_split(train_edges_df, PathConstants.train_edges_path,
                     PathConstants.train_edge_buckets_path, partition_train)
        _write_split(valid_edges_df, PathConstants.valid_edges_path,
                     PathConstants.valid_edge_buckets_path, partition_eval)
        _write_split(test_edges_df, PathConstants.test_edges_path,
                     PathConstants.test_edge_buckets_path, partition_eval)

        return dataset_stats