Beispiel #1
0
 def setUp(self):
     """Create the scratch directory and the fixture state used by a test.

     Makes a ``test_data`` directory relative to the current working
     directory (``os.mkdir`` raises if it already exists), stores the paths
     of the domain, range and edges files inside it on ``self``, and
     instantiates a fresh ``TypedRelationInstances``.
     """
     scratch = "test_data"
     self.dir = scratch
     os.mkdir(scratch)
     # Attribute name -> file name inside the scratch directory.
     for attr, basename in (
             ("domain_file", "domain.txt"),
             ("range_file", "range.txt"),
             ("edges_file", "edges.txt"),
     ):
         setattr(self, attr, os.path.join(scratch, basename))
     self.typed_relation_instances = TypedRelationInstances()
Beispiel #2
0
class TypedRelationInstancesTest(unittest.TestCase):
    """Tests for loading domains/ranges and edges into TypedRelationInstances.

    Every test works inside a scratch ``test_data`` directory that is created
    in ``setUp`` and removed again in ``tearDown``.
    """

    def setUp(self):
        """Create the scratch directory, fixture paths and a fresh instance."""
        self.dir = "test_data"
        os.mkdir(self.dir)
        self.domain_file = os.path.join(self.dir, "domain.txt")
        self.range_file = os.path.join(self.dir, "range.txt")
        self.edges_file = os.path.join(self.dir, "edges.txt")
        self.typed_relation_instances = TypedRelationInstances()

    def tearDown(self):
        """Remove the scratch directory together with all fixture files."""
        shutil.rmtree(self.dir)

    def _write_domains_and_ranges(self):
        """Write the shared domain/range fixture files and load them.

        The fixture describes two relations:

        * object ``in`` location
        * object ``made_of`` material
        """
        with open(self.domain_file, "w") as fh:
            fh.writelines(["in\tobject\n", "made_of\tobject\n"])
        with open(self.range_file, "w") as fh:
            fh.writelines(["in\tlocation\n", "made_of\tmaterial\n"])
        self.typed_relation_instances.read_domains_and_ranges(
            self.domain_file, self.range_file)

    def test_domain_and_range(self):
        """Domains and ranges read from file are stored per relation."""
        self._write_domains_and_ranges()
        instances = self.typed_relation_instances

        # unittest assertions are used instead of bare ``assert`` so the
        # checks survive ``python -O`` and report both values on failure.
        self.assertEqual("object", instances.relation_domain["in"])
        self.assertEqual("object", instances.relation_domain["made_of"])
        self.assertEqual("location", instances.relation_range["in"])
        self.assertEqual("material", instances.relation_range["made_of"])
        self.assertEqual(len(instances.relation_range), 2)
        self.assertEqual(len(instances.relation_domain), 2)

    def test_simple_edges(self):
        """Untyped edges are typed via each relation's domain and range."""
        self._write_domains_and_ranges()

        # apple in basket
        # watermelon in refrigerator
        # water made_of water
        with open(self.edges_file, "w") as fh:
            fh.write(
                "apple\tin\tbasket\nwatermelon\tin\trefrigerator\nwater\tmade_of\twater\n"
            )
        instances = self.typed_relation_instances
        instances.construct_from_labeled_edges(
            self.edges_file, entity_name_is_typed=False, is_labeled=False)

        self.assertEqual(len(instances.type_to_entities), 3)
        self.assertEqual(
            instances.type_to_entities["object"],
            {"object:apple", "object:watermelon", "object:water"})
        self.assertEqual(
            instances.type_to_entities["location"],
            {"location:basket", "location:refrigerator"})
        # "water" appears as both an object and a material, so it is expected
        # under each type with the matching prefix.
        self.assertEqual(
            instances.type_to_entities["material"], {"material:water"})
                           word2vec_filename=WORD2VEC_FILENAME,
                           remove_repetitions=False)
        wn.read_data()
        wn.get_entity_types()
        wn.write_relation_domain_and_ranges()
        wn.write_edges()

    # 2. use PRA scala code and code here to create train/test/dev split and negative examples
    if run_step == 2:
        pra_driver = PRADriver(DATASET_FOLDER, PRA_TEMPLATE_DIR, PRA_RUN_DIR,
                               DATASET_NAME)
        pra_driver.prepare_split()

    # 3. Extract paths with entities
    if run_step == 3:
        typed_relation_instances = TypedRelationInstances()
        typed_relation_instances.read_domains_and_ranges(
            DOMAIN_FILENAME, RANGE_FILENAME)
        typed_relation_instances.construct_from_labeled_edges(
            EDGES_FILENAME, entity_name_is_typed=False, is_labeled=False)
        vocabs = Vocabs()
        vocabs.build_vocabs(typed_relation_instances)
        split = Split()
        split.read_splits(SPLIT_DIR, vocabs, entity_name_is_typed=True)
        graph = AdjacencyGraph()
        graph.build_graph(typed_relation_instances, vocabs)

        path_extractor = PathExtractor(max_length=6,
                                       include_entity=True,
                                       save_dir=PATH_DIR,
                                       include_path_len1=True,
Beispiel #4
0
    def prepare_split(self):
        """
        Create the train/dev/test split and negative examples for the dataset.

        This function uses PRA code to generate the initial train/test split and negative examples, and then uses
        code in this repo to create the train/dev/test split. The steps are:

        1. Write the relation instances read from ``domains.tsv``, ``ranges.tsv`` and ``edges.txt`` (all inside
           ``self.data_dir``) to ``<data_dir>/pra`` in PRA format.
        2. Copy the PRA template ``examples`` folder and the PRA data into ``self.pra_run_dir``, run
           ``create_graph_and_split`` through ``sbt``, and copy the generated ``splits/split`` to
           ``<data_dir>/split``.
        3. Re-read the split with ``create_development_set_if_not_exist=True``.
        4. Copy ``<data_dir>/split`` back to the PRA run directory as ``splits/dev_split``.

        .. note::

            The pra data will be left in /examples folder in the PRA scala repo after running this function.

        .. note::

            PRA splits all relation instances arbitrarily based on the train/test ratio. This is different from
            knowledge embedding approaches, where the train set needs to contain the entities in the test set.

        :raises AssertionError: if the split folder already exists, the PRA template folder for the dataset is
            missing, or an ``examples`` folder already exists in the PRA run directory
        :raises KeyError: if ``self.dataset`` is neither ``"wn18rr"`` nor ``"fb15k237"``
        :return: None
        """
        domain_filename = os.path.join(self.data_dir, "domains.tsv")
        range_filename = os.path.join(self.data_dir, "ranges.tsv")
        edges_filename = os.path.join(self.data_dir, "edges.txt")
        pra_dir = os.path.join(self.data_dir, "pra")
        split_dir = os.path.join(self.data_dir, "split")

        assert not os.path.exists(
            split_dir), "split folder already exists in {}".format(split_dir)
        pra_template_here = os.path.join(self.pra_template_dir, self.dataset,
                                         "examples")
        assert os.path.exists(pra_template_here)
        pra_template_there = os.path.join(self.pra_run_dir, "examples")
        assert not os.path.exists(
            pra_template_there), "examples folder already exists in {}".format(
                pra_template_there)

        # Name the PRA scala code uses for this dataset. Resolved once, before
        # anything is written or copied, so an unsupported dataset fails
        # without leaving partial output behind.
        pra_dataset_name = {
            "wn18rr": "wordnet",
            "fb15k237": "freebase"
        }[self.dataset]

        # 1. Create PRA input files to generate split and negative examples.
        typed_relation_instances = TypedRelationInstances()
        typed_relation_instances.read_domains_and_ranges(
            domain_filename, range_filename)
        typed_relation_instances.construct_from_labeled_edges(
            edges_filename, entity_name_is_typed=False, is_labeled=False)
        typed_relation_instances.write_to_pra_format(
            pra_dir, only_positive_instance=True)

        # 2. Run PRA create_graph_and_split, copy generated split to $SPLIT_DIR.
        # copy the template folder to PRA scala repo
        shutil.copytree(pra_template_here, pra_template_there)

        # copy the data to the PRA scala repo, replacing whatever relation
        # metadata the template shipped with
        relation_data_here = pra_dir
        relation_data_there = os.path.join(pra_template_there,
                                           "relation_metadata",
                                           pra_dataset_name)
        if os.path.exists(relation_data_there):
            shutil.rmtree(relation_data_there)
        shutil.copytree(relation_data_here, relation_data_there)

        # run create_graph_and_split
        command = "sbt \"run ./examples/ {}_create_graph_and_split.json\"".format(
            pra_dataset_name)
        run_interactive_command(self.pra_run_dir, command, input=1)

        # remove the generated edges.dat
        os.remove(
            os.path.join(pra_template_there, "graphs", pra_dataset_name,
                         "edges.dat"))

        # copy generated split to this repo
        split_dir_here = split_dir
        split_dir_there = os.path.join(pra_template_there, "splits/split")
        shutil.copytree(split_dir_there, split_dir_here)

        # 3. Create development set
        # The instances are rebuilt from the files rather than reused from
        # step 1, as in the original flow.
        typed_relation_instances = TypedRelationInstances()
        typed_relation_instances.read_domains_and_ranges(
            domain_filename, range_filename)
        typed_relation_instances.construct_from_labeled_edges(
            edges_filename, entity_name_is_typed=False, is_labeled=False)
        vocabs = Vocabs()
        vocabs.build_vocabs(typed_relation_instances)
        split = Split()
        # NOTE(review): presumably this writes the development set into
        # split_dir, since step 4 copies that folder -- confirm in Split.
        split.read_splits(split_dir,
                          vocabs,
                          entity_name_is_typed=True,
                          create_development_set_if_not_exist=True)

        # 4. Copy new split to PRA scala repo
        split_dir_here = split_dir
        split_dir_there = os.path.join(pra_template_there, "splits/dev_split")
        shutil.copytree(split_dir_here, split_dir_there)
Beispiel #5
0
    def setup_cvsm_dir(self):
        """
        Set up the cvsm directory inside the experiment folder.

        Chooses the cvsm folder (``cvsm`` when neither ``include_entity`` nor ``has_entity`` is set, otherwise
        ``cvsm_entity``), creates it or asks interactively whether to recreate it, and records
        ``self.cvsm_data_dir`` and ``self.cvsm_result_dir``. When the folder is created or recreated, the relation
        instances, vocabs and split are loaded and a path reader writes the paths and vocabs in cvsm format.

        :raises Exception: if the answer to the recreate prompt is neither ``Y`` nor ``N``
        :return:
        """
        #################################################
        # -1. Set up files and directories
        exp_dir = self.experiment_dir
        use_augmented = self.augment_data

        domain_filename = os.path.join(exp_dir, "domains.tsv")
        range_filename = os.path.join(exp_dir, "ranges.tsv")
        edges_filename = os.path.join(exp_dir, "edges.txt")
        split_dir = os.path.join(exp_dir, "split")
        pra_path_dir = os.path.join(exp_dir, "pra_paths")

        if use_augmented:
            input("Warning: using augmented path. Press Enter to continue.")
            path_dir = os.path.join(exp_dir, "paths_augment")
        else:
            path_dir = os.path.join(exp_dir, "paths")

        # Only the configuration without any entity information gets its own
        # "cvsm" folder; has_entity without include_entity shares
        # "cvsm_entity" so both use the same data input (previously cvsm_bfs).
        entity_free = not self.include_entity and not self.has_entity
        cvsm_dir = os.path.join(exp_dir,
                                "cvsm" if entity_free else "cvsm_entity")

        if not os.path.exists(cvsm_dir):
            os.mkdir(cvsm_dir)
            create_cvsm_folder = True
        else:
            answer = input("CVSM folder already exists. Recreate it? Y/N ")
            if answer not in ("Y", "N"):
                raise Exception("Please input Y or N")
            create_cvsm_folder = answer == "Y"
            if create_cvsm_folder:
                # Start again from an empty folder.
                shutil.rmtree(cvsm_dir)
                os.mkdir(cvsm_dir)

        self.cvsm_data_dir = os.path.join(
            cvsm_dir, "augment_data" if use_augmented else "data")
        self.cvsm_result_dir = os.path.join(cvsm_dir, "results")

        ##################################################
        # 0. Process data
        # NOTE: the vocab sizes and has_development_set are only assigned
        # below, i.e. when the folder is (re)created; an earlier version that
        # computed them unconditionally is no longer active.
        if create_cvsm_folder:
            typed_relation_instances = TypedRelationInstances()
            typed_relation_instances.read_domains_and_ranges(
                domain_filename, range_filename)
            typed_relation_instances.construct_from_labeled_edges(
                edges_filename, entity_name_is_typed=False, is_labeled=False)

            vocabs = Vocabs()
            vocabs.build_vocabs(typed_relation_instances)

            split = Split()
            split.read_splits(split_dir, vocabs, entity_name_is_typed=True)

            self.relation_vocab_size = len(vocabs.relation_to_idx)
            self.entity_vocab_size = len(vocabs.node_to_idx)

            # check if has development set
            splits_by_relation = split.relation_to_splits_to_instances
            if any("development" in splits_by_relation[rel]
                   for rel in splits_by_relation):
                self.has_development_set = True

            if entity_free:
                print("Read PRA's pra features from pra_paths directory")
                pra_path_reader = PRAPathReader(save_dir=pra_path_dir,
                                                include_entity=False)
                pra_path_reader.read_paths(split)
                pra_path_reader.write_cvsm_files(cvsm_dir=self.cvsm_data_dir,
                                                 split=split,
                                                 vocabs=vocabs)
            else:
                print("Read paths with entities from paths directory")
                path_reader = PathReader(save_dir=path_dir)
                path_reader.read_paths(split)
                entity2types_filename = os.path.join(self.experiment_dir,
                                                     "entity2types.json")
                path_reader.write_cvsm_files(self.cvsm_data_dir, split, vocabs,
                                             entity2types_filename)