Example 1
def write_results(pl: Pipeline, output_path: str, input_data: str):
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
        },
    )
    pl.run(input_data)
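A minimal usage sketch for write_results, assuming Forte's usual import paths; the reader choice and directory names are illustrative, not part of the original snippet:

from forte.pipeline import Pipeline
from forte.data.readers import DirPackReader

# Build a pipeline with a reader; write_results attaches the writer
# and runs the whole pipeline over the input.
pl = Pipeline()
pl.set_reader(DirPackReader())
write_results(pl, output_path="output_packs", input_data="input_packs")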
Example 2
def multi_example(input_path, output_path):
    """
    This example reads data from input path, and write multi pack output
    to output path.

    Args:
        input_path:
        output_path:

    Returns:

    """
    print("Multi Pack serialization example.")

    print(
        "We first read the data, and add multi-packs to them, and then "
        "save the results."
    )
    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(PackCopier())
    coref_pl.add(ExampleCoreferencer())
    coref_pl.add(ExampleCorefCounter())

    coref_pl.add(
        MultiPackWriter(),
        config={
            "output_dir": output_path,
            "indent": 2,
            "overwrite": True,
        },
    )

    coref_pl.run(input_path)

    print(
        "We can then load the saved results, and see if everything is OK. "
        "We should see the same number of multi packs there. "
    )
    reading_pl = Pipeline()
    reading_pl.set_reader(
        MultiPackDirectoryReader(),
        config={
            "multi_pack_dir": os.path.join(output_path, "multi"),
            "data_pack_dir": os.path.join(output_path, "packs"),
        },
    )
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()
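For a quick sanity check without the custom counter processor, the re-loaded multi packs can also be counted by iterating the pipeline directly; a sketch reusing the reader config from above:

# Count the re-loaded multi packs without ExampleCorefCounter.
check_pl = Pipeline()
check_pl.set_reader(
    MultiPackDirectoryReader(),
    config={
        "multi_pack_dir": os.path.join(output_path, "multi"),
        "data_pack_dir": os.path.join(output_path, "packs"),
    },
)
check_pl.initialize()
print(sum(1 for _ in check_pl.process_dataset()), "multi packs re-loaded")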
Example 3
def multi_example(input_path, output_path):
    """
    This example reads data from input path, and write multi pack output
    to output path.

    Args:
        input_path:
        output_path:

    Returns:

    """
    print("Multi Pack serialization example.")

    print("We first read the data, and add multi-packs to them, and then "
          "save the results.")
    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(PackCopier())
    coref_pl.add(ExampleCoreferencer())
    coref_pl.add(ExampleCorefCounter())

    coref_pl.add(
        MultiPackWriter(),
        {
            'output_dir': output_path,
            'indent': 2,
            'overwrite': True,
        }
    )

    coref_pl.run(input_path)

    print("We can then load the saved results, and see if everything is OK. "
          "We should see the same number of multi packs there. ")
    reading_pl = Pipeline()
    reading_pl.set_reader(MultiPackDiskReader(), {'data_path': output_path})
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()
Example 4
class TestQuestionAnsweringMulti(unittest.TestCase):
    def setUp(self):
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.add(NLTKSentenceSegmenter())
        boxer_config = {"pack_name": "question"}
        self.nlp.add(MultiPackBoxer(), boxer_config)
        self.nlp.add(MutliDocPackAdder())
        self.nlp.add(QuestionAnsweringMulti())
        self.nlp.initialize()

    def test_huggingface_qa_multi_processor(self):
        question = "Name synonym of Acrokeratosis paraneoplastica."
        packs: MultiPack = self.nlp.process(question)
        expected_ans = {
            "doc_1": "Bazex syndrome",
            "doc_2": "Bazex syndrome",
            "doc_3": "Bazex syndrome",
        }
        for doc_id in packs.pack_names:
            if doc_id == "question":
                continue
            pack = packs.get_pack(doc_id)
            for phrase in pack.get(entry_type=Phrase):
                self.assertEqual(phrase.text, expected_ans[doc_id])

        linked_texts = []

        for link in packs.get(entry_type=MultiPackLink):
            parent_text = link.get_parent().text
            child_text = link.get_child().text
            linked_texts.append((parent_text, child_text))

        self.assertListEqual(
            sorted(linked_texts),
            sorted([
                (question, expected_ans["doc_1"]),
                (question, expected_ans["doc_2"]),
                (question, expected_ans["doc_3"]),
            ]),
        )
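The "pack_name" given to MultiPackBoxer is what makes the question pack addressable by name afterwards; a short sketch of pulling it back out of a processed MultiPack (variable names are illustrative):

# The boxed input lives in the pack that MultiPackBoxer named
# through its "pack_name" config.
packs = self.nlp.process("Name synonym of Acrokeratosis paraneoplastica.")
question_pack = packs.get_pack("question")
print(question_pack.text)  # prints the original question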
Example 5
    def testMultiPackWriting(self):
        coref_pl = Pipeline()
        coref_pl.set_reader(DirPackReader())
        coref_pl.add(MultiPackBoxer())
        coref_pl.add(CopySentence())

        coref_pl.add(PackIdMultiPackWriter(),
                     config={
                         'output_dir': os.path.join(self.main_output.name,
                                                    'multi'),
                         'indent': 2,
                         'overwrite': True,
                     })
        coref_pl.run(os.path.join(self.main_output.name, 'packs'))
        out_dir = os.path.join(self.main_output.name, 'multi')
        self.assertTrue(os.path.exists(os.path.join(out_dir, 'multi.idx')))
        self.assertTrue(os.path.exists(os.path.join(out_dir, 'pack.idx')))
        self.assertTrue(os.path.exists(os.path.join(out_dir, 'packs')))
        self.assertTrue(os.path.exists(os.path.join(out_dir, 'multi')))
Example 6
    def testMultiPackWriting(self):
        coref_pl = Pipeline()
        coref_pl.set_reader(DirPackReader())
        coref_pl.add(MultiPackBoxer())
        coref_pl.add(CopySentence())

        coref_pl.add(
            PackIdMultiPackWriter(),
            config={
                "output_dir": os.path.join(self.main_output.name, "multi"),
                "indent": 2,
                "overwrite": True,
            },
        )
        coref_pl.run(os.path.join(self.main_output.name, "packs"))
        out_dir = os.path.join(self.main_output.name, "multi")
        self.assertTrue(os.path.exists(os.path.join(out_dir, "multi.idx")))
        self.assertTrue(os.path.exists(os.path.join(out_dir, "pack.idx")))
        self.assertTrue(os.path.exists(os.path.join(out_dir, "packs")))
        self.assertTrue(os.path.exists(os.path.join(out_dir, "multi")))
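Both versions of this test rely on self.main_output.name as the working directory, which implies a tempfile.TemporaryDirectory created during setup; a minimal sketch of that scaffolding (the class name is hypothetical, the fixture is not shown in the original):

import tempfile
import unittest

class SerializationTest(unittest.TestCase):  # hypothetical host class
    def setUp(self):
        # Root working directory for the input packs and writer output.
        self.main_output = tempfile.TemporaryDirectory()

    def tearDown(self):
        self.main_output.cleanup()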
Example 7
    def prepare(self):
        prepare_pl = Pipeline()
        prepare_pl.set_reader(self.train_reader)
        for p in self.preprocessors:
            prepare_pl.add(p)
        prepare_pl.run(self.configs.config_data.train_path)
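The method assumes its host object provides a train_reader, a preprocessors list, and a configs object exposing config_data.train_path; a hypothetical sketch of that surrounding state, borrowing components that appear in the other examples on this page:

class TrainPreprocessor:  # hypothetical host class for prepare()
    def __init__(self, configs):
        self.configs = configs  # expected to expose configs.config_data.train_path
        self.train_reader = CoNLL03Reader()  # illustrative reader choice
        self.preprocessors = [NLTKSentenceSegmenter()]  # illustrative processors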
Example 8

task = sys.argv[1]
assert task in ["ner", "pos"], "Unsupported NLP task type: {}".format(task)

with open("configs/config_predict.yml", "r") as f:
    config_predict = yaml.safe_load(f)
saved_model = torch.load(config_predict["model_path"])
train_state = torch.load(config_predict["train_state_path"])

reader = CoNLL03Reader()
predictor = TaggingPredictor()
evaluator = CoNLLNEREvaluator()

pl = Pipeline()
pl.set_reader(reader)
pl.add(predictor)
pl.add(evaluator)
pl.initialize()

for pack in pl.process_dataset(config_predict["test_path"]):
    print("---- pack ----")
    for instance in pack.get(Sentence):
        sent = instance.text
        output_tags = []
        if task == "ner":
            for entry in pack.get(EntityMention, instance):
                output_tags.append((entry.text, entry.ner_type))
        else:
            for entry in pack.get(Token, instance):
                output_tags.append((entry.text, entry.pos))
        print("---- example -----")
Example 9
    def testMultiPackWriting(self, config_data):
        zip_pack, method = config_data

        # Use different sub-directory to avoid conflicting.
        subdir = f"{zip_pack}_{method}"

        with tempfile.TemporaryDirectory() as main_output:
            # Prepare input data.
            prepared_input: str = os.path.join(main_output, subdir,
                                               "input_packs")
            data_output: str = os.path.join(main_output, subdir, "output")
            suffix = ".pickle" if method == "pickle" else ".json"
            if zip_pack:
                suffix = suffix + ".gz"

            nlp = Pipeline[DataPack]()
            nlp.set_reader(OntonotesReader())
            nlp.add(
                PackIdJsonPackWriter(),
                {
                    "output_dir": prepared_input,
                    "overwrite": True,
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                },
            )
            nlp.run(self.data_path)

            # Convert to multi pack.
            coref_pl = Pipeline()

            coref_pl.set_reader(
                DirPackReader(),
                {
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                    "suffix": suffix,
                },
            )
            coref_pl.add(MultiPackBoxer())
            coref_pl.add(CopySentence())
            coref_pl.add(NaiveCoref())

            coref_pl.add(
                PackIdMultiPackWriter(),
                config={
                    "output_dir": data_output,
                    "overwrite": True,
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                },
            )
            coref_pl.run(prepared_input)

            self.assertTrue(
                os.path.exists(os.path.join(data_output, "multi.idx")))
            self.assertTrue(
                os.path.exists(os.path.join(data_output, "pack.idx")))
            self.assertTrue(os.path.exists(os.path.join(data_output, "packs")))
            self.assertTrue(os.path.exists(os.path.join(data_output, "multi")))

            # Read the multi pack again.
            mp_pipeline = Pipeline()

            mp_pipeline.set_reader(
                MultiPackDirectoryReader(),
                config={
                    "suffix": suffix,
                    "zip_pack": zip_pack,
                    "serialize_method": method,
                    "data_pack_dir": os.path.join(data_output, "packs"),
                    "multi_pack_dir": os.path.join(data_output, "multi"),
                },
            ).initialize()

            # Avoid shadowing the standard "re" module with the loop variable.
            rel: CrossDocEntityRelation
            for mp in mp_pipeline.process_dataset():
                for rel in mp.get(CrossDocEntityRelation):
                    self.assertEqual(rel.get_parent().text, rel.get_child().text)
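The single config_data tuple argument suggests the test is parameterized; a sketch of how it could be driven with the ddt package (decorator usage and class name are assumptions):

from ddt import data, ddt

@ddt
class SerializeTest(unittest.TestCase):  # hypothetical test class
    # Each tuple is (zip_pack, serialize_method).
    @data((True, "jsonpickle"), (False, "jsonpickle"), (False, "pickle"))
    def testMultiPackWriting(self, config_data):
        zip_pack, method = config_data
        ...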