Example #1
    def test_process_multi_next(self):
        from forte.data.readers import OntonotesReader

        # Define and configure the pipeline
        nlp = Pipeline[DataPack]()
        nlp.set_reader(OntonotesReader())

        pack_name = 'test_pack'
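        # Box each DataPack into a MultiPack, storing the original pack
        # under `pack_name`; the name match selector below picks that pack
        # out for the relation extractor.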
        nlp.add(MultiPackBoxer(), {'pack_name': pack_name})
        nlp.add(DummyRelationExtractor(),
                config={"batcher": {
                    "batch_size": 5
                }},
                selector=NameMatchSelector(select_name=pack_name))
        nlp.initialize()

        dataset_path = data_samples_root + "/ontonotes/00"

        # get processed pack from dataset
        m_pack: MultiPack
        for m_pack in nlp.process_dataset(dataset_path):
            pack = m_pack.get_pack(pack_name)
            # get sentence from pack
            for sentence in pack.get(Sentence):
                sent_text = sentence.text

                # second method to get entry in a sentence
                tokens = [token.text for token in pack.get(Token, sentence)]
                self.assertEqual(sent_text, " ".join(tokens))
Example #2
def build_pipeline(result_dir: str, word_counter: Counter,
                   tag_counter: Counter):
    r"""Build the pipeline to parse IU Xray report with tokenizer, lowercase and
    non-alpha removal to generate forte json file with the same name with
    preprocessed content and information of impression, findings and path to the
    parent image.
    Args:
        result_dir: the directory to save the forte json files.
    Return:
        pipeline: built pipeline to process the xml files
    """

    pipeline = Pipeline[MultiPack]()
    pipeline.resource.update(word_counter=word_counter)
    pipeline.resource.update(tag_counter=tag_counter)
    pipeline.set_reader(IUXrayReportReader())
    pipeline.add(MultiPackBoxer())
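    # Select the pack named 'default' from each MultiPack and write it as
    # an indented JSON file under `result_dir`.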
    pipeline.add(PackNameJsonPackWriter(), {
        'indent': 2,
        'output_dir': result_dir,
        'overwrite': True
    }, NameMatchSelector(select_name='default'))
    pipeline.initialize()

    return pipeline
Example #3
    def test_pipeline(self, texts, expected_outputs, expected_tokens):
        nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input"}

        replacer_op = TmpReplacer.__module__ + "." + TmpReplacer.__qualname__
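        # TmpReplacer is referenced by its full module path as the data
        # augmentation op; the config below augments Token entries and
        # auto-aligns Document and Sentence annotations.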

        processor_config = {
            "augment_entry": "ft.onto.base_ontology.Token",
            "other_entry_policy": {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align",
            },
            "type": "data_augmentation_op",
            "data_aug_op": replacer_op,
            "data_aug_op_config": {},
            "augment_pack_names": {},
        }

        nlp.set_reader(reader=StringReader())
        nlp.add(component=MultiPackBoxer(), config=boxer_config)
        nlp.add(component=WhiteSpaceTokenizer(), selector=AllPackSelector())
        nlp.add(
            component=ReplacementDataAugmentProcessor(), config=processor_config
        )
        nlp.initialize()

        for idx, m_pack in enumerate(nlp.process_dataset(texts)):
            aug_pack = m_pack.get_pack("augmented_input")

            self.assertEqual(aug_pack.text, expected_outputs[idx])

            for j, token in enumerate(aug_pack.get(Token)):
                self.assertEqual(token.text, expected_tokens[idx][j])
Example #4
 def test_caster_all_selector(self):
     """
     Test if the caster and all pack selector works well.
     The caster is used to convert a single pack to multi pack, and then
     pack copier is used to create a new pack. The all pack selector selects
     all the pack from the multi pack. This test make sure this pipeline
     works OK.
     """
     mp: MultiPack
     for mp in (
         Pipeline()
         .set_reader(SentenceReader())
         .add(MultiPackBoxer())
         .add(MultiPackCopier())
         .add(DummyPackProcessor(), selector=AllPackSelector())
         .initialize()
         .process_dataset(
             os.path.join(data_samples_root, "random_texts", "0.txt")
         )
     ):
         num_pack = 0
         for pack in mp.packs:
             num_pack += 1
             entries = list(pack.get(NewType))
             self.assertEqual(len(entries), 1)
             self.assertEqual(entries[0].value, "[PACK]")
         self.assertEqual(num_pack, 2)
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file",
                        default="./config.yml",
                        help="Config YAML filepath")
    args = parser.parse_args()

    # loading config
    config = yaml.safe_load(open(args.config_file, "r"))

    nlp: Pipeline[MultiPack] = Pipeline()
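    # Box each selected pack into a MultiPack, tokenize and POS-tag all
    # packs, then apply the replacement data augmentation processor.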
    nlp.set_reader(RandomDataSelector(), config=config["data_selector_config"])
    nlp.add(component=MultiPackBoxer(), config=config["boxer_config"])
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=config["da_processor_config"],
    )

    nlp.initialize()

    for _, m_pack in enumerate(nlp.process_dataset()):
        aug_pack = m_pack.get_pack("augmented_input")
        logging.info(aug_pack.text)
Example #6
 def setUp(self):
     self.nlp = Pipeline()
     self.nlp.set_reader(StringReader())
     self.nlp.add(NLTKSentenceSegmenter())
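     # Box the pack into a MultiPack named 'question' before running the
     # multi-document question answering components.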
     boxer_config = {"pack_name": "question"}
     self.nlp.add(MultiPackBoxer(), boxer_config)
     self.nlp.add(MutliDocPackAdder())
     self.nlp.add(QuestionAnsweringMulti())
     self.nlp.initialize()
Example #7
    def setUp(self):
        random.seed(0)
        self.nlp = Pipeline[MultiPack]()

        boxer_config = {'pack_name': 'input_src'}

        self.nlp.set_reader(reader=StringReader())
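        # Box each pack into a MultiPack (source pack named 'input_src'),
        # then tokenize and POS-tag every pack in it.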
        self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
        self.nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
        self.nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
Example #8
    def setUp(self):
        random.seed(0)
        self.nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input_src"}

        self.nlp.set_reader(reader=StringReader())
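        # Box each pack into a MultiPack under 'input_src' and run the
        # whitespace tokenizer on every pack.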
        self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
        self.nlp.add(
            component=WhiteSpaceTokenizer(), selector=AllPackSelector()
        )
Example #9
    def setUp(self):
        random.seed(8)
        self.nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input_src"}
        entity_config = {"entities_to_insert": ["Mary", "station"]}
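        # Insert the configured entity mentions, split sentences on
        # periods, then box the pack and tokenize every pack in the
        # MultiPack.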
        self.nlp.set_reader(reader=StringReader())
        self.nlp.add(component=EntityMentionInserter(), config=entity_config)
        self.nlp.add(PeriodSentenceSplitter())
        self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
        self.nlp.add(component=WhiteSpaceTokenizer(),
                     selector=AllPackSelector())
Example #10
def multi_example(input_path, output_path):
    """
    This example reads data from input path, and write multi pack output
    to output path.

    Args:
        input_path:
        output_path:

    Returns:

    """
    print("Multi Pack serialization example.")

    print(
        "We first read the data, and add multi-packs to them, and then "
        "save the results."
    )
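    # Writing pipeline: read packs from a directory, box them into multi
    # packs, copy the pack, add example coreference links, count them, and
    # serialize the multi packs to the output directory.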
    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(PackCopier())
    coref_pl.add(ExampleCoreferencer())
    coref_pl.add(ExampleCorefCounter())

    coref_pl.add(
        MultiPackWriter(),
        config={
            "output_dir": output_path,
            "indent": 2,
            "overwrite": True,
        },
    )

    coref_pl.run(input_path)

    print(
        "We can then load the saved results, and see if everything is OK. "
        "We should see the same number of multi packs there. "
    )
    reading_pl = Pipeline()
    reading_pl.set_reader(
        MultiPackDirectoryReader(),
        config={
            "multi_pack_dir": os.path.join(output_path, "multi"),
            "data_pack_dir": os.path.join(output_path, "packs"),
        },
    )
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()
Example #11
    def testMultiPackWriting(self):
        coref_pl = Pipeline()
        coref_pl.set_reader(DirPackReader())
        coref_pl.add(MultiPackBoxer())
        coref_pl.add(CopySentence())

        coref_pl.add(PackIdMultiPackWriter(),
                     config={
                         'output_dir': os.path.join(self.main_output.name,
                                                    'multi'),
                         'indent': 2,
                         'overwrite': True,
                     })
        coref_pl.run(os.path.join(self.main_output.name, 'packs'))
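        # The writer produces the index files (multi.idx, pack.idx) plus
        # the pack and multi pack directories checked below.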
        self.assertTrue(os.path.exists(os.path.join('multi_out', 'multi.idx')))
        self.assertTrue(os.path.exists(os.path.join('multi_out', 'pack.idx')))
        self.assertTrue(os.path.exists(os.path.join('multi_out', 'packs')))
        self.assertTrue(os.path.exists(os.path.join('multi_out', 'multi')))
Example #12
    def testMultiPackWriting(self):
        coref_pl = Pipeline()
        coref_pl.set_reader(DirPackReader())
        coref_pl.add(MultiPackBoxer())
        coref_pl.add(CopySentence())

        coref_pl.add(
            PackIdMultiPackWriter(),
            config={
                "output_dir": os.path.join(self.main_output.name, "multi"),
                "indent": 2,
                "overwrite": True,
            },
        )
        coref_pl.run(os.path.join(self.main_output.name, "packs"))
        self.assertTrue(os.path.exists(os.path.join("multi_out", "multi.idx")))
        self.assertTrue(os.path.exists(os.path.join("multi_out", "pack.idx")))
        self.assertTrue(os.path.exists(os.path.join("multi_out", "packs")))
        self.assertTrue(os.path.exists(os.path.join("multi_out", "multi")))
Example #13
    def test_pipeline(self, texts, expected_outputs, expected_tokens):
        nlp = Pipeline[MultiPack]()

        boxer_config = {
            'pack_name': 'input'
        }

        processor_config = {
            'augment_entry': "ft.onto.base_ontology.Token",
            'other_entry_policy': {
                'type': '',
                'kwargs': {
                    "ft.onto.base_ontology.Document": "auto_align",
                    "ft.onto.base_ontology.Sentence": "auto_align"
                }
            },
            'type': 'data_augmentation_op',
            'data_aug_op': 'tests.forte.processors.base.data_augment_replacement_processor_test.TmpReplacer',
            'data_aug_op_config': {
                'type': '',
                'kwargs': {}
            },
            'augment_pack_names': {
                'kwargs': {
                    'input': 'augmented_input'
                }
            }
        }

        nlp.set_reader(reader=StringReader())
        nlp.add(component=MultiPackBoxer(), config=boxer_config)
        nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
        nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
        nlp.add(component=ReplacementDataAugmentProcessor(), config=processor_config)
        nlp.initialize()

        for idx, m_pack in enumerate(nlp.process_dataset(texts)):
            aug_pack = m_pack.get_pack('augmented_input')

            self.assertEqual(aug_pack.text, expected_outputs[idx])

            for j, token in enumerate(aug_pack.get(Token)):
                self.assertEqual(token.text, expected_tokens[idx][j])
Example #14
def multi_example(input_path, output_path):
    """
    This example reads data from input path, and write multi pack output
    to output path.

    Args:
        input_path:
        output_path:

    Returns:

    """
    print("Multi Pack serialization example.")

    print("We first read the data, and add multi-packs to them, and then "
          "save the results.")
    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(PackCopier())
    coref_pl.add(ExampleCoreferencer())
    coref_pl.add(ExampleCorefCounter())

    coref_pl.add(
        MultiPackWriter(),
        {
            'output_dir': output_path,
            'indent': 2,
            'overwrite': True,
        }
    )

    coref_pl.run(input_path)

    print("We can then load the saved results, and see if everything is OK. "
          "We should see the same number of multi packs there. ")
    reading_pl = Pipeline()
    reading_pl.set_reader(MultiPackDiskReader(), {'data_path': output_path})
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()
Example #15
    def test_pipeline(self, texts, expected_outputs):
        nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input"}

        nlp.set_reader(reader=StringReader())
        nlp.add(component=MultiPackBoxer(), config=boxer_config)
        nlp.add(component=WhiteSpaceTokenizer(), selector=AllPackSelector())
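        # The replacement op below swaps tokens for their top-1 nearest
        # neighbours from the configured vocabulary and embeddings.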

        processor_config = {
            "augment_entry": "ft.onto.base_ontology.Token",
            "other_entry_policy": {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align",
            },
            "type": "data_augmentation_op",
            "data_aug_op": (
                "forte.processors.data_augment.algorithms"
                ".embedding_similarity_replacement_op"
                ".EmbeddingSimilarityReplacementOp"
            ),
            "data_aug_op_config": {
                "vocab_path": self.abs_vocab_path,
                "embed_hparams": self.embed_hparams,
                "top_k": 1,
            },
            "augment_pack_names": {"input": "augmented_input"},
        }
        nlp.add(component=ReplacementDataAugmentProcessor(),
                config=processor_config)
        nlp.initialize()

        for idx, m_pack in enumerate(nlp.process_dataset(texts)):
            aug_pack = m_pack.get_pack("augmented_input")
            self.assertEqual(aug_pack.text, expected_outputs[idx])
Example #16
Pipeline().set_reader(
    MultiNLIReader()
).add(
    # Call spacy on remote.
    RemoteProcessor(),
    config={
        "url": "http://localhost:8008"
    },
).add(
    # Call allennlp on remote.
    RemoteProcessor(),
    config={
        "url": "http://localhost:8009"
    },
).add(
    MultiPackBoxer()
).add(
    TweakData()
).add(
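    # Run NLI on every pack except the one named "default":
    # reverse_selection inverts the name match.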
    NLIProcessor(),
    selector=NameMatchSelector(),
    selector_config={
        "select_name": "default",
        "reverse_selection": True,
    }
).add(
    PackNameMultiPackWriter(),
    config={
        "output_dir": output_dir
    }
).add(
Example #17
from forte.data.selector import RegexNameMatchSelector


if __name__ == "__main__":
    # Load config file
    config_file = os.path.join(os.path.dirname(__file__), "config.yml")
    config = yaml.safe_load(open(config_file, "r"))
    config = Config(config, default_hparams=None)

    # Build the pipeline and add the reader, which reads queries from the terminal.
    nlp: Pipeline = Pipeline()
    nlp.set_reader(reader=TerminalReader())

    # The rest of the pipeline works on multi packs, so we use a boxer to
    # convert the single pack into a multi pack.
    nlp.add(MultiPackBoxer(), config=config.boxer)

    # Search tweets.
    nlp.add(TweetSearchProcessor(), config=config.twitter_search)
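    # The regex selector below restricts sentiment analysis to packs whose
    # names match "<response_pack_name_prefix>_<digit>".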

    # Conduct sentiment analysis.
    pattern = rf"{config.twitter_search.response_pack_name_prefix}_\d"
    selector_hit = RegexNameMatchSelector(select_name=pattern)
    nlp.add(
        component=VaderSentimentProcessor(),
        selector=selector_hit,
        config=config.vader_sentiment,
    )

    nlp.initialize()
Example #18
    def testMultiPackWriting(self, config_data):
        zip_pack, method = config_data

        # Use a different sub-directory to avoid conflicts.
        subdir = f"{zip_pack}_{method}"

        with tempfile.TemporaryDirectory() as main_output:
            # Prepare input data.
            prepared_input: str = os.path.join(main_output, subdir,
                                               "input_packs")
            data_output: str = os.path.join(main_output, subdir, "output")
            suffix = ".pickle" if method == "pickle" else ".json"
            if zip_pack:
                suffix = suffix + ".gz"

            nlp = Pipeline[DataPack]()
            nlp.set_reader(OntonotesReader())
            nlp.add(
                PackIdJsonPackWriter(),
                {
                    "output_dir": prepared_input,
                    "overwrite": True,
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                },
            )
            nlp.run(self.data_path)

            # Convert to multi pack.
            coref_pl = Pipeline()

            coref_pl.set_reader(
                DirPackReader(),
                {
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                    "suffix": suffix,
                },
            )
            coref_pl.add(MultiPackBoxer())
            coref_pl.add(CopySentence())
            coref_pl.add(NaiveCoref())

            coref_pl.add(
                PackIdMultiPackWriter(),
                config={
                    "output_dir": data_output,
                    "overwrite": True,
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                },
            )
            coref_pl.run(prepared_input)

            self.assertTrue(
                os.path.exists(os.path.join(data_output, "multi.idx")))
            self.assertTrue(
                os.path.exists(os.path.join(data_output, "pack.idx")))
            self.assertTrue(os.path.exists(os.path.join(data_output, "packs")))
            self.assertTrue(os.path.exists(os.path.join(data_output, "multi")))

            # Read the multi pack again.
            mp_pipeline = Pipeline()

            mp_pipeline.set_reader(
                MultiPackDirectoryReader(),
                config={
                    "suffix": suffix,
                    "zip_pack": zip_pack,
                    "serialize_method": method,
                    "data_pack_dir": os.path.join(data_output, "packs"),
                    "multi_pack_dir": os.path.join(data_output, "multi"),
                },
            ).initialize()

            re: CrossDocEntityRelation
            for mp in mp_pipeline.process_dataset():
                for re in mp.get(CrossDocEntityRelation):
                    self.assertEqual(re.get_parent().text, re.get_child().text)
Example #19
    def test_reuse_processor(self):
        # Create a basic pipeline of multi packs that each contain two packs (by copying)
        nlp = (
            Pipeline()
            .set_reader(SentenceReader())
            .add(MultiPackBoxer())
            .add(MultiPackCopier())
        )

        # Create one shared instance of this processor
        dummy = DummyPackProcessor()
        nlp.add(
            dummy,
            config={"test": "dummy1"},
            selector=NameMatchSelector(),
            selector_config={"select_name": "default"},
        )

        # This will not add the component successfully because the processor is
        # initialized.
        with self.assertRaises(ProcessorConfigError):
            nlp.add(dummy, config={"test": "dummy2"})

        # This will add the component, with a different selector
        nlp.add(
            dummy,
            selector=NameMatchSelector(),
            selector_config={"select_name": "copy"},
        )
        nlp.initialize()

        # Check that the two processors have the same name.
        self.assertEqual(
            nlp.components[2].name, get_full_module_name(DummyPackProcessor)
        )
        self.assertEqual(
            nlp.components[3].name, get_full_module_name(DummyPackProcessor)
        )

        # Check that the two processors are also the same instance.
        self.assertEqual(nlp.components[2], nlp.components[3])

        # Check that the initialization is only done once, here the count
        #  will only be 1.
        self.assertEqual(nlp.components[2].initialize_count, 1)
        self.assertEqual(nlp.components[3].initialize_count, 1)

        # Check that the configuration is not changed by the second insertion.
        self.assertEqual(nlp.components[3].configs.test, "dummy1")

        # Run it once to make sure it can run.
        dataset_path = os.path.join(data_samples_root, "random_texts", "0.txt")
        nlp.run(dataset_path)

        # Check that initialization will be false after `run`, because it
        #  calls the `finish` function of all components.
        self.assertFalse(nlp.components[2].is_initialized)
        self.assertFalse(nlp.components[3].is_initialized)

        # Check that we are able to re-initialize the pipeline.
        nlp.initialize()  # initialize the first time.
        nlp.initialize()  # re-initialize.

        # Check the name again after re-initialize.
        self.assertEqual(
            nlp.components[2].name, get_full_module_name(DummyPackProcessor)
        )
        self.assertEqual(
            nlp.components[3].name, get_full_module_name(DummyPackProcessor)
        )

        # Obtain the results from the multipack.
        mp: MultiPack = nlp.process(dataset_path)
        pack: DataPack = mp.get_pack("default")
        pack_copy: DataPack = mp.get_pack("copy")

        # Check that both packs are processed by the DummyProcessor once,
        #  because we use different selectors.
        self.assertEqual(pack.get_single(NewType).value, "[PACK]")
        self.assertEqual(pack_copy.get_single(NewType).value, "[PACK]")