Example 1
def setup(config: Config) -> Pipeline:
    resource = Resources()
    query_pipeline = Pipeline[MultiPack](resource=resource)
    query_pipeline.set_reader(
        reader=MultiPackTerminalReader(), config=config.reader)
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.translator)
    query_pipeline.add(
        component=BertBasedQueryCreator(), config=config.query_creator)
    query_pipeline.add(
        component=SearchProcessor(), config=config.searcher)

    top_response_pack_name = config.indexer.response_pack_name + '_0'

    query_pipeline.add(
        component=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.back_translator)

    query_pipeline.initialize()

    return query_pipeline
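
The pipeline returned by setup() can then drive an interactive query loop. A minimal sketch, assuming the same config object that was passed to setup() and following the process_dataset() loop used in Example 8:

# Sketch (hypothetical driver): run the interactive query pipeline and print
# the top retrieved document, whose pack name is built exactly as in setup().
pipeline = setup(config)
for m_pack in pipeline.process_dataset():
    top_pack = m_pack.get_pack(config.indexer.response_pack_name + '_0')
    print(top_pack.text)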
Example 2
    def test_name_match_selector_backward_compatability(self) -> None:
        selector = NameMatchSelector(select_name="pack1")
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        selector = NameMatchSelector("pack1")
        selector.initialize()
        packs = selector.select(self.multi_pack)
        doc_ids = ["1"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)
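
These selector tests rely on a self.multi_pack fixture that is not shown in this listing. A plausible setUp, consistent with the assertions in Examples 2 and 5 (the ref_name of the third pack is purely illustrative), could look like this:

    def setUp(self) -> None:
        # Hypothetical fixture (assumes: from forte.data.multi_pack import MultiPack):
        # three packs whose reference names include "pack1" and whose
        # pack_name values are "1", "2" and "Three".
        self.multi_pack = MultiPack()
        pack1 = self.multi_pack.add_pack(ref_name="pack1")
        pack2 = self.multi_pack.add_pack(ref_name="pack2")
        pack3 = self.multi_pack.add_pack(ref_name="pack_three")
        pack1.pack_name = "1"
        pack2.pack_name = "2"
        pack3.pack_name = "Three"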
Example 3
    def test_process_multi_next(self):
        from forte.data.readers import OntonotesReader

        # Define and configure the pipeline
        nlp = Pipeline[DataPack]()
        nlp.set_reader(OntonotesReader())

        pack_name = 'test_pack'
        nlp.add(MultiPackBoxer(), {'pack_name': pack_name})
        nlp.add(DummyRelationExtractor(),
                config={"batcher": {
                    "batch_size": 5
                }},
                selector=NameMatchSelector(select_name=pack_name))
        nlp.initialize()

        dataset_path = data_samples_root + "/ontonotes/00"

        # get processed pack from dataset
        m_pack: MultiPack
        for m_pack in nlp.process_dataset(dataset_path):
            pack = m_pack.get_pack(pack_name)
            # get sentence from pack
            for sentence in pack.get(Sentence):
                sent_text = sentence.text

                # A second way to get the entries inside a sentence
                tokens = [token.text for token in pack.get(Token, sentence)]
                self.assertEqual(sent_text, " ".join(tokens))
Example 4
def build_pipeline(result_dir: str, word_counter: Counter,
                   tag_counter: Counter):
    r"""Build the pipeline to parse IU Xray report with tokenizer, lowercase and
    non-alpha removal to generate forte json file with the same name with
    preprocessed content and information of impression, findings and path to the
    parent image.
    Args:
        result_dir: the directory to save the forte json files.
    Return:
        pipeline: built pipeline to process the xml files
    """

    pipeline = Pipeline[MultiPack]()
    pipeline.resource.update(word_counter=word_counter)
    pipeline.resource.update(tag_counter=tag_counter)
    pipeline.set_reader(IUXrayReportReader())
    pipeline.add(MultiPackBoxer())
    pipeline.add(PackNameJsonPackWriter(), {
        'indent': 2,
        'output_dir': result_dir,
        'overwrite': True
    }, NameMatchSelector(select_name='default'))
    pipeline.initialize()

    return pipeline
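
A hypothetical call site for build_pipeline; the directory paths are placeholders, and run() consumes the reader input in the same way as in Example 10:

from collections import Counter

# Illustrative usage (paths are placeholders): build the pipeline and let the
# IUXrayReportReader consume the report directory.
word_counter: Counter = Counter()
tag_counter: Counter = Counter()
pipeline = build_pipeline("output/json", word_counter, tag_counter)
pipeline.run("data/iu_xray_reports")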
Example 5
    def test_name_match_selector(self) -> None:
        selector = NameMatchSelector()
        selector.initialize(
            configs={"select_name": "pack1"},
        )
        packs = selector.select(self.multi_pack)
        doc_ids = ["1"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)

        # Test reverse selection.
        selector.initialize(
            configs={"select_name": "pack1", "reverse_selection": True},
        )
        packs = selector.select(self.multi_pack)
        doc_ids = ["2", "Three"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.pack_name)
Example 6
    # Call spacy on remote.
    RemoteProcessor(),
    config={
        "url": "http://localhost:8008"
    },
).add(
    # Call allennlp on remote.
    RemoteProcessor(),
    config={
        "url": "http://localhost:8009"
    },
).add(
    MultiPackBoxer()
).add(
    TweakData()
).add(
    NLIProcessor(),
    selector=NameMatchSelector(),
    selector_config={
        "select_name": "default",
        "reverse_selection": True,
    }
).add(
    PackNameMultiPackWriter(),
    config={
        "output_dir": output_dir
    }
).add(
    ProgressPrinter(),
).run()
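
The selector_config dictionary is presumably forwarded to the selector's initialize() call. An explicit, roughly equivalent setup in the style of Example 5 would be:

# Sketch: configuring the same selector explicitly instead of through
# selector_config (mirrors the configs-based initialization in Example 5).
selector = NameMatchSelector()
selector.initialize(
    configs={"select_name": "default", "reverse_selection": True}
)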
Example 7
    def test_name_match_selector(self) -> None:
        selector = NameMatchSelector(select_name="pack1")
        packs = selector.select(self.multi_pack)
        doc_ids = ["1"]
        for doc_id, pack in zip(doc_ids, packs):
            self.assertEqual(doc_id, pack.meta.doc_id)
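
This older example calls select() without initializing the selector and reads the document id from pack.meta.doc_id. Under the newer API shown in Example 2, the same check would look roughly like this:

    def test_name_match_selector(self) -> None:
        # Updated sketch of the test above (see Example 2): initialize() is
        # required before select(), and pack_name replaces meta.doc_id.
        selector = NameMatchSelector(select_name="pack1")
        selector.initialize()
        packs = selector.select(self.multi_pack)
        for doc_id, pack in zip(["1"], packs):
            self.assertEqual(doc_id, pack.pack_name)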
Example 8
def main():

    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(reader=MultiPackTerminalReader(),
                              config=config.reader)

    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.translator)
    query_pipeline.add_processor(processor=BertBasedQueryCreator(),
                                 config=config.query_creator)
    query_pipeline.add_processor(processor=SearchProcessor(),
                                 config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=SRLPredictor(),
        config=config.SRL,
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():

        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack(config.translator.in_pack_name)
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack(config.back_translator.in_pack_name)

        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack(config.indexer.response_pack_name[0])
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
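
The script reads its settings from config.yml, which is not part of this listing. The attribute accesses above imply a shape roughly like the following; every key and value here is a hypothetical placeholder:

# Hypothetical outline of config.yml, written as the dict that HParams wraps.
# Only the sections dereferenced in main() are listed; each processor expects
# its own additional options.
config_dict = {
    "reader": {},
    "translator": {"in_pack_name": "query"},
    "query_creator": {},
    "indexer": {"response_pack_name": ["doc_0"]},
    "SRL": {},
    "back_translator": {"in_pack_name": "response"},
}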
Example 9
def main():

    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    if not os.path.exists(config.indexer.model_dir):
        print(f"Creating a new index...")
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        encoder.to(device)

        feature_original_types = {
            "id": ["int64", "FixedLenFeature"],
            "input_ids": ["int64", "FixedLenFeature",
                          config.indexer.max_seq_length],
            "segment_ids": ["int64", "FixedLenFeature",
                            config.indexer.max_seq_length],
            "text": ["str", "FixedLenFeature"]
        }

        hparam = {
            "allow_smaller_final_batch": True,
            "batch_size": config.indexer.batch_size,
            "dataset": {
                "data_name": "data",
                "feature_original_types": feature_original_types,
                "files": config.indexer.pickle_data_dir
            },
            "shuffle": False
        }

        print(f"Embedding the text using BERTEncoder...")
        record_data = RecordData(hparams=hparam, device=device)
        data_iterator = DataIterator(record_data)
        index = EmbeddingBasedIndexer(hparams={
            "index_type": "GpuIndexFlatIP",
            "dim": 768,
            "device": "gpu0"
        })

        for idx, batch in enumerate(data_iterator):
            ids = batch["id"]
            input_ids = batch["input_ids"]
            segment_ids = batch["segment_ids"]
            text = batch["text"]
            _, pooled_output = get_embeddings(encoder, input_ids, segment_ids)
            index.add(vectors=pooled_output,
                      meta_data={k.item(): v for k, v in zip(ids, text)})

            if (idx + 1) % 50 == 0:
                print(f"Completed {idx+1} batches of size "
                      f"{config.indexer.batch_size}")

        index.save(path=config.indexer.model_dir)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(MultiPackTerminalReader())

    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.translator)
    query_pipeline.add_processor(
        processor=QueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name="doc_0"))
    # query_pipeline.add_processor(
    #    processor=CoNLLNERPredictor(), config=config.NER,
    #    selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():

        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack("query")
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack("response")

        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack("doc_0")
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
Example 10
    def test_reuse_processor(self):
        # Create a basic pipeline of multi packs that each hold two packs (by copying)
        nlp = (
            Pipeline()
            .set_reader(SentenceReader())
            .add(MultiPackBoxer())
            .add(MultiPackCopier())
        )

        # Create one shared instance of this extractor
        dummy = DummyPackProcessor()
        nlp.add(
            dummy,
            config={"test": "dummy1"},
            selector=NameMatchSelector(),
            selector_config={"select_name": "default"},
        )

        # This will not add the component successfully, because a different
        #  config is supplied for the already-added processor.
        with self.assertRaises(ProcessorConfigError):
            nlp.add(dummy, config={"test": "dummy2"})

        # This will add the component, with a different selector
        nlp.add(
            dummy,
            selector=NameMatchSelector(),
            selector_config={"select_name": "copy"},
        )
        nlp.initialize()

        # Check that the two processors have the same name.
        self.assertEqual(
            nlp.components[2].name, get_full_module_name(DummyPackProcessor)
        )
        self.assertEqual(
            nlp.components[3].name, get_full_module_name(DummyPackProcessor)
        )

        # Check that the two processors are also the same instance.
        self.assertEqual(nlp.components[2], nlp.components[3])

        # Check that initialization is only done once, so the count here is
        #  just 1.
        self.assertEqual(nlp.components[2].initialize_count, 1)
        self.assertEqual(nlp.components[3].initialize_count, 1)

        # Check that the configuration is not changed by the second insertion.
        self.assertEqual(nlp.components[3].configs.test, "dummy1")

        # Run it once to make sure it can run.
        dataset_path = os.path.join(data_samples_root, "random_texts", "0.txt")
        nlp.run(dataset_path)

        # Check that the components are no longer initialized after `run`,
        #  because it calls the `finish` function of all components.
        self.assertFalse(nlp.components[2].is_initialized)
        self.assertFalse(nlp.components[3].is_initialized)

        # Check that we are able to re-initialize the pipeline.
        nlp.initialize()  # initialize the first time.
        nlp.initialize()  # re-initialize.

        # Check the name again after re-initialize.
        self.assertEqual(
            nlp.components[2].name, get_full_module_name(DummyPackProcessor)
        )
        self.assertEqual(
            nlp.components[3].name, get_full_module_name(DummyPackProcessor)
        )

        # Obtain the results from the multipack.
        mp: MultiPack = nlp.process(dataset_path)
        pack: DataPack = mp.get_pack("default")
        pack_copy: DataPack = mp.get_pack("copy")

        # Check that both packs were processed by the DummyProcessor exactly
        #  once, because we used a different selector for each insertion.
        self.assertEqual(pack.get_single(NewType).value, "[PACK]")
        self.assertEqual(pack_copy.get_single(NewType).value, "[PACK]")