def setup(config: Config) -> Pipeline:
    resource = Resources()
    query_pipeline = Pipeline[MultiPack](resource=resource)
    query_pipeline.set_reader(
        reader=MultiPackTerminalReader(), config=config.reader)
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.translator)
    query_pipeline.add(
        component=BertBasedQueryCreator(), config=config.query_creator)
    query_pipeline.add(
        component=SearchProcessor(), config=config.searcher)

    top_response_pack_name = config.indexer.response_pack_name + '_0'

    query_pipeline.add(
        component=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.back_translator)

    query_pipeline.initialize()

    return query_pipeline
def test_name_match_selector_backward_compatability(self) -> None:
    selector = NameMatchSelector(select_name="pack1")
    selector.initialize()
    packs = selector.select(self.multi_pack)
    doc_ids = ["1"]
    for doc_id, pack in zip(doc_ids, packs):
        self.assertEqual(doc_id, pack.pack_name)

    selector = NameMatchSelector("pack1")
    selector.initialize()
    packs = selector.select(self.multi_pack)
    doc_ids = ["1"]
    for doc_id, pack in zip(doc_ids, packs):
        self.assertEqual(doc_id, pack.pack_name)
def test_process_multi_next(self):
    from forte.data.readers import OntonotesReader

    # Define and configure the Pipeline.
    nlp = Pipeline[DataPack]()
    nlp.set_reader(OntonotesReader())

    pack_name = 'test_pack'
    nlp.add(MultiPackBoxer(), {'pack_name': pack_name})
    nlp.add(
        DummyRelationExtractor(),
        config={"batcher": {"batch_size": 5}},
        selector=NameMatchSelector(select_name=pack_name))
    nlp.initialize()

    dataset_path = data_samples_root + "/ontonotes/00"

    # Get the processed packs from the dataset.
    m_pack: MultiPack
    for m_pack in nlp.process_dataset(dataset_path):
        pack = m_pack.get_pack(pack_name)
        # Get each sentence from the pack.
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            # Second method to get entries in a sentence.
            tokens = [token.text for token in pack.get(Token, sentence)]
            self.assertEqual(sent_text, " ".join(tokens))
def build_pipeline(
        result_dir: str, word_counter: Counter, tag_counter: Counter):
    r"""Build the pipeline that parses IU Xray reports with tokenization,
    lowercasing and non-alpha removal, and generates a Forte JSON file
    (with the same name as the report) containing the preprocessed content
    along with the impression, the findings and the path to the parent
    image.

    Args:
        result_dir: the directory to save the Forte JSON files.
        word_counter: word counter shared through the pipeline resources.
        tag_counter: tag counter shared through the pipeline resources.

    Returns:
        pipeline: the built pipeline that processes the XML files.
    """
    pipeline = Pipeline[MultiPack]()
    pipeline.resource.update(word_counter=word_counter)
    pipeline.resource.update(tag_counter=tag_counter)
    pipeline.set_reader(IUXrayReportReader())
    pipeline.add(MultiPackBoxer())
    pipeline.add(
        PackNameJsonPackWriter(),
        {'indent': 2, 'output_dir': result_dir, 'overwrite': True},
        NameMatchSelector(select_name='default'))
    pipeline.initialize()
    return pipeline
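# A minimal usage sketch for build_pipeline above. The directory names and
# the report folder layout are placeholders chosen for illustration; only
# the Counter arguments and Pipeline.process_dataset come from the snippet
# and the Forte API.
from collections import Counter

word_counter: Counter = Counter()
tag_counter: Counter = Counter()
pipeline = build_pipeline("output_json", word_counter, tag_counter)

# Each processed report is written out by PackNameJsonPackWriter; iterating
# drives the pipeline over the (hypothetical) input directory of XML reports.
for _ in pipeline.process_dataset("iu_xray_reports"):
    pass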
def test_name_match_selector(self) -> None:
    selector = NameMatchSelector()
    selector.initialize(
        configs={"select_name": "pack1"},
    )
    packs = selector.select(self.multi_pack)
    doc_ids = ["1"]
    for doc_id, pack in zip(doc_ids, packs):
        self.assertEqual(doc_id, pack.pack_name)

    # Test reverse selection.
    selector.initialize(
        configs={"select_name": "pack1", "reverse_selection": True},
    )
    packs = selector.select(self.multi_pack)
    doc_ids = ["2", "Three"]
    for doc_id, pack in zip(doc_ids, packs):
        self.assertEqual(doc_id, pack.pack_name)
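# The selector unit tests in this collection refer to a `self.multi_pack`
# fixture that is not shown. Below is a plausible setUp, inferred from the
# asserted pack names ("1", "2", "Three") and the selected reference name
# ("pack1"); the exact original fixture may differ.
from forte.data.multi_pack import MultiPack

def setUp(self) -> None:
    self.multi_pack = MultiPack()
    data_pack1 = self.multi_pack.add_pack(ref_name="pack1")
    data_pack2 = self.multi_pack.add_pack(ref_name="pack2")
    data_pack3 = self.multi_pack.add_pack(ref_name="pack_three")
    data_pack1.pack_name = "1"
    data_pack2.pack_name = "2"
    data_pack3.pack_name = "Three"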
    # Call spacy on remote.
    RemoteProcessor(),
    config={
        "url": "http://localhost:8008"
    },
).add(
    # Call allennlp on remote.
    RemoteProcessor(),
    config={
        "url": "http://localhost:8009"
    },
).add(
    MultiPackBoxer()
).add(
    TweakData()
).add(
    NLIProcessor(),
    selector=NameMatchSelector(),
    selector_config={
        "select_name": "default",
        "reverse_selection": True,
    }
).add(
    PackNameMultiPackWriter(),
    config={
        "output_dir": output_dir
    }
).add(
    ProgressPrinter(),
).run()
def test_name_match_selector(self) -> None:
    selector = NameMatchSelector(select_name="pack1")
    packs = selector.select(self.multi_pack)
    doc_ids = ["1"]
    for doc_id, pack in zip(doc_ids, packs):
        self.assertEqual(doc_id, pack.meta.doc_id)
def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(reader=MultiPackTerminalReader(),
                              config=config.reader)

    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.translator)
    query_pipeline.add_processor(processor=BertBasedQueryCreator(),
                                 config=config.query_creator)
    query_pipeline.add_processor(processor=SearchProcessor(),
                                 config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(processor=MicrosoftBingTranslator(),
                                 config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():
        # Update the resource to be used in the next conversation turn.
        query_pack = m_pack.get_pack(config.translator.in_pack_name)
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack(config.back_translator.in_pack_name)
        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")

        pack = m_pack.get_pack(config.indexer.response_pack_name[0])
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")

        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))
def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    if not os.path.exists(config.indexer.model_dir):
        print("Creating a new index...")
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        encoder.to(device)

        feature_original_types = {
            "id": ["int64", "FixedLenFeature"],
            "input_ids": ["int64", "FixedLenFeature",
                          config.indexer.max_seq_length],
            "segment_ids": ["int64", "FixedLenFeature",
                            config.indexer.max_seq_length],
            "text": ["str", "FixedLenFeature"]
        }

        hparam = {
            "allow_smaller_final_batch": True,
            "batch_size": config.indexer.batch_size,
            "dataset": {
                "data_name": "data",
                "feature_original_types": feature_original_types,
                "files": config.indexer.pickle_data_dir
            },
            "shuffle": False
        }

        print("Embedding the text using BERTEncoder...")
        record_data = RecordData(hparams=hparam, device=device)
        data_iterator = DataIterator(record_data)
        index = EmbeddingBasedIndexer(hparams={
            "index_type": "GpuIndexFlatIP",
            "dim": 768,
            "device": "gpu0"
        })

        for idx, batch in enumerate(data_iterator):
            ids = batch["id"]
            input_ids = batch["input_ids"]
            segment_ids = batch["segment_ids"]
            text = batch["text"]
            _, pooled_output = get_embeddings(encoder, input_ids, segment_ids)
            index.add(vectors=pooled_output,
                      meta_data={k.item(): v for k, v in zip(ids, text)})

            if (idx + 1) % 50 == 0:
                print(f"Completed {idx + 1} batches of size "
                      f"{config.indexer.batch_size}")

        index.save(path=config.indexer.model_dir)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(MultiPackTerminalReader())

    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.translator)
    query_pipeline.add_processor(
        processor=QueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name="doc_0"))
    # query_pipeline.add_processor(
    #     processor=CoNLLNERPredictor(), config=config.NER,
    #     selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(),
        config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():
        # Update the resource to be used in the next conversation turn.
        query_pack = m_pack.get_pack("query")
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack("response")
        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")

        pack = m_pack.get_pack("doc_0")
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")

        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_reuse_processor(self):
    # Create a basic pipeline of MultiPacks that each hold two packs
    # (the second pack is created by copying).
    nlp = (
        Pipeline()
        .set_reader(SentenceReader())
        .add(MultiPackBoxer())
        .add(MultiPackCopier())
    )

    # Create one shared instance of this processor.
    dummy = DummyPackProcessor()
    nlp.add(
        dummy,
        config={"test": "dummy1"},
        selector=NameMatchSelector(),
        selector_config={"select_name": "default"},
    )

    # Re-adding the same processor instance with a different config is
    # rejected, so this raises ProcessorConfigError.
    with self.assertRaises(ProcessorConfigError):
        nlp.add(dummy, config={"test": "dummy2"})

    # This will add the component, with a different selector.
    nlp.add(
        dummy,
        selector=NameMatchSelector(),
        selector_config={"select_name": "copy"},
    )
    nlp.initialize()

    # Check that the two processors have the same name.
    self.assertEqual(
        nlp.components[2].name, get_full_module_name(DummyPackProcessor)
    )
    self.assertEqual(
        nlp.components[3].name, get_full_module_name(DummyPackProcessor)
    )

    # Check that the two processors are also the same instance.
    self.assertEqual(nlp.components[2], nlp.components[3])

    # Check that initialization is only done once, so the count here
    # will only be 1.
    self.assertEqual(nlp.components[2].initialize_count, 1)
    self.assertEqual(nlp.components[3].initialize_count, 1)

    # Check that the configuration is not changed by the second insertion.
    self.assertEqual(nlp.components[3].configs.test, "dummy1")

    # Run it once to make sure it can run.
    dataset_path = os.path.join(data_samples_root, "random_texts", "0.txt")
    nlp.run(dataset_path)

    # Check that `is_initialized` becomes False after `run`, because `run`
    # calls the `finish` function of all components.
    self.assertFalse(nlp.components[2].is_initialized)
    self.assertFalse(nlp.components[3].is_initialized)

    # Check that we are able to re-initialize the pipeline.
    nlp.initialize()  # initialize the first time.
    nlp.initialize()  # re-initialize.

    # Check the names again after re-initialization.
    self.assertEqual(
        nlp.components[2].name, get_full_module_name(DummyPackProcessor)
    )
    self.assertEqual(
        nlp.components[3].name, get_full_module_name(DummyPackProcessor)
    )

    # Obtain the results from the MultiPack.
    mp: MultiPack = nlp.process(dataset_path)
    pack: DataPack = mp.get_pack("default")
    pack_copy: DataPack = mp.get_pack("copy")

    # Check that both packs are processed by the DummyPackProcessor once,
    # because we use different selectors.
    self.assertEqual(pack.get_single(NewType).value, "[PACK]")
    self.assertEqual(pack_copy.get_single(NewType).value, "[PACK]")