def pack_example(input_path, output_path):
    """
    This example reads data from `input_path` and serializes the results
    to `output_path`.

    Args:
        input_path: The dataset directory to read from.
        output_path: The directory to write the serialized data packs to.

    Returns:
        None
    """
    print("Pack serialization example.")
    nlp = Pipeline[DataPack]()

    nlp.set_reader(OntonotesReader())
    nlp.add(NLTKSentenceSegmenter())
    nlp.add(NLTKWordTokenizer())
    nlp.add(NLTKPOSTagger())

    # This is a simple writer that serializes the result to the given
    # output directory and uses the DocID field in the data pack as the
    # file name.
    nlp.add(PackNameJsonPackWriter(), {
        'output_dir': output_path,
        'indent': 2,
        'overwrite': True,
    })

    nlp.run(input_path)
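A minimal way to drive the example above; the paths are illustrative (the OntoNotes sample directory mirrors the one used in the tests later in this section, and "./serialized_packs" is a hypothetical output directory):

if __name__ == '__main__':
    # Hypothetical paths, chosen only for illustration.
    pack_example("data_samples/ontonotes/00", "./serialized_packs")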
def setup(config: Config) -> Pipeline:
    resource = Resources()
    query_pipeline = Pipeline[MultiPack](resource=resource)
    query_pipeline.set_reader(
        reader=MultiPackTerminalReader(), config=config.reader)
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.translator)
    query_pipeline.add(
        component=BertBasedQueryCreator(), config=config.query_creator)
    query_pipeline.add(
        component=SearchProcessor(), config=config.searcher)

    top_response_pack_name = config.indexer.response_pack_name + '_0'

    query_pipeline.add(
        component=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.back_translator)

    query_pipeline.initialize()

    return query_pipeline
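A sketch of how setup() might be driven, assuming a config.yml shaped like the ones used in the other examples here; the file name, the import path for Config, and the driver loop are assumptions, not part of setup() itself:

import yaml

from forte.common.configuration import Config  # import path assumed

config = Config(yaml.safe_load(open("config.yml", "r")),
                default_hparams=None)
query_pipeline = setup(config)
for m_pack in query_pipeline.process_dataset():
    # Pack names follow the other examples in this section.
    print(m_pack.get_pack("response").text)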
def setUp(self):
    self.nltk = Pipeline[DataPack]()
    self.nltk.set_reader(StringReader())
    self.nltk.add(NLTKSentenceSegmenter())
    self.nltk.add(NLTKWordTokenizer())
    self.nltk.add(NLTKPOSTagger())
    self.nltk.initialize()
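A small test method that could follow this setUp; the input string and the assertion are illustrative:

def test_pos_tagging(self):
    pack = self.nltk.process("Forte makes NLP pipelines easy.")
    for sentence in pack.get(Sentence):
        for token in pack.get(Token, sentence):
            # NLTKPOSTagger should have attached a POS tag to each token.
            self.assertIsNotNone(token.pos)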
def setUp(self):
    self.nltk = Pipeline[DataPack]()
    self.nltk.set_reader(StringReader())
    self.nltk.add(NLTKSentenceSegmenter())
    self.nltk.add(NLTKWordTokenizer())
    self.nltk.add(NLTKPOSTagger())

    config = {'pattern': 'NP: {<DT>?<JJ>*<NN>}'}
    self.nltk.add(NLTKChunker(), config=config)
    self.nltk.initialize()
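A sketch of a chunker check that could pair with this setUp; the sentence and the expected phrase are illustrative, assuming the NLTK tagger labels "The big dog" as DT JJ NN so that it matches the NP pattern:

def test_chunking(self):
    pack = self.nltk.process("The big dog barked.")
    phrases = [phrase.text for phrase in pack.get(Phrase)]
    # With the NP pattern above, "The big dog" should form one phrase.
    self.assertIn("The big dog", phrases)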
def setUp(self):
    random.seed(0)
    self.nlp = Pipeline[MultiPack]()

    boxer_config = {'pack_name': 'input_src'}

    self.nlp.set_reader(reader=StringReader())
    self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
    self.nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    self.nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
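One way a test might exercise this MultiPack setup; note that initialize() is deferred to the test body here, and the input text is illustrative:

def test_tokens_on_all_packs(self):
    self.nlp.initialize()
    for m_pack in self.nlp.process_dataset(["Forte builds NLP pipelines."]):
        src_pack = m_pack.get_pack('input_src')
        for token in src_pack.get(Token):
            # The AllPackSelector runs should tag tokens in every pack.
            self.assertIsNotNone(token.pos)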
def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    ner_predictor = CoNLLNERPredictor()
    pl.add_processor(ner_predictor, ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_dir,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)

    pl.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")

        # First method to get entries in a sentence.
        tokens = [(token.text, token.pos) for token in
                  pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type) for entity in
                    pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # Second method to get entries in a sentence.
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_serialize_deserialize_processor(self):
    pipe_serialize = Pipeline[DataPack]()
    pipe_serialize.set_reader(OntonotesReader())
    pipe_serialize.add(
        AnnotationRemover(),
        # Remove tokens and sentences from OntonotesReader.
        {
            'removal_types': [
                'ft.onto.base_ontology.Token',
                'ft.onto.base_ontology.Sentence',
            ]
        })
    pipe_serialize.add(NLTKSentenceSegmenter())
    pipe_serialize.add(NLTKWordTokenizer())
    pipe_serialize.add(NLTKPOSTagger())

    output_path = tempfile.mkdtemp()
    pipe_serialize.add(DocIdJsonPackWriter(), {
        'output_dir': output_path,
        'indent': 2,
    })

    dataset_path = "data_samples/ontonotes/00"
    pipe_serialize.run(dataset_path)

    pipe_deserialize = Pipeline[DataPack]()
    pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
    pipe_deserialize.initialize()

    token_counts: Dict[str, int] = {}

    # This basically tests whether the deserialized data is still the
    # same as expected.
    pack: DataPack
    for pack in pipe_deserialize.process_dataset(output_path):
        tokens: List[Token] = list(pack.get(Token))
        token_counts[pack.pack_name] = len(tokens)

    expected_count = {
        'bn/abc/00/abc_0039': 72,
        'bn/abc/00/abc_0019': 370,
        'bn/abc/00/abc_0059': 39,
        'bn/abc/00/abc_0009': 424,
        'bn/abc/00/abc_0029': 487,
        'bn/abc/00/abc_0069': 428,
        'bn/abc/00/abc_0049': 73
    }

    assert token_counts == expected_count
    shutil.rmtree(output_path)
def main(dataset_dir: str, ner_model_path: str, srl_model_path: str):
    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_path, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    pl.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_path,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)

    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            # First method to get entries in a sentence.
            tokens = [(token.text, token.pos) for token in
                      pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type) for entity in
                        pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # Second method to get entries in a sentence.
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:",
                      entities, "\n")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))
def main():
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())
    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)

    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second).")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")

        # First method to get entries in a sentence.
        tokens = [(token.text, token.pos) for token in
                  pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type) for entity in
                    pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # Second method to get entries in a sentence.
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_pipeline(self, texts, expected_outputs, expected_tokens):
    nlp = Pipeline[MultiPack]()

    boxer_config = {
        'pack_name': 'input'
    }

    processor_config = {
        'augment_entry': "ft.onto.base_ontology.Token",
        'other_entry_policy': {
            'type': '',
            'kwargs': {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align"
            }
        },
        'type': 'data_augmentation_op',
        'data_aug_op':
            'tests.forte.processors.base.'
            'data_augment_replacement_processor_test.TmpReplacer',
        'data_aug_op_config': {
            'type': '',
            'kwargs': {}
        },
        'augment_pack_names': {
            'kwargs': {
                'input': 'augmented_input'
            }
        }
    }

    nlp.set_reader(reader=StringReader())
    nlp.add(component=MultiPackBoxer(), config=boxer_config)
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=processor_config)
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(texts)):
        aug_pack = m_pack.get_pack('augmented_input')
        self.assertEqual(aug_pack.text, expected_outputs[idx])
        for j, token in enumerate(aug_pack.get(Token)):
            self.assertEqual(token.text, expected_tokens[idx][j])
def main(dataset_dir: str):
    config = yaml.safe_load(open("config.yml", "r"))
    config = Config(config, default_hparams=None)

    pl = Pipeline[DataPack]()
    pl.set_reader(PlainTextReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())
    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            # First method to get entries in a sentence.
            tokens = [(token.text, token.pos) for token in
                      pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type) for entity in
                        pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # Second method to get entries in a sentence.
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:",
                      entities, "\n")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_encoder_phrase(self):
    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(StringReader())
    pipeline.add(NLTKSentenceSegmenter())
    pipeline.add(NLTKWordTokenizer())
    pipeline.add(NLTKPOSTagger())
    config = {'pattern': 'NP: {<DT>?<JJ>*<NN>}'}
    pipeline.add(NLTKChunker(), config=config)
    pipeline.add(
        PretrainedEncoder(),
        config={'entry_type': 'ft.onto.base_ontology.Phrase'})
    pipeline.initialize()

    sentences = [
        "This tool is called Forte.",
        "The goal of this project is to help you build NLP "
        "pipelines.",
        "NLP has never been made this easy before."
    ]
    document = ' '.join(sentences)
    pack = pipeline.process(document)
    for phrase in pack.get(Phrase):
        self.assertEqual(phrase.embedding.shape, (1, 512, 768))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config_file",
        default="./config.yml",
        help="Config YAML filepath")
    args = parser.parse_args()

    # Load the config.
    config = yaml.safe_load(open(args.config_file, "r"))

    nlp: Pipeline[MultiPack] = Pipeline()
    nlp.set_reader(
        RandomDataSelector(), config=config["data_selector_config"])
    nlp.add(component=MultiPackBoxer(), config=config["boxer_config"])
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=config["da_processor_config"])

    nlp.initialize()

    for _, m_pack in enumerate(nlp.process_dataset()):
        aug_pack = m_pack.get_pack('augmented_input')
        logging.info(aug_pack.text)
def setUp(self):
    self.nltk = Pipeline()
    self.nltk.set_reader(StringReader())
    self.nltk.add_processor(NLTKSentenceSegmenter())
    self.nltk.add_processor(NLTKWordTokenizer())
    self.nltk.add_processor(NLTKPOSTagger())
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(reader)
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the result to the current
# directory and uses the DocID field in the data pack as the file name.
nlp.add_processor(
    DocIdJsonPackWriter(),
    HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    ))

nlp.initialize()
nlp.run(data_path)
def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    if not os.path.exists(config.indexer.model_dir):
        print("Creating a new index...")
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        encoder.to(device)

        feature_original_types = {
            "id": ["int64", "FixedLenFeature"],
            "input_ids": ["int64", "FixedLenFeature",
                          config.indexer.max_seq_length],
            "segment_ids": ["int64", "FixedLenFeature",
                            config.indexer.max_seq_length],
            "text": ["str", "FixedLenFeature"]
        }

        hparam = {
            "allow_smaller_final_batch": True,
            "batch_size": config.indexer.batch_size,
            "dataset": {
                "data_name": "data",
                "feature_original_types": feature_original_types,
                "files": config.indexer.pickle_data_dir
            },
            "shuffle": False
        }

        print("Embedding the text using BERTEncoder...")
        record_data = RecordData(hparams=hparam, device=device)
        data_iterator = DataIterator(record_data)

        index = EmbeddingBasedIndexer(hparams={
            "index_type": "GpuIndexFlatIP",
            "dim": 768,
            "device": "gpu0"
        })

        for idx, batch in enumerate(data_iterator):
            ids = batch["id"]
            input_ids = batch["input_ids"]
            segment_ids = batch["segment_ids"]
            text = batch["text"]
            _, pooled_output = get_embeddings(encoder, input_ids, segment_ids)
            index.add(vectors=pooled_output,
                      meta_data={k.item(): v for k, v in zip(ids, text)})

            if (idx + 1) % 50 == 0:
                print(f"Completed {idx + 1} batches of size "
                      f"{config.indexer.batch_size}")

        index.save(path=config.indexer.model_dir)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(MultiPackTerminalReader())

    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.translator)
    query_pipeline.add_processor(
        processor=QueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name="doc_0"))
    # query_pipeline.add_processor(
    #     processor=CoNLLNERPredictor(), config=config.NER,
    #     selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(),
        config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():
        # Update the resource to be used in the next conversation.
        query_pack = m_pack.get_pack("query")
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack("response")
        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")

        pack = m_pack.get_pack("doc_0")
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")

        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))
def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(
        reader=MultiPackTerminalReader(), config=config.reader)
    query_pipeline.add_processor(
        processor=MicrosoftBingTranslator(), config=config.translator)
    query_pipeline.add_processor(
        processor=BertBasedQueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=MicrosoftBingTranslator(), config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():
        # Update the resource to be used in the next conversation.
        query_pack = m_pack.get_pack(config.translator.in_pack_name)
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack(config.back_translator.in_pack_name)
        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")

        pack = m_pack.get_pack(config.indexer.response_pack_name[0])
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")

        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_replace_token(self, texts, expected_outputs, expected_tokens,
                       expected_links):
    for idx, text in enumerate(texts):
        file_path = os.path.join(self.test_dir, f"{idx + 1}.txt")
        with open(file_path, 'w') as f:
            f.write(text)

    nlp = Pipeline[MultiPack]()
    reader_config = {
        "input_pack_name": "input_src",
        "output_pack_name": "output_tgt"
    }
    nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)

    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.initialize()

    processor_config = {
        'augment_entry': "ft.onto.base_ontology.Token",
        'other_entry_policy': {
            "kwargs": {
                "ft.onto.base_ontology.Sentence": "auto_align"
            }
        },
        'type': 'data_augmentation_op',
        'data_aug_op':
            'tests.forte.processors.base.'
            'data_augment_replacement_processor_test.TmpReplacer',
        "data_aug_op_config": {
            'kwargs': {}
        },
        'augment_pack_names': {
            'kwargs': {}
        }
    }

    processor = ReplacementDataAugmentProcessor()
    processor.initialize(resources=None, configs=processor_config)

    for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
        src_pack = m_pack.get_pack('input_src')
        tgt_pack = m_pack.get_pack('output_tgt')

        num_mpl_orig, num_mpg_orig = 0, 0
        # Copy the source pack to the target pack.
        tgt_pack.set_text(src_pack.text)

        src_pack.add_entry(Document(src_pack, 0, len(src_pack.text)))
        for anno in src_pack.get(Annotation):
            new_anno = type(anno)(tgt_pack, anno.begin, anno.end)
            tgt_pack.add_entry(new_anno)

            # Create a MultiPackLink.
            m_pack.add_entry(MultiPackLink(m_pack, anno, new_anno))

            # Create a MultiPackGroup.
            m_pack.add_entry(MultiPackGroup(m_pack, [anno, new_anno]))

            # Count the number of MultiPackLinks/MultiPackGroups.
            num_mpl_orig += 1
            num_mpg_orig += 1

        # Create Links in the source pack.
        # The Links should form a tree:
        #
        #                           Link 3
        #                  _________|_________
        #                 |                   |
        #               Link 2                |
        #          _______|________           |
        #         |                |          |
        #       Link 1             |          |
        #    ______|_____          |          |
        #   |            |         |          |
        # token 1     token 2   token 3    token 4  ...  ...
        prev_entry = None
        for i, token in enumerate(src_pack.get(Token)):
            # Avoid overlapping with deleted tokens.
            if i < 10:
                continue
            if prev_entry:
                link = Link(src_pack, prev_entry, token)
                src_pack.add_entry(link)
                prev_entry = link
            else:
                prev_entry = token

        # Create Groups in the target pack.
        # The Groups should form a tree like the Links.
        prev_entry = None
        for i, token in enumerate(tgt_pack.get(Token)):
            # Avoid overlapping with deleted tokens.
            if i < 10:
                continue
            if prev_entry:
                group = Group(tgt_pack, [prev_entry, token])
                tgt_pack.add_entry(group)
                prev_entry = group
            else:
                prev_entry = token

        doc_src = list(src_pack.get(Document))[0]
        doc_tgt = list(tgt_pack.get(Document))[0]

        sent_src = list(src_pack.get(Sentence))[0]
        sent_tgt = list(tgt_pack.get(Sentence))[0]

        # Insert two extra Links in the src_pack. They should not be
        # copied to new_src_pack, because the Document is not copied.
        link_src_low = src_pack.add_entry(Link(src_pack, doc_src, sent_src))
        src_pack.add_entry(Link(src_pack, link_src_low, sent_src))

        # Insert two extra Groups in the tgt_pack. They should not be
        # copied to new_tgt_pack, because the Document is not copied.
        group_tgt_low = tgt_pack.add_entry(
            Group(tgt_pack, [doc_tgt, sent_tgt]))
        tgt_pack.add_entry(Group(tgt_pack, [group_tgt_low, sent_tgt]))

        # Call the augment function explicitly for duplicate replacement
        # to test the False case of the _replace function.
        processor._augment(m_pack, ["input_src", "output_tgt"])

        # Test the insertion and deletion.
        for pack in (src_pack, tgt_pack):
            # Insert an "NLP" at the beginning.
            processor._insert(" NLP ", pack, 0)
            processor._insert(" NLP ", pack, 18)
            processor._insert(" NLP ", pack, len(pack.text) - 2)
            processor._insert("NLP", pack, len(pack.text) - 1)
            # Delete the second token "and".
            processor._delete(list(pack.get(Token))[1])

            # This duplicate insertion should be invalid.
            processor._insert(" NLP ", pack, 0)
            # This insertion overlaps with a replacement.
            # It should be invalid.
            processor._insert(" NLP ", pack, 2)

        processor._process(m_pack)

        new_src_pack = m_pack.get_pack('augmented_input_src')
        new_tgt_pack = m_pack.get_pack('augmented_output_tgt')

        self.assertEqual(new_src_pack.text, expected_outputs[idx] + "\n")

        for j, token in enumerate(new_src_pack.get(Token)):
            self.assertEqual(token.text, expected_tokens[idx][j])

        for sent in new_src_pack.get(Sentence):
            self.assertEqual(sent.text, expected_outputs[idx])

        # Test the copied Links.
        prev_link = None
        for i, link in enumerate(new_src_pack.get(Link)):
            if prev_link:
                self.assertEqual(link.get_parent().tid, prev_link.tid)
                self.assertEqual(link.get_child().text,
                                 expected_links[idx][i])
            prev_link = link

        # Test the copied Groups.
        prev_group = None
        for i, group in enumerate(new_tgt_pack.get(Group)):
            members = group.get_members()
            if isinstance(members[0], Token):
                member_token = members[0]
                member_group = members[1]
            else:
                member_token = members[1]
                member_group = members[0]
            if prev_group:
                self.assertEqual(isinstance(member_token, Token), True)
                self.assertEqual(isinstance(member_group, Group), True)
                self.assertEqual(member_group.tid, prev_group.tid)
                self.assertEqual(member_token.text, expected_links[idx][i])
            prev_group = group

        # The two extra Links should not be copied, because of the
        # missing Document.
        self.assertEqual(len(list(src_pack.get(Link))) - 2,
                         len(list(new_src_pack.get(Link))))
        # The two extra Groups should not be copied, because of the
        # missing Document.
        self.assertEqual(len(list(tgt_pack.get(Group))) - 2,
                         len(list(new_tgt_pack.get(Group))))

        # Test the MultiPackLinks/MultiPackGroups.
        num_mpl_aug, num_mpg_aug = 0, 0
        for mpl in m_pack.get(MultiPackLink):
            parent = mpl.get_parent()
            child = mpl.get_child()
            num_mpl_aug += 1
            self.assertEqual(parent.text, child.text)
            self.assertNotEqual(parent.pack.meta.pack_id,
                                child.pack.meta.pack_id)

        for mpg in m_pack.get(MultiPackGroup):
            members = mpg.get_members()
            num_mpg_aug += 1
            self.assertEqual(members[0].text, members[1].text)
            self.assertNotEqual(members[0].pack.meta.pack_id,
                                members[1].pack.meta.pack_id)

        # Test the number of MultiPackLinks/MultiPackGroups.
        # Reduce the aug and orig counters by 1, because the Document is
        # not copied, so we ignore the MPL and MPG between Documents.
        # The number should be doubled, except for one deletion.
        self.assertEqual(num_mpl_aug - 1, (num_mpl_orig - 1) * 2 - 1)
        self.assertEqual(num_mpg_aug - 1, (num_mpg_orig - 1) * 2 - 1)