def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config_file", default="./config.yml", help="Config YAML filepath"
    )
    args = parser.parse_args()

    # Load the config.
    with open(args.config_file, "r") as f:
        config = yaml.safe_load(f)

    nlp: Pipeline[MultiPack] = Pipeline()
    nlp.set_reader(RandomDataSelector(), config=config["data_selector_config"])
    nlp.add(component=MultiPackBoxer(), config=config["boxer_config"])
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=config["da_processor_config"],
    )
    nlp.initialize()

    for m_pack in nlp.process_dataset():
        aug_pack = m_pack.get_pack("augmented_input")
        logging.info(aug_pack.text)
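# A minimal sketch of the configuration that main() above loads from
# config.yml after yaml.safe_load. The three top-level keys are taken from
# the code above; the empty dict values are hypothetical placeholders, not
# the project's real defaults.
example_config = {
    "data_selector_config": {},  # forwarded to RandomDataSelector via set_reader
    "boxer_config": {},  # forwarded to MultiPackBoxer
    "da_processor_config": {},  # forwarded to ReplacementDataAugmentProcessor
}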
def test_all_pack_selector(self) -> None:
    selector = AllPackSelector()
    selector.initialize()
    packs = selector.select(self.multi_pack)
    doc_ids = ["1", "2", "Three"]
    for doc_id, pack in zip(doc_ids, packs):
        self.assertEqual(doc_id, pack.pack_name)
def setUp(self):
    random.seed(0)
    self.nlp = Pipeline[MultiPack]()

    boxer_config = {'pack_name': 'input_src'}

    self.nlp.set_reader(reader=StringReader())
    self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
    self.nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    self.nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
def test_caster_all_selector(self):
    """
    Test whether the caster and the all pack selector work well together.
    The caster converts a single pack into a multi pack, and the pack
    copier then creates a new pack inside it. The all pack selector
    selects every pack from the multi pack. This test makes sure the
    whole pipeline works as expected.
    """
    mp: MultiPack
    for mp in (
        Pipeline()
        .set_reader(SentenceReader())
        .add(MultiPackBoxer())
        .add(MultiPackCopier())
        .add(DummyPackProcessor(), selector=AllPackSelector())
        .initialize()
        .process_dataset(
            os.path.join(data_samples_root, "random_texts", "0.txt")
        )
    ):
        num_pack = 0
        for pack in mp.packs:
            num_pack += 1
            entries = list(pack.get(NewType))
            self.assertEqual(len(entries), 1)
            self.assertEqual(entries[0].value, "[PACK]")
        self.assertEqual(num_pack, 2)
def test_pipeline(self, texts, expected_outputs, expected_tokens):
    nlp = Pipeline[MultiPack]()

    boxer_config = {"pack_name": "input"}
    replacer_op = TmpReplacer.__module__ + "." + TmpReplacer.__qualname__
    processor_config = {
        "augment_entry": "ft.onto.base_ontology.Token",
        "other_entry_policy": {
            "ft.onto.base_ontology.Document": "auto_align",
            "ft.onto.base_ontology.Sentence": "auto_align",
        },
        "type": "data_augmentation_op",
        "data_aug_op": replacer_op,
        "data_aug_op_config": {},
        "augment_pack_names": {},
    }

    nlp.set_reader(reader=StringReader())
    nlp.add(component=MultiPackBoxer(), config=boxer_config)
    nlp.add(component=WhiteSpaceTokenizer(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(), config=processor_config
    )
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(texts)):
        aug_pack = m_pack.get_pack("augmented_input")
        self.assertEqual(aug_pack.text, expected_outputs[idx])
        for j, token in enumerate(aug_pack.get(Token)):
            self.assertEqual(token.text, expected_tokens[idx][j])
def test_pipeline(self, texts, expected_outputs, expected_tokens):
    nlp = Pipeline[MultiPack]()

    boxer_config = {'pack_name': 'input'}
    processor_config = {
        'augment_entry': "ft.onto.base_ontology.Token",
        'other_entry_policy': {
            'type': '',
            'kwargs': {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align",
            },
        },
        'type': 'data_augmentation_op',
        'data_aug_op':
            'tests.forte.processors.base.'
            'data_augment_replacement_processor_test.TmpReplacer',
        'data_aug_op_config': {
            'type': '',
            'kwargs': {},
        },
        'augment_pack_names': {
            'kwargs': {'input': 'augmented_input'},
        },
    }

    nlp.set_reader(reader=StringReader())
    nlp.add(component=MultiPackBoxer(), config=boxer_config)
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(), config=processor_config
    )
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(texts)):
        aug_pack = m_pack.get_pack('augmented_input')
        self.assertEqual(aug_pack.text, expected_outputs[idx])
        for j, token in enumerate(aug_pack.get(Token)):
            self.assertEqual(token.text, expected_tokens[idx][j])
def setUp(self):
    random.seed(0)
    self.nlp = Pipeline[MultiPack]()

    boxer_config = {"pack_name": "input_src"}

    self.nlp.set_reader(reader=StringReader())
    self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
    self.nlp.add(
        component=WhiteSpaceTokenizer(), selector=AllPackSelector()
    )
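# A minimal sketch of how the pipeline assembled in setUp above is typically
# driven in a test body. The method name, input text, and assertion are
# hypothetical placeholders; the pack name "input_src" matches boxer_config
# above, and Token is the ft.onto.base_ontology.Token type used elsewhere in
# these tests.
def test_pipeline_sketch(self):
    self.nlp.initialize()
    for m_pack in self.nlp.process_dataset(
        ["Mary and Samantha arrived at the bus station early."]
    ):
        src_pack = m_pack.get_pack("input_src")
        tokens = [token.text for token in src_pack.get(Token)]
        # The tokenizer added with AllPackSelector should have produced tokens.
        self.assertTrue(len(tokens) > 0)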
def setUp(self):
    random.seed(8)
    self.nlp = Pipeline[MultiPack]()

    boxer_config = {"pack_name": "input_src"}
    entity_config = {"entities_to_insert": ["Mary", "station"]}

    self.nlp.set_reader(reader=StringReader())
    self.nlp.add(component=EntityMentionInserter(), config=entity_config)
    self.nlp.add(PeriodSentenceSplitter())
    self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
    self.nlp.add(
        component=WhiteSpaceTokenizer(), selector=AllPackSelector()
    )
def test_pipeline(self, texts, expected_outputs):
    nlp = Pipeline[MultiPack]()

    boxer_config = {"pack_name": "input"}

    nlp.set_reader(reader=StringReader())
    nlp.add(component=MultiPackBoxer(), config=boxer_config)
    nlp.add(component=WhiteSpaceTokenizer(), selector=AllPackSelector())

    processor_config = {
        "augment_entry": "ft.onto.base_ontology.Token",
        "other_entry_policy": {
            "ft.onto.base_ontology.Document": "auto_align",
            "ft.onto.base_ontology.Sentence": "auto_align",
        },
        "type": "data_augmentation_op",
        "data_aug_op": "forte.processors.data_augment.algorithms"
        ".embedding_similarity_replacement_op."
        "EmbeddingSimilarityReplacementOp",
        "data_aug_op_config": {
            "vocab_path": self.abs_vocab_path,
            "embed_hparams": self.embed_hparams,
            "top_k": 1,
        },
        "augment_pack_names": {"input": "augmented_input"},
    }
    nlp.add(
        component=ReplacementDataAugmentProcessor(), config=processor_config
    )
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(texts)):
        aug_pack = m_pack.get_pack("augmented_input")
        self.assertEqual(aug_pack.text, expected_outputs[idx])
def test_replace_token(
    self, texts, expected_outputs, expected_tokens, expected_links
):
    for idx, text in enumerate(texts):
        file_path = os.path.join(self.test_dir, f"{idx + 1}.txt")
        with open(file_path, 'w') as f:
            f.write(text)

    nlp = Pipeline[MultiPack]()
    reader_config = {
        "input_pack_name": "input_src",
        "output_pack_name": "output_tgt",
    }
    nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.initialize()

    processor_config = {
        'augment_entry': "ft.onto.base_ontology.Token",
        'other_entry_policy': {
            'kwargs': {
                "ft.onto.base_ontology.Sentence": "auto_align"
            }
        },
        'type': 'data_augmentation_op',
        'data_aug_op':
            'tests.forte.processors.base.'
            'data_augment_replacement_processor_test.TmpReplacer',
        'data_aug_op_config': {'kwargs': {}},
        'augment_pack_names': {'kwargs': {}},
    }
    processor = ReplacementDataAugmentProcessor()
    processor.initialize(resources=None, configs=processor_config)

    for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
        src_pack = m_pack.get_pack('input_src')
        tgt_pack = m_pack.get_pack('output_tgt')

        num_mpl_orig, num_mpg_orig = 0, 0
        # Copy the source pack to the target pack.
        tgt_pack.set_text(src_pack.text)

        src_pack.add_entry(Document(src_pack, 0, len(src_pack.text)))
        for anno in src_pack.get(Annotation):
            new_anno = type(anno)(tgt_pack, anno.begin, anno.end)
            tgt_pack.add_entry(new_anno)

            # Create MultiPackLink.
            m_pack.add_entry(MultiPackLink(m_pack, anno, new_anno))

            # Create MultiPackGroup.
            m_pack.add_entry(MultiPackGroup(m_pack, [anno, new_anno]))

            # Count the number of MultiPackLink/MultiPackGroup.
            num_mpl_orig += 1
            num_mpg_orig += 1

        # Create Links in the source pack.
        # The Links should be a tree:
        #
        #                          Link 3
        #                   _________|_________
        #                  |                   |
        #                Link 2                |
        #            _______|________          |
        #           |                |         |
        #         Link 1             |         |
        #      ______|_____          |         |
        #     |            |         |         |
        #  token 1      token 2   token 3   token 4 ... ...
        prev_entry = None
        for i, token in enumerate(src_pack.get(Token)):
            # Avoid overlapping with deleted tokens.
            if i < 10:
                continue
            if prev_entry:
                link = Link(src_pack, prev_entry, token)
                src_pack.add_entry(link)
                prev_entry = link
            else:
                prev_entry = token

        # Create Groups in the target pack.
        # The Groups should be a tree like the Links.
        prev_entry = None
        for i, token in enumerate(tgt_pack.get(Token)):
            # Avoid overlapping with deleted tokens.
            if i < 10:
                continue
            if prev_entry:
                group = Group(tgt_pack, [prev_entry, token])
                tgt_pack.add_entry(group)
                prev_entry = group
            else:
                prev_entry = token

        doc_src = list(src_pack.get(Document))[0]
        doc_tgt = list(tgt_pack.get(Document))[0]
        sent_src = list(src_pack.get(Sentence))[0]
        sent_tgt = list(tgt_pack.get(Sentence))[0]

        # Insert two extra Links in the src_pack. They should not be copied
        # to new_src_pack, because the Document is not copied.
        link_src_low = src_pack.add_entry(Link(src_pack, doc_src, sent_src))
        src_pack.add_entry(Link(src_pack, link_src_low, sent_src))

        # Insert two extra Groups in the tgt_pack. They should not be copied
        # to new_tgt_pack, because the Document is not copied.
        group_tgt_low = tgt_pack.add_entry(
            Group(tgt_pack, [doc_tgt, sent_tgt])
        )
        tgt_pack.add_entry(Group(tgt_pack, [group_tgt_low, sent_tgt]))

        # Call the augment function explicitly for duplicate replacement
        # to test the False case of the _replace function.
        processor._augment(m_pack, ["input_src", "output_tgt"])

        # Test insertion and deletion.
        for pack in (src_pack, tgt_pack):
            # Insert an "NLP" at the beginning.
            processor._insert(" NLP ", pack, 0)
            processor._insert(" NLP ", pack, 18)
            processor._insert(" NLP ", pack, len(pack.text) - 2)
            processor._insert("NLP", pack, len(pack.text) - 1)
            # Delete the second token "and".
            processor._delete(list(pack.get(Token))[1])

            # This duplicate insertion should be invalid.
            processor._insert(" NLP ", pack, 0)
            # This insertion overlaps with a replacement.
            # It should be invalid.
            processor._insert(" NLP ", pack, 2)

        processor._process(m_pack)

        new_src_pack = m_pack.get_pack('augmented_input_src')
        new_tgt_pack = m_pack.get_pack('augmented_output_tgt')

        self.assertEqual(new_src_pack.text, expected_outputs[idx] + "\n")

        for j, token in enumerate(new_src_pack.get(Token)):
            self.assertEqual(token.text, expected_tokens[idx][j])

        for sent in new_src_pack.get(Sentence):
            self.assertEqual(sent.text, expected_outputs[idx])

        # Test the copied Links.
        prev_link = None
        for i, link in enumerate(new_src_pack.get(Link)):
            if prev_link:
                self.assertEqual(link.get_parent().tid, prev_link.tid)
                self.assertEqual(
                    link.get_child().text, expected_links[idx][i]
                )
            prev_link = link

        # Test the copied Groups.
        prev_group = None
        for i, group in enumerate(new_tgt_pack.get(Group)):
            members = group.get_members()
            if isinstance(members[0], Token):
                member_token = members[0]
                member_group = members[1]
            else:
                member_token = members[1]
                member_group = members[0]
            if prev_group:
                self.assertIsInstance(member_token, Token)
                self.assertIsInstance(member_group, Group)
                self.assertEqual(member_group.tid, prev_group.tid)
                self.assertEqual(member_token.text, expected_links[idx][i])
            prev_group = group

        # The two extra Links should not be copied, because the Document
        # itself is not copied.
        self.assertEqual(
            len(list(src_pack.get(Link))) - 2,
            len(list(new_src_pack.get(Link))),
        )
        # The two extra Groups should not be copied, because the Document
        # itself is not copied.
        self.assertEqual(
            len(list(tgt_pack.get(Group))) - 2,
            len(list(new_tgt_pack.get(Group))),
        )

        # Test the MultiPackLink/MultiPackGroup entries.
        num_mpl_aug, num_mpg_aug = 0, 0
        for mpl in m_pack.get(MultiPackLink):
            parent = mpl.get_parent()
            child = mpl.get_child()
            num_mpl_aug += 1
            self.assertEqual(parent.text, child.text)
            self.assertNotEqual(
                parent.pack.meta.pack_id, child.pack.meta.pack_id
            )
        for mpg in m_pack.get(MultiPackGroup):
            members = mpg.get_members()
            num_mpg_aug += 1
            self.assertEqual(members[0].text, members[1].text)
            self.assertNotEqual(
                members[0].pack.meta.pack_id, members[1].pack.meta.pack_id
            )

        # Test the number of MultiPackLink/MultiPackGroup entries.
        # Subtract 1 from both the aug and orig counters because the
        # Document is not copied, so the MPL and MPG between Documents are
        # ignored. The remaining counts should be doubled, minus one for
        # the deletion.
        self.assertEqual(num_mpl_aug - 1, (num_mpl_orig - 1) * 2 - 1)
        self.assertEqual(num_mpg_aug - 1, (num_mpg_orig - 1) * 2 - 1)