class TestNLTKWordTokenizer(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())
        self.nltk.add_processor(NLTKWordTokenizer())

    def test_tokenizer(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        tokens = [
            ["This", "tool", "is", "called", "Forte", "."],
            ["The", "goal", "of", "this", "project", "to", "help", "you",
             "build", "NLP", "pipelines", "."],
            ["NLP", "has", "never", "been", "made", "this", "easy",
             "before", "."]
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            for j, token in enumerate(
                    pack.get(entry_type=Token, range_annotation=sentence)):
                self.assertEqual(token.text, tokens[i][j])
class TestStanfordNLPProcessor(unittest.TestCase):
    def setUp(self):
        self.stanford_nlp = Pipeline()
        self.stanford_nlp.set_reader(StringReader())
        models_path = os.getcwd()
        config = HParams(
            {
                "processors": "tokenize",
                # Language code for the language to build the Pipeline.
                "lang": "en",
                "use_gpu": False
            },
            StandfordNLPProcessor.default_hparams())
        self.stanford_nlp.add_processor(StandfordNLPProcessor(models_path),
                                        config=config)
        self.stanford_nlp.initialize()

    # TODO
    @unittest.skip("We need to test this without needing to download models "
                   "every time")
    def test_stanford_processor(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = self.stanford_nlp.process(document)
        print(pack)
class PipelineTest(unittest.TestCase):
    def setUp(self) -> None:
        # Define and configure the Pipeline.
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = HParams({"batcher": {"batch_size": 5}},
                         dummy.default_hparams())
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()
        self.dataset_path = \
            "forte/tests/data_samples/ontonotes_sample_dataset/00"

    def test_process_next(self):
        # Get processed packs from the dataset.
        for pack in self.nlp.process_dataset(self.dataset_path):
            # Get each sentence from the pack.
            for sentence in pack.get_entries(Sentence):
                sent_text = sentence.text
                # First method to get entries in a sentence.
                for link in pack.get_entries(RelationLink, sentence):
                    parent = link.get_parent()
                    child = link.get_child()
                    # Some operation on the link could go here.
                    print(f"{parent.text} is {link.rel_type} {child.text}")
                # Second method to get entries in a sentence.
                tokens = [
                    token.text
                    for token in pack.get_entries(Token, sentence)
                ]
                self.assertEqual(sent_text, " ".join(tokens))
def test_pipeline(self, texts):
    for idx, text in enumerate(texts):
        file_path = os.path.join(self.test_dir, f"{idx + 1}.txt")
        with open(file_path, 'w') as f:
            f.write(text)

    nlp = Pipeline()
    reader_config = HParams(
        {
            "input_pack_name": "input",
            "output_pack_name": "output"
        },
        MultiPackSentenceReader.default_hparams())
    nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
    translator_config = HParams(
        {
            "src_language": "de",
            "target_language": "en",
            "in_pack_name": "input",
            "out_pack_name": "result"
        },
        None)
    nlp.add_processor(MicrosoftBingTranslator(), config=translator_config)
    nlp.initialize()

    english_results = ["Hey good morning", "This is Forte. A tool for NLP"]
    for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
        self.assertEqual(set(m_pack._pack_names),
                         set(["input", "output", "result"]))
        self.assertEqual(m_pack.get_pack("result").text,
                         english_results[idx] + "\n")
def test_pipeline(self, texts):
    for idx, text in enumerate(texts):
        file_path = os.path.join(self.test_dir, f"{idx + 1}.txt")
        with open(file_path, 'w') as f:
            f.write(text)

    nlp = Pipeline()
    reader_config = {
        "input_pack_name": "query",
        "output_pack_name": "output"
    }
    nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
    config = {
        "model": {"name": "bert-base-uncased"},
        "tokenizer": {"name": "bert-base-uncased"},
        "max_seq_length": 128,
        "query_pack_name": "query"
    }
    nlp.add_processor(BertBasedQueryCreator(), config=config)
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
        query_pack = m_pack.get_pack("query")
        self.assertEqual(len(query_pack.generics), 1)
        self.assertIsInstance(query_pack.generics[0], Query)
        query = query_pack.generics[0].value
        self.assertEqual(query.shape, (1, 768))
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    models_path = os.getcwd()
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            # Language code for the language to build the Pipeline.
            'lang': lang,
            'use_gpu': False
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)
    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'),
                  "has relation",
                  colored(link.rel_type, 'green'),
                  "of parent",
                  colored(parent.text, 'cyan'))
        print("\n----------------------\n")
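# Hypothetical driver for the example above (not part of the original
# snippet); the language code and sample text are illustrative assumptions.
if __name__ == '__main__':
    stanford_nlp_example('en', "This tool is called Forte. "
                               "NLP has never been made this easy before.")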
class CoNLL03ReaderPipelineTest(unittest.TestCase):
    def setUp(self):
        # Define and configure the Pipeline.
        self.dataset_path = "data_samples/conll03"
        self.nlp = Pipeline()
        self.nlp.set_reader(CoNLL03Reader())
        self.nlp.add_processor(DummyPackProcessor())
        self.nlp.add_processor(DummyPackProcessor())
        self.nlp.initialize()

    def test_process_next(self):
        doc_exists = False
        # Get processed packs from the dataset.
        for pack in self.nlp.process_dataset(self.dataset_path):
            # Get each sentence from the pack.
            for sentence in pack.get_entries(Sentence):
                doc_exists = True
                sent_text = sentence.text
                # Check that the sentence's tokens reconstruct its text.
                tokens = [
                    token.text
                    for token in pack.get_entries(Token, sentence)
                ]
                self.assertEqual(sent_text, " ".join(tokens))
        self.assertTrue(doc_exists)
def prepare(self, *args, **kwargs):  # pylint: disable=unused-argument
    prepare_pl = Pipeline()
    prepare_pl.set_reader(self.train_reader)
    for p in self.preprocessors:
        prepare_pl.add_processor(p)
    prepare_pl.run(self.configs.config_data.train_path)

    for p in self.preprocessors:
        p.finish(resource=self.resource)
def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    ner_predictor = CoNLLNERPredictor()
    pl.add_processor(ner_predictor, ner_configs)

    srl_configs = HParams(
        {'storage_path': srl_model_dir},
        SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)
    pl.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")
    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # First method to get entries in a sentence.
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # Second method to get entries in a sentence.
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f" - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print(" Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_attribute_masker(self):
    pl = Pipeline()
    pl.set_reader(CoNLL03Reader())
    config = {"kwargs": {Token: ["ner"]}}
    pl.add_processor(processor=AttributeMasker(), config=config)
    pl.initialize()

    for pack in pl.process_dataset("data_samples/conll03/"):
        entries = pack.get_entries_by_type(Token)
        for entry in entries:
            self.assertIsNone(entry.ner)
def test_pipeline7(self, batch_size1, batch_size2, batch_size3):
    """Tests a chain of Batch->Batch->Batch->Pack with different batch
    sizes."""
    nlp = Pipeline()
    reader = MultiPackSentenceReader()
    nlp.set_reader(reader)
    dummy1 = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": batch_size1}}
    nlp.add_processor(processor=dummy1, config=config,
                      selector=FirstPackSelector())
    dummy2 = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": batch_size2}}
    nlp.add_processor(processor=dummy2, config=config,
                      selector=FirstPackSelector())
    dummy3 = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": batch_size3}}
    nlp.add_processor(processor=dummy3, config=config,
                      selector=FirstPackSelector())
    dummy4 = DummyPackProcessor()
    nlp.add_processor(processor=dummy4, selector=FirstPackSelector())
    nlp.initialize()
    data_path = "data_samples/random_texts/0.txt"

    num_packs = 0
    for pack in nlp.process_dataset(data_path):
        types = list(pack.get_pack("pack").get_entries_by_type(NewType))
        num_packs += 1
        self.assertEqual(len(types), 1)
        self.assertEqual(types[0].value, "[BATCH][BATCH][BATCH][PACK]")

    # Check that all packs are yielded.
    self.assertEqual(num_packs, reader.count)
def test_encoder_sentence(self):
    pipeline = Pipeline()
    pipeline.set_reader(StringReader())
    pipeline.add_processor(NLTKSentenceSegmenter())
    pipeline.add_processor(PretrainedEncoder())
    pipeline.initialize()

    sentences = [
        "This tool is called Forte.",
        "The goal of this project to help you build NLP pipelines.",
        "NLP has never been made this easy before."
    ]
    document = ' '.join(sentences)
    pack = pipeline.process(document)
    for sentence in pack.get(Sentence):
        self.assertEqual(sentence.embedding.shape, (1, 512, 768))
def test_encoder_document(self):
    pipeline = Pipeline()
    pipeline.set_reader(StringReader())
    pipeline.add_processor(
        PretrainedEncoder(),
        config={'entry_type': 'ft.onto.base_ontology.Document'})
    pipeline.initialize()

    sentences = [
        "This tool is called Forte.",
        "The goal of this project to help you build NLP pipelines.",
        "NLP has never been made this easy before."
    ]
    document = ' '.join(sentences)
    pack = pipeline.process(document)
    for doc in pack.get(Document):
        self.assertEqual(doc.embedding.shape, (1, 512, 768))
class TestNLTKSentenceSegmenter(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())

    def test_segmenter(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for idx, sentence in enumerate(pack.get(Sentence)):
            self.assertEqual(sentence.text, sentences[idx])
def test_one_batch_processor(self, batch_size):
    nlp = Pipeline()
    nlp.set_reader(StringReader())
    dummy = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": batch_size}}
    nlp.add_processor(NLTKSentenceSegmenter())
    nlp.add_processor(dummy, config=config)
    nlp.initialize()
    sentences = ["This tool is called Forte. The goal of this project to "
                 "help you build NLP pipelines. NLP has never been made "
                 "this easy before."]
    pack = nlp.process(sentences)
    sent_len = len(list(pack.get(Sentence)))
    self.assertEqual(
        dummy.counter,
        (sent_len // batch_size + (sent_len % batch_size > 0)))
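# Worked example of the batch-count assertion above (assuming the segmenter
# splits the input into 3 sentences): with batch_size=2, the batch processor
# should run ceil(3 / 2) = 3 // 2 + (3 % 2 > 0) = 2 batches, so dummy.counter
# is expected to equal 2.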
class DummyProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = {"batcher": {"batch_size": 5}}
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()
        self.data_path = "data_samples/ontonotes/00/"

    def test_processor(self):
        pack = self.nlp.process(self.data_path)
        relations = list(pack.get_entries(RelationLink))
        self.assertTrue(len(relations) > 0)
        for relation in relations:
            self.assertEqual(relation.get_field("rel_type"),
                             "dummy_relation")
class TestLowerCaserProcessor(unittest.TestCase):
    def setUp(self):
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.add_processor(LowerCaserProcessor())
        self.nlp.initialize()

    def test_lowercaser_processor(self):
        document = ("This tool is called Forte. The goal of this project to "
                    "help you build NLP pipelines. NLP has never been made "
                    "this easy before.")
        pack = self.nlp.process(document)
        print(pack)
        print(pack.text)
        self.assertEqual(
            pack.text,
            "this tool is called forte. the goal of this project to help "
            "you build nlp pipelines. nlp has never been made this easy "
            "before.")
def main(dataset_dir: str, ner_model_path: str, srl_model_path: str):
    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_path, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    pl.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams(
        {'storage_path': srl_model_path},
        SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # First method to get entries in a sentence.
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # Second method to get entries in a sentence.
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print(" Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
def _create_pipeline(config):
    nlp = Pipeline()
    nlp.set_reader(StringReader())

    # Using SpacyProcessor to segment the sentences.
    nlp.add_processor(
        processor=SpacyProcessor(),
        config={
            'processors': '',
            # Language code to build the Pipeline.
            'lang': "en_core_web_sm",
            'use_gpu': False
        })
    nlp.add_processor(processor=AllenNLPProcessor(), config=config)
    nlp.initialize()
    return nlp
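# A minimal, hedged usage sketch for the helper above. `allen_config` is a
# hypothetical placeholder; the actual AllenNLPProcessor option keys are not
# shown in this section and would need to be filled in from its defaults.
def _demo_allennlp_pipeline(allen_config):
    nlp = _create_pipeline(allen_config)
    pack = nlp.process("This tool is called Forte.")
    for sentence in pack.get(Sentence):
        print(sentence.text)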
def main():
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())
    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)
    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second).")
    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # First method to get entries in a sentence.
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # Second method to get entries in a sentence.
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f" - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print(" Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_pipeline1(self):
    """Tests a pack processor only."""
    nlp = Pipeline()
    reader = MultiPackSentenceReader()
    nlp.set_reader(reader)
    dummy = DummyPackProcessor()
    nlp.add_processor(dummy, selector=FirstPackSelector())
    nlp.initialize()
    data_path = "data_samples/random_texts/0.txt"

    num_packs = 0
    for pack in nlp.process_dataset(data_path):
        types = list(pack.get_pack("pack").get_entries_by_type(NewType))
        num_packs += 1
        self.assertEqual(len(types), 1)
        self.assertEqual(types[0].value, "[PACK]")

    # Check that all packs are yielded.
    self.assertEqual(num_packs, reader.count)
class DummyProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.reader = OntonotesReader()
        self.data_path = "examples/data_samples/ontonotes/00/"
        self.nlp.set_reader(self.reader)
        self.nlp.add_processor(DummyRelationExtractor())
        self.nlp.initialize()

    def test_processor(self):
        pack = self.nlp.process(self.data_path)
        relations = list(pack.get_entries(RelationLink))
        self.assertTrue(len(relations) > 0)
        for relation in relations:
            self.assertEqual(relation.get_field("rel_type"),
                             "dummy_relation")
def test_pipeline2(self):
    """Tests a batch processor only."""
    nlp = Pipeline()
    reader = SentenceReader()
    nlp.set_reader(reader)
    dummy = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": 4}}
    nlp.add_processor(processor=dummy, config=config)
    nlp.initialize()
    data_path = "data_samples/random_texts/0.txt"

    num_packs = 0
    for pack in nlp.process_dataset(data_path):
        types = list(pack.get_entries_by_type(NewType))
        num_packs += 1
        self.assertEqual(len(types), 1)
        self.assertEqual(types[0].value, "[BATCH]")

    # Check that all packs are yielded.
    self.assertEqual(num_packs, reader.count)
def main(dataset_dir: str):
    with open("config.yml", "r") as f:
        config = yaml.safe_load(f)
    config = HParams(config, default_hparams=None)

    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())
    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # First method to get entries in a sentence.
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # Second method to get entries in a sentence.
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f" - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print(" Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
def test_neg_spacy_processor(self):
    spacy = Pipeline()
    spacy.set_reader(StringReader())
    config = {
        "processors": 'ner',
        # spaCy model name used to build the Pipeline.
        "lang": "xx_ent_wiki_sm",
        "use_gpu": False
    }
    spacy.add_processor(SpacyProcessor(), config=config)
    spacy.initialize()

    sentences = [
        "This tool is called Forte.",
        "The goal of this project to help you build NLP pipelines.",
        "NLP has never been made this easy before."
    ]
    document = ' '.join(sentences)
    with self.assertRaises(ValueError):
        _ = spacy.process(document)
def test_encoder_phrase(self):
    pipeline = Pipeline()
    pipeline.set_reader(StringReader())
    pipeline.add_processor(NLTKSentenceSegmenter())
    pipeline.add_processor(NLTKWordTokenizer())
    pipeline.add_processor(NLTKPOSTagger())
    config = {'pattern': 'NP: {<DT>?<JJ>*<NN>}'}
    pipeline.add_processor(NLTKChunker(), config=config)
    pipeline.add_processor(
        PretrainedEncoder(),
        config={'entry_type': 'ft.onto.base_ontology.Phrase'})
    pipeline.initialize()

    sentences = [
        "This tool is called Forte.",
        "The goal of this project to help you build NLP pipelines.",
        "NLP has never been made this easy before."
    ]
    document = ' '.join(sentences)
    pack = pipeline.process(document)
    for phrase in pack.get(Phrase):
        self.assertEqual(phrase.embedding.shape, (1, 512, 768))
class TestNLTKPOSTagger(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())
        self.nltk.add_processor(NLTKWordTokenizer())
        self.nltk.add_processor(NLTKPOSTagger())

    def test_pos_tagger(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        pos = [
            ["DT", "NN", "VBZ", "VBN", "NNP", "."],
            ["DT", "NN", "IN", "DT", "NN", "TO", "VB", "PRP", "VB", "NNP",
             "NNS", "."],
            ["NNP", "VBZ", "RB", "VBN", "VBN", "DT", "JJ", "RB", "."]
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            for j, token in enumerate(
                    pack.get(entry_type=Token, range_annotation=sentence)):
                self.assertEqual(token.pos, pos[i][j])
def test_pipeline3(self, batch_size):
    """Tests a chain of Batch->Pack->Batch with different batch sizes."""
    nlp = Pipeline()
    reader = SentenceReader()
    nlp.set_reader(reader)
    dummy1 = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": batch_size}}
    nlp.add_processor(processor=dummy1, config=config)
    dummy2 = DummyPackProcessor()
    nlp.add_processor(processor=dummy2)
    dummy3 = DummmyFixedSizeBatchProcessor()
    config = {"batcher": {"batch_size": 2 * batch_size}}
    nlp.add_processor(processor=dummy3, config=config)
    nlp.initialize()
    data_path = "data_samples/random_texts/0.txt"

    num_packs = 0
    for pack in nlp.process_dataset(data_path):
        types = list(pack.get_entries_by_type(NewType))
        num_packs += 1
        self.assertEqual(len(types), 1)
        self.assertEqual(types[0].value, "[BATCH][PACK][BATCH]")

    # Check that all packs are yielded.
    self.assertEqual(num_packs, reader.count)
def test_process_next(self):
    # Define and configure the Pipeline.
    nlp = Pipeline()
    nlp.set_reader(OntonotesReader())
    dummy = DummyRelationExtractor()
    config = {"batcher": {"batch_size": 5}}
    nlp.add_processor(dummy, config=config)
    nlp.initialize()
    dataset_path = "data_samples/ontonotes/00"

    # Get processed packs from the dataset.
    for pack in nlp.process_dataset(dataset_path):
        # Get each sentence from the pack.
        for sentence in pack.get_entries(Sentence):
            sent_text = sentence.text
            # Check that the sentence's tokens reconstruct its text.
            tokens = [
                token.text for token in pack.get_entries(Token, sentence)
            ]
            self.assertEqual(sent_text, " ".join(tokens))
class DummyFixedSizeBatchProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.dummy = DummmyFixedSizeBatchProcessor()

    @data(1, 2, 3)
    def test_processor(self, batch_size):
        config = HParams({"batcher": {"batch_size": batch_size}},
                         self.dummy.default_hparams())
        self.nlp.add_processor(NLTKSentenceSegmenter())
        self.nlp.add_processor(self.dummy, config=config)
        self.nlp.initialize()
        sentences = [
            "This tool is called Forte. The goal of this project to "
            "help you build NLP pipelines. NLP has never been made "
            "this easy before."
        ]
        pack = self.nlp.process(sentences)
        sent_len = len(list(pack.get(Sentence)))
        self.assertEqual(
            self.dummy.counter,
            (sent_len // batch_size + (sent_len % batch_size > 0)))
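# Condensed sketch of the pattern every snippet above follows: construct a
# Pipeline, set a reader, add processors in order, initialize, then process
# input. The import paths below are assumptions based on a typical Forte
# layout (no imports appear in this section) and may need adjusting.
from forte.pipeline import Pipeline
from forte.data.readers import StringReader
from forte.processors.nltk_processors import NLTKSentenceSegmenter
from ft.onto.base_ontology import Sentence


def minimal_pipeline_sketch(text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())              # choose a reader for the input
    pl.add_processor(NLTKSentenceSegmenter())  # processors run in added order
    pl.initialize()                            # required before processing
    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        print(sentence.text)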