    def test_reader_replace_error_test(self, value):
        # Read with errors in span replacements
        span_ops, output = value
        pipeline = Pipeline()
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()
        with self.assertRaises(ValueError):
            pipeline.process(self.test_dir)
class TestNLTKWordTokenizer(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())
        self.nltk.add_processor(NLTKWordTokenizer())

    def test_tokenizer(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.",
            "NLP has never been made this easy before."
        ]
        tokens = [
            ["This", "tool", "is", "called", "Forte", "."],
            ["The", "goal", "of", "this", "project", "to", "help", "you",
             "build", "NLP", "pipelines", "."],
            ["NLP", "has", "never", "been", "made", "this", "easy",
             "before", "."]
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            for j, token in enumerate(
                    pack.get(entry_type=Token, range_annotation=sentence)):
                self.assertEqual(token.text, tokens[i][j])
class TestStanfordNLPProcessor(unittest.TestCase):
    def setUp(self):
        self.stanford_nlp = Pipeline()
        self.stanford_nlp.set_reader(StringReader())
        models_path = os.getcwd()
        config = HParams(
            {
                "processors": "tokenize",
                "lang": "en",
                # Language code for the language to build the Pipeline
                "use_gpu": False
            },
            StandfordNLPProcessor.default_hparams())
        self.stanford_nlp.add_processor(StandfordNLPProcessor(models_path),
                                        config=config)
        self.stanford_nlp.initialize()

    # TODO
    @unittest.skip("We need to test this without needing to download models "
                   "everytime")
    def test_stanford_processor(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.",
            "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = self.stanford_nlp.process(document)
        print(pack)
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            'lang': lang,
            # Language code for the language to build the Pipeline
            'use_gpu': False
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)
    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'),
                  "has relation",
                  colored(link.rel_type, 'green'),
                  "of parent",
                  colored(parent.text, 'cyan'))

        print("\n----------------------\n")
class TestNLTKPOSTagger(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())
        self.nltk.add_processor(NLTKWordTokenizer())
        self.nltk.add_processor(NLTKPOSTagger())

    def test_pos_tagger(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.",
            "NLP has never been made this easy before."
        ]
        pos = [
            ["DT", "NN", "VBZ", "VBN", "NNP", "."],
            ["DT", "NN", "IN", "DT", "NN", "TO", "VB", "PRP", "VB", "NNP",
             "NNS", "."],
            ["NNP", "VBZ", "RB", "VBN", "VBN", "DT", "JJ", "RB", "."]
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            for j, token in enumerate(
                    pack.get(entry_type=Token, range_annotation=sentence)):
                self.assertEqual(token.pos, pos[i][j])
    def test_encoder_sentence(self):
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())
        pipeline.add_processor(NLTKSentenceSegmenter())
        pipeline.add_processor(PretrainedEncoder())
        pipeline.initialize()

        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        pack = pipeline.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            self.assertEqual(sentence.embedding.shape, (1, 512, 768))
    def test_encoder_document(self):
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())
        pipeline.add_processor(
            PretrainedEncoder(),
            config={'entry_type': 'ft.onto.base_ontology.Document'})
        pipeline.initialize()

        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        pack = pipeline.process(document)
        for i, doc in enumerate(pack.get(Document)):
            self.assertEqual(doc.embedding.shape, (1, 512, 768))
    def test_one_batch_processor(self, batch_size):
        nlp = Pipeline()
        nlp.set_reader(StringReader())
        dummy = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size}}
        nlp.add_processor(NLTKSentenceSegmenter())
        nlp.add_processor(dummy, config=config)
        nlp.initialize()
        sentences = ["This tool is called Forte. The goal of this project to "
                     "help you build NLP pipelines. NLP has never been made "
                     "this easy before."]
        pack = nlp.process(sentences)
        sent_len = len(list(pack.get(Sentence)))
        self.assertEqual(
            dummy.counter,
            (sent_len // batch_size + (sent_len % batch_size > 0)))
class TestNLTKSentenceSegmenter(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())

    def test_segmenter(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.",
            "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for idx, sentence in enumerate(pack.get(Sentence)):
            self.assertEqual(sentence.text, sentences[idx])
class TestLowerCaserProcessor(unittest.TestCase):
    def setUp(self):
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.add_processor(LowerCaserProcessor())
        self.nlp.initialize()

    def test_lowercaser_processor(self):
        document = "This tool is called Forte. The goal of this project to " \
                   "help you build NLP pipelines. NLP has never been made " \
                   "this easy before."
        pack = self.nlp.process(document)
        print(pack)
        print(pack.text)
        assert pack.text == "this tool is called forte. the goal of this " \
                            "project to help you build nlp pipelines. nlp " \
                            "has never been made this easy before."
class DummyProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = {"batcher": {"batch_size": 5}}
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()
        self.data_path = "data_samples/ontonotes/00/"

    def test_processor(self):
        pack = self.nlp.process(self.data_path)
        relations = list(pack.get_entries(RelationLink))
        assert (len(relations) > 0)
        for relation in relations:
            self.assertEqual(relation.get_field("rel_type"), "dummy_relation")
class DummyProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.reader = OntonotesReader()
        self.data_path = "examples/data_samples/ontonotes/00/"
        self.nlp.set_reader(OntonotesReader())
        self.nlp.add_processor(DummyRelationExtractor())
        self.nlp.initialize()

    def test_processor(self):
        pack = self.nlp.process(self.data_path)
        relations = list(pack.get_entries(RelationLink))
        assert (len(relations) > 0)
        for relation in relations:
            assert (relation.get_field("rel_type") == "dummy_relation")
class TestQuestionAnsweringMulti(unittest.TestCase):
    def setUp(self):
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.add(NLTKSentenceSegmenter())
        boxer_config = {"pack_name": "question"}
        self.nlp.add(MultiPackBoxer(), boxer_config)
        self.nlp.add(MutliDocPackAdder())
        self.nlp.add(QuestionAnsweringMulti())
        self.nlp.initialize()

    def test_huggingface_qa_multi_processor(self):
        question = "Name synonym of Acrokeratosis paraneoplastica."
        packs: MultiPack = self.nlp.process(question)
        expected_ans = {
            "doc_1": "Bazex syndrome",
            "doc_2": "Bazex syndrome",
            "doc_3": "Bazex syndrome",
        }
        for doc_id in packs.pack_names:
            if doc_id == "question":
                continue
            pack = packs.get_pack(doc_id)
            for idx, phrase in enumerate(pack.get(entry_type=Phrase)):
                self.assertEqual(phrase.text, expected_ans[doc_id])

        linked_texts = []
        for link in packs.get(entry_type=MultiPackLink):
            parent_text = link.get_parent().text
            child_text = link.get_child().text
            linked_texts.append((parent_text, child_text))

        self.assertListEqual(
            sorted(linked_texts),
            sorted([
                (question, expected_ans["doc_1"]),
                (question, expected_ans["doc_2"]),
                (question, expected_ans["doc_3"]),
            ]),
        )
    def test_encoder_phrase(self):
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())
        pipeline.add_processor(NLTKSentenceSegmenter())
        pipeline.add_processor(NLTKWordTokenizer())
        pipeline.add_processor(NLTKPOSTagger())
        config = {'pattern': 'NP: {<DT>?<JJ>*<NN>}'}
        pipeline.add_processor(NLTKChunker(), config=config)
        pipeline.add_processor(
            PretrainedEncoder(),
            config={'entry_type': 'ft.onto.base_ontology.Phrase'})
        pipeline.initialize()

        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        pack = pipeline.process(document)
        for i, phrase in enumerate(pack.get(Phrase)):
            self.assertEqual(phrase.embedding.shape, (1, 512, 768))
    def test_neg_spacy_processor(self):
        spacy = Pipeline()
        spacy.set_reader(StringReader())
        config = {
            "processors": 'ner',
            "lang": "xx_ent_wiki_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        spacy.add_processor(SpacyProcessor(), config=config)
        spacy.initialize()

        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.",
            "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)

        with self.assertRaises(ValueError):
            _ = spacy.process(document)
    def test_two_batch_processors(self, batch_size):
        nlp = Pipeline()
        nlp.set_reader(PlainTextReader())
        dummy1 = DummmyFixedSizeBatchProcessor()
        dummy2 = DummmyFixedSizeBatchProcessor()

        config = {"batcher": {"batch_size": batch_size}}
        nlp.add_processor(NLTKSentenceSegmenter())
        nlp.add_processor(dummy1, config=config)

        config = {"batcher": {"batch_size": 2 * batch_size}}
        nlp.add_processor(dummy2, config=config)

        nlp.initialize()
        data_path = "data_samples/random_texts"

        pack = nlp.process(data_path)
        sent_len = len(list(pack.get(Sentence)))

        self.assertEqual(
            dummy1.counter,
            (sent_len // batch_size + (sent_len % batch_size > 0)))
        self.assertEqual(
            dummy2.counter,
            (sent_len // (2 * batch_size) + (sent_len % (2 * batch_size) > 0)))
class DummyFixedSizeBatchProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.dummy = DummmyFixedSizeBatchProcessor()

    @data(1, 2, 3)
    def test_processor(self, batch_size):
        config = HParams({"batcher": {"batch_size": batch_size}},
                         self.dummy.default_hparams())
        self.nlp.add_processor(NLTKSentenceSegmenter())
        self.nlp.add_processor(self.dummy, config=config)
        self.nlp.initialize()
        sentences = [
            "This tool is called Forte. The goal of this project to "
            "help you build NLP pipelines. NLP has never been made "
            "this easy before."
        ]
        pack = self.nlp.process(sentences)
        sent_len = len(list(pack.get(Sentence)))
        self.assertEqual(
            self.dummy.counter,
            (sent_len // batch_size + (sent_len % batch_size > 0)))
    def test_spacy_variation_pipeline(self, value):
        spacy = Pipeline()
        spacy.set_reader(StringReader())
        config = {
            "processors": value,
            "lang": "en_core_web_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        spacy.add_processor(SpacyProcessor(), config=config)
        spacy.initialize()

        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.",
            "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = spacy.process(document)

        tokens = [x for x in pack.annotations if isinstance(x, Token)]
        if "tokenize" in value:
            exp_pos = [
                'DT', 'NN', 'VBZ', 'VBN', 'NNP', '.', 'DT', 'NN', 'IN', 'DT',
                'NN', 'TO', 'VB', 'PRP', 'VB', 'NNP', 'NNS', '.', 'NNP',
                'VBZ', 'RB', 'VBN', 'VBN', 'DT', 'JJ', 'RB', '.'
            ]
            exp_lemma = [
                'this', 'tool', 'be', 'call', 'Forte', '.', 'the', 'goal',
                'of', 'this', 'project', 'to', 'help', '-PRON-', 'build',
                'NLP', 'pipeline', '.', 'NLP', 'have', 'never', 'be', 'make',
                'this', 'easy', 'before', '.'
            ]
            tokens_text = [x.text for x in tokens]
            pos = [x.pos for x in pack.annotations if isinstance(x, Token)]
            lemma = [x.lemma for x in pack.annotations if isinstance(x, Token)]
            document_ = document.replace('.', ' .')
            self.assertEqual(tokens_text, document_.split())

            # Check token texts
            for token, text in zip(tokens, tokens_text):
                start, end = token.span.begin, token.span.end
                self.assertEqual(document[start:end], text)

            if "pos" in value:
                self.assertListEqual(pos, exp_pos)
            else:
                none_pos = [None] * len(pos)
                self.assertListEqual(pos, none_pos)

            if "lemma" in value:
                self.assertListEqual(lemma, exp_lemma)
            else:
                none_lemma = [None] * len(lemma)
                self.assertListEqual(lemma, none_lemma)
        else:
            self.assertListEqual(tokens, [])

        if "ner" in value:
            entities_text = [x.text for x in pack.annotations
                             if isinstance(x, EntityMention)]
            entities_type = [x.ner_type for x in pack.annotations
                             if isinstance(x, EntityMention)]
            self.assertEqual(entities_text, ['Forte', 'NLP', 'NLP'])
            self.assertEqual(entities_type, ['GPE', 'ORG', 'ORG'])
class TestVaderSentiment(unittest.TestCase):
    def setUp(self):
        self.pipeline = Pipeline()
        self.pipeline.set_reader(StringReader())
        self.pipeline.add_processor(NLTKSentenceSegmenter())
        self.pipeline.add_processor(VaderSentimentProcessor())
        self.pipeline.initialize()

    def test_segmenter(self):
        sentences = [
            "VADER is smart, handsome, and funny.",
            # positive sentence example
            "VADER is smart, handsome, and funny!",
            # punctuation emphasis handled correctly (sentiment intensity
            # adjusted)
            "VADER is very smart, handsome, and funny.",
            # booster words handled correctly (sentiment intensity adjusted)
            "VADER is VERY SMART, handsome, and FUNNY.",
            # emphasis for ALLCAPS handled
            "VADER is VERY SMART, handsome, and FUNNY!!!",
            # combination of signals - VADER appropriately adjusts intensity
            "VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!!",
            # booster words & punctuation make this close to ceiling for score
            "VADER is not smart, handsome, nor funny.",
            # negation sentence example
            "The book was good.",
            # positive sentence
            "At least it isn't a horrible book.",
            # negated negative sentence with contraction
            "The book was only kind of good.",
            # qualified positive sentence is handled correctly (intensity
            # adjusted)
            "The plot was good, but the characters are uncompelling and the "
            "dialog is not great.",
            # mixed negation sentence
            "Today SUX!",
            # negative slang with capitalization emphasis
            "Today only kinda sux! But I'll get by, lol",
            # mixed sentiment example with slang and contrastive conjunction
            # "but"
            "Make sure you :) or :D today!",
            # emoticons handled
            "Catch utf-8 emoji such as such as 💘 and 💋 and 😁",
            # emojis handled
            "Not bad at all"
            # Capitalized negation
        ]

        expected_scores = [
            {'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316},
            {'neg': 0.0, 'neu': 0.248, 'pos': 0.752, 'compound': 0.8439},
            {'neg': 0.0, 'neu': 0.299, 'pos': 0.701, 'compound': 0.8545},
            {'neg': 0.0, 'neu': 0.246, 'pos': 0.754, 'compound': 0.9227},
            {'neg': 0.0, 'neu': 0.233, 'pos': 0.767, 'compound': 0.9342},
            {'neg': 0.0, 'neu': 0.294, 'pos': 0.706, 'compound': 0.9469},
            {'neg': 0.646, 'neu': 0.354, 'pos': 0.0, 'compound': -0.7424},
            {'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404},
            {'neg': 0.0, 'neu': 0.637, 'pos': 0.363, 'compound': 0.431},
            {'neg': 0.0, 'neu': 0.697, 'pos': 0.303, 'compound': 0.3832},
            {'neg': 0.327, 'neu': 0.579, 'pos': 0.094, 'compound': -0.7042},
            {'neg': 0.779, 'neu': 0.221, 'pos': 0.0, 'compound': -0.5461},
            {'neg': 0.454, 'neu': 0.546, 'pos': 0.0, 'compound': -0.3609},
            {'neg': 0.0, 'neu': 0.327, 'pos': 0.673, 'compound': 0.9551},
            {'neg': 0.0, 'neu': 0.698, 'pos': 0.302, 'compound': 0.8248},
        ]

        document = ' '.join(sentences)
        pack = self.pipeline.process(document)

        sentence: Sentence
        for idx, sentence in enumerate(pack.get(Sentence)):
            self.assertEqual(sentence.sentiment, expected_scores[idx])