    def test_reader_replace_error_test(self, value):
        # Read with errors in span replacements
        span_ops, output = value

        pipeline = Pipeline()
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        with self.assertRaises(ValueError):
            pipeline.process(self.test_dir)
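
A minimal sketch of the happy path this error test inverts: a well-formed
text_replace_operation returns non-overlapping (span, replacement) pairs over
the raw text. The Span import path and the sample spans here are illustrative
assumptions, not taken from the test above.

from forte.data.span import Span

def well_formed_replace_ops(_text: str):
    # Non-overlapping, in-order spans; presumably the ValueError above comes
    # from malformed span operations (overlaps or out-of-range offsets).
    return [(Span(0, 4), "That"), (Span(10, 12), "was")]

# usage sketch: reader.text_replace_operation = well_formed_replace_ops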
class TestNLTKWordTokenizer(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())
        self.nltk.add_processor(NLTKWordTokenizer())
        self.nltk.initialize()

    def test_tokenizer(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.", "NLP has never been made this easy before."
        ]
        tokens = [["This", "tool", "is", "called", "Forte", "."],
                  [
                      "The", "goal", "of", "this", "project", "to", "help",
                      "you", "build", "NLP", "pipelines", "."
                  ],
                  [
                      "NLP", "has", "never", "been", "made", "this", "easy",
                      "before", "."
                  ]]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            for j, token in enumerate(
                    pack.get(entry_type=Token, range_annotation=sentence)):
                self.assertEqual(token.text, tokens[i][j])
Example #3
class TestStanfordNLPProcessor(unittest.TestCase):
    def setUp(self):
        self.stanford_nlp = Pipeline()
        self.stanford_nlp.set_reader(StringReader())
        models_path = os.getcwd()
        config = HParams(
            {
                "processors": "tokenize",
                "lang": "en",
                # Language code for the language to build the Pipeline
                "use_gpu": False
            },
            StandfordNLPProcessor.default_hparams())
        self.stanford_nlp.add_processor(StandfordNLPProcessor(models_path),
                                        config=config)
        self.stanford_nlp.initialize()

    # TODO
    @unittest.skip("We need to test this without needing to download models "
                   "every time")
    def test_stanford_processor(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.", "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = self.stanford_nlp.process(document)
        print(pack)
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            'lang': lang,
            # Language code for the language to build the Pipeline
            'use_gpu': False
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)

    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'), "has relation",
                  colored(link.rel_type, 'green'), "of parent",
                  colored(parent.text, 'cyan'))

        print("\n----------------------\n")
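
A minimal driver sketch for the function above, reusing sample sentences from
the tests in this section; the __main__ guard is an addition for illustration.

if __name__ == '__main__':
    stanford_nlp_example('en', "This tool is called Forte. "
                               "NLP has never been made this easy before.")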
class TestNLTKPOSTagger(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())
        self.nltk.add_processor(NLTKWordTokenizer())
        self.nltk.add_processor(NLTKPOSTagger())
        self.nltk.initialize()

    def test_pos_tagger(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.", "NLP has never been made this easy before."
        ]
        pos = [["DT", "NN", "VBZ", "VBN", "NNP", "."],
               [
                   "DT", "NN", "IN", "DT", "NN", "TO", "VB", "PRP", "VB",
                   "NNP", "NNS", "."
               ], ["NNP", "VBZ", "RB", "VBN", "VBN", "DT", "JJ", "RB", "."]]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            for j, token in enumerate(
                    pack.get(entry_type=Token, range_annotation=sentence)):
                self.assertEqual(token.pos, pos[i][j])
    def test_encoder_sentence(self):
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())
        pipeline.add_processor(NLTKSentenceSegmenter())
        pipeline.add_processor(PretrainedEncoder())
        pipeline.initialize()

        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        pack = pipeline.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
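            # Shape is [batch, max_seq_len, hidden]; presumably a BERT-base
            # sized encoder with inputs padded to a fixed length of 512.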
            self.assertEqual(sentence.embedding.shape, (1, 512, 768))
    def test_encoder_document(self):
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())
        pipeline.add_processor(
            PretrainedEncoder(),
            config={'entry_type': 'ft.onto.base_ontology.Document'})
        pipeline.initialize()

        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        pack = pipeline.process(document)
        for i, doc in enumerate(pack.get(Document)):
            self.assertEqual(doc.embedding.shape, (1, 512, 768))
    def test_one_batch_processor(self, batch_size):
        nlp = Pipeline()
        nlp.set_reader(StringReader())
        dummy = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size}}
        nlp.add_processor(NLTKSentenceSegmenter())
        nlp.add_processor(dummy, config=config)
        nlp.initialize()
        sentences = ["This tool is called Forte. The goal of this project to "
                     "help you build NLP pipelines. NLP has never been made "
                     "this easy before."]
        pack = nlp.process(sentences)
        sent_len = len(list(pack.get(Sentence)))
        self.assertEqual(
            dummy.counter, (sent_len // batch_size +
                            (sent_len % batch_size > 0)))
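
The expected counter asserted above is integer ceiling division written with a
remainder carry; a quick standalone check of that identity:

import math

# n // k plus a carry bit for any remainder equals ceil(n / k)
for n in range(10):
    for k in (1, 2, 3):
        assert n // k + (n % k > 0) == math.ceil(n / k)
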
class TestNLTKSentenceSegmenter(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())
        self.nltk.initialize()

    def test_segmenter(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.", "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for idx, sentence in enumerate(pack.get(Sentence)):
            self.assertEqual(sentence.text, sentences[idx])
Example #10
class TestLowerCaserProcessor(unittest.TestCase):
    def setUp(self):
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.add_processor(LowerCaserProcessor())
        self.nlp.initialize()

    def test_lowercaser_processor(self):
        document = "This tool is called Forte. The goal of this project to " \
                   "help you build NLP pipelines. NLP has never been made " \
                   "this easy before."
        pack = self.nlp.process(document)
        print(pack)
        print(pack.text)
        assert pack.text == "this tool is called forte. the goal of this " \
                            "project to help you build nlp pipelines. nlp " \
                            "has never been made this easy before."
Example #11
class DummyProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = {"batcher": {"batch_size": 5}}
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.data_path = "data_samples/ontonotes/00/"

    def test_processor(self):
        pack = self.nlp.process(self.data_path)
        relations = list(pack.get_entries(RelationLink))
        assert (len(relations) > 0)
        for relation in relations:
            self.assertEqual(relation.get_field("rel_type"), "dummy_relation")
Example #12
class DummyProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.reader = OntonotesReader()

        self.data_path = "examples/data_samples/ontonotes/00/"

        self.nlp.set_reader(OntonotesReader())
        self.nlp.add_processor(DummyRelationExtractor())
        self.nlp.initialize()

    def test_processor(self):
        pack = self.nlp.process(self.data_path)

        relations = list(pack.get_entries(RelationLink))

        assert (len(relations) > 0)

        for relation in relations:
            assert (relation.get_field("rel_type") == "dummy_relation")
Example #13
class TestQuestionAnsweringMulti(unittest.TestCase):
    def setUp(self):
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.add(NLTKSentenceSegmenter())
        boxer_config = {"pack_name": "question"}
        self.nlp.add(MultiPackBoxer(), boxer_config)
        self.nlp.add(MutliDocPackAdder())
        self.nlp.add(QuestionAnsweringMulti())
        self.nlp.initialize()

    def test_huggingface_qa_multi_processor(self):
        question = "Name synonym of Acrokeratosis paraneoplastica."
        packs: MultiPack = self.nlp.process(question)
        expected_ans = {
            "doc_1": "Bazex syndrome",
            "doc_2": "Bazex syndrome",
            "doc_3": "Bazex syndrome",
        }
        for doc_id in packs.pack_names:
            if doc_id == "question":
                continue
            pack = packs.get_pack(doc_id)
            for idx, phrase in enumerate(pack.get(entry_type=Phrase)):
                self.assertEqual(phrase.text, expected_ans[doc_id])

        linked_texts = []

        for link in packs.get(entry_type=MultiPackLink):
            parent_text = link.get_parent().text
            child_text = link.get_child().text
            linked_texts.append((parent_text, child_text))

        self.assertListEqual(
            sorted(linked_texts),
            sorted([
                (question, expected_ans["doc_1"]),
                (question, expected_ans["doc_2"]),
                (question, expected_ans["doc_3"]),
            ]),
        )
    def test_encoder_phrase(self):
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())
        pipeline.add_processor(NLTKSentenceSegmenter())
        pipeline.add_processor(NLTKWordTokenizer())
        pipeline.add_processor(NLTKPOSTagger())
        config = {'pattern': 'NP: {<DT>?<JJ>*<NN>}'}
        pipeline.add_processor(NLTKChunker(), config=config)
        pipeline.add_processor(
            PretrainedEncoder(),
            config={'entry_type': 'ft.onto.base_ontology.Phrase'})
        pipeline.initialize()

        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        pack = pipeline.process(document)
        for i, phrase in enumerate(pack.get(Phrase)):
            self.assertEqual(phrase.embedding.shape, (1, 512, 768))
Example #15
    def test_neg_spacy_processor(self):
        spacy = Pipeline()
        spacy.set_reader(StringReader())

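        # Negative case: requesting 'ner' on its own with this model;
        # presumably the NER outputs depend on token-level processing that
        # this configuration does not provide, hence the ValueError below.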
        config = {
            "processors": 'ner',
            "lang": "xx_ent_wiki_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        spacy.add_processor(SpacyProcessor(), config=config)
        spacy.initialize()

        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.", "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        with self.assertRaises(ValueError):
            _ = spacy.process(document)
Example #16
    def test_two_batch_processors(self, batch_size):
        nlp = Pipeline()
        nlp.set_reader(PlainTextReader())
        dummy1 = DummmyFixedSizeBatchProcessor()
        dummy2 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size}}
        nlp.add_processor(NLTKSentenceSegmenter())

        nlp.add_processor(dummy1, config=config)
        config = {"batcher": {"batch_size": 2 * batch_size}}
        nlp.add_processor(dummy2, config=config)

        nlp.initialize()
        data_path = "data_samples/random_texts"
        pack = nlp.process(data_path)
        sent_len = len(list(pack.get(Sentence)))

        self.assertEqual(dummy1.counter, (sent_len // batch_size +
                                          (sent_len % batch_size > 0)))

        self.assertEqual(dummy2.counter, (sent_len // (2 * batch_size) +
                                          (sent_len % (2 * batch_size) > 0)))
class DummyFixedSizeBatchProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.dummy = DummmyFixedSizeBatchProcessor()

    @data(1, 2, 3)
    def test_processor(self, batch_size):
        config = HParams({"batcher": {
            "batch_size": batch_size
        }}, self.dummy.default_hparams())
        self.nlp.add_processor(NLTKSentenceSegmenter())
        self.nlp.add_processor(self.dummy, config=config)
        self.nlp.initialize()
        sentences = [
            "This tool is called Forte. The goal of this project to "
            "help you build NLP pipelines. NLP has never been made "
            "this easy before."
        ]
        pack = self.nlp.process(sentences)
        sent_len = len(list(pack.get(Sentence)))
        self.assertEqual(self.dummy.counter, (sent_len // batch_size +
                                              (sent_len % batch_size > 0)))
Example #18
    def test_spacy_variation_pipeline(self, value):
        spacy = Pipeline()
        spacy.set_reader(StringReader())

        config = {
            "processors": value,
            "lang": "en_core_web_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        spacy.add_processor(SpacyProcessor(), config=config)
        spacy.initialize()

        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.", "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = spacy.process(document)
        tokens = [x for x in pack.annotations if isinstance(x, Token)]
        if "tokenize" in value:
            exp_pos = [
                'DT', 'NN', 'VBZ', 'VBN', 'NNP', '.', 'DT', 'NN', 'IN', 'DT',
                'NN', 'TO', 'VB', 'PRP', 'VB', 'NNP', 'NNS', '.', 'NNP', 'VBZ',
                'RB', 'VBN', 'VBN', 'DT', 'JJ', 'RB', '.'
            ]

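            # '-PRON-' below is spaCy 2.x's placeholder lemma for pronouns,
            # so this expected output presumably targets a 2.x model.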
            exp_lemma = [
                'this', 'tool', 'be', 'call', 'Forte', '.', 'the', 'goal',
                'of', 'this', 'project', 'to', 'help', '-PRON-', 'build',
                'NLP', 'pipeline', '.', 'NLP', 'have', 'never', 'be', 'make',
                'this', 'easy', 'before', '.'
            ]

            tokens_text = [x.text for x in tokens]

            pos = [x.pos for x in pack.annotations if isinstance(x, Token)]
            lemma = [x.lemma for x in pack.annotations if isinstance(x, Token)]
            document_ = document.replace('.', ' .')
            self.assertEqual(tokens_text, document_.split())

            # Check token texts
            for token, text in zip(tokens, tokens_text):
                start, end = token.span.begin, token.span.end
                self.assertEqual(document[start:end], text)

            if "pos" in value:
                self.assertListEqual(pos, exp_pos)
            else:
                none_pos = [None] * len(pos)
                self.assertListEqual(pos, none_pos)

            if "lemma" in value:
                self.assertListEqual(lemma, exp_lemma)
            else:
                none_lemma = [None] * len(lemma)
                self.assertListEqual(lemma, none_lemma)
        else:
            self.assertListEqual(tokens, [])

        if "ner" in value:
            entities_text = [
                x.text for x in pack.annotations
                if isinstance(x, EntityMention)
            ]
            entities_type = [
                x.ner_type for x in pack.annotations
                if isinstance(x, EntityMention)
            ]

            self.assertEqual(entities_text, ['Forte', 'NLP', 'NLP'])
            self.assertEqual(entities_type, ['GPE', 'ORG', 'ORG'])
Example #19
class TestVaderSentiment(unittest.TestCase):

    def setUp(self):
        self.pipeline = Pipeline()
        self.pipeline.set_reader(StringReader())
        self.pipeline.add_processor(NLTKSentenceSegmenter())
        self.pipeline.add_processor(VaderSentimentProcessor())
        self.pipeline.initialize()

    def test_segmenter(self):
        sentences = [
            "VADER is smart, handsome, and funny.",
            # positive sentence example
            "VADER is smart, handsome, and funny!",
            # punctuation emphasis handled correctly (sentiment intensity
            # adjusted)
            "VADER is very smart, handsome, and funny.",
            # booster words handled correctly (sentiment intensity adjusted)
            "VADER is VERY SMART, handsome, and FUNNY.",
            # emphasis for ALLCAPS handled
            "VADER is VERY SMART, handsome, and FUNNY!!!",
            # combination of signals - VADER appropriately adjusts intensity
            "VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!!",
            # booster words & punctuation make this close to ceiling for score
            "VADER is not smart, handsome, nor funny.",
            # negation sentence example
            "The book was good.",  # positive sentence
            "At least it isn't a horrible book.",
            # negated negative sentence with contraction
            "The book was only kind of good.",
            # qualified positive sentence is handled correctly (intensity
            # adjusted)
            "The plot was good, but the characters are uncompelling and the "
            "dialog is not great.",
            # mixed negation sentence
            "Today SUX!",
            # negative slang with capitalization emphasis
            "Today only kinda sux! But I'll get by, lol",
            # mixed sentiment example with slang and contrastive conjunction
            # "but"
            "Make sure you :) or :D today!",  # emoticons handled
            "Catch utf-8 emoji such as such as 💘 and 💋 and 😁",
            # emojis handled
            "Not bad at all"  # Capitalized negation
        ]

        expected_scores = [
            {'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316},
            {'neg': 0.0, 'neu': 0.248, 'pos': 0.752, 'compound': 0.8439},
            {'neg': 0.0, 'neu': 0.299, 'pos': 0.701, 'compound': 0.8545},
            {'neg': 0.0, 'neu': 0.246, 'pos': 0.754, 'compound': 0.9227},
            {'neg': 0.0, 'neu': 0.233, 'pos': 0.767, 'compound': 0.9342},
            {'neg': 0.0, 'neu': 0.294, 'pos': 0.706, 'compound': 0.9469},
            {'neg': 0.646, 'neu': 0.354, 'pos': 0.0, 'compound': -0.7424},
            {'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404},
            {'neg': 0.0, 'neu': 0.637, 'pos': 0.363, 'compound': 0.431},
            {'neg': 0.0, 'neu': 0.697, 'pos': 0.303, 'compound': 0.3832},
            {'neg': 0.327, 'neu': 0.579, 'pos': 0.094, 'compound': -0.7042},
            {'neg': 0.779, 'neu': 0.221, 'pos': 0.0, 'compound': -0.5461},
            {'neg': 0.454, 'neu': 0.546, 'pos': 0.0, 'compound': -0.3609},
            {'neg': 0.0, 'neu': 0.327, 'pos': 0.673, 'compound': 0.9551},
            {'neg': 0.0, 'neu': 0.698, 'pos': 0.302, 'compound': 0.8248},
        ]

        document = ' '.join(sentences)
        pack = self.pipeline.process(document)

        sentence: Sentence
        for idx, sentence in enumerate(pack.get(Sentence)):
            self.assertEqual(sentence.sentiment, expected_scores[idx])
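
For reference, the scores asserted above should match what vaderSentiment
itself produces, since VaderSentimentProcessor presumably wraps that package;
a minimal sketch querying the analyzer directly:

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
# Matches the first entry of expected_scores above.
print(analyzer.polarity_scores("VADER is smart, handsome, and funny."))
# -> {'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316}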