Example 1
    def setUp(self):
        """Build a Pipeline reading raw strings and running spaCy tokenization."""
        self.spacy = Pipeline()
        self.spacy.set_reader(StringReader())

        # Tokenize-only configuration for the spaCy processor.
        spacy_config = {
            "processors": "tokenize",
            # Language code for the language to build the Pipeline
            "lang": "en_core_web_sm",
            "use_gpu": False,
        }
        self.spacy.add_processor(SpacyProcessor(), config=spacy_config)
        self.spacy.initialize()
    def _create_pipeline(config):
        """Assemble a StringReader -> SpacyProcessor -> AllenNLPProcessor pipeline.

        The SpacyProcessor is configured with no extra processors so it only
        performs sentence segmentation; `config` is forwarded to AllenNLP.
        """
        pipeline = Pipeline[DataPack]()
        pipeline.set_reader(StringReader())

        # Using SpacyProcessor to segment the sentences
        spacy_config = {
            'processors': '',
            'lang': "en_core_web_sm",  # Language code to build the Pipeline
            'use_gpu': False,
        }
        pipeline.add(component=SpacyProcessor(), config=spacy_config)

        pipeline.add(component=AllenNLPProcessor(), config=config)
        pipeline.initialize()
        return pipeline
Example 3
    def test_neg_spacy_processor(self):
        """Requesting NER from the multilingual model (which cannot tokenize
        for this setup) must make processing raise ValueError."""
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())

        pipeline.add_processor(
            SpacyProcessor(),
            config={
                "processors": 'ner',
                # Language code for the language to build the Pipeline
                "lang": "xx_ent_wiki_sm",
                "use_gpu": False,
            },
        )
        pipeline.initialize()

        document = ' '.join([
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.",
            "NLP has never been made this easy before.",
        ])
        with self.assertRaises(ValueError):
            _ = pipeline.process(document)
Example 4
    def test_spacy_variation_pipeline(self, value):
        """Run the spaCy processor with the processor string `value` and check
        tokens, POS tags, lemmas and (optionally) named entities against
        hard-coded expectations."""
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())

        pipeline.add_processor(
            SpacyProcessor(),
            config={
                "processors": value,
                # Language code for the language to build the Pipeline
                "lang": "en_core_web_sm",
                "use_gpu": False,
            },
        )
        pipeline.initialize()

        document = ' '.join([
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.",
            "NLP has never been made this easy before.",
        ])
        pack = pipeline.process(document)
        tokens = [a for a in pack.annotations if isinstance(a, Token)]

        if "tokenize" in value:
            # Expected Penn Treebank tags for every token of `document`.
            exp_pos = [
                'DT', 'NN', 'VBZ', 'VBN', 'NNP', '.', 'DT', 'NN', 'IN', 'DT',
                'NN', 'TO', 'VB', 'PRP', 'VB', 'NNP', 'NNS', '.', 'NNP', 'VBZ',
                'RB', 'VBN', 'VBN', 'DT', 'JJ', 'RB', '.'
            ]

            # Expected lemmas (spaCy maps pronouns to the '-PRON-' sentinel).
            exp_lemma = [
                'this', 'tool', 'be', 'call', 'Forte', '.', 'the', 'goal',
                'of', 'this', 'project', 'to', 'help', '-PRON-', 'build',
                'NLP', 'pipeline', '.', 'NLP', 'have', 'never', 'be', 'make',
                'this', 'easy', 'before', '.'
            ]

            tokens_text = [t.text for t in tokens]

            pos = [t.pos for t in pack.annotations if isinstance(t, Token)]
            lemma = [t.lemma for t in pack.annotations if isinstance(t, Token)]
            # Whitespace-splitting after padding periods mirrors spaCy's
            # tokenization of this particular document.
            self.assertEqual(tokens_text, document.replace('.', ' .').split())

            # Each token's span must point back at its own surface text.
            for token, text in zip(tokens, tokens_text):
                begin, end = token.span.begin, token.span.end
                self.assertEqual(document[begin:end], text)

            # POS/lemma fields are populated only when requested in `value`.
            expected_pos = exp_pos if "pos" in value else [None] * len(pos)
            self.assertListEqual(pos, expected_pos)

            expected_lemma = (
                exp_lemma if "lemma" in value else [None] * len(lemma)
            )
            self.assertListEqual(lemma, expected_lemma)
        else:
            # Without tokenization no Token annotations may exist.
            self.assertListEqual(tokens, [])

        if "ner" in value:
            entities_text = [
                e.text for e in pack.annotations
                if isinstance(e, EntityMention)
            ]
            entities_type = [
                e.ner_type for e in pack.annotations
                if isinstance(e, EntityMention)
            ]

            self.assertEqual(entities_text, ['Forte', 'NLP', 'NLP'])
            self.assertEqual(entities_type, ['GPE', 'ORG', 'ORG'])
Example 5
    def test_spacy_variation_pipeline(self, value):
        """Run the spaCy processor with the processor string `value` and
        compare the pack's tokens, tags, lemmas and entities against a raw
        run of `self.nlp` on the same document."""
        pipeline = Pipeline[DataPack]()
        pipeline.set_reader(StringReader())

        pipeline.add(
            SpacyProcessor(),
            config={
                "processors": value,
                # Language code for the language to build the Pipeline
                "lang": "en_core_web_sm",
                "use_gpu": False,
            },
        )
        pipeline.initialize()

        document = ' '.join([
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.",
            "NLP has never been made this easy before.",
        ])
        pack: DataPack = pipeline.process(document)
        tokens: List[Token] = list(pack.get(Token))  # type: ignore

        # Reference output straight from spaCy, bypassing the pipeline.
        raw_results = self.nlp(document)

        if "tokenize" in value:
            # Collect expected tags/lemmas token-by-token from spaCy's parse.
            exp_pos = []
            exp_lemma = []
            for sent in raw_results.sents:
                for word in sent:
                    exp_lemma.append(word.lemma_)
                    exp_pos.append(word.tag_)

            tokens_text = [t.text for t in tokens]
            self.assertEqual(tokens_text, document.replace('.', ' .').split())

            pos = [t.pos for t in tokens]
            lemma = [t.lemma for t in tokens]

            # Each token's span must point back at its own surface text.
            for token, text in zip(tokens, tokens_text):
                begin, end = token.span.begin, token.span.end
                self.assertEqual(document[begin:end], text)

            # POS/lemma fields are populated only when requested in `value`.
            expected_pos = exp_pos if "pos" in value else [None] * len(pos)
            self.assertListEqual(pos, expected_pos)

            expected_lemma = (
                exp_lemma if "lemma" in value else [None] * len(lemma)
            )
            self.assertListEqual(lemma, expected_lemma)
        else:
            # Without tokenization no Token annotations may exist.
            self.assertListEqual(tokens, [])

        if "ner" in value:
            pack_ents: List[EntityMention] = list(pack.get(EntityMention))
            entities_text = [e.text for e in pack_ents]
            entities_type = [e.ner_type for e in pack_ents]

            raw_ents = raw_results.ents
            exp_ent_text = [
                document[ent.start_char:ent.end_char] for ent in raw_ents
            ]
            exp_ent_types = [ent.label_ for ent in raw_ents]

            self.assertEqual(entities_text, exp_ent_text)
            self.assertEqual(entities_type, exp_ent_types)