Example 1
    def test_huggingface_pos_token_classification(self):
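        # Tag each token with a part-of-speech label (no-merge scheme) and
        # verify both the predicted tags and the character spans.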
        nlp = Pipeline[DataPack]()
        nlp.set_reader(StringReader())
        nlp.add(PeriodSentenceSplitter())
        token_config = {
            "entry_type": "ft.onto.base_ontology.Sentence",
            "output_entry_type": "ft.onto.base_ontology.Token",
            "attribute_name": "pos",
            "tagging_scheme": "no-merge",
            "model_name": "vblagoje/bert-english-uncased-finetuned-pos",
            "tokenizer": "vblagoje/bert-english-uncased-finetuned-pos",
            "framework": "pt",
        }
        nlp.add(TokenClassification(), config=token_config)
        nlp.initialize()
        sentences = ["My name is Clara and I live in Berkeley, California."]

        pack = nlp.process(sentences)

        expected_type = [[
            "PRON",
            "NOUN",
            "AUX",
            "PROPN",
            "CCONJ",
            "PRON",
            "VERB",
            "ADP",
            "PROPN",
            "PUNCT",
            "PROPN",
            "PUNCT",
        ]]
        expected_index = [[
            (0, 2),
            (3, 7),
            (8, 10),
            (11, 16),
            (17, 20),
            (21, 22),
            (23, 27),
            (28, 30),
            (31, 39),
            (39, 40),
            (41, 51),
            (51, 52),
        ]]

        for entry_idx, entry in enumerate(pack.get(
                token_config["entry_type"])):
            for idx, token in enumerate(
                    pack.get(
                        entry_type=token_config["output_entry_type"],
                        range_annotation=entry,
                    )):
                token_type = getattr(token, token_config["attribute_name"])
                self.assertEqual(token_type, expected_type[entry_idx][idx])
                self.assertEqual(token.begin,
                                 expected_index[entry_idx][idx][0])
                self.assertEqual(token.end, expected_index[entry_idx][idx][1])
Example 2
    def test_huggingface_ner_token_classification(self):
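        # Run a multilingual NER model on the TensorFlow backend and verify
        # the predicted entity types and character spans.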
        nlp = Pipeline[DataPack]()
        nlp.set_reader(StringReader())
        nlp.add(PeriodSentenceSplitter())
        token_config = {
            "entry_type": "ft.onto.base_ontology.Sentence",
            "output_entry_type": "ft.onto.base_ontology.EntityMention",
            "attribute_name": "ner_type",
            "tagging_scheme": "no-merge",  # 'bio-merge'
            "model_name": "jplu/tf-xlm-r-ner-40-lang",
            "tokenizer": "jplu/tf-xlm-r-ner-40-lang",
            "framework": "tf",
        }
        nlp.add(TokenClassification(), config=token_config)
        nlp.initialize()
        sentences = ["Barack Obama was born in Hawaii."]

        pack = nlp.process(sentences)

        expected_type = [["PER", "PER", "LOC"]]
        expected_index = [[(0, 6), (7, 12), (25, 31)]]

        for entry_idx, entry in enumerate(pack.get(
                token_config["entry_type"])):
            for idx, token in enumerate(
                    pack.get(
                        entry_type=token_config["output_entry_type"],
                        range_annotation=entry,
                    )):

                token_type = getattr(token, token_config["attribute_name"])
                self.assertEqual(token_type, expected_type[entry_idx][idx])
                self.assertEqual(token.begin,
                                 expected_index[entry_idx][idx][0])
                self.assertEqual(token.end, expected_index[entry_idx][idx][1])
Example 3
    def setUp(self):
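        # Build a MultiPack pipeline: insert the given entity mentions, split
        # sentences, box the DataPack into a MultiPack named "input_src", and
        # tokenize every pack on whitespace.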
        random.seed(8)
        self.nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input_src"}
        entity_config = {"entities_to_insert": ["Mary", "station"]}
        self.nlp.set_reader(reader=StringReader())
        self.nlp.add(component=EntityMentionInserter(), config=entity_config)
        self.nlp.add(PeriodSentenceSplitter())
        self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
        self.nlp.add(component=WhiteSpaceTokenizer(),
                     selector=AllPackSelector())
Example 4
    def test_serialize_deserialize_processor(self):
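        # Strip the reader's annotations, re-annotate, write the packs to
        # JSON, read them back, and check that the per-pack token counts
        # survive the round trip.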
        pipe_serialize = Pipeline[DataPack]()
        pipe_serialize.set_reader(OntonotesReader())
        pipe_serialize.add(
            AnnotationRemover(),
            # Remove tokens and sentences from OntonotesReader.
            {
                "removal_types": [
                    "ft.onto.base_ontology.Token",
                    "ft.onto.base_ontology.Sentence",
                ]
            },
        )
        pipe_serialize.add(PeriodSentenceSplitter())
        pipe_serialize.add(WhiteSpaceTokenizer())

        with tempfile.TemporaryDirectory() as output_dir:
            pipe_serialize.add(
                PackNameJsonPackWriter(),
                {
                    "output_dir": output_dir,
                    "indent": 2,
                },
            )

            pipe_serialize.run(self.data_path)

            pipe_deserialize = Pipeline[DataPack]()
            pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
            pipe_deserialize.initialize()

            token_counts: Dict[str, int] = {}

            # Verify that the deserialized data is still the same as expected.
            pack: DataPack
            for pack in pipe_deserialize.process_dataset(output_dir):
                tokens: List[Token] = list(pack.get(Token))
                token_counts[pack.pack_name] = len(tokens)

            expected_count = {
                "bn/abc/00/abc_0039": 72,
                "bn/abc/00/abc_0019": 370,
                "bn/abc/00/abc_0059": 39,
                "bn/abc/00/abc_0009": 424,
                "bn/abc/00/abc_0029": 487,
                "bn/abc/00/abc_0069": 428,
                "bn/abc/00/abc_0049": 73,
            }

            self.assertEqual(token_counts, expected_count)
Example 5
    def test_encoder_sentence(self):
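        # Encode each sentence with a pretrained transformer and check the
        # shape of the resulting sentence embedding.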
        pipeline = Pipeline[DataPack]()
        pipeline.set_reader(StringReader())
        pipeline.add(PeriodSentenceSplitter())
        pipeline.add(PretrainedEncoder())
        pipeline.initialize()

        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP " "pipelines.",
            "NLP has never been made this easy before.",
        ]
        document = " ".join(sentences)
        pack = pipeline.process(document)
        for sentence in pack.get(Sentence):
            self.assertEqual(sentence.embedding.shape, (1, 512, 768))
Example 6
    def test_huggingface_ws_token_classification(self):
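        # Segment a Chinese sentence into words (bio-merge scheme) and verify
        # the resulting token spans.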
        nlp = Pipeline[DataPack]()
        nlp.set_reader(StringReader())
        nlp.add(PeriodSentenceSplitter())
        token_config = {
            "entry_type": "ft.onto.base_ontology.Sentence",
            "output_entry_type": "ft.onto.base_ontology.Token",
            "attribute_name": "word_segment",
            "tagging_scheme": "bio-merge",
            "model_name": "ckiplab/bert-base-chinese-ws",
            "tokenizer": "ckiplab/bert-base-chinese-ws",
            "framework": "pt",
        }
        nlp.add(TokenClassification(), config=token_config)
        nlp.initialize()
        sentences = ["我叫克拉拉,我住在加州伯克利。"]

        pack = nlp.process(sentences)

        expected_index = [[
            (0, 1),
            (1, 2),
            (2, 5),
            (5, 6),
            (6, 7),
            (7, 8),
            (8, 9),
            (9, 11),
            (11, 14),
            (14, 15),
        ]]

        for entry_idx, entry in enumerate(pack.get(
                token_config["entry_type"])):
            for idx, token in enumerate(
                    pack.get(
                        entry_type=token_config["output_entry_type"],
                        range_annotation=entry,
                    )):
                self.assertEqual(token.begin,
                                 expected_index[entry_idx][idx][0])
                self.assertEqual(token.end, expected_index[entry_idx][idx][1])
Example 7
    def test_two_batch_processors(self, batch_size):
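        # Chain two batch processors whose batch sizes differ by a factor of
        # two and check that each is invoked ceil(#sentences / batch size)
        # times.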
        nlp = Pipeline[DataPack]()
        nlp.set_reader(PlainTextReader())
        dummy1 = DummyFixedSizeBatchProcessor()
        dummy2 = DummyFixedSizeBatchProcessor()

        nlp.add(PeriodSentenceSplitter())
        nlp.add(
            dummy1,
            config={
                "batcher": {
                    "batch_size": batch_size,
                    "context_type": "ft.onto.base_ontology.Sentence",
                }
            },
        )

        nlp.add(
            dummy2,
            config={
                "batcher": {
                    "batch_size": 2 * batch_size,
                    "context_type": "ft.onto.base_ontology.Sentence",
                }
            },
        )

        nlp.initialize()
        data_path = os.path.join(data_samples_root, "random_texts")
        pack = nlp.process(data_path)
        sent_len = len(list(pack.get(Sentence)))

        self.assertEqual(
            dummy1.counter,
            (sent_len // batch_size + (sent_len % batch_size > 0)),
        )

        self.assertEqual(
            dummy2.counter,
            (sent_len // (2 * batch_size) + (sent_len % (2 * batch_size) > 0)),
        )
Example 8
    def test_huggingface_ner_bio_classification(self):
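        # Merge BIO-tagged predictions into EntityMention annotations and
        # check their types and character offsets for each sentence.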
        nlp = Pipeline[DataPack]()
        nlp.set_reader(StringReader())
        nlp.add(PeriodSentenceSplitter())
        token_config = {
            "entry_type": "ft.onto.base_ontology.Sentence",
            "output_entry_type": "ft.onto.base_ontology.EntityMention",
            "attribute_name": "ner_type",
            "tagging_scheme": "bio-merge",
            "model_name": "dslim/bert-base-NER",
            "tokenizer": "dslim/bert-base-NER",
            "framework": "pt",
        }
        nlp.add(TokenClassification(), config=token_config)
        nlp.initialize()
        sentences = [
            "My name is Wolfgang and I live in Berlin.",
            "His name is Chris and he lives in Hawaii Island.",
        ]
        document = " ".join(sentences)
        pack = nlp.process(document)

        expected_type = [["PER", "LOC"], ["PER", "LOC"]]
        expected_index = [[(11, 19), (34, 40)], [(54, 59), (76, 89)]]

        for entry_idx, entry in enumerate(pack.get(
                token_config["entry_type"])):
            for idx, token in enumerate(
                    pack.get(
                        entry_type=token_config["output_entry_type"],
                        range_annotation=entry,
                    )):

                token_type = getattr(token, token_config["attribute_name"])
                self.assertEqual(token_type, expected_type[entry_idx][idx])
                self.assertEqual(token.begin,
                                 expected_index[entry_idx][idx][0])
                self.assertEqual(token.end, expected_index[entry_idx][idx][1])
Example 9
    def test_one_batch_processor(self, batch_size):
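        # A single fixed-size batch processor should run
        # ceil(#sentences / batch size) batches.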
        nlp = Pipeline[DataPack]()
        nlp.set_reader(StringReader())
        batch_processor = DummyFixedSizeBatchProcessor()
        config = {
            "batcher": {
                "batch_size": batch_size,
                "context_type": "ft.onto.base_ontology.Sentence",
            },
        }
        nlp.add(PeriodSentenceSplitter())
        nlp.add(batch_processor, config=config)
        nlp.initialize()
        sentences = [
            "This tool is called Forte. The goal of this project to "
            "help you build NLP pipelines. NLP has never been made "
            "this easy before."
        ]
        pack = nlp.process(sentences)
        sent_len = len(list(pack.get(Sentence)))
        self.assertEqual(
            batch_processor.counter,
            (sent_len // batch_size + (sent_len % batch_size > 0)),
        )