Example #1
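This and the following examples omit their imports. Below is a minimal sketch of what they assume, with module paths as of Forte 0.x (exact locations vary between versions; test-local helpers such as TmpReplacer, EntityMentionInserter, and the simple tokenizers/splitters come from the test modules themselves or from forte.processors.misc):

import os
import random
import tempfile
from typing import Dict, List

from forte.data.caster import MultiPackBoxer
from forte.data.data_pack import DataPack
from forte.data.multi_pack import MultiPack
from forte.data.ontology.top import (
    Annotation,
    Group,
    Link,
    MultiPackGroup,
    MultiPackLink,
)
from forte.data.readers import OntonotesReader, StringReader
from forte.data.selector import AllPackSelector
from forte.pipeline import Pipeline
from forte.processors.data_augment import ReplacementDataAugmentProcessor
from ft.onto.base_ontology import Document, Sentence, Subword, Token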
    def test_pipeline(self, texts, expected_outputs, expected_tokens):
        nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input"}

        replacer_op = TmpReplacer.__module__ + "." + TmpReplacer.__qualname__

        processor_config = {
            "augment_entry": "ft.onto.base_ontology.Token",
            "other_entry_policy": {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align",
            },
            "type": "data_augmentation_op",
            "data_aug_op": replacer_op,
            "data_aug_op_config": {},
            "augment_pack_names": {},
        }

        nlp.set_reader(reader=StringReader())
        nlp.add(component=MultiPackBoxer(), config=boxer_config)
        nlp.add(component=WhiteSpaceTokenizer(), selector=AllPackSelector())
        nlp.add(
            component=ReplacementDataAugmentProcessor(), config=processor_config
        )
        nlp.initialize()

        for idx, m_pack in enumerate(nlp.process_dataset(texts)):
            aug_pack = m_pack.get_pack("augmented_input")

            self.assertEqual(aug_pack.text, expected_outputs[idx])

            for j, token in enumerate(aug_pack.get(Token)):
                self.assertEqual(token.text, expected_tokens[idx][j])
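TmpReplacer is a dummy replacement op defined in the test module and not shown here. As a minimal sketch, assuming Forte's TextReplacementOp interface (whose replace method returns a (is_valid, replacement_text) tuple) and a hypothetical word table:

from typing import Tuple

from forte.data.ontology.top import Annotation
from forte.processors.data_augment.algorithms.text_replacement_op import (
    TextReplacementOp,
)


class TmpReplacer(TextReplacementOp):
    # Hypothetical word table; the real test fixture defines its own.
    TABLE = {"Mary": "Virginia", "noon": "12:00"}

    def replace(self, input_anno: Annotation) -> Tuple[bool, str]:
        word = input_anno.text
        # Report the replacement as valid; fall back to the original word.
        return True, self.TABLE.get(word, word)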
Example #2
    def setUp(self):
        random.seed(0)
        self.nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input_src"}

        self.nlp.set_reader(reader=StringReader())
        self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
        self.nlp.add(
            component=WhiteSpaceTokenizer(), selector=AllPackSelector()
        )
Example #3
    def setUp(self):
        random.seed(8)
        self.nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input_src"}
        entity_config = {"entities_to_insert": ["Mary", "station"]}
        self.nlp.set_reader(reader=StringReader())
        self.nlp.add(component=EntityMentionInserter(), config=entity_config)
        self.nlp.add(PeriodSentenceSplitter())
        self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
        self.nlp.add(component=WhiteSpaceTokenizer(),
                     selector=AllPackSelector())
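EntityMentionInserter runs before PeriodSentenceSplitter and MultiPackBoxer here, so the strings listed in entities_to_insert ("Mary" and "station") are presumably spliced into the text and annotated as entity mentions while the data is still a single DataPack, before it is boxed into a MultiPack and tokenized per pack.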
Example #4
    def test_serialize_deserialize_processor(self):
        pipe_serialize = Pipeline[DataPack]()
        pipe_serialize.set_reader(OntonotesReader())
        pipe_serialize.add(
            AnnotationRemover(),
            # Remove tokens and sentences from the OntonotesReader output.
            {
                "removal_types": [
                    "ft.onto.base_ontology.Token",
                    "ft.onto.base_ontology.Sentence",
                ]
            },
        )
        pipe_serialize.add(PeriodSentenceSplitter())
        pipe_serialize.add(WhiteSpaceTokenizer())

        with tempfile.TemporaryDirectory() as output_dir:
            pipe_serialize.add(
                PackNameJsonPackWriter(),
                {
                    "output_dir": output_dir,
                    "indent": 2,
                },
            )

            pipe_serialize.run(self.data_path)

            pipe_deserialize = Pipeline[DataPack]()
            pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
            pipe_deserialize.initialize()

            token_counts: Dict[str, int] = {}

            # This tests whether the deserialized data is still the same
            # as expected.
            pack: DataPack
            for pack in pipe_deserialize.process_dataset(output_dir):
                tokens: List[Token] = list(pack.get(Token))
                token_counts[pack.pack_name] = len(tokens)

            expected_count = {
                "bn/abc/00/abc_0039": 72,
                "bn/abc/00/abc_0019": 370,
                "bn/abc/00/abc_0059": 39,
                "bn/abc/00/abc_0009": 424,
                "bn/abc/00/abc_0029": 487,
                "bn/abc/00/abc_0069": 428,
                "bn/abc/00/abc_0049": 73,
            }

            self.assertEqual(token_counts, expected_count)
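PackNameJsonPackWriter presumably writes one JSON file per pack under output_dir, using pack_name as the relative path, so names like "bn/abc/00/abc_0039" become a nested directory tree that RecursiveDirectoryDeserializeReader then walks to rebuild each DataPack for the count comparison above.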
Example #5
    def test_tokenizer_unicode(self, input_data):
        self.pl = (
            Pipeline[DataPack]()
            .set_reader(StringReader())
            .add(WhiteSpaceTokenizer())
            .add(
                SubwordTokenizer(),
                config={
                    "tokenizer_configs": {"do_lower_case": True},
                    "token_source": "ft.onto.base_ontology.Token",
                },
            )
            .initialize()
        )

        for pack in self.pl.process_dataset(input_data):
            subwords = list(pack.get(Subword))
            self.assertEqual(len(subwords), 57)
            self.assertEqual(subwords[-1].text, "İbrahimova")
            self.assertTrue(subwords[-1].is_unk)
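Note the fluent construction: set_reader, add, and initialize each return the pipeline itself, so the whole pipeline is assembled and initialized in a single expression. The final assertions check that the last of the 57 subwords is flagged is_unk, presumably because "İbrahimova", even lowercased, falls outside the subword vocabulary.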
Example #6
    def test_pipeline(self, texts, expected_outputs):
        nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input"}

        nlp.set_reader(reader=StringReader())
        nlp.add(component=MultiPackBoxer(), config=boxer_config)
        nlp.add(component=WhiteSpaceTokenizer(), selector=AllPackSelector())

        processor_config = {
            "augment_entry": "ft.onto.base_ontology.Token",
            "other_entry_policy": {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align",
            },
            "type": "data_augmentation_op",
            "data_aug_op": "forte.processors.data_augment.algorithms"
            ".embedding_similarity_replacement_op."
            "EmbeddingSimilarityReplacementOp",
            "data_aug_op_config": {
                "vocab_path": self.abs_vocab_path,
                "embed_hparams": self.embed_hparams,
                "top_k": 1,
            },
            "augment_pack_names": {"input": "augmented_input"},
        }
        nlp.add(
            component=ReplacementDataAugmentProcessor(), config=processor_config
        )
        nlp.initialize()

        for idx, m_pack in enumerate(nlp.process_dataset(texts)):
            aug_pack = m_pack.get_pack("augmented_input")
            self.assertEqual(aug_pack.text, expected_outputs[idx])
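Unlike Example #1, this config gives data_aug_op as the full dotted path to EmbeddingSimilarityReplacementOp and maps the boxed "input" pack to "augmented_input" explicitly. The op presumably replaces each token with a similar word from a pretrained embedding space; with "top_k": 1 only the single nearest neighbor is considered. self.abs_vocab_path and self.embed_hparams are fixture attributes set elsewhere in the test class.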
Example #7
    def test_replace_token(
        self, texts, expected_outputs, expected_tokens, expected_links
    ):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx + 1}.txt")
            with open(file_path, "w") as f:
                f.write(text)

        nlp = Pipeline[MultiPack]()
        reader_config = {
            "input_pack_name": "input_src",
            "output_pack_name": "output_tgt",
        }
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)

        nlp.add(component=WhiteSpaceTokenizer(), selector=AllPackSelector())

        replacer_op = TmpReplacer.__module__ + "." + TmpReplacer.__qualname__

        processor_config = {
            "augment_entry": "ft.onto.base_ontology.Token",
            "other_entry_policy": {
                "ft.onto.base_ontology.Sentence": "auto_align"
            },
            "type": "data_augmentation_op",
            "data_aug_op": replacer_op,
            "data_aug_op_config": {},
            "augment_pack_names": {},
        }

        nlp.initialize()

        processor = ReplacementDataAugmentProcessor()
        # Initialize the processor directly so its internal methods can be tested.
        processor.initialize(resources=None, configs=processor_config)

        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            src_pack = m_pack.get_pack("input_src")
            tgt_pack = m_pack.get_pack("output_tgt")

            num_mpl_orig, num_mpg_orig = 0, 0
            # Copy the source pack to target pack.
            tgt_pack.set_text(src_pack.text)

            src_pack.add_entry(Document(src_pack, 0, len(src_pack.text)))
            for anno in src_pack.get(Annotation):
                new_anno = type(anno)(tgt_pack, anno.begin, anno.end)
                tgt_pack.add_entry(new_anno)

                # Create MultiPackLink.
                m_pack.add_entry(MultiPackLink(m_pack, anno, new_anno))

                # Create MultiPackGroup.
                m_pack.add_entry(MultiPackGroup(m_pack, [anno, new_anno]))

                # Count the number of MultiPackLink/MultiPackGroup.
                num_mpl_orig += 1
                num_mpg_orig += 1

            # Create Links in the source pack.
            # The Links should be a tree:
            #
            #                           Link 3
            #                    _________|_________
            #                   |                  |
            #                 Link 2               |
            #            _______|________          |
            #           |               |          |
            #         Link 1            |          |
            #     ______|_____          |          |
            #    |           |          |          |
            # token 1     token 2    token 3    token 4 ... ...
            prev_entry = None
            for i, token in enumerate(src_pack.get(Token)):
                # Avoid overlapping with deleted tokens.
                if i < 10:
                    continue
                if prev_entry:
                    link = Link(src_pack, prev_entry, token)
                    src_pack.add_entry(link)
                    prev_entry = link
                else:
                    prev_entry = token

            # Create Groups in the target pack.
            # The Groups should be a tree like the Links.
            prev_entry = None
            for i, token in enumerate(tgt_pack.get(Token)):
                # Avoid overlapping with deleted tokens.
                if i < 10:
                    continue
                if prev_entry:
                    group = Group(tgt_pack, [prev_entry, token])
                    tgt_pack.add_entry(group)
                    prev_entry = group
                else:
                    prev_entry = token

            doc_src = list(src_pack.get(Document))[0]
            doc_tgt = list(tgt_pack.get(Document))[0]

            sent_src = list(src_pack.get(Sentence))[0]
            sent_tgt = list(tgt_pack.get(Sentence))[0]

            # Insert two extra Links in the src_pack.
            # They should not be copied to new_src_pack, because the Document
            # is not copied.
            link_src_low = src_pack.add_entry(Link(src_pack, doc_src, sent_src))
            src_pack.add_entry(Link(src_pack, link_src_low, sent_src))

            # Insert two extra Groups in the tgt_pack.
            # They should not be copied to new_tgt_pack, because the
            # Document is not copied.
            group_tgt_low = tgt_pack.add_entry(
                Group(tgt_pack, [doc_tgt, sent_tgt])
            )
            tgt_pack.add_entry(Group(tgt_pack, [group_tgt_low, sent_tgt]))

            # Call the augment function explicitly with a duplicate replacement
            # to exercise the False branch of the _replace function.
            processor._augment(m_pack, ["input_src", "output_tgt"])

            # Test the insertion and deletion
            for pack in (src_pack, tgt_pack):
                # Insert " NLP " at the beginning and at several later offsets.
                processor._insert(" NLP ", pack, 0)
                processor._insert(" NLP ", pack, 18)
                processor._insert(" NLP ", pack, len(pack.text) - 2)
                processor._insert(" NLP", pack, len(pack.text) - 1)
                # Delete the second token "and"
                processor._delete(list(pack.get(Token))[1])

                # This duplicate insertion should be invalid.
                processor._insert(" NLP ", pack, 0)
                # This insertion overlaps with a replacement.
                # It should be invalid.
                processor._insert(" NLP ", pack, 2)

            processor._process(m_pack)

            new_src_pack = m_pack.get_pack("augmented_input_src")
            new_tgt_pack = m_pack.get_pack("augmented_output_tgt")

            self.assertEqual(new_src_pack.text, expected_outputs[idx] + "\n")

            for j, token in enumerate(new_src_pack.get(Token)):
                self.assertEqual(token.text, expected_tokens[idx][j])

            for sent in new_src_pack.get(Sentence):
                self.assertEqual(sent.text, expected_outputs[idx])

            # Test the copied Links.
            prev_link = None
            for i, link in enumerate(new_src_pack.get(Link)):
                if prev_link:
                    self.assertEqual(link.get_parent().tid, prev_link.tid)
                    self.assertEqual(
                        link.get_child().text, expected_links[idx][i]
                    )
                prev_link = link

            # Test the copied Groups.
            prev_group = None
            for i, group in enumerate(new_tgt_pack.get(Group)):
                members = group.get_members()
                if isinstance(members[0], Token):
                    member_token = members[0]
                    member_group = members[1]
                else:
                    member_token = members[1]
                    member_group = members[0]

                if prev_group:
                    self.assertIsInstance(member_token, Token)
                    self.assertIsInstance(member_group, Group)
                    self.assertEqual(member_group.tid, prev_group.tid)
                    self.assertEqual(member_token.text, expected_links[idx][i])

                prev_group = group

            # The two extra Links should not be copied, because the Document
            # is not copied.
            self.assertEqual(
                len(list(src_pack.get(Link))) - 2,
                len(list(new_src_pack.get(Link))),
            )
            # The two extra Groups should not be copied, because the Document
            # is not copied.
            self.assertEqual(
                len(list(tgt_pack.get(Group))) - 2,
                len(list(new_tgt_pack.get(Group))),
            )

            # Test the MultiPackLink/MultiPackGroup
            num_mpl_aug, num_mpg_aug = 0, 0
            for mpl in m_pack.get(MultiPackLink):
                parent = mpl.get_parent()
                child = mpl.get_child()
                num_mpl_aug += 1
                self.assertEqual(parent.text, child.text)
                self.assertNotEqual(parent.pack.pack_id, child.pack.pack_id)

            for mpg in m_pack.get(MultiPackGroup):
                members = mpg.get_members()
                num_mpg_aug += 1
                self.assertEqual(members[0].text, members[1].text)
                self.assertNotEqual(
                    members[0].pack.pack_id, members[1].pack.pack_id
                )

            # Test the number of MultiPackLink/MultiPackGroup.
            # Subtract 1 from both the augmented and original counters: the
            # Document is not copied, so the MultiPackLink and MultiPackGroup
            # between the Documents are ignored. The remaining count should
            # be doubled, minus one for the deleted token.
            self.assertEqual(num_mpl_aug - 1, (num_mpl_orig - 1) * 2 - 1)
            self.assertEqual(num_mpg_aug - 1, (num_mpg_orig - 1) * 2 - 1)
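To make the final bookkeeping concrete with hypothetical numbers: if the source pack held 5 annotations, then num_mpl_orig = num_mpg_orig = 5, one of which pairs the two Document entries. That Document link is never copied, the remaining 4 are doubled across the original and augmented pack pairs, and one copy is lost to the deleted token, so num_mpl_aug = 1 + 4 * 2 - 1 = 8, satisfying num_mpl_aug - 1 == (num_mpl_orig - 1) * 2 - 1.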