def test_squad_tensorizer(self):
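        # Build a data source over the tiny SQuAD fixture, then numberize its
        # first eval row with a BERT-style tensorizer.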
        source = SquadDataSource.from_config(
            SquadDataSource.Config(
                eval_filename=tests_module.test_file("squad_tiny.json")
            )
        )
        row = next(iter(source.eval))
        tensorizer = SquadForBERTTensorizer.from_config(
            SquadForBERTTensorizer.Config(
                tokenizer=WordPieceTokenizer.Config(
                    wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
                ),
                max_seq_len=250,
            )
        )
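        # numberize() yields token ids, segment ids, the sequence length, and
        # start/end token positions for each answer span.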
        tokens, segments, seq_len, start, end = tensorizer.numberize(row)
        # check against manually verified answer positions in tokenized output
        # there are 4 identical answers
        self.assertEqual(start, [83, 83, 83, 83])
        self.assertEqual(end, [87, 87, 87, 87])
        self.assertEqual(len(tokens), seq_len)
        self.assertEqual(len(segments), seq_len)

        tensorizer.max_seq_len = 50
        # with the shorter max_seq_len, the answer should be truncated out and
        # its positions fall back to the ignore index (-100)
        tokens, segments, seq_len, start, end = tensorizer.numberize(row)
        self.assertEqual(start, [-100, -100, -100, -100])
        self.assertEqual(end, [-100, -100, -100, -100])
        # lengths should track the truncated sequence
        self.assertEqual(len(tokens), seq_len)
        self.assertEqual(len(segments), seq_len)

    def setUp(self):
        self.json_data_source = SquadDataSource.from_config(
            SquadDataSource.Config(
                train_filename=tests_module.test_file("squad_tiny.json"),
                eval_filename=None,
                test_filename=None,
            )
        )
        self.tsv_data_source = SquadDataSource.from_config(
            SquadDataSource.Config(
                train_filename=tests_module.test_file("squad_tiny.tsv"),
                eval_filename=None,
                test_filename=None,
            )
        )

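        # Two SquadTensorizer variants over the same data: one using wordpiece
        # tokenization, one splitting on non-word characters.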
        self.tensorizer_with_wordpiece = SquadTensorizer.from_config(
            SquadTensorizer.Config(
                tokenizer=WordPieceTokenizer.Config(
                    wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
                ),
                max_seq_len=250,
            )
        )
        self.tensorizer_with_alphanumeric = SquadTensorizer.from_config(
            SquadTensorizer.Config(
                tokenizer=Tokenizer.Config(split_regex=r"\W+"), max_seq_len=250
            )
        )
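
    # Hypothetical usage sketch, not part of the original suite: the two
    # sources built in setUp hold the same tiny dataset in different formats,
    # so a consistency check could compare their rows. The field names
    # "question" and "doc" are assumptions about the SquadDataSource row schema.
    def test_json_and_tsv_sources_agree(self):
        json_row = next(iter(self.json_data_source.train))
        tsv_row = next(iter(self.tsv_data_source.train))
        self.assertEqual(json_row["question"], tsv_row["question"])
        self.assertEqual(json_row["doc"], tsv_row["doc"])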