Example #1
 class Config(TokenTensorizer.Config):
     #: The input text column(s) to read from.
     columns: List[str] = ["text"]
     #: The tokenizer to use to split input text into tokens.
     tokenizer: Tokenizer.Config = WordPieceTokenizer.Config()
     add_bos_token: bool = True
     add_eos_token: bool = True
     bos_token: str = "[CLS]"
     eos_token: str = "[SEP]"
     pad_token: str = "[PAD]"
     unk_token: str = "[UNK]"
     mask_token: str = "[MASK]"
     vocab_file: str = WordPieceTokenizer.Config().wordpiece_vocab_path
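These defaults feed straight into from_config, as the later examples show. A minimal usage sketch (the import paths and vocab file below are assumptions based on the usual PyText layout, not part of the original snippet):

    from pytext.data.bert_tensorizer import BERTTensorizer
    from pytext.data.tokenizers import WordPieceTokenizer

    # Override only the fields you need; everything else keeps the defaults above.
    tensorizer = BERTTensorizer.from_config(
        BERTTensorizer.Config(
            columns=["text"],
            tokenizer=WordPieceTokenizer.Config(
                wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
            ),
        )
    )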
Example #2
 def test_wordpiece_tokenizer(self):
     text = "Marcó Lopᚠz"
     expected = [
         Token("m", 0, 1),
         Token("##ar", 1, 3),
         Token("##c", 3, 4),
         Token(value="##o", start=4, end=5),
         Token(value="[UNK]", start=6, end=11),
     ]
     tokenizer = WordPieceTokenizer.from_config(
         WordPieceTokenizer.Config(
             wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"))
     tokens = tokenizer.tokenize(text)
     print(tokens)
     self.assertEqual(tokens, expected)
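Each Token pairs a wordpiece value with character offsets into the original string, so subwords can be mapped back to the text they came from. A small sketch of that round trip (not part of the test above):

    from pytext.data.tokenizers import WordPieceTokenizer

    text = "Marcó López"
    tokenizer = WordPieceTokenizer.from_config(
        WordPieceTokenizer.Config(
            wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
        )
    )
    for token in tokenizer.tokenize(text):
        # start/end index into the original text, not the lowercased pieces.
        print(token.value, repr(text[token.start:token.end]))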
Example #3
    def test_squad_tensorizer(self):
        source = SquadDataSource.from_config(
            SquadDataSource.Config(
                eval_filename=tests_module.test_file("squad_tiny.json")
            )
        )
        row = next(iter(source.eval))
        tensorizer = SquadForBERTTensorizer.from_config(
            SquadForBERTTensorizer.Config(
                tokenizer=WordPieceTokenizer.Config(
                    wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
                ),
                max_seq_len=250,
            )
        )
        tokens, segments, seq_len, start, end = tensorizer.numberize(row)
        # check against manually verified answer positions in tokenized output
        # there are 4 identical answers
        self.assertEqual(start, [83, 83, 83, 83])
        self.assertEqual(end, [87, 87, 87, 87])
        self.assertEqual(len(tokens), seq_len)
        self.assertEqual(len(segments), seq_len)

        tensorizer.max_seq_len = 50
        # answer should be truncated out
        _, _, _, start, end = tensorizer.numberize(row)
        self.assertEqual(start, [-100, -100, -100, -100])
        self.assertEqual(end, [-100, -100, -100, -100])
        self.assertEqual(len(tokens), seq_len)
        self.assertEqual(len(segments), seq_len)
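Continuing from the tensorizer and row above, -100 acts as the sentinel for answer spans that were truncated away, so a survival check could look like this (a sketch, not part of the original test):

    tokens, segments, seq_len, starts, ends = tensorizer.numberize(row)
    # If every span is -100, the gold answer fell outside max_seq_len.
    answer_kept = any(s != -100 and e != -100 for s, e in zip(starts, ends))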
Example #4
    def setUp(self):
        self.json_data_source = SquadDataSource.from_config(
            SquadDataSource.Config(
                train_filename=tests_module.test_file("squad_tiny.json"),
                eval_filename=None,
                test_filename=None,
            )
        )
        self.tsv_data_source = SquadDataSource.from_config(
            SquadDataSource.Config(
                train_filename=tests_module.test_file("squad_tiny.tsv"),
                eval_filename=None,
                test_filename=None,
            )
        )

        self.tensorizer_with_wordpiece = SquadTensorizer.from_config(
            SquadTensorizer.Config(
                tokenizer=WordPieceTokenizer.Config(
                    wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
                ),
                max_seq_len=250,
            )
        )
        self.tensorizer_with_alphanumeric = SquadTensorizer.from_config(
            SquadTensorizer.Config(
                tokenizer=Tokenizer.Config(split_regex=r"\W+"), max_seq_len=250
            )
        )
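The two tensorizers above differ only in their tokenizer config; tokenizing the same string with each tokenizer directly shows the difference. A sketch, assuming the plain Tokenizer exposes the same from_config/tokenize interface as WordPieceTokenizer:

    from pytext.data.tokenizers import Tokenizer, WordPieceTokenizer

    text = "Focus Driving School"
    # Regex tokenizer: whole (lowercased) words split on non-word characters.
    regex_tokens = Tokenizer.from_config(
        Tokenizer.Config(split_regex=r"\W+")
    ).tokenize(text)
    # WordPiece tokenizer: subword units drawn from the vocab file.
    wordpiece_tokens = WordPieceTokenizer.from_config(
        WordPieceTokenizer.Config(
            wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
        )
    ).tokenize(text)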
Example #5
 def test_bert_pair_tensorizer(self):
     sentences = ["Focus", "Driving School"]
     expected_tokens = [101, 175, 287, 766, 462, 102, 100, 379, 102]
     expected_segment_labels = [0, 0, 0, 0, 0, 0, 1, 1, 1]
     row = {"text1": sentences[0], "text2": sentences[1]}
     tensorizer = BERTTensorizer.from_config(
         BERTTensorizer.Config(
             columns=["text1", "text2"],
             tokenizer=WordPieceTokenizer.Config(
                 wordpiece_vocab_path=
                 "pytext/data/test/data/wordpiece_1k.txt"),
         ))
     tokens, segment_labels, seq_len = tensorizer.numberize(row)
     self.assertEqual(tokens, expected_tokens)
     self.assertEqual(segment_labels, expected_segment_labels)
     self.assertEqual(seq_len, len(expected_tokens))
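In the expected output, 101 and 102 line up with the [CLS] and [SEP] markers wrapping each sentence, and the segment labels switch from 0 to 1 at the second sentence. The numberized pair can then be batched with tensorize, as in the last example below (a sketch continuing from the snippet above):

    batch = [tensorizer.numberize(row)]
    tokens, pad_mask, segment_labels = tensorizer.tensorize(batch)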
Example #6
 class Config(BERTTensorizerBase.Config):
     tokenizer: Tokenizer.Config = WordPieceTokenizer.Config()
     vocab_file: str = WordPieceTokenizer.Config().wordpiece_vocab_path
Example #7
    def test_bert_tensorizer(self):
        sentence = "<SOS>  Focus Driving School Mulungushi bus station along Kasuba road, wamkopeka building.  Ndola,  Zambia."
        # expected result was obtained offline by running BertModelDataHandler
        expected = [
            101,
            133,
            278,
            217,
            135,
            175,
            287,
            766,
            462,
            100,
            379,
            182,
            459,
            334,
            459,
            280,
            504,
            462,
            425,
            283,
            171,
            462,
            567,
            474,
            180,
            262,
            217,
            459,
            931,
            262,
            913,
            117,
            192,
            262,
            407,
            478,
            287,
            744,
            263,
            478,
            262,
            560,
            119,
            183,
            282,
            287,
            843,
            117,
            195,
            262,
            407,
            931,
            566,
            119,
            102,
        ]
        row = {"text": sentence}
        tensorizer = BERTTensorizer.from_config(
            BERTTensorizer.Config(
                tokenizer=WordPieceTokenizer.Config(
                    wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
                )
            )
        )
        tokens, segment_label, seq_len = tensorizer.numberize(row)
        self.assertEqual(tokens, expected)
        self.assertEqual(seq_len, len(expected))
        self.assertEqual(segment_label, [0] * len(expected))

        tokens, pad_mask, segment_labels = tensorizer.tensorize(
            [(tokens, segment_label, seq_len)]
        )
        self.assertEqual(pad_mask[0].tolist(), [1] * len(expected))
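tensorize pads the whole batch to its longest sequence, so with rows of different lengths the shorter row's pad mask should trail off to zeros. A sketch continuing from the test above (the padding behaviour is an assumption, not asserted by the original test):

    short_row = tensorizer.numberize({"text": "Focus"})
    long_row = tensorizer.numberize({"text": sentence})
    tokens, pad_mask, segment_labels = tensorizer.tensorize([short_row, long_row])
    # Expect pad_mask[0] to end in 0s once "Focus" is padded out to the
    # length of the longer sentence.
    print(pad_mask.tolist())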