def test_gpt2_bpe_tokenizer(self):
    text = "Prototype"
    expected = [Token("19703", 0, 4), Token("8690", 4, 9)]
    tokenizer = GPT2BPETokenizer.from_config(
        GPT2BPETokenizer.Config(
            bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
            bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
        )
    )
    tokens = tokenizer.tokenize(text)
    self.assertEqual(tokens, expected)
def test_squad_roberta_tensorizer(self):
    row = {
        "id": 0,
        "doc": "Prototype",
        "question": "otype",
        "answers": ["Prot"],
        "answer_starts": [0],
        "has_answer": True,
    }
    tensorizer = SquadForRoBERTaTensorizer.from_config(
        SquadForRoBERTaTensorizer.Config(
            tokenizer=GPT2BPETokenizer.Config(
                bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
                bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
            ),
            vocab_file="pytext/data/test/data/gpt2_dict.txt",
            max_seq_len=250,
        )
    )
    tokens, segments, seq_len, positions, start, end = tensorizer.numberize(row)
    # check against the manually verified answer position in the tokenized output
    self.assertEqual(start, [3])
    self.assertEqual(end, [3])
    self.assertEqual(len(tokens), seq_len)
    self.assertEqual(len(segments), seq_len)
def test_roberta_tensorizer(self):
    text = "Prototype"
    tokens = [[0, 4, 5, 2]]
    pad_masks = [[1, 1, 1, 1]]
    segment_labels = [[0, 0, 0, 0]]
    positions = [[0, 1, 2, 3]]
    expected = [tokens, pad_masks, segment_labels, positions]

    # eager path: numberize + tensorize should produce the expected tensors
    tensorizer = RoBERTaTensorizer.from_config(
        RoBERTaTensorizer.Config(
            tokenizer=GPT2BPETokenizer.Config(
                bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
                bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
            ),
            vocab_file="pytext/data/test/data/gpt2_dict.txt",
            max_seq_len=256,
        )
    )
    tensors = tensorizer.tensorize([tensorizer.numberize({"text": text})])
    for tensor, expect in zip(tensors, expected):
        self.assertEqual(tensor.tolist(), expect)

    # scripted path: the torchscriptified impl, fed pre-tokenized input,
    # should produce the same tensors
    tensorizer_impl = RoBERTaTensorizerScriptImpl(
        tokenizer=DoNothingTokenizer(),
        vocab=tensorizer.vocab,
        max_seq_len=tensorizer.max_seq_len,
    ).torchscriptify()
    per_sentence_tokens = [tensorizer.tokenizer.tokenize(text)]
    tokens_2d, segment_labels_2d, seq_lens_1d, positions_2d = zip(
        *[tensorizer_impl.numberize(per_sentence_tokens)]
    )
    script_tensors = tensorizer_impl.tensorize(
        tokens_2d, segment_labels_2d, seq_lens_1d, positions_2d
    )
    for tensor, expect in zip(script_tensors, expected):
        self.assertEqual(tensor.tolist(), expect)
def test_gpt2_bpe_tokenizer(self):
    tokenizer = GPT2BPETokenizer.from_config(
        GPT2BPETokenizer.Config(
            bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
            bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
        )
    )
    text_list = ["Prototype", " Prototype"]
    expected_list = [
        [Token("19703", 0, 4), Token("8690", 4, 9)],
        # a leading space becomes its own token ("220") with an empty span
        [Token("220", 0, 0), Token("19703", 1, 5), Token("8690", 5, 10)],
    ]
    for text, expected in zip(text_list, expected_list):
        tokens = tokenizer.tokenize(text)
        self.assertEqual(tokens, expected)
class Config(BERTTensorizerBase.Config):
    # any unittest should be overriding this with a small local file
    vocab_file: str = resources.roberta.GPT2_BPE_DICT
    tokenizer: Tokenizer.Config = GPT2BPETokenizer.Config()
    max_seq_len: int = 256
class Config(BERTTensorizerBase.Config):
    vocab_file: str = (
        "manifold://pytext_training/tree/static/vocabs/bpe/gpt2/dict.txt"
    )
    tokenizer: Tokenizer.Config = GPT2BPETokenizer.Config()
    max_seq_len: int = 256
class Config(Tensorizer.Config):
    columns: List[str] = ["text"]
    tokenizer: GPT2BPETokenizer.Config = GPT2BPETokenizer.Config()
    max_seq_len: int = 256
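As the comment on the first Config above notes, unit tests are expected to override the manifold-hosted vocab_file default with the small fixtures under pytext/data/test/data/. A minimal sketch of such an override, assuming the import paths below are where RoBERTaTensorizer and GPT2BPETokenizer are exported from:

# Sketch only: swap the production defaults for local test fixtures.
# Import paths are assumptions; adjust to the actual module layout.
from pytext.data.roberta_tensorizer import RoBERTaTensorizer
from pytext.data.tokenizers import GPT2BPETokenizer

tensorizer = RoBERTaTensorizer.from_config(
    RoBERTaTensorizer.Config(
        tokenizer=GPT2BPETokenizer.Config(
            bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
            bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
        ),
        # small local dict instead of the manifold:// default
        vocab_file="pytext/data/test/data/gpt2_dict.txt",
        max_seq_len=256,
    )
)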