def test_roberta_tensorizer(self):
    text = "Prototype"
    tokens = [[0, 4, 5, 2]]
    pad_masks = [[1, 1, 1, 1]]
    segment_labels = [[0, 0, 0, 0]]
    positions = [[0, 1, 2, 3]]
    expected = [tokens, pad_masks, segment_labels, positions]

    tensorizer = RoBERTaTensorizer.from_config(
        RoBERTaTensorizer.Config(
            tokenizer=GPT2BPETokenizer.Config(
                bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
                bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
            ),
            vocab_file="pytext/data/test/data/gpt2_dict.txt",
            max_seq_len=256,
        )
    )
    # Eager path: numberize the raw row, then batch it into tensors.
    tensors = tensorizer.tensorize([tensorizer.numberize({"text": text})])
    for tensor, expect in zip(tensors, expected):
        self.assertEqual(tensor.tolist(), expect)

    # Scripted path: the ScriptImpl is built with DoNothingTokenizer because
    # it expects pre-tokenized input rather than raw text.
    tensorizer_impl = RoBERTaTensorizerScriptImpl(
        tokenizer=DoNothingTokenizer(),
        vocab=tensorizer.vocab,
        max_seq_len=tensorizer.max_seq_len,
    ).torchscriptify()
    per_sentence_tokens = [tensorizer.tokenizer.tokenize(text)]
    tokens_2d, segment_labels_2d, seq_lens_1d, positions_2d = zip(
        *[tensorizer_impl.numberize(per_sentence_tokens)]
    )
    script_tensors = tensorizer_impl.tensorize(
        tokens_2d, segment_labels_2d, seq_lens_1d, positions_2d
    )
    # The torchscriptified tensorizer must produce the same tensors as the
    # eager implementation above.
    for tensor, expect in zip(script_tensors, expected):
        self.assertEqual(tensor.tolist(), expect)
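# For readers of this test: the eager tensorizer and its ScriptImpl split
# responsibilities. The eager object numberizes raw row dicts, while the
# scripted object is constructed with DoNothingTokenizer and consumes
# pre-tokenized sentences. A minimal sketch of that parity check, assuming
# the same fixtures as test_roberta_tensorizer above; `assert_script_parity`
# is a hypothetical helper, not part of PyText.
def assert_script_parity(tensorizer, script_impl, text):
    # Eager path: raw row -> numberize -> tensorize.
    eager_tensors = tensorizer.tensorize([tensorizer.numberize({"text": text})])
    # Scripted path: tokenize first, then numberize the token lists.
    per_sentence_tokens = [tensorizer.tokenizer.tokenize(text)]
    tokens, segments, seq_len, positions = script_impl.numberize(per_sentence_tokens)
    script_tensors = script_impl.tensorize(
        [tokens], [segments], [seq_len], [positions]
    )
    # Both paths should agree tensor-for-tensor.
    for eager, scripted in zip(eager_tensors, script_tensors):
        assert eager.tolist() == scripted.tolist()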
def test_bert_tensorizer(self):
    sentence = (
        "<SOS> Focus Driving School Mulungushi bus station along "
        "Kasuba road, wamkopeka building. Ndola, Zambia."
    )
    # Expected result was obtained offline by running BertModelDataHandler.
    expected = [
        101, 133, 278, 217, 135, 175, 287, 766, 462, 100, 379, 182, 459,
        334, 459, 280, 504, 462, 425, 283, 171, 462, 567, 474, 180, 262,
        217, 459, 931, 262, 913, 117, 192, 262, 407, 478, 287, 744, 263,
        478, 262, 560, 119, 183, 282, 287, 843, 117, 195, 262, 407, 931,
        566, 119, 102,
    ]
    row = {"text": sentence}
    tensorizer = BERTTensorizer.from_config(
        BERTTensorizer.Config(
            tokenizer=WordPieceTokenizer.Config(
                wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
            )
        )
    )
    tensorizer_impl = BERTTensorizerScriptImpl(
        tokenizer=DoNothingTokenizer(),
        vocab=tensorizer.vocab,
        max_seq_len=tensorizer.max_seq_len,
    ).torchscriptify()

    # Eager path: numberize the raw row, then batch it into tensors.
    tokens, segment_label, seq_len, positions = tensorizer.numberize(row)
    self.assertEqual(tokens, expected)
    self.assertEqual(seq_len, len(expected))
    self.assertEqual(segment_label, [0] * len(expected))

    tokens, pad_mask, segment_labels, _ = tensorizer.tensorize(
        [(tokens, segment_label, seq_len, positions)]
    )
    self.assertEqual(pad_mask[0].tolist(), [1] * len(expected))

    # Scripted path: tokenize up front, then hand the token lists to the
    # torchscriptified implementation; results must match the eager path.
    per_sentence_tokens = [tensorizer.tokenizer.tokenize(sentence)]
    tokens, segment_label, seq_len, positions = tensorizer_impl.numberize(
        per_sentence_tokens
    )
    self.assertEqual(tokens, expected)
    self.assertEqual(seq_len, len(expected))
    self.assertEqual(segment_label, [0] * len(expected))

    tokens, pad_mask, segment_labels, _ = tensorizer_impl.tensorize(
        [tokens], [segment_label], [seq_len], [positions]
    )
    self.assertEqual(pad_mask[0].tolist(), [1] * len(expected))
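# The hard-coded ids in `expected` above came from an offline
# BertModelDataHandler run. A quick way to sanity-check individual ids against
# the same fixture (a sketch, assuming wordpiece_1k.txt follows the standard
# one-piece-per-line BERT vocab layout, where an id is the piece's line index;
# `lookup_wordpiece_id` is a hypothetical helper, not part of PyText):
def lookup_wordpiece_id(vocab_path, piece):
    # Scan the vocab file and return the line index of the given piece.
    with open(vocab_path, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            if line.strip() == piece:
                return idx
    return None

# Under that layout assumption, lookup_wordpiece_id(
#     "pytext/data/test/data/wordpiece_1k.txt", "[CLS]") would return 101 and
# "[SEP]" would return 102, matching the first and last ids in `expected`.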