Example #1
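Verifies that RoBERTaTensorizer numberizes and tensorizes a single text, and that a TorchScript version built from the same vocab (RoBERTaTensorizerScriptImpl) produces identical tensors.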
    # Runs inside a unittest.TestCase subclass. Likely imports (assumed from
    # pytext's module layout):
    #   from pytext.data.roberta_tensorizer import (
    #       RoBERTaTensorizer,
    #       RoBERTaTensorizerScriptImpl,
    #   )
    #   from pytext.data.tokenizers import DoNothingTokenizer, GPT2BPETokenizer
    def test_roberta_tensorizer(self):
        text = "Prototype"
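        # Expected outputs: 0 and 2 are the fairseq-style BOS/EOS ids that
        # RoBERTa adds around the BPE pieces of "Prototype" (4, 5 in the
        # test vocab).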
        tokens = [[0, 4, 5, 2]]
        pad_masks = [[1, 1, 1, 1]]
        segment_labels = [[0, 0, 0, 0]]
        positions = [[0, 1, 2, 3]]
        expected = [tokens, pad_masks, segment_labels, positions]

        tensorizer = RoBERTaTensorizer.from_config(
            RoBERTaTensorizer.Config(
                tokenizer=GPT2BPETokenizer.Config(
                    bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
                    bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
                ),
                vocab_file="pytext/data/test/data/gpt2_dict.txt",
                max_seq_len=256,
            )
        )
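        # Eager path: numberize one row, batch it, and compare every output
        # tensor (tokens, pad_mask, segment_labels, positions).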
        tensors = tensorizer.tensorize([tensorizer.numberize({"text": text})])
        for tensor, expect in zip(tensors, expected):
            self.assertEqual(tensor.tolist(), expect)

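        # TorchScript path: the scripted impl reuses the eager tensorizer's
        # vocab; input arrives pre-tokenized, so DoNothingTokenizer just
        # passes tokens through.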
        tensorizer_impl = RoBERTaTensorizerScriptImpl(
            tokenizer=DoNothingTokenizer(),
            vocab=tensorizer.vocab,
            max_seq_len=tensorizer.max_seq_len,
        ).torchscriptify()
        per_sentence_tokens = [tensorizer.tokenizer.tokenize(text)]
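        # numberize() returns one (tokens, segment_labels, seq_len, positions)
        # tuple; zip(*[...]) repacks each field into a one-element batch.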
        tokens_2d, segment_labels_2d, seq_lens_1d, positions_2d = zip(
            *[tensorizer_impl.numberize(per_sentence_tokens)])
        script_tensors = tensorizer_impl.tensorize(tokens_2d,
                                                   segment_labels_2d,
                                                   seq_lens_1d, positions_2d)
        for tensor, expect in zip(script_tensors, expected):
            self.assertEqual(tensor.tolist(), expect)
Example #2
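Verifies BERTTensorizer's WordPiece numberization against a precomputed token sequence, then checks that the scripted implementation (BERTTensorizerScriptImpl) reproduces the eager results, pad masks included.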
    # Runs inside a unittest.TestCase subclass. Likely imports (assumed from
    # pytext's module layout):
    #   from pytext.data.bert_tensorizer import (
    #       BERTTensorizer,
    #       BERTTensorizerScriptImpl,
    #   )
    #   from pytext.data.tokenizers import DoNothingTokenizer, WordPieceTokenizer
    def test_bert_tensorizer(self):
        sentence = "<SOS>  Focus Driving School Mulungushi bus station along Kasuba road, wamkopeka building.  Ndola,  Zambia."
        # expected result was obtained offline by running BertModelDataHandler
        expected = [
            101, 133, 278, 217, 135, 175, 287, 766, 462, 100,
            379, 182, 459, 334, 459, 280, 504, 462, 425, 283,
            171, 462, 567, 474, 180, 262, 217, 459, 931, 262,
            913, 117, 192, 262, 407, 478, 287, 744, 263, 478,
            262, 560, 119, 183, 282, 287, 843, 117, 195, 262,
            407, 931, 566, 119, 102,
        ]
        row = {"text": sentence}
        tensorizer = BERTTensorizer.from_config(
            BERTTensorizer.Config(
                tokenizer=WordPieceTokenizer.Config(
                    wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
                )
            )
        )
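        # Scripted impl shares the eager tensorizer's vocab; tokenization is
        # done beforehand, hence DoNothingTokenizer.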
        tensorizer_impl = BERTTensorizerScriptImpl(
            tokenizer=DoNothingTokenizer(),
            vocab=tensorizer.vocab,
            max_seq_len=tensorizer.max_seq_len,
        ).torchscriptify()

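        # Eager path: numberize one row and check tokens, length, and segment
        # labels against the precomputed expectation.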
        tokens, segment_label, seq_len, positions = tensorizer.numberize(row)
        self.assertEqual(tokens, expected)
        self.assertEqual(seq_len, len(expected))
        self.assertEqual(segment_label, [0] * len(expected))

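        # Batch the single row; nothing is padded, so the pad mask is all ones.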
        tokens, pad_mask, segment_labels, _ = tensorizer.tensorize(
            [(tokens, segment_label, seq_len, positions)]
        )
        self.assertEqual(pad_mask[0].tolist(), [1] * len(expected))

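        # TorchScript path: tokenize up front, then run the scripted
        # numberize/tensorize; results must match the eager path.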
        per_sentence_tokens = [tensorizer.tokenizer.tokenize(sentence)]
        tokens, segment_label, seq_len, positions = tensorizer_impl.numberize(
            per_sentence_tokens
        )
        self.assertEqual(tokens, expected)
        self.assertEqual(seq_len, len(expected))
        self.assertEqual(segment_label, [0] * len(expected))

        tokens, pad_mask, segment_labels, _ = tensorizer_impl.tensorize(
            [tokens], [segment_label], [seq_len], [positions]
        )
        self.assertEqual(pad_mask[0].tolist(), [1] * len(expected))