class InputConfig(ConfigBase):
            """Input configuration with separate right/left token and dense fields.

            Every field holds a tensorizer ``Config``; the two dense fields are
            optional and default to ``None``.
            """

            # Token inputs: each defaults to a default-constructed
            # RoBERTaTensorizer.Config.
            right_tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
            left_tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
            # Optional float-list inputs: None unless a config is supplied.
            right_dense: Optional[FloatListTensorizer.Config] = None
            left_dense: Optional[FloatListTensorizer.Config] = None

            # Label input: defaults to a default-constructed LabelTensorizer.Config.
            labels: LabelTensorizer.Config = LabelTensorizer.Config()
Exemple #2
0
    def test_roberta_tensorizer(self):
        """Compare RoBERTa tensorizer outputs with fixed expected values.

        The text "Prototype" is numberized and tensorized twice: once through
        ``RoBERTaTensorizer`` and once through the object returned by
        ``RoBERTaTensorizerScriptImpl(...).torchscriptify()``. Each resulting
        tensor is converted with ``tolist()`` and compared with the same
        expected nested lists.
        """
        text = "Prototype"
        # Expected values, in the order they are paired with the outputs:
        # tokens, pad masks, segment labels, positions.
        expected = [
            [[0, 4, 5, 2]],
            [[1, 1, 1, 1]],
            [[0, 0, 0, 0]],
            [[0, 1, 2, 3]],
        ]

        bpe_config = GPT2BPETokenizer.Config(
            bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
            bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
        )
        tensorizer_config = RoBERTaTensorizer.Config(
            tokenizer=bpe_config,
            vocab_file="pytext/data/test/data/gpt2_dict.txt",
            max_seq_len=256,
        )
        tensorizer = RoBERTaTensorizer.from_config(tensorizer_config)

        # First path: numberize one row, then tensorize a one-row batch.
        numberized_row = tensorizer.numberize({"text": text})
        direct_tensors = tensorizer.tensorize([numberized_row])
        for actual, wanted in zip(direct_tensors, expected):
            self.assertEqual(actual.tolist(), wanted)

        # Second path: the script implementation receives pre-tokenized input,
        # so it is built with a DoNothingTokenizer and fed the tokens produced
        # by the first tensorizer's tokenizer.
        script_impl = RoBERTaTensorizerScriptImpl(
            tokenizer=DoNothingTokenizer(),
            vocab=tensorizer.vocab,
            max_seq_len=tensorizer.max_seq_len,
        )
        tensorizer_impl = script_impl.torchscriptify()
        per_sentence_tokens = [tensorizer.tokenizer.tokenize(text)]
        numberized = tensorizer_impl.numberize(per_sentence_tokens)
        # Wrap each numberized component in a 1-tuple, i.e. a batch of one.
        tokens_2d, segment_labels_2d, seq_lens_1d, positions_2d = (
            (component,) for component in numberized
        )
        script_tensors = tensorizer_impl.tensorize(
            tokens_2d, segment_labels_2d, seq_lens_1d, positions_2d
        )
        for actual, wanted in zip(script_tensors, expected):
            self.assertEqual(actual.tolist(), wanted)
Exemple #3
0
 def __init__(
     self,
     columns: List[str] = Config.columns,
     vocab: Optional[Vocabulary] = None,
     tokenizer: Optional[Tokenizer] = None,
     max_seq_len: int = Config.max_seq_len,
 ):
     """Initialize by delegating to ``RoBERTaTensorizer.__init__``.

     All four arguments are forwarded unchanged, by keyword.
     """
     # The base initializer is named explicitly rather than reached through
     # ``super()``, so the call always targets RoBERTaTensorizer.
     base_kwargs = {
         "columns": columns,
         "vocab": vocab,
         "tokenizer": tokenizer,
         "max_seq_len": max_seq_len,
     }
     RoBERTaTensorizer.__init__(self, **base_kwargs)
 def __init__(
     self,
     columns: List[str] = Config.columns,
     vocab: Optional[Vocabulary] = None,
     tokenizer: Optional[Tokenizer] = None,
     max_seq_len: int = Config.max_seq_len,
     answers_column: str = Config.answers_column,
     answer_starts_column: str = Config.answer_starts_column,
 ):
     """Initialize via ``RoBERTaTensorizer.__init__`` and record answer columns.

     Args:
         columns: forwarded to ``RoBERTaTensorizer.__init__``.
         vocab: forwarded to the base initializer; may be ``None``.
         tokenizer: forwarded to the base initializer; may be ``None``.
         max_seq_len: forwarded to the base initializer.
         answers_column: stored on ``self.answers_column``.
         answer_starts_column: stored on ``self.answer_starts_column``.
     """
     RoBERTaTensorizer.__init__(
         self,
         columns=columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=max_seq_len,
     )
     self.answers_column = answers_column
     self.answer_starts_column = answer_starts_column
     # Assigned after the base initializer runs, so this value overrides
     # anything the base initializer may have set for the attribute.
     self.wrap_special_tokens = False
Exemple #5
0
 class RegressionModelInput(ConfigBase):
     """Input configuration with a ``tokens`` config and a numeric ``labels`` config."""

     # Both fields default to a default-constructed config of their annotated type.
     tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
     labels: NumericLabelTensorizer.Config = NumericLabelTensorizer.Config(
     )
 def _lookup_tokens(self, text):
     """Return the result of ``RoBERTaTensorizer._lookup_tokens`` for ``text``."""
     # Call the RoBERTaTensorizer implementation by name, passing this
     # instance through explicitly.
     looked_up = RoBERTaTensorizer._lookup_tokens(self, text)
     return looked_up
        class InputConfig(ConfigBase):
            """Input configuration with one token field and two optional dense fields.

            ``right_dense`` and ``left_dense`` default to ``None``, so they are
            annotated as ``Optional`` to match that default.
            """

            # Token input: defaults to a default-constructed RoBERTaTensorizer.Config.
            tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
            # Optional float-list inputs: None unless a config is supplied.
            right_dense: Optional[FloatListTensorizer.Config] = None
            left_dense: Optional[FloatListTensorizer.Config] = None

            # Label input: defaults to a default-constructed LabelTensorizer.Config.
            labels: LabelTensorizer.Config = LabelTensorizer.Config()
Exemple #8
0
 class InputConfig(ConfigBase):
     """Input configuration with a ``tokens`` config and a ``labels`` config."""

     # Both fields default to a default-constructed config of their annotated type.
     tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
     labels: LabelTensorizer.Config = LabelTensorizer.Config()