コード例 #1
0
 class ModelInput(_EncoderPairwiseModel.Config.ModelInput):
     """Input config for a pairwise encoder model: two text columns,
     each numberized by its own BERT tensorizer.
     """

     # One tensorizer per column; both truncate to 128 wordpieces.
     tokens1: BERTTensorizerBase.Config = BERTTensorizer.Config(
         columns=["text1"], max_seq_len=128
     )
     tokens2: BERTTensorizerBase.Config = BERTTensorizer.Config(
         columns=["text2"], max_seq_len=128
     )
コード例 #2
0
 class ModelInput(ModelInputBase):
     """Model input: two BERT-tokenized text columns, a label column,
     and a token-count tensorizer for metric reporting.
     """

     # One BERT tensorizer per text column, each capped at 128 wordpieces.
     tokens1: BERTTensorizerBase.Config = BERTTensorizer.Config(
         columns=["text1"], max_seq_len=128)
     tokens2: BERTTensorizerBase.Config = BERTTensorizer.Config(
         columns=["text2"], max_seq_len=128)
     labels: LabelTensorizer.Config = LabelTensorizer.Config()
     # for metric reporter: indexes=[2, 2] reads element 2 of each named
     # tensorizer's numberize output (seq_len, per the 3-tuple
     # (tokens, segment_labels, seq_len) returned by BERTTensorizer).
     num_tokens: NtokensTensorizer.Config = NtokensTensorizer.Config(
         names=["tokens1", "tokens2"], indexes=[2, 2])
コード例 #3
0
 class BertModelInput(BaseModel.Config.ModelInput):
     """Single-column BERT model input with optional dense features."""

     # No columns given: relies on BERTTensorizer.Config's default
     # column name — TODO confirm against the data source schema.
     tokens: BERTTensorizer.Config = BERTTensorizer.Config(max_seq_len=128)
     # Optional extra float-list features; disabled by default.
     dense: Optional[FloatListTensorizer.Config] = None
     labels: LabelTensorizer.Config = LabelTensorizer.Config()
     # for metric reporter: index 2 of the "tokens" numberize output
     # (seq_len in the (tokens, segment_labels, seq_len) tuple).
     num_tokens: NtokensTensorizer.Config = NtokensTensorizer.Config(
         names=["tokens"], indexes=[2]
     )
コード例 #4
0
 class Config(DocumentClassificationTask.Config):
     """Task config wiring a two-column BERT model to its metric reporter.

     NOTE(review): the tensorizer's ``columns`` and the metric reporter's
     ``text_column_names`` must stay in sync (both ["text1", "text2"]).
     """

     # One BERT tensorizer consumes both text columns (sentence pair).
     model: NewBertModel.Config = NewBertModel.Config(
         inputs=NewBertModel.Config.BertModelInput(
             tokens=BERTTensorizer.Config(columns=["text1", "text2"],
                                          max_seq_len=128)))
     metric_reporter: ClassificationMetricReporter.Config = (
         ClassificationMetricReporter.Config(
             text_column_names=["text1", "text2"]))
コード例 #5
0
ファイル: tensorizers_test.py プロジェクト: twild-fb/pytext
 def test_bert_pair_tensorizer(self):
     """Numberizing a two-column row concatenates both texts' wordpiece
     ids, with segment label 0 for the first text (and its separators)
     and 1 for the second.
     """
     row = {"text1": "Focus", "text2": "Driving School"}
     want_tokens = [101, 175, 287, 766, 462, 102, 100, 379, 102]
     want_segments = [0] * 6 + [1] * 3
     config = BERTTensorizer.Config(
         columns=["text1", "text2"],
         tokenizer=WordPieceTokenizer.Config(
             wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
         ),
     )
     tensorizer = BERTTensorizer.from_config(config)
     got_tokens, got_segments, got_len = tensorizer.numberize(row)
     self.assertEqual(got_tokens, want_tokens)
     self.assertEqual(got_segments, want_segments)
     self.assertEqual(got_len, len(want_tokens))
コード例 #6
0
 class BertModelInput(_EncoderBaseModel.Config.ModelInput):
     """Minimal BERT input: a single tokens tensorizer, max 128 wordpieces."""

     tokens: BERTTensorizer.Config = BERTTensorizer.Config(max_seq_len=128)
コード例 #7
0
    def test_bert_tensorizer(self):
        """End-to-end check of BERTTensorizer on one sentence.

        ``numberize`` must reproduce the reference wordpiece ids (obtained
        offline by running BertModelDataHandler), all-zero segment labels,
        and the matching sequence length; ``tensorize`` on a single-row
        batch must then yield an all-ones pad mask (nothing is padded).
        """
        text = "<SOS>  Focus Driving School Mulungushi bus station along Kasuba road, wamkopeka building.  Ndola,  Zambia."
        # Reference ids from BertModelDataHandler, flattened for brevity.
        want = [
            101, 133, 278, 217, 135, 175, 287, 766, 462, 100, 379,
            182, 459, 334, 459, 280, 504, 462, 425, 283, 171, 462,
            567, 474, 180, 262, 217, 459, 931, 262, 913, 117, 192,
            262, 407, 478, 287, 744, 263, 478, 262, 560, 119, 183,
            282, 287, 843, 117, 195, 262, 407, 931, 566, 119, 102,
        ]
        tensorizer = BERTTensorizer.from_config(
            BERTTensorizer.Config(
                tokenizer=WordPieceTokenizer.Config(
                    wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
                )
            )
        )
        got_tokens, got_segments, got_len = tensorizer.numberize({"text": text})
        self.assertEqual(got_tokens, want)
        self.assertEqual(got_len, len(want))
        self.assertEqual(got_segments, [0] * len(want))

        # Batch of one: the pad mask row should be all ones.
        batch_tokens, pad_mask, batch_segments = tensorizer.tensorize(
            [(got_tokens, got_segments, got_len)]
        )
        self.assertEqual(pad_mask[0].tolist(), [1] * len(want))
コード例 #8
0
ファイル: masked_lm.py プロジェクト: puffythecat/pytext
 class InputConfig(ConfigBase):
     """Input config (from masked_lm.py): one BERT tensorizer with
     default columns, truncated to 128 wordpieces.
     """

     tokens: BERTTensorizer.Config = BERTTensorizer.Config(
         max_seq_len=128)
コード例 #9
0
 class InputConfig(ConfigBase):
     """Input config pairing two text columns into a single BERT
     tensorizer, with numeric (regression-style) labels.
     """

     # Both columns feed one tensorizer, i.e. a sentence-pair encoding.
     tokens: BERTTensorizer.Config = BERTTensorizer.Config(
         columns=["text1", "text2"], max_seq_len=128
     )
     labels: NumericLabelTensorizer.Config = NumericLabelTensorizer.Config()