コード例 #1
0
ファイル: tensorizers_test.py プロジェクト: kwikBioInc/pytext
    def test_create_label_tensors(self):
        """Check ``numberize`` on two expected labels and one rejected label.

        The tensorizer is initialized by sending it every row of
        ``self.data.train``; the test then pins the indices 6 and 1 for the
        first two labels and expects an exception for the third.
        """
        label_tensorizer = LabelTensorizer(column="label")
        initializer = label_tensorizer.initialize()
        initializer.send(None)  # kick off the initializer before sending rows
        for train_row in self.data.train:
            initializer.send(train_row)
        initializer.close()

        find_row = {"label": types.Label("weather/find")}
        alarm_row = {"label": types.Label("alarm/set_alarm")}
        missing_row = {"label": types.Label("non/existent")}

        self.assertEqual(6, label_tensorizer.numberize(find_row))
        self.assertEqual(1, label_tensorizer.numberize(alarm_row))
        # "non/existent" is expected to be rejected with an exception.
        with self.assertRaises(Exception):
            label_tensorizer.numberize(missing_row)
コード例 #2
0
 def test_initialize_label_tensorizer(self):
     """Send every training row to the tensorizer and expect 7 labels."""
     label_tensorizer = LabelTensorizer(column="label")
     initializer = label_tensorizer.initialize()
     initializer.send(None)  # kick off the initializer before sending rows
     for train_row in self.data.train:
         initializer.send(train_row)
     initializer.close()
     label_count = len(label_tensorizer.labels)
     self.assertEqual(7, label_count)
コード例 #3
0
ファイル: output_layer_test.py プロジェクト: a-domingu/tbcnn
 def test_create_word_tagging_output_layer(self):
     """Check that configured label weights end up on the layer's loss function.

     Only "foo" is given a weight (2.2); the pinned weight vector is [2.2, 1].
     """
     label_tensorizer = LabelTensorizer()
     label_tensorizer.vocab = Vocabulary(["foo", "bar"])
     label_tensorizer.pad_idx = 0

     layer_config = WordTaggingOutputLayer.Config(label_weights={"foo": 2.2})
     output_layer = WordTaggingOutputLayer.from_config(
         config=layer_config, labels=label_tensorizer.vocab
     )

     expected_weights = np.array([2.2, 1])
     actual_weights = output_layer.loss_fn.weight.detach().numpy()
     np.testing.assert_array_almost_equal(expected_weights, actual_weights)
コード例 #4
0
    def test_create_label_tensors_fails_with_unknown_label(self):
        """Expect ``create_training_tensors`` to raise for a batch holding "non/existent"."""
        label_tensorizer = LabelTensorizer(column="label")
        initializer = label_tensorizer.initialize()
        initializer.send(None)  # kick off the initializer before sending rows
        for train_row in self.data.train:
            initializer.send(train_row)
        initializer.close()

        label_names = ("non/existent", "alarm/set_alarm")
        batch = [{"label": types.Label(name)} for name in label_names]

        with self.assertRaises(Exception):
            label_tensorizer.create_training_tensors(batch)
コード例 #5
0
    def test_create_label_tensors(self):
        """Check the size and contents of the tensor built from a two-row batch.

        After initializing from ``self.data.train``, the batch is expected to
        produce a tensor of size ``(2,)`` holding ``[6, 1]``.
        """
        label_tensorizer = LabelTensorizer(column="label")
        initializer = label_tensorizer.initialize()
        initializer.send(None)  # kick off the initializer before sending rows
        for train_row in self.data.train:
            initializer.send(train_row)
        initializer.close()

        label_names = ("weather/find", "alarm/set_alarm")
        batch = [{"label": types.Label(name)} for name in label_names]

        label_tensor = label_tensorizer.create_training_tensors(batch)
        self.assertEqual((2,), label_tensor.size())
        self.assertEqual([6, 1], label_tensor.tolist())
コード例 #6
0
        class InputConfig(ConfigBase):
            """Input tensorizer configs with separate ``right_*`` and ``left_*`` entries.

            Declares a RoBERTa token tensorizer config and a dense float-list
            tensorizer config for each of the two sides (the dense ones are
            unset by default), plus one label tensorizer config.
            """

            # Token tensorizer configs, one per side, both with default settings.
            right_tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
            left_tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
            # Dense feature configs, one per side; None unless configured.
            right_dense: Optional[FloatListTensorizer.Config] = None
            left_dense: Optional[FloatListTensorizer.Config] = None

            # Label tensorizer config with default settings.
            labels: LabelTensorizer.Config = LabelTensorizer.Config()
コード例 #7
0
 class EncoderModelInput(BaseModel.Config.ModelInput):
     """Model input config: token, dense, label and token-count tensorizer configs."""

     # Token tensorizer config, using the Tensorizer.Config defaults.
     tokens: Tensorizer.Config = Tensorizer.Config()
     # Dense float-list tensorizer config; None unless configured.
     dense: Optional[FloatListTensorizer.Config] = None
     # Label tensorizer config with default settings.
     labels: LabelTensorizer.Config = LabelTensorizer.Config()
     # for metric reporter
     # NOTE(review): names/indexes presumably select which output of the
     # "tokens" tensorizer holds the token count — confirm against
     # NtokensTensorizer.
     num_tokens: NtokensTensorizer.Config = NtokensTensorizer.Config(
         names=["tokens"], indexes=[2])
コード例 #8
0
 class EncoderPairwiseModelInput(ModelInputBase):
     """Model input config for two token inputs, labels and a token-count entry."""

     # Two token tensorizer configs, both using the Tensorizer.Config defaults.
     tokens1: Tensorizer.Config = Tensorizer.Config()
     tokens2: Tensorizer.Config = Tensorizer.Config()
     # Label tensorizer config with default settings.
     labels: LabelTensorizer.Config = LabelTensorizer.Config()
     # for metric reporter
     # NOTE(review): names/indexes presumably select which output of each
     # token tensorizer holds the token count — confirm against
     # NtokensTensorizer.
     num_tokens: NtokensTensorizer.Config = NtokensTensorizer.Config(
         names=["tokens1", "tokens2"], indexes=[2, 2])
コード例 #9
0
ファイル: output_layer_test.py プロジェクト: a-domingu/tbcnn
    def test_doc_classification_output_layer(self):
        """Check the loss ``ignore_index`` for vocabularies with and without PAD.

        With ``SpecialTokens.PAD`` as the first vocabulary entry the pinned
        ``ignore_index`` is 0; without it the pinned value is -1.
        """
        label_tensorizer = LabelTensorizer()

        # Each case: (vocabulary entries, expected ignore_index).
        cases = (
            ([SpecialTokens.PAD, "foo", "bar"], 0),
            (["foo", "bar"], -1),  # use default pad
        )
        for vocab_entries, expected_ignore_index in cases:
            label_tensorizer.vocab = Vocabulary(vocab_entries)
            layer_config = ClassificationOutputLayer.Config(
                loss=CrossEntropyLoss.Config()
            )
            output_layer = ClassificationOutputLayer.from_config(
                config=layer_config,
                labels=label_tensorizer.vocab,
            )
            self.assertEqual(output_layer.loss_fn.ignore_index, expected_ignore_index)
コード例 #10
0
    def test_batch_predict_caffe2_model(self):
        """Compare Caffe2 batch predictions with the task's own predictions.

        Builds a document classification task from the tiny TSV fixtures,
        exports it to a temporary Caffe2 model file and saves a snapshot, then
        runs ``batch_predict_caffe2_model`` with the default cache size, with
        ``cache_size=2`` and with ``cache_size=-1``. Each run must return 4
        results whose scores match the task's predictions.
        """
        with tempfile.NamedTemporaryFile() as snapshot_file, \
                tempfile.NamedTemporaryFile() as caffe2_model_file:
            train_data = tests_module.test_file("train_data_tiny.tsv")
            eval_data = tests_module.test_file("test_data_tiny.tsv")

            model_config = DocModel.Config(
                inputs=DocModel.Config.ModelInput(
                    tokens=TokenTensorizer.Config(),
                    dense=FloatListTensorizer.Config(
                        column="dense", dim=1, error_check=True
                    ),
                    labels=LabelTensorizer.Config(),
                )
            )
            data_config = Data.Config(
                source=TSVDataSource.Config(
                    train_filename=train_data,
                    eval_filename=eval_data,
                    test_filename=eval_data,
                    field_names=["label", "slots", "text", "dense"],
                )
            )
            config = PyTextConfig(
                task=DocumentClassificationTask.Config(
                    model=model_config,
                    data=data_config,
                ),
                version=21,
                save_snapshot_path=snapshot_file.name,
                export_caffe2_path=caffe2_model_file.name,
            )

            task = create_task(config.task)
            task.export(task.model, caffe2_model_file.name)
            save(config, task.model, meta=None, tensorizers=task.data.tensorizers)

            pt_results = task.predict(task.data.data_source.test)

            def assert_caffe2_results_correct(caffe2_results):
                # Pair each task prediction with the Caffe2 result at the same
                # position and compare their scores.
                for pt_res, res in zip(pt_results, caffe2_results):
                    expected_scores = pt_res["score"].tolist()[0]
                    actual_scores = [score[0] for score in res.values()]
                    np.testing.assert_array_almost_equal(
                        expected_scores, actual_scores
                    )

            # Default cache size first, then the two explicit cache sizes.
            for extra_kwargs in ({}, {"cache_size": 2}, {"cache_size": -1}):
                results = batch_predict_caffe2_model(
                    snapshot_file.name, caffe2_model_file.name, **extra_kwargs
                )
                self.assertEqual(4, len(results))
                assert_caffe2_results_correct(results)
コード例 #11
0
 class ModelInput(Model.Config.ModelInput):
     """Model input config with word-level and document-level labels and weights."""

     # Token tensorizer config with default settings.
     tokens: TokenTensorizer.Config = TokenTensorizer.Config()
     # Slot label config for word-level labels; allow_unknown is enabled.
     word_labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config(
         allow_unknown=True)
     # Label config for document-level labels; allow_unknown is enabled.
     doc_labels: LabelTensorizer.Config = LabelTensorizer.Config(
         allow_unknown=True)
     # Float tensorizer configs for the weights; None unless configured.
     doc_weight: Optional[FloatTensorizer.Config] = None
     word_weight: Optional[FloatTensorizer.Config] = None
コード例 #12
0
 class BertModelInput(BaseModel.Config.ModelInput):
     """Model input config: BERT token, dense, label and token-count tensorizer configs."""

     # BERT token tensorizer config with max_seq_len set to 128.
     tokens: BERTTensorizer.Config = BERTTensorizer.Config(max_seq_len=128)
     # Dense float-list tensorizer config; None unless configured.
     dense: Optional[FloatListTensorizer.Config] = None
     # Label tensorizer config with default settings.
     labels: LabelTensorizer.Config = LabelTensorizer.Config()
     # for metric reporter
     # NOTE(review): names/indexes presumably select which output of the
     # "tokens" tensorizer holds the token count — confirm against
     # NtokensTensorizer.
     num_tokens: NtokensTensorizer.Config = NtokensTensorizer.Config(
         names=["tokens"], indexes=[2]
     )
コード例 #13
0
 class ModelInput(BasePairwiseModel.Config.ModelInput):
     """Pairwise model input config reading the "text1" and "text2" columns."""

     # One token tensorizer config per text column.
     tokens1: TokenTensorizer.Config = TokenTensorizer.Config(column="text1")
     tokens2: TokenTensorizer.Config = TokenTensorizer.Config(column="text2")
     # Label tensorizer config with default settings.
     labels: LabelTensorizer.Config = LabelTensorizer.Config()
     # for metric reporter
     raw_text: JoinStringTensorizer.Config = JoinStringTensorizer.Config(
         columns=["text1", "text2"]
     )
コード例 #14
0
    def test_create_label_tensors(self):
        """Check ``numberize`` on two expected labels and one rejected label.

        The tensorizer is prepared via ``self._initialize_tensorizer``; the
        test then pins the indices 6 and 1 for the first two labels and
        expects an exception for "non/existent".
        """
        label_tensorizer = LabelTensorizer(label_column="label")
        self._initialize_tensorizer(label_tensorizer)

        find_row = {"label": "weather/find"}
        alarm_row = {"label": "alarm/set_alarm"}
        missing_row = {"label": "non/existent"}

        self.assertEqual(6, label_tensorizer.numberize(find_row))
        self.assertEqual(1, label_tensorizer.numberize(alarm_row))
        # "non/existent" is expected to be rejected with an exception.
        with self.assertRaises(Exception):
            label_tensorizer.numberize(missing_row)
コード例 #15
0
 class ModelInput(BaseModel.Config.ModelInput):
     """Model input config: a SQuAD tensorizer config and a "has_answer" label config."""

     # Accepts either the BERT or the RoBERTa SQuAD tensorizer config; the
     # default is the BERT variant with max_seq_len=256.
     squad_input: Union[
         SquadForBERTTensorizer.Config, SquadForRoBERTaTensorizer.Config
     ] = SquadForBERTTensorizer.Config(max_seq_len=256)
     # is_impossible label
     # Read from the "has_answer" column.
     has_answer: LabelTensorizer.Config = LabelTensorizer.Config(
         column="has_answer"
     )
コード例 #16
0
 def test_initialize_tensorizers(self):
     """Initialize word, label and character tensorizers together and check sizes.

     After ``initialize_tensorizers`` runs over ``self.data.train`` the token
     vocabulary must hold 49 entries and the label list 7.
     """
     by_name = dict(
         tokens=WordTensorizer(column="text"),
         labels=LabelTensorizer(column="label"),
         chars=CharacterTensorizer(column="text"),
     )
     initialize_tensorizers(by_name, self.data.train)

     token_vocab_size = len(by_name["tokens"].vocab)
     label_count = len(by_name["labels"].labels)
     self.assertEqual(49, token_vocab_size)
     self.assertEqual(7, label_count)
コード例 #17
0
 def test_initialize_tensorizers(self):
     """Initialize token, label and byte tensorizers together and check sizes.

     After ``initialize_tensorizers`` runs over ``self.data.train`` the token
     vocabulary must hold 49 entries and the label vocabulary 7.
     """
     by_name = dict(
         tokens=TokenTensorizer(text_column="text"),
         labels=LabelTensorizer(label_column="label"),
         chars=ByteTensorizer(text_column="text"),
     )
     initialize_tensorizers(by_name, self.data.train)

     token_vocab_size = len(by_name["tokens"].vocab)
     label_vocab_size = len(by_name["labels"].vocab)
     self.assertEqual(49, token_vocab_size)
     self.assertEqual(7, label_vocab_size)
コード例 #18
0
 class ModelInput(ModelInputBase):
     """Pairwise BERT input config reading the "text1" and "text2" columns."""

     # One BERT tensorizer config per text column, each with max_seq_len=128.
     tokens1: BERTTensorizerBase.Config = BERTTensorizer.Config(
         columns=["text1"], max_seq_len=128)
     tokens2: BERTTensorizerBase.Config = BERTTensorizer.Config(
         columns=["text2"], max_seq_len=128)
     # Label tensorizer config with default settings.
     labels: LabelTensorizer.Config = LabelTensorizer.Config()
     # for metric reporter
     # NOTE(review): names/indexes presumably select which output of each
     # token tensorizer holds the token count — confirm against
     # NtokensTensorizer.
     num_tokens: NtokensTensorizer.Config = NtokensTensorizer.Config(
         names=["tokens1", "tokens2"], indexes=[2, 2])
コード例 #19
0
 class ModelInput(Model.Config.ModelInput):
     """Model input config with word-level and document-level labels and weights."""

     # Token tensorizer config with default settings.
     tokens: TokenTensorizer.Config = TokenTensorizer.Config()
     # Slot label config for word-level labels; allow_unknown is enabled.
     word_labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config(
         allow_unknown=True)
     # Label config for document-level labels; allow_unknown is enabled.
     doc_labels: LabelTensorizer.Config = LabelTensorizer.Config(
         allow_unknown=True)
     # Weight configs read from the "doc_weight" and "word_weight" columns.
     doc_weight: FloatTensorizer.Config = FloatTensorizer.Config(
         column="doc_weight")
     word_weight: FloatTensorizer.Config = FloatTensorizer.Config(
         column="word_weight")
コード例 #20
0
ファイル: data_test.py プロジェクト: westinedu/pytext
 def test_data_initializes_tensorsizers(self):
     """Check that constructing ``Data`` initializes the tensorizers it is given.

     Before ``Data`` is built the token tensorizer's ``vocab`` is expected to
     be ``None``; afterwards the token vocabulary must hold 49 entries and the
     label vocabulary 7.
     """
     tensorizers = {
         "tokens": TokenTensorizer(text_column="text"),
         "labels": LabelTensorizer(label_column="label"),
     }
     # Verify TokenTensorizer isn't in an initialized state yet. A unittest
     # assertion is used instead of a bare ``assert`` so the check still runs
     # under ``python -O`` and matches the other assertions in this test.
     self.assertIsNone(tensorizers["tokens"].vocab)
     Data(self.data_source, tensorizers)
     # Tensorizers should have been initialized
     self.assertEqual(49, len(tensorizers["tokens"].vocab))
     self.assertEqual(7, len(tensorizers["labels"].vocab))
コード例 #21
0
ファイル: data_test.py プロジェクト: yuxuan2015/pytext
 def test_data_initializes_tensorsizers(self):
     """Check that constructing ``Data`` initializes the tensorizers it is given.

     Reading ``vocab`` on the word tensorizer must raise ``AttributeError``
     before ``Data`` is built; afterwards the token vocabulary must hold 49
     entries and the label list 7.
     """
     by_name = dict(
         tokens=WordTensorizer(column="text"),
         labels=LabelTensorizer(column="label"),
     )
     # verify WordTensorizer isn't in an initialized state yet
     with self.assertRaises(AttributeError):
         by_name["tokens"].vocab
     Data(self.data_source, by_name)
     # Tensorizers should have been initialized
     token_vocab_size = len(by_name["tokens"].vocab)
     self.assertEqual(49, token_vocab_size)
     label_count = len(by_name["labels"].labels)
     self.assertEqual(7, label_count)
コード例 #22
0
    def setUp(self):
        """Create the TSV data source and the tensorizer mapping stored on the test case."""
        train_wrapper = SafeFileWrapper(
            tests_module.test_file("train_dense_features_tiny.tsv")
        )
        test_wrapper = SafeFileWrapper(
            tests_module.test_file("test_dense_features_tiny.tsv")
        )
        column_names = ["label", "slots", "text", "dense"]
        column_types = {"text": types.Text, "label": types.Label}
        self.data_source = TSVDataSource(
            train_wrapper,
            test_wrapper,
            eval_file=None,
            field_names=column_names,
            schema=column_types,
        )

        word_tensorizer = WordTensorizer(column="text")
        label_tensorizer = LabelTensorizer(column="label", allow_unknown=True)
        self.tensorizers = {"tokens": word_tensorizer, "labels": label_tensorizer}
コード例 #23
0
 class ModelInput(BasePairwiseModel.Config.ModelInput):
     """Pairwise model input config reading the "text1" and "text2" columns."""

     # One token tensorizer config per text column.
     tokens1: TokenTensorizer.Config = TokenTensorizer.Config(
         column="text1")
     tokens2: TokenTensorizer.Config = TokenTensorizer.Config(
         column="text2")
     # Label tensorizer config with default settings.
     labels: LabelTensorizer.Config = LabelTensorizer.Config()
コード例 #24
0
        class InputConfig(ConfigBase):
            """Input tensorizer configs: RoBERTa tokens, two dense entries and labels."""

            # Token tensorizer config with default settings.
            tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
            # Dense feature configs; both default to None.
            # NOTE(review): these are annotated as FloatListTensorizer.Config
            # (not Optional[...]) yet default to None — confirm the config
            # loader accepts this, or consider an Optional annotation.
            right_dense: FloatListTensorizer.Config = None
            left_dense: FloatListTensorizer.Config = None

            # Label tensorizer config with default settings.
            labels: LabelTensorizer.Config = LabelTensorizer.Config()
コード例 #25
0
 class ModelInput(Model.Config.ModelInput):
     """Model input config: word tokens, labels and the raw "text" column."""

     # Word tensorizer config with default settings.
     tokens: WordTensorizer.Config = WordTensorizer.Config()
     # Label tensorizer config; allow_unknown is enabled.
     labels: LabelTensorizer.Config = LabelTensorizer.Config(
         allow_unknown=True)
     # for metric reporter
     raw_text: MetaInput.Config = MetaInput.Config(column="text")
コード例 #26
0
 class ModelInput(Model.Config.ModelInput):
     """Model input config: token and label tensorizer configs, both with defaults."""

     tokens: TokenTensorizer.Config = TokenTensorizer.Config()
     labels: LabelTensorizer.Config = LabelTensorizer.Config()
コード例 #27
0
 class ModelInput(Model.Config.ModelInput):
     """Model input config: token, dense and label tensorizer configs."""

     # Token tensorizer config with default settings.
     tokens: TokenTensorizer.Config = TokenTensorizer.Config()
     # Dense float-list tensorizer config; None unless configured.
     dense: Optional[FloatListTensorizer.Config] = None
     # Label tensorizer config with default settings.
     labels: LabelTensorizer.Config = LabelTensorizer.Config()
コード例 #28
0
 def test_initialize_label_tensorizer(self):
     """Initialize a label tensorizer via the test helper and expect a vocab of 7."""
     label_tensorizer = LabelTensorizer(label_column="label")
     self._initialize_tensorizer(label_tensorizer)
     vocab_size = len(label_tensorizer.vocab)
     self.assertEqual(7, vocab_size)
コード例 #29
0
 class ModelInput(Model.Config.ModelInput):
     """Model input config: tokens, labels and the raw "text" column."""

     # Token tensorizer config with default settings.
     tokens: TokenTensorizer.Config = TokenTensorizer.Config()
     # Label tensorizer config; allow_unknown is enabled.
     labels: LabelTensorizer.Config = LabelTensorizer.Config(allow_unknown=True)
     # for metric reporter
     raw_text: RawString.Config = RawString.Config(column="text")
コード例 #30
0
ファイル: roberta.py プロジェクト: shadowkun/pytext
 class InputConfig(ConfigBase):
     """Input tensorizer configs: RoBERTa tokens and labels, both with defaults."""

     tokens: RoBERTaTensorizer.Config = RoBERTaTensorizer.Config()
     labels: LabelTensorizer.Config = LabelTensorizer.Config()