Example #1
def get_tensorizers(add_dict_feat=False, add_contextual_feat=False):
    schema = {"source_sequence": str, "dict_feat": Gazetteer, "target_sequence": str}
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=TEST_FILE_NAME,
            field_names=["source_sequence", "dict_feat", "target_sequence"],
        ),
        schema,
    )
    src_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="source_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tgt_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="target_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tensorizers = {"src_seq_tokens": src_tensorizer, "trg_seq_tokens": tgt_tensorizer}
    initialize_tensorizers(tensorizers, data_source.train)

    if add_dict_feat:
        tensorizers["dict_feat"] = GazetteerTensorizer.from_config(
            GazetteerTensorizer.Config(
                text_column="source_sequence", dict_column="dict_feat"
            )
        )
        initialize_tensorizers(
            {"dict_feat": tensorizers["dict_feat"]}, data_source.train
        )
    return tensorizers
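A minimal usage sketch for the helper above (a hypothetical call; only names defined in this example are used):

tensorizers = get_tensorizers(add_dict_feat=True)
# Expect "src_seq_tokens" and "trg_seq_tokens" (TokenTensorizers with BOS/EOS),
# plus "dict_feat", a GazetteerTensorizer initialized from the same train split.
print(sorted(tensorizers))  # ['dict_feat', 'src_seq_tokens', 'trg_seq_tokens']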
Example #2
    def test_read_data_source_with_utf8_issues(self):
        schema = {"text": str, "label": str}
        data_source = TSVDataSource.from_config(
            TSVDataSource.Config(
                train_filename=tests_module.test_file("test_utf8_errors.tsv"),
                field_names=["label", "text"],
            ),
            schema,
        )
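        # Iterating the source is what actually reads and decodes the file; the
        # test passes as long as no decoding error escapes.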
        list(data_source.train)
Example #3
    def test_load_saved_model(self):
        with tempfile.NamedTemporaryFile() as snapshot_file:
            train_data = tests_module.test_file("train_data_tiny.tsv")
            eval_data = tests_module.test_file("test_data_tiny.tsv")
            config = PyTextConfig(
                task=DocumentClassificationTask.Config(data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    ))),
                version=LATEST_VERSION,
                save_snapshot_path=snapshot_file.name,
            )
            task = create_task(config.task)
            model = task.model

            save(config, model, meta=None, tensorizers=task.data.tensorizers)
            task2, config2, training_state_none = load(snapshot_file.name)
            self.assertEqual(config.export, config2.export)
            self.assertEqual(config.export_list, config2.export_list)
            self.assertEqual(config.task, config2.task)
            self.assertEqual(config, config2)
            self.assertModulesEqual(model, task2.model)
            self.assertIsNone(training_state_none)
            model.eval()
            task2.model.eval()

            inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
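            # A single dummy batch: token ids with the corresponding sequence length.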
            self.assertEqual(
                model(*inputs).tolist(),
                task2.model(*inputs).tolist())
Example #4
    def test_seq_tensor_with_bos_eos_eol_bol(self):
        tensorizer = SeqTokenTensorizer(
            add_bos_token=True,
            add_eos_token=True,
            add_bol_token=True,
            add_eol_token=True,
        )

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_seq_features.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["text_seq"],
            schema={"text_seq": List[str]},
        )

        self._initialize_tensorizer(tensorizer, data)
        # UNK + PAD + BOS + EOS + BOL + EOL + 6 tokens
        self.assertEqual(12, len(tensorizer.vocab))

        # only one row in test file:
        # ["where do you wanna meet?", "MPK"]
        for row in data.train:
            idx, lens = tensorizer.numberize(row)
            self.assertEqual(4, lens)
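            # In the expected indices: 1 pads short rows, 2/3 are the BOS/EOS
            # wrapped around every row, and 4/5 are the begin-of-list and
            # end-of-list tokens, each forming a row of its own.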
            self.assertEqual(
                [
                    [2, 4, 3, 1, 1, 1, 1],
                    [2, 6, 7, 8, 9, 10, 3],
                    [2, 11, 3, 1, 1, 1, 1],
                    [2, 5, 3, 1, 1, 1, 1],
                ],
                idx,
            )
Example #5
    def test_seq_tensor(self):
        tensorizer = SeqTokenTensorizer()

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_seq_features.tsv")
            ),
            test_file=None,
            eval_file=None,
            field_names=["text_seq"],
            schema={"text_seq": List[str]},
        )

        self._initialize_tensorizer(tensorizer, data)
        # UNK + PAD + 6 tokens
        self.assertEqual(8, len(tensorizer.vocab))

        # only one row in test file:
        # ["where do you wanna meet?", "MPK"]
        for row in data.train:
            tokens, token_lens, seq_lens = tensorizer.prepare_input(row)
            idx, sentence_lens, lens = tensorizer.numberize(row)
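            # numberize returns padded token ids per sentence, the token count
            # of each sentence, and the number of sentences in the sequence.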
            self.assertEqual(2, lens)
            self.assertEqual([[2, 3, 4, 5, 6], [7, 1, 1, 1, 1]], idx)
            self.assertEqual([5, 1], sentence_lens)
            self.assertEqual(2, seq_lens)
            self.assertEqual(
                [
                    ["where", "do", "you", "wanna", "meet?"],
                    ["mpk", "__PAD__", "__PAD__", "__PAD__", "__PAD__"],
                ],
                tokens,
            )
Example #6
    def test_seq_tensor(self):
        tensorizer = SeqTokenTensorizer()

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_seq_features.tsv")
            ),
            test_file=None,
            eval_file=None,
            field_names=["text_seq"],
            schema={"text_seq": List[str]},
        )

        init = tensorizer.initialize()
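        # initialize() is a coroutine: send(None) primes it, each send(row)
        # feeds one example into vocab building, and close() ends the pass.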
        init.send(None)  # kick
        for row in data.train:
            init.send(row)
        init.close()
        # UNK + PAD + 6 tokens
        self.assertEqual(8, len(tensorizer.vocab))

        # only one row in test file:
        # ["where do you wanna meet?", "MPK"]
        for row in data.train:
            idx, lens = tensorizer.numberize(row)
            self.assertEqual(2, lens)
            self.assertEqual([[2, 3, 4, 5, 6], [7, 1, 1, 1, 1]], idx)
Example #7
    def test_seq_tensor_pad_batch(self):
        tensorizer = SeqTokenTensorizer()

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_seq_features.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["text_seq"],
            schema={"text_seq": List[str]},
        )

        self._initialize_tensorizer(tensorizer, data)
        token_idx_1 = [[2, 3], [2, 1]]
        token_count_1 = [2, 1]
        seq_len_1 = 2
        token_idx_2 = [[2, 3, 4]]
        token_count_2 = [3]
        seq_len_2 = 1
        token_idx_tensor, token_count_tensor, seq_len_tensor = tensorizer.tensorize(
            [
                (token_idx_1, token_count_1, seq_len_1),
                (token_idx_2, token_count_2, seq_len_2),
            ])
        np.testing.assert_array_almost_equal(
            np.array([[[2, 3, 1], [2, 1, 1]], [[2, 3, 4], [1, 1, 1]]]),
            token_idx_tensor.detach().numpy(),
        )
        np.testing.assert_array_almost_equal(
            np.array([[2, 1], [3, 1]]),
            token_count_tensor.detach().numpy())
        np.testing.assert_array_almost_equal(np.array([2, 1]),
                                             seq_len_tensor.detach().numpy())
Example #8
    def _get_pytext_config(
        self,
        test_file_name: TestFileName,
        task_class: Type[NewTask],
        model_class: Type[Model],
    ) -> PyTextConfig:
        test_file_metadata = get_test_file_metadata(test_file_name)
        return PyTextConfig(
            task=task_class.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=test_file_metadata.filename,
                        eval_filename=test_file_metadata.filename,
                        test_filename=test_file_metadata.filename,
                        field_names=test_file_metadata.field_names,
                    ),
                    # Use the default Batcher to avoid shuffling.
                    batcher=Batcher.Config(),
                ),
                trainer=TaskTrainer.Config(epochs=1),
                model=model_class.Config(
                    inputs=type(model_class.Config.inputs)(
                        dense=FloatListTensorizer.Config(
                            column=test_file_metadata.dense_col_name,
                            dim=test_file_metadata.dense_feat_dim,
                        )
                    )
                ),
            ),
            use_tensorboard=False,
            use_cuda_if_available=False,
            version=LATEST_VERSION,
        )
Example #9
    def test_annotation_num(self):
        data = TSVDataSource(
            SafeFileWrapper(tests_module.test_file("compositional_seq2seq_unit.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["text", "seqlogical"],
            schema={"text": str, "seqlogical": str},
        )
        nbrz = AnnotationNumberizer()
        init = nbrz.initialize()
        init.send(None)  # kick
        for row in data.train:
            init.send(row)
        init.close()

        # vocab = {'IN:GET_INFO_TRAFFIC': 0, 'SHIFT': 1, 'SL:LOCATION': 2,
        # 'REDUCE': 3, 'IN:GET_DIRECTIONS': 4, 'SL:DESTINATION': 5, 'SL:SOURCE': 6,
        # 'IN:GET_LOCATION_HOME': 7, 'SL:CONTACT': 8, 'SL:DATE_TIME_DEPARTURE': 9,
        # 'IN:UNSUPPORTED_NAVIGATION': 10, 'IN:GET_ESTIMATED_DURATION': 11,
        # 'IN:GET_LOCATION_WORK': 12, 'SL:PATH_AVOID': 13, 'IN:GET_DISTANCE': 14}

        self.assertEqual(15, len(nbrz.vocab))
        self.assertEqual(1, nbrz.shift_idx)
        self.assertEqual(3, nbrz.reduce_idx)
        self.assertEqual([10], nbrz.ignore_subNTs_roots)
        self.assertEqual(
            [0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], nbrz.valid_NT_idxs
        )
        self.assertEqual([0, 4, 7, 10, 11, 12, 14], nbrz.valid_IN_idxs)
        self.assertEqual([2, 5, 6, 8, 9, 13], nbrz.valid_SL_idxs)

        for row, expected in zip(data.train, EXPECTED_ACTIONS):
            actions = nbrz.numberize(row)
            self.assertEqual(expected, actions)
Example #10
    def test_gazetteer_tensor(self):
        tensorizer = GazetteerTensorizer()

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_dict_features.tsv")
            ),
            test_file=None,
            eval_file=None,
            field_names=["text", "dict"],
            schema={"text": str, "dict": Gazetteer},
        )

        init = tensorizer.initialize()
        init.send(None)  # kick
        for row in data.train:
            init.send(row)
        init.close()
        # UNK + PAD + 3 labels
        self.assertEqual(5, len(tensorizer.vocab))

        # only one row in test file:
        # "Order coffee from Starbucks please"
        for row in data.train:
            idx, weights, lens = tensorizer.numberize(row)
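            # Per token: gazetteer label ids (idx), their match weights, and
            # how many labels were attached to each of the 5 tokens (lens).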
            self.assertEqual([1, 1, 2, 3, 1, 1, 4, 1, 1, 1], idx)
            self.assertEqual(
                [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], weights
            )
            self.assertEqual([1, 2, 1, 1, 1], lens)
Example #11
    def test_gazetteer_tensor(self):
        tensorizer = GazetteerTensorizer()

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_dict_features.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["text", "dict"],
            schema={"text": str, "dict": Gazetteer},
        )

        init = tensorizer.initialize()
        init.send(None)  # kick
        for row in data.train:
            init.send(row)
        init.close()
        # UNK + PAD + 5 labels
        self.assertEqual(7, len(tensorizer.vocab))

        # only two rows in test file:
        # "Order coffee from Starbucks please"
        # "Order some fries from McDonalds please"
        for i, row in enumerate(data.train):
            if i == 0:
                idx, weights, lens = tensorizer.numberize(row)
                self.assertEqual([1, 1, 2, 3, 1, 1, 4, 1, 1, 1], idx)
                self.assertEqual(
                    [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
                    weights)
                self.assertEqual([1, 2, 1, 1, 1], lens)
            if i == 1:
                idx, weights, lens = tensorizer.numberize(row)
                self.assertEqual([1, 1, 5, 1, 6, 1], idx)
                self.assertEqual([0.0, 0.0, 1.0, 0.0, 1.0, 0.0], weights)
                self.assertEqual([1, 1, 1, 1, 1, 1], lens)

        feats, weights, lens = tensorizer.tensorize(
            tensorizer.numberize(row) for row in data.train)
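        # tensorize() batches the numberized rows, padding both the number of
        # tokens and the labels per token to the batch maximum.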
        self.assertEqual(
            [
                [1, 1, 2, 3, 1, 1, 4, 1, 1, 1, 1, 1],
                [1, 1, 1, 1, 5, 1, 1, 1, 6, 1, 1, 1],
            ],
            feats.numpy().tolist(),
        )
        self.assertEqual(
            str([
                [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
            ]),
            str([[round(w, 2) for w in utt_weights]
                 for utt_weights in weights.numpy()]),
        )
        self.assertEqual([[1, 2, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]],
                         lens.numpy().tolist())
Example #12
    def setUp(self):
        self.data = TSVDataSource(
            SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
            SafeFileWrapper(tests_module.test_file("test_dense_features_tiny.tsv")),
            eval_file=None,
            field_names=["label", "slots", "text", "dense"],
            schema={"text": types.Text, "label": types.Label},
        )
Example #13
def _get_data_source(test_path, field_names, task):
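    # Prefer an explicitly supplied test file; otherwise fall back to the data
    # source already configured on the task.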
    if test_path:
        data_source = TSVDataSource(
            test_file=SafeFileWrapper(test_path),
            schema=task.data.data_source.schema,
            field_names=field_names,
        )
    else:
        data_source = task.data.data_source
    return data_source
Example #14
    def test_quoting(self):
        data_source = TSVDataSource(
            SafeFileWrapper(tests_module.test_file("test_tsv_quoting.tsv")),
            SafeFileWrapper(tests_module.test_file("test_tsv_quoting.tsv")),
            eval_file=None,
            field_names=["label", "text"],
            schema={"text": str, "label": str},
        )

        data = list(data_source.train)
        self.assertEqual(4, len(data))
Example #15
    def test_reset_incremental_states(self):
        """
        This test might seem trivial. However, interacting with the scripted
        sequence generator crosses the Torchscript boundary, which can lead
        to weird behavior. If the incremental states don't get properly
        reset, the model will produce garbage _after_ the first call, which
        is a pain to debug when you only catch it after training.
        """
        tensorizers = get_tensorizers()

        # Avoid numeric issues with quantization by setting a known seed.
        torch.manual_seed(42)

        model = Seq2SeqModel.from_config(
            Seq2SeqModel.Config(
                source_embedding=WordEmbedding.Config(embed_dim=512),
                target_embedding=WordEmbedding.Config(embed_dim=512),
            ),
            tensorizers,
        )

        # Get sample inputs using a data source.
        schema = {
            "source_sequence": str,
            "dict_feat": Gazetteer,
            "target_sequence": str,
        }
        data = Data.from_config(
            Data.Config(source=TSVDataSource.Config(
                train_filename=TEST_FILE_NAME,
                field_names=[
                    "source_sequence", "dict_feat", "target_sequence"
                ],
            )),
            schema,
            tensorizers,
        )
        data.batcher = Batcher(1, 1, 1)
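        # Batch size 1 for train/eval/test, so exactly one example comes back.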
        raw_batch, batch = next(
            iter(data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.arrange_model_inputs(batch)

        model.eval()
        outputs = model(*inputs)
        pred, scores = model.get_pred(outputs, {"stage": Stage.TEST})

        # Verify that the incremental states reset correctly.
        decoder = model.sequence_generator.beam_search.decoder_ens
        decoder.reset_incremental_states()
        self.assertDictEqual(decoder.incremental_states, {"0": {}})

        # Verify that the model returns the same predictions.
        new_pred, new_scores = model.get_pred(outputs, {"stage": Stage.TEST})
        self.assertEqual(new_scores, scores)
Example #16
    def setUp(self):
        self.data_source = TSVDataSource(
            SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
            SafeFileWrapper(tests_module.test_file("test_dense_features_tiny.tsv")),
            eval_file=None,
            field_names=["label", "slots", "text", "dense"],
            schema={"text": types.Text, "label": types.Label},
        )

        self.tensorizers = {
            "tokens": WordTensorizer(column="text"),
            "labels": LabelTensorizer(column="label", allow_unknown=True),
        }
Example #17
    def test_csv(self):
        data_source = TSVDataSource(
            SafeFileWrapper(tests_module.test_file("test_data_tiny_csv.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["label", "slots", "text"],
            delimiter=",",
            schema={"text": str, "label": str},
            quoted=True,
        )

        for row in data_source.train:
            self.assertEqual("alarm/set_alarm", row["label"])
            self.assertTrue(row["text"].startswith("this is the text"))
Example #18
    def _get_tensorizers(self):
        schema = {"source_sequence": str, "target_sequence": str}
        data_source = TSVDataSource.from_config(
            TSVDataSource.Config(
                train_filename=tests_module.test_file("compositional_seq2seq_unit.tsv"),
                field_names=["source_sequence", "target_sequence"],
            ),
            schema,
        )
        src_tensorizer = TokenTensorizer.from_config(
            TokenTensorizer.Config(
                column="source_sequence", add_eos_token=True, add_bos_token=True
            )
        )
        tgt_tensorizer = TokenTensorizer.from_config(
            TokenTensorizer.Config(
                column="target_sequence", add_eos_token=True, add_bos_token=True
            )
        )
        tensorizers = {
            "src_seq_tokens": src_tensorizer,
            "trg_seq_tokens": tgt_tensorizer,
        }
        initialize_tensorizers(tensorizers, data_source.train)
        return tensorizers
Example #19
    def test_read_data_source_with_column_remapping(self):
        data_source = TSVDataSource(
            SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
            SafeFileWrapper(tests_module.test_file("test_dense_features_tiny.tsv")),
            eval_file=None,
            field_names=["remapped_label", "slots", "remapped_text", "dense"],
            column_mapping={"remapped_label": "label", "remapped_text": "text"},
            schema={"text": str, "label": str},
        )

        data = list(data_source.train)
        self.assertEqual(10, len(data))
        example = next(iter(data))
        self.assertEqual(2, len(example))
        self.assertEqual({"label", "text"}, set(example))
Example #20
    def test_quoting(self):
        """
        The text column of the first row of this file opens a quote but
        does not close it.
        """
        data_source = TSVDataSource(
            SafeFileWrapper(tests_module.test_file("test_tsv_quoting.tsv")),
            SafeFileWrapper(tests_module.test_file("test_tsv_quoting.tsv")),
            eval_file=None,
            field_names=["label", "text"],
            schema={"text": str, "label": str},
        )

        data = list(data_source.train)
        self.assertEqual(4, len(data))
Example #21
    def test_load_checkpoint(self):
        with tempfile.NamedTemporaryFile() as checkpoint_file:
            train_data = tests_module.test_file("train_data_tiny.tsv")
            eval_data = tests_module.test_file("test_data_tiny.tsv")
            config = PyTextConfig(
                task=DocumentClassificationTask.Config(data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    ))),
                version=LATEST_VERSION,
                save_snapshot_path=checkpoint_file.name,
            )
            task = create_task(config.task)
            model = task.model
            # test checkpoint saving and loading
            optimizer = create_optimizer(Adam.Config(), model)
            scheduler = create_scheduler(Scheduler.Config(), optimizer)
            training_state = TrainingState(
                model=model,
                optimizer=optimizer,
                scheduler=scheduler,
                start_time=0,
                epoch=0,
                rank=0,
                stage=Stage.TRAIN,
                epochs_since_last_improvement=0,
                best_model_state=None,
                best_model_metric=None,
                tensorizers=task.data.tensorizers,
            )

            id = "epoch-1"
            saved_path = save(config, model, None, task.data.tensorizers,
                              training_state, id)
            # TODO: fix get_latest_checkpoint_path T53664139
            # self.assertEqual(saved_path, get_latest_checkpoint_path())
            task_restored, config_restored, training_state_restored = load(
                saved_path)
            self.assertCheckpointEqual(
                model,
                config,
                training_state,
                task_restored.model,
                config_restored,
                training_state_restored,
            )
Example #22
    def setUp(self):
        self.data = TSVDataSource(
            SafeFileWrapper(
                tests_module.test_file("compositional_seq2seq_unit.tsv")
            ),
            test_file=None,
            eval_file=None,
            field_names=["text", "seqlogical"],
            schema={"text": str, "seqlogical": str},
        )
        self.masked_tensorizer = MaskedTokenTensorizer.from_config(
            MaskedTokenTensorizer.Config(
                column="seqlogical", masking_function=TreeMask.Config()
            )
        )
        self._initialize_tensorizer(self.masked_tensorizer)
Example #23
    def test_gazetteer_tensor_bad_json(self):
        tensorizer = GazetteerTensorizer()

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_dict_features_bad_json.tsv")
            ),
            test_file=None,
            eval_file=None,
            field_names=["text", "dict"],
            schema={"text": str, "dict": Gazetteer},
        )

        init = tensorizer.initialize()
        init.send(None)  # kick
        with self.assertRaises(Exception):
            for row in data.train:
                init.send(row)
        init.close()
Example #24
    def test_force_predictions_on_eval(self):
        tensorizers = get_tensorizers()

        model = Seq2SeqModel.from_config(
            Seq2SeqModel.Config(
                source_embedding=WordEmbedding.Config(embed_dim=512),
                target_embedding=WordEmbedding.Config(embed_dim=512),
            ),
            tensorizers,
        )

        # Get sample inputs using a data source.
        schema = {
            "source_sequence": str,
            "dict_feat": Gazetteer,
            "target_sequence": str,
        }
        data = Data.from_config(
            Data.Config(source=TSVDataSource.Config(
                train_filename=TEST_FILE_NAME,
                field_names=[
                    "source_sequence", "dict_feat", "target_sequence"
                ],
            )),
            schema,
            tensorizers,
        )
        data.batcher = Batcher(1, 1, 1)
        raw_batch, batch = next(
            iter(data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.arrange_model_inputs(batch)

        # Verify that model does not run sequence generation on prediction.
        outputs = model(*inputs)
        pred = model.get_pred(outputs, {"stage": Stage.EVAL})
        self.assertEqual(pred, (None, None))

        # Verify that setting force_eval_predictions is handled correctly:
        # prediction should now fail with an AssertionError.
        model.force_eval_predictions = True
        with self.assertRaises(AssertionError):
            _ = model.get_pred(outputs, {"stage": Stage.EVAL})
Example #25
    def test_seq_tensor_max_turn(self):
        tensorizer = SeqTokenTensorizer(max_turn=1)

        data = TSVDataSource(
            train_file=SafeFileWrapper(
                tests_module.test_file("train_seq_features.tsv")),
            test_file=None,
            eval_file=None,
            field_names=["text_seq"],
            schema={"text_seq": List[str]},
        )

        self._initialize_tensorizer(tensorizer, data)

        # only one row in test file:
        # ["where do you wanna meet?", "MPK"]
        for row in data.train:
            idx, sentence_lens, seq_len = tensorizer.numberize(row)
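            # With max_turn=1, numberize keeps only the first sentence of the sequence.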
            self.assertEqual(1, seq_len)
            self.assertEqual([[2, 3, 4, 5, 6]], idx)
            self.assertEqual([5], sentence_lens)
Example #26
    def test_create_normalized_float_list_tensor(self):
        def round_list(l):
            return [float("%.4f" % n) for n in l]

        data = TSVDataSource(
            SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
            eval_file=None,
            field_names=["label", "slots", "text", "dense_feat"],
            schema={"text": str, "label": str, "dense_feat": List[float]},
        )
        tensorizer = FloatListTensorizer(
            column="dense_feat", dim=10, error_check=True, normalize=True
        )
        self._initialize_tensorizer(tensorizer, data)
        self.assertEqual(10, tensorizer.normalizer.num_rows)
        self.assertEqual(
            round_list(
                [
                    7.56409,
                    8.2388,
                    0.5531,
                    0.2403,
                    1.03130,
                    6.2888,
                    3.1595,
                    0.1538,
                    0.2403,
                    5.3463,
                ]
            ),
            round_list(tensorizer.normalizer.feature_sums),
        )
        self.assertEqual(
            round_list(
                [
                    5.80172,
                    7.57586,
                    0.30591,
                    0.05774,
                    0.52762,
                    5.22811,
                    2.51727,
                    0.02365,
                    0.05774,
                    4.48798,
                ]
            ),
            round_list(tensorizer.normalizer.feature_squared_sums),
        )
        self.assertEqual(
            round_list(
                [
                    0.75640,
                    0.82388,
                    0.05531,
                    0.02403,
                    0.10313,
                    0.62888,
                    0.31595,
                    0.01538,
                    0.02403,
                    0.53463,
                ]
            ),
            round_list(tensorizer.normalizer.feature_avgs),
        )
        self.assertEqual(
            round_list(
                [
                    0.08953,
                    0.28072,
                    0.16593,
                    0.07209,
                    0.20524,
                    0.35682,
                    0.38974,
                    0.04614,
                    0.07209,
                    0.40369,
                ]
            ),
            round_list(tensorizer.normalizer.feature_stddevs),
        )

        row = [0.64840776, 0.7575, 0.5531, 0.2403, 0, 0.9481, 0, 0.1538, 0.2403, 0.3564]
        output = tensorizer.numberize({"dense_feat": row})
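        # Values come back z-scored ((x - mean) / stddev); the entries at
        # exactly 3.0 suggest scores are also clamped to [-3, 3].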

        self.assertEqual(
            round_list(
                [
                    -1.20619,
                    -0.23646,
                    2.99999,
                    3.0,
                    -0.50246,
                    0.89462,
                    -0.81066,
                    2.99999,
                    3.0,
                    -0.44149,
                ]
            ),
            round_list(output),
        )