Beispiel #1
0
def get_tensorizers(add_dict_feat=False, add_contextual_feat=False):
    """Build and initialize the tensorizers used by the seq2seq model tests.

    The source and target token tensorizers are always created and are
    initialized from the training split of the TSV file named by
    ``TEST_FILE_NAME``.

    Args:
        add_dict_feat: When true, also create and initialize a
            ``GazetteerTensorizer`` under the key ``"dict_feat"``.
        add_contextual_feat: When true, also create and initialize a
            ``ByteTokenTensorizer`` under the key
            ``"contextual_token_embedding"``.

    Returns:
        A dict mapping tensorizer names to initialized tensorizer instances.
    """
    schema = {"source_sequence": str, "dict_feat": Gazetteer, "target_sequence": str}
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=TEST_FILE_NAME,
            field_names=["source_sequence", "dict_feat", "target_sequence"],
        ),
        schema,
    )
    src_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="source_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tgt_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="target_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tensorizers = {"src_seq_tokens": src_tensorizer, "trg_seq_tokens": tgt_tensorizer}
    initialize_tensorizers(tensorizers, data_source.train)

    if add_dict_feat:
        tensorizers["dict_feat"] = GazetteerTensorizer.from_config(
            GazetteerTensorizer.Config(
                text_column="source_sequence", dict_column="dict_feat"
            )
        )
        # Only the newly added tensorizer is initialized here; the token
        # tensorizers were already initialized above.
        initialize_tensorizers(
            {"dict_feat": tensorizers["dict_feat"]}, data_source.train
        )

    if add_contextual_feat:
        # Previously this flag was accepted but ignored, so callers asking for
        # contextual features never received the matching tensorizer.
        # NOTE(review): the key and the tensorizer type mirror the
        # ``contextual_token_embedding=ByteTokenTensorizer.Config()`` model
        # input used by the caller, and the column is set to the source text
        # column of this schema -- confirm against
        # Seq2SeqModel.Config.ModelInput.
        tensorizers["contextual_token_embedding"] = ByteTokenTensorizer.from_config(
            ByteTokenTensorizer.Config(column="source_sequence")
        )
        initialize_tensorizers(
            {"contextual_token_embedding": tensorizers["contextual_token_embedding"]},
            data_source.train,
        )

    return tensorizers
Beispiel #2
0
    def test_gazetteer_tensor(self):
        """Initialize a GazetteerTensorizer from the fixture and check its output.

        Verifies the vocabulary size after initialization and the
        (indices, weights, lengths) triple returned by ``numberize`` for the
        rows of ``train_dict_features.tsv``.
        """
        gazetteer = GazetteerTensorizer()

        fixture = SafeFileWrapper(tests_module.test_file("train_dict_features.tsv"))
        source = TSVDataSource(
            train_file=fixture,
            test_file=None,
            eval_file=None,
            field_names=["text", "dict"],
            schema={"text": str, "dict": Gazetteer},
        )

        # Drive the initialization generator by hand: prime it, feed every
        # training row, then close it to finish initialization.
        feeder = gazetteer.initialize()
        feeder.send(None)  # kick
        for example in source.train:
            feeder.send(example)
        feeder.close()

        # UNK + PAD + 3 labels
        self.assertEqual(5, len(gazetteer.vocab))

        # only one row in test file:
        # "Order coffee from Starbucks please"
        expected_idx = [1, 1, 2, 3, 1, 1, 4, 1, 1, 1]
        expected_weights = [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
        expected_lens = [1, 2, 1, 1, 1]
        for example in source.train:
            idx, weights, lens = gazetteer.numberize(example)
            self.assertEqual(expected_idx, idx)
            self.assertEqual(expected_weights, weights)
            self.assertEqual(expected_lens, lens)
Beispiel #3
0
 def test_tokens_dictfeat_contextual(self):
     """Build a seq2seq model with dict and contextual inputs and script it.

     The model is constructed from config, switched to eval mode, converted
     with ``torchscriptify`` and invoked once; the test only requires that
     the call returns something other than ``None``.
     """
     # TODO (T65593688): this should be removed after
     # https://github.com/pytorch/pytorch/pull/33645 is merged.
     with torch.no_grad():
         src_embedding = WordEmbedding.Config(embed_dim=512)
         tgt_embedding = WordEmbedding.Config(embed_dim=512)
         model_inputs = Seq2SeqModel.Config.ModelInput(
             dict_feat=GazetteerTensorizer.Config(text_column="source_sequence"),
             contextual_token_embedding=ByteTokenTensorizer.Config(),
         )
         # NOTE(review): 619 is 7 more than the 612 used by the dict-only
         # test, matching the contextual embed_dim of 7 below -- confirm how
         # the remainder is composed.
         rnn_config = RNNModel.Config(
             encoder=LSTMSequenceEncoder.Config(embed_dim=619)
         )
         dict_embedding = DictEmbedding.Config()
         contextual_embedding = ContextualTokenEmbedding.Config(embed_dim=7)
         model_config = Seq2SeqModel.Config(
             source_embedding=src_embedding,
             target_embedding=tgt_embedding,
             inputs=model_inputs,
             encoder_decoder=rnn_config,
             dict_embedding=dict_embedding,
             contextual_token_embedding=contextual_embedding,
         )
         tensorizers = get_tensorizers(add_dict_feat=True, add_contextual_feat=True)
         model = Seq2SeqModel.from_config(model_config, tensorizers)
         model.eval()
         scripted = model.torchscriptify()

         # Inputs: source tokens, a (tokens, weights, lengths) dict-feature
         # triple, and a flat list of 7 * 2 contextual embedding values.
         dict_feats = (["call", "mom"], [0.42, 0.17], [4, 3])
         res = scripted(["call", "mom"], dict_feats, [0.42] * (7 * 2))
         assert res is not None
Beispiel #4
0
    def test_gazetteer_tensor(self):
        """Check GazetteerTensorizer numberize and tensorize on the fixture.

        After initializing from ``train_dict_features.tsv`` the test checks
        the vocabulary size, the per-row ``numberize`` output for the first
        two rows, and the batched ``tensorize`` output for all rows.
        """
        gazetteer = GazetteerTensorizer()

        fixture = SafeFileWrapper(tests_module.test_file("train_dict_features.tsv"))
        source = TSVDataSource(
            train_file=fixture,
            test_file=None,
            eval_file=None,
            field_names=["text", "dict"],
            schema={"text": str, "dict": Gazetteer},
        )

        # Drive the initialization generator by hand: prime it, feed every
        # training row, then close it to finish initialization.
        feeder = gazetteer.initialize()
        feeder.send(None)  # kick
        for example in source.train:
            feeder.send(example)
        feeder.close()

        # UNK + PAD + 5 labels
        self.assertEqual(7, len(gazetteer.vocab))

        # only two rows in test file:
        # "Order coffee from Starbucks please"
        # "Order some fries from McDonalds please"
        # Expected (indices, weights, lengths) keyed by row position; rows at
        # any other position are not numberized or checked.
        expected_by_row = {
            0: (
                [1, 1, 2, 3, 1, 1, 4, 1, 1, 1],
                [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
                [1, 2, 1, 1, 1],
            ),
            1: (
                [1, 1, 5, 1, 6, 1],
                [0.0, 0.0, 1.0, 0.0, 1.0, 0.0],
                [1, 1, 1, 1, 1, 1],
            ),
        }
        for position, example in enumerate(source.train):
            if position not in expected_by_row:
                continue
            want_idx, want_weights, want_lens = expected_by_row[position]
            idx, weights, lens = gazetteer.numberize(example)
            self.assertEqual(want_idx, idx)
            self.assertEqual(want_weights, weights)
            self.assertEqual(want_lens, lens)

        numberized_rows = (gazetteer.numberize(example) for example in source.train)
        feats, weights, lens = gazetteer.tensorize(numberized_rows)

        expected_feats = [
            [1, 1, 2, 3, 1, 1, 4, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 5, 1, 1, 1, 6, 1, 1, 1],
        ]
        self.assertEqual(expected_feats, feats.numpy().tolist())

        # Weights are compared through their string form after rounding each
        # value to two decimals.
        expected_weights = [
            [0.0, 0.0, 0.8, 0.2, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
        ]
        rounded_weights = [
            [round(value, 2) for value in utterance]
            for utterance in weights.numpy()
        ]
        self.assertEqual(str(expected_weights), str(rounded_weights))

        expected_lens = [[1, 2, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]
        self.assertEqual(expected_lens, lens.numpy().tolist())
    def test_gazetteer_tensor_bad_json(self):
        """Expect an exception while initializing from the bad-JSON fixture.

        Iterating the rows of ``train_dict_features_bad_json.tsv`` and feeding
        them to the tensorizer's initialization generator must raise.
        """
        gazetteer = GazetteerTensorizer()

        fixture = SafeFileWrapper(
            tests_module.test_file("train_dict_features_bad_json.tsv")
        )
        source = TSVDataSource(
            train_file=fixture,
            test_file=None,
            eval_file=None,
            field_names=["text", "dict"],
            schema={"text": str, "dict": Gazetteer},
        )

        feeder = gazetteer.initialize()
        feeder.send(None)  # kick
        # Both reading the rows and sending them happen inside the context
        # manager, so an error from either step satisfies the expectation.
        with self.assertRaises(Exception):
            for example in source.train:
                feeder.send(example)
        feeder.close()
 def test_tokens_dictfeat(self):
     """Build a seq2seq model with dict-feature input and script it.

     The model is constructed from config, switched to eval mode, converted
     with ``torchscriptify`` and invoked once; the test only requires that
     the call returns something other than ``None``.
     """
     src_embedding = WordEmbedding.Config(embed_dim=512)
     tgt_embedding = WordEmbedding.Config(embed_dim=512)
     model_inputs = Seq2SeqModel.Config.ModelInput(
         dict_feat=GazetteerTensorizer.Config(text_column="source_sequence")
     )
     rnn_config = RNNModel.Config(
         encoder=LSTMSequenceEncoder.Config(embed_dim=612)
     )
     model_config = Seq2SeqModel.Config(
         source_embedding=src_embedding,
         target_embedding=tgt_embedding,
         inputs=model_inputs,
         encoder_decoder=rnn_config,
         dict_embedding=DictEmbedding.Config(),
     )
     tensorizers = get_tensorizers(add_dict_feat=True)
     model = Seq2SeqModel.from_config(model_config, tensorizers)
     model.eval()
     scripted = model.torchscriptify()

     # Inputs: source tokens and a (tokens, weights, lengths) dict-feature
     # triple.
     dict_feats = (["call", "mom"], [0.42, 0.17], [4, 3])
     res = scripted(["call", "mom"], dict_feats)
     assert res is not None