Code example #1
0
 def test_initialize_word_tensorizer(self):
     """Drive a WordTensorizer's init generator over the training rows and check the vocab size."""
     word_tensorizer = WordTensorizer(column="text")
     initializer = word_tensorizer.initialize()
     initializer.send(None)  # prime the generator so it is paused at its first yield
     for example in self.data.train:
         initializer.send(example)
     initializer.close()
     # 49 distinct tokens in the tiny training fixture — TODO confirm against the TSV
     self.assertEqual(49, len(word_tensorizer.vocab))
Code example #2
0
    def test_create_word_tensors(self):
        """Numberize fresh rows after initializing the tensorizer on the training set."""
        word_tensorizer = WordTensorizer(text_column="text")
        initializer = word_tensorizer.initialize()
        initializer.send(None)  # prime the generator
        for example in self.data.train:
            initializer.send(example)
        initializer.close()

        rows = [{"text": "I want some coffee"}, {"text": "Turn it up"}]
        # (expected token ids, expected sequence length) per row — ids come from
        # the vocab built above; TODO confirm against the training fixture.
        expected = [([24, 0, 0, 0], 4), ([13, 47, 9], 3)]
        for row, (expected_tokens, expected_len) in zip(rows, expected):
            tokens, seq_len = word_tensorizer.numberize(row)
            self.assertEqual(expected_tokens, tokens)
            self.assertEqual(expected_len, seq_len)
Code example #3
0
 def test_initialize_tensorizers(self):
     """initialize_tensorizers should set up several tensorizers in one pass over the data."""
     tensorizer_map = {
         "tokens": WordTensorizer(column="text"),
         "labels": LabelTensorizer(column="label"),
         "chars": CharacterTensorizer(column="text"),
     }
     initialize_tensorizers(tensorizer_map, self.data.train)
     # Sizes come from the tiny training fixture — TODO confirm against the TSV.
     self.assertEqual(49, len(tensorizer_map["tokens"].vocab))
     self.assertEqual(7, len(tensorizer_map["labels"].labels))
Code example #4
0
 def test_create_batches_different_tensorizers(self):
     """With only a token tensorizer configured, batches contain just the 'tokens' entry."""
     tensorizer_map = {"tokens": WordTensorizer(column="text")}
     data = Data(self.data_source, tensorizer_map, Batcher(train_batch_size=16))
     batches = list(data.batches(Stage.TRAIN))
     # The tiny training set fits inside one batch of 16.
     self.assertEqual(1, len(batches))
     (batch,) = batches
     self.assertEqual({"tokens"}, set(batch))
     tokens, seq_lens = batch["tokens"]
     # 10 training rows — presumably the fixture size; verify against the TSV.
     self.assertEqual((10,), seq_lens.size())
     self.assertEqual(10, len(tokens))
Code example #5
0
    def test_create_word_tensors(self):
        """create_training_tensors should pad a two-row batch into LongTensors."""
        word_tensorizer = WordTensorizer(column="text")
        initializer = word_tensorizer.initialize()
        initializer.send(None)  # prime the generator
        for example in self.data.train:
            initializer.send(example)
        initializer.close()

        batch = [
            {"text": types.Text("I want some coffee")},
            {"text": types.Text("Turn it up")},
        ]

        tokens, seq_lens = word_tensorizer.create_training_tensors(batch)
        for tensor in (tokens, seq_lens):
            self.assertIsInstance(tensor, torch.LongTensor)
        self.assertEqual((2, 4), tokens.size())
        self.assertEqual((2,), seq_lens.size())
        # The shorter row is padded (trailing 1) out to the max length of 4.
        self.assertEqual([[24, 0, 0, 0], [13, 47, 9, 1]], tokens.tolist())
        self.assertEqual([4, 3], seq_lens.tolist())
Code example #6
0
 def test_data_initializes_tensorsizers(self):
     """Constructing Data should initialize every tensorizer it is given."""
     tensorizers = {
         "tokens": WordTensorizer(text_column="text"),
         "labels": LabelTensorizer(label_column="label"),
     }
     # Verify WordTensorizer isn't in an initialized state yet. Use a unittest
     # assertion instead of a bare `assert`, which is stripped under `python -O`
     # and gives no useful failure message.
     self.assertIsNone(tensorizers["tokens"].vocab)
     Data(self.data_source, tensorizers)
     # Tensorizers should have been initialized by the Data constructor.
     # Sizes come from the tiny training fixture — TODO confirm against the TSV.
     self.assertEqual(49, len(tensorizers["tokens"].vocab))
     self.assertEqual(7, len(tensorizers["labels"].labels))
Code example #7
0
File: data_test.py  Project: yuxuan2015/pytext
 def test_data_initializes_tensorsizers(self):
     """Building a Data object must run tensorizer initialization as a side effect."""
     tensorizer_map = {
         "tokens": WordTensorizer(column="text"),
         "labels": LabelTensorizer(column="label"),
     }
     # Before Data is constructed, the WordTensorizer has no vocab attribute
     # at all, so touching it raises AttributeError.
     with self.assertRaises(AttributeError):
         tensorizer_map["tokens"].vocab
     Data(self.data_source, tensorizer_map)
     # After construction both tensorizers are fully initialized.
     # Sizes come from the tiny training fixture — TODO confirm against the TSV.
     self.assertEqual(49, len(tensorizer_map["tokens"].vocab))
     self.assertEqual(7, len(tensorizer_map["labels"].labels))
Code example #8
0
    def setUp(self):
        # Build a TSV-backed data source over the tiny dense-features fixtures.
        # Positional args are the train and test files; there is no eval split.
        self.data_source = TSVDataSource(
            SafeFileWrapper(tests_module.test_file("train_dense_features_tiny.tsv")),
            SafeFileWrapper(tests_module.test_file("test_dense_features_tiny.tsv")),
            eval_file=None,
            # Column order in the TSV files; only "text" and "label" are typed
            # via the schema below — "slots" and "dense" are left untyped here.
            field_names=["label", "slots", "text", "dense"],
            schema={"text": types.Text, "label": types.Label},
        )

        # Tensorizers shared by the tests: word tokens from the "text" column
        # and labels from the "label" column (unknown labels allowed).
        self.tensorizers = {
            "tokens": WordTensorizer(column="text"),
            "labels": LabelTensorizer(column="label", allow_unknown=True),
        }
Code example #9
0
 class ModelInput(Model.Config.ModelInput):
     # Word tokens used as the model's main input.
     tokens: WordTensorizer.Config = WordTensorizer.Config()
     # Classification labels; unknown label values are tolerated.
     labels: LabelTensorizer.Config = LabelTensorizer.Config(
         allow_unknown=True)
     # Raw text passed through untensorized, for metric reporting only.
     raw_text: MetaInput.Config = MetaInput.Config(column="text")
Code example #10
0
 class RegressionModelInput(Model.Config.ModelInput):
     # Word tokens used as the model's input features.
     tokens: WordTensorizer.Config = WordTensorizer.Config()
     # Regression targets come from a numeric label tensorizer rather than a
     # categorical one.
     labels: NumericLabelTensorizer.Config = NumericLabelTensorizer.Config()
Code example #11
0
File: doc_model.py  Project: javamickey/pytext
 class ModelInput(NewModel.Config.ModelInput):
     # Word tokens used as the model's main input.
     tokens: WordTensorizer.Config = WordTensorizer.Config()
     # Classification labels; unknown label values are tolerated.
     labels: LabelTensorizer.Config = LabelTensorizer.Config(allow_unknown=True)
Code example #12
0
File: new_model.py  Project: yuxuan2015/pytext
 class Config(Model.Config, doc_model.DocModel.Config):
     # Named tensorizer configs keyed by input name: word tokens from "text"
     # and categorical labels from "label".
     inputs: Dict[str, Tensorizer.Config] = {
         "tokens": WordTensorizer.Config(),
         "labels": LabelTensorizer.Config(),
     }
     # Word embedding configuration for the token input.
     embedding: WordFeatConfig = WordFeatConfig()