def test_initialize_word_tensorizer(self):
    """Streaming every training row through initialize() builds the vocab."""
    tensorizer = WordTensorizer(column="text")
    initializer = tensorizer.initialize()
    initializer.send(None)  # prime the generator
    for example in self.data.train:
        initializer.send(example)
    initializer.close()
    # 49 distinct tokens in the tiny training set
    self.assertEqual(49, len(tensorizer.vocab))
def test_create_word_tensors(self):
    """numberize() maps each token to its vocab id and reports sequence length."""
    tensorizer = WordTensorizer(text_column="text")
    initializer = tensorizer.initialize()
    initializer.send(None)  # prime the generator
    for example in self.data.train:
        initializer.send(example)
    initializer.close()

    rows = [{"text": "I want some coffee"}, {"text": "Turn it up"}]
    first, second = [tensorizer.numberize(row) for row in rows]

    tokens, seq_len = first
    self.assertEqual([24, 0, 0, 0], tokens)
    self.assertEqual(4, seq_len)

    tokens, seq_len = second
    self.assertEqual([13, 47, 9], tokens)
    self.assertEqual(3, seq_len)
def test_initialize_tensorizers(self):
    """initialize_tensorizers() populates every tensorizer in one pass."""
    tensorizers = dict(
        tokens=WordTensorizer(column="text"),
        labels=LabelTensorizer(column="label"),
        chars=CharacterTensorizer(column="text"),
    )
    initialize_tensorizers(tensorizers, self.data.train)
    self.assertEqual(49, len(tensorizers["tokens"].vocab))
    self.assertEqual(7, len(tensorizers["labels"].labels))
def test_create_batches_different_tensorizers(self):
    """A Data object with a single tensorizer yields batches keyed only by it.

    With 10 training rows and train_batch_size=16, everything fits in one
    batch of 10 examples.
    """
    tensorizers = {"tokens": WordTensorizer(column="text")}
    data = Data(self.data_source, tensorizers, Batcher(train_batch_size=16))
    batches = list(data.batches(Stage.TRAIN))
    self.assertEqual(1, len(batches))
    # Idiom fix: index the list directly instead of next(iter(...)).
    batch = batches[0]
    self.assertEqual({"tokens"}, set(batch))
    tokens, seq_lens = batch["tokens"]
    self.assertEqual((10,), seq_lens.size())
    self.assertEqual(10, len(tokens))
def test_create_word_tensors(self):
    """create_training_tensors() pads tokens and returns LongTensors."""
    tensorizer = WordTensorizer(column="text")
    initializer = tensorizer.initialize()
    initializer.send(None)  # prime the generator
    for example in self.data.train:
        initializer.send(example)
    initializer.close()

    batch = [
        {"text": types.Text("I want some coffee")},
        {"text": types.Text("Turn it up")},
    ]
    tokens, seq_lens = tensorizer.create_training_tensors(batch)

    for tensor in (tokens, seq_lens):
        self.assertIsInstance(tensor, torch.LongTensor)
    self.assertEqual((2, 4), tokens.size())
    self.assertEqual((2,), seq_lens.size())
    # second row is padded (id 1) to the longest sequence in the batch
    self.assertEqual([[24, 0, 0, 0], [13, 47, 9, 1]], tokens.tolist())
    self.assertEqual([4, 3], seq_lens.tolist())
def test_data_initializes_tensorsizers(self):
    """Constructing Data should initialize all passed-in tensorizers.

    Fix: the precondition check used a bare ``assert``, which is stripped
    under ``python -O`` and gives an uninformative failure; use the unittest
    assertion instead.
    """
    tensorizers = {
        "tokens": WordTensorizer(text_column="text"),
        "labels": LabelTensorizer(label_column="label"),
    }
    # verify WordTensorizer isn't in an initialized state yet
    self.assertIsNone(tensorizers["tokens"].vocab)
    Data(self.data_source, tensorizers)
    # Tensorizers should have been initialized
    self.assertEqual(49, len(tensorizers["tokens"].vocab))
    self.assertEqual(7, len(tensorizers["labels"].labels))
def test_data_initializes_tensorsizers(self):
    """Constructing Data should initialize all passed-in tensorizers."""
    word_tensorizer = WordTensorizer(column="text")
    label_tensorizer = LabelTensorizer(column="label")
    tensorizers = {"tokens": word_tensorizer, "labels": label_tensorizer}

    # verify WordTensorizer isn't in an initialized state yet
    with self.assertRaises(AttributeError):
        word_tensorizer.vocab

    Data(self.data_source, tensorizers)

    # Tensorizers should have been initialized
    self.assertEqual(49, len(word_tensorizer.vocab))
    self.assertEqual(7, len(label_tensorizer.labels))
def setUp(self):
    """Build a TSV data source over the tiny dense-features fixtures."""
    train_file = SafeFileWrapper(
        tests_module.test_file("train_dense_features_tiny.tsv")
    )
    test_file = SafeFileWrapper(
        tests_module.test_file("test_dense_features_tiny.tsv")
    )
    self.data_source = TSVDataSource(
        train_file,
        test_file,
        eval_file=None,
        field_names=["label", "slots", "text", "dense"],
        schema={"text": types.Text, "label": types.Label},
    )
    self.tensorizers = {
        "tokens": WordTensorizer(column="text"),
        "labels": LabelTensorizer(column="label", allow_unknown=True),
    }
class ModelInput(Model.Config.ModelInput):
    """Declarative input config: tokenized text, labels, and raw text."""

    # word-token tensorizer over the text column (default config)
    tokens: WordTensorizer.Config = WordTensorizer.Config()
    # allow_unknown lets unseen labels map to an UNK slot — presumably so
    # evaluation data with new labels doesn't crash; verify against usage
    labels: LabelTensorizer.Config = LabelTensorizer.Config(
        allow_unknown=True
    )
    # for metric reporter
    raw_text: MetaInput.Config = MetaInput.Config(column="text")
class RegressionModelInput(Model.Config.ModelInput):
    """Input config for regression: word tokens plus numeric labels."""

    # word-token tensorizer over the default text column
    tokens: WordTensorizer.Config = WordTensorizer.Config()
    # numeric (continuous) label targets instead of categorical labels
    labels: NumericLabelTensorizer.Config = NumericLabelTensorizer.Config()
class ModelInput(NewModel.Config.ModelInput):
    """Declarative input config: word tokens and (unknown-tolerant) labels."""

    # word-token tensorizer with default settings
    tokens: WordTensorizer.Config = WordTensorizer.Config()
    # allow_unknown maps unseen label values to an UNK entry — presumably to
    # tolerate labels absent from training data; confirm against LabelTensorizer
    labels: LabelTensorizer.Config = LabelTensorizer.Config(allow_unknown=True)
class Config(Model.Config, doc_model.DocModel.Config):
    """Model config combining base Model and DocModel configuration."""

    # mapping of input name -> tensorizer config consumed by this model;
    # NOTE(review): class-level dict default — presumably the config framework
    # copies it per instance; confirm it is not shared mutable state
    inputs: Dict[str, Tensorizer.Config] = {
        "tokens": WordTensorizer.Config(),
        "labels": LabelTensorizer.Config(),
    }
    # word embedding feature configuration (defaults)
    embedding: WordFeatConfig = WordFeatConfig()