def test_batch_level(self):
    """get_dev_split should remove roughly 5% of batches at batch_size 20."""
    conf = settings_from_file(testpath)
    conf['batch_size'] = 20
    sent_reader = Reader(conf, conf.input_path)
    encoder = MultiLabelEncoder.from_settings(conf)
    encoder.fit(line for _, line in sent_reader.readsents())
    dataset = Dataset(conf, sent_reader, encoder)

    # batches seen before splitting a devset off
    before = sum(1 for _ in dataset.batch_generator())
    self.assertAlmostEqual(before, self.insts // 20, delta=delta)

    # splitting consumes part of the training data
    devset = dataset.get_dev_split(self.insts, split=0.05)
    after = sum(1 for _ in dataset.batch_generator())
    self.assertAlmostEqual(before * 0.95, after, delta=delta)
# Report the tasks the label encoder produced targets for.
print()
print("::: Target tasks :::")
print()
for task, le in label_encoder.tasks.items():
    print("- {:<15} target={:<6} level={:<6} vocab={:<6}"
          .format(task, le.target, le.level, len(le)))
print()

# Training data; dev and test sets are optional and configuration-driven.
trainset = Dataset(settings, reader, label_encoder)

devset = None
if settings.dev_path:
    # Dedicated dev file: read it with its own Reader, materialize its batches.
    devset = Dataset(settings, Reader(settings, settings.dev_path), label_encoder)
    devset = devset.get_batches()
elif settings.dev_split > 0:
    # No dev file: carve a fraction of the training data off as the devset.
    devset = trainset.get_dev_split(ninsts, split=settings.dev_split)
    # assumes `ninsts` (total instance count) was computed earlier — not
    # visible in this fragment; TODO confirm against the enclosing scope
    ninsts = ninsts - (len(devset) * settings.batch_size)
else:
    logging.warning("No devset: cannot monitor/optimize training")

testset = None
if settings.test_path:
    testset = Dataset(settings, Reader(settings, settings.test_path), label_encoder)

# model
# NOTE(review): `SimpleModel` is assumed to be imported at module level — the
# import is not visible in this fragment.
model = SimpleModel(trainset.label_encoder, settings.wemb_dim, settings.cemb_dim,
                    settings.hidden_size, settings.num_layers,
                    dropout=settings.dropout,
                    word_dropout=settings.word_dropout, cell=settings.cell,
                    cemb_type=settings.cemb_type,
                    include_self=settings.include_self,
                    pos_crf=settings.pos_crf)
class TestDevSplit(unittest.TestCase):
    """Exercise Dataset.get_dev_split at instance level and batch level."""

    def setUp(self):
        settings = settings_from_file(testpath)
        settings['batch_size'] = 1
        reader = Reader(settings, settings.input_path)
        encoder = MultiLabelEncoder.from_settings(settings)
        ninsts = encoder.fit(line for _, line in reader.readsents())
        self.insts = ninsts
        self.num_batches = ninsts // settings.batch_size
        self.data = Dataset(settings, reader, encoder)

    def test_split_length(self):
        """The dev split holds roughly 5% of all batches."""
        total = sum(1 for _ in self.data.batch_generator())
        dev = sum(1 for _ in self.data.get_dev_split(self.insts, split=0.05))
        self.assertAlmostEqual(dev, total * 0.05, delta=delta)

    def test_remaining(self):
        """After the split, the generator yields roughly the 95% left over."""
        before = sum(1 for _ in self.data.batch_generator())
        # batch size is 1, so one batch per instance
        self.assertEqual(before, self.insts)
        self.assertEqual(before, self.num_batches)
        devset = self.data.get_dev_split(self.insts, split=0.05)
        after = sum(1 for _ in self.data.batch_generator())
        # FIXME
        self.assertAlmostEqual(len(devset) + after, before, delta=delta * 5)
        self.assertAlmostEqual(before * 0.95, after, delta=delta * 5)

    def test_batch_level(self):
        """Same split contract, but with batch_size 20 instead of 1."""
        settings = settings_from_file(testpath)
        settings['batch_size'] = 20
        reader = Reader(settings, settings.input_path)
        encoder = MultiLabelEncoder.from_settings(settings)
        encoder.fit(line for _, line in reader.readsents())
        data = Dataset(settings, reader, encoder)

        before = sum(1 for _ in data.batch_generator())
        self.assertAlmostEqual(before, self.insts // 20, delta=delta)

        data.get_dev_split(self.insts, split=0.05)
        after = sum(1 for _ in data.batch_generator())
        self.assertAlmostEqual(before * 0.95, after, delta=delta)
if __name__ == '__main__':
    # Smoke test: build a model from config and push one batch through it.
    from pie.settings import settings_from_file
    from pie.data import Dataset, Reader, MultiLabelEncoder

    settings = settings_from_file('./config.json')
    reader = Reader(settings, settings.input_path)
    label_encoder = MultiLabelEncoder.from_settings(settings)
    label_encoder.fit_reader(reader)
    data = Dataset(settings, reader, label_encoder)
    # NOTE(review): `SimpleModel` and `torch_utils` are assumed to be imported
    # at module level — they are not imported inside this guard.
    model = SimpleModel(data.label_encoder, settings.wemb_dim, settings.cemb_dim,
                        settings.hidden_size, settings.num_layers)
    model.to(settings.device)

    # Exercise the loss computation on a single batch only.
    for batch in data.batch_generator():
        model.loss(batch)
        break

    # Manually walk the forward pass: embeddings -> merge -> encoder -> decoders.
    # assumes batches unpack as (((word, wlen), (char, clen)), tasks) — TODO confirm
    ((word, wlen), (char, clen)), tasks = next(data.batch_generator())
    wemb, (cemb, cemb_outs) = model.wemb(word), model.cemb(char, clen, wlen)
    emb = model.merger(wemb, cemb)
    enc_outs = model.encoder(emb, wlen)
    model.pos_decoder.predict(enc_outs, wlen)
    lemma_hyps, _ = model.lemma_decoder.predict_max(
        cemb_outs, clen,
        context=torch_utils.flatten_padded_batch(enc_outs, wlen))
    print(lemma_hyps)

    # NOTE(review): elsewhere get_dev_split is called as (ninsts, split=...);
    # confirm the zero-argument call here is supported by defaults.
    model.evaluate(data.get_dev_split())