Example #1
0
    def test_batch_level(self):
        """Check get_dev_split at batch_size 20 (not just batch_size 1).

        Builds a fresh Dataset over the same input file with batch_size 20,
        counts its batches, carves off a 5% dev split, and verifies the
        remaining batch count shrank by ~5%.
        """
        settings = settings_from_file(testpath)
        settings['batch_size'] = 20
        reader = Reader(settings, settings.input_path)
        label_encoder = MultiLabelEncoder.from_settings(settings)
        label_encoder.fit(line for _, line in reader.readsents())
        data = Dataset(settings, reader, label_encoder)

        pre_batches = 0
        for batch in data.batch_generator():
            pre_batches += 1

        # self.insts was counted in setUp over the same input file
        self.assertAlmostEqual(pre_batches, self.insts // 20, delta=delta)

        # Called for its side effect on `data` (it removes the split
        # instances from the training stream); the returned devset is
        # not needed here, so don't bind it.
        data.get_dev_split(self.insts, split=0.05)

        post_batches = 0
        for batch in data.batch_generator():
            post_batches += 1

        self.assertAlmostEqual(pre_batches * 0.95, post_batches, delta=delta)
Example #2
0
        # Report the configured target tasks (name, target flag, level,
        # vocabulary size) for quick visual inspection.
        print()
        print("::: Target tasks :::")
        print()
        for task, le in label_encoder.tasks.items():
            print("- {:<15} target={:<6} level={:<6} vocab={:<6}"
                  .format(task, le.target, le.level, len(le)))
        print()

    # Training data reuses the already-fitted label encoder.
    trainset = Dataset(settings, reader, label_encoder)

    # Dev data: an explicit dev file takes precedence over carving a
    # split out of the training set; otherwise warn that training cannot
    # be monitored.
    devset = None
    if settings.dev_path:
        devset = Dataset(settings, Reader(settings, settings.dev_path), label_encoder)
        devset = devset.get_batches()
    elif settings.dev_split > 0:
        devset = trainset.get_dev_split(ninsts, split=settings.dev_split)
        # Adjust the instance count for the instances moved to the dev
        # split (one batch holds settings.batch_size instances).
        ninsts = ninsts - (len(devset) * settings.batch_size)
    else:
        logging.warning("No devset: cannot monitor/optimize training")

    # Optional held-out test set.
    testset = None
    if settings.test_path:
        testset = Dataset(settings, Reader(settings, settings.test_path), label_encoder)

    # model
    model = SimpleModel(trainset.label_encoder,
                        settings.wemb_dim, settings.cemb_dim, settings.hidden_size,
                        settings.num_layers, dropout=settings.dropout,
                        word_dropout=settings.word_dropout,
                        cell=settings.cell, cemb_type=settings.cemb_type,
                        include_self=settings.include_self, pos_crf=settings.pos_crf)
Example #3
0
class TestDevSplit(unittest.TestCase):
    """Tests for Dataset.get_dev_split and its effect on batch counts."""

    def setUp(self):
        cfg = settings_from_file(testpath)
        cfg['batch_size'] = 1
        src = Reader(cfg, cfg.input_path)
        encoder = MultiLabelEncoder.from_settings(cfg)
        ninsts = encoder.fit(sent for _, sent in src.readsents())
        self.insts = ninsts
        self.num_batches = ninsts // cfg.batch_size
        self.data = Dataset(cfg, src, encoder)

    def test_split_length(self):
        """The dev split should hold roughly 5% of the total batches."""
        total_batches = sum(1 for _ in self.data.batch_generator())
        dev_batches = sum(
            1 for _ in self.data.get_dev_split(self.insts, split=0.05))
        self.assertAlmostEqual(dev_batches, total_batches * 0.05, delta=delta)

    def test_remaining(self):
        """Dev batches plus remaining batches should add up to the original."""
        before = sum(1 for _ in self.data.batch_generator())

        # with batch_size 1, one batch per instance
        self.assertEqual(before, self.insts)
        self.assertEqual(before, self.num_batches)

        devset = self.data.get_dev_split(self.insts, split=0.05)

        after = sum(1 for _ in self.data.batch_generator())

        # FIXME
        self.assertAlmostEqual(len(devset) + after,
                               before,
                               delta=delta * 5)
        self.assertAlmostEqual(before * 0.95,
                               after,
                               delta=delta * 5)

    def test_batch_level(self):
        """Splitting also behaves correctly when a batch groups 20 instances."""
        cfg = settings_from_file(testpath)
        cfg['batch_size'] = 20
        src = Reader(cfg, cfg.input_path)
        encoder = MultiLabelEncoder.from_settings(cfg)
        encoder.fit(sent for _, sent in src.readsents())
        data = Dataset(cfg, src, encoder)

        before = sum(1 for _ in data.batch_generator())

        self.assertAlmostEqual(before, self.insts // 20, delta=delta)

        data.get_dev_split(self.insts, split=0.05)

        after = sum(1 for _ in data.batch_generator())

        self.assertAlmostEqual(before * 0.95, after, delta=delta)
Example #4
0
if __name__ == '__main__':
    # Smoke-test driver: build reader/encoder/dataset/model from a local
    # config file and push one batch through each model component.
    from pie.settings import settings_from_file
    from pie.data import Dataset, Reader, MultiLabelEncoder

    settings = settings_from_file('./config.json')
    reader = Reader(settings, settings.input_path)
    label_encoder = MultiLabelEncoder.from_settings(settings)
    label_encoder.fit_reader(reader)
    data = Dataset(settings, reader, label_encoder)
    model = SimpleModel(data.label_encoder, settings.wemb_dim,
                        settings.cemb_dim, settings.hidden_size,
                        settings.num_layers)
    model.to(settings.device)

    # Exercise the loss computation on a single batch only.
    for batch in data.batch_generator():
        model.loss(batch)
        break
    # Batch layout (as unpacked here): ((word, wlen), (char, clen)), tasks —
    # presumably padded id tensors with their lengths; confirm against Dataset.
    ((word, wlen), (char, clen)), tasks = next(data.batch_generator())

    # Run the pipeline stage by stage: embed words/chars, merge, encode,
    # then decode POS and lemmas from the encoder outputs.
    wemb, (cemb, cemb_outs) = model.wemb(word), model.cemb(char, clen, wlen)
    emb = model.merger(wemb, cemb)
    enc_outs = model.encoder(emb, wlen)
    model.pos_decoder.predict(enc_outs, wlen)
    lemma_hyps, _ = model.lemma_decoder.predict_max(
        cemb_outs,
        clen,
        context=torch_utils.flatten_padded_batch(enc_outs, wlen))
    print(lemma_hyps)

    # NOTE(review): elsewhere get_dev_split is called as
    # get_dev_split(ninsts, split=...); confirm the no-argument call is valid.
    model.evaluate(data.get_dev_split())