Example #1
0
class TestWordCharEncoding(unittest.TestCase):
    """Check that word-level and char-level batches agree with each other."""

    def setUp(self):
        """Build a Dataset from the test settings with a fitted encoder."""
        settings = settings_from_file(testpath)
        reader = Reader(settings, settings.input_path)
        encoder = MultiLabelEncoder.from_settings(settings)
        encoder.fit_reader(reader)
        self.data = Dataset(settings, reader, encoder)

    def test_lengths(self):
        """First char id is <bos>; the id at index ``length - 1`` is <eos>."""
        first_batch = next(self.data.batch_generator())
        ((_, _), (char, clen)), _ = first_batch
        char_encoder = self.data.label_encoder.char

        for seq, seq_len in zip(char.t(), clen):
            self.assertEqual(seq[0].item(), char_encoder.get_bos())
            self.assertEqual(seq[seq_len - 1].item(), char_encoder.get_eos())

    def test_word_char(self):
        """Decoded char sequences must spell the word at the same position."""
        encoder = self.data.label_encoder

        for ((word, wlen), (char, clen)), _ in self.data.batch_generator():
            char_rows = char.t()
            # `seen` indexes the char rows; it advances once per checked word
            seen = 0
            expected = 0
            for sent, nwords in zip(word.t(), wlen):
                for word_id in sent[:nwords]:
                    # word as stored in the word encoder's inverse table
                    token = encoder.word.inverse_table[word_id]
                    # drop the leading <bos> and the trailing <eos>
                    char_ids = char_rows[seen][1:clen[seen] - 1].tolist()
                    spelled = ''.join(
                        encoder.char.inverse_transform(char_ids))
                    self.assertEqual(token, spelled)
                    seen += 1
                expected += nwords
            self.assertEqual(seen, expected, "Checked all words")
Example #2
0
def _test_conversion(settings, level='token'):
    """Round-trip check for the 'lemma' task encoder.

    Builds a reader, a fitted label encoder and a dataset from `settings`,
    then for every batch turns the encoded lemma targets back into strings,
    applies the inverse of the lemma preprocessor together with the raw
    token, and asserts that the result equals the raw lemma.

    When `level` is 'token' the stringified targets are flattened by one
    level before being compared; for any other value they are used as
    returned by `stringify`.
    """
    reader = Reader(settings, settings.input_path)
    encoder = MultiLabelEncoder.from_settings(settings)
    encoder.fit_reader(reader)
    dataset = Dataset(settings, reader, encoder)

    lemma_encoder = encoder.tasks['lemma']
    flatten = level == 'token'

    batches = dataset.batch_generator(return_raw=True)
    for (_, tasks), (raw_inp, raw_tasks) in batches:
        target, lengths = tasks['lemma']

        # encoded targets rendered back to strings, one call per row of
        # the transposed target
        decoded = []
        for column, length in zip(target.t().tolist(), lengths.tolist()):
            decoded.append(lemma_encoder.stringify(column, length))
        if flatten:
            decoded = [item for line in decoded for item in line]

        # raw tokens and raw lemmas, flattened across the batch
        raw_tokens = [tok for line in raw_inp for tok in line]
        raw_lemmas = [lem for line in raw_tasks for lem in line['lemma']]

        for pred, token, true in zip(decoded, raw_tokens, raw_lemmas):
            rec = lemma_encoder.preprocessor_fn.inverse_transform(pred, token)
            assert rec == true, (pred, token, true, rec)
Example #3
0
    def test_batch_level(self):
        """With batch size 20, a 5% dev split leaves about 95% of batches."""
        batch_size = 20
        settings = settings_from_file(testpath)
        settings['batch_size'] = batch_size
        reader = Reader(settings, settings.input_path)
        encoder = MultiLabelEncoder.from_settings(settings)
        encoder.fit(line for _, line in reader.readsents())
        data = Dataset(settings, reader, encoder)

        # number of batches before any dev split is taken
        before = sum(1 for _ in data.batch_generator())
        self.assertAlmostEqual(before, self.insts // batch_size, delta=delta)

        # the returned split is not inspected; the assertion below only
        # looks at how many batches `data` yields afterwards
        unused_devset = data.get_dev_split(self.insts, split=0.05)

        after = sum(1 for _ in data.batch_generator())
        self.assertAlmostEqual(before * 0.95, after, delta=delta)
Example #4
0
    print("::: Model :::")
    print()
    print(model)
    print()
    print("::: Model parameters :::")
    print()
    print(sum(p.nelement() for p in model.parameters()))
    print()

    # training
    print("Starting training")
    trainer = Trainer(settings, model, trainset, ninsts)
    try:
        trainer.train_epochs(settings.epochs, dev=devset)
    except KeyboardInterrupt:
        print("Stopping training")
    finally:
        model.eval()

    if testset is not None:
        print("Evaluating model on test set")
        for task in model.evaluate(testset.batch_generator()).values():
            task.print_summary()

    # save model
    fpath, infix = get_fname_infix(settings)
    fpath = model.save(fpath, infix=infix, settings=settings)
    print("Saved best model to: [{}]".format(fpath))

    print("Bye!")
Example #5
0
        return wembs + cembs


def EmbeddingConcat():
    """Return a callable that concatenates two embeddings on the last dim.

    The returned ``func(wemb, cemb)`` yields
    ``torch.cat`` of the two inputs with ``dim=-1``, `wemb` first.
    """
    def func(wemb, cemb):
        pair = (wemb, cemb)
        return torch.cat(pair, dim=-1)

    return func


if __name__ == '__main__':
    from pie.settings import settings_from_file
    from pie.data import Dataset

    settings = settings_from_file('./config.json')
    data = Dataset(settings)
    ((word, wlen), (char, clen)), tasks = next(data.batch_generator())
    print("lemma", tasks['lemma'][0].size(), tasks['lemma'][1])
    print("char", char.size(), clen)
    print("word", word.size(), wlen)

    emb_dim = 20
    wemb = nn.Embedding(len(data.label_encoder.word), emb_dim)
    cemb = RNNEmbedding(len(data.label_encoder.char), emb_dim)
    cnncemb = CNNEmbedding(len(data.label_encoder.char), emb_dim)

    mixer = EmbeddingMixer(20)
    w, (c, _) = wemb(word), cemb(char, clen, wlen)
    output = mixer(w, c)

    output2 = []
    for w, c in zip(w, c):
Example #6
0
class TestDevSplit(unittest.TestCase):
    """Check how taking a dev split changes the batches a Dataset yields."""

    def setUp(self):
        """Create a batch-size-1 Dataset and keep the value returned by fit.

        That value is stored as ``self.insts`` and is used by the tests as
        the number of instances.
        """
        settings = settings_from_file(testpath)
        settings['batch_size'] = 1
        reader = Reader(settings, settings.input_path)
        encoder = MultiLabelEncoder.from_settings(settings)
        self.insts = encoder.fit(line for _, line in reader.readsents())
        self.num_batches = self.insts // settings.batch_size
        self.data = Dataset(settings, reader, encoder)

    def test_split_length(self):
        """The dev split holds about 5% of the batches seen beforehand."""
        total = sum(1 for _ in self.data.batch_generator())
        dev = sum(
            1 for _ in self.data.get_dev_split(self.insts, split=0.05))

        self.assertAlmostEqual(dev, total * 0.05, delta=delta)

    def test_remaining(self):
        """Dev split plus remaining batches add up to the original count."""
        before = sum(1 for _ in self.data.batch_generator())

        # batch size is 1, so one batch per instance
        self.assertEqual(before, self.insts)
        self.assertEqual(before, self.num_batches)

        devset = self.data.get_dev_split(self.insts, split=0.05)

        after = sum(1 for _ in self.data.batch_generator())

        # FIXME (kept from the original): both checks below use a tolerance
        # five times wider than the other tests in this class
        wide_delta = delta * 5
        self.assertAlmostEqual(len(devset) + after, before, delta=wide_delta)
        self.assertAlmostEqual(before * 0.95, after, delta=wide_delta)

    def test_batch_level(self):
        """With batch size 20, a 5% dev split leaves about 95% of batches."""
        batch_size = 20
        settings = settings_from_file(testpath)
        settings['batch_size'] = batch_size
        reader = Reader(settings, settings.input_path)
        encoder = MultiLabelEncoder.from_settings(settings)
        encoder.fit(line for _, line in reader.readsents())
        data = Dataset(settings, reader, encoder)

        before = sum(1 for _ in data.batch_generator())
        self.assertAlmostEqual(before, self.insts // batch_size, delta=delta)

        # the returned split is not inspected; the assertion below only
        # looks at how many batches `data` yields afterwards
        unused_devset = data.get_dev_split(self.insts, split=0.05)

        after = sum(1 for _ in data.batch_generator())
        self.assertAlmostEqual(before * 0.95, after, delta=delta)
Example #7
0
    parser.add_argument('--buffer_size', type=int, default=100000)
    parser.add_argument('--device', default='cpu')
    parser.add_argument('--model_info', action='store_true')
    parser.add_argument('--full', action='store_true')
    args = parser.parse_args()

    model = BaseModel.load(args.model_path).to(args.device)
    if args.model_info:
        print(model)

    if hasattr(model, '_settings'):  # new models should all have _settings
        settings = model._settings
    elif args.settings:
        with utils.shutup():
            settings = settings_from_file(args.settings)
    else:
        with utils.shutup():
            settings = load_default_settings()

    # overwrite defaults
    settings.batch_size = args.batch_size
    settings.buffer_size = args.buffer_size
    settings.device = args.device

    reader = Reader(settings, *args.test_path)
    dataset = Dataset(settings, reader, model.label_encoder)
    dataset = device_wrapper(list(dataset.batch_generator()), args.device)

    for task in model.evaluate(dataset).values():
        task.print_summary(full=args.full)