Beispiel #1
0
    def test_uneven_batch(self):
        """Two streams of unequal length batched together.

        The first batch carries both streams; the second batch carries
        only the surviving (longer) stream and its final tensor flags
        the stream that ended.
        """
        seqs = ["a b".split(), "b b b".split()]
        splits = self.get_tokenized_splits(seqs, unroll=1)

        stream = iter(
            BatchBuilder([self.ivec_app_ctor(split) for split in splits],
                         len(splits)))

        got = next(stream)
        want = (
            torch.LongTensor([[0], [1]]),
            torch.LongTensor([[1], [1]]),
            torch.stack([
                self.ivec_eetor(" ".join(words[:-1])) for words in seqs
            ]),
            torch.LongTensor([]),
        )
        self.assertEqual(got, want)

        got = next(stream)
        want = (
            torch.LongTensor([[1]]),
            torch.LongTensor([[1]]),
            torch.stack([
                self.ivec_eetor(" ".join(words[:-1])) for words in seqs[1:]
            ]),
            torch.LongTensor([1]),
        )
        self.assertEqual(got, want)
Beispiel #2
0
    def test_no_discard_even_lenght_small_batch(self):
        """Batch size 1 over two streams with discard_h=False.

        Each stream is consumed in its own batch; the second batch's
        final tensor is [0].
        """
        seqs = ["b b".split(), "b c".split()]
        splits = self.get_tokenized_splits(seqs, unroll=1)

        stream = iter(
            BatchBuilder([self.ivec_app_ctor(split) for split in splits],
                         1,
                         discard_h=False))

        got = next(stream)
        want = (
            torch.LongTensor([[1]]),
            torch.LongTensor([[1]]),
            torch.stack([self.ivec_eetor(" ".join(seqs[0][:-1]))]),
            torch.LongTensor([]),
        )
        self.assertEqual(got, want)

        got = next(stream)
        want = (
            torch.LongTensor([[1]]),
            torch.LongTensor([[2]]),
            torch.stack([self.ivec_eetor(" ".join(seqs[1][:-1]))]),
            torch.LongTensor([0]),
        )
        self.assertEqual(got, want)
Beispiel #3
0
    def test_even_batch_multi_sample_len(self):
        """Two 3-word streams at unroll 1 yield exactly two batches."""
        seqs = ["a b c".split(), "b b b".split()]
        splits = self.get_tokenized_splits(seqs, unroll=1)

        builder = BatchBuilder(
            [self.ivec_app_ctor(split) for split in splits], len(splits))

        nb_batches = sum(1 for _ in iter(builder))
        self.assertEqual(nb_batches, 2)
Beispiel #4
0
    def test_reproducibility(self):
        """Iterating the same BatchBuilder twice yields identical epochs."""
        seqs = ["a b c".split(), "a b".split(), "b b b".split()]
        splits = self.get_tokenized_splits(seqs, unroll=1)

        builder = BatchBuilder(
            [self.ivec_app_ctor(split) for split in splits], 2)

        first_pass = list(iter(builder))
        second_pass = list(iter(builder))
        self.assertEqual(first_pass, second_pass)
Beispiel #5
0
    def test_even_batch_single_sample_no_ivecs(self):
        """Without ivec appenders, a batch is just (inputs, targets, ends)."""
        seqs = ["a b".split(), "b b".split()]
        splits = self.get_tokenized_splits(seqs, unroll=1)

        stream = iter(BatchBuilder(splits, len(splits)))

        got = next(stream)
        want = (
            torch.LongTensor([[0], [1]]),
            torch.LongTensor([[1], [1]]),
            torch.LongTensor([]),
        )
        self.assertEqual(got, want)
Beispiel #6
0
    def test_even_lenght_small_batch_2(self):
        """Four equal-length streams with batch size 2 give two full batches."""
        seqs = [
            "a b".split(),
            "b b".split(),
            "b c".split(),
            "c a".split(),
        ]
        splits = self.get_tokenized_splits(seqs, unroll=1)

        stream = iter(
            BatchBuilder([self.ivec_app_ctor(split) for split in splits], 2))

        def stacked_ivecs(pair):
            # iVector expectation for a pair of streams (all but last word).
            return torch.stack([
                self.ivec_eetor(" ".join(words[:-1])) for words in pair
            ])

        self.assertEqual(
            next(stream),
            (
                torch.LongTensor([[0], [1]]),
                torch.LongTensor([[1], [1]]),
                stacked_ivecs(seqs[0:2]),
                torch.LongTensor([]),
            ),
        )

        self.assertEqual(
            next(stream),
            (
                torch.LongTensor([[1], [2]]),
                torch.LongTensor([[2], [0]]),
                stacked_ivecs(seqs[2:4]),
                torch.LongTensor([]),
            ),
        )
Beispiel #7
0
    def test_insufficient_stream_length(self):
        """A single-word stream is skipped; the other two fill the batch.

        The second batch's final tensor is [0, 1].
        """
        seqs = ["a b c".split(), "a".split(), "b b b".split()]
        splits = self.get_tokenized_splits(seqs, unroll=1)

        stream = iter(
            BatchBuilder([self.ivec_app_ctor(split) for split in splits], 2))

        kept = [seqs[0], seqs[2]]

        got = next(stream)
        want = (
            torch.LongTensor([[0], [1]]),
            torch.LongTensor([[1], [1]]),
            torch.stack([
                self.ivec_eetor(" ".join(words[:-1])) for words in kept
            ]),
            torch.LongTensor([]),
        )
        self.assertEqual(got, want)

        got = next(stream)
        want = (
            torch.LongTensor([[1], [1]]),
            torch.LongTensor([[2], [1]]),
            torch.stack([
                self.ivec_eetor(" ".join(words[:-1])) for words in kept
            ]),
            torch.LongTensor([0, 1]),
        )
        self.assertEqual(got, want)
Beispiel #8
0
    # NOTE(review): this is the tail of a function whose header lies outside
    # this view; code is kept byte-identical, only comments are added.
    if args.cuda:
        lm.cuda()
    print(lm.model)

    # Load the SMM iVector extractor from the pickled/saved file.
    print("loading SMM iVector extractor ...")
    with open(args.ivec_extractor, 'rb') as f:
        ivec_extractor = smm_ivec_extractor.load(f)
    # Optionally override the extractor's iteration count from the CLI.
    if args.ivec_nb_iters is not None:
        ivec_extractor._nb_iters = args.ivec_nb_iters
    print(ivec_extractor)

    print("preparing data...")

    def ts_from_file(f):
        # Wrap one open file as a tokenized split; TemporalSplits windows
        # the token sequence by the model's input length and the CLI's
        # target sequence length.
        return TokenizedSplitFFBase(
            f, lm.vocab, lambda seq: TemporalSplits(seq, lm.model.in_len, args.
                                                    target_seq_len))

    tss = filelist_to_objects(args.file_list, ts_from_file)
    # Hidden-state discarding is disabled when articles are concatenated.
    data = BatchBuilder(tss,
                        args.batch_size,
                        discard_h=not args.concat_articles)
    if args.cuda:
        data = CudaStream(data)
    # 'Paralel' appender: presumably computes iVectors concurrently with
    # batch consumption — TODO confirm against ivec_appenders.
    data_ivecs = ivec_appenders.ParalelIvecAppender(
        data, ivec_extractor, ivec_extractor.build_translator(lm.vocab))

    print("evaluating...")
    loss = evaluate(lm, data_ivecs, use_ivecs=True)
    print('loss {:5.2f} | ppl {:8.2f}'.format(loss, math.exp(loss)))
Beispiel #9
0
        ts, ivec_extractor)  # NOTE(review): tail of a call started above this view

    print("\ttraining...")

    def ivec_ts_from_file(f):
        # Build a tokenized split over the file (windowed by model input
        # length and CLI target length), then wrap it in the 'Cheating'
        # appender — presumably the iVector is derived from the same text
        # it decorates; confirm against ivec_appenders.
        ts = TokenizedSplitFFBase(
            f, lm.vocab, lambda seq: TemporalSplits(seq, lm.model.in_len, args.
                                                    target_seq_len))
        return ivec_appenders.CheatingIvecAppender(ts, ivec_extractor)

    train_data_ivecs = filelist_to_objects(args.train_list, ivec_ts_from_file)

    print("\tvalidation...")
    valid_data_ivecs = filelist_to_objects(args.valid_list, ivec_ts_from_file)
    # Hidden-state discarding is disabled when articles are concatenated.
    valid_data = BatchBuilder(valid_data_ivecs,
                              args.batch_size,
                              discard_h=not args.concat_articles)
    if args.cuda:
        valid_data = CudaStream(valid_data)

    print("training...")
    lr = args.lr
    best_val_loss = None

    for epoch in range(1, args.epochs + 1):
        # Reshuffle stream order each epoch; the batcher is rebuilt so the
        # new order takes effect.
        random.shuffle(train_data_ivecs)
        train_data = BatchBuilder(train_data_ivecs,
                                  args.batch_size,
                                  discard_h=not args.concat_articles)
        if args.cuda:
            train_data = CudaStream(train_data)