예제 #1
0
    def test_uneven_batch(self):
        """Batch two sequences of different lengths, one row per sequence.

        The first expected batch has a row for each sequence; the second has
        a single row whose third element is built from the longer sequence
        only.
        """
        sentences = [text.split() for text in ("a b", "b b b")]
        splits = self.get_tokenized_splits(sentences, unroll=1)
        # The returned tokens are not used below; the call itself is kept.
        self.get_tokens(sentences)

        def stacked_ivecs(seqs):
            # One ivec_eetor result per sequence, computed from all but its
            # last word.
            return torch.stack(
                [self.ivec_eetor(" ".join(seq[:-1])) for seq in seqs])

        appenders = [self.ivec_app_ctor(split) for split in splits]
        batch_iter = iter(BatchBuilder(appenders, len(splits)))

        # Each entry: rows of the first tensor, rows of the second tensor,
        # sequences feeding the stacked third element, values of the fourth.
        # NOTE(review): presumably (inputs, targets, ivecs, hidden-state
        # indices) -- confirm against BatchBuilder.
        expected_batches = [
            ([[0], [1]], [[1], [1]], sentences, []),
            ([[1]], [[1]], sentences[1:], [1]),
        ]
        for first, second, ivec_seqs, fourth in expected_batches:
            batch = next(batch_iter)
            expected = (
                torch.LongTensor(first),
                torch.LongTensor(second),
                stacked_ivecs(ivec_seqs),
                torch.LongTensor(fourth),
            )
            self.assertEqual(batch, expected)
예제 #2
0
    def test_no_discard_uneven_length_small_batch(self):
        """Batch three uneven sequences two at a time with discard_h=False.

        Three batches are expected: two with two rows each and a final one
        with a single row.
        """
        sentences = [text.split() for text in ("a b c", "a b", "b b b")]
        splits = self.get_tokenized_splits(sentences, unroll=1)
        # The returned tokens are not used below; the call itself is kept.
        self.get_tokens(sentences)

        def stacked_ivecs(seqs):
            # One ivec_eetor result per sequence, computed from all but its
            # last word.
            return torch.stack(
                [self.ivec_eetor(" ".join(seq[:-1])) for seq in seqs])

        appenders = [self.ivec_app_ctor(split) for split in splits]
        batch_iter = iter(BatchBuilder(appenders, 2, discard_h=False))

        # Each entry: rows of the first tensor, rows of the second tensor,
        # sequences feeding the stacked third element, values of the fourth.
        # NOTE(review): presumably (inputs, targets, ivecs, hidden-state
        # indices) -- confirm against BatchBuilder.
        expected_batches = [
            ([[0], [0]], [[1], [1]], [sentences[0], sentences[1]], []),
            ([[1], [1]], [[2], [1]], [sentences[0], sentences[2]], [0, 1]),
            ([[1]], [[1]], [sentences[2]], [1]),
        ]
        for first, second, ivec_seqs, fourth in expected_batches:
            batch = next(batch_iter)
            expected = (
                torch.LongTensor(first),
                torch.LongTensor(second),
                stacked_ivecs(ivec_seqs),
                torch.LongTensor(fourth),
            )
            self.assertEqual(batch, expected)
예제 #3
0
    def test_even_batch_multi_sample_len(self):
        """Two three-word sequences batched together must yield two batches."""
        sentences = [text.split() for text in ("a b c", "b b b")]
        splits = self.get_tokenized_splits(sentences, unroll=1)
        # The returned tokens are not used below; the call itself is kept.
        self.get_tokens(sentences)

        appenders = [self.ivec_app_ctor(split) for split in splits]
        builder = BatchBuilder(appenders, len(splits))

        # Exhaust the builder and only check how many batches it produced.
        produced = list(iter(builder))
        self.assertEqual(len(produced), 2)
예제 #4
0
    def test_reproducibility(self):
        """Iterating the same BatchBuilder twice must give equal batch lists."""
        sentences = [text.split() for text in ("a b c", "a b", "b b b")]
        splits = self.get_tokenized_splits(sentences, unroll=1)
        # The returned tokens are not used below; the call itself is kept.
        self.get_tokens(sentences)

        appenders = [self.ivec_app_ctor(split) for split in splits]
        builder = BatchBuilder(appenders, 2)

        # Two full passes over the very same builder object, in order.
        passes = [list(iter(builder)) for _ in range(2)]

        self.assertEqual(passes[0], passes[1])
예제 #5
0
    def test_even_batch_single_sample_no_ivecs(self):
        """Batch two two-word sequences handed to BatchBuilder unwrapped.

        No ivec appender is put around the tokenized splits, and the
        expected batch is a 3-tuple of LongTensors.
        """
        sentences = [text.split() for text in ("a b", "b b")]
        splits = self.get_tokenized_splits(sentences, unroll=1)
        # The returned tokens are not used below; the call itself is kept.
        self.get_tokens(sentences)

        batch_iter = iter(BatchBuilder(splits, len(splits)))

        batch = next(batch_iter)
        # Rows of the first tensor, rows of the second, values of the third.
        expected = tuple(
            torch.LongTensor(rows) for rows in ([[0], [1]], [[1], [1]], []))

        self.assertEqual(batch, expected)
예제 #6
0
    def test_even_lenght_small_batch_2(self):
        """Batch four two-word sequences with a batch size of two.

        The first expected batch is built from sequences 0-1 and the second
        from sequences 2-3; the fourth element is empty in both.
        """
        sentences = [text.split() for text in ("a b", "b b", "b c", "c a")]
        splits = self.get_tokenized_splits(sentences, unroll=1)
        # The returned tokens are not used below; the call itself is kept.
        self.get_tokens(sentences)

        def stacked_ivecs(seqs):
            # One ivec_eetor result per sequence, computed from all but its
            # last word.
            return torch.stack(
                [self.ivec_eetor(" ".join(seq[:-1])) for seq in seqs])

        appenders = [self.ivec_app_ctor(split) for split in splits]
        batch_iter = iter(BatchBuilder(appenders, 2))

        # Each entry: rows of the first tensor, rows of the second tensor,
        # sequences feeding the stacked third element, values of the fourth.
        expected_batches = [
            ([[0], [1]], [[1], [1]], sentences[0:2], []),
            ([[1], [2]], [[2], [0]], sentences[2:4], []),
        ]
        for first, second, ivec_seqs, fourth in expected_batches:
            batch = next(batch_iter)
            expected = (
                torch.LongTensor(first),
                torch.LongTensor(second),
                stacked_ivecs(ivec_seqs),
                torch.LongTensor(fourth),
            )
            self.assertEqual(batch, expected)
예제 #7
0
    def test_insufficient_stream_length(self):
        """Batch three sequences, one of which consists of a single word.

        Both expected batches are built from the first and third sequence
        only; the one-word sequence does not appear in any expectation.
        """
        sentences = [text.split() for text in ("a b c", "a", "b b b")]
        splits = self.get_tokenized_splits(sentences, unroll=1)
        # The returned tokens are not used below; the call itself is kept.
        self.get_tokens(sentences)

        def stacked_ivecs(seqs):
            # One ivec_eetor result per sequence, computed from all but its
            # last word.
            return torch.stack(
                [self.ivec_eetor(" ".join(seq[:-1])) for seq in seqs])

        appenders = [self.ivec_app_ctor(split) for split in splits]
        batch_iter = iter(BatchBuilder(appenders, 2))

        # Each entry: rows of the first tensor, rows of the second tensor,
        # sequences feeding the stacked third element, values of the fourth.
        expected_batches = [
            ([[0], [1]], [[1], [1]], [sentences[0], sentences[2]], []),
            ([[1], [1]], [[2], [1]], [sentences[0], sentences[2]], [0, 1]),
        ]
        for first, second, ivec_seqs, fourth in expected_batches:
            batch = next(batch_iter)
            expected = (
                torch.LongTensor(first),
                torch.LongTensor(second),
                stacked_ivecs(ivec_seqs),
                torch.LongTensor(fourth),
            )
            self.assertEqual(batch, expected)
예제 #8
0
    # Mandatory option: path of the serialized model to evaluate.
    parser.add_argument('--load',
                        type=str,
                        required=True,
                        help='where to load a model from')
    args = parser.parse_args()
    # Echo the parsed options.
    print(args)

    init_seeds(args.seed, args.cuda)

    print("loading model...")
    # NOTE: torch.load unpickles the file, so --load should only ever point
    # at a trusted checkpoint.
    # NOTE(review): no map_location is passed, so tensors are restored to the
    # device they were saved from -- confirm this is fine when args.cuda is
    # not set.
    lm = torch.load(args.load)
    if args.cuda:
        lm.cuda()
    print(lm.model)

    print("preparing data...")

    def temp_splits_from_fn(fn):
        """Return the TemporalSplits built over the tokens read from ``fn``.

        Tokens are read with the loaded model's vocabulary and
        ``randomize=False``; the split sizes come from ``lm.model.in_len``
        and ``args.target_seq_len``.
        """
        return TemporalSplits(
            tokens_from_file(fn, lm.vocab, randomize=False),
            lm.model.in_len,
            args.target_seq_len,
        )

    # Presumably applies temp_splits_from_fn to every file named in
    # args.file_list -- confirm in filelist_to_objects.
    tss = filelist_to_objects(args.file_list, temp_splits_from_fn)
    # discard_h is the negation of args.concat_articles.
    data = BatchBuilder(tss,
                        args.batch_size,
                        discard_h=not args.concat_articles)
    if args.cuda:
        # Wrap the batch source in CudaStream when CUDA was requested.
        data = CudaStream(data)

    # Evaluate without ivecs; report the loss and exp(loss) as perplexity.
    loss = evaluate(lm, data, use_ivecs=False)
    print('loss {:5.2f} | ppl {:8.2f}'.format(loss, math.exp(loss)))
예제 #9
0
    print("loading model...")
    # NOTE: torch.load unpickles the file, so args.load should only ever
    # point at a trusted checkpoint.
    # NOTE(review): no map_location is passed, so tensors are restored to the
    # device they were saved from -- confirm this is fine when args.cuda is
    # not set.
    lm = torch.load(args.load)
    if args.cuda:
        lm.cuda()
    print(lm.model)

    print("preparing data...")

    def temp_splits_from_fn(fn):
        """Return the TemporalSplits built over the tokens read from ``fn``.

        Tokens are read with the loaded model's vocabulary and
        ``randomize=False``; the split sizes come from ``lm.model.in_len``
        and ``args.target_seq_len``.
        """
        return TemporalSplits(
            tokens_from_file(fn, lm.vocab, randomize=False),
            lm.model.in_len,
            args.target_seq_len,
        )

    print("\ttraining...")
    # Presumably applies temp_splits_from_fn to every file named in
    # args.train_list -- confirm in filelist_to_objects.
    train_tss = filelist_to_objects(args.train_list, temp_splits_from_fn)
    # discard_h is the negation of args.concat_articles.
    train_data = BatchBuilder(train_tss,
                              args.batch_size,
                              discard_h=not args.concat_articles)
    if args.cuda:
        # Wrap the batch source in CudaStream when CUDA was requested.
        train_data = CudaStream(train_data)

    print("\tvalidation...")
    # The validation data is built exactly like the training data, only
    # from args.valid_list.
    valid_tss = filelist_to_objects(args.valid_list, temp_splits_from_fn)
    valid_data = BatchBuilder(valid_tss,
                              args.batch_size,
                              discard_h=not args.concat_articles)
    if args.cuda:
        valid_data = CudaStream(valid_data)

    print("training...")
    # Initial learning rate and a placeholder for the best validation loss;
    # both are set up here for the code that follows.
    lr = args.lr
    best_val_loss = None
예제 #10
0
    print(ivec_extractor)
    # NOTE(review): translator is not used in the visible part of this
    # script -- confirm it is needed further down.
    translator = ivec_extractor.build_translator(lm.vocab)

    print("preparing data...")
    # Factory wrapping a tokenized split in a HistoryIvecAppender that uses
    # the ivec extractor above.
    ivec_app_creator = lambda ts: ivec_appenders.HistoryIvecAppender(
        ts, ivec_extractor)

    print("\ttraining...")
    # Only the tokenized splits are prepared here; no training BatchBuilder
    # is created in this part.
    train_tss = filelist_to_tokenized_splits(args.train_list, lm.vocab,
                                             args.target_seq_len)

    print("\tvalidation...")
    valid_tss = filelist_to_tokenized_splits(args.valid_list, lm.vocab,
                                             args.target_seq_len)
    # Every validation split is wrapped by ivec_app_creator before batching;
    # discard_h is the negation of args.concat_articles.
    # NOTE(review): check that the training splits receive the same ivec
    # wrapping where their BatchBuilder is constructed.
    valid_data = BatchBuilder([ivec_app_creator(ts) for ts in valid_tss],
                              args.batch_size,
                              discard_h=not args.concat_articles)

    if args.cuda:
        # Wrap the batch source in CudaStream when CUDA was requested.
        valid_data = CudaStream(valid_data)

    print("training...")
    # Initial learning rate and a placeholder for the best validation loss;
    # both are set up here for the code that follows.
    lr = args.lr
    best_val_loss = None

    for epoch in range(1, args.epochs + 1):
        random.shuffle(train_tss)
        train_data = BatchBuilder(train_tss,
                                  args.batch_size,
                                  discard_h=not args.concat_articles)
        if args.cuda: