def test_uneven_batch(self):
    """Streams of unequal length: the shorter stream runs out after the
    first batch; the second batch shrinks to the surviving stream and
    reports its position via the trailing index tensor."""
    seqs = [
        "a b".split(),
        "b b b".split(),
    ]
    splits = self.get_tokenized_splits(seqs, unroll=1)
    stream = iter(BatchBuilder(
        [self.ivec_app_ctor(split) for split in splits],
        len(splits),
    ))

    first = next(stream)
    self.assertEqual(
        first,
        (
            torch.LongTensor([[0], [1]]),
            torch.LongTensor([[1], [1]]),
            torch.stack([
                self.ivec_eetor(" ".join(words[:-1])) for words in seqs
            ]),
            torch.LongTensor([]),
        ),
    )

    second = next(stream)
    self.assertEqual(
        second,
        (
            torch.LongTensor([[1]]),
            torch.LongTensor([[1]]),
            torch.stack([
                self.ivec_eetor(" ".join(words[:-1])) for words in seqs[1:]
            ]),
            torch.LongTensor([1]),
        ),
    )
def test_no_discard_even_lenght_small_batch(self):
    """With discard_h=False and batch size 1, the hidden state is kept
    across streams: the second batch carries continuation index 0.

    NOTE(review): "lenght" in the name is a typo kept for test-discovery
    compatibility.
    """
    seqs = [
        "b b".split(),
        "b c".split(),
    ]
    splits = self.get_tokenized_splits(seqs, unroll=1)
    stream = iter(BatchBuilder(
        [self.ivec_app_ctor(split) for split in splits],
        1,
        discard_h=False,
    ))

    self.assertEqual(
        next(stream),
        (
            torch.LongTensor([[1]]),
            torch.LongTensor([[1]]),
            torch.stack([self.ivec_eetor(" ".join(seqs[0][:-1]))]),
            torch.LongTensor([]),
        ),
    )
    self.assertEqual(
        next(stream),
        (
            torch.LongTensor([[1]]),
            torch.LongTensor([[2]]),
            torch.stack([self.ivec_eetor(" ".join(seqs[1][:-1]))]),
            torch.LongTensor([0]),
        ),
    )
def test_even_batch_multi_sample_len(self):
    """Two 3-token streams with unroll=1 produce exactly two batches."""
    seqs = [
        "a b c".split(),
        "b b b".split(),
    ]
    splits = self.get_tokenized_splits(seqs, unroll=1)
    builder = BatchBuilder(
        [self.ivec_app_ctor(split) for split in splits],
        len(splits),
    )
    produced = list(iter(builder))
    self.assertEqual(len(produced), 2)
def test_reproducibility(self):
    """Iterating the same BatchBuilder twice yields identical epochs."""
    seqs = [
        "a b c".split(),
        "a b".split(),
        "b b b".split(),
    ]
    splits = self.get_tokenized_splits(seqs, unroll=1)
    builder = BatchBuilder(
        [self.ivec_app_ctor(split) for split in splits],
        2,
    )
    first_pass = list(iter(builder))
    second_pass = list(iter(builder))
    self.assertEqual(first_pass, second_pass)
def test_even_batch_single_sample_no_ivecs(self):
    """Without ivec appenders the batch tuple has only (input, target,
    continuation-index) components — no stacked ivecs."""
    seqs = [
        "a b".split(),
        "b b".split(),
    ]
    splits = self.get_tokenized_splits(seqs, unroll=1)
    stream = iter(BatchBuilder(splits, len(splits)))

    self.assertEqual(
        next(stream),
        (
            torch.LongTensor([[0], [1]]),
            torch.LongTensor([[1], [1]]),
            torch.LongTensor([]),
        ),
    )
def test_even_lenght_small_batch_2(self):
    """Four equal-length streams with batch size 2 split into two full
    batches, each covering a consecutive pair of streams.

    NOTE(review): "lenght" in the name is a typo kept for test-discovery
    compatibility.
    """
    seqs = [
        "a b".split(),
        "b b".split(),
        "b c".split(),
        "c a".split(),
    ]
    splits = self.get_tokenized_splits(seqs, unroll=1)
    stream = iter(BatchBuilder(
        [self.ivec_app_ctor(split) for split in splits],
        2,
    ))

    self.assertEqual(
        next(stream),
        (
            torch.LongTensor([[0], [1]]),
            torch.LongTensor([[1], [1]]),
            torch.stack([
                self.ivec_eetor(" ".join(words[:-1])) for words in seqs[0:2]
            ]),
            torch.LongTensor([]),
        ),
    )
    self.assertEqual(
        next(stream),
        (
            torch.LongTensor([[1], [2]]),
            torch.LongTensor([[2], [0]]),
            torch.stack([
                self.ivec_eetor(" ".join(words[:-1])) for words in seqs[2:4]
            ]),
            torch.LongTensor([]),
        ),
    )
def test_insufficient_stream_length(self):
    """A 1-token stream cannot supply an (input, target) pair, so it is
    dropped and the two 3-token streams fill the batch; the second batch
    reports both surviving streams' continuation indices."""
    seqs = [
        "a b c".split(),
        "a".split(),
        "b b b".split(),
    ]
    splits = self.get_tokenized_splits(seqs, unroll=1)
    stream = iter(BatchBuilder(
        [self.ivec_app_ctor(split) for split in splits],
        2,
    ))

    # Only the long streams (index 0 and 2) can contribute samples.
    usable = [seqs[0], seqs[2]]

    self.assertEqual(
        next(stream),
        (
            torch.LongTensor([[0], [1]]),
            torch.LongTensor([[1], [1]]),
            torch.stack([
                self.ivec_eetor(" ".join(words[:-1])) for words in usable
            ]),
            torch.LongTensor([]),
        ),
    )
    self.assertEqual(
        next(stream),
        (
            torch.LongTensor([[1], [1]]),
            torch.LongTensor([[2], [1]]),
            torch.stack([
                self.ivec_eetor(" ".join(words[:-1])) for words in usable
            ]),
            torch.LongTensor([0, 1]),
        ),
    )
# Evaluation script tail. Depends on `args`, `lm`, and the imported helpers
# (smm_ivec_extractor, filelist_to_objects, BatchBuilder, CudaStream,
# ivec_appenders, evaluate, math) defined/imported above this excerpt.
if args.cuda:
    lm.cuda()
print(lm.model)

print("loading SMM iVector extractor ...")
with open(args.ivec_extractor, 'rb') as f:
    ivec_extractor = smm_ivec_extractor.load(f)
if args.ivec_nb_iters is not None:
    # Override the extractor's iteration count from the command line.
    # NOTE(review): pokes a private attribute — confirm no public setter exists.
    ivec_extractor._nb_iters = args.ivec_nb_iters
print(ivec_extractor)

print("preparing data...")


def ts_from_file(f):
    # Wrap one open file as a tokenized split producing windows of
    # lm.model.in_len inputs and args.target_seq_len targets.
    return TokenizedSplitFFBase(
        f,
        lm.vocab,
        lambda seq: TemporalSplits(seq, lm.model.in_len, args.target_seq_len)
    )


tss = filelist_to_objects(args.file_list, ts_from_file)
data = BatchBuilder(tss, args.batch_size, discard_h=not args.concat_articles)
if args.cuda:
    # Move batches to GPU as they are produced.
    data = CudaStream(data)
# Attach iVectors computed in parallel over the batch stream.
data_ivecs = ivec_appenders.ParalelIvecAppender(
    data, ivec_extractor, ivec_extractor.build_translator(lm.vocab)
)

print("evaluating...")
loss = evaluate(lm, data_ivecs, use_ivecs=True)
print('loss {:5.2f} | ppl {:8.2f}'.format(loss, math.exp(loss)))
# Training script tail. NOTE(review): this excerpt begins mid-expression —
# the leading "ts, ivec_extractor)" closes a call whose opening line is above
# this chunk (it mirrors the CheatingIvecAppender construction below; verify
# against the full file).
        ts, ivec_extractor)


print("\ttraining...")


def ivec_ts_from_file(f):
    # Build a temporal-split view of one file and pair it with a "cheating"
    # iVector appender driven by the shared extractor.
    ts = TokenizedSplitFFBase(
        f,
        lm.vocab,
        lambda seq: TemporalSplits(seq, lm.model.in_len, args.target_seq_len)
    )
    return ivec_appenders.CheatingIvecAppender(ts, ivec_extractor)


train_data_ivecs = filelist_to_objects(args.train_list, ivec_ts_from_file)

print("\tvalidation...")
valid_data_ivecs = filelist_to_objects(args.valid_list, ivec_ts_from_file)
# Validation batches are built once; training batches are rebuilt per epoch.
valid_data = BatchBuilder(valid_data_ivecs, args.batch_size,
                          discard_h=not args.concat_articles)
if args.cuda:
    valid_data = CudaStream(valid_data)

print("training...")
lr = args.lr
best_val_loss = None

# NOTE(review): the body of this epoch loop continues beyond this excerpt.
for epoch in range(1, args.epochs + 1):
    # Reshuffle article order each epoch and rebuild the batch stream so
    # batch composition follows the new order.
    random.shuffle(train_data_ivecs)
    train_data = BatchBuilder(train_data_ivecs, args.batch_size,
                              discard_h=not args.concat_articles)
    if args.cuda:
        train_data = CudaStream(train_data)