import numpy as np
import torch

from sockeye import constants as C
from sockeye import data_io
from sockeye import vocab
from sockeye.test_utils import tmp_digits_dataset


# Variant using the single-target API: batches are data_io.Batch objects with
# length-ratio labels, converted to numpy via asnumpy().
def test_get_training_data_iters():
    train_line_count = 100
    train_line_count_empty = 0
    train_max_length = 30
    dev_line_count = 20
    dev_max_length = 30
    expected_mean = 1.0
    expected_std = 0.0
    test_line_count = 20
    test_line_count_empty = 0
    test_max_length = 30
    batch_size = 5
    with tmp_digits_dataset("tmp_corpus",
                            train_line_count, train_line_count_empty, train_max_length - C.SPACE_FOR_XOS,
                            dev_line_count, dev_max_length - C.SPACE_FOR_XOS,
                            test_line_count, test_line_count_empty, test_max_length - C.SPACE_FOR_XOS) as data:
        # tmp common vocab
        vcb = vocab.build_from_paths([data['train_source'], data['train_target']])

        train_iter, val_iter, config_data, data_info = data_io.get_training_data_iters(
            sources=[data['train_source']],
            target=data['train_target'],
            validation_sources=[data['dev_source']],
            validation_target=data['dev_target'],
            source_vocabs=[vcb],
            target_vocab=vcb,
            source_vocab_paths=[None],
            target_vocab_path=None,
            shared_vocab=True,
            batch_size=batch_size,
            batch_by_words=False,
            batch_num_devices=1,
            max_seq_len_source=train_max_length,
            max_seq_len_target=train_max_length,
            bucketing=True,
            bucket_width=10)

        assert isinstance(train_iter, data_io.ParallelSampleIter)
        assert isinstance(val_iter, data_io.ParallelSampleIter)
        assert isinstance(config_data, data_io.DataConfig)
        assert data_info.sources == [data['train_source']]
        assert data_info.target == data['train_target']
        assert data_info.source_vocabs == [None]
        assert data_info.target_vocab is None
        assert config_data.data_statistics.max_observed_len_source == train_max_length
        assert config_data.data_statistics.max_observed_len_target == train_max_length
        assert np.isclose(config_data.data_statistics.length_ratio_mean, expected_mean)
        assert np.isclose(config_data.data_statistics.length_ratio_std, expected_std)
        assert train_iter.batch_size == batch_size
        assert val_iter.batch_size == batch_size
        assert train_iter.default_bucket_key == (train_max_length, train_max_length)
        assert val_iter.default_bucket_key == (dev_max_length, dev_max_length)
        assert train_iter.dtype == 'float32'

        # test some batches
        bos_id = vcb[C.BOS_SYMBOL]
        eos_id = vcb[C.EOS_SYMBOL]
        expected_first_target_symbols = np.full((batch_size,), bos_id, dtype='float32')
        for epoch in range(2):
            while train_iter.iter_next():
                batch = train_iter.next()
                assert isinstance(batch, data_io.Batch)
                source = batch.source.asnumpy()
                target = batch.target.asnumpy()
                label = batch.labels[C.TARGET_LABEL_NAME].asnumpy()
                length_ratio_label = batch.labels[C.LENRATIO_LABEL_NAME].asnumpy()
                assert source.shape[0] == target.shape[0] == label.shape[0] == batch_size
                # each source sequence contains one EOS symbol
                assert np.sum(source == eos_id) == batch_size
                # target first symbol should be BOS
                assert np.array_equal(target[:, 0], expected_first_target_symbols)
                # label first symbol should be 2nd target symbol
                assert np.array_equal(label[:, 0], target[:, 1])
                # each label sequence contains one EOS symbol
                assert np.sum(label == eos_id) == batch_size
            train_iter.reset()
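# The assertions above pin default_bucket_key to (train_max_length, train_max_length).
# A minimal sketch of why, assuming equal-width parallel buckets cut at multiples of
# bucket_width up to the max lengths (the helper below is illustrative only, not
# Sockeye's actual bucketing code, which additionally scales the target side by the
# observed length ratio):
def _illustrative_parallel_buckets(max_len_source: int, max_len_target: int, width: int):
    """Hypothetical helper: equal-width (source, target) buckets up to the max lengths."""
    return [(min(step, max_len_source), min(step, max_len_target))
            for step in range(width, max(max_len_source, max_len_target) + width, width)]


# With max_seq_len 30 and bucket_width 10, the largest (default) bucket is (30, 30).
assert _illustrative_parallel_buckets(30, 30, 10) == [(10, 10), (20, 20), (30, 30)]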
# Variant using the older single-source API (vocab_source/vocab_target, fill_up),
# with MXNet-style batch.data/batch.label access.
def test_get_training_data_iters():
    train_line_count = 100
    train_max_length = 30
    dev_line_count = 20
    dev_max_length = 30
    expected_mean = 1.0
    expected_std = 0.0
    test_line_count = 20
    test_line_count_empty = 0
    test_max_length = 30
    batch_size = 5
    with tmp_digits_dataset("tmp_corpus",
                            train_line_count, train_max_length,
                            dev_line_count, dev_max_length,
                            test_line_count, test_line_count_empty, test_max_length) as data:
        # tmp common vocab
        vcb = vocab.build_from_paths([data['source'], data['target']])

        train_iter, val_iter, config_data = data_io.get_training_data_iters(
            data['source'], data['target'],
            data['validation_source'], data['validation_target'],
            vocab_source=vcb,
            vocab_target=vcb,
            vocab_source_path=None,
            vocab_target_path=None,
            shared_vocab=True,
            batch_size=batch_size,
            batch_by_words=False,
            batch_num_devices=1,
            fill_up="replicate",
            max_seq_len_source=train_max_length,
            max_seq_len_target=train_max_length,
            bucketing=True,
            bucket_width=10)

        assert isinstance(train_iter, data_io.ParallelSampleIter)
        assert isinstance(val_iter, data_io.ParallelSampleIter)
        assert isinstance(config_data, data_io.DataConfig)
        assert config_data.source == data['source']
        assert config_data.target == data['target']
        assert config_data.vocab_source is None
        assert config_data.vocab_target is None
        assert config_data.data_statistics.max_observed_len_source == train_max_length - 1
        assert config_data.data_statistics.max_observed_len_target == train_max_length
        assert np.isclose(config_data.data_statistics.length_ratio_mean, expected_mean)
        assert np.isclose(config_data.data_statistics.length_ratio_std, expected_std)
        assert train_iter.batch_size == batch_size
        assert val_iter.batch_size == batch_size
        assert train_iter.default_bucket_key == (train_max_length, train_max_length)
        assert val_iter.default_bucket_key == (dev_max_length, dev_max_length)
        assert train_iter.dtype == 'float32'

        # test some batches
        bos_id = vcb[C.BOS_SYMBOL]
        expected_first_target_symbols = np.full((batch_size,), bos_id, dtype='float32')
        for epoch in range(2):
            while train_iter.iter_next():
                batch = train_iter.next()
                assert len(batch.data) == 2
                assert len(batch.label) == 1
                assert batch.bucket_key in train_iter.buckets
                source = batch.data[0].asnumpy()
                target = batch.data[1].asnumpy()
                label = batch.label[0].asnumpy()
                assert source.shape[0] == target.shape[0] == label.shape[0] == batch_size
                # target first symbol should be BOS
                assert np.array_equal(target[:, 0], expected_first_target_symbols)
                # label first symbol should be 2nd target symbol
                assert np.array_equal(label[:, 0], target[:, 1])
                # each label sequence contains one EOS symbol
                assert np.sum(label == vcb[C.EOS_SYMBOL]) == batch_size
            train_iter.reset()
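# The length_ratio_mean == 1.0 / length_ratio_std == 0.0 assertions hold because the
# digits corpus is a copy task: every target line has exactly as many tokens as its
# source line. A minimal sketch of the statistic under that assumption (the per-line
# lengths below are made up):
_source_lengths = np.array([7.0, 12.0, 30.0, 3.0])
_target_lengths = _source_lengths.copy()  # copy task: identical per-line lengths
_ratios = _target_lengths / _source_lengths
assert np.isclose(_ratios.mean(), 1.0) and np.isclose(_ratios.std(), 0.0)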
# Variant using the multi-source API (sources=[...]) with a separate data_info
# return value, still with MXNet-style batch.data/batch.label access.
def test_get_training_data_iters():
    train_line_count = 100
    train_max_length = 30
    dev_line_count = 20
    dev_max_length = 30
    expected_mean = 1.0
    expected_std = 0.0
    test_line_count = 20
    test_line_count_empty = 0
    test_max_length = 30
    batch_size = 5
    with tmp_digits_dataset("tmp_corpus",
                            train_line_count, train_max_length - C.SPACE_FOR_XOS,
                            dev_line_count, dev_max_length - C.SPACE_FOR_XOS,
                            test_line_count, test_line_count_empty, test_max_length - C.SPACE_FOR_XOS) as data:
        # tmp common vocab
        vcb = vocab.build_from_paths([data['source'], data['target']])

        train_iter, val_iter, config_data, data_info = data_io.get_training_data_iters(
            sources=[data['source']],
            target=data['target'],
            validation_sources=[data['validation_source']],
            validation_target=data['validation_target'],
            source_vocabs=[vcb],
            target_vocab=vcb,
            source_vocab_paths=[None],
            target_vocab_path=None,
            shared_vocab=True,
            batch_size=batch_size,
            batch_by_words=False,
            batch_num_devices=1,
            fill_up="replicate",
            max_seq_len_source=train_max_length,
            max_seq_len_target=train_max_length,
            bucketing=True,
            bucket_width=10)

        assert isinstance(train_iter, data_io.ParallelSampleIter)
        assert isinstance(val_iter, data_io.ParallelSampleIter)
        assert isinstance(config_data, data_io.DataConfig)
        assert data_info.sources == [data['source']]
        assert data_info.target == data['target']
        assert data_info.source_vocabs == [None]
        assert data_info.target_vocab is None
        assert config_data.data_statistics.max_observed_len_source == train_max_length
        assert config_data.data_statistics.max_observed_len_target == train_max_length
        assert np.isclose(config_data.data_statistics.length_ratio_mean, expected_mean)
        assert np.isclose(config_data.data_statistics.length_ratio_std, expected_std)
        assert train_iter.batch_size == batch_size
        assert val_iter.batch_size == batch_size
        assert train_iter.default_bucket_key == (train_max_length, train_max_length)
        assert val_iter.default_bucket_key == (dev_max_length, dev_max_length)
        assert train_iter.dtype == 'float32'

        # test some batches
        bos_id = vcb[C.BOS_SYMBOL]
        eos_id = vcb[C.EOS_SYMBOL]
        expected_first_target_symbols = np.full((batch_size,), bos_id, dtype='float32')
        for epoch in range(2):
            while train_iter.iter_next():
                batch = train_iter.next()
                assert len(batch.data) == 2
                assert len(batch.label) == 1
                assert batch.bucket_key in train_iter.buckets
                source = batch.data[0].asnumpy()
                target = batch.data[1].asnumpy()
                label = batch.label[0].asnumpy()
                assert source.shape[0] == target.shape[0] == label.shape[0] == batch_size
                # each source sequence contains one EOS symbol
                assert np.sum(source == eos_id) == batch_size
                # target first symbol should be BOS
                assert np.array_equal(target[:, 0], expected_first_target_symbols)
                # label first symbol should be 2nd target symbol
                assert np.array_equal(label[:, 0], target[:, 1])
                # each label sequence contains one EOS symbol
                assert np.sum(label == eos_id) == batch_size
            train_iter.reset()
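# The label[:, 0] == target[:, 1] assertion reflects the standard teacher-forcing
# setup: the label sequence is the target input shifted left by one position. A toy
# sketch with made-up token ids (1 = BOS, 2 = EOS, 0 = PAD are assumptions here):
_toy_target = np.array([[1, 5, 6, 2]])  # target input: <s> a b </s>
_toy_label = np.array([[5, 6, 2, 0]])   # shifted left by one, padded
assert np.array_equal(_toy_label[:, 0], _toy_target[:, 1])
assert np.array_equal(_toy_label[:, :-1], _toy_target[:, 1:])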
# PyTorch-based variant: factored sources/targets (targets=[...], target_vocabs=[...])
# and torch tensors of shape (batch, length, num_factors) in the batch.
def test_get_training_data_iters():
    train_line_count = 100
    train_line_count_empty = 0
    train_max_length = 30
    dev_line_count = 20
    dev_max_length = 30
    expected_mean = 1.0
    expected_std = 0.0
    test_line_count = 20
    test_line_count_empty = 0
    test_max_length = 30
    batch_size = 5
    num_source_factors = num_target_factors = 1
    with tmp_digits_dataset("tmp_corpus",
                            train_line_count, train_line_count_empty, train_max_length - C.SPACE_FOR_XOS,
                            dev_line_count, dev_max_length - C.SPACE_FOR_XOS,
                            test_line_count, test_line_count_empty, test_max_length - C.SPACE_FOR_XOS) as data:
        # tmp common vocab
        vcb = vocab.build_from_paths([data['train_source'], data['train_target']])

        train_iter, val_iter, config_data, data_info = data_io.get_training_data_iters(
            sources=[data['train_source']],
            targets=[data['train_target']],
            validation_sources=[data['dev_source']],
            validation_targets=[data['dev_target']],
            source_vocabs=[vcb],
            target_vocabs=[vcb],
            source_vocab_paths=[None],
            target_vocab_paths=[None],
            shared_vocab=True,
            batch_size=batch_size,
            batch_type=C.BATCH_TYPE_SENTENCE,
            max_seq_len_source=train_max_length,
            max_seq_len_target=train_max_length,
            bucketing=True,
            bucket_width=10)

        assert isinstance(train_iter, data_io.ParallelSampleIter)
        assert isinstance(val_iter, data_io.ParallelSampleIter)
        assert isinstance(config_data, data_io.DataConfig)
        assert data_info.sources == [data['train_source']]
        assert data_info.targets == [data['train_target']]
        assert data_info.source_vocabs == [None]
        assert data_info.target_vocabs == [None]
        assert config_data.data_statistics.max_observed_len_source == train_max_length
        assert config_data.data_statistics.max_observed_len_target == train_max_length
        assert np.isclose(config_data.data_statistics.length_ratio_mean, expected_mean)
        assert np.isclose(config_data.data_statistics.length_ratio_std, expected_std)
        assert train_iter.batch_size == batch_size
        assert val_iter.batch_size == batch_size

        # test some batches
        bos_id = vcb[C.BOS_SYMBOL]
        eos_id = vcb[C.EOS_SYMBOL]
        expected_first_target_symbols = torch.full((batch_size, 1), bos_id, dtype=torch.int32)
        for epoch in range(2):
            while train_iter.iter_next():
                batch = train_iter.next()
                assert isinstance(batch, data_io.Batch)
                source = batch.source
                target = batch.target
                label = batch.labels[C.TARGET_LABEL_NAME]  # TODO: still 2-shape: (batch, length)
                length_ratio_label = batch.labels[C.LENRATIO_LABEL_NAME]
                assert source.shape[0] == target.shape[0] == label.shape[0] == batch_size
                assert source.shape[2] == target.shape[2] == num_source_factors == num_target_factors
                # each source sequence contains one EOS symbol
                assert torch.sum(source == eos_id) == batch_size
                # target first symbol should be BOS
                assert torch.equal(target[:, 0], expected_first_target_symbols)
                # label first symbol should be 2nd target symbol
                assert torch.equal(label[:, 0], target[:, 1, 0])
                # each label sequence contains one EOS symbol
                assert torch.sum(label == eos_id) == batch_size
            train_iter.reset()
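# All four variants rely on tmp_digits_dataset from sockeye.test_utils. A rough,
# hypothetical sketch of what such a fixture might look like, inferred purely from
# the call sites above (the name, file layout, and the shared source/target file are
# assumptions, not Sockeye's actual implementation):
import os
import random
import tempfile
from contextlib import contextmanager


@contextmanager
def toy_digits_dataset(prefix: str,
                       train_lines: int, train_empty: int, train_max_len: int,
                       dev_lines: int, dev_max_len: int,
                       test_lines: int, test_empty: int, test_max_len: int):
    """Write random digit-sequence copy corpora and yield a dict of file paths."""
    def write_split(path: str, lines: int, empty: int, max_len: int) -> None:
        with open(path, 'w') as f:
            for _ in range(lines - empty):
                tokens = [str(random.randint(0, 9)) for _ in range(random.randint(1, max_len))]
                f.write(' '.join(tokens) + '\n')
            for _ in range(empty):
                f.write('\n')

    with tempfile.TemporaryDirectory(prefix=prefix) as work_dir:
        data = {}
        for split, lines, empty, max_len in (('train', train_lines, train_empty, train_max_len),
                                             ('dev', dev_lines, 0, dev_max_len),
                                             ('test', test_lines, test_empty, test_max_len)):
            path = os.path.join(work_dir, '{}.txt'.format(split))
            write_split(path, lines, empty, max_len)
            # Copy task: source and target share one file, which is what makes the
            # length-ratio statistics exactly mean=1.0, std=0.0 in the tests above.
            data['{}_source'.format(split)] = path
            data['{}_target'.format(split)] = path
        yield data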