def test_existing_dataset(monkeypatch):
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp(monkeypatch)

    # Create a source file whose SHA-256 digest we can predict
    source_path = os.path.join(tdir, 'source.txt')
    with open(source_path, 'w') as f:
        f.write('hello world')

    hasher = hashlib.sha256()
    hasher.update(b'hello world')
    expected_sha = hasher.hexdigest()

    f1 = dataset.ExistingSourceFile(source_path, sha256=expected_sha)

    # The converter simply hands back the source path
    @dataset.fetch_and_convert_dataset([f1], 'ds.txt')
    def existing_dataset(source_paths, target_path):
        return source_paths[0]

    dest = existing_dataset()

    # Check the resulting file
    assert dest == source_path
    with open(dest, 'r') as f:
        assert f.read() == 'hello world'

    test_config._teardown_batchup_temp(tdir)

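# A minimal caching sketch, not behaviour verified by this section: it
# assumes that once the target file exists, the fetcher returned by
# fetch_and_convert_dataset yields its path without invoking the
# converter again. The test name and the converter body here are
# hypothetical.
def test_existing_dataset_cached(monkeypatch):
    from batchup.datasets import dataset

    tdir = test_config._setup_batchup_temp(monkeypatch)

    source_path = os.path.join(tdir, 'source.txt')
    with open(source_path, 'w') as f:
        f.write('hello world')

    n_conversions = []

    f1 = dataset.ExistingSourceFile(source_path)

    @dataset.fetch_and_convert_dataset([f1], 'ds.txt')
    def existing_dataset(source_paths, target_path):
        # Record each invocation, then copy the source to the target
        n_conversions.append(1)
        with open(source_paths[0], 'r') as f_src:
            with open(target_path, 'w') as f_dst:
                f_dst.write(f_src.read())
        return target_path

    dest_a = existing_dataset()
    dest_b = existing_dataset()
    assert dest_a == dest_b
    # Under the caching assumption the converter ran only once
    assert len(n_conversions) == 1

    test_config._teardown_batchup_temp(tdir)
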
def test_ExistingSourceFile_acquire(monkeypatch):
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp(monkeypatch)

    source_path = os.path.join(tdir, 'source.txt')
    with open(source_path, 'w') as f:
        f.write('hello world')

    hasher = hashlib.sha256()
    hasher.update(b'hello world')
    expected_sha = hasher.hexdigest()

    f1 = dataset.ExistingSourceFile(source_path, sha256=expected_sha)
    assert f1.path == source_path

    dest = f1.acquire()
    assert dest == source_path
    assert os.path.exists(dest)
    with open(dest, 'r') as f:
        assert f.read() == 'hello world'

    # clean up - should NOT remove the file
    f1.clean_up()
    assert os.path.exists(dest)

    test_config._teardown_batchup_temp(tdir)

def test_ExistingSourceFile_constructor(monkeypatch):
    from batchup.datasets import dataset

    _patch_config_datadir(monkeypatch)

    f1 = dataset.ExistingSourceFile(
        path=os.path.join('some_place', 'other.txt'))
    assert f1.path == os.path.join('some_place', 'other.txt')
    assert str(f1) == 'file at {}'.format(
        os.path.join('some_place', 'other.txt'))

    # A callable path should also be accepted
    f2 = dataset.ExistingSourceFile(
        path=lambda: os.path.join('some_place', 'other.txt'))
    assert f2.path == os.path.join('some_place', 'other.txt')
    assert str(f2) == 'file at {}'.format(
        os.path.join('some_place', 'other.txt'))

    # Anything other than a string or a callable should be rejected
    with pytest.raises(TypeError):
        dataset.ExistingSourceFile(1)

def test_ExistingSourceFile_acquire_nonexistent(monkeypatch):
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp(monkeypatch)

    # Note: the source file is never created
    source_path = os.path.join(tdir, 'source.txt')

    hasher = hashlib.sha256()
    hasher.update(b'hello world')
    expected_sha = hasher.hexdigest()

    f1 = dataset.ExistingSourceFile(source_path, sha256=expected_sha)
    assert f1.path == source_path

    # Acquiring a file that does not exist should fail
    dest = f1.acquire()
    assert dest is None

    test_config._teardown_batchup_temp(tdir)

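# A companion sketch to the test above, under an explicit assumption
# not confirmed by this section: a SHA-256 mismatch is treated like a
# missing file, so acquire() returns None rather than the path. The
# test name is hypothetical.
def test_ExistingSourceFile_acquire_bad_sha256(monkeypatch):
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp(monkeypatch)

    source_path = os.path.join(tdir, 'source.txt')
    with open(source_path, 'w') as f:
        f.write('hello world')

    # Digest of different content, so verification should fail
    wrong_sha = hashlib.sha256(b'goodbye world').hexdigest()

    f1 = dataset.ExistingSourceFile(source_path, sha256=wrong_sha)
    assert f1.acquire() is None

    test_config._teardown_batchup_temp(tdir)
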
def _syndigits_train_path():
    return os.path.join(get_data_dir('syn_digits'), 'synth_train_32x32.mat')


def _syndigits_test_path():
    return os.path.join(get_data_dir('syn_digits'), 'synth_test_32x32.mat')


def _syndigits_h5_path():
    return os.path.abspath(
        os.path.join(get_data_dir('syn_digits'), 'syn_digits.h5'))


# Callable paths defer the data-directory lookup until acquisition time
_TRAIN_SRC = dataset.ExistingSourceFile(_syndigits_train_path, None)
_TEST_SRC = dataset.ExistingSourceFile(_syndigits_test_path, None)


@dataset.fetch_and_convert_dataset([_TRAIN_SRC, _TEST_SRC],
                                   _syndigits_h5_path)
def fetch_syn_digits(source_paths, target_path):
    train_path, test_path = source_paths

    f_out = tables.open_file(target_path, mode='w')
    g_out = f_out.create_group(f_out.root, 'syn_digits', 'Syn-Digits data')

    # Load in the training data Matlab file
    print('Converting {} to HDF5...'.format(train_path))
    train_X_u8, train_y = svhn._read_svhn_matlab(train_path)
    f_out.create_array(g_out, 'train_X_u8', train_X_u8)
    f_out.create_array(g_out, 'train_y', train_y)
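    # The excerpt stops after the training arrays; what follows is a
    # plausible completion mirroring the training half above. It
    # assumes the converter closes the HDF5 file and returns
    # target_path on success, matching the contract exercised in
    # test_existing_dataset.

    # Load in the test data Matlab file
    print('Converting {} to HDF5...'.format(test_path))
    test_X_u8, test_y = svhn._read_svhn_matlab(test_path)
    f_out.create_array(g_out, 'test_X_u8', test_X_u8)
    f_out.create_array(g_out, 'test_y', test_y)

    f_out.close()

    return target_path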