def test_group_names_DataGeneratorDisk(): iu.resize_folder('images/', 'images1/', image_size_dst=(100, 100), overwrite=True) gp = gen_params.copy() gp.inputs = ['filename'] gp.group_names = ['images/'] gp.data_path = '' g = gr.DataGeneratorDisk(ids, **gp) assert gen.get_sizes(g[0]) == '([array<2,224,224,3>], [array<2,1>])' gp.group_names = ['images/', 'images1/'] g = gr.DataGeneratorDisk(ids, **gp) assert gen.get_sizes( g[0]) == '([array<2,224,224,3>, array<2,100,100,3>], [array<2,1>])' gp.group_names = [['images/'], ['images1/']] sizes = [] for i in range(100): g = gr.DataGeneratorDisk(ids, **gp) sizes.append(g[0][0][0].shape[1]) assert np.unique(sizes).shape[0] > 1 shutil.rmtree('images1/')
def test_DataGeneratorDisk(): g = gr.DataGeneratorDisk(ids, **gen_params) g.inputs = ['filename', 'filename'] assert gen.get_sizes( g[0]) == '([array<2,224,224,3>, array<2,224,224,3>], [array<2,1>])' g.inputs_df = ['score', 'score'] g.inputs = [] g.outputs = [] assert gen.get_sizes(g[0]) == '([array<2,2>], [])' g.inputs_df = [['score'], ['score', 'score']] assert gen.get_sizes(g[0]) == '([array<2,1>, array<2,2>], [])' g.inputs_df = [] g.outputs = ['score'] assert gen.get_sizes(g[0]) == '([], [array<2,1>])' g.outputs = ['score', ['score']] with pytest.raises(AssertionError): g[0] g.outputs = [['score'], ['score']] assert gen.get_sizes(g[0]) == '([], [array<2,1>, array<2,1>])'
def test_init_DataGeneratorDisk(): g = gr.DataGeneratorDisk(ids, **gen_params) assert isinstance(g[0], tuple) assert isinstance(g[0][0], list) assert isinstance(g[0][1], list) assert (gen.get_sizes(g[0]) == '([array<2,224,224,3>], [array<2,1>])') assert (np.all(g[0][1][0] == np.array([[1], [2]])))
def test_multi_return_proc_fn_DataGeneratorDisk(): gen_params_local = gen_params.copy() gen_params_local.process_fn = lambda im: [im, im + 1] g = gr.DataGeneratorDisk(ids.copy(), **gen_params_local) assert np.array_equal(g[0][0][0], g[0][0][1] - 1) assert np.array_equal(g[0][1][0], np.array([[1], [2]]))
def test_read_fn_DataGeneratorDisk(): import os def read_fn(name, g): # g is the parent generator object # name is the image name read from the DataFrame image_path = os.path.join(g.data_path, name) return iu.resize_image(iu.read_image(image_path), (100, 100)) g = gr.DataGeneratorDisk(ids, read_fn=read_fn, **gen_params) gen.get_sizes(g[0]) == '([array<2,100,100,3>], [array<2,1>])'
def test_accessor_function_numpy_array(): ids = pd.DataFrame( dict(a=range(10), b=list(np.random.randint(0, 10, (10, 2, 2))))) gen_params = Munch(batch_size=4, data_path=None, input_shape=None, inputs_df=lambda ids: [ids[['a']].values], outputs=['b'], shuffle=False, fixed_batches=True) # test using a function to access data from ids # test if data in ids items can be ndarrays g = gr.DataGeneratorDisk(ids, **gen_params) assert gen.get_sizes(g[0]) == '([array<4,1>], [array<4,2,2>])' # test if double inputs works gen_params.outputs = ['a', 'a'] g = gr.DataGeneratorDisk(ids, **gen_params) assert gen.get_sizes(g[0]) == '([array<4,1>], [array<4,2>])'
def test_multi_return_and_read_fn_DataGeneratorDisk(): def read_fn(*args): g = args[1] score = np.float32(g.ids[g.ids.filename == args[0]].score) return np.ones((3, 3)) * score gen_params_local = gen_params.copy() gen_params_local.batch_size = 3 gen_params_local.read_fn = read_fn gen_params_local.process_fn = lambda im: [im + 1, im + 2] g = gr.DataGeneratorDisk(ids, **gen_params_local) assert np.array_equal(g[0][0][0], g[0][0][1] - 1) assert np.array_equal(g[0][0][1][0, ...], np.ones((3, 3)) * 3.)
def test_generator_len_with_group_by_DataGeneratorDisk(): size = 10 ids_defa = pd.read_csv(u'ids.csv', encoding='latin-1') fnames = np.concatenate([ids_defa.filename.values] * 3)[:size] ids = pd.DataFrame( dict(cats=['cat{}'.format(i) for i in range(size)], dogs=['dog{}'.format(i) for i in range(size)], image_name=fnames, group=[i // 4 for i in range(10)])) gen_params = Munch(batch_size=1, inputs=['image_name'], outputs=['dogs'], data_path='images', group_by='group', shuffle=False, fixed_batches=True) for batch_size, len_g in zip(range(1, 5), [10, 5, 5, 3]): gen_params.batch_size = batch_size g = gr.DataGeneratorDisk(ids, **gen_params) assert len(g) == len_g a = g.ids_index.groupby('batch_index').group_by.mean().values b = g.ids_index.groupby('batch_index').group_by.last().values assert np.array_equal(a, b) gen_params.group_by = None for batch_size, len_g in zip(range(1, 5), [10, 5, 3, 2]): gen_params.batch_size = batch_size g = gr.DataGeneratorDisk(ids, **gen_params) assert len(g) == len_g gen_params.fixed_batches = False for batch_size, len_g in zip(range(1, 5), [10, 5, 4, 3]): gen_params.batch_size = batch_size g = gr.DataGeneratorDisk(ids, **gen_params) assert len(g) == len_g
def test_ids_fn(): gen_params_local = gen_params.copy() ids_local = ids.copy() def ids_fn(): ids_local.score = -ids_local.score return ids_local gen_params_local.ids_fn = ids_fn gen_params_local.batch_size = 4 g = gr.DataGeneratorDisk(ids, **gen_params_local) x = g[0][1][0] g.on_epoch_end() y = g[0][1][0] assert np.array_equal(-x, y)
def test_process_args_DataGeneratorDisk(): def preproc(im, arg): return np.zeros(1) + arg gen_params_local = gen_params.copy() gen_params_local.process_fn = preproc gen_params_local.process_args = {'filename': 'filename_args'} gen_params_local.batch_size = 4 ids_local = ids.copy() ids_local['filename_args'] = range(len(ids_local)) g = gr.DataGeneratorDisk(ids_local, **gen_params_local) x = g[0][0] assert np.array_equal(np.squeeze(x[0].T), np.arange(gen_params_local.batch_size))
def test_random_group_DataGeneratorDisk(): iu.resize_folder('images/', 'base/images100/', image_size_dst=(100, 100), overwrite=True) iu.resize_folder('images/', 'base/images50/', image_size_dst=(50, 50), overwrite=True) gp = gen_params.copy() gp.inputs = ['filename'] gp.data_path = '' gp.group_names = ['base'] gp.random_group = True g = gr.DataGeneratorDisk(ids, **gp) assert np.array_equal( np.unique([x[0][0].shape[1] for i in range(100) for x in g]), [50, 100]) shutil.rmtree('base/')
def test_basics_deterministic_shuffle_consistency_group_by(): ids = pd.DataFrame( dict(a=range(10), b=list(range(9, -1, -1)), c=np.arange(10) < 5)) gen_params = Munch(batch_size=4, data_path=None, input_shape=None, inputs_df=['a'], outputs=['b'], shuffle=False, fixed_batches=True) # check `fixed_batches` switch g = gr.DataGeneratorDisk(ids, **gen_params) assert np.array_equal( [gen.get_sizes(x) for x in g], ['([array<4,1>], [array<4,1>])', '([array<4,1>], [array<4,1>])']) assert np.array_equal(g[0][0][0].squeeze(), range(4)) gen_params.fixed_batches = False g = gr.DataGeneratorDisk(ids, **gen_params) assert np.array_equal([gen.get_sizes(x) for x in g], [ '([array<4,1>], [array<4,1>])', '([array<4,1>], [array<4,1>])', '([array<2,1>], [array<2,1>])' ]) assert np.array_equal(g[2][0][0].squeeze(), [8, 9]) # check randomized gen_params.shuffle = True gen_params.fixed_batches = False # maintain g = gr.DataGeneratorDisk(ids, **gen_params) # check if it returns all items data = list(zip(*list(g))) data0 = np.concatenate([l[0] for l in data[0]], axis=0).squeeze() data1 = np.concatenate([l[0] for l in data[1]], axis=0).squeeze() assert np.array_equal(np.sort(data0), np.arange(10)) assert np.array_equal(np.sort(data1), np.arange(10)) # check if randomization is applied, consistently num_randoms0 = 0 num_randoms1 = 0 for i in range(100): g = gr.DataGeneratorDisk(ids, **gen_params) data = list(zip(*list(g))) data0 = np.concatenate([l[0] for l in data[0]], axis=0).squeeze() data1 = np.concatenate([l[0] for l in data[1]], axis=0).squeeze() # check consistency ids_ = ids.copy() ids_.index = ids_.a np.array_equal(ids_.loc[data0].b, data1) num_randoms0 += not np.array_equal(data0, np.arange(10)) num_randoms1 += not np.array_equal(data1, np.arange(10)) # check randomization, at least once assert num_randoms0 assert num_randoms0 # check deterministic gen_params.shuffle = True gen_params.deterministic = np.random.randint(100) assert np.array_equal( gr.DataGeneratorDisk(ids, **gen_params)[0], gr.DataGeneratorDisk(ids, **gen_params)[0]) gen_params.update(fixed_batches=False, shuffle=True, group_by='c', deterministic=False) g = gr.DataGeneratorDisk(ids, **gen_params) data = list(zip(*list(g))) data = [[l[0] for l in d] for d in data] data_conc = [np.concatenate(d, axis=0) for d in data] # returns all df = pd.DataFrame(np.concatenate(data_conc, axis=1), columns=('a', 'b')) x = df.merge(ids, on='a') assert np.all(x.b_x == x.b_y) # each batch returns a single group ids_ = ids.copy() ids_.index = ids_.a for i, d in enumerate(data[0]): assert ids_.loc[d[0]].c.unique().shape == (1, )