def test_batch_data_alt(samples):
    """Exercise ``corpus.batch_data`` with ``subsamples=False`` on raw samples.

    The samples are data that are:
    - not in sub-batches
    - not numpy arrays (and not casted)
    - variable shape
    - variable type
    """

    def _assert_same(expected, actual):
        # try a numeric comparison first; use plain equality when
        # np.allclose raises TypeError for the given operands
        try:
            assert np.allclose(expected, actual)
        except TypeError:
            assert expected == actual

    # no batch_size given: yielded items are compared one-to-one with samples
    num_seen = 0
    unbatched = corpus.batch_data(samples, subsamples=False)
    for expected, actual in zip(samples, unbatched):
        _assert_same(expected, actual)
        num_seen += 1
    assert num_seen == len(samples)

    # batch sizes from 1 up to one more than the number of samples
    for batch_size in range(1, len(samples) + 2):
        offset = 0
        batches = corpus.batch_data(
            samples, batch_size=batch_size, subsamples=False)
        for act_batch in batches:
            ex_batch = samples[offset:offset + batch_size]
            assert len(ex_batch) == len(act_batch)
            for expected, actual in zip(ex_batch, act_batch):
                _assert_same(expected, actual)
            offset += len(act_batch)
        # the batches together must account for every sample
        assert offset == len(samples)
def test_batch_data_alt(samples):
    """Compare ``corpus.batch_data`` output against slices of ``samples``.

    The data used here are:
    - not in sub-batches
    - not numpy arrays (and not casted)
    - variable shape
    - variable type
    """
    # pass 1: no batch_size, so each yielded item is matched with one sample
    n_checked = 0
    yielded = corpus.batch_data(samples, subsamples=False)
    for want, got in zip(samples, yielded):
        try:
            assert np.allclose(want, got)
        except TypeError:
            # np.allclose rejected the operands; compare them directly
            assert want == got
        n_checked += 1
    assert n_checked == len(samples)

    # pass 2: explicit batch sizes, including one larger than the data
    for batch_size in range(1, len(samples) + 2):
        cursor = 0
        for got_batch in corpus.batch_data(
                samples, batch_size=batch_size, subsamples=False):
            want_batch = samples[cursor:cursor + batch_size]
            assert len(want_batch) == len(got_batch)
            for want, got in zip(want_batch, got_batch):
                try:
                    assert np.allclose(want, got)
                except TypeError:
                    assert want == got
            cursor += len(got_batch)
        assert cursor == len(samples)
def test_batch_data_numpy(samples):
    """Check ``corpus.batch_data`` when batching along each axis.

    Samples are numpy data of the same shape and type.
    """
    num_seen = 0
    unbatched = corpus.batch_data(samples, subsamples=False)
    for expected, actual in zip(samples, unbatched):
        try:
            assert np.allclose(expected, actual)
        except TypeError:
            # np.allclose rejected the operands; compare as nested lists
            assert expected.tolist() == actual.tolist()
        num_seen += 1
    assert num_seen == len(samples)

    num_dims = len(samples.shape)
    for axis in range(-1, num_dims):
        # a full slice on every axis; the entry at ``axis`` is replaced by an
        # integer below to pull a single sample out of a batch
        selector = [slice(None)] * num_dims
        for batch_size in range(1, len(samples) + 2):
            offset = 0
            batches = corpus.batch_data(
                samples, batch_size=batch_size, subsamples=False, axis=axis)
            for act_batch in batches:
                ex_batch = samples[offset:offset + batch_size]
                assert len(ex_batch) == act_batch.shape[axis]
                for samp_idx, expected in enumerate(ex_batch):
                    selector[axis] = samp_idx
                    actual = act_batch[tuple(selector)]
                    try:
                        assert np.allclose(expected, actual)
                    except TypeError:
                        assert (
                            expected.flatten().tolist()
                            == actual.flatten().tolist())
                offset += len(ex_batch)
            assert offset == len(samples)
def test_batch_data_tups_alt(samples):
    """Check ``corpus.batch_data`` on samples made of several sub-samples.

    ``corpus.batch_data`` is called without the ``subsamples`` argument. The
    first pass omits ``batch_size`` and expects tuples matching each sample;
    the second pass checks every batch size from 1 to ``len(samples) + 1``.
    """
    batch_start = 0
    for ex_samp, act_samp in zip(samples, corpus.batch_data(samples)):
        assert isinstance(act_samp, tuple)
        for ex_sub_samp, act_sub_samp in zip(ex_samp, act_samp):
            try:
                assert np.allclose(ex_sub_samp, act_sub_samp)
            except TypeError:
                # np.allclose rejected the operands; compare them directly
                assert ex_sub_samp == act_sub_samp
        batch_start += 1
    assert batch_start == len(samples)
    for batch_size in range(1, len(samples) + 2):
        batch_start = 0
        for act_batch in corpus.batch_data(samples, batch_size=batch_size):
            ex_batch = samples[batch_start:batch_start + batch_size]
            assert len(ex_batch[0]) == len(act_batch)  # same num sub-batches
            for sub_batch_idx, act_sub_batch in enumerate(act_batch):
                assert len(act_sub_batch) == len(ex_batch)
                for sub_samp_idx, act_sub_samp in enumerate(act_sub_batch):
                    # the batch is indexed [sub-batch][sample], the expected
                    # data [sample][sub-sample]
                    ex_sub_samp = ex_batch[sub_samp_idx][sub_batch_idx]
                    try:
                        assert np.allclose(ex_sub_samp, act_sub_samp)
                    except TypeError:
                        # compare the sub-samples under test, not the
                        # leftover loop variables from the first pass
                        assert ex_sub_samp == act_sub_samp
            batch_start += len(ex_batch)
        assert batch_start == len(samples)
def test_batch_data_numpy(samples):
    """Check ``corpus.batch_data`` on numpy samples for every batch axis.

    Samples are numpy data of the same shape and type. The first pass omits
    ``batch_size``; the second pass checks each axis from ``-1`` up to the
    last axis of ``samples`` with batch sizes from 1 to ``len(samples) + 1``.
    """
    batch_start = 0
    for ex_samp, act_samp in zip(
            samples, corpus.batch_data(samples, subsamples=False)):
        try:
            assert np.allclose(ex_samp, act_samp)
        except TypeError:
            # np.allclose rejected the operands; compare as nested lists
            assert ex_samp.tolist() == act_samp.tolist()
        batch_start += 1
    assert batch_start == len(samples)
    for axis in range(-1, len(samples.shape)):
        # full slice on every axis; the entry at ``axis`` is set to an integer
        # below to select one sample from a batch
        batch_slice = [slice(None)] * len(samples.shape)
        for batch_size in range(1, len(samples) + 2):
            batch_start = 0
            for act_batch in corpus.batch_data(
                    samples, batch_size=batch_size, subsamples=False,
                    axis=axis):
                ex_batch = samples[batch_start:batch_start + batch_size]
                assert len(ex_batch) == act_batch.shape[axis]
                for samp_idx in range(len(ex_batch)):
                    batch_slice[axis] = samp_idx
                    # index with a tuple: a list holding slices is not
                    # treated as one-index-per-axis by current NumPy
                    act_samp = act_batch[tuple(batch_slice)]
                    try:
                        assert np.allclose(ex_batch[samp_idx], act_samp)
                    except TypeError:
                        assert (
                            ex_batch[samp_idx].flatten().tolist()
                            == act_samp.flatten().tolist())
                batch_start += len(ex_batch)
            assert batch_start == len(samples)
def test_batch_data_tups_numpy(samples):
    """Check ``corpus.batch_data`` on samples that are tuples of numpy data.

    Batches are requested for every combination of per-sub-sample axes, and
    additionally with the plain integer axes ``0`` and ``-1``, which are
    applied to every sub-batch.
    """
    input_shapes = tuple(sample.shape for sample in samples[0])
    axes = tuple(product(*(range(len(shape) + 1) for shape in input_shapes)))
    axes += (0, -1)
    batch_start = 0
    for ex_samp, act_samp in zip(samples, corpus.batch_data(samples)):
        assert isinstance(act_samp, tuple)
        for ex_sub_samp, act_sub_samp in zip(ex_samp, act_samp):
            assert ex_sub_samp.shape == act_sub_samp.shape
            try:
                assert np.allclose(ex_sub_samp, act_sub_samp)
            except TypeError:
                # np.allclose rejected the operands; compare as flat lists
                assert (
                    ex_sub_samp.flatten().tolist()
                    == act_sub_samp.flatten().tolist()
                )
        batch_start += 1
    assert batch_start == len(samples)
    for axis in axes:
        # one index list per sub-batch, with one more entry than the
        # sub-sample has dimensions
        batch_slices = tuple(
            [slice(None)] * (len(shape) + 1) for shape in input_shapes)
        if isinstance(axis, int):
            # a single integer axis is reused for every sub-batch
            axis_iter = repeat(axis)
        else:
            axis_iter = axis
        for batch_size in range(1, len(samples) + 2):
            batch_start = 0
            for act_batch in corpus.batch_data(
                    samples, subsamples=True, batch_size=batch_size,
                    axis=axis):
                assert len(act_batch) == len(input_shapes)
                assert isinstance(act_batch, tuple)
                ex_batch = samples[batch_start:batch_start + batch_size]
                for sub_batch_idx, sub_axis in zip(
                        range(len(input_shapes)), axis_iter):
                    act_sub_batch = act_batch[sub_batch_idx]
                    assert len(ex_batch) == act_sub_batch.shape[sub_axis]
                    sub_batch_slice = batch_slices[sub_batch_idx]
                    for sub_samp_idx in range(len(ex_batch)):
                        sub_batch_slice[sub_axis] = sub_samp_idx
                        ex_sub_samp = ex_batch[sub_samp_idx, sub_batch_idx]
                        act_sub_samp = act_sub_batch[tuple(sub_batch_slice)]
                        # the == 2 is to account for the case when
                        # ex_sub_samp are going to be np.generics (as
                        # opposed to arrays).
                        # sub_axis is the failure message; keeping it inside
                        # the parentheses would build an always-true tuple
                        assert (len(input_shapes) == 2) or (
                            ex_sub_samp.shape == act_sub_samp.shape
                        ), sub_axis
                        try:
                            assert np.allclose(ex_sub_samp, act_sub_samp)
                        except TypeError:
                            assert (
                                ex_sub_samp.flatten().tolist()
                                == act_sub_samp.flatten().tolist()
                            )
                batch_start += len(ex_batch)
            assert batch_start == len(samples)
def test_batch_data_tups_numpy(samples):
    """Check ``corpus.batch_data`` on samples that are tuples of numpy data.

    Batches are requested for every combination of per-sub-sample axes, and
    additionally with the plain integer axes ``0`` and ``-1``, which are
    applied to every sub-batch.
    """
    input_shapes = tuple(sample.shape for sample in samples[0])
    axes = tuple(product(*(range(len(shape) + 1) for shape in input_shapes)))
    axes += (0, -1)
    batch_start = 0
    for ex_samp, act_samp in zip(samples, corpus.batch_data(samples)):
        assert isinstance(act_samp, tuple)
        for ex_sub_samp, act_sub_samp in zip(ex_samp, act_samp):
            assert ex_sub_samp.shape == act_sub_samp.shape
            try:
                assert np.allclose(ex_sub_samp, act_sub_samp)
            except TypeError:
                # np.allclose rejected the operands; compare as flat lists
                assert (ex_sub_samp.flatten().tolist() ==
                        act_sub_samp.flatten().tolist())
        batch_start += 1
    assert batch_start == len(samples)
    for axis in axes:
        # one index list per sub-batch, with one more entry than the
        # sub-sample has dimensions
        batch_slices = tuple([slice(None)] * (len(shape) + 1)
                             for shape in input_shapes)
        if isinstance(axis, int):
            # a single integer axis is reused for every sub-batch
            axis_iter = repeat(axis)
        else:
            axis_iter = axis
        for batch_size in range(1, len(samples) + 2):
            batch_start = 0
            for act_batch in corpus.batch_data(samples, subsamples=True,
                                               batch_size=batch_size,
                                               axis=axis):
                assert len(act_batch) == len(input_shapes)
                assert isinstance(act_batch, tuple)
                ex_batch = samples[batch_start:batch_start + batch_size]
                for sub_batch_idx, sub_axis in zip(range(len(input_shapes)),
                                                   axis_iter):
                    act_sub_batch = act_batch[sub_batch_idx]
                    assert len(ex_batch) == act_sub_batch.shape[sub_axis]
                    sub_batch_slice = batch_slices[sub_batch_idx]
                    for sub_samp_idx in range(len(ex_batch)):
                        sub_batch_slice[sub_axis] = sub_samp_idx
                        ex_sub_samp = ex_batch[sub_samp_idx, sub_batch_idx]
                        # index with a tuple: a list holding slices is not
                        # treated as one-index-per-axis by current NumPy
                        act_sub_samp = act_sub_batch[tuple(sub_batch_slice)]
                        # the == 2 is to account for the case when
                        # ex_sub_samp are going to be np.generics (as
                        # opposed to arrays).
                        # sub_axis is the failure message; keeping it inside
                        # the parentheses would build an always-true tuple
                        assert (len(input_shapes) == 2) or (
                            ex_sub_samp.shape == act_sub_samp.shape
                        ), sub_axis
                        try:
                            assert np.allclose(ex_sub_samp, act_sub_samp)
                        except TypeError:
                            assert (ex_sub_samp.flatten().tolist() ==
                                    act_sub_samp.flatten().tolist())
                batch_start += len(ex_batch)
            assert batch_start == len(samples)
def test_padding():
    """Check how ``corpus.batch_data`` pads ragged samples within a batch."""
    samples = [([1], [2], [3]), ([4, 5], [6, 7]), ([8], [9])]

    def _cast_batches(batch_size, pad_mode):
        # collect every batch, always passing cast_to_array=np.int32
        return tuple(
            corpus.batch_data(
                samples,
                subsamples=False,
                batch_size=batch_size,
                pad_mode=pad_mode,
                cast_to_array=np.int32,
            ))

    # batch size 1: three batches, each equal to its sample
    l1_batches = _cast_batches(1, 'maximum')
    assert len(l1_batches) == 3
    expected_l1 = ([[1], [2], [3]], [[4, 5], [6, 7]], [[8], [9]])
    for actual, expected in zip(l1_batches, expected_l1):
        assert np.allclose(actual, expected)

    # batch size 2 with 'maximum' padding
    l2_batches = _cast_batches(2, 'maximum')
    assert len(l2_batches) == 2
    first_pair = [
        [[1, 1], [2, 2], [3, 3]],
        [[4, 5], [6, 7], [6, 7]],
    ]
    assert np.allclose(l2_batches[0], first_pair)
    assert np.allclose(l2_batches[1], [[8], [9]])

    # batch size 3 with 'wrap' padding: everything lands in one batch
    l3_batches = _cast_batches(3, 'wrap')
    assert len(l3_batches) == 1
    wrapped = [
        [[1, 1], [2, 2], [3, 3]],
        [[4, 5], [6, 7], [4, 5]],
        [[8, 8], [9, 9], [8, 8]],
    ]
    assert np.allclose(l3_batches[0], wrapped)

    # if we do not set cast_to_array, no padding should occur
    no_cast_batches = tuple(
        corpus.batch_data(
            samples, subsamples=False, batch_size=3, pad_mode='wrap'))
    assert len(no_cast_batches) == 1
    assert no_cast_batches[0] == samples
def test_padding():
    """Verify ``pad_mode`` handling of ``corpus.batch_data`` on ragged data."""
    samples = [([1], [2], [3]), ([4, 5], [6, 7]), ([8], [9])]
    # keyword arguments shared by every call that casts its output
    cast_kwargs = dict(subsamples=False, cast_to_array=np.int32)

    # one sample per batch
    l1_batches = tuple(corpus.batch_data(
        samples, batch_size=1, pad_mode='maximum', **cast_kwargs))
    assert len(l1_batches) == 3
    first, second, third = l1_batches
    assert np.allclose(first, [[1], [2], [3]])
    assert np.allclose(second, [[4, 5], [6, 7]])
    assert np.allclose(third, [[8], [9]])

    # two samples per batch, 'maximum' padding
    l2_batches = tuple(corpus.batch_data(
        samples, batch_size=2, pad_mode='maximum', **cast_kwargs))
    assert len(l2_batches) == 2
    expected_max = [
        [[1, 1], [2, 2], [3, 3]],
        [[4, 5], [6, 7], [6, 7]],
    ]
    assert np.allclose(l2_batches[0], expected_max)
    assert np.allclose(l2_batches[1], [[8], [9]])

    # all three samples in one batch, 'wrap' padding
    l3_batches = tuple(corpus.batch_data(
        samples, batch_size=3, pad_mode='wrap', **cast_kwargs))
    assert len(l3_batches) == 1
    expected_wrap = [
        [[1, 1], [2, 2], [3, 3]],
        [[4, 5], [6, 7], [4, 5]],
        [[8, 8], [9, 9], [8, 8]],
    ]
    assert np.allclose(l3_batches[0], expected_wrap)

    # if we do not set cast_to_array, no padding should occur
    no_cast_batches = tuple(corpus.batch_data(
        samples, subsamples=False, batch_size=3, pad_mode='wrap'))
    assert len(no_cast_batches) == 1
    assert no_cast_batches[0] == samples