def test_process_train_set():
    """Check that ``process_train_set`` fills in every training example.

    A fake TAR-of-TARs archive and a patch-image archive (for ten files
    picked by a seeded shuffle) are fed to ``process_train_set`` together
    with a mock HDF5 file. Only filenames and dataset lengths are checked
    here.
    """
    seed = 20150925
    tar_data, names, jpeg_names = create_fake_tar_of_tars(
        seed, 5, min_num_images=45, max_num_images=55)

    # Flatten the per-archive filename lists, then shuffle in place so the
    # first ten entries form the set of patched files.
    all_jpegs = numpy.array(sum(jpeg_names, []))
    num_images = len(all_jpegs)
    numpy.random.RandomState(seed).shuffle(all_jpegs)
    patches_data = create_fake_patch_images(filenames=all_jpegs[:10],
                                            num_train=10, num_valid=0,
                                            num_test=0)

    hdf5_file = MockH5PYFile()
    prepare_hdf5_file(hdf5_file, num_images, 0, 0)

    # Map each archive name (with everything after the first dot dropped)
    # to its position in ``names``.
    wnid_map = {name.split('.')[0]: index
                for index, name in enumerate(names)}

    process_train_set(hdf5_file, io.BytesIO(tar_data),
                      io.BytesIO(patches_data), num_images, wnid_map)

    # Other tests cover that the actual images are what they should be.
    # Just do a basic verification of the filenames and targets.
    stored_names = {raw.decode('ascii')
                    for raw in hdf5_file['filenames'][:, 0]}
    assert set(all_jpegs) == stored_names
    assert len(hdf5_file['encoded_images'][:]) == num_images
    assert len(hdf5_file['targets'][:]) == num_images
def test_image_consumer():
    """Check that ``image_consumer`` stores the mock messages in order.

    The consumer is run for four examples against a mock PULL socket that
    replays ``MOCK_CONSUMER_MESSAGES``; the first four rows of each
    dataset are then compared with the expected values.
    """
    hdf5_file = MockH5PYFile()
    prepare_hdf5_file(hdf5_file, 4, 5, 8)
    socket = MockSocket(zmq.PULL, to_recv=MOCK_CONSUMER_MESSAGES)
    image_consumer(socket, hdf5_file, 4)

    # Expected encoded image payload for each of the first four rows.
    expected_images = [
        [6, 6, 6],
        [1, 8, 1, 2, 0],
        [1, 9, 7, 9],
        [1, 8, 6, 7],
    ]
    for row, payload in enumerate(expected_images):
        assert_equal(hdf5_file['encoded_images'][row], payload)

    expected_filenames = [[b'foo.jpeg'], [b'bar.jpeg'],
                          [b'baz.jpeg'], [b'bur.jpeg']]
    assert_equal(hdf5_file['filenames'][:4], expected_filenames)

    expected_targets = [[2], [3], [5], [7]]
    assert_equal(hdf5_file['targets'][:4], expected_targets)
def test_prepare_hdf5_file():
    """Check the split attribute and datasets made by ``prepare_hdf5_file``.

    With sizes 10/5/2 the train, valid and test splits must cover rows
    [0, 10), [10, 15) and [15, 17) for every source, and the three
    datasets must have the expected shapes and dtypes.
    """
    from numpy import dtype

    hdf5_file = MockH5PYFile()
    prepare_hdf5_file(hdf5_file, 10, 5, 2)

    def get_start_stop(hdf5_file, split):
        """Map each source name to its (start, stop) pair for ``split``."""
        return {
            row['source'].decode('utf8'): (row['start'], row['stop'])
            for row in hdf5_file.attrs['split']
            if row['split'].decode('utf8') == split
        }

    # Every split must list exactly these sources, all with the same range.
    expected_sources = {u'encoded_images', u'targets', u'filenames'}
    expected_ranges = (
        ('train', (0, 10)),
        ('valid', (10, 15)),
        ('test', (15, 17)),
    )
    for split_name, bounds in expected_ranges:
        ranges = get_start_stop(hdf5_file, split_name)
        assert all(pair == bounds for pair in ranges.values())
        assert set(ranges.keys()) == expected_sources

    # Verify properties of the encoded_images HDF5 dataset.
    images = hdf5_file['encoded_images']
    assert images.shape[0] == 17
    assert len(images.shape) == 1
    assert images.dtype.kind == 'O'
    assert images.dtype.metadata['vlen'] == dtype('uint8')

    # Verify properties of the filenames dataset.
    filenames = hdf5_file['filenames']
    assert filenames.shape[0] == 17
    assert len(filenames.shape) == 2
    assert filenames.dtype == dtype('S32')

    # Verify properties of the targets dataset.
    targets = hdf5_file['targets']
    assert targets.shape[0] == 17
    assert targets.shape[1] == 1
    assert len(targets.shape) == 2
    assert targets.dtype == dtype('int16')
def test_images_consumer_randomized():
    """Check ``image_consumer`` with shuffling enabled.

    One extra (filename, target) / image message pair is appended to
    ``MOCK_CONSUMER_MESSAGES`` and the consumer is run for five examples
    at offset 4 with ``shuffle_seed=0``. The test does not assert any row
    order: it only compares the *sets* of written images, targets and
    filenames in rows 4..8 with what was sent.
    """
    # ``numpy.cast`` was removed in NumPy 2.0; ``numpy.asarray`` with an
    # explicit dtype builds the same uint8 array on old and new versions.
    extra_image = numpy.asarray([8, 6, 7, 5, 3, 0, 9], dtype='uint8')
    mock_messages = MOCK_CONSUMER_MESSAGES + [
        {'type': 'recv_pyobj', 'flags': zmq.SNDMORE,
         'obj': ('jenny.jpeg', 1)},
        {'type': 'recv', 'flags': 0, 'data': extra_image},
    ]

    hdf5_file = MockH5PYFile()
    prepare_hdf5_file(hdf5_file, 4, 5, 8)
    socket = MockSocket(zmq.PULL, to_recv=mock_messages)
    image_consumer(socket, hdf5_file, 5, offset=4, shuffle_seed=0)

    # Messages alternate: even indices carry (filename, target) objects,
    # odd indices carry the raw image data.
    metadata_messages = mock_messages[::2]
    image_messages = mock_messages[1::2]

    written_data = {tuple(s) for s in hdf5_file['encoded_images'][4:9]}
    expected_data = {tuple(s['data']) for s in image_messages}
    assert written_data == expected_data

    written_targets = set(hdf5_file['targets'][4:9].flatten())
    expected_targets = {s['obj'][1] for s in metadata_messages}
    assert written_targets == expected_targets

    written_filenames = set(hdf5_file['filenames'][4:9].flatten())
    expected_filenames = {s['obj'][0].encode('ascii')
                          for s in metadata_messages}
    assert written_filenames == expected_filenames
def test_process_other_set():
    """Check that ``process_other_set`` writes the validation rows.

    A fake JPEG archive and a patch-image archive (for fifteen files
    picked by a seeded shuffle) are processed as the 'valid' split,
    starting at row 50 of a mock HDF5 file. Targets and filenames from
    that offset on are compared with the ground truth and the archive's
    filename order.
    """
    images, all_filenames = create_fake_jpeg_tar(3, min_num_images=30,
                                                 max_num_images=40,
                                                 gzip_probability=0.0)
    num_images = len(all_filenames)

    # Shuffle a copy so ``all_filenames`` keeps its original order for the
    # final comparison.
    shuffled_names = numpy.array(all_filenames)
    numpy.random.RandomState(20151202).shuffle(shuffled_names)
    patches_data = create_fake_patch_images(filenames=shuffled_names[:15],
                                            num_train=0, num_valid=15,
                                            num_test=0)

    hdf5_file = MockH5PYFile()
    offset = 50
    prepare_hdf5_file(hdf5_file, offset, num_images, 0)
    groundtruth = [index % 10 for index in range(num_images)]
    process_other_set(hdf5_file, 'valid', io.BytesIO(images),
                      io.BytesIO(patches_data), groundtruth, offset)

    # Other tests cover that the actual images are what they should be.
    # Just do a basic verification of the targets and filenames.
    stored_targets = hdf5_file['targets'][offset:, 0]
    assert all(stored_targets == groundtruth)

    stored_names = hdf5_file['filenames'][offset:, 0]
    assert all(raw.decode('ascii') == name
               for raw, name in zip(stored_names, all_filenames))