Example #1
0
def test_process_train_set():
    """Check that ``process_train_set`` records every training image.

    Fake training data comes from ``create_fake_tar_of_tars`` and a fake
    patch archive is built for ten of the image filenames. After running
    ``process_train_set`` against a mock HDF5 file, only the stored
    filenames and the dataset lengths are verified.
    """
    seed = 20150925
    tar_data, names, jpeg_names = create_fake_tar_of_tars(
        seed, 5, min_num_images=45, max_num_images=55)

    # Flatten the per-archive filename lists, then shuffle with a fixed
    # seed so the ten "patched" files are chosen reproducibly.
    all_jpegs = numpy.array([jpeg for group in jpeg_names for jpeg in group])
    numpy.random.RandomState(seed).shuffle(all_jpegs)
    num_images = len(all_jpegs)
    patches_data = create_fake_patch_images(filenames=all_jpegs[:10],
                                            num_train=10, num_valid=0,
                                            num_test=0)

    hdf5_file = MockH5PYFile()
    prepare_hdf5_file(hdf5_file, num_images, 0, 0)

    # Map each archive name (text before the first '.') to its position.
    wnid_map = {name.split('.')[0]: index
                for index, name in enumerate(names)}

    process_train_set(hdf5_file, io.BytesIO(tar_data),
                      io.BytesIO(patches_data), num_images, wnid_map)

    # Other tests cover that the actual images are what they should be.
    # Just do a basic verification of the filenames and targets.
    stored_filenames = set(raw.decode('ascii')
                           for raw in hdf5_file['filenames'][:, 0])
    assert set(all_jpegs) == stored_filenames
    assert len(hdf5_file['encoded_images'][:]) == num_images
    assert len(hdf5_file['targets'][:]) == num_images
Example #2
0
def test_image_consumer():
    """Check that ``image_consumer`` writes received messages in order.

    A mock PULL socket replays ``MOCK_CONSUMER_MESSAGES``; the first four
    rows of each dataset in the mock HDF5 file are then compared against
    the expected payloads, filenames and targets.
    """
    hdf5_file = MockH5PYFile()
    prepare_hdf5_file(hdf5_file, 4, 5, 8)
    pull_socket = MockSocket(zmq.PULL, to_recv=MOCK_CONSUMER_MESSAGES)
    image_consumer(pull_socket, hdf5_file, 4)

    # Expected encoded payload for rows 0..3, in write order.
    expected_images = [
        [6, 6, 6],
        [1, 8, 1, 2, 0],
        [1, 9, 7, 9],
        [1, 8, 6, 7],
    ]
    for row, payload in enumerate(expected_images):
        assert_equal(hdf5_file['encoded_images'][row], payload)

    expected_filenames = [[b'foo.jpeg'], [b'bar.jpeg'],
                          [b'baz.jpeg'], [b'bur.jpeg']]
    assert_equal(hdf5_file['filenames'][:4], expected_filenames)

    expected_targets = [[2], [3], [5], [7]]
    assert_equal(hdf5_file['targets'][:4], expected_targets)
Example #3
0
def test_prepare_hdf5_file():
    """Check the split metadata and datasets made by ``prepare_hdf5_file``.

    With 10 train, 5 valid and 2 test examples requested, every source
    must span rows 0-10, 10-15 and 15-17 respectively, and each of the
    three datasets must hold 17 rows with the expected rank and dtype.
    """
    from numpy import dtype

    hdf5_file = MockH5PYFile()
    prepare_hdf5_file(hdf5_file, 10, 5, 2)

    def get_start_stop(hdf5_file, split):
        """Return ``{source: (start, stop)}`` for rows of the given split."""
        return {row['source'].decode('utf8'): (row['start'], row['stop'])
                for row in hdf5_file.attrs['split']
                if row['split'].decode('utf8') == split}

    # Every split must list exactly these sources, each with the same
    # (start, stop) boundaries.
    expected_sources = {u'encoded_images', u'targets', u'filenames'}
    expected_boundaries = [
        ('train', (0, 10)),
        ('valid', (10, 15)),
        ('test', (15, 17)),
    ]
    for split_name, boundaries in expected_boundaries:
        found = get_start_stop(hdf5_file, split_name)
        assert all(span == boundaries for span in found.values())
        assert set(found.keys()) == expected_sources

    # Verify properties of the encoded_images HDF5 dataset.
    encoded_images = hdf5_file['encoded_images']
    assert encoded_images.shape[0] == 17
    assert len(encoded_images.shape) == 1
    assert encoded_images.dtype.kind == 'O'
    assert encoded_images.dtype.metadata['vlen'] == dtype('uint8')

    # Verify properties of the filenames dataset.
    filenames = hdf5_file['filenames']
    assert filenames.shape[0] == 17
    assert len(filenames.shape) == 2
    assert filenames.dtype == dtype('S32')

    # Verify properties of the targets dataset.
    targets = hdf5_file['targets']
    assert targets.shape[0] == 17
    assert targets.shape[1] == 1
    assert len(targets.shape) == 2
    assert targets.dtype == dtype('int16')
Example #4
0
def test_images_consumer_randomized():
    """Check ``image_consumer`` when called with an offset and shuffle seed.

    One extra (metadata, payload) message pair is appended to
    ``MOCK_CONSUMER_MESSAGES`` so that five records are consumed and
    written starting at row 4. Contents are compared as sets, so the
    order in which the records land in rows 4-8 is not checked.
    """
    # ``numpy.cast`` was removed in NumPy 2.0; build the uint8 payload
    # with an explicit dtype instead, which works on every NumPy version.
    extra_payload = numpy.array([8, 6, 7, 5, 3, 0, 9], dtype='uint8')
    mock_messages = MOCK_CONSUMER_MESSAGES + [
        {'type': 'recv_pyobj', 'flags': zmq.SNDMORE, 'obj': ('jenny.jpeg', 1)},
        {'type': 'recv', 'flags': 0, 'data': extra_payload},
    ]
    hdf5_file = MockH5PYFile()
    prepare_hdf5_file(hdf5_file, 4, 5, 8)
    socket = MockSocket(zmq.PULL, to_recv=mock_messages)
    image_consumer(socket, hdf5_file, 5, offset=4, shuffle_seed=0)

    # NOTE(review): the slicing below assumes MOCK_CONSUMER_MESSAGES
    # alternates metadata ('obj') and payload ('data') messages, like the
    # pair appended above -- confirm against its definition.
    metadata_messages = mock_messages[::2]
    payload_messages = mock_messages[1::2]

    written_data = {tuple(s) for s in hdf5_file['encoded_images'][4:9]}
    expected_data = {tuple(s['data']) for s in payload_messages}
    assert written_data == expected_data

    written_targets = set(hdf5_file['targets'][4:9].flatten())
    expected_targets = {s['obj'][1] for s in metadata_messages}
    assert written_targets == expected_targets

    written_filenames = set(hdf5_file['filenames'][4:9].flatten())
    expected_filenames = {s['obj'][0].encode('ascii')
                          for s in metadata_messages}
    assert written_filenames == expected_filenames
Example #5
0
def test_process_other_set():
    """Check that ``process_other_set`` fills the 'valid' split correctly.

    Fake JPEG data comes from ``create_fake_jpeg_tar`` and a fake patch
    archive is built for fifteen of the filenames. The images are written
    after an offset of 50 rows; the stored targets and filenames from that
    offset onward are then compared with the ground truth and the
    original filename order.
    """
    images, all_filenames = create_fake_jpeg_tar(
        3, min_num_images=30, max_num_images=40, gzip_probability=0.0)
    num_images = len(all_filenames)

    # Pick fifteen files to patch, using a fixed seed for reproducibility.
    shuffled_filenames = numpy.array(all_filenames)
    numpy.random.RandomState(20151202).shuffle(shuffled_filenames)
    patches_data = create_fake_patch_images(
        filenames=shuffled_filenames[:15],
        num_train=0, num_valid=15, num_test=0)

    hdf5_file = MockH5PYFile()
    offset = 50
    prepare_hdf5_file(hdf5_file, offset, num_images, 0)
    groundtruth = [index % 10 for index in range(num_images)]
    process_other_set(hdf5_file, 'valid', io.BytesIO(images),
                      io.BytesIO(patches_data), groundtruth, offset)

    # Other tests cover that the actual images are what they should be.
    # Just do a basic verification of the filenames.
    stored_targets = hdf5_file['targets'][offset:, 0]
    assert all(stored_targets == groundtruth)

    stored_filenames = hdf5_file['filenames'][offset:, 0]
    for raw_name, expected_name in zip(stored_filenames, all_filenames):
        assert raw_name.decode('ascii') == expected_name