Example 1
def create_mb_source(image_height, image_width, num_channels, map_file):
    transforms = [
        ImageDeserializer.scale(width=image_width,
                                height=image_height,
                                channels=num_channels,
                                interpolations='linear')
    ]
    return MinibatchSource(
        ImageDeserializer(
            map_file,
            StreamDefs(
                features=StreamDef(
                    field='image', transforms=transforms
                ),  # first column in map file is referred to as 'image'
                labels=StreamDef(field='label', shape=1000))
        ),  # and second as 'label'. TODO: add option to ignore labels
        randomize=False)
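As a usage sketch (the map-file name and contents below are hypothetical): ImageDeserializer reads a tab-separated map file with one image path and one label per line, and the returned source is drained with next_minibatch.

# Hypothetical map file 'train_map.txt', tab-separated: <image path> <label>
#   images/cat01.jpg    0
#   images/dog01.jpg    1
source = create_mb_source(224, 224, 3, 'train_map.txt')
mb = source.next_minibatch(64)              # request up to 64 samples
features = mb[source.streams.features]      # stream names come from StreamDefs
print(features.num_samples, features.shape)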
Example 2
def create_mb_source(map_file,
                     image_width,
                     image_height,
                     num_channels,
                     num_classes,
                     randomize=True):
    transforms = [
        xforms.scale(width=image_width,
                     height=image_height,
                     channels=num_channels,
                     interpolations='linear')
    ]
    return MinibatchSource(ImageDeserializer(
        map_file,
        StreamDefs(features=StreamDef(field='image', transforms=transforms),
                   labels=StreamDef(field='label', shape=num_classes))),
                           randomize=randomize)
Example 3
def test_full_sweep_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_1)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features  = StreamDef(field='S0', shape=1),
        labels    = StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0, max_sweeps=1)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)

    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert features.end_of_sweep
    assert len(features.as_sequences()) == 2
    expected_features = \
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]]
        ]

    for res, exp in zip(features.as_sequences(), expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.data.mask,
            [[2, 1, 1, 1],
             [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert labels.end_of_sweep
    assert len(labels.as_sequences()) == 2
    expected_labels = \
            [
                [[0],[1],[3]],
                [[1],[2]]
            ]
    for res, exp in zip(labels.as_sequences(), expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.data.mask,
            [[2, 1, 1],
             [2, 1, 0]])
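For reference, the mask encodes the sequence layout of the batch: each row is one sequence, 2 marks the first sample of a sequence, 1 a valid continuation sample, and 0 padding up to the longest sequence. A CTF payload consistent with the assertions above (the real MBDATA_DENSE_1 lives elsewhere in the test module, so this reconstruction is an assumption) would be:

MBDATA_DENSE_1 = r'''0  |S0 0   |S1 0
0   |S0 1   |S1 1
0   |S0 2
0   |S0 3   |S1 3
1   |S0 4
1   |S0 5   |S1 1
1   |S0 6   |S1 2
'''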
Example 4
def create_mb(map_file, params, training_set):
    transforms = []
    image_dimensions = params['image_dimensions']
    num_classes = params['num_classes']
    if training_set:
        # Scale to square-sized image. Without this, the cropping transform would chop the larger
        # dimension of an image to make it square, and then take 0.9 crops from within the square image.
        transforms += [
            xforms.scale(width=2 * image_dimensions[0],
                         height=2 * image_dimensions[1],
                         channels=image_dimensions[2],
                         scale_mode='pad',
                         pad_value=114)
        ]
        transforms += [
            xforms.crop(crop_type='randomside',
                        side_ratio=0.9,
                        jitter_type='uniratio')
        ]  # Randomly crop square area
        # randomside enables horizontal flipping
        # new_dim = side_ratio * min(old_w, old_h), e.g. 0.9 * 224 = 201.6
        #transforms += [xforms.crop(crop_type='center')]
        transforms += [
            xforms.color(brightness_radius=0.2,
                         contrast_radius=0.2,
                         saturation_radius=0.2)
        ]

    else:
        transforms += [xforms.crop(crop_type='center',
                                   side_ratio=0.875)]  # test has no jitter
    # Scale down and pad
    transforms += [
        xforms.scale(width=image_dimensions[0],
                     height=image_dimensions[1],
                     channels=image_dimensions[2],
                     scale_mode='pad',
                     pad_value=114)
    ]

    return MinibatchSource(ImageDeserializer(
        map_file,
        StreamDefs(features=StreamDef(field='image', transforms=transforms),
                   labels=StreamDef(field='label', shape=num_classes))),
                           randomize=training_set,
                           multithreaded_deserializer=True)
Example 5
def create_reader(path,
                  randomize,
                  input_vocab_dim,
                  label_vocab_dim,
                  size=INFINITELY_REPEAT):
    if not os.path.exists(path):
        raise RuntimeError("File '%s' does not exist." % (path))

    return MinibatchSource(CTFDeserializer(
        path,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_vocab_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=label_vocab_dim,
                                    is_sparse=True))),
                           randomize=randomize,
                           max_samples=size)
Example 6
def create_mb_source(map_file, image_width, image_height, num_channels, num_classes, boTrain):
    transforms = []
    if boTrain:
        # Scale to square-sized image. Without this, the cropping transform would chop the larger
        # dimension of an image to make it square, and then take 0.9 crops from within the square image.
        transforms += [xforms.scale(width=2*image_width, height=2*image_height, channels=num_channels,
                                    interpolations='linear', scale_mode='pad', pad_value=114)]
        transforms += [xforms.crop(crop_type='randomside', side_ratio=0.9, jitter_type='uniratio')]     # Randomly crop square area
    transforms += [xforms.scale(width=image_width, height=image_height, channels=num_channels,          # Scale down and pad
                                interpolations='linear', scale_mode='pad', pad_value=114)]
    if boTrain:
        transforms += [xforms.color(brightness_radius=0.2, contrast_radius=0.2, saturation_radius=0.2)]

    return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
            features  = StreamDef(field='image', transforms=transforms),
            labels    = StreamDef(field='label', shape=num_classes))),
            randomize = boTrain,
            multithreaded_deserializer=True)
Example 7
def create_reader(map_file, mean_file, train, image_height=800, image_width=150, num_channels=3, num_classes=32):
  
    # transformation pipeline for the features has crop only when training

    trs = []
    if train:
        trs += [
            transforms.crop(crop_type='center', aspect_ratio=0.1875, side_ratio=0.95, jitter_type='uniratio') # Horizontal flip enabled
        ]
    trs += [
        transforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
#        transforms.mean(mean_file)
    ]
    # deserializer
    image_source = ImageDeserializer(map_file, StreamDefs(
        features = StreamDef(field='image', transforms=trs), # first column in map file is referred to as 'image'
        labels   = StreamDef(field='label', shape=num_classes)      # and second as 'label'
    ))
    return MinibatchSource(image_source)
Example 8
def create_image_mb_source(map_file, mean_file, train,
                           total_number_of_samples):
    """ Creates minibatch source
    """
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        raise RuntimeError("File '%s' or '%s' does not exist. " %
                           (map_file, mean_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if train:
        imgfolder = os.path.join(os.path.split(map_file)[0], 'train')
        transforms += [
            xforms.crop(crop_type='randomside',
                        side_ratio=0.8,
                        jitter_type='uniratio')  # train uses jitter
        ]
    else:
        imgfolder = os.path.join(os.path.split(map_file)[0], 'test')

    transforms += [
        xforms.scale(width=_IMAGE_WIDTH,
                     height=_IMAGE_HEIGHT,
                     channels=_NUM_CHANNELS,
                     interpolations='linear'),
        xforms.mean(mean_file)
    ]

    map_file = process_map_file(map_file, imgfolder)

    # deserializer
    return MinibatchSource(
        ImageDeserializer(
            map_file,
            StreamDefs(
                features=StreamDef(field='image', transforms=transforms),
                # first column in map file is referred to as 'image'
                labels=StreamDef(
                    field='label',
                    shape=_NUM_CLASSES))),  # and second as 'label'
        randomize=train,
        max_samples=total_number_of_samples,
        multithreaded_deserializer=True)
Example 9
def create_reader(map_file, mean_file, train):
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        raise RuntimeError("File '%s' or '%s' does not exist. Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them" %
                           (map_file, mean_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if train:
        transforms += [
            ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
        ]
    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
        ImageDeserializer.mean(mean_file)
    ]
    # deserializer
    return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
        features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
        labels   = StreamDef(field='label', shape=num_classes))))   # and second as 'label'
Example 10
def create_mb_source(image_height, image_width, num_channels, map_file,
                     mean_file, is_training):
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." % (map_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            xforms.crop(crop_type='randomside',
                        side_ratio=0.875,
                        jitter_type='uniratio')  # train uses jitter
        ]
    else:
        transforms += [
            xforms.crop(crop_type='center',
                        side_ratio=0.875)  # test has no jitter
        ]

    transforms += [
        xforms.scale(width=image_width,
                     height=image_height,
                     channels=num_channels,
                     interpolations='linear'),
    ]

    if mean_file != '':
        transforms += [
            xforms.mean(mean_file),
        ]

    # deserializer
    return MinibatchSource(
        ImageDeserializer(
            map_file,
            StreamDefs(features=StreamDef(
                field='image', transforms=transforms
            )  # first column in map file is referred to as 'image'
                       )),
        randomize=is_training,
        multithreaded_deserializer=True,
        max_sweeps=1)
Example 11
def decode_model(use_gpu=True, gpu_id=0):
    # use GPU or CPU according to parameters
    try_set_default_device(gpu(gpu_id) if use_gpu else cpu())

    model_dnn = load_model("./model/speech_enhancement.model")
    features_file = "./test_normed.scp"
    feature_dim = 257
    test_reader = MinibatchSource(HTKFeatureDeserializer(StreamDefs(
            amazing_features=StreamDef(
                    shape=feature_dim, context=(3, 3),
                    scp=features_file))),
                                  randomize=False, frame_mode=False)
    eval_input_map = {input: test_reader.streams.amazing_features}

    f = open(features_file)
    line = f.readline()
    while line:
        temp_input_path = line.split(']')[0]
        mb_size = temp_input_path.split(',')[-1]
        mb_size = int(mb_size) + 1
        noisy_fea = test_reader.next_minibatch(
                mb_size, input_map=eval_input_map)
        real_noisy_fea = noisy_fea[input].data

        node_in_graph = model_dnn.find_by_name('irm')
        output_nodes = combine([node_in_graph.owner])
        out_noisy_fea = output_nodes.eval(real_noisy_fea)
        # out_noisy_fea = as_composite(model_dnn.output1[0].owner).eval(
        #         real_noisy_fea)

        out_SE_noisy_fea = np.concatenate(out_noisy_fea, axis=0)

        out_file_path = line.split('=')[0]
        out_file_name = os.path.join('./enhanced_norm_fea_mat', out_file_path)
        out_file_fullpath = os.path.split(out_file_name)[0]
        # print (out_file_fullpath)
        if not os.path.exists(out_file_fullpath):
            os.makedirs(out_file_fullpath)
        sio.savemat(out_file_name, {'SE': out_SE_noisy_fea})
        line = f.readline()

    f.close()
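The scp parsing above assumes HTK-style entries of the form name=path[first,last], an inclusive frame range starting at 0, so last + 1 is the number of frames to request for the utterance. A sketch with a made-up line:

line = 'utt001=feats/utt001.fea[0,233]\n'            # hypothetical scp entry
temp_input_path = line.split(']')[0]                 # 'utt001=feats/utt001.fea[0,233'
mb_size = int(temp_input_path.split(',')[-1]) + 1    # last frame index 233 -> 234 frames
assert mb_size == 234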
Example 12
def cbf_reader(path, is_training, max_samples):
    """
    Returns a MinibatchSource for data at the given path
    :param path: Path to a CBF file
    :param is_training: Set to true if reader is for training set, else false
    :param max_samples: Max no. of samples to read
    """
    deserializer = CBFDeserializer(
        path,
        StreamDefs(label=StreamDef(field='label',
                                   shape=num_classes,
                                   is_sparse=True),
                   pixels=StreamDef(field='pixels',
                                    shape=frame_height * frame_width *
                                    sequence_length,
                                    is_sparse=False)))

    return MinibatchSource(deserializer,
                           randomize=is_training,
                           max_samples=max_samples)
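cbf_reader relies on module-level constants; a minimal usage sketch with assumed values (every name and size below is hypothetical):

from cntk.io import INFINITELY_REPEAT

num_classes = 10
frame_height, frame_width, sequence_length = 64, 64, 20

train_source = cbf_reader('train.cbf', is_training=True,
                          max_samples=INFINITELY_REPEAT)
mb = train_source.next_minibatch(32)
pixels = mb[train_source.streams.pixels]    # stream names come from the StreamDefs above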
Example 13
def mb_source(tmpdir,
              fileprefix,
              max_samples=FULL_DATA_SWEEP,
              ctf=ctf_data,
              streams=['S0', 'S1']):
    ctf_file = str(tmpdir / (fileprefix + '2seqtest.txt'))
    with open(ctf_file, 'w') as f:
        f.write(ctf)

    mbs = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field=streams[0],
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field=streams[1],
                                    shape=input_dim,
                                    is_sparse=True))),
                          randomize=False,
                          max_samples=max_samples)
    return mbs
Example 14
def create_reader(map_file, mean_file, train, pixel_dimensions, classes,
                  total_number_of_samples):
    print(
        f"Reading map file: {map_file} with number of samples {total_number_of_samples}"
    )
    transforms = [
        xforms.scale(width=pixel_dimensions['width'],
                     height=pixel_dimensions['height'],
                     channels=pixel_dimensions['depth'],
                     interpolations='linear'),
        xforms.mean(mean_file)
    ]

    source = MinibatchSource(deserializers=ImageDeserializer(
        map_file,
        StreamDefs(features=StreamDef(field='image', transforms=transforms),
                   labels=StreamDef(field='label', shape=len(classes)))),
                             randomize=train,
                             max_samples=total_number_of_samples)
    return source
Example 15
def create_image_mb_source(map_file, mean_file, is_training,
                           total_number_of_samples):
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        raise RuntimeError("File '%s' or '%s' does not exist." %
                           (map_file, mean_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            xforms.crop(crop_type='randomside',
                        side_ratio=0.875,
                        jitter_type='uniratio')  # train uses jitter
        ]
    else:
        transforms += [
            xforms.crop(crop_type='center',
                        side_ratio=0.875)  # test has no jitter
        ]

    transforms += [
        xforms.scale(width=IMAGE_WIDTH,
                     height=IMAGE_HEIGHT,
                     channels=NUM_CHANNELS,
                     interpolations='linear'),
        xforms.mean(mean_file)
    ]

    # deserializer
    return MinibatchSource(
        ImageDeserializer(
            map_file,
            StreamDefs(
                features=StreamDef(
                    field='image', transforms=transforms
                ),  # first column in map file is referred to as 'image'
                labels=StreamDef(field='label',
                                 shape=NUM_CLASSES))),  # and second as 'label'
        randomize=is_training,
        max_samples=total_number_of_samples,
        multithreaded_deserializer=True)
Example 16
def create_reader(map_file, mean_file, train):
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        cifar_py3 = "" if sys.version_info.major < 3 else "_py3"
        raise RuntimeError("File '%s' or '%s' does not exist. Please run CifarDownload%s.py and CifarConverter%s.py from CIFAR-10 to fetch them" %
                           (map_file, mean_file, cifar_py3, cifar_py3))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if train:
        transforms += [
            ImageDeserializer.crop(crop_type='Random', ratio=0.8, jitter_type='uniRatio') # train uses jitter
        ]
    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
        ImageDeserializer.mean(mean_file)
    ]
    # deserializer
    return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
        features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
        labels   = StreamDef(field='label', shape=num_classes)      # and second as 'label'
    )))
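This example uses the older transform spelling (static methods on ImageDeserializer with crop_type='Random', ratio= and jitter_type='uniRatio'); later CNTK releases expose the same transforms through cntk.io.transforms with renamed arguments. An approximate modern equivalent of the crop above, as a sketch rather than a verified one-to-one mapping:

import cntk.io.transforms as xforms
xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio')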
Example 17
def test_multiple_streams_in_htk():
    feature_dim = 33
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"

    fd = HTKFeatureDeserializer(
        StreamDefs(amazing_features=StreamDef(shape=feature_dim,
                                              context=(context, context),
                                              scp=features_file),
                   amazing_features2=StreamDef(shape=feature_dim,
                                               context=(context, context),
                                               scp=features_file)))

    mbs = MinibatchSource([fd])
    mb = mbs.next_minibatch(1)
    assert (mb[mbs.streams.amazing_features].asarray() == mb[
        mbs.streams.amazing_features2].asarray()).all()
    os.chdir(abs_path)
Example 18
def create_reader(map_file, mean_file, train, dimensions, classes,
                  total_number_of_samples):
    print("Reading map file: {} with number of samples {}".format(
        map_file, total_number_of_samples))

    # transformation pipeline for the features: scale and mean subtraction only
    transforms = [
        xforms.scale(width=dimensions['width'],
                     height=dimensions['height'],
                     channels=dimensions['depth'],
                     interpolations='linear'),
        xforms.mean(mean_file)
    ]
    source = MinibatchSource(ImageDeserializer(
        map_file,
        StreamDefs(features=StreamDef(field='image', transforms=transforms),
                   labels=StreamDef(field='label', shape=len(classes)))),
                             randomize=train,
                             max_samples=total_number_of_samples)
    return source
Example 19
def create_reader(path, vocab_dim, entity_dim, randomize):
    """
  Create data reader for the model
  Args:
    path: The data path
    vocab_dim: The dimention of the vocabulary
    entity_dim: The dimention of entities
    randomize: Where to shuffle the data before feed into the trainer
  """
    return MinibatchSource(CTFDeserializer(
        path,
        StreamDefs(context=StreamDef(field='C',
                                     shape=vocab_dim,
                                     is_sparse=True),
                   query=StreamDef(field='Q', shape=vocab_dim, is_sparse=True),
                   entities=StreamDef(field='E', shape=1, is_sparse=False),
                   label=StreamDef(field='L', shape=1, is_sparse=False),
                   entity_ids=StreamDef(field='EID',
                                        shape=entity_dim,
                                        is_sparse=True))),
                           randomize=randomize)
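A sketch of how such a reader is typically bound to model inputs through an input_map (the input variables and dimensions below are hypothetical):

import cntk as C

reader = create_reader('training.ctf', vocab_dim=40000,
                       entity_dim=500, randomize=True)
context_input = C.sequence.input_variable(40000, is_sparse=True)
query_input = C.sequence.input_variable(40000, is_sparse=True)
input_map = {
    context_input: reader.streams.context,
    query_input: reader.streams.query,
}
mb = reader.next_minibatch(256, input_map=input_map)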
Example 20
def test_distributed_mb_source_again(tmpdir):
    import random
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs

    ctf_data = '''0  |S0 1   |S1 1
0   |S0 2   |S1 2
0   |S0 3
1   |S0 4
1   |S0 5   |S1 3
1   |S0 6   |S1 4
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    ctf = CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0', shape=1),
                   labels=StreamDef(field='S1', shape=1)))

    random.seed(1234)
    mb_sources = []
    for randomize in [True, False]:
        mb_sources.append(MinibatchSource(ctf, randomize=randomize))
        mb_sources.append(
            MinibatchSource(ctf,
                            randomize=randomize,
                            max_sweeps=random.randint(1, 10)))
        mb_sources.append(
            MinibatchSource(ctf,
                            randomize=randomize,
                            max_samples=random.randint(1, 30)))

    for i in range(20):
        for source in mb_sources:
            data = source.next_minibatch(minibatch_size_in_samples=5,
                                         num_data_partitions=2,
                                         partition_index=i % 2)
            features = source.streams['features']
            assert (len(data) == 0 or data[features].num_samples == 3)
Example 21
def test_prefetch_with_unpacking(tmpdir):
    data = r'''0  |S0 1 1 1 1   |S1 1000
1   |S0 2 2 2 2  |S1 100
2   |S0 3 3 3 3  |S1 100
3   |S0 1 1 1 1  |S1 10
4   |S0 2 2 2 2  |S1 1
5   |S0 3 3 3 3  |S1 2000
6   |S0 1 1 1 1  |S1 200
7   |S0 2 2 2 2  |S1 200
8   |S0 3 3 3 3  |S1 20
9   |S0 1 1 1 1  |S1 2
'''
    import time
    tmpfile = _write_data(tmpdir, data)

    input_dim = 4
    num_output_classes = 1

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=False),
        labels=StreamDef(field='S1', shape=num_output_classes, is_sparse=False)
    )), randomize=False, max_samples=FULL_DATA_SWEEP)

    input_map = { 'S0' : mb_source.streams.features, 'S1' : mb_source.streams.labels }
    empty = False
    mb_size = 3
    # On the last minibatch a resize will be triggered,
    # since 10 % 3 leaves 1 sample in the final minibatch
    while not empty:
        mb = mb_source.next_minibatch(mb_size, input_map=input_map)
        time.sleep(1) # make sure the prefetch kicks in
        if mb:
            # Force unpacking to check that we do 
            # not break prefetch 
            actual_size = mb['S0'].shape[0]
            assert (mb['S0'].asarray() == np.array([[[1, 1, 1, 1]],
                                                    [[2, 2, 2, 2]],
                                                    [[3, 3, 3, 3]]], dtype=np.float32)[0:actual_size]).all()
        else:
            empty = True
Example 22
def create_reader(map_file,
                  mean_file,
                  train,
                  total_data_size,
                  distributed_after=INFINITE_SAMPLES):
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        raise RuntimeError(
            "File '%s' or '%s' does not exist. Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them"
            % (map_file, mean_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if train:
        transforms += [
            ImageDeserializer.crop(crop_type='Random',
                                   ratio=0.8,
                                   jitter_type='uniRatio')  # train uses jitter
        ]
    transforms += [
        ImageDeserializer.scale(width=image_width,
                                height=image_height,
                                channels=num_channels,
                                interpolations='linear'),
        ImageDeserializer.mean(mean_file)
    ]
    # deserializer
    return MinibatchSource(
        ImageDeserializer(
            map_file,
            StreamDefs(
                features=StreamDef(
                    field='image', transforms=transforms
                ),  # first column in map file is referred to as 'image'
                labels=StreamDef(field='label',
                                 shape=num_classes))),  # and second as 'label'
        epoch_size=total_data_size,
        multithreaded_deserializer=False,  # turn off OMP as CIFAR-10 is not heavy for the deserializer
        distributed_after=distributed_after)
Example 23
def create_reader(map_file):
    transforms = [
        xforms.crop(crop_type='randomside',
                    side_ratio=0.85,
                    jitter_type='uniratio'),
        xforms.scale(width=image_width,
                     height=image_height,
                     channels=num_channels,
                     interpolations='linear'),
        xforms.color(brightness_radius=0.2,
                     contrast_radius=0.2,
                     saturation_radius=0.2)
    ]
    return MinibatchSource(
        ImageDeserializer(
            map_file,
            StreamDefs(features=StreamDef(field='image',
                                          transforms=transforms,
                                          is_sparse=False),
                       labels=StreamDef(field='label',
                                        shape=num_classes,
                                        is_sparse=False))))
Example 24
def create_image_mb_source(map_file, is_training, total_number_of_samples):
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." % map_file)

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            ImageDeserializer.crop(crop_type='randomside',
                                   side_ratio='0.4375:0.875',
                                   jitter_type='uniratio')  # train uses jitter
        ]
    else:
        transforms += [
            ImageDeserializer.crop(crop_type='center',
                                   side_ratio=0.5833333)  # test has no jitter
        ]

    transforms += [
        ImageDeserializer.scale(width=image_width,
                                height=image_height,
                                channels=num_channels,
                                interpolations='linear'),
    ]

    # deserializer
    return MinibatchSource(
        ImageDeserializer(
            map_file,
            StreamDefs(
                features=StreamDef(
                    field='image', transforms=transforms
                ),  # first column in map file is referred to as 'image'
                labels=StreamDef(field='label',
                                 shape=num_classes))),  # and second as 'label'
        randomize=is_training,
        max_samples=total_number_of_samples,
        multithreaded_deserializer=True)
Example 25
def test_minibatch_defined_by_labels(tmpdir):

    input_dim = 1000
    num_output_classes = 5

    def assert_data(mb_source):
        features_si = mb_source.stream_info('features')
        labels_si = mb_source.stream_info('labels')

        mb = mb_source.next_minibatch(2)

        features = mb[features_si]

        # 2 samples, max seq len 4, 1000 dim
        assert features.shape == (2, 4, input_dim)
        assert features.end_of_sweep
        assert features.num_sequences == 2
        assert features.num_samples == 7
        assert features.is_sparse

        labels = mb[labels_si]
        # 2 samples, max seq len 1, 5 dim
        assert labels.shape == (2, 1, num_output_classes)
        assert labels.end_of_sweep
        assert labels.num_sequences == 2
        assert labels.num_samples == 2
        assert not labels.is_sparse

        label_data = labels.asarray()
        assert np.allclose(
            label_data,
            np.asarray([[[1., 0., 0., 0., 0.]], [[0., 1., 0., 0., 0.]]]))

        mb = mb_source.next_minibatch(3)
        features = mb[features_si]
        labels = mb[labels_si]

        assert features.num_samples == 10
        assert labels.num_samples == 3

    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)
    mb_source = MinibatchSource(CTFDeserializer(
        tmpfile,
        StreamDefs(features=StreamDef(field='x',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='y',
                                    shape=num_output_classes,
                                    is_sparse=False,
                                    defines_mb_size=True))),
                                randomize=False)

    assert_data(mb_source)

    tmpfile1 = _write_data(tmpdir, MBDATA_SPARSE1, '1')
    tmpfile2 = _write_data(tmpdir, MBDATA_SPARSE2, '2')
    combined_mb_source = MinibatchSource([
        CTFDeserializer(
            tmpfile1,
            StreamDefs(features=StreamDef(
                field='x', shape=input_dim, is_sparse=True))),
        CTFDeserializer(
            tmpfile2,
            StreamDefs(labels=StreamDef(field='y',
                                        shape=num_output_classes,
                                        is_sparse=False,
                                        defines_mb_size=True)))
    ],
                                         randomize=False)

    assert_data(combined_mb_source)
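The mechanic under test is defines_mb_size=True on the labels stream: the count passed to next_minibatch is measured in samples of that stream alone, so next_minibatch(2) delivers exactly 2 label samples while the features stream carries all frames of the selected sequences (7 in the first batch above). Roughly:

mb = mb_source.next_minibatch(2)    # 2 samples of the defining stream (labels)
assert mb[mb_source.streams.labels].num_samples == 2
# features are not capped at 2; they follow the selected sequences
assert mb[mb_source.streams.features].num_samples == 7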
Example 26
def test_usermbsource(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)

    input_dim = 1000
    num_output_classes = 5

    # Setting up the native MB source as the ground truth
    n_mb_source = CTFDeserializer(
        tmpfile,
        StreamDefs(features=StreamDef(field='x',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='y',
                                    shape=num_output_classes,
                                    is_sparse=False)))
    n_mb_source = MinibatchSource(n_mb_source, randomize=False)
    n_features_si = n_mb_source['features']
    n_labels_si = n_mb_source['labels']

    n_mb = n_mb_source.next_minibatch(2)
    n_features = n_mb[n_features_si]
    n_labels = n_mb[n_labels_si]

    # Setting up the user MB source
    u_mb_source = MyDataSource(input_dim, num_output_classes)
    u_features_si = u_mb_source['features']
    u_labels_si = u_mb_source['labels']

    u_mb = u_mb_source.next_minibatch(2, 1, 0)
    u_features = u_mb[u_features_si]
    u_labels = u_mb[u_labels_si]

    assert u_features.shape == n_features.shape == (1, 3, 1000)
    assert u_features.end_of_sweep == n_features.end_of_sweep
    assert u_features.num_sequences == n_features.num_sequences
    assert u_features.num_samples == n_features.num_samples
    assert u_features.is_sparse == n_features.is_sparse

    assert u_labels.shape == n_labels.shape == (1, 1, 5)
    assert u_labels.end_of_sweep is n_labels.end_of_sweep is False
    assert u_labels.num_sequences == n_labels.num_sequences
    assert u_labels.num_samples == n_labels.num_samples
    assert u_labels.is_sparse is n_labels.is_sparse is False

    u_label_data = u_labels.asarray()
    n_label_data = n_labels.asarray()
    assert np.allclose(u_label_data, n_label_data)

    n_mb = n_mb_source.next_minibatch(10)
    n_features = n_mb[n_features_si]
    n_labels = n_mb[n_labels_si]

    u_mb = u_mb_source.next_minibatch(10, 1, 0)
    u_features = u_mb[u_features_si]
    u_labels = u_mb[u_labels_si]

    assert u_labels.shape == n_labels.shape
    u_label_data = u_labels.asarray()
    n_label_data = n_labels.asarray()

    assert np.allclose(u_label_data, n_label_data)

    assert u_features.end_of_sweep is u_labels.end_of_sweep is True
    assert u_features.num_samples == n_features.num_samples
    assert u_features.num_sequences == n_features.num_sequences
Example 27
def create_config(tmpdir):
    tmpfile = create_temp_file(tmpdir)
    return MinibatchSourceConfig() \
        .add_deserializer(
            CTFDeserializer(tmpfile,
                StreamDefs(features=StreamDef(field='S0', shape=1))))
Example 28
def create_ctf_deserializer(tmpdir):
    tmpfile = create_temp_file(tmpdir)
    return CTFDeserializer(tmpfile,
                           StreamDefs(features=StreamDef(field='S0', shape=1)))
Example 29
def test_base64_image_deserializer(tmpdir):
    import io, base64, uuid
    from PIL import Image
    images, b64_images = [], []

    np.random.seed(1)
    for i in range(10):
        data = np.random.randint(0, 2**8, (5, 7, 3))
        image = Image.fromarray(data.astype('uint8'), "RGB")
        buf = io.BytesIO()
        image.save(buf, format='PNG')
        assert image.width == 7 and image.height == 5
        b64_images.append(base64.b64encode(buf.getvalue()))
        images.append(np.array(image))

    image_data = str(tmpdir / 'mbdata1.txt')
    seq_ids = []
    uid = uuid.uuid1().int >> 64
    with open(image_data, 'wb') as f:
        for i, data in enumerate(b64_images):
            seq_id = uid ^ i
            seq_id = str(seq_id).encode('ascii')
            seq_ids.append(seq_id)
            line = seq_id + b'\t'
            label = str(i).encode('ascii')
            line += label + b'\t' + data + b'\n'
            f.write(line)

    ctf_data = str(tmpdir / 'mbdata2.txt')
    with open(ctf_data, 'wb') as f:
        for i, sid in enumerate(seq_ids):
            line = sid + b'\t' + b'|index ' + str(i).encode('ascii') + b'\n'
            f.write(line)

    transforms = [xforms.scale(width=7, height=5, channels=3)]
    b64_deserializer = Base64ImageDeserializer(
        image_data,
        StreamDefs(images=StreamDef(field='image', transforms=transforms),
                   labels=StreamDef(field='label', shape=10)))

    ctf_deserializer = CTFDeserializer(
        ctf_data, StreamDefs(index=StreamDef(field='index', shape=1)))

    mb_source = MinibatchSource([ctf_deserializer, b64_deserializer])
    assert isinstance(mb_source, MinibatchSource)

    for j in range(100):
        mb = mb_source.next_minibatch(10)

        index_stream = mb_source.streams['index']
        index = mb[index_stream].asarray().flatten()
        image_stream = mb_source.streams['images']

        results = mb[image_stream].asarray()

        for i in range(10):
            # original images are RGB, OpenCV produces BGR images,
            # reverse the last dimension of the original images
            bgrImage = images[int(index[i])][:, :, ::-1]
            # transposing to get CHW representation
            bgrImage = np.transpose(bgrImage, (2, 0, 1))
            assert (bgrImage == results[i][0]).all()
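What makes this join work is that both files key their records by the same sequence ids (the first tab-separated field), so the MinibatchSource can align the base64 image stream with the CTF index stream per sequence. The two line formats written above, schematically:

# mbdata1.txt (Base64ImageDeserializer): <seq_id>\t<label>\t<base64-encoded PNG>
# mbdata2.txt (CTFDeserializer):        <seq_id>\t|index <i>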
Example 30
def test_image(tmpdir):
    map_file = "input.txt"
    mean_file = "mean.txt"

    feature_name = "f"
    image_width = 100
    image_height = 200
    num_channels = 3

    label_name = "l"
    num_classes = 7

    transforms = [
        xforms.crop(crop_type='randomside',
                    side_ratio=0.5,
                    jitter_type='uniratio'),
        xforms.scale(width=image_width,
                     height=image_height,
                     channels=num_channels,
                     interpolations='linear'),
        xforms.mean(mean_file)
    ]
    defs = StreamDefs(f=StreamDef(field='image', transforms=transforms),
                      l=StreamDef(field='label', shape=num_classes))
    image = ImageDeserializer(map_file, defs)

    config = to_dictionary(MinibatchSourceConfig([image], randomize=False))

    # Multithreading should be on by default for the ImageDeserializer.
    assert config['multiThreadedDeserialization'] is True
    assert len(config['deserializers']) == 1

    d = config['deserializers'][0]
    assert d['type'] == 'ImageDeserializer'
    assert d['file'] == map_file
    assert set(d['input'].keys()) == {label_name, feature_name}

    l = d['input'][label_name]
    assert l['labelDim'] == num_classes

    f = d['input'][feature_name]
    assert set(f.keys()) == {'transforms'}
    t0, t1, t2, _ = f['transforms']
    assert t0['type'] == 'Crop'
    assert t1['type'] == 'Scale'
    assert t2['type'] == 'Mean'
    assert t0['cropType'] == 'randomside'
    assert t0['cropSize'] == '0:0'
    assert t0['sideRatio'] == '0.5:0.5'
    assert t0['aspectRatio'] == '1:1'
    assert t0['areaRatio'] == '0:0'
    assert t0['jitterType'] == 'uniratio'
    assert t1['width'] == image_width
    assert t1['height'] == image_height
    assert t1['channels'] == num_channels
    assert t1['interpolations'] == 'linear'
    assert t2['meanFile'] == mean_file

    config = to_dictionary(MinibatchSourceConfig([image, image]))
    assert len(config['deserializers']) == 2

    ctf = create_ctf_deserializer(tmpdir)
    config = to_dictionary(MinibatchSourceConfig([image, ctf, image]))
    # Multithreading should still be enabled.
    assert config['multiThreadedDeserialization'] is True
    assert len(config['deserializers']) == 3

    # TODO depends on ImageReader.dll