def create_mb_source(data_set, img_height, img_width, n_classes, n_rois, data_path, randomize):
    # set paths
    map_file   = join(data_path, data_set + '.txt')
    roi_file   = join(data_path, data_set + '.rois.txt')
    label_file = join(data_path, data_set + '.roilabels.txt')
    if not os.path.exists(map_file) or not os.path.exists(roi_file) or not os.path.exists(label_file):
        raise RuntimeError("File '%s', '%s' or '%s' does not exist." % (map_file, roi_file, label_file))

    # read images
    nrImages = len(readTable(map_file))
    transforms = [scale(width=img_width, height=img_height, channels=3,
                        scale_mode="pad", pad_value=114, interpolations='linear')]
    image_source = ImageDeserializer(map_file, StreamDefs(features = StreamDef(field='image', transforms=transforms)))

    # read rois and labels
    rois_dim  = 4 * n_rois
    label_dim = n_classes * n_rois
    roi_source = CTFDeserializer(roi_file, StreamDefs(
        rois = StreamDef(field='rois', shape=rois_dim, is_sparse=False)))
    label_source = CTFDeserializer(label_file, StreamDefs(
        roiLabels = StreamDef(field='roiLabels', shape=label_dim, is_sparse=False)))

    # define a composite reader
    mb = MinibatchSource([image_source, roi_source, label_source], epoch_size=sys.maxsize, randomize=randomize)
    return (mb, nrImages)
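A minimal usage sketch for the composite source above. This is hypothetical: the CNTK input variables (image_input, roi_input, label_input), the trainer, and the argument values are assumed to be defined elsewhere; only the stream names ('features', 'rois', 'roiLabels') come from the StreamDefs in the example itself.

# Hypothetical usage sketch; image_input, roi_input, label_input and
# trainer are assumed to exist as CNTK input variables and a Trainer.
mb_source, nr_images = create_mb_source('train', 1000, 1000, 21, 100,
                                        data_path, randomize=True)
input_map = {
    image_input: mb_source['features'],
    roi_input:   mb_source['rois'],
    label_input: mb_source['roiLabels'],
}
samples_seen = 0
while samples_seen < nr_images:
    # keys of mb are the input variables, since input_map maps them to streams
    mb = mb_source.next_minibatch(64, input_map=input_map)
    trainer.train_minibatch(mb)
    samples_seen += mb[image_input].num_samples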
Example #2
def mb_source(tmpdir,
              fileprefix,
              max_samples=FULL_DATA_SWEEP,
              ctf=ctf_data,
              streams=('S0', 'S1'),
              max_sweeps=None):
    ctf_file = str(tmpdir / (fileprefix + '2seqtest.txt'))
    with open(ctf_file, 'w') as f:
        f.write(ctf)

    # max_samples and max_sweeps are mutually exclusive stopping criteria;
    # pass along whichever one the caller specified.
    if max_sweeps is None:
        stop_criterion = dict(max_samples=max_samples)
    else:
        stop_criterion = dict(max_sweeps=max_sweeps)

    mbs = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field=streams[0],
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field=streams[1],
                                    shape=input_dim,
                                    is_sparse=True))),
                          randomize=False,
                          **stop_criterion)

    return mbs
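A hedged sketch of how this helper's stopping criteria play out: with max_samples set (and max_sweeps left at None), next_minibatch returns an empty dict once the bound is reached. tmpdir, ctf_data, and input_dim are assumed to exist at module scope, just as the helper itself assumes.

mbs = mb_source(tmpdir, 'bounded', max_samples=10)
total = 0
while True:
    mb = mbs.next_minibatch(4)
    if not mb:  # the source is exhausted once max_samples is reached
        break
    total += mb[mbs.streams.features].num_samples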
Example #3
def create_mb_source(img_height, img_width, img_channels, n_classes, n_rois, data_path, data_set):
    rois_dim = 4 * n_rois
    label_dim = n_classes * n_rois

    path = os.path.normpath(os.path.join(abs_path, data_path))
    if data_set == 'test':
        map_file = os.path.join(path, test_map_filename)
    else:
        map_file = os.path.join(path, train_map_filename)
    roi_file = os.path.join(path, data_set + rois_filename_postfix)
    label_file = os.path.join(path, data_set + roilabels_filename_postfix)

    if not os.path.exists(map_file) or not os.path.exists(roi_file) or not os.path.exists(label_file):
        raise RuntimeError("File '%s', '%s' or '%s' does not exist. "
                           "Please run install_fastrcnn.py from Examples/Image/Detection/FastRCNN to fetch them" %
                           (map_file, roi_file, label_file))

    # read images
    transforms = [scale(width=img_width, height=img_height, channels=img_channels,
                        scale_mode="pad", pad_value=114, interpolations='linear')]

    image_source = ImageDeserializer(map_file, StreamDefs(
        features = StreamDef(field='image', transforms=transforms)))

    # read rois and labels
    roi_source = CTFDeserializer(roi_file, StreamDefs(
        rois = StreamDef(field=roi_stream_name, shape=rois_dim, is_sparse=False)))
    label_source = CTFDeserializer(label_file, StreamDefs(
        roiLabels = StreamDef(field=label_stream_name, shape=label_dim, is_sparse=False)))

    # define a composite reader
    return MinibatchSource([image_source, roi_source, label_source], epoch_size=sys.maxsize, randomize=data_set == "train")
Example #4
def test_minibatch_defined_by_labels(tmpdir):

    input_dim = 1000
    num_output_classes = 5

    def assert_data(mb_source):
        features_si = mb_source.stream_info('features')
        labels_si = mb_source.stream_info('labels')

        mb = mb_source.next_minibatch(2)

        features = mb[features_si]

        # 2 sequences (7 samples in total), max sequence length 4, 1000 dim
        assert features.shape == (2, 4, input_dim)
        assert features.end_of_sweep
        assert features.num_sequences == 2
        assert features.num_samples == 7
        assert features.is_sparse

        labels = mb[labels_si]
        # 2 sequences (one label sample each), max sequence length 1, 5 dim
        assert labels.shape == (2, 1, num_output_classes)
        assert labels.end_of_sweep
        assert labels.num_sequences == 2
        assert labels.num_samples == 2
        assert not labels.is_sparse

        label_data = labels.asarray()
        assert np.allclose(label_data,
                           np.asarray([
                               [[1.,  0.,  0.,  0.,  0.]],
                               [[0.,  1.,  0.,  0.,  0.]]
                           ]))

        mb = mb_source.next_minibatch(3)
        features = mb[features_si]
        labels = mb[labels_si]

        assert features.num_samples == 10
        assert labels.num_samples == 3

    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False, defines_mb_size=True)
    )), randomize=False)

    assert_data(mb_source)

    tmpfile1 = _write_data(tmpdir, MBDATA_SPARSE1, '1')
    tmpfile2 = _write_data(tmpdir, MBDATA_SPARSE2, '2')
    combined_mb_source = MinibatchSource([ CTFDeserializer(tmpfile1, StreamDefs(
            features=StreamDef(field='x', shape=input_dim, is_sparse=True))),
        CTFDeserializer(tmpfile2, StreamDefs(
            labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False, defines_mb_size=True)
        ))], randomize=False)

    assert_data(combined_mb_source)
Example #5
def train():
	global sentences, vocabulary, reverse_vocabulary
	# Create the trainer and train it for the specified number of minibatches
	# Print the loss 50 times while training
	print_frequency = 50
	pp = ProgressPrinter(print_frequency)

	# get the trainer
	word_one_hot, context_one_hots, negative_one_hots, targets, trainer, word_negative_context_product, embedding_layer = create_trainer()
	
	# Create a CTF reader which reads the sparse inputs
	print("reader started")
	reader = CTFDeserializer(G.CTF_input_file)
	reader.map_input(G.word_input_field, dim=G.embedding_vocab_size, format="sparse")
	# context inputs
	for i in range(context_size):
		reader.map_input(G.context_input_field.format(i), dim=G.embedding_vocab_size, format="sparse")
	# negative inputs
	for i in range(G.negative):
		reader.map_input(G.negative_input_field.format(i), dim=G.embedding_vocab_size, format="sparse")
	# targets
	reader.map_input(G.target_input_field, dim=(G.negative + 1), format="dense")
	print("reader done")

	# Get minibatch source from reader
	is_training = True
	minibatch_source = MinibatchSource(reader, randomize=is_training, epoch_size=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)
	minibatch_source.streams[targets] = minibatch_source.streams[G.target_input_field]
	del minibatch_source.streams[G.target_input_field]
	print("minibatch source done")
	
	total_minibatches = total_training_instances // G.minibatch_size
	print("traning started")
	print("Total minibatches to train =", total_minibatches)
	for i in range(total_minibatches):
		# Collect minibatch
		# start_batch_collection = time.time()
		mb = minibatch_source.next_minibatch(G.minibatch_size, input_map=minibatch_source.streams)
		# end_batch_collection = time.time()
		# print("Batch collection time = %.6fsecs" % (end_batch_collection - start_batch_collection))
		# print("Time taken to collect one training_instance = %.6fsecs" % ((end_batch_collection - start_batch_collection)/G.minibatch_size))
		# Train minibatch
		# start_train = time.time()
		trainer.train_minibatch(mb)
		# end_train = time.time()
		# print("minibatch train time = %.6fsecs" % (end_train - start_train))
		# print("Time per training instance = %.6fsecs" % ((end_train - start_train)/G.minibatch_size))
		# Update progress printer
		pp.update_with_trainer(trainer)

		# start_batch_collection = time.time()
	print("Total training instances =", total_training_instances)
	return word_negative_context_product
Example #6
def create_reader(path,
                  vocab_dim,
                  entity_dim,
                  randomize,
                  rand_size=DEFAULT_RANDOMIZATION_WINDOW,
                  size=INFINITELY_REPEAT):
    """
  Create data reader for the model
  Args:
    path: The data path
    vocab_dim: The dimention of the vocabulary
    entity_dim: The dimention of entities
    randomize: Where to shuffle the data before feed into the trainer
  """
    return MinibatchSource(CTFDeserializer(
        path,
        StreamDefs(context=StreamDef(field='C',
                                     shape=vocab_dim,
                                     is_sparse=True),
                   query=StreamDef(field='Q', shape=vocab_dim, is_sparse=True),
                   entities=StreamDef(field='E', shape=1, is_sparse=False),
                   label=StreamDef(field='L', shape=1, is_sparse=False),
                   entity_ids=StreamDef(field='EID',
                                        shape=entity_dim,
                                        is_sparse=True))),
                           randomize=randomize)
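For orientation, a hypothetical CTF line compatible with these stream definitions (sparse fields C, Q, and EID use index:value pairs; dense fields E and L carry raw values):

0 |C 72:1 13:1 |Q 4:1 |E 1 |L 0 |EID 7:1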
Example #7
def test_MinibatchData_and_Value_as_input(tmpdir):

    mbdata = r'''0  |S0 100'''

    tmpfile = str(tmpdir / 'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    defs = StreamDefs(f1=StreamDef(field='S0', shape=1))
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, defs),
                                randomize=False)

    f1_si = mb_source.stream_info('f1')

    mb = mb_source.next_minibatch(1)

    f1 = input(shape=(1, ), needs_gradient=True, name='f')
    res = f1 * 2

    assert res.eval({f1: mb[f1_si]}) == [[200]]
    # Test MinibatchData
    assert res.eval(mb[f1_si]) == [[200]]
    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]
    # Test NumPy (converted back from MinibatchData)
    assert res.eval(mb[f1_si].asarray()) == [[200]]
Example #8
def test_large_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_2)

    mb_source = MinibatchSource(CTFDeserializer(
        tmpfile,
        StreamDefs(features=StreamDef(field='S0', shape=1),
                   labels=StreamDef(field='S1', shape=1))),
                                randomization_window_in_chunks=0)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)
    features = mb[features_si]
    labels = mb[labels_si]

    # The minibatch spans multiple sweeps. It is unclear whether this is
    # an artificial situation, but instead of a boolean flag we could
    # indicate the largest sweep index the data was taken from.
    assert features.end_of_sweep
    assert labels.end_of_sweep

    assert features.num_samples == 1000 - 1000 % 7
    assert labels.num_samples == 5 * (1000 // 7)

    assert mb[features_si].num_sequences == (1000 // 7)
    assert mb[labels_si].num_sequences == (1000 // 7)
Example #9
def test_base64_image_deserializer(tmpdir):
    import io, base64, uuid
    from PIL import Image
    images, b64_images = [], []

    np.random.seed(1)
    for i in range(10):
        data = np.random.randint(0, 2**8, (5, 7, 3))
        image = Image.fromarray(data.astype('uint8'), "RGB")
        buf = io.BytesIO()
        image.save(buf, format='PNG')
        assert image.width == 7 and image.height == 5
        b64_images.append(base64.b64encode(buf.getvalue()))
        images.append(np.array(image))

    image_data = str(tmpdir / 'mbdata1.txt')
    seq_ids = []
    uid = uuid.uuid1().int >> 64
    with open(image_data, 'wb') as f:
        for i, data in enumerate(b64_images):
            seq_id = uid ^ i
            seq_id = str(seq_id).encode('ascii')
            seq_ids.append(seq_id)
            line = seq_id + b'\t'
            label = str(i).encode('ascii')
            line += label + b'\t' + data + b'\n'
            f.write(line)

    ctf_data = str(tmpdir / 'mbdata2.txt')
    with open(ctf_data, 'wb') as f:
        for i, sid in enumerate(seq_ids):
            line = sid + b'\t' + b'|index ' + str(i).encode('ascii') + b'\n'
            f.write(line)

    transforms = [xforms.scale(width=7, height=5, channels=3)]
    b64_deserializer = Base64ImageDeserializer(
        image_data,
        StreamDefs(images=StreamDef(field='image', transforms=transforms),
                   labels=StreamDef(field='label', shape=10)))

    ctf_deserializer = CTFDeserializer(
        ctf_data, StreamDefs(index=StreamDef(field='index', shape=1)))

    mb_source = MinibatchSource([ctf_deserializer, b64_deserializer])
    assert isinstance(mb_source, MinibatchSource)

    for j in range(100):
        mb = mb_source.next_minibatch(10)

        index_stream = mb_source.streams['index']
        index = mb[index_stream].asarray().flatten()
        image_stream = mb_source.streams['images']

        results = mb[image_stream].asarray()

        for i in range(10):
            # original images are RGB, OpenCV produces BGR images;
            # reverse the last dimension of the original images
            bgrImage = images[int(index[i])][:, :, ::-1]
            assert (bgrImage == results[i][0]).all()
def create_mask_deserializer(path):
	return CTFDeserializer(
		path,
		StreamDefs(
			mask = StreamDef(field = 'mask', shape = numLabels)
		)
	)
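A hedged sketch of how such a mask deserializer is typically combined with an image reader into one composite source; map_file, mask_file, and transforms are placeholders, not names from the example above.

# Placeholder names: map_file, mask_file and transforms are assumed
# to be defined elsewhere.
image_source = ImageDeserializer(map_file, StreamDefs(
    features=StreamDef(field='image', transforms=transforms)))
mask_source = create_mask_deserializer(mask_file)
mb_source = MinibatchSource([image_source, mask_source], randomize=True)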
Example #11
def test_distributed_mb_source_again(tmpdir):
    import random
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs

    ctf_data = '''0  |S0 1   |S1 1
0   |S0 2   |S1 2
0   |S0 3
1   |S0 4
1   |S0 5   |S1 3
1   |S0 6   |S1 4
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    ctf = CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=1),
        labels    = StreamDef(field='S1', shape=1)
        ))
    
    random.seed(1234)
    mb_sources = []
    for randomize in [True, False]:
        mb_sources.append(MinibatchSource(ctf, randomize=randomize))
        mb_sources.append(MinibatchSource(ctf, randomize=randomize,  max_sweeps=random.randint(1, 10)))
        mb_sources.append(MinibatchSource(ctf, randomize=randomize, max_samples=random.randint(1, 30)))

    for i in range(20):
        for source in mb_sources:
            data = source.next_minibatch(minibatch_size_in_samples=5, 
                num_data_partitions=2, partition_index=i % 2)
            features = source.streams['features']
            assert(len(data) == 0 or data[features].num_samples == 3)
Example #12
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input, times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_vocab_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=label_vocab_dim,
                                    is_sparse=True))),
                          randomize=False,
                          epoch_size=2)

    raw_input = sequence.input(shape=input_vocab_dim,
                               sequence_axis=Axis('inputAxis'),
                               name='raw_input',
                               is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
                                  input_map={raw_input: mbs.streams.features},
                                  device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [[3, 4, 5, 4, 7, 12, 1], [60, 61]]
    data = [
        csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in one_hot_data
    ]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = Value.one_hot(one_hot_data,
                         num_classes=input_vocab_dim,
                         device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_hot)])
def create_reader_raw(path, is_training, input_dim, num_label_classes):
    """
    Reads in the unstandardized values.
    """
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        labels = StreamDef(field='rawlabels', shape=num_label_classes),
        features   = StreamDef(field='rawfeatures', shape=input_dim)
    )), randomize=is_training, max_sweeps=INFINITELY_REPEAT if is_training else 1)
Example #14
def create_reader(path, is_training, input_dim, label_dim):
    return MinibatchSource(
        CTFDeserializer(
            path,
            StreamDefs(features=StreamDef(field='features', shape=input_dim),
                       labels=StreamDef(field='labels', shape=label_dim))),
        randomize=is_training,
        epoch_size=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)
def create_reader(path, is_training, input_dim, num_label_classes):
    """
    Reads a CNTK-formatted file with 'labels' and 'features' streams.
    """
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        labels = StreamDef(field='labels', shape=num_label_classes),
        features   = StreamDef(field='features', shape=input_dim)
    )), randomize=is_training, max_sweeps=INFINITELY_REPEAT if is_training else 1)
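A minimal sketch of wiring a reader built this way to a model. The file name is hypothetical, input_dim and num_label_classes are assumed to be defined, and cntk is imported under its usual alias.

import cntk as C

# Hypothetical file name; input_dim / num_label_classes assumed defined.
reader = create_reader('Train_cntk_text.txt', True, input_dim, num_label_classes)
features = C.input_variable(input_dim)
labels = C.input_variable(num_label_classes)
input_map = {features: reader.streams.features,
             labels: reader.streams.labels}
data = reader.next_minibatch(64, input_map=input_map)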
def create_reader(path, randomize, input_vocab_dim, label_vocab_dim, size=INFINITELY_REPEAT):
    if not os.path.exists(path):
        raise RuntimeError("File '%s' does not exist." % (path))

    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features  = StreamDef(field='S0', shape=input_vocab_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=label_vocab_dim,  is_sparse=True)
    )), randomize=randomize, max_samples = size)
Example #17
def test_minibatch(tmpdir):

    mbdata = r'''0	|S0 0   |S1 0
0	|S0 1 	|S1 1 
0	|S0 2 	
0	|S0 3 	|S1 3 
1	|S0 4 	
1	|S0 5 	|S1 1
1	|S0 6	|S1 2 
'''

    tmpfile = str(tmpdir/'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features  = StreamDef(field='S0', shape=1),
        labels    = StreamDef(field='S1', shape=1))))
     
    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')
    
    mb = mb_source.next_minibatch(1000)
    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert len(features.value) == 2
    expected_features = \
            [
                [[0],[1],[2],[3]],
                [[4],[5],[6]]
            ]

    for res, exp in zip (features.value, expected_features):
        assert np.allclose(res, exp)

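    # In the mask, 2 marks the first step of a sequence, 1 a valid step
    # within a sequence, and 0 padding out to the longest sequence.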
    assert np.allclose(features.mask, 
            [[2, 1, 1, 1],
             [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert len(labels.value) == 2
    expected_labels = \
            [
                [[0],[1],[3]], 
                [[1],[2]]
            ]
    for res, exp in zip (labels.value, expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.mask, 
            [[2, 1, 1],
             [2, 1, 0]])
Example #18
def create_reader(path, is_training, input_dim, output_dim):
    return MinibatchSource(CTFDeserializer(
        path,
        StreamDefs(features=StreamDef(field='attribs',
                                      shape=input_dim,
                                      is_sparse=False),
                   labels=StreamDef(field='species',
                                    shape=output_dim,
                                    is_sparse=False))),
                           randomize=is_training,
                           max_sweeps=INFINITELY_REPEAT if is_training else 1)
Example #19
def create_reader(path, is_training):
    return MinibatchSource(CTFDeserializer(
        path,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_vocab_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=label_vocab_dim,
                                    is_sparse=True))),
                           randomize=is_training,
                           max_sweeps=INFINITELY_REPEAT if is_training else 1)
Example #20
def mb_source(tmpdir, fileprefix, max_samples=FULL_DATA_SWEEP):
    ctf_file = str(tmpdir / (fileprefix + '2seqtest.txt'))
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=input_dim, is_sparse=True)
    )),
        randomize=False, max_samples=max_samples)
    return mbs
Example #21
def create_reader(path, randomize, size=INFINITELY_REPEAT):
    return MinibatchSource(CTFDeserializer(
        path,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_vocab_size,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=label_vocab_size,
                                    is_sparse=True))),
                           randomize=randomize,
                           epoch_size=size)
Example #22
def create_reader(path, is_training, input_dim, label_dim):
    """Create MinibatchSource for reaching training data from given file"""
    return MinibatchSource(CTFDeserializer(
        path,
        StreamDefs(features=StreamDef(field='features',
                                      shape=input_dim,
                                      is_sparse=False),
                   labels=StreamDef(field='labels',
                                    shape=label_dim,
                                    is_sparse=False))),
                           randomize=is_training,
                           max_sweeps=INFINITELY_REPEAT if is_training else 1)
def create_reader(path):
    return MinibatchSource(
        CTFDeserializer(
            path,
            StreamDefs(
                query=StreamDef(field='S0', shape=input_dim, is_sparse=True),
                intent_unused=StreamDef(
                    field='S1', shape=num_intents,
                    is_sparse=True),  # BUGBUG: unused, and should infer dim
                slot_labels=StreamDef(field='S2',
                                      shape=label_dim,
                                      is_sparse=True))))
Example #24
def create_mb_source(img_height, img_width, img_channels, n_classes, n_rois, data_path, data_set):
    rois_dim = 4 * n_rois
    label_dim = n_classes * n_rois

    path = os.path.normpath(os.path.join(abs_path, data_path))
    if data_set == 'test':
        map_file = os.path.join(path, test_map_filename)
    else:
        map_file = os.path.join(path, train_map_filename)
    roi_file = os.path.join(path, data_set + rois_filename_postfix)
    label_file = os.path.join(path, data_set + roilabels_filename_postfix)

    if not os.path.exists(map_file) or not os.path.exists(roi_file) or not os.path.exists(label_file):
        raise RuntimeError("File '%s', '%s' or '%s' does not exist. "
                           "Please run install_fastrcnn.py from Examples/Image/Detection/FastRCNN to fetch them" %
                           (map_file, roi_file, label_file))

    # read images
    image_source = ImageDeserializer(map_file)
    image_source.ignore_labels()
    image_source.map_features(features_stream_name,
                              [ImageDeserializer.scale(width=img_width, height=img_height, channels=img_channels,
                                                       scale_mode="pad", pad_value=114, interpolations='linear')])

    # read rois and labels
    roi_source = CTFDeserializer(roi_file)
    roi_source.map_input(roi_stream_name, dim=rois_dim, format="dense")
    label_source = CTFDeserializer(label_file)
    label_source.map_input(label_stream_name, dim=label_dim, format="dense")

    # define a composite reader
    return MinibatchSource([image_source, roi_source, label_source], epoch_size=sys.maxsize, randomize=data_set == "train")
Example #25
def test_text_format(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)

    input_dim = 1000
    num_output_classes = 5

    mb_source = MinibatchSource(CTFDeserializer(
        tmpfile,
        StreamDefs(features=StreamDef(field='x',
                                      shape=input_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='y',
                                    shape=num_output_classes,
                                    is_sparse=False))),
                                randomize=False)

    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 sequences (7 samples in total), max sequence length 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.end_of_sweep
    assert features.num_sequences == 2
    assert features.num_samples == 7
    assert features.is_sparse

    labels = mb[labels_si]
    # 2 sequences (one label sample each), max sequence length 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert labels.end_of_sweep
    assert labels.num_sequences == 2
    assert labels.num_samples == 2
    assert not labels.is_sparse

    label_data = labels.asarray()
    assert np.allclose(label_data,
                       np.asarray([[[1., 0., 0., 0., 0.]],
                                   [[0., 1., 0., 0., 0.]]]))

    mb = mb_source.next_minibatch(1)
    features = mb[features_si]
    labels = mb[labels_si]

    assert not features.end_of_sweep
    assert not labels.end_of_sweep
    assert features.num_samples < 7
    assert labels.num_samples == 1
Example #26
def test_text_format(tmpdir):
    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs

    mbdata = r'''0	|x 560:1	|y 1 0 0 0 0
0	|x 0:1
0	|x 0:1
1	|x 560:1	|y 0 1 0 0 0
1	|x 0:1
1	|x 0:1
1	|x 424:1
'''
    tmpfile = str(tmpdir/'mbdata.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    input_dim = 1000
    num_output_classes = 5

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
         features  = StreamDef(field='x', shape=input_dim, is_sparse=True),
         labels    = StreamDef(field='y', shape=num_output_classes, is_sparse=False)
       )))

    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 sequences (7 samples in total), max sequence length 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.is_sparse
    # TODO features is sparse and cannot be accessed right now:
    # *** RuntimeError: DataBuffer/WritableDataBuffer methods can only be called for NDArrayView objects with dense storage format
    # 2 samples, max seq len 4, 1000 dim
    #assert features.data().shape().dimensions() == (2, 4, input_dim)
    #assert features.data().is_sparse()

    labels = mb[labels_si]
    # 2 sequences (one label sample each), max sequence length 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert not labels.is_sparse

    label_data = np.asarray(labels)
    assert np.allclose(label_data,
            np.asarray([
                [[ 1.,  0.,  0.,  0.,  0.]],
                [[ 0.,  1.,  0.,  0.,  0.]]
                ]))
Example #27
def create_reader(path, is_training):
    return MinibatchSource(
        CTFDeserializer(
            path,
            StreamDefs(
                query=StreamDef(field='S0', shape=vocab_size, is_sparse=True),
                intent_labels=StreamDef(
                    field='S1', shape=num_intents, is_sparse=True
                ),  # (used for intent classification variant)
                slot_labels=StreamDef(field='S2',
                                      shape=num_labels,
                                      is_sparse=True))),
        randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else 1)
Example #28
def test_prefetch_with_unpacking(tmpdir):
    data = r'''0  |S0 1 1 1 1   |S1 1000
1   |S0 2 2 2 2  |S1 100
2   |S0 3 3 3 3  |S1 100
3   |S0 1 1 1 1  |S1 10
4   |S0 2 2 2 2  |S1 1
5   |S0 3 3 3 3  |S1 2000
6   |S0 1 1 1 1  |S1 200
7   |S0 2 2 2 2  |S1 200
8   |S0 3 3 3 3  |S1 20
9   |S0 1 1 1 1  |S1 2
'''
    import time
    tmpfile = _write_data(tmpdir, data)

    input_dim = 4
    num_output_classes = 1

    mb_source = MinibatchSource(CTFDeserializer(
        tmpfile,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_dim,
                                      is_sparse=False),
                   labels=StreamDef(field='S1',
                                    shape=num_output_classes,
                                    is_sparse=False))),
                                randomize=False,
                                max_samples=FULL_DATA_SWEEP)

    input_map = {
        'S0': mb_source.streams.features,
        'S1': mb_source.streams.labels
    }
    empty = False
    mb_size = 3
    # The last minibatch triggers a resize, because
    # 10 % 3 leaves a single sample for it
    while not empty:
        mb = mb_source.next_minibatch(mb_size, input_map=input_map)
        time.sleep(1)  # make sure the prefetch kicks in
        if mb:
            # Force unpacking to check that we do
            # not break prefetch
            actual_size = mb['S0'].shape[0]
            assert (mb['S0'].asarray() == np.array(
                [[[1, 1, 1, 1]], [[2, 2, 2, 2]], [[3, 3, 3, 3]]],
                dtype=np.float32)[0:actual_size]).all()
        else:
            empty = True
Example #29
def create_reader(path, is_training, query_total_dim, passage_total_dim,
                  label_total_dim):
    return MinibatchSource(
        CTFDeserializer(
            path,
            StreamDefs(queryfeatures=StreamDef(field='qfeatures',
                                               shape=query_total_dim,
                                               is_sparse=False),
                       passagefeatures=StreamDef(field='pfeatures',
                                                 shape=passage_total_dim,
                                                 is_sparse=False),
                       labels=StreamDef(field='labels',
                                        shape=label_total_dim,
                                        is_sparse=False))),
        randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)
Example #30
def test_full_sweep_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_1)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features  = StreamDef(field='S0', shape=1),
        labels    = StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0, max_sweeps=1)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)

    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert features.end_of_sweep
    assert len(features.as_sequences()) == 2
    expected_features = \
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]]
        ]

    for res, exp in zip(features.as_sequences(), expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.data.mask,
            [[2, 1, 1, 1],
             [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert labels.end_of_sweep
    assert len(labels.as_sequences()) == 2
    expected_labels = \
            [
                [[0],[1],[3]],
                [[1],[2]]
            ]
    for res, exp in zip(labels.as_sequences(), expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.data.mask,
            [[2, 1, 1],
             [2, 1, 0]])
Example #31
def create_mb_source(img_height, img_width, img_channels, n_classes, n_rois,
                     data_path, data_set):
    rois_dim = 4 * n_rois
    label_dim = n_classes * n_rois

    path = os.path.normpath(os.path.join(abs_path, data_path))
    if data_set == 'test':
        map_file = os.path.join(path, test_map_filename)
    else:
        map_file = os.path.join(path, train_map_filename)
    roi_file = os.path.join(path, data_set + rois_filename_postfix)
    label_file = os.path.join(path, data_set + roilabels_filename_postfix)

    if not os.path.exists(map_file) or not os.path.exists(
            roi_file) or not os.path.exists(label_file):
        raise RuntimeError(
            "File '%s', '%s' or '%s' does not exist. "
            "Please run install_fastrcnn.py from Examples/Image/Detection/FastRCNN to fetch them"
            % (map_file, roi_file, label_file))

    # read images
    image_source = ImageDeserializer(map_file)
    image_source.ignore_labels()
    image_source.map_features(features_stream_name, [
        ImageDeserializer.scale(width=img_width,
                                height=img_height,
                                channels=img_channels,
                                scale_mode="pad",
                                pad_value=114,
                                interpolations='linear')
    ])

    # read rois and labels
    roi_source = CTFDeserializer(roi_file)
    roi_source.map_input(roi_stream_name, dim=rois_dim, format="dense")
    label_source = CTFDeserializer(label_file)
    label_source.map_input(label_stream_name, dim=label_dim, format="dense")

    # define a composite reader
    rc = ReaderConfig([image_source, roi_source, label_source],
                      epoch_size=sys.maxsize,
                      randomize=data_set == "train")
    return rc.minibatch_source()
Example #32
def test_usermbsource(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)

    input_dim = 1000
    num_output_classes = 5

    # Setting up the native MB source as the ground truth
    n_mb_source = CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False)
    ))
    n_mb_source = MinibatchSource(n_mb_source, randomize=False)
    n_features_si = n_mb_source['features']
    n_labels_si = n_mb_source['labels']

    n_mb = n_mb_source.next_minibatch(2)
    n_features = n_mb[n_features_si]
    n_labels = n_mb[n_labels_si]

    # Setting up the user MB source
    u_mb_source = MyDataSource(input_dim, num_output_classes)
    u_features_si = u_mb_source['features']
    u_labels_si = u_mb_source['labels']

    u_mb = u_mb_source.next_minibatch(2, 1, 0)
    u_features = u_mb[u_features_si]
    u_labels = u_mb[u_labels_si]

    assert u_features.shape == n_features.shape == (1, 3, 1000)
    assert u_features.end_of_sweep == n_features.end_of_sweep
    assert u_features.num_sequences == n_features.num_sequences
    assert u_features.num_samples == n_features.num_samples
    assert u_features.is_sparse == n_features.is_sparse

    assert u_labels.shape == n_labels.shape == (1, 1, 5)
    assert u_labels.end_of_sweep is n_labels.end_of_sweep is False
    assert u_labels.num_sequences == n_labels.num_sequences
    assert u_labels.num_samples == n_labels.num_samples
    assert u_labels.is_sparse is n_labels.is_sparse is False

    u_label_data = u_labels.asarray()
    n_label_data = n_labels.asarray()
    assert np.allclose(u_label_data, n_label_data)

    n_mb = n_mb_source.next_minibatch(10)
    n_features = n_mb[n_features_si]
    n_labels = n_mb[n_labels_si]

    u_mb = u_mb_source.next_minibatch(10, 1, 0)
    u_features = u_mb[u_features_si]
    u_labels = u_mb[u_labels_si]

    assert u_labels.shape == n_labels.shape
    u_label_data = u_labels.asarray()
    n_label_data = n_labels.asarray()

    assert np.allclose(u_label_data, n_label_data)

    assert u_features.end_of_sweep is u_labels.end_of_sweep is True
    assert u_features.num_samples == n_features.num_samples
    assert u_features.num_sequences == n_features.num_sequences
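MyDataSource itself is not shown in this excerpt. For orientation, a rough skeleton of a cntk.io.UserMinibatchSource subclass is sketched below; the stream names mirror the native reader used as ground truth, but the shapes and the payload are illustrative placeholders, not the test's actual MBDATA_SPARSE contents.

import numpy as np
import cntk as C
from cntk.io import UserMinibatchSource, StreamInformation, MinibatchData

class MyDataSource(UserMinibatchSource):
    def __init__(self, f_dim, l_dim):
        self.f_dim, self.l_dim = f_dim, l_dim
        # stream metadata: (name, stream_id, storage_format, dtype, shape)
        self.fsi = StreamInformation('features', 0, 'sparse', np.float32, (f_dim,))
        self.lsi = StreamInformation('labels', 1, 'dense', np.float32, (l_dim,))
        super(MyDataSource, self).__init__()

    def stream_infos(self):
        return [self.fsi, self.lsi]

    def next_minibatch(self, num_samples, number_of_workers, worker_rank,
                       device=None):
        # Illustrative payload: one sequence of three one-hot feature
        # samples and one dense label vector for that sequence.
        feature_seq = [0, 1, 2]
        label_seq = [np.eye(self.l_dim, dtype=np.float32)[0]]
        f_value = C.Value.one_hot(batch=[feature_seq], num_classes=self.f_dim)
        l_value = C.Value(batch=np.asarray([label_seq], dtype=np.float32))
        # MinibatchData(value, num_sequences, num_samples, sweep_end)
        return {
            self.fsi: MinibatchData(f_value, 1, len(feature_seq), False),
            self.lsi: MinibatchData(l_value, 1, len(label_seq), False),
        }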