def create_mb_source(data_set, img_height, img_width, n_classes, n_rois, data_path, randomize):
    # set paths
    map_file = join(data_path, data_set + '.txt')
    roi_file = join(data_path, data_set + '.rois.txt')
    label_file = join(data_path, data_set + '.roilabels.txt')
    if not os.path.exists(map_file) or not os.path.exists(roi_file) or not os.path.exists(label_file):
        raise RuntimeError("File '%s', '%s' or '%s' does not exist. " % (map_file, roi_file, label_file))

    # read images
    nrImages = len(readTable(map_file))
    transforms = [scale(width=img_width, height=img_height, channels=3,
                        scale_mode="pad", pad_value=114, interpolations='linear')]
    image_source = ImageDeserializer(map_file, StreamDefs(
        features = StreamDef(field='image', transforms=transforms)))

    # read rois and labels
    rois_dim = 4 * n_rois
    label_dim = n_classes * n_rois
    roi_source = CTFDeserializer(roi_file, StreamDefs(
        rois = StreamDef(field='rois', shape=rois_dim, is_sparse=False)))
    label_source = CTFDeserializer(label_file, StreamDefs(
        roiLabels = StreamDef(field='roiLabels', shape=label_dim, is_sparse=False)))

    # define a composite reader
    mb = MinibatchSource([image_source, roi_source, label_source],
                         epoch_size=sys.maxsize, randomize=randomize)
    return (mb, nrImages)
def mb_source(tmpdir, fileprefix, max_samples=FULL_DATA_SWEEP, ctf=ctf_data, streams=['S0', 'S1'], max_sweeps=None):
    ctf_file = str(tmpdir / (fileprefix + '2seqtest.txt'))
    with open(ctf_file, 'w') as f:
        f.write(ctf)

    deserializer = CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field=streams[0], shape=input_dim, is_sparse=True),
        labels=StreamDef(field=streams[1], shape=input_dim, is_sparse=True)))

    # max_samples and max_sweeps are mutually exclusive, so pass only one of them
    if max_sweeps is None:
        mbs = MinibatchSource(deserializer, randomize=False, max_samples=max_samples)
    else:
        mbs = MinibatchSource(deserializer, randomize=False, max_sweeps=max_sweeps)
    return mbs
def create_mb_source(img_height, img_width, img_channels, n_classes, n_rois, data_path, data_set): rois_dim = 4 * n_rois label_dim = n_classes * n_rois path = os.path.normpath(os.path.join(abs_path, data_path)) if data_set == 'test': map_file = os.path.join(path, test_map_filename) else: map_file = os.path.join(path, train_map_filename) roi_file = os.path.join(path, data_set + rois_filename_postfix) label_file = os.path.join(path, data_set + roilabels_filename_postfix) if not os.path.exists(map_file) or not os.path.exists(roi_file) or not os.path.exists(label_file): raise RuntimeError("File '%s', '%s' or '%s' does not exist. " "Please run install_fastrcnn.py from Examples/Image/Detection/FastRCNN to fetch them" % (map_file, roi_file, label_file)) # read images transforms = [scale(width=img_width, height=img_height, channels=img_channels, scale_mode="pad", pad_value=114, interpolations='linear')] image_source = ImageDeserializer(map_file, StreamDefs( features = StreamDef(field='image', transforms=transforms))) # read rois and labels roi_source = CTFDeserializer(roi_file, StreamDefs( rois = StreamDef(field=roi_stream_name, shape=rois_dim, is_sparse=False))) label_source = CTFDeserializer(label_file, StreamDefs( roiLabels = StreamDef(field=label_stream_name, shape=label_dim, is_sparse=False))) # define a composite reader return MinibatchSource([image_source, roi_source, label_source], epoch_size=sys.maxsize, randomize=data_set == "train")
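# A hypothetical usage sketch for the composite source above, not part of the
# original example: the stream names (features, rois, roiLabels) come from the
# StreamDefs in create_mb_source, but the dimensions, data path, and input
# variables below are illustrative assumptions only.
import cntk as C

n_rois, n_classes = 100, 17
mb_src = create_mb_source(850, 850, 3, n_classes, n_rois,
                          data_path="../Data", data_set="train")
image_input = C.input_variable((3, 850, 850))
roi_input = C.input_variable((n_rois, 4))
label_input = C.input_variable((n_rois, n_classes))
input_map = {
    image_input: mb_src.streams.features,
    roi_input: mb_src.streams.rois,
    label_input: mb_src.streams.roiLabels
}
mb = mb_src.next_minibatch(1, input_map=input_map)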
def test_minibatch_defined_by_labels(tmpdir):
    input_dim = 1000
    num_output_classes = 5

    def assert_data(mb_source):
        features_si = mb_source.stream_info('features')
        labels_si = mb_source.stream_info('labels')

        mb = mb_source.next_minibatch(2)

        features = mb[features_si]
        # 2 samples, max seq len 4, 1000 dim
        assert features.shape == (2, 4, input_dim)
        assert features.end_of_sweep
        assert features.num_sequences == 2
        assert features.num_samples == 7
        assert features.is_sparse

        labels = mb[labels_si]
        # 2 samples, max seq len 1, 5 dim
        assert labels.shape == (2, 1, num_output_classes)
        assert labels.end_of_sweep
        assert labels.num_sequences == 2
        assert labels.num_samples == 2
        assert not labels.is_sparse

        label_data = labels.asarray()
        assert np.allclose(label_data,
                           np.asarray([
                               [[1., 0., 0., 0., 0.]],
                               [[0., 1., 0., 0., 0.]]
                           ]))

        mb = mb_source.next_minibatch(3)
        features = mb[features_si]
        labels = mb[labels_si]

        assert features.num_samples == 10
        assert labels.num_samples == 3

    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False, defines_mb_size=True)
    )), randomize=False)

    assert_data(mb_source)

    tmpfile1 = _write_data(tmpdir, MBDATA_SPARSE1, '1')
    tmpfile2 = _write_data(tmpdir, MBDATA_SPARSE2, '2')
    combined_mb_source = MinibatchSource([
        CTFDeserializer(tmpfile1, StreamDefs(
            features=StreamDef(field='x', shape=input_dim, is_sparse=True))),
        CTFDeserializer(tmpfile2, StreamDefs(
            labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False, defines_mb_size=True)
        ))], randomize=False)

    assert_data(combined_mb_source)
def train():
    global sentences, vocabulary, reverse_vocabulary
    # function will create the trainer and train it for the specified number of epochs

    # Print loss 50 times while training
    print_frequency = 50
    pp = ProgressPrinter(print_frequency)

    # get the trainer
    word_one_hot, context_one_hots, negative_one_hots, targets, trainer, word_negative_context_product, embedding_layer = create_trainer()

    # Create a CTF reader which reads the sparse inputs
    print("reader started")
    reader = CTFDeserializer(G.CTF_input_file)
    reader.map_input(G.word_input_field, dim=G.embedding_vocab_size, format="sparse")
    # context inputs
    for i in range(context_size):
        reader.map_input(G.context_input_field.format(i), dim=G.embedding_vocab_size, format="sparse")
    # negative inputs
    for i in range(G.negative):
        reader.map_input(G.negative_input_field.format(i), dim=G.embedding_vocab_size, format="sparse")
    # targets
    reader.map_input(G.target_input_field, dim=(G.negative + 1), format="dense")
    print("reader done")

    # Get minibatch source from reader
    is_training = True
    minibatch_source = MinibatchSource(reader, randomize=is_training,
                                       epoch_size=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)
    minibatch_source.streams[targets] = minibatch_source.streams[G.target_input_field]
    del minibatch_source.streams[G.target_input_field]
    print("minibatch source done")

    total_minibatches = total_training_instances // G.minibatch_size
    print("training started")
    print("Total minibatches to train =", total_minibatches)
    for i in range(total_minibatches):
        # Collect minibatch
        # start_batch_collection = time.time()
        mb = minibatch_source.next_minibatch(G.minibatch_size, input_map=minibatch_source.streams)
        # end_batch_collection = time.time()
        # print("Batch collection time = %.6fsecs" % (end_batch_collection - start_batch_collection))
        # print("Time taken to collect one training_instance = %.6fsecs" % ((end_batch_collection - start_batch_collection)/G.minibatch_size))

        # Train minibatch
        # start_train = time.time()
        trainer.train_minibatch(mb)
        # end_train = time.time()
        # print("minibatch train time = %.6fsecs" % (end_train - start_train))
        # print("Time per training instance = %.6fsecs" % ((end_train - start_train)/G.minibatch_size))

        # Update progress printer
        pp.update_with_trainer(trainer)

        # start_batch_collection = time.time()

    print("Total training instances =", total_training_instances)
    return word_negative_context_product
def create_reader(path, vocab_dim, entity_dim, randomize, rand_size=DEFAULT_RANDOMIZATION_WINDOW, size=INFINITELY_REPEAT):
    """
    Create the data reader for the model.

    Args:
        path: The data path
        vocab_dim: The dimension of the vocabulary
        entity_dim: The dimension of the entities
        randomize: Whether to shuffle the data before feeding it to the trainer
    """
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        context=StreamDef(field='C', shape=vocab_dim, is_sparse=True),
        query=StreamDef(field='Q', shape=vocab_dim, is_sparse=True),
        entities=StreamDef(field='E', shape=1, is_sparse=False),
        label=StreamDef(field='L', shape=1, is_sparse=False),
        entity_ids=StreamDef(field='EID', shape=entity_dim, is_sparse=True))),
        randomize=randomize)
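# A minimal sketch of driving the reader above, assuming a CTF file at the
# hypothetical path "train.ctf" and illustrative dimensions; the stream names
# (context, query, entities, label, entity_ids) match the StreamDefs.
reader = create_reader("train.ctf", vocab_dim=40000, entity_dim=500, randomize=True)
mb = reader.next_minibatch(128)
context_data = mb[reader.streams.context]
print(context_data.num_sequences, context_data.num_samples)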
def test_MinibatchData_and_Value_as_input(tmpdir):
    mbdata = r'''0 |S0 100'''

    tmpfile = str(tmpdir / 'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    defs = StreamDefs(f1=StreamDef(field='S0', shape=1))
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, defs),
                                randomize=False)

    f1_si = mb_source.stream_info('f1')

    mb = mb_source.next_minibatch(1)

    f1 = input(shape=(1,), needs_gradient=True, name='f')
    res = f1 * 2

    assert res.eval({f1: mb[f1_si]}) == [[200]]

    # Test MinibatchData
    assert res.eval(mb[f1_si]) == [[200]]

    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]

    # Test NumPy (converted back from MinibatchData)
    assert res.eval(mb[f1_si].asarray()) == [[200]]

    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]
def test_large_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_2)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)
    features = mb[features_si]
    labels = mb[labels_si]

    # Actually, the minibatch spans over multiple sweeps,
    # not sure if this is an artificial situation, but
    # maybe instead of a boolean flag we should indicate
    # the largest sweep index the data was taken from.
    assert features.end_of_sweep
    assert labels.end_of_sweep

    assert features.num_samples == 1000 - 1000 % 7
    assert labels.num_samples == 5 * (1000 // 7)

    assert mb[features_si].num_sequences == (1000 // 7)
    assert mb[labels_si].num_sequences == (1000 // 7)
def test_base64_image_deserializer(tmpdir):
    import io, base64, uuid
    from PIL import Image

    images, b64_images = [], []

    np.random.seed(1)
    for i in range(10):
        data = np.random.randint(0, 2**8, (5, 7, 3))
        image = Image.fromarray(data.astype('uint8'), "RGB")
        buf = io.BytesIO()
        image.save(buf, format='PNG')
        assert image.width == 7 and image.height == 5
        b64_images.append(base64.b64encode(buf.getvalue()))
        images.append(np.array(image))

    image_data = str(tmpdir / 'mbdata1.txt')
    seq_ids = []
    uid = uuid.uuid1().int >> 64
    with open(image_data, 'wb') as f:
        for i, data in enumerate(b64_images):
            seq_id = uid ^ i
            seq_id = str(seq_id).encode('ascii')
            seq_ids.append(seq_id)
            line = seq_id + b'\t'
            label = str(i).encode('ascii')
            line += label + b'\t' + data + b'\n'
            f.write(line)

    ctf_data = str(tmpdir / 'mbdata2.txt')
    with open(ctf_data, 'wb') as f:
        for i, sid in enumerate(seq_ids):
            line = sid + b'\t' + b'|index ' + str(i).encode('ascii') + b'\n'
            f.write(line)

    transforms = [xforms.scale(width=7, height=5, channels=3)]
    b64_deserializer = Base64ImageDeserializer(image_data, StreamDefs(
        images=StreamDef(field='image', transforms=transforms),
        labels=StreamDef(field='label', shape=10)))

    ctf_deserializer = CTFDeserializer(ctf_data, StreamDefs(
        index=StreamDef(field='index', shape=1)))

    mb_source = MinibatchSource([ctf_deserializer, b64_deserializer])
    assert isinstance(mb_source, MinibatchSource)

    for j in range(100):
        mb = mb_source.next_minibatch(10)

        index_stream = mb_source.streams['index']
        index = mb[index_stream].asarray().flatten()
        image_stream = mb_source.streams['images']

        results = mb[image_stream].asarray()
        for i in range(10):
            # original images are RGB, openCV produces BGR images,
            # reverse the last dimension of the original images
            bgrImage = images[int(index[i])][:, :, ::-1]
            assert (bgrImage == results[i][0]).all()
def create_mask_deserializer(path):
    return CTFDeserializer(path, StreamDefs(
        mask = StreamDef(field='mask', shape=numLabels)))
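# A minimal sketch of composing the mask deserializer into a MinibatchSource;
# the path "masks.ctf" and the module-level numLabels value are assumptions
# made for illustration only.
numLabels = 21
mask_source = MinibatchSource(create_mask_deserializer("masks.ctf"), randomize=False)
mask_mb = mask_source.next_minibatch(16)
print(mask_mb[mask_source.streams.mask].num_samples)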
def test_distributed_mb_source_again(tmpdir):
    import random
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs

    ctf_data = '''0 |S0 1 |S1 1
0 |S0 2 |S1 2
0 |S0 3
1 |S0 4
1 |S0 5 |S1 3
1 |S0 6 |S1 4
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    ctf = CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1)))

    random.seed(1234)
    mb_sources = []
    for randomize in [True, False]:
        mb_sources.append(MinibatchSource(ctf, randomize=randomize))
        mb_sources.append(MinibatchSource(ctf, randomize=randomize, max_sweeps=random.randint(1, 10)))
        mb_sources.append(MinibatchSource(ctf, randomize=randomize, max_samples=random.randint(1, 30)))

    for i in range(20):
        for source in mb_sources:
            data = source.next_minibatch(minibatch_size_in_samples=5,
                                         num_data_partitions=2,
                                         partition_index=i % 2)
            features = source.streams['features']
            assert(len(data) == 0 or data[features].num_samples == 3)
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input, times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0 |S0 3:1 |# <s> |S1 3:1 |# <s>
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# </s> |S1 1:1 |# </s>
2 |S0 60:1 |# <s> |S1 3:1 |# <s>
2 |S0 61:1 |# A |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True))),
        randomize=False, epoch_size=2)

    raw_input = sequence.input(shape=input_vocab_dim,
                               sequence_axis=Axis('inputAxis'),
                               name='raw_input',
                               is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
                                  input_map={raw_input: mbs.streams.features},
                                  device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [[3, 4, 5, 4, 7, 12, 1], [60, 61]]
    data = [csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = Value.one_hot(one_hot_data, num_classes=input_vocab_dim, device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_hot)])
def create_reader_raw(path, is_training, input_dim, num_label_classes):
    """ Reads in the unstandardized values. """
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        labels = StreamDef(field='rawlabels', shape=num_label_classes),
        features = StreamDef(field='rawfeatures', shape=input_dim)
    )), randomize=is_training, max_sweeps=INFINITELY_REPEAT if is_training else 1)
def create_reader(path, is_training, input_dim, label_dim):
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=label_dim))),
        randomize=is_training,
        epoch_size=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)
def create_reader(path, is_training, input_dim, num_label_classes):
    """ Reads a CNTK-formatted (CTF) file with 'labels' and 'features' streams. """
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        labels = StreamDef(field='labels', shape=num_label_classes),
        features = StreamDef(field='features', shape=input_dim)
    )), randomize=is_training, max_sweeps=INFINITELY_REPEAT if is_training else 1)
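# A minimal training-loop sketch around the reader above; the file name,
# dimensions, and the trainer object are illustrative assumptions, not part
# of the original code.
import cntk as C

input_dim, num_label_classes = 784, 10
reader = create_reader("Train-28x28_cntk_text.txt", True, input_dim, num_label_classes)
features = C.input_variable(input_dim)
labels = C.input_variable(num_label_classes)
input_map = {
    features: reader.streams.features,
    labels: reader.streams.labels
}
# trainer is assumed to be a cntk.Trainer built elsewhere:
# for _ in range(500):
#     mb = reader.next_minibatch(64, input_map=input_map)
#     trainer.train_minibatch(mb)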
def create_reader(path, randomize, input_vocab_dim, label_vocab_dim, size=INFINITELY_REPEAT):
    if not os.path.exists(path):
        raise RuntimeError("File '%s' does not exist." % (path))

    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features = StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels = StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True)
    )), randomize=randomize, max_samples=size)
def test_minibatch(tmpdir):
    mbdata = r'''0 |S0 0 |S1 0
0 |S0 1 |S1 1
0 |S0 2
0 |S0 3 |S1 3
1 |S0 4
1 |S0 5 |S1 1
1 |S0 6 |S1 2
'''
    tmpfile = str(tmpdir / 'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features = StreamDef(field='S0', shape=1),
        labels = StreamDef(field='S1', shape=1))))

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)

    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert len(features.value) == 2
    expected_features = \
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]]
        ]

    for res, exp in zip(features.value, expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.mask,
                       [[2, 1, 1, 1],
                        [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert len(labels.value) == 2
    expected_labels = \
        [
            [[0], [1], [3]],
            [[1], [2]]
        ]
    for res, exp in zip(labels.value, expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.mask,
                       [[2, 1, 1],
                        [2, 1, 0]])
def create_reader(path, is_training, input_dim, output_dim):
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='attribs', shape=input_dim, is_sparse=False),
        labels=StreamDef(field='species', shape=output_dim, is_sparse=False))),
        randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else 1)
def create_reader(path, is_training):
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True))),
        randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else 1)
def mb_source(tmpdir, fileprefix, max_samples=FULL_DATA_SWEEP):
    ctf_file = str(tmpdir / (fileprefix + '2seqtest.txt'))
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=input_dim, is_sparse=True)
    )), randomize=False, max_samples=max_samples)
    return mbs
def create_reader(path, randomize, size=INFINITELY_REPEAT):
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_size, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_size, is_sparse=True))),
        randomize=randomize, epoch_size=size)
def create_reader(path, is_training, input_dim, label_dim):
    """Create a MinibatchSource for reading training data from the given file."""
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim, is_sparse=False),
        labels=StreamDef(field='labels', shape=label_dim, is_sparse=False))),
        randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else 1)
def create_reader(path):
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        query=StreamDef(field='S0', shape=input_dim, is_sparse=True),
        intent_unused=StreamDef(field='S1', shape=num_intents, is_sparse=True),  # BUGBUG: unused, and should infer dim
        slot_labels=StreamDef(field='S2', shape=label_dim, is_sparse=True))))
def create_mb_source(img_height, img_width, img_channels, n_classes, n_rois, data_path, data_set):
    rois_dim = 4 * n_rois
    label_dim = n_classes * n_rois
    path = os.path.normpath(os.path.join(abs_path, data_path))
    if data_set == 'test':
        map_file = os.path.join(path, test_map_filename)
    else:
        map_file = os.path.join(path, train_map_filename)
    roi_file = os.path.join(path, data_set + rois_filename_postfix)
    label_file = os.path.join(path, data_set + roilabels_filename_postfix)

    if not os.path.exists(map_file) or not os.path.exists(roi_file) or not os.path.exists(label_file):
        raise RuntimeError("File '%s', '%s' or '%s' does not exist. "
                           "Please run install_fastrcnn.py from Examples/Image/Detection/FastRCNN to fetch them" %
                           (map_file, roi_file, label_file))

    # read images
    image_source = ImageDeserializer(map_file)
    image_source.ignore_labels()
    image_source.map_features(features_stream_name,
                              [ImageDeserializer.scale(width=img_width, height=img_height, channels=img_channels,
                                                       scale_mode="pad", pad_value=114, interpolations='linear')])

    # read rois and labels
    roi_source = CTFDeserializer(roi_file)
    roi_source.map_input(roi_stream_name, dim=rois_dim, format="dense")
    label_source = CTFDeserializer(label_file)
    label_source.map_input(label_stream_name, dim=label_dim, format="dense")

    # define a composite reader
    return MinibatchSource([image_source, roi_source, label_source],
                           epoch_size=sys.maxsize,
                           randomize=data_set == "train")
def test_text_format(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)

    input_dim = 1000
    num_output_classes = 5

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False))),
        randomize=False)
    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 samples, max seq len 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.end_of_sweep
    assert features.num_sequences == 2
    assert features.num_samples == 7
    assert features.is_sparse

    labels = mb[labels_si]
    # 2 samples, max seq len 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert labels.end_of_sweep
    assert labels.num_sequences == 2
    assert labels.num_samples == 2
    assert not labels.is_sparse

    label_data = labels.asarray()
    assert np.allclose(label_data,
                       np.asarray([
                           [[1., 0., 0., 0., 0.]],
                           [[0., 1., 0., 0., 0.]]
                       ]))

    mb = mb_source.next_minibatch(1)
    features = mb[features_si]
    labels = mb[labels_si]

    assert not features.end_of_sweep
    assert not labels.end_of_sweep
    assert features.num_samples < 7
    assert labels.num_samples == 1
def test_text_format(tmpdir):
    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs

    mbdata = r'''0 |x 560:1 |y 1 0 0 0 0
0 |x 0:1
0 |x 0:1
1 |x 560:1 |y 0 1 0 0 0
1 |x 0:1
1 |x 0:1
1 |x 424:1
'''
    tmpfile = str(tmpdir / 'mbdata.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    input_dim = 1000
    num_output_classes = 5

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features = StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels = StreamDef(field='y', shape=num_output_classes, is_sparse=False)
    )))
    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 samples, max seq len 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.is_sparse
    # TODO features is sparse and cannot be accessed right now:
    # *** RuntimeError: DataBuffer/WritableDataBuffer methods can only be called for NDArrayView objects with dense storage format
    # 2 samples, max seq len 4, 1000 dim
    #assert features.data().shape().dimensions() == (2, 4, input_dim)
    #assert features.data().is_sparse()

    labels = mb[labels_si]
    # 2 samples, max seq len 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert not labels.is_sparse

    label_data = np.asarray(labels)
    assert np.allclose(label_data,
                       np.asarray([
                           [[1., 0., 0., 0., 0.]],
                           [[0., 1., 0., 0., 0.]]
                       ]))
def create_reader(path, is_training):
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        query=StreamDef(field='S0', shape=vocab_size, is_sparse=True),
        intent_labels=StreamDef(field='S1', shape=num_intents, is_sparse=True),  # (used for intent classification variant)
        slot_labels=StreamDef(field='S2', shape=num_labels, is_sparse=True))),
        randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else 1)
def test_prefetch_with_unpacking(tmpdir):
    data = r'''0 |S0 1 1 1 1 |S1 1000
1 |S0 2 2 2 2 |S1 100
2 |S0 3 3 3 3 |S1 100
3 |S0 1 1 1 1 |S1 10
4 |S0 2 2 2 2 |S1 1
5 |S0 3 3 3 3 |S1 2000
6 |S0 1 1 1 1 |S1 200
7 |S0 2 2 2 2 |S1 200
8 |S0 3 3 3 3 |S1 20
9 |S0 1 1 1 1 |S1 2
'''
    import time
    tmpfile = _write_data(tmpdir, data)
    input_dim = 4
    num_output_classes = 1

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=False),
        labels=StreamDef(field='S1', shape=num_output_classes, is_sparse=False))),
        randomize=False, max_samples=FULL_DATA_SWEEP)

    input_map = {
        'S0': mb_source.streams.features,
        'S1': mb_source.streams.labels
    }

    empty = False
    mb_size = 3
    # The last minibatch triggers a resize, because 10 % 3 = 1
    # leaves only a single sample in it
    while not empty:
        mb = mb_source.next_minibatch(mb_size, input_map=input_map)
        time.sleep(1)  # make sure the prefetch kicks in
        if mb:
            # Force unpacking to check that we do not break prefetch
            actual_size = mb['S0'].shape[0]
            assert (mb['S0'].asarray() == np.array([[[1, 1, 1, 1]],
                                                    [[2, 2, 2, 2]],
                                                    [[3, 3, 3, 3]]],
                                                   dtype=np.float32)[0:actual_size]).all()
        else:
            empty = True
def create_reader(path, is_training, query_total_dim, passage_total_dim, label_total_dim):
    return MinibatchSource(CTFDeserializer(path, StreamDefs(
        queryfeatures=StreamDef(field='qfeatures', shape=query_total_dim, is_sparse=False),
        passagefeatures=StreamDef(field='pfeatures', shape=passage_total_dim, is_sparse=False),
        labels=StreamDef(field='labels', shape=label_total_dim, is_sparse=False))),
        randomize=is_training,
        max_sweeps=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)
def test_full_sweep_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_1)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features = StreamDef(field='S0', shape=1),
        labels = StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0, max_sweeps=1)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)

    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert features.end_of_sweep
    assert len(features.as_sequences()) == 2
    expected_features = \
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]]
        ]

    for res, exp in zip(features.as_sequences(), expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.data.mask,
                       [[2, 1, 1, 1],
                        [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert labels.end_of_sweep
    assert len(labels.as_sequences()) == 2
    expected_labels = \
        [
            [[0], [1], [3]],
            [[1], [2]]
        ]
    for res, exp in zip(labels.as_sequences(), expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.data.mask,
                       [[2, 1, 1],
                        [2, 1, 0]])
def create_mb_source(img_height, img_width, img_channels, n_classes, n_rois, data_path, data_set):
    rois_dim = 4 * n_rois
    label_dim = n_classes * n_rois
    path = os.path.normpath(os.path.join(abs_path, data_path))
    if data_set == 'test':
        map_file = os.path.join(path, test_map_filename)
    else:
        map_file = os.path.join(path, train_map_filename)
    roi_file = os.path.join(path, data_set + rois_filename_postfix)
    label_file = os.path.join(path, data_set + roilabels_filename_postfix)

    if not os.path.exists(map_file) or not os.path.exists(roi_file) or not os.path.exists(label_file):
        raise RuntimeError("File '%s', '%s' or '%s' does not exist. "
                           "Please run install_fastrcnn.py from Examples/Image/Detection/FastRCNN to fetch them" %
                           (map_file, roi_file, label_file))

    # read images
    image_source = ImageDeserializer(map_file)
    image_source.ignore_labels()
    image_source.map_features(features_stream_name, [
        ImageDeserializer.scale(width=img_width, height=img_height, channels=img_channels,
                                scale_mode="pad", pad_value=114, interpolations='linear')
    ])

    # read rois and labels
    roi_source = CTFDeserializer(roi_file)
    roi_source.map_input(roi_stream_name, dim=rois_dim, format="dense")
    label_source = CTFDeserializer(label_file)
    label_source.map_input(label_stream_name, dim=label_dim, format="dense")

    # define a composite reader
    rc = ReaderConfig([image_source, roi_source, label_source],
                      epoch_size=sys.maxsize,
                      randomize=data_set == "train")
    return rc.minibatch_source()
def test_usermbsource(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)

    input_dim = 1000
    num_output_classes = 5

    # Setting up the native MB source as the ground truth
    n_mb_source = CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False)
    ))
    n_mb_source = MinibatchSource(n_mb_source, randomize=False)
    n_features_si = n_mb_source['features']
    n_labels_si = n_mb_source['labels']

    n_mb = n_mb_source.next_minibatch(2)
    n_features = n_mb[n_features_si]
    n_labels = n_mb[n_labels_si]

    # Setting up the user MB source
    u_mb_source = MyDataSource(input_dim, num_output_classes)
    u_features_si = u_mb_source['features']
    u_labels_si = u_mb_source['labels']

    u_mb = u_mb_source.next_minibatch(2, 1, 0)
    u_features = u_mb[u_features_si]
    u_labels = u_mb[u_labels_si]

    assert u_features.shape == n_features.shape == (1, 3, 1000)
    assert u_features.end_of_sweep == n_features.end_of_sweep
    assert u_features.num_sequences == n_features.num_sequences
    assert u_features.num_samples == n_features.num_samples
    assert u_features.is_sparse == n_features.is_sparse

    assert u_labels.shape == n_labels.shape == (1, 1, 5)
    assert u_labels.end_of_sweep is n_labels.end_of_sweep is False
    # compare the user source against the native source, not against itself
    assert u_labels.num_sequences == n_labels.num_sequences
    assert u_labels.num_samples == n_labels.num_samples
    assert u_labels.is_sparse is n_labels.is_sparse is False

    u_label_data = u_labels.asarray()
    n_label_data = n_labels.asarray()
    assert np.allclose(u_label_data, n_label_data)

    n_mb = n_mb_source.next_minibatch(10)
    n_features = n_mb[n_features_si]
    n_labels = n_mb[n_labels_si]

    u_mb = u_mb_source.next_minibatch(10, 1, 0)
    u_features = u_mb[u_features_si]
    u_labels = u_mb[u_labels_si]

    assert u_labels.shape == n_labels.shape
    u_label_data = u_labels.asarray()
    n_label_data = n_labels.asarray()
    assert np.allclose(u_label_data, n_label_data)

    assert u_features.end_of_sweep is u_labels.end_of_sweep is True
    assert u_features.num_samples == n_features.num_samples
    assert u_features.num_sequences == n_features.num_sequences