def create_mb_source(image_height, image_width, num_channels, map_file):
    transforms = [ImageDeserializer.scale(width=image_width, height=image_height,
                                          channels=num_channels, interpolations='linear')]
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),  # first column in map file is referred to as 'image'
            labels=StreamDef(field='label', shape=1000))),             # and second as 'label'. TODO: add option to ignore labels
        randomize=False)

def create_mb_source(map_file, image_width, image_height, num_channels, num_classes, randomize=True):
    transforms = [xforms.scale(width=image_width, height=image_height,
                               channels=num_channels, interpolations='linear')]
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),
            labels=StreamDef(field='label', shape=num_classes))),
        randomize=randomize)

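# Hedged usage sketch for the reader above. The map-file path, image size, and
# class count are made-up values for illustration; the imports mirror the CNTK
# API these snippets already rely on.
from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs
import cntk.io.transforms as xforms

source = create_mb_source('train_map.txt', 224, 224, 3, num_classes=10)
mb = source.next_minibatch(64)              # dict keyed by stream information
features = mb[source.streams.features]      # MinibatchData for the image stream
labels = mb[source.streams.labels]
print(features.num_samples, labels.num_samples)
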
def test_full_sweep_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_1)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0, max_sweeps=1)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)
    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert features.end_of_sweep
    assert len(features.as_sequences()) == 2
    expected_features = [
        [[0], [1], [2], [3]],
        [[4], [5], [6]]
    ]
    for res, exp in zip(features.as_sequences(), expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.data.mask,
                       [[2, 1, 1, 1],
                        [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert labels.end_of_sweep
    assert len(labels.as_sequences()) == 2
    expected_labels = [
        [[0], [1], [3]],
        [[1], [2]]
    ]
    for res, exp in zip(labels.as_sequences(), expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.data.mask,
                       [[2, 1, 1],
                        [2, 1, 0]])

def create_mb(map_file, params, training_set):
    transforms = []
    image_dimensions = params['image_dimensions']
    num_classes = params['num_classes']
    if training_set:
        # Scale to square-sized image. Without this the cropping transform would chop the larger
        # dimension of an image to make it square, and then take 0.9 crops from within the squared image.
        transforms += [xforms.scale(width=2 * image_dimensions[0], height=2 * image_dimensions[1],
                                    channels=image_dimensions[2], scale_mode='pad', pad_value=114)]
        # Randomly crop a square area; 'randomside' enables horizontal flipping.
        # new_dim = side_ratio * min(old_w, old_h), e.g. 0.9 * 224 = 201.6
        transforms += [xforms.crop(crop_type='randomside', side_ratio=0.9, jitter_type='uniratio')]
        # transforms += [xforms.crop(crop_type='center')]
        transforms += [xforms.color(brightness_radius=0.2, contrast_radius=0.2, saturation_radius=0.2)]
    else:
        transforms += [xforms.crop(crop_type='center', side_ratio=0.875)]  # test has no jitter
    # Scale down and pad
    transforms += [xforms.scale(width=image_dimensions[0], height=image_dimensions[1],
                                channels=image_dimensions[2], scale_mode='pad', pad_value=114)]
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),
            labels=StreamDef(field='label', shape=num_classes))),
        randomize=training_set,
        multithreaded_deserializer=True)

def create_reader(path, randomize, input_vocab_dim, label_vocab_dim, size=INFINITELY_REPEAT):
    if not os.path.exists(path):
        raise RuntimeError("File '%s' does not exist." % (path))
    return MinibatchSource(
        CTFDeserializer(path, StreamDefs(
            features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
            labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True))),
        randomize=randomize, max_samples=size)

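# For illustration, the CTF text layout the reader above consumes: rows sharing
# a sequence id form one sequence, and sparse streams are written as
# index:value pairs. Vocabulary sizes and token ids are assumptions of this sketch.
sample_ctf = (
    "0 |S0 12:1 |S1 7:1\n"
    "0 |S0 3:1\n"          # within a sequence, S0 may run longer than S1
    "1 |S0 9:1 |S1 2:1\n"
)
with open('sample.ctf', 'w') as f:
    f.write(sample_ctf)
reader = create_reader('sample.ctf', randomize=False,
                       input_vocab_dim=100, label_vocab_dim=50, size=100)
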
def create_mb_source(map_file, image_width, image_height, num_channels, num_classes, boTrain):
    transforms = []
    if boTrain:
        # Scale to square-sized image. Without this the cropping transform would chop the larger
        # dimension of an image to make it square, and then take 0.9 crops from within the squared image.
        transforms += [xforms.scale(width=2 * image_width, height=2 * image_height, channels=num_channels,
                                    interpolations='linear', scale_mode='pad', pad_value=114)]
        transforms += [xforms.crop(crop_type='randomside', side_ratio=0.9, jitter_type='uniratio')]  # randomly crop square area
    transforms += [xforms.scale(width=image_width, height=image_height, channels=num_channels,  # scale down and pad
                                interpolations='linear', scale_mode='pad', pad_value=114)]
    if boTrain:
        transforms += [xforms.color(brightness_radius=0.2, contrast_radius=0.2, saturation_radius=0.2)]

    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),
            labels=StreamDef(field='label', shape=num_classes))),
        randomize=boTrain,
        multithreaded_deserializer=True)

def create_reader(map_file, mean_file, train, image_height=800, image_width=150, num_channels=3, num_classes=32):
    # transformation pipeline for the features has crop only when training
    trs = []
    if train:
        trs += [
            transforms.crop(crop_type='center', aspect_ratio=0.1875, side_ratio=0.95,
                            jitter_type='uniratio')  # Horizontal flip enabled
        ]
    trs += [
        transforms.scale(width=image_width, height=image_height, channels=num_channels,
                         interpolations='linear'),
        # transforms.mean(mean_file)
    ]
    # deserializer
    image_source = ImageDeserializer(map_file, StreamDefs(
        features=StreamDef(field='image', transforms=trs),  # first column in map file is referred to as 'image'
        labels=StreamDef(field='label', shape=num_classes)  # and second as 'label'
    ))
    return MinibatchSource(image_source)

def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
    """Creates minibatch source"""
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        raise RuntimeError("File '%s' or '%s' does not exist." % (map_file, mean_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if train:
        imgfolder = os.path.join(os.path.split(map_file)[0], 'train')
        transforms += [
            xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio')  # train uses jitter
        ]
    else:
        imgfolder = os.path.join(os.path.split(map_file)[0], 'test')

    transforms += [
        xforms.scale(width=_IMAGE_WIDTH, height=_IMAGE_HEIGHT, channels=_NUM_CHANNELS,
                     interpolations='linear'),
        xforms.mean(mean_file)
    ]

    map_file = process_map_file(map_file, imgfolder)

    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),  # first column in map file is referred to as 'image'
            labels=StreamDef(field='label', shape=_NUM_CLASSES))),     # and second as 'label'
        randomize=train,
        max_samples=total_number_of_samples,
        multithreaded_deserializer=True)

def create_reader(map_file, mean_file, train):
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        raise RuntimeError("File '%s' or '%s' does not exist. "
                           "Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them" %
                           (map_file, mean_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if train:
        transforms += [
            ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio')  # train uses jitter
        ]
    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels,
                                interpolations='linear'),
        ImageDeserializer.mean(mean_file)
    ]
    # deserializer
    return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
        features=StreamDef(field='image', transforms=transforms),  # first column in map file is referred to as 'image'
        labels=StreamDef(field='label', shape=num_classes))))      # and second as 'label'

def create_mb_source(image_height, image_width, num_channels, map_file, mean_file, is_training):
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." % (map_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            xforms.crop(crop_type='randomside', side_ratio=0.875, jitter_type='uniratio')  # train uses jitter
        ]
    else:
        transforms += [
            xforms.crop(crop_type='center', side_ratio=0.875)  # test has no jitter
        ]
    transforms += [
        xforms.scale(width=image_width, height=image_height, channels=num_channels,
                     interpolations='linear'),
    ]
    if mean_file != '':
        transforms += [
            xforms.mean(mean_file),
        ]
    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms)  # first column in map file is referred to as 'image'
        )),
        randomize=is_training,
        multithreaded_deserializer=True,
        max_sweeps=1)

def decode_model(use_gpu=True, gpu_id=0):
    # use GPU or CPU according to parameters
    try_set_default_device(gpu(gpu_id) if use_gpu else cpu())

    model_dnn = load_model("./model/speech_enhancement.model")
    features_file = "./test_normed.scp"
    feature_dim = 257
    test_reader = MinibatchSource(
        HTKFeatureDeserializer(StreamDefs(
            amazing_features=StreamDef(shape=feature_dim, context=(3, 3), scp=features_file))),
        randomize=False, frame_mode=False)
    # 'input' is assumed to be the model's input variable, bound at module
    # scope (otherwise it would shadow the Python builtin of the same name)
    eval_input_map = {input: test_reader.streams.amazing_features}

    f = open(features_file)
    line = f.readline()
    while line:
        # each SCP line ends in '...[start,end]'; use the frame range to size the minibatch
        temp_input_path = line.split(']')[0]
        mb_size = temp_input_path.split(',')[-1]
        mb_size = int(mb_size) + 1
        noisy_fea = test_reader.next_minibatch(mb_size, input_map=eval_input_map)
        real_noisy_fea = noisy_fea[input].data

        node_in_graph = model_dnn.find_by_name('irm')
        output_nodes = combine([node_in_graph.owner])
        out_noisy_fea = output_nodes.eval(real_noisy_fea)
        # out_noisy_fea = as_composite(model_dnn.output1[0].owner).eval(real_noisy_fea)

        out_SE_noisy_fea = np.concatenate((out_noisy_fea), axis=0)

        out_file_path = line.split('=')[0]
        out_file_name = os.path.join('./enhanced_norm_fea_mat', out_file_path)
        out_file_fullpath = os.path.split(out_file_name)[0]
        # print(out_file_fullpath)
        if not os.path.exists(out_file_fullpath):
            os.makedirs(out_file_fullpath)
        sio.savemat(out_file_name, {'SE': out_SE_noisy_fea})
        line = f.readline()
    f.close()

def cbf_reader(path, is_training, max_samples):
    """
    Returns a MinibatchSource for data at the given path

    :param path: Path to a CBF file
    :param is_training: Set to true if reader is for training set, else false
    :param max_samples: Max no. of samples to read
    """
    deserializer = CBFDeserializer(path, StreamDefs(
        label=StreamDef(field='label', shape=num_classes, is_sparse=True),
        pixels=StreamDef(field='pixels', shape=frame_height * frame_width * sequence_length,
                         is_sparse=False)))
    return MinibatchSource(deserializer, randomize=is_training, max_samples=max_samples)

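# Hedged usage sketch for cbf_reader above: INFINITELY_REPEAT (exported by
# cntk.io) makes the training source cycle indefinitely, while a finite cap
# suits evaluation. The file names are assumptions; num_classes, frame_height,
# frame_width and sequence_length are module-level globals the helper expects.
from cntk.io import INFINITELY_REPEAT

train_source = cbf_reader('train.cbf', is_training=True, max_samples=INFINITELY_REPEAT)
test_source = cbf_reader('test.cbf', is_training=False, max_samples=10000)
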
def mb_source(tmpdir, fileprefix, max_samples=FULL_DATA_SWEEP, ctf=ctf_data, streams=['S0', 'S1']):
    ctf_file = str(tmpdir / (fileprefix + '2seqtest.txt'))
    with open(ctf_file, 'w') as f:
        f.write(ctf)

    mbs = MinibatchSource(
        CTFDeserializer(ctf_file, StreamDefs(
            features=StreamDef(field=streams[0], shape=input_dim, is_sparse=True),
            labels=StreamDef(field=streams[1], shape=input_dim, is_sparse=True))),
        randomize=False, max_samples=max_samples)
    return mbs

def create_reader(map_file, mean_file, train, pixel_dimensions, classes, total_number_of_samples):
    print(f"Reading map file: {map_file} with number of samples {total_number_of_samples}")
    transforms = [
        xforms.scale(width=pixel_dimensions['width'], height=pixel_dimensions['height'],
                     channels=pixel_dimensions['depth'], interpolations='linear'),
        xforms.mean(mean_file)
    ]
    source = MinibatchSource(
        deserializers=ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),
            labels=StreamDef(field='label', shape=len(classes)))),
        randomize=train,
        max_samples=total_number_of_samples)
    return source

def create_image_mb_source(map_file, mean_file, is_training, total_number_of_samples):
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        raise RuntimeError("File '%s' or '%s' does not exist." % (map_file, mean_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            xforms.crop(crop_type='randomside', side_ratio=0.875, jitter_type='uniratio')  # train uses jitter
        ]
    else:
        transforms += [
            xforms.crop(crop_type='center', side_ratio=0.875)  # test has no jitter
        ]
    transforms += [
        xforms.scale(width=IMAGE_WIDTH, height=IMAGE_HEIGHT, channels=NUM_CHANNELS,
                     interpolations='linear'),
        xforms.mean(mean_file)
    ]
    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),  # first column in map file is referred to as 'image'
            labels=StreamDef(field='label', shape=NUM_CLASSES))),      # and second as 'label'
        randomize=is_training,
        max_samples=total_number_of_samples,
        multithreaded_deserializer=True)

def create_reader(map_file, mean_file, train):
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        cifar_py3 = "" if sys.version_info.major < 3 else "_py3"
        raise RuntimeError("File '%s' or '%s' does not exist. "
                           "Please run CifarDownload%s.py and CifarConverter%s.py from CIFAR-10 to fetch them" %
                           (map_file, mean_file, cifar_py3, cifar_py3))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if train:
        transforms += [
            ImageDeserializer.crop(crop_type='Random', ratio=0.8, jitter_type='uniRatio')  # train uses jitter
        ]
    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels,
                                interpolations='linear'),
        ImageDeserializer.mean(mean_file)
    ]
    # deserializer
    return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
        features=StreamDef(field='image', transforms=transforms),  # first column in map file is referred to as 'image'
        labels=StreamDef(field='label', shape=num_classes)         # and second as 'label'
    )))

def test_multiple_streams_in_htk():
    feature_dim = 33
    context = 2
    os.chdir(data_path)

    features_file = "glob_0000.scp"
    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim, context=(context, context), scp=features_file),
        amazing_features2=StreamDef(shape=feature_dim, context=(context, context), scp=features_file)))

    mbs = MinibatchSource([fd])
    mb = mbs.next_minibatch(1)
    assert (mb[mbs.streams.amazing_features].asarray() ==
            mb[mbs.streams.amazing_features2].asarray()).all()
    os.chdir(abs_path)

def create_reader(map_file, mean_file, train, dimensions, classes, total_number_of_samples):
    print("Reading map file: {} with number of samples {}".format(map_file, total_number_of_samples))

    # transformation pipeline for the features: scale to fixed dimensions and
    # subtract the mean (no training-time jitter/crop in this variant)
    transforms = []
    transforms += [
        xforms.scale(width=dimensions['width'], height=dimensions['height'],
                     channels=dimensions['depth'], interpolations='linear'),
        xforms.mean(mean_file)
    ]
    source = MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),
            labels=StreamDef(field='label', shape=len(classes)))),
        randomize=train,
        max_samples=total_number_of_samples)
    return source

def create_reader(path, vocab_dim, entity_dim, randomize):
    """
    Create data reader for the model

    Args:
        path: The data path
        vocab_dim: The dimension of the vocabulary
        entity_dim: The dimension of entities
        randomize: Whether to shuffle the data before feeding it to the trainer
    """
    return MinibatchSource(
        CTFDeserializer(path, StreamDefs(
            context=StreamDef(field='C', shape=vocab_dim, is_sparse=True),
            query=StreamDef(field='Q', shape=vocab_dim, is_sparse=True),
            entities=StreamDef(field='E', shape=1, is_sparse=False),
            label=StreamDef(field='L', shape=1, is_sparse=False),
            entity_ids=StreamDef(field='EID', shape=entity_dim, is_sparse=True))),
        randomize=randomize)

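# A hedged illustration of one CTF row the reader above could parse: sparse
# streams (C, Q, EID) use index:value pairs, dense streams (E, L) list raw
# values. The ids and dimensions below are made up for this sketch.
sample_row = "0 |C 272:1 |Q 54:1 |E 1 |L 0 |EID 3:1\n"
with open('qa_sample.ctf', 'w') as f:
    f.write(sample_row)
reader = create_reader('qa_sample.ctf', vocab_dim=1000, entity_dim=10, randomize=False)
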
def test_distributed_mb_source_again(tmpdir):
    import random
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs

    ctf_data = '''0 |S0 1 |S1 1
0 |S0 2 |S1 2
0 |S0 3
1 |S0 4
1 |S0 5 |S1 3
1 |S0 6 |S1 4
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    ctf = CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1)))

    random.seed(1234)
    mb_sources = []
    for randomize in [True, False]:
        mb_sources.append(MinibatchSource(ctf, randomize=randomize))
        mb_sources.append(MinibatchSource(ctf, randomize=randomize,
                                          max_sweeps=random.randint(1, 10)))
        mb_sources.append(MinibatchSource(ctf, randomize=randomize,
                                          max_samples=random.randint(1, 30)))

    for i in range(20):
        for source in mb_sources:
            data = source.next_minibatch(minibatch_size_in_samples=5,
                                         num_data_partitions=2,
                                         partition_index=i % 2)
            features = source.streams['features']
            assert (len(data) == 0 or data[features].num_samples == 3)

def test_prefetch_with_unpacking(tmpdir):
    data = r'''0 |S0 1 1 1 1 |S1 1000
1 |S0 2 2 2 2 |S1 100
2 |S0 3 3 3 3 |S1 100
3 |S0 1 1 1 1 |S1 10
4 |S0 2 2 2 2 |S1 1
5 |S0 3 3 3 3 |S1 2000
6 |S0 1 1 1 1 |S1 200
7 |S0 2 2 2 2 |S1 200
8 |S0 3 3 3 3 |S1 20
9 |S0 1 1 1 1 |S1 2
'''
    import time
    tmpfile = _write_data(tmpdir, data)
    input_dim = 4
    num_output_classes = 1

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=False),
        labels=StreamDef(field='S1', shape=num_output_classes, is_sparse=False))),
        randomize=False, max_samples=FULL_DATA_SWEEP)

    input_map = {'S0': mb_source.streams.features, 'S1': mb_source.streams.labels}
    empty = False
    mb_size = 3
    # On the last minibatch a resize is triggered, because 10 % 3 = 1 sample remains
    while not empty:
        mb = mb_source.next_minibatch(mb_size, input_map=input_map)
        time.sleep(1)  # make sure the prefetch kicks in
        if mb:
            # Force unpacking to check that we do not break prefetch
            actual_size = mb['S0'].shape[0]
            assert (mb['S0'].asarray() == np.array([[[1, 1, 1, 1]],
                                                    [[2, 2, 2, 2]],
                                                    [[3, 3, 3, 3]]],
                                                   dtype=np.float32)[0:actual_size]).all()
        else:
            empty = True

def create_reader(map_file, mean_file, train, total_data_size, distributed_after=INFINITE_SAMPLES):
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        raise RuntimeError("File '%s' or '%s' does not exist. "
                           "Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them" %
                           (map_file, mean_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if train:
        transforms += [
            ImageDeserializer.crop(crop_type='Random', ratio=0.8, jitter_type='uniRatio')  # train uses jitter
        ]
    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels,
                                interpolations='linear'),
        ImageDeserializer.mean(mean_file)
    ]
    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),  # first column in map file is referred to as 'image'
            labels=StreamDef(field='label', shape=num_classes))),      # and second as 'label'
        epoch_size=total_data_size,
        multithreaded_deserializer=False,  # turn off omp as CIFAR-10 is not heavy for deserializer
        distributed_after=distributed_after)

def create_reader(map_file):
    transforms = [
        xforms.crop(crop_type='randomside', side_ratio=0.85, jitter_type='uniratio'),
        xforms.scale(width=image_width, height=image_height, channels=num_channels,
                     interpolations='linear'),
        xforms.color(brightness_radius=0.2, contrast_radius=0.2, saturation_radius=0.2)
    ]
    return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
        features=StreamDef(field='image', transforms=transforms, is_sparse=False),
        labels=StreamDef(field='label', shape=num_classes, is_sparse=False))))

def create_image_mb_source(map_file, is_training, total_number_of_samples):
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." % map_file)

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            # a 'lo:hi' side_ratio samples the crop size from that range (scale jitter)
            ImageDeserializer.crop(crop_type='randomside', side_ratio='0.4375:0.875',
                                   jitter_type='uniratio')  # train uses jitter
        ]
    else:
        transforms += [
            ImageDeserializer.crop(crop_type='center', side_ratio=0.5833333)  # test has no jitter
        ]
    transforms += [
        ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels,
                                interpolations='linear'),
    ]
    # deserializer
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),  # first column in map file is referred to as 'image'
            labels=StreamDef(field='label', shape=num_classes))),      # and second as 'label'
        randomize=is_training,
        max_samples=total_number_of_samples,
        multithreaded_deserializer=True)

def test_minibatch_defined_by_labels(tmpdir):
    input_dim = 1000
    num_output_classes = 5

    def assert_data(mb_source):
        features_si = mb_source.stream_info('features')
        labels_si = mb_source.stream_info('labels')

        mb = mb_source.next_minibatch(2)

        features = mb[features_si]
        # 2 samples, max seq len 4, 1000 dim
        assert features.shape == (2, 4, input_dim)
        assert features.end_of_sweep
        assert features.num_sequences == 2
        assert features.num_samples == 7
        assert features.is_sparse

        labels = mb[labels_si]
        # 2 samples, max seq len 1, 5 dim
        assert labels.shape == (2, 1, num_output_classes)
        assert labels.end_of_sweep
        assert labels.num_sequences == 2
        assert labels.num_samples == 2
        assert not labels.is_sparse

        label_data = labels.asarray()
        assert np.allclose(label_data,
                           np.asarray([[[1., 0., 0., 0., 0.]],
                                       [[0., 1., 0., 0., 0.]]]))

        mb = mb_source.next_minibatch(3)
        features = mb[features_si]
        labels = mb[labels_si]

        assert features.num_samples == 10
        assert labels.num_samples == 3

    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)
    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False,
                         defines_mb_size=True))),
        randomize=False)
    assert_data(mb_source)

    tmpfile1 = _write_data(tmpdir, MBDATA_SPARSE1, '1')
    tmpfile2 = _write_data(tmpdir, MBDATA_SPARSE2, '2')
    combined_mb_source = MinibatchSource([
        CTFDeserializer(tmpfile1, StreamDefs(
            features=StreamDef(field='x', shape=input_dim, is_sparse=True))),
        CTFDeserializer(tmpfile2, StreamDefs(
            labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False,
                             defines_mb_size=True)))
    ], randomize=False)
    assert_data(combined_mb_source)

def test_usermbsource(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)

    input_dim = 1000
    num_output_classes = 5

    # Setting up the native MB source as the ground truth
    n_mb_source = CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False)))
    n_mb_source = MinibatchSource(n_mb_source, randomize=False)
    n_features_si = n_mb_source['features']
    n_labels_si = n_mb_source['labels']

    n_mb = n_mb_source.next_minibatch(2)
    n_features = n_mb[n_features_si]
    n_labels = n_mb[n_labels_si]

    # Setting up the user MB source
    u_mb_source = MyDataSource(input_dim, num_output_classes)
    u_features_si = u_mb_source['features']
    u_labels_si = u_mb_source['labels']

    u_mb = u_mb_source.next_minibatch(2, 1, 0)
    u_features = u_mb[u_features_si]
    u_labels = u_mb[u_labels_si]

    assert u_features.shape == n_features.shape == (1, 3, 1000)
    assert u_features.end_of_sweep == n_features.end_of_sweep
    assert u_features.num_sequences == n_features.num_sequences
    assert u_features.num_samples == n_features.num_samples
    assert u_features.is_sparse == n_features.is_sparse

    assert u_labels.shape == n_labels.shape == (1, 1, 5)
    assert u_labels.end_of_sweep is n_labels.end_of_sweep is False
    # compare user source against the native source (the original asserts
    # compared u_labels against itself, which is vacuous)
    assert u_labels.num_sequences == n_labels.num_sequences
    assert u_labels.num_samples == n_labels.num_samples
    assert u_labels.is_sparse is n_labels.is_sparse is False

    u_label_data = u_labels.asarray()
    n_label_data = n_labels.asarray()
    assert np.allclose(u_label_data, n_label_data)

    n_mb = n_mb_source.next_minibatch(10)
    n_features = n_mb[n_features_si]
    n_labels = n_mb[n_labels_si]

    u_mb = u_mb_source.next_minibatch(10, 1, 0)
    u_features = u_mb[u_features_si]
    u_labels = u_mb[u_labels_si]

    assert u_labels.shape == n_labels.shape
    u_label_data = u_labels.asarray()
    n_label_data = n_labels.asarray()
    assert np.allclose(u_label_data, n_label_data)

    assert u_features.end_of_sweep is u_labels.end_of_sweep is True
    assert u_features.num_samples == n_features.num_samples
    assert u_features.num_sequences == n_features.num_sequences

def create_config(tmpdir):
    tmpfile = create_temp_file(tmpdir)
    return MinibatchSourceConfig() \
        .add_deserializer(CTFDeserializer(tmpfile, StreamDefs(
            features=StreamDef(field='S0', shape=1))))

def create_ctf_deserializer(tmpdir):
    tmpfile = create_temp_file(tmpdir)
    return CTFDeserializer(tmpfile, StreamDefs(features=StreamDef(field='S0', shape=1)))

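# A minimal sketch of how the helper above composes with MinibatchSource
# (tmpdir and the imports are assumed to be in scope, as elsewhere in these
# snippets): wrap the deserializer in a source and draw one minibatch.
def example_next_minibatch(tmpdir):
    source = MinibatchSource(create_ctf_deserializer(tmpdir),
                             randomize=False, max_sweeps=1)
    return source.next_minibatch(32)
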
def test_base64_image_deserializer(tmpdir):
    import io, base64, uuid
    from PIL import Image

    images, b64_images = [], []
    np.random.seed(1)
    for i in range(10):
        data = np.random.randint(0, 2**8, (5, 7, 3))
        image = Image.fromarray(data.astype('uint8'), "RGB")
        buf = io.BytesIO()
        image.save(buf, format='PNG')
        assert image.width == 7 and image.height == 5
        b64_images.append(base64.b64encode(buf.getvalue()))
        images.append(np.array(image))

    image_data = str(tmpdir / 'mbdata1.txt')
    seq_ids = []
    uid = uuid.uuid1().int >> 64
    with open(image_data, 'wb') as f:
        for i, data in enumerate(b64_images):
            seq_id = uid ^ i
            seq_id = str(seq_id).encode('ascii')
            seq_ids.append(seq_id)
            line = seq_id + b'\t'
            label = str(i).encode('ascii')
            line += label + b'\t' + data + b'\n'
            f.write(line)

    ctf_data = str(tmpdir / 'mbdata2.txt')
    with open(ctf_data, 'wb') as f:
        for i, sid in enumerate(seq_ids):
            line = sid + b'\t' + b'|index ' + str(i).encode('ascii') + b'\n'
            f.write(line)

    transforms = [xforms.scale(width=7, height=5, channels=3)]
    b64_deserializer = Base64ImageDeserializer(image_data, StreamDefs(
        images=StreamDef(field='image', transforms=transforms),
        labels=StreamDef(field='label', shape=10)))
    ctf_deserializer = CTFDeserializer(ctf_data, StreamDefs(
        index=StreamDef(field='index', shape=1)))

    mb_source = MinibatchSource([ctf_deserializer, b64_deserializer])
    assert isinstance(mb_source, MinibatchSource)

    for j in range(100):
        mb = mb_source.next_minibatch(10)

        index_stream = mb_source.streams['index']
        index = mb[index_stream].asarray().flatten()

        image_stream = mb_source.streams['images']
        results = mb[image_stream].asarray()
        for i in range(10):
            # original images are RGB, openCV produces BGR images,
            # reverse the last dimension of the original images
            bgrImage = images[int(index[i])][:, :, ::-1]
            # transposing to get CHW representation
            bgrImage = np.transpose(bgrImage, (2, 0, 1))
            assert (bgrImage == results[i][0]).all()

def test_image(tmpdir):
    map_file = "input.txt"
    mean_file = "mean.txt"

    feature_name = "f"
    image_width = 100
    image_height = 200
    num_channels = 3

    label_name = "l"
    num_classes = 7

    transforms = [
        xforms.crop(crop_type='randomside', side_ratio=0.5, jitter_type='uniratio'),
        xforms.scale(width=image_width, height=image_height, channels=num_channels,
                     interpolations='linear'),
        xforms.mean(mean_file)
    ]
    defs = StreamDefs(f=StreamDef(field='image', transforms=transforms),
                      l=StreamDef(field='label', shape=num_classes))
    image = ImageDeserializer(map_file, defs)

    config = to_dictionary(MinibatchSourceConfig([image], randomize=False))
    # Multithreading should be on by default for the ImageDeserializer.
    assert config['multiThreadedDeserialization'] is True
    assert len(config['deserializers']) == 1

    d = config['deserializers'][0]
    assert d['type'] == 'ImageDeserializer'
    assert d['file'] == map_file
    assert set(d['input'].keys()) == {label_name, feature_name}

    l = d['input'][label_name]
    assert l['labelDim'] == num_classes

    f = d['input'][feature_name]
    assert set(f.keys()) == {'transforms'}
    t0, t1, t2, _ = f['transforms']
    assert t0['type'] == 'Crop'
    assert t1['type'] == 'Scale'
    assert t2['type'] == 'Mean'
    assert t0['cropType'] == 'randomside'
    assert t0['cropSize'] == '0:0'
    assert t0['sideRatio'] == '0.5:0.5'
    assert t0['aspectRatio'] == '1:1'
    assert t0['areaRatio'] == '0:0'
    assert t0['jitterType'] == 'uniratio'
    assert t1['width'] == image_width
    assert t1['height'] == image_height
    assert t1['channels'] == num_channels
    assert t1['interpolations'] == 'linear'
    assert t2['meanFile'] == mean_file

    config = to_dictionary(MinibatchSourceConfig([image, image]))
    assert len(config['deserializers']) == 2

    ctf = create_ctf_deserializer(tmpdir)
    config = to_dictionary(MinibatchSourceConfig([image, ctf, image]))
    # Multithreading should still be enabled.
    assert config['multiThreadedDeserialization'] is True
    assert len(config['deserializers']) == 3
    # TODO depends on ImageReader.dll