def device_minibatches(self, total_batch_size):
    record_input = data_flow_ops.RecordInput(
        file_pattern=os.path.join(FLAGS.data_dir, '%s-*' % self.subset),
        parallelism=64,
        # Note: This causes deadlock during init if larger than dataset
        buffer_size=FLAGS.input_buffer_size,
        batch_size=total_batch_size)
    records = record_input.get_yield_op()
    # Split batch into individual images
    records = tf.split(records, total_batch_size, 0)
    records = [tf.reshape(record, []) for record in records]
    # Deserialize and preprocess images into batches for each device
    images = defaultdict(list)
    labels = defaultdict(list)
    with tf.name_scope('input_pipeline'):
        for i, record in enumerate(records):
            imgdata, label, bbox, text = deserialize_image_record(record)
            image = self.preprocess(imgdata, bbox, thread_id=i)
            label -= 1  # Change to 0-based (don't use background class)
            device_num = i % self.num_devices
            images[device_num].append(image)
            labels[device_num].append(label)
        # Stack images back into a sub-batch for each device
        for device_num in range(self.num_devices):
            images[device_num] = tf.parallel_stack(images[device_num])
            labels[device_num] = tf.concat(labels[device_num], 0)
            images[device_num] = tf.reshape(
                images[device_num], [-1, self.height, self.width, 3])
            images[device_num] = tf.clip_by_value(images[device_num], 0., 255.)
            images[device_num] = tf.cast(images[device_num], self.dtype)
    return images, labels
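All of these pipelines share the same idiom: RecordInput asynchronously fills an internal buffer and yields a string tensor of shape [batch_size], which is then split into scalar serialized records for per-record parsing. Below is a minimal self-contained sketch of that idiom, assuming TensorFlow 1.x; the file path and record contents are made up for illustration.

import os
import tempfile

import tensorflow as tf
from tensorflow.python.ops import data_flow_ops

# Write a few dummy records to a TFRecord file (hypothetical path).
path = os.path.join(tempfile.gettempdir(), 'demo.tfrecord')
with tf.python_io.TFRecordWriter(path) as writer:
    for i in range(8):
        writer.write(('record-%d' % i).encode())

# RecordInput reads records asynchronously into an internal buffer and
# yields a string tensor of shape [batch_size].
record_input = data_flow_ops.RecordInput(
    file_pattern=path, batch_size=4, buffer_size=8, parallelism=1)
batch = record_input.get_yield_op()

# The usual idiom: split the batch into scalar serialized records so each
# one can be fed to a per-record parser.
records = tf.split(batch, 4, 0)
records = [tf.reshape(r, []) for r in records]

with tf.Session() as sess:
    print(sess.run(records))

Each scalar in records can then be fed to tf.parse_single_example, as the longer examples here do.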
def minibatch(self):
    with tf.name_scope('batch_processing'):
        images = []
        record_input = data_flow_ops.RecordInput(
            file_pattern=dp.tf_records(
                self.data_set,
                '{0}_crop_patch_full_{1}'.format(self.file_format, self.region)),
            seed=301,
            parallelism=64,
            buffer_size=5000,
            shift_ratio=0.2,
            batch_size=self.batch_size,
            name='record_input')
        records = record_input.get_yield_op()
        records = tf.split(records, self.batch_size, 0)
        records = [tf.reshape(record, []) for record in records]
        for i in xrange(self.batch_size):
            value = records[i]
            image, dim = self._parse_example_proto(value, i)
            image = tf.image.resize_images(image, self.image_shape)
            image = self.distort_image(image)
            image = image[:, :, 0]
            images.append(image)
        images = tf.parallel_stack(images)
        images = tf.reshape(
            images,
            shape=[self.batch_size, self.image_shape[0], self.image_shape[1], -1])
        return images
def testEmptyGlob(self):
    with self.cached_session() as sess:
        record_input = data_flow_ops.RecordInput(file_pattern="foo")
        yield_op = record_input.get_yield_op()
        self.evaluate(variables.global_variables_initializer())
        with self.assertRaises(NotFoundError):
            self.evaluate(yield_op)
def minibatch(self):
    with tf.name_scope('batch_processing'):
        images = []
        bboxes = []
        record_input = data_flow_ops.RecordInput(
            file_pattern=dp.tf_records(
                self.data_set, '{0}_keypoint'.format(self.keypoint.name)),
            seed=301,
            parallelism=64,
            buffer_size=2000,
            shift_ratio=0.2,
            batch_size=self.batch_size,
            name='record_input')
        records = record_input.get_yield_op()
        records = tf.split(records, self.batch_size, 0)
        records = [tf.reshape(record, []) for record in records]
        for i in xrange(self.batch_size):
            value = records[i]
            image, bbox = self._parse_example_proto(value, i)
            bbox = bbox[:4]
            image, bbox = self.distort_image(image, bbox)
            image -= tf.reduce_mean(image, axis=[0, 1])
            images.append(image)
            bboxes.append(bbox)
        images = tf.parallel_stack(images)
        images = tf.reshape(
            images,
            shape=[self.batch_size, self.image_shape[0], self.image_shape[1], -1])
        bboxes = tf.reshape(bboxes, (self.batch_size, 1, 4))
        return images, bboxes
def testRecordInputEpochs(self):
    files = 100
    records_per_file = 100
    batches = 2
    with self.cached_session() as sess:
        self.generateTestData("basic", files, records_per_file)
        records = data_flow_ops.RecordInput(
            file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
            parallelism=2,
            buffer_size=2000,
            batch_size=1,
            shift_ratio=0.33,
            seed=10,
            name="record_input",
            batches=batches)
        yield_op = records.get_yield_op()
        # Cycle over 3 epochs and make sure we never duplicate a record
        # within an epoch.
        for _ in range(3):
            epoch_set = set()
            for _ in range(int(files * records_per_file / batches)):
                op_list = self.evaluate(yield_op)
                # Compare with == rather than `is`: identity comparison of
                # ints is an implementation detail and can fail.
                self.assertEqual(len(op_list), batches)
                for r in op_list:
                    self.assertNotIn(r[0], epoch_set)
                    epoch_set.add(r[0])
def device_minibatches(cls, num_devices, data_dir, total_batch_size,
                       height, width, distort_color, val=False):
    dtype = tf.float32
    subset = 'validation' if val else 'train'
    nrecord = get_num_records(os.path.join(data_dir, '{}-*'.format(subset)))
    input_buffer_size = min(10000, nrecord)
    record_input = data_flow_ops.RecordInput(
        file_pattern=os.path.join(data_dir, '{}-*'.format(subset)),
        parallelism=64,
        # Note: This causes deadlock during init if larger than dataset
        buffer_size=input_buffer_size,
        batch_size=total_batch_size,
        seed=0)
    records = record_input.get_yield_op()
    # Split batch into individual images
    records = tf.split(records, total_batch_size, 0)
    records = [tf.reshape(record, []) for record in records]
    # Deserialize and preprocess images into batches for each device
    images = defaultdict(list)
    labels = defaultdict(list)
    with tf.name_scope('input_pipeline'):
        for thread_id, record in enumerate(records):
            imgdata, label, bbox, _ = cls._deserialize_image_record(record)
            image = cls._preprocess(imgdata, bbox, thread_id, height, width,
                                    distort_color, val=val)
            label -= 1  # Change to 0-based (don't use background class)
            device_num = thread_id % num_devices
            images[device_num].append(image)
            labels[device_num].append(label)
        # Stack images back into a sub-batch for each device
        for device_num in xrange(num_devices):
            images[device_num] = tf.parallel_stack(images[device_num])
            labels[device_num] = tf.concat(labels[device_num], 0)
            images[device_num] = tf.reshape(images[device_num],
                                            [-1, height, width, 3])
            images[device_num] = tf.clip_by_value(images[device_num], 0., 255.)
            images[device_num] = tf.cast(images[device_num], dtype)
    return images, labels, nrecord
def input_fn(tf_glob, one_hot=True, classes=None, is_training=None,
             batch_shape=[32, 224, 224, 3], parallelism=1):
    """Return tensors for reading batches from TFRecord files matching tf_glob."""
    print('Creating graph for loading %s TFRecords...' % tf_glob)
    with tf.variable_scope("TFRecords"):
        record_input = data_flow_ops.RecordInput(
            tf_glob, batch_size=batch_shape[0], parallelism=parallelism)
        records_op = record_input.get_yield_op()
        records_op = tf.split(records_op, batch_shape[0], 0)
        records_op = [tf.reshape(record, []) for record in records_op]
        progbar = Progbar(len(records_op))

        images = []
        labels = []
        for i, serialized_example in enumerate(records_op):
            progbar.update(i)
            with tf.variable_scope("parse_images", reuse=True):
                features = tf.parse_single_example(
                    serialized_example,
                    features={
                        'image': tf.FixedLenFeature([], tf.string),
                        'label': tf.FixedLenFeature([], tf.int64),
                    })
                image_decoded = tf.image.decode_jpeg(features['image'],
                                                     channels=3)
                image = tf.image.convert_image_dtype(image_decoded, tf.float32)
                resized_image = tf.image.resize_images(
                    image, [batch_shape[1], batch_shape[2]])
                label = tf.cast(features['label'], tf.int32)
                if one_hot and classes:
                    label = tf.one_hot(label, classes)
                images.append(resized_image)
                labels.append(label)

        images = tf.parallel_stack(images, 0)
        labels = tf.parallel_stack(labels, 0)
        # images = tf.cast(images, tf.float32)
        # images = tf.reshape(images, shape=batch_shape)

        # StagingArea will store tensors across multiple steps to speed up
        # execution. Note: as written, the staged tensors are never consumed;
        # the function returns the unstaged batch, so the staging area below
        # is effectively dead code unless the caller runs copy_stage_op.
        images_shape = images.get_shape()
        labels_shape = labels.get_shape()
        copy_stage = data_flow_ops.StagingArea(
            [tf.float32, tf.float32], shapes=[images_shape, labels_shape])
        copy_stage_op = copy_stage.put([images, labels])
        staged_images, staged_labels = copy_stage.get()

        return images, labels
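The note above flags a pitfall: a StagingArea only helps if its put op actually runs every step. A minimal sketch of the intended double-buffering loop, assuming TF 1.x; the random tensors stand in for a real input batch.

import tensorflow as tf
from tensorflow.python.ops import data_flow_ops

# Stand-ins for a real input batch (hypothetical shapes).
images = tf.random_uniform([32, 224, 224, 3])
labels = tf.random_uniform([32, 10])

stage = data_flow_ops.StagingArea(
    [tf.float32, tf.float32],
    shapes=[images.get_shape(), labels.get_shape()])
put_op = stage.put([images, labels])
staged_images, staged_labels = stage.get()

loss = tf.reduce_mean(staged_images)  # stand-in for a model

with tf.Session() as sess:
    sess.run(put_op)  # prime the pipeline once
    for _ in range(10):
        # Each step consumes the previously staged batch while
        # simultaneously staging the next one.
        _, loss_val = sess.run([put_op, loss])
    print(loss_val)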
def minibatch(self, dataset, subset, use_datasets, cache_data,
              shift_ratio=-1):
    if shift_ratio < 0:
        shift_ratio = self.shift_ratio
    with tf.name_scope('batch_processing'):
        # Build final results per split.
        images = [[] for _ in range(self.num_splits)]
        labels = [[] for _ in range(self.num_splits)]
        if use_datasets:
            ds = data_utils.create_dataset(
                self.batch_size, self.num_splits, self.batch_size_per_split,
                self.parse_and_preprocess, dataset, subset, self.train,
                cache_data)
            ds_iterator = data_utils.create_iterator(ds)
            for d in xrange(self.num_splits):
                labels[d], images[d] = ds_iterator.get_next()
        else:
            record_input = data_flow_ops.RecordInput(
                file_pattern=dataset.tf_record_pattern(subset),
                seed=301,
                parallelism=64,
                buffer_size=10000,
                batch_size=self.batch_size,
                shift_ratio=shift_ratio,
                name='record_input')
            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]
            for idx in xrange(self.batch_size):
                value = records[idx]
                (label, image) = self.parse_and_preprocess(value, idx)
                split_index = idx % self.num_splits
                labels[split_index].append(label)
                images[split_index].append(image)

        for split_index in xrange(self.num_splits):
            if not use_datasets:
                images[split_index] = tf.parallel_stack(images[split_index])
                labels[split_index] = tf.concat(labels[split_index], 0)
            images[split_index] = tf.reshape(
                images[split_index],
                shape=[self.batch_size_per_split, self.height, self.width,
                       self.depth])
            labels[split_index] = tf.reshape(labels[split_index],
                                             [self.batch_size_per_split])
        return images, labels
def read_and_decode_recordinput(tf_glob, one_hot=True, classes=None,
                                is_train=None,
                                batch_shape=[1000, 28, 28, 1], parallelism=1):
    """Return tensor to read from TFRecord."""
    print('Creating graph for loading %s TFRecords...' % tf_glob)
    with tf.variable_scope("TFRecords"):
        record_input = data_flow_ops.RecordInput(
            tf_glob, batch_size=batch_shape[0], parallelism=parallelism)
        records_op = record_input.get_yield_op()
        records_op = tf.split(records_op, batch_shape[0], 0)
        records_op = [tf.reshape(record, []) for record in records_op]
        progbar = Progbar(len(records_op))

        images = []
        labels = []
        for i, serialized_example in enumerate(records_op):
            progbar.update(i)
            with tf.variable_scope("parse_images", reuse=True):
                features = tf.parse_single_example(
                    serialized_example,
                    features={
                        'label': tf.FixedLenFeature([], tf.int64),
                        'image_raw': tf.FixedLenFeature([], tf.string),
                    })
                img = tf.decode_raw(features['image_raw'], tf.uint8)
                img.set_shape(batch_shape[1] * batch_shape[2])
                img = tf.reshape(img, [1] + batch_shape[1:])
                img = tf.cast(img, tf.float32) * (1. / 255) - 0.5
                label = tf.cast(features['label'], tf.int32)
                if one_hot and classes:
                    label = tf.one_hot(label, classes)
                images.append(img)
                labels.append(label)

        images = tf.parallel_stack(images, 0)
        labels = tf.parallel_stack(labels, 0)
        images = tf.cast(images, tf.float32)
        images = tf.reshape(images, shape=batch_shape)

        # StagingArea will store tensors across multiple steps to
        # speed up execution.
        images_shape = images.get_shape()
        labels_shape = labels.get_shape()
        copy_stage = data_flow_ops.StagingArea(
            [tf.float32, tf.float32], shapes=[images_shape, labels_shape])
        copy_stage_op = copy_stage.put([images, labels])
        staged_images, staged_labels = copy_stage.get()
        print(images, labels)
        return images, labels
def testRecordInputSimple(self):
    with self.cached_session() as sess:
        self.generateTestData("basic", 1, 1)
        yield_op = data_flow_ops.RecordInput(
            file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
            parallelism=1,
            buffer_size=1,
            batch_size=1,
            name="record_input").get_yield_op()
        self.assertEqual(self.evaluate(yield_op), b"0000000000")
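These tests depend on a generateTestData helper that is not reproduced here. A plausible reconstruction, assuming each record holds its global index zero-padded to ten bytes (which is what the b"0000000000" assertion implies):

import os

import tensorflow as tf

def generate_test_data(tmp_dir, prefix, num_files, records_per_file):
    # Each record is its global index, zero-padded to 10 bytes, so the
    # first record of the first file is b"0000000000".
    for f in range(num_files):
        path = os.path.join(tmp_dir, '%s.%d' % (prefix, f))
        writer = tf.python_io.TFRecordWriter(path)
        for r in range(records_per_file):
            writer.write(str(f * records_per_file + r).zfill(10).encode())
        writer.close()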
def minibatch(self, dataset, subset):
    with tf.name_scope('batch_processing'):
        images = [[] for _ in range(self.device_count)]
        labels = [[] for _ in range(self.device_count)]
        # The RecordInput op continuously reads a batch of records
        # asynchronously into a buffer of some fixed capacity (source: TF docs).
        record_input = data_flow_ops.RecordInput(
            file_pattern=dataset.tf_record_pattern(subset),
            seed=301,
            parallelism=64,
            buffer_size=10000,
            batch_size=self.batch_size,
            name='record_input')
        records = record_input.get_yield_op()
        records = tf.split(records, self.batch_size, 0)
        records = [tf.reshape(record, []) for record in records]
        for i in range(self.batch_size):
            value = records[i]
            image_buffer, label_index, bbox, _ = parse_example_proto(value)
            image = self.preprocess(image_buffer, bbox, i % 4)
            device_index = i % self.device_count
            images[device_index].append(image)
            labels[device_index].append(label_index)

        label_index_batch = [None] * self.device_count
        for device_index in range(self.device_count):
            images[device_index] = tf.parallel_stack(images[device_index])
            label_index_batch[device_index] = tf.concat(labels[device_index], 0)
            images[device_index] = tf.cast(images[device_index], self.dtype)
            depth = 3
            images[device_index] = tf.reshape(
                images[device_index],
                shape=[self.batch_size_per_device, self.height, self.width,
                       depth])
            label_index_batch[device_index] = tf.reshape(
                label_index_batch[device_index], [self.batch_size_per_device])
        return images, label_index_batch
def testRecordInputSimpleZlib(self):
    with self.test_session() as sess:
        self.generateTestData(
            "basic", 1, 1,
            compression_type=tf_record.TFRecordCompressionType.ZLIB)
        yield_op = data_flow_ops.RecordInput(
            file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
            parallelism=1,
            buffer_size=1,
            batch_size=1,
            name="record_input",
            compression_type=tf_record.TFRecordCompressionType.ZLIB).get_yield_op()
        self.assertEqual(sess.run(yield_op), b"0000000000")
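Reading compressed records requires that the test data be written with matching compression. A sketch of the ZLIB writer side, assuming the TF 1.x tf.python_io API; the output path is arbitrary.

import tensorflow as tf

# Write ZLIB-compressed TFRecords that a RecordInput constructed with
# compression_type=TFRecordCompressionType.ZLIB can read back.
options = tf.python_io.TFRecordOptions(
    tf.python_io.TFRecordCompressionType.ZLIB)
writer = tf.python_io.TFRecordWriter('/tmp/basic.0', options=options)
writer.write(b'0000000000')
writer.close()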
def testDoesNotDeadlock(self):
    # Iterate multiple times to cause deadlock if there is a chance it can occur.
    for _ in range(30):
        with self.cached_session() as sess:
            self.generateTestData("basic", 1, 1)
            records = data_flow_ops.RecordInput(
                file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
                parallelism=1,
                buffer_size=100,
                batch_size=1,
                name="record_input")
            yield_op = records.get_yield_op()
            for _ in range(50):
                self.evaluate(yield_op)
def record_input_batch(self, pre_process_func=None, seed=301, parallelism=64,
                       buffer_size=10000, batch_size=32, cols=None):
    """Randomly read one batch of data from TFRecord files using RecordInput.

    :param pre_process_func: Preprocessing function; no preprocessing is done
        when None. It must accept as many arguments as len(cols); the number
        of values it returns is unrestricted.
    :param seed: Random seed.
    :param parallelism: Number of parallel readers.
    :param buffer_size: The maximum number of records the buffer will contain.
    :param batch_size: How many records to return per call.
    :param cols: Which TFRecord features to return; a subset of the values
        returned by get_keys().
    :return: One batch of data.
    """
    if cols is None:
        cols = self.get_keys()
    record_input = data_flow_ops.RecordInput(  # yields a tensor of shape [batch_size]
        file_pattern=self.pattern,
        seed=seed,
        parallelism=parallelism,
        buffer_size=buffer_size,
        batch_size=batch_size,
        name='record_input')
    records = record_input.get_yield_op()
    records = tf.split(records, batch_size, 0)
    records = [tf.reshape(record, []) for record in records]
    batch_examples = [[] for _ in range(len(cols))]
    keys = self.get_keys()
    types = self.get_types()
    for i in range(batch_size):
        value = records[i]
        # Look up the feature types corresponding to cols.
        cols_types = [types[keys.index(col)] for col in cols]
        features = _parse_single_example_proto_cols_closure(
            cols_types, cols)(value)
        if pre_process_func is not None:  # apply preprocessing if requested
            features = pre_process_func(*features)
            # The caller's pre_process_func may return a single Tensor;
            # convert it to a Sequence.
            if not isinstance(features, collections.Sequence):
                features = (features,)
        for j, feature in enumerate(features):
            batch_examples[j].append(feature)
    return batch_examples
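A hypothetical call site for record_input_batch; the reader instance, the feature names, and the normalize function below are all assumptions, since the surrounding class is not shown.

# Hypothetical usage; `reader` is an instance of the class above with
# self.pattern pointing at real TFRecord files.
def normalize(image, label):
    image = tf.cast(image, tf.float32) / 255.0
    return image, label

image_list, label_list = reader.record_input_batch(
    pre_process_func=normalize,
    batch_size=32,
    cols=['image', 'label'])  # assumed feature names
# record_input_batch returns per-column lists of per-example tensors;
# stack them if a single batched tensor is needed.
images = tf.parallel_stack(image_list)
labels = tf.parallel_stack(label_list)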
def minibatch(self, dataset, subset):
    with tf.name_scope('batch_processing'):
        images = [[] for i in range(self.device_count)]
        labels = [[] for i in range(self.device_count)]
        filenames = [[] for i in range(self.device_count)]
        record_input = data_flow_ops.RecordInput(
            file_pattern=dataset.tf_record_pattern(subset),
            seed=randint(0, 9000),
            parallelism=64,
            buffer_size=10000,
            batch_size=self.batch_size,
            name='record_input')
        records = record_input.get_yield_op()
        records = tf.split(records, self.batch_size, 0)
        records = [tf.reshape(record, []) for record in records]
        for i in xrange(self.batch_size):
            value = records[i]
            image_buffer, label_index, bbox, _, filename = parse_example_proto(
                value)
            image = self.preprocess(image_buffer, bbox, i % 4)
            device_index = i % self.device_count
            images[device_index].append(image)
            labels[device_index].append(label_index)
            filenames[device_index].append(filename)

        label_index_batch = [None] * self.device_count
        for device_index in xrange(self.device_count):
            images[device_index] = tf.parallel_stack(images[device_index])
            label_index_batch[device_index] = tf.concat(
                labels[device_index], 0)
            # dynamic_pad=True)  # HACK TESTING dynamic_pad=True
            images[device_index] = tf.cast(images[device_index], self.dtype)
            depth = 3
            images[device_index] = tf.reshape(
                images[device_index],
                shape=[self.batch_size_per_device, self.height, self.width,
                       depth])
            label_index_batch[device_index] = tf.reshape(
                label_index_batch[device_index], [self.batch_size_per_device])

        # Display the training images in the visualizer.
        # tf.summary.image('images', images)
        return images, label_index_batch, filenames
def minibatch(self):
    """Returns a minibatch of images and labels from a TFRecords file."""
    mode = self.mode
    batch_size = self.params['batch_size']
    if mode not in ['train', 'validation', 'test']:
        mode = 'train'
    if self.debug:
        self.inspect_tfrecords(mode)
    record_input = data_flow_ops.RecordInput(
        file_pattern=os.path.join(self.params['data_dir'], '*.tfrecords'),
        parallelism=self.params['IO_threads'],
        buffer_size=self.params['buffer_cap'],
        batch_size=batch_size)
    records = record_input.get_yield_op()
    # Split batch into individual images
    records = tf.split(records, batch_size, 0)
    records = [tf.reshape(record, []) for record in records]
    # print('record length %s and contents %s' % (len(records), format(records)))
    # Deserialize and preprocess images into batches for each device
    images = []
    labels = []
    with tf.name_scope('input_pipeline'):
        if self.params[mode + '_distort']:
            print_rank('images will be distorted')
        for i, record in enumerate(records):
            image, label = self.decode_image_label(record)
            if self.params[mode + '_distort']:
                # image = self.add_noise_image(image)
                image = self.distort(image)
            images.append(image)
            labels.append(label)
        image_shape = image.get_shape().as_list()
        label_shape = label.get_shape().as_list()
        # Stack images and labels back into a single tensor
        labels = tf.parallel_stack(labels)
        images = tf.parallel_stack(images)
        # Reshape them to the expected shape:
        labels_newshape = [batch_size] + label_shape
        images_newshape = [batch_size] + image_shape
        labels = tf.reshape(labels, labels_newshape)
        images = tf.reshape(images, images_newshape)
        # Glimpse images: moved to GPU
        # images = self.get_glimpses(images)
        # Display the training images in the TensorBoard visualizer.
        if self.debug:
            tf.summary.image("images", images, max_outputs=4)
        # Resize
        if self.params['resize']:
            images = tf.image.resize_bilinear(
                images,
                [self.params['RESIZE_WIDTH'], self.params['RESIZE_HEIGHT']])
        if self.params['tile']:
            images = tf.ones(
                [self.params['IMAGE_DEPTH'], self.params['IMAGE_HEIGHT'],
                 self.params['IMAGE_WIDTH']],
                dtype=self.params['IMAGE_DTYPE'])
            labels = tf.ones([256, 512, 512], dtype=self.params['LABEL_DTYPE'])
    return images, labels
def minibatch(self, dataset, subset, use_datasets, shift_ratio=-1):
    if shift_ratio < 0:
        shift_ratio = self.shift_ratio
    with tf.name_scope('batch_processing'):
        # Build final results per split.
        images = [[] for _ in range(self.num_splits)]
        labels = [[] for _ in range(self.num_splits)]
        if use_datasets:
            glob_pattern = dataset.tf_record_pattern(subset)
            file_names = gfile.Glob(glob_pattern)
            if not file_names:
                raise ValueError('Found no files in --data_dir matching: {}'
                                 .format(glob_pattern))
            ds = tf.contrib.data.TFRecordDataset(file_names)
            counter = tf.contrib.data.Dataset.range(self.batch_size)
            counter = counter.repeat()
            ds = tf.contrib.data.Dataset.zip((ds, counter))
            ds = ds.map(self.parse_and_preprocess,
                        num_parallel_calls=self.batch_size)
            ds = ds.prefetch(buffer_size=self.batch_size)
            ds = ds.shuffle(buffer_size=10000)
            ds = ds.repeat()
            ds_iterator = ds.make_one_shot_iterator()
            # TODO(jsimsa): Use datasets' batch transformation instead of (see
            # below) once the transformation implements parallel data copy.
            #
            # NOTE: The current implementation does not preserve the order of
            # elements between the shuffle buffer and the batch.
            for idx in xrange(self.batch_size):
                label, image = ds_iterator.get_next()
                split_index = idx % self.num_splits
                labels[split_index].append(label)
                images[split_index].append(image)
        else:
            record_input = data_flow_ops.RecordInput(
                file_pattern=dataset.tf_record_pattern(subset),
                seed=301,
                parallelism=64,
                buffer_size=10000,
                batch_size=self.batch_size,
                shift_ratio=shift_ratio,
                name='record_input')
            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]
            for idx in xrange(self.batch_size):
                value = records[idx]
                (label, image) = self.parse_and_preprocess(value, idx)
                split_index = idx % self.num_splits
                labels[split_index].append(label)
                images[split_index].append(image)

        for split_index in xrange(self.num_splits):
            images[split_index] = tf.parallel_stack(images[split_index])
            labels[split_index] = tf.concat(labels[split_index], 0)
            images[split_index] = tf.cast(images[split_index], self.dtype)
            depth = 3
            images[split_index] = tf.reshape(
                images[split_index],
                shape=[self.batch_size_per_split, self.height, self.width,
                       depth])
            labels[split_index] = tf.reshape(labels[split_index],
                                             [self.batch_size_per_split])
        return images, labels
def minibatch(self, dataset, subset, use_data_sets):
    with tf.name_scope('batch_processing'):
        images = [[] for _ in range(self.num_splits)]
        labels = [[] for _ in range(self.num_splits)]
        if use_data_sets:
            file_names = glob.glob(dataset.tf_record_pattern(subset))
            ds = tf.contrib.data.TFRecordDataset(file_names)
            counter = tf.contrib.data.Dataset.range(self.batch_size)
            counter = counter.repeat()
            ds = tf.contrib.data.Dataset.zip((ds, counter))
            ds = ds.map(self.parse_and_preprocess,
                        num_parallel_calls=self.batch_size,
                        output_buffer_size=self.batch_size)
            ds = ds.shuffle(buffer_size=10000)
            ds = ds.repeat()
            ds = ds.batch(batch_size=(self.batch_size / self.num_splits))
            ds_iterator = ds.make_one_shot_iterator()
            for d in xrange(self.num_splits):
                labels[d], images[d] = ds_iterator.get_next()
        else:
            # Build final results per split.
            record_input = data_flow_ops.RecordInput(
                file_pattern=dataset.tf_record_pattern(subset),
                seed=301,
                parallelism=64,
                buffer_size=10000,
                batch_size=self.batch_size,
                shift_ratio=self.shift_ratio,
                name='record_input')
            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]
            for idx in xrange(self.batch_size):
                value = records[idx]
                (label_index, image) = self.parse_and_preprocess(value, idx)
                split_index = idx % self.num_splits
                images[split_index].append(image)
                labels[split_index].append(label_index)

        label_index_batch = [None] * self.num_splits
        for split_index in xrange(self.num_splits):
            if use_data_sets:
                label_index_batch[split_index] = labels[split_index]
            else:
                images[split_index] = tf.parallel_stack(images[split_index])
                label_index_batch[split_index] = tf.concat(
                    labels[split_index], 0)
            images[split_index] = tf.cast(images[split_index], self.dtype)
            depth = 3
            images[split_index] = tf.reshape(
                images[split_index],
                shape=[self.batch_size_per_split, self.height, self.width,
                       depth])
            label_index_batch[split_index] = tf.reshape(
                label_index_batch[split_index], [self.batch_size_per_split])
        return images, label_index_batch
def minibatch(self, file_pattern):
    with tf.name_scope('batch_processing'):
        output_data = [[] for i in range(self.device_count)]
        labels = [[] for i in range(self.device_count)]
        record_input = data_flow_ops.RecordInput(
            file_pattern=file_pattern,
            seed=301,
            parallelism=64,
            buffer_size=10000,
            batch_size=self.batch_size,
            name='record_input')
        records = record_input.get_yield_op()
        records = tf.split(records, self.batch_size, 0)
        records = [tf.reshape(record, []) for record in records]
        for i in xrange(self.batch_size):
            value = records[i]
            data_buffer, label_index, _, frames = self.parse_example_proto(
                value)
            processed_data = self.preprocess(data_buffer, frames)
            device_index = i % self.device_count
            output_data[device_index].append(processed_data)
            labels[device_index].append(label_index)

        label_index_batch = [None] * self.device_count
        for device_index in xrange(self.device_count):
            output_data[device_index] = tf.parallel_stack(
                output_data[device_index])
            label_index_batch[device_index] = tf.concat(
                labels[device_index], 0)
            # dynamic_pad=True)  # HACK TESTING dynamic_pad=True
            output_data[device_index] = tf.cast(output_data[device_index],
                                                self.dtype)
            # Use == for string comparison; `is` checks object identity and
            # is not reliable for strings.
            if self.data_type == 'rgb':
                depth = 3
                output_data[device_index] = tf.reshape(
                    output_data[device_index],
                    shape=[self.batch_size_per_device, self.time_window,
                           self.cropped_size[0], self.cropped_size[1], depth])
                # shape=[self.batch_size_per_device, -1,
                #        self.cropped_size[0], self.cropped_size[1], depth])
            elif self.data_type == 'flow':
                depth = 2
                output_data[device_index] = tf.reshape(
                    output_data[device_index],
                    shape=[self.batch_size_per_device, self.time_window,
                           self.cropped_size[0], self.cropped_size[1], depth])
                # shape=[self.batch_size_per_device, -1,
                #        self.cropped_size[0], self.cropped_size[1], depth])
            # elif self.data_type == 'audio':  # TBD
            else:
                # Raise a real exception; the original `raise (tuple, ...)`
                # form is invalid.
                raise ValueError(
                    'data_type error, got: {}'.format(self.data_type))
            label_index_batch[device_index] = tf.reshape(
                label_index_batch[device_index], [self.batch_size_per_device])

        # Display the training images in the visualizer.
        # tf.summary.image('images', images)
        return output_data, label_index_batch
def minibatch(self):
    with tf.name_scope('batch_processing'):
        images = []
        bboxes = []
        labels = []
        slcs = []
        record_input = data_flow_ops.RecordInput(
            file_pattern=dp.tf_records(
                self.data_set, '{}_localization'.format(self.region)),
            seed=301,
            parallelism=64,
            buffer_size=5000,
            shift_ratio=0.2,
            batch_size=self.batch_size,
            name='record_input')
        records = record_input.get_yield_op()
        records = tf.split(records, self.batch_size, 0)
        records = [tf.reshape(record, []) for record in records]
        for i in xrange(self.batch_size):
            value = records[i]
            image, bbox, dim, label, slc = self._parse_example_proto(value, i)
            image = tf.image.resize_images(image, self.image_shape)
            bbox = tf.cast(
                tf.cast(bbox, tf.float32) * float(IMAGE_LENGTH) /
                tf.cast(dim, tf.float32), tf.int64)
            image, bbox = self.distort_image(image, bbox)
            unmasked_image = image
            # Build a mask that is 150 inside the bounding box and 0 outside.
            bbox = tf.cast(bbox, tf.int32)
            bbox_mask = tf.ones((bbox[3] - bbox[1], bbox[2] - bbox[0]))
            bbox_mask_left = tf.zeros((bbox[3] - bbox[1], bbox[0]))
            bbox_mask_right = tf.zeros(
                (bbox[3] - bbox[1], IMAGE_LENGTH - bbox[2]))
            bbox_mask_top = tf.zeros((bbox[1], IMAGE_LENGTH))
            bbox_mask_bottom = tf.zeros(
                (IMAGE_LENGTH - bbox[3], IMAGE_LENGTH))
            bbox_mask = tf.concat(
                (bbox_mask_left, bbox_mask, bbox_mask_right), axis=1)
            bbox_mask = tf.concat(
                (bbox_mask_top, bbox_mask, bbox_mask_bottom), axis=0)
            bbox_mask *= 150
            image = image[:, :, 0]
            image -= self.mean_subtract
            image += bbox_mask
            image = tf.minimum(image, 255)
            bbox_mask.set_shape((IMAGE_LENGTH, IMAGE_LENGTH))
            images.append(image)
            bbox = tf.cast(bbox, tf.int64)
            bboxes.append(bbox)
            labels.append(label)
            slcs.append(slc)
        images = tf.parallel_stack(images)
        images = tf.reshape(
            images,
            shape=[self.batch_size, self.image_shape[0], self.image_shape[1], -1])
        bboxes = tf.reshape(bboxes, (self.batch_size, BOX_COUNT, 4))
        x_min, y_min, x_max, y_max = tf.split(
            value=tf.reshape(bboxes, (-1, 4)), num_or_size_splits=4, axis=1)
        normalized_boxes = tf.cast(
            tf.reshape(tf.stack((y_min, x_min, y_max, x_max), axis=1),
                       (self.batch_size, BOX_COUNT, 4)),
            tf.float32) / float(IMAGE_LENGTH)
        # images = tf.image.draw_bounding_boxes(images, normalized_boxes)
        labels = tf.reshape(labels, (self.batch_size, 1))
        slcs = tf.reshape(slcs, (self.batch_size,))
        return images, labels, slcs
def minibatch(self, dataset, subset, use_datasets, cache_data,
              shift_ratio=-1):
    if shift_ratio < 0:
        shift_ratio = self.shift_ratio
    with tf.name_scope('batch_processing'):
        # Build final results per split.
        images = [[] for _ in range(self.num_splits)]
        labels = [[] for _ in range(self.num_splits)]
        if use_datasets:
            glob_pattern = dataset.tf_record_pattern(subset)
            file_names = gfile.Glob(glob_pattern)
            if not file_names:
                raise ValueError(
                    'Found no files in --data_dir matching: {}'.format(
                        glob_pattern))
            ds = tf.data.TFRecordDataset.list_files(file_names)
            ds = ds.apply(
                interleave_ops.parallel_interleave(
                    tf.data.TFRecordDataset, cycle_length=10))
            if cache_data:
                ds = ds.take(1).cache().repeat()
            counter = tf.data.Dataset.range(self.batch_size)
            counter = counter.repeat()
            ds = tf.data.Dataset.zip((ds, counter))
            ds = ds.prefetch(buffer_size=self.batch_size)
            ds = ds.shuffle(buffer_size=10000)
            ds = ds.repeat()
            ds = ds.apply(
                batching.map_and_batch(
                    map_func=self.parse_and_preprocess,
                    batch_size=self.batch_size_per_split,
                    num_parallel_batches=self.num_splits))
            ds = ds.prefetch(buffer_size=self.num_splits)
            ds_iterator = ds.make_one_shot_iterator()
            for d in xrange(self.num_splits):
                labels[d], images[d] = ds_iterator.get_next()
        else:
            record_input = data_flow_ops.RecordInput(
                file_pattern=dataset.tf_record_pattern(subset),
                seed=301,
                parallelism=64,
                buffer_size=10000,
                batch_size=self.batch_size,
                shift_ratio=shift_ratio,
                name='record_input')
            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]
            for idx in xrange(self.batch_size):
                value = records[idx]
                (label, image) = self.parse_and_preprocess(value, idx)
                split_index = idx % self.num_splits
                labels[split_index].append(label)
                images[split_index].append(image)

        for split_index in xrange(self.num_splits):
            if not use_datasets:
                images[split_index] = tf.parallel_stack(images[split_index])
                labels[split_index] = tf.concat(labels[split_index], 0)
            images[split_index] = tf.cast(images[split_index], self.dtype)
            depth = 3
            images[split_index] = tf.reshape(
                images[split_index],
                shape=[self.batch_size_per_split, self.height, self.width,
                       depth])
            labels[split_index] = tf.reshape(labels[split_index],
                                             [self.batch_size_per_split])
        return images, labels
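Stripped of the benchmark scaffolding, the tf.data branch above reduces to a short pipeline. A minimal sketch, assuming TF 1.5-era APIs (tf.contrib.data); the file pattern and parser are placeholders.

import tensorflow as tf

def parse_fn(serialized):
    # Placeholder parser; real code would use tf.parse_single_example.
    return serialized

files = tf.data.Dataset.list_files('/data/train-*')  # hypothetical pattern
ds = files.apply(
    tf.contrib.data.parallel_interleave(tf.data.TFRecordDataset,
                                        cycle_length=10))
ds = ds.shuffle(buffer_size=10000).repeat()
ds = ds.apply(
    tf.contrib.data.map_and_batch(map_func=parse_fn, batch_size=32,
                                  num_parallel_batches=2))
ds = ds.prefetch(2)
iterator = ds.make_one_shot_iterator()
batch = iterator.get_next()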
def minibatch(self, dataset, subset, use_data_sets):
    with tf.name_scope('batch_processing'):
        images = [[] for i in range(self.device_count)]
        labels = [[] for i in range(self.device_count)]
        if use_data_sets:
            file_names = glob.glob(dataset.tf_record_pattern(subset))
            batch_size_per = self.batch_size / self.device_count
            num_threads = 10
            output_buffer_size = num_threads * 2000

            counter = tf.data.Dataset.range(sys.maxint)
            ds = tf.data.TFRecordDataset(file_names)
            ds = tf.data.Dataset.zip((ds, counter))
            ds = ds.map(self.parse_and_preprocess,
                        num_parallel_calls=num_threads).prefetch(
                            output_buffer_size)
            shuffle_buffer_size = 10000
            ds = ds.shuffle(shuffle_buffer_size)
            repeat_count = -1  # infinite repetition
            ds = ds.repeat(repeat_count)
            ds = ds.batch(batch_size_per)
            ds_iterator = ds.make_one_shot_iterator()
            for d in xrange(self.device_count):
                labels[d], images[d] = ds_iterator.get_next()
        else:
            # Build final results per device.
            record_input = data_flow_ops.RecordInput(
                file_pattern=dataset.tf_record_pattern(subset),
                seed=301,
                parallelism=64,
                buffer_size=10000,
                batch_size=self.batch_size,
                shift_ratio=self.shift_ratio,
                name='record_input')
            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]
            for i in xrange(self.batch_size):
                value = records[i]
                (label_index, image) = self.parse_and_preprocess(value, i % 4)
                device_index = i % self.device_count
                images[device_index].append(image)
                labels[device_index].append(label_index)

        label_index_batch = [None] * self.device_count
        for device_index in xrange(self.device_count):
            if use_data_sets:
                label_index_batch[device_index] = labels[device_index]
            else:
                images[device_index] = tf.parallel_stack(images[device_index])
                label_index_batch[device_index] = tf.concat(
                    labels[device_index], 0)
            images[device_index] = tf.cast(images[device_index], self.dtype)
            depth = 3
            images[device_index] = tf.reshape(
                images[device_index],
                shape=[self.batch_size_per_device, self.height, self.width,
                       depth])
            label_index_batch[device_index] = tf.reshape(
                label_index_batch[device_index], [self.batch_size_per_device])

        if FLAGS.summary_verbosity >= 2:
            # Display the training images in the visualizer.
            tf.summary.image('images', images)
        return images, label_index_batch