def batch_inputs(feature_map, data_files, height=2048, width=2448,
                 batch_size=1, is_train=True, num_readers=1,
                 num_preprocess_threads=4):
    """Build a queue-based pipeline that yields batches read from TFRecords.

    Args:
        feature_map: feature specification handed to
            `tf.parse_single_example` to decode each serialized example.
        data_files: list of TFRecord file paths.
        height: forwarded to `image_processing`.
        width: forwarded to `image_processing`.
        batch_size: number of examples per batch.
        is_train: when true, file names are shuffled (and, with several
            readers, records go through a shuffling queue); otherwise files
            are read in order, since evaluation needs no shuffling.
        num_readers: number of record readers; values below 1 are treated
            as 1.
        num_preprocess_threads: number of parallel parsing/preprocessing
            branches fed to `tf.train.batch_join`.

    Returns:
        A `(data, label)` tuple: the processed images reshaped to
        `[batch_size, -1]` and the `'image/format'` feature reshaped to
        `[batch_size]`.
    """
    with tf.name_scope('reader_defination'):
        # File-name queue: shuffled for training, sequential for testing.
        shuffle_files = bool(is_train)
        filename_queue = tf.train.string_input_producer(
            data_files,
            shuffle=shuffle_files,
            capacity=16 if shuffle_files else 1)

        # There is always at least one reader.
        num_readers = 1 if num_readers < 1 else num_readers

        if num_readers == 1:
            _, example_serialized = tf.TFRecordReader().read(filename_queue)
        else:
            # Size of the intermediate buffer of serialized examples.
            examples_per_shard = 1024
            min_queue_examples = examples_per_shard * 16
            if is_train:
                examples_queue = tf.RandomShuffleQueue(
                    capacity=min_queue_examples + 3 * batch_size,
                    min_after_dequeue=min_queue_examples,
                    dtypes=[tf.string])
            else:
                examples_queue = tf.FIFOQueue(
                    capacity=examples_per_shard + 3 * batch_size,
                    dtypes=[tf.string])

            # One enqueue op per reader, all driven by a single QueueRunner.
            enqueue_ops = []
            for _ in range(num_readers):
                _, record = tf.TFRecordReader().read(filename_queue)
                enqueue_ops.append(examples_queue.enqueue([record]))
            runner = tf.train.queue_runner.QueueRunner(
                examples_queue, enqueue_ops)
            tf.train.queue_runner.add_queue_runner(runner)
            example_serialized = examples_queue.dequeue()

        # Each branch parses and preprocesses the example independently.
        samples = []
        for _ in range(num_preprocess_threads):
            parsed = tf.parse_single_example(example_serialized, feature_map)
            processed = image_processing(
                parsed['image/encoded'], height, width)
            samples.append([processed, parsed['image/format']])

        batch_data = tf.train.batch_join(
            samples,
            batch_size=batch_size,
            capacity=2 * num_preprocess_threads * batch_size)

        data = tf.reshape(batch_data[0], [batch_size, -1])
        label = tf.reshape(batch_data[1], [batch_size])
        return (data, label)
def _shuffle_inputs(input_tensors, capacity, min_after_dequeue, num_threads):
    """Shuffles tensors in `input_tensors`, maintaining grouping.

    The tensors are enqueued together as one element of a
    `tf.RandomShuffleQueue`, so they are dequeued together as well.

    Args:
        input_tensors: list of tensors forming one queue element.
        capacity: maximum number of elements held by the shuffle queue.
        min_after_dequeue: minimum number of elements left in the queue
            after a dequeue.
        num_threads: number of copies of the enqueue op handed to the
            `QueueRunner`.

    Returns:
        The result of `shuffle_queue.dequeue()`, with the static shapes of
        the corresponding inputs applied.
    """
    shuffle_queue = tf.RandomShuffleQueue(
        capacity, min_after_dequeue,
        dtypes=[t.dtype for t in input_tensors])
    enqueue_op = shuffle_queue.enqueue(input_tensors)
    runner = tf.train.QueueRunner(shuffle_queue, [enqueue_op] * num_threads)
    tf.train.add_queue_runner(runner)

    output_tensors = shuffle_queue.dequeue()

    # A single-component queue hands back a bare tensor instead of a list.
    # Indexing that tensor would slice it, so the shape would be set on a
    # throwaway slice (or fail). Normalize to a list for shape propagation
    # only; the value returned to the caller is left untouched.
    if isinstance(output_tensors, (list, tuple)):
        dequeued = output_tensors
    else:
        dequeued = [output_tensors]

    for dequeued_tensor, input_tensor in zip(dequeued, input_tensors):
        dequeued_tensor.set_shape(input_tensor.shape)

    return output_tensors
def make_batch(observations):
    """Stack `observations` and serve shuffled batches of the stacks.

    The observations are stacked along a new leading axis, pushed through
    a `tf.RandomShuffleQueue` and dequeued `batch_size` at a time. Queue
    sizing, element shape (`k`, `height`, `width`, `nch`), thread count and
    `batch_size` are all read from the enclosing scope.

    Args:
        observations: sequence of tensors to stack into one queue element.

    Returns:
        The tensor produced by `dequeue_many(batch_size)`.
    """
    element_shape = [k, height, width, nch]
    queue = tf.RandomShuffleQueue(
        capacity=queue_capacity,
        min_after_dequeue=queue_min,
        shapes=[element_shape],
        dtypes=[tf.float32])

    stacked = tf.stack(observations, axis=0)
    enqueue_op = queue.enqueue(stacked)

    # The same enqueue op is run by `queue_nthreads` runner threads.
    enqueue_ops = [enqueue_op] * queue_nthreads
    tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops))

    tf.summary.scalar('queue_size', queue.size())

    return queue.dequeue_many(batch_size)
def _shuffling_queue(shuffling_queue_capacity, min_after_dequeue, dtypes,
                     fields_as_list):
    """Route `fields_as_list` through a random shuffle queue.

    Exactly one enqueue op (hence a single writing thread) feeds the queue.

    Args:
        shuffling_queue_capacity: capacity of the shuffle queue.
        min_after_dequeue: minimum number of elements kept in the queue
            after a dequeue.
        dtypes: dtypes of the queue components.
        fields_as_list: tensors to enqueue as one element.

    Returns:
        The result of dequeuing one element from the shuffle queue.
    """
    # Named tuples lose the 'named' part when going through a queue, which
    # is why plain lists are used on both sides.
    queue = tf.RandomShuffleQueue(
        capacity=shuffling_queue_capacity,
        min_after_dequeue=min_after_dequeue,
        dtypes=dtypes)

    # Called purely for its side effect: it adds a size node to the graph
    # under a well-known name so the queue size can be read elsewhere.
    queue.size(name=RANDOM_SHUFFLING_QUEUE_SIZE)

    # The queue exists only for shuffling, so one enqueuing thread is enough.
    enqueue_op = queue.enqueue(fields_as_list)
    tf.train.add_queue_runner(tf.train.QueueRunner(queue, [enqueue_op]))

    # Component order coming out matches the order that went in.
    return queue.dequeue()
def _build(self):
    """Create the ops that read records of the configured split via a queue.

    Reads `<dataset_dir>/<split>.tfrecords`, decodes each record with
    `self.read_record`, and pushes the decoded values through either a
    random-shuffle or a FIFO queue, depending on `self._random_shuffle`.
    The created `QueueRunner` is stored on `self.queue_runner` and
    registered with the graph.

    Returns:
        The result of dequeuing one element from the queue.

    Raises:
        InvalidDataDirectory: if the split file does not exist.
    """
    split_filename = '{}.tfrecords'.format(self._split)
    split_path = os.path.join(self._dataset_dir, split_filename)
    if not tf.gfile.Exists(split_path):
        raise InvalidDataDirectory(
            '"{}" does not exist.'.format(split_path))

    # The producer accepts any number of files; here there is exactly one.
    filename_queue = tf.train.string_input_producer(
        [split_path], num_epochs=self._num_epochs, seed=self._seed)

    # Read one serialized record and decode it.
    _, raw_record = tf.TFRecordReader().read(filename_queue)
    values, dtypes, names = self.read_record(raw_record)

    if self._random_shuffle:
        queue = tf.RandomShuffleQueue(
            capacity=100,
            min_after_dequeue=0,
            dtypes=dtypes,
            names=names,
            name='tfrecord_random_queue',
            seed=self._seed)
    else:
        queue = tf.FIFOQueue(
            capacity=100,
            dtypes=dtypes,
            names=names,
            name='tfrecord_fifo_queue')

    # A single enqueue op, repeated once per requested runner thread.
    enqueue_op = queue.enqueue(values)
    enqueue_ops = [enqueue_op] * self._total_queue_ops
    self.queue_runner = tf.train.QueueRunner(queue, enqueue_ops)
    tf.train.add_queue_runner(self.queue_runner)

    return queue.dequeue()
def imagenet_inputs(batch_size, image_size, num_readers=1,
                    num_preprocess_threads=4):
    """Loads a batch of ImageNet training images and labels.

    Stand-in for `inception.image_processing.inputs` that avoids the
    hard-coded flags of that module.

    Args:
        batch_size: int, batch size.
        image_size: int. Each image is resized (aspect preserving) and then
            centrally cropped to `[image_size, image_size]`.
        num_readers: int, number of parallel record readers. Must be at
            least 1.
        num_preprocess_threads: int, number of parallel preprocessing
            branches. Must be a multiple of 4.

    Returns:
        A tuple `(images, labels)`: a 4-D tensor of shape
        `[batch_size, image_size, image_size, 3]` with values in [0, 1],
        and the label indices reshaped to `[batch_size]`.

    Raises:
        IOError: If ImageNet data files cannot be found.
        ValueError: If `num_preprocess_threads` is not a multiple of 4 or
            `num_readers` is less than 1.
    """
    imagenet = imagenet_data.ImagenetData('train')

    with tf.name_scope('batch_processing'):
        data_files = imagenet.data_files()
        if data_files is None:
            raise IOError('No ImageNet data files found')

        # Shuffled queue of shard file names.
        filename_queue = tf.train.string_input_producer(
            data_files, shuffle=True, capacity=16)

        if num_preprocess_threads % 4:
            raise ValueError('Please make num_preprocess_threads a multiple '
                             'of 4 (%d %% 4 != 0).' % num_preprocess_threads)
        if num_readers < 1:
            raise ValueError('Please make num_readers at least 1')

        # The shuffle queue is sized as a multiple of the approximate number
        # of examples per shard: more examples mix better, fewer examples
        # use less memory.
        examples_per_shard = 1024
        input_queue_memory_factor = 16
        min_queue_examples = examples_per_shard * input_queue_memory_factor
        examples_queue = tf.RandomShuffleQueue(
            capacity=min_queue_examples + 3 * batch_size,
            min_after_dequeue=min_queue_examples,
            dtypes=[tf.string])

        # Every reader contributes one enqueue op to the examples queue.
        enqueue_ops = []
        for _ in range(num_readers):
            _, serialized = imagenet.reader().read(filename_queue)
            enqueue_ops.append(examples_queue.enqueue([serialized]))
        runner = tf.train.queue_runner.QueueRunner(
            examples_queue, enqueue_ops)
        tf.train.queue_runner.add_queue_runner(runner)
        example_serialized = examples_queue.dequeue()

        images_and_labels = []
        for _ in range(num_preprocess_threads):
            # Extract the encoded image and its label from the Example.
            image_buffer, label_index, _, _ = _parse_example_proto(
                example_serialized)
            decoded = tf.image.decode_jpeg(image_buffer, channels=3)

            # Resize slightly larger than the target, then crop the center.
            resized = _aspect_preserving_resize(decoded, image_size + 2)
            cropped = _central_crop([resized], image_size, image_size)[0]
            cropped.set_shape([image_size, image_size, 3])

            scaled = tf.to_float(cropped) / 255.0
            images_and_labels.append([scaled, label_index])

        images, label_index_batch = tf.train.batch_join(
            images_and_labels,
            batch_size=batch_size,
            capacity=2 * num_preprocess_threads * batch_size)
        images = tf.reshape(
            images, shape=[batch_size, image_size, image_size, 3])

        # Make the training images visible in the visualizer.
        tf.summary.image('images', images)

        labels = tf.reshape(label_index_batch, [batch_size])
        return images, labels
def arbitrary_style_image_inputs(style_dataset_file,
                                 batch_size=None,
                                 image_size=None,
                                 center_crop=True,
                                 shuffle=True,
                                 augment_style_images=False,
                                 random_style_image_size=False,
                                 min_rand_image_size=128,
                                 max_rand_image_size=300):
    """Loads a batch of random style images from a tfrecord dataset.

    Unlike `style_image_inputs`, this does not return pre-computed Gram
    matrices, but it can augment the images. If `augment_style_images` is
    True, the style images are randomly modified (brightness, saturation,
    hue, flips, random resize and crop). If `random_style_image_size` is
    True, the output images are resized to one random size.

    Args:
        style_dataset_file: str, path to the tfrecord dataset of style files.
        batch_size: int. If provided, batches style images. Defaults to None.
        image_size: int. The images will be resized so that the smallest side
            has size image_size. Defaults to None (no resizing).
        center_crop: bool. If True, center-crops to [image_size, image_size].
            Defaults to True.
        shuffle: bool, whether to shuffle style files at random. Defaults to
            True.
        augment_style_images: bool. Whether to augment style images or not.
            Requires `image_size`.
        random_style_image_size: bool. If True, the style images are resized
            to a random size between min_rand_image_size and
            max_rand_image_size.
        min_rand_image_size: int. Minimum image size used when
            random_style_image_size is True.
        max_rand_image_size: int. Maximum image size used when
            random_style_image_size is True.

    Returns:
        A tuple `(image, label, image_orig)`:
        image: the (possibly augmented) style image with values in [0, 1].
            When `batch_size` is None a leading dimension of size 1 is
            added; otherwise it is a batch of `batch_size` images.
        label: the style label (batched when `batch_size` is given).
        image_orig: the style image without the random augmentation, with
            values in [0, 1]. Note that no leading dimension is added to it
            when `batch_size` is None.

    Raises:
        ValueError: if center cropping or augmentation is requested but no
            image size is provided, if batch size is specified but neither
            center-cropping nor augment-style-images is requested, or if
            both augment-style-images and center-cropping are requested.
    """
    if center_crop and image_size is None:
        raise ValueError('center-cropping requires specifying the image size.')
    if center_crop and augment_style_images:
        raise ValueError(
            'When augment_style_images is true images will be randomly cropped.'
        )
    if augment_style_images and image_size is None:
        # Augmentation ends with a random crop to image_size, so without a
        # size it cannot run. This call used to fail later with an
        # UnboundLocalError.
        raise ValueError(
            'augment_style_images requires specifying the image size.')
    if batch_size is not None and not center_crop and not augment_style_images:
        raise ValueError(
            'batching requires same image sizes (Set center-cropping or '
            'augment_style_images to true)')

    with tf.name_scope('style_image_processing'):
        # Force all input processing onto CPU in order to reserve the GPU for
        # the forward inference and back-propagation.
        with tf.device('/cpu:0'):
            filename_queue = tf.train.string_input_producer(
                [style_dataset_file],
                shuffle=False,
                capacity=1,
                name='filename_queue')
            if shuffle:
                examples_queue = tf.RandomShuffleQueue(
                    capacity=64,
                    min_after_dequeue=32,
                    dtypes=[tf.string],
                    name='random_examples_queue')
            else:
                examples_queue = tf.FIFOQueue(
                    capacity=64,
                    dtypes=[tf.string],
                    name='fifo_examples_queue')
            reader = tf.TFRecordReader()
            _, value = reader.read(filename_queue)
            enqueue_ops = [examples_queue.enqueue([value])]
            tf.train.queue_runner.add_queue_runner(
                tf.train.queue_runner.QueueRunner(examples_queue, enqueue_ops))
            example_serialized = examples_queue.dequeue()
            features = tf.parse_single_example(
                example_serialized,
                features={
                    'label': tf.FixedLenFeature([], tf.int64),
                    'image_raw': tf.FixedLenFeature([], tf.string)
                })
            image = tf.image.decode_jpeg(features['image_raw'])
            image.set_shape([None, None, 3])
            label = features['label']

            # Defined up front so that it exists even when no resizing is
            # requested (image_size is None).
            image_orig = image

            if image_size is not None:
                image_channels = int(image.shape[2])
                if augment_style_images:
                    image = tf.image.random_brightness(image, max_delta=0.8)
                    image = tf.image.random_saturation(
                        image, lower=0.5, upper=1.5)
                    image = tf.image.random_hue(image, max_delta=0.2)
                    image = tf.image.random_flip_left_right(image)
                    image = tf.image.random_flip_up_down(image)
                    random_larger_image_size = tf.random_uniform(
                        [],
                        minval=image_size + 2,
                        maxval=image_size + 200,
                        dtype=tf.int32)
                    image = _aspect_preserving_resize(
                        image, random_larger_image_size)
                    image = tf.random_crop(
                        image, size=[image_size, image_size, image_channels])
                    image.set_shape([image_size, image_size, image_channels])

                    # The un-augmented copy is only resized and center-cropped.
                    image_orig = _aspect_preserving_resize(
                        image_orig, image_size + 2)
                    image_orig = _central_crop(
                        [image_orig], image_size, image_size)[0]
                    image_orig.set_shape([image_size, image_size, 3])
                elif center_crop:
                    image = _aspect_preserving_resize(image, image_size + 2)
                    image = _central_crop(
                        [image], image_size, image_size)[0]
                    image.set_shape([image_size, image_size, image_channels])
                    image_orig = image
                else:
                    image = _aspect_preserving_resize(image, image_size)
                    image_orig = image

            image = tf.to_float(image) / 255.0
            image_orig = tf.to_float(image_orig) / 255.0

            if batch_size is None:
                image = tf.expand_dims(image, 0)
            else:
                [image, image_orig, label] = tf.train.batch(
                    [image, image_orig, label], batch_size=batch_size)

            if random_style_image_size:
                # Selects a random size for the style images and resizes all
                # the images in the batch to that size.
                image = _aspect_preserving_resize(
                    image,
                    tf.random_uniform(
                        [],
                        minval=min_rand_image_size,
                        maxval=max_rand_image_size,
                        dtype=tf.int32))

            return image, label, image_orig
def style_image_inputs(style_dataset_file, batch_size=None, image_size=None,
                       square_crop=False, shuffle=True):
    """Loads style images, labels and Gram matrices from a tfrecord dataset.

    Args:
        style_dataset_file: str, path to the tfrecord dataset of style files.
            Each Example is parsed with the following features:
            * 'image_raw': JPEG-encoded bytes of the style image.
            * 'label': int64 identifier of the style image.
            * 'vgg_16/<LAYER_NAME>': float32 Gram matrix for layer
              <LAYER_NAME> (<LAYER_NAME> in {conv,pool}{1,2,3,4,5}).
        batch_size: int. If provided, batches style images. Defaults to None.
        image_size: int. The images will be resized so that the smallest side
            has size image_size. Defaults to None.
        square_crop: bool. If True, square-crops to [image_size, image_size].
            Defaults to False.
        shuffle: bool, whether to shuffle style files at random. Defaults to
            True.

    Returns:
        A tuple `(image, label, gram_matrices)`. `image` has values in
        [0, 1]; it is a batch when `batch_size` is given and otherwise a
        single image with a leading dimension of size 1. `gram_matrices` is
        a dict mapping each VGG layer name to its Gram matrix tensor.

    Raises:
        ValueError: if center cropping is requested but no image size is
            provided, or if batch size is specified but center-cropping is
            not requested.
    """
    # (layer name, side length of its square Gram matrix)
    gram_layer_sizes = [
        ('vgg_16/conv1', 64), ('vgg_16/pool1', 64),
        ('vgg_16/conv2', 128), ('vgg_16/pool2', 128),
        ('vgg_16/conv3', 256), ('vgg_16/pool3', 256),
        ('vgg_16/conv4', 512), ('vgg_16/pool4', 512),
        ('vgg_16/conv5', 512), ('vgg_16/pool5', 512),
    ]
    vgg_layers = [layer_name for layer_name, _ in gram_layer_sizes]

    if square_crop and image_size is None:
        raise ValueError('center-cropping requires specifying the image size.')
    if batch_size is not None and not square_crop:
        raise ValueError('batching requires center-cropping.')

    with tf.name_scope('style_image_processing'):
        filename_queue = tf.train.string_input_producer(
            [style_dataset_file],
            shuffle=False,
            capacity=1,
            name='filename_queue')

        if shuffle:
            examples_queue = tf.RandomShuffleQueue(
                capacity=64,
                min_after_dequeue=32,
                dtypes=[tf.string],
                name='random_examples_queue')
        else:
            examples_queue = tf.FIFOQueue(
                capacity=64,
                dtypes=[tf.string],
                name='fifo_examples_queue')

        _, record = tf.TFRecordReader().read(filename_queue)
        runner = tf.train.queue_runner.QueueRunner(
            examples_queue, [examples_queue.enqueue([record])])
        tf.train.queue_runner.add_queue_runner(runner)
        example_serialized = examples_queue.dequeue()

        feature_spec = {
            'label': tf.FixedLenFeature([], tf.int64),
            'image_raw': tf.FixedLenFeature([], tf.string),
        }
        for layer_name, side in gram_layer_sizes:
            feature_spec[layer_name] = tf.FixedLenFeature(
                [side, side], tf.float32)
        features = tf.parse_single_example(
            example_serialized, features=feature_spec)

        image = tf.image.decode_jpeg(features['image_raw'])
        label = features['label']
        gram_matrices = [features[layer_name] for layer_name in vgg_layers]
        image.set_shape([None, None, 3])

        if image_size:
            if square_crop:
                # Resize slightly larger than the target, then crop the center.
                image = _aspect_preserving_resize(image, image_size + 2)
                image = _central_crop([image], image_size, image_size)[0]
                image.set_shape([image_size, image_size, 3])
            else:
                image = _aspect_preserving_resize(image, image_size)

        image = tf.to_float(image) / 255.0

        if batch_size is None:
            image = tf.expand_dims(image, 0)
        else:
            batched = tf.train.batch(
                [image, label] + gram_matrices, batch_size=batch_size)
            image, label = batched[:2]
            gram_matrices = batched[2:]

        gram_matrices = dict(zip(vgg_layers, gram_matrices))
        return image, label, gram_matrices
def group_choose_k(named_id_to_fps, k, n=None, with_replacement=False,
                   capacity=4096, min_after_dequeue=2048, nthreads=4):
    """Build a queue that yields random k-tuples drawn from within one group.

    Each group (a list of strings) is joined into one CSV string and fed
    through a string input producer. For every dequeued group, `n` index
    tuples of length `k` are sampled uniformly; tuples containing a repeated
    index are dropped unless `with_replacement` is set. The surviving tuples
    are pushed into a `tf.RandomShuffleQueue`.

    Args:
        named_id_to_fps: dict whose values are the groups (lists of strings).
        k: tuple size; must be positive.
        n: number of index tuples sampled per group. If None, defaults to
            "average group size choose k".
        with_replacement: if False (and k > 1), tuples with duplicate
            members are filtered out.
        capacity: capacity of the tuple shuffle queue.
        min_after_dequeue: minimum number of tuples left in the queue after
            a dequeue.
        nthreads: number of copies of the enqueue op handed to the
            `QueueRunner`.

    Returns:
        The result of dequeuing one element (of shape `[k]`) from the tuple
        queue.

    Raises:
        ValueError: if `n` is None and the average group size is smaller
            than `k`, so no default can be computed.
    """
    assert k > 0

    # Join (variable-length) groups into CSV strings for enqueueing
    groups = list(named_id_to_fps.values())
    avg_group_size = int(
        np.ceil(np.mean([len(group_fps) for group_fps in groups])))
    group_csvs = [','.join(group_fps) for group_fps in groups]

    # If n is None, compute a reasonable value (avg group len choose k)
    if n is None:
        if avg_group_size < k:
            raise ValueError(
                'Cannot infer n: average group size (%d) is smaller than '
                'k (%d).' % (avg_group_size, k))
        f = math.factorial
        # Floor division keeps n an int under Python 3 as well; the binomial
        # coefficient is always a whole number, so nothing is lost.
        n = f(avg_group_size) // (f(k) * f(avg_group_size - k))

    # Dequeue one and split it into group
    group_fps = tf.train.string_input_producer(group_csvs).dequeue()
    group_fps = tf.string_split([group_fps], ',').values
    group_size = tf.shape(group_fps)[0]
    tf.summary.histogram('group_size', group_size)

    # Select some at random
    # TODO: Should be some way to sample without replacement here rather
    # than manually filtering
    tuple_ids = tf.random_uniform(
        [n, k], minval=0, maxval=group_size, dtype=tf.int32)

    # Count num tuples enqueued
    ntotal = tf.Variable(0)
    tf.summary.scalar('tuples_ntotal', ntotal)
    add_total = tf.assign_add(ntotal, n)

    # Filter duplicates if sampling tuples without replacement
    if not with_replacement and k > 1:
        # Find unique tuples. Inequality is symmetric, so each unordered
        # pair of positions only needs to be compared once.
        tuple_unique = tf.ones([n], tf.bool)
        for i in range(k):
            for j in range(i + 1, k):
                pair_unique = tf.not_equal(tuple_ids[:, i], tuple_ids[:, j])
                tuple_unique = tf.logical_and(tuple_unique, pair_unique)

        # Filter tuples with duplicates
        valid_tuples = tf.where(tuple_unique)[:, 0]

        # Count num valid tuples enqueued
        nvalid = tf.Variable(0)
        tf.summary.scalar('tuples_nvalid', nvalid)
        tf.summary.scalar(
            'tuples_valid_ratio',
            tf.cast(nvalid, tf.float32) / tf.cast(ntotal, tf.float32))
        add_valid = tf.assign_add(nvalid, tf.shape(valid_tuples)[0])

        # Gather valid ids
        with tf.control_dependencies([add_valid]):
            tuple_ids = tf.gather(tuple_ids, valid_tuples)

    # Gather valid tuples
    with tf.control_dependencies([add_total]):
        tuples = tf.gather(group_fps, tuple_ids)

    # Make batches
    tuple_q = tf.RandomShuffleQueue(
        capacity, min_after_dequeue, tuples.dtype, [k])
    tuple_enq = tuple_q.enqueue_many(tuples)
    tf.train.add_queue_runner(
        tf.train.QueueRunner(tuple_q, [tuple_enq] * nthreads))

    tf.summary.scalar('tuples_queue_size', tuple_q.size())

    return tuple_q.dequeue()
reader = tf.TextLineReader(skip_header_lines=1) key, value = reader.read(filename_queue) x1, x2, target = tf.decode_csv(value, record_defaults=[[-1.], [-1.], [-1]]) features = tf.stack([x1, x2]) enqueue_instance = instance_queue.enqueue([features, target]) return enqueue_instance filename_queue = tf.FIFOQueue(capacity=10, dtypes=[tf.string], shapes=[()]) filename = tf.placeholder(tf.string) enqueue_filename = filename_queue.enqueue([filename]) close_filename_queue = filename_queue.close() instance_queue = tf.RandomShuffleQueue(capacity=10, min_after_dequeue=2, dtypes=[tf.float32, tf.int32], shapes=[[2], []], name="instance_q", shared_name="shared_instance_q") minibatch_instances, minibatch_targets = instance_queue.dequeue_up_to(2) read_and_enqueue_ops = [ read_and_push_instance(filename_queue, instance_queue) for i in range(5) ] queue_runner = tf.train.QueueRunner(instance_queue, read_and_enqueue_ops) with tf.Session() as sess: sess.run(enqueue_filename, feed_dict={filename: "my_test.csv"}) sess.run(close_filename_queue) coord = tf.train.Coordinator() enqueue_threads = queue_runner.create_threads(sess,