Example #1
 def device_minibatches(self, total_batch_size):
     record_input = data_flow_ops.RecordInput(
         file_pattern=os.path.join(FLAGS.data_dir, '%s-*' % self.subset),
         parallelism=64,
         # Note: This causes deadlock during init if larger than dataset
         buffer_size=FLAGS.input_buffer_size,
         batch_size=total_batch_size)
     records = record_input.get_yield_op()
     # Split batch into individual images
     records = tf.split(records, total_batch_size, 0)
     records = [tf.reshape(record, []) for record in records]
     # Deserialize and preprocess images into batches for each device
     images = defaultdict(list)
     labels = defaultdict(list)
     with tf.name_scope('input_pipeline'):
         for i, record in enumerate(records):
             imgdata, label, bbox, text = deserialize_image_record(record)
             image = self.preprocess(imgdata, bbox, thread_id=i)
             label -= 1  # Change to 0-based (don't use background class)
             device_num = i % self.num_devices
             images[device_num].append(image)
             labels[device_num].append(label)
         # Stack images back into a sub-batch for each device
         for device_num in range(self.num_devices):
             images[device_num] = tf.parallel_stack(images[device_num])
             labels[device_num] = tf.concat(labels[device_num], 0)
             images[device_num] = tf.reshape(
                 images[device_num], [-1, self.height, self.width, 3])
             images[device_num] = tf.clip_by_value(images[device_num], 0.,
                                                   255.)
             images[device_num] = tf.cast(images[device_num], self.dtype)
     return images, labels
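Example #1 shows the skeleton that recurs throughout this page: RecordInput yields a string tensor of shape [batch_size], which is split into individual serialized records, parsed and preprocessed one by one, and reassembled with tf.parallel_stack. A minimal, self-contained sketch of that skeleton follows; it assumes TF 1.x graph mode, a hypothetical /tmp/tfrecords directory of shards, and hypothetical 'image'/'label' feature keys, so it illustrates the pattern rather than reproducing code from any of the quoted projects.
import os
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops

data_dir = '/tmp/tfrecords'   # hypothetical shard directory
batch_size = 32

record_input = data_flow_ops.RecordInput(
    file_pattern=os.path.join(data_dir, 'train-*'),
    parallelism=16,
    buffer_size=1000,          # keep <= number of records to avoid the init deadlock noted above
    batch_size=batch_size)
records = record_input.get_yield_op()            # [batch_size] serialized records
records = tf.split(records, batch_size, 0)       # batch_size tensors of shape [1]
records = [tf.reshape(r, []) for r in records]   # scalar strings

def parse_and_preprocess(serialized):
    # Feature names are assumptions; adapt them to the actual TFRecord schema.
    feats = tf.parse_single_example(
        serialized,
        features={'image': tf.FixedLenFeature([], tf.string),
                  'label': tf.FixedLenFeature([], tf.int64)})
    image = tf.image.decode_jpeg(feats['image'], channels=3)
    image = tf.image.resize_images(tf.cast(image, tf.float32), [224, 224])
    return image, tf.cast(feats['label'], tf.int32)

parsed = [parse_and_preprocess(r) for r in records]
images = tf.parallel_stack([img for img, _ in parsed])   # [batch_size, 224, 224, 3]
labels = tf.stack([lbl for _, lbl in parsed])            # [batch_size]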
Example #2
    def minibatch(self):
        with tf.name_scope('batch_processing'):
            images = []

            record_input = data_flow_ops.RecordInput(
                file_pattern=dp.tf_records(
                    self.data_set,
                    '{0}_crop_patch_full_{1}'.format(self.file_format,
                                                     self.region)),
                seed=301,
                parallelism=64,
                buffer_size=5000,
                shift_ratio=0.2,
                batch_size=self.batch_size,
                name='record_input')
            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]
            for i in xrange(self.batch_size):
                value = records[i]
                image, dim = self._parse_example_proto(value, i)
                image = tf.image.resize_images(image, self.image_shape)
                image = self.distort_image(image)
                image = image[:, :, 0]
                images.append(image)
            images = tf.parallel_stack(images)

            images = tf.reshape(images,
                                shape=[
                                    self.batch_size, self.image_shape[0],
                                    self.image_shape[1], -1
                                ])
            return images
Example #3
 def testEmptyGlob(self):
   with self.cached_session() as sess:
     record_input = data_flow_ops.RecordInput(file_pattern="foo")
     yield_op = record_input.get_yield_op()
     self.evaluate(variables.global_variables_initializer())
     with self.assertRaises(NotFoundError):
       self.evaluate(yield_op)
Example #4
    def minibatch(self):
        with tf.name_scope('batch_processing'):
            images = []
            bboxes = []

            record_input = data_flow_ops.RecordInput(
                file_pattern=dp.tf_records(
                    self.data_set, '{0}_keypoint'.format(self.keypoint.name)),
                seed=301,
                parallelism=64,
                buffer_size=2000,
                shift_ratio=0.2,
                batch_size=self.batch_size,
                name='record_input')
            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]
            for i in xrange(self.batch_size):
                value = records[i]
                image, bbox = self._parse_example_proto(value, i)
                bbox = bbox[:4]
                image, bbox = self.distort_image(image, bbox)
                image -= tf.reduce_mean(image, axis=[0, 1])
                images.append(image)
                bboxes.append(bbox)
            images = tf.parallel_stack(images)

            images = tf.reshape(images,
                                shape=[
                                    self.batch_size, self.image_shape[0],
                                    self.image_shape[1], -1
                                ])
            bboxes = tf.reshape(bboxes, (self.batch_size, 1, 4))

            return images, bboxes
Example #5
  def testRecordInputEpochs(self):
    files = 100
    records_per_file = 100
    batches = 2
    with self.cached_session() as sess:
      self.generateTestData("basic", files, records_per_file)

      records = data_flow_ops.RecordInput(
          file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
          parallelism=2,
          buffer_size=2000,
          batch_size=1,
          shift_ratio=0.33,
          seed=10,
          name="record_input",
          batches=batches)

      yield_op = records.get_yield_op()

      # cycle over 3 epochs and make sure we never duplicate
      for _ in range(3):
        epoch_set = set()
        for _ in range(int(files * records_per_file / batches)):
          op_list = self.evaluate(yield_op)
          self.assertEqual(len(op_list), batches)
          for r in op_list:
            self.assertNotIn(r[0], epoch_set)
            epoch_set.add(r[0])
Example #6
    def device_minibatches(cls,
                           num_devices,
                           data_dir,
                           total_batch_size,
                           height,
                           width,
                           distort_color,
                           val=False):
        dtype = tf.float32
        subset = 'validation' if val else 'train'

        nrecord = get_num_records(os.path.join(data_dir,
                                               '{}-*'.format(subset)))
        input_buffer_size = min(10000, nrecord)

        record_input = data_flow_ops.RecordInput(
            file_pattern=os.path.join(data_dir, '{}-*'.format(subset)),
            parallelism=64,
            # Note: This causes deadlock during init if
            # larger than dataset
            buffer_size=input_buffer_size,
            batch_size=total_batch_size,
            seed=0)

        records = record_input.get_yield_op()

        # Split batch into individual images
        records = tf.split(records, total_batch_size, 0)
        records = [tf.reshape(record, []) for record in records]
        # Deserialize and preprocess images into batches for each device
        images = defaultdict(list)
        labels = defaultdict(list)
        with tf.name_scope('input_pipeline'):
            for thread_id, record in enumerate(records):
                imgdata, label, bbox, _ = cls._deserialize_image_record(record)
                image = cls._preprocess(imgdata,
                                        bbox,
                                        thread_id,
                                        height,
                                        width,
                                        distort_color,
                                        val=val)
                label -= 1  # Change to 0-based (don't use background class)
                device_num = thread_id % num_devices
                images[device_num].append(image)
                labels[device_num].append(label)

            # Stack images back into a sub-batch for each device
            for device_num in xrange(num_devices):
                images[device_num] = tf.parallel_stack(images[device_num])
                labels[device_num] = tf.concat(labels[device_num], 0)
                images[device_num] = tf.reshape(images[device_num],
                                                [-1, height, width, 3])
                images[device_num] = tf.clip_by_value(images[device_num], 0.,
                                                      255.)
                images[device_num] = tf.cast(images[device_num], dtype)

        return images, labels, nrecord
Example #7
def input_fn(tf_glob,
             one_hot=True,
             classes=None,
             is_training=None,
             batch_shape=[32, 224, 224, 3],
             parallelism=1):
    """ Return tensor to read from TFRecord """
    print('Creating graph for loading %s TFRecords...' % tf_glob)
    with tf.variable_scope("TFRecords"):
        record_input = data_flow_ops.RecordInput(tf_glob,
                                                 batch_size=batch_shape[0],
                                                 parallelism=parallelism)
        records_op = record_input.get_yield_op()
        records_op = tf.split(records_op, batch_shape[0], 0)
        records_op = [tf.reshape(record, []) for record in records_op]
        progbar = Progbar(len(records_op))

        images = []
        labels = []
        for i, serialized_example in enumerate(records_op):
            progbar.update(i)
            with tf.variable_scope("parse_images", reuse=True):
                features = tf.parse_single_example(
                    serialized_example,
                    features={
                        'image': tf.FixedLenFeature([], tf.string),
                        'label': tf.FixedLenFeature([], tf.int64),
                    })
                image_decoded = tf.image.decode_jpeg(features['image'],
                                                     channels=3)
                image = tf.image.convert_image_dtype(image_decoded, tf.float32)
                resized_image = tf.image.resize_images(
                    image, [batch_shape[1], batch_shape[2]])
                label = tf.cast(features['label'], tf.int32)
                if one_hot and classes:
                    label = tf.one_hot(label, classes)

                images.append(resized_image)
                labels.append(label)

        images = tf.parallel_stack(images, 0)
        labels = tf.parallel_stack(labels, 0)
        #         images = tf.cast(images, tf.float32)

        #         images = tf.reshape(images, shape=batch_shape)

        # StagingArea will store tensors
        # across multiple steps to
        # speed up execution
        images_shape = images.get_shape()
        labels_shape = labels.get_shape()
        copy_stage = data_flow_ops.StagingArea(
            [tf.float32, tf.float32], shapes=[images_shape, labels_shape])
        copy_stage_op = copy_stage.put([images, labels])
        staged_images, staged_labels = copy_stage.get()

        return images, labels
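Example #7 (and the near-identical Example #9 below) builds a StagingArea but returns the unstaged tensors: copy_stage_op is never run and the result of copy_stage.get() is discarded, so the staging has no effect as written. Below is a minimal sketch of the double-buffering pattern the StagingArea is meant for, assuming TF 1.x graph mode and stand-in tensors in place of the decoded batch.
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops

# Stand-ins for the decoded batch; shapes and dtypes here are assumptions.
images = tf.random_uniform([32, 224, 224, 3])
labels = tf.random_uniform([32], maxval=1000, dtype=tf.int32)

stage = data_flow_ops.StagingArea([tf.float32, tf.int32],
                                  shapes=[images.shape, labels.shape])
put_op = stage.put([images, labels])
staged_images, staged_labels = stage.get()
loss = tf.reduce_mean(staged_images)   # placeholder for the real model and loss

with tf.Session() as sess:
    sess.run(put_op)                    # prefill the staging buffer once
    for _ in range(10):
        # Running put_op alongside the consumer overlaps the input copy for
        # the next step with the compute of the current step.
        sess.run([loss, put_op])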
Example #8
    def minibatch(self,
                  dataset,
                  subset,
                  use_datasets,
                  cache_data,
                  shift_ratio=-1):
        if shift_ratio < 0:
            shift_ratio = self.shift_ratio
        with tf.name_scope('batch_processing'):
            # Build final results per split.
            images = [[] for _ in range(self.num_splits)]
            labels = [[] for _ in range(self.num_splits)]
            if use_datasets:
                ds = data_utils.create_dataset(self.batch_size,
                                               self.num_splits,
                                               self.batch_size_per_split,
                                               self.parse_and_preprocess,
                                               dataset, subset, self.train,
                                               cache_data)
                ds_iterator = data_utils.create_iterator(ds)
                for d in xrange(self.num_splits):
                    labels[d], images[d] = ds_iterator.get_next()

            else:
                record_input = data_flow_ops.RecordInput(
                    file_pattern=dataset.tf_record_pattern(subset),
                    seed=301,
                    parallelism=64,
                    buffer_size=10000,
                    batch_size=self.batch_size,
                    shift_ratio=shift_ratio,
                    name='record_input')
                records = record_input.get_yield_op()
                records = tf.split(records, self.batch_size, 0)
                records = [tf.reshape(record, []) for record in records]
                for idx in xrange(self.batch_size):
                    value = records[idx]
                    (label, image) = self.parse_and_preprocess(value, idx)
                    split_index = idx % self.num_splits
                    labels[split_index].append(label)
                    images[split_index].append(image)

            for split_index in xrange(self.num_splits):
                if not use_datasets:
                    images[split_index] = tf.parallel_stack(
                        images[split_index])
                    labels[split_index] = tf.concat(labels[split_index], 0)
                images[split_index] = tf.reshape(images[split_index],
                                                 shape=[
                                                     self.batch_size_per_split,
                                                     self.height, self.width,
                                                     self.depth
                                                 ])
                labels[split_index] = tf.reshape(labels[split_index],
                                                 [self.batch_size_per_split])
            return images, labels
Example #9
def read_and_decode_recordinput(tf_glob, one_hot=True, classes=None, is_train=None,
                                batch_shape=[1000, 28, 28, 1], parallelism=1):
    """ Return tensor to read from TFRecord """
    print('Creating graph for loading %s TFRecords...' % tf_glob)
    with tf.variable_scope("TFRecords"):
        record_input = data_flow_ops.RecordInput(
            tf_glob, batch_size=batch_shape[0], parallelism=parallelism)
        records_op = record_input.get_yield_op()
        records_op = tf.split(records_op, batch_shape[0], 0)
        records_op = [tf.reshape(record, []) for record in records_op]
        progbar = Progbar(len(records_op))

        images = []
        labels = []
        for i, serialized_example in enumerate(records_op):
            progbar.update(i)
            with tf.variable_scope("parse_images", reuse=True):
                features = tf.parse_single_example(
                    serialized_example,
                    features={
                        'label': tf.FixedLenFeature([], tf.int64),
                        'image_raw': tf.FixedLenFeature([], tf.string),
                    })
                img = tf.decode_raw(features['image_raw'], tf.uint8)
                img.set_shape(batch_shape[1] * batch_shape[2])
                img = tf.reshape(img, [1] + batch_shape[1:])

                img = tf.cast(img, tf.float32) * (1. / 255) - 0.5

                label = tf.cast(features['label'], tf.int32)
                if one_hot and classes:
                    label = tf.one_hot(label, classes)

                images.append(img)
                labels.append(label)

        images = tf.parallel_stack(images, 0)
        labels = tf.parallel_stack(labels, 0)
        images = tf.cast(images, tf.float32)

        images = tf.reshape(images, shape=batch_shape)

        # StagingArea will store tensors
        # across multiple steps to
        # speed up execution
        images_shape = images.get_shape()
        labels_shape = labels.get_shape()
        copy_stage = data_flow_ops.StagingArea(
            [tf.float32, tf.float32],
            shapes=[images_shape, labels_shape])
        copy_stage_op = copy_stage.put(
            [images, labels])
        staged_images, staged_labels = copy_stage.get()
        print(images, labels)
        return images, labels
Example #10
  def testRecordInputSimple(self):
    with self.cached_session() as sess:
      self.generateTestData("basic", 1, 1)

      yield_op = data_flow_ops.RecordInput(
          file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
          parallelism=1,
          buffer_size=1,
          batch_size=1,
          name="record_input").get_yield_op()

      self.assertEqual(self.evaluate(yield_op), b"0000000000")
Example #11
File: input.py Project: eskilj/ml
    def minibatch(self, dataset, subset):
        with tf.name_scope('batch_processing'):
            images = [[] for _ in range(self.device_count)]
            labels = [[] for _ in range(self.device_count)]

            # The RecordInput Op will continuously read a batch of records asynchronously
            # into a buffer of some fixed capacity (source: TF Docs)
            record_input = data_flow_ops.RecordInput(
                file_pattern=dataset.tf_record_pattern(subset),
                seed=301,
                parallelism=64,
                buffer_size=10000,
                batch_size=self.batch_size,
                name='record_input')

            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]

            for i in range(self.batch_size):
                value = records[i]
                image_buffer, label_index, bbox, _ = parse_example_proto(value)
                image = self.preprocess(image_buffer, bbox, i % 4)
                device_index = i % self.device_count
                images[device_index].append(image)
                labels[device_index].append(label_index)

            label_index_batch = [None] * self.device_count

            for device_index in range(self.device_count):
                images[device_index] = tf.parallel_stack(images[device_index])
                label_index_batch[device_index] = tf.concat(
                    labels[device_index], 0)

                images[device_index] = tf.cast(images[device_index],
                                               self.dtype)
                depth = 3
                images[device_index] = tf.reshape(
                    images[device_index],
                    shape=[
                        self.batch_size_per_device, self.height, self.width,
                        depth
                    ])

                label_index_batch[device_index] = tf.reshape(
                    label_index_batch[device_index],
                    [self.batch_size_per_device])

            return images, label_index_batch
Example #12
  def testRecordInputSimpleZlib(self):
    with self.test_session() as sess:
      self.generateTestData("basic", 1, 1,
          compression_type=tf_record.TFRecordCompressionType.ZLIB)

      yield_op = data_flow_ops.RecordInput(
          file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
          parallelism=1,
          buffer_size=1,
          batch_size=1,
          name="record_input",
          compression_type=
              tf_record.TFRecordCompressionType.ZLIB).get_yield_op()

      self.assertEqual(sess.run(yield_op), b"0000000000")
Example #13
  def testDoesNotDeadlock(self):
    # Iterate multiple times to cause deadlock if there is a chance it can occur
    for _ in range(30):
      with self.cached_session() as sess:
        self.generateTestData("basic", 1, 1)

        records = data_flow_ops.RecordInput(
            file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
            parallelism=1,
            buffer_size=100,
            batch_size=1,
            name="record_input")

        yield_op = records.get_yield_op()
        for _ in range(50):
          self.evaluate(yield_op)
Example #14
    def record_input_batch(self,
                           pre_process_func=None,
                           seed=301,
                           parallelism=64,
                           buffer_size=10000,
                           batch_size=32,
                           cols=None):
        """使用RecordInput从TFRecord随机读取一个batch的数据

        :param pre_process_func: 预处理函数。为None时不进行预处理。预处理函数接收的参数数目需要和len(cols)相同,返回参数数目不限制
        :param seed: 随机种子
        :param parallelism: 并发数
        :param buffer_size: The maximum number of records the buffer will contain.
        :param batch_size: 一次返回多少records
        :param cols: 要返回TFRecord中的哪些feature。get_keys()函数返回值得子集。
        :return: 一个batch的数据
        """
        if cols is None:
            cols = self.get_keys()
        record_input = data_flow_ops.RecordInput(  # return : A tensor of shape [batch_size].
            file_pattern=self.pattern,
            seed=seed,
            parallelism=parallelism,
            buffer_size=buffer_size,
            batch_size=batch_size,
            name='record_input')
        records = record_input.get_yield_op()
        records = tf.split(records, batch_size, 0)
        records = [tf.reshape(record, []) for record in records]
        batch_examples = [[] for _ in range(len(cols))]
        keys = self.get_keys()
        types = self.get_types()
        for i in range(batch_size):
            value = records[i]
            cols_types = [types[keys.index(col)]
                          for col in cols]  # dtypes corresponding to cols
            features = _parse_single_example_proto_cols_closure(
                cols_types, cols)(value)

            if pre_process_func is not None:  # apply preprocessing only if provided
                features = pre_process_func(*features)
                # pre_process_func may return a single Tensor; convert it to a sequence
                if not isinstance(features, collections.Sequence):
                    features = (features, )
            for j, feature in enumerate(features):
                batch_examples[j].append(feature)
        return batch_examples
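The method above returns one Python list of tensors per requested column, with one entry per record in the batch. A hypothetical usage sketch follows; `reader` is assumed to be an instance of the class that defines record_input_batch, and the column names and dtypes ('image' as a numeric tensor, 'label' as an integer) are assumptions rather than values from the original project.
import tensorflow as tf

def scale(image, label):
    # Receives one tensor per requested column, in the order given by `cols`.
    return tf.cast(image, tf.float32) * (1.0 / 255.0), label

image_list, label_list = reader.record_input_batch(
    pre_process_func=scale,
    batch_size=64,
    cols=['image', 'label'])
images = tf.stack(image_list)   # [64, ...]
labels = tf.stack(label_list)   # [64]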
Example #15
    def minibatch(self, dataset, subset):
        with tf.name_scope('batch_processing'):
            images = [[] for i in range(self.device_count)]
            labels = [[] for i in range(self.device_count)]
            filenames = [[] for i in range(self.device_count)]
            record_input = data_flow_ops.RecordInput(
                file_pattern=dataset.tf_record_pattern(subset),
                seed=randint(0, 9000),
                parallelism=64,
                buffer_size=10000,
                batch_size=self.batch_size,
                name='record_input')
            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]
            for i in xrange(self.batch_size):
                value = records[i]
                image_buffer, label_index, bbox, _, filename = parse_example_proto(
                    value)
                image = self.preprocess(image_buffer, bbox, i % 4)
                device_index = i % self.device_count
                images[device_index].append(image)
                labels[device_index].append(label_index)
                filenames[device_index].append(filename)
            label_index_batch = [None] * self.device_count
            for device_index in xrange(self.device_count):
                images[device_index] = tf.parallel_stack(images[device_index])
                label_index_batch[device_index] = tf.concat(
                    labels[device_index], 0)

                # dynamic_pad=True) # HACK TESTING dynamic_pad=True
                images[device_index] = tf.cast(images[device_index],
                                               self.dtype)
                depth = 3
                images[device_index] = tf.reshape(
                    images[device_index],
                    shape=[
                        self.batch_size_per_device, self.height, self.width,
                        depth
                    ])
                label_index_batch[device_index] = tf.reshape(
                    label_index_batch[device_index],
                    [self.batch_size_per_device])
                # Display the training images in the visualizer.
                # tf.summary.image('images', images)

            return images, label_index_batch, filenames
Example #16
    def minibatch(self):
        """
        Returns minibatch of images and labels from TF records file.
        """
        mode = self.mode
        batch_size = self.params['batch_size']
        if mode not in ['train', 'validation', 'test']:
            mode = 'train'

        if self.debug: self.inspect_tfrecords(mode)

        record_input = data_flow_ops.RecordInput(
            file_pattern=os.path.join(self.params['data_dir'], '*.tfrecords'),
            parallelism=self.params['IO_threads'],
            buffer_size=self.params['buffer_cap'],
            batch_size=batch_size)
        records = record_input.get_yield_op()

        # Split batch into individual images
        records = tf.split(records, batch_size, 0)
        records = [tf.reshape(record, []) for record in records]
        #print('record contents %s' %(format(records)))
        #print('record length %s and contents %s' %(len(records),format(records)))
        # Deserialize and preprocess images into batches for each device
        images = []
        labels = []
        with tf.name_scope('input_pipeline'):
            if self.params[mode + '_distort']:
                print_rank('images will be distorted')

            for i, record in enumerate(records):
                image, label = self.decode_image_label(record)
                if self.params[mode + '_distort']:
                    # image = self.add_noise_image(image)
                    image = self.distort(image)
                images.append(image)
                labels.append(label)
                image_shape = image.get_shape().as_list()
                label_shape = label.get_shape().as_list()
            # Stack images and labels back into a single tensor
            labels = tf.parallel_stack(labels)
            images = tf.parallel_stack(images)

            # reshape them to the expected shape:
            labels_newshape = [batch_size] + label_shape
            images_newshape = [batch_size] + image_shape
            labels = tf.reshape(labels, labels_newshape)
            images = tf.reshape(images, images_newshape)

            # glimpse images: moved to GPU
            #images = self.get_glimpses(images)

            # Display the training images in the Tensorboard visualizer.
            if self.debug: tf.summary.image("images", images, max_outputs=4)

            # resize
            if self.params['resize']:
                images = tf.image.resize_bilinear(images, [
                    self.params['RESIZE_WIDTH'], self.params['RESIZE_HEIGHT']
                ])
            if self.params['tile']:
                images = tf.ones([
                    self.params['IMAGE_DEPTH'], self.params['IMAGE_HEIGHT'],
                    self.params['IMAGE_WIDTH']
                ],
                                 dtype=self.params['IMAGE_DTYPE'])
                labels = tf.ones([256, 512, 512],
                                 dtype=self.params['LABEL_DTYPE'])

        return images, labels
Example #17
  def minibatch(self, dataset, subset, use_datasets, shift_ratio=-1):
    if shift_ratio < 0:
      shift_ratio = self.shift_ratio
    with tf.name_scope('batch_processing'):
      # Build final results per split.
      images = [[] for _ in range(self.num_splits)]
      labels = [[] for _ in range(self.num_splits)]
      if use_datasets:
        glob_pattern = dataset.tf_record_pattern(subset)
        file_names = gfile.Glob(glob_pattern)
        if not file_names:
          raise ValueError('Found no files in --data_dir matching: {}'
                           .format(glob_pattern))
        ds = tf.contrib.data.TFRecordDataset(file_names)
        counter = tf.contrib.data.Dataset.range(self.batch_size)
        counter = counter.repeat()
        ds = tf.contrib.data.Dataset.zip((ds, counter))
        ds = ds.map(
            self.parse_and_preprocess,
            num_parallel_calls=self.batch_size)
        ds = ds.prefetch(buffer_size=self.batch_size)
        ds = ds.shuffle(buffer_size=10000)
        ds = ds.repeat()
        ds_iterator = ds.make_one_shot_iterator()
        # TODO(jsimsa): Use datasets' batch transformation instead of (see
        # below) once the transformation implements parallel data copy.
        #
        # NOTE: The current implementation does not preserve the order of
        # elements between the shuffle buffer and the batch.
        for idx in xrange(self.batch_size):
          label, image = ds_iterator.get_next()
          split_index = idx % self.num_splits
          labels[split_index].append(label)
          images[split_index].append(image)

      else:
        record_input = data_flow_ops.RecordInput(
            file_pattern=dataset.tf_record_pattern(subset),
            seed=301,
            parallelism=64,
            buffer_size=10000,
            batch_size=self.batch_size,
            shift_ratio=shift_ratio,
            name='record_input')
        records = record_input.get_yield_op()
        records = tf.split(records, self.batch_size, 0)
        records = [tf.reshape(record, []) for record in records]
        for idx in xrange(self.batch_size):
          value = records[idx]
          (label, image) = self.parse_and_preprocess(value, idx)
          split_index = idx % self.num_splits
          labels[split_index].append(label)
          images[split_index].append(image)

      for split_index in xrange(self.num_splits):
        images[split_index] = tf.parallel_stack(images[split_index])
        labels[split_index] = tf.concat(labels[split_index], 0)
        images[split_index] = tf.cast(images[split_index], self.dtype)
        depth = 3
        images[split_index] = tf.reshape(
            images[split_index],
            shape=[self.batch_size_per_split, self.height, self.width, depth])
        labels[split_index] = tf.reshape(labels[split_index],
                                         [self.batch_size_per_split])
      return images, labels
Example #18
    def minibatch(self, dataset, subset, use_data_sets):
        with tf.name_scope('batch_processing'):
            images = [[] for _ in range(self.num_splits)]
            labels = [[] for _ in range(self.num_splits)]
            if use_data_sets:
                file_names = glob.glob(dataset.tf_record_pattern(subset))
                ds = tf.contrib.data.TFRecordDataset(file_names)
                counter = tf.contrib.data.Dataset.range(self.batch_size)
                counter = counter.repeat()
                ds = tf.contrib.data.Dataset.zip((ds, counter))
                ds = ds.map(self.parse_and_preprocess,
                            num_parallel_calls=self.batch_size,
                            output_buffer_size=self.batch_size)
                ds = ds.shuffle(buffer_size=10000)
                ds = ds.repeat()
                ds = ds.batch(batch_size=(self.batch_size // self.num_splits))
                ds_iterator = ds.make_one_shot_iterator()

                for d in xrange(self.num_splits):
                    labels[d], images[d] = ds_iterator.get_next()

            else:
                # Build final results per split.
                record_input = data_flow_ops.RecordInput(
                    file_pattern=dataset.tf_record_pattern(subset),
                    seed=301,
                    parallelism=64,
                    buffer_size=10000,
                    batch_size=self.batch_size,
                    shift_ratio=self.shift_ratio,
                    name='record_input')
                records = record_input.get_yield_op()
                records = tf.split(records, self.batch_size, 0)
                records = [tf.reshape(record, []) for record in records]
                for idx in xrange(self.batch_size):
                    value = records[idx]
                    (label_index,
                     image) = self.parse_and_preprocess(value, idx)
                    split_index = idx % self.num_splits
                    images[split_index].append(image)
                    labels[split_index].append(label_index)

            label_index_batch = [None] * self.num_splits
            for split_index in xrange(self.num_splits):
                if use_data_sets:
                    label_index_batch[split_index] = labels[split_index]
                else:
                    images[split_index] = tf.parallel_stack(
                        images[split_index])
                    label_index_batch[split_index] = tf.concat(
                        labels[split_index], 0)
                images[split_index] = tf.cast(images[split_index], self.dtype)
                depth = 3
                images[split_index] = tf.reshape(images[split_index],
                                                 shape=[
                                                     self.batch_size_per_split,
                                                     self.height, self.width,
                                                     depth
                                                 ])
                label_index_batch[split_index] = tf.reshape(
                    label_index_batch[split_index],
                    [self.batch_size_per_split])

            return images, label_index_batch
Example #19
    def minibatch(self, file_pattern):
        with tf.name_scope('batch_processing'):
            output_data = [[] for i in range(self.device_count)]
            labels = [[] for i in range(self.device_count)]
            record_input = data_flow_ops.RecordInput(
                file_pattern=file_pattern,
                seed=301,
                parallelism=64,
                buffer_size=10000,
                batch_size=self.batch_size,
                name='record_input')
            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]
            for i in xrange(self.batch_size):
                value = records[i]
                data_buffer, label_index, _, frames = self.parse_example_proto(
                    value)

                processed_data = self.preprocess(data_buffer, frames)

                device_index = i % self.device_count
                output_data[device_index].append(processed_data)
                labels[device_index].append(label_index)
            label_index_batch = [None] * self.device_count
            for device_index in xrange(self.device_count):
                output_data[device_index] = tf.parallel_stack(
                    output_data[device_index])
                label_index_batch[device_index] = tf.concat(
                    labels[device_index], 0)

                # dynamic_pad=True) # HACK TESTING dynamic_pad=True
                output_data[device_index] = tf.cast(output_data[device_index],
                                                    self.dtype)
                if self.data_type == 'rgb':
                    depth = 3
                    output_data[device_index] = tf.reshape(
                        output_data[device_index],
                        shape=[
                            self.batch_size_per_device, self.time_window,
                            self.cropped_size[0], self.cropped_size[1], depth
                        ])
                    # shape=[self.batch_size_per_device, -1, self.cropped_size[0], self.cropped_size[1], depth])
                elif self.data_type == 'flow':
                    depth = 2
                    output_data[device_index] = tf.reshape(
                        output_data[device_index],
                        shape=[
                            self.batch_size_per_device, self.time_window,
                            self.cropped_size[0], self.cropped_size[1], depth
                        ])
                    # shape=[self.batch_size_per_device, -1, self.cropped_size[0], self.cropped_size[1], depth])
                # elif self.data_type is 'audio':
                # TBD
                else:
                    raise ValueError('data_type error, got: {}'.format(self.data_type))
                label_index_batch[device_index] = tf.reshape(
                    label_index_batch[device_index],
                    [self.batch_size_per_device])
                # Display the training images in the visualizer.
                # tf.summary.image('images', images)

            return output_data, label_index_batch
Example #20
    def minibatch(self):
        with tf.name_scope('batch_processing'):
            images = []
            bboxes = []
            labels = []
            slcs = []

            record_input = data_flow_ops.RecordInput(
                file_pattern=dp.tf_records(
                    self.data_set, '{}_localization'.format(self.region)),
                seed=301,
                parallelism=64,
                buffer_size=5000,
                shift_ratio=0.2,
                batch_size=self.batch_size,
                name='record_input')
            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]
            for i in xrange(self.batch_size):
                value = records[i]
                image, bbox, dim, label, slc = self._parse_example_proto(
                    value, i)
                image = tf.image.resize_images(image, self.image_shape)
                bbox = tf.cast(
                    tf.cast(bbox, tf.float32) * float(IMAGE_LENGTH) /
                    tf.cast(dim, tf.float32), tf.int64)
                image, bbox = self.distort_image(image, bbox)
                unmasked_image = image
                bbox = tf.cast(bbox, tf.int32)
                bbox_mask = tf.ones((bbox[3] - bbox[1], bbox[2] - bbox[0]))
                bbox_mask_left = tf.zeros((bbox[3] - bbox[1], bbox[0]))
                bbox_mask_right = tf.zeros(
                    (bbox[3] - bbox[1], IMAGE_LENGTH - bbox[2]))
                bbox_mask_top = tf.zeros((bbox[1], IMAGE_LENGTH))
                bbox_mask_bottom = tf.zeros(
                    (IMAGE_LENGTH - bbox[3], IMAGE_LENGTH))
                bbox_mask = tf.concat(
                    (bbox_mask_left, bbox_mask, bbox_mask_right), axis=1)
                bbox_mask = tf.concat(
                    (bbox_mask_top, bbox_mask, bbox_mask_bottom), axis=0)
                bbox_mask *= 150
                image = image[:, :, 0]
                image -= self.mean_subtract
                image += bbox_mask
                image = tf.minimum(image, 255)
                bbox_mask.set_shape((IMAGE_LENGTH, IMAGE_LENGTH))
                images.append(image)
                bbox = tf.cast(bbox, tf.int64)
                bboxes.append(bbox)
                labels.append(label)
                slcs.append(slc)
            images = tf.parallel_stack(images)

            images = tf.reshape(images,
                                shape=[
                                    self.batch_size, self.image_shape[0],
                                    self.image_shape[1], -1
                                ])
            bboxes = tf.reshape(bboxes, (self.batch_size, BOX_COUNT, 4))
            x_min, y_min, x_max, y_max = tf.split(value=tf.reshape(
                bboxes, (-1, 4)),
                                                  num_or_size_splits=4,
                                                  axis=1)
            normalized_boxes = tf.cast(
                tf.reshape(tf.stack((y_min, x_min, y_max, x_max), axis=1),
                           (self.batch_size, BOX_COUNT, 4)),
                tf.float32) / float(IMAGE_LENGTH)
            # images = tf.image.draw_bounding_boxes(images, normalized_boxes)
            labels = tf.reshape(labels, (self.batch_size, 1))
            slcs = tf.reshape(slcs, (self.batch_size, ))
            return images, labels, slcs
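The masking logic in Example #20 is easier to follow with concrete numbers. The sketch below (with an assumed IMAGE_LENGTH of 6 and bbox = [x_min, y_min, x_max, y_max] = [1, 2, 4, 5]) builds the same kind of mask: a block of ones covering the box, padded with zero strips out to the full image size.
import tensorflow as tf

IMAGE_LENGTH = 6
bbox = tf.constant([1, 2, 4, 5], dtype=tf.int32)   # x_min, y_min, x_max, y_max

inner = tf.ones((bbox[3] - bbox[1], bbox[2] - bbox[0]))         # (3, 3) box of ones
left = tf.zeros((bbox[3] - bbox[1], bbox[0]))                   # (3, 1)
right = tf.zeros((bbox[3] - bbox[1], IMAGE_LENGTH - bbox[2]))   # (3, 2)
top = tf.zeros((bbox[1], IMAGE_LENGTH))                         # (2, 6)
bottom = tf.zeros((IMAGE_LENGTH - bbox[3], IMAGE_LENGTH))       # (1, 6)

mask = tf.concat((left, inner, right), axis=1)                  # (3, 6)
mask = tf.concat((top, mask, bottom), axis=0)                   # (6, 6)

with tf.Session() as sess:
    print(sess.run(mask))   # ones inside the box, zeros everywhere else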
Example #21
    def minibatch(self,
                  dataset,
                  subset,
                  use_datasets,
                  cache_data,
                  shift_ratio=-1):
        if shift_ratio < 0:
            shift_ratio = self.shift_ratio
        with tf.name_scope('batch_processing'):
            # Build final results per split.
            images = [[] for _ in range(self.num_splits)]
            labels = [[] for _ in range(self.num_splits)]
            if use_datasets:
                glob_pattern = dataset.tf_record_pattern(subset)
                file_names = gfile.Glob(glob_pattern)
                if not file_names:
                    raise ValueError(
                        'Found no files in --data_dir matching: {}'.format(
                            glob_pattern))
                ds = tf.data.TFRecordDataset.list_files(file_names)
                ds = ds.apply(
                    interleave_ops.parallel_interleave(tf.data.TFRecordDataset,
                                                       cycle_length=10))
                if cache_data:
                    ds = ds.take(1).cache().repeat()
                counter = tf.data.Dataset.range(self.batch_size)
                counter = counter.repeat()
                ds = tf.data.Dataset.zip((ds, counter))
                ds = ds.prefetch(buffer_size=self.batch_size)
                ds = ds.shuffle(buffer_size=10000)
                ds = ds.repeat()
                ds = ds.apply(
                    batching.map_and_batch(
                        map_func=self.parse_and_preprocess,
                        batch_size=self.batch_size_per_split,
                        num_parallel_batches=self.num_splits))
                ds = ds.prefetch(buffer_size=self.num_splits)
                ds_iterator = ds.make_one_shot_iterator()
                for d in xrange(self.num_splits):
                    labels[d], images[d] = ds_iterator.get_next()

            else:
                record_input = data_flow_ops.RecordInput(
                    file_pattern=dataset.tf_record_pattern(subset),
                    seed=301,
                    parallelism=64,
                    buffer_size=10000,
                    batch_size=self.batch_size,
                    shift_ratio=shift_ratio,
                    name='record_input')
                records = record_input.get_yield_op()
                records = tf.split(records, self.batch_size, 0)
                records = [tf.reshape(record, []) for record in records]
                for idx in xrange(self.batch_size):
                    value = records[idx]
                    (label, image) = self.parse_and_preprocess(value, idx)
                    split_index = idx % self.num_splits
                    labels[split_index].append(label)
                    images[split_index].append(image)

            for split_index in xrange(self.num_splits):
                if not use_datasets:
                    images[split_index] = tf.parallel_stack(
                        images[split_index])
                    labels[split_index] = tf.concat(labels[split_index], 0)
                images[split_index] = tf.cast(images[split_index], self.dtype)
                depth = 3
                images[split_index] = tf.reshape(images[split_index],
                                                 shape=[
                                                     self.batch_size_per_split,
                                                     self.height, self.width,
                                                     depth
                                                 ])
                labels[split_index] = tf.reshape(labels[split_index],
                                                 [self.batch_size_per_split])
            return images, labels
Example #22
    def minibatch(self, dataset, subset, use_data_sets):
        with tf.name_scope('batch_processing'):
            images = [[] for i in range(self.device_count)]
            labels = [[] for i in range(self.device_count)]
            if use_data_sets:
                file_names = glob.glob(dataset.tf_record_pattern(subset))
                batch_size_per = self.batch_size // self.device_count
                num_threads = 10
                output_buffer_size = num_threads * 2000

                counter = tf.data.Dataset.range(sys.maxint)
                ds = tf.data.TFRecordDataset(file_names)
                ds = tf.data.Dataset.zip((ds, counter))
                ds = ds.map(self.parse_and_preprocess,
                            num_parallel_calls=num_threads).prefetch(
                                output_buffer_size)
                shuffle_buffer_size = 10000
                ds = ds.shuffle(shuffle_buffer_size)
                repeat_count = -1  # infinite repetition
                ds = ds.repeat(repeat_count)
                ds = ds.batch(batch_size_per)
                ds_iterator = ds.make_one_shot_iterator()

                for d in xrange(self.device_count):
                    labels[d], images[d] = ds_iterator.get_next()

            else:
                # Build final results per device.
                record_input = data_flow_ops.RecordInput(
                    file_pattern=dataset.tf_record_pattern(subset),
                    seed=301,
                    parallelism=64,
                    buffer_size=10000,
                    batch_size=self.batch_size,
                    shift_ratio=self.shift_ratio,
                    name='record_input')
                records = record_input.get_yield_op()
                records = tf.split(records, self.batch_size, 0)
                records = [tf.reshape(record, []) for record in records]
                for i in xrange(self.batch_size):
                    value = records[i]
                    (label_index,
                     image) = self.parse_and_preprocess(value, i % 4)
                    device_index = i % self.device_count
                    images[device_index].append(image)
                    labels[device_index].append(label_index)

            label_index_batch = [None] * self.device_count
            for device_index in xrange(self.device_count):
                if use_data_sets:
                    label_index_batch[device_index] = labels[device_index]
                else:
                    images[device_index] = tf.parallel_stack(
                        images[device_index])
                    label_index_batch[device_index] = tf.concat(
                        labels[device_index], 0)
                images[device_index] = tf.cast(images[device_index],
                                               self.dtype)
                depth = 3
                images[device_index] = tf.reshape(
                    images[device_index],
                    shape=[
                        self.batch_size_per_device, self.height, self.width,
                        depth
                    ])
                label_index_batch[device_index] = tf.reshape(
                    label_index_batch[device_index],
                    [self.batch_size_per_device])
                if FLAGS.summary_verbosity >= 2:
                    # Display the training images in the visualizer.
                    tf.summary.image('images', images[device_index])

            return images, label_index_batch