Beispiel #1
0
def read_dataset(file_read_func, decode_func, input_files, config):
  """Builds a record dataset from file patterns, with repeat and shuffle.

  Args:
    file_read_func: Callable handed to parallel_interleave; it turns one
      filename into a tf.data.Dataset of records.
    decode_func: Callable mapped over every record.
    input_files: A list of file patterns expanded with tf.matching_files.
    config: A input_reader_builder.InputReader object.

  Returns:
    The decoded and prefetched tf.data.Dataset.
  """
  # Expand every pattern and flatten the matches into one filename dataset.
  matches = [tf.matching_files(pattern) for pattern in input_files]
  files = tf.data.Dataset.from_tensor_slices(tf.concat(matches, 0))

  if not config.shuffle:
    if config.num_readers > 1:
      tf.logging.warning('`shuffle` is false, but the input data stream is '
                         'still slightly shuffled since `num_readers` > 1.')
  else:
    files = files.shuffle(config.filenames_shuffle_buffer_size)

  # A falsy num_epochs is passed on as None.
  epochs = config.num_epochs or None
  files = files.repeat(epochs)

  interleave = tf.contrib.data.parallel_interleave(
      file_read_func,
      cycle_length=config.num_readers,
      block_length=config.read_block_length,
      sloppy=config.shuffle)
  records = files.apply(interleave)
  if config.shuffle:
    records = records.shuffle(config.shuffle_buffer_size)

  decoded = records.map(
      decode_func, num_parallel_calls=config.num_parallel_map_calls)
  return decoded.prefetch(config.prefetch_size)
Beispiel #2
0
 def __init__(self, in_pattern, batch_size, num_buckets=0, num_epochs=None):
     """Stores the batching settings and builds the batch-fetching op.

     When `in_pattern` is a directory, '/examples.proto' is appended to it;
     otherwise `in_pattern` itself is used as the file pattern.
     """
     self._batch_size = batch_size
     self.num_buckets = num_buckets
     self._epoch = 0
     self._step = 1.
     self.num_epochs = num_epochs

     if os.path.isdir(in_pattern):
         file_pattern = in_pattern + '/examples.proto'
     else:
         file_pattern = in_pattern
     matched_files = tf.matching_files(file_pattern)
     self.next_batch_op = self.input_pipeline(
         matched_files, self._batch_size, self.num_buckets, self.num_epochs)
Beispiel #3
0
  def testMatchingFiles(self):
    """Checks tf.matching_files for exact names and for glob wildcards.

    Seven temporary files are created in the test's temp directory; the first
    six share the 'AB...DEF.GH' shape and the last ('XYZ') must never match.
    """
    cases = ['ABcDEF.GH', 'ABzDEF.GH', 'ABasdfjklDEF.GH', 'AB3DEF.GH',
             'AB4DEF.GH', 'ABDEF.GH', 'XYZ']
    files = [tempfile.NamedTemporaryFile(
        prefix=c, dir=self.get_temp_dir(), delete=True) for c in cases]

    try:
      with self.test_session():
        # Test exact match without wildcards.
        for f in files:
          self.assertEqual(tf.matching_files(f.name).eval(),
                           tf.compat.as_bytes(f.name))

        # We will look for files matching "ABxDEF.GH*" where "x" is some
        # wildcard.
        pos = files[0].name.find(cases[0])
        pattern = files[0].name[:pos] + 'AB%sDEF.GH*'

        self.assertEqual(set(tf.matching_files(pattern % 'z').eval()),
                         self._subset(files, [1]))
        self.assertEqual(set(tf.matching_files(pattern % '?').eval()),
                         self._subset(files, [0, 1, 3, 4]))
        self.assertEqual(set(tf.matching_files(pattern % '*').eval()),
                         self._subset(files, [0, 1, 2, 3, 4, 5]))
        # NOTE(mrry): Windows uses PathMatchSpec to match file patterns, which
        # does not support the following expressions.
        if os.name != 'nt':
          self.assertEqual(set(tf.matching_files(pattern % '[cxz]').eval()),
                           self._subset(files, [0, 1]))
          self.assertEqual(set(tf.matching_files(pattern % '[0-9]').eval()),
                           self._subset(files, [3, 4]))
    finally:
      # Close (and thereby delete) the temp files even when an assertion
      # above fails, so a failing run does not leave them open.
      for f in files:
        f.close()
Beispiel #4
0
  def testMatchingFiles(self):
    """Checks tf.matching_files for exact names and for glob wildcards."""
    prefixes = ['ABcDEF.GH', 'ABzDEF.GH', 'ABasdfjklDEF.GH', 'AB3DEF.GH',
                'AB4DEF.GH', 'ABDEF.GH', 'XYZ']
    temp_files = [tempfile.NamedTemporaryFile(prefix=p) for p in prefixes]

    with self.test_session():
      # A pattern without wildcards must match exactly that file.
      for temp_file in temp_files:
        matched = tf.matching_files(temp_file.name).eval()
        self.assertEqual(matched, tf.compat.as_bytes(temp_file.name))

      # Build "<dir>/AB%sDEF.GH*" from the directory part of the first file.
      first_name = temp_files[0].name
      directory = first_name[:first_name.find(prefixes[0])]
      pattern = directory + 'AB%sDEF.GH*'

      # (wildcard, indices into temp_files that are expected to match)
      expectations = (
          ('z', [1]),
          ('?', [0, 1, 3, 4]),
          ('*', [0, 1, 2, 3, 4, 5]),
          ('[cxz]', [0, 1]),
          ('[0-9]', [3, 4]),
      )
      for wildcard, indices in expectations:
        found = set(tf.matching_files(pattern % wildcard).eval())
        self.assertEqual(found, self._subset(temp_files, indices))
Beispiel #5
0
def batch_inputs(paths, reference_shape,
        batch_size=32, is_training=False, num_landmarks=68):
    """Reads the files off the disk and produces batches.

    Args:
      paths: a list of file patterns, each expanded with tf.matching_files,
        pointing at the training images (the original author describes them
        as directories that also hold the corresponding landmark files).
      reference_shape: passed through to `load_image`; described by the
        original author as a numpy array [num_landmarks, 2].
      batch_size: the batch size.
      is_training: whether in training mode; enables filename shuffling and
        colour distortion.
      num_landmarks: the number of landmarks in the training images.
    Returns:
      images: a tf tensor of shape [batch_size, ?, ?, 3]; the two middle
        dimensions are padded per batch (dynamic_pad=True).
      lms: a tf tensor of shape [batch_size, num_landmarks, 2].
      lms_init: a tf tensor of shape [batch_size, num_landmarks, 2].
    """

    # NOTE(review): tf.concat is called with the axis first, i.e. the
    # pre-1.0 TensorFlow argument order -- confirm the targeted TF version.
    files = tf.concat(0, [tf.matching_files(d) for d in paths])

    # Filenames are only shuffled in training mode.
    filename_queue = tf.train.string_input_producer(
        files, shuffle=is_training, capacity=1000)

    # `load_image` runs as a Python op on one dequeued filename at a time.
    image, lms, lms_init = tf.py_func(
        partial(load_image, is_training=is_training),
        [filename_queue.dequeue(), reference_shape], # input arguments
        [tf.float32, tf.float32, tf.float32], # output types
        name='load_image'
    )

    # The image has always 3 channels.
    image.set_shape([None, None, 3])

    if is_training:
        image = distort_color(image)

    # py_func outputs carry no static shape; pin the landmark shapes here.
    lms = tf.reshape(lms, [num_landmarks, 2])
    lms_init = tf.reshape(lms_init, [num_landmarks, 2])

    images, lms, inits = tf.train.batch(
        [image, lms, lms_init],
        batch_size=batch_size,
        num_threads=4,
        capacity=1000,
        enqueue_many=False,
        dynamic_pad=True
    )

    return images, lms, inits
Beispiel #6
0
def get_dataset(tfrecords_dir, subset, batch_size):
    """Build a shuffled, repeated, batched dataset from '<subset>-*' shards.

    The shard files found under `tfrecords_dir` are shuffled and repeated,
    read four at a time as TFRecords, shuffled again at record level, parsed
    with `_parse_fn` (in training mode only for the 'train' subset), batched
    and prefetched.
    """
    shard_pattern = os.path.join(tfrecords_dir, '%s-*' % subset)
    shard_files = tf.matching_files(shard_pattern)

    shards = tf.data.Dataset.from_tensor_slices(shard_files)
    # Use the number of shards as the buffer size for the file-level shuffle.
    shard_count = tf.cast(tf.shape(shard_files)[0], tf.int64)
    shards = shards.shuffle(shard_count).repeat()

    records = shards.interleave(tf.data.TFRecordDataset, cycle_length=4)
    records = records.shuffle(buffer_size=8192)

    parse = partial(_parse_fn, is_training=(subset == 'train'))
    batched = records.apply(
        tf.data.experimental.map_and_batch(
            map_func=parse,
            batch_size=batch_size,
            num_parallel_calls=config.NUM_DATA_WORKERS))
    return batched.prefetch(batch_size)
Beispiel #7
0
def read_dataset(file_read_func, decode_func, input_files, config):
    """Reads a dataset, and handles repetition and shuffling.

    Args:
      file_read_func: Function to use in tf.data.Dataset.interleave, to read
        every individual file into a tf.data.Dataset.
      decode_func: Function to apply to all records.
      input_files: A list of file paths to read.
      config: A input_reader_builder.InputReader object.

    Returns:
      A tf.data.Dataset based on config.
    """
    # Shard, shuffle, and read files.
    filenames = tf.concat(
        [tf.matching_files(pattern) for pattern in input_files], 0)
    filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
    if config.shuffle:
        filename_dataset = filename_dataset.shuffle(
            config.filenames_shuffle_buffer_size)
    elif config.num_readers > 1:
        tf.logging.warning('`shuffle` is false, but the input data stream is '
                           'still slightly shuffled since `num_readers` > 1.')

    filename_dataset = filename_dataset.repeat(config.num_epochs or None)

    # Only the lookup of the newer API is guarded, so that genuine errors
    # raised while building the interleave are not silently swallowed.
    try:
        interleave = tf.contrib.data.parallel_interleave(
            file_read_func,
            cycle_length=config.num_readers,
            block_length=config.read_block_length,
            sloppy=True)
    except AttributeError:
        # For tf < 1.5, which has no `parallel_interleave`.
        interleave = tf.contrib.data.sloppy_interleave(
            map_func=file_read_func,
            cycle_length=config.num_readers,
            block_length=config.read_block_length)
    records_dataset = filename_dataset.apply(interleave)

    if config.shuffle:
        # Dataset.shuffle returns a new dataset; the result must be kept or
        # the record-level shuffle never takes effect.
        records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
    tensor_dataset = records_dataset.map(
        decode_func, num_parallel_calls=config.num_parallel_map_calls)
    return tensor_dataset.prefetch(config.prefetch_size)
Beispiel #8
0
def read_dataset(file_read_func,
                 decode_func,
                 input_files,
                 config,
                 num_workers=1,
                 worker_index=0):
    """Builds this worker's shard of the dataset, with repeat and shuffle.

    Args:
      file_read_func: Callable used with tf.data.Dataset.interleave; it turns
        one filename into a tf.data.Dataset of records.
      decode_func: Callable mapped over every record.
      input_files: A list of file patterns expanded with tf.matching_files.
      config: A input_reader_builder.InputReader object.
      num_workers: Number of workers / shards.
      worker_index: Id for the current worker.

    Returns:
      The decoded and prefetched tf.data.Dataset.
    """
    matches = [tf.matching_files(pattern) for pattern in input_files]
    filenames = tf.concat(matches, 0)

    # File level: shard across workers, repeat, then optionally shuffle.
    shard_files = tf.data.Dataset.from_tensor_slices(filenames)
    shard_files = shard_files.shard(num_workers, worker_index)
    shard_files = shard_files.repeat(config.num_epochs or None)
    if config.shuffle:
        shard_files = shard_files.shuffle(
            config.filenames_shuffle_buffer_size,
            reshuffle_each_iteration=True)

    # Cap the number of readers at the number of files: with more readers
    # than files, several readers would be assigned the same file and its
    # records would be repeated.
    num_files = tf.size(filenames)
    reader_count = tf.cast(
        tf.minimum(config.num_readers, num_files), tf.int64)
    # TODO: find the optimal block_length.
    records = shard_files.interleave(
        file_read_func, cycle_length=reader_count, block_length=1)

    # Record level: optional shuffle, then decode and prefetch.
    if config.shuffle:
        records = records.shuffle(
            config.shuffle_buffer_size, reshuffle_each_iteration=True)

    decoded = records.map(decode_func, num_parallel_calls=config.num_readers)
    return decoded.prefetch(config.prefetch_buffer_size)
Beispiel #9
0
    def _load_dataset(self, binaries_fname_pattern):
        """Creates a TFRecord data set from the files matching a pattern.

        The original author describes this as the ImageNet loading helper
        behind the ``.make_*_dataset`` methods.

        Args:
            binaries_fname_pattern (str): Pattern of the files from which
                to load images and labels
                (e.g. ``some/path/train-00000-of-01024``).

        Returns:
            A tf.data.TFRecordDataset over the matched files, whose order is
            randomised with ``tf.random_shuffle``.
        """

        # The ops are created under this object's name scope and on the CPU.
        with tf.name_scope(self._name):
            with tf.device("/cpu:0"):
                filenames = tf.matching_files(binaries_fname_pattern)
                filenames = tf.random_shuffle(filenames)
                data = tf.data.TFRecordDataset(filenames)

        return data
Beispiel #10
0
def tfrecords_input_fn(files_name_pattern,
                       feature_spec,
                       label,
                       mode=tf.estimator.ModeKeys.EVAL,
                       num_epochs=None,
                       batch_size=64):
    """Build a batched, parsed TFRecord dataset for an estimator.

    Records are shuffled only in TRAIN mode, then batched, parsed with
    `parse_tf_example` one batch at a time, and repeated `num_epochs` times.
    """
    matched_files = tf.matching_files(files_name_pattern)
    records = data.TFRecordDataset(filenames=matched_files)

    if mode == tf.estimator.ModeKeys.TRAIN:
        records = records.shuffle(buffer_size=2 * batch_size + 1)

    batches = records.batch(batch_size)
    parsed = batches.map(
        lambda tf_example: parse_tf_example(tf_example, label, feature_spec))
    return parsed.repeat(num_epochs)
Beispiel #11
0
 def input_pipeline(self,
                    file_pattern,
                    batch_size,
                    num_epochs=None,
                    num_threads=10):
     """Builds a queue-based pipeline that yields padded batches.

     Files matching `file_pattern` feed a shuffled filename queue, which
     `self.example_parser` turns into parsed tensors; these are then batched
     with dynamic padding, allowing a smaller final batch.
     """
     matched_files = tf.matching_files(file_pattern)
     name_queue = tf.train.string_input_producer(
         matched_files, num_epochs=num_epochs, shuffle=True)
     parsed = self.example_parser(name_queue)

     # Queue capacity: a base of 10000 elements plus room for 12 batches.
     queue_capacity = 10000 + 12 * batch_size
     return tf.train.batch(
         parsed,
         batch_size=batch_size,
         capacity=queue_capacity,
         num_threads=num_threads,
         dynamic_pad=True,
         allow_smaller_final_batch=True)
Beispiel #12
0
def read_dataset(
    file_read_func, decode_func, input_files, config, num_workers=1,
    worker_index=0):
  """Builds this worker's shard of the dataset, with repeat and shuffle.

  Args:
    file_read_func: Callable used with tf.data.Dataset.interleave; it turns
      one filename into a tf.data.Dataset of records.
    decode_func: Callable mapped over every record.
    input_files: A list of file patterns expanded with tf.matching_files.
    config: A input_reader_builder.InputReader object.
    num_workers: Number of workers / shards.
    worker_index: Id for the current worker.

  Returns:
    The decoded and prefetched tf.data.Dataset.
  """
  matches = [tf.matching_files(pattern) for pattern in input_files]
  filenames = tf.concat(matches, 0)

  # File level: shard across workers, repeat, then optionally shuffle.
  shard_files = tf.data.Dataset.from_tensor_slices(filenames)
  shard_files = shard_files.shard(num_workers, worker_index)
  shard_files = shard_files.repeat(config.num_epochs or None)
  if config.shuffle:
    shard_files = shard_files.shuffle(
        config.filenames_shuffle_buffer_size, reshuffle_each_iteration=True)

  # Cap the number of readers at the number of files: with more readers than
  # files, several readers would be assigned the same file and its records
  # would be repeated.
  num_files = tf.size(filenames)
  reader_count = tf.cast(tf.minimum(config.num_readers, num_files), tf.int64)
  # TODO: find the optimal block_length.
  records = shard_files.interleave(
      file_read_func, cycle_length=reader_count, block_length=1)

  # Record level: optional shuffle, then decode and prefetch.
  if config.shuffle:
    records = records.shuffle(
        config.shuffle_buffer_size, reshuffle_each_iteration=True)

  decoded = records.map(decode_func, num_parallel_calls=config.num_readers)
  return decoded.prefetch(config.prefetch_buffer_size)
Beispiel #13
0
def input_fn(input_dir,
             mode,
             batch_size=1,
             num_epochs=100,
             label_name=None,
             feature_spec=None):
    """Read TFRecords under `input_dir` and return feature/label tensors.

    Files are looked up as '<input_dir>/data/<mode>/<mode>*.tfrecord', with
    the mode lower-cased. When `feature_spec` is None it is loaded from the
    'transformed_metadata' directory under `input_dir`. In PREDICT mode only
    the features are returned; otherwise a (features, labels) pair.
    """
    if feature_spec is None:
        metadata_dir = os.path.join(input_dir, 'transformed_metadata')
        tf_transform_output = tft.TFTransformOutput(metadata_dir)
        feature_spec = tf_transform_output.transformed_feature_spec()

    def read_and_decode_fn(example):
        """Parse one serialized example into an (image, label) pair."""
        parsed = tf.parse_single_example(example, feature_spec)
        image = tf.reshape(parsed['image'], [256, 256, 256])
        label = tf.cast(parsed[label_name], tf.int32)
        return image, label

    split = str(mode).lower()
    reader_count = multiprocessing.cpu_count()
    file_pattern = os.path.join(
        input_dir, 'data', split, split + '*' + '.tfrecord')
    matched_files = tf.matching_files(file_pattern)
    dataset = tf.data.TFRecordDataset(
        filenames=matched_files,
        buffer_size=None,
        num_parallel_reads=reader_count)

    # Only training data is shuffled and repeated.
    if mode == tf.estimator.ModeKeys.TRAIN:
        shuffle_repeat = tf.data.experimental.shuffle_and_repeat(
            buffer_size=100)
        dataset = dataset.apply(shuffle_repeat)

    decode_and_batch = tf.data.experimental.map_and_batch(
        map_func=read_and_decode_fn,
        batch_size=batch_size,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.apply(decode_and_batch)

    features, labels = dataset.make_one_shot_iterator().get_next()
    if mode != tf.estimator.ModeKeys.PREDICT:
        return features, labels
    return features
Beispiel #14
0
            def input_fn():
                """Builds the inputs, optionally from filename queues.

                Relies on names from the enclosing scope that are not visible
                here (`mode`, `user_input_fn`, `prepare_filename_queues`,
                `batch_inputs`, `FLAGS`, ...).

                Returns:
                    The result of `_batch_inputs(inputs, mode)` when
                    `batch_inputs` is truthy, otherwise the inputs tuple.
                """
                # With `prepare_filename_queues` set, `user_input_fn` gets a
                # queue built by `filenames_to_queue`: from the files matching
                # `FLAGS.infer_file` in INFER mode, otherwise from the
                # `match_filenames_once` op of the current mode (ops for both
                # TRAIN and EVAL are created, and the one for `mode` is
                # picked). Without it, `user_input_fn` is called with no
                # arguments.
                inputs = (
                    user_input_fn(filenames_to_queue(
                        tf.matching_files(FLAGS.infer_file)
                        if mode == tf.contrib.learn.ModeKeys.INFER else
                        {mode: tf.train.match_filenames_once(
                            getattr(FLAGS, "{}_file".format(mode)),
                            name="{}_filenames".format(mode))
                         for mode in [tf.contrib.learn.ModeKeys.TRAIN,
                                      tf.contrib.learn.ModeKeys.EVAL]}[mode]))
                    if prepare_filename_queues else
                    user_input_fn())

                # A lone dict or Tensor is wrapped into a 1-tuple; the check is
                # an exact type match, so subclasses are passed through as-is.
                inputs = ((inputs,)
                          if type(inputs) in {dict, tf.Tensor} else
                          inputs)

                _check_inputs(inputs)

                return _batch_inputs(inputs, mode) if batch_inputs else inputs
Beispiel #15
0
def tf_loadVideoFromFile(tf_dirname):
    """Loads all JPEG files matching a pattern into one stacked tensor.

    Args:
        tf_dirname: file pattern passed to tf.matching_files.

    Returns:
        loaded_images: the decoded 3-channel images concatenated along a new
            leading axis.
        fileQ: the FIFOQueue holding the matched filenames.
    """
    # tf_pattern = tf.concat([tf_dirname, '*.jpg'], 0)
    tf_matchingfiles = tf.matching_files(tf_dirname)

    n_files = tf.shape(tf_matchingfiles)[0]

    # The queue has a fixed capacity of 100 filenames.
    fileQ = tf.FIFOQueue(100, tf.string)
    fQinit = fileQ.enqueue_many((tf_matchingfiles, ))
    # Ops created inside this block run only after the filenames are enqueued.
    with tf.control_dependencies([fQinit]):
        # s_file = fileQ.dequeue()

        # Read and decode the first image; it seeds the loop accumulator.
        reader = tf.WholeFileReader()
        _, value = reader.read(fileQ)

        image_seq = tf.image.decode_jpeg(value, channels=3)
        image_seq = tf.expand_dims(image_seq, axis=0)
        # image_seq = tf.ones([2,2,2])
        # The counter starts at 1 because one image has already been read.
        i = tf.constant(1)

        def condition(i, m):
            return tf.less(i, n_files)

        def body(i, m):
            # s_file =fileQ.dequeue()
            _, value = reader.read(fileQ)
            tf_s_image = tf.image.decode_jpeg(value, channels=3)
            tf_s_image = tf.expand_dims(tf_s_image, axis=0)

            # tf_s_image = tf.ones([2, 2, 2])
            # NOTE(review): `concat_dim=` is the pre-1.0 tf.concat keyword --
            # confirm the targeted TF version.
            return i + 1, tf.concat(concat_dim=0, values=[m, tf_s_image])

    # NOTE(review): the while_loop is built outside the control_dependencies
    # block above -- confirm the loop body still runs after the enqueue.
    # The accumulator's shape is left fully unknown because it grows with
    # every iteration.
    _, loaded_images = tf.while_loop(cond=condition,
                                     body=body,
                                     loop_vars=[i, image_seq],
                                     shape_invariants=[
                                         i.get_shape(),
                                         tf.TensorShape(
                                             [None, None, None, None])
                                     ])

    return loaded_images, fileQ
Beispiel #16
0
def dataset_input_fn(data_folder,
                     prefix=None,
                     mode=None,
                     params=None,
                     count=None):
    """Creates a dataset reading examples from CSV files.

    Args:
      data_folder: Location of the files, finishing with a '/'.
      prefix: Start of the file names. None (the default) is treated as an
        empty prefix, i.e. every '*.csv' file in `data_folder` matches.
      mode: tf.estimator.ModeKeys(TRAIN, EVAL); the data is shuffled only
        for TRAIN.
      params: hyperparameters; `batch_size` is always read from it, and
        `buffer_size` when shuffling.
      count: Number of times the data is repeated; None repeats it
        indefinitely.

    Returns:
      features and targets
    """
    shuffle = mode == tf.estimator.ModeKeys.TRAIN

    # Read CSV files into a Dataset. A missing prefix must not be formatted
    # into the pattern as the literal text 'None'.
    file_prefix = '' if prefix is None else prefix
    filenames = tf.matching_files('{}{}*.csv'.format(data_folder, file_prefix))
    dataset = tf.data.TextLineDataset(filenames)

    # Parse the record into tensors.
    dataset = dataset.map(parse_csv)

    # Shuffle the dataset
    if shuffle:
        dataset = dataset.shuffle(buffer_size=params.buffer_size)

    # Repeat the input indefinitely if count is None
    dataset = dataset.repeat(count=count)

    # Generate batches
    dataset = dataset.batch(params.batch_size)

    # Create a one-shot iterator
    iterator = dataset.make_one_shot_iterator()

    # Get batch X and y
    features, target = iterator.get_next()

    return features, target
Beispiel #17
0
def input_fn(files_name_pattern,
             mode=tf.estimator.ModeKeys.EVAL,
             skip_header_lines=0,
             num_epochs=1,
             batch_size=200):
    """Build a text-line input pipeline and return (features, label).

    Lines are read from the files matching `files_name_pattern`. The first
    `skip_header_lines` lines of the combined stream are skipped, the rest
    are shuffled in TRAIN mode only, parsed with `parse_tsv_row`, batched,
    repeated and prefetched. The settings are printed before the pipeline is
    built.
    """
    shuffle = mode == tf.estimator.ModeKeys.TRAIN
    num_threads = multiprocessing.cpu_count() if MULTI_THREADING else 1
    buffer_size = 2 * batch_size + 1

    # Report the effective settings.
    summary = (
        "",
        "* data input_fn:",
        "================",
        "Input file(s): {}".format(files_name_pattern),
        "Batch size: {}".format(batch_size),
        "Epoch Count: {}".format(num_epochs),
        "Mode: {}".format(mode),
        "Thread Count: {}".format(num_threads),
        "Shuffle: {}".format(shuffle),
        "================",
        "",
    )
    for line in summary:
        print(line)

    matched_files = tf.matching_files(files_name_pattern)
    lines = data.TextLineDataset(filenames=matched_files)
    lines = lines.skip(skip_header_lines)
    if shuffle:
        lines = lines.shuffle(buffer_size)

    rows = lines.map(lambda tsv_row: parse_tsv_row(tsv_row),
                     num_parallel_calls=num_threads)
    batches = rows.batch(batch_size).repeat(num_epochs).prefetch(buffer_size)

    features, target = batches.make_one_shot_iterator().get_next()
    return features, parse_label_column(target)
Beispiel #18
0
def read_dataset(
    file_read_func, decode_func, input_files, config, num_workers=1,
    worker_index=0):
  """Builds this worker's shard of the dataset, with repeat and shuffle.

  The files matching `input_files` are sharded across `num_workers`,
  repeated, optionally shuffled, read with `file_read_func`, optionally
  shuffled at record level, decoded with `decode_func` and prefetched.
  """
  matches = [tf.matching_files(pattern) for pattern in input_files]
  filenames = tf.concat(matches, 0)

  # File level: shard across workers, repeat, then optionally shuffle.
  shard_files = tf.data.Dataset.from_tensor_slices(filenames)
  shard_files = shard_files.shard(num_workers, worker_index)
  shard_files = shard_files.repeat(config.num_epochs or None)
  if config.shuffle:
    shard_files = shard_files.shuffle(
        config.filenames_shuffle_buffer_size, reshuffle_each_iteration=True)

  # Never use more readers than there are matched files.
  num_files = tf.size(filenames)
  reader_count = tf.cast(tf.minimum(config.num_readers, num_files), tf.int64)
  records = shard_files.interleave(
      file_read_func, cycle_length=reader_count, block_length=1)

  if config.shuffle:
    records = records.shuffle(
        config.shuffle_buffer_size, reshuffle_each_iteration=True)

  decoded = records.map(decode_func, num_parallel_calls=config.num_readers)
  return decoded.prefetch(config.prefetch_buffer_size)
def csv_input_fn(files_name_pattern,
                 mode=tf.estimator.ModeKeys.EVAL,
                 skip_header_lines=0,
                 num_epochs=1,
                 batch_size=20):
    """Build a CSV text-line pipeline and return (features, target).

    Lines are shuffled in TRAIN mode only, then batched, parsed one batch at
    a time with `parse_csv_row`, and repeated `num_epochs` times.
    """
    matched_files = tf.matching_files(files_name_pattern)
    lines = data.TextLineDataset(filenames=matched_files)
    lines = lines.skip(skip_header_lines)

    num_threads = multiprocessing.cpu_count() if MULTI_THREADING else 1
    if mode == tf.estimator.ModeKeys.TRAIN:
        lines = lines.shuffle(buffer_size=2 * batch_size + 1)

    # Batching happens before parsing, so `parse_csv_row` receives whole
    # batches of lines (the original author notes that batching after the
    # map took a very long time).
    batches = lines.batch(batch_size)
    parsed = batches.map(lambda csv_row: parse_csv_row(csv_row),
                         num_parallel_calls=num_threads)
    parsed = parsed.repeat(num_epochs)

    features, target = parsed.make_one_shot_iterator().get_next()
    return features, target
Beispiel #20
0
 def load_tfrecords_dataset(self,
                            dataset_path_format,
                            image_size,
                            num_parallel_calls=None):
     """
     Build a parsed dataset from TFRecord shards.
     :param dataset_path_format: str, file pattern of the dataset shards
     :param image_size: dict with the keys 'height', 'width' and 'channel'
     :param num_parallel_calls: int or None, parallelism of the parsing map
     :return dataset: tf.data.Dataset of parsed examples
     """
     shard_files = tf.matching_files(dataset_path_format)
     # The shard count is both the shuffle buffer size and the cycle length.
     shard_count = tf.cast(tf.shape(shard_files)[0], tf.int64)

     shards = tf.data.Dataset.list_files(shard_files)
     shards = shards.shuffle(shard_count)
     records = shards.interleave(
         lambda filename: tf.data.TFRecordDataset(filename),
         cycle_length=shard_count)

     height = image_size['height']
     width = image_size['width']
     channels = image_size['channel']
     return records.map(
         lambda x: self.parse_example(x, height, width, channels),
         num_parallel_calls=num_parallel_calls)
Beispiel #21
0
def csv_input_fn(file_name, mode=tf.estimator.ModeKeys.EVAL,
                 skip_header_lines=0,
                 num_epochs=1,
                 batch_size=500):
  """Build a CSV text-line pipeline and return (features, target).

  Prints `file_name`, reads the lines of the files matching it, shuffles
  them in TRAIN mode only, then batches, parses each batch with
  `parse_csv_row`, and repeats `num_epochs` times.
  """
  print(file_name)
  matched_files = tf.matching_files(file_name)

  lines = tf.data.TextLineDataset(filenames=matched_files)
  lines = lines.skip(skip_header_lines)
  if mode == tf.estimator.ModeKeys.TRAIN:
    lines = lines.shuffle(buffer_size=2 * batch_size + 1)

  batches = lines.batch(batch_size)
  parsed = batches.map(lambda csv_row: parse_csv_row(csv_row))
  parsed = parsed.repeat(num_epochs)

  features, target = parsed.make_one_shot_iterator().get_next()
  return features, target
Beispiel #22
0
def input_fn(input_dir,
             mode,
             batch_size,
             num_epochs,
             label_name=None,
             shuffle_buffer_size=10000,
             feature_spec=None):
    """Read TFRecords under `input_dir` and return features (and label).

    Files are looked up as '<input_dir>/data/<mode>/<mode>*.tfrecord', with
    the mode lower-cased. When `feature_spec` is None it is loaded from the
    'transformed_metadata' directory under `input_dir`. In PREDICT mode only
    the features are returned; otherwise `label_name` is popped from the
    features and returned next to them.
    """
    if feature_spec is None:
        metadata_dir = os.path.join(input_dir, 'transformed_metadata')
        tf_transform_output = tft.TFTransformOutput(metadata_dir)
        feature_spec = tf_transform_output.transformed_feature_spec()

    split = str(mode).lower()
    reader_count = multiprocessing.cpu_count()
    file_pattern = os.path.join(
        input_dir, 'data', split, split + '*' + '.tfrecord')
    matched_files = tf.matching_files(file_pattern)
    records = tf.data.TFRecordDataset(
        filenames=matched_files,
        buffer_size=None,
        num_parallel_reads=reader_count)

    # Only training data is shuffled.
    if mode == tf.estimator.ModeKeys.TRAIN:
        records = records.shuffle(shuffle_buffer_size)

    batches = records.repeat(num_epochs).batch(batch_size)
    # Parsing happens per batch of serialized examples.
    parsed = batches.map(
        lambda examples: tf.parse_example(examples, feature_spec))

    features = parsed.make_one_shot_iterator().get_next()
    if mode != tf.estimator.ModeKeys.PREDICT:
        label = features.pop(label_name)
        return features, label
    return features
Beispiel #23
0
    def testMatchingFiles(self):
        """Checks tf.matching_files for exact names and for glob wildcards."""
        prefixes = [
            "ABcDEF.GH", "ABzDEF.GH", "ABasdfjklDEF.GH", "AB3DEF.GH",
            "AB4DEF.GH", "ABDEF.GH", "XYZ",
        ]
        temp_files = [tempfile.NamedTemporaryFile(prefix=p) for p in prefixes]

        with self.test_session():
            # A pattern without wildcards must match exactly that file.
            for temp_file in temp_files:
                matched = tf.matching_files(temp_file.name).eval()
                self.assertEqual(matched, temp_file.name)

            # Build "<dir>/AB%sDEF.GH*" from the directory of the first file.
            first_name = temp_files[0].name
            directory = first_name[:first_name.find(prefixes[0])]
            pattern = directory + "AB%sDEF.GH*"

            # (wildcard, indices into temp_files that are expected to match)
            expectations = (
                ("z", [1]),
                ("?", [0, 1, 3, 4]),
                ("*", [0, 1, 2, 3, 4, 5]),
                ("[cxz]", [0, 1]),
                ("[0-9]", [3, 4]),
            )
            for wildcard, indices in expectations:
                found = set(tf.matching_files(pattern % wildcard).eval())
                self.assertEqual(found, self._subset(temp_files, indices))
    def input_fn(self, name, csv_path=None):
        """Builds the CSV input pipeline and returns one batch of tensors.

        Arguments:
                name : string, Name of the data [Train or Eval]
                csv_path : The path of the csv on any storage system

        Returns:
                feats : features of the next batch from the iterator
                labs : labels of the next batch from the iterator
        """
        pattern = self._get_pattern(name, csv_path)
        tf.logging.info('The Pattern of files is : %s', pattern)
        matched_files = tf.matching_files(pattern=pattern)

        # Skip the first line of the combined stream, then parse every line.
        lines = tf.data.TextLineDataset(matched_files).skip(1)
        rows = lines.map(self.parse_csv, num_parallel_calls=cpu_count())
        rows = rows.shuffle(buffer_size=self.batch_size * 100)
        # Elements that raise an error are dropped instead of failing.
        rows = rows.apply(tf.contrib.data.ignore_errors())

        batches = rows.repeat(self.num_epochs).batch(self.batch_size)
        batches = batches.prefetch(self.buffer_size)

        feats, labs = batches.make_one_shot_iterator().get_next()
        return feats, labs
Beispiel #25
0
    def read_dataset(self,
                     file_read_func,
                     decode_func,
                     input_files,
                     num_epochs=None):
        """Reads a dataset, and handles repetition and shuffling.

        Args:
            file_read_func: Function passed to parallel_interleave, to read
                every individual file into a tf.data.Dataset.
            decode_func: Function to apply to all records.
            input_files: A list of file patterns to read.
            num_epochs: Number of times the filenames are repeated; None
                repeats them indefinitely.

        Returns:
            The decoded and prefetched tf.data.Dataset.
        """
        filenames = tf.concat(
            [tf.matching_files(pattern) for pattern in input_files], 0)
        # Shard, shuffle, and read files.
        filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
        filename_dataset = filename_dataset.shuffle(
            self.flags.filenames_shuffle_buffer_size)

        filename_dataset = filename_dataset.repeat(num_epochs)

        records_dataset = filename_dataset.apply(
            tf.contrib.data.parallel_interleave(
                file_read_func,
                cycle_length=self.flags.num_readers,
                block_length=self.flags.read_block_length,
                sloppy=True))
        # Dataset.shuffle returns a new dataset; the result must be kept or
        # the record-level shuffle never takes effect.
        records_dataset = records_dataset.shuffle(
            self.flags.shuffle_buffer_size)
        tensor_dataset = records_dataset.map(
            decode_func, num_parallel_calls=self.num_parallel_calls)
        return tensor_dataset.prefetch(self.flags.prefetch_size)
def tfrecods_input_fn(files_name_pattern,
                      mode=tf.estimator.ModeKeys.EVAL,
                      num_epochs=None,
                      batch_size=200):
    """Build a TFRecord input pipeline and return (features, target).

    Records are shuffled in TRAIN mode only, then batched and parsed one
    batch at a time with `parse_tf_example`. When PROCESS_FEATURES is set the
    features additionally go through `process_features`. The settings are
    printed before the pipeline is built.
    """
    shuffle = mode == tf.estimator.ModeKeys.TRAIN

    # Report the effective settings.
    summary = (
        "",
        "* data input_fn:",
        "================",
        "Input file(s): {}".format(files_name_pattern),
        "Batch size: {}".format(batch_size),
        "Epoch Count: {}".format(num_epochs),
        "Mode: {}".format(mode),
        "Shuffle: {}".format(shuffle),
        "================",
        "",
    )
    for line in summary:
        print(line)

    matched_files = tf.matching_files(files_name_pattern)
    records = data.TFRecordDataset(filenames=matched_files)
    if shuffle:
        records = records.shuffle(buffer_size=2 * batch_size + 1)

    batches = records.batch(batch_size)
    parsed = batches.map(lambda tf_example: parse_tf_example(tf_example))
    if PROCESS_FEATURES:
        parsed = parsed.map(
            lambda features, target: (process_features(features), target))
    parsed = parsed.repeat(num_epochs)

    features, target = parsed.make_one_shot_iterator().get_next()
    return features, target
Beispiel #27
0
  def testMatchingFiles(self):
    """Checks tf.matching_files for exact names and for glob wildcards."""
    prefixes = ['ABcDEF.GH', 'ABzDEF.GH', 'ABasdfjklDEF.GH', 'AB3DEF.GH',
                'AB4DEF.GH', 'ABDEF.GH', 'XYZ']
    temp_files = [tempfile.NamedTemporaryFile(prefix=p) for p in prefixes]

    with self.test_session():
      # A pattern without wildcards must match exactly that file.
      for temp_file in temp_files:
        matched = tf.matching_files(temp_file.name).eval()
        self.assertEqual(matched, temp_file.name)

      # Build "<dir>/AB%sDEF.GH*" from the directory part of the first file.
      first_name = temp_files[0].name
      directory = first_name[:first_name.find(prefixes[0])]
      pattern = directory + 'AB%sDEF.GH*'

      # (wildcard, indices into temp_files that are expected to match)
      expectations = (
          ('z', [1]),
          ('?', [0, 1, 3, 4]),
          ('*', [0, 1, 2, 3, 4, 5]),
          ('[cxz]', [0, 1]),
          ('[0-9]', [3, 4]),
      )
      for wildcard, indices in expectations:
        found = set(tf.matching_files(pattern % wildcard).eval())
        self.assertEqual(found, self._subset(temp_files, indices))
Beispiel #28
0
def load_tfrecord_dataset(batch_size,
                          split='train',
                          size=128,
                          augmentation=False,
                          shuffle=True,
                          classes=None,
                          normalize=True,
                          dequantize=True,
                          reader_threads=32):
    """Build an endlessly repeating image/label batch from sharded TFRecords.

    Shards are located under ``TFRECORD_ROOT`` (defined outside this function)
    with the glob ``"<split>-00*-of-00*"``.

    Args:
        batch_size: Number of examples per returned batch.
        split: Shard name prefix; it also selects the on-disk image edge
            length (64, 128 or 256) by substring match.
        size: Edge length the decoded image is resized to.
        augmentation: If true, apply a random left/right flip.
        shuffle: If true, shuffle the raw records (buffer 256) and the parsed
            examples (buffer 10000).
        classes: Optional collection of labels; when given, only examples
            whose label equals one of them are kept.
        normalize: If true, map pixel values with ``x / 128 - 1``.
        dequantize: If true (and ``normalize`` is true), add uniform noise in
            ``[0, 1/128)`` to every pixel.
        reader_threads: ``num_parallel_calls`` for the parsing map.

    Returns:
        The ``(images, labels)`` tensors obtained from a one-shot iterator.
    """
    shard_glob = os.path.join(TFRECORD_ROOT, "%s-00*-of-00*" % split)
    records = tf.data.TFRecordDataset(tf.matching_files(shard_glob))
    records = records.repeat()

    # Edge length of the square images as stored on disk.
    stored_edge = 256
    if 'imagenet64' in split:
        stored_edge = 64
    elif 'sngan_128' in split or 'imagenet128' in split:
        stored_edge = 128

    channels = 3
    feature_spec = {
        'image': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }

    def _decode(serialized_example):
        """Turn one serialized tf.Example into an (image, label) pair."""
        parsed = tf.parse_single_example(serialized_example,
                                         features=feature_spec)
        pixels = tf.decode_raw(parsed['image'], tf.uint8)
        pixels.set_shape([stored_edge * stored_edge * channels])
        # Flat byte vector -> [height, width, depth].
        pixels = tf.reshape(pixels, [stored_edge, stored_edge, channels])
        pixels = tf.cast(pixels, tf.float32)
        pixels = tf.image.resize_images(pixels, [size, size])
        if augmentation:
            pixels = tf.image.random_flip_left_right(pixels)
        if normalize:
            pixels = pixels / 128.0 - 1.0
            if dequantize:
                noise = tf.random_uniform(
                    shape=[size, size, 3], minval=0.0, maxval=1. / 128)
                pixels = pixels + noise
        return pixels, parsed['label']

    if shuffle:
        records = records.shuffle(buffer_size=256)

    examples = records.map(_decode, num_parallel_calls=reader_threads)

    if classes is not None:
        wanted = tf.convert_to_tensor(classes, dtype=tf.int64)
        examples = examples.filter(
            lambda x, y: tf.reduce_any(tf.equal(y, wanted)))

    examples = examples.repeat()
    if shuffle:
        examples = examples.shuffle(buffer_size=10000)

    batched = examples.batch(batch_size)
    images, labels = batched.make_one_shot_iterator().get_next()
    return images, labels
    def _input_fn():
        """Build the queue-based input graph for frame-level records.

        `file_pattern`, `mode`, `batch_size`, `MAX_FRAMES` and `NUM_CLASSES`
        are read from enclosing/module scope; their definitions are outside
        this excerpt.

        Returns:
            A `(batch_features, batch_labels)` pair: the dict produced by
            `tf.train.shuffle_batch` with its 'labels' entry removed, and that
            removed entry.
        """
        print("\nread_dataset_frame: _input_fn: file_pattern = {}".format(file_pattern))
        print("read_dataset_frame: _input_fn: mode = {}".format(mode))
        print("read_dataset_frame: _input_fn: batch_size = {}".format(batch_size))

        # Affine map from quantized values back to floats; the callers below pass uint8 data cast to float32
        def dequantize(feat_vector, max_quantized_value = 2, min_quantized_value = -2):
            assert max_quantized_value > min_quantized_value # the range computed below must be positive
            quantized_range = max_quantized_value - min_quantized_value # width of the target float interval
            scalar = quantized_range / 255.0 # float width of one quantization step (255 steps span the range)
            bias = (quantized_range / 512.0) + min_quantized_value # lower bound of the interval plus range / 512
            return feat_vector * scalar + bias # scale, then shift, element-wise

        # Truncate or pad `tensor` along `axis` so that this axis has exactly `new_size` entries
        def resize_axis(tensor, axis, new_size, fill_value = 0):
            tensor = tf.convert_to_tensor(value = tensor) # ensure tensor is a tensor
            shape = tf.unstack(value = tf.shape(input = tensor)) # list with one scalar tensor per dimension of the dynamic shape

            pad_shape = shape[:] # copy of that list, to describe the padding block
            pad_shape[axis] = tf.maximum(x = 0, y = new_size - shape[axis]) # padding length along axis: how far the input falls short of new_size (0 if it does not)

            shape[axis] = tf.minimum(x = shape[axis], y = new_size) # kept length along axis: never more than new_size
            shape = tf.stack(values = shape) # turn the list of scalars back into one shape tensor

            resized = tf.concat(values = [
                tf.slice(input_ = tensor, begin = tf.zeros_like(tensor = shape), size = shape), # leading part of the input, starting at index 0 in every dimension
                tf.fill(dims = tf.stack(values = pad_shape), value = tf.cast(x = fill_value, dtype = tensor.dtype)) # padding block filled with fill_value in the input's dtype
            ], axis = axis) # kept part followed by padding along axis

            new_shape = tensor.get_shape().as_list() # static shape of the input as a Python list
            new_shape[axis] = new_size # the resized axis is now statically known
            resized.set_shape(shape = new_shape) # attach the static shape to the result
            return resized # return the resized tensor

        # Resolve the file pattern to a tensor of matching file names
        input_file_names = tf.matching_files(pattern = file_pattern)

        # Training: cycle through the files forever in shuffled order; otherwise: one ordered pass
        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = None # forever
            shuffle = True
        else:
            num_epochs = 1 # until EOF
            shuffle = False

        # Create filename queue from our input file names
        filename_queue = tf.train.string_input_producer(string_tensor = input_file_names, num_epochs = num_epochs, shuffle = shuffle)

        # Create a TF Record reader to read in our TF Record files
        reader = tf.TFRecordReader()

        # Read one record from the filename queue.
        # NOTE(review): reader.read returns a (key, value) pair, so the name `queue` is misleading;
        # the value bound to it is never used afterwards.
        queue, serialized_examples = reader.read(queue = filename_queue)

        # Create context and sequence feature map
        context_features = {
            "video_id": tf.FixedLenFeature(shape = [], dtype = tf.string),
            "labels": tf.VarLenFeature(dtype = tf.int64)
        }
        sequence_features = {
            "rgb": tf.FixedLenSequenceFeature(shape = [], dtype = tf.string),
            "audio": tf.FixedLenSequenceFeature(shape = [], dtype = tf.string)
        }

        # Parse the serialized SequenceExample into its context and per-frame parts
        contexts, features = tf.parse_single_sequence_example(
            serialized = serialized_examples,
            context_features = context_features,
            sequence_features = sequence_features)
        print("read_dataset_frame: _input_fn: contexts = {}".format(contexts)) # shape = video_id = (), labels = SparseTensor object
        print("read_dataset_frame: _input_fn: features = {}".format(features)) # shape = rgb = (frames_per_video,), audio = (frames_per_video,)

        # Create features
        # Pass video_id to features
        features['video_id'] = contexts['video_id'] # shape = video_id = (), rgb = (frames_per_video,), audio = (frames_per_video,)
        print("read_dataset_frame: _input_fn: features = {}".format(features))

        # rgb: decode the per-frame byte strings as uint8, cast to float32, dequantize and force MAX_FRAMES rows
        decoded_rgb = tf.reshape(tensor = tf.cast(x = tf.decode_raw(bytes = features["rgb"], out_type = tf.uint8), dtype = tf.float32), shape = [-1, 1024]) # shape = (frames_per_video, 1024)
        print("read_dataset_frame: _input_fn: decoded_rgb = {}".format(decoded_rgb))
        rgb_matrix = resize_axis(tensor = dequantize(decoded_rgb), axis = 0, new_size = MAX_FRAMES) # shape = (MAX_FRAMES, 1024)
        print("read_dataset_frame: _input_fn: rgb_matrix = {}".format(rgb_matrix))
        features['rgb'] = rgb_matrix
        print("read_dataset_frame: _input_fn: features = {}".format(features)) # shape = video_id = (), rgb = (MAX_FRAMES, 1024), audio = (frames_per_video,)

        # audio: same treatment as rgb, with 128 values per frame
        decoded_audio = tf.reshape(tensor = tf.cast(x = tf.decode_raw(bytes = features["audio"], out_type = tf.uint8), dtype = tf.float32), shape = [-1, 128]) # shape = (frames_per_video, 128)
        print("read_dataset_frame: _input_fn: decoded_audio = {}".format(decoded_audio))
        audio_matrix = resize_axis(tensor = dequantize(decoded_audio), axis = 0, new_size = MAX_FRAMES) # shape = (MAX_FRAMES, 128)
        print("read_dataset_frame: _input_fn: audio_matrix = {}".format(audio_matrix))
        features['audio'] = audio_matrix
        print("read_dataset_frame: _input_fn: features = {}".format(features)) # shape = video_id = (), rgb = (MAX_FRAMES, 1024), audio = (MAX_FRAMES, 128)

        # Labels: the sparse label ids become a dense float vector of length NUM_CLASSES with 1 at every listed id
        features['labels'] = tf.cast(x = tf.sparse_to_dense(sparse_indices = contexts['labels'].values, output_shape = (NUM_CLASSES,), sparse_values = 1, validate_indices = False), dtype = tf.float32)
        print("read_dataset_frame: _input_fn: features = {}".format(features)) # shape = video_id = (), rgb = (MAX_FRAMES, 1024), audio = (MAX_FRAMES, 128), labels = (NUM_CLASSES,)

        # Shuffle and batch features (shuffle_batch is used in every mode, not only for training)
        batch_features = tf.train.shuffle_batch(
            tensors = features,
            batch_size = batch_size,
            capacity = batch_size * 10,
            min_after_dequeue = batch_size,
            num_threads = 1,
            enqueue_many = False,
            allow_smaller_final_batch = True)
        print("read_dataset_frame: _input_fn: batch_features = {}".format(batch_features)) # shape = video_id = (batch_size,), rgb = (batch_size, MAX_FRAMES, 1024), audio = (batch_size, MAX_FRAMES, 128), labels = (batch_size, NUM_CLASSES)

        # Pop off labels from feature dictionary
        batch_labels = batch_features.pop('labels')
        print("read_dataset_frame: _input_fn: batch_labels = {}\n".format(batch_labels)) # shape = (batch_size, NUM_CLASSES)

        return batch_features, batch_labels
Beispiel #30
0
 def _get_tfrecords_files(self):
     """Return the tensor of file names matching the configured pattern.

     The glob is ``self.tfrecords_pattern`` joined onto ``self.tfrecords_dir``.
     """
     return tf.matching_files(
         os.path.join(self.tfrecords_dir, self.tfrecords_pattern))
    def _input_fn():
        """Assemble the queue-based input graph for video-level records.

        `file_pattern`, `mode`, `batch_size` and `NUM_CLASSES` come from the
        enclosing/module scope (defined outside this excerpt).

        Returns:
            A `(batch_features, batch_labels)` pair: the batched feature dict
            without its 'labels' entry, and the batched dense labels.
        """
        print("\nread_dataset_video: _input_fn: file_pattern = {}".format(file_pattern))
        print("read_dataset_video: _input_fn: mode = {}".format(mode))
        print("read_dataset_video: _input_fn: batch_size = {}".format(batch_size))

        # Resolve the glob to the tensor of matching file names.
        matched_files = tf.matching_files(file_pattern)

        # Training cycles forever over shuffled files; any other mode makes a
        # single ordered pass.
        if mode == tf.estimator.ModeKeys.TRAIN:
            epochs, shuffle_files = None, True
        else:
            epochs, shuffle_files = 1, False

        file_queue = tf.train.string_input_producer(
            string_tensor=matched_files,
            num_epochs=epochs,
            shuffle=shuffle_files)

        # Pull one serialized record at a time off the file queue; the first
        # element returned by read() is not needed here.
        record_reader = tf.TFRecordReader()
        _, serialized_example = record_reader.read(queue=file_queue)

        # Schema of a single record.
        schema = {
            'video_id': tf.FixedLenFeature(shape=[], dtype=tf.string),
            'labels': tf.VarLenFeature(dtype=tf.int64),
            'mean_rgb': tf.FixedLenFeature(shape=[1024], dtype=tf.float32),
            'mean_audio': tf.FixedLenFeature(shape=[128], dtype=tf.float32)
        }
        features = tf.parse_single_example(
            serialized=serialized_example, features=schema)
        # shape = video_id = (), mean_rgb = (1024,), mean_audio = (128,), labels = SparseTensor object
        print("read_dataset_video: _input_fn: features = {}".format(features))

        # Replace the sparse label ids by a dense float vector of length
        # NUM_CLASSES holding 1 at every listed id.
        dense_labels = tf.sparse_to_dense(
            sparse_indices=features['labels'].values,
            output_shape=(NUM_CLASSES, ),
            sparse_values=1,
            validate_indices=False)
        features['labels'] = tf.cast(x=dense_labels, dtype=tf.float32)
        # shape = video_id = (), mean_rgb = (1024,), mean_audio = (128,), labels = (NUM_CLASSES,)
        print("read_dataset_video: _input_fn: features = {}".format(features))

        # Shuffle and batch the single-example tensors.
        batch_features = tf.train.shuffle_batch(
            tensors=features,
            batch_size=batch_size,
            capacity=batch_size * 10,
            min_after_dequeue=batch_size,
            num_threads=1,
            enqueue_many=False,
            allow_smaller_final_batch=True)
        # shape = video_id = (batch_size,), mean_rgb = (batch_size, 1024), mean_audio = (batch_size, 128), labels = (batch_size, NUM_CLASSES)
        print("read_dataset_video: _input_fn: batch_features = {}".format(batch_features))

        # Split the labels off the feature dict; shape = (batch_size, NUM_CLASSES)
        batch_labels = batch_features.pop('labels')
        print("read_dataset_video: _input_fn: batch_labels = {}".format(batch_labels))

        return batch_features, batch_labels
Beispiel #32
0
    def _make_dataset(self,
                      binaries_fname_pattern,
                      data_augmentation=False,
                      shuffle=True):
        """Creates a SVHN dataset (helper used by ``.make_*_dataset`` below).

        Args:
            binaries_fname_pattern (str): Pattern of the ``.bin`` files from
                which to load images and labels
                (e.g. ``some/path/data_batch_*.bin``).
            data_augmentation (bool): Whether to apply data augmentation
                operations.
            shuffle (bool): Switch to turn on or off shuffling of the data
                set. It controls both the order of the input files and the
                record-level shuffle buffer. Defaults to ``True``.

        Returns:
            A tf.data.Dataset yielding batches of SVHN data.
        """
        # Record layout: label byte(s) first, raw image bytes right after.
        label_bytes = 1
        label_offset = 0
        num_classes = 10
        depth = 3
        image_size = 32
        image_bytes = image_size * image_size * depth
        record_bytes = label_bytes + label_offset + image_bytes

        def parse_func(raw_record):
            """Function parsing data from raw binary records."""
            # Decode raw_record into its label and image parts.
            record = tf.reshape(
                tf.decode_raw(raw_record, tf.uint8), [record_bytes])
            label = tf.cast(
                tf.slice(record, [label_offset], [label_bytes]), tf.int32)
            # The image starts right behind the label, wherever the label sits.
            image = tf.reshape(
                tf.slice(record, [label_offset + label_bytes], [image_bytes]),
                [image_size, image_size, depth])
            image = tf.cast(image, tf.float32)

            # Add image pre-processing.
            if data_augmentation:
                # Pad by 4 pixels, then crop back to the original size at a
                # random position, followed by random colour distortions.
                image = tf.image.resize_image_with_crop_or_pad(
                    image, image_size + 4, image_size + 4)
                image = tf.random_crop(image, [image_size, image_size, depth])
                image = tf.image.random_brightness(image, max_delta=63. / 255.)
                image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
                image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
            else:
                image = tf.image.resize_image_with_crop_or_pad(
                    image, image_size, image_size)

            image = tf.image.per_image_standardization(image)
            label = tf.squeeze(tf.one_hot(label, depth=num_classes))
            return image, label

        with tf.name_scope(self._name):
            with tf.device('/cpu:0'):
                filenames = tf.matching_files(binaries_fname_pattern)
                if shuffle:
                    # Only randomise the file order when shuffling was asked
                    # for; with shuffle=False the record order stays stable.
                    filenames = tf.random_shuffle(filenames)
                dataset = tf.data.FixedLengthRecordDataset(
                    filenames=filenames, record_bytes=record_bytes)
                dataset = dataset.map(
                    parse_func,
                    num_parallel_calls=(8 if data_augmentation else 4))
                if shuffle:
                    dataset = dataset.shuffle(buffer_size=20000)
                dataset = dataset.batch(self._batch_size, drop_remainder=True)
                dataset = dataset.prefetch(buffer_size=4)
                return dataset
Beispiel #33
0
    def __init__(self,
        filenames,
        vocab_file,
        max_text_length=400,
        batch_size=128,
        skip_header_lines=0,
        text_feature_name='review',
        target_name='class',
        weight_column_name='weight',
        pad_word='#@PAD@#',
        csv_header=['class', 'polarity', 'source', 'fold', 'file', 'review'],
        target_labels=['False', 'True'],
        num_epochs=None,
        multi_threading=True,
        prefetch=1,
        words_to_ids=True,
        shuffle=True
        ):
        """Build the text input pipeline and expose its output tensors.

        Args:
            filenames: File name or glob pattern of the text files to read.
            vocab_file: Path of the vocabulary file; its line count (+2) is
                stored as ``self.n_words``.
            max_text_length: Stored on the instance for use by other methods.
            batch_size: Examples per batch; a falsy value disables batching.
            skip_header_lines: Number of leading lines to drop from EVERY
                matched file.
            text_feature_name: Key of the text feature in the feature dict.
            target_name: Stored on the instance for use by other methods.
            weight_column_name: Stored on the instance for use by other
                methods.
            pad_word: Stored on the instance for use by other methods.
            csv_header: Column names of the input files (copied).
            target_labels: Label vocabulary (copied).
            num_epochs: Number of passes over the data; a falsy value leaves
                the dataset un-repeated.
            multi_threading: Use one map thread per CPU instead of one.
            prefetch: Number of elements to prefetch; falsy disables it.
            words_to_ids: Whether to pass the text feature through
                ``self.words_to_ids``.
            shuffle: Whether to shuffle the lines.

        After construction ``self.features`` and ``self.target`` hold the
        tensors of the one-shot iterator.
        """
        self.filenames = filenames
        with open(vocab_file) as f:
            # +2: presumably reserved entries (e.g. padding / unknown word)
            # -- TODO confirm against the vocabulary lookup code.
            self.n_words = sum(1 for _ in f) + 2

        self.vocab_file = vocab_file
        # Copy the list arguments: the defaults are shared module-level list
        # objects, so storing them directly would let one instance's
        # mutation leak into every other instance.
        self.csv_header = list(csv_header)
        self.max_text_length = max_text_length
        self.text_feature_name = text_feature_name
        self.target_name = target_name
        self.weight_column_name = weight_column_name
        self.pad_word = pad_word
        self.target_labels = list(target_labels)

        num_threads = multiprocessing.cpu_count() if multi_threading else 1

        # A falsy batch_size (None / 0) is accepted further down, so it must
        # not break the buffer arithmetic here.
        buffer_size = 2 * (batch_size or 0) + 1

        print("")
        print("* data input_fn:")
        print("=" * 20)
        print("Input file(s): {}".format(self.filenames))
        print("Batch size: {}".format(batch_size))
        print("Epoch Count: {}".format(num_epochs))
        print("Thread Count: {}".format(num_threads))
        print("Shuffle: {}".format(shuffle))
        print("=" * 20)
        print("")

        # Build one line dataset per matched file so that the header lines
        # are skipped in every file, not only in the first one.
        file_names = tf.data.Dataset.from_tensor_slices(
            tf.matching_files(self.filenames))
        dataset = file_names.flat_map(
            lambda file_name: tf.data.TextLineDataset(file_name).skip(
                skip_header_lines))

        if shuffle:
            dataset = dataset.shuffle(buffer_size)

        dataset = dataset.map(self.parse_csv_row,
                              num_parallel_calls=num_threads)

        if batch_size:
            dataset = dataset.batch(batch_size)

        if num_epochs:
            dataset = dataset.repeat(num_epochs)

        if prefetch:
            dataset = dataset.prefetch(prefetch)

        iterator = dataset.make_one_shot_iterator()

        features, target = iterator.get_next()
        target = self.labels_to_ids(target)

        if words_to_ids:
            features[text_feature_name] = self.words_to_ids(
                features[text_feature_name])

        self.features = features
        self.target = target
Beispiel #34
0
 def match_files(self, type):
     """Return the files whose path starts with `type`.

     A trailing "*" is appended to `type` and the result is used as the glob
     pattern; the op is created as "match" inside the "Match" name scope.
     """
     with tf.name_scope("Match"):
         glob_pattern = type + "*"
         matched = tf.matching_files(glob_pattern, name="match")
     return matched
Beispiel #35
0
    return tf.concat(3, tensor_list)


def concat(val, ax):
    """Call `tf.concat` with `ax` as first and `val` as second argument.

    NOTE(review): this argument order presumably targets the TensorFlow
    releases where the axis came first -- confirm the installed version.
    """
    joined = tf.concat(ax, val)
    return joined


# Queue-based input setup; background on example queues:
# https://stackoverflow.com/questions/37126108/how-to-read-data-into-tensorflow-batches-from-example-queue
initstddev = 1

# Script-level constants; how they are consumed is not visible in this excerpt.
dropout = 0.85
num_epochs = 200
num_batches = 31
batch_size_static = 88

# Tensor of all .jpg files below ./train_data (path is relative to the working directory).
filenames = tf.matching_files("./train_data/*.jpg")
#test on puck4 pls

# NOTE(review): in graph mode this presumably prints the symbolic tensor, not the matched paths.
print(filenames)

# File name queue in fixed (unshuffled) order.
filename_queue = tf.train.string_input_producer(filenames,
                                                shuffle=False,
                                                name="DEMON_FROM_HELL")
# getImage is defined elsewhere in the script; it is handed the queue and returns two values.
images, names = getImage(filename_queue)
# Shuffled alternative, kept for reference:
#batch, labels= tf.train.shuffle_batch([images, names], batch_size=batch_size_static, capacity=1000 + 3 * batch_size_static, allow_smaller_final_batch=True, min_after_dequeue=1000)
# Ordered batching; the last batch may be smaller than batch_size_static.
batch, labels = tf.train.batch([images, names],
                               batch_size=batch_size_static,
                               capacity=1000 + 3 * batch_size_static,
                               allow_smaller_final_batch=True)

save_path = 'TrainedModel'
Beispiel #36
0
def dataset_input_fn(file_names_pattern,
                     file_encoding='csv',
                     mode=tf.estimator.ModeKeys.EVAL,
                     skip_header_lines=0,
                     num_epochs=1,
                     batch_size=200,
                     multi_threading=True):
    """An input function for training or evaluation.
    This uses the Dataset APIs.

    Args:
        file_names_pattern: [str] - file name or file name patterns from which to read the data.
        file_encoding: type of the text files. Can be 'csv' or 'tfrecords'
          (any value other than 'csv' is read as TFRecords).
        mode: tf.estimator.ModeKeys - either TRAIN or EVAL.
            Used to determine whether or not to randomize the order of data.
        skip_header_lines: int set to non-zero in order to skip header lines
          in CSV files. The lines are skipped in every matched file.
        num_epochs: int - how many times through to read the data.
          If None will loop through data indefinitely
        batch_size: int - first dimension size of the Tensors returned by
          input_fn
        multi_threading: boolean - indicator to use multi-threading or not
    Returns:
        A (features, target) pair of tensors taken from a one-shot iterator
        over the batched dataset.
    """

    is_training = mode == tf.estimator.ModeKeys.TRAIN
    shuffle = is_training

    # Only reported in the banner below.
    data_size = parameters.HYPER_PARAMS.train_size if is_training else None

    num_threads = multiprocessing.cpu_count() if multi_threading else 1

    buffer_size = 2 * batch_size + 1

    print("")
    print("* data input_fn:")
    print("================")
    print("Mode: {}".format(mode))
    print("Input file(s): {}".format(file_names_pattern))
    print("Files encoding: {}".format(file_encoding))
    print("Data size: {}".format(data_size))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")

    file_names = tf.matching_files(file_names_pattern)

    if file_encoding == 'csv':
        # One line dataset per file: skip() on a single multi-file
        # TextLineDataset would only drop the header of the first file and
        # feed the remaining headers to the parser as data rows.
        dataset = data.Dataset.from_tensor_slices(file_names).flat_map(
            lambda file_name: data.TextLineDataset(file_name).skip(
                skip_header_lines))
        dataset = dataset.map(parse_csv, num_parallel_calls=num_threads)

    else:
        dataset = data.TFRecordDataset(filenames=file_names)
        dataset = dataset.map(parse_tf_example,
                              num_parallel_calls=num_threads)

    dataset = dataset.map(get_features_target_tuple,
                          num_parallel_calls=num_threads)
    dataset = dataset.map(lambda features, target: (process_features(features), target),
                          num_parallel_calls=num_threads)

    if shuffle:
        dataset = dataset.shuffle(buffer_size)

    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size)
    dataset = dataset.repeat(num_epochs)

    iterator = dataset.make_one_shot_iterator()
    features, target = iterator.get_next()

    return features, target
Beispiel #37
0
def get_edof_training_queue(target_dir, patch_size, batch_size, num_depths=4, color=False,
                            num_threads=4, loop=True, filetype='jpg'):
    """Build a queue-based pipeline of random image patches with depth maps.

    Args:
        target_dir: Directory searched for ``*.<filetype>`` images.
        patch_size: Edge length of the square patch cut from every image.
        batch_size: Number of patches per batch.
        num_depths: Number of depth values sampled into ``depth_bins``.
        color: If true, decode images with 3 channels, otherwise with 1.
        num_threads: Threads used by ``tf.train.batch``.
        loop: If true, cycle over the shuffled files forever; otherwise make
            one ordered pass.
        filetype: Image format, ``'jpg'`` or ``'png'``.

    Returns:
        ``(image_batch, depth_batch, depth_bins)``: the batched patches, the
        batched striped depth-index map, and the list of sampled depth
        tensors.

    Raises:
        ValueError: If ``filetype`` is neither ``'jpg'`` nor ``'png'``.
    """
    # Fail early and clearly; previously an unknown type left `file_list`
    # unbound and surfaced as an unrelated error further down.
    if filetype not in ('jpg', 'png'):
        raise ValueError(
            "Unsupported filetype {!r}; expected 'jpg' or 'png'".format(filetype))

    file_list = tf.matching_files(os.path.join(target_dir, '*.' + filetype))

    filename_queue = tf.train.string_input_producer(file_list,
                                                    num_epochs=None if loop else 1,
                                                    shuffle=True if loop else False)

    image_reader = tf.WholeFileReader()

    _, image_file = image_reader.read(filename_queue)

    if color:
        print("Using color images")
        # Force 3 channels: the batch below is declared with depth 3, so an
        # image stored with another channel count must be converted instead
        # of being decoded as-is.
        channels = 3
    else:
        print("Using black and white images")
        channels = 1
    decode = tf.image.decode_jpeg if filetype == 'jpg' else tf.image.decode_png
    image = decode(image_file, channels=channels)

    image = tf.cast(image, tf.float32)  # Shape [height, width, channels]
    image = tf.expand_dims(image, 0)
    image /= 255.

    # Get the ratio of the patch size to the smallest side of the image
    img_height_width = tf.cast(tf.shape(image)[1:3], tf.float32)

    size_ratio = patch_size / tf.reduce_min(img_height_width)

    # Extract a glimpse from the image
    offset_center = tf.random_uniform([1, 2], minval=0.0 + size_ratio / 2, maxval=1.0 - size_ratio / 2,
                                      dtype=tf.float32)
    offset_center = offset_center * img_height_width

    image = tf.image.extract_glimpse(image, size=[patch_size, patch_size], offsets=offset_center, centered=False,
                                     normalized=False)
    image = tf.squeeze(image, 0)

    # Float literals keep these values independent of integer-division
    # semantics.
    all_depths = tf.convert_to_tensor([1 / 2., 1 / 1.5, 1 / 1., 1 / 0.5, 1000], tf.float32)

    # Sample `num_depths` entries of `all_depths` uniformly, with replacement.
    depth_bins = []
    for _ in range(num_depths):
        depth_idx = tf.multinomial(tf.log([5 * [1. / 5]]), num_samples=1)
        depth_bins.append(all_depths[tf.cast(depth_idx[0][0], tf.int32)])

    # Horizontal stripes holding the index of each depth bin.
    # NOTE(review): the stripes only add up to patch_size rows when
    # patch_size is divisible by num_depths -- confirm with callers.
    test_depth = np.concatenate(
        [np.ones((patch_size // len(depth_bins), patch_size)) * i for i in range(len(depth_bins))], axis=0)[:, :, None]

    if color:
        patch_dims = [patch_size, patch_size, 3]
    else:
        patch_dims = [patch_size, patch_size, 1]

    image_batch, depth_batch = tf.train.batch([image, test_depth],
                                              shapes=[patch_dims, [patch_size, patch_size, 1]],
                                              batch_size=batch_size,
                                              num_threads=num_threads,
                                              capacity=4 * batch_size)
    tf.summary.image("input_img", image_batch)
    tf.summary.scalar("input_img_max", tf.reduce_max(image_batch))
    tf.summary.scalar("input_img_min", tf.reduce_min(image_batch))
    tf.summary.histogram('depth', depth_bins)
    tf.summary.image('depth', tf.cast(depth_batch, tf.float32))

    return image_batch, depth_batch, depth_bins