Example #1
def input_fn(is_training, data_dir, batch_size, num_epochs=1,
             num_parallel_calls=1, multi_gpu=False):
  """Input_fn using the tf.data input pipeline for CIFAR-10 dataset.
  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
    num_parallel_calls: The number of records that are processed in parallel.
      This can be optimized per data set but for generally homogeneous data
      sets, should be approximately the number of available CPU cores.
    multi_gpu: Whether this is run multi-GPU. Note that this is only required
      currently to handle the batch leftovers, and can be removed
      when that is handled directly by Estimator.
  Returns:
    A dataset that can be used for iteration.
  """
  filenames = get_filenames(is_training, data_dir)
  dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)

  num_images = (_NUM_IMAGES['train'] if is_training
                else _NUM_IMAGES['validation'])

  return resnet_run_loop.process_record_dataset(
      dataset, is_training, batch_size, _NUM_IMAGES['train'],
      parse_record, num_epochs, num_parallel_calls,
      examples_per_epoch=num_images, multi_gpu=multi_gpu)
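
All of these examples delegate the shared pipeline work to resnet_run_loop.process_record_dataset from the TensorFlow official models repo. A minimal sketch of what that helper is assumed to do (shuffle, repeat, parse, batch, prefetch); the body below is an approximation, not the verbatim implementation:

import tensorflow as tf

def process_record_dataset(dataset, is_training, batch_size, shuffle_buffer,
                           parse_record_fn, num_epochs=1):
  # Sketch only -- not the official-models implementation.
  if is_training:
    # Shuffle individual records before repeating so each epoch differs.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer)
  dataset = dataset.repeat(num_epochs)
  # Decode and preprocess each record into an (image, label) pair.
  dataset = dataset.map(
      lambda record: parse_record_fn(record, is_training),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.batch(batch_size)
  # Overlap input preprocessing with model execution.
  return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)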
Example #2
def input_fn(is_training, data_dir, batch_size, num_epochs=1):
    """Input function which provides batches for train or eval.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.

  Returns:
    A dataset that can be used for iteration.
  """
    filenames = get_filenames(is_training, data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)

    if is_training:
        # Shuffle the input files
        dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Convert to individual records
    dataset = dataset.flat_map(tf.data.TFRecordDataset)

    return resnet_run_loop.process_record_dataset(dataset, is_training,
                                                  batch_size, _SHUFFLE_BUFFER,
                                                  parse_record, num_epochs)
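
For context, an input_fn like this is typically handed to a tf.estimator.Estimator through a closure. A usage sketch; my_model_fn and the paths are placeholders, not part of the examples above:

import tensorflow as tf

classifier = tf.estimator.Estimator(model_fn=my_model_fn,
                                    model_dir='/tmp/resnet_model')
# Wrap input_fn so Estimator can call it with no arguments.
classifier.train(input_fn=lambda: input_fn(
    is_training=True, data_dir='/tmp/data', batch_size=128, num_epochs=10))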
Example #3
def input_fn(is_training, data_dir, batch_size, num_epochs=1):
    filenames = get_filenames(is_training, data_dir)
    dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)

    return resnet_run_loop.process_record_dataset(dataset, is_training,
                                                  batch_size,
                                                  _NUM_IMAGES['train'],
                                                  parse_record, num_epochs)
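
The get_filenames helper these CIFAR-10 examples call is not shown. It presumably resolves the standard CIFAR-10 binary files, roughly like this sketch (the standard split has five training batches and one test batch):

import os

def get_filenames(is_training, data_dir):
  # CIFAR-10 extracts into a 'cifar-10-batches-bin' subdirectory.
  data_dir = os.path.join(data_dir, 'cifar-10-batches-bin')
  if is_training:
    return [os.path.join(data_dir, 'data_batch_%d.bin' % i)
            for i in range(1, 6)]
  return [os.path.join(data_dir, 'test_batch.bin')]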
Example #4
def input_fn(is_training, data_dir, batch_size, num_epochs=1):
  filenames = get_filenames(is_training, data_dir)
  dataset = tf.data.Dataset.from_tensor_slices(filenames)

  if is_training:
    # Shuffle the input files
    dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

  # Convert to individual records
  dataset = dataset.flat_map(tf.data.TFRecordDataset)

  return resnet_run_loop.process_record_dataset(
      dataset, is_training, batch_size, _SHUFFLE_BUFFER, parse_record,
      num_epochs
  )
Example #5
def input_fn(is_training,
             data_dir,
             batch_size,
             num_epochs=1,
             num_gpus=None,
             dtype=tf.float32,
             mix_up=False,
             oss_load=False):
    """Input function which provides batches for train or eval.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
    num_gpus: The number of gpus used for training.
    dtype: Data type to use for images/features

  Returns:
    A dataset that can be used for iteration.
  """
    mlperf_log.resnet_print(key=mlperf_log.INPUT_ORDER)
    if not oss_load:
        filenames = get_filenames(is_training, data_dir)
    else:
        filenames = get_filenames_oss(is_training)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)

    if is_training:
        # Shuffle the input files
        dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Convert to individual records
    dataset = dataset.flat_map(tf.data.TFRecordDataset)

    return resnet_run_loop.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_SHUFFLE_BUFFER,
        parse_record_fn=parse_record,
        num_epochs=num_epochs,
        num_gpus=num_gpus,
        examples_per_epoch=_NUM_IMAGES['train'] if is_training else None,
        dtype=dtype,
        mix_up=mix_up)
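
The mix_up flag is forwarded to process_record_dataset without further documentation; the name suggests mixup augmentation (Zhang et al., 2018), which blends pairs of examples and their labels. A minimal batch-level sketch in TF 1.x, assuming images and one-hot float labels are already batched; this illustrates the technique, not the code behind the flag:

import tensorflow as tf

def mixup(images, labels, alpha=0.2):
  # Sample one mixing coefficient per batch from a Beta distribution.
  lam = tf.distributions.Beta(alpha, alpha).sample()
  # Pair each example with a random partner from the same batch.
  index = tf.random.shuffle(tf.range(tf.shape(images)[0]))
  mixed_images = lam * images + (1.0 - lam) * tf.gather(images, index)
  mixed_labels = lam * labels + (1.0 - lam) * tf.gather(labels, index)
  return mixed_images, mixed_labels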
Example #6
def input_fn(is_training,
             data_dir,
             batch_size,
             num_epochs=1,
             num_gpus=None,
             dtype=tf.float32):
    """Input function which provides batches for train or eval.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
    num_gpus: The number of gpus used for training.
    dtype: Data type to use for images/features

  Returns:
    A dataset that can be used for iteration.
  """
    filenames = get_filenames(is_training, data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)

    if is_training:
        # Shuffle the input files
        dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Convert to individual records.
    # cycle_length = 10 means 10 files will be read and deserialized in parallel.
    # This number is low enough to not cause too much contention on small systems
    # but high enough to provide the benefits of parallelization. You may want
    # to increase this number if you have a large number of CPU cores.
    dataset = dataset.apply(
        tf.contrib.data.parallel_interleave(tf.data.TFRecordDataset,
                                            cycle_length=10))

    return resnet_run_loop.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_SHUFFLE_BUFFER,
        parse_record_fn=parse_record,
        num_epochs=num_epochs,
        num_gpus=num_gpus,
        examples_per_epoch=_NUM_IMAGES['train'] if is_training else None,
        dtype=dtype)
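
Note that tf.contrib.data.parallel_interleave was removed along with tf.contrib; the same parallel record reading is expressed with the core API, exactly as Example #13 below does:

dataset = dataset.interleave(
    tf.data.TFRecordDataset,
    cycle_length=10,
    num_parallel_calls=tf.data.experimental.AUTOTUNE)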
Example #7
def input_fn(is_training,
             data_dir,
             batch_size,
             num_epochs=1,
             num_parallel_calls=1,
             multi_gpu=False):
    """Input function which provides batches for train or eval.
  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
    num_parallel_calls: The number of records that are processed in parallel.
      This can be optimized per data set but for generally homogeneous data
      sets, should be approximately the number of available CPU cores.
    multi_gpu: Whether this is run multi-GPU. Note that this is only required
      currently to handle the batch leftovers, and can be removed
      when that is handled directly by Estimator.
  Returns:
    A dataset that can be used for iteration.
  """
    filenames = get_filenames(is_training, data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)

    if is_training:
        # Shuffle the input files
        dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

    num_images = (_NUM_IMAGES['train'] if is_training
                  else _NUM_IMAGES['validation'])

    # Convert to individual records
    dataset = dataset.flat_map(tf.data.TFRecordDataset)

    return resnet_run_loop.process_record_dataset(
        dataset,
        is_training,
        batch_size,
        _SHUFFLE_BUFFER,
        parse_record,
        num_epochs,
        num_parallel_calls,
        examples_per_epoch=num_images,
        multi_gpu=multi_gpu)
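
The multi_gpu flag in the docstring above exists so that leftover partial batches can be dropped, keeping the per-replica batch size static. A sketch of the underlying idea; the repo's exact mechanism may differ:

# Dropping the remainder makes the batch dimension static, which
# multi-GPU tower splitting requires.
dataset = dataset.batch(batch_size, drop_remainder=True)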
Example #8
def input_fn(is_training,
             data_path,
             batch_size,
             num_epochs=1,
             num_parallel_calls=1,
             multi_gpu=False):
    dataset = tf.data.TFRecordDataset([data_path],
                                      num_parallel_reads=num_parallel_calls)

    num_images = _NUM_IMAGES['train'] if is_training else _NUM_IMAGES['test']

    return resnet_run_loop.process_record_dataset(
        dataset,
        is_training,
        batch_size,
        num_images,
        parse_record,
        num_epochs,
        num_parallel_calls,
        examples_per_epoch=num_images,
        multi_gpu=multi_gpu)
Example #9
def input_fn_predict(is_training,
                     data_dir,
                     batch_size,
                     num_epochs=1,
                     num_gpus=None,
                     get_one_item=False,
                     conf_matrix=None):
    dir = "/home/yotamg/data/rgb/test"
    filenames = []
    for bin_file in os.listdir(dir):
        if bin_file.endswith(".bin"):
            filenames.append(os.path.join(dir, bin_file))
    dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)
    if get_one_item:
        dataset = dataset.take(100000)
    iterator = dataset.make_initializable_iterator()
    batch_sess = tf.Session()
    next_item = iterator.get_next()
    batch_sess.run(iterator.initializer)
    # The first byte of each fixed-length record is the label.
    label = tf.decode_raw(next_item, tf.uint8)[0]
    cnt = 0
    while True:
        try:
            data = batch_sess.run(label)
            conf_matrix.output_classes.append(data)
            print("Going over sample #", str(cnt))
            cnt += 1
        except tf.errors.OutOfRangeError:
            break

    return resnet_run_loop.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=0,
        parse_record_fn=parse_record,
        num_epochs=num_epochs,
        num_gpus=num_gpus,
        examples_per_epoch=_NUM_IMAGES['train'] if is_training else None)
Example #10
def input_fn(is_training, data_dir, batch_size, num_epochs=1, num_gpus=None):
  """Input function using the tf.data input pipeline for the CIFAR-10 dataset.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
    num_gpus: The number of gpus used for training.

  Returns:
    A dataset that can be used for iteration.
  """
  filenames = get_filenames(is_training, data_dir)
  dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)

  return resnet_run_loop.process_record_dataset(
      dataset=dataset,
      is_training=is_training,
      batch_size=batch_size,
      shuffle_buffer=_NUM_IMAGES['train'],
      parse_record_fn=parse_record,
      num_epochs=num_epochs,
      num_gpus=num_gpus,
      examples_per_epoch=_NUM_IMAGES['train'] if is_training else None
  )
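
For the CIFAR-10 examples built on FixedLengthRecordDataset, parse_record presumably decodes the raw bytes: each record is one label byte followed by a 3x32x32 image stored depth-major. A sketch under that assumption:

import tensorflow as tf

_RECORD_BYTES = 1 + 3 * 32 * 32  # CIFAR-10: label byte + 3072 image bytes

def parse_record(raw_record, is_training, dtype=tf.float32):
  record = tf.io.decode_raw(raw_record, tf.uint8)
  label = tf.cast(record[0], tf.int32)
  # Reshape the flat bytes to [depth, height, width] ...
  image = tf.reshape(record[1:_RECORD_BYTES], [3, 32, 32])
  # ... then transpose to the [height, width, depth] layout models expect.
  # (Training-time augmentation is omitted from this sketch.)
  image = tf.cast(tf.transpose(image, [1, 2, 0]), dtype)
  return image, label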
Example #11
def input_fn(mode,
             data_dir,
             batch_size,
             num_epochs=1,
             num_parallel_calls=1,
             multi_gpu=False,
             n_crop=None):
    filename = os.path.join(data_dir, mode + '.tfrecord')
    dataset = tf.data.TFRecordDataset([filename],
                                      num_parallel_reads=num_parallel_calls)

    num_images = _NUM_IMAGES[mode]

    return resnet_run_loop.process_record_dataset(
        dataset,
        mode == 'train',
        batch_size,
        num_images,
        parse_record,
        num_epochs,
        num_parallel_calls,
        examples_per_epoch=num_images,
        multi_gpu=multi_gpu,
        n_crop=n_crop)
Example #12
def input_fn(is_training,
             data_dir,
             batch_size,
             num_epochs=1,
             num_gpus=None,
             get_one_item=False,
             conf_matrix=None):
    """Input_fn using the tf.data input pipeline for CIFAR-10 dataset.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
    num_gpus: The number of gpus used for training.

  Returns:
    A dataset that can be used for iteration.
  """
    # Instead of get_filenames(is_training, data_dir), read every .bin file
    # from a hardcoded directory.
    bin_dir = "/home/yotamg/data/rgb/"
    filenames = []
    for bin_file in os.listdir(bin_dir):
        if bin_file.endswith(".bin"):
            filenames.append(os.path.join(bin_dir, bin_file))
    dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)

    return resnet_run_loop.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_NUM_IMAGES['train'],
        parse_record_fn=parse_record,
        num_epochs=num_epochs,
        num_gpus=num_gpus,
        examples_per_epoch=_NUM_IMAGES['train'] if is_training else None)
Example #13
def input_fn(is_training,
             data_dir,
             batch_size,
             num_epochs=1,
             dtype=tf.float32,
             datasets_num_private_threads=None,
             parse_record_fn=parse_record,
             input_context=None,
             drop_remainder=False,
             tf_data_experimental_slack=False):
    """Input function which provides batches for train or eval.

    Args:
      is_training: A boolean denoting whether the input is for training.
      data_dir: The directory containing the input data.
      batch_size: The number of samples per batch.
      num_epochs: The number of epochs to repeat the dataset.
      dtype: Data type to use for images/features
      datasets_num_private_threads: Number of private threads for tf.data.
      parse_record_fn: Function to use for parsing the records.
      input_context: A `tf.distribute.InputContext` object passed in by
        `tf.distribute.Strategy`.
      drop_remainder: A boolean indicates whether to drop the remainder of the
        batches. If True, the batch dimension will be static.
      tf_data_experimental_slack: Whether to enable tf.data's
        `experimental_slack` option.

    Returns:
      A dataset that can be used for iteration.
    """
    filenames = get_filenames(is_training, data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)

    if input_context:
        tf.compat.v1.logging.info(
            'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d' % (
                input_context.input_pipeline_id, input_context.num_input_pipelines))
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)

    if is_training:
        # Shuffle the input files
        dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Convert to individual records.
    # cycle_length = 10 means that up to 10 files will be read and deserialized in
    # parallel. You may want to increase this number if you have a large number of
    # CPU cores.
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    return resnet_run_loop.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_SHUFFLE_BUFFER,
        parse_record_fn=parse_record_fn,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        drop_remainder=drop_remainder,
        tf_data_experimental_slack=tf_data_experimental_slack,
    )
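
On how input_context arrives: under tf.distribute in TF 2.x, the strategy calls a dataset function with an InputContext, which input_fn then uses to shard. A wiring sketch; the paths and batch size are placeholders:

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

def dataset_fn(input_context):
  # The strategy supplies input_context; input_fn shards with it.
  return input_fn(is_training=True, data_dir='/tmp/imagenet',
                  batch_size=128, input_context=input_context)

dist_dataset = strategy.experimental_distribute_datasets_from_function(
    dataset_fn)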
Example #14
def input_fn(
        is_training,
        augment,  # *SC*
        rand_labels,  # *SC*
        data_dir,  # *SC*
        batch_size,
        num_epochs=1,
        dtype=tf.float32,
        datasets_num_private_threads=None,
        parse_record_fn=parse_record,
        input_context=None):
    """Input function which provides batches for train or eval.

    Args:
      is_training: A boolean denoting whether the input is for training.
      data_dir: The directory containing the input data.
      batch_size: The number of samples per batch.
      num_epochs: The number of epochs to repeat the dataset.
      dtype: Data type to use for images/features
      datasets_num_private_threads: Number of private threads for tf.data.
      parse_record_fn: Function to use for parsing the records.
      input_context: A `tf.distribute.InputContext` object passed in by
        `tf.distribute.Strategy`.

    Returns:
      A dataset that can be used for iteration.
    """

    # *SC*
    if not rand_labels:
        data_dir = '/om/user/scasper/workspace/models/resnet_cifar/ID0_data_1/data/'
    else:
        data_dir = '/om/user/scasper/workspace/models/resnet_cifar/ID1_data_random/data/'
    # *SC*

    filenames = get_filenames(is_training, data_dir)

    # *SC*
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    # dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)
    # *SC*

    if input_context:
        tf.compat.v1.logging.info(
            'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d'
            % (input_context.input_pipeline_id,
               input_context.num_input_pipelines))
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)

    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    return resnet_run_loop.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        augment=augment,  # *SC*
        batch_size=batch_size,
        shuffle_buffer=NUM_IMAGES['train'],
        parse_record_fn=parse_record_fn,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads)