def testGetShapeFromExamplesPathInvalidPath(self):
  # This calls tf.gfile.Glob, which will raise errors.OpError,
  # at least on a Posix filesystem.  Other filesystems might
  # not fail like that, and will return an empty list, which
  # is turned into a different exception.
  with self.assertRaisesRegexp(Exception, '/this/path/does/not'):
    tf_utils.get_shape_from_examples_path('/this/path/does/not/exist')
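The comment above describes two failure modes. A minimal sketch of how a helper could normalize them, assuming the empty-glob case is reported as a ValueError naming the pattern (the real get_shape_from_examples_path may differ):

import tensorflow as tf

def _glob_or_raise(source_paths):
  # tf.gfile.Glob can raise errors.OpError directly for a bad path
  # (observed on Posix filesystems).
  paths = tf.gfile.Glob(source_paths)
  # Other filesystems return an empty list instead, which is turned
  # into a different exception that still names the offending pattern.
  if not paths:
    raise ValueError('Cannot find files matching %s' % source_paths)
  return paths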
Example #2
def testGetShapeFromExamplesPathInvalidPath(self, source_paths,
                                            expected_partial_message):
  # This calls tf.gfile.Glob, which will raise errors.OpError,
  # at least on a Posix filesystem.  Other filesystems might
  # not fail like that, and will return an empty list, which
  # is turned into a different exception.
  with self.assertRaisesRegexp(Exception, expected_partial_message):
    tf_utils.get_shape_from_examples_path(source_paths)
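Since this version takes source_paths and expected_partial_message as arguments, it is presumably driven by a parameterized decorator; a hypothetical example using absl.testing.parameterized (the concrete test cases are made up):

@parameterized.parameters(
    ('/this/path/does/not/exist', '/this/path/does/not'),
    ('/another/missing/path*.tfrecord', '/another/missing/path'),
)
def testGetShapeFromExamplesPathInvalidPath(self, source_paths,
                                            expected_partial_message):
  ...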
Example #3
def testGetShapeFromExamplesPath(self, file_name_to_write,
                                 tfrecord_path_to_match):
   example = example_pb2.Example()
   valid_shape = [1, 2, 3]
   example.features.feature['image/shape'].int64_list.value.extend(valid_shape)
   output_file = test_utils.test_tmpfile(file_name_to_write)
   io_utils.write_tfrecords([example], output_file)
   tf_utils.get_shape_from_examples_path(
       test_utils.test_tmpfile(tfrecord_path_to_match))
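The call above only checks that the shape lookup succeeds for a matching pattern; a caller could additionally assert on the returned value. A sketch (the assertEqual is an illustrative addition, not part of the original test):

    shape = tf_utils.get_shape_from_examples_path(
        test_utils.test_tmpfile(tfrecord_path_to_match))
    self.assertEqual(valid_shape, shape)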
Example #4
def testGetNoneShapeFromEmptyExamplesPath(self, file_name_to_write,
                                          tfrecord_path_to_match):
  output_file = test_utils.test_tmpfile(file_name_to_write)
  io_utils.write_tfrecords([], output_file)
  self.assertIsNone(
      tf_utils.get_shape_from_examples_path(
          test_utils.test_tmpfile(tfrecord_path_to_match)))
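Because an empty source yields None rather than an error, downstream code has to guard against the missing shape. A hypothetical guard, mirroring the falsy-shape check in prepare_inputs below:

shape = tf_utils.get_shape_from_examples_path(path)
if shape is None:
  raise ValueError('No examples in %s; cannot infer a tensor shape.' % path)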
Example #5
  def __init__(self,
               name,
               source,
               num_examples,
               num_classes=DEFAULT_NUM_CLASSES,
               tensor_shape=None):
    """Creates a dataset.

    Args:
      name: str. The name of this dataset. Used to refer to this dataset on
        the command line.
      source: str or list[str]. A file path pattern or a comma-separated list of
        file path patterns pointing to TF.Example PIC images containing the data
        for this dataset.
      num_examples: A positive integer. The number of examples in this dataset.
      num_classes: A positive integer. The number of classes in the labels of
        this dataset. Currently defaults to DEFAULT_NUM_CLASSES.
      tensor_shape: None (which means we get the shape from the first example in
        source), or list of int [height, width, channel] for testing.
    """
    self.name = name
    self.source = source
    self.num_examples = num_examples
    self.num_classes = num_classes
    if tensor_shape:
      self.tensor_shape = tensor_shape
    else:
      self.tensor_shape = tf_utils.get_shape_from_examples_path(source)
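A hypothetical construction of this Dataset (the name, file pattern, counts, and shape are made-up values for illustration):

  dataset = Dataset(
      name='training_set',
      source='/tmp/examples@16.tfrecord.gz',
      num_examples=100000,
      # Pass a shape explicitly to skip reading the first example from source.
      tensor_shape=[100, 221, 7])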
Example #7
def prepare_inputs(source_path, model, batch_size, num_readers=None):
  """Prepares image and encoded_variant ops.

  Reads image / encoded_variant tuples from source_path, extracting the image
  and encoded_variant tensors from each example. The image is decoded from its
  png encoding and preprocessed with model.preprocess_image as well. Every
  example in source_path is read once (num_epoch=1).

  Args:
    source_path: Path to a TFRecord file containing deepvariant tf.Example
      protos.
    model: A DeepVariantModel whose preprocess_image function will be used on
      image.
    batch_size: int > 0. Size of batches to use during inference.
    num_readers: int > 0 or None. Number of parallel readers to use to read
      examples from source_path. If None, uses FLAGS.num_readers instead.

  Returns:
    A tuple of (image, encoded_variant, encoded_alt_allele_indices) TF ops.
    Image is a [height, width, channel] tensor.
    encoded_variants is a tf.string tensor containing a serialized Variant proto
    describing the variant call associated with image.
    encoded_alt_allele_indices is a tf.string tensor containing a serialized
    CallVariantsOutput.AltAlleleIndices proto with the alternate allele
    indices used as "alt" when constructing the image.
  """
  if not num_readers:
    num_readers = FLAGS.num_readers

  tensor_shape = tf_utils.get_shape_from_examples_path(source_path)

  def _parse_single_example(serialized_example):
    """Parses serialized example into a dictionary of de-serialized features."""
    features = tf.parse_single_example(
        serialized_example,
        features={
            'image/encoded': tf.FixedLenFeature([], tf.string),
            'variant/encoded': tf.FixedLenFeature([], tf.string),
            # deepvariant_pb2.CallVariantsOutput.AltAlleleIndices
            'alt_allele_indices/encoded': tf.FixedLenFeature([], tf.string),
        })
    return features

  with tf.name_scope('input'):

    def _preprocess_image(features):
      """Preprocess images (decode, reshape, and apply model-specific steps)."""
      image = features['image/encoded']
      # Bypassing the reshaping and preprocessing if there is no tensor_shape.
      # Currently that could happen when the input file is empty.
      if tensor_shape:
        image = tf.reshape(tf.decode_raw(image, tf.uint8), tensor_shape)
        image = model.preprocess_image(image)
      features['image/encoded'] = image
      return features

    files = tf.gfile.Glob(io_utils.NormalizeToShardedFilePattern(source_path))
    reader_options = io_utils.make_tfrecord_options(files)
    if reader_options.compression_type == (
        tf.python_io.TFRecordCompressionType.GZIP):
      compression_type = 'GZIP'
    else:
      compression_type = None
    dataset = tf.data.TFRecordDataset(files, compression_type=compression_type)
    # Use the resolved num_readers so an explicit argument is honored
    # rather than always re-reading FLAGS.num_readers.
    dataset = dataset.map(
        _parse_single_example, num_parallel_calls=num_readers)
    dataset = dataset.map(
        _preprocess_image, num_parallel_calls=num_readers)
    dataset = dataset.prefetch(10 * batch_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    return (features['image/encoded'], features['variant/encoded'],
            features['alt_allele_indices/encoded'])
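A hypothetical driver for prepare_inputs (the model construction and the path are assumptions). tf.train.MonitoredSession stops cleanly when the one-shot iterator raises OutOfRangeError at the end of the single epoch:

image, encoded_variant, encoded_alt_allele_indices = prepare_inputs(
    source_path='/tmp/examples.tfrecord.gz', model=model, batch_size=64)
with tf.train.MonitoredSession() as sess:
  while not sess.should_stop():
    images, variants, alt_indices = sess.run(
        [image, encoded_variant, encoded_alt_allele_indices])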
Example #8
  def __init__(
      self,
      mode,
      input_file_spec,
      num_examples=None,
      num_classes=dv_constants.NUM_CLASSES,
      max_examples=None,
      tensor_shape=None,
      name=None,
      use_tpu=False,
      input_read_threads=_DEFAULT_INPUT_READ_THREADS,
      input_map_threads=_DEFAULT_INPUT_MAP_THREADS,
      shuffle_buffer_size=_DEFAULT_SHUFFLE_BUFFER_ELEMENTS,
      initial_shuffle_buffer_size=_DEFAULT_INITIAL_SHUFFLE_BUFFER_ELEMENTS,
      prefetch_dataset_buffer_size=_DEFAULT_PREFETCH_BUFFER_BYTES,
      sloppy=True,
      list_files_shuffle=True,
      debugging_true_label_mode=False):
    """Create an DeepVariantInput object, usable as an `input_fn`.

    Args:
      mode: the mode string (from `tf.estimator.ModeKeys`).
      input_file_spec: the input filename for a tfrecord[.gz] file containing
        examples.  Can contain sharding designators.
      num_examples: the number of examples contained in the input file.
        Required for setting learning rate schedule in train/eval only.
      num_classes: The number of classes in the labels of
        this dataset. Currently defaults to DEFAULT_NUM_CLASSES.
      max_examples: The maximum number of examples to use. If None, all
        examples will be used. If not None, only the first
        n = min(max_examples, num_examples) examples will be used. This applies
        to training, where those n examples repeat indefinitely.
      tensor_shape: None (which means we get the shape from the first example in
        source), or list of int [height, width, channel] for testing.
      name: string, name of the dataset.
      use_tpu: use code paths tuned for TPU, in particular protobuf encoding.
        Default False.
      input_read_threads: number of threads for reading data.  Default 32.
      input_map_threads: number of threads for mapping data.  Default 48.
      shuffle_buffer_size: size of the final shuffle buffer, in elements.
        Default 100.
      initial_shuffle_buffer_size: int; the size of the dataset.shuffle buffer
        in elements.  Default is 1024.
      prefetch_dataset_buffer_size: int; the size of the TFRecordDataset buffer
        in bytes.  Default is 16 * 1000 * 1000.
      sloppy: boolean, allow parallel_interleave to be sloppy.  Default True.
      list_files_shuffle: boolean, allow list_files to shuffle.  Default True.
      debugging_true_label_mode: boolean. If true, the input examples are
        created with "training" mode. We'll parse the 'label' field even if
        the `mode` is PREDICT.
    Raises:
      ValueError: if `num_examples` not provided, in a context requiring it.
    """
    self.mode = mode
    self.input_file_spec = input_file_spec
    self.name = name
    self.num_examples = num_examples
    self.num_classes = num_classes
    self.max_examples = max_examples

    self.use_tpu = use_tpu
    self.sloppy = sloppy
    self.list_files_shuffle = list_files_shuffle
    self.input_read_threads = input_read_threads
    self.input_map_threads = input_map_threads
    self.shuffle_buffer_size = shuffle_buffer_size
    self.initial_shuffle_buffer_size = initial_shuffle_buffer_size
    self.prefetch_dataset_buffer_size = prefetch_dataset_buffer_size
    self.debugging_true_label_mode = debugging_true_label_mode
    self.feature_extraction_spec = self.features_extraction_spec_for_mode(
        mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL) or
        debugging_true_label_mode)

    if num_examples is None and mode != tf.estimator.ModeKeys.PREDICT:
      raise ValueError('num_examples argument required for DeepVariantInput '
                       'in TRAIN/EVAL modes.')

    if max_examples is not None:
      if max_examples <= 0:
        raise ValueError(
            'max_examples must be > 0 if not None. Got {}'.format(max_examples))
      # If num_examples is set (i.e., is not None), cap it at max_examples.
      if self.num_examples is not None:
        self.num_examples = min(max_examples, self.num_examples)

    if tensor_shape:
      self.tensor_shape = tensor_shape
    else:
      self.tensor_shape = tf_utils.get_shape_from_examples_path(input_file_spec)
    self.input_files = sharded_file_utils.glob_list_sharded_file_patterns(
        self.input_file_spec)
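Since a DeepVariantInput instance is usable as an `input_fn`, a hypothetical Estimator hookup looks like this (the estimator, file spec, and counts are assumptions):

  train_input = DeepVariantInput(
      mode=tf.estimator.ModeKeys.TRAIN,
      input_file_spec='/tmp/training_examples@64.tfrecord.gz',
      num_examples=1000000)
  estimator.train(input_fn=train_input, max_steps=10000)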