def test_get_input_fn(self):
    """Checks get_input_fn's feature planes and labels for one tiny read.

    Writes a single 5-base read to a temporary TFRecord, runs the
    input_fn once, and verifies the one-hot base plane, the ref-match
    plane, the quality plane, and the label encoding of true_sequence.
    """
    out_path = test_utils.test_tmpfile('test.tfrecord')

    # One read of length 5; position 2 ('G') disagrees with truth ('C').
    read_bases = 'ACGTA'
    truth_bases = 'ACCTA'
    base_quals = [30, 30, 20, 30, 30]

    example = example_pb2.Example()
    feat = example.features.feature
    feat['read_name'].bytes_list.value.append(six.b('test_seq'))
    feat['read_sequence'].int64_list.value.extend(
        ['ACGT'.index(base) for base in read_bases])
    feat['read_qualities'].int64_list.value.extend(base_quals)
    feat['true_sequence'].int64_list.value.extend(
        ['ACGT'.index(base) for base in truth_bases])
    feat['ref_match'].int64_list.value.extend([1, 1, 0, 1, 1])

    with genomics_writer.TFRecordWriter(out_path) as writer:
        writer.write(example)

    features, label = ngs_errors.get_input_fn(
        out_path,
        ngs_read_length=len(read_bases),
        batch_size=1,
        num_epochs=1)()
    with tf.Session() as sess:
        features_val, label_val = sess.run([features, label])
        got = np.array(features_val)
        # Expected layout: (batch, 4 base rows, read length, 3 channels).
        self.assertEqual((1, 4, 5, 3), got.shape)
        # Channel 0: one-hot encoding of the read bases A, C, G, T, A.
        want_one_hot = np.array([[[1, 0, 0, 0, 1],
                                  [0, 1, 0, 0, 0],
                                  [0, 0, 1, 0, 0],
                                  [0, 0, 0, 1, 0]]])
        self.assertTrue(np.array_equal(want_one_hot, got[:, :, :, 0]))
        # Channel 1: ref_match repeated across all four base rows.
        want_match = np.array([[[1, 1, 0, 1, 1]] * 4])
        self.assertTrue(np.array_equal(want_match, got[:, :, :, 1]))
        # Channel 2: read qualities repeated across all four base rows.
        want_quals = np.array([[[30, 30, 20, 30, 30]] * 4])
        self.assertTrue(np.array_equal(want_quals, got[:, :, :, 2]))
        # Labels: true_sequence ACCTA encoded as A=0, C=1, G=2, T=3.
        self.assertTrue(
            np.array_equal(np.array([[0, 1, 1, 3, 0]]),
                           np.array(label_val)))
# ---- Beispiel #2 (Example 2) ----
def make_ngs_error_examples(ref_path,
                            vcf_path,
                            bam_path,
                            examples_out_path,
                            max_reads=None):
    """Converts aligned reads into ngs_errors training tf.Examples.

    Walks every read in the BAM file; each read that passes
    is_usable_training_example (given its overlapping variants and the
    reference bases it spans) is turned into a tf.Example via
    make_example and written to examples_out_path.

    Args:
      ref_path: str. A path to an indexed fasta file.
      vcf_path: str. A path to an indexed VCF file.
      bam_path: str. A path to an SAM/BAM file.
      examples_out_path: str. A path where we will write out examples.
      max_reads: int or None. If not None, we will emit at most max_reads
        examples to examples_out_path.
    """
    # Readers over the reference genome and the known variants.
    ref_reader = fasta.IndexedFastaReader(ref_path)
    vcf_reader = vcf.VcfReader(vcf_path)

    # An empty ReadRequirements proto enables the reader's standard
    # filtering with its default settings.
    read_requirements = reads_pb2.ReadRequirements()
    sam_reader = sam.SamReader(bam_path, read_requirements=read_requirements)

    # Destination for the generated tf.Examples.
    examples_out = genomics_writer.TFRecordWriter(examples_out_path)

    emitted = 0
    # Every reader and the writer is a context manager; opening them in a
    # single `with` guarantees they are all closed when the loop finishes
    # or we return early after hitting max_reads.
    with ref_reader, vcf_reader, sam_reader, examples_out:
        for reads_seen, read in enumerate(sam_reader.iterate(), start=1):
            # Genomic interval (chrom/start/end) covered by this read.
            read_range = utils.read_range(read)

            # Known variants that overlap the read's span.
            overlapping_variants = list(vcf_reader.query(read_range))

            # Reference bases under the read.
            ref_bases = ref_reader.query(read_range)

            # Skip reads that cannot serve as training examples.
            if not is_usable_training_example(read, overlapping_variants,
                                              ref_bases):
                continue

            emitted += 1
            examples_out.write(make_example(read, ref_bases))

            # Progress logging; very chatty when converting many reads.
            logging.info((
                'Added an example for read %s (span=%s) with cigar %s [%d added '
                'of %d total reads]'), read.fragment_name,
                         ranges.to_literal(read_range),
                         cigar.format_cigar_units(read.alignment.cigar),
                         emitted, reads_seen)

            if max_reads is not None and emitted >= max_reads:
                return
# ---- Beispiel #3 (Example 3) ----
def Writer(path, compression_type=None):
  """Thin convenience wrapper around genomics_writer.TFRecordWriter.

  Args:
    path: Output path for the TFRecord file.
    compression_type: Optional compression setting, passed straight through.

  Returns:
    A genomics_writer.TFRecordWriter writing to `path`.
  """
  writer = genomics_writer.TFRecordWriter(path, compression_type)
  return writer