Example #1
0
    def test_round_trip_vcf(self, test_datum_name):
        """Checks that variants survive a write/read round trip unchanged.

        Steps:
          1. Read variants v1 from the test datum with VcfReader.
          2. Write v1 to a temporary file using VcfWriter.
          3. Read that file back with VcfReader as v2.
          4. Compare v1 and v2.

        Args:
          test_datum_name: name of the test data file handed to VcfReader.
        """
        in_file = test_utils.genomics_core_testdata(test_datum_name)
        out_file = test_utils.test_tmpfile('output_' + test_datum_name)

        # Readers are opened as context managers so they are released even
        # when an assertion below fails.
        with vcf.VcfReader(in_file) as v1_reader:
            v1_records = list(v1_reader.iterate())
            # Copy the header while the reader is still open; the writer gets
            # its own independent copy.
            header = copy.deepcopy(v1_reader.header)
        self.assertTrue(v1_records, 'Reader failed to find records')

        writer_options = variants_pb2.VcfWriterOptions()
        with vcf_writer.VcfWriter.to_file(out_file, header,
                                          writer_options) as writer:
            for record in v1_records:
                writer.write(record)

        with vcf.VcfReader(out_file) as v2_reader:
            v2_records = list(v2_reader.iterate())

        self.assertEqual(v1_records, v2_records,
                         'Round-tripped variants not as expected')
Example #2
0
    def setUp(self):
        """Opens the sites VCF without an index and the samples VCF with one."""
        sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
        self.sites_reader = vcf.VcfReader(sites_path, use_index=False)

        samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
        self.samples_reader = vcf.VcfReader(samples_path, use_index=True)
Example #3
0
  def test_vcf_query(self):
    """Queries one range from the input and the indexed output and compares."""
    # The output file needs an index before it is opened for querying.
    tabix.build_index(self.output_file)
    self.input_reader = vcf.VcfReader(self.input_file)
    self.output_reader = vcf.VcfReader(self.output_file)

    query_range = ranges.parse_literal('chr3:100,000-500,000')
    from_input = list(self.input_reader.query(query_range))
    from_output = list(self.output_reader.query(query_range))
    self.assertEqual(from_input, from_output)
Example #4
0
  def test_headerless_vcf(self):
    """Writes a VCF without its header, then reads it back using that header."""
    source_path = test_utils.genomics_core_testdata('test_sites.vcf')
    headerless_path = test_utils.test_tmpfile('output.vcf')
    written = []
    with vcf.VcfReader(source_path) as source_reader:
      source_header = source_reader.header
      with vcf.VcfWriter(
          headerless_path, header=source_header,
          exclude_header=True) as headerless_writer:
        for variant in source_reader:
          written.append(variant)
          headerless_writer.write(variant)

      # The output file carries no header, so the source header is supplied
      # explicitly when reading it back.
      with vcf.VcfReader(headerless_path, header=source_header) as readback:
        self.assertEqual(written, list(readback))
Example #5
0
def make_population_vcf_readers(
    population_vcf_filenames: Sequence[str]
) -> DefaultDict[str, Optional[vcf.VcfReader]]:
  """Creates VcfReaders for the given VCF file paths, organized by reference.

  VcfReaders can be made either from a single VCF that covers all the relevant
  reference sequences or strictly one VCF per reference sequence. By returning
  a defaultdict, any code using the output of this function does not have to
  consider whether there are multiple VCFs or not, it can simply query by
  chromosome and get a reader.

  Args:
    population_vcf_filenames: Paths to files (VCF or VCF.gz) with population
      genotypes.

  Raises:
    ValueError: If there is more than one VCF file containing variants
      from the same chromosome, or if one of several VCF files contains no
      variants (so its chromosome cannot be determined).

  Returns:
    A defaultdict that maps from a reference name to an associated VcfReader.
    If there was only one VCF provided, all references will map to that one
    reader. If more than one VCF was provided, the references will have a
    reader each, while any that were not included will map to None.
  """
  # If only one VCF file is provided.
  if len(population_vcf_filenames) == 1:
    # Build the reader once and hand the same instance back for every
    # chromosome; constructing it inside the default factory would open a new
    # reader for each distinct key that is looked up.
    shared_reader = vcf.VcfReader(population_vcf_filenames[0])
    return collections.defaultdict(lambda: shared_reader)

  # If more than one VCF files are provided (or none at all, in which case
  # every lookup yields None).
  population_vcf_readers = collections.defaultdict(lambda: None)

  for vcf_filename in population_vcf_filenames:
    population_vcf_reader = vcf.VcfReader(vcf_filename, header=None)

    # Get contig name from the first variant in a file. An explicit sentinel
    # avoids reusing the previous file's contig (or hitting an unbound name)
    # when this file has no variants.
    first_variant = next(iter(population_vcf_reader), None)
    if first_variant is None:
      raise ValueError('No variants found in population VCF %s' % vcf_filename)
    reference_name = first_variant.reference_name

    # There should not be more than one VCFs including variants in
    # reference_name. A membership test does not trigger the default factory.
    if reference_name in population_vcf_readers:
      raise ValueError('Variants on %s are included in multiple VCFs' %
                       reference_name)
    population_vcf_readers[reference_name] = population_vcf_reader

  return population_vcf_readers
Example #6
0
  def test_c_reader(self):
    """Checks that c_reader is non-zero for the fixture and tfrecord readers."""
    for fixture_reader in (self.sites_reader, self.samples_reader):
      self.assertNotEqual(fixture_reader.c_reader, 0)

    golden_path = test_utils.genomics_core_testdata(
        'test_samples.vcf.golden.tfrecord')
    golden_reader = vcf.VcfReader(golden_path)
    self.assertNotEqual(golden_reader.c_reader, 0)
def main(argv):
    """Validates the command line and input VCF, then creates the VCF report.

    Args:
      argv: command-line arguments; having more than one element is reported
        as a CommandLineError through errors.log_and_raise.

    Raises:
      ValueError: if the input VCF does not list exactly one sample.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            message = (
                'Command line parsing failure: vcf_stats_report does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".').format(str(argv[1:]))
            errors.log_and_raise(message, errors.CommandLineError)

    with vcf.VcfReader(FLAGS.input_vcf) as reader:
        sample_names = reader.header.sample_names
        if len(sample_names) != 1:
            raise ValueError(
                'There must be exactly one sample in VCF: {}'.format(
                    FLAGS.input_vcf))

        # A missing GT format fails later during reading with a less helpful
        # message, so it is reported up front.
        if not any(fmt.id == 'GT' for fmt in reader.header.formats):
            errors.log_and_raise('ERROR: No GT sub-column in VCF.')

        # -1 means "no limit"; any other value caps the number of records.
        variants = reader.iterate()
        if FLAGS.num_records != -1:
            variants = itertools.islice(variants, FLAGS.num_records)

        vcf_stats.create_vcf_report(variants,
                                    output_basename=FLAGS.outfile_base,
                                    sample_name=sample_names[0],
                                    vcf_reader=reader)
def generate_trained_model_runner(truth_variants, reads, ref,
                                  output_model_proto, output_model_pckl,
                                  exclude_contig, from_contig, random_seed,
                                  indel_weight):
    """Runner for generate_trained_model.

    Args:
      truth_variants: path to the VCF.
      reads: path to the reads BAM.
      ref: path to the reference FASTA.
      output_model_proto: path to write the AlleleCountLinearModel proto.
      output_model_pckl: path to write the LogisticRegression pickle; nothing
        is pickled when this is empty.
      exclude_contig: string identifier of a contig to exclude from training.
      from_contig: string identifier of the contig from which we sample
        baseline.
      random_seed: int used as random seed for reproducibility.
      indel_weight: float of the weight of indels relative to the rest in
        the training.
    """
    # Open all three inputs as context managers so they are released even if
    # data generation or training raises.
    with vcf.VcfReader(truth_variants) as vcf_reader, \
            fasta.IndexedFastaReader(ref) as ref_reader, \
            sam.SamReader(reads) as sam_reader:
        random.seed(random_seed)

        dataframe = generate_data(vcf_reader, ref_reader, sam_reader,
                                  from_contig, exclude_contig)
        # Training stays inside the block in case the generated data still
        # depends on the open readers (not verifiable from this function).
        model = train_model(dataframe, indel_weight=indel_weight)

    if output_model_pckl:
        joblib.dump(model, output_model_pckl)

    model_proto = model_to_proto(model)
    with tf.gfile.GFile(output_model_proto, 'w') as f:
        f.write(text_format.MessageToString(model_proto))
Example #9
0
    def test_roundtrip(self,
                       expected_infos,
                       expected_fmt,
                       expected_fmt1,
                       expected_fmt2,
                       reader_excluded_info=None,
                       reader_excluded_format=None,
                       writer_excluded_info=None,
                       writer_excluded_format=None):
        """Reads a VCF, writes it back out and compares the raw file text.

        The expected text is self.header followed by each record template in
        self.record_format_strings filled in with the given values.

        Args:
          expected_infos: per-record values for the `info` placeholder.
          expected_fmt: value for the `fmt` placeholder, shared by all records.
          expected_fmt1: per-record values for the `efmts1` placeholder.
          expected_fmt2: per-record values for the `efmts2` placeholder.
          reader_excluded_info: passed to VcfReader as excluded_info_fields.
          reader_excluded_format: passed to VcfReader as
            excluded_format_fields.
          writer_excluded_info: passed to VcfWriter as excluded_info_fields.
          writer_excluded_format: passed to VcfWriter as
            excluded_format_fields.
        """
        rendered = []
        per_record_values = zip(self.record_format_strings, expected_infos,
                                expected_fmt1, expected_fmt2)
        for template, info, fmts1, fmts2 in per_record_values:
            rendered.append(
                template.format(info=info,
                                fmt=expected_fmt,
                                efmts1=fmts1,
                                efmts2=fmts2))
        expected = self.header + ''.join(rendered)

        input_path = test_utils.genomics_core_testdata('test_py_roundtrip.vcf')
        with vcf.VcfReader(
                input_path,
                excluded_info_fields=reader_excluded_info,
                excluded_format_fields=reader_excluded_format) as reader:
            variants = list(reader.iterate())
            output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
            with vcf.VcfWriter(
                    output_path,
                    header=reader.header,
                    excluded_info_fields=writer_excluded_info,
                    excluded_format_fields=writer_excluded_format) as writer:
                for variant in variants:
                    writer.write(variant)

        with open(output_path) as written_file:
            actual = written_file.read()
        self.assertEqual(actual, expected)
  def test_vcf_caller_end2end_outputs(self):
    """Compares variants from the golden examples with the proposed VCF.

    The proposed VCF (input) must have the same variants as the VCF output
    converted from the output of make_examples.
    """
    example_variants = list(
        labeled_examples_to_vcf.examples_to_variants(
            testdata.GOLDEN_VCF_CALLER_TRAINING_EXAMPLES))

    # First pass: the keys (like chr20:10099832:A->G) are the same.
    with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as proposed_vcf_reader:
      example_keys = [variant_utils.variant_key(v) for v in example_variants]
      proposed_keys = [
          variant_utils.variant_key(v) for v in proposed_vcf_reader.iterate()
      ]
      self.assertEqual(example_keys, proposed_keys)

    # Second pass over a freshly opened reader: genotypes match once the
    # proposed variants are unphased.
    with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as proposed_vcf_reader:
      example_genotypes = [
          variant_utils.genotype_as_alleles(v) for v in example_variants
      ]
      proposed_genotypes = [
          variant_utils.genotype_as_alleles(
              variant_utils.unphase_all_genotypes(v))
          for v in proposed_vcf_reader.iterate()
      ]
      self.assertEqual(example_genotypes, proposed_genotypes)
    def test_sample_name_flag(self):
        """The VCF written by main() lists exactly the --sample_name value."""
        FLAGS.ref = testdata.CHR20_FASTA
        FLAGS.examples = testdata.GOLDEN_TRAINING_EXAMPLES
        FLAGS.sample_name = 'sample_name'
        FLAGS.output_vcf = test_utils.test_tmpfile('no_sample_name.vcf')

        labeled_examples_to_vcf.main(0)

        with vcf.VcfReader(FLAGS.output_vcf) as vcf_reader:
            written_names = list(vcf_reader.header.sample_names)
            self.assertEqual(written_names, [FLAGS.sample_name])
 def test_create_vcf_report(self):
     """create_vcf_report writes '<basename>.visual_report.html'."""
     outfile_base = os.path.join(tempfile.mkdtemp(), 'stats_test')
     with vcf.VcfReader(testdata.GOLDEN_POSTPROCESS_OUTPUT) as reader:
         vcf_stats.create_vcf_report(variants=reader.iterate(),
                                     output_basename=outfile_base,
                                     sample_name='test_sample_name',
                                     vcf_reader=reader)
     report_path = outfile_base + '.visual_report.html'
     self.assertTrue(tf.io.gfile.exists(report_path))
Example #13
0
def model_evaluation_runner(truth_variants, reads, ref, input_model_pckl,
                            eval_region, output_report_csv):
    """Outputs precision-recall for a sklearn model using AlleleCount features.

    Args:
      truth_variants: path to the VCF.
      reads: path to the reads BAM.
      ref: path to the reference FASTA.
      input_model_pckl: path to read the LogisticRegression pickle from.
      eval_region: str, region to evaluate on in the 'chr:start-end',
        'chr:position' or 'chr' format.
      output_report_csv: path to the output report csv.

    Raises:
      ValueError: if eval_region cannot be parsed.
    """
    # The SAM and FASTA readers are needed by both metric computations, so
    # they stay open for that whole span and are released afterwards, even on
    # error.
    with sam.SamReader(reads) as sam_reader, \
            fasta.IndexedFastaReader(ref) as ref_reader:
        read_reqs = reads_pb2.ReadRequirements(
            min_base_quality=10,
            min_mapping_quality=10,
            min_base_quality_mode=(
                reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT))
        allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
            partition_size=1, read_requirements=read_reqs)

        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code; only pass model files from a trusted source.
        model = joblib.load(input_model_pckl)

        with vcf.VcfReader(truth_variants) as vcf_reader:
            region = ranges.parse_literal(eval_region,
                                          contig_map=ranges.contigs_dict(
                                              ref_reader.header.contigs))
            # Materialise the indels so the VCF can be closed right away.
            true_indels = [
                var for var in vcf_reader.query(region)
                if variant_utils.is_indel(var)
            ]

        precisions = compute_precision(model, true_indels, sam_reader,
                                       ref_reader, allele_counter_options,
                                       _THRESHOLDS, region)
        recalls = compute_effective_recall(model, true_indels, sam_reader,
                                           ref_reader, allele_counter_options,
                                           _THRESHOLDS)

    with tf.gfile.GFile(output_report_csv, 'w') as csvfile:
        fieldnames = ['threshold', 'precision', 'recall']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for threshold in _THRESHOLDS:
            writer.writerow({
                'threshold': threshold,
                'precision': precisions[threshold],
                'recall': recalls[threshold]
            })
Example #14
0
def processing_regions_from_options(options):
    """Computes the calling regions from our options.

    This function reads the contigs from the reference, the reads and (in
    training mode) the truth VCF, reconciles them, and determines the list of
    regions this task should generate examples over.

    Args:
      options: deepvariant.DeepVariantOptions proto containing information
        about our input data sources.

    Raises:
      ValueError: if the regions to call is empty.

    Returns:
      The regions to process, as produced by regions_to_process() (per the
      original documentation, a list of nucleus.genomics.v1.Range protos).
      Confident regions are not returned by this function.
    """
    # Each input is opened only long enough to read the contigs from its
    # header, then closed again instead of being left open.
    with fasta.RefFastaReader(options.reference_filename) as ref_reader:
        ref_contigs = ref_reader.header.contigs
    with sam.SamReader(options.reads_filename) as sam_reader:
        sam_contigs = sam_reader.header.contigs

    # Add in vcf_contigs if in training mode.
    vcf_contigs = None
    if in_training_mode(options):
        with vcf.VcfReader(options.truth_variants_filename) as truth_reader:
            vcf_contigs = truth_reader.header.contigs

    contigs = _ensure_consistent_contigs(ref_contigs, sam_contigs, vcf_contigs,
                                         options.exclude_contigs,
                                         options.min_shared_contigs_basepairs)
    logging.info('Common contigs are %s', [c.name for c in contigs])
    calling_regions = build_calling_regions(ref_contigs,
                                            options.calling_regions,
                                            options.exclude_calling_regions)
    if not calling_regions:
        raise ValueError(
            'The regions to call is empty. Check your --regions and '
            '--exclude_regions flags to make sure they are not '
            'resulting in set of empty region to process. This also '
            'happens if you use "chr20" for a BAM where contig names '
            'don\'t have "chr"s (or vice versa).')
    regions = regions_to_process(
        contigs=contigs,
        partition_size=options.allele_counter_options.partition_size,
        calling_regions=calling_regions,
        task_id=options.task_id,
        num_shards=options.num_shards)

    return regions
Example #15
0
 def test_find_matching_allele_frequency(self, variant, expected_return,
                                         label):
     """Checks the frequencies found for `variant` against `expected_return`.

     Args:
       variant: the variant to look up.
       expected_return: mapping of the expected keys to expected frequencies.
       label: message attached to every assertion in this case.
     """
     ref_reader = fasta.IndexedFastaReader(testdata.GRCH38_FASTA)
     vcf_reader = vcf.VcfReader(testdata.VCF_WITH_ALLELE_FREQUENCIES)
     found = allele_frequency.find_matching_allele_frequency(
         variant, vcf_reader, ref_reader)

     # Same set of keys on both sides.
     found_keys = set(found.keys())
     expected_keys = set(expected_return.keys())
     self.assertSetEqual(found_keys, expected_keys, msg=label)

     # Values only need to be almost equal.
     for key, frequency in found.items():
         self.assertAlmostEqual(frequency, expected_return[key], msg=label)
Example #16
0
    def test_align_to_all_haplotypes(self, window_width):
        """Exercises align_to_all_haplotypes() on one real deletion.

        Args:
          window_width: width assigned to the mocked pileup image creator.
        """
        # align_to_all_haplotypes() will pull from the reference, so choose a
        # real variant. This region was picked to have exactly one known
        # variant:
        #   reference_bases: "AAGAAAGAAAG"
        #   alternate_bases: "A", a deletion of 10 bp
        #   start: 10046177
        #   end: 10046188
        #   reference_name: "chr20"
        region = ranges.parse_literal('chr20:10,046,000-10,046,400')
        truth_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
        variant = list(truth_reader.query(region))[0]

        pic = mock.Mock()
        pic.width = window_width
        pic.half_width = int((pic.width - 1) / 2)
        self.processor.pic = pic

        # A real ref_reader is used so that the reference allele is checked
        # against the reference at the variant's coordinates.
        realigner = mock.Mock()
        realigner.ref_reader = self.ref_reader
        self.processor.realigner = realigner

        read = test_utils.make_read('A' * 101,
                                    start=10046100,
                                    cigar='101M',
                                    quals=[30] * 101)

        realigner.align_to_haplotype = mock.Mock()
        alt_info = self.processor.align_to_all_haplotypes(variant, [read])

        # Both outputs are keyed by alt allele.
        alignments_by_alt = alt_info['alt_alignments']
        sequences_by_alt = alt_info['alt_sequences']
        self.assertCountEqual(alignments_by_alt.keys(), ['A'])
        self.assertCountEqual(sequences_by_alt.keys(), ['A'])

        # Sequence must be the length of the window.
        self.assertLen(sequences_by_alt['A'], self.processor.pic.width)

        # align_to_haplotype should be called once for each alt (1 alt here).
        self.processor.realigner.align_to_haplotype.assert_called_once()

        # If variant reference_bases are wrong, it should raise a ValueError.
        variant.reference_bases = 'G'
        with six.assertRaisesRegex(
                self, ValueError, 'does not match the bases in the reference'):
            self.processor.align_to_all_haplotypes(variant, [read])
Example #17
0
  def _make_labeler_from_options(self):
    """Builds the variant labeler selected by options.labeler_algorithm.

    Returns:
      A PositionalVariantLabeler or a HaplotypeLabeler.

    Raises:
      ValueError: if the algorithm is neither POSITIONAL_LABELER nor
        HAPLOTYPE_LABELER.
    """
    truth_vcf_reader = vcf.VcfReader(
        self.options.truth_variants_filename,
        excluded_format_fields=['GL', 'GQ', 'PL'])
    confident_regions = read_confident_regions(self.options)

    algorithm = self.options.labeler_algorithm
    known = deepvariant_pb2.DeepVariantOptions

    if algorithm == known.POSITIONAL_LABELER:
      return positional_labeler.PositionalVariantLabeler(
          truth_vcf_reader=truth_vcf_reader,
          confident_regions=confident_regions)

    if algorithm == known.HAPLOTYPE_LABELER:
      return haplotype_labeler.HaplotypeLabeler(
          truth_vcf_reader=truth_vcf_reader,
          ref_reader=self.ref_reader,
          confident_regions=confident_regions)

    raise ValueError('Unexpected labeler_algorithm', algorithm)
Example #18
0
 def test_add_allele_frequencies_to_candidates(self, dv_calls,
                                               expected_return, testcase):
     """Checks the allele_frequency attached to the first updated call.

     Args:
       dv_calls: calls passed to add_allele_frequencies_to_candidates.
       expected_return: mapping of the expected keys to expected frequencies.
       testcase: 'valid' to use real readers, 'no VCF' to pass None for both.

     Raises:
       ValueError: if testcase is neither 'valid' nor 'no VCF'.
     """
     # The 'no VCF' case runs without any readers.
     pop_vcf_reader = None
     ref_reader = None
     if testcase == 'valid':
         pop_vcf_reader = vcf.VcfReader(
             testdata.VCF_WITH_ALLELE_FREQUENCIES)
         ref_reader = fasta.IndexedFastaReader(testdata.GRCH38_FASTA)
     elif testcase != 'no VCF':
         raise ValueError('Invalid testcase for parameterized test.')

     updated_calls = list(
         allele_frequency.add_allele_frequencies_to_candidates(
             dv_calls, pop_vcf_reader, ref_reader))
     actual_frequency = updated_calls[0].allele_frequency

     # Same set of keys on both sides.
     actual_keys = set(actual_frequency.keys())
     expected_keys = set(expected_return.keys())
     self.assertSetEqual(actual_keys, expected_keys)

     # Values only need to be almost equal.
     for key in actual_frequency.keys():
         self.assertAlmostEqual(actual_frequency[key], expected_return[key])
Example #19
0
    def test_header_format_mixed_order(self):
        """Tests reading a VCF with unconventional FORMAT field definition.

        Tests reading a VCF in which the properties of the format
        fields are defined in mixed order in the header. For example,

        ##FORMAT=<ID=GT,Type=String,Number=1,Description="GT description">

        (In normal VCFs "Number" should come before "Type".)
        """
        vcf_path = test_utils.genomics_core_testdata(
            'header_format_mixed_order.vcf')
        with vcf.VcfReader(vcf_path) as vreader:
            formats = vreader.header.formats
            variants = list(vreader)

        # The single FORMAT definition is parsed regardless of property order.
        self.assertLen(formats, 1)
        gt_format = formats[0]
        self.assertEqual(gt_format.id, 'GT')
        self.assertEqual(gt_format.number, '1')
        self.assertEqual(gt_format.type, 'String')
        self.assertEqual(gt_format.description, 'GT description')

        # Both records are read, with their genotypes intact.
        self.assertLen(variants, 2)
        first_variant, second_variant = variants[0], variants[1]
        self.assertEqual(first_variant.calls[0].genotype, [0, 1])
        self.assertEqual(second_variant.calls[0].genotype, [1, 1])
Example #20
0
    def _make_labeler_from_options(self):
        """Creates the labeler selected by options.labeler_algorithm.

        Returns:
          A PositionalVariantLabeler, HaplotypeLabeler or
          CustomizedClassesVariantLabeler.

        Raises:
          ValueError: if the customized classes labeler is selected without
            both of its flags set, or if the algorithm is not recognised.
        """
        truth_vcf_reader = vcf.VcfReader(
            self.options.truth_variants_filename,
            excluded_format_fields=['GL', 'GQ', 'PL'])
        confident_regions = read_confident_regions(self.options)

        algorithm = self.options.labeler_algorithm
        known = deepvariant_pb2.DeepVariantOptions

        if algorithm == known.POSITIONAL_LABELER:
            return positional_labeler.PositionalVariantLabeler(
                truth_vcf_reader=truth_vcf_reader,
                confident_regions=confident_regions)

        if algorithm == known.HAPLOTYPE_LABELER:
            return haplotype_labeler.HaplotypeLabeler(
                truth_vcf_reader=truth_vcf_reader,
                ref_reader=self.ref_reader,
                confident_regions=confident_regions)

        if algorithm == known.CUSTOMIZED_CLASSES_LABELER:
            # Both flags are mandatory for this labeler.
            if not (FLAGS.customized_classes_labeler_classes_list
                    and FLAGS.customized_classes_labeler_info_field_name):
                raise ValueError(
                    'For -labeler_algorithm=customized_classes_labeler, '
                    'you need to set '
                    '-customized_classes_labeler_classes_list and '
                    '-customized_classes_labeler_info_field_name.')
            info_field_name = FLAGS.customized_classes_labeler_info_field_name
            return customized_classes_labeler.CustomizedClassesVariantLabeler(
                truth_vcf_reader=truth_vcf_reader,
                confident_regions=confident_regions,
                classes_list=FLAGS.customized_classes_labeler_classes_list,
                info_field_name=info_field_name)

        raise ValueError('Unexpected labeler_algorithm', algorithm)
Example #21
0
    def setUp(self):
        """Opens the sites and samples test VCFs with default reader options."""
        sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
        self.sites_reader = vcf.VcfReader(sites_path)

        samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
        self.samples_reader = vcf.VcfReader(samples_path)
Example #22
0
def main(argv=()):
    """Writes the output VCF, and optionally a gVCF, from the input records.

    Args:
      argv: command-line arguments; having more than one element is reported
        as a CommandLineError through errors.log_and_raise.

    Raises:
      ValueError: if no CallVariantsOutput record can be found in the input.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            errors.log_and_raise(
                'Command line parsing failure: postprocess_variants does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'.format(str(argv)), errors.CommandLineError)
        del argv  # Unused.

        # The two gVCF flags must be set together or not at all: the check
        # fires when exactly one of them is empty.
        if (not FLAGS.nonvariant_site_tfrecord_path) != (
                not FLAGS.gvcf_outfile):
            errors.log_and_raise(
                'gVCF creation requires both nonvariant_site_tfrecord_path and '
                'gvcf_outfile flags to be set.', errors.CommandLineError)

        proto_utils.uses_fast_cpp_protos_or_die()

        logging_level.set_from_flag()

        fasta_reader = fasta.IndexedFastaReader(FLAGS.ref,
                                                cache_size=_FASTA_CACHE_SIZE)
        contigs = fasta_reader.header.contigs
        paths = io_utils.maybe_generate_sharded_filenames(FLAGS.infile)
        # Read one CallVariantsOutput record and extract the sample name from it.
        # Note that this assumes that all CallVariantsOutput protos in the infile
        # contain a single VariantCall within their constituent Variant proto, and
        # that the call_set_name is identical in each of the records.
        record = tf_utils.get_one_example_from_examples_path(
            ','.join(paths), proto=deepvariant_pb2.CallVariantsOutput)
        if record is None:
            raise ValueError('Cannot find any records in {}'.format(
                ','.join(paths)))

        sample_name = _extract_single_sample_name(record)
        header = dv_vcf_constants.deepvariant_header(
            contigs=contigs, sample_names=[sample_name])
        # The temporary file is deleted when this block exits.
        # NOTE(review): the variant generators below presumably read lazily
        # from temp.name, so the VCF must be written inside the block; confirm
        # before moving any of these statements.
        with tempfile.NamedTemporaryFile() as temp:
            postprocess_variants_lib.process_single_sites_tfrecords(
                contigs, paths, temp.name)
            independent_variants = _transform_call_variants_output_to_variants(
                input_sorted_tfrecord_path=temp.name,
                qual_filter=FLAGS.qual_filter,
                multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
                sample_name=sample_name)
            variant_generator = haplotypes.maybe_resolve_conflicting_variants(
                independent_variants)
            write_variants_to_vcf(variant_generator=variant_generator,
                                  output_vcf_path=FLAGS.outfile,
                                  header=header)

        # Also write out the gVCF file if it was provided.
        if FLAGS.nonvariant_site_tfrecord_path:
            nonvariant_generator = io_utils.read_shard_sorted_tfrecords(
                FLAGS.nonvariant_site_tfrecord_path,
                key=_get_contig_based_variant_sort_keyfn(contigs),
                proto=variants_pb2.Variant)
            # The VCF written above is read back and merged with the
            # non-variant sites; the reader stays open until the gVCF has been
            # written because gvcf_variants is a generator over it.
            with vcf.VcfReader(FLAGS.outfile) as variant_reader:
                lessthanfn = _get_contig_based_lessthan(contigs)
                gvcf_variants = (_transform_to_gvcf_record(variant)
                                 for variant in variant_reader.iterate())
                merged_variants = merge_variants_and_nonvariants(
                    gvcf_variants, nonvariant_generator, lessthanfn,
                    fasta_reader)
                write_variants_to_vcf(variant_generator=merged_variants,
                                      output_vcf_path=FLAGS.gvcf_outfile,
                                      header=header)
Example #23
0
def main(argv=()):
  """Postprocesses call_variants output into a VCF and, optionally, a gVCF.

  Reads CallVariantsOutput records from FLAGS.infile, sorts them into a
  temporary file, transforms them into variants and writes those to
  FLAGS.outfile. When FLAGS.nonvariant_site_tfrecord_path is set, the variants
  are merged with the non-variant sites and FLAGS.gvcf_outfile is written as
  well. Outputs whose name ends in '.gz' are indexed afterwards, and a stats
  report is created when FLAGS.vcf_stats_report is set.

  If the input holds no records, an empty VCF is written using
  FLAGS.sample_name when set, or the default sample name otherwise.

  Args:
    argv: Positional command-line arguments. More than one element is reported
      as an errors.CommandLineError through errors.log_and_raise.
  """
  with errors.clean_commandline_error_exit():
    if len(argv) > 1:
      errors.log_and_raise(
          'Command line parsing failure: postprocess_variants does not accept '
          'positional arguments but some are present on the command line: '
          '"{}".'.format(str(argv)), errors.CommandLineError)
    del argv  # Unused.

    # The two gVCF flags must be either both set or both unset.
    if (not FLAGS.nonvariant_site_tfrecord_path) != (not FLAGS.gvcf_outfile):
      errors.log_and_raise(
          'gVCF creation requires both nonvariant_site_tfrecord_path and '
          'gvcf_outfile flags to be set.', errors.CommandLineError)

    proto_utils.uses_fast_cpp_protos_or_die()

    logging_level.set_from_flag()

    fasta_reader = fasta.IndexedFastaReader(
        FLAGS.ref, cache_size=_FASTA_CACHE_SIZE)
    contigs = fasta_reader.header.contigs
    paths = sharded_file_utils.maybe_generate_sharded_filenames(FLAGS.infile)
    # Read one CallVariantsOutput record and extract the sample name from it.
    # Note that this assumes that all CallVariantsOutput protos in the infile
    # contain a single VariantCall within their constituent Variant proto, and
    # that the call_set_name is identical in each of the records.
    record = tf_utils.get_one_example_from_examples_path(
        ','.join(paths), proto=deepvariant_pb2.CallVariantsOutput)

    # Scratch file holding the sorted CallVariantsOutput records. It is only
    # created when there is input, and it must stay open until every output
    # has been written (NOTE(review): the variant generator presumably reads
    # from it lazily -- confirm). The finally clause below guarantees it is
    # closed even when a later step raises.
    temp = None
    try:
      if record is None:
        logging.info('call_variants_output is empty. Writing out empty VCF.')
        sample_name = dv_constants.DEFAULT_SAMPLE_NAME
        if FLAGS.sample_name:
          logging.info(
              '--sample_name is set in postprocess_variant. Using %s as the '
              'sample name.', FLAGS.sample_name)
          sample_name = FLAGS.sample_name
        variant_generator = iter([])
      else:
        sample_name = _extract_single_sample_name(record)
        temp = tempfile.NamedTemporaryFile()
        start_time = time.time()
        postprocess_variants_lib.process_single_sites_tfrecords(
            contigs, paths, temp.name)
        logging.info('CVO sorting took %s minutes',
                     (time.time() - start_time) / 60)

        logging.info('Transforming call_variants_output to variants.')
        independent_variants = _transform_call_variants_output_to_variants(
            input_sorted_tfrecord_path=temp.name,
            qual_filter=FLAGS.qual_filter,
            multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
            sample_name=sample_name,
            group_variants=FLAGS.group_variants,
            use_multiallelic_model=FLAGS.use_multiallelic_model)
        variant_generator = haplotypes.maybe_resolve_conflicting_variants(
            independent_variants)

      header = dv_vcf_constants.deepvariant_header(
          contigs=contigs, sample_names=[sample_name])
      use_csi = _decide_to_use_csi(contigs)

      start_time = time.time()
      if not FLAGS.nonvariant_site_tfrecord_path:
        logging.info('Writing variants to VCF.')
        write_variants_to_vcf(
            variant_iterable=variant_generator,
            output_vcf_path=FLAGS.outfile,
            header=header)
        if FLAGS.outfile.endswith('.gz'):
          build_index(FLAGS.outfile, use_csi)
        logging.info('VCF creation took %s minutes',
                     (time.time() - start_time) / 60)
      else:
        logging.info('Merging and writing variants to VCF and gVCF.')
        lessthanfn = _get_contig_based_lessthan(contigs)
        with vcf.VcfWriter(
            FLAGS.outfile, header=header,
            round_qualities=True) as vcf_writer, vcf.VcfWriter(
                FLAGS.gvcf_outfile, header=header,
                round_qualities=True) as gvcf_writer:
          nonvariant_generator = tfrecord.read_shard_sorted_tfrecords(
              FLAGS.nonvariant_site_tfrecord_path,
              key=_get_contig_based_variant_sort_keyfn(contigs),
              proto=variants_pb2.Variant)
          merge_and_write_variants_and_nonvariants(variant_generator,
                                                   nonvariant_generator,
                                                   lessthanfn, fasta_reader,
                                                   vcf_writer, gvcf_writer)
        # Index only after the writers are closed so the files are complete.
        if FLAGS.outfile.endswith('.gz'):
          build_index(FLAGS.outfile, use_csi)
        if FLAGS.gvcf_outfile.endswith('.gz'):
          build_index(FLAGS.gvcf_outfile, use_csi)
        logging.info('Finished writing VCF and gVCF in %s minutes.',
                     (time.time() - start_time) / 60)
      if FLAGS.vcf_stats_report:
        outfile_base = _get_base_path(FLAGS.outfile)
        with vcf.VcfReader(FLAGS.outfile) as reader:
          vcf_stats.create_vcf_report(
              variants=reader.iterate(),
              output_basename=outfile_base,
              sample_name=sample_name,
              vcf_reader=reader)
    finally:
      if temp is not None:
        temp.close()
Example #24
0
 def setUp(self):
     """Open a reader on test_sites.vcf and keep its field access cache."""
     sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
     reader = vcf.VcfReader(sites_path)
     self.vcf_reader = reader
     self.cache = reader.field_access_cache
Example #25
0
  def test_make_examples_end2end(self,
                                 mode,
                                 num_shards,
                                 test_condition=TestConditions.USE_BAM,
                                 labeler_algorithm=None,
                                 use_fast_pass_aligner=True):
    """Runs make_examples on a small chr20 region and checks its outputs.

    Configures the global FLAGS, runs make_examples_runner once per task, and
    then verifies the candidates, the examples (against golden examples) and,
    in calling mode, the gVCF records (against golden gVCF input) as well as
    concordance with the truth variants.

    Args:
      mode: Either 'calling' or 'training'; anything else fails the test.
      num_shards: Shard count used to build the sharded output file names;
        one task is run for each id in range(max(num_shards, 1)).
      test_condition: TestConditions value selecting what FLAGS.reads is set
        to (a BAM, a CRAM, or two BAM halves joined by a comma).
      labeler_algorithm: If not None, assigned to FLAGS.labeler_algorithm.
      use_fast_pass_aligner: Assigned to FLAGS.use_fast_pass_aligner.
    """
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    # Choose the reads input according to the requested test condition.
    if test_condition == TestConditions.USE_BAM:
      FLAGS.reads = testdata.CHR20_BAM
    elif test_condition == TestConditions.USE_CRAM:
      FLAGS.reads = testdata.CHR20_CRAM
    elif test_condition == TestConditions.USE_MULTI_BAMS:
      FLAGS.reads = ','.join(
          [testdata.CHR20_BAM_FIRST_HALF, testdata.CHR20_BAM_SECOND_HALF])

    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    FLAGS.use_fast_pass_aligner = use_fast_pass_aligner
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    # Calling mode writes gVCF records; training mode needs truth data instead.
    if mode == 'calling':
      FLAGS.gvcf = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # max(..., 1) guarantees at least one iteration, so `options` and
    # `run_info` are always bound when they are used after the loop. Only the
    # values from the last task survive the loop.
    for task_id in range(max(num_shards, 1)):
      FLAGS.task = task_id
      options = make_examples.default_options(add_flags=True)
      # We need to overwrite bam_fname for USE_CRAM test since Golden Set
      # generated from BAM file. BAM filename is stored in candidates. If we
      # don't overwrite default_options variants won't match and test fail.
      options.bam_fname = 'NA12878_S1.chr20.10_10p1mb.bam'
      make_examples_core.make_examples_runner(options)

      # Check that our run_info proto contains the basic fields we'd expect:
      # (a) our options are written to the run_info.options field.
      run_info = make_examples_core.read_make_examples_run_info(
          options.run_info_filename)
      self.assertEqual(run_info.options, options)
      # (b) run_info.resource_metrics is present and contains our hostname.
      self.assertTrue(run_info.HasField('resource_metrics'))
      self.assertEqual(run_info.resource_metrics.host_name, platform.node())

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        tfrecord.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants([call.variant for call in candidates],
                         region,
                         options,
                         is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
      golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
      golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(tfrecord.read_tfrecords(golden_file)))

    if mode == 'calling':
      # Compare the called example variants with the truth set in the region.
      nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      nist_variants = list(nist_reader.query(region))
      self.verify_nist_concordance(example_variants, nist_variants)

      # Check the quality of our generated gvcf file.
      gvcfs = variant_utils.sorted_variants(
          tfrecord.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcfs, region, options, is_gvcf=True)
      self.verify_contiguity(gvcfs, region)
      gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      expected_gvcfs = list(
          tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
      # Despite the name, assertCountEqual checks that all elements match.
      self.assertCountEqual(gvcfs, expected_gvcfs)

    if (mode == 'training' and num_shards == 0 and
        labeler_algorithm != 'positional_labeler'):
      # The positional labeler doesn't track metrics, so don't try to read them
      # in when that's the mode.
      self.assertEqual(
          make_examples_core.read_make_examples_run_info(
              testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO).labeling_metrics,
          run_info.labeling_metrics)