Ejemplo n.º 1
0
def main(argv):
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            errors.log_and_raise(
                'Command line parsing failure: vcf_stats_report does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'.format(str(argv[1:])), errors.CommandLineError)

    with vcf.VcfReader(FLAGS.input_vcf) as reader:
        sample_names = reader.header.sample_names
        if len(sample_names) != 1:
            raise ValueError(
                'There must be exactly one sample in VCF: {}'.format(
                    FLAGS.input_vcf))
        sample_name = sample_names[0]

        # Missing GT causes error later while reading, so throw a clearer error here
        vcf_columns = [col.id for col in reader.header.formats]
        if 'GT' not in vcf_columns:
            errors.log_and_raise('ERROR: No GT sub-column in VCF.')

        if FLAGS.num_records == -1:
            variants = reader.iterate()
        else:
            variants = itertools.islice(reader.iterate(), FLAGS.num_records)

        vcf_stats.create_vcf_report(variants,
                                    output_basename=FLAGS.outfile_base,
                                    sample_name=sample_name,
                                    vcf_reader=reader)
Ejemplo n.º 2
0
 def test_create_vcf_report(self):
     base_dir = tempfile.mkdtemp()
     outfile_base = os.path.join(base_dir, 'stats_test')
     sample_name = 'test_sample_name'
     with vcf.VcfReader(testdata.GOLDEN_POSTPROCESS_OUTPUT) as reader:
         vcf_stats.create_vcf_report(variants=reader.iterate(),
                                     output_basename=outfile_base,
                                     sample_name=sample_name,
                                     vcf_reader=reader)
     self.assertTrue(
         tf.io.gfile.exists(outfile_base + '.visual_report.html'))
Ejemplo n.º 3
0
def main(argv=()):
  with errors.clean_commandline_error_exit():
    if len(argv) > 1:
      errors.log_and_raise(
          'Command line parsing failure: postprocess_variants does not accept '
          'positional arguments but some are present on the command line: '
          '"{}".'.format(str(argv)), errors.CommandLineError)
    del argv  # Unused.

    if (not FLAGS.nonvariant_site_tfrecord_path) != (not FLAGS.gvcf_outfile):
      errors.log_and_raise(
          'gVCF creation requires both nonvariant_site_tfrecord_path and '
          'gvcf_outfile flags to be set.', errors.CommandLineError)

    proto_utils.uses_fast_cpp_protos_or_die()

    logging_level.set_from_flag()

    fasta_reader = fasta.IndexedFastaReader(
        FLAGS.ref, cache_size=_FASTA_CACHE_SIZE)
    contigs = fasta_reader.header.contigs
    paths = sharded_file_utils.maybe_generate_sharded_filenames(FLAGS.infile)
    # Read one CallVariantsOutput record and extract the sample name from it.
    # Note that this assumes that all CallVariantsOutput protos in the infile
    # contain a single VariantCall within their constituent Variant proto, and
    # that the call_set_name is identical in each of the records.
    record = tf_utils.get_one_example_from_examples_path(
        ','.join(paths), proto=deepvariant_pb2.CallVariantsOutput)
    if record is None:
      logging.info('call_variants_output is empty. Writing out empty VCF.')
      sample_name = dv_constants.DEFAULT_SAMPLE_NAME
      if FLAGS.sample_name:
        logging.info(
            '--sample_name is set in postprocess_variant. Using %s as the '
            'sample name.', FLAGS.sample_name)
        sample_name = FLAGS.sample_name
      variant_generator = iter([])
    else:
      sample_name = _extract_single_sample_name(record)
      temp = tempfile.NamedTemporaryFile()
      start_time = time.time()
      postprocess_variants_lib.process_single_sites_tfrecords(
          contigs, paths, temp.name)
      logging.info('CVO sorting took %s minutes',
                   (time.time() - start_time) / 60)

      logging.info('Transforming call_variants_output to variants.')
      independent_variants = _transform_call_variants_output_to_variants(
          input_sorted_tfrecord_path=temp.name,
          qual_filter=FLAGS.qual_filter,
          multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
          sample_name=sample_name,
          group_variants=FLAGS.group_variants,
          use_multiallelic_model=FLAGS.use_multiallelic_model)
      variant_generator = haplotypes.maybe_resolve_conflicting_variants(
          independent_variants)

    header = dv_vcf_constants.deepvariant_header(
        contigs=contigs, sample_names=[sample_name])
    use_csi = _decide_to_use_csi(contigs)

    start_time = time.time()
    if not FLAGS.nonvariant_site_tfrecord_path:
      logging.info('Writing variants to VCF.')
      write_variants_to_vcf(
          variant_iterable=variant_generator,
          output_vcf_path=FLAGS.outfile,
          header=header)
      if FLAGS.outfile.endswith('.gz'):
        build_index(FLAGS.outfile, use_csi)
      logging.info('VCF creation took %s minutes',
                   (time.time() - start_time) / 60)
    else:
      logging.info('Merging and writing variants to VCF and gVCF.')
      lessthanfn = _get_contig_based_lessthan(contigs)
      with vcf.VcfWriter(
          FLAGS.outfile, header=header, round_qualities=True) as vcf_writer, \
          vcf.VcfWriter(
              FLAGS.gvcf_outfile, header=header, round_qualities=True) \
          as gvcf_writer:
        nonvariant_generator = tfrecord.read_shard_sorted_tfrecords(
            FLAGS.nonvariant_site_tfrecord_path,
            key=_get_contig_based_variant_sort_keyfn(contigs),
            proto=variants_pb2.Variant)
        merge_and_write_variants_and_nonvariants(variant_generator,
                                                 nonvariant_generator,
                                                 lessthanfn, fasta_reader,
                                                 vcf_writer, gvcf_writer)
      if FLAGS.outfile.endswith('.gz'):
        build_index(FLAGS.outfile, use_csi)
      if FLAGS.gvcf_outfile.endswith('.gz'):
        build_index(FLAGS.gvcf_outfile, use_csi)
      logging.info('Finished writing VCF and gVCF in %s minutes.',
                   (time.time() - start_time) / 60)
    if FLAGS.vcf_stats_report:
      outfile_base = _get_base_path(FLAGS.outfile)
      with vcf.VcfReader(FLAGS.outfile) as reader:
        vcf_stats.create_vcf_report(
            variants=reader.iterate(),
            output_basename=outfile_base,
            sample_name=sample_name,
            vcf_reader=reader)
    if record:
      temp.close()