def test_round_trip_vcf(self, test_datum_name):
    """Checks that variants survive a write/read round trip unchanged.

    Steps:
      1. Read variants v1 from the input file with VcfReader.
      2. Write v1 to a temporary VCF using our VcfWriter.
      3. Read the temporary VCF back in with VcfReader to get v2.
      4. Compare v1 and v2.

    Args:
      test_datum_name: name of the test data file to round-trip.
    """
    in_file = test_utils.genomics_core_testdata(test_datum_name)
    out_file = test_utils.test_tmpfile('output_' + test_datum_name)

    # Context managers release the readers even when an assertion fails.
    with vcf.VcfReader(in_file) as v1_reader:
        v1_records = list(v1_reader.iterate())
        self.assertTrue(v1_records, 'Reader failed to find records')
        # Copy the header so it does not depend on the reader's lifetime.
        header = copy.deepcopy(v1_reader.header)

    writer_options = variants_pb2.VcfWriterOptions()
    with vcf_writer.VcfWriter.to_file(out_file, header,
                                      writer_options) as writer:
        for record in v1_records:
            writer.write(record)

    with vcf.VcfReader(out_file) as v2_reader:
        v2_records = list(v2_reader.iterate())

    self.assertEqual(v1_records, v2_records,
                     'Round-tripped variants not as expected')
def setUp(self):
    """Opens the sites VCF without an index and the samples VCF with one."""
    sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
    self.sites_reader = vcf.VcfReader(sites_path, use_index=False)

    samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
    self.samples_reader = vcf.VcfReader(samples_path, use_index=True)
def test_vcf_query(self):
    """Checks that a range query yields the same variants from both files."""
    # The output file is indexed first; the readers are opened afterwards.
    tabix.build_index(self.output_file)
    self.input_reader = vcf.VcfReader(self.input_file)
    self.output_reader = vcf.VcfReader(self.output_file)

    range1 = ranges.parse_literal('chr3:100,000-500,000')
    from_input = list(self.input_reader.query(range1))
    from_output = list(self.output_reader.query(range1))
    self.assertEqual(from_input, from_output)
def test_headerless_vcf(self):
    """Writes a headerless vcf and reads it back out."""
    source_path = test_utils.genomics_core_testdata('test_sites.vcf')
    headerless_path = test_utils.test_tmpfile('output.vcf')
    written = []

    with vcf.VcfReader(source_path) as reader:
        # Copy every record to the headerless file, remembering what was sent.
        with vcf.VcfWriter(
                headerless_path, header=reader.header,
                exclude_header=True) as writer:
            for variant in reader:
                written.append(variant)
                writer.write(variant)

        # The file itself has no header, so the original one is supplied.
        with vcf.VcfReader(
                headerless_path, header=reader.header) as actual_reader:
            self.assertEqual(written, list(actual_reader))
def make_population_vcf_readers(
    population_vcf_filenames: Sequence[str]
) -> DefaultDict[str, Optional[vcf.VcfReader]]:
    """Creates VcfReaders for the given VCF file paths, organized by reference.

    VcfReaders can be made either from a single VCF that covers all the
    relevant reference sequences or strictly one VCF per reference sequence.

    By returning a defaultdict, any code using the output of this function does
    not have to consider whether there are multiple VCFs or not, it can simply
    query by chromosome and get a reader.

    Args:
      population_vcf_filenames: Paths to files (VCF or VCF.gz) with population
        genotypes.

    Raises:
      ValueError: If there is more than one VCF file containing variants from
        the same chromosome, or if one of several VCFs contains no variants
        (its reference sequence cannot be determined).

    Returns:
      A defaultdict that maps from a reference name to an associated VcfReader.
      If there was only one VCF provided, every reference maps to a reader over
      that one file; the reader is created on first lookup of each reference
      name. If more than one VCF was provided, the references will have a
      reader each, while any that were not included will map to None.
    """
    # If only one VCF file is provided.
    if len(population_vcf_filenames) == 1:
        # The defaultdict allows later code to query for any chromosome and
        # still get a reader over the same file. This is great for
        # compatibility with the multi-VCF case below.
        # NOTE(review): the factory opens a new VcfReader for every distinct
        # reference name that is looked up, not one shared instance -- confirm
        # whether sharing a single reader would be preferable.
        return collections.defaultdict(
            lambda: vcf.VcfReader(population_vcf_filenames[0]))

    # If more than one VCF files are provided.
    population_vcf_readers = collections.defaultdict(lambda: None)
    for vcf_filename in population_vcf_filenames:
        population_vcf_reader = vcf.VcfReader(vcf_filename, header=None)

        # Get contig name from the first variant in a file. An empty file has
        # no first variant; fail loudly rather than hitting an unbound name or
        # reusing the contig name left over from the previous file.
        first_variant = next(iter(population_vcf_reader), None)
        if first_variant is None:
            raise ValueError(
                'No variants found in population VCF %s' % vcf_filename)
        reference_name = first_variant.reference_name

        # There should not be more than one VCFs including variants in
        # reference_name.
        if population_vcf_readers.get(reference_name):
            raise ValueError('Variants on %s are included in multiple VCFs' %
                             reference_name)
        population_vcf_readers[reference_name] = population_vcf_reader
    return population_vcf_readers
def test_c_reader(self):
    """Checks that each reader exposes a c_reader that is not 0."""
    self.assertNotEqual(self.sites_reader.c_reader, 0)
    self.assertNotEqual(self.samples_reader.c_reader, 0)

    # A reader opened on a TFRecord file must expose one as well.
    golden_path = test_utils.genomics_core_testdata(
        'test_samples.vcf.golden.tfrecord')
    tfrecord_reader = vcf.VcfReader(golden_path)
    self.assertNotEqual(tfrecord_reader.c_reader, 0)
def main(argv):
    """Validates the command line and input VCF, then creates the VCF report.

    Args:
      argv: command-line arguments; more than one element is rejected.

    Raises:
      ValueError: if the input VCF does not contain exactly one sample.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            errors.log_and_raise(
                'Command line parsing failure: vcf_stats_report does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'.format(str(argv[1:])), errors.CommandLineError)

    with vcf.VcfReader(FLAGS.input_vcf) as reader:
        sample_names = reader.header.sample_names
        if len(sample_names) != 1:
            raise ValueError(
                'There must be exactly one sample in VCF: {}'.format(
                    FLAGS.input_vcf))
        sample_name = sample_names[0]

        # Missing GT causes error later while reading, so throw a clearer
        # error here.
        format_ids = [col.id for col in reader.header.formats]
        if 'GT' not in format_ids:
            errors.log_and_raise('ERROR: No GT sub-column in VCF.')

        # A num_records of -1 means "use every record".
        variants = (
            reader.iterate() if FLAGS.num_records == -1 else
            itertools.islice(reader.iterate(), FLAGS.num_records))

        vcf_stats.create_vcf_report(
            variants,
            output_basename=FLAGS.outfile_base,
            sample_name=sample_name,
            vcf_reader=reader)
def generate_trained_model_runner(truth_variants, reads, ref,
                                  output_model_proto, output_model_pckl,
                                  exclude_contig, from_contig, random_seed,
                                  indel_weight):
    """Runner for generate_trained_model.

    Args:
      truth_variants: path to the VCF.
      reads: path to the reads BAM.
      ref: path to the reference FASTA.
      output_model_proto: path to write the AlleleCountLinearModel proto.
      output_model_pckl: path to write the LogisticRegression pickle. Nothing
        is pickled when this is empty.
      exclude_contig: string identifier of a contig to exclude from training,
      from_contig: string identifier of the contig from which we sample
        baseline.
      random_seed: int used as random seed for reproducibility.
      indel_weight: float of the weight of indels relative to the rest in the
        training.
    """
    # The truth VCF is only needed while the data is generated and the model
    # is fitted; the context manager closes it afterwards (and on errors).
    with vcf.VcfReader(truth_variants) as vcf_reader:
        ref_reader = fasta.IndexedFastaReader(ref)
        sam_reader = sam.SamReader(reads)

        random.seed(random_seed)
        dataframe = generate_data(vcf_reader, ref_reader, sam_reader,
                                  from_contig, exclude_contig)
        model = train_model(dataframe, indel_weight=indel_weight)

    if output_model_pckl:
        joblib.dump(model, output_model_pckl)

    model_proto = model_to_proto(model)
    with tf.gfile.GFile(output_model_proto, 'w') as f:
        f.write(text_format.MessageToString(model_proto))
def test_roundtrip(self,
                   expected_infos,
                   expected_fmt,
                   expected_fmt1,
                   expected_fmt2,
                   reader_excluded_info=None,
                   reader_excluded_format=None,
                   writer_excluded_info=None,
                   writer_excluded_format=None):
    """Reads a VCF, writes it back out and compares the text to expectations.

    Args:
      expected_infos: per-record INFO text substituted into the templates.
      expected_fmt: FORMAT text shared by every record template.
      expected_fmt1: per-record text substituted for `efmts1`.
      expected_fmt2: per-record text substituted for `efmts2`.
      reader_excluded_info: passed to VcfReader as excluded_info_fields.
      reader_excluded_format: passed to VcfReader as excluded_format_fields.
      writer_excluded_info: passed to VcfWriter as excluded_info_fields.
      writer_excluded_format: passed to VcfWriter as excluded_format_fields.
    """
    # Render the expected file body from the per-record templates.
    rendered = []
    for template, info, e1, e2 in zip(self.record_format_strings,
                                      expected_infos, expected_fmt1,
                                      expected_fmt2):
        rendered.append(
            template.format(info=info, fmt=expected_fmt, efmts1=e1, efmts2=e2))
    expected = self.header + ''.join(rendered)

    input_path = test_utils.genomics_core_testdata('test_py_roundtrip.vcf')
    with vcf.VcfReader(
            input_path,
            excluded_info_fields=reader_excluded_info,
            excluded_format_fields=reader_excluded_format) as reader:
        records = list(reader.iterate())
        output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
        with vcf.VcfWriter(
                output_path,
                header=reader.header,
                excluded_info_fields=writer_excluded_info,
                excluded_format_fields=writer_excluded_format) as writer:
            for variant in records:
                writer.write(variant)

    with open(output_path) as f:
        actual = f.read()
    self.assertEqual(actual, expected)
def test_vcf_caller_end2end_outputs(self):
    """Checks the proposed VCF against variants converted from the examples.

    Confirms that the proposed VCF (input) has the same variants as the VCF
    output converted from the output of make_examples.
    """
    variants = list(
        labeled_examples_to_vcf.examples_to_variants(
            testdata.GOLDEN_VCF_CALLER_TRAINING_EXAMPLES))

    with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as proposed_vcf_reader:
        # This checks the keys (like chr20:10099832:A->G) are the same.
        example_keys = [variant_utils.variant_key(v1) for v1 in variants]
        proposed_keys = [
            variant_utils.variant_key(v2)
            for v2 in proposed_vcf_reader.iterate()
        ]
        self.assertEqual(example_keys, proposed_keys)

    with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as proposed_vcf_reader:
        # Genotypes from the VCF are unphased before being compared.
        example_genotypes = [
            variant_utils.genotype_as_alleles(v1) for v1 in variants
        ]
        proposed_genotypes = [
            variant_utils.genotype_as_alleles(
                variant_utils.unphase_all_genotypes(v2))
            for v2 in proposed_vcf_reader.iterate()
        ]
        self.assertEqual(example_genotypes, proposed_genotypes)
def test_sample_name_flag(self):
    """Checks that FLAGS.sample_name is the only sample in the output VCF."""
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.examples = testdata.GOLDEN_TRAINING_EXAMPLES
    FLAGS.sample_name = 'sample_name'
    FLAGS.output_vcf = test_utils.test_tmpfile('no_sample_name.vcf')

    labeled_examples_to_vcf.main(0)

    with vcf.VcfReader(FLAGS.output_vcf) as vcf_reader:
        written_samples = list(vcf_reader.header.sample_names)
        self.assertEqual(written_samples, [FLAGS.sample_name])
def test_create_vcf_report(self):
    """Checks that create_vcf_report writes the visual report HTML file."""
    outfile_base = os.path.join(tempfile.mkdtemp(), 'stats_test')
    sample_name = 'test_sample_name'

    with vcf.VcfReader(testdata.GOLDEN_POSTPROCESS_OUTPUT) as reader:
        vcf_stats.create_vcf_report(
            variants=reader.iterate(),
            output_basename=outfile_base,
            sample_name=sample_name,
            vcf_reader=reader)

    report_path = outfile_base + '.visual_report.html'
    self.assertTrue(tf.io.gfile.exists(report_path))
def model_evaluation_runner(truth_variants, reads, ref, input_model_pckl,
                            eval_region, output_report_csv):
    """Outputs precision-recall for a sklearn model using AlleleCount features.

    Args:
      truth_variants: path to the VCF.
      reads: path to the reads BAM.
      ref: path to the reference FASTA.
      input_model_pckl: path to read the LogisticRegression pickle from.
      eval_region: str, region to evaluate on in the 'chr:start-end',
        'chr:position' or 'chr' format.
      output_report_csv: path to the output report csv.

    Raises:
      ValueError: if eval_region cannot be parsed.
    """
    sam_reader = sam.SamReader(reads)
    ref_reader = fasta.IndexedFastaReader(ref)

    allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
        partition_size=1,
        read_requirements=reads_pb2.ReadRequirements(
            min_base_quality=10,
            min_mapping_quality=10,
            min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT))

    model = joblib.load(input_model_pckl)

    # Only indels inside the evaluation region are used as truth.
    with vcf.VcfReader(truth_variants) as vcf_reader:
        contig_map = ranges.contigs_dict(ref_reader.header.contigs)
        region = ranges.parse_literal(eval_region, contig_map=contig_map)
        true_indels = [
            candidate for candidate in vcf_reader.query(region)
            if variant_utils.is_indel(candidate)
        ]

    precisions = compute_precision(model, true_indels, sam_reader, ref_reader,
                                   allele_counter_options, _THRESHOLDS, region)
    recalls = compute_effective_recall(model, true_indels, sam_reader,
                                       ref_reader, allele_counter_options,
                                       _THRESHOLDS)

    # One CSV row per threshold.
    with tf.gfile.GFile(output_report_csv, 'w') as csvfile:
        report = csv.DictWriter(
            csvfile, fieldnames=['threshold', 'precision', 'recall'])
        report.writeheader()
        for cutoff in _THRESHOLDS:
            row = {
                'threshold': cutoff,
                'precision': precisions[cutoff],
                'recall': recalls[cutoff],
            }
            report.writerow(row)
def processing_regions_from_options(options):
    """Computes the calling regions from our options.

    Reads the contigs of the reference and of the reads (plus those of the
    truth VCF in training mode), reconciles them, and combines them with the
    region specifications in `options` to determine the regions we should
    generate examples over.

    Args:
      options: deepvariant.DeepVariantOptions proto containing information
        about our input data sources.

    Raises:
      ValueError: if the regions to call is empty.

    Returns:
      The regions to process, as computed by regions_to_process() for this
      task. Only this single value is returned.
    """
    ref_contigs = fasta.RefFastaReader(
        options.reference_filename).header.contigs
    sam_contigs = sam.SamReader(options.reads_filename).header.contigs

    # The truth VCF contigs only take part in training mode.
    vcf_contigs = (
        vcf.VcfReader(options.truth_variants_filename).header.contigs
        if in_training_mode(options) else None)

    contigs = _ensure_consistent_contigs(ref_contigs, sam_contigs, vcf_contigs,
                                         options.exclude_contigs,
                                         options.min_shared_contigs_basepairs)
    logging.info('Common contigs are %s', [c.name for c in contigs])

    calling_regions = build_calling_regions(ref_contigs,
                                            options.calling_regions,
                                            options.exclude_calling_regions)
    if not calling_regions:
        raise ValueError(
            'The regions to call is empty. Check your --regions and '
            '--exclude_regions flags to make sure they are not '
            'resulting in set of empty region to process. This also '
            'happens if you use "chr20" for a BAM where contig names '
            'don\'t have "chr"s (or vice versa).')

    return regions_to_process(
        contigs=contigs,
        partition_size=options.allele_counter_options.partition_size,
        calling_regions=calling_regions,
        task_id=options.task_id,
        num_shards=options.num_shards)
def test_find_matching_allele_frequency(self, variant, expected_return, label):
    """Checks the allele frequencies matched for `variant`.

    Args:
      variant: the variant to look up in the population VCF.
      expected_return: mapping of the expected keys to expected frequencies.
      label: message attached to assertion failures to identify the case.
    """
    ref_reader = fasta.IndexedFastaReader(testdata.GRCH38_FASTA)
    # The context manager closes the VCF reader once the lookup is done.
    with vcf.VcfReader(testdata.VCF_WITH_ALLELE_FREQUENCIES) as vcf_reader:
        allele_frequencies = allele_frequency.find_matching_allele_frequency(
            variant, vcf_reader, ref_reader)

    # Compare keys.
    self.assertSetEqual(
        set(allele_frequencies.keys()), set(expected_return.keys()), msg=label)
    # Compare values (almost equal).
    for key in allele_frequencies.keys():
        self.assertAlmostEqual(
            allele_frequencies[key], expected_return[key], msg=label)
def test_align_to_all_haplotypes(self, window_width):
    """Checks align_to_all_haplotypes() on a real deletion variant.

    Args:
      window_width: width assigned to the mocked pileup image creator.
    """
    # align_to_all_haplotypes() will pull from the reference, so choose a
    # real variant.
    region = ranges.parse_literal('chr20:10,046,000-10,046,400')
    # The reader is only needed for this one query, so close it right after.
    with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as nist_reader:
        nist_variants = list(nist_reader.query(region))
    # We picked this region to have exactly one known variant:
    # reference_bases: "AAGAAAGAAAG"
    # alternate_bases: "A", a deletion of 10 bp
    # start: 10046177
    # end: 10046188
    # reference_name: "chr20"
    variant = nist_variants[0]

    self.processor.pic = mock.Mock()
    self.processor.pic.width = window_width
    self.processor.pic.half_width = int((self.processor.pic.width - 1) / 2)

    self.processor.realigner = mock.Mock()
    # Using a real ref_reader to test that the reference allele matches
    # between the variant and the reference at the variant's coordinates.
    self.processor.realigner.ref_reader = self.ref_reader

    read = test_utils.make_read(
        'A' * 101, start=10046100, cigar='101M', quals=[30] * 101)

    self.processor.realigner.align_to_haplotype = mock.Mock()
    alt_info = self.processor.align_to_all_haplotypes(variant, [read])
    hap_alignments = alt_info['alt_alignments']
    hap_sequences = alt_info['alt_sequences']
    # Both outputs are keyed by alt allele.
    self.assertCountEqual(hap_alignments.keys(), ['A'])
    self.assertCountEqual(hap_sequences.keys(), ['A'])

    # Sequence must be the length of the window.
    self.assertLen(hap_sequences['A'], self.processor.pic.width)

    # align_to_haplotype should be called once for each alt (1 alt here).
    self.processor.realigner.align_to_haplotype.assert_called_once()

    # If variant reference_bases are wrong, it should raise a ValueError.
    variant.reference_bases = 'G'
    with six.assertRaisesRegex(
            self, ValueError, 'does not match the bases in the reference'):
        self.processor.align_to_all_haplotypes(variant, [read])
def _make_labeler_from_options(self):
    """Builds the variant labeler selected by options.labeler_algorithm.

    Returns:
      A PositionalVariantLabeler or a HaplotypeLabeler.

    Raises:
      ValueError: if the labeler algorithm is neither of the two handled here.
    """
    # The GL, GQ and PL format fields are excluded when reading the truth VCF.
    truth_vcf_reader = vcf.VcfReader(
        self.options.truth_variants_filename,
        excluded_format_fields=['GL', 'GQ', 'PL'])
    confident_regions = read_confident_regions(self.options)

    algorithm = self.options.labeler_algorithm
    if algorithm == deepvariant_pb2.DeepVariantOptions.POSITIONAL_LABELER:
        return positional_labeler.PositionalVariantLabeler(
            truth_vcf_reader=truth_vcf_reader,
            confident_regions=confident_regions)

    if algorithm == deepvariant_pb2.DeepVariantOptions.HAPLOTYPE_LABELER:
        return haplotype_labeler.HaplotypeLabeler(
            truth_vcf_reader=truth_vcf_reader,
            ref_reader=self.ref_reader,
            confident_regions=confident_regions)

    raise ValueError('Unexpected labeler_algorithm', algorithm)
def test_add_allele_frequencies_to_candidates(self, dv_calls, expected_return,
                                              testcase):
    """Checks the allele frequency attached to the first updated call.

    Args:
      dv_calls: calls passed to add_allele_frequencies_to_candidates.
      expected_return: mapping of the expected keys to expected frequencies.
      testcase: 'valid' to use the population VCF and reference readers, or
        'no VCF' to pass None for both.

    Raises:
      ValueError: if `testcase` is neither 'valid' nor 'no VCF'.
    """
    if testcase not in ('valid', 'no VCF'):
        raise ValueError('Invalid testcase for parameterized test.')

    pop_vcf_reader = None
    ref_reader = None
    if testcase == 'valid':
        pop_vcf_reader = vcf.VcfReader(testdata.VCF_WITH_ALLELE_FREQUENCIES)
        ref_reader = fasta.IndexedFastaReader(testdata.GRCH38_FASTA)

    updated_dv_call = list(
        allele_frequency.add_allele_frequencies_to_candidates(
            dv_calls, pop_vcf_reader, ref_reader))

    actual_frequency = updated_dv_call[0].allele_frequency
    # Compare keys.
    self.assertSetEqual(
        set(actual_frequency.keys()), set(expected_return.keys()))
    # Compare values (almost equal).
    for allele in actual_frequency.keys():
        self.assertAlmostEqual(actual_frequency[allele],
                               expected_return[allele])
def test_header_format_mixed_order(self):
    """Tests reading a VCF with unconventional FORMAT field definition.

    Tests reading a VCF in which the properties of the format fields are
    defined in mixed order in the header. For example,

    ##FORMAT=<ID=GT,Type=String,Number=1,Description="GT description">

    (In normal VCFs "Number" should come before "Type".)
    """
    mixed_order_path = test_utils.genomics_core_testdata(
        'header_format_mixed_order.vcf')
    with vcf.VcfReader(mixed_order_path) as vreader:
        formats = vreader.header.formats
        variants = list(vreader)

        # The single FORMAT definition must be parsed field by field.
        self.assertLen(formats, 1)
        gt_format = formats[0]
        self.assertEqual(gt_format.id, 'GT')
        self.assertEqual(gt_format.number, '1')
        self.assertEqual(gt_format.type, 'String')
        self.assertEqual(gt_format.description, 'GT description')

        # Both records must still carry their genotypes.
        self.assertLen(variants, 2)
        self.assertEqual(variants[0].calls[0].genotype, [0, 1])
        self.assertEqual(variants[1].calls[0].genotype, [1, 1])
def _make_labeler_from_options(self):
    """Creates the labeler from options.

    Returns:
      A PositionalVariantLabeler, HaplotypeLabeler or
      CustomizedClassesVariantLabeler, chosen by options.labeler_algorithm.

    Raises:
      ValueError: if the customized-classes labeler is selected without both
        of its flags set, or if the labeler algorithm is not recognised.
    """
    # The GL, GQ and PL format fields are excluded when reading the truth VCF.
    truth_vcf_reader = vcf.VcfReader(
        self.options.truth_variants_filename,
        excluded_format_fields=['GL', 'GQ', 'PL'])
    confident_regions = read_confident_regions(self.options)

    algorithm = self.options.labeler_algorithm
    if algorithm == deepvariant_pb2.DeepVariantOptions.POSITIONAL_LABELER:
        return positional_labeler.PositionalVariantLabeler(
            truth_vcf_reader=truth_vcf_reader,
            confident_regions=confident_regions)

    if algorithm == deepvariant_pb2.DeepVariantOptions.HAPLOTYPE_LABELER:
        return haplotype_labeler.HaplotypeLabeler(
            truth_vcf_reader=truth_vcf_reader,
            ref_reader=self.ref_reader,
            confident_regions=confident_regions)

    if (algorithm ==
            deepvariant_pb2.DeepVariantOptions.CUSTOMIZED_CLASSES_LABELER):
        # Both flags are mandatory for this labeler.
        if not (FLAGS.customized_classes_labeler_classes_list and
                FLAGS.customized_classes_labeler_info_field_name):
            raise ValueError(
                'For -labeler_algorithm=customized_classes_labeler, '
                'you need to set '
                '-customized_classes_labeler_classes_list and '
                '-customized_classes_labeler_info_field_name.')
        return customized_classes_labeler.CustomizedClassesVariantLabeler(
            truth_vcf_reader=truth_vcf_reader,
            confident_regions=confident_regions,
            classes_list=FLAGS.customized_classes_labeler_classes_list,
            info_field_name=FLAGS.customized_classes_labeler_info_field_name)

    raise ValueError('Unexpected labeler_algorithm', algorithm)
def setUp(self):
    """Opens readers on the sites VCF and the gzipped samples VCF."""
    sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
    self.sites_reader = vcf.VcfReader(sites_path)

    samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
    self.samples_reader = vcf.VcfReader(samples_path)
def main(argv=()):
    """Turns CallVariantsOutput records into a VCF and, optionally, a gVCF.

    Args:
      argv: command-line arguments; more than one element is rejected with a
        CommandLineError.

    Raises:
      ValueError: if no record can be read from the files in FLAGS.infile.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            errors.log_and_raise(
                'Command line parsing failure: postprocess_variants does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'.format(str(argv)), errors.CommandLineError)
        del argv  # Unused.

        # The two gVCF flags must be either both set or both unset.
        if (not FLAGS.nonvariant_site_tfrecord_path) != (
                not FLAGS.gvcf_outfile):
            errors.log_and_raise(
                'gVCF creation requires both nonvariant_site_tfrecord_path and '
                'gvcf_outfile flags to be set.', errors.CommandLineError)

        proto_utils.uses_fast_cpp_protos_or_die()
        logging_level.set_from_flag()

        fasta_reader = fasta.IndexedFastaReader(FLAGS.ref,
                                                cache_size=_FASTA_CACHE_SIZE)
        contigs = fasta_reader.header.contigs
        paths = io_utils.maybe_generate_sharded_filenames(FLAGS.infile)
        # Read one CallVariantsOutput record and extract the sample name from
        # it. Note that this assumes that all CallVariantsOutput protos in the
        # infile contain a single VariantCall within their constituent Variant
        # proto, and that the call_set_name is identical in each of the
        # records.
        record = tf_utils.get_one_example_from_examples_path(
            ','.join(paths), proto=deepvariant_pb2.CallVariantsOutput)
        if record is None:
            raise ValueError('Cannot find any records in {}'.format(
                ','.join(paths)))

        sample_name = _extract_single_sample_name(record)
        header = dv_vcf_constants.deepvariant_header(
            contigs=contigs, sample_names=[sample_name])
        # The VCF is written while the temporary file still exists, because
        # the variants are read from temp.name.
        with tempfile.NamedTemporaryFile() as temp:
            postprocess_variants_lib.process_single_sites_tfrecords(
                contigs, paths, temp.name)
            independent_variants = _transform_call_variants_output_to_variants(
                input_sorted_tfrecord_path=temp.name,
                qual_filter=FLAGS.qual_filter,
                multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
                sample_name=sample_name)
            variant_generator = haplotypes.maybe_resolve_conflicting_variants(
                independent_variants)
            write_variants_to_vcf(variant_generator=variant_generator,
                                  output_vcf_path=FLAGS.outfile,
                                  header=header)

        # Also write out the gVCF file if it was provided.
        if FLAGS.nonvariant_site_tfrecord_path:
            nonvariant_generator = io_utils.read_shard_sorted_tfrecords(
                FLAGS.nonvariant_site_tfrecord_path,
                key=_get_contig_based_variant_sort_keyfn(contigs),
                proto=variants_pb2.Variant)
            # The VCF written above is read back and merged with the
            # non-variant records to form the gVCF.
            with vcf.VcfReader(FLAGS.outfile) as variant_reader:
                lessthanfn = _get_contig_based_lessthan(contigs)
                gvcf_variants = (_transform_to_gvcf_record(variant)
                                 for variant in variant_reader.iterate())
                merged_variants = merge_variants_and_nonvariants(
                    gvcf_variants, nonvariant_generator, lessthanfn,
                    fasta_reader)
                write_variants_to_vcf(variant_generator=merged_variants,
                                      output_vcf_path=FLAGS.gvcf_outfile,
                                      header=header)
def main(argv=()):
    """Turns CallVariantsOutput records into a VCF and, optionally, a gVCF.

    When no record can be read from FLAGS.infile, an empty VCF is written
    instead of failing.

    Args:
      argv: command-line arguments; more than one element is rejected with a
        CommandLineError.
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            errors.log_and_raise(
                'Command line parsing failure: postprocess_variants does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'.format(str(argv)), errors.CommandLineError)
        del argv  # Unused.

        # The two gVCF flags must be either both set or both unset.
        if (not FLAGS.nonvariant_site_tfrecord_path) != (not FLAGS.gvcf_outfile):
            errors.log_and_raise(
                'gVCF creation requires both nonvariant_site_tfrecord_path and '
                'gvcf_outfile flags to be set.', errors.CommandLineError)

        proto_utils.uses_fast_cpp_protos_or_die()
        logging_level.set_from_flag()

        fasta_reader = fasta.IndexedFastaReader(
            FLAGS.ref, cache_size=_FASTA_CACHE_SIZE)
        contigs = fasta_reader.header.contigs
        paths = sharded_file_utils.maybe_generate_sharded_filenames(FLAGS.infile)
        # Read one CallVariantsOutput record and extract the sample name from
        # it. Note that this assumes that all CallVariantsOutput protos in the
        # infile contain a single VariantCall within their constituent Variant
        # proto, and that the call_set_name is identical in each of the
        # records.
        record = tf_utils.get_one_example_from_examples_path(
            ','.join(paths), proto=deepvariant_pb2.CallVariantsOutput)
        if record is None:
            logging.info('call_variants_output is empty. Writing out empty VCF.')
            sample_name = dv_constants.DEFAULT_SAMPLE_NAME
            if FLAGS.sample_name:
                logging.info(
                    '--sample_name is set in postprocess_variant. Using %s as the '
                    'sample name.', FLAGS.sample_name)
                sample_name = FLAGS.sample_name
            variant_generator = iter([])
        else:
            sample_name = _extract_single_sample_name(record)
            # `temp` is created only on this branch and is closed at the end
            # of the function under the matching `if record:` check.
            temp = tempfile.NamedTemporaryFile()
            start_time = time.time()
            postprocess_variants_lib.process_single_sites_tfrecords(
                contigs, paths, temp.name)
            logging.info('CVO sorting took %s minutes',
                         (time.time() - start_time) / 60)

            logging.info('Transforming call_variants_output to variants.')
            independent_variants = _transform_call_variants_output_to_variants(
                input_sorted_tfrecord_path=temp.name,
                qual_filter=FLAGS.qual_filter,
                multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
                sample_name=sample_name,
                group_variants=FLAGS.group_variants,
                use_multiallelic_model=FLAGS.use_multiallelic_model)
            variant_generator = haplotypes.maybe_resolve_conflicting_variants(
                independent_variants)

        header = dv_vcf_constants.deepvariant_header(
            contigs=contigs, sample_names=[sample_name])
        use_csi = _decide_to_use_csi(contigs)

        start_time = time.time()
        if not FLAGS.nonvariant_site_tfrecord_path:
            # VCF only.
            logging.info('Writing variants to VCF.')
            write_variants_to_vcf(
                variant_iterable=variant_generator,
                output_vcf_path=FLAGS.outfile,
                header=header)
            # Only outputs whose name ends in '.gz' get an index.
            if FLAGS.outfile.endswith('.gz'):
                build_index(FLAGS.outfile, use_csi)
            logging.info('VCF creation took %s minutes',
                         (time.time() - start_time) / 60)
        else:
            # VCF and gVCF are written together through two writers.
            logging.info('Merging and writing variants to VCF and gVCF.')
            lessthanfn = _get_contig_based_lessthan(contigs)
            with vcf.VcfWriter(
                FLAGS.outfile, header=header, round_qualities=True) as vcf_writer, \
                vcf.VcfWriter(
                    FLAGS.gvcf_outfile, header=header, round_qualities=True) \
                as gvcf_writer:
                nonvariant_generator = tfrecord.read_shard_sorted_tfrecords(
                    FLAGS.nonvariant_site_tfrecord_path,
                    key=_get_contig_based_variant_sort_keyfn(contigs),
                    proto=variants_pb2.Variant)
                merge_and_write_variants_and_nonvariants(variant_generator,
                                                         nonvariant_generator,
                                                         lessthanfn, fasta_reader,
                                                         vcf_writer, gvcf_writer)
            # Indexes are built after both writers have been closed.
            if FLAGS.outfile.endswith('.gz'):
                build_index(FLAGS.outfile, use_csi)
            if FLAGS.gvcf_outfile.endswith('.gz'):
                build_index(FLAGS.gvcf_outfile, use_csi)
            logging.info('Finished writing VCF and gVCF in %s minutes.',
                         (time.time() - start_time) / 60)
        if FLAGS.vcf_stats_report:
            # The report is produced by re-reading the VCF written above.
            outfile_base = _get_base_path(FLAGS.outfile)
            with vcf.VcfReader(FLAGS.outfile) as reader:
                vcf_stats.create_vcf_report(
                    variants=reader.iterate(),
                    output_basename=outfile_base,
                    sample_name=sample_name,
                    vcf_reader=reader)
        # NOTE(review): `temp` is not closed if anything above raises --
        # consider try/finally or a context manager.
        if record:
            temp.close()
def setUp(self):
    """Opens the sites VCF and keeps a handle on its field access cache."""
    sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
    self.vcf_reader = vcf.VcfReader(sites_path)
    self.cache = self.vcf_reader.field_access_cache
def test_make_examples_end2end(self, mode, num_shards,
                               test_condition=TestConditions.USE_BAM,
                               labeler_algorithm=None,
                               use_fast_pass_aligner=True):
    """Runs make_examples end to end and checks its outputs against goldens.

    Args:
      mode: 'calling' or 'training'.
      num_shards: number of shards used for the sharded output file names;
        one task is run per shard, and at least one task is always run.
      test_condition: TestConditions value selecting which reads input is used.
      labeler_algorithm: if not None, assigned to FLAGS.labeler_algorithm.
      use_fast_pass_aligner: assigned to FLAGS.use_fast_pass_aligner.
    """
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    if test_condition == TestConditions.USE_BAM:
        FLAGS.reads = testdata.CHR20_BAM
    elif test_condition == TestConditions.USE_CRAM:
        FLAGS.reads = testdata.CHR20_CRAM
    elif test_condition == TestConditions.USE_MULTI_BAMS:
        FLAGS.reads = ','.join(
            [testdata.CHR20_BAM_FIRST_HALF, testdata.CHR20_BAM_SECOND_HALF])

    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    FLAGS.use_fast_pass_aligner = use_fast_pass_aligner
    if labeler_algorithm is not None:
        FLAGS.labeler_algorithm = labeler_algorithm

    # Calling mode writes a gVCF; training mode needs truth data instead.
    if mode == 'calling':
        FLAGS.gvcf = test_utils.test_tmpfile(
            _sharded('gvcf.tfrecord', num_shards))
    else:
        FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
        FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    for task_id in range(max(num_shards, 1)):
        FLAGS.task = task_id
        options = make_examples.default_options(add_flags=True)
        # We need to overwrite bam_fname for USE_CRAM test since Golden Set
        # generated from BAM file. BAM filename is stored in candidates. If we
        # don't overwrite default_options variants won't match and test fail.
        options.bam_fname = 'NA12878_S1.chr20.10_10p1mb.bam'
        make_examples_core.make_examples_runner(options)

        # Check that our run_info proto contains the basic fields we'd expect:
        # (a) our options are written to the run_info.options field.
        run_info = make_examples_core.read_make_examples_run_info(
            options.run_info_filename)
        self.assertEqual(run_info.options, options)
        # (b) run_info.resource_metrics is present and contains our hostname.
        self.assertTrue(run_info.HasField('resource_metrics'))
        self.assertEqual(run_info.resource_metrics.host_name, platform.node())

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        tfrecord.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants([call.variant for call in candidates],
                         region, options, is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
        golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
        golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(tfrecord.read_tfrecords(golden_file)))

    if mode == 'calling':
        # NOTE(review): this reader is never closed -- a `with` block would
        # release it.
        nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
        nist_variants = list(nist_reader.query(region))
        self.verify_nist_concordance(example_variants, nist_variants)

        # Check the quality of our generated gvcf file.
        gvcfs = variant_utils.sorted_variants(
            tfrecord.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
        self.verify_variants(gvcfs, region, options, is_gvcf=True)
        self.verify_contiguity(gvcfs, region)
        gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                    num_shards)
        expected_gvcfs = list(
            tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
        # Despite the name, assertCountEqual checks that all elements match.
        self.assertCountEqual(gvcfs, expected_gvcfs)

    if (mode == 'training' and num_shards == 0 and
            labeler_algorithm != 'positional_labeler'):
        # The positional labeler doesn't track metrics, so don't try to read them
        # in when that's the mode.
        self.assertEqual(
            make_examples_core.read_make_examples_run_info(
                testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO).labeling_metrics,
            run_info.labeling_metrics)