def test_vcf_query(self):
    """Range queries on the freshly indexed output match those on the input."""
    # The output file must be indexed before it can be queried below.
    tabix.build_index(self.output_file)
    self.input_reader = vcf.VcfReader(self.input_file)
    self.output_reader = vcf.VcfReader(self.output_file)

    region = ranges.parse_literal('chr3:100,000-500,000')
    # Materialize both iterables so they can be compared element-wise.
    input_records = list(self.input_reader.query(region))
    output_records = list(self.output_reader.query(region))
    self.assertEqual(input_records, output_records)
def build_index(vcf_file, csi=False):
    """Index a VCF file through the tabix helpers.

    Args:
      vcf_file: string. Path to the VCF file to be indexed.
      csi: bool. When true, a CSI index is built (with min_shift=14);
        otherwise the default tabix index is built.
    """
    if not csi:
        tabix.build_index(vcf_file)
        return
    tabix.build_csi_index(vcf_file, min_shift=14)
def test_build_index(self):
    """The index file is absent before build_index and present afterwards."""
    existed_before = gfile.Exists(self.tbx_index_file)
    self.assertFalse(existed_before)

    tabix.build_index(self.output_file)

    exists_after = gfile.Exists(self.tbx_index_file)
    self.assertTrue(exists_after)
def main(argv=()):
    """Turn call_variants output into a VCF and, optionally, a gVCF.

    Validates the command line and flags, reads the CallVariantsOutput
    records named by FLAGS.infile, and writes variants to FLAGS.outfile.
    When FLAGS.nonvariant_site_tfrecord_path is set, variant and non-variant
    records are merged and FLAGS.gvcf_outfile is written as well. Any output
    whose path ends in '.gz' is indexed with tabix.build_index.

    Args:
      argv: command-line arguments. More than one element is reported as a
        command-line error, since no positional arguments are accepted.

    Raises:
      ValueError: if no record can be read from the input paths.
      errors.CommandLineError: presumably raised by errors.log_and_raise for
        stray positional arguments or inconsistent gVCF flags (the helper is
        defined elsewhere; confirm).
    """
    with errors.clean_commandline_error_exit():
        if len(argv) > 1:
            errors.log_and_raise(
                'Command line parsing failure: postprocess_variants does not accept '
                'positional arguments but some are present on the command line: '
                '"{}".'.format(str(argv)), errors.CommandLineError)
        del argv  # Unused.

        # The two gVCF flags must be both set or both unset; this comparison
        # is true exactly when only one of them is set.
        if (not FLAGS.nonvariant_site_tfrecord_path) != (
                not FLAGS.gvcf_outfile):
            errors.log_and_raise(
                'gVCF creation requires both nonvariant_site_tfrecord_path and '
                'gvcf_outfile flags to be set.', errors.CommandLineError)

        proto_utils.uses_fast_cpp_protos_or_die()

        logging_level.set_from_flag()

        # Contigs from the reference header are used for the VCF header and
        # for the contig-based sort/compare helpers below.
        fasta_reader = fasta.IndexedFastaReader(FLAGS.ref,
                                                cache_size=_FASTA_CACHE_SIZE)
        contigs = fasta_reader.header.contigs
        paths = sharded_file_utils.maybe_generate_sharded_filenames(
            FLAGS.infile)
        # Read one CallVariantsOutput record and extract the sample name from it.
        # Note that this assumes that all CallVariantsOutput protos in the infile
        # contain a single VariantCall within their constituent Variant proto, and
        # that the call_set_name is identical in each of the records.
        record = tf_utils.get_one_example_from_examples_path(
            ','.join(paths), proto=deepvariant_pb2.CallVariantsOutput)
        if record is None:
            raise ValueError('Cannot find any records in {}'.format(
                ','.join(paths)))

        sample_name = _extract_single_sample_name(record)
        header = dv_vcf_constants.deepvariant_header(
            contigs=contigs, sample_names=[sample_name])
        # temp.name is first the output path of the sorting step and then the
        # input path of the transform step, so everything that consumes it
        # stays inside this block; the file is removed when the block exits.
        with tempfile.NamedTemporaryFile() as temp:
            start_time = time.time()
            postprocess_variants_lib.process_single_sites_tfrecords(
                contigs, paths, temp.name)
            logging.info('CVO sorting took %s minutes',
                         (time.time() - start_time) / 60)

            logging.info('Transforming call_variants_output to variants.')
            # NOTE(review): this start_time is overwritten below before any
            # log statement reads it, so the transform step is not timed on
            # its own -- presumably because the step is lazy; confirm.
            start_time = time.time()
            independent_variants = _transform_call_variants_output_to_variants(
                input_sorted_tfrecord_path=temp.name,
                qual_filter=FLAGS.qual_filter,
                multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
                sample_name=sample_name)
            variant_generator = haplotypes.maybe_resolve_conflicting_variants(
                independent_variants)

            # Timer for whichever writing branch runs next.
            start_time = time.time()
            if not FLAGS.nonvariant_site_tfrecord_path:
                # VCF-only path: no non-variant sites to merge.
                logging.info('Writing variants to VCF.')
                write_variants_to_vcf(variant_iterable=variant_generator,
                                      output_vcf_path=FLAGS.outfile,
                                      header=header)
                # Only outputs with a '.gz' suffix are indexed.
                if FLAGS.outfile.endswith('.gz'):
                    tabix.build_index(FLAGS.outfile)
                logging.info('VCF creation took %s minutes',
                             (time.time() - start_time) / 60)
            else:
                # gVCF path: merge variant and non-variant records and write
                # both output files through their own writers.
                logging.info('Merging and writing variants to VCF and gVCF.')
                lessthanfn = _get_contig_based_lessthan(contigs)
                with vcf.VcfWriter(
                    FLAGS.outfile, header=header, round_qualities=True) as vcf_writer, \
                    vcf.VcfWriter(
                        FLAGS.gvcf_outfile, header=header, round_qualities=True) \
                    as gvcf_writer:
                    nonvariant_generator = tfrecord.read_shard_sorted_tfrecords(
                        FLAGS.nonvariant_site_tfrecord_path,
                        key=_get_contig_based_variant_sort_keyfn(contigs),
                        proto=variants_pb2.Variant)
                    merge_and_write_variants_and_nonvariants(
                        variant_generator, nonvariant_generator, lessthanfn,
                        fasta_reader, vcf_writer, gvcf_writer)
                # Index each '.gz' output once the writers' block has exited.
                if FLAGS.outfile.endswith('.gz'):
                    tabix.build_index(FLAGS.outfile)
                if FLAGS.gvcf_outfile.endswith('.gz'):
                    tabix.build_index(FLAGS.gvcf_outfile)
                logging.info('Finished writing VCF and gVCF in %s minutes.',
                             (time.time() - start_time) / 60)