def test_write_vcf_annotated_using_coverage_from_gramtools(self):
    """Check the VCF files written by write_vcf_annotated_using_coverage_from_gramtools.

    Loads the test input VCF and quasimap directory, writes the annotated
    VCF and its ".filter.vcf" companion, and compares each with the
    matching expected file from the test data directory. The two temporary
    output files are removed once both comparisons have passed.
    """
    vcf_file_in = os.path.join(
        data_dir, "write_vcf_annotated_using_coverage_from_gramtools.in.vcf"
    )
    quasimap_dir = os.path.join(
        data_dir, "write_vcf_annotated_using_coverage_from_gramtools.quasimap"
    )
    (
        mean_depth,
        _depth_variance,
        _vcf_header,
        vcf_records,
        allele_coverage,
        allele_groups,
    ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
        vcf_file_in, quasimap_dir
    )

    tmp_outfile = "tmp.gramtools.write_vcf_annotated_using_coverage_from_gramtools.vcf"
    tmp_outfile_filtered = tmp_outfile + ".filter.vcf"
    error_rate = 0.001
    gramtools.write_vcf_annotated_using_coverage_from_gramtools(
        mean_depth,
        vcf_records,
        allele_coverage,
        allele_groups,
        error_rate,
        tmp_outfile,
        sample_name="sample_42",
        max_read_length=200,
        filtered_outfile=tmp_outfile_filtered,
    )

    expected_vcf = os.path.join(
        data_dir, "write_vcf_annotated_using_coverage_from_gramtools.out.vcf"
    )
    expected_vcf_filtered = os.path.join(
        data_dir,
        "write_vcf_annotated_using_coverage_from_gramtools.out.vcf.filter.vcf",
    )

    # Today's date and the version of minos get added to the header, so the
    # matching lines read from the expected file are rewritten before the
    # comparison.
    def fix_header_line(line):
        if line.startswith("##fileDate="):
            return "##fileDate=" + str(datetime.date.today())
        if line.startswith("##source=minos"):
            return "##source=minos, version " + minos_version
        return line

    def check_vcfs(expected_vcf, got_vcf):
        want_header, want_records = vcf_file_read.vcf_file_to_list(expected_vcf)
        got_header, got_records = vcf_file_read.vcf_file_to_list(got_vcf)
        want_header = [fix_header_line(line) for line in want_header]
        self.assertEqual(want_header, got_header)
        self.assertEqual(want_records, got_records)

    for expected, got in (
        (expected_vcf, tmp_outfile),
        (expected_vcf_filtered, tmp_outfile_filtered),
    ):
        check_vcfs(expected, got)

    for written in (tmp_outfile, tmp_outfile_filtered):
        os.unlink(written)
def _run_gramtools_not_split_vcf(self):
    """Run gramtools on the whole clustered VCF and write the output VCFs.

    Steps, in order:
      1. Resolve the gramtools kmer size and run gramtools build/quasimap.
      2. Load the VCF records and allele coverage from the quasimap output.
      3. Write the annotated VCF to ``self.unfiltered_vcf_file`` with
         ``self.final_vcf`` passed as the filtered output file.
      4. Add the GT_CONF percentile to ``self.final_vcf``.
      5. When ``self.clean`` is set, move the gramtools report JSON files
         into ``self.outdir`` and delete the gramtools directories. The
         build directory is left alone if the user supplied it.
    """
    self.gramtools_kmer_size = Adjudicator._get_gramtools_kmer_size(
        self.gramtools_build_dir, self.gramtools_kmer_size
    )
    # The reports are not used here; the two-name unpack is kept as is.
    _build_report, _quasimap_report = gramtools.run_gramtools(
        self.gramtools_build_dir,
        self.gramtools_quasimap_dir,
        self.clustered_vcf,
        self.ref_fasta,
        self.reads_files,
        self.max_read_length,
        kmer_size=self.gramtools_kmer_size,
    )

    logging.info(
        'Loading gramtools quasimap output files ' + self.gramtools_quasimap_dir
    )
    (
        mean_depth,
        depth_variance,
        vcf_header,
        vcf_records,
        allele_coverage,
        allele_groups,
    ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
        self.perl_generated_vcf, self.gramtools_quasimap_dir
    )
    logging.info('Finished loading gramtools files')

    # Fall back to the sample name in the VCF header when none was given.
    sample_name = self.sample_name
    if sample_name is None:
        sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
            vcf_header
        )
    assert sample_name is not None

    logging.info('Writing VCf output file ' + self.final_vcf)
    gramtools.write_vcf_annotated_using_coverage_from_gramtools(
        mean_depth,
        vcf_records,
        allele_coverage,
        allele_groups,
        self.read_error_rate,
        self.unfiltered_vcf_file,
        self.gramtools_kmer_size,
        sample_name=sample_name,
        max_read_length=self.max_read_length,
        filtered_outfile=self.final_vcf,
    )

    logging.info(
        f'Adding GT_CONF_PERCENTLE to final VCF file {self.final_vcf}, '
        f'using mean depth {mean_depth}, depth variance {depth_variance}, '
        f'error rate {self.read_error_rate}, '
        f'and {self.genotype_simulation_iterations} simulation iterations'
    )
    Adjudicator._add_gt_conf_percentile_to_vcf_file(
        self.final_vcf,
        mean_depth,
        depth_variance,
        self.read_error_rate,
        self.genotype_simulation_iterations,
    )

    if not self.clean:
        return

    # Keep the quasimap report, then drop the rest of the quasimap output.
    quasimap_report_src = os.path.join(self.gramtools_quasimap_dir, 'report.json')
    quasimap_report_dst = os.path.join(
        self.outdir, 'gramtools.quasimap.report.json'
    )
    os.rename(quasimap_report_src, quasimap_report_dst)
    shutil.rmtree(self.gramtools_quasimap_dir)

    # A build directory supplied by the user is never touched.
    if self.user_supplied_gramtools_build_dir:
        return

    build_report_src = os.path.join(self.gramtools_build_dir, 'build_report.json')
    build_report_dst = os.path.join(self.outdir, 'gramtools.build.report.json')
    os.rename(build_report_src, build_report_dst)
    shutil.rmtree(self.gramtools_build_dir)
def run_adjudicate(self, build_dir, quasimap_dir, vcf, reads_files, final_vcf, debug_vcf):
    """Run gramtools on ``vcf`` and write the annotated output VCFs.

    Runs gramtools with the given build and quasimap directories, loads
    ``build.vcf`` from ``build_dir`` together with the quasimap output, and
    stores the loaded mean depth and depth variance on ``self.mean_depth``
    and ``self.variance_depth``. When ``self.clean`` is set, the report
    JSON files are moved to ``self.gramtools_quasimap_json`` and
    ``self.gramtools_build_json`` and the gramtools directories are
    deleted (the build directory only if it was not user-supplied).
    Finally the annotated VCF is written to ``debug_vcf`` with
    ``final_vcf`` passed as the filtered output file.
    """
    # The reports are not used here; the two-name unpack is kept as is.
    _build_report, _quasimap_report = gramtools.run_gramtools(
        build_dir,
        quasimap_dir,
        vcf,
        self.ref_fasta,
        reads_files,
        kmer_size=self.gramtools_kmer_size,
    )

    build_vcf = os.path.join(build_dir, "build.vcf")
    logging.info("Loading gramtools quasimap output files " + quasimap_dir)
    (
        self.mean_depth,
        self.variance_depth,
        _vcf_header,
        vcf_records,
        allele_coverage,
        allele_groups,
    ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
        build_vcf, quasimap_dir
    )
    logging.info("Finished loading gramtools files")

    # Everything needed from the gramtools directories is in memory by
    # now, so they can be removed before the VCF is written.
    if self.clean:
        quasimap_report_json = os.path.join(
            quasimap_dir, "quasimap_outputs", "quasimap_report.json"
        )
        os.rename(quasimap_report_json, self.gramtools_quasimap_json)
        shutil.rmtree(quasimap_dir)

        if not self.user_supplied_gramtools_build_dir:
            build_report_json = os.path.join(build_dir, "build_report.json")
            os.rename(build_report_json, self.gramtools_build_json)
            shutil.rmtree(build_dir)

    gramtools.write_vcf_annotated_using_coverage_from_gramtools(
        self.mean_depth,
        self.variance_depth,
        vcf_records,
        allele_coverage,
        allele_groups,
        self.read_error_rate,
        debug_vcf,
        sample_name=self.sample_name,
        filtered_outfile=final_vcf,
        ref_seq_lengths=self.ref_seq_lengths,
        call_hets=self.call_hets,
    )
def run_adjudicate(self, build_dir, quasimap_dir, vcf, reads_files, final_vcf, debug_vcf):
    """Run gramtools on ``vcf`` and write the annotated output VCFs.

    Runs gramtools with the given build and quasimap directories, loads
    ``build.vcf`` from ``build_dir`` together with the quasimap output, and
    appends the loaded mean depth and depth variance to the class-level
    lists ``Adjudicator.mean_depths`` and ``Adjudicator.variance_depths``.

    When ``self.clean`` is set, each gramtools report JSON is moved next to
    its directory (``<dir>.report.json``) and the directory is deleted. The
    build directory is left alone if the user supplied it.

    Finally the annotated VCF is written to ``debug_vcf`` with
    ``final_vcf`` passed as the filtered output file. If
    ``self.sample_name`` is None, the sample name is taken from the loaded
    VCF header.
    """
    # The reports are not used here; the two-name unpack is kept as is.
    build_report, quasimap_report = gramtools.run_gramtools(
        build_dir,
        quasimap_dir,
        vcf,
        self.ref_fasta,
        reads_files,
        self.max_read_length,
        kmer_size=self.gramtools_kmer_size,
    )

    build_vcf = os.path.join(build_dir, "build.vcf")
    logging.info("Loading gramtools quasimap output files " + quasimap_dir)
    (
        mean_depth,
        variance_depth,
        vcf_header,
        vcf_records,
        allele_coverage,
        allele_groups,
    ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
        build_vcf, quasimap_dir
    )
    Adjudicator.mean_depths.append(mean_depth)
    Adjudicator.variance_depths.append(variance_depth)
    logging.info("Finished loading gramtools files")

    if self.clean:
        os.rename(
            os.path.join(quasimap_dir, "quasimap_outputs", "quasimap_report.json"),
            quasimap_dir + ".report.json",
        )
        shutil.rmtree(quasimap_dir)

        if not self.user_supplied_gramtools_build_dir:
            # The report must be moved OUT of build_dir before the
            # directory is removed. Renaming it to a path inside build_dir
            # would delete it together with the directory.
            os.rename(
                os.path.join(build_dir, "build_report.json"),
                build_dir + ".report.json",
            )
            shutil.rmtree(build_dir)

    if self.sample_name is None:
        sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
            vcf_header
        )
    else:
        sample_name = self.sample_name

    gramtools.write_vcf_annotated_using_coverage_from_gramtools(
        mean_depth,
        vcf_records,
        allele_coverage,
        allele_groups,
        self.read_error_rate,
        debug_vcf,
        sample_name=sample_name,
        max_read_length=self.max_read_length,
        filtered_outfile=final_vcf,
    )
def _run_gramtools_with_split_vcf(self):
    """Run gramtools separately on each chunk of the clustered VCF.

    The clustered VCF is split with ``vcf_chunker.VcfChunker``. For every
    split file, the reads for that region (plus the unmapped reads, both
    taken from the first reads file) are run through gramtools and an
    annotated VCF is written for the split. The per-split VCFs are then
    merged into ``self.final_vcf`` and ``self.unfiltered_vcf_file``, and
    the GT_CONF percentile is added to the final VCF using the mean of the
    per-split mean depths and the mean of the per-split depth variances.

    When ``self.clean`` is set, per-split gramtools directories and
    temporary files are removed as the run goes along, keeping each report
    JSON as ``<dir>.report.json``.

    Raises:
        Error: if the split output directory cannot be created.
    """
    logging.info('Splitting VCF files into chunks (if not already done)')
    chunker = vcf_chunker.VcfChunker(
        self.split_input_dir,
        vcf_infile=self.clustered_vcf,
        ref_fasta=self.ref_fasta,
        variants_per_split=self.variants_per_split,
        alleles_per_split=self.alleles_per_split,
        max_read_length=self.max_read_length,
        total_splits=self.total_splits,
        flank_length=self.max_read_length,
        gramtools_kmer_size=self.gramtools_kmer_size,
    )
    chunker.make_split_files()
    self.gramtools_kmer_size = chunker.gramtools_kmer_size
    logging.info(
        'VCF file split into ' + str(chunker.total_split_files) + ' chunks'
    )

    # Only filesystem failures are converted to Error. A bare "except"
    # would also trap KeyboardInterrupt/SystemExit and hide the cause.
    try:
        os.mkdir(self.split_output_dir)
    except OSError as error:
        raise Error(
            'Error making output split directory ' + self.split_output_dir
        ) from error

    unmapped_reads_file = os.path.join(
        self.split_output_dir, 'unmapped_reads.bam'
    )
    bam_read_extract.get_unmapped_reads(self.reads_files[0], unmapped_reads_file)

    split_vcf_outfiles = {}
    split_vcf_outfiles_unfiltered = {}
    mean_depths = []
    depth_variances = []

    for ref_name, split_file_list in chunker.vcf_split_files.items():
        split_vcf_outfiles[ref_name] = []
        split_vcf_outfiles_unfiltered[ref_name] = []

        for split_file in split_file_list:
            logging.info(
                '===== Start analysing variants in VCF split file '
                + split_file.filename
                + ' ====='
            )
            split_reads_file = os.path.join(
                self.split_output_dir,
                'split.' + str(split_file.file_number) + '.reads.bam',
            )
            bam_read_extract.get_region(
                self.reads_files[0],
                split_file.chrom,
                split_file.chrom_start,
                split_file.chrom_end,
                split_reads_file,
            )

            gramtools_quasimap_dir = os.path.join(
                self.split_output_dir,
                'split.' + str(split_file.file_number) + '.gramtools.quasimap',
            )
            build_report, quasimap_report = gramtools.run_gramtools(
                split_file.gramtools_build_dir,
                gramtools_quasimap_dir,
                split_file.filename,
                self.ref_fasta,
                [unmapped_reads_file, split_reads_file],
                self.max_read_length,
                kmer_size=self.gramtools_kmer_size,
            )

            logging.info(
                'Loading split gramtools quasimap output files '
                + gramtools_quasimap_dir
            )
            perl_generated_vcf = os.path.join(
                split_file.gramtools_build_dir, 'perl_generated_vcf'
            )
            (
                mean_depth,
                depth_variance,
                vcf_header,
                vcf_records,
                allele_coverage,
                allele_groups,
            ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                perl_generated_vcf, gramtools_quasimap_dir
            )
            mean_depths.append(mean_depth)
            depth_variances.append(depth_variance)
            logging.info('Finished loading gramtools files')

            if self.sample_name is None:
                sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
                    vcf_header
                )
            else:
                sample_name = self.sample_name
            assert sample_name is not None

            split_vcf_out = os.path.join(
                self.split_output_dir,
                'split.' + str(split_file.file_number) + '.out.vcf',
            )
            unfiltered_vcf_out = os.path.join(
                self.split_output_dir,
                'split.'
                + str(split_file.file_number)
                + '.out.debug.calls_with_zero_cov_alleles.vcf',
            )
            logging.info(
                'Writing VCf output file '
                + split_vcf_out
                + ' for split VCF file '
                + split_file.filename
            )
            gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                mean_depth,
                vcf_records,
                allele_coverage,
                allele_groups,
                self.read_error_rate,
                unfiltered_vcf_out,
                self.gramtools_kmer_size,
                sample_name=sample_name,
                max_read_length=self.max_read_length,
                filtered_outfile=split_vcf_out,
            )
            split_vcf_outfiles[ref_name].append(split_vcf_out)
            split_vcf_outfiles_unfiltered[ref_name].append(unfiltered_vcf_out)

            if self.clean:
                logging.info(
                    'Cleaning gramtools files from split VCF file '
                    + split_file.filename
                )
                # Split input files and build directories are only removed
                # when they were not supplied by the user.
                if not self.user_supplied_gramtools_build_dir:
                    os.rename(
                        os.path.join(
                            split_file.gramtools_build_dir, 'build_report.json'
                        ),
                        split_file.gramtools_build_dir + '.report.json',
                    )
                    shutil.rmtree(split_file.gramtools_build_dir)
                    os.unlink(split_file.filename)

                os.rename(
                    os.path.join(gramtools_quasimap_dir, 'report.json'),
                    gramtools_quasimap_dir + '.report.json',
                )
                shutil.rmtree(gramtools_quasimap_dir)
                os.unlink(split_reads_file)

            logging.info(
                '===== Finish analysing variants in VCF split file '
                + split_file.filename
                + ' ====='
            )

    logging.info('Merging VCF files into one output file ' + self.final_vcf)
    chunker.merge_files(split_vcf_outfiles, self.final_vcf)
    chunker.merge_files(split_vcf_outfiles_unfiltered, self.unfiltered_vcf_file)

    # Both values are plain averages over the per-split results.
    mean_depth = statistics.mean(mean_depths)
    depth_variance = statistics.mean(depth_variances)
    logging.info(
        f'Adding GT_CONF_PERCENTLE to final VCF file {self.final_vcf}, using mean depth {mean_depth}, depth variance {depth_variance}, error rate {self.read_error_rate}, and {self.genotype_simulation_iterations} simulation iterations'
    )
    Adjudicator._add_gt_conf_percentile_to_vcf_file(
        self.final_vcf,
        mean_depth,
        depth_variance,
        self.read_error_rate,
        self.genotype_simulation_iterations,
    )

    if self.clean:
        logging.info('Deleting temp split VCF files')
        for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
            for file_list in d.values():
                for filename in file_list:
                    os.unlink(filename)
        os.unlink(unmapped_reads_file)
def _run_gramtools_with_split_vcf(self):
    """Run gramtools on each chunk of the clustered VCF, then genotype.

    Works in two passes over the split files made by
    ``vcf_chunker.VcfChunker``:

      1. ``self._run_quasimap_one_split`` is called on every split. The
         read coverage values it returns are pooled, and the build and
         quasimap reports are collected per split file number and written
         to ``self.gramtools_quasimap_json`` and (unless the build
         directory was user-supplied) ``self.gramtools_build_json``.
         ``self.mean_depth`` and ``self.variance_depth`` are then set from
         the pooled coverage, rounded to 3 decimal places.
      2. Each split's gramtools output is loaded again and an annotated
         VCF is written for it using the global mean depth and variance.

    The per-split VCFs are merged into ``self.final_vcf`` and
    ``self.unfiltered_vcf_file``, then ``self.run_gt_conf()`` is called.
    When ``self.clean`` is set, the quasimap directories and temporary
    split files are deleted.

    Raises:
        Exception: if the split output directory cannot be created.
        statistics.StatisticsError: from ``statistics.mean`` /
            ``statistics.variance`` when too few coverage values were
            collected (``variance`` needs at least two).
    """
    logging.info("Splitting VCF files into chunks (if not already done)")
    chunker = vcf_chunker.VcfChunker(
        self.split_input_dir,
        vcf_infile=self.clustered_vcf,
        ref_fasta=self.ref_fasta,
        variants_per_split=self.variants_per_split,
        alleles_per_split=self.alleles_per_split,
        total_splits=self.total_splits,
        gramtools_kmer_size=self.gramtools_kmer_size,
    )
    chunker.make_split_files()
    self.gramtools_kmer_size = chunker.gramtools_kmer_size
    logging.info(
        "VCF file split into " + str(chunker.total_split_files) + " chunks"
    )

    # Only filesystem failures are converted. A bare "except" would also
    # trap KeyboardInterrupt/SystemExit and hide the underlying cause.
    try:
        os.mkdir(self.split_output_dir)
    except OSError as error:
        raise Exception(
            "Error making output split directory " + self.split_output_dir
        ) from error

    if self.use_unmapped_reads:
        unmapped_reads_file = os.path.join(
            self.split_output_dir, "unmapped_reads.bam"
        )
        bam_read_extract.get_unmapped_reads(
            self.reads_files[0], unmapped_reads_file
        )
    else:
        unmapped_reads_file = None

    read_coverage = []
    build_reports = {}
    quasimap_reports = {}

    # Run gramtools quasimap on each split. Get back the read depth
    # from each split, which we need to get the global read depth and
    # variance, to then use for genotyping
    for split_file_list in chunker.vcf_split_files.values():
        for split_file in split_file_list:
            read_cov, build_report, quasimap_report = self._run_quasimap_one_split(
                split_file, unmapped_reads_file
            )
            read_coverage.extend(read_cov)
            build_reports[split_file.file_number] = build_report
            quasimap_reports[split_file.file_number] = quasimap_report

    with open(self.gramtools_quasimap_json, "w") as f:
        json.dump(quasimap_reports, f, indent=2, sort_keys=True)
    if not self.user_supplied_gramtools_build_dir:
        with open(self.gramtools_build_json, "w") as f:
            json.dump(build_reports, f, indent=2, sort_keys=True)

    self.mean_depth = round(statistics.mean(read_coverage), 3)
    self.variance_depth = round(statistics.variance(read_coverage), 3)

    # Can now genotype each split VCF, using the global mean depth and variance
    split_vcf_outfiles = {}
    split_vcf_outfiles_unfiltered = {}
    for ref_name, split_file_list in chunker.vcf_split_files.items():
        split_vcf_outfiles[ref_name] = []
        split_vcf_outfiles_unfiltered[ref_name] = []

        for split_file in split_file_list:
            build_vcf = os.path.join(split_file.gramtools_build_dir, "build.vcf")
            quasimap_dir = os.path.join(
                self.split_output_dir,
                f"split.{split_file.file_number}.gramtools.quasimap",
            )
            logging.info("Loading gramtools quasimap output files " + quasimap_dir)
            (
                _,  # mean depth for this split, which we don't want
                _,  # depth variance for this split, which we don't want
                vcf_header,
                vcf_records,
                allele_coverage,
                allele_groups,
            ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                build_vcf, quasimap_dir
            )
            logging.info("Finished loading gramtools files")

            # The quasimap output is in memory now, so it can be removed.
            if self.clean:
                shutil.rmtree(quasimap_dir)

            vcf_prefix = os.path.join(
                self.split_output_dir,
                f"split.{split_file.file_number}.out",
            )
            split_vcf_out = f"{vcf_prefix}.vcf"
            unfiltered_vcf_out = f"{vcf_prefix}.debug.calls_with_zero_cov_alleles.vcf"
            gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                self.mean_depth,
                self.variance_depth,
                vcf_records,
                allele_coverage,
                allele_groups,
                self.read_error_rate,
                unfiltered_vcf_out,
                sample_name=self.sample_name,
                filtered_outfile=split_vcf_out,
                ref_seq_lengths=self.ref_seq_lengths,
                call_hets=self.call_hets,
            )
            split_vcf_outfiles[ref_name].append(split_vcf_out)
            split_vcf_outfiles_unfiltered[ref_name].append(unfiltered_vcf_out)

    # We now have minos run on each split VCF. Merge into one VCF, then can
    # add gt conf and gcp to the merged VCF.
    logging.info("Merging VCF files into one output file " + self.final_vcf)
    chunker.merge_files(split_vcf_outfiles, self.final_vcf)
    chunker.merge_files(split_vcf_outfiles_unfiltered, self.unfiltered_vcf_file)
    self.run_gt_conf()

    if self.clean:
        logging.info("Deleting temp split VCF files")
        for d in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
            for file_list in d.values():
                for filename in file_list:
                    os.unlink(filename)
        if self.use_unmapped_reads:
            os.unlink(unmapped_reads_file)