def _nextflow_helper_process_input_vcf_file(cls, infile, out_small_vars, out_big_vars, out_sample_name, min_large_ref_length):
    """Split one input VCF and record its sample name.

    Runs vcf_file_split_deletions.VcfFileSplitDeletions on ``infile`` with
    the output paths ``out_small_vars`` and ``out_big_vars`` and the given
    ``min_large_ref_length``, then writes the sample name found in the VCF
    header to the file ``out_sample_name`` (one line).

    Returns the integer value of the ``##minos_max_read_length=`` header
    line, or None if the header has no such line. If the line occurs more
    than once, the last occurrence wins.

    Raises AssertionError if no sample name is found in the header.
    """
    splitter = vcf_file_split_deletions.VcfFileSplitDeletions(
        infile,
        out_small_vars,
        out_big_vars,
        min_large_ref_length=min_large_ref_length,
    )
    splitter.run()

    header_lines = vcf_file_read.get_header_lines_from_vcf_file(infile)
    sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(header_lines)
    assert sample_name is not None

    max_read_length = None
    for line in header_lines:
        if line.startswith("##minos_max_read_length="):
            max_read_length = int(line.rstrip().split("=")[1])

    # Reuse the sample name validated above rather than reading and parsing
    # the header of infile a second time while the output file is open.
    with open(out_sample_name, "w") as f:
        print(sample_name, file=f)

    return max_read_length
def run(self):
    '''Evaluate the query VCF against the truth VCF.

    Steps, in order: optionally filter and cluster both input VCFs; load the
    truth and query VCFs to check; write variant-plus-flank sequences for
    each to FASTA; run _map_seqs_to_seqs on the query and truth FASTA files;
    index the query VCF; run _parse_sam_files to produce self.sam_summary;
    gather stats and write the stats table (self.stats_out) and the GT_CONF
    histogram (self.gt_conf_hist_out).

    Side effect: ".gz" is appended to self.vcf_to_check_query after indexing.
    Intermediate FASTA/VCF files are not removed by this method.
    '''
    # Cluster together variants in each vcf
    if self.filter_and_cluster_vcf:
        EvaluateRecall._filter_vcf_for_clustering(
            self.truth_vcf_file, self.filtered_truth_vcf, self.discard_ref_calls)
        EvaluateRecall._filter_vcf_for_clustering(
            self.query_vcf_file, self.filtered_query_vcf, self.discard_ref_calls)

        # Both VCFs are always clustered with the same merge method.
        merge_method = 'simple' if self.discard_ref_calls else 'gt_aware'
        clusterer_query = vcf_clusterer.VcfClusterer(
            [self.filtered_query_vcf],
            self.query_vcf_ref,
            self.clustered_vcf_query,
            merge_method=merge_method,
            max_distance_between_variants=self.merge_length)
        clusterer_truth = vcf_clusterer.VcfClusterer(
            [self.filtered_truth_vcf],
            self.truth_vcf_ref,
            self.clustered_vcf_truth,
            merge_method=merge_method,
            max_distance_between_variants=self.merge_length)
        clusterer_query.run()
        clusterer_truth.run()

    _, vcf_records_truth = vcf_file_read.vcf_file_to_dict(
        self.vcf_to_check_truth, sort=True, remove_useless_start_nucleotides=True)
    query_header, vcf_records_query = vcf_file_read.vcf_file_to_dict(
        self.vcf_to_check_query, sort=True, remove_useless_start_nucleotides=True)

    # The sample name (taken from the query VCF header) is not used further
    # in this method; the lookup is kept so header parsing still happens.
    sample_from_header = vcf_file_read.get_sample_name_from_vcf_header_lines(query_header)
    if sample_from_header is None:
        sample_from_header = 'sample'

    truth_vcf_ref_seqs = {}
    pyfastaq.tasks.file_to_dict(self.truth_vcf_ref, truth_vcf_ref_seqs)
    query_vcf_ref_seqs = {}
    pyfastaq.tasks.file_to_dict(self.query_vcf_ref, query_vcf_ref_seqs)

    EvaluateRecall._write_vars_plus_flanks_to_fasta(
        self.seqs_out_truth,
        vcf_records_truth,
        truth_vcf_ref_seqs,
        self.flank_length,
        ref_only=True)
    EvaluateRecall._write_vars_plus_flanks_to_fasta(
        self.seqs_out_query,
        vcf_records_query,
        query_vcf_ref_seqs,
        self.flank_length,
        number_ns=self.number_ns)
    EvaluateRecall._map_seqs_to_seqs(
        self.seqs_out_query, self.seqs_out_truth, self.sam_file_out)

    # From here on the query VCF is referred to by its ".gz" name.
    EvaluateRecall._index_vcf(self.vcf_to_check_query)
    self.vcf_to_check_query += ".gz"

    EvaluateRecall._parse_sam_files(
        self.vcf_to_check_truth,
        self.sam_file_out,
        self.vcf_to_check_query,
        self.sam_summary,
        self.flank_length,
        allow_mismatches=self.allow_flank_mismatches,
        exclude_regions=self.exclude_regions,
        max_soft_clipped=self.max_soft_clipped,
        number_ns=self.number_ns)
    stats, gt_conf_hist = EvaluateRecall._gather_stats(self.sam_summary)

    # write stats file: one header row, one value row
    with open(self.stats_out, 'w') as f:
        columns = list(stats.keys())
        print(*columns, sep='\t', file=f)
        print(*(stats[column] for column in columns), sep='\t', file=f)

    # write GT_CONF histogram file, sorted by GT_CONF
    with open(self.gt_conf_hist_out, 'w') as f:
        print('GT_CONF\tCount', file=f)
        for gt_conf, count in sorted(gt_conf_hist.items()):
            print(gt_conf, count, sep='\t', file=f)
def test_get_sample_name_from_vcf_header_lines(self):
    """get_sample_name_from_vcf_header_lines returns the first sample column, or None"""
    get_name = vcf_file_read.get_sample_name_from_vcf_header_lines
    preamble = ["foo", "bar"]

    # No #CHROM line at all
    self.assertEqual(None, get_name(preamble))

    # Malformed #CHROM line
    with self.assertRaises(Exception):
        get_name(preamble + ["#CHROM\twrong!"])

    # (final header line, expected sample name)
    cases = [
        ("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", None),
        ("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", None),
        ("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_name", "sample_name"),
        ("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_name\tsample_name_2", "sample_name"),
    ]
    for chrom_line, expected in cases:
        self.assertEqual(expected, get_name(preamble + [chrom_line]))
def _run_gramtools_not_split_vcf(self):
    '''Run gramtools once on the whole clustered VCF and write the final VCF.

    Updates self.gramtools_kmer_size, runs gramtools, loads its VCF and
    allele coverage output, writes the annotated VCF (unfiltered to
    self.unfiltered_vcf_file, filtered to self.final_vcf) and then adds the
    GT_CONF percentile to self.final_vcf.

    When self.clean is set, the quasimap report is moved into self.outdir
    and the quasimap directory removed; the build directory gets the same
    treatment unless it was supplied by the user.

    Raises AssertionError if no sample name is available.
    '''
    self.gramtools_kmer_size = Adjudicator._get_gramtools_kmer_size(
        self.gramtools_build_dir, self.gramtools_kmer_size)

    # The two reports returned by run_gramtools are not used here.
    _build_report, _quasimap_report = gramtools.run_gramtools(
        self.gramtools_build_dir,
        self.gramtools_quasimap_dir,
        self.clustered_vcf,
        self.ref_fasta,
        self.reads_files,
        self.max_read_length,
        kmer_size=self.gramtools_kmer_size,
    )

    logging.info('Loading gramtools quasimap output files ' + self.gramtools_quasimap_dir)
    loaded = gramtools.load_gramtools_vcf_and_allele_coverage_files(
        self.perl_generated_vcf, self.gramtools_quasimap_dir)
    mean_depth, depth_variance, vcf_header, vcf_records, allele_coverage, allele_groups = loaded
    logging.info('Finished loading gramtools files')

    # An explicitly configured sample name wins over the one in the header.
    sample_name = (
        self.sample_name
        if self.sample_name is not None
        else vcf_file_read.get_sample_name_from_vcf_header_lines(vcf_header)
    )
    assert sample_name is not None

    logging.info('Writing VCf output file ' + self.final_vcf)
    gramtools.write_vcf_annotated_using_coverage_from_gramtools(
        mean_depth,
        vcf_records,
        allele_coverage,
        allele_groups,
        self.read_error_rate,
        self.unfiltered_vcf_file,
        self.gramtools_kmer_size,
        sample_name=sample_name,
        max_read_length=self.max_read_length,
        filtered_outfile=self.final_vcf)

    logging.info(
        f'Adding GT_CONF_PERCENTLE to final VCF file {self.final_vcf}, using mean depth {mean_depth}, depth variance {depth_variance}, error rate {self.read_error_rate}, and {self.genotype_simulation_iterations} simulation iterations'
    )
    Adjudicator._add_gt_conf_percentile_to_vcf_file(
        self.final_vcf,
        mean_depth,
        depth_variance,
        self.read_error_rate,
        self.genotype_simulation_iterations)

    if not self.clean:
        return

    # Keep each JSON report in outdir before deleting its directory.
    quasimap_report_src = os.path.join(self.gramtools_quasimap_dir, 'report.json')
    quasimap_report_dest = os.path.join(self.outdir, 'gramtools.quasimap.report.json')
    os.rename(quasimap_report_src, quasimap_report_dest)
    shutil.rmtree(self.gramtools_quasimap_dir)

    if not self.user_supplied_gramtools_build_dir:
        build_report_src = os.path.join(self.gramtools_build_dir, 'build_report.json')
        build_report_dest = os.path.join(self.outdir, 'gramtools.build.report.json')
        os.rename(build_report_src, build_report_dest)
        shutil.rmtree(self.gramtools_build_dir)
def run_adjudicate(self, build_dir, quasimap_dir, vcf, reads_files, final_vcf, debug_vcf):
    """Run gramtools on one VCF and write the annotated output VCFs.

    Parameters:
        build_dir: gramtools build directory; ``build.vcf`` is read from it.
        quasimap_dir: gramtools quasimap output directory.
        vcf: VCF file passed to gramtools.
        reads_files: reads files passed to gramtools.
        final_vcf: path of the filtered output VCF.
        debug_vcf: path of the unfiltered output VCF.

    The mean depth and depth variance loaded from the gramtools output are
    appended to Adjudicator.mean_depths and Adjudicator.variance_depths.

    When self.clean is set, the quasimap report is moved to
    ``quasimap_dir + ".report.json"`` and quasimap_dir is removed. Unless the
    build directory was supplied by the user, the build report is moved to
    ``build_dir + ".report.json"`` and build_dir is removed as well.
    """
    # The reports returned by run_gramtools are not needed here.
    gramtools.run_gramtools(
        build_dir,
        quasimap_dir,
        vcf,
        self.ref_fasta,
        reads_files,
        self.max_read_length,
        kmer_size=self.gramtools_kmer_size,
    )

    build_vcf = os.path.join(build_dir, "build.vcf")
    logging.info("Loading gramtools quasimap output files " + quasimap_dir)
    (
        mean_depth,
        variance_depth,
        vcf_header,
        vcf_records,
        allele_coverage,
        allele_groups,
    ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(build_vcf, quasimap_dir)
    Adjudicator.mean_depths.append(mean_depth)
    Adjudicator.variance_depths.append(variance_depth)
    logging.info("Finished loading gramtools files")

    # Everything needed from the gramtools directories is in memory by now,
    # so they can be removed before the output VCFs are written.
    if self.clean:
        os.rename(
            os.path.join(quasimap_dir, "quasimap_outputs", "quasimap_report.json"),
            quasimap_dir + ".report.json",
        )
        shutil.rmtree(quasimap_dir)

        if not self.user_supplied_gramtools_build_dir:
            # Move the report next to (not inside) build_dir: the directory
            # is deleted on the next line, which would otherwise take the
            # report with it.
            os.rename(
                os.path.join(build_dir, "build_report.json"),
                build_dir + ".report.json",
            )
            shutil.rmtree(build_dir)

    # An explicitly configured sample name wins over the one in the header.
    if self.sample_name is None:
        sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(vcf_header)
    else:
        sample_name = self.sample_name

    gramtools.write_vcf_annotated_using_coverage_from_gramtools(
        mean_depth,
        vcf_records,
        allele_coverage,
        allele_groups,
        self.read_error_rate,
        debug_vcf,
        sample_name=sample_name,
        max_read_length=self.max_read_length,
        filtered_outfile=final_vcf,
    )
def test_get_sample_name_from_vcf_header_lines(self):
    '''get_sample_name_from_vcf_header_lines returns the first sample column, or None'''
    get_name = vcf_file_read.get_sample_name_from_vcf_header_lines
    preamble = ['foo', 'bar']

    # No #CHROM line at all
    self.assertEqual(None, get_name(preamble))

    # Malformed #CHROM line
    with self.assertRaises(vcf_file_read.Error):
        get_name(preamble + ['#CHROM\twrong!'])

    # (final header line, expected sample name)
    cases = [
        ('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', None),
        ('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT', None),
        ('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_name', 'sample_name'),
        ('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_name\tsample_name_2', 'sample_name'),
    ]
    for chrom_line, expected in cases:
        self.assertEqual(expected, get_name(preamble + [chrom_line]))
def _load_vcf_files(cls, filename_list, homozygous_only=False, max_REF_len=None, min_SNP_qual=None, min_dp4=None, min_GT_conf=None):
    '''Loads all the vcf files from filename_list. Returns tuple of:
    1. Sample name. If more than one sample name found, uses the first one
       and logs a warning. If none is found, "sample" is used.
    2. Dictionary. filename => list of header lines for that file
    3. Dictionary. ref name => list of VcfRecords sorted by position

    An empty filename_list gives ("sample", {}, {}).
    The filtering options are passed through to vcf_file_read.vcf_file_to_dict.'''
    headers = {}
    # Start from an empty dict (not None) so that an empty filename_list
    # yields an empty result instead of failing on None.values() below.
    vcf_records = {}
    sample_name = None

    for filename in filename_list:
        headers[filename], new_records = vcf_file_read.vcf_file_to_dict(
            filename,
            homozygous_only=homozygous_only,
            remove_asterisk_alts=True,
            max_REF_len=max_REF_len,
            remove_useless_start_nucleotides=True,
            min_SNP_qual=min_SNP_qual,
            min_dp4=min_dp4,
            min_GT_conf=min_GT_conf)

        new_sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(headers[filename])
        if sample_name is None and new_sample_name is not None:
            sample_name = new_sample_name
        elif new_sample_name != sample_name:
            logging.warning('Using first sample name found "' + str(sample_name) + '". Found a different (or no) sample name "' + str(new_sample_name) + '", which will not be used')

        # Merge this file's records into the combined per-reference lists.
        for ref_name, record_list in new_records.items():
            if ref_name in vcf_records:
                vcf_records[ref_name].extend(record_list)
            else:
                vcf_records[ref_name] = record_list

    for record_list in vcf_records.values():
        record_list.sort(key=operator.attrgetter('POS'))

    if sample_name is None:
        logging.warning('No sample name found in VCF files. Going to use "sample"')
        sample_name = 'sample'

    return sample_name, headers, vcf_records
def run(self):
    '''Check the calls in a VCF by mapping them to the verification reference.

    Steps, in order: optionally filter and cluster the input VCF; load the
    VCF to check; write variants plus flanks to a FASTA, map it to
    self.verify_reference_file and delete the FASTA; update the records and
    gather stats from the SAM file; write the records to self.vcf_file_out;
    work out false negatives (from dnadiff, or from
    self.expected_variants_vcf, or "NA" if neither is available); write the
    stats table, the GT_CONF histograms and the plots.
    '''
    if self.filter_and_cluster_vcf:
        MappingBasedVerifier._filter_vcf_for_clustering(
            self.vcf_file_in,
            self.filtered_vcf,
            discard_ref_calls=self.discard_ref_calls)
        merge_method = 'simple' if self.discard_ref_calls else 'gt_aware'
        clusterer = vcf_clusterer.VcfClusterer(
            [self.filtered_vcf],
            self.vcf_reference_file,
            self.clustered_vcf,
            merge_method=merge_method,
            max_distance_between_variants=self.merge_length)
        clusterer.run()

    vcf_header, vcf_records = vcf_file_read.vcf_file_to_dict(
        self.vcf_to_check, sort=True, remove_useless_start_nucleotides=True)
    sample_from_header = vcf_file_read.get_sample_name_from_vcf_header_lines(vcf_header)
    if sample_from_header is None:
        sample_from_header = 'sample'

    vcf_ref_seqs = {}
    pyfastaq.tasks.file_to_dict(self.vcf_reference_file, vcf_ref_seqs)
    verify_ref_seqs = {}
    pyfastaq.tasks.file_to_dict(self.verify_reference_file, verify_ref_seqs)

    # The FASTA of variants plus flanks is only needed for the mapping step.
    MappingBasedVerifier._write_vars_plus_flanks_to_fasta(
        self.seqs_out, vcf_records, vcf_ref_seqs, self.flank_length)
    MappingBasedVerifier._map_seqs_to_ref(
        self.seqs_out, self.verify_reference_file, self.sam_file_out)
    os.unlink(self.seqs_out)

    stats, gt_conf_hists = MappingBasedVerifier._parse_sam_file_and_update_vcf_records_and_gather_stats(
        self.sam_file_out,
        vcf_records,
        self.flank_length,
        verify_ref_seqs,
        allow_mismatches=self.allow_flank_mismatches,
        exclude_regions=self.exclude_regions,
        max_soft_clipped=self.max_soft_clipped)

    with open(self.vcf_file_out, 'w') as f:
        print(*vcf_header, sep='\n', file=f)
        for record_list in vcf_records.values():
            for record in record_list:
                print(record, file=f)

    # false negative stats, if possible
    stats['variant_regions_total'] = 'NA'
    stats['called_variant_regions'] = 'NA'

    if self.run_dnadiff:
        dnadiffer = dnadiff.Dnadiff(
            self.verify_reference_file,
            self.vcf_reference_file,
            self.dnadiff_outprefix,
        )
        dnadiffer.run()
        regions_total, regions_called = MappingBasedVerifier._get_total_length_of_expected_regions_called(
            dnadiffer.all_variant_intervals, vcf_records)
        stats['variant_regions_total'] = regions_total
        stats['called_variant_regions'] = regions_called
        expected_variants = dnadiffer.variants
    elif self.expected_variants_vcf is not None:
        _, expected_variants = vcf_file_read.vcf_file_to_dict(
            self.expected_variants_vcf,
            sort=True,
            remove_useless_start_nucleotides=True)
    else:
        expected_variants = None

    if expected_variants is None:
        stats['false_negatives'] = 'NA'
    else:
        missed_vcf_records = MappingBasedVerifier._get_missing_vcf_records(
            vcf_records, expected_variants)
        stats['false_negatives'] = 0
        with open(self.vcf_false_negatives_file_out, 'w') as f:
            print('##fileformat=VCFv4.2', file=f)
            print('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', sample_from_header, sep='\t', file=f)
            for vcf_list in missed_vcf_records.values():
                number_missed = len(vcf_list)
                stats['false_negatives'] += number_missed
                if number_missed > 0:
                    print(*vcf_list, sep='\n', file=f)

    # write stats file: fixed column order, one header row and one value row
    stats_columns = [
        'total',
        'gt_correct',
        'gt_wrong',
        'gt_excluded',
        'HET',
        'tp_edit_dist',
        'fp_edit_dist',
        'UNKNOWN_NO_GT',
        'variant_regions_total',
        'called_variant_regions',
        'false_negatives',
    ]
    with open(self.stats_out, 'w') as f:
        print(*stats_columns, sep='\t', file=f)
        print(*(stats[column] for column in stats_columns), sep='\t', file=f)

    # write GT_CONF histogram files, one per key
    for key, filename in self.gt_conf_hists_filenames.items():
        with open(filename, 'w') as f:
            print('GT_CONF\tCount', file=f)
            for gt_conf, count in sorted(gt_conf_hists[key].items()):
                print(gt_conf, count, sep='\t', file=f)

    plots.plots_from_minos_vcf(self.vcf_file_out, self.vcf_file_plots_out)
def run(self):
    '''Check two VCFs against the differences reported by dnadiff.

    Steps, in order: write the dnadiff variants plus flanks to two FASTA
    files; optionally filter and cluster both input VCFs; load the two VCFs
    to check; write their variants plus flanks to FASTA; run
    _map_seqs_to_seqs for each VCF/dnadiff FASTA pair; index both VCFs; run
    _parse_sam_files to produce self.sam_summary; gather stats and write the
    stats table (self.stats_out) and GT_CONF histogram
    (self.gt_conf_hist_out).

    Side effect: ".gz" is appended to self.vcf_to_check1 and
    self.vcf_to_check2 after indexing. Intermediate FASTA/VCF files are not
    removed by this method.
    '''
    # Write files of sequences to search for in each vcf
    DnadiffMappingBasedVerifier._write_dnadiff_plus_flanks_to_fastas(
        self.dnadiff_snps_file,
        self.dnadiff_file1,
        self.dnadiff_file2,
        self.seqs_out_dnadiff1,
        self.seqs_out_dnadiff2,
        self.flank_length)

    # Cluster together variants in each vcf
    if self.filter_and_cluster_vcf:
        DnadiffMappingBasedVerifier._filter_vcf_for_clustering(
            self.vcf_file_in1, self.filtered_vcf1, self.discard_ref_calls)
        DnadiffMappingBasedVerifier._filter_vcf_for_clustering(
            self.vcf_file_in2, self.filtered_vcf2, self.discard_ref_calls)

        # Both VCFs are always clustered with the same merge method.
        merge_method = 'simple' if self.discard_ref_calls else 'gt_aware'
        clusterer1 = vcf_clusterer.VcfClusterer(
            [self.filtered_vcf1],
            self.vcf_reference_file,
            self.clustered_vcf1,
            merge_method=merge_method,
            max_distance_between_variants=self.merge_length)
        clusterer2 = vcf_clusterer.VcfClusterer(
            [self.filtered_vcf2],
            self.vcf_reference_file,
            self.clustered_vcf2,
            merge_method=merge_method,
            max_distance_between_variants=self.merge_length)
        clusterer1.run()
        clusterer2.run()

    _, vcf_records1 = vcf_file_read.vcf_file_to_dict(
        self.vcf_to_check1, sort=True, remove_useless_start_nucleotides=True)
    header2, vcf_records2 = vcf_file_read.vcf_file_to_dict(
        self.vcf_to_check2, sort=True, remove_useless_start_nucleotides=True)

    # The sample name (taken from the second VCF's header) is not used
    # further in this method; the lookup is kept so header parsing still
    # happens.
    sample_from_header = vcf_file_read.get_sample_name_from_vcf_header_lines(header2)
    if sample_from_header is None:
        sample_from_header = 'sample'

    vcf_ref_seqs = {}
    pyfastaq.tasks.file_to_dict(self.vcf_reference_file, vcf_ref_seqs)

    DnadiffMappingBasedVerifier._write_vars_plus_flanks_to_fasta(
        self.seqs_out_vcf1,
        vcf_records1,
        vcf_ref_seqs,
        self.flank_length,
        self.number_ns)
    DnadiffMappingBasedVerifier._write_vars_plus_flanks_to_fasta(
        self.seqs_out_vcf2,
        vcf_records2,
        vcf_ref_seqs,
        self.flank_length,
        self.number_ns)
    DnadiffMappingBasedVerifier._map_seqs_to_seqs(
        self.seqs_out_vcf1, self.seqs_out_dnadiff1, self.sam_file_out1)
    DnadiffMappingBasedVerifier._map_seqs_to_seqs(
        self.seqs_out_vcf2, self.seqs_out_dnadiff2, self.sam_file_out2)

    # From here on both VCFs are referred to by their ".gz" names.
    DnadiffMappingBasedVerifier._index_vcf(self.vcf_to_check1)
    self.vcf_to_check1 += ".gz"
    DnadiffMappingBasedVerifier._index_vcf(self.vcf_to_check2)
    self.vcf_to_check2 += ".gz"

    DnadiffMappingBasedVerifier._parse_sam_files(
        self.dnadiff_snps_file,
        self.sam_file_out1,
        self.sam_file_out2,
        self.vcf_to_check1,
        self.vcf_to_check2,
        self.seqs_out_dnadiff1,
        self.seqs_out_dnadiff2,
        self.sam_summary,
        self.flank_length,
        allow_mismatches=self.allow_flank_mismatches,
        exclude_regions1=self.exclude_regions1,
        exclude_regions2=self.exclude_regions2,
        max_soft_clipped=self.max_soft_clipped,
        number_ns=self.number_ns)
    stats, gt_conf_hist = DnadiffMappingBasedVerifier._gather_stats(self.sam_summary)

    # write stats file: one header row, one value row
    with open(self.stats_out, 'w') as f:
        columns = list(stats.keys())
        print(*columns, sep='\t', file=f)
        print(*(stats[column] for column in columns), sep='\t', file=f)

    # write GT_CONF histogram file, sorted by GT_CONF
    with open(self.gt_conf_hist_out, 'w') as f:
        print('GT_CONF\tCount', file=f)
        for gt_conf, count in sorted(gt_conf_hist.items()):
            print(gt_conf, count, sep='\t', file=f)
def _run_gramtools_with_split_vcf(self):
    '''Run gramtools separately on chunks of the clustered VCF, then merge.

    The clustered VCF is split with vcf_chunker.VcfChunker. For every split
    file, the reads for its region are extracted from self.reads_files[0],
    gramtools is run on those reads plus the unmapped reads, and a filtered
    and an unfiltered VCF are written. The per-split VCFs are merged into
    self.final_vcf and self.unfiltered_vcf_file, and the GT_CONF percentile
    is added to self.final_vcf using the mean of the per-split mean depths
    and depth variances.

    Side effect: self.gramtools_kmer_size is replaced by the chunker's value.
    When self.clean is set, per-split intermediate files are deleted as each
    split finishes and the temporary split VCFs and unmapped reads file are
    deleted at the end.

    Raises:
        Error: if self.split_output_dir cannot be created.
        AssertionError: if no sample name is available for a split.
    '''
    logging.info('Splitting VCF files into chunks (if not already done)')
    chunker = vcf_chunker.VcfChunker(
        self.split_input_dir,
        vcf_infile=self.clustered_vcf,
        ref_fasta=self.ref_fasta,
        variants_per_split=self.variants_per_split,
        alleles_per_split=self.alleles_per_split,
        max_read_length=self.max_read_length,
        total_splits=self.total_splits,
        flank_length=self.max_read_length,
        gramtools_kmer_size=self.gramtools_kmer_size,
    )
    chunker.make_split_files()
    self.gramtools_kmer_size = chunker.gramtools_kmer_size
    logging.info('VCF file split into ' + str(chunker.total_split_files) + ' chunks')

    # Catch only filesystem errors (not KeyboardInterrupt/SystemExit) and
    # keep the underlying cause attached to the raised Error.
    try:
        os.mkdir(self.split_output_dir)
    except OSError as e:
        raise Error('Error making output split directory ' + self.split_output_dir) from e

    # Unmapped reads are extracted once and given to every split's run.
    unmapped_reads_file = os.path.join(self.split_output_dir, 'unmapped_reads.bam')
    bam_read_extract.get_unmapped_reads(self.reads_files[0], unmapped_reads_file)

    split_vcf_outfiles = {}
    split_vcf_outfiles_unfiltered = {}
    mean_depths = []
    depth_variances = []

    for ref_name, split_file_list in chunker.vcf_split_files.items():
        split_vcf_outfiles[ref_name] = []
        split_vcf_outfiles_unfiltered[ref_name] = []

        for split_file in split_file_list:
            logging.info(
                '===== Start analysing variants in VCF split file '
                + split_file.filename + ' =====')
            split_prefix = os.path.join(
                self.split_output_dir, 'split.' + str(split_file.file_number))

            split_reads_file = split_prefix + '.reads.bam'
            bam_read_extract.get_region(
                self.reads_files[0],
                split_file.chrom,
                split_file.chrom_start,
                split_file.chrom_end,
                split_reads_file,
            )

            gramtools_quasimap_dir = split_prefix + '.gramtools.quasimap'
            # The reports returned by run_gramtools are not needed here.
            gramtools.run_gramtools(
                split_file.gramtools_build_dir,
                gramtools_quasimap_dir,
                split_file.filename,
                self.ref_fasta,
                [unmapped_reads_file, split_reads_file],
                self.max_read_length,
                kmer_size=self.gramtools_kmer_size,
            )

            logging.info('Loading split gramtools quasimap output files ' + gramtools_quasimap_dir)
            perl_generated_vcf = os.path.join(
                split_file.gramtools_build_dir, 'perl_generated_vcf')
            (
                mean_depth,
                depth_variance,
                vcf_header,
                vcf_records,
                allele_coverage,
                allele_groups,
            ) = gramtools.load_gramtools_vcf_and_allele_coverage_files(
                perl_generated_vcf, gramtools_quasimap_dir)
            mean_depths.append(mean_depth)
            depth_variances.append(depth_variance)
            logging.info('Finished loading gramtools files')

            # An explicitly configured sample name wins over the header's.
            if self.sample_name is None:
                sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(vcf_header)
            else:
                sample_name = self.sample_name
            assert sample_name is not None

            split_vcf_out = split_prefix + '.out.vcf'
            unfiltered_vcf_out = split_prefix + '.out.debug.calls_with_zero_cov_alleles.vcf'
            logging.info('Writing VCf output file ' + split_vcf_out + ' for split VCF file ' + split_file.filename)
            gramtools.write_vcf_annotated_using_coverage_from_gramtools(
                mean_depth,
                vcf_records,
                allele_coverage,
                allele_groups,
                self.read_error_rate,
                unfiltered_vcf_out,
                self.gramtools_kmer_size,
                sample_name=sample_name,
                max_read_length=self.max_read_length,
                filtered_outfile=split_vcf_out,
            )
            split_vcf_outfiles[ref_name].append(split_vcf_out)
            split_vcf_outfiles_unfiltered[ref_name].append(unfiltered_vcf_out)

            if self.clean:
                logging.info('Cleaning gramtools files from split VCF file ' + split_file.filename)
                # Build dir and split VCF are left alone when the build dir
                # was supplied by the user.
                if not self.user_supplied_gramtools_build_dir:
                    os.rename(
                        os.path.join(split_file.gramtools_build_dir, 'build_report.json'),
                        split_file.gramtools_build_dir + '.report.json')
                    shutil.rmtree(split_file.gramtools_build_dir)
                    os.unlink(split_file.filename)

                # Keep the quasimap report next to the directory being removed.
                os.rename(
                    os.path.join(gramtools_quasimap_dir, 'report.json'),
                    gramtools_quasimap_dir + '.report.json')
                shutil.rmtree(gramtools_quasimap_dir)
                os.unlink(split_reads_file)

            logging.info(
                '===== Finish analysing variants in VCF split file '
                + split_file.filename + ' =====')

    logging.info('Merging VCF files into one output file ' + self.final_vcf)
    chunker.merge_files(split_vcf_outfiles, self.final_vcf)
    chunker.merge_files(split_vcf_outfiles_unfiltered, self.unfiltered_vcf_file)

    # Each split contributes equally, regardless of its size.
    mean_depth = statistics.mean(mean_depths)
    depth_variance = statistics.mean(depth_variances)
    logging.info(
        f'Adding GT_CONF_PERCENTLE to final VCF file {self.final_vcf}, using mean depth {mean_depth}, depth variance {depth_variance}, error rate {self.read_error_rate}, and {self.genotype_simulation_iterations} simulation iterations'
    )
    Adjudicator._add_gt_conf_percentile_to_vcf_file(
        self.final_vcf,
        mean_depth,
        depth_variance,
        self.read_error_rate,
        self.genotype_simulation_iterations)

    if self.clean:
        logging.info('Deleting temp split VCF files')
        for outfiles in split_vcf_outfiles, split_vcf_outfiles_unfiltered:
            for file_list in outfiles.values():
                for filename in file_list:
                    os.unlink(filename)
        os.unlink(unmapped_reads_file)
def _load_vcf_files(
    cls,
    filename_list,
    reference_seqs,
    homozygous_only=False,
    max_REF_len=None,
    min_SNP_qual=None,
    min_dp4=None,
    min_GT_conf=None,
):
    """Loads all the vcf files from filename_list. Returns tuple of:
    1. Sample name. If more than one sample name found, uses the first one
       and logs a warning. If none is found, "sample" is used.
    2. Dictionary. filename => list of header lines for that file
    3. Dictionary. ref name => list of VcfRecords sorted by position.

    reference_seqs should be a dictionary of sequence name -> sequence.
    This causes all records from the VCF to be sanity checked against the
    reference sequence, and any records where the REF seq does not match
    the expected sequence are removed.

    An empty filename_list gives ("sample", {}, {})."""
    headers = {}
    # Start from an empty dict (not None) so that an empty filename_list
    # yields an empty result instead of failing on None.values() below.
    vcf_records = {}
    sample_name = None

    for filename in filename_list:
        headers[filename], new_records = vcf_file_read.vcf_file_to_dict(
            filename,
            homozygous_only=homozygous_only,
            remove_asterisk_alts=True,
            max_REF_len=max_REF_len,
            remove_useless_start_nucleotides=True,
            min_SNP_qual=min_SNP_qual,
            min_dp4=min_dp4,
            min_GT_conf=min_GT_conf,
            reference_seqs=reference_seqs,
            error_on_bad_POS=False,
        )

        new_sample_name = vcf_file_read.get_sample_name_from_vcf_header_lines(
            headers[filename]
        )
        if sample_name is None and new_sample_name is not None:
            sample_name = new_sample_name
        elif new_sample_name != sample_name:
            logging.warning(
                'Using first sample name found "'
                + str(sample_name)
                + '". Found a different (or no) sample name "'
                + str(new_sample_name)
                + '", which will not be used'
            )

        # Merge this file's records into the combined per-reference lists.
        for ref_name, record_list in new_records.items():
            if ref_name in vcf_records:
                vcf_records[ref_name].extend(record_list)
            else:
                vcf_records[ref_name] = record_list

    for record_list in vcf_records.values():
        record_list.sort(key=operator.attrgetter("POS"))

    if sample_name is None:
        logging.warning('No sample name found in VCF files. Going to use "sample"')
        sample_name = "sample"

    return sample_name, headers, vcf_records
def run(self):
    """Check the calls in a VCF by mapping them to the verification reference.

    Steps, in order: optionally filter and cluster the input VCF; load the
    VCF to check; write variants plus flanks to a FASTA, map it to
    self.verify_reference_file and delete the FASTA; update the records and
    gather stats from the SAM file; write the records to self.vcf_file_out;
    work out false negatives (from dnadiff, or from
    self.expected_variants_vcf, or "NA" if neither is available); write the
    stats table, the GT_CONF histograms and the plots.
    """
    if self.filter_and_cluster_vcf:
        MappingBasedVerifier._filter_vcf_for_clustering(
            self.vcf_file_in,
            self.filtered_vcf,
            discard_ref_calls=self.discard_ref_calls,
        )
        merge_method = "simple" if self.discard_ref_calls else "gt_aware"
        clusterer = vcf_clusterer.VcfClusterer(
            [self.filtered_vcf],
            self.vcf_reference_file,
            self.clustered_vcf,
            merge_method=merge_method,
            cluster_boundary_size=self.merge_length,
        )
        clusterer.run()

    vcf_header, vcf_records = vcf_file_read.vcf_file_to_dict(
        self.vcf_to_check, sort=True, remove_useless_start_nucleotides=True)
    sample_from_header = vcf_file_read.get_sample_name_from_vcf_header_lines(vcf_header)
    if sample_from_header is None:
        sample_from_header = "sample"

    vcf_ref_seqs = {}
    pyfastaq.tasks.file_to_dict(self.vcf_reference_file, vcf_ref_seqs)
    verify_ref_seqs = {}
    pyfastaq.tasks.file_to_dict(self.verify_reference_file, verify_ref_seqs)

    # The FASTA of variants plus flanks is only needed for the mapping step.
    MappingBasedVerifier._write_vars_plus_flanks_to_fasta(
        self.seqs_out, vcf_records, vcf_ref_seqs, self.flank_length)
    MappingBasedVerifier._map_seqs_to_ref(
        self.seqs_out, self.verify_reference_file, self.sam_file_out)
    os.unlink(self.seqs_out)

    stats, gt_conf_hists = MappingBasedVerifier._parse_sam_file_and_update_vcf_records_and_gather_stats(
        self.sam_file_out,
        vcf_records,
        self.flank_length,
        verify_ref_seqs,
        allow_mismatches=self.allow_flank_mismatches,
        exclude_regions=self.exclude_regions,
        max_soft_clipped=self.max_soft_clipped,
    )

    with open(self.vcf_file_out, "w") as f:
        print(*vcf_header, sep="\n", file=f)
        for record_list in vcf_records.values():
            for record in record_list:
                print(record, file=f)

    # false negative stats, if possible
    stats["variant_regions_total"] = "NA"
    stats["called_variant_regions"] = "NA"

    if self.run_dnadiff:
        dnadiffer = dnadiff.Dnadiff(
            self.verify_reference_file,
            self.vcf_reference_file,
            self.dnadiff_outprefix,
        )
        dnadiffer.run()
        regions_total, regions_called = MappingBasedVerifier._get_total_length_of_expected_regions_called(
            dnadiffer.all_variant_intervals, vcf_records)
        stats["variant_regions_total"] = regions_total
        stats["called_variant_regions"] = regions_called
        expected_variants = dnadiffer.variants
    elif self.expected_variants_vcf is not None:
        _, expected_variants = vcf_file_read.vcf_file_to_dict(
            self.expected_variants_vcf,
            sort=True,
            remove_useless_start_nucleotides=True,
        )
    else:
        expected_variants = None

    if expected_variants is None:
        stats["false_negatives"] = "NA"
    else:
        missed_vcf_records = MappingBasedVerifier._get_missing_vcf_records(
            vcf_records, expected_variants)
        stats["false_negatives"] = 0
        with open(self.vcf_false_negatives_file_out, "w") as f:
            print("##fileformat=VCFv4.2", file=f)
            print(
                "#CHROM",
                "POS",
                "ID",
                "REF",
                "ALT",
                "QUAL",
                "FILTER",
                "INFO",
                "FORMAT",
                sample_from_header,
                sep="\t",
                file=f,
            )
            for vcf_list in missed_vcf_records.values():
                number_missed = len(vcf_list)
                stats["false_negatives"] += number_missed
                if number_missed > 0:
                    print(*vcf_list, sep="\n", file=f)

    # write stats file: fixed column order, one header row and one value row
    stats_columns = [
        "total",
        "gt_correct",
        "gt_wrong",
        "gt_excluded",
        "HET",
        "tp_edit_dist",
        "fp_edit_dist",
        "UNKNOWN_NO_GT",
        "variant_regions_total",
        "called_variant_regions",
        "false_negatives",
    ]
    with open(self.stats_out, "w") as f:
        print(*stats_columns, sep="\t", file=f)
        print(*(stats[column] for column in stats_columns), sep="\t", file=f)

    # write GT_CONF histogram files, one per key
    for key, filename in self.gt_conf_hists_filenames.items():
        with open(filename, "w") as f:
            print("GT_CONF\tCount", file=f)
            for gt_conf, count in sorted(gt_conf_hists[key].items()):
                print(gt_conf, count, sep="\t", file=f)

    plots.plots_from_minos_vcf(self.vcf_file_out, self.vcf_file_plots_out)