def test_run_with_small_var_vcf_chunking_total_splits(self):
    '''test run with chunking small variant VCF file using total_splits option'''
    # Header lines starting with any of these are dropped before comparing,
    # because the date, minos version and bcftools version might not match.
    volatile_prefixes = (
        '##fileDate',
        '##source=minos',
        '##bcftools_mergeVersion',
    )
    input_tsv = 'tmp.multi_sample_pipeline.run.in.tsv'
    outdir = 'tmp.multi_sample_pipeline.run.out'
    ref_fasta = os.path.join(data_dir, 'run.ref.0.fa')

    # One row per sample: VCF path, then reads path
    with open(input_tsv, 'w') as f:
        for sample in ('1', '2'):
            reads = os.path.join(data_dir, f'run.reads.{sample}.sorted.bam')
            vcf = os.path.join(data_dir, f'run.calls.{sample}.vcf')
            print(vcf, reads, sep='\t', file=f)

    # Start from a clean output directory
    if os.path.exists(outdir):
        shutil.rmtree(outdir)

    pipeline = multi_sample_pipeline.MultiSamplePipeline(
        ref_fasta,
        input_tsv,
        outdir,
        total_splits=3,
        min_large_ref_length=10,
        testing=True,
        clean=False,
    )
    pipeline.run()

    expected_header, expected_lines = vcf_file_read.vcf_file_to_list(
        os.path.join(data_dir, 'run.out.vcf'))
    got_vcf = os.path.join(outdir, 'combined_calls.vcf')
    self.assertTrue(os.path.exists(got_vcf))
    got_header, got_lines = vcf_file_read.vcf_file_to_list(got_vcf)

    expected_header = [
        x for x in expected_header if not x.startswith(volatile_prefixes)
    ]
    got_header = [x for x in got_header if not x.startswith(volatile_prefixes)]
    self.assertEqual(expected_header, got_header)
    self.assertEqual(expected_lines, got_lines)

    # Temporary files are only removed when the checks above passed
    shutil.rmtree(outdir)
    os.unlink(input_tsv)
def check_vcfs(expected_vcf, got_vcf):
    """Return whether two VCF files have the same header and records.

    Before comparing, the expected header's ``##fileDate=`` line is replaced
    with today's date and its ``##source=cluster_vcf_records`` line with the
    current ``cluster_vcf_records_version``.
    """
    expected_header, expected_vcf_records = vcf_file_read.vcf_file_to_list(
        expected_vcf)
    got_header, got_vcf_records = vcf_file_read.vcf_file_to_list(got_vcf)

    def refreshed(line):
        # Swap run-dependent header lines for the values expected today
        if line.startswith("##fileDate="):
            return "##fileDate=" + str(datetime.date.today())
        if line.startswith("##source=cluster_vcf_records"):
            return ("##source=cluster_vcf_records, version " +
                    cluster_vcf_records_version)
        return line

    expected_header = [refreshed(line) for line in expected_header]
    return expected_header == got_header and expected_vcf_records == got_vcf_records
def _test_run_with_small_var_vcf_chunking_vars_per_split(self):
    """test run with chunking small variant VCF file using variants_per_split option"""
    # NOTE(review): the leading underscore presumably keeps the test runner
    # from collecting this test - confirm that it is meant to stay disabled.
    #
    # Header lines starting with any of these are dropped before comparing,
    # because the date, minos version and bcftools version might not match.
    volatile_prefixes = (
        "##fileDate",
        "##source=minos",
        "##bcftools_mergeVersion",
    )
    input_tsv = "tmp.multi_sample_pipeline.run.in.tsv"
    outdir = "tmp.multi_sample_pipeline.run.out"
    ref_fasta = os.path.join(data_dir, "run.ref.0.fa")

    # One row per sample: VCF path followed by two reads columns.
    # Both reads columns point at the same BAM file.
    with open(input_tsv, "w") as f:
        for sample in ("1", "2"):
            reads1 = os.path.join(data_dir, f"run.reads.{sample}.sorted.bam")
            reads2 = os.path.join(data_dir, f"run.reads.{sample}.sorted.bam")
            vcf = os.path.join(data_dir, f"run.calls.{sample}.vcf")
            print(vcf, reads1, reads2, sep="\t", file=f)

    # Start from a clean output directory
    if os.path.exists(outdir):
        shutil.rmtree(outdir)

    pipeline = multi_sample_pipeline.MultiSamplePipeline(
        ref_fasta,
        input_tsv,
        outdir,
        variants_per_split=3,
        min_large_ref_length=10,
        testing=True,
        clean=False,
    )
    pipeline.run()

    expected_header, expected_lines = vcf_file_read.vcf_file_to_list(
        os.path.join(data_dir, "run.out.vcf"))
    got_vcf = os.path.join(outdir, "combined_calls.vcf")
    self.assertTrue(os.path.exists(got_vcf))
    got_header, got_lines = vcf_file_read.vcf_file_to_list(got_vcf)

    expected_header = [
        x for x in expected_header if not x.startswith(volatile_prefixes)
    ]
    got_header = [x for x in got_header if not x.startswith(volatile_prefixes)]
    self.assertEqual(expected_header, got_header)
    self.assertEqual(expected_lines, got_lines)

    # Temporary files are only removed when the checks above passed
    shutil.rmtree(outdir)
    os.unlink(input_tsv)
def check_vcfs(expected_vcf, got_vcf):
    """Assert that two VCF files have the same header and records.

    The expected header's ``##fileDate=`` and ``##source=minos`` lines are
    first rewritten to today's date and the current ``minos_version``.
    ``self`` and ``minos_version`` are not parameters; they are taken from
    the enclosing scope.
    """
    expected_header, expected_vcf_records = vcf_file_read.vcf_file_to_list(
        expected_vcf)
    got_header, got_vcf_records = vcf_file_read.vcf_file_to_list(got_vcf)

    for idx, header_line in enumerate(expected_header):
        if header_line.startswith("##fileDate="):
            expected_header[idx] = "##fileDate=" + str(datetime.date.today())
        elif header_line.startswith("##source=minos"):
            expected_header[idx] = "##source=minos, version " + minos_version

    self.assertEqual(expected_header, got_header)
    self.assertEqual(expected_vcf_records, got_vcf_records)
def test_load_gramtools_vcf_and_allele_coverage_files(self):
    """test load_gramtools_vcf_and_allele_coverage_files"""
    quasimap_dir = os.path.join(
        data_dir, "load_gramtools_vcf_and_allele_coverage_files.quasimap")
    vcf_file = os.path.join(data_dir,
                            "load_gramtools_vcf_and_allele_coverage.vcf")

    loaded = gramtools.load_gramtools_vcf_and_allele_coverage_files(
        vcf_file, quasimap_dir)
    (
        got_mean_depth,
        got_depth_variance,
        got_vcf_header,
        got_vcf_records,
        got_allele_coverage,
        got_allele_groups,
    ) = loaded

    # The header and records must be what a plain read of the VCF returns
    expected_header, expected_vcf_records = vcf_file_read.vcf_file_to_list(
        vcf_file)
    self.assertEqual(expected_header, got_vcf_header)
    self.assertEqual(expected_vcf_records, got_vcf_records)
    self.assertEqual(10.500, got_mean_depth)
    self.assertEqual(0.5, got_depth_variance)

    # now test bad files cause error to be raised
    for bad_kind in ("short", "long", "bad_allele_count"):
        vcf_file = os.path.join(
            data_dir, f"load_gramtools_vcf_and_allele_coverage.{bad_kind}.vcf")
        with self.assertRaises(Exception):
            gramtools.load_gramtools_vcf_and_allele_coverage_files(
                vcf_file, quasimap_dir)
def test_vcf_file_to_list(self):
    """test vcf_file_to_list"""
    expected_header = ["# header1", "# header2"]
    expected_records = [
        vcf_record.VcfRecord(line)
        for line in (
            "ref_42\t12\tid_foo\tC\tG\t42.43\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,53:39.81",
            "ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80",
            "ref_43\t42\tid_foo\tT\tG\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.82",
        )
    ]

    # Both input files are expected to give the same header and records
    # (the second is presumably a gzipped copy of the first).
    for filename in ("vcf_file_to_list.vcf", "vcf_file_to_list.vcf.gz"):
        infile = os.path.join(data_dir, filename)
        got_header, got_records = vcf_file_read.vcf_file_to_list(infile)
        self.assertEqual(expected_header, got_header)
        self.assertEqual(expected_records, got_records)
def test_vcf_file_to_list(self):
    '''test vcf_file_to_list'''
    expected_header = ['# header1', '# header2']
    lines = [
        'ref_42\t12\tid_foo\tC\tG\t42.43\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,53:39.81',
        'ref_42\t11\tid_foo\tA\tG\t42.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,52:39.80',
        'ref_43\t42\tid_foo\tT\tG\t43.42\tPASS\tKMER=31;SVLEN=0;SVTYPE=SNP\tGT:COV:GT_CONF\t1/1:0,54:39.82',
    ]
    expected_records = list(map(vcf_record.VcfRecord, lines))

    def check_file(basename):
        # Load one test file and compare it against the expected content
        got_header, got_records = vcf_file_read.vcf_file_to_list(
            os.path.join(data_dir, basename))
        self.assertEqual(expected_header, got_header)
        self.assertEqual(expected_records, got_records)

    check_file('vcf_file_to_list.vcf')
    check_file('vcf_file_to_list.vcf.gz')
def per_record_stats_from_vcf_file(infile):
    """Gathers stats for each record in a VCF file.

    Returns a list of dictionaries of stats. One dict per VCF line.
    List is sorted by ref seq name (CHROM), then position (POS).

    Each dict holds the wanted FORMAT keys (value "NA" when the record lacks
    the key), plus "CHROM" and "POS". "POS" is stored as record.POS + 1 and
    "FRS" is always taken from _frs_from_vcf_record(). Values are converted
    to int/float where a type is known; values that cannot be converted
    (such as "NA") are left unchanged.
    """
    wanted_keys = [
        "DP",
        "DPF",
        "FRS",
        "GT_CONF",
        "GT_CONF_PERCENTILE",
        "VFR_IN_MASK",
        "VFR_ED_RA",
        "VFR_ED_TR",
        "VFR_ED_TA",
        "VFR_FILTER",
        "VFR_ALLELE_LEN",
        "VFR_ALLELE_MATCH_COUNT",
        "VFR_ALLELE_MATCH_FRAC",
        "VFR_RESULT",
    ]
    key_types = {
        "DP": int,
        "DPF": float,
        "GT_CONF": float,
        "GT_CONF_PERCENTILE": float,
        "FRS": float,
        "VFR_IN_MASK": int,
        "VFR_ED_RA": int,
        "VFR_ED_TR": int,
        "VFR_ED_TA": int,
        "VFR_ALLELE_MATCH_FRAC": float,
        "VFR_ALLELE_LEN": int,
        "VFR_ALLELE_MATCH_COUNT": int,
    }
    _, vcf_records = vcf_file_read.vcf_file_to_list(infile)

    stats = []
    for record in vcf_records:
        record_stats = {x: record.FORMAT.get(x, "NA") for x in wanted_keys}
        record_stats["FRS"] = _frs_from_vcf_record(record)
        record_stats["CHROM"] = record.CHROM
        record_stats["POS"] = record.POS + 1

        for key, key_type in key_types.items():
            try:
                record_stats[key] = key_type(record_stats[key])
            except (ValueError, TypeError):
                # Best effort: keep placeholders such as "NA" as they are.
                # Only conversion failures are ignored, so KeyboardInterrupt
                # and SystemExit are no longer swallowed.
                pass

        stats.append(record_stats)

    stats.sort(key=itemgetter("CHROM", "POS"))
    return stats
def test_normalise_vcf():
    """Check utils.normalise_vcf output records, with and without breaking alleles."""
    infile = os.path.join(data_dir, "normalise_vcf.in.vcf")
    ref_fa = os.path.join(data_dir, "normalise_vcf.in.fa")
    tmp_out = "tmp.normalise_vcf.vcf"
    utils.rm_rf(tmp_out)

    # (expected output file, extra keyword arguments for normalise_vcf).
    # The second case tests without breaking alleles into separate records.
    cases = (
        ("normalise_vcf.out.vcf", {}),
        ("normalise_vcf.out.no_break_alleles.vcf", {"break_alleles": False}),
    )
    for expect_name, options in cases:
        utils.normalise_vcf(infile, ref_fa, tmp_out, **options)
        expect = os.path.join(data_dir, expect_name)
        _, expected_vcf_records = vcf_file_read.vcf_file_to_list(expect)
        _, got_vcf_records = vcf_file_read.vcf_file_to_list(tmp_out)
        # The normalizing commands add lots of lines to the header.
        # We don't care about those, so just check the actual records.
        assert got_vcf_records == expected_vcf_records
        os.unlink(tmp_out)
def get_probes_and_vcf_records(
    vcf_file, ref_seqs, flank_length, use_fail_conflict=False
):
    """Generator over the records of a VCF file and their probes.

    Input vcf_file is assumed to have been made by
    vcf_qc_annotate.add_qc_to_vcf(), so that each record has the FORMAT tag
    VFR_FILTER.

    The first item yielded is the list of header lines. After that, one
    tuple (vcf_record, ref_probe, alt_probe) is yielded per record. Both
    probes are None when the record's VFR_FILTER is not a wanted value, or
    when the record's REF does not match the reference sequence (in which
    case VFR_FILTER is set to REF_STRING_MISMATCH).

    vcf_file = name of VCF file.
    ref_seqs = dictionary of sequence name -> sequence.
    flank_length = number of nucleotides to add either side of variant sequence.
    use_fail_conflict = passed to _get_wanted_format() to choose the wanted
    VFR_FILTER values."""
    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(vcf_file)
    yield header_lines
    wanted_format = _get_wanted_format(use_fail_conflict)

    for record in vcf_records:
        if record.FORMAT["VFR_FILTER"] not in wanted_format:
            yield record, None, None
            continue

        flank_start = max(0, record.POS - flank_length)
        ref_seq = ref_seqs[record.CHROM]

        if ref_seq[record.POS : record.POS + len(record.REF)] != record.REF:
            record.set_format_key_value("VFR_FILTER", "REF_STRING_MISMATCH")
            yield record, None, None
            continue

        # Flanking sequence is clipped at the ends of the reference
        flank_end = min(len(ref_seq) - 1, record.ref_end_pos() + flank_length)
        left_flank = ref_seq[flank_start : record.POS]
        right_flank = ref_seq[record.ref_end_pos() + 1 : flank_end + 1]
        probe_allele_start = record.POS - flank_start

        # The probed allele is the first allele of the genotype call
        alt_index = int(record.FORMAT["GT"].split("/")[0])
        if alt_index == 0:
            alt_allele = record.REF
        else:
            alt_allele = record.ALT[alt_index - 1]

        # Same flanks for both probes; only the allele in the middle differs.
        # The alt probe is built first, then the ref probe.
        built = []
        for allele in (alt_allele, record.REF):
            allele_end = probe_allele_start + len(allele) - 1
            allele_probe = probe.Probe(
                left_flank + allele + right_flank, probe_allele_start, allele_end
            )
            assert allele_probe.allele_seq() == allele
            built.append(allele_probe)

        alt_probe, ref_probe = built
        yield record, ref_probe, alt_probe
def _vcf_file_to_dict(vcf_file):
    """Loads VCF file. Returns a dictionary of
    sequence name -> list of that sequence's records, sorted by position"""
    _, vcf_records = vcf_file_read.vcf_file_to_list(vcf_file)

    by_chrom = {}
    for record in vcf_records:
        by_chrom.setdefault(record.CHROM, []).append(record)

    position_of = operator.attrgetter("POS")
    for chrom_records in by_chrom.values():
        chrom_records.sort(key=position_of)

    return by_chrom
def run(self):
    """Split self.infile into a small-variants VCF and a long-deletions VCF.

    Both output files get all the input header lines. Each input record is
    split with split_into_snps(); each resulting record goes to
    self.outfile_small_vars when its REF is shorter than
    self.min_large_ref_length, otherwise to self.outfile_long_deletions."""
    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(self.infile)

    with open(self.outfile_small_vars, "w") as f_small, \
            open(self.outfile_long_deletions, "w") as f_big:
        for handle in (f_small, f_big):
            print(*header_lines, sep="\n", file=handle)

        for original_record in vcf_records:
            for record in original_record.split_into_snps():
                is_small = len(record.REF) < self.min_large_ref_length
                print(record, file=f_small if is_small else f_big)
def get_probes_and_vcf_records(vcf_file, ref_seqs, flank_length, use_fail_conflict=False):
    """Generator over the records of a VCF file and their probes.

    The first item yielded is the list of header lines. After that, one
    tuple (vcf_record, ref_probe, alt_probe) is yielded per record, where
    the probes come from make_probes().

    vcf_file = name of VCF file.
    ref_seqs = dictionary of sequence name -> sequence.
    flank_length = number of nucleotides to add either side of variant sequence.
    use_fail_conflict is not used by this function."""
    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(vcf_file)
    yield header_lines

    for record_index, vcf_record in enumerate(vcf_records):
        # make_probes gets the whole record list plus the index of this record
        probes = make_probes(ref_seqs, vcf_records, record_index, flank_length)
        ref_probe, alt_probe = probes
        yield vcf_record, ref_probe, alt_probe
def load_gramtools_vcf_and_allele_coverage_files(vcf_file, quasimap_dir):
    """Loads the perl_generated_vcf file and allele_coverage files.

    Sanity checks that they agree: 1) same number of lines (excluding
    header lines in vcf) and 2) number of alts agree on each line.
    Raises error at the first time something wrong is found.

    Returns a tuple:
    (mean coverage, coverage variance, VCF header lines, VCF records,
    all allele coverage, allele groups).
    The mean and variance are rounded to 3 decimal places. The coverage of a
    record is the sum of its allele combination coverage values."""
    outputs_dir = os.path.join(quasimap_dir, "quasimap_outputs")
    all_allele_coverage, allele_groups = load_allele_files(
        os.path.join(outputs_dir, "allele_base_coverage.json"),
        os.path.join(outputs_dir, "grouped_allele_counts_coverage.json"),
    )
    vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)

    if len(all_allele_coverage) != len(vcf_lines):
        raise Exception(
            f"Number of records in VCF ({len(vcf_lines)}) does not match "
            f"number output from gramtools.({len(all_allele_coverage)}). "
            "Cannot continue")

    coverages = []
    paired = zip(all_allele_coverage, vcf_lines)
    for line_number, (coverage_pair, vcf_line) in enumerate(paired, start=1):
        allele_combi_coverage, allele_per_base_coverage = coverage_pair
        # One per-base coverage entry is expected for REF plus each ALT
        if len(allele_per_base_coverage) != 1 + len(vcf_line.ALT):
            raise Exception(
                "Mismatch in number of alleles for this VCF record:\n"
                f"{str(vcf_line)}\nLine number is {line_number}")
        coverages.append(sum(allele_combi_coverage.values()))

    assert len(coverages) > 0

    # Unlikely to happen edge case on real data is when coverages has length 1.
    # It happens when running test_run in adjudicator_test, with a split VCf.
    # One of the splits only has 1 record.
    variance = (1.000 if len(coverages) == 1 else round(
        statistics.variance(coverages), 3))

    return (
        round(statistics.mean(coverages), 3),
        variance,
        vcf_header,
        vcf_lines,
        all_allele_coverage,
        allele_groups,
    )
def minos_vcf_to_plot_data(infile, outfile):
    """Write a tab-delimited file of DP and GT_CONF for each record of a VCF.

    Only records that have both DP and GT_CONF in FORMAT are used. When any
    record has MINOS_CHECK_GENOTYPE of "0" (FP) or "1" (TP), a third column
    TP_OR_FP is written, with "Unknown" for records that have no such call.

    Returns the set of TP/FP/Unknown types seen, or None (and writes no
    file) when no record had both DP and GT_CONF."""
    _, records = vcf_file_read.vcf_file_to_list(infile)
    rows = []
    tp_or_fp_types = set()

    for record in records:
        if record.FORMAT is None:
            continue

        dp = record.FORMAT.get("DP", None)
        gt_conf = record.FORMAT.get("GT_CONF", None)
        if dp is None or gt_conf is None:
            continue

        row = [dp, gt_conf]
        check_geno = record.FORMAT.get("MINOS_CHECK_GENOTYPE", None)
        if check_geno is not None:
            if check_geno == "0":
                tp_or_fp_type = "FP"
            elif check_geno == "1":
                tp_or_fp_type = "TP"
            else:
                tp_or_fp_type = "Unknown"
            tp_or_fp_types.add(tp_or_fp_type)
            row.append(tp_or_fp_type)
        rows.append(row)

    if not rows:
        logging.warning("No DP and GT_CONF data found in VCF file " +
                        infile + " therefore no plots will be made")
        return None

    output_cols = ["DP", "GT_CONF"]
    if tp_or_fp_types & {"TP", "FP"}:
        output_cols.append("TP_OR_FP")
    width = len(output_cols)

    with open(outfile, "w") as f:
        print(*output_cols, sep="\t", file=f)
        for row in rows:
            # Pad rows with no TP/FP call; trim the call when that column
            # is not being written
            if len(row) < width:
                row.append("Unknown")
            print(*row[:width], sep="\t", file=f)

    return tp_or_fp_types
def minos_vcf_to_plot_data(infile, outfile):
    '''Write a tab-delimited file of DP and GT_CONF for each record of a VCF.

    Only records that have both DP and GT_CONF in FORMAT are used. When any
    record has MINOS_CHECK_GENOTYPE of '0' (FP) or '1' (TP), a third column
    TP_OR_FP is written, with 'Unknown' for records that have no such call.

    Returns the set of TP/FP/Unknown types seen, or None (and writes no
    file) when no record had both DP and GT_CONF.'''
    header, records = vcf_file_read.vcf_file_to_list(infile)
    data = []
    tp_or_fp_types = set()

    for record in records:
        fmt = record.FORMAT
        if fmt is None:
            continue
        check_geno = fmt.get('MINOS_CHECK_GENOTYPE', None)
        dp = fmt.get('DP', None)
        gt_conf = fmt.get('GT_CONF', None)
        if dp is None or gt_conf is None:
            continue

        to_append = [dp, gt_conf]
        if check_geno is not None:
            tp_or_fp_type = ('FP' if check_geno == '0' else
                             'TP' if check_geno == '1' else 'Unknown')
            tp_or_fp_types.add(tp_or_fp_type)
            to_append.append(tp_or_fp_type)
        data.append(to_append)

    if len(data) == 0:
        logging.warning('No DP and GT_CONF data found in VCF file ' + infile +
                        ' therefore no plots will be made')
        return None

    output_cols = ['DP', 'GT_CONF']
    if 'TP' in tp_or_fp_types or 'FP' in tp_or_fp_types:
        output_cols.append('TP_OR_FP')
    n_cols = len(output_cols)

    with open(outfile, 'w') as f:
        print(*output_cols, sep='\t', file=f)
        for fields in data:
            # Rows without a TP/FP call are padded; rows with one are
            # trimmed when the TP_OR_FP column is not being written
            padded = fields if len(fields) >= n_cols else fields + ['Unknown']
            print(*padded[:n_cols], sep='\t', file=f)

    return tp_or_fp_types
def _filter_fps_and_long_vars_from_probe_mapped_vcf(vcf_in, vcf_out, max_ref_len, detailed_VCF=False):
    """vcf_in should be file made by _merge_vcf_files_for_probe_mapping, and
    then annotated using probe_mapping.annotate_vcf_with_probe_mapping().
    Outputs a new VCF file that only contains the TPs, based on probe mapping.

    Consecutive records with the same ID are handled as one group. Within a
    group, records are kept if VFR_RESULT is TP, VFR_IN_MASK is not "1", and
    (when max_ref_len is not None) REF is no longer than max_ref_len. A
    group is written only if exactly one record is left; if more than one is
    left they are logged and skipped. Unless detailed_VCF is True, FORMAT of
    a written record is replaced with just GT=1/1."""
    kept_header_prefixes = (
        "#CHROM",
        "##fileformat",
        "##contig",
        "##FORMAT=<ID=GT",
    )
    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(vcf_in)
    total = len(vcf_records)

    with open(vcf_out, "w") as f:
        for line in header_lines:
            if line.startswith(kept_header_prefixes):
                print(line, file=f)

        start = 0
        while start < total:
            # Find the end of the run of records sharing the first one's ID
            end = start + 1
            while end < total and vcf_records[end].ID == vcf_records[start].ID:
                end += 1

            candidates = [
                x for x in vcf_records[start:end]
                if x.FORMAT.get("VFR_RESULT", "FP") == "TP"
                and x.FORMAT.get("VFR_IN_MASK", "0") != "1"
            ]
            start = end

            if max_ref_len is not None:
                candidates = [x for x in candidates if len(x.REF) <= max_ref_len]

            if len(candidates) == 1:
                if not detailed_VCF:
                    candidates[0].FORMAT = {"GT": "1/1"}
                print(candidates[0], file=f)
            elif len(candidates) > 1:
                logging.warning(
                    "Skipping the following VCF lines. They conflict, but probe mapping thinks they are all true-positives:"
                )
                for record in candidates:
                    logging.warning(f" {record}")
def load_gramtools_vcf_and_allele_coverage_files(vcf_file, quasimap_dir):
    """Loads the perl_generated_vcf file and allele_coverage files.

    The coverage of each record comes from
    _coverage_list_from_allele_coverage(); entries that are None are ignored.

    Returns a tuple:
    (mean coverage, coverage variance, VCF header lines, VCF records,
    all allele coverage, allele groups).
    The mean and variance are rounded to 3 decimal places."""
    vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
    all_allele_coverage, allele_groups = _load_quasimap_json_files(quasimap_dir)

    def known_coverages(**options):
        # Coverage per record, leaving out records where it is None
        per_record = _coverage_list_from_allele_coverage(
            all_allele_coverage, vcf_lines=vcf_lines, **options
        )
        return [x for x in per_record if x is not None]

    coverages = known_coverages()

    # Unlikely to happen edge case when there were no SNPs in the input.
    # Or the few SNPs we do get coverage for all have coverage zero.
    # By default, we only counted read depth at SNPs, so in this edge case we
    # get no read depths. Do the coverage estimate again, but use indels to
    # estimate. Is likely to be a little less accurate, but we have no choice.
    if not coverages or max(coverages) == 0:
        coverages = known_coverages(use_indels=True)

    assert len(coverages) > 0

    # Unlikely to happen edge case on real data is when coverages has length 1.
    # It happens when running test_run in adjudicator_test, with a split VCf.
    # One of the splits only has 1 record.
    variance = (
        1.000 if len(coverages) == 1 else round(statistics.variance(coverages), 3)
    )

    return (
        round(statistics.mean(coverages), 3),
        variance,
        vcf_header,
        vcf_lines,
        all_allele_coverage,
        allele_groups,
    )
def _vcf_file_to_dict(vcf_file, pass_only=True):
    """Loads VCF file. Returns a dictionary of
    sequence name -> list of that sequence's records, sorted by position.

    Only records whose FORMAT VFR_FILTER is PASS are kept. When pass_only is
    false, records with VFR_FILTER of FAIL_BUT_TEST are kept as well."""
    wanted_format = {"PASS"} if pass_only else {"PASS", "FAIL_BUT_TEST"}
    _, vcf_records = vcf_file_read.vcf_file_to_list(vcf_file)

    records = {}
    for record in vcf_records:
        if record.FORMAT["VFR_FILTER"] in wanted_format:
            records.setdefault(record.CHROM, []).append(record)

    for chrom_records in records.values():
        chrom_records.sort(key=operator.attrgetter("POS"))

    return records
def _add_gt_conf_percentile_to_vcf_file(cls, vcf_file, mean_depth, depth_variance, error_rate, iterations):
    '''Overwrites vcf_file, with new version that has GT_CONF_PERCENTILE added.

    Genotype confidence simulations are run using mean_depth, depth_variance,
    error_rate and iterations. A FORMAT header line describing
    GT_CONF_PERCENTILE is inserted straight after the first header line that
    starts with ##FORMAT=<ID=GT_CONF. Each record that has GT_CONF, and a GT
    with no '.' in it, gets GT_CONF_PERCENTILE set from the simulations,
    using GT_CONF rounded to the nearest integer. All records are written
    back to vcf_file.

    Raises Exception if the header has no GT_CONF description.'''
    simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator(
        mean_depth,
        depth_variance,
        error_rate,
        allele_length=1,
        iterations=iterations)
    simulations.run_simulations()
    vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)

    # Find where the GT_CONF description is. The for/else raises when the
    # loop finishes without finding it.
    for i, line in enumerate(vcf_header):
        if line.startswith('##FORMAT=<ID=GT_CONF'):
            break
    else:
        raise Exception(
            f'No GT_CONF description found in header of VCF file {vcf_file}. Cannot continue'
        )

    # The header line must end with ">", otherwise it is malformed VCF
    vcf_header.insert(
        i + 1,
        '##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">'
    )

    with open(vcf_file, 'w') as f:
        print(*vcf_header, sep='\n', file=f)
        for vcf_record in vcf_lines:
            if 'GT_CONF' in vcf_record.FORMAT:
                conf = int(round(float(vcf_record.FORMAT['GT_CONF'])))
                # Only records with a genotype call get a percentile
                if 'GT' in vcf_record.FORMAT and '.' not in vcf_record.FORMAT['GT']:
                    vcf_record.set_format_key_value(
                        'GT_CONF_PERCENTILE',
                        str(simulations.get_percentile(conf)))
            print(vcf_record, file=f)
def _merge_vcf_files_for_probe_mapping(list_of_vcf_files, ref_fasta, vcf_out):
    """Merge VCF files into vcf_out, in the form needed for probe mapping.

    vcf_merge.merge_vcf_files() makes a merged file, where two different
    ALTs at the same place result in one record with a list of ALTs. For
    probe mapping, we want a separate record for each allele, and the
    genotype needs to be "1/1". So vcf_out is rewritten with a minimal
    header and one record per ALT. Records made from the same merged record
    share an ID, which is the index of that merged record."""
    ref_seqs = utils.file_to_dict_of_seqs(ref_fasta)
    vcf_merge.merge_vcf_files(list_of_vcf_files, ref_seqs, vcf_out)
    _, merged_records = vcf_file_read.vcf_file_to_list(vcf_out)

    with open(vcf_out, "w") as f:
        header = ["##fileformat=VCFv4.2"]
        header.extend(
            f"##contig=<ID={seq.id},length={len(seq)}>" for seq in ref_seqs.values()
        )
        header.append('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')
        header.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample")
        print(*header, sep="\n", file=f)

        for record_index, merged in enumerate(merged_records):
            for alt in merged.ALT:
                # Shallow copy, then replace every field that must differ
                single_alt = copy.copy(merged)
                single_alt.ID = str(record_index)
                single_alt.ALT = [alt]
                single_alt.INFO = {}
                single_alt.FILTER = {"PASS"}
                single_alt.FORMAT = {"GT": "1/1", "VFR_FILTER": "PASS"}
                print(single_alt, file=f)
def _add_gt_conf_percentile_and_filters_to_vcf_file(
    cls,
    vcf_file,
    mean_depth,
    depth_variance,
    error_rate,
    iterations,
    min_dp=5,
    min_gcp=5,
):
    """Overwrites vcf_file, with new version that has GT_CONF_PERCENTILE added,
    and filter for DP and GT_CONF_PERCENTILE.

    Genotype confidence simulations are only run when mean_depth > 0.
    Header lines describing GT_CONF_PERCENTILE and the MIN_DP and MIN_GCP
    filters are inserted after the first header line that starts with
    ##FORMAT=<ID=GT_CONF.

    FILTER of every record is reset. A record that has both GT and GT_CONF:
    - with no "." in GT: gets GT_CONF_PERCENTILE from the simulations, then
      MIN_DP is added if DP is present and < min_dp, MIN_GCP is added if
      GT_CONF_PERCENTILE < min_gcp, and PASS is added if no filter was added.
    - otherwise: gets GT_CONF_PERCENTILE of "0.0" and no filters.

    Raises Exception if the header has no GT_CONF description."""
    # NOTE(review): block nesting was reconstructed from a source with no
    # line structure - confirm against the original layout.
    # NOTE(review): when mean_depth <= 0, `simulations` is never assigned, so
    # the first record with a called genotype would raise UnboundLocalError.
    # Presumably no genotypes are called at zero depth - confirm.
    if mean_depth > 0:
        simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator(
            mean_depth,
            depth_variance,
            error_rate,
            allele_length=1,
            iterations=iterations,
        )
        simulations.run_simulations()
    vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
    # Find the GT_CONF description. The for/else raises when the loop
    # finishes without finding it.
    for i, line in enumerate(vcf_header):
        if line.startswith("##FORMAT=<ID=GT_CONF"):
            break
    else:
        raise Exception(
            f"No GT_CONF description found in header of VCF file {vcf_file}. Cannot continue"
        )
    # Each insert is at the same index, so the lines end up in the header in
    # the reverse of the order below: MIN_GCP, MIN_DP, GT_CONF_PERCENTILE.
    vcf_header.insert(
        i + 1,
        r"""##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">""",
    )
    vcf_header.insert(
        i + 1, f'##FILTER=<ID=MIN_DP,Description="Minimum DP of {min_dp}">')
    vcf_header.insert(
        i + 1,
        f'##FILTER=<ID=MIN_GCP,Description="Minimum GT_CONF_PERCENTILE of {min_gcp}">',
    )
    with open(vcf_file, "w") as f:
        print(*vcf_header, sep="\n", file=f)
        for vcf_record in vcf_lines:
            # Any filters already on the record are discarded
            vcf_record.FILTER = set()
            if "GT" in vcf_record.FORMAT and "GT_CONF" in vcf_record.FORMAT:
                if "." not in vcf_record.FORMAT["GT"]:
                    # The percentile lookup uses GT_CONF rounded to nearest int
                    conf = int(round(float(vcf_record.FORMAT["GT_CONF"])))
                    vcf_record.set_format_key_value(
                        "GT_CONF_PERCENTILE",
                        str(simulations.get_percentile(conf)))
                    if ("DP" in vcf_record.FORMAT
                            and float(vcf_record.FORMAT["DP"]) < min_dp):
                        vcf_record.FILTER.add("MIN_DP")
                    if float(vcf_record.FORMAT["GT_CONF_PERCENTILE"]
                             ) < min_gcp:
                        vcf_record.FILTER.add("MIN_GCP")
                    if len(vcf_record.FILTER) == 0:
                        vcf_record.FILTER.add("PASS")
                else:
                    # Add a default null percentile
                    vcf_record.set_format_key_value(
                        "GT_CONF_PERCENTILE", "0.0")
            print(vcf_record, file=f)
def vcf_records_are_the_same(file1, file2):
    """Compares the records of two VCF files, ignoring their header lines.

    Returns True if the records in the two files are the same.
    Returns False if any records are different"""
    expect_header, expect_records = vcf_file_read.vcf_file_to_list(file1)
    got_header, got_records = vcf_file_read.vcf_file_to_list(file2)
    records_match = got_records == expect_records
    return records_match
def _add_gt_conf_percentile_and_filters_to_vcf_file(
    cls,
    vcf_file,
    geno_simulations,
    min_dp=0,
    min_gcp=5,
    min_frs=0.9,
    conf_scores_file=None,
):
    """Overwrites vcf_file, with new version that has GT_CONF_PERCENTILE added,
    and filter for DP, GT_CONF_PERCENTILE, and FRS.

    geno_simulations supplies get_percentile(), which is used to look up the
    percentile of each rounded GT_CONF. Header lines describing
    GT_CONF_PERCENTILE and the MIN_FRS, MIN_DP and MIN_GCP filters are
    inserted after the first header line that starts with
    ##FORMAT=<ID=GT_CONF.

    FILTER of every record is reset. A record that has both GT and GT_CONF:
    - with no "." in GT: gets GT_CONF_PERCENTILE from geno_simulations, then
      MIN_DP is added if DP is present and < min_dp, MIN_GCP is added if
      GT_CONF_PERCENTILE < min_gcp, MIN_FRS is added if FRS is present and
      < min_frs, and PASS is added if no filter was added.
    - otherwise: gets GT_CONF_PERCENTILE of "0.0" and no filters.

    If conf_scores_file is not None, the rounded GT_CONF of each record with
    a called genotype is written to that file, one per line.

    Raises Exception if the header has no GT_CONF description."""
    # NOTE(review): block nesting was reconstructed from a source with no
    # line structure - confirm against the original layout.
    if conf_scores_file is not None:
        real_conf_scores = []
    vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
    # Find the GT_CONF description. The for/else raises when the loop
    # finishes without finding it.
    for i, line in enumerate(vcf_header):
        if line.startswith("##FORMAT=<ID=GT_CONF"):
            break
    else:
        raise Exception(
            f"No GT_CONF description found in header of VCF file {vcf_file}. Cannot continue"
        )
    # Slice assignment inserts the new lines, in this order, straight after
    # the line that was found
    vcf_header[i + 1:i + 1] = [
        '##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">',
        f'##FILTER=<ID=MIN_FRS,Description="Minimum FRS of {min_frs}">',
        f'##FILTER=<ID=MIN_DP,Description="Minimum DP of {min_dp}">',
        f'##FILTER=<ID=MIN_GCP,Description="Minimum GT_CONF_PERCENTILE of {min_gcp}">',
    ]
    with open(vcf_file, "w") as f:
        print(*vcf_header, sep="\n", file=f)
        for vcf_record in vcf_lines:
            # Any filters already on the record are discarded
            vcf_record.FILTER = set()
            if "GT" in vcf_record.FORMAT and "GT_CONF" in vcf_record.FORMAT:
                if "." not in vcf_record.FORMAT["GT"]:
                    # The percentile lookup uses GT_CONF rounded to nearest int
                    conf = int(round(float(vcf_record.FORMAT["GT_CONF"])))
                    vcf_record.set_format_key_value(
                        "GT_CONF_PERCENTILE",
                        str(geno_simulations.get_percentile(conf)),
                    )
                    if ("DP" in vcf_record.FORMAT
                            and float(vcf_record.FORMAT["DP"]) < min_dp):
                        vcf_record.FILTER.add("MIN_DP")
                    if float(vcf_record.FORMAT["GT_CONF_PERCENTILE"]
                             ) < min_gcp:
                        vcf_record.FILTER.add("MIN_GCP")
                    if ("FRS" in vcf_record.FORMAT
                            and float(vcf_record.FORMAT["FRS"]) < min_frs):
                        vcf_record.FILTER.add("MIN_FRS")
                    if len(vcf_record.FILTER) == 0:
                        vcf_record.FILTER.add("PASS")
                    if conf_scores_file is not None:
                        real_conf_scores.append(conf)
                else:
                    # Add a default null percentile
                    vcf_record.set_format_key_value(
                        "GT_CONF_PERCENTILE", "0.0")
            print(vcf_record, file=f)
    if conf_scores_file is not None:
        with open(conf_scores_file, "w") as f:
            print(*real_conf_scores, sep="\n", file=f)
def _add_gt_conf_percentile_and_filters_to_vcf_file(
        cls,
        vcf_file,
        mean_depth,
        depth_variance,
        error_rate,
        iterations,
        min_dp=2,
        min_gt_conf_percentile=2.5):
    '''Overwrites vcf_file, with new version that has GT_CONF_PERCENTILE added,
    and filter for DP and GT_CONF_PERCENTILE.

    Genotype confidence simulations are run using mean_depth, depth_variance,
    error_rate and iterations. Header lines describing GT_CONF_PERCENTILE and
    the MIN_DP and MIN_GCP filters are inserted after the first header line
    that starts with ##FORMAT=<ID=GT_CONF.

    FILTER of every record is reset. A record that has GT_CONF, and a GT with
    no '.' in it, gets GT_CONF_PERCENTILE from the simulations. Then MIN_DP
    is added if DP is present and < min_dp, MIN_GCP is added if
    GT_CONF_PERCENTILE < min_gt_conf_percentile, and PASS is added if no
    filter was added. Other records are written with an empty FILTER.

    Raises Exception if the header has no GT_CONF description.'''
    # NOTE(review): block nesting was reconstructed from a source with no
    # line structure - confirm against the original layout.
    simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator(
        mean_depth,
        depth_variance,
        error_rate,
        allele_length=1,
        iterations=iterations)
    simulations.run_simulations()
    vcf_header, vcf_lines = vcf_file_read.vcf_file_to_list(vcf_file)
    # Find the GT_CONF description. The for/else raises when the loop
    # finishes without finding it.
    for i, line in enumerate(vcf_header):
        if line.startswith('##FORMAT=<ID=GT_CONF'):
            break
    else:
        raise Exception(
            f'No GT_CONF description found in header of VCF file {vcf_file}. Cannot continue'
        )
    # Each insert is at the same index, so the lines end up in the header in
    # the reverse of the order below: MIN_GCP, MIN_DP, GT_CONF_PERCENTILE.
    vcf_header.insert(
        i + 1,
        r'''##FORMAT=<ID=GT_CONF_PERCENTILE,Number=1,Type=Float,Description="Percentile of GT_CONF">'''
    )
    vcf_header.insert(
        i + 1, f'##FILTER=<ID=MIN_DP,Description="Minimum DP of {min_dp}">')
    vcf_header.insert(
        i + 1,
        f'##FILTER=<ID=MIN_GCP,Description="Minimum GT_CONF_PERCENTILE of {min_gt_conf_percentile}">'
    )
    with open(vcf_file, 'w') as f:
        print(*vcf_header, sep='\n', file=f)
        for vcf_record in vcf_lines:
            # Any filters already on the record are discarded
            vcf_record.FILTER = set()
            if 'GT_CONF' in vcf_record.FORMAT:
                # The percentile lookup uses GT_CONF rounded to nearest int
                conf = int(round(float(vcf_record.FORMAT['GT_CONF'])))
                if 'GT' in vcf_record.FORMAT and '.' not in vcf_record.FORMAT[
                        'GT']:
                    vcf_record.set_format_key_value(
                        'GT_CONF_PERCENTILE',
                        str(simulations.get_percentile(conf)))
                    if 'DP' in vcf_record.FORMAT and float(
                            vcf_record.FORMAT['DP']) < min_dp:
                        vcf_record.FILTER.add('MIN_DP')
                    if float(vcf_record.FORMAT['GT_CONF_PERCENTILE']
                             ) < min_gt_conf_percentile:
                        vcf_record.FILTER.add('MIN_GCP')
                    if len(vcf_record.FILTER) == 0:
                        vcf_record.FILTER.add('PASS')
            print(vcf_record, file=f)