def test_file_to_dict_of_seqs():
    """Check that utils.file_to_dict_of_seqs maps each sequence name to its Fasta record."""
    infile = os.path.join(data_dir, "file_to_dict_of_seqs.fa")
    expect = {
        "seq1": pyfastaq.sequences.Fasta("seq1", "A"),
        "seq2": pyfastaq.sequences.Fasta("seq2", "G"),
    }
    # Parse the file once and compare directly. The test runner reports both
    # dicts on failure, so the old debug print loop (and the second parse of
    # the same file) added nothing.
    assert utils.file_to_dict_of_seqs(infile) == expect
def fix_minimap2_vcf(input_vcf_file, output_vcf_file, ref_fasta, qry_fasta, snps_only):
    """Rewrite a minimap2 VCF file, adding sequence lengths to each record.

    Lines starting with "#" are copied through unchanged. For every other
    line, QUAL is replaced with "." and "LENGTH_QRY=...;LENGTH_REF=...;" is
    prepended to the INFO column. The lengths come from the sequences in
    qry_fasta and ref_fasta; the query name is taken from INFO using the
    module-level qname_matcher pattern.

    When snps_only is true, only records whose REF and ALT are each exactly
    one of A/C/G/T go to output_vcf_file. All other records are written to
    "<output_vcf_file>.discarded_not_snps.vcf".
    """
    single_bases = ("A", "C", "G", "T")
    ref_seqs = utils.file_to_dict_of_seqs(ref_fasta)
    qry_seqs = utils.file_to_dict_of_seqs(qry_fasta)
    rejected = []

    with open(input_vcf_file) as vcf_in, open(output_vcf_file, "w") as vcf_out:
        for raw_line in vcf_in:
            if raw_line.startswith("#"):
                vcf_out.write(raw_line)
                continue

            fields = raw_line.strip().split()
            # QUAL is set to "." so that it is equal to dnadiff
            fields[5] = "."

            # Look up the lengths of the reference and query sequences
            ref_length = len(ref_seqs[fields[0]])
            qry_name = qname_matcher.match(fields[7]).group(1)
            qry_length = len(qry_seqs[qry_name])
            fields[7] = f"LENGTH_QRY={qry_length};LENGTH_REF={ref_length};" + fields[7]

            record = "\t".join(fields)
            is_snp = fields[3] in single_bases and fields[4] in single_bases
            if is_snp or not snps_only:
                print(record, file=vcf_out)
            else:
                rejected.append(record)

    if snps_only:
        with open(f"{output_vcf_file}.discarded_not_snps.vcf", "w") as rejected_out:
            for record in rejected:
                print(record, file=rejected_out)
def _deduplicate_vcf_files_for_probe_mapping(to_merge, ref_fasta, vcf_out):
    """Write a single deduplicated VCF file built from the files in to_merge.

    The records come from _deduplicate_vcf_files followed by
    _identify_vcf_lines. _deduplicate_vcf_files is also given the path
    "<vcf_out>.disagreements_between_dnadiff_and_minimap2".

    The output header has a fileformat line, one contig line per sequence in
    ref_fasta, a GT FORMAT line, and the column header line.
    """
    ref_seqs = utils.file_to_dict_of_seqs(ref_fasta)
    vcf_lines = _deduplicate_vcf_files(
        to_merge, f"{vcf_out}.disagreements_between_dnadiff_and_minimap2"
    )
    vcf_lines = _identify_vcf_lines(vcf_lines)

    with open(vcf_out, "w") as f:
        print("##fileformat=VCFv4.2", file=f)
        for seq in ref_seqs.values():
            print(f"##contig=<ID={seq.id},length={len(seq)}>", file=f)
        print('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">', file=f)
        print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample", file=f)
        # Write one record per line. Joining first and printing the result
        # would emit a stray blank line after the header when there are no
        # records at all.
        for line in vcf_lines:
            print(line, file=f)
def apply_variants_to_genome(ref_fasta, vcf_file, out_fasta):
    """Takes the variants in vcf_file, and applies them to the associated
    reference genome in ref_fasta. Writes a new file out_fasta that has those
    variants applied.

    Only sequences that appear in the dictionary returned by
    _vcf_file_to_dict are written. Each is named "<ref_name>.mutated".
    Records with genotype 0 are ignored, and a record that overlaps an
    already-applied record is skipped with a warning.
    """
    ref_sequences = utils.file_to_dict_of_seqs(ref_fasta)
    vcf_dict = _vcf_file_to_dict(vcf_file)

    with open(out_fasta, "w") as f:
        for ref_name, vcf_records in sorted(vcf_dict.items()):
            old_seq = ref_sequences[ref_name]
            new_seq = list(old_seq.seq)
            previous_ref_start = None
            # Applying indels messes up the coords of any subsequent variant,
            # so start at the end and work backwards
            for vcf_record in reversed(vcf_records):
                genotype = set(vcf_record.FORMAT["GT"].split("/"))
                assert len(genotype) == 1
                allele_index = int(genotype.pop())
                if allele_index == 0:
                    continue

                # Some tools report two (or more) variants that overlap.
                # No clear "right" option here.
                # If the current record overlaps the previous one, ignore it.
                # We could try to be cleverer about this (take best records
                # based on likelihoods or whatever else), but every tool is
                # different so no sane consistent way of doing this across tools
                if (
                    previous_ref_start is not None
                    and vcf_record.ref_end_pos() >= previous_ref_start
                ):
                    # logging.warn is a deprecated alias of logging.warning.
                    logging.warning(
                        "Skipping this record when calculating recall because it overlaps another record: %s",
                        vcf_record,
                    )
                    continue

                previous_ref_start = vcf_record.POS
                allele = vcf_record.ALT[allele_index - 1]
                start, end = vcf_record.POS, vcf_record.ref_end_pos() + 1
                # Working backwards means this slice has not been touched yet,
                # so it must still equal the original reference.
                assert old_seq[start:end] == "".join(new_seq[start:end])
                new_seq[start:end] = [allele]

            new_seq = pyfastaq.sequences.Fasta(f"{ref_name}.mutated", "".join(new_seq))
            print(new_seq, file=f)
def _merge_vcf_files_for_probe_mapping(list_of_vcf_files, ref_fasta, vcf_out):
    """Merge the given VCF files into vcf_out, with one record per ALT allele.

    vcf_merge.merge_vcf_files first writes a merged file to vcf_out. That
    file is read back and then overwritten with a fresh header and the split
    records.
    """
    ref_seqs = utils.file_to_dict_of_seqs(ref_fasta)

    # The merge step puts different ALTs at the same place into one record
    # with a list of ALTs. For probe mapping, we want a separate record for
    # each allele. Also need genotype to be "1/1"
    vcf_merge.merge_vcf_files(list_of_vcf_files, ref_seqs, vcf_out)
    _, merged_records = vcf_file_read.vcf_file_to_list(vcf_out)

    with open(vcf_out, "w") as f:
        print("##fileformat=VCFv4.2", file=f)
        for seq in ref_seqs.values():
            print(f"##contig=<ID={seq.id},length={len(seq)}>", file=f)
        print('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">', file=f)
        print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample", file=f)

        for record_number, merged in enumerate(merged_records):
            # Every allele split out of one merged record shares that
            # record's index as its ID.
            record_id = str(record_number)
            for alt in merged.ALT:
                single = copy.copy(merged)
                single.ID = record_id
                single.ALT = [alt]
                single.INFO = {}
                single.FILTER = {"PASS"}
                single.FORMAT = {"GT": "1/1", "VFR_FILTER": "PASS"}
                print(single, file=f)
def apply_variants_to_genome(ref_fasta, vcf_file, out_fasta, pass_only=True):
    """Apply the variants in vcf_file to the reference sequences in ref_fasta.

    The mutated sequences are written to out_fasta, each one named
    "<ref_name>.mutated". pass_only is forwarded to _vcf_file_to_dict.
    Records whose genotype is 0 leave the sequence unchanged.
    """
    ref_sequences = utils.file_to_dict_of_seqs(ref_fasta)
    records_by_ref = _vcf_file_to_dict(vcf_file, pass_only=pass_only)

    with open(out_fasta, "w") as f:
        for ref_name, records in sorted(records_by_ref.items()):
            original = ref_sequences[ref_name]
            bases = list(original.seq)

            # Go from the last record to the first: an indel shifts the
            # coordinates of everything after it, but not of anything before.
            for record in reversed(records):
                called = set(record.FORMAT["GT"].split("/"))
                assert len(called) == 1
                allele_index = int(called.pop())
                if allele_index != 0:
                    allele = record.ALT[allele_index - 1]
                    start = record.POS
                    end = record.ref_end_pos() + 1
                    assert original[start:end] == "".join(bases[start:end])
                    bases[start:end] = [allele]

            mutated = pyfastaq.sequences.Fasta(f"{ref_name}.mutated", "".join(bases))
            print(mutated, file=f)
def annotate_vcf_with_probe_mapping(
    vcf_in,
    vcf_ref_fasta,
    truth_ref_fasta,
    flank_length,
    vcf_out,
    map_outfile=None,
    use_fail_conflict=False,
    use_ref_calls=False,
    debug=False,
    truth_mask=None,
):
    """Annotate each record of vcf_in by mapping probes to the truth reference.

    Probes and records come from get_probes_and_vcf_records. Each record is
    passed to evaluate_vcf_record and then written to vcf_out. The output
    header is the input header with extra VFR_* FORMAT lines inserted just
    before its last line.

    Args:
        vcf_in: VCF file to annotate.
        vcf_ref_fasta: FASTA file of the reference that vcf_in is relative to.
        truth_ref_fasta: FASTA file of the truth genome; also used to build
            the mappy index.
        flank_length: passed to get_probes_and_vcf_records.
        vcf_out: path of the annotated VCF file to write.
        map_outfile: if not None, a file opened for writing and handed to
            evaluate_vcf_record as its map_outfile argument.
        use_fail_conflict: forwarded to get_probes_and_vcf_records and
            evaluate_vcf_record.
        use_ref_calls: accepted but not used in this function.
        debug: accepted but not used in this function.
        truth_mask: forwarded to evaluate_vcf_record.
    """
    vcf_ref_seqs = utils.file_to_dict_of_seqs(vcf_ref_fasta)
    truth_ref_seqs = utils.file_to_dict_of_seqs(truth_ref_fasta)
    probes_and_vcf_reader = get_probes_and_vcf_records(
        vcf_in,
        vcf_ref_seqs,
        flank_length,
        use_fail_conflict=use_fail_conflict,
    )

    # Some notes on the mapper options...
    #
    # From the docs: score is the "scoring system. It is a tuple/list consisting
    # of 4, 6 or 7 positive integers. The first 4 elements specify match scoring,
    # mismatch penalty, gap open and gap extension penalty. The 5th and 6th
    # elements, if present, set long-gap open and long-gap extension penalty.
    # The 7th sets a mismatch penalty involving ambiguous bases."
    # The defaults of the mappy Python API do not work. In the tests, they
    # result in mappings that make FPs turn into TPs.
    # The options 1,1,5,3 are actually the defaults from bowtie2 and seem to work.
    #
    # k=15 and w=10 are the CLI defaults. On the test data, these values result
    # in the probes near the start of the genome getting mapped, whereas those
    # probes do not get mapped using whatever the Python defaults are.
    #
    # extra_flags=0x4000000 turns on extended cigars, which we use to more easily
    # determine where the matches and mismatches are between the probe and truth
    # reference.
    mapper = mappy.Aligner(
        fn_idx_in=truth_ref_fasta,
        k=15,
        w=10,
        preset="sr",
        n_threads=1,
        extra_flags=0x4000000,
        scoring=[1, 1, 5, 3],
    )

    # The first item from the reader is the list of header lines; the rest
    # are (vcf_record, ref_probe, alt_probe) tuples.
    header_lines = next(probes_and_vcf_reader)

    new_header_lines = [
        '##FORMAT=<ID=VFR_IN_MASK,Number=1,Type=String,Description="Whether or not the variant is in the truth genome mask">',
        '##FORMAT=<ID=VFR_RESULT,Number=1,Type=String,Description="FP, TP, or Partial_TP when part of the allele matches the truth reference">',
        '##FORMAT=<ID=VFR_ALLELE_LEN,Number=1,Type=Integer,Description="Number of positions in allele that were checked if they match the truth">',
        '##FORMAT=<ID=VFR_ALLELE_MATCH_COUNT,Number=1,Type=String,Description="Number of positions in allele that match the truth">',
        '##FORMAT=<ID=VFR_ALLELE_MATCH_FRAC,Number=1,Type=String,Description="Fraction of positions in allele that match the truth">',
        '##FORMAT=<ID=VFR_ED_RA,Number=1,Type=String,Description="Edit distance between ref and alt allele (using the called allele where more than one alt)">',
        '##FORMAT=<ID=VFR_ED_TR,Number=1,Type=String,Description="Edit distance between truth and ref allele">',
        '##FORMAT=<ID=VFR_ED_TA,Number=1,Type=String,Description="Edit distance between truth and alt allele">',
        '##FORMAT=<ID=VFR_ED_SCORE,Number=1,Type=String,Description="Edit distance score">',
    ]

    f_map = None if map_outfile is None else open(map_outfile, "w")
    try:
        with open(vcf_out, "w") as f_vcf:
            print(
                *header_lines[:-1],
                *new_header_lines,
                header_lines[-1],
                sep="\n",
                file=f_vcf,
            )

            for (vcf_record, ref_probe, alt_probe) in probes_and_vcf_reader:
                evaluate_vcf_record(
                    mapper,
                    vcf_record,
                    ref_probe,
                    alt_probe,
                    vcf_ref_seqs[vcf_record.CHROM],
                    truth_ref_seqs,
                    map_outfile=f_map,
                    use_fail_conflict=use_fail_conflict,
                    truth_mask=truth_mask,
                )
                print(vcf_record, file=f_vcf)
    finally:
        # Close the debug map file even if opening vcf_out or evaluating a
        # record raises, so the handle is never leaked.
        if f_map is not None:
            f_map.close()
def _snps_file_to_vcf(snps_file, query_fasta, outfile, snps_only):
    """Convert a dnadiff .snps file into a VCF file of unmerged records.

    snps_file is the .snps file made by dnadiff and query_fasta is the FASTA
    file of query sequences. Records are written to outfile with coordinates
    on the query sequences, grouped by sequence name and sorted by position.

    When snps_only is true, insertions and deletions are left out of outfile
    and are written instead to "<outfile>.discarded_not_snps.vcf".
    """
    records_by_chrom = {}
    variants = pymummer.snp_file.get_all_variants(snps_file)
    query_seqs = utils.file_to_dict_of_seqs(query_fasta)
    discarded = []

    for variant in variants:
        # A reversed variant means that either the ref or the query had to be
        # reverse complemented when aligned by mummer. Apply the matching
        # reverse (complement) fixes so REF and ALT in the VCF are correct.
        if variant.reverse:
            qry_seq = pyfastaq.sequences.Fasta("x", variant.qry_base)
            qry_seq.revcomp()
            variant.qry_base = "".join(reversed(qry_seq.seq))
            ref_seq = pyfastaq.sequences.Fasta("x", variant.ref_base)
            ref_seq.revcomp()
            variant.ref_base = ref_seq.seq

        if variant.var_type == pymummer.variant.SNP:
            # Records are relative to the query, so the variant's ref_*
            # values fill the Q* INFO keys and LENGTH_REF takes the query
            # length (and vice versa).
            info = (
                f"QNAME={variant.ref_name};"
                f"QSTART={variant.ref_start + 1};"
                f"QSTRAND={'-' if variant.reverse else '+'};"
                f"LENGTH_REF={variant.qry_length};"
                f"LENGTH_QRY={variant.ref_length}"
            )
            fields = [
                variant.qry_name,
                str(variant.qry_start + 1),
                ".",
                variant.qry_base,
                variant.ref_base,
                ".",
                ".",
                info,
                "GT",
                "1/1",
            ]
        elif variant.var_type == pymummer.variant.DEL:
            if snps_only:
                discarded.append(variant)
                continue
            # The query is missing sequence that the reference has. Relative
            # to the query that is an insertion, so the record starts with
            # the query nucleotide just before the inserted bases.
            anchor = query_seqs[variant.qry_name][variant.qry_start]
            fields = [
                variant.qry_name,
                str(variant.qry_start + 1),
                ".",
                anchor,
                anchor + variant.ref_base,
                ".",
                ".",
                "SVTYPE=DNADIFF_INS",
                "GT",
                "1/1",
            ]
        elif variant.var_type == pymummer.variant.INS:
            if snps_only:
                discarded.append(variant)
                continue
            # The reference is missing sequence that the query has. Relative
            # to the query that is a deletion, so the record starts with the
            # query nucleotide just before the deleted bases.
            anchor = query_seqs[variant.qry_name][variant.qry_start - 1]
            fields = [
                variant.qry_name,
                str(variant.qry_start),
                ".",
                anchor + variant.qry_base,
                anchor,
                ".",
                ".",
                "SVTYPE=DNADIFF_DEL",
                "GT",
                "1/1",
            ]
        else:
            raise Exception("Unknown variant type: " + str(variant))

        new_record = vcf_record.VcfRecord("\t".join(fields))

        # Sanity check: REF must equal the query sequence at the record's position.
        ref_end = new_record.POS + len(new_record.REF)
        assert new_record.REF == query_seqs[new_record.CHROM][new_record.POS:ref_end]

        records_by_chrom.setdefault(new_record.CHROM, []).append(new_record)

    for chrom_records in records_by_chrom.values():
        chrom_records.sort(key=attrgetter("POS"))

    with open(outfile, "w") as f:
        print("##fileformat=VCFv4.2", file=f)
        for seq in query_seqs.values():
            print(f"##contig=<ID={seq.id},length={len(seq)}>", file=f)
        print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample", file=f)
        for _, chrom_records in sorted(records_by_chrom.items()):
            for record in chrom_records:
                print(record, file=f)

    if snps_only:
        with open(f"{outfile}.discarded_not_snps.vcf", "w") as discarded_fh:
            for variant in discarded:
                print(variant, file=discarded_fh)
def evaluate_vcf(
    vcf_to_eval,
    vcf_ref_fasta,
    truth_ref_fasta,
    flank_length,
    outdir,
    truth_vcf=None,
    debug=False,
    force=False,
    ref_mask_bed_file=None,
    truth_mask_bed_file=None,
    discard_ref_calls=True,
    max_recall_ref_len=None,
    filter_pass=None,
):
    """Evaluate a VCF file against a truth genome and write summary stats.

    Steps, all writing into the new directory outdir:
      1. Optionally mask vcf_to_eval using ref_mask_bed_file.
      2. Filter the VCF with _filter_vcf.
      3. Annotate the filtered VCF via probe mapping, for precision.
      4. Build the recall VCF with recall.get_recall, masking it too when
         ref_mask_bed_file is given.
      5. Gather per-record and summary stats and write them as JSON to
         "<outdir>/summary_stats.json".

    Args:
        vcf_to_eval: VCF file to be evaluated.
        vcf_ref_fasta: FASTA file of the reference for vcf_to_eval.
        truth_ref_fasta: FASTA file of the truth genome.
        flank_length: forwarded to the probe mapping and recall steps.
        outdir: output directory. It is created here, so it must not already
            exist unless force is true.
        truth_vcf: if given, passed to recall.get_recall and truth_fasta is
            passed as None; otherwise truth_ref_fasta is passed.
        debug: if true, a "<precision vcf>.debug.map" file is requested from
            the probe mapping step; also forwarded to recall.get_recall.
        force: if true, remove any existing outdir first.
        ref_mask_bed_file: optional BED file used to mask the VCF files.
        truth_mask_bed_file: optional BED file loaded as the truth mask.
        discard_ref_calls: inverted and passed on as keep_ref_calls /
            use_ref_calls.
        max_recall_ref_len: passed to recall.get_recall as max_ref_len.
        filter_pass: forwarded to _filter_vcf.
    """
    if force:
        # Use an argument list rather than a shell string, so a path with
        # spaces or shell metacharacters is removed literally and cannot be
        # interpreted as extra shell syntax. "--" stops option parsing in
        # case outdir starts with "-".
        subprocess.check_output(["rm", "-rf", "--", outdir])
    os.mkdir(outdir)

    # Mask if needed
    if ref_mask_bed_file is None:
        vcf_to_filter = vcf_to_eval
    else:
        logging.info("Masking VCF...")
        masked_vcf = os.path.join(outdir, "variants_to_eval.masked.vcf")
        utils.mask_vcf_file(vcf_to_eval, ref_mask_bed_file, masked_vcf)
        vcf_to_filter = masked_vcf
        logging.info("Masked VCF")

    vcf_ref_seqs = utils.file_to_dict_of_seqs(vcf_ref_fasta)
    filtered_vcf = os.path.join(outdir, "variants_to_eval.filtered.vcf")
    excluded_vcf = os.path.join(outdir, "variants_to_eval.excluded.vcf")
    logging.info("Filtering VCF...")
    filtered_counts = _filter_vcf(
        vcf_to_filter,
        filtered_vcf,
        excluded_vcf,
        vcf_ref_seqs,
        filter_pass=filter_pass,
        keep_ref_calls=not discard_ref_calls,
    )
    logging.info("Filtering VCF done")

    vcf_for_precision = os.path.join(outdir, "precision.vcf")
    map_outfile = f"{vcf_for_precision}.debug.map" if debug else None
    if truth_mask_bed_file is None:
        truth_mask = None
    else:
        truth_mask = utils.load_mask_bed_file(truth_mask_bed_file)

    logging.info("Annotating VCF with TP/FP for precision...")
    probe_mapping.annotate_vcf_with_probe_mapping(
        filtered_vcf,
        vcf_ref_fasta,
        truth_ref_fasta,
        flank_length,
        vcf_for_precision,
        map_outfile=map_outfile,
        use_ref_calls=not discard_ref_calls,
        truth_mask=truth_mask,
    )
    logging.info("Annotatiing VCF with with TP/FP for precision done")

    logging.info("Calculating recall...")
    recall_dir = os.path.join(outdir, "recall")
    vcf_for_recall = recall.get_recall(
        vcf_ref_fasta,
        filtered_vcf,
        recall_dir,
        flank_length,
        debug=debug,
        truth_fasta=truth_ref_fasta if truth_vcf is None else None,
        truth_vcf=truth_vcf,
        truth_mask=truth_mask,
        max_ref_len=max_recall_ref_len,
    )
    if ref_mask_bed_file is not None:
        logging.info("Masking recall VCF...")
        utils.mask_vcf_file(
            vcf_for_recall, ref_mask_bed_file, f"{vcf_for_recall}.masked.vcf"
        )
        vcf_for_recall = f"{vcf_for_recall}.masked.vcf"
        logging.info("Masking recall VCF done")
    logging.info("Recall calculation done")

    # Gather stats and make plots
    logging.info("Gathering stats...")
    per_record_recall = vcf_stats.per_record_stats_from_vcf_file(vcf_for_recall)
    recall_stats = vcf_stats.summary_stats_from_per_record_stats(
        per_record_recall, for_recall=True
    )

    per_record_precision = vcf_stats.per_record_stats_from_vcf_file(vcf_for_precision)
    precision_stats = vcf_stats.summary_stats_from_per_record_stats(
        per_record_precision
    )

    summary_stats = {"Recall": recall_stats, "Precision": precision_stats}
    _add_overall_precision_and_recall_to_summary_stats(summary_stats)
    summary_stats["Excluded_record_counts"] = filtered_counts

    summary_stats_json = os.path.join(outdir, "summary_stats.json")
    with open(summary_stats_json, "w") as f:
        json.dump(summary_stats, f, indent=2, sort_keys=True)

    logging.info(f"Done. Results written to {summary_stats_json}")