Example #1
def test_file_to_dict_of_seqs():
    infile = os.path.join(data_dir, "file_to_dict_of_seqs.fa")
    expect = {
        "seq1": pyfastaq.sequences.Fasta("seq1", "A"),
        "seq2": pyfastaq.sequences.Fasta("seq2", "G"),
    }
    x = utils.file_to_dict_of_seqs(infile)
    for k, v in x.items():
        print(k, v)
    assert utils.file_to_dict_of_seqs(infile) == expect
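The helper itself is not shown on this page. Judging from the test above, it presumably reads a FASTA file and returns a dict mapping each sequence name to its pyfastaq.sequences.Fasta object. A minimal hypothetical sketch (not the project's actual implementation), using pyfastaq's file_reader:

import pyfastaq.sequences


def file_to_dict_of_seqs(infile):
    # Hypothetical sketch: load every sequence from a FASTA file, keyed by name.
    # A fresh Fasta is built per record because file_reader may reuse its
    # internal sequence object while iterating.
    seqs = {}
    for seq in pyfastaq.sequences.file_reader(infile):
        seqs[seq.id] = pyfastaq.sequences.Fasta(seq.id, seq.seq)
    return seqs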
Example #2
def fix_minimap2_vcf(input_vcf_file, output_vcf_file, ref_fasta, qry_fasta, snps_only):
    ref_seqs = utils.file_to_dict_of_seqs(ref_fasta)
    qry_seqs = utils.file_to_dict_of_seqs(qry_fasta)

    discarded_variants = []
    with open(input_vcf_file) as input_vcf_filehandler,\
         open(output_vcf_file, "w") as output_vcf_filehandler:
        for line in input_vcf_filehandler:
            if line.startswith("#"):
                output_vcf_filehandler.write(line)
            else:
                line_split = line.strip().split()

                # change QUAL to "." so that it matches the dnadiff output
                line_split[5] = "."

                # get REF_LENGTH and QRY_LENGTH
                ref_chrom = line_split[0]
                ref_length = len(ref_seqs[ref_chrom])
                info = line_split[7]
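                # NOTE: `qname_matcher` is not defined in this excerpt; it is
                # presumably a module-level compiled regex that extracts the
                # query name (QNAME=...) from the INFO column.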
                qry_chrom = qname_matcher.match(info).group(1)
                qry_length = len(qry_seqs[qry_chrom])

                # change info
                info = f"LENGTH_QRY={qry_length};LENGTH_REF={ref_length};" + info
                line_split[7] = info

                # remake line
                line = "\t".join(line_split)

                ref = line_split[3]
                alts = line_split[4]
                is_snp = ref in ["A", "C", "G", "T"] and alts in ["A", "C", "G", "T"]

                if not snps_only or is_snp:
                    print(line, file=output_vcf_filehandler)
                else:
                    discarded_variants.append(line)

    if snps_only:
        with open(f"{output_vcf_file}.discarded_not_snps.vcf", "w") as discarded_variants_fh:
            for discarded_variant in discarded_variants:
                print(discarded_variant, file=discarded_variants_fh)
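To make the QUAL and INFO rewrites above concrete, a toy record (field values made up) would change roughly like this; only the QUAL and INFO columns are touched:

# before:  QUAL = 60   INFO = QNAME=qry1;QSTART=90;...
# after:   QUAL = .    INFO = LENGTH_QRY=3900000;LENGTH_REF=4000000;QNAME=qry1;QSTART=90;...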
Example #3
def _deduplicate_vcf_files_for_probe_mapping(to_merge, ref_fasta, vcf_out):
    ref_seqs = utils.file_to_dict_of_seqs(ref_fasta)
    vcf_lines = _deduplicate_vcf_files(to_merge, f"{vcf_out}.disagreements_between_dnadiff_and_minimap2")
    vcf_lines = _identify_vcf_lines(vcf_lines)

    with open(vcf_out, "w") as f:
        print("##fileformat=VCFv4.2", file=f)
        for seq in ref_seqs.values():
            print(f"##contig=<ID={seq.id},length={len(seq)}>", file=f)
        print('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">', file=f)
        print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample", file=f)
        print("\n".join(vcf_lines), file=f)
Example #4
def apply_variants_to_genome(ref_fasta, vcf_file, out_fasta):
    """Takes the variants in vcf_file, and applies them to the associated
    reference genome in ref_fasta. Writes a new file out_fasta that has those
    variants applied"""
    ref_sequences = utils.file_to_dict_of_seqs(ref_fasta)
    vcf_dict = _vcf_file_to_dict(vcf_file)
    with open(out_fasta, "w") as f:
        for ref_name, vcf_records in sorted(vcf_dict.items()):
            old_seq = ref_sequences[ref_name]
            new_seq = list(old_seq.seq)
            previous_ref_start = None
            # Applying indels messes up the coords of any subsequent variant,
            # so start at the end and work backwards
            for vcf_record in reversed(vcf_records):
                genotype = set(vcf_record.FORMAT["GT"].split("/"))
                assert len(genotype) == 1
                allele_index = int(genotype.pop())
                if allele_index == 0:
                    continue

                # Some tools report two (or more) variants that overlap.
                # No clear "right" option here.
                # If the current record overlaps the previous one, ignore it.
                # We could try to be cleverer about this (take best records
                # based on likelihoods or whatever else), but every tool is
                # different, so there is no sane, consistent way to do this across tools.
                if (
                    previous_ref_start is not None
                    and vcf_record.ref_end_pos() >= previous_ref_start
                ):
                    logging.warning(
                        f"Skipping this record when calculating recall because it overlaps another record: {vcf_record}"
                    )
                    continue

                previous_ref_start = vcf_record.POS
                allele = vcf_record.ALT[allele_index - 1]
                start, end = vcf_record.POS, vcf_record.ref_end_pos() + 1
                assert old_seq[start:end] == "".join(new_seq[start:end])
                new_seq[start:end] = [allele]
            new_seq = pyfastaq.sequences.Fasta(f"{ref_name}.mutated", "".join(new_seq))
            print(new_seq, file=f)
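The reverse-iteration comment above is the key trick in this function. A toy illustration (sequence and edits made up) of why applying edits right-to-left keeps the coordinates of earlier edits valid:

seq = list("ACGTACGT")
# Two edits sorted by position: a SNP at index 1 (C -> T), and "TA" at 3-4
# collapsed to "T" (a one-base deletion), given as (start, end, replacement)
# with 0-based half-open coordinates.
edits = [(1, 2, "T"), (3, 5, "T")]
for start, end, allele in reversed(edits):
    # Right-to-left: the rightmost edit changes the sequence length, but the
    # coordinates of the edit to its left are unaffected.
    seq[start:end] = [allele]
print("".join(seq))  # ATGTCGT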
Example #5
def _merge_vcf_files_for_probe_mapping(list_of_vcf_files, ref_fasta, vcf_out):
    ref_seqs = utils.file_to_dict_of_seqs(ref_fasta)
    # This makes a merged file, where two different ALTs at the same place
    # result in one record with a list of ALTs. For probe mapping, we want
    # a separate record for each allele. Also need genotype to be "1/1"
    vcf_merge.merge_vcf_files(list_of_vcf_files, ref_seqs, vcf_out)
    header_lines, vcf_records = vcf_file_read.vcf_file_to_list(vcf_out)
    with open(vcf_out, "w") as f:
        print("##fileformat=VCFv4.2", file=f)
        for seq in ref_seqs.values():
            print(f"##contig=<ID={seq.id},length={len(seq)}>", file=f)
        print('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">', file=f)
        print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample", file=f)

        for i, record in enumerate(vcf_records):
            for alt in record.ALT:
                new_record = copy.copy(record)
                new_record.ID = str(i)
                new_record.ALT = [alt]
                new_record.INFO = {}
                new_record.FILTER = set(["PASS"])
                new_record.FORMAT = {"GT": "1/1", "VFR_FILTER": "PASS"}
                print(new_record, file=f)
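As a concrete illustration of the per-allele split described in the comment above (values made up; exact column rendering depends on the VcfRecord class), one merged record with two ALTs becomes two single-ALT records, each with genotype 1/1:

# merged:  ref1  42  .  A  C,T  ...
# output:  ref1  42  0  A  C  .  PASS  .  GT:VFR_FILTER  1/1:PASS
#          ref1  42  0  A  T  .  PASS  .  GT:VFR_FILTER  1/1:PASS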
Example #6
def apply_variants_to_genome(ref_fasta, vcf_file, out_fasta, pass_only=True):
    """Takes the variants in vcf_file, and applies them to the associated
    reference genome in ref_fasta. Writes a new file out_fasta that has those
    variants applied"""
    ref_sequences = utils.file_to_dict_of_seqs(ref_fasta)
    vcf_dict = _vcf_file_to_dict(vcf_file, pass_only=pass_only)
    with open(out_fasta, "w") as f:
        for ref_name, vcf_records in sorted(vcf_dict.items()):
            old_seq = ref_sequences[ref_name]
            new_seq = list(old_seq.seq)
            # Applying indels messes up the coords of any subsequent variant,
            # so start at the end and work backwards
            for vcf_record in reversed(vcf_records):
                genotype = set(vcf_record.FORMAT["GT"].split("/"))
                assert len(genotype) == 1
                allele_index = int(genotype.pop())
                if allele_index == 0:
                    continue
                allele = vcf_record.ALT[allele_index - 1]
                start, end = vcf_record.POS, vcf_record.ref_end_pos() + 1
                assert old_seq[start:end] == "".join(new_seq[start:end])
                new_seq[start:end] = [allele]
            new_seq = pyfastaq.sequences.Fasta(f"{ref_name}.mutated", "".join(new_seq))
            print(new_seq, file=f)
Example #7
def annotate_vcf_with_probe_mapping(
    vcf_in,
    vcf_ref_fasta,
    truth_ref_fasta,
    flank_length,
    vcf_out,
    map_outfile=None,
    use_fail_conflict=False,
    use_ref_calls=False,
    debug=False,
    truth_mask=None,
):
    vcf_ref_seqs = utils.file_to_dict_of_seqs(vcf_ref_fasta)
    truth_ref_seqs = utils.file_to_dict_of_seqs(truth_ref_fasta)
    probes_and_vcf_reader = get_probes_and_vcf_records(
        vcf_in,
        vcf_ref_seqs,
        flank_length,
        use_fail_conflict=use_fail_conflict,
    )

    # Some notes on the mapper options...
    #
    # From the docs: score is the "scoring system. It is a tuple/list consisting
    # of 4, 6 or 7 positive integers. The first 4 elements specify match scoring,
    # mismatch penalty, gap open and gap extension penalty. The 5th and 6th
    # elements, if present, set long-gap open and long-gap extension penalty.
    # The 7th sets a mismatch penalty involving ambiguous bases."
    # The default scoring values in the mappy Python API do not work: in the tests
    # they result in mappings that turn FPs into TPs.
    # The values 1,1,5,3 are actually the defaults from bowtie2 and seem to work.
    #
    # k=15 and w=10 are the CLI defaults. On the test data, these values result
    # in the probes near the start of the genome getting mapped, whereas those
    # probes do not get mapped using whatever the Python defaults are.
    #
    # extra_flags=0x4000000 turns on extended cigars, which we use to more easily
    # determine where the matches and mismatches are between the probe and truth
    # reference.
    mapper = mappy.Aligner(
        fn_idx_in=truth_ref_fasta,
        k=15,
        w=10,
        preset="sr",
        n_threads=1,
        extra_flags=0x4000000,
        scoring=[1, 1, 5, 3],
    )
    header_lines = next(probes_and_vcf_reader)

    if map_outfile is not None:
        f_map = open(map_outfile, "w")
    else:
        f_map = None

    new_header_lines = [
        '##FORMAT=<ID=VFR_IN_MASK,Number=1,Type=String,Description="Whether or not the variant is in the truth genome mask">',
        '##FORMAT=<ID=VFR_RESULT,Number=1,Type=String,Description="FP, TP, or Partial_TP when part of the allele matches the truth reference">',
        '##FORMAT=<ID=VFR_ALLELE_LEN,Number=1,Type=Integer,Description="Number of positions in allele that were checked if they match the truth">',
        '##FORMAT=<ID=VFR_ALLELE_MATCH_COUNT,Number=1,Type=String,Description="Number of positions in allele that match the truth">',
        '##FORMAT=<ID=VFR_ALLELE_MATCH_FRAC,Number=1,Type=String,Description="Fraction of positions in allele that match the truth">',
        '##FORMAT=<ID=VFR_ED_RA,Number=1,Type=String,Description="Edit distance between ref and alt allele (using the called allele where more than one alt)">',
        '##FORMAT=<ID=VFR_ED_TR,Number=1,Type=String,Description="Edit distance between truth and ref allele">',
        '##FORMAT=<ID=VFR_ED_TA,Number=1,Type=String,Description="Edit distance between truth and alt allele">',
        '##FORMAT=<ID=VFR_ED_SCORE,Number=1,Type=String,Description="Edit distance score">',
    ]

    with open(vcf_out, "w") as f_vcf:
        print(
            *header_lines[:-1],
            *new_header_lines,
            header_lines[-1],
            sep="\n",
            file=f_vcf,
        )

        for (vcf_record, ref_probe, alt_probe) in probes_and_vcf_reader:
            evaluate_vcf_record(
                mapper,
                vcf_record,
                ref_probe,
                alt_probe,
                vcf_ref_seqs[vcf_record.CHROM],
                truth_ref_seqs,
                map_outfile=f_map,
                use_fail_conflict=use_fail_conflict,
                truth_mask=truth_mask,
            )
            print(vcf_record, file=f_vcf)

    if map_outfile is not None:
        f_map.close()
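For reference, a minimal sketch of how probes could be mapped with an aligner configured as above and the extended cigar inspected (the truth FASTA path and probe sequence are made up; mappy is minimap2's Python binding):

import mappy

aligner = mappy.Aligner(
    fn_idx_in="truth.fa",     # hypothetical truth reference
    preset="sr",
    k=15,
    w=10,
    n_threads=1,
    extra_flags=0x4000000,    # emit extended (=/X) cigar operators
    scoring=[1, 1, 5, 3],
)
probe = "ACGTACGTACGTACGTACGTACGTACGTACGT"  # made-up probe sequence
for hit in aligner.map(probe):
    # With extended cigars, "=" marks matches and "X" marks mismatches, so the
    # allele inside the probe can be compared base-by-base against the truth.
    print(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str)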
Example #8
def _snps_file_to_vcf(snps_file, query_fasta, outfile, snps_only):
    """Loads the .snps file made by dnadiff.
    query_fasta = fasta file of query sequences.
    Writes a new VCF file of unmerged records."""
    vcf_records = {}
    variants = pymummer.snp_file.get_all_variants(snps_file)
    query_seqs = utils.file_to_dict_of_seqs(query_fasta)
    discarded_variants = []

    for variant in variants:
        # If the variant is reversed, it means that either the ref or query had to be
        # reverse complemented when aligned by mummer. Need to do the appropriate
        # reverse (complement) fixes so the VCF has the correct REF and ALT sequences
        if variant.reverse:
            qry_seq = pyfastaq.sequences.Fasta("x", variant.qry_base)
            qry_seq.revcomp()
            variant.qry_base = "".join(reversed(qry_seq.seq))
            ref_seq = pyfastaq.sequences.Fasta("x", variant.ref_base)
            ref_seq.revcomp()
            variant.ref_base = ref_seq.seq

        if variant.var_type == pymummer.variant.SNP:
            new_record = vcf_record.VcfRecord("\t".join([
                variant.qry_name,
                str(variant.qry_start + 1),
                ".",
                variant.qry_base,
                variant.ref_base,
                ".",
                ".",
                f"QNAME={variant.ref_name};"
                f"QSTART={variant.ref_start + 1};"
                f"QSTRAND={'-' if variant.reverse else '+'};"
                f"LENGTH_REF={variant.qry_length};"
                f"LENGTH_QRY={variant.ref_length}",
                "GT",
                "1/1",
            ]))
        elif variant.var_type == pymummer.variant.DEL:
            if snps_only:
                discarded_variants.append(variant)
                continue

            # The query has sequence missing, compared to the
            # reference. We're making VCF records w.r.t. the
            # query, so this is an insertion. So need to
            # get the nucleotide before the insertion as well.
            new_record = vcf_record.VcfRecord("\t".join([
                variant.qry_name,
                str(variant.qry_start + 1),
                ".",
                query_seqs[variant.qry_name][variant.qry_start],
                query_seqs[variant.qry_name][variant.qry_start] +
                variant.ref_base,
                ".",
                ".",
                "SVTYPE=DNADIFF_INS",
                "GT",
                "1/1",
            ]))
        elif variant.var_type == pymummer.variant.INS:
            if snps_only:
                discarded_variants.append(variant)
                continue

            # The ref has sequence missing, compared to the
            # query. We're making VCF records w.r.t. the
            # query, so this is a deletion. So need to
            # get the nucleotide before the deletion as well.
            new_record = vcf_record.VcfRecord("\t".join([
                variant.qry_name,
                str(variant.qry_start),
                ".",
                query_seqs[variant.qry_name][variant.qry_start - 1] +
                variant.qry_base,
                query_seqs[variant.qry_name][variant.qry_start - 1],
                ".",
                ".",
                "SVTYPE=DNADIFF_DEL",
                "GT",
                "1/1",
            ]))
        else:
            raise Exception("Unknown variant type: " + str(variant))

        assert (new_record.REF == query_seqs[new_record.CHROM]
                [new_record.POS:new_record.POS + len(new_record.REF)])

        if new_record.CHROM not in vcf_records:
            vcf_records[new_record.CHROM] = []

        vcf_records[new_record.CHROM].append(new_record)

    for vcf_list in vcf_records.values():
        vcf_list.sort(key=attrgetter("POS"))

    with open(outfile, "w") as f:
        print("##fileformat=VCFv4.2", file=f)
        for seq in query_seqs.values():
            print(f"##contig=<ID={seq.id},length={len(seq)}>", file=f)
        print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample",
              file=f)

        for key, vcf_list in sorted(vcf_records.items()):
            for record in vcf_list:
                print(record, file=f)

    if snps_only:
        with open(f"{outfile}.discarded_not_snps.vcf",
                  "w") as discarded_variants_fh:
            for discarded_variant in discarded_variants:
                print(discarded_variant, file=discarded_variants_fh)
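To make the perspective flip in the DEL branch concrete: dnadiff reports the query missing sequence relative to the reference, which, written with respect to the query, becomes an insertion anchored on the query base at the reported position. A toy illustration (all names and coordinates made up):

# dnadiff says the query is missing a "G" at 0-based query position 99,
# where the query base is "T".
# The query-centric VCF record is an anchored insertion:
#   qry1  100  .  T  TG  .  .  SVTYPE=DNADIFF_INS  GT  1/1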
Example #9
def evaluate_vcf(
    vcf_to_eval,
    vcf_ref_fasta,
    truth_ref_fasta,
    flank_length,
    outdir,
    truth_vcf=None,
    debug=False,
    force=False,
    ref_mask_bed_file=None,
    truth_mask_bed_file=None,
    discard_ref_calls=True,
    max_recall_ref_len=None,
    filter_pass=None,
):
    if force:
        subprocess.check_output(f"rm -rf {outdir}", shell=True)
    os.mkdir(outdir)

    # Mask if needed
    if ref_mask_bed_file is None:
        vcf_to_filter = vcf_to_eval
    else:
        logging.info("Masking VCF...")
        masked_vcf = os.path.join(outdir, "variants_to_eval.masked.vcf")
        utils.mask_vcf_file(vcf_to_eval, ref_mask_bed_file, masked_vcf)
        vcf_to_filter = masked_vcf
        logging.info("Masked VCF")

    vcf_ref_seqs = utils.file_to_dict_of_seqs(vcf_ref_fasta)
    filtered_vcf = os.path.join(outdir, "variants_to_eval.filtered.vcf")
    excluded_vcf = os.path.join(outdir, "variants_to_eval.excluded.vcf")
    logging.info("Filtering VCF...")
    filtered_counts = _filter_vcf(
        vcf_to_filter,
        filtered_vcf,
        excluded_vcf,
        vcf_ref_seqs,
        filter_pass=filter_pass,
        keep_ref_calls=not discard_ref_calls,
    )
    logging.info("Filtering VCF done")

    vcf_for_precision = os.path.join(outdir, "precision.vcf")
    map_outfile = f"{vcf_for_precision}.debug.map" if debug else None
    if truth_mask_bed_file is None:
        truth_mask = None
    else:
        truth_mask = utils.load_mask_bed_file(truth_mask_bed_file)

    logging.info("Annotating VCF with TP/FP for precision...")
    probe_mapping.annotate_vcf_with_probe_mapping(
        filtered_vcf,
        vcf_ref_fasta,
        truth_ref_fasta,
        flank_length,
        vcf_for_precision,
        map_outfile=map_outfile,
        use_ref_calls=not discard_ref_calls,
        truth_mask=truth_mask,
    )
    logging.info("Annotatiing VCF with with TP/FP for precision done")

    logging.info("Calculating recall...")
    recall_dir = os.path.join(outdir, "recall")
    vcf_for_recall = recall.get_recall(
        vcf_ref_fasta,
        filtered_vcf,
        recall_dir,
        flank_length,
        debug=debug,
        truth_fasta=truth_ref_fasta if truth_vcf is None else None,
        truth_vcf=truth_vcf,
        truth_mask=truth_mask,
        max_ref_len=max_recall_ref_len,
    )
    if ref_mask_bed_file is not None:
        logging.info("Masking recall VCF...")
        utils.mask_vcf_file(
            vcf_for_recall, ref_mask_bed_file, f"{vcf_for_recall}.masked.vcf"
        )
        vcf_for_recall = f"{vcf_for_recall}.masked.vcf"
        logging.info("Masking recall VCF done")
    logging.info("Recall calculation done")

    # Gather stats and make plots
    logging.info("Gathering stats...")
    per_record_recall = vcf_stats.per_record_stats_from_vcf_file(vcf_for_recall)
    recall_stats = vcf_stats.summary_stats_from_per_record_stats(
        per_record_recall, for_recall=True
    )

    per_record_precision = vcf_stats.per_record_stats_from_vcf_file(vcf_for_precision)
    precision_stats = vcf_stats.summary_stats_from_per_record_stats(
        per_record_precision
    )

    summary_stats = {"Recall": recall_stats, "Precision": precision_stats}
    _add_overall_precision_and_recall_to_summary_stats(summary_stats)
    summary_stats["Excluded_record_counts"] = filtered_counts

    summary_stats_json = os.path.join(outdir, "summary_stats.json")
    with open(summary_stats_json, "w") as f:
        json.dump(summary_stats, f, indent=2, sort_keys=True)

    logging.info(f"Done. Results written to {summary_stats_json}")