Esempio n. 1
0
def eval_isophase_phaseswitch(isophase_vcf: str,
                              config_file: Path,
                              out_f: TextIOWrapper,
                              name: str = "NA") -> None:
    """Count genotype changes between consecutive SNPs of an IsoPhase VCF.

    The genotype of every sample at the first record is remembered. For each
    later record, a sample whose genotype is phased (contains ``|``) and uses
    more than one distinct allele is compared with the genotype last
    remembered for that sample; a difference counts as one switch. A single
    tab-separated line is written to ``out_f``:
    name, chr, start, end, strand, number of samples, number of switches.

    Args:
        isophase_vcf: VCF input handed to ``vcfpy.Reader``.
        config_file: Config file parsed by ``read_config`` into
            (chr, start, end, strand).
        out_f: Writable text handle receiving the summary line.
        name: Label written in the first output column.
    """
    _chr, _start, _end, _strand = read_config(config_file)

    reader = vcfpy.Reader(isophase_vcf)

    first_record = next(reader, None)
    if first_record is None:
        # A VCF without any record has no samples to compare and no switches.
        out_f.write(f"{name}\t{_chr}\t{_start}\t{_end}\t{_strand}\t0\t0\n")
        return

    # sample -> last remembered GT (ex: '0|1'), seeded from the first SNP
    prev = {call.sample: call.data.GT for call in first_record.samples}
    last_record = first_record
    num_switch = 0

    for record in reader:
        last_record = record
        for call in record.samples:
            genotype = call.data.GT
            if "|" not in genotype:
                continue  # ignore those with just one allele
            # Works for two or more phased haplotypes; the previous
            # two-way unpacking crashed on genotypes such as '0|1|0'.
            if len(set(genotype.split("|"))) == 1:
                continue  # for now, ignore results that only use one allele
            if prev[call.sample] != genotype:
                num_switch += 1
            prev[call.sample] = genotype

    out_f.write(
        f"{name}\t{_chr}\t{_start}\t{_end}\t{_strand}\t{len(last_record.samples)}\t{num_switch}\n"
    )
Esempio n. 2
0
def brangus(vcf_filename, out_filename, unzip_snps=None):
    """Evaluate IsoPhase SNP calls of every ``by_loci/*size*/`` directory.

    For each locus directory that contains ``phased.partial.vcf`` the
    positions to recover are computed with ``get_positions_to_recover`` and
    the comparison rows are written by ``eval_isophase`` into a tab-separated
    file at ``out_filename``.

    Args:
        vcf_filename: VCF read into ``unzip_snps`` when that argument is None.
        out_filename: Path of the tab-separated report to create.
        unzip_snps: Optional pre-loaded mapping chrom -> pos -> VCF record.
    """
    if unzip_snps is None:
        unzip_snps = defaultdict(dict)
        for r in vcfpy.Reader(vcf_filename):
            unzip_snps[r.CHROM][r.POS] = r

    logger.info(f"Finished reading {vcf_filename}")
    with open(out_filename, "w") as out_f:
        FIELDS = [
            "dir",
            "chrom",
            "pos",
            "strand",
            "ref",
            "alt_Short",
            "alt_PB",
            "in_Short",
            "in_PB",
            "cov_Short",
            "cov_PB",
            "genomic_HP",
        ]
        writer = DictWriter(out_f, FIELDS, delimiter="\t")
        writer.writeheader()
        for d1 in glob.glob("by_loci/*size*/"):
            mpileup = Path(d1, "ccs.mpileup")
            mapfile = Path(d1, "fake.mapping.txt")
            vcffile = Path(d1, "phased.partial.vcf")
            config = Path(d1, "config")
            nosnp = Path(d1, "phased.partial.NO_SNPS_FOUND")

            if not vcffile.exists():
                if nosnp.exists():
                    # The marker file is present: nothing to evaluate here.
                    logger.info(f"Skipping {d1} because no SNPs found.")
                else:
                    # Neither the VCF nor the marker exists: report it as an
                    # error instead of mislabelling it as "no SNPs found".
                    logger.error(
                        f"Skipping {d1} because neither {vcffile.name} "
                        f"nor {nosnp.name} exists.")
                continue

            logger.info(f"Evaluating {d1}.")
            strand = "NA"
            if config.exists():  # find the strand this gene family is on
                with open(config) as config_f:
                    for line in config_f:
                        if line.startswith("ref_strand="):
                            strand = line.strip().split("=")[1]
            good_positions, cov_at_pos = get_positions_to_recover(
                mapfile, mpileup, unzip_snps, min_cov=30)
            name = d1.split("/")[1]
            eval_isophase(
                vcffile,
                unzip_snps,
                good_positions,
                cov_at_pos,
                {},  # no repeat annotation for this data set
                {},  # no short read coverage for this data set
                writer,
                name,
                strand,
            )
def collect_all_vcf(
    dirs: str,
    vcf_filename: str = "phased.partial.vcf",
    output: str = "IsoSeq_IsoPhase.vcf",
) -> None:
    """Merge the per-directory IsoPhase VCFs into one single-sample VCF.

    For every record the per-sample HQ counts are summed per allele and the
    samples are replaced by one call named ``SAMPLE``. Records are written to
    ``output`` sorted by chromosome name and then position. Nothing is
    written when no input VCF was found.

    Args:
        dirs: Iterated as a collection of directories, each expected to hold
            ``vcf_filename`` (the ``str`` annotation is kept for interface
            compatibility).
        vcf_filename: Name of the VCF file inside every directory.
        output: Path of the merged VCF to create.
    """
    # Marker written next to the VCF when a locus legitimately has no SNPs.
    no_snp_found_filename = f"{Path(vcf_filename).stem}.NO_SNPS_FOUND"
    snps_by_chrom = defaultdict(list)

    reader = None

    for d in dirs:
        filename = Path(d, vcf_filename)
        if not filename.exists():
            # Only a directory with neither the VCF nor the marker is worth
            # reporting; the marker is looked up inside the directory itself.
            if not Path(d, no_snp_found_filename).exists():
                logger.info(f"VCF file {filename} does not exist. Skipping.")
            continue
        with open(filename) as rf:
            reader = vcfpy.Reader(rf)

            for r in reader:
                c = Counter()  # genotype -> count
                for x in r.samples:
                    gt, hq = x.data.GT, x.data.HQ
                    if "|" not in gt:
                        # The multi-allele branch indexes HQ, so HQ may also
                        # arrive as a one-element sequence here -- presumably
                        # parser dependent; accept both forms.
                        if isinstance(hq, (list, tuple)):
                            hq = hq[0]
                        c[gt] += hq
                    else:
                        for i, allele in enumerate(gt.split("|")):
                            c[allele] += hq[i]
                alleles = list(c)
                genotype = "|".join(str(k) for k in alleles)
                counts = ",".join(str(c[k]) for k in alleles)
                r.samples = [
                    vcfpy.Call(
                        r,
                        "SAMPLE",
                        vcfpy.OrderedDict([("GT", genotype), ("HQ", counts)]),
                    )
                ]
                snps_by_chrom[r.CHROM].append((r.POS, r))

    if reader is not None:
        # The last reader opened serves as the header template for the output.
        reader.samples = ["SAMPLE"]
        with open(output, "w") as out_f:
            vcf_writer = vcfpy.Writer(out_f, reader)
            for chrom in sorted(snps_by_chrom):
                records = snps_by_chrom[chrom]
                records.sort(key=lambda x: x[0])
                for _, rec in records:
                    vcf_writer.write_record(rec)
        print("Output written to:", output)
Esempio n. 4
0
def filter_vcf_file(args: argparse.Namespace) -> int:
    """Apply every registered filter to a VCF file and write the survivors.

    One filter instance is built from ``args`` for each class returned by
    ``get_filters()``; they are combined in a ``UnionFilter``, which is
    printed to stderr. Each record read from ``args.input_file`` is passed
    through the combined filter and, when the result is truthy, written to
    ``args.output_file``.

    Args:
        args: Parsed command line options; ``input_file`` and ``output_file``
            are read here and the whole object is handed to every filter.

    Returns:
        The number of records for which the filter returned a falsy value.
    """
    variant_filter = UnionFilter(
        [filter_class(args) for filter_class in get_filters()])
    print(variant_filter, file=sys.stderr)

    reader = vcfpy.Reader(args.input_file)
    writer = vcfpy.Writer(args.output_file, header=reader.header)
    masked_records = 0
    for record in reader:
        kept = variant_filter(record)
        if kept:
            # write the (possibly transformed) Record
            writer.write_record(kept)
        else:
            masked_records += 1
    return masked_records
    writer.writeheader()

    for d in dirs:
        size = 0
        for r in SeqIO.parse(d.joinpath("ccs.fasta").open(), "fasta"):
            size += 1

        rec = {"locus": d, "size": size}

        if d.joinpath(d, "phased.nopartial.NO_SNPS_FOUND").exists():
            rec["num_snp"] = 0
            rec["num_hap_nopartial"] = 0
            rec["num_hap_withpartial"] = 0
        else:
            rec["num_snp"] = len(
                [x for x in vcfpy.Reader(d.joinpath("phased.partial.vcf"))])
            if d.joinpath("phased.nopartial.NO_HAPS_FOUND").exists():
                rec["num_hap_nopartial"] = 0
                rec["num_hap_withpartial"] = 0
            else:
                file1 = d.joinpath(
                    "phased.nopartial.cleaned.human_readable.txt")
                file2 = d.joinpath("phased.partial.cleaned.human_readable.txt")
                with open(file1, "r") as h1, open(file2, "r") as h2:
                    h1.readline()  # skip header
                    h2.readline()  # skip header
                    rec["num_hap_nopartial"] = len(
                        [r for r in DictReader(h1, delimiter="\t")])
                    rec["num_hap_withpartial"] = len(
                        [r for r in DictReader(h2, delimiter="\t")])
        writer.writerow(rec)
Esempio n. 6
0
def eval_isophase(
    isophase_vcf,
    genome_snp,
    good_positions,
    cov_at_pos,
    repeat_by_chrom,
    shortread_cov,
    writer_f,
    name="NA",
    strand="NA",
):
    """Write one comparison row per SNP seen by IsoPhase and/or short reads.

    Every record of ``isophase_vcf`` yields a row marked ``in_PB = "Y"``; it
    is also marked ``in_Short = "Y"`` when its (chrom, pos) is listed in
    ``good_positions``, in which case that entry is removed from
    ``good_positions`` (the caller's container is modified). Whatever remains
    in ``good_positions`` afterwards is written as short-read-only rows.

    ``cov_at_pos`` is indexed with ``(chrom, pos - 1)``; ``genome_snp`` and
    ``shortread_cov`` are indexed as ``[chrom][pos]``.
    """

    def repeat_flag(chrom, pos):
        # "Y" when the position overlaps an interval stored for its chromosome.
        if chrom in repeat_by_chrom and len(
                repeat_by_chrom[chrom].find(pos, pos)) > 0:
            return "Y"
        return "N"

    def short_read_cov(chrom, pos):
        # A missing chromosome or position counts as zero coverage.
        try:
            return shortread_cov[chrom][pos]
        except KeyError:
            return 0

    for rec in vcfpy.Reader(isophase_vcf):
        row = {
            "dir": name,
            "pos": rec.POS,
            "strand": strand,
            "ref": rec.REF,
            "alt_Short": "NA",
            "in_Short": "N",
            "in_PB": "Y",
        }

        # keep only the part of the chromosome name before the first "|"
        rec.CHROM = rec.CHROM.split("|")[0]
        site = (rec.CHROM, rec.POS)
        row["chrom"] = rec.CHROM
        row["alt_PB"] = rec.ALT[0]
        row["genomic_HP"] = repeat_flag(rec.CHROM, rec.POS)
        row["cov_Short"] = short_read_cov(rec.CHROM, rec.POS)
        row["cov_PB"] = cov_at_pos[rec.CHROM, rec.POS - 1]

        if site in good_positions:
            row["alt_Short"] = genome_snp[rec.CHROM][rec.POS].ALT[0]
            row["in_Short"] = "Y"
            good_positions.remove(site)
        writer_f.writerow(row)

    # now we write out everything that is only in Shortread
    for chrom, pos in good_positions:
        row = {
            "dir": name,
            "chrom": chrom,
            "pos": pos,
            "strand": strand,
            "ref": genome_snp[chrom][pos].REF,
            "alt_Short": genome_snp[chrom][pos].ALT[0],
            "alt_PB": "NA",
            "in_Short": "Y",
            "in_PB": "N",
            "cov_PB": cov_at_pos[chrom, pos - 1],
            "genomic_HP": repeat_flag(chrom, pos),
        }
        row["cov_Short"] = short_read_cov(chrom, pos)
        writer_f.writerow(row)
Esempio n. 7
0
def main_maize(ki11_snps=None, dirs=None):
    """Evaluate IsoPhase SNP calls against the B73/Ki11 short read data.

    Reads the Ki11 SNP VCF (unless ``ki11_snps`` is given), the raw Ki11
    pileup coverage and the tandem repeat list from fixed file names in the
    current directory, then writes one comparison row per SNP to
    ``evaled.isophase_SNP.txt`` through ``eval_isophase``.

    Args:
        ki11_snps: Optional pre-loaded mapping chrom -> pos -> VCF record.
        dirs: Locus directories to evaluate; defaults to the result of
            ``glob.glob("by_loci/*size*/")``.

    Returns:
        The ``ki11_snps`` mapping, so that it can be reused by a later call.
    """
    if ki11_snps is None:
        ki11_snps = defaultdict(dict)  # chrom -> pos -> VCF record
        for r in vcfpy.Reader("B73Ki11.q20.vcf"):
            ki11_snps[r.CHROM][r.POS] = r

    logger.info("Finished reading B73Ki11.q20.vcf.")

    ki11_shortread_cov = defaultdict(dict)  # chrom -> pos -> short read cov
    # read the raw Ki11 pileup to get coverage in places where no SNPs were called
    for r in MPileUpReader("Ki11.raw.mpileup"):
        if r is not None:
            ki11_shortread_cov[r.chr][r.pos] = r.cov
    logger.info("Fnished reading Ki11.raw.mpileup.")

    repeat_by_chrom = {}
    # read the Tandem Repeat Finder summary
    with open("B73_RefV4.fa.repeat_list.txt") as repeat_f:
        for r in DictReader(repeat_f, delimiter="\t"):
            if r["chrom"] not in repeat_by_chrom:
                repeat_by_chrom[r["chrom"]] = IntervalTree()
            repeat_by_chrom[r["chrom"]].add(int(r["start0"]), int(r["end1"]))

    logger.info("Finished reading B73_RefV4.fa.repeat_list.txt.")

    FIELDS = [
        "dir",
        "chrom",
        "pos",
        "ref",
        "alt_Short",
        "alt_PB",
        "in_Short",
        "in_PB",
        "cov_Short",
        "cov_PB",
        "genomic_HP",
    ]
    with open("evaled.isophase_SNP.txt", "w") as out_f:
        # eval_isophase also emits a "strand" key that is not a column of this
        # report; without extrasaction="ignore" DictWriter raises ValueError
        # on the first row.
        writer_f = DictWriter(out_f,
                              FIELDS,
                              delimiter="\t",
                              extrasaction="ignore")
        writer_f.writeheader()

        if dirs is None:
            dirs = glob.glob("by_loci/*size*/")
        for d1 in dirs:
            mpileup = Path(d1, "ccs.mpileup")
            mapfile = Path(d1, "fake.mapping.txt")
            vcffile = Path(d1, "phased.partial.vcf")
            nosnp = Path(d1, "phased.partial.NO_SNPS_FOUND")
            if not vcffile.exists():
                assert nosnp.exists()
                logger.info(f"Skipping {d1} because no SNPs found.")
                continue

            logger.info(f"Evaluating {d1}.")
            # use lower min cov here becuz a few close cases where BQ
            # filtering lowered cov
            good_positions, cov_at_pos = get_positions_to_recover(
                mapfile, mpileup, ki11_snps, min_cov=30)
            name = d1.split("/")[1]
            eval_isophase(
                vcffile,
                ki11_snps,
                good_positions,
                cov_at_pos,
                repeat_by_chrom,
                ki11_shortread_cov,
                writer_f,
                name,
            )

    return ki11_snps
Esempio n. 8
0
    def write_haplotype_to_vcf(self, fake_genome_mapping_filename,
                               isoform_tally, output_prefix):
        """
        Write the haplotypes to <output_prefix>.vcf and a companion table to
        <output_prefix>.human_readable.txt.

        The following function must be called first:
        -- self.get_haplotype_vcf_assignment

        fake_genome_mapping_filename -- text file with one
            "fake_pos,ref_chr,ref_pos" entry per line
        isoform_tally -- dict of isoform name --> dict of haplotype index --> count
        output_prefix -- prefix of the two output files

        A scratch file named template.vcf is (re)written in the current
        directory to obtain the VCF header.

        Raises Exception if self.haplotype_vcf_index or self.alt_at_pos is None.
        """
        if self.haplotype_vcf_index is None or self.alt_at_pos is None:
            raise Exception(
                "Must call self.get_haplotype_vcf_assignment() first!")

        self.sanity_check()

        name_isoforms = sorted(isoform_tally.keys())

        # write a fake VCF example so we can read the headers in
        with open("template.vcf", "w") as f:
            f.write(__VCF_EXAMPLE__)
        # the template handle is only needed while the writer is being set up
        with open("template.vcf") as template_f:
            reader = vcfpy.Reader(template_f)
            reader.samples = name_isoforms
            f_vcf = vcfpy.Writer(f"{output_prefix}.vcf", reader)

        try:
            # human readable text:
            # first line: assoc VCF filename
            # second line: haplotype, list of sorted isoforms
            # third line onwards: haplotype and assoc count
            with open(f"{output_prefix}.human_readable.txt", "w") as f_human:
                f_human.write(f"Associated VCF file: {output_prefix}.vcf\n")
                f_human.write("haplotype\t{samples}\n".format(
                    samples="\t".join(name_isoforms)))
                for hap_index, hap_str in enumerate(self.haplotypes):
                    f_human.write(hap_str)
                    for _iso in name_isoforms:
                        if hap_index in isoform_tally[_iso]:
                            f_human.write(
                                f"\t{isoform_tally[_iso][hap_index]}")
                        else:
                            f_human.write("\t0")
                    f_human.write("\n")

            # read fake genome mapping file
            # 0-based position on fake --> (ref chr, 0-based ref position)
            fake_map = {}
            with open(fake_genome_mapping_filename) as f:
                for line in f:
                    fake_pos, ref_chr, ref_pos = line.strip().split(",")
                    fake_map[int(fake_pos)] = (ref_chr, int(ref_pos))

            # for each position, write out the ref and alt bases
            # then fill in for each isoform (aka "sample"):
            #  if this isoform only shows one allele, then it's just that allele (0 for ref, 1+ otherwise)
            #  if this isoform shows 2+ allele, then the first allele is indicated by self.haplotypes[0]
            for pos in self.hap_var_positions:
                ref_chr, ref_pos = fake_map[pos]
                total_count = sum(self.count_of_vars_by_pos[pos].values())
                alt_freq = [
                    f"{self.count_of_vars_by_pos[pos][b] * 1.0 / total_count:.2f}"
                    for b in self.alt_at_pos[pos]
                ]
                rec = vcfpy.Record(
                    CHROM=ref_chr,
                    POS=ref_pos + 1,  # VCF positions are written 1-based
                    ID=".",
                    REF=self.ref_at_pos[pos],
                    ALT=[vcfpy.Substitution(b) for b in self.alt_at_pos[pos]],
                    QUAL=".",
                    FILTER="PASS",
                    INFO={
                        "AF": alt_freq,
                        "DP": total_count
                    },
                    FORMAT="GT:HQ",
                    sample_indexes=None,
                )

                rec.samples = []
                for _iso in name_isoforms:
                    # isoform_tally[_iso] is a dict of haplotype index --> count
                    # the allele index at this pos is haplotype_vcf_index[hap_index][pos]
                    # we always need to show the phases in haplotype index order sorted
                    hap_indices = sorted(isoform_tally[_iso].keys())
                    genotype = "|".join(
                        str(self.haplotype_vcf_index[hap_index][pos])
                        for hap_index in hap_indices)
                    counts = ",".join(
                        str(isoform_tally[_iso][hap_index])
                        for hap_index in hap_indices)
                    rec.samples.append(
                        vcfpy.Call(
                            rec, _iso,
                            vcfpy.OrderedDict([("GT", genotype),
                                               ("HQ", counts)])))
                f_vcf.write_record(rec)
        finally:
            # close the VCF even when writing one of the outputs fails
            f_vcf.close()
Esempio n. 9
0
def write_snp_to_vcf(
    snp_filename: Path,
    vcf_filename: Path,
    genome_filename: Path,
    genome_d: LazyFastaReader = None,
) -> None:
    """Convert the records of a SNP file into a single-sample VCF.

    Consecutive records are grouped: records sharing the reference position
    of the previous record form one multi-nucleotide insertion, and
    consecutive records whose query base is "." form one multi-nucleotide
    deletion. Every group becomes one VCF record with genotype ``0|1``.

    A scratch file named ``template.vcf`` is (re)written in the current
    directory to obtain the VCF header.

    Args:
        snp_filename: Input read with ``SNPReader``.
        vcf_filename: VCF output handed to ``vcfpy.Writer``.
        genome_filename: Reference fasta, only opened when ``genome_d`` is None.
        genome_d: Optional already-open reference, indexed by sequence name.

    Raises:
        StopIteration: If ``snp_filename`` yields no record at all.
    """
    # read the genome if genome_d is not given
    if genome_d is None:
        genome_d = LazyFastaReader(genome_filename)

    # read the first SNP record so we know the query name
    snp_reader = SNPReader(snp_filename)
    snp_rec = next(snp_reader)
    sample_name = snp_rec.query_name
    cur_recs = [snp_rec]
    genome_rec = genome_d[snp_rec.ref_name]

    def build_record(group, chrom_seq):
        # Turn one group of SNP records (all on chrom_seq) into a VCF record.
        first = group[0]
        if (len(group) == 1 and first.ref_base != "."
                and first.query_base != "."):  # just a SNP record
            pos = first.ref_pos
            ref_base = first.ref_base
            alt_base = first.query_base
        elif first.ref_base == ".":
            # is a single or multi-nt insertions, must retrieve ref base from genome
            # ex: in out.snps_files it is . --> ATG
            # in VCF it should be T --> TATG (meaning insertion of ATG)
            pos = first.ref_pos
            ref_base = chrom_seq[first.ref_pos]
            alt_base = ref_base + "".join(r.query_base for r in group)
        else:
            # is a single multi-nt deletions, we need to get one more ref base before the first deletion
            # ex: in out.snps_files it is GGG --> deletion
            # in VCF it should be TGGG --> T (meaning deletion of GGG)
            pos = first.ref_pos - 1
            ref_base_prev = chrom_seq[pos]
            ref_base = ref_base_prev + "".join(r.ref_base for r in group)
            alt_base = ref_base_prev

        rec = vcfpy.Record(
            # use the group's own reference name, not the one of the very
            # first record of the file
            CHROM=first.ref_name,
            POS=pos + 1,
            ID=".",
            REF=ref_base,
            ALT=[vcfpy.Substitution(alt_base)],
            QUAL=".",
            FILTER="PASS",
            INFO={"AF": 0.5},
            FORMAT="GT",
            sample_indexes=None,
        )
        rec.samples.append(
            vcfpy.Call(rec, sample_name, vcfpy.OrderedDict([("GT", "0|1")])))
        return rec

    with open("template.vcf", "w+") as f:
        f.write(f"{__VCF_EXAMPLE__}\n")
        f.seek(0)  # rewind, otherwise the reader starts at end-of-file
        reader = vcfpy.Reader(f)
        reader.samples = [sample_name]
        f_vcf = vcfpy.Writer(vcf_filename, reader)

        try:
            for r1 in snp_reader:
                if r1.ref_pos == cur_recs[-1].ref_pos:
                    # multi-nt insertion, keep recording
                    cur_recs.append(r1)
                elif r1.query_base == "." and cur_recs[-1].query_base == ".":
                    # multi-nt deletion, keep recording
                    # NOTE(review): adjacency of the deleted positions is not
                    # checked here -- confirm the input guarantees it.
                    cur_recs.append(r1)
                else:
                    # time to write out the current set of records
                    f_vcf.write_record(build_record(cur_recs, genome_rec))
                    if r1.ref_name != cur_recs[0].ref_name:
                        genome_rec = genome_d[r1.ref_name]
                    cur_recs = [r1]

            # the group still being accumulated when the input ends
            f_vcf.write_record(build_record(cur_recs, genome_rec))
        finally:
            f_vcf.close()