def eval_isophase_phaseswitch(
    isophase_vcf: str,
    config_file: Path,
    out_f: TextIOWrapper,
    name: str = "NA",
) -> None:
    """
    Count genotype changes between records of an IsoPhase VCF and write one
    tab-delimited summary row to ``out_f``.

    The row is: name, chr, start, end, strand (the last four come from
    ``read_config(config_file)``), number of samples in the last record read,
    and the number of switches.

    A switch is counted for a sample whenever its GT contains ``|``, its two
    alleles differ, and the GT string differs from the GT last stored for
    that sample. The stored GT is seeded from the first record for every
    sample, whatever that GT is.

    If the VCF has no records, a row with 0 samples and 0 switches is written
    instead of letting ``StopIteration`` escape.

    :param isophase_vcf: VCF to read, handed to ``vcfpy.Reader``
    :param config_file: config file handed to ``read_config``
    :param out_f: writable text handle that receives the summary row
    :param name: label written in the first column
    """
    _chr, _start, _end, _strand = read_config(config_file)
    # NOTE(review): vcfpy documents Reader.from_path() for filenames and
    # dict-style access for call data -- confirm this constructor call and
    # the `c.data.GT` attribute access against the installed vcfpy version.
    reader = vcfpy.Reader(isophase_vcf)

    first = next(reader, None)
    if first is None:
        # empty VCF: nothing to compare, report zero samples / zero switches
        out_f.write(f"{name}\t{_chr}\t{_start}\t{_end}\t{_strand}\t0\t0\n")
        return

    # sample -> GT string (ex: '0|1') most recently recorded for that sample
    prev = {c.sample: c.data.GT for c in first.samples}

    num_switch = 0
    last = first
    for r in reader:
        last = r
        for c in r.samples:
            gt = c.data.GT
            if "|" not in gt:
                continue  # ignore those with just one allele
            a, b = gt.split("|")
            if a == b:
                continue  # for now, ignore IsoPhase results that only use one allele
            if prev[c.sample] != gt:
                num_switch += 1
            prev[c.sample] = gt

    out_f.write(
        f"{name}\t{_chr}\t{_start}\t{_end}\t{_strand}\t{len(last.samples)}\t{num_switch}\n"
    )
def brangus(vcf_filename, out_filename, unzip_snps=None):
    """
    Evaluate every ``by_loci/*size*/`` directory against a genomic SNP set and
    write the per-position results as a tab-delimited table.

    :param vcf_filename: VCF read into ``unzip_snps`` when that is not supplied
    :param out_filename: path of the tab-delimited output table
    :param unzip_snps: optional pre-loaded mapping chrom -> pos -> VCF record;
        when given, ``vcf_filename`` is not read
    """
    if unzip_snps is None:
        unzip_snps = defaultdict(dict)  # chrom -> pos -> VCF record
        for r in vcfpy.Reader(vcf_filename):
            unzip_snps[r.CHROM][r.POS] = r
    logger.info(f"Finished reading {vcf_filename}")

    with open(out_filename, "w") as out_f:
        FIELDS = [
            "dir",
            "chrom",
            "pos",
            "strand",
            "ref",
            "alt_Short",
            "alt_PB",
            "in_Short",
            "in_PB",
            "cov_Short",
            "cov_PB",
            "genomic_HP",
        ]
        writer = DictWriter(out_f, FIELDS, delimiter="\t")
        writer.writeheader()

        for d1 in glob.glob("by_loci/*size*/"):
            mpileup = Path(d1, "ccs.mpileup")
            mapfile = Path(d1, "fake.mapping.txt")
            vcffile = Path(d1, "phased.partial.vcf")
            config = Path(d1, "config")
            nosnp = Path(d1, "phased.partial.NO_SNPS_FOUND")

            if not vcffile.exists():
                if nosnp.exists():
                    logger.info(f"Skipping {d1} because no SNPs found.")
                else:
                    # neither the VCF nor the "no SNPs" marker is present
                    logger.error(
                        f"Skipping {d1} because neither {vcffile.name} "
                        f"nor {nosnp.name} exists.")
                continue

            logger.info(f"Evaluating {d1}.")
            strand = "NA"
            if config.exists():
                # find the strand this gene family is on; the last
                # "ref_strand=" line wins
                with open(config) as config_handle:
                    for line in config_handle:
                        if line.startswith("ref_strand="):
                            strand = line.strip().split("=")[1]

            good_positions, cov_at_pos = get_positions_to_recover(
                mapfile, mpileup, unzip_snps, min_cov=30)
            name = d1.split("/")[1]
            # no repeat annotation and no short-read coverage for this data set
            eval_isophase(
                vcffile,
                unzip_snps,
                good_positions,
                cov_at_pos,
                {},
                {},
                writer,
                name,
                strand,
            )
def collect_all_vcf(
    dirs: str,
    vcf_filename: str = "phased.partial.vcf",
    output: str = "IsoSeq_IsoPhase.vcf",
) -> None:
    """
    Merge the per-directory VCFs into one VCF with a single sample "SAMPLE".

    For every record, the HQ counts of all samples are summed per allele; the
    record's samples are then replaced by one call whose GT is the observed
    alleles joined by ``|`` and whose HQ is the matching summed counts.
    Records are written grouped by chromosome (sorted) and ordered by
    position within each chromosome.

    Nothing is written if no input VCF was found.

    :param dirs: iterable of directories, each expected to hold
        ``vcf_filename`` (NOTE(review): annotated ``str`` but iterated as a
        collection of directories -- confirm the intended type)
    :param vcf_filename: name of the VCF inside each directory
    :param output: path of the merged VCF
    """
    # marker file written next to the VCF when a locus legitimately has no SNPs
    no_snp_found_filename = f"{Path(vcf_filename).stem}.NO_SNPS_FOUND"
    snps_by_chrom = defaultdict(list)  # chrom -> [(pos, record), ...]

    reader = None

    for d in dirs:
        filename = Path(d, vcf_filename)
        if not filename.exists():
            # only worth a message when the "no SNPs" marker is missing as well
            if not Path(d, no_snp_found_filename).exists():
                logger.info(f"VCF file {filename} does not exist. Skipping.")
            continue
        with open(filename) as rf:
            reader = vcfpy.Reader(rf)

            for r in reader:
                c = Counter()  # genotype -> count
                for x in r.samples:
                    if x.data.GT.count("|") == 0:
                        c[x.data.GT] += x.data.HQ
                    else:
                        for i, gt in enumerate(x.data.GT.split("|")):
                            c[gt] += x.data.HQ[i]
                alleles = list(c)
                genotype = "|".join(str(k) for k in alleles)
                counts = ",".join(str(c[k]) for k in alleles)
                r.samples = [
                    vcfpy.Call(
                        r,
                        "SAMPLE",
                        vcfpy.OrderedDict([("GT", genotype), ("HQ", counts)]),
                    )
                ]
                snps_by_chrom[r.CHROM].append((r.POS, r))

    if reader is not None:
        # the last reader opened serves as the header template
        # NOTE(review): vcfpy.Writer is documented as taking a header object;
        # confirm that passing the reader works with the installed version.
        reader.samples = ["SAMPLE"]
        with open(output, "w") as f:
            vcf_writer = vcfpy.Writer(f, reader)
            for k in sorted(snps_by_chrom):
                v = snps_by_chrom[k]
                v.sort(key=lambda x: x[0])
                for _, rec in v:
                    vcf_writer.write_record(rec)
        print("Output written to:", output)
def filter_vcf_file(args: argparse.ArgumentParser):
    """
    Run every registered filter over a VCF and write the surviving records.

    One filter is instantiated from ``args`` for each class returned by
    ``get_filters()``; they are combined in a ``UnionFilter``, which is
    printed to stderr. Each record read from ``args.input_file`` is passed
    through the combined filter; a truthy result is written to
    ``args.output_file``, anything else counts as masked.

    :param args: parsed command-line arguments providing ``input_file`` and
        ``output_file`` plus whatever the filter classes read
    :return: number of records that were masked (not written)
    """
    combined_filter = UnionFilter(
        [make_filter(args) for make_filter in get_filters()])
    print(combined_filter, file=sys.stderr)

    vcf_in = vcfpy.Reader(args.input_file)
    vcf_out = vcfpy.Writer(args.output_file, header=vcf_in.header)

    n_masked = 0
    for incoming in vcf_in:
        kept = combined_filter(incoming)
        if not kept:
            n_masked += 1
            continue
        # the filter may hand back a transformed record; write what it returned
        vcf_out.write_record(kept)

    return n_masked
# Write one summary row per locus directory: number of CCS reads, number of
# SNPs, and number of haplotypes with and without partial reads.
writer.writeheader()
for d in dirs:
    # size = number of sequences in ccs.fasta; close the handle once counted
    with d.joinpath("ccs.fasta").open() as fasta_handle:
        size = sum(1 for _ in SeqIO.parse(fasta_handle, "fasta"))

    rec = {"locus": d, "size": size}

    # joinpath(d, ...) would double a relative directory (d/d/...), so the
    # marker is resolved directly beneath d
    if d.joinpath("phased.nopartial.NO_SNPS_FOUND").exists():
        rec["num_snp"] = 0
        rec["num_hap_nopartial"] = 0
        rec["num_hap_withpartial"] = 0
    else:
        rec["num_snp"] = sum(
            1 for _ in vcfpy.Reader(d.joinpath("phased.partial.vcf")))
        if d.joinpath("phased.nopartial.NO_HAPS_FOUND").exists():
            rec["num_hap_nopartial"] = 0
            rec["num_hap_withpartial"] = 0
        else:
            file1 = d.joinpath("phased.nopartial.cleaned.human_readable.txt")
            file2 = d.joinpath("phased.partial.cleaned.human_readable.txt")
            with open(file1, "r") as h1, open(file2, "r") as h2:
                # drop the first line so DictReader takes the second line
                # as its column header
                h1.readline()
                h2.readline()
                rec["num_hap_nopartial"] = sum(
                    1 for _ in DictReader(h1, delimiter="\t"))
                rec["num_hap_withpartial"] = sum(
                    1 for _ in DictReader(h2, delimiter="\t"))
    writer.writerow(rec)
def eval_isophase(
    isophase_vcf,
    genome_snp,
    good_positions,
    cov_at_pos,
    repeat_by_chrom,
    shortread_cov,
    writer_f,
    name="NA",
    strand="NA",
):
    """
    Compare the SNPs in an IsoPhase VCF with a set of expected positions and
    write one row per position to ``writer_f``.

    Every VCF record yields a row with ``in_PB`` = "Y"; ``in_Short`` is "Y"
    when (chrom, pos) is in ``good_positions``, and that entry is then removed
    from ``good_positions`` (the caller's collection is modified). Whatever
    remains in ``good_positions`` afterwards is written with ``in_Short`` =
    "Y" and ``in_PB`` = "N".

    :param isophase_vcf: VCF handed to ``vcfpy.Reader``
    :param genome_snp: chrom -> pos -> record providing REF and ALT
    :param good_positions: collection of (chrom, pos) expected to be found
    :param cov_at_pos: mapping keyed by (chrom, pos - 1) giving ``cov_PB``
    :param repeat_by_chrom: chrom -> object with ``find(start, end)``
    :param shortread_cov: chrom -> pos -> coverage; missing entries count as 0
    :param writer_f: row writer exposing ``writerow(dict)``
    :param name: value of the "dir" column
    :param strand: value of the "strand" column
    """

    def repeat_flag(chrom, pos):
        # "Y" when the position hits at least one interval stored for chrom
        hit = (chrom in repeat_by_chrom
               and len(repeat_by_chrom[chrom].find(pos, pos)) > 0)
        return "Y" if hit else "N"

    def short_read_cov(chrom, pos):
        # positions without a coverage entry are reported as 0
        try:
            return shortread_cov[chrom][pos]
        except KeyError:
            return 0

    for r in vcfpy.Reader(isophase_vcf):
        # keep only the part of CHROM before the first "|"
        r.CHROM = r.CHROM.split("|")[0]
        chrom, pos = r.CHROM, r.POS
        row = {
            "dir": name,
            "chrom": chrom,
            "pos": pos,
            "strand": strand,
            "ref": r.REF,
            "alt_PB": r.ALT[0],
            "genomic_HP": repeat_flag(chrom, pos),
            "cov_Short": short_read_cov(chrom, pos),
            "cov_PB": cov_at_pos[chrom, pos - 1],
            "in_PB": "Y",
        }

        key = (chrom, pos)
        if key in good_positions:
            row["alt_Short"] = genome_snp[chrom][pos].ALT[0]
            row["in_Short"] = "Y"
            good_positions.remove(key)
        else:
            row["alt_Short"] = "NA"
            row["in_Short"] = "N"

        writer_f.writerow(row)

    # now we write out everything that is only in Shortread
    for chrom, pos in good_positions:
        known = genome_snp[chrom][pos]
        row = {
            "dir": name,
            "chrom": chrom,
            "pos": pos,
            "strand": strand,
            "ref": known.REF,
            "alt_Short": known.ALT[0],
            "alt_PB": "NA",
            "in_Short": "Y",
            "in_PB": "N",
            "cov_PB": cov_at_pos[chrom, pos - 1],
            "genomic_HP": repeat_flag(chrom, pos),
        }
        row["cov_Short"] = short_read_cov(chrom, pos)
        writer_f.writerow(row)
def main_maize(ki11_snps=None, dirs=None):
    """
    Evaluate IsoPhase SNP calls for every locus directory against the Ki11
    short-read SNPs and write ``evaled.isophase_SNP.txt``.

    Reads ``B73Ki11.q20.vcf`` (unless ``ki11_snps`` is supplied),
    ``Ki11.raw.mpileup`` and ``B73_RefV4.fa.repeat_list.txt`` from the current
    directory.

    :param ki11_snps: optional pre-loaded mapping chrom -> pos -> VCF record
    :param dirs: locus directories to evaluate; defaults to
        ``glob("by_loci/*size*/")``
    :return: the ``ki11_snps`` mapping, so a caller can reuse it
    :raises AssertionError: if a directory has neither the phased VCF nor the
        NO_SNPS_FOUND marker
    """
    if ki11_snps is None:
        ki11_snps = defaultdict(dict)  # chrom -> pos -> VCF record
        for r in vcfpy.Reader("B73Ki11.q20.vcf"):
            ki11_snps[r.CHROM][r.POS] = r
    logger.info("Finished reading B73Ki11.q20.vcf.")

    ki11_shortread_cov = defaultdict(dict)  # chrom -> pos -> short read cov
    # read the raw Ki11 pileup to get coverage in places where no SNPs were called
    for r in MPileUpReader("Ki11.raw.mpileup"):
        if r is not None:
            ki11_shortread_cov[r.chr][r.pos] = r.cov
    logger.info("Fnished reading Ki11.raw.mpileup.")

    repeat_by_chrom = {}  # chrom -> IntervalTree of repeat intervals
    # read the Tandem Repeat Finder summary
    with open("B73_RefV4.fa.repeat_list.txt") as repeat_handle:
        for r in DictReader(repeat_handle, delimiter="\t"):
            if r["chrom"] not in repeat_by_chrom:
                repeat_by_chrom[r["chrom"]] = IntervalTree()
            repeat_by_chrom[r["chrom"]].add(int(r["start0"]), int(r["end1"]))
    logger.info("Finished reading B73_RefV4.fa.repeat_list.txt.")

    # eval_isophase always emits a "strand" key; DictWriter raises ValueError
    # on keys missing from its field list, so "strand" has to be listed here
    FIELDS = [
        "dir",
        "chrom",
        "pos",
        "strand",
        "ref",
        "alt_Short",
        "alt_PB",
        "in_Short",
        "in_PB",
        "cov_Short",
        "cov_PB",
        "genomic_HP",
    ]

    with open("evaled.isophase_SNP.txt", "w") as out_f:
        writer_f = DictWriter(out_f, FIELDS, delimiter="\t")
        writer_f.writeheader()

        if dirs is None:
            dirs = glob.glob("by_loci/*size*/")

        for d1 in dirs:
            mpileup = Path(d1, "ccs.mpileup")
            mapfile = Path(d1, "fake.mapping.txt")
            vcffile = Path(d1, "phased.partial.vcf")
            nosnp = Path(d1, "phased.partial.NO_SNPS_FOUND")

            if not vcffile.exists():
                assert nosnp.exists(), f"{d1} has neither {vcffile.name} nor {nosnp.name}"
                logger.info(f"Skipping {d1} because no SNPs found.")
                continue

            logger.info(f"Evaluating {d1}.")
            # use lower min cov here becuz a few close cases where BQ
            # filtering lowered cov
            good_positions, cov_at_pos = get_positions_to_recover(
                mapfile, mpileup, ki11_snps, min_cov=30)
            name = d1.split("/")[1]
            eval_isophase(
                vcffile,
                ki11_snps,
                good_positions,
                cov_at_pos,
                repeat_by_chrom,
                ki11_shortread_cov,
                writer_f,
                name,
            )

    return ki11_snps
def write_haplotype_to_vcf(self, fake_genome_mapping_filename, isoform_tally, output_prefix):
    """
    Write the haplotypes to ``<output_prefix>.vcf`` and a tally table to
    ``<output_prefix>.human_readable.txt``.

    The following function must be called first:
      -- self.get_haplotype_vcf_assignment

    A scratch file ``template.vcf`` is written to the current directory so a
    VCF header can be read back in.

    :param fake_genome_mapping_filename: comma-separated file with lines of
        ``fake_pos,ref_chr,ref_pos``
    :param isoform_tally: isoform name -> {haplotype index -> count}
    :param output_prefix: prefix of the two output files
    :raises Exception: if ``get_haplotype_vcf_assignment`` has not been run
    """
    if self.haplotype_vcf_index is None or self.alt_at_pos is None:
        raise Exception(
            "Must call self.get_haplotype_vcf_assignment() first!")

    self.sanity_check()

    name_isoforms = sorted(isoform_tally.keys())

    # write a fake VCF example so we can read the headers in
    with open("template.vcf", "w") as f:
        f.write(__VCF_EXAMPLE__)
    # keep the template handle open only until the writer has been created
    with open("template.vcf") as template_handle:
        reader = vcfpy.Reader(template_handle)
        reader.samples = name_isoforms
        # NOTE(review): vcfpy.Writer is documented as (stream, header) with
        # from_path() for filenames -- confirm this call for the installed
        # vcfpy version.
        f_vcf = vcfpy.Writer(f"{output_prefix}.vcf", reader)

    try:
        # human readable text:
        # first line: assoc VCF filename
        # second line: haplotype, list of sorted isoforms
        # third line onwards: haplotype and assoc count
        with open(f"{output_prefix}.human_readable.txt", "w") as f_human:
            f_human.write(f"Associated VCF file: {output_prefix}.vcf\n")
            f_human.write("haplotype\t{samples}\n".format(
                samples="\t".join(name_isoforms)))
            for hap_index, hap_str in enumerate(self.haplotypes):
                f_human.write(hap_str)
                for _iso in name_isoforms:
                    if hap_index in isoform_tally[_iso]:
                        f_human.write(f"\t{isoform_tally[_iso][hap_index]}")
                    else:
                        f_human.write("\t0")
                f_human.write("\n")

        # read fake genome mapping file
        # 0-based position on fake --> (ref chromosome, 0-based ref position)
        fake_map = {}
        with open(fake_genome_mapping_filename) as f:
            for line in f:
                fake_pos, ref_chr, ref_pos = line.strip().split(",")
                fake_map[int(fake_pos)] = (ref_chr, int(ref_pos))

        # The haplotype order and the HQ string of an isoform are the same at
        # every position, so work them out once. Phases are always shown in
        # sorted haplotype-index order.
        haps_by_iso = {
            _iso: sorted(isoform_tally[_iso].keys()) for _iso in name_isoforms
        }
        counts_by_iso = {
            _iso: ",".join(str(isoform_tally[_iso][h]) for h in haps_by_iso[_iso])
            for _iso in name_isoforms
        }

        # for each position, write out the ref and alt bases
        # then fill in for each isoform (aka "sample"):
        #  if this isoform only shows one allele, then it's just that allele
        #  (0 for ref, 1+ otherwise)
        #  if this isoform shows 2+ allele, then the first allele is indicated
        #  by self.haplotypes[0]
        for pos in self.hap_var_positions:
            ref_chr, ref_pos = fake_map[pos]
            total_count = sum(self.count_of_vars_by_pos[pos].values())
            alt_freq = [
                f"{self.count_of_vars_by_pos[pos][b] * 1.0 / total_count:.2f}"
                for b in self.alt_at_pos[pos]
            ]
            rec = vcfpy.Record(
                CHROM=ref_chr,
                POS=ref_pos + 1,  # mapping file is 0-based, VCF POS is 1-based
                ID=".",
                REF=self.ref_at_pos[pos],
                ALT=[vcfpy.Substitution(b) for b in self.alt_at_pos[pos]],
                QUAL=".",
                FILTER="PASS",
                INFO={
                    "AF": alt_freq,
                    "DP": total_count
                },
                FORMAT="GT:HQ",
                sample_indexes=None,
            )

            rec.samples = []
            for _iso in name_isoforms:
                # isoform_tally[_iso] is a dict of haplotype index --> count;
                # haplotype_vcf_index[hap_index][pos] is the allele index that
                # haplotype carries at this position
                genotype = "|".join(
                    str(self.haplotype_vcf_index[hap_index][pos])
                    for hap_index in haps_by_iso[_iso])
                rec.samples.append(
                    vcfpy.Call(
                        rec, _iso,
                        vcfpy.OrderedDict([("GT", genotype),
                                           ("HQ", counts_by_iso[_iso])])))
            f_vcf.write_record(rec)
    finally:
        # close the VCF even if writing fails part-way through
        f_vcf.close()
def write_snp_to_vcf(
    snp_filename: Path,
    vcf_filename: Path,
    genome_filename: Path,
    genome_d: LazyFastaReader = None,
) -> None:
    """
    Convert the records of a SNP file into VCF records written to
    ``vcf_filename``.

    Consecutive SNP records are grouped: records sharing a ``ref_pos`` form a
    multi-nt insertion, and consecutive records whose ``query_base`` is "."
    form a multi-nt deletion. Each group becomes one VCF record with
    genotype ``0|1``.

    A scratch file ``template.vcf`` is written to the current directory so a
    VCF header can be read back in.

    :param snp_filename: SNP file handed to ``SNPReader``
    :param vcf_filename: output VCF
    :param genome_filename: reference FASTA, read only when ``genome_d`` is None
    :param genome_d: optional pre-loaded reference, indexable by sequence name
    :raises StopIteration: if the SNP file contains no records
    """
    # read the genome if genome_d is not given
    if genome_d is None:
        genome_d = LazyFastaReader(genome_filename)

    # read the first SNP record so we know the query name
    snp_reader = SNPReader(snp_filename)
    snp_rec = next(snp_reader)
    sample_name = snp_rec.query_name
    cur_recs = [snp_rec]
    genome_rec = genome_d[snp_rec.ref_name]

    with open("template.vcf", "w+") as f:
        f.write(f"{__VCF_EXAMPLE__}\n")
        # rewind: after the write the handle sits at EOF and the reader
        # would find no header to parse
        f.seek(0)
        reader = vcfpy.Reader(f)
        reader.samples = [sample_name]
        # NOTE(review): confirm the Writer/Record/Call signatures used below
        # against the installed vcfpy version.
        f_vcf = vcfpy.Writer(vcf_filename, reader)

    def _write_group(group, ref_seq):
        """Turn one group of SNP records into a single VCF record and write it."""
        first = group[0]
        # multiple records mean it could be:
        # 1. multi-nucleotide insertions
        # 2. multi-nucleotide deletions
        if len(group) == 1 and first.ref_base != "." and first.query_base != ".":
            # just a SNP record
            pos = first.ref_pos
            ref_base = first.ref_base
            alt_base = first.query_base
        elif first.ref_base == ".":
            # is a single or multi-nt insertion, must retrieve ref base from genome
            # ex: in out.snps_files it is . --> ATG
            #     in VCF it should be T --> TATG (meaning insertion of ATG)
            pos = first.ref_pos
            ref_base = ref_seq[first.ref_pos]
            alt_base = ref_base + "".join(r.query_base for r in group)
        else:
            # is a single or multi-nt deletion, we need to get one more ref
            # base before the first deletion
            # ex: in out.snps_files it is GGG --> deletion
            #     in VCF it should be TGGG --> T (meaning deletion of GGG)
            pos = first.ref_pos - 1
            ref_base_prev = ref_seq[pos]
            ref_base = ref_base_prev + "".join(r.ref_base for r in group)
            alt_base = ref_base_prev

        rec = vcfpy.Record(
            # use the group's own reference name, not the first record of the file
            CHROM=first.ref_name,
            POS=pos + 1,
            ID=".",
            REF=ref_base,
            ALT=[vcfpy.Substitution(alt_base)],
            QUAL=".",
            FILTER="PASS",
            INFO={"AF": 0.5},
            FORMAT="GT",
            sample_indexes=None,
        )
        rec.samples.append(
            vcfpy.Call(rec, sample_name, vcfpy.OrderedDict([("GT", "0|1")])))
        f_vcf.write_record(rec)

    try:
        for r1 in snp_reader:
            last = cur_recs[-1]
            if r1.ref_pos == last.ref_pos:
                # multi-nt insertion, keep recording
                cur_recs.append(r1)
            elif r1.query_base == "." and last.query_base == ".":
                # multi-nt deletion, keep recording
                cur_recs.append(r1)
            else:
                # time to write out the current set of records
                _write_group(cur_recs, genome_rec)
                if r1.ref_name != cur_recs[0].ref_name:
                    genome_rec = genome_d[r1.ref_name]
                cur_recs = [r1]
        # the group still pending when the input ends must be written too,
        # otherwise the last variant of the file is silently dropped
        _write_group(cur_recs, genome_rec)
    finally:
        f_vcf.close()