def varpileup_line2vcf_line(pileupfields):
    """
    Converts Variant Pileup format to VCF format

    Builds one tab-separated VCF data line from the fields of a single
    variant pileup record.

    Fields read from ``pileupfields`` (0-based index):
        0 - chromosome
        1 - position
        2 - reference base
        3 - called base; when it is one of the ``HETERO`` keys the genotype
            is reported as '0/1' and the symbol is resolved through
            hetero2homo(), otherwise the genotype is '1/1'
        4 - Phred-scaled likelihood that the genotype is wrong, which is
            also called `consensus quality'; written as GQ
        5 - Phred-scaled likelihood that the genotype is identical to the
            reference, which is also called `SNP quality'; not written to
            the output line
        6 - root mean square (RMS) mapping quality; written as the sixth
            output column
        7 - depth; written as DP
        8 - handed to count_alt() together with the depth; the result is
            written as AD

    Suppose the reference base is A and in alignment we see 17 G and 3 A.
    We will get a low consensus quality because it is difficult to
    distinguish an A/G heterozygote from a G/G homozygote. We will get a
    high SNP quality, though, because the evidence of a SNP is very strong.

    Returns:
        The VCF line as a string, without a trailing newline. The ID and
        INFO columns are '.', FILTER is 'PASS' and FORMAT is 'GT:GQ:DP:AD'.
    """
    chrom = str(pileupfields[0])
    pos = str(pileupfields[1])
    ref = str(pileupfields[2])
    alt = str(pileupfields[3])
    consqual = str(pileupfields[4])
    mapqual = str(pileupfields[6])
    depth = str(pileupfields[7])
    alt_count = str(count_alt(depth, pileupfields[8]))

    genotype = '1/1'
    if fu.isOnTheList(HETERO.keys(), alt):
        genotype = '0/1'
        alt = hetero2homo(ref, alt)

    sample = ':'.join([genotype, consqual, depth, alt_count])
    return '\t'.join([
        chrom, pos, '.', ref, alt, mapqual, 'PASS', '.', 'GT:GQ:DP:AD', sample,
    ])
def varpileup_line2vcf_line(pileupfields):
    """
    Converts Variant Pileup format to VCF format

    Takes the fields of one variant pileup record and returns the matching
    VCF data line (tab-separated, no trailing newline) with the fixed
    columns ID='.', FILTER='PASS', INFO='.' and FORMAT='GT:GQ:DP:AD'.

    Meaning of the pileup fields used here (0-based index):
        4 - Phred-scaled likelihood that the genotype is wrong, which is
            also called `consensus quality'.
        5 - Phred-scaled likelihood that the genotype is identical to the
            reference, which is also called `SNP quality'.
            Suppose the reference base is A and in alignment we see 17 G
            and 3 A. We will get a low consensus quality because it is
            difficult to distinguish an A/G heterozygote from a G/G
            homozygote. We will get a high SNP quality, though, because
            the evidence of a SNP is very strong.
        6 - root mean square (RMS) mapping quality
    """
    chrom = str(pileupfields[0])
    pos = str(pileupfields[1])
    ref = str(pileupfields[2])
    alt = str(pileupfields[3])
    consqual = str(pileupfields[4])   # reported as GQ
    # Field 5 (SNP quality) is not part of the generated line.
    mapqual = str(pileupfields[6])    # reported in the sixth output column
    depth = str(pileupfields[7])      # reported as DP
    alt_count = str(count_alt(depth, pileupfields[8]))  # reported as AD

    # A symbol listed in HETERO marks a heterozygous call: report 0/1 and
    # replace the symbol with the base chosen by hetero2homo().
    genotype = '1/1'
    if fu.isOnTheList(HETERO.keys(), alt):
        genotype = '0/1'
        alt = hetero2homo(ref, alt)

    sample = f"{genotype}:{consqual}:{depth}:{alt_count}"
    columns = (chrom, pos, '.', ref, alt, mapqual, 'PASS', '.',
               'GT:GQ:DP:AD', sample)
    return '\t'.join(columns)
def hetero2homo(ref, alt):
    """
    Converts heterozygous symbols from Samtools pileup to A, G, T, C

    Args:
        ref: reference base; compared through its string form.
        alt: called symbol.

    Returns:
        ``alt`` unchanged when it is not one of the ``HETERO`` keys.
        Otherwise the string form of ``HETERO[alt]`` is inspected: its
        second character is returned when ``ref`` equals its first
        character, else its first character is returned.
    """
    if not fu.isOnTheList(HETERO.keys(), alt):
        return alt

    pair = str(HETERO[alt])
    # NOTE(review): when ref equals neither character of the pair, only the
    # first character is reported -- confirm this is intended.
    return pair[1] if str(ref) == pair[0] else pair[0]
def hetero2homo(ref, alt):
    """
    Converts heterozygous symbols from Samtools pileup to A, G, T, C

    A symbol that is not a key of ``HETERO`` is returned as is. For a key,
    the string form of the mapped value is examined and the character that
    differs from the reference is picked: the second character if ``ref``
    matches the first one, the first character in every other case.

    Args:
        ref: reference base (converted with str() before comparing).
        alt: called symbol.

    Returns:
        Either ``alt`` itself or a single character taken from
        ``HETERO[alt]``.
    """
    is_hetero_symbol = fu.isOnTheList(HETERO.keys(), alt)
    if not is_hetero_symbol:
        return alt

    bases = str(HETERO[alt])
    if str(ref) == bases[0]:
        return bases[1]
    return bases[0]
def varpileup_line2vcf_line(pileupfields):
    """
    Converts Variant Pileup format to VCF format

    Args:
        pileupfields: indexable record of one variant pileup line. Indices
            0-4 and 6-8 are read: chromosome, position, reference base,
            called base, consensus quality, mapping quality, depth, and a
            value passed to count_alt() together with the depth. Index 5
            is not used.

    Returns:
        One tab-separated VCF data line without a trailing newline:
        chromosome, position, '.', reference, alternate, mapping quality,
        'PASS', '.', 'GT:GQ:DP:AD' and the sample column
        ``GT:consensus quality:depth:count_alt() result``.
    """
    chrom, pos = str(pileupfields[0]), str(pileupfields[1])
    ref, alt = str(pileupfields[2]), str(pileupfields[3])
    consqual = str(pileupfields[4])
    mapqual = str(pileupfields[6])
    depth = str(pileupfields[7])
    alt_count = str(count_alt(depth, pileupfields[8]))

    # Calls whose symbol is listed in HETERO are heterozygous (0/1) and the
    # symbol is translated by hetero2homo(); everything else is 1/1.
    if fu.isOnTheList(HETERO.keys(), alt):
        genotype = '0/1'
        alt = hetero2homo(ref, alt)
    else:
        genotype = '1/1'

    fixed_columns = [chrom, pos, '.', ref, alt, mapqual, 'PASS', '.',
                     'GT:GQ:DP:AD']
    sample_column = ':'.join((genotype, consqual, depth, alt_count))
    return '\t'.join(fixed_columns + [sample_column])
def addOverlapWithGadAll(vcf, format='vcf', table='gadAll', tmpextin='', tmpextout='.1', sep='\t'):
    """
    Annotate variants with the rows of ``table`` that overlap their position.

    Reads ``vcf + tmpextin`` line by line and writes ``vcf + tmpextout``.
    Blank lines, ``##`` lines and the ``CHROM``/``#CHROM`` header line are
    copied through. For every other line the chromosome and position are
    looked up in ``table``; when rows come back, ``<table>=<row[3]>`` is
    appended to field 7 once per distinct value of ``row[3]``, entries being
    separated by ';'. A summary line is appended to ``vcf + '.count.log'``.

    Args:
        vcf: base path of the file to annotate.
        format: passed to getFormatSpecificIndices() to obtain the indices
            of the chromosome and position columns.
        table: database table to query; also used as the annotation key.
        tmpextin: suffix appended to ``vcf`` to get the input path.
        tmpextout: suffix appended to ``vcf`` to get the output path.
        sep: separator used to split input lines into fields.

    Returns:
        None.
    """
    basefile = vcf
    infile = basefile + tmpextin
    outfile = basefile + tmpextout
    logcountfile = basefile + '.count.log'

    var_count = 0   # rows returned by the database, duplicates included
    line_count = 0  # input lines with at least one overlapping row
    inds = getFormatSpecificIndices(format=format)

    conn = u.db_connect()
    try:
        cursor = conn.cursor()
        with open(infile) as fh, open(outfile, "w") as fh_out:
            for line in fh:
                line = line.strip()
                # Blank lines, "##" comments and the column header carry no
                # variant, so they are copied as they are.
                if (not line or line.startswith("##")
                        or line.startswith(('CHROM', '#CHROM'))):
                    fh_out.write(line + '\n')
                    continue

                fields = line.split(sep)
                chrom = fields[inds[0]].strip()
                # For some reason this table has no "chr" preceding the
                # number; drop the prefix only, not later occurrences.
                if chrom.startswith("chr"):
                    chrom = chrom[len("chr"):]
                pos = fields[inds[1]].strip()

                # NOTE(security): table, chrom and pos are concatenated into
                # the statement. The placeholder style of the driver behind
                # u.db_connect() is not visible here, so the query is left
                # string-built; switch to bound parameters once confirmed.
                sql = 'select * from ' + table + ' where chromosome="' + \
                    str(chrom) + '" AND (chromStart <= ' + str(pos) + \
                    ' AND ' + str(pos) + ' <= chromEnd);'
                cursor.execute(sql)
                rows = cursor.fetchall()

                if not rows:
                    fh_out.write(line + '\n')
                    continue

                line_count += 1
                var_count += len(rows)

                seen = []
                records = []
                for row in rows:
                    value = str(row[3])
                    if not fu.isOnTheList(seen, value):
                        seen.append(value)
                        records.append(str(table) + '=' + value)

                annotation = ';'.join(records)
                if str(fields[7]).endswith(';'):
                    fields[7] = fields[7] + annotation
                else:
                    fields[7] = fields[7] + ';' + annotation
                # Join with a bare tab: a tab followed by a space would
                # prepend a space to every column after the first.
                fh_out.write('\t'.join(fields) + '\n')
    finally:
        conn.close()

    with open(logcountfile, 'a') as fh_log:
        fh_log.write(f"In {table}: {var_count} in {line_count} variants\n")
def addOverlapWitHUGOGeneNomenclature(vcf, format='vcf', table='hugo', tmpextin='', tmpextout='.1', sep='\t'):
    """
    Annotate variants with the rows of ``table`` that overlap their position.

    Reads ``vcf + tmpextin`` line by line and writes ``vcf + tmpextout``.
    Blank lines, ``##`` lines and the ``CHROM``/``#CHROM`` header line are
    copied through. For every other line the chromosome (with a "chr"
    prefix added when missing) and position are looked up in ``table``.
    When rows come back, one ``HGNC_GeneAnnotation=<row[5]>,<row[6]>``
    entry per distinct value is built; the entries are joined with ',',
    any ';' inside them is replaced by ',', and the result is appended to
    field 7. A summary line is appended to ``vcf + '.count.log'``.

    Args:
        vcf: base path of the file to annotate.
        format: passed to getFormatSpecificIndices() to obtain the indices
            of the chromosome and position columns.
        table: database table to query.
        tmpextin: suffix appended to ``vcf`` to get the input path.
        tmpextout: suffix appended to ``vcf`` to get the output path.
        sep: separator used to split input lines into fields.

    Returns:
        None.
    """
    basefile = vcf
    infile = basefile + tmpextin
    outfile = basefile + tmpextout
    logcountfile = basefile + '.count.log'

    var_count = 0   # rows returned by the database, duplicates included
    line_count = 0  # input lines with at least one overlapping row
    inds = getFormatSpecificIndices(format=format)

    conn = u.db_connect()
    try:
        cursor = conn.cursor()
        with open(infile) as fh, open(outfile, "w") as fh_out:
            for line in fh:
                line = line.strip()
                # Blank lines, "##" comments and the column header carry no
                # variant, so they are copied as they are.
                if (not line or line.startswith("##")
                        or line.startswith(('CHROM', '#CHROM'))):
                    fh_out.write(line + '\n')
                    continue

                fields = line.split(sep)
                chrom = fields[inds[0]].strip()
                # The query below matches chromosome names with the prefix.
                if not chrom.startswith("chr"):
                    chrom = "chr" + chrom
                pos = fields[inds[1]].strip()

                # NOTE(security): table, chrom and pos are concatenated into
                # the statement. The placeholder style of the driver behind
                # u.db_connect() is not visible here, so the query is left
                # string-built; switch to bound parameters once confirmed.
                sql = 'select * from ' + table + ' where chrom="' + \
                    str(chrom) + '" AND (chromStart <= ' + str(pos) + \
                    ' AND ' + str(pos) + ' <= chromEnd);'
                cursor.execute(sql)
                rows = cursor.fetchall()

                if not rows:
                    fh_out.write(line + '\n')
                    continue

                line_count += 1
                var_count += len(rows)

                seen = []
                records = []
                for row in rows:
                    value = str(str(row[5]) + ',' + str(row[6])).strip()
                    if not fu.isOnTheList(seen, value):
                        seen.append(value)
                        records.append('HGNC_GeneAnnotation' + '=' + value)

                # ';' separates entries in field 7, so it must not survive
                # inside the appended annotation text.
                records_str = ','.join(records).replace(';', ',')
                if str(fields[7]).endswith(';'):
                    fields[7] = fields[7] + records_str
                else:
                    fields[7] = fields[7] + ';' + records_str
                fh_out.write('\t'.join(fields) + '\n')
    finally:
        conn.close()

    with open(logcountfile, 'a') as fh_log:
        fh_log.write(f"In {table}: {var_count} in {line_count} variants\n")