def filter_snp(general, snp_info, sample_stats, mincov, minsnp, ploidy):
    """Return the filtered genotype call of one sample at one position.

    Args:
        general: indexable record of the position; item 2 is read as the
            reference base and item 3 as the comma-separated alternatives.
        snp_info: mapping of info fields; only ``'DP4'`` is read, a string
            "fw.ref,rev.ref,fw.alt,rev.alt" of (filtered) read counts.
            A missing entry counts as "0,0,0,0".
        sample_stats: indexable whose item 1 is the sample field, e.g.
            ``"0/1:48,0,53:14:0:50"``; the genotype is taken to be its first
            ``:``-separated item (the FORMAT column itself is not consulted).
        mincov: minimum total number of reads required at the position.
        minsnp: minimum percentage of reads supporting an alternative; the
            threshold actually applied is ``minsnp / ploidy``.
        ploidy: ploidy of the sample; 1 collapses the genotype to its
            non-reference alleles.

    Returns:
        ``'0'`` when the coverage is below ``mincov`` (or there are no reads
        at all), ``'-'`` when the alternative ratio is below the threshold,
        otherwise a string such as ``'T/A (50% of 8)'``.
    """
    ref = general[2]
    # Build a real list: the counts are summed AND indexed below, which a
    # bare map() iterator does not support on Python 3.
    dp4 = [int(count) for count in snp_info.get('DP4', '0,0,0,0').split(',')]
    total_reads = sum(dp4)
    # The explicit zero test keeps the ratio below from dividing by zero
    # when mincov is 0 (or negative).
    if total_reads < mincov or total_reads == 0:  # coverage low
        return '0'
    ratio = 100.0 * (dp4[2] + dp4[3]) / total_reads
    # float() so the threshold is not truncated by integer division.
    if ratio < float(minsnp) / ploidy:
        return "-"
    alts = [ref] + general[3].split(',')
    raw_genotype = sample_stats[1].split(':')[0]
    sep = '/' if '/' in raw_genotype else '|'  # phased if |, unphased if /
    genotype = [alts[int(i)] for i in raw_genotype.split(sep)]
    # Diploid: GT: '0/1' -> 'T/A (50% of 8)'  [if ref='T' and alt='A']
    # Haploid: GT: '0/1' -> 'A (50% of 8)'
    if ploidy == 1:
        # `unique` is defined elsewhere in the file; it must return a
        # list-like object since .remove() is called on its result.
        genotype = unique(genotype)
        if ref in genotype:
            genotype.remove(ref)
    return "%s (%.0f%% of %d)" % (sep.join(genotype), ratio, total_reads)
def _write_buffer(_buffer, outex): new_codon = None # One position at a time for chr, pos, refbase, variants, cds, strand, ref_codon, shift in _buffer: varbase = list( variants ) # Ex: ['G','G','G','T (56% of 167)','G'], ['A/A','G/G (100% of 7)'] if new_codon is None: new_codon = [[ref_codon] for _ in range(len(varbase))] variants = [] # [[variants sample1], [variants sample2], ...] # One sample at a time for variant in varbase: if variant in ['0', '-']: variants.append([refbase]) else: # Ex: 'C/G (80% of 10)' : heterozygous simple (ref is C) or double snp (ref is not C) v = variant.split()[0] # Ex: C/G v = unique(v.split('/')) # Ex: G/G -> 'G' if refbase in v: v.remove(refbase) # Ex: C/G -> 'G' (if ref is C) variants.append(v) # One sample at a time for k, v in enumerate(variants): cnumb = len(new_codon[k]) newc = new_codon[k] * len(v) for i, vari in enumerate(v): for j in range(cnumb): newc[i * cnumb + j] = newc[i * cnumb + j][:shift] + vari + newc[i * cnumb + j][shift + 1:] assert ref_codon[ shift] == refbase, "bug with shift within codon" new_codon[k] = newc if new_codon is None: return if strand == -1: ref_codon = revcomp(ref_codon) new_codon = [[revcomp(s) for s in c] for c in new_codon] for chr, pos, refbase, variants, cds, strand, dummy, shift in _buffer: refc = [iupac.get(x, x) for x in ref_codon] ref_codon = [''.join(x) for x in product(*refc)] newc = [[[iupac.get(x, x) for x in variant] for variant in sample] for sample in new_codon] new_codon = [[ ''.join(x) for codon in sample for x in product(*codon) ] for sample in newc] if refbase == "*": result = [chr, pos+1, refbase] + list(variants) + [cds, strand] \ + [','.join([translate.get(refc,'?') for refc in ref_codon])] + ["indel"] else: result = [chr, pos+1, refbase] + list(variants) + [cds, strand] \ + [','.join([translate.get(refc,'?') for refc in ref_codon])] \ + [','.join([translate.get(s,'?') for s in newc]) for newc in new_codon] outex.write("\t".join([str(r) for r in result]) + "\n")
def _write_buffer(_buffer, outex): new_codon = None # One position at a time for chr,pos,refbase,variants,cds,strand,ref_codon,shift in _buffer: varbase = list(variants) # Ex: ['G','G','G','T (56% of 167)','G'], ['A/A','G/G (100% of 7)'] if new_codon is None: new_codon = [[ref_codon] for _ in range(len(varbase))] variants = [] # [[variants sample1], [variants sample2], ...] # One sample at a time for variant in varbase: if variant in ['0','-']: variants.append([refbase]) else: # Ex: 'C/G (80% of 10)' : heterozygous simple (ref is C) or double snp (ref is not C) v = variant.split()[0] # Ex: C/G v = unique(v.split('/')) # Ex: G/G -> 'G' if refbase in v: v.remove(refbase) # Ex: C/G -> 'G' (if ref is C) variants.append(v) # One sample at a time for k,v in enumerate(variants): cnumb = len(new_codon[k]) newc = new_codon[k]*len(v) for i,vari in enumerate(v): for j in range(cnumb): newc[i*cnumb+j] = newc[i*cnumb+j][:shift] +vari +newc[i*cnumb+j][shift+1:] assert ref_codon[shift] == refbase, "bug with shift within codon" new_codon[k] = newc if new_codon is None: return if strand == -1: ref_codon = revcomp(ref_codon) new_codon = [[revcomp(s) for s in c] for c in new_codon] for chr,pos,refbase,variants,cds,strand,dummy,shift in _buffer: refc = [iupac.get(x,x) for x in ref_codon] ref_codon = [''.join(x) for x in product(*refc)] newc = [[[iupac.get(x,x) for x in variant] for variant in sample] for sample in new_codon] new_codon = [[''.join(x) for codon in sample for x in product(*codon)] for sample in newc] if refbase == "*": result = [chr, pos+1, refbase] + list(variants) + [cds, strand] \ + [','.join([translate.get(refc,'?') for refc in ref_codon])] + ["indel"] else: result = [chr, pos+1, refbase] + list(variants) + [cds, strand] \ + [','.join([translate.get(refc,'?') for refc in ref_codon])] \ + [','.join([translate.get(s,'?') for s in newc]) for newc in new_codon] outex.write("\t".join([str(r) for r in result])+"\n")
def filter_snp(general, snp_info, sample_stats, mincov, minsnp, ploidy):
    """Return the filtered genotype call of one sample at one position.

    Args:
        general: indexable record of the position; item 2 is read as the
            reference base and item 3 as the comma-separated alternatives.
        snp_info: mapping of info fields; only ``'DP4'`` is read, a string
            "fw.ref,rev.ref,fw.alt,rev.alt" of (filtered) read counts.
            A missing entry counts as "0,0,0,0".
        sample_stats: indexable whose item 1 is the sample field, e.g.
            ``"0/1:48,0,53:14:0:50"``; the genotype is taken to be its first
            ``:``-separated item (the FORMAT column itself is not consulted).
        mincov: minimum number of reads that must support the alternative.
        minsnp: minimum percentage of reads supporting the alternative; the
            threshold actually applied is ``minsnp / ploidy``.
        ploidy: ploidy of the sample; 1 collapses the genotype to its
            non-reference alleles.

    Returns:
        ``'-'`` when fewer than ``mincov`` reads support the alternative (or
        there are no reads at all), ``'0'`` when the alternative ratio is
        below the threshold, otherwise a string such as ``'T/A (50% of 8)'``.
    """
    ref = general[2]
    # A list rather than a map() iterator: the counts are both summed and
    # accessed by index, which fails on a Python 3 map object.
    dp4 = [int(count) for count in snp_info.get('DP4', '0,0,0,0').split(',')]
    total_reads = sum(dp4)
    alt_reads = dp4[2] + dp4[3]
    # total_reads == 0 can only get past the first test when mincov <= 0;
    # checking it here avoids a division by zero in the ratio.
    if alt_reads < mincov or total_reads == 0:  # too few supporting alt
        return '-'
    ratio = 100.0 * alt_reads / total_reads
    # float() so the threshold is not truncated by integer division.
    if ratio < float(minsnp) / ploidy:
        return "0"
    alts = [ref] + general[3].split(',')
    raw_genotype = sample_stats[1].split(':')[0]
    sep = '/' if '/' in raw_genotype else '|'  # phased if |, unphased if /
    genotype = [alts[int(i)] for i in raw_genotype.split(sep)]
    # Diploid: GT: '0/1' -> 'T/A (50% of 8)'  [if ref='T' and alt='A']
    # Haploid: GT: '0/1' -> 'A (50% of 8)'
    if ploidy == 1:
        # `unique` is defined elsewhere in the file; it must return a
        # list-like object since .remove() is called on its result.
        genotype = unique(genotype)
        if ref in genotype:
            genotype.remove(ref)
    return "%s (%.0f%% of %d)" % (sep.join(genotype), ratio, total_reads)