def test_intersect(self):
    """Exercise `intersect` on an annotation track and on two synthetic streams.

    The first case comes from the snp workflow: a single SNP feature is
    intersected with the CDS track of the 'mycoTube_H37RV' assembly and only
    the first resulting feature is compared. The second case intersects two
    hand-written streams and compares the complete result list.
    """
    # Test from the snp workflow.
    expected = ('chr',91143,91144, ('C','*A','0','|EBMYCG00000002479|Rv0083',1,0))
    a = genrep.Assembly('mycoTube_H37RV')
    c = concat_fields(a.annot_track('CDS','chr'),
                      infields=['name','strand','frame'], as_tuple=True)
    feat = fstream([('chr',91143,91144,('C','*A','0'))],
                   fields=['chr','start','end','rest'])
    g = intersect([feat,c], win_size=10000)
    # Builtin next() instead of the Python 2-only `.next()` method:
    # same behaviour on Python 2.6+, and also valid on Python 3.
    self.assertEqual(next(g),expected)

    # Two overlapping pairs of intervals; the expected tuples below pin the
    # overlap coordinates and the merged name/strand/score values.
    fields = ['chr','start','end','name','strand','score']
    s1 = fstream([('chr',0,20,'a1',1,6.),('chr',40,60,'b',1,3.)], fields=fields)
    s2 = fstream([('chr',10,30,'a2',1,8.),('chr',50,70,'b',-1,4.)], fields=fields)
    res = list(intersect([s1,s2]))
    expected = [('chr',10,20,'a1|a2',1,14.),('chr',50,60,'b|b',0,7.)]
    self.assertListEqual(res,expected)
def test_intersect(self):
    """Exercise `intersect` on an annotation track and on two synthetic streams.

    The first case comes from the snp workflow: a single SNP feature is
    intersected with the CDS track of the 'mycoTube_H37RV' assembly and only
    the first resulting feature is compared. The second case intersects two
    hand-written streams and compares the complete result list.
    """
    # Test from the snp workflow.
    expected = ('chr', 91143, 91144,
                ('C', '*A', '0', '|EBMYCG00000002479|Rv0083', 1, 0))
    a = genrep.Assembly('mycoTube_H37RV')
    c = concat_fields(a.annot_track('CDS', 'chr'),
                      infields=['name', 'strand', 'frame'],
                      as_tuple=True)
    feat = fstream([('chr', 91143, 91144, ('C', '*A', '0'))],
                   fields=['chr', 'start', 'end', 'rest'])
    g = intersect([feat, c], win_size=10000)
    # Builtin next() instead of the Python 2-only `.next()` method:
    # same behaviour on Python 2.6+, and also valid on Python 3.
    self.assertEqual(next(g), expected)

    # Two overlapping pairs of intervals; the expected tuples below pin the
    # overlap coordinates and the merged name/strand/score values.
    fields = ['chr', 'start', 'end', 'name', 'strand', 'score']
    s1 = fstream([('chr', 0, 20, 'a1', 1, 6.), ('chr', 40, 60, 'b', 1, 3.)],
                 fields=fields)
    s2 = fstream([('chr', 10, 30, 'a2', 1, 8.), ('chr', 50, 70, 'b', -1, 4.)],
                 fields=fields)
    res = list(intersect([s1, s2]))
    expected = [('chr', 10, 20, 'a1|a2', 1, 14.),
                ('chr', 50, 60, 'b|b', 0, 7.)]
    self.assertListEqual(res, expected)
def exon_snps(chrom,outexons,allsnps,assembly,sample_names,genomeRef={},
              logfile=sys.stdout,debugfile=sys.stderr):
    """Annotate the SNPs overlapping CDS regions with the codon changes they cause.

    The SNPs in `allsnps` are intersected with the 'CDS' annotation track of
    `assembly` on chromosome `chrom`. For every SNP/CDS overlap the reference
    codon is fetched, the sample variants are substituted into it, and one
    tab-separated line is appended to `outexons`:

        chr, pos+1, ref base, one column per sample, cds, strand,
        translation(s) of the reference codon, and then either the word
        "indel" (when the reference base is "*") or one column per sample
        with the translation(s) of the modified codon(s).

    SNPs falling in the same codon on the same strand are buffered together so
    that their substitutions are combined before translation.

    :param chrom: (str) chromosome name.
    :param outexons: (str) name of the file to which SNPs on exons are appended.
    :param allsnps: list of tuples (chr,start,end,ref,alt1..altN) as returned by all_snps().
        Ex: [('chr', 3684115, 3684116, 'G', 'G', 'G', 'G', 'T (56% of 167)', 'G'), ...]
    :param assembly: genrep.Assembly object
    :param sample_names: list of sample names.
    :param genomeRef: dict of the form {'chr1': filename}, where filename is the name of a fasta file
        containing the reference sequence for the chromosome. Only read, never modified here.
    :param logfile: writable file-like object receiving a progress message.
    :param debugfile: not used by this function.
    :returns: False if the CDS annotation stream could not be built, True otherwise.
    """
    def _write_buffer(_buffer, outex):
        """Write one line to `outex` per entry of `_buffer` (SNPs sharing one codon)."""
        new_codon = None
        # One position at a time
        for chr_name,pos,refbase,variants,cds,strand,ref_codon,shift in _buffer:
            varbase = list(variants)  # Ex: ['G','G','G','T (56% of 167)','G'],  ['A/A','G/G (100% of 7)']
            if new_codon is None:
                new_codon = [[ref_codon] for _ in range(len(varbase))]
            variants = []  # [[variants sample1], [variants sample2], ...]
            # One sample at a time
            for variant in varbase:
                if variant in ['0','-']:
                    variants.append([refbase])
                else:  # Ex: 'C/G (80% of 10)' : heterozygous simple (ref is C) or double snp (ref is not C)
                    v = variant.split()[0]       # Ex: C/G
                    v = unique(v.split('/'))     # Ex: G/G -> 'G'
                    if refbase in v: v.remove(refbase)  # Ex: C/G -> 'G' (if ref is C)
                    variants.append(v)
            # One sample at a time
            for k,v in enumerate(variants):
                cnumb = len(new_codon[k])
                # One copy of every codon built so far for each variant base
                newc = new_codon[k]*len(v)
                for i,vari in enumerate(v):
                    for j in range(cnumb):
                        idx = i*cnumb+j
                        newc[idx] = newc[idx][:shift] + vari + newc[idx][shift+1:]
                        assert ref_codon[shift] == refbase, "bug with shift within codon"
                new_codon[k] = newc
        if new_codon is None: return  # empty buffer: nothing to write
        # `strand` and `ref_codon` still hold the values of the last buffered
        # entry; the caller fills each buffer with a single strand and codon.
        if strand == -1:
            ref_codon = revcomp(ref_codon)
            new_codon = [[revcomp(s) for s in c] for c in new_codon]
        # Expand ambiguity codes and translate ONCE per buffer. Doing this inside
        # the output loop rebound `ref_codon` to a list of codons, so from the
        # second SNP of a codon onwards whole codons were iterated as if they
        # were single bases and the reference translation was wrong.
        # NOTE(review): assumes the values of `iupac` contain only unambiguous
        # bases, so expanding the new codons once equals expanding them repeatedly.
        ref_bases = [iupac.get(x,x) for x in ref_codon]
        ref_codons = [''.join(x) for x in product(*ref_bases)]
        expanded = [[[iupac.get(x,x) for x in codon] for codon in sample] for sample in new_codon]
        new_codons = [[''.join(x) for codon in sample for x in product(*codon)] for sample in expanded]
        ref_aa = ','.join([translate.get(c,'?') for c in ref_codons])
        new_aa = [','.join([translate.get(c,'?') for c in sample]) for sample in new_codons]
        for chr_name,pos,refbase,variants,cds,strand,dummy,shift in _buffer:
            result = [chr_name, pos+1, refbase] + list(variants) + [cds, strand] + [ref_aa]
            if refbase == "*":
                result += ["indel"]
            else:
                result += new_aa
            outex.write("\t".join([str(r) for r in result])+"\n")

    #############################################################
    snp_stream = FeatureStream(allsnps, fields=['chr','start','end','ref']+sample_names)
    inclstream = concat_fields(snp_stream, infields=snp_stream.fields[3:], as_tuple=True)
    try:
        annotstream = concat_fields(assembly.annot_track('CDS',chrom),
                                    infields=['name','strand','frame'], as_tuple=True)
        # Prepend the CDS start and end to the concatenated (name,strand,frame) tuple
        annotstream = FeatureStream((x[:3]+(x[1:3]+x[3],) for x in annotstream),
                                    fields=annotstream.fields)
    except Exception:
        # Best effort: no usable annotation for this chromosome
        return False
    _buffer = {1:[], -1:[]}
    last_start = {1:-1, -1:-1}
    nsamples = len(sample_names)
    logfile.write(" Intersection with CDS - codon changes\n"); logfile.flush()
    # `with` guarantees the output file is closed even if an error occurs below
    with open(outexons,"a") as outex:
        for x in gm_stream.intersect([inclstream, annotstream]):
            # x = (chr, start,end, (alt1,alt2,   , start1,end1,cds1,strand1,phase1,  start2,end2,cds2,strand2,phase2    ))
            # x = ('chrV',1606,1607, ('T','C (43%)', 1612,1724,'YEL077C|YEL077C',-1,0, 1712,1723,'YEL077W-A|YEL077W-A',1,0))
            chr_name = x[0]; pos = x[1]; rest = x[3]
            refbase = rest[0]
            annot_fields = rest[nsamples+1:]
            # list of (start,end,cds,strand,phase); `//` keeps the count an integer
            annot = [annot_fields[5*i : 5*i+5] for i in range(len(annot_fields)//5)]
            for es,ee,cds,strand,phase in annot:
                if strand == 1:
                    shift = (pos-es-phase) % 3
                elif strand == -1:
                    shift = (pos-ee+phase) % 3
                else:
                    continue
                codon_start = pos-shift
                ref_codon = assembly.fasta_from_regions({chr_name: [[codon_start,codon_start+3]]}, out={},
                                                        path_to_ref=genomeRef.get(chr_name))[0][chr_name][0]
                info = [chr_name,pos,refbase,list(rest[1:nsamples+1]),cds,strand,
                        ref_codon.upper(),shift]
                # Either the codon is the same as the previous one on this strand, or it will never be.
                # Only if one codon is passed, can write its snps to a file.
                if codon_start == last_start[strand]:
                    _buffer[strand].append(info)
                else:
                    _write_buffer(_buffer[strand],outex)
                    _buffer[strand] = [info]
                    last_start[strand] = codon_start
        # Flush what is still buffered on each strand
        for strand in [1,-1]:
            _write_buffer(_buffer[strand],outex)
    return True
def exon_snps(chrom, outexons, allsnps, assembly, sample_names, genomeRef={},
              logfile=sys.stdout, debugfile=sys.stderr):
    """Annotate the SNPs overlapping CDS regions with the codon changes they cause.

    The SNPs in `allsnps` are intersected with the 'CDS' annotation track of
    `assembly` on chromosome `chrom`. For every SNP/CDS overlap the reference
    codon is fetched, the sample variants are substituted into it, and one
    tab-separated line is appended to `outexons`:

        chr, pos+1, ref base, one column per sample, cds, strand,
        translation(s) of the reference codon, and then either the word
        "indel" (when the reference base is "*") or one column per sample
        with the translation(s) of the modified codon(s).

    SNPs falling in the same codon on the same strand are buffered together so
    that their substitutions are combined before translation.

    :param chrom: (str) chromosome name.
    :param outexons: (str) name of the file to which SNPs on exons are appended.
    :param allsnps: list of tuples (chr,start,end,ref,alt1..altN) as returned by all_snps().
        Ex: [('chr', 3684115, 3684116, 'G', 'G', 'G', 'G', 'T (56% of 167)', 'G'), ...]
    :param assembly: genrep.Assembly object
    :param sample_names: list of sample names.
    :param genomeRef: dict of the form {'chr1': filename}, where filename is the name of a fasta file
        containing the reference sequence for the chromosome. Only read, never modified here.
    :param logfile: writable file-like object receiving a progress message.
    :param debugfile: not used by this function.
    :returns: False if the CDS annotation stream could not be built, True otherwise.
    """

    def _write_buffer(_buffer, outex):
        """Write one line to `outex` per entry of `_buffer` (SNPs sharing one codon)."""
        new_codon = None
        # One position at a time
        for chr_name, pos, refbase, variants, cds, strand, ref_codon, shift in _buffer:
            # Ex: ['G','G','G','T (56% of 167)','G'],  ['A/A','G/G (100% of 7)']
            varbase = list(variants)
            if new_codon is None:
                new_codon = [[ref_codon] for _ in range(len(varbase))]
            variants = []  # [[variants sample1], [variants sample2], ...]
            # One sample at a time
            for variant in varbase:
                if variant in ['0', '-']:
                    variants.append([refbase])
                else:
                    # Ex: 'C/G (80% of 10)' : heterozygous simple (ref is C)
                    # or double snp (ref is not C)
                    v = variant.split()[0]  # Ex: C/G
                    v = unique(v.split('/'))  # Ex: G/G -> 'G'
                    if refbase in v:
                        v.remove(refbase)  # Ex: C/G -> 'G' (if ref is C)
                    variants.append(v)
            # One sample at a time
            for k, v in enumerate(variants):
                cnumb = len(new_codon[k])
                # One copy of every codon built so far for each variant base
                newc = new_codon[k] * len(v)
                for i, vari in enumerate(v):
                    for j in range(cnumb):
                        idx = i * cnumb + j
                        newc[idx] = newc[idx][:shift] + vari + newc[idx][shift + 1:]
                        assert ref_codon[shift] == refbase, "bug with shift within codon"
                new_codon[k] = newc
        if new_codon is None:
            return  # empty buffer: nothing to write
        # `strand` and `ref_codon` still hold the values of the last buffered
        # entry; the caller fills each buffer with a single strand and codon.
        if strand == -1:
            ref_codon = revcomp(ref_codon)
            new_codon = [[revcomp(s) for s in c] for c in new_codon]
        # Expand ambiguity codes and translate ONCE per buffer. Doing this inside
        # the output loop rebound `ref_codon` to a list of codons, so from the
        # second SNP of a codon onwards whole codons were iterated as if they
        # were single bases and the reference translation was wrong.
        # NOTE(review): assumes the values of `iupac` contain only unambiguous
        # bases, so expanding the new codons once equals expanding them repeatedly.
        ref_bases = [iupac.get(x, x) for x in ref_codon]
        ref_codons = [''.join(x) for x in product(*ref_bases)]
        expanded = [[[iupac.get(x, x) for x in codon] for codon in sample]
                    for sample in new_codon]
        new_codons = [[''.join(x) for codon in sample for x in product(*codon)]
                      for sample in expanded]
        ref_aa = ','.join([translate.get(c, '?') for c in ref_codons])
        new_aa = [','.join([translate.get(c, '?') for c in sample])
                  for sample in new_codons]
        for chr_name, pos, refbase, variants, cds, strand, dummy, shift in _buffer:
            result = [chr_name, pos + 1, refbase] + list(variants) + [cds, strand] + [ref_aa]
            if refbase == "*":
                result += ["indel"]
            else:
                result += new_aa
            outex.write("\t".join([str(r) for r in result]) + "\n")

    #############################################################
    snp_stream = FeatureStream(allsnps,
                               fields=['chr', 'start', 'end', 'ref'] + sample_names)
    inclstream = concat_fields(snp_stream,
                               infields=snp_stream.fields[3:],
                               as_tuple=True)
    try:
        annotstream = concat_fields(assembly.annot_track('CDS', chrom),
                                    infields=['name', 'strand', 'frame'],
                                    as_tuple=True)
        # Prepend the CDS start and end to the concatenated (name,strand,frame) tuple
        annotstream = FeatureStream(
            (x[:3] + (x[1:3] + x[3], ) for x in annotstream),
            fields=annotstream.fields)
    except Exception:
        # Best effort: no usable annotation for this chromosome
        return False
    _buffer = {1: [], -1: []}
    last_start = {1: -1, -1: -1}
    nsamples = len(sample_names)
    logfile.write(" Intersection with CDS - codon changes\n")
    logfile.flush()
    # `with` guarantees the output file is closed even if an error occurs below
    with open(outexons, "a") as outex:
        for x in gm_stream.intersect([inclstream, annotstream]):
            # x = (chr, start,end, (alt1,alt2,   , start1,end1,cds1,strand1,phase1,  start2,end2,cds2,strand2,phase2    ))
            # x = ('chrV',1606,1607, ('T','C (43%)', 1612,1724,'YEL077C|YEL077C',-1,0, 1712,1723,'YEL077W-A|YEL077W-A',1,0))
            chr_name = x[0]
            pos = x[1]
            rest = x[3]
            refbase = rest[0]
            annot_fields = rest[nsamples + 1:]
            # list of (start,end,cds,strand,phase); `//` keeps the count an integer
            annot = [
                annot_fields[5 * i:5 * i + 5]
                for i in range(len(annot_fields) // 5)
            ]
            for es, ee, cds, strand, phase in annot:
                if strand == 1:
                    shift = (pos - es - phase) % 3
                elif strand == -1:
                    shift = (pos - ee + phase) % 3
                else:
                    continue
                codon_start = pos - shift
                ref_codon = assembly.fasta_from_regions(
                    {chr_name: [[codon_start, codon_start + 3]]},
                    out={},
                    path_to_ref=genomeRef.get(chr_name))[0][chr_name][0]
                info = [
                    chr_name, pos, refbase,
                    list(rest[1:nsamples + 1]), cds, strand,
                    ref_codon.upper(), shift
                ]
                # Either the codon is the same as the previous one on this strand, or it will never be.
                # Only if one codon is passed, can write its snps to a file.
                if codon_start == last_start[strand]:
                    _buffer[strand].append(info)
                else:
                    _write_buffer(_buffer[strand], outex)
                    _buffer[strand] = [info]
                    last_start[strand] = codon_start
        # Flush what is still buffered on each strand
        for strand in [1, -1]:
            _write_buffer(_buffer[strand], outex)
    return True