def __init__(self, samfile, seqfile=None, zerobase=False, use_slider=False):
    """Set up the parser state around a SAM input stream.

    samfile    -- a path (opened with open()) or an already-open
                  file-like object yielding lines.
    seqfile    -- optional path handed to GiantFASTAFile; when None,
                  no `seqidx` attribute is created.
    zerobase   -- stored as-is on the instance.
    use_slider -- when true, the stream is passed through
                  rnarry.batchutils.slider_file() before being wrapped.
    """
    stream = open(samfile) if isinstance(samfile, basestring) else samfile
    if use_slider:
        # Imported lazily so the dependency is only needed on request.
        from rnarry import batchutils
        stream = batchutils.slider_file(stream)

    # The stream is always wrapped in a QueueableLineReader, whichever
    # way it was obtained.
    self.samfile = QueueableLineReader(stream)

    self.zerobase = zerobase
    self.seqlen = {}
    self.seqfile = seqfile
    if seqfile is not None:
        self.seqidx = GiantFASTAFile(seqfile)
'%s' % dbinfo['geneName'], '.', '+' ]) else: print >> bedout, '\t'.join([ nracc, '0', str(exonlength), '%s' % dbinfo['geneName'], '.', '+' ]) print >> faout, '>%s %s' % (nracc, dbinfo['geneName']) faout.write(textwrap(mutatedseq)) if __name__ == '__main__': refflatdbpath = sys.argv[1] nrlistpath = sys.argv[2] genomefastapath = sys.argv[3] rnaseqgspace = sys.argv[4] fastaoutpath = sys.argv[5] bedoutpath = sys.argv[6] refFlat = shelve.open(refflatdbpath, 'r') mm9 = GiantFASTAFile(genomefastapath) rseqarr = MultiTrackSplitBinnedArray(rnaseqgspace) nrlist = open(nrlistpath).read().split() bedout = open(bedoutpath, 'w') faout = open(fastaoutpath, 'w') process(nrlist)
output.write(name.ljust(32, '\x00')) output.write(struct.pack('II', len(refseq), len(grp))) output.write(refseq) for _, start, end, nreads in grp: output.write(struct.pack('III', start, end, nreads)) if __name__ == '__main__': import sys refseqfile = sys.argv[1] # fasta format mappingfile = sys.argv[2] # gzipped gmap native format readerrorfile = sys.argv[3] # python pickle format packfile = sys.argv[4] greffile = GiantFASTAFile(refseqfile) mappingf = gzip.open(mappingfile) packf = open(packfile, 'w') print 'Loading read error profile' profile, maxpos = load_profile(readerrorfile) print 'Dumping read error profile' dump_profile(profile, maxpos, packf) print 'Loading gmap mapping file' spans = scan_gmap(mappingf) print 'Dumping mapped reads' dump_gmap(spans, packf, greffile)
print >> outd, kcnt, refbase, '%.6f' % del_ratio print >> outm, kcnt, refbase, '%.6f' % mod_ratio print >> outmd, kcnt, refbase, '%.6f' % moddel_ratio print >> oute, kcnt, refbase, '%.6f' % sentropy if __name__ == '__main__': import sys mappingfile = sys.argv[1] # gmap native format greffile = sys.argv[2] outputprefix = sys.argv[3] mappingf = gzip.open(mappingfile) greffile = GiantFASTAFile(greffile) def opensortedout(fname): return os.popen( "sort -k1,1n -k2,2 -k3,3r | uniq -c | " "awk '{ print $2, $3, $4, $1; }' | gzip - >%s" % fname, 'w') nonzeroout = gzip.open(outputprefix + 'nonzero.posrcnt.gz', 'w') distout = { 'D': opensortedout(outputprefix + 'del.real.gz'), 'M': opensortedout(outputprefix + 'mod.real.gz'), 'MD': opensortedout(outputprefix + 'moddel.real.gz'), 'E': opensortedout(outputprefix + 'entropy.real.gz'), } spans = scan_gmap(mappingf)
class SAMParser(object):
    """Iterate over a SAM stream, merging consecutive lines of one read.

    Every yielded item describes one query name and carries a list of
    all places it maps to: the hit on the first line, the alternatives
    listed in that line's ``XA`` option, and the hits of any directly
    following lines that share the same query name.
    """

    def __init__(self, samfile, seqfile=None, zerobase=False, use_slider=False):
        """Prepare the parser.

        samfile    -- path of a SAM file, or an open file-like object.
        seqfile    -- optional path given to GiantFASTAFile; required
                      only for getsubseq() / ``withref=True``.
        zerobase   -- report start coordinates 0-based instead of 1-based.
        use_slider -- pass the stream through batchutils.slider_file().
        """
        if isinstance(samfile, basestring):
            self.samfile = open(samfile)
        else:
            self.samfile = samfile

        if use_slider:
            from rnarry import batchutils
            self.samfile = batchutils.slider_file(self.samfile)

        # Wrapped so that a line read one step too far can be pushed back.
        self.samfile = QueueableLineReader(self.samfile)

        self.seqfile = seqfile
        if seqfile is not None:
            self.seqidx = GiantFASTAFile(seqfile)

        self.zerobase = zerobase
        self.seqlen = {}

    def getsubseq(self, name, seqfrom, seqto):
        """Return reference sequence `name` over [seqfrom, seqto), 0-based.

        Raises ValueError when the parser was created without `seqfile`.
        """
        if self.seqfile is None:
            raise ValueError('Sequence file is not given.')
        return self.seqidx.get(name, seqfrom, seqto)

    def __iter__(self):
        return self.iteralignments()

    def _register_reference(self, fields):
        """Record the length announced by one split ``@SQ`` header line.

        Tags are looked up by name, so ``SN`` and ``LN`` may appear in
        any order. Raises ValueError if either one is missing.
        """
        tags = dict(fl.split(':', 1) for fl in fields[1:] if ':' in fl)
        if 'SN' not in tags or 'LN' not in tags:
            raise ValueError('Malformed @SQ header line: %r'
                             % '\t'.join(fields))
        self.seqlen[tags['SN']] = int(tags['LN'])

    def _make_hit(self, rname, pos, cigar, strand, editdist):
        """Build one `mapped` tuple from a 1-based leftmost position."""
        reflen, _ = calculate_cigar_length(cigar)
        stop = pos + reflen - 1
        start = pos - 1 if self.zerobase else pos
        return (rname, start, stop, strand, editdist, cigar)

    def _parse_hit(self, fields, strands):
        """Decode the mapping columns of one split alignment line.

        Returns (flags, strand, options, hit); `hit` is None when the
        read is unmapped (RNAME is ``*``) or its strand is not wanted.
        """
        flags = int(fields[1])
        rname = fields[2]
        pos = int(fields[3])  # 1-based leftmost
        cigar = fields[5]
        options = dict(v.split(':', 1) for v in fields[11:])
        strand = '-' if flags & F_REVERSE_STRAND else '+'
        editdist = int(options.get('NM', 'i:-1')[2:])

        if rname == '*' or strand not in strands:
            hit = None
        else:
            hit = self._make_hit(rname, pos, cigar, strand, editdist)
        return flags, strand, options, hit

    def _xa_hits(self, options, strands):
        """Yield hits listed in an ``XA`` option (chr,±pos,CIGAR,NM;...)."""
        xavalue = options.get('XA', 'Z:')[2:]
        # Empty pieces are dropped, so a missing trailing ';' no longer
        # loses the final entry.
        for altmatch in xavalue.split(';'):
            if not altmatch:
                continue
            altfields = altmatch.split(',')
            strand = altfields[1][0]
            if strand not in strands:
                continue
            yield self._make_hit(altfields[0], int(altfields[1][1:]),
                                 altfields[2], strand, int(altfields[3]))

    def _reference_for(self, hit):
        """Fetch the reference bases under `hit`, oriented like the read."""
        rname, start, stop, strand = hit[:4]
        # getsubseq() wants a 0-based half-open range.  `stop` already
        # equals the exclusive 0-based end; `start` needs shifting only
        # when it was reported 1-based.
        refstart = start if self.zerobase else start - 1
        subseq = self.getsubseq(rname, refstart, stop)
        if strand == '-':
            subseq = reverse_complement(subseq)
        return subseq

    # WARNING: 'mapped' coordinates are 1-based, both sides inclusive by
    # default; with zerobase=True the start becomes 0-based.
    def iteralignments(self, strands='+-', withref=False):
        """Generate one dict per query name found in the SAM stream.

        strands -- container of the strand symbols ('+', '-') to keep.
        withref -- append the reference subsequence to every hit tuple
                   (needs `seqfile`).

        ``@SQ`` header lines fill `self.seqlen`; other header lines and
        empty lines are skipped.  Each yielded dict has the keys
        'qname', 'flags', 'mapq', 'seq', 'options' (all taken from the
        first line of the read) and 'mapped', a list of
        (rname, start, stop, strand, editdist, cigar[, refseq]) tuples
        sorted by edit distance.
        """
        for line in self.samfile:
            # Strip only the line terminator: safe for a final line
            # without a newline and for CRLF input.
            line = line.rstrip('\r\n')
            if not line:
                continue
            fields = line.split('\t')

            if line[0] == '@':
                if line[:3] == '@SQ':
                    self._register_reference(fields)
                continue

            qname = fields[0]
            flags, strand, options, hit = self._parse_hit(fields, strands)
            mapq = int(fields[4])  # phred-scaled
            seq = fields[9]
            if strand == '-':
                seq = reverse_complement(seq)

            mapped = [] if hit is None else [hit]
            mapped.extend(self._xa_hits(options, strands))

            # Collect the following lines that belong to the same read.
            for altline in self.samfile:
                altfields = altline.rstrip('\r\n').split('\t')
                if altfields[0] != qname:
                    # Belongs to the next read: hand it back untouched.
                    self.samfile.push(altline)
                    break
                althit = self._parse_hit(altfields, strands)[3]
                if althit is not None:
                    mapped.append(althit)

            mapped.sort(key=lambda m: m[4])  # by edit distance

            if withref:
                mapped = [m + (self._reference_for(m),) for m in mapped]

            yield {
                'qname': qname,
                'flags': flags,
                'mapq': mapq,
                'seq': seq,
                'options': options,
                'mapped': mapped,  # start is 1-based unless zerobase
            }