def buildIndex(K, inputs, output):
    """
    Build a k-mer index over a collection of FASTA files.

    Every record of every file named in `inputs` is read, in order, and
    the records are numbered from 0.  For each record the k-mers yielded
    by `kmers(K, seq, True)` are sorted and passed through `uniq`; the
    pooled k-mers of all records are wrapped in a `sparse` set and an
    inverted index from k-mer rank to record number is assembled.

    The container written to `output` receives the k-mers, an 'offsets'
    table (one slot per rank plus a terminator) pointing into a
    'postings' table of record numbers, the length of each record's
    k-mer list ('lens') and the record names (`meta['names']`).
    """
    records = []
    for path in inputs:
        with openFile(path) as fh:
            records.extend(readFasta(fh))

    pooled = []
    nms = []
    lens = array.array('I')
    for (seqNum, (nm, seq)) in enumerate(records):
        nms.append(nm)
        xs = sorted(kmers(K, seq, True))
        uniq(xs)
        # Replace the raw sequence by its k-mer list; only that is
        # needed from here on.
        records[seqNum] = [nm, xs]
        lens.append(len(xs))
        pooled.extend(xs)
    pooled.sort()
    uniq(pooled)
    S = sparse(2 * K, pooled)

    # Pass 1: count how many records mention each rank.
    T = array.array('I', [0]) * (S.count() + 1)
    for (_, xs) in records:
        for x in xs:
            T[S.rank(x)] += 1

    # Turn the counts into exclusive prefix sums: T[r] becomes the
    # start of rank r's slice of the postings table.
    total = 0
    for r in xrange(len(T)):
        c = T[r]
        T[r] = total
        total += c

    # Pass 2: drop each record number into the next free slot of the
    # slice belonging to each of its k-mers.
    # Typecode 'H' is an unsigned short, so record numbers must fit
    # in that type.
    cursor = list(T)
    U = array.array('H', [0]) * total
    for (seqNum, (_, xs)) in enumerate(records):
        for x in xs:
            r = S.rank(x)
            U[cursor[r]] = seqNum
            cursor[r] += 1

    with container(output, 'w') as z:
        writeKmers(K, S.xs, z)
        z.meta['T'] = write32(z, T, 'offsets')
        z.meta['U'] = write16(z, U, 'postings')
        z.meta['lens'] = write32(z, lens, 'lens')
        z.meta['names'] = nms
def main(argv):
    """
    Sort reads into one `ReadCache` per bait sequence.

    The FASTA file `<sequences>` supplies the baits.  Each read item
    produced by `reads(...)` over `<input>` is added to the cache of
    every bait with which it shares at least one k-mer.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    B = int(opts['-b'])
    paired = opts['-p']
    verbose = opts['-v']
    Z = opts['-z']

    # baits maps k-mer -> set of bait numbers containing it.
    names = []
    seqs = []
    baits = {}
    with openFile(opts['<sequences>']) as fh:
        for (nm, seq) in readFasta(fh):
            baitNum = len(names)
            names.append(nm)
            seqs.append(seq)
            for x in kmersList(K, seq, True):
                baits.setdefault(x, set()).add(baitNum)

    caches = [ReadCache(opts['-P'], nm, paired, B, Z) for nm in names]

    numReads = 0
    numHits = 0
    for itm in reads(opts['<input>'], reads=True, kmers=True, fwdOnly=True, paired=paired, verbose=verbose):
        numReads += 1

        # Union of the baits touched by any k-mer of any mate.
        hits = set()
        for fwd in itm.kmers:
            for x in fwd:
                if x in baits:
                    hits.update(baits[x])

        for baitNum in hits:
            caches[baitNum].add(itm.reads)
        if hits:
            numHits += 1

    for cache in caches:
        cache.end()
def __getitem__(self, acc):
    """
    Return the sequence stored for accession `acc`.

    The accession is normalised with `normalizeAccession`, and the first
    record of `<home>/<acc>.fa` is loaded (falling back to
    `<home>/<acc>.fa.gz` when the uncompressed file does not exist).
    The most recently loaded accession and sequence are cached in
    `self.prevAcc` / `self.prevSeq`.

    NOTE(review): if the file contains no records, neither cache field
    is updated and the previously cached sequence is returned.
    """
    if acc != self.prevAcc:
        acc = normalizeAccession(acc)
        # The cache key is the *normalised* accession, so compare again
        # after normalising; otherwise every lookup with a
        # non-normalised spelling re-reads the file it has just cached.
        if acc != self.prevAcc:
            pth = self.home + '/' + acc + '.fa'
            if not os.path.exists(pth):
                pth = pth + '.gz'
            with openFile(pth) as f:
                for (nm, seq) in readFasta(f):
                    # Only the first record of the file is used.
                    self.prevAcc = acc
                    self.prevSeq = seq
                    break
    return self.prevSeq
def __getitem__(self, acc):
    """
    Return the sequence for accession `acc`.

    `acc` is translated through `hgvs.refSeq2Hg19` and the first record
    of `<home>/<translated>.fa.gz` is returned.  An unknown accession is
    reported on stderr and then fails the assertion.  The last loaded
    accession and sequence are cached in `self.prevAcc`/`self.prevSeq`.
    """
    if acc == self.prevAcc:
        return self.prevSeq

    table = hgvs.refSeq2Hg19
    known = acc in table
    if not known:
        print >> sys.stderr, "accession %s not available." % (acc)
    assert known

    target = table[acc]
    with openFile(self.home + "/" + target + ".fa.gz") as fh:
        for (_, sequence) in readFasta(fh):
            # Only the first record of the file is used.
            self.prevAcc = acc
            self.prevSeq = sequence
            break
    return self.prevSeq
def __getitem__(self, acc):
    """
    Return the sequence for accession `acc`.

    Accessions present in `refSeq2Hg19` are translated through it; any
    other accession is used as the file stem unchanged.  The first
    record of `<home>/<stem>.fa.gz` is returned, and the last loaded
    accession and sequence are cached in `self.prevAcc`/`self.prevSeq`.
    """
    if acc != self.prevAcc:
        stem = refSeq2Hg19[acc] if acc in refSeq2Hg19 else acc
        path = self.home + "/" + stem + ".fa.gz"
        with openFile(path) as fh:
            for (_, sequence) in readFasta(fh):
                # Only the first record of the file is used.
                self.prevAcc = acc
                self.prevSeq = sequence
                break
    return self.prevSeq
def __getitem__(self, acc):
    """
    Return the sequence for accession `acc`.

    Accessions present in `refSeq2Hg19` are translated through it; any
    other accession is used as the file stem unchanged.  The sequence
    is the first record of `<home>/<stem>.fa`, or of
    `<home>/<stem>.fa.gz` when the uncompressed file does not exist.
    The last loaded accession and sequence are cached in
    `self.prevAcc`/`self.prevSeq`.
    """
    if acc == self.prevAcc:
        return self.prevSeq

    if acc in refSeq2Hg19:
        stem = refSeq2Hg19[acc]
    else:
        stem = acc

    plain = self.home + '/' + stem + '.fa'
    path = plain if os.path.exists(plain) else plain + '.gz'

    with openFile(path) as fh:
        for (_, sequence) in readFasta(fh):
            # Only the first record of the file is used.
            self.prevAcc = acc
            self.prevSeq = sequence
            break
    return self.prevSeq
def main(argv):
    """
    Summarise sampled k-mer multiplicities per contig and per file.

    For every FASTA file in `<input>`, the k-mers of each record that
    pass `sub(S, P, x)` are counted.  `summarize` is applied to each
    record's counts and to the counts pooled over the whole file, and
    the collected results are dumped as YAML on stdout.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    C = int(opts['-c'])
    Q = int(opts['-q'])
    S = int(opts['-S'])
    P = float(opts['-p'])
    verbose = opts['-v']

    # -s restricts counting to a single strand.
    both = not opts['-s']

    res = []
    for fn in opts['<input>']:
        contigs = []
        fres = {'file': fn, 'contigs': contigs}
        pooled = {}
        ncontig = 0
        with openFile(fn) as fh:
            for (nm, seq) in readFasta(fh):
                ncontig += 1

                scaff = {}
                for x in kmersList(K, seq, both):
                    if sub(S, P, x):
                        scaff[x] = scaff.get(x, 0) + 1

                summary = summarize(scaff, C, Q)
                summary['name'] = nm
                contigs.append(summary)

                # Fold this record's counts into the file-wide tally.
                for (x, c) in scaff.items():
                    pooled[x] = pooled.get(x, 0) + c
        fres['global'] = summarize(pooled, C, Q)
        res.append(fres)

    yaml.safe_dump(res, sys.stdout)
def main(argv): opts = docopt.docopt(__doc__, argv) K = int(opts['-k']) M = (1 << (2*K)) - 1 paired = True if opts['-s']: paired = False p = float(opts['-p']) T = int(M * p) if opts['-r']: refs = [] with openFile(opts['-r']) as f: for (nm, seq) in readFasta(f): refs += kmersList(K, seq, False) refs = set(refs) kill = set([]) for x in refs: y = rc(K, x) if y in refs: kill.add(x) kill.add(y) print >> sys.stderr, 'removing %d/%d' % (len(kill), len(refs)) refs -= set(kill) fwd = {} rev = {} for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']): fn = 0 for x in xs: if x in refs: fn += 1 ys = [rc(K, x) for x in xs] rn = 0 for y in ys: if y in refs: rn += 1 if fn + rn == 0: continue q = float(fn) / float(fn + rn) if random.random() < q: for x in xs: fwd[x] = 1 + fwd.get(x, 0) else: for y in ys: rev[y] = 1 + rev.get(y, 0) for (x,xc) in fwd.iteritems(): y = rc(K, x) yc = 0 if y in rev: yc = rev[y] del rev[y] print '%d\t%d' % (xc, yc) for (y,yc) in rev.iteritems(): print '%d\t%d' % (0, yc) return kx = {} for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']): for x in xs: if x in kx: kx[x] += 1 continue y = rc(K, x) z = murmer(min(x, y), 17) if (z & M) > T: continue kx[x] = 1 for x in kx.keys(): y = rc(K, x) if x > y: continue xc = kx[x] yc = kx.get(y, 0) if murmer(x, 17) >= murmer(y, 17): (a, b) = (x, y) (ac, bc) = (xc, yc) else: (a, b) = (y, x) (ac, bc) = (yc, xc) #print '%s\t%d\t%s\t%d' % (render(K, a), ac, render(K, b), bc) print '%d\t%d' % (ac, bc)
def main(argv): opts = docopt.docopt(__doc__, argv) fns = opts['<input>'] p = None if opts['-p'] is not None: p = float(opts['-p']) if len(fns) == 1 and isFasta(fns[0]): K = 25 seqs = [] with openFile(fns[0]) as f: for (nm, seq) in readFasta(f): xs = set(basics.kmers(K, seq, True)) xs = list(xs) xs.sort() xs = array.array('L', xs) seqs.append((nm.split()[0], xs)) Z = 1 if opts['-a']: Z = len(seqs) print len(seqs) for i in xrange(Z): xnm = seqs[i][0] xs = seqs[i][1] for j in xrange(i + 1, len(seqs)): ynm = seqs[j][0] ys = seqs[j][1] (isec, union, d) = jaccard(xs, ys) if p is None: print '%s\t%s\t%d\t%d\t%d\t%d\t%f' % ( xnm, ynm, len(xs), len(ys), isec, union, d) else: pv = logIx(p, isec + 1, (union - isec) + 1) / math.log(10) q05 = quantBeta(0.05, isec + 1, (union - isec) + 1) q95 = quantBeta(0.95, isec + 1, (union - isec) + 1) print '%s\t%s\t%d\t%d\t%d\t%d\t%f\t-%f\t+%f\t%f' % ( xnm, ynm, len(xs), len(ys), isec, union, d, d - q05, q95 - d, pv) sys.stdout.flush() return Z = 1 if opts['-a']: Z = len(fns) for i in xrange(Z): with kmers(fns[i], 'r') as z0: xK = z0.meta['K'] xs = array.array('L', readKmers(z0)) for j in xrange(i + 1, len(fns)): with kmers(fns[j], 'r') as z1: yK = z1.meta['K'] ys = array.array('L', readKmers(z1)) if xK != yK: print >> sys.stderr, 'mismatched K:', fns[j] sys.exit(1) (isec, union, d) = jaccard(xs, ys) if p is None: print '%s\t%s\t%d\t%d\t%d\t%d\t%f' % ( fns[i], fns[j], len(xs), len(ys), isec, union, d) else: pv = logIx(p, isec + 1, (union - isec) + 1) / math.log(10) q05 = quantBeta(0.05, isec + 1, (union - isec) + 1) q95 = quantBeta(0.95, isec + 1, (union - isec) + 1) print '%s\t%s\t%d\t%d\t%d\t%d\t%f\t-%f\t+%f\t%f' % ( fns[i], fns[j], len(xs), len(ys), isec, union, d, d - q05, q95 - d, pv) sys.stdout.flush()
def main(argv): opts = docopt.docopt(__doc__, argv) random.seed(17) K = int(opts['-k']) S = 2*(K-3) frameAnchors = {} knownStops = {} sequences = {} seqKmers = {} if opts['-r']: with openFile(opts['-r']) as f: for (nm,seq) in readFasta(f): sequences[nm] = seq # trim polyA tails seq = re.sub('AAAAAA*$', '', seq) seqKmers[nm] = set([]) for (x,p1) in kmersWithPosList(K, seq, False): seqKmers[nm].add(x) p = p1 - 1 w = p % 3 if x not in frameAnchors: frameAnchors[x] = set([]) frameAnchors[x].add((nm,p)) y = x & 63 if w == 0 and y in stops: if x not in knownStops: knownStops[x] = set([]) knownStops[x].add(nm) rn = 0 res = {} for fn in opts['<input>']: with openFile(fn) as f: for rd in readFastq(f): L = len(rd[1]) rn += 1 fwdAndRev = kmersWithPosLists(K, rd[1]) frames = {} possibleStops = {} for i in range(2): #print i, sorted([p for (x,p) in fwdAndRev[i]]) for (x,p) in fwdAndRev[i]: if x in frameAnchors: for (nm,q) in frameAnchors[x]: o = (q - p) k = (nm, o, i) frames[k] = 1 + frames.get(k, 0) if len(frames) == 0: continue n = sum(frames.values()) probs = [] for ((nm, off, strnd), cnt) in sorted(frames.items()): probs.append((float(cnt)/float(n), cnt, off, strnd, nm)) v = random.random() for (pv, cnt, off, strnd, nm) in probs: if v < pv: #print rd[1] #print proj(strnd, sequences[nm][off:off+len(rd[1])]) #print codons(off % 3, rd[1]), off for (x,p) in fwdAndRev[strnd]: if (p + off + K - 3) % 3 == 0 and (x & 63) in stops: if nm not in res: res[nm] = {} if x not in res[nm]: res[nm][x] = 0 res[nm][x] += 1 break v -= pv for (nm,stps) in res.iteritems(): for (x,c) in stps.iteritems(): (d,y) = nearest3(K, seqKmers[nm], x) if x in knownStops: k = 'known' else: k = 'novel' print '%s\t%s\t%d\t%d\t%s\t%s' % (k, render(K, x), c, d, render(K, y), nm)
def main(argv):
    """
    Build a positional k-mer index of query sequences (`-X`), or resolve
    an existing index against sample k-mer sets.

    With `-X`: the FASTA files in `<input>` are scanned twice with
    K = 27 and a casket is written to `<genes>` holding the metadata
    (K, lengths, names, sequences, `qacgt`) and four arrays: `S`
    (distinct k-mers), `T` (per-k-mer offsets), `U` (sequence numbers)
    and `V` (signed positions).

    Without `-X`: the casket `<genes>` is loaded and, for every k-mer
    set file in `<input>`, each query is processed and a tab-separated
    line is printed for queries whose best result passes the final
    threshold.
    """
    opts = docopt.docopt(__doc__, argv)

    if opts['-X']:
        K = 27
        S = []
        N = 0
        # Histogram of (x & 3) over all indexed k-mers.
        qacgt = [0, 0, 0, 0]
        # Pass 1: gather every k-mer of every sufficiently long record.
        # NOTE(review): this branch uses the builtin open() rather than
        # an openFile helper, so compressed input is presumably not
        # handled here -- confirm.
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        continue
                    for (x, p) in kmersWithPos(K, seq, True):
                        S.append(x)
                        qacgt[x & 3] += 1
                        N += 1
        S.sort()
        # NOTE(review): raises ZeroDivisionError when no k-mers were
        # collected (N == 0).
        qacgt = [float(c) / float(N) for c in qacgt]
        S = sparse(2 * K, array.array('L', uniq(S)))
        lens = []
        nms = []
        seqs = []
        n = 0
        # tmp[r] collects the (sequence number, position) pairs of the
        # k-mer with rank r.
        tmp = [[] for i in xrange(S.count())]
        # Pass 2: record where each k-mer occurs.
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        print >> sys.stderr, "warning: `%s' skipped" % (nm, )
                        continue
                    nms.append(nm)
                    seqs.append(seq)
                    lens.append(len(seq))
                    for (x, p) in kmersWithPos(K, seq, True):
                        r = S.rank(x)
                        tmp[r].append((n, p))
                    n += 1
        # Flatten tmp: T[r]..T[r+1] delimits rank r's entries in the
        # parallel arrays U (sequence numbers) and V (positions).
        T = array.array('I', [])
        U = array.array('I', [])
        V = array.array('i', [])
        t = 0
        for nps in tmp:
            T.append(t)
            t += len(nps)
            for (n, p) in nps:
                U.append(n)
                V.append(p)
        T.append(t)
        del tmp

        gfn = opts['<genes>']
        with casket(gfn, 'w') as z:
            meta = {}
            meta['K'] = K
            meta['lens'] = lens
            meta['qacgt'] = qacgt
            meta['nms'] = nms
            meta['seqs'] = seqs
            z.add_content('__meta__', json.dumps(meta))
            write64(z, S.xs, 'S')
            write32(z, T, 'T')
            write32(z, U, 'U')
            write32s(z, V, 'V')

        return

    print >> sys.stderr, "loading..."

    gfn = opts['<genes>']
    with casket(gfn, 'r') as z:
        mf = z.open('__meta__')
        meta = json.load(mf)
        K = meta['K']
        lens = meta['lens']
        qacgt = meta['qacgt']
        nms = meta['nms']
        seqs = meta['seqs']

        S = read64(z, 'S')
        S = sparse(2 * K, S)
        T = read32(z, 'T')
        U = read32(z, 'U')
        V = read32s(z, 'V')

    print >> sys.stderr, "done."

    for fn in opts['<input>']:
        # L and Y have one slot per indexed k-mer and are filled in by
        # resolveAll; below they are read as a length (L) and a sample
        # k-mer (Y) for each indexed k-mer.
        L = array.array('B', [0 for i in xrange(S.count())])
        Y = array.array('L', [0 for i in xrange(S.count())])
        with kmers(fn, 'r') as z:
            sacgt = z.meta['acgt']
            xs = readKmers(z)
            X = array.array('L', xs)
        M = len(X)
        resolveAll(K, S, L, Y, X)
        X = sparse(2 * K, X)

        # Dot product of the index's and the sample's 4-bin distributions.
        g = sum([qp * sp for (qp, sp) in zip(qacgt, sacgt)])
        print >> sys.stderr, "g =", g
        # One value of null() per possible length 0..K.
        nm = [null(g, M, j) for j in range(0, K + 1)]

        # counts for computing distribution of prefix lengths
        cnt = [[0 for j in xrange(K + 1)] for i in xrange(len(nms))]

        # the k-mers that we pulled by lcp from the sample
        # for each position of each query.
        P = [
            array.array('L', [0 for j in xrange(lens[i] - K + 1)])
            for i in xrange(len(lens))
        ]

        # the length of the lcp for each position of each query.
        Q = [
            array.array('B', [0 for j in xrange(lens[i] - K + 1)])
            for i in xrange(len(lens))
        ]

        # Scatter the per-k-mer results back onto query positions,
        # keeping the longest match seen at each position.
        for i in xrange(S.count()):
            for j in xrange(T[i], T[i + 1]):
                n = U[j]
                p = V[j]
                y = Y[i]
                l = L[i]
                cnt[n][l] += 1
                # Positive positions are shifted down by one; other
                # values are mapped through -(p + 1) and the k-mer is
                # reverse complemented.
                # NOTE(review): presumably the sign encodes the strand
                # -- confirm against kmersWithPos.
                if p > 0:
                    p -= 1
                else:
                    p = -(p + 1)
                    y = rc(K, y)
                if l > Q[n][p]:
                    Q[n][p] = l
                    P[n][p] = y

        for i in xrange(len(nms)):
            # iterate over the queries
            qc = math.log(K * 0.05 / float(lens[i] - K + 1) / 2)

            # Link up "de Bruijn" sequences: adjacent positions are
            # joined when ham() reports no difference between the
            # previous k-mer's low 2K-2 bits and this k-mer's high
            # 2K-2 bits.
            m = (1 << (2 * K - 2)) - 1
            py = 0
            u = unionfind()
            for j in xrange(lens[i] - K + 1):
                x = P[i][j]
                y = x >> 2
                if j > 0:
                    d = ham(py, y)
                    if d == 0:
                        u.union(j - 1, j)
                py = x & m

            # Gather up the de Bruijn fragments
            udx = {}
            for j in xrange(lens[i] - K + 1):
                v = u.find(j)
                if v not in udx:
                    udx[v] = []
                udx[v].append(j)

            # Index the left hand k-mers
            idxLhs = {}
            kx = []
            for (jx, js) in udx.iteritems():
                q = 0
                for j in js:
                    q += math.log1p(-nm[Q[i][j]])
                # Fragments whose summed log term exceeds the threshold
                # are dropped.
                if q > math.log(0.05 / len(js)):
                    continue
                kx.append((-len(js), jx))
                idxLhs[P[i][js[0]]] = jx
            # Longest fragments first (lengths are stored negated).
            kx.sort()

            # Attempt to link up fragments
            links = {}
            for (_, jx) in kx:
                jR = udx[jx][-1]
                # NOTE(review): positions come from
                # xrange(lens[i] - K + 1), so jR can never equal
                # lens[i] - K + 1 and this guard never fires --
                # possibly an off-by-one.
                if jR == lens[i] - K + 1:
                    continue
                x = P[i][jR]
                xs = []
                lnk = None
                # Follow unambiguous successors in the sample for at
                # most 100 steps, stopping at the first k-mer that
                # starts a kept fragment.
                for k in xrange(100):
                    ys = succ(K, X, x)
                    if len(ys) != 1:
                        break
                    x = ys[0]
                    if x in idxLhs:
                        lnk = idxLhs[x]
                        break
                    xs.append(x)
                if lnk is not None:
                    links[jx] = xs
                    u.union(jx, lnk)

            # Gather up the linked fragments
            vdx = {}
            for j in [jx for (_, jx) in kx]:
                v = u.find(j)
                if v not in vdx:
                    vdx[v] = []
                vdx[v].append(j)

            res = []
            for (jxx, jxs) in vdx.iteritems():
                # Order the fragments by start position
                fs = [(udx[jx][0], jx) for jx in jxs]
                fs.sort()
                # sxs is the assembled list of (k-mer, length) pairs;
                # (0, 0) marks a position with nothing assigned.
                sxs = []
                for fj in xrange(len(fs)):
                    (_, jx) = fs[fj]
                    beg = udx[jx][0]
                    end = udx[jx][-1] + 1
                    if fj == 0:
                        for j in xrange(beg):
                            sxs.append((0, 0))
                    xs = links.get(jx, None)
                    for j in xrange(beg, end):
                        x = P[i][j]
                        l = Q[i][j]
                        sxs.append((x, l))
                    if xs:
                        # NOTE(review): 27 is hard-coded here; it equals
                        # the K used by -X but K is otherwise read from
                        # the casket metadata.
                        for x in xs:
                            sxs.append((x, 27))
                    else:
                        # No link: pad up to the next fragment, or to
                        # the end of the query.
                        if fj < len(fs) - 1:
                            nxt = fs[fj + 1][0]
                        else:
                            nxt = lens[i] - K + 1
                        for j in xrange(end, nxt):
                            sxs.append((0, 0))
                # seq[pos][base] accumulates, for each base position, a
                # log term from every k-mer covering that position.
                seq = [[0, 0, 0, 0] for j in xrange(len(sxs) + K - 1)]
                for j in xrange(len(sxs)):
                    (x, l) = sxs[j]
                    p = math.log1p(-nm[l])
                    for k in xrange(K):
                        seq[j + K - k - 1][x & 3] += p
                        x >>= 2
                ax = []
                p = None
                inf = False
                for j in xrange(len(seq)):
                    # b is a 4-bit mask of the bases whose accumulated
                    # value is below qc; fasta() renders the mask.
                    b = 0
                    for k in xrange(4):
                        if seq[j][k] < qc:
                            b |= 1 << k
                    ax.append(fasta(b))
                    ssj = sum(seq[j])
                    if p is None:
                        p = ssj
                    else:
                        p = logAdd(p, ssj)
                    if ssj > -1e-300:
                        inf = True
                dst = counts2cdf(cnt[i])
                (_, kd) = ksDistance2(dst, nm)
                df = math.ceil(len(seq) / float(K))
                if inf:
                    q = 1e300
                    pv = 0.0
                else:
                    q = 2 * math.exp(p)
                    pv = chi2(df, q)
                res.append((pv, q, kd, ''.join(ax)))

            if len(res) == 0:
                continue

            # Smallest pv first; only the best result is reported.
            res.sort()
            # NOTE(review): pv is compared with -2 and then divided by
            # log(10), which suggests chi2() returns a natural-log
            # p-value -- confirm.
            if res[0][0] < -2:
                #ed = lev(seqs[i], res[0][2])
                ed = 0
                pv = res[0][0] / math.log(10)
                c2 = res[0][1]
                kd = res[0][2]
                a = res[0][3]
                print '%d\t%d\t%d\t%g\t%g\t%g\t%s\t%s' % (
                    i, lens[i], len(a), kd, c2, pv, nms[i], a)
                sys.stdout.flush()
def main(argv):
    """
    Look for tandem duplications of reference sequences in reads.

    The references come from the FASTA file `<sequences>`; reads come
    from `<input>`.  K (`-k`) must be even: every K-mer is treated as a
    pair of J-mers with J = K / 2.  Reads sharing a K-mer with a
    reference contribute their K-mer counts to that reference, and
    K-mer pairings reported by `findDup` are turned into rows with an
    HGVS-style `c.<start>_<end>dup` description.

    NOTE(review): as laid out here, a bare `continue` after the path
    tracing step skips the duplication reporting for every reference;
    see the note at that statement.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    if (K & 1) != 0:
        print >> sys.stderr, "K must be even."
        return

    minCov = int(opts['-m'])

    verbose = opts['-v']

    # J is half of K; S is the bit width of a J-mer (two bits per base)
    # and Mj masks the low J-mer of a K-mer.
    J = K // 2
    S = 2*(K - J)
    Mj = (1 << (2*J)) - 1

    names = []
    seqs = {}       # reference name -> sequence
    bait = {}       # K-mer -> set of reference numbers
    wtFst = []      # per reference: leading J-mer -> set of trailing J-mers
    wtLst = []      # per reference: trailing J-mer -> set of leading J-mers
    posIdx = []     # per reference: J-mer -> list of positions
    rds = []        # per reference: reads that hit it
    with openFile(opts['<sequences>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(names)
            names.append(nm)
            seqs[nm] = seq
            wf = {}
            wl = {}
            for x in kmersList(K, seq, False):
                if x not in bait:
                    bait[x] = set([])
                bait[x].add(n)

                # Split the K-mer into its leading and trailing J-mers.
                y0 = x >> S
                y1 = x & Mj
                #print '- %s\t%s\t%s' % (render(K, x), render(J, y0), render(J, y1))
                if y0 not in wf:
                    wf[y0] = set([])
                wf[y0].add(y1)
                if y1 not in wl:
                    wl[y1] = set([])
                wl[y1].add(y0)
            wtFst.append(wf)
            wtLst.append(wl)

            px = {}
            for (x,p) in kmersWithPosList(J, seq, False):
                if x not in px:
                    px[x] = []
                px[x].append(p)
            posIdx.append(px)

            # Run the detector on the reference against itself and warn
            # about anything it reports.
            for (a, b, c, d) in findDup(wtFst[n], wtLst[n], wtFst[n], wtLst[n]):
                pps = positions(posIdx[n], J, a, b, c, d)
                if pps is None:
                    continue
                for pp in pps:
                    ab = a << S | b
                    cb = c << S | b
                    cd = c << S | d
                    dd = pp[2] - pp[0]
                    print >> sys.stderr, 'warning: phantom dumplication: %s-%s-%s (%d)' % (render(K, ab), render(K, cb), render(K, cd), dd)

            rds.append([])

    N = len(names)

    L = None
    # X[n] maps K-mer -> count over the reads that hit reference n.
    X = [{} for n in range(N)]
    for itm in reads(opts['<input>'], K=K, reads=True, kmers=True, both=True, verbose=verbose):
        rd = itm.reads[0]
        L = len(rd)
        xs = itm.kmers[0]

        hits = set([])
        for x in xs:
            if x in bait:
                hits |= bait[x]

        # Every K-mer of a hitting read is counted, not only the ones
        # shared with the reference.
        for n in hits:
            for x in xs:
                if x not in X[n]:
                    X[n][x] = 0
                X[n][x] += 1
            rds[n].append(rd)

    hdrShown = False
    vn = 0
    for n in range(N):
        # Keep only K-mers seen at least 10 times.
        xs = {}
        for (x,c) in X[n].iteritems():
            if c >= 10:
                xs[x] = c

        seq = seqs[names[n]]
        rngs = []
        st = None
        en = None
        inside = False
        # Print a coverage track for the reference: '.' where the
        # reference K-mer was kept, 'X' where it was not.
        xx = []
        for x in kmersList(K, seq, False):
            if x in xs:
                xx.append('.')
            else:
                xx.append('X')
        print ''.join(xx)
        # Collect (st, en) pairs bracketing each run of missing K-mers:
        # st is the last kept K-mer before the run and en the first
        # kept K-mer after it (None when the run reaches an end of the
        # reference).
        for x in kmersList(K, seq, False):
            if not inside:
                if x in xs:
                    st = x
                else:
                    inside = True
            else:
                if x in xs:
                    en = x
                    rngs.append((st, en))
                    st = x
                    en = None
                    inside = False
        if inside:
            rngs.append((st, en))

        # Print the paths that Pather.trace yields between the two
        # K-mers bracketing each gap.
        pthr = Pather(K, xs)
        for (x,y) in rngs:
            if x is None or y is None:
                continue
            print render(K, x), render(K, y)
            for p in pthr.trace(x, y, 100):
                print renderPath(K, p)

        # NOTE(review): this `continue` is read as belonging to the
        # `for n` loop, which makes the remainder of the loop body
        # unreachable.  The original indentation was lost, so confirm
        # the intended nesting before relying on the code below.
        continue

        fst = {}
        lst = {}
        for (x,c) in xs.iteritems():
            #if c < 5:
            #    continue
            y0 = x >> S
            y1 = x & Mj
            if y0 not in fst:
                fst[y0] = []
            fst[y0].append(y1)
            if y1 not in lst:
                lst[y1] = []
            lst[y1].append(y0)

        #for (a, b, c, d) in findDupDeNovo(fst, lst):
        for (a, b, c, d) in findDup(wtFst[n], wtLst[n], fst, lst):
            #continue
            pps = positions(posIdx[n], J, a, b, c, d)
            if pps is None:
                continue
            for pp in pps:
                # The three K-mers built from the four J-mers.
                ab = a << S | b
                cb = c << S | b
                cd = c << S | d
                #print [(render(J, w), p) for (w,p) in zip([a, b, c, d], pps)]
                dd = pp[2] - pp[0]

                # Unless -a is given, keep only lengths that are a
                # multiple of three.
                if not opts['-a'] and dd % 3 != 0:
                    continue

                if opts['-s']:
                    # Require that both interpolated paths exist and
                    # agree once J bases are trimmed from each end.
                    fstPath = interpolate(K, xs, ab, cb, dd+1)
                    sndPath = interpolate(K, xs, cb, cd, dd+1)

                    if fstPath is None:
                        continue
                    if sndPath is None:
                        continue

                    if fstPath[J:-J] != sndPath[J:-J]:
                        continue

                pa = pp[0]
                pb = pp[1]
                pc = pp[2]
                pd = pp[3]

                cab = xs.get(ab, 0)
                ccb = xs.get(cb, 0)
                ccd = xs.get(cd, 0)

                if cab < minCov:
                    continue
                if ccb < minCov:
                    continue
                if ccd < minCov:
                    continue

                m = (cab + ccd) / 2.0
                # Assume the true std dev is 10% of the mean
                # The reported 'vaf' is the middle K-mer's count
                # relative to the mean of the two flanking counts.
                w = ccb / m

                hgvs = '%s:c.%d_%ddup' % (names[n], pb, pd - 1)
                v = Duplication(names[n], pb, pd-1, seqs)
                if opts['-A']:
                    showAnchoredReads(K, {ab:'AB', cb:'CB', cd:'CD'}, rds[n])

                vn += 1

                # Column headers, formats and values are built up in
                # parallel and printed as one tab-separated row.
                hdrs = ['n']
                fmts = ['%d']
                outs = [vn]

                hdrs += ['left', 'leftCov']
                fmts += ['%s','%d']
                outs += [render(K, ab), cab]

                hdrs += ['mid', 'midCov']
                fmts += ['%s','%d']
                outs += [render(K, cb), ccb]

                hdrs += ['right', 'rightCov']
                fmts += ['%s','%d']
                outs += [render(K, cd), ccd]

                hdrs += ['len']
                fmts += ['%d']
                outs += [dd]

                hdrs += ['vaf']
                fmts += ['%g']
                outs += [w]

                hdrs += ['hgvs']
                fmts += ['%s']
                outs += [hgvs]

                if not hdrShown:
                    hdrShown = True
                    print '\t'.join(hdrs)
                print '\t'.join(fmts) % tuple(outs)
def main(argv): opts = docopt.docopt(__doc__, argv) K = 25 nms = [] idx = {} for (nm, seq) in readFasta(openFile(opts['<baits>'])): n = len(nms) nms.append(nm) for x in kmersList(K, seq, True): if x not in idx: idx[x] = set([]) idx[x].add(n) for x in idx.keys(): idx[x] = list(idx[x]) idx[x].sort() anti = set([]) if opts['-U']: with openFile(opts['-U']) as f: for (nm, seq) in readFasta(f): for x in kmersList(K, seq, True): anti.add(x) rn = 0 if opts['-p']: hist = {} for (fn1, fn2) in pairs(opts['<input>']): tmps = [(tmpfile('_1.fastq'), tmpfile('_2.fastq')) for i in xrange(len(nms))] cache = [[[], []] for i in xrange(len(nms))] counts = [0 for i in xrange(len(nms))] with openFile(fn1) as f1, openFile(fn2) as f2: for fq1, fq2 in both(readFastq(f1), readFastq(f2)): hits = set([]) pushup = False for x in kmersList(K, fq1[1]): if x in anti: pushup = True break for i in idx.get(x, []): hits.add(i) for x in kmersList(K, fq2[1]): if x in anti: pushup = True break for i in idx.get(x, []): hits.add(i) if pushup: continue n = len(hits) hist[n] = 1 + hist.get(n, 0) for i in hits: counts[i] += 1 cache[i][0].append(fq1) cache[i][1].append(fq2) if len(cache[i][0]) >= 1024: with open(tmps[i][0], 'a') as f: for rd in cache[i][0]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] with open(tmps[i][1], 'a') as f: for rd in cache[i][1]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] cache[i][0] = [] cache[i][1] = [] for i in xrange(len(cache)): if len(cache[i][0]) > 0: with open(tmps[i][0], 'a') as f: for rd in cache[i][0]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] with open(tmps[i][1], 'a') as f: for rd in cache[i][1]: print >> f, rd[0] print >> f, rd[1] print >> f, rd[2] print >> f, rd[3] cache[i][0] = [] cache[i][1] = [] with zipfile.ZipFile(opts['<output>'], 'w', zipfile.ZIP_DEFLATED) as z: for i in xrange(len(nms)): if counts[i] > 0: pth = '/'.join(nms[i].split()) z.write(tmps[i][0], pth + '/' + fn1) 
os.remove(tmps[i][0]) z.write(tmps[i][1], pth + '/' + fn2) os.remove(tmps[i][1]) hist = hist.items() hist.sort() for (n, f) in hist: print '%d\t%d' % (n, f) else: raise "not implemented"
def next(self):
    """
    Return the next `Reads` item, advancing through the input files.

    Files are consumed in groups of `self.N`: one parser is opened per
    file of the current group (FASTA or FASTQ, chosen by `isFasta`) and
    one record is drawn from each parser per call.  When a parser runs
    dry the group is closed and the next one is opened.  StopIteration
    is raised when no complete group of files remains.

    Depending on `self.reads` and `self.kmers`, the returned object
    carries the raw records and/or their k-mers.
    """
    self.readNum += 1
    bar = self.progress
    if (self.readNum & self.M) == 0 and bar is not None:
        bar.update(self.M)

    while True:
        if self.currParsers is None:
            # Step to the first, or the next, group of N files.
            if self.currFilesInd is None:
                self.currFilesInd = 0
            else:
                self.currFilesInd += self.N

            if self.progress is not None:
                self.progress.update(self.readNum & self.M)

            first = self.currFilesInd
            if first + (self.N - 1) >= len(self.files):
                raise StopIteration
            group = range(first, first + self.N)

            if self.verbose:
                pfx = ' & '.join([basename(self.files[k]) for k in group])
                self.progress = tqdm(unit=' reads', unit_scale=True)
                self.progress.set_postfix(reading=pfx, refresh=True)

            self.currParsers = []
            for k in group:
                fn = self.files[k]
                f = openFile(fn)
                parse = readFasta if isFasta(fn) else readFastq
                self.currParsers.append(parse(f))

        self.currReads = []
        try:
            for parser in self.currParsers:
                self.currReads.append(parser.next())
        except StopIteration:
            # A partially filled record list means an earlier file of
            # the group still had data when a later one ran out.
            if self.currReads:
                print >> sys.stderr, 'warning: files had unequal length'
            self.currParsers = None
            if self.progress is not None:
                self.progress.close()
                self.progress = None
            continue

        if self.kmers:
            self.currKmers = []
            for rd in self.currReads:
                if self.fwdOnly:
                    kms = kmersList(self.K, rd[1], False)
                elif self.both:
                    kms = kmersList(self.K, rd[1], True)
                else:
                    assert self.separate
                    kms = kmersLists(self.K, rd[1])
                self.currKmers.append(kms)

        res = Reads()
        if self.reads:
            res.reads = self.currReads
        if self.kmers:
            res.kmers = self.currKmers
        return res
def main(argv):
    """
    Count the k-mers of the reads in `<input>` and write them, with
    their counts, to the k-mer container `<output>`.

    Options read here:
      <k>  k-mer length.
      -m   memory budget in megabytes (default 32); once the
           accumulator reports at least half of it, the accumulated
           counts are spilled to a temporary casket.
      -D   keep only k-mers accepted by `sub(S, D, x)`.
      -S   first argument passed to `sub` (default 0).
      -C   FASTA file; keep only reads sharing a k-mer with it.
      -v   verbose progress.

    The output metadata records K, the count histogram ('hist'), the
    normalised distribution of `x & 3` over all k-mers seen ('acgt')
    and the number of reads.
    """
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['<k>'])

    out = opts['<output>']

    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    buf = KmerAccumulator2(K)
    tmps = []
    acgt = [0, 0, 0, 0]

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

    S = 0
    if opts['-S'] is not None:
        S = int(opts['-S'])

    # Memoised outcomes of sub(); emptied whenever they grow too large.
    cacheYes = set()
    cacheNo = set()

    B = opts['-C']
    if B is not None:
        xs = set()
        # Use a context manager so that the file is closed.
        with openFile(B) as f:
            for (nm, seq) in readFasta(f):
                xs.update(kmersList(K, seq, True))
        B = xs

    def spill(z):
        # Write the accumulator's contents as the next temporary run.
        fn = 'tmps-%d' % (len(tmps), )
        tmps.append(fn)
        writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)

    tmpnm = tmpfile('.pmc')
    try:
        with casket(tmpnm, 'w') as z:
            nr = 0
            for itm in reads(opts['<input>'], K=K, pairs=False, reads=False, kmers=True, both=True, verbose=verbose):
                xs = itm.kmers[0]
                for x in xs:
                    acgt[x & 3] += 1
                if d is not None:
                    for x in xs:
                        if x in cacheNo:
                            continue
                        if x not in cacheYes:
                            if not sub(S, d, x):
                                cacheNo.add(x)
                                continue
                            cacheYes.add(x)
                        buf.add(x)
                    if len(cacheYes) > 1000000:
                        cacheYes = set()
                    if len(cacheNo) > 1000000:
                        cacheNo = set()
                elif B is not None:
                    # Keep the whole read as soon as one k-mer matches.
                    if any(x in B for x in xs):
                        buf.addList(xs)
                else:
                    buf.addList(xs)

                nr += 1
                # The memory check is made only every 1024 reads.
                if (nr & 1023) == 0 and buf.mem() >= Z // 2:
                    spill(z)
                    buf.clear()

            # If anything was spilled, the remainder must be spilled as
            # well so that the merge below sees all of the data.
            if len(tmps) and len(buf):
                spill(z)
                buf = []

        with zotk.kmers(out, 'w') as z:
            h = {}
            if len(tmps) == 0:
                # Everything fitted in memory: write it out directly.
                for c in buf.countsOnly():
                    h[c] = 1 + h.get(c, 0)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly())
            elif len(tmps) == 1:
                # NOTE(review): the histogram `h` is left empty on this
                # path, unlike the other two.
                with casket(tmpnm, 'r') as z0:
                    writeKmersAndCounts(z, readKmersAndCounts(z0, tmps[0]))
            else:
                with casket(tmpnm, 'r') as z0:
                    xss = [readKmersAndCounts(z0, t) for t in tmps]
                    mergeNinto(K, xss, h, z)
            # Guard against input without any k-mers, which would
            # otherwise divide by zero.
            total = float(sum(acgt))
            if total > 0:
                acgt = [c / total for c in acgt]
            else:
                acgt = [0.0, 0.0, 0.0, 0.0]
            z.meta['K'] = K
            z.meta['kmers'] = 'kmers'
            z.meta['counts'] = 'counts'
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
            z.meta['reads'] = nr
    finally:
        # Remove the temporary casket even when processing fails.
        if os.path.exists(tmpnm):
            os.remove(tmpnm)