def main(argv):
    """
    Write a sampled copy of a k-mer/count container.

    The options are parsed from `argv` with docopt.  `-P` overrides the
    default sampling parameter of 0.01 and `-S` supplies an integer seed.
    Without `-D` the k-mers and counts are passed through `sampleR`, after
    seeding the `random` module if a seed was given; with `-D` they are
    passed through `sampleD`, which is handed the seed directly (0 when no
    seed was given).

    The output container inherits the input's metadata apart from the
    'kmers' and 'counts' entries, and its 'hist' entry is set to the
    dictionary that was handed to the sampler.
    """
    opts = docopt.docopt(__doc__, argv)

    prob = 0.01 if opts['-P'] is None else float(opts['-P'])
    src = opts['<input>']
    dst = opts['<output>']
    seedArg = opts['-S']

    with container(dst, 'w') as z:
        h = {}
        with container(src, 'r') as z0:
            K = z0.meta['K']
            z.meta = z0.meta.copy()
            del z.meta['kmers']
            del z.meta['counts']
            xs = readKmersAndCounts(z0)
            if opts['-D'] is None:
                # The random sampler draws from the `random` module, so
                # the seed (if any) is applied globally.
                if seedArg is not None:
                    random.seed(long(seedArg))
                sampled = sampleR(prob, xs, h)
            else:
                seed = 0 if seedArg is None else long(seedArg)
                sampled = sampleD(prob, seed, xs, h)
            writeKmersAndCounts(K, sampled, z)
        z.meta['hist'] = h
def test_rw_2():
    """
    Append values through a `vecs.writer64` object and make no further call
    on the writer; reading the vector back is then expected to raise
    KeyError.
    """
    K = 27
    top = (1 << (2 * K)) - 1
    N = 100000
    random.seed(17)
    values = []
    for _ in xrange(N):
        values.append(random.randint(0, top))

    path = tmpfile()
    with container.container(path, 'w') as zw:
        sink = vecs.writer64(zw, "wibble")
        for v in values:
            sink.append(v)

    with pytest.raises(KeyError):
        with container.container(path, 'r') as zr:
            list(vecs.read64(zr, 'wibble', N))

    os.remove(path)
def main(argv): opts = docopt.docopt(__doc__, argv) if opts['-X']: K = 27 if opts['-K']: K = int(opts['-K']) buildIndex(K, opts['<input>'], opts['<alleles>']) return idx = index(opts['<alleles>']) for inp in opts['<input>']: with container(inp, 'r') as z: K0 = z.meta['K'] if K0 != idx.K: print >> sys.stderr, 'Input "%d" has different k to index' % ( inp, ) sys.exit(1) xs = readKmers(z) cs = [idx.lens[i] for i in xrange(len(idx.lens))] for x in xs: for j in idx[x]: cs[j] -= 1 for j in xrange(len(idx.lens)): assert cs[j] >= 0 if cs[j] == 0: print '%s\t%d\t%s' % (inp, j, idx.names[j])
def test_merg2_0():
    """
    Merge two sorted k-mer/count vectors with `merge2` and check both the
    merged vector and the histogram it fills in against values computed
    directly with dictionaries.

    The two temporary containers are removed when the test finishes,
    whether it passes or fails.
    """
    # Imported locally so the clean-up does not depend on the module's
    # import list.
    import os

    K = 27
    M = (1 << (2 * K)) - 1
    N = 100000
    random.seed(17)
    xs = [(random.randint(0, M), pois(10)) for i in xrange(N)]
    xs.sort()
    ys = [(random.randint(0, M), pois(10)) for i in xrange(N)]
    ys.sort()

    nm0 = tmpfile()
    nm1 = None
    try:
        with container(nm0, 'w') as z:
            writeKmersAndCounts(K, xs, z, 'xs')
            writeKmersAndCounts(K, ys, z, 'ys')

        nm1 = tmpfile()
        h = {}
        with container(nm0, 'r') as z0, container(nm1, 'w') as z:
            merge2(z, K,
                   readKmersAndCounts(z0, 'xs'),
                   readKmersAndCounts(z0, 'ys'),
                   h, 'zs')
        h = sorted(h.items())

        # Expected merged vector: the counts of equal k-mers are summed.
        ws = {}
        for (x, c) in xs:
            ws[x] = c + ws.get(x, 0)
        for (y, c) in ys:
            ws[y] = c + ws.get(y, 0)
        ws = sorted(ws.items())

        with container(nm1, 'r') as z:
            zs = list(readKmersAndCounts(z, 'zs'))

        assert len(ws) == len(zs)
        for i in xrange(len(ws)):
            assert ws[i] == zs[i]

        # Expected histogram: the number of k-mers having each count.
        h1 = {}
        for (_, c) in ws:
            h1[c] = 1 + h1.get(c, 0)
        h1 = sorted(h1.items())

        assert len(h) == len(h1)
        for i in xrange(len(h)):
            assert h[i] == h1[i]
    finally:
        for nm in (nm0, nm1):
            if nm is not None and os.path.exists(nm):
                os.remove(nm)
def buildIndex(K, inputs, output):
    """
    Create a new k-mer index.

    The FASTA files named in the list `inputs` are read in and the `K`
    length k-mers and their reverse complements are extracted and
    collated to create an index that maps from k-mer to sequence number
    (numbering from 0, in the order the sequences are read).  The `names`
    member of the KmerIndex object can be used to retrieve the name from
    the sequence number.

    The container `output` receives the collated k-mers, an 'offsets'
    vector, a 'postings' vector of sequence numbers, a 'lens' vector with
    the number of k-mers kept for each sequence, and the sequence names in
    its metadata.
    """
    records = []
    for path in inputs:
        with openFile(path) as f:
            records.extend(readFasta(f))

    allKmers = []
    names = []
    lens = array.array('I', [])
    for (i, (name, seq)) in enumerate(records):
        names.append(name)
        ks = sorted(kmers(K, seq, True))
        uniq(ks)
        # The sequence text is no longer needed; keep the k-mers instead.
        records[i] = [name, ks]
        lens.append(len(ks))
        allKmers.extend(ks)

    allKmers.sort()
    uniq(allKmers)
    S = sparse(2*K, allKmers)

    # First pass: T[r] counts the postings for the k-mer with rank r.
    T = array.array('I', [0]) * (S.count() + 1)
    for (_, ks) in records:
        for x in ks:
            T[S.rank(x)] += 1

    # Turn the counts into starting offsets (an exclusive running sum).
    total = 0
    for r in xrange(len(T)):
        c = T[r]
        T[r] = total
        total += c

    # Second pass: drop each sequence number into the next free slot of
    # the posting range belonging to the k-mer.
    cursor = list(T)
    U = array.array('H', [0]) * total
    for (i, (_, ks)) in enumerate(records):
        for x in ks:
            r = S.rank(x)
            U[cursor[r]] = i
            cursor[r] += 1

    with container(output, 'w') as z:
        writeKmers(K, S.xs, z)
        z.meta['T'] = write32(z, T, 'offsets')
        z.meta['U'] = write16(z, U, 'postings')
        z.meta['lens'] = write32(z, lens, 'lens')
        z.meta['names'] = names
def index(fn):
    """
    Open the container named `fn` for reading, construct a KmerIndex
    object from it, and return that object.
    """
    with container(fn, 'r') as z:
        return KmerIndex(z)
def test_std_0():
    """
    Write random (k-mer, count) pairs with `std.writeKmersAndCounts` and
    check that `std.readKmersAndCounts` returns the same pairs in the same
    order.
    """
    K = 27
    top = (1 << (2 * K)) - 1
    N = 100000
    random.seed(17)
    pairsIn = []
    for _ in xrange(N):
        # A tuple display evaluates left to right: the k-mer is drawn
        # before the count.
        pairsIn.append((random.randint(0, top), pois(10)))

    path = tmpfile()
    with container.container(path, 'w') as zw:
        std.writeKmersAndCounts(K, pairsIn, zw, 'wibble')

    with container.container(path, 'r') as zr:
        pairsOut = list(std.readKmersAndCounts(zr, 'wibble'))

    assert len(pairsOut) == N
    for (expected, actual) in zip(pairsIn, pairsOut):
        assert expected == actual

    os.remove(path)
def test_rw_1():
    """
    Write random 64 bit values with `vecs.write64` and check that
    `vecs.read64` returns the same values in the same order.
    """
    K = 27
    top = (1 << (2 * K)) - 1
    N = 100000
    random.seed(17)
    values = []
    for _ in xrange(N):
        values.append(random.randint(0, top))

    path = tmpfile()
    with container.container(path, 'w') as zw:
        vecs.write64(zw, values, "wibble")

    with container.container(path, 'r') as zr:
        readBack = list(vecs.read64(zr, 'wibble', N))

    assert len(readBack) == N
    for (expected, actual) in zip(values, readBack):
        assert expected == actual

    os.remove(path)
def main(argv): opts = docopt.docopt(__doc__, argv) for inp in opts['<input>']: with container(inp, 'r') as z: if 'hist' in z.meta: h = z.meta['hist'].items() h.sort() for (f,c) in h: print '%s\t%d\t%d' % (inp, f, c)
def main(argv):
    """
    Project the k-mers of `<input>` on to those of `<ref>`.

    The reference k-mers are loaded into memory.  If the input container
    has a 'counts' metadata entry its k-mers and counts are passed with
    the reference to `project2` and written with counts; otherwise its
    k-mers are passed to `project1` and written without counts.

    Exits with status 1 if the input and the reference have different K.
    """
    opts = docopt.docopt(__doc__, argv)

    with container(opts['<ref>'], 'r') as zr:
        K = zr.meta['K']
        ref = array.array('L', readKmers(zr))

    with container(opts['<input>'], 'r') as zi:
        Kin = zi.meta['K']
        if Kin != K:
            print >> sys.stderr, "mismatched K (%d)" % (Kin, )
            sys.exit(1)

        with container(opts['<output>'], 'w') as zo:
            if 'counts' not in zi.meta:
                writeKmers(K, project1(ref, readKmers(zi)), zo)
            else:
                writeKmersAndCounts(
                    K, project2(ref, readKmersAndCounts(zi)), zo)
def main(argv):
    """
    Expand the k-mers of `<ref>` and write the result to `<output>`.

    The expansion is chosen by the options: `-H d` uses
    `hamming(K, d, xs)`, `-L d` uses `levenshtein(K, d, xs)`, and with
    neither option `ksnp(K, xs)` is used.  Each of these yields groups of
    k-mers; the groups are concatenated, sorted, and written as a k-mer
    container (no de-duplication is done here).
    """
    opts = docopt.docopt(__doc__, argv)

    with container(opts['<ref>'], 'r') as zr:
        K = zr.meta['K']
        xs = readKmers(zr)

        hammingArg = opts['-H']
        levArg = opts['-L']
        if hammingArg is not None:
            groups = hamming(K, int(hammingArg), xs)
        elif levArg is not None:
            groups = levenshtein(K, int(levArg), xs)
        else:
            groups = ksnp(K, xs)

        # Consume the groups while the reference container is still open.
        collected = []
        for grp in groups:
            collected.extend(grp)
        collected.sort()

    with container(opts['<output>'], 'w') as zo:
        writeKmers(K, collected, zo)
def main(argv):
    """
    Trim the k-mers and counts of `<input>` and write them to `<output>`.

    The cutoff comes from `-c`.  When it is absent or zero, the cutoff is
    computed by `infer` from K and the input's 'hist' metadata, and
    reported on stderr.  The k-mers and counts are passed through
    `trim(xs, cutoff)` before being written.

    The output inherits the input's metadata apart from the 'kmers' and
    'counts' entries; every other entry, 'hist' included, is carried over
    as read from the input.
    """
    opts = docopt.docopt(__doc__, argv)

    src = opts['<input>']
    dst = opts['<output>']
    cutoff = 0 if opts['-c'] is None else int(opts['-c'])

    with container(src, 'r') as z:
        K = z.meta['K']
        h = z.meta['hist']
        if cutoff == 0:
            cutoff = infer(K, h)
            print >> sys.stderr, 'inferred cutoff:', cutoff

        xs = readKmersAndCounts(z)
        with container(dst, 'w') as w:
            w.meta = z.meta.copy()
            for key in ('kmers', 'counts'):
                del w.meta[key]
            writeKmersAndCounts(K, trim(xs, cutoff), w)
def main(argv):
    """
    Print contigs assembled from the k-mers of each input container.

    For every container named by `<input>` the k-mers are loaded into a
    `sparse` set.  Each k-mer not yet marked as seen starts a path that is
    extended for as long as `succ` reports exactly one successor and that
    successor has not been seen.  Paths spanning at least L bases
    (len(path) + K - 1) are printed as FASTA records named `contig_<i>`,
    where `<i>` is the rank of the starting k-mer.  L is taken from `-l`,
    and defaults to 2*K.
    """
    opts = docopt.docopt(__doc__, argv)

    L0 = None
    if opts['-l']:
        L0 = int(opts['-l'])

    for inp in opts['<input>']:
        with container(inp, 'r') as z:
            K = z.meta['K']
            L = L0
            if L is None:
                L = 2*K

            xs = array.array('L', readKmers(z))
            S = sparse(2*K, xs)

            # One flag per k-mer rank, set once the k-mer has been used.
            seen = bitvec(S.count())
            for i in xrange(S.count()):
                if seen[i]:
                    continue

                x = S.select(i)
                xb = rc(K, x)
                xp = succ(K, S, xb)
                # NOTE(review): the result of succ() is used with len() and
                # indexing below, so `xp` is presumably a sequence and this
                # comparison with the integer 1 would never be true.
                # `len(xp) == 1` may have been intended -- confirm before
                # changing, since it alters which k-mers start contigs.
                if xp == 1:
                    # x isn't the start of a contig
                    continue

                pth = [x]
                seen[i] = 1
                xn = succ(K, S, x)
                # Follow the chain while the successor is unique.
                while len(xn) == 1:
                    if seen[xn[0]] == 1:
                        break
                    x = S.select(xn[0])
                    pth.append(x)
                    seen[xn[0]] = 1
                    # Also mark the rank that S.rank() reports for the
                    # reverse complement of the k-mer just added.
                    xb = rc(K, x)
                    j = S.rank(xb)
                    seen[j] = 1
                    xn = succ(K, S, x)

                if len(pth)+K-1 < L:
                    continue

                # The first k-mer is rendered in full; each later k-mer
                # contributes the base held in its two low bits.
                s = [render(K, pth[0])]
                for j in xrange(1, len(pth)):
                    s.append("ACGT"[pth[j]&3])

                print '>contig_%d\n%s' % (i, ''.join(s))
def main(argv): opts = docopt.docopt(__doc__, argv) fns = opts['<input>'] Z = 1 if opts['-a']: Z = len(fns) p = None if opts['-p'] is not None: p = float(opts['-p']) for i in xrange(Z): with container(fns[i], 'r') as z0: xK = z0.meta['K'] xs = array.array('L', readKmers(z0)) for j in xrange(i + 1, len(fns)): with container(fns[j], 'r') as z1: yK = z1.meta['K'] ys = array.array('L', readKmers(z1)) if xK != yK: print >> sys.stderr, 'mismatched K:', fns[j] sys.exit(1) (isec, union, d) = jaccard(xs, ys) if p is None: print '%s\t%s\t%d\t%d\t%d\t%d\t%f' % ( fns[i], fns[j], len(xs), len(ys), isec, union, d) else: pv = logIx(p, isec + 1, (union - isec) + 1) / math.log(10) q05 = quantBeta(0.05, isec + 1, (union - isec) + 1) q95 = quantBeta(0.95, isec + 1, (union - isec) + 1) print '%s\t%s\t%d\t%d\t%d\t%d\t%f\t-%f\t+%f\t%f' % ( fns[i], fns[j], len(xs), len(ys), isec, union, d, d - q05, q95 - d, pv) sys.stdout.flush()
def main(argv): opts = docopt.docopt(__doc__, argv) nms = [str(i + 1) for i in xrange(len(probesMTB))] probes = probesMTB if opts['-p'] is not None: nms = [] probes = [] bad = False with open(opts['-p']) as f: i = 0 ln = 0 for l in f: ln += 1 if l[0] == '#': continue i += 1 t = l.split() if len(t) == 1: nms.append(str(i)) probes.append(probe(t[0])) elif len(t) == 2: nms.append(t[0]) probes.append(probe(t[1])) else: bad = True print >> sys.stderr, '%s line %d, badly formatted.' % ( opts['-p'], i) if bad: sys.exit(1) for inp in opts['<input>']: with container(inp, 'r') as z: K = z.meta['K'] xs = readKmers(z) xs = sparse(2 * K, array.array('L', xs)) res = [] for i in xrange(len(probes)): if findProbe(probes[i], K, xs): res.append('1') else: res.append('0') if opts['-l']: for i in xrange(len(nms)): print '%s\t%s\t%s' % (inp, nms[i], res[i]) else: print inp + '\t' + ''.join(res)
def main(argv): opts = docopt.docopt(__doc__, argv) inp = opts['<input>'] with container(inp, 'r') as z: K = z.meta['K'] if 'kmers' not in z.meta: print >> sys.stderr, 'cannot dump "%s" as it contains no k-mers' % ( inp, ) return if 'counts' in z.meta: xs = readKmersAndCounts(z) for (x, c) in xs: print '%s\t%d' % (render(K, x), c) else: xs = readKmers(z) for x in xs: print render(K, x)
def main(argv):
    """
    Merge the k-mer/count containers named by `<input>` into `<output>`.

    The inputs are grouped by `pairs`; each group holds one or two file
    names.  A single group is merged straight into the output.  With
    several groups, each group is first merged into its own named vector
    in a temporary container, and those vectors are then merged together
    into the output; the temporary container is removed afterwards, also
    when the merge is abandoned.

    The merged stream is passed through `hist` together with the `h` and
    `acgt` accumulators, which are stored in the output's 'hist' and
    'acgt' metadata; `acgt` is divided by its total first.

    Exits with status 1 if the inputs do not all have the same K.
    """
    opts = docopt.docopt(__doc__, argv)

    K = None
    out = opts['<output>']

    px = list(pairs(opts['<input>']))
    if len(px) == 1:
        with container(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            ix = px[0]
            with container(ix[0], 'r') as z0:
                K = z0.meta['K']
                xs = readKmersAndCounts(z0)
                if len(ix) == 1:
                    # Write the stream returned by hist(), as every other
                    # branch does.  Writing `xs` here left the hist()
                    # result unused; if hist() works lazily, `h` and
                    # `acgt` would then never be filled in.
                    zs = hist(xs, h, acgt)
                    writeKmersAndCounts(K, zs, z)
                else:
                    with container(ix[1], 'r') as z1:
                        K1 = z1.meta['K']
                        if K1 != K:
                            print >> sys.stderr, "mismatched K"
                            sys.exit(1)
                        ys = readKmersAndCounts(z1)
                        zs = hist(merge(xs, ys), h, acgt)
                        writeKmersAndCounts(K, zs, z)
            n = float(sum(acgt))
            acgt = [c/n for c in acgt]
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
        return

    tmps = []
    tmpnm = tmpfile('.pmc')
    try:
        # Stage one: merge every group into its own vector in the
        # temporary container.
        with container(tmpnm, 'w') as z:
            for ix in px:
                nm = 'tmp-' + str(len(tmps))
                tmps.append(nm)
                with container(ix[0], 'r') as z0:
                    K0 = z0.meta['K']
                    if K is None:
                        K = K0
                    elif K0 != K:
                        print >> sys.stderr, "mismatched K"
                        sys.exit(1)
                    xs = readKmersAndCounts(z0)
                    if len(ix) == 1:
                        writeKmersAndCounts(K, xs, z, nm)
                        continue
                    with container(ix[1], 'r') as z1:
                        K1 = z1.meta['K']
                        if K1 != K:
                            print >> sys.stderr, "mismatched K"
                            sys.exit(1)
                        ys = readKmersAndCounts(z1)
                        writeKmersAndCounts(K, merge(xs, ys), z, nm)

        assert K is not None

        # Stage two: chain the merges of all the staged vectors.
        with container(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            with container(tmpnm, 'r') as z0:
                zs = None
                for fn in tmps:
                    xs = readKmersAndCounts(z0, fn)
                    if zs is None:
                        zs = xs
                    else:
                        zs = merge(zs, xs)
                zs = hist(zs, h, acgt)
                writeKmersAndCounts(K, zs, z)
            n = float(sum(acgt))
            acgt = [c/n for c in acgt]
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
    finally:
        # Also reached through sys.exit(), so the temporary container
        # does not outlive a failed merge.
        if os.path.exists(tmpnm):
            os.remove(tmpnm)
def main(argv):
    """
    Build a gene index, or search sample containers for the indexed genes.

    With `-X`, the FASTA files named by `<input>` are indexed (K fixed at
    27) and the index is written to the container `<genes>`.  Sequences
    shorter than K are skipped.

    Otherwise the index is loaded from `<genes>` and each container named
    by `<input>` is processed in turn: `resolveAll` fills in, for every
    indexed k-mer, a k-mer from the sample (Y) and a length (L); these are
    scattered back to the positions of each indexed sequence, grouped into
    fragments, optionally linked through the sample's k-mers, and scored.
    For each indexed sequence whose best result has a first field below
    -2, one tab separated line is printed.
    """
    opts = docopt.docopt(__doc__, argv)

    if opts['-X']:
        K = 27
        S = []
        N = 0
        qacgt = [0, 0, 0, 0]
        # First pass: collect every k-mer and tally the value of its two
        # low bits.
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        continue
                    for (x,p) in kmersWithPos(K, seq, True):
                        S.append(x)
                        qacgt[x&3] += 1
                        N += 1
        S.sort()
        qacgt = [float(c)/float(N) for c in qacgt]
        S = sparse(2*K, array.array('L', uniq(S)))

        lens = array.array('I', [])
        nms = []
        seqs = []
        n = 0
        # Second pass: for each k-mer rank, the (sequence number, position)
        # pairs at which the k-mer occurs.
        tmp = [[] for i in xrange(S.count())]
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        print >> sys.stderr, "warning: `%s' skipped" % (nm,)
                        continue
                    nms.append(nm)
                    seqs.append(seq)
                    lens.append(len(seq))
                    for (x,p) in kmersWithPos(K, seq, True):
                        r = S.rank(x)
                        tmp[r].append((n, p))
                    n += 1

        # Flatten: T holds the start offset of each rank's entries (plus a
        # final end offset); U and V hold the sequence numbers and
        # positions.
        T = array.array('I', [])
        U = array.array('I', [])
        V = array.array('i', [])
        t = 0
        for nps in tmp:
            T.append(t)
            t += len(nps)
            for (n, p) in nps:
                U.append(n)
                V.append(p)
        T.append(t)
        del tmp

        gfn = opts['<genes>']
        with container(gfn, 'w') as z:
            z.meta['K'] = K
            z.meta['S'] = S.count()
            write64(z, S.xs, 'S')
            z.meta['T'] = len(T)
            write64(z, T, 'T')
            z.meta['U'] = len(U)
            write32(z, U, 'U')
            z.meta['V'] = len(V)
            write32s(z, V, 'V')
            z.meta['lens'] = lens
            z.meta['qacgt'] = qacgt
            z.meta['nms'] = nms
            z.meta['seqs'] = seqs

        return

    print >> sys.stderr, "loading..."

    gfn = opts['<genes>']
    with container(gfn, 'r') as z:
        K = z.meta['K']
        S = array.array('L', read64(z, 'S', z.meta['S']))
        S = sparse(2*K, S)
        T = array.array('L', read64(z, 'T', z.meta['T']))
        U = array.array('I', read32(z, 'U', z.meta['U']))
        V = array.array('i', read32s(z, 'V', z.meta['V']))
        lens = z.meta['lens']
        qacgt = z.meta['qacgt']
        nms = z.meta['nms']
        # NOTE(review): `seqs` is only referenced from commented-out code
        # below.
        seqs = z.meta['seqs']

    print >> sys.stderr, "done."

    for fn in opts['<input>']:
        # Per indexed k-mer: a length (L) and a sample k-mer (Y), both
        # filled in by resolveAll().
        L = array.array('B', [0 for i in xrange(S.count())])
        Y = array.array('L', [0 for i in xrange(S.count())])
        with container(fn, 'r') as z:
            sacgt = z.meta['acgt']
            xs = readKmers(z)
            X = array.array('L', xs)
        M = len(X)
        resolveAll(K, S, L, Y, X)
        X = sparse(2*K, X)

        g = sum([qp*sp for (qp, sp) in zip(qacgt, sacgt)])
        print >> sys.stderr, "g =", g
        nm = [null(g, M, j) for j in range(0, K+1)]

        # counts for computing distribution of prefix lengths
        cnt = [[0 for j in xrange(K+1)] for i in xrange(len(nms))]

        # the k-mers that we pulled by lcp from the sample
        # for each position of each query.
        P = [array.array('L', [0 for j in xrange(lens[i] - K + 1)]) for i in xrange(len(lens))]

        # the length of the lcp for each position of each query.
        Q = [array.array('B', [0 for j in xrange(lens[i] - K + 1)]) for i in xrange(len(lens))]

        for i in xrange(S.count()):
            for j in xrange(T[i], T[i+1]):
                n = U[j]
                p = V[j]
                y = Y[i]
                l = L[i]
                cnt[n][l] += 1
                # Positive positions are stored as position + 1; the others
                # as -(position + 1), in which case the reverse complement
                # of the sample k-mer is used.
                if p > 0:
                    p -= 1
                else:
                    p = -(p + 1)
                    y = rc(K, y)
                # Keep the entry with the greatest length for the position.
                if l > Q[n][p]:
                    Q[n][p] = l
                    P[n][p] = y

        for i in xrange(len(nms)):
            # iterate over the queries

            qc = math.log(K*0.05/float(lens[i] - K + 1)/2)

            # Link up "de Bruijn" sequences
            m = (1 << (2*K - 2)) - 1
            py = 0
            u = unionfind()
            for j in xrange(lens[i] - K + 1):
                x = P[i][j]
                # Join adjacent positions when this k-mer without its last
                # base equals the previous k-mer without its first base.
                y = x >> 2
                if j > 0:
                    d = ham(py, y)
                    if d == 0:
                        u.union(j-1, j)
                py = x & m

            # Gather up the de Bruijn fragments
            udx = {}
            for j in xrange(lens[i] - K + 1):
                v = u.find(j)
                if v not in udx:
                    udx[v] = []
                udx[v].append(j)

            # Index the left hand k-mers
            idxLhs = {}
            kx = []
            for (jx, js) in udx.iteritems():
                q = 0
                for j in js:
                    q += math.log1p(-nm[Q[i][j]])
                # Fragments whose summed score is above the threshold are
                # dropped.
                if q > math.log(0.05/len(js)):
                    continue
                kx.append((-len(js), jx))
                idxLhs[P[i][js[0]]] = jx
            # Longest fragments first.
            kx.sort()

            # Attempt to link up fragments
            links = {}
            for (_, jx) in kx:
                jR = udx[jx][-1]
                # NOTE(review): the positions stored in udx range over
                # xrange(lens[i] - K + 1), so jR is at most lens[i] - K and
                # this test looks as if it can never be true -- confirm.
                if jR == lens[i] - K + 1:
                    continue
                x = P[i][jR]
                xs = []
                lnk = None
                # Follow unique successors in the sample for at most 100
                # steps, looking for the first k-mer of another fragment.
                for k in xrange(100):
                    ys = succ(K, X, x)
                    if len(ys) != 1:
                        break
                    x = ys[0]
                    if x in idxLhs:
                        lnk = idxLhs[x]
                        break
                    xs.append(x)
                if lnk is not None:
                    links[jx] = xs
                    u.union(jx, lnk)

            # Gather up the linked fragments
            vdx = {}
            for j in [jx for (_, jx) in kx]:
                v = u.find(j)
                if v not in vdx:
                    vdx[v] = []
                vdx[v].append(j)

            res = []
            for (jxx, jxs) in vdx.iteritems():
                # Order the fragments by start position
                fs = [(udx[jx][0], jx) for jx in jxs]
                fs.sort()
                sxs = []
                for fj in xrange(len(fs)):
                    (_, jx) = fs[fj]
                    beg = udx[jx][0]
                    end = udx[jx][-1] + 1
                    if fj == 0:
                        # Pad the positions before the first fragment.
                        for j in xrange(beg):
                            sxs.append((0, 0))
                    xs = links.get(jx, None)
                    for j in xrange(beg, end):
                        x = P[i][j]
                        l = Q[i][j]
                        sxs.append((x, l))
                    if xs:
                        # NOTE(review): 27 matches the K used by the -X
                        # branch; presumably K is meant -- confirm.
                        for x in xs:
                            sxs.append((x, 27))
                    else:
                        # Pad up to the next fragment, or to the end of the
                        # query.
                        if fj < len(fs) - 1:
                            nxt = fs[fj+1][0]
                        else:
                            nxt = lens[i] - K + 1
                        for j in xrange(end, nxt):
                            sxs.append((0, 0))

                # Accumulate a score for each of the four bases at every
                # position covered by the k-mers in sxs.
                seq = [[0, 0, 0, 0] for j in xrange(len(sxs) + K - 1)]
                for j in xrange(len(sxs)):
                    (x, l) = sxs[j]
                    p = math.log1p(-nm[l])
                    for k in xrange(K):
                        seq[j + K - k - 1][x&3] += p
                        x >>= 2

                ax = []
                p = None
                inf = False
                for j in xrange(len(seq)):
                    # Bit k of b is set when base k scores below qc;
                    # fasta(b) supplies the character for that set.
                    b = 0
                    for k in xrange(4):
                        if seq[j][k] < qc:
                            b |= 1 << k
                    ax.append(fasta(b))
                    ssj = sum(seq[j])
                    if p is None:
                        p = ssj
                    else:
                        p = logAdd(p, ssj)
                    if ssj > -1e-300:
                        inf = True

                dst = counts2cdf(cnt[i])
                (_, kd) = ksDistance2(dst, nm)
                df = math.ceil(len(seq)/float(K))
                if inf:
                    q = 1e300
                    pv = 0.0
                else:
                    q = 2*math.exp(p)
                    pv = chi2(df, q)
                res.append((pv, q, kd, ''.join(ax)))

            if len(res) == 0:
                continue

            # Smallest pv first.
            res.sort()
            if res[0][0] < -2:
                #ed = lev(seqs[i], res[0][2])
                # NOTE(review): `ed` is assigned but never used.
                ed = 0
                pv = res[0][0]/math.log(10)
                c2 = res[0][1]
                kd = res[0][2]
                a = res[0][3]
                print '%d\t%d\t%d\t%g\t%g\t%g\t%s\t%s' % (i, lens[i], len(a), kd, c2, pv, nms[i], a)
            sys.stdout.flush()
def _spillBuffer(K, buf, tmps, tmpnm):
    """
    Append the k-mers and counts accumulated in `buf` to the container
    `tmpnm` as a new vector named 'tmps-<n>', and add that name to `tmps`.
    """
    run = 'tmps-%d' % (len(tmps), )
    tmps.append(run)
    with container(tmpnm, 'a') as z:
        writeKmersAndCounts(K, mkPairs(buf.kmers()), z, run)


def main(argv):
    """
    Count the `<k>`-mers of the reads in `<input>` and write them, with
    their counts, to the container `<output>`.

    K-mers are gathered in a KmerAccumulator.  Whenever eight times the
    number of k-mers added since the last spill reaches the limit set by
    `-m` (in units of 1024*1024, default 32), the accumulator is spilled
    to a vector in a temporary container.  The spilled vectors are then
    merged pairwise, round by round, until at most two remain, and the
    final merge is written to the output.

    With `-D d` only k-mers accepted by `sub(S, d, x)` are counted, where
    S is the value of `-S` (default 0); the answers are cached in two sets
    that are emptied when they exceed 1,000,000 entries.

    The output's metadata receives the count histogram ('hist'), the
    normalised tallies of the two low bits of the counted k-mers ('acgt'),
    and the number of reads ('reads').  All temporary containers are
    removed before returning, also when processing fails.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['<k>'])
    out = opts['<output>']

    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    buf = KmerAccumulator()
    n = 0
    tmps = []
    acgt = [0, 0, 0, 0]

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

    S = 0
    if opts['-S'] is not None:
        S = int(opts['-S'])

    cacheYes = set([])
    cacheNo = set([])

    tmpnm = tmpfile('.pmc')
    # Every temporary container that still has to be deleted.
    scratch = [tmpnm]
    try:
        # Create the (empty) temporary container so it can be appended to.
        with container(tmpnm, 'w') as z:
            pass

        PN = 1024 * 1024
        nr = 0
        t0 = time.time()
        for fn in opts['<input>']:
            for rds in mkParser(fn):
                for (nm, seq) in rds:
                    nr += 1
                    if nr & (PN - 1) == 0:
                        t1 = time.time()
                        print >> sys.stderr, 'reads processed:', nr, (PN) / (
                            t1 - t0), 'reads/second'
                        t0 = t1
                    xs = kmersList(K, seq, True)
                    if d is None:
                        buf.addList(xs)
                        for x in xs:
                            acgt[x & 3] += 1
                            n += 1
                    else:
                        for x in xs:
                            if x in cacheNo:
                                continue
                            if x not in cacheYes:
                                if not sub(S, d, x):
                                    cacheNo.add(x)
                                    continue
                                cacheYes.add(x)
                            buf.add(x)
                            acgt[x & 3] += 1
                            n += 1
                        # Keep the caches from growing without bound.
                        if len(cacheYes) > 1000000:
                            cacheYes = set([])
                        if len(cacheNo) > 1000000:
                            cacheNo = set([])
                    if 8 * n >= Z:
                        _spillBuffer(K, buf, tmps, tmpnm)
                        buf.clear()
                        n = 0

        t1 = time.time()
        print >> sys.stderr, 'reads processed:', nr, (nr % PN) / (
            t1 - t0), 'reads/second'

        # If anything was spilled, spill the remainder too, so that all
        # the data goes through the merge below.
        if len(tmps) and len(buf):
            _spillBuffer(K, buf, tmps, tmpnm)
            buf = []

        # Merge the spilled vectors pairwise until at most two are left.
        while len(tmps) > 2:
            tmpnm2 = tmpfile('.pmc')
            scratch.append(tmpnm2)
            tmps2 = []
            with container(tmpnm, 'r') as z0, container(tmpnm2, 'w') as z:
                for p in pairs(tmps):
                    run = 'tmps-%d' % (len(tmps2), )
                    tmps2.append(run)
                    if len(p) == 1:
                        writeKmersAndCounts(
                            K, readKmersAndCounts(z0, p[0]), z, run)
                        continue
                    # The histogram of an intermediate merge is not kept.
                    h = {}
                    merge2(z, K,
                           readKmersAndCounts(z0, p[0]),
                           readKmersAndCounts(z0, p[1]),
                           h, run)
            os.remove(tmpnm)
            scratch.remove(tmpnm)
            tmpnm = tmpnm2
            tmps = tmps2

        with container(out, 'w') as z:
            h = {}
            if len(tmps) == 0:
                # Nothing was spilled: write straight from the accumulator.
                zs = hist(mkPairs(buf.kmers()), h)
                writeKmersAndCounts(K, zs, z)
            elif len(tmps) == 1:
                with container(tmpnm, 'r') as z0:
                    writeKmersAndCounts(
                        K, hist(readKmersAndCounts(z0, tmps[0]), h), z)
            else:
                assert len(tmps) == 2
                with container(tmpnm, 'r') as z0:
                    merge2(z, K,
                           readKmersAndCounts(z0, tmps[0]),
                           readKmersAndCounts(z0, tmps[1]),
                           h)
            n = float(sum(acgt))
            acgt = [c / n for c in acgt]
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
            z.meta['reads'] = nr
    finally:
        # The last temporary container used to be left behind, as was any
        # temporary in use when an error occurred.
        for path in scratch:
            if os.path.exists(path):
                os.remove(path)