def main(argv):
    opts = docopt.docopt(__doc__, argv)

    L0 = None
    if opts['-l']:
        L0 = int(opts['-l'])

    for inp in opts['<input>']:
        with kmers(inp, 'r') as z:
            K = z.meta['K']
            L = L0
            if L is None:
                L = 2*K

            xs = array.array('L', readKmers(z))
            S = sparse(2*K, xs)

            seen = bitvec(S.count())
            for i in xrange(S.count()):
                if seen[i]:
                    continue

                x = S.select(i)
                xb = rc(K, x)
                xp = succ(K, S, xb)
                if len(xp) == 1:
                    # x has a unique predecessor, so it isn't the start of a contig
                    continue

                # walk forward while the extension is unique
                pth = [x]
                seen[i] = 1
                xn = succ(K, S, x)
                while len(xn) == 1:
                    if seen[xn[0]] == 1:
                        break
                    x = S.select(xn[0])
                    pth.append(x)
                    seen[xn[0]] = 1
                    xb = rc(K, x)
                    j = S.rank(xb)
                    seen[j] = 1
                    xn = succ(K, S, x)

                if len(pth) + K - 1 < L:
                    continue

                # render the first k-mer, then append the last base of each subsequent one
                s = [render(K, pth[0])]
                for j in xrange(1, len(pth)):
                    s.append("ACGT"[pth[j] & 3])
                print '>contig_%d\n%s' % (i, ''.join(s))
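# The walk above depends on a succ() helper defined elsewhere in this repo. As a
# point of reference, the commented sketch below shows one way a successor lookup
# over a rank/select sparse k-mer set could work, returning the ranks of the set
# members that extend x by one base (matching how succ's result is indexed above).
# The name succRanks, and the rank/select membership test, are assumptions for
# illustration only, not the canonical implementation.
#
# def succRanks(K, S, x):
#     m = (1 << (2 * K)) - 1
#     y0 = (x << 2) & m              # drop the first base, leave room for a new last base
#     res = []
#     for b in xrange(4):            # try appending A, C, G, T
#         y = y0 | b
#         r = S.rank(y)
#         if r < S.count() and S.select(r) == y:
#             res.append(r)
#     return res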
def parseFiles(K, paired, fns, verbose):
    M = (1 << 18) - 1
    rn = 0
    if not paired:
        # single-ended: emit the k-mers of one read at a time
        for fn in fns:
            with openFile(fn) as f:
                for fq1 in readFastq(f):
                    rn += 1
                    if verbose and (rn & M) == 0:
                        print >> sys.stderr, 'reads processed: %d' % (rn,)
                    xs = kmersList(K, fq1[1], False)
                    yield xs
        return

    # paired: emit the forward k-mers of read 1 together with the
    # reverse-complemented k-mers of read 2
    for (fn1, fn2) in pairs(fns):
        with openFile(fn1) as f1, openFile(fn2) as f2:
            for fq1, fq2 in both(readFastq(f1), readFastq(f2)):
                rn += 1
                if verbose and (rn & M) == 0:
                    print >> sys.stderr, 'read pairs processed: %d' % (rn,)
                xs = kmersList(K, fq1[1], False) + [rc(K, x) for x in kmersList(K, fq2[1], False)]
                yield xs
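# parseFiles assumes two small helpers that live elsewhere in the repo: pairs(),
# which groups a flat filename list into (R1, R2) tuples, and both(), which steps
# two read iterators in lockstep. The commented sketches below show plausible
# minimal versions; they are illustrative assumptions, not the canonical code.
#
# def pairs(fns):
#     assert len(fns) % 2 == 0, 'paired mode expects an even number of files'
#     for i in xrange(0, len(fns), 2):
#         yield (fns[i], fns[i + 1])
#
# def both(xs, ys):
#     while True:
#         try:
#             x = xs.next()
#             y = ys.next()
#         except StopIteration:
#             return
#         yield (x, y)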
def computeBias(K, zs, verbose=False):
    S = summarizer()
    for (x, xc) in zs.iteritems():
        y = rc(K, x)
        if y < x:
            continue
        yc = zs.get(y, 0)

        if xc > yc:
            a = xc
            b = yc
        else:
            a = yc
            b = xc

        apb = a + b
        if apb > 0:
            v = float(a) / float(apb)
        else:
            v = 0.5
        if verbose:
            print '%s\t%s\t%d\t%d\t%g' % (render(K, x), render(K, y), xc, yc, v)
        S.add(v)
    return (S.mean(), S.var())
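# Hedged usage sketch for computeBias: given a dict of k-mer counts it returns the
# mean and variance of the per-pair strand-bias statistic a/(a+b), where a is the
# larger of the two strand counts. The toy counts below are invented for
# illustration and assume the usual A=0, C=1, G=2, T=3 two-bit encoding.
#
# K = 2
# counts = {0b0000: 9, 0b1111: 3}    # 'AA' seen 9 times, its reverse complement 'TT' 3 times
# (m, v) = computeBias(K, counts)
# print 'mean bias = %g, variance = %g' % (m, v)   # expect mean 0.75, variance 0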
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    M = (1 << (2*K)) - 1

    paired = True
    if opts['-s']:
        paired = False

    p = float(opts['-p'])
    T = int(M * p)

    if opts['-r']:
        refs = []
        with openFile(opts['-r']) as f:
            for (nm, seq) in readFasta(f):
                refs += kmersList(K, seq, False)
        refs = set(refs)
        kill = set([])
        for x in refs:
            y = rc(K, x)
            if y in refs:
                kill.add(x)
                kill.add(y)
        print >> sys.stderr, 'removing %d/%d' % (len(kill), len(refs))
        refs -= set(kill)

        fwd = {}
        rev = {}
        for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
            fn = 0
            for x in xs:
                if x in refs:
                    fn += 1
            ys = [rc(K, x) for x in xs]
            rn = 0
            for y in ys:
                if y in refs:
                    rn += 1
            if fn + rn == 0:
                continue
            q = float(fn) / float(fn + rn)
            if random.random() < q:
                for x in xs:
                    fwd[x] = 1 + fwd.get(x, 0)
            else:
                for y in ys:
                    rev[y] = 1 + rev.get(y, 0)

        for (x, xc) in fwd.iteritems():
            y = rc(K, x)
            yc = 0
            if y in rev:
                yc = rev[y]
                del rev[y]
            print '%d\t%d' % (xc, yc)
        for (y, yc) in rev.iteritems():
            print '%d\t%d' % (0, yc)
        return

    # hash-based subsampling of canonical k-mers
    kx = {}
    for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
        for x in xs:
            if x in kx:
                kx[x] += 1
                continue
            y = rc(K, x)
            z = murmer(min(x, y), 17)
            if (z & M) > T:
                continue
            kx[x] = 1

    for x in kx.keys():
        y = rc(K, x)
        if x > y:
            continue
        xc = kx[x]
        yc = kx.get(y, 0)
        if murmer(x, 17) >= murmer(y, 17):
            (a, b) = (x, y)
            (ac, bc) = (xc, yc)
        else:
            (a, b) = (y, x)
            (ac, bc) = (yc, xc)
        #print '%s\t%d\t%s\t%d' % (render(K, a), ac, render(K, b), bc)
        print '%d\t%d' % (ac, bc)
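# The subsampling above keeps a k-mer only when the low 2K bits of a murmur-style
# hash of its canonical (strand-independent) form fall at or below T = int(M * p),
# so roughly a fraction p of distinct canonical k-mers is retained, and both
# strands of a pair are kept or dropped together. A hedged toy illustration of
# that decision (murmer's exact behaviour is taken on trust here):
#
# K = 25
# M = (1 << (2 * K)) - 1
# p = 0.01
# T = int(M * p)
# x = 12345                            # some K-mer code
# y = rc(K, x)
# keep = (murmer(min(x, y), 17) & M) <= T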
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    if opts['-X']:
        K = 27
        S = []
        N = 0
        qacgt = [0, 0, 0, 0]
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        continue
                    for (x, p) in kmersWithPos(K, seq, True):
                        S.append(x)
                        qacgt[x & 3] += 1
                        N += 1
        S.sort()
        qacgt = [float(c) / float(N) for c in qacgt]
        S = sparse(2 * K, array.array('L', uniq(S)))

        lens = []
        nms = []
        seqs = []
        n = 0
        tmp = [[] for i in xrange(S.count())]
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        print >> sys.stderr, "warning: `%s' skipped" % (nm, )
                        continue
                    nms.append(nm)
                    seqs.append(seq)
                    lens.append(len(seq))
                    for (x, p) in kmersWithPos(K, seq, True):
                        r = S.rank(x)
                        tmp[r].append((n, p))
                    n += 1

        # flatten tmp into CSR-style offset (T) and payload (U, V) arrays
        T = array.array('I', [])
        U = array.array('I', [])
        V = array.array('i', [])
        t = 0
        for nps in tmp:
            T.append(t)
            t += len(nps)
            for (n, p) in nps:
                U.append(n)
                V.append(p)
        T.append(t)
        del tmp

        gfn = opts['<genes>']
        with casket(gfn, 'w') as z:
            meta = {}
            meta['K'] = K
            meta['lens'] = lens
            meta['qacgt'] = qacgt
            meta['nms'] = nms
            meta['seqs'] = seqs
            z.add_content('__meta__', json.dumps(meta))
            write64(z, S.xs, 'S')
            write32(z, T, 'T')
            write32(z, U, 'U')
            write32s(z, V, 'V')
        return

    print >> sys.stderr, "loading..."
    gfn = opts['<genes>']
    with casket(gfn, 'r') as z:
        mf = z.open('__meta__')
        meta = json.load(mf)
        K = meta['K']
        lens = meta['lens']
        qacgt = meta['qacgt']
        nms = meta['nms']
        seqs = meta['seqs']
        S = read64(z, 'S')
        S = sparse(2 * K, S)
        T = read32(z, 'T')
        U = read32(z, 'U')
        V = read32s(z, 'V')
    print >> sys.stderr, "done."

    for fn in opts['<input>']:
        L = array.array('B', [0 for i in xrange(S.count())])
        Y = array.array('L', [0 for i in xrange(S.count())])
        with kmers(fn, 'r') as z:
            sacgt = z.meta['acgt']
            xs = readKmers(z)
            X = array.array('L', xs)
        M = len(X)
        resolveAll(K, S, L, Y, X)
        X = sparse(2 * K, X)

        g = sum([qp * sp for (qp, sp) in zip(qacgt, sacgt)])
        print >> sys.stderr, "g =", g
        nm = [null(g, M, j) for j in range(0, K + 1)]

        # counts for computing the distribution of prefix lengths
        cnt = [[0 for j in xrange(K + 1)] for i in xrange(len(nms))]

        # the k-mers that we pulled by lcp from the sample
        # for each position of each query.
        P = [array.array('L', [0 for j in xrange(lens[i] - K + 1)]) for i in xrange(len(lens))]

        # the length of the lcp for each position of each query.
        Q = [array.array('B', [0 for j in xrange(lens[i] - K + 1)]) for i in xrange(len(lens))]

        for i in xrange(S.count()):
            for j in xrange(T[i], T[i + 1]):
                n = U[j]
                p = V[j]
                y = Y[i]
                l = L[i]
                cnt[n][l] += 1
                if p > 0:
                    p -= 1
                else:
                    p = -(p + 1)
                    y = rc(K, y)
                if l > Q[n][p]:
                    Q[n][p] = l
                    P[n][p] = y

        for i in xrange(len(nms)):
            # iterate over the queries
            qc = math.log(K * 0.05 / float(lens[i] - K + 1) / 2)

            # Link up "de Bruijn" sequences
            m = (1 << (2 * K - 2)) - 1
            py = 0
            u = unionfind()
            for j in xrange(lens[i] - K + 1):
                x = P[i][j]
                y = x >> 2
                if j > 0:
                    d = ham(py, y)
                    if d == 0:
                        u.union(j - 1, j)
                py = x & m

            # Gather up the de Bruijn fragments
            udx = {}
            for j in xrange(lens[i] - K + 1):
                v = u.find(j)
                if v not in udx:
                    udx[v] = []
                udx[v].append(j)

            # Index the left-hand k-mers
            idxLhs = {}
            kx = []
            for (jx, js) in udx.iteritems():
                q = 0
                for j in js:
                    q += math.log1p(-nm[Q[i][j]])
                if q > math.log(0.05 / len(js)):
                    continue
                kx.append((-len(js), jx))
                idxLhs[P[i][js[0]]] = jx
            kx.sort()

            # Attempt to link up fragments
            links = {}
            for (_, jx) in kx:
                jR = udx[jx][-1]
                if jR == lens[i] - K + 1:
                    continue
                x = P[i][jR]
                xs = []
                lnk = None
                for k in xrange(100):
                    ys = succ(K, X, x)
                    if len(ys) != 1:
                        break
                    x = ys[0]
                    if x in idxLhs:
                        lnk = idxLhs[x]
                        break
                    xs.append(x)
                if lnk is not None:
                    links[jx] = xs
                    u.union(jx, lnk)

            # Gather up the linked fragments
            vdx = {}
            for j in [jx for (_, jx) in kx]:
                v = u.find(j)
                if v not in vdx:
                    vdx[v] = []
                vdx[v].append(j)

            res = []
            for (jxx, jxs) in vdx.iteritems():
                # Order the fragments by start position
                fs = [(udx[jx][0], jx) for jx in jxs]
                fs.sort()
                sxs = []
                for fj in xrange(len(fs)):
                    (_, jx) = fs[fj]
                    beg = udx[jx][0]
                    end = udx[jx][-1] + 1
                    if fj == 0:
                        for j in xrange(beg):
                            sxs.append((0, 0))
                    xs = links.get(jx, None)
                    for j in xrange(beg, end):
                        x = P[i][j]
                        l = Q[i][j]
                        sxs.append((x, l))
                    if xs:
                        for x in xs:
                            sxs.append((x, 27))
                    else:
                        if fj < len(fs) - 1:
                            nxt = fs[fj + 1][0]
                        else:
                            nxt = lens[i] - K + 1
                        for j in xrange(end, nxt):
                            sxs.append((0, 0))

                # accumulate per-base log-probabilities over the spelled-out sequence
                seq = [[0, 0, 0, 0] for j in xrange(len(sxs) + K - 1)]
                for j in xrange(len(sxs)):
                    (x, l) = sxs[j]
                    p = math.log1p(-nm[l])
                    for k in xrange(K):
                        seq[j + K - k - 1][x & 3] += p
                        x >>= 2

                ax = []
                p = None
                inf = False
                for j in xrange(len(seq)):
                    b = 0
                    for k in xrange(4):
                        if seq[j][k] < qc:
                            b |= 1 << k
                    ax.append(fasta(b))
                    ssj = sum(seq[j])
                    if p is None:
                        p = ssj
                    else:
                        p = logAdd(p, ssj)
                    if ssj > -1e-300:
                        inf = True

                dst = counts2cdf(cnt[i])
                (_, kd) = ksDistance2(dst, nm)
                df = math.ceil(len(seq) / float(K))
                if inf:
                    q = 1e300
                    pv = 0.0
                else:
                    q = 2 * math.exp(p)
                    pv = chi2(df, q)
                res.append((pv, q, kd, ''.join(ax)))

            if len(res) == 0:
                continue

            res.sort()
            if res[0][0] < -2:
                #ed = lev(seqs[i], res[0][2])
                ed = 0
                pv = res[0][0] / math.log(10)
                c2 = res[0][1]
                kd = res[0][2]
                a = res[0][3]
                print '%d\t%d\t%d\t%g\t%g\t%g\t%s\t%s' % (i, lens[i], len(a), kd, c2, pv, nms[i], a)
                sys.stdout.flush()
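# The fragment linking above relies on a unionfind class defined elsewhere in the
# repo. For reference, the commented sketch below shows a minimal path-compressing
# disjoint-set structure with the same find/union interface; the real class may
# differ in details (e.g. union by rank).
#
# class unionfind(object):
#     def __init__(self):
#         self.parent = {}
#
#     def find(self, x):
#         # path-compressing find; unseen items start as their own root
#         p = self.parent.setdefault(x, x)
#         if p != x:
#             p = self.find(p)
#             self.parent[x] = p
#         return p
#
#     def union(self, a, b):
#         ra = self.find(a)
#         rb = self.find(b)
#         if ra != rb:
#             self.parent[rb] = ra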
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['-k'])
    D = int(opts['-D'])
    Q = int(opts['-C'])
    V = float(opts['-V'])

    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)

    if opts['-X']:
        Wcap = int(opts['-w'])
        Wval = int(opts['-W'])

        variants = opts['<variant>']
        if opts['-f']:
            with openFile(opts['-f']) as f:
                variants += f.read().split()

        vx = {}
        for v in variants:
            x = makeHGVS(v)
            if x is None:
                print >> sys.stderr, "unable to parse %s" % (v, )
                continue
            x.setSequenceFactory(sf)
            acc = x.accession()
            if acc not in vx:
                vx[acc] = []
            vx[acc].append(x)

        chk = None
        if opts['-T']:
            chk = {}

        rs = []
        for (acc, vs) in vx.iteritems():
            for v in vs:
                r = makeIndexedVariant(v, K, Wcap, Wval)
                if r is not None:
                    rs.append(r)
                    if chk is not None:
                        xs = kmersList(K, ''.join([r['lhsFlank'][-(K - 1):], r['wtSeq'], r['rhsFlank'][:K - 1]]), True)
                        for x in xs:
                            if x not in chk:
                                chk[x] = set([])
                            chk[x].add(('wt', str(v)))
                        if r['mutSeq'] is None:
                            continue
                        xs = kmersList(K, ''.join([r['lhsFlank'][-(K - 1):], r['mutSeq'], r['rhsFlank'][:K - 1]]), True)
                        for x in xs:
                            if x not in chk:
                                chk[x] = set([])
                            chk[x].add(('mut', str(v)))

        if chk is not None:
            counts = dict([(x, 0) for x in chk.keys()])
            for acc in refSeq2Hg19.keys():
                if verbose:
                    print >> sys.stderr, 'scanning', acc
                seq = sf[acc]
                for x in kmers(K, seq):
                    if x in counts:
                        counts[x] += 1
            res = {}
            seen = set([])
            for x in counts.keys():
                y = rc(K, x)
                z = min(x, y)
                if z in seen:
                    continue
                seen.add(z)
                c = counts[x] + counts[y]
                for (a, v) in chk[x]:
                    if v not in res:
                        res[v] = {}
                    if a not in res[v]:
                        res[v][a] = {}
                    if c not in res[v][a]:
                        res[v][a][c] = 0
                    res[v][a][c] += 1
            yaml.safe_dump(res, sys.stdout, default_flow_style=False)
            return

        with open(opts['<index>'], 'w') as f:
            yaml.safe_dump(rs, f, default_flow_style=False)
        return

    capt = False
    zipname = None
    if opts['-c']:
        capt = True
        zipname = opts['-c']

    fmt = set([])
    if opts['-F']:
        fmt = set(opts['-F'].split(','))

    if verbose:
        print >> sys.stderr, "loading index."

    with open(opts['<index>']) as f:
        hgvsVars = yaml.load(f, Loader=yaml.FullLoader)

    NV = len(hgvsVars)

    combineStrands = True
    if opts['-s']:
        combineStrands = False

    cap = capture(K, reads=capt, kmers=True, verbose=verbose)

    for n in range(NV):
        itm = hgvsVars[n]
        h = itm['hgvs']
        v = makeHGVS(h)
        itm['var'] = v
        lhs = itm['lhsFlank']
        rhs = itm['rhsFlank']
        wt = itm['wtSeq']
        mut = itm['mutSeq']
        bait = [lhs, wt, rhs]
        if mut is not None:
            bait += ['N']
            bait += [lhs, mut, rhs]
        bait = ''.join(bait)
        n0 = cap.addBait(h, bait)
        assert n0 == n

    if verbose:
        print >> sys.stderr, "done."

    rn = 0
    for itm in reads(opts['<input>'], K=K, paired=True, reads=True, kmers=False, both=True, verbose=verbose):
        rn += 1
        cap.addReadPairAndKmers(itm.reads[0], itm.reads[1])

    if capt:
        cap.saveReads(zipname)

    scorer = Scorer(K)

    globHist = {}
    for n in range(NV):
        mx = cap.capKmers[n]
        for c in mx.itervalues():
            if c < Q:
                continue
            if c not in globHist:
                globHist[c] = 0
            globHist[c] += 1

    with outputFile(opts['-o']) as out:
        hdrShown = False
        for n in range(NV):
            itm = hgvsVars[n]
            v = itm['var']
            h = itm['hgvs']

            mx = cap.capKmers[n]
            nr = cap.capReadCounts[n]

            if 'kmers' in fmt:
                for (x, c) in mx.iteritems():
                    print '%d\t%s\t%d' % (n, render(K, x), c)

            lhsFlank = itm['lhsFlank']
            rhsFlank = itm['rhsFlank']

            alleles = {}
            alleles['wt'] = []
            alleles['mut'] = []

            wtSeq = itm['wtSeq']
            wtZ = len(wtSeq)

            mutSeq = itm['mutSeq']
            mutZ = v.size()

            cs = [c for (x, c) in mx.iteritems() if c >= Q]
            cs.sort()
            nk = len(cs)
            if nk == 0:
                cs = [0]
            q10 = cs[1 * len(cs) // 10]
            q50 = cs[5 * len(cs) // 10]
            q90 = cs[9 * len(cs) // 10]

            af = AlleleFinder(K, D, v, mx, lhsFlank, rhsFlank, wtSeq, mutSeq, wtZ, mutZ)
            finders = []
            if not v.anonymous():
                finders.append(af.definiteAlleles())
            else:
                finders.append(af.bridgingAlleles())

            j = 0
            for (t, a) in cat(finders):
                assert t == 'wt' or t == 'mut'
                alleles[t].append(a)
                j += 1

            wtRes = {}
            wtRes['covMin'] = 0
            wtRes['binom'] = 1.0
            wtRes['ksDist'] = 0.0
            wtRes['hamming'] = 0
            wtRes['path'] = []
            for pthRes in alleles['wt']:
                scorer.score(pthRes, lhsFlank, wtSeq, rhsFlank)
                if isBetter(pthRes, wtRes):
                    wtRes = pthRes

            mutRes = {}
            mutRes['covMin'] = 0
            mutRes['binom'] = 1.0
            mutRes['ksDist'] = 0.0
            mutRes['hamming'] = 0
            mutRes['path'] = []
            for pthRes in alleles['mut']:
                scorer.score(pthRes, lhsFlank, mutSeq, rhsFlank)
                if isBetter(pthRes, mutRes):
                    mutRes = pthRes

            if True:
                wtXs = [mx.get(x, 0) for x in wtRes['path']]
                if len(wtXs) == 0:
                    wtXs = [0]
                wtXs.sort()
                wtCount = sum(wtXs)
                wtLen = len(wtXs)
                wtMean = float(wtCount) / float(wtLen)
                wtMedian = wtXs[wtLen // 2]

                mutXs = [mx.get(x, 0) for x in mutRes['path']]
                if len(mutXs) == 0:
                    mutXs = [0]
                mutXs.sort()
                mutCount = sum(mutXs)
                mutLen = len(mutXs)
                mutMean = float(mutCount) / float(mutLen)
                mutMedian = mutXs[mutLen // 2]

                totX = max([1.0, float(wtMedian + mutMedian), float(q90)])
                wtVaf = wtMedian / totX
                mutVaf = mutMedian / totX

            hdrs = ['n']
            fmts = ['%d']
            outs = [n]

            wtAllele = ((wtRes['covMin'] > Q) and (wtRes['hamming'] < 4)) and (wtVaf > V)
            mutAllele = ((mutRes['covMin'] > Q) and (mutRes['hamming'] < 4)) and (mutVaf > V)
            resV = 1 * wtAllele + 2 * mutAllele
            res = ['null', 'wt', 'mut', 'wt/mut'][resV]

            hdrs += ['res']
            fmts += ['%s']
            outs += [res]

            if 'rds' in fmt:
                hdrs += ['numReads']
                fmts += ['%d']
                outs += [nr]

            hdrs += ['numKmers', 'covQ10', 'covQ50', 'covQ90']
            fmts += ['%d', '%d', '%d', '%d']
            outs += [nk, q10, q50, q90]

            hdrs += ['wtMin', 'mutMin']
            fmts += ['%d', '%d']
            outs += [wtRes['covMin'], mutRes['covMin']]

            hdrs += ['wtHam', 'mutHam']
            fmts += ['%d', '%d']
            outs += [wtRes['hamming'], mutRes['hamming']]

            if 'ks' in fmt:
                hdrs += ['wtD', 'mutD']
                fmts += ['%g', '%g']
                outs += [wtRes['ksDist'], mutRes['ksDist']]

            if 'binom' in fmt:
                hdrs += ['wtQ', 'mutQ']
                fmts += ['%g', '%g']
                outs += [wtRes['binom'], mutRes['binom']]

            if 'vaf' in fmt:
                hdrs += ['wtVaf', 'mutVaf']
                fmts += ['%g', '%g']
                outs += [wtVaf, mutVaf]

            hdrs += ['hgvs']
            fmts += ['%s']
            outs += [h]

            if not hdrShown:
                hdrShown = True
                print >> out, '\t'.join(hdrs)
            print >> out, '\t'.join(fmts) % tuple(outs)
            out.flush()
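# The report writer above uses an outputFile helper defined elsewhere in the repo.
# The commented sketch below shows the behaviour this code appears to assume:
# write to stdout when no output path is given, otherwise open (and close) a
# regular file. It is an illustrative assumption, not the canonical helper.
#
# import contextlib
# import sys
#
# @contextlib.contextmanager
# def outputFile(nm):
#     if nm is None or nm == '-':
#         yield sys.stdout
#     else:
#         f = open(nm, 'w')
#         try:
#             yield f
#         finally:
#             f.close()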