Example #1
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    L0 = None
    if opts['-l']:
        L0 = int(opts['-l'])

    for inp in opts['<input>']:
        with kmers(inp, 'r') as z:
            K = z.meta['K']
            L = L0
            if L is None:
                L = 2*K

            xs = array.array('L', readKmers(z))
            S = sparse(2*K, xs)

            seen = bitvec(S.count())
            for i in xrange(S.count()):
                if seen[i]:
                    continue

                x = S.select(i)
                xb = rc(K, x)
                xp = succ(K, S, xb)
                if xp == 1:
                    # x isn't the start of a contig
                    continue

                pth = [x]
                seen[i] = 1
                xn = succ(K, S, x)
                while len(xn) == 1:
                    if seen[xn[0]] == 1:
                        break
                    x = S.select(xn[0])
                    pth.append(x)
                    seen[xn[0]] = 1
                    xb = rc(K, x)
                    j = S.rank(xb)
                    seen[j] = 1
                    xn = succ(K, S, x)

                if len(pth)+K-1 < L:
                    continue

                s = [render(K, pth[0])]
                for j in xrange(1, len(pth)):
                    s.append("ACGT"[pth[j]&3])

                print '>contig_%d\n%s' % (i, ''.join(s))
Example #2
0
def parseFiles(K, paired, fns, verbose):
    """Yield one list of k-mers per read (or per read pair) from FASTQ files.

    Args:
        K: k-mer length handed to ``kmersList`` and ``rc``.
        paired: when false every file in ``fns`` is read on its own; when
            true the files are grouped with ``pairs(fns)`` and read in
            lock-step.
        fns: FASTQ file names.
        verbose: when true, a progress line is written to stderr every
            2**18 reads (or read pairs).

    Yields:
        Unpaired: the k-mers of each read.  Paired: the k-mers of the first
        read followed by the reverse complements of the second read's
        k-mers.
    """
    # Progress is reported whenever the low 18 bits of the counter are zero.
    M = (1 << 18) - 1
    rn = 0

    if not paired:
        for fn in fns:
            with openFile(fn) as f:
                # The original body skipped this loop and referred to an
                # undefined `fq1`, so single-end input raised NameError.
                for fq in readFastq(f):
                    rn += 1
                    if verbose and (rn & M) == 0:
                        print >> sys.stderr, 'reads processed: %d' % (rn,)
                    # Field 1 of a record is used as the sequence, exactly
                    # as in the paired branch below.
                    xs = kmersList(K, fq[1], False)
                    yield xs
        return

    for (fn1, fn2) in pairs(fns):
        with openFile(fn1) as f1, openFile(fn2) as f2:
            for fq1, fq2 in both(readFastq(f1), readFastq(f2)):
                rn += 1
                if verbose and (rn & M) == 0:
                    print >> sys.stderr, 'read pairs processed: %d' % (rn,)
                # Reverse-complement the mate's k-mers before appending them
                # to those of the first read.
                xs = kmersList(K, fq1[1], False) + [rc(K, x) for x in kmersList(K, fq2[1], False)]
                yield xs
Example #3
0
def computeBias(K, zs, verbose=False):
    S = summarizer()
    for (x, xc) in zs.iteritems():
        y = rc(K, x)
        if y < x:
            continue
        yc = zs.get(y, 0)

        if xc > yc:
            a = xc
            b = yc
        else:
            a = yc
            b = xc
        apb = a + b
        if apb > 0:
            v = float(a) / float(apb)
        else:
            v = 0.5
        if verbose:
            print '%s\t%s\t%d\t%d\t%g' % (render(K, x), render(K,
                                                               y), xc, yc, v)
        S.add(v)
    return (S.mean(), S.var())
Example #4
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    M = (1 << (2*K)) - 1

    paired = True
    if opts['-s']:
        paired = False

    p = float(opts['-p'])
    T = int(M * p)

    if opts['-r']:
        refs = []
        with openFile(opts['-r']) as f:
            for (nm, seq) in readFasta(f):
                refs += kmersList(K, seq, False)
        refs = set(refs)

        kill = set([])
        for x in refs:
            y = rc(K, x)
            if y in refs:
                kill.add(x)
                kill.add(y)
        print >> sys.stderr, 'removing %d/%d' % (len(kill), len(refs))

        refs -= set(kill)

        fwd = {}
        rev = {}
        for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
            fn = 0
            for x in xs:
                if x in refs:
                    fn += 1

            ys = [rc(K, x) for x in xs]
            rn = 0
            for y in ys:
                if y in refs:
                    rn += 1
            
            if fn + rn == 0:
                continue

            q = float(fn) / float(fn + rn)
            if random.random() < q:
                for x in xs:
                    fwd[x] = 1 + fwd.get(x, 0)
            else:
                for y in ys:
                    rev[y] = 1 + rev.get(y, 0)

        for (x,xc) in fwd.iteritems():
            y = rc(K, x)
            yc = 0
            if y in rev:
                yc = rev[y]
                del rev[y]
            print '%d\t%d' % (xc, yc)

        for (y,yc) in rev.iteritems():
            print '%d\t%d' % (0, yc)

        return

    kx = {}
    for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
        for x in xs:
            if x in kx:
                kx[x] += 1
                continue
            y = rc(K, x)
            z = murmer(min(x, y), 17)
            if (z & M) > T:
                continue
            kx[x] = 1

    for x in kx.keys():
        y = rc(K, x)
        if x > y:
            continue
        xc = kx[x]
        yc = kx.get(y, 0)
        if murmer(x, 17) >= murmer(y, 17):
            (a, b) = (x, y)
            (ac, bc) = (xc, yc)
        else:
            (a, b) = (y, x)
            (ac, bc) = (yc, xc)
        #print '%s\t%d\t%s\t%d' % (render(K, a), ac, render(K, b), bc)
        print '%d\t%d' % (ac, bc)
Example #5
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    if opts['-X']:
        K = 27
        S = []
        N = 0
        qacgt = [0, 0, 0, 0]
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        continue
                    for (x, p) in kmersWithPos(K, seq, True):
                        S.append(x)
                        qacgt[x & 3] += 1
                        N += 1
        S.sort()
        qacgt = [float(c) / float(N) for c in qacgt]
        S = sparse(2 * K, array.array('L', uniq(S)))
        lens = []
        nms = []
        seqs = []
        n = 0
        tmp = [[] for i in xrange(S.count())]
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        print >> sys.stderr, "warning: `%s' skipped" % (nm, )
                        continue
                    nms.append(nm)
                    seqs.append(seq)
                    lens.append(len(seq))
                    for (x, p) in kmersWithPos(K, seq, True):
                        r = S.rank(x)
                        tmp[r].append((n, p))
                    n += 1
        T = array.array('I', [])
        U = array.array('I', [])
        V = array.array('i', [])
        t = 0
        for nps in tmp:
            T.append(t)
            t += len(nps)
            for (n, p) in nps:
                U.append(n)
                V.append(p)
        T.append(t)
        del tmp

        gfn = opts['<genes>']
        with casket(gfn, 'w') as z:
            meta = {}
            meta['K'] = K
            meta['lens'] = lens
            meta['qacgt'] = qacgt
            meta['nms'] = nms
            meta['seqs'] = seqs

            z.add_content('__meta__', json.dumps(meta))
            write64(z, S.xs, 'S')
            write32(z, T, 'T')
            write32(z, U, 'U')
            write32s(z, V, 'V')

        return

    print >> sys.stderr, "loading..."

    gfn = opts['<genes>']
    with casket(gfn, 'r') as z:
        mf = z.open('__meta__')
        meta = json.load(mf)
        K = meta['K']
        lens = meta['lens']
        qacgt = meta['qacgt']
        nms = meta['nms']
        seqs = meta['seqs']

        S = read64(z, 'S')
        S = sparse(2 * K, S)
        T = read32(z, 'T')
        U = read32(z, 'U')
        V = read32s(z, 'V')

    print >> sys.stderr, "done."

    for fn in opts['<input>']:
        L = array.array('B', [0 for i in xrange(S.count())])
        Y = array.array('L', [0 for i in xrange(S.count())])
        with kmers(fn, 'r') as z:
            sacgt = z.meta['acgt']
            xs = readKmers(z)
            X = array.array('L', xs)
        M = len(X)
        resolveAll(K, S, L, Y, X)
        X = sparse(2 * K, X)

        g = sum([qp * sp for (qp, sp) in zip(qacgt, sacgt)])
        print >> sys.stderr, "g =", g
        nm = [null(g, M, j) for j in range(0, K + 1)]

        # counts for computing distribution of prefix lengths
        cnt = [[0 for j in xrange(K + 1)] for i in xrange(len(nms))]

        # the k-mers that we pulled by lcp from the sample
        # for each position of each query.
        P = [
            array.array('L', [0 for j in xrange(lens[i] - K + 1)])
            for i in xrange(len(lens))
        ]

        # the length of the lcp for each position of each query.
        Q = [
            array.array('B', [0 for j in xrange(lens[i] - K + 1)])
            for i in xrange(len(lens))
        ]

        for i in xrange(S.count()):
            for j in xrange(T[i], T[i + 1]):
                n = U[j]
                p = V[j]
                y = Y[i]
                l = L[i]
                cnt[n][l] += 1
                if p > 0:
                    p -= 1
                else:
                    p = -(p + 1)
                    y = rc(K, y)
                if l > Q[n][p]:
                    Q[n][p] = l
                    P[n][p] = y

        for i in xrange(len(nms)):
            # iterate over the queries

            qc = math.log(K * 0.05 / float(lens[i] - K + 1) / 2)

            # Link up "de Bruijn" sequences
            m = (1 << (2 * K - 2)) - 1
            py = 0
            u = unionfind()
            for j in xrange(lens[i] - K + 1):
                x = P[i][j]
                y = x >> 2
                if j > 0:
                    d = ham(py, y)
                    if d == 0:
                        u.union(j - 1, j)
                py = x & m

            # Gather up the de Bruin fragments
            udx = {}
            for j in xrange(lens[i] - K + 1):
                v = u.find(j)
                if v not in udx:
                    udx[v] = []
                udx[v].append(j)

            # Index the left hand k-mers
            idxLhs = {}
            kx = []
            for (jx, js) in udx.iteritems():
                q = 0
                for j in js:
                    q += math.log1p(-nm[Q[i][j]])
                if q > math.log(0.05 / len(js)):
                    continue
                kx.append((-len(js), jx))
                idxLhs[P[i][js[0]]] = jx
            kx.sort()

            # Attempt to link up fragments
            links = {}
            for (_, jx) in kx:
                jR = udx[jx][-1]
                if jR == lens[i] - K + 1:
                    continue
                x = P[i][jR]
                xs = []
                lnk = None
                for k in xrange(100):
                    ys = succ(K, X, x)
                    if len(ys) != 1:
                        break
                    x = ys[0]
                    if x in idxLhs:
                        lnk = idxLhs[x]
                        break
                    xs.append(x)
                if lnk is not None:
                    links[jx] = xs
                    u.union(jx, lnk)

            # Gather up the linked fragments
            vdx = {}
            for j in [jx for (_, jx) in kx]:
                v = u.find(j)
                if v not in vdx:
                    vdx[v] = []
                vdx[v].append(j)

            res = []
            for (jxx, jxs) in vdx.iteritems():
                # Order the gragments by start position
                fs = [(udx[jx][0], jx) for jx in jxs]
                fs.sort()
                sxs = []
                for fj in xrange(len(fs)):
                    (_, jx) = fs[fj]
                    beg = udx[jx][0]
                    end = udx[jx][-1] + 1
                    if fj == 0:
                        for j in xrange(beg):
                            sxs.append((0, 0))
                    xs = links.get(jx, None)
                    for j in xrange(beg, end):
                        x = P[i][j]
                        l = Q[i][j]
                        sxs.append((x, l))
                    if xs:
                        for x in xs:
                            sxs.append((x, 27))
                    else:
                        if fj < len(fs) - 1:
                            nxt = fs[fj + 1][0]
                        else:
                            nxt = lens[i] - K + 1
                        for j in xrange(end, nxt):
                            sxs.append((0, 0))
                seq = [[0, 0, 0, 0] for j in xrange(len(sxs) + K - 1)]
                for j in xrange(len(sxs)):
                    (x, l) = sxs[j]
                    p = math.log1p(-nm[l])
                    for k in xrange(K):
                        seq[j + K - k - 1][x & 3] += p
                        x >>= 2
                ax = []
                p = None
                inf = False
                for j in xrange(len(seq)):
                    b = 0
                    for k in xrange(4):
                        if seq[j][k] < qc:
                            b |= 1 << k
                    ax.append(fasta(b))
                    ssj = sum(seq[j])
                    if p is None:
                        p = ssj
                    else:
                        p = logAdd(p, ssj)
                    if ssj > -1e-300:
                        inf = True
                dst = counts2cdf(cnt[i])
                (_, kd) = ksDistance2(dst, nm)
                df = math.ceil(len(seq) / float(K))
                if inf:
                    q = 1e300
                    pv = 0.0
                else:
                    q = 2 * math.exp(p)
                    pv = chi2(df, q)
                res.append((pv, q, kd, ''.join(ax)))

            if len(res) == 0:
                continue

            res.sort()
            if res[0][0] < -2:
                #ed = lev(seqs[i], res[0][2])
                ed = 0
                pv = res[0][0] / math.log(10)
                c2 = res[0][1]
                kd = res[0][2]
                a = res[0][3]
                print '%d\t%d\t%d\t%g\t%g\t%g\t%s\t%s' % (
                    i, lens[i], len(a), kd, c2, pv, nms[i], a)
            sys.stdout.flush()
Example #6
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['-k'])

    D = int(opts['-D'])

    Q = int(opts['-C'])

    V = float(opts['-V'])

    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)

    if opts['-X']:
        Wcap = int(opts['-w'])
        Wval = int(opts['-W'])

        variants = opts['<variant>']
        if opts['-f']:
            with openFile(opts['-f']) as f:
                variants += f.read().split()

        vx = {}
        for v in variants:
            x = makeHGVS(v)
            if x is None:
                print >> sys.stderr, "unable to parse %s" % (v, )
                continue
            x.setSequenceFactory(sf)
            acc = x.accession()
            if acc not in vx:
                vx[acc] = []
            vx[acc].append(x)

        chk = None
        if opts['-T']:
            chk = {}

        rs = []
        for (acc, vs) in vx.iteritems():
            for v in vs:
                r = makeIndexedVariant(v, K, Wcap, Wval)
                if r is not None:
                    rs.append(r)
                if chk is not None:
                    xs = kmersList(
                        K, ''.join([
                            r['lhsFlank'][-(K - 1):], r['wtSeq'],
                            r['rhsFlank'][:K - 1]
                        ]), True)
                    for x in xs:
                        if x not in chk:
                            chk[x] = set([])
                        chk[x].add(('wt', str(v)))
                    if r['mutSeq'] is None:
                        continue
                    xs = kmersList(
                        K, ''.join([
                            r['lhsFlank'][-(K - 1):], r['mutSeq'],
                            r['rhsFlank'][:K - 1]
                        ]), True)
                    for x in xs:
                        if x not in chk:
                            chk[x] = set([])
                        chk[x].add(('mut', str(v)))

        if chk is not None:
            counts = dict([(x, 0) for x in chk.keys()])
            for acc in refSeq2Hg19.keys():
                if verbose:
                    print >> sys.stderr, 'scanning', acc
                seq = sf[acc]
                for x in kmers(K, seq):
                    if x in counts:
                        counts[x] += 1
            res = {}
            seen = set([])
            for x in counts.keys():
                y = rc(K, x)
                z = min(x, y)
                if z in seen:
                    continue
                seen.add(z)
                c = counts[x] + counts[y]
                for (a, v) in chk[x]:
                    if v not in res:
                        res[v] = {}
                    if a not in res[v]:
                        res[v][a] = {}
                    if c not in res[v][a]:
                        res[v][a][c] = 0
                    res[v][a][c] += 1
            yaml.safe_dump(res, sys.stdout, default_flow_style=False)
            return

        with open(opts['<index>'], 'w') as f:
            yaml.safe_dump(rs, f, default_flow_style=False)

        return

    capt = False
    zipname = None
    if opts['-c']:
        capt = True
        zipname = opts['-c']

    fmt = set([])
    if opts['-F']:
        fmt = set(opts['-F'].split(','))

    if verbose:
        print >> sys.stderr, "loading index."

    with open(opts['<index>']) as f:
        hgvsVars = yaml.load(f, Loader=yaml.FullLoader)

    NV = len(hgvsVars)

    combineStrands = True
    if opts['-s']:
        combineStrands = False

    cap = capture(K, reads=capt, kmers=True, verbose=verbose)

    for n in range(NV):
        itm = hgvsVars[n]
        h = itm['hgvs']
        v = makeHGVS(h)
        itm['var'] = v
        lhs = itm['lhsFlank']
        rhs = itm['rhsFlank']
        wt = itm['wtSeq']
        mut = itm['mutSeq']
        bait = [lhs, wt, rhs]
        if mut is not None:
            bait += ['N']
            bait += [lhs, mut, rhs]
        bait = ''.join(bait)
        n0 = cap.addBait(h, bait)
        assert n0 == n

    if verbose:
        print >> sys.stderr, "done."

    rn = 0
    for itm in reads(opts['<input>'],
                     K=K,
                     paired=True,
                     reads=True,
                     kmers=False,
                     both=True,
                     verbose=verbose):
        rn += 1
        cap.addReadPairAndKmers(itm.reads[0], itm.reads[1])

    if capt:
        cap.saveReads(zipname)

    scorer = Scorer(K)

    globHist = {}

    for n in range(NV):
        mx = cap.capKmers[n]
        for c in mx.itervalues():
            if c < Q:
                continue
            if c not in globHist:
                globHist[c] = 0
            globHist[c] += 1

    with outputFile(opts['-o']) as out:
        hdrShown = False
        for n in range(NV):
            itm = hgvsVars[n]
            v = itm['var']
            h = itm['hgvs']

            mx = cap.capKmers[n]

            nr = cap.capReadCounts[n]

            if 'kmers' in fmt:
                for (x, c) in mx.iteritems():
                    print '%d\t%s\t%d' % (n, render(K, x), c)

            lhsFlank = itm['lhsFlank']
            rhsFlank = itm['rhsFlank']

            alleles = {}
            alleles['wt'] = []
            alleles['mut'] = []

            wtSeq = itm['wtSeq']
            wtZ = len(wtSeq)

            mutSeq = itm['mutSeq']
            mutZ = v.size()

            cs = [c for (x, c) in mx.iteritems() if c >= Q]
            cs.sort()
            nk = len(cs)
            if nk == 0:
                cs = [0]

            q10 = cs[1 * len(cs) // 10]
            q50 = cs[5 * len(cs) // 10]
            q90 = cs[9 * len(cs) // 10]

            af = AlleleFinder(K, D, v, mx, lhsFlank, rhsFlank, wtSeq, mutSeq,
                              wtZ, mutZ)
            finders = []
            if not v.anonymous():
                finders.append(af.definiteAlleles())
            else:
                finders.append(af.bridgingAlleles())

            j = 0
            for (t, a) in cat(finders):
                assert t == 'wt' or t == 'mut'
                alleles[t].append(a)
                j += 1

            wtRes = {}
            wtRes['covMin'] = 0
            wtRes['binom'] = 1.0
            wtRes['ksDist'] = 0.0
            wtRes['hamming'] = 0
            wtRes['path'] = []
            for pthRes in alleles['wt']:
                scorer.score(pthRes, lhsFlank, wtSeq, rhsFlank)
                if isBetter(pthRes, wtRes):
                    wtRes = pthRes

            mutRes = {}
            mutRes['covMin'] = 0
            mutRes['binom'] = 1.0
            mutRes['ksDist'] = 0.0
            mutRes['hamming'] = 0
            mutRes['path'] = []
            for pthRes in alleles['mut']:
                scorer.score(pthRes, lhsFlank, mutSeq, rhsFlank)
                if isBetter(pthRes, mutRes):
                    mutRes = pthRes

            if True:
                wtXs = [mx.get(x, 0) for x in wtRes['path']]
                if len(wtXs) == 0:
                    wtXs = [0]
                wtXs.sort()
                wtCount = sum(wtXs)
                wtLen = len(wtXs)
                wtMean = float(wtCount) / float(wtLen)
                wtMedian = wtXs[wtLen // 2]

                mutXs = [mx.get(x, 0) for x in mutRes['path']]
                if len(mutXs) == 0:
                    mutXs = [0]
                mutXs.sort()
                mutCount = sum(mutXs)
                mutLen = len(mutXs)
                mutMean = float(mutCount) / float(mutLen)
                mutMedian = mutXs[mutLen // 2]

                totX = max([1.0, float(wtMedian + mutMedian), float(q90)])
                wtVaf = wtMedian / totX
                mutVaf = mutMedian / totX

            hdrs = ['n']
            fmts = ['%d']
            outs = [n]

            wtAllele = ((wtRes['covMin'] > Q) and
                        (wtRes['hamming'] < 4)) and (wtVaf > V)
            mutAllele = ((mutRes['covMin'] > Q) and
                         (mutRes['hamming'] < 4)) and (mutVaf > V)
            resV = 1 * wtAllele + 2 * mutAllele
            res = ['null', 'wt', 'mut', 'wt/mut'][resV]

            hdrs += ['res']
            fmts += ['%s']
            outs += [res]

            if 'rds' in fmt:
                hdrs += ['numReads']
                fmts += ['%d']
                outs += [nr]

            hdrs += ['numKmers', 'covQ10', 'covQ50', 'covQ90']
            fmts += ['%d', '%d', '%d', '%d']
            outs += [nk, q10, q50, q90]

            hdrs += ['wtMin', 'mutMin']
            fmts += ['%d', '%d']
            outs += [wtRes['covMin'], mutRes['covMin']]

            hdrs += ['wtHam', 'mutHam']
            fmts += ['%d', '%d']
            outs += [wtRes['hamming'], mutRes['hamming']]

            if 'ks' in fmt:
                hdrs += ['wtD', 'mutD']
                fmts += ['%g', '%g']
                outs += [wtRes['ksDist'], mutRes['ksDist']]

            if 'binom' in fmt:
                hdrs += ['wtQ', 'mutQ']
                fmts += ['%g', '%g']
                outs += [wtRes['binom'], mutRes['binom']]

            if 'vaf' in fmt:
                hdrs += ['wtVaf', 'mutVaf']
                fmts += ['%g', '%g']
                outs += [wtVaf, mutVaf]

            hdrs += ['hgvs']
            fmts += ['%s']
            outs += [h]

            if not hdrShown:
                hdrShown = True
                print >> out, '\t'.join(hdrs)
            print >> out, '\t'.join(fmts) % tuple(outs)
            out.flush()