Esempio n. 1
0
def main(argv):
    """Project the k-mers of <input> against the k-mers of <ref>.

    The reference k-mers are loaded into an unsigned-long array and passed,
    together with the input stream, to project1 (k-mers only) or project2
    (k-mers with counts); the result is written to <output>.  Which of the
    two is used depends on whether the input metadata has a 'counts' entry.

    Exits with status 1 when the input's K differs from the reference's K.
    NOTE(review): the exact projection semantics live in project1/project2,
    which are defined elsewhere -- confirm there.
    """
    args = docopt.docopt(__doc__, argv)

    # Load the whole reference into memory; it is needed for every input k-mer.
    with kmers(args['<ref>'], 'r') as ref:
        K = ref.meta['K']
        refKmers = array.array('L', readKmers(ref))

    with kmers(args['<input>'], 'r') as src:
        srcK = src.meta['K']
        if srcK != K:
            print >> sys.stderr, "mismatched K (%d)" % (srcK, )
            sys.exit(1)

        with kmers(args['<output>'], 'w') as dst:
            dst.meta['K'] = K
            if 'counts' not in src.meta:
                # Input carries k-mers only.
                writeKmers(dst, project1(refKmers, readKmers(src)))
                dst.meta['kmers'] = 'kmers'
            else:
                # Input carries (k-mer, count) pairs.
                writeKmersAndCounts(dst, project2(refKmers, readKmersAndCounts(src)))
                dst.meta['kmers'] = 'kmers'
                dst.meta['counts'] = 'counts'
            # The histogram is copied verbatim from the input container.
            dst.meta['hist'] = src.meta['hist']
Esempio n. 2
0
def main(argv):
    """Trim the counted k-mers of <input> and write the result to <output>.

    Options read here:
      -c  lower cutoff; when absent or 0 it is inferred from the input
          histogram via infer(K, hist) and reported on stderr.
      -C  upper bound; only a positive value is passed on, otherwise None.

    The output metadata starts as a copy of the input metadata, with the
    'K', 'kmers', 'counts' and 'hist' entries rewritten after the data is
    written.
    NOTE(review): how trim() applies the two bounds is defined elsewhere.
    """
    args = docopt.docopt(__doc__, argv)

    srcName = args['<input>']
    dstName = args['<output>']

    cutoff = int(args['-c']) if args['-c'] is not None else 0

    ceiling = None
    if args['-C'] is not None:
        requested = int(args['-C'])
        if requested > 0:
            ceiling = requested

    with kmers(srcName, 'r') as src:
        K = src.meta['K']
        hist = src.meta['hist']
        if cutoff == 0:
            cutoff = infer(K, hist)
            print >> sys.stderr, 'inferred cutoff:', cutoff
        pairs = readKmersAndCounts(src)
        with kmers(dstName, 'w') as dst:
            # Start from the input's metadata, minus the data-section entries,
            # which are re-added once the trimmed data has been written.
            dst.meta = src.meta.copy()
            del dst.meta['kmers']
            del dst.meta['counts']
            writeKmersAndCounts(dst, trim(pairs, cutoff, ceiling))
            dst.meta['K'] = K
            dst.meta['kmers'] = 'kmers'
            dst.meta['counts'] = 'counts'
            dst.meta['hist'] = hist
Esempio n. 3
0
def main(argv):
    """Sample the counted k-mers of <input> into <output>.

    Options read here:
      -P  sampling parameter passed to the sampler (default 0.01).
      -D  when given, sampleD is used; otherwise sampleR is used.
      -S  seed; for sampleR it seeds the `random` module (only when given),
          for sampleD it is passed as an argument (0 when not given).

    The output metadata starts as a copy of the input metadata; 'K',
    'kmers', 'counts' and 'hist' are rewritten after sampling, with 'hist'
    taken from the dict handed to the sampler.
    NOTE(review): presumably the sampler fills that dict with the histogram
    of the sampled counts -- confirm in sampleR/sampleD.
    """
    args = docopt.docopt(__doc__, argv)

    prob = float(args['-P']) if args['-P'] is not None else 0.01
    srcName = args['<input>']
    dstName = args['<output>']

    with kmers(dstName, 'w') as z:
        hist = {}
        with kmers(srcName, 'r') as src:
            K = src.meta['K']
            z.meta = src.meta.copy()
            del z.meta['kmers']
            del z.meta['counts']
            pairs = readKmersAndCounts(src)

            seedOpt = args['-S']
            seed = long(seedOpt) if seedOpt else 0
            if args['-D'] is None:
                # Random sampling: only touch the global RNG if a seed was given.
                if seedOpt:
                    random.seed(seed)
                stream = sampleR(prob, pairs, hist)
            else:
                stream = sampleD(prob, seed, pairs, hist)
            writeKmersAndCounts(z, stream)
        z.meta['K'] = K
        z.meta['kmers'] = 'kmers'
        z.meta['counts'] = 'counts'
        z.meta['hist'] = hist
Esempio n. 4
0
def main(argv):
    """Print the k-mers stored in <input> to stdout, one per line.

    When the container's metadata has a 'counts' entry each line is
    "<rendered k-mer>TAB<count>"; otherwise only the rendered k-mer is
    printed.  If the metadata has no 'kmers' entry a message is written to
    stderr and nothing is dumped.
    """
    args = docopt.docopt(__doc__, argv)

    path = args['<input>']
    with kmers(path, 'r') as src:
        K = src.meta['K']
        if 'kmers' not in src.meta:
            print >> sys.stderr, 'cannot dump "%s" as it contains no k-mers' % (path,)
            return

        if 'counts' not in src.meta:
            # Plain k-mer list.
            for kmer in readKmers(src):
                print(render(K, kmer))
            return

        # K-mers with counts.
        for (kmer, count) in readKmersAndCounts(src):
            print('%s\t%d' % (render(K, kmer), count))
Esempio n. 5
0
def main(argv):
    """Compare, context by context, how counts are distributed over k-mer suffixes.

    K-mers are grouped by their J = K-1 leading context (via group()/Group);
    within a group the low 2*(K-J) bits of each k-mer (x & M) index the
    suffix.  Two modes:

    * With -r: each input file's groups are tested against the matching
      group of the reference file, and contexts where some suffix has
      logBinGe(...) < -10 are printed with the per-suffix values.
    * Without -r: all input files are walked in parallel through a heap,
      counts are pooled per context, and a row is printed for every context
      on which the files' per-suffix results are not all the same.

    NOTE(review): logBinGe is presumably the log of a binomial upper-tail
    probability, and fasta() presumably renders a 4-bit base mask as a
    nucleotide code -- both are defined elsewhere; confirm.
    """
    opts = docopt.docopt(__doc__, argv)

    K = getK(opts['<input>'])
    J = K - 1
    # Mask selecting the 2*(K-J) low bits of a k-mer; with J = K-1 this is 3.
    M = (1 << (2 * (K - J))) - 1

    if opts['-r'] is not None:
        # Reference mode: materialise all reference groups so they can be indexed.
        with kmers(opts['-r'], 'r') as z:
            xs = list(group(K, J, 0, readKmersAndCounts(z)))

        for fn in opts['<input>']:
            with kmers(fn, 'r') as z:
                samXs = readKmersAndCounts(z)
                i = 0
                for (yCtx, _, yGrp) in group(K, J, 0, samXs):
                    # Advance the reference cursor to this sample context; this
                    # relies on both sequences being ordered by context.
                    while i < len(xs) and xs[i][0] < yCtx:
                        i += 1
                    # Every sample context is required to exist in the reference.
                    assert i < len(xs)
                    assert xs[i][0] == yCtx
                    # Reference: total count and per-suffix counts.
                    gt = float(sum([c for (x,c) in xs[i][2]]))
                    gx = [0 for j in xrange(M+1)]
                    for (x,c) in xs[i][2]:
                        gx[x&M] = c
                    # Sample: total count and per-suffix counts.
                    st = sum([c for (x,c) in yGrp])
                    sx = [0 for j in xrange(M+1)]
                    for (x,c) in yGrp:
                        sx[x&M] = c
                    ss = []
                    # Bitmask of suffixes whose value falls below the -10 threshold.
                    b = 0
                    for j in xrange(M+1):
                        p = float(gx[j])/gt
                        v = 0.0
                        # Degenerate proportions (0 or 1) are not tested; v stays 0.0.
                        if 0.0 < p and p < 1.0:
                            v = logBinGe(p, st, sx[j])
                            if v < -10:
                                b |= 1 << j
                        ss.append('%3.2g' % (v,))
                    # Only contexts with at least one flagged suffix are reported.
                    if b > 0:
                        print '%s\t%s\t%s' % (render(J, yCtx), fasta(b), '\t'.join(ss))
                    i += 1
        return

    # Parse files in parallel to get global distribution

    N = len(opts['<input>'])
    h = heap.heap()
    i = 0
    for fn in opts['<input>']:
        (_, xs) = kfset.read(fn)
        i += 1
        # Files are numbered from 1; the number is used below as res[n-1].
        h.push(Group(K, J, i, xs))

    while len(h) > 0:
        # Collect, from every file that has it, the group for the smallest
        # pending context gy.
        xfs = []
        g = h.pop()
        gy = g.this()[0]
        xfs.append(g.this())
        g.next()
        if g.valid():
            h.push(g)
        # Sanity check: only valid (non-exhausted) cursors may sit in the heap.
        for x in h.xs:
            assert x.valid()
        while len(h) > 0 and h.front().this()[0] == gy:
            g = h.pop()
            xfs.append(g.this())
            g.next()
            if g.valid():
                h.push(g)
            for i in xrange(len(h.xs)):
                assert h.xs[i].valid()

        # ds: per-file suffix counts; gc: suffix counts pooled over all files.
        ds = []
        gc = [0 for i in xrange(M+1)]
        for (_, n, xc) in xfs:
            # NOTE(review): t is computed here but not used in this loop.
            t = sum([c for (x,c) in xc])
            d = [0 for i in xrange(M+1)]
            for (x,c) in xc:
                j = x & M
                gc[j] += c
                d[j] = c
            ds.append((n, d))

        # One column per input file; '*' marks files that lack this context.
        res = ['*' for i in xrange(N)]
        seen = set([])
        gt = float(sum(gc))
        for (n, d) in ds:
            t = sum(d)
            # One 4-bit mask per block of four suffixes (integer division
            # under Python 2; a single mask when M == 3).
            b = [0 for i in xrange((M+1)/4)]
            for i in xrange(M+1):
                p = float(gc[i])/gt
                if 0.0 < p and p < 1.0:
                    #vL = logBinLe(p, t, d[i])
                    #vG = logBinGe(p, t, d[i])
                    #v = min(vL, vG)
                    v = logBinGe(p, t, d[i])
                    # Here a bit is set when the value is ABOVE the threshold.
                    if v > -10:
                        w = i >> 2
                        j = i & 3
                        b[w] |= 1 << j
            res[n-1] = ''.join([fasta(b0) for b0 in b])
            seen.add(res[n-1])
        # Report only contexts on which the files present disagree.
        if len(seen) > 1:
            print '%s\t%s' % (render(J, gy), '\t'.join(res))
Esempio n. 6
0
def _tallyCounts(pairs, hist):
    """Yield each (k-mer, count) pair unchanged, recording count frequencies in hist."""
    for (x, c) in pairs:
        hist[c] = 1 + hist.get(c, 0)
        yield (x, c)


def main(argv):
    """Count the k-mers of the input reads and write them to <output>.

    K-mers are accumulated in memory; whenever the accumulator reports at
    least half of the memory budget (checked every 1024 reads) it is spilled
    as a sorted run into a temporary container, and the runs are merged at
    the end.

    Options read here:
      <k>  k-mer length.
      -v   verbosity flag passed to reads().
      -m   memory budget multiplier (default 32; scaled by 1024*1024).
      -D   keep a k-mer only when sub(S, D, kmer) is true
           (NOTE(review): presumably deterministic subsampling -- confirm).
      -S   seed passed to sub() (default 0); only read together with -D.
      -C   FASTA file; only reads sharing at least one k-mer with it are kept.

    Output metadata: 'K', 'kmers', 'counts', 'hist' (count -> number of
    k-mers with that count), 'acgt' (relative frequency of the low two bits
    of every k-mer seen, before any filtering) and 'reads'.

    The temporary container is removed even when processing fails.
    """
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['<k>'])

    out = opts['<output>']

    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    buf = KmerAccumulator2(K)
    tmps = []
    acgt = [0, 0, 0, 0]

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

        S = 0
        if opts['-S'] is not None:
            S = int(opts['-S'])

        # Memoise sub() decisions; both caches are reset when they grow large.
        cacheYes = set([])
        cacheNo = set([])

    B = opts['-C']
    if B is not None:
        xs = set([])
        for (nm, seq) in readFasta(openFile(B)):
            xs |= set(kmersList(K, seq, True))
        B = xs

    tmpnm = tmpfile('.pmc')
    try:
        with casket(tmpnm, 'w') as z:
            nr = 0
            for itm in reads(opts['<input>'],
                             K=K,
                             pairs=False,
                             reads=False,
                             kmers=True,
                             both=True,
                             verbose=verbose):
                xs = itm.kmers[0]
                for x in xs:
                    acgt[x & 3] += 1
                if d is not None:
                    for x in xs:
                        if x in cacheNo:
                            continue
                        if x not in cacheYes:
                            if not sub(S, d, x):
                                cacheNo.add(x)
                                continue
                            cacheYes.add(x)
                        buf.add(x)
                    if len(cacheYes) > 1000000:
                        cacheYes = set([])
                    if len(cacheNo) > 1000000:
                        cacheNo = set([])
                elif B is not None:
                    # Keep the whole read if any of its k-mers is in the bait set.
                    if any(x in B for x in xs):
                        buf.addList(xs)
                else:
                    buf.addList(xs)

                nr += 1
                if (nr & 1023) == 0 and buf.mem() >= Z // 2:
                    fn = 'tmps-%d' % (len(tmps), )
                    tmps.append(fn)
                    writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
                    buf.clear()

            # Once anything has been spilled, the remainder must be spilled
            # too so that the final merge sees all of the data.
            if len(tmps) and len(buf):
                fn = 'tmps-%d' % (len(tmps), )
                tmps.append(fn)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
                # Drop the accumulator; it is not used again on this path.
                buf = []

        with zotk.kmers(out, 'w') as z:
            h = {}
            if len(tmps) == 0:
                for c in buf.countsOnly():
                    h[c] = 1 + h.get(c, 0)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly())
            elif len(tmps) == 1:
                # A single run needs no merging, but the histogram still has
                # to be built while the pairs stream through.
                with casket(tmpnm, 'r') as z0:
                    writeKmersAndCounts(z, _tallyCounts(readKmersAndCounts(z0, tmps[0]), h))
            else:
                with casket(tmpnm, 'r') as z0:
                    xss = [readKmersAndCounts(z0, t) for t in tmps]
                    mergeNinto(K, xss, h, z)
            total = float(sum(acgt))
            if total > 0:
                acgt = [c / total for c in acgt]
            else:
                # No k-mers at all: avoid dividing by zero.
                acgt = [0.0 for c in acgt]
            z.meta['K'] = K
            z.meta['kmers'] = 'kmers'
            z.meta['counts'] = 'counts'
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
            z.meta['reads'] = nr
    finally:
        if os.path.exists(tmpnm):
            os.remove(tmpnm)
Esempio n. 7
0
def _checkK(K, z):
    """Return the K of container z, exiting with status 1 if it differs from K.

    K may be None, meaning no K has been established yet.
    """
    K0 = z.meta['K']
    if K is not None and K0 != K:
        print >> sys.stderr, "mismatched K"
        sys.exit(1)
    return K0


def _finishMeta(z, K, h, acgt):
    """Write the standard metadata entries, normalising acgt to frequencies."""
    total = float(sum(acgt))
    if total > 0:
        freqs = [c / total for c in acgt]
    else:
        # Empty input: avoid dividing by zero.
        freqs = [0.0 for c in acgt]
    z.meta['K'] = K
    z.meta['kmers'] = 'kmers'
    z.meta['counts'] = 'counts'
    z.meta['hist'] = h
    z.meta['acgt'] = freqs


def main(argv):
    """Merge the counted k-mer containers named by <input> into <output>.

    The inputs are grouped by pairs() into tuples of one or two file names.
    When there is a single tuple it is streamed (merged if it has two files)
    straight into the output.  Otherwise every tuple is first written as a
    run into a temporary container and the runs are merged with mergeNinto.

    All inputs must share the same K; on a mismatch a message is written to
    stderr and the process exits with status 1.  The temporary container is
    removed on every exit path.

    Output metadata: 'K', 'kmers', 'counts', 'hist' and 'acgt'.
    NOTE(review): hist() is assumed to pass (k-mer, count) pairs through
    while filling the histogram and acgt tallies, as its use on the merged
    stream suggests -- confirm.
    """
    opts = docopt.docopt(__doc__, argv)

    K = None

    out = opts['<output>']

    px = list(pairs(opts['<input>']))
    if len(px) == 1:
        with kmers(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            ix = px[0]
            if len(ix) == 1:
                with kmers(ix[0], 'r') as z0:
                    K = _checkK(None, z0)
                    xs = readKmersAndCounts(z0)
                    # Write the stream that hist() returns, so the histogram
                    # and base tallies reflect exactly what is written.
                    writeKmersAndCounts(z, hist(xs, h, acgt))
            else:
                with kmers(ix[0], 'r') as z0, kmers(ix[1], 'r') as z1:
                    K = _checkK(None, z0)
                    _checkK(K, z1)
                    xs = readKmersAndCounts(z0)
                    ys = readKmersAndCounts(z1)
                    writeKmersAndCounts(z, hist(merge(xs, ys), h, acgt))
            _finishMeta(z, K, h, acgt)
        return

    tmps = []
    tmpnm = tmpfile('.pmc')
    try:
        with casket(tmpnm, 'w') as z:
            for ix in px:
                nm = 'tmp-' + str(len(tmps))
                tmps.append(nm)
                if len(ix) == 1:
                    with kmers(ix[0], 'r') as z0:
                        K = _checkK(K, z0)
                        writeKmersAndCounts(z, readKmersAndCounts(z0), nm)
                else:
                    with kmers(ix[0], 'r') as z0, kmers(ix[1], 'r') as z1:
                        # Both members are always checked, including the
                        # second file of the very first pair.
                        K = _checkK(K, z0)
                        _checkK(K, z1)
                        xs = readKmersAndCounts(z0)
                        ys = readKmersAndCounts(z1)
                        writeKmersAndCounts(z, merge(xs, ys), nm)

        assert K is not None

        with kmers(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            with casket(tmpnm, 'r') as z0:
                xss = [readKmersAndCounts(z0, t) for t in tmps]
                mergeNinto(K, xss, h, acgt, z)
            _finishMeta(z, K, h, acgt)
    finally:
        if os.path.exists(tmpnm):
            os.remove(tmpnm)