Example #1
0
def main(argv):
    """Subsample the k-mer/count pairs of <input> and write them to <output>.

    The sampling proportion defaults to 0.01 and can be overridden with -P.
    Without -D the pairs go through ``sampleR`` (after seeding ``random``
    when -S is given); with -D they go through ``sampleD`` using the seed
    from -S, or 0 when -S is absent.  The output metadata is a copy of the
    input metadata minus the 'kmers' and 'counts' entries, plus 'hist' set to
    the dict that was handed to the sampler.

    Args:
        argv: command-line arguments, parsed by docopt against the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)

    p = 0.01
    if opts['-P'] is not None:
        p = float(opts['-P'])
    inp = opts['<input>']
    out = opts['<output>']
    with container(out, 'w') as z:
        h = {}
        with container(inp, 'r') as z0:
            K = z0.meta['K']
            z.meta = z0.meta.copy()
            # Drop the input's stream entries from the copied metadata;
            # presumably the writer re-creates them for the output - confirm.
            del z.meta['kmers']
            del z.meta['counts']
            xs = readKmersAndCounts(z0)
            # NOTE(review): this test assumes docopt reports an absent -D as
            # None (true for options that take an argument; a bare flag would
            # be False) - confirm against the usage string.
            if opts['-D'] is None:
                if opts['-S'] is not None:
                    S = long(opts['-S'])
                    random.seed(S)
                # The write must happen whether or not a seed was supplied;
                # it used to be nested under the -S test, so nothing was
                # written when -S was omitted.
                writeKmersAndCounts(K, sampleR(p, xs, h), z)
            else:
                S = 0
                if opts['-S'] is not None:
                    S = long(opts['-S'])
                # Likewise unconditional, so the default seed of 0 is
                # actually used when -S is omitted.
                writeKmersAndCounts(K, sampleD(p, S, xs, h), z)
        z.meta['hist'] = h
Example #2
0
def test_merg2_0():
    """Check merge2 against a dictionary-based reference merge.

    Two sorted lists of (k-mer, count) pairs are stored as the streams 'xs'
    and 'ys' of one temporary container and merged with ``merge2`` into the
    stream 'zs' of a second one.  The test expects 'zs' to equal the sorted
    per-k-mer sums of the counts, and the dict handed to ``merge2`` to equal
    the histogram of those summed counts.  Both temporary files are removed
    afterwards, whether or not the assertions pass.
    """
    # Local import: the cleanup below needs os and this block cannot rely on
    # the enclosing module importing it.
    import os

    K = 27
    M = (1 << (2 * K)) - 1  # largest value representable in 2*K bits
    N = 100000
    random.seed(17)  # fixed seed keeps the generated data reproducible
    xs = [(random.randint(0, M), pois(10)) for i in xrange(N)]
    xs.sort()
    ys = [(random.randint(0, M), pois(10)) for i in xrange(N)]
    ys.sort()

    created = []
    try:
        nm0 = tmpfile()
        created.append(nm0)
        with container(nm0, 'w') as z:
            writeKmersAndCounts(K, xs, z, 'xs')
            writeKmersAndCounts(K, ys, z, 'ys')

        nm1 = tmpfile()
        created.append(nm1)
        h = {}
        with container(nm0, 'r') as z0, container(nm1, 'w') as z:
            merge2(z, K, readKmersAndCounts(z0, 'xs'),
                   readKmersAndCounts(z0, 'ys'), h, 'zs')
        h = sorted(h.items())

        # Reference merge: add up the counts per k-mer with a plain dict.
        ws = {}
        for (x, c) in xs:
            ws[x] = c + ws.get(x, 0)
        for (y, c) in ys:
            ws[y] = c + ws.get(y, 0)
        ws = sorted(ws.items())

        with container(nm1, 'r') as z:
            zs = list(readKmersAndCounts(z, 'zs'))

        assert len(ws) == len(zs)
        for (w, got) in zip(ws, zs):
            assert w == got

        # Reference histogram: how many k-mers carry each summed count.
        h1 = {}
        for (_, c) in ws:
            h1[c] = 1 + h1.get(c, 0)
        h1 = sorted(h1.items())

        assert len(h) == len(h1)
        for (got, want) in zip(h, h1):
            assert got == want
    finally:
        # Do not leave the temporary containers behind.
        for nm in created:
            if os.path.exists(nm):
                os.remove(nm)
Example #3
0
def test_std_0():
    """Round-trip (k-mer, count) pairs through a container stream.

    Pairs generated from a fixed seed are written under the name 'wibble'
    and read back; the test expects to get exactly the same pairs, in the
    same order.
    """
    k = 27
    largest = (1 << (2 * k)) - 1
    total = 100000
    random.seed(17)
    expected = []
    for _ in xrange(total):
        expected.append((random.randint(0, largest), pois(10)))

    path = tmpfile()
    with container.container(path, 'w') as sink:
        std.writeKmersAndCounts(k, expected, sink, 'wibble')
    with container.container(path, 'r') as source:
        actual = list(std.readKmersAndCounts(source, 'wibble'))

    assert len(actual) == total
    # Lengths are equal at this point, so zip visits every position.
    for (want, got) in zip(expected, actual):
        assert want == got

    os.remove(path)
Example #4
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    inp = opts['<input>']
    with container(inp, 'r') as z:
        K = z.meta['K']
        if 'kmers' not in z.meta:
            print >> sys.stderr, 'cannot dump "%s" as it contains no k-mers' % (
                inp, )
            return
        if 'counts' in z.meta:
            xs = readKmersAndCounts(z)
            for (x, c) in xs:
                print '%s\t%d' % (render(K, x), c)
        else:
            xs = readKmers(z)
            for x in xs:
                print render(K, x)
Example #5
0
def main(argv):
    """Project the k-mers of <input> against those of <ref> into <output>.

    The reference k-mers are loaded into an in-memory array.  The input is
    then streamed through ``project2`` (when its metadata has a 'counts'
    entry) or ``project1`` (k-mers only) together with that array, and the
    result is written to <output>.  If the two containers disagree on K, a
    message is printed to stderr and the process exits with status 1.

    Args:
        argv: command-line arguments, parsed by docopt against the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)

    with container(opts['<ref>'], 'r') as z:
        K = z.meta['K']
        # NOTE(review): typecode 'L' is only guaranteed to hold 32 bits;
        # confirm that k-mers for the supported K fit on every target
        # platform.
        xs = array.array('L', readKmers(z))

    with container(opts['<input>'], 'r') as z0:
        K0 = z0.meta['K']
        if K0 != K:
            print >> sys.stderr, "mismatched K (%d)" % (K0, )
            sys.exit(1)

        # The output is only opened once the inputs are known to be
        # compatible.
        with container(opts['<output>'], 'w') as z:
            if 'counts' in z0.meta:
                ys = readKmersAndCounts(z0)
                writeKmersAndCounts(K, project2(xs, ys), z)
            else:
                ys = readKmers(z0)
                writeKmers(K, project1(xs, ys), z)
Example #6
0
def main(argv):
    """Pass the k-mer/count pairs of <input> through ``trim`` into <output>.

    The cutoff comes from -c; when it is absent (or zero) it is obtained from
    ``infer`` using the input's K and 'hist' metadata and reported on stderr.
    The output metadata starts as a copy of the input's, with the 'kmers' and
    'counts' entries deleted before the trimmed pairs are written.
    """
    opts = docopt.docopt(__doc__, argv)

    src_name = opts['<input>']
    dst_name = opts['<output>']

    raw_cutoff = opts['-c']
    cutoff = int(raw_cutoff) if raw_cutoff is not None else 0

    with container(src_name, 'r') as src:
        k = src.meta['K']
        histogram = src.meta['hist']
        if cutoff == 0:
            # No usable cutoff supplied: derive one from the histogram.
            cutoff = infer(k, histogram)
            print >> sys.stderr, 'inferred cutoff:', cutoff
        counted = readKmersAndCounts(src)
        with container(dst_name, 'w') as dst:
            dst.meta = src.meta.copy()
            for stale in ('kmers', 'counts'):
                del dst.meta[stale]
            writeKmersAndCounts(k, trim(counted, cutoff), dst)
Example #7
0
def main(argv):
    """Merge the k-mer/count streams of the input containers into <output>.

    The names given as <input> are grouped by ``pairs()``; every group is
    handled as either one or two container names.  With a single group the
    result is streamed straight into <output>.  Otherwise each group is first
    written (merged with ``merge()`` when it holds two inputs) as a named
    stream of a temporary container, and those streams are then folded
    together with ``merge()`` into <output>.  The temporary container is
    removed on every exit path.

    In all cases the final stream passes through ``hist(stream, h, acgt)``
    before being written, and the output metadata receives 'hist' (the dict
    ``h``) and 'acgt' (the four ``acgt`` counters divided by their sum).

    All inputs must share the same K; otherwise "mismatched K" is printed to
    stderr and the process exits with status 1.

    Args:
        argv: command-line arguments, parsed by docopt against the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)

    K = None

    out = opts['<output>']

    px = list(pairs(opts['<input>']))
    if len(px) == 1:
        # A single group needs no temporary container.
        with container(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            ix = px[0]
            if len(ix) == 1:
                with container(ix[0], 'r') as z0:
                    K = z0.meta['K']
                    xs = readKmersAndCounts(z0)
                    zs = hist(xs, h, acgt)
                    # Write the hist()-wrapped stream, as every other branch
                    # does.  Writing the raw `xs` left `zs` unused, so h and
                    # acgt were presumably never filled in - the
                    # normalisation below would then divide by zero.
                    writeKmersAndCounts(K, zs, z)
            else:
                with container(ix[0], 'r') as z0:
                    K = z0.meta['K']
                    xs = readKmersAndCounts(z0)
                    with container(ix[1], 'r') as z1:
                        K1 = z1.meta['K']
                        if K1 != K:
                            print >> sys.stderr, "mismatched K"
                            sys.exit(1)
                        ys = readKmersAndCounts(z1)
                        zs = hist(merge(xs, ys), h, acgt)
                        writeKmersAndCounts(K, zs, z)
            n = float(sum(acgt))
            acgt = [c/n for c in acgt]
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
        return

    tmps = []
    tmpnm = tmpfile('.pmc')
    try:
        # Stage 1: one named stream per group in the temporary container.
        with container(tmpnm, 'w') as z:
            for ix in px:
                nm = 'tmp-' + str(len(tmps))
                tmps.append(nm)
                with container(ix[0], 'r') as z0:
                    # The first container seen fixes K; all later ones must
                    # agree with it.
                    if K is None:
                        K = z0.meta['K']
                    else:
                        K0 = z0.meta['K']
                        if K0 != K:
                            print >> sys.stderr, "mismatched K"
                            sys.exit(1)
                    xs = readKmersAndCounts(z0)
                    if len(ix) == 1:
                        writeKmersAndCounts(K, xs, z, nm)
                    else:
                        with container(ix[1], 'r') as z1:
                            K1 = z1.meta['K']
                            if K1 != K:
                                print >> sys.stderr, "mismatched K"
                                sys.exit(1)
                            ys = readKmersAndCounts(z1)
                            writeKmersAndCounts(K, merge(xs, ys), z, nm)

        assert K is not None

        # Stage 2: fold all staged streams together into the output.
        with container(out, 'w') as z:
            h = {}
            acgt = [0, 0, 0, 0]
            with container(tmpnm, 'r') as z0:
                zs = None
                for fn in tmps:
                    xs = readKmersAndCounts(z0, fn)
                    if zs is None:
                        zs = xs
                    else:
                        zs = merge(zs, xs)
                zs = hist(zs, h, acgt)
                writeKmersAndCounts(K, zs, z)
            n = float(sum(acgt))
            acgt = [c/n for c in acgt]
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
    finally:
        # Runs for sys.exit(1) and for errors too, so the temporary container
        # is never left behind.
        if os.path.exists(tmpnm):
            os.remove(tmpnm)
Example #8
0
def main(argv):
    """Count the k-mers of the input read files and write them to <output>.

    K-mers from every read are collected in a ``KmerAccumulator``.  Whenever
    eight times the number of buffered k-mers reaches the budget (32 MiB by
    default, or -m mebibytes) the buffer is written as a named stream of a
    temporary container and cleared.  Those streams are then merged pairwise,
    round by round, until at most two remain, and the final result is written
    to <output> together with the metadata entries 'hist', 'acgt' (the four
    counters divided by their sum) and 'reads' (the number of reads seen).

    With -D only k-mers accepted by ``sub(S, d, x)`` are kept, where S comes
    from -S (default 0); recent decisions are cached in two sets that are
    reset once they exceed a million entries.

    Progress is reported on stderr every 2**20 reads.  The temporary
    container is removed once the output has been written.

    Args:
        argv: command-line arguments, parsed by docopt against the module
            docstring.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['<k>'])
    out = opts['<output>']
    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    buf = KmerAccumulator()
    n = 0  # k-mers added to buf since the last flush
    tmps = []
    acgt = [0, 0, 0, 0]

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

        S = 0
        if opts['-S'] is not None:
            S = int(opts['-S'])

        cacheYes = set([])
        cacheNo = set([])

    tmpnm = tmpfile('.pmc')
    # Create the temporary container up front; the flushes below open it in
    # append mode.
    with container(tmpnm, 'w') as z:
        pass

    PN = 1024 * 1024  # progress interval; a power of two so `&` can test it

    nr = 0
    t0 = time.time()
    for fn in opts['<input>']:
        for rds in mkParser(fn):
            for (nm, seq) in rds:
                nr += 1
                if nr & (PN - 1) == 0:
                    t1 = time.time()
                    print >> sys.stderr, 'reads processed:', nr, (PN) / (
                        t1 - t0), 'reads/second'
                    t0 = t1
                xs = kmersList(K, seq, True)
                if d is None:
                    buf.addList(xs)
                    for x in xs:
                        # Tally by the low two bits of the k-mer.
                        acgt[x & 3] += 1
                        n += 1
                else:
                    for x in xs:
                        if x in cacheNo:
                            continue
                        if x not in cacheYes:
                            if not sub(S, d, x):
                                cacheNo.add(x)
                                continue
                            cacheYes.add(x)
                        buf.add(x)
                        acgt[x & 3] += 1
                        n += 1
                    # Keep the decision caches bounded.
                    if len(cacheYes) > 1000000:
                        cacheYes = set([])
                    if len(cacheNo) > 1000000:
                        cacheNo = set([])
                if 8 * n >= Z:
                    # Budget reached: spill the buffer to a new stream.
                    tn = 'tmps-%d' % (len(tmps), )
                    tmps.append(tn)
                    with container(tmpnm, 'a') as z:
                        writeKmersAndCounts(K, mkPairs(buf.kmers()), z, tn)
                    buf.clear()
                    n = 0

    t1 = time.time()
    print >> sys.stderr, 'reads processed:', nr, (nr % PN) / (
        t1 - t0), 'reads/second'

    # If anything was spilled already, spill the remainder too so that all
    # data lives in the temporary container.
    if len(tmps) and len(buf):
        tn = 'tmps-%d' % (len(tmps), )
        tmps.append(tn)
        with container(tmpnm, 'a') as z:
            writeKmersAndCounts(K, mkPairs(buf.kmers()), z, tn)
        # Drop the accumulator; buf is only read again when tmps is empty,
        # which cannot be the case here.
        buf = []

    # Merge the spilled streams pairwise until at most two are left.
    while len(tmps) > 2:
        tmpnm2 = tmpfile('.pmc')
        tmps2 = []
        with container(tmpnm, 'r') as z0, container(tmpnm2, 'w') as z:
            ps = pairs(tmps)
            for p in ps:
                tn = 'tmps-%d' % (len(tmps2), )
                tmps2.append(tn)
                if len(p) == 1:
                    # Odd one out: copy it through unchanged.
                    writeKmersAndCounts(K, readKmersAndCounts(z0, p[0]), z, tn)
                    continue
                h = {}  # histogram of an intermediate merge is discarded
                merge2(z, K, readKmersAndCounts(z0, p[0]),
                       readKmersAndCounts(z0, p[1]), h, tn)
        os.remove(tmpnm)
        tmpnm = tmpnm2
        tmps = tmps2

    with container(out, 'w') as z:
        h = {}
        if len(tmps) == 0:
            # Nothing was spilled: everything is still in the accumulator.
            zs = hist(mkPairs(buf.kmers()), h)
            writeKmersAndCounts(K, zs, z)
        elif len(tmps) == 1:
            with container(tmpnm, 'r') as z0:
                writeKmersAndCounts(K, hist(readKmersAndCounts(z0, tmps[0]),
                                            h), z)
        else:
            assert len(tmps) == 2
            with container(tmpnm, 'r') as z0:
                merge2(z, K, readKmersAndCounts(z0, tmps[0]),
                       readKmersAndCounts(z0, tmps[1]), h)
        n = float(sum(acgt))
        acgt = [c / n for c in acgt]
        z.meta['hist'] = h
        z.meta['acgt'] = acgt
        z.meta['reads'] = nr

    # The last temporary container is no longer needed once the output has
    # been written; previously it was left behind.
    if os.path.exists(tmpnm):
        os.remove(tmpnm)