Esempio n. 1
0
def test_rank1():
    """
    Exercise `access` and `rank` on a sparse set of random 2*K-bit values.

    For every stored value, `access` must return its position in sorted
    order and `rank` of the value plus one must be one greater. For
    values that are not stored, `access` must return None and `rank`
    must fall between the neighbouring stored values.
    """
    random.seed(17)
    K = 27
    M = (1 << (2 * K)) - 1
    N = 10000
    xs = set()
    for _ in xrange(N):
        xs.add(random.randint(0, M))
    # Duplicates may have collapsed, so N is re-derived from the set.
    ys = sorted(xs)
    N = len(ys)
    S = sparse.sparse(2 * K, array.array('L', ys))
    assert S.count() == N

    # Stored values.
    for (i, y) in enumerate(ys):
        r = S.access(y)
        assert r is not None
        assert r == i
        s = S.rank(y + 1)
        assert s == r + 1

    # Values that are not stored.
    for _ in xrange(N):
        x = random.randint(0, M)
        while x in xs:
            x = (x + random.randint(1, M)) & M
        assert S.access(x) is None
        r = S.rank(x)
        if r >= N:
            assert ys[-1] < x
        else:
            assert ys[r] > x
        if r <= 0:
            assert ys[0] > x
        else:
            assert ys[r - 1] < x
Esempio n. 2
0
def buildIndex(K, inputs, output):
    """
    Build a k-mer index and write it to the container named `output`.

    Every FASTA file named in `inputs` is read; for each sequence the
    `K` length k-mers (both strands) are extracted, sorted and
    de-duplicated. The index maps each distinct k-mer to the numbers
    (counting from 0) of the sequences containing it.

    Written to the container: the distinct k-mers, an 'offsets' table
    (start of each k-mer's run of postings), a 'postings' table of
    sequence numbers, a 'lens' table holding the number of distinct
    k-mers per sequence, and the sequence names under meta key 'names'.
    """
    seqs = []
    for path in inputs:
        with openFile(path) as handle:
            seqs.extend(readFasta(handle))

    allKmers = []
    nms = []
    lens = array.array('I', [])
    for (idx, (nm, seq)) in enumerate(seqs):
        nms.append(nm)
        xs = sorted(kmers(K, seq, True))
        uniq(xs)
        # Replace the raw sequence by its k-mer list; only the k-mers
        # are needed from here on.
        seqs[idx] = [nm, xs]
        lens.append(len(xs))
        allKmers.extend(xs)
    allKmers.sort()
    uniq(allKmers)
    S = sparse(2*K, allKmers)

    # First pass: count how many sequences contain each k-mer.
    T = array.array('I', [0]) * (S.count() + 1)
    for (_, xs) in seqs:
        for x in xs:
            T[S.rank(x)] += 1

    # Turn the counts into exclusive prefix sums (start offsets).
    total = 0
    for (slot, c) in enumerate(T):
        T[slot] = total
        total += c

    # Second pass: fill in the postings, using a scratch copy of the
    # offsets as per-k-mer write cursors.
    # NOTE(review): typecode 'H' is an unsigned short, so sequence
    # numbers presumably must stay below 65536 - confirm.
    cursor = list(T)
    U = array.array('H', [0]) * total
    for (seqNo, (_, xs)) in enumerate(seqs):
        for x in xs:
            r = S.rank(x)
            U[cursor[r]] = seqNo
            cursor[r] += 1

    with container(output, 'w') as z:
        writeKmers(K, S.xs, z)
        n = write32(z, T, 'offsets')
        z.meta['T'] = n
        n = write16(z, U, 'postings')
        z.meta['U'] = n
        n = write32(z, lens, 'lens')
        z.meta['lens'] = n
        z.meta['names'] = nms
Esempio n. 3
0
 def __init__(self, z):
     """
     Load an index from the open container `z`.

     Reads the k-mer size from meta key 'K', the k-mers themselves, and
     the 'offsets', 'postings' and 'lens' tables, whose element counts
     are stored under meta keys 'T', 'U' and 'lens'. The sequence names
     come from meta key 'names'.
     """
     self.K = z.meta['K']
     self.S = sparse(2*self.K, array.array('L', readKmers(z)))
     self.T = array.array('I', read32(z, 'offsets', z.meta['T']))
     self.U = array.array('H', read16(z, 'postings', z.meta['U']))
     self.lens = array.array('I', read32(z, 'lens', z.meta['lens']))
     self.names = z.meta['names']
Esempio n. 4
0
def main(argv):
    """
    Print the unbranched k-mer paths (contigs) of each input container.

    For every container named in `<input>`, the k-mers are loaded and
    each k-mer not yet visited is tried as the start of a path. The
    path is extended while the current k-mer has exactly one successor
    that has not been visited. Paths whose rendered length
    (`len(path) + K - 1`) is below the minimum are dropped; the rest
    are printed in FASTA format as `>contig_<i>`, where `i` is the
    index of the starting k-mer.

    The minimum length is the value of `-l` if given, else `2*K`.

    A k-mer is not used as a start when its reverse complement has
    exactly one successor. Paths in which every k-mer meets that
    condition (e.g. a simple cycle) are therefore never started.
    """
    opts = docopt.docopt(__doc__, argv)

    L0 = None
    if opts['-l']:
        L0 = int(opts['-l'])

    for inp in opts['<input>']:
        with container(inp, 'r') as z:
            K = z.meta['K']
            L = L0
            if L is None:
                L = 2*K

            xs = array.array('L', readKmers(z))
            S = sparse(2*K, xs)

            seen = bitvec(S.count())
            for i in xrange(S.count()):
                if seen[i]:
                    continue

                x = S.select(i)
                xb = rc(K, x)
                xp = succ(K, S, xb)
                # `succ` returns a sequence (it is used with len() and
                # [0] below), so the test must be on its length; the
                # previous `xp == 1` could never be true.
                if len(xp) == 1:
                    # x isn't the start of a contig
                    continue

                pth = [x]
                seen[i] = 1
                xn = succ(K, S, x)
                while len(xn) == 1:
                    nxt = xn[0]
                    if seen[nxt] == 1:
                        break
                    x = S.select(nxt)
                    pth.append(x)
                    seen[nxt] = 1
                    # Mark the reverse complement as visited as well.
                    xb = rc(K, x)
                    j = S.rank(xb)
                    seen[j] = 1
                    xn = succ(K, S, x)

                if len(pth)+K-1 < L:
                    continue

                # The first k-mer is rendered in full; each later k-mer
                # contributes only the base in its low two bits.
                s = [render(K, pth[0])]
                for j in xrange(1, len(pth)):
                    s.append("ACGT"[pth[j]&3])

                print('>contig_%d\n%s' % (i, ''.join(s)))
Esempio n. 5
0
def main(argv):
    """
    Report which probes are found in each input k-mer container.

    The default probe set is `probesMTB`, named "1", "2", ... in order.
    With `-p FILE` the probes are read from FILE instead: lines
    starting with '#' are skipped, a line with one field is a probe
    named by its ordinal among the non-comment lines, and a line with
    two fields is `name probe`. Any other line is reported on stderr
    with its line number in FILE, and the program exits with status 1
    once the whole file has been checked.

    For each container in `<input>` the output is either one line per
    probe (`input<TAB>name<TAB>0|1`) when `-l` is set, or a single line
    `input<TAB>` followed by the string of 0/1 flags.
    """
    opts = docopt.docopt(__doc__, argv)

    nms = [str(i + 1) for i in xrange(len(probesMTB))]
    probes = probesMTB

    if opts['-p'] is not None:
        nms = []
        probes = []
        bad = False
        with open(opts['-p']) as f:
            i = 0
            for (ln, l) in enumerate(f, 1):
                if l[0] == '#':
                    continue
                i += 1
                t = l.split()
                if len(t) == 1:
                    nms.append(str(i))
                    probes.append(probe(t[0]))
                elif len(t) == 2:
                    nms.append(t[0])
                    probes.append(probe(t[1]))
                else:
                    bad = True
                    # Report the physical line number, not the count of
                    # non-comment lines, so the user can find the line.
                    sys.stderr.write('%s line %d, badly formatted.\n' % (
                        opts['-p'], ln))
        if bad:
            sys.exit(1)

    for inp in opts['<input>']:
        with container(inp, 'r') as z:
            K = z.meta['K']
            xs = readKmers(z)
            xs = sparse(2 * K, array.array('L', xs))

        res = ['1' if findProbe(p, K, xs) else '0' for p in probes]
        if opts['-l']:
            for (nm, r) in zip(nms, res):
                print('%s\t%s\t%s' % (inp, nm, r))
        else:
            print(inp + '\t' + ''.join(res))
Esempio n. 6
0
def test_rank1():
    """Check that `rank` of every stored value is its sorted position."""
    random.seed(17)
    K = 27
    M = (1 << (2 * K)) - 1
    N = 10000
    pool = set()
    for _ in xrange(N):
        pool.add(random.randint(0, M))
    # Duplicates may have collapsed, so N is re-derived from the set.
    xs = sorted(pool)
    N = len(xs)
    S = sparse.sparse(2 * K, array.array('L', xs))
    assert S.count() == N
    for (i, x) in enumerate(xs):
        assert S.rank(x) == i
Esempio n. 7
0
def test_empty():
    """A sparse set built from no values has count 0 and rank 0."""
    K = 27
    nothing = array.array('L')
    S = sparse.sparse(2 * K, nothing)
    assert S.count() == 0
    assert S.rank(12345) == 0
Esempio n. 8
0
def main(argv):
    """
    Build a gene k-mer index, or score samples against an existing one.

    With `-X`, the FASTA files in `<input>` are indexed with K = 27 and
    the result is written to the container `<genes>`:

    * 'S' - the sorted distinct k-mers of all sequences of length >= K;
    * 'T' - for each k-mer, the start of its run in 'U'/'V' (with one
      extra trailing entry holding the total);
    * 'U' - the sequence number of each occurrence;
    * 'V' - the signed position of each occurrence as produced by
      `kmersWithPos`;
    * meta 'lens', 'qacgt' (fraction of indexed k-mers by their low two
      bits), 'nms' and 'seqs'.

    Without `-X`, the index in `<genes>` is loaded and each container
    in `<input>` is treated as a sample. For every indexed sequence the
    best matching sample k-mers are gathered per position, chained into
    fragments, optionally linked through the sample, and scored; when
    the best score of a sequence is below -2 a tab separated line is
    printed:

        index, length, result length, kd, chi-squared value,
        log10 p-value, name, result sequence

    Progress messages go to stderr.
    """
    opts = docopt.docopt(__doc__, argv)

    if opts['-X']:
        K = 27
        S = []
        N = 0
        qacgt = [0, 0, 0, 0]
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        continue
                    for (x,p) in kmersWithPos(K, seq, True):
                        S.append(x)
                        qacgt[x&3] += 1
                        N += 1
        S.sort()
        qacgt = [float(c)/float(N) for c in qacgt]
        S = sparse(2*K, array.array('L', uniq(S)))
        lens = array.array('I', [])
        nms = []
        seqs = []
        n = 0
        # tmp[r] collects (sequence number, position) for the k-mer
        # with rank r.
        tmp = [[] for i in xrange(S.count())]
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        sys.stderr.write("warning: `%s' skipped\n" % (nm,))
                        continue
                    nms.append(nm)
                    seqs.append(seq)
                    lens.append(len(seq))
                    for (x,p) in kmersWithPos(K, seq, True):
                        r = S.rank(x)
                        tmp[r].append((n, p))
                    n += 1
        # Flatten tmp into offsets (T) plus parallel postings (U, V).
        T = array.array('I', [])
        U = array.array('I', [])
        V = array.array('i', [])
        t = 0
        for nps in tmp:
            T.append(t)
            t += len(nps)
            for (n, p) in nps:
                U.append(n)
                V.append(p)
        T.append(t)
        del tmp

        gfn = opts['<genes>']
        with container(gfn, 'w') as z:
            z.meta['K'] = K
            z.meta['S'] = S.count()
            write64(z, S.xs, 'S')
            z.meta['T'] = len(T)
            write64(z, T, 'T')
            z.meta['U'] = len(U)
            write32(z, U, 'U')
            z.meta['V'] = len(V)
            write32s(z, V, 'V')
            z.meta['lens'] = lens
            z.meta['qacgt'] = qacgt
            z.meta['nms'] = nms
            z.meta['seqs'] = seqs

        return

    sys.stderr.write("loading...\n")

    gfn = opts['<genes>']
    with container(gfn, 'r') as z:
        K = z.meta['K']
        S = array.array('L', read64(z, 'S', z.meta['S']))
        S = sparse(2*K, S)
        T = array.array('L', read64(z, 'T', z.meta['T']))
        U = array.array('I', read32(z, 'U', z.meta['U']))
        V = array.array('i', read32s(z, 'V', z.meta['V']))
        lens = z.meta['lens']
        qacgt = z.meta['qacgt']
        nms = z.meta['nms']
        seqs = z.meta['seqs']

    sys.stderr.write("done.\n")

    for fn in opts['<input>']:
        # L and Y are filled in by resolveAll, one slot per index
        # k-mer; below they are read as a match length (L) and the
        # matching sample k-mer (Y).
        L = array.array('B', [0 for i in xrange(S.count())])
        Y = array.array('L', [0 for i in xrange(S.count())])
        with container(fn, 'r') as z:
            sacgt = z.meta['acgt']
            xs = readKmers(z)
            X = array.array('L', xs)
        M = len(X)
        resolveAll(K, S, L, Y, X)
        X = sparse(2*K, X)

        g = sum([qp*sp for (qp, sp) in zip(qacgt, sacgt)])
        sys.stderr.write("g = %s\n" % (g,))
        nm = [null(g, M, j) for j in range(0, K+1)]

        # counts for computing distribution of prefix lengths
        cnt = [[0 for j in xrange(K+1)] for i in xrange(len(nms))]

        # the k-mers that we pulled by lcp from the sample
        # for each position of each query.
        P = [array.array('L', [0 for j in xrange(lens[i] - K + 1)]) for i in xrange(len(lens))]

        # the length of the lcp for each position of each query.
        Q = [array.array('B', [0 for j in xrange(lens[i] - K + 1)]) for i in xrange(len(lens))]

        for i in xrange(S.count()):
            for j in xrange(T[i], T[i+1]):
                n = U[j]
                p = V[j]
                y = Y[i]
                l = L[i]
                cnt[n][l] += 1
                # A positive p is a 1-based offset; any other value
                # maps to offset -(p + 1) and the matched k-mer is
                # reverse complemented (presumably the reverse strand -
                # confirm against kmersWithPos).
                if p > 0:
                    p -= 1
                else:
                    p = -(p + 1)
                    y = rc(K, y)
                # Keep the longest match seen for each position.
                if l > Q[n][p]:
                    Q[n][p] = l
                    P[n][p] = y

        for i in xrange(len(nms)):
            # iterate over the queries

            qc = math.log(K*0.05/float(lens[i] - K + 1)/2)

            # Link up "de Bruijn" sequences: positions j-1 and j are
            # joined when the last K-1 bases of the k-mer at j-1 equal
            # the first K-1 bases of the k-mer at j.
            m = (1 << (2*K - 2)) - 1
            py = 0
            u = unionfind()
            for j in xrange(lens[i] - K + 1):
                x = P[i][j]
                y = x >> 2
                if j > 0:
                    d = ham(py, y)
                    if d == 0:
                        u.union(j-1, j)
                py = x & m

            # Gather up the de Bruijn fragments
            udx = {}
            for j in xrange(lens[i] - K + 1):
                v = u.find(j)
                if v not in udx:
                    udx[v] = []
                udx[v].append(j)

            # Index the left hand k-mers
            idxLhs = {}
            kx = []
            for (jx, js) in udx.iteritems():
                q = 0
                for j in js:
                    q += math.log1p(-nm[Q[i][j]])
                if q > math.log(0.05/len(js)):
                    continue
                # Negated length so that sorting puts the longest
                # fragments first.
                kx.append((-len(js), jx))
                idxLhs[P[i][js[0]]] = jx
            kx.sort()

            # Attempt to link up fragments
            links = {}
            for (_, jx) in kx:
                jR = udx[jx][-1]
                # NOTE(review): positions run from 0 to lens[i] - K, so
                # this comparison can never be true; it looks like it
                # was meant to skip a fragment ending at the last
                # position (jR == lens[i] - K) - confirm before changing.
                if jR == lens[i] - K + 1:
                    continue
                x = P[i][jR]
                xs = []
                lnk = None
                # Walk at most 100 unambiguous steps through the sample
                # looking for the first k-mer of another fragment.
                for k in xrange(100):
                    ys = succ(K, X, x)
                    if len(ys) != 1:
                        break
                    x = ys[0]
                    if x in idxLhs:
                        lnk = idxLhs[x]
                        break
                    xs.append(x)
                if lnk is not None:
                    links[jx] = xs
                    u.union(jx, lnk)

            # Gather up the linked fragments
            vdx = {}
            for j in [jx for (_, jx) in kx]:
                v = u.find(j)
                if v not in vdx:
                    vdx[v] = []
                vdx[v].append(j)

            res = []
            for (jxx, jxs) in vdx.iteritems():
                # Order the fragments by start position
                fs = [(udx[jx][0], jx) for jx in jxs]
                fs.sort()
                sxs = []
                for fj in xrange(len(fs)):
                    (_, jx) = fs[fj]
                    beg = udx[jx][0]
                    end = udx[jx][-1] + 1
                    if fj == 0:
                        # Pad the positions before the first fragment.
                        for j in xrange(beg):
                            sxs.append((0, 0))
                    xs = links.get(jx, None)
                    for j in xrange(beg, end):
                        x = P[i][j]
                        l = Q[i][j]
                        sxs.append((x, l))
                    if xs:
                        # Linking k-mers are scored as full-length (K)
                        # matches. This was a literal 27, which is only
                        # right for K == 27 and indexes past the end of
                        # `nm` for a smaller K.
                        for x in xs:
                            sxs.append((x, K))
                    else:
                        # No link: pad up to the next fragment, or to
                        # the end of the query after the last one.
                        if fj < len(fs) - 1:
                            nxt = fs[fj+1][0]
                        else:
                            nxt = lens[i] - K + 1
                        for j in xrange(end, nxt):
                            sxs.append((0, 0))
                # Accumulate, per base position, a log score for each
                # of the four bases from every k-mer covering it.
                seq = [[0, 0, 0, 0] for j in xrange(len(sxs) + K - 1)]
                for j in xrange(len(sxs)):
                    (x, l) = sxs[j]
                    p = math.log1p(-nm[l])
                    for k in xrange(K):
                        seq[j + K - k - 1][x&3] += p
                        x >>= 2
                ax = []
                p = None
                inf = False
                for j in xrange(len(seq)):
                    # Bit k of b is set when base k scores below qc.
                    b = 0
                    for k in xrange(4):
                        if seq[j][k] < qc:
                            b |= 1 << k
                    ax.append(fasta(b))
                    ssj = sum(seq[j])
                    if p is None:
                        p = ssj
                    else:
                        p = logAdd(p, ssj)
                    if ssj > -1e-300:
                        inf = True
                dst = counts2cdf(cnt[i])
                (_, kd) = ksDistance2(dst, nm)
                df = math.ceil(len(seq)/float(K))
                if inf:
                    q = 1e300
                    pv = 0.0
                else:
                    q = 2*math.exp(p)
                    pv = chi2(df, q)
                res.append((pv, q, kd, ''.join(ax)))

            if len(res) == 0:
                continue

            # Smallest pv first; only the best result is reported.
            res.sort()
            if res[0][0] < -2:
                pv = res[0][0]/math.log(10)
                c2 = res[0][1]
                kd = res[0][2]
                a = res[0][3]
                print('%d\t%d\t%d\t%g\t%g\t%g\t%s\t%s' % (i, lens[i], len(a), kd, c2, pv, nms[i], a))
            sys.stdout.flush()