Esempio n. 1
0
    def score(self, res, lhs, seq, rhs):
        """Annotate ``res`` in place with Hamming-distance statistics.

        The allele is assembled with ``consAllele`` from ``lhs``, a middle
        part and ``rhs``.  The middle part is ``seq``, or a run of ``'N'``
        as long as ``res['allele']`` when ``seq`` is None.

        Keys written to ``res``: ``hammingProfile``, ``hammingCdf``,
        ``ksDist``, ``hamming`` and ``binom``.  Nothing is returned.
        """
        k = self.K
        path = res['path']

        # profile[d] counts path k-mers at Hamming distance d (0..K).
        profile = [0] * (k + 1)

        if seq is None:
            middle = len(res['allele']) * 'N'
        else:
            middle = seq
        allele = consAllele(lhs, middle, rhs,
                            (k - 1) + res['lhsPos'],
                            (k - 1) + res['rhsPos'])

        if seq is None:
            # Without a sequence to compare against, every path k-mer is
            # booked at distance zero.
            profile[0] += len(path)
        else:
            for (a, b) in zip(path, kmersList(k, allele)):
                profile[ham(a, b)] += 1

        cdf = counts2cdf(profile)
        res['hammingProfile'] = profile
        res['hammingCdf'] = cdf
        res['ksDist'] = ksDistance2(cdf, self.nullCdf)[0]
        res['hamming'] = hamming(allele, renderPath(k, path))

        # Product over distances of binModel[d] ** profile[d]; its
        # complement is stored under 'binom'.
        prod = 1.0
        for (dist, count) in enumerate(profile):
            prod *= math.pow(self.binModel[dist], count)
        res['binom'] = 1.0 - prod
Esempio n. 2
0
def nearest3(K, xs, x):
    """Find the entry of ``xs`` closest to ``x`` after dropping the low 3 bits.

    Both ``x`` and every candidate are shifted right by three bits before
    being compared with ``ham``.  On ties the earliest candidate wins.

    Returns ``(distance, value)`` where ``value`` is the winning candidate
    shifted right by three bits.  If no candidate scores below ``K + 1``
    (for instance when ``xs`` is empty) the result is ``(K + 1, x >> 3)``.
    """
    target = x >> 3
    best_dist = K + 1
    best = x
    for cand in xs:
        dist = ham(target, cand >> 3)
        if dist < best_dist:
            best_dist, best = dist, cand
    return (best_dist, best >> 3)
Esempio n. 3
0
def nearest(K, xs, x):
    """Find the entry of ``xs`` with the smallest ``ham`` distance to ``x``.

    On ties the earliest candidate wins.

    Returns ``(distance, value)``.  If no candidate scores below ``K + 1``
    (for instance when ``xs`` is empty) the result is ``(K + 1, x)``.
    """
    best_dist = K + 1
    best = x
    for cand in xs:
        dist = ham(x, cand)
        if dist < best_dist:
            best_dist, best = dist, cand
    return (best_dist, best)
Esempio n. 4
0
def nearest(m, xs, y):
    """Collect the positions attached to the keys of ``xs`` nearest to ``y``.

    ``xs`` is a mapping from key to an iterable of positions.  Keys are
    compared to ``y`` with ``ham``; only keys at the minimum distance seen
    contribute, and that minimum is never allowed to exceed ``m``.

    Returns a list holding the positions of every key at the minimum
    distance (empty when all keys are farther than ``m``).
    """
    best = m
    hits = []
    for (key, positions) in xs.iteritems():
        dist = ham(key, y)
        if dist < best:
            # Strictly closer key: forget everything gathered so far.
            best, hits = dist, []
        if dist == best:
            hits.extend(positions)
    return hits
Esempio n. 5
0
    def path_kmers(self, xs):
        """Return, per position of ``xs``, the candidate k-mers from ``self.X``.

        A candidate for position ``i`` must agree with ``xs[i]`` under the
        mask ``self.get_mask(i, len(xs))`` and lie at ``ham`` distance
        strictly below ``self.D``.  The candidate lists are then pruned
        repeatedly with ``debruijn_intersection`` on neighbouring
        positions until nothing changes.

        Returns the list of candidate lists, or None as soon as any
        position is left without candidates.
        """
        n = len(xs)

        # Step 1: gather up all likely looking k-mers
        #
        candidates = []
        for (pos, x) in enumerate(xs):
            mask = self.get_mask(pos, n)
            anchor = x & mask
            near = [y for y in self.X.iterkeys()
                    if (y & mask) == anchor and ham(x, y) < self.D]
            if not near:
                return None
            candidates.append(near)

        # Step 2: eliminate k-mers that don't form part
        # of a complete path from xs[0]--xs[-1]
        #
        changed = True
        while changed:
            changed = False
            for j in range(1, n):
                left = candidates[j - 1]
                right = candidates[j]
                (left_kept, right_kept) = debruijn_intersection(
                    self.K, left, right)
                if len(left_kept) == 0 or len(right_kept) == 0:
                    return None
                # A shorter list means something was pruned, so another
                # sweep over all pairs is needed.
                if len(left_kept) != len(left):
                    candidates[j - 1] = left_kept
                    changed = True
                if len(right_kept) != len(right):
                    candidates[j] = right_kept
                    changed = True

        return candidates
Esempio n. 6
0
def trim(K, kx):
    """Delete low-count k-mers from ``kx`` in place.

    ``kx`` maps k-mer -> count.  K-mers sharing a masked value (masks from
    ``make_masks(K, 3)``) and lying within ``ham`` distance 3 of each other
    are merged into clusters.  Within each cluster, every k-mer whose count
    is below 5% of the cluster's largest count is removed from ``kx``.

    Nothing is returned.
    """
    tau = 3
    frac = 0.05
    masks = make_masks(K, tau)

    # Bucket the k-mers by each of their masked values.
    buckets = {}
    for x in kx.iterkeys():
        for m in masks:
            buckets.setdefault(x & m, []).append(x)

    # Join k-mers that share a bucket and are close enough.
    clusters = unionfind()
    for members in buckets.itervalues():
        for (i, x) in enumerate(members):
            for j in xrange(i + 1, len(members)):
                y = members[j]
                if ham(x, y) <= tau:
                    clusters.union(x, y)

    # Group (count, k-mer) pairs by cluster representative.
    by_root = {}
    for x in kx.iterkeys():
        by_root.setdefault(clusters.find(x), []).append((kx[x], x))

    # After sorting, the largest count is last; walk up from the smallest
    # and stop at the first one that reaches the cutoff.
    doomed = set()
    for group in by_root.values():
        group.sort()
        cutoff = frac * group[-1][0]
        for (count, x) in group:
            if not count < cutoff:
                break
            doomed.add(x)

    for x in doomed:
        del kx[x]
Esempio n. 7
0
 def grp(itms):
     """Yield clusters of ``itms`` that disagree at one two-bit field.

     Items are linked whenever their ``ham`` distance is at most ``d``;
     connected items form a cluster.  Singleton clusters are skipped, as
     are clusters whose members all carry the same value in the two bits
     selected by ``(item >> T) & 3``.

     ``d`` and ``T`` are read from the enclosing scope, which is not
     visible from this block.
     """
     count = len(itms)
     links = unionfind()
     for i in xrange(count):
         for j in xrange(i + 1, count):
             if ham(itms[i], itms[j]) <= d:
                 links.union(i, j)

     # Representative index -> indices of the items in that cluster.
     members = {}
     for i in xrange(count):
         members.setdefault(links.find(i), []).append(i)

     for idxs in members.itervalues():
         if len(idxs) == 1:
             continue
         cluster = [itms[k] for k in idxs]
         # One bit per distinct two-bit value seen in the cluster.
         seen = 0
         for item in cluster:
             seen |= 1 << ((item >> T) & 3)
         if popcnt(seen) == 1:
             continue
         yield cluster
Esempio n. 8
0
def main(argv):
    """Command-line entry point: build a k-mer index, or query samples with one.

    Options are parsed by docopt from the module docstring.

    With ``-X``: every FASTA file in ``<input>`` is read, all k-mers
    (K fixed at 27) are indexed, and the index is written to the casket
    file ``<genes>``; the function then returns.

    Without ``-X``: the index is loaded from ``<genes>`` and each
    ``<input>`` is opened as a k-mer sample.  For every indexed sequence
    whose best candidate passes the final threshold, one tab-separated
    line is printed to stdout with the fields: sequence index, sequence
    length, length of the reported string, ``kd``, ``c2``, ``pv``,
    sequence name, and the reported string.

    Progress and warnings go to stderr.
    """
    opts = docopt.docopt(__doc__, argv)

    if opts['-X']:
        # ---- Index-building mode ----
        K = 27
        S = []
        N = 0
        qacgt = [0, 0, 0, 0]
        # Pass 1: collect every k-mer and tally its low two bits (x & 3).
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        continue
                    for (x, p) in kmersWithPos(K, seq, True):
                        S.append(x)
                        qacgt[x & 3] += 1
                        N += 1
        S.sort()
        # Turn the tallies into frequencies.
        # NOTE(review): N stays 0 when no sequence has length >= K, which
        # would make this divide by zero.
        qacgt = [float(c) / float(N) for c in qacgt]
        S = sparse(2 * K, array.array('L', uniq(S)))
        lens = []
        nms = []
        seqs = []
        n = 0
        # tmp[r] gathers the (sequence index, position) occurrences of the
        # k-mer whose rank in S is r.
        tmp = [[] for i in xrange(S.count())]
        # Pass 2: record names, sequences and lengths, and fill tmp.
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        print >> sys.stderr, "warning: `%s' skipped" % (nm, )
                        continue
                    nms.append(nm)
                    seqs.append(seq)
                    lens.append(len(seq))
                    for (x, p) in kmersWithPos(K, seq, True):
                        r = S.rank(x)
                        tmp[r].append((n, p))
                    n += 1
        # Flatten tmp: T holds offsets such that entries T[r]..T[r+1]-1 of
        # U (sequence indices) and V (signed positions) belong to rank r.
        T = array.array('I', [])
        U = array.array('I', [])
        V = array.array('i', [])
        t = 0
        for nps in tmp:
            T.append(t)
            t += len(nps)
            # This rebinds n; the sequence counter is not used after here.
            for (n, p) in nps:
                U.append(n)
                V.append(p)
        T.append(t)
        del tmp

        gfn = opts['<genes>']
        with casket(gfn, 'w') as z:
            meta = {}
            meta['K'] = K
            meta['lens'] = lens
            meta['qacgt'] = qacgt
            meta['nms'] = nms
            meta['seqs'] = seqs

            z.add_content('__meta__', json.dumps(meta))
            write64(z, S.xs, 'S')
            write32(z, T, 'T')
            write32(z, U, 'U')
            write32s(z, V, 'V')

        return

    # ---- Query mode ----
    print >> sys.stderr, "loading..."

    gfn = opts['<genes>']
    with casket(gfn, 'r') as z:
        mf = z.open('__meta__')
        meta = json.load(mf)
        K = meta['K']
        lens = meta['lens']
        qacgt = meta['qacgt']
        nms = meta['nms']
        seqs = meta['seqs']

        S = read64(z, 'S')
        S = sparse(2 * K, S)
        T = read32(z, 'T')
        U = read32(z, 'U')
        V = read32s(z, 'V')

    print >> sys.stderr, "done."

    for fn in opts['<input>']:
        # One slot per indexed k-mer.  L and Y are zero-initialised here
        # and only touched by resolveAll before being read below, so
        # presumably resolveAll fills them in place (L: match length,
        # Y: matched sample k-mer) -- confirm against resolveAll.
        L = array.array('B', [0 for i in xrange(S.count())])
        Y = array.array('L', [0 for i in xrange(S.count())])
        with kmers(fn, 'r') as z:
            sacgt = z.meta['acgt']
            xs = readKmers(z)
            X = array.array('L', xs)
        M = len(X)
        resolveAll(K, S, L, Y, X)
        X = sparse(2 * K, X)

        # Dot product of the index frequencies with the sample's 'acgt'
        # meta values.
        g = sum([qp * sp for (qp, sp) in zip(qacgt, sacgt)])
        print >> sys.stderr, "g =", g
        # One value from null() per length 0..K; used below as
        # log1p(-nm[l]).
        nm = [null(g, M, j) for j in range(0, K + 1)]

        # counts for computing distribution of prefix lengths
        cnt = [[0 for j in xrange(K + 1)] for i in xrange(len(nms))]

        # the k-mers that we pulled by lcp from the sample
        # for each position of each query.
        P = [
            array.array('L', [0 for j in xrange(lens[i] - K + 1)])
            for i in xrange(len(lens))
        ]

        # the length of the lcp for each position of each query.
        Q = [
            array.array('B', [0 for j in xrange(lens[i] - K + 1)])
            for i in xrange(len(lens))
        ]

        # Scatter the per-k-mer results onto (query, position) slots,
        # keeping the longest match seen for each slot.
        for i in xrange(S.count()):
            for j in xrange(T[i], T[i + 1]):
                n = U[j]
                p = V[j]
                y = Y[i]
                l = L[i]
                cnt[n][l] += 1
                # Positive p is turned into a 0-based index by subtracting
                # one; otherwise the index is -(p + 1) and the k-mer goes
                # through rc() (presumably the reverse-complement strand --
                # confirm against kmersWithPos).
                if p > 0:
                    p -= 1
                else:
                    p = -(p + 1)
                    y = rc(K, y)
                if l > Q[n][p]:
                    Q[n][p] = l
                    P[n][p] = y

        for i in xrange(len(nms)):
            # iterate over the queries

            # Per-base log threshold used when calling bases below.
            qc = math.log(K * 0.05 / float(lens[i] - K + 1) / 2)

            # Link up "de Bruijn" sequences
            # Positions j-1 and j are joined when the low 2K-2 bits of the
            # previous k-mer and the current k-mer shifted right by two
            # bits are at ham distance 0.
            m = (1 << (2 * K - 2)) - 1
            py = 0
            u = unionfind()
            for j in xrange(lens[i] - K + 1):
                x = P[i][j]
                y = x >> 2
                if j > 0:
                    d = ham(py, y)
                    if d == 0:
                        u.union(j - 1, j)
                py = x & m

            # Gather up the de Bruijn fragments
            # udx: representative -> positions, in ascending order.
            udx = {}
            for j in xrange(lens[i] - K + 1):
                v = u.find(j)
                if v not in udx:
                    udx[v] = []
                udx[v].append(j)

            # Index the left hand k-mers
            # Fragments whose summed log1p(-nm[...]) stays above
            # log(0.05 / len(js)) are dropped.  Kept fragments are sorted
            # longest first (key is -len), and idxLhs maps each kept
            # fragment's first k-mer to its representative.
            idxLhs = {}
            kx = []
            for (jx, js) in udx.iteritems():
                q = 0
                for j in js:
                    q += math.log1p(-nm[Q[i][j]])
                if q > math.log(0.05 / len(js)):
                    continue
                kx.append((-len(js), jx))
                idxLhs[P[i][js[0]]] = jx
            kx.sort()

            # Attempt to link up fragments
            # From each fragment's last k-mer, follow succ() for at most
            # 100 steps while there is exactly one successor, stopping on
            # reaching a k-mer that starts another kept fragment.
            links = {}
            for (_, jx) in kx:
                jR = udx[jx][-1]
                # NOTE(review): positions in udx come from
                # xrange(lens[i] - K + 1), so jR is at most lens[i] - K
                # and this guard looks unreachable; `lens[i] - K` may have
                # been intended -- confirm.
                if jR == lens[i] - K + 1:
                    continue
                x = P[i][jR]
                xs = []
                lnk = None
                for k in xrange(100):
                    ys = succ(K, X, x)
                    if len(ys) != 1:
                        break
                    x = ys[0]
                    if x in idxLhs:
                        lnk = idxLhs[x]
                        break
                    xs.append(x)
                if lnk is not None:
                    # xs holds the k-mers walked through between the two
                    # fragments (possibly none).
                    links[jx] = xs
                    u.union(jx, lnk)

            # Gather up the linked fragments
            vdx = {}
            for j in [jx for (_, jx) in kx]:
                v = u.find(j)
                if v not in vdx:
                    vdx[v] = []
                vdx[v].append(j)

            res = []
            for (jxx, jxs) in vdx.iteritems():
                # Order the fragments by start position
                fs = [(udx[jx][0], jx) for jx in jxs]
                fs.sort()
                # sxs: one (k-mer, length) pair per position, with (0, 0)
                # as padding where no fragment covers a position.
                sxs = []
                for fj in xrange(len(fs)):
                    (_, jx) = fs[fj]
                    beg = udx[jx][0]
                    end = udx[jx][-1] + 1
                    if fj == 0:
                        # Pad the positions before the first fragment.
                        for j in xrange(beg):
                            sxs.append((0, 0))
                    xs = links.get(jx, None)
                    for j in xrange(beg, end):
                        x = P[i][j]
                        l = Q[i][j]
                        sxs.append((x, l))
                    if xs:
                        # NOTE(review): the literal 27 matches the K used
                        # when building with -X; presumably K was meant --
                        # confirm.
                        for x in xs:
                            sxs.append((x, 27))
                    else:
                        # No walked k-mers: pad up to the next fragment,
                        # or to the end of the query for the last one.
                        if fj < len(fs) - 1:
                            nxt = fs[fj + 1][0]
                        else:
                            nxt = lens[i] - K + 1
                        for j in xrange(end, nxt):
                            sxs.append((0, 0))
                # Per-base accumulators, four slots per position.  Each
                # k-mer adds log1p(-nm[l]) to the slot selected by each of
                # its K two-bit fields, lowest field at the rightmost base.
                seq = [[0, 0, 0, 0] for j in xrange(len(sxs) + K - 1)]
                for j in xrange(len(sxs)):
                    (x, l) = sxs[j]
                    p = math.log1p(-nm[l])
                    for k in xrange(K):
                        seq[j + K - k - 1][x & 3] += p
                        x >>= 2
                ax = []
                p = None
                inf = False
                for j in xrange(len(seq)):
                    # Bitmask of the slots scoring below the threshold,
                    # rendered by fasta() (presumably to an ambiguity
                    # code -- confirm against fasta).
                    b = 0
                    for k in xrange(4):
                        if seq[j][k] < qc:
                            b |= 1 << k
                    ax.append(fasta(b))
                    ssj = sum(seq[j])
                    if p is None:
                        p = ssj
                    else:
                        p = logAdd(p, ssj)
                    # A position whose total is essentially zero flags the
                    # candidate; see the `inf` branch below.
                    if ssj > -1e-300:
                        inf = True
                dst = counts2cdf(cnt[i])
                (_, kd) = ksDistance2(dst, nm)
                df = math.ceil(len(seq) / float(K))
                if inf:
                    q = 1e300
                    pv = 0.0
                else:
                    q = 2 * math.exp(p)
                    pv = chi2(df, q)
                res.append((pv, q, kd, ''.join(ax)))

            if len(res) == 0:
                continue

            # Smallest pv first; only the best candidate is reported, and
            # only when its pv is below -2.
            res.sort()
            if res[0][0] < -2:
                #ed = lev(seqs[i], res[0][2])
                ed = 0
                # Presumably converts a natural-log value to log10 --
                # confirm against chi2.
                pv = res[0][0] / math.log(10)
                c2 = res[0][1]
                kd = res[0][2]
                a = res[0][3]
                print '%d\t%d\t%d\t%g\t%g\t%g\t%s\t%s' % (
                    i, lens[i], len(a), kd, c2, pv, nms[i], a)
            sys.stdout.flush()
Esempio n. 9
0
def findAnchors(K, seq, mx, isLhs, D):
    """Find anchor k-mers from ``mx`` lying near the k-mers of ``seq``.

    ``mx`` maps k-mer -> coverage.  Only k-mers whose coverage reaches
    ``int(c / e**1.5)``, with ``c`` the highest coverage among the k-mers
    of ``seq`` present in ``mx``, are considered.  A considered k-mer is a
    candidate at a position when the k-mer of ``seq`` at that position is
    itself considered and the two are within ``ham`` distance ``D``.
    A candidate is dropped again when ``debruijn`` links it to a candidate
    at the neighbouring position (the following one for ``isLhs``, the
    preceding one otherwise).

    Returns a sorted list of ``(k-mer, position)`` pairs; for ``isLhs``
    the position is ``len(seq) - K`` minus the 0-based offset.  When no
    k-mer of ``seq`` has non-zero coverage in ``mx``, an empty *set* is
    returned instead of a list.
    """
    # Positions from kmersWithPosList are shifted down by one.
    kmer_pos = [(x, p - 1) for (x, p) in kmersWithPosList(K, seq, False)]

    # Find the highest coverage k-mer that intersects.
    top = 0
    for (x, _) in kmer_pos:
        if x in mx and mx[x] >= top:
            top = mx[x]
    if top == 0:
        return set([])

    # Seeds should be in the same order of magnitude
    # as the highest-coverage seed.
    threshold = int(math.exp(math.log(top) - 1.5))

    strong = {}
    for (x, c) in mx.iteritems():
        if not c < threshold:
            strong[x] = c
    candidates = strong.keys()

    # near[p]: candidates within distance D of the k-mer at position p.
    near = {}
    for (x, p) in kmer_pos:
        hits = set([])
        near[p] = hits
        if x in strong:
            for y in candidates:
                if not ham(x, y) > D:
                    hits.add(y)

    empty = set([])
    anchors = set([])
    if isLhs:
        for (_, p) in kmer_pos:
            before = near.get(p - 1, empty)
            for s in near.get(p, empty):
                anchors.add((s, p))
                # A predecessor leading into s is not an anchor itself.
                for q in before:
                    if debruijn(K, q, s):
                        anchors.discard((q, p - 1))
    else:
        for (_, p) in reversed(kmer_pos):
            after = near.get(p + 1, empty)
            for s in near.get(p, empty):
                anchors.add((s, p))
                # A successor reached from s is not an anchor itself.
                for q in after:
                    if debruijn(K, s, q):
                        anchors.discard((q, p + 1))

    if isLhs:
        last = len(seq) - K
        return sorted([(x, last - p) for (x, p) in anchors])
    return sorted(anchors)