Beispiel #1
0
def showAnchoredReads(K, anks, rds):
    A = set(anks.keys())

    res = {}
    for rd in rds:
        (xs, ys) = kmersWithPosLists(K, rd[1])

        fwd = set([])
        for (x,p) in xs:
            if x in A:
                fwd.add((x,p))

        rev = set([])
        for (y,p) in ys:
            if y in A:
                rev.add((y,p))

        if len(fwd) > 0:
            p0 = min([p for (x,p) in fwd])
            fwd = frozenset([(x,p-p0) for (x,p) in fwd])
            seq = rd[1]
            if fwd not in res:
                res[fwd] = {}
            k = (p0,seq)
            if k not in res[fwd]:
                res[fwd][k] = 0
            res[fwd][k] += 1

        if len(rev) > 0:
            p0 = min([p for (y,p) in rev])
            rev = frozenset([(y,p-p0) for (y,p) in rev])
            seq = revComp(rd[1])
            if rev not in res:
                res[rev] = {}
            k = (p0,seq)
            if k not in res[rev]:
                res[rev][k] = 0
            res[rev][k] += 1

    for s in sorted(res.keys()):
        q = max([p for (p,seq) in res[s].keys()])
        lab = ','.join(sorted([anks[x] for (x,p) in s]))
        hdr = [lab,'\t\t', (q-1)*' ']
        pp = None
        for (p,x) in sorted([(p1,x1) for (x1,p1) in s]):
            if pp is None:
                hdr += [render(K, x)]
            else:
                d = p - pp - K
                hdr += [d*' ', render(K, x)]
            pp = p
        print ''.join(hdr)
        vv = []
        for (k,c) in sorted(res[s].items()):
            (p0, seq) = k
            vv.append((p0, c, seq))
        for (p0, c, seq) in sorted(vv):
            print '%d\t%d\t%s%s' % (p0, c, (q-p0)*' ', seq)
        print
Beispiel #2
0
def renderPath(K, xs):
    """Return the string spelled by a path of k-mers.

    The first k-mer is rendered in full with render(K, ...); every later
    k-mer adds a single letter picked from 'ACGT' by its two lowest bits.
    An empty path gives ''.
    """
    if len(xs) > 0:
        head = render(K, xs[0])
        tail = ['ACGT'[x & 3] for x in xs[1:]]
        return ''.join([head] + tail)
    return ''
Beispiel #3
0
def test_fixed_path_kmers_1():
    """Check fixed_path_kmers on k-mer counts from simulated noisy reads.

    N copies of a fixed sequence are generated, each base being replaced by
    a random different base with probability e.  The k-mers of all copies
    are counted into X.  fixed_path_kmers(K, D, X, xs) must then return one
    entry per k-mer of the clean sequence, each entry must contain the
    clean k-mer, and no other member of an entry may have a higher count
    than the clean k-mer.
    """
    random.seed(17)
    K = 25
    D = 3
    N = 100
    e = 0.01
    bases = 'ACGT'
    # For every base, the three other bases in alphabetical order.
    alts = dict([(b, [c for c in bases if c != b]) for b in bases])
    seq = 'TACTTGCACTGGGAGGCACAGCGGCTTTTCAGTGTCACAGGTATTACGAG'
    xs = kmersList(K, seq)

    X = {}
    for _ in range(N):
        noisy = []
        for b in seq:
            # One random() draw per base, plus one choice() per substitution.
            if random.random() < e:
                b = random.choice(alts[b])
            noisy.append(b)
        for y in kmersList(K, ''.join(noisy)):
            X[y] = X.get(y, 0) + 1

    Y = fixed_path_kmers(K, D, X, xs)
    assert Y is not None
    assert len(Y) == len(xs)
    for i in range(len(Y)):
        x = xs[i]
        cands = Y[i]
        assert x in cands
        # Collect any candidate that out-counts the true k-mer.
        V = [(render(K, y), X[y]) for y in cands if y != x and X[y] > X[x]]
        assert len(V) == 0
Beispiel #4
0
def renderPath(K, xs):
    """Spell out a k-mer path as a single string.

    render(K, xs[0]) supplies the text for the first k-mer; each following
    k-mer contributes the letter of "ACGT" indexed by its low two bits.
    Returns '' when xs is empty.
    """
    if len(xs) == 0:
        return ''
    pieces = [render(K, xs[0])]
    pieces.extend(["ACGT"[x & 3] for x in xs[1:]])
    return ''.join(pieces)
Beispiel #5
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    inp = opts['<input>']
    with kmers(inp, 'r') as z:
        K = z.meta['K']
        if 'kmers' not in z.meta:
            print >> sys.stderr, 'cannot dump "%s" as it contains no k-mers' % (inp,)
            return
        if 'counts' in z.meta:
            xs = readKmersAndCounts(z)
            for (x, c) in xs:
                print '%s\t%d' % (render(K, x), c)
        else:
            xs = readKmers(z)
            for x in xs:
                print render(K, x)
Beispiel #6
0
 def trace1(self, pth, xe, n):
     if len(pth) > n:
         return
     if pth[-1] == xe:
         yield pth
     print len(pth), render(self.K, pth[-1])
     for y in self.succ(pth[-1]):
         for p in self.trace1(pth + [y], xe, n):
             yield p
Beispiel #7
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    L0 = None
    if opts['-l']:
        L0 = int(opts['-l'])

    for inp in opts['<input>']:
        with kmers(inp, 'r') as z:
            K = z.meta['K']
            L = L0
            if L is None:
                L = 2*K

            xs = array.array('L', readKmers(z))
            S = sparse(2*K, xs)

            seen = bitvec(S.count())
            for i in xrange(S.count()):
                if seen[i]:
                    continue

                x = S.select(i)
                xb = rc(K, x)
                xp = succ(K, S, xb)
                if xp == 1:
                    # x isn't the start of a contig
                    continue

                pth = [x]
                seen[i] = 1
                xn = succ(K, S, x)
                while len(xn) == 1:
                    if seen[xn[0]] == 1:
                        break
                    x = S.select(xn[0])
                    pth.append(x)
                    seen[xn[0]] = 1
                    xb = rc(K, x)
                    j = S.rank(xb)
                    seen[j] = 1
                    xn = succ(K, S, x)

                if len(pth)+K-1 < L:
                    continue

                s = [render(K, pth[0])]
                for j in xrange(1, len(pth)):
                    s.append("ACGT"[pth[j]&3])

                print '>contig_%d\n%s' % (i, ''.join(s))
Beispiel #8
0
def test_kmersList():
    """Round-trip a sorted list of random k-mers through a casket file.

    65536 random values below 4**25 are sorted, written with writeKmersList
    and read back with readKmers; the result must match element by element.
    """
    K = 25
    N = 65536
    M = (1 << (2 * K)) - 1

    random.seed(17)
    xs = sorted([random.randint(0, M) for i in xrange(N)])

    with autoremove():
        t = tmpfile()

        with casket(t, 'w') as z:
            # Hand the writer its own copy of the list.
            writeKmersList(z, list(xs))

        with casket(t, 'r') as z:
            ys = list(readKmers(z))

        assert len(xs) == len(ys)
        for (i, x) in enumerate(xs):
            y = ys[i]
            assert x == y, '%d\t%s\t%s' % (i, render(K, x), render(K, y))
Beispiel #9
0
def computeBias(K, zs, verbose=False):
    S = summarizer()
    for (x, xc) in zs.iteritems():
        y = rc(K, x)
        if y < x:
            continue
        yc = zs.get(y, 0)

        if xc > yc:
            a = xc
            b = yc
        else:
            a = yc
            b = xc
        apb = a + b
        if apb > 0:
            v = float(a) / float(apb)
        else:
            v = 0.5
        if verbose:
            print '%s\t%s\t%d\t%d\t%g' % (render(K, x), render(K,
                                                               y), xc, yc, v)
        S.add(v)
    return (S.mean(), S.var())
Beispiel #10
0
def remapReads(K, L, rds, v):
    ctx = v.context(2*L)
    idx = {}
    for (x,p) in kmersWithPosList(K, ctx[1], False):
        if x not in idx:
            idx[x] = []
        idx[x].append(p)

    res = {}
    for fq in rds:
        for (x,p) in locate(K, idx, fq[1]):
            if p not in res:
                res[p] = {}
            if x not in res[p]:
                res[p][x] = 0
            res[p][x] += 1

    for (p,ys) in sorted(res.items()):
        for (y,c) in sorted(ys.items()):
            print '%d\t%s\t%d' % (p, render(K, y), c)
Beispiel #11
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    random.seed(17)

    K = int(opts['-k'])
    S = 2*(K-3)

    frameAnchors = {}
    knownStops = {}
    sequences = {}
    seqKmers = {}

    if opts['-r']:
        with openFile(opts['-r']) as f:
            for (nm,seq) in readFasta(f):
                sequences[nm] = seq
                # trim polyA tails
                seq = re.sub('AAAAAA*$', '', seq)
                seqKmers[nm] = set([])
                for (x,p1) in kmersWithPosList(K, seq, False):
                    seqKmers[nm].add(x)
                    p = p1 - 1
                    w = p % 3
                    if x not in frameAnchors:
                        frameAnchors[x] = set([])
                    frameAnchors[x].add((nm,p))
                    y = x & 63
                    if w == 0 and y in stops:
                        if x not in knownStops:
                            knownStops[x] = set([])
                        knownStops[x].add(nm)

    rn = 0
    res = {}
    for fn in opts['<input>']:
        with openFile(fn) as f:
            for rd in readFastq(f):
                L = len(rd[1])
                rn += 1
                fwdAndRev = kmersWithPosLists(K, rd[1])
                frames = {}
                possibleStops = {}
                for i in range(2):
                    #print i, sorted([p for (x,p) in fwdAndRev[i]])
                    for (x,p) in fwdAndRev[i]:
                        if x in frameAnchors:
                            for (nm,q) in frameAnchors[x]:
                                o = (q - p)
                                k = (nm, o, i)
                                frames[k] = 1 + frames.get(k, 0)
                if len(frames) == 0:
                    continue
                n = sum(frames.values())
                probs = []
                for ((nm, off, strnd), cnt) in sorted(frames.items()):
                    probs.append((float(cnt)/float(n), cnt, off, strnd, nm))
                v = random.random()
                for (pv, cnt, off, strnd, nm) in probs:
                    if v < pv:
                        #print rd[1]
                        #print proj(strnd, sequences[nm][off:off+len(rd[1])])
                        #print codons(off % 3, rd[1]), off
                        for (x,p) in fwdAndRev[strnd]:
                            if (p + off + K - 3) % 3 == 0 and (x & 63) in stops:
                                if nm not in res:
                                    res[nm] = {}
                                if x not in res[nm]:
                                    res[nm][x] = 0
                                res[nm][x] += 1
                        break
                    v -= pv
    for (nm,stps) in res.iteritems():
        for (x,c) in stps.iteritems():
            (d,y) = nearest3(K, seqKmers[nm], x)
            if x in knownStops:
                k = 'known'
            else:
                k = 'novel'
            print '%s\t%s\t%d\t%d\t%s\t%s' % (k, render(K, x), c, d, render(K, y), nm)
Beispiel #12
0
def main(argv):
    """Report (K-1)-mer contexts whose last-base counts look unusual.

    Options taken from the docopt result:
      <input>  list of k-mer files.
      -r       optional reference k-mer file.

    With -r: every input is compared, context by context, against the
    reference.  A context is printed when logBinGe() falls below -10 for at
    least one of the possible values of the low two bits.

    Without -r: all inputs are merged in parallel; for each context the
    pooled counts act as the expected distribution and a context is printed
    when the inputs do not all produce the same result string.
    """
    opts = docopt.docopt(__doc__, argv)

    K = getK(opts['<input>'])
    J = K - 1
    # K - J is 1, so M is 3 and `x & M` keeps the low two bits of a k-mer.
    M = (1 << (2 * (K - J))) - 1

    if opts['-r'] is not None:
        # Reference groups are held in memory; each item is indexed below
        # as [0] = context and [2] = list of (k-mer, count).
        with kmers(opts['-r'], 'r') as z:
            xs = list(group(K, J, 0, readKmersAndCounts(z)))

        for fn in opts['<input>']:
            with kmers(fn, 'r') as z:
                samXs = readKmersAndCounts(z)
                i = 0
                for (yCtx, _, yGrp) in group(K, J, 0, samXs):
                    # Advance through the reference to the sample's context;
                    # every sample context must exist in the reference.
                    while i < len(xs) and xs[i][0] < yCtx:
                        i += 1
                    assert i < len(xs)
                    assert xs[i][0] == yCtx
                    # gt/gx: reference total and per-low-bits counts.
                    gt = float(sum([c for (x,c) in xs[i][2]]))
                    gx = [0 for j in xrange(M+1)]
                    for (x,c) in xs[i][2]:
                        gx[x&M] = c
                    # st/sx: the same for the sample.
                    st = sum([c for (x,c) in yGrp])
                    sx = [0 for j in xrange(M+1)]
                    for (x,c) in yGrp:
                        sx[x&M] = c
                    ss = []
                    b = 0
                    for j in xrange(M+1):
                        p = float(gx[j])/gt
                        v = 0.0
                        if 0.0 < p and p < 1.0:
                            # NOTE(review): logBinGe is presumably the log of
                            # a binomial upper-tail probability -- confirm.
                            v = logBinGe(p, st, sx[j])
                            if v < -10:
                                b |= 1 << j
                        ss.append('%3.2g' % (v,))
                    if b > 0:
                        print '%s\t%s\t%s' % (render(J, yCtx), fasta(b), '\t'.join(ss))
                    i += 1
        return

    # Parse files in parallel to get global distribution

    N = len(opts['<input>'])
    h = heap.heap()
    i = 0
    for fn in opts['<input>']:
        (_, xs) = kfset.read(fn)
        i += 1
        # Files are numbered from 1; that number is used below (as n) to
        # index the result list with n - 1.
        h.push(Group(K, J, i, xs))

    while len(h) > 0:
        # Collect, from all files, the groups that share the context at the
        # front of the heap.
        xfs = []
        g = h.pop()
        gy = g.this()[0]
        xfs.append(g.this())
        g.next()
        if g.valid():
            h.push(g)
        for x in h.xs:
            assert x.valid()
        while len(h) > 0 and h.front().this()[0] == gy:
            g = h.pop()
            xfs.append(g.this())
            g.next()
            if g.valid():
                h.push(g)
            for i in xrange(len(h.xs)):
                assert h.xs[i].valid()

        # gc: pooled counts per low-bits value; ds: per-file counts.
        ds = []
        gc = [0 for i in xrange(M+1)]
        for (_, n, xc) in xfs:
            t = sum([c for (x,c) in xc])
            d = [0 for i in xrange(M+1)]
            for (x,c) in xc:
                j = x & M
                gc[j] += c
                d[j] = c
            ds.append((n, d))

        # Files that lack this context keep '*' and are not added to `seen`.
        res = ['*' for i in xrange(N)]
        seen = set([])
        gt = float(sum(gc))
        for (n, d) in ds:
            t = sum(d)
            # Relies on Python 2 integer division: (M+1)/4 is 1 here.
            b = [0 for i in xrange((M+1)/4)]
            for i in xrange(M+1):
                p = float(gc[i])/gt
                if 0.0 < p and p < 1.0:
                    #vL = logBinLe(p, t, d[i])
                    #vG = logBinGe(p, t, d[i])
                    #v = min(vL, vG)
                    v = logBinGe(p, t, d[i])
                    # A bit is set for every value that is NOT flagged.
                    if v > -10:
                        w = i >> 2
                        j = i & 3
                        b[w] |= 1 << j
            res[n-1] = ''.join([fasta(b0) for b0 in b])
            seen.add(res[n-1])
        # Only contexts on which the files disagree are reported.
        if len(seen) > 1:
            print '%s\t%s' % (render(J, gy), '\t'.join(res))
Beispiel #13
0
def main(argv):
    """Look for duplications relative to reference sequences in reads.

    Options taken from the docopt result:
      -k           k-mer length; must be even.
      -m           minimum coverage (used only in the unreachable tail).
      -v           verbose flag handed to reads().
      <sequences>  FASTA file of reference sequences.
      <input>      read input handed to reads().
      -a, -s, -A   read only in the unreachable tail of the per-sequence
                   loop (see the NOTE(review) there).

    NOTE(review): as written, the per-sequence loop prints a coverage map
    and traced paths and then hits an unconditional `continue`; the
    duplication report after it never runs.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    if (K & 1) != 0:
        print >> sys.stderr, "K must be even."
        return

    minCov = int(opts['-m'])

    verbose = opts['-v']

    # A K-mer is split into two J-mers: `x >> S` is the upper half and
    # `x & Mj` the lower half.
    J = K // 2
    S = 2*(K - J)
    Mj = (1 << (2*J)) - 1

    names = []   # sequence index -> name
    seqs = {}    # name -> sequence
    bait = {}    # k-mer -> set of sequence indices containing it
    wtFst = []   # per sequence: upper J-mer -> set of lower J-mers
    wtLst = []   # per sequence: lower J-mer -> set of upper J-mers
    posIdx = []  # per sequence: J-mer -> list of positions
    rds = []     # per sequence: reads that hit it
    with openFile(opts['<sequences>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(names)
            names.append(nm)
            seqs[nm] = seq
            wf = {}
            wl = {}
            for x in kmersList(K, seq, False):
                if x not in bait:
                    bait[x] = set([])
                bait[x].add(n)

                y0 = x >> S
                y1 = x & Mj
                #print '- %s\t%s\t%s' % (render(K, x), render(J, y0), render(J, y1))

                if y0 not in wf:
                    wf[y0] = set([])
                wf[y0].add(y1)

                if y1 not in wl:
                    wl[y1] = set([])
                wl[y1].add(y0)

            wtFst.append(wf)
            wtLst.append(wl)
            
            px = {}
            for (x,p) in kmersWithPosList(J, seq, False):
                if x not in px:
                    px[x] = []
                px[x].append(p)
            posIdx.append(px)

            # Run the duplication search on the reference against itself and
            # warn about anything it finds there.
            for (a, b, c, d) in findDup(wtFst[n], wtLst[n], wtFst[n], wtLst[n]):
                pps = positions(posIdx[n], J, a, b, c, d)
                if pps is None:
                    continue
                for pp in pps:
                    ab = a << S | b
                    cb = c << S | b
                    cd = c << S | d
                    dd = pp[2] - pp[0]
                    print >> sys.stderr, 'warning: phantom dumplication: %s-%s-%s (%d)' % (render(K, ab), render(K, cb), render(K, cd), dd)

            rds.append([])

    N = len(names)

    L = None
    # X[n]: k-mer -> count over all reads that share a k-mer with sequence n.
    X = [{} for n in range(N)]
    for itm in reads(opts['<input>'], K=K, reads=True, kmers=True, both=True, verbose=verbose):
        rd = itm.reads[0]
        L = len(rd)

        xs = itm.kmers[0]
        hits = set([])
        for x in xs:
            if x in bait:
                hits |= bait[x]
        # Every k-mer of the read is credited to each sequence it hit.
        for n in hits:
            for x in xs:
                if x not in X[n]:
                    X[n][x] = 0
                X[n][x] += 1
            rds[n].append(rd)

    hdrShown = False
    vn = 0
    for n in range(N):
        # Keep only k-mers seen at least 10 times (fixed threshold; minCov
        # is not consulted here).
        xs = {}
        for (x,c) in X[n].iteritems():
            if c >= 10:
                xs[x] = c

        seq = seqs[names[n]]

        rngs = []
        st = None
        en = None
        inside = False
        # Coverage map: '.' for a reference k-mer present in xs, 'X' if not.
        xx = []
        for x in kmersList(K, seq, False):
            if x in xs:
                xx.append('.')
            else:
                xx.append('X')
        print ''.join(xx)
        # Collect (st, en) pairs: the last covered k-mer before a run of
        # uncovered k-mers and the first covered one after it.  Either may
        # be None when the run touches an end of the sequence.
        for x in kmersList(K, seq, False):
            if not inside:
                if x in xs:
                    st = x
                else:
                    inside = True
            else:
                if x in xs:
                    en = x
                    rngs.append((st, en))
                    st = x
                    en = None
                    inside = False
        if inside:
            rngs.append((st, en))

        pthr = Pather(K, xs)

        for (x,y) in rngs:
            if x is None or y is None:
                continue
            print render(K, x), render(K, y)
            for p in pthr.trace(x, y, 100):
                print renderPath(K, p)
        # NOTE(review): this unconditional continue makes everything below,
        # up to the end of the loop body, unreachable.  It looks like a
        # debugging leftover -- confirm before relying on the report.
        continue

        fst = {}
        lst = {}
        for (x,c) in xs.iteritems():
            #if c < 5:
            #    continue
            y0 = x >> S
            y1 = x & Mj

            if y0 not in fst:
                fst[y0] = []
            fst[y0].append(y1)

            if y1 not in lst:
                lst[y1] = []
            lst[y1].append(y0)

        #for (a, b, c, d) in findDupDeNovo(fst, lst):
        for (a, b, c, d) in findDup(wtFst[n], wtLst[n], fst, lst):
            #continue
            pps = positions(posIdx[n], J, a, b, c, d)
            if pps is None:
                continue
            for pp in pps:
                # K-mers assembled from the J-mer halves a, b, c, d.
                ab = a << S | b
                cb = c << S | b
                cd = c << S | d
                #print [(render(J, w), p) for (w,p) in zip([a, b, c, d], pps)]

                dd = pp[2] - pp[0]

                # Unless -a is given, keep only lengths divisible by three.
                if not opts['-a'] and dd % 3 != 0:
                    continue

                if opts['-s']:
                    fstPath = interpolate(K, xs, ab, cb, dd+1)
                    sndPath = interpolate(K, xs, cb, cd, dd+1)

                    if fstPath is None:
                        continue
                    if sndPath is None:
                        continue

                    if fstPath[J:-J] != sndPath[J:-J]:
                        continue

                pa = pp[0]
                pb = pp[1]
                pc = pp[2]
                pd = pp[3]

                cab = xs.get(ab, 0)
                ccb = xs.get(cb, 0)
                ccd = xs.get(cd, 0)

                if cab < minCov:
                    continue
                if ccb < minCov:
                    continue
                if ccd < minCov:
                    continue

                m = (cab + ccd) / 2.0
                # Assume the true std dev is 10% of the mean
                w = ccb / m

                hgvs = '%s:c.%d_%ddup' % (names[n], pb, pd - 1)
                v = Duplication(names[n], pb, pd-1, seqs)
                if opts['-A']:
                    showAnchoredReads(K, {ab:'AB', cb:'CB', cd:'CD'}, rds[n])

                vn += 1

                # Header names, format specifiers and values are built in
                # parallel so that the columns stay aligned.
                hdrs = ['n']
                fmts = ['%d']
                outs = [vn]

                hdrs += ['left', 'leftCov']
                fmts += ['%s','%d']
                outs += [render(K, ab), cab]

                hdrs += ['mid', 'midCov']
                fmts += ['%s','%d']
                outs += [render(K, cb), ccb]

                hdrs += ['right', 'rightCov']
                fmts += ['%s','%d']
                outs += [render(K, cd), ccd]

                hdrs += ['len']
                fmts += ['%d']
                outs += [dd]

                hdrs += ['vaf']
                fmts += ['%g']
                outs += [w]

                hdrs += ['hgvs']
                fmts += ['%s']
                outs += [hgvs]

                if not hdrShown:
                    hdrShown = True
                    print '\t'.join(hdrs)
                print '\t'.join(fmts) % tuple(outs)
Beispiel #14
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['-k'])

    D = int(opts['-D'])

    Q = int(opts['-C'])

    V = float(opts['-V'])

    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)

    if opts['-X']:
        Wcap = int(opts['-w'])
        Wval = int(opts['-W'])

        variants = opts['<variant>']
        if opts['-f']:
            with openFile(opts['-f']) as f:
                variants += f.read().split()

        vx = {}
        for v in variants:
            x = makeHGVS(v)
            if x is None:
                print >> sys.stderr, "unable to parse %s" % (v, )
                continue
            x.setSequenceFactory(sf)
            acc = x.accession()
            if acc not in vx:
                vx[acc] = []
            vx[acc].append(x)

        chk = None
        if opts['-T']:
            chk = {}

        rs = []
        for (acc, vs) in vx.iteritems():
            for v in vs:
                r = makeIndexedVariant(v, K, Wcap, Wval)
                if r is not None:
                    rs.append(r)
                if chk is not None:
                    xs = kmersList(
                        K, ''.join([
                            r['lhsFlank'][-(K - 1):], r['wtSeq'],
                            r['rhsFlank'][:K - 1]
                        ]), True)
                    for x in xs:
                        if x not in chk:
                            chk[x] = set([])
                        chk[x].add(('wt', str(v)))
                    if r['mutSeq'] is None:
                        continue
                    xs = kmersList(
                        K, ''.join([
                            r['lhsFlank'][-(K - 1):], r['mutSeq'],
                            r['rhsFlank'][:K - 1]
                        ]), True)
                    for x in xs:
                        if x not in chk:
                            chk[x] = set([])
                        chk[x].add(('mut', str(v)))

        if chk is not None:
            counts = dict([(x, 0) for x in chk.keys()])
            for acc in refSeq2Hg19.keys():
                if verbose:
                    print >> sys.stderr, 'scanning', acc
                seq = sf[acc]
                for x in kmers(K, seq):
                    if x in counts:
                        counts[x] += 1
            res = {}
            seen = set([])
            for x in counts.keys():
                y = rc(K, x)
                z = min(x, y)
                if z in seen:
                    continue
                seen.add(z)
                c = counts[x] + counts[y]
                for (a, v) in chk[x]:
                    if v not in res:
                        res[v] = {}
                    if a not in res[v]:
                        res[v][a] = {}
                    if c not in res[v][a]:
                        res[v][a][c] = 0
                    res[v][a][c] += 1
            yaml.safe_dump(res, sys.stdout, default_flow_style=False)
            return

        with open(opts['<index>'], 'w') as f:
            yaml.safe_dump(rs, f, default_flow_style=False)

        return

    capt = False
    zipname = None
    if opts['-c']:
        capt = True
        zipname = opts['-c']

    fmt = set([])
    if opts['-F']:
        fmt = set(opts['-F'].split(','))

    if verbose:
        print >> sys.stderr, "loading index."

    with open(opts['<index>']) as f:
        hgvsVars = yaml.load(f, Loader=yaml.FullLoader)

    NV = len(hgvsVars)

    combineStrands = True
    if opts['-s']:
        combineStrands = False

    cap = capture(K, reads=capt, kmers=True, verbose=verbose)

    for n in range(NV):
        itm = hgvsVars[n]
        h = itm['hgvs']
        v = makeHGVS(h)
        itm['var'] = v
        lhs = itm['lhsFlank']
        rhs = itm['rhsFlank']
        wt = itm['wtSeq']
        mut = itm['mutSeq']
        bait = [lhs, wt, rhs]
        if mut is not None:
            bait += ['N']
            bait += [lhs, mut, rhs]
        bait = ''.join(bait)
        n0 = cap.addBait(h, bait)
        assert n0 == n

    if verbose:
        print >> sys.stderr, "done."

    rn = 0
    for itm in reads(opts['<input>'],
                     K=K,
                     paired=True,
                     reads=True,
                     kmers=False,
                     both=True,
                     verbose=verbose):
        rn += 1
        cap.addReadPairAndKmers(itm.reads[0], itm.reads[1])

    if capt:
        cap.saveReads(zipname)

    scorer = Scorer(K)

    globHist = {}

    for n in range(NV):
        mx = cap.capKmers[n]
        for c in mx.itervalues():
            if c < Q:
                continue
            if c not in globHist:
                globHist[c] = 0
            globHist[c] += 1

    with outputFile(opts['-o']) as out:
        hdrShown = False
        for n in range(NV):
            itm = hgvsVars[n]
            v = itm['var']
            h = itm['hgvs']

            mx = cap.capKmers[n]

            nr = cap.capReadCounts[n]

            if 'kmers' in fmt:
                for (x, c) in mx.iteritems():
                    print '%d\t%s\t%d' % (n, render(K, x), c)

            lhsFlank = itm['lhsFlank']
            rhsFlank = itm['rhsFlank']

            alleles = {}
            alleles['wt'] = []
            alleles['mut'] = []

            wtSeq = itm['wtSeq']
            wtZ = len(wtSeq)

            mutSeq = itm['mutSeq']
            mutZ = v.size()

            cs = [c for (x, c) in mx.iteritems() if c >= Q]
            cs.sort()
            nk = len(cs)
            if nk == 0:
                cs = [0]

            q10 = cs[1 * len(cs) // 10]
            q50 = cs[5 * len(cs) // 10]
            q90 = cs[9 * len(cs) // 10]

            af = AlleleFinder(K, D, v, mx, lhsFlank, rhsFlank, wtSeq, mutSeq,
                              wtZ, mutZ)
            finders = []
            if not v.anonymous():
                finders.append(af.definiteAlleles())
            else:
                finders.append(af.bridgingAlleles())

            j = 0
            for (t, a) in cat(finders):
                assert t == 'wt' or t == 'mut'
                alleles[t].append(a)
                j += 1

            wtRes = {}
            wtRes['covMin'] = 0
            wtRes['binom'] = 1.0
            wtRes['ksDist'] = 0.0
            wtRes['hamming'] = 0
            wtRes['path'] = []
            for pthRes in alleles['wt']:
                scorer.score(pthRes, lhsFlank, wtSeq, rhsFlank)
                if isBetter(pthRes, wtRes):
                    wtRes = pthRes

            mutRes = {}
            mutRes['covMin'] = 0
            mutRes['binom'] = 1.0
            mutRes['ksDist'] = 0.0
            mutRes['hamming'] = 0
            mutRes['path'] = []
            for pthRes in alleles['mut']:
                scorer.score(pthRes, lhsFlank, mutSeq, rhsFlank)
                if isBetter(pthRes, mutRes):
                    mutRes = pthRes

            if True:
                wtXs = [mx.get(x, 0) for x in wtRes['path']]
                if len(wtXs) == 0:
                    wtXs = [0]
                wtXs.sort()
                wtCount = sum(wtXs)
                wtLen = len(wtXs)
                wtMean = float(wtCount) / float(wtLen)
                wtMedian = wtXs[wtLen // 2]

                mutXs = [mx.get(x, 0) for x in mutRes['path']]
                if len(mutXs) == 0:
                    mutXs = [0]
                mutXs.sort()
                mutCount = sum(mutXs)
                mutLen = len(mutXs)
                mutMean = float(mutCount) / float(mutLen)
                mutMedian = mutXs[mutLen // 2]

                totX = max([1.0, float(wtMedian + mutMedian), float(q90)])
                wtVaf = wtMedian / totX
                mutVaf = mutMedian / totX

            hdrs = ['n']
            fmts = ['%d']
            outs = [n]

            wtAllele = ((wtRes['covMin'] > Q) and
                        (wtRes['hamming'] < 4)) and (wtVaf > V)
            mutAllele = ((mutRes['covMin'] > Q) and
                         (mutRes['hamming'] < 4)) and (mutVaf > V)
            resV = 1 * wtAllele + 2 * mutAllele
            res = ['null', 'wt', 'mut', 'wt/mut'][resV]

            hdrs += ['res']
            fmts += ['%s']
            outs += [res]

            if 'rds' in fmt:
                hdrs += ['numReads']
                fmts += ['%d']
                outs += [nr]

            hdrs += ['numKmers', 'covQ10', 'covQ50', 'covQ90']
            fmts += ['%d', '%d', '%d', '%d']
            outs += [nk, q10, q50, q90]

            hdrs += ['wtMin', 'mutMin']
            fmts += ['%d', '%d']
            outs += [wtRes['covMin'], mutRes['covMin']]

            hdrs += ['wtHam', 'mutHam']
            fmts += ['%d', '%d']
            outs += [wtRes['hamming'], mutRes['hamming']]

            if 'ks' in fmt:
                hdrs += ['wtD', 'mutD']
                fmts += ['%g', '%g']
                outs += [wtRes['ksDist'], mutRes['ksDist']]

            if 'binom' in fmt:
                hdrs += ['wtQ', 'mutQ']
                fmts += ['%g', '%g']
                outs += [wtRes['binom'], mutRes['binom']]

            if 'vaf' in fmt:
                hdrs += ['wtVaf', 'mutVaf']
                fmts += ['%g', '%g']
                outs += [wtVaf, mutVaf]

            hdrs += ['hgvs']
            fmts += ['%s']
            outs += [h]

            if not hdrShown:
                hdrShown = True
                print >> out, '\t'.join(hdrs)
            print >> out, '\t'.join(fmts) % tuple(outs)
            out.flush()
Beispiel #15
0
def main(argv):
    global verbose

    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    genomeDir = '.'
    if opts['-g']:
        genomeDir = opts['-g']
    sf = SequenceFactory(genomeDir)

    if opts['-P']:
        if opts['-t']:
            prepareBedFileGeneTx(opts['<gene-list>'], opts['<refgene>'],
                                 opts['<bedfile>'])
        else:
            prepareBedFileGene(opts['<gene-list>'], opts['<refgene>'],
                               opts['<bedfile>'])
        return

    if opts['-X']:
        with openFile(opts['<index>'], 'w') as out:
            yaml.safe_dump_all(indexBedFiles(opts['<must-have>'], sf),
                               out,
                               default_flow_style=False)
        return

    K = int(opts['-k'])
    minGeneReads = int(opts['-M'])
    minExonReads = int(opts['-m'])
    minGeneRate = float(opts['-R'])
    minExonRate = float(opts['-r'])
    (minGeneCount, maxGeneCount) = map(int, opts['-Z'].split(':'))
    (minExonCount, maxExonCount) = map(int, opts['-z'].split(':'))

    with openFile(opts['<index>']) as f:
        ref = list(yaml.load_all(f, Loader=yaml.BaseLoader))

    if True:
        # Test the double-layer index
        idx = ExonIndex(K, ref)

        acc = {}
        toc = {}
        rn = 0
        for itm in reads(opts['<input>'],
                         K=K,
                         paired=True,
                         reads=True,
                         kmers=False,
                         both=True,
                         verbose=verbose):
            rn += 1
            (lhsFwd, lhsRev) = kmersLists(K, itm.reads[0][1])
            (rhsFwd, rhsRev) = kmersLists(K, itm.reads[1][1])
            xs0 = lhsFwd + rhsRev
            rh0 = idx.readHash(xs0)
            if rh0 is not None:
                (h0, ys0) = rh0
                if h0 not in acc:
                    acc[h0] = []
                    toc[h0] = ys0
                acc[h0].append((compressRead(itm.reads[0][1]),
                                compressRead(itm.reads[1][1])))

            xs1 = lhsRev + rhsFwd
            rh1 = idx.readHash(xs1)
            if rh1 is not None:
                (h1, ys1) = rh1
                if h1 not in acc:
                    acc[h1] = []
                    toc[h1] = ys1
                acc[h1].append((compressRead(itm.reads[0][1]),
                                compressRead(itm.reads[1][1])))

        nx = 0
        for h in sorted(acc.keys()):
            for (x, c) in sorted(acc[h].items()):
                nx += 1
                if c <= 1:
                    continue
                print '%016x\t%s\t%d' % (h, render(K, x), c)

        print >> sys.stderr, 'nx =', nx
        return

    if False:
        # Position index
        idx = {}
        for i in range(len(ref)):
            itm = ref[i]
            for (x, p) in kmersWithPosList(K, itm['seq'], False):
                p -= 1
                if x not in idx:
                    idx[x] = []
                idx[x].append((i, p))

    if True:
        # Exon tuple index
        idx = {}
        lens = [0 for i in range(len(ref))]
        for i in range(len(ref)):
            itm = ref[i]
            for (x, p) in kmersWithPosList(K, itm['seq'], False):
                if x not in idx:
                    idx[x] = set([])
                idx[x].add(i)
                lens[i] += 1
        for x in idx.iterkeys():
            idx[x] = tuple(sorted(idx[x]))

    if opts['-T']:
        ak = {}
        for x in sorted(idx.iterkeys()):
            if len(idx[x]) == 1:
                continue
            xStr = render(K, x)
            ak[xStr] = []
            for i in idx[x]:
                itm = ref[i]
                k = '%s/%s' % (itm['gene'], itm['exon'])
                ak[xStr].append(k)
            ak[xStr].sort()
        rep = {}
        rep['aliasing-within'] = ak
        chrs = set([])
        for i in range(len(ref)):
            itm = ref[i]
            chrs.add(itm['chr'])
        counts = [0 for i in range(len(ref))]
        for ch in sorted(chrs):
            if verbose:
                print >> sys.stderr, 'processing %s' % (ch, )
            seq = sf[ch]
            for (x, p) in kmersWithPos(K, seq, True):
                if x not in idx:
                    continue
                for i in idx[x]:
                    counts[i] += 1
        gk = {}
        for i in range(len(ref)):
            if lens[i] == counts[i]:
                continue
            itm = ref[i]
            k = '%s/%s' % (itm['gene'], itm['exon'])
            gk[k] = {'indexed': lens[i], 'genomic': counts[i]}
        rep['aliasing-genomic'] = gk
        yaml.safe_dump(rep, sys.stdout, default_flow_style=False)
        return

    acc = {}
    rn = 0
    hitStats = Summary()
    hitHist = [0 for i in range(1000)]
    for itm in reads(opts['<input>'],
                     K=K,
                     paired=True,
                     reads=True,
                     kmers=False,
                     both=True,
                     verbose=verbose):
        rn += 1
        (lhsFwd, lhsRev) = kmersWithPosLists(K, itm.reads[0][1])
        (rhsFwd, rhsRev) = kmersWithPosLists(K, itm.reads[1][1])
        (hits0, hitCount0) = recHits(idx, lhsFwd + rhsRev)
        (hits1, hitCount1) = recHits(idx, lhsRev + rhsFwd)
        if len(hits0) > 0:
            k = tuple(sorted(hits0.keys()))
            v = sum(hits0.values())
            if k not in acc:
                acc[k] = [0, 0]
            acc[k][0] += 1
            acc[k][1] += v
            hitStats.add(hitCount0)
            hitHist[hitCount0] += 1

        if len(hits1) > 0:
            k = tuple(sorted(hits1.keys()))
            v = sum(hits1.values())
            if k not in acc:
                acc[k] = [0, 0]
            acc[k][0] += 1
            acc[k][1] += v
            hitStats.add(hitCount1)
            hitHist[hitCount1] += 1

    if verbose:
        print >> sys.stderr, 'total read hits: %d' % (len(hitStats), )
        print >> sys.stderr, 'total hits per read: %g (%g)' % (hitStats.mean(),
                                                               hitStats.sd())
        print >> sys.stderr, 'total reads: %d' % (rn, )
        for i in range(len(hitHist)):
            if hitHist[i] > 0:
                print >> sys.stderr, '\t%d\t%d' % (i, hitHist[i])

    def gex(s):
        r = []
        for n in s:
            itm = ref[n]
            r.append('%s/%s' % (itm['gene'], itm['exon']))
        return '|'.join(r)

    def fmtKey(k):
        nex = len(k)
        gx = set([])
        kStrParts = []
        for s in k:
            kStrParts.append(gex(s))
            gx |= set([ref[i]['gene'] for i in s])
        kStr = '--'.join(sorted(kStrParts))
        return (nex, gx, kStr)

    gxCounts = {}
    for k in acc.keys():
        gx = set([])
        ex = set([])
        for s in k:
            gx |= set([ref[i]['gene'] for i in s])
            ex |= set(s)
        gx = tuple(sorted(gx))
        if gx not in gxCounts:
            gxCounts[gx] = [0, 0]
        gxCounts[gx][0] += acc[k][0]
        gxCounts[gx][1] += acc[k][1]

    hdr = ['numReads', 'numKmers', 'kmersPerRead']
    hdr += ['ggNumReads', 'ggNumKmers', 'ggKmersPerRead']
    hdr += ['numExons', 'numGenes', 'geneGroup', 'exonGroup']
    print '\t'.join(hdr)
    for k in acc.keys():
        (nex, gx, kStr) = fmtKey(k)
        gx = tuple(sorted(gx))
        if len(gx) < minGeneCount or len(gx) > maxGeneCount:
            continue
        if len(ex) < minExonCount or len(ex) > maxExonCount:
            continue
        if gxCounts[gx][0] < minGeneReads:
            continue
        if acc[k][0] < minExonReads:
            continue
        gxRate = float(gxCounts[gx][1]) / float(gxCounts[gx][0])
        if gxRate < minGeneRate:
            continue
        exRate = float(acc[k][1]) / float(acc[k][0])
        if exRate < minExonRate:
            continue
        gxStr = ':'.join(gx)

        print '%d\t%d\t%g\t%d\t%d\t%g\t%d\t%d\t%s\t%s' % (
            acc[k][0], acc[k][1], exRate, gxCounts[gx][0], gxCounts[gx][1],
            gxRate, nex, len(gx), gxStr, kStr)