Esempio n. 1
0
def buildIndex(K, inputs, output):
    """
    Build a k-mer index from FASTA files and write it to `output`.

    The FASTA files named in the list `inputs` are read, and for each
    sequence the `K` length k-mers and their reverse complements are
    extracted. The index maps each distinct k-mer to the numbers of the
    sequences containing it (numbering from 0, in the order read). The
    `names` member of the KmerIndex object can be used to retrieve the
    name from the sequence number.

    The container written to `output` holds the distinct k-mers, an
    'offsets' table (for each k-mer rank, where its postings start, plus
    one final end offset), a 'postings' table of sequence numbers, the
    per-sequence k-mer counts ('lens') and the sequence names.
    """
    records = []
    for fn in inputs:
        with openFile(fn) as f:
            records.extend(readFasta(f))

    nms = []
    lens = array.array('I')
    allKmers = []
    for (n, (nm, seq)) in enumerate(records):
        nms.append(nm)
        xs = sorted(kmers(K, seq, True))
        # uniq() is called for its effect on the list; presumably it
        # removes duplicates in place -- TODO confirm.
        uniq(xs)
        # Replace the raw sequence with its k-mer list; only the k-mers
        # are needed from here on.
        records[n] = [nm, xs]
        lens.append(len(xs))
        allKmers.extend(xs)
    allKmers.sort()
    uniq(allKmers)
    S = sparse(2 * K, allKmers)

    # First pass: count how many sequences contain each distinct k-mer.
    T = array.array('I', [0]) * (S.count() + 1)
    for (_, xs) in records:
        for x in xs:
            T[S.rank(x)] += 1

    # Turn the counts into starting offsets (exclusive prefix sums).
    # Afterwards `total` is the overall number of postings.
    total = 0
    for r in xrange(len(T)):
        c = T[r]
        T[r] = total
        total += c

    # Second pass: fill in the postings, using a scratch copy of the
    # offsets as per-k-mer write cursors.
    cursor = list(T)
    U = array.array('H', [0]) * total
    for (n, (_, xs)) in enumerate(records):
        for x in xs:
            r = S.rank(x)
            U[cursor[r]] = n
            cursor[r] += 1

    with container(output, 'w') as z:
        writeKmers(K, S.xs, z)
        z.meta['T'] = write32(z, T, 'offsets')
        z.meta['U'] = write16(z, U, 'postings')
        z.meta['lens'] = write32(z, lens, 'lens')
        z.meta['names'] = nms
Esempio n. 2
0
def main(argv):
    """
    Sort reads into per-bait caches according to shared k-mers.

    Every sequence in the FASTA file `<sequences>` is a bait; its `-k`
    length k-mers (both strands) are indexed. Each item produced by
    `reads()` over `<input>` is then added to the ReadCache of every
    bait with which it shares at least one k-mer. Once all reads have
    been seen, `end()` is called on every cache.

    Options read: -k (k-mer length), -b and -z (passed through to
    ReadCache), -p (paired reads), -v (verbose), -P (first ReadCache
    argument).
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    B = int(opts['-b'])
    paired = opts['-p']
    verbose = opts['-v']
    Z = opts['-z']

    # Map each bait k-mer to the set of bait numbers containing it.
    names = []
    baits = {}
    with openFile(opts['<sequences>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(names)
            names.append(nm)
            for x in kmersList(K, seq, True):
                baits.setdefault(x, set()).add(n)

    caches = [ReadCache(opts['-P'], nm, paired, B, Z) for nm in names]

    # NOTE(review): K is not passed to reads(), so the read k-mers use
    # whatever length reads() defaults to -- confirm it matches -k.
    for itm in reads(opts['<input>'],
                     reads=True,
                     kmers=True,
                     fwdOnly=True,
                     paired=paired,
                     verbose=verbose):
        hits = set()
        for fwd in itm.kmers:
            for x in fwd:
                if x in baits:
                    hits |= baits[x]
        for n in hits:
            caches[n].add(itm.reads)

    for cache in caches:
        cache.end()
Esempio n. 3
0
 def __getitem__(self, acc):
     """
     Return the first sequence of the FASTA file for accession `acc`.

     The accession is normalized with normalizeAccession() and looked up
     as `<home>/<acc>.fa`, falling back to `<home>/<acc>.fa.gz` when the
     plain file does not exist. The most recently loaded sequence is
     cached, keyed by the normalized accession.

     Raises KeyError if the file contains no sequence.
     """
     if acc != self.prevAcc:
         # The cache key is the *normalized* accession, so normalize
         # before concluding that this is a cache miss.
         acc = normalizeAccession(acc)
     if acc != self.prevAcc:
         pth = self.home + '/' + acc + '.fa'
         if not os.path.exists(pth):
             pth = pth + '.gz'
         with openFile(pth) as f:
             for (nm, seq) in readFasta(f):
                 # Only the first record is used.
                 self.prevAcc = acc
                 self.prevSeq = seq
                 break
             else:
                 # Without this, an empty file would silently return the
                 # sequence cached for the previous accession.
                 raise KeyError('no sequence found in %s' % (pth, ))
     return self.prevSeq
Esempio n. 4
0
    def __getitem__(self, acc):
        """
        Return the first sequence of the FASTA file for accession `acc`.

        `acc` is translated through `hgvs.refSeq2Hg19` and the sequence
        is read from `<home>/<translated>.fa.gz`. The most recently
        loaded sequence is cached, keyed by `acc`.

        An unknown accession is reported on stderr and fails the
        assertion. Raises KeyError if the file contains no sequence.
        """
        if acc != self.prevAcc:
            if acc not in hgvs.refSeq2Hg19:
                print >> sys.stderr, "accession %s not available." % (acc)
            # With assertions disabled (-O) the lookup below raises
            # KeyError for an unknown accession instead.
            assert acc in hgvs.refSeq2Hg19
            h = hgvs.refSeq2Hg19[acc]

            pth = self.home + "/" + h + ".fa.gz"
            with openFile(pth) as f:
                for (nm, seq) in readFasta(f):
                    # Only the first record is used.
                    self.prevAcc = acc
                    self.prevSeq = seq
                    break
                else:
                    # Without this, an empty file would silently return
                    # the sequence cached for the previous accession.
                    raise KeyError('no sequence found in %s' % (pth, ))
        return self.prevSeq
Esempio n. 5
0
    def __getitem__(self, acc):
        """
        Return the first sequence of the FASTA file for accession `acc`.

        `acc` is translated through `refSeq2Hg19` when it has an entry
        there and used unchanged otherwise; the sequence is read from
        `<home>/<name>.fa.gz`. The most recently loaded sequence is
        cached, keyed by `acc`.

        Raises KeyError if the file contains no sequence.
        """
        if acc != self.prevAcc:
            h = refSeq2Hg19.get(acc, acc)

            pth = self.home + "/" + h + ".fa.gz"
            with openFile(pth) as f:
                for (nm, seq) in readFasta(f):
                    # Only the first record is used.
                    self.prevAcc = acc
                    self.prevSeq = seq
                    break
                else:
                    # Without this, an empty file would silently return
                    # the sequence cached for the previous accession.
                    raise KeyError('no sequence found in %s' % (pth, ))
        return self.prevSeq
Esempio n. 6
0
    def __getitem__(self, acc):
        """
        Return the first sequence of the FASTA file for accession `acc`.

        `acc` is translated through `refSeq2Hg19` when it has an entry
        there and used unchanged otherwise. The sequence is read from
        `<home>/<name>.fa`, falling back to `<home>/<name>.fa.gz` when
        the plain file does not exist. The most recently loaded sequence
        is cached, keyed by `acc`.

        Raises KeyError if the file contains no sequence.
        """
        if acc != self.prevAcc:
            h = refSeq2Hg19.get(acc, acc)

            pth = self.home + '/' + h + '.fa'
            if not os.path.exists(pth):
                pth += '.gz'

            with openFile(pth) as f:
                for (nm, seq) in readFasta(f):
                    # Only the first record is used.
                    self.prevAcc = acc
                    self.prevSeq = seq
                    break
                else:
                    # Without this, an empty file would silently return
                    # the sequence cached for the previous accession.
                    raise KeyError('no sequence found in %s' % (pth, ))
        return self.prevSeq
Esempio n. 7
0
def main(argv):
    """
    Summarize k-mer frequencies per contig and per file, as YAML.

    For each FASTA file in `<input>`, the `-k` length k-mers of every
    sequence that pass the `sub(S, P, x)` filter are counted. Each
    sequence's counts are passed to `summarize(counts, C, Q)` and
    recorded under 'contigs' together with the sequence name; the counts
    accumulated over the whole file are summarized under 'global'. The
    list of per-file results is dumped to stdout with yaml.safe_dump.

    Options read: -k, -c, -q, -S, -p, and -s (when set, k-mers are taken
    with the last argument of kmersList set to False).
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    C = int(opts['-c'])
    Q = int(opts['-q'])
    S = int(opts['-S'])
    P = float(opts['-p'])

    both = not opts['-s']

    res = []
    for fn in opts['<input>']:
        fres = {}
        fres['file'] = fn
        fres['contigs'] = []
        # Counts accumulated over every sequence in this file.
        fileCounts = {}
        with openFile(fn) as f:
            for (nm, seq) in readFasta(f):
                scaff = {}
                for x in kmersList(K, seq, both):
                    if sub(S, P, x):
                        scaff[x] = 1 + scaff.get(x, 0)
                summary = summarize(scaff, C, Q)
                summary['name'] = nm
                fres['contigs'].append(summary)
                for (x, c) in scaff.items():
                    fileCounts[x] = c + fileCounts.get(x, 0)
        fres['global'] = summarize(fileCounts, C, Q)
        res.append(fres)

    yaml.safe_dump(res, sys.stdout)
Esempio n. 8
0
def main(argv):
    """
    Print paired k-mer counts for the two orientations of each k-mer seen
    in the reads of `<fastq>`, one tab-separated `count<TAB>count` line
    per pair.

    With `-r`, the k-mers of the reference FASTA orient each group of
    read k-mers yielded by parseFiles(); counts are kept separately for
    groups assigned forward and groups assigned reverse.

    Without `-r`, k-mers are subsampled by hashing the smaller of the
    k-mer and its reverse complement and keeping those whose masked hash
    does not exceed `int(M * p)`, where `p` is the `-p` option.

    Reads are treated as paired unless `-s` is given.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    # Mask of the low 2*K bits.
    M = (1 << (2*K)) - 1

    paired = True
    if opts['-s']:
        paired = False

    p = float(opts['-p'])
    # Sampling threshold used in the non-reference branch below.
    T = int(M * p)

    if opts['-r']:
        # Reference mode: collect the reference k-mers (last argument of
        # kmersList is False here).
        refs = []
        with openFile(opts['-r']) as f:
            for (nm, seq) in readFasta(f):
                refs += kmersList(K, seq, False)
        refs = set(refs)

        # Discard k-mers whose reverse complement is also in the
        # reference: they cannot tell the two orientations apart.
        kill = set([])
        for x in refs:
            y = rc(K, x)
            if y in refs:
                kill.add(x)
                kill.add(y)
        print >> sys.stderr, 'removing %d/%d' % (len(kill), len(refs))

        refs -= set(kill)

        fwd = {}
        rev = {}
        for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
            # fn: how many of this group's k-mers hit the reference as-is.
            fn = 0
            for x in xs:
                if x in refs:
                    fn += 1

            # rn: how many hit the reference after reverse complementing.
            ys = [rc(K, x) for x in xs]
            rn = 0
            for y in ys:
                if y in refs:
                    rn += 1
            
            if fn + rn == 0:
                continue

            # Assign the whole group to one orientation at random, with
            # probability proportional to the hits in that orientation.
            q = float(fn) / float(fn + rn)
            if random.random() < q:
                for x in xs:
                    fwd[x] = 1 + fwd.get(x, 0)
            else:
                for y in ys:
                    rev[y] = 1 + rev.get(y, 0)

        # Pair each forward-counted k-mer with the count stored in `rev`
        # under its reverse complement; matched `rev` entries are removed
        # so that the second loop prints only the unmatched ones.
        for (x,xc) in fwd.iteritems():
            y = rc(K, x)
            yc = 0
            if y in rev:
                yc = rev[y]
                del rev[y]
            print '%d\t%d' % (xc, yc)

        for (y,yc) in rev.iteritems():
            print '%d\t%d' % (0, yc)

        return

    # No reference: count a hash-selected subsample of the k-mers.
    kx = {}
    for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
        for x in xs:
            if x in kx:
                kx[x] += 1
                continue
            y = rc(K, x)
            # Hash the smaller of the two orientations so that a k-mer
            # and its reverse complement are always selected together.
            z = murmer(min(x, y), 17)
            if (z & M) > T:
                continue
            kx[x] = 1

    for x in kx.keys():
        y = rc(K, x)
        # Visit each pair once, from its smaller member.
        # NOTE(review): a k-mer with x > rc(x) whose reverse complement
        # was never counted is skipped here and never printed -- confirm
        # that dropping such k-mers is intended.
        if x > y:
            continue
        xc = kx[x]
        yc = kx.get(y, 0)
        # Choose which count is printed first by comparing the hashes of
        # the two orientations.
        if murmer(x, 17) >= murmer(y, 17):
            (a, b) = (x, y)
            (ac, bc) = (xc, yc)
        else:
            (a, b) = (y, x)
            (ac, bc) = (yc, xc)
        #print '%s\t%d\t%s\t%d' % (render(K, a), ac, render(K, b), bc)
        print '%d\t%d' % (ac, bc)
Esempio n. 9
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    fns = opts['<input>']

    p = None
    if opts['-p'] is not None:
        p = float(opts['-p'])

    if len(fns) == 1 and isFasta(fns[0]):
        K = 25
        seqs = []
        with openFile(fns[0]) as f:
            for (nm, seq) in readFasta(f):
                xs = set(basics.kmers(K, seq, True))
                xs = list(xs)
                xs.sort()
                xs = array.array('L', xs)
                seqs.append((nm.split()[0], xs))
        Z = 1
        if opts['-a']:
            Z = len(seqs)

        print len(seqs)

        for i in xrange(Z):
            xnm = seqs[i][0]
            xs = seqs[i][1]
            for j in xrange(i + 1, len(seqs)):
                ynm = seqs[j][0]
                ys = seqs[j][1]
                (isec, union, d) = jaccard(xs, ys)
                if p is None:
                    print '%s\t%s\t%d\t%d\t%d\t%d\t%f' % (
                        xnm, ynm, len(xs), len(ys), isec, union, d)
                else:
                    pv = logIx(p, isec + 1, (union - isec) + 1) / math.log(10)
                    q05 = quantBeta(0.05, isec + 1, (union - isec) + 1)
                    q95 = quantBeta(0.95, isec + 1, (union - isec) + 1)
                    print '%s\t%s\t%d\t%d\t%d\t%d\t%f\t-%f\t+%f\t%f' % (
                        xnm, ynm, len(xs), len(ys), isec, union, d, d - q05,
                        q95 - d, pv)
                sys.stdout.flush()

        return

    Z = 1
    if opts['-a']:
        Z = len(fns)

    for i in xrange(Z):
        with kmers(fns[i], 'r') as z0:
            xK = z0.meta['K']
            xs = array.array('L', readKmers(z0))
            for j in xrange(i + 1, len(fns)):
                with kmers(fns[j], 'r') as z1:
                    yK = z1.meta['K']
                    ys = array.array('L', readKmers(z1))
                    if xK != yK:
                        print >> sys.stderr, 'mismatched K:', fns[j]
                        sys.exit(1)
                    (isec, union, d) = jaccard(xs, ys)
                    if p is None:
                        print '%s\t%s\t%d\t%d\t%d\t%d\t%f' % (
                            fns[i], fns[j], len(xs), len(ys), isec, union, d)
                    else:
                        pv = logIx(p, isec + 1,
                                   (union - isec) + 1) / math.log(10)
                        q05 = quantBeta(0.05, isec + 1, (union - isec) + 1)
                        q95 = quantBeta(0.95, isec + 1, (union - isec) + 1)
                        print '%s\t%s\t%d\t%d\t%d\t%d\t%f\t-%f\t+%f\t%f' % (
                            fns[i], fns[j], len(xs), len(ys), isec, union, d,
                            d - q05, q95 - d, pv)
                    sys.stdout.flush()
Esempio n. 10
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    random.seed(17)

    K = int(opts['-k'])
    S = 2*(K-3)

    frameAnchors = {}
    knownStops = {}
    sequences = {}
    seqKmers = {}

    if opts['-r']:
        with openFile(opts['-r']) as f:
            for (nm,seq) in readFasta(f):
                sequences[nm] = seq
                # trim polyA tails
                seq = re.sub('AAAAAA*$', '', seq)
                seqKmers[nm] = set([])
                for (x,p1) in kmersWithPosList(K, seq, False):
                    seqKmers[nm].add(x)
                    p = p1 - 1
                    w = p % 3
                    if x not in frameAnchors:
                        frameAnchors[x] = set([])
                    frameAnchors[x].add((nm,p))
                    y = x & 63
                    if w == 0 and y in stops:
                        if x not in knownStops:
                            knownStops[x] = set([])
                        knownStops[x].add(nm)

    rn = 0
    res = {}
    for fn in opts['<input>']:
        with openFile(fn) as f:
            for rd in readFastq(f):
                L = len(rd[1])
                rn += 1
                fwdAndRev = kmersWithPosLists(K, rd[1])
                frames = {}
                possibleStops = {}
                for i in range(2):
                    #print i, sorted([p for (x,p) in fwdAndRev[i]])
                    for (x,p) in fwdAndRev[i]:
                        if x in frameAnchors:
                            for (nm,q) in frameAnchors[x]:
                                o = (q - p)
                                k = (nm, o, i)
                                frames[k] = 1 + frames.get(k, 0)
                if len(frames) == 0:
                    continue
                n = sum(frames.values())
                probs = []
                for ((nm, off, strnd), cnt) in sorted(frames.items()):
                    probs.append((float(cnt)/float(n), cnt, off, strnd, nm))
                v = random.random()
                for (pv, cnt, off, strnd, nm) in probs:
                    if v < pv:
                        #print rd[1]
                        #print proj(strnd, sequences[nm][off:off+len(rd[1])])
                        #print codons(off % 3, rd[1]), off
                        for (x,p) in fwdAndRev[strnd]:
                            if (p + off + K - 3) % 3 == 0 and (x & 63) in stops:
                                if nm not in res:
                                    res[nm] = {}
                                if x not in res[nm]:
                                    res[nm][x] = 0
                                res[nm][x] += 1
                        break
                    v -= pv
    for (nm,stps) in res.iteritems():
        for (x,c) in stps.iteritems():
            (d,y) = nearest3(K, seqKmers[nm], x)
            if x in knownStops:
                k = 'known'
            else:
                k = 'novel'
            print '%s\t%s\t%d\t%d\t%s\t%s' % (k, render(K, x), c, d, render(K, y), nm)
Esempio n. 11
0
def main(argv):
    """
    Build a gene k-mer index, or scan k-mer sets against an existing one.

    With `-X`, every sequence of at least K bases (K is fixed at 27 in
    this mode) in the FASTA files `<input>` is indexed, and the index is
    written to the casket file `<genes>`.

    Without `-X`, the index is loaded from `<genes>` and each k-mer
    container named in `<input>` is processed in turn. For every indexed
    sequence, fragments are assembled from the sample k-mers that
    resolveAll() pairs with the sequence's k-mers, and a row is printed
    to stdout for the best-scoring result when it passes the final
    threshold.
    """
    opts = docopt.docopt(__doc__, argv)

    if opts['-X']:
        # ---- Index construction ----
        K = 27
        S = []
        N = 0
        # Tally of the low two bits of every k-mer (x & 3).
        qacgt = [0, 0, 0, 0]
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        continue
                    for (x, p) in kmersWithPos(K, seq, True):
                        S.append(x)
                        qacgt[x & 3] += 1
                        N += 1
        S.sort()
        # NOTE(review): raises ZeroDivisionError when no k-mers were
        # collected (N == 0).
        qacgt = [float(c) / float(N) for c in qacgt]
        S = sparse(2 * K, array.array('L', uniq(S)))
        lens = []
        nms = []
        seqs = []
        n = 0
        # tmp[r]: the (sequence number, position) pairs at which the
        # k-mer of rank r occurs.
        tmp = [[] for i in xrange(S.count())]
        for fn in opts['<input>']:
            with open(fn) as f:
                for (nm, seq) in readFasta(f):
                    if len(seq) < K:
                        print >> sys.stderr, "warning: `%s' skipped" % (nm, )
                        continue
                    nms.append(nm)
                    seqs.append(seq)
                    lens.append(len(seq))
                    for (x, p) in kmersWithPos(K, seq, True):
                        r = S.rank(x)
                        tmp[r].append((n, p))
                    n += 1
        # Flatten tmp: T[r]..T[r+1] delimits the entries of rank r in U
        # (sequence numbers) and V (positions).
        T = array.array('I', [])
        U = array.array('I', [])
        V = array.array('i', [])
        t = 0
        for nps in tmp:
            T.append(t)
            t += len(nps)
            for (n, p) in nps:
                U.append(n)
                V.append(p)
        T.append(t)
        del tmp

        gfn = opts['<genes>']
        with casket(gfn, 'w') as z:
            meta = {}
            meta['K'] = K
            meta['lens'] = lens
            meta['qacgt'] = qacgt
            meta['nms'] = nms
            meta['seqs'] = seqs

            z.add_content('__meta__', json.dumps(meta))
            write64(z, S.xs, 'S')
            write32(z, T, 'T')
            write32(z, U, 'U')
            write32s(z, V, 'V')

        return

    # ---- Scan mode: load the index written above ----
    print >> sys.stderr, "loading..."

    gfn = opts['<genes>']
    with casket(gfn, 'r') as z:
        mf = z.open('__meta__')
        meta = json.load(mf)
        K = meta['K']
        lens = meta['lens']
        qacgt = meta['qacgt']
        nms = meta['nms']
        seqs = meta['seqs']

        S = read64(z, 'S')
        S = sparse(2 * K, S)
        T = read32(z, 'T')
        U = read32(z, 'U')
        V = read32s(z, 'V')

    print >> sys.stderr, "done."

    for fn in opts['<input>']:
        # L and Y have one slot per index k-mer and are passed to
        # resolveAll() together with the sample k-mers X. Per the
        # comments on P and Q below, they end up holding the length of
        # the lcp and the sample k-mer pulled by lcp, respectively.
        L = array.array('B', [0 for i in xrange(S.count())])
        Y = array.array('L', [0 for i in xrange(S.count())])
        with kmers(fn, 'r') as z:
            sacgt = z.meta['acgt']
            xs = readKmers(z)
            X = array.array('L', xs)
        M = len(X)
        resolveAll(K, S, L, Y, X)
        X = sparse(2 * K, X)

        # Dot product of the index and sample 'acgt' frequency vectors.
        g = sum([qp * sp for (qp, sp) in zip(qacgt, sacgt)])
        print >> sys.stderr, "g =", g
        # nm[j]: null(g, M, j) for each prefix length j in 0..K.
        nm = [null(g, M, j) for j in range(0, K + 1)]

        # counts for computing distribution of prefix lengths
        cnt = [[0 for j in xrange(K + 1)] for i in xrange(len(nms))]

        # the k-mers that we pulled by lcp from the sample
        # for each position of each query.
        P = [
            array.array('L', [0 for j in xrange(lens[i] - K + 1)])
            for i in xrange(len(lens))
        ]

        # the length of the lcp for each position of each query.
        Q = [
            array.array('B', [0 for j in xrange(lens[i] - K + 1)])
            for i in xrange(len(lens))
        ]

        # Project the per-k-mer results back onto query positions.
        for i in xrange(S.count()):
            for j in xrange(T[i], T[i + 1]):
                n = U[j]
                p = V[j]
                y = Y[i]
                l = L[i]
                cnt[n][l] += 1
                # A positive p is shifted down by one; any other value
                # is decoded as -(p + 1) and the k-mer is reverse
                # complemented -- presumably the sign of the stored
                # position encodes the strand; TODO confirm.
                if p > 0:
                    p -= 1
                else:
                    p = -(p + 1)
                    y = rc(K, y)
                # Keep the longest match seen for this position.
                if l > Q[n][p]:
                    Q[n][p] = l
                    P[n][p] = y

        for i in xrange(len(nms)):
            # iterate over the queries

            # Log threshold used below to decide which bases are
            # reported at each output position.
            qc = math.log(K * 0.05 / float(lens[i] - K + 1) / 2)

            # Link up "de Bruijn" sequences: adjacent positions are
            # joined when the low 2*K-2 bits of the previous k-mer equal
            # the current k-mer shifted right by two bits.
            m = (1 << (2 * K - 2)) - 1
            py = 0
            u = unionfind()
            for j in xrange(lens[i] - K + 1):
                x = P[i][j]
                y = x >> 2
                if j > 0:
                    d = ham(py, y)
                    if d == 0:
                        u.union(j - 1, j)
                py = x & m

            # Gather up the de Bruijn fragments
            udx = {}
            for j in xrange(lens[i] - K + 1):
                v = u.find(j)
                if v not in udx:
                    udx[v] = []
                udx[v].append(j)

            # Index the left hand k-mers
            idxLhs = {}
            kx = []
            for (jx, js) in udx.iteritems():
                q = 0
                for j in js:
                    q += math.log1p(-nm[Q[i][j]])
                # Fragments whose summed log term stays above this
                # threshold are dropped.
                if q > math.log(0.05 / len(js)):
                    continue
                # Negated length, so that sorting puts the longest
                # fragments first.
                kx.append((-len(js), jx))
                idxLhs[P[i][js[0]]] = jx
            kx.sort()

            # Attempt to link up fragments
            links = {}
            for (_, jx) in kx:
                jR = udx[jx][-1]
                # NOTE(review): positions run from 0 to lens[i] - K, so
                # this comparison looks like it can never be true --
                # confirm whether `lens[i] - K` was intended.
                if jR == lens[i] - K + 1:
                    continue
                x = P[i][jR]
                xs = []
                lnk = None
                # Follow unique successors in the sample, at most 100
                # steps, until the first k-mer of another fragment is
                # reached.
                for k in xrange(100):
                    ys = succ(K, X, x)
                    if len(ys) != 1:
                        break
                    x = ys[0]
                    if x in idxLhs:
                        lnk = idxLhs[x]
                        break
                    xs.append(x)
                if lnk is not None:
                    links[jx] = xs
                    u.union(jx, lnk)

            # Gather up the linked fragments
            vdx = {}
            for j in [jx for (_, jx) in kx]:
                v = u.find(j)
                if v not in vdx:
                    vdx[v] = []
                vdx[v].append(j)

            res = []
            for (jxx, jxs) in vdx.iteritems():
                # Order the fragments by start position
                fs = [(udx[jx][0], jx) for jx in jxs]
                fs.sort()
                # sxs: one (k-mer, lcp length) pair per output position;
                # (0, 0) marks positions with no fragment.
                sxs = []
                for fj in xrange(len(fs)):
                    (_, jx) = fs[fj]
                    beg = udx[jx][0]
                    end = udx[jx][-1] + 1
                    if fj == 0:
                        for j in xrange(beg):
                            sxs.append((0, 0))
                    xs = links.get(jx, None)
                    for j in xrange(beg, end):
                        x = P[i][j]
                        l = Q[i][j]
                        sxs.append((x, l))
                    if xs:
                        # NOTE(review): 27 is hard-coded here; it equals
                        # the K used when building the index, but K is
                        # loaded from the index metadata in this mode.
                        for x in xs:
                            sxs.append((x, 27))
                    else:
                        if fj < len(fs) - 1:
                            nxt = fs[fj + 1][0]
                        else:
                            nxt = lens[i] - K + 1
                        for j in xrange(end, nxt):
                            sxs.append((0, 0))
                # seq[pos][b]: accumulated log term for 2-bit base code b
                # at output position pos, summed over the k-mers that
                # cover that position.
                seq = [[0, 0, 0, 0] for j in xrange(len(sxs) + K - 1)]
                for j in xrange(len(sxs)):
                    (x, l) = sxs[j]
                    p = math.log1p(-nm[l])
                    for k in xrange(K):
                        seq[j + K - k - 1][x & 3] += p
                        x >>= 2
                ax = []
                p = None
                inf = False
                for j in xrange(len(seq)):
                    # b: bitmask of the base codes whose accumulated
                    # value is below qc; fasta(b) turns it into the
                    # output symbol.
                    b = 0
                    for k in xrange(4):
                        if seq[j][k] < qc:
                            b |= 1 << k
                    ax.append(fasta(b))
                    ssj = sum(seq[j])
                    if p is None:
                        p = ssj
                    else:
                        p = logAdd(p, ssj)
                    if ssj > -1e-300:
                        inf = True
                dst = counts2cdf(cnt[i])
                (_, kd) = ksDistance2(dst, nm)
                df = math.ceil(len(seq) / float(K))
                if inf:
                    q = 1e300
                    pv = 0.0
                else:
                    q = 2 * math.exp(p)
                    pv = chi2(df, q)
                res.append((pv, q, kd, ''.join(ax)))

            if len(res) == 0:
                continue

            # Smallest pv first.
            res.sort()
            # NOTE(review): comparing pv with -2 and dividing it by
            # ln(10) below suggests chi2() returns a natural-log
            # p-value -- confirm.
            if res[0][0] < -2:
                #ed = lev(seqs[i], res[0][2])
                ed = 0
                pv = res[0][0] / math.log(10)
                c2 = res[0][1]
                kd = res[0][2]
                a = res[0][3]
                print '%d\t%d\t%d\t%g\t%g\t%g\t%s\t%s' % (
                    i, lens[i], len(a), kd, c2, pv, nms[i], a)
            sys.stdout.flush()
Esempio n. 12
0
def main(argv):
    """
    Look for duplications in reference sequences using k-mers from reads.

    Each sequence in the FASTA file `<sequences>` is indexed by its `-k`
    length k-mers (K must be even), split into a leading and a trailing
    half of J = K/2 bases. Reads from `<input>` that share a k-mer with a
    reference contribute all their k-mers to that reference's counts.

    As written, for each reference the function prints a '.'/'X' map
    showing which reference k-mers are covered by a read k-mer counted at
    least 10 times, then traces paths across the uncovered ranges with
    Pather, and moves on to the next reference. The duplication-calling
    code further down the loop body is never reached (see the note at the
    `continue`).
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    if (K & 1) != 0:
        print >> sys.stderr, "K must be even."
        return

    minCov = int(opts['-m'])

    verbose = opts['-v']

    # A K-mer x is split into its leading J-mer (x >> S) and its trailing
    # J-mer (x & Mj).
    J = K // 2
    S = 2*(K - J)
    Mj = (1 << (2*J)) - 1

    names = []
    seqs = {}
    # bait: k-mer -> set of reference numbers containing it.
    bait = {}
    # Per reference: leading half -> set of trailing halves (wtFst) and
    # trailing half -> set of leading halves (wtLst).
    wtFst = []
    wtLst = []
    # Per reference: J-mer -> list of positions.
    posIdx = []
    # Per reference: the reads that hit it.
    rds = []
    with openFile(opts['<sequences>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(names)
            names.append(nm)
            seqs[nm] = seq
            wf = {}
            wl = {}
            for x in kmersList(K, seq, False):
                if x not in bait:
                    bait[x] = set([])
                bait[x].add(n)

                y0 = x >> S
                y1 = x & Mj
                #print '- %s\t%s\t%s' % (render(K, x), render(J, y0), render(J, y1))

                if y0 not in wf:
                    wf[y0] = set([])
                wf[y0].add(y1)

                if y1 not in wl:
                    wl[y1] = set([])
                wl[y1].add(y0)

            wtFst.append(wf)
            wtLst.append(wl)
            
            px = {}
            for (x,p) in kmersWithPosList(J, seq, False):
                if x not in px:
                    px[x] = []
                px[x].append(p)
            posIdx.append(px)

            # Running findDup on the reference against itself reports
            # patterns already present in the reference; these are only
            # warned about.
            for (a, b, c, d) in findDup(wtFst[n], wtLst[n], wtFst[n], wtLst[n]):
                pps = positions(posIdx[n], J, a, b, c, d)
                if pps is None:
                    continue
                for pp in pps:
                    ab = a << S | b
                    cb = c << S | b
                    cd = c << S | d
                    dd = pp[2] - pp[0]
                    print >> sys.stderr, 'warning: phantom dumplication: %s-%s-%s (%d)' % (render(K, ab), render(K, cb), render(K, cd), dd)

            rds.append([])

    N = len(names)

    # L is assigned per read but not read anywhere in this function.
    L = None
    # X[n]: k-mer -> count, over all reads that hit reference n.
    X = [{} for n in range(N)]
    for itm in reads(opts['<input>'], K=K, reads=True, kmers=True, both=True, verbose=verbose):
        rd = itm.reads[0]
        L = len(rd)

        xs = itm.kmers[0]
        hits = set([])
        for x in xs:
            if x in bait:
                hits |= bait[x]
        for n in hits:
            for x in xs:
                if x not in X[n]:
                    X[n][x] = 0
                X[n][x] += 1
            rds[n].append(rd)

    hdrShown = False
    vn = 0
    for n in range(N):
        # Keep only k-mers seen at least 10 times (hard-coded here;
        # minCov is only consulted in the code after the `continue`).
        xs = {}
        for (x,c) in X[n].iteritems():
            if c >= 10:
                xs[x] = c

        seq = seqs[names[n]]

        rngs = []
        st = None
        en = None
        inside = False
        # Coverage map: '.' where the reference k-mer is in xs, 'X'
        # where it is not.
        xx = []
        for x in kmersList(K, seq, False):
            if x in xs:
                xx.append('.')
            else:
                xx.append('X')
        print ''.join(xx)
        # Collect (st, en) pairs bracketing each uncovered run: st is the
        # last covered k-mer before the run and en the first covered
        # k-mer after it. Either may be None at the ends of the sequence.
        for x in kmersList(K, seq, False):
            if not inside:
                if x in xs:
                    st = x
                else:
                    inside = True
            else:
                if x in xs:
                    en = x
                    rngs.append((st, en))
                    st = x
                    en = None
                    inside = False
        if inside:
            rngs.append((st, en))

        pthr = Pather(K, xs)

        for (x,y) in rngs:
            if x is None or y is None:
                continue
            print render(K, x), render(K, y)
            for p in pthr.trace(x, y, 100):
                print renderPath(K, p)
        # NOTE(review): this unconditional `continue` makes the rest of
        # the loop body (the duplication calling and the table output)
        # unreachable -- confirm whether it is a debugging leftover.
        continue

        # Split the well-covered read k-mers into halves, mirroring
        # wtFst/wtLst but holding lists rather than sets.
        fst = {}
        lst = {}
        for (x,c) in xs.iteritems():
            #if c < 5:
            #    continue
            y0 = x >> S
            y1 = x & Mj

            if y0 not in fst:
                fst[y0] = []
            fst[y0].append(y1)

            if y1 not in lst:
                lst[y1] = []
            lst[y1].append(y0)

        #for (a, b, c, d) in findDupDeNovo(fst, lst):
        for (a, b, c, d) in findDup(wtFst[n], wtLst[n], fst, lst):
            #continue
            pps = positions(posIdx[n], J, a, b, c, d)
            if pps is None:
                continue
            for pp in pps:
                # The three K-mers built from the half k-mers a, b, c, d.
                ab = a << S | b
                cb = c << S | b
                cd = c << S | d
                #print [(render(J, w), p) for (w,p) in zip([a, b, c, d], pps)]

                dd = pp[2] - pp[0]

                # Unless -a is given, only lengths divisible by 3 are
                # kept.
                if not opts['-a'] and dd % 3 != 0:
                    continue

                if opts['-s']:
                    # Require paths for both halves whose interiors
                    # (without J characters at each end) are identical.
                    fstPath = interpolate(K, xs, ab, cb, dd+1)
                    sndPath = interpolate(K, xs, cb, cd, dd+1)

                    if fstPath is None:
                        continue
                    if sndPath is None:
                        continue

                    if fstPath[J:-J] != sndPath[J:-J]:
                        continue

                pa = pp[0]
                pb = pp[1]
                pc = pp[2]
                pd = pp[3]

                cab = xs.get(ab, 0)
                ccb = xs.get(cb, 0)
                ccd = xs.get(cd, 0)

                # All three k-mers must reach the minimum coverage.
                if cab < minCov:
                    continue
                if ccb < minCov:
                    continue
                if ccd < minCov:
                    continue

                m = (cab + ccd) / 2.0
                # Assume the true std dev is 10% of the mean
                # w: coverage of the middle k-mer relative to the mean of
                # the flanking ones; reported in the 'vaf' column.
                w = ccb / m

                hgvs = '%s:c.%d_%ddup' % (names[n], pb, pd - 1)
                # v is constructed but not used afterwards.
                v = Duplication(names[n], pb, pd-1, seqs)
                if opts['-A']:
                    showAnchoredReads(K, {ab:'AB', cb:'CB', cd:'CD'}, rds[n])

                vn += 1

                # Build the output row as parallel lists of column
                # headers, format specifiers and values.
                hdrs = ['n']
                fmts = ['%d']
                outs = [vn]

                hdrs += ['left', 'leftCov']
                fmts += ['%s','%d']
                outs += [render(K, ab), cab]

                hdrs += ['mid', 'midCov']
                fmts += ['%s','%d']
                outs += [render(K, cb), ccb]

                hdrs += ['right', 'rightCov']
                fmts += ['%s','%d']
                outs += [render(K, cd), ccd]

                hdrs += ['len']
                fmts += ['%d']
                outs += [dd]

                hdrs += ['vaf']
                fmts += ['%g']
                outs += [w]

                hdrs += ['hgvs']
                fmts += ['%s']
                outs += [hgvs]

                # The header line is printed once, before the first row.
                if not hdrShown:
                    hdrShown = True
                    print '\t'.join(hdrs)
                print '\t'.join(fmts) % tuple(outs)
Esempio n. 13
0
def _appendFastq(fn, rds):
    """
    Append every read in `rds` to the file named `fn`. Fields 0..3 of
    each read are written one per line (the four lines of a FASTQ
    record).
    """
    with open(fn, 'a') as f:
        for rd in rds:
            f.write('%s\n%s\n%s\n%s\n' % (rd[0], rd[1], rd[2], rd[3]))


def _flushPairCache(tmpPair, cachePair):
    """
    Append the buffered mates held in `cachePair` (a two element list
    of read lists) to the two files named in `tmpPair`, then empty
    both buffers in place.
    """
    _appendFastq(tmpPair[0], cachePair[0])
    _appendFastq(tmpPair[1], cachePair[1])
    cachePair[0] = []
    cachePair[1] = []


def main(argv):
    """
    Sort read pairs into per-bait FASTQ files stored in a zip archive.

    The FASTA file `<baits>` is indexed by its k-mers (K is fixed at
    25), mapping each k-mer to the sorted list of bait numbers that
    contain it. If `-U` is given, the k-mers of that FASTA file form
    an exclusion set: a read pair containing any of them is dropped.

    For each pair of FASTQ files produced from `<input>`, every
    remaining read pair is written to the temporary files of each bait
    it shares at least one k-mer with. Those files are then stored in
    `<output>` under `<bait name, whitespace replaced by '/'>/<input
    file name>`. All input pairs accumulate in the same archive.

    Finally a histogram is printed to stdout, one tab separated line
    per entry: the number of baits hit and the number of read pairs
    that hit that many baits.

    Raises NotImplementedError unless `-p` (paired input) is given.
    """
    opts = docopt.docopt(__doc__, argv)

    K = 25

    # Bait index: k-mer -> sorted list of bait numbers.
    # NOTE(review): the third argument to kmersList presumably asks
    # for reverse complements as well - confirm against its definition.
    nms = []
    idx = {}
    with openFile(opts['<baits>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(nms)
            nms.append(nm)
            for x in kmersList(K, seq, True):
                if x not in idx:
                    idx[x] = set([])
                idx[x].add(n)

    for x in list(idx.keys()):
        idx[x] = sorted(idx[x])

    # K-mers that disqualify a read pair outright.
    anti = set([])
    if opts['-U']:
        with openFile(opts['-U']) as f:
            for (nm, seq) in readFasta(f):
                anti.update(kmersList(K, seq, True))

    if not opts['-p']:
        # A string cannot be raised; use a real exception type.
        raise NotImplementedError("not implemented")

    hist = {}
    # The archive is created by the first input pair and appended to by
    # the following ones, so that later pairs do not wipe earlier ones.
    zmode = 'w'
    for (fn1, fn2) in pairs(opts['<input>']):
        tmps = [(tmpfile('_1.fastq'), tmpfile('_2.fastq')) for _ in nms]
        cache = [[[], []] for _ in nms]
        counts = [0] * len(nms)
        with openFile(fn1) as f1, openFile(fn2) as f2:
            for fq1, fq2 in both(readFastq(f1), readFastq(f2)):
                hits = set([])
                pushup = False
                for rd in (fq1, fq2):
                    for x in kmersList(K, rd[1]):
                        if x in anti:
                            pushup = True
                            break
                        hits.update(idx.get(x, []))
                    if pushup:
                        # The pair is rejected; no need to scan the mate.
                        break
                if pushup:
                    continue
                n = len(hits)
                hist[n] = 1 + hist.get(n, 0)
                for i in hits:
                    counts[i] += 1
                    cache[i][0].append(fq1)
                    cache[i][1].append(fq2)
                    # Spill to disk in batches to bound memory use.
                    if len(cache[i][0]) >= 1024:
                        _flushPairCache(tmps[i], cache[i])

        # Write out whatever is still buffered for this input pair.
        for i in range(len(cache)):
            if len(cache[i][0]) > 0:
                _flushPairCache(tmps[i], cache[i])

        with zipfile.ZipFile(opts['<output>'], zmode,
                             zipfile.ZIP_DEFLATED) as z:
            for i in range(len(nms)):
                if counts[i] > 0:
                    pth = '/'.join(nms[i].split())
                    z.write(tmps[i][0], pth + '/' + fn1)
                    os.remove(tmps[i][0])
                    z.write(tmps[i][1], pth + '/' + fn2)
                    os.remove(tmps[i][1])
        zmode = 'a'

    for (n, f) in sorted(hist.items()):
        print('%d\t%d' % (n, f))
Esempio n. 14
0
    def next(self):
        """
        Return the next item as a `Reads` object, taking one record
        from each of the `N` files that are currently open.

        Files are consumed in consecutive groups of `self.N`. When any
        parser of the current group runs dry the group is abandoned
        (with a warning on stderr if some of its parsers still had a
        record) and the next group is opened. StopIteration is raised
        once fewer than `self.N` files remain.

        The result carries `reads` when `self.reads` is set and
        `kmers` when `self.kmers` is set.
        """
        self.readNum += 1
        onTick = (self.readNum & self.M) == 0
        if onTick and self.progress is not None:
            self.progress.update(self.M)

        while True:
            if self.currParsers is None:
                # Step to the next group of files (the first group on
                # the very first call).
                if self.currFilesInd is not None:
                    self.currFilesInd += self.N
                else:
                    self.currFilesInd = 0

                if self.progress is not None:
                    self.progress.update(self.readNum & self.M)

                lastInd = self.currFilesInd + (self.N - 1)
                if lastInd >= len(self.files):
                    raise StopIteration

                group = range(self.currFilesInd, self.currFilesInd + self.N)

                if self.verbose:
                    label = ' & '.join([basename(self.files[j]) for j in group])
                    self.progress = tqdm(unit=' reads', unit_scale=True)
                    self.progress.set_postfix(reading=label, refresh=True)

                # One parser per file, chosen by what isFasta reports.
                self.currParsers = []
                for j in group:
                    path = self.files[j]
                    handle = openFile(path)
                    parse = readFasta if isFasta(path) else readFastq
                    self.currParsers.append(parse(handle))

            # Pull one record from every parser of the group.
            self.currReads = []
            try:
                for parser in self.currParsers:
                    self.currReads.append(parser.next())
            except StopIteration:
                # A non-empty partial result means an earlier file of
                # the group still had a record when this one ran out.
                if self.currReads:
                    print >> sys.stderr, 'warning: files had unequal length'
                self.currParsers = None
                if self.progress is not None:
                    self.progress.close()
                    self.progress = None
                continue

            if self.kmers:
                collected = []
                self.currKmers = collected
                for rd in self.currReads:
                    if self.fwdOnly:
                        ks = kmersList(self.K, rd[1], False)
                    elif self.both:
                        ks = kmersList(self.K, rd[1], True)
                    else:
                        assert self.separate
                        ks = kmersLists(self.K, rd[1])
                    collected.append(ks)

            res = Reads()
            if self.reads:
                res.reads = self.currReads
            if self.kmers:
                res.kmers = self.currKmers
            return res
Esempio n. 15
0
def main(argv):
    """
    Count the k-mers of the reads in `<input>` and write them, with
    their counts, to the k-mer container `<output>`.

    Options read from the command line:

    `<k>`   the k-mer length.
    `-m`    buffer size multiplier; the in-memory accumulator is
            spilled once `buf.mem()` reaches half of `-m` * 2**20
            (default 32 * 2**20).
    `-D`    a float passed, with `-S` (an int, default 0), to `sub`
            for every k-mer; only k-mers for which `sub` is true are
            counted. NOTE(review): presumably a subsampling rate and
            seed - confirm against `sub`.
    `-C`    a FASTA file; when given (and `-D` is not), a read is
            counted only if it shares at least one k-mer with it.
    `-v`    passed through to `reads` as `verbose`.

    The accumulator is spilled to a temporary casket whenever it grows
    too large, and the spilled parts are merged at the end. The output
    metadata records K, a histogram of counts, the relative frequency
    of the low two bits of every k-mer seen (`acgt`), and the number
    of reads. The temporary casket is removed even if an error occurs.
    """
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['<k>'])

    out = opts['<output>']

    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    buf = KmerAccumulator2(K)
    tmps = []
    acgt = [0, 0, 0, 0]

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

        S = 0
        if opts['-S'] is not None:
            S = int(opts['-S'])

        # Memoised answers from sub(); both are reset when they grow
        # past a million entries.
        cacheYes = set([])
        cacheNo = set([])

    B = opts['-C']
    if B is not None:
        xs = set([])
        with openFile(B) as f:
            for (nm, seq) in readFasta(f):
                xs |= set(kmersList(K, seq, True))
        B = xs

    tmpnm = tmpfile('.pmc')
    try:
        with casket(tmpnm, 'w') as z:
            nr = 0
            for itm in reads(opts['<input>'],
                             K=K,
                             pairs=False,
                             reads=False,
                             kmers=True,
                             both=True,
                             verbose=verbose):
                xs = itm.kmers[0]
                # Tallied for every k-mer, including ones filtered
                # out below.
                for x in xs:
                    acgt[x & 3] += 1
                if d is not None:
                    for x in xs:
                        if x in cacheNo:
                            continue
                        if x not in cacheYes:
                            if not sub(S, d, x):
                                cacheNo.add(x)
                                continue
                            cacheYes.add(x)
                        buf.add(x)
                    if len(cacheYes) > 1000000:
                        cacheYes = set([])
                    if len(cacheNo) > 1000000:
                        cacheNo = set([])
                elif B is not None:
                    if any(x in B for x in xs):
                        buf.addList(xs)
                else:
                    buf.addList(xs)

                nr += 1
                # Only look at memory use every 1024 reads.
                if (nr & 1023) == 0 and buf.mem() >= Z // 2:
                    fn = 'tmps-%d' % (len(tmps), )
                    tmps.append(fn)
                    writeKmersAndCounts2(z, buf.kmersOnly(),
                                         buf.countsOnly(), fn)
                    buf.clear()

            # If anything was spilled, spill the remainder too so the
            # merge below sees all of the data.
            if len(tmps) and len(buf):
                fn = 'tmps-%d' % (len(tmps), )
                tmps.append(fn)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
                # The accumulator is not used again on this path.
                buf = []

        with zotk.kmers(out, 'w') as z:
            h = {}
            if len(tmps) == 0:
                for c in buf.countsOnly():
                    h[c] = 1 + h.get(c, 0)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly())
            elif len(tmps) == 1:
                # NOTE(review): `h` stays empty on this path, so the
                # stored histogram is empty - confirm whether that is
                # intended.
                with casket(tmpnm, 'r') as z0:
                    writeKmersAndCounts(z, readKmersAndCounts(z0, tmps[0]))
            else:
                with casket(tmpnm, 'r') as z0:
                    xss = [readKmersAndCounts(z0, t) for t in tmps]
                    mergeNinto(K, xss, h, z)
            # Guard against input with no k-mers at all, which would
            # otherwise divide by zero.
            total = float(sum(acgt))
            if total > 0:
                acgt = [c / total for c in acgt]
            else:
                acgt = [0.0 for c in acgt]
            z.meta['K'] = K
            z.meta['kmers'] = 'kmers'
            z.meta['counts'] = 'counts'
            z.meta['hist'] = h
            z.meta['acgt'] = acgt
            z.meta['reads'] = nr
    finally:
        if os.path.exists(tmpnm):
            os.remove(tmpnm)