Beispiel #1
0
def test_fixed_path_kmers_1() :
    """Check fixed_path_kmers against k-mer counts taken from noisy reads.

    N copies of the reference sequence are generated, each base being
    replaced by a different base with probability e.  The k-mers of all
    copies are counted and handed to fixed_path_kmers together with the
    reference k-mers.  For every position the reference k-mer must be
    among the returned candidates, and no other candidate may have a
    strictly larger count than the reference k-mer.
    """
    random.seed(17)
    K = 25
    D = 3
    N = 100
    e = 0.01
    alts = {'A':['C','G','T'], 'C':['A','G','T'], 'G':['A','C','T'], 'T':['A','C','G']}
    seq = 'TACTTGCACTGGGAGGCACAGCGGCTTTTCAGTGTCACAGGTATTACGAG'
    xs = kmersList(K, seq)

    # Count the k-mers over all simulated noisy copies of the sequence.
    X = {}
    for _ in range(N):
        noisy = []
        for base in seq:
            # One random draw per base; a second one only when mutating.
            if random.random() < e:
                base = random.choice(alts[base])
            noisy.append(base)
        for kmer in kmersList(K, ''.join(noisy)):
            X[kmer] = X.get(kmer, 0) + 1

    Y = fixed_path_kmers(K, D, X, xs)
    assert Y is not None
    assert len(Y) == len(xs)
    for (i, ref) in enumerate(xs):
        candidates = Y[i]
        assert ref in candidates
        # Candidates (rendered, with counts) that beat the reference k-mer.
        better = [(render(K, y), X[y])
                  for y in candidates
                  if y != ref and X[y] > X[ref]]
        assert len(better) == 0
Beispiel #2
0
def test_junction_kmers() :
    """junction_kmers must reproduce the k-mer list between its two end k-mers."""
    K = 25
    seq = 'TACTTGCACTGGGAGGCACAGCGGCTTTTCAGTGTCACAGGTATTACGAG'
    expected = kmersList(K, seq)
    # The 50-base sequence is expected to give K + 1 k-mers.
    assert len(expected) == K+1
    first = expected[0]
    last = expected[-1]
    rebuilt = junction_kmers(K, first, last)
    assert expected == rebuilt
Beispiel #3
0
    def addReadAndKmers(self, rd):
        """Assign one read to every bait that shares a k-mer with it.

        rd is a read record whose element 1 is the sequence.  For each
        bait hit, the read counter is incremented; when self.reads is set
        the record is appended to the bait's first read list, and when
        self.kmers is set all k-mers of the read are added to the bait's
        k-mer counts.  Nothing happens if the read hits no bait.
        """
        xs = kmersList(self.K, rd[1], True)

        # Distinct bait indices reached through any k-mer of the read.
        hits = set()
        for x in xs:
            if x in self.baits:
                hits.update(self.baits[x])
        if not hits:
            return

        for n in hits:
            self.capReadCounts[n] += 1

        if self.reads:
            for n in hits:
                # += extends the list with the fields of the record.
                self.capReads[n][0] += rd

        if self.kmers:
            for n in hits:
                tally = self.capKmers[n]
                for x in xs:
                    tally[x] = tally.get(x, 0) + 1
Beispiel #4
0
def findFairlySimpleAllele(K, D, lhs, mid, rhs, mx):
    """Yield at most one allele result built from per-position consensus k-mers.

    The expected allele is assembled from lhs, mid and rhs with K - 1
    bases of flank on each side and cut into k-mers.  fixed_path_kmers
    supplies a candidate set for each of those k-mers from the counts in
    mx; if it returns None the generator ends without yielding.  Otherwise
    consensus_kmer picks one k-mer (and a coverage value) per position and
    a single result dict is yielded.
    """
    allele = consAllele(lhs, mid, rhs, K - 1, K - 1)
    xs = kmersList(K, allele)

    Y = fixed_path_kmers(K, D, mx, xs)
    if Y is None:
        return

    # One (k-mer, coverage) choice per position along the path.
    picks = [consensus_kmer(K, Y[i], mx) for i in range(len(xs))]
    ancPath = [kmer for (kmer, depth) in picks]
    cov = [depth for (kmer, depth) in picks]

    yield {
        'ancPath': ancPath,
        'allele': renderPath(K, ancPath),
        'covMin': min(cov),
        'path': ancPath,
        'lhsPos': 0,
        'rhsPos': 0,
    }
Beispiel #5
0
    def score(self, res, lhs, seq, rhs):
        """Annotate a path result in place with distance-based scores.

        res must provide 'path', 'allele', 'lhsPos' and 'rhsPos'.  The
        expected allele is built from lhs, seq and rhs; when seq is None a
        run of 'N' as long as res['allele'] stands in for it and every
        path k-mer is counted at distance 0.  Otherwise the path k-mers
        are compared pairwise with the expected k-mers using ham and the
        distances are tallied.  The keys 'hammingProfile', 'hammingCdf',
        'ksDist', 'hamming' and 'binom' are written to res.
        """
        K = self.K
        lhsFlank = (K - 1) + res['lhsPos']
        rhsFlank = (K - 1) + res['rhsPos']
        path = res['path']

        # profile[d] = number of path k-mers at distance d, for d in 0..K.
        profile = [0] * (K + 1)

        if seq is not None:
            allele = consAllele(lhs, seq, rhs, lhsFlank, rhsFlank)
            expected = kmersList(K, allele)
            for (x, y) in zip(path, expected):
                profile[ham(x, y)] += 1
        else:
            filler = len(res['allele']) * 'N'
            allele = consAllele(lhs, filler, rhs, lhsFlank, rhsFlank)
            profile[0] += len(path)

        cdf = counts2cdf(profile)
        res['hammingProfile'] = profile
        res['hammingCdf'] = cdf
        res['ksDist'] = ksDistance2(cdf, self.nullCdf)[0]
        res['hamming'] = hamming(allele, renderPath(K, path))

        # Product over d of binModel[d] ** profile[d]; stored as 1 - product.
        p = 1.0
        for (d, count) in enumerate(profile):
            p *= math.pow(self.binModel[d], count)
        res['binom'] = 1.0 - p
Beispiel #6
0
def test_fixed_path_kmers_2() :
    """fixed_path_kmers must map each k-mer of seq0 onto the matching k-mer of seq1.

    seq1 differs from seq0 by two single-base substitutions, and only the
    k-mers of seq1 are present in the count table.  Each position must
    come back with exactly one candidate: the seq1 k-mer at that position.
    """
    random.seed(17)
    K = 25
    D = 3
    N = 100
    e = 0.01
    seq0 = 'TACTTGCACTGGGAGGCACAGCGGCTTTTCAGTGTCACAGGTATTACGAG'
    seq1 = 'TACTTGCACTGGGAGGCCCAGCGGCTTTTCAGTGTCACAGGTATTAGGAG'
    xs = kmersList(K, seq0)
    ys = kmersList(K, seq1)
    X = {y: 1 for y in ys}
    Y = fixed_path_kmers(K, D, X, xs)
    assert Y is not None
    assert len(Y) == len(xs)
    for i in range(len(xs)):
        candidates = Y[i]
        assert len(candidates) == 1
        assert ys[i] in candidates
Beispiel #7
0
def test_interpolate0Exacxt() :
    """interpolate from a k-mer to itself must return a path holding just that k-mer."""
    K = 25
    seq = "GGAGTTTCCAAGAGAAAATTTAGAGTTTGGGAAGGTACTAGGATCAGGTGCTTTTGGAAAAGTGATGAAC"
    start = kmersList(K, seq, False)[0]
    known = {start: 1}
    path = interpolate(K, known, start, start, 1)
    assert path is not None
    assert len(path) == 1
    assert path[0] == start
Beispiel #8
0
def main(argv):
    """Distribute reads into one ReadCache per bait sequence.

    Every k-mer of every sequence in <sequences> is indexed to the
    sequences containing it.  Each item from the input is then added to
    the cache of every sequence it shares a k-mer with, and all caches
    are finalised with end() once the input is exhausted.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    B = int(opts['-b'])
    paired = opts['-p']
    verbose = opts['-v']
    Z = opts['-z']

    # baits: k-mer -> set of indices (into names) of sequences holding it.
    names = []
    seqs = []
    baits = {}
    with openFile(opts['<sequences>']) as f:
        for (n, (nm, seq)) in enumerate(readFasta(f)):
            names.append(nm)
            seqs.append(seq)
            for x in kmersList(K, seq, True):
                baits.setdefault(x, set()).add(n)

    N = len(names)

    caches = [ReadCache(opts['-P'], nm, paired, B, Z) for nm in names]

    nr = 0
    nh = 0
    for itm in reads(opts['<input>'],
                     reads=True,
                     kmers=True,
                     fwdOnly=True,
                     paired=paired,
                     verbose=verbose):
        nr += 1

        # Union of the baits reached by any k-mer of any read in the item.
        hits = set()
        for fwd in itm.kmers:
            for x in fwd:
                if x in baits:
                    hits |= baits[x]

        for n in hits:
            caches[n].add(itm.reads)

        if hits:
            nh += 1

    for cache in caches:
        cache.end()
Beispiel #9
0
def findSimpleAllele(K, lhs, mid, rhs, mx):
    """Describe the allele lhs + mid + rhs by its exact k-mer path.

    The allele is built with K - 1 bases of flank on each side and cut
    into k-mers.  'covMin' is the smallest count found in mx along that
    path, with k-mers absent from mx counting as 0.
    """
    flank = K - 1
    path = kmersList(K, consAllele(lhs, mid, rhs, flank, flank))
    return {
        'allele': mid,
        'covMin': min(mx.get(x, 0) for x in path),
        'path': path,
        'lhsPos': 0,
        'rhsPos': 0,
    }
Beispiel #10
0
def test_interpolate3Exacxt() :
    """interpolate across a whole sequence must return its k-mers in order."""
    K = 25
    seq = "GGAGTTTCCAAGAGAAAATTTAGAGTTTGGGAAGGTACTAGGATCAGGTGCTTTTGGAAAAGTGATGAAC"
    ks = kmersList(K, seq, False)
    known = {z: 1 for z in ks}
    # NOTE(review): the sibling tests pass a plain integer as the last
    # argument; here a slice object is passed.  Confirm this is intended
    # and that `slice` is the builtin rather than a project helper.
    p = interpolate(K, known, ks[0], ks[-1], slice(len(ks)+1))
    assert p is not None
    assert len(p) == len(ks)
    for (i, expected) in enumerate(ks):
        assert expected == p[i]
Beispiel #11
0
def test_interpolate1Exacxt() :
    """interpolate between the first two k-mers must return exactly those two."""
    K = 25
    seq = "GGAGTTTCCAAGAGAAAATTTAGAGTTTGGGAAGGTACTAGGATCAGGTGCTTTTGGAAAAGTGATGAAC"
    ks = kmersList(K, seq, False)[:2]
    (src, dst) = (ks[0], ks[1])
    known = {z: 1 for z in ks}
    path = interpolate(K, known, src, dst, 2)
    assert path is not None
    assert len(path) == 2
    assert path[0] == src
    assert path[1] == dst
Beispiel #12
0
def test_fixed_path_kmers_0() :
    """With only the reference k-mers counted, each position has one candidate: itself."""
    K = 25
    D = 3
    seq = 'TACTTGCACTGGGAGGCACAGCGGCTTTTCAGTGTCACAGGTATTACGAG'
    xs = kmersList(K, seq)
    X = {x: 1 for x in xs}
    Y = fixed_path_kmers(K, D, X, xs)
    assert Y is not None
    assert len(Y) == len(xs)
    for (i, ref) in enumerate(xs):
        candidates = Y[i]
        assert len(candidates) == 1
        assert candidates[0] == ref
Beispiel #13
0
def parseFiles(K, paired, fns, verbose):
    """Yield one list of k-mers per read (unpaired) or per read pair (paired).

    K       -- k-mer length.
    paired  -- when false, every file in fns is read on its own; when
               true, fns is grouped with pairs() and the two files of each
               pair are read in lock step with both().
    fns     -- list of FASTQ file names.
    verbose -- when true, a progress line is written to stderr every
               2**18 reads (or read pairs).

    In paired mode the yielded list holds the k-mers of the first read
    followed by the reverse complements (rc) of the k-mers of the second.
    """
    M = (1 << 18) - 1
    rn = 0

    if not paired:
        for fn in fns:
            with openFile(fn) as f:
                # Iterate over the records of the file.  Previously this
                # loop was missing, so `fq1` was undefined (NameError) and
                # at most one item per file could have been produced.
                for fq1 in readFastq(f):
                    rn += 1
                    if verbose and (rn & M) == 0:
                        print >> sys.stderr, 'reads processed: %d' % (rn,)
                    xs = kmersList(K, fq1[1], False)
                    yield xs
        return

    for (fn1, fn2) in pairs(fns):
        with openFile(fn1) as f1, openFile(fn2) as f2:
            for fq1, fq2 in both(readFastq(f1), readFastq(f2)):
                rn += 1
                if verbose and (rn & M) == 0:
                    print >> sys.stderr, 'read pairs processed: %d' % (rn,)
                # Second read is reverse-complemented k-mer by k-mer.
                xs = kmersList(K, fq1[1], False) + [rc(K, x) for x in kmersList(K, fq2[1], False)]
                yield xs
Beispiel #14
0
 def addBait(self, nm, seq, bothStrands=True):
     """Register a bait sequence under the name nm and return its index.

     The index is the bait's position in self.names.  Each k-mer of seq
     is linked to the index in self.baits (without repeating the index
     when it is already the last entry for that k-mer), and empty capture
     slots (read count, read lists, k-mer counts) are appended for it.
     """
     n = len(self.names)
     self.names.append(nm)
     self.nameIdx[nm] = n

     for x in kmersList(self.K, seq, bothStrands):
         owners = self.baits.get(x)
         if owners is None:
             self.baits[x] = [n]
         elif owners[-1] != n:
             owners.append(n)

     self.capReadCounts.append(0)
     self.capReads.append([[], []])
     self.capKmers.append({})
     return n
Beispiel #15
0
    def addReadPairAndKmers(self, lhs, rhs):
        """Assign a read pair to every bait that shares a k-mer with either read.

        lhs and rhs are read records whose element 1 is the sequence.  For
        each bait hit, the read counter is incremented once; when
        self.reads is set the two records are appended to the bait's first
        and second read lists, and when self.kmers is set the k-mers of
        both reads are added to the bait's k-mer counts.  A pair that hits
        no bait is ignored.
        """
        xs = kmersList(self.K, lhs[1], True)
        ys = kmersList(self.K, rhs[1], True)

        # Distinct bait indices reached from either read.
        hits = set()
        for kmerList in (xs, ys):
            for k in kmerList:
                if k in self.baits:
                    hits.update(self.baits[k])
        if not hits:
            return

        for n in hits:
            self.capReadCounts[n] += 1

        if self.reads:
            for n in hits:
                # += extends each list with the fields of the record.
                self.capReads[n][0] += lhs
                self.capReads[n][1] += rhs

        if self.kmers:
            for n in hits:
                tally = self.capKmers[n]
                for kmerList in (xs, ys):
                    for k in kmerList:
                        tally[k] = tally.get(k, 0) + 1
Beispiel #16
0
    def addRead(self, rd):
        """Store a read with every bait it shares a k-mer with.

        Requires self.reads to be set.  rd is a read record whose element
        1 is the sequence.  For each bait hit, the read counter is
        incremented and the record is appended to the bait's first read
        list.  A read that hits no bait is ignored.
        """
        assert self.reads

        # Distinct bait indices reached through any k-mer of the read.
        hits = set()
        for x in kmersList(self.K, rd[1], True):
            if x in self.baits:
                hits.update(self.baits[x])
        if not hits:
            return

        for n in hits:
            self.capReadCounts[n] += 1
            # += extends the list with the fields of the record.
            self.capReads[n][0] += rd
Beispiel #17
0
    def __init__(self, K, refList):
        """Build a two-level k-mer index over the reference items.

        refList is a sequence of mappings with a 'seq' entry.
        exonLengths[i] ends up as the number of k-mers produced by item i.
        idxUpper maps each k-mer to the signature (sig) of the sorted
        tuple of item indices containing it; idxLower maps each signature
        to the first tuple that produced it.
        """
        self.K = K
        self.exonLengths = [0] * len(refList)

        # k-mer -> set of indices of the items that contain it.
        idx = {}
        for (i, itm) in enumerate(refList):
            for x in kmersList(self.K, itm['seq'], False):
                idx.setdefault(x, set()).add(i)
                self.exonLengths[i] += 1

        self.idxUpper = {}
        self.idxLower = {}
        for x in idx:
            members = tuple(sorted(idx[x]))
            h = sig(members)
            # Keep the first member tuple seen for a given signature.
            self.idxLower.setdefault(h, members)
            self.idxUpper[x] = h
Beispiel #18
0
def main(argv):
    """Summarise sub-sampled k-mer counts per contig and per file as YAML.

    For every input FASTA file, the k-mers of each contig that pass
    sub(S, P, x) are counted and summarised with summarize(); the counts
    are also accumulated over the whole file for a 'global' summary.
    The list of per-file results is dumped to stdout.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    C = int(opts['-c'])
    Q = int(opts['-q'])
    S = int(opts['-S'])
    P = float(opts['-p'])

    verbose = opts['-v']

    # -s switches the k-mer extraction flag off.
    both = not opts['-s']

    res = []
    for fn in opts['<input>']:
        contigs = []
        fileCounts = {}
        ncontig = 0
        with openFile(fn) as f:
            for (nm, seq) in readFasta(f):
                ncontig += 1

                contigCounts = {}
                for x in kmersList(K, seq, both):
                    if sub(S, P, x):
                        contigCounts[x] = 1 + contigCounts.get(x, 0)

                summary = summarize(contigCounts, C, Q)
                summary['name'] = nm
                contigs.append(summary)

                for (x, c) in contigCounts.items():
                    fileCounts[x] = c + fileCounts.get(x, 0)

        fres = {}
        fres['file'] = fn
        fres['contigs'] = contigs
        fres['global'] = summarize(fileCounts, C, Q)
        res.append(fres)

    yaml.safe_dump(res, sys.stdout)
Beispiel #19
0
def main(argv):
    """Print pairs of k-mer counts for a k-mer and its reverse complement.

    With -r, reads are oriented against a reference: reference k-mers
    whose reverse complement is also a reference k-mer are discarded, each
    read (or pair) is randomly assigned to the forward or reverse table
    with probability given by its share of forward hits, and the counts
    are printed as "forward<TAB>reverse" lines.

    Without -r, k-mers are sub-sampled by hashing the smaller of the
    k-mer and its reverse complement, and for every retained k-mer that
    is not larger than its reverse complement a line with the two counts
    is printed, ordered by the hash of the two k-mers.
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    M = (1 << (2*K)) - 1

    paired = not opts['-s']

    p = float(opts['-p'])
    T = int(M * p)

    if opts['-r']:
        refs = set()
        with openFile(opts['-r']) as f:
            for (nm, seq) in readFasta(f):
                refs.update(kmersList(K, seq, False))

        # Drop reference k-mers that occur on both strands.
        kill = set()
        for x in refs:
            y = rc(K, x)
            if y in refs:
                kill.update((x, y))
        sys.stderr.write('removing %d/%d\n' % (len(kill), len(refs)))

        refs -= kill

        fwd = {}
        rev = {}
        for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
            fn = sum(1 for x in xs if x in refs)
            ys = [rc(K, x) for x in xs]
            rn = sum(1 for y in ys if y in refs)

            if fn + rn == 0:
                continue

            # Choose the orientation at random, weighted by forward hits.
            q = float(fn) / float(fn + rn)
            if random.random() < q:
                for x in xs:
                    fwd[x] = 1 + fwd.get(x, 0)
            else:
                for y in ys:
                    rev[y] = 1 + rev.get(y, 0)

        for (x, xc) in fwd.iteritems():
            # Consume the matching reverse entry so it is not printed twice.
            yc = rev.pop(rc(K, x), 0)
            print('%d\t%d' % (xc, yc))

        for (y, yc) in rev.iteritems():
            print('%d\t%d' % (0, yc))

        return

    kx = {}
    for xs in parseFiles(K, paired, opts['<fastq>'], opts['-v']):
        for x in xs:
            if x in kx:
                kx[x] += 1
            elif (murmer(min(x, rc(K, x)), 17) & M) <= T:
                # First sighting of a k-mer that passes the sub-sampling.
                kx[x] = 1

    for x in kx:
        y = rc(K, x)
        if x > y:
            continue
        xc = kx[x]
        yc = kx.get(y, 0)
        if murmer(x, 17) >= murmer(y, 17):
            (ac, bc) = (xc, yc)
        else:
            (ac, bc) = (yc, xc)
        print('%d\t%d' % (ac, bc))
Beispiel #20
0
def main(argv):
    """Count the k-mers of the input reads and write them to <output>.

    K-mers are accumulated in memory (KmerAccumulator2) and spilled to a
    temporary casket file whenever the accumulator reports at least half
    of the memory budget in use; the spilled runs are merged at the end.

    Options read from the docopt result:
      <k>, <output>, <input>, -v
      -m  memory budget multiplier (units of 1024*1024; default 32)
      -D  sub-sampling parameter handed to sub(); -S is its first argument
      -C  FASTA file; only reads sharing a k-mer with it are counted

    The output metadata records K, the count histogram, the normalised
    distribution of (x & 3) over all k-mers seen, and the number of reads.
    """
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['<k>'])

    out = opts['<output>']

    Z = 1024 * 1024 * 32
    if opts['-m'] is not None:
        Z = 1024 * 1024 * int(opts['-m'])

    buf = KmerAccumulator2(K)
    n = 0
    tmps = []
    acgt = [0, 0, 0, 0]
    m = 0

    d = None
    if opts['-D'] is not None:
        d = float(opts['-D'])

        S = 0
        if opts['-S'] is not None:
            S = int(opts['-S'])

        # Memoised outcomes of sub(S, d, x); both sets are reset when large.
        cacheYes = set([])
        cacheNo = set([])

    B = opts['-C']
    if B is not None:
        xs = set([])
        # Close the FASTA file deterministically instead of leaking the
        # handle returned by openFile.
        with openFile(B) as f:
            for (nm, seq) in readFasta(f):
                xs |= set(kmersList(K, seq, True))
        B = xs

    tmpnm = tmpfile('.pmc')
    with casket(tmpnm, 'w') as z:
        nr = 0
        for itm in reads(opts['<input>'],
                         K=K,
                         pairs=False,
                         reads=False,
                         kmers=True,
                         both=True,
                         verbose=verbose):
            xs = itm.kmers[0]
            for x in xs:
                acgt[x & 3] += 1
            if d is not None:
                for x in xs:
                    if x in cacheNo:
                        continue
                    if x not in cacheYes:
                        if not sub(S, d, x):
                            cacheNo.add(x)
                            continue
                        cacheYes.add(x)
                    buf.add(x)
                    m += 1
                    n += 1
                if len(cacheYes) > 1000000:
                    cacheYes = set([])
                if len(cacheNo) > 1000000:
                    cacheNo = set([])
            elif B is not None:
                # Keep the whole read as soon as one k-mer matches -C.
                if any(x in B for x in xs):
                    buf.addList(xs)
                    m += len(xs)
                    n += len(xs)
            else:
                buf.addList(xs)
                m += len(xs)
                n += len(xs)

            nr += 1
            # Check the memory use only every 1024 reads.
            if (nr & 1023) == 0 and buf.mem() >= Z // 2:
                fn = 'tmps-%d' % (len(tmps), )
                tmps.append(fn)
                writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
                buf.clear()
                n = 0

        # If anything was spilled, spill the remainder too so that the
        # merge below sees every k-mer.
        if len(tmps) and len(buf):
            fn = 'tmps-%d' % (len(tmps), )
            tmps.append(fn)
            writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly(), fn)
            buf = []

    with zotk.kmers(out, 'w') as z:
        h = {}
        if len(tmps) == 0:
            for c in buf.countsOnly():
                h[c] = 1 + h.get(c, 0)
            writeKmersAndCounts2(z, buf.kmersOnly(), buf.countsOnly())
        elif len(tmps) == 1:
            # NOTE(review): the histogram h stays empty on this path --
            # confirm whether that is intended.
            with casket(tmpnm, 'r') as z0:
                writeKmersAndCounts(z, readKmersAndCounts(z0, tmps[0]))
        else:
            with casket(tmpnm, 'r') as z0:
                xss = [readKmersAndCounts(z0, t) for t in tmps]
                mergeNinto(K, xss, h, z)
        total = float(sum(acgt))
        if total > 0:
            acgt = [c / total for c in acgt]
        else:
            # No k-mers at all: avoid ZeroDivisionError, which would also
            # skip the metadata below and the removal of the temp file.
            acgt = [0.0 for c in acgt]
        z.meta['K'] = K
        z.meta['kmers'] = 'kmers'
        z.meta['counts'] = 'counts'
        z.meta['hist'] = h
        z.meta['acgt'] = acgt
        z.meta['reads'] = nr
    os.remove(tmpnm)
Beispiel #21
0
    def next(self):
        """Return the next Reads item, advancing through the input files.

        The files in self.files are consumed in consecutive groups of
        self.N; one record is taken from each file of the current group
        per call.  The returned Reads object carries the records in
        .reads when self.reads is set and their k-mers in .kmers when
        self.kmers is set.

        Raises StopIteration when no complete group of files is left.
        """
        self.readNum += 1
        # Periodic progress update, once every time the low bits of the
        # read counter (masked by self.M) wrap to zero.
        if (self.readNum & self.M) == 0 and self.progress is not None:
            self.progress.update(self.M)

        while True:
            # No open parsers: move on to the next group of files.
            if self.currParsers is None:

                if self.currFilesInd is None:
                    self.currFilesInd = 0
                else:
                    self.currFilesInd += self.N

                if self.progress is not None:
                    self.progress.update(self.readNum & self.M)

                # Stop when fewer than self.N files remain.
                if self.currFilesInd + (self.N - 1) >= len(self.files):
                    raise StopIteration

                if self.verbose:
                    # Label a fresh progress bar with the base names of
                    # the files in the current group.
                    pfx = ' & '.join([
                        basename(self.files[i])
                        for i in range(self.currFilesInd, self.currFilesInd +
                                       self.N)
                    ])
                    self.progress = tqdm(unit=' reads', unit_scale=True)
                    self.progress.set_postfix(reading=pfx, refresh=True)

                # One parser per file; the parser type is chosen by isFasta(fn).
                # NOTE(review): the file objects opened here are not
                # explicitly closed anywhere in this method.
                self.currParsers = []
                for i in range(self.currFilesInd, self.currFilesInd + self.N):
                    fn = self.files[i]
                    f = openFile(fn)
                    if isFasta(fn):
                        self.currParsers.append(readFasta(f))
                    else:
                        self.currParsers.append(readFastq(f))

            # Pull one record from every parser of the group.
            self.currReads = []
            try:
                for p in self.currParsers:
                    self.currReads.append(p.next())
            except StopIteration:
                # A parser ran dry.  If an earlier parser of the group
                # still produced a record, the files differ in length.
                if len(self.currReads) != 0:
                    print >> sys.stderr, 'warning: files had unequal length'
                self.currParsers = None
                if self.progress is not None:
                    self.progress.close()
                    self.progress = None
                # Retry with the next group of files.
                continue

            if self.kmers:
                # The k-mer extraction variant is selected by the
                # fwdOnly / both / separate flags, in that order.
                self.currKmers = []
                for rd in self.currReads:
                    if self.fwdOnly:
                        self.currKmers.append(kmersList(self.K, rd[1], False))
                    elif self.both:
                        self.currKmers.append(kmersList(self.K, rd[1], True))
                    else:
                        assert self.separate
                        self.currKmers.append(kmersLists(self.K, rd[1]))

            res = Reads()

            if self.reads:
                res.reads = self.currReads
            if self.kmers:
                res.kmers = self.currKmers
            return res
Beispiel #22
0
def main(argv):
    opts = docopt.docopt(__doc__, argv)

    verbose = opts['-v']

    K = int(opts['-k'])

    D = int(opts['-D'])

    Q = int(opts['-C'])

    V = float(opts['-V'])

    d = "."
    if opts['-g']:
        d = opts['-g']
    sf = SequenceFactory(d)

    if opts['-X']:
        Wcap = int(opts['-w'])
        Wval = int(opts['-W'])

        variants = opts['<variant>']
        if opts['-f']:
            with openFile(opts['-f']) as f:
                variants += f.read().split()

        vx = {}
        for v in variants:
            x = makeHGVS(v)
            if x is None:
                print >> sys.stderr, "unable to parse %s" % (v, )
                continue
            x.setSequenceFactory(sf)
            acc = x.accession()
            if acc not in vx:
                vx[acc] = []
            vx[acc].append(x)

        chk = None
        if opts['-T']:
            chk = {}

        rs = []
        for (acc, vs) in vx.iteritems():
            for v in vs:
                r = makeIndexedVariant(v, K, Wcap, Wval)
                if r is not None:
                    rs.append(r)
                if chk is not None:
                    xs = kmersList(
                        K, ''.join([
                            r['lhsFlank'][-(K - 1):], r['wtSeq'],
                            r['rhsFlank'][:K - 1]
                        ]), True)
                    for x in xs:
                        if x not in chk:
                            chk[x] = set([])
                        chk[x].add(('wt', str(v)))
                    if r['mutSeq'] is None:
                        continue
                    xs = kmersList(
                        K, ''.join([
                            r['lhsFlank'][-(K - 1):], r['mutSeq'],
                            r['rhsFlank'][:K - 1]
                        ]), True)
                    for x in xs:
                        if x not in chk:
                            chk[x] = set([])
                        chk[x].add(('mut', str(v)))

        if chk is not None:
            counts = dict([(x, 0) for x in chk.keys()])
            for acc in refSeq2Hg19.keys():
                if verbose:
                    print >> sys.stderr, 'scanning', acc
                seq = sf[acc]
                for x in kmers(K, seq):
                    if x in counts:
                        counts[x] += 1
            res = {}
            seen = set([])
            for x in counts.keys():
                y = rc(K, x)
                z = min(x, y)
                if z in seen:
                    continue
                seen.add(z)
                c = counts[x] + counts[y]
                for (a, v) in chk[x]:
                    if v not in res:
                        res[v] = {}
                    if a not in res[v]:
                        res[v][a] = {}
                    if c not in res[v][a]:
                        res[v][a][c] = 0
                    res[v][a][c] += 1
            yaml.safe_dump(res, sys.stdout, default_flow_style=False)
            return

        with open(opts['<index>'], 'w') as f:
            yaml.safe_dump(rs, f, default_flow_style=False)

        return

    capt = False
    zipname = None
    if opts['-c']:
        capt = True
        zipname = opts['-c']

    fmt = set([])
    if opts['-F']:
        fmt = set(opts['-F'].split(','))

    if verbose:
        print >> sys.stderr, "loading index."

    with open(opts['<index>']) as f:
        hgvsVars = yaml.load(f, Loader=yaml.FullLoader)

    NV = len(hgvsVars)

    combineStrands = True
    if opts['-s']:
        combineStrands = False

    cap = capture(K, reads=capt, kmers=True, verbose=verbose)

    for n in range(NV):
        itm = hgvsVars[n]
        h = itm['hgvs']
        v = makeHGVS(h)
        itm['var'] = v
        lhs = itm['lhsFlank']
        rhs = itm['rhsFlank']
        wt = itm['wtSeq']
        mut = itm['mutSeq']
        bait = [lhs, wt, rhs]
        if mut is not None:
            bait += ['N']
            bait += [lhs, mut, rhs]
        bait = ''.join(bait)
        n0 = cap.addBait(h, bait)
        assert n0 == n

    if verbose:
        print >> sys.stderr, "done."

    rn = 0
    for itm in reads(opts['<input>'],
                     K=K,
                     paired=True,
                     reads=True,
                     kmers=False,
                     both=True,
                     verbose=verbose):
        rn += 1
        cap.addReadPairAndKmers(itm.reads[0], itm.reads[1])

    if capt:
        cap.saveReads(zipname)

    scorer = Scorer(K)

    globHist = {}

    for n in range(NV):
        mx = cap.capKmers[n]
        for c in mx.itervalues():
            if c < Q:
                continue
            if c not in globHist:
                globHist[c] = 0
            globHist[c] += 1

    with outputFile(opts['-o']) as out:
        hdrShown = False
        for n in range(NV):
            itm = hgvsVars[n]
            v = itm['var']
            h = itm['hgvs']

            mx = cap.capKmers[n]

            nr = cap.capReadCounts[n]

            if 'kmers' in fmt:
                for (x, c) in mx.iteritems():
                    print '%d\t%s\t%d' % (n, render(K, x), c)

            lhsFlank = itm['lhsFlank']
            rhsFlank = itm['rhsFlank']

            alleles = {}
            alleles['wt'] = []
            alleles['mut'] = []

            wtSeq = itm['wtSeq']
            wtZ = len(wtSeq)

            mutSeq = itm['mutSeq']
            mutZ = v.size()

            cs = [c for (x, c) in mx.iteritems() if c >= Q]
            cs.sort()
            nk = len(cs)
            if nk == 0:
                cs = [0]

            q10 = cs[1 * len(cs) // 10]
            q50 = cs[5 * len(cs) // 10]
            q90 = cs[9 * len(cs) // 10]

            af = AlleleFinder(K, D, v, mx, lhsFlank, rhsFlank, wtSeq, mutSeq,
                              wtZ, mutZ)
            finders = []
            if not v.anonymous():
                finders.append(af.definiteAlleles())
            else:
                finders.append(af.bridgingAlleles())

            j = 0
            for (t, a) in cat(finders):
                assert t == 'wt' or t == 'mut'
                alleles[t].append(a)
                j += 1

            wtRes = {}
            wtRes['covMin'] = 0
            wtRes['binom'] = 1.0
            wtRes['ksDist'] = 0.0
            wtRes['hamming'] = 0
            wtRes['path'] = []
            for pthRes in alleles['wt']:
                scorer.score(pthRes, lhsFlank, wtSeq, rhsFlank)
                if isBetter(pthRes, wtRes):
                    wtRes = pthRes

            mutRes = {}
            mutRes['covMin'] = 0
            mutRes['binom'] = 1.0
            mutRes['ksDist'] = 0.0
            mutRes['hamming'] = 0
            mutRes['path'] = []
            for pthRes in alleles['mut']:
                scorer.score(pthRes, lhsFlank, mutSeq, rhsFlank)
                if isBetter(pthRes, mutRes):
                    mutRes = pthRes

            if True:
                wtXs = [mx.get(x, 0) for x in wtRes['path']]
                if len(wtXs) == 0:
                    wtXs = [0]
                wtXs.sort()
                wtCount = sum(wtXs)
                wtLen = len(wtXs)
                wtMean = float(wtCount) / float(wtLen)
                wtMedian = wtXs[wtLen // 2]

                mutXs = [mx.get(x, 0) for x in mutRes['path']]
                if len(mutXs) == 0:
                    mutXs = [0]
                mutXs.sort()
                mutCount = sum(mutXs)
                mutLen = len(mutXs)
                mutMean = float(mutCount) / float(mutLen)
                mutMedian = mutXs[mutLen // 2]

                totX = max([1.0, float(wtMedian + mutMedian), float(q90)])
                wtVaf = wtMedian / totX
                mutVaf = mutMedian / totX

            hdrs = ['n']
            fmts = ['%d']
            outs = [n]

            wtAllele = ((wtRes['covMin'] > Q) and
                        (wtRes['hamming'] < 4)) and (wtVaf > V)
            mutAllele = ((mutRes['covMin'] > Q) and
                         (mutRes['hamming'] < 4)) and (mutVaf > V)
            resV = 1 * wtAllele + 2 * mutAllele
            res = ['null', 'wt', 'mut', 'wt/mut'][resV]

            hdrs += ['res']
            fmts += ['%s']
            outs += [res]

            if 'rds' in fmt:
                hdrs += ['numReads']
                fmts += ['%d']
                outs += [nr]

            hdrs += ['numKmers', 'covQ10', 'covQ50', 'covQ90']
            fmts += ['%d', '%d', '%d', '%d']
            outs += [nk, q10, q50, q90]

            hdrs += ['wtMin', 'mutMin']
            fmts += ['%d', '%d']
            outs += [wtRes['covMin'], mutRes['covMin']]

            hdrs += ['wtHam', 'mutHam']
            fmts += ['%d', '%d']
            outs += [wtRes['hamming'], mutRes['hamming']]

            if 'ks' in fmt:
                hdrs += ['wtD', 'mutD']
                fmts += ['%g', '%g']
                outs += [wtRes['ksDist'], mutRes['ksDist']]

            if 'binom' in fmt:
                hdrs += ['wtQ', 'mutQ']
                fmts += ['%g', '%g']
                outs += [wtRes['binom'], mutRes['binom']]

            if 'vaf' in fmt:
                hdrs += ['wtVaf', 'mutVaf']
                fmts += ['%g', '%g']
                outs += [wtVaf, mutVaf]

            hdrs += ['hgvs']
            fmts += ['%s']
            outs += [h]

            if not hdrShown:
                hdrShown = True
                print >> out, '\t'.join(hdrs)
            print >> out, '\t'.join(fmts) % tuple(outs)
            out.flush()
Beispiel #23
0
def _appendFastq(path, rds):
    """Append the four fields of each read in rds to path, one per line."""
    with open(path, 'a') as f:
        for rd in rds:
            f.write('%s\n%s\n%s\n%s\n' % (rd[0], rd[1], rd[2], rd[3]))


def _flushPairCache(paths, cached):
    """Write the cached left/right reads of one bait to its two temp files and empty the cache."""
    _appendFastq(paths[0], cached[0])
    _appendFastq(paths[1], cached[1])
    cached[0] = []
    cached[1] = []


def main(argv):
    """Sort read pairs into per-bait FASTQ files inside a zip archive.

    Every 25-mer of each sequence in <baits> is indexed to the baits that
    contain it.  A read pair is assigned to every bait it shares a k-mer
    with, unless one of its k-mers occurs in the -U sequences, in which
    case the pair is dropped.  Captured pairs are buffered (1024 per
    bait), spooled to temporary FASTQ files and finally stored in <output>
    under "<bait name>/<input file name>".  A histogram of the number of
    baits hit per pair is printed to stdout.

    Raises NotImplementedError when -p (paired input) is not given.
    """
    opts = docopt.docopt(__doc__, argv)

    K = 25

    # idx: k-mer -> sorted list of bait indices containing it.
    nms = []
    idx = {}
    # Use a with-block so the baits file is closed (it used to be leaked).
    with openFile(opts['<baits>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(nms)
            nms.append(nm)
            for x in kmersList(K, seq, True):
                if x not in idx:
                    idx[x] = set([])
                idx[x].add(n)

    for x in idx.keys():
        idx[x] = sorted(idx[x])

    # anti: k-mers that disqualify a read pair.
    anti = set([])
    if opts['-U']:
        with openFile(opts['-U']) as f:
            for (nm, seq) in readFasta(f):
                for x in kmersList(K, seq, True):
                    anti.add(x)

    if not opts['-p']:
        # A string cannot be raised; use a real exception type.
        raise NotImplementedError("not implemented")

    hist = {}
    firstPair = True
    for (fn1, fn2) in pairs(opts['<input>']):
        tmps = [(tmpfile('_1.fastq'), tmpfile('_2.fastq'))
                for i in range(len(nms))]
        cache = [[[], []] for i in range(len(nms))]
        counts = [0 for i in range(len(nms))]
        with openFile(fn1) as f1, openFile(fn2) as f2:
            for fq1, fq2 in both(readFastq(f1), readFastq(f2)):
                hits = set([])
                pushup = False
                for x in kmersList(K, fq1[1]):
                    if x in anti:
                        pushup = True
                        break
                    for i in idx.get(x, []):
                        hits.add(i)
                for x in kmersList(K, fq2[1]):
                    if x in anti:
                        pushup = True
                        break
                    for i in idx.get(x, []):
                        hits.add(i)
                if pushup:
                    continue
                n = len(hits)
                hist[n] = 1 + hist.get(n, 0)
                for i in hits:
                    counts[i] += 1
                    cache[i][0].append(fq1)
                    cache[i][1].append(fq2)
                    if len(cache[i][0]) >= 1024:
                        _flushPairCache(tmps[i], cache[i])

        # Spool whatever is still buffered.
        for i in range(len(cache)):
            if len(cache[i][0]) > 0:
                _flushPairCache(tmps[i], cache[i])

        # Create the archive for the first input pair and append for the
        # following ones; reopening with 'w' every time discarded the
        # entries of all earlier pairs.
        zmode = 'w' if firstPair else 'a'
        firstPair = False
        with zipfile.ZipFile(opts['<output>'], zmode,
                             zipfile.ZIP_DEFLATED) as z:
            for i in range(len(nms)):
                if counts[i] > 0:
                    # Whitespace in the bait name becomes a path separator.
                    pth = '/'.join(nms[i].split())
                    z.write(tmps[i][0], pth + '/' + fn1)
                    os.remove(tmps[i][0])
                    z.write(tmps[i][1], pth + '/' + fn2)
                    os.remove(tmps[i][1])

    for (n, f) in sorted(hist.items()):
        print('%d\t%d' % (n, f))
Beispiel #24
0
def main(argv):
    """Command-line entry point: look for duplications in reference sequences.

    Parses ``argv`` with docopt against the module docstring, indexes every
    sequence read from ``<sequences>``, accumulates k-mer counts from the
    reads of ``<input>`` that share at least one k-mer with a sequence, and
    then reports per-sequence results on stdout (warnings go to stderr).

    Options read by this function: ``-k`` (k-mer length, must be even),
    ``-m`` (minimum coverage), ``-v`` (verbose flag passed to ``reads``),
    ``-a``, ``-s`` and ``-A`` (see the inline comments where they are used).

    NOTE(review): the per-sequence loop contains an unconditional
    ``continue`` after the path-tracing output, so the duplication-calling
    code that follows it in the loop body is currently unreachable.

    Returns None (also when K is odd, after printing an error to stderr).
    """
    opts = docopt.docopt(__doc__, argv)

    K = int(opts['-k'])
    # The k-mer is split into two equal halves below, hence the parity check.
    if (K & 1) != 0:
        print >> sys.stderr, "K must be even."
        return

    minCov = int(opts['-m'])

    verbose = opts['-v']

    # J is the half-k-mer length; S is the shift that exposes the upper half
    # of an integer k-mer and Mj masks its lower 2*J bits.
    # presumably k-mers are packed 2 bits per base -- TODO confirm against kmersList
    J = K // 2
    S = 2*(K - J)
    Mj = (1 << (2*J)) - 1

    names = []      # sequence names, indexed by sequence number n
    seqs = {}       # name -> sequence
    bait = {}       # k-mer -> set of sequence numbers containing it
    wtFst = []      # per sequence: upper half -> set of lower halves
    wtLst = []      # per sequence: lower half -> set of upper halves
    posIdx = []     # per sequence: J-mer -> list of positions
    rds = []        # per sequence: reads that hit one of its bait k-mers
    with openFile(opts['<sequences>']) as f:
        for (nm, seq) in readFasta(f):
            n = len(names)
            names.append(nm)
            seqs[nm] = seq
            wf = {}
            wl = {}
            for x in kmersList(K, seq, False):
                if x not in bait:
                    bait[x] = set([])
                bait[x].add(n)

                # Split the k-mer into its upper (y0) and lower (y1) halves.
                y0 = x >> S
                y1 = x & Mj
                #print '- %s\t%s\t%s' % (render(K, x), render(J, y0), render(J, y1))

                if y0 not in wf:
                    wf[y0] = set([])
                wf[y0].add(y1)

                if y1 not in wl:
                    wl[y1] = set([])
                wl[y1].add(y0)

            wtFst.append(wf)
            wtLst.append(wl)
            
            # Index every J-mer of the sequence by the positions reported by
            # kmersWithPosList.
            px = {}
            for (x,p) in kmersWithPosList(J, seq, False):
                if x not in px:
                    px[x] = []
                px[x].append(p)
            posIdx.append(px)

            # Run the duplication finder with the sequence's own half-k-mer
            # maps on both sides; every located hit is reported as a warning.
            for (a, b, c, d) in findDup(wtFst[n], wtLst[n], wtFst[n], wtLst[n]):
                pps = positions(posIdx[n], J, a, b, c, d)
                if pps is None:
                    continue
                for pp in pps:
                    # '<<' binds tighter than '|', so these are (a << S) | b etc.
                    ab = a << S | b
                    cb = c << S | b
                    cd = c << S | d
                    dd = pp[2] - pp[0]
                    # NOTE(review): "dumplication" in the message looks like a
                    # typo for "duplication"; left as-is since it is runtime text.
                    print >> sys.stderr, 'warning: phantom dumplication: %s-%s-%s (%d)' % (render(K, ab), render(K, cb), render(K, cd), dd)

            rds.append([])

    N = len(names)

    # NOTE(review): L is assigned the length of each read but is not used
    # anywhere else in the visible part of this function.
    L = None
    # X[n] counts k-mers from all reads that hit sequence n.
    X = [{} for n in range(N)]
    for itm in reads(opts['<input>'], K=K, reads=True, kmers=True, both=True, verbose=verbose):
        rd = itm.reads[0]
        L = len(rd)

        xs = itm.kmers[0]
        # Collect every sequence sharing at least one k-mer with this read.
        hits = set([])
        for x in xs:
            if x in bait:
                hits |= bait[x]
        # All of the read's k-mers (not only the bait ones) are counted for
        # each sequence it hit.
        for n in hits:
            for x in xs:
                if x not in X[n]:
                    X[n][x] = 0
                X[n][x] += 1
            rds[n].append(rd)

    hdrShown = False
    vn = 0
    for n in range(N):
        # Keep only k-mers seen at least 10 times.
        # NOTE(review): this threshold is hard-coded and independent of minCov.
        xs = {}
        for (x,c) in X[n].iteritems():
            if c >= 10:
                xs[x] = c

        seq = seqs[names[n]]

        rngs = []
        st = None
        en = None
        inside = False
        # Print one character per reference k-mer: '.' if it passed the count
        # filter above, 'X' otherwise.
        xx = []
        for x in kmersList(K, seq, False):
            if x in xs:
                xx.append('.')
            else:
                xx.append('X')
        print ''.join(xx)
        # Find runs of missing reference k-mers. st is the last present k-mer
        # before a run and en the first present k-mer after it; st stays None
        # when the sequence starts with a missing k-mer.
        for x in kmersList(K, seq, False):
            if not inside:
                if x in xs:
                    st = x
                else:
                    inside = True
            else:
                if x in xs:
                    en = x
                    rngs.append((st, en))
                    st = x
                    en = None
                    inside = False
        # A run that reaches the end of the sequence is recorded with en None.
        if inside:
            rngs.append((st, en))

        pthr = Pather(K, xs)

        # Only runs bounded on both sides are traced; the endpoints and each
        # path returned by trace are printed.
        for (x,y) in rngs:
            if x is None or y is None:
                continue
            print render(K, x), render(K, y)
            for p in pthr.trace(x, y, 100):
                print renderPath(K, p)
        # NOTE(review): unconditional continue -- the remainder of this loop
        # body never executes. Confirm whether this is intentional.
        continue

        # Half-k-mer maps built from the filtered read k-mers (lists, unlike
        # the set-valued maps built for the reference sequences).
        fst = {}
        lst = {}
        for (x,c) in xs.iteritems():
            #if c < 5:
            #    continue
            y0 = x >> S
            y1 = x & Mj

            if y0 not in fst:
                fst[y0] = []
            fst[y0].append(y1)

            if y1 not in lst:
                lst[y1] = []
            lst[y1].append(y0)

        #for (a, b, c, d) in findDupDeNovo(fst, lst):
        for (a, b, c, d) in findDup(wtFst[n], wtLst[n], fst, lst):
            #continue
            pps = positions(posIdx[n], J, a, b, c, d)
            if pps is None:
                continue
            for pp in pps:
                ab = a << S | b
                cb = c << S | b
                cd = c << S | d
                #print [(render(J, w), p) for (w,p) in zip([a, b, c, d], pps)]

                dd = pp[2] - pp[0]

                # Unless -a is given, keep only lengths that are a multiple of 3.
                if not opts['-a'] and dd % 3 != 0:
                    continue

                # With -s, both interpolated paths must exist and agree once
                # J items are trimmed from each end.
                if opts['-s']:
                    fstPath = interpolate(K, xs, ab, cb, dd+1)
                    sndPath = interpolate(K, xs, cb, cd, dd+1)

                    if fstPath is None:
                        continue
                    if sndPath is None:
                        continue

                    if fstPath[J:-J] != sndPath[J:-J]:
                        continue

                pa = pp[0]
                pb = pp[1]
                pc = pp[2]
                pd = pp[3]

                # Coverage of the three k-mers; absent k-mers count as 0.
                cab = xs.get(ab, 0)
                ccb = xs.get(cb, 0)
                ccd = xs.get(cd, 0)

                # All three k-mers must reach the -m coverage threshold.
                if cab < minCov:
                    continue
                if ccb < minCov:
                    continue
                if ccd < minCov:
                    continue

                # w (reported in the 'vaf' column) is the cb coverage relative
                # to the mean of the ab and cd coverages.
                m = (cab + ccd) / 2.0
                # Assume the true std dev is 10% of the mean
                w = ccb / m

                hgvs = '%s:c.%d_%ddup' % (names[n], pb, pd - 1)
                # NOTE(review): v is not used in the visible part of this function.
                v = Duplication(names[n], pb, pd-1, seqs)
                if opts['-A']:
                    showAnchoredReads(K, {ab:'AB', cb:'CB', cd:'CD'}, rds[n])

                vn += 1

                # Build the header, format and value lists column by column so
                # the three stay aligned.
                hdrs = ['n']
                fmts = ['%d']
                outs = [vn]

                hdrs += ['left', 'leftCov']
                fmts += ['%s','%d']
                outs += [render(K, ab), cab]

                hdrs += ['mid', 'midCov']
                fmts += ['%s','%d']
                outs += [render(K, cb), ccb]

                hdrs += ['right', 'rightCov']
                fmts += ['%s','%d']
                outs += [render(K, cd), ccd]

                hdrs += ['len']
                fmts += ['%d']
                outs += [dd]

                hdrs += ['vaf']
                fmts += ['%g']
                outs += [w]

                hdrs += ['hgvs']
                fmts += ['%s']
                outs += [hgvs]

                # The header line is printed once, before the first row.
                if not hdrShown:
                    hdrShown = True
                    print '\t'.join(hdrs)
                print '\t'.join(fmts) % tuple(outs)