コード例 #1
0
ファイル: evaluation.py プロジェクト: JohnReid/JEMIMA
def _countWmers(index, Ws, countunique):
    """Count the W-mers of every width in ``Ws`` over ``index``.

    Returns a tuple ``(rootcounts, counts, childfreqs)``:

    * ``rootcounts`` is whatever ``wmers.countWmersMulti`` returns,
    * ``counts`` is the unsigned-integer array handed to that call
      (presumably filled in place by it -- confirm against ``wmers``),
    * ``childfreqs`` is the array handed to ``wmers.countChildren``
      after being passed through ``jem.normalisearray``.
    """
    numrows = 2 * len(index)
    numwidths = len(Ws)

    # One row per 2 * len(index) entries, one column per requested width.
    nodecounts = npy.zeros((numrows, numwidths), dtype=npy.uint)
    totals = wmers.countWmersMulti(
        index.topdownhistory(), Ws, nodecounts, countunique=countunique)

    # Count how many W-mers are represented by the children of each node;
    # the last axis has jem.SIGMA entries.
    rawchildfreqs = npy.zeros((numrows, numwidths, jem.SIGMA))
    wmers.countChildren(
        index.topdownhistory(), Ws, nodecounts, rawchildfreqs)

    return totals, nodecounts, jem.normalisearray(rawchildfreqs)
コード例 #2
0
def _countWmers(index, Ws, countunique):
    """Tally W-mer counts and normalised child frequencies for ``index``.

    ``Ws`` is the collection of widths to count and ``countunique`` is
    forwarded unchanged to ``wmers.countWmersMulti``.

    The result is the triple ``(rootcounts, counts, childfreqs)`` where
    ``rootcounts`` is the return value of ``wmers.countWmersMulti``,
    ``counts`` is the array passed to it and ``childfreqs`` is the output
    of ``jem.normalisearray`` applied to the array passed to
    ``wmers.countChildren``.
    """
    countshape = (2 * len(index), len(Ws))
    counts = npy.zeros(countshape, dtype=npy.uint)
    rootcounts = wmers.countWmersMulti(
        index.topdownhistory(), Ws, counts, countunique=countunique)

    # Count how many W-mers are represented by the children of each node.
    # The frequency array extends the count shape by an axis of jem.SIGMA.
    unnormalised = npy.zeros(countshape + (jem.SIGMA,))
    wmers.countChildren(index.topdownhistory(), Ws, counts, unnormalised)
    childfreqs = jem.normalisearray(unnormalised)

    return rootcounts, counts, childfreqs
コード例 #3
0
ファイル: evaluation.py プロジェクト: JohnReid/JEMIMA
def handleseed(seedidx, seqsdata, Widx, seed, args):
    """Test the methods on one seed.

    Builds a starting PWM from ``seed`` and then, for at most
    ``args.maxiters`` iterations, runs ``dotrueiteration`` to obtain the
    reference ("true") update and every method named in ``args.methods``
    to obtain an estimate of it, recording statistics about both. After
    each iteration the PWM is replaced by the true update. Iteration stops
    early once the PWM distance per base between the start and the true
    update falls below ``args.stopthreshold``.

    Args:
        seedidx: Identifier of the seed; stored in every statistics table
            and used in the logo names.
        seqsdata: Sequence data. The attributes ``fasta``, ``seqs``,
            ``numoccs``, ``numunique`` and ``bgfreqs`` are read here; the
            object is also handed to ``dotrueiteration`` and the methods.
        Widx: Index selecting the width from ``args.Ws`` and the matching
            entries of ``seqsdata.numoccs`` / ``seqsdata.numunique``.
        seed: The seed W-mer given to ``jem.pwmfromWmer``.
        args: Options. ``Ws``, ``pseudocount``, ``writelogos``,
            ``maxiters``, ``methods`` and ``stopthreshold`` are read here;
            the object is also passed on to each method.

    Returns:
        A tuple ``(seedstats, iterstats, methodstats)`` of
        ``collections.defaultdict(list)`` objects mapping column names to
        lists of values, with one entry per seed, per iteration and per
        (iteration, method) pair respectively.
    """
    seedstats = collections.defaultdict(list)
    iterstats = collections.defaultdict(list)
    methodstats = collections.defaultdict(list)
    strippedfasta = stripfastaname(seqsdata.fasta)
    logger.info("Stripped FASTA: %s", strippedfasta)
    W = args.Ws[Widx]
    logger.info('Seed: %s; W=%2d', seed, W)
    numseqs = len(seqsdata.seqs)
    numoccs = seqsdata.numoccs[Widx]
    numunique = seqsdata.numunique[Widx]
    meannumsamples = npy.log10(numunique) * 600
    # Floor division keeps the lower bound an integer regardless of the
    # division semantics in force (numseqs is an int, so the value is the
    # same as with Python 2's "/").
    numseedsites = rdm.randint(max(1, numseqs // 10), numseqs * 2)
    lambda_ = numseqs / float(numoccs)
    pwm = jem.pwmfromWmer(seed, numseedsites, args.pseudocount)
    if args.writelogos:
        jem.logo(pwm, 'seed-%03d' % seedidx)
    seedstats['seedidx'].append(seedidx)
    seedstats['fasta'].append(strippedfasta)
    seedstats['seed'].append(str(seed))
    seedstats['W'].append(W)
    seedstats['numseedsites'].append(numseedsites)

    for iteration in xrange(args.maxiters):
        # The number of samples is drawn afresh for every iteration from a
        # log-normal distribution whose log-mean is log(meannumsamples);
        # the +1 guarantees at least one sample.
        numsamples = \
            int(rdm.lognormal(mean=npy.log(meannumsamples), sigma=.5)) + 1
        pwmIC = jem.informationcontent(pwm, seqsdata.bgfreqs)

        # Reference update, timed so it can be compared with the methods.
        start = time.time()
        summer = dotrueiteration(seqsdata, W, pwm, lambda_)
        truetime = time.time() - start
        logger.debug('Sums:\n%s', summer.sums)
        Znsumtrue = summer.sums[0].sum()
        pwmtrue = jem.normalisearray(summer.sums)
        pwmtrueIC = jem.informationcontent(pwmtrue, seqsdata.bgfreqs)
        lambdatrue = Znsumtrue / float(numoccs)
        if args.writelogos:
            jem.logo(
                pwmtrue,
                'seed-%03d-%03d-true' % (seedidx, iteration))

        # Computed once and used both for the log message and for the
        # stopping test at the end of the iteration.
        distperbase = npy.linalg.norm(pwmtrue - pwm, ord=1) / W
        # Use the module logger, like every other message in this function.
        logger.info(
            'Iteration: %3d, IC/base=%.2f bits, PWM distance/base=%.4f',
            iteration, pwmtrueIC / W, distperbase)
        iterstats['seedidx'].append(seedidx)
        iterstats['iteration'].append(iteration)
        iterstats['truetime'].append(truetime)
        iterstats['numsamples'].append(numsamples)
        iterstats['ICstart'].append(pwmIC)
        iterstats['ICtrue'].append(pwmtrueIC)
        iterstats['Znsumtrue'].append(Znsumtrue)
        iterstats['lambdastart'].append(lambda_)
        iterstats['lambdatrue'].append(lambdatrue)

        for methodname in args.methods:
            start = time.time()
            domainsize, iscb = METHODS[methodname](
                seqsdata, pwm, lambda_, Widx, numsamples, args)
            duration = time.time() - start
            pwmestimate = jem.normalisearray(iscb.cb.sums)
            # Scale the sampled sum by domainsize / numsamples.
            Znsumestimate = iscb.cb.sums[0].sum() * \
                float(domainsize) / numsamples
            # Difference between reference and estimate, shared by all of
            # the distance measures below.
            pwmdiff = pwmtrue - pwmestimate
            methodstats['seedidx'].append(seedidx)
            methodstats['iteration'].append(iteration)
            methodstats['method'].append(methodname)
            methodstats['methodtime'].append(duration)
            methodstats['ICestimate'].append(
                jem.informationcontent(pwmestimate, seqsdata.bgfreqs))
            methodstats['Znsumestimate'].append(Znsumestimate)
            methodstats['var'].append(iscb.var())
            methodstats['lambdaestimate'].append(
                Znsumestimate / float(numoccs))
            # Various measures of how different the estimated PWM is from the
            # true PWM.
            methodstats['frobeniusdist'].append(
                npy.linalg.norm(pwmdiff, ord='fro'))
            methodstats['maxdist'].append(npy.abs(pwmdiff).max())
            methodstats['absdist'].append(npy.abs(pwmdiff).sum())
            methodstats['euclideandist'].append(
                npy.linalg.norm(pwmdiff.flatten(), ord=2))
            methodstats['KLtrueestimate'].append(
                jem.pwmKL(pwmtrue, pwmestimate))
            methodstats['KLestimatetrue'].append(
                jem.pwmKL(pwmestimate, pwmtrue))
            if args.writelogos:
                jem.logo(
                    pwmestimate,
                    'seed-%03d-%03d-%s' % (seedidx, iteration, methodname))

        # The next iteration starts from the true update. NOTE: lambda_ is
        # not updated to lambdatrue; it keeps its initial value throughout.
        pwm = pwmtrue

        if distperbase < args.stopthreshold:
            break
    return seedstats, iterstats, methodstats
コード例 #4
0
def handleseed(seedidx, seqsdata, Widx, seed, args):
    """Test the methods on one seed.

    Builds a starting PWM from ``seed`` and then, for at most
    ``args.maxiters`` iterations, runs ``dotrueiteration`` to obtain the
    reference ("true") update and every method named in ``args.methods``
    to obtain an estimate of it, recording statistics about both. After
    each iteration the PWM is replaced by the true update. Iteration stops
    early once the PWM distance per base between the start and the true
    update falls below ``args.stopthreshold``.

    Args:
        seedidx: Identifier of the seed; stored in every statistics table
            and used in the logo names.
        seqsdata: Sequence data. The attributes ``fasta``, ``seqs``,
            ``numoccs``, ``numunique`` and ``bgfreqs`` are read here; the
            object is also handed to ``dotrueiteration`` and the methods.
        Widx: Index selecting the width from ``args.Ws`` and the matching
            entries of ``seqsdata.numoccs`` / ``seqsdata.numunique``.
        seed: The seed W-mer given to ``jem.pwmfromWmer``.
        args: Options. ``Ws``, ``pseudocount``, ``writelogos``,
            ``maxiters``, ``methods`` and ``stopthreshold`` are read here;
            the object is also passed on to each method.

    Returns:
        A tuple ``(seedstats, iterstats, methodstats)`` of
        ``collections.defaultdict(list)`` objects mapping column names to
        lists of values, with one entry per seed, per iteration and per
        (iteration, method) pair respectively.
    """
    seedstats = collections.defaultdict(list)
    iterstats = collections.defaultdict(list)
    methodstats = collections.defaultdict(list)
    strippedfasta = stripfastaname(seqsdata.fasta)
    logger.info("Stripped FASTA: %s", strippedfasta)
    W = args.Ws[Widx]
    logger.info('Seed: %s; W=%2d', seed, W)
    numseqs = len(seqsdata.seqs)
    numoccs = seqsdata.numoccs[Widx]
    numunique = seqsdata.numunique[Widx]
    meannumsamples = npy.log10(numunique) * 600
    # Floor division keeps the lower bound an integer regardless of the
    # division semantics in force (numseqs is an int, so the value is the
    # same as with Python 2's "/").
    numseedsites = rdm.randint(max(1, numseqs // 10), numseqs * 2)
    lambda_ = numseqs / float(numoccs)
    pwm = jem.pwmfromWmer(seed, numseedsites, args.pseudocount)
    if args.writelogos:
        jem.logo(pwm, 'seed-%03d' % seedidx)
    seedstats['seedidx'].append(seedidx)
    seedstats['fasta'].append(strippedfasta)
    seedstats['seed'].append(str(seed))
    seedstats['W'].append(W)
    seedstats['numseedsites'].append(numseedsites)

    for iteration in xrange(args.maxiters):
        # The number of samples is drawn afresh for every iteration from a
        # log-normal distribution whose log-mean is log(meannumsamples);
        # the +1 guarantees at least one sample.
        numsamples = \
            int(rdm.lognormal(mean=npy.log(meannumsamples), sigma=.5)) + 1
        pwmIC = jem.informationcontent(pwm, seqsdata.bgfreqs)

        # Reference update, timed so it can be compared with the methods.
        start = time.time()
        summer = dotrueiteration(seqsdata, W, pwm, lambda_)
        truetime = time.time() - start
        logger.debug('Sums:\n%s', summer.sums)
        Znsumtrue = summer.sums[0].sum()
        pwmtrue = jem.normalisearray(summer.sums)
        pwmtrueIC = jem.informationcontent(pwmtrue, seqsdata.bgfreqs)
        lambdatrue = Znsumtrue / float(numoccs)
        if args.writelogos:
            jem.logo(pwmtrue, 'seed-%03d-%03d-true' % (seedidx, iteration))

        # Computed once and used both for the log message and for the
        # stopping test at the end of the iteration.
        distperbase = npy.linalg.norm(pwmtrue - pwm, ord=1) / W
        # Use the module logger, like every other message in this function.
        logger.info(
            'Iteration: %3d, IC/base=%.2f bits, PWM distance/base=%.4f',
            iteration, pwmtrueIC / W, distperbase)
        iterstats['seedidx'].append(seedidx)
        iterstats['iteration'].append(iteration)
        iterstats['truetime'].append(truetime)
        iterstats['numsamples'].append(numsamples)
        iterstats['ICstart'].append(pwmIC)
        iterstats['ICtrue'].append(pwmtrueIC)
        iterstats['Znsumtrue'].append(Znsumtrue)
        iterstats['lambdastart'].append(lambda_)
        iterstats['lambdatrue'].append(lambdatrue)

        for methodname in args.methods:
            start = time.time()
            domainsize, iscb = METHODS[methodname](seqsdata, pwm, lambda_,
                                                   Widx, numsamples, args)
            duration = time.time() - start
            pwmestimate = jem.normalisearray(iscb.cb.sums)
            # Scale the sampled sum by domainsize / numsamples.
            Znsumestimate = iscb.cb.sums[0].sum() * \
                float(domainsize) / numsamples
            # Difference between reference and estimate, shared by all of
            # the distance measures below.
            pwmdiff = pwmtrue - pwmestimate
            methodstats['seedidx'].append(seedidx)
            methodstats['iteration'].append(iteration)
            methodstats['method'].append(methodname)
            methodstats['methodtime'].append(duration)
            methodstats['ICestimate'].append(
                jem.informationcontent(pwmestimate, seqsdata.bgfreqs))
            methodstats['Znsumestimate'].append(Znsumestimate)
            methodstats['var'].append(iscb.var())
            methodstats['lambdaestimate'].append(Znsumestimate /
                                                 float(numoccs))
            # Various measures of how different the estimated PWM is from the
            # true PWM.
            methodstats['frobeniusdist'].append(
                npy.linalg.norm(pwmdiff, ord='fro'))
            methodstats['maxdist'].append(npy.abs(pwmdiff).max())
            methodstats['absdist'].append(npy.abs(pwmdiff).sum())
            methodstats['euclideandist'].append(
                npy.linalg.norm(pwmdiff.flatten(), ord=2))
            methodstats['KLtrueestimate'].append(
                jem.pwmKL(pwmtrue, pwmestimate))
            methodstats['KLestimatetrue'].append(
                jem.pwmKL(pwmestimate, pwmtrue))
            if args.writelogos:
                jem.logo(
                    pwmestimate,
                    'seed-%03d-%03d-%s' % (seedidx, iteration, methodname))

        # The next iteration starts from the true update. NOTE: lambda_ is
        # not updated to lambdatrue; it keeps its initial value throughout.
        pwm = pwmtrue

        if distperbase < args.stopthreshold:
            break
    return seedstats, iterstats, methodstats
コード例 #5
0
# Script fragment: load sequences, index them, count W-mers and draw one
# seed W-mer by importance sampling to build the starting PWM.
# NOTE(review): W and numseedsites are used below but are not defined in
# this fragment -- they must be set earlier in the file.

# Alternative tiny hand-written input, kept for reference:
# seqs = seqan.StringDNASet(('AAAAAAAA', 'ACGTACGT', 'TATATATA'))
numbases, seqs, ids = seqan.readFastaDNA('T00759-small.fa')
logging.info('Loaded %d bases from %d sequences', numbases, len(seqs))
# Number of sequences divided by the number of bases.
lambda_ = len(seqs) / float(numbases)

logging.info('Building index')
index = seqan.IndexStringDNASetESA(seqs)

logging.info('Counting W-mers')
# Only a single width is counted, so the count arrays have one column.
Ws = [W]
Wmercounts = npy.zeros((2*len(index), len(Ws)), dtype=npy.uint)
# countWmersMulti's result is indexed per width; [0] selects the entry for W.
numWmers = wmers.countWmersMulti(index.topdownhistory(), Ws, Wmercounts)[0]
logging.info('Got %d %d-mers', numWmers, W)
# Child frequencies: same leading shape as the counts plus an axis of
# jem.SIGMA entries; normalised after being filled.
childWmerfreqs = npy.zeros((2*len(index), len(Ws), jem.SIGMA))
wmers.countWmerChildren(index.topdownhistory(), W, Wmercounts, childWmerfreqs)
childWmerfreqs = jem.normalisearray(childWmerfreqs)
sumestimator = jis.makesumestimator(numWmers)

logging.info('Importance sampling using background model to find one seed')
# Fixed seed so that the sampled seed W-mer is the same on every run
# (presumably rdm is a random-number module -- confirm against the imports).
rdm.seed(2)
# Draw a single sample; the ISCbMemo callback keeps the sampled W-mers in Xns.
memocb = jis.importancesample(
    index, W, childWmerfreqs[:, 0], jis.UniformImportanceWeight(),
    numsamples=1, callback=jis.ISCbMemo())
# Build the starting PWM from the one sampled W-mer.
pwm = jem.pwmfromWmer(memocb.Xns[0], numseedsites, 1.)
jem.logo(pwm, 'seed')

# Sample count and empty accumulators; presumably filled by code that
# follows this fragment (not visible here).
numsamples = 3000
distsbs = []
distsbg = []
truesums = []
varratios = []