def _countWmers(index, Ws, countunique):
    """Count W-mers and per-child frequencies over ``index``.

    Allocates the count arrays, hands them to the ``wmers`` counting
    routines together with a fresh top-down traversal of ``index``, and
    normalises the child counts.

    Args:
        index: Index object providing ``len()`` and ``topdownhistory()``.
        Ws: Sequence of W-mer widths to count.
        countunique: Forwarded unchanged to ``wmers.countWmersMulti``.

    Returns:
        Tuple ``(rootcounts, counts, childfreqs)``: the value returned by
        ``wmers.countWmersMulti``, the ``(2 * len(index), len(Ws))``
        unsigned count array, and the
        ``(2 * len(index), len(Ws), jem.SIGMA)`` child array after
        ``jem.normalisearray``.
    """
    numrows = 2 * len(index)
    numwidths = len(Ws)

    # Presumably populated in place by the counting routine.
    counts = npy.zeros((numrows, numwidths), dtype=npy.uint)
    rootcounts = wmers.countWmersMulti(
        index.topdownhistory(), Ws, counts, countunique=countunique)

    # Count how many W-mers are represented by the children of each node
    childcounts = npy.zeros((numrows, numwidths, jem.SIGMA))
    wmers.countChildren(index.topdownhistory(), Ws, counts, childcounts)

    return rootcounts, counts, jem.normalisearray(childcounts)
def _countWmers(index, Ws, countunique):
    """Count W-mers and per-child frequencies over ``index``.

    Allocates the count arrays, hands them to the ``wmers`` counting
    routines together with a fresh top-down traversal of ``index``, and
    normalises the child counts.

    Args:
        index: Index object providing ``len()`` and ``topdownhistory()``.
        Ws: Sequence of W-mer widths to count.
        countunique: Forwarded unchanged to ``wmers.countWmersMulti``.

    Returns:
        Tuple ``(rootcounts, counts, childfreqs)``: the value returned by
        ``wmers.countWmersMulti``, the ``(2 * len(index), len(Ws))``
        unsigned count array, and the
        ``(2 * len(index), len(Ws), jem.SIGMA)`` child array after
        ``jem.normalisearray``.
    """
    numrows = 2 * len(index)
    numwidths = len(Ws)

    # Presumably populated in place by the counting routine.
    counts = npy.zeros((numrows, numwidths), dtype=npy.uint)
    rootcounts = wmers.countWmersMulti(
        index.topdownhistory(), Ws, counts, countunique=countunique)

    # Count how many W-mers are represented by the children of each node
    childcounts = npy.zeros((numrows, numwidths, jem.SIGMA))
    wmers.countChildren(index.topdownhistory(), Ws, counts, childcounts)

    return rootcounts, counts, jem.normalisearray(childcounts)
def handleseed(seedidx, seqsdata, Widx, seed, args):
    """Test the methods on one seed.

    Builds a starting PWM from ``seed`` and then iterates: each iteration
    runs the exact update (``dotrueiteration``) and, from the same starting
    PWM, every estimator named in ``args.methods`` (looked up in
    ``METHODS``), recording timings and distances between the estimated and
    the exact PWM. The exact PWM becomes the starting PWM of the next
    iteration. Iteration stops after ``args.maxiters`` rounds or as soon as
    the per-base distance between consecutive PWMs drops below
    ``args.stopthreshold``.

    Args:
        seedidx: Identifier of the seed, used in the statistics and in logo
            file names.
        seqsdata: Sequence data; the attributes ``fasta``, ``seqs``,
            ``numoccs``, ``numunique`` and ``bgfreqs`` are read here.
        Widx: Index into ``args.Ws`` (and into the per-width arrays of
            ``seqsdata``) selecting the motif width.
        seed: The seed W-mer the starting PWM is built from.
        args: Options; ``Ws``, ``pseudocount``, ``writelogos``, ``maxiters``,
            ``methods`` and ``stopthreshold`` are read here, and the object
            is also passed on to each method.

    Returns:
        Tuple ``(seedstats, iterstats, methodstats)`` of
        ``collections.defaultdict(list)`` objects holding one entry per
        seed, per iteration, and per (iteration, method) respectively.
    """
    seedstats = collections.defaultdict(list)
    iterstats = collections.defaultdict(list)
    methodstats = collections.defaultdict(list)

    strippedfasta = stripfastaname(seqsdata.fasta)
    logger.info("Stripped FASTA: %s", strippedfasta)
    W = args.Ws[Widx]
    logger.info('Seed: %s; W=%2d', seed, W)

    numseqs = len(seqsdata.seqs)
    # Only ever used as a float divisor, so convert once.
    numoccs = float(seqsdata.numoccs[Widx])
    numunique = seqsdata.numunique[Widx]
    meannumsamples = npy.log10(numunique) * 600
    # Loop-invariant parameter of the log-normal draw made every iteration.
    logmeannumsamples = npy.log(meannumsamples)
    numseedsites = rdm.randint(max(1, numseqs / 10), numseqs * 2)
    lambda_ = numseqs / numoccs
    pwm = jem.pwmfromWmer(seed, numseedsites, args.pseudocount)
    if args.writelogos:
        jem.logo(pwm, 'seed-%03d' % seedidx)

    seedstats['seedidx'].append(seedidx)
    seedstats['fasta'].append(strippedfasta)
    seedstats['seed'].append(str(seed))
    seedstats['W'].append(W)
    seedstats['numseedsites'].append(numseedsites)

    for iteration in xrange(args.maxiters):
        # numsamples = rdm.randint(max(1, numoccs / 10), numoccs / 2)
        # Log-normal draw (underlying normal has mean log(meannumsamples));
        # the +1 keeps the sample count at least 1.
        numsamples = int(rdm.lognormal(mean=logmeannumsamples, sigma=.5)) + 1
        pwmIC = jem.informationcontent(pwm, seqsdata.bgfreqs)

        # Exact update, timed for comparison with the estimators.
        start = time.time()
        summer = dotrueiteration(seqsdata, W, pwm, lambda_)
        truetime = time.time() - start
        logger.debug('Sums:\n%s', summer.sums)
        Znsumtrue = summer.sums[0].sum()
        pwmtrue = jem.normalisearray(summer.sums)
        pwmtrueIC = jem.informationcontent(pwmtrue, seqsdata.bgfreqs)
        lambdatrue = Znsumtrue / numoccs
        if args.writelogos:
            jem.logo(pwmtrue, 'seed-%03d-%03d-true' % (seedidx, iteration))

        # NOTE(review): for a 2-D array numpy's ord=1 is the matrix 1-norm,
        # not the entrywise absolute sum - confirm this is intended.
        distperbase = npy.linalg.norm(pwmtrue - pwm, ord=1) / W
        # Use the module logger, like the rest of this function, and reuse
        # the distance computed above rather than recomputing the norm.
        logger.info(
            'Iteration: %3d, IC/base=%.2f bits, PWM distance/base=%.4f',
            iteration, pwmtrueIC / W, distperbase)

        iterstats['seedidx'].append(seedidx)
        iterstats['iteration'].append(iteration)
        iterstats['truetime'].append(truetime)
        iterstats['numsamples'].append(numsamples)
        iterstats['ICstart'].append(pwmIC)
        iterstats['ICtrue'].append(pwmtrueIC)
        iterstats['Znsumtrue'].append(Znsumtrue)
        iterstats['lambdastart'].append(lambda_)
        iterstats['lambdatrue'].append(lambdatrue)

        for methodname in args.methods:
            start = time.time()
            domainsize, iscb = METHODS[methodname](
                seqsdata, pwm, lambda_, Widx, numsamples, args)
            duration = time.time() - start

            pwmestimate = jem.normalisearray(iscb.cb.sums)
            # Scale the sampled sum by domainsize / numsamples.
            Znsumestimate = iscb.cb.sums[0].sum() * \
                float(domainsize) / numsamples

            methodstats['seedidx'].append(seedidx)
            methodstats['iteration'].append(iteration)
            methodstats['method'].append(methodname)
            methodstats['methodtime'].append(duration)
            methodstats['ICestimate'].append(
                jem.informationcontent(pwmestimate, seqsdata.bgfreqs))
            methodstats['Znsumestimate'].append(Znsumestimate)
            methodstats['var'].append(iscb.var())
            methodstats['lambdaestimate'].append(Znsumestimate / numoccs)

            # Various measures of how different the estimated PWM is from
            # the true PWM.
            methodstats['frobeniusdist'].append(
                npy.linalg.norm(pwmtrue - pwmestimate, ord='fro'))
            methodstats['maxdist'].append(
                npy.abs(pwmtrue - pwmestimate).max())
            methodstats['absdist'].append(
                npy.abs(pwmtrue - pwmestimate).sum())
            methodstats['euclideandist'].append(
                npy.linalg.norm((pwmtrue - pwmestimate).flatten(), ord=2))
            methodstats['KLtrueestimate'].append(
                jem.pwmKL(pwmtrue, pwmestimate))
            methodstats['KLestimatetrue'].append(
                jem.pwmKL(pwmestimate, pwmtrue))

            if args.writelogos:
                jem.logo(
                    pwmestimate,
                    'seed-%03d-%03d-%s' % (seedidx, iteration, methodname))

        # The next iteration starts from the exact PWM; lambda_ is left at
        # its starting value.
        pwm = pwmtrue
        # lambda_ = lambdatrue
        if distperbase < args.stopthreshold:
            break

    return seedstats, iterstats, methodstats
def handleseed(seedidx, seqsdata, Widx, seed, args):
    """Test the methods on one seed.

    Builds a starting PWM from ``seed`` and then iterates: each iteration
    runs the exact update (``dotrueiteration``) and, from the same starting
    PWM, every estimator named in ``args.methods`` (looked up in
    ``METHODS``), recording timings and distances between the estimated and
    the exact PWM. The exact PWM becomes the starting PWM of the next
    iteration. Iteration stops after ``args.maxiters`` rounds or as soon as
    the per-base distance between consecutive PWMs drops below
    ``args.stopthreshold``.

    Args:
        seedidx: Identifier of the seed, used in the statistics and in logo
            file names.
        seqsdata: Sequence data; the attributes ``fasta``, ``seqs``,
            ``numoccs``, ``numunique`` and ``bgfreqs`` are read here.
        Widx: Index into ``args.Ws`` (and into the per-width arrays of
            ``seqsdata``) selecting the motif width.
        seed: The seed W-mer the starting PWM is built from.
        args: Options; ``Ws``, ``pseudocount``, ``writelogos``, ``maxiters``,
            ``methods`` and ``stopthreshold`` are read here, and the object
            is also passed on to each method.

    Returns:
        Tuple ``(seedstats, iterstats, methodstats)`` of
        ``collections.defaultdict(list)`` objects holding one entry per
        seed, per iteration, and per (iteration, method) respectively.
    """
    seedstats = collections.defaultdict(list)
    iterstats = collections.defaultdict(list)
    methodstats = collections.defaultdict(list)

    strippedfasta = stripfastaname(seqsdata.fasta)
    logger.info("Stripped FASTA: %s", strippedfasta)
    W = args.Ws[Widx]
    logger.info('Seed: %s; W=%2d', seed, W)

    numseqs = len(seqsdata.seqs)
    # Only ever used as a float divisor, so convert once.
    numoccs = float(seqsdata.numoccs[Widx])
    numunique = seqsdata.numunique[Widx]
    meannumsamples = npy.log10(numunique) * 600
    # Loop-invariant parameter of the log-normal draw made every iteration.
    logmeannumsamples = npy.log(meannumsamples)
    numseedsites = rdm.randint(max(1, numseqs / 10), numseqs * 2)
    lambda_ = numseqs / numoccs
    pwm = jem.pwmfromWmer(seed, numseedsites, args.pseudocount)
    if args.writelogos:
        jem.logo(pwm, 'seed-%03d' % seedidx)

    seedstats['seedidx'].append(seedidx)
    seedstats['fasta'].append(strippedfasta)
    seedstats['seed'].append(str(seed))
    seedstats['W'].append(W)
    seedstats['numseedsites'].append(numseedsites)

    for iteration in xrange(args.maxiters):
        # numsamples = rdm.randint(max(1, numoccs / 10), numoccs / 2)
        # Log-normal draw (underlying normal has mean log(meannumsamples));
        # the +1 keeps the sample count at least 1.
        numsamples = int(rdm.lognormal(mean=logmeannumsamples, sigma=.5)) + 1
        pwmIC = jem.informationcontent(pwm, seqsdata.bgfreqs)

        # Exact update, timed for comparison with the estimators.
        start = time.time()
        summer = dotrueiteration(seqsdata, W, pwm, lambda_)
        truetime = time.time() - start
        logger.debug('Sums:\n%s', summer.sums)
        Znsumtrue = summer.sums[0].sum()
        pwmtrue = jem.normalisearray(summer.sums)
        pwmtrueIC = jem.informationcontent(pwmtrue, seqsdata.bgfreqs)
        lambdatrue = Znsumtrue / numoccs
        if args.writelogos:
            jem.logo(pwmtrue, 'seed-%03d-%03d-true' % (seedidx, iteration))

        # NOTE(review): for a 2-D array numpy's ord=1 is the matrix 1-norm,
        # not the entrywise absolute sum - confirm this is intended.
        distperbase = npy.linalg.norm(pwmtrue - pwm, ord=1) / W
        # Use the module logger, like the rest of this function, and reuse
        # the distance computed above rather than recomputing the norm.
        logger.info(
            'Iteration: %3d, IC/base=%.2f bits, PWM distance/base=%.4f',
            iteration, pwmtrueIC / W, distperbase)

        iterstats['seedidx'].append(seedidx)
        iterstats['iteration'].append(iteration)
        iterstats['truetime'].append(truetime)
        iterstats['numsamples'].append(numsamples)
        iterstats['ICstart'].append(pwmIC)
        iterstats['ICtrue'].append(pwmtrueIC)
        iterstats['Znsumtrue'].append(Znsumtrue)
        iterstats['lambdastart'].append(lambda_)
        iterstats['lambdatrue'].append(lambdatrue)

        for methodname in args.methods:
            start = time.time()
            domainsize, iscb = METHODS[methodname](
                seqsdata, pwm, lambda_, Widx, numsamples, args)
            duration = time.time() - start

            pwmestimate = jem.normalisearray(iscb.cb.sums)
            # Scale the sampled sum by domainsize / numsamples.
            Znsumestimate = iscb.cb.sums[0].sum() * \
                float(domainsize) / numsamples

            methodstats['seedidx'].append(seedidx)
            methodstats['iteration'].append(iteration)
            methodstats['method'].append(methodname)
            methodstats['methodtime'].append(duration)
            methodstats['ICestimate'].append(
                jem.informationcontent(pwmestimate, seqsdata.bgfreqs))
            methodstats['Znsumestimate'].append(Znsumestimate)
            methodstats['var'].append(iscb.var())
            methodstats['lambdaestimate'].append(Znsumestimate / numoccs)

            # Various measures of how different the estimated PWM is from
            # the true PWM.
            methodstats['frobeniusdist'].append(
                npy.linalg.norm(pwmtrue - pwmestimate, ord='fro'))
            methodstats['maxdist'].append(
                npy.abs(pwmtrue - pwmestimate).max())
            methodstats['absdist'].append(
                npy.abs(pwmtrue - pwmestimate).sum())
            methodstats['euclideandist'].append(
                npy.linalg.norm((pwmtrue - pwmestimate).flatten(), ord=2))
            methodstats['KLtrueestimate'].append(
                jem.pwmKL(pwmtrue, pwmestimate))
            methodstats['KLestimatetrue'].append(
                jem.pwmKL(pwmestimate, pwmtrue))

            if args.writelogos:
                jem.logo(
                    pwmestimate,
                    'seed-%03d-%03d-%s' % (seedidx, iteration, methodname))

        # The next iteration starts from the exact PWM; lambda_ is left at
        # its starting value.
        pwm = pwmtrue
        # lambda_ = lambdatrue
        if distperbase < args.stopthreshold:
            break

    return seedstats, iterstats, methodstats
# seqs = seqan.StringDNASet(('AAAAAAAA', 'ACGTACGT', 'TATATATA')) numbases, seqs, ids = seqan.readFastaDNA('T00759-small.fa') logging.info('Loaded %d bases from %d sequences', numbases, len(seqs)) lambda_ = len(seqs) / float(numbases) logging.info('Building index') index = seqan.IndexStringDNASetESA(seqs) logging.info('Counting W-mers') Ws = [W] Wmercounts = npy.zeros((2*len(index), len(Ws)), dtype=npy.uint) numWmers = wmers.countWmersMulti(index.topdownhistory(), Ws, Wmercounts)[0] logging.info('Got %d %d-mers', numWmers, W) childWmerfreqs = npy.zeros((2*len(index), len(Ws), jem.SIGMA)) wmers.countWmerChildren(index.topdownhistory(), W, Wmercounts, childWmerfreqs) childWmerfreqs = jem.normalisearray(childWmerfreqs) sumestimator = jis.makesumestimator(numWmers) logging.info('Importance sampling using background model to find one seed') rdm.seed(2) memocb = jis.importancesample( index, W, childWmerfreqs[:, 0], jis.UniformImportanceWeight(), numsamples=1, callback=jis.ISCbMemo()) pwm = jem.pwmfromWmer(memocb.Xns[0], numseedsites, 1.) jem.logo(pwm, 'seed') numsamples = 3000 distsbs = [] distsbg = [] truesums = [] varratios = []