def retrain(models, data, calibdir, finaldir, nfold=1, ntrial=1):
    """Iteratively shuffle suspicious 'unbound' training sequences, then retrain.

    Repeatedly predicts on `data` with the model in `finaldir`; any negative
    (Y < .5) example that scores above a threshold derived from the positive
    scores is replaced by a shuffled version of itself (in place, mutating
    `data`), until almost no negatives score that high. Finally calls
    kangaroo.train on the cleaned-up data.

    NOTE(review): `doublet_shuffle` presumably produces a dinucleotide-preserving
    shuffle of the sequence — confirm against its definition.
    """
    # Raw (pre-softmax) scores are needed so the percentile threshold is
    # computed on unsquashed outputs.
    kangaroo.globals.flags.push("disable_softmax",True)
    # At most 10 shuffle/re-predict rounds; usually exits early via the
    # `numshuffle` convergence test below.
    for trial in range(10):
        Z = kangaroo.predict(data, finaldir, finaldir)["bound"]
        # I0: indices of negative (unbound) examples; I1: positive (bound).
        I0 = [i for i in range(len(data)) if data.Y[i] < .5]
        I1 = [i for i in range(len(data)) if data.Y[i] > .5]
        # Earlier threshold heuristics, kept for reference:
        #threshold = np.mean(Z[I1].ravel()) - 4*np.std(Z[I1].ravel())
        #threshold = np.percentile(Z[I0].ravel(), 95)
        # Current rule: the 1st percentile of the positives' scores — a negative
        # scoring above (almost) any positive is treated as contaminated.
        threshold = np.percentile(Z[I1].ravel(), 1)
        numshuffle = 0
        for i in I0:
            if Z[i] > threshold:
                # Replace the offending negative with a shuffled sequence,
                # keeping X_seq consistent with sequences (both mutated in place).
                data.sequences[i][0] = doublet_shuffle(data.sequences[i][0])
                data.X_seq[i] = data.sequences[i][0]
                numshuffle += 1
        numshuffle_pct = float(numshuffle)/len(I0)*100
        print "retrain trial %d: had to shuffle %.1f%% unbound sequences" % (trial, numshuffle_pct)
        # Converged: fewer than 0.01% of negatives needed shuffling this round.
        if numshuffle <= .0001*len(I0):
            break
    # Restore softmax before the real training run.
    kangaroo.globals.flags.pop("disable_softmax")
    kangaroo.train(models, data, calibdir, finaldir, nfold=nfold, ntrial=ntrial)
    return
def train_without_outliers(cfgs, data, calibdir, outdir, ntrial=1, auxfilter=None):
    """Train twice: once on all data, then again with the worst outliers removed.

    NOTE(review): the visible body ends right after computing `roworder` and
    never uses it — this function looks unfinished (or the remainder was lost);
    confirm against version history before relying on it.
    """
    # Same as kangaroo.train, except this version trains twice: once with all
    # the data, and again with worst outliers 'removed'.

    # Step 1. Train with all training data (currently disabled).
    #kangaroo.train(cfgs, data, calibdir, outdir, nfold=1, ntrial=ntrial, auxfilter=auxfilter)

    # Step 2. Load predictions on the training data, and remove
    # the examples that were misclassified the worst.
    if not isinstance(data,dict):
        # Split a multi-target dataset into one single-target dataset per name.
        data = { id : data.astargets([id]) for id in data.targetnames }
    # Use raw scores for ranking misclassification severity.
    kangaroo.globals.flags.push("disable_softmax",True)
    pred = kangaroo.predict(data, outdir, outdir)
    # NOTE(review): `data` is a plain dict here, which has no `.targetnames`
    # attribute — this line looks like it would raise AttributeError; also the
    # inner loop iterates ALL predictions for every `targetname` without using
    # `targetname`. Both need confirming against the full/original function.
    for targetname in sorted(data.targetnames):
        for name, Z in pred.iteritems():
            Y = data[name].Y.ravel()
            allidx = []
            allseq = set()
            # Row indices sorted by decreasing predicted score.
            roworder = np.argsort(-Z.ravel())
def save_metrics(data, groupname, outdir, modeldir=None):
    """Predict on `data` with the model in `modeldir` and record per-target metrics.

    For each target, pulls the flattened predictions/labels/row indices and
    hands them to `_update_metrics` under `groupname`. A second, disabled
    diagnostic pass estimates how repetitive the top-scoring bound sequences are.
    """
    if modeldir is None:
        modeldir = outdir
    if not isinstance(data,dict):
        # Split a multi-target dataset into one single-target dataset per name.
        data = { id : data.astargets([id]) for id in data.targetnames }
    pred = kangaroo.predict(data, modeldir, outdir)
    for targetname in data.keys():
        z = pred[targetname].ravel()
        y = data[targetname].Y.ravel()
        rowidx = data[targetname].rowidx
        _update_metrics(outdir, targetname, groupname, rowidx, z, y)

    # Optional diagnostic: measure sequence-repeat rates among top predictions.
    # Disabled by default; flip to True to run it.
    check_repeat_correlation = False
    if check_repeat_correlation:
        for targetname in data.keys():
            z = pred[targetname].ravel()
            y = data[targetname].Y.ravel()
            s = [data[targetname].X_seq[i] for i in range(len(data[targetname]))]
            # Only get the bound examples.
            # NOTE(review): the first line assigns SEQUENCES (s[i]) into `z`,
            # clobbering the scores — probably meant `z[i]`. Dead code as long
            # as check_repeat_correlation stays False; confirm before enabling.
            z = [s[i] for i in range(len(data[targetname])) if y[i] != 0]
            s = [s[i] for i in range(len(data[targetname])) if y[i] != 0]
            # Helper treating a sequence and its reverse complement as equal.
            # NOTE(review): defines __eq__ without __hash__ and is never
            # instantiated in this block — appears to be leftover scaffolding.
            class revcomp_str(object):
                def __init__(self, s):
                    self.s = s;
                def __eq__(self, other):
                    return self.s == other.s or self.s == revcomp(other.s)
            # For each percentile, count the number of repeats
            print targetname
            order = np.argsort(z)
            for denom in reversed([2,8,32,128,512,2048]):
                # Top 1/denom fraction of examples by (ascending-sorted) score.
                top_s = [s[i] for i in order[len(order)//denom:]]
                top_unique = set([revcomp(top_s[i]) for i in range(len(top_s))])
                nall = float(len(top_s))
                nunique = float(len(top_unique))
                percent_repeat = 100*(nall - nunique) / nall
                print "  top %.4f%% (n=%d)\t=> %.2f%% repeats" % (100./denom, nall, percent_repeat)
def save_featuremaps(data, modeldir, outdir, maxrows=1000000):
    """Collect convolutional feature maps for the top predictions and save PFMs.

    For each target: rank rows by descending score, run prediction with the
    'collect_featuremaps' flag set so the library captures per-filter
    activations, then build per-filter (sequence, max_activation, argmax)
    tuples and feed them to `compute_pfms`, pickling the result to
    `<outdir>/<name>.pfms.pkl`.
    """
    if not isinstance(data,dict):
        # Split a multi-target dataset into one single-target dataset per name.
        data = { id : data.astargets([id]) for id in data.targetnames }
    disable_softmax()
    pred = kangaroo.predict(data, modeldir, outdir)
    # Find a list of rows, in order of decreasing confidence, and with all
    # duplicate sequences deleted
    for name, Z in pred.iteritems():
        Y = data[name].Y.ravel()
        allidx = []
        allseq = set()
        roworder = np.argsort(-Z.ravel())
        for i in range(len(roworder)):
            s = data[name].X_seq[roworder[i]]
            # Duplicate filtering is currently disabled (allseq is still
            # populated but never consulted):
            #if s in allseq:
            #    continue
            allidx.append(roworder[i])
            allseq.add(s)
            if maxrows and len(allidx) >= maxrows:
                break

        # Now actually dump the featuremaps for all the rows specified
        print "Generating feature maps...",
        datasub = data[name][allidx]
        # Needed to get back collected values in globals; uugh
        kangaroo.globals.set_multiprocessing(False)
        # Zmask is only collected when reverse-complement mode is on; it marks
        # which strand of each sequence pair was actually used for prediction.
        if "reverse_complement" in kangaroo.globals.flags:
            kangaroo.globals.flags.push("collect_Zmask",True)
        kangaroo.globals.flags.push("collect_featuremaps",True)
        # Raw (pre-ReLU) activations are wanted for the feature maps.
        kangaroo.globals.flags.push("disable_relu",True)
        kangaroo.predict(datasub, modeldir, outdir)
        kangaroo.globals.flags.pop("disable_relu")
        fmaps = kangaroo.globals.flags.pop("collect_featuremaps")
        Zmask = kangaroo.globals.flags.pop("collect_Zmask").ravel() if "reverse_complement" in kangaroo.globals.flags else None
        kangaroo.globals.set_multiprocessing(True)

        # Build the sequence list in the same order the predictor saw it:
        # in reverse-complement mode each row contributes fwd then revcomp.
        seqs = []
        for i in range(len(datasub)):
            s = datasub.sequences[i][0]
            seqs.append(s)
            if "reverse_complement" in kangaroo.globals.flags:
                seqs.append(revcomp(s))
        if "reverse_complement" in kangaroo.globals.flags:
            # Remove the strand that was not used for prediction
            fmaps = [fmaps[i] for i in range(len(fmaps)) if Zmask[i]]
            seqs = [seqs[i] for i in range(len(seqs)) if Zmask[i]]

        # NOTE(review): assumes fmaps[i] is (num_filters, map_length) and that
        # map_length = len(seq) + filter_len - 1 (full convolution) — confirm.
        filter_len = fmaps[0].shape[1] - len(seqs[0]) + 1

        # Make tuples of (sequence, max_value, max_index) for each featuremap
        pfmargs = []
        for k in range(fmaps[0].shape[0]):
            pfmarg = [(seqs[i], float(np.max(fmaps[i][k])), int(np.argmax(fmaps[i][k]))) for i in range(len(seqs))]
            pfmargs.append(pfmarg)
        # Structured array: fixed-width byte string sized to the longest
        # sequence, plus float32 max and int32 argmax per entry.
        pfmargs = np.array(pfmargs, dtype='a%d,f4,i4' % max(len(seqs[i]) for i in range(len(seqs))))
        print "done"
        #np.savez_compressed(outdir + "/%s.pfm_info.npz"%(name), pfmargs=pfmargs)

        # Compute PFMs from the pfmargs array
        print "Computing PFMs for %s..." % name,
        pfms, ic, kl_dist, pval, counts = compute_pfms(pfmargs, filter_len=filter_len, num_permut=500, rev_comp=False)
        print "done"
        makepath(outdir)
        with open(outdir + "/%s.pfms.pkl"%(name),"wb") as f:
            cPickle.dump({"pfms" : pfms, "ic" : ic, "kl_dist" : kl_dist, "pval" : pval, "counts" : counts}, f)
    # Restore softmax for subsequent callers.
    enable_softmax()