Example #1
# Assumes: import numpy as np, plus the kangaroo package and module-level
# helpers such as doublet_shuffle defined elsewhere in this file.
def retrain(models, data, calibdir, finaldir, nfold=1, ntrial=1):
    # Iteratively doublet-shuffle any "unbound" sequence that still scores
    # like a bound one, then retrain on the cleaned-up data.
    kangaroo.globals.flags.push("disable_softmax", True)
    for trial in range(10):   # at most 10 shuffle passes
        Z = kangaroo.predict(data, finaldir, finaldir)["bound"]
        I0 = [i for i in range(len(data)) if data.Y[i] < .5]   # unbound examples
        I1 = [i for i in range(len(data)) if data.Y[i] > .5]   # bound examples
        #threshold = np.mean(Z[I1].ravel()) - 4*np.std(Z[I1].ravel())
        #threshold = np.percentile(Z[I0].ravel(), 95)
        threshold = np.percentile(Z[I1].ravel(), 1)   # 1st percentile of bound scores
        numshuffle = 0
        for i in I0:
            if Z[i] > threshold:
                data.sequences[i][0] = doublet_shuffle(data.sequences[i][0])
                data.X_seq[i] = data.sequences[i][0]
                numshuffle += 1

        numshuffle_pct = float(numshuffle)/len(I0)*100
        print "retrain trial %d: had to shuffle %.1f%% of unbound sequences" % (trial, numshuffle_pct)
        if numshuffle <= .0001*len(I0):   # converged: <0.01% still look bound
            break

    kangaroo.globals.flags.pop("disable_softmax")

    kangaroo.train(models, data, calibdir, finaldir, nfold=nfold, ntrial=ntrial)
    return
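# --- Helper sketch (assumption, not from the original source) --------------
# doublet_shuffle above is a kangaroo helper not shown in this excerpt; by
# name it randomizes a sequence while preserving its dinucleotide (doublet)
# content. A minimal stand-in, assuming plain ACGT strings, could permute
# non-overlapping 2-mers (the real helper is likely an Altschul-Erickson
# style Euler-path shuffle):
import random

def doublet_shuffle_sketch(seq):
    pairs = [seq[i:i+2] for i in range(0, len(seq) - 1, 2)]   # non-overlapping 2-mers
    tail = seq[len(pairs)*2:]                                 # odd trailing base, if any
    random.shuffle(pairs)
    return "".join(pairs) + tail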
Example #2
def train_without_outliers(cfgs, data, calibdir, outdir, ntrial=1, auxfilter=None):
    # Same as kangaroo.train, except this version trains twice: once with all
    # the data, and again with the worst outliers removed.

    # Step 1. Train with all training data (this call is currently disabled).
    #kangaroo.train(cfgs, data, calibdir, outdir, nfold=1, ntrial=ntrial, auxfilter=auxfilter)

    # Step 2. Load predictions on the training data, and remove
    #         the examples that were misclassified the worst.
    if not isinstance(data,dict):
        data = { id : data.astargets([id]) for id in data.targetnames }

    kangaroo.globals.flags.push("disable_softmax",True)
    pred = kangaroo.predict(data, outdir, outdir)
    for targetname in sorted(data.keys()):   # data is a dict of targets by now
        for name, Z in pred.iteritems():
            Y = data[name].Y.ravel()
            allidx = []
            allseq = set()
            roworder = np.argsort(-Z.ravel())
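# --- Sketch (assumption, not the original code) -----------------------------
# The excerpt ends before the removal step. One plausible way to drop the
# worst-misclassified fraction, given predictions Z and labels Y as above
# (assumes numpy as np):
def worst_outlier_filter_sketch(Z, Y, frac=0.01):
    err = np.abs(Z.ravel() - Y.ravel())        # distance from the label
    ncut = int(len(err) * frac)
    keep = np.argsort(err)[:len(err) - ncut]   # drop the `ncut` worst rows
    return np.sort(keep)                       # row indices to retain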
Example #3
def save_metrics(data, groupname, outdir, modeldir=None):
    # Predict each target with the model in modeldir and record per-target
    # metrics (via _update_metrics) under outdir.
    if modeldir is None:
        modeldir = outdir

    if not isinstance(data,dict):
        data = { id : data.astargets([id]) for id in data.targetnames }

    pred = kangaroo.predict(data, modeldir, outdir)
    for targetname in data.keys():
        z = pred[targetname].ravel()
        y = data[targetname].Y.ravel()
        rowidx = data[targetname].rowidx
        _update_metrics(outdir, targetname, groupname, rowidx, z, y)

    check_repeat_correlation = False
    if check_repeat_correlation:
        for targetname in data.keys():
            z = pred[targetname].ravel()
            y = data[targetname].Y.ravel()
            s = [data[targetname].X_seq[i] for i in range(len(data[targetname]))]

            # Only keep the bound examples.
            z = [z[i] for i in range(len(data[targetname])) if y[i] != 0]
            s = [s[i] for i in range(len(data[targetname])) if y[i] != 0]

            class revcomp_str(object):
                # Compares equal to its reverse complement (currently unused).
                def __init__(self, s): self.s = s
                def __eq__(self, other): return self.s == other.s or self.s == revcomp(other.s)

            # For each top-scoring fraction, count how many sequences are
            # repeats (identical up to reverse complement).
            print targetname
            order = np.argsort(z)
            for denom in reversed([2,8,32,128,512,2048]):
                ntop = max(1, len(order)//denom)              # size of the top fraction
                top_s = [s[i] for i in order[len(order)-ntop:]]
                top_unique = set(min(t, revcomp(t)) for t in top_s)
                nall = float(len(top_s))
                nunique = float(len(top_unique))
                percent_repeat = 100*(nall - nunique) / nall
                print "  top %.4f%% (n=%d)\t=> %.2f%% repeats" % (100./denom, nall, percent_repeat)
Example #4
def save_featuremaps(data, modeldir, outdir, maxrows=1000000):
    if not isinstance(data,dict):
        data = { id : data.astargets([id]) for id in data.targetnames }

    disable_softmax()
    pred = kangaroo.predict(data, modeldir, outdir)

    # Build a list of rows in order of decreasing confidence. (The duplicate-
    # sequence check below is currently commented out, so duplicates are kept.)
    for name, Z in pred.iteritems():
        Y = data[name].Y.ravel()
        allidx = []
        allseq = set()
        roworder = np.argsort(-Z.ravel())
        for i in range(len(roworder)):
            s = data[name].X_seq[roworder[i]]
            #if s in allseq:
            #    continue
            allidx.append(roworder[i])
            allseq.add(s)
            if maxrows and len(allidx) >= maxrows:
                break

        # Now actually dump the featuremaps for all the rows specified
        print "Generating feature maps...",
        datasub = data[name][allidx]
        kangaroo.globals.set_multiprocessing(False) # Needed to get back collected values in globals; uugh
        if "reverse_complement" in kangaroo.globals.flags:
            kangaroo.globals.flags.push("collect_Zmask",True)
        kangaroo.globals.flags.push("collect_featuremaps",True)
        kangaroo.globals.flags.push("disable_relu",True)
        kangaroo.predict(datasub, modeldir, outdir)
        kangaroo.globals.flags.pop("disable_relu")
        fmaps = kangaroo.globals.flags.pop("collect_featuremaps")
        Zmask = kangaroo.globals.flags.pop("collect_Zmask").ravel() if "reverse_complement" in kangaroo.globals.flags else None
        kangaroo.globals.set_multiprocessing(True)

        seqs = []
        for i in range(len(datasub)):
            s = datasub.sequences[i][0]
            seqs.append(s)
            if "reverse_complement" in kangaroo.globals.flags:
                seqs.append(revcomp(s))

        if "reverse_complement" in kangaroo.globals.flags:
            # Remove the strand that was not used for prediction
            fmaps = [fmaps[i] for i in range(len(fmaps)) if Zmask[i]]
            seqs  = [seqs[i]  for i in range(len(seqs))  if Zmask[i]]
        filter_len = fmaps[0].shape[1] - len(seqs[0]) + 1

        # Make tuples of (sequence, max_value, max_index) for each featuremap
        pfmargs = []
        for k in range(fmaps[0].shape[0]):
            pfmarg = [(seqs[i], float(np.max(fmaps[i][k])), int(np.argmax(fmaps[i][k]))) for i in range(len(seqs))]
            pfmargs.append(pfmarg)
        pfmargs = np.array(pfmargs, dtype='a%d,f4,i4' % max(len(seqs[i]) for i in range(len(seqs))))
        print "done"

        #np.savez_compressed(outdir + "/%s.pfm_info.npz"%(name), pfmargs=pfmargs)

        # Compute PFMs from the pfmargs array
        print "Computing PFMs for %s..." % name,
        pfms, ic, kl_dist, pval, counts  = compute_pfms(pfmargs, filter_len=filter_len, num_permut=500, rev_comp=False)
        print "done"
        makepath(outdir)
        with open(outdir + "/%s.pfms.pkl"%(name),"wb") as f:
            cPickle.dump({"pfms" : pfms, "ic" : ic, "kl_dist" : kl_dist, "pval" : pval, "counts" : counts}, f)

    enable_softmax()
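# --- Usage sketch (assumption, not from the original source) ----------------
# Hypothetical invocation: apply the trained model in "out/final" to `data`
# and write one <target>.pfms.pkl per target under "out/motifs". The paths
# are placeholders; assumes cPickle is imported as in the snippet above.
save_featuremaps(data, modeldir="out/final", outdir="out/motifs", maxrows=50000)

for name in data.keys():
    with open("out/motifs/%s.pfms.pkl" % name, "rb") as f:
        pfm_info = cPickle.load(f)
    print name, len(pfm_info["pfms"]), "PFMs"   # one PFM per convolution filter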