Ejemplo n.º 1
0
def greedyRLS(XPath, yPath, metaPath, fcount=5, scount=50, resultPath=None):
    X, Y = readAuto(XPath, yPath)
    meta = {}
    if metaPath != None:
        print "Loading metadata from", metaPath
        meta = result.getMeta(metaPath)
    X_train, X_hidden, Y_train, Y_hidden = hidden.split(X, Y, meta=meta) 
    #if "classes" in meta:
    #    print "Class distribution = ", getClassDistribution(y)

    #logrps = range(15, 25)
    logrps = range(15, 26)
    print "Training RLS"
    loopCount = 1
    best_perf = -1
    best_logrp = None
    best_scount = None
    for logrp in logrps:
        kf = KFold(len(Y_train), n_folds=fcount, indices=True, shuffle=True, random_state=77)
        for train, test in kf:
            perfs = []
            print "------------ Processing fold", str(loopCount) + "/" + str(fcount), "------------"
            kwargs = {}
            kwargs['train_features'] = X_train[train]
            kwargs['train_labels'] = Y_train[train]
            kwargs['subsetsize'] = scount
            kwargs['regparam'] = 2.**logrp
            kwargs['bias'] = 1
            cb = CallbackFunction(X_train[test], Y_train[test])
            kwargs['callback_obj'] = cb
            rls = GreedyRLS.createLearner(**kwargs)
            rls.train()
            perfs.append(cb.perfs)
            loopCount += 1
            print "---------------------------------------------------"
        perfs = np.mean(perfs, axis=0)
        perf = np.max(perfs)
        perf = perfs[-1]
        sc = np.argmax(perfs)+1
        print "%f AUC, %d logrp, %d selected" %(perf, logrp, sc)
        if perf>best_perf:
            best_perf = perf
            best_logrp = logrp
            best_scount = sc
    kwargs = {}
    kwargs['train_features'] = X_train
    kwargs['train_labels'] = Y_train
    kwargs['subsetsize'] = scount
    kwargs['regparam'] = 2.**best_logrp
    kwargs['bias'] = 1
    cb = CallbackFunction(X_hidden, Y_hidden)
    kwargs['callback_obj'] = cb
    rls = GreedyRLS.createLearner(**kwargs)
    rls.train()
    perfs = cb.perfs
    selected = rls.selected
    model = rls.getModel()
    #if resultPath != None:
    #    saveResults(meta, resultPath, perfs, selected)
    return model, perfs, selected, best_logrp, best_scount
Ejemplo n.º 2
0
def looRLS(XPath, yPath, metaPath):
    X, Y = readAuto(XPath, yPath)
    meta = {}
    if metaPath != None:
        print "Loading metadata from", metaPath
        meta = result.getMeta(metaPath)
    X_train, X_hidden, Y_train, Y_hidden = hidden.split(X, Y, meta=meta) 
    kwargs = {}
    kwargs['train_features'] = X_train
    kwargs['train_labels'] = Y_train
    kwargs['regparam'] = 1.0
    rls = RLS.createLearner(**kwargs)
    rls.train()
    bestperf = -1. 
    for logrp in range(5, 25):
        rp = 2. ** logrp
        rls.solve(rp)
        Ploo = rls.computeLOO()
        perf = cindex(Y_train, Ploo)
        print "Leave-one-out %f for lambda 2^%d" %(perf, logrp)
        if perf > bestperf:
            bestperf = perf
            bestlogrp = logrp
    rp = 2. ** bestlogrp
    print "Best leave-one-out %f for lambda 2^%d" %(bestperf, bestlogrp)
    rls.solve(rp)
    model = rls.getModel()
    P = model.predict(X_hidden)
    perf = cindex(Y_hidden, P)
    print "final performance: %f" %perf
Ejemplo n.º 3
0
def test(XPath, yPath, metaPath, resultPath, classifier, classifierArgs, 
         getCV=getStratifiedKFoldCV, numFolds=10, verbose=3, parallel=1, 
         preDispatch='2*n_jobs', randomize=False, analyzeResults=False,
         databaseCGI=None, metric="roc_auc", useFeatures=None, reclassify=False, details=True):
    X, y = readAuto(XPath, yPath, useFeatures=useFeatures)
    meta = {}
    if metaPath != None:
        meta = result.getMeta(metaPath)
    if "classes" in meta:
        print "Class distribution = ", getClassDistribution(y)
        if randomize:
            classes = meta["classes"].values()
            y = [random.choice(classes) for x in range(len(y))]
            print "Randomized class distribution = ", getClassDistribution(y)
    X_train, X_hidden, y_train, y_hidden = hidden.split(X, y, meta=meta)
    print "Sizes", [len(X_train), len(y_train)], [len(X_hidden), len(y_hidden)]

    print "Cross-validating for", numFolds, "folds"
    print "Args", classifierArgs
    cv = getCV(y_train, meta, numFolds=numFolds)
    if preDispatch.isdigit():
        preDispatch = int(preDispatch)
    scorer = getScorer(metric)
    search = ExtendedGridSearchCV(classifier(), [classifierArgs], refit=len(X_hidden) > 0, cv=cv, scoring=scorer, verbose=verbose, n_jobs=parallel, pre_dispatch=preDispatch)
    search.fit(X_train, y_train) 
    if hasattr(search, "best_estimator_"):
        print "----------------------------- Best Estimator -----------------------------------"
        print search.best_estimator_
        if hasattr(search.best_estimator_, "doRFE"):
            print "*** RFE ***"
            search.best_estimator_.doRFE(X_train, y_train)
    #print "--------------------------------------------------------------------------------"
    print "---------------------- Grid scores on development set --------------------------"
    results = []
    extras = None
    index = 0
    bestIndex = 0
    for params, mean_score, scores in search.grid_scores_:
        print scores
        print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
        results.append({"classifier":classifier.__name__, "cv":cv.__class__.__name__, "folds":numFolds,
                   "metric":metric,"scores":list(scores), 
                   "mean":float(mean_score), "std":float(scores.std() / 2), "params":params})
        if index == 0 or float(mean_score) > results[bestIndex]["mean"]:
            bestIndex = index
            if hasattr(search, "extras_"):
                extras = search.extras_[index]
        index += 1
    print "---------------------- Best scores on development set --------------------------"
    params, mean_score, scores = search.grid_scores_[bestIndex]
    print scores
    print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
    hiddenResults = None
    hiddenDetails = None
    if len(X_hidden) > 0:
        print "----------------------------- Classifying Hidden Set -----------------------------------"
        hiddenResults = {"classifier":search.best_estimator_.__class__.__name__, 
                         "score":search.score(X_hidden, y_hidden),
                         "metric":metric,
                         "params":search.best_params_}
        print "Score =", hiddenResults["score"], "(" + metric + ")"
        y_hidden_pred = search.predict(X_hidden)
        #print y_hidden_pred
        #print search.predict_proba(X_hidden)
        hiddenDetails = {"predictions":{i:x for i,x in enumerate(y_hidden_pred)}}
        if hasattr(search.best_estimator_, "feature_importances_"):
            hiddenDetails["importances"] = search.best_estimator_.feature_importances_
        try:
            print classification_report(y_hidden, y_hidden_pred)
        except ValueError, e:
            print "ValueError in classification_report:", e