import random

import numpy as np
from sklearn.cross_validation import KFold  # pre-0.18 scikit-learn API
from sklearn.metrics import classification_report

# readAuto, result, hidden, getClassDistribution, CallbackFunction, GreedyRLS,
# RLS, cindex, ExtendedGridSearchCV, getScorer and getStratifiedKFoldCV are
# assumed to be imported from the surrounding project / the RLScore library.

def greedyRLS(XPath, yPath, metaPath, fcount=5, scount=50, resultPath=None):
    """Greedy RLS feature selection: tune the regularization parameter and
    the number of selected features by cross-validation, then retrain on the
    full training set and evaluate on the hidden set."""
    X, Y = readAuto(XPath, yPath)
    meta = {}
    if metaPath is not None:
        print "Loading metadata from", metaPath
        meta = result.getMeta(metaPath)
    X_train, X_hidden, Y_train, Y_hidden = hidden.split(X, Y, meta=meta)
    #if "classes" in meta:
    #    print "Class distribution = ", getClassDistribution(y)
    #logrps = range(15, 25)
    logrps = range(15, 26)
    print "Training RLS"
    best_perf = -1
    best_logrp = None
    best_scount = None
    for logrp in logrps:
        kf = KFold(len(Y_train), n_folds=fcount, indices=True, shuffle=True, random_state=77)
        loopCount = 1
        perfs = []  # one performance curve per fold, recorded by the callback
        for train, test in kf:
            print "------------ Processing fold", str(loopCount) + "/" + str(fcount), "------------"
            kwargs = {}
            kwargs['train_features'] = X_train[train]
            kwargs['train_labels'] = Y_train[train]
            kwargs['subsetsize'] = scount
            kwargs['regparam'] = 2.**logrp
            kwargs['bias'] = 1
            cb = CallbackFunction(X_train[test], Y_train[test])
            kwargs['callback_obj'] = cb
            rls = GreedyRLS.createLearner(**kwargs)
            rls.train()
            perfs.append(cb.perfs)
            loopCount += 1
        print "---------------------------------------------------"
        perfs = np.mean(perfs, axis=0)   # average the selection path over folds
        perf = np.max(perfs)             # peak CV performance along the path
        sc = np.argmax(perfs) + 1        # number of selected features at that peak
        print "%f AUC, %d logrp, %d selected" % (perf, logrp, sc)
        if perf > best_perf:
            best_perf = perf
            best_logrp = logrp
            best_scount = sc
    # Retrain on the full training set with the best regularization parameter
    # and track performance on the hidden set along the selection path
    kwargs = {}
    kwargs['train_features'] = X_train
    kwargs['train_labels'] = Y_train
    kwargs['subsetsize'] = scount
    kwargs['regparam'] = 2.**best_logrp
    kwargs['bias'] = 1
    cb = CallbackFunction(X_hidden, Y_hidden)
    kwargs['callback_obj'] = cb
    rls = GreedyRLS.createLearner(**kwargs)
    rls.train()
    perfs = cb.perfs
    selected = rls.selected
    model = rls.getModel()
    #if resultPath != None:
    #    saveResults(meta, resultPath, perfs, selected)
    return model, perfs, selected, best_logrp, best_scount
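# A minimal usage sketch for greedyRLS (the file names, fold count and
# feature budget below are hypothetical; readAuto and the metadata format
# are defined elsewhere in this project):
#
#   model, perfs, selected, best_logrp, best_scount = greedyRLS(
#       "examples/X.csv", "examples/y.csv", "examples/meta.json",
#       fcount=5, scount=50)
#   print "Selected features:", selected
#   print "Hidden-set performance per selection step:", perfs
#
# The callback object is expected to append one performance value to its
# .perfs list per selected feature, which is what the per-path averaging
# and argmax above rely on.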
def looRLS(XPath, yPath, metaPath):
    """Tune the RLS regularization parameter by fast leave-one-out on the
    training set, then evaluate the refit model on the hidden set."""
    X, Y = readAuto(XPath, yPath)
    meta = {}
    if metaPath is not None:
        print "Loading metadata from", metaPath
        meta = result.getMeta(metaPath)
    X_train, X_hidden, Y_train, Y_hidden = hidden.split(X, Y, meta=meta)
    kwargs = {}
    kwargs['train_features'] = X_train
    kwargs['train_labels'] = Y_train
    kwargs['regparam'] = 1.0
    rls = RLS.createLearner(**kwargs)
    rls.train()
    # RLS can re-solve for a new regparam and compute leave-one-out
    # predictions in closed form, without retraining from scratch
    bestperf = -1.
    bestlogrp = None
    for logrp in range(5, 25):
        rp = 2. ** logrp
        rls.solve(rp)
        Ploo = rls.computeLOO()
        perf = cindex(Y_train, Ploo)
        print "Leave-one-out %f for lambda 2^%d" % (perf, logrp)
        if perf > bestperf:
            bestperf = perf
            bestlogrp = logrp
    rp = 2. ** bestlogrp
    print "Best leave-one-out %f for lambda 2^%d" % (bestperf, bestlogrp)
    rls.solve(rp)
    model = rls.getModel()
    P = model.predict(X_hidden)
    perf = cindex(Y_hidden, P)
    print "Final hidden-set performance: %f" % perf
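# Usage sketch for looRLS (paths are hypothetical). The function searches the
# regularization grid 2^5 ... 2^24, picks the value with the best
# leave-one-out concordance index, and reports the hidden-set score:
#
#   looRLS("examples/X.csv", "examples/y.csv", "examples/meta.json")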
def test(XPath, yPath, metaPath, resultPath, classifier, classifierArgs,
         getCV=getStratifiedKFoldCV, numFolds=10, verbose=3, parallel=1,
         preDispatch='2*n_jobs', randomize=False, analyzeResults=False,
         databaseCGI=None, metric="roc_auc", useFeatures=None,
         reclassify=False, details=True):
    """Grid-search the given classifier with cross-validation on the training
    set, then score the refit best estimator on the hidden set."""
    X, y = readAuto(XPath, yPath, useFeatures=useFeatures)
    meta = {}
    if metaPath is not None:
        meta = result.getMeta(metaPath)
    if "classes" in meta:
        print "Class distribution = ", getClassDistribution(y)
    if randomize:
        # Sanity-check baseline: replace the labels with random classes
        classes = meta["classes"].values()
        y = [random.choice(classes) for x in range(len(y))]
        print "Randomized class distribution = ", getClassDistribution(y)
    X_train, X_hidden, y_train, y_hidden = hidden.split(X, y, meta=meta)
    print "Sizes", [len(X_train), len(y_train)], [len(X_hidden), len(y_hidden)]
    print "Cross-validating for", numFolds, "folds"
    print "Args", classifierArgs
    cv = getCV(y_train, meta, numFolds=numFolds)
    if preDispatch.isdigit():
        preDispatch = int(preDispatch)
    scorer = getScorer(metric)
    search = ExtendedGridSearchCV(classifier(), [classifierArgs],
                                  refit=len(X_hidden) > 0, cv=cv,
                                  scoring=scorer, verbose=verbose,
                                  n_jobs=parallel, pre_dispatch=preDispatch)
    search.fit(X_train, y_train)
    if hasattr(search, "best_estimator_"):
        print "----------------------------- Best Estimator -----------------------------------"
        print search.best_estimator_
        if hasattr(search.best_estimator_, "doRFE"):
            print "*** RFE ***"
            search.best_estimator_.doRFE(X_train, y_train)
    #print "--------------------------------------------------------------------------------"
    print "---------------------- Grid scores on development set --------------------------"
    results = []
    extras = None
    index = 0
    bestIndex = 0
    for params, mean_score, scores in search.grid_scores_:
        print scores
        print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
        results.append({"classifier":classifier.__name__, "cv":cv.__class__.__name__,
                        "folds":numFolds, "metric":metric, "scores":list(scores),
                        "mean":float(mean_score), "std":float(scores.std() / 2),
                        "params":params})
        if index == 0 or float(mean_score) > results[bestIndex]["mean"]:
            bestIndex = index
        if hasattr(search, "extras_"):
            extras = search.extras_[index]
        index += 1
    print "---------------------- Best scores on development set --------------------------"
    params, mean_score, scores = search.grid_scores_[bestIndex]
    print scores
    print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
    hiddenResults = None
    hiddenDetails = None
    if len(X_hidden) > 0:
        print "----------------------------- Classifying Hidden Set -----------------------------------"
        hiddenResults = {"classifier":search.best_estimator_.__class__.__name__,
                         "score":search.score(X_hidden, y_hidden),
                         "metric":metric,
                         "params":search.best_params_}
        print "Score =", hiddenResults["score"], "(" + metric + ")"
        y_hidden_pred = search.predict(X_hidden)
        #print y_hidden_pred
        #print search.predict_proba(X_hidden)
        hiddenDetails = {"predictions":{i:x for i, x in enumerate(y_hidden_pred)}}
        if hasattr(search.best_estimator_, "feature_importances_"):
            hiddenDetails["importances"] = search.best_estimator_.feature_importances_
        try:
            print classification_report(y_hidden, y_hidden_pred)
        except ValueError, e:
            print "ValueError in classification_report:", e
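# Usage sketch for test (illustrative only: the classifier, parameter grid
# and paths below are hypothetical, and resultPath handling beyond this
# excerpt is project specific):
#
#   from sklearn.svm import SVC
#   test("examples/X.csv", "examples/y.csv", "examples/meta.json", None,
#        SVC, {"C":[0.1, 1, 10], "kernel":["linear"]},
#        numFolds=10, metric="roc_auc")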