def getCached(dbPath, experimentName, experimentOptions, meta, verbose=False):
    """Return the cached experiment metadata if the database and experiment
    template are unchanged, otherwise None."""
    if meta == None or (isinstance(meta, basestring) and not os.path.exists(meta)): # nothing to compare with
        if verbose:
            print "No existing metadata file", [meta]
        return None
    # Load previous experiment
    meta = getMeta(meta)
    # Load current experiment
    template = buildExamples.parseExperiment(experimentName).copy()
    template = parseTemplateOptions(experimentOptions, template)
    # Get database information
    dbPath = os.path.abspath(os.path.expanduser(dbPath))
    dbModified = time.strftime("%c", time.localtime(os.path.getmtime(dbPath)))
    # Compare settings
    metaExp = meta["experiment"]
    if verbose:
        print "dbPath", dbPath
        print "dbFile", metaExp["dbFile"]
        print "dbModified", dbModified
        print "metaExp['dbModified']", metaExp["dbModified"]
        print "template", json.dumps(template)
        print "meta['template']", json.dumps(meta["template"])
    if metaExp["dbFile"] == dbPath and metaExp["dbModified"] == dbModified and template == meta["template"]:
        return meta # same database file, same modification time, same template
    else:
        return None # previous experiment differs
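# Example of how getCached might be called (a sketch: the database path,
# experiment name, options and metadata path below are hypothetical, not
# taken from the project):
#
#     cached = getCached("~/data/experiments.sqlite", "CancerClassification",
#                        ["project=BRCA"], "/tmp/results/meta.json", verbose=True)
#     if cached != None:
#         meta = cached  # reuse the previous run's example matrix
#     else:
#         pass           # rebuild the examples from the database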
def curvePoint(XPath, yPath, meta, resultPath, featureCount, classifier, classifierArgs, getCV,
               numFolds, verbose, parallel, preDispatch, randomize, metric):
    if isinstance(meta, basestring):
        meta = result.getMeta(meta)
    # Collect the top-ranked features; featureCount is a zero-based cutoff
    # index, so featureCount + 1 features are included.
    count = 0
    featureSet = []
    for featureName in meta["features"]: # features must be already analysed
        featureSet.append(meta["features"][featureName]["id"])
        count += 1
        if count > featureCount:
            break
    print "Testing", len(featureSet), "features", featureSet
    meta["curve"] = {"count":len(featureSet), "indices":featureSet}
    classifierNameMap = {
        "LinearSVC":"svm.LinearSVC",
        "svm.LinearSVC":"svm.LinearSVC",
        "ExtraTreesClassifier":"ensemble.ExtraTreesClassifier",
        "ensemble.ExtraTreesClassifier":"ensemble.ExtraTreesClassifier",
        "RLScore":"RLScore"
    }
    classifierName = classifierNameMap[classifier]
    # classifierArgs and getCV arrive as strings from the command line and are
    # evaluated here
    classifier, classifierArgs = learn.getClassifier(classifierName, eval(classifierArgs))
    meta, results, extras, hiddenResults, hiddenDetails = learn.test(
        XPath, yPath, meta, resultPath, classifier=classifier, classifierArgs=classifierArgs,
        getCV=eval(getCV), numFolds=numFolds, verbose=verbose, parallel=parallel,
        preDispatch=preDispatch, randomize=randomize, analyzeResults=False, metric=metric,
        useFeatures=featureSet, reclassify=True, details=False)
    return [meta, results, extras, hiddenResults, hiddenDetails]
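# curvePoint is launched as its own script by process() below
# ("python curvePoint.py ..."). The entry point here is a sketch reconstructed
# from the command string that process() builds: the flag names match that
# command, but the defaults are assumptions, not the project's actual script.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Evaluate a single feature-curve point")
    parser.add_argument("-X", dest="XPath")
    parser.add_argument("-y", dest="yPath")
    parser.add_argument("-m", dest="meta")
    parser.add_argument("-o", dest="resultPath")
    parser.add_argument("--cutoff", dest="featureCount", type=int)
    parser.add_argument("--classifier")
    parser.add_argument("--classifierArgs")
    parser.add_argument("--iteratorCV", dest="getCV")
    parser.add_argument("--numFolds", type=int, default=10)
    parser.add_argument("--verbose", type=int, default=3)
    parser.add_argument("--parallel", type=int, default=1)
    parser.add_argument("--preDispatch", default="2*n_jobs")
    parser.add_argument("--randomize", action="store_true")
    parser.add_argument("--metric", default="roc_auc")
    options = parser.parse_args()
    curvePoint(options.XPath, options.yPath, options.meta, options.resultPath,
               options.featureCount, options.classifier, options.classifierArgs,
               options.getCV, options.numFolds, options.verbose, options.parallel,
               options.preDispatch, options.randomize, options.metric)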
def analyze(meta, dbPath=None, resultPath=None, verbose=False):
    meta = result.getMeta(meta)
    if dbPath == None:
        dbPath = settings.CGI_DB_PATH
    print "Analyzing", dbPath
    con = DB.connect(dbPath)
    result.sortFeatures(meta)
    features = meta["features"]
    count = 1
    numFeatures = len(features)
    nonSelected = []
    for featureName in features:
        if not isinstance(features[featureName], int):
            # A selected feature (stored as a dict): attach Cancer Gene Index
            # term analysis for diseases and drugs
            if verbose:
                print "Processing feature", featureName, str(count) + "/" + str(numFeatures)
            geneName = getGeneName(featureName)
            if geneName != None:
                mappings = getTermAnalysis(con, geneName, "disease")
                result.setValue(features[featureName], "CancerGeneIndex", mappings)
                mappings = getTermAnalysis(con, geneName, "drug")
                result.setValue(features[featureName], "CancerGeneDrug", mappings)
        else:
            # A non-selected feature (stored as a plain int): record only its
            # gene name for the coverage statistics
            geneName = getGeneName(featureName)
            if geneName != None:
                nonSelected.append(geneName)
        count += 1
    result.setValue(meta, "CancerGeneIndex", analyzeTermCoverage(features), "analysis")
    result.setValue(meta["analysis"], "non-selected", getCancerGeneCoverage(con, nonSelected), "CancerGeneIndex")
    if resultPath != None:
        result.saveMeta(meta, resultPath)
    return meta
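# Example call (a sketch; the paths below are hypothetical):
#
#     meta = analyze("results/experiment.json",
#                    dbPath="~/data/CGI.sqlite",
#                    resultPath="results/experiment.analyzed.json",
#                    verbose=True)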
def greedyRLS(XPath, yPath, metaPath, fcount=5, scount=50, resultPath=None):
    X, Y = readAuto(XPath, yPath)
    meta = {}
    if metaPath != None:
        print "Loading metadata from", metaPath
        meta = result.getMeta(metaPath)
    X_train, X_hidden, Y_train, Y_hidden = hidden.split(X, Y, meta=meta)
    #if "classes" in meta:
    #    print "Class distribution = ", getClassDistribution(y)
    #logrps = range(15, 25)
    logrps = range(15, 26)
    print "Training RLS"
    loopCount = 1
    totalLoops = len(logrps) * fcount
    best_perf = -1
    best_logrp = None
    best_scount = None
    for logrp in logrps:
        kf = KFold(len(Y_train), n_folds=fcount, indices=True, shuffle=True, random_state=77)
        # One performance curve per fold; initialized outside the fold loop so
        # that all folds are averaged below
        perfs = []
        for train, test in kf:
            print "------------ Processing fold", str(loopCount) + "/" + str(totalLoops), "------------"
            kwargs = {}
            kwargs['train_features'] = X_train[train]
            kwargs['train_labels'] = Y_train[train]
            kwargs['subsetsize'] = scount
            kwargs['regparam'] = 2.**logrp
            kwargs['bias'] = 1
            cb = CallbackFunction(X_train[test], Y_train[test])
            kwargs['callback_obj'] = cb
            rls = GreedyRLS.createLearner(**kwargs)
            rls.train()
            perfs.append(cb.perfs)
            loopCount += 1
        print "---------------------------------------------------"
        # Average the per-step test performances over the folds and pick the
        # best point on the curve
        perfs = np.mean(perfs, axis=0)
        perf = np.max(perfs)
        sc = np.argmax(perfs) + 1
        print "%f AUC, %d logrp, %d selected" % (perf, logrp, sc)
        if perf > best_perf:
            best_perf = perf
            best_logrp = logrp
            best_scount = sc
    # Retrain on the full training set with the best regularization parameter,
    # evaluating on the hidden set after each greedy selection step
    kwargs = {}
    kwargs['train_features'] = X_train
    kwargs['train_labels'] = Y_train
    kwargs['subsetsize'] = scount
    kwargs['regparam'] = 2.**best_logrp
    kwargs['bias'] = 1
    cb = CallbackFunction(X_hidden, Y_hidden)
    kwargs['callback_obj'] = cb
    rls = GreedyRLS.createLearner(**kwargs)
    rls.train()
    perfs = cb.perfs
    selected = rls.selected
    model = rls.getModel()
    #if resultPath != None:
    #    saveResults(meta, resultPath, perfs, selected)
    return model, perfs, selected, best_logrp, best_scount
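# CallbackFunction is referenced above but not defined in this excerpt. Based
# on how greedyRLS uses it (constructed with held-out data, exposing a perfs
# list that fills up during training), a minimal sketch could look like the
# class below. The callback/finished method names follow RLScore's callback
# convention and are an assumption, as is the use of cindex (imported for
# looRLS) as the per-step measure.
class CallbackFunction(object):
    def __init__(self, X_test, Y_test):
        self.X_test = X_test
        self.Y_test = Y_test
        self.perfs = []  # held-out performance after each selected feature
    def callback(self, learner):
        # Assumed API: score the current model on the held-out data after
        # each greedy selection step
        model = learner.getModel()
        P = model.predict(self.X_test)
        self.perfs.append(cindex(self.Y_test, P))
    def finished(self, learner):
        pass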
def saveResults(meta, resultPath, results, extras, bestIndex, analyze, hiddenResults=None,
                hiddenDetails=None, databaseCGI=None, reclassify=False, details=True):
    if extras == None:
        print "No detailed information for cross-validation"
        return
    if not os.path.exists(os.path.dirname(resultPath)):
        os.makedirs(os.path.dirname(resultPath))
    meta = result.getMeta(meta)
    # Add general results
    meta["results"] = {"best":results[bestIndex], "all":results}
    if hiddenResults != None:
        meta["results"]["hidden"] = hiddenResults
    # Insert detailed results
    if details:
        featureByIndex = result.getFeaturesByIndex(meta)
        if hiddenDetails != None:
            saveDetails(meta, hiddenDetails.get("predictions", None),
                        hiddenDetails.get("importances", None), "hidden",
                        featureByIndex, reclassify=reclassify)
        fold = 0
        for extra in extras:
            saveDetails(meta, extra.get("predictions", None), extra.get("importances", None),
                        fold, featureByIndex, reclassify=reclassify)
            fold += 1
    else:
        if "examples" in meta:
            del meta["examples"]
        if "features" in meta:
            del meta["features"]
    # Analyze results
    if analyze:
        print "Analyzing results"
        meta = gene.analyze.analyze(meta, databaseCGI)
    # Save results
    if resultPath != None:
        result.saveMeta(meta, resultPath)
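# For reference, the "results" section assembled above has roughly the shape
# sketched below. The field names come from the per-combination dicts built in
# test() further down; the concrete values are illustrative only:
#
#     meta["results"] = {
#         "best":   {"classifier":"LinearSVC", "cv":"StratifiedKFold",
#                    "folds":10, "metric":"roc_auc", "scores":[...],
#                    "mean":0.85, "std":0.01, "params":{"C":1.0}},
#         "all":    [ ...one such entry per parameter combination... ],
#         "hidden": {"classifier":"LinearSVC", "score":0.83,
#                    "metric":"roc_auc", "params":{"C":1.0}}
#     }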
def looRLS(XPath, yPath, metaPath):
    X, Y = readAuto(XPath, yPath)
    meta = {}
    if metaPath != None:
        print "Loading metadata from", metaPath
        meta = result.getMeta(metaPath)
    X_train, X_hidden, Y_train, Y_hidden = hidden.split(X, Y, meta=meta)
    kwargs = {}
    kwargs['train_features'] = X_train
    kwargs['train_labels'] = Y_train
    kwargs['regparam'] = 1.0
    rls = RLS.createLearner(**kwargs)
    rls.train()
    # Select the regularization parameter by fast leave-one-out cross-validation
    bestperf = -1.
    bestlogrp = None
    for logrp in range(5, 25):
        rp = 2. ** logrp
        rls.solve(rp)
        Ploo = rls.computeLOO()
        perf = cindex(Y_train, Ploo)
        print "Leave-one-out %f for lambda 2^%d" % (perf, logrp)
        if perf > bestperf:
            bestperf = perf
            bestlogrp = logrp
    rp = 2. ** bestlogrp
    print "Best leave-one-out %f for lambda 2^%d" % (bestperf, bestlogrp)
    rls.solve(rp)
    model = rls.getModel()
    P = model.predict(X_hidden)
    perf = cindex(Y_hidden, P)
    print "final performance: %f" % perf
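# cindex above comes from RLScore's performance measures. Purely for
# illustration, a minimal O(n^2) concordance index over real-valued labels
# can be sketched as below; _cindex_sketch is a hypothetical helper, not the
# project's implementation:
def _cindex_sketch(Y, P):
    pairs = 0
    correct = 0.0
    n = len(Y)
    for i in range(n):
        for j in range(i + 1, n):
            if Y[i] == Y[j]:
                continue  # pairs tied in the true labels are not comparable
            pairs += 1
            if (P[i] - P[j]) * (Y[i] - Y[j]) > 0:
                correct += 1.0  # prediction order agrees with label order
            elif P[i] == P[j]:
                correct += 0.5  # tied predictions count as half
    return correct / pairs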
def saveResults(meta, resultPath, perfs, selected):
    if not os.path.exists(os.path.dirname(resultPath)):
        os.makedirs(os.path.dirname(resultPath))
    meta = result.getMeta(meta)
    featureByIndex = result.getFeaturesByIndex(meta)
    for foldIndex in range(len(selected)):
        ranks = selected[foldIndex]
        for featureRank in range(len(ranks)):
            featureIndex = ranks[featureRank]
            feature = result.getFeature(meta, featureIndex, featureByIndex)
            result.setValue(feature, foldIndex, featureRank, "ranks")
            # Sort key: negative mean rank over the folds seen so far, so
            # that consistently early-selected features sort first
            result.setValue(feature, "sort", -sum(feature["ranks"].values()) / len(feature["ranks"]))
    result.saveMeta(meta, resultPath)
def process(database, inputMetaPath, resultBaseDir, cutoff=50, verbose=3, parallel=1,
            preDispatch='2*n_jobs', randomize=False, limit=1, debug=False, dummy=False,
            rerun=None, hideFinished=False, slurm=False):
    meta = result.getMeta(inputMetaPath)
    connection = batch.getConnection(slurm, debug)
    makeDir(resultBaseDir)
    cacheDir = makeDir(os.path.join(resultBaseDir, "cache"))
    resultDir = makeDir(os.path.join(resultBaseDir, "results"))
    jobDir = makeDir(os.path.join(resultBaseDir, "jobs"))
    #cachedMetaPath = os.path.join(cacheDir, "base.json")
    baseXPath, baseYPath, baseMetaPath = cache.getExperiment(
        experiment=meta["experiment"]["name"], experimentOptions=meta["experiment"]["options"],
        database=database, writer="writeNumpyText", useCached=True, cacheDir=cacheDir)
    features = meta["features"]
    count = 0
    featureSet = []
    cls = meta["results"]["best"]
    # Pool the parameter values tried in the original grid search
    paramSets = [x["params"] for x in meta["results"]["all"]]
    classifierArgs = {}
    for paramSet in paramSets:
        for key in paramSet:
            if not key in classifierArgs:
                classifierArgs[key] = []
            classifierArgs[key].append(paramSet[key])
    classifierNameMap = {
        "LinearSVC":"svm.LinearSVC",
        "svm.LinearSVC":"svm.LinearSVC",
        "ExtraTreesClassifier":"ensemble.ExtraTreesClassifier",
        "ensemble.ExtraTreesClassifier":"ensemble.ExtraTreesClassifier",
        "RLScore":"RLScore",
        "RFEWrapper":"svm.LinearSVC"
    }
    classifierName = classifierNameMap[cls["classifier"]]
    #classifier, classifierArgs = learn.getClassifier(classifierName, params)
    submitCount = 0
    sleepTime = 15
    for featureName in features:
        feature = features[featureName]
        batch.waitForJobs(limit, submitCount, connection, sleepTime)
        print "Processing feature", featureName
        print feature
        featureSet.append(feature["id"])
        jobName = "_".join([meta["experiment"]["name"], meta["template"]["project"],
                            classifierName, "feature-" + str(feature["rank"])])
        pointResultPath = os.path.join(resultDir, jobName + ".json")
        print "Feature set", featureSet
        if len(featureSet) > 1:
            #hiddenResults = curvePoint(baseXPath, baseYPath, baseMetaPath, featureSet, pointResultPath,
            #                           classifier=classifier, classifierArgs=params, getCV=eval(cls["cv"]),
            #                           numFolds=cls["folds"], verbose=verbose, parallel=parallel,
            #                           preDispatch=preDispatch, randomize=randomize, metric=cls["metric"])[3]
            #results.append(hiddenResults)
            # Build the curvePoint.py command for batch submission
            command = "python curvePoint.py"
            command += " -X " + baseXPath
            command += " -y " + baseYPath
            command += " -m " + inputMetaPath
            command += " -o " + pointResultPath
            command += " --cutoff " + str(count)
            command += " --classifier " + classifierName
            command += " --classifierArgs \"" + str(classifierArgs) + "\""
            command += " --iteratorCV " + cls["cv"]
            command += " --numFolds " + str(cls["folds"])
            command += " --verbose " + str(verbose)
            command += " --parallel " + str(parallel)
            command += " --preDispatch \"" + str(preDispatch) + "\""
            if randomize:
                command += " --randomize"
            command += " --metric " + cls["metric"]
            if batch.submitJob(command, connection, jobDir, jobName, dummy, rerun, hideFinished):
                submitCount += 1
        count += 1
        if count > cutoff:
            break
def test(XPath, yPath, metaPath, resultPath, classifier, classifierArgs, getCV=getStratifiedKFoldCV,
         numFolds=10, verbose=3, parallel=1, preDispatch='2*n_jobs', randomize=False,
         analyzeResults=False, databaseCGI=None, metric="roc_auc", useFeatures=None,
         reclassify=False, details=True):
    X, y = readAuto(XPath, yPath, useFeatures=useFeatures)
    meta = {}
    if metaPath != None:
        meta = result.getMeta(metaPath)
    if "classes" in meta:
        print "Class distribution = ", getClassDistribution(y)
        if randomize:
            classes = meta["classes"].values()
            y = [random.choice(classes) for x in range(len(y))]
            print "Randomized class distribution = ", getClassDistribution(y)
    X_train, X_hidden, y_train, y_hidden = hidden.split(X, y, meta=meta)
    print "Sizes", [len(X_train), len(y_train)], [len(X_hidden), len(y_hidden)]
    print "Cross-validating for", numFolds, "folds"
    print "Args", classifierArgs
    cv = getCV(y_train, meta, numFolds=numFolds)
    if preDispatch.isdigit():
        preDispatch = int(preDispatch)
    scorer = getScorer(metric)
    search = ExtendedGridSearchCV(classifier(), [classifierArgs], refit=len(X_hidden) > 0,
                                  cv=cv, scoring=scorer, verbose=verbose, n_jobs=parallel,
                                  pre_dispatch=preDispatch)
    search.fit(X_train, y_train)
    if hasattr(search, "best_estimator_"):
        print "----------------------------- Best Estimator -----------------------------------"
        print search.best_estimator_
        if hasattr(search.best_estimator_, "doRFE"):
            print "*** RFE ***"
            search.best_estimator_.doRFE(X_train, y_train)
    #print "--------------------------------------------------------------------------------"
    print "---------------------- Grid scores on development set --------------------------"
    results = []
    extras = None
    index = 0
    bestIndex = 0
    for params, mean_score, scores in search.grid_scores_:
        print scores
        print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
        results.append({"classifier":classifier.__name__, "cv":cv.__class__.__name__,
                        "folds":numFolds, "metric":metric, "scores":list(scores),
                        "mean":float(mean_score), "std":float(scores.std() / 2), "params":params})
        if index == 0 or float(mean_score) > results[bestIndex]["mean"]:
            bestIndex = index
        index += 1
    # Keep the per-fold extras of the best parameter combination (selected
    # after the loop, once bestIndex is final)
    if hasattr(search, "extras_"):
        extras = search.extras_[bestIndex]
    print "---------------------- Best scores on development set --------------------------"
    params, mean_score, scores = search.grid_scores_[bestIndex]
    print scores
    print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
    hiddenResults = None
    hiddenDetails = None
    if len(X_hidden) > 0:
        print "----------------------------- Classifying Hidden Set -----------------------------------"
        hiddenResults = {"classifier":search.best_estimator_.__class__.__name__,
                         "score":search.score(X_hidden, y_hidden),
                         "metric":metric, "params":search.best_params_}
        print "Score =", hiddenResults["score"], "(" + metric + ")"
        y_hidden_pred = search.predict(X_hidden)
        #print y_hidden_pred
        #print search.predict_proba(X_hidden)
        hiddenDetails = {"predictions":{i:x for i, x in enumerate(y_hidden_pred)}}
        if hasattr(search.best_estimator_, "feature_importances_"):
            hiddenDetails["importances"] = search.best_estimator_.feature_importances_
        try:
            print classification_report(y_hidden, y_hidden_pred)
        except ValueError, e:
            print "ValueError in classification_report:", e