Example #1
def getCached(dbPath, experimentName, experimentOptions, meta, verbose=False):
    if meta == None or (isinstance(meta, basestring) and not os.path.exists(meta)): # nothing to compare with
        if verbose:
            print "No existing metadata file", [meta]
        return None
    # Load previous experiment
    meta = getMeta(meta)
    # Load current experiment
    template = buildExamples.parseExperiment(experimentName).copy()
    template = parseTemplateOptions(experimentOptions, template)
    # Get database information
    dbPath = os.path.abspath(os.path.expanduser(dbPath))
    dbModified = time.strftime("%c", time.localtime(os.path.getmtime(dbPath)))
    # Compare settings
    metaExp = meta["experiment"]
    if verbose:
        print "dbPath", dbPath
        print "dbFile", metaExp["dbFile"]
        print "dbModified", dbModified
        print "metaExp['dbModified']", metaExp["dbModified"]
        print "template", json.dumps(template)
        print "meta['template']", json.dumps(meta["template"])
    if metaExp["dbFile"] == dbPath and metaExp["dbModified"] == dbModified and template == meta["template"]:
        return meta # is the same experiment
    else:
        return None # previous experiment differs
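
A minimal usage sketch for getCached, assuming the surrounding helpers (getMeta, buildExamples, parseTemplateOptions) are importable; the database path, experiment name, option string, and metadata path below are hypothetical placeholders:

# Usage sketch (hypothetical paths and options)
cached = getCached("~/data/experiments.sqlite", "exampleExperiment",
                   experimentOptions="project=TCGA", meta="results/previous-meta.json",
                   verbose=True)
if cached is not None:
    print "Settings unchanged, reusing cached experiment"
else:
    print "No matching cached experiment, rebuilding"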
Example #2
def curvePoint(XPath, yPath, meta, resultPath, featureCount, classifier, classifierArgs, getCV, numFolds, verbose, parallel, preDispatch, randomize, metric):
    if isinstance(meta, basestring):
        meta = result.getMeta(meta)
    
    count = 0
    featureSet = []
    for featureName in meta["features"]: # features must be already analysed
        featureSet.append(meta["features"][featureName]["id"])
        count += 1
        if count > featureCount:
            break
    print "Testing", len(featureSet), "features", featureSet
    meta["curve"] = {"count":len(featureSet), "indices":featureSet}
    
    classifierNameMap = {
        "LinearSVC":"svm.LinearSVC",
        "svm.LinearSVC":"svm.LinearSVC",
        "ExtraTreesClassifier":"ensemble.ExtraTreesClassifier",
        "ensemble.ExtraTreesClassifier":"ensemble.ExtraTreesClassifier",
        "RLScore":"RLScore"
    }
    classifierName = classifierNameMap[classifier]
    classifier, classifierArgs = learn.getClassifier(classifierName, eval(classifierArgs))
    
    meta, results, extras, hiddenResults, hiddenDetails = learn.test(
        XPath, yPath, meta, resultPath, 
        classifier=classifier, classifierArgs=classifierArgs, getCV=eval(getCV), 
        numFolds=numFolds, verbose=verbose, parallel=parallel, preDispatch=preDispatch, 
        randomize=randomize, analyzeResults=False, 
        metric=metric, useFeatures=featureSet, reclassify=True, details=False)
    return [meta, results, extras, hiddenResults, hiddenDetails]
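
A sketch of how curvePoint might be called directly, assuming feature analysis has already populated meta["features"] and that getStratifiedKFoldCV is importable in curvePoint's module (classifierArgs and getCV are passed as strings because the function eval()s them); the file paths and parameter values are hypothetical:

# Usage sketch (hypothetical paths and parameter values)
meta, results, extras, hiddenResults, hiddenDetails = curvePoint(
    "cache/X.npy", "cache/y.npy", "results/base-meta.json", "results/point.json",
    featureCount=10, classifier="LinearSVC", classifierArgs="{'C':[0.1, 1.0, 10.0]}",
    getCV="getStratifiedKFoldCV", numFolds=5, verbose=1, parallel=1,
    preDispatch="2*n_jobs", randomize=False, metric="roc_auc")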
Example #3
def analyze(meta, dbPath=None, resultPath=None, verbose=False):
    meta = result.getMeta(meta)
    if dbPath == None:
        dbPath = settings.CGI_DB_PATH
    print "Analyzing", dbPath
    con = DB.connect(dbPath)
    result.sortFeatures(meta)
    features = meta["features"]
    count = 1
    numFeatures = len(features)
    nonSelected = []
    for featureName in features:
        if not isinstance(features[featureName], int):
            if verbose:
                print "Processing feature", featureName, str(count) + "/" + str(numFeatures)
            geneName = getGeneName(featureName)
            if geneName != None:
                mappings = getTermAnalysis(con, geneName, "disease")
                result.setValue(features[featureName], "CancerGeneIndex", mappings)
                mappings = getTermAnalysis(con, geneName, "drug")
                result.setValue(features[featureName], "CancerGeneDrug", mappings)
        else:
            geneName = getGeneName(featureName)
            if geneName != None:
                nonSelected.append(geneName)
        count += 1
    result.setValue(meta, "CancerGeneIndex", analyzeTermCoverage(features), "analysis")
    result.setValue(meta["analysis"], "non-selected", getCancerGeneCoverage(con, nonSelected), "CancerGeneIndex")
    if resultPath != None:
        result.saveMeta(meta, resultPath)
    return meta
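
A sketch of running analyze on an existing result file, assuming a local Cancer Gene Index database is configured via settings.CGI_DB_PATH (used when dbPath is None); the paths are hypothetical:

# Usage sketch (hypothetical paths; dbPath=None falls back to settings.CGI_DB_PATH)
meta = analyze("results/experiment-meta.json", dbPath=None,
               resultPath="results/experiment-meta-analyzed.json", verbose=True)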
Example #4
def greedyRLS(XPath, yPath, metaPath, fcount=5, scount=50, resultPath=None):
    X, Y = readAuto(XPath, yPath)
    meta = {}
    if metaPath != None:
        print "Loading metadata from", metaPath
        meta = result.getMeta(metaPath)
    X_train, X_hidden, Y_train, Y_hidden = hidden.split(X, Y, meta=meta) 
    #if "classes" in meta:
    #    print "Class distribution = ", getClassDistribution(y)

    #logrps = range(15, 25)
    logrps = range(15, 26)
    print "Training RLS"
    loopCount = 1
    best_perf = -1
    best_logrp = None
    best_scount = None
    for logrp in logrps:
        kf = KFold(len(Y_train), n_folds=fcount, indices=True, shuffle=True, random_state=77)
        perfs = []  # accumulate one performance curve per fold for this regularization parameter
        loopCount = 1  # fold counter, restarted for each regularization parameter
        for train, test in kf:
            print "------------ Processing fold", str(loopCount) + "/" + str(fcount), "------------"
            kwargs = {}
            kwargs['train_features'] = X_train[train]
            kwargs['train_labels'] = Y_train[train]
            kwargs['subsetsize'] = scount
            kwargs['regparam'] = 2.**logrp
            kwargs['bias'] = 1
            cb = CallbackFunction(X_train[test], Y_train[test])
            kwargs['callback_obj'] = cb
            rls = GreedyRLS.createLearner(**kwargs)
            rls.train()
            perfs.append(cb.perfs)
            loopCount += 1
            print "---------------------------------------------------"
        perfs = np.mean(perfs, axis=0)
        perf = perfs[-1]  # performance once the full subset of scount features has been selected
        sc = np.argmax(perfs)+1
        print "%f AUC, %d logrp, %d selected" %(perf, logrp, sc)
        if perf>best_perf:
            best_perf = perf
            best_logrp = logrp
            best_scount = sc
    kwargs = {}
    kwargs['train_features'] = X_train
    kwargs['train_labels'] = Y_train
    kwargs['subsetsize'] = scount
    kwargs['regparam'] = 2.**best_logrp
    kwargs['bias'] = 1
    cb = CallbackFunction(X_hidden, Y_hidden)
    kwargs['callback_obj'] = cb
    rls = GreedyRLS.createLearner(**kwargs)
    rls.train()
    perfs = cb.perfs
    selected = rls.selected
    model = rls.getModel()
    #if resultPath != None:
    #    saveResults(meta, resultPath, perfs, selected)
    return model, perfs, selected, best_logrp, best_scount
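
A sketch of a greedyRLS run, assuming the RLScore GreedyRLS learner and the readAuto/hidden helpers used above are importable; the feature-matrix and label paths are hypothetical:

# Usage sketch (hypothetical paths; fcount cross-validation folds, scount greedily selected features)
model, perfs, selected, best_logrp, best_scount = greedyRLS(
    "cache/X.npy", "cache/y.npy", "results/base-meta.json", fcount=5, scount=50)
print "Best log2 regularization parameter:", best_logrp
print "Selected feature indices:", selected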
Example #5
def saveResults(meta, resultPath, results, extras, bestIndex, analyze, hiddenResults=None, hiddenDetails=None, databaseCGI=None, reclassify=False, details=True):
    if extras == None:
        print "No detailed information for cross-validation"
        return
    if not os.path.exists(os.path.dirname(resultPath)):
        os.makedirs(os.path.dirname(resultPath))
    meta = result.getMeta(meta)
    # Add general results
    meta["results"] = {"best":results[bestIndex], "all":results}
    if hiddenResults != None:
        meta["results"]["hidden"] = hiddenResults
    # Insert detailed results
    if details:
        featureByIndex = result.getFeaturesByIndex(meta)
        if hiddenDetails != None:
            saveDetails(meta, hiddenDetails.get("predictions", None), hiddenDetails.get("importances", None), "hidden", featureByIndex, reclassify=reclassify)
        fold = 0
        for extra in extras:
            saveDetails(meta, extra.get("predictions", None), extra.get("importances", None), fold, featureByIndex, reclassify=reclassify)
            fold += 1
    else:
        if "examples" in meta:
            del meta["examples"]
        if "features" in meta:
            del meta["features"]
    
    # Analyze results
    if analyze:
        print "Analyzing results"
        meta = gene.analyze.analyze(meta, databaseCGI)              
    # Save results
    if resultPath != None:
        result.saveMeta(meta, resultPath)
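
A sketch of combining a learn.test run with saveResults, assuming the five return values shown in curvePoint above; picking bestIndex by mean cross-validation score and the output path are hypothetical:

# Usage sketch (hypothetical; results/extras/hiddenResults/hiddenDetails come from a learn.test run)
bestIndex = max(range(len(results)), key=lambda i: results[i]["mean"])
saveResults(meta, "results/experiment-meta.json", results, extras, bestIndex,
            analyze=False, hiddenResults=hiddenResults, hiddenDetails=hiddenDetails,
            reclassify=False, details=True)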
Example #6
def looRLS(XPath, yPath, metaPath):
    X, Y = readAuto(XPath, yPath)
    meta = {}
    if metaPath != None:
        print "Loading metadata from", metaPath
        meta = result.getMeta(metaPath)
    X_train, X_hidden, Y_train, Y_hidden = hidden.split(X, Y, meta=meta) 
    kwargs = {}
    kwargs['train_features'] = X_train
    kwargs['train_labels'] = Y_train
    kwargs['regparam'] = 1.0
    rls = RLS.createLearner(**kwargs)
    rls.train()
    bestperf = -1. 
    for logrp in range(5, 25):
        rp = 2. ** logrp
        rls.solve(rp)
        Ploo = rls.computeLOO()
        perf = cindex(Y_train, Ploo)
        print "Leave-one-out %f for lambda 2^%d" %(perf, logrp)
        if perf > bestperf:
            bestperf = perf
            bestlogrp = logrp
    rp = 2. ** bestlogrp
    print "Best leave-one-out %f for lambda 2^%d" %(bestperf, bestlogrp)
    rls.solve(rp)
    model = rls.getModel()
    P = model.predict(X_hidden)
    perf = cindex(Y_hidden, P)
    print "final performance: %f" %perf
Example #7
def saveResults(meta, resultPath, perfs, selected):
    if not os.path.exists(os.path.dirname(resultPath)):
        os.makedirs(os.path.dirname(resultPath))
    meta = result.getMeta(meta)
    featureByIndex = result.getFeaturesByIndex(meta)
    for foldIndex in range(len(selected)):
        ranks = selected[foldIndex]
        for featureRank in range(len(ranks)):
            featureIndex = ranks[featureRank]
            feature = result.getFeature(meta, featureIndex, featureByIndex)
            result.setValue(feature, foldIndex, featureRank, "ranks")
            result.setValue(feature, "sort", -sum(feature["ranks"].values()) / len(feature["ranks"]))
    result.saveMeta(meta, resultPath)
Example #8
def process(database, inputMetaPath, resultBaseDir, cutoff=50, verbose=3, parallel=1, 
            preDispatch='2*n_jobs', randomize=False, limit=1, debug=False,
            dummy=False, rerun=None, hideFinished=False, slurm=False):
    meta = result.getMeta(inputMetaPath)
    
    connection = batch.getConnection(slurm, debug)
    
    makeDir(resultBaseDir)
    cacheDir = makeDir(os.path.join(resultBaseDir, "cache"))
    resultDir = makeDir(os.path.join(resultBaseDir, "results"))
    jobDir = makeDir(os.path.join(resultBaseDir, "jobs"))

    #cachedMetaPath = os.path.join(cacheDir, "base.json")
    
    baseXPath, baseYPath, baseMetaPath = cache.getExperiment(
         experiment=meta["experiment"]["name"], experimentOptions=meta["experiment"]["options"], 
         database=database, writer="writeNumpyText", useCached=True, cacheDir=cacheDir)

    features = meta["features"]
    count = 0
    featureSet = []
    cls = meta["results"]["best"]
    paramSets = [x["params"] for x in meta["results"]["all"]]
    classifierArgs = {}
    for paramSet in paramSets:
        for key in paramSet:
            if not key in classifierArgs:
                classifierArgs[key] = []
            classifierArgs[key].append(paramSet[key])
    classifierNameMap = {
        "LinearSVC":"svm.LinearSVC",
        "svm.LinearSVC":"svm.LinearSVC",
        "ExtraTreesClassifier":"ensemble.ExtraTreesClassifier",
        "ensemble.ExtraTreesClassifier":"ensemble.ExtraTreesClassifier",
        "RLScore":"RLScore",
        "RFEWrapper":"svm.LinearSVC"
    }
    classifierName = classifierNameMap[cls["classifier"]]
    #classifier, classifierArgs = learn.getClassifier(classifierName, params)
    submitCount = 0
    sleepTime = 15
    for featureName in features:
        feature = features[featureName]
        batch.waitForJobs(limit, submitCount, connection, sleepTime)
        print "Processing feature", featureName
        print feature
        featureSet.append(feature["id"])
        jobName = "_".join([meta["experiment"]["name"], meta["template"]["project"], classifierName, "feature-" + str(feature["rank"])])
        pointResultPath = os.path.join(resultDir, jobName + ".json")
        print "Feature set", featureSet
        if len(featureSet) > 1:
#             hiddenResults = curvePoint(baseXPath, baseYPath, baseMetaPath, featureSet, pointResultPath, 
#                        classifier=classifier, classifierArgs=params, getCV=eval(cls["cv"]),
#                        numFolds=cls["folds"], verbose=verbose, parallel=parallel,
#                        preDispatch=preDispatch, randomize=randomize, metric=cls["metric"])[3]
            #results.append(hiddenResults)
            command = "python curvePoint.py"
            command +=  " -X " + baseXPath
            command +=  " -y " + baseYPath
            command +=  " -m " + inputMetaPath
            command +=  " -o " + pointResultPath
            command +=  " --cutoff " + str(count)
            command +=  " --classifier " + classifierName
            command +=  " --classifierArgs \"" + str(classifierArgs) + "\"" 
            command +=  " --iteratorCV " + cls["cv"]
            command +=  " --numFolds " + str(cls["folds"])
            command +=  " --verbose " + str(verbose)
            command +=  " --parallel " + str(parallel)
            command +=  " --preDispatch \"" + str(preDispatch) + "\""
            if randomize: 
                command +=  " --randomize "
            command +=  " --metric " + cls["metric"]
            
            if batch.submitJob(command, connection, jobDir, jobName, dummy, rerun, hideFinished):
                submitCount += 1
        count += 1
        if count > cutoff:
            break
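
A sketch of launching the per-feature curve jobs with process, assuming a batch or SLURM connection can be created as in the function body; the database path, input metadata, and result directory are hypothetical, and it is assumed that limit caps concurrently submitted jobs and that dummy=True prints the curvePoint.py commands instead of submitting them:

# Usage sketch (hypothetical paths; limit and dummy semantics are assumptions, see above)
process("~/data/database.sqlite", "results/base-meta.json", "results/curve",
        cutoff=50, verbose=3, parallel=1, limit=10, dummy=True, slurm=False)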
Example #9
def test(XPath, yPath, metaPath, resultPath, classifier, classifierArgs, 
         getCV=getStratifiedKFoldCV, numFolds=10, verbose=3, parallel=1, 
         preDispatch='2*n_jobs', randomize=False, analyzeResults=False,
         databaseCGI=None, metric="roc_auc", useFeatures=None, reclassify=False, details=True):
    X, y = readAuto(XPath, yPath, useFeatures=useFeatures)
    meta = {}
    if metaPath != None:
        meta = result.getMeta(metaPath)
    if "classes" in meta:
        print "Class distribution = ", getClassDistribution(y)
        if randomize:
            classes = meta["classes"].values()
            y = [random.choice(classes) for x in range(len(y))]
            print "Randomized class distribution = ", getClassDistribution(y)
    X_train, X_hidden, y_train, y_hidden = hidden.split(X, y, meta=meta)
    print "Sizes", [len(X_train), len(y_train)], [len(X_hidden), len(y_hidden)]

    print "Cross-validating for", numFolds, "folds"
    print "Args", classifierArgs
    cv = getCV(y_train, meta, numFolds=numFolds)
    if preDispatch.isdigit():
        preDispatch = int(preDispatch)
    scorer = getScorer(metric)
    search = ExtendedGridSearchCV(classifier(), [classifierArgs], refit=len(X_hidden) > 0, cv=cv, scoring=scorer, verbose=verbose, n_jobs=parallel, pre_dispatch=preDispatch)
    search.fit(X_train, y_train) 
    if hasattr(search, "best_estimator_"):
        print "----------------------------- Best Estimator -----------------------------------"
        print search.best_estimator_
        if hasattr(search.best_estimator_, "doRFE"):
            print "*** RFE ***"
            search.best_estimator_.doRFE(X_train, y_train)
    #print "--------------------------------------------------------------------------------"
    print "---------------------- Grid scores on development set --------------------------"
    results = []
    extras = None
    index = 0
    bestIndex = 0
    for params, mean_score, scores in search.grid_scores_:
        print scores
        print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
        results.append({"classifier":classifier.__name__, "cv":cv.__class__.__name__, "folds":numFolds,
                   "metric":metric,"scores":list(scores), 
                   "mean":float(mean_score), "std":float(scores.std() / 2), "params":params})
        if index == 0 or float(mean_score) > results[bestIndex]["mean"]:
            bestIndex = index
            if hasattr(search, "extras_"):
                extras = search.extras_[index]
        index += 1
    print "---------------------- Best scores on development set --------------------------"
    params, mean_score, scores = search.grid_scores_[bestIndex]
    print scores
    print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
    hiddenResults = None
    hiddenDetails = None
    if len(X_hidden) > 0:
        print "----------------------------- Classifying Hidden Set -----------------------------------"
        hiddenResults = {"classifier":search.best_estimator_.__class__.__name__, 
                         "score":search.score(X_hidden, y_hidden),
                         "metric":metric,
                         "params":search.best_params_}
        print "Score =", hiddenResults["score"], "(" + metric + ")"
        y_hidden_pred = search.predict(X_hidden)
        #print y_hidden_pred
        #print search.predict_proba(X_hidden)
        hiddenDetails = {"predictions":{i:x for i,x in enumerate(y_hidden_pred)}}
        if hasattr(search.best_estimator_, "feature_importances_"):
            hiddenDetails["importances"] = search.best_estimator_.feature_importances_
        try:
            print classification_report(y_hidden, y_hidden_pred)
        except ValueError, e:
            print "ValueError in classification_report:", e