Example #1
def getModelFileName(config, modelname):
    prefix = None
    if isRegression(config['problem']):
        prefix = "regressor"
    elif isClassification(config['problem']):
        prefix = "classifier"
    modelFileName = setFile(getModelsDir(config), "{0}-{1}.p".format(prefix,modelname))
    
    return modelFileName
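A minimal usage sketch, assuming the project's setFile and getModelsDir helpers behave like path joins and the config layout shown in the other examples:

# Hypothetical illustration of the naming convention above.
config = {"problem": "classification"}              # assumed config layout
modelFileName = getModelFileName(config, "rf")      # -> "<models dir>/classifier-rf.p"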
Example #2
def testModel(modelname, estimator, X_test, config):  
    info("Testing a {0} estimator".format(modelname), ind=0)
    info("X data is {0}".format(getDim(X_test)), ind=2)
    
    problemType = config['problem']
    results = {"good": True, "label": None, "prob": None, "pred": None}
    
    if isinstance(estimator, dict):
        estimator = estimator['estimator']
        
    
    if estimator is None:
        error("The {0} estimator is NULL".format(modelname))
        results['good'] = False
        return results
    
    
    if isClassification(problemType):
        info("Predicting classification labels/classes for {0}".format(modelname), ind=4)
        try:
            results['label'] = estimator.predict(X_test)
        except Exception:
            results['good'] = False
            error("There is a problem getting labels for {0}".format(modelname), ind=4)
        
        info("Predicting classification probabilities for {0}".format(modelname), ind=4)
        try:
            proba = estimator.predict_proba(X_test)
            results['prob'] = proba[:,1]
        except Exception:
            results['good'] = False
            error("There is a problem getting probabilities for {0}".format(modelname), ind=4)
            

    if isRegression(problemType):
        info("Predicting regression score/output for {0}".format(modelname), ind=4)
        try:
            results['pred'] = estimator.predict(X_test)
        except Exception:
            results['good'] = False
            error("There is a problem getting prediction for {0}".format(modelname), ind=4)


    if results['good']:
        info("Everything looks good for the {0} estimator".format(modelname), ind=4)
    else:        
        info("There is a problem with the {0} estimator".format(modelname), ind=4)


    return results
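A usage sketch with a fitted scikit-learn classifier; the config layout is assumed from the other examples, and the returned keys follow the function above:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression().fit(X_train, y_train)

config = {"problem": "classification"}    # assumed config layout
results = testModel("logistic", clf, X_test, config)
# results["label"]: predicted classes, results["prob"]: probability of class 1,
# results["good"]: False if either prediction step failed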
Example #3
def getParamDist(config, modelname, nfeatures = None):
    info("Getting parameter distributions for {0}".format(modelname), ind=2)
    
    param_dist = None
    epsilon = 0.000001
    
    problemType = config['problem']
    
    treeParams = {"max_depth": [2, 4, 6, None],
                  "max_features": ['auto', 'sqrt', 'log2', None],
                  "min_impurity_decrease": sp_randfloat(0.0, 1-epsilon),
                  "min_samples_leaf": sp_randint(1, 10)}

    

    ###########################################################################
    ## rf, extratrees, dtree, gbm
    ###########################################################################
    if modelname in ["rf", "extratrees", "dtree", "gbm"]:
        param_dist = treeParams
        if modelname == "rf" or modelname == "extratrees":
            param_dist["bootstrap"] = [True, False]
        if modelname == "gbm":
            param_dist["learning_rate"] = sp_randfloat(0.01, 0.5)

        if modelname in ["rf", "extratrees", "dtree"]:
            if isClassification(problemType):
                param_dist["criterion"] = ["gini", "entropy"]
            if isRegression(problemType):
                param_dist["criterion"] = ["mae", "mse"]
        if modelname == "gbm":
            if isClassification(problemType):
                param_dist["criterion"] = ['mse', 'friedman_mse']
                param_dist["loss"] = ['deviance', 'exponential']
            if isRegression(problemType):
                param_dist["criterion"] = ["friedman_mse"]
                param_dist["loss"] = ['ls']
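sp_randint and sp_randfloat are not defined in this snippet; they look like thin wrappers around scipy's distributions. One plausible definition (an assumption, not the project's actual code):

from scipy.stats import randint, uniform

def sp_randint(low, high):
    # Uniform integer distribution over [low, high), as scipy.stats.randint defines it.
    return randint(low, high)

def sp_randfloat(low, high):
    # Uniform float distribution over [low, high]; scipy.stats.uniform takes (loc, scale).
    return uniform(loc=low, scale=high - low)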
Example #4
def getModels(config, level):
    info("Getting Models For Level {0}".format(level), ind=0)
    problemType = config['problem']

    if isClassification(problemType):
        models0 = ["xgboost", "logistic"]
        models1 = ["rf", "nn", "svmnulinear", "gbm"]
        models2 = [
            "extratrees", "sgd", "nb", "lda", "kneighbors", "svmepslinear"
        ]
        models3 = [
            "passagg", "gaussproc", "qda", "nbbern", "nbmulti", "dtree",
            "rneighbors", "svmlin", "svmnu", "adaboost", "svmnupoly",
            "svmepspoly", "svmnusigmoid", "svmepssigmoid", "svmnurbf",
            "svmepsrbf"
        ]
        if level == 0:
            models = models0
        elif level == 1:
            models = models0 + models1
        elif level == 2:
            models = models0 + models1 + models2
        else:
            models = models0 + models1 + models2 + models3

    if isRegression(problemType):
        models0 = ["xgboost", "linear"]
        models1 = ["xgboost", "rf", "linear", "nn", "svm", "gbm"]
        models2 = ["extratrees", "sgd", "earth", "kernelridge", "kneighbors"]
        models3 = [
            "passagg", "gaussproc", "lasso", "ridge", "elasticnet",
            "bayesridge", "huber", "theilsen", "ransac", "dtree", "rneighbors",
            "svmlin", "svmnu", "adaboost"
        ]
        if level == 0:
            models = models0
        elif level == 1:
            models = models0 + models1
        elif level == 2:
            models = models0 + models1 + models2
        else:
            models = models0 + models1 + models2 + models3

    info("Using the following models: {0}".format(models), 2)
    return models
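Both branches use the same cumulative-tier pattern for the level logic; a compact equivalent (a sketch, not the original code):

def pickModels(tiers, level):
    # tiers = [models0, models1, models2, models3]; any level >= 3 selects everything.
    selected = []
    for tier in tiers[:min(level, len(tiers) - 1) + 1]:
        selected += tier
    return selected

# pickModels([models0, models1, models2, models3], 1) == models0 + models1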
Example #5
def plotResults(perfs, y_truth, config):
    info("Making Performance Plots", ind=0)

    outdir = getPlotsDir(config)
    performanceConfig = config['performance']
    ext = performanceConfig['ext']
    isPdf = ext == 'pdf'
    isMultipage = performanceConfig['multipage']
    if isMultipage and isPdf:
        pdfname = setFile(outdir, 'results.pdf')
        info("Saving all performance plots to {0}".format(pdfname), ind=2)
        pp = PdfPages(pdfname)
    else:
        info("Saving all performance plots individually as {0}".format(ext),
             ind=2)
        pp = None

    badModels = [x for x in perfs.keys() if len(perfs[x]) == 0]
    for modelname in badModels:
        info("Not plotting {0}".format(modelname))
        del perfs[modelname]

    if isClassification(config['problem']):
        plotKappa(perfs, outdir, ext, pp)
        plotPrecision(perfs, outdir, ext, pp)
        plotRecall(perfs, outdir, ext, pp)
        plotLogLoss(perfs, outdir, ext, pp)
        plotAccuracy(perfs, outdir, ext, pp)
        plotPrecisionRecall(perfs, outdir, ext, pp)
        plotROC(perfs, outdir, ext, pp)
        plotConfusionMatrix(perfs, config, outdir, ext, pp)

    if isRegression(config['problem']):
        plotMAE(perfs, outdir, ext, pp)
        plotMSE(perfs, outdir, ext, pp)
        plotExplainedVariance(perfs, outdir, ext, pp)
        plotR2(perfs, outdir, ext, pp)
        plotResiduals(perfs, outdir, ext, pp)

    if isMultipage and isPdf:
        info("Closing multipage pdf", ind=2)
        pp.savefig()
        pp.close()
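The pp handle is a matplotlib PdfPages object that the individual plot functions are expected to write into; a sketch of how such a helper might save a figure either into the multipage PDF or as its own file (the helper name is hypothetical):

import os
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

def savePlot(fig, name, outdir, ext, pp=None):
    # Hypothetical helper: append to the open multipage PDF if there is one,
    # otherwise write an individual file such as "<outdir>/<name>.png".
    if pp is not None:
        pp.savefig(fig)
    else:
        fig.savefig(os.path.join(outdir, "{0}.{1}".format(name, ext)))
    plt.close(fig)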
Example #6
def getModelPerformance(y_truth, testResults, config):
    info("Getting model performance", ind=0)

    problemType = config['problem']
    results = {}

    if isClassification(problemType):
        try:
            results = getClassifierPerformance(y_truth, testResults)
        except Exception:
            error(
                "There was a problem getting classification performance data",
                ind=4)
            results = {}

    if isRegression(problemType):
        try:
            results = getRegressionPerformance(y_truth, testResults)
        except Exception:
            error("There was a problem getting regression performance data",
                  ind=4)
            results = {}

    return results
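getClassifierPerformance is a project helper; a sketch of the kind of metrics it presumably computes from the testModel() output of Example #2 (an assumption, using scikit-learn metrics):

from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

def classifierPerformanceSketch(y_truth, testResults):
    # Assumption: metrics are computed from the "label" and "prob" keys that
    # testModel() fills in for classification problems.
    return {
        "accuracy": accuracy_score(y_truth, testResults["label"]),
        "rocauc": roc_auc_score(y_truth, testResults["prob"]),
        "logloss": log_loss(y_truth, testResults["prob"]),
    }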
Example #7
def formatData(trainData, testData, config):
    info('Formatting training data of size ' + getDim(trainData), ind=0)
    info('Formatting testing data of size ' + getDim(testData), ind=0)

    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    positiveTarget = targetConfig['positive']
    targetNAstrategy = targetConfig['NAstrategy']
    featureConfig = config['feature']
    featureNAstrategy = featureConfig['NAstrategy']

    if not isColumn(trainData, targetcol):
        raise ValueError("Target column", targetcol, "is not a valid column.")

    # 1) Get problem type
    targetData = trainData[targetcol]
    if config.get('problem'):
        problemType = config['problem']
    else:
        problemType = getProblemType(targetData)
        config['problem'] = problemType

    # 2) format target based on what we want
    info('Formatting target', ind=1)
    if isClassification(problemType):
        convertToBinaryInt(trainData, targetcol, positiveTarget)
        if isColumn(testData, targetcol):
            convertToBinaryInt(testData, targetcol, positiveTarget)
    if isRegression(problemType):
        info('Not formatting target since it is regression', ind=1)

    # 3) replace NA
    info('Replace NA in data', ind=1)
    info('Feature NA strategy: {0}'.format(featureNAstrategy), ind=2)
    replaceTargetNA(trainData, targetcol, targetNAstrategy)
    replaceFeatureNA(trainData, targetcol, featureNAstrategy)
    if isColumn(testData, targetcol):
        replaceTargetNA(testData, targetcol, targetNAstrategy)
    replaceFeatureNA(testData, targetcol, featureNAstrategy)

    # 4) drop columns we don't need
    dropData(trainData, config)
    dropData(testData, config)

    # 5) format remaining data to numeric
    info('Formatting features to numeric', ind=1)
    convertCategoricalToNumeric(trainData, targetcol)
    convertCategoricalToNumeric(testData, targetcol)
    info('Post formatting the training data is now ' + getDim(trainData),
         ind=2)
    info('Post formatting the testing data is now ' + getDim(testData), ind=2)

    #pddata.drop([colname], axis = 1, inplace = True)
    #pddata = pddata.join(expData)

    # 6) remove low variance features
    info('Remove low variance features in data', ind=1)

    info('Finished formatting data', ind=0)

    return trainData, testData
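A usage sketch for the train/test variant; the config keys follow the lookups at the top of the function, but the NAstrategy values shown are guesses and the project's dropData() may read additional keys not shown:

import pandas as pd

config = {
    "target": {"colname": "label", "positive": "yes", "NAstrategy": "mode"},   # assumed values
    "feature": {"NAstrategy": "median"},                                        # assumed values
}
train = pd.DataFrame({"label": ["yes", "no", "yes"], "f1": [1.0, None, 3.0]})
test = pd.DataFrame({"f1": [2.0, 4.0]})
train, test = formatData(train, test, config)    # also fills in config["problem"]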
Example #8
def formatData(pddf, config):
    info('Formatting data of size ' + getDim(pddf), ind=0)

    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    positiveTarget = targetConfig['positive']
    targetNAstrategy = targetConfig['NAstrategy']
    featureConfig = config['feature']
    featureNAstrategy = featureConfig['NAstrategy']

    if not isColumn(pddf, targetcol):
        raise ValueError("Target column", targetcol, "is not a valid column.")

    # 1) Get problem type
    targetData = pddf[targetcol]
    if config.get('problem'):
        problemType = config['problem']
    else:
        problemType = getProblemType(targetData)
        config['problem'] = problemType

    # 2) format target based on what we want
    info('Formatting target', ind=2)
    if isClassification(problemType):
        convertToBinaryInt(pddf, targetcol, positiveTarget)
    if isRegression(problemType):
        info('Not formatting target since it is regression', ind=1)

    # 3) replace NA
    info('Replace NA in data', ind=2)
    replaceTargetNA(pddf, targetcol, targetNAstrategy)
    replaceFeatureNA(pddf, targetcol, featureNAstrategy)

    # 4) remove low variance data
    info('Remove low variance in data', ind=2)

    # 5) drop columns we don't need
    info('Analyze data for possible drops', ind=2)
    analyzeColumns(pddf, config)
    dropData(pddf, config)
    info('Post column data the data is now ' + getDim(pddf), ind=2)

    # 6) label and one-hot encode data
    info('Label encode training data to numeric', ind=2)
    pddf, encodedCatData, labelEncoders = getEncodedData(pddf)
    info('Hot encode training data to sparse data frame', ind=1)
    encodedData = getHotEncodedData(encodedCatData, labelEncoders)
    info('Join training data together', ind=2)
    pddf = pddf.join(encodedData)
    info('Post formatting the data is now ' + getDim(pddf), ind=2)

    # 7) replace low variance
    info('Remove low variance features in data', ind=2)
    if isClassification(problemType):
        info('Classification is To do!', ind=4)
    if isRegression(problemType):
        info('Not removing any features since it is regression', ind=1)

    # 8) replace NA (if any remain)
    info('Replace NA (if any) in data', ind=2)
    replaceTargetNA(pddf, targetcol, targetNAstrategy)
    replaceFeatureNA(pddf, targetcol, featureNAstrategy)
    if sum(pddf.isnull().any()) > 0:
        error("There are still NA entries in the dataset!", ind=4)

    info('Finished formatting data. Data is now ' + getDim(pddf), ind=2)

    return pddf
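getEncodedData and getHotEncodedData are project helpers; a pandas sketch of the equivalent label/one-hot encoding step (an assumption about what they do, not their implementation):

import pandas as pd

def hotEncodeSketch(pddf, targetcol):
    # One-hot encode the non-target object (categorical) columns and join the
    # resulting indicator columns back onto the frame.
    catcols = [c for c in pddf.columns
               if c != targetcol and pddf[c].dtype == object]
    dummies = pd.get_dummies(pddf[catcols])
    return pddf.drop(columns=catcols).join(dummies)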
Example #9
def tuneModel(modelname, estimator, params, X_train, y_train, config):  
    info("Tuning a {0} estimator".format(modelname), ind=0)
    
    if estimator is None or params is None:
        error("There is no estimator with parameters information.", ind=2)
        return {"estimator": None, "params": None, "cv": None}

    problemType    = config['problem']
    try:
        modelData = getModelData(config, modelname)
    except Exception:
        error("There is no model parameter data for the {0} estimator".format(modelname))
        modelData = {}

    if isClassification(problemType):
        scorers = ["accuracy", "average_precision", "f1", "f1_micro",
                   "f1_macro", "f1_weighted", "f1_samples", "neg_log_loss",
                   "precision", "recall", "roc_auc"]
        scorer = "roc_auc"
    
    if isClustering(problemType):
        scorers = ["adjusted_mutual_info_score", "adjusted_rand_score",
                   "completeness_score", "fowlkes_mallows_score",
                   "homogeneity_score", "mutual_info_score",
                   "normalized_mutual_info_score", "v_measure_score"]
        scorer = "adjusted_mutual_info_score"
    
    if isRegression(problemType):
        scorers = ["explained_variance", "neg_mean_absolute_error",
                   "neg_mean_squared_error", "neg_mean_squared_log_error",
                   "neg_median_absolute_error", "r2"]
        scorer = "neg_mean_absolute_error"

    if scorer not in scorers:
        raise ValueError("Scorer {0} is not allowed".format(scorer))

    searchType = "random"    
    if searchType == "grid":
        param_grid = params['grid']
        tuneEstimator = GridSearchCV(estimator, param_grid=param_grid, cv=2,
                                     scoring=scorer, verbose=1)
    elif searchType == "random":        
        n_iter_search = modelData.get('iter')
        if n_iter_search is None:
            n_iter_search = 10
        param_dist = params['dist']
        tuneEstimator = RandomizedSearchCV(estimator, param_distributions=param_dist,
                                           cv=2, n_iter=n_iter_search,
                                           scoring=scorer, verbose=1, n_jobs=-1,
                                           return_train_score=True)
    else:
        raise ValueError("Search type {0} is not allowed".format(searchType))


    info("Running {0} parameter search".format(searchType), ind=2)        
    tuneEstimator.fit(X_train, y_train)
    bestEstimator = tuneEstimator.best_estimator_        
    bestScore     = tuneEstimator.best_score_
    bestParams    = tuneEstimator.best_params_
    cvResults     = tuneEstimator.cv_results_
    cvScores      = cvResults['mean_test_score']
    fitTimes      = cvResults['mean_fit_time']

    info("Tested {0} Parameter Sets".format(len(fitTimes)), ind=4)
    info("CV Fit Time Info (Mean,Std): ({0} , {1})".format(round(fitTimes.mean(),1), round(fitTimes.std(),1)), ind=4)
    info("Best Score                 : {0}".format(round(bestScore, 3)), ind=4)
    info("CV Test Scores (Mean,Std)  : ({0} , {1})".format(round(cvScores.mean(),1), round(cvScores.std(),1)), ind=4)
    info("Best Parameters", ind=4)
    for paramName, paramVal in bestParams.items():
        info("Param: {0} = {1}".format(paramName, paramVal), ind=6)
    

    return {"estimator": bestEstimator, "params": bestParams, "cv": cvResults}
Example #10
def getModel(config, modelname):
    info("Getting {0} Model".format(modelname), ind=0)

    problemType = config['problem']
    modelData = getModelData(config, modelname)
    modelParams = modelData.get('params')
    retval = None

    ###########################################################################
    # Classification
    ###########################################################################
    if isClassification(problemType):
        if modelname == "logistic":
            retval = classifier(modelname, LogisticRegression(), modelParams)
        if modelname == "sgd":
            retval = classifier(modelname, SGDClassifier(), modelParams)
        if modelname == "passagg":
            retval = classifier(modelname, PassiveAggressiveClassifier(),
                                modelParams)

        if modelname == "mlp":
            retval = classifier(modelname, MLPClassifier(), modelParams)

        if modelname == "xgboost":
            retval = classifier(modelname, XGBClassifier(), modelParams)

        if modelname == "gaussproc":
            retval = classifier(modelname, GaussianProcessClassifier(),
                                modelParams)

        if modelname == "lda":
            retval = classifier(modelname, LinearDiscriminantAnalysis(),
                                modelParams)
        if modelname == "qda":
            retval = classifier(modelname, QuadraticDiscriminantAnalysis(),
                                modelParams)

        if modelname == "nb":
            retval = classifier(modelname, GaussianNB(), modelParams)
        if modelname == "nbbern":
            retval = classifier(modelname, BernoulliNB(), modelParams)
        if modelname == "nbmulti":
            retval = classifier(modelname, MultinomialNB(), modelParams)

        if modelname == "dtree":
            retval = classifier(modelname, DecisionTreeClassifier(),
                                modelParams)

        if modelname == "kneighbors":
            retval = classifier(modelname, KNeighborsClassifier(), modelParams)
        if modelname == "rneighbors":
            retval = classifier(modelname, RadiusNeighborsClassifier(),
                                modelParams)

        if modelname == "svmlin":
            retval = classifier(modelname, LinearSVC(), modelParams)
        if modelname == "svmnupoly":
            retval = classifier(modelname, NuSVC(), modelParams)
        if modelname == "svmnulinear":
            retval = classifier(modelname, NuSVC(), modelParams)
        if modelname == "svmnusigmoid":
            retval = classifier(modelname, NuSVC(), modelParams)
        if modelname == "svmnurbf":
            retval = classifier(modelname, NuSVC(), modelParams)
        if modelname == "svmepspoly":
            retval = classifier(modelname, SVC(), modelParams)
        if modelname == "svmepslinear":
            retval = classifier(modelname, SVC(), modelParams)
        if modelname == "svmepssigmoid":
            retval = classifier(modelname, SVC(), modelParams)
        if modelname == "svmepsrbf":
            retval = classifier(modelname, SVC(), modelParams)

        if modelname == "rf":
            retval = classifier(modelname, RandomForestClassifier(),
                                modelParams)
        if modelname == "extratrees":
            retval = classifier(modelname, ExtraTreesClassifier(), modelParams)
        if modelname == "adaboost":
            retval = classifier(modelname, AdaBoostClassifier(), modelParams)
        if modelname == "gbm":
            retval = classifier(modelname, GradientBoostingClassifier(),
                                modelParams)

        if modelname == "tpot":
            retval = classifier(modelname, TPOTClassifier(), modelParams)

        #######################################################################
        # External (lightning)
        #######################################################################
        if modelname == "lightning":
            retval = external.extlightning.createLightningClassifier(
                modelParams)

    ###########################################################################
    # Regression
    ###########################################################################
    if isRegression(problemType):
        if modelname == "linear":
            retval = classifier(modelname, LinearRegression(), modelParams)
        if modelname == "ridge":
            retval = classifier(modelname, Ridge(), modelParams)
        if modelname == "lasso":
            retval = classifier(modelname, Lasso(), modelParams)
        if modelname == "elasticnet":
            retval = classifier(modelname, ElasticNet(), modelParams)
        if modelname == "omp":
            retval = classifier(modelname, OrthogonalMatchingPursuit(),
                                modelParams)
        if modelname == "bayesridge":
            retval = classifier(modelname, BayesianRidge(), modelParams)
        if modelname == "ard":
            retval = classifier(modelname, ARDRegression(), modelParams)
        if modelname == "sgd":
            retval = classifier(modelname, SGDRegressor(), modelParams)
        if modelname == "passagg":
            retval = classifier(modelname, PassiveAggressiveRegressor(),
                                modelParams)
        if modelname == "perceptron":
            retval = None
        if modelname == "huber":
            retval = classifier(modelname, HuberRegressor(), modelParams)
        if modelname == "theilsen":
            retval = classifier(modelname, TheilSenRegressor(), modelParams)
        if modelname == "ransac":
            retval = classifier(modelname, RANSACRegressor(), modelParams)

        if modelname == "mlp":
            retval = classifier(modelname, MLPRegressor(), modelParams)

        if modelname == "xgboost":
            retval = classifier(modelname, XGBRegressor(), modelParams)

        if modelname == "gaussproc":
            retval = classifier(modelname, GaussianProcessRegressor(),
                                modelParams)

        if modelname == "dtree":
            retval = classifier(modelname, DecisionTreeRegressor(),
                                modelParams)

        if modelname == "kneighbors":
            retval = classifier(modelname, KNeighborsRegressor(), modelParams)
        if modelname == "rneighbors":
            retval = classifier(modelname, RadiusNeighborsRegressor(),
                                modelParams)

        if modelname == "svmlin":
            retval = classifier(modelname, LinearSVR(), modelParams)
        if modelname == "svmnupoly":
            retval = classifier(modelname, NuSVR(), modelParams)
        if modelname == "svmnulinear":
            retval = classifier(modelname, NuSVR(), modelParams)
        if modelname == "svmnusigmoid":
            retval = classifier(modelname, NuSVR(), modelParams)
        if modelname == "svmnurbf":
            retval = classifier(modelname, NuSVR(), modelParams)
        if modelname == "svmepspoly":
            retval = classifier(modelname, SVR(), modelParams)
        if modelname == "svmepslinear":
            retval = classifier(modelname, SVR(), modelParams)
        if modelname == "svmepssigmoid":
            retval = classifier(modelname, SVR(), modelParams)
        if modelname == "svmepsrbf":
            retval = classifier(modelname, SVR(), modelParams)

        if modelname == "rf":
            retval = classifier(modelname, RandomForestRegressor(),
                                modelParams)
        if modelname == "extratrees":
            retval = classifier(modelname, ExtraTreesRegressor(), modelParams)
        if modelname == "adaboost":
            retval = classifier(modelname, AdaBoostRegressor(), modelParams)
        if modelname == "gbm":
            retval = classifier(modelname, GradientBoostingRegressor(),
                                modelParams)

        if modelname == "isotonic":
            retval = classifier(modelname, IsotonicRegression(), modelParams)

        if modelname == "earth":
            retval = classifier(modelname, Earth(), modelParams)

        if modelname == "symbolic":
            retval = classifier(modelname, SymbolicRegressor(), modelParams)

        if modelname == "tpot":
            retval = classifier(modelname, TPOTRegressor(), modelParams)

    if retval is None:
        raise ValueError(
            "No model with name {0} was created".format(modelname))

    model = retval.get()

    return model
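The long if-chain is essentially a name-to-estimator lookup; a registry-style sketch of the same dispatch for a few of the classification entries (the table itself is illustrative, and classifier() is the project's wrapper used above):

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Illustrative subset of the classification table above.
CLASSIFIER_FACTORIES = {
    "logistic": LogisticRegression,
    "rf": RandomForestClassifier,
    "gbm": GradientBoostingClassifier,
}

def getClassifierSketch(modelname, modelParams):
    factory = CLASSIFIER_FACTORIES.get(modelname)
    if factory is None:
        raise ValueError("No model with name {0} was created".format(modelname))
    return classifier(modelname, factory(), modelParams)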