Example no. 1
def gridsearch(x, y, cv):
    scoring = {
        'auc_score': 'roc_auc',
        'accuracy': 'accuracy',
        'scores_p_1': 'precision',
        'scores_r_1': 'recall',
        'scores_f_1_1': 'f1',
        'scores_p_0': make_scorer(precision_0),
        'scores_r_0': make_scorer(recall_0),
        'scores_f_1_0': make_scorer(f1_0),
        'mcc': make_scorer(matthews_corrcoef),
        'precision_micro': 'precision_micro',
        'precision_macro': 'precision_macro',
        'recall_macro': 'recall_macro',
        'recall_micro': 'recall_micro',
        'f1_macro': 'f1_macro',
        'f1_micro': 'f1_micro'
    }
    grid_search = GridSearchCV(
        SVC(kernel='rbf', probability=True),
        param_grid={
            'C': [1000, 500, 250, 100, 50, 25, 1, 0.1, 0.01, 0.001, 0.0001],
            'gamma': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
        },
        scoring=scoring,
        cv=cv,
        n_jobs=40,
        refit='auc_score',
        verbose=2)
    grid_search.fit(x, y)
    return grid_search
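
The helpers precision_0, recall_0 and f1_0 are not defined in this snippet. A minimal sketch of what they presumably compute (precision, recall and F1 for the negative class, i.e. pos_label=0); the names and the pos_label choice are assumptions:

from sklearn.metrics import precision_score, recall_score, f1_score

# Assumed helpers: score the negative class (label 0) so it can be tracked
# alongside the built-in scorers for the positive class.
def precision_0(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label=0)

def recall_0(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label=0)

def f1_0(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label=0)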
Example no. 2
def gridsearch_linear(x, y, cv):
    # 1000, 500, 200, 100, 50,20, 10, 2, 1, 0.2, 0.5,0.01, 0.02, 0.05, 0.001
    '''
    param_grid={'C': [1000, 500, 200, 100, 50,
                                                 20, 10, 2, 1, 0.2, 0.5,
                                                 0.01, 0.02, 0.05, 0.001],
                                           'gamma': [1000, 500, 200, 100,
                                                     50, 20, 10, 5, 2, 1,
                                                     0.2, 0.5, 0.01, 0.02,
                                                     0.05, 0.001, 0.0001]},'''
    scoring = {
        'auc_score': 'roc_auc',
        'accuracy': 'accuracy',
        'scores_p_1': 'precision',
        'scores_r_1': 'recall',
        'scores_f_1_1': 'f1',
        'scores_p_0': make_scorer(precision_0),
        'scores_r_0': make_scorer(recall_0),
        'scores_f_1_0': make_scorer(f1_0),
        'mcc': make_scorer(matthews_corrcoef),
        'precision_micro': 'precision_micro',
        'precision_macro': 'precision_macro',
        'recall_macro': 'recall_macro',
        'recall_micro': 'recall_micro',
        'f1_macro': 'f1_macro',
        'f1_micro': 'f1_micro'
    }
    grid_search = GridSearchCV(LinearSVC(max_iter=1000),
                               param_grid={
                                   'penalty': ['l2'],
                                   'C': [
                                       1000, 500, 200, 100, 50, 20, 10, 2, 1,
                                       0.2, 0.5, 0.01, 0.02, 0.05, 0.001
                                   ]
                               },
                               scoring=scoring,
                               cv=cv,
                               n_jobs=40,
                               refit='auc_score',
                               verbose=2)
    '''grid_search = GridSearchCV(LinearSVC(max_iter=1000),
                               param_grid={ 'penalty' : ['l2'],
                                            'C': [1000, 500, 200, 100, 50,
                                                 20, 10, 2, 1, 0.2, 0.5,
                                                 0.01, 0.02, 0.05, 0.001]},
                               scoring={'accuracy','roc_auc'}, cv=cv, n_jobs=-1, refit='accuracy')
    grid_search = GridSearchCV(SVC(kernel='rbf', cache_size=2000, probability=True),
                               param_grid={'C': [10000, 5000, 1],
                                           'gamma': ['scale']},
                               scoring={'accuracy','roc_auc'}, cv=cv, n_jobs=-1, refit='accuracy')'''
    grid_search.fit(x, y)
    return grid_search
Example no. 3
def scale_svd_rf_pipe():
    from h2o.transforms.decomposition import H2OSVD

    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    # build  transformation pipeline using sklearn's Pipeline and H2OSVD
    pipe = Pipeline([("standardize", H2OScaler()), ("svd", H2OSVD()),
                     ("rf", H2ORandomForestEstimator())])

    params = {
        "standardize__center": [True, False],
        "standardize__scale": [True, False],
        "svd__nv": [2, 3],
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
        "svd__transform": ["none", "standardize"],
    }

    custom_cv = H2OKFold(arrests, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe,
                                       params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(arrests[1:], arrests[0])
    print(random_search.best_estimator_)
def scale_pca_rf_pipe_new_import():
  from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
  iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

  # build transformation pipeline using sklearn's Pipeline and H2O transforms
  pipe = Pipeline([
                    ("standardize", H2OScaler()),
                    ("pca", H2OPrincipalComponentAnalysisEstimator().init_for_pipeline()),
                    ("rf", H2ORandomForestEstimator())
                  ])

  params = {"standardize__center":    [True, False],             # Parameters to test
            "standardize__scale":     [True, False],
            "pca__k":                 randint(2, iris[1:].shape[1]),
            "rf__ntrees":             randint(50,60),
            "rf__max_depth":          randint(4,8),
            "rf__min_rows":           randint(5,10),
            "pca__transform":         ["none", "standardize"],
            }

  custom_cv = H2OKFold(iris, n_folds=5, seed=42)
  random_search = RandomizedSearchCV(pipe, 
                                     params,
                                     n_iter=5,
                                     scoring=make_scorer(h2o_r2_score),
                                     cv=custom_cv,
                                     random_state=42,
                                     n_jobs=1)

  random_search.fit(iris[1:],iris[0])

  print(random_search.best_estimator_)
Example no. 5
def selectTasks():
    while True:
        print "\nSelect the Model for classification:"
        print "Enter 1 : Logistic Regression"
        print "Enter 2 : Naive Bayes"
        print "Enter 3 : Support Vector Machne Model using SKlearn library"
        print "Enter 4 : Random Forest Model using SKlearn library"
        print "Enter 5 : To exit!!!!"
        options = {
            1: ModelLogisticRegression,
            2: NaiveBayes,
            3: svm.SVC,
            4: RandomForestClassifier
        }
        print "Enter Your Choice >>> "
        x = input()
        if x == 5:
            break
        elif x == 4:
            print "Classification on Random Forest Model using SKLearn Library"
            runModel(options[x](n_jobs=2, random_state=0))
        elif x == 3:
            print "Classification on Support Vector Machine Model using SKLearn Library"
            parameters = {
                'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
                'kernel': ['linear'],
                'random_state': [1]
            }
            svc = svm.SVC(kernel='linear', probability=True, random_state=0)
            roc_auc_scorer = make_scorer(roc_auc_score)
            modelObj = GridSearchCV(svc, parameters, scoring=roc_auc_scorer)
            runModel(modelObj)
        else:
            print "Classification on " + MODEL[x]
            runModel(options[x](PARAMS[x]))
Example no. 6
    def make_scorer_for_search(self, kwargs):
        if self.check_should_run_search():
            custom_scorer = make_scorer(
                self.search_scoring.score,
                greater_is_better=self.search_scoring.greater_is_better,
                needs_proba=self.search_scoring.needs_proba,
                needs_threshold=False,
                **kwargs)
        else:
            custom_scorer = make_scorer(
                self.scorer.score,
                greater_is_better=self.scorer.greater_is_better,
                needs_proba=self.scorer.needs_proba,
                needs_threshold=False,
                **kwargs)
        return custom_scorer
    def report(self, pipeline: AbstractPipeline):

        report_dict = defaultdict(list)

        for model in pipeline.get_models():
            model_name = model.short_name

            custom_scorer = make_scorer(
                model.scorer.score,
                greater_is_better=model.scorer.greater_is_better,
                needs_proba=model.scorer.needs_proba,
                needs_threshold=False)

            try:
                cv = cross_val_score(model.best_model,
                                     pipeline.train,
                                     pipeline.train_y,
                                     verbose=self.verbose,
                                     scoring=custom_scorer,
                                     cv=self.cv_num,
                                     n_jobs=-1)
                report_dict['model_name'].append(model_name)
                report_dict['cross_val_score'].append(cv)
            except Exception:
                print('Cross Val Failed: ' + model_name)

        report_df = pd.DataFrame(report_dict)
        folder = Configuration.get_cache_subfolder()
        path = pkg_resources.resource_filename(
            'crcdal', 'cache/' + folder + '/' + self.sub_folder + '/')
        pkg_resources.ensure_directory(path)
        report_df.to_csv(path + pipeline.dataset_tag +
                         '_model_cross_val_report.csv')
def steam_learning_forest(data, NUM_FOLDS):
    """
    Trains a random forest model using the given data.
    Uses K-Fold validation with NUM_FOLDS folds.
    A string describing the results is returned.
    Takes roughly 8 minutes to run.
    The number of trees was chosen for time efficiency, at the point where the error stops decreasing meaningfully.
    Around ~200 trees the improvement levels off; an arbitrarily larger choice, e.g. 1500 trees, only reduces the error by thousandths.
    """
    trees = 200

    forest_train = data[["positive_ratings_", "negative_ratings_", "owners_", "average_playtime_", "median_playtime_"]]
    forest_label = data[["price_"]]
    
    kfold = KFold(n_splits=NUM_FOLDS, random_state=None, shuffle=True)
    forest_regressor = RandomForestRegressor(n_estimators=trees, random_state=0)

    mse_scorer = make_scorer(mean_squared_error)
    results = cross_val_score(forest_regressor, forest_train, forest_label.values.ravel(), scoring=mse_scorer, cv=kfold)
    print(f"Random Forest - MSE Array: {results}")

    mean_overall = np.mean(results)
    final_results = f"Random Forest - Mean MSE over {NUM_FOLDS} folds: {mean_overall}"
    print(final_results)
    return final_results
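
Note that make_scorer(mean_squared_error) keeps the default greater_is_better=True, which is harmless here because the per-fold scores are only averaged and reported. If the same scorer were used for model selection (e.g. inside GridSearchCV), the error direction would need to be stated explicitly; a minimal sketch:

# For model selection, mark MSE as an error so the search minimizes it
# (scikit-learn then reports the score negated).
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
# equivalently, the built-in string scorer: scoring='neg_mean_squared_error'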
def categorize(clf, target, data, njobs=6):
    """
    Expects a pandas series and a pandas data frame.
    Both need to be indexed with the same index.
    """
    from imblearn.pipeline import Pipeline
    from sklearn.metrics.scorer import make_scorer
    from sklearn.metrics import recall_score, precision_score
    from sklearn.utils.multiclass import type_of_target

    # Determine prediction target:
    y_type = type_of_target(target)
    if y_type == "multiclass":
        metrics = {"roc_auc": make_scorer(multiclass_roc, average="weighted")}
    else:
        metrics = ["roc_auc"]

    score = cross_validate(
        clf,
        data,
        target,
        cv=10,
        scoring=metrics,
        return_train_score=False,
        n_jobs=njobs,
    )
    del score["fit_time"]
    del score["score_time"]
    score = {k: np.mean(v) for k, v in list(score.items())}
    print(score)

    return score
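
multiclass_roc is defined elsewhere. Since make_scorer is used here without needs_proba=True, the metric receives hard class predictions; a hedged reconstruction that works on labels binarizes both arrays and averages one-vs-rest ROC AUC:

import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

# Hypothetical reconstruction of multiclass_roc (the original helper is not shown).
def multiclass_roc(y_true, y_pred, average="weighted"):
    classes = np.unique(y_true)
    return roc_auc_score(label_binarize(y_true, classes=classes),
                         label_binarize(y_pred, classes=classes),
                         average=average)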
Example no. 10
def fit_model(X_train, y_train):
    # Create a decision tree model
    from sklearn.tree import DecisionTreeRegressor 
    model = DecisionTreeRegressor()
    
    from sklearn.cross_validation import KFold
    from sklearn.metrics import make_scorer
    from sklearn import grid_search
    from sklearn import metrics
    
    cross_validator = KFold(5)
    param_grid = {"max_depth":[4,5,6,7],
#                   "min_samples_split": [30,20,40],
#                   "min_samples_leaf":[10,20,30]
                  }
    
    from sklearn.metrics import r2_score
    def performance_metric(y_test, y_pred):
        score = r2_score(y_test, y_pred)
        return score
    
    scoring_fnc = make_scorer(performance_metric)
    
    model = grid_search.GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=cross_validator, scoring=scoring_fnc, verbose=10)
    
    
    model.fit(X_train, y_train)
    print(model.best_estimator_)
    print(model.grid_scores_)
    print(model.best_params_)
    print(model.best_score_)
    return model.best_estimator_
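
sklearn.cross_validation and sklearn.grid_search were removed in scikit-learn 0.20. On current versions the same search would be written against sklearn.model_selection; a sketch with the same parameters (the function name is illustrative):

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import make_scorer, r2_score
from sklearn.tree import DecisionTreeRegressor

def fit_model_modern(X_train, y_train):
    # Same grid search as above, using the current scikit-learn API.
    cross_validator = KFold(n_splits=5)
    param_grid = {"max_depth": [4, 5, 6, 7]}
    scoring_fnc = make_scorer(r2_score)
    grid = GridSearchCV(DecisionTreeRegressor(), param_grid=param_grid,
                        scoring=scoring_fnc, cv=cross_validator, verbose=10)
    grid.fit(X_train, y_train)
    print(grid.best_params_, grid.best_score_)
    return grid.best_estimator_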
Example no. 11
def selectTasks():
    while True:
        print "\nSelect the Model for classification:"
        print "Enter 1 : Logistic Regression"
        print "Enter 2 : Naive Bayes"
        print "Enter 3 : Support Vector Machne Model using SKlearn library"
        print "Enter 4 : Random Forest Model using SKlearn library"
        print "Enter 5 : To exit!!!!"
        options={1:ModelLogisticRegression,
                 2:NaiveBayes,
                 3:svm.SVC,
                 4:RandomForestClassifier}
        print "Enter Your Choice >>> "
        x=input()
        if x==5:
            break
        elif x==4:
            print "Classification on Random Forest Model using SKLearn Library"
            runModel(options[x](n_jobs=2,random_state=0))
        elif x==3:
            print "Classification on Support Vector Machine Model using SKLearn Library"
            parameters={'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear'], 'random_state': [1]}
            svc=svm.SVC(kernel = 'linear', probability = True, random_state = 0)
            roc_auc_scorer = make_scorer(roc_auc_score)
            modelObj = GridSearchCV(svc, parameters, scoring=roc_auc_scorer)
            runModel(modelObj)
        else:
            print "Classification on "+MODEL[x]
            runModel(options[x](PARAMS[x]))
Example no. 12
def train_7(X_train_dev, y_train_dev, dev_size=0.1, n_folds=10):
    print(
        'Model 7 - (MinMaxScaler) + RandomForestRegressor on log1p(n_clicks) with GridSearchCV'
    )
    num_transformer = Pipeline([('normalizer',
                                 MinMaxScaler(feature_range=(0, 1)))])
    regressor = RandomForestRegressor(random_state=0)
    model = Pipeline([('preprocessor', num_transformer),
                      ('regressor', regressor)])
    wmse_scorer = make_scorer(evaluate.wmse_log, greater_is_better=False)
    y_train_dev = np.log1p(y_train_dev)
    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_features': [None, 1 / 3]
    }
    grid = GridSearchCV(model,
                        param_grid,
                        cv=10,
                        scoring=wmse_scorer,
                        iid=False,
                        return_train_score=True,
                        error_score=np.nan,
                        n_jobs=-1,
                        verbose=5)
    grid.fit(X_train_dev, y_train_dev)
    print('Best Parameters:', grid.best_params_)
    print('Validation WMSE:', round(-grid.best_score_, 5))
    best_model = clone(grid.best_estimator_)
    best_model.fit(X_train_dev, y_train_dev)
    return best_model
Example no. 13
def compute_SVR(train_x, train_y, test_x):

        # make MAE scoring
        MAE = make_scorer(compute_error, greater_is_better = False)

        ######### SVR - Polynomial/rbf Kernel #########
        # make pipeline
        std_SVR = make_pipeline(StandardScaler(), SVR())
        params = {'svr__kernel': ['poly', 'rbf'], 'svr__degree': [1, 2]}
        gs = GridSearchCV(estimator = std_SVR, param_grid = params, scoring = MAE, n_jobs=-1, cv = 5, return_train_score = True)


        # fit grid search
        gs.fit(train_x, train_y)

        print('SVR train score', -gs.cv_results_['mean_train_score'])
        print('SVR test score', -gs.cv_results_['mean_test_score'])
        print('Best Parameter', gs.best_params_)
        print('Best score', -gs.best_score_)
        print('Parameters', gs.cv_results_['params'])
        
        # Train the best Model
        best_SVR = make_pipeline(StandardScaler(), SVR(kernel='poly', degree=1))
        best_SVR.fit(train_x, train_y)

        # Make Prediction
        test_y = best_SVR.predict(test_x)
        # Create test output values
        predicted_y = test_y * -1
        # Output file location
        file_name = '../Predictions/SVR_best.csv'

        # Writing output in Kaggle format
        print('Writing output to ', file_name)
        kaggle.kaggleize(predicted_y, file_name)
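
compute_error is not shown; given that the scorer is labelled MAE, it is presumably a thin wrapper around mean absolute error, something like:

from sklearn.metrics import mean_absolute_error

# Assumed definition of compute_error (not part of the original snippet).
def compute_error(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)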
def OptimizeClassifier(data, target, clf, grid, scores={'f1': make_scorer(f1)}, cv=10, refit='f1'):
    data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.3)
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(clf, grid, cv=cv,
                           scoring=scores, refit=refit)
        clf.fit(data_train, target_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_f1']
        stds = clf.cv_results_['std_test_f1']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.5f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = target_test, clf.predict(data_test)
        print(classification_report(y_true, y_pred))
        print()
Example no. 15
def check_cv():
    df_cv = filter_one_vs_one(df_train,1,5)
    model = SVC(kernel="poly",degree=2,gamma=1,coef0=1)
    params = {
            "C": [.0001,.001,.01,.1,1]
            }
    scorer = make_scorer(calc_error,greater_is_better=False)
    
    scores = {}
    scores_list = {}
    for i in range(100):
        cv = KFold(n_splits=10,shuffle=True,random_state=i)
        gs = GridSearchCV(model,params,scoring=scorer,cv=cv)
        gs.fit(df_cv[predictors],df_cv.digit)
        
        best_param = gs.best_params_["C"]
        best_score = gs.best_score_
        if best_param not in scores:
            scores[best_param] = 1
        else:
            scores[best_param] += 1
        if best_param not in scores_list:
            scores_list[best_param] = []
        scores_list[best_param].append(best_score)
    print("Score count:")
    for k, v in scores.items():
        print("C={}: {} with average score {:.3f}".format(k, v, pd.np.abs(pd.np.mean(scores_list[k]))))
def reducer_creation(df_final, target_column, reducer, dataset):
    X = df_final.loc[:, df_final.columns != target_column]
    Y = df_final.loc[:, df_final.columns == target_column]

    my_scorer = make_scorer(cluster_acc, greater_is_better=True)
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    components = np.linspace(2,
                             len(X.columns) - 1,
                             5,
                             dtype=np.int64,
                             endpoint=True)
    estimators = [('reduce_dim', reducer), ('clf', km)]
    param_grid = [
        dict(reduce_dim__n_components=components, clf__n_clusters=components)
    ]
    pipe = Pipeline(estimators)
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=my_scorer)
    grid_search.fit(X, Y)
    mean_scores = np.array(grid_search.cv_results_['mean_test_score']).reshape(
        len(components), -1, len(components))
    plotReducerAndCluster(mean_scores, components)

    estimators = [('reduce_dim', reducer), ('clf', gmm)]
    param_grid = [
        dict(reduce_dim__n_components=components, clf__n_components=components)
    ]
    pipe = Pipeline(estimators)
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=my_scorer)
    grid_search.fit(X, Y)

    mean_scores = np.array(grid_search.cv_results_['mean_test_score']).reshape(
        len(components), -1, len(components))
    plotReducerAndCluster(mean_scores, components)
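
cluster_acc also comes from outside this snippet. A common way to score clustering against known labels is to relabel each cluster with its majority true class and report plain accuracy; the sketch below assumes that behaviour:

import numpy as np
from sklearn.metrics import accuracy_score

# Hypothetical cluster_acc: map each predicted cluster to its majority
# true label, then compute ordinary accuracy on the remapped predictions.
def cluster_acc(y_true, y_pred):
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    mapped = np.empty_like(y_true)
    for cluster in np.unique(y_pred):
        mask = y_pred == cluster
        labels, counts = np.unique(y_true[mask], return_counts=True)
        mapped[mask] = labels[np.argmax(counts)]
    return accuracy_score(y_true, mapped)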
Example no. 17
def runTPOT(X, y, metric, algo):
    aml_config_dict = aml_config()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        test_size=0.25)

    if algo == "Classifier":
        pipeline_optimizer = TPOTClassifier(generations=1,
                                            population_size=5,
                                            verbosity=2,
                                            warm_start=True)
        pipeline_optimizer.fit(X_train, y_train)
        print(pipeline_optimizer.score(X_test, y_test))
    elif algo == 'Regressor':

        def aml_reg_scorer(y_pred, y_test):
            rmse = sqrt(mean_squared_error(y_test, y_pred))
            return rmse

        aml_custom_scorer = make_scorer(aml_reg_scorer,
                                        greater_is_better=False)
        pipeline_optimizer = TPOTRegressor(generations=1,
                                           population_size=5,
                                           verbosity=2,
                                           warm_start=True,
                                           scoring=aml_custom_scorer)
        pipeline_optimizer.fit(X_train, y_train)
        print(pipeline_optimizer.score(X_test, y_test))
    else:
        raise Exception('Incorrect Problem Type')
    return pipeline_optimizer, pipeline_optimizer.score(X_test, y_test), len(
        pipeline_optimizer.evaluated_individuals_)
    def __init__(self, algorithm, params=None):
        '''
        Initialize the class with a list of possible algorithms and 
        recommended hyperparameter ranges
        '''
        if algorithm == 'etr':  # Extra trees regressor
            from sklearn.ensemble import ExtraTreesRegressor
            self.hyper_range = {
                "max_depth": [4, 8, 12, 16, 20],
                "min_samples_split": np.arange(2, 11),
                "min_samples_leaf": np.arange(1, 11),
                "n_estimators": np.arange(10, 801, 40)
            }
            self.algorithm = ExtraTreesRegressor()

        elif algorithm == 'gbm':  # Gradient boosting model
            from sklearn.ensemble import GradientBoostingRegressor
            self.hyper_range = {
                "max_depth": [4, 8, 12, 16, 20],
                "min_samples_split": np.arange(2, 11),
                "min_samples_leaf": np.arange(1, 11),
                "n_estimators": np.arange(10, 801, 40)
            }
            self.algorithm = GradientBoostingRegressor()

        elif algorithm == 'gam':  # Generalized additive model
            from pygam import GAM
            self.hyper_range = {'n_splines': np.arange(5, 40)}
            self.algorithm = GAM()

        # Set scorer as R2
        self.my_scorer = make_scorer(r2_score, greater_is_better=True)
def steam_learning_boosting(data, NUM_FOLDS):
    """
    Ensemble AdaBoosting to boost over each fold
    Uses K-Fold validation with NUM_FOLDS folds.
    A string describing the results is returned.
    The number of trees was chosen for time efficiency, at the point where the error stops decreasing meaningfully.
    Around ~200 trees the improvement levels off; an arbitrarily larger choice, e.g. 1500 trees, only reduces the error by thousandths.
    Seed set for predictable results
    """
    trees = 200

    X = data[["positive_ratings_", "negative_ratings_", "owners_", "average_playtime_", "median_playtime_"]]
    y = data[["price_"]]

    kfold = KFold(n_splits=NUM_FOLDS)

    model = AdaBoostRegressor(n_estimators=trees)
    
    mse_scorer = make_scorer(mean_squared_error)
    results = cross_val_score(model, X, y.values.ravel(), scoring=mse_scorer, cv=kfold)
    print(f"Boosting - MSE Array: {results}")

    final_results = f"Boosting - Mean MSE over {NUM_FOLDS} folds: {np.mean(results)}"
    print(final_results)
    return(final_results)
def model(X,y,z):
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn import linear_model
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.neighbors import KNeighborsRegressor
    import xgboost as xgb
    from sklearn.model_selection import RandomizedSearchCV
    from xgboost import XGBRegressor
    from sklearn.metrics.scorer import make_scorer
    
    def rmse_eval(y, y0):
        error = np.sqrt(np.mean(np.power(y-y0, 2)))
        return error
    my_scorer = make_scorer(rmse_eval, greater_is_better=False)  # RMSE is an error: lower is better
    par_rf = {'n_estimators': [100, 150, 200,300],
              'max_depth' : [3,6,9,12]}
    par_dt = {'max_features': ['auto', 'sqrt', 'log2'],
              'max_depth' : [3,6,9,12]}
    par_xg = {'n_estimators': [100, 150, 200,300],
              'max_depth' : [3,6,9,12]}
    
    model1 = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42), param_distributions=par_rf, cv= 10, n_iter=1,scoring=my_scorer)
    model2 = RandomizedSearchCV(estimator=DecisionTreeRegressor(random_state=42), param_distributions=par_dt, cv= 10, n_iter=1,scoring=my_scorer)
    model3 = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=42), param_distributions=par_xg, cv= 5, n_iter=10,scoring=my_scorer)

    # Fit it to the data
    model1.fit(X, y)
    model2.fit(X, y)
    model3.fit(X, y)

    #store preds on test and train data
    preds1 = model1.predict(X)
    preds2 = model2.predict(X)
    preds3 = model3.predict(X)
    test_preds1 = model1.predict(z)
    test_preds2 = model2.predict(z)
    test_preds3 = model3.predict(z)
    print(X.values)
    print(preds1)
    
    #store predictions
    stacked_predictions = np.column_stack((preds1,preds2,preds3))
    stacked_test_predictions = np.column_stack((test_preds1,test_preds2,test_preds3))
    print(stacked_predictions)
    #Fit & predict with the meta model
    meta_model = linear_model.LinearRegression()
    meta_model.fit(stacked_predictions,y)
    final_predictions = np.expm1(meta_model.predict(stacked_test_predictions))
    df2 = pd.DataFrame(data=[])
    df2['true'] = np.expm1(meta_model.predict(stacked_predictions))
    df2['pred_rf'] = np.expm1(y)
    df2[['true','pred_rf']].plot()
    print('Train score of model1:',rmse_eval(y, preds1))
    print('Train score of model2:',rmse_eval(y, preds2))
    print('Train score of model3:',rmse_eval(y, preds3))
    print('Train score of stacked model:',rmse_eval(y, meta_model.predict(stacked_predictions)))
    return final_predictions
Example no. 21
def crossValidatedScores(data, target, hlayers, clf):
    data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.3)
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=hlayers, random_state=1)
    scoring = {'tp': make_scorer(metrics.tp), 'tn': make_scorer(metrics.tn),
               'fp': make_scorer(metrics.fp), 'fn': make_scorer(metrics.fn),
               'f1': make_scorer(metrics.f1), 'precision': make_scorer(metrics.precision),
               'sensitivity': make_scorer(metrics.sensitivity), 'specificity': make_scorer(metrics.specificity)}
               #'ROC': make_scorer(roc)}
    results = cross_validate(clf.fit(data_train, target_train), data_test, target_test, scoring=scoring, cv=10)
    return results
Example no. 22
 def convert_sklearn_metric_function(scoring):
     if callable(scoring):
         module = getattr(scoring, '__module__', None)
         if (hasattr(module, 'startswith')
                 and module.startswith('sklearn.metrics.')
                 and not module.startswith('sklearn.metrics.scorer')
                 and not module.startswith('sklearn.metrics.tests.')):
             return make_scorer(scoring)
     return scoring
Example no. 23
def train_9(X_train_dev, y_train_dev, dev_size=0.1, n_folds=10):
    print(
        'Model 9 - (MinMaxScaler) + (OneHotEncoder), XGBRegressor on log1p(n_clicks) with GridSearchCV'
    )
    cat_transformer = Pipeline([('encoder',
                                 OneHotEncoder(categories='auto',
                                               handle_unknown='ignore'))])
    num_transformer = Pipeline([('normalizer',
                                 MinMaxScaler(feature_range=(0, 1)))])
    no_transformer = Pipeline([('transformer', DummyTransformer())])
    preprocessor = ColumnTransformer([('cat', cat_transformer, [4]),
                                      ('num', num_transformer, [1, 2, 5, 6,
                                                                7]),
                                      ('no', no_transformer, [0, 3, 8, 9,
                                                              10])])
    regressor = xgb.XGBRegressor()
    model = Pipeline([('preprocessor', preprocessor),
                      ('regressor', regressor)])
    wmse_scorer = make_scorer(evaluate.wmse_log, greater_is_better=False)
    y_train_dev = np.log1p(y_train_dev)
    param_grid = {
        'regressor__n_estimators': [500],
        'regressor__learning_rate': [.005, 0.01, .03, .05],
        'regressor__max_depth': [5, 7, 9, 11],
        'regressor__n_jobs': [-1],
        'regressor__random_state': [0]
    }
    cv = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    grid = GridSearchCV(model,
                        param_grid,
                        cv=cv,
                        scoring=wmse_scorer,
                        iid=False,
                        return_train_score=True,
                        error_score=np.nan,
                        n_jobs=-1,
                        verbose=3)
    X_xgb_train_dev, X_xgb_test, y_xgb_train_dev, y_xgb_test = train_test_split(
        X_train_dev,
        y_train_dev,
        test_size=dev_size,
        shuffle=True,
        random_state=0)
    fit_params = {
        'regressor__early_stopping_rounds': 10,
        'regressor__sample_weight': 1 + y_xgb_train_dev,
        'regressor__sample_weight_eval_set': [1 + y_xgb_test],
        'regressor__eval_metric': wmse_log_xgb,
        'regressor__eval_set': [[X_xgb_test, y_xgb_test]]
    }
    grid.fit(X_xgb_train_dev, y_xgb_train_dev, **fit_params)
    print('Best Parameters:', grid.best_params_)
    print('Validation WMSE:', round(-grid.best_score_, 5))
    best_model = clone(grid.best_estimator_)
    best_model.fit(X_train_dev, y_train_dev)
    return best_model
def crossValidatedScores(data, target, clf, cv=3):
    scoring = {'tp': make_scorer(tp), 'tn': make_scorer(tn), 'fp': make_scorer(fp), 'fn': make_scorer(fn),
               'accuracy': make_scorer(accuracy), 'f1': make_scorer(f1), 'precision': make_scorer(precision),
               'sensitivity': make_scorer(sensitivity), 'specificity': make_scorer(specificity)}
               #'ROC': make_scorer(roc)}

    results = cross_validate(clf, data, target, scoring=scoring, cv=cv, return_train_score=False)
    return results
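
The scorer helpers tp, tn, fp, fn, sensitivity and specificity are defined elsewhere; for binary labels they are typically read off the confusion matrix, roughly as below (an assumption, not the original code):

from sklearn.metrics import confusion_matrix

# Assumed confusion-matrix helpers for binary classification
# (confusion_matrix returns [[tn, fp], [fn, tp]]).
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]

def sensitivity(y_true, y_pred):
    m = confusion_matrix(y_true, y_pred)
    return m[1, 1] / (m[1, 1] + m[1, 0])

def specificity(y_true, y_pred):
    m = confusion_matrix(y_true, y_pred)
    return m[0, 0] / (m[0, 0] + m[0, 1])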
Example no. 25
 def __init__(self, X, Y):
     self.X = X
     self.Y = Y
     self.scoring = {  # 'auc_score_macro': make_scorer(self.roc_auc_macro),
         # 'auc_score_micro': make_scorer(self.roc_auc_micro),
         'accuracy': 'accuracy',
         'scores_p_1': 'precision',
         'scores_r_1': 'recall',
         'scores_f1_1': 'f1',
         'scores_f1_0': make_scorer(self.f1_0),
         'scores_p_0': make_scorer(self.precision_0),
         'scores_r_0': make_scorer(self.recall_0),
         'precision_micro': 'precision_micro',
         'precision_macro': 'precision_macro',
         'recall_macro': 'recall_macro',
         'recall_micro': 'recall_micro',
         'f1_macro': 'f1_macro',
         'f1_micro': 'f1_micro'
     }
    def test_model(**params):
        model.set_params(**params)
        scores = cross_val_score(model, X_full, y_full,
                                 cv=ShuffleSplit(n_splits=1, test_size=0.10, random_state=42),
                                 scoring=make_scorer(r2_score))
        r2_now = np.mean(scores)
        if r2_now < 0:
            r2_now = 0

        return np.sqrt(r2_now)
Example no. 27
def multiGridSearch(filename,classifiers,classNames, parameters, crossVal, Nfrac,nTests,test_set_fraction,plotResults=False, impute_scale= True, parallel=False):
    allResults=[]
    best=0
    bestEstim= None
    bestEstimName=None
    scorer=make_scorer(matthews_corrcoef)
    print("Grid search on %s fraction of dataset" % Nfrac)
    print("_" * 10)
    print("\n"*5)
    for i in range(len(classifiers)):
        classif= classifiers[i]
        className=classNames[i]
        param=parameters[i]

        results=[]

        print("Evaluating performance for classifier: %s" % className)
        for j in range(nTests) :
            print("Test number %d for %s:" % (j+1, className))
            X_train, y_train, X_test, y_test, test_id = importData_chunks(filename,Nfrac,test_set_fraction)
            if impute_scale:
                X_train, X_test= imputeAndScale(X_train,X_test)
            if parallel:
                clf=GridSearchCV(classif,param,scoring=scorer,cv=crossVal, verbose=1, n_jobs=-1)
                y_pred= GridSearch(clf, X_train, y_train, X_test)
                perf = evaluate(y_pred,y_test)
                results.append(perf)
            else:
                clf=GridSearchCV(classif,param,scoring=scorer,cv=crossVal, verbose=1)
                print(X_train.shape)
                print(y_train.shape)
                print(X_test.shape)
                y_pred= GridSearch(clf, X_train, y_train, X_test)
                print(y_pred.shape)
                perf = evaluate(y_pred,y_test)
                if(className == "SGB"):
                    print(clf.best_estimator_.feature_importances_)
                results.append(perf)
            if perf > best:
                best = perf
                bestEstim = clf
                bestEstimName=className
            del X_test
            del X_train
            del y_train
            del y_test
       
        allResults.append(results)
        print("_" * 10)
        print("\n"*5)
    print("Best results overall:")
    print("Best classifier:%s" % bestEstimName)
    print("Best parameters:")
    print(bestEstim.best_params_)
    return allResults, bestEstim
Example no. 28
def cross_val_full_scores(clf, df, y, cv=10):
    scoring = {
        'prec_macro': 'precision_macro',
        'rec_macro': make_scorer(recall_score, average='macro')
    }
    scores = cross_validate(clf,
                            df,
                            y,
                            scoring=scoring,
                            cv=cv,
                            return_train_score=True)
    return scores
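
cross_validate keys each metric as test_<name> (and train_<name> when return_train_score=True), so the results of the call above would be read back roughly like this:

scores = cross_val_full_scores(clf, df, y)
# each entry is an array with one value per fold
print(scores['test_prec_macro'].mean())
print(scores['train_prec_macro'].mean())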
Example no. 29
def convert_sklearn_metric_function(scoring):
    """If ``scoring`` is a sklearn metric function, convert it to a
    sklearn scorer and return it. Otherwise, return ``scoring`` unchanged."""
    if callable(scoring):
        module = getattr(scoring, '__module__', None)
        if (hasattr(module, 'startswith')
                and module.startswith('sklearn.metrics.')
                and not module.startswith('sklearn.metrics.scorer')
                and not module.startswith('sklearn.metrics.tests.')):
            return make_scorer(scoring)
    return scoring
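
For illustration, passing a plain metric function such as f1_score gets wrapped into a scorer, while strings and ready-made scorers pass through unchanged:

from sklearn.metrics import f1_score

scorer = convert_sklearn_metric_function(f1_score)       # wrapped via make_scorer
unchanged = convert_sklearn_metric_function('accuracy')  # returned as-is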
Example no. 30
 def test_model(**params):
     model.set_params(**params)
     scores = cross_val_score(model,
                              X_full,
                              y_full,
                              cv=ShuffleSplit(n_splits=1,
                                              test_size=0.1,
                                              random_state=42),
                              scoring=make_scorer(r2_score))
     r2_test = np.mean(scores)
     return r2_test
Example no. 31
def cv(data_x, data_y, clf):
	'''
	Run K-Fold cross-validation on the given model and return the results.
	:param data_x: training vectors
	:param data_y: training labels
	:param clf: classifier model
	:return: validation results: the various evaluation scores
	'''
	scoring = {
		'precision_macro': 'precision_macro',
		'recall_macro': make_scorer(metrics.recall_score, average='macro'),
		'roc_auc_macro': make_scorer(metrics.roc_auc_score, average='macro'),
		'f1_macro': make_scorer(metrics.f1_score, average="macro"),
		'accuracy': make_scorer(metrics.accuracy_score),
	}
	cv_results = cross_validate(clf, data_x, data_y, scoring=scoring,
	                            n_jobs=4, cv=N_SPLIT, return_train_score=False, )
	for key in cv_results.keys():
		print(f"{key}:\t{np.mean(cv_results[key])}")
		# print("----------")
	return cv_results
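
One caveat with the dictionary above: wrapping metrics.roc_auc_score in a plain make_scorer hands it hard class predictions rather than scores. If probability-based ROC AUC is intended, the scorer has to request probabilities, for example (spelling depends on the scikit-learn version):

# Ask the scorer to call predict_proba and feed probabilities to roc_auc_score
# (for a binary target; a multiclass target would also need multi_class='ovr').
roc_auc_macro = make_scorer(metrics.roc_auc_score, average='macro',
                            needs_proba=True)
# On recent releases this is written response_method='predict_proba' instead.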
Example no. 32
 def _crossValidate(self, y_train, X_train, refit=False):
     # Run the grid search
     print "Cross-validating for", self.numFolds, "folds"
     print "Args", self.classifierArgs
     cv = StratifiedKFold(y_train, n_folds=self.numFolds, shuffle=True, random_state=1) #self.getCV(y_train, self.meta.meta, numFolds=self.numFolds)
     #cv = BalancedIteratorCV(y_train, n_folds=self.numFolds, shuffle=True, random_state=1, examples=[x for x in self.meta.db.query("SELECT * from example WHERE [set] == 'train';")], groupBy="project_code")
     classifier, classifierArgs = self._getClassifier()
     metric = self.metric
     if metric == "bas":
         metric = make_scorer(balanced_accuracy_score)
     search = ExtendedGridSearchCV(classifier(), classifierArgs, refit=refit, cv=cv, 
                                   scoring=metric, verbose=self.verbose, n_jobs=self.parallel, 
                                   pre_dispatch=int(self.preDispatch) if self.preDispatch.isdigit() else self.preDispatch)
     search.fit(X_train, y_train)
     print "---------------------- Grid scores on development set --------------------------"
     results = []
     index = 0
     bestExtras = None
     bestScores = None
     for params, mean_score, scores in search.grid_scores_:
         print "Grid:", params
         results.append(self._getResult("train", classifier, cv, params, None, None, mean_score, scores, extra={"train_size":None, "test_size":None}))
         if bestScores is None or float(mean_score) > bestScores[1]:
             bestScores = (params, mean_score, scores)
             if hasattr(search, "extras_"):
                 bestExtras = search.extras_[index]
         for fold in range(len(scores)):
             result = self._getResult("train", classifier, cv, params, scores[fold], fold)
             if hasattr(search, "extras_"):
                 for key in search.extras_[index][fold].get("counts", {}).keys():
                     result[key + "_size"] = search.extras_[index][fold]["counts"][key]
             results.append(result)
         if hasattr(search, "extras_") and self.classes and len(self.classes) == 2:
             print ["%0.8f" % x for x in self._validateExtras(search.extras_[index], y_train)], "(eval:auc)"
         print scores, "(" + self.metric + ")"
         print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)                    
         index += 1
     print "---------------------- Best scores on development set --------------------------"
     params, mean_score, scores = bestScores
     print scores
     print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
     baselines = self._calculateBaseline(cv, y_train)
     print "MCB = %0.3f (+/-%0.03f) for" % (np.mean(baselines), np.std(baselines) / 2), ["%0.3f" % x for x in baselines], "(" + self.metric + ")"
     print "--------------------------------------------------------------------------------"
     # Save the grid search results
     print "Saving results"
     self._insert("result", results)
     self._saveExtras(bestExtras, "train")
     self.meta.flush() 
     return search
Example no. 33
def scale_pca_rf_pipe():

  from h2o.transforms.preprocessing import H2OScaler
  from h2o.transforms.decomposition import H2OPCA
  from h2o.estimators.random_forest import H2ORandomForestEstimator
  from sklearn.pipeline import Pipeline
  from sklearn.grid_search import RandomizedSearchCV
  from h2o.cross_validation import H2OKFold
  from h2o.model.regression import h2o_r2_score
  from sklearn.metrics.scorer import make_scorer
  from scipy.stats import randint


  iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

  # build  transformation pipeline using sklearn's Pipeline and H2O transforms
  pipe = Pipeline([("standardize", H2OScaler()),
                   ("pca", H2OPCA(n_components=2)),
                   ("rf", H2ORandomForestEstimator(seed=42,ntrees=50))])

  params = {"standardize__center":    [True, False],             # Parameters to test
            "standardize__scale":     [True, False],
            "pca__n_components":      randint(2, iris[1:].shape[1]),
            "rf__ntrees":             randint(50,60),
            "rf__max_depth":          randint(4,8),
            "rf__min_rows":           randint(5,10),}

  custom_cv = H2OKFold(iris, n_folds=5, seed=42)
  random_search = RandomizedSearchCV(pipe, params,
                                     n_iter=5,
                                     scoring=make_scorer(h2o_r2_score),
                                     cv=custom_cv,
                                     random_state=42,
                                     n_jobs=1)


  random_search.fit(iris[1:],iris[0])

  print random_search.best_estimator_
Example no. 34
pearson_data(datasetName, model_stacking_models)

#print('normalizing!')
#X = normalizer.fit_transform(X)

# LABELING
# labelProp = sksemi.label_propagation.LabelSpreading(kernel='rbf', gamma=150, n_neighbors=3, alpha=0.15, max_iter=600000, tol=0.001)
# print('fitting label spreader')
# labelProp.fit(X, Y)
# print('predicting labels for Y')
# Y=labelProp.transduction_
# print('Shape of Y:', Y.shape)
# print('first row: ', Y[0])

# SCORER
scorer = make_scorer(score_func=singleLabelScore, greater_is_better=False)

# PREPROCESSING
# SCALING
minMaxScaler = MinMaxScaler(feature_range=(0.0,1.0))
#normalizer = skprep.Normalizer()
columnDeleter = fs.FeatureDeleter()

# FEATURE SELECTION
varianceThresholdSelector = VarianceThreshold(threshold=(0))
percentileSelector = SelectPercentile(score_func=f_classif, percentile=20)
kBestSelector = SelectKBest(f_classif, 1000)

# FEATURE EXTRACTION
#rbmPipe = skpipe.Pipeline(steps=[('scaling', minMaxScaler), ('rbm', rbm)])
nmf = NMF(n_components=150)
## Create features and labels
my_dataset = df.T.to_dict('dict')
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

## Define scoring method to return 
## f1 when recall and precision are > 30 
def score_func(y_true, y_pred, **kwargs):
    r = recall_score(y_true, y_pred, **kwargs)
    p = precision_score(y_true, y_pred, **kwargs)
    if r > 0.30 and p > 0.30:
       return f1_score(y_true, y_pred, **kwargs)
    else:
       return 0

scorer       = make_scorer(score_func)

clf = Pipeline(steps=[
   # ('scaler', MinMaxScaler()),
   # ('features', FeatureUnion([
   #    ('ngram_tf_idf', Pipeline([
   #      ('kbest', SelectKBest(k=5, score_func=f_classif)),
   #      ('lda', LDA(n_components=1, priors=None, shrinkage=None, solver='svd', store_covariance=False, tol=0.0001)), 
   #      ('kmeans', MiniBatchKMeans(n_clusters=20, n_init=10, max_no_improvement=10, verbose=0)),
   #    ]))])),
   ('kbest', SelectKBest(k=5, score_func=f_classif)),
   ('lda', LDA(n_components=1, priors=None, shrinkage=None, solver='svd', store_covariance=False, tol=0.0001)), 
   ('kmeans', MiniBatchKMeans(n_clusters=20, n_init=10, max_no_improvement=10, verbose=0)),
   ('classifier', GaussianNB()) 
   ])
import math


def MAPE_scorer(y, y_pred):
    error = 0
    num = len(y)
    for i in range(0, num):
        if y[i] > 0:
            error += math.fabs(y_pred[i] - y[i]) / y[i]
    #print('error, num:', error, num)
    if num > 0:
        return error / num
    else:
        return 0

my_scorer = make_scorer(MAPE_scorer)
# y = df['gap'][:21*66*144]
# MAPE_scorer(y, [1 for i in range(0, 21*66*144)])


def cal_dist(dist):
    global cv_pred_all, cv_real_all
    # split training, CV, test set
    df = pd.read_csv("data/season_1/features/"+ str(dist)+".csv")
    df["date"] = df["date"].apply(lambda x: pd.to_datetime(x, errors='coerce'))
    training = df.loc[(df.date < '2016-01-17') | (df.date == '2016-01-18')]
    training_time = training.loc[df.time_slice.isin([46, 58, 70, 82, 94, 106, 118, 130, 142])]
    cv = df.loc[df['date'].isin(['2016-01-17','2016-01-19','2016-01-20',
                                '2016-01-21'])]
    cv_time = cv.loc[df.time_slice.isin([46, 58, 70, 82, 94, 106, 118, 130, 142])]
    # only catch time slice in test set
Example no. 37
    def __init__(self, generations=100, population_size=100, offspring_size=None,
                 mutation_rate=0.9, crossover_rate=0.1,
                 scoring=None, cv=5, subsample=1.0, n_jobs=1,
                 max_time_mins=None, max_eval_time_mins=5,
                 random_state=None, config_dict=None, warm_start=False,
                 verbosity=0, disable_update_check=False):
        """Set up the genetic programming algorithm for pipeline optimization.

        Parameters
        ----------
        generations: int, optional (default: 100)
            Number of iterations to run the pipeline optimization process.
            Generally, TPOT will work better when you give it more generations (and
            therefore time) to optimize the pipeline. TPOT will evaluate
            POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
        population_size: int, optional (default: 100)
            Number of individuals to retain in the GP population every generation.
            Generally, TPOT will work better when you give it more individuals
            (and therefore time) to optimize the pipeline. TPOT will evaluate
            POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
        offspring_size: int, optional (default: None)
            Number of offspring to produce in each GP generation.
            By default, offspring_size = population_size.
        mutation_rate: float, optional (default: 0.9)
            Mutation rate for the genetic programming algorithm in the range [0.0, 1.0].
            This parameter tells the GP algorithm how many pipelines to apply random
            changes to every generation. We recommend using the default parameter unless
            you understand how the mutation rate affects GP algorithms.
        crossover_rate: float, optional (default: 0.1)
            Crossover rate for the genetic programming algorithm in the range [0.0, 1.0].
            This parameter tells the genetic programming algorithm how many pipelines to
            "breed" every generation. We recommend using the default parameter unless you
            understand how the mutation rate affects GP algorithms.
        scoring: string or callable, optional
            Function used to evaluate the quality of a given pipeline for the
            problem. By default, accuracy is used for classification problems and
            mean squared error (MSE) for regression problems.

            Offers the same options as sklearn.model_selection.cross_val_score as well as
            a built-in score 'balanced_accuracy'. Classification metrics:

            ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',
            'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted',
            'precision', 'precision_macro', 'precision_micro', 'precision_samples',
            'precision_weighted', 'recall', 'recall_macro', 'recall_micro',
            'recall_samples', 'recall_weighted', 'roc_auc']

            Regression metrics:

            ['neg_median_absolute_error', 'neg_mean_absolute_error',
            'neg_mean_squared_error', 'r2']

            If you would like to use a custom scoring function, you can pass a callable
            function to this parameter with the signature scorer(y_true, y_pred).
            See the section on scoring functions in the documentation for more details.

            TPOT assumes that any custom scoring function with "error" or "loss" in the
            name is meant to be minimized, whereas any other functions will be maximized.
        cv: int or cross-validation generator, optional (default: 5)
            If CV is a number, then it is the number of folds to evaluate each
            pipeline over in k-fold cross-validation during the TPOT optimization
            process. If it is an object, then it is used as a cross-validation
            generator.
        subsample: float, optional (default: 1.0)
            Subsample ratio of the training instance. Setting it to 0.5 means that TPOT
            randomly collects half of training samples for pipeline optimization process.
        n_jobs: int, optional (default: 1)
            Number of CPUs for evaluating pipelines in parallel during the TPOT
            optimization process. Assigning this to -1 will use as many cores as available
            on the computer.
        max_time_mins: int, optional (default: None)
            How many minutes TPOT has to optimize the pipeline.
            If provided, this setting will override the "generations" parameter and allow
            TPOT to run until it runs out of time.
        max_eval_time_mins: int, optional (default: 5)
            How many minutes TPOT has to optimize a single pipeline.
            Setting this parameter to higher values will allow TPOT to explore more
            complex pipelines, but will also allow TPOT to run longer.
        random_state: int, optional (default: None)
            Random number generator seed for TPOT. Use this parameter to make sure
            that TPOT will give you the same results each time you run it against the
            same data set with that seed.
        config_dict: a Python dictionary or string, optional (default: None)
            Python dictionary:
                A dictionary customizing the operators and parameters that
                TPOT uses in the optimization process.
                For examples, see config_regressor.py and config_classifier.py
            Path for configuration file:
                A path to a configuration file for customizing the operators and parameters that
                TPOT uses in the optimization process.
                For examples, see config_regressor.py and config_classifier.py
            String 'TPOT light':
                TPOT uses a light version of operator configuration dictionary instead of
                the default one.
            String 'TPOT MDR':
                TPOT uses a list of TPOT-MDR operator configuration dictionary instead of
                the default one.
        warm_start: bool, optional (default: False)
            Flag indicating whether the TPOT instance will reuse the population from
            previous calls to fit().
        verbosity: int, optional (default: 0)
            How much information TPOT communicates while it's running.
            0 = none, 1 = minimal, 2 = high, 3 = all.
            A setting of 2 or higher will add a progress bar during the optimization procedure.
        disable_update_check: bool, optional (default: False)
            Flag indicating whether the TPOT version checker should be disabled.

        Returns
        -------
        None

        """
        if self.__class__.__name__ == 'TPOTBase':
            raise RuntimeError('Do not instantiate the TPOTBase class directly; use TPOTRegressor or TPOTClassifier instead.')

        # Prompt the user if their version is out of date
        self.disable_update_check = disable_update_check
        if not self.disable_update_check:
            update_check('tpot', __version__)

        self._pareto_front = None
        self._optimized_pipeline = None
        self.fitted_pipeline_ = None
        self._fitted_imputer = None
        self._pop = None
        self.warm_start = warm_start
        self.population_size = population_size
        self.generations = generations
        self.max_time_mins = max_time_mins
        self.max_eval_time_mins = max_eval_time_mins

        # Set offspring_size equal to population_size by default
        if offspring_size:
            self.offspring_size = offspring_size
        else:
            self.offspring_size = population_size

        self._setup_config(config_dict)

        self.operators = []
        self.arguments = []
        for key in sorted(self.config_dict.keys()):
            op_class, arg_types = TPOTOperatorClassFactory(
                key,
                self.config_dict[key],
                BaseClass=Operator,
                ArgBaseClass=ARGType
            )
            if op_class:
                self.operators.append(op_class)
                self.arguments += arg_types

        # Schedule TPOT to run for many generations if the user specifies a
        # run-time limit TPOT will automatically interrupt itself when the timer
        # runs out
        if max_time_mins is not None:
            self.generations = 1000000

        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate

        if self.mutation_rate + self.crossover_rate > 1:
            raise ValueError(
                'The sum of the crossover and mutation probabilities must be <= 1.0.'
            )

        self.verbosity = verbosity
        self.operators_context = {
            'make_pipeline': make_pipeline,
            'make_union': make_union,
            'StackingEstimator': StackingEstimator,
            'FunctionTransformer': FunctionTransformer,
            'copy': copy
        }
        self._pbar = None

        # Dictionary of individuals that have already been evaluated in previous
        # generations
        self.evaluated_individuals_ = {}
        self.random_state = random_state

        # If the user passed a custom scoring function, store it in the sklearn
        # SCORERS dictionary
        if scoring:
            if hasattr(scoring, '__call__'):
                scoring_name = scoring.__name__
                greater_is_better = 'loss' not in scoring_name and 'error' not in scoring_name
                SCORERS[scoring_name] = make_scorer(scoring, greater_is_better=greater_is_better)
                self.scoring_function = scoring_name
            else:
                if scoring not in SCORERS:
                    raise ValueError(
                        'The scoring function {} is not available. Please '
                        'choose a valid scoring function from the TPOT '
                        'documentation.'.format(scoring)
                    )
                self.scoring_function = scoring

        self.cv = cv
        self.subsample = subsample
        if self.subsample <= 0.0 or self.subsample > 1.0:
            raise ValueError(
                'The subsample ratio of the training instance must be in the range (0.0, 1.0].'
            )
        # On Windows, warn the user that interrupting with Ctrl+C while n_jobs != 1 can freeze the optimization
        if sys.platform.startswith('win') and n_jobs != 1:
            print(
                'Warning: Although parallelization is currently supported in '
                'TPOT for Windows, pressing Ctrl+C will freeze the optimization '
                'process without saving the best pipeline! Thus, Please DO NOT '
                'press Ctrl+C during the optimization process if n_jobs is not '
                'equal to 1. For quick test in Windows, please set n_jobs to 1 '
                'for saving the best pipeline in the middle of the optimization '
                'process via Ctrl+C.'
            )
        if n_jobs == -1:
            self.n_jobs = cpu_count()
        else:
            self.n_jobs = n_jobs

        self._setup_pset()
        self._setup_toolbox()
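
As the docstring explains, a custom scoring callable with signature scorer(y_true, y_pred) can be passed directly, and TPOT decides the optimization direction from the function name. A small hedged usage sketch (metric name and parameters are illustrative):

from sklearn.metrics import mean_absolute_error
from tpot import TPOTRegressor

# Because the name contains 'error', TPOT registers it with
# greater_is_better=False and minimizes it.
def my_absolute_error(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

tpot = TPOTRegressor(generations=5, population_size=20,
                     scoring=my_absolute_error, verbosity=2)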
Example no. 38
File: base.py Project: rhiever/tpot
    def __init__(self, population_size=100, generations=100,
                 mutation_rate=0.9, crossover_rate=0.05,
                 scoring=None, num_cv_folds=3, max_time_mins=None, max_eval_time_mins=5,
                 random_state=None, verbosity=0,
                 disable_update_check=False):
        """Sets up the genetic programming algorithm for pipeline optimization.

        Parameters
        ----------
        population_size: int (default: 100)
            The number of pipelines in the genetic algorithm population. Must
            be > 0. The more pipelines in the population, the slower TPOT will
            run, but it's also more likely to find better pipelines.
        generations: int (default: 100)
            The number of generations to run pipeline optimization for. Must
            be > 0. The more generations you give TPOT to run, the longer it
            takes, but it's also more likely to find better pipelines.
        mutation_rate: float (default: 0.9)
            The mutation rate for the genetic programming algorithm in the range
            [0.0, 1.0]. This tells the genetic programming algorithm how many
            pipelines to apply random changes to every generation. We don't
            recommend that you tweak this parameter unless you know what you're
            doing.
        crossover_rate: float (default: 0.05)
            The crossover rate for the genetic programming algorithm in the
            range [0.0, 1.0]. This tells the genetic programming algorithm how
            many pipelines to "breed" every generation. We don't recommend that
            you tweak this parameter unless you know what you're doing.
        scoring: function or str
            Function used to evaluate the quality of a given pipeline for the
            problem. By default, balanced class accuracy is used for
            classification problems, mean squared error for regression problems.
            TPOT assumes that this scoring function should be maximized, i.e.,
            higher is better.

            Offers the same options as sklearn.model_selection.cross_val_score:

            ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1',
            'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted',
            'precision', 'precision_macro', 'precision_micro', 'precision_samples',
            'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro',
            'recall_samples', 'recall_weighted', 'roc_auc']
        num_cv_folds: int (default: 3)
            The number of folds to evaluate each pipeline over in k-fold
            cross-validation during the TPOT pipeline optimization process
        max_time_mins: int (default: None)
            How many minutes TPOT has to optimize the pipeline. If not None,
            this setting will override the `generations` parameter.
        max_eval_time_mins: int (default: 5)
            How many minutes TPOT has to optimize a single pipeline.
            Setting this parameter to higher values will allow TPOT to explore more complex
            pipelines but will also allow TPOT to run longer.
        random_state: int (default: 0)
            The random number generator seed for TPOT. Use this to make sure
            that TPOT will give you the same results each time you run it
            against the same data set with that seed.
        verbosity: int (default: 0)
            How much information TPOT communicates while it's running.
            0 = none, 1 = minimal, 2 = all
        disable_update_check: bool (default: False)
            Flag indicating whether the TPOT version checker should be disabled.

        Returns
        -------
        None

        """
        if self.__class__.__name__ == 'TPOTBase':
            raise RuntimeError('Do not instantiate the TPOTBase class directly; '
                               'use TPOTRegressor or TPOTClassifier instead.')

        # Prompt the user if their version is out of date
        self.disable_update_check = disable_update_check
        if not self.disable_update_check:
            update_check('tpot', __version__)

        self._hof = None
        self._optimized_pipeline = None
        self._fitted_pipeline = None
        self.population_size = population_size
        self.generations = generations
        self.max_time_mins = max_time_mins
        self.max_eval_time_mins = max_eval_time_mins

        # Schedule TPOT to run for a very long time if the user specifies a run-time
        # limit TPOT will automatically interrupt itself when the timer runs out
        if not (max_time_mins is None):
            self.generations = 1000000

        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.verbosity = verbosity
        self.operators_context = {
            'make_pipeline': make_pipeline,
            'make_union': make_union,
            'VotingClassifier': VotingClassifier,
            'FunctionTransformer': FunctionTransformer
        }

        self._pbar = None
        self._gp_generation = 0

        self.random_state = random_state

        # If the user passed a custom scoring function, store it in the sklearn SCORERS dictionary
        if scoring:
            if hasattr(scoring, '__call__'):
                scoring_name = scoring.__name__

                if 'loss' in scoring_name or 'error' in scoring_name:
                    greater_is_better = False
                else:
                    greater_is_better = True

                SCORERS[scoring_name] = make_scorer(scoring, greater_is_better=greater_is_better)
                self.scoring_function = scoring_name
            else:
                self.scoring_function = scoring

        self.num_cv_folds = num_cv_folds

        self._setup_pset()
        self._setup_toolbox()
                 ("pca", H2OPCA(k=2)),
                 ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))])
pipeline.fit(iris_df[:4],iris_df[4])

# Random CV using H2O and Scikit-learn
from sklearn.grid_search import RandomizedSearchCV
from h2o.cross_validation import H2OKFold
from h2o.model.regression import h2o_r2_score
from sklearn.metrics.scorer import make_scorer
params = {"standardize__center":    [True, False],             # Parameters to test
          "standardize__scale":     [True, False],
          "pca__k":                 [2,3],
          "gbm__ntrees":            [10,20],
          "gbm__max_depth":         [1,2,3],
          "gbm__learn_rate":        [0.1,0.2]}
custom_cv = H2OKFold(iris_df, n_folds=5, seed=42)
pipeline = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPCA(k=2)),
                     ("gbm", H2OGradientBoostingEstimator(distribution="gaussian"))])
random_search = RandomizedSearchCV(pipeline, params,
                                   n_iter=5,
                                   scoring=make_scorer(h2o_r2_score),
                                   cv=custom_cv,
                                   random_state=42,
                                   n_jobs=1)
random_search.fit(iris_df[1:], iris_df[0])
print random_search.best_estimator_