    X = df.drop(["Customer Id", "Cost"], axis=1).values
    y = df['Cost'].values

    regressor = cb.CatBoostRegressor(verbose=0)
    param_grid = {
        "n_estimators": np.arange(100, 800, 100),
        "max_depth": np.arange(1, 20, 1),
        "learning_rate": np.arange(0.01, 0.1, 0.01)
    }

    model = model_selection.RandomizedSearchCV(estimator=regressor,
                                               param_distributions=param_grid,
                                               n_iter=5,
                                               scoring=make_scorer(
                                                   calc_metric.calc_score,
                                                   greater_is_better=True),
                                               verbose=10,
                                               cv=5,
                                               n_jobs=4)
    model.fit(X, np.log(y))
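    # Note: the target is log-transformed before fitting, so predictions from
    # this model need np.exp() to return to the original Cost scale.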

    print()
    print("Best score: ", model.best_score_)
    print()
    print("Best params: ", model.best_params_)
    print()
    print("Best estimator: ", model.best_estimator_)
    print()

# n_estimators=700, max_depth=3, score=-0.715, total= 1.7min
    ###############################################################################
    #selection of RF hyper-parameters by cross validation
    print "Selecting hyper-parameters"

    param_dist = {
        "n_estimators": sp_randint(100, 500),
        "max_features": ['auto', 'sqrt']
    }
    model = ensemble.RandomForestClassifier(class_weight='balanced_subsample',
                                            n_jobs=ncores)

    n_iter_search = 100
    rf_model = model_selection.RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=n_iter_search,
        scoring='accuracy',
        cv=5)  # try both accuracy and precision

    rf_model.fit(X_train, y_train)

    print "Model selected: \"%s\"" % rf_model.best_estimator_

    print "Best score: \"%s\"" % rf_model.best_score_

    print "Best param: \"%s\"" % rf_model.best_params_

    ###############################################################################
    #testing model performance
    print "testing model performance"
Example #3
def train_nn():
    x, y, col_names = get_cleaned_data()
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, test_size=0.3, random_state=0)
    x_train, x_test, col_names = lasso_fs(x_train, y_train, x_test, y_test,
                                          col_names)

    #%% Neural net + grid search
    reg = neural_network.MLPRegressor(hidden_layer_sizes=(50, ),
                                      activation='relu',
                                      solver='lbfgs',
                                      alpha=0.0001,
                                      batch_size='auto',
                                      learning_rate='adaptive',
                                      learning_rate_init=0.001,
                                      power_t=0.5,
                                      max_iter=2000,
                                      shuffle=True,
                                      random_state=0,
                                      tol=0.0001,
                                      verbose=False,
                                      warm_start=False,
                                      momentum=0.9,
                                      nesterovs_momentum=True)

    param_grid = {
        'alpha': [0.001, 0.01],
        'hidden_layer_sizes': [40, 50, 60],
        'activation': ['logistic'],
        'solver': ['lbfgs']
    }
    gscv = model_selection.GridSearchCV(reg,
                                        param_grid,
                                        scoring='neg_mean_absolute_error',
                                        refit=True,
                                        cv=3,
                                        verbose=2,
                                        return_train_score=True)
    #%% Neural net + randomized search
    reg_rn = neural_network.MLPRegressor(hidden_layer_sizes=(50, ),
                                         activation='relu',
                                         solver='lbfgs',
                                         alpha=0.0001,
                                         batch_size='auto',
                                         learning_rate='adaptive',
                                         learning_rate_init=0.001,
                                         power_t=0.5,
                                         max_iter=2000,
                                         shuffle=True,
                                         random_state=0,
                                         tol=0.0001,
                                         verbose=False,
                                         warm_start=False,
                                         momentum=0.9,
                                         nesterovs_momentum=True)

    param_dist = {
        "hidden_layer_sizes": range(2, 100),
        "activation": ['relu', 'logistic'],
        'alpha': [0.001, 0.01, 0.0001],
    }

    rscv = model_selection.RandomizedSearchCV(
        reg_rn,
        param_dist,
        n_iter=20,
        scoring='neg_mean_absolute_error',
        n_jobs=1,
        refit=True,
        cv=None,
        verbose=2,
        pre_dispatch='2*n_jobs',
        random_state=None,
        error_score='raise',
        return_train_score=True)

    print('Using grid search CV')
    gscv.fit(x_train, y_train)
    reg = gscv.best_estimator_
    print(reg)

    reg.fit(x_train, y_train)
    y_pred = reg.predict(x_test)
    mae_n = mean_absolute_error(y_pred, y_test)
    rmse_n = np.sqrt(mean_squared_error(y_pred, y_test))
    print('MAE', mae_n)
    print('RMSE', rmse_n)

    print('Using random search CV')
    rscv.fit(x_train, y_train)
    reg_rn = rscv.best_estimator_
    print(reg_rn)
    reg_rn.fit(x_train, y_train)
    y_pred = reg_rn.predict(x_test)
    mae = mean_absolute_error(y_pred, y_test)
    rmse = np.sqrt(mean_squared_error(y_pred, y_test))
    print('MAE', mae)
    print('RMSE', rmse)
    nn_res = {
        'gcv_mae': mae_n,   # grid search results (computed above)
        'gcv_rmse': rmse_n,
        'rcv_mae': mae,     # randomized search results
        'rcv_rmse': rmse
    }
    pickle.dump(nn_res, open("results/nn_res.p", "wb"))
Example #4
   X = df.drop(['Selling_Price', 'id'], axis=1).values
   y = df.Selling_Price.values

   regressor = ensemble.RandomForestRegressor(n_jobs=-1)

   param_grid = {
      "n_estimators": np.arange(100, 1500, 100),
      "max_depth": np.arange(1, 31),
      "criterion": ["mse", "mae"]
   }

   model = model_selection.RandomizedSearchCV(
      estimator=regressor,
      param_distributions=param_grid,
      n_iter=30,
      verbose=10,
      n_jobs=1,
      cv=5
   )

   model.fit(X, y)
   print(f"best score: {model.best_score_}")

   print("best parameter set: ")
   best_param = model.best_estimator_.get_params()

   for param_name in sorted(param_grid.keys()):
      print(f"\t{param_name} : {best_param[param_name]}")


"""
        "rf__max_depth": np.arange(1, 20),
        "rf__criterion": ["gini", "entropy"]
    }

    # model = model_selection.GridSearchCV(
    #     estimator=classifier,
    #     param_grid=hyper_params_for_grid_search,
    #     n_jobs=1,
    #     cv=5,
    #     verbose=10,
    #     scoring="accuracy"
    # )
    model = model_selection.RandomizedSearchCV(
        estimator=classifier,
        param_distributions=hyper_params_for_pipeline_classifier,
        n_iter=10,
        n_jobs=1,
        verbose=10,
        scoring="accuracy",
        cv=5)
    model.fit(X, y)
    print(model.best_score_)  # gives the best score for the model
    print(
        model.best_estimator_.get_params()
    )  # gives the best params; the main ones are criterion, n_estimators and max_depth
"""
    best params found for the model via GridSearchCV:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 7, 'max_features': 'auto', 'max_leaf_nodes': None,
 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Example #6
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df[all_columns], df[target], test_size=0.15, random_state=42)

model = lgb.LGBMClassifier(n_jobs=-1, random_state=42, metric='auc')

params = {
    "num_leaves": [20, 50],
    "max_depth": [8, 10, 12, 15],
    "n_estimators": [100, 250, 500],
    "learning_rate": [0.01, 0.1, 0.9],
    "subsample": [0.1, 0.20, 0.5, 0.7, 1]
}

search = model_selection.RandomizedSearchCV(model,
                                            params,
                                            cv=3,
                                            scoring='roc_auc',
                                            verbose=5000,
                                            n_iter=100)
search.fit(X_train, y_train)
df_search = pd.DataFrame(search.cv_results_)
##
best_pars = df_search['params'][df_search.rank_test_score == 1].iloc[0]
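# (search.best_params_ gives the same result directly.)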

print(best_pars)

model = lgb.LGBMClassifier(n_jobs=-1,
                           random_state=42,
                           metric='auc',
                           **best_pars)

model.fit(X_train, y_train)
Example #7
        'criterion': ['gini', 'entropy']
    }
    '''
    Initialize the random search:
    estimator is the model we defined above,
    param_distributions is the grid of parameters,
    we use accuracy as our metric,
    a higher value of verbose prints more details,
    cv=5 means we are using 5-fold cross-validation,
    n_iter is the number of parameter settings sampled.
    If param_distributions has all its values as lists,
    random search samples without replacement;
    if any of the parameters come from a distribution,
    random search samples with replacement
    (see the distribution-based sketch after this example).
    '''
    model = model_selection.RandomizedSearchCV(estimator=classifier,
                                               param_distributions=params,
                                               n_iter=20,
                                               scoring='accuracy',
                                               verbose=10,
                                               n_jobs=-1,
                                               cv=5)

    # fit the model and extract best score
    model.fit(x, y)
    print(f'Best score : {model.best_score_}')

    print("Best parameters set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print(f'\t {param_name} : {best_parameters[param_name]}')
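    # A minimal sketch of the distribution-based variant the docstring above
    # describes (the distributions here are illustrative assumptions, not the
    # author's settings): when any parameter comes from a scipy.stats
    # distribution, random search samples with replacement.
    from scipy import stats
    params_dist = {
        'max_depth': stats.randint(1, 20),        # integers drawn uniformly
        'max_features': stats.uniform(0.1, 0.9),  # floats in [0.1, 1.0)
        'criterion': ['gini', 'entropy'],         # plain lists can be mixed in
    }
    model_dist = model_selection.RandomizedSearchCV(estimator=classifier,
                                                    param_distributions=params_dist,
                                                    n_iter=20,
                                                    scoring='accuracy',
                                                    cv=5)
    model_dist.fit(x, y)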
    scoring = make_scorer(f1_score, pos_label=options.positiveClass)

    if options.classifier == "SVM":
        classifier = SVC()
        if options.kernel == 'rbf':
            paramGrid = {'C': scipy.stats.expon(scale=10), 'gamma': scipy.stats.expon(scale=.1),
                         'kernel': ['rbf'], 'class_weight': ['balanced']}
        elif options.kernel == 'linear':
            paramGrid = {'C': scipy.stats.expon(scale=10), 'kernel': ['linear'],
                         'class_weight': ['balanced']}
        elif options.kernel == 'poly':
            paramGrid = {'C': scipy.stats.expon(scale=10), 'gamma': scipy.stats.expon(scale=.1),
                         'degree': [2, 3],
                         'kernel': ['poly'], 'class_weight': ['balanced']}
        classifier_cv = model_selection.RandomizedSearchCV(classifier, paramGrid,
                                                          cv=10, n_jobs=30, verbose=3,
                                                          scoring=scoring, random_state=42)

    if options.classifier == "MLP":
        classifier = MLPClassifier()
        paramGrid = {'hidden_layer_sizes': [(x,) for x in sample(range(30, 101), 2)],
                     'max_iter': sample(range(80, 201), 2)}
        classifier_cv = model_selection.GridSearchCV(classifier, paramGrid,
                                                          cv=10, n_jobs=30, verbose=3,
                                                          scoring=scoring)
    t1 = time()
    print("   Training and cross validation...")
    classifier_cv.fit(matrixTraining, trueTrainingClasses)
    best_score = classifier_cv.best_score_
    best_parameters = classifier_cv.best_estimator_.get_params()
    print("     Training and cross validation done in {:.2} seg".format((time() - t1)))
Example #9
print("\n\n\n----------------------------------- Random Forest -----------------------------------------")


# Random forest
classifier = RandomForestClassifier()
algorithmName = "RandomForest"
paramGrid = {
    'n_estimators': [100, 150, 200, 300],
    'bootstrap': [True, False],
    'criterion': ["gini", "entropy"],
    'class_weight': ['balanced', None],
}

myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid,
                                            cv=crossV, n_jobs=jobs,
                                            scoring=myScorer)


myClassifier.fit(X_train, y_train)
predict = classificator_score(myClassifier, X_test)
print(classification_report(y_test, predict))


print("\n\n\n----------------------------------- SGDClassifier  -----------------------------------------")

classifier = SGDClassifier(loss='log')  # note: 'log' was renamed 'log_loss' in scikit-learn 1.1
algorithmName = "SGDClassifier"
paramGrid = {'alpha': [10**(-x) for x in range(7)],
             'penalty': ['elasticnet', 'l1', 'l2'],
             'l1_ratio': [0.15, 0.25, 0.5, 0.75],
Example #10
from sklearn import svm
import scipy

# Data:Iris
X = datasets.load_iris().data
y = datasets.load_iris().target
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)
""" 1. 网格搜索 """
# 设置超参数网格
hyperparams = {
    'C': [1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['auto', 'scale', 1, 2, 3]
}
# set up the model
clf = model_selection.GridSearchCV(svm.SVC(), hyperparams, cv=10)
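# Note: the search is fit on the full X, y below, so the train/test split above goes unused.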
clf.fit(X, y)
print(f'Best hyperparameters: {clf.best_params_}')
print(f'Top score: {clf.best_score_}')
""" 2. 随机搜索 (从scipy.stats提供的分布类型中,连续、随机挑选一些值作为超参数) """
# 设置超参数分布
hyperparams = {'C': scipy.stats.expon(), 'gamma': scipy.stats.uniform()}
# set up the model
clf = model_selection.RandomizedSearchCV(svm.SVC(),
                                         hyperparams,
                                         cv=10,
                                         random_state=1)
clf.fit(X, y)
print(f'Best hyperparameters: {clf.best_params_}')
print(f'Top score: {clf.best_score_}')
Example #11
def train_cv_model(init_model,
                   X,
                   y,
                   n_splits=10,
                   training_size=0.7,
                   test_size=0.3,
                   search="Grid"):
    #
    cv, indices, cos_theta = data_split.imitate_split(y,
                                                      n_splits,
                                                      training_size,
                                                      test_size,
                                                      cos_theta_lim=0.8)
    print("cos_theta_list:", cos_theta)
    #
    ##    cv = model_selection.ShuffleSplit(n_splits, training_size, test_size,  random_state = 0)
    #
    train_val_data = []
    train_val_scores = []
    train_val_params = []
    model = None
    #
    if search == "Grid":
        optimize_parameters = {
            "alpha": [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
            "max_iter": [200, 500, 1000, 1400, 1700, 2000, 2500, 3000],
            "random_state": [0, 1, 5, 10, 20, 30, 50, 60, 80, 100]
        }
        #
        model = model_selection.GridSearchCV(
            init_model,
            optimize_parameters,
            cv=cv,
            refit="r2",
            scoring=("r2", "neg_mean_squared_error"),
            return_train_score=True,
            n_jobs=4)
    elif search == "Random":
        #
        optimize_parameters = {
            "alpha": scipy_expon(scale=0.0005),
            "max_iter": scipy_randint(200, 3000),
            "random_state": scipy_randint(0, 100)
        }
        #
        model = model_selection.RandomizedSearchCV(
            init_model,
            optimize_parameters,
            refit="r2",
            scoring=("r2", "neg_mean_squared_error"),
            cv=cv,
            n_iter=100,
            return_train_score=True,
            n_jobs=4)
    else:
        raise ValueError('Invalid value for parameter "search"; use "Grid" or "Random".')
    #
    model.fit(X, y)
    ###
    #
    mean_train_r2 = model.cv_results_.get("mean_train_r2")
    mean_val_r2 = model.cv_results_.get("mean_test_r2")
    mean_train_neg_mean_squared_error = model.cv_results_.get(
        "mean_train_neg_mean_squared_error")
    mean_val_neg_mean_squared_error = model.cv_results_.get(
        "mean_test_neg_mean_squared_error")
    train_val_scores.extend([
        mean_train_r2, mean_val_r2, mean_train_neg_mean_squared_error,
        mean_val_neg_mean_squared_error
    ])
    ###
    #
    X_train, X_val = X[indices[0]], X[indices[1]]
    y_train, y_val = y[indices[0]], y[indices[1]]
    prediction_train = model.predict(X_train)
    prediction_val = model.predict(X_val)
    train_val_data.extend(
        [X_train, X_val, y_train, y_val, prediction_train, prediction_val])
    ###
    if search == "Grid":
        train_val_params.extend([optimize_parameters, model.best_params_])
    elif search == "Random":
        tmp_params = {}
        params = model.cv_results_.get("params")
        keys = list(params[0].keys())
        for key in keys:
            tmp_params.update({key: []})

        for params_dict in params:
            for key in keys:
                tmp_params.get(key).append(params_dict.get(key))

        for key in keys:
            tmp_params.get(key).sort()

        train_val_params.extend([tmp_params, model.best_params_])
    #
    return model, train_val_data, train_val_scores, train_val_params
Example #12
def randomized_search_cv(
    X_fit,
    y_fit,
    X_train,
    y_train,
    X_val,
    y_val,
    model,
    params_dist,
    scorer,
    cv,
    n_jobs,
    random_search_params,
    log_residuals,
):
    if random_search_params[0]:
        print(f"\n-------------- Randomized Grid SearchCV started....")
        pprint(f"Parameters' distributions: {params_dist}")
        model_name = type(model).__name__
        # Setup MLflow tracking server
        exp_id = mlflow_set_exp_id("Model:Fit")
        run_name = f"{model_name}-rand"
        ## Enable autologging
        mlflow.sklearn.autolog(log_model_signatures=False)
        print(f"Autologging {model_name} started...")
        # Define RANDOMIZED grid search
        random_search = model_selection.RandomizedSearchCV(
            model,
            param_distributions=params_dist,
            n_iter=random_search_params[1],  # default 10
            scoring=scorer,
            n_jobs=n_jobs,
            cv=cv,
            refit=True,
            return_train_score=True,
            verbose=3,
            random_state=rnd_state,
        )
        ##* Fit model with MLflow logging
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
            tic = time.time()
            model_random_search = random_search.fit(
                X_fit,
                y_fit,
            )
            mins, secs = divmod(time.time() - tic, 60)  # avoid shadowing built-in min()
            ## Disable autologging
            mlflow.sklearn.autolog(disable=True)
            # Log custom metrics and data
            print(f"Randomized grid search took: {int(min)}min {int(sec)}sec")
            print(f"Log custom metrics...")
            log_custom_metrics(model_random_search, X_train, y_train, X_val,
                               y_val)
            if log_residuals:
                log_model_residuals(model_random_search, X_train, y_train,
                                    X_val, y_val)

        print(
            f"Randomized search: Best params are:\n {model_random_search.best_params_}"
        )
        print(f"{model_name.title()}: Random search:")
        print_custom_metrics(model_random_search, X_train, y_train, X_val,
                             y_val)
        winsound.Beep(frequency=2000, duration=300)
        return model, model_random_search.best_estimator_, model_random_search.best_params_
    else:
        print(f"\nSkip a Randomized Grid SearchCV....")
        return model, None, None
Example #13
#----  random forest training with hyperparameter tuning
random_grid = {'n_estimators': [10, 100, 500, 1000],
               'max_features': [0.25, 0.50, 0.75],
               'max_depth': [5, 10, 20, 25],
               'min_samples_split': [10, 20],
               'min_samples_leaf': [5, 7, 10],
               'bootstrap': [True, False],
               'random_state': [random_seed]}

print('> Random Forest classifier...')
optimized_rfc = skms.RandomizedSearchCV(estimator=RandomForestClassifier(),
                                        param_distributions=random_grid,
                                        n_iter=100,
                                        cv=5,
                                        scoring=['roc_auc', 'recall'],
                                        refit='roc_auc',
                                        verbose=1,
                                        n_jobs=-1,
                                        random_state=random_seed)

optimized_rfc.fit(X_train, y_train)
print('\n')


#----  obtaining results of the grid run
cv_results = optimized_rfc.cv_results_
cv_results_df = pd.DataFrame(cv_results)

print('> hyperparameter tuning results')
print(cv_results_df)
Example #14
def hyper_parameter_optimization_example():
    from time import time
    from scipy.stats import randint as sp_randint

    # Get some data.
    digits = datasets.load_digits()
    X, y = digits.data, digits.target

    # Build a classifier.
    clf = ensemble.RandomForestClassifier(n_estimators=20)

    # Utility function to report best scores.
    def report(results, n_top=3):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            for candidate in candidates:
                print('Model with rank: {0}'.format(i))
                print('Mean validation score: {0:.3f} (std: {1:.3f})'.format(
                    results['mean_test_score'][candidate],
                    results['std_test_score'][candidate]))
                print('Parameters: {0}'.format(results['params'][candidate]))
                print('')

    # Specify parameters and distributions to sample from.
    param_dist = {
        'max_depth': [3, None],
        'max_features': sp_randint(1, 11),
        'min_samples_split': sp_randint(2, 11),
        'min_samples_leaf': sp_randint(1, 11),
        'bootstrap': [True, False],
        'criterion': ['gini', 'entropy'],
    }

    # Run randomized search.
    n_iter_search = 20
    random_search = model_selection.RandomizedSearchCV(
        clf, param_distributions=param_dist, n_iter=n_iter_search)

    start = time()
    random_search.fit(X, y)
    print(
        'RandomizedSearchCV took %.2f seconds for %d candidate parameter settings.'
        % ((time() - start), n_iter_search))
    report(random_search.cv_results_)

    # Use a full grid over all parameters.
    param_grid = {
        'max_depth': [3, None],
        'max_features': [1, 3, 10],
        'min_samples_split': [2, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'bootstrap': [True, False],
        'criterion': ['gini', 'entropy'],
    }

    # Run grid search.
    #os.environ["OMP_NUM_THREADS"] = "2"
    grid_search = model_selection.GridSearchCV(clf,
                                               param_grid=param_grid,
                                               verbose=1,
                                               n_jobs=2)
    start = time()
    grid_search.fit(X, y)
    print(
        'GridSearchCV took %.2f seconds for %d candidate parameter settings.' %
        (time() - start, len(grid_search.cv_results_['params'])))
    report(grid_search.cv_results_)
Example #15
    classifier= pipeline.Pipeline(
            [
                ('scaling',  scl),
                ('pca', pca),
                ('rf', rf)
            ]
        )

    param_grid = {
        "pca__n_components": np.arange(5, 10),
        "rf__n_estimators": np.arange(100, 1500, 100), # for grid search: [100, 200, 300, 400], 
        "rf__max_depth": np.arange(1, 20),             # for grid search: [1, 3, 5, 7, 9, 11],
        "rf__criterion": ['gini', 'entropy']

    }

    model = model_selection.RandomizedSearchCV(
        estimator=classifier,
        param_distributions=param_grid,
        n_iter=15,
        scoring="accuracy",
        verbose=10,
        n_jobs=1,
        cv=5
    )

    model.fit(X, y)

    print(model.best_score_)
    print(model.best_estimator_.get_params())
Example #16
def hyper_parameter_tuning_example():
    # Hyper-parameters are parameters that are not directly learnt within estimators.
    # In scikit-learn they are passed as arguments to the constructor of the estimator classes.
    # Typical examples include C, kernel and gamma for Support Vector Classifier, alpha for Lasso, etc.
    # It is possible and recommended to search the hyper-parameter space for the best cross validation score.
    # Any parameter provided when constructing an estimator may be optimized in this manner.
    # Specifically, to find the names and current values for all parameters for a given estimator:
    #	estimator.get_params()
    """
	parameters = {
		'C': scipy.stats.expon(scale=100),
		'gamma': scipy.stats.expon(scale=0.1),
		'kernel': ['rbf'],
		'class_weight':['balanced', None],
	}
	parameters = {
		'C': utils.fixes.loguniform(1e0, 1e3),
		'gamma': utils.fixes.loguniform(1e-4, 1e-3),
		'kernel': ['rbf'],
		'class_weight':['balanced', None]
	}
	"""

    #--------------------
    # Exhaustive grid search.
    # REF [site] >> https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

    if True:
        iris = datasets.load_iris()

        #param_grid = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
        param_grid = [
            {
                'C': [1, 10, 100, 1000],
                'kernel': ['linear']
            },
            {
                'C': [1, 10, 100, 1000],
                'gamma': [0.001, 0.0001],
                'kernel': ['rbf']
            },
        ]
        clf = svm.SVC()
        search = model_selection.GridSearchCV(clf, param_grid)
        search.fit(iris.data, iris.target)

        print('CV keys = {}.'.format(sorted(search.cv_results_.keys())))
        print(pd.DataFrame(search.cv_results_))
        print('Best params: {}.'.format(search.best_params_))
        print('Best estimator: {}.'.format(search.best_estimator_))
        print('Best score = {}.'.format(search.best_score_))

    #--------------------
    # Randomized search.
    # REF [site] >> https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

    if True:
        iris = datasets.load_iris()

        clf = linear_model.LogisticRegression(solver='saga',
                                              tol=1e-2,
                                              max_iter=200,
                                              random_state=0)
        param_distributions = {
            'C': scipy.stats.uniform(loc=0, scale=4),
            'penalty': ['l2', 'l1']
        }
        search = model_selection.RandomizedSearchCV(clf,
                                                    param_distributions,
                                                    random_state=0)
        search = search.fit(iris.data, iris.target)

        print('CV keys = {}.'.format(sorted(search.cv_results_.keys())))
        print(pd.DataFrame(search.cv_results_))
        print('Best params: {}.'.format(search.best_params_))
        print('Best estimator: {}.'.format(search.best_estimator_))
        print('Best score = {}.'.format(search.best_score_))

    #--------------------
    # Randomized parameter optimization.
    # REF [site] >> https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html

    if True:
        X, y = datasets.load_digits(return_X_y=True, n_class=3)

        # Build a classifier.
        clf = linear_model.SGDClassifier(loss="hinge",
                                         penalty="elasticnet",
                                         fit_intercept=True)

        # Utility function to report best scores.
        def report(results, n_top=3):
            for i in range(1, n_top + 1):
                candidates = np.flatnonzero(results["rank_test_score"] == i)
                for candidate in candidates:
                    print("Model with rank: {0}".format(i))
                    print(
                        "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                            results["mean_test_score"][candidate],
                            results["std_test_score"][candidate]))
                    print("Parameters: {0}".format(
                        results["params"][candidate]))
                    print("")

        # Specify parameters and distributions to sample from.
        param_dist = {
            "average": [True, False],
            "l1_ratio": scipy.stats.uniform(0, 1),
            "alpha": utils.fixes.loguniform(1e-2, 1e0),
        }

        # Run randomized search.
        n_iter_search = 15
        random_search = model_selection.RandomizedSearchCV(
            clf, param_distributions=param_dist, n_iter=n_iter_search)

        start = time.time()
        random_search.fit(X, y)
        print(
            "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
            % ((time.time() - start), n_iter_search))
        report(random_search.cv_results_)
        #print('CV keys = {}.'.format(sorted(random_search.cv_results_.keys())))
        #print(pd.DataFrame(random_search.cv_results_))

        # Use a full grid over all parameters.
        param_grid = {
            "average": [True, False],
            "l1_ratio": np.linspace(0, 1, num=10),
            "alpha": np.power(10, np.arange(-2, 1, dtype=float)),
        }

        # Run grid search.
        grid_search = model_selection.GridSearchCV(clf, param_grid=param_grid)
        start = time.time()
        grid_search.fit(X, y)

        print(
            "GridSearchCV took %.2f seconds for %d candidate parameter settings."
            % (time.time() - start, len(grid_search.cv_results_["params"])))
        report(grid_search.cv_results_)
Example #17
    def get_best_model(self,
                       model_name,
                       binary=False,
                       standarize=False,
                       feature_sel=False):
        """
        Train all the classifiers.

        Parameters
        ----------
        model_name : string
            Name of the model to test.
            The implemented ones are
            `svc`: Support Vector Machines
            `logit`: Logistic Regression
            `forest`: Random Forest
            `knn`: K-Nearest Neighbor
        binary : bool
            If true the variables are converted to 0 / 1 (no porpoise / porpoise) instead of noise/lq/hq clicks
        standarize : bool
            Set to True if the variables should be standardized before training
        feature_sel : bool
            Set to True if the best features should be selected instead of all of them

        Returns
        -------
        Dictionary with the name of the model as key and another dictionary as value
        with ind_vars, model, binary as keys and their respective representations in values
        (It also adds it to the property "models" of the class)
        """
        x = self.train_data[self.ind_vars]
        y = self.train_data[self.dep_var]
        if binary:
            # Convert the classes in 0 (no porpoise) or 1 (porpoise)
            y = self.convert2binary(y)

        # If standardization is requested, append a scaler to the pipeline steps
        steps = []
        if standarize:
            # Standardize the data
            scaler = preprocessing.StandardScaler()
            steps.append(('scaler', scaler))

        # Some common parameters
        tol = 1e-3
        gamma = utils.fixes.loguniform(1e-4, 1000)
        c_values = utils.fixes.loguniform(0.1, 1000)
        class_weight = ['balanced', None]

        # Get the model
        if model_name == 'svc':
            # List all the possible parameters that want to be checked
            kernel_list = ['poly', 'rbf']
            degree = stats.randint(1, 4)
            param_distr = {
                'degree': degree,
                'C': c_values,
                'gamma': gamma,
                'kernel': kernel_list
            }

            # Classifier with fixed values
            clf = svm.SVC(tol=tol,
                          cache_size=500,
                          probability=True,
                          max_iter=500)

        elif model_name == 'logit':
            # List all the possible parameters that want to be checked
            penalty = ['l1', 'l2', 'elasticnet', 'none']  # note: 'l1'/'elasticnet' require a compatible solver such as 'saga'
            param_distr = {
                'penalty': penalty,
                'C': c_values,
                'class_weight': class_weight
            }
            # Classifier with fixed values
            clf = linear_model.LogisticRegression()

        elif model_name == 'forest':
            # List all the possible parameters that want to be checked
            n_estimators = stats.randint(100, 300)
            param_distr = {'n_estimators': n_estimators}
            # Classifier with fixed values
            clf = ensemble.RandomForestClassifier()

        elif model_name == 'knn':
            # List all the possible parameters that want to be checked
            n_neighbors = stats.randint(2, 9)
            algorithm = ['auto', 'ball_tree', 'kd_tree']
            param_distr = {'n_neighbors': n_neighbors, 'algorithm': algorithm}
            # Classifier with fixed values
            clf = neighbors.KNeighborsClassifier()

        else:
            raise Exception('%s is not implemented!' % model_name)

        if feature_sel:
            # selection = feature_selection.RFECV(estimator=svm.LinearSVC(), step=1, scoring='roc_auc')
            selection = feature_selection.SelectFromModel(
                ensemble.ExtraTreesClassifier(n_estimators=50))
            # selection = feature_selection.SelectFromModel(svm.LinearSVC())
            # Add the feature selection to the steps
            steps.append(('feature_selection', selection))

        # Search for the best parameters
        gm_cv = model_selection.RandomizedSearchCV(
            estimator=clf,
            scoring='roc_auc',
            param_distributions=param_distr,
            n_iter=100)
        steps.append(('classification', gm_cv))

        # Create pipeline and fit
        model = pipeline.Pipeline(steps)
        model.fit(x, y)

        if feature_sel:
            ind_vars = model['feature_selection'].transform(
                self.test_data[self.ind_vars])
        else:
            ind_vars = self.ind_vars

        print(model['classification'].best_estimator_)
        self.models[model_name] = {
            'ind_vars': ind_vars,
            'model': model,
            'binary': binary
        }

        # Save the model as a pickle file!
        pickle.dump(model, open('pyporcc/models/%s.pkl' % model_name, 'wb'))
        return self.models[model_name]
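    # A hypothetical usage sketch (the instance name `pc` is an assumption,
    # not from the original):
    #   best = pc.get_best_model('forest', binary=True, standarize=True)
    #   print(best['model'], best['binary'])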
Example #18
   X = data.drop(['Price', 'kfold'], axis=1).values
   y = data.Price.values

   forest = ensemble.RandomForestRegressor(n_jobs=-1)

   params = {
      "n_estimators": np.arange(100, 1500, 100),
      "criterion" : ["mse", "mae"],
      "max_depth": np.arange(1, 31)
   }

   model = model_selection.RandomizedSearchCV(
      estimator=forest,
      cv=5,
      verbose=10,
      param_distributions=params,
      n_iter=20,
      n_jobs=1,
   )

   model.fit(X, y)
   print(f"best score = {model.best_score_}")

   print(f"best params = {model.best_params_}")


"""
didn't get the params as the model took 20 mins to train for 1 iter - 1 fold,
if you got a high processing CPU, feel free to check it

"""
# get the logic/model learned by the algorithm
# issue: not human-readable
print(final_estimator.tree_)

# get a readable tree structure from the tree_ object
# visualize the decision tree
dot_data = io.StringIO() 
tree.export_graphviz(final_estimator, out_file = dot_data, feature_names = X_train.columns)
graph = pydot.graph_from_dot_data(dot_data.getvalue())[0] 
os.chdir("C:\\Users\\vesuraju\\OneDrive - DXC Production\\Venkat\\Personal\\Trainings\\ML\\Classes_Year 2020\\Codes_2020\\Datasets\\Submissions")
graph.write_pdf("tree_GridsearchCV.pdf")


#Random search
dt_rand_estimator = model_selection.RandomizedSearchCV(dt_estimator, dt_grid, cv=10, n_iter=20)
dt_rand_estimator.fit(X_train, y_train)

#access the results
print(dt_rand_estimator.best_params_)
print(dt_rand_estimator.best_score_)
final_estimator_rand = dt_rand_estimator.best_estimator_
results = dt_rand_estimator.cv_results_
print(results.get("mean_test_score"))
print(results.get("mean_train_score"))
print(results.get("params"))

# get the logic/model learned by the algorithm
# issue: not human-readable
print(final_estimator_rand.tree_)
Example #20
                paramGrid = {
                    'C': scipy.stats.expon(scale=100),
                    'kernel': ['linear'],
                    'class_weight': ['balanced', None]
                }
            elif args.kernel == 'poly':
                paramGrid = {
                    'C': scipy.stats.expon(scale=100),
                    'gamma': scipy.stats.expon(scale=.1),
                    'degree': [2, 3],
                    'kernel': ['poly'],
                    'class_weight': ['balanced', None]
                }
            myClassifier = model_selection.RandomizedSearchCV(classifier,
                                                              paramGrid,
                                                              n_iter=nIter,
                                                              cv=crossV,
                                                              n_jobs=jobs,
                                                              verbose=3)

        elif args.classifier == 'BernoulliNB':
            # BernoulliNB
            classifier = BernoulliNB()
            paramGrid = {'alpha': scipy.stats.expon(scale=1.0)}
            myClassifier = model_selection.RandomizedSearchCV(classifier,
                                                              paramGrid,
                                                              n_iter=nIter,
                                                              cv=crossV,
                                                              n_jobs=jobs,
                                                              verbose=3,
                                                              scoring=myScorer)
        elif args.classifier == 'MultinomialNB':
Example #21
# load data
train = pd.read_csv("train_macro.csv")
test = pd.read_csv("test_macro.csv")

# features which will be used
features = [col for col in train.columns if
            col not in ['id', 'timestamp', 'price_doc', 'price_log', 'price_per_sq']]
"""
Model 2 - Random Forest
"""
rf_param_distr = dict(n_estimators=scipy.stats.randint(1, 300 + 1),
                      max_features=scipy.stats.uniform(loc=0.1, scale=0.9),
                      max_depth=scipy.stats.randint(1, 20 + 1),
                      min_samples_split=scipy.stats.randint(2, 20 + 1),
                      min_samples_leaf=scipy.stats.randint(1, 30 + 1))

rf_rand_param_search = model_selection.RandomizedSearchCV(estimator=ensemble.RandomForestRegressor(),
                                                          param_distributions=rf_param_distr,
                                                          n_iter=200,
                                                          n_jobs=2,
                                                          cv=5,
                                                          verbose=20)

rf_rand_param_search.fit(train[features].values, train.price_doc.values)
psr_rf = param_search_res(rf_rand_param_search.cv_results_)
pickle.dump(psr_rf, open("psr_rf", "wb"))

print(rf_rand_param_search.best_params_)
# best score = 0.67122194