def evalModel(data, labels):
    """Run a randomized hyper-parameter search for an XGBoost regressor.

    Sixty candidate settings are sampled from ``param_dist`` and evaluated
    with 5-fold cross-validation.  Candidates are scored with ``get_rmsle``
    (defined elsewhere) wrapped by ``make_scorer(greater_is_better=False)``.
    The five best candidates are then handed to ``report``.

    Args:
        data: Feature matrix passed to ``RandomizedSearchCV.fit``.
        labels: Targets aligned with ``data``.
    """
    loss = make_scorer(get_rmsle, greater_is_better=False)
    seed1 = 42
    clf = xgb.XGBRegressor(seed=seed1, silent=True)

    # scipy's uniform(loc, scale) samples from [loc, loc + scale];
    # randint(low, high) samples the integers low .. high - 1.
    param_dist = {
        "learning_rate": sp_uniform(0.01, 0.1),
        "n_estimators": sp_randint(50, 500),
        "max_depth": sp_randint(2, 6),
        "subsample": sp_uniform(0.5, 0.4),
        "max_delta_step": sp_uniform(1, 2),
        "min_child_weight": sp_uniform(1, 6),
        "colsample_bytree": sp_uniform(0.8, 0.2)
    }

    n_iter_search = 60
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       cv=5,
                                       scoring=loss,
                                       n_iter=n_iter_search,
                                       n_jobs=-1,
                                       pre_dispatch='n_jobs',
                                       verbose=2)
    # The search has to be fitted before any scores exist; without this call
    # the attribute read below fails and `data` / `labels` are never used.
    random_search.fit(data, labels)
    # NOTE(review): `grid_scores_` only exists in older scikit-learn
    # releases (newer ones expose `cv_results_`); kept because `report`
    # is defined elsewhere and its expected input is not visible here.
    report(random_search.grid_scores_, n_top=5)
Ejemplo n.º 2
0
    def fit(self, x_train, y_train):
        """Tune an RBF-kernel SVR per output with a randomized search and fit.

        ``C`` and ``gamma`` are sampled from uniform distributions (which
        ``self.kwargs`` may override or extend).  The search is wrapped in a
        ``MultiOutputRegressor`` and handed to
        ``self._update_pipeline_and_fit`` together with the training data.

        Args:
            x_train: Training features.
            y_train: Training targets.
        """
        self.processing_steps = [StandardScaler()]
        svr = SVR(kernel='rbf', gamma=0.1)

        # Background on sensible search ranges for C and gamma:
        #   http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
        #   https://stats.stackexchange.com/questions/43943/
        #   which-search-range-for-determining-svm-optimal-c-
        #   and-gamma-parameters
        # scipy's uniform(loc, scale) samples from [loc, loc + scale], so C is
        # drawn from [0.125, 32.125] and gamma from [0.03125, 8.03125]
        # (roughly 2**-3 .. 2**5 and 2**-5 .. 2**3).
        params = {"C": sp_uniform(0.125, 32), "gamma": sp_uniform(0.03125, 8)}
        # Caller-supplied entries take precedence over the defaults above.
        params.update(self.kwargs)

        reg = RandomizedSearchCV(estimator=svr,
                                 param_distributions=params,
                                 n_iter=10,
                                 scoring=self.score['function'],
                                 cv=3,
                                 iid=True)

        # One independent search is run for each target column.
        clf = MultiOutputRegressor(reg)
        self._update_pipeline_and_fit(x_train, y_train, [clf])
Ejemplo n.º 3
0
def params_optimize(x_train, y_train):
    """Random-search LightGBM classifier hyper-parameters, scored by ROC AUC.

    A stratified 10% slice of the input is held out and passed to the
    estimator's ``fit`` as the early-stopping evaluation set; the remaining
    90% is used by a 5-fold ``RandomizedSearchCV`` with 200 candidates.  The
    best score and parameters are printed.

    Args:
        x_train: Feature matrix.
        y_train: Class labels (also used to stratify the hold-out split).
    """
    x_fit, x_valid, y_fit, y_valid = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.10,
                                                      stratify=y_train)
    # Extra keyword arguments forwarded to the estimator's fit() for every
    # candidate / fold.
    fit_params = {
        "early_stopping_rounds": 30,
        "eval_metric": 'auc',
        "eval_set": [(x_valid, y_valid)],
        'eval_names': ['valid'],
        'verbose': 100,
        'categorical_feature': 'auto'
    }

    # scipy's uniform(loc, scale) samples from [loc, loc + scale].
    param_test = {
        'num_leaves': sp_randint(6, 50),
        'min_child_samples': sp_randint(100, 500),
        'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        # Was scale=0.9, i.e. up to 1.1; a row-subsampling fraction above
        # 1.0 is not a valid value, so cap the range at [0.2, 1.0].
        'subsample': sp_uniform(loc=0.2, scale=0.8),
        'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
        'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
    }
    hp_points_to_test = 200

    # Base estimator.  (An earlier `lgb.LGBMClassifier(...)` assignment was
    # overwritten by this one before use and has been dropped.)  The searched
    # keys above override the matching values given here.
    clf = LGBMClassifier(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1,
    )

    gs = RandomizedSearchCV(estimator=clf,
                            param_distributions=param_test,
                            n_iter=hp_points_to_test,
                            scoring='roc_auc',
                            cv=5,
                            refit=True,
                            verbose=True)

    gs.fit(x_fit, y_fit, **fit_params)
    print('Best score reached: {} with params: {} '.format(
        gs.best_score_, gs.best_params_))
Ejemplo n.º 4
0
def train_light_gbm_regressor(X, y, cv, n_params, test_size=.2, n_jobs=-1):
    """Pick LightGBM regressor hyper-parameters by manual random search.

    The data is split into a training part and a validation part.  For each
    of ``n_params`` sampled parameter sets, a model is trained on every
    ``KFold`` split of the training part with early stopping on the held-out
    fold, and the fold-wise best ``l2`` scores are averaged.  The parameter
    set with the lowest mean score is then refitted on the whole training
    part, early-stopped against the validation part.

    Args:
        X: Feature array; indexed with integer index arrays.
        y: Target array; indexed with integer index arrays.
        cv: Number of ``KFold`` splits.
        n_params: Number of parameter sets to sample.
        test_size: Fraction passed to ``train_test_split``.
        n_jobs: Forwarded to ``LGBMRegressor``.

    Returns:
        The refitted ``LGBMRegressor``.
    """
    search_space = {
        'num_leaves': sp_randint(6, 50),
        'min_child_samples': sp_randint(100, 500),
        'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        'subsample': sp_uniform(loc=0.2, scale=0.8),
        'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
        'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
    }

    def _build(candidate):
        # Every model shares the same fixed settings plus the sampled ones.
        return LGBMRegressor(n_jobs=n_jobs,
                             silent=True,
                             n_estimators=5000,
                             **candidate)

    Xt, Xv, yt, yv = train_test_split(X, y, test_size=test_size)

    candidates = list(ParameterSampler(search_space, n_iter=n_params))
    folds = KFold(n_splits=cv)

    mean_scores = []
    for candidate in candidates:
        fold_scores = []
        for fit_idx, held_idx in folds.split(Xt, yt):
            fold_model = _build(candidate)
            fold_model.fit(Xt[fit_idx],
                           yt[fit_idx],
                           eval_set=(Xt[held_idx], yt[held_idx]),
                           verbose=False,
                           early_stopping_rounds=300)
            fold_scores.append(fold_model.best_score_['valid_0']['l2'])
        mean_scores.append(np.mean(fold_scores))

    # Lowest mean l2 wins.
    winner = candidates[np.argmin(mean_scores)]
    model = _build(winner)
    model.fit(Xt,
              yt,
              eval_set=(Xv, yv),
              verbose=False,
              early_stopping_rounds=500)

    return model
def hyperparameter_seach(train_x, train_y):
    """Random-search LightGBM classifier hyper-parameters and print the best.

    One hundred parameter sets are evaluated with 3-fold cross-validation.
    The keyword arguments in ``fit_params`` are forwarded to the estimator's
    ``fit`` for every candidate and fold.

    Args:
        train_x: Training features; must contain the columns listed under
            ``categorical_feature``.
        train_y: Training labels.
    """
    import lightgbm as lgb
    from scipy.stats import randint as sp_randint
    from scipy.stats import uniform as sp_uniform
    from sklearn.model_selection import RandomizedSearchCV

    fit_params = {
        "early_stopping_rounds": 30,
        "eval_metric": 'multiclass',
        # NOTE(review): early stopping is evaluated on the training data
        # itself (labelled 'valid'); confirm this is intended.
        "eval_set": [(train_x, train_y)],
        'eval_names': ['valid'],
        'verbose': 100,
        'categorical_feature': [
            'max_dist_mode', 'min_dist_mode', 'max_price_mode',
            'min_price_mode', 'max_eta_mode', 'min_eta_mode', 'first_mode',
            'weekday', 'hour'
        ]
    }

    # scipy's uniform(loc, scale) samples from [loc, loc + scale].
    param_test = {
        'num_leaves': sp_randint(6, 50),
        'min_child_samples': sp_randint(100, 500),
        'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        'subsample': sp_uniform(loc=0.2, scale=0.8),
        'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
        'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
    }

    # Number of hyper-parameter points to test.
    n_HP_points_to_test = 100

    # n_estimators is only an upper bound; the number of trees actually
    # built depends on early stopping.
    clf = lgb.LGBMClassifier(max_depth=-1,
                             random_state=314,
                             silent=True,
                             n_jobs=4,
                             n_estimators=5000)
    gs = RandomizedSearchCV(estimator=clf,
                            param_distributions=param_test,
                            n_iter=n_HP_points_to_test,
                            # The plain 'f1' scorer only supports binary
                            # targets, while the model is evaluated with the
                            # 'multiclass' metric above; use the weighted
                            # multi-class F1 instead.
                            scoring='f1_weighted',
                            cv=3,
                            refit=True,
                            random_state=314,
                            verbose=True)

    gs.fit(train_x, train_y, **fit_params)
    print('Best score reached: {} with params: {} '.format(
        gs.best_score_, gs.best_params_))
Ejemplo n.º 6
0
def random_search():
    """Random-search neural-network hyper-parameters and report log losses.

    Loads the arrays stored in ``DATA_FILE``, runs a 40-candidate
    ``RandomizedSearchCV`` over ``NeuralNetworkClassifier`` settings, prints
    the timing, the search report and the log loss on the train, validation
    and full data.

    Returns:
        The log loss on the validation data.
    """
    from time import time
    from scipy.stats import uniform as sp_uniform, randint as sp_randint
    # NOTE(review): these are the legacy scikit-learn module paths; the
    # ShuffleSplit(n=..., n_iter=...) call below uses that legacy signature,
    # so they are left as they are.
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.cross_validation import ShuffleSplit

    crimes = np.load(DATA_FILE)
    # Union of the labels seen in the training and validation parts.
    all_labels = sorted(set(np.unique(crimes['labels_train']))
                        | set(np.unique(crimes['labels_val'])))
    batch_size = 64

    labels_train = create_labels(crimes['labels_train'], all_labels)
    labels_vals = create_labels(crimes['labels_val'], all_labels)
    labels_full = create_labels(crimes['labels'], all_labels)

    # scipy's uniform(loc, scale) samples from [loc, loc + scale];
    # randint(low, high) samples the integers low .. high - 1.
    param_dist = {'layers': sp_randint(1, 3),
                  "hidden_units": [64, 128, 256],
                  'input_dropout': sp_uniform(0, 0.5),
                  "hidden_dropout": sp_uniform(0, 0.75),
                  "learning_rate": sp_uniform(0.01, 0.1),
                  "weight_decay": sp_uniform(0, 0.01)
                  }

    model = NeuralNetworkClassifier(n_classes=len(all_labels), batch_size=batch_size,
                                    valid_set=(crimes['features_val'], labels_vals))

    n_iter_search = 40
    np.random.seed(42)

    # A single "split" with an empty test part: every candidate is trained
    # on all training rows.  NOTE(review): presumably the model scores itself
    # against `valid_set` -- confirm against NeuralNetworkClassifier.
    random_searcher = RandomizedSearchCV(model, param_distributions=param_dist, scoring=None,
                                         n_iter=n_iter_search, random_state=42, error_score=100,
                                         verbose=5,
                                         cv=ShuffleSplit(n=crimes['features_train'].shape[0], n_iter=1, test_size=0))

    start = time()
    random_searcher.fit(crimes['features_train'], labels_train.ravel())

    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_searcher.grid_scores_)

    loss_train = log_loss(labels_train, random_searcher.predict_proba(crimes['features_train']))
    loss_val = log_loss(labels_vals, random_searcher.predict_proba(crimes['features_val']))
    loss_all = log_loss(labels_full, random_searcher.predict_proba(crimes['features']))

    # Formatted in one string so the output is the same whether `print` is
    # the Python 2 statement or the Python 3 function (the original
    # `print 'x: ', v` statements are a syntax error on Python 3).
    print('loss_all:  {}'.format(loss_all))
    print('loss_train:  {}'.format(loss_train))
    print('loss_val:  {}'.format(loss_val))

    return loss_val
Ejemplo n.º 7
0
    def fit(self, x_train, y_train):
        """Tune an ``MLPRegressor`` with a randomized search and fit it.

        Default search distributions are merged with ``self.kwargs`` (the
        latter win).  A ``hidden_layer_sizes`` entry in ``self.kwargs`` is
        first run through ``self.parsefunction`` and stored back in place.
        The search object is then handed to ``self._update_pipeline_and_fit``.

        Args:
            x_train: Training features.
            y_train: Training targets.
        """
        self.processing_steps = [StandardScaler()]

        # Only the lbfgs solver and relu activation are searched; other
        # activations are 'identity', 'logistic' and 'tanh'.
        search_space = dict(
            hidden_layer_sizes=sp_randint(20, 150),
            alpha=sp_uniform(0, 100),
            max_iter=sp_randint(100, 2000),
            solver=['lbfgs'],
            activation=['relu'],
        )

        layer_key = 'hidden_layer_sizes'
        if layer_key in self.kwargs:
            parsed = self.parsefunction(self.kwargs[layer_key])
            self.kwargs[layer_key] = parsed

        search_space.update(self.kwargs)

        searcher = RandomizedSearchCV(estimator=MLPRegressor(),
                                      param_distributions=search_space,
                                      n_iter=10,
                                      scoring=self.score['function'],
                                      cv=3,
                                      iid=True)

        self._update_pipeline_and_fit(x_train, y_train, [searcher])
def get_random_grid_CV_params():
    """Define the Random Grid Search parameters for each model.

    Returns:
        dict: Maps a model label (``'Logistic'``, ``'RandomForest'``,
        ``'AdaBoost_DT'``, ``'GBC'``, ``'SVC'``) to a dict of parameter name
        -> list of values or scipy distribution to sample from.
    """
    # scipy conventions used below: expon(loc, scale) is shifted by `loc`,
    # uniform(loc, scale) covers [loc, loc + scale] and randint(low, high)
    # yields the integers low .. high - 1.
    logit_params = {"C": sp_expon(loc=0.001, scale=1),
                    "fit_intercept": [True, False],
                    "intercept_scaling": sp_randint(1, 5),
                    "warm_start": [False, True]
                    }
    # An integer min_samples_split must be at least 2 (a node cannot be
    # split into two children with fewer samples), so the range starts at 2
    # rather than 1.
    rf_params = {"min_samples_split": sp_randint(2, 50),
                 "min_samples_leaf": sp_randint(1, 50),
                 "criterion": ["gini", "entropy"],
                 "class_weight": ['balanced', 'balanced_subsample']
                 }
    ada_dt_params = {"learning_rate": sp_expon(loc=0.001, scale=1.5),
                     "algorithm": ['SAMME.R', 'SAMME']
                     }
    gbc_params = {"learning_rate": sp_expon(loc=0.001, scale=0.5),
                  "subsample": sp_uniform(loc=0.2, scale=0.8),
                  "max_features": [None, 'auto'],
                  "max_depth": sp_randint(2, 6),
                  }
    svc_params = {"C": sp_expon(loc=0.001, scale=2),
                  "kernel": ['rbf', 'poly'],
                  "degree": sp_randint(2, 10),
                  "coef0": [0, 1, 2],
                  "shrinking": [True, False]
                  }
    rnd_CV_param_distributions = {'Logistic': logit_params,
                                  'RandomForest': rf_params,
                                  'AdaBoost_DT': ada_dt_params,
                                  'GBC': gbc_params,
                                  'SVC': svc_params
                                  }
    return rnd_CV_param_distributions
def evalModel(data, labels):
    """Run a randomized hyper-parameter search for an XGBoost regressor.

    Sixty candidate settings are sampled from ``param_dist`` and evaluated
    with 5-fold cross-validation.  Candidates are scored with ``get_rmsle``
    (defined elsewhere) wrapped by ``make_scorer(greater_is_better=False)``.
    The five best candidates are then handed to ``report``.

    Args:
        data: Feature matrix passed to ``RandomizedSearchCV.fit``.
        labels: Targets aligned with ``data``.
    """
    loss = make_scorer(get_rmsle, greater_is_better=False)
    seed1 = 42
    clf = xgb.XGBRegressor(seed=seed1, silent=True)

    # scipy's uniform(loc, scale) samples from [loc, loc + scale];
    # randint(low, high) samples the integers low .. high - 1.
    param_dist = {"learning_rate": sp_uniform(0.01, 0.1),
                  "n_estimators": sp_randint(50, 500),
                  "max_depth": sp_randint(2, 6),
                  "subsample": sp_uniform(0.5, 0.4),
                  "max_delta_step": sp_uniform(1, 2),
                  "min_child_weight": sp_uniform(1, 6),
                  "colsample_bytree": sp_uniform(0.8, 0.2)}

    n_iter_search = 60
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist, cv=5, scoring=loss,
                                       n_iter=n_iter_search, n_jobs=-1, pre_dispatch='n_jobs', verbose=2)
    # The search has to be fitted before any scores exist; without this call
    # the attribute read below fails and `data` / `labels` are never used.
    random_search.fit(data, labels)
    # NOTE(review): `grid_scores_` only exists in older scikit-learn
    # releases (newer ones expose `cv_results_`); kept because `report`
    # is defined elsewhere and its expected input is not visible here.
    report(random_search.grid_scores_, n_top=5)
Ejemplo n.º 10
0
def get_param_distribution_for_model(model_str, iter_count):
    """Return the random-search parameter distributions for a model type.

    Args:
        model_str: Model code: ``'ET'`` or ``'RF'`` (tree ensembles),
            ``'GP'``, ``'KNN'`` or ``'SVR'``.  Any other value yields an
            empty dict.
        iter_count: Unused; kept so existing callers keep working.

    Returns:
        dict: Parameter name -> list of values or scipy distribution.
    """
    pdist = {}

    # scipy conventions: randint(low, high) yields the integers
    # low .. high - 1 and uniform(loc, scale) covers [loc, loc + scale].
    if model_str in ['ET', 'RF']:
        pdist['n_estimators'] = sp_randint(100, 500)
        pdist['max_features'] = [
            0.15, 0.2, 0.25, 0.3, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65,
            0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1
        ]
        # An integer min_samples_split must be at least 2, so the range
        # starts at 2 rather than 1.
        pdist['min_samples_split'] = sp_randint(2, 15)
        pdist['min_samples_leaf'] = sp_randint(1, 15)
        pdist['bootstrap'] = [True, False]
    elif model_str == 'GP':
        # Only the correlation model is searched.  Per the original author's
        # note, the theta0 / thetaL / thetaU / random_start parameters accept
        # either single values or array-likes, which confused
        # RandomizedSearchCV, so they are left out.
        pdist['corr'] = [
            'absolute_exponential', 'squared_exponential', 'cubic', 'linear'
        ]
    elif model_str == 'KNN':
        pdist['weights'] = ['uniform', 'distance']
        pdist['metric'] = ['euclidean', 'manhattan', 'chebyshev']
        pdist['n_neighbors'] = sp_randint(2, 50)
    elif model_str == 'SVR':
        pdist['kernel'] = ['rbf', 'sigmoid', 'poly']
        pdist['degree'] = sp_randint(2, 6)
        pdist['gamma'] = sp_uniform(1e-2, 1)
        pdist['coef0'] = sp_uniform(0, 1)
        pdist['epsilon'] = sp_uniform(1e-2, 3e-1)

    return pdist
Ejemplo n.º 11
0
 def get_param_dist(self, X):
     """Build the random-search distributions, sized from the data.

     ``rank`` is drawn from the integers 1 .. (number of columns of
     ``X[self.inputs]``) - 1, ``batch_size`` from 1 .. (number of rows of
     ``X``) - 1, and ``lr`` uniformly from [0.001, 0.011].

     Args:
         X: Data indexable by ``self.inputs`` and exposing ``.shape``.

     Returns:
         dict: Parameter name -> scipy distribution.
     """
     row_count = X.shape[0]
     input_count = X[self.inputs].shape[1]
     return {
         'rank': sp_randint(1, input_count),
         'batch_size': sp_randint(1, row_count),
         'lr': sp_uniform(loc=0.001, scale=0.01),
     }
Ejemplo n.º 12
0
def get_param_distribution_for_model(model_str, iter_count):
    """Return the random-search parameter distributions for a model type.

    Args:
        model_str: Model code: ``'ET'`` or ``'RF'`` (tree ensembles),
            ``'GP'``, ``'KNN'`` or ``'SVR'``.  Any other value yields an
            empty dict.
        iter_count: Unused; kept so existing callers keep working.

    Returns:
        dict: Parameter name -> list of values or scipy distribution.
    """
    pdist = {}

    # scipy conventions: randint(low, high) yields the integers
    # low .. high - 1 and uniform(loc, scale) covers [loc, loc + scale].
    if model_str in ['ET', 'RF']:
        pdist['n_estimators'] = sp_randint(100, 500)
        pdist['max_features'] = [
            0.15, 0.2, 0.25, 0.3, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65,
            0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1
        ]
        # An integer min_samples_split must be at least 2, so the range
        # starts at 2 rather than 1.
        pdist['min_samples_split'] = sp_randint(2, 15)
        pdist['min_samples_leaf'] = sp_randint(1, 15)
        pdist['bootstrap'] = [True, False]
    elif model_str == 'GP':
        # Only the correlation model is searched.  Per the original author's
        # note, the theta0 / thetaL / thetaU / random_start parameters accept
        # either single values or array-likes, which confused
        # RandomizedSearchCV, so they are left out.
        pdist['corr'] = [
            'absolute_exponential', 'squared_exponential', 'cubic', 'linear'
        ]
    elif model_str == 'KNN':
        pdist['weights'] = ['uniform', 'distance']
        pdist['metric'] = ['euclidean', 'manhattan', 'chebyshev']
        pdist['n_neighbors'] = sp_randint(2, 50)
    elif model_str == 'SVR':
        pdist['kernel'] = ['rbf', 'sigmoid', 'poly']
        pdist['degree'] = sp_randint(2, 6)
        pdist['gamma'] = sp_uniform(1e-2, 1)
        pdist['coef0'] = sp_uniform(0, 1)
        pdist['epsilon'] = sp_uniform(1e-2, 3e-1)

    return pdist
Ejemplo n.º 13
0
def hpsearch_lgb(x_tr, y_tr, x_va, y_va):
    """Random-search LightGBM regressor hyper-parameters and print the best.

    One hundred parameter sets are evaluated with 3-fold cross-validation
    using the ``neg_root_mean_squared_error`` scorer.  ``(x_va, y_va)`` is
    forwarded to the estimator's ``fit`` as the early-stopping evaluation
    set.

    Args:
        x_tr: Training features.
        y_tr: Training targets.
        x_va: Validation features for early stopping.
        y_va: Validation targets for early stopping.
    """
    from scipy.stats import randint as sp_randint
    from scipy.stats import uniform as sp_uniform

    candidate_count = 100

    # uniform(loc, scale) covers [loc, loc + scale].
    search_space = dict(
        num_leaves=sp_randint(6, 50),
        min_child_samples=sp_randint(100, 500),
        min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        subsample=sp_uniform(loc=0.2, scale=0.8),
        colsample_bytree=sp_uniform(loc=0.4, scale=0.6),
        reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50, 100],
    )

    # Keyword arguments forwarded to LGBMRegressor.fit for each candidate.
    early_stop_kwargs = dict(
        early_stopping_rounds=30,
        eval_metric='rmse',
        eval_set=[(x_va, y_va)],
        eval_names=['valid'],
        verbose=100,
    )

    base_model = lgb.LGBMRegressor(max_depth=-1,
                                   random_state=314,
                                   silent=True,
                                   metric='None',
                                   n_jobs=4,
                                   n_estimators=5000)
    searcher = RandomizedSearchCV(estimator=base_model,
                                  param_distributions=search_space,
                                  n_iter=candidate_count,
                                  scoring='neg_root_mean_squared_error',
                                  cv=3,
                                  refit=True,
                                  random_state=314,
                                  verbose=True)

    searcher.fit(x_tr, y_tr, **early_stop_kwargs)
    summary = 'Best score reached: {} with params: {} '
    print(summary.format(searcher.best_score_, searcher.best_params_))
Ejemplo n.º 14
0
def method_lgbm(random_state):
    """Return a LightGBM classifier plus the distributions to search over.

    Args:
        random_state: Seed forwarded to ``LGBMClassifier``.

    Returns:
        tuple: ``(estimator, search_space)`` where ``search_space`` maps a
        parameter name to a list of values or a scipy distribution.
    """
    base_settings = dict(
        learning_rate=0.01,
        n_jobs=10,
        n_estimators=3000,
        random_state=random_state,
        verbose=-1,
        device='cpu',
        subsample=0.5,
        feature_fraction=0.01,
        lambda_l2=0.1,
        max_depth=1,
        min_data_in_leaf=20,
    )
    estimator = lgb.LGBMClassifier(**base_settings)

    # uniform(loc, scale) covers [loc, loc + scale].
    search_space = dict(
        learning_rate=sp_uniform(loc=0.001, scale=0.03),
        subsample=sp_uniform(loc=0.5, scale=0.3),
        max_depth=[1, 3, 7],
        min_data_in_leaf=[1, 3, 7, 10, 20],
    )
    return estimator, search_space
Ejemplo n.º 15
0
    def turning_lgb(self, X_train, y_train, X_val, y_val):
        """Tune LightGBM classifier parameters with a randomized search.

        Three hundred parameter sets are evaluated with 3-fold
        cross-validation on the training data, scored by F1.  The validation
        set is forwarded to the estimator's ``fit`` for early stopping.

        Args:
            X_train: Training features.
            y_train: Training response.
            X_val: Validation features.
            y_val: Validation response.

        Returns:
            dict: The best parameter set found by the search.
        """
        # Number of hyper-parameter points to test.
        candidate_count = 300

        # uniform(loc, scale) covers [loc, loc + scale].
        search_space = dict(
            min_child_samples=sp_randint(10, 100),
            subsample=sp_uniform(loc=0.2, scale=0.8),
            min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1],
            colsample_bytree=sp_uniform(loc=0.4, scale=0.6),
        )
        early_stop_kwargs = dict(
            early_stopping_rounds=30,
            eval_metric=['logloss'],
            eval_set=[(X_val, y_val)],
            eval_names=['valid'],
        )

        searcher = RandomizedSearchCV(
            estimator=lgb.LGBMClassifier(is_unbalance=True),
            param_distributions=search_space,
            n_iter=candidate_count,
            scoring='f1',
            cv=3,
            refit=True,
            verbose=False)
        searcher.fit(X_train, y_train, **early_stop_kwargs)

        summary = 'Best f1_score reached: {} with params: {} '
        print(summary.format(searcher.best_score_, searcher.best_params_))
        return searcher.best_params_
Ejemplo n.º 16
0
    def __init__(self, low, high, step_name, variable_name):
        """Random variable uniformly distributed between `low` and `high`.

        The bounds may be given in either order; they are normalised so that
        ``self.low <= self.high`` and the underlying distribution is built
        from the normalised pair.

        Inputs
        ------
        low, high : float

        step_name, variable_name : str
            The name of the step in the sklearn pipeline and the name of the
            variable.
        """
        # Normalise first: scipy's uniform(loc, scale) covers
        # [loc, loc + scale] and needs a non-negative scale, which
        # `high - low` is not when the bounds arrive reversed.
        lower, upper = min(low, high), max(low, high)
        super().__init__(step_name, variable_name,
                         sp_uniform(lower, upper - lower))
        self.low = lower
        self.high = upper
Ejemplo n.º 17
0
 def rand_distribution_hr(self):
     """Return the random-search distributions for ``self.method``.

     Methods without an entry below (e.g. LDA and GaussianNB, which need no
     hyper-parameter search) get an empty dict.

     Returns:
         dict: Parameter name -> list of values or scipy distribution.
     """
     method = self.method

     if method == 'SVC':
         return {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                 'C': sp_randint(1, 50),
                 'degree': sp_randint(1, 10),
                 'coef0': sp_randint(1, 50),
                 'gamma': sp_randint(1, 20)}

     if method == 'MultinomialNB':
         # uniform takes (min_value, max_value - min_value).
         return {'alpha': sp_uniform(0, 5)}

     if method == 'RF':
         return {'n_estimators': sp_randint(10, 100),
                 'max_features': ['log2', 'sqrt', 1.0]}

     if method == 'KNN':
         return {'n_neighbors': sp_randint(5, 30),
                 'weights': ['uniform', 'distance'],
                 'leaf_size': sp_randint(10, 50)}

     if method == 'MLP':
         return {'hidden_layer_sizes': [(30, 30), (40, 40), (50, 50), (50, 30), (50, 20)],
                 'activation': ['identity', 'logistic', 'tanh', 'relu'],
                 'alpha': sp_uniform(1e-5, 1e-1)}

     return {}
Ejemplo n.º 18
0
def fit_lgb(X, y, lgb_path):
    """Random-search a binary LightGBM classifier, scored by ROC AUC.

    One hundred parameter sets are evaluated with 3-fold cross-validation
    and the best one is refitted on all of ``X`` / ``y``.

    NOTE(review): in the visible code ``lgb_path`` is never used and the
    best estimator is neither saved nor returned -- the function may be
    truncated here; confirm against the full source.

    Args:
        X: Training features.
        y: Binary training labels.
        lgb_path: Intended destination for the fitted model (unused here).
    """
    candidate_count = 100

    # Search space; uniform(loc, scale) covers [loc, loc + scale].
    search_space = dict(
        num_leaves=sp_randint(6, 50),
        min_child_samples=sp_randint(100, 500),
        min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        subsample=sp_uniform(loc=0.2, scale=0.8),
        colsample_bytree=sp_uniform(loc=0.4, scale=0.6),
        reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50, 100],
    )

    base_model = lgb.LGBMClassifier(
        max_depth=-1,
        random_state=314,
        silent=True,
        metric='auc',
        objective='binary',
        n_jobs=-1,
        n_estimators=5000,
    )

    searcher = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=search_space,
        n_iter=candidate_count,
        scoring='roc_auc',
        cv=3,
        refit=True,
        random_state=314,
        verbose=True,
    )
    searcher.fit(X, y)

    # Best refitted estimator from the search.
    model = searcher.best_estimator_
Ejemplo n.º 19
0
def tune_grad_boost_regressor(X, y, k_fold, n_iter_search):
    """Random-search ``GradientBoostingRegressor`` hyper-parameters.

    Prints the elapsed time, passes the search results to ``report`` and
    prints the best cross-validated score.

    Args:
        X: Training features.
        y: Training targets.
        k_fold: Passed as ``cv`` to ``RandomizedSearchCV``.
        n_iter_search: Number of parameter settings to sample.

    Returns:
        The best estimator, refitted on all of ``X`` / ``y``.
    """
    # Least-squares loss; the alternatives noted by the original author were
    # 'huber' and 'lad'.
    regressor = GradientBoostingRegressor(loss='ls')

    # Distributions to sample from: uniform(loc, scale) covers
    # [loc, loc + scale]; randint(low, high) yields low .. high - 1.
    param_dist = {
        "learning_rate": sp_uniform(0, 1),
        "n_estimators": sp_randint(40, 1500),
        "max_depth": sp_randint(2, 6),
        "max_features": sp_randint(1, 11),
        "min_samples_split": sp_randint(2, 11),
        "min_samples_leaf": sp_randint(1, 11)
    }

    # run randomized search
    random_search = RandomizedSearchCV(estimator=regressor,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       fit_params=None,
                                       n_jobs=1,
                                       iid=True,
                                       refit=True,
                                       cv=k_fold,
                                       verbose=1,
                                       pre_dispatch='2*n_jobs',
                                       random_state=None,
                                       error_score='raise',
                                       return_train_score=True)

    start = time()
    random_search.fit(X, y)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))

    report(random_search.cv_results_)

    model = random_search.best_estimator_

    # Apply the format with `%`; passing the value as a second print()
    # argument printed the literal "%f" followed by the number.
    print("The best score in the search process is %f"
          % random_search.best_score_)

    return model
    def tune(self, X_train, Y_train):
        """Random-search the network's hyper-parameters and rebuild it.

        ``self.net`` is searched with 5-fold cross-validation scored by
        negative mean squared error.  The best parameters are stored in
        ``self.params`` and ``self.net`` is replaced by a new ``Network``
        constructed with them.

        Args:
            X_train: Training features.
            Y_train: Training targets.

        Returns:
            dict: The best parameter set found.
        """
        # Distributions of candidate hyper-parameter values.
        search_space = dict(
            nodes=sp_randint(5, 50),
            eta=sp_norm(.05, .1),
            lmbda=sp_uniform(0, 1),
            patience=sp_randint(5, 15),
        )

        self.net.set_params(verbose=False)
        searcher = RandomizedSearchCV(self.net,
                                      scoring='neg_mean_squared_error',
                                      param_distributions=search_space,
                                      cv=5,
                                      n_jobs=8)
        searcher.fit(X_train, Y_train)

        self.params = searcher.best_params_
        chosen = self.params
        self.net = Network(self.NF,
                           nodes=chosen['nodes'],
                           eta=chosen['eta'],
                           lmbda=chosen['lmbda'],
                           patience=chosen['patience'])
        return self.params
# Set up decay learning rate
def learning_rate_power(current_round):
    """Return the learning rate for a boosting round.

    The rate starts at 0.19000424246380565, is multiplied by 0.995 for every
    round, and never drops below 0.01.

    Args:
        current_round: Index of the boosting round.

    Returns:
        The decayed learning rate, floored at 0.01.
    """
    decayed = 0.19000424246380565 * np.power(0.995, current_round)
    return max(decayed, 0.01)


from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Search space for the randomized search.  uniform(loc, scale) covers
# [loc, loc + scale]; randint(low, high) yields the integers low .. high - 1.
tune_params = dict(
    n_estimators=[200, 500, 1000, 2500, 5000],
    max_depth=sp_randint(4, 12),
    colsample_bytree=sp_uniform(loc=0.8, scale=0.15),
    min_child_samples=sp_randint(60, 120),
    subsample=sp_uniform(loc=0.75, scale=0.25),
    reg_lambda=[1e-3, 1e-2, 1e-1, 1],
)

# Keyword arguments for the estimator's fit(): early stopping on the
# evaluation sets plus a callback that applies learning_rate_power each round.
fit_params = dict(
    early_stopping_rounds=40,
    eval_metric='accuracy',
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=20,
    callbacks=[lgb.reset_parameter(learning_rate=learning_rate_power)],
)

# Base binary classifier whose parameters are searched.
lgb_clf = lgb.LGBMClassifier(n_jobs=4, objective='binary', random_state=1)
gs = RandomizedSearchCV(estimator=lgb_clf,
Ejemplo n.º 22
0
import numpy as np
import pickle

# Number of sampled parameter settings and of cross-validation folds.
n_iter = 2000
k_fold = 10

# Load the training / test arrays from the .npz archive.
file_dat = np.load("train_test_v2.npz")
X_train = file_dat["X_train"]
Y_train = file_dat["y_train"]
X_test = file_dat["X_test"]
cv = StratifiedKFold(Y_train, n_folds=k_fold, shuffle=True)

# Initialize the classifier.
GB = xgb.XGBClassifier()

# Search space.  uniform(loc, scale) covers [loc, loc + scale];
# randint(low, high) yields the integers low .. high - 1.  Single-element
# lists pin a parameter to one value.
param_grid = dict(
    max_depth=sp_randint(1, 100),
    learning_rate=sp_uniform(loc=0e0, scale=1e0),
    objective=['binary:logistic'],
    nthread=[8],
    missing=[np.nan],
    reg_alpha=[
        0.01, 0.017782794, 0.031622777, 0.056234133,
        0.1, 0.17782794, 0.31622777, 0.56234133, 1., 1.77827941,
        3.16227766, 5.62341325, 10.,
        17.7827941, 31.6227766, 56.2341325, 100.,
    ],
    colsample_bytree=sp_uniform(loc=0.2e0, scale=0.8e0),
    subsample=np.arange(0.6, 1.0, step=0.05),
    n_estimators=sp_randint(200, 800),
)

search_GB = RandomizedSearchCV(
    GB,
    param_grid,
    scoring='accuracy',
    n_iter=n_iter,
    cv=cv,
    verbose=True,
)
search_GB.fit(X_train, Y_train)
# Starting parameters.  Only the bagging and regularisation settings are
# searched here; the tree-shape and sampling settings are fixed on the
# estimator below.  uniform(loc, scale) covers [loc, loc + scale].
param_test = dict(
    bagging_freq=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    bagging_fraction=sp_uniform(loc=0.0, scale=1.0),
    reg_alpha=sp_uniform(loc=0.0, scale=1.0),
    reg_lambda=sp_uniform(loc=0.0, scale=1.0),
)

n_hyper_parameter_points_to_test = 100

# n_estimators is set to a "large value": the number of trees actually built
# depends on early stopping, and 50000 is only the absolute maximum.
lgb_reg = lgb.LGBMRegressor(
    random_state=14,
    silent=True,
    metric='gamma',
    boosting='gbdt',
    n_jobs=1,
    n_estimators=50000,
    bagging_seed=1,
    max_depth=4,
    min_data_in_leaf=2,
    num_leaves=321,
    colsample_bytree=0.5155872558124424,
    min_child_samples=815,
    min_child_weight=0.5122614044196322,
    subsample=0.5555279793433687,
)
gs = RandomizedSearchCV(
    estimator=lgb_reg,
    param_distributions=param_test,
    n_iter=n_hyper_parameter_points_to_test,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    random_state=14,
    verbose=True,
    n_jobs=6,
)
Ejemplo n.º 24
0
    from evaluate import model_selection_pipeline, generate_challenge_run
    # from sklearn.utils.estimator_checks import check_estimator
    # check_estimator(RandomForest)
    from scipy.stats import randint as sp_randint, uniform as sp_uniform

    # Input data set and output locations for this model-selection run.
    dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'
    results_file = 'experiments/random_forest_model.txt'
    model_file = 'experiments/random_forest_model.pkl'

    init_param = {'n_estimators': 250, 'max_depth': 3, 'bootstrap': False}

    # Search space: scipy distributions for numeric parameters, explicit
    # lists for categorical ones.
    param_grid = dict(
        n_estimators=sp_randint(50, 500),
        criterion=['gini', 'entropy'],
        max_depth=sp_randint(2, 15),
        min_samples_split=sp_randint(2, 20),
        min_samples_leaf=sp_randint(1, 20),
        max_features=sp_uniform(0.2, 0.8),  # range [0.2, 1.]
        bootstrap=[False, True],
    )

    model_selection_pipeline(
        dataset,
        RandomForestModel,
        param_grid,
        results_file=results_file,
        model_file=model_file,
    )

    # -----------------------------------------------------------------------

    # Random search results on subsample: (random search of 20)
Ejemplo n.º 25
0
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 500,
            'categorical_feature': 'auto',
          },
}

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Search distributions keyed by configuration name; '_' is the only entry
# defined here.
SEARCH_PARAMS = {
    '_': dict(
        min_child_samples=sp_randint(2, 30),
        # Disabled alternatives:
        #   'num_leaves': [50, 100, 150, 200, 300, 500],
        #   'subsample': [0.2, 0.4, 0.6, 0.8, 0.9, 1],
        #   'learning_rate': sp_uniform(loc=0.001, scale=0.020),
        subsample=sp_uniform(loc=0.3, scale=0.7),
        colsample_bytree=sp_uniform(loc=0.3, scale=0.7),
        reg_alpha=sp_uniform(loc=0.0, scale=0.4),
        reg_lambda=sp_uniform(loc=0.0, scale=0.4),
    ),
}

# train, test, structures, contributions = t4_load_data(INPUT_DIR)
#
# train, test = t4_criskiev_features(train, test, structures)
#
# structures = t4_merge_yukawa(INPUT_DIR, structures)
#
# structures = t4_crane_features(structures)
#
# train, test = t4_merge_structures(train, test, structures)
        print("Parameters: {0}".format(score.parameters))
        print("")


# Progress marker printed before the randomized search is configured.
print("Starting RandomizedSearchCV")

# Size of the second axis of the training data (presumably the number of
# features -- assumes X_train is 2-D, TODO confirm).
n_features = X_train.shape[1]
# Number of cross-validation folds used when building `folds` below.
N_FOLDS = 10

# Regressor with default constructor arguments; the searched parameters come
# from `param_dist`.
model = xgb.XGBRegressor()
# Specify parameters and distributions to sample from.
# scipy's uniform(loc, scale) covers [loc, loc + scale], so a (0.1, 0.9)
# pair samples from [0.1, 1.0].
param_dist = {
    "objective": ["reg:linear"],
    # "booster": ["gbtree"],
    # "eta": [0.1, 0.3, 0.5, 0.7],
    "max_depth": sp_randint(10, 30),
    "subsample": sp_uniform(0.1, 0.9),
    # The scale is 0.9, not 1.0: a scale of 1.0 would sample values up to
    # 1.1, and colsample_bytree is a fraction that must not exceed 1.
    "colsample_bytree": sp_uniform(0.1, 0.9),
    "silent": [1],
    "seed": [42],
}

# Run randomized search.
# n_iter_search is the number of parameter settings sampled (passed as n_iter
# to RandomizedSearchCV below).
n_iter_search = 30
# Shuffled, seeded K-fold splitter over the training targets.
# NOTE(review): the n= / n_folds= keywords look like the legacy
# sklearn.cross_validation.KFold signature -- confirm the installed
# scikit-learn still provides it.
folds = cv.KFold(n=len(y_train), n_folds=N_FOLDS, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(model,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   cv=folds,
                                   n_jobs=-1,
                                   scoring=utils.rmspe_scorer,
                                   iid=True,
Ejemplo n.º 27
0
# Load the two saved NumPy arrays (presumably features and outcomes, going
# by the file names).
x = np.load("/home/arjun/PycharmProjects/ML_proj/scripts/dataset1.npy")
y = np.load("/home/arjun/PycharmProjects/ML_proj/scripts/outcome1.npy")

# Hold out 25% of the samples for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=1)

# Search budget and cross-validation scheme: 5 sampled settings, evaluated
# with shuffled stratified 5-fold CV.
n_iter = 5
k_fold = 5
cv = StratifiedKFold(n_splits=k_fold, shuffle=True)

# Classifier to tune; its hyper-parameters are supplied by the search.
GB = xgb.XGBClassifier()

# Hyper-parameter search space. Each entry is either a scipy frozen
# distribution or an explicit list of candidate values.
param_grid = {
    "max_depth": sp_randint(1, 90),
    "learning_rate": sp_uniform(loc=0.0, scale=1.0),
    "objective": ["multi:softprob"],
    "nthread": [8],
    "missing": [np.nan],
    # 17 candidates from 0.01 to 100, roughly four per decade.
    "reg_alpha": [
        0.01, 0.017782794, 0.031622777, 0.056234133,
        0.1, 0.17782794, 0.31622777, 0.56234133,
        1.0, 1.77827941, 3.16227766, 5.62341325,
        10.0, 17.7827941, 31.6227766, 56.2341325,
        100.0,
    ],
    # scipy's uniform(loc, scale) spans [loc, loc + scale], i.e. [0.2, 1.0].
    "colsample_bytree": sp_uniform(loc=0.2, scale=0.8),
    "subsample": sp_uniform(loc=0.2, scale=0.8),
    "n_estimators": sp_randint(50, 200),
}

# Configure the randomized search, fit it on the training split, and report
# the per-candidate results followed by the best cross-validated score.
search_GB = RandomizedSearchCV(
    estimator=GB,
    param_distributions=param_grid,
    n_iter=n_iter,
    cv=cv,
    verbose=True,
)
search_GB = search_GB.fit(X_train, y_train)
print(search_GB.cv_results_)
print(' ', search_GB.best_score_)
Ejemplo n.º 28
0
    # Random-forest search space. min_samples_split starts at 2 because
    # scikit-learn rejects an integer value of 1 (a split needs at least two
    # samples); sp_randint's upper bound is exclusive, so this samples 2..10.
    param_dist = {"max_depth": sp_randint(5, 50),
                  "max_features": [0.1, 0.01, 0.001, 'auto', 'log2'],
                  "min_samples_split": sp_randint(2, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  "bootstrap": [True, False]
                  }
    # Tune each model with the shared search space; every call rebinds the
    # name to whatever model_param_search returns, so the order matters.
    rf1 = model_param_search(
        rf1, x_train, y_train, param_dist, scoring, n_iter_search, n_cv, verbose, model_id='rf1')
    rf2 = model_param_search(
        rf2, x_train, y_train, param_dist, scoring, n_iter_search, n_cv, verbose, model_id='rf2')
    # NOTE(review): ext1/ext2 are searched starting from rf1/rf2 (the results
    # of the two calls above) rather than from separate estimators -- confirm
    # this is intended and not a copy-paste slip.
    ext1 = model_param_search(
        rf1, x_train, y_train, param_dist, scoring, n_iter_search, n_cv, verbose, model_id='ext1')
    ext2 = model_param_search(
        rf2, x_train, y_train, param_dist, scoring, n_iter_search, n_cv, verbose, model_id='ext2')

    # Fit xgb1 on the XGBoost feature set with the 'mlogloss' metric, using
    # n_cv CV folds and 50 early-stopping rounds.
    xgb_estimator_fit(xgb1, x_train_xgb, y_train, 'mlogloss',
                      useTrainCV=True, cv_folds=n_cv, early_stopping_rounds=50)

    # Search space for the XGBoost model: integer ranges for tree shape,
    # uniform [0, 1] sampling fractions, and gamma in steps of 0.1 up to 0.4.
    param_dist = dict(
        max_depth=sp_randint(10, 40),
        min_child_weight=sp_randint(1, 20),
        subsample=sp_uniform(0, 1),
        colsample_bytree=sp_uniform(0, 1),
        gamma=[tenths / 10.0 for tenths in range(5)],
    )

    # Tune xgb1 with the search space above, then refit it with
    # cross-validated early stopping.
    xgb1 = model_param_search(
        xgb1,
        x_train_xgb,
        y_train,
        param_dist,
        scoring,
        n_iter_search,
        n_cv,
        verbose,
        model_id='xgb1',
    )

    xgb_estimator_fit(
        xgb1,
        x_train_xgb,
        y_train,
        'mlogloss',
        useTrainCV=True,
        cv_folds=n_cv,
        early_stopping_rounds=50,
    )
Ejemplo n.º 29
0
def uniform(a, b):
    """Return a frozen scipy uniform distribution over the interval [a, b].

    scipy parameterises the distribution by ``loc`` (lower bound) and
    ``scale`` (width), so the upper bound ``b`` is converted to ``b - a``.
    """
    return sp_uniform(a, b - a)
Ejemplo n.º 30
0
from scipy.stats import uniform as sp_uniform
# param_test ={'num_leaves': sp_randint(6, 50), 
#              'min_child_samples': sp_randint(100, 500), 
#              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
#              'subsample': sp_uniform(loc=0.2, scale=0.8), 
#              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
#              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
#              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

# Custom search space. It is replaced by the pinned configuration assigned
# immediately below.
param_test = dict(
    num_leaves=sp_randint(10, 20),
    min_child_samples=sp_randint(250, 450),
    min_child_weight=[1e-2, 1e-1, 1, 1e1],
    subsample=[0.8],
    max_depth=[8, 12],
    colsample_bytree=sp_uniform(loc=0.4, scale=0.6),
    reg_alpha=[0, 1, 2, 5],
    reg_lambda=[0, 1, 5],
)
# Pinned configuration: every parameter has a single value except
# `subsample`, which has two candidates.
param_test = dict(
    colsample_bytree=[0.95],
    max_depth=[10],
    min_child_samples=[429],
    min_child_weight=[1],
    num_leaves=[12],
    reg_alpha=[5],
    reg_lambda=[5],
    subsample=[0.8, 0.85],
)


# In[18]:


# Number of hyper-parameter (HP) points to be tested.
n_HP_points_to_test = 2

import lightgbm as lgb
Ejemplo n.º 31
0
    def test_opt_lightgbm(self):
        """Exercise BatchTrainer's randomized, grid and Bayesian searches with LightGBM.

        Uses the first 1000 rows of the adult dataset (column 14 is popped as
        the target) and asserts that every search returns a positive
        ``max_depth``.
        """
        features = dsutils.load_adult().head(1000)
        target = features.pop(14).values
        numeric_columns = features._get_numeric_data().columns
        cat_cols = list(set(features.columns) - set(numeric_columns))

        # Label-encode every non-numeric column in place.
        encoder = LabelEncoder()
        for column in cat_cols:
            features[column] = encoder.fit_transform(features[column])

        model = LGBMClassifier(n_estimators=10,
                               boosting_type='gbdt',
                               categorical_feature=cat_cols,
                               num_leaves=31)
        # Keyword arguments shared by all three search calls.
        search_kwargs = dict(fit_params={'eval_metric': 'roc_auc'},
                             scoring='roc_auc',
                             n_jobs=1,
                             cv=5)

        # Randomized search: fixed depth candidates, sampled learning rate.
        randomized_space = {
            # 'iterations': sp_randint(10, 1000),
            'max_depth': [1, 3, 5],  # sp_randint(1, 5),
            'learning_rate': sp_uniform(0.01, 1.0),
        }
        randomized_best = BatchTrainer.randomized_search(
            model, randomized_space, features, target, **search_kwargs)

        # Grid search: explicit candidate lists only.
        grid_space = {
            # 'iterations': [10, 30],
            'max_depth': [1, 3, 5],  # sp_randint(1, 5),
            'learning_rate': [0.01, 0.05, 0.1],
        }
        grid_best = BatchTrainer.grid_search(
            model, grid_space, features, target, **search_kwargs)

        # Bayesian search: integer / log-uniform dimensions, 10 iterations.
        bayes_space = {
            'max_depth': Integer(1, 5),
            'learning_rate': Real(0.02, 0.6, 'log-uniform'),
        }
        bayes_best = BatchTrainer.bayes_search(
            model, bayes_space, features, target, n_iter=10, **search_kwargs)

        for best_params in (randomized_best, grid_best, bayes_best):
            assert best_params['max_depth'] > 0
Ejemplo n.º 32
0
    def test_opt_catboost(self):
        """Exercise BatchTrainer's randomized, grid and Bayesian searches with CatBoost.

        Uses the first 1000 rows of the adult dataset (column 14 is popped as
        the target); non-numeric columns are handed to CatBoost as
        ``cat_features``. Asserts that every search returns a positive
        ``depth``.
        """
        features = dsutils.load_adult().head(1000)
        target = features.pop(14).values
        numeric_columns = features._get_numeric_data().columns
        cat_cols = list(set(features.columns) - set(numeric_columns))

        model = CatBoostClassifier(
            thread_count=4,
            loss_function='Logloss',
            cat_features=cat_cols,
            od_type='Iter',
            nan_mode='Min',
            iterations=1,
            eval_metric='AUC',
            metric_period=50,
            verbose=False,
        )
        # Keyword arguments shared by all three search calls.
        search_kwargs = dict(fit_params={'early_stopping_rounds': 10},
                             scoring='roc_auc',
                             n_jobs=1,
                             cv=5)

        # Randomized search: fixed depth candidates, sampled learning rate.
        randomized_space = {
            # 'iterations': sp_randint(10, 1000),
            'depth': [1, 3, 5],  # sp_randint(1, 5),
            'learning_rate': sp_uniform(0.01, 1.0),
        }
        randomized_best = BatchTrainer.randomized_search(
            model, randomized_space, features, target, **search_kwargs)

        # Grid search: explicit candidate lists only.
        grid_space = {
            # 'iterations': [10, 30],
            'depth': [1, 3, 5],  # sp_randint(1, 5),
            'learning_rate': [0.01, 0.05, 0.1],
        }
        grid_best = BatchTrainer.grid_search(
            model, grid_space, features, target, **search_kwargs)

        # Bayesian search: integer / log-uniform dimensions, 10 iterations.
        bayes_space = {
            'depth': Integer(1, 5),
            'learning_rate': Real(0.02, 0.6, 'log-uniform'),
        }
        bayes_best = BatchTrainer.bayes_search(
            model, bayes_space, features, target, n_iter=10, **search_kwargs)

        for best_params in (randomized_best, grid_best, bayes_best):
            assert best_params['depth'] > 0
Ejemplo n.º 33
0
# Report the best cross-validated score stored on the fitted search results.
print("BEST CV SCORE: " + str(gs_results.best_score_))

# Predict on the held-out data with the search object itself.
# NOTE(review): this relies on `gs` having been fitted earlier (out of view
# here) so that predictions come from its best parameters -- confirm.
y_pred = gs.predict(X_test)

# Score the test-set predictions with R^2 and print the result.
score = r2_score(y_test, y_pred)
print("R2 SCORE ON TEST DATA: {}".format(score))

#==============================================================================
# Random Search CV
#==============================================================================
# Search space: sampled tree count and sampling fractions (uniform on
# [0.6, 1.0]), plus explicit candidates for depth and leaf count.
hyper_space = dict(
    n_estimators=sp_randint(1000, 2500),
    max_depth=[4, 5, 8, -1],
    num_leaves=[15, 31, 63, 127],
    subsample=sp_uniform(0.6, 0.4),
    colsample_bytree=sp_uniform(0.6, 0.4),
)

# Randomized search: 60 sampled settings, 4-fold CV, scored by R^2, seeded.
rs = RandomizedSearchCV(
    estimator=est,
    param_distributions=hyper_space,
    n_iter=60,
    scoring='r2',
    cv=4,
    verbose=1,
    random_state=2018,
)
rs_results = rs.fit(X_train, y_train)
print("BEST PARAMETERS: " + str(rs_results.best_params_))
print("BEST CV SCORE: " + str(rs_results.best_score_))

# Predict on the held-out data with the fitted search object.
y_pred = rs.predict(X_test)

# Score the test-set predictions with R^2.
score = r2_score(y_test, y_pred)
print("R2 SCORE ON TEST DATA: {}".format(score))
Ejemplo n.º 34
0
for bs_ind in range(N_bs):
    # Fresh 75/25 split for every repetition, seeded by the repetition index.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=bs_ind)

    # Standardise the features using statistics of the training split only.
    scaler = StandardScaler().fit(x_train)
    x_train_stand = scaler.transform(x_train)
    x_test_stand = scaler.transform(x_test)

    # Standardise the targets the same way, with a separate scaler.
    scaler = StandardScaler().fit(y_train)
    y_train_stand = scaler.transform(y_train)
    y_test_stand = scaler.transform(y_test)

    # Tune the SGD regressor's penalty type and strength with a randomized
    # search (200 samples, 5-fold CV, negative MSE scoring).
    SGD = SGDRegressor(random_state=bs_ind)
    param_dist1 = {'penalty': ['l1', 'l2'], 'alpha': sp_uniform(1e-5, 10.0)}
    sgd_lr = RandomizedSearchCV(
        estimator=SGD,
        param_distributions=param_dist1,
        n_iter=200,
        n_jobs=-1,
        cv=5,
        random_state=25,
        scoring='neg_mean_squared_error',
    )
    sgd = sgd_lr.fit(x_train_stand, y_train_stand)

    y_pred = sgd.predict(x_test_stand)

    # Record and report the test-set MSE for this repetition.
    MSE_vec_sgd[bs_ind] = mean_squared_error(y_test_stand, y_pred)
    print('MSE for test set', bs_ind, ' is', MSE_vec_sgd[bs_ind])


# In[54]:


mse_min = 0.3
for i, mse in enumerate(MSE_vec_sgd):
Ejemplo n.º 35
0
    def hyperparameter_tuning(self) -> None:
        """
        Performs a hyperparameter tuning search (either grid search or randomised search) on the defined parameters and
        saves the results in a CSV file for further analysis. The best estimator found is printed and saved through
        `save_model`.
        The technique is selected by `config.is_grid_search` / `config.is_randomised_search` (grid search wins if both
        are set); the scoring metric is selected by `config.dataset`.
        Note: only designed to work with MLP (determined based on initial evaluations).
        :raises ValueError: if `config.dataset` is neither "binary" nor "multi", or if no search technique is selected.
        :return: None.
        """
        # Determine scoring metric to use based on dataset.
        if config.dataset == "binary":
            scoring = "f1"
        elif config.dataset == "multi":
            scoring = "f1_weighted"
        else:
            # Fail early with a clear message instead of handing an empty scoring string to the search.
            raise ValueError(
                "Unsupported dataset for hyperparameter tuning: {!r} (expected 'binary' or 'multi')".format(
                    config.dataset))

        parameters = dict()
        # Initialise Grid Search.
        if config.is_grid_search:
            print("Hyperparameter tuning technique chosen: GRID SEARCH")
            if config.dataset == "binary":
                parameters = {
                    "hidden_layer_sizes": [(98,), (98, 98), (114,), (114, 114)],
                    "learning_rate_init": [0.001, 0.03, 0.04, 0.1],
                    "alpha": [0.0001, 0.26, 0.96]
                }
                print(parameters)
            elif config.dataset == "multi":
                parameters = {
                    "hidden_layer_sizes": [(68,), (68, 68), (100,), (100, 100)],
                    "learning_rate_init": [0.001, 0.01, 0.1],
                    "momentum": [0.1, 0.9],
                    "alpha": [0.0001, 0.1, 0.9]
                }
            searchCV = GridSearchCV(
                param_grid=parameters,
                estimator=self.clf,
                cv=self.folds,
                scoring=scoring
            )
            search_alg_str = "gs"
            search_alg_name = "grid search"
        # Initialise Randomised Search.
        elif config.is_randomised_search:
            print("Hyperparameter tuning technique chosen: RANDOMISED SEARCH")
            parameters = {
                # NOTE(review): this samples a plain integer rather than a tuple -- presumably interpreted as a
                # single hidden layer of that size; confirm.
                'hidden_layer_sizes': sp_randint(1, 150),
                'learning_rate_init': sp_uniform(0.001, 1),
                'momentum': sp_uniform(0.1, 0.9),
                'alpha': sp_uniform(0.0001, 1)
            }
            searchCV = RandomizedSearchCV(
                param_distributions=parameters,
                estimator=self.clf,
                n_iter=100,
                cv=self.folds,
                scoring=scoring
            )
            search_alg_str = "rs"
            search_alg_name = "randomised search"
        else:
            # Without this guard the code below would fail on an undefined search object.
            raise ValueError(
                "No hyperparameter tuning technique selected: "
                "set config.is_grid_search or config.is_randomised_search")

        # Run the search and save results in a CSV file.
        gs_results = searchCV.fit(self.X, self.y)
        gs_results_df = pd.DataFrame(gs_results.cv_results_)
        gs_results_df.to_csv("../results/grid_search/{}_{}_{}.csv".format(config.dataset, config.model, search_alg_str))

        # Print the best model found by hyperparameter tuning algorithm for the MLP and save the model in a Pickle file.
        final_model = gs_results.best_estimator_
        # Name the technique that actually ran (the message used to say "randomised search" unconditionally).
        print("\nBest model hyperparameters found by {} algorithm:".format(search_alg_name))
        print(final_model)
        print("Score: {}".format(gs_results.best_score_))
        save_model(final_model, config.dataset,
                   "{}_{}_{}_best_estimator".format(config.dataset, config.model, search_alg_str))
Ejemplo n.º 36
0
# Extra fit-time settings (presumably forwarded to the search's fit call,
# which is not visible in this excerpt).
# NOTE(review): eval_set is built from X and y; if those are also the
# training data, early stopping is evaluated on data the model was fitted
# on -- confirm.
fit_params = dict(
    early_stopping_rounds=100,
    eval_metric='auc',
    eval_set=[(X, y)],
    eval_names=['valid'],
    verbose=0,
    categorical_feature='auto',
)

# Randomized-search space: explicit candidate lists for most parameters,
# scipy distributions for leaf/sample counts and sampling fractions.
param_test = dict(
    learning_rate=[0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
    n_estimators=[100, 200, 300, 400, 500, 600, 800, 1000, 1500, 2000],
    num_leaves=sp_randint(6, 50),
    min_child_samples=sp_randint(100, 500),
    min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    subsample=sp_uniform(loc=0.2, scale=0.8),
    max_depth=[-1, 1, 2, 3, 4, 5, 6, 7],
    colsample_bytree=sp_uniform(loc=0.4, scale=0.6),
    reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50, 100],
)

# Number of parameter combinations to sample.
n_iter = 300

# Initialise the LightGBM classifier and launch the search.
lgbm_clf = lgbm.LGBMClassifier(
    random_state=random_state,
    silent=True,
    metric='None',
    n_jobs=4,
)
grid_search = RandomizedSearchCV(estimator=lgbm_clf,
Ejemplo n.º 37
0
Y = Y[rnd_idx]

#split data to train and test
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
# Hold out 15% of the samples for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, Y, test_size=0.15, random_state=42)


############### Classification with random search ##############

from scipy.stats import uniform as sp_uniform

# Search space for the RBF kernel, which needs both C and gamma:
# C is drawn uniformly from [0, 10] and gamma from [0, 1].
C_dist = sp_uniform(scale=10)
gamma_dist = sp_uniform(scale=1)
parameters = dict(kernel=['rbf'], C=C_dist, gamma=gamma_dist)
from sklearn.model_selection import RandomizedSearchCV
# Randomized search over the SVM parameters: 8 sampled settings, 3-fold CV,
# single worker, verbose progress output.
n_iter_search = 8
svm_clsf = svm.SVC()
rnd_clsf = RandomizedSearchCV(
    svm_clsf,
    parameters,
    n_iter=n_iter_search,
    cv=3,
    n_jobs=1,
    verbose=2,
)