Example #1
    def search(self, search_space, search_iter, n_estimators, x, y):
        if 'n_estimators' in search_space:
            del search_space['n_estimators']
        params = {
            'boosting_type': ['gbdt'],
            'min_child_weight': [5],
            'min_split_gain': [1.0],
            'subsample': [0.8],
            'colsample_bytree': [0.6],
            'max_depth': [10],
            'n_estimators': n_estimators,
            'num_leaves': [70],
            'learning_rate': [0.04],
        }
        params.update(search_space)
        if self.verbose:
            print(params)
        folds = 3
        score_metric, skf = self.get_skf(folds)

        random_search = RandomizedSearchCV(self.lgbm, param_distributions=params, n_iter=search_iter,
                                           scoring=score_metric,
                                           n_jobs=1, cv=skf, verbose=0, random_state=1001)

        random_search.fit(x, y)
        self.clf = random_search.best_estimator_

        return random_search.best_params_
def parameter_search(model, X, y, params, metric, n=10):
    '''
    Runs a randomized hyperparameter search for the classification model
    and returns the fitted search object.
    '''
    random_search = RandomizedSearchCV(model, param_distributions=params,
                                       scoring=metric, n_jobs=3, n_iter=n)
    random_search.fit(X, y)
    return random_search
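
# --- Usage sketch (illustrative, not part of the original example) ---
# The classifier, parameter distributions and synthetic data below are
# assumptions, shown only to make the parameter_search call signature concrete.
from scipy.stats import randint
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=200, n_features=20, random_state=0)
demo_params = {'n_estimators': randint(50, 300), 'max_depth': randint(2, 10)}
demo_search = parameter_search(RandomForestClassifier(random_state=0), X_demo, y_demo,
                               demo_params, metric='f1_macro', n=10)
print(demo_search.best_params_)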
Example #3
    def pr_curve(i):
        label = labels[i]
        statistics_l = Statistics()
        print('Doing label {}'.format(label))

        for train_idx, valid_idx in folds:
            rng = np.random.RandomState()
            rng.seed(seeds[i])
            training_fold = developement_df.loc[train_idx, ]
            training_fold = training_fold.reset_index(drop=True)
            validation_fold = developement_df.loc[valid_idx, ]
            validation_fold = validation_fold.reset_index(drop=True)
            base_estimators = make_classifiers(method, balanced, labels, random_state=rng)

            # Find the best params, then do a final proper calibration.
            base_estimator = base_estimators[label]
            estimator = RandomizedSearchCV(
                estimator=base_estimator, param_distributions=params,
                n_iter=60, scoring='f1', cv=3, random_state=rng,
                error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs',
                refit=True
            )

            # Set up the vectorizer for the bag-of-words representation
            if vectorizer_method == 'tf-idf':
                vectorizer = TfidfVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True,
                    sublinear_tf=False, max_df=1.0, min_df=0
                )
                vectorizer.fit(training_fold['terms'].values)
            elif vectorizer_method == 'count':
                vectorizer = CountVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True
                )
                vectorizer.fit(training_fold['terms'].values)

            # Fit and evaluate the performance of the classifier.
            x_train = vectorizer.transform(training_fold['terms'].values)
            y_train = np.asarray(training_fold[label].values, dtype=int)

            x_valid = vectorizer.transform(validation_fold['terms'].values)
            y_valid = np.asarray(validation_fold[label].values, dtype=int)

            estimator.fit(x_train, y_train)

            for t in thresholds:
                y_pred = [int(p[1] >= t) for p in estimator.predict_proba(x_valid)]
                precision = precision_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
                recall = recall_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
                f1 = f1_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
                statistics_l.update_statistics(label=t, s_type='Precision', data=precision)
                statistics_l.update_statistics(label=t, s_type='Recall', data=recall)
                statistics_l.update_statistics(label=t, s_type='F1-Score', data=f1)

        statistics_l.frame()['reaction'] = label
        return statistics_l
Example #4
def build_nn(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a regression neural network model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    net = NeuralNet(layers=[('input', InputLayer),
                            ('hidden0', DenseLayer),
                            ('hidden1', DenseLayer),
                            ('output', DenseLayer)],
                    input_shape=(None, x_train.shape[1]),  # Number of i/p nodes = number of columns in x
                    hidden0_num_units=15,
                    hidden0_nonlinearity=lasagne.nonlinearities.softmax,
                    hidden1_num_units=17,
                    hidden1_nonlinearity=lasagne.nonlinearities.softmax,
                    output_num_units=1,  # Number of o/p nodes = number of columns in y
                    output_nonlinearity=lasagne.nonlinearities.softmax,
                    max_epochs=100,
                    update_learning_rate=0.01,
                    regression=True,
                    verbose=0)

    # Finding the optimal set of params for each variable in the training of the neural network
    param_dist = {'hidden0_num_units':sp_randint(3, 30), 'hidden1_num_units':sp_randint(3, 30)}
    clf = RandomizedSearchCV(estimator=net, param_distributions=param_dist,
                             n_iter=15, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('../trained_networks/nn_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(net, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
Example #5
def test_trivial_cv_results_attr():
    # Test search over a "grid" with only one point.
    # Non-regression test: grid_scores_ wouldn't be set by GridSearchCV.
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1]})
    grid_search.fit(X, y)
    assert_true(hasattr(grid_search, "cv_results_"))

    random_search = RandomizedSearchCV(clf, {'foo_param': [0]}, n_iter=1)
    random_search.fit(X, y)
    assert_true(hasattr(random_search, "cv_results_"))
Example #6
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    pickle.dumps(grid_search)  # smoke test

    random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]},
                                       refit=True, n_iter=3)
    random_search.fit(X, y)
    pickle.dumps(random_search)  # smoke test
Example #7
def test_randomgridsearch_slm(make_gaus_data):

    X, y, Xs, ys = make_gaus_data

    slm = StandardLinearModel(LinearBasis(onescol=True))

    param_dict = {
        'var': [Parameter(1.0 / v, Positive()) for v in range(1, 6)]
    }
    estimator = RandomizedSearchCV(slm, param_dict, n_jobs=-1, n_iter=2)

    estimator.fit(X, y)
    Ey = estimator.predict(Xs)
    assert len(ys) == len(Ey)  # we just want to make sure this all runs
Example #8
def test_randomgridsearch_glm(make_gaus_data):

    X, y, Xs, ys = make_gaus_data

    glm = GeneralizedLinearModel(Gaussian(), LinearBasis(onescol=True),
                                 random_state=1, maxiter=100)

    param_dict = {'batch_size': range(1, 11)}
    estimator = RandomizedSearchCV(glm, param_dict, verbose=1, n_jobs=-1,
                                   n_iter=2)

    estimator.fit(X, y)
    Ey = estimator.predict(Xs)
    assert len(ys) == len(Ey)  # we just want to make sure this all runs
Example #9
def test_pickle():
    # Test that a fit search can be pickled
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True)
    grid_search.fit(X, y)
    grid_search_pickled = pickle.loads(pickle.dumps(grid_search))
    assert_array_almost_equal(grid_search.predict(X),
                              grid_search_pickled.predict(X))

    random_search = RandomizedSearchCV(clf, {'foo_param': [1, 2, 3]},
                                       refit=True, n_iter=3)
    random_search.fit(X, y)
    random_search_pickled = pickle.loads(pickle.dumps(random_search))
    assert_array_almost_equal(random_search.predict(X),
                              random_search_pickled.predict(X))
Example #10
    def test__extract_arfftrace(self):
        param_grid = {"max_depth": [3, None],
                      "max_features": [1, 2, 3, 4],
                      "bootstrap": [True, False],
                      "criterion": ["gini", "entropy"]}
        num_iters = 10
        task = openml.tasks.get_task(20)
        clf = RandomizedSearchCV(RandomForestClassifier(), param_grid, n_iter=num_iters)
        # just run the task
        train, _ = task.get_train_test_split_indices(0, 0)
        X, y = task.get_X_and_y()
        clf.fit(X[train], y[train])

        trace_attribute_list = _extract_arfftrace_attributes(clf)
        trace_list = _extract_arfftrace(clf, 0, 0)
        self.assertIsInstance(trace_attribute_list, list)
        self.assertEqual(len(trace_attribute_list), 5 + len(param_grid))
        self.assertIsInstance(trace_list, list)
        self.assertEqual(len(trace_list), num_iters)

        # found parameters
        optimized_params = set()

        for att_idx in range(len(trace_attribute_list)):
            att_type = trace_attribute_list[att_idx][1]
            att_name = trace_attribute_list[att_idx][0]
            if att_name.startswith("parameter_"):
                # add this to the found parameters
                param_name = att_name[len("parameter_"):]
                optimized_params.add(param_name)

                for line_idx in range(len(trace_list)):
                    val = json.loads(trace_list[line_idx][att_idx])
                    legal_values = param_grid[param_name]
                    self.assertIn(val, legal_values)
            else:
                # repeat, fold, iteration, bool
                for line_idx in range(len(trace_list)):
                    val = trace_list[line_idx][att_idx]
                    if isinstance(att_type, list):
                        self.assertIn(val, att_type)
                    elif att_name in ['repeat', 'fold', 'iteration']:
                        self.assertIsInstance(trace_list[line_idx][att_idx], int)
                    else: # att_type = real
                        self.assertIsInstance(trace_list[line_idx][att_idx], float)


        self.assertEqual(set(param_grid.keys()), optimized_params)
Example #11
    def test_large_grid():
        """In this test, we purposely overfit a RandomForest to completely random data
        in order to assert that the test error will far exceed the train error.
        """

        if not SK18:
            custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
        else:
            custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # define the pipe
        pipe = Pipeline([
            ('scaler', SelectiveScaler()),
            ('pca', SelectivePCA(weight=True)),
            ('rf', RandomForestClassifier(random_state=42))
        ])

        # define hyper parameters
        hp = {
            'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
            'pca__whiten': [True, False],
            'pca__weight': [True, False],
            'pca__n_components': uniform(0.75, 0.15),
            'rf__n_estimators': randint(5, 10),
            'rf__max_depth': randint(5, 15)
        }

        # define the grid
        grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)

        # this will fail because we haven't fit yet
        assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

        # fit the grid
        grid.fit(X_train, y_train)

        # score for coverage -- this might warn...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid.score(X_train, y_train)

        # coverage:
        assert grid._estimator_type == 'classifier'

        # get predictions
        tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

        # evaluate score (SHOULD be better than random...)
        accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

        # grid score reports:
        # assert fails for bad percentile
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})

        # assert fails for bad y_axis
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})

        # assert passes otherwise
        report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
Example #12
def fit(x, y, estimator, dataframe, params):
    vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True)
    vectorizer.fit(dataframe[x].values)
    fresh_estimator = clone(estimator)
    x_np, y_np, feature_names, selector = \
    select_features(
        df = dataframe,
        vectorizer=vectorizer,
        feature_col=x,
        label_col=y,
        select_method=None,
        continuous_col=None
    )
    estimator = RandomizedSearchCV(estimator, params, n_iter=60, cv=3, n_jobs=3, refit=True)
    estimator.fit(x_np, y_np)
    best_params = estimator.best_params_

    if method not in ['lr', 'svm']:
        print("Calibrating...")
        estimator = CalibratedClassifierCV(fresh_estimator.set_params(**best_params), 'isotonic', 3)
        estimator.fit(x_np, y_np)

    from sklearn.base import _pprint
    _pprint(estimator.get_params(deep=True), offset=2)
    return estimator, selector, vectorizer
Example #13
def model_param_search(estimator, X, y, param_dist, scoring,
                       n_iter=1, n_cv=5, verbose=10, random_state=1, model_id='model', save_search=True):
    start = time.time()

    random_search = RandomizedSearchCV(estimator, param_distributions=param_dist,
                                       n_iter=n_iter, scoring=scoring, cv=n_cv,
                                       verbose=verbose, random_state=random_state)
    random_search.fit(X, y)
    print('Best param: ', random_search.best_params_)
    print('Best score: ', random_search.best_score_)
    print('Best model: ', random_search.best_estimator_)
    if save_search:
        with open(model_id+'.pickle', 'wb') as f:
            pickle.dump(random_search, f)
    print('Time searching param for {}: {}'.format(
        model_id, (time.time() - start) / 60))

    return random_search.best_estimator_
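
# --- Usage sketch (illustrative, not part of the original example) ---
# The estimator, distributions and synthetic data below are assumptions, added
# only to show how model_param_search might be called; it also assumes the
# module-level imports (time, pickle, RandomizedSearchCV) used above are present.
from scipy.stats import randint
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X_demo, y_demo = make_classification(n_samples=300, n_features=10, random_state=1)
demo_dist = {'n_estimators': randint(50, 200), 'max_depth': randint(2, 5)}
best_clf = model_param_search(GradientBoostingClassifier(), X_demo, y_demo,
                              demo_dist, scoring='accuracy',
                              n_iter=5, model_id='demo_gbc', save_search=False)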
Example #14
def Stacking(real_train_tar):
    predictions_train = pd.DataFrame([np.expm1(y_lasso_predict), np.expm1(y_ridge_predict), np.expm1(y_rf_predict), np.expm1(y_xgb_predict)]).T
    sns.pairplot(predictions_train)
    
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum for sum of weights for observations in a node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum nodes in each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    stack_model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
        # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_stack = RandomizedSearchCV(estimator=stack_model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
        # Fit models
    xgb_stack.fit(predictions_train, real_train_tar)
    xgb_stack.best_params_
    write_pkl(xgb_stack.best_params_, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/stack_params.pkl')
    
    model_stacking = XGBRegressor(**xgb_stack.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_stacking.fit(predictions_train,real_train_tar)
    end=time.time()
    print("MSE for train data is: %f" % mean_squared_error(np.log1p(real_train_tar),np.log1p( model_stacking.predict(predictions_train))))
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    y_stack_predict=model_stacking.predict(predictions_train)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,y_stack_predict)
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
Example #15
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator

    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(random_state=0)

    estimators = [DecisionTreeRegressor(random_state=0),
                  DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        res_params = grid_search.cv_results_['params']
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    grid_search.cv_results_['split%d_test_score' % i][cand_i])

    # Test with a randomized search
    for est in estimators:
        random_search = RandomizedSearchCV(est, est_parameters,
                                           cv=cv, n_iter=3)
        random_search.fit(X, y)
        res_params = random_search.cv_results_['params']
        for cand_i in range(len(res_params)):
            est.set_params(**res_params[cand_i])

            for i, (train, test) in enumerate(cv.split(X, y)):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(
                    correct_score,
                    random_search.cv_results_['split%d_test_score'
                                              % i][cand_i])
Example #16
 def train_classifier(self, trainvectors, labels, c='1.0', kernel='linear', gamma='0.1', degree='1', class_weight='balanced', jobs=1, iterations=10, scoring='f1_micro', v=2):
     if len(list(set(labels))) > 2: # more than two classes to distinguish
         parameters = ['estimator__C', 'estimator__kernel', 'estimator__gamma', 'estimator__degree']
         multi = True
     else: # only two classes to distinguish
         parameters = ['C', 'kernel', 'gamma', 'degree']
         multi = False
     if len(class_weight.split(':')) > 1: # dictionary
         class_weight = dict([label_weight.split(':') for label_weight in class_weight.split()])
     c_values = [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000] if c == 'search' else [float(x) for x in c.split()]
     kernel_values = ['linear', 'rbf', 'poly'] if kernel == 'search' else [k for  k in kernel.split()]
     gamma_values = [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048] if gamma == 'search' else [float(x) for x in gamma.split()]
     degree_values = [1, 2, 3, 4] if degree == 'search' else [int(x) for x in degree.split()]
     grid_values = [c_values, kernel_values, gamma_values, degree_values]
     if not False in [len(x) == 1 for x in grid_values]: # only single parameter settings
         settings = {}
         for i, parameter in enumerate(parameters):
             settings[parameter] = grid_values[i][0]
     else:
         param_grid = {}
         for i, parameter in enumerate(parameters):
             param_grid[parameter] = grid_values[i]
         model = svm.SVC(probability=True)
         if multi:
             model = OutputCodeClassifier(model)
             trainvectors = trainvectors.todense()
         paramsearch = RandomizedSearchCV(model, param_grid, cv = 5, scoring=scoring, verbose = v, n_iter = iterations, n_jobs = jobs, pre_dispatch = 4)
         paramsearch.fit(trainvectors, labels)
         settings = paramsearch.best_params_
     # train an SVC classifier with the settings that led to the best performance
     self.model = svm.SVC(
         probability = True,
         C = settings[parameters[0]],
         kernel = settings[parameters[1]],
         gamma = settings[parameters[2]],
         degree = settings[parameters[3]],
         class_weight = class_weight,
         cache_size = 1000,
         verbose = v
     )
     self.model.fit(trainvectors, labels)
Example #17
def build_lasso(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a Lasso linear model with cross validation from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """

    model = Lasso(random_state=1)
    # An integer random_state makes the sampling reproducible
    param_dist = {'alpha': np.arange( 0.0001, 1, 0.001 ).tolist()}
    clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                             n_iter=15, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    print(clf.best_params_, clf.best_score_)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('../trained_networks/lasso_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
Example #18
def test_randomized_search_grid_scores():
    # Make a dataset with a lot of noise to get various kind of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100, n_informative=3,
                               random_state=0)

    # XXX: as of today (scipy 0.12) it's not possible to set the random seed
    # of scipy.stats distributions: the assertions in this test should thus
    # not depend on the randomization
    params = dict(C=expon(scale=10),
                  gamma=expon(scale=0.1))
    n_cv_iter = 3
    n_search_iter = 30
    search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter,
                                param_distributions=params, iid=False)
    search.fit(X, y)
    assert_equal(len(search.grid_scores_), n_search_iter)

    # Check consistency of the structure of each cv_score item
    for cv_score in search.grid_scores_:
        assert_equal(len(cv_score.cv_validation_scores), n_cv_iter)
        # Because we set iid to False, the mean_validation score is the
        # mean of the fold mean scores instead of the aggregate sample-wise
        # mean score
        assert_almost_equal(np.mean(cv_score.cv_validation_scores),
                            cv_score.mean_validation_score)
        assert_equal(list(sorted(cv_score.parameters.keys())),
                     list(sorted(params.keys())))

    # Check the consistency with the best_score_ and best_params_ attributes
    sorted_grid_scores = list(sorted(search.grid_scores_,
                              key=lambda x: x.mean_validation_score))
    best_score = sorted_grid_scores[-1].mean_validation_score
    assert_equal(search.best_score_, best_score)

    tied_best_params = [s.parameters for s in sorted_grid_scores
                        if s.mean_validation_score == best_score]
    assert_true(search.best_params_ in tied_best_params,
                "best_params_={0} is not part of the"
                " tied best models: {1}".format(
                    search.best_params_, tied_best_params))
Example #19
def build_tree(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a decision trees regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    model = DecisionTreeRegressor()
    param_dist = {'max_depth': sp_randint(1, 15),
                  'min_samples_split': sp_randint(2, 15)}
    clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                             n_iter=15, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    print(clf.best_params_, clf.best_score_)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('../trained_networks/dt_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
Example #20
 def random_grid_search_tuning(self,cat_param, cat_param_distribution, f_score, n_jobs, n_iter):
     cat_estimator = cat.CatBoostClassifier(**cat_param)
     cat_rgs = RandomizedSearchCV(
         estimator=cat_estimator,
         param_distributions=cat_param_distribution,
         cv=self.skf,
         scoring=make_scorer(f_score, greater_is_better=True, needs_proba=True),
         n_iter=n_iter,
         n_jobs=n_jobs,
         verbose=2,
         refit=False,
     )
     time_begin = time.time()
     cat_rgs.fit(self.X, self.y)
     time_end = time.time()
     logging.info('Random grid search elapsed time {0}'.format(time_end - time_begin))
     logging.info('best_score_ : {0}'.format(cat_rgs.best_score_))
     logging.info('best_params_ : {0}'.format(cat_rgs.best_params_))
     for score in cat_rgs.grid_scores_:
         logging.info('grid_scores_ : {0}'.format(score))
     gc.collect()
     return cat_rgs.best_params_
Example #21
 def evaluate_model(self, pipelines):
     n,m = pipelines
     parameters = self.get_params(n, self.optimizer)
     if self.optimizer == 'GridSearchCV':
         print("Performing GridSearchCV...", n)
         grid_search_t = GridSearchCV(m, parameters, verbose=1)
         grid_search_t.fit(self.evaluator.X_train, self.evaluator.y_train)
         return [grid_search_t.best_score_,grid_search_t.best_params_]
     elif self.optimizer == 'RandomizedSearchCV':
         print("Performing RandomizedSearchCV...", n)
         random_search_t = RandomizedSearchCV(m, parameters, verbose=1)
         random_search_t.fit(self.evaluator.X_train, self.evaluator.y_train)
         return [random_search_t.best_score_,random_search_t.best_params_]
     elif self.optimizer == 'GeneticSearchCV':
         print("Performing GeneticSearchCV...", n)
         genetic_search_t = GeneticSearchCV(m, parameters, scoring=None, cv=KFold(n_splits=5), n_jobs=1, verbose=1, refit=False, population_size=50, gene_mutation_prob=0.10, gene_crossover_prob=0.5, tournament_size=3, generations_number=10)
         genetic_search_t.fit(self.evaluator.X_train, self.evaluator.y_train)
         return [genetic_search_t.best_score_,genetic_search_t.best_params_]
     elif self.optimizer == 'EdasSearch':
         print("Performing EdasSearch...", n)
         eda_search_t = EdasSearch(getModelAccuracy, parameters, m,iterations=2, sample_size=15, select_ratio=0.3, debug=False, n_jobs=1)
         eda_search_t.fit()
         return [eda_search_t.best_score_,eda_search_t.best_params_]
Example #22
 def train_classifier(self, trainvectors, labels, n_neighbors='3', weights='uniform', algorithm='auto', leaf_size='30', metric='euclidean', p=2, scoring='roc_auc', jobs=1, v=2):
     if len(list(set(labels))) > 2: # more than two classes to distinguish
         parameters = ['estimator__n_neighbors','estimator__weights', 'estimator__leaf_size', 'estimator__metric']
         multi = True
     else: # only two classes to distinguish
         parameters = ['n_neighbors','weights', 'leaf_size', 'metric'] 
         multi = False
     n_neighbors = [3, 5, 7, 9] if n_neighbors == 'search' else [int(x) for x in n_neighbors.split()]
     weights = ['uniform', 'distance'] if weights == 'search' else weights.split()
     leaf_size = [10, 20, 30, 40, 50] if leaf_size == 'search' else [int(x) for x in leaf_size.split()]
     metric = ['minkowski', 'euclidean', 'manhattan', 'hamming'] if metric == 'search' else metric.split()
     grid_values = [n_neighbors, weights, leaf_size, metric]
     if not False in [len(x) == 1 for x in grid_values]: # only single parameter settings
         settings = {}
         for i, parameter in enumerate(parameters):
             settings[parameter] = grid_values[i][0]
     else:
         param_grid = {}
         for i, parameter in enumerate(parameters):
             param_grid[parameter] = grid_values[i]
         model = KNeighborsClassifier(algorithm=algorithm,p=p) 
         if multi:
             model = OutputCodeClassifier(model)
             trainvectors = trainvectors.todense()
         paramsearch = RandomizedSearchCV(model, param_grid, verbose=v, scoring=scoring, cv=5, n_jobs=jobs)
         paramsearch.fit(trainvectors, labels)
         settings = paramsearch.best_params_
     self.model = KNeighborsClassifier(
         algorithm=algorithm,
         p=p,
         n_neighbors=settings[parameters[0]],
         weights=settings[parameters[1]],
         leaf_size=settings[parameters[2]],
         metric=settings[parameters[3]]
     )
     self.model.fit(trainvectors, labels)
def model(clf_name, features, labels):

    start_time = time.time()
    # specify parameters and distributions to sample from
    clf        = make_pipeline(StandardScaler(), PCA(), classifiers[clf_name]) #PCA optional: n_components=2

    '''clf = Pipeline([
        ('reduce_dim', PCA()),
        ('classify',  classifiers[clf_name])
    ])'''
    
    # select correct param set, adjust the pca to current window
    param_dist = param_dict[clf_name]

    param_dist["pca__n_components"] = sp_randint(2, features.shape[1]-1)

    #if 'randomforestclassifier__max_features' in param_dist:
    #    param_dist['randomforestclassifier__max_features'] = sp_randint(2, features.shape[1])

    # run randomized search
    n_iter_search = 50
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       scoring="f1_weighted")#,
                                       #n_jobs = 8)#,pre_dispatch=8)

    #start = time.time()
    random_search.fit(features, labels)
    #print("RandomizedSearchCV took %.2f seconds for %d candidates"
    #      " parameter settings." % ((time.time() - start), n_iter_search))
    #report(random_search.cv_results_)

    elapsed_time = time.time() - start_time
    print('Optimizing %s on window %d took %d sec' % (clf_name, features.shape[1]/6, elapsed_time))

    return random_search
Example #24
def test_random_search_cv_results():
    # Make a dataset with a lot of noise to get various kind of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100, n_informative=3,
                               random_state=0)

    # scipy.stats dists now supports `seed` but we still support scipy 0.12
    # which doesn't support the seed. Hence the assertions in the test for
    # random_search alone should not depend on randomization.
    n_splits = 3
    n_search_iter = 30
    params = dict(C=expon(scale=10), gamma=expon(scale=0.1))
    random_search = RandomizedSearchCV(SVC(), n_iter=n_search_iter,
                                       cv=n_splits, iid=False,
                                       param_distributions=params)
    random_search.fit(X, y)
    random_search_iid = RandomizedSearchCV(SVC(), n_iter=n_search_iter,
                                           cv=n_splits, iid=True,
                                           param_distributions=params)
    random_search_iid.fit(X, y)

    param_keys = ('param_C', 'param_gamma')
    score_keys = ('mean_test_score', 'mean_train_score',
                  'rank_test_score',
                  'split0_test_score', 'split1_test_score',
                  'split2_test_score',
                  'split0_train_score', 'split1_train_score',
                  'split2_train_score',
                  'std_test_score', 'std_train_score',
                  'mean_fit_time', 'std_fit_time',
                  'mean_score_time', 'std_score_time')
    n_cand = n_search_iter

    for search, iid in zip((random_search, random_search_iid), (False, True)):
        assert_equal(iid, search.iid)
        cv_results = search.cv_results_
        # Check results structure
        check_cv_results_array_types(cv_results, param_keys, score_keys)
        check_cv_results_keys(cv_results, param_keys, score_keys, n_cand)
        # For random_search, all the param array vals should be unmasked
        assert_false(any(cv_results['param_C'].mask) or
                     any(cv_results['param_gamma'].mask))
        check_cv_results_grid_scores_consistency(search)

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
params = {'n_estimators': n_estimators,
          'max_features': max_features,
          'max_depth': max_depth,
          'min_samples_split': min_samples_split,
          'min_samples_leaf': min_samples_leaf,
          'bootstrap': bootstrap}
cv_result = RandomizedSearchCV(RandomForestClassifier(), params, cv=3, scoring='accuracy', random_state=5)
cv_result.fit(X_resampled, y_resampled)
cv_result.best_params_


# In[114]:


classifier_op = RandomForestClassifier(n_estimators = 2000, min_samples_split=2,min_samples_leaf=1,max_features='auto',
                               max_depth=70, random_state = 42,bootstrap=False)
classifier_op.fit(X_resampled, y_resampled)


# In[115]:

Example #26
    return model


def create_hyperparameters():
    batches = [10, 20]
    optimizers = ['rmsprop', 'adam', 'adadelta']
    dropout = [0.1]
    return {'batch_size': batches, 'optimizer': optimizers, 'drop': dropout}


hyperparameters = create_hyperparameters()

# Wrap the Keras model we built so it can be plugged into scikit-learn!
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
model = KerasClassifier(build_fn=build_model, verbose=0)

# Use scikit-learn's GridSearchCV with the wrapped model!
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# search = GridSearchCV(model, hyperparameters, cv=3)
search = RandomizedSearchCV(model, hyperparameters, cv=3)
search.fit(x_train, y_train)

acc = search.score(x_test, y_test)
print('Best parameters:', search.best_params_)
print('Final score:', acc)
print('hyper_cnn')
'''
Best parameters: {'optimizer': 'adam', 'drop': 0.1, 'batch_size': 20}
Final score: 0.9825000166893005
'''
Example #27
    batches = [50,100,120]
    optimizers = ['rmsprop', 'adam', 'adadelta'] # pick whichever suits the task.
    dropout = np.linspace(0.1, 0.5, 5)
    epochs = [1,2]
    return{"kerasclassifier__batch_size":batches, "kerasclassifier__optimizer":optimizers, "kerasclassifier__epochs":epochs} #, "keep_prob":dropout}
# When using make_pipeline below, prefix each parameter with kerasclassifier__; with a plain Pipeline, use the step-name prefix (e.g. svc__).
from keras.wrappers.scikit_learn import KerasClassifier # makes the Keras model compatible with scikit-learn (likely used for MNIST)
# from keras.wrappers.scikit_learn import KerasRegressor # wraps a Keras model so scikit-learn cross-validation can be used
model = KerasClassifier(build_fn=build_network, verbose=1) # verbose=0; pulls in the functional model defined above.

from sklearn.preprocessing import MinMaxScaler                                                   
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

hyperparameters = create_hyperparameters()

pipe = make_pipeline(MinMaxScaler(), model)

from sklearn.model_selection import RandomizedSearchCV
search = RandomizedSearchCV(estimator=pipe,
                             param_distributions=hyperparameters,
                             n_iter=10, n_jobs=1, cv=3, verbose=1)
#                              # performs 10 search iterations with 3-fold cross-validation (the data is split into 3 parts for validation); look up n_jobs yourself.
# Where KFold makes 5 fixed splits, this samples randomly; the goal is to find which of the hyperparameters above give the best result.
# search.fit(data["x_train"], data["y_train"])

search.fit(x_train, y_train) # feed in the data!

print(search.best_score_)
print(search.best_params_)
Example #28
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")


    # specify parameters and distributions to sample from
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 6),
                  "min_samples_split": sp_randint(2, 11),
                  "min_samples_leaf": sp_randint(2, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    # run randomized search
    n_iter_search = 20
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search)

    start = time()
    random_search.fit(info_train, OL_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.cv_results_, n_top=10)

    stop

    forest = RandomForestClassifier(n_estimators=100, **random_search.best_params_)
    scores = cross_val_score(forest, class_info, stars)
    print(scores.mean())

    forest.fit(info_train, OL_train)
    #joblib.dump(forest, 'trained_forest_classifier.pkl')
Example #29
X = train_df.drop("Survived", axis=1).copy()
y = train_df["Survived"]

# In[ ]:

param_grid = {
    'max_depth': st.randint(6, 11),
    'n_estimators': st.randint(300, 500),
    'max_features': np.arange(0.5, .81, 0.05),
    'max_leaf_nodes': st.randint(6, 10)
}

grid = RandomizedSearchCV(rfc,
                          param_grid,
                          cv=10,
                          scoring='accuracy',
                          verbose=1,
                          n_iter=20)

grid.fit(X, y)

# In[ ]:

grid.best_estimator_

# In[ ]:

grid.best_score_

# Ok so now let's generate our predictions based on the best estimator model.
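
# Sketch of that step (not from the original notebook): reuse the refit best
# estimator to produce predictions. Predicting on X here is only illustrative;
# a real submission would use the preprocessed test features instead.
best_rfc = grid.best_estimator_
train_predictions = best_rfc.predict(X)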
Example #30
# specify parameters for hyperparameter search
# specify parameters and distributions to sample from
param_dist = {
    "max_depth": [6, None],
    "max_features": sp_randint(1, max_features),
    "min_samples_split": sp_randint(2, 30),
    "min_samples_leaf": sp_randint(1, 30),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}

# run randomized search
n_iter_search = _RF_iterations
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    verbose=_VERBOSITY
)  # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

# let's start training the model.
print("Beginning hyper-parameter search")
start = time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

print(random_search.best_params_)

# now let's train a model on the entire training set
# with those parameters
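# Sketch of the step described above (assumption, not in the original snippet):
# clone the searched estimator, apply the best parameters found, and refit it
# on the full training set.
from sklearn.base import clone
final_model = clone(clf).set_params(**random_search.best_params_)
final_model.fit(X_train, y_train)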
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
from scipy.stats import uniform as sp_uniform

# Create the parameter grid for the RBF kernel; we have to set C and gamma
C_dist = sp_uniform(scale=10)
gamma_dist = sp_uniform(scale=1)
parameters = {'kernel': ['rbf'],
              'C': C_dist,
              'gamma': gamma_dist}
from sklearn.model_selection import RandomizedSearchCV
n_iter_search = 8
svm_clsf = svm.SVC()
rnd_clsf = RandomizedSearchCV(estimator=svm_clsf,
                              param_distributions=parameters,
                              n_iter=n_iter_search, 
                              cv=3,
                              n_jobs=1,
                              verbose=2)

# Warning! This takes a really long time to compute, about 2 days.
start_time = dt.datetime.now()
print('Start param searching at {}'.format(str(start_time)))

rnd_clsf.fit(X_train, y_train)

elapsed_time= dt.datetime.now() - start_time
print('Elapsed time, param searching {}'.format(str(elapsed_time)))
sorted(rnd_clsf.cv_results_.keys())

classifier = rnd_clsf.best_estimator_
params = rnd_clsf.best_params_
Example #33
    'bootstrap': bootstrap,
    'max_samples': max_samples
}
pprint(random_grid)

#%%

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 5-fold cross-validation,
# search across 10000 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               scoring='neg_mean_absolute_error',
                               n_iter=10000,
                               cv=5,
                               verbose=2,
                               random_state=42,
                               n_jobs=-1)

# Fit the random search model
rf_random.fit(train_features, train_labels)
#%%
# Create dataframe with metrics of every single combination tested
random_results = pd.DataFrame(rf_random.cv_results_)
random_results = random_results.sort_values('rank_test_score')
random_results.to_json(
    r'/Users/matthew/Desktop/data/RF_randomized_search_CV_mae.json')

# set RF with best parameters from random sampling
print(rf_random.best_params_)
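
# Sketch of the step the comment above describes (assumption, not in the
# original snippet): configure a fresh regressor with the best parameters
# found by the random search and refit it on the training data.
rf_best = RandomForestRegressor(**rf_random.best_params_)
rf_best.fit(train_features, train_labels)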
Example #34
def run_test(filename, results_dir, models, random_state, external_split,
             internal_split, optimization_iterations):
    global df_results
    print(filename)
    data_dict['Dataset Name'] = filename.replace('.csv', '')
    df = pd.read_csv(directory + '/' + filename)
    X, Y = fix_dataset(df)
    kf = StratifiedKFold(n_splits=external_split,
                         random_state=random_state,
                         shuffle=True)
    for fold_index, (train_index, test_index) in enumerate(kf.split(X, Y)):
        data_dict['Cross Validation[1-10]'] = fold_index
        print("fold index =", fold_index)
        x_train = X.iloc[train_index]
        y_train = Y.iloc[train_index]
        x_test = X.iloc[test_index]
        y_test = Y.iloc[test_index]
        for model_name, model_class, model, model_dict in models:
            print('Model:', model_name)
            data_dict['Algorithm Name'] = model_name
            # distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])
            distributions = model_dict
            start_training_time = time.time()
            randomSearcher = RandomizedSearchCV(
                model,
                distributions,
                random_state=random_state,
                cv=internal_split,
                n_iter=optimization_iterations,
                scoring=make_scorer(accuracy_score))
            randomSearcher.fit(x_train, y_train.values.ravel())

            if model_class is wprb:
                params = {
                    k.replace("estimator__", ""): v
                    for k, v in randomSearcher.best_params_.items()
                }
                best_model = OneVsRestClassifier(model_class(**params))
            else:
                params = randomSearcher.best_params_
                best_model = model_class(**params)
            data_dict['Hyper-Parameters Values'] = params
            best_model.fit(x_train, y_train.values.ravel())
            data_dict['Training Time'] = time.time() - start_training_time
            print("best params:", params)
            print(
                "train accuracy:",
                round(accuracy_score(y_train, best_model.predict(x_train)), 4))
            start_inference_time = time.time()
            test_pred = best_model.predict(x_test)
            test_pred_proba = best_model.predict_proba(x_test)
            data_dict['Inference Time'] = (
                time.time() - start_inference_time) / (len(x_test)) * 1000
            print("test accuracy:", round(accuracy_score(y_test, test_pred),
                                          4))
            print()
            data_dict['Accuracy'] = accuracy_score(y_test, test_pred)
            data_dict['Precision'] = precision_score(
                y_test,
                test_pred,
                average='macro',
                labels=np.unique(test_pred))
            unique_labels = np.unique(Y.values)
            if len(unique_labels) == 2:  # multiclass vs binary classification
                data_dict['AUC'] = roc_auc_score(y_true=y_test,
                                                 y_score=test_pred_proba[:, 1])
            else:
                # plaster = test_pred_proba[:, [np.where(np.unique(Y.values) == x)[0][0] for x in np.unique(y_test)]]
                # plaster2 = np.array([[x / sum(y) for x in y] for y in plaster])
                data_dict['AUC'] = roc_auc_score(y_true=y_test,
                                                 y_score=test_pred_proba,
                                                 multi_class='ovr',
                                                 labels=np.unique(y_test))
            all_TPR = []
            all_FPR = []
            all_PR_CURVE = []
            for index, class_label in enumerate(np.unique(y_test)):
                tn, fp, fn, tp = confusion_matrix(
                    y_test == class_label, test_pred == class_label).ravel()
                all_FPR.append(fp / (fp + tn))
                all_TPR.append(tp / (tp + fn))
                precision, recall, _ = precision_recall_curve(
                    y_test == class_label, test_pred_proba[:, index])
                all_PR_CURVE.append(auc(recall, precision))
            data_dict['FPR'] = np.mean(all_FPR)
            data_dict['TPR'] = np.mean(all_TPR)
            data_dict['PR Curve'] = np.mean(all_PR_CURVE)

            df_results = df_results.append(data_dict, ignore_index=True)
    df_results.to_csv(results_dir + '/' + filename, index=False)
    df_results = df_results.iloc[0:0]
Example #35
def Model(train_linear, test_linear):
    train_linear_fea=train_linear.drop(columns=['SalePrice'])
    train_linear_tar=train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0)
    def evaluate(model, test_features, test_labels,train_features, train_labels):
        predictions = model.predict(test_features)
        errors = abs(predictions - test_labels)
        mape = 100 * np.mean(errors / test_labels)
        accuracy = 100 - mape
        print('Model Performance')
        print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%.'.format(accuracy))    
        print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train)))
        print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test)))
        return accuracy
    real_train_tar=np.expm1(train_linear_tar)
    """
        . Lasso model
    """
    
    lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), )
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)
    
    start=time.time()
    lasso =Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_lasso_predict=lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_lasso=np.expm1(lasso.predict(test_linear))
    
    
    """
        . Ridge model
    """
    
    ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score)
    coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    
    start=time.time()
    ridge =Ridge(normalize = True)
    ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(ridge,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_ridge_predict=ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_ridge=np.expm1(ridge.predict(test_linear))
    
    
    """
        . Random Forest
    """
    #train=train.drop(columns=['DateSold'])
    #test=test.drop(columns=['DateSold'])
    #X_train=train.drop(columns=['SalePrice'])
    #Y_train=train['SalePrice']
    X_train=train_linear_fea
    Y_train=train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,test_size=0.2, random_state=0)
    
    
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    #
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(X_train, Y_train)
    #rf_random.fit(x_train_rf, y_train_rf)
    rf_random.best_params_
    
    #Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search,
    # we can explicitly specify every combination of settings to try. 
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110,120,130],
        'max_features': [2, 3],
        'min_samples_leaf': [1,2,3, 4],
        'min_samples_split': [2,4,6,8, 10, 12],
        'n_estimators': [600,700, 800, 900, 1000]
    }
    # Create a based model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
    #grid_search.fit(x_train, y_train)
    grid_search.fit(X_train, Y_train)
    grid_search.best_params_
    
    best_random = grid_search.best_estimator_
    start=time.time()
    best_random.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(best_random, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_rf_predict=best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_rf = pd.DataFrame({'features':train_linear_fea.columns, 'imp':best_random.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_rf = importance_rf.iloc[:20,]
    
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_rf=np.expm1(best_random.predict(test_linear))
    
    """
        . Xgboost
    """
    
    learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)]
    # Minimum sum of instance weights needed in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
    n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    subsample=[0.3, 0.4,0.5,0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
    # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    
    
    """
    best_params_={'learning_rate': 0.1,
     'max_depth': 2,
     'min_child_weight': 4,
     'n_estimators': 900,
     'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_xgb.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    
    y_xgb_predict=model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_xgb = importance_xgb.iloc[:20,]
    
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_xgb=np.expm1(model_xgb.predict(test_linear))
    
    return(test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
Ejemplo n.º 36
0
def _compute_thresh(this_data, method='bayesian_optimization',
                    cv=10, y=None, random_state=None):
    """Compute the rejection threshold for one channel.

    Parameters
    ----------
    this_data: array (n_epochs, n_times)
        Data for one channel.
    method : str
        'bayesian_optimization' or 'random_search'
    cv : iterator
        Iterator for cross-validation.
    random_state : int seed, RandomState instance, or None (default)
        The seed of the pseudo random number generator to use.

    Returns
    -------
    best_thresh : float
        The best threshold.

    Notes
    -----
    For method='random_search', the random_state parameter gives deterministic
    results only for scipy versions >= 0.16. This is why we recommend using
    autoreject with scipy version 0.16 or greater.
    """
    est = _ChannelAutoReject()
    all_threshes = np.sort(np.ptp(this_data, axis=1))

    if method == 'random_search':
        param_dist = dict(thresh=uniform(all_threshes[0],
                                         all_threshes[-1]))
        rs = RandomizedSearchCV(est,
                                param_distributions=param_dist,
                                n_iter=20, cv=cv,
                                random_state=random_state)
        rs.fit(this_data, y)
        best_thresh = rs.best_estimator_.thresh
    elif method == 'bayesian_optimization':
        cache = dict()

        def func(thresh):
            idx = np.where(thresh - all_threshes >= 0)[0][-1]
            thresh = all_threshes[idx]
            if thresh not in cache:
                est.set_params(thresh=thresh)
                obj = -np.mean(cross_val_score(est, this_data, y=y, cv=cv))
                cache.update({thresh: obj})
            return cache[thresh]

        n_epochs = all_threshes.shape[0]
        idx = np.concatenate((
            np.linspace(0, n_epochs, 40, endpoint=False, dtype=int),
            [n_epochs - 1]))  # ensure last point is in init
        idx = np.unique(idx)  # linspace may be non-unique if n_epochs < 40
        initial_x = all_threshes[idx]
        best_thresh, _ = bayes_opt(func, initial_x,
                                   all_threshes,
                                   expected_improvement,
                                   max_iter=10, debug=False,
                                   random_state=random_state)

    return best_thresh
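
# A minimal usage sketch (not part of the original function): computing a threshold for
# one synthetic channel with the random-search branch. It assumes the module's private
# helpers used above (_ChannelAutoReject, uniform, cross_val_score, ...) are importable,
# as they are in autoreject.
if __name__ == '__main__':
    rng_demo = np.random.RandomState(42)
    fake_channel = rng_demo.randn(100, 240)  # (n_epochs, n_times) for a single channel
    demo_thresh = _compute_thresh(fake_channel, method='random_search',
                                  cv=5, random_state=42)
    print('Demo rejection threshold:', demo_thresh)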
Ejemplo n.º 37
0
# Import necessary modules
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

# Setup the parameters and distributions to sample from: param_dist
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

# Instantiate a Decision Tree classifier: tree
tree = DecisionTreeClassifier()

# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)

# Fit it to the data
tree_cv.fit(X,y)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))
#Tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 3, 'max_features': 5, 'min_samples_leaf': 2}
#Best score is 0.7395833333333334

# RandomizedSearchCV will not outperform GridSearchCV over the same grid; its value is that it samples only a subset of the parameter combinations and therefore saves computation time.
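
# A small illustrative sketch (not from the original exercise) of why that is: over the
# same grid, GridSearchCV fits every combination while RandomizedSearchCV fits only
# n_iter sampled combinations, which is where the time savings come from.
from sklearn.model_selection import GridSearchCV
demo_grid = {"max_depth": [3, 5, 10, None], "min_samples_leaf": [1, 2, 4, 8]}
n_combinations = len(demo_grid["max_depth"]) * len(demo_grid["min_samples_leaf"])    # 16 candidates
grid_demo = GridSearchCV(DecisionTreeClassifier(), demo_grid, cv=5)                  # 16 * 5 = 80 fits
rand_demo = RandomizedSearchCV(DecisionTreeClassifier(), demo_grid, n_iter=5, cv=5)  # 5 * 5 = 25 fits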

####################
# First: Hold-out evaluation data
# how well can the model perform on never seen data?
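# A minimal hold-out split sketch (illustrative, with synthetic stand-ins; in practice
# the script's real feature matrix and target are used here):
import numpy as np
from sklearn.model_selection import train_test_split
X_demo = np.random.rand(500, 8)
y_demo = np.random.rand(500)
X_tr, X_hold, y_tr, y_hold = train_test_split(X_demo, y_demo, test_size=0.2, random_state=7)
# The search below is run on (X_tr, y_tr) only; (X_hold, y_hold) stays untouched until
# the final evaluation of the selected model.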
#random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}

#Random forest regressor with random grid
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               n_iter=100,
                               scoring='neg_mean_squared_error',
                               verbose=2,
                               random_state=7,
                               cv=5)


#timer function
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        hour, temp_sec = divmod((datetime.now() - start_time).total_seconds(),
                                3600)
        mins, sec = divmod(temp_sec, 60)
        print("\n Time taken: %i:%i:%s" % (hour, mins, round(sec, 2)))
Ejemplo n.º 39
0
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV  # Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 500, 1000]
}
# Create a base model
rf = RandomForestRegressor()
# Instantiate the randomized search model
random_search_rf = RandomizedSearchCV(estimator=rf,
                                      param_distributions=param_grid,
                                      cv=10,
                                      n_jobs=-1,
                                      verbose=2)
model_fit_RF = random_search_rf.fit(X_train, y_train)
print(model_fit_RF.best_params_)
##Testing the model
test_predict_RF = model_fit_RF.predict(X_test)

from sklearn import metrics

print('RF Mean Absolute Error:',
      metrics.mean_absolute_error(y_test, test_predict_RF))
print('RF Mean Squared Error:',
      metrics.mean_squared_error(y_test, test_predict_RF))
print('RF R2:', metrics.r2_score(y_test, test_predict_RF))
print('RF Root Mean Squared Error:',
      np.sqrt(metrics.mean_squared_error(y_test, test_predict_RF)))
Ejemplo n.º 40
0
def ml_tests(imputed_data):
    # Scikit-learn requirement: numeric values only - transform the categorical columns
    categorical_mask = (imputed_data.dtypes == "category")
    categorical_columns = imputed_data.columns[categorical_mask].tolist()
    category_enc = pd.get_dummies(imputed_data[categorical_columns])
    imputed_data = pd.concat([imputed_data, category_enc], axis=1)
    imputed_data = imputed_data.drop(columns=categorical_columns)

    imputed_data = imputed_data.reset_index()

    # Output
    # print(imputed_data.info())
    # imputed_data.to_excel(excel_writer="Files/Tests/imputed_data.xlsx", sheet_name="Immobilien")

    # XGBoost baseline model
    print("XGBoost baseline model:")
    x = imputed_data.drop(columns=["angebotspreis"]).values
    y = imputed_data["angebotspreis"].values
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    xg_reg = xgb.XGBRegressor(objective="reg:squarederror",
                              n_estimators=20,
                              seed=123)
    xg_reg.fit(x_train, y_train)
    preds = xg_reg.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("RMSE: %f" % rmse)
    print()

    print_feature_importances(
        model=xg_reg, data=imputed_data.drop(columns=["angebotspreis"]))

    # Grid Search parameter Tuning
    print("Grid Search Parameter Tuning:")
    gbm_param_grid = {
        'colsample_bytree': [0.3, 0.7],
        'n_estimators': [50],
        'max_depth': [2, 5]
    }
    gbm = xgb.XGBRegressor(objective="reg:squarederror")
    grid_mse = GridSearchCV(estimator=gbm,
                            param_grid=gbm_param_grid,
                            scoring="neg_mean_squared_error",
                            cv=4,
                            verbose=1)
    grid_mse.fit(x_train, y_train)
    print("Best parameters found: ", grid_mse.best_params_)
    print("Lowest RMSE Grid Search found: ",
          np.sqrt(np.abs(grid_mse.best_score_)))
    print()

    # Randomized Search parameter tuning
    print("Randomized Search Parameter Tuning:")
    gbm_param_grid2 = {'n_estimators': [25], 'max_depth': range(2, 12)}

    gbm2 = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
    randomized_mse = RandomizedSearchCV(estimator=gbm2,
                                        param_distributions=gbm_param_grid2,
                                        scoring="neg_mean_squared_error",
                                        n_iter=5,
                                        cv=4,
                                        verbose=1)
    randomized_mse.fit(x_train, y_train)
    print("Best parameters found: ", randomized_mse.best_params_)
    print("Lowest RMSE Randomized Search found: ",
          np.sqrt(np.abs(randomized_mse.best_score_)))

    dm_train = xgb.DMatrix(data=x_train, label=y_train)
    dm_test = xgb.DMatrix(data=x_test, label=y_test)
    params = {"booster": "gblinear", "objective": "reg:squarederror"}
    xg_reg2 = xgb.train(dtrain=dm_train, params=params, num_boost_round=15)
    preds2 = xg_reg2.predict(dm_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds2))
    print("RMSE: %f" % rmse)

    reg_params = [0.1, 0.3, 0.7, 1, 10, 100]
    params1 = {"objective": "reg:squarederror", "max_depth": 3}
    rmses_l2 = []
    for reg in reg_params:
        params1["lambda"] = reg
        cv_results_rmse = xgb.cv(dtrain=dm_train,
                                 params=params1,
                                 nfold=3,
                                 num_boost_round=15,
                                 metrics="rmse",
                                 as_pandas=True)
        rmses_l2.append(cv_results_rmse["test-rmse-mean"].tail(1).values[0])

    print("Best rmse as a function of l2:")
    print(pd.DataFrame(list(zip(reg_params, rmses_l2)), columns=["l2",
                                                                 "rmse"]))
    print()

    print_feature_importances(
        model=xg_reg2, data=imputed_data.drop(columns=["angebotspreis"]))

    # Stochastic Gradient Boosting
    print("Stochastic Gradient Boosting:")
    sgbr = GradientBoostingRegressor(max_depth=4,
                                     subsample=0.9,
                                     max_features=0.75,
                                     n_estimators=200,
                                     random_state=2)

    sgbr.fit(x_train, y_train)
    y_pred = sgbr.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("RMSE: %f" % rmse)
    print()

    print_feature_importances(
        model=sgbr, data=imputed_data.drop(columns=["angebotspreis"]))

    # Random Forest
    print("Random Forest:")
    rf = RandomForestRegressor(n_estimators=25, random_state=2)
    rf.fit(x_train, y_train)
    y_pred2 = rf.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred2))
    print("RMSE: %f" % rmse)
    print()

    print_feature_importances(
        model=rf, data=imputed_data.drop(columns=["angebotspreis"]))
Ejemplo n.º 41
0
n_estimators = [1000]
max_features = ['auto']
max_depth = [3, 5, 7]
min_samples_leaf = [2, 4]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf
              }

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(
                estimator=rf,
                param_distributions=random_grid,
                n_iter=100,
                cv=3,
                scoring='accuracy',
                verbose=5,
                n_jobs=-1)

rf_random.fit(X_train, y_train)
print(rf_random.best_params_)

print("Scores for the Train Dataset: ")
y_train_pred = rf_random.predict(X_train)
accuracy_train = accuracy_score(y_train, y_train_pred)
print("Accuracy: %.2f%%" % (accuracy_train * 100.0))

print("- - - - - - - - - - ")

print("Scores for the Test Dataset: ")
np.random.seed(SEED)
param_space = {
    "max_depth": [3, 5, 10, 15, 20, 30, None],
    "min_samples_split" : randint(2, 150),
    "min_samples_leaf" : randint(2, 150),
    "criterion" : ["gini", "entropy"]
}

raw_train_x, validation_x, raw_train_y, validation_y = train_test_split(x, y, test_size=0.25, random_state=SEED, stratify=y)

from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(DecisionTreeClassifier(),
                   param_space,
                   n_iter=100,
                   cv = 5,
                   random_state = SEED)
search.fit(raw_train_x, raw_train_y)
results = pd.DataFrame(search.cv_results_)
results.head()

print(len(results))

print(search.best_params_)
print(search.best_score_)

best = search.best_estimator_
best

predictions = best.predict(validation_x)
Ejemplo n.º 43
0
loss = ['deviance', 'exponential']

grid_random = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'loss': loss
}

# random search of parameters; the parameters of the randomized search can be modified
GB_random = RandomizedSearchCV(estimator=GD_Classifier,
                               param_distributions=grid_random,
                               n_iter=20,
                               cv=3,
                               verbose=2,
                               random_state=42,
                               n_jobs=-1)

# fit the randomized search CV
GB_random.fit(X_train, Y_train)

print(
    '>>> Starting Grid Search Cross Validation to find optimal Gradient Boosting Classifier...'
)

# create the parameter grid based on the results of random search
# varying slightly the optimised parameters found
grid_parameters = {
    'loss': [GB_random.best_params_['loss']],
Ejemplo n.º 44
0
                 'max_leaf_nodes': (1,10,100,),
                 'min_samples_split': (0.1,0.25,0.5,0.75,1.0,),
                 'min_samples_leaf': (1,10,100,),
}]

#est=ensemble.RandomForestRegressor()
#est=kernel_ridge.KernelRidge()
#est=neighbors.NearestNeighbors()
#est=neighbors.KNeighborsRegressor()
est=ensemble.ExtraTreesRegressor()

# https://stackoverflow.com/questions/37161563/how-to-graph-grid-scores-from-gridsearchcv

# run randomized search
n_iter_search = 20
rs = RandomizedSearchCV(est, param_distributions=hyper_params, n_iter=n_iter_search)
t0 = time.time()
rs.fit(x_train, y_train.ravel())
runtime = time.time() - t0
print("RandomizedSearchCV took %.6f seconds for %d candidates" " parameter settings." % (runtime, n_iter_search))
print(rs.cv_results_)

#scores = [x[1] for x in rs.grid_scores_]
#scores = np.array(scores).reshape(len(Cs), len(Gammas))
#
#for ind, i in enumerate(Cs):
#    plt.plot(Gammas, scores[ind], label='C: ' + str(i))
#plt.legend()
#plt.xlabel('Gamma')
#plt.ylabel('Mean score')
#plt.show()
from sklearn.model_selection import ShuffleSplit
clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic')
param_dist = {'silent': [False],
        'max_depth': [6, 10, 15, 20],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4],
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
        'gamma': [0, 0.25, 0.3, 0.4, 0.5, 1.0],
        'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
        'reg_alpha': 2. ** np.arange(-13, 10, 2),
        'n_estimators': [100,150,200]
             }
clf = RandomizedSearchCV(clf_xgb, param_distributions = param_dist, 
                         n_iter = 25, 
                         scoring = 'f1', error_score = 0,
                         verbose = 3, n_jobs = -1)

rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
gc.collect()

estimators = []
results = np.zeros(len(one_hot_encoded_X))
score = []
for train_index, test_index in rs.split(one_hot_encoded_X):
#    print('Iteration:', i)
    X_train, X_test = one_hot_encoded_X.iloc[train_index], one_hot_encoded_X.iloc[test_index]
    y_train, y_test = new_y_binary.iloc[train_index], new_y_binary.iloc[test_index]
    clf.fit(X_train, y_train)

    estimators.append(clf.best_estimator_)
Ejemplo n.º 46
0
    'scaler': scalers_to_test,
    'red_dim': [PCA()],
    'red_dim__n_components': n_features_to_test,
    'clf__hidden_layer_sizes': l,
    'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'clf__solver': ['lbfgs', 'sgd', 'adam'],
    'clf__batch_size': b_size,
    'clf__learning_rate': ['constant', 'invscaling', 'adaptive']
}]

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

grid = RandomizedSearchCV(pipeline,
                          param_distributions=parameteres,
                          n_iter=100,
                          cv=5,
                          random_state=1)

print('a')

grid.fit(X_train, y_train)

print('b')

score = grid.score(X_test, y_test)
best_p = grid.best_params_
#print(f'score = {grid.score(X_test, y_test)}')
#print(grid.best_params_)
# Define the parameters that will be tuned randomly
keras_param_options = {'filters' : [4, 8, 16],
             'filters_LSTM' : [4, 8, 16],
             'strides' : [1],
             'padding' : ['valid'],
             'activation_convolution' : [None],
             'activation_LSTM' : ['tanh'],
             'optimizers' : ['Adam', 'Adadelta'],
             'number_hidden_units' : [4, 8],
             'epochs' : [30],
             'batch_size' : [8, 16, 32]}

# Using RandomizedSearchCV to find the best model randomly
random_search = RandomizedSearchCV(model,
                                   param_distributions = keras_param_options,
                                   n_iter = 50,
                                   cv = 5,
                                   verbose = 10)

# Fit to the training data
random_search.fit(x_train, y_train)
df_result_hyper_tuned = pd.DataFrame.from_dict(random_search.cv_results_)
df_result_hyper_tuned.to_csv('/hpc-home/kristian/effector-non-effector/scripts-cnn-lstm-separate-group/scripts-scan-multiclass/bacteria/results/all_scan_results_cnn_lstm_scan_bacteria_secreted.csv')

# Save all of the params to be used to predict on the test data
df_result_hyper_tuned['mean_test_score']= pd.to_numeric(df_result_hyper_tuned['mean_test_score'])
param_best_model_dict = dict(df_result_hyper_tuned.nlargest(30, 'mean_test_score')['params'])
params = list(param_best_model_dict.values())
print(params)

# Get info ahead about the best model obtained
y_dat = data['cancel_1.0']
X_dat = data.drop(['id','train','credit','state','cancel_1.0'], axis=1)

X_train, X_test, y_train,y_test = cross_validation.train_test_split(X_dat, y_dat, test_size=0.2, random_state=0)

params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

# The data is huge, so we may not need many CV folds
folds = 3

# Number of parameter combinations to sample
param_comb = 1

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# early stopping to decrease time
xgb = XGBClassifier(learning_rate=0.02, n_estimators=100, objective='binary:logistic', silent=True, nthread=1)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=8, cv=skf.split(X_train,y_train), verbose=3, random_state=1001 )
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)
Ejemplo n.º 49
0
        "batch_size": batches,
        'opt': optimizer,
        "drop": droptout,
        'lr': lr,
        'act': act,
        'epochs': epoch,
        'validation_split': splt
    }


model = KerasClassifier(build_fn=build_model, verbose=1)
hyperparameters = create_hyper()

search = RandomizedSearchCV(estimator=model,
                            param_distributions=hyperparameters,
                            n_iter=1,
                            cv=None,
                            n_jobs=1)
search.fit(x_train, y_train)
# print(search.estimator.fit(x_test,y_test))
pred = search.predict(x_test)
print(pred)
print(y_train)

print("shape", y_test.shape)
print("shape", pred.shape)

try:
    pred = np.argmax(pred)
    score = accuracy_score(y_test, pred)
    print(score)
except ValueError as err:
    print(err)
#predict_train=clf.predict(X_train)
pred_tree_test=decisionTree.predict(X_test)
pred_tree_train=decisionTree.predict(X_train)
tree_accuracy_train=model_generate_reports(y_train,pred_tree_train)
tree_accuracy_test=model_generate_reports(y_test,pred_tree_test)
from sklearn.metrics import confusion_matrix
confusion_matrixTree = confusion_matrix(y_test, pred_tree_test)

## randomized search cv for decision tree
param_dist = {"max_depth": [2,3,4,5,6,7,8,9, None], 
              "max_features": [2,4,6,8,10,12,14,16,18,20], 
              "min_samples_leaf":[2,4,6,8,10,12,14,16,18,20,22,24,26], 
              "criterion": ["gini", "entropy"]} 


tree_cv = RandomizedSearchCV(decisionTree, param_dist, cv = 5) 
tree_cv.fit(X_train,y_train)
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_)) 
pred_tree_train=tree_cv.predict(X_train)
pred_tree_test=tree_cv.predict(X_test)
tree_accuracy_train=model_generate_reports(y_train,pred_tree_train)
tree_accuracy_test=model_generate_reports(y_test,pred_tree_test)
confusion_matrixTree=confusion_matrix(y_test, pred_tree_test)

############## final decision tree#################################
tree_algo=DecisionTreeClassifier(random_state=0,min_samples_leaf=10,max_features=20,max_depth=9,criterion='entropy')
tree_algo.fit(X_train,y_train)
#predict_train=clf.predict(X_train)
pred_tree_test=tree_algo.predict(X_test)
pred_tree_train=tree_algo.predict(X_train)
tree_accuracy_train=model_generate_reports(y_train,pred_tree_train)
Ejemplo n.º 51
0
    'learning_rate': [0.1],
    'max_depth': [6],
    'booster': ['dart'],
    'rate_drop': [0.21],
    'eval_metric': ['logloss', 'mae'],
    'is_training_metric': [True],
    'max_leaves': [144],
    'colsample_bytree': [0.8],
    'subsample': [0.8],
    'seed': [66]
}]
kfold = KFold(n_splits=5, shuffle=True, random_state=66)
y_test_pred = []
y_pred = []
search = RandomizedSearchCV(XGBRegressor(n_jobs=6),
                            parameters,
                            cv=kfold,
                            n_iter=1)

for i in range(4):
    fit_params = {
        'verbose': True,
        'eval_metric': ['logloss', 'mae'],
        'eval_set': [(x_train, y_train[:, i]), (x_test, y_test[:, i])],
        'early_stopping_rounds': 5
    }
    search.fit(x_train, y_train[:, i], **fit_params)
    y_pred.append(search.predict(x_pred))
    y_test_pred.append(search.predict(x_test))
    # print(search.best_score_)

#############################
Ejemplo n.º 52
0
class Trainer(object):
    # Mlflow parameters identifying the experiment, you can add all the parameters you wish
    ESTIMATOR = "Linear"
    EXPERIMENT_NAME = "TaxifareModel"

    def __init__(self, X, y, **kwargs):
        """
        FYI:
        __init__ is called every time you instantiate Trainer
        Consider kwargs as a dict containing all possible parameters given to your constructor
        Example:
            TT = Trainer(nrows=1000, estimator="Linear")
               ==> kwargs = {"nrows": 1000,
                            "estimator": "Linear"}
        :param X:
        :param y:
        :param kwargs:
        """
        self.pipeline = None
        self.kwargs = kwargs
        self.grid = kwargs.get("gridsearch", False)  # apply gridsearch if True
        self.local = kwargs.get("local",
                                True)  # if True training is done locally
        self.optimize = kwargs.get(
            "optimize",
            False)  # Optimizes size of Training Data if set to True
        self.mlflow = kwargs.get("mlflow", False)  # if True log info to mlflow
        self.upload = kwargs.get("upload", False)  # if True upload the saved model to Google Storage
        self.experiment_name = kwargs.get("experiment_name",
                                          self.EXPERIMENT_NAME)  # cf doc above
        self.model_params = None  # hyperparameter distributions for the estimator, set in get_estimator()
        self.X_train = X
        self.y_train = y
        del X, y
        self.split = self.kwargs.get("split", True)  # cf doc above
        if self.split:
            self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
                self.X_train, self.y_train, test_size=0.15)
        self.nrows = self.X_train.shape[0]  # nb of rows to train on
        self.log_kwargs_params()
        self.log_machine_specs()

    def get_estimator(self):
        estimator = self.kwargs.get("estimator", self.ESTIMATOR)
        if estimator == "Lasso":
            model = Lasso()
        elif estimator == "Ridge":
            model = Ridge()
        elif estimator == "Linear":
            model = LinearRegression()
        elif estimator == "GBM":
            model = GradientBoostingRegressor()
        elif estimator == "RandomForest":
            model = RandomForestRegressor()
            self.model_params = {  # 'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 200, num = 10)],
                'max_features': ['auto', 'sqrt']
            }
            # 'max_depth' : [int(x) for x in np.linspace(10, 110, num = 11)]}
        elif estimator == "xgboost":
            model = XGBRegressor(objective='reg:squarederror',
                                 n_jobs=-1,
                                 max_depth=10,
                                 learning_rate=0.05,
                                 gamma=3)
            self.model_params = {
                'max_depth': range(10, 20, 2),
                'n_estimators': range(60, 220, 40),
                'learning_rate': [0.1, 0.01, 0.05]
            }
        else:
            model = Lasso()
        estimator_params = self.kwargs.get("estimator_params", {})
        self.mlflow_log_param("estimator", estimator)
        model.set_params(**estimator_params)
        print(colored(model.__class__.__name__, "red"))
        return model

    def set_pipeline(self):
        memory = self.kwargs.get("pipeline_memory", None)
        dist = self.kwargs.get("distance_type", "euclidian")
        feateng_steps = self.kwargs.get(
            "feateng",
            ["distance", "time_features", 'direction', 'distance_to_center'])
        if memory:
            memory = mkdtemp()

        # Define feature engineering pipeline blocks here
        pipe_time_features = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'))
        pipe_distance = make_pipeline(
            DistanceTransformer(distance_type=dist, **DIST_ARGS),
            RobustScaler())
        pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
        pipe_direction = make_pipeline(Direction(), RobustScaler())
        pipe_distance_to_center = make_pipeline(DistanceToCenter(),
                                                RobustScaler())

        # Define default feature engineering blocs
        feateng_blocks = [
            ('distance', pipe_distance, list(DIST_ARGS.values())),
            ('time_features', pipe_time_features, ['pickup_datetime']),
            ('geohash', pipe_geohash, list(DIST_ARGS.values())),
            ('direction', pipe_direction, list(DIST_ARGS.values())),
            ('distance_to_center', pipe_distance_to_center,
             list(DIST_ARGS.values())),
        ]
        # Filter out some blocks according to input parameters
        # (build a new list rather than removing items while iterating over the list)
        feateng_blocks = [bloc for bloc in feateng_blocks if bloc[0] in feateng_steps]

        features_encoder = ColumnTransformer(feateng_blocks,
                                             n_jobs=None,
                                             remainder="drop")

        self.pipeline = Pipeline(steps=[('features', features_encoder),
                                        ('rgs', self.get_estimator())],
                                 memory=memory)

        if self.optimize:
            self.pipeline.steps.insert(
                -1,
                ['optimize_size', OptimizeSize(verbose=False)])

    def add_grid_search(self):
        """"
        Apply Gridsearch on self.params defined in get_estimator
        {'rgs__n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
          'rgs__max_features' : ['auto', 'sqrt'],
          'rgs__max_depth' : [int(x) for x in np.linspace(10, 110, num = 11)]}
        """
        # Here to apply random search to the pipeline, parameter names need to follow the naming "rgs__paramname"
        params = {"rgs__" + k: v for k, v in self.model_params.items()}
        self.pipeline = RandomizedSearchCV(estimator=self.pipeline,
                                           param_distributions=params,
                                           n_iter=10,
                                           cv=2,
                                           verbose=1,
                                           random_state=42,
                                           n_jobs=None)

    @simple_time_tracker
    def train(self, gridsearch=False):
        tic = time.time()
        self.set_pipeline()
        if gridsearch:
            self.add_grid_search()
        self.pipeline.fit(self.X_train, self.y_train)
        # mlflow logs
        self.mlflow_log_metric("train_time", int(time.time() - tic))

    def evaluate(self):
        rmse_train = self.compute_rmse(self.X_train, self.y_train)
        self.mlflow_log_metric("rmse_train", rmse_train)
        if self.split:
            rmse_val = self.compute_rmse(self.X_val, self.y_val, show=True)
            self.mlflow_log_metric("rmse_val", rmse_val)
            print(
                colored(
                    "rmse train: {} || rmse val: {}".format(
                        rmse_train, rmse_val), "blue"))
        else:
            print(colored("rmse train: {}".format(rmse_train), "blue"))

    def compute_rmse(self, X_test, y_test, show=False):
        if self.pipeline is None:
            raise ("Cannot evaluate an empty pipeline")
        y_pred = self.pipeline.predict(X_test)
        if show:
            res = pd.DataFrame(y_test)
            res["pred"] = y_pred
            print(colored(res.sample(5), "blue"))
        rmse = compute_rmse(y_pred, y_test)
        return round(rmse, 3)

    def save_model(self):
        """Save the model into a .joblib and upload it on Google Storage /models folder
        HINTS : use the joblib library and google-cloud-storage"""
        joblib.dump(self.pipeline, 'model.joblib')
        print(colored("model.joblib saved locally", "green"))

        if self.upload:
            storage_upload(model_version=MODEL_VERSION)

    ### MLFlow methods
    @memoized_property
    def mlflow_client(self):
        mlflow.set_tracking_uri(MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except BaseException:
            return self.mlflow_client.get_experiment_by_name(
                self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        if self.mlflow:
            self.mlflow_client.log_param(self.mlflow_run.info.run_id, key,
                                         value)

    def mlflow_log_metric(self, key, value):
        if self.mlflow:
            self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key,
                                          value)

    def log_estimator_params(self):
        reg = self.get_estimator()
        self.mlflow_log_param('estimator_name', reg.__class__.__name__)
        params = reg.get_params()
        for k, v in params.items():
            self.mlflow_log_param(k, v)

    def log_kwargs_params(self):
        if self.mlflow:
            for k, v in self.kwargs.items():
                self.mlflow_log_param(k, v)

    def log_machine_specs(self):
        cpus = multiprocessing.cpu_count()
        mem = virtual_memory()
        ram = int(mem.total / 1000000000)
        self.mlflow_log_param("ram", ram)
        self.mlflow_log_param("cpus", cpus)
Ejemplo n.º 53
0
        def do_fold(j):
            print("\tFold " + str(j+1))
            train_idx = folds_i[j][0]
            valid_idx = folds_i[j][1]
            training_fold = developement_df.loc[train_idx, ]
            training_fold = training_fold.reset_index(drop=True)
            validation_fold = developement_df.loc[valid_idx, ]
            validation_fold = validation_fold.reset_index(drop=True)

            # shuffle the folds
            training_stats_i_f = Statistics()
            validation_stats_i_f = Statistics()
            testing_stats_i_f = Statistics()

            # Init the label ranking lists.
            label_pred_proba_train = []
            label_pred_proba_valid = []
            label_pred_proba_test = []

            label_y_train = []
            label_y_valid = []
            label_y_test = []

            # Set up the vectorizer for the bag-of-words representation
            if vectorizer_method == 'tf-idf':
                vectorizer = TfidfVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True,
                    sublinear_tf=True, max_df=1.0, min_df=0
                )
                vectorizer.fit(training_fold['terms'].values)
                alpha = None
                percentile = 100
            elif vectorizer_method == 'count':
                vectorizer = CountVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True
                )
                vectorizer.fit(training_fold['terms'].values)
                alpha = None
                percentile = 100
            else:
                raise TypeError("Vectorizer_method has type {}.".format(type(vectorizer_method)))

            selectors = generate_selectors(selection, vectorizer.get_feature_names(), dag)
            base_estimators = make_classifiers(method, balanced, labels, selectors, selection, rng)
            for label in sorted(labels):
                print("\t\tFitting for label {}...".format(label))

                # SVMs make the assumption of standardised features. Hence we scale the features
                # avoiding the use of mean to maintain the structure of count sparsity. Scaling
                # May also help with linear model convergence speed.
                x_train_l = vectorizer.transform(training_fold['terms'].values)
                y_train_l = np.asarray(training_fold[label].values, dtype=int)

                x_valid_l = vectorizer.transform(validation_fold['terms'].values)
                y_valid_l = np.asarray(validation_fold[label].values, dtype=int)

                x_test_l = vectorizer.transform(testing_df['terms'].values)
                y_test_l = np.asarray(test_df_i[label].values, dtype=int)

                if scale:
                    x_train_l = mean_center(x_train_l, with_mean=False)
                    x_valid_l = mean_center(x_valid_l, with_mean=False)
                    x_test_l = mean_center(x_test_l, with_mean=False)

                # We generate the folds for randomised search up-front. We hold out one of the folds for
                # Probability calibration so each sampled param set gets calibrated on the same data.
                # This leaves cv_folds-2 folds for randomised search cross-validation.
                # cv_rand = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng)
                base_estimator_l = base_estimators[label]
                fresh_estimator = clone(base_estimator_l)

                # Find the best params, then do a final proper calibration.
                params = sk_generate_params(method, selection)
                estimator_l = RandomizedSearchCV(
                    estimator=base_estimator_l, param_distributions=params,
                    n_iter=60, scoring='f1', cv=3, random_state=rng,
                    error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs',
                    refit=True
                )

                # Test if there's any signal if we permute the labels.
                # Classifier should do poorly if we do so.
                if permute:
                    y_train_l = rng.permutation(y_train_l)

                threshold = 0.5
                estimator_l.fit(x_train_l, y_train_l)
                best_params_l = estimator_l.best_params_

                # Calibrate the random forest with the best hyperparameters.
                if method not in ['lr']:
                    estimator_l = CalibratedClassifierCV(fresh_estimator.set_params(**best_params_l),
                                                         cv=3, method='sigmoid')
                    estimator_l.fit(x_train_l, y_train_l)

                # Evaluate Performance characteristics and test on training to check overfitting.
                y_train_prob_l = estimator_l.predict_proba(x_train_l)
                y_valid_prob_l = estimator_l.predict_proba(x_valid_l)
                y_test_prob_l = estimator_l.predict_proba(x_test_l)
                training_stats_i_f.merge(evaluate_model(y_train_l, y_train_prob_l, label, threshold))
                validation_stats_i_f.merge(evaluate_model(y_valid_l, y_valid_prob_l, label,threshold))

                # Compute independent test data performance
                testing_stats_i_f.merge(evaluate_model(y_test_l, y_test_prob_l, label, threshold))

                # Get label ranking info
                label_pred_proba_train.append([p[1] for p in y_train_prob_l])
                label_pred_proba_valid.append([p[1] for p in y_valid_prob_l])
                label_pred_proba_test.append([p[1] for p in y_test_prob_l])

                label_y_train.append(y_train_l)
                label_y_valid.append(y_valid_l)
                label_y_test.append(y_test_l)

                print(validation_stats_i_f.frame())

            # Compute multi-label performance statistics
            y = np.vstack(list(zip(*label_y_train)))
            y_prob = np.vstack(list(zip(*label_pred_proba_train)))
            training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.vstack(list(zip(*label_y_valid)))
            y_prob = np.vstack(list(zip(*label_pred_proba_valid)))
            validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.vstack(list(zip(*label_y_test)))
            y_prob = np.vstack(list(zip(*label_pred_proba_test)))
            testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            return training_stats_i_f, validation_stats_i_f, testing_stats_i_f
Ejemplo n.º 54
0
bootstrap = [True, False]
# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}
pprint(random_grid)
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rfc,
                               param_distributions=random_grid,
                               n_iter=100,
                               cv=kfold,
                               verbose=2,
                               random_state=42,
                               n_jobs=-1)
rf_random.fit(x_train, y_train)
rf_random.best_params_
rf_best_random = rf_random.best_estimator_
prediction_RF = rf_best_random.predict(x_test)
print("for Random Forest we get " +
      str(round(accuracy_score(y_test, prediction_RF), 5)))

CM_RF = confusion_matrix(y_test, prediction_RF)
df_cm = pd.DataFrame(CM_RF,
                     index=["Actual No", "Actual Yes"],
                     columns=["Predicted No", "Predicted Yes"])
plt.figure()
Ejemplo n.º 55
0
def tune_xgb_params_randomized(estimator_cls,
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               strat_folds: KFold,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 0,
                               **kwargs):
    """
    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's RandomizedSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run simultaneously.
    :param params:
        A dictionary of XGB parameters.
    :param strat_folds:
        A KFold object to cross validate the parameters.
    :param train:
        An array-like containing the training input samples.
    :param n_iter:
        An optional parameter to control the number of parameter settings that are sampled.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :param kwargs:
        Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly between 0.5 and 0.7 for
        colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return:
        A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree': uniform(kwargs.get('colsample_bytree_loc', 0.2), kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma': uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth': sp_randint(kwargs.get('max_depth_low', 2), kwargs.get('max_depth_high', 11)),
        'min_child_weight': sp_randint(kwargs.get('min_child_weight_low', 1), kwargs.get('min_child_weight_high', 11)),
        'reg_alpha': halfnorm(kwargs.get('reg_alpha_loc', 0), kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda': halfnorm(kwargs.get('reg_lambda_loc', 0), kwargs.get('reg_lambda_scale', 5)),
        'subsample': uniform(kwargs.get('subsample_loc', 0.2), kwargs.get('subsample_scale', 0.8))
    }

    rand_search = RandomizedSearchCV(
        cv=strat_folds.split(train, label),
        estimator=estimator_cls(**params_copy),
        n_iter=n_iter,
        n_jobs=n_jobs,
        param_distributions=param_distributions,
        scoring=metric_sklearn,
        verbose=verbosity_level
    )
    rand_search.fit(train, label)
    return rand_search.best_params_, [(rand_search.best_params_, rand_search.best_score_)]
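
# A minimal usage sketch (not part of the original module): tuning an XGBRegressor on
# synthetic data. It assumes this module's helper clean_params_for_sk and the scipy
# distributions used above are importable; the keyword arguments override the default
# sampling ranges exactly as described in the docstring.
if __name__ == '__main__':
    import numpy as np
    from sklearn.model_selection import KFold
    from xgboost import XGBRegressor

    X_demo = np.random.rand(200, 10)
    y_demo = np.random.rand(200)
    demo_folds = KFold(n_splits=3, shuffle=True, random_state=0)
    demo_best, demo_steps = tune_xgb_params_randomized(
        XGBRegressor, label=y_demo, metric_sklearn='neg_mean_squared_error', n_jobs=1,
        params={'n_estimators': 50}, strat_folds=demo_folds, train=X_demo, n_iter=10,
        colsample_bytree_loc=0.5, colsample_bytree_scale=0.2)
    print(demo_best)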
Ejemplo n.º 56
0
    outputs = Dense(10, activation='softmax', name='outputs')(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=optimizer, metrics=['acc'], loss='categorical_crossentropy')
    return model

def create_hyperparameters():
    batchs = [10,20,30,40,50]
    optimizers = ['rmsprop', 'adam', 'adadelta']
    dropout = [0.1, 0.2, 0.3]
    return{'batch_size' : batchs, 'optimizer': optimizers, 'drop':dropout}

hyperparameters = create_hyperparameters()
model2 = build_model()

from tensorflow.keras.wrappers.scikit_learn import KerasClassifier  # scikit-learn predates Keras, so the Keras model has to be wrapped to use its search utilities
model2 = KerasClassifier(build_fn=build_model, verbose=1)



from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
search = RandomizedSearchCV(model2, hyperparameters, cv=3)  #cv cross validation
# search = GridSearchCV(model2, hyperparameters, cv=3)  #cv cross validation

search.fit(x_train, y_train, verbose=1)
print(search.best_params_) #{'optimizer': 'rmsprop', 'drop': 0.1, 'batch_size': 50}
print(search.best_estimator_)
print(search.best_score_) #0.9588499863942465
acc = search.score(x_test, y_test)
print('final score : ', acc)  # final score :  0.9682999849319458

# {'optimizer': 'rmsprop', 'drop': 0.2, 'batch_size': 30} may also be used
Ejemplo n.º 57
0
print("R-squared:", metrics.r2_score(y_test_pred, test_minmax[:, 1200]))
print("-----------------------------------------------------------")
print("-----------------------------------------------------------")

# ----------  HYPERPARAMETERS TUNING WITH RANDOM-SEARCH ------------

print("TUNED HYPERPARAMETERS WITH RANDOM-SEARCH")
print('-------------------------------------------------')

# KNN

param_dist = {'n_neighbors': sp_randint(2, 20)}

knnrs = RandomizedSearchCV(knn,
                           param_distributions=param_dist,
                           scoring='neg_mean_squared_error',
                           cv=tr_val_partition,
                           n_jobs=1,
                           verbose=1)

# Training the model with the random-search
np.random.seed(123)
knnrs.fit(train_closest, train_minmax[:, 1200])

# Making predictions on the testing partition
y_test_pred = knnrs.predict(test_closest)

# And finally computing the test accuracy
print("Mean squared error of KNN with tuned hyperparameters:",
      metrics.mean_squared_error(y_test_pred, test_minmax[:, 1200]))
print("R-squared:", metrics.r2_score(y_test_pred, test_minmax[:, 1200]))
print("-----------------------------------------------------------")
def main():
    """ Main function
    """

    amigos_data = np.loadtxt('features_all_20s.csv',skiprows=1, delimiter=',')
    labels = np.loadtxt('Final_Personality_20s.csv',skiprows=1, delimiter=',')
    ids=np.loadtxt('ids_20s.csv',skiprows=1, delimiter=',')
    labels=labels[:, 1]
    kf = KFold(n_splits=2)
    gkf=GroupKFold(n_splits=5)

    # tune XGB classifier parameters
    grid_search_params_xgb = {
        'max_depth': [3,4,5],
        'n_estimators': [10,15,20]
    }
    other_tuning_params_xgb = {
        'learning_rate': np.arange(0.01, 0.41, 0.01),
        'gamma': np.arange(0, 10.1, 0.5),
        'min_child_weight': np.arange(0.80, 1.21, 0.01),
        'max_delta_step': np.arange(0, 2.05, 0.05),
        'subsample': np.arange(1.00, 0.59, -0.01),
        'colsample_bytree': np.arange(1.00, 0.09, -0.01),
        'colsample_bylevel': np.arange(1.00, 0.09, -0.01),
        'reg_alpha': np.arange(0, 2.05, 0.05),
        'reg_lambda': np.arange(0.50, 2.55, 0.05),
        'scale_pos_weight': np.arange(0.80, 1.21, 0.01),
        'base_score': np.arange(0.40, 0.61, 0.01),
        'seed': np.arange(0, 41)
    }

    
    # XGB grid search tuning
    best_params = {
        'max_depth': 3,
        'n_estimators': 20
    }
    acc = 0
    print('Tuning max_depth and n_estimators')
    for depth in grid_search_params_xgb['max_depth']:
        print('in grid search')
        print('max_depth', depth)
        xgb_clf = {
            'a': xgb.XGBClassifier(max_depth=depth, objective="binary:logistic"),
            }
        tuning_params = grid_search_params_xgb['n_estimators']
        param, tmp_acc = tuning(
            xgb_clf, 'n_estimators', tuning_params, amigos_data, labels, kf)
        print('param', param, 'tmp_acc', tmp_acc)
        if tmp_acc >= acc:
            best_params['max_depth'] = depth
            best_params['n_estimators'] = param
            acc = tmp_acc
    # XGB tune other parameters
    for param_name, tuning_params in other_tuning_params_xgb.items():
        print('Tuning', param_name)
        xgb_clf = {
            'a': xgb.XGBClassifier(objective="binary:logistic"),
              }
        xgb_clf['a'].set_params(**best_params)
     
        param,_ = tuning(
            xgb_clf, param_name, tuning_params, amigos_data, labels, kf)
        best_params[param_name] = param
        


    # tune RF parameters
    grid_search_params_rf = {
               'max_features': [10,15,20],
               'max_depth': [3,5,10],
               }
    
    rf_clf=RandomForestClassifier()   
    rf_random = RandomizedSearchCV(estimator = rf_clf, param_distributions = grid_search_params_rf, n_iter = 100, cv = gkf, verbose=2, random_state=42, n_jobs = 5)
    rf_random.fit(amigos_data,np.ravel(labels),groups=ids)

    #the optimized hyperparameters
    print('XGBoost best parameters:', best_params) 
    print('Random forest best parameters:',rf_random.best_params_)
Ejemplo n.º 59
0
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from scipy.stats import lognorm as sp_lognormal

import cs231n.cifar10 as cf

np.random.seed(31337)

X_train, y_train, X_test, y_test, scaler = cf.get_normalised_data()

basic_svm = SGDClassifier(loss="hinge", penalty="l2", l1_ratio=0.0, random_state=31337, n_jobs=5)

# From the Scipy docs: to sample a random variable Y such that Y=exp(X) where X~N(mu,sigma), use
# scipy.stats.lognorm(s=sigma, scale=np.exp(mu))
random_search = RandomizedSearchCV(basic_svm,
                       param_distributions={'alpha': sp_lognormal(s=2, scale=np.exp(-4))},
                       n_iter=20, verbose=1)
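
# Quick illustrative check of that parametrisation (not part of the original script):
# samples drawn from lognorm(s=2, scale=exp(-4)) should have log-values with mean ~ -4
# and standard deviation ~ 2.
_alpha_samples = sp_lognormal(s=2, scale=np.exp(-4)).rvs(size=10000, random_state=0)
print("log-mean (expect ~ -4):", np.log(_alpha_samples).mean())
print("log-std  (expect ~  2):", np.log(_alpha_samples).std())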

random_search.fit(X_train, y_train)

print("Chosen: ", random_search.best_params_["alpha"])
print("Best CV score: ", random_search.best_score_)
chosen_svm = random_search.best_estimator_

os.makedirs("output/svc", exist_ok=True)
labels = cf.get_label_names()
for i in range(10):
    # Don't forget to rescale the hyperplanes to get human-readable versions---the l2 penalty makes
    # them close to the origin, so they look indistinguishable.
    this_hyperplane = 127*(chosen_svm.coef_[i]/np.max(np.abs(chosen_svm.coef_[i]))) + scaler.mean_
    cf.plot_image(this_hyperplane, "output/svc/archetype " + labels[i] + ".png")
Ejemplo n.º 60
0
#%%Ridge Regresion Model
model_name = "ridge_poly2"
X = train.drop(['energy'], axis=1)

cat_cols = ['hour', 'month', 'day_of_week']
cat_cols_idx = [X.columns.get_loc(c) for c in X.columns if c in cat_cols]
onehot = OneHotEncoder(categorical_features=cat_cols_idx, sparse=False)
regr = Ridge(fit_intercept=False)
poly = PolynomialFeatures(2)
tscv = TimeSeriesSplit(n_splits=3)

param_dist = {'alpha': st.uniform(1e-4, 5.0)}
regr_cv = RandomizedSearchCV(estimator=regr,
                            param_distributions=param_dist,
                            n_iter=20,
                            scoring='mean_squared_error',
                            iid=False,
                            cv=tscv,
                            verbose=2,
                            n_jobs=1)
regr_pipe = Pipeline([('onehot', onehot), ('poly', poly), ('regr_cv', regr_cv)])
regr_pipe.fit(X, y=train['energy'])

cv_results = pd.DataFrame(regr_pipe.named_steps['regr_cv'].cv_results_)
cv_results.sort_values(by='rank_test_score').head()

#%% Linear regression with recursive feature elimination
model_name = "linear_regression"
X = train.drop(['timeStamp','energy'], axis=1)
cat_cols = ['hour', 'month', 'day_of_week']
cat_cols_idx = [X.columns.get_loc(c) for c in X.columns if c in cat_cols]
onehot = OneHotEncoder(categorical_features=cat_cols_idx, sparse=False)