random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

print(random_grid)

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               n_iter=100,
                               cv=ps,
                               verbose=2,
                               random_state=42,
                               n_jobs=-1)
rf_random.fit(df_train, label)

print('Best parameter setting found:')
print(rf_random.best_params_)
best_grid = rf_random.best_estimator_

best_grid = load(r'heuristic_vgg16_feature_maps_rf.joblib')
y_pred = best_grid.predict(df_test)

# Results of default RF model prediction
print('Evaluation on test set:')
print('Accuracy: ' + str(accuracy_score(label_test, y_pred)))
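# The load(...) call earlier in this example restores a previously saved model from
# 'heuristic_vgg16_feature_maps_rf.joblib', replacing the freshly tuned estimator.
# A minimal sketch of how that file would typically have been produced, assuming the
# dump/load helpers come from the joblib package:
from joblib import dump, load

# persist the tuned estimator so the search does not have to be re-run
dump(rf_random.best_estimator_, r'heuristic_vgg16_feature_maps_rf.joblib')
# restore it later (as done above) and predict on the held-out features
best_grid = load(r'heuristic_vgg16_feature_maps_rf.joblib')
y_pred = best_grid.predict(df_test)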
if tune:
    # Random grid search for hyperparameter tuning
    from sklearn.model_selection import RandomizedSearchCV
    random_grid_decision = {
        'max_features': ['sqrt', 'log2'],
        'max_depth': [None, 20, 40, 60, 80, 100, 120],
        'min_samples_split': [2, 4, 8],
        'min_samples_leaf': [1, 2, 4],
    }
    # First we perform the decision tree optimisation
    search = RandomizedSearchCV(tree_classifier,
                                param_distributions=random_grid_decision,
                                n_iter=75,
                                cv=5,
                                n_jobs=-1,
                                scoring='roc_auc',
                                random_state=20)
    search.fit(train_values, train_labels)
    # Save the best model
    tree_classifier = search.best_estimator_
    save_model('best_tree', tree_classifier)

    # And the random forest optimisation
    random_grid_forest = {
        'n_estimators': [100, 200, 400, 600, 800, 1000, 1200],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [None, 20, 40, 60, 80, 100, 120],
        'min_samples_split': [2, 4, 8],
        'min_samples_leaf': [1, 2, 4],
    }
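    # A sketch of how the forest search would presumably mirror the decision-tree search
    # above; forest_classifier is a hypothetical name for the RandomForestClassifier being
    # tuned, and save_model is the same helper used for the tree:
    search = RandomizedSearchCV(forest_classifier,
                                param_distributions=random_grid_forest,
                                n_iter=75,
                                cv=5,
                                n_jobs=-1,
                                scoring='roc_auc',
                                random_state=20)
    search.fit(train_values, train_labels)
    forest_classifier = search.best_estimator_
    save_model('best_forest', forest_classifier)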
Example #3
    def rand_search(self):
        '''running a randomized search to find the parameter combination for a random forest
     which gives the best accuracy score'''
        print('*' * 80)
        print(
            '*    Running RandomizedSearch for best parameter combination for RandomForest'
        )
        print('*' * 80)

        #create the decision forest
        extra_clf_rand = ExtraTreesClassifier(random_state=100,
                                              max_depth=1,
                                              n_jobs=-1)

        with open(
                os.path.join(self.newdata_minusEP,
                             'extreme_randomforest_randomsearch.txt'),
                'a') as text_file:
            text_file.write('Created random forest: extra_clf_rand \n')

        #set up randomized search
        param_rand = {
            "criterion": ["gini",
                          "entropy"],  #metric to judge reduction of impurity
            'class_weight': ['balanced', None],
            'n_estimators': randint(100, 10000),  #number of trees in forest
            #'max_features': randint(2, 5),#max number of features when splitting
            "min_samples_split":
            randint(2, 20),  #min samples per node to induce split
            #"max_depth": randint(1, 10),#max number of splits to do
            "min_samples_leaf": randint(1,
                                        20),  #min number of samples in a leaf
            "max_leaf_nodes": randint(10, 20)
        }  #max number of leaves

        with open(
                os.path.join(self.newdata_minusEP,
                             'extreme_randomforest_randomsearch.txt'),
                'a') as text_file:
            text_file.write(
                'Running randomized search for the following parameters: %s \n'
                % param_rand)
            text_file.write('use cv=3, scoring=accuracy \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(extra_clf_rand,
                                         param_rand,
                                         random_state=5,
                                         cv=3,
                                         n_iter=500,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_transform = rand_search.fit(self.X_newdata_transform_train,
                                                self.y_train)
        with open(
                os.path.join(self.newdata_minusEP,
                             'extreme_randomforest_randomsearch.txt'),
                'a') as text_file:
            text_file.write('Best parameters: ' +
                            str(rand_search_transform.best_params_) + '\n')
            text_file.write('Best score: ' +
                            str(rand_search_transform.best_score_) + '\n')
        feature_importances_transform = rand_search_transform.best_estimator_.feature_importances_
        feature_importances_transform_ls = sorted(zip(
            feature_importances_transform, self.X_newdata_transform_train),
                                                  reverse=True)
        with open(
                os.path.join(self.newdata_minusEP,
                             'extreme_randomforest_randomsearch.txt'),
                'a') as text_file:
            text_file.write('Feature importances: %s \n' %
                            feature_importances_transform_ls)

        self.best_params_transform = rand_search_transform.best_params_

        self.feature_importances_transform_ls = feature_importances_transform_ls

        def feature_importances_best_estimator(feature_list, name, directory):
            datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
            feature_list.sort(key=lambda x: x[1], reverse=True)
            feature = list(zip(*feature_list))[1]
            score = list(zip(*feature_list))[0]
            x_pos = np.arange(len(feature))
            plt.figure(figsize=(20, 10))
            plt.bar(x_pos, score, align='center')
            plt.xticks(x_pos, feature, rotation=90, fontsize=2)
            plt.title(
                'Histogram of Feature Importances for best RandomForest using features %s '
                % name)
            plt.xlabel('Features')
            plt.tight_layout()
            plt.savefig(
                os.path.join(
                    directory, 'feature_importances_best_bar_plot_rand_bag_' +
                    name + datestring + '.png'))
            plt.close()

        feature_importances_best_estimator(
            self.feature_importances_transform_ls, 'newdata_minusEP',
            self.newdata_minusEP)
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000, 10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

rf_1 = RandomForestClassifier()
randomcv = RandomizedSearchCV(estimator=rf_1,
                              param_distributions=random_grid,
                              n_iter=100,
                              cv=3,
                              verbose=2,
                              random_state=100,
                              n_jobs=-1)
### fit the randomized model
randomcv.fit(X_train, Y_train)

# getting the best parameters
best_grid = randomcv.best_estimator_

# fitting into the data and predicting
best_grid.fit(X_train, Y_train)
pred_2 = best_grid.predict(X_test)

# validation
print(confusion_matrix(Y_test, pred_2))
print(accuracy_score(Y_test, pred_2))
print(classification_report(Y_test, pred_2))
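# A common follow-up (the SVM example further down does the same) is to refine the randomized
# result with an exhaustive grid centred on the best parameters. A minimal sketch, assuming
# GridSearchCV is imported from sklearn.model_selection:
from sklearn.model_selection import GridSearchCV

best = randomcv.best_params_
narrow_grid = {
    'n_estimators': [max(10, best['n_estimators'] - 100),
                     best['n_estimators'],
                     best['n_estimators'] + 100],
    'max_features': [best['max_features']],
    'max_depth': [best['max_depth']],
    'min_samples_split': [best['min_samples_split']],
    'min_samples_leaf': [best['min_samples_leaf']],
}
grid_cv = GridSearchCV(RandomForestClassifier(),
                       param_grid=narrow_grid,
                       cv=3,
                       n_jobs=-1,
                       verbose=2)
grid_cv.fit(X_train, Y_train)
print(grid_cv.best_params_)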
Example #5
    'scale_pos_weight': st.randint(1, 13),
    'reg_alpha': st.randint(1, 5)
}

fit_dict = {
    "eval_set": [(X_train, y_train), (X_valid, y_valid)],
    "early_stopping_rounds": 20,
    "eval_metric": "auc",
    "verbose": 100
}

alg = xgb.XGBClassifier(**params)
print("Model Parameters: ", alg.get_params().keys())
clf = RandomizedSearchCV(estimator=alg,
                         n_iter=4,
                         param_distributions=param_grid,
                         cv=2,
                         scoring="roc_auc")

print("Parameter Search:")
clf.fit(X_train, y_train, **fit_dict)

print("Best All Params: ", clf.get_params())
print("Best Score: ", clf.best_score_)
print("Best Parametes: ", clf.best_params_)
xgb_pred = clf.predict_proba(test)[:, 1]
xgb_pred[:5]

# Submit
xgb_sub = pd.DataFrame(xgb_pred, columns=["TARGET"], index=testdex)
xgb_sub.to_csv("XGB.csv", index=True, float_format='%.8f')
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7]
}

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import xgboost

model = xgboost.XGBClassifier()

random_search = RandomizedSearchCV(model,
                                   param_distributions=params,
                                   n_iter=5,
                                   scoring='roc_auc',
                                   n_jobs=-1,
                                   cv=5,
                                   verbose=3)

random_search.fit(x, y)

x.head()

random_search.best_estimator_

random_search.best_params_

model = xgboost.XGBClassifier(base_score=0.5,
                              booster='gbtree',
                              colsample_bylevel=1,
def test_hyperparameter_selection(digits):
    X_train, X_test, y_train, y_test = digits
    param_grid = {'eta': [0.02, 0.03]}
    mod = BasicSGDClassifier(max_iter=5)
    xval = RandomizedSearchCV(mod, param_grid, cv=2)
    xval.fit(X_train, y_train)
Example #8
Y = pd.DataFrame(Y, columns=['target'])

#model building
param_grid = {
    'eta': [0.05, 0.1, 0.15],
    'max_depth': [6, 7, 8],
    'gamma': [0.5, 1, 1.5],
    'min_child_weight': [1, 5, 10]
}

xgb_model = xgb.XGBClassifier(n_estimators=500,
                              objective='binary:logistic',
                              metric='auc',
                              scale_pos_weight=2)
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
rs_cv = RandomizedSearchCV(xgb_model, param_grid, cv=fold.split(X, Y))

st = datetime.now()
rs_cv.fit(X, Y)
#end=datetime.now()
print("Time taken is:", datetime.now() - st)

best_params = rs_cv.best_params_
#rs_cv.best_score_ = 0.9976542717402616

model_fit = xgb.XGBClassifier(**best_params,
                              n_estimators=500,
                              objective='binary:logistic',
                              metric='auc',
                              scale_pos_weight=2)
xgb_model = model_fit.fit(X, Y)
Example #9
#dictionary for parameters
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05)
}

# use the f1 score metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted',
                        labels=labels)

# search
rs = RandomizedSearchCV(crf,
                        params_space,
                        cv=10,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=20,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                           c1=0.02,
                           c2=0.3,
                           max_iterations=2000,
                           all_possible_transitions=True,
                           verbose=False)
crf.fit(x_train, y_train)
labels = ["O", "D", "T"]
y_pred = crf.predict(x_test)
print("F1 score (unweighted average) is %lf " %
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

# In[ ]:

cv = KFold(5, shuffle=True)

# In[ ]:

rf_random = RandomizedSearchCV(estimator=base_model,
                               param_distributions=random_grid,
                               n_iter=100,
                               cv=cv,
                               verbose=2,
                               random_state=101,
                               n_jobs=-1)

# In[ ]:

rf_random.fit(X_train, y_train)

# In[ ]:

print(rf_random.best_params_)

# In[ ]:

# y_pred=rf_random.predict(X_test)
Example #11
    return {
        'batch_size': batches,
        'optimizer': optimizers,
        'drop': dropout,
        'learning_rate': learning_rate
    }


# Build the KerasClassifier model
model = KerasClassifier(build_fn=build_model, verbose=1)

# Define the hyperparameters dictionary
hyperparameters = create_hyperparameter()

search = RandomizedSearchCV(estimator=model,
                            param_distributions=hyperparameters,
                            cv=3)

# Train the model
search.fit(x_train, y_train)
score = search.score(x_test, y_test)
print(search.best_params_)  # {'optimizer': 'adadelta', 'drop': 0.2, 'batch_size': 20}
print("score : ", score)  # 0.9661999940872192


def sum_of_squares(v):
    return sum(v_i**2 for v_i in v)


# Cost function: given a vector of real numbers, it returns the sum of the squares of its elements
Example #12
        "Random Forest Classifier": clf,
        "Support Vector Machine": svm,
    },
    index=["accuracy"])
model_compare.T.plot.bar(figsize=(15, 10))

# In[14]:

# Create a hyperparameter grid for LogisticRegression
log_reg_grid = {"C": np.logspace(-4, 4, 20), "solver": ["liblinear"]}
# Tune LogisticRegression
np.random.seed(42)
# Setup random hyperparameter search for LogisticRegression
rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=20,
                                verbose=True)
# Fit random hyperparameter search model for LogisticRegression
rs_log_reg.fit(X_train, y_train)
score = rs_log_reg.score(X_test, y_test)
print(score * 100)

# In[15]:

log_reg_grid = {'C': np.logspace(-4, 4, 30), "solver": ["liblinear"]}
# Set up the grid search cross-validation
gs_log_reg = GridSearchCV(LogisticRegression(),
                          param_grid=log_reg_grid,
                          cv=5,
                          verbose=True)
Example #13
def gridsearchGradientBoostingR(X, y, n_jobs=1, verbose=True):
    if verbose == True: verbose = 2
    cv = 10
    n_iter = 100
    #    cv=2
    #    n_iter=10
    n_jobs = np.maximum(n_jobs, 1)

    if 'pandas' in str(type(X)):
        X = X.to_numpy().astype(float)
    if 'pandas' in str(type(y)):
        y = y.to_numpy().astype(float)

    # Loss function to be optimized (minimize)
    loss = ['ls', 'lad', 'huber', 'quantile']

    # Number of weak learnes (trees) used in the boosting process
    n_estimators = [100, 250, 300, 500, 600, 750]

    # Maximum depth of each tree
    max_depth = [2, 3, 5, 10, 15]

    # Minimum number of samples per leaf
    min_samples_leaf = [1, 2, 4, 6, 8, 10]

    # Minimum number of samples to split a node
    min_samples_split = [2, 4, 6, 10, 12]

    # Maximum number of features to consider for making splits
    max_features = ['auto', 'sqrt', 'log2', None]

    # Split quality criterion
    criterion = ['friedman_mse', 'mse']

    #%% Make the grid.
    hyperparameter_grid = {
        'loss': loss,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'min_samples_split': min_samples_split,
        'max_features': max_features,
        'criterion': criterion
    }

    # Create the model to use for hyperparameter tuning
    model = GradientBoostingRegressor()
    #    model = xgboost.XGBRegressor()

    # Set up the random search with 5-fold cross validation
    random_cv = RandomizedSearchCV(
        estimator=model,
        param_distributions=hyperparameter_grid,
        cv=cv,
        n_iter=n_iter,
        scoring='neg_mean_absolute_error',
        n_jobs=n_jobs,
        verbose=verbose,
        return_train_score=True,
        refit=True,  #Refit using the best found parameters on the whole dataset.
    )

    # Fit on the training data
    random_cv.fit(X, y)

    # Show some results:
    if verbose:
        report(random_cv.cv_results_)

    # Find the best combination of settings
    model = random_cv.best_estimator_
    #    random_cv.best_score_
    #    random_cv.best_params_
    #    random_cv.best_index_
    #    random_cv.cv_results_['params'][search.best_index_]
    #    random_results = pd.DataFrame(random_cv.cv_results_).sort_values('mean_test_score', ascending = False)
    #    bestparams=random_cv.cv_results_['params'][random_cv.best_index_]

    return (model, random_cv)
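# A minimal usage sketch (X_train and y_train stand in for the caller's training data):
best_model, search = gridsearchGradientBoostingR(X_train, y_train, n_jobs=4, verbose=False)
print(search.best_params_)
print(search.best_score_)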
Example #14
X = scaler.fit_transform(x)

################################################################################################################################

# make scoring and kfold instance
# scoring = make_scorer(balanced_accuracy_score) # also run with this scoring method
scoring = make_scorer(f1_score)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
it = 25

################################################################################################################################

# KNN Classifier
knn = KNeighborsClassifier()
p_grid = {"n_neighbors": range(1, 30), "leaf_size": range(1, 50)}
gknn = RandomizedSearchCV(knn, p_grid, n_iter=it, cv=kf, scoring=scoring)
gknn.fit(X, y)
cv_knn = gknn.cv_results_['mean_test_score']
print(cv_knn)
df_knn = pd.DataFrame(cv_knn)
df_knn.columns = ['KNN']

###############################################################################################################################

# Random Forest Classifier
rf = ses.RandomForestClassifier()
p_grid = {
    "max_depth": range(1, 25),
    "n_estimators": range(15, 75),
    "min_samples_leaf": range(1, 25),
    "min_samples_split": range(2, 25)
parameters = {
    'max_features': [2, 3, 4],
    'max_samples': [0.5, 0.7, 0.9],
    "base_estimator__C": [0.0001, 0.001, 0.01, 1, 10, 100]
}

# **Task 11.** The next task is to train a bagging classifier (`random_state`=42). Take 100 logistic regressions as the base classifiers and this time use `RandomizedSearchCV` rather than `GridSearchCV`. Since trying all 54 parameter combinations would take a long time, set the maximum number of iterations for `RandomizedSearchCV` to 20. Also, do not forget to pass the validation parameter `cv` and `random_state=1`. What is the best accuracy obtained?

# In[23]:

bg_clf = BaggingClassifier(base_estimator=lr,
                           n_estimators=100,
                           random_state=42)
bg_clf_grid_random = RandomizedSearchCV(bg_clf,
                                        param_distributions=parameters,
                                        n_iter=20,
                                        cv=skf,
                                        random_state=1,
                                        n_jobs=-1)
bg_clf_grid_random.fit(X, y)

# In[24]:

bg_clf_grid_random.best_score_

# **Task 12.** Interpret the best bagging parameters. Why did exactly these values turn out to be the best?
#
# - for bagging it is important to use as few features as possible
# - bagging works better on small samples
# - there is less correlation between the individual models
# - the more features, the less information is lost
Example #16
def train(X,
          y,
          weight_classes=True,
          n_iter_search=500,
          score='roc_auc',
          random_state=123):
    '''
    Train a binary SGD classifier using a randomized grid search with given scoring metric.

    Parameters:
        X (list-like): list of normalized attachment texts
        y (list-like): list of validated targets (0 = red, 1 = green)
        weight_classes (bool): whether or not to use the “balanced” mode to adjust class weights.
        n_iter_search (int):  number of parameter settings that are sampled. Trades off runtime vs quality
                              of the solution.
        score (str):  the scorer used to evaluate the predictions on the test set. `roc_auc` by
                      default. Available options include:  accuracy, roc_auc, precision, fbeta, recall.
                      Note: for fbeta, beta is set to 1.5 to favor recall of the positive class.
        random_state (int): sets the random seed for reproducibility.
    Returns:
        results (dict): a dict of scoring metrics and their values
        best_score (float): mean cross-validated score of the best_estimator.
        best_estimator (sklearn estimator): estimator that was chosen by the search
        best_params (dict): parameter setting that gave the best results on the hold out data.
    '''

    if weight_classes:
        clf = SGDClassifier(class_weight='balanced')
    else:
        clf = SGDClassifier()
    scoring = {
        'accuracy': metrics.make_scorer(metrics.accuracy_score),
        'roc_auc': metrics.make_scorer(metrics.roc_auc_score),
        'precision': metrics.make_scorer(metrics.average_precision_score),
        'fbeta': metrics.make_scorer(metrics.fbeta_score, beta=1.5),
        'recall': metrics.make_scorer(metrics.recall_score)
    }
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=random_state)
    pipe = Pipeline([('vectorizer', TfidfVectorizer(stop_words='english')),
                     ('select', SelectKBest(chi2)), ('clf', clf)])
    param_dist = get_param_distribution()
    random_search = RandomizedSearchCV(pipe,
                                       param_distributions=param_dist,
                                       scoring=scoring,
                                       refit=score,
                                       n_iter=n_iter_search,
                                       cv=5,
                                       n_jobs=-1,
                                       verbose=1,
                                       random_state=random_state)
    try:
        random_search.fit(X_train, y_train)
    except Exception as e:
        logger.error(f"Exception occurred training a new model:  \
                        {e}",
                     exc_info=True)
    y_pred = random_search.predict(X_test)
    #get the col number of the positive class (i.e. green)
    positive_class_col = list(random_search.classes_).index(1)
    try:
        y_score = random_search.predict_proba(X_test)[:, positive_class_col]
    except AttributeError:
        y_score = random_search.decision_function(X_test)
    average_precision = metrics.average_precision_score(y_test, y_score)
    acc = metrics.accuracy_score(y_test, y_pred)
    try:
        roc_auc = metrics.roc_auc_score(y_test, y_pred)
    except ValueError:
        roc_auc = None
    precisions, recalls, _ = metrics.precision_recall_curve(y_test, y_score)
    try:
        auc = metrics.auc(recalls, precisions)
    except ValueError:
        auc = None
    fbeta = metrics.fbeta_score(y_test, y_pred, beta=1.5)
    recall = metrics.recall_score(y_test, y_pred)
    best_estimator = random_search.best_estimator_
    best_params = random_search.best_params_
    best_score = random_search.best_score_
    result_values = [
        y_pred, y_score, precisions, recall, average_precision, acc, roc_auc,
        auc, fbeta, recalls, best_score, best_estimator, y_test
    ]
    result_keys = [
        'y_pred', 'y_score', 'precisions', 'recall', 'average_precision',
        'acc', 'roc_auc', 'auc', 'fbeta', 'recalls', 'best_score',
        'best_estimator', 'y_test'
    ]
    results = {k: v for k, v in zip(result_keys, result_values)}

    return results, best_score, best_estimator, best_params
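# A minimal usage sketch for the train() function above (texts and targets stand in for the
# caller's attachment texts and 0/1 labels):
results, best_score, best_estimator, best_params = train(texts, targets,
                                                          n_iter_search=100,
                                                          score='roc_auc')
print(best_score)
print(best_params)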
#     {"mo__C" : [1,10,100,1000],"mo__kernel":["sigmoid"],"mo__gamma":[0.001,0.0001]}
# ]

Parameters = [
    {"svc__C" : [1,10,100,1000],"svc__kernel":["linear"] },
    {"svc__C" : [1,10,100],"svc__kernel":['rbf'],"svc__gamma":[0.001,0.0001]},
    {"svc__C" : [1,10,100,1000],"svc__kernel":["sigmoid"],"svc__gamma":[0.001,0.0001]}
]
# 2

# pipe = Pipeline([('scaler',MinMaxScaler()),('mo',SVC())]) 
# Gives the same result as the line below; this approach lets you name the pipeline steps, and those names are needed to address the Parameters above ('mo__' refers to the step name)
pipe = make_pipeline(StandardScaler(), SVC())  # with make_pipeline the parameter prefix must be the lowercased class name, i.e. 'svc__'

# model = GridSearchCV(pipe,Parameters,cv = 5)
model = RandomizedSearchCV(pipe,Parameters,cv = 5)

model.fit(x_train, y_train)

results = model.score(x_test,y_test)

print('Best parameters : ', model.best_estimator_) # model.best_estimator_ : shows which parameter combination is the best
print(results)

# ===================
# When running both grid and random search with a for loop
# models = [GridSearchCV(pipe,Parameters,cv = 5),RandomizedSearchCV(pipe,Parameters,cv = 5)]

# for algorithm in models :  
#     model = algorithm
#     model.fit(x_train, y_train)
                                                    y,
                                                    test_size=0.2,
                                                    random_state=45)

kfold = KFold(n_splits=5, shuffle=True)

parameters = [{
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [1, 3, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10],
    'n_jobs': [-1, 2, 4]
}]

# 2. Model
model = RandomizedSearchCV(RandomForestClassifier(), parameters, cv=kfold)

start = time()
model.fit(x_train, y_train)

print('RandomizedSearchCV took %.2f seconds' % (time() - start))
print('Best parameters :', model.best_estimator_)

y_pred = model.predict(x_test)
print('Final accuracy :', accuracy_score(y_test, y_pred))
print('Final accuracy :', model.score(x_test, y_test))
'''
RandomizedSearchCV took 20.05 seconds
Best parameters : RandomForestClassifier(max_depth=8, min_samples_leaf=3, min_samples_split=3,
                       n_estimators=150, n_jobs=2)
Final accuracy : 0.9649122807017544
Example #19
X_test = test.drop(['subject', 'Activity'], axis=1)
y_test = test.Activity
print('Training data size : ', X_train.shape)
print('Test data size : ', X_test.shape)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")
parameters = {'C': np.arange(10, 61, 10), 'penalty': ['l2', 'l1']}
lr_classifier = LogisticRegression()
lr_classifier_rs = RandomizedSearchCV(lr_classifier,
                                      param_distributions=parameters,
                                      random_state=42)
lr_classifier_rs.fit(X_train, y_train)
y_pred = lr_classifier_rs.predict(X_test)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy using Logistic Regression : ", lr_accuracy)
labels = np.unique(y_pred)
labels
plt.figure(figsize=(10, 10))
sns.heatmap(confusion_matrix(y_test.values, y_pred),
            annot=True,
            cmap='Blues',
            fmt='',
            xticklabels=labels,
            yticklabels=labels)
Example #20
        scores = {}
        for name, grid in param_grids:
            scores[name] = []

        from validator import val
        from postprocessing import MetricEqualizer, Average

        # we perform a 7-fold validated random search over 7 param options 7 times, then switch genders: B I B L I C A L
        # no but for real the exact numbers don't really matter
        for _ in range(7):
            for name, param_grid in param_grids:
                try:
                    rand_cv = RandomizedSearchCV(
                        pipeline,
                        param_distributions=param_grid,
                        n_iter=7,
                        scoring=dist_score,
                        cv=7,
                        return_train_score=False)
                    rand_cv.fit(couples, y)

                    score = val(
                        [rand_cv.best_estimator_],
                        [Average(),
                         MetricEqualizer(metric="percentage")], swap)["score"]
                    print(score)

                    scores[name].append(score)

                    if score < best_score:
                        best_score = score
def hyperparameter_tuning(X_train,
                          Y_train,
                          X_test,
                          Y_test,
                          save_name='SVM_model'):

    # Initialise the SVM classifier
    classifier = svm.SVC(cache_size=1024,
                         class_weight='balanced',
                         random_state=random.randint(1, 10000))

    # Set possible parameter values
    C = [0.01, 0.1, 1, 10]
    gamma = [0.001, 0.01, 0.1]
    kernel = ['linear', 'rbf', 'poly', 'sigmoid']
    tol = [0.0001, 0.001, 0.01]

    # Wrap the parameter values in random_grid
    random_grid = {'C': C, 'gamma': gamma, 'kernel': kernel, 'tol': tol}

    #===Run a randomised search for the optimal parameter setting===#
    classifier_random = RandomizedSearchCV(estimator=classifier,
                                           param_distributions=random_grid,
                                           n_iter=100,
                                           cv=5,
                                           verbose=2,
                                           random_state=random.randint(
                                               1, 10000),
                                           n_jobs=-1)

    # Fit the random search model
    param_opt_rand = classifier_random.fit(X_train, Y_train)

    #=== Narrow down the random optimal solutions to the best hyperparameters ===#
    # Extract the optimal parameter values from the random search
    tol = param_opt_rand.best_estimator_.tol
    kernel = param_opt_rand.best_estimator_.kernel
    gamma = param_opt_rand.best_estimator_.gamma
    C = param_opt_rand.best_estimator_.C

    # IDEA: implement a precision variable. Divide the step size and the offset
    # used in the np.arange() by the precision which should by default be 1.
    param_grid = {
        'C': np.arange(C - 0.8, C + 0.81, 0.4),
        'gamma': [0.5 * gamma, gamma, gamma * 2],
        'tol': [0.5 * tol, tol, 2 * tol]
    }
    # Set kernel
    classifier.kernel = kernel

    # Find the optimal hyperparameters using gridsearch
    # initialise the grid search with cross validation
    classifier_gridsearch = GridSearchCV(estimator=classifier,
                                         param_grid=param_grid,
                                         cv=5,
                                         n_jobs=-1,
                                         verbose=2)
    # Run the grid search to find the model with the optimal hyperparameters
    classifier_gridsearch.fit(X_train, Y_train)

    # Extract the optimal hyperparameters
    SVC_params_opt = classifier_gridsearch.best_params_

    # Evaluate the optimal model
    final_accuracy = classifier_gridsearch.best_estimator_.score(
        X_test, Y_test)
    final_roc_auc_score = metrics.roc_auc_score(
        Y_test, classifier_gridsearch.best_estimator_.predict(X_test))

    # Locally save results
    SVM_params_file = open("Hyperparameters/SVM.pkl", "wb")
    pickle.dump(SVC_params_opt, SVM_params_file)
    SVM_params_file.close()

    # Locally save the entire model
    joblib.dump(classifier_gridsearch.best_estimator_,
                f'Models/SVM/{save_name}.pkl')

    # print the results
    print(
        "Support Vector Machine hyperparameter optimisation completed successfully."
    )
    print(f"Best SVM model accuracy: {final_accuracy}")
    print(f"Best SVM model roc_auc: {final_roc_auc_score}")

    return classifier_gridsearch.best_estimator_
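# A minimal usage sketch (the train/test arrays stand in for the caller's data; the
# 'Hyperparameters' and 'Models/SVM' directories are assumed to already exist):
best_svm = hyperparameter_tuning(X_train, Y_train, X_test, Y_test, save_name='SVM_model')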
Example #22
print(random_grid)

# In[23]:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()

# In[24]:

# Random search of parameters, using 5-fold cross-validation,
# search across 100 different combinations
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               scoring='neg_mean_squared_error',
                               n_iter=100,
                               cv=5,
                               verbose=2,
                               random_state=42,
                               n_jobs=1)

# In[25]:

#fit the random forest model
rf_random.fit(X_train, y_train)

# In[26]:

#displaying the best parameters
rf_random.best_params_

# In[27]:
Example #23
#	model.add(Dense(12, input_dim=8, kernel_initializer='uniform', activation='linear', kernel_constraint=maxnorm(weight_constraint)))
#	model.add(Dropout(dropout_rate))
#	model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
# Compile model
#	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#	return model

#weight_constraint = [1, 2, 3, 4, 5]
#dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
#param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint)
##############################################################################################################

weight_constraint = [1, 2, 3, 4, 5]
dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid = dict(dropout_rate=dropout_rate,
                  weight_constraint=weight_constraint)
grid = RandomizedSearchCV(estimator=model,
                          param_distributions=param_grid,
                          n_jobs=1,
                          random_state=3,
                          cv=3)

grid_result = grid.fit(X, Y)
# summarize results
print("Best: %f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Example #24
def ml_tests(x_train, x_test, y_train, y_test, imputed_data):

    # XGBoost baseline model

    xg_reg = xgb.XGBRegressor(objective="reg:squarederror",
                              n_estimators=50,
                              seed=123)
    xg_reg.fit(x_train, y_train)
    preds = xg_reg.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("RMSE: %f" % rmse)
    print()

    datestr = time.strftime("%Y%m%d-%H%M")

    xg_reg_file = 'XGB_Standardmodell.pckl'
    with open(xg_reg_file, 'wb') as f:
        pickle.dump(xg_reg, f)

    plot_importance(xg_reg, max_num_features=10)
    fig = plt.gcf()
    fig.set_size_inches(17.5, 8)
    plt.savefig(
        'Files/Feature_Importances_Grafiken/xgb_feature_importances.jpg')

    # Grid Search parameter Tuning
    print("Grid Search Parameter Tuning:")
    gbm_param_grid = {
        'colsample_bytree': [0.3, 0.7],
        'n_estimators': [25, 50, 80, 100],
        'max_depth': [2, 5, 7]
    }
    gbm = xgb.XGBRegressor(objective="reg:squarederror")
    grid_mse = GridSearchCV(estimator=gbm,
                            param_grid=gbm_param_grid,
                            scoring="neg_mean_squared_error",
                            cv=4,
                            verbose=1)
    grid_mse.fit(x_train, y_train)
    print("Best parameters found: ", grid_mse.best_params_)
    print("Lowest RMSE Grid Search found: ",
          np.sqrt(np.abs(grid_mse.best_score_)))
    print()

    # Randomized Search parameter tuning
    print("Randomized Search Parameter Tuning:")
    gbm_param_grid2 = {'n_estimators': [25], 'max_depth': range(2, 12)}

    gbm2 = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
    randomized_mse = RandomizedSearchCV(estimator=gbm2,
                                        param_distributions=gbm_param_grid2,
                                        scoring="neg_mean_squared_error",
                                        n_iter=5,
                                        cv=4,
                                        verbose=1)
    randomized_mse.fit(x_train, y_train)
    print("Best parameters found: ", randomized_mse.best_params_)
    print("Lowest RMSE Randomized Search found: ",
          np.sqrt(np.abs(randomized_mse.best_score_)))

    dm_train = xgb.DMatrix(data=x_train, label=y_train)
    dm_test = xgb.DMatrix(data=x_test, label=y_test)
    params = {"booster": "gblinear", "objective": "reg:squarederror"}
    xg_reg2 = xgb.train(dtrain=dm_train, params=params, num_boost_round=15)
    preds2 = xg_reg2.predict(dm_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds2))
    print("RMSE: %f" % rmse)

    reg_params = [0.1, 0.3, 0.7, 1, 10, 100]
    params1 = {"objective": "reg:squarederror", "max_depth": 3}
    rmses_l2 = []
    for reg in reg_params:
        params1["lambda"] = reg
        cv_results_rmse = xgb.cv(dtrain=dm_train,
                                 params=params1,
                                 nfold=3,
                                 num_boost_round=15,
                                 metrics="rmse",
                                 as_pandas=True)
        rmses_l2.append(cv_results_rmse["test-rmse-mean"].tail(1).values[0])

    print("Best rmse as a function of l2:")
    print(pd.DataFrame(list(zip(reg_params, rmses_l2)), columns=["l2",
                                                                 "rmse"]))
    print()

    #print_feature_importances(model=xg_reg2, data=imputed_data.drop(columns=["angebotspreis"]))

    # Stochastic Gradient Boosting
    print("Stochastic Gradient Boosting:")
    sgbr = GradientBoostingRegressor(max_depth=4,
                                     subsample=0.9,
                                     max_features=0.75,
                                     n_estimators=200,
                                     random_state=2)

    sgbr.fit(x_train, y_train)
    y_pred = sgbr.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("RMSE: %f" % rmse)
    print()

    sgbr_file = 'sgbr_Standardmodell.pckl'
    with open(sgbr_file, 'wb') as f:
        pickle.dump(sgbr, f)

    print_feature_importances(
        model=sgbr,
        data=imputed_data.drop(columns=["angebotspreis"]),
        save_string=
        'Files/Feature_Importances_Grafiken/sgbr_feature_importances.jpg')

    # Random Forest
    print("Random Forest:")
    rf = RandomForestRegressor(n_estimators=25, random_state=2)
    rf.fit(x_train, y_train)
    y_pred2 = rf.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred2))
    print("RMSE: %f" % rmse)
    print()

    rf_file = 'rf_Standardmodell.pckl'
    with open(rf_file, 'wb') as f:
        pickle.dump(rf, f)

    print_feature_importances(
        model=rf,
        data=imputed_data.drop(columns=["angebotspreis"]),
        save_string=
        'Files/Feature_Importances_Grafiken/rf_feature_importances.jpg')
print('Length of X (test): {} | Length of y (test): {}'.format(
    len(original_Xtest), len(original_ytest)))

# List to append the score and then find the average
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

# Classifier with optimal parameters
# log_reg_sm = grid_log_reg.best_estimator_
log_reg_sm = LogisticRegression()

# Parameters
log_reg_params = {
    "penalty": ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

rand_log_reg = RandomizedSearchCV(LogisticRegression(),
                                  log_reg_params,
                                  n_iter=4)

# Implementing SMOTE Technique
# Cross Validating the right way
for train, test in sss.split(original_Xtrain, original_ytrain):
    pipeline = imbalanced_make_pipeline(
        SMOTE(sampling_strategy='minority'),
        rand_log_reg)  # SMOTE happens during Cross Validation not before..
    model = pipeline.fit(original_Xtrain[train], original_ytrain[train])
    best_est = rand_log_reg.best_estimator_
    def fineTuneClassifiers(self, X, y, classifiers):
        """Search over specified parameter values for various estimators/classifiers and choose the best one.

        This method searches over specified values and selects the classifier that
        achieves the best avg accuracy score for all evaluations. The supported search methods are:

        * *GridSearchCV*: Exhaustive search over specified parameter values for supported estimators.
          The following variables are defined in :func:`~src.config.MLConf` :

         * :attr:`~src.config.MLConf.MLP_hyperparameters`
         * :attr:`~src.config.MLConf.RandomForests_hyperparameters`
         * :attr:`~src.config.MLConf.XGBoost_hyperparameters`
         * :attr:`~src.config.MLConf.SVM_hyperparameters`
         * :attr:`~src.config.MLConf.DecisionTree_hyperparameters`

        * *RandomizedSearchCV*: Randomized search over continuous distribution space. :attr:`~src.config.MLConf.max_iter`
          defines the number of parameter settings that are sampled. :py:attr:`~src.config.MLConf.max_iter` trades off
          runtime vs quality of the solution. The following variables are defined in :func:`~src.config.MLConf` :

         * :attr:`~src.config.MLConf.MLP_hyperparameters_dist`
         * :attr:`~src.config.MLConf.RandomForests_hyperparameters_dist`
         * :attr:`~src.config.MLConf.XGBoost_hyperparameters_dist`
         * :attr:`~src.config.MLConf.SVM_hyperparameters_dist`
         * :attr:`~src.config.MLConf.DecisionTree_hyperparameters_dist`

        Parameters
        ----------
        X: array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples.
        y: array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values, i.e. class labels.

        Returns
        -------
        out: :obj:`dict` of {:obj:`str`: :obj:`int`, :obj:`str`: :obj:`str`}
            It returns a dictionary with keys *accuracy*, i.e., the used similarity score, and *classifier*, i.e.,
            the name of the model in reference.
        """
        hyperparams_data = list()

        for clf_key in classifiers:
            try:
                print(f'Tuning {clf_key}...')

                clf = None
                if self.search_method.lower() == 'grid':
                    clf = GridSearchCV(self.clf_names[clf_key][0](
                        random_state=config.seed_no),
                                       self.clf_names[clf_key][1],
                                       cv=self.outer_cv,
                                       scoring=config.MLConf.score,
                                       verbose=1,
                                       n_jobs=self.n_jobs)
                # elif self.search_method.lower() == 'hyperband' and clf_key in ['XGBoost', 'Extra-Trees', 'Random Forest']:
                #     HyperbandSearchCV(
                #         clf_val[0](probability=True) if clf_key == 'SVM' else clf_val[0](), clf_val[2].copy().pop('n_estimators'),
                #         resource_param='n_estimators',
                #         min_iter=500 if clf_key == 'XGBoost' else 200,
                #         max_iter=3000 if clf_key == 'XGBoost' else 1000,
                #         cv=self.inner_cv, random_state=seed_no, scoring=score
                #     )
                else:  # randomized is used as default
                    clf = RandomizedSearchCV(self.clf_names[clf_key][0](),
                                             self.clf_names[clf_key][2],
                                             cv=self.outer_cv,
                                             scoring=config.MLConf.score,
                                             verbose=1,
                                             n_jobs=self.n_jobs,
                                             n_iter=self.n_iter)
                clf.fit(X, y)

                hyperparams_found = dict()
                hyperparams_found['score'] = clf.best_score_
                hyperparams_found['results'] = clf.cv_results_
                hyperparams_found['hyperparams'] = clf.best_params_
                hyperparams_found['estimator'] = clf.best_estimator_
                hyperparams_found['clf_name'] = clf_key
                hyperparams_found['scorers'] = clf.scorer_

                hyperparams_data.append(hyperparams_found)
            except KeyError as e:
                print("type error: {} for key: {}".format(str(e), clf_key))

        _, best_clf = max(enumerate(hyperparams_data),
                          key=(lambda x: x[1]['score']))

        return best_clf
Example #27
pd.DataFrame(grid_search.cv_results_)

# In[103]:


from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

# In[104]:


cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

# In[105]:


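# grid_search below refers to the GridSearchCV fitted in an earlier notebook cell (not shown here)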
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
        "model__epochs": epochs
    }


from keras.wrappers.scikit_learn import KerasClassifier  # classifier wrapper
model = KerasClassifier(build_fn=build_network, verbose=1)  # wrap the Keras model for scikit-learn

hyperparameters = create_hyperparameters()

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold, GridSearchCV

# estimator => pass in the wrapped model
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
# estimator => pass in the wrapped model
pipe = Pipeline([("scaler", MinMaxScaler()), ('model', model)])
search = RandomizedSearchCV(pipe,
                            hyperparameters,
                            n_iter=10,
                            n_jobs=15,
                            cv=5,
                            verbose=1)

search.fit(x_train, y_train)

print(search.best_params_)
score = search.score(x_test, y_test)
print("Score : ", score)
    intermediate_dims = np.arange(1,8)*128
    latent_dims = np.arange(2,10) 
    latent_activations = ['relu', 'elu'] 
    kernel_initializers = ['glorot_normal' , 'glorot_uniform', 
                            'he_normal', 'he_uniform',
                            'lecun_normal', 'lecun_uniform']
    
    param_grid = dict(n_hidden_layers = n_hidden_layers,
                      # kernel_initializer = kernel_initializers,
                      intermediate_dim = intermediate_dims,
                      latent_dim = latent_dims,
                      latent_activation = latent_activations)

    grid = RandomizedSearchCV(estimator = model, 
                              param_distributions = param_grid, 
                              error_score = np.nan,
                              n_iter = n_RSCV_iters,
                              cv = n_cv)

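# The target passed to fit() is the input itself, which suggests an autoencoder-style model
# (consistent with the 'grid_vae' save name below); this is an assumption, since the model
# definition is not shown here.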
grid_result = grid.fit(y_train, y_train)

time_stamp = int(time())
data_dir = '/SaveFiles/'
save_dir = os.environ['HOME'] + data_dir

savename_tmplt = save_dir + 'grid_vae_{}_{}'

if verbose: print("[INFO] Saving fitted model every way that I know how.")

# if verbose: print("[INFO] Saving full model")
# grid.save(savename_tmplt.format(time_stamp, 'full_model_save.hdf5'))
    # Generate the input parameters randomly. Some are uniform in the
    # EXPONENTS.
    alpha = 10**np.linspace(-3, 3, 10)

    # Combine the input parameters into a single dictionary to be passed to
    # the function.
    parametros = {'alpha': alpha}

    shuffle_splitter = ShuffleSplit(n_splits=5,
                                    test_size=0.3,
                                    random_state=1234)
    regressor = Ridge()
    cv_results = \
        RandomizedSearchCV(estimator=regressor, cv=shuffle_splitter,
                           param_distributions=parametros,
                           verbose=1,
                           n_jobs=4,
                           scoring="neg_root_mean_squared_error")

    # Perform the search by fitting on the training data
    cv_results.fit(X_data_scaled, y_data)

    print("\n---------------------LINEAR_REGRESSION_L2-------------------")

    print("\nBest parameter set: \n", cv_results.best_estimator_)

    print("\nBest error score: \n", -cv_results.best_score_)

    # sklearn default. A list of 10 identical parameters is used just to avoid a warning; performance is not critical here
    alpha = [1.0] * 10