Esempio n. 1
0
 def gscv_para(self, C_list, gamma_list, x_train, y_train):
     '''
     Tune C and gamma with a grid search under 10-fold cross-validation.

     The SVC is built with ``class_weight='balanced'`` to account for
     class imbalance.

     Args:
         C_list:
             Candidate values for the C parameter.
         gamma_list:
             Candidate values for the gamma parameter.
         x_train:
             Feature data of the training set.
         y_train:
             Class labels of the training set.
     Returns:
         A pair ``([best_C, best_gamma], best_score)``. ``best_score`` is
         the mean cross-validated score of the winning combination; the
         search is optimised on the SVC score, so higher is better.
     '''
     clf = SVC(class_weight='balanced', cache_size=4000)
     gscv = GSCV(clf,
                 param_grid={
                     'C': C_list,
                     'gamma': gamma_list
                 },
                 n_jobs=-1,
                 cv=10,
                 pre_dispatch=4)
     gscv.fit(x_train, y_train)
     # Look the winners up by key instead of relying on the iteration order
     # of ``best_params_.values()``, so the (C, gamma) order documented
     # above is guaranteed.
     best = gscv.best_params_
     return [best['C'], best['gamma']], gscv.best_score_
def SVM_gridsearch(parameters, data_train, labels_train, number_splits,
                   num_threads):
    """Fit a grid search over an SVC and return the fitted search object.

    Args:
        parameters: parameter grid handed to the grid search.
        data_train: training features.
        labels_train: training labels.
        number_splits: number of splits given to the ``SKF`` splitter.
        num_threads: value passed as ``n_jobs`` (the original author noted
            ``multiprocessing.cpu_count()`` as a candidate value).

    Returns:
        The grid-search object after ``fit`` has been called on it.
    """
    base_svc = svm.SVC(gamma="scale", probability=True)
    splitter = SKF(n_splits=number_splits)
    search = GSCV(
        base_svc,
        parameters,
        cv=splitter,
        verbose=2,
        n_jobs=num_threads,
    )
    search.fit(data_train, labels_train)
    return search
Esempio n. 3
0
 def cvgridsearch(self, skip_train=5):
     """Run the grid search on a strided subsample of the training data.

     The search object is stored on ``self.GS`` and, once fitted, the
     results table is produced via ``self.table_results_cvgridsearch()``.

     Args:
         skip_train: stride used to thin out the training rows.
     """
     search = GSCV(estimator=self.estimator,
                   param_grid=self.param_dict,
                   cv=self.cv,
                   n_jobs=self.n_jobs,
                   return_train_score=True)
     self.GS = search
     # Every ``skip_train``-th row starting at row 0. The stop index of -1
     # means the final row is never part of the subsample.
     rows = slice(0, -1, skip_train)
     X_subsample = self.X_train[rows, :]
     y_subsample = self.y_train[rows]
     search.fit(X_subsample, y_subsample)
     # Plotting is currently disabled:
     # self.plot_results_cvgridsearch()
     self.table_results_cvgridsearch()
Esempio n. 4
0
def GridSearchPara():
    """Grid-search a decision tree over criterion, impurity and depth.

    Fits on the module-level ``X`` and ``Y`` with 5-fold cross-validation
    and prints the best parameter combination followed by its score.
    """
    N = 100
    impurity = np.linspace(0, 0.2, N)
    hyperpara = {
        'criterion': ['gini', 'entropy'],
        'min_impurity_decrease': impurity,
        # max_depth is documented as an integer; current scikit-learn
        # rejects float values, so truncate the evenly spaced grid to ints.
        'max_depth': np.linspace(1, 200, N).astype(int)
    }
    model = GSCV(DTC(), hyperpara, cv=5)
    model.fit(X, Y)
    # A single pre-formatted string prints identically under Python 2 and 3.
    print('%s %s' % (model.best_params_, model.best_score_))
Esempio n. 5
0
File: ARmvp.py Progetto: fdupless/AR
def fitlearner(X, Y, acts=activities, classifier="RFC", name="_"):
    """Grid-search a random forest, report hold-out accuracy and save it.

    Args:
        X: training features passed to ``fit``.
        Y: training labels passed to ``fit``.
        acts: forwarded to ``gettrainData`` when loading the hold-out data.
        classifier: currently unused; a random forest is always trained.
        name: suffix of the output file ``"Classifier_" + name``.

    Returns:
        The fitted grid-search object.
    """
    clf = GSCV(
        RFC(),
        {
            "n_estimators": np.arange(1, 5, 1) * 10,
            # "auto" is omitted: for a random forest classifier it is the
            # same setting as "sqrt", and it was removed in scikit-learn 1.3.
            "max_features": ["sqrt", "log2", None],
        },
    )  # default classifier
    print("Training the classifier...")
    clf.fit(X, Y)

    # Hold-out data; presumably "108" identifies a subject/recording that
    # is kept out of training -- TODO confirm against gettrainData.
    testX, testY = gettrainData(["108"], acts, 1)

    predictions = clf.predict(testX)
    # Fraction of hold-out samples predicted correctly.
    success = (predictions == testY).sum() * 1.0 / len(predictions)
    print("Success Rate", success)
    joblib.dump(clf, "Classifier_" + name)
    return clf
Esempio n. 6
0
def GridSearchSinglePara():
    """Grid-search a decision tree over ``max_depth`` and plot the scores.

    Fits on the module-level ``X`` and ``Y`` with 5-fold cross-validation,
    prints the best parameters and score, then plots mean train and
    validation scores against ``max_depth`` with a +/- one standard
    deviation band around each curve.
    """
    N = 50  # number of grid points
    # max_depth is documented as an integer; current scikit-learn rejects
    # float values, so truncate the evenly spaced grid to ints.
    max_depth = np.linspace(1, 200, N).astype(int)
    hyperpara = {'max_depth': max_depth}  # parameter dictionary
    # Train scores are only recorded in cv_results_ when requested.
    model = GSCV(DTC(), hyperpara, cv=5, return_train_score=True)
    model.fit(X, Y)
    # A single pre-formatted string prints identically under Python 2 and 3.
    print('%s %s' % (model.best_params_, model.best_score_))

    # Plotting section
    R = model.cv_results_
    mtrains = R['mean_train_score']  # mean training score
    strains = R['std_train_score']  # std of the training score
    mtests = R['mean_test_score']  # mean validation score
    stests = R['std_test_score']  # std of the validation score

    plt.figure(figsize=(10, 10))
    # Shaded +/- one standard deviation bands
    plt.fill_between(max_depth,
                     mtrains - strains,
                     mtrains + strains,
                     color='lightgray',
                     alpha=0.3)
    plt.fill_between(max_depth,
                     mtests - stests,
                     mtests + stests,
                     color='lightgray',
                     alpha=0.3)
    # Mean score curves
    plt.plot(max_depth, mtrains, color='r', label='train mean scores')
    plt.plot(max_depth, mtests, color='g', label='test mean scores')
    # Basic figure settings
    plt.grid()
    plt.legend()
    plt.title('max_depth gridsearch')
    plt.xlabel('max_depth')
    plt.ylabel('score %')
    plt.show()
Esempio n. 7
0
def run_experiment(arguments):
    """Run repeated train/test experiments around a grid search and save them.

    For each of ``n_test_sets`` shuffle splits, a grid search (scored by
    negative mean squared error) is fitted on the training part, the
    refitted best estimator predicts the test part, and the root mean
    squared error plus wall-clock time are recorded. Results are written
    to a fresh directory ``../results/<filename>[_<k>]/``.

    Args:
        arguments: dict with the keys 'dataset', 'estimator_kwargs' (which
            itself holds 'estimator'), 'n_test_sets', 'test_size',
            'param_grid', 'cv_folds', 'n_jobs' and 'filename'. The whole
            dict is also dumped as JSON to ``log.txt``.
    """
    # Load data set
    X, Y, log_tf = load_data_set(arguments['dataset'], path_to_source)
    estim = load_estimator(arguments['estimator_kwargs'], path_to_source)
    estimator_name = arguments['estimator_kwargs']['estimator']
    # Prepare for experiments
    n_test_sets = arguments['n_test_sets']
    test_size = arguments['test_size']
    param_grid = arguments['param_grid']  # Parameter grid to CV over
    cv_folds = arguments['cv_folds']
    n_jobs = arguments['n_jobs']
    kf = ShuffleSplit(n_splits=n_test_sets, test_size=test_size)
    test_error = np.zeros(n_test_sets)
    best_parameters = {}
    computational_time = np.zeros(n_test_sets)
    # Extra array to store dot products if estimator is nsim
    almost_linearity_param = np.zeros(n_test_sets)
    for test_iter, (idx_train, idx_test) in enumerate(kf.split(X)):
        start = time.time()
        # NOTE: ``iid`` was removed from GridSearchCV in scikit-learn 0.24;
        # drop that argument when running on a newer release.
        # NOTE(review): ``n_jobs`` is only used as ``pre_dispatch``; the
        # search's own ``n_jobs`` is left at its default -- confirm intent.
        reg = GSCV(estimator=estim,
                   param_grid=param_grid,
                   scoring='neg_mean_squared_error',
                   iid=False,
                   cv=cv_folds,
                   verbose=0,
                   pre_dispatch=n_jobs,
                   error_score=np.nan,  # used if estimator fitting raises
                   refit=True)
        X_train, Y_train = X[idx_train, :], Y[idx_train]
        X_test, Y_test = X[idx_test, :], Y[idx_test]
        reg = reg.fit(X_train, Y_train)
        Y_predict = reg.best_estimator_.predict(X_test)
        end = time.time()
        best_parameters[test_iter] = reg.best_params_
        if estimator_name in ('isotron', 'slisotron'):
            # For these estimators the stored entry is replaced by the
            # value returned from n_iter_cv().
            best_parameters[test_iter] = reg.best_estimator_.n_iter_cv()
        if log_tf:
            # Targets are on a log scale: compare after exponentiating.
            test_error[test_iter] = np.sqrt(
                mean_squared_error(np.exp(Y_test), np.exp(Y_predict)))
        else:
            test_error[test_iter] = np.sqrt(
                mean_squared_error(Y_test, Y_predict))
        computational_time[test_iter] = end - start
        if estimator_name == 'nsim':
            almost_linearity_param[
                test_iter] = reg.best_estimator_.measure_almost_linearity()
        # Progress output; a single parenthesised value prints the same
        # under Python 2 and Python 3.
        print(best_parameters)
        print(test_error)
    # Save results
    mean_error = np.mean(test_error)
    std_error = np.std(test_error)
    mean_computational_time = np.mean(computational_time)
    mean_almost_linearity_param = np.mean(almost_linearity_param)
    filename = arguments['filename']
    filename_mod = filename
    save_itr = 0
    # Find the first directory name that does not exist yet so that an
    # earlier run is never overwritten.
    while os.path.exists('../results/' + filename_mod + '/'):
        save_itr += 1
        filename_mod = filename + '_' + str(save_itr)
    out_dir = '../results/' + filename_mod + '/'
    os.makedirs(out_dir)
    np.save(out_dir + 'test_errors.npy', test_error)
    np.savetxt(out_dir + 'test_errors.txt', test_error)
    np.savetxt(out_dir + 'computational_time.txt', computational_time)
    np.savetxt(out_dir + 'computational_time_summary.txt',
               [mean_computational_time])
    np.savetxt(out_dir + 'test_errors_summary.txt',
               np.array([mean_error, std_error]))
    np.save(out_dir + 'best_params.npy', best_parameters)
    if estimator_name == 'nsim':
        np.savetxt(out_dir + 'almost_linearity_param.txt',
                   almost_linearity_param)
        np.savetxt(out_dir + 'almost_linearity_summary.txt',
                   [mean_almost_linearity_param])
    with open(out_dir + 'best_params_json.txt', 'w') as fh:
        fh.write(json.dumps(best_parameters, indent=4))
    with open(out_dir + 'log.txt', 'w') as fh:
        fh.write(json.dumps(arguments, indent=4))
Esempio n. 8
0
                                                    random_state=47)

# Pipeline: min-max scaling followed by an MLP classifier.
mlpc_norm_pipe = mp(MinMaxScaler(), MLPC(random_state=47))

# Grid for the pipeline; keys carry the 'mlpclassifier__' step prefix.
mlp_param_grid1 = {
    'mlpclassifier__hidden_layer_sizes': [10, 100, (10, 10), (100, 100)],
    'mlpclassifier__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'mlpclassifier__solver': ['lbfgs', 'sgd', 'adam']
}
# Same grid for a bare MLP classifier (no step prefix).
mlp_param_grid2 = {
    'hidden_layer_sizes': [10, 100, (10, 10), (100, 100)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam']
}

# Search with scaling.
mlp_norm_grid = GSCV(mlpc_norm_pipe, mlp_param_grid1, scoring='f1', cv=5)
mlp_norm_grid.fit(X_train, y_train)
print("Test set score: {:.2f}".format(mlp_norm_grid.score(X_test, y_test)))
print("Best parameters: {}".format(mlp_norm_grid.best_params_))

# Search without scaling. The name ``mlp_norm_grid`` is reused, so from
# here on it refers to the unscaled search.
mlp_norm_grid = GSCV(MLPC(), mlp_param_grid2, scoring='f1', cv=5)
mlp_norm_grid.fit(X_train, y_train)
print("Test set score: {:.2f}".format(mlp_norm_grid.score(X_test, y_test)))
print("Best parameters: {}".format(mlp_norm_grid.best_params_))

mlpc_results = pd.DataFrame(mlp_norm_grid.cv_results_)
# ``head`` must be called; passing the bound method would display the
# method object instead of the first rows.
display(mlpc_results.head())

y_pred = mlp_norm_grid.predict(X_test)
# Report the metrics instead of computing and discarding them.
print("ROC AUC: {:.2f}".format(metrics.roc_auc_score(y_test, y_pred)))
print("Accuracy: {:.2f}".format(metrics.accuracy_score(y_test, y_pred)))
Esempio n. 9
0
# In[ ]:

from sklearn.preprocessing import StandardScaler as SS
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.pipeline import Pipeline
# NOTE: this rebinds the imported class alias ``SS`` to a scaler instance.
SS = SS()
clf = MLP()
#print(clf.get_params().keys())
pipe = Pipeline(steps=[('scaler', SS), ('MLP', clf)])
# Grid keys use the pipeline step name 'MLP' as prefix.
params = {
    'MLP__hidden_layer_sizes': list(range(1000, 30000, 1000)),
    'MLP__activation': ['logistic', 'tanh', 'relu']
}

grid_search = GSCV(pipe, params, cv=8, scoring='accuracy')
grid_search.fit(datax, datay)
best_act = grid_search.best_params_.get('MLP__activation')
# The key must match the grid key exactly ('..._sizes'); the misspelt
# singular form made this lookup always return None.
best_hl = grid_search.best_params_.get('MLP__hidden_layer_sizes')
print('Best Parameters:', grid_search.best_params_)
print("Accuracy:", grid_search.best_score_)

# Nested cross-validation: the whole grid search is itself cross-validated.
nested_score = cross_val_score(grid_search, datax, datay, cv=8)
print('Nested Score:', nested_score.mean())

# In[ ]:

import pickle
# The fitted grid search is the object selected for persisting.
final_model = grid_search

filename = 'finalized_ANN_Alzh.sav'
Esempio n. 10
0
    #padronizacao dos dados baseados no conjunto de treinamento
    scale = StandardScaler().fit(data_tr)
    data_tr = scale.transform(data_tr)
    data_te = scale.transform(data_te)

    #KNN

    #PCA
    pca = PCA(0.8)
    pca.fit(data_tr)
    data_tr_pca = pca.transform(data_tr)
    data_te_pca = pca.transform(data_te)

    #estimando o parametro em 3 fold
    grid = GSCV(KNN(), p_knn)
    grid.fit(data_tr_pca, classes_tr)

    #acuracia
    knn = KNN(n_neighbors=grid.best_params_['n_neighbors'])
    knn.fit(data_tr_pca, classes_tr)
    acc = knn.score(data_te_pca, classes_te)
    acc_mean[0] += acc / 5

    #SVM

    #estimando o parametro em 3 fold
    grid = GSCV(SVC(), p_svm)
    grid.fit(data_tr, classes_tr)

    #acuracia
Esempio n. 11
0



# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated scores of the classifier on the training set.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
# Keep the summary statistics; as bare expressions they were computed and
# immediately discarded when the file runs as a script.
mean_accuracy = accuracies.mean()
std_accuracy = accuracies.std()





# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV as GSCV
# Will be a list of Dictionary
# Can make this dictionary by optimizing the values that we need
# to put in class SVC...(in this question)
# Candidate values to try, given as a list of dictionaries; each dictionary
# maps a parameter of the classifier to the values to evaluate.
parameters = [{'C': [1, 0.9, 0.8, 0.7]}]
# Grid search evaluates every combination with 10-fold cross-validation
# (cv=10) and keeps the one with the best accuracy.
grid_search = GSCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
)
grid_search = grid_search.fit(X_train, y_train)
# Best mean accuracy over the 10 folds, and the parameters that achieved it.
best_accuracy, best_parameters = (grid_search.best_score_,
                                  grid_search.best_params_)

Esempio n. 12
0
        ]
        return self

    def predict_proba(self, x):
        """Return the class probabilities for new data.

        Args:
            x: samples to score; handed unchanged to ``score_samples`` of
                every model in ``self.models_``.

        Returns:
            Array of shape [n_samples, n_classes] whose rows sum to 1.
        """
        # One column per model: its log-density plus the log prior.
        log_joint = np.vstack(
            [model.score_samples(x) for model in self.models_]).T
        log_joint = log_joint + self.logpriors_
        # Shift each row by its maximum before exponentiating. The shift
        # cancels in the normalisation below but keeps exp() from
        # underflowing to 0 for every class, which would give 0/0 = NaN.
        log_joint = log_joint - log_joint.max(axis=1, keepdims=True)
        result = np.exp(log_joint)
        return result / result.sum(axis=1, keepdims=True)

    def predict(self, x):
        """Return, for each sample, the class with the highest probability."""
        probabilities = self.predict_proba(x)
        # Column index of the largest probability in each row.
        winners = np.argmax(probabilities, axis=1)
        return self.classes_[winners]


# Use the custom estimator class in a grid search over the bandwidth
bandwidths = 10**np.linspace(0, 2, 100)
grid = GSCV(KDEClassifier(), {
    'bandwidth': bandwidths
}).fit(dig.data, dig.target)
# ``grid_scores_`` was removed from GridSearchCV; the mean validation score
# of every candidate is available in ``cv_results_``, in grid order.
scores = grid.cv_results_['mean_test_score']

# Cross-validation score curve
plt.figure(figsize=(12, 8))
plt.semilogx(bandwidths, scores)
plt.xlabel('bandwidth')
plt.ylabel('accuracy')
plt.title('KDE Model Performance')
print(grid.best_params_, grid.best_score_)
# Cross-validated score of the GNB estimator on the same data, for comparison
print(cross_val_score(GNB(), dig.data, dig.target).mean())
Esempio n. 13
0
# MLP pipelines: min-max scaled, standardised, and PCA-transformed input.
mlpc_norm_pipe = mp(MinMaxScaler(), MLPC(random_state=47))
mlpc_stand_pipe = mp(StandardScaler(), MLPC(random_state=47))
mlpc_pca_pipe = mp(PCA(), MLPC())

# ##### kNN grid #####
kNN_param_grid = {
    'kneighborsclassifier__n_neighbors': [1, 2, 3, 4, 5],
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__p': [1, 2, 3]
}

# Recorded output (presumably from an earlier run of the search below):
#   Test set score: 0.12
#   Best parameters: {'kneighborsclassifier__n_neighbors': 1, 'kneighborsclassifier__p': 1, 'kneighborsclassifier__weights': 'uniform'}
kNN_norm_grid = GSCV(knn_norm_pipe, kNN_param_grid, scoring='f1', cv=5)
kNN_norm_grid.fit(rnafolding_X_train, rnafolding_y_train)
print("Test set score: {:.2f}".format(
    kNN_norm_grid.score(rnafolding_X_test, rnafolding_y_test)))
print("Best parameters: {}".format(kNN_norm_grid.best_params_))
kNN_norm_results = pd.DataFrame(kNN_norm_grid.cv_results_)
# ``head`` must be called; passing the bound method would display the
# method object instead of the first rows.
display(kNN_norm_results.head())

# Recorded output (presumably from an earlier run of the search below):
#   Test set score: 0.11
#   Best parameters: {'kneighborsclassifier__n_neighbors': 1, 'kneighborsclassifier__p': 1, 'kneighborsclassifier__weights': 'uniform'}
kNN_stand_grid = GSCV(knn_stand_pipe, kNN_param_grid, scoring='f1', cv=5)
kNN_stand_grid.fit(rnafolding_X_train, rnafolding_y_train)
print("Test set score: {:.2f}".format(
    kNN_stand_grid.score(rnafolding_X_test, rnafolding_y_test)))
print("Best parameters: {}".format(kNN_stand_grid.best_params_))
Esempio n. 14
0
    def train_model(self,
                    x_data=None,
                    y_data=None,
                    con_cols=None,
                    cat_cols=None,
                    model=None,
                    imputer=None,
                    cvsplit=4,
                    rstate=101,
                    misper=None):
        """Impute the data, tune the requested model and evaluate it.

        Args:
            x_data: feature data, forwarded to ``self.Impute_the_data``.
            y_data: target data, forwarded to ``self.Impute_the_data``.
            con_cols: forwarded to ``self.Impute_the_data`` (presumably the
                continuous column names -- TODO confirm).
            cat_cols: forwarded to ``self.Impute_the_data`` (presumably the
                categorical column names -- TODO confirm).
            model: one of "xgboost", "adaboost", "logreg" or
                "randomforest". "lightgbm" is recognised but not
                configured: a message is printed and the process exits.
            imputer: forwarded to ``self.Impute_the_data``.
            cvsplit: number of folds for the grid search; also forwarded to
                ``self.run_the_model``.
            rstate: forwarded to ``self.run_the_model``.
            misper: forwarded to ``self.Impute_the_data``.

            Arguments left at ``None`` are replaced by a fresh empty list,
            which is what the previous ``[]`` defaults supplied.

        Returns:
            dict with the keys 'model', 'Acc_vals', 'Mean_vals', 'dataset'
            and 'modeltype', copied from the output of
            ``self.run_the_model``.

        Raises:
            ValueError: if ``model`` is not one of the names listed above.
            SystemExit: if ``model`` is "lightgbm".
        """
        import sys
        import warnings
        warnings.simplefilter('ignore', DeprecationWarning)

        # None sentinels replace the former mutable ``[]`` defaults, which
        # were shared between calls.
        x_data = [] if x_data is None else x_data
        y_data = [] if y_data is None else y_data
        con_cols = [] if con_cols is None else con_cols
        cat_cols = [] if cat_cols is None else cat_cols
        model = [] if model is None else model
        imputer = [] if imputer is None else imputer
        misper = [] if misper is None else misper

        #-----------------Selecting the imputer------------------------------------------
        Imputed_Data = self.Impute_the_data(imputer=imputer,
                                            x_data=x_data,
                                            y_data=y_data,
                                            con_cols=con_cols,
                                            cat_cols=cat_cols,
                                            misper=misper)

        train_y = Imputed_Data['train_y']
        train_x = Imputed_Data['train_x']
        X_resampled = Imputed_Data['X_resampled']
        y_resampled = Imputed_Data['y_resampled']

        #-----------------Selecting the model to run-------------------------------------
        # Third-party imports are done per branch so that only the library
        # needed for the selected model has to be installed.

        if model == "xgboost":
            import numpy as np
            import xgboost as xgb
            from sklearn.model_selection import GridSearchCV as GSCV

            paramGrid = {
                'max_depth': [5, 10],
                'min_child_weight': np.arange(1, 9, 1),
                'gamma': np.arange(0, 1, 0.001),
                'subsample': np.arange(0.1, 0.9, 0.05),
                'colsample_bytree': np.arange(0.1, 0.9, 0.05),
                # Was misspelt 'n_estimator', so the number of trees was
                # never actually tuned.
                'n_estimators': [50, 100, 200],
                'objective': ['binary:logistic', 'binary:logitraw'],
                'learning_rate': [0.001, 0.01, 0.1]
            }

            # eval_metric is set on the estimator: GridSearchCV no longer
            # accepts a ``fit_params`` constructor argument.
            model_run = xgb.XGBClassifier(eval_metric='auc')
            # The former KFold(...).get_n_splits(...) expression only
            # evaluated to the integer ``cvsplit``; pass it directly.
            gridsearch = GSCV(model_run, paramGrid, verbose=1, cv=cvsplit)
            gridsearch.fit(train_x, train_y)
            params = dict(gridsearch.best_params_)

        elif model == "adaboost":
            from sklearn.ensemble import AdaBoostClassifier as ABC
            from sklearn.model_selection import GridSearchCV as GSCV

            model_run = ABC()
            paramGrid = {
                'learning_rate': [0.001, 0.01, 0.1],
                'n_estimators': [50, 100, 200]
            }
            gridsearch = GSCV(model_run, paramGrid, verbose=1, cv=cvsplit)
            gridsearch.fit(train_x, train_y)
            params = dict(gridsearch.best_params_)

        elif model == "logreg":
            from sklearn.linear_model import LogisticRegression as LR

            model_run = LR()
            params = []

        elif model == "randomforest":
            from sklearn.ensemble import RandomForestClassifier as RFC

            model_run = RFC()
            params = []

        elif model == "lightgbm":
            print('lightgbm still not configured\n')
            sys.exit()

        else:
            # Previously an unknown name fell through and failed later with
            # an unbound ``model_run``.
            raise ValueError(
                'Unknown model {!r}; expected "xgboost", "adaboost", '
                '"logreg", "randomforest" or "lightgbm"'.format(model))

        Output = self.run_the_model(model_run, model, X_resampled, y_resampled,
                                    train_x, params, rstate, cvsplit)

        return {
            'model': Output['model'],
            'Acc_vals': Output['Acc_vals'],
            'Mean_vals': Output['Mean_vals'],
            'dataset': Output['dataset'],
            'modeltype': Output['modeltype']
        }
# print("F_score:", grid_search.best_score_)

# nested_score = cross_val_score(grid_search, newx, newy, cv=3)
# print('Nested Score:',nested_score.mean())

# In[18]:

# your code goes here
from sklearn.ensemble import RandomForestClassifier as RFC
# Random forest tuned over depth, leaf size and feature-subset rule.
clf = RFC()
params = dict(
    max_depth=list(range(40, 80)),
    min_samples_leaf=[2, 3, 4, 5, 6, 7, 10],
    max_features=['sqrt', 'log2'],
)
grid_search = GSCV(estimator=clf,
                   param_grid=params,
                   scoring='f1_macro',
                   cv=15)
grid_search.fit(datax, datay)

# Pull the winning value of each tuned parameter out of the fitted search.
best_depth, best_msl, best_features = (
    grid_search.best_params_.get(key)
    for key in ('max_depth', 'min_samples_leaf', 'max_features'))

print('Best Parameters:', grid_search.best_params_)
print("F_score:", grid_search.best_score_)

# In[19]:

# Nested cross-validation: the whole grid search is itself cross-validated.
nested_score = cross_val_score(grid_search, datax, datay, cv=15)
print('Nested Score:', nested_score.mean())

# In[17]:
Esempio n. 16
0
    "kernel": ["rbf"],
    "gamma": [
        0.1,
        0.2,
        0.3,
        0.4,
        0.5,
        0.6,
        0.7,
        0.8,
        0.9,
    ]
}]
# 10-fold accuracy grid search over ``parameters``, using all cores.
grid_search = GSCV(
    estimator=classifier,
    param_grid=parameters,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
).fit(x_train, y_train)
best_accuracy, best_parameters = (grid_search.best_score_,
                                  grid_search.best_params_)

# -------<Classify with the optimal values>----
# NOTE(review): the values below are hard-coded rather than read from
# ``best_parameters`` -- confirm they still match the search result.
classifier = SVC(C=1, kernel="rbf", gamma=0.7, random_state=0)
classifier.fit(x_train, y_train)
print(classifier)
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
Esempio n. 17
0
    if err > 0:
        y += err * rng.randn(n)
    return x, y


# Training sample plus a dense column vector of evaluation points on [0, 1].
x, y = make_data(200)
xtest = np.linspace(0, 1, 500).reshape(-1, 1)

model = make_pipeline(PF(), LR())
# Keys are '<pipeline step name>__<parameter>'.
tune_params = dict(
    polynomialfeatures__degree=np.arange(20),
    linearregression__fit_intercept=[True, False],
)

# Exhaustive search: every combination in ``tune_params`` is evaluated.
grid = GSCV(model,
            param_grid=tune_params,
            refit=True,
            verbose=1,
            cv=5,
            n_jobs=4)
grid.fit(x, y)
print(grid.best_params_, grid.best_score_, sep='\t\t')
# Recorded output:
# {'linearregression__fit_intercept': True, 'polynomialfeatures__degree': 9}
optimal = grid.best_estimator_
ypred = optimal.predict(xtest)

# Randomized search: only a fixed number of parameter settings is tried,
# not every combination.
randomized = RSCV(model,
                  param_distributions=tune_params,
                  verbose=1,
                  cv=5,
                  n_jobs=4)
randomized.fit(x, y)
# Target column of the test frame.
y_sample = test[target]

# Scaled feature/target views used for fitting and for the sample prediction.
X_train_scaled, y_train_scaled = train_scaled[features], train_scaled[target]
X_sample_scaled = test_scaled[features]

# Only ``min_child_weight`` is actually varied; the other entries are fixed
# to a single value each.
parameters = dict(
    n_estimators=[90],
    max_depth=[9],
    learning_rate=[0.2],
    min_child_weight=range(5, 21),
)
model = XG_model.get_model()

GS = GSCV(estimator=model,
          param_grid=parameters,
          scoring='neg_mean_squared_error',
          refit=True,
          cv=5)

# The unfitted search is handed to plot_chart together with the data.
# NOTE(review): ``test['predict_y_Value']`` used below is presumably filled
# in by this call -- confirm against plot_figure.plot_chart.
ndarray = plot_figure.plot_chart(GS, df, test, X_train_scaled, y_train_scaled,
                                 X_sample_scaled)
# Report the elapsed time (``start`` is set earlier in the script)
end = time.time()
print('total time cost {:.2f} sec.'.format(end - start))

# Pair each row index with a value to form two tracks, then compare the
# tracks with the Frechet distance.
test_track = list(zip(range(len(test)), ndarray))
pre_y_track = list(zip(range(len(test)), test['predict_y_Value'].values))
distans = ar.frechet_distance(test_track, pre_y_track)
print('appraisal:\nfrechet_distance =', distans)