Example 1
# Aliased imports implied by the snippet (sklearn 0.17-era API; 'dtc' is
# assumed to be DecisionTreeRegressor from the TODO text):
from sklearn.cross_validation import ShuffleSplit
from sklearn.grid_search import GridSearchCV as gscv
from sklearn.tree import DecisionTreeRegressor as dtc
from sklearn.metrics import make_scorer
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """

    # Create cross-validation sets from the training data
    # sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
    # sklearn version 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
    cv_sets = ShuffleSplit(X.shape[0],
                           n_iter=10,
                           test_size=0.20,
                           random_state=0)

    # TODO: Create a decision tree regressor object
    regressor = dtc()

    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric)

    # TODO: Create the grid search cv object --> GridSearchCV()
    # Make sure to include the right parameters in the object:
    # (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
    grid = gscv(regressor, params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
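
A minimal usage sketch, assuming a NumPy feature matrix and target vector; performance_metric is not defined in the snippet, so an R²-based stand-in is supplied here:

import numpy as np
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    # Hypothetical stand-in for the snippet's undefined metric: R^2.
    return r2_score(y_true, y_predict)

X_demo = np.random.rand(100, 3)                 # toy feature matrix
y_demo = X_demo.dot(np.array([1.0, 2.0, 3.0]))  # toy target
reg = fit_model(X_demo, y_demo)
print("Optimal max_depth:", reg.get_params()['max_depth'])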
Example 2
# Aliased imports implied by the snippet ('skf' and 'gdc' targets are assumed
# from the parameter names; enhance_features is project-specific):
import pandas as pd
from sklearn.model_selection import StratifiedKFold as skf
from sklearn.model_selection import GridSearchCV as gscv
from sklearn.ensemble import GradientBoostingClassifier as gdc
def perform_grid_search(train_csv_path, headers, num_heroes):
    df = pd.read_csv(train_csv_path, names=headers, nrows=10000)
    print('Number of observations in the training data:', len(df))

    enhanced_features = enhance_features(headers, df, None, num_heroes)
    combined_features = enhanced_features + headers[1:4]

    tuned_parameters = {
        'n_estimators': [50, 100],
        'max_depth': [6, 8],
        'subsample': [0.5],
        'learning_rate': [0.01, 0.05]
    }
    splitter = skf(5, shuffle=True, random_state=0)
    clf = gscv(gdc(), tuned_parameters, cv=splitter, n_jobs=-1)
    clf.fit(df[combined_features], df['score'])

    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    # Sort ascending by mean test score; the explicit key avoids
    # comparing the params dicts when two means tie.
    for mean, std, params in sorted(zip(means, stds,
                                        clf.cv_results_['params']),
                                    key=lambda t: t[0]):
        print("%0.4f (+/-%0.4f) for %r" % (mean, std * 2, params))

    print()
    print('Best score: ' + '\x1b[1;33;40m', clf.best_score_, '\x1b[0m')
    print('Best parameters set found on development set:')
    print()
    print(clf.best_params_)
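
    # GridSearchCV refits the best combination on the whole development set
    # by default; a hedged continuation (df_holdout is a hypothetical frame
    # with the same columns as df) could evaluate that refit model:
    best_clf = clf.best_estimator_
    print('Holdout accuracy:',
          best_clf.score(df_holdout[combined_features], df_holdout['score']))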
Example 3

# The opening of this snippet is truncated in the source; the imports and the
# loop header below are reconstructed from the calls that follow (xtrain,
# xtest, ytrain, ytest and the full X, y are assumed to come from an
# earlier split).
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV as gscv

Ss = []
for i in range(10, 50, 5):
    model = RandomForestClassifier(n_estimators=i,
                                   max_depth=None,
                                   class_weight='balanced')
    model.fit(xtrain, ytrain)
    s = model.score(xtest, ytest)
    print('Accuracy RFC with ', i, 'estimators: ', s)
    Ss.append(s)
plt.plot(range(10, 50, 5), Ss)
plt.ylabel('Accuracy')
plt.xlabel('Number of estimators')
plt.xticks(range(10, 50, 5))
plt.title('RFC accuracy vs n_estimators')
plt.show()

model = gscv(RandomForestClassifier(class_weight='balanced'),
             {'n_estimators': [10, 15, 20, 25, 30, 35, 40],
              'max_depth': [3, 4, 5, 10, 20]},
             cv=5,
             return_train_score=False)
model.fit(X, y)
bestRFCgscv = model.best_estimator_
resRFC = pd.DataFrame(model.cv_results_)
# print(resRFC)
print('Best parameters for RFC: ', model.best_params_)
# Save the full CV results table for later inspection
resRFC.to_csv('gscvRFCWEIGHTED.csv', index=True)

bestRFC = RandomForestClassifier(n_estimators=40,
                                 max_depth=20,
                                 class_weight='balanced')
bestRFC.fit(xtrain, ytrain)
bestypredRFC = bestRFC.predict(xtest)
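
A natural follow-up on the held-out split, using only standard sklearn.metrics calls:

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(ytest, bestypredRFC))
print(classification_report(ytest, bestypredRFC))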
Example 4
# Aliased imports implied by the snippet ('lgr' is assumed from the C/penalty
# grid; get_data is project-specific):
import pandas as pd
from sklearn.model_selection import GridSearchCV as gscv
from sklearn.linear_model import LogisticRegression as lgr

# The indented lines below close a confusion-matrix plotting helper whose
# definition is truncated in the source:
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(name+"_confusion_matrix.jpg")
    plt.close()


x_train, x_validation, x_test, x_train_SMOTE, x_train_undersample,\
y_train, y_validation, y_test, y_train_SMOTE, y_train_undersample = get_data()

params = {'C': [0.0001, 0.001, 0.01, 0.1, 1,
                10, 100, 1000, 10000],
          'penalty': ['l1', 'l2']}

# liblinear supports both penalties in the grid; the modern default
# solver (lbfgs) would fail on the 'l1' candidates.
grid_normal = gscv(lgr(solver='liblinear'), params, cv=10)
grid_undersample = gscv(lgr(solver='liblinear'), params, cv=10)
grid_SMOTE = gscv(lgr(solver='liblinear'), params, cv=10)

grid_normal.fit(x_train, y_train)
grid_undersample.fit(x_train_undersample, y_train_undersample)
grid_SMOTE.fit(x_train_SMOTE, y_train_SMOTE)

result_normal = pd.DataFrame(grid_normal.cv_results_)
result_undersample = pd.DataFrame(grid_undersample.cv_results_)
result_SMOTE = pd.DataFrame(grid_SMOTE.cv_results_)

#best1 = np.argmax(result1.mean_test_score.values)

name = ["normal", "undersample", "SMOTE"]
y_normal_predict = grid_normal.predict(x_test)
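
The snippet stops after the first prediction; a short loop completes the pattern for all three grids (x_test and y_test come from get_data above):

for label, grid in zip(name, [grid_normal, grid_undersample, grid_SMOTE]):
    y_pred = grid.predict(x_test)
    print(label, 'test accuracy:', (y_pred == y_test).mean())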
Example 5
# Aliased imports implied by the snippet ('model', 'x', 'y', 'data' come from
# the truncated earlier part of the source; alias targets are assumed):
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.metrics import accuracy_score as accuracy

le = LabelEncoder()  # assumed: 'le' is a LabelEncoder instance
model.feature_importances_

top_feats = {}
for i in range(len(model.feature_importances_)):
    if model.feature_importances_[i] > 0.02:
        top_feats[x.columns[i]] = model.feature_importances_[i]

refined_x = data[list(top_feats.keys())]
refined_x = refined_x.apply(le.fit_transform)

xtrain, xtest, ytrain, ytest = tts(refined_x, y, test_size=0.3, random_state=13)

param = {'criterion': ['gini', 'entropy'],
         'max_depth': range(1, 10),
         'max_leaf_nodes': range(2, 10)}

from sklearn.model_selection import GridSearchCV as gscv
cv = gscv(model, param, n_jobs=-1, scoring='accuracy', cv=5)
cv_model = cv.fit(xtrain, ytrain)

cv_model.best_params_
#Out[79]: {'criterion': 'entropy', 'max_depth': 5, 'max_leaf_nodes': 9}

rf_model = rfc(criterion='entropy', max_depth=5, max_leaf_nodes=9, random_state=13)

rf_model.fit(xtrain, ytrain)
final_pred = rf_model.predict(xtest)

accuracy(ytest, final_pred)
#Out[88]: 0.8503401360544217

#########Boosting#########
#Gradient boosting 
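The snippet ends at this header; a minimal sketch of the analogous grid search for gradient boosting (this parameter grid is illustrative, not from the source):

from sklearn.ensemble import GradientBoostingClassifier

gb_param = {'n_estimators': [50, 100],
            'learning_rate': [0.05, 0.1],
            'max_depth': [2, 3]}
gb_cv = gscv(GradientBoostingClassifier(random_state=13), gb_param,
             n_jobs=-1, scoring='accuracy', cv=5)
gb_cv.fit(xtrain, ytrain)
print(gb_cv.best_params_, gb_cv.best_score_)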
Example 6
# Aliased imports implied by the snippet (Keras 2.x-era wrapper API):
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

# create_model() is truncated in the source; the surviving lines below
# finish its first Dense layer.
              kernel_initializer='uniform',
              activation='relu',
              input_dim=17))
    model.add(Dense(units=9, kernel_initializer='uniform', activation='relu'))
    model.add(Dense(units=3, kernel_initializer='uniform',
                    activation='linear'))  # NB: categorical_crossentropy normally expects a softmax output
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


seed = 7
np.random.seed(seed)

model = KerasClassifier(build_fn=create_model,
                        epochs=100,
                        batch_size=5,
                        verbose=0)  # Keras fit() accepts verbose values 0, 1, or 2

from sklearn.model_selection import GridSearchCV as gscv
batch_size = [32, 64, 100]
epochs = [25, 50, 100, 150, 200]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = gscv(estimator=model, param_grid=param_grid, verbose=60, n_jobs=-1)

grid_search = grid.fit(X_train, Y_train)

grid_search.best_score_  #0.7499999933772616
grid_search.best_params_  #{'batch_size': 100, 'epochs': 200}
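
To see more than the single best row, the same cv_results_ walk used in Example 2 works here too:

means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.4f (+/-%0.4f) for %r" % (mean, std * 2, params))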