def fit_model(X, y):
    """Grid-search 'max_depth' for a decision tree fitted on the data [X, y].

    Candidate depths 1 through 10 are scored with ``performance_metric``
    (wrapped by ``make_scorer``) over ten shuffled splits that each hold out
    20% of the samples.

    Args:
        X: Feature matrix; must expose ``.shape`` (``X.shape[0]`` is used as
            the number of samples to split).
        y: Target values passed to the grid search alongside ``X``.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    # Create cross-validation sets from the training data.
    # sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
    # sklearn version 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
    cv_sets = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.20, random_state=0)

    # NOTE(review): the docstring asks for a regressor; confirm that the
    # `dtc` alias imported by this file resolves to a decision tree regressor.
    regressor = dtc()

    # Depths 1..10 inclusive.
    params = {'max_depth': list(range(1, 11))}

    # Turn 'performance_metric' into a scorer usable by the grid search.
    scoring_fnc = make_scorer(performance_metric)

    # Pass cv_sets explicitly: without it the grid search falls back to its
    # default splitter and the shuffle split built above is never used.
    grid = gscv(regressor, params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model.
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data.
    return grid.best_estimator_
def perform_grid_search(train_csv_path, headers, num_heroes):
    """Run a small gradient-boosting grid search and print the results.

    Reads at most the first 10000 rows of ``train_csv_path`` (using
    ``headers`` as column names), builds the feature list from
    ``enhance_features`` plus ``headers[1:4]``, and fits a 5-fold shuffled
    grid search against the ``'score'`` column. Per-candidate scores are
    printed in ascending order, followed by the best score and parameters.

    Args:
        train_csv_path: Path of the training CSV file.
        headers: Column names for the CSV; ``headers[1:4]`` are used directly
            as features.
        num_heroes: Forwarded unchanged to ``enhance_features``.

    Returns:
        None. All output goes to stdout.
    """
    df = pd.read_csv(train_csv_path, names=headers, nrows=10000)
    print('Number of observations in the training data:', len(df))

    # NOTE(review): enhance_features is assumed to return a list of column
    # names (it is concatenated with a slice of `headers`) — confirm.
    enhanced_features = enhance_features(headers, df, None, num_heroes)
    combined_features = enhanced_features + headers[1:4]

    tuned_parameters = {
        'n_estimators': [50, 100],
        'max_depth': [6, 8],
        'subsample': [0.5],
        'learning_rate': [0.01, 0.05],
    }

    splitter = skf(5, shuffle=True, random_state=0)
    clf = gscv(gdc(), tuned_parameters, cv=splitter, n_jobs=-1)
    clf.fit(df[combined_features], df['score'])

    print("Grid scores on development set:")
    print()
    results = clf.cv_results_
    scored = zip(results['mean_test_score'],
                 results['std_test_score'],
                 results['params'])
    # Sort on (mean, std) only: letting the tuple comparison reach the params
    # dict raises TypeError whenever two candidates tie on both scores.
    for mean, std, params in sorted(scored, key=lambda row: row[:2]):
        print("%0.4f (+/-%0.04f) for %r" % (mean, std * 2, params))
    print()

    print('Best score: ' + '\x1b[1;33;40m', clf.best_score_, '\x1b[0m')
    print('Best parameters set found on development set:')
    print()
    print(clf.best_params_)
max_depth=None, class_weight='balanced') model.fit(xtrain, ytrain) s = model.score(xtest, ytest) print('Accuracy RFC with ', i, 'estimators: ', s) Ss.append(s) plt.plot(range(10, 50, 5), Ss) plt.ylabel('Accuracy') plt.xlabel('Number of estimators') plt.xticks(range(10, 50, 5)) plt.title('RTC accuacy vs nestimators') plt.show() model = gscv(RandomForestClassifier(class_weight='balanced'), { 'n_estimators': [10, 15, 20, 25, 30, 35, 40], 'max_depth': [3, 4, 5, 10, 20] }, cv=5, return_train_score=False) model.fit(X, y) bestRFCgscv = model.best_estimator_ ressRFC = model.cv_results_ resRFC = pd.DataFrame(ressRFC) # print(resRFC) print('Best parameters for RFC: ', bestRFCgscv) savegscv = resRFC.to_csv('gscvRFCWEIGHTED.csv', index=True) bestRTC = RandomForestClassifier(n_estimators=40, max_depth=20, class_weight='balanced') bestRTC.fit(xtrain, ytrain) bestypredRFC = bestRTC.predict(xtest)
plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predict') plt.savefig(name+"_confusion_matrix.jpg") plt.close() x_train, x_validation, x_test, x_train_SMOTE, x_train_undersample,\ y_train, y_validation, y_test, y_train_SMOTE, y_train_undersample = get_data() params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'penalty': ['l1', 'l2']} grid_normal = gscv(lgr(), params, cv=10) grid_undersample = gscv(lgr(), params, cv=10) grid_SMOTE = gscv(lgr(), params, cv=10) grid_normal.fit(x_train, y_train) grid_undersample.fit(x_train_undersample, y_train_undersample) grid_SMOTE.fit(x_train_SMOTE, y_train_SMOTE) result_normal = pd.DataFrame(grid_normal.cv_results_) result_undersample = pd.DataFrame(grid_undersample.cv_results_) result_SMOTE = pd.DataFrame(grid_SMOTE.cv_results_) #best1 = np.argmax(result1.mean_test_score.values) name = ["normal", "undersample", "SMOTE"] y_normal_predict = grid_normal.predict(x_test)
# Notebook-style inspection of the importances; the value is discarded.
model.feature_importances_

# Map column name -> importance for every feature above the 0.02 cut-off.
dick = {
    x.columns[idx]: weight
    for idx, weight in enumerate(model.feature_importances_)
    if weight > 0.02
}

# Keep only the selected columns and run le.fit_transform over each of them.
# NOTE(review): assumes `data` is a DataFrame holding the same columns as `x`.
refined_x = data[dick.keys()].apply(le.fit_transform)

xtrain, xtest, ytrain, ytest = tts(
    refined_x,
    y,
    test_size=0.3,
    random_state=13,
)

# Search space: split criterion, tree depth 1..9 and leaf-node cap 2..9.
param = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(1, 10),
    max_leaf_nodes=range(2, 10),
)

from sklearn.model_selection import GridSearchCV as gscv

cv = gscv(model, param, n_jobs=-1, scoring='accuracy', cv=5)
cv_model = cv.fit(xtrain, ytrain)
cv_model.best_params_
# Out[79]: {'criterion': 'entropy', 'max_depth': 5, 'max_leaf_nodes': 9}

# Refit a forest with the parameters reported by the search above.
rf_model = rfc(
    criterion='entropy',
    max_depth=5,
    max_leaf_nodes=9,
    random_state=13,
)
rf_model.fit(xtrain, ytrain)
final_pred = rf_model.predict(xtest)
accuracy(ytest, final_pred)
# Out[88]: 0.8503401360544217

######### Boosting #########
# Gradient boosting
kernel_initializer='uniform', activation='relu', input_dim=17)) model.add(Dense(units=9, kernel_initializer='uniform', activation='relu')) model.add(Dense(units=3, kernel_initializer='uniform', activation='linear')) model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) return model seed = 7 np.random.seed(seed) model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=5, verbose=50) from sklearn.model_selection import GridSearchCV as gscv batch_size = [32, 64, 100] epochs = [25, 50, 100, 200, 150] param_grid = dict(batch_size=batch_size, epochs=epochs) grid = gscv(estimator=model, param_grid=param_grid, verbose=60, n_jobs=-1) grid_search = grid.fit(X_train, Y_train) grid_search.best_score_ #0.7499999933772616 grid_search.best_params_ #{'batch_size': 100, 'epochs': 200}