async def do_run_async(self):
    """Compare three classifiers with a voting ensemble built from them.

    Loads the training/test images and labels through the parent-class
    loaders, flattens every image into one feature row, then fits an SGD
    classifier, a random forest, a k-nearest-neighbours classifier and a
    hard-voting ensemble of the three, printing each model's accuracy on
    the test set. Finally the ensemble is switched to soft voting.

    Returns:
        None. Accuracy scores are reported with ``print``.
    """
    training_set = super().load_train_images()
    training_labels = super().load_train_labels()
    test_set = super().load_test_images()
    test_labels = super().load_test_labels()

    # The classifiers need 2D input, so each 3D (m, 28, 28) image stack is
    # flattened to (m, 784). The sample count is taken from the data rather
    # than hard-coded, so a subset or differently sized data set also works.
    training_set_tr = training_set.reshape((len(training_set), -1))
    test_set_tr = test_set.reshape((len(test_set), -1))

    # Hard voting: the class with the higher number of votes is output.
    sgd_clf = SGDClassifier()
    rnd_clf = RandomForestClassifier()
    # Original author's note: this one is very slow on the MNIST data set.
    k_clf = KNeighborsClassifier()
    voting_clf = VotingClassifier(
        estimators=[('sgd', sgd_clf), ('rf', rnd_clf), ('k', k_clf)],
        voting='hard',
    )

    # Compute and compare the accuracy scores. The voting classifier is
    # expected to score better than each individual classifier.
    for clf in [sgd_clf, rnd_clf, k_clf, voting_clf]:
        clf.fit(training_set_tr, training_labels)
        predictions = clf.predict(test_set_tr)
        print(type(clf).__name__, accuracy_score(test_labels, predictions))

    # Soft voting: the class with the highest probability averaged across
    # all classifiers is output. Every classifier in the ensemble must be
    # able to predict probabilities (predict_proba).
    # NOTE(review): SGDClassifier with its default loss does not appear to
    # expose predict_proba, so predicting in soft mode may fail unless a
    # probabilistic loss is configured -- confirm before relying on this.
    voting_clf.voting = 'soft'
]
# (The bracket above closes a list literal that begins before this fragment.)

# Pair every estimator with a name and hand the pairs to VotingClassifier.
named_estimators = [
    ("forest_clf", forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svc_clf", svc_clf),
    ("mlp_clf", mlp_clf)
]
voting_clf = VotingClassifier(named_estimators)
voting_clf.fit(X_train, y_train)
# Score on the validation set (the result is discarded unless run interactively).
voting_clf.score(X_validation, y_validation)

# By default the voting classifier uses hard voting. To change to soft voting
# we simply change the `voting` attribute; per the original author there is
# no need to train the model again.
# NOTE(review): soft voting needs predict_proba on every fitted estimator --
# confirm that svc_clf was created with probability support.
voting_clf.voting = "soft"
voting_clf.score(X_test, y_test)

# Stacking ensemble: one column of validation-set predictions per estimator.
X_validation_predictions = np.empty((len(X_validation), len(estimators)), dtype=np.float32)
for index, estimator in enumerate(estimators):
    X_validation_predictions[:, index] = estimator.predict(X_validation)

# The blender is a random forest trained on the estimators' predictions,
# with the validation labels as targets.
random_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
random_forest_blender.fit(X_validation_predictions, y_validation)  # train on the output of the previous predictions
print(random_forest_blender.oob_score_)

# Test on the testing set: allocate the matching prediction matrix for X_test.
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)
However, it did not update the list of _trained_ estimators:

    voting_clf.estimators_

So we can either fit the `VotingClassifier` again, or just remove the SVM from the list of trained estimators:

    del voting_clf.estimators_[2]

Now let's evaluate the `VotingClassifier` again:

    voting_clf.score(X_val, y_val)

A bit better! The SVM was hurting performance. Now let's try using a soft voting classifier. We do not actually need to retrain the classifier, we can just set `voting` to `"soft"`:

    voting_clf.voting = "soft"
    voting_clf.score(X_val, y_val)

Nope, hard voting wins in this case.

_Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?_

    voting_clf.voting = "hard"
    voting_clf.score(X_test, y_test)

    [estimator.score(X_test, y_test) for estimator in voting_clf.estimators_]

The voting classifier only very slightly reduced the error rate of the best model in this case.

## 9. Stacking Ensemble
#evaluation of validation set voting.score(X_val,y_val) #for each estimator in voting classifier evalute on validation set [estimator.score(X_val,y_val) for estimator in voting.estimators_] voting.set_params(svm=None) #change the param value of svm to none voting.estimators_ #or deleting svm classifier as its outperforms and affect the votiong model del voting.estimators_[1] voting.score(X_val, y_val) voting.voting = 'soft' voting.voting = 'hard' #test set [estimator.score(X_test,y_test) for estimator in voting.estimators_] ''' Exercise: Run the individual classifiers from the previous exercise to make predictions on the validation set, and create a new training set with the resulting predictions: each training instance is a vector containing the set of predictions from all your classifiers for an image, and the target is the image's class. Train a classifier on this new training set. ''' #making empty array for estimator prediction as datatype float32
# Split by position: the leading `train_ratio` share of the rows is used for
# training and the remainder for testing. The boundary is computed once so
# that all four slices are guaranteed to agree.
split_index = int(len(X) * train_ratio)
X_train = X[:split_index, :]
y_train = y[:split_index]
X_test = X[split_index:, :]
y_test = y[split_index:]

log_clf = LogisticRegression(multi_class='multinomial', solver='sag')
rnd_clf = RandomForestClassifier()
# probability=True so the SVC offers predict_proba for the soft-voting pass.
svm_clf = SVC(probability=True)
estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)]

# Hard voting: report the accuracy of each classifier and of the ensemble.
print('\n-------Hard Voting-------')
voting_clf = VotingClassifier(estimators=estimators, voting='hard')

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

# Soft voting: same comparison, with predictions derived from probabilities.
print('\n-------Soft Voting-------')
voting_clf.voting = 'soft'
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_proba = clf.predict_proba(X_test)
    # predict_proba columns are ordered like clf.classes_, so the arg-max
    # column index must be mapped back to the real label. Comparing the raw
    # index with y_test is only right when the labels are exactly 0..n-1.
    y_pred = clf.classes_[np.argmax(y_proba, axis=1)]
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))