def two_feature_classification():
    """Tune and evaluate a linear SVC on two features of the digits data.

    The task is binary: "is this digit a 1?". Only pixel features 20 and 59
    are used so the decision regions can be drawn in 2-D.

    Steps:
      1. Grid-search ``class_weight`` once per scoring metric (precision,
         recall, f1, roc_auc), printing the best setting and plotting the
         resulting decision regions in a 2x2 figure.
      2. Fit a ``class_weight='balanced'`` linear SVC and plot its decision
         regions and precision-recall curve, marking the point whose
         decision threshold is closest to zero.

    Returns nothing; output is printed text and matplotlib figures.
    """
    dataset = load_digits()
    X, y = dataset.data, dataset.target == 1
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Create a two-feature input vector matching the example plot.
    # We jitter the points (add a small amount of random noise) in case there
    # are areas in feature space where many instances have the same features.
    # NOTE(review): rand() is uniform on [0, 1), so the offset lies in
    # [-0.25, 0.75) and is not centred on zero -- confirm this is intended.
    jitter_delta = 0.25
    X_twovar_train = (X_train[:, [20, 59]]
                      + np.random.rand(X_train.shape[0], 2) - jitter_delta)
    X_twovar_test = (X_test[:, [20, 59]]
                     + np.random.rand(X_test.shape[0], 2) - jitter_delta)

    clf = SVC(kernel='linear').fit(X_twovar_train, y_train)
    grid_values = {'class_weight': ['balanced', {1: 2}, {1: 3}, {1: 4},
                                    {1: 5}, {1: 10}, {1: 20}, {1: 50}]}

    plt.figure(figsize=(9, 6))
    for i, eval_metric in enumerate(('precision', 'recall', 'f1', 'roc_auc')):
        grid_clf_custom = GridSearchCV(clf, param_grid=grid_values,
                                       scoring=eval_metric)
        grid_clf_custom.fit(X_twovar_train, y_train)
        print('Grid best parameter (max. {0}): {1}'
              .format(eval_metric, grid_clf_custom.best_params_))
        print('Grid best score ({0}): {1}'
              .format(eval_metric, grid_clf_custom.best_score_))
        plt.subplots_adjust(wspace=0.3, hspace=0.3)
        plot_class_regions_for_classifier_subplot(
            grid_clf_custom, X_twovar_test, y_test, None, None, None,
            plt.subplot(2, 2, i + 1))
        plt.title(eval_metric + '-oriented SVC')
    plt.tight_layout()
    plt.show()

    # The plots below are titled "class_weight = 'balanced'", so the model
    # they describe must actually be fitted with that setting.
    clf = SVC(kernel='linear', class_weight='balanced').fit(X_twovar_train,
                                                             y_train)

    y_scores = clf.decision_function(X_twovar_test)
    precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
    # Index of the threshold nearest 0, i.e. the classifier's default cut-off.
    closest_zero = np.argmin(np.abs(thresholds))
    closest_zero_p = precision[closest_zero]
    closest_zero_r = recall[closest_zero]

    plot_class_regions_for_classifier(clf, X_twovar_test, y_test)
    plt.title("SVC, class_weight = 'balanced', optimized for accuracy")
    plt.show()

    plt.figure()
    plt.xlim([0.0, 1.01])
    plt.ylim([0.0, 1.01])
    plt.title("Precision-recall curve: SVC, class_weight = 'balanced'")
    plt.plot(precision, recall, label='Precision-Recall Curve')
    plt.plot(closest_zero_p, closest_zero_r, 'o', markersize=12,
             fillstyle='none', c='r', mew=3)
    plt.xlabel('Precision', fontsize=16)
    plt.ylabel('Recall', fontsize=16)
    # gca() reuses the Axes just drawn on; a bare plt.axes() call would add a
    # new, empty Axes to the figure in current Matplotlib.
    plt.gca().set_aspect('equal')
    plt.show()
    print('At zero threshold, precision: {:.2f}, recall: {:.2f}'
          .format(closest_zero_p, closest_zero_r))
plt.show() # ## Kernelized Support Vector Machines # ### Classification # In[ ]: from sklearn.svm import SVC from adspy_shared_utilities import plot_class_regions_for_classifier X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0) # The default SVC kernel is radial basis function (RBF) plot_class_regions_for_classifier(SVC().fit(X_train, y_train), X_train, y_train, None, None, 'Support Vector Classifier: RBF kernel') # Compare decision boundries with polynomial kernel, degree = 3 plot_class_regions_for_classifier( SVC(kernel='poly', degree=3).fit(X_train, y_train), X_train, y_train, None, None, 'Support Vector Classifier: Polynomial kernel, degree = 3') # #### Support Vector Machine with RBF kernel: gamma parameter # In[ ]: from adspy_shared_utilities import plot_class_regions_for_classifier X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0) fig, subaxes = plt.subplots(3, 1, figsize=(4, 11))
# Precision-recall curve for a class-weight-balanced linear SVC trained on two
# jittered features (columns 20 and 59).
# NOTE(review): X_train / X_test / y_train / y_test, np, plt, SVC and the
# plotting helpers are defined outside this fragment.

# Create a two-feature input vector matching the example plot above.
# NOTE(review): rand() is uniform on [0, 1), so the jitter offset lies in
# [-0.25, 0.75) and is not centred on zero -- confirm this is intended.
jitter_delta = 0.25
X_twovar_train = (X_train[:, [20, 59]]
                  + np.random.rand(X_train.shape[0], 2) - jitter_delta)
X_twovar_test = (X_test[:, [20, 59]]
                 + np.random.rand(X_test.shape[0], 2) - jitter_delta)

clf = SVC(kernel='linear', class_weight='balanced').fit(X_twovar_train, y_train)

y_scores = clf.decision_function(X_twovar_test)

precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
# Index of the threshold nearest 0, i.e. the classifier's default cut-off.
closest_zero = np.argmin(np.abs(thresholds))
closest_zero_p = precision[closest_zero]
closest_zero_r = recall[closest_zero]

plot_class_regions_for_classifier(clf, X_twovar_test, y_test)
plt.title("SVC, class_weight = 'balanced', optimized for accuracy")
plt.show()

plt.figure()
plt.xlim([0.0, 1.01])
plt.ylim([0.0, 1.01])
plt.title("Precision-recall curve: SVC, class_weight = 'balanced'")
plt.plot(precision, recall, label='Precision-Recall Curve')
plt.plot(closest_zero_p, closest_zero_r, 'o', markersize=12,
         fillstyle='none', c='r', mew=3)
plt.xlabel('Precision', fontsize=16)
plt.ylabel('Recall', fontsize=16)
# gca() reuses the Axes just drawn on; a bare plt.axes() call would add a new,
# empty Axes to the figure in current Matplotlib.
plt.gca().set_aspect('equal')
plt.show()
print('At zero threshold, precision: {:.2f}, recall: {:.2f}'
      .format(closest_zero_p, closest_zero_r))
# plt.title('Sample binary classification problem with non-linearly separable classes') # plt.scatter(X_D2[:,0], X_D2[:,1], c=y_D2, marker='o', s=50, cmap=cmap_bold) # plt.show() # Breast cancer dataset for classification cancer = load_breast_cancer() (X_cancer, y_cancer) = load_breast_cancer(return_X_y=True) from sklearn.naive_bayes import GaussianNB from adspy_shared_utilities import plot_class_regions_for_classifier X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0) nbclf = GaussianNB().fit(X_train, y_train) plot_class_regions_for_classifier( nbclf, X_train, y_train, X_test, y_test, 'Gaussian Naive Bayes classifier: Dataset 1') X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0) nbclf = GaussianNB().fit(X_train, y_train) plot_class_regions_for_classifier( nbclf, X_train, y_train, X_test, y_test, 'Gaussian Naive Bayes classifier: Dataset 2') # Application to a real-world dataset X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)
# NOTE(review): this fragment is cut at both ends. The first two statements
# look like the tail of a loop whose header (defining nnclf, title and axis) is
# outside this view, and the final `for` loop stops after building the title.
plot_class_regions_for_classifier_subplot(nnclf, X_train, y_train,
                                         X_test, y_test, title, axis)
plt.tight_layout()

# two hidden layers
from adspy_shared_utilities import plot_class_regions_for_classifier

X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)

nnclf = MLPClassifier(hidden_layer_sizes = [10, 10], solver='lbfgs',
                     random_state = 0).fit(X_train, y_train)

plot_class_regions_for_classifier(nnclf, X_train, y_train, X_test, y_test,
                                 'Dataset 1: Neural net classifier, 2 layers, 10/10 units')

# Regularization parameter: alpha
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)

# One subplot per alpha value.
fig, subaxes = plt.subplots(4, 1, figsize=(6, 23))

for this_alpha, axis in zip([0.01, 0.1, 1.0, 5.0], subaxes):
    nnclf = MLPClassifier(solver='lbfgs', activation='tanh',
                         alpha=this_alpha,
                         hidden_layer_sizes=[100, 100],
                         random_state=0).fit(X_train, y_train)

    title = 'Dataset 2: NN classifier, alpha = {:.3f} '.format(this_alpha)
# Communities and Crime dataset (X_crime, y_crime) = load_crime_dataset() # ## Naive Bayes classifiers # In[ ]: from sklearn.naive_bayes import GaussianNB from adspy_shared_utilities import plot_class_regions_for_classifier X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0) nbclf = GaussianNB().fit(X_train, y_train) plot_class_regions_for_classifier( nbclf, X_train, y_train, X_test, y_test, 'Gaussian Naive Bayes classifier: Dataset 1') # In[ ]: X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0) nbclf = GaussianNB().fit(X_train, y_train) plot_class_regions_for_classifier( nbclf, X_train, y_train, X_test, y_test, 'Gaussian Naive Bayes classifier: Dataset 2') # ### Application to a real-world dataset # In[ ]:
# Script fragment: Gaussian Naive Bayes demo plus study notes.
# NOTE(review): ListedColormap, train_test_split and X_C2 / y_C2 are defined
# outside this view.
cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF', '#000000'])

### Naive Bayes Classifiers
# Pros:
#   Easy to understand;
#   Simple/efficient parameter estimation;
#   works well with high-dimensional data;
#   useful as a baseline comparison against more sophisticated methods;
# Cons:
#   Assumption that features are conditionally independent; as a result,
#   other classifier types often have better generalization performance
#   Their confidence estimates for predictions are not very accurate
from sklearn.naive_bayes import GaussianNB
from adspy_shared_utilities import plot_class_regions_for_classifier

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)

# fit() is an instance method: the estimator has to be constructed first.
clf = GaussianNB().fit(X_train, y_train)
plot_class_regions_for_classifier(clf, X_train, y_train, X_test, y_test,
                                 "Gaussian N.B. Classifier")

### Ensembles of Decision Trees
### Random Forest
from sklearn.ensemble import RandomForestClassifier
# Pros:
#   Widely used, excellent prediction performance on many problems
#   Doesn't require normalization of features or extensive parameter tuning
#   Handles a mixture of feature types, like decision trees
#   Easily parallelized across multiple CPUs
# Cons:
#   The resulting models are often difficult for humans to interpret
#   may not be good for VERY HIGH-Dimensional tasks(text classifiers)
#   compared to linear models(faster/accurate)
# Parameters:
#   n_estimators: number of trees, default =10
# NOTE(review): the parameter notes above are cut off at this point in the
# source.
# Script fragment: SVC decision regions for different kernels, gamma and C,
# followed by an accuracy check on the breast cancer data.
# NOTE(review): X_D2 / y_D2, X_cancer / y_cancer, plt, SVC and the plotting
# helpers are defined outside this view.

#SVC classification
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)

# NOTE(review): `subaxes` is not defined in this fragment before these two
# calls; presumably it is a single Axes created earlier -- confirm.
plot_class_regions_for_classifier_subplot(SVC().fit(X_train, y_train),
                                         X_train, y_train, None, None,
                                         'Default SVC', subaxes)
#by default uses the radial basis function
#plt.show()

plot_class_regions_for_classifier_subplot(
    SVC(kernel='poly', degree=3).fit(X_train, y_train),
    X_train, y_train, None, None, 'Degree 3 Poly SVC', subaxes)
#plt.show()

#SVC radial basis function gamma parameter
#higher value of gamma => points need to be closer to be classified into the same class
print('Radial Basis Function SVC with Variable Gamma')
fig, subaxes = plt.subplots(3, 1, figsize=(4, 11))
for this_gamma, subplot in zip([0.1, 1, 10], subaxes):
    clf = SVC(kernel='rbf', gamma=this_gamma).fit(X_train, y_train)
    title = 'SVC RBF Kernel\nGamma: {:.2f}'.format(this_gamma)
    # Drawing into a given Axes needs the *_subplot helper, as in the
    # gamma/C grid below.
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                             None, None, title, subplot)
    plt.tight_layout()

# Grid of gamma (rows) by C (columns).
fig, subaxes = plt.subplots(3, 4, figsize=(15, 10), dpi=50)
for this_gamma, this_axes in zip([0.01, 1.0, 10.0], subaxes):
    for this_c, subplot in zip([0.01, 1, 10, 100], this_axes):
        clf = SVC(kernel='rbf', gamma=this_gamma, C=this_c).fit(X_train, y_train)
        title = 'Support Vector Classifier: \nRBF kernel, gamma = {:.2f}, C = {:.2f}'.format(this_gamma, this_c)
        plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                                 None, None, title, subplot)
        plt.tight_layout()

#test SVM on breast cancer data
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)
clf = SVC(C=10).fit(X_train, y_train)
print('Accuracy training: {:.3f}'.format(clf.score(X_train, y_train)))
print('Accuracy test: {:.3f}'.format(clf.score(X_test, y_test)))
### multi-class classfication ### 1 to all, namely if class1 or all else, if class 2 or all else, if class3 or all else,...choose the biggest chance ### Kernelized support vector machines # transform current data to higher dimentional data and may easier to group """ radial basis function kernel(Gausian kerneal) the kernelized SVM can compute these more complex decision boundaries just in terms of similarity calculations between pairs of points in the high dimensional space where the transformed feature representation is implicit. This similarity function which mathematically is a kind of dot product is the kernel in kernelized """ from sklearn.svm import SVC from adspy_shared_utilities import(plot_class_regions_for_classifier) X_train, X_test, y_train, y_test = (train_test_split(X_D2, y_D2, random_state = 0)) plot_class_regions_for_classifier(SVC().fit(X_train, y_train), X_train, y_train, None, None, "SV CLassifier: RBF kernel") plot_class_regions_for_classifier(SVC(kernel="poly", degree =3).fit(X_train, y_train), X_train, y_train, None, None, "SV CLassifier: Poly(3) kernel") # RBF kernel has gamma parameters, exp(-gamma(x-x')2), Gamma controls how far the influence of # a single trending example reaches, which in turn affects how tightly the decision boundaries # end up surrounding points in the input space. # Small gamma means a larger similarity radius. So that points farther apart are considered similar. # Which results in more points being group together and smoother decision boundaries. ###***large gamma: more tight, complex boundary, each training point importance ###***small C: more regularization, get larger margin for DB, even more point wrong labeled fig, subaxes = plt.subplots(1, 3, figsize=(11,4)) for this_gamma, subplot in zip([0.01, 1.0,10.0], subaxes): clf = SVC(kernel = "rbf", gamma = this_gamma).fit(X_train, y_train)
def neural_network():
    """Run the multi-layer perceptron demos.

    In order: one-hidden-layer classifiers of growing width, a two-layer
    classifier, an alpha (regularization) sweep and an activation-function
    sweep on a synthetic two-class blob dataset; an MLP regression sweep over
    activation and alpha on a one-feature synthetic problem; and finally an
    MLP classifier on min-max scaled breast cancer data, whose train/test
    accuracies are printed. Returns nothing.
    """
    # Synthetic dataset 1: single hidden layer
    blob_X, blob_labels = make_blobs(n_samples=100, n_features=2, centers=8,
                                     cluster_std=1.3, random_state=4)
    blob_y = blob_labels % 2  # fold the eight blob ids into two classes
    X_train, X_test, y_train, y_test = train_test_split(blob_X, blob_y,
                                                        random_state=0)

    _, panels = plt.subplots(3, 1, figsize=(6, 18))
    for hidden_units, panel in zip([1, 10, 100], panels):
        net = MLPClassifier(hidden_layer_sizes=[hidden_units],
                            solver='lbfgs', random_state=0)
        net.fit(X_train, y_train)
        caption = 'Dataset 1: Neural net classifier, 1 layer, {} units'.format(hidden_units)
        plot_class_regions_for_classifier_subplot(net, X_train, y_train,
                                                 X_test, y_test,
                                                 caption, panel)
        plt.tight_layout()

    # Synthetic dataset 1: two hidden layers
    net = MLPClassifier(hidden_layer_sizes=[10, 10], solver='lbfgs',
                        random_state=0)
    net.fit(X_train, y_train)
    plot_class_regions_for_classifier(
        net, X_train, y_train, X_test, y_test,
        'Dataset 1: Neural net classifier, 2 layers, 10/10 units')

    # Regularization parameter: alpha
    _, panels = plt.subplots(4, 1, figsize=(6, 23))
    for alpha_value, panel in zip([0.01, 0.1, 1.0, 5.0], panels):
        net = MLPClassifier(solver='lbfgs', activation='tanh',
                            alpha=alpha_value,
                            hidden_layer_sizes=[100, 100], random_state=0)
        net.fit(X_train, y_train)
        caption = 'Dataset 2: NN classifier, alpha = {:.3f} '.format(alpha_value)
        plot_class_regions_for_classifier_subplot(net, X_train, y_train,
                                                 X_test, y_test,
                                                 caption, panel)
        plt.tight_layout()

    # The effect of different choices of activation function
    X_train, X_test, y_train, y_test = train_test_split(blob_X, blob_y,
                                                        random_state=0)
    _, panels = plt.subplots(3, 1, figsize=(6, 18))
    for activation_name, panel in zip(['logistic', 'tanh', 'relu'], panels):
        net = MLPClassifier(solver='lbfgs', activation=activation_name,
                            alpha=0.1, hidden_layer_sizes=[10, 10],
                            random_state=0)
        net.fit(X_train, y_train)
        caption = 'Dataset 2: NN classifier, 2 layers 10/10, {} activation function'.format(activation_name)
        plot_class_regions_for_classifier_subplot(net, X_train, y_train,
                                                 X_test, y_test,
                                                 caption, panel)
        plt.tight_layout()

    # Neural networks: Regression
    plt.figure()
    plt.title('Sample regression problem with one input variable')
    reg_X, reg_y = make_regression(n_samples=100, n_features=1,
                                   n_informative=1, bias=150.0,
                                   noise=30, random_state=0)
    plt.scatter(reg_X, reg_y, marker='o', s=50)
    plt.show()

    _, panel_grid = plt.subplots(2, 3, figsize=(11, 8), dpi=70)
    query_points = np.linspace(-3, 3, 50).reshape(-1, 1)
    # Every fifth sample only, as a small training set.
    X_train, X_test, y_train, y_test = train_test_split(reg_X[0::5],
                                                        reg_y[0::5],
                                                        random_state=0)

    for panel_row, activation_name in zip(panel_grid, ['tanh', 'relu']):
        for alpha_value, panel in zip([0.0001, 1.0, 100], panel_row):
            regressor = MLPRegressor(hidden_layer_sizes=[100, 100],
                                     activation=activation_name,
                                     alpha=alpha_value, solver='lbfgs')
            regressor.fit(X_train, y_train)
            predictions = regressor.predict(query_points)
            panel.set_xlim([-2.5, 0.75])
            panel.plot(query_points, predictions, '^', markersize=10)
            panel.plot(X_train, y_train, 'o')
            panel.set_xlabel('Input feature')
            panel.set_ylabel('Target value')
            panel.set_title('MLP regression\nalpha={}, activation={})'
                            .format(alpha_value, activation_name))
            plt.tight_layout()

    # Application to real-world dataset for classification
    cancer = load_breast_cancer()
    cancer_X, cancer_y = load_breast_cancer(return_X_y=True)

    X_train, X_test, y_train, y_test = train_test_split(cancer_X, cancer_y,
                                                        random_state=0)
    scaler = MinMaxScaler()
    # Scaling is learned from the training split only, then applied to test.
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    clf = MLPClassifier(hidden_layer_sizes=[100, 100], alpha=5.0,
                        random_state=0, solver='lbfgs')
    clf.fit(train_scaled, y_train)

    train_accuracy = clf.score(train_scaled, y_train)
    test_accuracy = clf.score(test_scaled, y_test)
    print('Breast cancer dataset')
    print('Accuracy of NN classifier on training set: {:.2f}'.format(train_accuracy))
    print('Accuracy of NN classifier on test set: {:.2f}'.format(test_accuracy))
def naive_bayes():
    """Build the sample datasets and evaluate Gaussian Naive Bayes on them.

    Plots the synthetic regression and classification datasets, fits
    GaussianNB to the two synthetic classification datasets (plotting the
    decision regions), then fits it to the breast cancer data and prints
    train/test accuracy. Returns nothing.
    """
    cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF', '#000000'])

    # Fruits dataset. NOTE(review): none of these locals are used later in
    # this function; kept because reading the file is an observable step.
    fruits = pd.read_table('fruit_data_with_colors.txt')
    feature_names_fruits = ['height', 'width', 'mass', 'color_score']
    X_fruits = fruits[feature_names_fruits]
    y_fruits = fruits['fruit_label']
    target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']
    X_fruits_2d = fruits[['height', 'width']]
    y_fruits_2d = fruits['fruit_label']

    # Synthetic dataset for simple regression
    plt.figure()
    plt.title('Sample regression problem with one input variable')
    X_R1, y_R1 = make_regression(n_samples=100, n_features=1,
                                 n_informative=1, bias=150.0,
                                 noise=30, random_state=0)
    plt.scatter(X_R1, y_R1, marker='o', s=50)
    plt.show()

    # Synthetic dataset for more complex regression
    plt.figure()
    plt.title('Complex regression problem with one input variable')
    X_F1, y_F1 = make_friedman1(n_samples=100, n_features=7, random_state=0)
    plt.scatter(X_F1[:, 2], y_F1, marker='o', s=50)
    plt.show()

    # Synthetic dataset for classification (binary)
    plt.figure()
    plt.title('Sample binary classification problem with two informative features')
    X_C2, y_C2 = make_classification(n_samples=100, n_features=2,
                                     n_redundant=0, n_informative=2,
                                     n_clusters_per_class=1, flip_y=0.1,
                                     class_sep=0.5, random_state=0)
    plt.scatter(X_C2[:, 0], X_C2[:, 1], marker='o', c=y_C2, s=50,
                cmap=cmap_bold)
    plt.show()

    X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2,
                                                        random_state=0)
    nbclf = GaussianNB().fit(X_train, y_train)
    plot_class_regions_for_classifier(
        nbclf, X_train, y_train, X_test, y_test,
        'Gaussian Naive Bayes classifier: Dataset 1')

    # more difficult synthetic dataset for classification (binary)
    # with classes that are not linearly separable
    X_D2, y_D2 = make_blobs(n_samples=100, n_features=2, centers=8,
                            cluster_std=1.3, random_state=4)
    y_D2 = y_D2 % 2
    plt.figure()
    plt.title('Sample binary classification problem with non-linearly separable classes')
    plt.scatter(X_D2[:, 0], X_D2[:, 1], c=y_D2, marker='o', s=50,
                cmap=cmap_bold)
    plt.show()

    X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2,
                                                        random_state=0)
    nbclf = GaussianNB().fit(X_train, y_train)
    plot_class_regions_for_classifier(
        nbclf, X_train, y_train, X_test, y_test,
        'Gaussian Naive Bayes classifier: Dataset 2')

    # Breast cancer dataset for classification
    (X_cancer, y_cancer) = load_breast_cancer(return_X_y=True)
    # Split the cancer data before fitting; without this the scores printed
    # under "Breast cancer dataset" would come from the Dataset 2 split above.
    X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,
                                                        random_state=0)
    nbclf = GaussianNB().fit(X_train, y_train)
    print('Breast cancer dataset')
    print('Accuracy of GaussianNB classifier on training set: {:.2f}'
          .format(nbclf.score(X_train, y_train)))
    print('Accuracy of GaussianNB classifier on test set: {:.2f}'
          .format(nbclf.score(X_test, y_test)))

    # Communities and Crime dataset
    (X_crime, y_crime) = load_crime_dataset()
    # NOTE(review): no model is fitted on the crime data, so the two scores
    # below repeat the breast cancer results. Whether GaussianNB suits this
    # target depends on what load_crime_dataset returns -- confirm before
    # adding a split and fit here.
    print('Crime dataset')
    print('Accuracy of GaussianNB classifier on training set: {:.2f}'
          .format(nbclf.score(X_train, y_train)))
    print('Accuracy of GaussianNB classifier on test set: {:.2f}'
          .format(nbclf.score(X_test, y_test)))
def kernal():
    """Run the kernelized SVM demos.

    Plots SVC decision regions on a synthetic blob dataset for the default
    RBF kernel, a degree-3 polynomial kernel, an RBF gamma sweep and an RBF
    gamma-by-C grid. Then prints train/test accuracy of an RBF SVC (C=10) on
    the breast cancer data, first with raw features and then with min-max
    scaled features. Returns nothing.
    """
    # NOTE(review): the labels are the eight raw blob ids; other demos in this
    # file fold them into two classes with `y_D2 % 2` -- confirm whether a
    # multi-class problem is intended here.
    X_D2, y_D2 = make_blobs(n_samples=100, n_features=2, centers=8,
                            cluster_std=1.3, random_state=4)
    # One split serves all three plotting sections; repeating it with the same
    # random_state would return the same arrays.
    X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2,
                                                        random_state=0)

    plot_class_regions_for_classifier(SVC().fit(X_train, y_train),
                                     X_train, y_train, None, None,
                                     'Support Vector Classifier: RBF kernel')
    plot_class_regions_for_classifier(
        SVC(kernel='poly', degree=3).fit(X_train, y_train),
        X_train, y_train, None, None,
        'Support Vector Classifier: Polynomial kernel, degree = 3')

    # RBF kernel: effect of gamma
    fig, subaxes = plt.subplots(3, 1, figsize=(4, 11))
    for this_gamma, subplot in zip([0.01, 1.0, 10.0], subaxes):
        clf = SVC(kernel='rbf', gamma=this_gamma).fit(X_train, y_train)
        title = 'Support Vector Classifier: \nRBF kernel, gamma = {:.2f}'.format(
            this_gamma)
        plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                                 None, None, title, subplot)
        plt.tight_layout()

    # RBF kernel: gamma (rows) by C (columns)
    fig, subaxes = plt.subplots(3, 4, figsize=(15, 10), dpi=50)
    for this_gamma, this_axis in zip([0.01, 1, 5], subaxes):
        for this_C, subplot in zip([0.1, 1, 15, 250], this_axis):
            title = 'gamma = {:.2f}, C = {:.2f}'.format(this_gamma, this_C)
            clf = SVC(kernel='rbf', gamma=this_gamma,
                      C=this_C).fit(X_train, y_train)
            plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                                     X_test, y_test, title,
                                                     subplot)
            plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

    # Breast cancer data: raw features
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,
                                                        random_state=0)
    clf = SVC(C=10).fit(X_train, y_train)
    print('Breast cancer dataset (unnormalized features)')
    print('Accuracy of RBF-kernel SVC on training set: {:.2f}'.format(
        clf.score(X_train, y_train)))
    print('Accuracy of RBF-kernel SVC on test set: {:.2f}'.format(
        clf.score(X_test, y_test)))

    # Breast cancer data: same split, features rescaled with MinMaxScaler.
    # The scaler is fitted on the training split only.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf = SVC(C=10).fit(X_train_scaled, y_train)
    # Label corrected: this run uses the scaled features.
    print('Breast cancer dataset (normalized with MinMaxScaler)')
    print('Accuracy of RBF-kernel SVC on training set: {:.2f}'.format(
        clf.score(X_train_scaled, y_train)))
    print('Accuracy of RBF-kernel SVC on test set: {:.2f}'.format(
        clf.score(X_test_scaled, y_test)))