# NOTE(review): whitespace-collapsed fragment of an SVM demo script.  All of its
# statements were flattened onto the single physical line below, so Python
# parses everything after the first '#' as a comment and only
# `plt.title('Input data')` actually executes.
#
# Read as the multi-line script it was flattened from, the text:
#   1. splits X, y with model_selection.train_test_split
#      (test_size=0.25, random_state=5);
#   2. fits SVC(kernel='rbf') on the training split (the linear and degree-3
#      polynomial kernels are left as commented-out alternatives);
#   3. plots the classifier over the training and the test split through
#      utilities.plot_classifier, and stores the test predictions in
#      y_test_pred;
#   4. builds 'Class-<n>' display names from set(y) and starts printing the
#      classification report for the training split.
#
# The fragment is cut off inside the final `print(classification_report(...`
# call; the remainder of that statement is not visible here, so the line is
# left untouched rather than guessed at.
plt.title('Input data') ############################################### # Train test split and SVM training from sklearn import model_selection from sklearn.svm import SVC X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.25, random_state=5) # params = {'kernel': 'linear'} # params = {'kernel': 'poly', 'degree': 3} params = {'kernel': 'rbf'} classifier = SVC(**params) classifier.fit(X_train, y_train) utilities.plot_classifier(classifier, X_train, y_train, 'Training dataset') y_test_pred = classifier.predict(X_test) utilities.plot_classifier(classifier, X_test, y_test, 'Test dataset') ############################################### # Evaluate classifier performance from sklearn.metrics import classification_report target_names = ['Class-' + str(int(i)) for i in set(y)] print("\n" + "#" * 30) print("\nClassifier performance on training dataset\n") print( classification_report(y_train, classifier.predict(X_train),
# Build a nonlinear classifier using an SVM.
#
# Kernel alternatives kept for experimentation:
#   params = {'kernel': 'linear'}              # linear classifier
#   params = {'kernel': 'poly', 'degree': 3}   # polynomial function
# Active choice: radial basis function (RBF) kernel.
params = {'kernel': 'rbf'}

classifier = SVC(**params)
classifier.fit(X_train, y_train)
# Training-set plot is intentionally disabled:
# utilities.plot_classifier(classifier, X_train, y_train, 'Training dataset')

# Predict the held-out samples with the model fitted on the training split.
# The classifier must NOT be re-fitted on (X_test, y_test) here: doing so
# leaks the test data into the model, so the test report would be measured
# on data the model was trained on and the "training" report would describe
# a model that never saw the training split.
y_test_pred = classifier.predict(X_test)
utilities.plot_classifier(classifier, X_test, y_test, 'Test dataset')

# Evaluate classifier performance
from sklearn.metrics import classification_report

# classification_report lists the labels in sorted order, so the display
# names are built from the sorted labels to keep names and rows aligned.
target_names = ['Class-' + str(int(label)) for label in sorted(set(y))]

print('#'*30 + '\n')
print('\nClassifier Performance on Training Dataset\n')
print(classification_report(y_train, classifier.predict(X_train),
                            target_names=target_names))
print('#'*30 + '\n')

print('#'*30 + '\n')
print('\nClassifier Performance on Test Dataset\n')
print(classification_report(y_test, y_test_pred, target_names=target_names))
print('#'*30 + '\n')
# Train an RBF-kernel SVM, then report for a handful of hand-picked points
# (a) their signed distance from the decision boundary and (b) the class
# probabilities estimated by a probability-enabled SVM.

# sklearn.cross_validation is absent from newer scikit-learn releases;
# train_test_split is provided by sklearn.model_selection.
from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=5)

params = {'kernel': 'rbf'}
classifier = SVC(**params)
classifier.fit(X_train, y_train)

###############################################
# Measure distance from the boundary

input_datapoints = np.array([[2, 1.5], [8, 9], [4.8, 5.2], [4, 4],
                             [2.5, 7], [7.6, 2], [5.4, 5.9]])

print("\nDistance from the boundary:")
for point in input_datapoints:
    # The estimator expects a 2-D (n_samples, n_features) array, so the
    # single sample is reshaped into one row before scoring.
    print(point, '-->', classifier.decision_function(point.reshape(1, -1))[0])

# Confidence measure: probability estimates are only available when the
# model is fitted with probability=True, hence the second classifier.
params = {'kernel': 'rbf', 'probability': True}
classifier = SVC(**params)
classifier.fit(X_train, y_train)

print("\nConfidence measure:")
for point in input_datapoints:
    print(point, '-->', classifier.predict_proba(point.reshape(1, -1))[0])

# NOTE(review): the last argument is the string 'True' (truthy), kept exactly
# as before; presumably utilities.plot_classifier treats it as a flag -
# confirm before replacing it with the boolean True.
utilities.plot_classifier(classifier, input_datapoints,
                          [0] * len(input_datapoints),
                          'Input datapoints', 'True')
plt.show()
# Scatter-plot the two input classes: class_0 as filled squares, class_1 as
# hollow squares.
plt.figure()
plt.scatter(class_0[:, 0], class_0[:, 1],
            facecolors='black', edgecolors='black', marker='s')
plt.scatter(class_1[:, 0], class_1[:, 1],
            facecolors='None', edgecolors='black', marker='s')
plt.title('Input data')

###############################################
# Train test split

# sklearn.cross_validation is absent from newer scikit-learn releases;
# train_test_split is provided by sklearn.model_selection.
from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=5)

params = {'kernel': 'linear'}
# Alternative that re-weights the classes to compensate for imbalance
# (the former 'auto' option is spelled 'balanced' in current scikit-learn):
# params = {'kernel': 'linear', 'class_weight': 'balanced'}
classifier = SVC(**params)
classifier.fit(X_train, y_train)
utilities.plot_classifier(classifier, X_train, y_train, 'Training dataset')

# Kept at module level: code following this fragment may read y_test_pred.
y_test_pred = classifier.predict(X_test)
utilities.plot_classifier(classifier, X_test, y_test, 'Test dataset')

###############################################
# Evaluate classifier performance

from sklearn.metrics import classification_report

# classification_report lists the labels in sorted order, so the display
# names are built from the sorted labels to keep names and rows aligned.
target_names = ['Class-' + str(int(label)) for label in sorted(set(y))]

print("\n" + "#"*30)
print("\nClassifier performance on training dataset\n")
print(classification_report(y_train, classifier.predict(X_train),
                            target_names=target_names))
print("#"*30 + "\n")
# Load the dataset, train an RBF-kernel SVM, then report for a handful of
# hand-picked points their signed distance from the decision boundary and
# the class probabilities estimated by a probability-enabled SVM.
X, y = utilities.load_data(input_file)

###############################################
# Train test split

# sklearn.cross_validation is absent from newer scikit-learn releases;
# train_test_split is provided by sklearn.model_selection.
from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=5)

params = {'kernel': 'rbf'}
classifier = SVC(**params)
classifier.fit(X_train, y_train)

###############################################
# Measure distance from the boundary

input_datapoints = np.array([[2, 1.5], [8, 9], [4.8, 5.2], [4, 4],
                             [2.5, 7], [7.6, 2], [5.4, 5.9]])

print("\nDistance from the boundary:")
for sample in input_datapoints:
    # Estimators score 2-D (n_samples, n_features) input; turn the single
    # sample into a one-row matrix first.
    row = sample.reshape(1, -1)
    print(sample, '-->', classifier.decision_function(row)[0])

# Confidence measure: predict_proba requires a model fitted with
# probability=True, so a second classifier is trained for it.
params = {'kernel': 'rbf', 'probability': True}
classifier = SVC(**params)
classifier.fit(X_train, y_train)

print("\nConfidence measure:")
for sample in input_datapoints:
    row = sample.reshape(1, -1)
    print(sample, '-->', classifier.predict_proba(row)[0])

# NOTE(review): the last argument is the string 'True' (truthy), kept exactly
# as before; presumably utilities.plot_classifier treats it as a flag -
# confirm before replacing it with the boolean True.
utilities.plot_classifier(classifier, input_datapoints,
                          [0]*len(input_datapoints),
                          'Input datapoints', 'True')
plt.show()
# NOTE(review): whitespace-collapsed fragment of a class-imbalance SVM demo,
# incomplete at both ends: it opens with the tail of a call whose start is
# not visible (`edgecolors='black', marker='s')`) and closes on a
# `for i in class_0:` header whose body is not visible.  It is therefore left
# untouched rather than guessed at.
#
# Read as the multi-line script it was flattened from, the visible part:
#   1. titles and shows the input-data figure;
#   2. splits X, y with model_selection.train_test_split
#      (test_size=0.25, random_state=5);
#   3. fits SVC(kernel='linear', class_weight='balanced', probability=True,
#      gamma='auto') on the training split and plots it through
#      utilities.plot_classifier;
#   4. prints the "Confidence measure:" heading and begins a loop over class_0.
# The triple-quoted texts are bare string expressions used as explanatory
# notes; they have no runtime effect.
# NOTE(review): gamma is documented as the kernel coefficient for the 'rbf',
# 'poly' and 'sigmoid' kernels, so it presumably has no effect with
# kernel='linear' - confirm.
edgecolors='black', marker='s') plt.title('Input data') plt.show() from sklearn import model_selection X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.25, random_state=5) '''The class_weight parameter will count the number of datapoints in each class to adjust the weights so that the imbalance doesn't adversely affect the performance.''' '''Probability counting ''' params = {'kernel': 'linear', 'class_weight': 'balanced', 'probability': True} classifier = SVC(**params, gamma='auto') classifier.fit(X_train, y_train) utilities.plot_classifier(classifier, X_train, y_train, 'Training dataset') plt.show() '''C is a hyperparameter that determines the penalty for the incorrect classification of an observation. So, we used a weight for the classes to manage unbalanced classes. In this way, we will assign a new value of C to the classes, defined as follows: C(i) = C * w(i) Where C is the penalty, w(i) is a weight inversely proportional to class i's frequency, and C(i) is the C value for class i. This method suggests increasing the penalty to classify the less represented classes so as to prevent them from being outclassed by the most represented class. In the scikit-learn library, when using SVC, we can set the values for Ci automaticallyby setting class_weight='balanced'.''' print("Confidence measure:") for i in class_0:
# Specify L1 regularization.  The solver is named explicitly because the
# current scikit-learn default ('lbfgs') does not support the L1 penalty and
# raises a ValueError; 'liblinear' supports it and was the previous default,
# so older releases behave exactly as before.
lr = LogisticRegression(penalty='l1', solver='liblinear')

# Instantiate the GridSearchCV object and run the search over the inverse
# regularization strength C.
searcher = GridSearchCV(lr, {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]})
searcher.fit(X_train, y_train)

# Report the best parameters
print("Best CV params", searcher.best_params_)

# Find the number of nonzero coefficients (selected features); L1
# regularization drives the coefficients of unused features to exactly zero.
best_lr = searcher.best_estimator_
coefs = best_lr.coef_
print("Total number of features:", coefs.size)
print("Number of selected features:", np.count_nonzero(coefs))

plot_classifier(X_train, y_train, searcher, proba=True)

# Get predicted probabilities
proba = searcher.predict_proba(X_train)

# Sort the example indices by their maximum probability (ascending, so the
# least confident predictions come first).
proba_inds = np.argsort(np.max(proba, axis=1))


def show_digit(proba_inds):
    """Display the image stored at ``digits.images[proba_inds]``.

    Args:
        proba_inds: Index used to select the image from ``digits.images``.
            Note that this parameter shadows the module-level array of the
            same name.

    Returns:
        Whatever ``plt.show()`` returns.
    """
    plt.gray()
    plt.matshow(digits.images[proba_inds])
    return plt.show()
# Train a probability-enabled RBF-kernel SVM on 'data_multivar.txt' and print
# the estimated class probabilities for a handful of hand-picked points.
from sklearn.svm import SVC
# sklearn.cross_validation is absent from newer scikit-learn releases;
# train_test_split is provided by sklearn.model_selection.
from sklearn import model_selection

filename = 'data_multivar.txt'
X, y = utilities.load_data(filename)

print('-----svm--------')
# NOTE(review): train_size=0.25 trains on a quarter of the data and holds out
# the rest; the usual split is test_size=0.25 - confirm this is intended.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.25, random_state=5)

# params = {'kernel': 'rbf'}
# probability=True is required for predict_proba below.
params = {'kernel': 'rbf', 'probability': True}
classifier = SVC(**params)
classifier.fit(X_train, y_train)
utilities.plot_classifier(classifier, X_train, y_train, 'Training dataset')
plt.show()

########################
# Confidence measure for individual datapoints

input_datapoints = np.array([[2, 1.5], [8, 9], [4.8, 5.2], [4, 4],
                             [2.5, 7], [7.6, 2], [5.4, 5.9]])

# The loop prints class probabilities, not boundary distances, so the heading
# says so (it previously read "Distance from the boundary").
print("\n Confidence measure")
for point in input_datapoints:
    # Each sample is reshaped to a one-row 2-D array, as the estimator expects.
    row = point.reshape(1, -1)
    # To print the signed distance from the boundary instead:
    # print(point, '-->', classifier.decision_function(row)[0])
    print(point, '-->', classifier.predict_proba(row)[0])

utilities.plot_classifier(classifier, input_datapoints,
                          [0] * len(input_datapoints),
                          'Input datapoints', True)
plt.show()