def benchmark(clf):
    """Train ``clf``, predict on the test split, and print timing and quality.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test`` and
    ``categories`` data together with the ``print_report`` and ``print_cm``
    flags.

    Returns:
        A ``(score, train_time, test_time)`` tuple: the ``metrics.f1_score``
        of the test predictions, followed by the differences of ``time()``
        readings taken around ``fit`` and around ``predict``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_start = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    predict_start = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_start
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score: %0.3f" % score)

    # Sparsity can only be reported for estimators that expose ``coef_``.
    if hasattr(clf, 'coef_'):
        nonzero_rows = clf.coef_.nonzero()[0]
        print("non-zero coef: %d" % nonzero_rows.shape[0])
        print("")

    if print_report:
        print("classification report:")
        report = metrics.classification_report(y_test, pred,
                                               target_names=categories)
        print(report)

    if print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print("")
    return score, train_time, test_time
def benchmark(clf_class, params, name):
    """Instantiate ``clf_class(**params)``, fit it, and report on the test set.

    Uses the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test`` and
    ``news_test``.  Prints fit/predict durations, a classification report and
    the confusion matrix, then draws the matrix with ``pl.matshow``.

    Args:
        clf_class: Callable invoked as ``clf_class(**params)``; the result
            must provide ``fit`` and ``predict``.
        params: Keyword arguments forwarded to ``clf_class``.
        name: Label interpolated into the figure title.
    """
    print("%s %s" % ("parameters:", params))

    fit_start = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print("done in %fs" % (time() - fit_start))

    # Only estimators exposing ``coef_`` can report their density.
    if hasattr(clf, 'coef_'):
        nonzero_pct = np.mean(clf.coef_ != 0) * 100
        print("Percentage of non zeros coef: %f" % nonzero_pct)

    print("Predicting the outcomes of the testing set")
    predict_start = time()
    pred = clf.predict(X_test)
    print("done in %fs" % (time() - predict_start))

    print("Classification report on test set for classifier:")
    print(clf)
    print("")
    print(classification_report(y_test, pred,
                                target_names=news_test.target_names))

    cm = confusion_matrix(y_test, pred)
    print("Confusion matrix:")
    print(cm)

    # Render the confusion matrix as an image with a colour scale.
    pl.matshow(cm)
    pl.title('Confusion matrix of the %s classifier' % name)
    pl.colorbar()
def train(*filenames):
    """Fit an SVM on labelled images and display a grid of its predictions.

    For each path in ``filenames`` the images are loaded with
    ``getImDictFromImage`` and the labels with ``getTrainingKey`` from the
    companion ``<filename>.code`` file.  The first 400 flattened samples are
    used for fitting; the remainder are predicted, reported on, and the first
    25 of them are shown in a 5x5 ``pylab`` grid.

    Args:
        *filenames: Image paths; each needs a sibling ``<path>.code`` file.

    Returns:
        ``False`` as soon as an image or its ``.code`` file is missing.
        Otherwise the function runs to the end and returns ``None``; the
        fitted classifier is not handed back to the caller.
    """
    data = None
    answers = None
    all_images = []
    for filename in filenames:
        print(filename)
        code_path = filename + '.code'
        # Both files are required.  The earlier test
        # ``not exists(image) and exists(code)`` only caught a missing image
        # with a present label file and let every other incomplete pair
        # through to the loaders below.
        if not (os.path.exists(filename) and os.path.exists(code_path)):
            return False
        keys = getTrainingKey(code_path)
        images = getImDictFromImage(filename)
        # One row per image, all remaining axes flattened into features.
        this_data = images.reshape(images.shape[0], -1)
        this_answers = numpy.array(keys)
        all_images.extend(images)
        if data is None:
            answers = this_answers
            data = this_data
        else:
            data = numpy.concatenate([data, this_data], 0)
            answers = numpy.concatenate([answers, this_answers], 0)
        print("%s %s" % ('image shape', images.shape))
        print("%s %s" % ('data shape', data.shape))
        print("%s %s" % ('answers shape', answers.shape))

    from scikits.learn import svm
    from scikits.learn.metrics import classification_report
    from scikits.learn.metrics import confusion_matrix

    classifier = svm.SVC()
    # Samples before ``divider`` train the model; the rest evaluate it.
    divider = 400
    classifier.fit(data[:divider], answers[:divider])
    expected = answers[divider:]
    predicted = classifier.predict(data[divider:])

    print("check:")
    print(classifier)
    print("%s %s" % ('predicted', predicted))
    print("")
    print(classification_report(expected, predicted))
    print(confusion_matrix(expected, predicted))
    print("%s %s" % ('len of all_images:', len(all_images)))

    shown = list(zip(all_images[divider:], predicted))[:25]
    for index, (image, prediction) in enumerate(shown):
        print("%s %s" % (index, prediction))
        pylab.subplot(5, 5, index + 1)
        pylab.imshow(image, cmap=pylab.cm.gray_r)
        pylab.title('Prediction: ' + numToTile(prediction))
    pylab.show()
def train(*filenames):
    """Fit an SVM on labelled images and display a grid of its predictions.

    For each path in ``filenames`` the images are loaded with
    ``getImDictFromImage`` and the labels with ``getTrainingKey`` from the
    companion ``<filename>.code`` file.  The first 400 flattened samples are
    used for fitting; the remainder are predicted, reported on, and the first
    25 of them are shown in a 5x5 ``pylab`` grid.

    Args:
        *filenames: Image paths; each needs a sibling ``<path>.code`` file.

    Returns:
        ``False`` as soon as an image or its ``.code`` file is missing.
        Otherwise the function runs to the end and returns ``None``; the
        fitted classifier is not handed back to the caller.
    """
    data = None
    answers = None
    all_images = []
    for filename in filenames:
        print(filename)
        code_path = filename + '.code'
        # Both files are required.  The earlier test
        # ``not exists(image) and exists(code)`` only caught a missing image
        # with a present label file and let every other incomplete pair
        # through to the loaders below.
        if not (os.path.exists(filename) and os.path.exists(code_path)):
            return False
        keys = getTrainingKey(code_path)
        images = getImDictFromImage(filename)
        # One row per image, all remaining axes flattened into features.
        this_data = images.reshape(images.shape[0], -1)
        this_answers = numpy.array(keys)
        all_images.extend(images)
        if data is None:
            answers = this_answers
            data = this_data
        else:
            data = numpy.concatenate([data, this_data], 0)
            answers = numpy.concatenate([answers, this_answers], 0)
        print("%s %s" % ('image shape', images.shape))
        print("%s %s" % ('data shape', data.shape))
        print("%s %s" % ('answers shape', answers.shape))

    from scikits.learn import svm
    from scikits.learn.metrics import classification_report
    from scikits.learn.metrics import confusion_matrix

    classifier = svm.SVC()
    # Samples before ``divider`` train the model; the rest evaluate it.
    divider = 400
    classifier.fit(data[:divider], answers[:divider])
    expected = answers[divider:]
    predicted = classifier.predict(data[divider:])

    print("check:")
    print(classifier)
    print("%s %s" % ('predicted', predicted))
    print("")
    print(classification_report(expected, predicted))
    print(confusion_matrix(expected, predicted))
    print("%s %s" % ('len of all_images:', len(all_images)))

    shown = list(zip(all_images[divider:], predicted))[:25]
    for index, (image, prediction) in enumerate(shown):
        print("%s %s" % (index, prediction))
        pylab.subplot(5, 5, index + 1)
        pylab.imshow(image, cmap=pylab.cm.gray_r)
        pylab.title('Prediction: ' + numToTile(prediction))
    pylab.show()
def evaluate(model, testX, testY, testTitles=None, testLabels=None):
    """Print how well ``model`` predicts ``testY`` from ``testX``.

    Category and label names are read from two text files under a fixed data
    directory.  A confusion matrix and a classification report are always
    printed; when ``testTitles`` is given, every misclassified sample is also
    listed with its title, predicted category and true category.

    Args:
        model: Object with a ``predict`` method.
        testX: Samples passed to ``model.predict``.
        testY: True category ids, indexable by position.
        testTitles: Optional titles, parallel to ``testY``.
        testLabels: Optional label ids, parallel to ``testY``; each is looked
            up in the label-name file and appended to the listing.
    """
    data_dir = '/CurrentPorjects/LatentDirichletAllocation/data/NIPS1-17/'

    # ``with`` guarantees the handles are closed even if reading/parsing fails.
    with open(os.path.join(data_dir, 'NIPS_category_names.txt')) as cnamesf:
        # Each line is split once on a space and only the second part kept.
        category_list = [w.strip().split(" ", 1)[1]
                         for w in cnamesf.readlines()]
    names_of_categories = dict(enumerate(category_list))

    with open(os.path.join(data_dir, 'NIPS_label_names.txt')) as lnamesf:
        label_names = dict(enumerate([w.strip()
                                      for w in lnamesf.readlines()]))

    predicted = model.predict(testX)
    print(metrics.confusion_matrix(testY, predicted))
    # Slice the ordered list instead of ``dict.values()`` so the selection
    # does not depend on dictionary iteration order.
    # NOTE(review): categories 1..8 are hard-coded here - confirm they match
    # the classes present in testY.
    print(metrics.classification_report(testY, predicted,
                                        target_names=category_list[1:9]))

    size = len(testY)
    if testTitles is not None:
        if testLabels is not None:
            label_heading = "NIPS Label"
        else:
            label_heading = ""
        rule = "%s %s %s" % ("_" * 80, "_________", "_________")
        print(rule)
        print("%s %s %s %s" % ("Paper title".ljust(80), "predicted",
                               "true ", label_heading))
        print(rule)

        for i in range(size):
            pred = predicted[i]
            true = testY[i]
            if pred == true:
                continue
            # Fixed-width columns: 80 chars for the title, 9 per category.
            title = testTitles[i][0:80].ljust(80)
            pred_str = names_of_categories[pred][0:9].ljust(9)
            true_str = names_of_categories[true][0:9].ljust(9)
            if testLabels is not None:
                label = label_names[testLabels[i]]
            else:
                label = ""
            print("%s %s %s %s" % (title, pred_str, true_str, label))
def benchmark(clf):
    """Train ``clf``, predict on the test split, and print timing and quality.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``,
    ``categories`` and ``vocabulary`` data; the ``opts`` object selects the
    optional sections (``print_top10``, ``print_report``, ``print_cm``).

    Returns:
        A ``(score, train_time, test_time)`` tuple: the ``metrics.f1_score``
        of the test predictions, followed by the differences of ``time()``
        readings taken around ``fit`` and around ``predict``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_start = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    predict_start = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_start
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score: %0.3f" % score)

    # Coefficient-based output only applies to estimators exposing ``coef_``.
    if hasattr(clf, 'coef_'):
        nonzero_rows = clf.coef_.nonzero()[0]
        print("non-zero coef: %d" % nonzero_rows.shape[0])

        if opts.print_top10:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                # argsort is ascending, so the last ten indices carry the
                # largest coefficients of row ``i``.
                top10 = np.argsort(clf.coef_[i, :])[-10:]
                keywords = " ".join(vocabulary[top10])
                print(trim("%s: %s" % (category, keywords)))
        print("")

    if opts.print_report:
        print("classification report:")
        report = metrics.classification_report(y_test, pred,
                                               target_names=categories)
        print(report)

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print("")
    return score, train_time, test_time
def _svm():
    """Fit a default ``svm.SVC`` and print its timings and test-set quality.

    Works on the module-level ``data_train`` / ``label_train`` and
    ``data_test`` / ``label_test``.  Returns nothing.
    """
    classifier = svm.SVC()

    started = time.time()
    classifier.fit(data_train, label_train)
    fit_elapsed = time.time() - started
    print("SVM: time elapsed in fitting: %f secs" % fit_elapsed)

    started = time.time()
    predicted = classifier.predict(data_test)
    predict_elapsed = time.time() - started
    print("SVM: time elapsed in predicting: %f secs" % predict_elapsed)

    report = metrics.classification_report(label_test, predicted)
    print("Classification report for SVM:\n%s\n" % (report))
    print("Confusion matrix:\n%s"
          % metrics.confusion_matrix(label_test, predicted))
def _svm():
    """Fit a default ``svm.SVC`` and print its timings and test-set quality.

    Works on the module-level ``data_train`` / ``label_train`` and
    ``data_test`` / ``label_test``.  Returns nothing.
    """
    classifier = svm.SVC()

    started = time.time()
    classifier.fit(data_train, label_train)
    fit_elapsed = time.time() - started
    print("SVM: time elapsed in fitting: %f secs" % fit_elapsed)

    started = time.time()
    predicted = classifier.predict(data_test)
    predict_elapsed = time.time() - started
    print("SVM: time elapsed in predicting: %f secs" % predict_elapsed)

    report = metrics.classification_report(label_test, predicted)
    print("Classification report for SVM:\n%s\n" % (report))
    print("Confusion matrix:\n%s"
          % metrics.confusion_matrix(label_test, predicted))
def opf():
    """Fit a ``libopf_py.OPF`` model on precomputed distances and report on it.

    Works on the module-level ``dist_train`` / ``label_train`` and
    ``dist_test`` / ``label_test``.  Prints the fit and predict durations,
    a classification report and the confusion matrix.  Returns nothing.
    """
    # OPF only supports 32 bits labels at the moment
    train_labels = label_train.astype(numpy.int32)
    test_labels = label_test.astype(numpy.int32)

    model = libopf_py.OPF()

    started = time.time()
    model.fit(dist_train, train_labels, precomputed_distance=True)
    # Alternative configuration kept for reference:
    # model.fit(dist_train, train_labels, precomputed_distance=True,
    #           learning="agglomerative", split=0.8)
    fit_elapsed = time.time() - started
    print("OPF: time elapsed in fitting: %f secs" % fit_elapsed)

    started = time.time()
    predicted = model.predict(dist_test)
    predict_elapsed = time.time() - started
    print("OPF: time elapsed in predicting: %f secs" % predict_elapsed)

    report = metrics.classification_report(test_labels, predicted)
    print("Classification report for OPF:\n%s\n" % (report))
    print("Confusion matrix:\n%s"
          % metrics.confusion_matrix(test_labels, predicted))
def test(model, X, y, output_path): print "Evaluating svm" y_pred = model.predict(X) #try: if True: acc = (y == y_pred).mean() print "Accuracy ",acc f = open(output_path,'w') f.write('Accuracy: '+str(acc)+'\n') if classification_report: cr = classification_report(y, y_pred)#, labels=selected_target, #class_names=category_names[selected_target]) print cr f.write(str(cr)) if confusion_matrix: cm = confusion_matrix(y, y_pred)#, labels=selected_target) print cm f.write(str(cm)) f.close() """except:
def test(model, X, y, output_path): print "Evaluating svm" y_pred = model.predict(X) #try: if True: acc = (y == y_pred).mean() print "Accuracy ", acc f = open(output_path, 'w') f.write('Accuracy: ' + str(acc) + '\n') if classification_report: cr = classification_report(y, y_pred) #, labels=selected_target, #class_names=category_names[selected_target]) print cr f.write(str(cr)) if confusion_matrix: cm = confusion_matrix(y, y_pred) #, labels=selected_target) print cm f.write(str(cm)) f.close() """except:
def opf():
    """Fit a ``libopf_py.OPF`` model on precomputed distances and report on it.

    Works on the module-level ``dist_train`` / ``label_train`` and
    ``dist_test`` / ``label_test``.  Prints the fit and predict durations,
    a classification report and the confusion matrix.  Returns nothing.
    """
    # OPF only supports 32 bits labels at the moment
    train_labels = label_train.astype(numpy.int32)
    test_labels = label_test.astype(numpy.int32)

    model = libopf_py.OPF()

    started = time.time()
    model.fit(dist_train, train_labels, precomputed_distance=True)
    # Alternative configuration kept for reference:
    # model.fit(dist_train, train_labels, precomputed_distance=True,
    #           learning="agglomerative", split=0.8)
    fit_elapsed = time.time() - started
    print("OPF: time elapsed in fitting: %f secs" % fit_elapsed)

    started = time.time()
    predicted = model.predict(dist_test)
    predict_elapsed = time.time() - started
    print("OPF: time elapsed in predicting: %f secs" % predict_elapsed)

    report = metrics.classification_report(test_labels, predicted)
    print("Classification report for OPF:\n%s\n" % (report))
    print("Confusion matrix:\n%s"
          % metrics.confusion_matrix(test_labels, predicted))
# Fit a whitened randomized PCA on the training samples and keep its
# components reshaped to (n_components, h, w) for later display.
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
eigenfaces = pca.components_.reshape((n_components, h, w))

# Express both splits in the reduced PCA basis.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# Grid-search an RBF-kernel SVM over C and gamma.
param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(
    SVC(kernel='rbf'),
    param_grid,
    fit_params={'class_weight': 'auto'},
    verbose=1,
)
clf = clf.fit(X_train_pca, y_train)
print(clf.best_estimator)

# Score the selected model on the held-out split.
from scikits.learn import metrics
y_pred = clf.predict(X_test_pca)
print(metrics.classification_report(y_test, y_pred,
                                    target_names=target_names))
print(metrics.confusion_matrix(y_test, y_pred,
                               labels=range(len(target_names))))

# Show the first eight test images in a 2x4 grid, titled with the true and
# predicted labels.
import pylab as pl
for index, (img, label_true, label_pred) in enumerate(
        zip(X_test[:8], y_test[:8], y_pred[:8])):
    axes = pl.subplot(2, 4, index + 1)
    axes.imshow(img.reshape(h, w), cmap=pl.cm.gray)
    pl.title('%s, prediction: %s' % (label_true, label_pred))
# Chain the analyzer-based vectorizer, a TF (no IDF) transform and an
# L1-penalised linear SVM into a single estimator.
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Train on the training documents.
clf.fit(docs_train, y_train)

# Label the held-out documents.
y_predicted = clf.predict(docs_test)

# Per-class summary of the held-out predictions.
print(metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names))

# Confusion matrix (plotting is left disabled).
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

# import pylab as pl
#pl.matshow(cm)
#pl.show()

# Short sample sentences to run the fitted pipeline on:
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]
('clf', LinearSVC(C=1000)), ]) parameters = { 'vect__analyzer__max_n': (1, 2), 'vect__max_df': (.95, ), } # Fit the pipeline on the training set using grid search for the parameters grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1) grid_search.fit(docs_train[:200], y_train[:200]) # Refit the best parameter set on the complete training set clf = grid_search.best_estimator.fit(docs_train, y_train) # Predict the outcome on the testing set y_predicted = clf.predict(docs_test) # Print the classification report print metrics.classification_report(y_test, y_predicted, class_names=dataset.target_names) # Plot the confusion matrix cm = metrics.confusion_matrix(y_test, y_predicted) print cm # import pylab as pl #pl.matshow(cm) #pl.show()
'extr__n': (3, 4, 5, 6), 'svc__C': (1e-1, 1e-2, 1e9) } grid_search = GridSearchCV(pipeline, parameters) print "Loading data..." X, y = load_data() print "Searching for the best model..." t0 = time() grid_search.fit(X, y) print "Done in %0.3f" % (time() - t0) print "Best score: %0.3f" % grid_search.best_score clf = grid_search.best_estimator print clf yp = clf.predict(X) print classification_report(y, yp, targets, target_names) #pl.figure() #pl.title("Classification rate for 3-fold stratified CV") #pl.xlabel("n-gram maximum size") #pl.ylabel("successful classification rate") #ns = range(1, 11) #scores = [grid_search.grid_points_scores_[(('extr__n', i),)] for i in ns] #pl.plot(ns, scores, 'o-') #pl.show() ## Now we take apart the pipeline to do the plot #X = clf.named_steps['extr'].transform(X) #pca = RandomizedPCA(n_components=2).fit(X) #Xpca = pca.transform(X) #svc = clf.named_steps['svc']
# Grid-search an RBF-kernel SVM on the PCA-projected training data.
clf = GridSearchCV(
    SVC(kernel="rbf"),
    param_grid,
    fit_params={"class_weight": "auto"},
)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator)


################################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting the people names on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, class_names=class_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))


################################################################################
# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4


def title(y_pred, y_test, class_names, i):
    """Return the two-line caption for sample ``i``.

    Only the part after the last space of each class name is shown, first
    for the predicted class and then for the true class.
    """
    predicted_full = class_names[y_pred[i]]
    true_full = class_names[y_test[i]]
    pred_name = predicted_full.rsplit(" ", 1)[-1]
    true_name = true_full.rsplit(" ", 1)[-1]
    return "predicted: %s\ntrue: %s" % (pred_name, true_name)
# Grid-search an RBF-kernel SVM on the stacked moto/plane training features.
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'})
# Plain (non-searched) alternatives kept for reference:
#clf = SVC(kernel='rbf')
#clf = SVC(kernel='linear')
clf.fit(np.vstack([moto_vq_train, plane_vq_train]), np.array(labels))
print("Best estimator found by grid search:")
#print clf.best_estimator

###############################################################################
# Evaluation

# Load the two evaluation arrays explicitly.  The former list comprehension
# used ``file`` as its loop variable, which in Python 2 leaks into module
# scope and shadows the builtin of the same name.
moto_vq_eval = np.load('moto_vq_eval.npy')
plane_vq_eval = np.load('plane_vq_eval.npy')

# One name / one id per evaluation row: moto rows first (0), then plane (1).
y_name = (['moto'] * moto_vq_eval.shape[0]
          + ['plane'] * plane_vq_eval.shape[0])
y_test = [0] * moto_vq_eval.shape[0] + [1] * plane_vq_eval.shape[0]
y_test = np.array(y_test)

y_pred = clf.predict(np.vstack([moto_vq_eval, plane_vq_eval]))

# NOTE(review): ``labels`` and ``y_name`` hold one entry per sample, not one
# per class - confirm this is what classification_report expects here.
print(classification_report(y_test, y_pred, labels=labels,
                            class_names=y_name))
print(confusion_matrix(y_test, y_pred))
# Take the first fold of a 2-fold stratified split: half of the samples for
# training and half for testing, with label proportions preserved.
train, test = next(iter(StratifiedKFold(y, 2)))

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# (display name, scoring function) pairs; one grid search is run per pair.
scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true = y[test]
    y_pred = clf.predict(X[test])

    print("Classification report for the best estimator: ")
    print(clf.best_estimator)
    held_out_score = score_func(y_true, y_pred)
    print("Tuned for '%s' with optimal value: %0.3f"
          % (score_name, held_out_score))
    print(classification_report(y_true, y_pred))
    print("Grid scores:")
    pprint(clf.grid_scores_)
    print("")

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
# Vectorize the raw test documents with the already-fitted vectorizer.
X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
y_test = news_test.target
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)

print("Predicting the outcomes of the testing set")
t0 = time()
pred = clf.predict(X_test)
print("done in %fs" % (time() - t0))

# The unresolved merge-conflict markers that used to sit here made the file
# unparseable.  The conflict is resolved in favour of the incoming side,
# which prints a full classification report instead of three separate
# precision / recall / f1 lines.
# NOTE(review): confirm this is the intended side of the merge.
print("Classification report on test set:")
print(classification_report(news_test.target, pred,
                            class_names=news_test.target_names))

cm = confusion_matrix(y_test, pred)
print("Confusion matrix:")
print(cm)

# Show confusion matrix
pl.matshow(cm)
pl.title('Confusion matrix')
pl.colorbar()
pl.show()
# Candidate C / gamma values for the RBF-kernel SVM grid search.
param_grid = {
    'C': [1, 5, 10, 100],
    'gamma': [0.0001, 0.001, 0.01, 0.1],
}
clf = GridSearchCV(
    SVC(kernel='rbf'),
    param_grid,
    fit_params={'class_weight': 'auto'},
    n_jobs=-1,
)
clf = clf.fit(X_train_pca, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator)

# Quantitative evaluation of the model quality on the test set
y_pred = clf.predict(X_test_pca)
print(classification_report(y_test, y_pred, labels=selected_target,
                            class_names=target_names[selected_target]))
print(confusion_matrix(y_test, y_pred, labels=selected_target))

# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4


def title(y_pred, y_test, target_names, i):
    """Return the two-line caption for sample ``i``.

    Only the part after the last underscore of each target name is shown,
    first for the predicted class and then for the true class.
    """
    predicted_full = target_names[y_pred[i]]
    true_full = target_names[y_test[i]]
    pred_name = predicted_full.rsplit('_', 1)[-1]
    true_name = true_full.rsplit('_', 1)[-1]
    return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV(SVC(kernel='rbf'), param_grid, fit_params={'class_weight': 'auto'}) #clf = SVC(kernel='rbf') #clf = SVC(kernel='linear') clf.fit(np.vstack([moto_vq_train, plane_vq_train]), np.array(labels)) print "Best estimator found by grid search:" #print clf.best_estimator ############################################################################### # Evaluation moto_vq_eval, plane_vq_eval = [ np.load(file) for file in ['moto_vq_eval.npy', 'plane_vq_eval.npy'] ] y_name = ['moto'] * moto_vq_eval.shape[0] + ['plane'] * plane_vq_eval.shape[0] y_test = [0] * moto_vq_eval.shape[0] + [1] * plane_vq_eval.shape[0] y_test = np.array(y_test) y_pred = clf.predict(np.vstack([moto_vq_eval, plane_vq_eval])) print classification_report(y_test, y_pred, labels=labels, class_names=y_name) print confusion_matrix(y_test, y_pred)
# Flatten every image into one feature row: (n_samples, n_features).
data = digits.images.reshape((n_samples, -1))

# Import a classifier:
from scikits.learn import svm
from scikits.learn.metrics import classification_report
from scikits.learn.metrics import confusion_matrix
classifier = svm.SVC()

# Fit on the first half of the samples ...
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])

# ... and predict the digit value for the second half.
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])

print("Classification report for classifier:")
print(classifier)
print("")
print(classification_report(expected, predicted))
print("")

print("Confusion matrix:")
print(confusion_matrix(expected, predicted))

# Show the first four second-half images in slots 5-8 of a 2x4 grid, each
# titled with its predicted digit.
for index, (image, prediction) in enumerate(
        list(zip(digits.images[n_samples // 2:], predicted))[:4]):
    pl.subplot(2, 4, index + 5)
    pl.imshow(image, cmap=pl.cm.gray_r)
    pl.title('Prediction: %i' % prediction)

pl.show()
# Candidate C / gamma values for the RBF-kernel SVM grid search.
param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(
    SVC(kernel='rbf'),
    param_grid,
    fit_params={'class_weight': 'auto'},
)
clf = clf.fit(X_train_pca, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator)


################################################################################
# Quantitative evaluation of the model quality on the test set

y_pred = clf.predict(X_test_pca)
print(classification_report(y_test, y_pred, labels=selected_target,
                            class_names=category_names[selected_target]))
print(confusion_matrix(y_test, y_pred, labels=selected_target))


################################################################################
# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4

# One grid cell per shown test sample; each sample is reshaped to 64x64.
pl.figure(figsize=(2 * n_col, 2.3 * n_row))
pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.95, hspace=.15)
for i in range(n_row * n_col):
    pl.subplot(n_row, n_col, i + 1)
    pl.imshow(X_test[i].reshape((64, 64)), cmap=pl.cm.gray)
# Announce the size of the test set.
print("Predicting the labels of the test set...")
print("%d documents" % len(news_test.filenames))
print("%d categories" % len(news_test.target_names))

# Vectorize the raw test documents with the already-fitted vectorizer.
print("Extracting features from the dataset using the same vectorizer")
t0 = time()
X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
y_test = news_test.target
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)

print("Predicting the outcomes of the testing set")
t0 = time()
pred = clf.predict(X_test)
print("done in %fs" % (time() - t0))

print("Classification report on test set for classifier:")
print(clf)
print("")
print(classification_report(y_test, pred,
                            class_names=news_test.target_names))

cm = confusion_matrix(y_test, pred)
print("Confusion matrix:")
print(cm)

# Show confusion matrix
pl.matshow(cm)
pl.title('Confusion matrix')
pl.colorbar()
pl.show()
## }
## print("Training LinearSVC on training set")
## clf = LinearSVC(**parameters)

# Train the SGD classifier and time the fit.
print("Training SGD with alpha=0.001 and n_iter=2")
clf = SGD(alpha=0.001, n_iter=2)
t0 = time()
clf.fit(X_train, y_train)
print("done in %fs" % (time() - t0))

print("Predicting the outcomes of the testing set")
t0 = time()
pred = clf.predict(X_test)
print("done in %fs" % (time() - t0))

print("Classification performance:")
print("")
print(metrics.classification_report(
    y_test, pred, labels=[-1, 1],
    class_names=['any other types', 'cover type 1']))
print("")

# Error rate: the ``metrics.zero_one`` value divided by the number of
# predictions.
err = metrics.zero_one(y_test, pred) / float(pred.shape[0])
print("Error rate: %.4f" % err)
print("")

cm = metrics.confusion_matrix(y_test, pred)
print("Confusion matrix:")
print(cm)
# Two parameter grids: one for the RBF kernel, one for the linear kernel.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# (display name, scoring function) pairs; one grid search is run per pair.
scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true = y[test]
    y_pred = clf.predict(X[test])

    print("Classification report for the best estimator: ")
    print(clf.best_estimator)
    held_out_score = score_func(y_true, y_pred)
    print("Tuned for '%s' with optimal value: %0.3f"
          % (score_name, held_out_score))
    print(classification_report(y_true, y_pred))
    print("Grid scores:")
    pprint(clf.grid_scores_)
    print("")

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
param_grid, fit_params={'class_weight': 'auto'}) clf = clf.fit(X_train_pca, y_train) print "done in %0.3fs" % (time() - t0) print "Best estimator found by grid search:" print clf.best_estimator ################################################################################ # Quantitative evaluation of the model quality on the test set print "Predicting the people names on the testing set" t0 = time() y_pred = clf.predict(X_test_pca) print "done in %0.3fs" % (time() - t0) print classification_report(y_test, y_pred, target_names=target_names) print confusion_matrix(y_test, y_pred, labels=range(n_classes)) ################################################################################ # Qualitative evaluation of the predictions using matplotlib n_row = 3 n_col = 4 def title(y_pred, y_test, target_names, i): pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1] true_name = target_names[y_test[i]].rsplit(' ', 1)[-1] return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
# Grid-search an RBF-kernel SVM on the PCA-projected training data.
print("Fitting the classifier to the training set")
param_grid = {
    "C": [1, 5, 10, 100],
    "gamma": [0.0001, 0.001, 0.01, 0.1],
}
clf = GridSearchCV(
    SVC(kernel="rbf"),
    param_grid,
    fit_params={"class_weight": "auto"},
    n_jobs=-1,
)
clf = clf.fit(X_train_pca, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator)

# Quantitative evaluation of the model quality on the test set
y_pred = clf.predict(X_test_pca)
print(classification_report(y_test, y_pred, labels=selected_target,
                            target_names=target_names[selected_target]))
print(confusion_matrix(y_test, y_pred, labels=selected_target))

# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4


def title(y_pred, y_test, target_names, i):
    """Return the two-line caption for sample ``i``.

    Only the part after the last underscore of each target name is shown,
    first for the predicted class and then for the true class.
    """
    predicted_full = target_names[y_pred[i]]
    true_full = target_names[y_test[i]]
    pred_name = predicted_full.rsplit("_", 1)[-1]
    true_name = true_full.rsplit("_", 1)[-1]
    return "predicted: %s\ntrue: %s" % (pred_name, true_name)
pl.subplot(2, 4, index + 1) pl.imshow(image, cmap=pl.cm.gray_r) pl.title('Training: %i' % label) # To apply an classifier on this data, we need to flatten the image, to # turn the data in a (samples, feature) matrix: n_samples = len(digits.images) data = digits.images.reshape((n_samples, -1)) # Create a classifier: a support vector classifier classifier = svm.SVC() # We learn the digits on the first half of the digits classifier.fit(data[:n_samples / 2], digits.target[:n_samples / 2]) # Now predict the value of the digit on the second half: expected = digits.target[n_samples / 2:] predicted = classifier.predict(data[n_samples / 2:]) print "Classification report for classifier %s:\n%s\n" % ( classifier, metrics.classification_report(expected, predicted)) print "Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted) for index, (image, prediction) in enumerate( zip(digits.images[n_samples / 2:], predicted)[:4]): pl.subplot(2, 4, index + 5) pl.imshow(image, cmap=pl.cm.gray_r) pl.title('Prediction: %i' % prediction) pl.show()
def evaluate(clf,Xt,yt,Xv,yv,title):
    """Print ``title``, fit ``clf`` on ``(Xt, yt)`` and report on ``(Xv, yv)``.

    The report is whatever ``metrics.classification_report`` produces for the
    predictions on ``Xv`` against ``yv``.  ``clf`` is fitted in place and
    nothing is returned.
    """
    print(title)
    clf.fit(Xt, yt)
    validation_pred = clf.predict(Xv)
    report = metrics.classification_report(yv, validation_pred)
    print(report)
pl.subplot(2, 4, index+1) pl.imshow(image, cmap=pl.cm.gray_r) pl.title('Training: %i' % label) # To apply an classifier on this data, we need to flatten the image, to # turn the data in a (samples, feature) matrix: n_samples = len(digits.images) data = digits.images.reshape((n_samples, -1)) # Create a classifier: a support vector classifier classifier = svm.SVC() # We learn the digits on the first half of the digits classifier.fit(data[:n_samples/2], digits.target[:n_samples/2]) # Now predict the value of the digit on the second half: expected = digits.target[n_samples/2:] predicted = classifier.predict(data[n_samples/2:]) print "Classification report for classifier %s:\n%s\n" % ( classifier, metrics.classification_report(expected, predicted)) print "Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted) for index, (image, prediction) in enumerate( zip(digits.images[n_samples/2:], predicted)[:4]): pl.subplot(2, 4, index+5) pl.imshow(image, cmap=pl.cm.gray_r) pl.title('Prediction: %i' % prediction) pl.show()
# Flatten every image into one feature row: (n_samples, n_features).
data = digits.images.reshape((n_samples, -1))

# Import a classifier:
from scikits.learn import svm
from scikits.learn.metrics import classification_report
from scikits.learn.metrics import confusion_matrix
classifier = svm.SVC()

# Fit on the first half of the samples ...
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])

# ... and predict the digit value for the second half.
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])

print("Classification report for classifier:")
print(classifier)
print("")
print(classification_report(expected, predicted))
print("")

print("Confusion matrix:")
print(confusion_matrix(expected, predicted))

# Show the first four second-half images in slots 5-8 of a 2x4 grid, each
# titled with its predicted digit.
for index, (image, prediction) in enumerate(
        list(zip(digits.images[n_samples // 2:], predicted))[:4]):
    pl.subplot(2, 4, index + 5)
    pl.imshow(image, cmap=pl.cm.gray_r)
    pl.title('Prediction: %i' % prediction)

pl.show()
'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV(SVC(kernel='rbf'), param_grid, fit_params={'class_weight': 'auto'}) clf = clf.fit(X_train_pca, y_train) print("Best estimator found by grid search:") print(clf.best_estimator) ################################################################################ # Quantitative evaluation of the model quality on the test set y_pred = clf.predict(X_test_pca) print( classification_report(y_test, y_pred, labels=selected_target, class_names=category_names[selected_target])) print(confusion_matrix(y_test, y_pred, labels=selected_target)) ################################################################################ # Qualitative evaluation of the predictions using matplotlib n_row = 3 n_col = 4 pl.figure(figsize=(2 * n_col, 2.3 * n_row)) pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.95, hspace=.15) for i in range(n_row * n_col): pl.subplot(n_row, n_col, i + 1) pl.imshow(X_test[i].reshape((64, 64)), cmap=pl.cm.gray)