def test_accuracy_regression(self):
    expected_accuracies = {
        dataset.DatasetNames.IRIS: 0.96,
        dataset.DatasetNames.WINE: 0.9772727272727273,
        dataset.DatasetNames.VOWEL: 0.8560606060606061,
    }
    for dataset_name, expected_accuracy in expected_accuracies.items():
        samples, labels = dataset.load_dataset(dataset_name)
        (training_samples, training_labels,
         test_samples, test_labels) = dataset.split_dataset(
             samples=samples, labels=labels, train_fraction=0.5,
             balance_classes=True, seed=0)
        classifier = bc.BayesClassifier.train(samples=training_samples,
                                              labels=training_labels,
                                              naive=False)
        test_predictions = classifier.classify(samples=test_samples)
        test_accuracy = bc.evaluate_accuracy(predictions=test_predictions,
                                             labels=test_labels)
        self.assertAlmostEqual(expected_accuracy, test_accuracy, places=6)
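# The regression test above relies on bc.evaluate_accuracy. Below is a
# minimal sketch of what such a helper could look like, assuming predictions
# and labels are equal-length 1-D arrays; the name and body are illustrative
# assumptions, not the repository's actual implementation.
import numpy as np

def _evaluate_accuracy_sketch(predictions, labels):
    # Accuracy is the fraction of predictions that match the labels.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    return float(np.mean(predictions == labels))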
def assignment3p1():
    print('Assignment 3.1')
    print('Use PCA to reduce the datasets to 2D and plot the boundaries.')
    for dataset_name in dataset.DatasetNames:
        samples, labels = dataset.load_dataset(dataset_name)
        (training_samples, training_labels,
         test_samples, test_labels) = dataset.split_dataset(
             samples=samples, labels=labels, train_fraction=0.5,
             balance_classes=True, seed=0)
        # Fit PCA on the training set only, then project both splits into
        # the same 2D space.
        pca = decomposition.PCA(n_components=2)
        pca.fit(training_samples)
        training_samples = pca.transform(training_samples)
        test_samples = pca.transform(test_samples)
        classifier = bc.BayesClassifier.train(samples=training_samples,
                                              labels=training_labels,
                                              naive=False)
        test_predictions = classifier.classify(samples=test_samples)
        test_accuracy = bc.evaluate_accuracy(predictions=test_predictions,
                                             labels=test_labels)

        # Plot the classification of the test samples.
        fig, (ax1, ax2) = plt.subplots(1, 2)
        plotting.plot_samples_2d(ax=ax1, samples=test_samples,
                                 labels=test_labels)
        plotting.plot_gaussians(ax=ax1, labels=test_labels,
                                mu=classifier.mu, sigma=classifier.sigma)
        ax1.legend()
        ax1.set_title('Ground truth')
        plotting.plot_samples_2d(ax=ax2, samples=test_samples,
                                 labels=test_predictions)
        plotting.plot_gaussians(ax=ax2, labels=test_predictions,
                                mu=classifier.mu, sigma=classifier.sigma)
        plotting.plot_boundaries(ax=ax2, classifier=classifier,
                                 grid_size=1000)
        ax2.legend()
        ax2.set_title('Prediction')
        fig.suptitle('Assignment 3.1\n'
                     'Dataset: {}\n'
                     'Accuracy: {:.3f}'.format(dataset_name.value,
                                               test_accuracy))
        plt.show()
        plt.close()
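# assignment3p1 calls plotting.plot_boundaries to draw decision regions. A
# minimal sketch of the usual technique behind such a function, assuming the
# classifier exposes classify(samples=...) on (N, 2) arrays; the function
# name and signature here are hypothetical, not the repository's plotting
# API.
import numpy as np
import matplotlib.pyplot as plt

def _plot_boundaries_sketch(ax, classifier, grid_size=1000):
    # Build a dense grid covering the current axis limits.
    x_min, x_max = ax.get_xlim()
    y_min, y_max = ax.get_ylim()
    xs = np.linspace(x_min, x_max, grid_size)
    ys = np.linspace(y_min, y_max, grid_size)
    grid_x, grid_y = np.meshgrid(xs, ys)
    grid_samples = np.column_stack([grid_x.ravel(), grid_y.ravel()])
    # Classify every grid point; filled contours of the predicted label
    # make the class changes visible as decision boundaries.
    grid_labels = np.asarray(classifier.classify(samples=grid_samples))
    ax.contourf(grid_x, grid_y, grid_labels.reshape(grid_x.shape),
                alpha=0.2)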
def assignment5p1():
    print('Assignment 5.1')
    print('Compare weak classifiers and boosted classifiers on real '
          'datasets.')
    num_trials = 10
    pretty_table = PrettyTable()
    pretty_table.field_names = ['Dataset', 'Weak', 'Boosted']
    for dataset_name in dataset.DatasetNames:
        samples, labels = dataset.load_dataset(dataset_name)
        if dataset_name == dataset.DatasetNames.OLIVETTI:
            # The dimensionality of the Olivetti dataset is too big. Use PCA
            # to make the problem tractable.
            pca = decomposition.PCA(n_components=20)
            pca.fit(samples)
            samples = pca.transform(samples)
        weak_accuracies = []
        boosted_accuracies = []
        for trial_idx in range(num_trials):
            (training_samples, training_labels,
             test_samples, test_labels) = dataset.split_dataset(
                 samples=samples, labels=labels, train_fraction=0.7,
                 balance_classes=True, seed=trial_idx)

            # Weak (naive Bayes) classifier.
            weak_classifier = bayes_cls.BayesClassifier.train(
                samples=training_samples, labels=training_labels,
                naive=True, weights=None)
            weak_predictions = weak_classifier.classify(samples=test_samples)
            weak_accuracy = np.mean(weak_predictions == test_labels)
            weak_accuracies.append(weak_accuracy)

            # Boosted classifier built from the same weak learner.
            classifier_params = {'naive': True}
            boost_classifier = boost_cls.BoostClassifier.train(
                classifier_class=bayes_cls.BayesClassifier,
                samples=training_samples, labels=training_labels,
                num_iters=10, **classifier_params)
            boost_predictions = boost_classifier.classify(
                samples=test_samples)
            boost_accuracy = np.mean(boost_predictions == test_labels)
            boosted_accuracies.append(boost_accuracy)

        mean_weak_accuracy = np.mean(weak_accuracies)
        std_weak_accuracy = np.std(weak_accuracies)
        mean_boosted_accuracy = np.mean(boosted_accuracies)
        std_boosted_accuracy = np.std(boosted_accuracies)

        def _format_acc(mean_, std_):
            acc_str = '{:.3f} +/- {:.3f}'.format(mean_, std_)
            return acc_str

        pretty_table.add_row([
            dataset_name.value,
            _format_acc(mean_weak_accuracy, std_weak_accuracy),
            _format_acc(mean_boosted_accuracy, std_boosted_accuracy)
        ])
    print()
    print('Mean accuracy on test set ({} trials)'.format(num_trials))
    print(pretty_table)
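# boost_cls.BoostClassifier.train above presumably runs an AdaBoost-style
# loop. A minimal sketch of one classic AdaBoost round, assuming the weak
# learner's train() accepts per-sample weights as in the calls above; this
# is the generic textbook formulation, not the repository's exact code.
import numpy as np

def _adaboost_round_sketch(classifier_class, samples, labels, weights,
                           **classifier_params):
    weights = np.asarray(weights, dtype=float)
    # Train a weak classifier on the current sample weights.
    classifier = classifier_class.train(samples=samples, labels=labels,
                                        weights=weights, **classifier_params)
    predictions = np.asarray(classifier.classify(samples=samples))
    correct = (predictions == np.asarray(labels))
    # Weighted training error and the classifier's vote (alpha).
    error = np.clip(np.sum(weights[~correct]), 1e-12, 1.0 - 1e-12)
    alpha = 0.5 * (np.log(1.0 - error) - np.log(error))
    # Up-weight misclassified samples, down-weight the rest, then
    # renormalize so the weights stay a distribution.
    weights = weights * np.exp(np.where(correct, -alpha, alpha))
    weights = weights / np.sum(weights)
    return classifier, alpha, weights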
def assignment6p1():
    print('Assignment 6.1')
    print('Boosted trees, using Sklearn implementation: '
          'plot the boundaries.')
    for dataset_name in dataset.DatasetNames:
        samples, labels = dataset.load_dataset(dataset_name)
        # Reduce to 2D so that the decision boundaries can be plotted.
        pca = decomposition.PCA(n_components=2)
        pca.fit(samples)
        samples = pca.transform(samples)
        (training_samples, training_labels,
         test_samples, test_labels) = dataset.split_dataset(
             samples=samples, labels=labels, train_fraction=0.7,
             balance_classes=True, seed=0)
        weak_classifier = tree_cls.SklearnDecisionTreeClassifierWrapper.train(
            samples=training_samples, labels=training_labels, weights=None)
        weak_predictions = weak_classifier.classify(samples=test_samples)
        weak_accuracy = np.mean(weak_predictions == test_labels)
        boost_classifier = boost_cls.BoostClassifier.train(
            classifier_class=tree_cls.SklearnDecisionTreeClassifierWrapper,
            samples=training_samples, labels=training_labels, num_iters=10)
        boost_predictions = boost_classifier.classify(samples=test_samples)
        boost_accuracy = np.mean(boost_predictions == test_labels)

        # Plot the classification of the test samples.
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
        plotting.plot_samples_2d(ax=ax1, samples=test_samples,
                                 labels=test_labels)
        ax1.legend()
        ax1.set_title('Ground truth')
        plotting.plot_samples_2d(ax=ax2, samples=test_samples,
                                 labels=weak_predictions)
        plotting.plot_boundaries(ax=ax2, classifier=weak_classifier,
                                 grid_size=1000)
        ax2.legend()
        ax2.set_title('Weak classifier')
        plotting.plot_samples_2d(ax=ax3, samples=test_samples,
                                 labels=boost_predictions)
        plotting.plot_boundaries(ax=ax3, classifier=boost_classifier,
                                 grid_size=1000)
        ax3.legend()
        ax3.set_title('Boosted classifier')
        fig.suptitle('Assignment 6.1\n'
                     'Dataset: {}\n'
                     'Weak classifier accuracy: {:.3f}\n'
                     'Boosted classifier accuracy: {:.3f}'.format(
                         dataset_name.value, weak_accuracy, boost_accuracy))
        plt.show()
        plt.close()
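# SklearnDecisionTreeClassifierWrapper above adapts sklearn's decision tree
# to the train()/classify() interface used throughout this file. A minimal
# sketch of such a wrapper, assuming the weights argument maps onto
# sklearn's sample_weight; the class name, the method bodies, and the
# max_depth choice are illustrative assumptions (a depth-1 stump is the
# classic AdaBoost weak learner).
from sklearn import tree

class _SklearnTreeWrapperSketch:
    def __init__(self, model):
        self._model = model

    @classmethod
    def train(cls, samples, labels, weights=None):
        # sample_weight lets the boosting loop reweight the training set
        # on every round.
        model = tree.DecisionTreeClassifier(max_depth=1)
        model.fit(samples, labels, sample_weight=weights)
        return cls(model)

    def classify(self, samples):
        return self._model.predict(samples)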
def assignment3():
    print('Assignment 3')
    print('Test your Bayesian Classifier on real datasets.')
    num_trials = 100
    pretty_table = PrettyTable()
    pretty_table.field_names = ['Dataset', 'Non naive', 'Naive']
    datasets = [
        dataset.DatasetNames.IRIS,
        dataset.DatasetNames.WINE,
        dataset.DatasetNames.VOWEL,
    ]
    for dataset_name in datasets:
        samples, labels = dataset.load_dataset(dataset_name)
        accuracies = []
        naive_accuracies = []
        for trial_idx in range(num_trials):
            (training_samples, training_labels,
             test_samples, test_labels) = dataset.split_dataset(
                 samples=samples, labels=labels, train_fraction=0.5,
                 balance_classes=True, seed=trial_idx)

            # Non-naive classifier.
            classifier = bc.BayesClassifier.train(samples=training_samples,
                                                  labels=training_labels,
                                                  naive=False)
            test_predictions = classifier.classify(samples=test_samples)
            test_accuracy = bc.evaluate_accuracy(
                predictions=test_predictions, labels=test_labels)
            accuracies.append(test_accuracy)

            # Naive classifier.
            naive_classifier = bc.BayesClassifier.train(
                samples=training_samples, labels=training_labels, naive=True)
            naive_test_predictions = naive_classifier.classify(
                samples=test_samples)
            naive_test_accuracy = bc.evaluate_accuracy(
                predictions=naive_test_predictions, labels=test_labels)
            naive_accuracies.append(naive_test_accuracy)

        mean_accuracy = np.mean(accuracies)
        std_accuracy = np.std(accuracies)
        naive_mean_accuracy = np.mean(naive_accuracies)
        naive_std_accuracy = np.std(naive_accuracies)

        def _format_acc(mean_, std_):
            acc_str = '{:.3f} +/- {:.3f}'.format(mean_, std_)
            return acc_str

        pretty_table.add_row([
            dataset_name.value,
            _format_acc(mean_accuracy, std_accuracy),
            _format_acc(naive_mean_accuracy, naive_std_accuracy)
        ])
    print()
    print('Mean accuracy on test set ({} trials)'.format(num_trials))
    print(pretty_table)
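# assignment3 contrasts naive=False and naive=True. A minimal sketch of the
# per-class maximum-likelihood estimates such a BayesClassifier could use:
# the naive variant keeps only the diagonal of each class covariance, i.e.
# it treats the features as independent. Illustrative only; the actual
# estimator in bc.BayesClassifier may differ (e.g. weighted ML to support
# boosting).
import numpy as np

def _estimate_gaussians_sketch(samples, labels, naive):
    mu, sigma = {}, {}
    for label in np.unique(labels):
        class_samples = samples[labels == label]
        # ML estimates: sample mean and (biased) sample covariance.
        mu[label] = np.mean(class_samples, axis=0)
        covariance = np.cov(class_samples, rowvar=False, bias=True)
        if naive:
            # Zero the off-diagonal terms: independent-feature assumption.
            covariance = np.diag(np.diag(covariance))
        sigma[label] = covariance
    return mu, sigma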