Example #1
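# These snippets are excerpts from a larger project; the imports below are
# reconstructed from usage. The third-party ones are unambiguous, while the
# repo-local module paths (`dataset`, `plotting`) and the classifier aliases
# (`bc`, `bayes_cls`, `boost_cls`, `tree_cls`) are assumptions. The excerpts
# appear to come from different source files, hence the duplicate alias for
# the Bayes classifier module.
import unittest

import numpy as np
from matplotlib import pyplot as plt
from prettytable import PrettyTable
from sklearn import decomposition

import dataset
import plotting
import bayes_classifier as bc
import bayes_classifier as bayes_cls
import boost_classifier as boost_cls
import tree_classifier as tree_cls
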
# The enclosing unittest.TestCase class is not part of the excerpt; a generic
# name is used here so the method runs as written.
class BayesClassifierRegressionTest(unittest.TestCase):

    def test_accuracy_regression(self):
        expected_accuracies = {
            dataset.DatasetNames.IRIS: 0.96,
            dataset.DatasetNames.WINE: 0.9772727272727273,
            dataset.DatasetNames.VOWEL: 0.8560606060606061,
        }

        for dataset_name, expected_accuracy in expected_accuracies.items():
            samples, labels = dataset.load_dataset(dataset_name)

            (training_samples, training_labels, test_samples,
             test_labels) = dataset.split_dataset(samples=samples,
                                                  labels=labels,
                                                  train_fraction=0.5,
                                                  balance_classes=True,
                                                  seed=0)

            classifier = bc.BayesClassifier.train(samples=training_samples,
                                                  labels=training_labels,
                                                  naive=False)
            test_predictions = classifier.classify(samples=test_samples)
            test_accuracy = bc.evaluate_accuracy(predictions=test_predictions,
                                                 labels=test_labels)

            self.assertAlmostEqual(expected_accuracy, test_accuracy, places=6)
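
# `bc.evaluate_accuracy` is defined elsewhere in the project. Judging by the
# inline `np.mean(predictions == labels)` used in assignment5p1 below, it
# presumably reduces to the fraction of matching predictions. A minimal
# sketch under that assumption (illustrative, not the project's code):
def evaluate_accuracy_sketch(predictions, labels):
    """Fraction of predictions that match the ground-truth labels."""
    return float(np.mean(np.asarray(predictions) == np.asarray(labels)))
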
def assignment3p1():
    print('Assignment 3.1')
    print('Use PCA to reduce the datasets to 2D and plot the boundaries.')

    for dataset_name in dataset.DatasetNames:
        samples, labels = dataset.load_dataset(dataset_name)
        training_samples, training_labels, test_samples, test_labels = \
            dataset.split_dataset(samples=samples,
                                  labels=labels,
                                  train_fraction=0.5,
                                  balance_classes=True,
                                  seed=0)

        pca = decomposition.PCA(n_components=2)
        pca.fit(training_samples)
        training_samples = pca.transform(training_samples)
        test_samples = pca.transform(test_samples)

        classifier = bc.BayesClassifier.train(samples=training_samples,
                                              labels=training_labels,
                                              naive=False)
        test_predictions = classifier.classify(samples=test_samples)
        test_accuracy = bc.evaluate_accuracy(predictions=test_predictions,
                                             labels=test_labels)

        # Plot the classification of the test samples.
        fig, (ax1, ax2) = plt.subplots(1, 2)
        plotting.plot_samples_2d(ax=ax1,
                                 samples=test_samples,
                                 labels=test_labels)
        plotting.plot_gaussians(ax=ax1,
                                labels=test_labels,
                                mu=classifier.mu,
                                sigma=classifier.sigma)
        ax1.legend()
        ax1.set_title('Ground truth')

        plotting.plot_samples_2d(ax=ax2,
                                 samples=test_samples,
                                 labels=test_predictions)
        plotting.plot_gaussians(ax=ax2,
                                labels=test_predictions,
                                mu=classifier.mu,
                                sigma=classifier.sigma)
        plotting.plot_boundaries(ax=ax2, classifier=classifier, grid_size=1000)
        ax2.legend()
        ax2.set_title('Prediction')

        fig.suptitle('Assignment 3.1\n'
                     'Dataset: {}\n'
                     'Accuracy: {:.3f}'.format(dataset_name.value,
                                               test_accuracy))
        plt.show()
        plt.close()
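
# `plotting.plot_boundaries` is also project-local. A hypothetical sketch of
# the usual meshgrid approach it likely follows: classify a dense grid
# spanning the current axis limits and shade the predicted class regions with
# `contourf`. The `classify(samples=...)` API matches the calls above;
# everything else here is an illustrative guess.
def plot_boundaries_sketch(ax, classifier, grid_size=1000):
    """Shade the classifier's decision regions over the axis limits."""
    x_min, x_max = ax.get_xlim()
    y_min, y_max = ax.get_ylim()
    grid_x, grid_y = np.meshgrid(np.linspace(x_min, x_max, grid_size),
                                 np.linspace(y_min, y_max, grid_size))
    grid_samples = np.column_stack([grid_x.ravel(), grid_y.ravel()])
    grid_predictions = np.asarray(classifier.classify(samples=grid_samples))
    ax.contourf(grid_x, grid_y, grid_predictions.reshape(grid_x.shape),
                alpha=0.2)
    # contourf can move the limits; restore them so the samples stay framed.
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
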
def assignment5p1():
    print('Assignment 5.1')
    print('Compare weak classifiers and boosted classifiers on real datasets.')

    num_trials = 10

    pretty_table = PrettyTable()
    pretty_table.field_names = ['Dataset', 'Weak', 'Boosted']

    for dataset_name in dataset.DatasetNames:
        samples, labels = dataset.load_dataset(dataset_name)

        if dataset_name == dataset.DatasetNames.OLIVETTI:
            # The dimensionality of the Olivetti dataset is too high to fit
            # directly. Use PCA to make the problem tractable.
            pca = decomposition.PCA(n_components=20)
            pca.fit(samples)
            samples = pca.transform(samples)

        weak_accuracies = []
        boosted_accuracies = []
        for trial_idx in range(num_trials):
            (training_samples, training_labels, test_samples,
             test_labels) = dataset.split_dataset(samples=samples,
                                                  labels=labels,
                                                  train_fraction=0.7,
                                                  balance_classes=True,
                                                  seed=trial_idx)

            weak_classifier = \
                bayes_cls.BayesClassifier.train(samples=training_samples,
                                                labels=training_labels,
                                                naive=True,
                                                weights=None)
            weak_predictions = weak_classifier.classify(samples=test_samples)
            weak_accuracy = np.mean(weak_predictions == test_labels)
            weak_accuracies.append(weak_accuracy)

            classifier_params = {'naive': True}
            boost_classifier = boost_cls.BoostClassifier.train(
                classifier_class=bayes_cls.BayesClassifier,
                samples=training_samples,
                labels=training_labels,
                num_iters=10,
                **classifier_params)
            boost_predictions = boost_classifier.classify(samples=test_samples)
            boost_accuracy = np.mean(boost_predictions == test_labels)
            boosted_accuracies.append(boost_accuracy)

        mean_weak_accuracy = np.mean(weak_accuracies)
        std_weak_accuracy = np.std(weak_accuracies)
        mean_boosted_accuracy = np.mean(boosted_accuracies)
        std_boosted_accuracy = np.std(boosted_accuracies)

        def _format_acc(mean_, std_):
            acc_str = '{:.3f} +/- {:.3f}'.format(mean_, std_)
            return acc_str

        pretty_table.add_row([
            dataset_name.value,
            _format_acc(mean_weak_accuracy, std_weak_accuracy),
            _format_acc(mean_boosted_accuracy, std_boosted_accuracy)
        ])

    print()
    print('Mean accuracy on test set ({} trials)'.format(num_trials))
    print(pretty_table)
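
# `boost_cls.BoostClassifier.train` is not shown. A minimal sketch of the
# discrete AdaBoost loop it presumably implements (SAMME-style, so it also
# covers the multi-class datasets): each round fits a weak classifier on
# reweighted samples, scores its weighted error, and upweights the samples it
# got wrong. The `train(weights=...)`/`classify` API mirrors the calls above;
# the function names are hypothetical.
def train_boosted_sketch(classifier_class, samples, labels, num_iters=10,
                         **classifier_params):
    num_samples = len(samples)
    classes = np.unique(labels)
    weights = np.full(num_samples, 1.0 / num_samples)
    classifiers, alphas = [], []
    for _ in range(num_iters):
        classifier = classifier_class.train(samples=samples, labels=labels,
                                            weights=weights,
                                            **classifier_params)
        predictions = np.asarray(classifier.classify(samples=samples))
        missed = predictions != labels
        error = np.clip(np.sum(weights * missed), 1e-10, 1.0 - 1e-10)
        # SAMME weight; the extra log(K - 1) term vanishes for two classes.
        alpha = np.log((1.0 - error) / error) + np.log(len(classes) - 1.0)
        weights *= np.exp(alpha * missed)
        weights /= np.sum(weights)
        classifiers.append(classifier)
        alphas.append(alpha)
    return classifiers, alphas, classes


def classify_boosted_sketch(classifiers, alphas, classes, samples):
    """Alpha-weighted vote over the weak classifiers' predictions."""
    votes = np.zeros((len(samples), len(classes)))
    for classifier, alpha in zip(classifiers, alphas):
        predictions = np.asarray(classifier.classify(samples=samples))
        for class_idx, label in enumerate(classes):
            votes[:, class_idx] += alpha * (predictions == label)
    return classes[np.argmax(votes, axis=1)]
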
def assignment6p1():
    print('Assignment 6.1')
    print('Boosted trees, using the sklearn implementation: '
          'plot the boundaries.')

    for dataset_name in dataset.DatasetNames:
        samples, labels = dataset.load_dataset(dataset_name)
        pca = decomposition.PCA(n_components=2)
        pca.fit(samples)
        samples = pca.transform(samples)

        (training_samples, training_labels, test_samples,
         test_labels) = dataset.split_dataset(samples=samples,
                                              labels=labels,
                                              train_fraction=0.7,
                                              balance_classes=True,
                                              seed=0)

        weak_classifier = \
            tree_cls.SklearnDecisionTreeClassifierWrapper.train(
                samples=training_samples,
                labels=training_labels,
                weights=None)
        weak_predictions = weak_classifier.classify(samples=test_samples)
        weak_accuracy = np.mean(weak_predictions == test_labels)

        boost_classifier = boost_cls.BoostClassifier.train(
            classifier_class=tree_cls.SklearnDecisionTreeClassifierWrapper,
            samples=training_samples,
            labels=training_labels,
            num_iters=10)
        boost_predictions = boost_classifier.classify(samples=test_samples)
        boost_accuracy = np.mean(boost_predictions == test_labels)

        # Plot the classification of the test samples.
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
        plotting.plot_samples_2d(ax=ax1,
                                 samples=test_samples,
                                 labels=test_labels)
        ax1.legend()
        ax1.set_title('Ground truth')

        plotting.plot_samples_2d(ax=ax2,
                                 samples=test_samples,
                                 labels=weak_predictions)
        plotting.plot_boundaries(ax=ax2,
                                 classifier=weak_classifier,
                                 grid_size=1000)
        ax2.legend()
        ax2.set_title('Weak classifier')

        plotting.plot_samples_2d(ax=ax3,
                                 samples=test_samples,
                                 labels=boost_predictions)
        plotting.plot_boundaries(ax=ax3,
                                 classifier=boost_classifier,
                                 grid_size=1000)
        ax3.legend()
        ax3.set_title('Boosted classifier')

        fig.suptitle('Assignment 6.1\n'
                     'Dataset: {}\n'
                     'Weak classifier accuracy: {:.3f}\n'
                     'Boosted classifier accuracy: {:.3f}'.format(
                         dataset_name.value, weak_accuracy, boost_accuracy))

        plt.show()
        plt.close()
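
# `tree_cls.SklearnDecisionTreeClassifierWrapper` is project-local as well. A
# plausible sketch: a thin adapter around
# `sklearn.tree.DecisionTreeClassifier` that exposes the `train`/`classify`
# interface used above and forwards the boosting weights via `sample_weight`.
# The `max_depth` value is an illustrative guess.
from sklearn import tree as sklearn_tree


class SklearnDecisionTreeClassifierWrapperSketch:

    def __init__(self, sklearn_classifier):
        self._classifier = sklearn_classifier

    @classmethod
    def train(cls, samples, labels, weights=None):
        classifier = sklearn_tree.DecisionTreeClassifier(max_depth=3)
        classifier.fit(samples, labels, sample_weight=weights)
        return cls(classifier)

    def classify(self, samples):
        return self._classifier.predict(samples)
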
def assignment3():
    print('Assignment 3')
    print('Test your Bayesian Classifier on real datasets.')

    num_trials = 100

    pretty_table = PrettyTable()
    pretty_table.field_names = ['Dataset', 'Non-naive', 'Naive']

    datasets = [
        dataset.DatasetNames.IRIS,
        dataset.DatasetNames.WINE,
        dataset.DatasetNames.VOWEL,
    ]
    for dataset_name in datasets:
        samples, labels = dataset.load_dataset(dataset_name)

        accuracies = []
        naive_accuracies = []
        for trial_idx in range(num_trials):
            (training_samples, training_labels, test_samples,
             test_labels) = dataset.split_dataset(samples=samples,
                                                  labels=labels,
                                                  train_fraction=0.5,
                                                  balance_classes=True,
                                                  seed=trial_idx)

            # Non-naive classifier.
            classifier = bc.BayesClassifier.train(samples=training_samples,
                                                  labels=training_labels,
                                                  naive=False)
            test_predictions = classifier.classify(samples=test_samples)
            test_accuracy = bc.evaluate_accuracy(predictions=test_predictions,
                                                 labels=test_labels)
            accuracies.append(test_accuracy)

            # Naive classifier.
            naive_classifier = \
                bc.BayesClassifier.train(samples=training_samples,
                                         labels=training_labels,
                                         naive=True)
            naive_test_predictions = \
                naive_classifier.classify(samples=test_samples)
            naive_test_accuracy = bc.evaluate_accuracy(
                predictions=naive_test_predictions, labels=test_labels)
            naive_accuracies.append(naive_test_accuracy)

        mean_accuracy = np.mean(accuracies)
        std_accuracy = np.std(accuracies)
        naive_mean_accuracy = np.mean(naive_accuracies)
        naive_std_accuracy = np.std(naive_accuracies)

        def _format_acc(mean_, std_):
            acc_str = '{:.3f} +/- {:.3f}'.format(mean_, std_)
            return acc_str

        pretty_table.add_row([
            dataset_name.value,
            _format_acc(mean_accuracy, std_accuracy),
            _format_acc(naive_mean_accuracy, naive_std_accuracy)
        ])

    print()
    print('Mean accuracy on test set ({} trials)'.format(num_trials))
    print(pretty_table)
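
# For context on what `bc.BayesClassifier.train` presumably estimates (the
# plotting calls above reference `classifier.mu` and `classifier.sigma`): the
# per-class maximum-likelihood mean and covariance plus the class priors,
# with `naive=True` keeping only the covariance diagonal, i.e. treating the
# features as independent. An illustrative reconstruction, not the project's
# code:
def estimate_bayes_parameters_sketch(samples, labels, naive=False):
    classes = np.unique(labels)
    mu, sigma, priors = [], [], []
    for label in classes:
        class_samples = samples[labels == label]
        mu.append(np.mean(class_samples, axis=0))
        # bias=True gives the ML estimate (divide by N rather than N - 1).
        covariance = np.cov(class_samples, rowvar=False, bias=True)
        if naive:
            covariance = np.diag(np.diag(covariance))
        sigma.append(covariance)
        priors.append(len(class_samples) / len(samples))
    return np.array(mu), np.array(sigma), np.array(priors)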