Example 1
def test_inner_product_knn(self):
    knn = KNN(1, inner_product_distance)
    features = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
    values = [0, 0, 0, 1, 1, 1]
    knn.train(features, values)
    point1 = [0, 0]
    neighbor = knn.get_neighbors(point1)
    self.assertEqual(0, knn.get_response(neighbor))
    numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), neighbor)
    point2 = [10, 10]
    neighbor = knn.get_neighbors(point2)
    numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), neighbor)
    self.assertEqual(1, knn.get_response(neighbor))
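For context, here is a minimal sketch of the KNN interface this test appears to exercise, using a Euclidean distance for concreteness. The method names and the [feature..., label] row layout returned by get_neighbors are inferred from the assertions above, so the real class under test may differ.

import numpy


def euclidean_distance(a, b):
    # Plain Euclidean distance between two feature vectors.
    return sum((x - y) ** 2 for x, y in zip(a, b)) ** 0.5


class KNN:
    def __init__(self, k, distance_function):
        self.k = k
        self.distance_function = distance_function

    def train(self, features, values):
        # k-NN is lazy: "training" just stores each point as [feature..., label].
        self.data = numpy.array([list(f) + [v] for f, v in zip(features, values)])

    def get_neighbors(self, point):
        # The k closest stored rows, still in [feature..., label] form.
        distances = [self.distance_function(point, row[:-1]) for row in self.data]
        order = numpy.argsort(distances)[:self.k]
        return self.data[order]

    def get_response(self, neighbors):
        # Majority vote over the label column.
        labels = [int(row[-1]) for row in neighbors]
        return max(set(labels), key=labels.count)

    def predict(self, points):
        return [self.get_response(self.get_neighbors(p)) for p in points]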
Example 2
import numpy
from unittest import TestCase, skip

# Project-local imports: the module names below are assumptions about the
# layout of the code under test, not part of the original listing.
from knn import KNN
from utils import (MinMaxScaler, NormalizationScaler, euclidean_distance,
                   f1_score, gaussian_kernel_distance, generate_data_cancer,
                   inner_product_distance)


class TestKNN(TestCase):
    def setUp(self):
        self.knn = KNN(1, euclidean_distance)

    def test_train(self):
        features = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
        values = [0, 0, 0, 1, 1, 1]
        self.knn.train(features, values)
        point1 = [0, 0]
        neighbor = self.knn.get_neighbors(point1)
        self.assertEqual(0, self.knn.get_response(neighbor))
        numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), neighbor)
        point2 = [10, 10]
        neighbor = self.knn.get_neighbors(point2)
        numpy.testing.assert_array_equal(numpy.array([[9, 9, 1]]), neighbor)
        self.assertEqual(1, self.knn.get_response(neighbor))

    @skip("clarify inner product")
    def test_inner_product_knn(self):
        knn = KNN(1, inner_product_distance)
        features = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
        values = [0, 0, 0, 1, 1, 1]
        knn.train(features, values)
        point1 = [0, 0]
        neighbor = knn.get_neighbors(point1)
        self.assertEqual(0, knn.get_response(neighbor))
        numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), neighbor)
        point2 = [10, 10]
        neighbor = knn.get_neighbors(point2)
        numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), neighbor)
        self.assertEqual(1, knn.get_response(neighbor))

    def test_predict(self):
        features = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
        values = [0, 0, 0, 1, 1, 1]
        self.knn.train(features, values)
        points = [[0, 0], [10, 10]]
        self.assertListEqual([0, 1], self.knn.predict(points))

    def test_knn(self):
        features, labels = generate_data_cancer()
        train_features, train_labels = features[:400], labels[:400]
        valid_features, valid_labels = features[400:460], labels[400:460]
        test_features, test_labels = features[460:], labels[460:]
        assert len(train_features) == len(train_labels) == 400
        assert len(valid_features) == len(valid_labels) == 60
        assert len(test_features) == len(test_labels) == 109
        distance_funcs = {
            # 'euclidean': euclidean_distance,
            # 'gaussian': gaussian_kernel_distance,
            'inner_prod': inner_product_distance,
        }
        for name, func in distance_funcs.items():
            best_f1_score, best_k = -1, 0
            for k in [1]:
                model = KNN(k=k, distance_function=func)
                model.train(train_features, train_labels)
                # print(train_labels)
                # print(model.predict(train_features))
                train_f1_score = f1_score(train_labels,
                                          model.predict(train_features))
                valid_f1_score = f1_score(valid_labels,
                                          model.predict(valid_features))
                print(f'[part 2.1] {name}\tk: {k:d}\t'
                      f'train: {train_f1_score:.5f}\t'
                      f'valid: {valid_f1_score:.5f}')
                if valid_f1_score > best_f1_score:
                    best_f1_score, best_k = valid_f1_score, k
        model = KNN(k=best_k, distance_function=func)
        model.train(train_features + valid_features,
                    train_labels + valid_labels)
        test_f1_score = f1_score(test_labels, model.predict(test_features))
        print()
        print(f'[part 2.1] {name}\tbest_k: {best_k:d}\t'
              f'test f1 score: {test_f1_score:.5f}')
        print()

    def test_normalization(self):
        scaling_functions = {
            'min_max_scale': MinMaxScaler,
            'normalize': NormalizationScaler,
        }
        distance_funcs = {
            'euclidean': euclidean_distance,
            'gaussian': gaussian_kernel_distance,
            'inner_prod': inner_product_distance,
        }
        features, labels = generate_data_cancer()
        train_features, train_labels = features[:400], labels[:400]
        valid_features, valid_labels = features[400:460], labels[400:460]
        test_features, test_labels = features[460:], labels[460:]
        assert len(train_features) == len(train_labels) == 400
        assert len(valid_features) == len(valid_labels) == 60
        assert len(test_features) == len(test_labels) == 109
        for scaling_name, scaling_class in scaling_functions.items():
            for name, func in distance_funcs.items():
                scaler = scaling_class()
                train_features_scaled = scaler(train_features)
                valid_features_scaled = scaler(valid_features)

                best_f1_score, best_k = 0, -1
                for k in [1, 3, 10, 20, 50]:
                    model = KNN(k=k, distance_function=func)
                    model.train(train_features_scaled, train_labels)
                    train_f1_score = f1_score(
                        train_labels, model.predict(train_features_scaled))

                    valid_f1_score = f1_score(
                        valid_labels, model.predict(valid_features_scaled))
                    print(f'[part 2.2] {name}\t{scaling_name}\tk: {k:d}\t'
                          f'train: {train_f1_score:.5f}\t'
                          f'valid: {valid_f1_score:.5f}')

                    if valid_f1_score > best_f1_score:
                        best_f1_score, best_k = valid_f1_score, k

                # use a fresh scaler here, since the training set (train + valid) changes
                scaler = scaling_class()
                combined_features_scaled = scaler(train_features +
                                                  valid_features)
                test_features_scaled = scaler(test_features)

                model = KNN(k=best_k, distance_function=func)
                model.train(combined_features_scaled,
                            train_labels + valid_labels)
                test_f1_score = f1_score(test_labels,
                                         model.predict(test_features_scaled))
                print()
                print(f'[part 2.2] {name}\t{scaling_name}\t'
                      f'best_k: {best_k:d}\ttest: {test_f1_score:.5f}')
                print()
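The normalization test calls each scaler as scaler(features) and deliberately creates a fresh instance before scaling the combined train + valid set, which suggests the scalers fit their parameters on the first call and reuse them on later calls. Below is a minimal MinMaxScaler sketch under that assumption; the class name matches the test, but the body is an illustration, not the project's implementation (NormalizationScaler, commonly row-wise unit-norm scaling, would follow the same call pattern).

class MinMaxScaler:
    def __init__(self):
        self.min = None
        self.max = None

    def __call__(self, features):
        if self.min is None:
            # First call: remember per-column min/max from this (training) set.
            columns = list(zip(*features))
            self.min = [min(col) for col in columns]
            self.max = [max(col) for col in columns]
        # Later calls reuse the stored parameters, so validation/test data is
        # scaled with the training set's statistics.
        return [
            [(x - lo) / (hi - lo) if hi != lo else 0.0
             for x, lo, hi in zip(row, self.min, self.max)]
            for row in features
        ]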