Exemple #1
0
 def test_load_unlabelled(self):
     """Loading with ``has_labels=False`` yields an unlabelled dataset.

     The fixture must report three features, must not be flagged as
     labelled, and ``get_labels()`` must return ``None``.
     """
     path = self.relative_to_base("datasets/3f_ids_header_no_labels.csv")
     dataset = load(path, has_labels=False)
     self.assertFalse(dataset.is_labelled())
     self.assertEqual(dataset.num_features(), 3)
     # assertIsNone shows the offending value on failure, whereas
     # assertTrue(x is None) only reports "False is not true".
     self.assertIsNone(dataset.get_labels())
Exemple #2
0
    def test_gradient_descent_2_parameters(self):
        """
        Check gradient descent on a one-feature dataset plus a bias column.

        Based on Assignment 1 of the free Stanford Machine Learning
        online course. Reference figures kept from the original note
        (they are not asserted by this test):

        For population = 35,000, we predict a profit of 4519.767868
        For population = 70,000, we predict a profit of 45342.450129

        Final cost: 4.483388
        """
        dataset = loader.load(self.relative_to_base("datasets/ex1data1.txt"),
                              has_ids=False,
                              has_header=False,
                              has_labels=True,
                              delimiter=",")
        # Add a constant column of ones named "bias"; theta carries a
        # matching "bias" entry.
        dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))

        learning_rate = 0.01
        # Not called `iter`, which would shadow the builtin of that name.
        num_iterations = 100

        initial_theta = pd.Series({0: 0, "bias": 0})
        theta = optimize.gradient_descent(dataset,
                                          initial_theta,
                                          learning_rate,
                                          iterations=num_iterations)

        assert_that(theta,
                    equals_series({
                        0: 0.859582,
                        "bias": -0.576556
                    }, places=6))
 def test_id3_build_tree_weekends(self):
     """ID3 on the weekends dataset must produce the expected tree."""
     data_path = self.relative_to_base("/datasets/weekends.data")
     tree = id3.build_tree(load(data_path))

     # Expected sub-trees, one per value of the root feature "weather".
     sunny_branch = {"parents": {"yes": "cinema", "no": "tennis"}}
     windy_branch = {
         "parents": {
             "yes": "cinema",
             "no": {"money": {"rich": "shopping", "poor": "cinema"}},
         }
     }
     rainy_branch = {"money": {"poor": "cinema", "rich": "stay in"}}
     expected_tree = {
         "weather": {
             "sunny": sunny_branch,
             "windy": windy_branch,
             "rainy": rainy_branch,
         }
     }

     assert_that(tree, equals_tree(expected_tree))
Exemple #4
0
    def test_gradient_descent_3_parameters(self):
        """
        Check gradient descent on a normalized two-feature dataset plus bias.

        Based on Assignment 1 of the free Stanford Machine Learning
        online course.
        """
        dataset = loader.load(self.relative_to_base("datasets/ex1data2.txt"),
                              has_ids=False,
                              has_header=False,
                              has_labels=True,
                              delimiter=",")
        dataset.normalize_features()
        # The "bias" column of ones is added after normalization.
        dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))

        learning_rate = 1.0
        # Not called `iter`, which would shadow the builtin of that name.
        num_iterations = 50

        initial_theta = pd.Series({0: 0, 1: 0, "bias": 0})
        theta = optimize.gradient_descent(dataset,
                                          initial_theta,
                                          learning_rate,
                                          iterations=num_iterations)

        assert_that(
            theta,
            equals_series(
                {
                    0: 110631.050279,
                    1: -6649.474271,
                    "bias": 340412.659574
                },
                places=6))
Exemple #5
0
    def test_id3_build_tree_weekends(self):
        """ID3 on the weekends dataset must produce the expected tree."""
        data_path = self.relative_to_base("/datasets/weekends.data")
        tree = id3.build_tree(load(data_path))

        # Expected sub-trees, one per value of the root feature "weather".
        sunny_branch = {"parents": {"yes": "cinema", "no": "tennis"}}
        windy_branch = {
            "parents": {
                "yes": "cinema",
                "no": {"money": {"rich": "shopping", "poor": "cinema"}},
            }
        }
        rainy_branch = {"money": {"poor": "cinema", "rich": "stay in"}}
        expected_tree = {
            "weather": {
                "sunny": sunny_branch,
                "windy": windy_branch,
                "rainy": rainy_branch,
            }
        }

        assert_that(tree, equals_tree(expected_tree))
 def test_load_unlabelled(self):
     """Loading with ``has_labels=False`` yields an unlabelled dataset.

     The fixture must report three features, must not be flagged as
     labelled, and ``get_labels()`` must return ``None``.
     """
     path = self.relative_to_base("datasets/3f_ids_header_no_labels.csv")
     dataset = load(path, has_labels=False)
     self.assertFalse(dataset.is_labelled())
     self.assertEqual(dataset.num_features(), 3)
     # assertIsNone shows the offending value on failure, whereas
     # assertTrue(x is None) only reports "False is not true".
     self.assertIsNone(dataset.get_labels())
    def test_gradient_descent_2_parameters(self):
        """
        Check gradient descent on a one-feature dataset plus a bias column.

        Based on Assignment 1 of the free Stanford Machine Learning
        online course. Reference figures kept from the original note
        (they are not asserted by this test):

        For population = 35,000, we predict a profit of 4519.767868
        For population = 70,000, we predict a profit of 45342.450129

        Final cost: 4.483388
        """
        dataset = loader.load(self.relative_to_base("datasets/ex1data1.txt"),
                              has_ids=False, has_header=False, has_labels=True,
                              delimiter=",")
        # Add a constant column of ones named "bias"; theta carries a
        # matching "bias" entry.
        dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))

        learning_rate = 0.01
        # Not called `iter`, which would shadow the builtin of that name.
        num_iterations = 100

        initial_theta = pd.Series({0: 0, "bias": 0})
        theta = optimize.gradient_descent(dataset, initial_theta,
                                          learning_rate,
                                          iterations=num_iterations)

        assert_that(theta, equals_series({0: 0.859582,
                                          "bias": -0.576556},
                                         places=6))
 def test_classify_play_tennis(self):
     """A rainy, cool, humid, strong-wind sample must classify as "No"."""
     data_path = self.relative_to_base("/datasets/play_tennis.data")
     classifier = DecisionTree(load(data_path, delimiter=" "))

     feature_names = ['Outlook', 'Temperature', 'Humidity', 'Wind']
     feature_values = ["Rain", "Cool", "High", "Strong"]
     sample = pd.Series(feature_values, index=feature_names)

     predicted = classifier.classify(sample)
     self.assertEqual(predicted, "No")
Exemple #9
0
 def test_classify_play_tennis(self):
     """A rainy, cool, humid, strong-wind sample must classify as "No"."""
     data_path = self.relative_to_base("/datasets/play_tennis.data")
     classifier = DecisionTree(load(data_path, delimiter=" "))

     feature_names = ['Outlook', 'Temperature', 'Humidity', 'Wind']
     feature_values = ["Rain", "Cool", "High", "Strong"]
     sample = pd.Series(feature_values, index=feature_names)

     predicted = classifier.classify(sample)
     self.assertEqual(predicted, "No")
 def test_data_has_value_not_in_training(self):
     """A sample with an unseen "Outlook" value must still classify as "Yes"."""
     data_path = self.relative_to_base("/datasets/play_tennis.data")
     classifier = DecisionTree(load(data_path, delimiter=" "))

     # NOTE: "Snowing" is not a value of Outlook seen in the training set.
     unseen_sample = pd.Series({
         "Outlook": "Snowing",
         "Temperature": "Cool",
         "Humidity": "Normal",
         "Wind": "Strong",
     })

     predicted = classifier.classify(unseen_sample)
     assert_that(predicted, equal_to("Yes"))
 def test_classify_all_weekends(self):
     """Batch classification of two weekend samples, checked by row position."""
     data_path = self.relative_to_base("/datasets/weekends.data")
     classifier = DecisionTree(load(data_path))

     columns = ['weather', 'parents', 'money']
     rows = [
         pd.Series(["windy", "no", "rich"], index=columns),
         pd.Series(["sunny", "yes", "rich"], index=columns),
     ]
     batch = DataSet(pd.DataFrame(rows))

     results = classifier.classify_all(batch)
     expected = equals_series({0: "shopping", 1: "cinema"})
     assert_that(results.get_classifications(), expected)
Exemple #12
0
    def test_normalize_features(self):
        """Normalized features must match the precomputed reference file."""
        # The last column is the training target, so it is left out of the
        # features by loading it as labels.
        raw_path = self.relative_to_base("datasets/ex1data2.txt")
        dataset = loader.load(raw_path,
                              has_ids=False,
                              has_header=False,
                              has_labels=True)
        dataset.normalize_features()

        reference_path = self.relative_to_base("datasets/ex1data2norm.txt")
        reference = np.loadtxt(reference_path, delimiter=",")
        assert_that(dataset, equals_dataset(reference.tolist(), places=15))
Exemple #13
0
 def test_load_labelled(self):
     """A default load of the ids+header CSV yields the expected labels."""
     data_path = self.relative_to_base("datasets/3f_ids_header.csv")
     dataset = load(data_path)
     self.assertTrue(dataset.is_labelled())

     # Expected label for each sample id.
     expected_labels = {"V01": "c", "V02": "b", "V03": "b", "V04": "a"}
     assert_that(dataset.get_labels(), equals_series(expected_labels))
Exemple #14
0
 def load_car_data(self):
     """
     Load the car-thefts training set together with one sample.

     Returns a ``(training_set, sample)`` tuple, where ``sample`` is a
     plain dict with "color", "type" and "origin" entries.
     """
     data_path = self.relative_to_base("datasets/car_thefts.data")
     training_set = loader.load(data_path)
     sample = dict(color="red", type="suv", origin="domestic")
     return training_set, sample
Exemple #15
0
 def test_classify_all_weekends(self):
     """Batch classification of two weekend samples, checked by row position."""
     data_path = self.relative_to_base("/datasets/weekends.data")
     classifier = DecisionTree(load(data_path))

     columns = ['weather', 'parents', 'money']
     rows = [
         pd.Series(["windy", "no", "rich"], index=columns),
         pd.Series(["sunny", "yes", "rich"], index=columns),
     ]
     batch = DataSet(pd.DataFrame(rows))

     results = classifier.classify_all(batch)
     expected = equals_series({0: "shopping", 1: "cinema"})
     assert_that(results.get_classifications(), expected)
Exemple #16
0
    def test_data_has_value_not_in_training(self):
        """A sample with an unseen "Outlook" value must still classify as "Yes"."""
        data_path = self.relative_to_base("/datasets/play_tennis.data")
        classifier = DecisionTree(load(data_path, delimiter=" "))

        # NOTE: "Snowing" is not a value of Outlook seen in the training set.
        unseen_sample = pd.Series({"Outlook": "Snowing",
                                   "Temperature": "Cool",
                                   "Humidity": "Normal",
                                   "Wind": "Strong"})

        predicted = classifier.classify(unseen_sample)
        assert_that(predicted, equal_to("Yes"))
Exemple #17
0
    def test_id3_build_tree_marine_animals(self):
        """ID3 on the marine-animal dataset must produce the expected tree."""
        data_path = self.relative_to_base("/datasets/marine_animal.data")
        tree = id3.build_tree(load(data_path))

        # Expected sub-tree under the "yes" value of "no_surfacing".
        flippers_subtree = {"has_flippers": {"no": "no", "yes": "yes"}}
        expected_tree = {
            "no_surfacing": {"no": "no", "yes": flippers_subtree}
        }

        assert_that(tree, equals_tree(expected_tree))
 def test_id3_build_tree_marine_animals(self):
     """ID3 on the marine-animal dataset must produce the expected tree."""
     data_path = self.relative_to_base("/datasets/marine_animal.data")
     tree = id3.build_tree(load(data_path))

     # Expected sub-tree under the "yes" value of "no_surfacing".
     flippers_subtree = {"has_flippers": {"no": "no", "yes": "yes"}}
     expected_tree = {
         "no_surfacing": {"no": "no", "yes": flippers_subtree}
     }

     assert_that(tree, equals_tree(expected_tree))
    def test_gradient_descent_3_parameters(self):
        """
        Check gradient descent on a normalized two-feature dataset plus bias.

        Based on Assignment 1 of the free Stanford Machine Learning
        online course.
        """
        dataset = loader.load(self.relative_to_base("datasets/ex1data2.txt"),
                              has_ids=False, has_header=False, has_labels=True,
                              delimiter=",")
        dataset.normalize_features()
        # The "bias" column of ones is added after normalization.
        dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))

        learning_rate = 1.0
        # Not called `iter`, which would shadow the builtin of that name.
        num_iterations = 50

        initial_theta = pd.Series({0: 0, 1: 0, "bias": 0})
        theta = optimize.gradient_descent(dataset, initial_theta,
                                          learning_rate,
                                          iterations=num_iterations)

        assert_that(theta, equals_series({0: 110631.050279,
                                          1: -6649.474271,
                                          "bias": 340412.659574},
                                         places=6))
 def test_id3_build_tree_play_tennis(self):
     """ID3 on the play-tennis dataset must produce the expected tree."""
     data_path = self.relative_to_base("/datasets/play_tennis.data")
     tree = id3.build_tree(load(data_path, delimiter=" "))

     # Expected branches, one per value of the root feature "Outlook".
     sunny_branch = {"Humidity": {"High": "No", "Normal": "Yes"}}
     rain_branch = {"Wind": {"Strong": "No", "Weak": "Yes"}}
     expected_tree = {
         "Outlook": {
             "Sunny": sunny_branch,
             "Overcast": "Yes",
             "Rain": rain_branch,
         }
     }

     assert_that(tree, equals_tree(expected_tree))
Exemple #21
0
    def test_id3_build_tree_play_tennis(self):
        """ID3 on the play-tennis dataset must produce the expected tree."""
        data_path = self.relative_to_base("/datasets/play_tennis.data")
        tree = id3.build_tree(load(data_path, delimiter=" "))

        # Expected branches, one per value of the root feature "Outlook".
        sunny_branch = {"Humidity": {"High": "No", "Normal": "Yes"}}
        rain_branch = {"Wind": {"Strong": "No", "Weak": "Yes"}}
        expected_tree = {
            "Outlook": {
                "Sunny": sunny_branch,
                "Overcast": "Yes",
                "Rain": rain_branch,
            }
        }

        assert_that(tree, equals_tree(expected_tree))
 def test_id3_choose_feature_to_split(self):
     """The feature chosen to split the weekends dataset must be "weather"."""
     data_path = self.relative_to_base("/datasets/weekends.data")
     chosen_feature = id3.choose_feature_to_split(load(data_path))
     self.assertEqual(chosen_feature, "weather")
Exemple #23
0
 def test_load_has_ids(self):
     """A default load of the ids+header CSV gives 3 features and 4 samples."""
     data_path = self.relative_to_base("datasets/3f_ids_header.csv")
     loaded = load(data_path)
     self.assertEqual(loaded.num_features(), 3)
     self.assertEqual(loaded.num_samples(), 4)
Exemple #24
0
 def test_load_tsv(self):
     """A tab-delimited file without ids loads as 3 features and 4 samples."""
     data_path = self.relative_to_base("datasets/3f_header.tsv")
     loaded = load(data_path, delimiter="\t", has_ids=False)
     self.assertEqual(loaded.num_features(), 3)
     self.assertEqual(loaded.num_samples(), 4)
Exemple #25
0
 def test_load_csv_no_header(self):
     """A CSV with no header and no ids loads as 3 features and 4 samples."""
     data_path = self.relative_to_base("datasets/3f_no_header.csv")
     loaded = load(data_path, has_header=False, has_ids=False)
     self.assertEqual(loaded.num_features(), 3)
     self.assertEqual(loaded.num_samples(), 4)
Exemple #26
0
 def test_load_csv_no_header(self):
     """A CSV with no header and no ids loads as 3 features and 4 samples."""
     data_path = self.relative_to_base("datasets/3f_no_header.csv")
     loaded = load(data_path, has_header=False, has_ids=False)
     self.assertEqual(loaded.num_features(), 3)
     self.assertEqual(loaded.num_samples(), 4)
Exemple #27
0
 def test_load_tsv(self):
     """A tab-delimited file without ids loads as 3 features and 4 samples."""
     data_path = self.relative_to_base("datasets/3f_header.tsv")
     loaded = load(data_path, delimiter="\t", has_ids=False)
     self.assertEqual(loaded.num_features(), 3)
     self.assertEqual(loaded.num_samples(), 4)
Exemple #28
0
 def test_id3_choose_feature_to_split(self):
     """The feature chosen to split the weekends dataset must be "weather"."""
     data_path = self.relative_to_base("/datasets/weekends.data")
     chosen_feature = id3.choose_feature_to_split(load(data_path))
     self.assertEqual(chosen_feature, "weather")
Exemple #29
0
 def test_load_has_ids(self):
     """A default load of the ids+header CSV gives 3 features and 4 samples."""
     data_path = self.relative_to_base("datasets/3f_ids_header.csv")
     loaded = load(data_path)
     self.assertEqual(loaded.num_features(), 3)
     self.assertEqual(loaded.num_samples(), 4)
Exemple #30
0
 def test_load_labelled(self):
     """A default load of the ids+header CSV yields the expected labels."""
     data_path = self.relative_to_base("datasets/3f_ids_header.csv")
     dataset = load(data_path)
     self.assertTrue(dataset.is_labelled())

     # Expected label for each sample id.
     expected_labels = {
         "V01": "c",
         "V02": "b",
         "V03": "b",
         "V04": "a",
     }
     assert_that(dataset.get_labels(), equals_series(expected_labels))
Exemple #31
0
 def test_classify_weekends(self):
     """A windy / no-parents / rich sample must classify as "shopping"."""
     data_path = self.relative_to_base("/datasets/weekends.data")
     classifier = DecisionTree(load(data_path))

     feature_names = ['weather', 'parents', 'money']
     sample = pd.Series(["windy", "no", "rich"], index=feature_names)

     predicted = classifier.classify(sample)
     self.assertEqual(predicted, "shopping")
 def test_classify_weekends(self):
     """A windy / no-parents / rich sample must classify as "shopping"."""
     data_path = self.relative_to_base("/datasets/weekends.data")
     classifier = DecisionTree(load(data_path))

     feature_names = ['weather', 'parents', 'money']
     sample = pd.Series(["windy", "no", "rich"], index=feature_names)

     predicted = classifier.classify(sample)
     self.assertEqual(predicted, "shopping")