Ejemplo n.º 1
0
 def test_has_missing_values(self):
     """has_missing_values() is true for data with NaNs, false without."""
     # np.nan is the canonical spelling; the np.NaN alias was removed in
     # NumPy 2.0.
     dataset1 = DataSet([[4.2, np.nan, 3.1], [2.5, 1.9, np.nan],
                         [1.1, 1.2, 1.7]])
     self.assertTrue(dataset1.has_missing_values())

     # Same shape, but every cell holds a number.
     dataset2 = DataSet([[4.2, 3.9, 3.1], [2.5, 1.9, 2.2], [1.1, 1.2, 1.7]])
     self.assertFalse(dataset2.has_missing_values())
Ejemplo n.º 2
0
 def test_split_0(self):
     """split(0) leaves the first part empty and puts all rows in the second."""
     data = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]])
     head, tail = data.split(0)

     # First part: no samples at all.
     self.assertEqual(head.num_samples(), 0)
     assert_that(head, equals_dataset([]))

     # Second part: every original row, in order.
     self.assertEqual(tail.num_samples(), 4)
     assert_that(tail, equals_dataset([[1, 2], [3, 4], [5, 6], [7, 8]]))
Ejemplo n.º 3
0
 def test_contruct_dataset_from_dataset(self):
     """Building a DataSet from another DataSet yields an independent copy."""
     original = DataSet([[1, 2], [3, 4], [5, 6]])
     new = DataSet(original)
     self.assertIsNot(new, original)
     # Overwrite the row labelled 1 in the new dataset only. .loc replaces
     # the .ix indexer, which was removed in pandas 1.0.
     new._dataframe.loc[1] = 1
     assert_that(new, equals_dataset([[1, 2], [1, 1], [5, 6]]))
     # The source dataset must not see the edit made through the copy.
     assert_that(original, equals_dataset([[1, 2], [3, 4], [5, 6]]))
Ejemplo n.º 4
0
 def test_get_row(self):
     """get_row() returns the requested row; writes to it reach the dataset."""
     data = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]])
     second_row = data.get_row(1)
     assert_that(second_row.values, contains(3, 4))

     # Overwrite every element of the selected row, then fetch the row
     # again from the dataset to confirm the change shows up there too.
     second_row[:] = 1
     refetched = data.get_row(1)
     assert_that(refetched, contains(1, 1))
Ejemplo n.º 5
0
 def test_split_random(self):
     """A random 0.5 split of four samples gives two parts of two samples."""
     data = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]])
     part_a, part_b = data.split(0.5, random=True)
     # Which rows land in which part is nondeterministic, so only the
     # size of each part is checked.
     for part in (part_a, part_b):
         self.assertEqual(part.num_samples(), 2)
Ejemplo n.º 6
0
 def test_get_label_value_counts(self):
     """get_label_value_counts() tallies labels; index order is b, a, c."""
     samples = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]
     labelled = DataSet(samples, labels=["a", "b", "b", "c", "a", "b"])

     counts = labelled.get_label_value_counts()

     assert_that(counts, equals_series({"a": 2, "b": 3, "c": 1}))
     # The index must come back in exactly this order:
     # "b" (3 samples), "a" (2 samples), "c" (1 sample).
     assert_that(counts.index, contains("b", "a", "c"))
Ejemplo n.º 7
0
 def test_unequal_split(self):
     """split(0.3) on four samples gives one row first and three second."""
     data = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]])
     head, tail = data.split(0.3)

     # First part: just the leading row.
     self.assertEqual(head.num_samples(), 1)
     assert_that(head, equals_dataset([[1, 2]]))

     # Second part: the remaining three rows, in order.
     self.assertEqual(tail.num_samples(), 3)
     assert_that(tail, equals_dataset([[3, 4], [5, 6], [7, 8]]))
Ejemplo n.º 8
0
 def test_split_labelled(self):
     """Both halves of a split labelled dataset keep their samples' labels."""
     labelled = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                        labels=["b", "b", "b", "a"])
     head, tail = labelled.split(0.5)

     # Labels stay keyed by the original sample ids (0, 1).
     self.assertTrue(head.is_labelled())
     head_labels = head.get_labels()
     assert_that(head_labels, equals_series({0: "b", 1: "b"}))

     # Likewise for the second half (ids 2, 3).
     self.assertTrue(tail.is_labelled())
     tail_labels = tail.get_labels()
     assert_that(tail_labels, equals_series({2: "b", 3: "a"}))
Ejemplo n.º 9
0
 def test_filter_by_feature_value_with_labels(self):
     """value_filter() keeps the labels of the samples that match."""
     column_names = ["name", "hair colour"]
     people = pd.DataFrame(
         [["Bill", "brown"], ["Bob", "black"], ["Jim", "brown"]],
         columns=column_names)
     labelled = DataSet(people, labels=["SENG", "SENG", "CENG"])

     brown_haired = labelled.value_filter("hair colour", "brown")

     # Samples 0 and 2 have brown hair; their labels keep the original ids.
     kept_labels = brown_haired.get_labels()
     assert_that(kept_labels, equals_series({0: "SENG", 2: "CENG"}))
Ejemplo n.º 10
0
 def test_bin_feature(self):
     """bin() with boundaries [4, 7] turns MATH100 values into bin numbers."""
     grades = pd.DataFrame([[0, 1], [7, 2], [6, 3]],
                           columns=["MATH100", "PHYS125"])
     data = DataSet(grades)

     boundaries = [4, 7]
     data.bin("MATH100", boundaries)

     # Expected MATH100 column: 0 -> 0, 7 -> 2, 6 -> 1. PHYS125 is unchanged.
     expected_rows = [[0, 1], [2, 2], [1, 3]]
     assert_that(data, equals_dataset(expected_rows))
Ejemplo n.º 11
0
 def test_slice_features_list_indices(self):
     """slice_features() by column index keeps those columns and all labels."""
     labels = ["m", "f", "m"]
     frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     data = DataSet(frame, labels=labels)

     subset = data.slice_features([1, 2])

     # Only columns 1 and 2 remain, still named by their original indices.
     assert_that(subset, equals_dataset([[2, 3], [5, 6], [8, 9]]))
     assert_that(subset.feature_list(), contains(1, 2))
     # No rows were dropped, so every label is still present, in order.
     assert_that(subset.get_labels(), contains(*labels))
Ejemplo n.º 12
0
 def test_get_row_by_id(self):
     """get_row() accepts a sample id as well as a row position."""
     frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          index=["V01", "V02", "V03"])
     data = DataSet(frame)

     by_id = data.get_row("V02")
     assert_that(by_id, contains(4, 5, 6))

     # Positional lookup must keep working when the index holds string ids.
     by_position = data.get_row(1)
     assert_that(by_position, contains(4, 5, 6))
Ejemplo n.º 13
0
 def test_bin_all(self):
     """bin("*", ...) bins every feature and applies the given bin names."""
     grades = pd.DataFrame([[0, 6], [9, 2], [6, 4]],
                           columns=["MATH100", "PHYS125"])
     data = DataSet(grades)

     names = ["low", "mid", "high"]
     data.bin("*", [4, 7], bin_names=names)

     # Both columns are expected to hold bin names instead of numbers.
     expected_rows = [["low", "mid"], ["high", "low"], ["mid", "mid"]]
     assert_that(data, equals_dataset(expected_rows))
Ejemplo n.º 14
0
    def test_get_labelled_data_frame(self):
        """get_labelled_data_frame() returns each row followed by its label."""
        dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                          labels=pd.Series(["b", "b", "b", "a"]))
        df = dataset.get_labelled_data_frame()

        # TODO: non-numeric values in DataFrame matcher
        expected = [[1, 2, "b"], [3, 4, "b"], [5, 6, "b"], [7, 8, "a"]]
        for i, expected_row in enumerate(expected):
            # assertEqual, not assertTrue: assertTrue(a, b) treats b as the
            # failure message and passes for any non-empty row, so it never
            # compared the values. .loc replaces the .ix indexer, which was
            # removed in pandas 1.0.
            self.assertEqual(df.loc[i].tolist(), expected_row)
Ejemplo n.º 15
0
 def test_copy(self):
     """copy() gives an independent dataset that carries the same labels."""
     dataset1 = DataSet([[1, 2], [3, 4]], labels=pd.Series(["a", "b"]))
     dataset2 = dataset1.copy()
     dataset2.set_column(1, pd.Series([4, 5]))

     # The copy holds the replaced column and still has the labels.
     assert_that(dataset2, equals_dataset([[1, 4], [3, 5]]))
     assert_that(dataset2.get_labels(), equals_series({0: "a", 1: "b"}))
     # The original keeps its data and its labels. This last check used to
     # re-test dataset2, leaving the original's labels unverified.
     assert_that(dataset1, equals_dataset([[1, 2], [3, 4]]))
     assert_that(dataset1.get_labels(), equals_series({0: "a", 1: "b"}))
Ejemplo n.º 16
0
 def test_bin_feature_1_boundary(self):
     """bin() with a single boundary splits MATH100 into two named bins."""
     grades = pd.DataFrame([[0, 1], [9, 2], [6, 3]],
                           columns=["MATH100", "PHYS125"])
     data = DataSet(grades)

     data.bin("MATH100", [3], bin_names=["low", "high"])

     # Expected MATH100 column: 0 -> "low"; 9 and 6 -> "high".
     # PHYS125 is unchanged.
     expected_rows = [["low", 1], ["high", 2], ["high", 3]]
     assert_that(data, equals_dataset(expected_rows))
Ejemplo n.º 17
0
 def test_bin_feature_floats(self):
     """bin() handles float values and float boundaries."""
     grades = pd.DataFrame([[3.5, 1], [9.1, 2], [6.2, 3]],
                           columns=["MATH100", "PHYS125"])
     data = DataSet(grades)

     names = ["low", "mid", "high"]
     data.bin("MATH100", [3.9, 7], bin_names=names)

     # Expected MATH100 column: 3.5 -> "low", 9.1 -> "high", 6.2 -> "mid".
     expected_rows = [["low", 1], ["high", 2], ["mid", 3]]
     assert_that(data, equals_dataset(expected_rows))
Ejemplo n.º 18
0
 def test_copy_no_labels(self):
     """copy() of an unlabelled dataset is independent and stays unlabelled."""
     source = DataSet([[1, 2], [3, 4]])
     duplicate = source.copy()
     duplicate.set_column(1, pd.Series([4, 5]))

     # The copy carries the replaced column ...
     assert_that(duplicate, equals_dataset([[1, 4], [3, 5]]))
     self.assertFalse(duplicate.is_labelled())
     # ... while the source still holds its original values.
     assert_that(source, equals_dataset([[1, 2], [3, 4]]))
     self.assertFalse(source.is_labelled())
Ejemplo n.º 19
0
    def test_drop_empty_samples(self):
        """drop_empty_samples() removes all-NaN rows along with their labels."""
        # np.nan is the canonical spelling; the np.NAN alias was removed in
        # NumPy 2.0.
        df = pd.DataFrame([[1, 2, np.nan], [np.nan, np.nan, np.nan], [7, 8, 9]])
        original = DataSet(df, labels=["a", "b", "c"])

        filtered = original.drop_empty_samples()
        # Only the all-NaN middle row is expected to go: the partially
        # filled first row and all three features remain.
        assert_that(filtered.feature_list(), has_length(3))
        assert_that(filtered.num_samples(), equal_to(2))
        assert_that(filtered, equals_dataset([[1, 2, np.nan], [7, 8, 9]]))
        # The label "b" of the dropped row is gone too.
        assert_that(filtered.get_labels(), contains("a", "c"))
Ejemplo n.º 20
0
 def test_slice_features_list_string(self):
     """slice_features() by column name keeps those columns and all labels."""
     labels = ["m", "f", "m"]
     frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          columns=["weight", "height", "age"])
     data = DataSet(frame, labels=labels)

     subset = data.slice_features(["weight", "height"])

     # The "age" column is gone; the two requested columns keep their names.
     assert_that(subset, equals_dataset([[1, 2], [4, 5], [7, 8]]))
     assert_that(subset.feature_list(), contains("weight", "height"))
     # No rows were dropped, so every label is still present, in order.
     assert_that(subset.get_labels(), contains(*labels))
Ejemplo n.º 21
0
 def test_drop_column(self):
     """drop_column() returns a dataset without the column; source is intact."""
     source = DataSet([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     self.assertEqual(source.num_features(), 3)

     trimmed = source.drop_column(1)
     self.assertEqual(trimmed.num_features(), 2)
     assert_that(trimmed, equals_dataset([[1, 3], [4, 6], [7, 9]]))

     # The dataset the column was dropped from must still have all three
     # features and its original values.
     self.assertEqual(source.num_features(), 3)
     untouched_rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
     assert_that(source, equals_dataset(untouched_rows))
Ejemplo n.º 22
0
 def test_get_labelled_rows(self):
     """get_rows() on a labelled dataset returns the rows with their labels."""
     labelled = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                        labels=["a", "a", "b", "b"])

     picked = labelled.get_rows([1, 3])

     self.assertEqual(picked.num_samples(), 2)
     self.assertTrue(picked.is_labelled())
     # TODO incorporate labels equals_series into DataSet matcher?
     assert_that(picked, equals_dataset([[3, 4], [7, 8]]))
     # Labels stay keyed by the original sample ids (1 and 3).
     picked_labels = picked.get_labels()
     assert_that(picked_labels, equals_series({1: "a", 3: "b"}))
Ejemplo n.º 23
0
 def test_filter_by_feature_value(self):
     """value_filter() keeps only the samples whose feature equals the value."""
     features = ["name", "hair colour"]
     people = pd.DataFrame(
         [["Bill", "brown"], ["Bob", "black"], ["Jim", "brown"]],
         columns=features)
     data = DataSet(people)

     brown_haired = data.value_filter("hair colour", "brown")

     # Filtering rows must not change the feature list.
     self.assertEqual(brown_haired.feature_list(), features)
     # Samples 0 and 2 match and keep their original ids.
     assert_that(brown_haired.get_sample_ids(), contains(0, 2))
     expected_rows = [["Bill", "brown"], ["Jim", "brown"]]
     assert_that(brown_haired, equals_dataset(expected_rows))
Ejemplo n.º 24
0
 def test_to_string(self):
     """repr() reports features, sample count, missing values and labelling."""
     # np.nan is the canonical spelling; the np.NaN alias was removed in
     # NumPy 2.0.
     df = pd.DataFrame([[4.2, np.nan, 3.1], [2.5, 1.9, np.nan],
                        [1.1, 1.2, 1.7]],
                       columns=["weight", "height", "length"])
     dataset = DataSet(df, labels=["cat", "bird", "bat"])
     expected = "\n".join(("Features: ['weight', 'height', 'length']",
                           "Samples: 3",
                           "Missing values? yes",
                           "Labelled? yes"))
     # Use the repr() builtin rather than calling the dunder method directly.
     self.assertEqual(expected, repr(dataset))
Ejemplo n.º 25
0
    def test_drop_empty_samples_original_unchanged(self):
        """Edits to the result of drop_empty_samples() must not reach the source."""
        # np.nan is the canonical spelling; the np.NAN alias was removed in
        # NumPy 2.0.
        data_list = [[1, 2, np.nan], [np.nan, np.nan, np.nan], [7, 8, 9]]
        label_list = ["a", "b", "c"]
        original = DataSet(pd.DataFrame(data_list), labels=label_list)

        # Change both the data and a label of the filtered dataset.
        filtered = original.drop_empty_samples()
        filtered.set_column(0, [-1, -1])
        filtered.labels[0] = "z"

        # The source dataset must still hold its initial data and labels.
        assert_that(original, equals_dataset(data_list))
        assert_that(original.get_labels(), contains(*label_list))
Ejemplo n.º 26
0
 def test_get_feature_value_counts(self):
     """get_feature_value_counts() tallies how often each value occurs."""
     people = pd.DataFrame(
         [["Jim", 19, 180], ["John", 18, 177], ["Jack", 19, 185]],
         columns=["name", "age", "height"])
     data = DataSet(people)

     # Numeric feature with one repeated value.
     age_counts = data.get_feature_value_counts("age")
     assert_that(age_counts, equals_series({19: 2, 18: 1}))

     # String feature where every value is unique.
     name_counts = data.get_feature_value_counts("name")
     expected_names = {"Jim": 1, "John": 1, "Jack": 1}
     assert_that(name_counts, equals_series(expected_names))
Ejemplo n.º 27
0
 def test_filter_by_multiple_labels(self):
     """label_filter() with a list keeps samples carrying any listed label."""
     column_names = ["name", "hair colour"]
     people = pd.DataFrame(
         [["Will", "black"], ["Rob", "blonde"], ["Bill", "brown"],
          ["Bob", "black"], ["Jim", "brown"]],
         columns=column_names)
     labelled = DataSet(people,
                        labels=["ELEC", "SENG", "ELEC", "CENG", "SENG"])

     selected = labelled.label_filter(["SENG", "CENG"])

     # The two "ELEC" samples (ids 0 and 2) are filtered out.
     expected_rows = [["Rob", "blonde"], ["Bob", "black"], ["Jim", "brown"]]
     assert_that(selected, equals_dataset(expected_rows))
     # The surviving labels stay keyed by the original sample ids.
     expected_labels = {1: "SENG", 3: "CENG", 4: "SENG"}
     assert_that(selected.get_labels(), equals_series(expected_labels))
Ejemplo n.º 28
0
 def test_get_values(self):
     """get_feature_values() returns the distinct values of a feature."""
     df = pd.DataFrame([["Jim", 19, 180], ["John", 18, 177],
                        ["Jack", 19, 185]],
                       columns=["name", "age", "height"])
     dataset = DataSet(df)

     # "age" holds 19 twice, so only two distinct values are expected.
     # assertIn reports the member and the container on failure, which a
     # bare assertTrue(x in y) does not.
     age_values = dataset.get_feature_values("age")
     self.assertEqual(len(age_values), 2)
     self.assertIn(19, age_values)
     self.assertIn(18, age_values)

     # Every "height" value is unique, so each one must be present.
     height_values = dataset.get_feature_values("height")
     self.assertIn(180, height_values)
     self.assertIn(185, height_values)
     self.assertIn(177, height_values)
Ejemplo n.º 29
0
 def test_as_dataset(self):
     """as_dataset() returns the very same object when given a DataSet."""
     original = DataSet([[1, 2], [3, 4], [5, 6]])
     dataset = as_dataset(original)
     self.assertIs(dataset, original)
     # Overwrite the row labelled 1 through the returned reference. .loc
     # replaces the .ix indexer, which was removed in pandas 1.0.
     dataset._dataframe.loc[1] = 1
     # Both names refer to one object, so the edit shows through either.
     assert_that(dataset, equals_dataset([[1, 2], [1, 1], [5, 6]]))
     assert_that(original, equals_dataset([[1, 2], [1, 1], [5, 6]]))
Ejemplo n.º 30
0
    def test_kmeans_k_3(self):
        """kmeans with k=3 and preset centroids groups 13 points into 3 clusters."""
        points = [[3, 13], [5, 13], [2, 11], [4, 11], [6, 11], [8, 5],
                  [5, 3], [6, 2], [9, 2], [16, 14], [18, 13],
                  [16, 11], [19, 10]]
        data = DataSet(points)
        # Fixed starting centroids, passed in explicitly.
        seeds = [pd.Series(xy) for xy in ([4, 9], [10, 6], [17, 9])]

        result = clustering.kmeans(data, k=3, centroids=seeds)

        # Expected assignments by sample id:
        # 0-4 -> cluster 0, 5-8 -> cluster 1, 9-12 -> cluster 2.
        expected = dict(enumerate([0] * 5 + [1] * 4 + [2] * 4))
        assert_that(result.get_cluster_assignments(), equals_series(expected))
Ejemplo n.º 31
0
 def test_classify_all(self):
     """classify_all() labels every query point using a k=3 Knn classifier."""
     labelled_points = DataSet([[1, 1], [2, 2], [11, 11], [12, 12]],
                               labels=["a", "a", "b", "b"])
     knn = Knn(labelled_points, k=3)

     # One query point near the "a" samples and one near the "b" samples.
     queries = [[1.5, 1.3], [12.2, 12.9]]
     outcome = knn.classify_all(queries)
     predicted = outcome.get_classifications()

     assert_that(predicted, contains("a", "b"))
Ejemplo n.º 32
0
    def test_slice_features_original_unchanged(self):
        """Edits to a slice_features() result must not reach the source dataset."""
        labels = ["m", "f", "m"]
        frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                             columns=["weight", "height", "age"])
        source = DataSet(frame, labels=labels)
        subset = source.slice_features(["weight", "height"])

        # Change both a column and a label on the slice.
        subset.set_column("weight", [0, 0, 0])
        subset.labels[0] = "x"

        # The slice itself must show those changes ...
        assert_that(subset.get_column("weight"), contains(0, 0, 0))
        assert_that(subset.get_labels(), contains("x", "f", "m"))

        # ... while the source keeps its original column and labels.
        assert_that(source.get_column("weight"), contains(1, 4, 7))
        assert_that(source.get_labels(), contains(*labels))
Ejemplo n.º 33
0
 def test_get_column(self):
     """get_column(1) returns the second value of every row."""
     data = DataSet([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     middle_column = data.get_column(1)
     observed = middle_column.values
     assert_that(observed, contains(2, 5, 8))
Ejemplo n.º 34
0
 def test_reduce_features(self):
     """reduce_features(min) yields the minimum of each feature column."""
     data = DataSet([[4, 9, 8], [2, 1, 7], [5, 6, 1]])
     minima = data.reduce_features(min)
     # Column minima: min(4, 2, 5) = 2, min(9, 1, 6) = 1, min(8, 7, 1) = 1.
     observed = minima.values
     assert_that(observed, contains(2, 1, 1))