Ejemplo n.º 1
0
 def test_unequal_split(self):
     """Split four samples at 0.3 and check the resulting 1/3 partition.

     The first part is expected to hold only the first row and the
     second part the remaining three rows, in their original order.
     """
     expected_head = [[1, 2]]
     expected_tail = [[3, 4], [5, 6], [7, 8]]

     head, tail = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]]).split(0.3)

     self.assertEqual(head.num_samples(), 1)
     assert_that(head, equals_dataset(expected_head))
     self.assertEqual(tail.num_samples(), 3)
     assert_that(tail, equals_dataset(expected_tail))
Ejemplo n.º 2
0
 def test_split_0(self):
     """Split at 0: the first part is empty, the second holds all rows."""
     expected_first = []
     expected_second = [[1, 2], [3, 4], [5, 6], [7, 8]]

     empty_part, full_part = DataSet(
         [[1, 2], [3, 4], [5, 6], [7, 8]]).split(0)

     self.assertEqual(empty_part.num_samples(), 0)
     assert_that(empty_part, equals_dataset(expected_first))
     self.assertEqual(full_part.num_samples(), 4)
     assert_that(full_part, equals_dataset(expected_second))
Ejemplo n.º 3
0
 def test_as_dataset(self):
     """as_dataset() on a DataSet returns the very same object.

     Because no copy is made, a write through the returned reference
     must also be visible through the original one.
     """
     original = DataSet([[1, 2], [3, 4], [5, 6]])
     dataset = as_dataset(original)
     self.assertIs(dataset, original)
     # Use .loc: the .ix indexer was deprecated in pandas 0.20 and removed
     # in 1.0. Assumes the frame keeps pandas' default integer index, so
     # the label 1 addresses the second row.
     dataset._dataframe.loc[1] = 1
     assert_that(dataset, equals_dataset([[1, 2], [1, 1], [5, 6]]))
     assert_that(original, equals_dataset([[1, 2], [1, 1], [5, 6]]))
Ejemplo n.º 4
0
 def test_contruct_dataset_from_dataset(self):
     """DataSet(existing_dataset) builds an independent copy.

     The new object must be distinct from the source, and a write to the
     new object's frame must not show up in the source.
     """
     original = DataSet([[1, 2], [3, 4], [5, 6]])
     new = DataSet(original)
     self.assertIsNot(new, original)
     # Use .loc: the .ix indexer was deprecated in pandas 0.20 and removed
     # in 1.0. Assumes the frame keeps pandas' default integer index, so
     # the label 1 addresses the second row.
     new._dataframe.loc[1] = 1
     assert_that(new, equals_dataset([[1, 2], [1, 1], [5, 6]]))
     assert_that(original, equals_dataset([[1, 2], [3, 4], [5, 6]]))
Ejemplo n.º 5
0
 def test_copy(self):
     """copy() yields an independent DataSet that keeps the labels.

     Overwriting a column on the copy must leave the source data
     untouched, and both objects must still carry the same labels.
     """
     dataset1 = DataSet([[1, 2], [3, 4]], labels=pd.Series(["a", "b"]))
     dataset2 = dataset1.copy()
     dataset2.set_column(1, pd.Series([4, 5]))

     # The copy sees the new column and still has its labels.
     assert_that(dataset2, equals_dataset([[1, 4], [3, 5]]))
     assert_that(dataset2.get_labels(), equals_series({0: "a", 1: "b"}))
     # The source keeps both its data and its labels.
     assert_that(dataset1, equals_dataset([[1, 2], [3, 4]]))
     assert_that(dataset1.get_labels(), equals_series({0: "a", 1: "b"}))
Ejemplo n.º 6
0
 def test_copy_no_labels(self):
     """copy() of an unlabelled DataSet is independent and unlabelled."""
     source = DataSet([[1, 2], [3, 4]])
     duplicate = source.copy()
     replacement = pd.Series([4, 5])
     duplicate.set_column(1, replacement)

     assert_that(duplicate, equals_dataset([[1, 4], [3, 5]]))
     self.assertFalse(duplicate.is_labelled())
     # The source must not see the column written to the copy.
     assert_that(source, equals_dataset([[1, 2], [3, 4]]))
     self.assertFalse(source.is_labelled())
Ejemplo n.º 7
0
 def test_drop_column(self):
     """drop_column() returns a new DataSet without mutating the source."""
     source = DataSet([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     self.assertEqual(source.num_features(), 3)

     without_middle = source.drop_column(1)
     expected_filtered = [[1, 3], [4, 6], [7, 9]]
     self.assertEqual(without_middle.num_features(), 2)
     assert_that(without_middle, equals_dataset(expected_filtered))

     # The source dataset must still have all three columns.
     expected_source = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
     self.assertEqual(source.num_features(), 3)
     assert_that(source, equals_dataset(expected_source))
Ejemplo n.º 8
0
    def test_otago_example(self):
        """Run pca.pca() on the Otago fixture and compare with reference.

        The second argument to pca.pca() is 2 (presumably the number of
        components to keep); the result is matched against the reference
        transformed data with places=2.
        """
        otago = self.create_otago_dataset()
        expected = self.get_transformed_otago_data()

        reduced = pca.pca(otago, 2)

        assert_that(reduced, equals_dataset(expected, places=2))
Ejemplo n.º 9
0
 def test_slice_features_list_indices(self):
     """slice_features() with integer column ids keeps those columns.

     The labels are expected to be carried over to the slice unchanged.
     """
     labels = ["m", "f", "m"]
     frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     dataset = DataSet(frame, labels=labels)

     last_two = dataset.slice_features([1, 2])

     assert_that(last_two, equals_dataset([[2, 3], [5, 6], [8, 9]]))
     assert_that(last_two.feature_list(), contains(1, 2))
     assert_that(last_two.get_labels(), contains(*labels))
Ejemplo n.º 10
0
 def test_bin_feature(self):
     """bin() without bin names replaces values by integer bin ids.

     With boundaries [4, 7] the MATH100 values 0, 7 and 6 are expected
     to become 0, 2 and 1; the PHYS125 column is left as is.
     """
     frame = pd.DataFrame(
         [[0, 1], [7, 2], [6, 3]],
         columns=["MATH100", "PHYS125"])
     dataset = DataSet(frame)

     dataset.bin("MATH100", [4, 7])

     expected = [[0, 1], [2, 2], [1, 3]]
     assert_that(dataset, equals_dataset(expected))
Ejemplo n.º 11
0
 def test_bin_feature_floats(self):
     """bin() with float values and boundaries uses the given bin names.

     With boundaries [3.9, 7] the MATH100 values 3.5, 9.1 and 6.2 are
     expected to map to "low", "high" and "mid" respectively.
     """
     frame = pd.DataFrame(
         [[3.5, 1], [9.1, 2], [6.2, 3]],
         columns=["MATH100", "PHYS125"])
     dataset = DataSet(frame)

     names = ["low", "mid", "high"]
     dataset.bin("MATH100", [3.9, 7], bin_names=names)

     expected = [["low", 1], ["high", 2], ["mid", 3]]
     assert_that(dataset, equals_dataset(expected))
Ejemplo n.º 12
0
 def test_bin_feature_1_boundary(self):
     """bin() with a single boundary splits the feature into two bins.

     With boundary [3] the MATH100 values 0, 9 and 6 are expected to map
     to "low", "high" and "high".
     """
     frame = pd.DataFrame(
         [[0, 1], [9, 2], [6, 3]],
         columns=["MATH100", "PHYS125"])
     dataset = DataSet(frame)

     names = ["low", "high"]
     dataset.bin("MATH100", [3], bin_names=names)

     expected = [["low", 1], ["high", 2], ["high", 3]]
     assert_that(dataset, equals_dataset(expected))
Ejemplo n.º 13
0
 def test_bin_all(self):
     """bin("*", ...) is expected to bin every feature of the DataSet.

     Both columns are checked against the boundaries [4, 7] and the
     names "low", "mid" and "high".
     """
     frame = pd.DataFrame(
         [[0, 6], [9, 2], [6, 4]],
         columns=["MATH100", "PHYS125"])
     dataset = DataSet(frame)

     names = ["low", "mid", "high"]
     dataset.bin("*", [4, 7], bin_names=names)

     expected = [["low", "mid"], ["high", "low"], ["mid", "mid"]]
     assert_that(dataset, equals_dataset(expected))
Ejemplo n.º 14
0
 def test_slice_features_list_string(self):
     """slice_features() with feature names keeps the named columns.

     The labels are expected to be carried over to the slice unchanged.
     """
     labels = ["m", "f", "m"]
     frame = pd.DataFrame(
         [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
         columns=["weight", "height", "age"])
     dataset = DataSet(frame, labels=labels)

     body_size = dataset.slice_features(["weight", "height"])

     assert_that(body_size, equals_dataset([[1, 2], [4, 5], [7, 8]]))
     assert_that(body_size.feature_list(), contains("weight", "height"))
     assert_that(body_size.get_labels(), contains(*labels))
Ejemplo n.º 15
0
 def test_remove_means(self):
     """pca.remove_means() centres every column of the DataSet in place.

     The column means of the input are 3.6667, 1.6667 and 4, so each
     expected value is the original value minus its column mean,
     compared with places=2.
     """
     dataset = DataSet([[4, 1, 9], [2, 3, 0], [5, 1, 3]])

     pca.remove_means(dataset)

     centred = [
         [0.33, -0.67, 5],
         [-1.67, 1.33, -4],
         [1.33, -0.67, -1],
     ]
     assert_that(dataset, equals_dataset(centred, places=2))
Ejemplo n.º 16
0
    def test_drop_empty_samples(self):
        """drop_empty_samples() removes only rows that are entirely NaN.

        A row with some missing values is kept, all three features
        remain, and the label of the dropped row disappears with it.
        """
        # np.nan is the canonical spelling; the np.NAN alias was removed in
        # NumPy 2.0.
        df = pd.DataFrame([[1, 2, np.nan],
                           [np.nan, np.nan, np.nan],
                           [7, 8, 9]])
        original = DataSet(df, labels=["a", "b", "c"])

        filtered = original.drop_empty_samples()
        assert_that(filtered.feature_list(), has_length(3))
        assert_that(filtered.num_samples(), equal_to(2))
        assert_that(filtered, equals_dataset([[1, 2, np.nan], [7, 8, 9]]))
        assert_that(filtered.get_labels(), contains("a", "c"))
Ejemplo n.º 17
0
 def test_filter_by_label(self):
     """label_filter() with one label keeps only the matching samples."""
     features = ["name", "hair colour"]
     people = [["Bill", "brown"], ["Bob", "black"], ["Jim", "brown"]]
     dataset = DataSet(pd.DataFrame(people, columns=features),
                       labels=["SENG", "SENG", "CENG"])

     seng_only = dataset.label_filter("SENG")

     assert_that(seng_only,
                 equals_dataset([["Bill", "brown"], ["Bob", "black"]]))
     assert_that(seng_only.get_labels(),
                 equals_series({0: "SENG", 1: "SENG"}))
Ejemplo n.º 18
0
 def test_filter_by_feature_value(self):
     """value_filter() keeps the samples whose feature equals the value.

     The feature names and the original sample ids (0 and 2) are
     expected to survive the filtering.
     """
     features = ["name", "hair colour"]
     people = [["Bill", "brown"], ["Bob", "black"], ["Jim", "brown"]]
     dataset = DataSet(pd.DataFrame(people, columns=features))

     brown_haired = dataset.value_filter("hair colour", "brown")

     self.assertEqual(brown_haired.feature_list(), features)
     assert_that(brown_haired.get_sample_ids(), contains(0, 2))
     assert_that(brown_haired,
                 equals_dataset([["Bill", "brown"], ["Jim", "brown"]]))
Ejemplo n.º 19
0
    def test_normalize_features(self):
        """normalize_features() reproduces the stored reference values.

        The last column of ex1data2.txt is the training target, so it is
        omitted from the features by loading it as labels. The normalized
        result is compared with ex1data2norm.txt using places=15.
        """
        raw_path = self.relative_to_base("datasets/ex1data2.txt")
        dataset = loader.load(raw_path, has_ids=False, has_header=False,
                              has_labels=True)
        dataset.normalize_features()

        norm_path = self.relative_to_base("datasets/ex1data2norm.txt")
        expected = np.loadtxt(norm_path, delimiter=",")
        assert_that(dataset, equals_dataset(expected.tolist(), places=15))
Ejemplo n.º 20
0
 def test_get_labelled_rows(self):
     """get_rows() on a labelled DataSet returns rows with their labels."""
     dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                       labels=["a", "a", "b", "b"])
     picked = dataset.get_rows([1, 3])

     self.assertEqual(picked.num_samples(), 2)
     self.assertTrue(picked.is_labelled())
     # TODO incorporate labels equals_series into DataSet matcher?
     assert_that(picked, equals_dataset([[3, 4], [7, 8]]))
     # The labels are expected to stay keyed by the requested row ids.
     assert_that(picked.get_labels(), equals_series({1: "a", 3: "b"}))
Ejemplo n.º 21
0
    def test_drop_empty_samples_original_unchanged(self):
        """Edits to the result of drop_empty_samples() must not leak back.

        After overwriting a column and a label on the filtered DataSet,
        the source must still match its initial data and labels.
        """
        # np.nan is the canonical spelling; the np.NAN alias was removed in
        # NumPy 2.0.
        data_list = [[1, 2, np.nan], [np.nan, np.nan, np.nan], [7, 8, 9]]
        label_list = ["a", "b", "c"]
        original = DataSet(pd.DataFrame(data_list), labels=label_list)

        filtered = original.drop_empty_samples()
        # Mutate both the data and the labels of the filtered result.
        filtered.set_column(0, [-1, -1])
        filtered.labels[0] = "z"

        assert_that(original, equals_dataset(data_list))
        assert_that(original.get_labels(), contains(*label_list))
Ejemplo n.º 22
0
 def test_filter_by_multiple_labels(self):
     """label_filter() with a list keeps samples matching any label.

     The surviving labels are expected to stay keyed by the original
     sample ids (1, 3 and 4).
     """
     features = ["name", "hair colour"]
     people = [
         ["Will", "black"],
         ["Rob", "blonde"],
         ["Bill", "brown"],
         ["Bob", "black"],
         ["Jim", "brown"],
     ]
     majors = ["ELEC", "SENG", "ELEC", "CENG", "SENG"]
     dataset = DataSet(pd.DataFrame(people, columns=features),
                       labels=majors)

     selected = dataset.label_filter(["SENG", "CENG"])

     expected_rows = [["Rob", "blonde"], ["Bob", "black"], ["Jim", "brown"]]
     assert_that(selected, equals_dataset(expected_rows))
     assert_that(selected.get_labels(),
                 equals_series({1: "SENG", 3: "CENG", 4: "SENG"}))
Ejemplo n.º 23
0
 def test_fill_missing_with_feature_means(self):
     """Missing values are replaced by the mean of their own column.

     The means of the present values are 3.5, 5 and 10 for the three
     columns, which is what the expected data contains in place of NaN.
     """
     # np.nan is the canonical spelling; the np.NaN alias was removed in
     # NumPy 2.0.
     dataset = DataSet([[2, np.nan, np.nan],
                        [np.nan, 6, 10],
                        [5, 4, np.nan]])
     dataset.fill_missing_with_feature_means()
     assert_that(dataset, equals_dataset([[2, 5, 10],
                                          [3.5, 6, 10],
                                          [5, 4, 10]]))
Ejemplo n.º 24
0
 def test_get_rows(self):
     """get_rows() returns a DataSet holding just the requested rows."""
     source = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]])
     wanted_rows = [1, 3]

     picked = source.get_rows(wanted_rows)

     self.assertEqual(picked.num_samples(), 2)
     assert_that(picked, equals_dataset([[3, 4], [7, 8]]))
Ejemplo n.º 25
0
 def test_fill_missing(self):
     """fill_missing(0) replaces every NaN in the DataSet with 0."""
     # np.nan is the canonical spelling; the np.NaN alias was removed in
     # NumPy 2.0.
     dataset = DataSet([[1, np.nan, 3], [np.nan, 5, np.nan]])
     dataset.fill_missing(0)
     assert_that(dataset, equals_dataset([[1, 0, 3], [0, 5, 0]]))
Ejemplo n.º 26
0
 def test_fill_missing_with_feature_means_feature_all_empty(self):
     """A column with no values at all is expected to be filled with 0.

     The middle column is entirely NaN, so it has no mean to fill with;
     the last column is filled with 10, its only present value.
     """
     # np.nan is the canonical spelling; the np.NaN alias was removed in
     # NumPy 2.0.
     dataset = DataSet([[2, np.nan, np.nan],
                        [7, np.nan, 10],
                        [5, np.nan, np.nan]])
     dataset.fill_missing_with_feature_means()
     assert_that(dataset, equals_dataset([[2, 0, 10],
                                          [7, 0, 10],
                                          [5, 0, 10]]))
Ejemplo n.º 27
0
    def test_create_dataset_from_numpy_array(self):
        """A DataSet built from a numpy array matches the source rows."""
        rows = [[0, 1], [2, 3]]

        from_array = DataSet(np.array(rows))

        assert_that(from_array, equals_dataset(rows))
Ejemplo n.º 28
0
 def test_set_new_column(self):
     """set_column() with an unused column id appends a new column."""
     dataset = DataSet([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     new_values = [11, 11, 11]

     dataset.set_column(3, new_values)

     expected = [
         [1, 2, 3, 11],
         [4, 5, 6, 11],
         [7, 8, 9, 11],
     ]
     assert_that(dataset, equals_dataset(expected))