def test_unequal_split(self):
    """A 0.3 split of four samples yields one sample, then the remaining three."""
    data = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]])
    head, tail = data.split(0.3)
    self.assertEqual(head.num_samples(), 1)
    assert_that(head, equals_dataset([[1, 2]]))
    self.assertEqual(tail.num_samples(), 3)
    assert_that(tail, equals_dataset([[3, 4], [5, 6], [7, 8]]))
def test_split_0(self):
    """Splitting at 0 leaves the first part empty and the second part whole."""
    data = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]])
    empty_part, full_part = data.split(0)
    self.assertEqual(empty_part.num_samples(), 0)
    assert_that(empty_part, equals_dataset([]))
    self.assertEqual(full_part.num_samples(), 4)
    assert_that(full_part, equals_dataset([[1, 2], [3, 4], [5, 6], [7, 8]]))
def test_as_dataset(self):
    """as_dataset on an existing DataSet returns the very same object (no copy),
    so a mutation through one reference is visible through the other."""
    original = DataSet([[1, 2], [3, 4], [5, 6]])
    dataset = as_dataset(original)
    self.assertTrue(dataset is original)
    # Fixed: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
    # .loc is the label-based equivalent for this default integer index.
    dataset._dataframe.loc[1] = 1
    assert_that(dataset, equals_dataset([[1, 2], [1, 1], [5, 6]]))
    assert_that(original, equals_dataset([[1, 2], [1, 1], [5, 6]]))
def test_contruct_dataset_from_dataset(self):
    """Constructing a DataSet from another DataSet makes an independent copy:
    mutating the new one must not affect the original."""
    original = DataSet([[1, 2], [3, 4], [5, 6]])
    new = DataSet(original)
    self.assertFalse(new is original)
    # Fixed: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
    # .loc is the label-based equivalent for this default integer index.
    new._dataframe.loc[1] = 1
    assert_that(new, equals_dataset([[1, 2], [1, 1], [5, 6]]))
    assert_that(original, equals_dataset([[1, 2], [3, 4], [5, 6]]))
def test_copy(self):
    """copy() yields a fully independent DataSet: changing the copy's data
    leaves both the original's data and its labels untouched."""
    dataset1 = DataSet([[1, 2], [3, 4]], labels=pd.Series(["a", "b"]))
    dataset2 = dataset1.copy()
    dataset2.set_column(1, pd.Series([4, 5]))
    assert_that(dataset2, equals_dataset([[1, 4], [3, 5]]))
    assert_that(dataset2.get_labels(), equals_series({0: "a", 1: "b"}))
    assert_that(dataset1, equals_dataset([[1, 2], [3, 4]]))
    # Fixed copy-paste bug: this previously re-asserted dataset2's labels;
    # the intent is to verify the ORIGINAL's labels are unchanged.
    assert_that(dataset1.get_labels(), equals_series({0: "a", 1: "b"}))
def test_copy_no_labels(self):
    """Copying an unlabelled DataSet keeps both copies unlabelled and independent."""
    source = DataSet([[1, 2], [3, 4]])
    duplicate = source.copy()
    duplicate.set_column(1, pd.Series([4, 5]))
    assert_that(duplicate, equals_dataset([[1, 4], [3, 5]]))
    self.assertFalse(duplicate.is_labelled())
    # The original must be untouched by the mutation above.
    assert_that(source, equals_dataset([[1, 2], [3, 4]]))
    self.assertFalse(source.is_labelled())
def test_drop_column(self):
    """drop_column removes one feature and returns a new DataSet,
    leaving the source DataSet intact."""
    source = DataSet([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    self.assertEqual(source.num_features(), 3)
    result = source.drop_column(1)
    self.assertEqual(result.num_features(), 2)
    assert_that(result, equals_dataset([[1, 3], [4, 6], [7, 9]]))
    # make sure original unchanged
    self.assertEqual(source.num_features(), 3)
    assert_that(source, equals_dataset([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
def test_otago_example(self):
    """PCA to two components on the Otago sample data matches the
    pre-computed transformation (to two decimal places)."""
    otago = self.create_otago_dataset()
    expected = self.get_transformed_otago_data()
    components = pca.pca(otago, 2)
    assert_that(components, equals_dataset(expected, places=2))
def test_slice_features_list_indices(self):
    """slice_features with a list of integer indices keeps just those
    columns and carries the labels through."""
    frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    gender_labels = ["m", "f", "m"]
    full = DataSet(frame, labels=gender_labels)
    subset = full.slice_features([1, 2])
    assert_that(subset, equals_dataset([[2, 3], [5, 6], [8, 9]]))
    assert_that(subset.feature_list(), contains(1, 2))
    assert_that(subset.get_labels(), contains(*gender_labels))
def test_bin_feature(self):
    """bin replaces a feature's values with integer bin indices given the
    bin boundaries; other features are left alone."""
    frame = pd.DataFrame([[0, 1], [7, 2], [6, 3]],
                         columns=["MATH100", "PHYS125"])
    binned = DataSet(frame)
    binned.bin("MATH100", [4, 7])
    assert_that(binned, equals_dataset([[0, 1], [2, 2], [1, 3]]))
def test_bin_feature_floats(self):
    """Binning a float-valued feature with bin_names substitutes the
    matching name for each value."""
    frame = pd.DataFrame([[3.5, 1], [9.1, 2], [6.2, 3]],
                         columns=["MATH100", "PHYS125"])
    binned = DataSet(frame)
    binned.bin("MATH100", [3.9, 7], bin_names=["low", "mid", "high"])
    assert_that(binned, equals_dataset([["low", 1], ["high", 2], ["mid", 3]]))
def test_bin_feature_1_boundary(self):
    """A single boundary produces exactly two bins."""
    frame = pd.DataFrame([[0, 1], [9, 2], [6, 3]],
                         columns=["MATH100", "PHYS125"])
    binned = DataSet(frame)
    binned.bin("MATH100", [3], bin_names=["low", "high"])
    assert_that(binned,
                equals_dataset([["low", 1], ["high", 2], ["high", 3]]))
def test_bin_all(self):
    """Passing "*" as the feature name bins every column with the same
    boundaries and names."""
    frame = pd.DataFrame([[0, 6], [9, 2], [6, 4]],
                         columns=["MATH100", "PHYS125"])
    binned = DataSet(frame)
    binned.bin("*", [4, 7], bin_names=["low", "mid", "high"])
    assert_that(binned,
                equals_dataset([["low", "mid"], ["high", "low"],
                                ["mid", "mid"]]))
def test_slice_features_list_string(self):
    """slice_features also accepts a list of column names, keeping those
    columns and carrying the labels through."""
    frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["weight", "height", "age"])
    gender_labels = ["m", "f", "m"]
    full = DataSet(frame, labels=gender_labels)
    subset = full.slice_features(["weight", "height"])
    assert_that(subset, equals_dataset([[1, 2], [4, 5], [7, 8]]))
    assert_that(subset.feature_list(), contains("weight", "height"))
    assert_that(subset.get_labels(), contains(*gender_labels))
def test_remove_means(self):
    """remove_means subtracts each column's mean in place.
    Column means here are 3.6667, 1.6667 and 4."""
    centered = DataSet([[4, 1, 9], [2, 3, 0], [5, 1, 3]])
    pca.remove_means(centered)
    expected = [[0.33, -0.67, 5], [-1.67, 1.33, -4], [1.33, -0.67, -1]]
    assert_that(centered, equals_dataset(expected, places=2))
def test_drop_empty_samples(self):
    """drop_empty_samples removes rows that are entirely NaN, keeping rows
    that are only partially missing, along with their labels.

    Fixed: np.NAN was removed in NumPy 2.0; np.nan is the supported spelling.
    """
    df = pd.DataFrame([[1, 2, np.nan],
                       [np.nan, np.nan, np.nan],
                       [7, 8, 9]])
    original = DataSet(df, labels=["a", "b", "c"])
    filtered = original.drop_empty_samples()
    assert_that(filtered.feature_list(), has_length(3))
    assert_that(filtered.num_samples(), equal_to(2))
    assert_that(filtered, equals_dataset([[1, 2, np.nan], [7, 8, 9]]))
    assert_that(filtered.get_labels(), contains("a", "c"))
def test_filter_by_label(self):
    """label_filter with a single label keeps only the samples carrying it."""
    columns = ["name", "hair colour"]
    frame = pd.DataFrame(
        [["Bill", "brown"], ["Bob", "black"], ["Jim", "brown"]],
        columns=columns)
    labelled = DataSet(frame, labels=["SENG", "SENG", "CENG"])
    seng_only = labelled.label_filter("SENG")
    assert_that(seng_only,
                equals_dataset([["Bill", "brown"], ["Bob", "black"]]))
    assert_that(seng_only.get_labels(),
                equals_series({0: "SENG", 1: "SENG"}))
def test_filter_by_feature_value(self):
    """value_filter keeps the samples whose feature equals the given value,
    preserving the feature list and the original sample ids."""
    columns = ["name", "hair colour"]
    frame = pd.DataFrame(
        [["Bill", "brown"], ["Bob", "black"], ["Jim", "brown"]],
        columns=columns)
    people = DataSet(frame)
    brown_haired = people.value_filter("hair colour", "brown")
    self.assertEqual(brown_haired.feature_list(), columns)
    assert_that(brown_haired.get_sample_ids(), contains(0, 2))
    assert_that(brown_haired,
                equals_dataset([["Bill", "brown"], ["Jim", "brown"]]))
def test_normalize_features(self):
    """normalize_features matches the pre-computed normalized dataset.

    The last column of the file is the training target, so we omit it
    from the features by loading it as labels.
    """
    dataset = loader.load(self.relative_to_base("datasets/ex1data2.txt"),
                          has_ids=False, has_header=False, has_labels=True)
    dataset.normalize_features()
    expected = np.loadtxt(
        self.relative_to_base("datasets/ex1data2norm.txt"), delimiter=",")
    assert_that(dataset, equals_dataset(expected.tolist(), places=15))
def test_get_labelled_rows(self):
    """get_rows on a labelled DataSet keeps the selected samples' labels
    and their original indices."""
    labelled = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                       labels=["a", "a", "b", "b"])
    picked = labelled.get_rows([1, 3])
    self.assertEqual(picked.num_samples(), 2)
    self.assertTrue(picked.is_labelled())
    # TODO incorporate labels equals_series into DataSet matcher?
    assert_that(picked, equals_dataset([[3, 4], [7, 8]]))
    assert_that(picked.get_labels(), equals_series({1: "a", 3: "b"}))
def test_drop_empty_samples_original_unchanged(self):
    """drop_empty_samples returns an independent DataSet: mutating the
    filtered result's data and labels leaves the original untouched.

    Fixed: np.NAN was removed in NumPy 2.0; np.nan is the supported spelling.
    """
    data_list = [[1, 2, np.nan], [np.nan, np.nan, np.nan], [7, 8, 9]]
    label_list = ["a", "b", "c"]
    original = DataSet(pd.DataFrame(data_list), labels=label_list)
    filtered = original.drop_empty_samples()
    filtered.set_column(0, [-1, -1])
    filtered.labels[0] = "z"
    assert_that(original, equals_dataset(data_list))
    assert_that(original.get_labels(), contains(*label_list))
def test_filter_by_multiple_labels(self):
    """label_filter with a list keeps every sample whose label is in the
    list, preserving original sample indices."""
    columns = ["name", "hair colour"]
    frame = pd.DataFrame(
        [["Will", "black"], ["Rob", "blonde"], ["Bill", "brown"],
         ["Bob", "black"], ["Jim", "brown"]],
        columns=columns)
    labelled = DataSet(frame,
                       labels=["ELEC", "SENG", "ELEC", "CENG", "SENG"])
    software = labelled.label_filter(["SENG", "CENG"])
    assert_that(software,
                equals_dataset([["Rob", "blonde"], ["Bob", "black"],
                                ["Jim", "brown"]]))
    assert_that(software.get_labels(),
                equals_series({1: "SENG", 3: "CENG", 4: "SENG"}))
def test_fill_missing_with_feature_means(self):
    """Each NaN is replaced, in place, by its column's mean computed from
    the non-missing values (col 0: 3.5, col 1: 5, col 2: 10).

    Fixed: np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
    """
    dataset = DataSet([[2, np.nan, np.nan],
                       [np.nan, 6, 10],
                       [5, 4, np.nan]])
    dataset.fill_missing_with_feature_means()
    assert_that(dataset,
                equals_dataset([[2, 5, 10], [3.5, 6, 10], [5, 4, 10]]))
def test_get_rows(self):
    """get_rows returns just the samples at the requested indices."""
    full = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]])
    picked = full.get_rows([1, 3])
    self.assertEqual(picked.num_samples(), 2)
    assert_that(picked, equals_dataset([[3, 4], [7, 8]]))
def test_fill_missing(self):
    """fill_missing replaces every NaN, in place, with the given constant.

    Fixed: np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
    """
    dataset = DataSet([[1, np.nan, 3], [np.nan, 5, np.nan]])
    dataset.fill_missing(0)
    assert_that(dataset, equals_dataset([[1, 0, 3], [0, 5, 0]]))
def test_fill_missing_with_feature_means_feature_all_empty(self):
    """A column that is entirely NaN has no mean, so its values fall back
    to 0; partially-missing columns still use their mean (col 2: 10).

    Fixed: np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
    """
    dataset = DataSet([[2, np.nan, np.nan],
                       [7, np.nan, 10],
                       [5, np.nan, np.nan]])
    dataset.fill_missing_with_feature_means()
    assert_that(dataset,
                equals_dataset([[2, 0, 10], [7, 0, 10], [5, 0, 10]]))
def test_create_dataset_from_numpy_array(self):
    """A DataSet built from a NumPy array holds the same values as the
    list the array came from."""
    rows = [[0, 1], [2, 3]]
    built = DataSet(np.array(rows))
    assert_that(built, equals_dataset(rows))
def test_set_new_column(self):
    """set_column with an unused column key appends a new feature."""
    grid = DataSet([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    grid.set_column(3, [11, 11, 11])
    expected = [[1, 2, 3, 11], [4, 5, 6, 11], [7, 8, 9, 11]]
    assert_that(grid, equals_dataset(expected))