def test_load_unlabelled(self):
    """Load a CSV with ``has_labels=False`` and check it is unlabelled.

    The loaded dataset must report that it is not labelled, expose three
    features, and return ``None`` from ``get_labels()``.
    """
    path = self.relative_to_base("datasets/3f_ids_header_no_labels.csv")
    dataset = load(path, has_labels=False)
    self.assertFalse(dataset.is_labelled())
    self.assertEqual(dataset.num_features(), 3)
    # assertIsNone reports the offending value on failure, whereas
    # assertTrue(x is None) can only say "False is not true".
    self.assertIsNone(dataset.get_labels())
def test_gradient_descent_2_parameters(self):
    """
    Test based on Assignment 1 of the free online Stanford Machine
    Learning course.

    Reference figures (not asserted by this test):
      For population = 35,000, we predict a profit of 4519.767868
      For population = 70,000, we predict a profit of 45342.450129
      Final cost: 4.483388
    """
    dataset = loader.load(self.relative_to_base("datasets/ex1data1.txt"),
                          has_ids=False, has_header=False, has_labels=True,
                          delimiter=",")
    # Add a constant column of ones under the name "bias".
    dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))
    learning_rate = 0.01
    # Named num_iterations rather than `iter` so the builtin is not shadowed.
    num_iterations = 100
    initial_theta = pd.Series({0: 0, "bias": 0})
    theta = optimize.gradient_descent(dataset, initial_theta, learning_rate,
                                      iterations=num_iterations)
    assert_that(theta,
                equals_series({0: 0.859582, "bias": -0.576556}, places=6))
def test_id3_build_tree_weekends(self):
    """Build a tree from weekends.data and compare it to the expected tree."""
    expected_tree = {
        "weather": {
            "sunny": {"parents": {"yes": "cinema", "no": "tennis"}},
            "windy": {
                "parents": {
                    "yes": "cinema",
                    "no": {"money": {"rich": "shopping", "poor": "cinema"}},
                },
            },
            "rainy": {"money": {"poor": "cinema", "rich": "stay in"}},
        },
    }
    data_path = self.relative_to_base("/datasets/weekends.data")
    built = id3.build_tree(load(data_path))
    assert_that(built, equals_tree(expected_tree))
def test_gradient_descent_3_parameters(self):
    """
    Test based on Assignment 1 of the free online Stanford Machine
    Learning course.

    Loads ex1data2.txt, normalizes the features, adds a constant "bias"
    column, runs 50 iterations of gradient descent at learning rate 1.0
    and compares the resulting theta to fixed expected values.
    """
    dataset = loader.load(self.relative_to_base("datasets/ex1data2.txt"),
                          has_ids=False, has_header=False, has_labels=True,
                          delimiter=",")
    dataset.normalize_features()
    # Add a constant column of ones under the name "bias".
    dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))
    learning_rate = 1.0
    # Named num_iterations rather than `iter` so the builtin is not shadowed.
    num_iterations = 50
    initial_theta = pd.Series({0: 0, 1: 0, "bias": 0})
    theta = optimize.gradient_descent(dataset, initial_theta, learning_rate,
                                      iterations=num_iterations)
    expected_theta = {
        0: 110631.050279,
        1: -6649.474271,
        "bias": 340412.659574,
    }
    assert_that(theta, equals_series(expected_theta, places=6))
def test_id3_build_tree_weekends(self):
    """Check the tree built from weekends.data against the expected tree."""
    sunny_branch = {"parents": {"yes": "cinema", "no": "tennis"}}
    windy_branch = {
        "parents": {
            "yes": "cinema",
            "no": {"money": {"rich": "shopping", "poor": "cinema"}},
        },
    }
    rainy_branch = {"money": {"poor": "cinema", "rich": "stay in"}}
    expected_tree = {
        "weather": {
            "sunny": sunny_branch,
            "windy": windy_branch,
            "rainy": rainy_branch,
        },
    }
    weekends = load(self.relative_to_base("/datasets/weekends.data"))
    assert_that(id3.build_tree(weekends), equals_tree(expected_tree))
def test_gradient_descent_2_parameters(self):
    """
    Test based on Assignment 1 of the free online Stanford Machine
    Learning course.

    Reference figures (not asserted by this test):
      For population = 35,000, we predict a profit of 4519.767868
      For population = 70,000, we predict a profit of 45342.450129
      Final cost: 4.483388
    """
    data_path = self.relative_to_base("datasets/ex1data1.txt")
    dataset = loader.load(data_path, has_ids=False, has_header=False,
                          has_labels=True, delimiter=",")
    # Add a constant column of ones under the name "bias".
    dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))
    learning_rate = 0.01
    # Named num_iterations rather than `iter` so the builtin is not shadowed.
    num_iterations = 100
    initial_theta = pd.Series({0: 0, "bias": 0})
    theta = optimize.gradient_descent(dataset, initial_theta, learning_rate,
                                      iterations=num_iterations)
    expected_theta = {0: 0.859582, "bias": -0.576556}
    assert_that(theta, equals_series(expected_theta, places=6))
def test_classify_play_tennis(self):
    """Train on play_tennis.data and expect "No" for the given sample."""
    feature_names = ["Outlook", "Temperature", "Humidity", "Wind"]
    feature_values = ["Rain", "Cool", "High", "Strong"]
    training_path = self.relative_to_base("/datasets/play_tennis.data")
    classifier = DecisionTree(load(training_path, delimiter=" "))
    sample = pd.Series(feature_values, index=feature_names)
    prediction = classifier.classify(sample)
    self.assertEqual(prediction, "No")
def test_classify_play_tennis(self):
    """Classify one play-tennis sample and check the answer is "No"."""
    training_set = load(
        self.relative_to_base("/datasets/play_tennis.data"),
        delimiter=" ",
    )
    tree_classifier = DecisionTree(training_set)
    columns = ["Outlook", "Temperature", "Humidity", "Wind"]
    observation = pd.Series(["Rain", "Cool", "High", "Strong"], index=columns)
    self.assertEqual(tree_classifier.classify(observation), "No")
def test_data_has_value_not_in_training(self):
    """Classify a sample whose Outlook value is absent from the training set."""
    training_path = self.relative_to_base("/datasets/play_tennis.data")
    classifier = DecisionTree(load(training_path, delimiter=" "))
    # NOTE: "Snowing" is not an Outlook value seen in the training set.
    unseen = pd.Series({
        "Outlook": "Snowing",
        "Temperature": "Cool",
        "Humidity": "Normal",
        "Wind": "Strong",
    })
    prediction = classifier.classify(unseen)
    assert_that(prediction, equal_to("Yes"))
def test_classify_all_weekends(self):
    """Classify two weekend samples in one batch and check both results."""
    classifier = DecisionTree(
        load(self.relative_to_base("/datasets/weekends.data")))
    feature_names = ["weather", "parents", "money"]
    rows = [
        pd.Series(["windy", "no", "rich"], index=feature_names),
        pd.Series(["sunny", "yes", "rich"], index=feature_names),
    ]
    results = classifier.classify_all(DataSet(pd.DataFrame(rows)))
    expected = {0: "shopping", 1: "cinema"}
    assert_that(results.get_classifications(), equals_series(expected))
def test_normalize_features(self):
    """Normalized ex1data2 features must match the stored reference file."""
    # The last column is the training target; loading it as labels keeps
    # it out of the features.
    raw_path = self.relative_to_base("datasets/ex1data2.txt")
    dataset = loader.load(raw_path, has_ids=False, has_header=False,
                          has_labels=True)
    dataset.normalize_features()
    reference_path = self.relative_to_base("datasets/ex1data2norm.txt")
    reference = np.loadtxt(reference_path, delimiter=",")
    assert_that(dataset, equals_dataset(reference.tolist(), places=15))
def test_load_labelled(self):
    """Load 3f_ids_header.csv with default options and check its labels."""
    csv_path = self.relative_to_base("datasets/3f_ids_header.csv")
    loaded = load(csv_path)
    self.assertTrue(loaded.is_labelled())
    actual_labels = loaded.get_labels()
    expected_labels = {"V01": "c", "V02": "b", "V03": "b", "V04": "a"}
    assert_that(actual_labels, equals_series(expected_labels))
def load_car_data(self):
    """
    Loads the car thefts training set together with a related sample.

    Returns a ``(training_set, sample)`` tuple, where ``sample`` is a plain
    dict with "color", "type" and "origin" entries.
    """
    data_path = self.relative_to_base("datasets/car_thefts.data")
    return loader.load(data_path), {
        "color": "red",
        "type": "suv",
        "origin": "domestic",
    }
def test_classify_all_weekends(self):
    """Batch-classify two weekend samples and compare the classifications."""
    training_set = load(self.relative_to_base("/datasets/weekends.data"))
    tree_classifier = DecisionTree(training_set)
    columns = ["weather", "parents", "money"]
    first = pd.Series(["windy", "no", "rich"], index=columns)
    second = pd.Series(["sunny", "yes", "rich"], index=columns)
    batch = DataSet(pd.DataFrame([first, second]))
    outcome = tree_classifier.classify_all(batch)
    assert_that(
        outcome.get_classifications(),
        equals_series({0: "shopping", 1: "cinema"}),
    )
def test_data_has_value_not_in_training(self):
    """A sample with an unseen Outlook value must still classify as "Yes"."""
    training_set = load(
        self.relative_to_base("/datasets/play_tennis.data"),
        delimiter=" ",
    )
    tree_classifier = DecisionTree(training_set)
    # NOTE: the training set contains no sample with Outlook == "Snowing".
    observation = pd.Series({
        "Outlook": "Snowing",
        "Temperature": "Cool",
        "Humidity": "Normal",
        "Wind": "Strong",
    })
    assert_that(tree_classifier.classify(observation), equal_to("Yes"))
def test_id3_build_tree_marine_animals(self):
    """Build a tree from marine_animal.data and compare it to the expected tree."""
    expected_tree = {
        "no_surfacing": {
            "no": "no",
            "yes": {"has_flippers": {"no": "no", "yes": "yes"}},
        },
    }
    data_path = self.relative_to_base("/datasets/marine_animal.data")
    built = id3.build_tree(load(data_path))
    assert_that(built, equals_tree(expected_tree))
def test_id3_build_tree_marine_animals(self):
    """Check the tree built from marine_animal.data against the expected tree."""
    flippers_branch = {"has_flippers": {"no": "no", "yes": "yes"}}
    expected_tree = {"no_surfacing": {"no": "no", "yes": flippers_branch}}
    animals = load(self.relative_to_base("/datasets/marine_animal.data"))
    assert_that(id3.build_tree(animals), equals_tree(expected_tree))
def test_gradient_descent_3_parameters(self):
    """
    Test based on Assignment 1 of the free online Stanford Machine
    Learning course.

    Loads ex1data2.txt, normalizes the features, adds a constant "bias"
    column, runs 50 iterations of gradient descent at learning rate 1.0
    and compares the resulting theta to fixed expected values.
    """
    data_path = self.relative_to_base("datasets/ex1data2.txt")
    dataset = loader.load(data_path, has_ids=False, has_header=False,
                          has_labels=True, delimiter=",")
    dataset.normalize_features()
    # Add a constant column of ones under the name "bias".
    dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))
    learning_rate = 1.0
    # Named num_iterations rather than `iter` so the builtin is not shadowed.
    num_iterations = 50
    initial_theta = pd.Series({0: 0, 1: 0, "bias": 0})
    theta = optimize.gradient_descent(dataset, initial_theta, learning_rate,
                                      iterations=num_iterations)
    assert_that(
        theta,
        equals_series(
            {0: 110631.050279, 1: -6649.474271, "bias": 340412.659574},
            places=6,
        ),
    )
def test_id3_build_tree_play_tennis(self):
    """Build a tree from play_tennis.data and compare it to the expected tree."""
    expected_tree = {
        "Outlook": {
            "Sunny": {"Humidity": {"High": "No", "Normal": "Yes"}},
            "Overcast": "Yes",
            "Rain": {"Wind": {"Strong": "No", "Weak": "Yes"}},
        },
    }
    data_path = self.relative_to_base("/datasets/play_tennis.data")
    built = id3.build_tree(load(data_path, delimiter=" "))
    assert_that(built, equals_tree(expected_tree))
def test_id3_build_tree_play_tennis(self):
    """Check the tree built from play_tennis.data against the expected tree."""
    sunny_branch = {"Humidity": {"High": "No", "Normal": "Yes"}}
    rain_branch = {"Wind": {"Strong": "No", "Weak": "Yes"}}
    expected_tree = {
        "Outlook": {
            "Sunny": sunny_branch,
            "Overcast": "Yes",
            "Rain": rain_branch,
        },
    }
    observations = load(
        self.relative_to_base("/datasets/play_tennis.data"),
        delimiter=" ",
    )
    assert_that(id3.build_tree(observations), equals_tree(expected_tree))
def test_id3_choose_feature_to_split(self):
    """The feature chosen to split weekends.data on must be "weather"."""
    weekends = load(self.relative_to_base("/datasets/weekends.data"))
    chosen_feature = id3.choose_feature_to_split(weekends)
    self.assertEqual(chosen_feature, "weather")
def test_load_has_ids(self):
    """Load 3f_ids_header.csv with default options: 3 features, 4 samples."""
    csv_path = self.relative_to_base("datasets/3f_ids_header.csv")
    loaded = load(csv_path)
    self.assertEqual(loaded.num_features(), 3)
    self.assertEqual(loaded.num_samples(), 4)
def test_load_tsv(self):
    """Load a tab-delimited file without ids: 3 features, 4 samples."""
    tsv_path = self.relative_to_base("datasets/3f_header.tsv")
    loaded = load(tsv_path, delimiter="\t", has_ids=False)
    self.assertEqual(loaded.num_features(), 3)
    self.assertEqual(loaded.num_samples(), 4)
def test_load_csv_no_header(self):
    """Load a CSV with no header and no ids: 3 features, 4 samples."""
    csv_path = self.relative_to_base("datasets/3f_no_header.csv")
    loaded = load(csv_path, has_header=False, has_ids=False)
    self.assertEqual(loaded.num_features(), 3)
    self.assertEqual(loaded.num_samples(), 4)
def test_load_labelled(self):
    """A default load of 3f_ids_header.csv is labelled with the expected labels."""
    labelled = load(self.relative_to_base("datasets/3f_ids_header.csv"))
    self.assertTrue(labelled.is_labelled())
    found = labelled.get_labels()
    assert_that(
        found,
        equals_series({
            "V01": "c",
            "V02": "b",
            "V03": "b",
            "V04": "a",
        }),
    )
def test_classify_weekends(self):
    """Train on weekends.data and expect "shopping" for the given sample."""
    training_path = self.relative_to_base("/datasets/weekends.data")
    classifier = DecisionTree(load(training_path))
    feature_names = ["weather", "parents", "money"]
    feature_values = ["windy", "no", "rich"]
    sample = pd.Series(feature_values, index=feature_names)
    prediction = classifier.classify(sample)
    self.assertEqual(prediction, "shopping")