def test_find_neighbors_numeric_nominal_covers(self):
    """Tests that the stats for a newly covered rule are updated (dist = 0)."""
    # NOTE(review): the original had a second, stray docstring ("Tests that global
    # statistics are updated accordingly") copy-pasted from the _stats test; removed.
    df = pd.DataFrame({
        "A": ["low", "low", "high", "high", "low", "high"],
        "B": [1, 1, 1, 1, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Value counts for the nominal feature "A", plus class-conditional counts
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5)
    ]
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4],
                         5: rules[5]}
    # Pre-existing (rule_id, dist) mapping that the call under test should update
    my_vars.closest_rule_per_example = {
        0: (1, 0.010000000000000002),
        1: (0, 0.010000000000000002),
        2: (5, 0.67015625),
        3: (1, 0.038125),
        4: (0, 0.015625),
        5: (2, 0.67015625)}
    my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    k = 4
    correct = df.iloc[[2, 3, 5, 4]]
    rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
    classes = ["apple", "banana"]
    my_vars.minority_class = "banana"
    min_max = pd.DataFrame({"A": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
    # An example could be covered by multiple rules, so example 2 should be covered by
    # rules 0 and 1 at the end
    my_vars.examples_covered_by_rule = {1: {2}}
    correct_covered = {0: {2, 3}, 1: {2}}
    correct_examples_per_rule = {0: {1, 2, 3, 4, 5}, 1: {0}}
    correct_closest_rule_per_example = {
        0: (1, 0.010000000000000002),
        1: (0, 0.010000000000000002),
        2: (0, 0.0),
        3: (0, 0.0),
        4: (0, 0.015625),
        5: (0, 0.0006250000000000001)}
    neighbors, _, _ = find_nearest_examples(df, k, rule, class_col_name, lookup, min_max, classes,
                                            label_type=my_vars.SAME_LABEL_AS_RULE,
                                            only_uncovered_neighbors=False)
    self.assertTrue(neighbors.equals(correct))
    self.assertTrue(correct_covered == my_vars.examples_covered_by_rule)
    self.assertTrue(correct_examples_per_rule == my_vars.closest_examples_per_rule)
    # Distances are floats, so compare with a tolerance instead of exact equality
    for example_id, (rule_id, dist) in correct_closest_rule_per_example.items():
        self.assertTrue(example_id in my_vars.closest_rule_per_example)
        other_id, other_dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == other_id)
        self.assertTrue(abs(dist - other_dist) < 0.0001)
def test_find_neighbors_numeric_nominal(self):
    """Tests what happens if input has a numeric and a nominal feature"""
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Counts of the nominal values of "A" plus their class-conditional distribution
    lookup = {
        "A": {
            'high': 2,
            'low': 4,
            my_vars.CONDITIONAL: {
                'high': Counter({'banana': 2}),
                'low': Counter({'banana': 2, 'apple': 2})
            }
        }
    }
    k = 4
    # Expected neighbors, ordered by increasing distance from the rule
    correct = None
    if k >= 4:
        correct = df.iloc[[5, 2, 3, 4]]
    elif k == 3:
        correct = df.iloc[[5, 2, 3]]
    elif k == 2:
        correct = df.iloc[[5, 2]]
    elif k == 1:
        correct = df.iloc[[5]]
    rule = pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "Class": "banana"})
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"A": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
    # Reset as other tests changed the content of the dictionary
    my_vars.closest_rule_per_example = {}
    neighbors, _, _ = find_nearest_examples(df, k, rule, class_col_name, lookup, min_max, classes,
                                            label_type=my_vars.SAME_LABEL_AS_RULE,
                                            only_uncovered_neighbors=False)
    if neighbors is not None:
        self.assertTrue(neighbors.shape[0] == k)
        self.assertTrue(neighbors.equals(correct))
def test_find_neighbors_numeric_nominal_covered(self):
    """Tests what happens if input has a numeric and a nominal feature and some examples
    are already covered by the rule
    """
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Value counts for the nominal feature "A", plus class-conditional counts
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    k = 4
    # Reset global state that other tests may have modified
    my_vars.closest_rule_per_example = {}
    correct = None
    if k == 1:
        correct = df.iloc[[5]]
    elif k == 2:
        correct = df.iloc[[5, 2]]
    elif k == 3:
        correct = df.iloc[[5, 2, 3]]
    elif k >= 4:
        # correct = df.iloc[[5, 2, 3, 4]]
        # Examples at indices 2 and 4 are already covered by the rule, so don't return them
        # as neighbors
        my_vars.examples_covered_by_rule = {0: {2, 4}}
        correct = df.iloc[[5, 3]]
    my_vars.all_rules = {}
    rule = pd.Series({
        "A": "high",
        "B": Bounds(lower=1, upper=1),
        "Class": "banana"
    }, name=0)
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({
        "A": {"min": 1, "max": 5},
        "B": {"min": 1, "max": 11}
    })
    # only_uncovered_neighbors=True is the condition under test here
    neighbors, _, _ = find_nearest_examples(
        df, k, rule, class_col_name, lookup, min_max, classes,
        label_type=my_vars.SAME_LABEL_AS_RULE,
        only_uncovered_neighbors=True)
    self.assertTrue(neighbors.equals(correct))
def test_find_neighbors_numeric_nominal_stats(self):
    """Tests that global statistics are updated accordingly"""
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Value counts for the nominal feature "A", plus class-conditional counts
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)}
    # Reset because other tests added data, so if you only run this test it would work,
    # but not if other tests are run prior to that
    my_vars.examples_covered_by_rule = {}
    # (The original also assigned {} to closest_examples_per_rule right before this,
    # which was immediately overwritten - dead assignment removed.)
    my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5)
    ]
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4],
                         5: rules[5]}
    k = 4
    correct = df.iloc[[5, 2, 3, 4]]
    classes = ["apple", "banana"]
    my_vars.minority_class = "banana"
    min_max = pd.DataFrame({"A": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
    correct_covered = {}
    correct_examples_per_rule = {0: {1, 2, 4, 5}, 1: {0, 3}}
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=0, dist=0.09),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=0, dist=0.0006250000000000001)}
    neighbors, _, _ = find_nearest_examples(df, k, rule, class_col_name, lookup, min_max, classes,
                                            label_type=my_vars.SAME_LABEL_AS_RULE,
                                            only_uncovered_neighbors=False)
    self.assertTrue(neighbors.equals(correct))
    self.assertTrue(correct_covered == my_vars.examples_covered_by_rule)
    self.assertTrue(correct_examples_per_rule == my_vars.closest_examples_per_rule)
    # Compare rule id exactly, number of features exactly, distance with a tolerance
    for example_id, (rule_id, dist) in correct_closest_rule_per_example.items():
        features = my_vars.all_rules[rule_id].size
        self.assertTrue(example_id in my_vars.closest_rule_per_example)
        other_id, other_dist = my_vars.closest_rule_per_example[example_id]
        other_features = my_vars.all_rules[other_id].size
        self.assertTrue(rule_id == other_id)
        self.assertTrue(features == other_features)
        self.assertTrue(abs(dist - other_dist) < 0.0001)
def test_find_neighbors_numeric_nominal_label_type(self):
    """Tests what happens if input has a numeric and a nominal feature and we vary
    label_type as parameter
    """
    # NOTE: leftover debug print() statements removed from the end of this test.
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Value counts for the nominal feature "A", plus class-conditional counts
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    k = 3
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5)
    ]
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4],
                         5: rules[5]}
    # Reset global state that other tests may have modified
    my_vars.closest_rule_per_example = {}
    my_vars.closest_examples_per_rule = {}
    # Expected neighbors per label_type; note OPPOSITE can return fewer than k examples
    correct_all = df.iloc[[5, 2, 0]]
    correct_same = df.iloc[[5, 2, 3]]
    correct_opposite = df.iloc[[0, 1]]
    rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"A": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
    neighbors_all, _, _ = find_nearest_examples(df, k, rule, class_col_name, lookup, min_max,
                                                classes, label_type=my_vars.ALL_LABELS,
                                                only_uncovered_neighbors=False)
    neighbors_same, _, _ = find_nearest_examples(df, k, rule, class_col_name, lookup, min_max,
                                                 classes, label_type=my_vars.SAME_LABEL_AS_RULE,
                                                 only_uncovered_neighbors=False)
    neighbors_opposite, _, _ = find_nearest_examples(df, k, rule, class_col_name, lookup, min_max,
                                                     classes,
                                                     label_type=my_vars.OPPOSITE_LABEL_TO_RULE,
                                                     only_uncovered_neighbors=False)
    self.assertTrue(neighbors_all.equals(correct_all))
    self.assertTrue(neighbors_same.equals(correct_same))
    self.assertTrue(neighbors_opposite.equals(correct_opposite))
def test_add_all_good_rules(self):
    """Tests that rule set is updated when a generalized rule improves F1"""
    # NOTE: leftover debug print and stale commented-out expectations removed.
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"],
                       "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
    class_col_name = "Class"
    # Value counts for the nominal feature "A", plus class-conditional counts
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    # Use majority class as minority to have multiple neighbors and see if the function
    # works correctly
    my_vars.minority_class = "banana"
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2)
        # Current rule to be tested is always at the end
    ]
    test_idx = -1
    my_vars.latest_rule_id = len(rules) - 1
    my_vars.examples_covered_by_rule = {}
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[test_idx], 3: rules[2],
                         4: rules[3], 5: rules[4]}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.unique_rules = {}
    for rule in rules:
        hash_val = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)
    initial_correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)}
    initial_f1 = evaluate_f1_initialize_confusion_matrix(df, rules, class_col_name, lookup,
                                                         min_max, classes)
    correct_confusion_matrix = {my_vars.TP: {2, 5}, my_vars.FP: set(), my_vars.TN: {0, 1},
                                my_vars.FN: {3, 4}}
    correct_rules = 8
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    # Make sure confusion matrix, closest rule per example are correct at the beginning
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(
            rule_id == initial_correct_closest_rule_per_example[example_id].rule_id and
            abs(dist - initial_correct_closest_rule_per_example[example_id].dist) < 0.001)
    correct_initial_f1 = 2 * 0.5 * 1 / 1.5
    self.assertTrue(initial_f1 == correct_initial_f1)
    k = 3
    neighbors, dists, _ = find_nearest_examples(df, k, rules[test_idx], class_col_name, lookup,
                                                min_max, classes,
                                                label_type=my_vars.SAME_LABEL_AS_RULE,
                                                only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_all_good_rules(df, neighbors, rules[test_idx], rules,
                                                     initial_f1, class_col_name, lookup, min_max,
                                                     classes)
    self.assertTrue(improved is True)
    # Two new generalized rules (ids 6 and 7) were added during generalization
    correct_covered = {6: {0, 1, 2, 4, 5}, 7: {3}}
    correct_confusion_matrix = {my_vars.TP: {2, 3, 4, 5}, my_vars.FP: {0, 1},
                                my_vars.TN: set(), my_vars.FN: set()}
    correct_closest_rule_per_example = {
        0: Data(rule_id=6, dist=0.0),
        1: Data(rule_id=6, dist=0.0),
        2: Data(rule_id=6, dist=0.0),
        3: Data(rule_id=7, dist=0.0),
        4: Data(rule_id=6, dist=0.0),
        5: Data(rule_id=6, dist=0.0)
    }
    correct_f1 = 0.8
    self.assertTrue(correct_f1 == f1)
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(
            rule_id == correct_closest_rule_per_example[example_id].rule_id and
            abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    # latest_rule_id must be 7 as 2 new rules were added to the 5 initial rules
    self.assertTrue(len(updated_rules) == correct_rules and
                    my_vars.latest_rule_id == (correct_rules - 1))
    self.assertTrue(correct_covered == my_vars.examples_covered_by_rule)
def test_add_one_best_rule_unique(self):
    """Tests that the best rule found by this function is unique and correspondingly
    updates relevant statistics if that's not the case
    """
    # NOTE: leftover debug print()s (and the hash computed only to be printed) removed,
    # along with a dead `my_vars.examples_covered_by_rule = {}` that was overwritten below.
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Value counts for the nominal feature "A", plus class-conditional counts
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    test_idx = -1
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    # name=6 because this guy already exists in the rules and the new rule with name=0
    # becomes the same, so it's removed
    correct_generalized_rule = pd.Series({"A": "low", "B": (1, 1), "C": (2.0, 3),
                                          "Class": "apple"}, name=6)
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2.0, upper=3),
                   "Class": "apple"}, name=6),  # same as best rule
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0)  # Current rule is always at the end of the list
    ]
    for rule in rules:
        rule_hash = compute_hashable_key(rule)
        my_vars.unique_rules[rule_hash] = {rule.name}
    my_vars.all_rules = {0: rules[test_idx], 1: rules[0], 2: rules[1], 3: rules[2],
                         4: rules[3], 5: rules[4], 6: rules[5]}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    # Note that 6: {8} is incorrect and was just added to test if the entries are merged
    # correctly
    my_vars.examples_covered_by_rule = {6: {8}}
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=6, dist=0.0),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625),
        8: Data(rule_id=6, dist=0)  # Fake entry
    }
    my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5},
                           my_vars.FN: set()}
    initial_f1 = 0.66666
    k = 3
    neighbors, dists, _ = find_nearest_examples(df, k, rules[test_idx], class_col_name, lookup,
                                                min_max, classes,
                                                label_type=my_vars.SAME_LABEL_AS_RULE,
                                                only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_one_best_rule(df, neighbors, rules[test_idx], rules,
                                                    initial_f1, class_col_name, lookup, min_max,
                                                    classes)
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=6, dist=0.0),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=6, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625),
        8: Data(rule_id=6, dist=0)
    }
    self.assertTrue(improved is True)
    correct_f1 = 2 * 0.5 * 1 / 1.5
    self.assertTrue(abs(correct_f1 - f1) < my_vars.PRECISION)
    correct_confusion_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5},
                                my_vars.FN: set()}
    # Make sure confusion matrix, closest rule per example, and rule set were updated with
    # the updated rule too
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(
            rule_id == correct_closest_rule_per_example[example_id].rule_id and
            abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(updated_rules[5].equals(correct_generalized_rule))
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    # Duplicate rule was deleted so that the last rule now corresponds to the rule with id 6
    self.assertTrue(len(rules) - 1 == len(updated_rules) and updated_rules[-1].name == 6)
def test_add_one_best_rule_no_update(self):
    """Tests that rule set is not updated when no generalized rule improves F1"""
    # NOTE: four leftover debug print() statements removed from before the final asserts.
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Value counts for the nominal feature "A", plus class-conditional counts
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    test_idx = -1
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0)  # Current rule is always at the end of the list
    ]
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)}
    my_vars.all_rules = {0: rules[test_idx], 1: rules[1], 2: rules[2], 3: rules[3],
                         4: rules[4], 5: rules[0]}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5},
                           my_vars.FN: {3, 4}}
    my_vars.examples_covered_by_rule = {}
    # F1 is actually 0.6666, but setting it to 0.8 makes it not update any rule
    initial_f1 = 0.8
    k = 3
    my_vars.unique_rules = {}
    for rule in rules:
        rule_hash = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)
    neighbors, dists, _ = find_nearest_examples(df, k, rules[test_idx], class_col_name, lookup,
                                                min_max, classes,
                                                label_type=my_vars.SAME_LABEL_AS_RULE,
                                                only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_one_best_rule(df, neighbors, rules[test_idx], rules,
                                                    initial_f1, class_col_name, lookup, min_max,
                                                    classes)
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)}
    self.assertTrue(improved is False)
    correct_f1 = initial_f1
    self.assertTrue(abs(correct_f1 - f1) < my_vars.PRECISION)
    correct_generalized_rule = pd.Series({"A": "low", "B": (1, 1), "C": (3, 3),
                                          "Class": "apple"}, name=0)
    correct_confusion_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5},
                                my_vars.FN: {3, 4}}
    # Make sure confusion matrix, closest rule per example, and rule set were updated with
    # the updated rule too
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(
            rule_id == correct_closest_rule_per_example[example_id].rule_id and
            abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(updated_rules[test_idx].equals(correct_generalized_rule))
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
def test_add_one_best_rule_update_stats(self):
    """Tests that rule set is updated when a generalized rule improves F1 and also the
    mapping of closest rule per example changes
    """
    # NOTE: duplicated `my_vars.unique_rules = {}` statement and two leftover debug
    # print() statements removed.
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Value counts for the nominal feature "A", plus class-conditional counts
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    test_idx = -1
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0)  # Current rule is always at the end of the list
    ]
    my_vars.closest_examples_per_rule = {
        0: {4},
        1: {0, 1, 3},  # Change compared to previous test case
        2: {5},
        5: {2}
    }
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=1, dist=0.010000000000000002),  # Change compared to previous test case
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    # Reset because other tests change the data
    my_vars.examples_covered_by_rule = {}
    my_vars.all_rules = {0: rules[test_idx], 1: rules[1], 2: rules[2], 3: rules[3],
                         4: rules[4], 5: rules[5]}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.unique_rules = {}
    for rule in rules:
        rule_hash = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)
    # Actually, correctly it should've been
    # my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5},
    #                        my_vars.FN: {3, 4}}
    # at the start (i.e. F1=0.66666), but to see if it changes, it's changed
    my_vars.conf_matrix = {my_vars.TP: {0}, my_vars.FP: set(), my_vars.TN: {1, 2, 5},
                           my_vars.FN: {3, 4}}
    initial_f1 = 0.1
    k = 3
    neighbors, dists, _ = find_nearest_examples(df, k, rules[test_idx], class_col_name, lookup,
                                                min_max, classes,
                                                label_type=my_vars.SAME_LABEL_AS_RULE,
                                                only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_one_best_rule(df, neighbors, rules[test_idx], rules,
                                                    initial_f1, class_col_name, lookup, min_max,
                                                    classes)
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.0),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    correct_closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    correct_f1 = 2 * 0.5 * 1 / 1.5
    self.assertTrue(abs(correct_f1 - f1) < my_vars.PRECISION)
    self.assertTrue(improved is True)
    correct_generalized_rule = pd.Series({"A": "low", "B": (1, 1), "C": (2.0, 3),
                                          "Class": "apple"}, name=0)
    correct_confusion_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5},
                                my_vars.FN: {3, 4}}
    # Make sure confusion matrix, closest rule per example, and rule set were updated with
    # the updated rule too
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(
            rule_id == correct_closest_rule_per_example[example_id].rule_id and
            abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(updated_rules[test_idx].equals(correct_generalized_rule))
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    self.assertTrue(correct_closest_examples_per_rule == my_vars.closest_examples_per_rule)