def test_find_duplicate_rule_id(self):
    """Tests that a duplicate rule is detected properly"""
    original = pd.Series(
        {"A": "low", "B": Bounds(lower=0.5, upper=1.5), "C": Bounds(lower=0.5, upper=3.0),
         "Class": "banana"}, name=7)
    # Identical feature values and class; only the Series name (= rule ID) differs
    duplicate = pd.Series(
        {"A": "low", "B": Bounds(lower=0.5, upper=1.5), "C": Bounds(lower=0.5, upper=3.0),
         "Class": "banana"}, name=12)
    my_vars.unique_rules = {compute_hashable_key(original): {7}}
    my_vars.all_rules = {7: original}
    dupl_hash = compute_hashable_key(duplicate)
    dupl_id = find_duplicate_rule_id(duplicate, dupl_hash)
    print("duplicate ID:", dupl_id)
    # The lookup must resolve to the ID of the pre-existing rule
    self.assertTrue(dupl_id == original.name)
def test_merge_rule_statistics_of_duplicate(self):
    """Checks that the statistics are updated correctly if a duplicate rule is generated during the
    generalization step in bracid()"""
    # Two byte-identical rules that differ only in their ID (Series name)
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=1),  # Duplicate
    ]
    orig_idx = 0
    dupl_idx = 1
    # Reset global state possibly touched by other tests before registering the rules
    my_vars.unique_rules = {}
    my_vars.all_rules = {}
    for rule in rules:
        hash_val = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)
        my_vars.all_rules[rule.name] = rule
    print("hashes", my_vars.unique_rules)
    # Some random values
    my_vars.seed_example_rule = {0: {1, 5}, 10: {0}, 4: {7}}
    my_vars.seed_rule_example = {5: 0, 1: 0, 0: 10, 7: 4}
    my_vars.closest_examples_per_rule = {0: {0, 3}, 1: {4}, 4: {8}}
    my_vars.closest_rule_per_example = {0: Data(rule_id=0, dist=3), 3: Data(rule_id=0, dist=2),
                                        4: Data(rule_id=1, dist=0.13), 5: Data(rule_id=76, dist=3)}
    my_vars.examples_covered_by_rule = {0: {43, 12}, 1: {7}, 2: {3}}
    # Delete entries of the rule with ID 1 as the one with ID 0 already exists
    merge_rule_statistics_of_duplicate(rules[orig_idx], rules[dupl_idx])
    # Read: example with ID 0 is seed for the rule with ID 5....
    correct_seed_example_rule = {0: {5}, 10: {0}, 4: {7}}
    # Read: rule with ID 5 has as seed example the one with ID 0...
    correct_seed_rule_example = {5: 0, 0: 10, 7: 4}
    correct_unique_rules = {compute_hashable_key(rules[orig_idx]): {0}}
    correct_all_rules = {0: rules[orig_idx]}
    # extra_rule now also covers the 3 examples to which the 2 deleted rules were closest
    correct_closest_examples_per_rule = {0: {0, 3, 4}, 4: {8}}
    correct_closest_rule_per_example = {0: Data(rule_id=0, dist=3), 3: Data(rule_id=0, dist=2),
                                        4: Data(rule_id=0, dist=0.13), 5: Data(rule_id=76, dist=3)}
    correct_covered_by_rule = {2: {3}, 0: {43, 12, 7}}
    # All statistics referring to the duplicate (ID 1) must have been merged into ID 0
    self.assertTrue(my_vars.seed_rule_example == correct_seed_rule_example)
    self.assertTrue(my_vars.seed_example_rule == correct_seed_example_rule)
    self.assertTrue(my_vars.unique_rules == correct_unique_rules)
    self.assertTrue(my_vars.all_rules == correct_all_rules)
    self.assertTrue(my_vars.closest_examples_per_rule == correct_closest_examples_per_rule)
    self.assertTrue(my_vars.closest_rule_per_example == correct_closest_rule_per_example)
    self.assertTrue(my_vars.examples_covered_by_rule == correct_covered_by_rule)
def test_are_duplicates_length(self):
    """Tests that two rules of different lengths can never be duplicates"""
    # The second rule has no feature "A", so it is shorter than the first
    longer = pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=1, upper=1),
                        "Class": "apple"}, name=1)
    shorter = pd.Series({"B": Bounds(lower=1, upper=1), "C": Bounds(lower=1, upper=1),
                         "Class": "apple"}, name=2)
    self.assertTrue(_are_duplicates(longer, shorter) is False)
def test_is_duplicate_false(self):
    """Tests if no duplicate rule is detected"""
    existing = [
        pd.Series({"A": "high", "B": Bounds(lower=1, upper=2), "C": Bounds(lower=1, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=1, upper=1),
                   "Class": "apple"}, name=1)
    ]
    # Candidate differs from both existing rules in at least one bound
    candidate = pd.Series({"A": "high", "B": Bounds(lower=1, upper=3), "C": Bounds(lower=1, upper=3),
                           "Class": "apple"}, name=2)
    my_vars.all_rules = {0: existing[0], 1: existing[1]}
    rule_id = is_duplicate(candidate, existing_rule_ids=[0, 1])
    # -1 signals that no duplicate was found
    self.assertTrue(rule_id == -1)
def test_are_duplicates_true(self):
    """Tests that two rules are detected as duplicates if only the rule ID is different in both rules"""
    # Same features, same bounds, same class — only the Series name differs
    rule_a = pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=1, upper=1),
                        "Class": "apple"}, name=1)
    rule_b = pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=1, upper=1),
                        "Class": "apple"}, name=2)
    self.assertTrue(_are_duplicates(rule_a, rule_b) is True)
def test_are_duplicates_bounds(self):
    """Tests that no duplicate rules are detected if they are different in a lower or upper boundary value"""
    # Rules agree everywhere except in the lower bound of feature "B"
    rule_a = pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=1, upper=1),
                        "Class": "apple"}, name=1)
    rule_b = pd.Series({"A": "high", "B": Bounds(lower=0.8, upper=1), "C": Bounds(lower=1, upper=1),
                        "Class": "apple"}, name=2)
    self.assertTrue(_are_duplicates(rule_a, rule_b) is False)
def test_are_duplicates_nominal(self):
    """Tests that no duplicate rules are detected if they are different in a nominal feature"""
    # Rules differ in the nominal feature "A" (and elsewhere) — never duplicates
    rule_a = pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                        "Class": "apple"}, name=1)
    rule_b = pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                        "Class": "banana"}, name=2)
    self.assertTrue(_are_duplicates(rule_a, rule_b) is False)
def test_train(self):
    """Test with numeric and nominal features"""
    training_set = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"],
                                 "B": [1, 1, 4, 1.5, 0.5, 0.75],
                                 "C": [3, 2, 1, .5, 3, 2],
                                 "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
    # Use majority class as minority to have multiple neighbors and see if the function works correctly
    minority_label = "banana"
    class_col_name = "Class"
    rules = {
        2: pd.Series({"B": Bounds(lower=1.25, upper=4.0), "C": Bounds(lower=0.5, upper=1.5),
                      "Class": "banana"}, name=2),
        6: pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.5), "C": Bounds(lower=0.5, upper=3.0),
                      "Class": "banana"}, name=6),
        5: pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=4.0), "C": Bounds(lower=1.0, upper=2.5),
                      "Class": "banana"}, name=5),
        0: pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.5), "C": Bounds(lower=0.5, upper=3.0),
                      "Class": "apple"}, name=0),
    }
    # Expected per-rule minority/majority support values
    expected_model = {2: Support(minority=1.0, majority=0.0),
                      6: Support(minority=0.5, majority=0.5),
                      5: Support(minority=1.0, majority=0.0),
                      0: Support(minority=0.5, majority=0.5)}
    model = train_binary(rules, training_set, minority_label, class_col_name)
    self.assertTrue(model == expected_model)
def test_add_one_best_rule_unique(self):
    """Tests that the best rule found by this function is unique and correspondingly updates relevant
    statistics if that's not the case"""
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Value counts for nominal feature "A", overall and per class
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    test_idx = -1
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    # name=6 because this guy already exists in the rules and the new rule with name=0 becomes the same, so
    # it's removed
    correct_generalized_rule = pd.Series(
        {"A": "low", "B": (1, 1), "C": (2.0, 3), "Class": "apple"}, name=6)
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2.0, upper=3),
                   "Class": "apple"}, name=6),  # same as best rule
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0)  # Current rule is always at the end of the list
    ]
    # Register every rule's hash so the duplicate detection has something to collide with
    for rule in rules:
        rule_hash = compute_hashable_key(rule)
        my_vars.unique_rules[rule_hash] = {rule.name}
    correct_generalized_rule_hash = compute_hashable_key(correct_generalized_rule)
    my_vars.examples_covered_by_rule = {}
    my_vars.all_rules = {0: rules[test_idx], 1: rules[0], 2: rules[1], 3: rules[2], 4: rules[3],
                         5: rules[4], 6: rules[5]}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    # Note that 6: {8} is incorrect and was just added to test if the entries are merged correctly
    my_vars.examples_covered_by_rule = {6: {8}}
    print("rule hashes", my_vars.unique_rules)
    print(correct_generalized_rule_hash)
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=6, dist=0.0),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625),
        8: Data(rule_id=6, dist=0)  # Fake entry
    }
    my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5},
                           my_vars.FN: set()}
    initial_f1 = 0.66666
    k = 3
    neighbors, dists, _ = find_nearest_examples(
        df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
        label_type=my_vars.SAME_LABEL_AS_RULE, only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_one_best_rule(
        df, neighbors, rules[test_idx], rules, initial_f1, class_col_name, lookup, min_max, classes)
    # Example 4 now has the (generalized) rule 6 as its closest rule
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=6, dist=0.0),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=6, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625),
        8: Data(rule_id=6, dist=0)
    }
    self.assertTrue(improved is True)
    correct_f1 = 2 * 0.5 * 1 / 1.5
    self.assertTrue(abs(correct_f1 - f1) < my_vars.PRECISION)
    correct_confusion_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5},
                                my_vars.FN: set()}
    # Make sure confusion matrix, closest rule per example, and rule set were updated with the
    # updated rule too
    for example_id in my_vars.closest_rule_per_example:
        # 8 was only added to test something else, since it won't be in the result
        # if example_id != 8:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == correct_closest_rule_per_example[example_id].rule_id
                        and abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(updated_rules[5].equals(correct_generalized_rule))
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    # Duplicate rule was deleted so that the last rule now corresponds to the rule with id
    self.assertTrue(len(rules) - 1 == len(updated_rules) and updated_rules[-1].name == 6)
def test_evaluate_f1_update_confusion_matrix_not_updated(self):
    """Tests what happens if input has a numeric and a nominal feature and a rule that predicts an
    example is not updated as F1 score doesn't improve"""
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Value counts for nominal feature "A", overall and per class
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5)
    ]
    # Seed the global statistics the function reads and may update
    my_vars.examples_covered_by_rule = {}
    my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4],
                         5: rules[5]}
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5},
                           my_vars.FN: {3, 4}}
    new_rule = pd.Series({"A": "low", "B": (0.5, 0.5), "C": (3, 3), "Class": "banana"}, name=4)
    correct_f1 = 2 * 1 * 0.5 / 1.5
    f1 = evaluate_f1_update_confusion_matrix(df, new_rule, class_col_name, lookup, min_max, classes)
    # Expected: nothing changes because F1 did not improve
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    self.assertTrue(f1 == correct_f1)
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == correct_closest_rule_per_example[example_id][0]
                        and abs(dist - correct_closest_rule_per_example[example_id][1]) < 0.001)
    correct_conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5},
                           my_vars.FN: {3, 4}}
    self.assertTrue(my_vars.conf_matrix == correct_conf_matrix)
def test_add_tags_all_tags(self):
    """Add tags when using nominal and numeric features and assigning noisy, borderline and safe as
    tags"""
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    my_vars.examples_covered_by_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    # Value counts for nominal feature "A", overall and per class
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    # Input dataframe plus a TAG column holding the expected per-example labels
    correct = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        TAG: [BORDERLINE, BORDERLINE, SAFE, NOISY, NOISY, BORDERLINE]
    })
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"C": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
    k = 2
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5)
    ]
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4],
                         5: rules[5]}
    my_vars.closest_rule_per_example = {}
    my_vars.closest_examples_per_rule = {}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    # NOTE(review): values here are plain ints whereas other tests use sets of rule IDs —
    # confirm which shape add_tags() expects
    my_vars.seed_example_rule = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    # Note: examples_covered_by_rule implicitly includes the seeds of all rules
    my_vars.examples_covered_by_rule = {}
    tagged = add_tags(df, k, rules, class_col_name, lookup, min_max, classes)
    self.assertTrue(tagged.equals(correct))
def test_add_tags_nan(self):
    """Add tags when using nominal and numeric features when all examples contain at least one NaN
    value.

    Fix: use ``np.nan`` instead of the ``np.NaN`` alias — ``np.NaN`` was removed in NumPy 2.0
    and raises AttributeError there; ``np.nan`` is the canonical spelling on every NumPy version
    and is the identical float value.
    """
    df = pd.DataFrame({
        "A": [np.nan, np.nan, "high", np.nan, "low", np.nan],
        "B": [np.nan, 1, np.nan, 1.5, np.nan, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    my_vars.examples_covered_by_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    class_col_name = "Class"
    # Value counts for nominal feature "A", overall and per class
    lookup = \
        {
            "A":
                {
                    'high': 1,
                    'low': 1,
                    CONDITIONAL:
                        {
                            'high': Counter({'banana': 1}),
                            'low': Counter({'banana': 1})
                        }
                }
        }
    # Input dataframe plus a TAG column holding the expected per-example labels
    correct = pd.DataFrame({
        "A": [np.nan, np.nan, "high", np.nan, "low", np.nan],
        "B": [np.nan, 1, np.nan, 1.5, np.nan, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"],
        TAG: [BORDERLINE, NOISY, SAFE, SAFE, SAFE, SAFE]
    })
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"C": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5)
    ]
    k = 3
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4],
                         5: rules[5]}
    my_vars.closest_rule_per_example = {}
    my_vars.closest_examples_per_rule = {}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    # NOTE(review): values here are plain ints whereas other tests use sets of rule IDs —
    # confirm which shape add_tags() expects
    my_vars.seed_example_rule = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    # Note: examples_covered_by_rule implicitly includes the seeds of all rules
    my_vars.examples_covered_by_rule = {}
    tagged = add_tags(df, k, rules, class_col_name, lookup, min_max, classes)
    # Due to floating point precision, use approximate comparison
    self.assertTrue(tagged.equals(correct))
def test_find_neighbors_numeric_nominal(self):
    """Tests what happens if input has a numeric and a nominal feature"""
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"],
                       "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
    class_col_name = "Class"
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    k = 4
    # Expected neighbors per k, ordered by increasing distance to the rule
    correct = None
    if k >= 4:
        correct = df.iloc[[5, 2, 3, 4]]
    elif k in (1, 2, 3):
        correct = df.iloc[{1: [5], 2: [5, 2], 3: [5, 2, 3]}[k]]
    rule = pd.Series({"A": "high", "B": Bounds(lower=1, upper=1), "Class": "banana"})
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"A": {"min": 1, "max": 5}, "B": {"min": 1, "max": 11}})
    # Reset as other tests changed the content of the dictionary
    my_vars.closest_rule_per_example = {}
    neighbors, _, _ = find_nearest_examples(df, k, rule, class_col_name, lookup, min_max, classes,
                                            label_type=my_vars.SAME_LABEL_AS_RULE,
                                            only_uncovered_neighbors=False)
    if neighbors is not None:
        self.assertTrue(neighbors.shape[0] == k)
    self.assertTrue(neighbors.equals(correct))
def test_evaluate_f1_temporarily(self):
    """Tests that the global variables won't be updated despite local changes"""
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    # Value counts for nominal feature "A", overall and per class
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    # Reset as other tests change the data
    my_vars.examples_covered_by_rule = {}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5)
    ]
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4],
                         5: rules[5]}
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    my_vars.closest_examples_per_rule = {0: {1, 4}, 1: {0, 3}, 2: {5}, 5: {2}}
    # Snapshots of the global state taken BEFORE the call, to verify it stays untouched
    correct_closest_rules = copy.deepcopy(my_vars.closest_rule_per_example)
    correct_closest_examples = copy.deepcopy(my_vars.closest_examples_per_rule)
    my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5},
                           my_vars.FN: set()}
    new_rule = pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.0),
                          "C": Bounds(lower=3, upper=3), "Class": "banana"}, name=0)
    correct_f1 = 0.8
    f1, conf_matrix, closest_rules, closest_examples, covered, updated_example_ids = \
        evaluate_f1_temporarily(df, new_rule, new_rule.name, class_col_name, lookup, min_max,
                                classes)
    # Locally, example 4 is now covered by the new rule (dist=0.0)
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.0),
        5: Data(rule_id=2, dist=0.67015625)
    }
    correct_covered = {0: {4}}
    correct_updated_examples = [4]
    self.assertTrue(updated_example_ids == correct_updated_examples)
    self.assertTrue(f1 == correct_f1)
    # Local result is still the same as in test_evaluate_f1_update_confusion_matrix.py
    for example_id in closest_rules:
        rule_id, dist = closest_rules[example_id]
        self.assertTrue(rule_id == correct_closest_rule_per_example[example_id][0]
                        and abs(dist - correct_closest_rule_per_example[example_id][1]) < 0.001)
    self.assertTrue(closest_examples == my_vars.closest_examples_per_rule)
    correct_conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3}, my_vars.TN: {2, 4, 5},
                           my_vars.FN: set()}
    self.assertTrue(conf_matrix == correct_conf_matrix)
    # But now check that global variables remained unaffected by the changes
    correct_conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: {3, 4}, my_vars.TN: {2, 5},
                           my_vars.FN: set()}
    self.assertTrue(my_vars.conf_matrix == correct_conf_matrix)
    self.assertTrue(correct_closest_rules == my_vars.closest_rule_per_example)
    self.assertTrue(correct_closest_examples == my_vars.closest_examples_per_rule)
    self.assertTrue(correct_covered == covered)
def test_predict_covered(self):
    """Predict the class labels of covered examples"""
    test_set = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"],
                             "B": [1, 1, 4, 1.5, 0.5, 0.75],
                             "C": [3, 2, 1, .5, 3, 2],
                             "Class": ["", "", "", "", "", ""]})
    # Use majority class as minority to have multiple neighbors and see if the function works correctly
    classes = ["apple", "banana"]
    class_col_name = "Class"
    my_vars.minority_class = classes[0]
    rules = {
        2: pd.Series({"B": Bounds(lower=1.25, upper=4.0), "C": Bounds(lower=0.5, upper=1.5),
                      "Class": "banana"}, name=2),
        6: pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.5),
                      "C": Bounds(lower=0.5, upper=3.0), "Class": "banana"}, name=6),
        5: pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=4.0),
                      "C": Bounds(lower=1.0, upper=2.5), "Class": "banana"}, name=5),
        0: pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=1.5),
                      "C": Bounds(lower=0.5, upper=3.0), "Class": "apple"}, name=0),
    }
    model = {2: Support(minority=0.75, majority=0.25),
             6: Support(minority=0.2, majority=0.8),
             5: Support(minority=1.0, majority=0.0),
             0: Support(minority=0, majority=1)}
    # The last two parameters are unused in this test
    predictions = predict_binary(model, test_set, rules, classes, class_col_name, None, None,
                                 for_multiclass=False)
    expected = pd.DataFrame({
        my_vars.PREDICTED_LABEL: ["banana", "banana", "apple", "banana", "banana", "apple"],
        my_vars.PREDICTION_CONFIDENCE: [0.9, 0.9, 0.875, 0.683333, 0.9, 1]
    })
    self.assertTrue(np.array_equal(expected[my_vars.PREDICTED_LABEL].values,
                                   predictions[my_vars.PREDICTED_LABEL].values))
    # Confidences are floats, so compare approximately
    self.assertTrue(np.allclose(expected[my_vars.PREDICTION_CONFIDENCE],
                                predictions[my_vars.PREDICTION_CONFIDENCE]))
def test_evaluate_f1_initialize_confusion_matrix(self):
    """Tests what happens if input has a numeric and a nominal feature"""
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"],
                       "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
    class_col_name = "Class"
    # Value counts for nominal feature "A", overall and per class
    lookup = \
        {
            "A":
                {
                    'high': 2,
                    'low': 4,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'banana': 2}),
                            'low': Counter({'banana': 2, 'apple': 2})
                        }
                }
        }
    classes = ["apple", "banana"]
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5)
    ]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    my_vars.minority_class = "apple"
    # Reset as other tests changed the data
    my_vars.closest_rule_per_example = {}
    my_vars.closest_examples_per_rule = {}
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[2], 3: rules[3], 4: rules[4],
                         5: rules[5]}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    # Note: examples_covered_by_rule implicitly includes the seeds of all rules
    my_vars.examples_covered_by_rule = {}
    # tagged, initial_rules = add_tags_and_extract_rules(df, 2, class_col_name, lookup, min_max, classes)
    correct_f1 = 2*1*0.5/1.5
    f1 = evaluate_f1_initialize_confusion_matrix(df, rules, class_col_name, lookup, min_max,
                                                 classes)
    correct_closest_rule_per_example = {
        0: (1, 0.010000000000000002), 1: (0, 0.010000000000000002), 2: (5, 0.67015625),
        3: (1, 0.038125), 4: (0, 0.015625), 5: (2, 0.67015625)}
    correct_closest_examples_per_rule = {1: {0, 3}, 0: {1, 4}, 5: {2}, 2: {5}}
    correct_conf_matrix = {'tp': {0, 1}, 'fp': {3, 4}, 'tn': {2, 5}, 'fn': set()}
    self.assertTrue(f1 == correct_f1)
    self.assertTrue(correct_closest_examples_per_rule == my_vars.closest_examples_per_rule)
    # Distances are floats, so compare approximately
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == correct_closest_rule_per_example[example_id][0]
                        and abs(dist - correct_closest_rule_per_example[example_id][1]) < 0.001)
    self.assertTrue(my_vars.conf_matrix == correct_conf_matrix)
def test_find_neighbors_too_few(self):
    """Test that warning is thrown if too few neighbors exist"""
    dataset = pd.DataFrame({"A": [1, 2], "B": [1, 2], "C": [2, 2], "D": ["x", "y"],
                            "Class": ["A", "B"]})
    rule = pd.Series({"A": (0.1, 1), "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                      "D": "x", "Class": "A"})
    # Ask for more neighbors than the dataset can provide
    k = 3
    classes = ["apple", "banana"]
    class_col_name = "Class"
    min_max = pd.DataFrame({"A": {"min": 1, "max": 5},
                            "B": {"min": 1, "max": 11},
                            "C": {"min": 1, "max": 2}})
    lookup = \
        {
            "D":
                {
                    'x': 1,
                    'y': 1,
                    my_vars.CONDITIONAL:
                        {
                            'x': Counter({'A': 1}),
                            'y': Counter({'B': 1})
                        }
                }
        }
    self.assertWarns(UserWarning, find_nearest_examples, dataset, k, rule, class_col_name,
                     lookup, min_max, classes, label_type=my_vars.SAME_LABEL_AS_RULE,
                     only_uncovered_neighbors=False)
def test_delete_rule_statistics_collision(self):
    """Deletes a rule that shares its hash with other rules"""
    # This rule survives the deletions and should absorb the orphaned statistics
    extra_rule = pd.Series({"A": "high", "B": Bounds(lower=0.1, upper=1),
                            "C": Bounds(lower=1, upper=2), "Class": "apple"}, name=4)
    rules = [
        extra_rule,
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=1),  # Duplicate
    ]
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"],
                       "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "apple", "apple", "apple", "apple"]})
    class_col_name = "Class"
    # Value counts for nominal feature "A", overall and per class
    lookup = \
        {
            "A":
                {
                    'high': 1,
                    'low': 2,
                    my_vars.CONDITIONAL:
                        {
                            'high': Counter({'apple': 1}),
                            'low': Counter({'apple': 2})
                        }
                }
        }
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 0.1, "max": 1}, "C": {"min": 1, "max": 3}})
    my_vars.minority_class = "apple"
    # Reset global state and register all rules (duplicates share one hash bucket)
    my_vars.unique_rules = {}
    my_vars.all_rules = {}
    for rule in rules:
        hash_val = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)
        my_vars.all_rules[rule.name] = rule
    print("hashes", my_vars.unique_rules)
    # Some random values
    my_vars.seed_example_rule = {0: {1, 5}, 10: {0}, 4: {7}}
    my_vars.seed_rule_example = {5: 0, 1: 0, 0: 10, 7: 4}
    my_vars.closest_examples_per_rule = {0: {0, 3}, 1: {4}, 4: {8}}
    my_vars.closest_rule_per_example = {0: Data(rule_id=0, dist=3), 3: Data(rule_id=0, dist=2),
                                        4: Data(rule_id=1, dist=0.13), 5: Data(rule_id=76, dist=3)}
    my_vars.examples_covered_by_rule = {0: {43, 12}, 1: {7}, 2: {3}}
    final_rules = {}
    # Delete entries for rules with IDs 0 and 1 from all statistics
    rule1 = rules.pop()
    delete_rule_statistics(df, rule1, rules, final_rules, class_col_name, lookup, min_max, classes)
    rule2 = rules.pop()
    delete_rule_statistics(df, rule2, rules, final_rules, class_col_name, lookup, min_max, classes)
    correct_seed_example_rule = {4: {7}}
    correct_seed_rule_example = {5: 0, 7: 4}
    correct_unique_rules = {compute_hashable_key(extra_rule): {4}}
    correct_all_rules = {4: extra_rule}
    # extra_rule now also covers the 3 examples to which the 2 deleted rules were closest
    correct_closest_examples_per_rule = {4: {8, 0, 3, 4}}
    correct_closest_rule_per_example = {5: Data(rule_id=76, dist=3),
                                        4: Data(rule_id=4, dist=0.25),
                                        0: Data(rule_id=4, dist=0.25),
                                        3: Data(rule_id=4, dist=0.371141975308642)}
    correct_covered_by_rule = {2: {3}}
    self.assertTrue(my_vars.seed_rule_example == correct_seed_rule_example)
    self.assertTrue(my_vars.seed_example_rule == correct_seed_example_rule)
    self.assertTrue(my_vars.unique_rules == correct_unique_rules)
    self.assertTrue(my_vars.all_rules == correct_all_rules)
    self.assertTrue(my_vars.closest_examples_per_rule == correct_closest_examples_per_rule)
    self.assertTrue(my_vars.closest_rule_per_example == correct_closest_rule_per_example)
    self.assertTrue(my_vars.examples_covered_by_rule == correct_covered_by_rule)
def test_bracid_stops(self): """Tests that the method stops""" df = pd.DataFrame({ "A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75], "C": [3, 2, 1, .5, 3, 2], "Class": ["apple", "apple", "banana", "banana", "banana", "banana"] }) class_col_name = "Class" lookup = \ { "A": { 'high': 2, 'low': 4, my_vars.CONDITIONAL: { 'high': Counter({ 'banana': 2 }), 'low': Counter({ 'banana': 2, 'apple': 2 }) } } } classes = ["apple", "banana"] min_max = pd.DataFrame({ "B": { "min": 1, "max": 5 }, "C": { "min": 1, "max": 11 } }) # Use majority class as minority to have multiple neighbors and see if the function works correctly minority_label = "banana" k = 3 correct_rules = { 0: pd.Series( { "A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=3.0), "Class": "apple" }, name=0), 2: pd.Series( { "B": Bounds(lower=1.25, upper=4.0), "C": Bounds(lower=0.5, upper=1.5), "Class": "banana" }, name=2), 3: pd.Series( { "A": "low", "B": Bounds(lower=0.5, upper=1.5), "C": Bounds(lower=0.5, upper=3.0), "Class": "banana" }, name=3), 4: pd.Series( { "B": Bounds(lower=0.5, upper=0.875), "C": Bounds(lower=2.0, upper=3.0), "Class": "banana" }, name=4), 5: pd.Series( { "A": "high", "B": Bounds(lower=0.75, upper=4.0), "C": Bounds(lower=1.0, upper=2.5), "Class": "banana" }, name=5), } rules = bracid(df, k, class_col_name, lookup, min_max, classes, minority_label) all_rules_are_equal = True for r in rules: if not rules[r].equals(correct_rules[r]): all_rules_are_equal = False break self.assertTrue(all_rules_are_equal)
def test_extend_rule_mixed(self): """Test that a rule containing nominal and numeric features is extended correctly""" df = pd.DataFrame({ "A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75], "C": [3, 2, 1, .5, 3.1, 3.2], "Class": ["apple", "apple", "banana", "banana", "banana", "banana"] }) class_col_name = "Class" lookup = \ { "A": { 'high': 2, 'low': 4, my_vars.CONDITIONAL: { 'high': Counter({ 'banana': 2 }), 'low': Counter({ 'banana': 2, 'apple': 2 }) } } } classes = ["apple", "banana"] min_max = pd.DataFrame({ "B": { "min": 1, "max": 5 }, "C": { "min": 1, "max": 11 } }) my_vars.minority_class = "apple" rules = [ pd.Series( { "A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple" }, name=0), pd.Series( { "A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple" }, name=1), pd.Series( { "A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1), "Class": "banana" }, name=2), pd.Series( { "A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5), "Class": "banana" }, name=3), pd.Series( { "A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3), "Class": "banana" }, name=4), pd.Series( { "A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2), "Class": "banana" }, name=5) ] k = 3 # Reset from previous test to make sure they don't affect the outcomes of this test my_vars.closest_examples_per_rule = {} my_vars.closest_rule_per_example = {} my_vars.examples_covered_by_rule = {} extended_rule = extend_rule(df, k, rules[0], class_col_name, lookup, min_max, classes) correct_rule = pd.Series( { "A": "low", "B": (0.875, 1.25), "C": (1.75, 3.05), "Class": "apple" }, name=0) print(extended_rule) self.assertTrue(extended_rule.equals(correct_rule))
def test_add_one_best_rule_no_update(self):
    """Tests that rule set is not updated when no generalized rule improves F1.

    The initial F1 is deliberately set above the real value (0.666...) so
    that no generalization can beat it; the rule set, confusion matrix and
    closest-rule mapping must all remain unchanged.
    """
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    lookup = \
        {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({
                        'banana': 2
                    }),
                    'low': Counter({
                        'banana': 2,
                        'apple': 2
                    })
                }
            }
        }
    test_idx = -1
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({
        "B": {
            "min": 1,
            "max": 5
        },
        "C": {
            "min": 1,
            "max": 11
        }
    })
    my_vars.minority_class = "apple"
    rules = [
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1, upper=1),
                "C": Bounds(lower=2, upper=2),
                "Class": "apple"
            }, name=1),
        pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=4, upper=4),
                "C": Bounds(lower=1, upper=1),
                "Class": "banana"
            }, name=2),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1.5, upper=1.5),
                "C": Bounds(lower=0.5, upper=0.5),
                "Class": "banana"
            }, name=3),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=0.5, upper=0.5),
                "C": Bounds(lower=3, upper=3),
                "Class": "banana"
            }, name=4),
        pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=0.75, upper=0.75),
                "C": Bounds(lower=2, upper=2),
                "Class": "banana"
            }, name=5),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1, upper=1),
                "C": Bounds(lower=3, upper=3),
                "Class": "apple"
            }, name=0)  # Current rule is always at the end of the list
    ]
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    # FIX: all_rules must map every rule ID to the rule whose name equals
    # that ID (as in test_add_all_good_rules); the previous fixture paired
    # IDs 1-5 with the wrong list entries (off-by-one).
    my_vars.all_rules = {
        0: rules[test_idx],
        1: rules[0],
        2: rules[1],
        3: rules[2],
        4: rules[3],
        5: rules[4]
    }
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    my_vars.seed_example_rule = {
        0: {0},
        1: {1},
        2: {2},
        3: {3},
        4: {4},
        5: {5}
    }
    my_vars.conf_matrix = {
        my_vars.TP: {0, 1},
        my_vars.FP: set(),
        my_vars.TN: {2, 5},
        my_vars.FN: {3, 4}
    }
    my_vars.examples_covered_by_rule = {}
    # F1 is actually 0.6666, but setting it to 0.8 makes it not update any rule
    initial_f1 = 0.8
    k = 3
    my_vars.unique_rules = {}
    for rule in rules:
        rule_hash = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)
    neighbors, dists, _ = find_nearest_examples(
        df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
        label_type=my_vars.SAME_LABEL_AS_RULE, only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_one_best_rule(
        df, neighbors, rules[test_idx], rules, initial_f1, class_col_name,
        lookup, min_max, classes)
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    self.assertTrue(improved is False)
    correct_f1 = initial_f1
    self.assertTrue(abs(correct_f1 - f1) < my_vars.PRECISION)
    # Unchanged rule 0 — tuples instead of Bounds because the rule passed
    # through the generalization code path.
    correct_generalized_rule = pd.Series(
        {
            "A": "low",
            "B": (1, 1),
            "C": (3, 3),
            "Class": "apple"
        }, name=0)
    correct_confusion_matrix = {
        my_vars.TP: {0, 1},
        my_vars.FP: set(),
        my_vars.TN: {2, 5},
        my_vars.FN: {3, 4}
    }
    # Make sure confusion matrix, closest rule per example, and rule set were updated with the updated rule too
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(
            rule_id == correct_closest_rule_per_example[example_id].rule_id
            and abs(dist - correct_closest_rule_per_example[example_id].dist)
            < 0.001)
    print(rules[test_idx])
    print(correct_generalized_rule)
    print("updated")
    print(updated_rules)
    self.assertTrue(
        updated_rules[test_idx].equals(correct_generalized_rule))
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
def test_extend_rule_no_change(self): """Test that a rule containing nominal and numeric features isn't extended due to no neighbors""" df = pd.DataFrame({ "A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 1, 1, 1, 1], "C": [3, 2, 3, 3, 3, 3], "Class": ["apple", "apple", "banana", "banana", "banana", "banana"] }) class_col_name = "Class" lookup = \ { "A": { 'high': 2, 'low': 4, my_vars.CONDITIONAL: { 'high': Counter({ 'banana': 2 }), 'low': Counter({ 'banana': 2, 'apple': 2 }) } } } classes = ["apple", "banana"] min_max = pd.DataFrame({ "B": { "min": 1, "max": 5 }, "C": { "min": 1, "max": 11 } }) my_vars.minority_class = "apple" rules = [ pd.Series( { "A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3), "Class": "apple" }, name=0), pd.Series( { "A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2), "Class": "apple" }, name=1), pd.Series( { "A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1), "Class": "banana" }, name=2), pd.Series( { "A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5), "Class": "banana" }, name=3), pd.Series( { "A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3), "Class": "banana" }, name=4), pd.Series( { "A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2), "Class": "banana" }, name=5) ] my_vars.closest_examples_per_rule = {} my_vars.closest_rule_per_example = {} k = 3 extended_rule = extend_rule(df, k, rules[0], class_col_name, lookup, min_max, classes) correct_rule = pd.Series( { "A": "low", "B": (1, 1), "C": (3, 3), "Class": "apple" }, name=0) self.assertTrue(extended_rule.equals(correct_rule))
def test_find_neighbors_numeric_nominal_covers(self):
    """Tests that the stats for a newly covered rule are updated (dist = 0)
    and that global statistics are updated accordingly.
    """
    df = pd.DataFrame({
        "A": ["low", "low", "high", "high", "low", "high"],
        "B": [1, 1, 1, 1, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    lookup = \
        {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({
                        'banana': 2
                    }),
                    'low': Counter({
                        'banana': 2,
                        'apple': 2
                    })
                }
            }
        }
    rules = [
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1, upper=1),
                "C": Bounds(lower=3, upper=3),
                "Class": "apple"
            }, name=0),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1, upper=1),
                "C": Bounds(lower=2, upper=2),
                "Class": "apple"
            }, name=1),
        pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=4, upper=4),
                "C": Bounds(lower=1, upper=1),
                "Class": "banana"
            }, name=2),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1.5, upper=1.5),
                "C": Bounds(lower=0.5, upper=0.5),
                "Class": "banana"
            }, name=3),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=0.5, upper=0.5),
                "C": Bounds(lower=3, upper=3),
                "Class": "banana"
            }, name=4),
        pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=0.75, upper=0.75),
                "C": Bounds(lower=2, upper=2),
                "Class": "banana"
            }, name=5)
    ]
    my_vars.all_rules = {
        0: rules[0],
        1: rules[1],
        2: rules[2],
        3: rules[3],
        4: rules[4],
        5: rules[5]
    }
    # Plain (rule_id, dist) tuples — structurally compatible with Data
    my_vars.closest_rule_per_example = {
        0: (1, 0.010000000000000002),
        1: (0, 0.010000000000000002),
        2: (5, 0.67015625),
        3: (1, 0.038125),
        4: (0, 0.015625),
        5: (2, 0.67015625)
    }
    my_vars.closest_examples_per_rule = {
        0: {1, 4},
        1: {0, 3},
        2: {5},
        5: {2}
    }
    k = 4
    correct = df.iloc[[2, 3, 5, 4]]
    rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
    classes = ["apple", "banana"]
    my_vars.minority_class = "banana"
    min_max = pd.DataFrame({
        "A": {
            "min": 1,
            "max": 5
        },
        "B": {
            "min": 1,
            "max": 11
        }
    })
    # An example could be covered by multiple rules, so example 2 should be covered by rules 0 and 1
    # at the end
    my_vars.examples_covered_by_rule = {1: {2}}
    correct_covered = {0: {2, 3}, 1: {2}}
    correct_examples_per_rule = {0: {1, 2, 3, 4, 5}, 1: {0}}
    # Examples 2, 3 and 5 are now covered by / closest to the query rule
    # (ID 0); covered examples get dist = 0.0
    correct_closest_rule_per_example = {
        0: (1, 0.010000000000000002),
        1: (0, 0.010000000000000002),
        2: (0, 0.0),
        3: (0, 0.0),
        4: (0, 0.015625),
        5: (0, 0.0006250000000000001)
    }
    neighbors, _, _ = find_nearest_examples(
        df, k, rule, class_col_name, lookup, min_max, classes,
        label_type=my_vars.SAME_LABEL_AS_RULE,
        only_uncovered_neighbors=False)
    self.assertTrue(neighbors.equals(correct))
    self.assertTrue(correct_covered == my_vars.examples_covered_by_rule)
    self.assertTrue(
        correct_examples_per_rule == my_vars.closest_examples_per_rule)
    for example_id, (rule_id, dist) in correct_closest_rule_per_example.items():
        self.assertTrue(example_id in my_vars.closest_rule_per_example)
        other_id, other_dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == other_id)
        self.assertTrue(abs(dist - other_dist) < 0.0001)
def test_find_neighbors_numeric_nominal_label_type(self):
    """Tests what happens if input has a numeric and a nominal feature and we vary label_type as parameter.

    The same query rule is run with ALL_LABELS, SAME_LABEL_AS_RULE and
    OPPOSITE_LABEL_TO_RULE, which must return different neighbor sets.
    """
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    lookup = \
        {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({
                        'banana': 2
                    }),
                    'low': Counter({
                        'banana': 2,
                        'apple': 2
                    })
                }
            }
        }
    k = 3
    rules = [
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1, upper=1),
                "C": Bounds(lower=3, upper=3),
                "Class": "apple"
            }, name=0),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1, upper=1),
                "C": Bounds(lower=2, upper=2),
                "Class": "apple"
            }, name=1),
        pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=4, upper=4),
                "C": Bounds(lower=1, upper=1),
                "Class": "banana"
            }, name=2),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1.5, upper=1.5),
                "C": Bounds(lower=0.5, upper=0.5),
                "Class": "banana"
            }, name=3),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=0.5, upper=0.5),
                "C": Bounds(lower=3, upper=3),
                "Class": "banana"
            }, name=4),
        pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=0.75, upper=0.75),
                "C": Bounds(lower=2, upper=2),
                "Class": "banana"
            }, name=5)
    ]
    my_vars.all_rules = {
        0: rules[0],
        1: rules[1],
        2: rules[2],
        3: rules[3],
        4: rules[4],
        5: rules[5]
    }
    # Reset shared global state so earlier tests can't leak into this one
    my_vars.closest_rule_per_example = {}
    my_vars.closest_examples_per_rule = {}
    correct_all = df.iloc[[5, 2, 0]]
    correct_same = df.iloc[[5, 2, 3]]
    # Only 2 examples have the opposite label, hence fewer than k neighbors
    correct_opposite = df.iloc[[0, 1]]
    rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({
        "A": {
            "min": 1,
            "max": 5
        },
        "B": {
            "min": 1,
            "max": 11
        }
    })
    neighbors_all, _, _ = find_nearest_examples(
        df, k, rule, class_col_name, lookup, min_max, classes,
        label_type=my_vars.ALL_LABELS, only_uncovered_neighbors=False)
    neighbors_same, _, _ = find_nearest_examples(
        df, k, rule, class_col_name, lookup, min_max, classes,
        label_type=my_vars.SAME_LABEL_AS_RULE,
        only_uncovered_neighbors=False)
    neighbors_opposite, _, _ = find_nearest_examples(
        df, k, rule, class_col_name, lookup, min_max, classes,
        label_type=my_vars.OPPOSITE_LABEL_TO_RULE,
        only_uncovered_neighbors=False)
    print(neighbors_all)
    print(neighbors_same)
    print(neighbors_opposite)
    self.assertTrue(neighbors_all.equals(correct_all))
    self.assertTrue(neighbors_same.equals(correct_same))
    self.assertTrue(neighbors_opposite.equals(correct_opposite))
def test_add_all_good_rules(self):
    """Tests that rule set is updated when a generalized rule improves F1.

    add_all_good_rules() is expected to add two new generalized rules
    (IDs 6 and 7), growing the rule set from 6 to 8 and bumping
    my_vars.latest_rule_id to 7.
    """
    df = pd.DataFrame({"A": ["low", "low", "high", "low", "low", "high"],
                       "B": [1, 1, 4, 1.5, 0.5, 0.75],
                       "C": [3, 2, 1, .5, 3, 2],
                       "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]})
    class_col_name = "Class"
    lookup = \
        {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({
                        'banana': 2
                    }),
                    'low': Counter({
                        'banana': 2,
                        'apple': 2
                    })
                }
            }
        }
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({"B": {"min": 1, "max": 5}, "C": {"min": 1, "max": 11}})
    # Use majority class as minority to have multiple neighbors and see if the function works correctly
    my_vars.minority_class = "banana"
    rules = [
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=3, upper=3),
                   "Class": "apple"}, name=0),
        pd.Series({"A": "low", "B": Bounds(lower=1, upper=1), "C": Bounds(lower=2, upper=2),
                   "Class": "apple"}, name=1),
        pd.Series({"A": "low", "B": Bounds(lower=1.5, upper=1.5), "C": Bounds(lower=0.5, upper=0.5),
                   "Class": "banana"}, name=3),
        pd.Series({"A": "low", "B": Bounds(lower=0.5, upper=0.5), "C": Bounds(lower=3, upper=3),
                   "Class": "banana"}, name=4),
        pd.Series({"A": "high", "B": Bounds(lower=0.75, upper=0.75), "C": Bounds(lower=2, upper=2),
                   "Class": "banana"}, name=5),
        pd.Series({"A": "high", "B": Bounds(lower=4, upper=4), "C": Bounds(lower=1, upper=1),
                   "Class": "banana"}, name=2)  # Current rule to be tested is always at the end
    ]
    test_idx = -1
    my_vars.latest_rule_id = len(rules) - 1
    my_vars.examples_covered_by_rule = {}
    # Keys match the rule names; the rule under test (name=2) sits at the
    # end of the list
    my_vars.all_rules = {0: rules[0], 1: rules[1], 2: rules[test_idx], 3: rules[2], 4: rules[3],
                         5: rules[4]}
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    my_vars.seed_example_rule = {0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    my_vars.unique_rules = {}
    for rule in rules:
        hash_val = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(hash_val, set()).add(rule.name)
    initial_correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)}
    initial_f1 = evaluate_f1_initialize_confusion_matrix(df, rules, class_col_name, lookup,
                                                         min_max, classes)
    correct_confusion_matrix = {my_vars.TP: {2, 5}, my_vars.FP: set(), my_vars.TN: {0, 1},
                                my_vars.FN: {3, 4}}
    # 6 initial rules + 2 newly added generalized rules
    correct_rules = 8
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    # Make sure confusion matrix, closest rule per example are correct at the beginning
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == initial_correct_closest_rule_per_example[example_id].rule_id
                        and abs(dist - initial_correct_closest_rule_per_example[example_id].dist)
                        < 0.001)
    correct_initial_f1 = 2 * 0.5 * 1 / 1.5
    self.assertTrue(initial_f1 == correct_initial_f1)
    k = 3
    neighbors, dists, _ = find_nearest_examples(df, k, rules[test_idx], class_col_name, lookup,
                                                min_max, classes,
                                                label_type=my_vars.SAME_LABEL_AS_RULE,
                                                only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_all_good_rules(df, neighbors, rules[test_idx], rules,
                                                     initial_f1, class_col_name, lookup, min_max,
                                                     classes)
    self.assertTrue(improved is True)
    print("f1", f1)
    # The 2 new rules (IDs 6 and 7) now cover all examples between them
    correct_covered = {6: {0, 1, 2, 4, 5}, 7: {3}}
    correct_confusion_matrix = {my_vars.TP: {2, 3, 4, 5}, my_vars.FP: {0, 1}, my_vars.TN: set(),
                                my_vars.FN: set()}
    correct_closest_rule_per_example = {
        0: Data(rule_id=6, dist=0.0),
        1: Data(rule_id=6, dist=0.0),
        2: Data(rule_id=6, dist=0.0),
        3: Data(rule_id=7, dist=0.0),
        4: Data(rule_id=6, dist=0.0),
        5: Data(rule_id=6, dist=0.0)
    }
    correct_f1 = 0.8
    self.assertTrue(correct_f1 == f1)
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(rule_id == correct_closest_rule_per_example[example_id].rule_id
                        and abs(dist - correct_closest_rule_per_example[example_id].dist) < 0.001)
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    # latest_rule_id must be 7 as 2 new rules were added to the 5 initial rules
    self.assertTrue(len(updated_rules) == correct_rules
                    and my_vars.latest_rule_id == (correct_rules - 1))
    self.assertTrue(correct_covered == my_vars.examples_covered_by_rule)
def test_find_neighbors_numeric_nominal_covered(self): """Tests what happens if input has a numeric and a nominal feature and some examples are already covered by the rule""" df = pd.DataFrame({ "A": ["low", "low", "high", "low", "low", "high"], "B": [1, 1, 4, 1.5, 0.5, 0.75], "C": [3, 2, 1, .5, 3, 2], "Class": ["apple", "apple", "banana", "banana", "banana", "banana"] }) class_col_name = "Class" lookup = \ { "A": { 'high': 2, 'low': 4, my_vars.CONDITIONAL: { 'high': Counter({ 'banana': 2 }), 'low': Counter({ 'banana': 2, 'apple': 2 }) } } } k = 4 my_vars.closest_rule_per_example = {} correct = None if k == 1: correct = df.iloc[[5]] elif k == 2: correct = df.iloc[[5, 2]] elif k == 3: correct = df.iloc[[5, 2, 3]] elif k >= 4: # correct = df.iloc[[5, 2, 3, 4]] # Examples at indices 2 and 4 are already covered by the rule, so don't return them as neighbors my_vars.examples_covered_by_rule = {0: {2, 4}} correct = df.iloc[[5, 3]] my_vars.all_rules = {} rule = pd.Series( { "A": "high", "B": Bounds(lower=1, upper=1), "Class": "banana" }, name=0) classes = ["apple", "banana"] min_max = pd.DataFrame({ "A": { "min": 1, "max": 5 }, "B": { "min": 1, "max": 11 } }) neighbors, _, _ = find_nearest_examples( df, k, rule, class_col_name, lookup, min_max, classes, label_type=my_vars.SAME_LABEL_AS_RULE, only_uncovered_neighbors=True) self.assertTrue(neighbors.equals(correct))
def test_find_neighbors_numeric_nominal_stats(self):
    """Tests that global statistics are updated accordingly.

    find_nearest_examples() with only_uncovered_neighbors=False must rewire
    closest_examples_per_rule / closest_rule_per_example towards the query
    rule (ID 0) for any example it is now closer to.
    """
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    lookup = \
        {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({
                        'banana': 2
                    }),
                    'low': Counter({
                        'banana': 2,
                        'apple': 2
                    })
                }
            }
        }
    rule = pd.Series({"A": "high", "B": (1, 1), "Class": "banana"}, name=0)
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    # Reset because other tests added data, so if you only run this test it would work, but not if other
    # tests are run prior to that
    my_vars.examples_covered_by_rule = {}
    # FIX: removed the redundant `my_vars.closest_examples_per_rule = {}`
    # that was immediately overwritten by this literal (dead store).
    my_vars.closest_examples_per_rule = {
        0: {1, 4},
        1: {0, 3},
        2: {5},
        5: {2}
    }
    rules = [
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1, upper=1),
                "C": Bounds(lower=3, upper=3),
                "Class": "apple"
            }, name=0),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1, upper=1),
                "C": Bounds(lower=2, upper=2),
                "Class": "apple"
            }, name=1),
        pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=4, upper=4),
                "C": Bounds(lower=1, upper=1),
                "Class": "banana"
            }, name=2),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1.5, upper=1.5),
                "C": Bounds(lower=0.5, upper=0.5),
                "Class": "banana"
            }, name=3),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=0.5, upper=0.5),
                "C": Bounds(lower=3, upper=3),
                "Class": "banana"
            }, name=4),
        pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=0.75, upper=0.75),
                "C": Bounds(lower=2, upper=2),
                "Class": "banana"
            }, name=5)
    ]
    my_vars.all_rules = {
        0: rules[0],
        1: rules[1],
        2: rules[2],
        3: rules[3],
        4: rules[4],
        5: rules[5]
    }
    k = 4
    correct = df.iloc[[5, 2, 3, 4]]
    classes = ["apple", "banana"]
    my_vars.minority_class = "banana"
    min_max = pd.DataFrame({
        "A": {
            "min": 1,
            "max": 5
        },
        "B": {
            "min": 1,
            "max": 11
        }
    })
    correct_covered = {}
    correct_examples_per_rule = {0: {1, 2, 4, 5}, 1: {0, 3}}
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.010000000000000002),
        2: Data(rule_id=0, dist=0.09),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=0, dist=0.0006250000000000001)
    }
    neighbors, _, _ = find_nearest_examples(
        df, k, rule, class_col_name, lookup, min_max, classes,
        label_type=my_vars.SAME_LABEL_AS_RULE,
        only_uncovered_neighbors=False)
    self.assertTrue(neighbors.equals(correct))
    self.assertTrue(correct_covered == my_vars.examples_covered_by_rule)
    self.assertTrue(
        correct_examples_per_rule == my_vars.closest_examples_per_rule)
    # Besides ID and distance, also compare the number of features of the
    # referenced rules as a sanity check
    for example_id, (rule_id, dist) in correct_closest_rule_per_example.items():
        features = my_vars.all_rules[rule_id].size
        self.assertTrue(example_id in my_vars.closest_rule_per_example)
        other_id, other_dist = my_vars.closest_rule_per_example[example_id]
        other_features = my_vars.all_rules[other_id].size
        self.assertTrue(rule_id == other_id)
        self.assertTrue(features == other_features)
        self.assertTrue(abs(dist - other_dist) < 0.0001)
def test_predict_uncovered(self):
    """Predict the class labels of uncovered examples with handling ties
    (2 rules are equally distant) for example 4, namely rules 0 and 6.
    """
    # Assumptions: these are the data for the training set NOT for the test set
    lookup = \
        {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({
                        'banana': 2
                    }),
                    'low': Counter({
                        'banana': 2,
                        'apple': 2
                    })
                }
            }
        }
    min_max = pd.DataFrame({
        "B": {
            "min": 1,
            "max": 5
        },
        "C": {
            "min": 1,
            "max": 11
        }
    })
    classes = ["apple", "banana"]
    # Labels are empty strings: the test set is unlabeled and gets predicted
    test_set = pd.DataFrame({
        "A": ["low", "high", "high", "low", "low", "high"],
        "B": [4.1, 6.1, 5.4, 0.15, 0.05, 0.075],
        "C": [0.3, 4, 0.1, .4, 0.3, 5],
        "Class": ["", "", "", "", "", ""]
    })
    # Use majority class as minority to have multiple neighbors and see if the function works correctly
    class_col_name = "Class"
    my_vars.minority_class = classes[0]
    my_vars.examples_covered_by_rule = {}
    my_vars.closest_examples_per_rule = {}
    my_vars.closest_rule_per_example = {}
    rules = {
        2: pd.Series(
            {
                "B": Bounds(lower=1.25, upper=4.0),
                "C": Bounds(lower=0.5, upper=1.5),
                "Class": "banana"
            }, name=2),
        6: pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=0.5, upper=1.5),
                "C": Bounds(lower=0.5, upper=3.0),
                "Class": "banana"
            }, name=6),
        5: pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=0.75, upper=4.0),
                "C": Bounds(lower=1.0, upper=2.5),
                "Class": "banana"
            }, name=5),
        0: pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=0.5, upper=1.5),
                "C": Bounds(lower=0.5, upper=3.0),
                "Class": "apple"
            }, name=0),
    }
    my_vars.all_rules = rules
    # Per-rule minority/majority support used to derive label + confidence
    model = {
        2: Support(minority=0.75, majority=0.25),
        6: Support(minority=0.2, majority=0.8),
        5: Support(minority=1.0, majority=0.0),
        0: Support(minority=0, majority=1)
    }
    correct_covered = {}
    correct_examples_per_rule = {}
    correct_rule_per_example = {}
    df = predict_binary(model, test_set, rules, classes, class_col_name,
                        lookup, min_max, for_multiclass=False)
    correct = pd.DataFrame({
        my_vars.PREDICTED_LABEL:
            ["apple", "apple", "apple", "banana", "banana", "apple"],
        my_vars.PREDICTION_CONFIDENCE: [0.75, 1, 0.75, 0.9, 0.9, 1]
    })
    # Test that predictions didn't change internal statistics of the model
    self.assertTrue(correct_covered == my_vars.examples_covered_by_rule)
    self.assertTrue(
        correct_examples_per_rule == my_vars.closest_examples_per_rule)
    self.assertTrue(
        correct_rule_per_example == my_vars.closest_rule_per_example)
    self.assertTrue(
        np.array_equal(correct[my_vars.PREDICTED_LABEL].values,
                       df[my_vars.PREDICTED_LABEL].values))
    self.assertTrue(
        np.allclose(correct[my_vars.PREDICTION_CONFIDENCE],
                    df[my_vars.PREDICTION_CONFIDENCE]))
def test_add_one_best_rule_update_stats(self):
    """Tests that rule set is updated when a generalized rule improves F1 and
    also the mapping of closest rule per example changes.
    """
    df = pd.DataFrame({
        "A": ["low", "low", "high", "low", "low", "high"],
        "B": [1, 1, 4, 1.5, 0.5, 0.75],
        "C": [3, 2, 1, .5, 3, 2],
        "Class": ["apple", "apple", "banana", "banana", "banana", "banana"]
    })
    class_col_name = "Class"
    lookup = \
        {
            "A": {
                'high': 2,
                'low': 4,
                my_vars.CONDITIONAL: {
                    'high': Counter({
                        'banana': 2
                    }),
                    'low': Counter({
                        'banana': 2,
                        'apple': 2
                    })
                }
            }
        }
    test_idx = -1
    classes = ["apple", "banana"]
    min_max = pd.DataFrame({
        "B": {
            "min": 1,
            "max": 5
        },
        "C": {
            "min": 1,
            "max": 11
        }
    })
    my_vars.minority_class = "apple"
    rules = [
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1, upper=1),
                "C": Bounds(lower=2, upper=2),
                "Class": "apple"
            }, name=1),
        pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=4, upper=4),
                "C": Bounds(lower=1, upper=1),
                "Class": "banana"
            }, name=2),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1.5, upper=1.5),
                "C": Bounds(lower=0.5, upper=0.5),
                "Class": "banana"
            }, name=3),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=0.5, upper=0.5),
                "C": Bounds(lower=3, upper=3),
                "Class": "banana"
            }, name=4),
        pd.Series(
            {
                "A": "high",
                "B": Bounds(lower=0.75, upper=0.75),
                "C": Bounds(lower=2, upper=2),
                "Class": "banana"
            }, name=5),
        pd.Series(
            {
                "A": "low",
                "B": Bounds(lower=1, upper=1),
                "C": Bounds(lower=3, upper=3),
                "Class": "apple"
            }, name=0)  # Current rule is always at the end of the list
    ]
    my_vars.closest_examples_per_rule = {
        0: {4},
        1: {0, 1, 3},  # Change compared to previous test case
        2: {5},
        5: {2}
    }
    my_vars.closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=1, dist=0.010000000000000002),  # Change compared to previous test case
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    # Reset because other tests change the data
    my_vars.examples_covered_by_rule = {}
    # FIX: all_rules must map every rule ID to the rule whose name equals
    # that ID (as in test_add_all_good_rules); the previous fixture paired
    # IDs 1-5 with the wrong list entries (off-by-one).
    my_vars.all_rules = {
        0: rules[test_idx],
        1: rules[0],
        2: rules[1],
        3: rules[2],
        4: rules[3],
        5: rules[4]
    }
    my_vars.seed_rule_example = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 8}
    my_vars.seed_example_rule = {
        0: {0},
        1: {1},
        2: {2},
        3: {3},
        4: {4},
        5: {5}
    }
    # FIX: removed the duplicated `my_vars.unique_rules = {}` (dead store).
    my_vars.unique_rules = {}
    for rule in rules:
        rule_hash = compute_hashable_key(rule)
        my_vars.unique_rules.setdefault(rule_hash, set()).add(rule.name)
    # Actually, correctly it should've been
    # my_vars.conf_matrix = {my_vars.TP: {0, 1}, my_vars.FP: set(), my_vars.TN: {2, 5}, my_vars.FN: {3, 4}}
    # at the start (i.e. F1=0.66666), but to see if it changes, it's changed
    my_vars.conf_matrix = {
        my_vars.TP: {0},
        my_vars.FP: set(),
        my_vars.TN: {1, 2, 5},
        my_vars.FN: {3, 4}
    }
    initial_f1 = 0.1
    k = 3
    neighbors, dists, _ = find_nearest_examples(
        df, k, rules[test_idx], class_col_name, lookup, min_max, classes,
        label_type=my_vars.SAME_LABEL_AS_RULE, only_uncovered_neighbors=True)
    improved, updated_rules, f1 = add_one_best_rule(
        df, neighbors, rules[test_idx], rules, initial_f1, class_col_name,
        lookup, min_max, classes)
    correct_closest_rule_per_example = {
        0: Data(rule_id=1, dist=0.010000000000000002),
        1: Data(rule_id=0, dist=0.0),
        2: Data(rule_id=5, dist=0.67015625),
        3: Data(rule_id=1, dist=0.038125),
        4: Data(rule_id=0, dist=0.015625),
        5: Data(rule_id=2, dist=0.67015625)
    }
    correct_closest_examples_per_rule = {
        0: {1, 4},
        1: {0, 3},
        2: {5},
        5: {2}
    }
    correct_f1 = 2 * 0.5 * 1 / 1.5
    self.assertTrue(abs(correct_f1 - f1) < my_vars.PRECISION)
    self.assertTrue(improved is True)
    # Rule 0's "C" bound was generalized from (3, 3) to (2.0, 3)
    correct_generalized_rule = pd.Series(
        {
            "A": "low",
            "B": (1, 1),
            "C": (2.0, 3),
            "Class": "apple"
        }, name=0)
    correct_confusion_matrix = {
        my_vars.TP: {0, 1},
        my_vars.FP: set(),
        my_vars.TN: {2, 5},
        my_vars.FN: {3, 4}
    }
    # Make sure confusion matrix, closest rule per example, and rule set were updated with the
    # updated rule too
    for example_id in my_vars.closest_rule_per_example:
        rule_id, dist = my_vars.closest_rule_per_example[example_id]
        self.assertTrue(
            rule_id == correct_closest_rule_per_example[example_id].rule_id
            and abs(dist - correct_closest_rule_per_example[example_id].dist)
            < 0.001)
    self.assertTrue(
        updated_rules[test_idx].equals(correct_generalized_rule))
    self.assertTrue(my_vars.conf_matrix == correct_confusion_matrix)
    print(correct_closest_examples_per_rule)
    print(my_vars.closest_examples_per_rule)
    self.assertTrue(correct_closest_examples_per_rule ==
                    my_vars.closest_examples_per_rule)