def test_basic_rule_tree():
    """Exercise RuleTree matching against a small hand-written rule set.

    Five rules are inserted into a ``RuleTree(4)``.  Transactions are then
    recorded one at a time and, after each one, ``tree.match_vector()`` must
    compare equal to the expected list for that step.
    """
    rule_specs = (
        ("abc", "d"),
        ("abc", "f"),
        ("ac", "g"),
        ("bc", "d"),
        ("de", "g"),
    )
    tree = RuleTree(4)
    for lhs, rhs in rule_specs:
        tree.insert(ItemSet(lhs), ItemSet(rhs))

    # Each entry pairs a transaction's items with the match vector expected
    # once that transaction has been recorded on top of all earlier ones.
    # NOTE(review): the vectors have one entry per inserted rule, presumably
    # in insertion order -- confirm against RuleTree.match_vector().
    expectations = [
        ("abcd", [1, 0, 0, 1, 0]),
        ("geabcd", [1, 0, 0.5, 1, 0.5]),
        ("abc", [2 / 3, 0.0, 1 / 3, 2 / 3, 1 / 3]),
        ("bcd", [0.5, 0.0, 0.25, 0.75, 0.25]),
        ("def", [0.25, 0.0, 0.25, 0.5, 0.25]),
        ("geab", [0.0, 0.0, 0.0, 0.25, 0.0]),
    ]
    cases = [(ItemSet(items), expected) for items, expected in expectations]

    print()
    for itemset, expected_results in cases:
        print("Adding {}".format(itemset))
        tree.record_matches(itemset)
        # Dump every rule's match count so a failing run is easier to read.
        for lhs, rhs in tree.rules():
            print(" {} -> {} ; {}".format(
                lhs, rhs, tree.match_count_of(lhs, rhs)))
        assert expected_results == tree.match_vector()
class DriftDetector:
    """Detect drift between a training window and a stream of transactions.

    ``train()`` builds a rule tree from a training window; afterwards
    ``check_for_drift()`` is fed one transaction at a time and periodically
    compares the accumulated test statistics against the training ones.
    """

    def __init__(self, volatility_detector):
        """Store the volatility detector later queried for drift confidence."""
        self.volatility_detector = volatility_detector

    def train(self, window, rules):
        """Build the training and test rule trees from ``window`` and ``rules``.

        Args:
            window: non-empty sized iterable of transactions.
            rules: non-empty sized iterable of 5-tuples whose first two
                elements are a rule's antecedent and consequent; the other
                three elements are ignored here.

        Raises:
            AssertionError: if ``rules`` or ``window`` is empty.
        """
        assert len(rules) > 0
        assert len(window) > 0

        tree = self.training_rule_tree = RuleTree(len(window))
        for lhs, rhs, _, _, _ in rules:
            tree.insert(lhs, rhs)

        # Feed every training transaction through the tree so that it holds
        # the rules' match frequencies for the training window.
        for txn in window:
            tree.record_matches(txn)

        # The test tree starts out as an independent deep copy of the
        # trained tree.
        self.test_rule_tree = deepcopy(tree)

        # Snapshot of the rules' match vector over the training window; this
        # is the reference that later samples are compared against.
        self.training_match_vec = tree.match_vector()

        self.num_test_transactions = 0
        self.rule_vec_mean = RollingMean()
        self.rag_bag_mean = RollingMean()

    def check_for_drift(self, transaction, transaction_num):
        """Record ``transaction`` and, once per sample interval, test for drift.

        Args:
            transaction: the transaction to record in the test rule tree.
            transaction_num: passed to the volatility detector's
                ``drift_confidence()`` and included in the printed message.

        Returns:
            ``None`` when fewer than ``SAMPLE_INTERVAL`` transactions have
            been recorded since the last sample, or when neither check
            fires; otherwise a ``Drift`` describing which check fired.
        """
        self.test_rule_tree.record_matches(transaction)
        self.num_test_transactions += 1
        if self.num_test_transactions < SAMPLE_INTERVAL:
            return None

        # A full interval has elapsed: restart the counter and take a sample.
        self.num_test_transactions = 0

        # The "+ 1" anticipates the add_sample() call further down, so the
        # confidence is ready by the time the threshold check needs it.
        # NOTE(review): this relies on add_sample() growing ``n`` by exactly
        # one; otherwise ``drift_confidence`` could be unbound when it is
        # used below -- confirm against RollingMean.
        confidence_needed = (
            self.rule_vec_mean.n + 1 > SAMPLE_THRESHOLD
            or self.rag_bag_mean.n + 1 > SAMPLE_THRESHOLD
        )
        if confidence_needed:
            gamma = self.volatility_detector.drift_confidence(transaction_num)
            print("gamma at transaction {} is {}".format(
                transaction_num, gamma))
            drift_confidence = 2.5 - gamma

        # Rule-match-vector check: distance between the training snapshot
        # and the test tree's current match vector, judged against the
        # rolling statistics of earlier distances.
        distance = hellinger(self.training_match_vec,
                             self.test_rule_tree.match_vector())
        self.rule_vec_mean.add_sample(distance)
        if self.rule_vec_mean.n > SAMPLE_THRESHOLD:
            tolerance = self.rule_vec_mean.std_dev() * drift_confidence
            average = self.rule_vec_mean.mean()
            if distance > average + tolerance or distance < average - tolerance:
                return Drift("rule-match-vector", distance, tolerance, average)

        # Rag-bag check: compare the rag bags of the training and test trees.
        training_rag_bag = self.training_rule_tree.rag_bag()
        training_count = self.training_rule_tree.transaction_count
        test_rag_bag = self.test_rule_tree.rag_bag()
        test_count = self.test_rule_tree.transaction_count
        within_bound = hoeffding_bound(
            training_rag_bag, training_count, test_rag_bag, test_count, 0.05)
        if within_bound:
            return None
        return Drift(drift_type="rag-bag")