def test_basic_rule_tree():
    rules = [
        ("abc", "d"),
        ("abc", "f"),
        ("ac", "g"),
        ("bc", "d"),
        ("de", "g"),
    ]
    rules = [(ItemSet(antecedent), ItemSet(consequent))
             for (antecedent, consequent) in rules]
    tree = RuleTree(4)
    for (antecedent, consequent) in rules:
        tree.insert(antecedent, consequent)

    # Each case is (transaction ItemSet, expected match vector after the
    # transaction has been recorded).
    test_cases = [
        ("abcd", [1, 0, 0, 1, 0]),
        ("geabcd", [1, 0, 0.5, 1, 0.5]),
        ("abc", [2 / 3, 0.0, 1 / 3, 2 / 3, 1 / 3]),
        ("bcd", [0.5, 0.0, 0.25, 0.75, 0.25]),
        ("def", [0.25, 0.0, 0.25, 0.5, 0.25]),
        ("geab", [0.0, 0.0, 0.0, 0.25, 0.0]),
    ]
    test_cases = [(ItemSet(transaction), expected)
                  for (transaction, expected) in test_cases]

    print()
    for (itemset, expected_results) in test_cases:
        print("Adding {}".format(itemset))
        tree.record_matches(itemset)
        for (a, c) in tree.rules():
            print("  {} -> {} ; {}".format(a, c, tree.match_count_of(a, c)))
        assert expected_results == tree.match_vector()
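# The ItemSet class isn't shown in this excerpt. A minimal sketch of the
# behaviour the test above assumes -- construction from an iterable of items
# (e.g. the characters of "abc"), set-style containment for rule matching,
# and value equality -- might look like the following. The frozenset backing
# is an assumption for illustration, not the project's actual implementation.
class ItemSet:
    def __init__(self, items):
        self.items = frozenset(items)

    def issubset(self, other):
        # True if every item in this set also occurs in `other`; the
        # containment test a rule tree needs when matching a rule's
        # antecedent and consequent against a transaction.
        return self.items <= other.items

    def __eq__(self, other):
        return self.items == other.items

    def __hash__(self):
        return hash(self.items)

    def __repr__(self):
        return "".join(sorted(map(str, self.items)))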
def __init__(self, top_node, edge, rule, sentence):
    self.local_node_cxt = LocalNodeContext(top_node, sentence)
    self.node = top_node
    self.edge = edge
    self.rule = rule
    self.sent = sentence
    # Copy the edge's feature vector into a plain dict.
    self.fields = {a: b for a, b in edge.fvector.items()}
    self.cluster_rhs = self.rule.rhs
    self.treelet = RuleTree.from_lhs_string(self.rule.lhs)
    self.clustering = False
class DriftDetector:
    def __init__(self, volatility_detector):
        self.volatility_detector = volatility_detector

    def train(self, window, rules):
        assert len(rules) > 0
        assert len(window) > 0
        self.training_rule_tree = RuleTree(len(window))
        for (antecedent, consequent, _, _, _) in rules:
            self.training_rule_tree.insert(antecedent, consequent)
        # Populate the training rule tree with the rule frequencies from
        # the training window.
        for transaction in window:
            self.training_rule_tree.record_matches(transaction)
        # Populate the test rule tree with a deep copy of the training set.
        self.test_rule_tree = deepcopy(self.training_rule_tree)
        # Record the match vector: the vector of rules' supports in the
        # training window.
        self.training_match_vec = self.training_rule_tree.match_vector()
        self.num_test_transactions = 0
        self.rule_vec_mean = RollingMean()
        self.rag_bag_mean = RollingMean()

    def check_for_drift(self, transaction, transaction_num):
        self.test_rule_tree.record_matches(transaction)
        self.num_test_transactions += 1
        if self.num_test_transactions < SAMPLE_INTERVAL:
            return None
        # Sample and test for drift.
        self.num_test_transactions = 0
        if (self.rule_vec_mean.n + 1 > SAMPLE_THRESHOLD
                or self.rag_bag_mean.n + 1 > SAMPLE_THRESHOLD):
            # We'll need the drift confidence below. Calculate it.
            # Note: the +1 is there because of the add_sample() call below.
            gamma = self.volatility_detector.drift_confidence(transaction_num)
            print("gamma at transaction {} is {}".format(
                transaction_num, gamma))
            drift_confidence = 2.5 - gamma
        # Detect whether the rules' supports in the test window differ
        # from the rules' supports in the training window.
        distance = hellinger(self.training_match_vec,
                             self.test_rule_tree.match_vector())
        self.rule_vec_mean.add_sample(distance)
        if self.rule_vec_mean.n > SAMPLE_THRESHOLD:
            conf = self.rule_vec_mean.std_dev() * drift_confidence
            mean = self.rule_vec_mean.mean()
            if distance > mean + conf or distance < mean - conf:
                return Drift("rule-match-vector", distance, conf, mean)
        # Detect whether the rag bag differs between the training and
        # test windows.
        if not hoeffding_bound(self.training_rule_tree.rag_bag(),
                               self.training_rule_tree.transaction_count,
                               self.test_rule_tree.rag_bag(),
                               self.test_rule_tree.transaction_count,
                               0.05):
            return Drift(drift_type="rag-bag")
        return None
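# check_for_drift() relies on a hellinger() helper and a RollingMean class
# that are not shown in this excerpt. Sketches of plausible implementations
# follow; both are assumptions inferred from the call sites above, not the
# project's actual code.
import math

def hellinger(p, q):
    # Textbook Hellinger distance between two equal-length, non-negative
    # vectors treated as discrete distributions:
    #   H(p, q) = (1 / sqrt(2)) * sqrt(sum_i (sqrt(p_i) - sqrt(q_i))^2)
    assert len(p) == len(q)
    return math.sqrt(sum((math.sqrt(a) - math.sqrt(b)) ** 2
                         for (a, b) in zip(p, q))) / math.sqrt(2)

class RollingMean:
    # Online mean/standard deviation via Welford's algorithm, matching the
    # add_sample()/mean()/std_dev()/n interface used by check_for_drift().
    def __init__(self):
        self.n = 0
        self._mean = 0.0
        self._m2 = 0.0

    def add_sample(self, x):
        self.n += 1
        delta = x - self._mean
        self._mean += delta / self.n
        self._m2 += delta * (x - self._mean)

    def mean(self):
        return self._mean

    def std_dev(self):
        # Sample standard deviation; zero until there are two samples.
        if self.n < 2:
            return 0.0
        return math.sqrt(self._m2 / (self.n - 1))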
class SeedDriftDetector:
    def __init__(self, volatility_detector=None):
        self.volatility_detector = volatility_detector

    def make_test_tree(self):
        # Copy the training rule tree, so that we get a copy of the rules.
        tree = deepcopy(self.training_rule_tree)
        # Clear the rule match counts so that we can re-generate them
        # as we read in more data.
        tree.clear_rule_match_counts()
        return tree

    def train(self, window, rules):
        self.training_rule_tree = RuleTree()
        for (antecedent, consequent, _, _, _) in rules:
            self.training_rule_tree.insert(antecedent, consequent)
        # Populate the training rule tree with the rule frequencies from
        # the training window.
        for transaction in window:
            self.training_rule_tree.record_matches(transaction)
        self.previous_rule_tree = self.make_test_tree()
        self.current_rule_tree = self.make_test_tree()
        # Record the mean rule miss rate (and sample size) over the
        # training window.
        self.training_mean, self.training_len = \
            self.training_rule_tree.rule_miss_rate()
        self.num_test_transactions = 0

    def should_merge(self, transaction_num):
        if self.volatility_detector is not None:
            # ProSeed: don't merge blocks within the "exclusion zone"
            # around the next expected drift point; drop them instead.
            next_drift = self.volatility_detector.next_expected_drift(
                transaction_num)
            if (next_drift is not None and
                    abs(next_drift - transaction_num) <
                    ProSeedMergeExclusionZone):
                return False
        prev_mean, prev_len = self.previous_rule_tree.rule_miss_rate()
        curr_mean, curr_len = self.current_rule_tree.rule_miss_rate()
        return hoeffding_bound(prev_mean, prev_len, curr_mean, curr_len,
                               BlockCompareConfidence)

    def check_for_drift(self, transaction, transaction_num):
        # Append to the current block.
        self.current_rule_tree.record_matches(transaction)
        self.num_test_transactions += 1
        if self.num_test_transactions < SAMPLE_INTERVAL:
            return None
        # Test for drift.
        self.num_test_transactions = 0
        if self.previous_rule_tree.transaction_count == 0:
            # First block; there is nothing to merge or drop yet.
            self.previous_rule_tree.take_and_add_matches(
                self.current_rule_tree)
            return None
        # Can the current block be merged with the previous block,
        # or should the previous block be dropped?
        if self.should_merge(transaction_num):
            # The blocks are similar. Merge them.
            self.previous_rule_tree.take_and_add_matches(
                self.current_rule_tree)
        else:
            # The blocks differ; discard the data in the previous block.
            self.previous_rule_tree.take_and_overwrite_matches(
                self.current_rule_tree)
        # Test whether the training block is similar to the test block.
        prev_mean, prev_len = self.previous_rule_tree.rule_miss_rate()
        if not hoeffding_bound(self.training_mean, self.training_len,
                               prev_mean, prev_len,
                               TrainingCompareConfidence):
            return Drift(drift_type=SeedDriftAlgorithm)
        return None
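# Both detectors compare block statistics with a hoeffding_bound() helper
# whose definition is not shown. The call sites suggest it returns True when
# two sample means are within a Hoeffding epsilon of each other (the samples
# look alike) and False when they differ enough to signal drift or block
# dissimilarity. The sketch below uses an ADWIN-style epsilon over the
# harmonic mean of the sample sizes; this exact form is an assumption.
import math

def hoeffding_bound(mean_1, n_1, mean_2, n_2, delta):
    # True when |mean_1 - mean_2| is within the Hoeffding bound at
    # confidence level delta. Assumes the means lie in [0, 1].
    if n_1 == 0 or n_2 == 0:
        return True
    m = 1.0 / (1.0 / n_1 + 1.0 / n_2)  # combined (harmonic-style) sample size
    epsilon = math.sqrt(math.log(4.0 / delta) / (2.0 * m))
    return abs(mean_1 - mean_2) < epsilon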
def extract_fsa(self, node):
    "Constructs the segment of the FSA associated with a node in the forest."
    # Memoization: return the cached states if we have done this node already.
    if node.position_id in self.memo:
        return self.memo[node.position_id]

    # Create the FSA states for this general node (non-marked).
    # (These will go away during minimization.)
    down_state = fsa.BasicState(self.fsa, (node, DOWN))  # self.create_state((node, DOWN), False)
    up_state = fsa.BasicState(self.fsa, (node, UP))      # self.create_state((node, UP), False)
    self.memo[node.position_id] = (down_state, up_state)

    for edge in node.edges:
        previous_state = down_state

        # start experiment
        # Enumerate internal (non-local-terminal) nodes on the left-hand side.
        lhs = edge.rule.lhs
        lhs_treelet = RuleTree.from_lhs_string(lhs)

        def non_fringe(tree):
            "Get the non-terminals that are not part of the fringe."
            if not tree.subs:
                return []
            return [tree.label] + sum(map(non_fringe, tree.subs), [])

        lhs_internal = sum(map(non_fringe, lhs_treelet.subs), [])
        print("INTERNAL", lhs_internal)
        for i, nt in enumerate(lhs_internal):
            extra = "+++" + str(edge.position_id) + "+++" + str(i - 10)
            fake_down_state = self.create_state((str(nt) + extra, DOWN), False)
            fake_up_state = self.create_state((str(nt) + extra, UP), False)
            previous_state.add_edge(fake_down_state, 0.0)
            fake_down_state.add_edge(fake_up_state, 0.0)
            previous_state = fake_up_state
        # end experiment

        rhs = edge.rule.rhs
        # Always start with the parent down state ( . P ).
        nts_num = 0
        for i, sym in enumerate(rhs):
            extra = "+++" + str(edge.position_id) + "+++" + str(i)
            if is_lex(sym):
                # The next symbol is a word ( . lex ).
                if self.unique_words:
                    new_state = self.create_state((sym + extra, DOWN), True)
                else:
                    new_state = self.create_state(sym, True, extra)
                previous_state.add_edge(new_state, 0.0)
                # Move the dot ( lex . ).
                previous_state = new_state
            else:
                # It's a non-terminal: map the local symbol name to the
                # corresponding sub-node (lagrangians!).
                to_node = edge.subs[nts_num]
                nts_num += 1
                # We are at ( . N_id ) and need to get to ( N_id . ).
                # First, create uniquely named versions of this state,
                # ( . N_id ) and ( N_id . ), so that we can assign
                # lagrangians to them.
                local_down_state = self.create_state((str(to_node) + extra, DOWN), False)
                local_up_state = self.create_state((str(to_node) + extra, UP), False)
                down_sym, up_sym = self.extract_fsa(to_node)
                previous_state.add_edge(local_down_state, 0.0)
                local_down_state.add_edge(down_sym, 0.0)
                up_sym.add_edge(local_up_state, 0.0)
                # Move the dot.
                previous_state = local_up_state

        # for nt in lhs_internal:
        #     extra = "+++" + str(edge.position_id) + "+++-1"
        #     local_up_state = self.create_state((str(nt) + extra, UP), False)
        #     previous_state.add_edge(local_up_state, 0.0)
        #     previous_state = local_up_state

        # extra = "+++" + str(edge.position_id) + "+++" + str(i + 1)
        # end_hyp_edge = self.create_state(
        #     ("edge" + extra,
        #      (edge.rule.tree_size(), edge.fvector["text-length"],
        #       edge.fvector)),
        #     False)
        # previous_state.add_edge(end_hyp_edge, 0.0)
        # previous_state = end_hyp_edge

        # Finish by connecting back to the parent up state.
        previous_state.add_edge(up_state, 0.0)

    return self.memo[node.position_id]
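# For intuition, here is the non_fringe() helper from extract_fsa() run
# against a hypothetical Treelet stand-in. The real treelets come from
# RuleTree.from_lhs_string(), whose format isn't shown in this excerpt; only
# the .label/.subs attributes that non_fringe() touches are assumed.
class Treelet:
    def __init__(self, label, subs=()):
        self.label = label
        self.subs = list(subs)

def non_fringe(tree):
    # Same logic as in extract_fsa(): labels of nodes that have children,
    # i.e. the non-terminals that are not part of the fringe.
    if not tree.subs:
        return []
    return [tree.label] + sum(map(non_fringe, tree.subs), [])

t = Treelet("S", [Treelet("NP"), Treelet("VP", [Treelet("V"), Treelet("NP")])])
print(non_fringe(t))                      # ['S', 'VP']
print(sum(map(non_fringe, t.subs), []))   # ['VP'] -- what lhs_internal computes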