Example #1
def test_basic_rule_tree():
    rules = [
        ("abc", "d"),
        ("abc", "f"),
        ("ac", "g"),
        ("bc", "d"),
        ("de", "g"),
    ]
    rules = [(ItemSet(a), ItemSet(c)) for (a, c) in rules]

    tree = RuleTree(4)
    for (antecedent, consequent) in rules:
        tree.insert(antecedent, consequent)

    # (transaction ItemSet, expected per-rule match vector after recording it)
    test_cases = [
        ("abcd", [1, 0, 0, 1, 0]),
        ("geabcd", [1, 0, 0.5, 1, 0.5]),
        ("abc", [2 / 3, 0.0, 1 / 3, 2 / 3, 1 / 3]),
        ("bcd", [0.5, 0.0, 0.25, 0.75, 0.25]),
        ("def", [0.25, 0.0, 0.25, 0.5, 0.25]),
        ("geab", [0.0, 0.0, 0.0, 0.25, 0.0]),
    ]
    test_cases = [(ItemSet(s), expected) for (s, expected) in test_cases]
    print()
    for (itemset, expected_results) in test_cases:
        print("Adding {}".format(itemset))
        tree.record_matches(itemset)
        for (a, c) in tree.rules():
            print("  {} -> {} ; {}".format(a, c, tree.match_count_of(a, c)))
        assert (expected_results == tree.match_vector())
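
The test above drives RuleTree entirely through ItemSet values built from strings. A minimal sketch of an ItemSet that would satisfy it, assuming the class is just a hashable wrapper over a frozenset of single-character items (the body below is illustrative, not the project's actual implementation):

class ItemSet:
    """Hedged sketch: an immutable set of items built from any iterable."""

    def __init__(self, items):
        self.items = frozenset(items)

    def issubset(self, other):
        # A rule antecedent/consequent matches a transaction when all of
        # its items appear in the transaction.
        return self.items <= other.items

    def __eq__(self, other):
        return self.items == other.items

    def __hash__(self):
        return hash(self.items)

    def __str__(self):
        return "".join(sorted(self.items))
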
Example #2
    def train(self, window, rules):
        self.training_rule_tree = RuleTree()
        for (antecedent, consequent, _, _, _) in rules:
            self.training_rule_tree.insert(antecedent, consequent)

        # Populate the training rule tree with the rule frequencies from
        # the training window.
        for transaction in window:
            self.training_rule_tree.record_matches(transaction)

        self.previous_rule_tree = self.make_test_tree()
        self.current_rule_tree = self.make_test_tree()

        # Record the mean rule miss rate over the training window.
        self.training_mean, self.training_len = \
            self.training_rule_tree.rule_miss_rate()

        self.num_test_transactions = 0
Example #3
    def train(self, window, rules):
        assert (len(rules) > 0)
        assert (len(window) > 0)
        self.training_rule_tree = RuleTree(len(window))
        for (antecedent, consequent, _, _, _) in rules:
            self.training_rule_tree.insert(antecedent, consequent)

        # Populate the training rule tree with the rule frequencies from
        # the training window.
        for transaction in window:
            self.training_rule_tree.record_matches(transaction)

        # Populate the test rule tree with a deep copy of the training set.
        self.test_rule_tree = deepcopy(self.training_rule_tree)

        # Record the match vector; the vector of rules' supports in the
        # training window.
        self.training_match_vec = self.training_rule_tree.match_vector()

        self.num_test_transactions = 0
        self.rule_vec_mean = RollingMean()
        self.rag_bag_mean = RollingMean()
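
Examples #3 and #5 assume a RollingMean with an n counter plus add_sample(), mean(), and std_dev() methods. A minimal sketch that fits that interface, using Welford's online algorithm (an assumption about the helper, not its actual source):

import math

class RollingMean:
    """Hedged sketch of the assumed helper: online mean and std-dev."""

    def __init__(self):
        self.n = 0
        self._mean = 0.0
        self._m2 = 0.0  # Sum of squared deviations (Welford's algorithm).

    def add_sample(self, x):
        self.n += 1
        delta = x - self._mean
        self._mean += delta / self.n
        self._m2 += delta * (x - self._mean)

    def mean(self):
        return self._mean

    def std_dev(self):
        # Sample standard deviation; zero until we have two samples.
        if self.n < 2:
            return 0.0
        return math.sqrt(self._m2 / (self.n - 1))
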
Example #4
  def __init__(self, top_node, edge, rule, sentence):
    self.local_node_cxt = LocalNodeContext(top_node, sentence)

    self.node = top_node
    self.edge = edge
    self.rule = rule
    self.sent = sentence
    
    self.fields = dict(edge.fvector.iteritems())

    self.cluster_rhs = self.rule.rhs

    self.treelet = RuleTree.from_lhs_string(self.rule.lhs)

    self.clustering = False
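
Examples #4 and #7 both build treelets via RuleTree.from_lhs_string, and the non_fringe() walker in Example #7 only ever touches a label and a subs list. A minimal hedged stand-in for that treelet shape (the real parser behind from_lhs_string is not shown in these examples):

class Treelet:
    # Hypothetical shape only: a labelled node with child treelets.
    def __init__(self, label, subs=None):
        self.label = label
        self.subs = subs or []
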
Example #5
class DriftDetector:
    def __init__(self, volatility_detector):
        self.volatility_detector = volatility_detector

    def train(self, window, rules):
        assert (len(rules) > 0)
        assert (len(window) > 0)
        self.training_rule_tree = RuleTree(len(window))
        for (antecedent, consequent, _, _, _) in rules:
            self.training_rule_tree.insert(antecedent, consequent)

        # Populate the training rule tree with the rule frequencies from
        # the training window.
        for transaction in window:
            self.training_rule_tree.record_matches(transaction)

        # Populate the test rule tree with a deep copy of the training set.
        self.test_rule_tree = deepcopy(self.training_rule_tree)

        # Record the match vector; the vector of rules' supports in the
        # training window.
        self.training_match_vec = self.training_rule_tree.match_vector()

        self.num_test_transactions = 0
        self.rule_vec_mean = RollingMean()
        self.rag_bag_mean = RollingMean()

    def check_for_drift(self, transaction, transaction_num):
        self.test_rule_tree.record_matches(transaction)
        self.num_test_transactions += 1
        if self.num_test_transactions < SAMPLE_INTERVAL:
            return None

        # Sample and test for drift.
        self.num_test_transactions = 0

        if (self.rule_vec_mean.n + 1 > SAMPLE_THRESHOLD
                or self.rag_bag_mean.n + 1 > SAMPLE_THRESHOLD):
            # We'll need the drift confidence below. Calculate it.
            # Note: the +1 is there because of the add_sample() call below.
            gamma = self.volatility_detector.drift_confidence(transaction_num)
            print("gamma at transaction {} is {}".format(
                transaction_num, gamma))
            drift_confidence = 2.5 - gamma

        # Detect whether the rules' supports in the test window differ
        # from the rules' supports in the training window.
        distance = hellinger(self.training_match_vec,
                             self.test_rule_tree.match_vector())
        self.rule_vec_mean.add_sample(distance)
        if self.rule_vec_mean.n > SAMPLE_THRESHOLD:
            conf = self.rule_vec_mean.std_dev() * drift_confidence
            mean = self.rule_vec_mean.mean()
            if distance > mean + conf or distance < mean - conf:
                return Drift("rule-match-vector", distance, conf, mean)

        # Detect whether the rag bag differs between the training and
        # test windows.
        if not hoeffding_bound(self.training_rule_tree.rag_bag(),
                               self.training_rule_tree.transaction_count,
                               self.test_rule_tree.rag_bag(),
                               self.test_rule_tree.transaction_count, 0.05):
            return Drift(drift_type="rag-bag")

        return None
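
check_for_drift() above leans on two helpers that are not defined in these examples: hellinger() over two support vectors and hoeffding_bound() over two sample means. Hedged sketches of common forms of both follow; the exact epsilon formula in the real project may differ:

import math

def hellinger(p, q):
    # Hellinger distance between two equal-length vectors of values in
    # [0, 1] (here: per-rule support vectors). Ranges from 0 to 1.
    total = sum((math.sqrt(a) - math.sqrt(b)) ** 2 for a, b in zip(p, q))
    return math.sqrt(total / 2.0)

def hoeffding_bound(mean_a, n_a, mean_b, n_b, delta):
    # True when the two sample means lie within the two-sample Hoeffding
    # epsilon for variables bounded in [0, 1], i.e. "no drift" at
    # confidence delta. One common form of the bound; an assumption here.
    epsilon = math.sqrt(
        ((n_a + n_b) / (2.0 * n_a * n_b)) * math.log(2.0 / delta))
    return abs(mean_a - mean_b) <= epsilon
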
Example #6
class SeedDriftDetector:
    def __init__(self, volatility_detector=None):
        self.volatility_detector = volatility_detector

    def make_test_tree(self):
        # Copy the training rule tree, so that we get a copy of the rules.
        tree = deepcopy(self.training_rule_tree)
        # Clear the rule match counts so that we can re-generate them
        # as we read in more data.
        tree.clear_rule_match_counts()
        return tree

    def train(self, window, rules):
        self.training_rule_tree = RuleTree()
        for (antecedent, consequent, _, _, _) in rules:
            self.training_rule_tree.insert(antecedent, consequent)

        # Populate the training rule tree with the rule frequencies from
        # the training window.
        for transaction in window:
            self.training_rule_tree.record_matches(transaction)

        self.previous_rule_tree = self.make_test_tree()
        self.current_rule_tree = self.make_test_tree()

        # Record the mean rule miss rate over the training window.
        self.training_mean, self.training_len = \
            self.training_rule_tree.rule_miss_rate()

        self.num_test_transactions = 0

    def should_merge(self, transaction_num):
        if self.volatility_detector is not None:
            # ProSeed; we'll not merge blocks within the "exclusion zone"
            # around the next expected drift point; we'll drop them instead.
            next_drift = self.volatility_detector.next_expected_drift(
                transaction_num)
            if (next_drift is not None and
                    abs(next_drift - transaction_num) < ProSeedMergeExclusionZone):
                return False
        prev_mean, prev_len = self.previous_rule_tree.rule_miss_rate()
        curr_mean, curr_len = self.current_rule_tree.rule_miss_rate()
        return hoeffding_bound(prev_mean, prev_len, curr_mean, curr_len,
                               BlockCompareConfidence)

    def check_for_drift(self, transaction, transaction_num):
        # Append to current block.
        self.current_rule_tree.record_matches(transaction)

        self.num_test_transactions += 1
        if self.num_test_transactions < SAMPLE_INTERVAL:
            return None
        # Test for drift.
        self.num_test_transactions = 0

        if self.previous_rule_tree.transaction_count == 0:
            # First block, can't merge/drop.
            self.previous_rule_tree.take_and_add_matches(
                self.current_rule_tree)
            return None
        else:
            # Can the current block be merged with the previous block,
            # or should the previous be dropped?
            if self.should_merge(transaction_num):
                # Blocks are similar. Merge them.
                self.previous_rule_tree.take_and_add_matches(
                    self.current_rule_tree)
            else:
                # Otherwise the blocks are different, we'll discard data in
                # the former block.
                self.previous_rule_tree.take_and_overwrite_matches(
                    self.current_rule_tree)

        # Test to see whether training block is similar to test block.
        prev_mean, prev_len = self.previous_rule_tree.rule_miss_rate()
        if not hoeffding_bound(self.training_mean, self.training_len,
                               prev_mean, prev_len, TrainingCompareConfidence):
            return Drift(drift_type=SeedDriftAlgorithm)

        return None
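
Both detector classes share the same driving pattern: train on an initial window of transactions, then feed the stream one transaction at a time. A hedged usage sketch, where training_window, mined_rules, and stream are hypothetical placeholders and mined_rules holds the 5-tuples that train() destructures:

detector = SeedDriftDetector(volatility_detector=None)
detector.train(training_window, mined_rules)
for i, transaction in enumerate(stream):
    drift = detector.check_for_drift(transaction, i)
    if drift is not None:
        # A typical response would be to re-mine rules and retrain here.
        print("Drift detected at transaction {}: {}".format(i, drift))
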
Example #7
  def extract_fsa(self, node):
    "Constructs the segment of the fsa associated with a node in the forest"
    # Memoization: if we have already processed this node, return the
    # cached states.
    if node.position_id in self.memo:
      return self.memo[node.position_id]

    # Create the FSA state for this general node (non marked)
    # (These will go away during minimization)
    down_state = fsa.BasicState(self.fsa, (node, DOWN))
    up_state = fsa.BasicState(self.fsa, (node, UP))
    self.memo[node.position_id] = (down_state, up_state)
    

    for edge in node.edges:
      previous_state = down_state
      # start experiment
      # Enumerate internal (non-local-terminal) nodes on the left-hand side.
      
      lhs_treelet = RuleTree.from_lhs_string(edge.rule.lhs)
      def non_fringe(tree):
        "Return the non-terminals that are not part of the fringe."
        if not tree.subs:
          return []
        return [tree.label] + sum(map(non_fringe, tree.subs), [])
      lhs_internal = sum(map(non_fringe, lhs_treelet.subs), [])
      print "INTERNAL", lhs_internal
      for i, nt in enumerate(lhs_internal):
        extra = "+++"+str(edge.position_id)+"+++"+str(i-10)
        fake_down_state = self.create_state((str(nt)+extra, DOWN), False)
        fake_up_state = self.create_state((str(nt)+extra, UP), False)        
        previous_state.add_edge(fake_down_state, 0.0)
        fake_down_state.add_edge(fake_up_state, 0.0)
        previous_state = fake_up_state
      
      # end experiment


      rhs = edge.rule.rhs
      
      # Always start with the parent down state ( . P ).
      nts_num = 0
      for i, sym in enumerate(rhs):
        extra = "+++"+str(edge.position_id)+"+++"+str(i)

        # next is a word ( . lex ) 
        if is_lex(sym):

          if self.unique_words:
            new_state = self.create_state((sym+extra, DOWN), True)

          else:
            new_state = self.create_state(sym, True, extra)

          previous_state.add_edge(new_state, 0.0)

          # Move the dot ( lex . )
          previous_state = new_state          
        else:
          # it's a symbol

          # local symbol name (lagrangians!)
          to_node = edge.subs[nts_num]
          nts_num += 1
          
          # We are at ( . N_id ) and need to get to ( N_id . ).

          # First, create uniquely named versions of this state, ( . N_id )
          # and ( N_id . ). We need these so that we can assign Lagrangians.
          local_down_state = self.create_state((str(to_node)+extra, DOWN), False)
          local_up_state = self.create_state((str(to_node)+extra, UP), False)

          down_sym, up_sym = self.extract_fsa(to_node)
          
          previous_state.add_edge(local_down_state, 0.0)
          local_down_state.add_edge(down_sym, 0.0)
          up_sym.add_edge(local_up_state, 0.0)

          # move the dot
          previous_state = local_up_state


      # Finish by connecting back to parent up
      previous_state.add_edge(up_state, 0.0)
    return self.memo[node.position_id]
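
Because results are cached in self.memo by node.position_id, a single call on the forest root visits every reachable node exactly once. A hedged sketch of the entry point, where forest.root is a hypothetical name for the top of the hypergraph:

    # Hypothetical driver: one call from the root covers the whole forest.
    root_down, root_up = extractor.extract_fsa(forest.root)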