Esempio n. 1
0
    def __call__(self, full_table, bad_tables, good_tables, **kwargs):
        """
        table has been trimmed of extraneous columns.
        Topdown will use all columns to construct rules

        NOTE(review): this snippet is incomplete -- `rules` is referenced
        below but never assigned in this span. Presumably a rule-generation
        step (cf. get_scorer_rules in the complete variant of this method)
        belongs between err-func setup and fill_in_rules; confirm upstream.
        """
        self.set_params(**kwargs)

        self.bad_tables = bad_tables
        self.good_tables = good_tables

        # allocate and setup error functions
        # one clone per table so each table's error state stays isolated
        self.bad_err_funcs = [self.err_func.clone() for t in bad_tables]
        self.good_err_funcs = [self.err_func.clone() for t in good_tables]

        # bind each cloned error function to its corresponding table
        for ef, t in zip(chain(self.bad_err_funcs,self.good_err_funcs),
                         chain(bad_tables, good_tables)):
            ef.setup(t)

        # downstream code expects a single err_func; use the first "bad" one
        self.err_func = self.bad_err_funcs[0]
        # materialize rule metadata over the full table, then wrap as clusters
        fill_in_rules(rules, full_table, cols=self.cols)
        self.all_clusters = [Cluster.from_rule(r, self.cols) for r in rules]
Esempio n. 2
0
    def nodes_to_popular_clusters(self, nodes, table):
      """
      Look for clauses found in more than X% of the nodes
      and turn them into clusters.

      Splits every multi-clause rule into single-clause sub-rules, counts
      occurrences of each sub-rule, and keeps those whose count reaches the
      80th percentile of all counts.
      """
      if not nodes: return []
      from collections import Counter
      counter = Counter()
      str_to_rule = {}

      for node in nodes:
        r = node.rule
        # only decompose rules that have more than one clause
        if len(r.filter.conditions) > 1:
          for cond in r.filter.conditions:
            # single-clause sub-rule inheriting the node's influence score
            newr = SDRule(r.data, r.targetClass, [cond], r.g)
            newr.quality = node.influence
            counter[newr] += 1
            str_to_rule[newr] = newr

      # BUGFIX: np.percentile raises on an empty sequence -- happens when
      # every rule had a single clause, so nothing was counted above.
      if not counter:
        return []

      thresh = np.percentile(counter.values(), 80)
      rules = []
      for strrule, count in counter.iteritems():
        if count >= thresh:  #0.25 * len(nodes):
          r = str_to_rule[strrule]
          rules.append(r)

      fill_in_rules(rules, table, cols=self.cols)
      clusters = [Cluster.from_rule(r, self.cols) for r in rules]
      return clusters
Esempio n. 3
0
 def nodes_to_clusters(self, nodes, table):
     """Convert each search node into a Cluster carrying the node's
     influence (as rule quality), states and cards."""
     result = []
     for n in nodes:
         rule = n.rule
         # propagate the node's influence as the rule's quality score
         rule.quality = n.influence
         fill_in_rules((rule,), table, cols=self.cols)
         c = Cluster.from_rule(rule, self.cols)
         c.states = n.states
         c.cards = n.cards
         result.append(c)
     return result
Esempio n. 4
0
    def __call__(self, full_table, bad_tables, good_tables, **kwargs):
        """
        table has been trimmed of extraneous columns.
        Topdown will use all columns to construct rules

        Flow: clone + setup per-table error functions, generate scored
        rules, wrap them as clusters, then merge clusters whose error is
        at or above the computed threshold.  Returns the merged clusters
        sorted by descending error.
        """
        self.set_params(**kwargs)

        self.bad_tables = bad_tables
        self.good_tables = good_tables

        # allocate and setup error functions
        # one clone per table so each table's error state stays isolated
        self.bad_err_funcs = [self.err_func.clone() for t in bad_tables]
        self.good_err_funcs = [self.err_func.clone() for t in good_tables]

        # bind each cloned error function to its corresponding table
        for ef, t in zip(chain(self.bad_err_funcs,self.good_err_funcs),
                         chain(bad_tables, good_tables)):
            ef.setup(t)

        # self.err_func.setup(table)
        # self.table = table

        #rules = self.get_scorer_rules(table, self.cols, self.err_func)
        rules = self.get_scorer_rules(full_table,
                                      bad_tables,
                                      good_tables,
                                      self.bad_err_funcs,
                                      self.good_err_funcs,
                                      self.cols)

        # downstream code expects a single err_func; use the first "bad" one
        self.err_func = self.bad_err_funcs[0]
        fill_in_rules(rules, full_table, cols=self.cols)
        self.all_clusters = [Cluster.from_rule(r, self.cols) for r in rules]

        

        # only clusters whose error reaches the threshold are merge candidates
        thresh = compute_clusters_threshold(self.all_clusters)
        is_mergable = lambda c: c.error >= thresh
        print "threshold", thresh

        
        # merge pass; timed so callers can report merge_cost
        start = time.time()
        params = dict(self.params)
        params.update({'cols' : self.cols,
                       'table' : full_table,
                       'err_func' : self.err_func})
        self.merger = ReexecMerger(**params)
        self.final_clusters = self.merger(self.all_clusters, is_mergable=is_mergable)
        self.final_clusters.sort(key=lambda c: c.error, reverse=True)
        self.merge_cost = time.time() - start

        return self.final_clusters
Esempio n. 5
0
    def __call__(self, full_table, bad_tables, good_tables, **kwargs):
        """
        table has been trimmed of extraneous columns.

        Runs the bottom-up search (self.foo) once per discrete-attribute
        cardinality bound (or once if there are no discrete attributes),
        then converts the best rules found for the first `c` value into
        clusters, sorted by descending quality.
        """
        self.setup_tables(full_table, bad_tables, good_tables, **kwargs)
        # fall back to the single configured c if no list of c values given
        if not self.cs:
            self.cs = [self.c]
        
        # per-c bookkeeping of best rules and checkpoints
        self.bests_per_c = defaultdict(list)
        for c in self.cs:
            self.bests_per_c[c] = list()
            self.checkpoints_per_c[c] = list()

        self.max_complexity = kwargs.get('max_complexity', self.max_complexity)
        self.granularity = kwargs.get('granularity', self.granularity)
        # discrete attributes among the columns we are allowed to use
        discretes = [attr for attr in full_table.domain 
                     if attr.name in self.cols and attr.var_type == Orange.feature.Type.Discrete]


        # precompute unit clauses per column for the search to combine
        self.all_clauses = map(self.all_unit_clauses, self.cols)
        self.col_to_clauses = dict(zip(self.cols, self.all_clauses))
        base_rule = SDRule(self.full_table, None)            


        start = time.time()
        if discretes:
            # iteratively raise the max cardinality bound; self.stop allows
            # the search to abort early
            max_card = max([len(attr.values) for attr in discretes])
            for m in xrange(1, max_card+1):
                if self.stop:
                    break
                self.foo(base_rule, max_card=m)
        else:
            self.foo(base_rule)
        self.cost = time.time() - start


        # given a rule = c1,..,cn
        # options for each clause c_i
        # 1) extend c_i to the left, to the right
        # 2) append a new clause
        rules = self.bests_per_c[self.cs[0]]
        rules.sort(key=lambda r: r.quality, reverse=True)
        _logger.debug("best\n%s", "\n".join(map(lambda r: '%.4f\t%s' % (r.quality, str(r)), rules)))

        fill_in_rules(rules, full_table, cols=self.cols)
        clusters = [Cluster.from_rule(rule, self.cols, rule.quality) for rule in rules]
        self.all_clusters = clusters

        self.costs = {'cost' : self.cost}
        return clusters
Esempio n. 6
0
 def blah_to_cluster(self, blah):
   """Wrap a search result's rule in a Cluster and score its influence."""
   r = blah.rule
   fill_in_rules([r], self.full_table, self.cols)
   cluster = Cluster.from_rule(r, self.cols)
   # error is the cluster's influence over the full table
   cluster.error = self.influence_cluster(cluster, self.full_table)
   return cluster
Esempio n. 7
0
    def __call__(self, full_table, bad_tables, good_tables, **kwargs):
        """
        table has been trimmed of extraneous columns.
        """
        self.full_table = full_table
        self.bad_tables = bad_tables
        self.good_tables = good_tables
        self.bad_err_funcs = [self.err_func.clone() for t in bad_tables]
        self.good_err_funcs = [self.err_func.clone() for t in good_tables]

        for ef, t in zip(chain(self.bad_err_funcs, self.good_err_funcs),
                         chain(bad_tables, good_tables)):
            ef.setup(t)

        self.max_bad_stat = -1e100000000


        rules = None
        self.best = []
        
        while rules is None or rules:

            new_rules = self.make_rules(rules)
            nadded = self.top_k(new_rules)
            pruned_rules = self.prune_rules(new_rules)

            print "bad thresh\t%f" % self.bad_thresh
            print "n added\t%d" % nadded
            print "n rules\t%d" % (sum(map(len, pruned_rules.values())))
            self.best.sort()
            print '\n'.join(map(lambda ro: '\t%f\t%s' % (ro.inf, str(ro.rule)), self.best))

            
            if not nadded:
                break


            rules = pruned_rules

        self.best.sort(reverse=True)
        rules = [ro.rule for ro in self.best]
        self.err_func = self.bad_err_funcs[0]
        fill_in_rules(rules, self.full_table, cols=self.cols)
        self.all_clusters = [Cluster.from_rule(r, self.cols) for r in rules]
        self.all_clusters.sort(key=lambda c: c.error, reverse=True)
        return self.all_clusters

        thresh = compute_clusters_threshold(self.all_clusters)
        is_mergable = lambda c: c.error >= thresh
        print "threshold", thresh


        start = time.time()
        params = dict(self.params)
        params.update({'cols' : self.cols,
                       'full_table' : full_table,
                       'bad_tables' : self.bad_tables,
                       'good_tables' : self.good_tables,
                       'bad_err_funcs' : self.bad_err_funcs,
                       'good_err_funcs' : self.good_err_funcs,
                       'err_func' : self.err_func})
        self.merger = ReexecMerger(**params)
        self.final_clusters = self.merger(self.all_clusters, is_mergable=is_mergable)
        self.final_clusters.sort(key=lambda c: c.error, reverse=True)
        self.merge_cost = time.time() - start

        final_rules = clusters_to_rules(self.final_clusters, full_table)
        print "\n============Besties==========="
        for rule in final_rules:
            print "%f\t%s" % (rule.quality, str(rule))


        return self.final_clusters





        return self.all_clusters
Esempio n. 8
0
    def __call__(self, full_table, bad_tables, good_tables, **kwargs):
        """
        table has been trimmed of extraneous columns.
        """
        self.setup_tables(full_table, bad_tables, good_tables, **kwargs)

        self.SCORE_ID = add_meta_column(
                chain(self.bad_tables, self.good_tables),
                SCORE_VAR)
        self.CLASS_ID = add_meta_column(
                chain(self.bad_tables, self.good_tables),
                "INFCLASS",
                vals=['0', '1'])

        start = time.time()
        self.compute_influences(self.bad_tables, self.bad_err_funcs)
        self.compute_influences(self.good_tables, self.good_err_funcs)
        self.cost_compute_inf = time.time() - start



        start = time.time()
        if self.tree_alg == 'c45':
          table, rules = self.c45_rules()
        elif self.tree_alg == 'or':
          table, rules = self.orange_dt_rules()
        elif self.tree_alg == 'dt':
          table, rules = self.sk_dt_rules(max_depth=12)
        elif self.tree_alg == 'rt':
          table, rules = self.sk_rt_rules(max_depth=12)
        else:
          _logger.warn("unknown NDT algorithm %s.  Defaulting to regression tree", self.tree_alg)
          table, rules = self.sk_rt_rules(max_depth=12)
        self.cost_learn = time.time() - start


        #
        # ok now convert rules to clusters
        #

        _logger.debug( "got %d rules", len(rules))
        fill_in_rules(rules, table, cols=self.cols)

        self.cost_learn = time.time() - start

        clusters = [Cluster.from_rule(rule, self.cols) for rule in rules]
        for cluster in clusters:
          cluster.error = self.influence_cluster(cluster)
        clusters = filter_bad_clusters(clusters)
        clusters.sort(key=lambda c: c.error, reverse=True)
        print '\n'.join(map(str, clusters[:5]))

        self.all_clusters = self.final_clusters = clusters
        return self.final_clusters

        #
        # merge the clusters
        #
        thresh = compute_clusters_threshold(clusters, nstds=1.5)
        is_mergable = lambda c: c.error >= thresh

        params = dict(kwargs)
        params.update({
          'cols' : self.cols,
          'err_func' : self.err_func,
          'influence' : lambda c: self.influence_cluster(c),
          'influence_components': lambda c: self.influence_cluster_components(c),
          'is_mergable' : is_mergable,
          'use_mtuples' : False,
          'learner' : self})
        self.merger = Merger(**params)
        merged_clusters = self.merger(clusters)
        merged_clusters.sort(key=lambda c: c.error, reverse=True)


        clusters.extend(merged_clusters)
        normalize_cluster_errors(clusters)
        clusters = list(set(clusters))
        self.all_clusters = clusters
        self.final_clusters = merged_clusters

        self.costs = {
            'cost_learn' : self.cost_learn
        }
        return self.final_clusters