def __call__(self, full_table, bad_tables, good_tables, **kwargs):
    """
    Learn rules over the bad/good tables and store them as clusters.

    table has been trimmed of extraneous columns.  Topdown will use all
    columns to construct rules.

    Side effects: sets self.bad_tables/good_tables, the per-table error
    functions, self.err_func, and self.all_clusters.
    """
    self.set_params(**kwargs)
    self.bad_tables = bad_tables
    self.good_tables = good_tables

    # allocate and setup error functions -- one clone per table so each
    # error function carries independent per-table state
    self.bad_err_funcs = [self.err_func.clone() for t in bad_tables]
    self.good_err_funcs = [self.err_func.clone() for t in good_tables]

    for ef, t in zip(chain(self.bad_err_funcs, self.good_err_funcs),
                     chain(bad_tables, good_tables)):
        ef.setup(t)

    # BUG FIX: `rules` was used below without ever being assigned, which
    # raised a NameError at runtime.  Compute the rules via the scorer
    # first, mirroring the sibling __call__ implementation in this file.
    # NOTE(review): confirm this class defines get_scorer_rules with this
    # signature -- the original line may have been lost in an edit.
    rules = self.get_scorer_rules(full_table, bad_tables, good_tables,
                                  self.bad_err_funcs, self.good_err_funcs,
                                  self.cols)

    self.err_func = self.bad_err_funcs[0]

    fill_in_rules(rules, full_table, cols=self.cols)
    self.all_clusters = [Cluster.from_rule(r, self.cols) for r in rules]
def nodes_to_popular_clusters(self, nodes, table):
    """
    Look for clauses found in more than X% of the nodes and turn them
    into clusters.

    Single-clause rules are extracted from every multi-clause node rule,
    counted, and the ones at or above the 80th percentile of counts are
    converted into clusters.  Returns [] when there is nothing to count.
    """
    if not nodes:
        return []

    from collections import Counter
    counter = Counter()
    str_to_rule = {}

    # break each multi-clause rule into its single-clause components and
    # count how often each component appears across all nodes
    for node in nodes:
        r = node.rule
        if len(r.filter.conditions) > 1:
            for cond in r.filter.conditions:
                newr = SDRule(r.data, r.targetClass, [cond], r.g)
                newr.quality = node.influence
                counter[newr] += 1
                str_to_rule[newr] = newr

    # BUG FIX: np.percentile raises on an empty sequence, which happens
    # whenever every node rule has <= 1 condition (the loop above then
    # never populates the counter).  Bail out early in that case.
    if not counter:
        return []

    thresh = np.percentile(counter.values(), 80)
    rules = []
    for strrule, count in counter.iteritems():
        if count >= thresh:  # 0.25 * len(nodes):
            r = str_to_rule[strrule]
            rules.append(r)

    fill_in_rules(rules, table, cols=self.cols)
    clusters = [Cluster.from_rule(r, self.cols) for r in rules]
    return clusters
def nodes_to_clusters(self, nodes, table):
    """Convert search nodes into Clusters, carrying each node's
    influence over as the rule quality and copying the node's
    states/cards onto the resulting cluster."""
    def convert(node):
        node.rule.quality = node.influence
        fill_in_rules((node.rule,), table, cols=self.cols)
        cluster = Cluster.from_rule(node.rule, self.cols)
        cluster.states = node.states
        cluster.cards = node.cards
        return cluster

    return [convert(node) for node in nodes]
def __call__(self, full_table, bad_tables, good_tables, **kwargs):
    """
    table has been trimmed of extraneous columns. Topdown will use all
    columns to construct rules.

    Pipeline: clone one error function per bad/good table, score candidate
    rules, convert them to clusters, then re-merge the clusters whose error
    is at or above the computed threshold.  Returns the merged clusters,
    sorted by descending error.
    """
    self.set_params(**kwargs)
    self.bad_tables = bad_tables
    self.good_tables = good_tables

    # allocate and setup error functions -- one clone per table so each
    # carries independent per-table state
    self.bad_err_funcs = [self.err_func.clone() for t in bad_tables]
    self.good_err_funcs = [self.err_func.clone() for t in good_tables]

    for ef, t in zip(chain(self.bad_err_funcs, self.good_err_funcs),
                     chain(bad_tables, good_tables)):
        ef.setup(t)

    # self.err_func.setup(table)
    # self.table = table
    #rules = self.get_scorer_rules(table, self.cols, self.err_func)
    rules = self.get_scorer_rules(full_table, bad_tables, good_tables,
                                  self.bad_err_funcs, self.good_err_funcs,
                                  self.cols)

    # downstream code expects a single err_func; use the first bad one
    self.err_func = self.bad_err_funcs[0]

    fill_in_rules(rules, full_table, cols=self.cols)
    self.all_clusters = [Cluster.from_rule(r, self.cols) for r in rules]

    # merge only clusters whose error is at/above the threshold
    thresh = compute_clusters_threshold(self.all_clusters)
    is_mergable = lambda c: c.error >= thresh
    print "threshold", thresh

    start = time.time()
    params = dict(self.params)
    params.update({'cols' : self.cols,
                   'table' : full_table,
                   'err_func' : self.err_func})
    self.merger = ReexecMerger(**params)
    self.final_clusters = self.merger(self.all_clusters, is_mergable=is_mergable)
    self.final_clusters.sort(key=lambda c: c.error, reverse=True)
    self.merge_cost = time.time() - start

    return self.final_clusters
def __call__(self, full_table, bad_tables, good_tables, **kwargs):
    """
    table has been trimmed of extraneous columns.

    Runs the search (self.foo) once per candidate cardinality when discrete
    attributes are present, then converts the best rules for the first `c`
    value into clusters.  Returns the clusters (also kept in
    self.all_clusters).
    """
    self.setup_tables(full_table, bad_tables, good_tables, **kwargs)

    if not self.cs:
        self.cs = [self.c]

    # reset per-c bookkeeping before the search
    self.bests_per_c = defaultdict(list)
    for c in self.cs:
        self.bests_per_c[c] = list()
        self.checkpoints_per_c[c] = list()

    self.max_complexity = kwargs.get('max_complexity', self.max_complexity)
    self.granularity = kwargs.get('granularity', self.granularity)

    # discrete attributes among the columns we rule over (Orange domain)
    discretes = [attr for attr in full_table.domain
                 if attr.name in self.cols
                 and attr.var_type == Orange.feature.Type.Discrete]

    self.all_clauses = map(self.all_unit_clauses, self.cols)
    self.col_to_clauses = dict(zip(self.cols, self.all_clauses))
    base_rule = SDRule(self.full_table, None)

    start = time.time()
    if discretes:
        # run the search with increasing max cardinality, stopping early
        # if self.stop is set.  NOTE(review): self.foo is the actual
        # search routine -- defined elsewhere in this class.
        max_card = max([len(attr.values) for attr in discretes])
        for m in xrange(1, max_card+1):
            if self.stop:
                break
            self.foo(base_rule, max_card=m)
    else:
        self.foo(base_rule)
    self.cost = time.time() - start

    # given a rule = c1,..,cn
    # options for each clause c_i
    # 1) extend c_i to the left, to the right
    # 2) append a new clause

    # only the best rules for the first c value are returned
    rules = self.bests_per_c[self.cs[0]]
    rules.sort(key=lambda r: r.quality, reverse=True)
    _logger.debug("best\n%s",
                  "\n".join(map(lambda r: '%.4f\t%s' % (r.quality, str(r)), rules)))

    fill_in_rules(rules, full_table, cols=self.cols)
    clusters = [Cluster.from_rule(rule, self.cols, rule.quality) for rule in rules]
    self.all_clusters = clusters
    self.costs = {'cost' : self.cost}
    return clusters
def blah_to_cluster(self, blah):
    """Turn a search result's rule into a Cluster scored against the
    full table via influence_cluster."""
    the_rule = blah.rule
    fill_in_rules([the_rule], self.full_table, self.cols)
    cluster = Cluster.from_rule(the_rule, self.cols)
    cluster.error = self.influence_cluster(cluster, self.full_table)
    return cluster
def __call__(self, full_table, bad_tables, good_tables, **kwargs):
    """
    table has been trimmed of extraneous columns.

    Iteratively grows rules (make_rules), keeps a top-k list in self.best,
    and prunes between iterations until no new rule is added.  Converts the
    best rules to clusters and returns them sorted by descending error.
    """
    self.full_table = full_table
    self.bad_tables = bad_tables
    self.good_tables = good_tables

    # one error-function clone per table, each set up on its own table
    self.bad_err_funcs = [self.err_func.clone() for t in bad_tables]
    self.good_err_funcs = [self.err_func.clone() for t in good_tables]
    for ef, t in zip(chain(self.bad_err_funcs, self.good_err_funcs),
                     chain(bad_tables, good_tables)):
        ef.setup(t)

    # sentinel: effectively -inf so any real stat beats it
    self.max_bad_stat = -1e100000000

    rules = None
    self.best = []

    # first iteration runs with rules=None (seed rules); subsequent
    # iterations expand the pruned rules until top_k adds nothing new
    while rules is None or rules:
        new_rules = self.make_rules(rules)
        nadded = self.top_k(new_rules)
        pruned_rules = self.prune_rules(new_rules)

        print "bad thresh\t%f" % self.bad_thresh
        print "n added\t%d" % nadded
        print "n rules\t%d" % (sum(map(len, pruned_rules.values())))
        self.best.sort()
        print '\n'.join(map(lambda ro: '\t%f\t%s' % (ro.inf, str(ro.rule)), self.best))

        if not nadded:
            break

        rules = pruned_rules

    self.best.sort(reverse=True)
    rules = [ro.rule for ro in self.best]

    self.err_func = self.bad_err_funcs[0]
    fill_in_rules(rules, self.full_table, cols=self.cols)

    self.all_clusters = [Cluster.from_rule(r, self.cols) for r in rules]
    self.all_clusters.sort(key=lambda c: c.error, reverse=True)
    return self.all_clusters

    # NOTE(review): everything below is unreachable due to the return
    # above -- it looks like a disabled cluster-merging stage (plus two
    # stale returns at the end).  Confirm intent before deleting.
    thresh = compute_clusters_threshold(self.all_clusters)
    is_mergable = lambda c: c.error >= thresh
    print "threshold", thresh

    start = time.time()
    params = dict(self.params)
    params.update({'cols' : self.cols,
                   'full_table' : full_table,
                   'bad_tables' : self.bad_tables,
                   'good_tables' : self.good_tables,
                   'bad_err_funcs' : self.bad_err_funcs,
                   'good_err_funcs' : self.good_err_funcs,
                   'err_func' : self.err_func})
    self.merger = ReexecMerger(**params)
    self.final_clusters = self.merger(self.all_clusters, is_mergable=is_mergable)
    self.final_clusters.sort(key=lambda c: c.error, reverse=True)
    self.merge_cost = time.time() - start

    final_rules = clusters_to_rules(self.final_clusters, full_table)
    print "\n============Besties==========="
    for rule in final_rules:
        print "%f\t%s" % (rule.quality, str(rule))

    return self.final_clusters
    return self.all_clusters
def __call__(self, full_table, bad_tables, good_tables, **kwargs):
    """
    table has been trimmed of extraneous columns.

    Decision-tree based learner: computes per-row influences, learns a
    tree with the configured algorithm (c45 / orange / sklearn dt / rt),
    converts the extracted rules to clusters, filters and sorts them, and
    returns the result.
    """
    self.setup_tables(full_table, bad_tables, good_tables, **kwargs)

    # meta columns used to store per-row score and class labels
    self.SCORE_ID = add_meta_column(
        chain(self.bad_tables, self.good_tables),
        SCORE_VAR)
    self.CLASS_ID = add_meta_column(
        chain(self.bad_tables, self.good_tables),
        "INFCLASS",
        vals=['0', '1'])

    start = time.time()
    self.compute_influences(self.bad_tables, self.bad_err_funcs)
    self.compute_influences(self.good_tables, self.good_err_funcs)
    self.cost_compute_inf = time.time() - start

    start = time.time()
    # dispatch on the configured tree algorithm; unknown values fall back
    # to the sklearn regression tree
    if self.tree_alg == 'c45':
        table, rules = self.c45_rules()
    elif self.tree_alg == 'or':
        table, rules = self.orange_dt_rules()
    elif self.tree_alg == 'dt':
        table, rules = self.sk_dt_rules(max_depth=12)
    elif self.tree_alg == 'rt':
        table, rules = self.sk_rt_rules(max_depth=12)
    else:
        _logger.warn("unknown NDT algorithm %s. Defaulting to regression tree",
                     self.tree_alg)
        table, rules = self.sk_rt_rules(max_depth=12)
    self.cost_learn = time.time() - start

    #
    # ok now convert rules to clusters
    #

    _logger.debug("got %d rules", len(rules))
    fill_in_rules(rules, table, cols=self.cols)
    # NOTE(review): cost_learn is overwritten here with a larger window
    # (same `start`, now also covering fill_in_rules) -- likely one of
    # these two assignments is stale.
    self.cost_learn = time.time() - start

    clusters = [Cluster.from_rule(rule, self.cols) for rule in rules]
    for cluster in clusters:
        cluster.error = self.influence_cluster(cluster)
    clusters = filter_bad_clusters(clusters)
    clusters.sort(key=lambda c: c.error, reverse=True)
    print '\n'.join(map(str, clusters[:5]))

    self.all_clusters = self.final_clusters = clusters
    return self.final_clusters

    # NOTE(review): the merging stage below is unreachable due to the
    # return above -- presumably disabled intentionally; confirm before
    # deleting.
    #
    # merge the clusters
    #
    thresh = compute_clusters_threshold(clusters, nstds=1.5)
    is_mergable = lambda c: c.error >= thresh

    params = dict(kwargs)
    params.update({
        'cols' : self.cols,
        'err_func' : self.err_func,
        'influence' : lambda c: self.influence_cluster(c),
        'influence_components': lambda c: self.influence_cluster_components(c),
        'is_mergable' : is_mergable,
        'use_mtuples' : False,
        'learner' : self})
    self.merger = Merger(**params)
    merged_clusters = self.merger(clusters)
    merged_clusters.sort(key=lambda c: c.error, reverse=True)

    clusters.extend(merged_clusters)
    normalize_cluster_errors(clusters)
    clusters = list(set(clusters))
    self.all_clusters = clusters
    self.final_clusters = merged_clusters

    self.costs = {
        'cost_learn' : self.cost_learn
    }
    return self.final_clusters