def child_rules(self, rule, attrs=None):
    """Refine `rule` one step along each allowed attribute.

    Continuous attributes are refined with a narrow fanout (2) and
    discrete attributes with a wider one (5).  Returns the refinements
    grouped per attribute as a list of (attr, [new_rule, ...]) pairs.
    Defaults to self.cols when no attribute subset is given.
    """
    if not attrs:
        attrs = self.cols

    grouped = defaultdict(list)
    discrete = Orange.feature.Type.Discrete

    # Partition the requested attribute names by variable type.
    continuous_names = []
    discrete_names = []
    for domain_attr in self.merged_table.domain:
        if domain_attr.name not in attrs:
            continue
        if domain_attr.var_type == discrete:
            discrete_names.append(domain_attr.name)
        else:
            continuous_names.append(domain_attr.name)

    # Continuous attrs use fanout=2, discrete attrs fanout=5.
    for names, fanout in ((continuous_names, 2), (discrete_names, 5)):
        if not names:
            continue
        for attr, child in BeamRefiner(attrs=names, fanout=fanout)(rule):
            grouped[attr].append(child)

    return grouped.items()
def child_rules(self, rule, attrs=None):
    """Refine `rule` once along each attribute (fanout=2).

    Returns the refinements grouped by the attribute refined on, as a
    list of (attr, [new_rule, ...]) pairs.  Falls back to self.cols
    when `attrs` is empty or None.
    """
    refine = BeamRefiner(attrs=attrs or self.cols, fanout=2)
    by_attr = defaultdict(list)
    for attr, child in refine(rule):
        by_attr[attr].append(child)
    return by_attr.items()
def recurse_disc_rule(attr, rule):
    """
    Recursively partition multivalued discrete attributes if its worth it
    """
    # NOTE(review): free-standing copy of the helper nested inside
    # make_rules; `self` is resolved from an enclosing scope, so this
    # is a closure, not a method -- verify it is defined inside one.
    ro = RuleObj(rule,
                 self.bad_err_funcs,
                 self.good_err_funcs,
                 self.bad_tables,
                 self.good_tables)

    # If the rule is pruned, stop partitioning and return it as-is.
    if not self.prune_rule(ro):
        return set([ro])

    c = rule.filter.conditions[0]
    var_type = rule.data.domain[c.position].var_type

    if (var_type == Orange.feature.Type.Discrete):
        # A single-valued discrete condition cannot be split further.
        if len(c.values) == 1:
            return [ro]

        # Split the multivalued condition and recurse on each piece.
        refiner = BeamRefiner(attrs=[attr], fanout=10)
        ret = set()
        for _, newrule in refiner(rule):
            ret.update(recurse_disc_rule(attr, newrule))
        return ret
    else:
        # Continuous attribute: stop when the partition is too small.
        if len(rule.data) < self.min_pts:
            return [ro]

        return [ro]  # XXX: figure out this logic!

        # NOTE(review): everything below is unreachable dead code kept
        # by the author (see XXX above); `newro` is also never used.
        refiner = BeamRefiner(attrs=[attr], fanout=2)
        ret = set()
        for _, newrule in refiner(rule):
            newro = RuleObj(newrule,
                            self.bad_err_funcs,
                            self.good_err_funcs,
                            self.bad_tables,
                            self.good_tables)
            ret.update(recurse_disc_rule(attr, newrule))
def get_cost(table):
    """Benchmark the cost of filtering `table` with one-step refinements.

    Times table.filter_ref for every refinement of the trivial rule,
    tracks the refinement matching the most rows, and returns
    (costs, mean, std) where each cost is a (row_count, seconds) pair.

    NOTE(review): this is benchmarking scratch code -- an exit() call
    below terminates the process partway through, so the second loop
    and the return statement never execute.
    """
    _, table = discretize(table)
    refiner = BeamRefiner()
    base_rule = SDRule(table, '1')
    costs = []
    biggest_rule, biggest_n = None, None
    for new_rule in refiner(base_rule):
        # Earlier variant timed the equivalent SQL count(*) query
        # against a database instead of the in-memory filter:
        # f = lambda: [x for x in query(db,'select count(*) from tmp where %s' % ' and '.join( rule_to_clauses(new_rule) ))][0]
        # n = f()
        # t = timeit.Timer(f)
        # t.timeit(10)
        # costs.append( (n, t.timeit(100) / 100.) )
        # print costs[-1]
        # continue

        f = lambda: table.filter_ref(new_rule.filter)#new_rule.filter(table)
        n = len(f())
        t = timeit.Timer(f)
        t.timeit(10)  # warm-up pass before the measured run
        costs.append( (n, t.timeit(100) / 100.) )
        # Remember the refinement that matched the most rows.
        if not biggest_n or n > biggest_n:
            biggest_rule, biggest_n = new_rule, n
        print costs[-1]

    # NOTE(review): hard stop for debugging -- everything below is dead.
    exit()

    # Time the refinements of the largest partition found above.
    for new_rule in refiner(biggest_rule):
        f = lambda: table.filter_ref(new_rule.filter)#new_rule.filter(table)
        n = len(f())
        t = timeit.Timer(f)
        t.timeit(10)
        costs.append( (n, t.timeit(100) / 100.) )
        print costs[-1]

    import numpy as np
    return costs, np.mean(costs), np.std(costs)
def make_rules(self, old_rules):
    """
    Merge rules in old_rules to compute next round of rules or
    create 1D partitions of each attribute

    Returns a dict mapping attribute-tuples to sets of RuleObj.
    When old_rules is None this builds the initial one-dimensional
    partitions; otherwise it merges pairs of existing rule groups.
    """
    rules = defaultdict(set)

    def recurse_disc_rule(attr, rule):
        """
        Recursively partition multivalued discrete attributes if its worth it
        """
        ro = RuleObj(rule,
                     self.bad_err_funcs,
                     self.good_err_funcs,
                     self.bad_tables,
                     self.good_tables)

        # Pruned rules are returned unsplit.
        if not self.prune_rule(ro):
            return set([ro])

        c = rule.filter.conditions[0]
        var_type = rule.data.domain[c.position].var_type

        if (var_type == Orange.feature.Type.Discrete):
            # Single-valued discrete conditions cannot be split further.
            if len(c.values) == 1:
                return [ro]

            refiner = BeamRefiner(attrs=[attr], fanout=10)
            ret = set()
            for _, newrule in refiner(rule):
                ret.update(recurse_disc_rule(attr, newrule))
            return ret
        else:
            # Continuous attribute: stop when the partition is too small.
            if len(rule.data) < self.min_pts:
                return [ro]

            return [ro]  # XXX: figure out this logic!

            # NOTE(review): unreachable dead code below (see XXX);
            # `newro` is never used and the loop result is discarded
            # because the function ends without returning `ret`.
            refiner = BeamRefiner(attrs=[attr], fanout=2)
            ret = set()
            for _, newrule in refiner(rule):
                newro = RuleObj(newrule,
                                self.bad_err_funcs,
                                self.good_err_funcs,
                                self.bad_tables,
                                self.good_tables)
                ret.update(recurse_disc_rule(attr, newrule))

    if old_rules is None:
        # First round: partition the full table once per attribute.
        base_rule = SDRule(self.full_table, None)
        refiner = BeamRefiner(attrs=self.cols, fanout=10)
        #refiner = BeamRefiner(attrs=['recipient_nm'], fanout=30)
        for attr, rule in refiner(base_rule):
            ros = recurse_disc_rule(attr, rule)
            #self.top_k({None:ros})
            ros = filter(self.prune_rule, ros)
            rules[(attr,)].update(ros)
    else:
        # Later rounds: merge every pair of attribute groups whose
        # attribute sets are not subsets of one another.
        attrs = old_rules.keys()
        for a_idx, attr1 in enumerate(attrs):
            for attr2 in attrs[a_idx+1:]:
                merged_attrs = set(attr1).union(attr2)
                max_attrs_len = max(len(attr1), len(attr2))
                # Equal size after union means one group's attributes
                # contain the other's -- nothing new to gain, skip.
                if len(merged_attrs) == max_attrs_len:
                    continue

                a1rules, a2rules = old_rules[attr1], old_rules[attr2]
                for ro in self.merge_dims(a1rules, a2rules):
                    key = ro.rule.attributes
                    #self.top_k({None:(ro,)})
                    if self.prune_rule(ro):
                        rules[key].add(ro)

    return rules
def quadtree_score(self, prev_rule):
    """Recursively partition prev_rule's examples, scoring leaf partitions.

    Samples the current partition, stops and records the estimate when
    should_stop fires (or sampling covered the whole table), otherwise
    picks the best split across all refiners and recurses on each piece.

    NOTE(review): uses Python-2-only tuple-unpacking lambdas and print
    statements; this function cannot run under Python 3 as written.
    """
    # Guard against runaway recursion depth.
    if prev_rule.complexity > self.max_levels:
        raise
    table = prev_rule.examples
    samples = self.get_samples(table)

    # evaluate current partition using sample
    cur_stats = self.evaluate(samples)
    should_stop = self.should_stop(samples, cur_stats) or len(samples) == len(table)

    _logger.info("Stats: %s\tpop(%d)\tsamp(%d)\t%f-%f\t%f-%f",
                 should_stop,
                 len(table),
                 len(samples),
                 cur_stats.est-2.58*cur_stats.std,  # 99% confidence band
                 cur_stats.est+2.58*cur_stats.std,
                 min(cur_stats.vals),
                 max(cur_stats.vals))

    if should_stop:
        # Leaf partition: stamp the estimate on every still-unscored row.
        for row in table:
            if row[self.SCORE_ID].value == -inf:
                row[self.SCORE_ID] = cur_stats.est
        prev_rule.quality = prev_rule.score = cur_stats.est
        self.rules.add(prev_rule)
        print prev_rule.quality, '\t', len(table), '\t', prev_rule
        return

    # apply rules to sample table
    # splits maps (refiner_name, attr) -> (rules, per-rule stats), in parallel lists.
    splits = defaultdict(lambda: (list(), list()))
    for refname, refiner in self.refiners:
        for attr, new_rule in refiner(prev_rule):
            key = (refname, attr)
            partition = new_rule.filter_table(samples)
            stats = self.evaluate(partition) if len(partition) else None
            if stats:
                # Record how much this refinement complicates the rule.
                stats.__dict__['complexity'] = new_rule.complexity - prev_rule.complexity
                splits[key][0].append(new_rule)
                splits[key][1].append(stats)

    # if 'recipient_nm = RATHBUN JESSICA, GMMB INC., AMLIN JEFFREY, SHUMAKER PDT (2+ ..)' in str(prev_rule):
    #     scores = [(k[1].name, self.evaluate_split(s)) for (k,(r,s)) in splits.iteritems()]
    #     scores.sort(key=lambda p: p[1], reverse=True)
    #     complexities = [(k[1].name, np.mean([s.complexity for s in sl])) for (k,(r,sl)) in splits.iteritems()]
    #     pdb.set_trace()

    # Rank candidate splits best-first by their aggregate statistics.
    splits = sorted(splits.items(),
                    key=lambda (key, (rules, statslist)): self.evaluate_split(statslist),
                    reverse=True)

    # Take the best split that produces at least two non-empty partitions.
    best_split = None
    for (refname, attr), (rules, stats_list) in splits:
        counts = map(lambda r: len(r.examples), rules)
        if len(filter(lambda n: n, counts)) <= 1:
            continue
        best_split = ((refname, attr), (rules, stats_list))
        break

    if not best_split:
        # No split partitions the data; fall back to a random one.
        best_split = splits[random.randint(0, len(splits)-1)]
        #print '\n'.join(map(str,best_split[1][0]))
        #print
        #pdb.set_trace()
        # _logger.info("couldn't find a split. giving up")
        # self.evaluate(table)
        # print '%d\t%.5f\t%.5f\t' % (len(table), cur_stats.est, self.errprob[-1]), prev_rule
        # return

    (refname, attr), (rules, stats_list) = best_split
    _logger.info("Splitting on %s", attr.name)

    for next_rule, stats in zip(rules, stats_list):
        partition = next_rule.examples
        if not len(partition):
            continue

        # Scale the allowed error probability by how much variance
        # changed between the parent and current partitions.
        if len(self.stats) > 0:
            ratio = (self.stats[-1].std / cur_stats.std)
            newerrprob = self.errprob[-1] * max(ratio , 1)
            #newerrprob = self.errprob[-1] * len(table) / (len(partition) * 1.3)
        else:
            newerrprob = self.errprob[-1]

        # Push context, recurse into the child partition, then pop.
        self.stats.append(cur_stats)
        self.errprob.append(min(1.0, newerrprob))
        self.quadtree_score(next_rule)
        self.stats.pop()
        self.errprob.pop()

    # sanity check, sum(partitions) == table
    part_sizes = map(len, [r.examples for r in rules])
    total_part_size = sum( part_sizes )
    msg = "Partition sizes wrong: %d!=%d\t%s\t%s"
    msg = msg % (total_part_size, len(table), str(part_sizes), attr.name)
def get_best_split(self, prev_rule, cur_score, bad_tables, good_tables):
    """Pick the best attribute split(s) of prev_rule over bad/good tables.

    Evaluates every refinement per (refiner, attr), short-circuits on a
    discrete attribute that strictly simplifies the rule, otherwise ranks
    splits with evaluate_split and returns all top-scoring candidates as
    a list of (rules, attr, stats_list) tuples.  Returns [] when no split
    scores above zero.

    NOTE(review): Python-2-only code (print statements, zip returning
    lists); `cur_score` is accepted but never read in this body.
    """
    def compute_stats(rule, evaluate, tables):
        # Evaluate `rule`'s matching rows across `tables`; None when
        # the rule matches nothing anywhere.
        stats = None
        samples = map(rule.filter_table, tables)
        if sum(map(len, samples)):
            stats = evaluate(samples, sample=False)
        return stats

    N = sum(map(len, bad_tables))

    # apply rules to sample table
    # splits maps (refiner_name, attr) -> (rules, [(bad_stats, good_stats), ...]).
    splits = defaultdict(lambda: (list(), list()))
    for refname, refiner in self.refiners:
        # sanity check that sum of table filtered by attr rules == original table
        attr_counts = defaultdict(list)
        for attr, rule in refiner(prev_rule):
            key = (refname, attr)
            bad_stats = compute_stats(rule, self.bad_evaluator.evaluate, bad_tables)
            good_stats = compute_stats(rule, self.good_evaluator.evaluate, good_tables)
            if bad_stats:
                bad_stats.__dict__['complexity'] = rule.complexity
                attr_counts[attr].append(bad_stats)
            # Appended unconditionally: stats entries may contain None.
            splits[key][0].append(rule)
            splits[key][1].append((bad_stats, good_stats))

        # for attr, sl in attr_counts.iteritems():
        #     NN = sum(len(s.vals) for s in sl)
        #     assert NN == N, "f**k this shit %s\t%d\t%d" % (attr.name, NN, N)

    # if there are any discrete attributes that strictly simplify the rule,
    # use it
    nexts = []
    for (_, attr), (rules, stats_list) in splits.items():
        # Exactly one refinement has bad-table stats, the attribute is
        # discrete, and the rule got no more complex -> free win.
        if (len(filter(lambda x:x, zip(*stats_list)[0])) == 1 and
                attr.var_type == Orange.feature.Type.Discrete and
                rules[0].complexity == prev_rule.complexity):
            print "short cutting", attr, prev_rule
            nexts.append((rules, attr, stats_list))
    if nexts:
        return nexts

    # Rank all candidate splits best-first.
    key_func = lambda split: self.evaluate_split(split, self.score_mode)
    splits = sorted(splits.items(), key=key_func, reverse=True)
    evals = map(key_func, splits)

    # Transpose into parallel tuples of rules / stats per split.
    rules, stats_list = zip(*zip(*splits)[1])
    all_bad_stats = [zip(*sl)[0] for sl in stats_list]
    all_good_stats = [zip(*sl)[1] for sl in stats_list]

    # Debug dump of split candidates and their scores.
    print len(prev_rule.examples), prev_rule
    print 'split\t', [xx.name for xx in zip(*zip(*splits)[0])[1]]
    print ' \t', [np.std([bs.est for bs in bss if bs]) for bss in all_bad_stats]
    print ' \t', evals
    print ' \t', [self.get_igr(bss) for bss in all_bad_stats]
    print ' \t', [self.get_igr(bss) for bss in all_good_stats]

    if max(evals) <= 0:
        return []

    # Collect every split tied at the top score (with non-empty partitions).
    prev_score = None
    nexts = []
    for ((refname, attr), (rules, stats_list)), score in zip(splits, evals):
        counts = map(lambda r: len(r.examples), rules)
        if len(filter(lambda n: n, counts)) < 1:
            continue
        if not len(zip(rules, stats_list)):
            continue
        # Splits are sorted, so the first score change ends the tie group.
        if prev_score is not None and score != prev_score:
            break
        nexts.append(( rules, attr, stats_list ))
        prev_score = score
    return nexts