Example #1
    def child_rules(self, rule, attrs=None):
        attrs = attrs or self.cols
        next_rules = defaultdict(list)
        # partition the candidate attributes by type so each gets its own fanout
        cont_attrs = [attr.name for attr in self.merged_table.domain
                      if attr.name in attrs and
                      attr.var_type != Orange.feature.Type.Discrete]
        dist_attrs = [attr.name for attr in self.merged_table.domain
                      if attr.name in attrs and
                      attr.var_type == Orange.feature.Type.Discrete]

        if cont_attrs:
            # continuous attributes are refined with a narrow fanout
            refiner = BeamRefiner(attrs=cont_attrs, fanout=2)
            for attr, new_rule in refiner(rule):
                next_rules[attr].append(new_rule)
        if dist_attrs:
            # discrete attributes are refined with a wider fanout
            refiner = BeamRefiner(attrs=dist_attrs, fanout=5)
            for attr, new_rule in refiner(rule):
                next_rules[attr].append(new_rule)
        return next_rules.items()
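A minimal sketch of how the (attribute, rules) pairs returned by `child_rules` might be consumed; `seed_rule` and `scorer` are assumptions for illustration, not part of the source above:

    # Hypothetical driver: expand a seed rule one attribute at a time and
    # keep the best-scoring refinement per attribute.
    # `scorer` is an assumed scoring callback, not from the example above.
    best_per_attr = {}
    for attr, new_rules in self.child_rules(seed_rule):
        best_per_attr[attr] = max(new_rules, key=scorer)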
Example #2
    def child_rules(self, rule, attrs=None):
        attrs = attrs or self.cols
        next_rules = defaultdict(list)
        # refine every candidate attribute with a uniform fanout
        refiner = BeamRefiner(attrs=attrs, fanout=2)
        for attr, new_rule in refiner(rule):
            next_rules[attr].append(new_rule)
        return next_rules.items()
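Compared with Example #1, this variant refines every candidate attribute with a uniform fanout of 2 rather than choosing the fanout by attribute type; presumably the wider fanout for discrete attributes in Example #1 is there to let multivalued attributes split into more groups per refinement step.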
Example #3
        def recurse_disc_rule(attr, rule):
            """
            Recursively partition multivalued discrete attributes if
            it's worth it
            """
            ro = RuleObj(rule,
                         self.bad_err_funcs,
                         self.good_err_funcs,
                         self.bad_tables,
                         self.good_tables)

            if not self.prune_rule(ro):
                return set([ro])

            c = rule.filter.conditions[0]
            var_type = rule.data.domain[c.position].var_type

            if var_type == Orange.feature.Type.Discrete:
                # a single-valued condition cannot be partitioned further
                if len(c.values) == 1:
                    return [ro]

                refiner = BeamRefiner(attrs=[attr], fanout=10)
                ret = set()
                for _, newrule in refiner(rule):
                    ret.update(recurse_disc_rule(attr, newrule))
                return ret
            else:
                if len(rule.data) < self.min_pts:
                    return [ro]
                # XXX: figure out this logic! the unconditional return below
                # intentionally disables the continuous refinement that follows
                return [ro]

                refiner = BeamRefiner(attrs=[attr], fanout=2)
                ret = set()
                for _, newrule in refiner(rule):
                    ret.update(recurse_disc_rule(attr, newrule))
                return ret
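This helper is the same `recurse_disc_rule` that appears inline inside `make_rules` in Example #5 below, where its calling context is shown.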
Example #4
    def get_cost(table):
        import numpy as np

        _, table = discretize(table)

        refiner = BeamRefiner()
        base_rule = SDRule(table, '1')

        costs = []
        biggest_rule, biggest_n = None, None
        for new_rule in refiner(base_rule):
            # alternative (disabled): time the equivalent SQL count query instead,
            # using the same warm-up/average pattern as below
            # f = lambda: [x for x in query(db, 'select count(*) from tmp where %s' % ' and '.join(rule_to_clauses(new_rule)))][0]
            f = lambda: table.filter_ref(new_rule.filter)  # new_rule.filter(table)
            n = len(f())
            # warm up, then record (row count, avg seconds per call)
            t = timeit.Timer(f)
            t.timeit(10)
            costs.append((n, t.timeit(100) / 100.))
            if not biggest_n or n > biggest_n:
                biggest_rule, biggest_n = new_rule, n
            print costs[-1]
        # exit()  # debugging short-circuit; enabling it skips the second pass below

        for new_rule in refiner(biggest_rule):
            f = lambda: table.filter_ref(new_rule.filter)  # new_rule.filter(table)
            n = len(f())
            t = timeit.Timer(f)
            t.timeit(10)
            costs.append((n, t.timeit(100) / 100.))
            print costs[-1]

        return costs, np.mean(costs), np.std(costs)
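The timing idiom above (a short warm-up call to `t.timeit(10)`, then `t.timeit(100) / 100.` for an average per-call cost) can be seen in isolation below; the squaring workload is just a stand-in:

    import timeit

    def avg_call_time(f, warmup=10, runs=100):
        # warm up caches, then report mean seconds per call
        t = timeit.Timer(f)
        t.timeit(warmup)
        return t.timeit(runs) / float(runs)

    print avg_call_time(lambda: [x * x for x in range(1000)])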
Example #5
    def make_rules(self, old_rules):
        """
        Merge rules in old_rules to compute the next round of rules,
        or create 1D partitions of each attribute
        """
        rules = defaultdict(set)

        def recurse_disc_rule(attr, rule):
            """
            Recursively partition multivalued discrete attributes if
            it's worth it
            """
            ro = RuleObj(rule,
                         self.bad_err_funcs,
                         self.good_err_funcs,
                         self.bad_tables,
                         self.good_tables)

            if not self.prune_rule(ro):
                return set([ro])

            c = rule.filter.conditions[0]
            var_type = rule.data.domain[c.position].var_type

            if var_type == Orange.feature.Type.Discrete:
                # a single-valued condition cannot be partitioned further
                if len(c.values) == 1:
                    return [ro]

                refiner = BeamRefiner(attrs=[attr], fanout=10)
                ret = set()
                for _, newrule in refiner(rule):
                    ret.update(recurse_disc_rule(attr, newrule))
                return ret
            else:
                if len(rule.data) < self.min_pts:
                    return [ro]
                # XXX: figure out this logic! the unconditional return below
                # intentionally disables the continuous refinement that follows
                return [ro]

                refiner = BeamRefiner(attrs=[attr], fanout=2)
                ret = set()
                for _, newrule in refiner(rule):
                    ret.update(recurse_disc_rule(attr, newrule))
                return ret

        if old_rules is None:
            # first round: create 1D partitions of each attribute
            base_rule = SDRule(self.full_table, None)
            refiner = BeamRefiner(attrs=self.cols, fanout=10)
            # refiner = BeamRefiner(attrs=['recipient_nm'], fanout=30)

            for attr, rule in refiner(base_rule):
                ros = recurse_disc_rule(attr, rule)
                # self.top_k({None: ros})
                ros = filter(self.prune_rule, ros)
                rules[(attr,)].update(ros)
        else:
            # later rounds: merge rules across pairs of attribute combinations
            attrs = old_rules.keys()
            for a_idx, attr1 in enumerate(attrs):
                for attr2 in attrs[a_idx + 1:]:
                    # skip pairs where one attribute set subsumes the other
                    merged_attrs = set(attr1).union(attr2)
                    max_attrs_len = max(len(attr1), len(attr2))
                    if len(merged_attrs) == max_attrs_len:
                        continue

                    a1rules, a2rules = old_rules[attr1], old_rules[attr2]

                    for ro in self.merge_dims(a1rules, a2rules):
                        key = ro.rule.attributes

                        # self.top_k({None: (ro,)})
                        if self.prune_rule(ro):
                            rules[key].add(ro)

        return rules
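The subsumption check in the merge loop is easier to see with concrete keys: a pair is skipped whenever the union of the two attribute tuples is no bigger than the larger of the two, i.e. one side already contains the other. A standalone restatement of that condition:

    def should_merge(attr1, attr2):
        # merge only when neither attribute set subsumes the other
        merged = set(attr1).union(attr2)
        return len(merged) != max(len(attr1), len(attr2))

    print should_merge(('a',), ('a', 'b'))  # False: ('a',) is subsumed
    print should_merge(('a',), ('b',))      # True: a genuinely new combination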
Example #6
    def quadtree_score(self, prev_rule):
        # stop descending once the rule exceeds the maximum partitioning depth
        if prev_rule.complexity > self.max_levels:
            raise RuntimeError("max_levels exceeded")

        table = prev_rule.examples
        
        samples = self.get_samples(table)

        # evaluate current partition using sample
        cur_stats = self.evaluate(samples)
        should_stop = self.should_stop(samples, cur_stats) or len(samples) == len(table)
        # log the ~99% confidence band (z = 2.58) and the observed value range
        _logger.info("Stats: %s\tpop(%d)\tsamp(%d)\t%f-%f\t%f-%f",
                     should_stop,
                     len(table),
                     len(samples),
                     cur_stats.est - 2.58 * cur_stats.std,
                     cur_stats.est + 2.58 * cur_stats.std,
                     min(cur_stats.vals),
                     max(cur_stats.vals))


        if should_stop:
            for row in table:
                if row[self.SCORE_ID].value == -inf:
                    row[self.SCORE_ID] = cur_stats.est

            prev_rule.quality = prev_rule.score = cur_stats.est
            self.rules.add(prev_rule)
            print prev_rule.quality, '\t', len(table), '\t', prev_rule
            return

        # apply rules to sample table
        splits = defaultdict(lambda: (list(), list()))
        for refname, refiner in self.refiners:
            for attr, new_rule in refiner(prev_rule):
                key = (refname, attr)
                partition = new_rule.filter_table(samples)
                stats = self.evaluate(partition) if len(partition) else None

                if stats:
                    stats.__dict__['complexity'] = new_rule.complexity - prev_rule.complexity
                    
                splits[key][0].append(new_rule)
                splits[key][1].append(stats)
                

        # if 'recipient_nm = RATHBUN JESSICA, GMMB INC., AMLIN JEFFREY, SHUMAKER PDT (2+ ..)' in str(prev_rule):
        #     scores = [(k[1].name, self.evaluate_split(s)) for (k,(r,s)) in splits.iteritems()]
        #     scores.sort(key=lambda p: p[1], reverse=True)
        #     complexities = [(k[1].name, np.mean([s.complexity for s in sl])) for (k,(r,sl)) in splits.iteritems()]
        #     pdb.set_trace()
            
        splits = sorted(splits.items(),
                        key=lambda (key, (rules, statslist)): self.evaluate_split(statslist),
                        reverse=True)


        # pick the highest-scoring split that yields at least two non-empty partitions
        best_split = None
        for (refname, attr), (rules, stats_list) in splits:
            counts = map(lambda r: len(r.examples), rules)
            if len(filter(lambda n: n, counts)) <= 1:
                continue

            best_split = ((refname, attr), (rules, stats_list))
            break

        if not best_split:
            # no split produced multiple non-empty partitions; pick one at random
            best_split = random.choice(splits)
            #print '\n'.join(map(str,best_split[1][0]))
            #print
            #pdb.set_trace()
            # _logger.info("couldn't find a split.  giving up")
            # self.evaluate(table)
            # print '%d\t%.5f\t%.5f\t' % (len(table), cur_stats.est, self.errprob[-1]), prev_rule
            # return


        
        (refname, attr), (rules, stats_list) = best_split
        _logger.info("Splitting on %s", attr.name)
        for next_rule, stats in zip(rules, stats_list):
            partition = next_rule.examples
            if not len(partition):
                continue

            if len(self.stats) > 0:
                ratio = (self.stats[-1].std / cur_stats.std)
                newerrprob = self.errprob[-1] * max(ratio , 1)
                #newerrprob = self.errprob[-1] * len(table) / (len(partition) * 1.3)
            else:
                newerrprob = self.errprob[-1]


            self.stats.append(cur_stats)
            self.errprob.append(min(1.0, newerrprob))
            self.quadtree_score(next_rule)
            self.stats.pop()
            self.errprob.pop()


        # sanity check: sum(partitions) == table
        part_sizes = map(len, [r.examples for r in rules])
        total_part_size = sum(part_sizes)
        msg = "Partition sizes wrong: %d!=%d\t%s\t%s"
        msg = msg % (total_part_size,
                     len(table),
                     str(part_sizes),
                     attr.name)
        assert total_part_size == len(table), msg
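The `est +/- 2.58 * std` band logged at the top of `quadtree_score` corresponds to a two-sided ~99% normal confidence interval (z is about 2.576 at the 0.995 quantile). A minimal restatement, assuming `est` is an estimate and `std` its standard error:

    def ci99(est, std):
        # ~99% normal confidence interval around an estimate
        z = 2.58
        return est - z * std, est + z * std

    print ci99(10.0, 0.5)  # (8.71, 11.29)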
Example #7
    def get_best_split(self, prev_rule, cur_score, bad_tables, good_tables):
        def compute_stats(rule, evaluate, tables):
            # evaluate the rule over the filtered tables, if anything matched
            stats = None
            samples = map(rule.filter_table, tables)
            if sum(map(len, samples)):
                stats = evaluate(samples, sample=False)
            return stats

        N = sum(map(len, bad_tables))

        # apply rules to sample table
        splits = defaultdict(lambda: (list(), list()))

        for refname, refiner in self.refiners:

            # sanity check that sum of table filtered by attr rules == original table
            attr_counts = defaultdict(list)

            for attr, rule in refiner(prev_rule):
                key = (refname, attr)
                
                bad_stats = compute_stats(rule, self.bad_evaluator.evaluate, bad_tables)
                good_stats = compute_stats(rule, self.good_evaluator.evaluate, good_tables)

                if bad_stats:
                    bad_stats.__dict__['complexity'] = rule.complexity
                    attr_counts[attr].append(bad_stats)

                splits[key][0].append(rule)
                splits[key][1].append((bad_stats, good_stats))


            # for attr, sl in attr_counts.iteritems():
            #     NN = sum(len(s.vals) for s in sl)
            #     assert NN == N, "attr partitions do not cover table %s\t%d\t%d" % (attr.name, NN, N)

        # if any discrete attribute strictly simplifies the rule,
        # use it directly
        nexts = []
        for (_, attr), (rules, stats_list) in splits.items():
            if (len(filter(lambda x: x, zip(*stats_list)[0])) == 1 and
                attr.var_type == Orange.feature.Type.Discrete and
                rules[0].complexity == prev_rule.complexity):
                print "shortcutting", attr, prev_rule
                nexts.append((rules, attr, stats_list))
        if nexts:
            return nexts


        key_func = lambda split: self.evaluate_split(split, self.score_mode)
        splits = sorted(splits.items(), key=key_func, reverse=True)
        evals = map(key_func, splits)
        # transpose to separate the rules from their (bad, good) stats pairs
        rules, stats_list = zip(*zip(*splits)[1])
        all_bad_stats = [zip(*sl)[0] for sl in stats_list]
        all_good_stats = [zip(*sl)[1] for sl in stats_list]

        print len(prev_rule.examples), prev_rule
        print 'split\t', [xx.name for xx in zip(*zip(*splits)[0])[1]]
        print '     \t', [np.std([bs.est for bs in bss if bs]) for bss in all_bad_stats]
        print '     \t', evals
        print '     \t', [self.get_igr(bss) for bss in all_bad_stats]
        print '     \t', [self.get_igr(bss) for bss in all_good_stats]



        if max(evals) <= 0:
            return []


        prev_score = None
        nexts = []
        
        for ((refname, attr), (rules, stats_list)), score in zip(splits, evals):
            counts = map(lambda r: len(r.examples), rules)

            if len(filter(lambda n: n, counts)) < 1:
                continue
            if not len(zip(rules, stats_list)):
                continue
            if prev_score is not None and score != prev_score:
                break
            nexts.append(( rules, attr, stats_list ))
            prev_score = score

        return nexts
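The nested `zip(*...)` calls above are Python 2 transpositions: `zip(*splits)[1]` pulls out every `(rules, stats_list)` value from the sorted items, and `zip(*sl)` splits the (bad, good) stats pairs apart. A toy illustration of the idiom:

    pairs = [('k1', (['r1'], ['s1'])), ('k2', (['r2'], ['s2']))]
    keys, values = zip(*pairs)   # transpose items into keys and values
    rules, stats = zip(*values)  # transpose again to separate the two lists
    print keys   # ('k1', 'k2')
    print rules  # (['r1'], ['r2'])
    print stats  # (['s1'], ['s2'])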