Example #1
    def discrete_vals_iter(self, table):
        """
        Iterate over every combination of the table's discrete
        attribute values.

        @param table: the table to partition
        @yield (partition_keys, continuous column names, vertical subset
                of the table); partition_keys maps each discrete
                attribute's name to the list of value indexes that will
                be attached to the resulting clusters
        """
        # collect (name, values) for each discrete attribute in the domain
        discrete_vals = []
        for attr in table.domain:
            if attr.var_type == Orange.feature.Type.Discrete:
                discrete_vals.append((attr.name, attr.values))

        # cardinalities of the discrete attributes; nothing to iterate
        # over if the table has none
        cards = [len(vals) for name, vals in discrete_vals]
        if not cards:
            return

        
        # enumerate every assignment of one value per discrete attribute
        for bits in bitarray_iterator(cards):
            partition = self.get_partition(bits, discrete_vals, table)
            if not len(partition):
                continue

            
            # drop the discrete columns so downstream clustering only
            # sees the continuous ones
            rmcols = [attr.name for attr in partition.domain
                      if attr.var_type == Orange.feature.Type.Discrete and attr.name in self.cols]
            cols = [col for col in self.cols if col not in rmcols]
            continuous_table = rm_attr_from_domain(partition, rmcols)

            # map each discrete attribute name to the value index this
            # partition fixes
            partition_keys = {}
            for bit, (name, vals) in zip(bits, discrete_vals):
                partition_keys[name] = [bit]
            
            yield partition_keys, cols, continuous_table
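
bitarray_iterator is not defined anywhere in these snippets. A minimal sketch consistent with how it is called above (yield one tuple of value indexes per combination of discrete attribute values) could be:

from itertools import product

def bitarray_iterator(cards):
    # Hypothetical stand-in: enumerate every assignment of one value
    # index per discrete attribute, e.g. cards=[2, 3] yields
    # (0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2).
    for bits in product(*[range(card) for card in cards]):
        yield bits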
Example #2
    def __call__(self, table, **kwargs):
        """
        Partition the table by its discrete attributes and run the
        bottom-up clustering on each partition.
        """
        
        # set up the error function to run on the table with the
        # discrete columns removed
        rmcols = [attr.name for attr in table.domain
                  if attr.var_type == Orange.feature.Type.Discrete and attr.name in self.cols]
        thin_table = rm_attr_from_domain(table, rmcols)
        self.params['err_func'].setup(thin_table)
        bottomup_func = DiscreteBottomUpF(self.params)
        
        if self.parallelize:
            # parallel: charge the pool's entire wall-clock time to
            # merge_cost rather than breaking it down per phase
            start = time.time()
            pool = Pool(self.nprocesses)
            results = pool.map(bottomup_func, self.discrete_vals_iter(table))
            pool.close()
            self.merge_cost += time.time() - start
        else:
            results = []
            for args in self.discrete_vals_iter(table):
                results.append(bottomup_func(args))

            # serial: accumulate the per-partition cost breakdown
            if results:
                all_stats, clusters_list = zip(*results)
                kd_cost, sample_cost, initclusters_cost, merge_cost = zip(*all_stats)
                self.kd_cost += sum(kd_cost)
                self.sample_cost += sum(sample_cost)
                self.initclusters_cost += sum(initclusters_cost)
                self.merge_cost += sum(merge_cost)

        
        # no discrete partitions: fall back to plain bottom-up clustering
        if not results:
            return super(DiscreteBottomUp, self).__call__(table)

        all_stats, clusters_list = zip(*results)
        self.all_clusters = []
        for clusters in clusters_list:
            self.all_clusters.extend(clusters)

        BottomUp.setup(self, table)

        # merge only the clusters whose error meets the global threshold
        thresh = compute_clusters_threshold(self.all_clusters)
        final_clusters = self.normalize_results(self.all_clusters,
                                                is_mergable=lambda c: c.error >= thresh)

        self.final_clusters = final_clusters
        return final_clusters
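
DiscreteBottomUpF is handed to Pool.map, which can only ship picklable callables to worker processes; a lambda or closure over self.params would not pickle. A self-contained sketch of that pattern, with a hypothetical SquareF standing in for DiscreteBottomUpF:

from multiprocessing import Pool

class SquareF(object):
    # Hypothetical stand-in: an instance of a top-level class is
    # picklable, so Pool.map can send it (and its params) to workers.
    def __init__(self, params):
        self.params = params

    def __call__(self, args):
        return args ** self.params['exp']

if __name__ == '__main__':
    pool = Pool(2)
    print pool.map(SquareF({'exp': 2}), [1, 2, 3])  # prints [1, 4, 9]
    pool.close()
    pool.join()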
Example #3
    def discrete_vals_iter(self, table):
        """
        Like the iterator in Example #1, but partition the table with
        precomputed rules instead of enumerating every combination of
        discrete values.
        """
        domain = table.domain
        for r in self.rules:
            partition = r.filter_table(table)
            if not len(partition):
                continue
            
            rmcols = [attr.name for attr in domain
                      if (attr.var_type == Orange.feature.Type.Discrete
                          and attr.name in self.cols)]
            cols = [col for col in self.cols if col not in rmcols]
            continuous_table = rm_attr_from_domain(partition, rmcols)

            # map each discrete attribute constrained by the rule to the
            # value indexes its condition allows
            partition_keys = {}
            for c in r.filter.conditions:
                if domain[c.position].var_type == Orange.feature.Type.Discrete:
                    partition_keys[domain[c.position].name] = map(int, c.values)

            yield partition_keys, cols, continuous_table
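
rm_attr_from_domain is used throughout these examples but never shown. A minimal Orange 2.x stand-in consistent with its call sites (drop the named attributes and project the table onto the rest) might be:

import Orange

def rm_attr_from_domain(table, rmcols):
    # Hypothetical reconstruction: keep every attribute whose name is
    # not in rmcols; passing False builds the new domain without a
    # class variable, matching how the projected tables are used above.
    keep = [attr for attr in table.domain if attr.name not in rmcols]
    return Orange.data.Table(Orange.data.Domain(keep, False), table)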
Example #4
    def f(bad_tables, aggerr, klass, params, kwargs, queue):
      """
      Worker entry point: run the hybrid search in a child process and
      report (rules, cost, ncalls) through the queue, or None on failure.
      """
      try:
        # keep only the columns that are valid for this aggregate error
        cols = valid_table_cols(bad_tables[0], aggerr.agg.cols, kwargs)
        all_cols = cols + aggerr.agg.cols
        torm = [attr.name for attr in bad_tables[0].domain
                if attr.name not in all_cols]

        bad_tables = [rm_attr_from_domain(t, torm) for t in bad_tables]
        good_tables = []
        _, full_table = reconcile_tables(bad_tables)

        # run the hybrid search and convert the clusters into rules
        start = time.time()
        hybrid = klass(**params)
        clusters = hybrid(full_table, bad_tables, good_tables)
        normalize_cluster_errors(clusters)
        rules = clusters_to_rules(clusters, full_table)
        cost = time.time() - start
        ncalls = 0

        queue.put((rules, cost, ncalls))
      except:
        # never let an exception escape the worker; signal failure instead
        traceback.print_exc()
        queue.put(None)
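
The function above follows a common multiprocessing contract: the worker puts its result tuple on a queue, and puts None if anything fails, so the parent's blocking queue.get never hangs on a crashed worker. A self-contained sketch of the same pattern:

from multiprocessing import Process, Queue
import traceback

def worker(x, queue):
    try:
        queue.put(x * x)          # result on success
    except:
        traceback.print_exc()
        queue.put(None)           # sentinel on failure

if __name__ == '__main__':
    queue = Queue()
    p = Process(target=worker, args=(3, queue))
    p.start()
    print queue.get()             # 9, or None if the worker failed
    p.join()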
Example #5
def serial_hybrid(obj, aggerr, **kwargs):
    costs = {}
    db = connect(obj.dbname)
    obj.db = db

    # load the provenance tables for the bad and the good result keys
    start = time.time()
    all_keys = list(chain(aggerr.keys, obj.goodkeys[aggerr.agg.shortname]))
    all_tables = get_provenance_split(obj, aggerr.agg.cols, all_keys)
    bad_tables = all_tables[:len(aggerr.keys)]
    good_tables = all_tables[len(aggerr.keys):]
    costs['data_load'] = time.time() - start

    _logger.debug("bad table counts:  %s" % ', '.join(map(str, map(len, bad_tables))))
    _logger.debug("good table counts: %s" % ', '.join(map(str, map(len, good_tables))))
    print "agg error %s \t %s" % (aggerr.agg, aggerr.errtype)

    
    cost, ncalls = 0, 0
    rules = []
    try:
        full_start = time.time()

        # keep only the columns that are valid for this aggregate error
        start = time.time()
        cols = valid_table_cols(bad_tables[0], aggerr.agg.cols, kwargs)
        all_cols = cols + aggerr.agg.cols
        torm = [attr.name for attr in bad_tables[0].domain if attr.name not in all_cols]
        _logger.debug("valid cols: %s" % cols)

        bad_tables = [rm_attr_from_domain(t, torm) for t in bad_tables]
        good_tables = [rm_attr_from_domain(t, torm) for t in good_tables]
        all_full_table = union_tables(bad_tables, good_tables)
        full_table = union_tables(bad_tables)

        costs['data_setup'] = time.time() - start


        # make sure aggerr keys and tables are consistent one last time
        if len(bad_tables) != len(aggerr.keys):
          raise RuntimeError("#badtables (%d) != #aggerr keys (%d)" % (len(bad_tables), len(aggerr.keys)))


        params = {
          'aggerr': aggerr,
          'cols': cols,
          'c': obj.c,
          'c_range': [0.05, 1],
          'l': 0.6,
          'msethreshold': 0.01,
          'max_wait': 5,
          'DEBUG': False
        }
        # other tunables: k=10, nprocesses=4, parallelize=True,
        # complexity_multiplier=1.5

        # caller-supplied kwargs override the defaults above
        params.update(dict(kwargs))

        if aggerr.agg.func.__class__ in (errfunc.SumErrFunc, errfunc.CountErrFunc):
          # SUM and COUNT aggregates use the MR algorithm
          klass = MR
          params.update({
            'use_cache': False,
            'use_mtuples': False,
            'granularity': 100
          })
          params['c'] = params.get('c', .15)
        else:
          # all other aggregates use the BDT algorithm
          klass = BDT
          params.update({
            'use_cache': True,
            'use_mtuples': False,
            'epsilon': 0.0015,
            'min_improvement': 0.01,
            'tau': [0.08, 0.5],
            'p': 0.7
          })
          params['c'] = params.get('c', .3)

        _logger.debug("c is set to: %.4f", params['c'])

        # run the hybrid search and convert the clusters into rules
        start = time.time()
        hybrid = klass(**params)
        clusters = hybrid(all_full_table, bad_tables, good_tables)
        rules = clusters_to_rules(clusters, full_table)
        print "nclusters: %d" % len(clusters)
        costs['rules_get'] = time.time() - start

        _logger.debug("clustering %d rules" % len(rules))
        for r in rules[:10]:
          _logger.debug("%.4f\t%.4f - %.4f\t%s" % (r.quality, r.c_range[0], r.c_range[1], str(r)))

        # group similar rules together; reset the timer so this phase
        # is not charged with the search time above
        start = time.time()
        rules = hybrid.group_rules(rules, 5)
        costs['rules_cluster'] = time.time() - start

        ncalls = 0
    except:
        traceback.print_exc()

    
    # return the best rules first in the list
    start = time.time()
    rules.sort(key=lambda r: r.c_range[0])
    rules = [r.simplify(all_full_table) for r in rules[:10]]
    costs['rules_simplify'] = time.time() - start

    cost = time.time() - full_start


    print "found rules"
    for rule in rules[:5]:
      print "%.5f\t%s" % (rule.quality, rule)

    print "=== Costs ==="
    for key, cost in costs.iteritems():
      print "%.5f\t%s" % (cost, key)
    
    return cost, ncalls, table, rules
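
The repeated start = time.time() / costs[key] = time.time() - start bookkeeping in serial_hybrid can be factored into a small helper. A sketch of one way to do that (timed is a hypothetical name, not part of this codebase):

import time
from contextlib import contextmanager

@contextmanager
def timed(costs, key):
    # Record the wall-clock time of the enclosed block under key.
    start = time.time()
    yield
    costs[key] = time.time() - start

# usage:
#   with timed(costs, 'rules_get'):
#       clusters = hybrid(all_full_table, bad_tables, good_tables)
#       rules = clusters_to_rules(clusters, full_table)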