Example #1
0
def get_parameters(datasetidx, **kwargs):
    """Load one test dataset and prepare the tables used downstream.

    Args:
      datasetidx: index into the module-level `datasetnames` list.
      **kwargs: only 'cols' is read here -- an optional explicit list of
        column names forwarded to strip_columns().

    Returns:
      (full_table, bad_tables, good_tables, truth, aggerr, cols) where
      `truth` is the set produced by the dataset's get_ground_truth()
      applied to `full_table`.
    """
    name = datasetnames[datasetidx]
    outname = name  # NOTE(review): unused in this function -- possibly vestigial

    test_data = get_test_data(name)
    dbname, sql, badresults, goodresults, errtype, get_ground_truth, tablename = test_data
    obj, table = create_sharedobj(*test_data[:-2])
    aggerr = obj.errors[0]

    # retrieve table for each good and bad key
    obj.db = connect(dbname)
    bad_tables = get_provenance_split(obj, aggerr.agg.cols, aggerr.keys) or []
    good_tables = get_provenance_split(obj, aggerr.agg.cols, obj.goodkeys[aggerr.agg.shortname]) or []

    # merge the per-key tables into a single reconciled full table
    (bad_tables, good_tables), full_table = reconcile_tables(bad_tables, good_tables)
    #_, full_table = reconcile_tables(bad_tables)

    # strip unnecessary columns
    user_cols = kwargs.get('cols', None)
    table, cols = strip_columns(table, aggerr, cols=user_cols)
    bad_tables = [strip_columns(t, aggerr, cols=user_cols)[0] for t in bad_tables]
    good_tables = [strip_columns(t, aggerr, cols=user_cols)[0] for t in good_tables]
    # NOTE(review): the stripped `table` from strip_columns above is discarded
    # here -- only `cols` from that call survives; confirm this is intentional.
    table = full_table

    truth = set(get_ground_truth(full_table))

    return full_table, bad_tables, good_tables, truth, aggerr, cols
Example #2
0
    def f(bad_tables, aggerr, klass, params, kwargs, queue):
      """Worker: mine rules from one partition of bad tables.

      Puts (rules, cost, ncalls) on `queue` on success, or None on failure,
      so the consumer can always account for this worker exactly once.
      """
      try:
        # keep only the valid columns plus the aggregate's own columns
        cols = valid_table_cols(bad_tables[0], aggerr.agg.cols, kwargs)
        all_cols = cols + aggerr.agg.cols
        torm = [attr.name for attr in bad_tables[0].domain
                if attr.name not in all_cols]

        bad_tables = [rm_attr_from_domain(t, torm) for t in bad_tables]
        good_tables = []
        _, full_table = reconcile_tables(bad_tables)

        start = time.time()
        hybrid = klass(**params)
        clusters = hybrid(full_table, bad_tables, good_tables)
        normalize_cluster_errors(clusters)
        rules = clusters_to_rules(clusters, full_table)
        cost = time.time() - start
        ncalls = 0

        queue.put((rules, cost, ncalls))
      except Exception:
        # narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt / SystemExit; interrupts now propagate
        traceback.print_exc()
        queue.put(None)
Example #3
0
def parallel_hybrid(obj, aggerr, **kwargs):
    """Partition the bad tables for `aggerr`, mine rules over each partition
    in parallel worker processes, then merge, group and rank the rules.

    Args:
      obj: shared query object; obj.dbname names the database to connect to.
      aggerr: the aggregate error being explained.
      **kwargs: 'nprocesses' (default 4), 'parallelize' (default True), plus
        parameter overrides forwarded to the partitioner class.

    Returns:
      (totalcost, totalncalls, mastertable, results)
    """

    db = connect(obj.dbname)
    obj.db = db

    def f(bad_tables, aggerr, klass, params, kwargs, queue):
      """Worker: mine rules from one partition; put (rules, cost, ncalls)
      on `queue`, or None on failure, so the parent can account for us."""
      try:
        cols = valid_table_cols(bad_tables[0], aggerr.agg.cols, kwargs)
        all_cols = cols + aggerr.agg.cols
        torm = [attr.name for attr in bad_tables[0].domain
                if attr.name not in all_cols]

        bad_tables = [rm_attr_from_domain(t, torm) for t in bad_tables]
        good_tables = []
        _, full_table = reconcile_tables(bad_tables)

        start = time.time()
        hybrid = klass(**params)
        clusters = hybrid(full_table, bad_tables, good_tables)
        normalize_cluster_errors(clusters)
        rules = clusters_to_rules(clusters, full_table)
        cost = time.time() - start
        ncalls = 0

        queue.put((rules, cost, ncalls))
      except Exception:
        # narrowed from a bare `except:` so KeyboardInterrupt still propagates
        traceback.print_exc()
        queue.put(None)

    nprocesses = kwargs.get('nprocesses', 4)
    parallelize = kwargs.get('parallelize', True)

    # NOTE: a second connect(obj.dbname) used to happen here, leaking the
    # first connection; the single connection above is reused instead.
    queue = Queue()
    npending = 0
    totalcost, totalncalls = 0., 0.

    bad_partition_tables = parallel_get_partitions(obj, aggerr, nprocesses)
    all_tables = []
    map(all_tables.extend, bad_partition_tables)
    _, mastertable = reconcile_tables(all_tables)
    cols = valid_table_cols(all_tables[0], aggerr.agg.cols, kwargs)

    # base parameters; caller kwargs override, then class-specific tuning below
    params = {
        'aggerr': aggerr,
        'cols': cols,
        'c': 0.2,
        'l': 0.5,
        'msethreshold': 0.01
        }
    params.update(dict(kwargs))

    # SUM/COUNT errors use the MR partitioner; everything else uses BDT
    if aggerr.agg.func.__class__ in (errfunc.SumErrFunc, errfunc.CountErrFunc):
      klass = MR
      params.update({
        'use_mtuples': False,
        'c': 0,
        })
    else:
      klass = BDT
      params.update({
        'use_cache': False,
        'use_mtuples': False,  # True,
        'epsilon': 0.005,
        'min_improvement': 0.01,
        'tau': [0.1, 0.5],
        'c': 0.3,
        'p': 0.7
        })

    # fan out one worker per partition (or run inline when not parallelizing)
    for bad_tables in bad_partition_tables:
        args = (bad_tables, aggerr, klass, params, kwargs, queue)
        if parallelize:
            p = Process(target=f, args=args)
            p.start()
        else:
            f(*args)
        npending += 1

    # drain one result (or error sentinel) per launched worker
    results = []
    start = time.time()
    while npending > 0:
        try:
            result = queue.get(timeout=1)
            npending -= 1
            if result:
                rules, cost, ncalls = result
                totalncalls += ncalls
                results.extend(rules)
            else:
                print "got error"
        except Exception:
            # queue.get timed out (Empty); keep polling until all workers report
            pass

    # merge/group the per-partition rules using a single partitioner instance
    results.sort(key=lambda r: r.quality, reverse=True)
    hybrid = klass(**params)
    hybrid.setup_tables(mastertable, all_tables, [])
    results = hybrid.group_rules(results)

    totalcost = time.time() - start
    db.close()

    # (unreachable post-return merge/rank code removed)
    return totalcost, totalncalls, mastertable, results