def get_parameters(datasetidx, **kwargs):
    """Load one test dataset and split its provenance into bad/good tables.

    ``datasetidx`` indexes the module-level ``datasetnames`` list.  The only
    keyword argument read here is ``cols``, which is forwarded to
    ``strip_columns`` as the column selection (``None`` when absent).

    Returns the tuple
    ``(full_table, bad_tables, good_tables, truth, aggerr, cols)`` where

    * ``full_table`` is the second value returned by ``reconcile_tables``
      (it is *not* passed through ``strip_columns``),
    * ``bad_tables`` / ``good_tables`` are the reconciled per-key tables
      after ``strip_columns``,
    * ``truth`` is ``set(get_ground_truth(full_table))``,
    * ``aggerr`` is the first entry of ``obj.errors``,
    * ``cols`` is the column list reported by ``strip_columns`` for the
      table returned by ``create_sharedobj``.
    """
    dataset = datasetnames[datasetidx]
    spec = get_test_data(dataset)
    # Only the database name and the ground-truth callback are used below;
    # the full unpack is kept so a malformed spec still fails right here.
    (dbname, _sql, _badresults, _goodresults,
     _errtype, get_ground_truth, _tablename) = spec
    obj, base_table = create_sharedobj(*spec[:-2])
    aggerr = obj.errors[0]

    # Fetch the provenance table for every bad key, then every good key.
    obj.db = connect(dbname)
    bad_keys = aggerr.keys
    bad_tables = get_provenance_split(obj, aggerr.agg.cols, bad_keys) or []
    good_keys = obj.goodkeys[aggerr.agg.shortname]
    good_tables = get_provenance_split(obj, aggerr.agg.cols, good_keys) or []

    reconciled, full_table = reconcile_tables(bad_tables, good_tables)
    bad_tables, good_tables = reconciled

    # Drop the columns that are not needed.
    user_cols = kwargs.get('cols')

    def _stripped(tables):
        return [strip_columns(t, aggerr, cols=user_cols)[0] for t in tables]

    _, cols = strip_columns(base_table, aggerr, cols=user_cols)
    bad_tables = _stripped(bad_tables)
    good_tables = _stripped(good_tables)

    truth = set(get_ground_truth(full_table))

    return full_table, bad_tables, good_tables, truth, aggerr, cols
def f(bad_tables, aggerr, klass, params, kwargs, queue):
    """Worker: build rules for one group of bad tables and report via ``queue``.

    On success a ``(rules, cost, ncalls)`` tuple is put on ``queue``, where
    ``cost`` is the elapsed wall-clock time of the partitioner run in seconds
    and ``ncalls`` is always ``0``.  On any failure the traceback is printed
    and ``None`` is put on ``queue`` instead.
    """
    try:
        # Keep only the valid columns plus the aggregate's own columns.
        wanted = valid_table_cols(bad_tables[0], aggerr.agg.cols, kwargs)
        wanted = wanted + aggerr.agg.cols
        unwanted = []
        for attr in bad_tables[0].domain:
            if attr.name not in wanted:
                unwanted.append(attr.name)
        pruned_tables = [rm_attr_from_domain(tbl, unwanted) for tbl in bad_tables]
        no_good_tables = []
        _, merged_table = reconcile_tables(pruned_tables)

        began = time.time()
        partitioner = klass(**params)
        clusters = partitioner(merged_table, pruned_tables, no_good_tables)
        normalize_cluster_errors(clusters)
        rules = clusters_to_rules(clusters, merged_table)
        elapsed = time.time() - began

        queue.put((rules, elapsed, 0))
    except:
        # Deliberately catches everything: whoever reads the queue expects
        # one item per call, so a failure must still be reported.
        traceback.print_exc()
        queue.put(None)
def parallel_hybrid(obj, aggerr, **kwargs):
    """Mine rules from each bad-key partition and merge them into one list.

    The bad tables are split with ``parallel_get_partitions``; one worker is
    run per partition (in a separate ``Process`` unless ``parallelize`` is
    false) and every worker reports back through a shared queue.  The
    collected rules are sorted by ``quality`` (best first) and then grouped
    by a partitioner instance set up on the reconciled master table.

    Keyword arguments read here:

    * ``nprocesses`` (default ``4``) -- passed to ``parallel_get_partitions``.
    * ``parallelize`` (default ``True``) -- run workers as processes instead
      of inline calls.

    All keyword arguments are additionally merged into the partitioner
    parameters (overriding the generic defaults, but overridden by the
    class-specific settings) and forwarded to ``valid_table_cols``.

    Returns ``(totalcost, totalncalls, mastertable, results)``:
    ``totalcost`` is the wall-clock time in seconds from the start of result
    collection through rule grouping, ``totalncalls`` is the sum of the
    ``ncalls`` values reported by the workers, ``mastertable`` is the second
    value returned by ``reconcile_tables`` over all partition tables, and
    ``results`` is the output of ``group_rules``.
    """
    # Imported locally so this block does not depend on a file-level import;
    # the timeout exception lives in ``Queue`` on Python 2, ``queue`` on 3.
    try:
        from Queue import Empty
    except ImportError:
        from queue import Empty

    obj.db = connect(obj.dbname)

    # NOTE(review): this duplicates the module-level ``f``; it is kept local
    # so the worker used here cannot be affected by other definitions of
    # that very generic name -- confirm whether the two can be merged.
    def f(bad_tables, aggerr, klass, params, kwargs, queue):
        try:
            cols = valid_table_cols(bad_tables[0], aggerr.agg.cols, kwargs)
            all_cols = cols + aggerr.agg.cols
            torm = [attr.name for attr in bad_tables[0].domain
                    if attr.name not in all_cols]

            bad_tables = [rm_attr_from_domain(t, torm) for t in bad_tables]
            good_tables = []
            _, full_table = reconcile_tables(bad_tables)

            start = time.time()
            hybrid = klass(**params)
            clusters = hybrid(full_table, bad_tables, good_tables)
            normalize_cluster_errors(clusters)
            rules = clusters_to_rules(clusters, full_table)
            cost = time.time() - start
            ncalls = 0

            queue.put((rules, cost, ncalls))
        except:
            # Catch everything on purpose: the collector below waits for
            # exactly one item per worker, so a failure must be reported too.
            traceback.print_exc()
            queue.put(None)

    nprocesses = kwargs.get('nprocesses', 4)
    parallelize = kwargs.get('parallelize', True)

    # NOTE(review): this second connection is never used below, only closed;
    # kept in case ``connect`` has side effects callers rely on -- confirm.
    db = connect(obj.dbname)
    try:
        queue = Queue()
        npending = 0
        totalncalls = 0.
        bad_partition_tables = parallel_get_partitions(obj, aggerr, nprocesses)

        # Flatten the partitions into a single list of tables.
        all_tables = []
        for partition in bad_partition_tables:
            all_tables.extend(partition)
        _, mastertable = reconcile_tables(all_tables)
        cols = valid_table_cols(all_tables[0], aggerr.agg.cols, kwargs)

        # Generic defaults < caller kwargs < class-specific settings.
        params = {
            'aggerr': aggerr,
            'cols': cols,
            'c': 0.2,
            'l': 0.5,
            'msethreshold': 0.01
        }
        params.update(dict(kwargs))

        if aggerr.agg.func.__class__ in (errfunc.SumErrFunc,
                                         errfunc.CountErrFunc):
            klass = MR
            params.update({
                'use_mtuples': False,
                'c': 0,
            })
        else:
            klass = BDT
            params.update({
                'use_cache': False,
                'use_mtuples': False,
                'epsilon': 0.005,
                'min_improvement': 0.01,
                'tau': [0.1, 0.5],
                'c': 0.3,
                'p': 0.7
            })

        workers = []
        for bad_tables in bad_partition_tables:
            args = (bad_tables, aggerr, klass, params, kwargs, queue)
            if parallelize:
                p = Process(target=f, args=args)
                p.start()
                workers.append(p)
            else:
                f(*args)
            npending += 1

        results = []
        # NOTE(review): the clock starts after dispatch, so with
        # parallelize=False the inline worker time is not part of totalcost.
        start = time.time()
        while npending > 0:
            # Poll with a timeout; only the timeout itself is ignored so that
            # real errors (and Ctrl-C) are no longer silently swallowed.
            try:
                result = queue.get(timeout=1)
            except Empty:
                continue

            npending -= 1
            if result:
                rules, cost, ncalls = result
                totalncalls += ncalls
                results.extend(rules)
            else:
                print("got error")

        results.sort(key=lambda r: r.quality, reverse=True)

        hybrid = klass(**params)
        hybrid.setup_tables(mastertable, all_tables, [])
        results = hybrid.group_rules(results)

        totalcost = time.time() - start

        # Every worker has already delivered its single result, so joining
        # here only reaps the finished processes.
        for p in workers:
            p.join()

        # NOTE: a follow-up pass (merge_tables + parallel_rank_rules) used to
        # sit after this return and was unreachable; it has been dropped.
        return totalcost, totalncalls, mastertable, results
    finally:
        db.close()