def main(argv):
    """Run the pure-RBO comparison experiment for indices [from_index, to_index).

    For each problem index and base skeleton size, three RCMLearner
    configurations are compared after a perfect phase I:
      * learner0 — CUT-based collider tests restricted to RBOs,
      * learner1 — RBO-based tests without violation detection,
      * learner2 — RBO-based tests with RBO-violation detection.
    Per-configuration `examine_oriori` results are queued and periodically
    flushed to a run-specific CSV under `working_dir`.
    """
    tester = None
    sepset_rule = 'first'
    orientation_rule = 'majority'
    is_random = 'random' in argv
    is_company = 'company' in argv
    working_dir = get_working_dir(is_company, is_random)
    from_index, to_index, n_jobs, _ = arg_parse(argv)

    if is_random:
        # Pre-generated random schemas and RCM dependency codes.
        with open(f'data/random/1000_random_schemas.json', 'r') as f:
            schemas = json.load(f)
        with open(f'data/random/1000_random_rcms.json', 'r') as f:
            rcm_codes = json.load(f)
    else:
        schemas, rcm_codes = None, None

    # Unique suffix so concurrent runs never write to the same CSV file.
    identifier = str(int(time.time() * 100))
    p1_queue = deque()  # referenced by writing_phase's phase-1 branch; never filled here
    p2_queue = deque()

    def writing_phase(_phase):
        # Flush every queued row for the given phase to its CSV file.
        # The original asserted `_phase == 2` right before the general
        # 1-or-2 assertion, making the latter dead code; the combined
        # check below is the evident intent.
        assert _phase in (1, 2)
        queue = p1_queue if _phase == 1 else p2_queue
        with open(f'{working_dir}pure_rbo_comparison_{_phase}_{from_index}_{to_index}_{identifier}.csv', 'a') as _f:
            while queue:
                vals = queue.popleft()
                print(*vals, file=_f, sep=',')

    last_wrote2 = 0
    for idx in trange(from_index, to_index, smoothing=0):
        for base_size in [200, 500]:
            if is_random:
                schema = RelationalSchema.from_dict(schemas[idx])
                max_hop, rcm_code = rcm_codes[idx]
                rdeps = sorted(list(enumerate_rdeps(schema, max_hop)))
                dependencies = {rdeps[at] for at in rcm_code}
                rcm = RCM(schema, dependencies)
            else:
                schema = company_schema()
                rcm = company_rcm()

            def initialize():
                # Build an RCI tester on data generated for this
                # (idx, base_size) pair; everything is seeded with idx + 1
                # for reproducibility.
                np.random.seed(idx + 1)
                skeleton = sized_random_skeleton(schema,
                                                 sizing_method(base_size, schema),
                                                 seed=idx + 1)
                lg_rcm = linear_gaussians_rcm(rcm, seed=idx + 1)
                generate_values_for_skeleton(lg_rcm, skeleton, seed=idx + 1)
                datasource = DataCenter(skeleton)
                kerner = RBFKernelComputer(datasource,
                                           additive=1e-2,
                                           n_jobs=n_jobs,
                                           eqsize_only=False,
                                           k_cache_max_size=128)
                _tester = RCITester(kerner, n_jobs=n_jobs)
                return _tester

            # The original guarded this call with an `initialized` flag that
            # was reset to False on every iteration, so the guard always
            # fired; a direct call is equivalent.
            tester = initialize()

            np.random.seed(idx + 1)
            common = dict(max_rv_hops=rcm.max_hop, max_degree=None,
                          verbose=False, true_rcm=rcm,
                          sepset_rule=sepset_rule,
                          orientation_rule=orientation_rule,
                          aggregator=None, minimum_rows_for_test=0)
            # learner0/learner1 share a configuration; they differ only in
            # which collider-test routine is invoked below.
            learner0 = RCMLearner(tester, detect_rbo_violations=False,
                                  detect_post_rbo_violations=False, **common)
            learner1 = RCMLearner(tester, detect_rbo_violations=False,
                                  detect_post_rbo_violations=False, **common)
            learner2 = RCMLearner(tester, detect_rbo_violations=True,
                                  detect_post_rbo_violations=False, **common)

            learner0.perfect_phase_I()
            learner0.CUT_based_collider_tests(rbo_only=True)
            learner1.perfect_phase_I()
            learner1.RBO_based_tests()
            learner2.perfect_phase_I()
            learner2.RBO_based_tests()

            # One CSV row: key columns, then the three learners' results.
            p2_values = []
            p2_values.extend((idx, base_size))
            p2_values.extend(examine_oriori(learner0, rcm))
            p2_values.extend(examine_oriori(learner1, rcm))
            p2_values.extend(examine_oriori(learner2, rcm))
            print(p2_values)
            p2_queue.append(p2_values)

            # Flush at most every 120 seconds to limit file churn.
            if last_wrote2 + 120 < time.time():
                writing_phase(2)
                last_wrote2 = time.time()

    # clean up: flush whatever is still queued.
    if p2_queue:
        writing_phase(2)
def main(argv):
    """Run the phase-I experiment, or merge previously written result CSVs.

    With '--merge' in argv: concatenate all per-run `phase_{1,2}_*.csv`
    files in `working_dir` into a single `phase_{n}.csv`, renaming the
    originals to `.bak`, then return.

    Otherwise: for each (idx, base_size, is_aggregated, order_dependent)
    combination not already in `done`, run phase I of RCMLearner and queue
    a CSV row of phase-I statistics plus counts of dependencies that were
    "saved" by aggregated CI tests, flushing periodically.
    """
    # p1_key = (idx, base_size, is_aggregated, order_dependent)
    KEY_LENGTH = {1: 4, 2: 7}
    is_aggregateds = [True, False]
    order_dependents = [True, False]
    is_random = 'random' in argv
    is_company = 'company' in argv
    working_dir = get_working_dir(is_company, is_random)
    done = retrieve_finished(KEY_LENGTH, working_dir)

    if '--merge' in argv:
        for phase in [1, 2]:
            to_be_merged = list(
                files(working_dir, prefix=f'phase_{phase}', suffix='.csv'))
            if to_be_merged:
                print(f'merging: ')
                for x in to_be_merged:
                    print(' ', x)
                df = pd.concat([
                    pd.read_csv(f'{working_dir}{fname}', header=None)
                    for fname in to_be_merged
                ])
                # Keep the originals as .bak so a failed merge is recoverable.
                for fname in to_be_merged:
                    os.rename(f'{working_dir}{fname}', f'{working_dir}{fname}.bak')
                df.to_csv(f'{working_dir}phase_{phase}.csv', header=False, index=False)
            else:
                print('nothing to merge.')
        return

    # NOTE(review): the sibling drivers in this file unpack FOUR values from
    # arg_parse; confirm arg_parse's return arity for this entry point.
    from_index, to_index, n_jobs = arg_parse(argv)

    if is_random:
        with open(f'data/random/1000_random_schemas.json', 'r') as f:
            schemas = json.load(f)
        with open(f'data/random/1000_random_rcms.json', 'r') as f:
            rcm_codes = json.load(f)
    else:
        schemas, rcm_codes = None, None

    # Unique suffix so concurrent runs never write to the same CSV file.
    identifier = str(int(time.time() * 100))
    options = list(itertools.product(is_aggregateds, order_dependents))
    p1_queue = deque()
    tester = None

    def writing_phase(_phase):
        # Flush every queued phase-1 row to this run's CSV file.
        assert 1 == _phase
        queue = p1_queue
        with open(
                f'{working_dir}phase_{_phase}_{from_index}_{to_index}_{identifier}.csv',
                'a') as _f:
            while queue:
                vals = queue.popleft()
                print(*vals, file=_f, sep=',')

    last_wrote1 = 0
    for idx in trange(from_index, to_index, smoothing=0):
        for base_size in [200, 300, 400, 500]:
            if is_random:
                schema = RelationalSchema.from_dict(schemas[idx])
                max_hop, rcm_code = rcm_codes[idx]
                rdeps = sorted(list(enumerate_rdeps(schema, max_hop)))
                dependencies = {rdeps[at] for at in rcm_code}
                rcm = RCM(schema, dependencies)
            else:
                schema = company_schema()
                rcm = company_rcm()

            def initialize():
                # Build an RCI tester on data generated for this
                # (idx, base_size) pair; seeded with idx + 1 throughout.
                np.random.seed(idx + 1)
                skeleton = sized_random_skeleton(schema,
                                                 sizing_method(base_size, schema),
                                                 seed=idx + 1)
                lg_rcm = linear_gaussians_rcm(rcm, seed=idx + 1)
                generate_values_for_skeleton(lg_rcm, skeleton, seed=idx + 1)
                datasource = DataCenter(skeleton)
                kerner = RBFKernelComputer(datasource,
                                           additive=1e-2,
                                           n_jobs=n_jobs,
                                           eqsize_only=False,
                                           k_cache_max_size=128)
                _tester = RCITester(kerner, n_jobs=n_jobs)
                return _tester

            # Lazily initialize once per (idx, base_size): only if at least
            # one option combination still needs to run.
            initialized = False
            for is_aggregated, order_dependent in options:
                p1_key = (idx, base_size, is_aggregated, order_dependent)
                if p1_key in done[1]:
                    continue
                if not initialized:
                    tester = initialize()
                    initialized = True
                # We already skipped keys present in done[1], so the original
                # `if p1_key not in done[1]` re-check was always true.
                done[1].add(p1_key)

                """ Phase I """
                np.random.seed(idx + 1)
                p1_learner = RCMLearner(
                    tester,
                    max_rv_hops=rcm.max_hop,
                    max_degree=None,
                    verbose=False,
                    true_rcm=rcm,
                    aggregator=average_aggregator if is_aggregated else None,
                    minimum_rows_for_test=0,
                    phase_I_order_independence=not order_dependent)
                p1_learner.phase_I()

                p1_values = []
                p1_values.extend(p1_key)
                p1_values.extend(phase_I_to_write(p1_learner.prcm, rcm))

                # Among (cause, effect) pairs saved by aggregated CI tests and
                # still undirected after phase I, count how many are true
                # dependencies, reversed true dependencies, or neither.
                counts = [0, 0, 0]
                for cause, effect in {(cause, effect)
                                      for cause, effect, _ in
                                      p1_learner.saved_by_aggregated_ci}:
                    dep = RelationalDependency(cause, effect)
                    rev_dep = dep.reverse()
                    if UndirectedRDep(dep) not in p1_learner.prcm.undirected_dependencies:
                        continue
                    if dep in rcm.directed_dependencies:
                        counts[0] += 1
                    elif rev_dep in rcm.directed_dependencies:
                        counts[1] += 1
                    else:
                        counts[2] += 1
                p1_values.extend(counts)

                p1_queue.append(p1_values)
                # Flush at most every 120 seconds to limit file churn.
                if last_wrote1 + 120 < time.time():
                    writing_phase(1)
                    last_wrote1 = time.time()

    # clean up: flush whatever is still queued.
    if p1_queue:
        writing_phase(1)
# NOTE(review): fragment of a random schema/RCM generator whose enclosing
# function and loop headers lie outside this view (it references `passed`,
# `total_size`, `rcms`, and ends a for/else retry loop). It appears to
# rejection-sample RCMs — skipping those with <= 2 dependencies, a
# disconnected class dependency graph, an attribute with no adjacencies, or
# no RBOs/colliders/non-colliders — then records the accepted schema and the
# indices of its dependencies, and finally dumps/reloads the JSON files.
# Nesting cannot be reconstructed reliably from here; code left untouched.
if len(rcm.directed_dependencies) <= 2: continue cdg = rcm.class_dependency_graph if not nx.is_connected(cdg.as_networkx_dag().to_undirected()): continue if any(len(cdg.adj(attr)) == 0 for attr in schema.attrs): continue rbos, colliders, non_colliders = rbos_colliders_non_colliders(rcm) if len(rbos) + len(colliders) + len(non_colliders) == 0: continue break else: continue rcm_code = [ at for at, dep in enumerate( sorted(list(enumerate_rdeps(schema, rcm.max_hop)))) if dep in rcm.directed_dependencies ] schemas.append(schema.to_dict()) rcms.append([rcm.max_hop, rcm_code]) passed += 1 print() with open(f'random/{total_size}_random_schemas.json', 'w') as f: json.dump(schemas, f, indent=4) with open(f'random/{total_size}_random_rcms.json', 'w') as f: json.dump(rcms, f, indent=4) with open(f'random/{total_size}_random_schemas.json', 'r') as f: schemas2 = json.load(f)
def main(argv):
    """Run the phase-II experiment over a grid of learner options.

    For each problem index and base size, iterate combinations of
    (is_aggregated, sepset_rule, orientation_rule, detect_rbo,
    detect_post_rbo) — filtered so detect_rbo == detect_post_rbo, and
    aggregated runs only with both detections on — run a perfect phase I
    followed by RBO tests, post-RBO unshielded-triple tests, and
    orientation, then queue a '|'-separated stats row for periodic CSV
    flushes.
    """
    tester = None
    KEY_LENGTH = {1: 4, 2: 7}
    is_aggregateds = [True, False]
    sepset_rules = ['minimal']
    orientation_rules = ['majority']
    detect_rbos = [True, False]
    detect_post_rbos = [True, False]
    is_random = 'random' in argv
    is_company = 'company' in argv
    working_dir = get_working_dir(is_company, is_random)
    done = retrieve_finished(KEY_LENGTH, working_dir)
    from_index, to_index, n_jobs, _ = arg_parse(argv)

    if is_random:
        # Pre-generated random schemas and RCM dependency codes.
        with open(f'data/random/1000_random_schemas.json', 'r') as f:
            schemas = json.load(f)
        with open(f'data/random/1000_random_rcms.json', 'r') as f:
            rcm_codes = json.load(f)
    else:
        schemas, rcm_codes = None, None

    # Unique suffix so concurrent runs never write to the same CSV file.
    identifier = str(int(time.time() * 100))
    options = list(itertools.product(is_aggregateds, sepset_rules,
                                     orientation_rules, detect_rbos,
                                     detect_post_rbos))
    p1_queue = deque()  # never filled here; kept for writing_phase / cleanup
    p2_queue = deque()

    def writing_phase(_phase):
        # Flush every queued row for the given phase to this run's CSV file.
        assert 1 == _phase or 2 == _phase
        queue = p1_queue if _phase == 1 else p2_queue
        with open(f'{working_dir}phase_{_phase}_{from_index}_{to_index}_{identifier}.csv', 'a') as _f:
            while queue:
                vals = queue.popleft()
                print(*vals, file=_f, sep=',')

    last_wrote2 = 0
    for idx in trange(from_index, to_index, smoothing=0):
        for base_size in [200, 300, 400, 500]:  # 200, 300, 400,500, 600
            if is_random:
                schema = RelationalSchema.from_dict(schemas[idx])
                max_hop, rcm_code = rcm_codes[idx]
                rdeps = sorted(list(enumerate_rdeps(schema, max_hop)))
                dependencies = {rdeps[at] for at in rcm_code}
                rcm = RCM(schema, dependencies)
            else:
                schema = company_schema()
                rcm = company_rcm()

            def initialize():
                # Build an RCI tester on data generated for this
                # (idx, base_size) pair; seeded with idx + 1 throughout.
                np.random.seed(idx + 1)
                skeleton = sized_random_skeleton(schema,
                                                 sizing_method(base_size, schema),
                                                 seed=idx + 1)
                lg_rcm = linear_gaussians_rcm(rcm, seed=idx + 1)
                generate_values_for_skeleton(lg_rcm, skeleton, seed=idx + 1)
                datasource = DataCenter(skeleton)
                kerner = RBFKernelComputer(datasource,
                                           additive=1e-2,
                                           n_jobs=n_jobs,
                                           eqsize_only=False,
                                           k_cache_max_size=128)
                _tester = RCITester(kerner, n_jobs=n_jobs)
                return _tester

            # Lazily initialize once per (idx, base_size): only if at least
            # one option combination still needs to run.
            initialized = False
            for is_aggregated, sepset_rule, orientation_rule, detect_rbo, detect_post_rbo in options:
                # Only matched detection flags are of interest.
                if detect_rbo != detect_post_rbo:
                    continue
                # Aggregated runs are evaluated only with both detections on.
                if is_aggregated:
                    if not (detect_post_rbo and detect_rbo):
                        continue
                p2_key = (idx, base_size, is_aggregated, sepset_rule,
                          orientation_rule, detect_rbo, detect_post_rbo)
                if p2_key in done[2]:
                    continue
                if not initialized:
                    tester = initialize()
                    initialized = True
                # We already skipped keys present in done[2], so the original
                # `if p2_key not in done[2]` re-check was always true.
                done[2].add(p2_key)

                np.random.seed(idx + 1)
                learner = RCMLearner(
                    tester,
                    max_rv_hops=rcm.max_hop,
                    max_degree=None,
                    verbose=False,
                    true_rcm=rcm,
                    sepset_rule=sepset_rule,
                    orientation_rule=orientation_rule,
                    aggregator=average_aggregator if is_aggregated else None,
                    minimum_rows_for_test=0,
                    detect_rbo_violations=detect_rbo,
                    detect_post_rbo_violations=detect_post_rbo)
                learner.perfect_phase_I()
                learner.RBO_based_tests()
                learner.post_RBO_unshielded_triples_tests()
                learner.orient()

                # One CSV row: key columns, then three '|'-separated stat
                # groups (RBO stats, post-RBO stats, orientation evaluation).
                p2_values = []
                p2_values.extend(p2_key)
                p2_values.append('|')
                p2_values.extend(learner.rbo_stats[k] for k in stats_keys())
                p2_values.append('|')
                p2_values.extend(learner.post_rbo_stats[k] for k in stats_keys())
                p2_values.append('|')
                p2_values.extend(evaluation_for_orientation(learner.prcm, rcm)[-6:-3])
                p2_queue.append(p2_values)

                # Flush at most every 120 seconds to limit file churn.
                if last_wrote2 + 120 < time.time():
                    writing_phase(2)
                    last_wrote2 = time.time()

            # A second, identical timed flush appeared in the original right
            # after the options loop; kept for identical behavior (it is a
            # no-op whenever the inner flush just reset the timer).
            if last_wrote2 + 120 < time.time():
                writing_phase(2)
                last_wrote2 = time.time()

    # clean up: flush whatever is still queued.
    if p1_queue:
        writing_phase(1)
    if p2_queue:
        writing_phase(2)