Example #1
import json
import time
from collections import deque

import numpy as np
from tqdm import trange

# Project-specific names (get_working_dir, arg_parse, RelationalSchema, RCM,
# enumerate_rdeps, company_schema, company_rcm, sized_random_skeleton,
# sizing_method, linear_gaussians_rcm, generate_values_for_skeleton,
# DataCenter, RBFKernelComputer, RCITester, RCMLearner, examine_oriori)
# are assumed to be imported from the surrounding package.


def main(argv):
    tester = None

    sepset_rule = 'first'
    orientation_rule = 'majority'

    is_random = 'random' in argv
    is_company = 'company' in argv

    working_dir = get_working_dir(is_company, is_random)

    from_index, to_index, n_jobs, _ = arg_parse(argv)

    if is_random:
        with open('data/random/1000_random_schemas.json', 'r') as f:
            schemas = json.load(f)
        with open('data/random/1000_random_rcms.json', 'r') as f:
            rcm_codes = json.load(f)
    else:
        schemas, rcm_codes = None, None

    identifier = str(int(time.time() * 100))  # unique-ish run id embedded in output filenames

    p1_queue = deque()
    p2_queue = deque()

    def writing_phase(_phase):
        assert _phase in (1, 2)
        queue = p1_queue if _phase == 1 else p2_queue
        with open(f'{working_dir}pure_rbo_comparison_{_phase}_{from_index}_{to_index}_{identifier}.csv', 'a') as _f:
            while queue:
                vals = queue.popleft()
                print(*vals, file=_f, sep=',')

    last_wrote2 = 0
    for idx in trange(from_index, to_index, smoothing=0):
        for base_size in [200, 500]:
            if is_random:
                schema = RelationalSchema.from_dict(schemas[idx])
                max_hop, rcm_code = rcm_codes[idx]
                rdeps = sorted(list(enumerate_rdeps(schema, max_hop)))
                dependencies = {rdeps[at] for at in rcm_code}
                rcm = RCM(schema, dependencies)
            else:
                schema = company_schema()
                rcm = company_rcm()

            def initialize():
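                # Build the synthetic dataset for this (idx, base_size) cell:
                # sample a relational skeleton, parametrize the true RCM with
                # linear-Gaussian models, generate attribute values, and wrap
                # everything in an RBF-kernel relational CI tester.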
                np.random.seed(idx + 1)
                skeleton = sized_random_skeleton(schema, sizing_method(base_size, schema), seed=idx + 1)
                lg_rcm = linear_gaussians_rcm(rcm, seed=idx + 1)
                generate_values_for_skeleton(lg_rcm, skeleton, seed=idx + 1)

                datasource = DataCenter(skeleton)
                kerner = RBFKernelComputer(datasource, additive=1e-2, n_jobs=n_jobs, eqsize_only=False, k_cache_max_size=128)
                _tester = RCITester(kerner, n_jobs=n_jobs)
                return _tester

            # No option sweep in this script, so build the tester directly.
            tester = initialize()

            np.random.seed(idx + 1)
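            # Three learners share the tester: learner0 orients RBO colliders
            # with CUT-based collider tests, learner1 runs plain RBO tests,
            # and learner2 runs RBO tests with RBO-violation detection enabled.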
            learner0 = RCMLearner(tester, max_rv_hops=rcm.max_hop, max_degree=None, verbose=False, true_rcm=rcm, sepset_rule=sepset_rule,
                                  orientation_rule=orientation_rule,
                                  aggregator=None,
                                  minimum_rows_for_test=0,
                                  detect_rbo_violations=False,
                                  detect_post_rbo_violations=False)

            learner1 = RCMLearner(tester, max_rv_hops=rcm.max_hop, max_degree=None, verbose=False, true_rcm=rcm, sepset_rule=sepset_rule,
                                  orientation_rule=orientation_rule,
                                  aggregator=None,
                                  minimum_rows_for_test=0,
                                  detect_rbo_violations=False,
                                  detect_post_rbo_violations=False)

            learner2 = RCMLearner(tester, max_rv_hops=rcm.max_hop, max_degree=None, verbose=False, true_rcm=rcm, sepset_rule=sepset_rule,
                                  orientation_rule=orientation_rule,
                                  aggregator=None,
                                  minimum_rows_for_test=0,
                                  detect_rbo_violations=True,
                                  detect_post_rbo_violations=False)

            learner0.perfect_phase_I()
            learner0.CUT_based_collider_tests(rbo_only=True)

            learner1.perfect_phase_I()
            learner1.RBO_based_tests()

            learner2.perfect_phase_I()
            learner2.RBO_based_tests()

            p2_values = []
            p2_values.extend((idx, base_size))
            p2_values.extend(examine_oriori(learner0, rcm))
            p2_values.extend(examine_oriori(learner1, rcm))
            p2_values.extend(examine_oriori(learner2, rcm))
            p2_queue.append(p2_values)

            if last_wrote2 + 120 < time.time():
                writing_phase(2)
                last_wrote2 = time.time()

    # clean up
    if p2_queue:
        writing_phase(2)
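
# --- Editor's sketch (not part of the original script) ----------------------
# The deque-plus-`writing_phase` idiom above buffers result rows in memory and
# appends them to a CSV at most once every 120 seconds. A minimal standalone
# version of the same pattern, assuming only the standard library:
import csv


class TimedCsvFlusher:
    """Buffer rows and append them to `path` at most every `interval` seconds."""

    def __init__(self, path, interval=120.0):
        self.path = path
        self.interval = interval
        self.queue = deque()
        self.last_wrote = 0.0

    def add(self, row):
        self.queue.append(row)
        if self.last_wrote + self.interval < time.time():
            self.flush()

    def flush(self):
        if not self.queue:
            return
        with open(self.path, 'a', newline='') as f:
            writer = csv.writer(f)
            while self.queue:
                writer.writerow(self.queue.popleft())
        self.last_wrote = time.time()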
Example #2
import itertools
import json
import os
import time
from collections import deque

import numpy as np
import pandas as pd
from tqdm import trange

# Project-specific names (get_working_dir, retrieve_finished, files,
# arg_parse, RelationalSchema, RCM, enumerate_rdeps, company_schema,
# company_rcm, sized_random_skeleton, sizing_method, linear_gaussians_rcm,
# generate_values_for_skeleton, DataCenter, RBFKernelComputer, RCITester,
# RCMLearner, average_aggregator, phase_I_to_write, RelationalDependency,
# UndirectedRDep) are assumed to be imported from the surrounding package.


def main(argv):
    # p1_key = (idx, base_size, is_aggregated, order_dependent)
    KEY_LENGTH = {1: 4, 2: 7}  # phase -> number of leading key columns per CSV row

    is_aggregateds = [True, False]
    order_dependents = [True, False]

    is_random = 'random' in argv
    is_company = 'company' in argv

    working_dir = get_working_dir(is_company, is_random)
    done = retrieve_finished(KEY_LENGTH, working_dir)

    if '--merge' in argv:
        for phase in [1, 2]:
            to_be_merged = list(
                files(working_dir, prefix=f'phase_{phase}', suffix='.csv'))
            if to_be_merged:
                print('merging:')
                for x in to_be_merged:
                    print('       ', x)
                df = pd.concat([
                    pd.read_csv(f'{working_dir}{fname}', header=None)
                    for fname in to_be_merged
                ])
                for fname in to_be_merged:
                    os.rename(f'{working_dir}{fname}',
                              f'{working_dir}{fname}.bak')
                df.to_csv(f'{working_dir}phase_{phase}.csv',
                          header=False,
                          index=False)
            else:
                print('nothing to merge.')
        return

    from_index, to_index, n_jobs = arg_parse(argv)

    if is_random:
        with open('data/random/1000_random_schemas.json', 'r') as f:
            schemas = json.load(f)
        with open('data/random/1000_random_rcms.json', 'r') as f:
            rcm_codes = json.load(f)
    else:
        schemas, rcm_codes = None, None

    identifier = str(int(time.time() * 100))

    options = list(itertools.product(is_aggregateds, order_dependents))

    p1_queue = deque()
    tester = None

    def writing_phase(_phase):
        assert _phase == 1
        queue = p1_queue
        with open(
                f'{working_dir}phase_{_phase}_{from_index}_{to_index}_{identifier}.csv',
                'a') as _f:
            while queue:
                vals = queue.popleft()
                print(*vals, file=_f, sep=',')

    last_wrote1 = 0
    for idx in trange(from_index, to_index, smoothing=0):
        for base_size in [200, 300, 400, 500]:
            if is_random:
                schema = RelationalSchema.from_dict(schemas[idx])
                max_hop, rcm_code = rcm_codes[idx]
                rdeps = sorted(list(enumerate_rdeps(schema, max_hop)))
                dependencies = {rdeps[at] for at in rcm_code}
                rcm = RCM(schema, dependencies)
            else:
                schema = company_schema()
                rcm = company_rcm()

            def initialize():
                np.random.seed(idx + 1)
                skeleton = sized_random_skeleton(schema,
                                                 sizing_method(
                                                     base_size, schema),
                                                 seed=idx + 1)
                lg_rcm = linear_gaussians_rcm(rcm, seed=idx + 1)
                generate_values_for_skeleton(lg_rcm, skeleton, seed=idx + 1)

                datasource = DataCenter(skeleton)
                kerner = RBFKernelComputer(datasource,
                                           additive=1e-2,
                                           n_jobs=n_jobs,
                                           eqsize_only=False,
                                           k_cache_max_size=128)
                _tester = RCITester(kerner, n_jobs=n_jobs)
                return _tester

            initialized = False
            for is_aggregated, order_dependent in options:
                p1_key = (idx, base_size, is_aggregated, order_dependent)

                if p1_key in done[1]:
                    continue

                if not initialized:
                    tester = initialize()
                    initialized = True

                # p1_key is new (the `continue` above skips finished keys).
                done[1].add(p1_key)

                # Phase I
                np.random.seed(idx + 1)
                p1_learner = RCMLearner(
                    tester,
                    max_rv_hops=rcm.max_hop,
                    max_degree=None,
                    verbose=False,
                    true_rcm=rcm,
                    aggregator=average_aggregator if is_aggregated else None,
                    minimum_rows_for_test=0,
                    phase_I_order_independence=not order_dependent)

                p1_learner.phase_I()

                p1_values = []
                p1_values.extend(p1_key)
                p1_values.extend(phase_I_to_write(p1_learner.prcm, rcm))

                # Count the undirected dependencies rescued by aggregated CI
                # tests: true direction, reversed, or absent from the true RCM.
                counts = [0, 0, 0]
                rescued = {(cause, effect)
                           for cause, effect, _ in p1_learner.saved_by_aggregated_ci}
                for cause, effect in rescued:
                    dep = RelationalDependency(cause, effect)
                    rev_dep = dep.reverse()
                    if UndirectedRDep(dep) not in p1_learner.prcm.undirected_dependencies:
                        continue
                    if dep in rcm.directed_dependencies:
                        counts[0] += 1
                    elif rev_dep in rcm.directed_dependencies:
                        counts[1] += 1
                    else:
                        counts[2] += 1
                p1_values.extend(counts)

                p1_queue.append(p1_values)

                if last_wrote1 + 120 < time.time():
                    writing_phase(1)
                    last_wrote1 = time.time()

    # clean up
    if p1_queue:
        writing_phase(1)
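
# --- Editor's sketch (not part of the original script) ----------------------
# `retrieve_finished` is project code that is not shown here. One plausible
# reconstruction, under the assumption that each phase CSV row starts with the
# KEY_LENGTH[phase] key columns written by `writing_phase`:
import ast
import csv


def retrieve_finished_sketch(key_length, working_dir):
    def parse(cell):
        try:
            return ast.literal_eval(cell)  # '200' -> 200, 'True' -> True
        except (ValueError, SyntaxError):
            return cell                    # bare strings such as 'minimal'

    done = {phase: set() for phase in key_length}
    for phase, k in key_length.items():
        for fname in os.listdir(working_dir):
            if fname.startswith(f'phase_{phase}') and fname.endswith('.csv'):
                with open(os.path.join(working_dir, fname), newline='') as f:
                    for row in csv.reader(f):
                        done[phase].add(tuple(parse(c) for c in row[:k]))
    return done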
Example #3
            if len(rcm.directed_dependencies) <= 2:
                continue
            cdg = rcm.class_dependency_graph
            if not nx.is_connected(cdg.as_networkx_dag().to_undirected()):
                continue
            if any(len(cdg.adj(attr)) == 0 for attr in schema.attrs):
                continue
            rbos, colliders, non_colliders = rbos_colliders_non_colliders(rcm)
            if len(rbos) + len(colliders) + len(non_colliders) == 0:
                continue
            break
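        # loop-else: reached only when the loop above exhausts without
        # `break`, i.e., no acceptable RCM was generated for this schema,
        # so the enclosing outer loop advances to the next candidate.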
        else:
            continue
        rcm_code = [
            at for at, dep in enumerate(
                sorted(list(enumerate_rdeps(schema, rcm.max_hop))))
            if dep in rcm.directed_dependencies
        ]

        schemas.append(schema.to_dict())
        rcms.append([rcm.max_hop, rcm_code])
        passed += 1
        print()

    with open(f'random/{total_size}_random_schemas.json', 'w') as f:
        json.dump(schemas, f, indent=4)
    with open(f'random/{total_size}_random_rcms.json', 'w') as f:
        json.dump(rcms, f, indent=4)

    with open(f'random/{total_size}_random_schemas.json', 'r') as f:
        schemas2 = json.load(f)
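
    # --- Editor's sketch (not part of the original fragment) ----------------
    # The read-back above suggests a round-trip check. Decoding a saved
    # (max_hop, rcm_code) pair back into an RCM mirrors the loaders used in
    # the experiment scripts; RelationalSchema, RCM, and enumerate_rdeps are
    # the project's own names.
    def decode_rcm(schema_dict, code_max_hop, code):
        schema2 = RelationalSchema.from_dict(schema_dict)
        rdeps2 = sorted(enumerate_rdeps(schema2, code_max_hop))
        return RCM(schema2, {rdeps2[at] for at in code})

    for schema_dict, (code_max_hop, code) in zip(schemas2, rcms):
        rcm2 = decode_rcm(schema_dict, code_max_hop, code)
        assert len(rcm2.directed_dependencies) == len(code)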
Example #4
import itertools
import json
import time
from collections import deque

import numpy as np
from tqdm import trange

# Project-specific names (get_working_dir, retrieve_finished, arg_parse,
# RelationalSchema, RCM, enumerate_rdeps, company_schema, company_rcm,
# sized_random_skeleton, sizing_method, linear_gaussians_rcm,
# generate_values_for_skeleton, DataCenter, RBFKernelComputer, RCITester,
# RCMLearner, average_aggregator, stats_keys, evaluation_for_orientation)
# are assumed to be imported from the surrounding package.


def main(argv):
    tester = None
    KEY_LENGTH = {1: 4, 2: 7}  # phase -> number of leading key columns per CSV row

    is_aggregateds = [True, False]
    sepset_rules = ['minimal']
    orientation_rules = ['majority']
    detect_rbos = [True, False]
    detect_post_rbos = [True, False]

    is_random = 'random' in argv
    is_company = 'company' in argv

    working_dir = get_working_dir(is_company, is_random)
    done = retrieve_finished(KEY_LENGTH, working_dir)

    from_index, to_index, n_jobs, _ = arg_parse(argv)

    if is_random:
        with open('data/random/1000_random_schemas.json', 'r') as f:
            schemas = json.load(f)
        with open('data/random/1000_random_rcms.json', 'r') as f:
            rcm_codes = json.load(f)
    else:
        schemas, rcm_codes = None, None

    identifier = str(int(time.time() * 100))

    options = list(itertools.product(is_aggregateds, sepset_rules, orientation_rules, detect_rbos, detect_post_rbos))

    p1_queue = deque()
    p2_queue = deque()

    def writing_phase(_phase):
        assert 1 == _phase or 2 == _phase
        queue = p1_queue if _phase == 1 else p2_queue
        with open(f'{working_dir}phase_{_phase}_{from_index}_{to_index}_{identifier}.csv', 'a') as _f:
            while queue:
                vals = queue.popleft()
                print(*vals, file=_f, sep=',')

    last_wrote2 = 0
    for idx in trange(from_index, to_index, smoothing=0):
        for base_size in [200, 300, 400, 500]:
            if is_random:
                schema = RelationalSchema.from_dict(schemas[idx])
                max_hop, rcm_code = rcm_codes[idx]
                rdeps = sorted(list(enumerate_rdeps(schema, max_hop)))
                dependencies = {rdeps[at] for at in rcm_code}
                rcm = RCM(schema, dependencies)
            else:
                schema = company_schema()
                rcm = company_rcm()

            def initialize():
                np.random.seed(idx + 1)
                skeleton = sized_random_skeleton(schema, sizing_method(base_size, schema), seed=idx + 1)
                lg_rcm = linear_gaussians_rcm(rcm, seed=idx + 1)
                generate_values_for_skeleton(lg_rcm, skeleton, seed=idx + 1)

                datasource = DataCenter(skeleton)
                kerner = RBFKernelComputer(datasource, additive=1e-2, n_jobs=n_jobs, eqsize_only=False,
                                           k_cache_max_size=128)
                _tester = RCITester(kerner, n_jobs=n_jobs)
                return _tester

            initialized = False
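            # Sweep the option grid. Only configurations whose detect_rbo and
            # detect_post_rbo flags agree are run, and aggregated runs require
            # both detectors to be enabled.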
            for is_aggregated, sepset_rule, orientation_rule, detect_rbo, detect_post_rbo in options:
                if detect_rbo != detect_post_rbo:
                    continue
                if is_aggregated and not (detect_rbo and detect_post_rbo):
                    continue

                p2_key = (idx, base_size, is_aggregated, sepset_rule, orientation_rule, detect_rbo, detect_post_rbo)

                if p2_key in done[2]:
                    continue

                if not initialized:
                    tester = initialize()
                    initialized = True

                # p2_key is new (the `continue` above skips finished keys).
                done[2].add(p2_key)
                np.random.seed(idx + 1)
                learner = RCMLearner(tester, max_rv_hops=rcm.max_hop, max_degree=None, verbose=False, true_rcm=rcm,
                                     sepset_rule=sepset_rule,
                                     orientation_rule=orientation_rule,
                                     aggregator=average_aggregator if is_aggregated else None,
                                     minimum_rows_for_test=0,
                                     detect_rbo_violations=detect_rbo,
                                     detect_post_rbo_violations=detect_post_rbo)

                learner.perfect_phase_I()
                learner.RBO_based_tests()
                learner.post_RBO_unshielded_triples_tests()
                learner.orient()

                # Row layout: key columns, then '|'-separated groups of RBO
                # stats, post-RBO stats, and orientation scores.
                p2_values = []
                p2_values.extend(p2_key)
                p2_values.append('|')
                p2_values.extend(learner.rbo_stats[k] for k in stats_keys())
                p2_values.append('|')
                p2_values.extend(learner.post_rbo_stats[k] for k in stats_keys())
                p2_values.append('|')
                p2_values.extend(evaluation_for_orientation(learner.prcm, rcm)[-6:-3])

                p2_queue.append(p2_values)

                if last_wrote2 + 120 < time.time():
                    writing_phase(2)
                    last_wrote2 = time.time()

    # clean up
    if p1_queue:
        writing_phase(1)
    if p2_queue:
        writing_phase(2)
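
# --- Editor's sketch (not part of the original script) ----------------------
# Reading the phase-2 rows back: each row holds the key columns followed by
# three '|'-delimited groups (RBO stats, post-RBO stats, orientation scores).
# Group widths depend on stats_keys(), so split on the '|' sentinels instead
# of fixed offsets. The file name assumes the merged output of the --merge
# step shown in the second script.
import csv


def split_groups(row):
    groups, current = [], []
    for cell in row:
        if cell == '|':
            groups.append(current)
            current = []
        else:
            current.append(cell)
    groups.append(current)
    return groups  # [key, rbo_stats, post_rbo_stats, orientation]


with open('phase_2.csv', newline='') as f:
    for row in csv.reader(f):
        key, rbo_stats, post_rbo_stats, orientation = split_groups(row)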