コード例 #1
0
def test_hospital_without_init():
    """End-to-end HoloClean regression test on the hospital dataset.

    Runs the whole pipeline (load data and denial constraints, detect
    errors, set up the domain, repair, evaluate) with a featurizer list
    that contains no InitAttrFeaturizer, then pins the evaluation report
    to previously observed values.

    The session is opened on the database name returned by
    ``random_database()``; that same name is handed to
    ``delete_database()`` in the ``finally`` clause, whether or not the
    assertions pass.
    """
    db_name = random_database()

    try:
        # 1. Setup a HoloClean session.
        # Use the per-test database obtained above instead of a hard-coded
        # name, so the database that gets cleaned up in `finally` is the one
        # the session actually worked on.
        hc = holoclean.HoloClean(db_name=db_name,
                                 domain_thresh_1=0.0,
                                 domain_thresh_2=0.0,
                                 weak_label_thresh=0.99,
                                 max_domain=10000,
                                 cor_strength=0.6,
                                 nb_cor_strength=0.8,
                                 epochs=10,
                                 weight_decay=0.01,
                                 learning_rate=0.001,
                                 threads=1,
                                 batch_size=1,
                                 verbose=True,
                                 timeout=3 * 60000,
                                 feature_norm=False,
                                 weight_norm=False,
                                 print_fw=True).session

        # 2. Load training data and denial constraints.
        hc.load_data('hospital', '../testdata/hospital.csv')
        hc.load_dcs('../testdata/hospital_constraints.txt')
        hc.ds.set_constraints(hc.get_dcs())

        # 3. Detect erroneous cells using these two detectors.
        detectors = [NullDetector(), ViolationDetector()]
        hc.detect_errors(detectors)

        # 4. Repair errors utilizing the defined features.
        hc.setup_domain()
        # No InitAttrFeaturizer in this list, as the test name indicates.
        featurizers = [
            OccurAttrFeaturizer(),
            FreqFeaturizer(),
            ConstraintFeaturizer(),
        ]

        hc.repair_errors(featurizers)

        # 5. Evaluate the correctness of the results.
        report = hc.evaluate(fpath='../testdata/hospital_clean.csv',
                             tid_col='tid',
                             attr_col='attribute',
                             val_col='correct_val')

        # We assert that our key metrics are exactly as tested for hospital.
        # If these assertions ever fail in a new change, the results should
        # be comparable if not better than before, unless a clear and correct
        # reason can be given.
        assert report.correct_repairs == 434
        assert report.total_repairs == 456
        assert abs(report.precision - 434. / 456) < TOL
        assert abs(report.recall - 434. / 509) < TOL
        assert abs(report.repair_recall - 434. / 435) < TOL
        assert report.total_repairs_grdt_correct == 22
    finally:
        delete_database(db_name)
コード例 #2
0
ファイル: cleaning.py プロジェクト: orensheffer7/TREx
        def holoclean_detect(hc):
            """Run error detection followed by repair on ``hc``.

            ``hc`` is passed a NullDetector and a ViolationDetector for
            detection, then the four featurizers below for repair. The
            same ``hc`` object is returned so calls can be chained.
            """
            # Instantiate everything up front, detectors first, before any
            # pipeline step is invoked.
            error_detectors = [NullDetector(), ViolationDetector()]
            featurizer_classes = (
                InitAttrFeaturizer,
                OccurAttrFeaturizer,
                FreqFeaturizer,
                ConstraintFeaturizer,
            )
            repair_featurizers = [make() for make in featurizer_classes]

            hc.detect_errors(error_detectors)
            hc.repair_errors(repair_featurizers)
            return hc
コード例 #3
0
def template(featurizers, estimator_type):
    """Run the hospital repair pipeline once and return its evaluation.

    Args:
        featurizers: featurizer instances handed to ``repair_errors``.
        estimator_type: forwarded unchanged to ``holoclean.HoloClean`` as
            its ``estimator_type`` keyword.

    Returns:
        The value returned by the session's ``evaluate`` call against
        ``hospital_clean.csv``.

    The database name obtained from ``random_database()`` is always passed
    to ``delete_database()`` on the way out, including when a step raises.
    """
    db_name = random_database()

    try:
        # Session configuration, collected in one place.
        session_options = dict(
            db_name=db_name,
            domain_thresh_1=0.0,
            domain_thresh_2=0.0,
            weak_label_thresh=0.99,
            max_domain=10000,
            cor_strength=0.6,
            nb_cor_strength=0.8,
            epochs=10,
            weight_decay=0.01,
            learning_rate=0.001,
            threads=1,
            batch_size=1,
            verbose=True,
            timeout=3 * 60000,
            print_fw=True,
            estimator_type=estimator_type,
        )
        session = holoclean.HoloClean(**session_options).session

        # Input data and its denial constraints.
        session.load_data('hospital', '../testdata/hospital/hospital.csv')
        session.load_dcs('../testdata/hospital/hospital_constraints.txt')
        session.ds.set_constraints(session.get_dcs())

        # Error detection with the null and violation detectors.
        session.detect_errors([NullDetector(), ViolationDetector()])

        # Domain generation and the estimator run precede the repair step.
        session.generate_domain()
        session.run_estimator()
        session.repair_errors(featurizers)

        # Score the repairs against the clean reference file.
        return session.evaluate(
            fpath='../testdata/hospital/hospital_clean.csv',
            tid_col='tid',
            attr_col='attribute',
            val_col='correct_val',
        )
    finally:
        delete_database(db_name)
コード例 #4
0
                         learning_rate=0.001,
                         threads=1,
                         batch_size=1,
                         verbose=True,
                         timeout=3 * 60000,
                         feature_norm=False,
                         weight_norm=False,
                         print_fw=True).session

# 2. Load training data and denial constraints.
# `hc` is the session object produced by the HoloClean(...) call above.
hc.load_data('hospital', '../testdata/hospital.csv')
hc.load_dcs('../testdata/hospital_constraints.txt')
# Hand the constraints just loaded to the session's dataset object.
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells using these two detectors.
detectors = [NullDetector(), ViolationDetector()]
hc.detect_errors(detectors)

# 4. Repair errors utilizing the defined features.
# The domain is set up before the repair step is invoked.
hc.setup_domain()
# Featurizers passed to the repair step, in this order.
featurizers = [
    InitAttrFeaturizer(),
    OccurAttrFeaturizer(),
    FreqFeaturizer(),
    ConstraintFeaturizer(),
]

hc.repair_errors(featurizers)

# 5. Evaluate the correctness of the results.
hc.evaluate(fpath='../testdata/hospital_clean.csv',