def test_hospital_without_init():
    db_name = random_database()

    try:
        # 1. Setup a HoloClean session.
        hc = holoclean.HoloClean(db_name=db_name,
                                 domain_thresh_1=0.0,
                                 domain_thresh_2=0.0,
                                 weak_label_thresh=0.99,
                                 max_domain=10000,
                                 cor_strength=0.6,
                                 nb_cor_strength=0.8,
                                 epochs=10,
                                 weight_decay=0.01,
                                 learning_rate=0.001,
                                 threads=1,
                                 batch_size=1,
                                 verbose=True,
                                 timeout=3 * 60000,
                                 feature_norm=False,
                                 weight_norm=False,
                                 print_fw=True).session

        # 2. Load training data and denial constraints.
        hc.load_data('hospital', '../testdata/hospital.csv')
        hc.load_dcs('../testdata/hospital_constraints.txt')
        hc.ds.set_constraints(hc.get_dcs())

        # 3. Detect erroneous cells using these two detectors.
        detectors = [NullDetector(), ViolationDetector()]
        hc.detect_errors(detectors)

        # 4. Repair errors utilizing the defined features.
        hc.setup_domain()
        featurizers = [
            OccurAttrFeaturizer(),
            FreqFeaturizer(),
            ConstraintFeaturizer(),
        ]

        hc.repair_errors(featurizers)

        # 5. Evaluate the correctness of the results.
        report = hc.evaluate(fpath='../testdata/hospital_clean.csv',
                             tid_col='tid',
                             attr_col='attribute',
                             val_col='correct_val')

        # Assert that the key metrics match the expected values for hospital.
        # If a new change breaks these assertions, its results should be at
        # least as good as the values below, unless there is a clear and
        # well-justified reason for the regression.
        assert report.correct_repairs == 434
        assert report.total_repairs == 456
        assert abs(report.precision - 434. / 456) < TOL
        assert abs(report.recall - 434. / 509) < TOL
        assert abs(report.repair_recall - 434. / 435) < TOL
        assert report.total_repairs_grdt_correct == 22
    finally:
        delete_database(db_name)
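The test relies on `random_database`, `delete_database`, and `TOL`, which are defined elsewhere in the test suite. A minimal sketch of what they might look like, assuming a PostgreSQL backend and the `holocleanuser`/`abcd1234` credentials from HoloClean's setup instructions (the connection details and the tolerance value are assumptions here):

import random
import string

from sqlalchemy import create_engine, text

# Tolerance for floating-point metric comparisons (assumed value).
TOL = 1e-6


def random_database():
    # Hypothetical helper: create a throwaway Postgres database with a
    # random suffix so concurrent test runs do not collide.
    name = 'holo_test_' + ''.join(random.choices(string.ascii_lowercase, k=8))
    engine = create_engine('postgresql://holocleanuser:abcd1234@localhost/postgres',
                           isolation_level='AUTOCOMMIT')
    with engine.connect() as conn:
        conn.execute(text('CREATE DATABASE ' + name))
    return name


def delete_database(name):
    # Hypothetical counterpart: drop the temporary database once the test
    # has finished, whether it passed or failed.
    engine = create_engine('postgresql://holocleanuser:abcd1234@localhost/postgres',
                           isolation_level='AUTOCOMMIT')
    with engine.connect() as conn:
        conn.execute(text('DROP DATABASE IF EXISTS ' + name))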
Example 2
def template(featurizers, estimator_type):
    db_name = random_database()

    try:
        # 1. Setup a HoloClean session.
        hc = holoclean.HoloClean(
            db_name=db_name,
            domain_thresh_1=0.0,
            domain_thresh_2=0.0,
            weak_label_thresh=0.99,
            max_domain=10000,
            cor_strength=0.6,
            nb_cor_strength=0.8,
            epochs=10,
            weight_decay=0.01,
            learning_rate=0.001,
            threads=1,
            batch_size=1,
            verbose=True,
            timeout=3 * 60000,
            print_fw=True,
            estimator_type=estimator_type,
        ).session

        # 2. Load training data and denial constraints.
        hc.load_data('hospital', '../testdata/hospital/hospital.csv')
        hc.load_dcs('../testdata/hospital/hospital_constraints.txt')
        hc.ds.set_constraints(hc.get_dcs())

        # 3. Detect erroneous cells using these two detectors.
        detectors = [NullDetector(), ViolationDetector()]
        hc.detect_errors(detectors)

        # 4. Repair errors utilizing the defined features.
        hc.generate_domain()
        hc.run_estimator()
        hc.repair_errors(featurizers)

        # 5. Evaluate the correctness of the results.
        report = hc.evaluate(fpath='../testdata/hospital/hospital_clean.csv',
                             tid_col='tid',
                             attr_col='attribute',
                             val_col='correct_val')

        return report

    finally:
        delete_database(db_name)
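A call site for this helper might look like the following; the featurizer mix mirrors Example 1 (and assumes the same imports), while the estimator name and metric thresholds are illustrative placeholders:

def test_hospital_logistic():
    # Hypothetical invocation: run the shared pipeline with a concrete
    # featurizer set and estimator type, then check the headline metrics.
    report = template(
        featurizers=[OccurAttrFeaturizer(), FreqFeaturizer(), ConstraintFeaturizer()],
        estimator_type='Logistic',
    )
    assert report.precision > 0.9
    assert report.recall > 0.8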
Example 3
import holoclean
from detect import *
from repair.featurize import *

# 1. Setup a HoloClean session.
hc = holoclean.HoloClean(
    db_name='holo',
    domain_thresh_1=0.0,
    domain_thresh_2=0.0,
    weak_label_thresh=0.99,
    max_domain=10000,
    cor_strength=0.6,
    nb_cor_strength=0.8,
    weight_decay=0.01,
    learning_rate=0.001,
    threads=1,
    batch_size=1,
    verbose=True,
    timeout=3 * 60000,
    print_fw=True,
).session

# 2. Load training data and denial constraints.
hc.load_data('hospital', '../testdata/hospital/hospital.csv')
hc.load_dcs('../testdata/hospital/hospital_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells using these two detectors.
detectors = [NullDetector(), ViolationDetector()]
hc.detect_errors(detectors)
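The snippet stops after error detection; under the same data layout as the other examples, the pipeline would plausibly continue with domain setup, repair, and evaluation (a sketch; depending on the HoloClean version the domain call is `setup_domain()` or `generate_domain()`):

# 4. Repair errors utilizing the defined features.
hc.setup_domain()
featurizers = [
    OccurAttrFeaturizer(),
    FreqFeaturizer(),
    ConstraintFeaturizer(),
]
hc.repair_errors(featurizers)

# 5. Evaluate the correctness of the results.
report = hc.evaluate(fpath='../testdata/hospital/hospital_clean.csv',
                     tid_col='tid',
                     attr_col='attribute',
                     val_col='correct_val')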
Example 4
def hc_repair(data_name, tmp_path, num_attrs, paras, attr=None, n_val=-1):
    """
    Filling value by value in the synthetic dataset

    :param data_name string for the true dataset
    :param tmp_path temporary path for partial data
    :param num_attrs a list of numeric attributes
    :param paras parameters
    :param n_val number of value to predict. Set None to predicate entire domain

    Return a list of dumped predicates from autoencoder
    """

    m = paras.get('MCMC', 0)

    path_preds = None
    if m > 0:
        assert attr is not None
        # load the model from previous save, without training
        dir_preds = os.path.abspath("./_models")
        os.makedirs(dir_preds, exist_ok=True)
        path_preds = f"{dir_preds}/model_{data_name}_{attr}.pkl"

        if os.path.exists(path_preds):
            with open(path_preds, 'rb') as f:
                df_preds = pickle.load(f)

            logging.info('DONE with loading model from file')
            return df_preds

    if n_val is not None:
        # n_val is None only during weight learning; otherwise train the
        # model in the standard way (this branch is also taken when no saved
        # model exists for MCMC) and derive the number of values to predict.
        n_val_limit, n_try = _get_sampling_paras(paras.get('AR', False))
        n_val = n_val_limit * n_try

    hc = holoclean.HoloClean(db_name='db4kamino',
                             domain_thresh_1=0.0,
                             domain_thresh_2=0.0,
                             max_domain=10000,
                             cor_strength=0,
                             weight_decay=0.,
                             learning_rate=0.001,
                             threads=1,
                             batch_size=1,
                             verbose=False,
                             timeout=3 * 60000,
                             infer_mode='dk',
                             privacy=paras['dp'],
                             delta=paras['delta'],
                             iterations=paras['iterations'],
                             noise_multiplier=paras['noise_multiplier'],
                             l2_norm_clip=paras['l2_norm_clip'],
                             minibatch_size=paras['minibatch_size'],
                             microbatch_size=paras['microbatch_size']).session

    hc.load_data(data_name, tmp_path, numerical_attrs=num_attrs)

    detectors = [NullDetector()]
    hc.detect_errors(detectors)

    num_attr_groups = []
    quantized_num = []
    num_attrs_quant = _get_num_attrs_quant(data_name)

    for num_attr in num_attrs:
        num_attr_groups.append([num_attr])

        if num_attrs_quant is not None and num_attr in num_attrs_quant:
            quantized_num.append((num_attrs_quant[num_attr], [num_attr]))

    hc.quantize_numericals(quantized_num)
    hc.generate_domain()

    embedfest = EmbeddingFeaturizer(reuse_embedding=paras['reuse_embedding'],
                                    numerical_attr_groups=num_attr_groups)
    embedfest.setup_featurizer(hc.env, hc.ds)

    if m > 0:
        assert attr is not None and path_preds is not None
        df_preds = embedfest.embedding_model.dump_predictions_hm(
            n_val_limit=n_val, include_std=True, fpath=path_preds)
    else:
        df_preds = embedfest.embedding_model.gen_predictions(n_val_limit=n_val,
                                                             include_std=True)

    logging.info('DONE with training the autoencoder model')

    return df_preds
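Every key read from `paras` inside `hc_repair` appears below; the values, dataset name, path, and numerical attribute are placeholders, and `_get_sampling_paras`/`_get_num_attrs_quant` are assumed to be defined alongside the function:

# Hypothetical parameter dictionary: each key is consumed somewhere in
# hc_repair, but all of the concrete values are placeholders.
paras = {
    'MCMC': 0,                 # 0: train from scratch; >0: reuse saved predictions
    'AR': False,               # sampling regime passed to _get_sampling_paras
    'reuse_embedding': True,
    'dp': False,               # differential-privacy switch
    'delta': 1e-5,
    'iterations': 1000,
    'noise_multiplier': 1.1,
    'l2_norm_clip': 1.0,
    'minibatch_size': 32,
    'microbatch_size': 1,
}

df_preds = hc_repair('hospital', '/tmp/hospital_partial.csv',
                     num_attrs=['Score'], paras=paras)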
Example 5
def holoclean_init(data_df):
    hc = holoclean.HoloClean(db_name='holo',
                             domain_thresh_1=0,
                             domain_thresh_2=0,
                             weak_label_thresh=0.99,
                             max_domain=10000,
                             cor_strength=0.6,
                             nb_cor_strength=0.8,
                             epochs=10,
                             weight_decay=0.01,
                             learning_rate=0.001,
                             threads=1,
                             batch_size=1,
                             verbose=False,
                             timeout=1 * 600,
                             feature_norm=False,
                             weight_norm=False,
                             print_fw=False).session

    hc.load_data('hospital', data_df)
    hc.load_dcs('./temp_constraints.txt')
    hc.ds.set_constraints(hc.get_dcs())

    hc.setup_domain(list(data_df.columns))
    return hc


def create_constraints_file(relevant_attr):
    # Keep every constraint that mentions one of the relevant attributes and
    # collect all attributes referenced by the constraints that were kept.
    attributes_to_keep = set()
    with open('./testdata/hospital_constraints.txt') as fr, \
            open('./temp_constraints.txt', 'w+') as fw:
        for line in fr:
            for attr in relevant_attr:
                if attr in line:
                    fw.write(line)
                    for item in line.split("t2.")[1:]:
                        attributes_to_keep.add(item.split(")")[0])
    return attributes_to_keep


def holoclean_detect(hc):
    detectors = [NullDetector(), ViolationDetector()]
    featurizers = [
        InitAttrFeaturizer(),
        OccurAttrFeaturizer(),
        FreqFeaturizer(),
        ConstraintFeaturizer(),
    ]

    hc.detect_errors(detectors)
    hc.repair_errors(featurizers)

    return hc


def run_holoclean(df, columns):
    relevant_attributes = create_constraints_file(columns)
    df_in = df.copy()
    # create_constraints_file returns a set; convert it to a list so it can
    # be used to index the DataFrame.
    df_in = df_in[list(relevant_attributes)]
    hc = holoclean_init(df_in)
    hc = holoclean_detect(hc)
    return hc
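A driver for these helpers might look like the following; the file path and column choice are illustrative:

import pandas as pd

# Hypothetical driver: repair only the slice of the data covered by the
# constraints that mention the requested columns.
df = pd.read_csv('./testdata/hospital.csv')
hc = run_holoclean(df, columns=['City', 'ZipCode'])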
Example 6
import holoclean

from detect import NullDetector, ViolationDetector
from repair.featurize import InitFeaturizer
from repair.featurize import InitAttFeaturizer
from repair.featurize import InitSimFeaturizer
from repair.featurize import FreqFeaturizer
from repair.featurize import OccurFeaturizer
from repair.featurize import ConstraintFeat
from repair.featurize import LangModelFeat

# 1. Setup a HoloClean session.
hc = holoclean.HoloClean(pruning_topk=0.1,
                         epochs=30,
                         weight_decay=0.01,
                         threads=20,
                         batch_size=1,
                         verbose=True,
                         timeout=3 * 60000).session

# 2. Load training data and denial constraints.
hc.load_data('hospital', 'data', 'hospital.csv')
hc.load_dcs('data', 'hospital_constraints_att.txt')
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells using these two detectors.
detectors = [NullDetector(), ViolationDetector()]
hc.detect_errors(detectors)

# 4. Repair errors utilizing the defined features.
hc.setup_domain()
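This legacy-API example ends at domain setup; judging from the featurizers imported at the top, the repair step would plausibly continue as below (a sketch against the old interface, not verified against it):

# Repair errors with the featurizers imported above.
featurizers = [InitAttFeaturizer(),
               InitSimFeaturizer(),
               FreqFeaturizer(),
               OccurFeaturizer(),
               ConstraintFeat(),
               LangModelFeat()]
hc.repair_errors(featurizers)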
Example 7
import holoclean
from holoclean.detect import *
from holoclean.repair.featurize import *

# 1. Setup a HoloClean session.
hc = holoclean.HoloClean(
    sqlalchemy_uri="postgresql://*****:*****@localhost:5432/superset",
    domain_thresh_1=0.0,
    domain_thresh_2=0.0,
    weak_label_thresh=0.99,
    max_domain=10000,
    cor_strength=0.6,
    nb_cor_strength=0.8,
    weight_decay=0.01,
    learning_rate=0.001,
    threads=1,
    batch_size=1,
    verbose=True,
    timeout=3 * 60000,
    print_fw=True,
).session

# 2. Load training data and denial constraints.
hc.load_data('hospital', '../testdata/hospital/hospital.csv')
hc.load_dcs('../testdata/hospital/hospital_constraints.txt')
hc.ds.set_constraints(hc.get_dcs())

# 3. Detect erroneous cells using these two detectors.
detectors = [NullDetector(), ViolationDetector()]
hc.detect_errors(detectors)