Example #1
    def setUp(self):

        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_constraints.txt")

        self.detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([self.detector])
Example #2
    def setUp(self):

        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_constraints.txt")

        detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([detector])
        self.session._ds_domain_pruning(holo_obj.pruning_threshold1,
                                        holo_obj.pruning_threshold2,
                                        holo_obj.pruning_dk_breakoff,
                                        holo_obj.pruning_clean_breakoff)
Example #3
    def __init__(self):
        self.holo_obj = HoloClean(
            holoclean_path="..",  # path to holoclean package
            verbose=False,
            # to limit possible values for training data
            pruning_threshold1=0.1,
            # to limit possible values for training data to less than k values
            pruning_clean_breakoff=6,
            # to limit possible values for dirty data (applied after
            # Threshold 1)
            pruning_threshold2=0,
            # to limit possible values for dirty data to less than k values
            pruning_dk_breakoff=6,
            # learning parameters
            learning_iterations=30,
            learning_rate=0.001,
            batch_size=5)
        self.session = Session(self.holo_obj)
Example #4
class Testing:
    def __init__(self):
        self.holo_obj = HoloClean(
            mysql_driver="../holoclean/lib/mysql-connector-java-5.1.44-bin.jar",
            verbose=True,
            timing_file='execution_time.txt')
        self.session = Session(self.holo_obj)

    def test(self):

        # dataset = "../tutorial/data/hospital_dataset.csv"
        # dataset = "../datasets/flights/flight_input_holo.csv"
        # dataset = "../datasets/food/food_input_holo.csv"
        dataset = "../datasets/unit_test/unit_test_dataset.csv"

        # denial_constraints = "../tutorial/data/hospital_constraints.txt"
        # denial_constraints = "../datasets/flights/flight_constraints.txt"
        # denial_constraints = "../datasets/food/food_constraints1.txt"
        denial_constraints = "../datasets/unit_test/unit_test_constraints.txt"

        flattening = 0
        # flattening = 1

        # ground_truth = "../tutorial/data/groundtruth.csv"
        # ground_truth = "../datasets/flights/flights_clean.csv"
        # ground_truth = "../datasets/food/food_clean.csv"
        ground_truth = 0

        # Ingesting Dataset and Denial Constraints
        self.session.load_data(dataset)
        self.session.load_denial_constraints(denial_constraints)

        # Error Detector
        detector = Mysql_DCErrorDetection(self.session.Denial_constraints,
                                          self.holo_obj, self.session.dataset)
        self.session.detect_errors(detector)

        self.session.repair()

        if ground_truth:
            self.session.compare_to_truth(ground_truth)
Example #5
class TestMysqlErrordetector(unittest.TestCase):
    def setUp(self):

        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_constraints.txt")

        self.detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([self.detector])

    def tearDown(self):
        del self.session

    def test_number_of_dk_cells(self):
        dataframe_C_dk = holo_obj.dataengine.get_table_to_dataframe(
            'C_dk', self.session.dataset)
        self.assertEquals(dataframe_C_dk.count(), 10)

    def test_number_of_clean_cells(self):
        dataframe_C_clean = holo_obj.dataengine.get_table_to_dataframe(
            'C_clean', self.session.dataset)
        self.assertEquals(dataframe_C_clean.count(), 5)

    def test_correction_of_clean_cells(self):
        dataframe_C_clean = holo_obj.dataengine.get_table_to_dataframe(
            'C_clean', self.session.dataset)

        anticipated_C_clean_cells = [["3", "D"], ["1", "D"], ["2", "D"],
                                     ["3", "A"], ["3", "B"]]

        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_C_clean_cells,
            StructType([
                StructField("ind", StringType(), False),
                StructField("attr", StringType(), False),
            ]))

        incorrect = anticipated_dataframe.subtract(dataframe_C_clean)
        self.assertEquals(incorrect.count(), 0)

    def test_correction_of_dk_cells(self):
        dataframe_C_dk = holo_obj.dataengine.get_table_to_dataframe(
            'C_dk', self.session.dataset)

        anticipated_dataframe_C_dk_cells = [["3", "C"], ["2", "C"], ["2", "A"],
                                            ["2", "E"], ["3", "E"], ["2", "B"],
                                            ["1", "A"], ["1", "C"], ["1", "B"],
                                            ["1", "E"]]

        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_dataframe_C_dk_cells,
            StructType([
                StructField("ind", StringType(), False),
                StructField("attr", StringType(), False),
            ]))

        incorrect = anticipated_dataframe.subtract(dataframe_C_dk)
        self.assertEquals(incorrect.count(), 0)
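One caveat worth noting for these comparisons: `DataFrame.subtract` is one-directional, so `anticipated_dataframe.subtract(actual).count() == 0` only shows that every anticipated row appears in the actual table, not that the two sets match exactly. A small helper along these lines (a sketch, not part of HoloClean) would assert set equality in both directions:

def assert_same_rows(test_case, expected_df, actual_df):
    # An empty expected-minus-actual difference only proves expected is a
    # subset of actual; check the reverse direction as well.
    test_case.assertEqual(expected_df.subtract(actual_df).count(), 0)
    test_case.assertEqual(actual_df.subtract(expected_df).count(), 0)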
Example #6
index_attribute = "index"

holo = HoloClean(
    holoclean_path="..",  # path to holoclean package
    verbose=False,
    pruning_threshold1=0.1,  # to limit possible values for training data
    # to limit possible values for training data to less than k values
    pruning_clean_breakoff=6,
    # to limit possible values for dirty data (applied after Threshold 1)
    pruning_threshold2=0,
    # to limit possible values for dirty data to less than k values
    pruning_dk_breakoff=6,
    learning_iterations=30,  # learning parameters
    learning_rate=0.001,
    batch_size=5)
session = Session(holo)
data = session.load_data(data_path)
dcs = session.load_denial_constraints(dc_path)
#data.select('City').show(15)
detector = SqlDCErrorDetection(session)
error_detector_list = []
error_detector_list.append(detector)
clean, dirty = session.detect_errors(error_detector_list)
#clean.head(5)
#dirty.head(5)
repaired = session.repair()
repaired = repaired.withColumn(index_attribute,
                               repaired[index_attribute].cast("int"))
repaired = repaired.sort(index_attribute)  # sort() returns a new DataFrame
shutil.rmtree("repaired")
# repaired.repartition(1).write.format('com.databricks.spark.csv').option("header", 'true').save('repaired')
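The commented-out writer above targets the external `com.databricks.spark.csv` package, which was only needed before Spark 2.0; newer Spark versions ship a built-in CSV writer. A sketch of the same save step with the built-in API (assuming Spark 2.0+, where `mode="overwrite"` also makes the preceding `shutil.rmtree` unnecessary):

# Built-in CSV writer (Spark 2.0+); coalesce(1) produces a single part file
repaired.coalesce(1).write.csv('repaired', header=True, mode='overwrite')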
Example #7
    def __init__(self):
        self.holo_obj = HoloClean(
            mysql_driver="../holoclean/lib/mysql-connector-java-5.1.44-bin.jar",
            verbose=True,
            timing_file='execution_time.txt')
        self.session = Session(self.holo_obj)
Example #8
# In this tutorial, we will walk step-by-step through the process of repairing a dataset in `HoloClean`.

# The dataset in question contains information about hospitals and is commonly used for research purposes. Errors are present in ~5% of the cells and there is significant duplicate information: the ideal environment for `HoloClean`.

# ### Step 1: Data Loading

# We begin by instantiating the `HoloClean` and `Session` objects needed to run the repairs. For a more detailed overview of these objects and the rest of our infrastructure, please see Tutorial 1.

# In[1]:

from holoclean.holoclean import HoloClean, Session

holo = HoloClean(
    mysql_driver="../holoclean/lib/mysql-connector-java-5.1.44-bin.jar")
session = Session(holo)

# Next, we load in the data and denial constraints needed for this dataset. Both pieces of information are stored in the MySQL database.

# In[2]:

data_path = "data/hospital_dataset.csv"

# loads data into our database and returns a PySpark dataframe of the initial data
data = session.load_data(data_path)

dc_path = "data/hospital_constraints.txt"

# loads denial constraints into our database and returns a simple list of dcs as strings
dcs = session.load_denial_constraints(dc_path)
Example #9
from holoclean.errordetection.sql_dcerrordetector import SqlDCErrorDetection
from holoclean.errordetection.sql_nullerrordetector import SqlnullErrorDetection

k_inferred = 2
holo_obj = HoloClean(holoclean_path="../..",
                     verbose=False,
                     pruning_threshold1=0.001,
                     pruning_clean_breakoff=6,
                     pruning_threshold2=0.0,
                     pruning_dk_breakoff=6,
                     learning_iterations=30,
                     learning_rate=0.001,
                     batch_size=5,
                     k_inferred=k_inferred)

session = Session(holo_obj)
dataset = "../data/hospital.csv"
session.load_data(dataset)

session.load_denial_constraints("../data/hospital_constraints.txt")
detector_list = []
Dcdetector = SqlDCErrorDetection(session)
Nulldetector = SqlnullErrorDetection(session)
detector_list.append(Dcdetector)
detector_list.append(Nulldetector)
session.detect_errors(detector_list)
session.repair()


class UnitTestPredictions(unittest.TestCase):
    def setUp(self):
Example #10
class Testing:
    def __init__(self):
        self.holo_obj = HoloClean(
            holoclean_path="..",  # path to holoclean package
            verbose=True,
            # to limit possible values for training data
            pruning_threshold1=0.0,
            # to limit possible values for training data to less than k values
            pruning_clean_breakoff=6,
            # to limit possible values for dirty data (applied after
            # Threshold 1)
            pruning_threshold2=0.0,
            # to limit possible values for dirty data to less than k values
            pruning_dk_breakoff=6,
            # learning parameters
            learning_iterations=30,
            learning_rate=0.001,
            batch_size=5,
            # number of inferred values
            k_inferred=2)
        self.session = Session(self.holo_obj)

    def test(self):

        t1 = time.time()

        dataset = "data/hospital.csv"
        print("using dataset: {}".format(dataset))
        denial_constraints = "data/hospital_constraints.txt"
        print("using denial_constraints: {}".format(denial_constraints))
        ground_truth = "data/hospital_clean.csv"
        print("using ground_truth: {}".format(ground_truth))

        # uncomment this if you don't have ground truth
        # ground_truth = 0

        # Ingesting Dataset and Denial Constraints
        self.session.load_data(dataset)
        self.session.load_denial_constraints(denial_constraints)

        # Error detectors: we use two, one for DC violations and one for null values

        t3 = time.time()
        detector_list = []
        Dcdetector = SqlDCErrorDetection(self.session)
        Nulldetector = SqlnullErrorDetection(self.session)
        detector_list.append(Dcdetector)
        detector_list.append(Nulldetector)
        self.session.detect_errors(detector_list)

        t4 = time.time()
        if self.holo_obj.verbose:
            self.holo_obj.logger.info("Error detection time: " + str(t4 - t3))

        self.session.repair()

        if ground_truth:
            self.session.compare_to_truth(ground_truth)

        t2 = time.time()
        if self.holo_obj.verbose:
            self.holo_obj.logger.info("Total time:" + str(t2 - t1))
            print "Execution finished"

        exit(0)
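The t1/t3/t4 bookkeeping above is easy to get wrong as more stages are added. A small context manager (a sketch, not part of HoloClean) expresses the same wall-clock timing pattern:

import time
from contextlib import contextmanager

@contextmanager
def timed(label, logger=None):
    # Logs elapsed wall-clock time for the enclosed block
    start = time.time()
    yield
    message = label + ": " + str(time.time() - start)
    if logger:
        logger.info(message)
    else:
        print(message)

# usage: with timed("Error detection time", holo_obj.logger): ...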
Example #11
class TestPruning(unittest.TestCase):
    def setUp(self):

        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_constraints.txt")

        detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([detector])
        self.session._ds_domain_pruning(holo_obj.pruning_threshold1,
                                        holo_obj.pruning_threshold2,
                                        holo_obj.pruning_dk_breakoff,
                                        holo_obj.pruning_clean_breakoff)

    def test_possible_values_clean(self):
        possible_values_clean = holo_obj.dataengine.get_table_to_dataframe(
            "Possible_values_clean", self.session.dataset)
        anticipated_possible_values_clean = [["1", "3", "A", "p", "0", "1"],
                                             ["1", "3", "A", "u", "1", "2"],
                                             ["2", "3", "B", "y", "1", "1"],
                                             ["2", "3", "B", "z", "0", "2"],
                                             ["2", "3", "B", "w", "0", "3"]]

        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_possible_values_clean, StructType([
                StructField("vid", StringType(), False),
                StructField("tid", StringType(), False),
                StructField("attr_name", StringType(), False),
                StructField("attr_val", StringType(), False),
                StructField("observed", StringType(), False),
                StructField("domain_id", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(
            possible_values_clean)
        self.assertEquals(incorrect.count(), 0)

    def test_possible_values_dk(self):
        possible_values_dk = holo_obj.dataengine.get_table_to_dataframe(
            "Possible_values_dk", self.session.dataset)
        anticipated_possible_values_dk = [["1", "1", "A", "p", "1", "1"],
                                          ["1", "1", "A", "u", "0", "2"],
                                          ["2", "1", "B", "y", "0", "1"],
                                          ["2", "1", "B", "z", "0", "2"],
                                          ["2", "1", "B", "w", "1", "3"],
                                          ["3", "1", "C", "m", "0", "1"],
                                          ["3", "1", "C", "f", "1", "2"],
                                          ["4", "1", "E", "r", "1", "1"],
                                          ["5", "2", "A", "p", "1", "1"],
                                          ["5", "2", "A", "u", "0", "2"],
                                          ["6", "2", "B", "y", "0", "1"],
                                          ["6", "2", "B", "z", "1", "2"],
                                          ["6", "2", "B", "w", "0", "3"],
                                          ["7", "2", "C", "m", "0", "1"],
                                          ["7", "2", "C", "f", "1", "2"],
                                          ["8", "2", "E", "r", "1", "1"],
                                          ["9", "3", "C", "m", "1", "1"],
                                          ["9", "3", "C", "f", "0", "2"],
                                          ["10", "3", "E", "r", "1", "1"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_possible_values_dk, StructType([
                StructField("vid", StringType(), False),
                StructField("tid", StringType(), False),
                StructField("attr_name", StringType(), False),
                StructField("attr_val", StringType(), False),
                StructField("observed", StringType(), False),
                StructField("domain_id", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(
            possible_values_dk)
        self.assertEquals(incorrect.count(), 0)

    def test_kij_dk(self):
        kij_dk = holo_obj.dataengine.get_table_to_dataframe(
            "Kij_lookup_dk", self.session.dataset)
        anticipated_kij_dk = [["1", "1", "A", "2"],
                              ["2", "1", "B", "3"],
                              ["3", "1", "C", "2"],
                              ["4", "1", "E", "1"],
                              ["5", "2", "A", "2"],
                              ["6", "2", "B", "3"],
                              ["7", "2", "C", "2"],
                              ["8", "2", "E", "1"],
                              ["9", "3", "C", "2"],
                              ["10", "3", "E", "1"]]

        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_kij_dk, StructType([
                StructField("vid", StringType(), False),
                StructField("tid", StringType(), False),
                StructField("attr_name", StringType(), False),
                StructField("k_ij", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(
            kij_dk)
        self.assertEquals(incorrect.count(), 0)

    def test_kij_clean(self):
        kij_clean = holo_obj.dataengine.get_table_to_dataframe(
            "Kij_lookup_clean", self.session.dataset)
        anticipated_kij_clean = [["1", "3", "A", "2"],
                                 ["2", "3", "B", "3"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_kij_clean, StructType([
                StructField("vid", StringType(), False),
                StructField("tid", StringType(), False),
                StructField("attr_name", StringType(), False),
                StructField("k_ij", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(
            kij_clean)
        self.assertEquals(incorrect.count(), 0)
Example #12
class TestDCFeaturizer(unittest.TestCase):
    def setUp(self):

        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_non_symmetric_constraints.txt")

        detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([detector])
        self.session._ds_domain_pruning(holo_obj.pruning_threshold1,
                                        holo_obj.pruning_threshold2,
                                        holo_obj.pruning_dk_breakoff,
                                        holo_obj.pruning_clean_breakoff)

    def test_DC_query_for_clean(self):

        dc_signal = SignalDC(self.session.Denial_constraints, self.session)

        self.session._add_featurizer(dc_signal)

        temp_list = dc_signal._create_all_relaxed_dc()
        relaxed_dcs = []
        for relaxed_dc in temp_list:
            relaxed_dcs.append(relaxed_dc[0])

        expected_r_dcs = \
            ["postab.tid = t1." + GlobalVariables.index_name +
             " AND postab.attr_name = 'A' AND postab.attr_val=t2.A AND  t1." +
             GlobalVariables.index_name + " < t2." + GlobalVariables.index_name
             + " AND  t1.B>t2.B",
             "postab.tid = t2." + GlobalVariables.index_name +
             " AND postab.attr_name ='A' AND t1.A=postab.attr_val AND  t1." +
             GlobalVariables.index_name + " < t2." + GlobalVariables.index_name
             + " AND  t1.B>t2.B",
             "postab.tid = t1." + GlobalVariables.index_name +
             " AND postab.attr_name = 'B' AND postab.attr_val>t2.B AND  t1." +
             GlobalVariables.index_name + " < t2." + GlobalVariables.index_name
             + " AND  t1.A=t2.A",
             "postab.tid = t2." + GlobalVariables.index_name +
             " AND postab.attr_name ='B' AND t1.B>postab.attr_val AND  t1." +
             GlobalVariables.index_name + " < t2." + GlobalVariables.index_name
             + " AND  t1.A=t2.A",
             "postab.tid = t1." + GlobalVariables.index_name +
             " AND postab.attr_name = 'C' AND postab.attr_val>='f' AND  t1." +
             GlobalVariables.index_name + " < t2." + GlobalVariables.index_name
             + " AND  t2.C<='m' AND  t1.E=t2.E",
             "postab.tid = t2." + GlobalVariables.index_name +
             " AND postab.attr_name = 'C' AND postab.attr_val<='m' AND  t1." +
             GlobalVariables.index_name + " < t2." + GlobalVariables.index_name
             + " AND  t1.C>='f' AND  t1.E=t2.E",
             "postab.tid = t1." + GlobalVariables.index_name +
             " AND postab.attr_name = 'E' AND postab.attr_val=t2.E AND  t1." +
             GlobalVariables.index_name + " < t2." + GlobalVariables.index_name
             + " AND  t1.C>='f' AND  t2.C<='m'",
             "postab.tid = t2." + GlobalVariables.index_name +
             " AND postab.attr_name ='E' AND t1.E=postab.attr_val AND  t1." +
             GlobalVariables.index_name + " < t2." + GlobalVariables.index_name
             + " AND  t1.C>='f' AND  t2.C<='m'"
             ]

        self.assertEquals(relaxed_dcs, expected_r_dcs)
Example #13
class TestInitFeaturizer(unittest.TestCase):
    def setUp(self):

        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_constraints.txt")

        detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([detector])
        self.session._ds_domain_pruning(holo_obj.pruning_threshold1,
                                        holo_obj.pruning_threshold2,
                                        holo_obj.pruning_dk_breakoff,
                                        holo_obj.pruning_clean_breakoff)

        self.init_signal = SignalInit(self.session)

    def tearDown(self):
        del self.session

    def test_Init_query_for_clean(self):
        query = self.init_signal.get_query()[0]

        Init_feature_dataframe = \
            holo_obj.dataengine.query(query, 1)

        anticipated_Init_feature_C_clean_cells = [["1", "2", "1", "1"],
                                                  ["2", "1", "1", "1"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_Init_feature_C_clean_cells,
            StructType([
                StructField("vid", StringType(), False),
                StructField("assigned_val", StringType(), False),
                StructField("feature", StringType(), False),
                StructField("count", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(Init_feature_dataframe)
        self.assertEquals(incorrect.count(), 0)

    def test_Init_query_for_dk(self):
        query = self.init_signal.get_query(0)[0]
        Init_feature_dataframe = \
            holo_obj.dataengine.query(query, 1)

        anticipated_Init_feature_C_dk_cells = [["1", "1", "1", "1"],
                                               ["2", "3", "1", "1"],
                                               ["3", "2", "1", "1"],
                                               ["4", "1", "1", "1"],
                                               ["5", "1", "1", "1"],
                                               ["6", "2", "1", "1"],
                                               ["7", "2", "1", "1"],
                                               ["8", "1", "1", "1"],
                                               ["9", "1", "1", "1"],
                                               ["10", "1", "1", "1"]]
        anticipated_dataframe = holo_obj.spark_session.createDataFrame(
            anticipated_Init_feature_C_dk_cells,
            StructType([
                StructField("vid", StringType(), False),
                StructField("assigned_val", StringType(), False),
                StructField("feature", StringType(), False),
                StructField("count", StringType(), False),
            ]))
        incorrect = anticipated_dataframe.subtract(Init_feature_dataframe)
        self.assertEquals(incorrect.count(), 0)
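Most of the examples above define `unittest.TestCase` subclasses, so a minimal runner (assuming the test classes and the module-level `holo_obj`/`session` setup live in one file) is the standard entry point:

if __name__ == "__main__":
    # Discovers and runs every TestCase defined in this module
    unittest.main()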