Example 1
def __init__(self):
    self.holo_obj = HoloClean(
        holoclean_path="..",  # path to the HoloClean package
        verbose=False,
        # limit the candidate values considered for training (clean) data
        pruning_threshold1=0.1,
        # cap the candidate values for training data at k values
        pruning_clean_breakoff=6,
        # limit the candidate values considered for dirty data
        # (applied after threshold 1)
        pruning_threshold2=0,
        # cap the candidate values for dirty data at k values
        pruning_dk_breakoff=6,
        # learning parameters
        learning_iterations=30,
        learning_rate=0.001,
        batch_size=5)
    self.session = Session(self.holo_obj)
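
A typical pipeline built on this initialization (mirroring Example 5 below) loads the data and denial constraints, runs error detection, and repairs. A minimal sketch, assuming the class defining this __init__ is named Pipeline (a hypothetical name) and that the file paths exist:

from holoclean.errordetection.sql_dcerrordetector import SqlDCErrorDetection

pipeline = Pipeline()  # hypothetical name for the class defining __init__ above
session = pipeline.session
session.load_data("../data/hospital.csv")  # illustrative path
session.load_denial_constraints("../data/hospital_constraints.txt")
detector = SqlDCErrorDetection(session)  # flags denial-constraint violations
session.detect_errors([detector])
session.repair()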
Example 2
import unittest
import sys
sys.path.append("../..")
from holoclean.holoclean import HoloClean, Session
from holoclean.errordetection.sql_dcerrordetector import SqlDCErrorDetection
from pyspark.sql.types import *

holo_obj = HoloClean(holoclean_path="../..",
                     verbose=True,
                     timing_file='execution_time.txt',
                     learning_iterations=50,
                     learning_rate=0.001,
                     batch_size=20)


class TestMysqlErrordetector(unittest.TestCase):
    def setUp(self):
        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_constraints.txt")

        # Flag cells that violate the loaded denial constraints
        self.detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([self.detector])

    def tearDown(self):
        del self.session

    def test_number_of_dk_cells(self):
Example 3
def __init__(self):
    self.holo_obj = HoloClean(
        mysql_driver="../holoclean/lib/mysql-connector-java-5.1.44-bin.jar",
        verbose=True,
        timing_file='execution_time.txt')
    self.session = Session(self.holo_obj)
Example 4
# ## Tutorial 2: A Complete HoloClean Pipeline

# In this tutorial, we will walk step-by-step through the process of repairing a dataset in `HoloClean`.

# The dataset in question contains information about hospitals and is commonly used for research purposes. Errors are present in roughly 5% of the cells, and there is significant duplicate information, making it an ideal environment for `HoloClean`.

# ### Step 1: Data Loading

# We begin by instantiating the `HoloClean` and `Session` objects needed to run the repairs. For a more detailed overview of these objects and the rest of our infrastructure, please see Tutorial 1.

# In[1]:

from holoclean.holoclean import HoloClean, Session

holo = HoloClean(
    mysql_driver="../holoclean/lib/mysql-connector-java-5.1.44-bin.jar")
session = Session(holo)

# Next, we load in the data and denial constraints needed for this dataset. Both pieces of information are stored in the MySQL database.

# In[2]:

data_path = "data/hospital_dataset.csv"

# Loads the data into our database and returns a PySpark DataFrame of the initial data
data = session.load_data(data_path)

dc_path = "data/hospital_constraints.txt"

# Loads the denial constraints into our database and returns a list of DCs as plain strings
dcs = session.load_denial_constraints(dc_path)
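
# A minimal sanity check: `dcs` is a plain list of constraint strings,
# so we can print each one to confirm the file was parsed as expected.
for dc in dcs:
    print(dc)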
Example 5
import unittest
import sys
sys.path.append("../..")
from holoclean.holoclean import HoloClean, Session
from holoclean.errordetection.sql_dcerrordetector import SqlDCErrorDetection
from holoclean.errordetection.sql_nullerrordetector import SqlnullErrorDetection

k_inferred = 2
holo_obj = HoloClean(holoclean_path="../..",
                     verbose=False,
                     pruning_threshold1=0.001,
                     pruning_clean_breakoff=6,
                     pruning_threshold2=0.0,
                     pruning_dk_breakoff=6,
                     learning_iterations=30,
                     learning_rate=0.001,
                     batch_size=5,
                     k_inferred=k_inferred)

session = Session(holo_obj)
dataset = "../data/hospital.csv"
session.load_data(dataset)

session.load_denial_constraints("../data/hospital_constraints.txt")
# Register both detectors: one for denial-constraint violations,
# one for null values
detector_list = []
dc_detector = SqlDCErrorDetection(session)
null_detector = SqlnullErrorDetection(session)
detector_list.append(dc_detector)
detector_list.append(null_detector)
session.detect_errors(detector_list)
session.repair()
Example 6
import unittest
import sys
sys.path.append("../..")
from holoclean.holoclean import HoloClean, Session
from holoclean.errordetection.sql_dcerrordetector import SqlDCErrorDetection
from pyspark.sql.types import *


holo_obj = HoloClean(
    holoclean_path="../..",
    verbose=True,
    timing_file='execution_time.txt')


class TestPruning(unittest.TestCase):
    def setUp(self):
        self.session = Session(holo_obj)
        self.dataset = "../data/unit_test/unit_test_dataset.csv"
        self.session.load_data(self.dataset)
        self.session.load_denial_constraints(
            "../data/unit_test/unit_test_constraints.txt")

        # Detect DC violations, then prune each cell's candidate-value
        # domain using the thresholds configured on the HoloClean object
        detector = SqlDCErrorDetection(self.session)
        self.session.detect_errors([detector])
        self.session._ds_domain_pruning(holo_obj.pruning_threshold1,
                                        holo_obj.pruning_threshold2,
                                        holo_obj.pruning_dk_breakoff,
                                        holo_obj.pruning_clean_breakoff)

    def test_possible_values_clean(self):