def setUp(self):
    self.session = Session(holo_obj)
    self.dataset = "../data/unit_test/unit_test_dataset.csv"
    self.session.load_data(self.dataset)
    self.session.load_denial_constraints(
        "../data/unit_test/unit_test_constraints.txt")
    self.detector = SqlDCErrorDetection(self.session)
    self.session.detect_errors([self.detector])
def setUp(self):
    self.session = Session(holo_obj)
    self.dataset = "../data/unit_test/unit_test_dataset.csv"
    self.session.load_data(self.dataset)
    self.session.load_denial_constraints(
        "../data/unit_test/unit_test_constraints.txt")
    detector = SqlDCErrorDetection(self.session)
    self.session.detect_errors([detector])
    # Prune the candidate-value domain using the thresholds configured on holo_obj.
    self.session._ds_domain_pruning(holo_obj.pruning_threshold1,
                                    holo_obj.pruning_threshold2,
                                    holo_obj.pruning_dk_breakoff,
                                    holo_obj.pruning_clean_breakoff)
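Given either fixture above, a follow-on test might look like the sketch below. It assumes the (clean, dirty) DataFrame pair that detect_errors returns in the tutorial snippet at the end of this section; the test name and assertions are illustrative, not part of the original suite.

def test_error_detection_splits_cells(self):
    # Sketch: run detection and check the clean/dirty split (illustrative).
    clean, dirty = self.session.detect_errors(
        [SqlDCErrorDetection(self.session)])
    # The deliberately dirty unit-test data should yield some violations.
    self.assertGreater(dirty.count(), 0)
    # No cell should be flagged as both clean and dirty.
    self.assertEqual(clean.intersect(dirty).count(), 0)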
def test(self):
    t1 = time.time()
    dataset = "data/hospital.csv"
    print("using dataset: {}".format(dataset))
    denial_constraints = "data/hospital_constraints.txt"
    print("using denial_constraints: {}".format(denial_constraints))
    ground_truth = "data/hospital_clean.csv"
    print("using ground_truth: {}".format(ground_truth))
    # Uncomment the next line if you don't have ground truth
    # ground_truth = 0

    # Ingest the dataset and the denial constraints
    self.session.load_data(dataset)
    self.session.load_denial_constraints(denial_constraints)

    # Error detectors: one for DC violations, one for null values
    t3 = time.time()
    detector_list = []
    dc_detector = SqlDCErrorDetection(self.session)
    null_detector = SqlnullErrorDetection(self.session)
    detector_list.append(dc_detector)
    detector_list.append(null_detector)
    self.session.detect_errors(detector_list)
    t4 = time.time()
    if self.holo_obj.verbose:
        self.holo_obj.logger.info("Error detection time: " + str(t4 - t3))

    self.session.repair()
    if ground_truth:
        self.session.compare_to_truth(ground_truth)
    t2 = time.time()
    if self.holo_obj.verbose:
        self.holo_obj.logger.info("Total time: " + str(t2 - t1))
    print("Execution finished")
    exit(0)
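A test like this is normally driven by unittest; the driver below is a sketch and is not shown in the original module, which (given the exit(0) call) may instead be run script-style.

import unittest

if __name__ == "__main__":
    # Discovers the setUp/test methods on any TestCase subclass in this module.
    unittest.main()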
# The constructor call below is reconstructed: the snippet begins mid-argument
# list, and HoloClean is the assumed receiving class (as in `Session(holo)`).
holo = HoloClean(
    verbose=False,
    # limit the possible values for training data
    pruning_threshold1=0.1,
    # limit the possible values for training data to fewer than k values
    pruning_clean_breakoff=6,
    # limit the possible values for dirty data (applied after threshold 1)
    pruning_threshold2=0,
    # limit the possible values for dirty data to fewer than k values
    pruning_dk_breakoff=6,
    # learning parameters
    learning_iterations=30,
    learning_rate=0.001,
    batch_size=5)

session = Session(holo)
data = session.load_data(data_path)
dcs = session.load_denial_constraints(dc_path)
# data.select('City').show(15)

detector = SqlDCErrorDetection(session)
error_detector_list = []
error_detector_list.append(detector)
clean, dirty = session.detect_errors(error_detector_list)
# clean.head(5)
# dirty.head(5)

repaired = session.repair()
# index_attribute (the dataset's index column) is assumed to be defined earlier.
repaired = repaired.withColumn(index_attribute,
                               repaired[index_attribute].cast("int"))
repaired = repaired.sort(index_attribute)  # sort returns a new DataFrame
shutil.rmtree("repaired", ignore_errors=True)  # clear any previous output
# repaired.repartition(1).write.format('com.databricks.spark.csv').option("header", 'true').save('repaired')
repaired.coalesce(1).write.format('com.databricks.spark.csv').option(
    "header", 'true').save('repaired')
# session.compare_to_truth(gt_path)
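As a quick sanity check, the saved output can be read back with Spark. This is a sketch: `spark` is an assumed SparkSession handle (not created in the snippet above), and 'repaired' is the directory written by the save call.

# Re-read the repaired CSV and count rows to confirm the write succeeded.
check = spark.read.format('com.databricks.spark.csv') \
    .option("header", 'true').load('repaired')
print("repaired rows: {}".format(check.count()))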