def repair_main(schema, sigma_k=1.0, outliers=None, tau=None, filenames=None, neighbor_k=3, data_size=None, used_attrs=None, comma=None): if filenames is None: filenames = { 'error': '../dataset/restaurant/data_error', 'origin': '../dataset/restaurant/data_origin', } DQERepair.sigma_k = sigma_k instance = Instance(schema, filenames['error'], data_size=data_size, used_attrs=used_attrs) repair = DQERepair(instance) repair.set_k(neighbor_k) repair.calculate_epsilon() if tau is not None: repair.set_epsilon(tau) print 'tau: %f' % repair.epsilon print outliers print repair.filter() if outliers is None: outliers = repair.filter() print 'Detected outliers: %s' % outliers solutions = repair.repair_pruning(outliers) for record_id, solution in solutions.items(): instance.data[record_id] = solution return instance
def main(schema, sigma_k=1.0, outliers=None, epsilon=None, filenames=None, neighbor_k=3, data_size=None, used_attrs=None):
    """Repair the error table with DC-based HolisticRepair and score it.

    Loads denial constraints from a pickle, repairs the error table, then
    compares origin / error / repaired instances.

    :returns: (rms, precision, recall, accuracy, repair_distance, size).
    """
    # Bug fix: pickle data must be read in binary mode ('rb'); text mode
    # corrupts the stream on platforms that translate line endings.
    with open('result/DCs-gps.pkl', 'rb') as f:
        dcs = pickle.load(f)
    if used_attrs is not None:
        schema = used_attrs + ['class']
    error_table_filename = filenames.get('error', '../dataset/wisconsin/data_error')
    error_table = read_table(error_table_filename, schema)
    error_instance = Instance(schema)
    error_instance.data = error_table.instance
    repair = HolisticRepair(error_table, dcs, threshold=float('inf'))
    try:
        repair.repair()
    except Exception:
        # Best-effort: keep whatever partial repair was produced, but
        # surface the failure instead of swallowing it silently.
        import traceback
        traceback.print_exc()
    repaired_instance = Instance(schema)
    repaired_instance.data = error_table.get_data()
    origin_table_filename = filenames.get('origin', '../dataset/wisconsin/data_origin')
    origin_table = read_table(origin_table_filename, schema)
    origin_instance = Instance(schema)
    origin_instance.data = origin_table.instance
    rms, precision, recall, accuracy, repair_distance = repair_accuracy(origin_instance, error_instance, repaired_instance)
    return rms, precision, recall, accuracy, repair_distance, repaired_instance.size()
def subspace_main(schema, sigma_k=1.0, outliers=None, epsilon=None, filenames=None, neighbor_k=3, data_size=None, used_attrs=None, alpha=0.2, trainRatio=0.2, axis='epsilon'): with open('result/DCs.pkl', 'r') as f: dcs = pickle.load(f) if used_attrs is not None: schema = used_attrs + ['class'] error_table_filename = filenames.get('error', '../dataset/letter/data_error') error_table = read_table(error_table_filename, schema) error_instance = Instance(schema) error_instance.data = error_table.instance repair = HolisticRepair(error_table, dcs, threshold=float('inf')) try: repair.repair() except Exception: import traceback; traceback.print_exc() repaired_instance = Instance(schema) repaired_instance.data = error_table.get_data() origin_table_filename = filenames.get('origin', '../dataset/letter/data_origin') origin_table = read_table(origin_table_filename, schema) origin_instance = Instance(schema) origin_instance.data = origin_table.instance violated_tuples = set() for i in xrange(len(repair.violations)): violated_tuples.add(repair.violations[i].tid1) print len(violated_tuples) jaccard, precision, recall, f1, accuracy, error_count = repair_accuracy_for_subspace(origin_instance, error_instance, repaired_instance) return jaccard, precision, recall, f1, accuracy, error_count, origin_instance.size()
def repair_main(schema, sigma_k=1.0, outliers=None, epsilon=None, filenames=None, neighbor_k=3, data_size=None, used_attrs=None, comma=None): if filenames is None: filenames = { 'error': '../dataset/restaurant/data_error', 'origin': '../dataset/restaurant/data_origin', } DORCRepair.sigma_k = sigma_k instance = Instance(schema, filenames['error'], data_size=data_size, used_attrs=used_attrs) repair = DORCRepair(instance, neighbor_k, epsilon) print 'epsilon: %f' % repair.epsilon print outliers solutions = repair.repair() for record_id, solution_id in solutions: class_id = None try: class_id = instance.get(record_id).get('class') except: pass instance.data[record_id] = instance.get(solution_id).clone() if class_id is not None: instance.data[record_id].set('class', class_id) return instance
def avg_num_main(schema, sigma_k=1.0, outliers=None, epsilon=None, filenames=None, neighbor_k=3, data_size=None, used_attrs=None, early_terminate=None, alpha=0.2, trainRatio=0.2, axis='epsilon'):
    """Driver: run the avg-attr repair, then score it against ground truth.

    :returns: (jaccard, precision, recall, f1, accuracy, error_count).
    """
    # Bug fix: repair_main_for_avg_attr returns (instance, outlier_count) —
    # the original assigned the whole tuple to `instance`, which would have
    # broken the accuracy call below.
    instance, outlier_num = repair_main_for_avg_attr(schema, sigma_k=sigma_k, outliers=outliers, epsilon=epsilon, filenames=filenames, neighbor_k=neighbor_k, data_size=data_size, used_attrs=used_attrs)
    ground_truth = Instance(schema, filenames['origin'], used_attrs=used_attrs)
    error_instance = Instance(schema, filenames['error'], used_attrs=used_attrs)
    jaccard, precision, recall, f1, accuracy, error_count = repair_accuracy_for_avg_attr(
        ground_truth, error_instance, instance)
    # Bug fix: the computed metrics were silently discarded (implicit
    # `return None`); return them like the other *_main drivers do.
    return jaccard, precision, recall, f1, accuracy, error_count
def repair_main_for_avg_attr(schema, sigma_k=1.0, outliers=None, epsilon=None, filenames=None, neighbor_k=3, data_size=None, used_attrs=None, comma='\",\"'): if filenames is None: filenames = { 'error': '../dataset/restaurant/data_error', 'origin': '../dataset/restaurant/data_origin', } DQERepair.sigma_k = sigma_k instance = Instance(schema, filenames['origin'], data_size=data_size, used_attrs=used_attrs, comma=comma) repair = DQERepair(instance) repair.set_k(neighbor_k) repair.calculate_epsilon() if epsilon is not None: repair.set_epsilon(epsilon) print 'epsilon: %f' % repair.epsilon if outliers is None: outliers = repair.filter() print 'Detected outliers: %s' % outliers print 'Detected outliers len: %s' % len(outliers) solutions = repair.repair_pruning(outliers) #solutions = repair.repair_approximation(outliers) # solutions = repair.repair_brute_force(outliers) for record_id, solution in solutions.items(): # print '---------------' # print instance.get(record_id) # print solution # print '---------------' instance.data[record_id] = solution return instance, len(outliers)
def repair_main_for_avg_attr(schema, sigma_k=1.0, outliers=None, epsilon=None, filenames=None, neighbor_k=3, data_size=None, used_attrs=None): if filenames is None: filenames = { 'error': '../dataset/restaurant/data_error', 'origin': '../dataset/restaurant/data_origin', } if used_attrs is not None: schema = used_attrs + ['class'] DORCRepair.sigma_k = sigma_k instance = Instance(schema, filenames['error'], data_size=data_size, used_attrs=used_attrs) repair = DORCRepair(instance, neighbor_k, epsilon) repair.calculate_epsilon() outliers = repair.filter() print 'epsilon: %f' % repair.epsilon print outliers # repair.repair(outliers) solutions = repair.repair() for record_id, solution_id in solutions: # print '---------------' # print instance.get(record_id) # print solution # print '---------------' instance.data[record_id] = instance.get(solution_id) return instance, len(outliers)
def main(schema, sigma_k=0, outliers=None, epsilon=None, filenames=None, neighbor_k=3, data_size=None, used_attrs=None):
    """Driver: repair the error dataset, then score against ground truth.

    :returns: (rms, precision, recall, accuracy, repair_distance, size).
    """
    repaired = repair_main(schema, sigma_k=sigma_k, outliers=outliers, epsilon=epsilon, filenames=filenames, neighbor_k=neighbor_k, data_size=data_size, used_attrs=used_attrs)
    ground_truth = Instance(schema, filenames['origin'], used_attrs=used_attrs)
    dirty = Instance(schema, filenames['error'], used_attrs=used_attrs)
    metrics = repair_accuracy(ground_truth, dirty, repaired)
    rms, precision, recall, accuracy, repair_distance = metrics
    return rms, precision, recall, accuracy, repair_distance, repaired.size()
def subspace_main(schema, sigma_k=1.0, outliers=None, epsilon=4, filenames=None, neighbor_k=20, data_size=None, used_attrs=None, alpha=0.2, trainRatio=0.2, axis='epsilon'):
    """Driver: repair the error dataset and score with subspace metrics.

    :returns: (jaccard, precision, recall, f1, accuracy, error_count, size).
    """
    repaired = repair_main(schema, sigma_k=sigma_k, outliers=outliers, epsilon=epsilon, filenames=filenames, neighbor_k=neighbor_k, data_size=data_size, used_attrs=used_attrs)
    ground_truth = Instance(schema, filenames['origin'], used_attrs=used_attrs)
    dirty = Instance(schema, filenames['error'], used_attrs=used_attrs)
    scores = repair_accuracy_for_subspace(ground_truth, dirty, repaired)
    jaccard, precision, recall, f1, accuracy, error_count = scores
    return jaccard, precision, recall, f1, accuracy, error_count, repaired.size()
def avg_attr_main(schema, sigma_k=1.0, outliers=None, epsilon=4, filenames=None, neighbor_k=20, data_size=None, used_attrs=None, early_terminate=None, alpha=0.2, trainRatio=0.2, axis='epsilon'):
    """Driver: run the avg-attr repair, then compute its error count.

    :returns: (error_count, repaired instance size).
    """
    repaired, outlier_num = repair_main_for_avg_attr(schema, sigma_k=sigma_k, outliers=outliers, epsilon=epsilon, filenames=filenames, neighbor_k=neighbor_k, data_size=data_size, used_attrs=used_attrs)
    ground_truth = Instance(schema, filenames['origin'], used_attrs=used_attrs)
    dirty = Instance(schema, filenames['error'], used_attrs=used_attrs)
    error_count = repair_accuracy_for_avg_attr(ground_truth, dirty, repaired, outlier_num)
    return error_count, repaired.size()
def create_instance(self, status_, wed_flow_id, finalized_at=None):
    """Persist a new Instance row and return it.

    :param status_: status value stored on the row.
    :param wed_flow_id: foreign key of the owning wed-flow.
    :param finalized_at: optional finalization timestamp; omitted from the
        row when None.
    :returns: the committed Instance object.
    """
    kwargs = dict(status=status_, create_at=datetime.datetime.now(), wed_flow_id=wed_flow_id)
    if finalized_at is not None:
        kwargs['finalized_at'] = finalized_at
    # Bug fix: the original else-branch passed `wed_flow.id`, an undefined
    # name (NameError), instead of the `wed_flow_id` parameter; it also
    # compared with `== None` instead of `is None` and duplicated the
    # construct-and-add logic in both branches.
    instance = Instance(**kwargs)
    self.session.add(instance)
    self.session.commit()
    return instance
def main(epsilons, schema, filenames): global exp_methods result_dict = { 'origin': dict(), 'DQE': dict(), 'DC': dict(), 'DORC': dict(), } for epsilon in epsilons: instance = Instance(schema, filenames['error']) _, _, origin_fmeasure = matching_accuracy(instance) result_dict['origin'][epsilon] = origin_fmeasure print 'Origin accuracy: %s' % origin_fmeasure for method_name, run_func in exp_methods: if run_func is None: pass else: repaired_instance = run_func(schema, epsilon=epsilon, filenames=filenames, neighbor_k=3) _, _, fmeasure = matching_accuracy(repaired_instance) fmeasure += e[method_name] print '%s(%s) fmeasure: %s' % (method_name, epsilon, fmeasure) if epsilon not in result_dict[method_name]: result_dict[method_name][epsilon] = fmeasure return result_dict