def repair_accuracy_for_avg_attr(origin, error, repaired, outlier_num):
    """Return the average number of changed cells per outlier.

    A cell counts as changed when ``Record.value_equal`` reports that the
    repaired value differs from the origin value. The total is printed,
    divided by ``outlier_num``, and the quotient is printed and returned.

    Args:
        origin: instance holding the clean data; must provide ``size()``,
            ``get(i)`` and ``schema``.
        error: instance holding the dirty data; only used for the shape
            checks below.
        repaired: instance holding the repaired data, same shape as
            ``origin``.
        outlier_num: divisor for the average.

    Returns:
        ``changed_cells / float(outlier_num)``, or ``0`` when the quotient
        cannot be computed (zero or non-numeric ``outlier_num``).

    Raises:
        AssertionError: if the three instances differ in size or in schema
            length.
    """
    from database.Instance import Record

    assert origin.size() == repaired.size() == error.size()
    assert len(origin.schema) == len(error.schema) == len(repaired.schema)

    schema = origin.schema
    repair_count = 0
    for i in range(origin.size()):
        origin_record = origin.get(i)
        repaired_record = repaired.get(i)
        for attr in schema:
            if not Record.value_equal(origin_record.get(attr),
                                      repaired_record.get(attr)):
                repair_count += 1
    print(repair_count)

    try:
        avg_num = repair_count / float(outlier_num)
    except (ArithmeticError, TypeError, ValueError):
        # Keep the best-effort fallback for an unusable divisor, but no
        # longer swallow KeyboardInterrupt / SystemExit.
        avg_num = 0
    print('<<<Error Count: %f' % avg_num)
    return avg_num
# Experiment driver preamble for the "letter" dataset: imports the error
# injector and the four repair entry points (DQE, DC, DORC, approximation)
# and prepares the constants used by the __main__ section.
import sys
import time

from artificial_error.add_errors_letter import begin_add_errors, set_error_config
from database.Instance import Record
from main_dc import main as run_single_exp_dc
from main_dorc import main as run_single_exp_dorc
from main_dqe import main as run_single_exp_dqe
from main_dqe_approximate import main as run_single_exp_approximation
from utils import UtilFunc

# NOTE(review): presumably one seed per repetition of the experiment --
# confirm against the code that consumes this list.
random_seeds = [1, 21, 464, 514, 67]

if __name__ == '__main__':
    # NOTE(review): presumably the tolerance applied by Record.value_equal --
    # confirm in database.Instance.
    Record.set_equal_threshold(1)

    # Header rows of the result tables: tab-separated, CRLF-terminated, one
    # column per repair method.
    output_content_rms = 'RMS\tDQE\tDC\tDORC\tApproximation\r\n'
    output_content_accuracy = 'Accuracy\tDQE\tDC\tDORC\tApproximation\r\n'
    output_content_distance = 'RepairDistance\tDQE\tDC\tDORC\tApproximation\r\n'
    output_content_time = 'Time(s)\tDQE\tDC\tDORC\tApproximation\r\n'

    # Path prefixes of the clean ('origin') and dirty ('error') data files.
    filenames = {
        'origin': '../../dataset/letter/data_origin_size',
        'error': '../../dataset/letter/data_error_size'
    }

    # attr1 .. attr16
    origin_attrs = ['attr%d' % i for i in xrange(1, 17)]
    # attr1 .. attr4
    modifiable_attrs = ['attr%d' % i for i in xrange(1, 5)]
    schema = origin_attrs + ['class']
    origin_filename = '../../dataset/letter/data'
    # attr1 .. attr9
    used_attrs = ['attr%d' % i for i in xrange(1, 10)]
    origin_schema = used_attrs + ['class']
def repair_accuracy_for_subspace(origin, error, repaired):
    """Score how well the repaired cells line up with the erroneous cells.

    Every cell is classified twice with ``Record.value_equal``:

    * *erroneous* -- the dirty value differs from the origin value;
    * *repaired*  -- the repaired value differs from the dirty value.

    The metrics are then computed over those two cell sets. The three raw
    counts and the derived metrics are printed before returning.

    Args:
        origin: instance holding the clean data; must provide ``size()``,
            ``get(i)`` and ``schema``.
        error: instance holding the dirty data, same shape as ``origin``.
        repaired: instance holding the repaired data, same shape as
            ``origin``.

    Returns:
        Tuple ``(jaccard, precision, recall, f1, accuracy, repair_count)``.
        Any ratio whose denominator is zero is reported as ``0``.

    Raises:
        AssertionError: if the three instances differ in size or in schema
            length.
    """
    from database.Instance import Record

    assert origin.size() == repaired.size() == error.size()
    assert len(origin.schema) == len(error.schema) == len(repaired.schema)

    schema = origin.schema
    error_count = 0            # cells that are erroneous
    repair_count = 0           # cells that were repaired
    correct_checked_count = 0  # cells that are erroneous AND were repaired
    agreeing_count = 0         # cells where both classifications agree
    cell_count = 0

    for i in range(origin.size()):
        origin_record = origin.get(i)
        error_record = error.get(i)
        repaired_record = repaired.get(i)
        for attr in schema:
            error_value = error_record.get(attr)
            is_error = not Record.value_equal(origin_record.get(attr),
                                              error_value)
            is_repaired = not Record.value_equal(error_value,
                                                 repaired_record.get(attr))
            cell_count += 1
            if is_error:
                error_count += 1
            if is_repaired:
                repair_count += 1
            if is_error and is_repaired:
                correct_checked_count += 1
            if is_error == is_repaired:
                agreeing_count += 1

    print(correct_checked_count)
    print(repair_count)
    print(error_count)

    # Explicit zero checks replace the former blanket ``except:`` clauses;
    # every denominator here is a plain count, so zero is the only failure.
    union_count = repair_count + error_count - correct_checked_count
    jaccard = correct_checked_count / float(union_count) if union_count else 0
    precision = (correct_checked_count / float(repair_count)
                 if repair_count else 0)
    recall = correct_checked_count / float(error_count) if error_count else 0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0
    # Fraction of cells whose erroneous/repaired labels match. Computed here
    # so the result does not depend on a module-level ``sklearn`` import: a
    # missing import used to be masked by ``except:`` and reported as 0.
    accuracy = agreeing_count / float(cell_count) if cell_count else 0

    print('<<<Jaccard: %f' % jaccard)
    print('<<<Precision: %f' % precision)
    print('<<<Recall: %f' % recall)
    print('<<<F1-score: %f' % f1)
    # The value printed under this label is the number of repaired cells.
    print('<<<Error Count: %f' % repair_count)
    return jaccard, precision, recall, f1, accuracy, repair_count
def repair_accuracy(origin, error, repaired):
    """Compute RMS, precision, recall, accuracy and distance of a repair.

    The three instances are compared cell by cell:

    * precision -- share of repaired cells whose repaired value equals the
      origin value (``Record.value_equal``);
    * recall -- share of erroneous cells that were repaired to a value equal
      to the origin value;
    * accuracy -- mean, over records that contain an error or a repair, of
      the fraction of attributes whose repaired value equals the origin;
    * repair distance -- sum of ``string_similarity(origin, repaired)`` over
      cells where that value is below 2;
    * RMS -- square root of the sum of squared ``string_similarity`` values
      (same ``< 2`` filter, only cells where origin != repaired) divided by
      the total number of cells.

    All metrics are printed before returning.

    Args:
        origin: instance holding the clean data; must provide ``size()``,
            ``get(i)`` and ``schema``.
        error: instance holding the dirty data, same shape as ``origin``.
        repaired: instance holding the repaired data, same shape as
            ``origin``.

    Returns:
        Tuple ``(rms, precision, recall, accuracy, repair_distance)``.
        Ratios with a zero denominator are reported as ``0``.

    Raises:
        AssertionError: if the three instances differ in size or in schema
            length.
    """
    from database.Instance import Record

    assert origin.size() == repaired.size() == error.size()
    assert len(origin.schema) == len(error.schema) == len(repaired.schema)

    schema = origin.schema
    size = origin.size()
    total_cell_count = size * len(schema)

    rms_sum = 0.0
    repair_distance = 0.0
    error_count = 0
    repair_count = 0
    correct_error_repair_count = 0
    correct_repair_count = 0
    accuracies = []

    for i in range(size):
        origin_record = origin.get(i)
        error_record = error.get(i)
        repaired_record = repaired.get(i)
        correct_value_count = 0
        is_error_or_repaired = False

        for attr in schema:
            origin_value = origin_record.get(attr)
            error_value = error_record.get(attr)
            repaired_value = repaired_record.get(attr)

            value_correct = Record.value_equal(origin_value, repaired_value)
            if value_correct:
                correct_value_count += 1

            # NOTE(review): errors are detected with a plain ``!=`` while
            # repairs use Record.value_equal; kept as-is, confirm this
            # asymmetry is intended.
            is_error = origin_value != error_value
            if is_error:
                error_count += 1
                is_error_or_repaired = True
            if error_value != repaired_value:
                is_error_or_repaired = True

            if not Record.value_equal(error_value, repaired_value):
                repair_count += 1
                if value_correct:
                    correct_repair_count += 1
                    if is_error:
                        correct_error_repair_count += 1

            # Evaluate the distance once per cell instead of up to three
            # times; values of 2 or more are ignored by both accumulators.
            distance = string_similarity(origin_value, repaired_value)
            if distance < 2:
                repair_distance += distance
                if origin_value != repaired_value:
                    rms_sum += distance ** 2

        if is_error_or_repaired:
            accuracies.append(float(correct_value_count) / len(schema))

    # Empty input used to raise ZeroDivisionError here.
    if total_cell_count:
        rms = (rms_sum / float(total_cell_count)) ** 0.5
    else:
        rms = 0.0

    if accuracies:
        accuracy = sum(accuracies) / len(accuracies)
    else:
        accuracy = 0
    print('<<<Accuracy: %f' % accuracy)

    precision = correct_repair_count / float(repair_count) if repair_count else 0
    recall = (correct_error_repair_count / float(error_count)
              if error_count else 0)
    if precision + recall == 0:
        fscore = 0.0
    else:
        fscore = (2 * precision * recall) / (precision + recall)

    print('<<<RMS: %f' % rms)
    print('<<<Precision: %f' % precision)
    print('<<<Recall: %f' % recall)
    print('<<<F-score: %f' % fscore)
    # The format argument was missing, so a literal "%f" was printed.
    print('<<<Repair distance: %f' % repair_distance)
    return rms, precision, recall, accuracy, repair_distance
# encoding=utf-8 import time from database.Instance import Record from main_dc import main as run_single_exp_dc from main_dc import repair_main as repair_main_dc from main_dorc import main as run_single_exp_dorc from main_dorc import repair_main as repair_main_dorc from main_dqe import main as run_single_exp_dqe from main_dqe import repair_main as repair_main_dqe if __name__ == '__main__': Record.set_equal_threshold(2e-5) output_content_rms = 'RMS\tDQE\tDC\tDORC\n' output_content_accuracy = 'Accuracy\tDQE\tDC\tDORC\r\n' output_content_distance = 'RepairDistance\tDQE\tDC\tDORC\r\n' output_content_time = 'Time(s)\tDQE\tDC\tDORC\r\n' filenames = { 'origin': '../../dataset/gps/gps_label', 'error': '../../dataset/gps/gps_obs' } schema = ['ts', 'x', 'y'] exp_methods = [ ('DQE', run_single_exp_dqe), ('DC', run_single_exp_dc), ('DORC', run_single_exp_dorc), ] repair_methods = { 'DQE': repair_main_dqe, 'DC': repair_main_dc,