Exemple #1
0
def repair_accuracy_for_avg_attr(origin, error, repaired, outlier_num):
    """Return the number of changed cells divided by ``outlier_num``.

    A cell is counted as changed when ``Record.value_equal`` reports that
    the repaired value differs from the original value for the same row
    and attribute.

    Args:
        origin: Data set exposing ``size()``, ``get(i)`` and ``schema``;
            its records expose ``get(attr)``.
        error: Data set with the same interface. It is only used to check
            that all three inputs have matching dimensions.
        repaired: Data set with the same interface as ``origin``.
        outlier_num: Divisor applied to the changed-cell count.

    Returns:
        ``changed_cells / float(outlier_num)``, or ``0`` when the division
        cannot be performed (zero or non-numeric ``outlier_num``).

    Side effects:
        Prints the raw changed-cell count and the resulting average.
    """
    from database.Instance import Record
    assert origin.size() == repaired.size() == error.size()
    assert len(origin.schema) == len(error.schema) == len(repaired.schema)
    schema = origin.schema
    repair_count = 0
    for i in xrange(origin.size()):
        origin_record = origin.get(i)
        repaired_record = repaired.get(i)
        for attr in schema:
            if not Record.value_equal(origin_record.get(attr),
                                      repaired_record.get(attr)):
                repair_count += 1
    # Single-argument parenthesised print emits the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(repair_count)

    try:
        avg_num = repair_count / float(outlier_num)
    except (TypeError, ValueError, ZeroDivisionError):
        # Best-effort fallback kept from the original, but limited to the
        # failures the division itself can raise so that unrelated errors
        # (e.g. KeyboardInterrupt) are no longer swallowed.
        avg_num = 0
    print('<<<Error Count: %f' % avg_num)
    return avg_num
Exemple #2
0
import sys
import time

from artificial_error.add_errors_letter import begin_add_errors, set_error_config
from database.Instance import Record
from main_dc import main as run_single_exp_dc
from main_dorc import main as run_single_exp_dorc
from main_dqe import main as run_single_exp_dqe
from main_dqe_approximate import main as run_single_exp_approximation
from utils import UtilFunc

# Fixed seeds for the experiment; not referenced in the visible part of this
# script (presumably one run per seed -- confirm against the rest of the file).
random_seeds = [1, 21, 464, 514, 67]

if __name__ == '__main__':
    # Configure the threshold used by Record when comparing values; the exact
    # semantics live in database.Instance.
    Record.set_equal_threshold(1)
    # Header rows of the tab-separated result tables, one column per method
    # (DQE, DC, DORC, Approximation), each terminated with CRLF.
    output_content_rms = 'RMS\tDQE\tDC\tDORC\tApproximation\r\n'
    output_content_accuracy = 'Accuracy\tDQE\tDC\tDORC\tApproximation\r\n'
    output_content_distance = 'RepairDistance\tDQE\tDC\tDORC\tApproximation\r\n'
    output_content_time = 'Time(s)\tDQE\tDC\tDORC\tApproximation\r\n'

    # Paths of the clean ('origin') and corrupted ('error') letter data sets.
    # NOTE(review): the '_size' ending looks like a prefix that gets a size
    # appended later -- confirm against the code that consumes these names.
    filenames = {
        'origin': '../../dataset/letter/data_origin_size',
        'error': '../../dataset/letter/data_error_size'
    }
    # attr1 .. attr16
    origin_attrs = ['attr%d' % i for i in xrange(1, 17)]
    # attr1 .. attr4
    modifiable_attrs = ['attr%d' % i for i in xrange(1, 5)]
    # All sixteen attributes followed by the 'class' column.
    schema = origin_attrs + ['class']
    origin_filename = '../../dataset/letter/data'
    # attr1 .. attr9
    used_attrs = ['attr%d' % i for i in xrange(1, 10)]
    # The nine used attributes followed by the 'class' column.
    origin_schema = used_attrs + ['class']
Exemple #3
0
def repair_accuracy_for_subspace(origin, error, repaired):
    """Score which cells a repair touched against which cells were erroneous.

    Every cell (row, attribute) gets two flags, both decided with
    ``Record.value_equal``:

    * erroneous: the error value differs from the original value;
    * changed: the repaired value differs from the error value.

    Args:
        origin: Data set exposing ``size()``, ``get(i)`` and ``schema``;
            its records expose ``get(attr)``.
        error: Data set with the same interface as ``origin``.
        repaired: Data set with the same interface as ``origin``.

    Returns:
        Tuple ``(jaccard, precision, recall, f1, accuracy, repair_count)``:

        * jaccard: cells that are both erroneous and changed, divided by
          cells that are erroneous or changed;
        * precision: erroneous-and-changed cells / changed cells;
        * recall: erroneous-and-changed cells / erroneous cells;
        * f1: harmonic mean of precision and recall;
        * accuracy: fraction of cells whose two flags agree;
        * repair_count: number of changed cells.

        Any ratio whose denominator is zero is reported as ``0``.

    Side effects:
        Prints the three raw counters followed by the labelled metrics.
    """
    from database.Instance import Record
    assert origin.size() == repaired.size() == error.size()
    assert len(origin.schema) == len(error.schema) == len(repaired.schema)
    schema = origin.schema
    error_count = 0
    repair_count = 0
    correct_checked_count = 0
    agreeing_cell_count = 0
    cell_count = 0
    for i in xrange(origin.size()):
        origin_record = origin.get(i)
        error_record = error.get(i)
        repaired_record = repaired.get(i)

        for attr in schema:
            origin_value = origin_record.get(attr)
            error_value = error_record.get(attr)
            repaired_value = repaired_record.get(attr)
            # Evaluate each comparison once and reuse the result.
            is_error = not Record.value_equal(origin_value, error_value)
            is_changed = not Record.value_equal(error_value, repaired_value)

            cell_count += 1
            if is_error:
                error_count += 1
            if is_changed:
                repair_count += 1
            if is_error and is_changed:
                correct_checked_count += 1
            if is_error == is_changed:
                agreeing_cell_count += 1
    # Single-argument parenthesised print emits the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(correct_checked_count)
    print(repair_count)
    print(error_count)

    union_count = repair_count + error_count - correct_checked_count
    jaccard = correct_checked_count / float(union_count) if union_count else 0
    precision = (correct_checked_count / float(repair_count)
                 if repair_count else 0)
    recall = correct_checked_count / float(error_count) if error_count else 0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0
    # Plain accuracy over the per-cell 0/1 flags. Computed directly so the
    # function no longer relies on a `sklearn` name that this block never
    # imports (the previous bare except turned that failure into 0).
    accuracy = agreeing_cell_count / float(cell_count) if cell_count else 0

    print('<<<Jaccard: %f' % jaccard)
    print('<<<Precision: %f' % precision)
    print('<<<Recall: %f' % recall)
    print('<<<F1-score: %f' % f1)
    print('<<<Error Count: %f' % repair_count)
    return jaccard, precision, recall, f1, accuracy, repair_count
Exemple #4
0
def repair_accuracy(origin, error, repaired):
    """Evaluate a repair against the original and the corrupted data.

    Args:
        origin: Data set exposing ``size()``, ``get(i)`` and ``schema``;
            its records expose ``get(attr)``.
        error: Data set with the same interface as ``origin``.
        repaired: Data set with the same interface as ``origin``.

    Returns:
        Tuple ``(rms, precision, recall, accuracy, repair_distance)``:

        * rms: square root of the summed squared ``string_similarity``
          between original and repaired values (only cells that differ and
          whose similarity is below 2), divided by the total cell count;
          ``0.0`` when there are no cells;
        * precision: repairs that restored the original value / all repairs;
        * recall: repairs that restored the original value of an erroneous
          cell / erroneous cells;
        * accuracy: mean, over rows containing an error or a change, of the
          fraction of cells whose repaired value equals the original;
        * repair_distance: summed ``string_similarity`` between original and
          repaired values over cells the repair modified (values below 2
          only).

        Ratios with a zero denominator are reported as ``0``.

    Side effects:
        Prints accuracy, RMS, precision, recall, F-score and repair distance.
    """
    from database.Instance import Record
    assert origin.size() == repaired.size() == error.size()
    assert len(origin.schema) == len(error.schema) == len(repaired.schema)
    schema = origin.schema
    size = origin.size()
    total_cell_count = size * len(schema)
    rms_sum = 0.0
    repair_distance = 0.0
    error_count = 0
    repair_count = 0
    correct_error_repair_count = 0
    correct_repair_count = 0
    accuracies = []
    for i in xrange(size):
        origin_record = origin.get(i)
        error_record = error.get(i)
        repaired_record = repaired.get(i)
        correct_value_count = 0
        is_error_or_repaired = False

        for attr in schema:
            origin_value = origin_record.get(attr)
            error_value = error_record.get(attr)
            repaired_value = repaired_record.get(attr)

            value_correct = Record.value_equal(origin_value, repaired_value)
            if value_correct:
                correct_value_count += 1
            is_error = origin_value != error_value
            if is_error:
                error_count += 1
                is_error_or_repaired = True

            was_modified = error_value != repaired_value
            differs_from_origin = origin_value != repaired_value
            if was_modified or differs_from_origin:
                # Computed once per cell and reused below; assumes
                # string_similarity has no side effects.
                distance = string_similarity(origin_value, repaired_value)

            if was_modified:
                is_error_or_repaired = True
                if not Record.value_equal(error_value, repaired_value):
                    repair_count += 1
                    if value_correct:
                        correct_repair_count += 1
                        if is_error:
                            correct_error_repair_count += 1
                if distance < 2:
                    repair_distance += distance
            if differs_from_origin and distance < 2:
                rms_sum += distance ** 2
        if is_error_or_repaired:
            accuracies.append(float(correct_value_count) / len(schema))

    # An empty data set has no cells; report 0.0 instead of dividing by zero.
    if total_cell_count:
        rms = (rms_sum / float(total_cell_count)) ** 0.5
    else:
        rms = 0.0
    if accuracies:
        accuracy = sum(accuracies) / len(accuracies)
    else:
        accuracy = 0
    # Single-argument parenthesised print emits the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('<<<Accuracy: %f' % accuracy)
    precision = correct_repair_count / float(repair_count) if repair_count else 0
    recall = (correct_error_repair_count / float(error_count)
              if error_count else 0)
    if precision + recall == 0:
        fscore = 0.0
    else:
        fscore = (2 * precision * recall) / (precision + recall)
    print('<<<RMS: %f' % rms)
    print('<<<Precision: %f' % precision)
    print('<<<Recall: %f' % recall)
    print('<<<F-score: %f' % fscore)
    # The format argument was missing, so a literal "%f" used to be printed.
    print('<<<Repair distance: %f' % repair_distance)
    return rms, precision, recall, accuracy, repair_distance
Exemple #5
0
# encoding=utf-8

import time

from database.Instance import Record
from main_dc import main as run_single_exp_dc
from main_dc import repair_main as repair_main_dc
from main_dorc import main as run_single_exp_dorc
from main_dorc import repair_main as repair_main_dorc
from main_dqe import main as run_single_exp_dqe
from main_dqe import repair_main as repair_main_dqe

if __name__ == '__main__':
    Record.set_equal_threshold(2e-5)
    output_content_rms = 'RMS\tDQE\tDC\tDORC\n'
    output_content_accuracy = 'Accuracy\tDQE\tDC\tDORC\r\n'
    output_content_distance = 'RepairDistance\tDQE\tDC\tDORC\r\n'
    output_content_time = 'Time(s)\tDQE\tDC\tDORC\r\n'
    filenames = {
        'origin': '../../dataset/gps/gps_label',
        'error': '../../dataset/gps/gps_obs'
    }
    schema = ['ts', 'x', 'y']
    exp_methods = [
        ('DQE', run_single_exp_dqe),
        ('DC', run_single_exp_dc),
        ('DORC', run_single_exp_dorc),
    ]
    repair_methods = {
        'DQE': repair_main_dqe,
        'DC': repair_main_dc,