def __init__(self, data):
        """Run the OpenRefine transforms on ``data`` and print the evaluation.

        The dirty frame ``data.dirty_pd`` is written to ``self.project_file``
        as CSV (header row, no index column). ``setUp``, ``run_transforms``
        and ``tearDown`` are then executed and timed together. Afterwards an
        ``OpenRefine`` evaluator is built from ``/tmp/data_clean.tsv`` and
        ``data``, and runtime, F-score, precision and recall are printed.

        :param data: dataset object exposing ``dirty_pd`` (an object with a
            ``to_csv`` method, presumably a pandas DataFrame -- confirm);
            it is also handed to ``OpenRefine`` for scoring.
        """
        self.data = data
        self.project_file = '/tmp/data_dirty.csv'

        self.data.dirty_pd.to_csv(self.project_file, index=False, header=True)

        start_time = time.time()

        self.setUp()
        try:
            self.run_transforms()
        finally:
            # Run tearDown even when a transform raises, so that every
            # successful setUp is paired with a tearDown.
            self.tearDown()

        # Measured span covers setUp, run_transforms and tearDown.
        runtime = time.time() - start_time

        # NOTE(review): this path is presumably where the transforms export
        # their result -- confirm against run_transforms.
        tool = OpenRefine('/tmp/data_clean.tsv', data=data)

        # Single-argument print(...) prints identically on Python 2 and is
        # also valid Python 3.
        print("Runtime: " + str(runtime))
        print("Fscore: " + str(tool.calculate_total_fscore()))
        print("Precision: " + str(tool.calculate_total_precision()))
        print("Recall: " + str(tool.calculate_total_recall()))
from ml.datasets.mohammad import MohammadDataSet
from ml.tools.openrefine.OpenRefine import OpenRefine

# Evaluate an OpenRefine result file for the "tax" dataset.
#
# One rule for all columns:
#   if(contains(value, "x"), "error", value)
# Takes 3 mins to execute.

data = MohammadDataSet("tax", 20, 30, 10)

tool = OpenRefine(
    "/home/felix/SequentialPatternErrorDetection/OpenRefine/tax/result/tax_o20_r30_p10-csv-with-minus-rule.tsv",
    data=data)

# Single-argument print(...) prints identically on Python 2 and is also
# valid Python 3.
print("Fscore: " + str(tool.calculate_total_fscore()))
print("Precision: " + str(tool.calculate_total_precision()))
print("Recall: " + str(tool.calculate_total_recall()))

# Per-column F-score, one line per column index of the dataset.
for c in range(data.shape[1]):
    print(tool.calculate_fscore_by_column(c))
from ml.datasets.hospital import HospitalHoloClean
from ml.tools.openrefine.OpenRefine import OpenRefine

# Evaluate an OpenRefine result file for the Hospital (HoloClean) dataset.
#
# One rule for all columns:
#   if(contains(value, "x"), "error", value)
# Takes 3 mins to execute.

data = HospitalHoloClean()

tool = OpenRefine(
    "/home/felix/SequentialPatternErrorDetection/OpenRefine/Hospital/result/hosp_dirty_holoclean_open_refine.tsv",
    data=data)

# Single-argument print(...) prints identically on Python 2 and is also
# valid Python 3.
print("Fscore: " + str(tool.calculate_total_fscore()))
print("Precision: " + str(tool.calculate_total_precision()))
print("Recall: " + str(tool.calculate_total_recall()))

# Per-column F-score, one line per column index of the dataset.
for c in range(data.shape[1]):
    print(tool.calculate_fscore_by_column(c))
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.tools.openrefine.OpenRefine import OpenRefine

# Evaluate an OpenRefine result file for the Flights (HoloClean) dataset.
#
# One rule for all columns:
#   if(contains(value, "x"), "error", value)
# Takes 3 mins to execute.

data = FlightHoloClean()

tool = OpenRefine(
    "/home/felix/SequentialPatternErrorDetection/OpenRefine/Flights/results/regex/flight-holoclean-dirty.tsv",
    data=data)

# Single-argument print(...) prints identically on Python 2 and is also
# valid Python 3.
print("Fscore: " + str(tool.calculate_total_fscore()))
print("Precision: " + str(tool.calculate_total_precision()))
print("Recall: " + str(tool.calculate_total_recall()))

# Per-column F-score, one line per column index of the dataset.
for c in range(data.shape[1]):
    print(tool.calculate_fscore_by_column(c))
# Example #5
# 0
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.tools.openrefine.OpenRefine import OpenRefine

# Evaluate an OpenRefine result file for the upper-cased BlackOak dataset.
#
# Earlier result files, kept for reference (presumably one rule per
# directory -- confirm):
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/0_upper_case/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/1_ZIP_is_numeric/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/2_State_length_2/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/3_SSN_is_numeric/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/4_ZIP_length_5/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/5_City_not_SAN/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/6_City_not_LOS/BlackOak.tsv")

data = BlackOakDataSetUppercase()
tool = OpenRefine(
    "/home/felix/SequentialPatternErrorDetection/OpenRefine/BlackOak/upper_case/BlackOakUppercase_dirty-csv.tsv",
    data=data)
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/BlackOak/upper_case/BlackOakUppercase_dirty_morerules-csv.tsv", data=data)

# Single-argument print(...) prints identically on Python 2 and is also
# valid Python 3.
print("Fscore: " + str(tool.calculate_total_fscore()))
print("Precision: " + str(tool.calculate_total_precision()))
print("Recall: " + str(tool.calculate_total_recall()))

# Disabled debugging aid: lists the cells of column 10 that were flagged as
# errors although the ground truth marks them as clean (false positives).
# Needs numpy imported as np when re-enabled.
#
# false_positives_ids = np.where(np.logical_and(tool.matrix_detected[:,10] == True, data.matrix_is_error[:,10] == False))[0]
#
# print false_positives_ids
#
# for i in range(len(false_positives_ids)):
#     print "as error detected but clean: " + str(data.dirty_pd.values[false_positives_ids[i],10])

# Print the name of every column of the clean frame, in column order.
for c in range(data.shape[1]):
    print(data.clean_pd.columns[c])
# Example #6
# 0
from ml.tools.openrefine.OpenRefine import OpenRefine

# Evaluate the combined ("all") OpenRefine result file for BlackOak.
#
# Earlier result files, kept for reference (presumably one rule per
# directory -- confirm):
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/0_upper_case/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/1_ZIP_is_numeric/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/2_State_length_2/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/3_SSN_is_numeric/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/4_ZIP_length_5/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/5_City_not_SAN/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/6_City_not_LOS/BlackOak.tsv")

# No ``data`` argument is passed here, so OpenRefine falls back to whatever
# its default is.
tool = OpenRefine(
    "/home/felix/SequentialPatternErrorDetection/OpenRefine/all/BlackOak.tsv")

# Single-argument print(...) prints identically on Python 2 and is also
# valid Python 3.
print("Fscore: " + str(tool.calculate_total_fscore()))
print("Precision: " + str(tool.calculate_total_precision()))
print("Recall: " + str(tool.calculate_total_recall()))

# NOTE(review): 12 is hard-coded, presumably the column count of the
# BlackOak dataset -- confirm; no dataset object is in scope to ask.
for c in range(12):
    print(tool.calculate_fscore_by_column(c))