def __init__(self, data):
    """Run the cleaning pipeline on ``data`` and print runtime and scores.

    The dirty frame of ``data`` is exported to ``/tmp/data_dirty.csv``, then
    ``setUp()``, ``run_transforms()`` and ``tearDown()`` are executed and
    timed together. Afterwards ``/tmp/data_clean.tsv`` is loaded through
    ``OpenRefine`` and the runtime, total F-score, precision and recall are
    printed.

    Args:
        data: data set object exposing ``dirty_pd`` (an object with a
            ``to_csv`` method); it is also handed to ``OpenRefine`` as the
            reference for scoring.

    Raises:
        Whatever ``setUp()``, ``run_transforms()`` or ``tearDown()`` raise.
        ``tearDown()`` still runs when ``run_transforms()`` fails.
    """
    self.data = data
    self.project_file = '/tmp/data_dirty.csv'
    self.data.dirty_pd.to_csv(self.project_file, index=False, header=True)

    start_time = time.time()
    self.setUp()
    try:
        self.run_transforms()
    finally:
        # Always undo setUp(), even if a transform fails; otherwise whatever
        # setUp() created would be left behind (presumably a remote
        # OpenRefine project -- confirm against setUp/tearDown).
        self.tearDown()
    runtime = time.time() - start_time

    # NOTE(review): presumably run_transforms()/tearDown() export the cleaned
    # table to this path -- confirm; it is not written in this method.
    tool = OpenRefine('/tmp/data_clean.tsv', data=data)

    # Single-argument print(...) prints the same text on Python 2 and 3.
    print("Runtime: " + str(runtime))
    print("Fscore: " + str(tool.calculate_total_fscore()))
    print("Precision: " + str(tool.calculate_total_precision()))
    print("Recall: " + str(tool.calculate_total_recall()))
from ml.datasets.mohammad import MohammadDataSet
from ml.tools.openrefine.OpenRefine import OpenRefine

# Scores an OpenRefine result file against the "tax" MohammadDataSet.
#
# Rule used in OpenRefine (one rule for all columns):
#   if(contains(value, "x"), "error", value)
# Takes about 3 minutes to execute.

data = MohammadDataSet("tax", 20, 30, 10)

result_file = "/home/felix/SequentialPatternErrorDetection/OpenRefine/tax/result/tax_o20_r30_p10-csv-with-minus-rule.tsv"
tool = OpenRefine(result_file, data=data)

# Total scores, reported in the order F-score, precision, recall.
for label, metric in (
        ("Fscore: ", tool.calculate_total_fscore),
        ("Precision: ", tool.calculate_total_precision),
        ("Recall: ", tool.calculate_total_recall)):
    print(label + str(metric()))

# One F-score line for every column index below data.shape[1].
column_count = data.shape[1]
for column_index in range(column_count):
    print(tool.calculate_fscore_by_column(column_index))
from ml.datasets.hospital import HospitalHoloClean
from ml.tools.openrefine.OpenRefine import OpenRefine

# Scores an OpenRefine result file against the HospitalHoloClean data set.
#
# Rule used in OpenRefine (one rule for all columns):
#   if(contains(value, "x"), "error", value)
# Takes about 3 minutes to execute.

data = HospitalHoloClean()

result_file = "/home/felix/SequentialPatternErrorDetection/OpenRefine/Hospital/result/hosp_dirty_holoclean_open_refine.tsv"
tool = OpenRefine(result_file, data=data)

# Total scores, reported in the order F-score, precision, recall.
for label, metric in (
        ("Fscore: ", tool.calculate_total_fscore),
        ("Precision: ", tool.calculate_total_precision),
        ("Recall: ", tool.calculate_total_recall)):
    print(label + str(metric()))

# One F-score line for every column index below data.shape[1].
column_count = data.shape[1]
for column_index in range(column_count):
    print(tool.calculate_fscore_by_column(column_index))
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.tools.openrefine.OpenRefine import OpenRefine

# Scores an OpenRefine result file against the FlightHoloClean data set.
#
# Rule used in OpenRefine (one rule for all columns):
#   if(contains(value, "x"), "error", value)
# Takes about 3 minutes to execute.

data = FlightHoloClean()

result_file = "/home/felix/SequentialPatternErrorDetection/OpenRefine/Flights/results/regex/flight-holoclean-dirty.tsv"
tool = OpenRefine(result_file, data=data)

# Total scores, reported in the order F-score, precision, recall.
for label, metric in (
        ("Fscore: ", tool.calculate_total_fscore),
        ("Precision: ", tool.calculate_total_precision),
        ("Recall: ", tool.calculate_total_recall)):
    print(label + str(metric()))

# One F-score line for every column index below data.shape[1].
column_count = data.shape[1]
for column_index in range(column_count):
    print(tool.calculate_fscore_by_column(column_index))
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.tools.openrefine.OpenRefine import OpenRefine

# Scores an OpenRefine result file against the BlackOakDataSetUppercase data
# set and then lists the column names of the clean frame.
#
# Alternative result files, currently disabled (apparently one per rule,
# judging by the directory names):
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/0_upper_case/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/1_ZIP_is_numeric/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/2_State_length_2/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/3_SSN_is_numeric/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/4_ZIP_length_5/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/5_City_not_SAN/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/6_City_not_LOS/BlackOak.tsv")

data = BlackOakDataSetUppercase()

result_file = "/home/felix/SequentialPatternErrorDetection/OpenRefine/BlackOak/upper_case/BlackOakUppercase_dirty-csv.tsv"
tool = OpenRefine(result_file, data=data)
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/BlackOak/upper_case/BlackOakUppercase_dirty_morerules-csv.tsv", data=data)

# Total scores, reported in the order F-score, precision, recall.
for label, metric in (
        ("Fscore: ", tool.calculate_total_fscore),
        ("Precision: ", tool.calculate_total_precision),
        ("Recall: ", tool.calculate_total_recall)):
    print(label + str(metric()))

# Disabled debugging snippet, kept verbatim as an inert string literal (it is
# never executed): it would print the column-10 values that were detected as
# errors although data.matrix_is_error marks them as clean.
''' false_positives_ids = np.where(np.logical_and(tool.matrix_detected[:,10] == True, data.matrix_is_error[:,10] == False))[0] print false_positives_ids for i in range(len(false_positives_ids)): print "as error detected but clean: " + str(data.dirty_pd.values[false_positives_ids[i],10]) '''

# Print the clean frame's column name for every index below data.shape[1].
column_count = data.shape[1]
for column_index in range(column_count):
    print(data.clean_pd.columns[column_index])
from ml.tools.openrefine.OpenRefine import OpenRefine

# Scores the combined ("all") BlackOak OpenRefine result file.
#
# Alternative result files, currently disabled (apparently one per rule,
# judging by the directory names):
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/0_upper_case/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/1_ZIP_is_numeric/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/2_State_length_2/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/3_SSN_is_numeric/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/4_ZIP_length_5/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/5_City_not_SAN/BlackOak.tsv")
#tool = OpenRefine("/home/felix/SequentialPatternErrorDetection/OpenRefine/6_City_not_LOS/BlackOak.tsv")

# NOTE(review): no data= argument is passed here, so OpenRefine presumably
# falls back to a default data set -- confirm against its constructor.
result_file = "/home/felix/SequentialPatternErrorDetection/OpenRefine/all/BlackOak.tsv"
tool = OpenRefine(result_file)

# Total scores, reported in the order F-score, precision, recall.
for label, metric in (
        ("Fscore: ", tool.calculate_total_fscore),
        ("Precision: ", tool.calculate_total_precision),
        ("Recall: ", tool.calculate_total_recall)):
    print(label + str(metric()))

# One F-score line for each of the column indices 0..11.
# NOTE(review): 12 is hard-coded; presumably the BlackOak column count -- confirm.
for column_index in range(12):
    print(tool.calculate_fscore_by_column(column_index))