# Analyze per-run progress logs of an example-driven error-detection experiment.
# Selects a dataset + log folder, then prints training-set size and feature info
# for each of 10 runs.

#log_folder = "unigram_metadata_naivebayes"
#log_folder = "unigram_metadata_linearsvm"
#log_folder = "food"
log_folder = "deep_all"

#dataset = FoodHoloClean()
dataset = FlightHoloClean()  # alternatives: BlackOakDataSetUppercase(), HospitalHoloClean()

#future_steps = 60  # BlackOak = 7, Flights = 9
'''
from ml.datasets.BartDataset.BartDataSet import BartDataset
dataset = BartDataset(BlackOakDataSetUppercase(), "CityFD_20percent")
'''

future_steps = 20  # 60

n = dataset.get_number_dirty_columns()

# Accumulators for the best results per column sequence (filled later in the file).
best_sum_total_f = {}
best_col_seq = {}

for d in range(10):
    # NOTE(review): hard-coded, user-specific absolute path — consider making it
    # configurable (e.g. via Config) so the script runs outside this machine.
    file_path = ("/home/felix/ExampleDrivenErrorDetection/progress_log_data/"
                 + log_folder + "/log_progress_" + dataset.name + "_" + str(d) + ".csv")

    x, fp, fn, tp = read_csv1(file_path, None)

    certainty_sum = get_all_certainty_sum(x, feature_names)
    #print(certainty_sum)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("train: " + str(x.shape[0]))
    print("features: " + str(all_features))
# Benchmark dBoost's mixture model on the Flights dataset for a given budget of
# labeled cells, then render the aggregated metrics as a LaTeX table.

from ml.tools.dboost.TestDBoost import toLatex
from ml.configuration.Config import Config
import os
import time

data = FlightHoloClean()

steps = 100  # grid resolution for the parameter search
N = 1  # 10  # number of repeated runs per size
defined_range_labeled_cells = [100]  # [20, 40, 60, 80, 100, 120]

sizes = np.array(defined_range_labeled_cells, dtype=float)  # budget in cells
# Single-argument print(...) behaves identically on Python 2 and 3.
print(sizes)

# Convert the labeled-cell budget into an equivalent number of labeled rows:
# scale by the fraction of dirty columns, then divide by the row width.
dirty_column_fraction = data.get_number_dirty_columns() / float(data.shape[1])
sizes /= dirty_column_fraction  # cells converted
sizes /= float(data.shape[1])  # cells to rows
row_sizes = np.array(sizes, dtype=int)  # in rows

path_folder = Config.get("logging.folder") + "/out/dboost"
# Timestamped log file name (trailing space before the timestamp is intentional
# to match existing log naming).
log_file = path_folder + "/Flights_mix_new " + str(time.time()) + ".txt"

if not os.path.exists(path_folder):
    os.makedirs(path_folder)

avg_times, avg_fscores, avg_precision, avg_recall, std_fscores, std_precision, std_recall = \
    test_multiple_sizes_mixture(data, steps, N, row_sizes, log_file)

toLatex(defined_range_labeled_cells, avg_times, avg_fscores, avg_precision,
        avg_recall, std_fscores, std_precision, std_recall, log_file)