def load_model(dataSet, classifier):
    """Load the four pickled potential models for a dataset/classifier pair.

    Returns a tuple (tp_model, fpfn_model, delta_tp_model, delta_fpfn_model).
    Raises KeyError if dataSet.name has no mapping below, and IOError if a
    pickle file is missing.
    """
    # Map dataset name -> short identifier used in the pickle file names.
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    dataset_log_files[Salary().name] = "hospital"  # be careful
    dataset_log_files[Book().name] = "hospital"  # be careful

    potential_model_dir = Config.get("column.potential.models")
    # NOTE: there is deliberately no separator between the model prefix and the
    # dataset identifier (e.g. "tp_modelhospital_XGBoost.p") — this must match
    # the names the training code saved the pickles under; do not "fix" it.
    suffix = dataset_log_files[dataSet.name] + "_" + classifier.name + ".p"

    def _load(prefix):
        # Context manager closes the handle; the original leaked one open
        # file per model. (pickle docs recommend binary mode — confirm the
        # files were written in text mode before changing it.)
        with open(potential_model_dir + "/" + prefix + suffix) as pickle_file:
            return pickle.load(pickle_file)

    tp_model = _load("tp_model")
    fpfn_model = _load("fpfn_model")
    delta_tp_model = _load("delta_tp_model")
    delta_fpfn_model = _load("delta_fpfn_model")

    return tp_model, fpfn_model, delta_tp_model, delta_fpfn_model
def __init__(self):
    # Build a "domain error" variant of the hospital dataset: start from the
    # clean table and, for every cell flagged in matrix_is_error, substitute a
    # DIFFERENT value drawn from that column's clean value domain.
    holoclean = HospitalHoloClean()
    rng = np.random.RandomState(42)  # fixed seed -> reproducible corruption
    clean_pd = holoclean.clean_pd.copy()
    dirty_pd = holoclean.clean_pd.copy()
    is_error = holoclean.matrix_is_error
    # NOTE(review): .values may or may not alias dirty_pd's buffers depending
    # on dtypes; dirty_pd is rebuilt from dirty_matrix below, so the result is
    # correct either way.
    dirty_matrix = dirty_pd.values
    for c in range(clean_pd.shape[1]):
        # Candidate replacement values: the column's distinct clean values.
        domain = clean_pd[clean_pd.columns[c]].unique()
        if len(domain) > 1:  # need >1 value or no different value exists
            for r in range(clean_pd.shape[0]):
                if is_error[r, c]:
                    # Resample until the drawn value differs from the current
                    # cell value, so every flagged cell truly changes.
                    val = dirty_matrix[r, c]
                    while dirty_matrix[r, c] == val:
                        val = domain[rng.randint(len(domain))]
                    print str(dirty_matrix[r, c]) + " -> " + str(val)
                    dirty_matrix[r, c] = val
    dirty_pd = pd.DataFrame(dirty_matrix, columns=holoclean.dirty_pd.columns)
    super(HospitalDomainError, self).__init__(HospitalDomainError.name, dirty_pd, clean_pd)
def load_model(dataSet, classifier):
    """Load the single pickled model for the given dataset/classifier pair.

    Raises KeyError if dataSet.name has no mapping below, and IOError if the
    pickle file is missing.
    """
    # Map dataset name -> short identifier used in the pickle file name.
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"

    potential_model_dir = Config.get("column.potential.models")
    # No separator after "model" is intentional — must match the saved names.
    model_path = (potential_model_dir + "/model" +
                  dataset_log_files[dataSet.name] + "_" + classifier.name + ".p")
    # Context manager closes the handle; the original leaked it.
    with open(model_path) as pickle_file:
        return pickle.load(pickle_file)
def load_model(dataSet):
    """Load the pickled XGBoost model for the given dataset.

    Uses a hard-coded experiment directory (see NOTE below). Raises KeyError
    if dataSet.name has no mapping, IOError if the pickle file is missing.
    """
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    #dataset_log_files[Salary().name] = "hospital" # be careful
    #dataset_log_files[Book().name] = "hospital" # be careful

    # NOTE(review): hard-coded machine-specific path; the sibling loaders use
    # Config.get("column.potential.models") — consider unifying.
    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/unique_false_current_hist'

    model_path = (potential_model_dir + "/model" +
                  dataset_log_files[dataSet.name] + "_" + "XGBoost" + ".p")
    # Context manager closes the handle; the original leaked it.
    with open(model_path) as pickle_file:
        return pickle.load(pickle_file)
def load_model(dataSet, classifier):
    """Load the pickled classification model for a dataset/classifier pair.

    Uses a hard-coded experiment directory (the Config lookup is kept,
    commented out, as in the original). Raises KeyError for unmapped
    datasets, IOError if the pickle file is missing.
    """
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    dataset_log_files[Salary().name] = "hospital"  # be careful
    dataset_log_files[Book().name] = "hospital"  # be careful

    #potential_model_dir = Config.get("column.potential.models")
    # NOTE(review): hard-coded machine-specific path overrides the Config key.
    potential_model_dir = "/home/felix/ExampleDrivenErrorDetection/potential models/classification"

    model_path = (potential_model_dir + "/model" +
                  dataset_log_files[dataSet.name] + "_" + classifier.name + ".p")
    # Context manager closes the handle; the original leaked it.
    with open(model_path) as pickle_file:
        return pickle.load(pickle_file)
def load_model(dataSet):
    """Load the pickled tp/fp/fn XGBoost potential models.

    Returns a tuple (tp_model, fp_model, fn_model). Raises IOError if a
    pickle file is missing.
    """
    # NOTE(review): this mapping is dead code in this variant — the file names
    # below do not include the dataset identifier. Kept byte-for-byte because
    # constructing the dataset objects may have side effects; confirm and drop.
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    # dataset_log_files[Salary().name] = "hospital" # be careful
    # dataset_log_files[Book().name] = "hospital" # be careful

    #potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/current_total_f'
    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/simulation100data'

    def _load(name):
        # Context manager closes the handle; the original leaked three open
        # files, one per model.
        with open(potential_model_dir + "/" + name + "_XGBoost.p") as pickle_file:
            return pickle.load(pickle_file)

    tp_model = _load("tp_model")
    fp_model = _load("fp_model")
    fn_model = _load("fn_model")
    return tp_model, fp_model, fn_model
# Quick sanity-check script: compare the expected hospital column order
# against the columns actually loaded by HospitalHoloClean.
import numpy as np
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean

data = HospitalHoloClean()

# Expected column order of the hospital dataset, printed side by side with the
# actual columns for visual comparison.
columns = ["ProviderNumber", "HospitalName", "Address1", "City", "State",
           "ZipCode", "CountyName", "PhoneNumber", "HospitalType",
           "HospitalOwner", "EmergencyService", "Condition", "MeasureCode",
           "MeasureName", "Score", "Sample", "Stateavg"]

print columns
print list(data.clean_pd.columns)

# Use the ground-truth error matrix instead of a previously saved detection.
#detected = np.load("/home/felix/ExampleDrivenErrorDetection/model/ml/save_detected.npy")
detected = data.matrix_is_error
# Evaluation configuration: which classifier logs and datasets to use.
enable_plotting = True

# Map classifier name -> folder holding its progress logs.
classifier_log_paths = {}
classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

# Map dataset name -> short identifier used in the log file names.
dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
#dataset_log_files[Salary().name] = "salary1"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean()]

# Leave-one-out: remove the dataset the model is evaluated on from the
# training pool. Popping once and breaking keeps the loop index valid.
for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break

print "datasets used for training:"
row = rng.randint(clean_pd.shape[0]) #print row if row in ids: continue ids.add(row) if len(zip_dict[dirty_pd['zip code'].values[row]]) > 1: print str(dirty_pd[column_name][row]) + "->" + str(cities[city]) zip_dict[dirty_pd['zip code'].values[row]].remove(row) dirty_pd[column_name][row] = cities[city] error_count += 1 super(MyFD, self).__init__(MyFD.name, dirty_pd, clean_pd) def validate(self): print "validate" if __name__ == '__main__': from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean data = MyFD(HospitalHoloClean(), 0.01, "city") #from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase #data = MyFD(BlackOakDataSetUppercase(), 0.01, "City") print np.sum(data.matrix_is_error, axis=0) / float(data.shape[0]) print np.sum(data.matrix_is_error, axis=0) print data.shape
N_datasets = 7

# The triple-quoted string below is an inert block of alternative experiment
# configurations (commented-out via a discarded string literal).
'''
#log_folder = "unique_batch"
#log_folder = "bart/fd1/5percent"
#log_folder = "bart/outlier/20percent"
#log_folder = "bart/fd1/30percent"
#log_folder = "bart/fd1_add"
#log_folder = "hospitalFD/30percent"

log_folder = "bartstupid/1percent"

from ml.datasets.HospitalFD.MyFD import MyFD
#dataset = MyFD(HospitalHoloClean(), 0.3, "city") # 0.01, 0.05, 0.1, 0.2, 0.3

from ml.datasets.BartDataset.BartDataSet import BartDataset
dataset = BartDataset(HospitalHoloClean(), "bart_fd_stupid/1percent")

#dataset = HospitalHoloClean()
#dataset.name = "MyFD"

#future_steps = 8+9 #BlackOak = 7, Flights = 9
#future_steps = 8+20 #BlackOak = 7
#future_steps = 17*2 + 60
future_steps = 3 #outlier data
'''

# Active configuration: Salary dataset with BART-injected outliers.
datan = Salary()

def convert_to_int(value):
    # Normalize numeric strings like "1200.0" to "1200" so clean and dirty
    # values compare consistently.
    return str(int(float(value)))

# Column 8 holds numbers serialized inconsistently; normalize the clean side.
datan.clean_pd[datan.clean_pd.columns[8]] = datan.clean_pd[datan.clean_pd.columns[8]].apply(convert_to_int)

dataset = BartDataset(datan, "Salary_outlier_20percent")
# Evaluation configuration flags.
enable_plotting = True
cutting = True
use_potential = False

# Older log locations kept for reference; no classifier path is active here.
classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/unique"

# Map dataset name -> short identifier used in the log file names.
dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

# NOTE(review): this list is cut off by the chunk boundary — it continues
# past the last visible line of this fragment.
datasets = [
    HospitalHoloClean(),
    BlackOakDataSetUppercase(),
    FlightHoloClean(),
    Book(),
    Salary(),
# NOTE(review): this fragment begins mid-loop — the enclosing `for`/`if`
# headers are outside this chunk; indentation is a best-effort reconstruction.
        datasets.pop(i)
        break

print "datasets used for training:"
for i in range(len(datasets)):
    print datasets[i]

N_datasets = 7

# NOTE(review): the triple-quoted string below has no closing ''' inside this
# chunk; everything after it is inert string content here — presumably a
# commented-out experiment configuration. Verify against the full file.
'''
log_folder = "unique_batch" #"unique"

dataset = HospitalHoloClean() #BlackOakDataSetUppercase()
future_steps = 60 #BlackOak = 7, Flights = 9

n = dataset.get_number_dirty_columns()

best_sum_total_f = {}
best_col_seq = {}

for d in range(10):
    file_path = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/" + log_folder + "/log_progress_"+ dataset.name +"_" + str(d) +".csv"
    x, fp, fn, tp = read_csv1(file_path, None)

    certainty_sum = get_all_certainty_stddev(x, feature_names)
# NOTE(review): this fragment begins at an `else:` whose matching `if` is
# outside the chunk; indentation is a best-effort reconstruction.
    else:
        # Append the deep features column-wise; CSR keeps row slicing cheap.
        all_matrix_train = hstack((all_matrix_train, all_matrix_train_deep)).tocsr()
        feature_name_list.extend(feature_name_list_deep)
    return all_matrix_train, all_matrix_test, feature_name_list


# input
start_time = time.time()

# Dataset selection: exactly one dataSet assignment is active at a time.
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
#dataSet = FlightHoloClean()
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
dataSet = HospitalHoloClean()
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
#dataSet = BlackOakDataSetUppercase()
from ml.datasets.salary_data.Salary import Salary
#dataSet = Salary()
from ml.datasets.luna.book.Book import Book
#dataSet = Book()
from ml.datasets.luna.restaurant.Restaurant import Restaurant
#dataSet = Restaurant()

# NOTE(review): the string below is not closed inside this chunk — it
# continues past the last visible line (inert synthetic-dataset imports).
'''
from ml.datasets.synthetic.Synthetic import Synthetic
from ml.datasets.synthetic.ReplaceError import ReplaceError
# Script: size the dBoost labeling budget for the hospital dataset and run
# the mixture-model test.
import warnings
import numpy as np
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
from ml.tools.dboost.TestDBoost import test_multiple_sizes_mixture

warnings.filterwarnings("ignore", category=DeprecationWarning)

data = HospitalHoloClean()

# Inert alternative experiment (histogram variant), kept for reference.
'''
steps = 100
sizes = [10, 20, 30, 40, 50]
N = 5

test_multiple_sizes_hist(data, steps, N, sizes)
'''

steps = 100
N = 1 #10

# Labeling budget in cells; converted to an equivalent number of rows.
labels = 918
nr_rows = int(float(labels) / data.shape[1])

#sizes = np.array([200, 400, 600, 800], dtype=float) # in cells
sizes = np.array([400], dtype=float) # in cells
print sizes
# Rescale the cell budget by the fraction of dirty columns, then convert
# from cells to rows (divide by the column count).
dirty_column_fraction = data.get_number_dirty_columns() / float(data.shape[1])
sizes /= dirty_column_fraction
sizes /= float(data.shape[1])
print sizes
use_absolute_difference = True # False == Squared / True == Absolute enable_plotting = True classifier_log_paths = {} #classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost" #classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm" #classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes" classifier_log_paths[ XGBoostClassifier. name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/7" dataset_log_files = {} dataset_log_files[HospitalHoloClean().name] = "hospital" dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak" dataset_log_files[FlightHoloClean().name] = "flight" dataset_log_files[Book().name] = "book" dataset_log_files[Salary().name] = "salaries" dataset_log_files[Restaurant().name] = "restaurant" classifier_to_use = XGBoostClassifier model_for_dataset = BlackOakDataSetUppercase() datasets = [ HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(), Book(), Salary(),
# Setup for the dBoost runtime benchmark: collect server log files, choose
# datasets, and open the CSV the timing results are written to.
from ml.tools.dboost.TestDBoost import run_params_gaussian
import time
import numpy as np
import glob
from ml.configuration.Config import Config
import os

# Folder holding the per-run dBoost output logs.
mypath = Config.get("logging.folder") + "/out/server_dboost"
# glob.glob already returns a list; the original wrapped it in a redundant
# identity comprehension.
mylist = glob.glob(mypath + "/*.txt")

datasets = [FlightHoloClean(),
            Beers(),
            BlackOakDataSetUppercase(),
            HospitalHoloClean(),
            Movies(),
            Restaurant(),
            Citation(),
            Salary()]

N = 1  # repetitions per dataset

path_folder = Config.get("logging.folder") + "/out/dboost_runtime"

if not os.path.exists(path_folder):
    os.makedirs(path_folder)

# NOTE(review): handle is deliberately left open — presumably written to and
# closed later in the file, past this chunk; confirm before wrapping in `with`.
log_file = open(path_folder + '/dboost_runtime' + str(time.time()) + '.csv', 'w+')