import pickle


def load_model(dataSet, classifier):
    # Map each dataset to the file prefix under which its models were saved.
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"  # not yet
    dataset_log_files[Salary().name] = "hospital"  # be careful
    dataset_log_files[Book().name] = "hospital"  # be careful

    potential_model_dir = Config.get("column.potential.models")

    # Open the pickles in binary mode so the serialized models load correctly.
    tp_model = pickle.load(
        open(potential_model_dir + "/tp_model" + dataset_log_files[dataSet.name]
             + "_" + classifier.name + ".p", "rb"))
    fpfn_model = pickle.load(
        open(potential_model_dir + "/fpfn_model" + dataset_log_files[dataSet.name]
             + "_" + classifier.name + ".p", "rb"))
    delta_tp_model = pickle.load(
        open(potential_model_dir + "/delta_tp_model" + dataset_log_files[dataSet.name]
             + "_" + classifier.name + ".p", "rb"))
    delta_fpfn_model = pickle.load(
        open(potential_model_dir + "/delta_fpfn_model" + dataset_log_files[dataSet.name]
             + "_" + classifier.name + ".p", "rb"))

    return tp_model, fpfn_model, delta_tp_model, delta_fpfn_model
import pickle


def load_model(dataSet, classifier):
    # Same dataset-to-file-prefix mapping as above.
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"  # not yet
    dataset_log_files[Salary().name] = "hospital"  # be careful
    dataset_log_files[Book().name] = "hospital"  # be careful

    #potential_model_dir = Config.get("column.potential.models")
    potential_model_dir = "/home/felix/ExampleDrivenErrorDetection/potential models/classification"

    path = (potential_model_dir + "/model" + dataset_log_files[dataSet.name]
            + "_" + classifier.name + ".p")
    with open(path, "rb") as f:
        return pickle.load(f)
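# Usage sketch (an illustrative assumption, not part of the original scripts):
# both loaders key their lookups on the dataset instance's .name and the
# classifier class's .name, so a call looks like the lines below. The import
# paths here are assumptions about the repository layout.
#
#   from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
#   from ml.classifier.XGBoostClassifier import XGBoostClassifier
#
#   model = load_model(HospitalHoloClean(), XGBoostClassifier)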
cutting = True
use_potential = False

classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/unique"

# Map each dataset to the file prefix of its progress log.
dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(),
            Book(), Salary(), Restaurant()]
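# Sketch of how these flags tie into the loaders above (an assumption; the
# rest of the original script is not shown in this excerpt): with
# use_potential enabled, the four potential models would be loaded for the
# chosen dataset/classifier pair.
if use_potential:
    tp_model, fpfn_model, delta_tp_model, delta_fpfn_model = load_model(
        model_for_dataset, classifier_to_use)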
import numpy as np  # needed for np.random below

from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.datasets.products.Products import Products
from ml.datasets.luna.book.Book import Book
from ml.datasets.electronics.Electronics import Electronics
from ml.datasets.salary_data.Salary import Salary
# BlackOakDataSetUppercase is used below; this import path is an assumption.
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
import pandas as pd
import csv
from ml.data_generator.generate_bart_config import generate_bart_config
from shutil import copyfile

# Clean (ground-truth) versions of each dataset as raw value matrices.
datasets = [BlackOakDataSetUppercase().clean_pd.values,
            FlightHoloClean().clean_pd.values,
            Salary().clean_pd.values,
            Electronics().clean_pd.values,
            Book().clean_pd.values,
            Products().clean_pd.values]

for n in range(1000):
    # select dataset
    dataset_id = np.random.randint(len(datasets))
    dataset = datasets[dataset_id]

    # select number of rows; assumes every dataset has more than 500 rows,
    # since np.random.randint requires high > low
    max_rows = 2000
    if dataset.shape[0] < max_rows:
        max_rows = dataset.shape[0]
    row_size = np.random.randint(low=500, high=max_rows)
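    # Sketch of the next step in the loop (an assumption; the original body
    # continues beyond this excerpt): draw row_size distinct rows from the
    # chosen clean dataset to form one synthetic training table.
    row_indices = np.random.choice(dataset.shape[0], size=row_size, replace=False)
    sample_rows = dataset[row_indices, :]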
enable_plotting = True

classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"
classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/neweat_backup"
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/new_mean_certainty_change_all"  # hist_change

dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(),
            Book(), Salary(), Restaurant()]

# Remove the held-out dataset; the remaining ones are used for training.
for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break

print("datasets used for training:")
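# Sketch (an assumption; the original script is truncated here): continue the
# print above by listing the remaining training datasets by name.
for d in datasets:
    print(d.name)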