def getConfig(dataset):
    """Return (log path, future steps) for the given dataset.

    Unknown dataset types yield (None, -1).
    """
    path = None
    future_steps = -1
    # The original repeated this identical branch twice; once is enough.
    # isinstance avoids constructing a throwaway FlightHoloClean just to
    # compare types (the original did `type(dataset) == type(FlightHoloClean())`).
    if isinstance(dataset, FlightHoloClean):
        path = '/home/felix/phd/round_robin_part/flights'
        future_steps = 30
    return path, future_steps
def load_model(dataSet, classifier):
    """Load the pickled tp/fpfn potential models and their delta variants.

    The file name is derived from the dataset's log-file slug and the
    classifier name. Raises KeyError for datasets without a registered slug.
    """
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"  # not yet
    # Salary and Book reuse the hospital models -- presumably a stop-gap
    # until dedicated models exist ("be careful" in the original).
    dataset_log_files[Salary().name] = "hospital"
    dataset_log_files[Book().name] = "hospital"

    potential_model_dir = Config.get("column.potential.models")
    suffix = dataset_log_files[dataSet.name] + "_" + classifier.name + ".p"

    def _load(prefix):
        # Binary mode for pickle + context manager so file handles are not
        # leaked (the original used bare open() in text mode, never closed).
        with open(potential_model_dir + "/" + prefix + suffix, 'rb') as model_file:
            return pickle.load(model_file)

    tp_model = _load("tp_model")
    fpfn_model = _load("fpfn_model")
    delta_tp_model = _load("delta_tp_model")
    delta_fpfn_model = _load("delta_fpfn_model")
    return tp_model, fpfn_model, delta_tp_model, delta_fpfn_model
def main():
    """Launch the Qt example application on the flights dataset."""
    app = QApplication(sys.argv)
    # data = BlackOakDataSetUppercase()
    dataset = FlightHoloClean()
    # Keep a reference to the widget so it is not garbage-collected
    # while the event loop runs.
    window = Example(dataset)
    sys.exit(app.exec_())
def load_model(dataSet, classifier):
    """Load the single pickled model for the dataset/classifier pair.

    Raises KeyError for datasets without a registered log-file slug.
    """
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"

    potential_model_dir = Config.get("column.potential.models")
    model_path = (potential_model_dir + "/model" +
                  dataset_log_files[dataSet.name] + "_" + classifier.name + ".p")
    # Binary mode + context manager: the original leaked a text-mode handle.
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)
def load_model(dataSet):
    """Load the pickled XGBoost potential model for the given dataset.

    Raises KeyError for datasets without a registered log-file slug.
    """
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"  # not yet
    #dataset_log_files[Salary().name] = "hospital"  # be careful
    #dataset_log_files[Book().name] = "hospital"  # be careful

    # NOTE(review): hard-coded path; the sibling loaders read it from
    # Config.get("column.potential.models") -- consider unifying.
    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/unique_false_current_hist'
    model_path = (potential_model_dir + "/model" +
                  dataset_log_files[dataSet.name] + "_" + "XGBoost" + ".p")
    # Binary mode + context manager: the original leaked a text-mode handle.
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)
def load_model(dataSet, classifier):
    """Load the pickled classification model for the dataset/classifier pair.

    Raises KeyError for datasets without a registered log-file slug.
    """
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"  # not yet
    # Salary and Book reuse the hospital models -- presumably a stop-gap
    # until dedicated models exist ("be careful" in the original).
    dataset_log_files[Salary().name] = "hospital"
    dataset_log_files[Book().name] = "hospital"

    #potential_model_dir = Config.get("column.potential.models")
    potential_model_dir = "/home/felix/ExampleDrivenErrorDetection/potential models/classification"
    model_path = (potential_model_dir + "/model" +
                  dataset_log_files[dataSet.name] + "_" + classifier.name + ".p")
    # Binary mode + context manager: the original leaked a text-mode handle.
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)
def load_model(dataSet):
    """Load the pickled tp/fp/fn XGBoost simulation models.

    The model files are global (not dataset-specific), so `dataSet` is
    currently unused; the parameter is kept for interface compatibility
    with the other load_model variants.
    """
    # Dead code removed: the original built a dataset->slug dict here
    # (instantiating three dataset objects) that was never read.

    #potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/current_total_f'
    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/simulation100data'

    def _load(name):
        # Binary mode + context manager: the original leaked text-mode handles.
        with open(potential_model_dir + "/" + name + "_XGBoost.p", 'rb') as model_file:
            return pickle.load(model_file)

    tp_model = _load("tp_model")
    fp_model = _load("fp_model")
    fn_model = _load("fn_model")
    return tp_model, fp_model, fn_model
classifier_log_paths = {} classifier_log_paths[ XGBoostClassifier. name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost" classifier_log_paths[ LinearSVMClassifier. name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm" classifier_log_paths[ NaiveBayesClassifier. name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes" dataset_log_files = {} dataset_log_files[HospitalHoloClean().name] = "hospital" dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak" dataset_log_files[FlightHoloClean().name] = "flight" #dataset_log_files[Salary().name] = "salary1" classifier_to_use = XGBoostClassifier model_for_dataset = HospitalHoloClean() datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean()] for i in range(len(datasets)): if datasets[i].name == model_for_dataset.name: datasets.pop(i) break print "datasets used for training:" for i in range(len(datasets)): print datasets[i]
from sets import Set
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.tools.katara_new.Katara import Katara

data = FlightHoloClean()
#data.dirty_pd.to_csv('/tmp/data.csv', index=False)

# Run KATARA with the flights rule file and report overall quality metrics.
tool = Katara("/home/felix/ExampleDrivenErrorDetection/data/katara/flights.txt", data)

print("Fscore: " + str(tool.calculate_total_fscore()))
print("Precision: " + str(tool.calculate_total_precision()))
print("Recall: " + str(tool.calculate_total_recall()))

# Per-column F-score breakdown.
for column in range(data.shape[1]):
    print(tool.calculate_fscore_by_column(column))
''' #log_folder = "unique_batch" #log_folder = "bart/fd1/20percent" #log_folder = "word_unigrams" #log_folder = "unigrams" #log_folder = "bigrams" #log_folder = "metadata" #log_folder = "unique_batch" #log_folder = "unigram_metadata_naivebayes" #log_folder = "unigram_metadata_linearsvm" #log_folder = "food" log_folder = "deep_all" #dataset = FoodHoloClean() dataset = FlightHoloClean( ) #FlightHoloClean()#BlackOakDataSetUppercase()#HospitalHoloClean() #BlackOakDataSetUppercase() #future_steps = 60 #BlackOak = 7, Flights = 9 ''' from ml.datasets.BartDataset.BartDataSet import BartDataset dataset = BartDataset(BlackOakDataSetUppercase(), "CityFD_20percent") ''' future_steps = 20 # 60 n = dataset.get_number_dirty_columns() best_sum_total_f = {} best_col_seq = {} for d in range(10): file_path = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/" + log_folder + "/log_progress_" + dataset.name + "_" + str( d) + ".csv"
# Post-processing configuration flags.
cutting = True
use_potential = False

# Historic classifier log locations, kept for reference only.
classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/unique"

# Short log-file slug per dataset.
dataset_log_files = {
    HospitalHoloClean().name: "hospital",
    BlackOakDataSetUppercase().name: "blackoak",
    FlightHoloClean().name: "flight",
    Book().name: "book",
    Salary().name: "salaries",
    Restaurant().name: "restaurant",
}

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(), Book(), Salary(), Restaurant()]
certainty_array[index_i] = np.square(1.0 - value) indexmap[index_i] = key index_i += 1 prob = normalize(certainty_array) print certainty_array print prob return indexmap[np.random.choice(len(prob), 1, p=prob)[0]] #input from ml.datasets.flights.FlightHoloClean import FlightHoloClean dataSet = FlightHoloClean() from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean #dataSet = HospitalHoloClean() from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase #dataSet = BlackOakDataSetUppercase() #print("read: %s seconds ---" % (time.time() - start_time)) start_time = time.time() train_fraction = 1.0 ngrams = 1 runSVD = False use_metadata = True user_error_probability = 0.0 step_size = 10
# Keep only the feature columns selected in which_features_to_use.
feature_names = [name for pos, name in enumerate(feature_names) if pos in which_features_to_use]

use_absolute_difference = True  # False == Squared / True == Absolute

enable_plotting = True
cutting = True
use_potential = False
classifier_log_paths = {}

dataset = FlightHoloClean()

def getConfig(dataset):
    """Map a dataset to its round-robin log folder and look-ahead steps.

    Unknown dataset types yield (None, -1).
    """
    path = None
    future_steps = -1
    if type(dataset) == type(FlightHoloClean()):
        path = '/home/felix/phd/round_robin_part/flights'
        future_steps = 28  # == 4 * 2 + 20 in the original
    return path, future_steps

mypath, future_steps = getConfig(dataset)

n = dataset.get_number_dirty_columns()
import numpy as np
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.datasets.products.Products import Products
from ml.datasets.luna.book.Book import Book
from ml.datasets.electronics.Electronics import Electronics
from ml.datasets.salary_data.Salary import Salary
import pandas as pd
import csv
from ml.data_generator.generate_bart_config import generate_bart_config
from shutil import copyfile

# Clean (ground-truth) value matrices of every candidate dataset.
datasets = [BlackOakDataSetUppercase().clean_pd.values, FlightHoloClean().clean_pd.values, Salary().clean_pd.values, Electronics().clean_pd.values, Book().clean_pd.values, Products().clean_pd.values]

# Generate 1000 random configurations.
# NOTE(review): the loop body continues beyond this chunk.
for n in range(1000):
    # select dataset
    dataset_id = np.random.randint(len(datasets))
    dataset = datasets[dataset_id]

    # select number of rows (capped at 2000, or the dataset size if smaller)
    max_rows = 2000
    if datasets[dataset_id].shape[0] < max_rows:
        max_rows = datasets[dataset_id].shape[0]
from ml.tools.dboost.TestDBoost import run_params_mixture
from ml.tools.dboost.TestDBoost import run_params_hist
from ml.tools.dboost.TestDBoost import run_params_gaussian
import time
import numpy as np
import glob
from ml.configuration.Config import Config
import os

# Collect every dBoost result file from the server output folder.
mypath = Config.get("logging.folder") + "/out/server_dboost"
mylist = list(glob.glob(mypath + "/*.txt"))

# Datasets to benchmark.
datasets = [
    FlightHoloClean(),
    Beers(),
    BlackOakDataSetUppercase(),
    HospitalHoloClean(),
    Movies(),
    Restaurant(),
    Citation(),
    Salary(),
]

# Number of repetitions.
N = 1

# Make sure the runtime-output folder exists before writing results.
path_folder = Config.get("logging.folder") + "/out/dboost_runtime"
if not os.path.exists(path_folder):
    os.makedirs(path_folder)
enable_plotting = True

# Progress-log location per classifier (only XGBoost is active).
classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"
classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/neweat_backup"
# previously: "/home/felix/ExampleDrivenErrorDetection/progress_log_data/new_mean_certainty_change_all" (hist_change)

# Short log-file slug per dataset.
dataset_log_files = {
    HospitalHoloClean().name: "hospital",
    BlackOakDataSetUppercase().name: "blackoak",
    FlightHoloClean().name: "flight",
    Book().name: "book",
    Salary().name: "salaries",
    Restaurant().name: "restaurant",
}

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(), Book(), Salary(), Restaurant()]

# Exclude the dataset the model belongs to from the pool.
for i, candidate in enumerate(datasets):
    if candidate.name == model_for_dataset.name:
        del datasets[i]
        break