import pickle

from ml.configuration.Config import Config
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.datasets.salary_data.Salary import Salary
from ml.datasets.luna.book.Book import Book


def load_model(dataSet, classifier):
    # map each dataset to the prefix under which its models were saved
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # no dedicated models trained yet: fall back to the hospital models
    dataset_log_files[Salary().name] = "hospital"  # be careful
    dataset_log_files[Book().name] = "hospital"  # be careful

    potential_model_dir = Config.get("column.potential.models")

    # pickled models are binary, so open the files in "rb" mode
    tp_model = pickle.load(
        open(potential_model_dir + "/tp_model" +
             dataset_log_files[dataSet.name] + "_" + classifier.name + ".p", "rb"))
    fpfn_model = pickle.load(
        open(potential_model_dir + "/fpfn_model" +
             dataset_log_files[dataSet.name] + "_" + classifier.name + ".p", "rb"))

    delta_tp_model = pickle.load(
        open(potential_model_dir + "/delta_tp_model" +
             dataset_log_files[dataSet.name] + "_" + classifier.name + ".p", "rb"))
    delta_fpfn_model = pickle.load(
        open(potential_model_dir + "/delta_fpfn_model" +
             dataset_log_files[dataSet.name] + "_" + classifier.name + ".p", "rb"))

    return tp_model, fpfn_model, delta_tp_model, delta_fpfn_model
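
A minimal usage sketch for the loader above (not from the original source): the scikit-learn-style predict() calls and the prepared feature matrix X are assumptions; XGBoostClassifier is the classifier used in Example n. 8.

# hypothetical usage: X is a feature matrix built elsewhere
tp_model, fpfn_model, delta_tp_model, delta_fpfn_model = load_model(
    HospitalHoloClean(), XGBoostClassifier)

predicted_tp = tp_model.predict(X)      # predicted number of true positives
predicted_fpfn = fpfn_model.predict(X)  # predicted false positives + negatives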
Example n. 2
    def __init__(self):
        holoclean = HospitalHoloClean()

        rng = np.random.RandomState(42)

        clean_pd = holoclean.clean_pd.copy()
        # start from a second copy of the clean data and re-inject errors below
        dirty_pd = holoclean.clean_pd.copy()
        is_error = holoclean.matrix_is_error

        dirty_matrix = dirty_pd.values

        for c in range(clean_pd.shape[1]):
            domain = clean_pd[clean_pd.columns[c]].unique()
            if len(domain) > 1:
                for r in range(clean_pd.shape[0]):
                    if is_error[r, c]:
                        # replace each erroneous cell with a different value
                        # drawn uniformly from the column's clean domain
                        val = dirty_matrix[r, c]
                        while dirty_matrix[r, c] == val:
                            val = domain[rng.randint(len(domain))]

                        print str(dirty_matrix[r, c]) + " -> " + str(val)
                        dirty_matrix[r, c] = val

        dirty_pd = pd.DataFrame(dirty_matrix, columns=holoclean.dirty_pd.columns)

        super(HospitalDomainError, self).__init__(HospitalDomainError.name, dirty_pd, clean_pd)
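
A minimal instantiation sketch (an assumption, not in the original): it presumes HospitalDomainError defines the name class attribute referenced above and inherits matrix_is_error and shape from its dataset base class.

data = HospitalDomainError()
# fraction of injected errors per column, mirroring the checks in Example n. 9
print np.sum(data.matrix_is_error, axis=0) / float(data.shape[0])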
def load_model(dataSet, classifier):

    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"

    potential_model_dir = Config.get("column.potential.models")

    # pickled models are binary, so open the file in "rb" mode
    return pickle.load(
        open(potential_model_dir + "/model" + dataset_log_files[dataSet.name] +
             "_" + classifier.name + ".p", "rb"))
Example n. 4
def load_model(dataSet):
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    #dataset_log_files[Salary().name] = "hospital"  # be careful
    #dataset_log_files[Book().name] = "hospital"  # be careful

    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/unique_false_current_hist'

    return pickle.load(
        open(potential_model_dir + "/model" + dataset_log_files[dataSet.name] +
             "_XGBoost.p", "rb"))
def load_model(dataSet, classifier):

    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    dataset_log_files[Salary().name] = "hospital"  # be careful
    dataset_log_files[Book().name] = "hospital"  # be careful

    #potential_model_dir = Config.get("column.potential.models")
    potential_model_dir = "/home/felix/ExampleDrivenErrorDetection/potential models/classification"

    return pickle.load(
        open(potential_model_dir + "/model" + dataset_log_files[dataSet.name] +
             "_" + classifier.name + ".p", "rb"))
def load_model(dataSet):
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    # dataset_log_files[Salary().name] = "hospital"  # be careful
    # dataset_log_files[Book().name] = "hospital"  # be careful

    #potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/current_total_f'
    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/simulation100data'

    # pickled models are binary, so open the files in "rb" mode
    tp_model = pickle.load(open(potential_model_dir + "/tp_model_XGBoost.p", "rb"))
    fp_model = pickle.load(open(potential_model_dir + "/fp_model_XGBoost.p", "rb"))
    fn_model = pickle.load(open(potential_model_dir + "/fn_model_XGBoost.p", "rb"))

    return tp_model, fp_model, fn_model
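
Given the three estimators, a short sketch of how their predictions could be combined into an F1 estimate; the feature matrix X and the predict() interface are assumptions, while the precision/recall formulas are standard.

tp_model, fp_model, fn_model = load_model(HospitalHoloClean())

predicted_tp = tp_model.predict(X)
predicted_fp = fp_model.predict(X)
predicted_fn = fn_model.predict(X)

precision = predicted_tp / (predicted_tp + predicted_fp)
recall = predicted_tp / (predicted_tp + predicted_fn)
f1 = 2 * precision * recall / (precision + recall)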
Example n. 7
import numpy as np
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean



data = HospitalHoloClean()

columns = ["ProviderNumber",
           "HospitalName",
           "Address1",
           "City",
           "State",
           "ZipCode",
           "CountyName",
           "PhoneNumber",
           "HospitalType",
           "HospitalOwner",
           "EmergencyService",
           "Condition",
           "MeasureCode",
           "MeasureName",
           "Score",
           "Sample",
           "Stateavg"]

# sanity check: the hard-coded list must match the dataset's column order
print columns
print list(data.clean_pd.columns)


#detected = np.load("/home/felix/ExampleDrivenErrorDetection/model/ml/save_detected.npy")
detected = data.matrix_is_error  # ground-truth error matrix instead of saved detections
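
A short follow-up sketch (not in the original): per-column error counts from the ground-truth matrix, assuming detected is a boolean rows-by-columns matrix aligned with the column list above.

for i, column in enumerate(columns):
    print column + ": " + str(np.sum(detected[:, i]))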
Example n. 8
enable_plotting = True

classifier_log_paths = {}
classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
#dataset_log_files[Salary().name] = "salary1"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean()]

# leave-one-dataset-out: drop the dataset the model was trained for
for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break

print "datasets used for training:"
Example n. 9
            row = rng.randint(clean_pd.shape[0])

            #print row

            if row in ids:
                continue
            ids.add(row)

            # only corrupt a row whose zip code group still has more
            # than one member
            if len(zip_dict[dirty_pd['zip code'].values[row]]) > 1:
                print str(dirty_pd[column_name][row]) + "->" + str(cities[city])
                zip_dict[dirty_pd['zip code'].values[row]].remove(row)
                dirty_pd[column_name][row] = cities[city]

                error_count += 1

        super(MyFD, self).__init__(MyFD.name, dirty_pd, clean_pd)

    def validate(self):
        print "validate"

if __name__ == '__main__':
    import numpy as np
    from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
    data = MyFD(HospitalHoloClean(), 0.01, "city")
    #from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
    #data = MyFD(BlackOakDataSetUppercase(), 0.01, "City")

    print np.sum(data.matrix_is_error, axis=0) / float(data.shape[0])
    print np.sum(data.matrix_is_error, axis=0)

    print data.shape
N_datasets = 7
'''

#log_folder = "unique_batch"
#log_folder = "bart/fd1/5percent"
#log_folder = "bart/outlier/20percent"
#log_folder = "bart/fd1/30percent"
#log_folder = "bart/fd1_add"
#log_folder = "hospitalFD/30percent"
log_folder = "bartstupid/1percent"

from ml.datasets.HospitalFD.MyFD import MyFD
#dataset = MyFD(HospitalHoloClean(), 0.3, "city") # 0.01, 0.05, 0.1, 0.2, 0.3

from ml.datasets.BartDataset.BartDataSet import BartDataset
dataset = BartDataset(HospitalHoloClean(), "bart_fd_stupid/1percent")

#dataset = HospitalHoloClean()
#dataset.name = "MyFD"
#future_steps = 8+9 #BlackOak = 7, Flights = 9
#future_steps = 8+20 #BlackOak = 7
#future_steps = 17*2 + 60
future_steps = 3

#outlier data
'''
datan = Salary()
def convert_to_int(value):
    return str(int(float(value)))
datan.clean_pd[datan.clean_pd.columns[8]] = datan.clean_pd[datan.clean_pd.columns[8]].apply(convert_to_int)
dataset = BartDataset(datan, "Salary_outlier_20percent")
Example n. 11
enable_plotting = True

cutting = True

use_potential = False

classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/unique"

dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [
    HospitalHoloClean(),
    BlackOakDataSetUppercase(),
    FlightHoloClean(),
    Book(),
    Salary(),
Example n. 12
        datasets.pop(i)
        break

print "datasets used for training:"
for i in range(len(datasets)):
    print datasets[i]

N_datasets = 7
'''




log_folder = "unique_batch" #"unique"

dataset = HospitalHoloClean() #BlackOakDataSetUppercase()
future_steps = 60 #BlackOak = 7, Flights = 9

n = dataset.get_number_dirty_columns()

best_sum_total_f = {}
best_col_seq = {}



# read the logged fp/fn/tp progress of each of the 10 runs
for d in range(10):
    file_path = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/" + log_folder + "/log_progress_" + dataset.name + "_" + str(d) + ".csv"
    x, fp, fn, tp = read_csv1(file_path, None)

    certainty_sum = get_all_certainty_stddev(x, feature_names)
Example n. 13
    else:
        # append the deep features to the sparse training matrix
        all_matrix_train = hstack(
            (all_matrix_train, all_matrix_train_deep)).tocsr()
        feature_name_list.extend(feature_name_list_deep)

    return all_matrix_train, all_matrix_test, feature_name_list


# input: select exactly one dataset below

import time
start_time = time.time()

from ml.datasets.flights.FlightHoloClean import FlightHoloClean
#dataSet = FlightHoloClean()
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
dataSet = HospitalHoloClean()
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
#dataSet = BlackOakDataSetUppercase()

from ml.datasets.salary_data.Salary import Salary
#dataSet = Salary()

from ml.datasets.luna.book.Book import Book
#dataSet = Book()

from ml.datasets.luna.restaurant.Restaurant import Restaurant

#dataSet = Restaurant()
'''
from ml.datasets.synthetic.Synthetic import Synthetic
from ml.datasets.synthetic.ReplaceError import ReplaceError
import warnings

import numpy as np

from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
from ml.tools.dboost.TestDBoost import test_multiple_sizes_mixture

warnings.filterwarnings("ignore", category=DeprecationWarning)

data = HospitalHoloClean()
'''
steps = 100
sizes = [10, 20, 30, 40, 50]
N = 5

test_multiple_sizes_hist(data, steps, N, sizes)
'''

steps = 100
N = 1  #10
labels = 918

nr_rows = int(float(labels) / data.shape[1])
#sizes = np.array([200, 400, 600, 800], dtype=float) # in cells
sizes = np.array([400], dtype=float)  # in cells

print sizes
# rescale the cell budget by the dirty-column fraction and the column count
dirty_column_fraction = data.get_number_dirty_columns() / float(data.shape[1])
sizes /= dirty_column_fraction
sizes /= float(data.shape[1])
print sizes
use_absolute_difference = True  # False == Squared / True == Absolute

enable_plotting = True

classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/7"

dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"

classifier_to_use = XGBoostClassifier
model_for_dataset = BlackOakDataSetUppercase()

datasets = [
    HospitalHoloClean(),
    BlackOakDataSetUppercase(),
    FlightHoloClean(),
    Book(),
    Salary(),
Example n. 16
from ml.tools.dboost.TestDBoost import run_params_gaussian

import time
import numpy as np
import glob
from ml.configuration.Config import Config
import os

mypath = Config.get("logging.folder") + "/out/server_dboost"
mylist = glob.glob(mypath + "/*.txt")

datasets = [
    FlightHoloClean(),
    Beers(),
    BlackOakDataSetUppercase(),
    HospitalHoloClean(),
    Movies(),
    Restaurant(),
    Citation(),
    Salary()
]

N = 1

path_folder = Config.get("logging.folder") + "/out/dboost_runtime"

if not os.path.exists(path_folder):
    os.makedirs(path_folder)

log_file = open(path_folder + '/dboost_runtime' + str(time.time()) + '.csv',
                'w+')
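
The snippet ends right after the runtime log is opened. A hedged sketch of the loop that could follow, with hypothetical CSV columns (dataset name, elapsed seconds); run_params_gaussian is imported above, but its exact signature is not shown here.

for dataset in datasets:
    start = time.time()
    # run_params_gaussian would be invoked here for this dataset
    elapsed = time.time() - start
    log_file.write(dataset.name + "," + str(elapsed) + "\n")

log_file.close()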