Example #1
import pickle
import sys

# assumption: the GUI below uses PyQt4; swap in PyQt5.QtWidgets for PyQt5
from PyQt4.QtGui import QApplication

from ml.configuration.Config import Config
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.datasets.luna.book.Book import Book
from ml.datasets.salary_data.Salary import Salary


def getConfig(dataset):
    # resolve the progress-log path and forecast horizon for the given dataset
    path = None
    future_steps = -1
    if isinstance(dataset, FlightHoloClean):
        path = '/home/felix/phd/round_robin_part/flights'
        future_steps = 30

    return path, future_steps


def load_model(dataSet, classifier):
    # map each dataset to the file prefix its models were trained under
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet trained on their own data
    dataset_log_files[Salary().name] = "hospital"  # be careful
    dataset_log_files[Book().name] = "hospital"  # be careful

    potential_model_dir = Config.get("column.potential.models")

    def load(prefix):
        # open in binary mode and close the handle once the model is unpickled
        path = (potential_model_dir + "/" + prefix +
                dataset_log_files[dataSet.name] + "_" + classifier.name + ".p")
        with open(path, "rb") as f:
            return pickle.load(f)

    tp_model = load("tp_model")
    fpfn_model = load("fpfn_model")
    delta_tp_model = load("delta_tp_model")
    delta_fpfn_model = load("delta_fpfn_model")

    return tp_model, fpfn_model, delta_tp_model, delta_fpfn_model
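
A minimal usage sketch, assuming the four pickles were produced by an earlier training run (XGBoostClassifier is the classifier class used in the later examples):

# hypothetical usage of the loader above
tp_model, fpfn_model, delta_tp_model, delta_fpfn_model = load_model(
    FlightHoloClean(), XGBoostClassifier)
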
def main():
    app = QApplication(sys.argv)

    #data = BlackOakDataSetUppercase()
    data = FlightHoloClean()

    ex = Example(data)  # Example is the GUI widget defined elsewhere in this project
    sys.exit(app.exec_())


if __name__ == '__main__':
    main()


def load_model(dataSet, classifier):
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"

    potential_model_dir = Config.get("column.potential.models")

    path = (potential_model_dir + "/model" + dataset_log_files[dataSet.name] +
            "_" + classifier.name + ".p")
    with open(path, "rb") as f:  # binary mode; the handle is closed after loading
        return pickle.load(f)
Example #5
def load_model(dataSet):
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    #dataset_log_files[Salary().name] = "hospital"  # be careful
    #dataset_log_files[Book().name] = "hospital"  # be careful

    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/unique_false_current_hist'

    path = (potential_model_dir + "/model" + dataset_log_files[dataSet.name] +
            "_XGBoost.p")
    with open(path, "rb") as f:  # binary mode for pickle
        return pickle.load(f)
def load_model(dataSet, classifier):
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet trained on their own data
    dataset_log_files[Salary().name] = "hospital"  # be careful
    dataset_log_files[Book().name] = "hospital"  # be careful

    #potential_model_dir = Config.get("column.potential.models")
    potential_model_dir = "/home/felix/ExampleDrivenErrorDetection/potential models/classification"

    path = (potential_model_dir + "/model" + dataset_log_files[dataSet.name] +
            "_" + classifier.name + ".p")
    with open(path, "rb") as f:  # binary mode for pickle
        return pickle.load(f)
def load_model(dataSet):
    dataset_log_files = {}
    dataset_log_files[HospitalHoloClean().name] = "hospital"
    dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
    dataset_log_files[FlightHoloClean().name] = "flight"
    # not yet
    # dataset_log_files[Salary().name] = "hospital"  # be careful
    # dataset_log_files[Book().name] = "hospital"  # be careful

    #potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/current_total_f'
    potential_model_dir = '/home/felix/ExampleDrivenErrorDetection/potential models/simulation100data'

    def load(prefix):
        # binary mode, and the file handle is closed after unpickling
        with open(potential_model_dir + "/" + prefix + "_XGBoost.p", "rb") as f:
            return pickle.load(f)

    tp_model = load("tp_model")
    fp_model = load("fp_model")
    fn_model = load("fn_model")

    return tp_model, fp_model, fn_model
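
A minimal usage sketch for this three-model variant, assuming the XGBoost pickles exist in the simulation directory:

# hypothetical usage; any dataset mapped above works here
tp_model, fp_model, fn_model = load_model(FlightHoloClean())
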
Example #8
classifier_log_paths = {}
classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
#dataset_log_files[Salary().name] = "salary1"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean()]

for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break
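
The same hold-one-out filtering can be done without index bookkeeping; a sketch of the equivalent list comprehension:

# equivalent to the pop-and-break loop above
datasets = [d for d in datasets if d.name != model_for_dataset.name]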

print "datasets used for training:"
for i in range(len(datasets)):
    print datasets[i]
Example #9
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.tools.katara_new.Katara import Katara

data = FlightHoloClean()

#data.dirty_pd.to_csv('/tmp/data.csv', index=False)

tool = Katara(
    "/home/felix/ExampleDrivenErrorDetection/data/katara/flights.txt", data)

print "Fscore: " + str(tool.calculate_total_fscore())
print "Precision: " + str(tool.calculate_total_precision())
print "Recall: " + str(tool.calculate_total_recall())

for c in range(data.shape[1]):
    print tool.calculate_fscore_by_column(c)
Example #10
#log_folder = "unique_batch"
#log_folder = "bart/fd1/20percent"
#log_folder = "word_unigrams"
#log_folder = "unigrams"
#log_folder = "bigrams"
#log_folder = "metadata"
#log_folder = "unique_batch"
#log_folder = "unigram_metadata_naivebayes"
#log_folder = "unigram_metadata_linearsvm"
#log_folder = "food"
log_folder = "deep_all"

#dataset = FoodHoloClean()
#dataset = BlackOakDataSetUppercase()
#dataset = HospitalHoloClean()
dataset = FlightHoloClean()
#future_steps = 60  # BlackOak = 7, Flights = 9
'''
from ml.datasets.BartDataset.BartDataSet import BartDataset
dataset = BartDataset(BlackOakDataSetUppercase(), "CityFD_20percent")
'''
future_steps = 20  # 60

n = dataset.get_number_dirty_columns()

best_sum_total_f = {}
best_col_seq = {}

for d in range(10):
    file_path = ("/home/felix/ExampleDrivenErrorDetection/progress_log_data/" +
                 log_folder + "/log_progress_" + dataset.name + "_" + str(d) + ".csv")
Example #11
cutting = True

use_potential = False

classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/unique"

dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"

classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [
    HospitalHoloClean(),
    BlackOakDataSetUppercase(),
    FlightHoloClean(),
    Book(),
    Salary(),
    Restaurant()
]
Example #12

            certainty_array[index_i] = np.square(1.0 - value)
            indexmap[index_i] = key
            index_i += 1

        prob = normalize(certainty_array)

        print certainty_array
        print prob

        return indexmap[np.random.choice(len(prob), 1, p=prob)[0]]
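
The normalize helper used above is not shown in this excerpt; a minimal sketch, assuming it only rescales the non-negative weights into a probability vector for np.random.choice:

import numpy as np

def normalize(weights):
    # assumed helper: rescale non-negative weights so they sum to 1
    weights = np.asarray(weights, dtype=float)
    total = weights.sum()
    if total == 0.0:
        # uniform fallback when every weight is zero
        return np.ones_like(weights) / len(weights)
    return weights / total
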


#input

from ml.datasets.flights.FlightHoloClean import FlightHoloClean
dataSet = FlightHoloClean()
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
#dataSet = HospitalHoloClean()
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
#dataSet = BlackOakDataSetUppercase()

#print("read: %s seconds ---" % (time.time() - start_time))

start_time = time.time()

train_fraction = 1.0
ngrams = 1
runSVD = False
use_metadata = True
user_error_probability = 0.0
step_size = 10
Example #13
feature_names = [
    i for j, i in enumerate(feature_names) if j in which_features_to_use
]

use_absolute_difference = True  # False == Squared / True == Absolute

enable_plotting = True

cutting = True

use_potential = False

classifier_log_paths = {}

dataset = FlightHoloClean()


def getConfig(dataset):
    # resolve the progress-log path and forecast horizon for the given dataset
    path = None
    future_steps = -1
    if isinstance(dataset, FlightHoloClean):
        path = '/home/felix/phd/round_robin_part/flights'
        future_steps = 4 * 2 + 20

    return path, future_steps


mypath, future_steps = getConfig(dataset)

n = dataset.get_number_dirty_columns()
Example #14
import numpy as np
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.datasets.products.Products import Products
from ml.datasets.luna.book.Book import Book
from ml.datasets.electronics.Electronics import Electronics
from ml.datasets.salary_data.Salary import Salary
import pandas as pd
import csv
from ml.data_generator.generate_bart_config import generate_bart_config
from shutil import copyfile

datasets = [
    BlackOakDataSetUppercase().clean_pd.values,
    FlightHoloClean().clean_pd.values,
    Salary().clean_pd.values,
    Electronics().clean_pd.values,
    Book().clean_pd.values,
    Products().clean_pd.values
]

for n in range(1000):
    # select dataset
    dataset_id = np.random.randint(len(datasets))
    dataset = datasets[dataset_id]

    # select number of rows (at most 2000, capped by the dataset size)
    max_rows = min(2000, dataset.shape[0])
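
A minimal sketch of how the sampling could continue from here, assuming uniform row sampling without replacement (row_indices and sample are hypothetical names):

    # hypothetical continuation: draw max_rows distinct rows from the chosen dataset
    row_indices = np.random.choice(dataset.shape[0], max_rows, replace=False)
    sample = dataset[row_indices, :]
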
Example #15
from ml.tools.dboost.TestDBoost import run_params_mixture
from ml.tools.dboost.TestDBoost import run_params_hist
from ml.tools.dboost.TestDBoost import run_params_gaussian

import time
import numpy as np
import glob
from ml.configuration.Config import Config
import os

mypath = Config.get("logging.folder") + "/out/server_dboost"
mylist = glob.glob(mypath + "/*.txt")

datasets = [
    FlightHoloClean(),
    Beers(),
    BlackOakDataSetUppercase(),
    HospitalHoloClean(),
    Movies(),
    Restaurant(),
    Citation(),
    Salary()
]

N = 1

path_folder = Config.get("logging.folder") + "/out/dboost_runtime"

if not os.path.exists(path_folder):
    os.makedirs(path_folder)
Example #16

enable_plotting = True

classifier_log_paths = {}
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/new_mean_certainty_change_all"  # hist_change
classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/neweat_backup"

dataset_log_files = {}
dataset_log_files[HospitalHoloClean().name] = "hospital"
dataset_log_files[BlackOakDataSetUppercase().name] = "blackoak"
dataset_log_files[FlightHoloClean().name] = "flight"
dataset_log_files[Book().name] = "book"
dataset_log_files[Salary().name] = "salaries"
dataset_log_files[Restaurant().name] = "restaurant"


classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

datasets = [
    HospitalHoloClean(),
    BlackOakDataSetUppercase(),
    FlightHoloClean(),
    Book(),
    Salary(),
    Restaurant()
]

for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break