# Log directory per classifier name. Every candidate entry below is
# commented out, so the mapping is left empty here.
classifier_log_paths = dict()
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"

#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/unique"

# Log-file stem for each dataset, keyed by the ``name`` attribute of a freshly
# constructed dataset object. Entries are evaluated top to bottom.
dataset_log_files = {
    HospitalHoloClean().name: "hospital",
    BlackOakDataSetUppercase().name: "blackoak",
    FlightHoloClean().name: "flight",
    Book().name: "book",
    Salary().name: "salaries",
    Restaurant().name: "restaurant",
}

# Classifier class and reference dataset selected for this run.
classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

# The bare string literal below is never bound to a name, so nothing inside it
# is executed; it holds disabled code that removes the entry matching
# model_for_dataset from a list of datasets and prints the remainder.
# NOTE(review): the "Example #2" / "0" lines and the repeated statements inside
# the string look like paste artifacts -- confirm before re-enabling any of it.
'''
datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(), Book(), Salary(), Restaurant()]

for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break
Example #2
0
datasets = [HospitalHoloClean(), BlackOakDataSetUppercase(), FlightHoloClean(), Book(), Salary(), Restaurant()]

for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break

print "datasets used for training:"
for i in range(len(datasets)):
    print datasets[i]

N_datasets = 7
'''

# Active dataset for this run; the trailing comment lists the alternatives
# that were swapped in by hand.
dataset = FlightHoloClean(
)  #FlightHoloClean()#BlackOakDataSetUppercase()#HospitalHoloClean() #BlackOakDataSetUppercase()

#future_steps = 60 #BlackOak = 7, Flights = 9


def getConfig(dataset):
    """Return the log directory and ``future_steps`` configured for a dataset.

    Args:
        dataset: Dataset object to look up. Only ``FlightHoloClean``
            instances have a configuration here.

    Returns:
        A ``(path, future_steps)`` tuple. For a ``FlightHoloClean`` dataset
        this is the flights directory and ``20``; for any other dataset it
        is ``(None, -1)``.
    """
    # isinstance() tests against the class itself, so no throwaway
    # FlightHoloClean instance has to be constructed on every call merely to
    # obtain its type.
    if isinstance(dataset, FlightHoloClean):
        return '/home/felix/phd/round_robin_part/flights', 20

    # No configuration is known for any other dataset.
    return None, -1


# Look up the path and future_steps that getConfig() holds for ``dataset``.
mypath, future_steps = getConfig(dataset)
import numpy as np

from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.plot.old.user_effort_all_potential.PlotterLatex import PlotterLatex

# NOTE(review): this builds another FlightHoloClean object under the name
# ``data``; it is not read in the visible code -- confirm it is still needed.
data = FlightHoloClean()

# Label counts: four small initial values, then steps of 10 from 26 up to and
# including 286.
label_potential = [4, 8, 12, 16] + list(range(26, 287, 10))

# Recorded F-score series, one inner list per series.
fscore_metadata_no_svd_more_data = [
    [
        0.0,
        0.0,
        0.0,
        0.0,
        0.3019844693700982,
        0.5517146398566308,
        0.6819393939392727,
        0.8136335209507353,
        0.8129854610634996,
        0.8224170196000095,
        0.8384676145338619,
        0.8636178861788035,
        0.8638716881534007,
        0.8869425148494458,
        0.893910608939216,
        0.8990408385246227,
        0.8884377220586137,
        0.8956337602097001,
        0.897860314896998,
        0.9071071071070146,
        0.910703607733707,
        0.9108178364327478,
        0.9166079871239963,
    ],
    [
        0.0,
        0.0,
        0.0,
        0.0,
        0.30139823925407516,
        0.515855039637531,
        0.6529868868382043,
        0.7706678118960443,
        0.7891252006420769,
        0.7630255697016273,
        0.7966925064600301,
        0.8155689892883575,
        0.8688841419997421,
        0.8693620479479327,
        0.876720526630616,
        0.8844040363671689,
        0.893855848759437,
        0.8996871333594727,
        0.9012019935503657,
        0.9018095520618088,
        0.910505836575904,
        0.9148914891490294,
        0.9183389628453579,
    ],
]
fscore_metadata_no_svd_more_data.append([
            column_id = 0
        return column_id
    else:
        certainty_array = np.zeros(dataSet.shape[1])
        for key, value in certainty.iteritems():
            certainty_array[key] = value

        min_certainty_index = np.argmin(certainty_array)

        return min_certainty_index


#input

# Input dataset selection: FlightHoloClean is active; the Hospital and
# BlackOak assignments below are commented out.
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
dataSet = FlightHoloClean()
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
#dataSet = HospitalHoloClean()
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
#dataSet = BlackOakDataSetUppercase()

#print("read: %s seconds ---" % (time.time() - start_time))

# Timestamp taken once the dataset object exists.
# NOTE(review): ``time`` is not imported in the visible code -- confirm it is
# imported earlier in the file.
start_time = time.time()

# Run settings. None of these names is read within the visible part of this
# file, so their exact effect cannot be confirmed from here.
train_fraction = 1.0
ngrams = 1
runSVD = False
use_metadata = True
user_error_probability = 0.0
step_size = 10
enable_plotting = True

# Log directory per classifier name; only the XGBoost entry is active.
#classifier_log_paths[XGBoostClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/xgboost"
#classifier_log_paths[LinearSVMClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/linearsvm"
#classifier_log_paths[NaiveBayesClassifier.name] = "/home/felix/SequentialPatternErrorDetection/progress_log_data/log_newer/naivebayes"
# Alternatives left on the original XGBoost line:
#"/home/felix/ExampleDrivenErrorDetection/progress_log_data/new_mean_certainty_change_all"#hist_change"
classifier_log_paths = {
    XGBoostClassifier.name: "/home/felix/SequentialPatternErrorDetection/progress_log_data/neweat_backup",
}

# Log-file stem for each dataset, keyed by the ``name`` attribute of a freshly
# constructed dataset object. Entries are evaluated top to bottom.
dataset_log_files = {
    HospitalHoloClean().name: "hospital",
    BlackOakDataSetUppercase().name: "blackoak",
    FlightHoloClean().name: "flight",
    Book().name: "book",
    Salary().name: "salaries",
    Restaurant().name: "restaurant",
}

# Classifier class and reference dataset selected for this run.
classifier_to_use = XGBoostClassifier
model_for_dataset = HospitalHoloClean()

# All known datasets; the entry matching model_for_dataset is dropped below.
datasets = [
    HospitalHoloClean(),
    BlackOakDataSetUppercase(),
    FlightHoloClean(),
    Book(),
    Salary(),
    Restaurant(),
]

# Delete the first dataset whose name equals model_for_dataset's name and stop
# right away, so the list is never iterated after it has been modified.
for i, candidate in enumerate(datasets):
    if candidate.name == model_for_dataset.name:
        del datasets[i]
        break
# Example #6
# 0
for i in range(len(datasets)):
    if datasets[i].name == model_for_dataset.name:
        datasets.pop(i)
        break

print "datasets used for training:"
for i in range(len(datasets)):
    print datasets[i]

N_datasets = 7
'''

#dataset = HospitalHoloClean()
#dataset = BlackOakDataSetUppercase()

# Active dataset; the two alternatives above are commented out.
dataset = FlightHoloClean()

# Value reported by the dataset's get_number_dirty_columns().
n = dataset.get_number_dirty_columns()

# Empty dicts; nothing in the visible code fills them.
# NOTE(review): presumably best F-score totals and the matching column
# sequences -- confirm against the code that writes to them.
best_sum_total_f = {}
best_col_seq = {}


def getConfig(dataset):
    """Return the log directory and ``future_steps`` configured for a dataset.

    Args:
        dataset: Dataset object to look up. Only ``FlightHoloClean``
            instances have a configuration here.

    Returns:
        A ``(path, future_steps)`` tuple. For a ``FlightHoloClean`` dataset
        this is the flights directory and ``4 * 2 + 20`` (28); for any other
        dataset it is ``(None, -1)``.
    """
    # isinstance() tests against the class itself, so no throwaway
    # FlightHoloClean instance has to be constructed on every call merely to
    # obtain its type.
    if isinstance(dataset, FlightHoloClean):
        return '/home/felix/phd/round_robin_part/flights', 4 * 2 + 20

    # No configuration is known for any other dataset.
    return None, -1