Example #1
0
print "datasets used for training:"
for i in range(len(datasets)):
    print datasets[i]

N_datasets = 7
'''




# --- Experiment configuration -------------------------------------------
# Sub-folder of the progress-log directory to read from.
log_folder = "unique_batch" #"unique"

# Dataset under evaluation (alternative dataset kept as a comment for reruns).
dataset = HospitalHoloClean() #BlackOakDataSetUppercase()
# Prediction horizon; dataset-specific values noted in the trailing comment.
future_steps = 60 #BlackOak = 7, Flights = 9

# Number of columns that contain errors in this dataset.
n = dataset.get_number_dirty_columns()

# Best F-score totals / column sequences found so far
# (presumably filled later in the script — not visible in this chunk).
best_sum_total_f = {}
best_col_seq  = {}



# Read the progress log of each of 10 independent runs and report
# a std-dev-based certainty measure plus the training-set size.
for d in range(10):
    # NOTE(review): absolute user-specific path — consider making configurable.
    file_path = "/home/felix/ExampleDrivenErrorDetection/progress_log_data/" + log_folder + "/log_progress_"+ dataset.name +"_" + str(d)  +".csv"
    # read_csv1 appears to return features plus false-positive / false-negative /
    # true-positive counts per step (inferred from names — TODO confirm).
    x, fp, fn, tp = read_csv1(file_path, None)

    # Aggregate certainty (standard deviation based) over all features.
    certainty_sum = get_all_certainty_stddev(x, feature_names)

    print certainty_sum

    # Number of labelled training rows read from this log.
    print "train: " + str(x.shape[0])
Example #2
0
# Take the first 25 entries of sorted_x (assumed pre-sorted (key, value)
# pairs — TODO confirm sort order) and split them into parallel
# label / score lists; `t`, `labels`, `score` are initialised elsewhere.
for key, value in sorted_x:
    labels.append(key)
    score.append(value)
    t += 1
    if t == 25:
        break

# Optionally render the collected scores as a horizontal bar chart.
if enable_plotting:
    ind = np.arange(len(score))
    plt.barh(ind, score, align='center', alpha=0.5)
    plt.yticks(ind, labels)
    plt.show()

# Predict with the final trained model on the prepared feature matrix.
y_pred = final.predict(mat)

# Number of error-containing columns for the evaluated dataset.
nr_columns = model_for_dataset.get_number_dirty_columns()
# Load features / targets from the progress log of the selected classifier.
t_x, t_y = read_csv1(
    classifier_log_paths[classifier_to_use.name] + "/log_progress_" +
    dataset_log_files[model_for_dataset.name] + ".csv", None)

# The last 4 feature columns appear to be "change" features; drop them
# when change features are disabled (TODO confirm the column layout).
if not use_change_features:
    t_x = t_x[:, 0:t_x.shape[1] - 4]

print t_x.shape

# Final per-column target values = the last nr_columns entries of t_y.
endfnew = np.zeros(nr_columns)

for i in range(nr_columns):
    endfnew[i] = t_y[len(t_y) - nr_columns + i]

for i in range(len(t_y)):
from ml.tools.dboost.TestDBoost import test_multiple_sizes_mixture

# Silence deprecation warnings emitted by the underlying libraries.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Dataset used for the dBoost mixture-model experiment.
data = HospitalHoloClean()
# Disabled alternative experiment (histogram-based dBoost over several sizes).
'''
steps = 100
sizes = [10, 20, 30, 40, 50]
N = 5

test_multiple_sizes_hist(data, steps, N, sizes)
'''

# Number of steps per run and number of repetitions.
steps = 100
N = 1  #10
# Labelling budget, in cells.
labels = 918

# NOTE(review): nr_rows is computed but never used in this chunk.
nr_rows = int(float(labels) / data.shape[1])
#sizes = np.array([200, 400, 600, 800], dtype=float) # in cells
sizes = np.array([400], dtype=float)  # in cells

print sizes
# Fraction of columns containing errors; used to scale cell budgets
# up before converting them to row counts.
dirty_column_fraction = data.get_number_dirty_columns() / float(data.shape[1])
sizes /= dirty_column_fraction
sizes /= float(data.shape[1])
print sizes
row_sizes = np.array(sizes, dtype=int)  # in rows

# NOTE(review): absolute user-specific output path — consider making configurable.
log_file = "/home/felix/ExampleDrivenErrorDetection/log/dBoost/Hospital_mix_new.txt"

# Run the mixture-model dBoost evaluation over the row budgets, N times each.
test_multiple_sizes_mixture(data, steps, N, row_sizes, log_file)