def __init__(self): path_to_dirty = Config.get( "datapool.folder") + "/HOSP_HoloClean/dirty/hospital_input.csv" path_to_clean = Config.get( "datapool.folder" ) + "/HOSP_HoloClean/ground-truth/hospital_clean.csv" dirty_wrong_format = pd.read_csv(path_to_dirty, header=0, dtype=object, na_filter=False) clean_wrong_format = pd.read_csv(path_to_clean, header=0, dtype=object, na_filter=False) columns = np.unique(dirty_wrong_format['attribute'].values) #print(len(columns)) dirty_pd, mapColumns = self.to_matrix(dirty_wrong_format, columns) clean_pd = self.correct_dirty(mapColumns, dirty_pd, clean_wrong_format) #print(dirty_pd.head()) super(HospitalMoreCol, self).__init__(HospitalMoreCol.name, dirty_pd, clean_pd)
def __init__(self): path_to_dirty = Config.get("blackoak.data") + "/inputDB.csv" path_to_clean = Config.get("blackoak.data") + "/groundDB.csv" dirty_pd = pd.read_csv(path_to_dirty, header=0, dtype=object) clean_pd = pd.read_csv(path_to_clean, header=0, dtype=object) super(BlackOakDataSet, self).__init__("BlackOak", dirty_pd, clean_pd)
def __init__(self): clean_df = pd.read_csv(Config.get("datapool.folder") + '/movies/rotten_tomatoes.csv', header=0, dtype=object, na_filter=False) dirty_df = pd.read_csv(Config.get("datapool.folder") + '/movies/dirty.csv', header=0, dtype=object, na_filter=False) super(Movies, self).__init__("Movies", dirty_df, clean_df)
def __init__(self): clean_df = pd.read_csv( Config.get("datapool.folder") + '/SALARIES/salaries_small/salaries-1_with_id.csv', header=0, dtype=object, na_filter=False) dirty_df = pd.read_csv( Config.get("datapool.folder") + '/SALARIES/salaries_small/dirty/dirty_salaries-1_with_id.csv', header=0, dtype=object, na_filter=False) super(Salaries, self).__init__("Movies", dirty_df, clean_df)
def run_katara(data):
    ts = time.time()
    tmp_katara_out = path_folder_tmp + "/katara_time_" + str(ts) + "_" + \
                     str(random.randint(1, 100000)) + "_KATARA_" + ".txt"
    dirty_dataset = path_folder_tmp + '/dirty_dataset_' + str(ts) + '_' + \
                    str(random.randint(1, 100000)) + '.csv'

    # uppercase every value before handing the CSV to KATARA
    dirty_df = data.dirty_pd.copy()
    for column_i in range(dirty_df.shape[1]):
        dirty_df[dirty_df.columns[column_i]] = \
            dirty_df[dirty_df.columns[column_i]].apply(lambda x: x.upper())
    dirty_df.to_csv(dirty_dataset, index=False, encoding='utf-8')

    start_time = time.time()
    command = "cd " + Config.get("abstractionlayer.folder") + "/\n" + \
              "python2 cleaning_api.py " + dirty_dataset + " " + tmp_katara_out
    print(command)
    os.system(command)

    return_dict = {}
    return_dict['output'] = tmp_katara_out
    return_dict['time'] = time.time() - start_time
    return return_dict
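# Hedged usage sketch for run_katara; assumes a dataset object exposing a
# string-valued dirty_pd, as used above:
#   result = run_katara(FlightHoloClean())
#   print("KATARA output file: " + result['output'])
#   print("runtime in seconds: " + str(result['time']))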
def __init__(self): path_to_dirty = Config.get("datapool.folder") + "/FOOD_HoloClean/dirty/food_input.csv" path_to_clean = Config.get("datapool.folder") + "/FOOD_HoloClean/corrected_values/labeled_food.csv" dirty_wrong_format = pd.read_csv(path_to_dirty, header=0, dtype=object) clean_wrong_format = pd.read_csv(path_to_clean, header=0, dtype=object) columns = np.unique(dirty_wrong_format['attribute'].values) dirty_pd, mapColumns = self.to_matrix(dirty_wrong_format, columns) clean_pd = self.correct_dirty(mapColumns, dirty_pd, clean_wrong_format) #print(dirty_pd.head()) super(FoodsHoloClean, self).__init__(FoodsHoloClean.name, dirty_pd, clean_pd)
def __init__(self): clean_df = pd.read_csv(Config.get("datapool.folder") + '/Beers_Mohammad/clean.csv', header=0, dtype=object, na_filter=False) dirty_df = pd.read_csv(Config.get("datapool.folder") + '/Beers_Mohammad/dirty.csv', header=0, dtype=object, na_filter=False) clean_df = clean_df.drop('ounces', 1) dirty_df = dirty_df.drop('ounces', 1) super(Beers, self).__init__("Beers", dirty_df, clean_df)
def run_gaussian_stat(gaussian, statistical,
                      sample_file="/tmp/data_sample.csv",
                      result_file="/tmp/dboostres.csv"):
    command = "python3 " + Config.get("dboost.py") + \
              " -F ',' --gaussian " + str(gaussian) + \
              " --statistical " + str(statistical) + \
              " '" + sample_file + "' > '" + result_file + "'"
    os.system(command)
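# Worked example of the command string assembled above: with dboost.py
# configured as /opt/dboost/dboost_stdin.py (hypothetical path),
# run_gaussian_stat(1.5, 0.5) executes:
#   python3 /opt/dboost/dboost_stdin.py -F ',' --gaussian 1.5 --statistical 0.5 '/tmp/data_sample.csv' > '/tmp/dboostres.csv'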
def __init__(self): path_to_dirty = Config.get( "datapool.folder" ) + "/SALARIES/salaries_full/dirty/dirty_salaries_full_with_id.csv" path_to_clean = Config.get( "datapool.folder") + "/SALARIES/salaries_full/salaries_with_id.csv" dirty_pd = pd.read_csv(path_to_dirty, header=0, dtype=object, error_bad_lines=False, na_filter=False) clean_pd = pd.read_csv(path_to_clean, header=0, dtype=object, error_bad_lines=False, na_filter=False) dirty_pd = dirty_pd.sort_values(['oid', 'id'], ascending=[1, 1]) clean_pd = clean_pd.sort_values(['oid', 'id'], ascending=[1, 1]) dirty_pd = dirty_pd[dirty_pd['oid'].isin(clean_pd['oid'].unique())] clean_pd = clean_pd[clean_pd['oid'].isin(dirty_pd['oid'].unique())] dirty_pd.drop('notes', axis=1, inplace=True) clean_pd.drop('notes', axis=1, inplace=True) dirty_pd = dirty_pd.reset_index(drop=True) clean_pd = clean_pd.reset_index(drop=True) assert np.all(dirty_pd['oid'] == clean_pd['oid']) assert np.all(dirty_pd['id'] == clean_pd['id']) assert np.all(dirty_pd['employeename'] == clean_pd['employeename']) assert np.all(dirty_pd['jobtitle'] == clean_pd['jobtitle']) assert np.all(dirty_pd['overtimepay'] == clean_pd['overtimepay']) assert np.all(dirty_pd['otherpay'] == clean_pd['otherpay']) assert np.all(dirty_pd['benefits'] == clean_pd['benefits']) assert np.all( dirty_pd['totalpaybenefits'] == clean_pd['totalpaybenefits']) assert np.all(dirty_pd['year'] == clean_pd['year']) assert np.all(dirty_pd['agency'] == clean_pd['agency']) assert np.all(dirty_pd['status'] == clean_pd['status']) super(Salary, self).__init__("Salary", dirty_pd, clean_pd)
def __init__(self): path_to_dirty = Config.get("datapool.folder") + "/HOSP_HoloClean/dirty/hospital_input.csv" path_to_clean = Config.get("datapool.folder") + "/HOSP_HoloClean/ground-truth/hospital_clean.csv" dirty_wrong_format = pd.read_csv(path_to_dirty, header=0, dtype=object) clean_wrong_format = pd.read_csv(path_to_clean, header=0, dtype=object) dirty_pd = self.to_matrix(dirty_wrong_format) clean_pd = self.to_matrix(clean_wrong_format) # remove empty columns dirty_pd = dirty_pd.drop(['address2', 'address3'], 1) clean_pd = clean_pd.drop(['address2', 'address3'], 1) #dirty_pd.to_csv('hospital.csv', index=False) #clean_pd.to_csv('hospital_clean.csv', index=False) super(HospitalHoloClean, self).__init__(HospitalHoloClean.name, dirty_pd, clean_pd)
def run_histogram_stat(peak, outlier, statistical,
                       sample_file="/tmp/data_sample.csv",
                       result_file="/tmp/dboostres.csv"):
    command = "python3 " + Config.get("dboost.py") + \
              " -F ',' --histogram " + str(peak) + " " + str(outlier) + \
              " --statistical " + str(statistical) + \
              " '" + sample_file + "' > '" + result_file + "'"
    os.system(command)
def run_mixture_stat(n_subpops, threshold, statistical,
                     sample_file="/tmp/data_sample.csv",
                     result_file="/tmp/dboostres.csv"):
    command = "python3 -W ignore " + Config.get("dboost.py") + \
              " -F ',' --mixture " + str(n_subpops) + " " + str(threshold) + \
              " --statistical " + str(statistical) + \
              " '" + sample_file + "' > '" + result_file + "'"
    os.system(command)
def __init__(self): path_to_dirty = Config.get( "datapool.folder") + "/FLIGHTS_HoloClean/dirty/flights_input.csv" path_to_clean = Config.get( "datapool.folder" ) + "/FLIGHTS_HoloClean/ground-truth/flights_clean.csv" dirty_wrong_format = pd.read_csv(path_to_dirty, header=0, dtype=object) clean_wrong_format = pd.read_csv(path_to_clean, header=0, dtype=object) dirty_pd = self.to_matrix(dirty_wrong_format) clean_pd = self.to_matrix(clean_wrong_format) dirty_pd = dirty_pd.sort_values(['flight', 'src'], ascending=[1, 1]) clean_pd = clean_pd.sort_values(['flight', 'src'], ascending=[1, 1]) assert np.all(dirty_pd['flight'] == clean_pd['flight']) assert np.all(dirty_pd['src'] == clean_pd['src']) super(FlightHoloClean, self).__init__(FlightHoloClean.name, dirty_pd, clean_pd)
def install_tools(): """ This method installs and configures the data cleaning tools. """ for tool in os.listdir(TOOLS_FOLDER): if tool == "NADEEF": p = subprocess.Popen(["ant", "all"], cwd="{}/NADEEF".format(TOOLS_FOLDER), stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) p.communicate() postgress_username = Config.get("nadeef.db.user") postgress_password = Config.get("nadeef.db.password") nadeef_configuration_file = open("{}/NADEEF/nadeef.conf".format(TOOLS_FOLDER), "r") nadeef_configuration = nadeef_configuration_file.read() nadeef_configuration = re.sub("(database.username = )([\w\d]+)", "\g<1>{}".format(postgress_username), nadeef_configuration, flags=re.IGNORECASE) nadeef_configuration = re.sub("(database.password = )([\w\d]+)", "\g<1>{}".format(postgress_password), nadeef_configuration, flags=re.IGNORECASE) nadeef_configuration_file.close() nadeef_configuration_file = open("{}/NADEEF/nadeef.conf".format(TOOLS_FOLDER), "w") nadeef_configuration_file.write(nadeef_configuration) print "{} is installed.".format(tool)
def search_histogram_stat(data, data_sample, data_sample_ground_truth,
                          sample_file, result_file,
                          peak_s, outlier_s, statistical_range,
                          write_out=False):
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0

    for p in peak_s:
        for o in outlier_s:
            for s in statistical_range:
                run_histogram_stat(p, o, s, sample_file, result_file)

                our_sample_data = DataSetBasic(
                    data.name + " random" + str(data_sample.shape[0]),
                    data_sample, data_sample_ground_truth)

                run = DBoostMe(our_sample_data, result_file)

                current_fscore = run.calculate_total_fscore()
                current_precision = run.calculate_total_precision()
                current_recall = run.calculate_total_recall()

                if write_out:
                    run.write_detected_matrix(
                        Config.get("logging.folder") + "/out/dboost" +
                        '/dboost_histogram_' + data.name + '_peak' + str(p) +
                        '_outlier_' + str(o) + '_stat_' + str(s) + '.npy')

                print("peak: " + str(p) + " outlier: " + str(o) + " --statistical " + str(s))
                print("Fscore: " + str(current_fscore))
                print("Precision: " + str(current_precision))
                print("Recall: " + str(current_recall))

                if current_fscore >= best_fscore:
                    best_fscore = current_fscore
                    precision = current_precision
                    recall = current_recall
                    best_params['peak'] = p
                    best_params['outlier'] = o
                    best_params['statistical'] = s

    return best_params, best_fscore, precision, recall
def generate_dBoost_result_file_name(model, data, parameter_grid_dict, keys):
    path = Config.get("logging.folder") + "/out/dboost_results"
    if not os.path.exists(path):
        os.makedirs(path)

    dBoost_result = path + "/dboost_" + str(model.__name__) + "_" + str(data.name)
    for p_i in range(len(keys)):
        dBoost_result += '_' + str(keys[p_i]) + '_' + str(parameter_grid_dict[keys[p_i]])
    dBoost_result += '.npy'
    return dBoost_result
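# Worked example, following the string building above: for
# model=test_multiple_sizes_gaussian, data.name='Beers',
# keys=['gaussian', 'statistical'] and
# parameter_grid_dict={'gaussian': 1.5, 'statistical': 0.5}, the result is
# <logging.folder>/out/dboost_results/dboost_test_multiple_sizes_gaussian_Beers_gaussian_1.5_statistical_0.5.npy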
def search_mixture_stat(data, data_sample, data_sample_ground_truth,
                        sample_file, result_file,
                        n_subpops_s, threshold_s, statistical_range,
                        write_out=False):
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0

    for p in n_subpops_s:
        for t in threshold_s:
            for s in statistical_range:
                run_mixture_stat(p, t, s, sample_file, result_file)

                our_sample_data = DataSetBasic(
                    data.name + " random" + str(data_sample.shape[0]),
                    data_sample, data_sample_ground_truth)

                run = DBoostMe(our_sample_data, result_file)

                current_fscore = run.calculate_total_fscore()
                current_precision = run.calculate_total_precision()
                current_recall = run.calculate_total_recall()

                if write_out:
                    run.write_detected_matrix(
                        Config.get("logging.folder") + "/out/dboost" +
                        '/dboost_' + data.name + '_mixture_subpop' + str(p) +
                        '_threshold_' + str(t) + '_stat_' + str(s) + '.npy')

                print("n_subpops: " + str(p) + " threshold: " + str(t) + " --statistical " + str(s))
                print("Fscore: " + str(current_fscore))
                print("Precision: " + str(current_precision))
                print("Recall: " + str(current_recall))

                if current_fscore >= best_fscore:
                    best_fscore = current_fscore
                    precision = current_precision
                    recall = current_recall
                    best_params['n_subpops'] = p
                    best_params['threshold'] = t
                    best_params['statistical'] = s

    return best_params, best_fscore, precision, recall
def visualize_model(dataSet, column_id, final_gb, feature_name_list, train, target_run, res):
    try:
        column_name = dataSet.clean_pd.columns[column_id]

        feature_name_list_err_corr = list(feature_name_list)
        print("missing features: " + str(len(final_gb[column_id].feature_names) - len(feature_name_list)))

        # append names for the error-correlation features that the model has
        # but the base feature list lacks
        if len(final_gb[column_id].feature_names) - len(feature_name_list) > 0:
            for err_corr_id in range(dataSet.shape[1]):
                if dataSet.is_column_applicable(err_corr_id) and err_corr_id != column_id:
                    feature_name_list_err_corr.append(
                        "error_corr_" + str(dataSet.clean_pd.columns[err_corr_id]))

        directory = Config.get("logging.folder") + '/out/html/' + dataSet.name
        if not os.path.exists(directory):
            os.makedirs(directory)

        path = directory + '/' + str(column_name) + '_' + \
               str(train[column_id].shape[0]) + '_' + str(time.time()) + '.html'

        table_content = show_weights(final_gb[column_id],
                                     feature_names=feature_name_list_err_corr,
                                     importance_type="gain").data

        from ml.VisualizeSVD import replace_with_url
        table_content = replace_with_url(table_content, dataSet)

        html = "<h1>" + str(column_name) + "</h1>"
        html += "<h2>number of labels: " + str(train[column_id].shape[0]) + "</h2>"
        html += "<h2>F-Score: " + str(f1_score(target_run, res[column_id])) + "</h2>"
        html += str(table_content)

        with open(path, 'w') as webf:
            webf.write(html)
    except jinja2.exceptions.UndefinedError:
        print(format_as_text(explain_weights(final_gb[column_id],
                                             feature_names=feature_name_list)))
def __init__(self, duplicate_factor=1):
    path_to_dirty = Config.get("blackoak.data") + "/inputDB.csv"
    path_to_clean = Config.get("blackoak.data") + "/groundDB.csv"

    dirty_pd_init = pd.read_csv(path_to_dirty, header=0, dtype=object, na_filter=False)
    clean_pd_init = pd.read_csv(path_to_clean, header=0, dtype=object, na_filter=False)

    dirty_pd = self.uppercase(dirty_pd_init)
    clean_pd = self.uppercase(clean_pd_init)

    # replicate the dataset duplicate_factor times
    duplicated_clean = clean_pd.copy(deep=True)
    duplicated_dirty = dirty_pd.copy(deep=True)
    for i in range(duplicate_factor - 1):
        duplicated_dirty = duplicated_dirty.append(dirty_pd.copy(deep=True), ignore_index=True)
        duplicated_clean = duplicated_clean.append(clean_pd.copy(deep=True), ignore_index=True)

    super(BlackOakDataSetUppercase, self).__init__(BlackOakDataSetUppercase.name,
                                                   duplicated_dirty, duplicated_clean)
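# Hedged usage sketch: duplicate_factor replicates the full dataset, so
#   data = BlackOakDataSetUppercase(duplicate_factor=2)
# yields twice as many rows as BlackOakDataSetUppercase() (the loop appends
# duplicate_factor - 1 extra copies).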
def search_gaussian_stat(data, data_sample, data_sample_ground_truth,
                         sample_file, result_file,
                         gaussian_range, statistical_range,
                         write_out=False):
    best_params = {}
    best_fscore = 0.0
    precision = 0.0
    recall = 0.0

    for g in gaussian_range:
        for s in statistical_range:
            run_gaussian_stat(g, s, sample_file, result_file)

            our_sample_data = DataSetBasic(
                data.name + " random" + str(data_sample.shape[0]),
                data_sample, data_sample_ground_truth)

            run = DBoostMe(our_sample_data, result_file)

            current_fscore = run.calculate_total_fscore()
            current_precision = run.calculate_total_precision()
            current_recall = run.calculate_total_recall()

            if write_out:
                run.write_detected_matrix(
                    Config.get("logging.folder") + "/out/dboost" +
                    '/dboost_gaussian_' + data.name + '_gaussian' + str(g) +
                    '_stat_' + str(s) + '.npy')

            print("--gaussian " + str(g) + " --statistical " + str(s))
            print("Fscore: " + str(current_fscore))
            print("Precision: " + str(current_precision))
            print("Recall: " + str(current_recall))

            if current_fscore >= best_fscore:
                best_fscore = current_fscore
                precision = current_precision
                recall = current_recall
                best_params['gaussian'] = g
                best_params['statistical'] = s

    return best_params, best_fscore, precision, recall
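# Illustrative grid-search call; the ranges are example values, not tuned:
#   best_params, fscore, precision, recall = search_gaussian_stat(
#       data, data_sample, data_sample_ground_truth,
#       "/tmp/data_sample.csv", "/tmp/dboostres.csv",
#       gaussian_range=[1.0, 1.5, 2.0, 2.5], statistical_range=[0.5])
#   print(best_params)  # e.g. {'gaussian': 2.0, 'statistical': 0.5}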
def __init__(self, data):
    # replace spaces in the column names before exporting the data
    new_columns = []
    for col_i in range(len(data.dirty_pd.columns)):
        new_columns.append(data.dirty_pd.columns[col_i].replace(" ", "_"))
    data.dirty_pd.columns = new_columns
    print(data.dirty_pd.columns)

    data.dirty_pd.to_csv('/tmp/data.csv', index=False)

    run_input = {
        "dataset": {
            "type": "csv",
            # note: hard-coded path that does not point at the
            # /tmp/data.csv written above
            "param": ["/home/felix/abstractionlayer/datasets/hosp_holoclean.csv"]
        },
        "tool": {
            "name": "katara",
            "param": [Config.get("abstractionlayer.tools") + "/KATARA/dominSpecific"]
        }
    }

    matrix_detected = np.zeros(data.shape)
    results_list = run_data_cleaning_job(run_input)
    for x in results_list:
        print(x)
        # returned row ids are shifted by one relative to the matrix
        matrix_detected[x[0] - 1, x[1]] = True

    super(KATARA, self).__init__("KATARA_me", data, matrix_detected)
def run_dboost(dBoost, data, defined_range_labeled_cells, steps, N):
    ts = time.time()
    path_folder = Config.get("logging.folder") + "/out/dboost_interval"
    if not os.path.exists(path_folder):
        os.makedirs(path_folder)

    log_file = path_folder + "/" + str(data.name) + "_time_" + str(ts) + \
               "_dBoost_" + dBoost.func_name + ".txt"

    # convert a budget of labeled cells into a number of rows
    sizes = np.array(defined_range_labeled_cells, dtype=float)  # in cells
    dirty_column_fraction = data.get_number_dirty_columns() / float(data.shape[1])
    sizes /= dirty_column_fraction  # scale from dirty columns to all columns
    sizes /= float(data.shape[1])   # cells to rows
    row_sizes = np.array(sizes, dtype=int)  # in rows

    avg_times, avg_fscores, avg_precision, avg_recall, \
        std_fscores, std_precision, std_recall = dBoost(data, steps, N, row_sizes, log_file)

    toLatex(defined_range_labeled_cells, avg_times, avg_fscores, avg_precision,
            avg_recall, std_fscores, std_precision, std_recall, log_file)
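# Worked example of the cells-to-rows conversion above: for a dataset with
# data.shape = (1000, 10) and 4 dirty columns (dirty_column_fraction = 0.4),
# a budget of 100 labeled cells becomes 100 / 0.4 = 250 cells spread over all
# columns, which is 250 / 10 = 25 rows handed to dBoost.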
from ml.datasets.flights.FlightHoloClean import FlightHoloClean  # used below; module path assumed
from ml.datasets.MoviesMohammad.Movies import Movies
from ml.datasets.RestaurantMohammad.Restaurant import Restaurant
from ml.datasets.BeersMohammad.Beers import Beers
from ml.datasets.Citations.Citation import Citation
from ml.datasets.salary_data.Salary import Salary
import time
from ml.tools.dboost.TestDBoost import test_multiple_sizes_hist
from ml.tools.dboost.TestDBoost import test_multiple_sizes_gaussian
from ml.tools.dboost.TestDBoost import test_multiple_sizes_mixture
from ml.configuration.Config import Config
import numpy as np
import os

path_folder = Config.get("logging.folder") + "/out/dboost"
if not os.path.exists(path_folder):
    os.makedirs(path_folder)

#data_list = [FlightHoloClean, BlackOakDataSetUppercase, HospitalHoloClean, Restaurant, Movies, Beers, Citation]
data_list = [FlightHoloClean]

steps = 100
N = 1

dBoost_methods = [test_multiple_sizes_mixture]

for dataset in data_list:
    data = dataset()
    rows_number = data.shape[0]
# the first four imports below are required by data_list; the flights module
# path is assumed from the package layout used elsewhere in this repo
from ml.datasets.flights.FlightHoloClean import FlightHoloClean
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
from ml.datasets.MoviesMohammad.Movies import Movies
from ml.datasets.RestaurantMohammad.Restaurant import Restaurant
from ml.datasets.BeersMohammad.Beers import Beers
from ml.datasets.Citations.Citation import Citation
from ml.datasets.salary_data.Salary import Salary
from ml.active_learning.classifier.XGBoostClassifier import XGBoostClassifier
from ml.active_learning.classifier.LinearSVMClassifier import LinearSVMClassifier
from ml.active_learning.classifier.NaiveBayesClassifier import NaiveBayesClassifier
import numpy as np
from ml.configuration.Config import Config
import os
import time

path_folder = Config.get("logging.folder") + "/out/model"
if not os.path.exists(path_folder):
    os.makedirs(path_folder)

data_list = [FlightHoloClean, BlackOakDataSetUppercase, HospitalHoloClean,
             Movies, Restaurant, Citation, Beers, Salary]

classifiers = [XGBoostClassifier, LinearSVMClassifier, NaiveBayesClassifier]

parameters = []
my_array = []

for dataset in data_list:
    data = dataset()
    for classifier in classifiers:
from sets import Set
from ml.datasets.BeersMohammad.Beers import Beers
from ml.tools.nadeef_detect.FD import FD
from ml.tools.nadeef_detect.UDF import UDF
from ml.tools.nadeef_detect.NadeefDetect import NadeefDetect
from ml.configuration.Config import Config
import os
import time

path_folder = Config.get("logging.folder") + "/out/nadeef"
if not os.path.exists(path_folder):
    os.makedirs(path_folder)

# according to FUN and fdmine, there are no perfect FDs;
# according to HyFD, only ID columns are involved in FDs
data = Beers()

# rename the first column to 'anid' in both versions
my_list = list(data.clean_pd.columns)
my_list[0] = 'anid'
data.clean_pd.columns = my_list
data.dirty_pd.columns = my_list

rules = []
#rules.append(UDF('ounces', 'value.length() > 4'))
rules.append(UDF('ibu', 'value.equals("N/A")'))
rules.append(UDF('abv', '(value != null && !isNumeric(value))'))
from ml.active_learning.classifier.XGBoostClassifier import XGBoostClassifier
from ml.active_learning.classifier.LinearSVMClassifier import LinearSVMClassifier
from ml.active_learning.classifier.NaiveBayesClassifier import NaiveBayesClassifier
import numpy as np
from ml.configuration.Config import Config
import os
import time

from ml.datasets.food.FoodsHoloClean import FoodsHoloClean
from ml.datasets.adult.Adult import Adult
from ml.datasets.soccer.Soccer import Soccer
from ml.datasets.hospital.HospitalMoreCol import HospitalMoreCol

path_folder = Config.get("logging.folder") + "/out/holodetect"
if not os.path.exists(path_folder):
    os.makedirs(path_folder)

#data_list = [FlightHoloClean, BlackOakDataSetUppercase, HospitalHoloClean, Movies, Restaurant, Citation, Beers, Salary]
data_list = [HospitalMoreCol]

parameters = []
#parameters.append({'use_metadata': False, 'correlationFeatures': False})  # char unigrams
#parameters.append({'use_metadata': False, 'correlationFeatures': False, 'is_word': True})  # word unigrams
#parameters.append({'use_metadata_only': True, 'correlationFeatures': False})  # metadata
#parameters.append({'use_metadata': False, 'ngrams': 2, 'correlationFeatures': False})  # char unigrams + bigrams
#parameters.append({'correlationFeatures': False})  # char unigrams + meta data
#parameters.append({})  # char unigrams + meta data + correlation
def __init__(self): clean_df = pd.read_csv(Config.get("datapool.folder") + '/Citations/citation.csv', header=0, dtype=object, na_filter=False, encoding="utf8") dirty_df = pd.read_csv(Config.get("datapool.folder") + '/Citations/dirty.csv', header=0, dtype=object, na_filter=False, encoding="utf8") super(Citation, self).__init__("Citation", dirty_df, clean_df)
# All Rights Reserved
########################################
########################################
import os
import json
import re
import subprocess
import pandas
from ml.configuration.Config import Config
########################################
########################################
TOOLS_FOLDER = Config.get("abstractionlayer.tools")
########################################
########################################


def install_tools():
    """
    This method installs and configures the data cleaning tools.
    """
    for tool in os.listdir(TOOLS_FOLDER):
        if tool == "NADEEF":
            p = subprocess.Popen(["ant", "all"],
                                 cwd="{}/NADEEF".format(TOOLS_FOLDER),
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
            p.communicate()

            postgres_username = Config.get("nadeef.db.user")
            postgres_password = Config.get("nadeef.db.password")
def __init__(self): clean_df = pd.read_csv(Config.get("datapool.folder") + '/restaurants/yellow_pages.csv', header=0, dtype=object, na_filter=False) dirty_df = pd.read_csv(Config.get("datapool.folder") + '/restaurants/dirty.csv', header=0, dtype=object, na_filter=False) super(Restaurant, self).__init__("Restaurant", dirty_df, clean_df)
from ml.datasets.flights.FlightHoloClean import FlightHoloClean  # used below; module path assumed
from ml.datasets.blackOak.BlackOakDataSetUppercase import BlackOakDataSetUppercase
from ml.datasets.hospital.HospitalHoloClean import HospitalHoloClean
from ml.datasets.MoviesMohammad.Movies import Movies
from ml.datasets.RestaurantMohammad.Restaurant import Restaurant
from ml.datasets.BeersMohammad.Beers import Beers
from ml.datasets.Citations.Citation import Citation
from ml.datasets.salary_data.Salary import Salary
from ml.active_learning.classifier.XGBoostClassifier import XGBoostClassifier
import numpy as np
from ml.configuration.Config import Config
import os
import time

path_folder = Config.get("logging.folder") + "/out/features"
if not os.path.exists(path_folder):
    os.makedirs(path_folder)

data_list = [FlightHoloClean]
classifier = XGBoostClassifier

parameters = []
#parameters.append({'correlationFeatures': False, 'use_metadata': False, 'use_cond_prob': True, 'use_cond_prob_only': True})
#word2vec
#feature_names = ['conditional probability']
#LSTM
parameters.append({
    'correlationFeatures': False,
    'use_metadata': False,
    'use_word2vec': True,