def prepare_data(dataset_names=get_dataset_names()):
    for dataset in DATASETS:
        if not dataset.get_dataset_name() in dataset_names:
            continue

        print("--- Processing dataset:" + dataset.get_dataset_name() + " ---")
        data_frame = dataset.load_raw_dataset()
        d = preprocess(dataset, data_frame)
        for k, v in d.items():
            write_to_file(dataset.get_filename(k), v)
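# For context, write_to_file is not defined in this file. A minimal sketch of what it
# plausibly does, assuming each value `v` returned by preprocess() is a pandas DataFrame
# and the output directory may not exist yet; the repository's real helper may differ.
import os
import pandas as pd

def write_to_file_sketch(filename, data_frame):
    # Ensure the target directory exists, then persist one preprocessed variant as CSV.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)
    data_frame.to_csv(filename, index=False)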
def run(dataset=get_dataset_names(), graphs=GRAPHS):
    for dataset_obj in DATASETS:
        if not dataset_obj.get_dataset_name() in dataset:
            continue

        print("\nGenerating graphs for dataset:" + dataset_obj.get_dataset_name())
        for sensitive in dataset_obj.get_sensitive_attributes_with_joint():
            for tag in TAGS:
                print("    type:" + tag)
                filename = dataset_obj.get_results_filename(sensitive, tag)
                make_all_graphs(filename, graphs)

    print("Generating additional figures in R...")
    subprocess.run(["Rscript", "results/generate-report.R"])
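# Hedged sketch, not part of the original script: the subprocess.run call above raises
# FileNotFoundError when R is not installed. One defensive option is to skip the R step
# when no Rscript binary is on the PATH instead of aborting the whole graph run.
# generate_r_report and its default script path are illustrative names only.
import shutil
import subprocess

def generate_r_report(script_path="results/generate-report.R"):
    # Only invoke R when the Rscript executable can actually be found.
    if shutil.which("Rscript") is None:
        print("Rscript not found on PATH; skipping R report generation.")
        return
    subprocess.run(["Rscript", script_path], check=True)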
def main():
    dataset = get_dataset_names()
    for dataset_obj in DATASETS:
        if not dataset_obj.get_dataset_name() in dataset:
            continue

        if 'propublica' in dataset_obj.get_dataset_name():
            if 'violent' not in dataset_obj.get_dataset_name():
                print("\n Computing distances for dataset: " + dataset_obj.get_dataset_name())
                processed_dataset = ProcessedData(dataset_obj)
                # Compute distances
                processed_dataset.generate_distance_matrix(distance_metric='seuclidean')
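# Illustrative sketch only: 'seuclidean' is the standardized Euclidean distance, in which
# each feature's squared difference is divided by that feature's variance. The actual
# ProcessedData.generate_distance_matrix implementation is not shown here and may differ;
# this just demonstrates the metric with SciPy on a plain feature matrix.
import numpy as np
from scipy.spatial.distance import pdist, squareform

def example_seuclidean_matrix(features: np.ndarray) -> np.ndarray:
    # Pairwise standardized Euclidean distances between the rows of `features`,
    # returned as a full square matrix.
    return squareform(pdist(features, metric='seuclidean'))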
def run(num_trials=NUM_TRIALS_DEFAULT, dataset=get_dataset_names(),
        algorithm=get_algorithm_names()):
    algorithms_to_run = algorithm

    print("Datasets: '%s'" % dataset)
    for dataset_obj in DATASETS:
        if not dataset_obj.get_dataset_name() in dataset:
            continue

        print("\nEvaluating dataset:" + dataset_obj.get_dataset_name())

        processed_dataset = ProcessedData(dataset_obj)
        train_test_splits = processed_dataset.create_train_test_splits(num_trials)

        all_sensitive_attributes = dataset_obj.get_sensitive_attributes_with_joint()
        for sensitive in all_sensitive_attributes:

            print("Sensitive attribute:" + sensitive)

            detailed_files = dict((k, create_detailed_file(
                                          dataset_obj.get_results_filename(sensitive, k),
                                          dataset_obj,
                                          processed_dataset.get_sensitive_values(k), k))
                                  for k in train_test_splits.keys())

            for algorithm in ALGORITHMS:
                if not algorithm.get_name() in algorithms_to_run:
                    continue

                print("    Algorithm: %s" % algorithm.get_name())
                print("       supported types: %s" % algorithm.get_supported_data_types())
                if algorithm.__class__ is ParamGridSearch:
                    param_files = \
                        dict((k, create_detailed_file(
                                     dataset_obj.get_param_results_filename(sensitive, k,
                                                                            algorithm.get_name()),
                                     dataset_obj,
                                     processed_dataset.get_sensitive_values(k), k))
                             for k in train_test_splits.keys())
                for i in range(0, num_trials):
                    for supported_tag in algorithm.get_supported_data_types():
                        train, test = train_test_splits[supported_tag][i]
                        try:
                            params, results, param_results = \
                                run_eval_alg(algorithm, train, test, dataset_obj, processed_dataset,
                                             all_sensitive_attributes, sensitive, supported_tag)
                        except Exception as e:
                            import traceback
                            traceback.print_exc(file=sys.stderr)
                            print("Failed: %s" % e, file=sys.stderr)
                        else:
                            write_alg_results(detailed_files[supported_tag],
                                              algorithm.get_name(), params, i, results)
                            if algorithm.__class__ is ParamGridSearch:
                                for params, results in param_results:
                                    write_alg_results(param_files[supported_tag],
                                                      algorithm.get_name(), params, i, results)

            print("Results written to:")
            for supported_tag in algorithm.get_supported_data_types():
                print("    %s" % dataset_obj.get_results_filename(sensitive, supported_tag))

            for detailed_file in detailed_files.values():
                detailed_file.close()
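# Hypothetical sketch of the result-writing convention assumed above. The repository's
# real create_detailed_file / write_alg_results helpers live elsewhere and may differ;
# this only illustrates the shape implied by the call sites: one CSV row per
# (algorithm, trial), with parameter settings and metric values as columns.
import csv

def write_alg_results_sketch(file_handle, algorithm_name, params, trial, results):
    # `params` is assumed to be a dict of hyperparameter settings and `results`
    # a sequence of metric values in the column order of the detailed file.
    writer = csv.writer(file_handle)
    writer.writerow([algorithm_name, str(params), trial] + list(results))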
def run(num_trials=NUM_TRIALS_DEFAULT, dataset=get_dataset_names(),
        algorithm=get_algorithm_names(), num_bootstrap=1):
    algorithms_to_run = algorithm

    print("Datasets: '%s'" % dataset)
    print("Bootstraps: '%s'" % num_bootstrap)
    for dataset_obj in DATASETS:
        if not dataset_obj.get_dataset_name() in dataset:
            continue

        print("\nEvaluating dataset:" + dataset_obj.get_dataset_name())

        processed_dataset = ProcessedData(dataset_obj)
        train_test_splits = processed_dataset.create_train_test_splits(num_trials)

        all_sensitive_attributes = dataset_obj.get_sensitive_attributes_with_joint()
        for sensitive in all_sensitive_attributes:

            print("Sensitive attribute:" + sensitive)

            detailed_files = dict((k, create_detailed_file(
                                          dataset_obj.get_results_filename(sensitive, k),
                                          dataset_obj,
                                          processed_dataset.get_sensitive_values(k), k))
                                  for k in train_test_splits.keys())

            for algorithm in ALGORITHMS:
                if not algorithm.get_name() in algorithms_to_run:
                    continue

                print("    Algorithm: %s" % algorithm.get_name())
                print("       supported types: %s" % algorithm.get_supported_data_types())
                if algorithm.__class__ is ParamGridSearch:
                    param_files = \
                        dict((k, create_detailed_file(
                                     dataset_obj.get_param_results_filename(sensitive, k,
                                                                            algorithm.get_name()),
                                     dataset_obj,
                                     processed_dataset.get_sensitive_values(k), k))
                             for k in train_test_splits.keys())
                for i in range(0, num_trials):
                    for supported_tag in algorithm.get_supported_data_types():
                        # JL: for our experiments we only look at numerical-binsensitive;
                        # there is no need to run the other data types.
                        if supported_tag != 'numerical-binsensitive':
                            continue
                        train, test = train_test_splits[supported_tag][i]
                        try:
                            params, results, param_results = \
                                run_eval_alg(algorithm, train, test, dataset_obj, processed_dataset,
                                             all_sensitive_attributes, sensitive, supported_tag,
                                             num_bootstrap)
                        except Exception as e:
                            import traceback
                            traceback.print_exc(file=sys.stderr)
                            print("Failed: %s" % e, file=sys.stderr)
                        else:
                            # One result set per bootstrap replicate; rows are tagged "trial-replicate".
                            for j in range(len(results)):
                                write_alg_results(detailed_files[supported_tag],
                                                  algorithm.get_name(), params,
                                                  '{}-{}'.format(i, j), results[j])
                            # if algorithm.__class__ is ParamGridSearch:
                            #     for entry in param_results:
                            #         j = 0
                            #         for params, results in entry:
                            #             write_alg_results(param_files[supported_tag],
                            #                               algorithm.get_name(), params,
                            #                               '{}-{}'.format(i, j), results)
                            #             j += 1
                            #     for param_file in param_files.values():
                            #         param_file.close()

            print("Results written to:")
            for supported_tag in algorithm.get_supported_data_types():
                print("    %s" % dataset_obj.get_results_filename(sensitive, supported_tag))

            for detailed_file in detailed_files.values():
                detailed_file.close()
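# Illustrative sketch of the bootstrap idea behind num_bootstrap, under the assumption
# that run_eval_alg returns one result set per resample of the test data. This is not the
# repository's run_eval_alg; bootstrap_test_sets is a hypothetical helper that only shows
# how such resamples could be drawn from a pandas DataFrame.
import numpy as np

def bootstrap_test_sets(test_df, num_bootstrap, seed=0):
    rng = np.random.default_rng(seed)
    n = len(test_df)
    # Draw row indices with replacement once per bootstrap replicate, keeping each
    # resampled test set the same size as the original.
    return [test_df.iloc[rng.integers(0, n, size=n)] for _ in range(num_bootstrap)]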