def prepare_data(dataset_names=get_dataset_names()):

    for dataset in DATASETS:
        if dataset.get_dataset_name() not in dataset_names:
            continue
        print("--- Processing dataset: " + dataset.get_dataset_name() + " ---")
        data_frame = dataset.load_raw_dataset()
        d = preprocess(dataset, data_frame)

        for k, v in d.items():
            write_to_file(dataset.get_filename(k), v)
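
The helpers that prepare_data leans on (DATASETS, get_dataset_names, preprocess, write_to_file) are defined elsewhere in the project. Below is a minimal, hypothetical sketch of their shapes, just enough to exercise the loop standalone; the real definitions differ.

import pandas as pd

# Hypothetical stand-ins for the project's helpers; the names mirror the
# snippet above, the bodies are stubs.
class Dataset:
    def __init__(self, name):
        self.name = name

    def get_dataset_name(self):
        return self.name

    def load_raw_dataset(self):
        # Stub: the real loader reads the raw data for this dataset.
        return pd.DataFrame({"feature": [1, 2], "label": [0, 1]})

    def get_filename(self, tag):
        return "%s_%s.csv" % (self.name, tag)

DATASETS = [Dataset("example")]

def get_dataset_names():
    return [d.get_dataset_name() for d in DATASETS]

def preprocess(dataset, data_frame):
    # Stub: the real preprocessing returns one cleaned frame per variant tag.
    return {"numerical": data_frame}

def write_to_file(filename, data_frame):
    data_frame.to_csv(filename, index=False)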
Example 2
def run(dataset=get_dataset_names(), graphs=GRAPHS):
    for dataset_obj in DATASETS:
        if dataset_obj.get_dataset_name() not in dataset:
            continue

        print("\nGenerating graphs for dataset:" +
              dataset_obj.get_dataset_name())
        for sensitive in dataset_obj.get_sensitive_attributes_with_joint():
            for tag in TAGS:
                print("    type:" + tag)
                filename = dataset_obj.get_results_filename(sensitive, tag)
                make_all_graphs(filename, graphs)
Example 3
def run(dataset=get_dataset_names(), graphs=GRAPHS):
    for dataset_obj in DATASETS:
        if dataset_obj.get_dataset_name() not in dataset:
            continue

        print("\nGenerating graphs for dataset:" +
              dataset_obj.get_dataset_name())
        for sensitive in dataset_obj.get_sensitive_attributes_with_joint():
            for tag in TAGS:
                print("    type:" + tag)
                filename = dataset_obj.get_results_filename(sensitive, tag)
                make_all_graphs(filename, graphs)
    print("Generating additional figures in R...")
    subprocess.run(["Rscript", "results/generate-report.R"])
Example 4
def main():
    dataset = get_dataset_names()

    for dataset_obj in DATASETS:
        if dataset_obj.get_dataset_name() not in dataset:
            continue
        if 'propublica' in dataset_obj.get_dataset_name():
            if 'violent' not in dataset_obj.get_dataset_name():
                print("\nComputing distances for dataset: " +
                      dataset_obj.get_dataset_name())

                processed_dataset = ProcessedData(dataset_obj)

                # Compute distances
                processed_dataset.generate_distance_matrix(
                    distance_metric='seuclidean')
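
generate_distance_matrix is a method of this project's ProcessedData and its internals are not shown here. Assuming it computes pairwise standardized Euclidean distances over the numeric features, SciPy exposes the same metric directly; a sketch:

import numpy as np
from scipy.spatial.distance import pdist, squareform

# 'seuclidean' scales each feature by its variance before computing
# Euclidean distances. X is a stand-in for the processed feature matrix.
X = np.random.rand(5, 3)
condensed = pdist(X, metric='seuclidean')  # condensed upper-triangular form
matrix = squareform(condensed)             # full symmetric n-by-n matrix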
Example 5
def run(num_trials=NUM_TRIALS_DEFAULT,
        dataset=get_dataset_names(),
        algorithm=get_algorithm_names()):
    algorithms_to_run = algorithm

    print("Datasets: '%s'" % dataset)
    for dataset_obj in DATASETS:
        if dataset_obj.get_dataset_name() not in dataset:
            continue

        print("\nEvaluating dataset: " + dataset_obj.get_dataset_name())

        processed_dataset = ProcessedData(dataset_obj)
        train_test_splits = processed_dataset.create_train_test_splits(
            num_trials)

        all_sensitive_attributes = dataset_obj.get_sensitive_attributes_with_joint()
        for sensitive in all_sensitive_attributes:

            print("Sensitive attribute:" + sensitive)

            detailed_files = {
                k: create_detailed_file(
                    dataset_obj.get_results_filename(sensitive, k),
                    dataset_obj, processed_dataset.get_sensitive_values(k), k)
                for k in train_test_splits}

            for algorithm in ALGORITHMS:
                if algorithm.get_name() not in algorithms_to_run:
                    continue

                print("    Algorithm: %s" % algorithm.get_name())
                print("       supported types: %s" %
                      algorithm.get_supported_data_types())
                if algorithm.__class__ is ParamGridSearch:
                    param_files = {
                        k: create_detailed_file(
                            dataset_obj.get_param_results_filename(
                                sensitive, k, algorithm.get_name()),
                            dataset_obj, processed_dataset.get_sensitive_values(k), k)
                        for k in train_test_splits}
                for i in range(num_trials):
                    for supported_tag in algorithm.get_supported_data_types():
                        train, test = train_test_splits[supported_tag][i]
                        try:
                            params, results, param_results =  \
                                run_eval_alg(algorithm, train, test, dataset_obj, processed_dataset,
                                             all_sensitive_attributes, sensitive, supported_tag)
                        except Exception as e:
                            import traceback
                            traceback.print_exc(file=sys.stderr)
                            print("Failed: %s" % e, file=sys.stderr)
                        else:
                            write_alg_results(detailed_files[supported_tag],
                                              algorithm.get_name(), params, i,
                                              results)
                            if algorithm.__class__ is ParamGridSearch:
                                for params, results in param_results:
                                    write_alg_results(
                                        param_files[supported_tag],
                                        algorithm.get_name(), params, i,
                                        results)

            print("Results written to:")
            for supported_tag in detailed_files:
                print(
                    "    %s" %
                    dataset_obj.get_results_filename(sensitive, supported_tag))

            for detailed_file in detailed_files.values():
                detailed_file.close()
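
The example opens one results file per split tag and closes them at the end of the sensitive-attribute loop, so an un-caught exception between create_detailed_file and the final close() calls would leak the handles. A sketch of the same open/write/close pattern with contextlib.ExitStack, assuming only that create_detailed_file returns an object with a close() method (which the example already relies on):

from contextlib import ExitStack

with ExitStack() as stack:
    detailed_files = {}
    for k in train_test_splits:
        f = create_detailed_file(
            dataset_obj.get_results_filename(sensitive, k),
            dataset_obj, processed_dataset.get_sensitive_values(k), k)
        stack.callback(f.close)  # close() runs even if evaluation raises
        detailed_files[k] = f
    # ... run the algorithms and write_alg_results(...) as above ...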
Example 6
def run(num_trials=NUM_TRIALS_DEFAULT,
        dataset=get_dataset_names(),
        algorithm=get_algorithm_names(),
        num_bootstrap=1):
    algorithms_to_run = algorithm

    print("Datasets: '%s'" % dataset)
    print("Bootstraps: '%s'" % num_bootstrap)

    for dataset_obj in DATASETS:
        if dataset_obj.get_dataset_name() not in dataset:
            continue

        print("\nEvaluating dataset: " + dataset_obj.get_dataset_name())

        processed_dataset = ProcessedData(dataset_obj)
        train_test_splits = processed_dataset.create_train_test_splits(
            num_trials)

        all_sensitive_attributes = dataset_obj.get_sensitive_attributes_with_joint()
        for sensitive in all_sensitive_attributes:

            print("Sensitive attribute:" + sensitive)

            detailed_files = {
                k: create_detailed_file(
                    dataset_obj.get_results_filename(sensitive, k),
                    dataset_obj, processed_dataset.get_sensitive_values(k), k)
                for k in train_test_splits}

            for algorithm in ALGORITHMS:
                if algorithm.get_name() not in algorithms_to_run:
                    continue

                print("    Algorithm: %s" % algorithm.get_name())
                print("       supported types: %s" %
                      algorithm.get_supported_data_types())
                if algorithm.__class__ is ParamGridSearch:
                    param_files = {
                        k: create_detailed_file(
                            dataset_obj.get_param_results_filename(
                                sensitive, k, algorithm.get_name()),
                            dataset_obj, processed_dataset.get_sensitive_values(k), k)
                        for k in train_test_splits}
                for i in range(num_trials):
                    for supported_tag in algorithm.get_supported_data_types():
                        # JL: our experiments only use 'numerical-binsensitive'; no need to run other data types
                        if supported_tag != 'numerical-binsensitive':
                            continue
                        train, test = train_test_splits[supported_tag][i]
                        try:
                            params, results, param_results =  \
                                run_eval_alg(algorithm, train, test, dataset_obj, processed_dataset,
                                             all_sensitive_attributes, sensitive, supported_tag, num_bootstrap)
                        except Exception as e:
                            import traceback
                            traceback.print_exc(file=sys.stderr)
                            print("Failed: %s" % e, file=sys.stderr)
                        else:
                            for j, result in enumerate(results):
                                write_alg_results(
                                    detailed_files[supported_tag],
                                    algorithm.get_name(), params,
                                    '{}-{}'.format(i, j), result)
                            #if algorithm.__class__ is ParamGridSearch:
                            #    for entry in param_results:
                            #        j = 0
                            #        for params, results in entry:
                            #            write_alg_results(param_files[supported_tag],
                            #                            algorithm.get_name(), params, '{}-{}'.format(i, j), results)
                            #            j += 1
                            #    for param_file in param_files.values():
                            #        param_file.close()

            print("Results written to:")
            for supported_tag in detailed_files:
                print(
                    "    %s" %
                    dataset_obj.get_results_filename(sensitive, supported_tag))

            for detailed_file in detailed_files.values():
                detailed_file.close()
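
A hypothetical invocation of this bootstrap variant; the dataset and algorithm names below are placeholders, and real values come from get_dataset_names() and get_algorithm_names():

# One train/test trial, 10 bootstrap resamples per trial, restricted to a
# single (placeholder) dataset and algorithm.
run(num_trials=1,
    dataset=['some-dataset'],      # placeholder
    algorithm=['some-algorithm'],  # placeholder
    num_bootstrap=10)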