Example #1
# Imports assumed by this test helper; BASE_DIR, Configs and audit are
# provided by the surrounding aequitas test module.
import os

import numpy as np
import pandas as pd
import pytest


def helper(input_filename, expected_filename, config_file):
    '''
    Run an aequitas audit on input_filename and compare the resulting
    dataframe against the expected results stored in expected_filename.
    '''

    input_filename = os.path.join(BASE_DIR, input_filename)
    expected_df = pd.read_csv(os.path.join(BASE_DIR, expected_filename))

    if config_file:
        config_file = os.path.join(BASE_DIR, config_file)

    config = Configs.load_configs(config_file)

    # input_filename already has BASE_DIR prepended above, so read it directly
    test_df, _ = audit(pd.read_csv(input_filename), config)

    # match expected_df columns
    shared_columns = [c for c in expected_df.columns if c in test_df.columns]

    try:
        expected_df = expected_df[shared_columns]
        test_df = test_df[shared_columns]
        combined_data = pd.merge(expected_df,
                                 test_df,
                                 on=['attribute_name', 'attribute_value'])
    except Exception:
        # the merge failed (e.g. the key columns are missing); return the raw
        # frames so the caller can inspect them
        print('could not merge')
        return (test_df, expected_df)

    # subtract expected_df from test_df column by column and check that the
    # mean difference is close enough to 0
    s = ""
    EPS = 1e-6
    for col in shared_columns:
        if col not in {'attribute_value', 'attribute_name'}:
            print('testing {} ...'.format(col))

            try:
                # Subtracting boolean columns raises "TypeError: numpy boolean
                # subtract, the `-` operator, is deprecated, use the
                # bitwise_xor, the `^` operator, or the logical_xor function
                # instead."; casting to float64 works around the issue, though
                # there may be a cleaner way to avoid it.
                # Compare the absolute mean difference so that negative
                # deviations also count as failures.
                if abs(np.mean(combined_data[col + "_x"].astype("float64") -
                               combined_data[col + "_y"].astype("float64"))) > EPS:
                    exp_mean = np.mean(combined_data[col + "_x"])
                    aeq_mean = np.mean(combined_data[col + "_y"])
                    s += "{} fails: Expected {} on average, but aequitas returned {}\n".format(
                        col, exp_mean, aeq_mean)

                    pytest.fail(s)

            except Exception:
                # non-numeric columns cannot be cast and subtracted; fall back
                # to an exact element-wise comparison
                if not all(combined_data[col + "_x"] == combined_data[col +
                                                                      "_y"]):
                    s += "{} fails: at least one entry was not the same between data sets\n".format(
                        col)
                    pytest.fail(s)
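A minimal sketch of how this helper might be called from a pytest test; the fixture file names below are placeholders rather than files that ship with the project:

def test_sample_audit():
    # hypothetical fixture paths, resolved relative to BASE_DIR by helper()
    helper('fixtures/sample_input.csv',
           'fixtures/sample_expected.csv',
           'fixtures/sample_config.yaml')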
Example #3
# Imports assumed by this Flask view; preprocess_input_df, Fairness, Configs,
# audit, FAIR_MAP and FAIR_MAP_ORDER are provided by the surrounding aequitas
# webapp module.
import itertools
import os
import tempfile

import pandas as pd
from flask import (abort, flash, redirect, render_template, request,
                   url_for)


def audit_file(name, dirname):
    upload_path = os.path.join(tempfile.gettempdir(), dirname)
    data_path = os.path.join(upload_path, name + '.csv')
    if not os.path.exists(data_path):
        abort(404)

    try:
        df = pd.read_csv(data_path)
    except pd.errors.ParserError:
        flash('Bad CSV file – could not parse', 'warning')
        return redirect(url_for('home'))

    (df, groups) = preprocess_input_df(df)

    if "submit" not in request.form:
        subgroups = {col: list(set(df[col])) for col in groups}

        # set defaults
        for (key, values) in (
            ('race', ('White', 'Caucasian')),
            ('sex', ('Male',)),
            ('gender', ('Male',)),
            ('age_cat', ('25 - 45',)),
            ('education', ('HS-grad',)),
        ):
            if key in subgroups:
                subgroups[key].sort(key=lambda value: int(value not in values))

        supported_fairness_measures = Fairness().get_fairness_measures_supported(df)
        fairness_measures = [x for x in FAIR_MAP_ORDER if FAIR_MAP[x].issubset(set(supported_fairness_measures))]

        return render_template('audit.html',
                               categories=groups,
                               subcategories=subgroups,
                               fairness=fairness_measures)

    rgm = request.form["ref_groups_method"]
    if rgm == 'predefined':
        group_variables = request.form.getlist('group_variable1')
    else:
        group_variables = request.form.getlist('group_variable2')

    # if the user did not select any group variables, fall back to all of them
    if len(group_variables) == 0:
        group_variables = groups

    # reference subgroup chosen by the user for each selected group variable
    subgroups = {g: request.form[g] for g in group_variables}

    # majority_groups = request.form.getlist('use_majority_group')
    raw_fairness_measures = request.form.getlist('fairness_measures')
    if len(raw_fairness_measures) == 0:
        fairness_measures = list(Fairness().get_fairness_measures_supported(df))
    else:
        # expand each selected high-level measure into the underlying measures
        fairness_measures = [y for x in raw_fairness_measures for y in FAIR_MAP[x]]

    try:
        fv = float(request.form['fairness_pct'])
    except (KeyError, ValueError):
        fv = None

    # convert the percentage into a fraction; default to 0.8 when none given
    fp = fv / 100.0 if fv else 0.8

    configs = Configs(ref_groups=subgroups,
                      ref_groups_method=rgm,
                      fairness_threshold=fp,
                      fairness_measures=fairness_measures,
                      attr_cols=group_variables)

    (_gv_df, report) = audit(df,
                             # model_id=1,
                             configs=configs,
                             preprocessed=True)

    # find the first unused report id in the upload directory
    for reportid in itertools.count(1):
        report_path = os.path.join(upload_path, str(reportid))
        if not os.path.exists(report_path):
            break

    with open(report_path, 'w') as fd:
        fd.write(report)

    return redirect(url_for("report",
                            dirname=dirname,
                            name=name,
                            reportid=reportid))
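audit_file reads from request.form and renders templates, so it only runs inside a Flask request context. A minimal, hypothetical sketch of exposing it as a route; the actual aequitas webapp registers its views in its own application module:

from flask import Flask

app = Flask(__name__)

# hypothetical URL rule; the view also expects 'home' and 'report' endpoints
# to exist for its redirects
app.add_url_rule('/audit/<dirname>/<name>',
                 view_func=audit_file,
                 methods=['GET', 'POST'])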