# Example #1 (votes: 0)
    def execeute(self):
        """Run a fairness audit over the latest inspection per child-care center.

        Loads the transformed ``centers`` and ``inspections`` tables, keeps
        only the most recent inspection row per center, scores the resulting
        feature matrix with the downloaded model, and runs an Aequitas
        Group/Bias/Fairness audit across borough and program type.

        Side effects:
            Stores the fairness dataframe on ``self.output_table``.

        Returns:
            tuple: ``(rows, columns)`` where ``rows`` is a list of value
            tuples from the fairness dataframe and ``columns`` pairs each
            sanitized column name with ``'VARCHAR'`` (``for`` -> ``forr`` to
            dodge the SQL keyword, spaces -> ``_``).
        """
        model = self.download_model()
        tabla_3 = pd.read_sql_table('centers',
                                    self.engine,
                                    schema="transformed")
        tabla_4 = pd.read_sql_table('inspections',
                                    self.engine,
                                    schema="transformed")

        centros = tabla_3.copy()
        centros.rename(columns={"dc_id": "center_id"}, inplace=True)
        inspecciones = tabla_4.copy()
        # Keep only the most recent inspection row per center: sort ascending
        # by date and keep the last duplicate of each center_id.
        last_inspections = inspecciones.sort_values(
            by="inspectiondate").drop_duplicates(subset=["center_id"],
                                                 keep="last")
        # Drop descriptive columns that are not model features.
        centros = centros.drop([
            'centername', 'legalname', 'building', 'street', 'zipcode',
            'phone', 'permitnumber', 'permitexp', 'status', 'agerange',
            'childcaretype', 'bin', 'url', 'datepermitted', 'actual',
            'violationratepercent', 'violationavgratepercent',
            'publichealthhazardviolationrate',
            'averagepublichealthhazardiolationrate', 'criticalviolationrate',
            'avgcriticalviolationrate'
        ],
                               axis=1)
        centros = centros.reset_index(drop=True)
        tabla_5 = pd.merge(last_inspections, centros)
        tabla_5.sort_values(['inspectiondate'],
                            ascending=[False],
                            inplace=True)
        # Normalize numeric dtypes expected by the model.
        # (The original code cast 'totaleducationalworkers' twice; the
        # redundant second cast was removed.)
        tabla_5['maximumcapacity'] = tabla_5['maximumcapacity'].astype(int)
        tabla_5['totaleducationalworkers'] = tabla_5[
            'totaleducationalworkers'].astype(int)
        tabla_5['averagetotaleducationalworkers'] = tabla_5[
            'averagetotaleducationalworkers'].astype(float)

        tabla_5 = tabla_5.drop([
            'regulationsummary', 'healthcodesubsection', 'violationstatus',
            'borough', 'reason', 'inspectiondate', 'violationcategory_nan'
        ],
                               axis=1)

        tabla_5 = tabla_5.set_index(['center_id'])
        tabla_5 = tabla_5.fillna(0)

        # Coerce any remaining object columns to float so the model can
        # consume the matrix, then refill NaNs introduced by the casts.
        for col in tabla_5.select_dtypes(object):
            tabla_5[col] = tabla_5[col].astype(float)

        tabla_5 = tabla_5.fillna(0)

        # Build the feature matrix once (label column excluded) and reuse it
        # for both predictions and probabilities.
        features = tabla_5.drop(['violationcategory_public_health_hazard'],
                                axis=1)
        prds = model.predict(features)
        probas = model.predict_proba(features)

        res = pd.DataFrame({
            "center": tabla_5.index,
            "etiqueta": prds,
            "proba_0": probas[:, 0],
            "proba_1": probas[:, 1]
        })

        # 'score' is the probability of the winning class.
        # NOTE(review): an exact tie (proba_0 == proba_1) leaves 'score' as
        # NaN — confirm that is intended.
        res.loc[res['proba_0'] > res['proba_1'], 'score'] = res['proba_0']
        res.loc[res['proba_0'] < res['proba_1'], 'score'] = res['proba_1']

        # Collapse the one-hot encoded attributes back into single
        # categorical columns for the Aequitas audit.
        categorias_1 = [
            "programtype_all_age_camp", "programtype_infant_toddler",
            "programtype_preschool", "programtype_preschool_camp",
            "programtype_school_age_camp"
        ]
        programtype = pd.get_dummies(centros[categorias_1]).idxmax(1)
        categorias_2 = [
            "borough_bronx", "borough_brooklyn", "borough_manhattan",
            "borough_queens", "borough_staten_island"
        ]
        borough = pd.get_dummies(centros[categorias_2]).idxmax(1)
        ambas = pd.concat(
            [borough, programtype],
            axis=1,
        )
        ambas = ambas.rename(columns={0: 'borough', 1: 'programtype'})
        tabla_1 = pd.concat([centros, ambas], axis=1)
        tabla_2 = pd.merge(res,
                           tabla_1,
                           left_on='center',
                           right_on='center_id')

        # Rewrite the dummy-column names into plain category labels, one row
        # at a time (the merge above resets the index, so positional iloc[i]
        # and label-based index == i refer to the same row).
        for i in list(tabla_2.index):
            if str(tabla_2.iloc[i].borough_bronx) == "1":
                tabla_2.loc[tabla_2.index == i, "borough"] = "bronx"
            elif str(tabla_2.iloc[i].borough_brooklyn) == "1":
                tabla_2.loc[tabla_2.index == i, "borough"] = "brooklyn"
            elif str(tabla_2.iloc[i].borough_manhattan) == "1":
                tabla_2.loc[tabla_2.index == i, "borough"] = "manhattan"
            elif str(tabla_2.iloc[i].borough_queens) == "1":
                tabla_2.loc[tabla_2.index == i, "borough"] = "queens"
            elif str(tabla_2.iloc[i].borough_staten_island) == "1":
                tabla_2.loc[tabla_2.index == i, "borough"] = "staten_island"

        tabla_2.drop(categorias_2, axis=1, inplace=True)

        for i in list(tabla_2.index):
            if str(tabla_2.iloc[i].programtype_all_age_camp) == "1":
                tabla_2.loc[tabla_2.index == i, "programtype"] = "all_age_camp"
            elif str(tabla_2.iloc[i].programtype_infant_toddler) == "1":
                tabla_2.loc[tabla_2.index == i,
                            "programtype"] = "infant_toddler"
            elif str(tabla_2.iloc[i].programtype_preschool) == "1":
                tabla_2.loc[tabla_2.index == i, "programtype"] = "preschool"
            elif str(tabla_2.iloc[i].programtype_preschool_camp) == "1":
                tabla_2.loc[tabla_2.index == i,
                            "programtype"] = "preschool_camp"
            elif str(tabla_2.iloc[i].programtype_school_age_camp) == "1":
                tabla_2.loc[tabla_2.index == i,
                            "programtype"] = "school_age_camp"

        tabla_2.drop(categorias_1, axis=1, inplace=True)

        # Aequitas expects score, label_value and the attribute columns.
        tabla_6 = tabla_2.loc[:, [
            'center', 'etiqueta', 'score', 'borough', 'programtype'
        ]]
        tabla_6 = tabla_6.rename(columns={'etiqueta': 'label_value'})
        tabla_6.set_index('center', inplace=True)

        g = Group()
        xtab, _ = g.get_crosstabs(tabla_6)

        b = Bias()
        bdf = b.get_disparity_predefined_groups(xtab,
                                                original_df=tabla_6,
                                                ref_groups_dict={
                                                    'borough': 'brooklyn',
                                                    'programtype': 'preschool'
                                                },
                                                alpha=0.05,
                                                mask_significance=True)
        f = Fairness()
        fdf = f.get_group_value_fairness(bdf)

        fdf['model_id'] = self.model_id
        fdf['date'] = self.date_param
        self.output_table = fdf
        # Sanitize column names for SQL ('for' is a reserved keyword).
        return [tuple(x) for x in fdf.to_numpy()
                ], [(c.replace("for", "forr").replace(" ", "_"), 'VARCHAR')
                    for c in list(fdf.columns)]
def audit(df, configs, preprocessed=False):
    """Run a full Aequitas audit (group metrics, bias, fairness) on ``df``.

    :param df: input dataframe containing ``score``/``label_value`` plus
        attribute columns; raw attribute values unless ``preprocessed``.
    :param configs: audit configuration object carrying ``attr_cols``,
        ``score_thresholds``, reference-group settings, fairness measures,
        plotting options and report flag.
    :param preprocessed: when True, skip ``preprocess_input_df``.
    :return: tuple ``(group_value_df, report)``; ``report`` is the markdown
        audit report or None when ``configs.report`` is not True.
    """
    if not preprocessed:
        df, attr_cols_input = preprocess_input_df(df)
        if not configs.attr_cols:
            configs.attr_cols = attr_cols_input
    g = Group()
    print('Welcome to Aequitas-Audit')
    print('Fairness measures requested:',
          ','.join(configs.fair_measures_requested))
    groups_model, attr_cols = g.get_crosstabs(
        df,
        score_thresholds=configs.score_thresholds,
        attr_cols=configs.attr_cols)
    print('audit: df shape from the crosstabs:', groups_model.shape)
    b = Bias()

    # todo move this to the new configs object / the attr_cols now are passed through the configs object...
    # Pick the disparity method: explicit reference groups, majority group,
    # or (default) the group minimizing each metric.
    ref_groups_method = configs.ref_groups_method
    if ref_groups_method == 'predefined' and configs.ref_groups:
        bias_df = b.get_disparity_predefined_groups(
            groups_model,
            df,
            configs.ref_groups,
            check_significance=configs.check_significance,
            alpha=configs.alpha,
            selected_significance=configs.selected_significance,
            mask_significance=configs.mask_significance)
    elif ref_groups_method == 'majority':
        bias_df = b.get_disparity_major_group(
            groups_model,
            df,
            check_significance=configs.check_significance,
            alpha=configs.alpha,
            selected_significance=configs.selected_significance,
            mask_significance=configs.mask_significance)
    else:
        bias_df = b.get_disparity_min_metric(
            df=groups_model,
            original_df=df,
            check_significance=configs.check_significance,
            alpha=configs.alpha,
            label_score_ref='fpr',
            selected_significance=configs.selected_significance,
            mask_significance=configs.mask_significance)

    print('Any NaN?: ', bias_df.isnull().values.any())
    print('bias_df shape:', bias_df.shape)

    aqp = Plot()

    if configs.plot_bias_metrics:
        if len(configs.plot_bias_metrics) == 1:
            fig1 = aqp.plot_group_metric(
                bias_df, group_metric=configs.plot_bias_metrics[0])

        elif len(configs.plot_bias_metrics) > 1:
            # FIX: was configs.plot_disparity_attributes — the metric plot
            # must use the requested bias metrics (cf. the single-metric
            # branch above and plot_fairness_group_all below).
            fig1 = aqp.plot_group_metric_all(
                bias_df, metrics=configs.plot_bias_metrics)

        if (len(configs.plot_bias_disparities) == 1) and (len(
                configs.plot_disparity_attributes) == 1):
            fig2 = aqp.plot_disparity(
                bias_df,
                group_metric=configs.plot_bias_disparities[0],
                attribute_name=configs.plot_disparity_attributes[0])

        elif (len(configs.plot_bias_disparities) > 1) or (len(
                configs.plot_disparity_attributes) > 1):
            fig2 = aqp.plot_disparity_all(
                bias_df,
                metrics=configs.plot_bias_disparities,
                attributes=configs.plot_disparity_attributes)

    f = Fairness(tau=configs.fairness_threshold)
    print('Fairness Threshold:', configs.fairness_threshold)
    print('Fairness Measures:', configs.fair_measures_requested)
    group_value_df = f.get_group_value_fairness(
        bias_df, fair_measures_requested=configs.fair_measures_requested)
    group_attribute_df = f.get_group_attribute_fairness(
        group_value_df,
        fair_measures_requested=configs.fair_measures_requested)
    fair_results = f.get_overall_fairness(group_attribute_df)

    if configs.plot_bias_metrics:
        if len(configs.plot_bias_metrics) == 1:
            fig3 = aqp.plot_fairness_group(
                group_value_df, group_metric=configs.plot_bias_metrics[0])
        elif len(configs.plot_bias_metrics) > 1:
            fig3 = aqp.plot_fairness_group_all(
                group_value_df, metrics=configs.plot_bias_metrics)

        if (len(configs.plot_bias_disparities) == 1) and (len(
                configs.plot_disparity_attributes) == 1):
            fig4 = aqp.plot_fairness_disparity(
                group_value_df,
                group_metric=configs.plot_bias_disparities[0],
                attribute_name=configs.plot_disparity_attributes[0])
        elif (len(configs.plot_bias_disparities) > 1) or (len(
                configs.plot_disparity_attributes) > 1):
            fig4 = aqp.plot_fairness_disparity_all(
                group_value_df,
                metrics=configs.plot_bias_disparities,
                attributes=configs.plot_disparity_attributes)

    print(fair_results)
    report = None
    if configs.report is True:
        report = audit_report_markdown(configs, group_value_df,
                                       f.fair_measures_depend, fair_results)
    return group_value_df, report
# Example #3 (votes: 0)
# Rename the target column to the 'label_value' name Aequitas expects and
# align indices with the prediction frames before concatenation.
y_test = Y_test.rename(columns={"y": "label_value"})
y_test.reset_index(drop=True, inplace=True)
y_test.shape  # notebook residue: displays the shape, no effect in a script

# Join the dataframes to build the Aequitas input data.
# NOTE(review): assumes 'predicciones' carries the 'score' column and the
# frames are row-aligned after the reset_index above — confirm upstream.
datos_aequitas = pd.concat([predicciones, y_test, X_test, X_test_df], axis=1)
# Up to here: data preparation for Aequitas ------------------------------------------------------------

# Keep only the columns needed for the FNR (false negative rate) audit.
datos_aequitas = datos_aequitas[['score', 'label_value', 'day_sem']]
datos_aequitas.head()  # notebook residue: displays a preview, no effect in a script

# Installing Aequitas:
#pip install aequitas

g = Group()
xtab, _ = g.get_crosstabs(datos_aequitas)
# xtab contains the counts/metrics derived from FP, FN, TP, TN.

# Bias computation: disparities relative to the predefined reference group
# ('e:viernes' within the 'day_sem' attribute), without significance tests.
b = Bias()
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=datos_aequitas,
    ref_groups_dict={'day_sem': 'e:viernes'},
    alpha=0.05,
    check_significance=False)

# Fairness computation.
f = Fairness()
fdf = f.get_group_value_fairness(bdf)  # same reference group as above
# Using the gender and race fields, we will prepare the data for the Aequitas Toolkit.

# In[165]:


# Aequitas
from aequitas.preprocessing import preprocess_input_df
from aequitas.group import Group
from aequitas.plotting import Plot
from aequitas.bias import Bias
from aequitas.fairness import Fairness

# Keep only the columns Aequitas needs: the protected attributes plus the
# model score and ground-truth label.
ae_subset_df = pred_test_df[['race', 'gender', 'score', 'label_value']]
# Standardize attribute values/dtypes into Aequitas' expected input format.
ae_df, _ = preprocess_input_df(ae_subset_df)
g = Group()
# Crosstab of confusion-matrix counts and derived metrics per group.
xtab, _ = g.get_crosstabs(ae_df)
absolute_metrics = g.list_absolute_metrics(xtab)
# Replace NaN metric cells (e.g. undefined rates) with -1 for display.
clean_xtab = xtab.fillna(-1)
aqp = Plot()
b = Bias()


# ## Reference Group Selection

# Below we have chosen the reference group for our analysis but feel free to select another one.

# In[166]:


# test reference group with Caucasian Male
# Example #5 (votes: 0)
    def _write_audit_to_db(self, model_id, protected_df, predictions_proba,
                           labels, tie_breaker, subset_hash, matrix_type,
                           evaluation_start_time, evaluation_end_time,
                           matrix_uuid):
        """
        Runs the bias audit and saves the result in the bias table.

        Args:
            model_id (int) primary key of the model
            protected_df (pandas.DataFrame) A dataframe with protected group attributes:
            predictions_proba (np.array) List of prediction probabilities
            labels (pandas.Series): List of labels
            tie_breaker: 'best' or 'worst' case tiebreaking rule that the predictions and labels were sorted by
            subset_hash (str) the hash of the subset, if any, that the
                evaluation is made on
            matrix_type (triage.component.catwalk.storage.MatrixType)
                The type of matrix used
            evaluation_start_time (pandas._libs.tslibs.timestamps.Timestamp)
                first as_of_date included in the evaluation period
            evaluation_end_time (pandas._libs.tslibs.timestamps.Timestamp) last
                as_of_date included in the evaluation period
            matrix_uuid: the uuid of the matrix
        Returns:
            None. Rows are persisted to ``matrix_type.aequitas_obj``; returns
            early (writing nothing) when ``protected_df`` is empty.
        """
        if protected_df.empty:
            return

        # to preprocess aequitas requires the following columns:
        # score, label value, model_id, protected attributes
        # fill out the protected_df, which just has protected attributes at this point
        # Copy first so the caller's dataframe is not mutated.
        protected_df = protected_df.copy()
        protected_df['model_id'] = model_id
        protected_df['score'] = predictions_proba
        protected_df['label_value'] = labels
        aequitas_df, attr_cols_input = preprocess_input_df(protected_df)

        # create group crosstabs
        g = Group()
        score_thresholds = {}
        # 'rank_abs': audit the top-N ranked entities, per configured N.
        score_thresholds['rank_abs'] = self.bias_config['thresholds'].get(
            'top_n', [])
        # convert 0-100 percentile to 0-1 that Aequitas expects
        score_thresholds['rank_pct'] = [
            value / 100.0
            for value in self.bias_config['thresholds'].get('percentiles', [])
        ]
        groups_model, attr_cols = g.get_crosstabs(
            aequitas_df,
            score_thresholds=score_thresholds,
            attr_cols=attr_cols_input)
        # analyze bias from reference groups
        # Method selection: explicit reference groups, majority group, or
        # (default) the minimum-metric group.
        bias = Bias()
        ref_groups_method = self.bias_config.get('ref_groups_method', None)
        if ref_groups_method == 'predefined' and self.bias_config['ref_groups']:
            bias_df = bias.get_disparity_predefined_groups(
                groups_model, aequitas_df, self.bias_config['ref_groups'])
        elif ref_groups_method == 'majority':
            bias_df = bias.get_disparity_major_group(groups_model, aequitas_df)
        else:
            bias_df = bias.get_disparity_min_metric(groups_model, aequitas_df)

        # analyze fairness for each group
        f = Fairness(tau=0.8)  # the default fairness threshold is 0.8
        group_value_df = f.get_group_value_fairness(bias_df)
        # Attach the evaluation context so rows can be keyed in the DB.
        group_value_df['subset_hash'] = subset_hash
        group_value_df['tie_breaker'] = tie_breaker
        group_value_df['evaluation_start_time'] = evaluation_start_time
        group_value_df['evaluation_end_time'] = evaluation_end_time
        group_value_df['matrix_uuid'] = matrix_uuid
        # Rename columns to match the DB schema ('for' is a SQL keyword).
        group_value_df = group_value_df.rename(index=str,
                                               columns={
                                                   "score_threshold":
                                                   "parameter",
                                                   "for": "for_"
                                               })
        if group_value_df.empty:
            raise ValueError(f"""
            Bias audit: aequitas_audit() failed.
            Returned empty dataframe for model_id = {model_id}, and subset_hash = {subset_hash}
            and matrix_type = {matrix_type}""")
        # Delete-then-insert: remove any prior audit rows for the same key
        # before the bulk insert, so reruns replace rather than duplicate.
        with scoped_session(self.db_engine) as session:
            for index, row in group_value_df.iterrows():
                session.query(matrix_type.aequitas_obj).filter_by(
                    model_id=row['model_id'],
                    evaluation_start_time=row['evaluation_start_time'],
                    evaluation_end_time=row['evaluation_end_time'],
                    subset_hash=row['subset_hash'],
                    parameter=row['parameter'],
                    tie_breaker=row['tie_breaker'],
                    matrix_uuid=row['matrix_uuid'],
                    attribute_name=row['attribute_name'],
                    attribute_value=row['attribute_value']).delete()
            session.bulk_insert_mappings(
                matrix_type.aequitas_obj,
                group_value_df.to_dict(orient="records"))