Ejemplo n.º 1
0
def _positive_predicted_index(predicted_label_data: pd.Series,
                              label_data: pd.Series,
                              positive_label_values: List[Any]) -> pd.Series:
    """
    Build the boolean index series that marks the positive predicted labels.

    The predicted labels are first validated against the observed labels: both
    must be classified as the same data type by ``common.series_datatype`` and
    the predicted labels must be castable to the dtype of the observed labels.

    :param predicted_label_data: input data for predicted label column
    :param label_data: input data for the label column
    :param positive_label_values: list of positive label values or intervals
    :return: boolean series that is True where the predicted label is positive
    :raises AssertionError: if the two series are classified as different data types
    :raises ValueError: if the predicted labels cannot be cast to the label dtype,
        or if no predicted label is positive
    :raises RuntimeError: if the data is neither continuous nor categorical with
        non-empty positive label values
    """
    predicted_label_datatype = common.series_datatype(predicted_label_data,
                                                      positive_label_values)
    label_datatype = common.series_datatype(label_data, positive_label_values)
    if predicted_label_datatype != label_datatype:
        raise AssertionError(
            "Predicted Label Column series datatype is not the same as Label Column series"
        )
    try:
        predicted_label_data = predicted_label_data.astype(label_data.dtype)
    except ValueError as e:
        # Chain the original error so the failing value stays visible in the traceback.
        raise ValueError(
            "Labels and predicted labels cannot have different types (%s, %s)."
            % (label_data.dtype, predicted_label_data.dtype)) from e
    if predicted_label_datatype == common.DataType.CONTINUOUS:
        # Intervals are derived from observed and predicted labels together.
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        data_interval_indices = _interval_index(
            pd.concat([label_data, predicted_label_data]), positive_label_values)
        positive_predicted_index = _continuous_data_idx(
            predicted_label_data, data_interval_indices)
    elif predicted_label_datatype == common.DataType.CATEGORICAL and positive_label_values:
        positive_predicted_index = _categorical_data_idx(
            predicted_label_data, positive_label_values)
    else:
        raise RuntimeError(
            "Predicted Label_column data is invalid or can't be classified")
    # check if positive index boolean series has all False values
    if (~positive_predicted_index).all():
        raise ValueError(
            "No Label values are present in the predicted Label Column,"
            "Positive Predicted Index Series contains all False values")
    return positive_predicted_index
Ejemplo n.º 2
0
def _positive_label_index(data: pd.Series, positive_values: List[Any]) -> Tuple[pd.Series, str]:
    """
    Compute the positive label index for a label column, together with a
    printable description of the values or intervals that count as positive.

    :param data: input data for label column
    :param positive_values: list of positive label values
    :return: tuple of (positive label index series, comma-joined string of the
        positive label values or of the computed intervals)
    :raises RuntimeError: if the data is neither continuous nor categorical with
        non-empty positive values
    """
    kind = common.series_datatype(data, positive_values)
    if kind == common.DataType.CONTINUOUS:
        # Continuous labels: the description lists the computed intervals.
        described_items = _interval_index(data, positive_values)
        positive_index = _continuous_data_idx(data, described_items)
    elif kind == common.DataType.CATEGORICAL and positive_values:
        # Categorical labels: the description lists the given values as-is.
        positive_index = _categorical_data_idx(data, positive_values)
        described_items = positive_values
    else:
        raise RuntimeError("Label_column data is invalid or can't be classified")
    label_values_or_intervals = ",".join(str(item) for item in described_items)
    logger.debug(f"positive index: {positive_index}")
    logger.debug(f"label values or intervals: {label_values_or_intervals}")
    return positive_index, label_values_or_intervals
Ejemplo n.º 3
0
def bias_report(
    df: pd.DataFrame,
    facet_column: FacetColumn,
    label_column: LabelColumn,
    stage_type: StageType,
    predicted_label_column: Optional[LabelColumn] = None,
    metrics: List[Any] = ["all"],
    group_variable: Optional[pd.Series] = None,
) -> List[Dict]:
    """
    Run full bias report on a dataset.

    The report computes the bias metric for multi-facet, and multi-class inputs by
    computing the sensitive_facet_index, positive_label_index, and positive_predicted_label_index by collapsing the
    multiple categories into two, as indicated by the facet_column, label_column, and predicted_label_column respectively.

    :param df: Dataset as a pandas.DataFrame
    :param facet_column: description of column to consider for Bias analysis
    :param label_column: description of column which has the labels.
    :param stage_type: pre_training or post_training for which bias metrics is computed
    :param predicted_label_column: description of column with predicted labels
    :param metrics: list of metrics names to provide bias metrics; ``["all"]`` selects
        every metric registered for the stage
    :param group_variable: data series for the group variable
    :return: list of dictionaries with metrics for different label values
    :raises ValueError: if the facet column is missing from ``df``, the positive label
        values are empty, the predicted positive label values differ from the label
        ones, ``stage_type`` is not a ``StageType``, or post-training metrics are
        requested without ``predicted_label_column``
    :raises RuntimeError: if the facet data is neither categorical nor continuous
    """
    if facet_column:
        if facet_column.name not in df.columns:
            raise ValueError("Facet column {} is not present in the dataset".format(facet_column.name))
    if not label_column.positive_label_values:
        raise ValueError("Positive label values or thresholds are empty for Label column")
    if isinstance(predicted_label_column, LabelColumn) and predicted_label_column.positive_label_values:
        if predicted_label_column.positive_label_values != label_column.positive_label_values:
            raise ValueError(
                "Positive predicted label values or threshold should be empty or same as label values or thresholds"
            )
    if not isinstance(stage_type, StageType):
        raise ValueError("stage_type should be a Enum value of StageType")
    if not predicted_label_column and stage_type == StageType.POST_TRAINING:
        raise ValueError("predicted_label_column has to be provided for Post training metrics")
    data_series: pd.Series = df[facet_column.name]
    # Facet, label and predicted-label columns are removed from the frame handed
    # to the metrics. `columns=` is used because the positional `axis` argument
    # of DataFrame.drop is no longer accepted by pandas 2.0+.
    df = df.drop(columns=facet_column.name)
    label_series: pd.Series = label_column.data
    positive_label_index, label_values = _positive_label_index(
        data=label_series, positive_values=label_column.positive_label_values
    )
    if label_column.name in df.columns:
        df = df.drop(columns=label_column.name)

    metrics_to_run = []
    if predicted_label_column and stage_type == StageType.POST_TRAINING:
        post_training_metrics = (
            smclarify.bias.metrics.POSTTRAINING_METRICS
            if metrics == ["all"]
            else fetch_metrics_to_run(smclarify.bias.metrics.POSTTRAINING_METRICS, metrics)
        )
        metrics_to_run.extend(post_training_metrics)
        predicted_label_series = predicted_label_column.data
        # Predicted labels are indexed with the label column's positive values.
        positive_predicted_label_index = _positive_predicted_index(
            predicted_label_data=predicted_label_series,
            label_data=label_series,
            positive_label_values=label_column.positive_label_values,
        )
        if predicted_label_column.name in df.columns:
            df = df.drop(columns=predicted_label_column.name)
    else:
        # Placeholder handed to the metric wrappers when no predictions are used.
        positive_predicted_label_index = [None]
        pre_training_metrics = (
            smclarify.bias.metrics.PRETRAINING_METRICS
            if metrics == ["all"]
            else fetch_metrics_to_run(smclarify.bias.metrics.PRETRAINING_METRICS, metrics)
        )
        metrics_to_run.extend(pre_training_metrics)
    metrics_to_run.sort(key=_metric_name_comparator)

    facet_dtype = common.series_datatype(data_series, facet_column.sensitive_values)
    data_series_cat: pd.Series  # Category series
    # result values can be str for label_values or dict for metrics
    result: MetricResult
    facet_metric: FacetReport
    metrics_result = []
    if facet_dtype == common.DataType.CATEGORICAL:
        data_series_cat = data_series.astype("category")
        # pass the values for metric one vs all case: without explicit sensitive
        # values every unique facet value is reported on its own
        facet_values_list = (
            [[val] for val in list(data_series.unique())]
            if not facet_column.sensitive_values
            else [facet_column.sensitive_values]
        )
        for facet_values in facet_values_list:
            # list of metrics with values
            metrics_list = []
            for metric in metrics_to_run:
                result = _categorical_metric_call_wrapper(
                    metric,
                    df,
                    data_series_cat,
                    facet_values,
                    positive_label_index,
                    positive_predicted_label_index,
                    group_variable,
                )
                metrics_list.append(result)
            facet_metric = FacetReport(facet_value_or_threshold=",".join(map(str, facet_values)), metrics=metrics_list)
            metrics_result.append(facet_metric.toJson())
        logger.debug("metric_result: %s", str(metrics_result))
        return metrics_result

    elif facet_dtype == common.DataType.CONTINUOUS:
        facet_interval_indices = _interval_index(data_series, facet_column.sensitive_values)
        facet_continuous_column = FacetContinuousColumn(facet_column.name, facet_interval_indices)
        logger.info(f"Threshold Interval indices: {facet_interval_indices}")
        # list of metrics with values
        metrics_list = []
        for metric in metrics_to_run:
            result = _continuous_metric_call_wrapper(
                metric,
                df,
                data_series,
                facet_continuous_column.interval_indices,
                positive_label_index,
                positive_predicted_label_index,
                group_variable,
            )
            metrics_list.append(result)
        facet_metric = FacetReport(
            facet_value_or_threshold=",".join(map(str, facet_interval_indices)), metrics=metrics_list
        )
        metrics_result.append(facet_metric.toJson())
        # The format string needs a placeholder; without one logging reports a
        # formatting error and the message is lost.
        logger.debug("metric_result: %s", str(metrics_result))
        return metrics_result
    else:
        raise RuntimeError("facet_column data is invalid or can't be classified")