Example #1
0
File: util.py — Project: IBM/lale
    def __init__(self, metric, favorable_labels, protected_attributes):
        """Resolve *metric* against the AIF360 metric classes and set up encoders.

        Raises ValueError if *metric* is not an attribute of either
        aif360.metrics.BinaryLabelDatasetMetric or
        aif360.metrics.ClassificationMetric.
        """
        # Prefer BinaryLabelDatasetMetric when the metric exists on both classes.
        for candidate in ("BinaryLabelDatasetMetric", "ClassificationMetric"):
            if hasattr(getattr(aif360.metrics, candidate), metric):
                self.kind = candidate
                break
        else:
            raise ValueError(f"unknown metric {metric}")
        self.metric = metric
        self.fairness_info = {
            "favorable_labels": favorable_labels,
            "protected_attributes": protected_attributes,
        }

        from lale.lib.aif360 import ProtectedAttributesEncoder

        self.prot_attr_enc = ProtectedAttributesEncoder(
            **self.fairness_info,
            remainder="drop",
            return_X_y=True,
        )
        # Encoded protected attributes are binary: 0 = unprivileged, 1 = privileged.
        feature_names = [
            _ensure_str(pa["feature"]) for pa in protected_attributes
        ]
        self.unprivileged_groups = [dict.fromkeys(feature_names, 0)]
        self.privileged_groups = [dict.fromkeys(feature_names, 1)]

        self.pandas_to_dataset = _PandasToDatasetConverter(
            favorable_label=1,
            unfavorable_label=0,
            protected_attribute_names=feature_names,
        )
Example #2
0
    def __init__(
        self,
        metric,
        favorable_label=None,
        unfavorable_label=None,
        protected_attribute_names=None,
        unprivileged_groups=None,
        privileged_groups=None,
        favorable_labels=None,
        protected_attributes=None,
    ):
        """Resolve *metric* against the AIF360 metric classes and store fairness info.

        Supports two mutually exclusive calling conventions: either the
        raw AIF360 arguments (favorable_label, unfavorable_label,
        protected_attribute_names, unprivileged_groups,
        privileged_groups) or the higher-level ones (favorable_labels,
        protected_attributes), from which the raw arguments are derived.

        Raises ValueError if *metric* is not an attribute of either
        aif360.metrics.BinaryLabelDatasetMetric or
        aif360.metrics.ClassificationMetric.
        """
        # Prefer BinaryLabelDatasetMetric when the metric exists on both classes.
        if hasattr(aif360.metrics.BinaryLabelDatasetMetric, metric):
            self.kind = "BinaryLabelDatasetMetric"
        elif hasattr(aif360.metrics.ClassificationMetric, metric):
            self.kind = "ClassificationMetric"
        else:
            raise ValueError(f"unknown metric {metric}")
        self.metric = metric
        if favorable_labels is None:
            # Raw AIF360 calling convention: no protected-attributes encoding.
            self.prot_attr_enc = None
        else:
            # High-level calling convention: derive the raw AIF360 arguments.
            # NOTE(review): self.favorable_labels is only set on this branch;
            # code that reads it after the raw-convention path would fail — confirm.
            self.favorable_labels = favorable_labels
            # These asserts enforce that the two conventions are not mixed
            # (stripped under `python -O`).
            assert favorable_label is None and unfavorable_label is None
            favorable_label, unfavorable_label = 1, 0
            assert protected_attribute_names is None
            pas = protected_attributes
            protected_attribute_names = [
                _ensure_str(pa["feature"]) for pa in pas
            ]
            assert unprivileged_groups is None and privileged_groups is None
            # Encoded protected attributes are binary: 0 = unprivileged, 1 = privileged.
            unprivileged_groups = [{
                _ensure_str(pa["feature"]): 0
                for pa in pas
            }]
            privileged_groups = [{_ensure_str(pa["feature"]): 1 for pa in pas}]

            from lale.lib.aif360 import ProtectedAttributesEncoder

            self.prot_attr_enc = ProtectedAttributesEncoder(
                favorable_labels=favorable_labels,
                protected_attributes=protected_attributes,
                remainder="drop",
                return_X_y=True,
            )
        self.fairness_info = {
            "favorable_label": favorable_label,
            "unfavorable_label": unfavorable_label,
            "protected_attribute_names": protected_attribute_names,
            "unprivileged_groups": unprivileged_groups,
            "privileged_groups": privileged_groups,
        }
        # Fail early if the assembled fairness info violates the schema.
        lale.type_checking.validate_schema(self.fairness_info,
                                           _dataset_fairness_schema)
        self.pandas_to_dataset = _PandasToDatasetConverter(
            favorable_label, unfavorable_label, protected_attribute_names)
Example #3
0
File: util.py — Project: IBM/lale
    def fit(self, X, y):
        """Fit redaction + preparation, the encoder, and the mitigator on (X, y).

        Returns self.
        """
        from lale.lib.aif360 import ProtectedAttributesEncoder, Redacting

        fairness_info = {
            "favorable_labels": self.favorable_labels,
            "protected_attributes": self.protected_attributes,
        }
        # Optionally redact protected attributes before the preparation step.
        if self.redact:
            redacting = Redacting(**fairness_info)
        else:
            redacting = lale.lib.lale.NoOp
        pipeline = redacting >> self.preparation
        assert isinstance(pipeline, lale.operators.TrainablePipeline)
        self.redact_and_prep = pipeline.fit(X, y)
        # Encoder maps protected attributes and labels to binary 0/1 columns.
        self.prot_attr_enc = ProtectedAttributesEncoder(
            **fairness_info, remainder="drop", return_X_y=True
        )
        feature_names = [pa["feature"] for pa in self.protected_attributes]
        self.pandas_to_dataset = _PandasToDatasetConverter(
            favorable_label=1,
            unfavorable_label=0,
            protected_attribute_names=feature_names,
        )
        self.mitigator.fit(self._prep_and_encode(X, y))
        # Labels not listed as favorable; set-difference order is unspecified.
        self.unfavorable_labels = list(set(list(y)) - set(list(self.favorable_labels)))
        return self
Example #4
0
File: util.py — Project: IBM/lale
def _column_for_stratification(X, y, favorable_labels, protected_attributes):
    """Build a per-row string label for stratified splitting.

    Each row's label has one character per encoded column ("T" for 1,
    "F" otherwise), covering the encoded protected attributes and the
    encoded outcome.  Returns a pandas Series named "stratify".
    """
    from lale.lib.aif360 import ProtectedAttributesEncoder

    encoder = ProtectedAttributesEncoder(
        favorable_labels=favorable_labels,
        protected_attributes=protected_attributes,
        remainder="drop",
        return_X_y=True,
    )
    enc_X, enc_y = encoder.transform(X, y)
    combined = pd.concat([enc_X, enc_y], axis=1)

    result = combined.apply(
        lambda row: "".join("T" if v == 1 else "F" for v in row), axis=1
    )
    result.name = "stratify"
    return result
Example #5
0
    def __init__(
        self,
        metric: str,
        favorable_labels: _FAV_LABELS_TYPE,
        protected_attributes: List[JSON_TYPE],
        unfavorable_labels: Optional[_FAV_LABELS_TYPE],
    ):
        """Validate the fairness info, resolve the AIF360 metric, and set up encoders.

        Raises ValueError if *metric* is not an attribute of either
        aif360.metrics.BinaryLabelDatasetMetric or
        aif360.metrics.ClassificationMetric.
        """
        _validate_fairness_info(favorable_labels, protected_attributes,
                                unfavorable_labels, True)
        # These two metrics do not use unfavorable labels, and passing them
        # through may confound AIF360, so drop them.
        if metric in ["disparate_impact", "statistical_parity_difference"]:
            unfavorable_labels = None
        # Prefer BinaryLabelDatasetMetric when the metric exists on both classes.
        for candidate in ("BinaryLabelDatasetMetric", "ClassificationMetric"):
            if hasattr(getattr(aif360.metrics, candidate), metric):
                self.kind = candidate
                break
        else:
            raise ValueError(f"unknown metric {metric}")
        self.metric = metric
        self.fairness_info = {
            "favorable_labels": favorable_labels,
            "protected_attributes": protected_attributes,
            "unfavorable_labels": unfavorable_labels,
        }

        from lale.lib.aif360 import ProtectedAttributesEncoder

        self.prot_attr_enc = ProtectedAttributesEncoder(
            **self.fairness_info,
            remainder="drop",
            return_X_y=True,
        )
        # Encoded protected attributes are binary: 0 = unprivileged, 1 = privileged.
        feature_names = [
            _ensure_str(pa["feature"]) for pa in protected_attributes
        ]
        self.unprivileged_groups = [dict.fromkeys(feature_names, 0)]
        self.privileged_groups = [dict.fromkeys(feature_names, 1)]
        # Dataset converter is built lazily on first use.
        self._cached_pandas_to_dataset = None
Example #6
0
    def fit(self, X, y):
        """Fit the redacting/estimator pipeline and the post-processing mitigator.

        Trains the (optionally redacted) estimator on (X, y), then
        encodes ground truth and predictions into AIF360 datasets and
        fits the mitigator on the (true, predicted) dataset pair.

        Returns self.
        """
        from lale.lib.aif360 import ProtectedAttributesEncoder, Redacting

        fairness_info = {
            "favorable_labels": self.favorable_labels,
            "protected_attributes": self.protected_attributes,
            "unfavorable_labels": self.unfavorable_labels,
        }
        # Optionally redact protected attributes before the estimator sees them.
        redacting = Redacting(
            **fairness_info) if self.redact else lale.lib.lale.NoOp
        trainable_redact_and_estim = redacting >> self.estimator
        assert isinstance(trainable_redact_and_estim, TrainablePipeline)
        self.redact_and_estim = trainable_redact_and_estim.fit(X, y)
        # Encoder maps protected attributes and labels to binary 0/1 columns.
        self.prot_attr_enc = ProtectedAttributesEncoder(
            **fairness_info,
            remainder="drop",
            return_X_y=True,
        )
        prot_attr_names = [pa["feature"] for pa in self.protected_attributes]
        self.pandas_to_dataset = _PandasToDatasetConverter(
            favorable_label=1,
            unfavorable_label=0,
            protected_attribute_names=prot_attr_names,
        )
        encoded_X, encoded_y = self.prot_attr_enc.transform(X, y)
        # Remember the encoded label dtype/name so predictions can be
        # converted back to a compatible Series later.
        self.y_dtype = encoded_y.dtype
        self.y_name = encoded_y.name
        predicted_y = self.redact_and_estim.predict(X)
        predicted_y = _ndarray_to_series(predicted_y, self.y_name, X.index)
        _, predicted_y = self.prot_attr_enc.transform(X, predicted_y)
        predicted_probas = self.redact_and_estim.predict_proba(X)
        # The mitigator is fit on both the ground-truth and predicted datasets.
        dataset_true = self.pandas_to_dataset.convert(encoded_X, encoded_y)
        dataset_pred = self.pandas_to_dataset.convert(encoded_X, predicted_y,
                                                      predicted_probas)
        self.mitigator = self.mitigator.fit(dataset_true, dataset_pred)
        self.classes_ = set(list(y))
        # Labels not listed as favorable; set-difference order is unspecified.
        self.not_favorable_labels = list(self.classes_ -
                                         set(list(self.favorable_labels)))
        self.classes_ = np.array(list(self.classes_))
        return self
Example #7
0
File: util.py — Project: IBM/lale
class _BasePostEstimatorImpl:
    """Base for post-processing fairness mitigators that wrap an estimator.

    Fits an (optionally redacted) estimator, then applies an AIF360
    post-processing mitigator to its predictions.
    """

    def __init__(
        self,
        *,
        favorable_labels,
        protected_attributes,
        estimator,
        redact,
        mitigator,
    ):
        # Store configuration; all real work happens in fit/predict.
        self.favorable_labels = favorable_labels
        self.protected_attributes = protected_attributes
        self.estimator = estimator
        self.redact = redact
        self.mitigator = mitigator

    def _decode(self, y):
        """Map encoded binary labels (1/0) back to the original label values.

        Requires self.unfavorable_labels, which fit() sets; only works
        when there is exactly one favorable and one unfavorable label.
        """
        assert isinstance(y, pd.Series)
        assert len(self.favorable_labels) == 1 and len(self.unfavorable_labels) == 1
        favorable, unfavorable = self.favorable_labels[0], self.unfavorable_labels[0]
        result = y.map(lambda label: favorable if label == 1 else unfavorable)
        return result

    def fit(self, X, y):
        """Fit the redacting/estimator pipeline and the post-processing mitigator.

        Returns self.
        """
        from lale.lib.aif360 import ProtectedAttributesEncoder, Redacting

        fairness_info = {
            "favorable_labels": self.favorable_labels,
            "protected_attributes": self.protected_attributes,
        }
        # Optionally redact protected attributes before the estimator sees them.
        redacting = Redacting(**fairness_info) if self.redact else lale.lib.lale.NoOp
        trainable_redact_and_estim = redacting >> self.estimator
        assert isinstance(trainable_redact_and_estim, lale.operators.TrainablePipeline)
        self.redact_and_estim = trainable_redact_and_estim.fit(X, y)
        # Encoder maps protected attributes and labels to binary 0/1 columns.
        self.prot_attr_enc = ProtectedAttributesEncoder(
            **fairness_info,
            remainder="drop",
            return_X_y=True,
        )
        prot_attr_names = [pa["feature"] for pa in self.protected_attributes]
        self.pandas_to_dataset = _PandasToDatasetConverter(
            favorable_label=1,
            unfavorable_label=0,
            protected_attribute_names=prot_attr_names,
        )
        encoded_X, encoded_y = self.prot_attr_enc.transform(X, y)
        # Remember the encoded label dtype/name so predictions can be
        # converted back to a compatible Series later.
        self.y_dtype = encoded_y.dtype
        self.y_name = encoded_y.name
        predicted_y = self.redact_and_estim.predict(X)
        predicted_y = _ndarray_to_series(predicted_y, self.y_name, X.index)
        _, predicted_y = self.prot_attr_enc.transform(X, predicted_y)
        predicted_probas = self.redact_and_estim.predict_proba(X)
        # The mitigator is fit on both the ground-truth and predicted datasets.
        dataset_true = self.pandas_to_dataset.convert(encoded_X, encoded_y)
        dataset_pred = self.pandas_to_dataset.convert(
            encoded_X, predicted_y, predicted_probas
        )
        self.mitigator = self.mitigator.fit(dataset_true, dataset_pred)
        # Labels not listed as favorable; set-difference order is unspecified.
        self.unfavorable_labels = list(set(list(y)) - set(list(self.favorable_labels)))
        return self

    def predict(self, X):
        """Predict labels for X, post-processed by the mitigator.

        Returns a pandas Series with labels decoded back to their
        original values.
        """
        predicted_y = self.redact_and_estim.predict(X)
        predicted_probas = self.redact_and_estim.predict_proba(X)
        predicted_y = _ndarray_to_series(predicted_y, self.y_name, X.index)
        encoded_X, predicted_y = self.prot_attr_enc.transform(X, predicted_y)
        dataset_pred = self.pandas_to_dataset.convert(
            encoded_X, predicted_y, predicted_probas
        )
        # Let the mitigator adjust the predicted dataset before decoding.
        dataset_out = self.mitigator.predict(dataset_pred)
        _, result_y = dataset_to_pandas(dataset_out, return_only="y")
        decoded_y = self._decode(result_y)
        return decoded_y
Example #8
0
File: util.py — Project: IBM/lale
class _ScorerFactory:
    """Wrap an AIF360 fairness metric as a scikit-learn-style scorer."""

    def __init__(self, metric, favorable_labels, protected_attributes):
        """Resolve *metric* against the AIF360 metric classes and set up encoders.

        Raises ValueError if *metric* is not an attribute of either
        aif360.metrics.BinaryLabelDatasetMetric or
        aif360.metrics.ClassificationMetric.
        """
        # Prefer BinaryLabelDatasetMetric when the metric exists on both classes.
        if hasattr(aif360.metrics.BinaryLabelDatasetMetric, metric):
            self.kind = "BinaryLabelDatasetMetric"
        elif hasattr(aif360.metrics.ClassificationMetric, metric):
            self.kind = "ClassificationMetric"
        else:
            raise ValueError(f"unknown metric {metric}")
        self.metric = metric
        self.fairness_info = {
            "favorable_labels": favorable_labels,
            "protected_attributes": protected_attributes,
        }

        from lale.lib.aif360 import ProtectedAttributesEncoder

        self.prot_attr_enc = ProtectedAttributesEncoder(
            **self.fairness_info,
            remainder="drop",
            return_X_y=True,
        )
        pas = protected_attributes
        # Encoded protected attributes are binary: 0 = unprivileged, 1 = privileged.
        self.unprivileged_groups = [{_ensure_str(pa["feature"]): 0 for pa in pas}]
        self.privileged_groups = [{_ensure_str(pa["feature"]): 1 for pa in pas}]

        self.pandas_to_dataset = _PandasToDatasetConverter(
            favorable_label=1,
            unfavorable_label=0,
            protected_attribute_names=[_ensure_str(pa["feature"]) for pa in pas],
        )

    def scoring(self, y_true=None, y_pred=None, X=None):
        """Compute the fairness metric from predictions (and, if needed, ground truth).

        y_true is only required when y_pred is not a Series (to supply a
        name) or when the metric is a ClassificationMetric.
        """
        assert y_pred is not None
        assert X is not None
        y_pred_orig = y_pred
        if not isinstance(y_pred, pd.Series):
            assert y_true is not None
            # Convert the raw prediction array to a Series aligned with X.
            y_pred = _ndarray_to_series(
                y_pred,
                y_true.name
                if isinstance(y_true, pd.Series)
                else _ensure_str(X.shape[1]),
                X.index if isinstance(X, pd.DataFrame) else None,
                y_pred.dtype,
            )
        encoded_X, y_pred = self.prot_attr_enc.transform(X, y_pred)
        dataset_pred = self.pandas_to_dataset.convert(encoded_X, y_pred)
        if self.kind == "BinaryLabelDatasetMetric":
            # Dataset-level metric: only needs the predicted dataset.
            fairness_metrics = aif360.metrics.BinaryLabelDatasetMetric(
                dataset_pred, self.unprivileged_groups, self.privileged_groups
            )
        else:
            # Classification metric: also needs the ground-truth dataset.
            assert self.kind == "ClassificationMetric"
            assert y_true is not None
            if not isinstance(y_true, pd.Series):
                y_true = _ndarray_to_series(
                    y_true, y_pred.name, y_pred.index, y_pred_orig.dtype
                )
            _, y_true = self.prot_attr_enc.transform(X, y_true)
            dataset_true = self.pandas_to_dataset.convert(encoded_X, y_true)
            fairness_metrics = aif360.metrics.ClassificationMetric(
                dataset_true,
                dataset_pred,
                self.unprivileged_groups,
                self.privileged_groups,
            )
        # Look up and invoke the metric method by name.
        method = getattr(fairness_metrics, self.metric)
        result = method()
        if np.isnan(result) or not np.isfinite(result):
            # Diagnose the usual causes of an ill-defined metric.
            if 0 == fairness_metrics.num_positives(privileged=True):
                logger.warning("there are 0 positives in the privileged group")
            if 0 == fairness_metrics.num_positives(privileged=False):
                logger.warning("there are 0 positives in the unprivileged group")
            if 0 == fairness_metrics.num_instances(privileged=True):
                logger.warning("there are 0 instances in the privileged group")
            if 0 == fairness_metrics.num_instances(privileged=False):
                logger.warning("there are 0 instances in the unprivileged group")
            # Disparate impact falls back to the worst possible score.
            if self.metric == "disparate_impact":
                result = 0.0
            logger.warning(
                f"The metric {self.metric} is ill-defined and returns {result}. Check your fairness configuration. The set of predicted labels is {set(y_pred_orig)}."
            )
        return result

    def scorer(self, estimator, X, y):
        """Score *estimator* by computing the metric on its predictions for X."""
        return self.scoring(y_true=y, y_pred=estimator.predict(X), X=X)

    def __call__(self, estimator, X, y):
        # Makes the factory usable directly as a scikit-learn scorer.
        return self.scorer(estimator, X, y)
Example #9
0
class _ScorerFactory:
    """Wrap an AIF360 fairness metric as a scikit-learn-style scorer."""

    # Lazily-built dataset converter; see _pandas_to_dataset().
    _cached_pandas_to_dataset: Optional[_PandasToDatasetConverter]

    def __init__(
        self,
        metric: str,
        favorable_labels: _FAV_LABELS_TYPE,
        protected_attributes: List[JSON_TYPE],
        unfavorable_labels: Optional[_FAV_LABELS_TYPE],
    ):
        """Validate fairness info, resolve the AIF360 metric, and set up encoders.

        Raises ValueError if *metric* is not an attribute of either
        aif360.metrics.BinaryLabelDatasetMetric or
        aif360.metrics.ClassificationMetric.
        """
        _validate_fairness_info(favorable_labels, protected_attributes,
                                unfavorable_labels, True)
        # These two metrics do not use unfavorable labels, and passing them
        # through may confound AIF360, so drop them.
        if metric in ["disparate_impact", "statistical_parity_difference"]:
            unfavorable_labels = None  # not used and may confound AIF360
        # Prefer BinaryLabelDatasetMetric when the metric exists on both classes.
        if hasattr(aif360.metrics.BinaryLabelDatasetMetric, metric):
            self.kind = "BinaryLabelDatasetMetric"
        elif hasattr(aif360.metrics.ClassificationMetric, metric):
            self.kind = "ClassificationMetric"
        else:
            raise ValueError(f"unknown metric {metric}")
        self.metric = metric
        self.fairness_info = {
            "favorable_labels": favorable_labels,
            "protected_attributes": protected_attributes,
            "unfavorable_labels": unfavorable_labels,
        }

        from lale.lib.aif360 import ProtectedAttributesEncoder

        self.prot_attr_enc = ProtectedAttributesEncoder(
            **self.fairness_info,
            remainder="drop",
            return_X_y=True,
        )
        pas = protected_attributes
        # Encoded protected attributes are binary: 0 = unprivileged, 1 = privileged.
        self.unprivileged_groups = [{
            _ensure_str(pa["feature"]): 0
            for pa in pas
        }]
        self.privileged_groups = [{
            _ensure_str(pa["feature"]): 1
            for pa in pas
        }]
        self._cached_pandas_to_dataset = None

    def _pandas_to_dataset(self) -> _PandasToDatasetConverter:
        """Return the dataset converter, building it on first use."""
        if self._cached_pandas_to_dataset is None:
            # Feature names are recovered from the privileged-groups dict keys.
            self._cached_pandas_to_dataset = _PandasToDatasetConverter(
                favorable_label=1,
                unfavorable_label=0,
                protected_attribute_names=list(
                    self.privileged_groups[0].keys()),
            )
        return self._cached_pandas_to_dataset

    def _y_pred_series(self, y_true, y_pred, X) -> pd.Series:
        """Return y_pred as a pandas Series aligned with X.

        When y_pred is a raw array, its name is borrowed from y_true
        (which must then be provided).
        """
        if isinstance(y_pred, pd.Series):
            return y_pred
        assert y_true is not None
        return _ndarray_to_series(
            y_pred,
            y_true.name
            if isinstance(y_true, pd.Series) else _ensure_str(X.shape[1]),
            X.index if isinstance(X, pd.DataFrame) else None,
            y_pred.dtype,
        )

    def score_data(self, y_true=None, y_pred=None, X=None) -> float:
        """Compute the fairness metric from predictions (and, if needed, ground truth).

        Raises ValueError (with diagnostic context) if the labels in the
        data do not match the configured fairness info.
        """
        assert y_pred is not None
        assert X is not None
        y_pred_orig = y_pred
        y_pred = self._y_pred_series(y_true, y_pred, X)
        encoded_X, y_pred = self.prot_attr_enc.transform(X, y_pred)
        try:
            dataset_pred = self._pandas_to_dataset().convert(encoded_X, y_pred)
        except ValueError as e:
            # Re-raise with the fairness configuration for easier debugging.
            raise ValueError(
                "The data has unexpected labels given the fairness info: "
                f"favorable labels {self.fairness_info['favorable_labels']}, "
                f"unfavorable labels {self.fairness_info['unfavorable_labels']}, "
                f"unique values in y_pred {set(y_pred_orig)}.") from e
        if self.kind == "BinaryLabelDatasetMetric":
            # Dataset-level metric: only needs the predicted dataset.
            fairness_metrics = aif360.metrics.BinaryLabelDatasetMetric(
                dataset_pred, self.unprivileged_groups, self.privileged_groups)
        else:
            # Classification metric: also needs the ground-truth dataset.
            assert self.kind == "ClassificationMetric"
            assert y_true is not None
            if not isinstance(y_true, pd.Series):
                y_true = _ndarray_to_series(y_true, y_pred.name, y_pred.index,
                                            y_pred_orig.dtype)
            _, y_true = self.prot_attr_enc.transform(X, y_true)
            dataset_true = self._pandas_to_dataset().convert(encoded_X, y_true)
            fairness_metrics = aif360.metrics.ClassificationMetric(
                dataset_true,
                dataset_pred,
                self.unprivileged_groups,
                self.privileged_groups,
            )
        # Look up and invoke the metric method by name.
        method = getattr(fairness_metrics, self.metric)
        result = method()
        if np.isnan(result) or not np.isfinite(result):
            # Diagnose the usual causes of an ill-defined metric.
            if 0 == fairness_metrics.num_positives(privileged=True):
                logger.warning("there are 0 positives in the privileged group")
            if 0 == fairness_metrics.num_positives(privileged=False):
                logger.warning(
                    "there are 0 positives in the unprivileged group")
            if 0 == fairness_metrics.num_instances(privileged=True):
                logger.warning("there are 0 instances in the privileged group")
            if 0 == fairness_metrics.num_instances(privileged=False):
                logger.warning(
                    "there are 0 instances in the unprivileged group")
            logger.warning(
                f"The metric {self.metric} is ill-defined and returns {result}. Check your fairness configuration. The set of predicted labels is {set(y_pred_orig)}."
            )
        return result

    def score_estimator(self, estimator: TrainedOperator, X, y) -> float:
        """Score *estimator* by computing the metric on its predictions for X."""
        return self.score_data(y_true=y, y_pred=estimator.predict(X), X=X)

    def __call__(self, estimator: TrainedOperator, X, y) -> float:
        # Makes the factory usable directly as a scikit-learn scorer.
        return self.score_estimator(estimator, X, y)