예제 #1
0
def test_temporarily_ignore():
    """Check that ``temporarily_ignore`` masks a field only inside the block.

    Two datasets that differ solely in ``labels`` must compare unequal
    normally, equal while 'labels' is ignored, and 'labels' must no longer be
    listed in ``ignore_fields`` once the context exits.
    """
    original = StructuredDataset(
        df=df,
        label_names=['label'],
        protected_attribute_names=['one', 'three'],
    )
    # Same data except that every label is shifted by one.
    shifted = original.copy()
    shifted.labels = original.labels + 1

    assert original != shifted
    with original.temporarily_ignore('labels'):
        assert original == shifted
    # The ignore must not leak out of the context manager.
    assert 'labels' not in original.ignore_fields
예제 #2
0
def test_copy():
    """Check the two flavours of ``StructuredDataset.copy``.

    A plain ``copy()`` is expected to observe later writes to the source's
    ``features``, while ``copy(True)`` is expected to be unaffected by them.
    """
    source = StructuredDataset(
        df=df,
        label_names=['label'],
        protected_attribute_names=['two'],
    )
    # Both copies are taken BEFORE the source is mutated.
    linked = source.copy()
    detached = source.copy(True)

    source.features[0] = 999

    assert np.all(linked.features[0] == 999)
    assert not np.any(detached.features[0] == 999)
예제 #3
0
def test_split():
    """Check that splitting by ratio ``[0.5]`` and by count ``2`` agree.

    Also verifies that stacking the two halves' features reproduces the
    features of the unsplit dataset.
    """
    dataset = StructuredDataset(
        df=df,
        label_names=['label'],
        protected_attribute_names=['two'],
    )
    first_half, second_half = dataset.split([0.5])
    first_fold, second_fold = dataset.split(2)

    assert first_half == first_fold
    assert second_half == second_fold

    # The halves, put back together in order, must equal the original rows.
    rejoined = np.concatenate((first_half.features, second_half.features))
    assert np.all(rejoined == dataset.features)
예제 #4
0
def test_eq():
    """Check dataset equality across copies and across differing metadata.

    Both kinds of copy must compare equal to the source and to each other; a
    dataset built with different protected attributes must compare unequal.
    """
    base = StructuredDataset(
        df=df,
        label_names=['label'],
        protected_attribute_names=['two'],
    )
    plain_copy = base.copy()
    flagged_copy = base.copy(True)
    # Same frame, but a different choice of protected attributes.
    other_protected = StructuredDataset(
        df=df,
        label_names=['label'],
        protected_attribute_names=['one', 'three'],
    )

    assert base == plain_copy
    assert base == flagged_copy
    assert plain_copy == flagged_copy
    assert base != other_protected
예제 #5
0
def test_k_folds():
    """Check ``split(k)`` with an integer number of folds.

    With 4 folds every fold must hold exactly one instance across all of its
    per-instance arrays; with 3 folds the first fold must hold two instances.
    """
    dataset = StructuredDataset(
        df=df,
        label_names=['label'],
        protected_attribute_names=['two'],
    )
    quarters = dataset.split(4)

    assert len(quarters) == 4
    for fold in quarters:
        # All per-instance containers must agree on a length of exactly 1.
        assert (fold.features.shape[0] == fold.labels.shape[0]
                == fold.protected_attributes.shape[0]
                == len(fold.instance_names)
                == fold.instance_weights.shape[0] == 1)

    thirds = dataset.split(3)
    assert thirds[0].features.shape[0] == 2
예제 #6
0
    def enforce_dummy_coded(self, X):
        """
        Enforces that for dummy-coded features exactly one feature is set to 1
        and all the others to 0. Called after gradient ascent.

        For every dummy-coded feature group, the column holding the row-wise
        maximum becomes 1 and the remaining columns of the group become 0
        (``np.argmax`` picks the first column on ties). ``X`` is modified in
        place and the same array is returned.

        :param X: Feature matrix (dimension `n_instances x n_features`)
        :returns: X' (modified feature matrix)
        """
        dummy_groups = StructuredDataset._parse_feature_names(
            self.feature_names)[0]
        row_ids = np.arange(X.shape[0])

        for base_name, categories in dummy_groups.items():
            # Column positions of every dummy column of this feature.
            ft_indices = [
                self.feature_names.index(base_name + '=' + category)
                for category in categories
            ]
            # Position (within the group) of the largest value in each row.
            max_index = np.argmax(X[:, ft_indices], axis=1)

            X[:, ft_indices] = 0
            # Integer-array indexing sets the winning column of every row in
            # one step instead of looping over rows in Python.
            X[row_ids, np.asarray(ft_indices)[max_index]] = 1

            # Sanity check: every row has exactly one active column per group.
            assert np.all(X[:, ft_indices].sum(axis=1) == 1)

        return X
예제 #7
0
 def _get_domain(self, ft):
     """
     Infers domain of feature.

     The lookup order is: a callable registered in ``self.domains`` (its
     single result is wrapped in a list), dummy-coded features (rejected),
     features listed in ``self.discrete`` (distinct values present in
     ``self.features``), and finally everything else, which is treated as
     continuous and described by its observed ``(min, max)``.

     :param ft: Feature name
     :returns: Domain (list of values, or a ``(min, max)`` tuple for
         continuous features)
     :raises Exception: if ``ft`` is dummy coded
     """
     if callable(self.domains[ft]):
         return [self.domains[ft]()]

     if self._is_dummy_coded(ft):
         # Dummy-coded features are deliberately unsupported here; the old
         # fallback that followed this raise was unreachable and is removed.
         raise Exception("Can't use dummy coded for sim")

     if ft in self.discrete:
         # discrete: use the set of values present in the dataset
         return list(set(self.features[:, self._ft_index(ft)]))

     # continuous: fall back to the observed value range
     df, _ = self.convert_to_dataframe()
     warnings.warn("Used min/max for feature " + ft +
                   " to infer domain + unsupported/not implemented yet")
     # Select the column by label; ``df[:, ft]`` is not valid DataFrame
     # indexing and made this branch always fail.
     column = df[ft]
     return (column.min(), column.max())
예제 #8
0
 def _is_dummy_coded(self, ft):
     """
     Tell whether a feature is dummy coded.

     :param ft: Feature name
     :returns: Number of dummy-coded categories recorded for ``ft``; truthy
         when ``ft`` is dummy coded.
     """
     # TODO: fix this (original author's note).
     # NOTE(review): the result is an int count, not a strict bool - callers
     # appear to rely on truthiness only; confirm before tightening.
     dummy_map = StructuredDataset._parse_feature_names(self.feature_names)[0]
     categories = dummy_map[ft]
     return len(categories)
예제 #9
0
    def _dedummy_code_obj(self, obj, sep='='):
        """
        Collapse the dummy-coded entries of one instance into plain features.

        For each dummy-coded feature, the ``feature<sep>option`` entries are
        removed and replaced by a single ``feature`` entry holding the option
        whose dummy value equals 1 (the last such option if several are set,
        ``None`` if none is set). The work is done on a copy of ``obj``.

        :param obj: Instance (feature values) in object form (dict)
        :param sep: Separator used for dummy coding
        :returns: dedummy coded object
        """
        # Hand-rolled because the library routine is too slow for one row.
        decoded = obj.copy()
        dummy_groups = StructuredDataset._parse_feature_names(
            self.feature_names)[0]

        for feature, options in dummy_groups.items():
            # Options whose dummy entry is switched on for this instance.
            active = [opt for opt in options if obj[feature + sep + opt] == 1]

            # Store the plain (non-dummy) value under the base feature name.
            decoded[feature] = active[-1] if active else None

            # Drop every dummy-coded entry, i.e. [key=value].
            for opt in options:
                decoded.pop(feature + sep + opt)

        return decoded
예제 #10
0
    def scale_dummy_coded(self, X):
        """
        Rescale each dummy-coded feature group so that, per row, the values of
        the group sum up to 1. Called during gradient ascent. You may find an
        in-depth explanation in the write-up.

        ``X`` is modified in place and the same array is returned.

        :param X: Feature matrix (dimension `n_instances x n_features`)
        :returns: X' (modified feature matrix)
        """
        dummy_groups = StructuredDataset._parse_feature_names(
            self.feature_names)[0]

        for feature, categories in dummy_groups.items():
            # Column positions of every dummy column of this feature.
            ft_indices = [
                self.feature_names.index(feature + '=' + category)
                for category in categories
            ]

            # Divide every row of the group by that row's total.
            group = X[:, ft_indices]
            row_totals = group.sum(axis=1)
            X[:, ft_indices] = group / row_totals[:, None]

            # Each row now sums to 1, so the grand total equals the row count.
            grand_total = X[:, ft_indices].sum(axis=1).sum()
            assert (np.isclose(grand_total, len(X)))

        return X
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

from aif360.datasets import StructuredDataset
from aif360.metrics import SampleDistortionMetric

# Module-level fixtures shared by the code that follows.
# `data` is a 4x3 integer matrix whose columns hold 0-3, 4-7 and 8-11.
data = np.arange(12).reshape((3, 4)).T
cols = ['one', 'two', 'three', 'label']
# Constant label column of ones, one entry per row of `data`.
labs = np.ones((4, 1))

# Features and labels side by side, named by the four entries of `cols`.
df = pd.DataFrame(data=np.concatenate((data, labs), axis=1), columns=cols)
sd = StructuredDataset(df=df,
                       label_names=['label'],
                       protected_attribute_names=['one', 'three'])

# Same matrix with every value shifted up by one.
distorted = data + 1

# NOTE(review): copy(True) presumably requests a deep copy - confirm against
# StructuredDataset.copy. The copy's features are then swapped for `distorted`.
sd_distorted = sd.copy(True)
sd_distorted.features = distorted

# Random integer fixtures drawn from [0, 10). No seed is set in this part of
# the file, so the values differ from run to run.
rand = np.random.randint(0, 10, (4, 4))
rand2 = np.random.randint(0, 10, (4, 3))
df_rand = pd.DataFrame(data=rand, columns=cols)
sd_rand = StructuredDataset(df=df_rand,
                            label_names=['label'],
                            protected_attribute_names=['one', 'three'])
# Second random dataset: a copy of `sd_rand` whose features are replaced by
# the independently drawn 4x3 matrix `rand2`.
sd_rand2 = sd_rand.copy(True)
sd_rand2.features = rand2

# NOTE(review): looks like a group specification selecting rows where
# attribute 'one' equals 1 - confirm against the consumer of `priv`.
priv = [{'one': 1}]