def test_temporarily_ignore():
    sd = StructuredDataset(df=df, label_names=['label'],
                           protected_attribute_names=['one', 'three'])
    modified = sd.copy()
    modified.labels = sd.labels + 1
    assert sd != modified
    with sd.temporarily_ignore('labels'):
        assert sd == modified
    assert 'labels' not in sd.ignore_fields
def test_copy():
    sd = StructuredDataset(df=df, label_names=['label'],
                           protected_attribute_names=['two'])
    sd2 = sd.copy()
    sd3 = sd.copy(True)

    # A shallow copy shares the underlying arrays; a deep copy does not.
    sd.features[0] = 999
    assert np.all(sd2.features[0] == 999)
    assert not np.any(sd3.features[0] == 999)
def test_split():
    sd = StructuredDataset(df=df, label_names=['label'],
                           protected_attribute_names=['two'])
    train, test = sd.split([0.5])
    train2, test2 = sd.split(2)
    assert train == train2
    assert test == test2
    assert np.all(np.concatenate((train.features, test.features))
                  == sd.features)
def test_eq():
    sd = StructuredDataset(df=df, label_names=['label'],
                           protected_attribute_names=['two'])
    sd2 = sd.copy()
    sd3 = sd.copy(True)
    sd4 = StructuredDataset(df=df, label_names=['label'],
                            protected_attribute_names=['one', 'three'])
    assert sd == sd2
    assert sd == sd3
    assert sd2 == sd3
    assert sd != sd4
def test_k_folds():
    sd = StructuredDataset(df=df, label_names=['label'],
                           protected_attribute_names=['two'])
    folds = sd.split(4)
    assert len(folds) == 4
    assert all(f.features.shape[0] == f.labels.shape[0]
               == f.protected_attributes.shape[0]
               == len(f.instance_names)
               == f.instance_weights.shape[0] == 1
               for f in folds)

    folds = sd.split(3)
    assert folds[0].features.shape[0] == 2
def enforce_dummy_coded(self, X):
    """
    Enforces that for each dummy-coded feature exactly one column is set
    to 1 and all others to 0. Called after gradient ascent.

    :param X: Feature matrix (dimension `n_instances x n_features`)
    :returns: X' (modified feature matrix)
    """
    for k, v in StructuredDataset._parse_feature_names(
            self.feature_names)[0].items():
        # Column indices of the dummy-coded columns belonging to feature k.
        ft_indices = [self.feature_names.index(k + '=' + x) for x in v]

        # Snap each row to one-hot: keep only the largest entry per group.
        max_index = np.argmax(X[:, ft_indices], axis=1)
        X[:, ft_indices] = 0
        for i in range(len(max_index)):
            X[i, ft_indices[max_index[i]]] = 1

        for x in X:
            assert x[ft_indices].sum() == 1
    return X
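# Illustrative sketch (not part of the original code): the argmax snapping
# used above, shown on a bare numpy array standing in for one hypothetical
# dummy-coded group of three columns.
def _demo_enforce_dummy_coded():
    import numpy as np
    X = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.1, 0.8]])
    one_hot = np.zeros_like(X)
    one_hot[np.arange(len(X)), X.argmax(axis=1)] = 1
    return one_hot  # [[1., 0., 0.], [0., 0., 1.]]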
def _get_domain(self, ft):
    """
    Infers the domain of a feature.

    :param ft: Feature name
    :returns: Domain
    """
    if callable(self.domains[ft]):
        return [self.domains[ft]()]
    elif self._is_dummy_coded(ft):
        # Dummy-coded features are not supported for simulation (the old
        # fallback returned the set of dummy-coded values present in the
        # dataset).
        raise Exception("Can't use dummy coded for sim")
    elif ft in self.discrete:
        # Discrete: use the set of values present in the dataset.
        return list(set(self.features[:, self._ft_index(ft)]))
    else:
        # Continuous: fall back to the observed min/max.
        df, _ = self.convert_to_dataframe()
        warnings.warn("Used min/max of feature " + ft + " to infer its "
                      "domain; other strategies are not implemented yet")
        return (min(df[ft]), max(df[ft]))
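# Illustrative sketch (not part of the original code): the discrete and
# continuous branches above reduce to a value set and a min/max pair,
# respectively. Column names and values here are made up.
def _demo_get_domain():
    import numpy as np
    import pandas as pd
    features = np.array([[0.], [1.], [1.], [2.]])
    discrete_domain = list(set(features[:, 0]))  # e.g. [0.0, 1.0, 2.0]
    df = pd.DataFrame({'amount': [0.3, 4.2, 9.7]})
    continuous_domain = (min(df['amount']), max(df['amount']))  # (0.3, 9.7)
    return discrete_domain, continuous_domain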
def _is_dummy_coded(self, ft):
    """
    :param ft: Feature name
    :returns: True if ft is dummy-coded
    """
    # _parse_feature_names returns an empty value list for features that
    # are not dummy-coded, so checking the length suffices.
    return len(
        StructuredDataset._parse_feature_names(self.feature_names)[0][ft]) > 0
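# Illustrative sketch (not part of the original code): _parse_feature_names
# groups 'name=value' columns by base name, which is what the length check
# above relies on. The feature names here are hypothetical.
def _demo_parse_feature_names():
    from aif360.datasets import StructuredDataset
    dum, nodum = StructuredDataset._parse_feature_names(
        ['age', 'color=red', 'color=blue'])
    return dict(dum), nodum  # ({'color': ['red', 'blue']}, ['age'])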
def _dedummy_code_obj(self, obj, sep='='):
    """
    :param obj: Instance (feature values) in object form (dict)
    :param sep: Separator used for dummy coding
    :returns: De-dummy-coded object
    """
    # Reimplemented here because the library version is too slow for a
    # single row.
    result_obj = obj.copy()
    for k, v in StructuredDataset._parse_feature_names(
            self.feature_names)[0].items():
        # Figure out which dummy-coded column is set to 1.
        value_l = [x for x in v if obj[k + sep + x] == 1]
        value = value_l.pop() if value_l else None

        # Convert to the non-dummy-coded representation ...
        result_obj[k] = value
        # ... and remove all dummy-coded keys, i.e. 'key=value'.
        for option in v:
            result_obj.pop(k + sep + option)
    return result_obj
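# Illustrative sketch (not part of the original code): the same de-dummy
# coding on a plain dict, with made-up keys and values.
def _demo_dedummy_code_obj():
    obj = {'age': 30, 'color=red': 1, 'color=blue': 0}
    result = dict(obj)
    result['color'] = next(
        (v for v in ['red', 'blue'] if obj['color=' + v] == 1), None)
    for v in ['red', 'blue']:
        result.pop('color=' + v)
    return result  # {'age': 30, 'color': 'red'}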
def scale_dummy_coded(self, X):
    """
    Ensures that the values of each dummy-coded feature sum to 1 (scales
    accordingly). Called during gradient ascent. An in-depth explanation
    can be found in the write-up.

    :param X: Feature matrix (dimension `n_instances x n_features`)
    :returns: X' (modified feature matrix)
    """
    for k, v in StructuredDataset._parse_feature_names(
            self.feature_names)[0].items():
        ft_indices = [self.feature_names.index(k + '=' + x) for x in v]

        # Normalize each row so the dummy-coded group sums to 1.
        X[:, ft_indices] = (X[:, ft_indices]
                            / X[:, ft_indices].sum(axis=1)[:, None])
        assert np.isclose(X[:, ft_indices].sum(axis=1).sum(), len(X))
    return X
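# Illustrative sketch (not part of the original code): the row-wise
# normalization used above, on a bare numpy array standing in for one
# hypothetical dummy-coded group.
def _demo_scale_dummy_coded():
    import numpy as np
    X = np.array([[2.0, 2.0],
                  [1.0, 3.0]])
    X = X / X.sum(axis=1)[:, None]
    return X  # [[0.5, 0.5], [0.25, 0.75]]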
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

from aif360.datasets import StructuredDataset
from aif360.metrics import SampleDistortionMetric

data = np.arange(12).reshape((3, 4)).T
cols = ['one', 'two', 'three', 'label']
labs = np.ones((4, 1))

df = pd.DataFrame(data=np.concatenate((data, labs), axis=1), columns=cols)
sd = StructuredDataset(df=df, label_names=['label'],
                       protected_attribute_names=['one', 'three'])

distorted = data + 1
sd_distorted = sd.copy(True)
sd_distorted.features = distorted

rand = np.random.randint(0, 10, (4, 4))
rand2 = np.random.randint(0, 10, (4, 3))
df_rand = pd.DataFrame(data=rand, columns=cols)
sd_rand = StructuredDataset(df=df_rand, label_names=['label'],
                            protected_attribute_names=['one', 'three'])
sd_rand2 = sd_rand.copy(True)
sd_rand2.features = rand2

priv = [{'one': 1}]