def __init__(self,
             X,
             y=None,
             actionset=None,
             optimizer=_SOLVER_TYPE_CPX,
             decision_threshold=None,
             **clf_args):
    """
    Run an audit on a classifier.

    :param X: feature matrix of the audited population; also used to build
        a default ActionSet when ``actionset`` is not given.
    :param y: accepted for interface compatibility; not used here.
    :param actionset: ActionSet to audit with. When ``None``, a default one
        is built from ``X`` and aligned to the model coefficients.
    :param optimizer: solver identifier, stored on the instance.
    :param decision_threshold: stored on the instance.
    :param clf_args: model description forwarded to ``__parse_clf_args``
        (``clf``, or ``coefficients`` with an optional ``intercept``).
    :raises ValueError: if neither ``actionset`` nor ``X`` is provided.
    """
    ## set clf and coefficients
    self.__parse_clf_args(clf_args)

    ### actionset
    self.actionset = actionset
    self.X = X
    # Compare against None explicitly: truth-testing a multi-element array
    # or a DataFrame raises "truth value is ambiguous".
    if self.actionset is None:
        warnings.warn(
            "No actionset provided, instantiating with defaults: all features mutable, all features percentile."
        )
        if self.X is None:
            # raising a bare string is itself a TypeError in Python 3
            raise ValueError("No actionset or X provided.")
        self.actionset = ActionSet(X=self.X)
        self.actionset.align(self.coefficients)

    self.optimizer = optimizer
    self.decision_threshold = decision_threshold
def my_actionset_fake():
    """Build a small ActionSet from a hard-coded 2x3 array with columns a, b, c."""
    ## column names first, then the matching dataset values.
    feature_names = list('abc')
    values = np.array([[1, 2, 3], [2, 3, 4]])

    ## hand both to the ActionSet constructor.
    fake_set = ActionSet(values, names=feature_names)
    return fake_set
def german_actionset_unaligned(german_X):
    """Create an ActionSet from ``german_X`` with fixed attributes and step directions set.

    No alignment call is made here; the returned ActionSet is only configured.
    """
    german_actions = ActionSet(X=german_X)

    # attributes that may not be changed
    frozen = [
        'Age',
        'Single',
        'JobClassIsSkilled',
        'ForeignWorker',
        'OwnsHouse',
        'RentsHouse',
    ]
    german_actions[frozen].mutable = False

    # attributes that may only move in one direction (-1 / +1)
    directions = (
        ('CriticalAccountOrLoansElsewhere', -1),
        ('CheckingAccountBalance_geq_0', 1),
    )
    for attribute, direction in directions:
        german_actions[attribute].step_direction = direction

    return german_actions
def action_set(request, data):
    """Create an ActionSet from ``data['X']``.

    When ``request.param`` is ``'immutable'`` and the dataset is ``'german'``,
    the German-specific immutable attributes and step directions are applied;
    otherwise the default ActionSet is returned untouched.
    """
    built = ActionSet(X=data['X'])

    wants_constraints = request.param == 'immutable' and data['data_name'] == 'german'
    if not wants_constraints:
        return built

    # attributes that may not be changed
    frozen = [
        'Age', 'Single', 'JobClassIsSkilled',
        'ForeignWorker', 'OwnsHouse', 'RentsHouse',
    ]
    built[frozen].mutable = False

    # attributes that may only move in one direction (-1 / +1)
    for attribute, direction in (('CriticalAccountOrLoansElsewhere', -1),
                                 ('CheckingAccountBalance_geq_0', 1)):
        built[attribute].step_direction = direction

    return built
def test_rb_onehot_encoding(data, solver):
    """Check recourse over one-hot indicators under a 'subset_limit' constraint.

    Starting from each point whose only active indicator is j >= 1 (negative
    score), the fitted action must switch indicator j off, give a positive
    score, and leave the indicators summing to one.
    """
    # only runs for datasets with exactly one categorical variable
    if len(data['categorical_names']) != 1:
        return

    # restrict the data to the indicator columns
    onehot_names = data['onehot_names']
    n_levels = len(onehot_names)
    indicators = data['X'][onehot_names]
    assert np.all(indicators.sum(axis=1) == 1)

    # linear classifier: weight +3 on the first indicator, -1 on all others,
    # intercept -1, so
    #   first indicator active  -> score  3 - 1 =  2 (positive)
    #   any other one active    -> score -1 - 1 = -2 (negative)
    weights = -np.ones(n_levels)
    weights[0] = 3.0
    offset = -1.0

    # action set limited to between 0 and 1 active indicators
    onehot_actions = ActionSet(indicators)
    onehot_actions.add_constraint('subset_limit', names=onehot_names, lb=0, ub=1)
    onehot_actions.set_alignment(coefficients=weights, intercept=offset)
    builder = RecourseBuilder(action_set=onehot_actions,
                              coefficients=weights,
                              intercept=offset,
                              solver=solver)

    for level in range(1, n_levels):
        start = np.zeros(n_levels)
        start[level] = 1.0
        assert builder.score(start) < 0

        # solve for the optimal action from this starting point
        builder.x = start
        result = builder.fit()
        actions = result['actions']

        # the solution must flip the prediction and stay a valid one-hot vector
        moved = start + actions
        assert builder.score(moved) > 0
        assert np.isclose(actions[level], -1.0)
        assert np.isclose(np.sum(moved), 1.0)
def as_result_file(name, extension='pdf', header=file_header): return os.path.join(demo_results_dir, '%s.%s' % (name, extension)) ## load and process data german_df = pd.read_csv(data_file).reset_index(drop=True) # german_df = german_df.assign(isMale=lambda df: (df['Gender']=='Male').astype(int))#.drop(['PurposeOfLoan', 'Gender', 'OtherLoansAtStore'], axis=1) y = german_df['GoodCustomer'] X = (german_df.drop('GoodCustomer', axis=1).drop( ['PurposeOfLoan', 'Gender', 'OtherLoansAtStore'], axis=1)) ## set up actionset gender_weight = german_df.assign( c=1).groupby('Gender')['c'].transform(lambda s: s * 1. / len(s)) X_gender_balanced = X.sample(n=len(X) * 3, replace=True, weights=gender_weight) action_set = ActionSet(X=X_gender_balanced) action_set['Age'].mutable = False action_set['Single'].mutable = False action_set['JobClassIsSkilled'].mutable = False action_set['ForeignWorker'].mutable = False action_set['OwnsHouse'].mutable = False action_set['RentsHouse'].mutable = False action_set['CriticalAccountOrLoansElsewhere'].step_direction = -1 action_set['CheckingAccountBalance_geq_0'].step_direction = 1 # action_set['isMale'].mutable = False ## dummy model clf = LogisticRegression(max_iter=1000, solver='lbfgs') # grid = GridSearchCV( # clf, param_grid={'C': np.logspace(-4, 3)}, # cv=10,
class Auditor(object):
    """Compute recourse costs for the individuals a linear classifier denies.

    The model is given either as a fitted classifier (``clf=...``) or as raw
    ``coefficients`` (with an optional ``intercept``). ``audit`` runs a
    RecourseBuilder for every denied row of ``X`` and collects the cost.
    """

    def __init__(self,
                 X,
                 y=None,
                 actionset=None,
                 optimizer=_SOLVER_TYPE_CPX,
                 decision_threshold=None,
                 **clf_args):
        """
        Run an audit on a classifier.

        :param X: feature matrix of the audited population; also used to
            build a default ActionSet when ``actionset`` is not given.
        :param y: accepted for interface compatibility; not used here.
        :param actionset: ActionSet to audit with. When ``None``, a default
            one is built from ``X`` and aligned to the model coefficients.
        :param optimizer: solver identifier passed to RecourseBuilder.
        :param decision_threshold: probability below which a point counts as
            denied; ``None`` is treated as 0.5 by ``get_negative_points``.
        :param clf_args: ``clf``, or ``coefficients`` with an optional
            ``intercept``.
        :raises ValueError: if no model is described in ``clf_args``, or if
            neither ``actionset`` nor ``X`` is provided.
        """
        ## set clf and coefficients
        self.__parse_clf_args(clf_args)

        ### actionset
        self.actionset = actionset
        self.X = X
        # Compare against None explicitly: truth-testing a multi-element
        # array or a DataFrame raises "truth value is ambiguous".
        if self.actionset is None:
            warnings.warn(
                "No actionset provided, instantiating with defaults: all features mutable, all features percentile."
            )
            if self.X is None:
                # raising a bare string is itself a TypeError in Python 3
                raise ValueError("No actionset or X provided.")
            self.actionset = ActionSet(X=self.X)
            self.actionset.align(self.coefficients)

        self.optimizer = optimizer
        self.decision_threshold = decision_threshold

    def __parse_clf_args(self, args):
        """Set ``self.clf``, ``self.coefficients`` and ``self.intercept`` from ``args``.

        :param args: dict holding ``clf`` (an object exposing ``coef_`` and
            ``intercept_``), or ``coefficients`` and optionally ``intercept``
            (defaults to 0.0).
        :raises ValueError: if neither ``clf`` nor ``coefficients`` is given.
        """
        # Keep a handle on the classifier: get_negative_points needs it.
        # It stays None when only coefficients were supplied.
        self.clf = args.get('clf')
        if self.clf is not None:
            self.coefficients = np.array(self.clf.coef_).flatten()
            # .item() accepts a scalar or a single-element array; calling
            # float() on a 1-D array is deprecated in recent NumPy.
            self.intercept = float(np.asarray(self.clf.intercept_).item())
        elif 'coefficients' in args:
            self.coefficients = args['coefficients']
            self.intercept = args['intercept'] if 'intercept' in args else 0.0
        else:
            # explicit raise instead of assert, which is stripped under -O
            raise ValueError("Provide either 'clf' or 'coefficients'.")

    def get_negative_points(self):
        """Return positional indices of rows of ``X`` scored below the threshold.

        :return: 1-D integer array of row positions whose positive-class
            probability is below ``decision_threshold`` (0.5 when ``None``).
        :raises ValueError: if the auditor was built without ``clf``, since
            probabilities come from ``clf.predict_proba``.
        """
        if self.clf is None:
            raise ValueError(
                "get_negative_points needs a classifier with predict_proba; "
                "construct the Auditor with clf=...")
        # Comparing an array with None raises, so fall back to 0.5.
        threshold = 0.5 if self.decision_threshold is None else self.decision_threshold
        scores = self.clf.predict_proba(self.X)[:, 1]
        return np.where(scores < threshold)[0]

    def audit(self, num_cases=None):
        """Compute the recourse cost of every (or a sample of) denied individual.

        :param num_cases: if set and smaller than the number of denied
            individuals, audit a random sample of that many distinct cases.
        :return: dict mapping row position to the ``total_cost`` (or, when
            that is missing or falsy, ``max_cost``) reported by the builder.
        """
        ### TODO: bake decision threshold into the optimizer.
        denied_individuals = self.get_negative_points()

        ## downsample
        if num_cases and num_cases < len(denied_individuals):
            # replace=False: sampling with replacement would yield duplicate
            # rows and therefore fewer than num_cases audited individuals
            denied_individuals = np.random.choice(denied_individuals,
                                                  num_cases,
                                                  replace=False)

        if not any(self.actionset.aligned):
            self.actionset.align(self.coefficients)

        # the indices are positional, so a DataFrame must be read with .iloc
        # (plain X[i] on a DataFrame is a column lookup)
        is_frame = hasattr(self.X, 'iloc')

        ## run flipsets
        flipsets = {}
        now = time.time()
        for idx, i in enumerate(denied_individuals):
            if idx % 50 == 0:
                print('finished %d points in %f...' % (idx, time.time() - now))
                now = time.time()

            x = self.X.iloc[i].values if is_frame else self.X[i]
            fb = RecourseBuilder(optimizer=self.optimizer,
                                 coefficients=self.coefficients,
                                 intercept=self.intercept,
                                 action_set=self.actionset,
                                 x=x)
            output = fb.fit()
            flipsets[i] = output.get('total_cost') or output.get('max_cost')

        return flipsets
from recourse.paths import *
from recourse.action_set import ActionSet
from recourse.flipset import Flipset

data_name = 'german'
# presumably test_dir comes from the star import of recourse.paths above - confirm
data_file = test_dir / ('%s_processed.csv' % data_name)

## load dataset
data_df = pd.read_csv(data_file)
# the first column of the file is taken as the outcome
outcome_name = data_df.columns[0]
y = data_df[outcome_name]
# features: every column except the outcome and the three dropped below
X = data_df.drop(
    [outcome_name, 'Gender', 'PurposeOfLoan', 'OtherLoansAtStore'], axis=1)

# setup actionset
action_set = ActionSet(X=X)
# attributes that may not be changed
immutable_attributes = [
    'Age', 'Single', 'JobClassIsSkilled', 'ForeignWorker', 'OwnsHouse',
    'RentsHouse'
]
action_set[immutable_attributes].mutable = False
# attributes restricted to a single step direction (-1 / +1)
action_set['CriticalAccountOrLoansElsewhere'].step_direction = -1
action_set['CheckingAccountBalance_geq_0'].step_direction = 1

# fit classifier
clf = LogisticRegression(max_iter=1000, solver='lbfgs')
clf.fit(X, y)

# positions of rows whose predicted label is below zero
# NOTE(review): this assumes the negative class is encoded as a negative
# value (e.g. -1). With 0/1 labels denied_idx would be empty and the next
# line would raise IndexError - confirm against the data file.
denied_idx = np.flatnonzero(clf.predict(X) < 0)
# first denied individual
i = denied_idx[0]
custom_bounds = None
immutable_variables = []

# NOTE(review): the nesting below is reconstructed - every statement up to the
# "Initialize Model Files" section is assumed to belong to the 'credit'
# branch; confirm against the original layout.
if settings['data_name'] == 'credit':

    # fixed names plus any variable whose name contains 'Age' or 'Overdue'
    immutable_names = ['Female', 'Single', 'Married']
    immutable_names += list(
        filter(lambda x: 'Age' in x or 'Overdue' in x, data['variable_names']))
    default_bounds = (0.1, 99.9, 'percentile')
    custom_bounds = {'Female': (0, 100, 'p'), 'Married': (0, 100, 'p')}
    # keep only the immutable names that are actually present in the dataset
    data['immutable_variable_names'] = [
        n for n in immutable_names if n in data['variable_names']
    ]

    action_set = ActionSet(X=data['X'],
                           custom_bounds=custom_bounds,
                           default_bounds=default_bounds)
    action_set[data['immutable_variable_names']].mutable = False
    action_set['EducationLevel'].step_direction = 1

    # variables whose name contains 'Amount' get absolute steps of size 5
    payment_fields = list(
        filter(lambda x: 'Amount' in x, data['variable_names']))
    action_set[payment_fields].step_type = 'absolute'
    action_set[payment_fields].step_size = 5
    # update_grid() is called per field after the step settings change
    for p in payment_fields:
        action_set[p].update_grid()

#### Initialize Model Files ####
# NOTE(review): the file object is never explicitly closed, and pickle.load
# must only be used on trusted files.
model_stats = pickle.load(open(settings['model_file'], 'rb'))
intercept['downsampled'] = clf_age_limited.intercept_[0] # utilization bounded # RealEstate should be positive # numtimes90 days > 0 # monthly income >0 # debt p = .98 # generate flipsets for dataset in ['full', 'downsampled']: y_col = 'y_%s_score' % dataset scores = exp_df_sample[y_col] # p = scores.median() denied_individuals = scores.loc[lambda s: s <= p].index # actionset action_set = ActionSet(X=X_audit_holdout) action_set['age'].mutable = False action_set['NumberOfDependents'].mutable = False action_set['DebtRatio'].step_direction = -1 # action_set['NumberOfTime60-89DaysPastDueNotWorse'].step_direction = -1 action_set.align(coefficients=coefficients[dataset]) idx = 0 flipsets = {} import time now = time.time() for i in denied_individuals: if idx % 100 == 0: print('finished %d points in %f...' % (idx, time.time() - now)) now = time.time()
#### Initialize Actionset #### default_bounds = (0.1, 99.9, 'percentile') custom_bounds = None immutable_variables = [] if settings['data_name'] == 'credit': immutable_names = ['Female', 'Single', 'Married'] immutable_names += list(filter(lambda x: 'Age' in x or 'Overdue' in x, data['variable_names'])) default_bounds = (0.1, 99.9, 'percentile') custom_bounds = {'Female': (0, 100, 'p'), 'Married': (0, 100, 'p')} data['immutable_variable_names'] = [n for n in immutable_names if n in data['variable_names']] action_set = ActionSet(X = data['X'], custom_bounds = custom_bounds, default_bounds = default_bounds) action_set[data['immutable_variable_names']].mutable = False payment_fields = list(filter(lambda x: 'Amount' in x, data['variable_names'])) action_set[payment_fields].step_type = 'absolute' action_set[payment_fields].step_size = 5 for p in payment_fields: action_set[p].update_grid() action_set['EducationLevel'].step_direction = 1 action_set['MaxBillAmountOverLast6Months'].step_direction = -1 action_set['MaxPaymentAmountOverLast6Months'].step_direction = 1 #### Initialize Model Files ####
def as_result_file(name, extension = 'pdf', header = file_header): return os.path.join(demo_results_dir, '%s.%s' % (name, extension)) ## load and process data german_df = pd.read_csv(data_file).reset_index(drop=True) # german_df = german_df.assign(isMale=lambda df: (df['Gender']=='Male').astype(int))#.drop(['PurposeOfLoan', 'Gender', 'OtherLoansAtStore'], axis=1) y = german_df['GoodCustomer'] X = (german_df.drop('GoodCustomer', axis=1) .drop(['PurposeOfLoan', 'Gender', 'OtherLoansAtStore'], axis=1) ) ## set up actionset gender_weight = german_df.assign(c=1).groupby('Gender')['c'].transform(lambda s: s*1./len(s)) X_gender_balanced = X.sample(n = len(X)*3, replace=True, weights=gender_weight) action_set = ActionSet(X = X_gender_balanced) action_set['Age'].mutable = False action_set['Single'].mutable = False action_set['JobClassIsSkilled'].mutable = False action_set['ForeignWorker'].mutable = False action_set['OwnsHouse'].mutable = False action_set['RentsHouse'].mutable = False action_set['CriticalAccountOrLoansElsewhere'].step_direction = -1 action_set['CheckingAccountBalance_geq_0'].step_direction = 1 # action_set['isMale'].mutable = False clf = LogisticRegression(max_iter=1000, solver='lbfgs') grid = GridSearchCV( clf, param_grid={'C': np.logspace(-4, 3)}, cv=10, scoring='roc_auc',