Example #1
settings = {
    # ... (earlier settings truncated in the source snippet)
    'init_sequential_rounding_max_runtime': 10.0,       # max runtime for SeqRd in initialization procedure
    'init_sequential_rounding_max_solutions': 5,        # max solutions to round using SeqRd
    #
    'init_polishing_after': True,                       # polish after rounding
    'init_polishing_max_runtime': 30.0,                 # max runtime for polishing
    'init_polishing_max_solutions': 5,                  # max solutions to polish
    #
    # CPLEX Solver Parameters
    'cplex_randomseed': 0,                              # random seed
    'cplex_mipemphasis': 0,                             # CPLEX MIP emphasis setting
}
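
# run_lattice_cpa below assumes `data` and `constraints` objects built earlier;
# a minimal sketch of that setup, following the riskslim examples (the CSV file
# name is hypothetical; any dict with keys 'X', 'Y', 'variable_names' and
# 'outcome_name' can serve as `data`):
from pprint import pprint
from riskslim.helper_functions import load_data_from_csv, print_model
from riskslim.coefficient_set import CoefficientSet
from riskslim.lattice_cpa import run_lattice_cpa

data = load_data_from_csv(dataset_csv_file='example_data.csv')  # hypothetical file
coef_set = CoefficientSet(variable_names=data['variable_names'], lb=-5, ub=5, sign=0)
constraints = {'L0_min': 0, 'L0_max': 5, 'coef_set': coef_set}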

# train model using lattice_cpa
model_info, mip_info, lcpa_info = run_lattice_cpa(data, constraints, settings)

# model_info contains key results
pprint(model_info)
print_model(model_info['solution'], data)

# mip_info contains information to access the MIP
mip_info['risk_slim_mip']   # CPLEX MIP object
mip_info['risk_slim_idx']   # indices of the relevant constraints

# lcpa_info contains detailed information about LCPA
pprint(lcpa_info)
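
# the CPLEX object can be inspected directly through the CPLEX Python API;
# a small sketch using standard cplex calls:
mip = mip_info['risk_slim_mip']
print(mip.variables.get_names()[:5])       # first few decision variables
print(mip.solution.get_objective_value())  # objective value of the incumbent solution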

Example #2
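## This example relies on names defined elsewhere in its project; a sketch of
## the imports it assumes. risk_slim, risk_slim_constrain, riskslim_prediction
## and the fairness helpers (compute_confusion_matrix_stats, etc.) are
## project-specific utilities, not part of the riskslim package; the module
## they live in is assumed here.
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from riskslim.helper_functions import print_model
# from utils.model_utils import risk_slim, risk_slim_constrain, riskslim_prediction
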
def risk_cv(KY_x, KY_y, FL_x, FL_y, y_label, max_coef, max_coef_number,
            max_offset, max_runtime, c, seed):
    
    FL_score = []
    KY_score = []
    KY_validation = []
    
    ## set up basic values
    cols = KY_x.columns.tolist()
    sample_weights = np.repeat(1, len(KY_y))
    FL_x = FL_x.values
    FL_y[FL_y == -1] = 0 ## change -1 to 0
    
    ## cross validation set up 
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    
    for outer_train, test in outer_cv.split(KY_x, KY_y):
        
        ## split train & test
        outer_train_x, outer_train_y = KY_x.iloc[outer_train], KY_y[outer_train]
        outer_test_x, outer_test_y = KY_x.iloc[test], KY_y[test]
        outer_train_sample_weights = sample_weights[outer_train]
        
        ## inner loop
        for inner_train, validation in inner_cv.split(outer_train_x, outer_train_y):
            
            ## split inner train & validation
            inner_train_x, inner_train_y = outer_train_x.iloc[inner_train].values, outer_train_y[inner_train]
            validation_x, validation_y = outer_train_x.iloc[validation].values, outer_train_y[validation]
            inner_train_sample_weights = outer_train_sample_weights[inner_train]
            validation_sample_weights = outer_train_sample_weights[validation]
            inner_train_y = inner_train_y.reshape(-1,1)
            
            ## new data
            new_train_data = {
                'X': inner_train_x,
                'Y': inner_train_y,
                'variable_names': cols,
                'outcome_name': y_label,
                'sample_weights': inner_train_sample_weights
            }
            
            ## modeling
            model_info, mip_info, lcpa_info = risk_slim(new_train_data, 
                                                        max_coefficient=max_coef, 
                                                        max_L0_value=max_coef_number, 
                                                        c0_value=c, 
                                                        max_runtime=max_runtime, 
                                                        max_offset=max_offset)
        
            ## check validation AUC
            validation_x = validation_x[:, 1:]  ## remove the first column, which is the intercept
            validation_y[validation_y == -1] = 0  ## change -1 to 0
            validation_prob = riskslim_prediction(validation_x, np.array(cols), model_info)
            KY_validation.append(roc_auc_score(validation_y, validation_prob))
        
        ## outer loop
        outer_train_x = outer_train_x.values
        outer_train_y = outer_train_y.reshape(-1,1)
        
        ## new data
        new_train_data = {
            'X': outer_train_x,
            'Y': outer_train_y,
            'variable_names': cols,
            'outcome_name': y_label,
            'sample_weights': outer_train_sample_weights
        }   
        
        ## fit the model
        model_info, mip_info, lcpa_info = risk_slim(new_train_data, 
                                                    max_coefficient=max_coef, 
                                                    max_L0_value=max_coef_number, 
                                                    c0_value=c, 
                                                    max_runtime=max_runtime, 
                                                    max_offset=max_offset)
        print_model(model_info['solution'], new_train_data)          
    
        ## FL_score
        FL_prob = riskslim_prediction(FL_x, np.array(cols), model_info).reshape(-1,1)
        FL_score.append(roc_auc_score(FL_y, FL_prob))
        
        ## KY score
        outer_test_x = outer_test_x.values[:, 1:]
        outer_test_y[outer_test_y == -1] = 0 ## change -1 to 0
        KY_prob = riskslim_prediction(outer_test_x, np.array(cols), model_info).reshape(-1,1)
        KY_score.append(roc_auc_score(outer_test_y, KY_prob))  
        
    return {'FL_score': FL_score,
            'KY_score': KY_score,
            'KY_validation': KY_validation}
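
## hypothetical usage: KY_x is a DataFrame whose first column is '(Intercept)',
## KY_y / FL_y are numpy label arrays in {-1, 1}; all values below are illustrative
results = risk_cv(KY_x, KY_y, FL_x, FL_y, y_label='recidivism',
                  max_coef=5, max_coef_number=5, max_offset=100,
                  max_runtime=60, c=1e-3, seed=42)
print(np.mean(results['KY_score']), np.mean(results['FL_score']))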

    
def risk_nested_cv_constrain(X, Y, indicator, y_label, max_coef,
                             max_coef_number, max_runtime, max_offset, c,
                             seed):

    ## set up data
    sample_weights = np.repeat(1, len(Y))

    ## set up cross validation
    outer_cv = KFold(n_splits=5, random_state=seed, shuffle=True)
    inner_cv = KFold(n_splits=5, random_state=seed, shuffle=True)

    train_auc = []
    validation_auc = []
    test_auc = []
    holdout_with_attrs_test = []
    holdout_probability = []
    holdout_prediction = []
    holdout_y = []

    confusion_matrix_rets = []
    calibrations = []
    race_auc = []
    condition_pn = []
    no_condition_pn = []

    i = 0
    for outer_train, outer_test in outer_cv.split(X, Y):

        outer_train_x, outer_train_y = X.iloc[outer_train], Y[outer_train]
        outer_test_x, outer_test_y = X.iloc[outer_test], Y[outer_test]
        outer_train_sample_weight = sample_weights[outer_train]
        outer_test_sample_weight = sample_weights[outer_test]

        ## holdout test
        holdout_with_attrs = outer_test_x.copy().drop(['(Intercept)'], axis=1)
        holdout_with_attrs = holdout_with_attrs.rename(columns={'sex1': 'sex'})

        ## remove features not used in modeling
        if indicator == 1:
            drop_cols = ['person_id', 'screening_date', 'race',
                         'age_at_current_charge', 'p_charges']
        else:
            drop_cols = ['person_id', 'screening_date', 'race', 'sex1',
                         'age_at_current_charge', 'p_charges']
        outer_train_x = outer_train_x.drop(drop_cols, axis=1)
        outer_test_x = outer_test_x.drop(drop_cols, axis=1)

        cols = outer_train_x.columns.tolist()

        ## inner cross-validation
        for inner_train, validation in inner_cv.split(outer_train_x, outer_train_y):

            ## split inner train & validation
            inner_train_x = outer_train_x.iloc[inner_train].values
            inner_train_y = outer_train_y[inner_train].reshape(-1, 1)
            validation_x = outer_train_x.iloc[validation].values
            validation_y = outer_train_y[validation]
            inner_train_sample_weight = outer_train_sample_weight[inner_train]
            validation_sample_weight = outer_train_sample_weight[validation]

            ## create new data dictionary
            new_train_data = {
                'X': inner_train_x,
                'Y': inner_train_y,
                'variable_names': cols,
                'outcome_name': y_label,
                'sample_weights': inner_train_sample_weight
            }

            ## fit the model
            model_info, mip_info, lcpa_info = risk_slim_constrain(
                new_train_data,
                max_coefficient=max_coef,
                max_L0_value=max_coef_number,
                c0_value=c,
                max_runtime=max_runtime,
                max_offset=max_offset)

            ## check validation AUC
            validation_x = validation_x[:, 1:]  ## remove the first column, which is the intercept
            validation_y[validation_y == -1] = 0  ## change -1 to 0
            validation_prob = riskslim_prediction(validation_x, np.array(cols), model_info)
            validation_auc.append(roc_auc_score(validation_y, validation_prob))

        ## outer loop
        outer_train_x = outer_train_x.values
        outer_test_x = outer_test_x.values
        outer_train_y = outer_train_y.reshape(-1, 1)
        new_train_data = {
            'X': outer_train_x,
            'Y': outer_train_y,
            'variable_names': cols,
            'outcome_name': y_label,
            'sample_weights': outer_train_sample_weight
        }

        ## fit the model

        model_info, mip_info, lcpa_info = risk_slim_constrain(
            new_train_data,
            max_coefficient=max_coef,
            max_L0_value=max_coef_number,
            c0_value=c,
            max_runtime=max_runtime,
            max_offset=max_offset)
        print_model(model_info['solution'], new_train_data)

        ## change data format
        outer_train_x = outer_train_x[:, 1:]  ## remove the intercept column
        outer_test_x = outer_test_x[:, 1:]
        outer_train_y[outer_train_y == -1] = 0  ## change -1 to 0
        outer_test_y[outer_test_y == -1] = 0

        ## probability & accuracy
        outer_train_prob = riskslim_prediction(outer_train_x, np.array(cols),
                                               model_info).reshape(-1, 1)
        outer_test_prob = riskslim_prediction(outer_test_x, np.array(cols),
                                              model_info)
        outer_test_pred = (outer_test_prob > 0.5)

        ########################
        ## AUC
        train_auc.append(roc_auc_score(outer_train_y, outer_train_prob))
        test_auc.append(roc_auc_score(outer_test_y, outer_test_prob))

        ########################
        ## confusion matrix
        confusion_matrix_fairness = compute_confusion_matrix_stats(
            df=holdout_with_attrs,
            preds=outer_test_pred,
            labels=outer_test_y,
            protected_variables=["sex", "race"])
        cf_final = confusion_matrix_fairness.assign(
            fold_num=[i] * confusion_matrix_fairness['Attribute'].count())
        confusion_matrix_rets.append(cf_final)

        ########################
        ## calibration matrix
        calibration = compute_calibration_fairness(
            df=holdout_with_attrs,
            probs=outer_test_prob,
            labels=outer_test_y,
            protected_variables=["sex", "race"])
        calibration_final = calibration.assign(
            fold_num=[i] * calibration['Attribute'].count())
        calibrations.append(calibration_final)

        ########################
        ## race auc
        try:
            race_auc_matrix = fairness_in_auc(df=holdout_with_attrs,
                                              probs=outer_test_prob,
                                              labels=outer_test_y)
            race_auc_matrix_final = race_auc_matrix.assign(
                fold_num=[i] * race_auc_matrix['Attribute'].count())
            race_auc.append(race_auc_matrix_final)
        except:  ## skip folds where the group AUC cannot be computed
            pass

        ########################
        ## ebm_pn
        no_condition_pn_matrix = balance_positive_negative(
            df=holdout_with_attrs, probs=outer_test_prob, labels=outer_test_y)
        no_condition_pn_matrix_final = no_condition_pn_matrix.assign(
            fold_num=[i] * no_condition_pn_matrix['Attribute'].count())
        no_condition_pn.append(no_condition_pn_matrix_final)

        ########################
        ## ebm_condition_pn
        condition_pn_matrix = conditional_balance_positive_negative(
            df=holdout_with_attrs, probs=outer_test_prob, labels=outer_test_y)
        condition_pn_matrix_final = condition_pn_matrix.assign(
            fold_num=[i] * condition_pn_matrix['Attribute'].count())
        condition_pn.append(condition_pn_matrix_final)

        ########################
        ## store results
        holdout_with_attrs_test.append(holdout_with_attrs)
        holdout_probability.append(outer_test_prob)
        holdout_prediction.append(outer_test_pred)
        holdout_y.append(outer_test_y)

        i += 1

    ## confusion matrix
    confusion_df = pd.concat(confusion_matrix_rets, ignore_index=True)
    confusion_df.sort_values(["Attribute", "Attribute Value"], inplace=True)
    confusion_df = confusion_df.reset_index(drop=True)

    ## calibration matrix
    calibration_df = pd.concat(calibrations, ignore_index=True)
    calibration_df.sort_values(
        ["Attribute", "Lower Limit Score", "Upper Limit Score"], inplace=True)
    calibration_df = calibration_df.reset_index(drop=True)

    ## race_auc
    race_auc_df = []
    try:
        race_auc_df = pd.concat(race_auc, ignore_index=True)
        race_auc_df.sort_values(["fold_num", "Attribute"], inplace=True)
        race_auc_df = race_auc_df.reset_index(drop=True)
    except:  ## leave race_auc_df empty if no fold produced results
        pass

    ## no_condition_pn
    no_condition_pn_df = pd.concat(no_condition_pn, ignore_index=True)
    no_condition_pn_df.sort_values(["fold_num", "Attribute"], inplace=True)
    no_condition_pn_df = no_condition_pn_df.reset_index(drop=True)

    ## condition_pn
    condition_pn_df = pd.concat(condition_pn, ignore_index=True)
    condition_pn_df.sort_values(["fold_num", "Attribute"], inplace=True)
    condition_pn_df = condition_pn_df.reset_index(drop=True)

    return {
        'train_auc': train_auc,
        'validation_auc': validation_auc,
        'test_auc': test_auc,
        'holdout_with_attrs_test': holdout_with_attrs_test,
        'holdout_proba': holdout_probability,
        'holdout_pred': holdout_prediction,
        'holdout_y': holdout_y,
        'confusion_matrix_stats': confusion_df,
        'calibration_stats': calibration_df,
        'race_auc': race_auc_df,
        'condition_pn': condition_pn_df,
        'no_condition_pn': no_condition_pn_df
    }


def risk_cv(X, Y, indicator, y_label, max_coef, max_coef_number, max_runtime,
            max_offset, c, seed):

    ## set up data
    Y = Y.reshape(-1, 1)
    sample_weights = np.repeat(1, len(Y))

    ## set up cross validation
    cv = KFold(n_splits=5, random_state=seed, shuffle=True)
    train_auc = []
    validation_auc = []

    i = 0
    for train, validation in cv.split(X, Y):

        ## subset train data & store test data
        train_x, train_y = X.iloc[train], Y[train]
        validation_x, validation_y = X.iloc[validation], Y[validation]
        sample_weights_train = sample_weights[train]
        sample_weights_validation = sample_weights[validation]

        ## holdout test with "race" for fairness
        holdout_with_attrs = validation_x.copy().drop(['(Intercept)'], axis=1)
        holdout_with_attrs = holdout_with_attrs.rename(columns={'sex1': 'sex'})

        ## remove features not used in modeling
        if indicator == 1:
            drop_cols = ['person_id', 'screening_date', 'race',
                         'age_at_current_charge', 'p_charges']
        else:
            drop_cols = ['person_id', 'screening_date', 'race', 'sex1',
                         'age_at_current_charge', 'p_charges']
        train_x = train_x.drop(drop_cols, axis=1)
        validation_x = validation_x.drop(drop_cols, axis=1).values

        cols = train_x.columns.tolist()
        train_x = train_x.values

        ## create new data dictionary
        new_train_data = {
            'X': train_x,
            'Y': train_y,
            'variable_names': cols,
            'outcome_name': y_label,
            'sample_weights': sample_weights_train
        }

        ## fit the model
        model_info, mip_info, lcpa_info = risk_slim(
            new_train_data,
            max_coefficient=max_coef,
            max_L0_value=max_coef_number,
            max_offset=max_offset,
            c0_value=c,
            max_runtime=max_runtime)
        print_model(model_info['solution'], new_train_data)

        ## change data format
        train_x = train_x[:, 1:]  ## remove the intercept column
        validation_x = validation_x[:, 1:]
        train_y[train_y == -1] = 0  ## change -1 to 0
        validation_y[validation_y == -1] = 0

        ## probability & accuracy
        train_prob = riskslim_prediction(train_x, np.array(cols),
                                         model_info).reshape(-1, 1)
        validation_prob = riskslim_prediction(validation_x, np.array(cols),
                                              model_info).reshape(-1, 1)
        validation_pred = (validation_prob > 0.5)

        ## AUC
        train_auc.append(roc_auc_score(train_y, train_prob))
        validation_auc.append(roc_auc_score(validation_y, validation_prob))
        i += 1

    return {'train_auc': train_auc, 'validation_auc': validation_auc}
Example #5
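## The fit method below belongs to a scikit-learn-style wrapper around
## riskslim's lattice CPA. A minimal sketch of the class it assumes follows;
## the class name and constructor defaults are hypothetical (attribute names
## are inferred from the self.* references in fit), and the riskslim module
## paths follow its published examples but may vary by version.
import os
import numpy as np
import pandas as pd
import cplex
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y
from riskslim.helper_functions import check_data, print_model
from riskslim.setup_functions import get_conservative_offset
from riskslim.coefficient_set import CoefficientSet
from riskslim.lattice_cpa import setup_lattice_cpa, finish_lattice_cpa


class RiskSlimClassifier(BaseEstimator, ClassifierMixin):   # hypothetical name

    def __init__(self, data_headers, max_coefficient=5, max_L0_value=5,
                 max_offset=50, settings=None, sample_weights=None,
                 fold_csv_file=None, fold_num=0, op_constraints=None,
                 show_omitted_variables=False):
        self.data_headers = data_headers          # outcome name first, then feature names
        self.max_coefficient = max_coefficient    # bound on each coefficient
        self.max_L0_value = max_L0_value          # bound on model size (L0 norm)
        self.max_offset = max_offset              # bound on the intercept
        self.settings = settings if settings is not None else {}
        self.sample_weights = sample_weights
        self.fold_csv_file = fold_csv_file
        self.fold_num = fold_num                  # 0 = train on all rows
        self.op_constraints = op_constraints      # {name: [mutually exclusive variables]}
        self.show_omitted_variables = show_omitted_variables
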
    def fit(self, X, y):

        X, y = check_X_y(X, y, accept_sparse=True)
        self.is_fitted_ = True

        # transforming data
        raw_data = np.insert(X, 0, y, axis=1)
        N = raw_data.shape[0]

        # setup Y vector and Y_name
        Y_col_idx = [0]
        Y = raw_data[:, Y_col_idx]
        Y_name = self.data_headers[Y_col_idx[0]]
        Y[Y == 0] = -1

        # setup X and X_names
        X_col_idx = [j for j in range(raw_data.shape[1]) if j not in Y_col_idx]
        X = raw_data[:, X_col_idx]
        variable_names = [self.data_headers[j] for j in X_col_idx]

        # insert a column of ones to X for the intercept
        X = np.insert(arr=X, obj=0, values=np.ones(N), axis=1)
        variable_names.insert(0, '(Intercept)')

        if self.sample_weights is None or len(self.sample_weights) != N:
            self.sample_weights = np.ones(N)

        self.data = {
            'X': X,
            'Y': Y,
            'variable_names': variable_names,
            'outcome_name': Y_name,
            'sample_weights': self.sample_weights,
        }

        # load fold indices from a CSV file, if provided
        if self.fold_csv_file is not None:
            if not os.path.isfile(self.fold_csv_file):
                raise IOError('could not find fold_csv_file: %s' %
                              self.fold_csv_file)
            else:
                fold_idx = pd.read_csv(self.fold_csv_file,
                                       sep=',',
                                       header=None)
                fold_idx = fold_idx.values.flatten()
                K = max(fold_idx)
                all_fold_nums = np.sort(np.unique(fold_idx))
                assert len(fold_idx) == N, \
                    "dimension mismatch: read %r fold indices (expected N = %r)" % (len(fold_idx), N)
                assert np.all(all_fold_nums == np.arange(1, K + 1)), \
                    "folds should contain indices between 1 to %r" % K
                # self.fold_num selects the held-out fold (0 = train on all rows)
                assert self.fold_num in np.arange(0, K + 1), \
                    "fold_num should either be 0 or an integer between 1 to %r" % K
                if self.fold_num >= 1:
                    test_idx = self.fold_num == fold_idx
                    train_idx = self.fold_num != fold_idx
                    self.data['X'] = self.data['X'][train_idx]
                    self.data['Y'] = self.data['Y'][train_idx]
                    self.data['sample_weights'] = self.data['sample_weights'][train_idx]

        assert check_data(self.data)

        # create coefficient set and set the value of the offset parameter
        coef_set = CoefficientSet(variable_names=self.data['variable_names'],
                                  lb=-self.max_coefficient,
                                  ub=self.max_coefficient,
                                  sign=0)
        conservative_offset = get_conservative_offset(self.data, coef_set,
                                                      self.max_L0_value)
        self.max_offset = min(self.max_offset, conservative_offset)
        coef_set['(Intercept)'].ub = self.max_offset
        coef_set['(Intercept)'].lb = -self.max_offset

        # edit constraints here
        constraints = {
            'L0_min': 0,
            'L0_max': self.max_L0_value,
            'coef_set': coef_set,
        }

        # initialize MIP for lattice CPA
        mip_objects = setup_lattice_cpa(self.data, constraints, self.settings)

        # add operational constraints
        mip = mip_objects['mip']
        indices = mip_objects['indices']
        def get_alpha_name(var_name):
            return 'alpha_' + str(self.data['variable_names'].index(var_name))

        def get_alpha_ind(var_names):
            return [get_alpha_name(v) for v in var_names]

        # apply mutually exclusive feature constraints
        if self.op_constraints is not None:

            names = []
            expressions = []

            for key, var_names in self.op_constraints.items():
                names.append("mutually_exclusive_%s" % key)
                expressions.append(
                    cplex.SparsePair(ind=get_alpha_ind(var_names),
                                     val=[1.0] * len(var_names)))

            mip.linear_constraints.add(names=names,
                                       lin_expr=expressions,
                                       senses=["L"] * len(names),
                                       rhs=[1.0] * len(names))

        mip_objects['mip'] = mip

        # fit using lattice CPA (LCPA)
        model_info, mip_info, lcpa_info = finish_lattice_cpa(
            self.data, constraints, mip_objects, self.settings)
        rho = model_info['solution']
        self.model_info = model_info

        if np.sum(rho[1:]) != 0:
            print_model(model_info['solution'], self.data)
        print("solver_time = %d" % model_info['solver_time'])
        print("optimality_gap = %.3f" % model_info['optimality_gap'])
        print(rho)

        variable_names = self.data['variable_names']
        rho_values = np.copy(rho)
        rho_names = list(variable_names)

        # extract the intercept from the solution, defaulting to 0 if absent
        if '(Intercept)' in rho_names:
            intercept_ind = variable_names.index('(Intercept)')
            self.intercept_val = int(rho[intercept_ind])
            rho_values = np.delete(rho_values, intercept_ind)
            rho_names.remove('(Intercept)')
        else:
            self.intercept_val = 0

        self.filter_mask = np.array(rho_values) != 0

        # removes zero values
        if not self.show_omitted_variables:
            selected_ind = np.flatnonzero(rho_values)
            self.rho_values = rho_values[selected_ind]
            self.rho_names = [rho_names[i] for i in selected_ind]

        return self
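
## hypothetical usage of the class sketched above, on toy data
X = np.array([[1, 0], [0, 1], [1, 1], [0, 0]])
y = np.array([1, 0, 1, 0])
clf = RiskSlimClassifier(data_headers=['outcome', 'feature_1', 'feature_2'])
clf.fit(X, y)
print(clf.intercept_val, clf.rho_names, clf.rho_values)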