Example #1
0
def model(x_train, y_train, x_control_train, x_test, y_test, x_control_test, SV):
    """Train a fairness-constrained logistic model and score both data splits.

    An intercept column is added to both feature matrices, a linear model is
    trained with ``ut.train_model`` (fairness constraint switched on, zero
    covariance threshold for the sensitive attribute ``SV``), and the fairness
    statistics of the test split are reported through the ``ut`` helpers.

    Returns:
        Tuple ``(prob_train, prob_test)``: lists holding ``sigmoid`` applied
        to each signed distance ``x . w`` of the train and test rows.
    """
    train_features = ut.add_intercept(x_train)
    test_features = ut.add_intercept(x_test)

    # Flags forwarded positionally to ut.train_model.
    use_fairness_constraints = 1
    use_accuracy_constraint = 0
    use_sep_constraint = 0
    protected_attrs = [SV]
    cov_thresholds = {SV: 0}
    loss = lf._logistic_loss
    gamma_value = 0

    weights = ut.train_model(
        train_features, y_train, x_control_train, loss,
        use_fairness_constraints, use_accuracy_constraint, use_sep_constraint,
        protected_attrs, cov_thresholds, gamma_value)
    _, test_accuracy, _, _ = ut.check_accuracy(
        weights, train_features, y_train, test_features, y_test, None, None)

    # Signed distances from the decision boundary, turned into probabilities.
    test_margins = np.dot(test_features, weights)
    train_margins = np.dot(train_features, weights)
    test_probs = list(map(sigmoid, test_margins))
    train_probs = list(map(sigmoid, train_margins))

    # The fairness report is computed on the test split only; the helpers are
    # still called for whatever output they produce, their results are unused.
    predicted_labels = np.sign(test_margins)
    test_correlations = ut.get_correlations(
        None, None, predicted_labels, x_control_test, protected_attrs)
    test_covariances = ut.print_covariance_sensitive_attrs(
        None, test_features, test_margins, x_control_test, protected_attrs)
    ut.print_classifier_fairness_stats(
        [test_accuracy], [test_correlations], [test_covariances],
        protected_attrs[0])

    return train_probs, test_probs
    def train(self, X, y, x_sensitive, fairness_constraint):
        """Fit the logistic classifier and report its fairness and accuracy.

        Args:
            X: Feature matrix; an intercept column is added with
                ``ut.add_intercept`` before training.
            y: Class labels.
            x_sensitive: Values of the single sensitive attribute, stored
                under the key ``"s1"``.
            fairness_constraint: Covariance threshold used for ``"s1"``. The
                sentinel ``-1.0`` trains with the fairness constraint
                switched off.

        Returns:
            Tuple ``(w, p_rule, accuracy)``: the learned weights, the p-rule
            in percent and the accuracy in percent. The accuracy is measured
            on the training data, because the same ``X``/``y`` are handed to
            ``ut.check_accuracy`` as both the train and the test set.
        """
        self.x_sensitive = {"s1": x_sensitive}
        self.X = ut.add_intercept(X)
        self.y = y

        if fairness_constraint == -1.0:
            # Sentinel value: plain, unconstrained training.
            self.w = ut.train_model(self.X, self.y, self.x_sensitive,
                                    lf._logistic_loss, 0, 0, 0,
                                    ["s1"], {"s1": 0}, None)
        else:
            self.w = ut.train_model(self.X, self.y, self.x_sensitive,
                                    lf._logistic_loss, 1, 0, 0,
                                    ["s1"], {"s1": fairness_constraint}, None)

        _, test_score, _, _ = ut.check_accuracy(
            self.w, self.X, self.y, self.X, self.y, None, None)

        distances_boundary_test = (np.dot(self.X, self.w)).tolist()
        all_class_labels_assigned_test = np.sign(distances_boundary_test)
        correlation_dict_test = ut.get_correlations(
            None, None, all_class_labels_assigned_test, self.x_sensitive,
            ["s1"])

        correlation_dict = ut.get_avg_correlation_dict([correlation_dict_test])
        # Presumably the share of positive predictions in the non-protected
        # (1) and protected (0) groups, going by the names -- confirm against
        # ut.get_correlations.
        non_prot_pos = correlation_dict["s1"][1][1]
        prot_pos = correlation_dict["s1"][0][1]
        p_rule = (prot_pos / non_prot_pos) * 100.0

        return self.w, p_rule, 100.0 * test_score
Example #3
0
def main(train_file, model_path, setting, value):
    """Train a classifier from ``train_file`` and save its weights.

    Args:
        train_file: Path handed to ``load_json``, which yields the features,
            the labels and the mapping of sensitive attributes.
        model_path: Destination passed to ``np.save``.
        setting: ``'gamma'`` (accuracy mode, ``value`` is gamma), ``'c'``
            (fairness mode, ``value`` is the covariance threshold applied to
            every sensitive attribute) or ``'baseline'`` (no constraints).
        value: Number (or numeric string) interpreted according to
            ``setting``; ignored for ``'baseline'``.

    Raises:
        Exception: If ``setting`` is not one of the supported values.
    """
    features, labels, controls = load_json(train_file)

    # Add the intercept column before applying the linear classifier.
    features = ut.add_intercept(features)

    thresh = {}
    if setting == 'baseline':
        mode = {}
    elif setting == 'c':
        mode = {"fairness": 1}
        # Same covariance threshold for each sensitive attribute.
        thresh = {attr: float(value) for attr, _ in controls.items()}
    elif setting == 'gamma':
        mode = {"accuracy": 1, "gamma": float(value)}
    else:
        raise Exception("Don't know how to handle setting %s" % setting)

    attr_names = list(controls.keys())
    weights = train_classifier(features, labels, controls, attr_names,
                               mode, thresh)

    np.save(model_path, weights)
Example #4
0
 def fit_agl(self,
             x: Union[np.ndarray, torch.Tensor],
             y: Union[np.ndarray, torch.Tensor],
             lam: Union[float, int],
             max_iters: int = 1000,
             smooth: Union[float, int] = 0,
             weights: List[Union[int, float]] = None):
     """Fit the adaptive group lasso for a single penalty ``lam``.

     When ``weights`` is omitted they are derived from the current
     ``self.beta`` via ``self.compute_weights``; if no ``self.beta`` exists
     either, a hint is printed and ``None`` is returned. On success the
     solution is stored in ``self.beta_agl`` and ``self.beta`` and ``self``
     is returned.
     """
     if weights is None:
         if self.beta is None:
             print(
                 "Initial beta estimation is not available, please run fit or fit_gic first."
             )
             return None
         weights = self.compute_weights(self.beta)

     design = numpy_to_torch(remove_intercept(x))
     target = numpy_to_torch(y)
     design = self.normalize(design)

     # One group of self.df basis columns per weight, plus the intercept.
     expanded = self.basis_expansion_(design, self.df, self.degree)
     expanded, group_size = add_intercept(expanded,
                                          [self.df] * len(weights))

     solution = self.solve(expanded,
                           target,
                           lam,
                           group_size,
                           max_iters,
                           weights,
                           smooth=smooth)
     self.beta_agl = self.beta = solution
     return self
def test_synthetic_data():
    """Compare an unconstrained and a fairness-constrained classifier.

    Generates synthetic data, cross-validates a logistic classifier first
    without constraints and then with the fairness constraint at covariance
    threshold 0, prints the fairness statistics of both, and finally plots
    the fairness/accuracy trade-off.
    """
    # Generate the synthetic data.
    X, y, x_control = generate_synthetic_data(plot_data=False)
    ut.compute_p_rule(x_control["s1"], y)  # compute the p-rule in the original data

    # Classify the data without any constraints.
    apply_fairness_constraints = 0
    apply_accuracy_constraint = 0
    sep_constraint = 0

    loss_function = lf._logistic_loss
    X = ut.add_intercept(X)  # add intercept to X before applying the linear classifier
    (test_acc_arr, _train_acc_arr, correlation_dict_test_arr,
     _correlation_dict_train_arr, cov_dict_test_arr,
     _cov_dict_train_arr) = ut.compute_cross_validation_error(
        X, y, x_control, NUM_FOLDS, loss_function,
        apply_fairness_constraints, apply_accuracy_constraint,
        sep_constraint, ['s1'], [{} for _ in range(NUM_FOLDS)])
    # Leading newline reproduces the blank line the bare ``print`` emitted.
    print("\n== Unconstrained (original) classifier ==")
    ut.print_classifier_fairness_stats(test_acc_arr, correlation_dict_test_arr, cov_dict_test_arr, "s1")

    # Now classify such that we achieve perfect fairness.
    apply_fairness_constraints = 1
    cov_factor = 0
    (test_acc_arr, _train_acc_arr, correlation_dict_test_arr,
     _correlation_dict_train_arr, cov_dict_test_arr,
     _cov_dict_train_arr) = ut.compute_cross_validation_error(
        X, y, x_control, NUM_FOLDS, loss_function,
        apply_fairness_constraints, apply_accuracy_constraint,
        sep_constraint, ['s1'],
        [{'s1': cov_factor} for _ in range(NUM_FOLDS)])
    print("\n== Constrained (fair) classifier ==")
    ut.print_classifier_fairness_stats(test_acc_arr, correlation_dict_test_arr, cov_dict_test_arr, "s1")

    # Now plot a tradeoff between the fairness and accuracy.
    ut.plot_cov_thresh_vs_acc_pos_ratio(
        X, y, x_control, NUM_FOLDS, loss_function,
        apply_fairness_constraints, apply_accuracy_constraint,
        sep_constraint, ['s1'])
Example #6
0
def test_synthetic_data():
    """Compare an unconstrained and a fairness-constrained classifier.

    Prints ``sys.path``, generates synthetic data, cross-validates a logistic
    classifier first without constraints and then with the fairness
    constraint at covariance threshold 0, prints the fairness statistics of
    both, and finally plots the fairness/accuracy trade-off.
    """
    # Generate the synthetic data.
    print(sys.path)
    X, y, x_control = generate_synthetic_data(plot_data=False)
    ut.compute_p_rule(x_control["s1"], y)  # compute the p-rule in the original data

    # Classify the data without any constraints.
    apply_fairness_constraints = 0
    apply_accuracy_constraint = 0
    sep_constraint = 0

    loss_function = lf._logistic_loss
    X = ut.add_intercept(X)  # add intercept to X before applying the linear classifier
    (test_acc_arr, _train_acc_arr, correlation_dict_test_arr,
     _correlation_dict_train_arr, cov_dict_test_arr,
     _cov_dict_train_arr) = ut.compute_cross_validation_error(
        X, y, x_control, NUM_FOLDS, loss_function,
        apply_fairness_constraints, apply_accuracy_constraint,
        sep_constraint, ['s1'], [{} for _ in range(NUM_FOLDS)])
    # Leading newline reproduces the blank line the bare ``print`` emitted.
    print("\n== Unconstrained (original) classifier ==")
    ut.print_classifier_fairness_stats(test_acc_arr, correlation_dict_test_arr, cov_dict_test_arr, "s1")

    # Now classify such that we achieve perfect fairness.
    apply_fairness_constraints = 1
    cov_factor = 0
    (test_acc_arr, _train_acc_arr, correlation_dict_test_arr,
     _correlation_dict_train_arr, cov_dict_test_arr,
     _cov_dict_train_arr) = ut.compute_cross_validation_error(
        X, y, x_control, NUM_FOLDS, loss_function,
        apply_fairness_constraints, apply_accuracy_constraint,
        sep_constraint, ['s1'],
        [{'s1': cov_factor} for _ in range(NUM_FOLDS)])
    print("\n== Constrained (fair) classifier ==")
    ut.print_classifier_fairness_stats(test_acc_arr, correlation_dict_test_arr, cov_dict_test_arr, "s1")

    # Now plot a tradeoff between the fairness and accuracy.
    ut.plot_cov_thresh_vs_acc_pos_ratio(
        X, y, x_control, NUM_FOLDS, loss_function,
        apply_fairness_constraints, apply_accuracy_constraint,
        sep_constraint, ['s1'])
Example #7
0
    def train(self, X, y, x_sensitive, fairness_constraint):
        """Train the logistic model; return weights, p-rule (%) and accuracy (%).

        ``fairness_constraint`` is the covariance threshold for the single
        sensitive attribute ``"s1"``; the sentinel ``-1.0`` switches the
        fairness constraint off. The accuracy is evaluated on the training
        data itself, since the same ``X``/``y`` are passed to
        ``ut.check_accuracy`` as the train and the test set.
        """
        self.x_sensitive = {"s1": x_sensitive}
        self.X = ut.add_intercept(X)
        self.y = y

        # The two training variants differ only in the fairness flag and the
        # covariance threshold, so pick those first and call train_model once.
        if fairness_constraint == -1.0:
            fairness_flag, cov_thresh = 0, {"s1": 0}
        else:
            fairness_flag, cov_thresh = 1, {"s1": fairness_constraint}
        self.w = ut.train_model(self.X, self.y, self.x_sensitive,
                                lf._logistic_loss, fairness_flag, 0, 0,
                                ["s1"], cov_thresh, None)

        _, accuracy, _, _ = ut.check_accuracy(
            self.w, self.X, self.y, self.X, self.y, None, None)

        margins = (np.dot(self.X, self.w)).tolist()
        predicted_labels = np.sign(margins)
        correlations = ut.get_correlations(
            None, None, predicted_labels, self.x_sensitive, ["s1"])

        averaged = ut.get_avg_correlation_dict([correlations])["s1"]
        non_prot_pos = averaged[1][1]
        prot_pos = averaged[0][1]

        return self.w, (prot_pos / non_prot_pos) * 100.0, 100.0 * accuracy
def main(train_file, test_file, output_file, setting):
    """Train a classifier, predict on the test set and dump predictions as JSON.

    Args:
        train_file: Path handed to ``load_json`` for the training data.
        test_file: Path handed to ``load_json`` for the test data.
        output_file: Path of the JSON file that receives the predictions.
        setting: ``'fnr'`` (``cons_type`` 2), ``'fprfnr'`` (``cons_type`` 4)
            or ``'baseline'`` (``cons_params`` is ``None``).

    Raises:
        Exception: If ``setting`` is not one of the supported values.
    """
    x_train, y_train, x_control_train = load_json(train_file)
    x_test, y_test, x_control_test = load_json(test_file)

    # Add the intercept column before applying the linear classifier.
    x_train = ut.add_intercept(x_train)
    x_test = ut.add_intercept(x_test)

    # Only the first sensitive attribute is constrained.
    sensitive_attr = str(list(x_control_train.keys())[0])

    tau = 5.0
    mu = 1.2
    sensitive_attrs_to_cov_thresh = {
        sensitive_attr: {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}, 2: {0: 0, 1: 0}}
    }
    cons_params = {"tau": tau,
                   "mu": mu,
                   "sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}

    if setting == 'fnr':
        cons_params["cons_type"] = 2
    elif setting == 'fprfnr':
        cons_params["cons_type"] = 4
    elif setting == 'baseline':
        cons_params = None
    else:
        raise Exception("Don't know how to handle setting %s" % setting)

    w = train_classifier(x_train, y_train, x_control_train,
                         cons_params)

    predictions = predict(w, x_test).tolist()
    # The context manager closes the file even if json.dump raises.
    with open(output_file, "w") as handle:
        json.dump(predictions, handle)
Example #9
0
def main(train_file, model_path, mode, tau="5.0", mu="1.2", eps="0.0001"):
    """Train a classifier from ``train_file`` and save its weights.

    Args:
        train_file: Path handed to ``load_json``, which yields the features,
            the labels and the mapping of sensitive attributes.
        model_path: Destination passed to ``np.save``.
        mode: Constraint selection. ``'fpr'``, ``'fnr'`` and ``'fprfnr'`` set
            ``cons_type`` to 1 (FPR), 2 (FNR) and 4 (both FPR and FNR);
            ``'baseline'`` trains with ``cons_params=None``.
        tau: DCCP parameter, controls how much weight to put on the
            constraints; if the constraints are not satisfied, increase tau.
            Converted with ``float``; this function defaults to ``"5.0"``.
        mu: DCCP parameter, the multiplicative factor by which tau increases
            in each DCCP iteration. Converted with ``float``.
        eps: Stopping criterion for the convex solver (see the CVXPY
            documentation). Converted with ``float``.

    Raises:
        Exception: If ``mode`` is not one of the supported values.
    """
    x_train, y_train, x_control_train = load_json(train_file)

    # Add the intercept column before applying the linear classifier.
    x_train = ut.add_intercept(x_train)

    # Only the first sensitive attribute is constrained; all thresholds are 0.
    first_attr = str(list(x_control_train.keys())[0])
    zero_thresholds = {
        first_attr: {group: {0: 0, 1: 0} for group in (0, 1, 2)}
    }
    params = {
        "tau": float(tau),
        "mu": float(mu),
        "sensitive_attrs_to_cov_thresh": zero_thresholds,
    }

    if mode == 'baseline':
        params = None
    elif mode == 'fpr':
        params["cons_type"] = 1
    elif mode == 'fnr':
        params["cons_type"] = 2
    elif mode == 'fprfnr':
        params["cons_type"] = 4
    else:
        raise Exception("Don't know how to handle setting %s" % mode)

    weights = train_classifier(x_train, y_train, x_control_train, params,
                               float(eps))

    np.save(model_path, weights)
def main(test_file, model_path, output_file):
    """Load saved weights, predict on the test set and dump predictions as JSON.

    Args:
        test_file: Path handed to ``load_json`` for the test data.
        model_path: Path of the weight array, read with ``np.load``.
        output_file: Path of the JSON file that receives the predictions.
    """
    x_test, y_test, x_control_test = load_json(test_file)

    # Add the intercept column before applying the linear classifier.
    x_test = ut.add_intercept(x_test)

    w = np.load(model_path)

    predictions = predict(w, x_test).tolist()
    # The context manager closes the file even if json.dump raises.
    with open(output_file, "w") as handle:
        json.dump(predictions, handle)
    def predict(self, x):
        """Make predictions on data x

        Args:
            x: Inputs of shape (m,n)
        Returns:
            Outputs of shape (m,): 1 where the predicted probability is at
            least 0.5, otherwise 0.
        """
        x = utils.add_intercept(x)
        # Row-wise dot product of the inputs with the parameter vector.
        logits = np.einsum("ij,j->i", x, self.theta)
        probs = 1 / (1 + np.exp(-logits))
        # ``np.int`` was removed in NumPy 1.24; it was an alias of builtin int.
        preds = (probs >= 0.5).astype(int)
        return preds
Example #12
0
 def fit_2(self,
           x: Union[np.ndarray, torch.Tensor],
           y: Union[np.ndarray, torch.Tensor],
           num_lams: int,
           max_iters: int = 1000,
           an: Union[int, float] = None,
           smooth: Union[float, int] = 0):
     """Fit a group-lasso path, then an adaptive group-lasso path, select by GIC.

     The basis expansion is computed once and shared by both paths. The
     adaptive weights come from the first path's solution at its smallest
     key. The candidate of the second path with the lowest GIC is stored in
     ``self.beta_agl_gic`` and ``self.beta``; ``self`` is returned.
     """
     features = numpy_to_torch(x)
     target = numpy_to_torch(y)
     features = self.normalize(remove_intercept(features))

     expanded = self.basis_expansion_(features, self.df, self.degree)
     expanded, group_size = add_intercept(expanded,
                                          [self.df] * features.shape[1])

     # First pass: plain group lasso, used only to derive adaptive weights.
     gl_path = self.fit_path(expanded,
                             target,
                             group_size,
                             num_lams,
                             max_iters,
                             smooth=smooth)
     initial_beta = gl_path[min(list(gl_path.keys()))]
     adaptive_weights = self.compute_weights(initial_beta)

     # Second pass: adaptive group lasso on the same expanded design.
     agl_path = self.fit_path(expanded,
                              target,
                              group_size,
                              num_lams,
                              max_iters,
                              smooth=smooth,
                              weights=adaptive_weights)

     if an is None:
         an = np.log(features.shape[1]) / features.shape[0]

     best_gic, best_lam, best_beta = np.inf, 0, None
     for lam in agl_path.keys():
         candidate = agl_path[lam]
         gic = self.compute_gic(expanded, target, candidate, an, group_size)
         print(f"lam:{lam}, gic:{gic}")
         if gic < best_gic:
             best_lam, best_beta, best_gic = lam, candidate, gic

     self.beta_agl_gic = self.beta = best_beta
     num_nz, nz = compute_nonzeros(best_beta, group_size)
     print(
         f"The best lam {best_lam} and the best gic {best_gic}. Finally selected {num_nz - 1} nonzeros: {nz}"
     )
     return self
Example #13
0
 def predict(self, x: Union[np.ndarray, torch.Tensor]):
     """Predict responses for ``x`` with the fitted ``self.beta``.

     The input goes through the test-time normalisation and the basis
     expansion, then the linear predictor ``eta`` is mapped according to
     ``self.data_class``: returned as-is for ``'regression'``, thresholded
     at a sigmoid of 0.5 into ones/zeros for ``'classification'``,
     ``exp(-eta)`` for ``'gamma'``, and ``round(exp(eta))`` for anything
     else.
     """
     features = self.normalize_test(remove_intercept(numpy_to_torch(x)))
     design = add_intercept(
         self.basis_expansion_(features, self.df, self.degree))
     eta = torch.matmul(design, self.beta)

     kind = self.data_class
     if kind == 'regression':
         return eta
     if kind == 'classification':
         positive = sigmoid(eta) > 0.5
         return torch.where(positive, torch.ones(len(eta)),
                            torch.zeros(len(eta)))
     if kind == 'gamma':
         return torch.exp(-eta)
     # Every other data class: rounded exponential of the predictor.
     return torch.round(torch.exp(eta))
Example #14
0
 def fit_gic(self,
             x: Union[np.ndarray, torch.Tensor],
             y: Union[np.ndarray, torch.Tensor],
             num_lams: int,
             max_iters: int = 1000,
             an: Union[int, float] = None,
             smooth: Union[int, float] = 0):
     """Fit the group lasso path and keep the solution with the lowest GIC.

     Args:
         x: Design matrix; converted with ``numpy_to_torch``, stripped of its
             intercept, normalised and basis-expanded.
         y: Response; converted with ``numpy_to_torch``.
         num_lams: Forwarded to ``self.fit_path``.
         max_iters: Forwarded to ``self.fit_path``.
         an: GIC penalty factor. Defaults to
             ``df * log(log(n)) * log(p) / n`` with ``n, p = x.shape``.
         smooth: Forwarded to ``self.fit_path``.

     Returns:
         ``self``, with the selected coefficients stored in ``self.beta_gic``
         and ``self.beta``.

     Raises:
         RuntimeError: If no candidate on the path has a GIC below infinity
             (empty path or only NaN/inf criteria), so nothing can be
             selected.
     """
     x = numpy_to_torch(x)
     y = numpy_to_torch(y)
     x = remove_intercept(x)
     x = self.normalize(x)
     x_basis = self.basis_expansion_(x, self.df, self.degree)
     group_size = [self.df] * x.shape[1]
     x_basis, group_size = add_intercept(x_basis, group_size)
     result = self.fit_path(x_basis,
                            y,
                            group_size,
                            num_lams,
                            max_iters,
                            smooth=smooth)
     # Initialise the running optimum so the names exist even when no
     # candidate beats ``inf`` (previously an UnboundLocalError).
     best_gic = np.inf
     best_lam = None
     best_beta = None
     if an is None:
         an = self.df * np.log(np.log(x.shape[0])) * np.log(
             x.shape[1]) / x.shape[0]
     for lam in result.keys():
         gic = self.compute_gic(x_basis, y, result[lam], an, group_size)
         if gic < best_gic:
             best_lam = lam
             best_beta = result[lam]
             best_gic = gic
     if best_beta is None:
         raise RuntimeError(
             "fit_gic could not select a model: no candidate on the path "
             "has a GIC below infinity.")
     self.beta_gic = best_beta
     self.beta = best_beta
     print(f"The best lam {best_lam} and the best gic {best_gic}.")
     return self
Example #15
0
def load_compas_data():
    """Load, filter and encode the COMPAS recidivism data set.

    Reads ``compas-scores-two-years.csv`` (after ``check_data_file``), applies
    the ProPublica row filters, scales the continuous feature, binarises the
    categorical ones, shuffles the rows and prepends an intercept column.

    Returns:
        Tuple ``(X, y, x_control)``: the feature matrix including the
        intercept, the labels with class 0 recoded to -1, and a dict mapping
        each sensitive attribute (``"race"``, ``"sex"``) to a 1-d array.
    """
    FEATURES_CLASSIFICATION = [
        "age_cat", "race", "sex", "priors_count", "c_charge_degree"
    ]  # features to be used for classification
    CONT_VARIABLES = [
        "priors_count"
    ]  # continuous features, handled separately from the categorical ones, which are one-hot encoded
    CLASS_FEATURE = "two_year_recid"  # the decision variable
    SENSITIVE_ATTRS = ["race", "sex"]

    COMPAS_INPUT_FILE = "compas-scores-two-years.csv"
    check_data_file(COMPAS_INPUT_FILE)

    # load the data and convert every column to an np array
    df = pd.read_csv(COMPAS_INPUT_FILE)
    data = {k: np.array(v) for k, v in df.to_dict('list').items()}

    # --- Filtering the data ---
    # These filters are the same as propublica (refer to https://github.com/propublica/compas-analysis)
    # If the charge date of a defendant's Compas scored crime was not within 30 days from when the person was arrested, we assume that because of data quality reasons, that we do not have the right offense.
    idx = np.logical_and(data["days_b_screening_arrest"] <= 30,
                         data["days_b_screening_arrest"] >= -30)

    # We coded the recidivist flag -- is_recid -- to be -1 if we could not find a compas case at all.
    idx = np.logical_and(idx, data["is_recid"] != -1)

    # In a similar vein, ordinary traffic offenses -- those with a c_charge_degree of 'O' -- will not result in Jail time are removed (only two of them).
    idx = np.logical_and(
        idx, data["c_charge_degree"] != "O")  # F: felony, M: misconduct

    # We filtered the underlying data from Broward county to include only those rows representing people who had either recidivated in two years, or had at least two years outside of a correctional facility.
    idx = np.logical_and(idx, data["score_text"] != "NA")

    # we will only consider blacks and whites for this analysis
    idx = np.logical_and(
        idx,
        np.logical_or(data["race"] == "African-American",
                      data["race"] == "Caucasian"))

    # select the examples that satisfy these criteria
    for k in data:
        data[k] = data[k][idx]

    # --- Feature normalization and one hot encoding ---
    # convert class label 0 to -1
    y = data[CLASS_FEATURE]
    y[y == 0] = -1

    print("\nNumber of people recidivating within two years")
    print(pd.Series(y).value_counts())
    print("\n")

    # empty array with num rows same as num examples, will hstack the features to it
    X = np.array([]).reshape(len(y), 0)
    x_control = {}

    feature_names = []
    for attr in FEATURES_CLASSIFICATION:
        vals = data[attr]
        if attr in CONT_VARIABLES:
            vals = [float(v) for v in vals]
            vals = preprocessing.scale(vals)  # 0 mean and 1 variance
            # convert from 1-d arr to a 2-d arr with one col
            vals = np.reshape(vals, (len(y), -1))
        else:  # for binary categorical variables, the label binarizer uses just one var instead of two
            lb = preprocessing.LabelBinarizer()
            lb.fit(vals)
            vals = lb.transform(vals)

        # add to sensitive features dict
        if attr in SENSITIVE_ATTRS:
            x_control[attr] = vals

        # add to learnable features
        X = np.hstack((X, vals))

        if attr in CONT_VARIABLES:  # continuous feature, just append the name
            feature_names.append(attr)
        elif vals.shape[1] == 1:  # binary features that passed through lib binarizer
            feature_names.append(attr)
        else:  # non-binary categorical features, need to add the names for each cat
            for k in lb.classes_:
                feature_names.append(attr + "_" + str(k))

    # convert the sensitive feature to 1-d array
    for k in x_control:
        # make sure that the sensitive feature is binary after one hot encoding
        assert (x_control[k].shape[1] == 1)
        x_control[k] = np.array(x_control[k]).flatten()

    # permute the data randomly; shuffle() needs a mutable list, and a bare
    # range object is immutable on Python 3
    perm = list(range(0, X.shape[0]))
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    for k in x_control:
        x_control[k] = x_control[k][perm]

    X = ut.add_intercept(X)

    feature_names = ["intercept"] + feature_names
    assert (len(feature_names) == X.shape[1])
    print("Features we will be using for classification are:", feature_names, "\n")

    return X, y, x_control
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# print(X_train[0:3, 0:3])

# standardize data; the test set reuses the scaler returned for the training set
X_train, scaler = standardize(X_train)
X_test, _ = standardize(X_test, scaler)

# one dimension at a time: keep only the first target column
y_train = y_train[:, 0]
y_test = y_test[:, 0]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

tst_song = len(song_id_tst)

# add column of ones to data to account for the bias:
X_train = add_intercept(X_train)
print(X_train.shape)
# print(X_train[0:10])

data = dict(x=X_train, y=y_train)

with Model() as model:
    # specify glm and pass in data. The resulting linear model, its likelihood
    # and all its parameters are automatically added to our model.
    glm.glm('y ~ x', data)
    start = find_MAP()
    step = NUTS(scaling=start)  # Instantiate MCMC sampling algorithm
    trace = sample(100, step, progressbar=False)  # draw 100 posterior samples using NUTS sampling

def load_bank_marketing_data():
    """Load and encode the bank marketing data set.

    Reads ``bank-additional-full.csv`` (``;``-separated), drops rows with
    missing feature values, recodes the label to +1/-1, reduces ``age`` to a
    binary privileged/unprivileged attribute, binarises the categorical
    features, shuffles the rows and prepends an intercept column.

    Returns:
        Tuple ``(X, y, x_control)``: the feature matrix including the
        intercept, the int32 labels (``"yes"`` -> 1, ``"no"`` -> -1), and a
        dict mapping the sensitive attribute ``"age"`` to a 1-d array.
    """
    FEATURES_CLASSIFICATION = ["age", "job", "marital", "education", "default", "housing", "loan", "contact", "month", "day_of_week", "poutcome"]  # features to be used for classification
    CONT_VARIABLES = []  # continuous features, handled separately from the categorical ones, which are one-hot encoded
    CLASS_FEATURE = "y"  # the decision variable
    SENSITIVE_ATTRS = ["age"]

    INPUT_FILE = "bank-additional-full.csv"

    # load the data and drop rows with missing feature values
    df = pd.read_csv(INPUT_FILE, sep=";")
    df = df.dropna(subset=FEATURES_CLASSIFICATION)

    # convert every column to an np array
    data = {k: np.array(v) for k, v in df.to_dict('list').items()}

    # --- Filtering the data ---
    # the downloaded data are already pre-processed

    # --- Feature normalization and one hot encoding ---
    y = data[CLASS_FEATURE]
    y[y == "yes"] = 1
    y[y == "no"] = -1
    y = y.astype('int32')

    # convert 'age' to a binary value where privileged is `age >= 25` and
    # unprivileged is `age < 25`. This builds a new string array: writing the
    # labels element by element into the numeric age array raises ValueError.
    ages = data["age"].astype(int)
    data["age"] = np.where(ages >= 25, "privileged", "unprivileged")

    print("\nNumber of clients subscribed to a term deposit")
    print(pd.Series(y).value_counts())
    print("\n")

    # empty array with num rows same as num examples, will hstack the features to it
    X = np.array([]).reshape(len(y), 0)
    x_control = {}

    feature_names = []
    for attr in FEATURES_CLASSIFICATION:
        vals = data[attr]
        if attr in CONT_VARIABLES:
            vals = [float(v) for v in vals]
            vals = preprocessing.scale(vals)  # 0 mean and 1 variance
            vals = np.reshape(vals, (len(y), -1))  # convert from 1-d arr to a 2-d arr with one col
        else:  # for binary categorical variables, the label binarizer uses just one var instead of two
            lb = preprocessing.LabelBinarizer()
            lb.fit(vals)
            vals = lb.transform(vals)

        # add to sensitive features dict
        if attr in SENSITIVE_ATTRS:
            x_control[attr] = vals

        # add to learnable features
        X = np.hstack((X, vals))

        if attr in CONT_VARIABLES:  # continuous feature, just append the name
            feature_names.append(attr)
        elif vals.shape[1] == 1:  # binary features that passed through lib binarizer
            feature_names.append(attr)
        else:  # non-binary categorical features, need to add the names for each cat
            for k in lb.classes_:
                feature_names.append(attr + "_" + str(k))

    # convert the sensitive feature to 1-d array
    for k in x_control:
        assert(x_control[k].shape[1] == 1)  # make sure that the sensitive feature is binary after one hot encoding
        x_control[k] = np.array(x_control[k]).flatten()

    # permute the data randomly
    perm = list(range(0, X.shape[0]))
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    for k in x_control:
        x_control[k] = x_control[k][perm]

    X = ut.add_intercept(X)

    feature_names = ["intercept"] + feature_names
    assert(len(feature_names) == X.shape[1])
    print("Features we will be using for classification are:"+str(feature_names)+"\n")

    return X, y, x_control
Example #18
0
# Exploratory script: residualize two wage-panel columns against the
# 'idcode' and 'year' identifiers with pyhdfe, then set up an OLS model on the
# residuals and start grouping rows by identifier.
#results = model.fit()
#print(results.summary())


# Build the pyhdfe algorithm from the two identifier columns.
algo = pyhdfe.create(get_np_columns(df, ['idcode', 'year'], False),
                        degrees_method='pairwise')
# Column 0 of the result corresponds to 'ln_wage', column 1 to 'hours_log'.
residualized = algo.residualize(get_np_columns(df, ['ln_wage', 'hours_log'], False))

print(algo.degrees)

# NOTE(review): debugger breakpoint left in -- execution stops here and waits
# for interactive input every time the script runs.
import pdb; pdb.set_trace()



#model = sm.OLS(residualized[:,0], np.ones((residualized.shape[0], 1)))
# OLS of the first residualized column on the second one, passed through
# add_intercept (presumably adds a constant column -- confirm in the helper).
# The model is only constructed here, not fitted.
model = sm.OLS(residualized[:,0], add_intercept(residualized[:, 1]))

#print(add_intercept(residualized[:,1])[:10])

ids = get_np_columns(df, ['idcode', 'year'], False)

# NOTE(review): never filled in the visible part of the script.
all_group_indices = []
for i in range(ids.shape[1]):
    col = ids[:, i]
    # np.unique with return_inverse=True returns the sorted unique values and,
    # for every row, the index of its value within that sorted array.
    unique_values, standardized_ids = np.unique(col, return_inverse=True)

    # Okay I have a column with unique values in indices, call it a.
    # I want a list of lists b such that the first list contains
    # NOTE(review): the note above is cut off in the source; the grouping it
    # describes is not implemented in the visible code.
def test_synthetic_data():
    """Train and plot unconstrained vs. constrained classifiers on synthetic data.

    Four logistic-loss classifiers are trained with ``ut.train_model``:
    an unconstrained one, one with the fairness constraint, one with the
    accuracy constraint, and one with the accuracy constraint plus the
    separate (per-point) constraint. Each constrained decision boundary is
    plotted against the unconstrained one and saved under ``img/``.

    Returns None; all results are printed and plotted.
    """

    # Generate the synthetic data.
    # NOTE(review): this call passes no data_type; confirm it targets a
    # generate_synthetic_data variant whose only parameter is plot_data.
    X, y, x_control = generate_synthetic_data(plot_data=True)  # set plot_data to False to skip the data plot
    ut.compute_p_rule(x_control["s1"], y)  # compute the p-rule in the original data

    # Split the data into train and test
    X = ut.add_intercept(X)  # add intercept to X before applying the linear classifier
    train_fold_size = 0.7
    x_train, y_train, x_control_train, x_test, y_test, x_control_test = ut.split_into_train_test(X, y, x_control, train_fold_size)

    # The settings below are read by train_test_classifier() at call time, so
    # re-assigning them between calls changes which model is trained.
    apply_fairness_constraints = None
    apply_accuracy_constraint = None
    sep_constraint = None

    loss_function = lf._logistic_loss
    sensitive_attrs = ["s1"]
    sensitive_attrs_to_cov_thresh = {}
    gamma = None

    def train_test_classifier():
        """Train with the current settings; return (weights, p-rule, test accuracy)."""
        w = ut.train_model(x_train, y_train, x_control_train, loss_function, apply_fairness_constraints, apply_accuracy_constraint, sep_constraint, sensitive_attrs, sensitive_attrs_to_cov_thresh, gamma)
        train_score, test_score, correct_answers_train, correct_answers_test = ut.check_accuracy(w, x_train, y_train, x_test, y_test, None, None)
        distances_boundary_test = (np.dot(x_test, w)).tolist()
        all_class_labels_assigned_test = np.sign(distances_boundary_test)
        correlation_dict_test = ut.get_correlations(None, None, all_class_labels_assigned_test, x_control_test, sensitive_attrs)
        cov_dict_test = ut.print_covariance_sensitive_attrs(None, x_test, distances_boundary_test, x_control_test, sensitive_attrs)
        p_rule = ut.print_classifier_fairness_stats([test_score], [correlation_dict_test], [cov_dict_test], sensitive_attrs[0])
        return w, p_rule, test_score

    def plot_boundaries(w1, w2, p1, p2, acc1, acc2, fname):
        """Scatter a sample of the data and draw the boundaries of w1 (original) and w2 (constrained)."""

        num_to_draw = 200  # we will only draw a small number of points to avoid clutter
        x_draw = X[:num_to_draw]
        y_draw = y[:num_to_draw]
        x_control_draw = x_control["s1"][:num_to_draw]

        X_s_0 = x_draw[x_control_draw == 0.0]
        X_s_1 = x_draw[x_control_draw == 1.0]
        y_s_0 = y_draw[x_control_draw == 0.0]
        y_s_1 = y_draw[x_control_draw == 1.0]
        # column 0 of X is the intercept, so the two features live in columns 1 and 2
        plt.scatter(X_s_0[y_s_0==1.0][:, 1], X_s_0[y_s_0==1.0][:, 2], color='green', marker='x', s=30, linewidth=1.5)
        plt.scatter(X_s_0[y_s_0==-1.0][:, 1], X_s_0[y_s_0==-1.0][:, 2], color='red', marker='x', s=30, linewidth=1.5)
        plt.scatter(X_s_1[y_s_1==1.0][:, 1], X_s_1[y_s_1==1.0][:, 2], color='green', marker='o', facecolors='none', s=30)
        plt.scatter(X_s_1[y_s_1==-1.0][:, 1], X_s_1[y_s_1==-1.0][:, 2], color='red', marker='o', facecolors='none', s=30)

        x1, x2 = max(x_draw[:, 1]), min(x_draw[:, 1])
        y1, y2 = ut.get_line_coordinates(w1, x1, x2)
        plt.plot([x1, x2], [y1, y2], 'c-', linewidth=3, label="Acc=%0.2f; p%% rule=%0.0f%% - Original" % (acc1, p1))
        y1, y2 = ut.get_line_coordinates(w2, x1, x2)
        plt.plot([x1, x2], [y1, y2], 'b--', linewidth=3, label="Acc=%0.2f; p%% rule=%0.0f%% - Constrained" % (acc2, p2))

        # dont need the ticks to see the data distribution; booleans are used
        # because the string 'off' is truthy and leaves the ticks visible
        plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
        plt.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)
        plt.legend(loc=2, fontsize=15)
        plt.xlim((-15, 10))
        plt.ylim((-10, 15))
        plt.savefig(fname)
        plt.show()

    # Classify the data while optimizing for accuracy
    print("")  # blank line; print("") behaves the same on Python 2 and 3
    print("== Unconstrained (original) classifier ==")
    # all constraint flags are set to 0 since we want to train an unconstrained (original) classifier
    apply_fairness_constraints = 0
    apply_accuracy_constraint = 0
    sep_constraint = 0
    w_uncons, p_uncons, acc_uncons = train_test_classifier()

    # Now classify such that we optimize for accuracy while achieving perfect fairness
    apply_fairness_constraints = 1  # set this flag to one since we want to optimize accuracy subject to fairness constraints
    apply_accuracy_constraint = 0
    sep_constraint = 0
    sensitive_attrs_to_cov_thresh = {"s1": 0}
    print("")
    print("== Classifier with fairness constraint ==")
    w_f_cons, p_f_cons, acc_f_cons = train_test_classifier()
    plot_boundaries(w_uncons, w_f_cons, p_uncons, p_f_cons, acc_uncons, acc_f_cons, "img/f_cons.png")

    # Classify such that we optimize for fairness subject to a certain loss in accuracy
    apply_fairness_constraints = 0  # flag for fairness constraint is set back to 0 since we want to apply the accuracy constraint now
    apply_accuracy_constraint = 1  # now, we want to optimize fairness subject to accuracy constraints
    sep_constraint = 0
    gamma = 0.5  # gamma controls how much loss in accuracy we are willing to incur to achieve fairness -- increase gamma to allow more loss in accuracy
    print("== Classifier with accuracy constraint ==")
    w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()
    plot_boundaries(w_uncons, w_a_cons, p_uncons, p_a_cons, acc_uncons, acc_a_cons, "img/a_cons.png")

    # Classify such that we optimize for fairness subject to a certain loss in accuracy.
    # In addition, make sure that no points classified as positive by the
    # unconstrained (original) classifier are misclassified.
    apply_fairness_constraints = 0  # flag for fairness constraint stays 0 since we apply the accuracy constraint
    apply_accuracy_constraint = 1  # optimize fairness subject to accuracy constraints
    sep_constraint = 1  # set the separate constraint flag to one, since in addition to accuracy constraints, we also want no misclassifications for certain points (details in demo README.md)
    gamma = 2000.0
    print("== Classifier with accuracy constraint (no +ve misclassification) ==")
    w_a_cons_fine, p_a_cons_fine, acc_a_cons_fine = train_test_classifier()
    plot_boundaries(w_uncons, w_a_cons_fine, p_uncons, p_a_cons_fine, acc_uncons, acc_a_cons_fine, "img/a_cons_fine.png")

    return
def load_meps_data():
    """Load the MEPS csv ("h192.csv") and turn it into classifier inputs.

    Rows are filtered (panel 21, non-negative REGION/AGE/MARRY/ASTHDX, no
    categorical value below -1), the UTILIZATION label is binarised at 10
    (-1.0 below, 1.0 at or above), continuous features are scaled and
    categorical ones are binarised with ``LabelBinarizer``.

    Returns:
        X: feature matrix after ``ut.add_intercept`` (rows randomly permuted).
        y: array of UTILIZATION labels (-1.0 / 1.0), permuted like X.
        x_control: dict mapping each sensitive attribute name to a 1-d array,
            permuted like X.
    """
    # TODO: CHANGE THIS
    FEATURES_CLASSIFICATION = ['REGION','AGE','SEX','RACE','MARRY',
                               'FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX',
                               'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
                               'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
                               'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42','ADSMOK42',
                               'PCS42',
                               'MCS42','K6SUM42','PHQ242','EMPST','POVCAT','INSCOV', 'PERWT16F']  # features to be used for classification
    CONT_VARIABLES = ['AGE','PCS42','MCS42','K6SUM42', 'PERWT16F']  # continuous features, will need to be handled separately from categorical features, categorical features will be encoded using one-hot
    CLASS_FEATURE = 'UTILIZATION'  # the decision variable
    SENSITIVE_ATTRS = ['RACE']

    INPUT_FILE = "h192.csv"

    # load the data and get some stats
    df = pd.read_csv(INPUT_FILE)

    # ---- Filtering the data ----
    df['RACEV2X'] = df.apply(race, axis=1)
    df = df.rename(columns = {'RACEV2X' : 'RACE'})
    df = df[df['PANEL'] == 21]
    df = df.rename(columns = {'FTSTU53X' : 'FTSTU', 'ACTDTY53' : 'ACTDTY', 'HONRDC53' : 'HONRDC', 'RTHLTH53' : 'RTHLTH',
                              'MNHLTH53' : 'MNHLTH', 'CHBRON53' : 'CHBRON', 'JTPAIN53' : 'JTPAIN', 'PREGNT53' : 'PREGNT',
                              'WLKLIM53' : 'WLKLIM', 'ACTLIM53' : 'ACTLIM', 'SOCLIM53' : 'SOCLIM', 'COGLIM53' : 'COGLIM',
                              'EMPST53' : 'EMPST', 'REGION53' : 'REGION', 'MARRY53X' : 'MARRY', 'AGE53X' : 'AGE',
                              'POVCAT16' : 'POVCAT', 'INSCOV16' : 'INSCOV'})
    df = df[df['REGION'] >= 0]  # remove values -1
    df = df[df['AGE'] >= 0]  # remove values -1
    df = df[df['MARRY'] >= 0]  # remove values -1, -7, -8, -9
    df = df[df['ASTHDX'] >= 0]  # remove values -1, -7, -8, -9
    df = df[(df[['FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX','EDUCYR','HIDEG',
                 'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
                 'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
                 'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42','ADSMOK42',
                 'PHQ242','EMPST','POVCAT','INSCOV']] >= -1).all(1)]  # for all other categorical features, remove values < -1

    # Binarise the utilization score: below 10 -> -1.0, 10 or more -> 1.0.
    # The second mask is computed after the first assignment; rows already set
    # to -1.0 are still < 10, so they are never relabelled.
    df['TOTEXP16'] = df.apply(utilization, axis=1)
    lessE = df['TOTEXP16'] < 10.0
    df.loc[lessE, 'TOTEXP16'] = -1.0
    moreE = df['TOTEXP16'] >= 10.0
    df.loc[moreE, 'TOTEXP16'] = 1.0

    df = df.rename(columns = {'TOTEXP16' : 'UTILIZATION'})
    df = df.dropna(subset=FEATURES_CLASSIFICATION)  # dropping missing vals

    # convert every column to an np array
    data = {col: np.array(col_vals) for col, col_vals in df.to_dict('list').items()}

    # ---- Feature normalization and one hot encoding ----
    y = data[CLASS_FEATURE]

    # The previous message ("clients subscribed to a term deposit") was copied
    # from a different dataset; describe the label that is actually counted.
    print("\nNumber of examples per UTILIZATION class (-1.0: score < 10, 1.0: score >= 10)")
    print(pd.Series(y).value_counts())
    print("\n")

    X = np.array([]).reshape(len(y), 0)  # empty array with num rows same as num examples, will hstack the features to it
    x_control = defaultdict(list)

    feature_names = []
    for attr in FEATURES_CLASSIFICATION:
        vals = data[attr]
        if attr in CONT_VARIABLES:
            vals = [float(v) for v in vals]
            vals = preprocessing.scale(vals)  # 0 mean and 1 variance
            vals = np.reshape(vals, (len(y), -1))  # convert from 1-d arr to a 2-d arr with one col
        else:  # for binary categorical variables, the label binarizer uses just one var instead of two
            lb = preprocessing.LabelBinarizer()
            lb.fit(vals)
            vals = lb.transform(vals)

        # add to sensitive features dict
        if attr in SENSITIVE_ATTRS:
            x_control[attr] = vals

        # add to learnable features
        X = np.hstack((X, vals))

        if attr in CONT_VARIABLES:  # continuous feature, just append the name
            feature_names.append(attr)
        elif vals.shape[1] == 1:  # binary features that passed through lib binarizer
            feature_names.append(attr)
        else:  # non-binary categorical features, need to add the names for each cat
            feature_names.extend(attr + "_" + str(k) for k in lb.classes_)

    # convert the sensitive feature to 1-d array
    x_control = dict(x_control)
    for k in x_control:
        assert(x_control[k].shape[1] == 1)  # make sure that the sensitive feature is binary after one hot encoding
        x_control[k] = np.array(x_control[k]).flatten()

    # permute the data randomly (same permutation for X, y and every sensitive attribute)
    perm = list(range(0, X.shape[0]))
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    for k in x_control:
        x_control[k] = x_control[k][perm]

    X = ut.add_intercept(X)

    feature_names = ["intercept"] + feature_names
    assert(len(feature_names) == X.shape[1])
    print("Features we will be using for classification are:"+str(feature_names)+"\n")

    return X, y, x_control
def load_compas_data():
    """Load the ProPublica COMPAS csv and turn it into classifier inputs.

    Applies the ProPublica row filters, maps class label 0 to -1, scales the
    continuous feature and binarises categorical ones with ``LabelBinarizer``.

    Returns:
        X: feature matrix after ``ut.add_intercept`` (rows randomly permuted).
        y: array of ``two_year_recid`` labels with 0 replaced by -1, permuted like X.
        x_control: dict mapping each sensitive attribute name to a 1-d array,
            permuted like X.
    """

    FEATURES_CLASSIFICATION = ["age_cat", "race", "sex", "priors_count", "c_charge_degree"]  # features to be used for classification
    CONT_VARIABLES = ["priors_count"]  # continuous features, will need to be handled separately from categorical features, categorical features will be encoded using one-hot
    CLASS_FEATURE = "two_year_recid"  # the decision variable
    SENSITIVE_ATTRS = ["race"]

    COMPAS_INPUT_FILE = "compas-scores-two-years.csv"
    check_data_file(COMPAS_INPUT_FILE)

    # load the data and get some stats
    df = pd.read_csv(COMPAS_INPUT_FILE)
    df = df.dropna(subset=["days_b_screening_arrest"])  # dropping missing vals

    # convert every column to an np array
    data = {col: np.array(col_vals) for col, col_vals in df.to_dict('list').items()}

    # ---- Filtering the data ----

    # These filters are the same as propublica (refer to https://github.com/propublica/compas-analysis)
    # If the charge date of a defendants Compas scored crime was not within 30 days from when the person was arrested, we assume that because of data quality reasons, that we do not have the right offense.
    idx = np.logical_and(data["days_b_screening_arrest"]<=30, data["days_b_screening_arrest"]>=-30)

    # We coded the recidivist flag -- is_recid -- to be -1 if we could not find a compas case at all.
    idx = np.logical_and(idx, data["is_recid"] != -1)

    # In a similar vein, ordinary traffic offenses -- those with a c_charge_degree of 'O' -- will not result in Jail time are removed (only two of them).
    idx = np.logical_and(idx, data["c_charge_degree"] != "O")  # F: felony, M: misconduct

    # We filtered the underlying data from Broward county to include only those rows representing people who had either recidivated in two years, or had at least two years outside of a correctional facility.
    # NOTE(review): pandas' default NA parsing may already have turned "NA" cells
    # into NaN, in which case this comparison filters nothing -- confirm.
    idx = np.logical_and(idx, data["score_text"] != "NA")

    # we will only consider blacks and whites for this analysis
    idx = np.logical_and(idx, np.logical_or(data["race"] == "African-American", data["race"] == "Caucasian"))

    # select the examples that satisfy this criteria
    data = {col: col_vals[idx] for col, col_vals in data.items()}

    # ---- Feature normalization and one hot encoding ----

    # convert class label 0 to -1
    y = data[CLASS_FEATURE]
    y[y==0] = -1

    # print(...) with a single argument produces the same output on Python 2 and 3
    print("\nNumber of people recidivating within two years")
    print(pd.Series(y).value_counts())
    print("\n")

    X = np.array([]).reshape(len(y), 0)  # empty array with num rows same as num examples, will hstack the features to it
    x_control = defaultdict(list)

    feature_names = []
    for attr in FEATURES_CLASSIFICATION:
        vals = data[attr]
        if attr in CONT_VARIABLES:
            vals = [float(v) for v in vals]
            vals = preprocessing.scale(vals)  # 0 mean and 1 variance
            vals = np.reshape(vals, (len(y), -1))  # convert from 1-d arr to a 2-d arr with one col
        else:  # for binary categorical variables, the label binarizer uses just one var instead of two
            lb = preprocessing.LabelBinarizer()
            lb.fit(vals)
            vals = lb.transform(vals)

        # add to sensitive features dict
        if attr in SENSITIVE_ATTRS:
            x_control[attr] = vals

        # add to learnable features
        X = np.hstack((X, vals))

        if attr in CONT_VARIABLES:  # continuous feature, just append the name
            feature_names.append(attr)
        elif vals.shape[1] == 1:  # binary features that passed through lib binarizer
            feature_names.append(attr)
        else:  # non-binary categorical features, need to add the names for each cat
            feature_names.extend(attr + "_" + str(k) for k in lb.classes_)

    # convert the sensitive feature to 1-d array
    x_control = dict(x_control)
    for k in x_control:
        assert(x_control[k].shape[1] == 1)  # make sure that the sensitive feature is binary after one hot encoding
        x_control[k] = np.array(x_control[k]).flatten()

    # permute the data randomly; shuffle() needs a mutable sequence, and a
    # bare range() is immutable on Python 3, so materialise it as a list
    perm = list(range(0, X.shape[0]))
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    for k in x_control:
        x_control[k] = x_control[k][perm]

    X = ut.add_intercept(X)

    feature_names = ["intercept"] + feature_names
    assert(len(feature_names) == X.shape[1])
    # single-string form keeps the exact text the old print statement produced
    print("Features we will be using for classification are: " + str(feature_names) + " \n")

    return X, y, x_control
def test_adult_data():
    """Compare fairness metrics of unconstrained and fairness-constrained models.

    Loads the full adult data, trains an unconstrained logistic-loss
    classifier, then retrains with the fairness constraint for the covariance
    thresholds ``np.arange(0, 0.51, 0.1)`` and finally for threshold 1.

    Returns an 8-tuple: the four training-set metrics of the unconstrained
    model (as returned by ``ut.get_eq_op_acc``, ``ut.get_eq_odds_acc``,
    ``ut.get_pred_rate_par_acc`` and ``ut.get_dem_par_acc``), followed by four
    lists holding the same metrics for every constrained model, in training order.
    """
    # set the argument to none, or no arguments if you want to test with the
    # whole data -- subsampling is only a performance speedup
    X, y, x_control = load_adult_data(load_data_size=None)
    ut.compute_p_rule(x_control["sex"], y)  # p-rule in the original data

    # split into train and test after adding the intercept column
    X = ut.add_intercept(X)
    train_fold_size = 0.7
    folds = ut.split_into_train_test(X, y, x_control, train_fold_size)
    x_train, y_train, x_control_train, x_test, y_test, x_control_test = folds

    loss_function = lf._logistic_loss
    sensitive_attrs = ["sex"]

    def fit_and_score(fairness_flag, cov_thresh):
        """Train one model (accuracy/separate constraints off, gamma None) and score it."""
        w = ut.train_model(x_train, y_train, x_control_train, loss_function,
                           fairness_flag, 0, 0,
                           sensitive_attrs, cov_thresh, None)
        _train_score, test_score, _ok_train, _ok_test = ut.check_accuracy(
            w, x_train, y_train, x_test, y_test, None, None)
        test_distances = (np.dot(x_test, w)).tolist()
        test_labels = np.sign(test_distances)
        correlations = ut.get_correlations(
            None, None, test_labels, x_control_test, sensitive_attrs)
        covariances = ut.print_covariance_sensitive_attrs(
            None, x_test, test_distances, x_control_test, sensitive_attrs)
        p_rule = ut.print_classifier_fairness_stats(
            [test_score], [correlations], [covariances], sensitive_attrs[0])
        eq_op, _chance_bin_zero, _chance_bin_one = ut.get_eq_op_acc(
            w, x_train, y_train, x_control_train, None)
        eq_odds = ut.get_eq_odds_acc(w, x_train, y_train, x_control_train, None)
        pred_rate_par = ut.get_pred_rate_par_acc(
            w, x_train, y_train, x_control_train, None)
        dem_par = ut.get_dem_par_acc(w, x_train, y_train, x_control_train, None)
        return w, p_rule, test_score, eq_op, eq_odds, pred_rate_par, dem_par

    # unconstrained (original) classifier: every constraint flag is 0
    print()
    print("== Unconstrained (original) classifier ==")
    unconstrained = fit_and_score(0, {})

    # one list per metric: eq. opportunity, eq. odds, pred. rate parity, demographic parity
    constrained_metrics = ([], [], [], [])

    def record(result):
        # positions 3..6 of a result tuple are the four fairness metrics
        for bucket, value in zip(constrained_metrics, result[3:]):
            bucket.append(value)

    # optimize accuracy subject to the fairness constraint, sweeping the threshold
    for num in np.arange(0, 0.51, 0.1):
        print()
        print("== Classifier with fairness constraint, cov: ", num, " ==")
        record(fit_and_score(1, {"sex": num}))

    print()
    print("== Classifier with fairness constraint, cov: 1 ==")
    record(fit_and_score(1, {"sex": 1}))

    return unconstrained[3:] + constrained_metrics
def generate_synthetic_data(data_type, plot_data=False):

    """
        Code for generating the synthetic data.
        We will have two non-sensitive features and one sensitive feature.
        Non sensitive features will be drawn from a 2D gaussian distribution.
        Sensitive feature specifies the demographic group of the data point and can take values 0 and 1.

        The code will generate data such that a classifier optimizing for accuracy will lead to disparate misclassification rates for the two demographic groups.
        You can generate different data configurations using different values for the "data_type" parameter.

        data_type: 1 -> disparate false positive rates as well as disparate false negative rates for both groups;
                   2 -> disparate false positive rates for both groups but equal false negative rates.
        plot_data: when True, scatter the first 200 points, save the figure to "img/data.png" and show it.

        Returns (X, y, x_control): X is the shuffled feature matrix passed through ut.add_intercept,
        y holds the +1/-1 class labels and x_control is {"s1": array of 0/1 sensitive values}.

        Raises ValueError if data_type is neither 1 nor 2.
    """

    n_samples = 1000 # generate these many data points per cluster

    def gen_gaussian_diff_size(mean_in, cov_in, z_val, class_label, n):
        """
        mean_in: mean of the gaussian cluster
        cov_in: covariance matrix
        z_val: sensitive feature value
        class_label: +1 or -1
        n: number of points
        """

        nv = multivariate_normal(mean = mean_in, cov = cov_in)
        X = nv.rvs(n)
        y = np.ones(n, dtype=float) * class_label
        z = np.ones(n, dtype=float) * z_val # all the points in this cluster get this value of the sensitive attribute

        return nv, X, y, z

    # One (mean, covariance, z, class label) entry per cluster, listed in the
    # order the clusters are sampled: (z=1,+), (z=0,+), (z=1,-), (z=0,-).
    if data_type == 1:
        cluster_params = [
            ([2, 3], [[10, 1], [1, 4]], 1, +1),
            ([1, 2], [[5, 2], [2, 5]], 0, +1),
            ([-5, 0], [[5, 1], [1, 5]], 1, -1),
            ([0, -1], [[7, 1], [1, 7]], 0, -1),
        ]
    elif data_type == 2:
        cluster_params = [
            ([2, 2], [[3, 1], [1, 3]], 1, +1),
            ([2, 2], [[3, 1], [1, 3]], 0, +1),
            ([-2, -2], [[3, 1], [1, 3]], 1, -1),
            # NOTE(review): this covariance is not symmetric; kept unchanged so
            # the generated data stays the same -- confirm the intended values.
            ([-1, 0], [[3, 3], [1, 3]], 0, -1),
        ]
    else:
        # fail early with a clear message instead of a NameError further down
        raise ValueError("data_type must be 1 or 2, got %r" % (data_type,))

    clusters = [
        gen_gaussian_diff_size(mu, sigma, z_val, label, int(n_samples * 1))
        for mu, sigma, z_val, label in cluster_params
    ]

    # merge the clusters (each entry is (nv, X, y, z))
    X = np.vstack([c[1] for c in clusters])
    y = np.hstack([c[2] for c in clusters])
    x_control = np.hstack([c[3] for c in clusters])

    # shuffle the data
    perm = list(range(len(X)))
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    x_control = x_control[perm]

    # Plot the data
    if plot_data:
        plt.figure()
        num_to_draw = 200 # we will only draw a small number of points to avoid clutter
        x_draw = X[:num_to_draw]
        y_draw = y[:num_to_draw]
        x_control_draw = x_control[:num_to_draw]

        X_s_0 = x_draw[x_control_draw == 0.0]
        X_s_1 = x_draw[x_control_draw == 1.0]
        y_s_0 = y_draw[x_control_draw == 0.0]
        y_s_1 = y_draw[x_control_draw == 1.0]

        plt.scatter(X_s_0[y_s_0==1.0][:, 0], X_s_0[y_s_0==1.0][:, 1], color='green', marker='x', s=60, linewidth=2, label= "group-0 +ve")
        plt.scatter(X_s_0[y_s_0==-1.0][:, 0], X_s_0[y_s_0==-1.0][:, 1], color='red', marker='x', s=60, linewidth=2, label = "group-0 -ve")
        plt.scatter(X_s_1[y_s_1==1.0][:, 0], X_s_1[y_s_1==1.0][:, 1], color='green', marker='o', facecolors='none', s=60, linewidth=2, label = "group-1 +ve")
        plt.scatter(X_s_1[y_s_1==-1.0][:, 0], X_s_1[y_s_1==-1.0][:, 1], color='red', marker='o', facecolors='none', s=60, linewidth=2, label = "group-1 -ve")

        # dont need the ticks to see the data distribution; booleans are used
        # because the string 'off' is truthy and leaves the ticks visible
        plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
        plt.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)
        plt.legend(loc=2, fontsize=21)
        plt.ylim((-8,12))

        plt.savefig("img/data.png")
        plt.show()

    x_control = {"s1": x_control} # all the sensitive features are stored in a dictionary
    X = ut.add_intercept(X)

    return X,y,x_control
def test_adult_data():
    """Train unconstrained and constrained classifiers on a sample of the adult data.

    Four logistic-loss classifiers are trained with ``ut.train_model``:
    an unconstrained one, one with the fairness constraint, one with the
    accuracy constraint, and one with the accuracy constraint plus the
    separate (per-point) constraint. The fairness statistics of each model are
    printed by the ``ut`` helpers.

    Returns None.
    """

    # Load the adult data
    X, y, x_control = load_adult_data(load_data_size=10000)  # set the argument to none, or no arguments if you want to test with the whole data -- we are subsampling for performance speedup
    ut.compute_p_rule(x_control["sex"], y)  # compute the p-rule in the original data

    # Split the data into train and test
    X = ut.add_intercept(X)  # add intercept to X before applying the linear classifier
    train_fold_size = 0.7
    x_train, y_train, x_control_train, x_test, y_test, x_control_test = ut.split_into_train_test(X, y, x_control, train_fold_size)

    # The settings below are read by train_test_classifier() at call time, so
    # re-assigning them between calls changes which model is trained.
    apply_fairness_constraints = None
    apply_accuracy_constraint = None
    sep_constraint = None

    loss_function = lf._logistic_loss
    sensitive_attrs = ["sex"]
    sensitive_attrs_to_cov_thresh = {}
    gamma = None

    def train_test_classifier():
        """Train with the current settings; return (weights, p-rule, test accuracy)."""
        w = ut.train_model(x_train, y_train, x_control_train, loss_function, apply_fairness_constraints, apply_accuracy_constraint, sep_constraint, sensitive_attrs, sensitive_attrs_to_cov_thresh, gamma)
        train_score, test_score, correct_answers_train, correct_answers_test = ut.check_accuracy(w, x_train, y_train, x_test, y_test, None, None)
        distances_boundary_test = (np.dot(x_test, w)).tolist()
        all_class_labels_assigned_test = np.sign(distances_boundary_test)
        correlation_dict_test = ut.get_correlations(None, None, all_class_labels_assigned_test, x_control_test, sensitive_attrs)
        cov_dict_test = ut.print_covariance_sensitive_attrs(None, x_test, distances_boundary_test, x_control_test, sensitive_attrs)
        p_rule = ut.print_classifier_fairness_stats([test_score], [correlation_dict_test], [cov_dict_test], sensitive_attrs[0])
        return w, p_rule, test_score

    # Classify the data while optimizing for accuracy
    print("")  # blank line; print("") behaves the same on Python 2 and 3
    print("== Unconstrained (original) classifier ==")
    # all constraint flags are set to 0 since we want to train an unconstrained (original) classifier
    apply_fairness_constraints = 0
    apply_accuracy_constraint = 0
    sep_constraint = 0
    w_uncons, p_uncons, acc_uncons = train_test_classifier()

    # Now classify such that we optimize for accuracy while achieving perfect fairness
    apply_fairness_constraints = 1  # set this flag to one since we want to optimize accuracy subject to fairness constraints
    apply_accuracy_constraint = 0
    sep_constraint = 0
    sensitive_attrs_to_cov_thresh = {"sex": 0}
    print("")
    print("== Classifier with fairness constraint ==")
    w_f_cons, p_f_cons, acc_f_cons = train_test_classifier()

    # Classify such that we optimize for fairness subject to a certain loss in accuracy
    apply_fairness_constraints = 0  # flag for fairness constraint is set back to 0 since we want to apply the accuracy constraint now
    apply_accuracy_constraint = 1  # now, we want to optimize fairness subject to accuracy constraints
    sep_constraint = 0
    gamma = 0.5  # gamma controls how much loss in accuracy we are willing to incur to achieve fairness -- increase gamma to allow more loss in accuracy
    print("== Classifier with accuracy constraint ==")
    w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()

    # Classify such that we optimize for fairness subject to a certain loss in accuracy.
    # In addition, make sure that no points classified as positive by the
    # unconstrained (original) classifier are misclassified.
    apply_fairness_constraints = 0  # flag for fairness constraint stays 0 since we apply the accuracy constraint
    apply_accuracy_constraint = 1  # optimize fairness subject to accuracy constraints
    sep_constraint = 1  # set the separate constraint flag to one, since in addition to accuracy constraints, we also want no misclassifications for certain points (details in demo README.md)
    gamma = 1000.0
    print("== Classifier with accuracy constraint (no +ve misclassification) ==")
    w_a_cons_fine, p_a_cons_fine, acc_a_cons_fine = train_test_classifier()

    return
# print X_train[0:3,0:3]

# standardize data
# The scaler returned for the training set is passed back in for the test set
# (presumably so both are scaled with the same statistics -- confirm in `standardize`).
X_train, scaler = standardize(X_train)
X_test, _ = standardize(X_test, scaler)

# one dimension at a time
# (only column 0 of the targets is kept)
y_train = y_train[:, 0]
y_test = y_test[:, 0]

# NOTE(review): Python 2 print statements -- this fragment does not parse on Python 3.
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

tst_song = len(song_id_tst)

# add column of ones to data to account for the bias:
# NOTE(review): only X_train gets the intercept column here; X_test is left as is.
X_train = add_intercept(X_train)
print X_train.shape
# print X_train[0:10]

data = dict(x=X_train, y=y_train)

with Model() as model:
    # specify glm and pass in data. The resulting linear model, its likelihood and
    # and all its parameters are automatically added to our model.
    glm.glm('y ~ x', data)
    start = find_MAP()
    step = NUTS(scaling=start)  # Instantiate MCMC sampling algorithm
    trace = sample(
        100, step,
        progressbar=False)  # draw 100 posterior samples using NUTS sampling
Example #26
0
 def solve(self, x: Union[np.ndarray, torch.Tensor], y: Union[np.ndarray, torch.Tensor], lam: Union[float, int],
           group_size: Union[int, List[int]], max_iters: int = 1000, weight: List[Union[int, List[int]]] = None,
           smooth: Union[float, int] = 0, recompute_hg: bool = True,
           beta_warm: torch.Tensor = None, weight_multiplied: bool = False) -> torch.Tensor:
     """
     Fit the model for one user-specified lambda by cycling over the groups.

     :param x: the design matrix
     :param y: the response
     :param lam: the lambda for group lasso; must be non-negative
     :param group_size: list of group sizes, or a single size if all groups are of the same size
     :param max_iters: the maximum number of iterations
     :param weight: feature weights, one per group; defaults to 1 for every group
     :param smooth: smoothness parameter
     :param recompute_hg: whether to recompute hg for every group (it is always recomputed for g <= 2)
     :param beta_warm: warm start of beta; used only when its shape matches the initial beta
     :param weight_multiplied: if False, each weight is multiplied by sqrt(group size); if True, weights are used as given
     :return: coefficients
     """
     # a single integer means equally sized groups
     if isinstance(group_size, int):
         repeats = x.shape[1] // group_size
         group_size = [group_size] * repeats
     assert np.sum(group_size) == x.shape[1], \
         f"Sum of group sizes {sum(group_size)} do not match number of variables {x.shape[1]}."
     assert lam >= 0, "Tuning parameter lam must be non-negative."

     # ---- initialize parameters ----
     self.smoothness_penalty = smooth
     x, y = check_xy(numpy_to_torch(x), numpy_to_torch(y))
     x, group_size = add_intercept(x, group_size)
     if weight is None:
         weight = [1] * len(group_size)
     if weight_multiplied:
         penalty_weights = weight[:]
     else:
         penalty_weights = [np.sqrt(size) * weight[idx] for idx, size in enumerate(group_size)]
     design = x.clone()
     # design, self.R = self.group_orthogonalization(x, group_size)
     beta, error, iters, _loss = self.initialize(group_size)
     warm_start_usable = beta_warm is not None and beta_warm.shape == beta.shape
     if warm_start_usable:
         beta = beta_warm
     intercept_err = np.inf
     beta_old = beta.clone()
     num_groups = len(group_size)
     hg = None

     # ---- start iterations ----
     while (error > self.tol or intercept_err > self.tol) and iters <= max_iters:
         iters += 1
         for g in range(num_groups):
             start, stop = self.find_group_index(group_size, g)
             if recompute_hg or hg is None or g <= 2:
                 hg = self.compute_hg(design, y, beta, start, stop)
             derivative = self.compute_grad(design, y, beta)
             if g != 0:
                 # every group but the first gets the closed-form update, written in place
                 beta[start: stop] = self.close_form_QM(beta, derivative, hg, lam,
                                                        start, stop,
                                                        penalty_weights[g], smooth)
                 continue
             # first group (presumably the intercept added above): direction plus line search
             d = self.compute_d(False, derivative, beta, lam, start, stop, hg)
             alpha = self.line_search(design, y, beta, d, group_size, g, lam)
             beta = beta + alpha * d
         # convergence is tracked separately for beta[0] and for the remaining coefficients
         error = torch.norm(beta[1:] - beta_old[1:])
         intercept_err = abs(beta[0].detach().numpy() - beta_old[0].detach().numpy())
         beta_old = beta.clone()
         # print(f"error is {error}")
     # print(iters)
     # beta = self.group_orthogonalization_inverse(beta, self.R, group_size)
     return beta
Example #27
0
#             s = s + '%g '%X_train[i,feat]
#         infile.write('%s\n'%s)

# one dimension at a time
# (only column 0 of the targets is kept)
y_train = y_train[:, 0]
y_test = y_test[:, 0]

# Keep the same hand-picked subset of feature columns in both train and test sets.
X_train = X_train[:, [10, 12, 13, 17, 19, 82, 83, 84, 85, 89, 90, 91, 103, 140, 142, 146, 148, 212, 214, 218, 220]]
X_test = X_test[:, [10, 12, 13, 17, 19, 82, 83, 84, 85, 89, 90, 91, 103, 140, 142, 146, 148, 212, 214, 218, 220]]

# NOTE(review): Python 2 print statements -- this fragment does not parse on Python 3.
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

tst_song = len(song_id_tst)

# add column of ones to data to account for the bias:
# NOTE(review): only X_train gets the intercept column here; X_test is left as is.
X_train = add_intercept(X_train)
print X_train.shape
# print X_train[0:10]

# Theano symbolic definitions
# X is a symbolic vector and Y a symbolic scalar; lr and regul are named
# symbolic scalars for the learning rate and the L2 regularisation coefficient.
X = T.vector()
Y = T.scalar()
lr = T.scalar("learning rate")
regul = T.scalar("L2 regul. coeff")


def model(X, w):
    """Linear model: return the Theano dot product of input X and weights w."""
    prediction = T.dot(X, w)
    return prediction


# Size of X_train along its second axis, read after add_intercept() was
# applied to X_train earlier in this script.
nb_features = X_train.shape[1]
def generate_synthetic_data(data_type, plot_data=False):
    """Generate a shuffled synthetic dataset with one binary sensitive feature.

    There are two non-sensitive features and one sensitive feature. The
    non-sensitive features are drawn from 2D gaussian clusters, one cluster
    per (sensitive value, class label) combination, 1000 points each. The
    sensitive feature specifies the demographic group of a data point and
    takes the values 0 and 1; the class label takes the values +1 and -1.

    The clusters are configured such that a classifier optimizing for
    accuracy will lead to disparate misclassification rates for the two
    demographic groups.

    Args:
        data_type: selects the data configuration.
            1 -- disparate false positive rates as well as disparate false
                 negative rates for both groups.
            2 -- disparate false positive rates for both groups but equal
                 false negative rates.
        plot_data: when true, scatter-plot the first 200 shuffled points,
            save the figure to "img/data.png" and show it.

    Returns:
        A tuple ``(X, y, x_control)``: ``X`` is the stacked feature matrix
        after ``ut.add_intercept``, ``y`` the array of class labels and
        ``x_control`` a dict ``{"s1": sensitive_feature_array}``.

    Raises:
        ValueError: if ``data_type`` is not one of the supported values.
    """

    n_samples = 1000  # generate these many data points per cluster

    def gen_gaussian_diff_size(mean_in, cov_in, z_val, class_label, n):
        """
        mean_in: mean of the gaussian cluster
        cov_in: covariance matrix
        z_val: sensitive feature value
        class_label: +1 or -1
        n: number of points
        """

        nv = multivariate_normal(mean=mean_in, cov=cov_in)
        X = nv.rvs(n)
        y = np.ones(n, dtype=float) * class_label
        z = np.ones(n, dtype=float) * z_val  # all the points in this cluster get this value of the sensitive attribute

        return nv, X, y, z

    # Each spec is (mean, covariance, sensitive value, class label). The order
    # of the specs is the order in which the clusters are sampled and stacked.
    if data_type == 1:
        cluster_specs = [
            ([2, 3], [[10, 1], [1, 4]], 1, +1),  # z=1, +
            ([1, 2], [[5, 2], [2, 5]], 0, +1),   # z=0, +
            ([-5, 0], [[5, 1], [1, 5]], 1, -1),  # z=1, -
            ([0, -1], [[7, 1], [1, 7]], 0, -1),  # z=0, -
        ]
    elif data_type == 2:
        shared_cov = [[3, 1], [1, 3]]
        cluster_specs = [
            ([2, 2], shared_cov, 1, +1),    # z=1, +
            ([2, 2], shared_cov, 0, +1),    # z=0, +
            ([-2, -2], shared_cov, 1, -1),  # z=1, -
            # NOTE(review): this covariance is not symmetric as written; it is
            # kept unchanged here -- confirm whether [[3, 1], [1, 3]] was meant.
            ([-1, 0], [[3, 3], [1, 3]], 0, -1),  # z=0, -
        ]
    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # when the clusters are merged below.
        raise ValueError(
            "Unsupported data_type %r: expected 1 or 2" % (data_type,))

    features, labels, sensitive = [], [], []
    for mean, cov, z_val, class_label in cluster_specs:
        _, X_c, y_c, z_c = gen_gaussian_diff_size(mean, cov, z_val, class_label, n_samples)
        features.append(X_c)
        labels.append(y_c)
        sensitive.append(z_c)

    # merge the clusters
    X = np.vstack(features)
    y = np.hstack(labels)
    x_control = np.hstack(sensitive)

    # shuffle the data; shuffle() needs a mutable sequence, and on Python 3
    # range() is not one, hence the explicit list
    perm = list(range(len(X)))
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    x_control = x_control[perm]

    # Plot the data
    if plot_data:
        plt.figure()
        num_to_draw = 200  # we will only draw a small number of points to avoid clutter
        x_draw = X[:num_to_draw]
        y_draw = y[:num_to_draw]
        x_control_draw = x_control[:num_to_draw]

        X_s_0 = x_draw[x_control_draw == 0.0]
        X_s_1 = x_draw[x_control_draw == 1.0]
        y_s_0 = y_draw[x_control_draw == 0.0]
        y_s_1 = y_draw[x_control_draw == 1.0]

        plt.scatter(X_s_0[y_s_0 == 1.0][:, 0], X_s_0[y_s_0 == 1.0][:, 1], color='green', marker='x', s=60, linewidth=2, label="group-0 +ve")
        plt.scatter(X_s_0[y_s_0 == -1.0][:, 0], X_s_0[y_s_0 == -1.0][:, 1], color='red', marker='x', s=60, linewidth=2, label="group-0 -ve")
        plt.scatter(X_s_1[y_s_1 == 1.0][:, 0], X_s_1[y_s_1 == 1.0][:, 1], color='green', marker='o', facecolors='none', s=60, linewidth=2, label="group-1 +ve")
        plt.scatter(X_s_1[y_s_1 == -1.0][:, 0], X_s_1[y_s_1 == -1.0][:, 1], color='red', marker='o', facecolors='none', s=60, linewidth=2, label="group-1 -ve")

        # dont need the ticks to see the data distribution. Booleans are used
        # because current matplotlib treats the string 'off' as truthy.
        plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
        plt.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)
        plt.legend(loc=2, fontsize=21)
        plt.ylim((-8, 12))

        plt.savefig("img/data.png")
        plt.show()

    x_control = {"s1": x_control}  # all the sensitive features are stored in a dictionary
    X = ut.add_intercept(X)

    return X, y, x_control
Example #29
0
    212, 214, 218, 220
]]
# X_train = X_train[:,[13,85,103,142,214]]
# X_test = X_test[:,[13,85,103,142,214]]

# one dimension at a time: keep a single target column
# 0: arousal, 1: valence
y_train = y_train[:, 0]
y_test = y_test[:, 0]

# A single pre-formatted string prints the same text under Python 2 and
# Python 3 (the bare `print a, b` statement is a SyntaxError on Python 3).
print("%s %s %s %s" % (X_train.shape, y_train.shape, X_test.shape, y_test.shape))

tst_song = len(song_id_tst)

# add column of ones to data to account for the bias:
X_train = add_intercept(X_train)
print(X_train.shape)
# print X_train[0:10]

# Theano symbolic definitions
X = T.vector()
Y = T.scalar()
lr = T.scalar('learning rate')
regul = T.scalar('L2 regul. coeff')


def model(X, w):
    """Return ``T.tanh`` applied to the dot product of input X and weights w."""
    linear_response = T.dot(X, w)
    return T.tanh(linear_response)
    # Alternative output non-linearities, left disabled:
    # return 2.0*T.nnet.sigmoid(T.dot(X, w)) - 1.0
    # return T.erf(T.dot(X,w))
Example #30
0
#                                                   Prob > F        =     0.0000
#                                                   R-squared       =     0.7862
#                                                   Adj R-squared   =     0.7679
#                                                   Within R-sq.    =     0.7586
#                                                   Root MSE        =     6.7671
#
# ------------------------------------------------------------------------------
#       rating |      Coef.   Std. Err.      t    P>|t|     [95% Conf. Interval]
# -------------+----------------------------------------------------------------
#          fat |  -5.684196   .8801468    -6.46   0.000    -7.439594   -3.928799
#      protein |   3.740386   .8430319     4.44   0.000     2.059012     5.42176
#        carbo |  -.7892276   .2041684    -3.87   0.000    -1.196429   -.3820266
#       sugars |   -2.03286   .2179704    -9.33   0.000    -2.467588   -1.598132
#        _cons |   64.49503    4.92674    13.09   0.000     54.66896     74.3211
# ------------------------------------------------------------------------------
#
# Absorbed degrees of freedom:
# -----------------------------------------------------+
#  Absorbed FE | Categories  - Redundant  = Num. Coefs |
# -------------+---------------------------------------|
#        shelf |         3           0           3     |
# -----------------------------------------------------+

# Extract the five listed columns from df and pass them through
# algo.residualize.
# NOTE(review): presumably this partials out the absorbed `shelf` fixed
# effect shown in the reference output above -- confirm against `algo`.
residualized = algo.residualize(
    get_np_columns(df, ['rating', 'fat', 'protein', 'carbo', 'sugars'], False))
# OLS of column 0 on columns 1-4 plus an intercept column.
# Assumes the helpers keep the column order of the list above, so column 0
# is `rating` -- TODO confirm.
model = sm.OLS(residualized[:, 0], add_intercept(residualized[:,
                                                              [1, 2, 3, 4]]))
results = model.fit()
# Print a header naming the specification, then the fitted model's summary.
print("rating ~ fat + protein + carbo + sugars, absorb(shelf)")
print(results.summary())