Example #1
def training_helper(model, train_df, test_df, minibatch_size, num_iterations_per_loop=1, num_loops=1):
    train_error_rate_vector = []
    train_overall_tpr_vector = []
    train_constraints_matrix = []
    test_error_rate_vector = []
    test_overall_tpr_vector = []
    test_constraints_matrix = []
    for train, test in training_generator(model, train_df, test_df, minibatch_size, num_iterations_per_loop, num_loops):
        train_df['predictions'] = train
        test_df['predictions'] = test

        train_error_rate, train_overall_tpr, train_constraints = _get_error_rate_and_constraints(
          train_df, model.tpr_max_diff)
        train_error_rate_vector.append(train_error_rate)
        train_overall_tpr_vector.append(train_overall_tpr)
        train_constraints_matrix.append(train_constraints)

        test_error_rate, test_overall_tpr, test_constraints = _get_error_rate_and_constraints(
            test_df, model.tpr_max_diff)
        test_error_rate_vector.append(test_error_rate)
        test_overall_tpr_vector.append(test_overall_tpr)
        test_constraints_matrix.append(test_constraints)

    cand_dist = tfco.find_best_candidate_distribution(
      train_error_rate_vector, train_constraints_matrix, epsilon=0.001)
    best_cand_index = tfco.find_best_candidate_index(
      train_error_rate_vector, train_constraints_matrix)
    train_metrics = get_iterate_metrics(
      cand_dist, best_cand_index, train_error_rate_vector,
      train_overall_tpr_vector, train_constraints_matrix)
    test_metrics = get_iterate_metrics(
      cand_dist, best_cand_index, test_error_rate_vector,
      test_overall_tpr_vector, test_constraints_matrix)

    return (train_metrics, test_metrics)
def print_metrics_results_dict(results_dict, iterate='best'):
    """Prints metrics from results_dict."""
    index = -1
    if iterate == 'best':
        if FLAGS.unconstrained:
            index = np.argmin(np.array(results_dict['train.true_error_rates']))
        else:
            index = tfco.find_best_candidate_index(
                np.array(results_dict['train.true_error_rates']),
                np.array(results_dict['train.sampled_violations_max']).reshape(
                    (-1, 1)),
                rank_objectives=True)
    for metric_name, values in results_dict.items():
        _print_metric(iterate, metric_name, values[index])
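For reference, a minimal self-contained sketch (with made-up numbers) of the two TFCO candidate-selection helpers used above, assuming tfco is tensorflow_constrained_optimization:

import numpy as np
import tensorflow_constrained_optimization as tfco

# Three recorded iterates: an objective value and one constraint violation each
# (positive = violated, non-positive = satisfied). Numbers are illustrative.
objectives = np.array([0.30, 0.25, 0.28])
violations = np.array([[0.05], [0.02], [-0.01]])

# Single iterate with the best objective / constraint-violation trade-off.
best_index = tfco.find_best_candidate_index(objectives, violations)

# Sparse distribution over iterates ("shrinking"), supported on at most
# (number of constraints + 1) of them.
distribution = tfco.find_best_candidate_distribution(
    objectives, violations, epsilon=0.001)

print(best_index, distribution)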
Example #3
def lagrangian_optimizer_kld(train_set, additive_slack, learning_rate,
                             learning_rate_constraint, loops):
    """Implements surrogate-based Lagrangian optimizer (Algorithm 2).

  Specifically solves:
    min_{theta} sum_{G = 0, 1} KLD(p, pprG(theta))
      s.t. error_rate <= additive_slack,
    where p is the overall proportion of positives and pprG is the positive
    prediction rate for group G.

  We frame this as a constrained optimization problem:
    min_{theta, xi_pos0, xi_pos1, xi_neg0, xi_neg1} {
      -p log(xi_pos0) - (1-p) log(xi_neg0) - p log(xi_pos1)
        -(1-p) log(xi_neg1)}
    s.t.
      error_rate <= additive_slack,
        xi_pos0 <= ppr0(theta), xi_neg0 <= npr0(theta),
        xi_pos1 <= ppr1(theta), xi_neg1 <= npr1(theta),
  and formulate the Lagrangian:
    max_{lambda's >= 0} min_{xi's} {
      -p log(xi_pos0) - (1-p) log(xi_neg0) - p log(xi_pos1)
        -(1-p) log(xi_neg1)
       + lambda_pos0 (xi_pos0 - ppr0(theta))
       + lambda_neg0 (xi_neg0 - npr0(theta))
       + lambda_pos1 (xi_pos1 - ppr1(theta))
       + lambda_neg1 (xi_neg1 - npr1(theta))}
    s.t.
      error_rate <= additive_slack.

  We do best response for the slack variables xi:
    BR for xi_pos0 = p / lambda_pos0
    BR for xi_neg0 = (1 - p) / lambda_neg0
    BR for xi_pos1 = p / lambda_pos1
    BR for xi_neg1 = (1 - p) / lambda_neg1
  We do gradient ascent on the lambda's, where
    Gradient w.r.t. lambda_pos0
      = BR for xi_pos0 - ppr0(theta)
      = p / lambda_pos0 - ppr0(theta)
      = Gradient w.r.t. lambda_pos0 of
        (p log(lambda_pos0) - lambda_pos0 ppr0(theta))
    Gradient w.r.t. lambda_neg0
      = Gradient w.r.t. lambda_neg0 of
        ((1 - p) log(lambda_neg0) - lambda_neg0 npr0(theta))
    Gradient w.r.t. lambda_pos1
      = Gradient w.r.t. lambda_pos1 of
        (p log(lambda_pos1) - lambda_pos1 ppr1(theta))
    Gradient w.r.t. lambda_neg1
      = Gradient w.r.t. lambda_neg1 of
        ((1 - p) log(lambda_neg1) - lambda_neg1 npr1(theta)).
  We do gradient descent on theta, with ppr's and npr's replaced with hinge
  surrogates. We use concave lower bounds on ppr's and npr's, so that when they
  get negated in the updates, we get convex upper bounds.

  See Appendix D.1 in the paper for more details.

  Args:
    train_set: (features, labels, groups)
    additive_slack: float, additive slack on error rate constraint
    learning_rate: float, learning rate for model parameters
    learning_rate_constraint: float, learning rate for Lagrange multipliers
    loops: int, number of iterations

  Returns:
    stochastic_model containing list of models and probabilities,
    deterministic_model.
  """
    x_train, y_train, z_train = train_set
    dimension = x_train.shape[-1]

    tf.reset_default_graph()

    # Data tensors.
    features_tensor = tf.constant(x_train.astype("float32"), name="features")
    labels_tensor = tf.constant(y_train.astype("float32"), name="labels")

    # Linear model.
    weights = tf.Variable(tf.zeros(dimension, dtype=tf.float32),
                          name="weights")
    threshold = tf.Variable(0, name="threshold", dtype=tf.float32)
    predictions_tensor = (tf.tensordot(features_tensor, weights, axes=(1, 0)) +
                          threshold)

    # Group-specific predictions.
    predictions_group0 = tf.boolean_mask(predictions_tensor,
                                         mask=(z_train < 1))
    num_examples0 = np.sum(z_train < 1)
    predictions_group1 = tf.boolean_mask(predictions_tensor,
                                         mask=(z_train > 0))
    num_examples1 = np.sum(z_train > 0)

    # We use the TF Constrained Optimization (TFCO) library to set up the
    # constrained optimization problem. The library doesn't currently support best
    # responses for slack variables. So we maintain explicit Lagrange multipliers
    # for the slack variables, and let the library deal with the Lagrange
    # multipliers for the error rate constraint.

    # Since we need to perform a gradient descent update on the model parameters,
    # and an ascent update on the Lagrange multipliers on the slack variables, we
    # create a single "minimization" objective using stop gradients, where a
    # descent gradient update has the effect of minimizing over the model
    # parameters and maximizing over the Lagrange multipliers for the slack
    # variables. As noted above, the ascent update on the Lagrange multipliers for
    # the error rate constraint is done by the library internally.

    # Placeholders for Lagrange multipliers for the four slack variables.
    lambda_pos0 = tf.Variable(0.5, dtype=tf.float32, name="lambda_pos0")
    lambda_neg0 = tf.Variable(0.5, dtype=tf.float32, name="lambda_neg0")
    lambda_pos1 = tf.Variable(0.5, dtype=tf.float32, name="lambda_pos1")
    lambda_neg1 = tf.Variable(0.5, dtype=tf.float32, name="lambda_neg1")

    # Set up prediction rates and surrogate relaxations on them.
    p = np.mean(y_train)  # Proportion of positives.

    # Positive and negative prediction rates for group 0 and group 1.
    ppr_group0 = tf.reduce_sum(
        tf.cast(
            tf.greater(predictions_group0,
                       tf.zeros(num_examples0, dtype="float32")),
            "float32")) / num_examples0
    npr_group0 = 1 - ppr_group0
    ppr_group1 = tf.reduce_sum(
        tf.cast(
            tf.greater(predictions_group1,
                       tf.zeros(num_examples1, dtype="float32")),
            "float32")) / num_examples1
    npr_group1 = 1 - ppr_group1

    # Hinge concave lower bounds on the positive and negative prediction rates.
    # In the gradient updates, these get negated and become convex upper bounds.
    # For group 0:
    ppr_hinge_group0 = tf.reduce_sum(
        1 - tf.nn.relu(1 - predictions_group0)) * 1.0 / num_examples0
    npr_hinge_group0 = tf.reduce_sum(
        1 - tf.nn.relu(1 + predictions_group0)) * 1.0 / num_examples0
    # For group 1:
    ppr_hinge_group1 = tf.reduce_sum(
        1 - tf.nn.relu(1 - predictions_group1)) * 1.0 / num_examples1
    npr_hinge_group1 = tf.reduce_sum(
        1 - tf.nn.relu(1 + predictions_group1)) * 1.0 / num_examples1

    # Set up KL-divergence objective for constrained optimization.
    # We use stop gradients to ensure that a single descent gradient update on the
    # objective has the effect of minimizing over the model parameters and
    # maximizing over the Lagrange multipliers for the slack variables.

    # KL-divergence for group 0.
    kld_hinge_pos_group0 = (-tf.stop_gradient(lambda_pos0) * ppr_hinge_group0 -
                            p * tf.log(lambda_pos0) +
                            lambda_pos0 * tf.stop_gradient(ppr_group0))
    kld_hinge_neg_group0 = (-tf.stop_gradient(lambda_neg0) * npr_hinge_group0 -
                            (1 - p) * tf.log(lambda_neg0) +
                            lambda_neg0 * tf.stop_gradient(npr_group0))
    kld_hinge_group0 = kld_hinge_pos_group0 + kld_hinge_neg_group0

    # KL-divergence for group 1.
    kld_hinge_pos_group1 = (-tf.stop_gradient(lambda_pos1) * ppr_hinge_group1 -
                            p * tf.log(lambda_pos1) +
                            lambda_pos1 * tf.stop_gradient(ppr_group1))
    kld_hinge_neg_group1 = (-tf.stop_gradient(lambda_neg1) * npr_hinge_group1 -
                            (1 - p) * tf.log(lambda_neg1) +
                            lambda_neg1 * tf.stop_gradient(npr_group1))
    kld_hinge_group1 = kld_hinge_pos_group1 + kld_hinge_neg_group1

    # Wrap the objective into a rate object.
    objective = tfco.wrap_rate(kld_hinge_group0 + kld_hinge_group1)

    # Set up error rate constraint for constrained optimization.
    context = tfco.rate_context(predictions_tensor, labels_tensor)
    error = tfco.error_rate(context)
    constraints = [error <= additive_slack]

    # Create rate minimization problem object.
    problem = tfco.RateMinimizationProblem(objective, constraints)

    # Set up optimizer.
    optimizer = tfco.LagrangianOptimizerV1(
        tf.train.AdamOptimizer(learning_rate=learning_rate),
        constraint_optimizer=tf.train.AdamOptimizer(
            learning_rate=learning_rate_constraint))
    train_op = optimizer.minimize(problem)

    # Start TF session and initialize variables.
    session = tf.Session()
    session.run(tf.global_variables_initializer())

    # We maintain a list of objectives and model weights during training.
    objectives = []
    violations = []
    models = []

    # Perform full gradient updates.
    for ii in range(loops):

        # Gradient updates.
        session.run(train_op)

        # Checkpoint once every 10 iterations.
        if ii % 10 == 0:
            # Model weights.
            model = [session.run(weights), session.run(threshold)]
            models.append(model)

            # Objective.
            klds = evaluation.expected_group_klds(x_train, y_train, z_train,
                                                  [model], [1.0])
            objectives.append(sum(klds))

            # Violation.
            error = evaluation.expected_error_rate(x_train, y_train, [model],
                                                   [1.0])
            violations.append([error - additive_slack])

    # Use the recorded objectives and constraints to find the best iterate.
    best_iterate = tfco.find_best_candidate_index(np.array(objectives),
                                                  np.array(violations))
    deterministic_model = models[best_iterate]

    # Use shrinking to find a sparse distribution over iterates.
    probabilities = tfco.find_best_candidate_distribution(
        np.array(objectives), np.array(violations))
    models_pruned = [
        models[i] for i in range(len(models)) if probabilities[i] > 0.0
    ]
    probabilities_pruned = probabilities[probabilities > 0.0]

    return (models_pruned, probabilities_pruned), deterministic_model
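The closed-form best responses for the slack variables and the resulting Lagrange-multiplier gradients described in the docstring above can be checked in isolation; a minimal numpy sketch with illustrative values:

import numpy as np

# Illustrative values: overall proportion of positives, current positive
# prediction rate for group 0, and current multiplier value.
p = 0.3
ppr0 = 0.4
lambda_pos0 = 0.5

# Best response for the slack variable xi_pos0 (Algorithm 2): p / lambda_pos0.
xi_pos0 = p / lambda_pos0

# Ascent direction for lambda_pos0: best response minus ppr0(theta), which is
# the derivative of (p log(lambda_pos0) - lambda_pos0 ppr0) w.r.t. lambda_pos0.
grad_lambda_pos0 = xi_pos0 - ppr0

# Numeric check of the derivative identity.
f = lambda lam: p * np.log(lam) - lam * ppr0
eps = 1e-6
numeric_grad = (f(lambda_pos0 + eps) - f(lambda_pos0 - eps)) / (2 * eps)
assert np.isclose(grad_lambda_pos0, numeric_grad)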
def train_constrained(dataset,
                      group_info,
                      epsilon=0.01,
                      learning_rate=0.1,
                      dual_scale=5.0,
                      loops=10000,
                      feature_dependent_multiplier=True,
                      hidden_layers=None,
                      skip_steps=400):
    """Train constrained classifier wth Lagrangian model.

  Args:
    dataset: train, vali and test sets
    group_info: group memberships on train, vali and test sets and thresholds
    epsilon: constraint slack
    learning_rate: learning rate for theta
    dual_scale: learning rate for gamma = dual_scale * learning_rate
    loops: number of gradient steps
    feature_dependent_multiplier: should the multiplier model be feature
      dependent. If False, a common multiplier is used for all constraints
    hidden_layers: list of hidden layer nodes to be used for multiplier model
    skip_steps: steps to skip before snapshotting metrics
  """
    tf.set_random_seed(121212)
    np.random.seed(212121)
    random.seed(333333)

    x_train, y_train, z_train, x_vali, y_vali, _, x_test, y_test, _ = dataset

    (group_memberships_list_train, group_memberships_list_vali,
     group_memberships_list_test,
     group_memberships_thresholds_train) = group_info

    # Models and group thresholds tensor.
    model = create_model(x_train.shape[-1])
    multiplier_model, multiplier_weights = create_multiplier_model(
        feature_dependent_multiplier=feature_dependent_multiplier,
        dim=3,
        hidden_layers=hidden_layers)
    group_thresholds = tf.Variable(np.ones(3) * 0.1, dtype=tf.float32)

    # Features, labels, predictions, multipliers.
    features_tensor = tf.constant(x_train)
    labels_tensor = tf.constant(y_train)
    features_tensor_vali = tf.constant(x_vali)

    predictions = lambda: model(features_tensor)
    predictions_vali = lambda: model(features_tensor_vali)
    predictions_test = lambda: model(x_test)

    def multiplier_values():
        return tf.abs(
            multiplier_model(tf.reshape(group_thresholds, shape=(1, -1))))

    # Lagrangian loss function.
    def lagrangian_loss():
        # Separate out objective, constraints and proxy constraints.
        objective = problem.objective()
        constraints = problem.constraints()
        proxy_constraints = problem.proxy_constraints()

        # Set-up custom Lagrangian loss.
        primal = objective
        multipliers = multiplier_values()
        primal += tf.stop_gradient(multipliers) * proxy_constraints
        dual = dual_scale * multipliers * tf.stop_gradient(constraints)
        return primal - dual

    # Objective.
    context = tfco.rate_context(predictions, labels=lambda: labels_tensor)
    overall_error = tfco.error_rate(context)

    # Slice and subset group predictions and labels.
    def group_membership():
        return (z_train[:, 0] > group_thresholds[0]) & (
            z_train[:, 1] > group_thresholds[1]) & (z_train[:, 2] >
                                                    group_thresholds[2])

    def group_predictions():
        pred = predictions()
        groups = tf.reshape(group_membership(), (-1, 1))
        return pred[groups]

    def group_labels():
        groups = tf.reshape(group_membership(), (-1, ))
        return labels_tensor[groups]

    # Constraint.
    group_context = tfco.rate_context(group_predictions, labels=group_labels)
    group_error = tfco.error_rate(group_context)
    constraints = [group_error <= overall_error + epsilon]

    # Set up constrained optimization problem and optimizer.
    problem = tfco.RateMinimizationProblem(overall_error, constraints)
    optimizer = tf.keras.optimizers.Adagrad(learning_rate)
    var_list = model.trainable_weights + multiplier_weights

    objectives_list = []
    objectives_list_test = []
    objectives_list_vali = []
    violations_list = []
    violations_list_test = []
    violations_list_vali = []
    model_weights = []

    # Training
    for ii in range(loops):
        # Sample a group membership at random.
        random_index = np.random.randint(
            group_memberships_thresholds_train.shape[0])
        group_thresholds.assign(
            group_memberships_thresholds_train[random_index, :])

        # Gradient op.
        problem.update_ops()
        optimizer.minimize(lagrangian_loss, var_list=var_list)

        # Snapshot iterate once every skip_steps loops.
        if ii % skip_steps == 0:
            pred = np.reshape(predictions(), (-1, ))
            err = error_rate(y_train, pred)
            max_viol, viol_list = violation(y_train, pred, epsilon,
                                            group_memberships_list_train)

            pred_test = np.reshape(predictions_test(), (-1, ))
            err_test = error_rate(y_test, pred_test)
            _, viol_list_test = violation(y_test, pred_test, epsilon,
                                          group_memberships_list_test)

            pred_vali = np.reshape(predictions_vali(), (-1, ))
            err_vali = error_rate(y_vali, pred_vali)
            max_viol_vali, viol_list_vali = violation(
                y_vali, pred_vali, epsilon, group_memberships_list_vali)

            objectives_list.append(err)
            objectives_list_test.append(err_test)
            objectives_list_vali.append(err_vali)
            violations_list.append(viol_list)
            violations_list_test.append(viol_list_test)
            violations_list_vali.append(viol_list_vali)
            model_weights.append(model.get_weights())

            if ii % 1000 == 0:
                print(
                    "Epoch %d | Error = %.3f | Viol = %.3f | Viol_vali = %.3f"
                    % (ii, err, max_viol, max_viol_vali),
                    flush=True)

    # Best candidate index.
    best_ind = tfco.find_best_candidate_index(np.array(objectives_list),
                                              np.array(violations_list),
                                              rank_objectives=False)
    model.set_weights(model_weights[best_ind])

    print("Train:")
    evaluate(x_train, y_train, model, epsilon, group_memberships_list_train)
    print("\nVali:")
    evaluate(x_vali, y_vali, model, epsilon, group_memberships_list_vali)
    print("\nTest:")
    evaluate(x_test, y_test, model, epsilon, group_memberships_list_test)
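The custom lagrangian_loss above relies on stop-gradient so that a single descent step acts as descent on the model parameters (through the primal term) and ascent on the multipliers (through the subtracted dual term). A minimal sketch of the same primal/dual split on dummy tensors, independent of the helpers above:

import tensorflow as tf

objective = tf.constant(0.20)             # illustrative objective value
constraints = tf.constant([0.03])         # illustrative constraint violations
proxy_constraints = tf.constant([0.05])   # differentiable surrogate violations
multipliers = tf.constant([0.7])          # current (non-negative) multipliers
dual_scale = 5.0

# Primal term: multipliers are held constant, so gradients flow only through
# the objective and the proxy constraints (i.e. the model parameters).
primal = objective + tf.reduce_sum(
    tf.stop_gradient(multipliers) * proxy_constraints)

# Dual term: constraints are held constant, so gradients flow only through the
# multipliers; subtracting it turns a descent step into ascent on them.
dual = dual_scale * tf.reduce_sum(multipliers * tf.stop_gradient(constraints))

lagrangian = primal - dual
print(float(lagrangian))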
Example #5
def run_experiment():
    """Run experiments comparing unconstrained and constrained methods."""
    # Range of hyper-parameters for unconstrained and constrained optimization.
    lr_range_unc = [0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
    lr_range_con = [0.001, 0.01, 0.1, 1.0]

    # Load dataset.
    with open(FLAGS.data_file, "rb") as f:
        train_set, vali_set, test_set = np.load(f,
                                                allow_pickle=True,
                                                fix_imports=True)
    x_vali, y_vali, z_vali = vali_set

    ##################################################
    # Unconstrained Error Optimization.
    print("Running unconstrained error optimization")

    models_unc = []
    param_objectives_unc = []

    # Find best learning rate.
    for lr_model in lr_range_unc:
        model = methods.error_rate_optimizer(train_set,
                                             learning_rate=lr_model,
                                             loops=FLAGS.loops_unc)
        error = evaluation.expected_error_rate(x_vali, y_vali, [model], [1.0])
        param_objectives_unc.append(error)
        models_unc.append(model)

    best_param_index_unc = np.argmin(param_objectives_unc)
    model_er = models_unc[best_param_index_unc]
    print()

    ##################################################
    # Post-shift for Demographic Parity.
    print("Running post-shift for demographic parity")

    # First train logistic regression model.
    models_log = []
    param_objectives_log = []

    # Find best learning rate.
    for lr_model in lr_range_unc:
        model = methods.logistic_regression(train_set,
                                            learning_rate=lr_model,
                                            loops=FLAGS.loops_unc)
        loss = evaluation.cross_entropy_loss(x_vali, y_vali, model[0],
                                             model[1])
        param_objectives_log.append(loss)
        models_log.append(model)

    best_param_index_log = np.argmin(param_objectives_log)
    logreg_model = models_log[best_param_index_log]

    # Post-shift logistic regression model for demographic parity.
    model_ps, train_set_ps, vali_set_ps, test_set_ps = methods.post_shift_dp(
        train_set, vali_set, test_set, logreg_model)
    print()

    ##################################################
    # Surrogate-based Lagrangian Optimizer for Convex Rate Metrics (Algorithm 2).
    print("Running constrained Lagrangian optimization (Algorithm 2)")

    # Set additive slack to unconstrained error * epsilon.
    x_train, y_train, _ = train_set
    error_unc_train = evaluation.expected_error_rate(x_train, y_train,
                                                     [model_er], [1.0])
    additive_slack = error_unc_train * FLAGS.epsilon

    # Maintain list of models, objectives and violations for hyper-parameters.
    stochastic_models_list = []
    deterministic_models_list = []
    param_objectives_con = []
    param_violations_con = []

    # Find best learning rates for model parameters and Lagrange multipliers.
    for lr_model in lr_range_con:
        for lr_constraint in lr_range_con:
            stochastic_model, deterministic_model = (
                methods.lagrangian_optimizer_kld(
                    train_set,
                    learning_rate=lr_model,
                    learning_rate_constraint=lr_constraint,
                    loops=FLAGS.loops_con,
                    additive_slack=additive_slack))
            stochastic_models_list.append(stochastic_model)
            deterministic_models_list.append(deterministic_model)

            # Record objective and constraint violations for stochastic model.
            klds = evaluation.expected_group_klds(x_vali, y_vali, z_vali,
                                                  stochastic_model[0],
                                                  stochastic_model[1])
            param_objectives_con.append(sum(klds))

            error = evaluation.expected_error_rate(x_vali, y_vali,
                                                   stochastic_model[0],
                                                   stochastic_model[1])
            param_violations_con.append([error - additive_slack])

            print("Parameters (%.3f, %.3f): %.3f (%.3f)" %
                  (lr_model, lr_constraint, param_objectives_con[-1],
                   max(param_violations_con[-1])))

    # Best param.
    best_param_index_con = tfco.find_best_candidate_index(
        np.array(param_objectives_con), np.array(param_violations_con))

    stochastic_model_con = stochastic_models_list[best_param_index_con]
    deterministic_model_con = deterministic_models_list[best_param_index_con]
    print()

    # Print summary of performance on test set.
    results = {}
    results["UncError"] = print_results(test_set, [model_er], [1.0],
                                        "UncError")
    error_unc = results["UncError"][1]
    results["PostShift"] = print_results(test_set_ps, [model_ps], [1.0],
                                         "PostShift", error_unc)
    results["Stochastic"] = print_results(test_set, stochastic_model_con[0],
                                          stochastic_model_con[1],
                                          "Constrained (Stochastic)",
                                          error_unc)
    results["Deterministic"] = print_results(test_set,
                                             [deterministic_model_con], [1.0],
                                             "Constrained (Deterministic)",
                                             error_unc)
    print()

    # Print summary of performance on train set.
    results = {}
    results["UncError"] = print_results(train_set, [model_er], [1.0],
                                        "UncError")
    error_unc = results["UncError"][1]
    results["PostShift"] = print_results(train_set_ps, [model_ps], [1.0],
                                         "PostShift", error_unc)
    results["Stochastic"] = print_results(train_set, stochastic_model_con[0],
                                          stochastic_model_con[1],
                                          "Constrained (Stochastic)",
                                          error_unc)
    results["Deterministic"] = print_results(train_set,
                                             [deterministic_model_con], [1.0],
                                             "Constrained (Deterministic)",
                                             error_unc)
    print()

    # Print summary of performance on vali set.
    results = {}
    results["UncError"] = print_results(vali_set, [model_er], [1.0],
                                        "UncError")
    error_unc = results["UncError"][1]
    results["PostShift"] = print_results(vali_set_ps, [model_ps], [1.0],
                                         "PostShift", error_unc)
    results["Stochastic"] = print_results(vali_set, stochastic_model_con[0],
                                          stochastic_model_con[1],
                                          "Constrained (Stochastic)",
                                          error_unc)
    results["Deterministic"] = print_results(vali_set,
                                             [deterministic_model_con], [1.0],
                                             "Constrained (Deterministic)",
                                             error_unc)
def train_model(train_set, params, metric_fn=None, valid_set=None):
  """Set up problem and model."""

  # include id = 0
  np.random.seed(121212 + FLAGS.id)
  random.seed(212121 + FLAGS.id)
  tf.compat.v1.set_random_seed(123456 + FLAGS.id)

  if params['multiplier_type'] == 'unconstrained':
    # Unconstrained optimization.
    constraint_groups = []
    if params['constraint_type'] == 'marginal_equal_opportunity':
      valid_groups = [(0, None), (1, None)]
    elif params['constraint_type'] == 'cross_group_equal_opportunity':
      valid_groups = [(0, 1), (1, 0)]
  else:
    # Constrained optimization.
    if params['constraint_type'] == 'marginal_equal_opportunity':
      constraint_groups = [((0, None), (1, None))]
      valid_groups = [(0, None), (1, None)]
    elif params['constraint_type'] == 'cross_group_equal_opportunity':
      constraint_groups = [((0, 1), (1, 0))]
      valid_groups = [(0, 1), (1, 0)]
    elif params['constraint_type'] == 'custom':
      constraint_groups = params['constraint_groups']
    else:
      constraint_groups = []

  if 'multiplier_dimension' not in params:
    multiplier_dimension = train_set['features'].shape[2] - train_set[
        'dimension']
  else:
    multiplier_dimension = params['multiplier_dimension']

  # Dictionary that will hold batch feature pairs, group pairs and labels for
  # the current batch. We include one query per batch.
  paired_batch = {}
  batch_index = 0  # Index of current query.

  # Data functions.
  features = lambda: paired_batch['features']
  groups = lambda: paired_batch['groups']
  labels = lambda: np.ones(paired_batch['features'].shape[0])

  # Create ranking model and constrained optimization problem.
  problem, ranking_model = formulate_problem(features, groups, labels,
                                             train_set['dimension'],
                                             constraint_groups,
                                             params['constraint_slack'])

  if (params['multiplier_type'] == 'unconstrained') or (
      params['multiplier_type'] == 'common'):
    # Unconstrained optimization or constrained optimization with a common
    # set of Lagrange multipliers for all queries.

    # Create Lagrangian loss for problem with standard TFCO.
    lagrangian_loss, update_ops, multipliers_variables = (
        tfco.create_lagrangian_loss(problem, dual_scale=params['dual_scale']))
    multipliers_variables_list = [multipliers_variables]

    # All paired queries are valid
    check_train_pair = lambda _: True
  else:
    # Constrained optimization with feature-dependent multiplier, or with
    # per-query multipliers, i.e. separate set of multipliers per each query.
    if params['multiplier_type'] == 'feature_dependent':
      # Create multipliers model.
      print('Creating multiplier model with {} features.'.format(
          multiplier_dimension))
      multiplier_model, multipliers = create_multipliers_model(
          features, multiplier_dimension, problem.num_constraints)
      multipliers_variables_list = multiplier_model.trainable_weights
      check_train_pair = lambda x: np.unique(x['groups'], axis=0).shape[0] >= 4
    elif params['multiplier_type'] == 'per-query':
      # Create separate set of multipliers per query.
      multipliers_variables = tf.Variable(
          np.ones((train_set['num_queries'], problem.num_constraints)),
          dtype=tf.float32)

      def multipliers():
        return tf.reshape(multipliers_variables[batch_index, :], (-1,))

      multipliers_variables_list = [multipliers_variables]
      check_train_pair = lambda _: True
    else:
      raise ValueError('Invalid multiplier type')

    # Create Lagrangian loss with multipliers defined above.
    def lagrangian_loss():
      # Separate out objective, constraints and proxy constraints.
      objective = problem.objective()
      constraints = problem.constraints()
      if constraints.shape[0] == 0:
        # If no constraints, just return objective.
        return objective

      # Set up custom Lagrangian loss.
      proxy_constraints = problem.proxy_constraints()
      multipliers_tensor = tf.abs(multipliers())  # Abs enforces non-negativity.

      primal = objective + tf.tensordot(
          tf.stop_gradient(multipliers_tensor), proxy_constraints, 1)
      dual = params['dual_scale'] * tf.tensordot(
          multipliers_tensor, tf.stop_gradient(constraints), 1)

      return primal - dual

    update_ops = problem.update_ops

  # Create optimizer
  if FLAGS.optimizer == 'sgd':
    optimizer = tf.keras.optimizers.SGD(learning_rate=params['learning_rate'])
  elif FLAGS.optimizer == 'adam':
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['learning_rate'])
  else:
    optimizer = tf.keras.optimizers.Adagrad(
        learning_rate=params['learning_rate'])

  # List of trainable variables.
  if params['multiplier_type'] == 'unconstrained':
    var_list = ranking_model.trainable_weights + problem.trainable_variables
  else:
    var_list = (
        ranking_model.trainable_weights + problem.trainable_variables +
        multipliers_variables_list)

  # List of objectives, group constraint violations, per-query constraint
  # violations, and snapshot of models during course of training.
  objectives = []
  group_violations = []
  query_violations = []
  query_violations_full = []
  query_ndcgs = []
  models = []

  features = train_set['features']
  queries = train_set['queries']
  groups = train_set['groups']

  print()
  # Run loops * iterations_per_loop full batch iterations.
  for ii in range(params['loops']):
    for _ in range(params['iterations_per_loop']):

      # Populate paired_batch dict with all pairs for current query. The batch
      # index is the same as the current query index.
      paired_batch = {
          'features': features[queries == batch_index],
          'groups': groups[queries == batch_index]
      }

      # Optimize loss.
      if check_train_pair(paired_batch):
        update_ops()
        optimizer.minimize(lagrangian_loss, var_list=var_list)

      # Update batch_index, and cycle back once the last query is reached.
      batch_index = (batch_index + 1) % train_set['num_queries']
      # print(var_list)

    # Snapshot the current model.
    model_copy = tf.keras.models.clone_model(ranking_model)
    model_copy.set_weights(ranking_model.get_weights())
    models.append(model_copy)

    # Evaluate metrics for snapshotted model.
    # error, gerr, group_viol, query_viol, query_viols = evaluate_results(
    #     ranking_model, train_set, params)
    # sys.stdout.write('\r Evaluating')
    if metric_fn is None:
      error, _, group_viol, query_viol, query_viols = evaluate_results(
          ranking_model, valid_set, params)
      query_ndcgs.append(0)
    else:
      error, group_error, query_error, query_ndcg = metric_fn(
          ranking_model, valid_groups)
      group_viol = [
          group_error[0] - group_error[1], group_error[1] - group_error[0]
      ]
      query_viol = [np.max(np.abs(query_error[:, 0] - query_error[:, 1]))]
      query_viols = [np.abs(query_error[:, 0] - query_error[:, 1])]
      query_ndcgs.append(np.mean(query_ndcg))

    objectives.append(error)
    group_violations.append(
        [x - params['constraint_slack'] for x in group_viol])
    query_violations.append(
        [x - params['constraint_slack'] for x in query_viol])
    query_violations_full.append(
        [x - params['constraint_slack'] for x in query_viols])
    sys.stdout.write(
        '\r Epoch %d: error = %.3f, group violation = %.3f, query violation = %.3f'
        % (ii, objectives[-1], max(
            group_violations[-1]), max(query_violations[-1])))

  print()

  best_index_padding = params['loops'] // 2
  if params['multiplier_type'] == 'unconstrained':
    # Find model iterate that achieves lowest objective.
    best_index = np.argmin(objectives[best_index_padding:]) + best_index_padding
  elif params['multiplier_type'] == 'common':
    # Find model iterate that trades off between objective and group violations.
    best_index = tfco.find_best_candidate_index(
        np.array(objectives[best_index_padding:]),
        np.array(group_violations[best_index_padding:]),
        rank_objectives=False) + best_index_padding
  else:
    # Find model iterate that trades off between objective and per-query
    # violations.
    best_index = tfco.find_best_candidate_index(
        np.array(objectives[best_index_padding:]),
        np.array(query_violations[best_index_padding:]),
        rank_objectives=False) + best_index_padding

  return models[
      best_index], objectives, group_violations, query_violations, query_violations_full, query_ndcgs, best_index
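train_model is driven by a params dict; a hypothetical example of the keys it reads (the values below are illustrative, not taken from the original experiments):

# Hypothetical configuration for train_model; values are for illustration only.
params = {
    'constraint_type': 'marginal_equal_opportunity',  # or 'cross_group_equal_opportunity', 'custom'
    'multiplier_type': 'feature_dependent',  # or 'unconstrained', 'common', 'per-query'
    'constraint_slack': 0.05,
    'dual_scale': 1.0,
    'learning_rate': 0.1,
    'loops': 20,
    'iterations_per_loop': 100,
    # 'multiplier_dimension': 4,   # optional; otherwise inferred from the features
    # 'constraint_groups': [...],  # only used when constraint_type == 'custom'
}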
def lagrangian_optimizer_fmeasure(
    train_set, epsilon, learning_rate, learning_rate_constraint, loops):
  """Implements surrogate-based Lagrangian optimizer (Algorithm 3).

  Specifically solves:
    max F-measure s.t. F-measure(group1) >= F-measure(group0) - epsilon.

  Args:
    train_set: (features, labels, groups)
    epsilon: float, constraint slack.
    learning_rate: float, learning rate for model parameters.
    learning_rate_constraint: float, learning rate for Lagrange multipliers.
    loops: int, number of iterations.

  Returns:
    stochastic_model containing list of models and probabilities,
    deterministic_model.
  """
  x_train, y_train, z_train = train_set
  dimension = x_train.shape[-1]

  tf.reset_default_graph()

  # Data tensors.
  features_tensor = tf.constant(x_train.astype("float32"), name="features")
  labels_tensor = tf.constant(y_train.astype("float32"), name="labels")

  # Linear model.
  weights = tf.Variable(tf.zeros(dimension, dtype=tf.float32),
                        name="weights")
  threshold = tf.Variable(0, name="threshold", dtype=tf.float32)
  predictions_tensor = (tf.tensordot(features_tensor, weights, axes=(1, 0))
                        + threshold)

  # Contexts.
  context = tfco.rate_context(predictions_tensor, labels_tensor)
  context0 = context.subset(z_train < 1)
  context1 = context.subset(z_train > 0)

  # F-measure rates.
  fm_overall = tfco.f_score_lower_bound(context)
  fm1 = tfco.f_score_lower_bound(context1)
  fm0 = tfco.f_score_upper_bound(context0)

  # Rate minimization.
  problem = tfco.RateMinimizationProblem(-fm_overall, [fm0 <= fm1 + epsilon])

  # Optimizer.
  optimizer = tfco.LagrangianOptimizer(
      tf.train.AdamOptimizer(learning_rate=learning_rate),
      constraint_optimizer=tf.train.AdamOptimizer(
          learning_rate=learning_rate_constraint))
  train_op = optimizer.minimize(problem)

  # Start TF session and initialize variables.
  session = tf.Session()
  session.run(tf.global_variables_initializer())

  # We maintain a list of objectives and model weights during training.
  objectives = []
  violations = []
  models = []

  # Perform full gradient updates.
  for ii in range(loops):

    # Gradient updates.
    session.run(train_op)

    # Checkpoint once every 10 iterations.
    if ii % 10 == 0:
      # Model weights.
      model = [session.run(weights), session.run(threshold)]
      models.append(model)

      # Objective.
      objective = -evaluation.expected_fmeasure(
          x_train, y_train, [model], [1.0])
      objectives.append(objective)

      # Violation.
      fmeasure0, fmeasure1 = evaluation.expected_group_fmeasures(
          x_train, y_train, z_train, [model], [1.0])
      violations.append([fmeasure0 - fmeasure1 - epsilon])

  # Use the recorded objectives and constraints to find the best iterate.
  best_iterate = tfco.find_best_candidate_index(
      np.array(objectives), np.array(violations))
  deterministic_model = models[best_iterate]

  # Use shrinking to find a sparse distribution over iterates.
  probabilities = tfco.find_best_candidate_distribution(
      np.array(objectives), np.array(violations))
  models_pruned = [models[i] for i in range(len(models)) if
                   probabilities[i] > 0.0]
  probabilities_pruned = probabilities[probabilities > 0.0]

  return (models_pruned, probabilities_pruned), deterministic_model
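The stochastic model returned above is a list of linear (weights, threshold) models with associated probabilities. A minimal sketch, assuming such a pair, of how its expected positive-prediction rate per example could be computed (the actual experiments use the evaluation helpers instead):

import numpy as np

def expected_positive_predictions(x, models, probabilities):
    # Probability-weighted chance that the stochastic classifier predicts
    # positive on each example, for linear (weights, threshold) models.
    expected = np.zeros(x.shape[0])
    for (weights, threshold), prob in zip(models, probabilities):
        scores = np.dot(x, weights) + threshold
        expected += prob * (scores > 0).astype(float)
    return expected

# Usage sketch (names follow the function above; the data would be made up):
# models_pruned, probabilities_pruned = stochastic_model
# print(expected_positive_predictions(x_test, models_pruned, probabilities_pruned))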
def run_experiment():
    """Run experiments comparing unconstrained and constrained methods."""
    # Range of hyper-parameters for unconstrained and constrained optimization.
    lr_range_unc = [0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
    lr_range_con = [0.001, 0.01, 0.1, 1.0]

    # Load dataset.
    with open(FLAGS.data_file, "rb") as f:
        train_set, vali_set, test_set = np.load(f,
                                                allow_pickle=True,
                                                fix_imports=True)
    x_vali, y_vali, z_vali = vali_set

    ##################################################
    # Unconstrained Error Optimization.
    print("Running unconstrained error optimization")

    models_unc = []
    param_objectives_unc = []

    # Find best learning rate.
    for lr_model in lr_range_unc:
        model = methods.error_rate_optimizer(train_set,
                                             learning_rate=lr_model,
                                             loops=FLAGS.loops_unc)
        error = evaluation.expected_error_rate(x_vali, y_vali, [model], [1.0])
        param_objectives_unc.append(error)
        models_unc.append(model)

    best_param_index_unc = np.argmin(param_objectives_unc)
    model_er = models_unc[best_param_index_unc]
    print()

    ##################################################
    # Post-shift F1 Optimization.
    print("Running unconstrained F-measure optimization (Post-shift)")

    # First train logistic regression model.
    models_log = []
    param_objectives_log = []

    # Find best learning rate.
    for lr_model in lr_range_unc:
        model = methods.logistic_regression(train_set,
                                            learning_rate=lr_model,
                                            loops=FLAGS.loops_unc)
        loss = evaluation.cross_entropy_loss(x_vali, y_vali, model[0],
                                             model[1])
        param_objectives_log.append(loss)
        models_log.append(model)

    best_param_index_log = np.argmin(param_objectives_log)
    logreg_model = models_log[best_param_index_log]

    # Post-shift logistic regression model to optimize F-measure.
    model_ps = methods.post_shift_fmeasure(vali_set, logreg_model)
    print()

    ##################################################
    # Surrogate-based Lagrangian Optimizer for Sums-of-ratios (Algorithm 3).
    print("Running constrained Lagrangian optimization (Algorithm 3)")

    # Maintain list of models, objectives and violations for hyper-parameters.
    stochastic_models_list = []
    deterministic_models_list = []
    param_objectives_con = []
    param_violations_con = []

    # Find best learning rates for model parameters and Lagrange multipliers.
    for lr_model in lr_range_con:
        for lr_constraint in lr_range_con:
            stochastic_model, deterministic_model = (
                methods.lagrangian_optimizer_fmeasure(
                    train_set,
                    learning_rate=lr_model,
                    learning_rate_constraint=lr_constraint,
                    loops=FLAGS.loops_con,
                    epsilon=FLAGS.epsilon))
            stochastic_models_list.append(stochastic_model)
            deterministic_models_list.append(deterministic_model)

            # Record objective and constraint violations for stochastic model.
            fm = -evaluation.expected_fmeasure(
                x_vali, y_vali, stochastic_model[0], stochastic_model[1])
            param_objectives_con.append(fm)

            fm0, fm1 = evaluation.expected_group_fmeasures(
                x_vali, y_vali, z_vali, stochastic_model[0],
                stochastic_model[1])
            param_violations_con.append([fm0 - fm1 - FLAGS.epsilon])

            print("Parameters (%.3f, %.3f): %.3f (%.3f)" %
                  (lr_model, lr_constraint, -param_objectives_con[-1],
                   max(param_violations_con[-1])))

    # Best param.
    best_param_index_con = tfco.find_best_candidate_index(
        np.array(param_objectives_con), np.array(param_violations_con))

    stochastic_model_con = stochastic_models_list[best_param_index_con]
    deterministic_model_con = deterministic_models_list[best_param_index_con]
    print()

    # Print summary of performance on test set.
    results = {}
    results["UncError"] = print_results(test_set, [model_er], [1.0],
                                        "UncError")
    results["UncF1"] = print_results(test_set, [model_ps], [1.0], "UncF1")
    results["Stochastic"] = print_results(test_set, stochastic_model_con[0],
                                          stochastic_model_con[1],
                                          "Constrained (Stochastic)")
    results["Deterministic"] = print_results(test_set,
                                             [deterministic_model_con], [1.0],
                                             "Constrained (Deterministic)")
Example #9
    def lagrangian_optimizer(train_set,
                             epsilon=epsilon,
                             learning_rate=0.01,
                             learning_rate_constraint=0.01,
                             loops=2000):
        tf.reset_default_graph()

        x_train, y_train, z_train = train_set
        num_examples = x_train.shape[0]
        dimension = x_train.shape[-1]

        # Data tensors.
        features_tensor = tf.constant(x_train.astype("float32"),
                                      name="features")
        labels_tensor = tf.constant(y_train.astype("float32"), name="labels")

        # Linear model.
        weights = tf.Variable(tf.zeros(dimension, dtype=tf.float32),
                              name="weights")
        threshold = tf.Variable(0, name="threshold", dtype=tf.float32)
        predictions_tensor = (
            tf.tensordot(features_tensor, weights, axes=(1, 0)) + threshold)

        predictions_group0 = tf.boolean_mask(predictions_tensor,
                                             mask=(z_train < 1))
        num0 = np.sum(z_train < 1)
        predictions_group1 = tf.boolean_mask(predictions_tensor,
                                             mask=(z_train > 0))
        num1 = np.sum(z_train > 0)

        # Set up rates.
        context = tfco.rate_context(predictions_tensor, labels_tensor)
        true_positive_rate = tfco.true_positive_rate(context)
        true_negative_rate = tfco.true_negative_rate(context)

        context0 = context.subset(z_train < 1)
        true_positive_rate0 = tfco.true_positive_rate(context0)

        context1 = context.subset(z_train > 0)
        true_positive_rate1 = tfco.true_positive_rate(context1)

        # Set up slack variables.
        slack_tpr = tf.Variable(0.5, dtype=tf.float32)
        slack_tnr = tf.Variable(0.5, dtype=tf.float32)

        # Projection ops for slacks.
        projection_ops = []
        projection_ops.append(
            tf.assign(slack_tpr, tf.clip_by_value(slack_tpr, 0.001, 0.999)))
        projection_ops.append(
            tf.assign(slack_tnr, tf.clip_by_value(slack_tnr, 0.001, 0.999)))

        # Set up 1 - G-mean objective.
        objective = tfco.wrap_rate(1.0 - tf.sqrt(slack_tpr * slack_tnr))

        # Set up slack constraints.
        constraints = []
        constraints.append(tfco.wrap_rate(slack_tpr) <= true_positive_rate)
        constraints.append(tfco.wrap_rate(slack_tnr) <= true_negative_rate)

        # Set up fairness equal-opportunity constraints.
        constraints.append(
            true_positive_rate0 <= true_positive_rate1 + epsilon)
        constraints.append(
            true_positive_rate1 <= true_positive_rate0 + epsilon)

        # Set up constraint optimization problem.
        problem = tfco.RateMinimizationProblem(objective, constraints)

        # Set up solver.
        optimizer = tf.train.AdamOptimizer(learning_rate)
        constraint_optimizer = tf.train.AdamOptimizer(learning_rate_constraint)
        lagrangian_optimizer = tfco.ProxyLagrangianOptimizerV1(
            optimizer=optimizer, constraint_optimizer=constraint_optimizer)
        train_op = lagrangian_optimizer.minimize(problem)

        # Start TF session and initialize variables.
        session = tf.Session()
        tf.set_random_seed(654321)  # Set random seed for reproducibility.
        session.run(tf.global_variables_initializer())

        # We maintain a list of objectives and model weights during training.
        objectives = []
        violations = []
        models = []

        # Perform full gradient updates.
        for ii in range(loops):
            # Gradient update.
            session.run(train_op)
            # Projection.
            session.run(projection_ops)

            # Checkpoint once every 100 iterations.
            if ii % 100 == 0:
                # Model weights.
                model = [session.run(weights), session.run(threshold)]
                models.append(model)

                # Snapshot performance.
                error, tpr0, tpr1 = evaluate_expected_results(
                    train_set, [model], [1.0])
                objectives.append(error)
                violations.append(
                    [tpr0 - tpr1 - epsilon, tpr1 - tpr0 - epsilon])

        # Use the recorded objectives and constraints to find the best iterate.
        # Best model
        best_iterate = tfco.find_best_candidate_index(np.array(objectives),
                                                      np.array(violations))
        best_model = models[best_iterate]

        # Stochastic model over a subset of classifiers.
        probabilities = tfco.find_best_candidate_distribution(
            np.array(objectives), np.array(violations))
        models_pruned = [
            models[i] for i in range(len(models)) if probabilities[i] > 0.0
        ]
        probabilities_pruned = probabilities[probabilities > 0.0]

        # Stochastic model over all classifiers.
        probabilities_all = probabilities * 0.0 + 1.0 / len(probabilities)

        # Return Pruned models, Avg models, Best model
        results = {
            'stochastic': (models, probabilities_all),
            'pruned': (models_pruned, probabilities_pruned),
            'best': best_model,
            'objectives': objectives,
            'violations': violations
        }
        return results
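The slack-variable construction above yields a lower bound on the G-mean: with slack_tpr <= TPR, slack_tnr <= TNR and all quantities in (0, 1), sqrt(slack_tpr * slack_tnr) <= sqrt(TPR * TNR), so minimizing 1 - sqrt(slack_tpr * slack_tnr) under those constraints maximizes a lower bound on the G-mean. A tiny numeric check with illustrative values:

import numpy as np

tpr, tnr = 0.8, 0.7                 # illustrative true positive / true negative rates
slack_tpr, slack_tnr = 0.75, 0.65   # feasible slacks (each <= the corresponding rate)

gmean = np.sqrt(tpr * tnr)
gmean_lower_bound = np.sqrt(slack_tpr * slack_tnr)
assert gmean_lower_bound <= gmean
print(gmean_lower_bound, gmean)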
Example #10
def cross_val():
    # New class implementation.
    xnp, xp, y = data_gen.load_adult()

    results = {
        'Last_iterate_train_acc': [],
        'Last_iterate_test_acc': [],
        'Last_iterate_train_ct': [],
        'Last_iterate_test_ct': [],
        'Best_iterate_train_acc': [],
        'Best_iterate_test_acc': [],
        'Best_iterate_train_ct': [],
        'Best_iterate_test_ct': [],
        'Stoch_iterate_train_acc': [],
        'Stoch_iterate_test_acc': [],
        'Stoch_iterate_train_ct': [],
        'Stoch_iterate_test_ct': [],
    }
    nfolds = 5
    fsize = int(np.ceil(len(xnp) / nfolds))
    for fidx in range(nfolds):
        print(f'\n### Processing fold {fidx}')

        # Build a full index set
        idx = np.arange(len(xnp))
        # Separate index sets
        tridx = np.hstack((idx[:fidx * fsize], idx[(fidx + 1) * fsize:]))
        tsidx = idx[fidx * fsize:(fidx + 1) * fsize]

        # Separate training and test data
        xptr = xp[tridx]
        ytr = y[tridx]
        xpts = xp[tsidx]
        yts = y[tsidx]

        # Standardize train set.
        scl = MinMaxScaler()
        xnptr = scl.fit_transform(xnp[tridx])
        xnpts = scl.transform(xnp[tsidx])

        # Add protected features.
        xtr = np.hstack([xnptr, xptr])
        xts = np.hstack([xnpts, xpts])

        scl = MinMaxScaler()
        ytr = scl.fit_transform(ytr)
        yts = scl.transform(yts)

        print("Computing indicator matrices.")
        I_train = utils.compute_indicator_matrix_c(xptr)
        I_test = utils.compute_indicator_matrix_c(xpts)
        didi_tr = utils.didi_c(ytr, I_train)
        didi_ts = utils.didi_c(yts, I_test)

        tfco_model = TFCOFairCls(input_dim=xtr.shape[1],
                                 output_dim=1,
                                 I_train=I_train,
                                 didi_tr=didi_tr)

        minibatch_size = 200
        iterations_per_loop = 200
        loops = 100

        train_pred, test_pred = tfco_model._full_training(
            xtr, xts, ytr, minibatch_size, iterations_per_loop, loops)

        train_errors = []
        train_violations = []
        train_didi = []
        train_acc = []

        for p in train_pred:
            p_class = (1 + np.sign(p)) / 2
            err, viol = _get_error_rate_and_didi(p, ytr.reshape(-1, 1),
                                                 didi_tr, I_train)
            acc = accuracy_score(ytr, p_class)
            didi = utils.didi_r(p_class, I_train) / didi_tr
            train_errors.append(err)
            train_violations.append(viol)
            train_didi.append(didi)
            train_acc.append(acc)

        test_errors = []
        test_violations = []
        test_didi = []
        test_acc = []

        for p in test_pred:
            p_class = (1 + np.sign(p)) / 2
            err, viol = _get_error_rate_and_didi(p, yts.reshape(-1, 1),
                                                 didi_ts, I_test)
            acc = accuracy_score(yts, p_class)
            didi = utils.didi_r(p_class, I_test) / didi_ts
            test_errors.append(err)
            test_violations.append(viol)
            test_didi.append(didi)
            test_acc.append(acc)

        train_violations = np.array(train_violations)
        print("Train Acc.", train_acc[-1])
        print("Train DIDI.", train_didi[-1])

        print("Test Acc.", test_acc[-1])
        print("Test DIDI.", test_didi[-1])

        print("Improving using Best Iterate instead of Last Iterate.")
        #
        # As discussed in [[CotterEtAl18b]](https://arxiv.org/abs/1809.04198), the last iterate may not be the best
        # choice, and the paper suggests a simple heuristic for choosing the best iterate among the ones found after
        # each epoch. The heuristic ranks each solution based on accuracy and fairness separately, with respect to
        # the training data. Any solutions that satisfy the constraints are ranked equally best in terms of fairness.
        # Each solution thus has two ranks, and the chosen solution is the one with the smallest maximum of the two
        # ranks. We see that this improves fairness and can find a better accuracy / fairness trade-off on the
        # training data.
        #
        # This solution can be calculated using find_best_candidate_index, given the list of training errors and
        # violations associated with each of the epochs.

        best_cand_index = tfco.find_best_candidate_index(
            train_errors, train_violations)

        print("Train Acc.", train_acc[best_cand_index])
        print("Train DIDI.", train_didi[best_cand_index])

        print("Test Acc.", test_acc[best_cand_index])
        print("Test DIDI.", test_acc[best_cand_index])

        print("m-stochastic solution.")
        # [[CoJiSr19]](https://arxiv.org/abs/1804.06500) presents a method which shrinks the T-stochastic solution
        # down to one that is supported on at most (m+1) points, where m is the number of constraints, and which is
        # guaranteed to be at least as good as the T-stochastic solution. Here we see that there is indeed a benefit
        # in performing the shrinking.
        #
        # This solution can be computed using find_best_candidate_distribution by passing in the training errors and
        # violations found at each epoch; it returns the weight of each constituent. We see that it is indeed sparse.

        cand_dist = tfco.find_best_candidate_distribution(
            train_errors, train_violations)
        print(cand_dist)

        m_stoch_train_acc = np.dot(cand_dist, train_acc)
        m_stoch_train_didi = np.dot(cand_dist, train_didi)
        m_stoch_test_acc = np.dot(cand_dist, test_acc)
        m_stoch_test_didi = np.dot(cand_dist, test_didi)

        print("Train Acc", m_stoch_train_acc)
        print("Train DIDI", m_stoch_train_didi)
        print("Test Acc", m_stoch_test_acc)
        print("Test DIDI", m_stoch_test_didi)

        results['Last_iterate_train_acc'].append(train_acc[-1])
        results['Last_iterate_test_acc'].append(test_acc[-1])
        results['Last_iterate_train_ct'].append(train_didi[-1])
        results['Last_iterate_test_ct'].append(test_didi[-1])

        results['Best_iterate_train_acc'].append(train_acc[best_cand_index])
        results['Best_iterate_test_acc'].append(test_acc[best_cand_index])
        results['Best_iterate_train_ct'].append(train_didi[best_cand_index])
        results['Best_iterate_test_ct'].append(test_didi[best_cand_index])

        results['Stoch_iterate_train_acc'].append(m_stoch_train_acc)
        results['Stoch_iterate_test_acc'].append(m_stoch_test_acc)
        results['Stoch_iterate_train_ct'].append(m_stoch_train_didi)
        results['Stoch_iterate_test_ct'].append(m_stoch_test_didi)

    for k, val in results.items():
        print(k, np.mean(val), np.std(val))
Example #11
            'violations': violations
        }
        return results

    lr_range = [0.01, 0.1, 0.5, 1.0]
    grid = [(xx, yy) for xx in lr_range for yy in lr_range]
    objectives = []
    violations = []

    for (lr, lr_con) in grid:
        # print(lr, lr_con)
        results = lagrangian_optimizer(train_set,
                                       epsilon=epsilon,
                                       learning_rate=lr,
                                       learning_rate_constraint=lr_con)
        error, tpr0, tpr1 = evaluate_expected_results(train_set,
                                                      results['pruned'][0],
                                                      results['pruned'][1])
        objectives.append(error)
        violations.append([tpr0 - tpr1 - epsilon, tpr1 - tpr0 - epsilon])

    best_index = tfco.find_best_candidate_index(np.array(objectives),
                                                np.array(violations),
                                                rank_objectives=True)
    results = lagrangian_optimizer(
        train_set,
        epsilon=epsilon,
        learning_rate=grid[best_index][0],
        learning_rate_constraint=grid[best_index][1])
    print_results(train_set, test_set, results['pruned'],
                  results['objectives'], results['violations'])
Example #12
0
def cross_val():

    x, y = data_gen.load_dota_data()
    print("Dota")
    results = {
        'Last_iterate_train_acc': [],
        'Last_iterate_test_acc': [],
        'Last_iterate_train_ct': [],
        'Last_iterate_test_ct': [],
        'Best_iterate_train_acc': [],
        'Best_iterate_test_acc': [],
        'Best_iterate_train_ct': [],
        'Best_iterate_test_ct': [],
        'Stoch_iterate_train_acc': [],
        'Stoch_iterate_test_acc': [],
        'Stoch_iterate_train_ct': [],
        'Stoch_iterate_test_ct': [],
    }

    # Process the folds
    nfolds = 5
    fsize = int(np.ceil(len(x) / nfolds))
    for fidx in range(nfolds):
        print(f'\n### Processing fold {fidx}')
        # Build a full index set
        idx = np.arange(len(x))
        # Separate index sets
        tridx = np.hstack((idx[:fidx * fsize], idx[(fidx + 1) * fsize:]))
        tsidx = idx[fidx * fsize:(fidx + 1) * fsize]
        # Separate training and test data
        xtr = x[tridx]
        ytr = y[tridx]
        xts = x[tsidx]
        yts = y[tsidx]
        # Standardize
        # for cidx in range(x.shape[1]):
        scl = StandardScaler()
        xtr = scl.fit_transform(xtr)
        xts = scl.transform(xts)

        num_samples = len(xtr)
        num_classes = len(np.unique(ytr))
        tfco_model = TFCOFairBal(input_dim=xtr.shape[1], output_dim=num_classes,
                                 num_classes=num_classes, num_samples=num_samples)

        # Fitting.
        # train_errors, train_violations = tfco_model.fit(xtr, ytr)
        # train_errors, train_violations = np.array(train_errors), np.array(train_violations)

        # test_errors, test_violations = tfco_model.predict_err(x_ts.values, y_ts.values)
        # test_preds = tfco_model.predict(xts)
        # test_errors, test_violations = _get_error_rate_and_constraints(
        #     test_preds, yts, didi_ts, I_test)

        minibatch_size = 200
        iterations_per_loop = 200
        loops = 100

        train_pred, test_pred = tfco_model._full_training(xtr, xts, ytr,
                                                          minibatch_size, iterations_per_loop, loops)

        train_errors = []
        train_violations = []
        train_acc = []
        train_std = []

        for p in train_pred:
            err, viol = _get_error_rate_and_constraints(p, ytr)
            acc = accuracy_score(ytr, p)
            cnts = np.array([np.sum(p == c) for c in range(num_classes)])
            std = np.std(cnts / np.sum(cnts))
            train_errors.append(err)
            train_violations.append(viol)
            train_acc.append(acc)
            train_std.append(std)

        test_errors = []
        test_violations = []
        test_acc = []
        test_std = []

        for p in test_pred:
            err, viol = _get_error_rate_and_constraints(p, yts)
            acc = accuracy_score(yts, p)
            cnts = np.array([np.sum(p == c) for c in range(num_classes)])
            std = np.std(cnts / np.sum(cnts))
            test_errors.append(err)
            test_violations.append(viol)
            test_acc.append(acc)
            test_std.append(std)

        train_violations = np.array(train_violations)
        print("Train Error", train_errors[-1])
        print("Train Violation", max(train_violations[-1]))
        print("Train Acc.", train_acc[-1])
        print("Train Std.", train_std[-1])

        print("Test Error", test_errors[-1])
        print("Test Violation", max(test_violations[-1]))
        print("Test Acc,", test_acc[-1])
        print("Test Std.", test_std[-1])

        print("Improving using Best Iterate instead of Last Iterate.")
        #
        # As discussed in [[CotterEtAl18b]](https://arxiv.org/abs/1809.04198), the last iterate may not be the best choice,
        # and the paper suggests a simple heuristic to choose the best iterate out of the ones found after each epoch.
        # The heuristic ranks each solution separately by accuracy and by fairness with respect to the training data;
        # any solutions which satisfy the constraints are all ranked top in terms of fairness.
        # Each solution thus has two ranks, and the chosen solution is the one with the smallest maximum of the two ranks.
        # We see that this improves fairness and can find a better accuracy / fairness trade-off on the training data.
        #
        # This solution can be calculated using find_best_candidate_index, given the lists of training errors and violations
        # associated with each of the epochs.

        best_cand_index = tfco.find_best_candidate_index(train_errors, train_violations)
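        # A minimal numpy sketch of the ranking idea described above, assuming
        # train_errors / train_violations are the per-epoch lists built above;
        # the tfco call above is what the example actually uses, and it may
        # break ties differently.
        def _min_rank(vals):
            # 0-based rank; tied values share the smallest rank.
            svals = np.sort(vals)
            return np.searchsorted(svals, vals, side='left')

        _vios = np.maximum(np.max(np.array(train_violations), axis=1), 0.0)
        _sketch_index = int(np.argmin(np.maximum(
            _min_rank(np.array(train_errors)), _min_rank(_vios))))
        print("Ranking sketch picks iterate", _sketch_index,
              "| tfco picks", best_cand_index)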

        # print("Train Error", train_errors[best_cand_index])
        # print("Train Violation", max(train_violations[best_cand_index]))
        print("Train Acc.", train_acc[best_cand_index])
        print("Train Std.", train_std[best_cand_index])

        # print("Test Error", test_errors[best_cand_index])
        # print("Test Violation", max(test_violations[best_cand_index]))
        print("Test Acc.", test_acc[best_cand_index])
        print("Test Std.", test_std[best_cand_index])

        print("m-stochastic solution.")
        # [[CoJiSr19]](https://arxiv.org/abs/1804.06500) presents a method which shrinks the T-stochastic solution
        # down to one supported on at most (m+1) points, where m is the number of constraints, and which is guaranteed
        # to be at least as good as the T-stochastic solution.
        # Here we see that there is indeed a benefit in performing the shrinking.
        #
        # This solution can be computed with find_best_candidate_distribution by passing in the training errors and
        # violations found at each epoch; it returns the weight of each constituent. We see that it is indeed sparse.

        cand_dist = tfco.find_best_candidate_distribution(train_errors, train_violations)
        print(cand_dist)
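        # To deploy the m-stochastic classifier, one would sample one of the
        # supported iterates with probability cand_dist at prediction time; a
        # minimal sketch reusing the stored per-epoch training predictions.
        rng = np.random.default_rng(0)
        sampled_iterate = int(rng.choice(len(cand_dist),
                                         p=cand_dist / np.sum(cand_dist)))
        print("Sampled iterate", sampled_iterate, "| train acc:",
              accuracy_score(ytr, train_pred[sampled_iterate]))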

        # m_stoch_error_train, m_stoch_violations_train = _get_exp_error_rate_constraints(cand_dist, train_errors,
        #                                                                                 train_violations)
        # m_stoch_error_test, m_stoch_violations_test = _get_exp_error_rate_constraints(cand_dist, test_errors,
        #                                                                               test_violations)

        m_stoch_train_r2 = np.dot(cand_dist, train_acc)
        m_stoch_train_didi = np.dot(cand_dist, train_std)
        m_stoch_test_r2 = np.dot(cand_dist, test_acc)
        m_stoch_test_didi = np.dot(cand_dist, test_std)

        print("Train Acc.", m_stoch_train_r2)
        print("Train Std.", m_stoch_train_didi)
        print("Test Acc.", m_stoch_test_r2)
        print("Test Std.", m_stoch_test_didi)

        results['Last_iterate_train_acc'].append(train_acc[-1])
        results['Last_iterate_test_acc'].append(test_acc[-1])
        results['Last_iterate_train_ct'].append(train_std[-1])
        results['Last_iterate_test_ct'].append(test_std[-1])

        results['Best_iterate_train_acc'].append(train_acc[best_cand_index])
        results['Best_iterate_test_acc'].append(test_acc[best_cand_index])
        results['Best_iterate_train_ct'].append(train_std[best_cand_index])
        results['Best_iterate_test_ct'].append(test_std[best_cand_index])

        results['Stoch_iterate_train_acc'].append(m_stoch_train_r2)
        results['Stoch_iterate_test_acc'].append(m_stoch_test_r2)
        results['Stoch_iterate_train_ct'].append(m_stoch_train_didi)
        results['Stoch_iterate_test_ct'].append(m_stoch_test_didi)

    for k, val in results.items():
        print(k, np.mean(val), np.std(val))
Example #13
0
def test():

    # Data with our preprocessing routines.
    from sklearn.preprocessing import MinMaxScaler

    x, y = data_gen.load_shuttle_data()
    scl = MinMaxScaler()
    train_pts = int(0.8 * len(x))
    xtr = scl.fit_transform(x[:train_pts])
    xts = scl.transform(x[train_pts:])
    ytr = y[:train_pts]
    yts = y[train_pts:]

    num_samples = len(xtr)
    num_classes = len(np.unique(ytr))
    tfco_model = TFCOFairBal(input_dim=xtr.shape[1], output_dim=num_classes,
                             num_classes=num_classes, num_samples=num_samples)

    # Fitting.
    # train_errors, train_violations = tfco_model.fit(xtr, ytr)
    # train_errors, train_violations = np.array(train_errors), np.array(train_violations)

    # test_errors, test_violations = tfco_model.predict_err(x_ts.values, y_ts.values)
    # test_preds = tfco_model.predict(xts)
    # test_errors, test_violations = _get_error_rate_and_constraints(
    #     test_preds, yts, didi_ts, I_test)

    minibatch_size = 200
    iterations_per_loop = 200
    loops = 100

    train_pred, test_pred = tfco_model._full_training(xtr, xts, ytr,
                                                      minibatch_size, iterations_per_loop, loops)

    train_errors = []
    train_violations = []
    train_acc = []
    train_std = []

    for p in train_pred:
        err, viol = _get_error_rate_and_constraints(p, ytr)
        acc = accuracy_score(ytr, p)
        cnts = np.array([np.sum(p == c) for c in range(num_classes)])
        std = np.std(cnts / np.sum(cnts))
        train_errors.append(err)
        train_violations.append(viol)
        train_acc.append(acc)
        train_std.append(std)

    test_errors = []
    test_violations = []
    test_acc = []
    test_std = []

    for p in test_pred:
        err, viol = _get_error_rate_and_constraints(p, yts)
        acc = accuracy_score(yts, p)
        cnts = np.array([np.sum(p == c) for c in range(num_classes)])
        std = np.std(cnts / np.sum(cnts))
        test_errors.append(err)
        test_violations.append(viol)
        test_acc.append(acc)
        test_std.append(std)

    train_violations = np.array(train_violations)
    print("Train Error", train_errors[-1])
    print("Train Violation", max(train_violations[-1]))
    print("Train Acc.", train_acc[-1])
    print("Train Std.", train_std[-1])

    print("Test Error", test_errors[-1])
    print("Test Violation", max(test_violations[-1]))
    print("Test Acc,", test_acc[-1])
    print("Test Std.", test_std[-1])

    print("Improving using Best Iterate instead of Last Iterate.")
    #
    # As discussed in [[CotterEtAl18b]](https://arxiv.org/abs/1809.04198), the last iterate may not be the best choice
    # and suggests a simple heuristic to choose the best iterate out of the ones found after each epoch.
    # The heuristic proceeds by ranking each of the solutions based on accuracy and fairness separately with respect to
    # the training data. Any solutions which satisfy the constraints are equally ranked top in terms fairness.
    # Each solution thus has two ranks. Then, the chosen solution is the one with the smallest maximum of the two ranks.
    # We see that this improves the fairness and can find a better accuracy / fairness trade-off on the training data.
    #
    # This solution can be calculated using find_best_candidate_index given the list of training errors and violations
    # associated with each of the epochs.

    best_cand_index = tfco.find_best_candidate_index(train_errors, train_violations)
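    # For comparison, a naive alternative would take the lowest-error iterate
    # among those with no constraint violation (falling back to the last
    # iterate if none is feasible); a minimal sketch:
    _feasible = np.where(np.max(train_violations, axis=1) <= 0)[0]
    if len(_feasible) > 0:
        _naive_index = int(_feasible[np.argmin(np.array(train_errors)[_feasible])])
    else:
        _naive_index = len(train_errors) - 1
    print("Naive feasible pick:", _naive_index, "| tfco pick:", best_cand_index)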

    # print("Train Error", train_errors[best_cand_index])
    # print("Train Violation", max(train_violations[best_cand_index]))
    print("Train Acc.", train_acc[best_cand_index])
    print("Train Std.", train_std[best_cand_index])

    # print("Test Error", test_errors[best_cand_index])
    # print("Test Violation", max(test_violations[best_cand_index]))
    print("Test Acc.", test_acc[best_cand_index])
    print("Test Std.", test_std[best_cand_index])

    print("m-stochastic solution.")
    # [[CoJiSr19]](https://arxiv.org/abs/1804.06500) presents a method which shrinks the T-stochastic solution
    # down to one supported on at most (m+1) points, where m is the number of constraints, and which is guaranteed
    # to be at least as good as the T-stochastic solution.
    # Here we see that there is indeed a benefit in performing the shrinking.
    #
    # This solution can be computed with find_best_candidate_distribution by passing in the training errors and
    # violations found at each epoch; it returns the weight of each constituent. We see that it is indeed sparse.

    cand_dist = tfco.find_best_candidate_distribution(train_errors, train_violations)
    print(cand_dist)
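    # Note that cand_dist is chosen from training metrics only; its expected
    # violation on held-out data need not be non-positive. A quick check on
    # both splits:
    print("Expected max violation (train):",
          float(np.max(np.dot(cand_dist, train_violations))))
    print("Expected max violation (test):",
          float(np.max(np.dot(cand_dist, np.array(test_violations)))))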

    # m_stoch_error_train, m_stoch_violations_train = _get_exp_error_rate_constraints(cand_dist, train_errors,
    #                                                                                 train_violations)
    # m_stoch_error_test, m_stoch_violations_test = _get_exp_error_rate_constraints(cand_dist, test_errors,
    #                                                                               test_violations)

    m_stoch_train_r2 = np.dot(cand_dist, train_acc)
    m_stoch_train_didi = np.dot(cand_dist, train_std)
    m_stoch_test_r2 = np.dot(cand_dist, test_acc)
    m_stoch_test_didi = np.dot(cand_dist, test_std)

    print("Train Acc.", m_stoch_train_r2)
    print("Train Std.", m_stoch_train_didi)
    print("Test Acc.", m_stoch_test_r2)
    print("Test Std.", m_stoch_test_didi)
    def coco(train_set,
             test_set,
             model_unc,
             epsilon=0.05,
             learning_rate=0.5,
             loops=10):
        delta_ = 1e-10
        # Update the Lagrange multipliers once every T^(1/3) iterations,
        # where T is the total number of loops.
        skip_iter = int(np.cbrt(loops))

        # Datasets
        x_train, y_train, z_train = train_set
        # Validation set is the same as the training set.
        x_vali, y_vali, z_vali = train_set
        x_test, y_test, z_test = test_set

        # Append z to x as a column.
        x_train_ = np.concatenate([x_train, z_train.reshape(-1, 1)], axis=1)
        x_vali_ = np.concatenate([x_vali, z_vali.reshape(-1, 1)], axis=1)
        x_test_ = np.concatenate([x_test, z_test.reshape(-1, 1)], axis=1)
        train_set_ = x_train_, y_train, z_train
        vali_set_ = x_vali_, y_vali, z_vali
        test_set_ = x_test_, y_test, z_test

        # Labels for each group.
        y0 = y_train[z_train == 0]
        y1 = y_train[z_train == 1]

        # CPE model.
        weights, threshold = model_unc
        y_prob = np.dot(x_train, weights) + threshold

        # Label proportions.
        p = y_train.mean()
        p0 = np.mean(y0 == 1)
        p1 = np.mean(y1 == 1)

        # Group proportions.
        g0 = np.mean(z_train == 0)
        g1 = np.mean(z_train == 1)

        # Initialization.
        threshold0_temp = 0.5
        threshold1_temp = 0.5
        models = []
        objectives = []
        violations = []

        # Initialize conf matrix.
        fp0, fp1, fn0, fn1 = evaluate_conf(y0, y1, z_train, y_prob,
                                           threshold0_temp, threshold1_temp)

        # Initialize Lagrange multipliers.
        lambda0 = 0.0
        lambda1 = 0.0

        inner_probabilities = np.zeros(loops)
        outer_probabilities = np.zeros(loops)

        for ii in range(1, loops + 1):
            # G-mean gradient.
            tpr = (1 - g0 * fn0 / p - g1 * fn1 / p) + delta_
            tnr = (1 - g0 * fp0 / (1 - p) - g1 * fp1 / (1 - p)) + delta_
            coef_tpr = 0.5 * np.sqrt(tnr / tpr)
            coef_tnr = 0.5 * np.sqrt(tpr / tnr)

            # Minimize over confusion matrices.
            coef_fn0 = g0 * coef_tpr / p + (lambda0 / p0 - lambda1 / p0)
            coef_fn1 = g1 * coef_tpr / p + (lambda1 / p1 - lambda0 / p1)
            coef_fp0 = g0 * coef_tnr / (1 - p)
            coef_fp1 = g1 * coef_tnr / (1 - p)

            # Opt thresholds for cost-sensitive problem.
            if min(coef_fp0, coef_fn0) < 0:
                if coef_fp0 < coef_fn0:
                    threshold0_temp = 1e-5
                else:
                    threshold0_temp = 1 - 1e-5
            else:
                threshold0_temp = coef_fp0 / (coef_fp0 + coef_fn0 + delta_)
                threshold0_temp = min(threshold0_temp, 1 - delta_)
                threshold0_temp = max(threshold0_temp, delta_)
            threshold0 = np.log(threshold0_temp / (1 - threshold0_temp))

            if min(coef_fp1, coef_fn1) < 0:
                if coef_fp1 < coef_fn1:
                    threshold1_temp = 1e-5
                else:
                    threshold1_temp = 1 - 1e-5
            else:
                threshold1_temp = coef_fp1 / (coef_fp1 + coef_fn1 + delta_)
                threshold1_temp = min(threshold1_temp, 1 - delta_)
                threshold1_temp = max(threshold1_temp, delta_)
            threshold1 = np.log(threshold1_temp / (1 - threshold1_temp))

            # Evaluate metrics.
            fp0_hat, fp1_hat, fn0_hat, fn1_hat = evaluate_conf(
                y0, y1, z_train, y_prob, threshold0, threshold1)

            fp0 = (1 - 2.0 / (ii + 1)) * fp0 + 2.0 / (ii + 1) * fp0_hat
            fp1 = (1 - 2.0 / (ii + 1)) * fp1 + 2.0 / (ii + 1) * fp1_hat
            fn0 = (1 - 2.0 / (ii + 1)) * fn0 + 2.0 / (ii + 1) * fn0_hat
            fn1 = (1 - 2.0 / (ii + 1)) * fn1 + 2.0 / (ii + 1) * fn1_hat

            inner_probabilities[:ii - 1] *= (1 - 2.0 / (ii + 1))
            inner_probabilities[ii - 1] = 2.0 / (ii + 1)

            # Thresholds are added not subtracted during evaluation.
            weights_ = np.concatenate([weights, [-threshold1 + threshold0]])
            threshold_ = threshold - threshold0
            model = [weights_, threshold_]
            models.append(model)

            # Evaluate metrics.
            error, tpr0, tpr1 = evaluate_expected_results(
                train_set_, [model], [1.0])
            objectives.append(error)
            violations.append([tpr0 - tpr1 - epsilon, tpr1 - tpr0 - epsilon])

            # # Report once in 25 iterations.
            # if ii % 25 == 0:
            #   print("Step %d | G-mean error = %3f | EO violation = %.3f" % (
            #         ii, objectives[-1], max(violations[-1])))

            if ii % skip_iter == 0:
                # Update lambda.
                lambda0 += learning_rate * (fn0 / p0 - fn1 / p1 - epsilon)
                lambda1 += learning_rate * (fn1 / p1 - fn0 / p0 - epsilon)

                # Project lambdas.
                lambda0 = np.maximum(lambda0, 0.0)
                lambda1 = np.maximum(lambda1, 0.0)

                # Update count.
                outer_probabilities += inner_probabilities

        # Normalize probabilities to sum to 1.
        if ii % skip_iter != 0:  # Last outer iteration did not complete.
            outer_probabilities += inner_probabilities
        outer_probabilities *= 1.0 / np.sum(outer_probabilities)

        probabilities_pruned = tfco.find_best_candidate_distribution(
            np.array(objectives), np.array(violations))

        # Shrinking.
        models_pruned = [
            models[jj] for jj in range(len(models))
            if probabilities_pruned[jj] > 0
        ]
        probabilities_pruned = probabilities_pruned[probabilities_pruned > 0]
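        # Illustrative check, reusing evaluate_expected_results as in the calls
        # above: the pruned stochastic model is evaluated by weighting each
        # retained model with its pruned probability.
        error_pruned, tpr0_pruned, tpr1_pruned = evaluate_expected_results(
            train_set_, models_pruned, probabilities_pruned)
        # print("Pruned | error = %.3f | EO violation = %.3f" % (
        #     error_pruned, max(tpr0_pruned - tpr1_pruned - epsilon,
        #                       tpr1_pruned - tpr0_pruned - epsilon)))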

        # Best model.
        best_index = tfco.find_best_candidate_index(np.array(objectives),
                                                    np.array(violations))

        # Return Pruned models, Avg models, Best model
        results = {
            'stochastic': (models, outer_probabilities),
            'pruned': (models_pruned, probabilities_pruned),
            'best': models[best_index],
            'objectives': objectives,
            'violations': violations,
            'modified_train_set': train_set_,
            'modified_test_set': test_set_
        }
        return results
Example #15
0
def test():

    # Data with our preprocessing routines.
    from sklearn.preprocessing import MinMaxScaler

    xnp, xp, y = data_gen.load_crime()
    scl = MinMaxScaler()
    train_pts = int(0.8 * len(xnp))
    xnptr = scl.fit_transform(xnp[:train_pts])
    xnpts = scl.transform(xnp[train_pts:])
    xptr = xp[:train_pts]
    xpts = xp[train_pts:]
    ytr = y[:train_pts]
    yts = y[train_pts:]

    # Add protected features.
    xtr = np.hstack([xnptr, xptr])
    xts = np.hstack([xnpts, xpts])

    scl = MinMaxScaler()
    ytr = scl.fit_transform(ytr)
    yts = scl.transform(yts)

    I_train = utils.compute_indicator_matrix_r(xptr)
    I_test = utils.compute_indicator_matrix_r(xpts)
    didi_tr = utils.didi_r(ytr, I_train)
    didi_ts = utils.didi_r(yts, I_test)

    tfco_model = TFCOFairReg(input_dim=xtr.shape[1], output_dim=1,
                             I_train=I_train, didi_tr=didi_tr)

    # Fitting.
    # train_errors, train_violations = tfco_model.fit(xtr, ytr)
    # train_errors, train_violations = np.array(train_errors), np.array(train_violations)

    # test_errors, test_violations = tfco_model.predict_err(x_ts.values, y_ts.values)
    # test_preds = tfco_model.predict(xts)
    # test_errors, test_violations = _get_error_rate_and_constraints(
    #     test_preds, yts, didi_ts, I_test)

    minibatch_size = 200
    iterations_per_loop = 200
    loops = 80

    train_pred, test_pred = tfco_model._full_training(xtr, xts, ytr,
                                                      minibatch_size, iterations_per_loop, loops)

    train_errors = []
    train_violations = []
    train_didi = []
    train_r2 = []

    for p in train_pred:
        err, viol = _get_error_rate_and_constraints(p, ytr.reshape(-1, 1), didi_tr, I_train)
        r2 = r2_score(ytr, p)
        didi = utils.didi_r(p, I_train) / didi_tr
        train_errors.append(err)
        train_violations.append(viol)
        train_didi.append(didi)
        train_r2.append(r2)

    test_errors = []
    test_violations = []
    test_didi = []
    test_r2 = []

    for p in test_pred:
        err, viol = _get_error_rate_and_constraints(p, yts.reshape(-1, 1), didi_ts, I_test)
        r2 = r2_score(yts, p)
        didi = utils.didi_r(p, I_test) / didi_ts
        test_errors.append(err)
        test_violations.append(viol)
        test_didi.append(didi)
        test_r2.append(r2)

    train_violations = np.array(train_violations)
    # print("DIDI train", didi_tr)
    # print("DIDI test", didi_ts)
    # print("Train Error", train_errors[-1])
    # print("Train Violation", max(train_violations[-1]))
    print("Train R2", train_r2[-1])
    print("Train DIDI", train_didi[-1])

    # print("Test Error", test_errors[-1])
    # print("Test Violation", max(test_violations[-1]))
    print("Train R2", test_r2[-1])
    print("Train DIDI", test_didi[-1])

    print("Improving using Best Iterate instead of Last Iterate.")
    #
    # As discussed in [[CotterEtAl18b]](https://arxiv.org/abs/1809.04198), the last iterate may not be the best choice
    # and suggests a simple heuristic to choose the best iterate out of the ones found after each epoch.
    # The heuristic proceeds by ranking each of the solutions based on accuracy and fairness separately with respect to
    # the training data. Any solutions which satisfy the constraints are equally ranked top in terms fairness.
    # Each solution thus has two ranks. Then, the chosen solution is the one with the smallest maximum of the two ranks.
    # We see that this improves the fairness and can find a better accuracy / fairness trade-off on the training data.
    #
    # This solution can be calculated using find_best_candidate_index given the list of training errors and violations
    # associated with each of the epochs.

    best_cand_index = tfco.find_best_candidate_index(train_errors, train_violations)

    # print("Train Error", train_errors[best_cand_index])
    # print("Train Violation", max(train_violations[best_cand_index]))
    print("Train R2", train_r2[best_cand_index])
    print("Train DIDI", train_didi[best_cand_index])

    # print("Test Error", test_errors[best_cand_index])
    # print("Test Violation", max(test_violations[best_cand_index]))
    print("Test R2", test_r2[best_cand_index])
    print("Test DIDI", test_didi[best_cand_index])

    print("m-stochastic solution.")
    # [[CoJiSr19]](https://arxiv.org/abs/1804.06500) presents a method which shrinks the T-stochastic solution
    # down to one supported on at most (m+1) points, where m is the number of constraints, and which is guaranteed
    # to be at least as good as the T-stochastic solution.
    # Here we see that there is indeed a benefit in performing the shrinking.
    #
    # This solution can be computed with find_best_candidate_distribution by passing in the training errors and
    # violations found at each epoch; it returns the weight of each constituent. We see that it is indeed sparse.

    cand_dist = tfco.find_best_candidate_distribution(train_errors, train_violations)
    print(cand_dist)
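    # The comment above says the shrunk distribution is at least as good as the
    # T-stochastic solution (here taken as the uniform mixture over all
    # iterates); a quick check of expected training error and maximum expected
    # violation under both, assuming train_errors / train_violations as above:
    uniform_dist = np.full(len(train_errors), 1.0 / len(train_errors))
    print("T-stochastic: err", float(np.dot(uniform_dist, train_errors)),
          "| max viol", float(np.max(np.dot(uniform_dist, train_violations))))
    print("m-stochastic: err", float(np.dot(cand_dist, train_errors)),
          "| max viol", float(np.max(np.dot(cand_dist, train_violations))))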

    # m_stoch_error_train, m_stoch_violations_train = _get_exp_error_rate_constraints(cand_dist, train_errors,
    #                                                                                 train_violations)
    # m_stoch_error_test, m_stoch_violations_test = _get_exp_error_rate_constraints(cand_dist, test_errors,
    #                                                                               test_violations)

    m_stoch_train_r2 = np.dot(cand_dist, train_r2)
    m_stoch_train_didi = np.dot(cand_dist, train_didi)
    m_stoch_test_r2 = np.dot(cand_dist, test_r2)
    m_stoch_test_didi = np.dot(cand_dist, test_didi)

    print("Train R2", m_stoch_train_r2)
    print("Train DIDI", m_stoch_train_didi)
    print("Test R2", m_stoch_test_r2)
    print("Test DIDI", m_stoch_test_didi)