Ejemplo n.º 1
0
def expgrad(dataX,
            dataA,
            dataY,
            learner,
            cons=moments.DP(),
            eps=0.01,
            T=50,
            nu=None,
            eta_mul=2.0,
            debug=False):
    """
    Return a fair classifier under specified fairness constraints
    via exponentiated-gradient reduction.

    Required input arguments:
      dataX -- a DataFrame containing covariates
      dataA -- a Series containing the protected attribute
      dataY -- a Series containing labels in {0,1}
      learner -- a learner implementing methods fit(X,Y,W) and predict(X),
                 where X is the DataFrame of covariates, and Y and W
                 are the Series containing the labels and weights,
                 respectively; labels Y and predictions returned by
                 predict(X) are in {0,1}

    Optional keyword arguments:
      cons -- the fairness measure (default moments.DP())
      eps -- allowed fairness constraint violation (default 0.01)
      T -- max number of iterations (default 50); must be at least 1
      nu -- convergence threshold for the duality gap (default None,
            corresponding to a conservative automatic setting based on the
            statistical uncertainty in measuring classification error)
      eta_mul -- initial setting of the learning rate (default 2.0)
      debug -- if True, then debugging output is produced (default False)

    Returned named tuple with fields:
      best_classifier -- a function that maps a DataFrame X containing
                         covariates to a Series containing the corresponding
                         probabilistic decisions in [0,1]
      best_gap -- the quality of best_classifier; if the algorithm has
                  converged then best_gap <= nu; the solution best_classifier
                  is guaranteed to have the classification error within
                  2*best_gap of the best error under constraint eps; the
                  constraint violation is at most 2*(eps+best_gap)
      classifiers -- the base classifiers generated (instances of learner)
      weights -- the weights of those classifiers within best_classifier
      last_t -- the last executed iteration; always last_t < T
      best_t -- the iteration in which best_classifier was obtained
      n_oracle_calls -- how many times the learner was called

    Raises:
      AssertionError -- if the shapes of dataX, dataA and dataY are
                        inconsistent
      ValueError -- if T < 1 (no iteration would be executed, so there
                    would be no classifier to return)

    NOTE(review): the default for `cons` is a single moments.DP() instance
    created when the function is defined and therefore shared by every call
    that relies on the default -- confirm that reusing it is safe.
    """

    ExpgradResult = namedtuple(
        "ExpgradResult", "best_classifier best_gap classifiers weights"
        " last_t best_t n_oracle_calls")

    n = dataX.shape[0]
    assert len(dataX.shape) == 2 and len(dataA.shape) == 1 and len(dataY.shape) == 1, \
        "dataX must be a DataFrame and dataY and dataA must be Series"
    assert dataA.shape[0] == n and dataY.shape[0] == n, \
        "the number of rows in all data fields must match"
    if T < 1:
        # Without at least one iteration nu/eta are never initialised and
        # there is no candidate solution to select from.
        raise ValueError("T must be at least 1")

    if debug:
        print("...EG STARTING")

    B = 1 / eps
    lagr = _Lagrangian(dataX, dataA, dataY, learner, cons, eps, B, debug=debug)

    # Float-typed from the start: theta receives float updates below and
    # Qsum accumulates float counts; an explicit dtype also avoids the
    # version-dependent default dtype of an empty Series.
    theta = pd.Series(0.0, lagr.cons.index)
    Qsum = pd.Series(dtype="float64")
    lambdas = pd.DataFrame()
    gaps_EG = []
    gaps = []
    Qs = []

    last_regr_checked = _REGR_CHECK_START_T
    # np.inf rather than the np.PINF alias, which NumPy 2.0 removed.
    last_gap = np.inf
    for t in range(0, T):
        if debug:
            print("...iter=%03d" % t)

        # Multipliers for the current iterate; lambda_EG is the average of
        # the multiplier vectors over all iterations so far.
        lambda_vec = B * np.exp(theta) / (1 + np.exp(theta).sum())
        lambdas[t] = lambda_vec
        lambda_EG = lambdas.mean(axis=1)

        h, h_idx = lagr.best_h(lambda_vec)
        pred_h = h(dataX)

        if t == 0:
            # nu and the learning rate are fixed once, using the first
            # classifier's predictions when nu was not supplied.
            if nu is None:
                nu = _ACCURACY_MUL * (pred_h - dataY).abs().std() / np.sqrt(n)
            eta_min = nu / (2 * B)  # only reported in the debug output
            eta = eta_mul / B
            if debug:
                print("...eps=%.3f, B=%.1f, nu=%.6f, T=%d, eta_min=%.6f" %
                      (eps, B, nu, T, eta_min))

        # Count how often each classifier index has been returned.
        if h_idx not in Qsum.index:
            Qsum.at[h_idx] = 0.0
        Qsum[h_idx] += 1.0
        gamma = lagr.gammas[h_idx]

        # Normalised counts give the EG distribution over classifiers.
        Q_EG = Qsum / Qsum.sum()
        res_EG = lagr.eval_gap(Q_EG, lambda_EG, nu)
        gap_EG = res_EG.gap()
        gaps_EG.append(gap_EG)

        if (t == 0) or not _RUN_LP_STEP:
            # LP step skipped: an infinite gap makes the EG solution win.
            gap_LP = np.inf
        else:
            Q_LP, lambda_LP, res_LP = lagr.solve_linprog(nu)
            gap_LP = res_LP.gap()

        # Keep whichever of the two candidates has the smaller gap.
        if gap_EG < gap_LP:
            Qs.append(Q_EG)
            gaps.append(gap_EG)
        else:
            Qs.append(Q_LP)
            gaps.append(gap_LP)

        if debug:
            print("%seta=%.6f, L_low=%.3f, L=%.3f, L_high=%.3f"
                  ", gap=%.6f, disp=%.3f, err=%.3f, gap_LP=%.6f" %
                  (" " * 9, eta, res_EG.L_low, res_EG.L, res_EG.L_high, gap_EG,
                   res_EG.gamma.max(), res_EG.error, gap_LP))

        # Stop once the gap is below nu, but not before _MIN_T iterations.
        if (gaps[t] < nu) and (t >= _MIN_T):
            break

        # Periodic check: if the best EG gap exceeds the previous check's
        # gap times _SHRINK_REGRET, rescale the learning rate by _SHRINK_ETA.
        if t >= last_regr_checked * _REGR_CHECK_INCREASE_T:
            best_gap = min(gaps_EG)

            if best_gap > last_gap * _SHRINK_REGRET:
                eta *= _SHRINK_ETA
            last_regr_checked = t
            last_gap = best_gap

        theta += eta * (gamma - eps)

    last_t = len(Qs) - 1
    gaps_series = pd.Series(gaps)
    # Among iterations whose gap is within _PRECISION of the minimum,
    # pick the latest one.
    gaps_best = gaps_series[gaps_series <= gaps_series.min() + _PRECISION]
    best_t = gaps_best.index[-1]
    weights = Qs[best_t]
    hs = lagr.hs
    # Give every generated classifier an explicit (possibly zero) weight.
    for h_idx in hs.index:
        if h_idx not in weights.index:
            weights.at[h_idx] = 0.0
    best_classifier = lambda X: _mean_pred(X, hs, weights)
    best_gap = gaps[best_t]

    res = ExpgradResult(best_classifier=best_classifier,
                        best_gap=best_gap,
                        classifiers=lagr.classifiers,
                        weights=weights,
                        last_t=last_t,
                        best_t=best_t,
                        n_oracle_calls=lagr.n_oracle_calls)

    if debug:
        print("...eps=%.3f, B=%.1f, nu=%.6f, T=%d, eta_min=%.6f" %
              (eps, B, nu, T, eta_min))
        print("...last_t=%d, best_t=%d, best_gap=%.6f"
              ", n_oracle_calls=%d, n_hs=%d" %
              (res.last_t, res.best_t, res.best_gap, res.n_oracle_calls,
               len(res.classifiers)))

    return res
Ejemplo n.º 2
0
def expgrad(dataX,
            dataA,
            dataY,
            learner,
            constraints=moments.DP(),
            eps=0.01,
            T=50,
            nu=None,
            eta_mul=2.0,
            debug=False):
    """
    Return a fair classifier under specified fairness constraints
    via exponentiated-gradient reduction.

    Required input arguments:
      dataX -- a DataFrame containing covariates
      dataA -- a Series containing the protected attribute
      dataY -- a Series containing labels in {0,1}
      learner -- a learner implementing methods fit(X,Y,W) and predict(X),
                 where X is the DataFrame of covariates, and Y and W
                 are the Series containing the labels and weights,
                 respectively; labels Y and predictions returned by
                 predict(X) are in {0,1}

    Optional keyword arguments:
      constraints -- the fairness measure (default moments.DP())
      eps -- allowed fairness constraint violation (default 0.01)
      T -- max number of iterations (default 50); must be at least 1
      nu -- convergence threshold for the duality gap (default None,
            corresponding to a conservative automatic setting based on the
            statistical uncertainty in measuring classification error)
      eta_mul -- initial setting of the learning rate (default 2.0)
      debug -- if True, then debugging output is produced (default False)

    Returned named tuple with fields:
      best_classifier -- a function that maps a DataFrame X containing
                         covariates to a Series containing the corresponding
                         probabilistic decisions in [0,1]
      best_gap -- the quality of best_classifier; if the algorithm has
                  converged then best_gap <= nu; the solution best_classifier
                  is guaranteed to have the classification error within
                  2*best_gap of the best error under constraint eps; the
                  constraint violation is at most 2*(eps+best_gap)
      classifiers -- the base classifiers generated (instances of learner)
      weights -- the weights of those classifiers within best_classifier
      last_t -- the last executed iteration; always last_t < T
      best_t -- the iteration in which best_classifier was obtained
      n_oracle_calls -- how many times the learner was called

    Raises:
      ValueError -- if T < 1 (no iteration would be executed, so there
                    would be no classifier to return)

    NOTE(review): the default for `constraints` is a single moments.DP()
    instance created when the function is defined and therefore shared by
    every call that relies on the default -- confirm that reusing it is safe.
    """
    if T < 1:
        # Without at least one iteration nu/eta_min are never initialised
        # and the final _format_results call would fail on an unbound name.
        raise ValueError("T must be at least 1")

    n = dataX.shape[0]

    if debug:
        print("...Exponentiated Gradient STARTING")

    B = 1 / eps
    lagrangian = _Lagrangian(dataX,
                             dataA,
                             dataY,
                             learner,
                             constraints,
                             eps,
                             B,
                             debug=debug)

    # Float-typed from the start: theta receives float updates below and
    # Qsum accumulates float counts; an explicit dtype also avoids the
    # version-dependent default dtype of an empty Series.
    theta = pd.Series(0.0, lagrangian.constraints.index)
    Qsum = pd.Series(dtype="float64")
    lambdas = pd.DataFrame()
    gaps_EG = []
    gaps = []
    Qs = []

    last_regret_checked = _REGRET_CHECK_START_T
    # np.inf rather than the np.PINF alias, which NumPy 2.0 removed.
    last_gap = np.inf
    for t in range(0, T):
        if debug:
            print("...iter=%03d" % t)

        # set lambdas for every constraint; lambda_EG is the average of the
        # multiplier vectors over all iterations so far
        lambda_vec = B * np.exp(theta) / (1 + np.exp(theta).sum())
        lambdas[t] = lambda_vec
        lambda_EG = lambdas.mean(axis=1)

        # select classifier according to best_h method
        h, h_idx = lagrangian.best_h(lambda_vec)
        pred_h = h(dataX)

        if t == 0:
            # nu and the learning rate are fixed once, using the first
            # classifier's predictions when nu was not supplied
            if nu is None:
                nu = _ACCURACY_MUL * (pred_h - dataY).abs().std() / np.sqrt(n)
            eta_min = nu / (2 * B)
            eta = eta_mul / B
            if debug:
                print("...eps=%.3f, B=%.1f, nu=%.6f, T=%d, eta_min=%.6f" %
                      (eps, B, nu, T, eta_min))

        # count how often each classifier index has been returned; the
        # normalised counts give the EG distribution over classifiers
        if h_idx not in Qsum.index:
            Qsum.at[h_idx] = 0.0
        Qsum[h_idx] += 1.0
        gamma = lagrangian.gammas[h_idx]
        Q_EG = Qsum / Qsum.sum()
        result_EG = lagrangian.eval_gap(Q_EG, lambda_EG, nu)
        gap_EG = result_EG.gap()
        gaps_EG.append(gap_EG)

        if t == 0 or not _RUN_LP_STEP:
            # LP step skipped: an infinite gap makes the EG solution win
            gap_LP = np.inf
        else:
            # saddle point optimization over the convex hull of classifiers returned so far
            Q_LP, _, result_LP = lagrangian.solve_linprog(nu)
            gap_LP = result_LP.gap()

        # keep values from exponentiated gradient or linear programming,
        # whichever has the smaller gap
        if gap_EG < gap_LP:
            Qs.append(Q_EG)
            gaps.append(gap_EG)
        else:
            Qs.append(Q_LP)
            gaps.append(gap_LP)

        if debug:
            print("%seta=%.6f, L_low=%.3f, L=%.3f, L_high=%.3f"
                  ", gap=%.6f, disp=%.3f, err=%.3f, gap_LP=%.6f" %
                  (_INDENTATION, eta,
                   result_EG.L_low, result_EG.L, result_EG.L_high, gap_EG,
                   result_EG.gamma.max(), result_EG.error, gap_LP))

        if (gaps[t] < nu) and (t >= _MIN_T):
            # solution found (gap below nu after at least _MIN_T iterations)
            break

        # update regret: if the best EG gap exceeds the previous check's gap
        # times _SHRINK_REGRET, rescale the learning rate by _SHRINK_ETA
        if t >= last_regret_checked * _REGRET_CHECK_INCREASE_T:
            best_gap = min(gaps_EG)

            if best_gap > last_gap * _SHRINK_REGRET:
                eta *= _SHRINK_ETA
            last_regret_checked = t
            last_gap = best_gap

        # update theta based on learning rate
        theta += eta * (gamma - eps)

    return _format_results(gaps, Qs, lagrangian, eps, B, nu, T, eta_min, debug)