def test_add_income_bins():
    """
    Check interval closure side and right-edge values produced by
    add_income_bins with bin_type='tpc' for both right=True and right=False.
    """
    income = np.arange(1, 1e6, 5000)
    frame = pd.DataFrame(data=income, columns=['expanded_income'])
    edges = [
        -9e99, 0, 9999, 19999, 29999, 39999, 49999, 74999, 99999, 200000, 9e99
    ]
    for closed_right, side in ((True, 'right'), (False, 'left')):
        binned = add_income_bins(frame,
                                 'expanded_income',
                                 bin_type='tpc',
                                 bins=None,
                                 right=closed_right)
        # categorical groupby yields every bin category in edge order,
        # so group k has right endpoint edges[k + 1]
        for pos, (interval, _) in enumerate(binned.groupby('bins'), start=1):
            assert interval.closed == side
            assert abs(interval.right - edges[pos]) < EPSILON
def dropq_dist_table(resdf, groupby, result_type, suffix):
    """
    Create and return dropq distribution table.

    resdf: Pandas DataFrame containing suffixed result columns
        (e.g. 'c00100' + suffix); NOTE: modified in place via the
        res alias below.
    groupby: one of 'weighted_deciles', 'small_income_bins',
        'large_income_bins', 'webapp_income_bins'; selects the binning.
    result_type: 'weighted_sum' or 'weighted_avg'; selects aggregation.
    suffix: string appended to each base column name before lookup.

    Returns a DataFrame with one row per bin plus an appended totals row.
    Raises ValueError for any unrecognized groupby or result_type value.
    """
    # pylint: disable=too-many-locals
    # res is an alias, not a copy: the .where() masking below mutates resdf
    res = resdf
    c04470_s = 'c04470' + suffix
    c00100_s = 'c00100' + suffix
    c09600_s = 'c09600' + suffix
    standard_s = 'standard' + suffix
    s006_s = 's006' + suffix
    returns_ided_s = 'num_returns_ItemDed' + suffix
    returns_sded_s = 'num_returns_StandardDed' + suffix
    returns_amt_s = 'num_returns_AMT' + suffix
    # zero out c04470 unless c00100 is positive and c04470 exceeds the
    # standard amount (c00100 presumably AGI, c04470 itemized deductions
    # in Tax-Calculator naming — confirm against variable definitions)
    res[c04470_s] = res[c04470_s].where(
        ((res[c00100_s] > 0) & (res[c04470_s] > res[standard_s])), 0)
    # weight (s006) counts toward a return category only when the
    # category condition holds; otherwise the weight is zeroed
    res[returns_ided_s] = res[s006_s].where(
        ((res[c00100_s] > 0) & (res[c04470_s] > 0)), 0)
    res[returns_sded_s] = res[s006_s].where(
        ((res[c00100_s] > 0) & (res[standard_s] > 0)), 0)
    res[returns_amt_s] = res[s006_s].where(res[c09600_s] > 0, 0)
    if groupby == "weighted_deciles":
        dframe = add_weighted_income_bins(res, num_bins=10)
    elif groupby == "small_income_bins":
        dframe = add_income_bins(res, compare_with="soi")
    elif groupby == "large_income_bins":
        dframe = add_income_bins(res, compare_with="tpc")
    elif groupby == "webapp_income_bins":
        dframe = add_income_bins(res, compare_with="webapp")
    else:
        err = ("groupby must be either 'weighted_deciles' or "
               "'small_income_bins' or 'large_income_bins' or "
               "'webapp_income_bins'")
        raise ValueError(err)
    # global pandas display side effect: thousands-separated integers
    pd.options.display.float_format = '{:8,.0f}'.format
    if result_type == "weighted_sum":
        dframe = weighted(dframe, [col + suffix for col in STATS_COLUMNS])
        gby_bins = dframe.groupby('bins', as_index=False)
        gp_mean = gby_bins[[col + suffix for col in TABLE_COLUMNS]].sum()
        gp_mean.drop('bins', axis=1, inplace=True)
        sum_row = get_sums(dframe)[[col + suffix for col in TABLE_COLUMNS]]
    elif result_type == "weighted_avg":
        gp_mean = weighted_avg_allcols(dframe,
                                       [col + suffix for col in TABLE_COLUMNS])
        all_sums = get_sums(dframe, not_available=True)
        sum_row = all_sums[[col + suffix for col in TABLE_COLUMNS]]
    else:
        err = ("result_type must be either 'weighted_sum' or "
               "'weighted_avg'")
        raise ValueError(err)
    # NOTE(review): DataFrame.append is removed in pandas >= 2.0;
    # switch to pd.concat if this file is ever moved to modern pandas
    return gp_mean.append(sum_row)
def test_add_income_bins_soi():
    """
    Check interval closure side and right-edge values produced by
    add_income_bins with bin_type='soi' for both right settings.
    """
    vals = np.arange(1, 1e6, 5000)
    frame = pd.DataFrame(data=vals, columns=['expanded_income'])

    edges = SMALL_INCOME_BINS
    for closed_right, side in ((True, 'right'), (False, 'left')):
        result = add_income_bins(frame, 'expanded_income',
                                 bin_type='soi', right=closed_right)
        # bin categories come back in edge order; group k maps to edges[k+1]
        pos = 1
        for interval, _ in result.groupby('bins'):
            assert interval.closed == side
            assert abs(interval.right - edges[pos]) < EPSILON
            pos += 1
def dropq_diff_table(df1, df2, groupby, res_col, diff_col, suffix, wsum):
    """
    Create and return dropq difference table.

    df1: baseline DataFrame (provides unsuffixed diff_col values).
    df2: reform DataFrame (provides suffixed diff_col values);
        NOTE: modified in place (res_col + suffix column is added).
    groupby: one of 'weighted_deciles', 'small_income_bins',
        'large_income_bins', 'webapp_income_bins'; selects the binning.
    res_col: base name of the difference column written into df2.
    diff_col: base name of the column being differenced.
    suffix: string appended to df2 column names before lookup.
    wsum: weighted total used (plus EPSILON) as comparison denominator.

    Raises ValueError for an unrecognized groupby value.
    """
    # pylint: disable=too-many-arguments,too-many-locals
    if groupby == "weighted_deciles":
        gdf = add_weighted_income_bins(df2, num_bins=10)
    elif groupby == "small_income_bins":
        gdf = add_income_bins(df2, compare_with="soi")
    elif groupby == "large_income_bins":
        gdf = add_income_bins(df2, compare_with="tpc")
    elif groupby == "webapp_income_bins":
        gdf = add_income_bins(df2, compare_with="webapp")
    else:
        err = ("groupby must be either 'weighted_deciles' or "
               "'small_income_bins' or 'large_income_bins' or "
               "'webapp_income_bins'")
        raise ValueError(err)
    # Difference in plans
    # Positive values are the magnitude of the tax increase
    # Negative values are the magnitude of the tax decrease
    # (relies on df1 and df2 sharing the same index for row alignment)
    df2[res_col + suffix] = df2[diff_col + suffix] - df1[diff_col]
    diffs = means_and_comparisons(res_col + suffix,
                                  gdf.groupby('bins', as_index=False),
                                  wsum + EPSILON)
    sum_row = get_sums(diffs)[diffs.columns]
    diffs = diffs.append(sum_row)  # pylint: disable=redefined-variable-type
    # global pandas display side effect: thousands-separated integers
    pd.options.display.float_format = '{:8,.0f}'.format
    # convert the fractional perc_inc/perc_cut/share_of_change columns
    # into formatted percent strings (this makes the columns dtype=object)
    srs_inc = ["{0:.2f}%".format(val * 100) for val in diffs['perc_inc']]
    diffs['perc_inc'] = pd.Series(srs_inc, index=diffs.index)
    srs_cut = ["{0:.2f}%".format(val * 100) for val in diffs['perc_cut']]
    diffs['perc_cut'] = pd.Series(srs_cut, index=diffs.index)
    srs_change = [
        "{0:.2f}%".format(val * 100) for val in diffs['share_of_change']
    ]
    diffs['share_of_change'] = pd.Series(srs_change, index=diffs.index)
    # columns containing weighted values relative to the binning mechanism
    # are meaningless in the totals row, so blank them out
    # (assumes get_sums labels its totals row 'sums' — defined elsewhere)
    non_sum_cols = [x for x in diffs.columns if 'mean' in x or 'perc' in x]
    for col in non_sum_cols:
        diffs.loc['sums', col] = 'n/a'
    return diffs
 def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz):
     """
     Fuzz some df2 records in each bin defined by bin_type and imeasure.
     The fuzzed records have their post-reform tax results (in df2)
     set to their pre-reform tax results (in df1).
     """
     # pylint: disable=too-many-arguments
     assert bin_type in ('dec', 'bin', 'agg')
     if bin_type == 'dec':
         df2 = add_quantile_bins(df2, imeasure, 10)
     elif bin_type == 'bin':
         df2 = add_income_bins(df2, imeasure, bins=WEBAPP_INCOME_BINS)
     else:
         # 'agg': a single bin spanning all records
         df2 = add_quantile_bins(df2, imeasure, 1)
     # nofuzz == 1 keeps the df2 value; nofuzz == 0 reverts to the df1 value
     df2['nofuzz'] = df2.groupby('bins')['mask'].transform(chooser)
     for name in cols_to_fuzz:
         df2[name + suffix] = (df2[name] * df2['nofuzz'] -
                               df1[name] * df2['nofuzz'] + df1[name])
 def create(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing):
     """
     Create additional df2 columns.  If do_fuzzing is True, also
     fuzz some df2 records in each bin defined by bin_type and imeasure
     with the fuzzed records having their post-reform tax results (in df2)
     set to their pre-reform tax results (in df1).
     """
     # pylint: disable=too-many-arguments
     assert bin_type in ('dec', 'bin', 'agg')
     if bin_type == 'bin':
         df2 = add_income_bins(df2, imeasure, bins=STANDARD_INCOME_BINS)
     elif bin_type == 'dec':
         df2 = add_quantile_bins(df2, imeasure, 10)
     else:
         # 'agg': a single bin spanning all records
         df2 = add_quantile_bins(df2, imeasure, 1)
     grouped = df2.groupby('bins')
     if do_fuzzing:
         df2['nofuzz'] = grouped['mask'].transform(chooser)
     else:
         # never fuzz: flag every record so df2 values are kept unchanged
         df2['nofuzz'] = np.ones(df2.shape[0], dtype=np.int8)
     # nofuzz == 1 keeps the df2 value; nofuzz == 0 reverts to the df1 value
     for name in cols_to_fuzz:
         df2[name + suffix] = (df2[name] * df2['nofuzz'] -
                               df1[name] * df2['nofuzz'] + df1[name])
def test_add_income_bins_raises():
    """An unrecognized bin_type must trigger a ValueError."""
    frame = pd.DataFrame(data=np.arange(1, 1e6, 5000),
                         columns=['expanded_income'])
    with pytest.raises(ValueError):
        frame = add_income_bins(frame, 'expanded_income', bin_type='stuff')
def drop_records(df1, df2, mask):
    """
    Modify df1 and df2 by adding statistical fuzz for data privacy.

    Parameters
    ----------
    df1: Pandas DataFrame
        contains results for the standard plan X and X'.

    df2: Pandas DataFrame
        contains results for the user-specified plan (Plan Y).

    mask: boolean numpy array
        contains info about whether or not each element of X and X' are same

    Returns
    -------
    fuzzed_df1: Pandas DataFrame

    fuzzed_df2: Pandas DataFrame

    Notes
    -----
    This function groups both DataFrames based on the web application's
    income groupings (both weighted decile and income bins), and then
    pseudo-randomly picks three records to 'drop' within each bin.
    We keep track of the three dropped records in both group-by
    strategies and then use these 'flag' columns to modify all
    columns of interest, creating new '_dec' columns for
    statistics based on weighted deciles and '_bin' columns for
    statistics based on income bins.  Lastly we calculate
    individual income tax differences, payroll tax differences, and
    combined tax differences between the baseline and reform
    for the two groupings.
    """
    # perform all statistics on (Y + X') - X

    # Group first
    df2['mask'] = mask
    df1['mask'] = mask

    # decile grouping: groupby objects are captured before the frames
    # are re-binned below, so gp2_dec keeps the decile 'bins' column
    df2 = add_weighted_income_bins(df2)
    df1 = add_weighted_income_bins(df1)
    gp2_dec = df2.groupby('bins')

    # income-bin grouping overwrites the 'bins' column in both frames
    df2 = add_income_bins(df2, bins=WEBAPP_INCOME_BINS)
    df1 = add_income_bins(df1, bins=WEBAPP_INCOME_BINS)
    gp2_bin = df2.groupby('bins')

    # Transform to get the 'flag' column (3 choices to drop in each bin)
    # (chooser is defined elsewhere; presumably returns 0 for the dropped
    # records and 1 for the kept records — confirm against its definition)
    df2['flag_dec'] = gp2_dec['mask'].transform(chooser)
    df2['flag_bin'] = gp2_bin['mask'].transform(chooser)

    # first calculate all of X'
    columns_to_make_noisy = set(TABLE_COLUMNS) | set(STATS_COLUMNS)
    # these don't exist yet
    columns_to_make_noisy.remove('num_returns_ItemDed')
    columns_to_make_noisy.remove('num_returns_StandardDed')
    columns_to_make_noisy.remove('num_returns_AMT')
    # flag == 1 keeps the df2 value; flag == 0 reverts to the df1 value
    for col in columns_to_make_noisy:
        df2[col + '_dec'] = (df2[col] * df2['flag_dec'] -
                             df1[col] * df2['flag_dec'] + df1[col])
        df2[col + '_bin'] = (df2[col] * df2['flag_bin'] -
                             df1[col] * df2['flag_bin'] + df1[col])

    # Difference in plans
    # Positive values are the magnitude of the tax increase
    # Negative values are the magnitude of the tax decrease
    df2['tax_diff_dec'] = df2['iitax_dec'] - df1['iitax']
    df2['tax_diff_bin'] = df2['iitax_bin'] - df1['iitax']
    df2['payrolltax_diff_dec'] = df2['payrolltax_dec'] - df1['payrolltax']
    df2['payrolltax_diff_bin'] = df2['payrolltax_bin'] - df1['payrolltax']
    df2['combined_diff_dec'] = df2['combined_dec'] - df1['combined']
    df2['combined_diff_bin'] = df2['combined_bin'] - df1['combined']

    return df1, df2