def test_add_income_bins():
    """Check tpc-style bin edges and closure side for right- and left-closed bins."""
    income = np.arange(1, 1e6, 5000)
    frame = pd.DataFrame(data=income, columns=['expanded_income'])
    edges = [-9e99, 0, 9999, 19999, 29999, 39999, 49999,
             74999, 99999, 200000, 9e99]
    for closed_right in (True, False):
        binned = add_income_bins(frame, 'expanded_income', bin_type='tpc',
                                 bins=None, right=closed_right)
        side = 'right' if closed_right else 'left'
        # enumerate from 1: each interval's right edge is the next list entry
        for pos, (interval, _) in enumerate(binned.groupby('bins'), start=1):
            assert interval.closed == side
            assert abs(interval.right - edges[pos]) < EPSILON
def dropq_dist_table(resdf, groupby, result_type, suffix):
    """
    Create and return dropq distribution table.

    Parameters
    ----------
    resdf: Pandas DataFrame
        results to tabulate; MODIFIED IN PLACE (several columns are
        overwritten/added below because ``res`` is an alias, not a copy).
    groupby: string
        one of 'weighted_deciles', 'small_income_bins',
        'large_income_bins', 'webapp_income_bins'.
    result_type: string
        'weighted_sum' or 'weighted_avg'.
    suffix: string
        appended to every column name read from or written to resdf.

    Returns
    -------
    Pandas DataFrame: one row per bin plus an appended sums row.

    Raises
    ------
    ValueError
        if groupby or result_type is not one of the allowed strings.
    """
    # pylint: disable=too-many-locals
    res = resdf  # alias -- mutations below are visible to the caller
    # column names for this run, with the caller-supplied suffix
    c04470_s = 'c04470' + suffix
    c00100_s = 'c00100' + suffix
    c09600_s = 'c09600' + suffix
    standard_s = 'standard' + suffix
    s006_s = 's006' + suffix
    returns_ided_s = 'num_returns_ItemDed' + suffix
    returns_sded_s = 'num_returns_StandardDed' + suffix
    returns_amt_s = 'num_returns_AMT' + suffix
    # zero out itemized deductions unless they exceed the standard deduction
    res[c04470_s] = res[c04470_s].where(
        ((res[c00100_s] > 0) & (res[c04470_s] > res[standard_s])), 0)
    # weighted return counts: itemizers, standard-deduction filers, AMT payers
    res[returns_ided_s] = res[s006_s].where(
        ((res[c00100_s] > 0) & (res[c04470_s] > 0)), 0)
    res[returns_sded_s] = res[s006_s].where(
        ((res[c00100_s] > 0) & (res[standard_s] > 0)), 0)
    res[returns_amt_s] = res[s006_s].where(res[c09600_s] > 0, 0)
    # attach a 'bins' column using the requested grouping scheme
    # NOTE(review): add_income_bins is called here with compare_with=...,
    # while the tests elsewhere in this file pass bin_type=... -- confirm
    # the helper accepts both keywords.
    if groupby == "weighted_deciles":
        dframe = add_weighted_income_bins(res, num_bins=10)
    elif groupby == "small_income_bins":
        dframe = add_income_bins(res, compare_with="soi")
    elif groupby == "large_income_bins":
        dframe = add_income_bins(res, compare_with="tpc")
    elif groupby == "webapp_income_bins":
        dframe = add_income_bins(res, compare_with="webapp")
    else:
        err = ("groupby must be either 'weighted_deciles' or "
               "'small_income_bins' or 'large_income_bins' or "
               "'webapp_income_bins'")
        raise ValueError(err)
    # NOTE(review): global pandas display option changed as a side effect
    pd.options.display.float_format = '{:8,.0f}'.format
    if result_type == "weighted_sum":
        dframe = weighted(dframe, [col + suffix for col in STATS_COLUMNS])
        gby_bins = dframe.groupby('bins', as_index=False)
        gp_mean = gby_bins[[col + suffix for col in TABLE_COLUMNS]].sum()
        gp_mean.drop('bins', axis=1, inplace=True)
        sum_row = get_sums(dframe)[[col + suffix for col in TABLE_COLUMNS]]
    elif result_type == "weighted_avg":
        gp_mean = weighted_avg_allcols(dframe,
                                       [col + suffix
                                        for col in TABLE_COLUMNS])
        all_sums = get_sums(dframe, not_available=True)
        sum_row = all_sums[[col + suffix for col in TABLE_COLUMNS]]
    else:
        err = ("result_type must be either 'weighted_sum' or "
               "'weighted_avg'")
        raise ValueError(err)
    # append the totals row beneath the per-bin rows
    return gp_mean.append(sum_row)
def test_add_income_bins_soi():
    """Check soi-style bin edges and closure side for right- and left-closed bins."""
    income = np.arange(1, 1e6, 5000)
    frame = pd.DataFrame(data=income, columns=['expanded_income'])
    edges = SMALL_INCOME_BINS
    for closed_right in (True, False):
        binned = add_income_bins(frame, 'expanded_income',
                                 bin_type='soi', right=closed_right)
        side = 'right' if closed_right else 'left'
        # enumerate from 1: each interval's right edge is the next list entry
        for pos, (interval, _) in enumerate(binned.groupby('bins'), start=1):
            assert interval.closed == side
            assert abs(interval.right - edges[pos]) < EPSILON
def dropq_diff_table(df1, df2, groupby, res_col, diff_col, suffix, wsum): """ Create and return dropq difference table. """ # pylint: disable=too-many-arguments,too-many-locals if groupby == "weighted_deciles": gdf = add_weighted_income_bins(df2, num_bins=10) elif groupby == "small_income_bins": gdf = add_income_bins(df2, compare_with="soi") elif groupby == "large_income_bins": gdf = add_income_bins(df2, compare_with="tpc") elif groupby == "webapp_income_bins": gdf = add_income_bins(df2, compare_with="webapp") else: err = ("groupby must be either 'weighted_deciles' or " "'small_income_bins' or 'large_income_bins' or " "'webapp_income_bins'") raise ValueError(err) # Difference in plans # Positive values are the magnitude of the tax increase # Negative values are the magnitude of the tax decrease df2[res_col + suffix] = df2[diff_col + suffix] - df1[diff_col] diffs = means_and_comparisons(res_col + suffix, gdf.groupby('bins', as_index=False), wsum + EPSILON) sum_row = get_sums(diffs)[diffs.columns] diffs = diffs.append(sum_row) # pylint: disable=redefined-variable-type pd.options.display.float_format = '{:8,.0f}'.format srs_inc = ["{0:.2f}%".format(val * 100) for val in diffs['perc_inc']] diffs['perc_inc'] = pd.Series(srs_inc, index=diffs.index) srs_cut = ["{0:.2f}%".format(val * 100) for val in diffs['perc_cut']] diffs['perc_cut'] = pd.Series(srs_cut, index=diffs.index) srs_change = [ "{0:.2f}%".format(val * 100) for val in diffs['share_of_change'] ] diffs['share_of_change'] = pd.Series(srs_change, index=diffs.index) # columns containing weighted values relative to the binning mechanism non_sum_cols = [x for x in diffs.columns if 'mean' in x or 'perc' in x] for col in non_sum_cols: diffs.loc['sums', col] = 'n/a' return diffs
def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz):
    """
    Fuzz some df2 records in each bin defined by bin_type and imeasure.

    The fuzzed records have their post-reform tax results (in df2) set to
    their pre-reform tax results (in df1).
    """
    # pylint: disable=too-many-arguments
    assert bin_type in ('dec', 'bin', 'agg')
    # attach a 'bins' column: deciles, webapp income bins, or one big bin
    if bin_type == 'bin':
        df2 = add_income_bins(df2, imeasure, bins=WEBAPP_INCOME_BINS)
    else:
        df2 = add_quantile_bins(df2, imeasure, 10 if bin_type == 'dec' else 1)
    # chooser marks, per bin, which records keep their df2 values (nofuzz==1)
    df2['nofuzz'] = df2.groupby('bins')['mask'].transform(chooser)
    for col in cols_to_fuzz:
        # nofuzz==1 keeps the df2 value; nofuzz==0 substitutes the df1 value
        df2[col + suffix] = (df2[col] * df2['nofuzz'] -
                             df1[col] * df2['nofuzz'] + df1[col])
def create(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing):
    """
    Create additional df2 columns.

    If do_fuzzing is True, also fuzz some df2 records in each bin defined
    by bin_type and imeasure, with the fuzzed records having their
    post-reform tax results (in df2) set to their pre-reform tax
    results (in df1).
    """
    # pylint: disable=too-many-arguments
    assert bin_type in ('dec', 'bin', 'agg')
    # attach a 'bins' column: deciles, standard income bins, or one big bin
    if bin_type == 'bin':
        df2 = add_income_bins(df2, imeasure, bins=STANDARD_INCOME_BINS)
    else:
        df2 = add_quantile_bins(df2, imeasure, 10 if bin_type == 'dec' else 1)
    if do_fuzzing:
        # chooser marks, per bin, which records keep their df2 values
        df2['nofuzz'] = df2.groupby('bins')['mask'].transform(chooser)
    else:  # never do any results fuzzing
        df2['nofuzz'] = np.ones(df2.shape[0], dtype=np.int8)
    for col in cols_to_fuzz:
        # nofuzz==1 keeps the df2 value; nofuzz==0 substitutes the df1 value
        df2[col + suffix] = (df2[col] * df2['nofuzz'] -
                             df1[col] * df2['nofuzz'] + df1[col])
def test_add_income_bins_raises():
    """An unrecognized bin_type must raise ValueError."""
    frame = pd.DataFrame(data=np.arange(1, 1e6, 5000),
                         columns=['expanded_income'])
    with pytest.raises(ValueError):
        add_income_bins(frame, 'expanded_income', bin_type='stuff')
def drop_records(df1, df2, mask): """ Modify df1 and df2 by adding statistical fuzz for data privacy. Parameters ---------- df1: Pandas DataFrame contains results for the standard plan X and X'. df2: Pandas DataFrame contains results for the user-specified plan (Plan Y). mask: boolean numpy array contains info about whether or not each element of X and X' are same Returns ------- fuzzed_df1: Pandas DataFrame fuzzed_df2: Pandas DataFrame Notes ----- This function groups both DataFrames based on the web application's income groupings (both weighted decile and income bins), and then pseudo-randomly picks three records to 'drop' within each bin. We keep track of the three dropped records in both group-by strategies and then use these 'flag' columns to modify all columns of interest, creating new '_dec' columns for statistics based on weighted deciles and '_bin' columns for statitistics based on income bins. Lastly we calculate individual income tax differences, payroll tax differences, and combined tax differences between the baseline and reform for the two groupings. 
""" # perform all statistics on (Y + X') - X # Group first df2['mask'] = mask df1['mask'] = mask df2 = add_weighted_income_bins(df2) df1 = add_weighted_income_bins(df1) gp2_dec = df2.groupby('bins') df2 = add_income_bins(df2, bins=WEBAPP_INCOME_BINS) df1 = add_income_bins(df1, bins=WEBAPP_INCOME_BINS) gp2_bin = df2.groupby('bins') # Transform to get the 'flag' column (3 choices to drop in each bin) df2['flag_dec'] = gp2_dec['mask'].transform(chooser) df2['flag_bin'] = gp2_bin['mask'].transform(chooser) # first calculate all of X' columns_to_make_noisy = set(TABLE_COLUMNS) | set(STATS_COLUMNS) # these don't exist yet columns_to_make_noisy.remove('num_returns_ItemDed') columns_to_make_noisy.remove('num_returns_StandardDed') columns_to_make_noisy.remove('num_returns_AMT') for col in columns_to_make_noisy: df2[col + '_dec'] = (df2[col] * df2['flag_dec'] - df1[col] * df2['flag_dec'] + df1[col]) df2[col + '_bin'] = (df2[col] * df2['flag_bin'] - df1[col] * df2['flag_bin'] + df1[col]) # Difference in plans # Positive values are the magnitude of the tax increase # Negative values are the magnitude of the tax decrease df2['tax_diff_dec'] = df2['iitax_dec'] - df1['iitax'] df2['tax_diff_bin'] = df2['iitax_bin'] - df1['iitax'] df2['payrolltax_diff_dec'] = df2['payrolltax_dec'] - df1['payrolltax'] df2['payrolltax_diff_bin'] = df2['payrolltax_bin'] - df1['payrolltax'] df2['combined_diff_dec'] = df2['combined_dec'] - df1['combined'] df2['combined_diff_bin'] = df2['combined_bin'] - df1['combined'] return df1, df2