def test_add_income_table_row_soi():
    """
    Check table rows built with bin_type='soi' for both interval closures.

    For right=True and then right=False, every 'table_row' group key must
    be an interval closed on the requested side whose right edge equals
    the corresponding SMALL_INCOME_BINS entry (within EPSILON).
    """
    incomes = np.arange(1, 1e6, 5000)
    frame = pd.DataFrame(data=incomes, columns=['expanded_income'])
    edges = SMALL_INCOME_BINS
    for use_right, closed_side in ((True, 'right'), (False, 'left')):
        binned = add_income_table_row_variable(frame, 'expanded_income',
                                               bin_type='soi',
                                               right=use_right)
        groups = binned.groupby('table_row')
        # group keys are compared against edges[1], edges[2], ... in order
        for pos, (interval, _) in enumerate(groups, start=1):
            assert interval.closed == closed_side
            assert abs(interval.right - edges[pos]) < EPSILON
def test_add_income_trow_var_raises():
    """
    Check that an unrecognized bin_type value raises ValueError.
    """
    incomes = np.arange(1, 1e6, 5000)
    frame = pd.DataFrame(data=incomes, columns=['expanded_income'])
    with pytest.raises(ValueError):
        # 'stuff' is the deliberately invalid bin_type being tested
        add_income_table_row_variable(frame, 'expanded_income',
                                      bin_type='stuff')
def create(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing):
    """
    Add a 'nofuzz' column and suffixed results columns to df2.

    For bin_type 'dec' or 'bin', df2 is first given table rows based on
    imeasure (deciles or STANDARD_INCOME_BINS respectively) and grouped by
    them; for any other value df2 is used ungrouped.  When do_fuzzing is
    True, 'nofuzz' comes from applying chooser to the 'mask' column of
    each group; otherwise 'nofuzz' is one for every record.  For each col
    in cols_to_fuzz, the new column named col+suffix holds the df2 value
    where 'nofuzz' is one and the df1 (pre-reform) value where 'nofuzz'
    is zero.

    Nothing is returned; the new columns are assigned on df2.
    NOTE(review): for 'dec' and 'bin' this relies on the
    add_*_table_row_variable helpers returning the df2 object that was
    passed in -- confirm against those helpers.
    """
    # pylint: disable=too-many-arguments
    assert bin_type in ('dec', 'bin', 'agg')
    has_table_rows = True
    if bin_type == 'dec':
        df2 = add_quantile_table_row_variable(df2, imeasure, 10,
                                              decile_details=True)
    elif bin_type == 'bin':
        df2 = add_income_table_row_variable(df2, imeasure,
                                            bins=STANDARD_INCOME_BINS)
    else:
        has_table_rows = False
    if has_table_rows:
        grouped = df2.groupby('table_row')
        del df2['table_row']
    else:
        grouped = df2
    if do_fuzzing:
        keep_flags = grouped['mask'].transform(chooser)
    else:
        # never do any results fuzzing
        keep_flags = np.ones(df2.shape[0], dtype=np.int8)
    df2['nofuzz'] = keep_flags
    for col in cols_to_fuzz:
        reform_part = df2[col] * df2['nofuzz']
        baseline_part = df1[col] * df2['nofuzz']
        df2[col + suffix] = reform_part - baseline_part + df1[col]
def test_add_income_trow_var():
    """
    Check table rows built from SOI_AGI_BINS passed positionally.

    Every 'table_row' group key must be a left-closed interval whose
    right edge equals the corresponding SOI_AGI_BINS entry (within
    EPSILON).
    """
    # NOTE(review): a later definition with this same name is visible in
    # this file; if both live at module level the later one replaces this
    # one and pytest would not collect this test -- confirm and rename.
    incomes = np.arange(1, 1e6, 5000)
    frame = pd.DataFrame(data=incomes, columns=['expanded_income'])
    frame = add_income_table_row_variable(frame, 'expanded_income',
                                          SOI_AGI_BINS)
    groups = frame.groupby('table_row')
    # group keys are compared against SOI_AGI_BINS[1], [2], ... in order
    for pos, (interval, _) in enumerate(groups, start=1):
        assert interval.closed == 'left'
        assert abs(interval.right - SOI_AGI_BINS[pos]) < EPSILON
def test_add_income_trow_var():
    """
    Check table rows built from an explicit bins list for both closures.

    For right=True and then right=False, every 'table_row' group key must
    be an interval closed on the requested side whose right edge equals
    the corresponding entry of the bins list (within EPSILON).
    """
    incomes = np.arange(1, 1e6, 5000)
    frame = pd.DataFrame(data=incomes, columns=['expanded_income'])
    edges = [-9e99, 0, 4999, 9999, 14999, 19999,
             29999, 32999, 43999, 9e99]
    for use_right, closed_side in ((True, 'right'), (False, 'left')):
        binned = add_income_table_row_variable(frame, 'expanded_income',
                                               bins=edges,
                                               right=use_right)
        groups = binned.groupby('table_row')
        # group keys are compared against edges[1], edges[2], ... in order
        for pos, (interval, _) in enumerate(groups, start=1):
            assert interval.closed == closed_side
            assert abs(interval.right - edges[pos]) < EPSILON
def fuzzed(df1, df2, reform_affected, table_row_type):
    """
    Create fuzzed df2 dataframe and corresponding unfuzzed df1 dataframe.

    Parameters
    ----------
    df1: Pandas DataFrame
        contains results variables for the baseline policy, which are not
        changed by this function

    df2: Pandas DataFrame
        contains results variables for the reform policy, which are not
        changed by this function

    reform_affected: boolean numpy array (not changed by this function)
        True for filing units with a reform-induced combined tax
        difference; otherwise False

    table_row_type: string
        valid values are 'aggr', 'xbin', and 'xdec'

    Returns
    -------
    df1, df2: Pandas DataFrames
        where copied df2 is fuzzed to maintain data privacy and
        where copied df1 has same filing unit order as has the fuzzed df2

    Notes
    -----
    The pandas 'mode.chained_assignment' option is switched off only for
    the duration of the fuzzing loop and is then restored to whatever
    value it had on entry, even if the loop raises.
    """
    assert table_row_type in ('aggr', 'xbin', 'xdec')
    assert len(df1.index) == len(df2.index)
    assert reform_affected.size == len(df1.index)
    # work on copies so that the caller's objects are left untouched
    df1 = copy.deepcopy(df1)
    df2 = copy.deepcopy(df2)
    # add copy of reform_affected to df2
    df2['reform_affected'] = copy.deepcopy(reform_affected)
    # construct table rows, for which filing units in each row must be fuzzed
    if table_row_type == 'xbin':
        df1 = add_income_table_row_variable(df1, 'expanded_income',
                                            STANDARD_INCOME_BINS)
        # df2 rows are defined using the baseline expanded_income values
        df2['expanded_income_baseline'] = df1['expanded_income']
        df2 = add_income_table_row_variable(df2, 'expanded_income_baseline',
                                            STANDARD_INCOME_BINS)
        del df2['expanded_income_baseline']
    elif table_row_type == 'xdec':
        df1 = add_quantile_table_row_variable(df1, 'expanded_income',
                                              10, decile_details=True)
        # df2 rows are defined using the baseline expanded_income values
        df2['expanded_income_baseline'] = df1['expanded_income']
        df2 = add_quantile_table_row_variable(df2, 'expanded_income_baseline',
                                              10, decile_details=True)
        del df2['expanded_income_baseline']
    elif table_row_type == 'aggr':
        # a single table row containing every filing unit
        df1['table_row'] = np.ones(reform_affected.shape, dtype=int)
        df2['table_row'] = df1['table_row']
    gdf1 = df1.groupby('table_row', sort=False)
    gdf2 = df2.groupby('table_row', sort=False)
    del df1['table_row']
    del df2['table_row']
    # fuzz up to NUM_TO_FUZZ filing units randomly chosen in each group
    # (or table row), where fuzz means to replace the reform (2) results
    # with the baseline (1) results for each chosen filing unit
    with pd.option_context('mode.chained_assignment', None):
        group_list = []
        for name, group2 in gdf2:
            # positions (within this group) of the reform-affected units
            affected = np.where(group2['reform_affected'])[0]
            num = min(len(affected), NUM_TO_FUZZ)
            if num > 0:
                choices = np.random.choice(
                    affected,  # pylint: disable=no-member
                    size=num, replace=False)
                group1 = gdf1.get_group(name)
                for idx in choices:
                    group2.iloc[idx] = group1.iloc[idx]
            # every group is kept, whether or not any unit was fuzzed
            group_list.append(group2)
        df2 = pd.concat(group_list)
        del df2['reform_affected']
    # reinstate index order of df1 and df2 and return
    df1.sort_index(inplace=True)
    df2.sort_index(inplace=True)
    return (df1, df2)
def summary(df1, df2, mask):
    """
    Build the dictionary of summary results tables for a reform.

    Parameters
    ----------
    df1: Pandas DataFrame
        contains raw results for baseline plan

    df2: Pandas DataFrame
        contains raw results for reform plan

    mask: boolean array
        specifies records with reform-induced tax diffs; it is passed
        to create_results_columns

    Returns
    -------
    summ: dictionary of summary results DataFrames with these keys:
        'aggr_d', 'aggr_1', 'aggr_2':
            difference, baseline and reform totals of iitax, payrolltax
            and combined (each value times 's006', then summed), indexed
            by AGGR_ROW_NAMES
        'diff_{itax,ptax,comb}_{xdec,xbin}':
            difference tables from create_difference_table
        'dist{1,2}_{xdec,xbin}':
            baseline (1) and reform (2) distribution tables from
            create_distribution_table
    """
    # pylint: disable=too-many-statements,too-many-locals
    # (results column name, abbreviation used in summ keys) for each tax
    taxes = (('iitax', 'itax'),
             ('payrolltax', 'ptax'),
             ('combined', 'comb'))

    df2_xdec, df2_xbin, df2_aggr = create_results_columns(df1, df2, mask)
    df1_xdec = add_quantile_table_row_variable(df1, 'expanded_income',
                                               10, decile_details=True)
    del df1_xdec['table_row']
    df1_xbin = add_income_table_row_variable(df1, 'expanded_income',
                                             bins=STANDARD_INCOME_BINS)
    del df1_xbin['table_row']

    # (row kind, groupby value, baseline frame, reform frame); keeping each
    # baseline frame paired with its reform frame here guarantees that the
    # xbin tables are never built from the xdec baseline frame
    row_kinds = (('xdec', 'weighted_deciles', df1_xdec, df2_xdec),
                 ('xbin', 'standard_income_bins', df1_xbin, df2_xbin))

    summ = dict()

    # tax difference totals between reform and baseline
    aggrd = []
    for tax, _ in taxes:
        tdiff = df2_aggr[tax + '_agg'] - df1[tax]
        aggrd.append((tdiff * df2['s006']).sum())
    summ['aggr_d'] = pd.DataFrame(data=aggrd, index=AGGR_ROW_NAMES)

    # totals for baseline
    aggr1 = []
    for tax, _ in taxes:
        aggr1.append((df1[tax] * df1['s006']).sum())
    summ['aggr_1'] = pd.DataFrame(data=aggr1, index=AGGR_ROW_NAMES)

    # totals for reform
    aggr2 = []
    for tax, _ in taxes:
        aggr2.append((df2_aggr[tax + '_agg'] * df2['s006']).sum())
    summ['aggr_2'] = pd.DataFrame(data=aggr2, index=AGGR_ROW_NAMES)

    # the raw frames are not referenced by name below this point
    del df1
    del df2

    # create difference tables grouped by xdec and then by xbin
    for kind, groupby, base_rows, reform_rows in row_kinds:
        for tax, abbr in taxes:
            # expose the row-kind-specific results under the plain tax name
            reform_rows[tax] = reform_rows[tax + '_' + kind]
            summ['diff_' + abbr + '_' + kind] = \
                create_difference_table(base_rows, reform_rows,
                                        groupby=groupby,
                                        income_measure='expanded_income',
                                        tax_to_diff=tax)

    # create distribution tables grouped by xdec and then by xbin
    for kind, groupby, base_rows, reform_rows in row_kinds:
        summ['dist1_' + kind] = \
            create_distribution_table(base_rows,
                                      groupby=groupby,
                                      income_measure='expanded_income',
                                      result_type='weighted_sum')
        suffix = '_' + kind
        # snapshot the suffixed columns before adding the unsuffixed copies
        suffixed_cols = [c for c in list(reform_rows) if c.endswith(suffix)]
        for col in suffixed_cols:
            # strip only the trailing suffix, not every occurrence of it
            reform_rows[col[:-len(suffix)]] = reform_rows[col]
        reform_rows['expanded_income_baseline'] = base_rows['expanded_income']
        summ['dist2_' + kind] = \
            create_distribution_table(
                reform_rows,
                groupby=groupby,
                income_measure='expanded_income_baseline',
                result_type='weighted_sum')

    # return dictionary of summary results
    return summ