Example #1
0
def test_add_income_table_row_soi():
    """
    Check 'soi' table rows built with right-closed and left-closed bins.

    For each value of the right argument, every 'table_row' group key must
    be an interval closed on the expected side, and its right edge must
    match the corresponding SMALL_INCOME_BINS edge to within EPSILON.
    """
    incomes = np.arange(1, 1e6, 5000)
    dfx = pd.DataFrame(data=incomes, columns=['expanded_income'])

    edges = SMALL_INCOME_BINS
    for close_right, side in ((True, 'right'), (False, 'left')):
        binned = add_income_table_row_variable(dfx,
                                               'expanded_income',
                                               bin_type='soi',
                                               right=close_right)
        groups = binned.groupby('table_row')
        # group keys are compared with the bin edges starting at edges[1]
        for pos, (interval, _) in enumerate(groups, start=1):
            assert interval.closed == side
            assert abs(interval.right - edges[pos]) < EPSILON
Example #2
0
def test_add_income_trow_var_raises():
    """
    Expect ValueError when add_income_table_row_variable is called with
    bin_type='stuff'.
    """
    incomes = np.arange(1, 1e6, 5000)
    frame = pd.DataFrame(data=incomes, columns=['expanded_income'])
    with pytest.raises(ValueError):
        # the return value is irrelevant: the call itself is expected to raise
        add_income_table_row_variable(frame,
                                      'expanded_income',
                                      bin_type='stuff')
Example #3
0
 def create(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing):
     """
     Add a 'nofuzz' column and col+suffix results columns to df2.

     When bin_type is 'dec' or 'bin', df2 is grouped into table rows that
     are derived from imeasure; when bin_type is 'agg', df2 is used without
     grouping.  If do_fuzzing is True, the 'nofuzz' column is the result of
     transforming the (grouped) 'mask' column with chooser; otherwise it is
     all ones, so no results are fuzzed.  Each col in cols_to_fuzz then gets
     a col+suffix column computed as
         df2[col] * nofuzz - df1[col] * nofuzz + df1[col]
     which is the reform (df2) value where nofuzz is one and the baseline
     (df1) value where nofuzz is zero.
     """
     # pylint: disable=too-many-arguments
     assert bin_type in ('dec', 'bin', 'agg')
     if bin_type in ('dec', 'bin'):
         if bin_type == 'dec':
             df2 = add_quantile_table_row_variable(df2,
                                                   imeasure,
                                                   10,
                                                   decile_details=True)
         else:
             df2 = add_income_table_row_variable(df2,
                                                 imeasure,
                                                 bins=STANDARD_INCOME_BINS)
         # build the grouping before dropping the helper column from df2
         gdf2 = df2.groupby('table_row')
         del df2['table_row']
     else:
         gdf2 = df2
     if not do_fuzzing:
         # never do any results fuzzing
         df2['nofuzz'] = np.ones(df2.shape[0], dtype=np.int8)
     else:
         df2['nofuzz'] = gdf2['mask'].transform(chooser)
     for col in cols_to_fuzz:
         keep = df2['nofuzz']
         reform_part = df2[col] * keep
         baseline_part = df1[col] * keep
         df2[col + suffix] = reform_part - baseline_part + df1[col]
Example #4
0
def test_add_income_trow_var():
    """
    Check table rows built from SOI_AGI_BINS passed positionally.

    Every 'table_row' group key must be a left-closed interval whose right
    edge matches the corresponding SOI_AGI_BINS edge to within EPSILON.
    """
    incomes = np.arange(1, 1e6, 5000)
    frame = pd.DataFrame(data=incomes, columns=['expanded_income'])
    frame = add_income_table_row_variable(frame,
                                          'expanded_income',
                                          SOI_AGI_BINS)
    groups = frame.groupby('table_row')
    # group keys are compared with the bin edges starting at SOI_AGI_BINS[1]
    for pos, (interval, _) in enumerate(groups, start=1):
        assert interval.closed == 'left'
        assert abs(interval.right - SOI_AGI_BINS[pos]) < EPSILON
def test_add_income_trow_var():
    """
    Verify the intervals produced when SOI_AGI_BINS is the third argument.

    The 'table_row' group keys are collected first; each one must then be
    closed on the left and have a right edge within EPSILON of the matching
    SOI_AGI_BINS edge.
    """
    values = np.arange(1, 1e6, 5000)
    vdf = add_income_table_row_variable(
        pd.DataFrame(data=values, columns=['expanded_income']),
        'expanded_income',
        SOI_AGI_BINS
    )
    row_keys = [key for key, _ in vdf.groupby('table_row')]
    # the first group key corresponds to bin edge number one, not zero
    for offset, key in enumerate(row_keys):
        assert key.closed == 'left'
        assert abs(key.right - SOI_AGI_BINS[offset + 1]) < EPSILON
Example #6
0
def test_add_income_trow_var():
    """
    Check table rows built from an explicit bins list, closed on each side.

    With right=True the 'table_row' group keys must be right-closed
    intervals and with right=False they must be left-closed; in both cases
    each right edge must match the corresponding bins edge within EPSILON.
    """
    incomes = np.arange(1, 1e6, 5000)
    dfx = pd.DataFrame(data=incomes, columns=['expanded_income'])
    bins = [-9e99, 0, 4999, 9999, 14999, 19999, 29999, 32999, 43999, 9e99]

    def check_rows(close_right, expected_side):
        """Bin dfx with the given right setting and check each group key."""
        binned = add_income_table_row_variable(dfx,
                                               'expanded_income',
                                               bins=bins,
                                               right=close_right)
        groups = binned.groupby('table_row')
        for pos, (interval, _) in enumerate(groups, start=1):
            assert interval.closed == expected_side
            assert abs(interval.right - bins[pos]) < EPSILON

    check_rows(True, 'right')
    check_rows(False, 'left')
Example #7
0
def fuzzed(df1, df2, reform_affected, table_row_type):
    """
    Create fuzzed df2 dataframe and corresponding unfuzzed df1 dataframe.

    Parameters
    ----------
    df1: Pandas DataFrame
        contains results variables for the baseline policy, which are not
        changed by this function

    df2: Pandas DataFrame
        contains results variables for the reform policy, which are not
        changed by this function

    reform_affected: boolean numpy array (not changed by this function)
        True for filing units with a reform-induced combined tax difference;
        otherwise False

    table_row_type: string
        valid values are 'aggr', 'xbin', and 'xdec'

    Returns
    -------
    df1, df2: Pandas DataFrames
        where copied df2 is fuzzed to maintain data privacy and
        where copied df1 has same filing unit order as has the fuzzed df2

    Notes
    -----
    The filing units to fuzz are drawn with np.random.choice, so the result
    depends on the state of numpy's global random number generator.

    The pandas 'mode.chained_assignment' option is switched off only while
    the groups are being fuzzed and is put back to whatever value it had
    before the call, even when an exception is raised during fuzzing.
    """
    assert table_row_type in ('aggr', 'xbin', 'xdec')
    assert len(df1.index) == len(df2.index)
    assert reform_affected.size == len(df1.index)
    # work on copies so that the caller's dataframes are left untouched
    df1 = copy.deepcopy(df1)
    df2 = copy.deepcopy(df2)
    # add copy of reform_affected to df2
    df2['reform_affected'] = copy.deepcopy(reform_affected)
    # construct table rows, for which filing units in each row must be fuzzed;
    # df2 rows are always derived from the baseline (df1) expanded_income
    if table_row_type == 'xbin':
        df1 = add_income_table_row_variable(df1, 'expanded_income',
                                            STANDARD_INCOME_BINS)
        df2['expanded_income_baseline'] = df1['expanded_income']
        df2 = add_income_table_row_variable(df2, 'expanded_income_baseline',
                                            STANDARD_INCOME_BINS)
        del df2['expanded_income_baseline']
    elif table_row_type == 'xdec':
        df1 = add_quantile_table_row_variable(df1,
                                              'expanded_income',
                                              10,
                                              decile_details=True)
        df2['expanded_income_baseline'] = df1['expanded_income']
        df2 = add_quantile_table_row_variable(df2,
                                              'expanded_income_baseline',
                                              10,
                                              decile_details=True)
        del df2['expanded_income_baseline']
    elif table_row_type == 'aggr':
        # a single table row containing every filing unit
        df1['table_row'] = np.ones(reform_affected.shape, dtype=int)
        df2['table_row'] = df1['table_row']
    gdf1 = df1.groupby('table_row', sort=False)
    gdf2 = df2.groupby('table_row', sort=False)
    del df1['table_row']
    del df2['table_row']
    # fuzz up to NUM_TO_FUZZ filing units randomly chosen in each group
    # (or table row), where fuzz means to replace the reform (2) results
    # with the baseline (1) results for each chosen filing unit
    group_list = []
    # the context manager restores the caller's chained_assignment setting
    # on exit, including when the fuzzing loop raises
    with pd.option_context('mode.chained_assignment', None):
        for name, group2 in gdf2:
            # positions (within this group) of the reform-affected units
            indices = np.where(group2['reform_affected'])
            num = min(len(indices[0]), NUM_TO_FUZZ)
            if num > 0:
                choices = np.random.choice(
                    indices[0],  # pylint: disable=no-member
                    size=num,
                    replace=False)
                group1 = gdf1.get_group(name)
                for idx in choices:
                    group2.iloc[idx] = group1.iloc[idx]
            group_list.append(group2)
        df2 = pd.concat(group_list)
        del df2['reform_affected']
    # reinstate index order of df1 and df2 and return
    df1.sort_index(inplace=True)
    df2.sort_index(inplace=True)
    return (df1, df2)
Example #8
0
def summary(df1, df2, mask):
    """
    Build the dictionary of summary results DataFrames.

    Parameters
    ----------
    df1: Pandas DataFrame
        contains raw results for baseline plan

    df2: Pandas DataFrame
        contains raw results for reform plan

    mask: boolean array
        specifies records with reform-induced tax diffs

    Returns
    -------
    dictionary of summary results DataFrames with these keys:
        'aggr_d', 'aggr_1', 'aggr_2':
            weighted iitax, payrolltax and combined totals for the
            reform-minus-baseline difference, the baseline and the reform
        'diff_{itax,ptax,comb}_{xdec,xbin}':
            difference tables by weighted deciles (xdec) and by standard
            income bins (xbin)
        'dist{1,2}_{xdec,xbin}':
            distribution tables for the baseline (1) and the reform (2)

    Notes
    -----
    Every xbin table is built from the xbin baseline frame and every xdec
    table from the xdec baseline frame.
    """
    # pylint: disable=too-many-statements,too-many-locals

    # (results column name, abbreviation used in the summary keys)
    taxes = (('iitax', 'itax'),
             ('payrolltax', 'ptax'),
             ('combined', 'comb'))

    df2_xdec, df2_xbin, df2_aggr = create_results_columns(df1, df2, mask)
    df1_xdec = add_quantile_table_row_variable(df1,
                                               'expanded_income',
                                               10,
                                               decile_details=True)
    del df1_xdec['table_row']
    df1_xbin = add_income_table_row_variable(df1,
                                             'expanded_income',
                                             bins=STANDARD_INCOME_BINS)
    del df1_xbin['table_row']

    summ = dict()

    # tax difference totals between reform and baseline
    aggrd = [((df2_aggr[tax + '_agg'] - df1[tax]) * df2['s006']).sum()
             for tax, _ in taxes]
    summ['aggr_d'] = pd.DataFrame(data=aggrd, index=AGGR_ROW_NAMES)

    # totals for baseline
    aggr1 = [(df1[tax] * df1['s006']).sum() for tax, _ in taxes]
    summ['aggr_1'] = pd.DataFrame(data=aggr1, index=AGGR_ROW_NAMES)

    # totals for reform
    aggr2 = [(df2_aggr[tax + '_agg'] * df2['s006']).sum()
             for tax, _ in taxes]
    summ['aggr_2'] = pd.DataFrame(data=aggr2, index=AGGR_ROW_NAMES)

    del df1
    del df2

    # each table grouping pairs its own baseline frame with its own reform
    # frame; stating the pairing once keeps the xbin tables from being
    # built with the xdec baseline frame
    groupings = (('xdec', 'weighted_deciles', df1_xdec, df2_xdec),
                 ('xbin', 'standard_income_bins', df1_xbin, df2_xbin))

    # create difference tables grouped by xdec and then by xbin
    for kind, groupby, df1_kind, df2_kind in groupings:
        for tax, abbr in taxes:
            # expose the results column under its root name
            df2_kind[tax] = df2_kind[tax + '_' + kind]
            summ['diff_' + abbr + '_' + kind] = \
                create_difference_table(df1_kind, df2_kind,
                                        groupby=groupby,
                                        income_measure='expanded_income',
                                        tax_to_diff=tax)

    # create distribution tables grouped by xdec and then by xbin
    for kind, groupby, df1_kind, df2_kind in groupings:
        summ['dist1_' + kind] = \
            create_distribution_table(df1_kind, groupby=groupby,
                                      income_measure='expanded_income',
                                      result_type='weighted_sum')
        # copy every suffixed results column to its root column name
        suffix = '_' + kind
        cols_with_suffix = [c for c in list(df2_kind) if c.endswith(suffix)]
        for col in cols_with_suffix:
            df2_kind[col.replace(suffix, '')] = df2_kind[col]
        # reform results are tabulated by the baseline income measure
        df2_kind['expanded_income_baseline'] = df1_kind['expanded_income']
        summ['dist2_' + kind] = \
            create_distribution_table(
                df2_kind, groupby=groupby,
                income_measure='expanded_income_baseline',
                result_type='weighted_sum')

    # return dictionary of summary results
    return summ