def combine_base_reform(
    base: pd.DataFrame,
    reform: pd.DataFrame,
    base_cols: Optional[list] = None,
    cols: Optional[list] = None,
    reform_cols: Optional[list] = None,
) -> pd.DataFrame:
    """Combine base and reform with certain columns.

    :param base: Base DataFrame. Index must match reform.
    :type base: pd.DataFrame
    :param reform: Reform DataFrame. Index must match base.
    :type reform: pd.DataFrame
    :param base_cols: Columns in base to keep. Defaults to None.
    :type base_cols: list, optional
    :param cols: Columns to keep from both base and reform.
        Defaults to None.
    :type cols: list, optional
    :param reform_cols: Columns in reform to keep. Defaults to None.
    :type reform_cols: list, optional
    :returns: DataFrame joining the selected base columns with the selected
        reform columns. Column names selected on both sides are
        distinguished with the "_base" and "_reform" suffixes.
    :rtype: pd.DataFrame
    """
    # The column arguments are documented as optional, so they default to
    # None; mdf.listify is relied on to turn the (possibly None) pieces into
    # one flat list of column names, exactly as before.
    all_base_cols = mdf.listify([base_cols] + [cols])
    all_reform_cols = mdf.listify([reform_cols] + [cols])
    # Index-on-index join; the suffixes only come into play for column names
    # present in both selections (i.e. those passed through `cols`).
    return base[all_base_cols].join(
        reform[all_reform_cols], lsuffix="_base", rsuffix="_reform"
    )
def agg(base, reform, groupby, metrics, base_metrics=None, reform_metrics=None):
    """Aggregate base and reform by group and add percent-change columns.

    Args:
        base: Base DataFrame. Index must match reform.
        reform: Reform DataFrame. Index must match base.
        groupby: Variable in base to group on.
        metrics: List of variables to agg and calculate the % change of.
            These should have associated weighted columns ending in _m in
            base and reform.
        base_metrics: List of variables from base to sum. Defaults to None.
        reform_metrics: List of variables from reform to sum.
            Defaults to None.

    Returns:
        DataFrame indexed by groupby holding the summed columns plus one
        <metric>_pctchg column per metric.
    """
    metric_names = mdf.listify(metrics)
    # Weighted columns follow the "<metric>_m" naming convention.
    weighted_names = [name + "_m" for name in metric_names]

    # Work out which columns come from which side before combining.
    base_only = mdf.listify([groupby, base_metrics])
    shared = mdf.listify(weighted_names)
    reform_only = mdf.listify(reform_metrics)
    combined = combine_base_reform(
        base,
        reform,
        base_cols=base_only,
        cols=shared,
        reform_cols=reform_only,
    )

    totals = combined.groupby(groupby).sum()
    # Percent change is derived from the group totals, not row by row.
    for name in metric_names:
        totals[name + "_pctchg"] = pctchg_base_reform(totals, name)
    return totals
def agg(
    base: pd.DataFrame,
    reform: pd.DataFrame,
    groupby: str,
    metrics: list,
    base_metrics: Optional[list] = None,
    reform_metrics: Optional[list] = None,
) -> pd.DataFrame:
    """Aggregates differences between base and reform.

    :param base: Base DataFrame. Index must match reform.
    :type base: pd.DataFrame
    :param reform: Reform DataFrame. Index must match base.
    :type reform: pd.DataFrame
    :param groupby: Variable in base to group on.
    :type groupby: str
    :param metrics: List of variables to agg and calculate the % change of.
        These should have associated weighted columns ending in _m in base
        and reform.
    :type metrics: list
    :param base_metrics: List of variables from base to sum.
        Defaults to None.
    :type base_metrics: Optional[list]
    :param reform_metrics: List of variables from reform to sum.
        Defaults to None.
    :type reform_metrics: Optional[list]
    :returns: DataFrame with groupby and metrics, and _pctchg metrics.
    :rtype: pd.DataFrame
    """
    metrics = mdf.listify(metrics)
    # Weighted columns follow the "<metric>_m" naming convention.
    metrics_m = [i + "_m" for i in metrics]
    # The optional metric lists may be None; they are routed through
    # mdf.listify before being handed on, as in the original code.
    combined = combine_base_reform(
        base,
        reform,
        base_cols=mdf.listify([groupby, base_metrics]),
        cols=mdf.listify(metrics_m),
        reform_cols=mdf.listify(reform_metrics),
    )
    grouped = combined.groupby(groupby).sum()
    # Percent change is computed on the grouped totals, one column per metric.
    for metric in metrics:
        grouped[metric + "_pctchg"] = pctchg_base_reform(grouped, metric)
    return grouped
def combine_base_reform(base, reform, base_cols=None, cols=None, reform_cols=None):
    """Join selected columns of base with selected columns of reform.

    Args:
        base: Base DataFrame. Index must match reform.
        reform: Reform DataFrame. Index must match base.
        base_cols: Columns in base to keep.
        cols: Columns to keep from both base and reform.
        reform_cols: Columns in reform to keep.

    Returns:
        DataFrame with the kept base columns joined to the kept reform
        columns, using "_base" and "_reform" as the left and right suffixes.
    """
    # `cols` is requested from both frames, so it is part of each selection.
    base_keep = mdf.listify([base_cols, cols])
    reform_keep = mdf.listify([reform_cols, cols])

    base_part = base[base_keep]
    reform_part = reform[reform_keep]
    return base_part.join(reform_part, lsuffix="_base", rsuffix="_reform")
def add_weighted_metrics(df, metric_vars, w="s006", divisor=1e6, suffix="_m"):
    """Add scaled-weight versions of metric columns to a DataFrame in place.

    A scaled weight column named ``w + suffix`` is written first
    (``df[w] / divisor``); then, for each metric, a column named
    ``metric + suffix`` is written holding the metric times that scaled
    weight.

    :param df: A pandas DataFrame containing Tax-Calculator data.
    :param metric_vars: A list of column names to weight, or a single
        column name.
    :param w: Weight column. Defaults to s006.
    :param divisor: Number by which the product is divided. Defaults to 1e6.
    :param suffix: Suffix to add to each weighted total. Defaults to '_m'
        to match divisor default of 1e6.
    :returns: Nothing. Weighted columns are added in place.
    """
    weight_col = w + suffix
    df[weight_col] = df[w] / divisor
    for name in mdf.listify(metric_vars):
        # Read the scaled weight from df on every pass, as the original did.
        df[name + suffix] = df[name] * df[weight_col]
def ubi_or_bens(
    df,
    ben_cols,
    max_ubi="max_ubi",
    ubi="ubi",
    bens="bens",
    update_income_measures=None,
):
    """Calculates whether a tax unit will take UBI or benefits, and adjusts
    values accordingly.

    A row takes the UBI when its maximum UBI strictly exceeds the sum of its
    benefit columns; in that case its benefit columns are zeroed. Otherwise
    it keeps its benefits and its UBI is 0.

    :param df: DataFrame.
    :param ben_cols: List of columns for benefits.
    :param max_ubi: Column name of the maximum UBI, before accounting
        for benefits. Defaults to 'max_ubi'.
    :param ubi: Column name to add representing the UBI. Defaults to 'ubi'.
    :param bens: Column name to add representing total benefits (after
        adjustment). Defaults to 'bens'.
    :param update_income_measures: List of income measures to update.
        Defaults to ['expanded_income', 'aftertax_income'].
    :returns: Nothing. Benefits in ben_cols are adjusted, ubi and bens
        columns are added, and the income measures are updated according
        to the net difference.
    """
    if update_income_measures is None:
        update_income_measures = ["expanded_income", "aftertax_income"]
    # Prep list args.
    update_income_measures = mdf.listify(update_income_measures)
    # Benefit total *before* adjustment; needed later for the net change.
    total_bens = df[ben_cols].sum(axis=1)
    take_ubi = df[max_ubi] > total_bens
    df[ubi] = np.where(take_ubi, df[max_ubi], 0)
    # 0 for rows that take the UBI (benefits forfeited), 1 otherwise.
    # Computed once since it is the same for every benefit column.
    keep_bens = np.where(take_ubi, 0, 1)
    for ben in ben_cols:
        df[ben] *= keep_bens
    df[bens] = df[ben_cols].sum(axis=1)
    # Update income measures by the net change. Look the columns up by the
    # names the caller passed (not the hard-coded attributes .ubi / .bens),
    # so custom `ubi` / `bens` column names work.
    diff = df[ubi] + df[bens] - total_bens
    for i in update_income_measures:
        df[i] += diff
def calc_df(
    records=None,
    policy=None,
    year=2020,
    reform=None,
    group_vars=None,
    metric_vars=None,
    group_n65=False,
):
    """Creates a pandas DataFrame for given Tax-Calculator data.

    s006 is always included, and RECID is used as an index.

    :param records: An optional Records object. If not provided, uses CPS
        records. (Default value = None)
    :param policy: An optional Policy object. If not provided, uses default
        Policy.
    :param year: An optional year to advance to. If not provided, defaults
        to 2020.
    :param reform: An optional reform to implement for the Policy object.
        (Default value = None)
    :param group_vars: An optional list of column names to include in the
        DataFrame. (Default value = None)
    :param metric_vars: An optional list of column names to include and
        calculate weighted sums of (in millions named as *_m) in the
        DataFrame. (Default value = None)
    :param group_n65: Whether to calculate and group by n65. Defaults to
        False.
    :returns: A pandas DataFrame. market_income is also always calculated,
        along with bens and tax columns.
    """
    tc = import_optional_dependency("taxcalc")
    # Assign defaults.
    if records is None:
        records = tc.Records.cps_constructor()
    if policy is None:
        policy = tc.Policy()
    if reform is not None:
        # NOTE(review): when the caller passes their own policy, the reform
        # is implemented on that object - confirm this is intended.
        policy.implement_reform(reform)
    # Calculate.
    calc = tc.Calculator(records=records, policy=policy, verbose=False)
    calc.advance_to_year(year)
    calc.calc_all()
    # Columns needed only as inputs to n65; dropped again further down.
    n65_inputs = ["age_head", "age_spouse", "elderly_dependents"]
    if group_n65:
        # group_vars defaults to None, and None + list raises TypeError, so
        # start from an empty list in that case. `+` builds a new list, so
        # the caller's list is never modified.
        if group_vars is None:
            group_vars = []
        group_vars = group_vars + n65_inputs
    # Get a deduplicated list of all columns.
    # Include expanded_income and benefits to produce market_income.
    all_cols = mdf.listify(
        [
            "RECID",
            "s006",
            "expanded_income",
            "aftertax_income",
            mdf.BENS,
            group_vars,
            metric_vars,
        ]
    )
    df = calc.dataframe(all_cols)
    # Create core elements.
    df["market_income"] = mdf.market_income(df)
    df["bens"] = df[mdf.BENS].sum(axis=1)
    df["tax"] = df.expanded_income - df.aftertax_income
    if group_n65:
        df["n65"] = n65(df.age_head, df.age_spouse, df.elderly_dependents)
        df.drop(n65_inputs, axis=1, inplace=True)
    # Add calculated columns for metrics.
    mdf.add_weighted_metrics(df, metric_vars)
    # Set RECID to int and set it as index before returning.
    df["RECID"] = df.RECID.map(int)
    return df.set_index("RECID")
def calc_df(records=None, policy=None, year=2019, reform=None,
            group_vars=None, metric_vars=None, group_n65=False):
    """Creates a pandas DataFrame for given Tax-Calculator data.

    s006 is always included, and RECID is used as an index.

    Args:
        records: An optional Records object. If not provided, uses CPS
            records.
        policy: An optional Policy object. If not provided, uses default
            Policy.
        year: An optional year to advance to. If not provided, defaults to
            2019.
        reform: An optional reform to implement for the Policy object.
        group_vars: An optional list of column names to include in the
            DataFrame.
        metric_vars: An optional list of column names to include and
            calculate weighted sums of (in millions named as *_m) in the
            DataFrame.
        group_n65: Whether to calculate and group by n65. Defaults to False.

    Returns:
        A pandas DataFrame. market_income is also always calculated, along
        with bens and tax columns.
    """
    tc = import_optional_dependency("taxcalc")
    # Assign defaults.
    if records is None:
        records = tc.Records.cps_constructor()
    if policy is None:
        policy = tc.Policy()
    if reform is not None:
        # NOTE(review): when the caller passes their own policy, the reform
        # is implemented on that object - confirm this is intended.
        policy.implement_reform(reform)
    # Calculate.
    calc = tc.Calculator(records=records, policy=policy, verbose=False)
    calc.advance_to_year(year)
    calc.calc_all()
    # TODO: Make n65, ECI, etc. part of the list of columns you can request.
    # Columns needed only as inputs to n65; dropped again further down.
    n65_inputs = ['age_head', 'age_spouse', 'elderly_dependents']
    if group_n65:
        # group_vars defaults to None, and None + list raises TypeError, so
        # start from an empty list in that case. `+` builds a new list, so
        # the caller's list is never modified.
        if group_vars is None:
            group_vars = []
        group_vars = group_vars + n65_inputs
    # Get a deduplicated list of all columns.
    # Include expanded_income and benefits to produce market_income.
    all_cols = mdf.listify(
        ['RECID', 's006', 'expanded_income', 'aftertax_income', mdf.BENS,
         group_vars, metric_vars])
    df = calc.dataframe(all_cols)
    # Create core elements.
    df['market_income'] = mdf.market_income(df)
    df['bens'] = df[mdf.BENS].sum(axis=1)
    df['tax'] = df.expanded_income - df.aftertax_income
    if group_n65:
        df['n65'] = n65(df.age_head, df.age_spouse, df.elderly_dependents)
        df.drop(n65_inputs, axis=1, inplace=True)
    # Add calculated columns for metrics.
    mdf.add_weighted_metrics(df, metric_vars)
    # Set RECID to int and set it as index before returning.
    df['RECID'] = df.RECID.map(int)
    return df.set_index('RECID')