def test_validity_of_name_lists():
    """
    Check that the distribution-table name lists agree with each other
    and with the variables known to the Records class.
    """
    # there must be exactly one label per table column
    assert len(DIST_TABLE_COLUMNS) == len(DIST_TABLE_LABELS)
    # every distribution variable must be a calculated variable or s006
    Records.read_var_info()
    allowed_vars = Records.CALCULATED_VARS | {'s006'}
    assert set(DIST_VARIABLES) <= allowed_vars
    # the only table columns that are not distribution variables are
    # the three num_returns_* columns
    columns_only = set(DIST_TABLE_COLUMNS) - set(DIST_VARIABLES)
    assert columns_only == {'num_returns_StandardDed',
                            'num_returns_ItemDed',
                            'num_returns_AMT'}
def test_calc_and_used_vars(tests_path):
    """
    Runs two kinds of tests on variables used in the calcfunctions.py file:

    (1) Checks that each var in Records.CALCULATED_VARS is actually calculated

    If test (1) fails, a variable in Records.CALCULATED_VARS was not
    calculated in any function in the calcfunctions.py file.  With the
    exception of a few variables listed in this test, all
    Records.CALCULATED_VARS must be calculated in the calcfunctions.py file.

    (2) Check that each variable that is calculated in a function and
    returned by that function is an argument of that function.

    Raises ValueError, with a message listing every problem found, if
    either test fails.
    """
    # pylint: disable=too-many-locals
    funcpath = os.path.join(tests_path, '..', 'calcfunctions.py')
    # read the source inside a with-block so the file handle is closed
    with open(funcpath) as funcfile:
        funcsource = funcfile.read()
    gfd = GetFuncDefs()
    fnames, fargs, cvars, rvars = gfd.visit(ast.parse(funcsource))
    error_msgs = []
    # Test (1):
    # .. create set of vars that are actually calculated in calcfunctions.py
    all_cvars = set()
    for fname in fnames:
        if fname == 'BenefitSurtax':
            continue  # because BenefitSurtax is not really a function
        all_cvars.update(cvars[fname])
    # .. add to all_cvars set variables calculated in Records class
    all_cvars.update({'num', 'sep', 'exact'})
    # .. add to all_cvars set variables calculated elsewhere
    all_cvars.update({'mtr_paytax', 'mtr_inctax'})
    all_cvars.update({'benefit_cost_total', 'benefit_value_total'})
    # .. check that each var in Records.CALCULATED_VARS is in the all_cvars set
    Records.read_var_info()
    uncalculated = Records.CALCULATED_VARS - all_cvars
    if uncalculated:
        msg1 = ('all Records.CALCULATED_VARS not calculated '
                'in calcfunctions.py\n')
        # sorted so that the error message is the same on every run
        for var in sorted(uncalculated):
            msg1 += 'VAR NOT CALCULATED: {}\n'.format(var)
        error_msgs.append(msg1)
    # Test (2):
    faux_functions = ['EITCamount', 'ComputeBenefit', 'BenefitPrograms',
                      'BenefitSurtax', 'BenefitLimitation']
    found_error2 = False
    msg2 = 'calculated & returned variables are not function arguments\n'
    for fname in fnames:
        if fname in faux_functions:
            continue  # because fname is not a genuine function
        crvars_set = set(cvars[fname]) & set(rvars[fname])
        for var in sorted(crvars_set - set(fargs[fname])):
            found_error2 = True
            msg2 += 'FUNCTION,VARIABLE: {} {}\n'.format(fname, var)
    if found_error2:
        error_msgs.append(msg2)
    # Report errors for the two tests (messages separated by a newline):
    if error_msgs:
        raise ValueError('\n'.join(error_msgs))
def test_validity_of_name_lists():
    """
    Verify that the distribution-table column, label, and variable lists
    are mutually consistent.
    """
    num_columns = len(DIST_TABLE_COLUMNS)
    num_labels = len(DIST_TABLE_LABELS)
    assert num_columns == num_labels
    # variable information must be read before CALCULATED_VARS is used
    Records.read_var_info()
    permitted = Records.CALCULATED_VARS | {'s006'}
    assert set(DIST_VARIABLES) <= permitted
    # table columns that have no counterpart in DIST_VARIABLES
    expected_extras = {
        'num_returns_StandardDed',
        'num_returns_ItemDed',
        'num_returns_AMT',
    }
    actual_extras = set(DIST_TABLE_COLUMNS) - set(DIST_VARIABLES)
    assert actual_extras == expected_extras
def main(filename, recid, input_vars_only, transpose):
    """
    Contains high-level logic of the script.

    Reads the CSV-formatted FILE, extracts the row(s) whose RECID equals
    recid, removes variables whose value in the first extracted row is
    zero, and writes the result to a new CSV-formatted file.

    Parameters
    ----------
    filename: string
        name of CSV file to read; its last four characters are removed
        when constructing the output file name

    recid: value compared with the RECID column of FILE

    input_vars_only: boolean
        if true, keep only columns named in Records.USABLE_READ_VARS

    transpose: boolean
        if true, write the string returned by transposed() to a file whose
        name ends in 'T.csv'; otherwise write a CSV file with sorted columns

    Returns
    -------
    0 on success; 1 (after writing a message to stderr) if FILE lacks a
    required variable or contains no row with the specified recid
    """
    # read all file content into Pandas DataFrame
    adf = pd.read_csv(filename)
    adf_vars = set(adf.columns)  # pylint: disable=no-member
    # check that FILE contains required tax variables
    required_input_vars = {'RECID', 'MARS'}
    required_input_vars_str = 'RECID, MARS'
    if not required_input_vars.issubset(adf_vars):
        msg = 'ERROR: FILE does not include required input variables: {}\n'
        sys.stderr.write(msg.format(required_input_vars_str))
        return 1
    # check that RECID actually identifies a filing unit in FILE
    if recid not in adf['RECID'].values:
        msg = 'ERROR: RECID={} not in FILE\n'
        sys.stderr.write(msg.format(recid))
        return 1
    # extract the adf row with specified recid as an independent copy
    # (explicit copy replaces use of the deprecated is_copy attribute)
    edf = adf[adf['RECID'] == recid].copy()
    kept_columns = list(edf.columns)
    # optionally remove all but Tax-Calculator usable input variables
    if input_vars_only:
        Records.read_var_info()
        kept_columns = [colname for colname in kept_columns
                        if colname in Records.USABLE_READ_VARS]
    # remove all zero-valued variables
    kept_columns = [colname for colname in kept_columns
                    if edf[colname].iloc[0] != 0]
    # select surviving columns once instead of dropping them one by one
    edf = edf[kept_columns]
    # write edf to CSV-formatted output file
    if transpose:
        ofilename = '{}-{}T.csv'.format(filename[:-4], recid)
        tstr = transposed(edf)
        with open(ofilename, 'w') as ofile:
            ofile.write(tstr)
    else:
        ofilename = '{}-{}.csv'.format(filename[:-4], recid)
        edf.to_csv(path_or_buf=ofilename, columns=sorted(edf.columns),
                   index=False, float_format='%.2f')
    sys.stdout.write('EXTRACT IN {}\n'.format(ofilename))
    # normal return code
    return 0
def _calc_object(self, exact_calcs,
                 emulate_taxsim_2441_logic,
                 output_records):
    """
    Build the Calculator object used to conduct the tax calculations.

    Parameters
    ----------
    exact_calcs: boolean
        passed to Records as its exact_calculations argument

    emulate_taxsim_2441_logic: boolean
        passed to SimpleTaxIO._specify_input for each filing unit

    output_records: boolean
        if true, write Records.USABLE_READ_VARS content to a CSV file

    Returns
    -------
    calc: Calculator
    """
    Records.read_var_info()
    num_units = len(self._input)
    # one all-zeros row, repeated once per filing unit
    zero_row = dict.fromkeys(Records.USABLE_READ_VARS, 0)
    all_zero_rows = [zero_row] * num_units
    # turn the rows into a Pandas DataFrame and then a Records object
    recsdf = pd.DataFrame(all_zero_rows, dtype='int64')
    recsdf['MARS'] = recsdf['MARS'].add(1)  # because MARS==0 is illegal
    recs = Records(data=recsdf,
                   exact_calculations=exact_calcs,
                   gfactors=None,
                   weights=None,
                   start_year=self.policy.start_year)
    assert recs.array_length == num_units
    # fill in the input of each filing unit; Records positions start at
    # zero while self._input is looked up under keys starting at one
    for idx in range(recs.array_length):
        SimpleTaxIO._specify_input(recs, idx, self._input[idx + 1],
                                   emulate_taxsim_2441_logic)
    # optionally write Records.USABLE_READ_VARS content to file
    if output_records:
        recdf = pd.DataFrame()
        for varname in Records.USABLE_READ_VARS:
            recdf[varname] = getattr(recs, varname)
        records_filename = re.sub('out-simtax', 'records',
                                  self._output_filename)
        recdf.to_csv(records_filename, float_format='%.2f', index=False)
    # hand back a Calculator object containing all tax filing units
    return Calculator(policy=self.policy, records=recs, sync_years=False)
MAX_SIZE = 100000 # maximum size of sample to draw from puf.csv DEBUG = False # True implies no variable randomization or record sampling TRACE = False # True implies tracing output written to stdout # specify set of variables not included in xYY.csv file if DEBUG: DROP_VARS = set(['filer']) else: DROP_VARS = set(['filer', 's006', 'cmbtp', 'nu05', 'nu13', 'elderly_dependents', 'e09700', 'e09800', 'e09900', 'e11200']) # specify set of variables whose values are not to be randomized Records.read_var_info() if DEBUG: SKIP_VARS = Records.USABLE_READ_VARS else: SKIP_VARS = set(['RECID', 'MARS', 'DSI', 'MIDR', 'FLPDYR', 'age_head', 'age_spouse', 'nu18', 'n1820', 'n21', 'XTOT', 'EIC', 'n24', 'f2441', 'f6251']) ANNUAL_DRIFT = 0.03 NORM_STD_DEV = 0.25 def randomize_data(xdf, taxyear, rnseed): """
def test_validity_of_name_lists():
    """
    Check that the distribution-table name lists agree with each other
    and with the variables known to the Records class.
    """
    # there must be exactly one label per table column
    assert len(DIST_TABLE_COLUMNS) == len(DIST_TABLE_LABELS)
    # every distribution variable must be a calculated variable or weight
    Records.read_var_info()
    allowed_vars = Records.CALCULATED_VARS | {'weight'}
    assert set(DIST_VARIABLES) <= allowed_vars
    # no table column may be missing from the distribution variables
    columns_only = set(DIST_TABLE_COLUMNS) - set(DIST_VARIABLES)
    assert not columns_only
DEBUG = False # True implies no variable randomization or record sampling TRACE = False # True implies tracing output written to stdout # specify set of variables not included in xYY.csv file if DEBUG: DROP_VARS = set(['filer']) else: DROP_VARS = set([ 'filer', 's006', 'cmbtp', 'nu05', 'nu13', 'elderly_dependent', 'e09700', 'e09800', 'e09900', 'e11200' ]) # specify set of variables whose values are not to be randomized Records.read_var_info() if DEBUG: SKIP_VARS = Records.USABLE_READ_VARS else: SKIP_VARS = set([ 'RECID', 'MARS', 'DSI', 'MIDR', 'FLPDYR', 'age_head', 'age_spouse', 'nu18', 'n1820', 'n21', 'XTOT', 'EIC', 'n24', 'f2441', 'f6251' ]) ANNUAL_DRIFT = 0.03 NORM_STD_DEV = 0.25 def randomize_data(xdf, taxyear, rnseed): """ Randomizes data variables.
def test_calc_and_used_vars(tests_path):
    """
    Runs two kinds of tests on variables used in the calcfunctions.py file:

    (1) Checks that each var in Records.CALCULATED_VARS is actually calculated

    If test (1) fails, a variable in Records.CALCULATED_VARS was not
    calculated in any function in the calcfunctions.py file.  With the
    exception of a few variables listed in this test, all
    Records.CALCULATED_VARS must be calculated in the calcfunctions.py file.

    (2) Check that each variable that is calculated in a function and
    returned by that function is an argument of that function.

    Raises ValueError, with a message listing every problem found, if
    either test fails.
    """
    # pylint: disable=too-many-locals
    funcpath = os.path.join(tests_path, '..', 'calcfunctions.py')
    # use a context manager so the source file is closed after reading
    with open(funcpath) as srcfile:
        tree = ast.parse(srcfile.read())
    gfd = GetFuncDefs()
    fnames, fargs, cvars, rvars = gfd.visit(tree)
    # Test (1):
    # .. create set of vars that are actually calculated in calcfunctions.py
    all_cvars = set()
    for fname in fnames:
        if fname == 'BenefitSurtax':
            continue  # because BenefitSurtax is not really a function
        all_cvars.update(cvars[fname])
    # .. add to all_cvars set variables calculated in Records class
    all_cvars.update({'num', 'sep', 'exact'})
    # .. add to all_cvars set variables calculated elsewhere
    all_cvars.update({'mtr_paytax', 'mtr_inctax'})
    all_cvars.update({'benefit_cost_total', 'benefit_value_total'})
    # .. check that each var in Records.CALCULATED_VARS is in the all_cvars set
    Records.read_var_info()
    problems = []
    not_calculated = Records.CALCULATED_VARS - all_cvars
    if not_calculated:
        msg1 = ('all Records.CALCULATED_VARS not calculated '
                'in calcfunctions.py\n')
        # sorted so that the error message is the same on every run
        for var in sorted(not_calculated):
            msg1 += 'VAR NOT CALCULATED: {}\n'.format(var)
        problems.append(msg1)
    # Test (2):
    faux_functions = [
        'EITCamount', 'ComputeBenefit', 'BenefitPrograms',
        'BenefitSurtax', 'BenefitLimitation'
    ]
    found_error2 = False
    msg2 = 'calculated & returned variables are not function arguments\n'
    for fname in fnames:
        if fname in faux_functions:
            continue  # because fname is not a genuine function
        crvars_set = set(cvars[fname]) & set(rvars[fname])
        for var in sorted(crvars_set - set(fargs[fname])):
            found_error2 = True
            msg2 += 'FUNCTION,VARIABLE: {} {}\n'.format(fname, var)
    if found_error2:
        problems.append(msg2)
    # Report errors for the two tests (messages separated by a newline):
    if problems:
        raise ValueError('\n'.join(problems))