def match(mar_cps_path='asec2014_pubuse_tax_fix_5x8.dat',
          puf_path='puf2009.csv'):
    """Run the CPS/PUF match and return the combined file.

    Command-line options are parsed inside this function (via
    ``argparse``) and take precedence over the keyword arguments:

    * ``-c/--cps``: CPS already in CSV format; it is read directly and the
      DAT conversion is skipped.
    * ``-d/--dat``: CPS in DAT format; replaces ``mar_cps_path``.
    * ``-p/--puf``: PUF in CSV format; replaces ``puf_path``.

    Parameters
    ----------
    mar_cps_path : str
        Path handed to ``cpsmar.create_cps`` when no CSV version of the CPS
        is supplied on the command line.
    puf_path : str
        Path of the PUF CSV file read with ``pd.read_csv``.

    Returns
    -------
    The result of ``add_nonfiler`` with the columns ``icps1``, ``icps2``,
    ``wasp`` and ``wass`` renamed to ``age_head``, ``age_spouse``,
    ``wage_head`` and ``wage_spouse``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-c', '--cps', help='path to CPS file in CSV format')
    cli.add_argument('-d', '--dat', help='path to CPS file in DAT format')
    cli.add_argument('-p', '--puf', help='path to PUF file in CSV format')
    opts = cli.parse_args()

    # A CSV given on the command line wins; otherwise the CPS is built from
    # the DAT file (command-line path first, function argument second).
    if opts.cps is not None:
        mar_cps = pd.read_csv(opts.cps)
    else:
        dat_file = mar_cps_path if opts.dat is None else opts.dat
        mar_cps = cpsmar.create_cps(dat_file)

    # Load the PUF and drop the record whose recid is 999999.
    puf_file = puf_path if opts.puf is None else opts.puf
    puf = pd.read_csv(puf_file)
    keep_rows = puf['recid'] != 999999
    puf = puf[keep_rows]
    print('CPS Created')

    # Build tax units from the CPS and split them into filers/nonfilers.
    tax_units = Returns(mar_cps).computation()
    print('CPS Tax Units Created')
    filers, nonfilers = adjfilst(tax_units)
    print('Adjustment Complete')

    # create_soi receives a copy of the PUF; the original comment ties this
    # to pandas' "A value is trying to be set on a copy of a slice from a
    # DataFrame" warning.
    soi = create_soi(puf.copy())
    print('PUF Created')

    soi_final, cps_final, _counts = phaseone(filers, soi)
    print('Start Phase Two')
    soi_cols = ['cellid', 'soiseq', 'wt', 'factor', 'yhat']
    cps_cols = ['cellid', 'cpsseq', 'wt', 'factor', 'yhat']
    matched = phasetwo(soi_final.loc[:, soi_cols],
                       cps_final.loc[:, cps_cols])

    print('Creating final file')
    cps_matched = add_nonfiler(add_cps(filers, matched, puf), nonfilers)

    # Rename variables for use in PUF data prep
    return cps_matched.rename(columns={
        'icps1': 'age_head',
        'icps2': 'age_spouse',
        'wasp': 'wage_head',
        'wass': 'wage_spouse',
    })
def match():
    """Run the CPS/PUF match and return the combined file.

    The CPS is read from ``cpsmar2016.csv`` when that file exists;
    otherwise it is built from ``asec2016_pubuse_v3.dat`` with
    ``cpsmar.create_cps``. The PUF is read from ``puf2011.csv``. All three
    paths are relative to the current working directory.

    Returns
    -------
    The result of ``add_nonfiler`` with an ``agerange`` column set to 0 and
    the columns ``icps1``, ``icps2``, ``wasp`` and ``wass`` renamed to
    ``age_head``, ``age_spouse``, ``wage_head`` and ``wage_spouse``.

    Raises
    ------
    FileNotFoundError
        If neither the .CSV nor the .DAT version of the CPS is present.
    """
    # If there is a .CSV version of the CPS, simply read that in. Otherwise
    # convert the .DAT file.
    cps_csv_path = 'cpsmar2016.csv'
    cps_dat_path = 'asec2016_pubuse_v3.dat'
    if os.path.isfile(cps_csv_path):
        print('Reading CPS Data from .CSV')
        mar_cps = pd.read_csv(cps_csv_path)
    elif os.path.isfile(cps_dat_path):
        print('Converting .DAT to .CSV')
        mar_cps = cpsmar.create_cps(cps_dat_path)
    else:
        raise FileNotFoundError(
            'You must have either the .DAT or .CSV version of the 2016'
            ' CPS in your directory'
        )

    print('Reading PUF Data')
    puf_path = 'puf2011.csv'
    puf = pd.read_csv(puf_path)

    # Change PUF columns to lowercase. Build a concrete list instead of a
    # lazy ``map`` object so the assignment does not rely on pandas
    # accepting an iterator as column labels.
    puf.columns = [col.lower() for col in puf.columns]

    # Remove aggregated variables from the PUF (the records with these
    # recid values).
    aggregate_recids = [999996, 999997, 999998, 999999]
    puf = puf[~puf['recid'].isin(aggregate_recids)]

    print('Creating CPS Tax Units')
    rets = Returns(mar_cps)
    cps = rets.computation()
    print('CPS Tax Units Created')
    filers, nonfilers = adjfilst(cps)
    print('Adjustment Complete')

    # create_soi is given a copy so the filtered PUF itself is left for
    # add_cps below.
    soi = create_soi(puf.copy())

    print('Start Phase One')
    # Missing values are replaced with 0 on both sides before matching.
    filers = filers.fillna(0)
    soi = soi.fillna(0)
    soi_final, cps_final, _counts = phaseone(filers, soi)

    print('Start Phase Two')
    # Named ``matched`` rather than ``match`` to avoid shadowing this
    # function's own name inside its body.
    matched = phasetwo(
        soi_final.loc[:, ['cellid', 'soiseq', 'wt', 'factor', 'yhat']],
        cps_final.loc[:, ['cellid', 'cpsseq', 'wt', 'factor', 'yhat']])

    print('Creating final file')
    cpsrets = add_cps(filers, matched, puf)
    cps_matched = add_nonfiler(cpsrets, nonfilers)

    # add age range variable
    cps_matched['agerange'] = 0

    # Rename variables for use in PUF data prep
    renames = {
        'icps1': 'age_head',
        'icps2': 'age_spouse',
        'wasp': 'wage_head',
        'wass': 'wage_spouse',
    }
    cps_matched = cps_matched.rename(columns=renames)
    return cps_matched