def _save_hh_as_csv(in_file_paths, hid2cnt, hh_path, gq_path):
    """Write one household row per distinct SYNTHETIC_HID to ``hh_path``.

    Input column layout (number = index of the first column on the row)::

         0 SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude,
         5 latitude,AGEGRP,HRSWRK,IMMSTAT,INCTAX,
        10 MODE,OCC,POB,RELIGION,SEX,
        15 SYNTHETIC_PID

    Two cells are appended to every written row: ``made-empty`` (always
    blank) and ``made-persons`` (``hid2cnt[hid]``).  Only the first header
    line seen across all inputs is kept, and it goes to both outputs.  Data
    rows are written to ``hh_path`` only, so ``gq_path`` receives nothing
    but the header.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        hid2cnt: mapping from household ID to the value stored in the
            ``made-persons`` column.
        hh_path: path of the household CSV to create.
        gq_path: path of the group-quarters CSV to create.

    Raises:
        KeyError: a household ID is missing from ``hid2cnt``.
        IndexError: a data line has fewer than four cells.
    """
    hid_column = 3
    more_header = 'made-empty,made-persons'
    # 16 input columns + 2 appended ones; presumably verified by
    # aid.write_and_check_columns -- confirm against that helper.
    columns = 18

    aid.mkdir(hh_path)
    # gq_path is opened for writing too, so prepare its location the same
    # way the sibling savers do.
    aid.mkdir(gq_path)
    with open(hh_path, 'w') as hh_csv, open(gq_path, 'w') as gq_csv:
        print('writing', os.path.abspath(hh_path), os.path.abspath(gq_path))
        header_written = False
        hids = set()
        for in_file_path in in_file_paths:
            print('reading', in_file_path)
            with open(in_file_path, 'r') as fin:
                for raw in fin:
                    line = raw.strip('\n')
                    if line.startswith('SERIALNO'):
                        # Keep a single header even when several input
                        # files each carry their own.
                        if header_written:
                            continue
                        header_written = True
                        row = line + ',' + more_header
                        aid.write_and_check_columns(hh_csv, row, columns)
                        aid.write_and_check_columns(gq_csv, row, columns)
                        continue
                    hid = line.split(',')[hid_column]
                    if hid in hids:
                        # Only the first row seen for a household is kept.
                        continue
                    hids.add(hid)
                    # ',,' = blank made-empty cell followed by made-persons.
                    row = line + ',,' + str(hid2cnt[hid])
                    aid.write_and_check_columns(hh_csv, row, columns)
def _save_hh_as_csv(in_file_paths, hh_path, gq_path):
    """Split householder rows into a household CSV and a group-quarters CSV.

    Each input line is split on commas.  A line whose cell 17 equals
    ``'RELP'`` is a header: the first one seen across all inputs is written
    to both outputs with ``,made-gq_type`` appended, later ones are dropped.
    A line whose cell 17 equals ``'0'`` is written with one trailing empty
    cell, to ``hh_path`` when cell 1 is ``'1'`` and to ``gq_path``
    otherwise.  Every other line is ignored.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        hh_path: path of the household CSV to create.
        gq_path: path of the group-quarters CSV to create.
    """
    type_idx, relp_idx = 1, 17
    expected_columns = 29

    aid.mkdir(hh_path)
    aid.mkdir(gq_path)
    with open(hh_path, 'w') as hh_out, open(gq_path, 'w') as gq_out:
        print('writing', os.path.abspath(hh_path), os.path.abspath(gq_path))
        header_written = False
        for source_path in in_file_paths:
            print('reading', source_path)
            with open(source_path, 'r') as source:
                for raw_line in source:
                    text = raw_line.strip('\n')
                    fields = text.split(',')
                    relp = fields[relp_idx]

                    if relp == 'RELP':
                        if not header_written:
                            header_written = True
                            header = text + ',made-gq_type'
                            for out in (hh_out, gq_out):
                                aid.write_and_check_columns(
                                    out, header, expected_columns)
                        continue

                    if relp != '0':
                        continue

                    if fields[type_idx] == '1':
                        target = hh_out
                    else:
                        target = gq_out
                    aid.write_and_check_columns(
                        target, text + ',', expected_columns)
def out_hh_file(in_file_paths, mapper, out_file_path):
    """Write the first row of every household, each cell passed through mapper.

    Input columns:
        SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude,
        latitude,AGEGRP,HRSWRK,IMMSTAT,INCTAX,
        MODE,OCC,POB,RELIGION,SEX,
        SYNTHETIC_PID
    One extra cell is appended: ``dummy`` on the header, empty on data rows.

    Only the first header line (one starting with ``SERIALNO``) across all
    inputs is kept.  A row is written when its cell 3 has not been seen
    before, or when that cell is literally ``'SYNTHETIC_HID'``.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        mapper: callable applied to every cell of a written row; its
            results are joined with commas.
        out_file_path: path of the CSV to create.
    """
    print('writing', os.path.abspath(out_file_path))
    hid_idx = 3
    aid.mkdir(out_file_path)
    seen_hids = set()
    header_kept = False
    with open(out_file_path, 'w') as fout:
        for source_path in in_file_paths:
            print('reading', source_path)
            with open(source_path, 'r') as source:
                for line in source:
                    is_header = line.startswith('SERIALNO')
                    if is_header and header_kept:
                        continue
                    if is_header:
                        header_kept = True

                    fields = line.strip('\n').split(',')
                    fields.append('dummy' if is_header else '')

                    hid = fields[hid_idx]
                    if hid in seen_hids and hid != 'SYNTHETIC_HID':
                        continue
                    seen_hids.add(hid)
                    fout.write(','.join(map(mapper, fields)) + '\n')
def out_hh_file(in_file_paths, mapper, hid2hincome, out_file_path, gq_path):
    """Write one row per household, routed to a household or GQ CSV.

    Input columns:
        COUNTRY,YEAR,SERIALNO,PERSONS,puma_id,
        HHTYPE,PERNUM,place_id,SYNTHETIC_HID,longitude,
        latitude,AGE,SEX,RACE,SCHOOL,
        INCTOT,SYNTHETIC_PID
    Appended columns:
        made-age,made-race,made-income,made-empty

    Only the first header line (one starting with ``COUNTRY``) across all
    inputs is kept, and it is written to both outputs.  For data rows only
    the first line seen for each SYNTHETIC_HID is used: it goes to
    ``gq_path`` when HHTYPE is ``'11'`` and to ``out_file_path`` otherwise.
    A warning is printed to stderr when PERSONS exceeds 20.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        mapper: unused; kept so existing callers keep working.
        hid2hincome: mapping from household ID to the value stored in the
            ``made-income`` column.
        out_file_path: path of the household CSV to create.
        gq_path: path of the group-quarters CSV to create.

    Raises:
        KeyError: a household ID is missing from ``hid2hincome``.
        ValueError: the PERSONS cell is not an integer.
    """
    PERSONS_COLUMN = 3
    HHTYPE_COLUMN = 5
    HID_COLUMN = 8
    AGE_COLUMN = 11
    RACE_COLUMN = 13
    columns = 21

    aid.mkdir(out_file_path)
    # gq_path is opened for writing as well, so prepare its location too.
    aid.mkdir(gq_path)
    hids = set()
    with open(out_file_path, 'w') as fout, open(gq_path, 'w') as gq_csv:
        print('writing', os.path.abspath(out_file_path))
        file_count = 0
        for in_file_path in in_file_paths:
            with open(in_file_path, 'r') as fin:
                print('reading', in_file_path)
                for line in fin:
                    cells = line.strip('\n').split(',')
                    if line.startswith('COUNTRY'):
                        file_count += 1
                        if file_count == 1:
                            row = (','.join(cells)
                                   + ',made-age,made-race,made-income,made-empty')
                            aid.write_and_check_number_of_columns(
                                fout, row, columns)
                            aid.write_and_check_number_of_columns(
                                gq_csv, row, columns)
                        # Headers of later files are dropped.
                        continue

                    hid = cells[HID_COLUMN]
                    if hid in hids:
                        # Only the first row seen for a household is used.
                        continue
                    hids.add(hid)

                    age = cells[AGE_COLUMN]
                    cells.append(to_agep(age))
                    race = cells[RACE_COLUMN]
                    # Unknown race codes are passed through unchanged.
                    cells.append(str(race2rac1p.get(race, race)))
                    cells.append(str(hid2hincome[hid]))
                    cells.append('')
                    row = ','.join(cells)

                    if cells[HHTYPE_COLUMN] == '11':
                        aid.write_and_check_number_of_columns(
                            gq_csv, row, columns)
                    else:
                        aid.write_and_check_number_of_columns(
                            fout, row, columns)

                    persons = cells[PERSONS_COLUMN]
                    if int(persons) > 20:
                        msg = 'Warning: max persons according to NP is 20 but got'
                        print(msg, persons, ':', row,
                              file=sys.stderr, flush=True)
def out_pp_file(in_file_paths, mapper, pp_path, gq_path):
    """Write person rows to a household CSV or a group-quarters CSV.

    Input columns:
        COUNTRY,YEAR,SERIALNO,PERSONS,puma_id,
        HHTYPE,PERNUM,place_id,SYNTHETIC_HID,longitude,
        latitude,AGE,SEX,RACE,SCHOOL,
        INCTOT,SYNTHETIC_PID
    Appended columns:
        made-sporder,made-age,made-empty,made-race

    Only the first header line (one starting with ``COUNTRY``) across all
    inputs is kept, and it is written to both outputs.  Every data row gets
    a running 1-based counter within its SYNTHETIC_HID, ``to_agep(AGE)``, a
    blank cell and the race looked up in ``race2rac1p`` (unknown codes pass
    through).  Rows with HHTYPE ``'11'`` go to ``gq_path``, all others to
    ``pp_path``.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        mapper: unused by this function.
        pp_path: path of the person CSV to create.
        gq_path: path of the group-quarters person CSV to create.

    Returns:
        A 4-tuple ``(hids, sc_ids, wp_ids, hid2hincome)``: the set of
        household IDs seen, two sets that are always empty here, and a dict
        summing the INCTOT cell per household ID (blank counts as 0).
    """
    hhtype_idx, hid_idx, age_idx, race_idx, inctot_idx = 5, 8, 11, 13, 15
    expected_columns = 21
    members_per_hid = {}
    income_per_hid = {}
    wp_ids = set()
    sc_ids = set()

    aid.mkdir(pp_path)
    aid.mkdir(gq_path)
    with open(pp_path, 'w') as pp_out, open(gq_path, 'w') as gq_out:
        print('writing', os.path.abspath(pp_path))
        header_written = False
        for source_path in in_file_paths:
            with open(source_path, 'r') as source:
                print('reading', source_path)
                for line in source:
                    fields = line.rstrip('\n').split(',')

                    if line.startswith('COUNTRY'):
                        if not header_written:
                            header_written = True
                            header = ','.join(
                                fields
                                + ['made-sporder,made-age,made-empty,made-race'])
                            for out in (pp_out, gq_out):
                                aid.write_and_check_number_of_columns(
                                    out, header, expected_columns)
                        continue

                    hid = fields[hid_idx]
                    order = members_per_hid.get(hid, 0) + 1
                    members_per_hid[hid] = order

                    # The '0' prefix makes a blank INCTOT cell parse as zero.
                    income = int('0' + fields[inctot_idx])
                    income_per_hid[hid] = income_per_hid.get(hid, 0) + income

                    race = fields[race_idx]
                    extras = [
                        str(order),
                        to_agep(fields[age_idx]),
                        '',
                        str(race2rac1p.get(race, race)),
                    ]
                    row = ','.join(fields + extras)

                    target = gq_out if fields[hhtype_idx] == '11' else pp_out
                    aid.write_and_check_number_of_columns(
                        target, row, expected_columns)
    return set(members_per_hid), sc_ids, wp_ids, income_per_hid
def _save_pp_as_csv(in_file_paths, pp_path, gq_pp_path):
    """Write person rows, with a per-household counter, to two CSVs.

    Only the first header line (one starting with ``RT``) across all inputs
    is kept; it is written to both outputs with ``,made-sporder`` appended.
    Each data row gets a running 1-based counter within its household ID
    (cell 7) appended, and goes to ``pp_path`` when cell 1 is ``'1'`` and
    to ``gq_pp_path`` otherwise.  A warning is printed for a row that has a
    school ID (cell 26) and an age (cell 14) above 19; the row is kept.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        pp_path: path of the person CSV to create.
        gq_pp_path: path of the group-quarters person CSV to create.

    Returns:
        A 3-tuple ``(hids, sc_ids, wp_ids)``: household IDs of rows whose
        cell 17 is ``'0'``, all non-empty school IDs, and every value found
        in cell 27 (blank values included).
    """
    type_idx, hid_idx, age_idx, relp_idx = 1, 7, 14, 17
    school_idx, workplace_idx = 26, 27
    expected_columns = 29

    members_per_hid = {}
    householder_hids = set()
    workplace_ids = set()
    school_ids = set()

    aid.mkdir(pp_path)
    aid.mkdir(gq_pp_path)
    with open(pp_path, 'w') as pp_out, open(gq_pp_path, 'w') as gq_out:
        print('writing', os.path.abspath(pp_path), os.path.abspath(gq_pp_path))
        header_written = False
        for source_path in in_file_paths:
            with open(source_path, 'r') as source:
                print('reading', source_path)
                for raw_line in source:
                    text = raw_line.strip('\n')

                    if text.startswith('RT'):
                        if not header_written:
                            header_written = True
                            header = text + ',made-sporder'
                            aid.write_and_check_columns(
                                pp_out, header, expected_columns)
                            aid.write_and_check_columns(
                                gq_out, header, expected_columns)
                        continue

                    fields = text.split(',')
                    school = fields[school_idx]
                    age = fields[age_idx]
                    if school:
                        if int(age) > 19:
                            # Reported only; the row is still processed.
                            print('Warning: too old at age of', age,
                                  'to go to school ID =', school, ':', text)
                        school_ids.add(school)

                    hid = fields[hid_idx]
                    if fields[relp_idx] == '0':
                        householder_hids.add(hid)

                    order = members_per_hid.get(hid, 0) + 1
                    members_per_hid[hid] = order
                    workplace_ids.add(fields[workplace_idx])

                    target = pp_out if fields[type_idx] == '1' else gq_out
                    aid.write_and_check_columns(
                        target, text + ',' + str(order), expected_columns)
    return householder_hids, school_ids, workplace_ids
def out_pp_file(in_file_paths, mapper, out_file_path):
    """Write person rows with four derived cells, each cell passed to mapper.

    Input columns:
        SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude,
        latitude,AGEGRP,HRSWRK,IMMSTAT,INCTAX,
        MODE,OCC,POB,RELIGION,SEX,
        SYNTHETIC_PID
    Appended columns:
        sporder,dummy,sex,age

    Only the first header line (one starting with ``SERIALNO``) across all
    inputs is kept.  Each data row gets a running 1-based counter within
    its SYNTHETIC_HID, a blank cell, the SEX cell with ``'1'`` and ``'2'``
    swapped (other values unchanged), and ``to_age(AGEGRP)``.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        mapper: callable applied to every cell of a written row; its
            results are joined with commas.  On the header the four new
            column names reach it as one comma-containing cell.
        out_file_path: path of the CSV to create.

    Returns:
        A 3-tuple ``(hids, sc_ids, wp_ids)``: the set of household IDs
        seen and two sets that are always empty here.
    """
    print('writing', os.path.abspath(out_file_path))
    HID_COLUMN = 3
    AGEGRP_COLUMN = 6
    SEX_COLUMN = 14
    hid2cnt = {}
    # School and workplace IDs are not collected by this variant; the empty
    # sets are returned to keep the tuple shape.
    wp_ids = set()
    sc_ids = set()
    # Nothing increments this counter; it is kept so the closing log line
    # is unchanged.
    skips = 0
    aid.mkdir(out_file_path)
    reversed_sex = {'1': '2', '2': '1'}
    with open(out_file_path, 'w') as fout:
        file_count = 0
        for in_file_path in in_file_paths:
            with open(in_file_path, 'r') as fin:
                print('reading', in_file_path)
                for line in fin:
                    cells = line.rstrip('\n').split(',')
                    if line.startswith('SERIALNO'):
                        file_count += 1
                        if file_count > 1:
                            # Headers of later files are dropped.
                            continue
                        cells.append('sporder,dummy,sex,age')
                    else:
                        hid = cells[HID_COLUMN]
                        order = hid2cnt.get(hid, 0) + 1
                        hid2cnt[hid] = order
                        sex = cells[SEX_COLUMN]
                        agegroup = cells[AGEGRP_COLUMN]
                        cells.append(str(order))
                        cells.append('')
                        cells.append(reversed_sex.get(sex, sex))
                        cells.append(to_age(agegroup))
                    row = ','.join([mapper(x) for x in cells])
                    fout.write(row + "\n")
    print('Skipped', skips, 'rows due to private schools')
    return hid2cnt.keys() | set(), sc_ids, wp_ids
def _save_hh_as_csv(in_file_paths, hid2hincome, hh_path, gq_path):
    """Write one row per household, routed to a household or GQ CSV.

    Input column layout (number = index of the first column on the row)::

         0 COUNTRY,YEAR,SERIALNO,PERSONS,puma_id,
         5 HHTYPE,PERNUM,place_id,SYNTHETIC_HID,longitude,
        10 latitude,AGE,SEX,RACE,SCHOOL,
        15 INCTOT,SYNTHETIC_PID+made-age,made-race,made-income,
        20 made-empty

    Only the first header line (one starting with ``COUNTRY``) across all
    inputs is kept, and it goes to both outputs.  For data rows only the
    first line seen for each SYNTHETIC_HID is used; it goes to ``gq_path``
    when HHTYPE is ``'11'`` and to ``hh_path`` otherwise.  A warning is
    printed when PERSONS exceeds 20.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        hid2hincome: mapping from household ID to the value stored in the
            ``made-income`` column.
        hh_path: path of the household CSV to create.
        gq_path: path of the group-quarters CSV to create.
    """
    persons_idx, hhtype_idx, hid_idx, age_idx, race_idx = 3, 5, 8, 11, 13
    extra_header = 'made-age,made-race,made-income,made-empty'
    expected_columns = 21
    seen_hids = set()

    aid.mkdir(hh_path)
    aid.mkdir(gq_path)
    with open(hh_path, 'w') as hh_out, open(gq_path, 'w') as gq_out:
        print('writing', os.path.abspath(hh_path), os.path.abspath(gq_path))
        header_written = False
        for source_path in in_file_paths:
            with open(source_path, 'r') as source:
                print('reading', source_path)
                for raw_line in source:
                    text = raw_line.strip('\n')

                    if text.startswith('COUNTRY'):
                        if not header_written:
                            header_written = True
                            header = text + ',' + extra_header
                            for out in (hh_out, gq_out):
                                aid.write_and_check_columns(
                                    out, header, expected_columns)
                        continue

                    fields = text.split(',')
                    hid = fields[hid_idx]
                    if hid in seen_hids:
                        continue
                    seen_hids.add(hid)

                    race = fields[race_idx]
                    extras = [
                        _to_agep(fields[age_idx]),
                        # Unknown race codes pass through unchanged.
                        str(race2rac1p.get(race, race)),
                        str(hid2hincome[hid]),
                        '',
                    ]
                    row = ','.join([text] + extras)

                    if fields[hhtype_idx] == '11':
                        target = gq_out
                    else:
                        target = hh_out
                    aid.write_and_check_columns(target, row, expected_columns)

                    persons = fields[persons_idx]
                    if int(persons) > 20:
                        print('Warning: max persons of NP is 20 but got',
                              persons, ':', row)
def _save_pp_as_csv(in_file_paths, pp_path, gq_path):
    """Write person rows to a household CSV or a group-quarters CSV.

    Input column layout (number = index of the first column on the row)::

         0 COUNTRY,YEAR,SERIALNO,PERSONS,puma_id,
         5 HHTYPE,PERNUM,place_id,SYNTHETIC_HID,longitude,
        10 latitude,AGE,SEX,RACE,SCHOOL,
        15 INCTOT,SYNTHETIC_PID+made-sporder,made-age,made-empty,
        20 made-race

    Only the first header line (one starting with ``COUNTRY``) across all
    inputs is kept, and it goes to both outputs.  Each data row gets a
    running 1-based counter within its SYNTHETIC_HID, ``_to_agep(AGE)``, a
    blank cell and the race looked up in ``race2rac1p`` (unknown codes pass
    through).  Rows with HHTYPE ``'11'`` go to ``gq_path``, all others to
    ``pp_path``.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        pp_path: path of the person CSV to create.
        gq_path: path of the group-quarters person CSV to create.

    Returns:
        A pair ``(hids, hid2hincome)``: the set of household IDs seen and a
        dict summing the INCTOT cell per household ID (blank counts as 0).
    """
    hhtype_idx, hid_idx, age_idx, race_idx, inctot_idx = 5, 8, 11, 13, 15
    extra_header = 'made-sporder,made-age,made-empty,made-race'
    expected_columns = 21
    members_per_hid = {}
    income_per_hid = {}

    aid.mkdir(pp_path)
    aid.mkdir(gq_path)
    with open(pp_path, 'w') as pp_out, open(gq_path, 'w') as gq_out:
        print('writing', os.path.abspath(pp_path), os.path.abspath(gq_path))
        header_written = False
        for source_path in in_file_paths:
            with open(source_path, 'r') as source:
                print('reading', source_path)
                for raw_line in source:
                    text = raw_line.rstrip('\n')

                    if text.startswith('COUNTRY'):
                        if not header_written:
                            header_written = True
                            header = text + ',' + extra_header
                            for out in (pp_out, gq_out):
                                aid.write_and_check_columns(
                                    out, header, expected_columns)
                        continue

                    fields = text.split(',')
                    hid = fields[hid_idx]
                    order = members_per_hid.get(hid, 0) + 1
                    age = fields[age_idx]
                    race = fields[race_idx]
                    extras = [
                        str(order),
                        _to_agep(age),
                        '',
                        str(race2rac1p.get(race, race)),
                    ]
                    row = ','.join([text] + extras)

                    if fields[hhtype_idx] == '11':
                        target = gq_out
                    else:
                        target = pp_out
                    aid.write_and_check_columns(target, row, expected_columns)

                    # Bookkeeping happens after the row is written, as in
                    # the original statement order.
                    members_per_hid[hid] = order
                    # The '0' prefix makes a blank INCTOT cell parse as zero.
                    income = int('0' + fields[inctot_idx])
                    income_per_hid[hid] = income_per_hid.get(hid, 0) + income
    return set(members_per_hid), income_per_hid
def out_ref_hh_file(in_file_paths, mapper, out_file_path):
    """Copy every input line to one file, mapping each cell, and collect IDs.

    Lines are split on commas without stripping, so the last cell keeps its
    line terminator and is handed to ``mapper`` that way; no newline is
    added on output.  Header lines are not treated specially.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        mapper: callable applied to every cell; its results are joined
            with commas.
        out_file_path: path of the CSV to create.

    Returns:
        The set of values found in cell 7 of every line.

    Raises:
        IndexError: a line has fewer than eight cells (the line is printed
            first).
    """
    hid_idx = 7
    collected = set()
    aid.mkdir(out_file_path)
    with open(out_file_path, 'w') as fout:
        print('writing', os.path.abspath(out_file_path))
        for source_path in in_file_paths:
            print('reading', source_path)
            with open(source_path, 'r') as source:
                for line in source:
                    fields = line.split(',')
                    try:
                        hid = fields[hid_idx]
                    except IndexError:
                        # Show the offending line before failing.
                        print(line)
                        raise
                    collected.add(hid)
                    fout.write(','.join(map(mapper, fields)))
    return collected
def out_ref_hh_file(in_file_paths, mapper, out_file_path):
    """Copy every input line to one file, mapping each cell, and collect IDs.

    Input columns:
        SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude, latitude

    Lines are split on commas without stripping, so the last cell keeps its
    line terminator and is handed to ``mapper`` that way; no newline is
    added on output.  Header lines are not treated specially.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        mapper: callable applied to every cell; its results are joined
            with commas.
        out_file_path: path of the CSV to create.

    Returns:
        The set of values found in cell 3 of every line.

    Raises:
        IndexError: a line has fewer than four cells (the line is printed
            first).
    """
    print('writing', os.path.abspath(out_file_path))
    hid_idx = 3
    min_cells = hid_idx + 1
    collected = set()
    aid.mkdir(out_file_path)
    with open(out_file_path, 'w') as fout:
        for source_path in in_file_paths:
            print('reading', source_path)
            with open(source_path, 'r') as source:
                for line in source:
                    fields = line.split(',')
                    if len(fields) < min_cells:
                        # Show the offending line; the lookup below then
                        # raises IndexError.
                        print(line)
                    collected.add(fields[hid_idx])
                    mapped = [mapper(field) for field in fields]
                    fout.write(','.join(mapped))
    return collected
def translate(states):
    """Run ``us.translate`` for every county of every state, logging to files.

    Output is redirected by rebinding ``sys.stdout`` / ``sys.stderr``:

    * a shared log ``logs/counties.<timestamp>`` receives progress messages;
    * each county gets ``logs/<state>/<county>.out`` and ``.err`` while
      ``us.translate(county)`` runs.

    A county whose ``.out`` file already exists is skipped, so a rerun
    resumes where the previous one stopped.  The state named ``'input'``
    is ignored.  Exceptions raised for a county or a state are handed to
    ``aid.log_error`` and processing continues with the next one.

    On return, including on error, ``sys.stdout`` and ``sys.stderr`` are
    rebound to whatever they were on entry, and every per-county log file
    has been closed.

    Args:
        states: iterable of state ID strings.
    """
    print('Started translating counties in', states)
    path = 'logs/'
    aid.mkdir(path)
    # Remember the caller's streams; without this they would be left
    # pointing at the shared log after it has been closed.
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    try:
        with open(path + 'counties.' + str(datetime.now()), 'w') as common:
            sys.stdout = common
            sys.stderr = common
            print('Translating', states)
            for state in states:
                if state == 'input':
                    continue
                aid.log_time('Translating state ID = ' + state)
                try:
                    pp_csvs = spew.find_csvs(conf.pp_prefix, state)
                    counties = {to_county_id(csv) for csv in pp_csvs}
                    print(counties, flush=True)
                    for county in counties:
                        county_out = None
                        county_err = None
                        try:
                            prefix = path + state + '/' + county
                            stdout = prefix + '.out'
                            aid.mkdir(stdout)
                            if os.path.exists(stdout):
                                print(
                                    stdout,
                                    'already exists. Delete it if you want to rerun.'
                                )
                                continue
                            aid.log_time('Translating county ID = ' + county)
                            county_out = open(stdout, 'w')
                            sys.stdout = county_out
                            county_err = open(prefix + '.err', 'w')
                            sys.stderr = county_err
                            us.translate(county)
                        except Exception as e:
                            # Logged while the county streams are still
                            # active, as before.
                            aid.log_error(e)
                        finally:
                            aid.log_time()
                            sys.stdout = common
                            sys.stderr = common
                            # Close the per-county logs once nothing
                            # points at them any more.
                            for handle in (county_out, county_err):
                                if handle is not None:
                                    handle.close()
                except Exception as e:
                    aid.log_error(e)
                finally:
                    aid.log_time()
                    sys.stdout = common
                    sys.stderr = common
            aid.log_time('Done')
    finally:
        sys.stdout = saved_stdout
        sys.stderr = saved_stderr
def out_pp_file(env_path, in_file_paths, mapper, out_file_path):
    """Write person rows with a per-household counter, each cell mapped.

    Only the first header line (one starting with ``RT``) across all inputs
    is kept, with ``sporder`` appended.  A row that has a school ID (cell
    26) and an age (cell 14) above 19 is reported and dropped.  Every
    remaining row gets a running 1-based counter within its household ID
    (cell 7) appended before its cells are passed through ``mapper``.

    NOTE(review): the kept header row is not excluded from the per-row
    steps, so it also gets a counter cell after ``sporder`` and its cells
    7 and 27 end up in the returned sets.  Confirm whether callers rely on
    this before changing it.

    Args:
        env_path: unused by this function.
        in_file_paths: iterable of input CSV paths, read in order.
        mapper: callable applied to every cell of a written row; its
            results are joined with commas.
        out_file_path: path of the CSV to create.

    Returns:
        A 3-tuple ``(hids, sc_ids, wp_ids)``: the set of cell-7 values of
        written rows, the school IDs of written rows, and every cell-27
        value of written rows (blank values included).
    """
    hid_idx, age_idx, relp_idx = 7, 14, 17
    school_idx, workplace_idx = 26, 27

    members_per_hid = {}
    householder_hids = set()
    workplace_ids = set()
    school_ids = set()

    aid.mkdir(out_file_path)
    with open(out_file_path, 'w') as fout:
        header_seen = False
        print('writing', os.path.abspath(out_file_path))
        for source_path in in_file_paths:
            with open(source_path, 'r') as source:
                print('reading', source_path)
                for line in source:
                    text = line.rstrip('\n')
                    fields = text.split(',')

                    if line.startswith('RT'):
                        if header_seen:
                            continue
                        header_seen = True
                        fields.append('sporder')

                    school = fields[school_idx]
                    age = fields[age_idx]
                    # The 'AGEP' test keeps the header out of int().
                    if school and age != 'AGEP':
                        if int(age) > 19:
                            print('Skipped due to too old at age of ' + age
                                  + ' to go to school ID =',
                                  school, ':', text)
                            continue
                        school_ids.add(school)

                    hid = fields[hid_idx]
                    if fields[relp_idx] == '0':
                        householder_hids.add(hid)

                    order = members_per_hid.get(hid, 0) + 1
                    fields.append(str(order))
                    members_per_hid[hid] = order
                    workplace_ids.add(fields[workplace_idx])

                    fout.write(','.join(map(mapper, fields)) + "\n")
    return set(members_per_hid), school_ids, workplace_ids
def _save_pp_as_csv(in_file_paths, pp_path, gq_pp_path):
    """Write person rows with four derived cells to ``pp_path``.

    Input column layout (number = index of the first column on the row)::

         0 SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude,
         5 latitude,AGEGRP,HRSWRK,IMMSTAT,INCTAX,
        10 MODE,OCC,POB,RELIGION,SEX,
        15 SYNTHETIC_PID

    Appended columns: ``made-sporder,made-empty,made-sex,made-age``, i.e. a
    running 1-based counter within the SYNTHETIC_HID, a blank cell, the SEX
    cell looked up in ``_reversed_sex`` (unknown values pass through) and
    ``_to_age(AGEGRP)``.

    Only the first header line (one starting with ``SERIALNO``) across all
    inputs is kept, and it goes to both outputs.  Data rows are written to
    ``pp_path`` only, so ``gq_pp_path`` receives nothing but the header.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        pp_path: path of the person CSV to create.
        gq_pp_path: path of the group-quarters person CSV to create.

    Returns:
        Dict mapping each household ID to the number of rows seen for it.
    """
    hid_idx, agegrp_idx, sex_idx = 3, 6, 14
    extra_header = 'made-sporder,made-empty,made-sex,made-age'
    expected_columns = 20
    members_per_hid = {}

    aid.mkdir(pp_path)
    aid.mkdir(gq_pp_path)
    with open(pp_path, 'w') as pp_out, open(gq_pp_path, 'w') as gq_out:
        print('writing', os.path.abspath(pp_path), os.path.abspath(gq_pp_path))
        header_written = False
        for source_path in in_file_paths:
            with open(source_path, 'r') as source:
                print('reading', source_path)
                for raw_line in source:
                    text = raw_line.rstrip('\n')

                    if text.startswith('SERIALNO'):
                        if header_written:
                            continue
                        header_written = True
                        header = ','.join([text, extra_header])
                        for out in (pp_out, gq_out):
                            aid.write_and_check_columns(
                                out, header, expected_columns)
                        continue

                    fields = text.split(',')
                    sex = fields[sex_idx]
                    agegroup = fields[agegrp_idx]
                    hid = fields[hid_idx]
                    order = members_per_hid.get(hid, 0) + 1
                    members_per_hid[hid] = order

                    extras = [
                        str(order),
                        '',
                        _reversed_sex.get(sex, sex),
                        _to_age(agegroup),
                    ]
                    row = ','.join([text] + extras)
                    aid.write_and_check_columns(pp_out, row, expected_columns)
    return members_per_hid
def out_hh_file(in_file_paths, mapper, out_file_path):
    """Copy the header and the householder rows, passing each cell to mapper.

    Lines are split on commas without stripping, so the last cell keeps its
    line terminator and is handed to ``mapper`` that way; no newline is
    added on output.  A line whose cell 17 is ``'RELP'`` is a header, and
    only the first one across all inputs is written.  Of the remaining
    lines, only those whose cell 17 is ``'0'`` are written.

    Args:
        in_file_paths: iterable of input CSV paths, read in order.
        mapper: callable applied to every cell of a written row; its
            results are joined with commas.
        out_file_path: path of the CSV to create.
    """
    relp_idx = 17
    aid.mkdir(out_file_path)
    with open(out_file_path, 'w') as fout:
        print('writing', os.path.abspath(out_file_path))
        header_written = False
        for source_path in in_file_paths:
            print('reading', source_path)
            with open(source_path, 'r') as source:
                for line in source:
                    fields = line.split(',')
                    relp = fields[relp_idx]
                    if relp == 'RELP':
                        if header_written:
                            continue
                        header_written = True
                    elif relp != '0':
                        continue
                    fout.write(','.join(map(mapper, fields)))