def _save_wp_as_csv(env_path, out_file_path, wp_ids): in_file_path = env_path + '/workplaces.csv' ids = set() columns = 8 with open(out_file_path, 'w') as fout: print('writing', os.path.abspath(out_file_path)) with open(in_file_path) as fin: print('reading', in_file_path) for raw in fin: line = raw.strip('\n') cells = line.split(',') wkb_hex = str(cells[5][1:-1]) wp_id = cells[1] if wkb_hex == 'wkb_geometry': row = 'made-longitude,made-latitude,' + line aid.write_and_check_columns(fout, row, columns) elif wp_id in wp_ids: row = wp.to_long_lat_from_hex(wkb_hex) + ',' + line aid.write_and_check_columns(fout, row, columns) ids.add(wp_id) difference = wp_ids.difference(ids) difference.discard('') difference.discard('workplace_id') if difference: raise Exception(difference, "are not found!")
def _save_pp_as_csv(in_file_paths, pp_path, gq_pp_path):
    """
    Split person records from several input CSVs into two output CSVs.

    Header lines start with 'RT'. Only the first header met is written
    (to both outputs, with ',made-sporder' appended); later headers are
    dropped. Every other line gets one extra cell: the running count of
    lines seen so far for its household ID (cell 7). The line goes to
    pp_path when cell 1 is '1' and to gq_pp_path otherwise.

    Returns a tuple (hids, sc_ids, wp_ids):
      hids   -- household IDs of lines whose cell 17 is '0'
      sc_ids -- every non-empty value found in cell 26
      wp_ids -- every value found in cell 27 (may include '')
    """
    type_at, hid_at, age_at, relp_at = 1, 7, 14, 17
    school_at, workplace_at = 26, 27
    width = 29

    seen_per_hid = {}
    head_hids = set()
    school_ids = set()
    workplace_ids = set()

    aid.mkdir(pp_path)
    aid.mkdir(gq_pp_path)
    with open(pp_path, 'w') as pp_out, open(gq_pp_path, 'w') as gq_out:
        print('writing', os.path.abspath(pp_path), os.path.abspath(gq_pp_path))
        headers_seen = 0
        for source_path in in_file_paths:
            with open(source_path, 'r') as source:
                print('reading', source_path)
                for raw in source:
                    record = raw.strip('\n')

                    if record.startswith('RT'):
                        headers_seen += 1
                        if headers_seen == 1:
                            header = record + ',made-sporder'
                            for out in (pp_out, gq_out):
                                aid.write_and_check_columns(out, header, width)
                        continue

                    fields = record.split(',')
                    school_id = fields[school_at]
                    age = fields[age_at]
                    if school_id:
                        # Students older than 19 are only reported; their
                        # school ID is still collected.
                        if int(age) > 19:
                            print('Warning: too old at age of', age, 'to go to school ID =', school_id, ':', record)
                        school_ids.add(school_id)

                    hid = fields[hid_at]
                    if fields[relp_at] == '0':
                        head_hids.add(hid)

                    position = seen_per_hid.get(hid, 0) + 1
                    seen_per_hid[hid] = position
                    workplace_ids.add(fields[workplace_at])

                    if fields[type_at] == '1':
                        target = pp_out
                    else:
                        target = gq_out
                    aid.write_and_check_columns(target, record + ',' + str(position), width)
    return head_hids, school_ids, workplace_ids
def _save_hh_as_csv(in_file_paths, hid2cnt, hh_path, gq_path):
    """
    Write one row per household ID to hh_path; gq_path receives the header.

    Input column layout (index of the first column on each line):
     0 SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude,
     5 latitude,AGEGRP,HRSWRK,IMMSTAT,INCTAX,
    10 MODE,OCC,POB,RELIGION,SEX,
    15 SYNTHETIC_PID

    Header lines start with 'SERIALNO'. Only the first header met is
    written, to both outputs, with ',made-empty,made-persons' appended.
    For every other line, the first time its SYNTHETIC_HID (cell 3) is
    seen the line is written to hh_path with two extra cells: an empty one
    and hid2cnt[hid]. Later lines with the same ID are skipped.

    in_file_paths -- iterable of input CSV paths, read in order
    hid2cnt       -- mapping from household ID to the value written in the
                     'made-persons' column (a missing ID raises KeyError
                     when hid2cnt is a plain dict)
    hh_path       -- household CSV to create
    gq_path       -- second CSV to create; only the header is written to it
    """
    hid_column = 3
    more_header = 'made-empty,made-persons'
    columns = 18
    aid.mkdir(hh_path)
    # gq_path is opened for writing below as well, so prepare it the same
    # way the sibling writers prepare both of their output paths.
    aid.mkdir(gq_path)
    with open(hh_path, 'w') as hh_csv, open(gq_path, 'w') as gq_csv:
        print('writing', os.path.abspath(hh_path), os.path.abspath(gq_path))
        file_count = 0
        hids = set()
        for in_file_path in in_file_paths:
            print('reading', in_file_path)
            with open(in_file_path, 'r') as fin:
                for raw in fin:
                    line = raw.strip('\n')
                    if line.startswith('SERIALNO'):
                        file_count += 1
                        if file_count > 1:
                            # Header already written from an earlier file.
                            continue
                        row = line + ',' + more_header
                        aid.write_and_check_columns(hh_csv, row, columns)
                        aid.write_and_check_columns(gq_csv, row, columns)
                    else:
                        cells = line.split(',')
                        hid = cells[hid_column]
                        if hid not in hids:
                            hids.add(hid)
                            # ',' + ',' leaves the 'made-empty' cell blank.
                            row = line + ',' + ',' + str(hid2cnt[hid])
                            aid.write_and_check_columns(hh_csv, row, columns)
def _save_hh_as_csv(in_file_paths, hh_path, gq_path):
    """
    Copy the lines whose cell 17 is '0' into one of two output CSVs.

    A line whose cell 17 is 'RELP' is a header: the first one met is
    written to both outputs with ',made-gq_type' appended, the rest are
    dropped. A line whose cell 17 is '0' is written with one empty cell
    appended, to hh_path when its cell 1 is '1' and to gq_path otherwise.
    All other lines are ignored.

    Every written row is passed to aid.write_and_check_columns() with an
    expected width of 29 columns.
    """
    type_at = 1
    relp_at = 17
    width = 29

    aid.mkdir(hh_path)
    aid.mkdir(gq_path)
    with open(hh_path, 'w') as hh_out, open(gq_path, 'w') as gq_out:
        print('writing', os.path.abspath(hh_path), os.path.abspath(gq_path))
        headers_seen = 0
        for source_path in in_file_paths:
            print('reading', source_path)
            with open(source_path, 'r') as source:
                for raw in source:
                    record = raw.strip('\n')
                    fields = record.split(',')
                    relp = fields[relp_at]

                    if relp == 'RELP':
                        headers_seen += 1
                        if headers_seen == 1:
                            header = record + ',made-gq_type'
                            for out in (hh_out, gq_out):
                                aid.write_and_check_columns(out, header, width)
                    elif relp == '0':
                        padded = record + ','
                        if fields[type_at] == '1':
                            target = hh_out
                        else:
                            target = gq_out
                        aid.write_and_check_columns(target, padded, width)
def _save_sc_as_csv(env_path, out_file_path, sc_ids): long_column = 5 columns = 11 in_file_paths = [ env_path + '/public_schools.csv', env_path + '/private_schools.csv' ] ids = set() with open(out_file_path, 'w') as fout: print('writing', os.path.abspath(out_file_path)) file_count = 0 for in_file_path in in_file_paths: with open(in_file_path) as fin: print('reading', in_file_path) for line in fin: cells = line.rstrip('\n').split(',') sc_id = cells[2][1:-1] if line.startswith('"","School"'): file_count += 1 if file_count > 1: continue row = ','.join(cells) + ',made-empty' aid.write_and_check_columns(fout, row, columns) elif sc_id in sc_ids: cells.append('') row = ','.join(cells[:long_column]) if len(cells) < columns: row += ',,' row += ',' + ','.join(cells[long_column:]) aid.write_and_check_columns(fout, row, columns) ids.add(sc_id) difference = sc_ids.difference(ids) difference.discard('') difference.discard('school_id') if difference: raise Exception(str(difference) + " are not found!") return ids
def _save_hh_as_csv(in_file_paths, hid2hincome, hh_path, gq_path):
    """
    Write one row per household ID, routed to hh_path or gq_path.

    Output column layout (index of the first column on each line):
     0 COUNTRY,YEAR,SERIALNO,PERSONS,puma_id,
     5 HHTYPE,PERNUM,place_id,SYNTHETIC_HID,longitude,
    10 latitude,AGE,SEX,RACE,SCHOOL,
    15 INCTOT,SYNTHETIC_PID + made-age,made-race,made-income,
    20 made-empty

    Header lines start with 'COUNTRY'; the first one met is written to
    both outputs with the four 'made-' names appended, later ones are
    dropped. For every other line, the first time its SYNTHETIC_HID is
    seen the line is written with four extra cells: _to_agep(AGE), the
    RACE value mapped through race2rac1p (unchanged when unmapped),
    hid2hincome[hid], and an empty cell. Rows with HHTYPE '11' go to
    gq_path, all others to hh_path.
    """
    persons_at, hhtype_at, hid_at, age_at, race_at = 3, 5, 8, 11, 13
    extra_names = 'made-age,made-race,made-income,made-empty'
    width = 21
    written_hids = set()

    aid.mkdir(hh_path)
    aid.mkdir(gq_path)
    with open(hh_path, 'w') as hh_out, open(gq_path, 'w') as gq_out:
        print('writing', os.path.abspath(hh_path), os.path.abspath(gq_path))
        headers_seen = 0
        for source_path in in_file_paths:
            with open(source_path, 'r') as source:
                print('reading', source_path)
                for raw in source:
                    record = raw.strip('\n')

                    if record.startswith('COUNTRY'):
                        headers_seen += 1
                        if headers_seen == 1:
                            header = record + ',' + extra_names
                            for out in (hh_out, gq_out):
                                aid.write_and_check_columns(out, header, width)
                        continue

                    fields = record.split(',')
                    hid = fields[hid_at]
                    if hid in written_hids:
                        continue
                    written_hids.add(hid)

                    race = fields[race_at]
                    extras = [
                        _to_agep(fields[age_at]),
                        str(race2rac1p.get(race, race)),
                        str(hid2hincome[hid]),
                        '',
                    ]
                    row = ','.join([record] + extras)
                    if fields[hhtype_at] == '11':
                        target = gq_out
                    else:
                        target = hh_out
                    aid.write_and_check_columns(target, row, width)

                    # NOTE(review): kept as a once-per-household check;
                    # confirm it was not meant to run on every input line.
                    persons = fields[persons_at]
                    if int(persons) > 20:
                        print('Warning: max persons of NP is 20 but got', persons, ':', row)
def _save_pp_as_csv(in_file_paths, pp_path, gq_path):
    """
    Write every person line to pp_path or gq_path and total household income.

    Output column layout (index of the first column on each line):
     0 COUNTRY,YEAR,SERIALNO,PERSONS,puma_id,
     5 HHTYPE,PERNUM,place_id,SYNTHETIC_HID,longitude,
    10 latitude,AGE,SEX,RACE,SCHOOL,
    15 INCTOT,SYNTHETIC_PID + made-sporder,made-age,made-empty,
    20 made-race

    Header lines start with 'COUNTRY'; the first one met is written to
    both outputs with the four 'made-' names appended, later ones are
    dropped. Every other line is written with four extra cells: the
    running count of lines seen for its SYNTHETIC_HID, _to_agep(AGE), an
    empty cell, and the RACE value mapped through race2rac1p (unchanged
    when unmapped). Lines with HHTYPE '11' go to gq_path, all others to
    pp_path.

    Returns a tuple (hids, hid2hincome):
      hids        -- set of every household ID seen
      hid2hincome -- dict from household ID to the sum of its INCTOT cells
                     (a blank INCTOT counts as 0)
    """
    hhtype_column = 5
    hid_column = 8
    age_column = 11
    race_column = 13
    inctot_column = 15
    more_header = 'made-sporder,made-age,made-empty,made-race'
    columns = 21
    hid2cnt = {}
    hid2hincome = {}
    aid.mkdir(pp_path)
    aid.mkdir(gq_path)
    with open(pp_path, 'w') as pp_csv, open(gq_path, 'w') as gq_csv:
        print('writing', os.path.abspath(pp_path), os.path.abspath(gq_path))
        file_count = 0
        for in_file_path in in_file_paths:
            with open(in_file_path, 'r') as fin:
                print('reading', in_file_path)
                for raw in fin:
                    line = raw.rstrip('\n')
                    if line.startswith('COUNTRY'):
                        file_count += 1
                        if file_count == 1:
                            row = ','.join([line, more_header])
                            aid.write_and_check_columns(pp_csv, row, columns)
                            aid.write_and_check_columns(gq_csv, row, columns)
                        continue
                    cells = line.split(',')
                    hid = cells[hid_column]
                    order = hid2cnt.get(hid, 0) + 1
                    age = cells[age_column]
                    race = cells[race_column]
                    row = ','.join([
                        line,
                        str(order),
                        _to_agep(age),
                        '',
                        str(race2rac1p.get(race, race)),
                    ])
                    out_csv = gq_csv if cells[hhtype_column] == '11' else pp_csv
                    aid.write_and_check_columns(out_csv, row, columns)
                    hid2cnt[hid] = order
                    # A blank cell counts as 0. Parsing the cell itself
                    # (rather than '0' + cell) also accepts signed values
                    # such as '-500', which previously raised ValueError.
                    income = int(cells[inctot_column].strip() or '0')
                    hid2hincome[hid] = hid2hincome.get(hid, 0) + income
    return set(hid2cnt), hid2hincome
def _save_pp_as_csv(in_file_paths, pp_path, gq_pp_path):
    """
    Write every person line to pp_path; gq_pp_path receives the header only.

    Input column layout (index of the first column on each line):
     0 SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude,
     5 latitude,AGEGRP,HRSWRK,IMMSTAT,INCTAX,
    10 MODE,OCC,POB,RELIGION,SEX,
    15 SYNTHETIC_PID

    Header lines start with 'SERIALNO'; the first one met is written to
    both outputs with ',made-sporder,made-empty,made-sex,made-age'
    appended, later ones are dropped. Every other line is written to
    pp_path with four extra cells: the running count of lines seen for
    its SYNTHETIC_HID, an empty cell, the SEX value mapped through
    _reversed_sex (unchanged when unmapped), and _to_age(AGEGRP).

    Returns the dict from household ID to its number of lines.
    """
    hid_at, agegrp_at, sex_at = 3, 6, 14
    extra_names = 'made-sporder,made-empty,made-sex,made-age'
    width = 20
    persons_per_hid = {}

    aid.mkdir(pp_path)
    aid.mkdir(gq_pp_path)
    with open(pp_path, 'w') as pp_out, open(gq_pp_path, 'w') as gq_out:
        print('writing', os.path.abspath(pp_path), os.path.abspath(gq_pp_path))
        headers_seen = 0
        for source_path in in_file_paths:
            with open(source_path, 'r') as source:
                print('reading', source_path)
                for raw in source:
                    record = raw.rstrip('\n')

                    if record.startswith('SERIALNO'):
                        headers_seen += 1
                        if headers_seen == 1:
                            header = record + ',' + extra_names
                            for out in (pp_out, gq_out):
                                aid.write_and_check_columns(out, header, width)
                        continue

                    fields = record.split(',')
                    sex = fields[sex_at]
                    age_group = fields[agegrp_at]
                    hid = fields[hid_at]

                    position = persons_per_hid.get(hid, 0) + 1
                    persons_per_hid[hid] = position

                    extras = [
                        str(position),
                        '',
                        _reversed_sex.get(sex, sex),
                        _to_age(age_group),
                    ]
                    aid.write_and_check_columns(pp_out, ','.join([record] + extras), width)
    return persons_per_hid