Esempio n. 1
0
def _save_hh_as_csv(in_file_paths, hid2cnt, hh_path, gq_path):
    """Write one row per household to ``hh_path``; ``gq_path`` gets the header only.

    Input column layout (0-based index of the first column on each line):

         0 SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude,
         5 latitude,AGEGRP,HRSWRK,IMMSTAT,INCTAX,
        10 MODE,OCC,POB,RELIGION,SEX,
        15 SYNTHETIC_PID

    Every emitted row gains two columns: ``made-empty`` (always blank) and
    ``made-persons`` (``hid2cnt[hid]``).  Only the first input row seen for
    each SYNTHETIC_HID is emitted.  The first header line encountered is
    written to both outputs; later header lines are dropped.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        hid2cnt: Mapping from SYNTHETIC_HID to the value written in the
            ``made-persons`` column.
        hh_path: Output path for the household rows.
        gq_path: Output path that receives the header row only.

    Raises:
        KeyError: If a household ID found in the input is missing from
            ``hid2cnt``.
        IndexError: If a data line has fewer than four comma-separated cells.
    """
    hid_column = 3
    more_header = 'made-empty,made-persons'
    columns = 18  # 16 input columns + the 2 appended ones
    # Both outputs are opened below, so prepare the location of both.
    aid.mkdir(hh_path)
    aid.mkdir(gq_path)
    with open(hh_path, 'w') as hh_csv, open(gq_path, 'w') as gq_csv:
        print('writing', os.path.abspath(hh_path), os.path.abspath(gq_path))
        file_count = 0
        hids = set()
        for in_file_path in in_file_paths:
            print('reading', in_file_path)
            with open(in_file_path, 'r') as fin:
                for raw in fin:
                    line = raw.strip('\n')
                    if line.startswith('SERIALNO'):
                        file_count += 1
                        if file_count == 1:
                            row = line + ',' + more_header
                            aid.write_and_check_columns(hh_csv, row, columns)
                            aid.write_and_check_columns(gq_csv, row, columns)
                        continue
                    hid = line.split(',')[hid_column]
                    if hid in hids:
                        # Another person of a household already written.
                        continue
                    hids.add(hid)
                    # ',,' leaves the made-empty column blank.
                    row = line + ',,' + str(hid2cnt[hid])
                    aid.write_and_check_columns(hh_csv, row, columns)
Esempio n. 2
0
def _save_hh_as_csv(in_file_paths, hh_path, gq_path):
    """Split householder rows into a household CSV and a second CSV.

    A line whose cell 17 equals ``'RELP'`` is treated as a header: the first
    one is written to both outputs with ``,made-gq_type`` appended and later
    ones are dropped.  A line whose cell 17 equals ``'0'`` is written with a
    trailing empty column, to ``hh_path`` when cell 1 equals ``'1'`` and to
    ``gq_path`` otherwise.  Every other line is ignored.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        hh_path: Output path for rows whose cell 1 is ``'1'``.
        gq_path: Output path for the remaining selected rows.

    Raises:
        IndexError: If a line has fewer than 18 comma-separated cells.
    """
    type_column, relp_column, columns = 1, 17, 29
    for out_path in (hh_path, gq_path):
        aid.mkdir(out_path)
    with open(hh_path, 'w') as hh_csv, open(gq_path, 'w') as gq_csv:
        print('writing', os.path.abspath(hh_path), os.path.abspath(gq_path))
        headers_seen = 0
        for in_file_path in in_file_paths:
            print('reading', in_file_path)
            with open(in_file_path, 'r') as fin:
                for raw in fin:
                    record = raw.strip('\n')
                    fields = record.split(',')
                    relp = fields[relp_column]
                    if relp == 'RELP':
                        headers_seen += 1
                        if headers_seen == 1:
                            header = record + ',made-gq_type'
                            for target in (hh_csv, gq_csv):
                                aid.write_and_check_columns(
                                    target, header, columns)
                    elif relp == '0':
                        if fields[type_column] == '1':
                            target = hh_csv
                        else:
                            target = gq_csv
                        # The trailing comma is the empty made-gq_type cell.
                        aid.write_and_check_columns(
                            target, record + ',', columns)
Esempio n. 3
0
def out_hh_file(in_file_paths, mapper, out_file_path):
    """Write the first row of every household, passing each cell through mapper.

    Input layout:
    SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude, latitude,AGEGRP,
    HRSWRK,IMMSTAT,INCTAX, MODE,OCC,POB,RELIGION,SEX, SYNTHETIC_PID.
    One column is appended: ``dummy`` on the header, an empty cell on data
    rows.  Only the first header line encountered is kept.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        mapper: Callable applied to every cell (header cells included); it
            must return a string.
        out_file_path: Output CSV path.
    """
    print('writing', os.path.abspath(out_file_path))
    hid_column = 3
    aid.mkdir(out_file_path)
    seen_hids = set()
    with open(out_file_path, 'w') as fout:
        headers_seen = 0
        for in_file_path in in_file_paths:
            print('reading', in_file_path)
            with open(in_file_path, 'r') as fin:
                for raw in fin:
                    fields = raw.strip('\n').split(',')
                    is_header = raw.startswith('SERIALNO')
                    if is_header:
                        headers_seen += 1
                        if headers_seen > 1:
                            continue
                    fields.append('dummy' if is_header else '')
                    hid = fields[hid_column]
                    # A row is dropped only when its household was already
                    # written and the cell is not the header's column name.
                    if hid in seen_hids and hid != 'SYNTHETIC_HID':
                        continue
                    seen_hids.add(hid)
                    out_row = ','.join([mapper(field) for field in fields])
                    fout.write(out_row + '\n')
Esempio n. 4
0
def out_hh_file(in_file_paths, mapper, hid2hincome, out_file_path, gq_path):
    """Write one row per household, routing HHTYPE ``'11'`` rows to ``gq_path``.

    Input layout:
    COUNTRY,YEAR,SERIALNO,PERSONS,puma_id, HHTYPE,PERNUM,place_id,
    SYNTHETIC_HID,longitude, latitude,AGE,SEX,RACE,SCHOOL, INCTOT,
    SYNTHETIC_PID.
    Appended columns: made-age (``to_agep(AGE)``), made-race (RACE looked up
    in ``race2rac1p``, falling back to the raw value), made-income
    (``hid2hincome[hid]``) and made-empty (blank).

    Only the first input row seen for each SYNTHETIC_HID is emitted.  The
    first header line encountered is written to both outputs.  A warning is
    printed to stderr when the PERSONS cell exceeds 20.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        mapper: Not used by this function; kept in the signature for callers.
        hid2hincome: Mapping from SYNTHETIC_HID to the value written in the
            made-income column.
        out_file_path: Output path for rows whose HHTYPE is not ``'11'``.
        gq_path: Output path for rows whose HHTYPE is ``'11'``.

    Raises:
        KeyError: If a household ID is missing from ``hid2hincome``.
        ValueError: If the PERSONS cell of an emitted row is not an integer.
    """
    PERSONS_COLUMN = 3
    HHTYPE_COLUMN = 5
    HID_COLUMN = 8
    AGE_COLUMN = 11
    RACE_COLUMN = 13
    columns = 21  # 17 input columns + the 4 appended ones
    # Both outputs are opened below, so prepare the location of both.
    aid.mkdir(out_file_path)
    aid.mkdir(gq_path)
    hids = set()
    with open(out_file_path, 'w') as fout, open(gq_path, 'w') as gq_csv:
        print('writing', os.path.abspath(out_file_path))
        file_count = 0
        for in_file_path in in_file_paths:
            with open(in_file_path, 'r') as fin:
                print('reading', in_file_path)
                for line in fin:
                    cells = line.strip('\n').split(',')
                    if line.startswith('COUNTRY'):
                        file_count += 1
                        if file_count == 1:
                            row = ','.join(
                                cells
                            ) + ',made-age,made-race,made-income,made-empty'
                            aid.write_and_check_number_of_columns(
                                fout, row, columns)
                            aid.write_and_check_number_of_columns(
                                gq_csv, row, columns)
                        continue
                    hid = cells[HID_COLUMN]
                    if hid in hids:
                        # Another person of a household already written.
                        continue
                    hids.add(hid)
                    age = cells[AGE_COLUMN]
                    race = cells[RACE_COLUMN]
                    cells.extend([
                        to_agep(age),
                        str(race2rac1p.get(race, race)),
                        str(hid2hincome[hid]),
                        '',
                    ])
                    row = ','.join(cells)
                    if cells[HHTYPE_COLUMN] == '11':
                        aid.write_and_check_number_of_columns(
                            gq_csv, row, columns)
                    else:
                        aid.write_and_check_number_of_columns(
                            fout, row, columns)
                    persons = cells[PERSONS_COLUMN]
                    if int(persons) > 20:
                        msg = 'Warning: max persons according to NP is 20 but got'
                        print(msg,
                              persons,
                              ':',
                              row,
                              file=sys.stderr,
                              flush=True)
Esempio n. 5
0
def out_pp_file(in_file_paths, mapper, pp_path, gq_path):
    """Write one row per person and accumulate per-household totals.

    Input layout:
    COUNTRY,YEAR,SERIALNO,PERSONS,puma_id, HHTYPE,PERNUM,place_id,
    SYNTHETIC_HID,longitude, latitude,AGE,SEX,RACE,SCHOOL, INCTOT,
    SYNTHETIC_PID.
    Appended columns: made-sporder (1-based position of the person within
    the household, in input order), made-age (``to_agep(AGE)``), made-empty
    (blank) and made-race (RACE looked up in ``race2rac1p``, falling back to
    the raw value).

    Rows whose HHTYPE cell is ``'11'`` go to ``gq_path``, all others to
    ``pp_path``.  The first header line encountered is written to both.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        mapper: Not used by this function; kept in the signature for callers.
        pp_path: Output path for rows whose HHTYPE is not ``'11'``.
        gq_path: Output path for rows whose HHTYPE is ``'11'``.

    Returns:
        A 4-tuple ``(hids, sc_ids, wp_ids, hid2hincome)``: the set of
        household IDs seen, two sets that are always empty here, and a dict
        mapping each household ID to the sum of its members' INCTOT values.

    Raises:
        ValueError: If a non-blank INCTOT cell is not an integer.
    """
    HHTYPE_COLUMN = 5
    HID_COLUMN = 8
    AGE_COLUMN = 11
    RACE_COLUMN = 13
    INCTOT_COLUMN = 15
    columns = 21  # 17 input columns + the 4 appended ones
    hid2cnt = {}
    hid2hincome = {}
    wp_ids = set()  # never filled in this function
    sc_ids = set()  # never filled in this function
    aid.mkdir(pp_path)
    aid.mkdir(gq_path)
    with open(pp_path, 'w') as pp_csv, open(gq_path, 'w') as gq_csv:
        print('writing', os.path.abspath(pp_path))
        file_count = 0
        for in_file_path in in_file_paths:
            with open(in_file_path, 'r') as fin:
                print('reading', in_file_path)
                for line in fin:
                    cells = line.rstrip('\n').split(',')
                    if line.startswith('COUNTRY'):
                        file_count += 1
                        if file_count == 1:
                            cells.append(
                                'made-sporder,made-age,made-empty,made-race')
                            row = ','.join(cells)
                            aid.write_and_check_number_of_columns(
                                pp_csv, row, columns)
                            aid.write_and_check_number_of_columns(
                                gq_csv, row, columns)
                        continue
                    hid = cells[HID_COLUMN]
                    order = hid2cnt.get(hid, 0) + 1
                    hid2cnt[hid] = order
                    # A blank cell counts as 0.  Parsing the cell itself
                    # (rather than '0' + cell) also accepts signed values
                    # such as '-500', which used to raise ValueError.
                    income_cell = cells[INCTOT_COLUMN].strip()
                    income = int(income_cell) if income_cell else 0
                    hid2hincome[hid] = hid2hincome.get(hid, 0) + income
                    age = cells[AGE_COLUMN]
                    race = cells[RACE_COLUMN]
                    cells.extend([
                        str(order),
                        to_agep(age),
                        '',
                        str(race2rac1p.get(race, race)),
                    ])
                    row = ','.join(cells)
                    if cells[HHTYPE_COLUMN] == '11':
                        aid.write_and_check_number_of_columns(
                            gq_csv, row, columns)
                    else:
                        aid.write_and_check_number_of_columns(
                            pp_csv, row, columns)
    return set(hid2cnt), sc_ids, wp_ids, hid2hincome
Esempio n. 6
0
def _save_pp_as_csv(in_file_paths, pp_path, gq_pp_path):
    """Write one row per person, split by the type cell, and collect IDs.

    A line starting with ``'RT'`` is a header: the first one is written to
    both outputs with ``,made-sporder`` appended and later ones are dropped.
    Every other line gets the person's 1-based position within the household
    (in input order) appended and is written to ``pp_path`` when cell 1
    equals ``'1'``, otherwise to ``gq_pp_path``.

    A warning is printed, but the row is still written, when the school cell
    is non-empty and the age cell exceeds 19.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        pp_path: Output path for rows whose cell 1 is ``'1'``.
        gq_pp_path: Output path for all other data rows.

    Returns:
        A 3-tuple ``(hids, sc_ids, wp_ids)``: household IDs (cell 7) of rows
        whose cell 17 is ``'0'``, every school cell value (cell 26) and every
        workplace cell value (cell 27).  Empty cell values are included.
    """
    type_column, hid_column, age_column = 1, 7, 14
    relp_column, school_column, workplace_column = 17, 26, 27
    columns = 29
    persons_per_hid = {}
    hids, sc_ids, wp_ids = set(), set(), set()
    aid.mkdir(pp_path)
    aid.mkdir(gq_pp_path)
    with open(pp_path, 'w') as pp_csv, open(gq_pp_path, 'w') as gq_pp_csv:
        print('writing', os.path.abspath(pp_path), os.path.abspath(gq_pp_path))
        headers_seen = 0
        for in_file_path in in_file_paths:
            with open(in_file_path, 'r') as fin:
                print('reading', in_file_path)
                for raw in fin:
                    record = raw.strip('\n')
                    if record.startswith('RT'):
                        headers_seen += 1
                        if headers_seen == 1:
                            header = record + ',made-sporder'
                            aid.write_and_check_columns(
                                pp_csv, header, columns)
                            aid.write_and_check_columns(
                                gq_pp_csv, header, columns)
                        continue
                    fields = record.split(',')
                    school_id = fields[school_column]
                    age = fields[age_column]
                    # The age is parsed only when a school is assigned.
                    if school_id and int(age) > 19:
                        print('Warning: too old at age of', age,
                              'to go to school ID =', school_id, ':', record)
                    sc_ids.add(school_id)
                    hid = fields[hid_column]
                    if fields[relp_column] == '0':
                        hids.add(hid)
                    sporder = persons_per_hid.get(hid, 0) + 1
                    persons_per_hid[hid] = sporder
                    wp_ids.add(fields[workplace_column])
                    if fields[type_column] == '1':
                        target = pp_csv
                    else:
                        target = gq_pp_csv
                    aid.write_and_check_columns(
                        target, record + ',' + str(sporder), columns)
    return hids, sc_ids, wp_ids
Esempio n. 7
0
def out_pp_file(in_file_paths, mapper, out_file_path):
    """Write one row per person, passing every cell through ``mapper``.

    Input layout:
    SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude, latitude,AGEGRP,
    HRSWRK,IMMSTAT,INCTAX, MODE,OCC,POB,RELIGION,SEX, SYNTHETIC_PID.
    Appended columns: sporder (1-based position of the person within the
    household, in input order), dummy (blank), sex (SEX with ``'1'`` and
    ``'2'`` swapped, other values unchanged) and age (``to_age(AGEGRP)``).

    Only the first header line encountered is kept.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        mapper: Callable applied to every output cell; must return a string.
            On the header row the four new column names reach it as the
            single cell ``'sporder,dummy,sex,age'``.
        out_file_path: Output CSV path.

    Returns:
        A 3-tuple ``(hids, sc_ids, wp_ids)``: the set of household IDs seen
        and two sets that are always empty here.
    """
    print('writing', os.path.abspath(out_file_path))
    HID_COLUMN = 3
    AGEGRP_COLUMN = 6
    SEX_COLUMN = 14
    hid2cnt = {}
    wp_ids = set()  # never filled in this function
    sc_ids = set()  # never filled in this function
    skips = 0  # never incremented; kept so the summary line below is unchanged
    aid.mkdir(out_file_path)
    reversed_sex = {'1': '2', '2': '1'}
    with open(out_file_path, 'w') as fout:
        file_count = 0
        for in_file_path in in_file_paths:
            with open(in_file_path, 'r') as fin:
                print('reading', in_file_path)
                for line in fin:
                    cells = line.rstrip('\n').split(',')
                    if line.startswith('SERIALNO'):
                        file_count += 1
                        if file_count > 1:
                            continue
                        cells.append('sporder,dummy,sex,age')
                    else:
                        hid = cells[HID_COLUMN]
                        order = hid2cnt.get(hid, 0) + 1
                        # Read the source cells before extending the row so
                        # the indices always refer to input columns.
                        sex = cells[SEX_COLUMN]
                        agegroup = cells[AGEGRP_COLUMN]
                        cells.extend([
                            str(order),
                            '',
                            reversed_sex.get(sex, sex),
                            to_age(agegroup),
                        ])
                        hid2cnt[hid] = order
                    row = ','.join([mapper(x) for x in cells])
                    fout.write(row + "\n")
    print('Skipped', skips, 'rows due to private schools')
    return set(hid2cnt), sc_ids, wp_ids
Esempio n. 8
0
def _save_hh_as_csv(in_file_paths, hid2hincome, hh_path, gq_path):
    """Write one row per household, routing HHTYPE ``'11'`` rows to ``gq_path``.

    Column layout of the output (0-based index of the first column per line):

         0 COUNTRY,YEAR,SERIALNO,PERSONS,puma_id,
         5 HHTYPE,PERNUM,place_id,SYNTHETIC_HID,longitude,
        10 latitude,AGE,SEX,RACE,SCHOOL,
        15 INCTOT,SYNTHETIC_PID+made-age,made-race,made-income,
        20 made-empty

    Only the first input row seen for each SYNTHETIC_HID is emitted.  The
    first header line encountered goes to both outputs.  A warning is printed
    when the PERSONS cell of an emitted row exceeds 20.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        hid2hincome: Mapping from SYNTHETIC_HID to the value written in the
            made-income column.
        hh_path: Output path for rows whose HHTYPE is not ``'11'``.
        gq_path: Output path for rows whose HHTYPE is ``'11'``.

    Raises:
        KeyError: If a household ID is missing from ``hid2hincome``.
    """
    persons_column, hhtype_column, hid_column = 3, 5, 8
    age_column, race_column = 11, 13
    more_header = 'made-age,made-race,made-income,made-empty'
    columns = 21
    seen_hids = set()
    for out_path in (hh_path, gq_path):
        aid.mkdir(out_path)
    with open(hh_path, 'w') as hh_csv, open(gq_path, 'w') as gq_csv:
        print('writing', os.path.abspath(hh_path), os.path.abspath(gq_path))
        headers_seen = 0
        for in_file_path in in_file_paths:
            with open(in_file_path, 'r') as fin:
                print('reading', in_file_path)
                for raw in fin:
                    record = raw.strip('\n')
                    if record.startswith('COUNTRY'):
                        headers_seen += 1
                        if headers_seen == 1:
                            header = record + ',' + more_header
                            for target in (hh_csv, gq_csv):
                                aid.write_and_check_columns(
                                    target, header, columns)
                        continue
                    fields = record.split(',')
                    hid = fields[hid_column]
                    if hid in seen_hids:
                        continue
                    seen_hids.add(hid)
                    age, race = fields[age_column], fields[race_column]
                    extras = [
                        _to_agep(age),
                        str(race2rac1p.get(race, race)),
                        str(hid2hincome[hid]),
                        '',
                    ]
                    out_row = ','.join([record] + extras)
                    if fields[hhtype_column] == '11':
                        target = gq_csv
                    else:
                        target = hh_csv
                    aid.write_and_check_columns(target, out_row, columns)
                    persons = fields[persons_column]
                    if int(persons) > 20:
                        print('Warning: max persons of NP is 20 but got',
                              persons, ':', out_row)
Esempio n. 9
0
def _save_pp_as_csv(in_file_paths, pp_path, gq_path):
    """Write one row per person and accumulate per-household income.

    Column layout of the output (0-based index of the first column per line):

         0 COUNTRY,YEAR,SERIALNO,PERSONS,puma_id,
         5 HHTYPE,PERNUM,place_id,SYNTHETIC_HID,longitude,
        10 latitude,AGE,SEX,RACE,SCHOOL,
        15 INCTOT,SYNTHETIC_PID+made-sporder,made-age,made-empty,
        20 made-race

    made-sporder is the 1-based position of the person within the household,
    in input order.  Rows whose HHTYPE cell is ``'11'`` go to ``gq_path``,
    all others to ``pp_path``.  The first header line encountered is written
    to both outputs.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        pp_path: Output path for rows whose HHTYPE is not ``'11'``.
        gq_path: Output path for rows whose HHTYPE is ``'11'``.

    Returns:
        A 2-tuple ``(hids, hid2hincome)``: the set of household IDs seen and
        a dict mapping each household ID to the sum of its members' INCTOT
        values.

    Raises:
        ValueError: If a non-blank INCTOT cell is not an integer.
    """
    hhtype_column = 5
    hid_column = 8
    age_column = 11
    race_column = 13
    inctot_column = 15
    more_header = 'made-sporder,made-age,made-empty,made-race'
    columns = 21
    hid2cnt = {}
    hid2hincome = {}
    aid.mkdir(pp_path)
    aid.mkdir(gq_path)
    with open(pp_path, 'w') as pp_csv, open(gq_path, 'w') as gq_csv:
        print('writing', os.path.abspath(pp_path), os.path.abspath(gq_path))
        file_count = 0
        for in_file_path in in_file_paths:
            with open(in_file_path, 'r') as fin:
                print('reading', in_file_path)
                for raw in fin:
                    line = raw.rstrip('\n')
                    if line.startswith('COUNTRY'):
                        file_count += 1
                        if file_count == 1:
                            row = ','.join([line, more_header])
                            aid.write_and_check_columns(pp_csv, row, columns)
                            aid.write_and_check_columns(gq_csv, row, columns)
                        continue
                    cells = line.split(',')
                    hid = cells[hid_column]
                    order = hid2cnt.get(hid, 0) + 1
                    age = cells[age_column]
                    race = cells[race_column]
                    row = ','.join([
                        line,
                        str(order),
                        _to_agep(age), '',
                        str(race2rac1p.get(race, race))
                    ])
                    csv = gq_csv if cells[hhtype_column] == '11' else pp_csv
                    aid.write_and_check_columns(csv, row, columns)
                    hid2cnt[hid] = order
                    # A blank cell counts as 0.  Parsing the cell itself
                    # (rather than '0' + cell) also accepts signed values
                    # such as '-500', which used to raise ValueError.
                    income_cell = cells[inctot_column].strip()
                    income = int(income_cell) if income_cell else 0
                    hid2hincome[hid] = hid2hincome.get(hid, 0) + income
    return set(hid2cnt), hid2hincome
Esempio n. 10
0
def out_ref_hh_file(in_file_paths, mapper, out_file_path):
    """Copy every input line with each cell mapped, collecting cell 7 values.

    Lines are split without stripping, so the last cell keeps its line ending
    and each row is written back without adding one.  Header lines are not
    treated specially.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        mapper: Callable applied to every cell; must return a string.
        out_file_path: Output CSV path.

    Returns:
        The set of values found in cell 7 of every line.

    Raises:
        IndexError: If a line has fewer than 8 cells; the line is printed
            first.
    """
    hid_index = 7
    seen = set()
    aid.mkdir(out_file_path)
    with open(out_file_path, 'w') as fout:
        print('writing', os.path.abspath(out_file_path))
        for in_file_path in in_file_paths:
            print('reading', in_file_path)
            with open(in_file_path, 'r') as fin:
                for raw in fin:
                    fields = raw.split(',')
                    if not len(fields) > hid_index:
                        # Show the offending line before the lookup fails.
                        print(raw)
                    seen.add(fields[hid_index])
                    fout.write(','.join(map(mapper, fields)))
    return seen
Esempio n. 11
0
def out_ref_hh_file(in_file_paths, mapper, out_file_path):
    """Copy every input line with each cell mapped, collecting cell 3 values.

    Input layout: SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude, latitude.
    Lines are split without stripping, so the last cell keeps its line ending
    and each row is written back without adding one.  Header lines are not
    treated specially.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        mapper: Callable applied to every cell; must return a string.
        out_file_path: Output CSV path.

    Returns:
        The set of values found in cell 3 of every line.

    Raises:
        IndexError: If a line has fewer than 4 cells; the line is printed
            first.
    """
    print('writing', os.path.abspath(out_file_path))
    hid_index = 3
    collected = set()
    aid.mkdir(out_file_path)
    with open(out_file_path, 'w') as fout:
        for in_file_path in in_file_paths:
            print('reading', in_file_path)
            with open(in_file_path, 'r') as fin:
                for raw in fin:
                    fields = raw.split(',')
                    if hid_index >= len(fields):
                        # Show the offending line before the lookup fails.
                        print(raw)
                    collected.add(fields[hid_index])
                    mapped = []
                    for field in fields:
                        mapped.append(mapper(field))
                    fout.write(','.join(mapped))
    return collected
Esempio n. 12
0
def translate(states):
    """Translate every county of the given states, logging to files.

    A common log file ``logs/counties.<timestamp>`` receives stdout and
    stderr while the function runs.  For each county, stdout and stderr are
    redirected to ``logs/<state>/<county>.out`` and ``.err`` around the call
    to ``us.translate(county)``.  A county whose ``.out`` file already exists
    is skipped.  Exceptions raised for a county or a state are passed to
    ``aid.log_error`` and processing continues with the next one.

    On return (or on an unexpected error) ``sys.stdout`` and ``sys.stderr``
    are restored to the streams that were active on entry, and every
    per-county log file opened here is closed.

    Args:
        states: Iterable of state ID strings; the entry ``'input'`` is
            skipped.
    """
    print('Started translating counties in', states)
    path = 'logs/'
    aid.mkdir(path)
    with open(path + 'counties.' + str(datetime.now()), 'w') as common:
        # Remember the caller's streams so they can be put back before the
        # common log is closed; otherwise later prints hit a closed file.
        saved_stdout, saved_stderr = sys.stdout, sys.stderr
        sys.stdout = common
        sys.stderr = common
        try:
            print('Translating', states)
            for state in states:
                if state == 'input':
                    continue
                aid.log_time('Translating state ID = ' + state)
                try:
                    pp_csvs = spew.find_csvs(conf.pp_prefix, state)
                    counties = {to_county_id(csv) for csv in pp_csvs}
                    print(counties, flush=True)
                    for county in counties:
                        county_out = None
                        county_err = None
                        try:
                            prefix = path + state + '/' + county
                            stdout = prefix + '.out'
                            aid.mkdir(stdout)
                            if os.path.exists(stdout):
                                print(
                                    stdout,
                                    'already exists. Delete it if you want to rerun.'
                                )
                                continue
                            aid.log_time('Translating county ID = ' + county)
                            county_out = open(stdout, 'w')
                            sys.stdout = county_out
                            county_err = open(prefix + '.err', 'w')
                            sys.stderr = county_err
                            us.translate(county)
                        except Exception as e:
                            aid.log_error(e)
                        finally:
                            # Runs while the county streams are still active.
                            aid.log_time()
                            sys.stdout = common
                            sys.stderr = common
                            # Close the county logs only after the streams
                            # point elsewhere again.
                            for handle in (county_out, county_err):
                                if handle is not None:
                                    handle.close()
                except Exception as e:
                    aid.log_error(e)
                finally:
                    aid.log_time()
                    sys.stdout = common
                    sys.stderr = common
            aid.log_time('Done')
        finally:
            sys.stdout = saved_stdout
            sys.stderr = saved_stderr
Esempio n. 13
0
def out_pp_file(env_path, in_file_paths, mapper, out_file_path):
    """Write one row per person with an appended ``sporder`` column.

    A line starting with ``'RT'`` is a header: the first one is written with
    ``sporder`` appended and later ones are dropped.  Header lines are not
    counted as persons, so the header has the same number of columns as the
    data rows and its column names do not appear in the returned sets.

    For every other line the person's 1-based position within the household
    (cell 7, in input order) is appended.  A person whose school cell
    (cell 26) is non-empty and whose age (cell 14) exceeds 19 is reported
    and left out of the output and of all returned sets.

    Args:
        env_path: Not used by this function; kept in the signature for
            callers.
        in_file_paths: Iterable of input CSV paths, read in order.
        mapper: Callable applied to every output cell (header cells
            included); must return a string.
        out_file_path: Output CSV path.

    Returns:
        A 3-tuple ``(hids, sc_ids, wp_ids)``: household IDs, school cell
        values and workplace cell values of the rows written.  Empty cell
        values are included.
    """
    HID_COLUMN = 7
    SCHOOL_COLUMN = 26
    WORKPLACE_COLUMN = 27
    AGE_COLUMN = 14
    hid2cnt = {}
    wp_ids = set()
    sc_ids = set()
    aid.mkdir(out_file_path)
    with open(out_file_path, 'w') as fout:
        file_count = 0
        print('writing', os.path.abspath(out_file_path))
        for in_file_path in in_file_paths:
            with open(in_file_path, 'r') as fin:
                print('reading', in_file_path)
                for line in fin:
                    cells = line.rstrip('\n').split(',')
                    if line.startswith('RT'):
                        file_count += 1
                        if file_count == 1:
                            cells.append('sporder')
                            row = ','.join(mapper(x) for x in cells)
                            fout.write(row + "\n")
                        # Never fall through: a header is not a person.
                        continue
                    school_id = cells[SCHOOL_COLUMN]
                    age = cells[AGE_COLUMN]
                    # The 'AGEP' guard keeps a header line that was not
                    # recognised above from reaching int().
                    if school_id and age != 'AGEP':
                        if int(age) > 19:
                            print('Skipped due to too old at age of ' + age + ' to go to school ID =', school_id, ':', line.rstrip('\n'))
                            continue
                    sc_ids.add(school_id)
                    hid = cells[HID_COLUMN]
                    order = hid2cnt.get(hid, 0) + 1
                    cells.append(str(order))
                    hid2cnt[hid] = order
                    wp_ids.add(cells[WORKPLACE_COLUMN])
                    row = ','.join(mapper(x) for x in cells)
                    fout.write(row + "\n")
    return set(hid2cnt), sc_ids, wp_ids
Esempio n. 14
0
def _save_pp_as_csv(in_file_paths, pp_path, gq_pp_path):
    """Write one row per person to ``pp_path``; ``gq_pp_path`` gets the header only.

    Input column layout (0-based index of the first column on each line):

         0 SERIALNO,puma_id,place_id,SYNTHETIC_HID,longitude,
         5 latitude,AGEGRP,HRSWRK,IMMSTAT,INCTAX,
        10 MODE,OCC,POB,RELIGION,SEX,
        15 SYNTHETIC_PID

    Appended columns: made-sporder (1-based position of the person within
    the household, in input order), made-empty (blank), made-sex (SEX looked
    up in ``_reversed_sex``, falling back to the raw value) and made-age
    (``_to_age(AGEGRP)``).  The first header line encountered is written to
    both outputs; later header lines are dropped.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        pp_path: Output path for the person rows.
        gq_pp_path: Output path that receives the header row only.

    Returns:
        Dict mapping each SYNTHETIC_HID to the number of person rows seen
        for it.
    """
    hid_column, agegrp_column, sex_column = 3, 6, 14
    more_headers = 'made-sporder,made-empty,made-sex,made-age'
    columns = 20
    persons_per_hid = {}
    for out_path in (pp_path, gq_pp_path):
        aid.mkdir(out_path)
    with open(pp_path, 'w') as pp_csv, open(gq_pp_path, 'w') as gq_pp_csv:
        print('writing', os.path.abspath(pp_path), os.path.abspath(gq_pp_path))
        headers_seen = 0
        for in_file_path in in_file_paths:
            with open(in_file_path, 'r') as fin:
                print('reading', in_file_path)
                for raw in fin:
                    record = raw.rstrip('\n')
                    if record.startswith('SERIALNO'):
                        headers_seen += 1
                        if headers_seen == 1:
                            header = ','.join([record, more_headers])
                            for target in (pp_csv, gq_pp_csv):
                                aid.write_and_check_columns(
                                    target, header, columns)
                        continue
                    fields = record.split(',')
                    sex = fields[sex_column]
                    agegroup = fields[agegrp_column]
                    hid = fields[hid_column]
                    sporder = persons_per_hid.get(hid, 0) + 1
                    persons_per_hid[hid] = sporder
                    extras = [
                        str(sporder),
                        '',
                        _reversed_sex.get(sex, sex),
                        _to_age(agegroup),
                    ]
                    aid.write_and_check_columns(
                        pp_csv, ','.join([record] + extras), columns)
    return persons_per_hid
Esempio n. 15
0
def out_hh_file(in_file_paths, mapper, out_file_path):
    """Write the first header plus every row whose cell 17 is ``'0'``.

    A line whose cell 17 equals ``'RELP'`` is a header; only the first one is
    kept.  Lines are split without stripping, so the last cell keeps its line
    ending and rows are written back without adding one.

    Args:
        in_file_paths: Iterable of input CSV paths, read in order.
        mapper: Callable applied to every cell of a written row; must return
            a string.
        out_file_path: Output CSV path.

    Raises:
        IndexError: If a line has fewer than 18 comma-separated cells.
    """
    relp_index = 17
    aid.mkdir(out_file_path)
    with open(out_file_path, 'w') as fout:
        print('writing', os.path.abspath(out_file_path))
        headers_seen = 0
        for in_file_path in in_file_paths:
            print('reading', in_file_path)
            with open(in_file_path, 'r') as fin:
                for raw in fin:
                    fields = raw.split(',')
                    relp = fields[relp_index]
                    if relp == 'RELP':
                        headers_seen += 1
                        if headers_seen > 1:
                            continue
                    elif relp != '0':
                        continue
                    fout.write(','.join(map(mapper, fields)))