Example #1
    def __init__(self,
                 key,
                 controls_csv,
                 tazset=None,
                 puma_data_dir=None,
                 fips_file=None,
                 write_households_csv=None,
                 write_persons_csv=None,
                 write_append=False,
                 start_hhid=1,
                 start_persid=1):
        pd.options.display.width = 200
        pd.options.display.float_format = '{:,.3f}'.format
        pd.options.display.max_columns = 30

        # start ids here
        self.start_hhid = start_hhid
        self.start_persid = start_persid

        # Starter.__init__(self, key, '06', '075')
        self.c = Census(key, base_url=puma_data_dir, fips_url=fips_file)

        self.hh_csvfile = None
        if write_households_csv:
            self.hh_csvfile = open(write_households_csv,
                                   'a' if write_append else 'w')
        self.per_csvfile = None
        if write_persons_csv:
            self.per_csvfile = open(write_persons_csv,
                                    'a' if write_append else 'w')
        # track whether the CSV headers have been written; when appending,
        # assume they already exist in the file
        self.wrote_hh_header = write_append
        self.wrote_pers_header = write_append

        # Read the control file
        print("\n\nReading the control file [%s]" % controls_csv)
        self.controls = pd.read_csv(controls_csv, index_col=False)

        # Limit to only the specified TAZs
        if tazset and len(tazset) > 0:
            print "Using only TAZs in %s" % str(tazset)
            self.controls = self.controls[self.controls.SFTAZ.isin(tazset)]

        self.tazToPUMA2010 = pd.read_csv(
            r"Q:\Model Development\Population Synthesizer\4. Geographic Work\Census 2010 PUMAs\TAZ2454_to_Census2010PUMAs.csv",
            index_col=0,
            converters={'PUMA2010': str})

        self.state = '06'

        # for caching - indexed by puma
        self.h_pums = {}
        self.jd_households = {}
        self.p_pums = {}
        self.jd_persons = {}
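The constructor above caches a TAZ-to-PUMA correspondence table. A minimal
sketch of the lookup it enables, assuming the CSV's first column is the TAZ id
(as index_col=0 implies) and that this __init__ belongs to the SFCTAStarter
class shown in full in Example #6; the API key, file name, and TAZ number are
placeholders:

# hypothetical: construct the starter (class shown in full in Example #6)
starter = SFCTAStarter('my-census-api-key', controls_csv='tazdata_controls.csv')
puma = starter.tazToPUMA2010.loc[1, 'PUMA2010']   # TAZ 1 is a placeholder
h_pums, p_pums = starter.get_pums(puma)           # get_pums appears in Example #6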
Example #2
import os

from census_helpers import Census  # assumed import path for the Census helper


def get_acs_data(county, spec, settings):
    """Download household and person ACS tables for one county and merge them."""
    state = settings['state']
    census_year = settings['census_year']
    # the settings file stores a missing tract as the string 'None'
    if settings['tract'] == 'None':
        tract = None
    else:
        tract = settings['tract']

    c = Census(os.environ[settings['census_key']])

    hh_bg_columns = get_column_names('block_group', 'household', spec)
    hh_tract_columns = get_column_names('tract', 'household', spec)

    if len([x for x in hh_bg_columns if x in hh_tract_columns]) > 0:
        raise RuntimeError("The same ACS column is used at both the block "
                           "group and tract level. Please check the "
                           "expression file.")

    h_acs = c.block_group_and_tract_query(
        hh_bg_columns, hh_tract_columns, state, county,
        merge_columns=['tract', 'county', 'state'],
        block_group_size_attr=settings['hh_bg_size_attr'],
        tract_size_attr=settings['hh_tract_size_attr'],
        tract=tract,
        year=census_year)

    pers_bg_columns = get_column_names('block_group', 'person', spec)
    pers_tract_columns = get_column_names('tract', 'person', spec)

    if len([x for x in pers_bg_columns if x in pers_tract_columns]) > 0:
        raise RuntimeError("The same ACS column is used at both the block "
                           "group and tract level. Please check the "
                           "expression file.")

    p_acs = c.block_group_and_tract_query(
        pers_bg_columns, pers_tract_columns, state, county,
        merge_columns=['tract', 'county', 'state'],
        block_group_size_attr=settings['pers_bg_size_attr'],
        tract_size_attr=settings['pers_tract_size_attr'],
        tract=tract,
        year=census_year)

    all_acs = h_acs.merge(p_acs, how='left',
                          on=['state', 'county', 'tract', 'block group'])

    return all_acs
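A minimal sketch of calling get_acs_data; the settings keys mirror those read
above, but the settings file name, county FIPS code, and expression-file
loader are hypothetical:

import yaml        # assumes YAML-format settings
import pandas as pd

with open('configs/settings.yaml') as f:       # hypothetical path
    settings = yaml.safe_load(f)
spec = pd.read_csv('configs/controls.csv')     # hypothetical expression file

acs = get_acs_data('075', spec, settings)      # '075' = San Francisco County
print(acs.head())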
Example #3
    def __init__(self, key, controls_csv, tazset=None, puma_data_dir=None, fips_file=None,
                  write_households_csv=None, write_persons_csv=None, write_append=False,
                  start_hhid=1, start_persid=1):
        pd.options.display.width        = 200
        pd.options.display.float_format = '{:,.3f}'.format
        pd.options.display.max_columns  = 30

        # start ids here
        self.start_hhid     = start_hhid
        self.start_persid   = start_persid
        
        # Starter.__init__(self, key, '06', '075')
        self.c = Census(key, base_url=puma_data_dir, fips_url=fips_file)
        
        self.hh_csvfile = None
        if write_households_csv:
            self.hh_csvfile  = open(write_households_csv, 'a' if write_append else 'w')
        self.per_csvfile = None
        if write_persons_csv:
            self.per_csvfile = open(write_persons_csv, 'a' if write_append else 'w')
        # track whether the CSV headers have been written; when appending,
        # assume they already exist in the file
        self.wrote_hh_header = write_append
        self.wrote_pers_header = write_append

        # Read the control file
        print("\n\nReading the control file [%s]" % controls_csv)
        self.controls = pd.read_csv(controls_csv, index_col=False)

        # Limit to only the specified TAZs
        if tazset and len(tazset) > 0:
            print("Using only TAZs in %s" % str(tazset))
            self.controls = self.controls[self.controls.SFTAZ.isin(tazset)]

        self.tazToPUMA2010 = pd.read_csv(r"Q:\Model Development\Population Synthesizer\4. Geographic Work\Census 2010 PUMAs\TAZ2454_to_Census2010PUMAs.csv",
                                         index_col=0, converters = {'PUMA2010':str})
        
        self.state = '06'

        # for caching - indexed by puma
        self.h_pums         = {}
        self.jd_households  = {}
        self.p_pums         = {}
        self.jd_persons     = {}
Example #4
def c():
    return Census('bfa6b4e541243011fab6307a31aed9e91015ba90')
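This reads like a pytest fixture with its decorator cropped out. A sketch of
the assumed wiring; the decorator, imports, and test body are assumptions, not
part of the original snippet:

import pytest

from census_helpers import Census  # assumed import path

@pytest.fixture
def c():
    return Census('bfa6b4e541243011fab6307a31aed9e91015ba90')

def test_client_constructs(c):
    # pytest injects the fixture by argument name
    assert c is not None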
Example #5
    state = '25'
    state_code = 'ma'
elif city == 'Detroit':
    state = '26'
    state_code = 'mi'
    
ALL_ZONES_PATH = './scripts/cities/' + city + '/clean/model_area.geojson'
SIM_ZONES_PATH = './scripts/cities/' + city + '/clean/sim_zones.json'
OD_PATH = './scripts/cities/' + city + '/raw/LODES/' + state_code + '_od_main_JT00_2015.csv'
ALL_SYNTH_HH_PATH = './scripts/cities/' + city + '/clean/all_synth_hh.csv'
ALL_SYNTH_PERSONS_PATH = './scripts/cities/' + city + '/clean/all_synth_persons.csv'
SIM_POP_PATH = './scripts/cities/' + city + '/clean/sim_pop.json'
VACANT_PATH = './scripts/cities/' + city + '/clean/vacant.json'
FLOATING_PATH = './scripts/cities/' + city + '/clean/floating.json'

c = Census('7a25a7624075d46f112113d33106b6648f42686a')

# load the block group geojson for the whole area
# get set of tracts covered

# identify the data we want at tract and block group level

# Households
income_columns = ['B19001_0%02dE' % i for i in range(1, 18)]
vehicle_columns = ['B08201_0%02dE' % i for i in range(1, 7)]
workers_columns = ['B08202_0%02dE' % i for i in range(1, 6)]
families_columns = ['B11001_001E', 'B11001_002E']
# year_built_columns = ['B25034_001E', 'B25034_002E', 'B25034_003E']
# (includes vacant structures?)
tenure_columns = ['B25063_001E', 'B25075_001E']
block_group_columns = income_columns + families_columns + tenure_columns
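The block-group list above leaves out the vehicle and worker tables,
presumably so they can be queried at the tract level. A sketch of how the
query might continue, borrowing the block_group_and_tract_query signature from
Example #2; the county FIPS code and the size attributes are assumptions:

# hypothetical continuation -- assumes the same Census client API as Example #2
county = '163'                             # Wayne County, MI (placeholder)
tract_columns = vehicle_columns + workers_columns
h_acs = c.block_group_and_tract_query(
    block_group_columns, tract_columns, state, county,
    merge_columns=['tract', 'county', 'state'],
    block_group_size_attr='B11001_001E',   # assumed size attribute
    tract_size_attr='B08201_001E',         # assumed size attribute
    year=2015)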
Example #6
class SFCTAStarter(Starter):
    """
    The SFCTA starter takes the tazdata as input and formulates the marginal controls from there.

    Parameters
    ----------
    key : string
        Census API key for census_helpers.Census object
    
    Returns
    -------
    household_marginals : DataFrame
        Marginals per TAZ for the household data
    person_marginals : DataFrame
        Marginals per TAZ for the person data
    household_jointdist : DataFrame
        joint distributions for the households (from PUMS), one joint
        distribution for each PUMA (one row per PUMA)
    person_jointdist : DataFrame
        joint distributions for the persons (from PUMS), one joint
        distribution for each PUMA (one row per PUMA)
    """
    def __init__(self,
                 key,
                 controls_csv,
                 tazset=None,
                 puma_data_dir=None,
                 fips_file=None,
                 write_households_csv=None,
                 write_persons_csv=None,
                 write_append=False,
                 start_hhid=1,
                 start_persid=1):
        pd.options.display.width = 200
        pd.options.display.float_format = '{:,.3f}'.format
        pd.options.display.max_columns = 30

        # start ids here
        self.start_hhid = start_hhid
        self.start_persid = start_persid

        # Starter.__init__(self, key, '06', '075')
        self.c = Census(key, base_url=puma_data_dir, fips_url=fips_file)

        self.hh_csvfile = None
        if write_households_csv:
            self.hh_csvfile = open(write_households_csv,
                                   'a' if write_append else 'w')
        self.per_csvfile = None
        if write_persons_csv:
            self.per_csvfile = open(write_persons_csv,
                                    'a' if write_append else 'w')
        # track whether the CSV headers have been written; when appending,
        # assume they already exist in the file
        self.wrote_hh_header = write_append
        self.wrote_pers_header = write_append

        # Read the control file
        print("\n\nReading the control file [%s]" % controls_csv)
        self.controls = pd.read_csv(controls_csv, index_col=False)

        # Limit to only the specified TAZs
        if tazset and len(tazset) > 0:
            print "Using only TAZs in %s" % str(tazset)
            self.controls = self.controls[self.controls.SFTAZ.isin(tazset)]

        self.tazToPUMA2010 = pd.read_csv(
            r"Q:\Model Development\Population Synthesizer\4. Geographic Work\Census 2010 PUMAs\TAZ2454_to_Census2010PUMAs.csv",
            index_col=0,
            converters={'PUMA2010': str})

        self.state = '06'

        # for caching - indexed by puma
        self.h_pums = {}
        self.jd_households = {}
        self.p_pums = {}
        self.jd_persons = {}

    def get_geography_name(self):
        return "SFTAZ"

    def get_num_geographies(self):
        return len(self.controls)

    def get_available_geography_ids(self):
        # print "get_available_geography_ids"
        # return the ids of the geographies, in this case a state, county,
        # tract, block_group id tuple
        for tup in self.person_controls.index:  # [:30]:
            yield pd.Series(tup, index=self.person_controls.index.names)

    def get_household_marginal_for_geography(self, ind):
        """

        Parameters
        ----------
        ind : Series
            Labels are from get_geography_name(), in our case, just SFTAZ
        
        Returns
        -------
        Series
            Household marginals for this geography.
        """
        if isinstance(self.hh_controls.index, pd.MultiIndex):
            return self.hh_controls.loc[tuple(ind.values)]
        return self.hh_controls.loc[ind.values[0]]

    def get_person_marginal_for_geography(self, ind):
        """"
        Parameters
        ----------
        ind : Series
            Labels are from get_geography_name()
        
        Returns
        -------
        Series
            Person marginals for this geography.
        """
        if isinstance(self.person_controls.index, pd.MultiIndex):
            return self.person_controls.loc[tuple(ind.values)]
        return self.person_controls.loc[ind.values[0]]

    def get_pums(self, puma):
        """
        Fetch the PUMA data for households and persons and set all kinds of variables up
        according to SFCTA defs.
        """

        # this is cached so won't download more than once
        h_pums = self.c.download_household_pums(self.state, puma)

        # Get some attributes from the persons in the households
        # Household age categories
        p_pums = self.c.download_population_pums(self.state, puma)
        p_pums['_hhadlt'] = p_pums['AGEP'] >= 16
        p_pums['_hh65up'] = p_pums['AGEP'] >= 65
        p_pums['_hh5064'] = (p_pums['AGEP'] >= 50) & (p_pums['AGEP'] <= 64)
        p_pums['_hh3549'] = (p_pums['AGEP'] >= 35) & (p_pums['AGEP'] <= 49)
        p_pums['_hh2534'] = (p_pums['AGEP'] >= 25) & (p_pums['AGEP'] <= 34)
        p_pums['_hh1824'] = (p_pums['AGEP'] >= 18) & (p_pums['AGEP'] <= 24)
        p_pums['_hh1217'] = (p_pums['AGEP'] >= 12) & (p_pums['AGEP'] <= 17)
        p_pums['_hhc511'] = (p_pums['AGEP'] >= 5) & (p_pums['AGEP'] <= 11)
        p_pums['_hhchu5'] = (p_pums['AGEP'] < 5)

        # RAC1P race code, with Hispanic of any race recoded to 10
        # (in PUMS, HISP > 1 means Hispanic origin)
        p_pums['race'] = p_pums['RAC1P']
        p_pums.loc[p_pums['HISP'] > 1, 'race'] = 10

        # worker: ESR (Employment Status Recode) in 1,2,4,5
        # full time: WKHP (usual hours worked per week) >= 35
        p_pums['_hhfull'] = ((p_pums['ESR'] == 1) | (p_pums['ESR'] == 2) |
                             (p_pums['ESR'] == 4) |
                             (p_pums['ESR'] == 5)) & (p_pums['WKHP'] >= 35)
        # part time: WKHP < 35
        p_pums['_hhpart'] = ((p_pums['ESR'] == 1) | (p_pums['ESR'] == 2) |
                             (p_pums['ESR'] == 4) |
                             (p_pums['ESR'] == 5)) & (p_pums['WKHP'] < 35)

        p_pums['employ'] = 5  # not employed
        p_pums.loc[p_pums._hhfull, 'employ'] = 1
        p_pums.loc[p_pums._hhpart, 'employ'] = 2
        # employed but Class of Worker = Self-employed
        p_pums.loc[(p_pums.employ < 5) & ((p_pums.COW == 6) |
                                          (p_pums.COW == 7)), 'employ'] += 2

        p_pums['educn'] = 0
        p_pums.loc[p_pums.SCHG == 1, 'educn'] = 1  # Nursery school/preschool
        p_pums.loc[p_pums.SCHG == 2, 'educn'] = 2  # Kindergarten
        p_pums.loc[(p_pums.SCHG >= 3) & (p_pums.SCHG <= 6),
                   'educn'] = 3  # Grade 1-4
        p_pums.loc[(p_pums.SCHG >= 7) & (p_pums.SCHG <= 10),
                   'educn'] = 4  # Grade 5-8
        p_pums.loc[(p_pums.SCHG >= 11) & (p_pums.SCHG <= 14),
                   'educn'] = 5  # Grade 9-12
        p_pums.loc[p_pums.SCHG == 15, 'educn'] = 6  # College undergraduate
        p_pums.loc[p_pums.SCHG == 16,
                   'educn'] = 7  # Graduate or professional school

        # recode RELP (relationship to householder) into relat:
        # RELP 0-10 map to 1-11; RELP 11+ map to 17+ (codes 12-16 are unused)
        p_pums['relat'] = -1
        p_pums.loc[p_pums.RELP <= 10, 'relat'] = p_pums.RELP + 1
        p_pums.loc[p_pums.RELP >= 11, 'relat'] = p_pums.RELP + 6
        assert len(p_pums.loc[p_pums.relat < 1]) == 0

        # age of head of household
        p_pums['hhage'] = 0
        p_pums.loc[(p_pums.RELP == 0) | (p_pums.RELP == 16) |
                   (p_pums.RELP == 17), 'hhage'] = p_pums.AGEP

        # flag whether a group-quarters person is a child
        p_pums['gqchild'] = 0
        p_pums.loc[p_pums.AGEP < 18, 'gqchild'] = 1

        # group them to household unit serial number and sum
        people_grouped = p_pums.loc[:, [
            'serialno', '_hhadlt', '_hh65up', '_hh5064', '_hh3549', '_hh2534',
            '_hh1824', '_hh1217', '_hhc511', '_hhchu5', '_hhfull', '_hhpart',
            'hhage', 'PINCP', 'gqchild'
        ]].groupby(['serialno'])
        people_grouped_sum = people_grouped.sum()
        people_grouped_sum.rename(columns={
            '_hhadlt': 'hhadlt',
            '_hh65up': 'hh65up',
            '_hh5064': 'hh5064',
            '_hh3549': 'hh3549',
            '_hh2534': 'hh2534',
            '_hh1824': 'hh1824',
            '_hh1217': 'hh1217',
            '_hhc511': 'hhc511',
            '_hhchu5': 'hhchu5',
            '_hhfull': 'hhfull',
            '_hhpart': 'hhpart'
        },
                                  inplace=True)
        people_grouped_sum.reset_index(inplace=True)

        # These shouldn't be floats but pandas is summing bools that way
        # https://github.com/pydata/pandas/issues/7001
        cols = [
            'hhadlt', 'hh65up', 'hh5064', 'hh3549', 'hh2534', 'hh1824',
            'hh1217', 'hhc511', 'hhchu5', 'hhfull', 'hhpart', 'hhage',
            'gqchild'
        ]

        people_grouped_sum[cols] = people_grouped_sum[cols].astype(int)

        h_pums = h_pums.merge(people_grouped_sum, how='left')
        h_pums['workers'] = h_pums['hhfull'] + h_pums['hhpart']

        return h_pums, p_pums

    def write_households(self, geog_id, households):
        if self.hh_csvfile is None:
            return False

        # store the households for write_persons (zero-person households are
        # expected to have been filtered out already)
        self.households = households
        self.hh_geog_id = geog_id

        # add TAZ
        self.households['taz'] = geog_id.SFTAZ
        self.households.index.name = 'hh_id'

        # hhid = sequential.  hh_id = original
        self.households['hhid'] = range(self.start_hhid,
                                        self.start_hhid + len(self.households))
        self.start_hhid = self.start_hhid + len(self.households)

        self.households.to_csv(self.hh_csvfile,
                               index=False,
                               header=not self.wrote_hh_header)
        self.wrote_hh_header = True
        print "Wrote %d households" % len(households)
        return True

    def write_persons(self, geog_id, people):
        if self.per_csvfile is None:
            return False

        print("Will write %d people" % len(people))
        # print self.households

        # get rid of extraneous columns
        people = people.loc[:, [
            'race', 'employ', 'educn', 'relat', 'serialno', 'SPORDER',
            'PUMA00', 'PUMA10', 'NP', 'AGEP', 'TYPE', 'ESR', 'WKHP', 'COW',
            'SEX', 'RAC1P', 'HISP', 'SCHG', 'DIS', 'cat_id', 'hh_id'
        ]]

        # we want the taz column
        people['taz'] = geog_id.SFTAZ

        # get some columns from households
        hhs = self.households.loc[:, [
            'serialno', 'hhid', 'hhadlt', 'hh65up', 'hh5064', 'hh3549',
            'hh2534', 'hh1824', 'hh1217', 'hhc511', 'hhchu5', 'hhfull',
            'hhpart', 'workers', 'VEH', 'hhinc', 'income_cat'
        ]]
        # make the hh_id an actual column (not the index) for joining
        hhs.reset_index(drop=False, inplace=True)
        people = people.merge(hhs, how='left', on='hh_id')

        # rename some of these
        people.rename(columns={
            'NP': 'hhsize',
            'VEH': 'hhvehs',
            'SEX': 'gender',
            'AGEP': 'age'
        },
                      inplace=True)
        # this might be blank so it's a float: make it an int
        people.hhvehs = people.hhvehs.fillna(0.0).astype(int)

        # calculate a few fields
        people.sort_values(by=['hh_id', 'SPORDER'], inplace=True)
        people['persid'] = range(self.start_persid,
                                 self.start_persid + len(people))
        self.start_persid = self.start_persid + len(people)

        # these are the columns we want
        # http://intranet/Modeling/DisaggregateInputOutputKey
        output_fields = \
            ['hhid',
             'persid',
             'taz',
             'hhsize',
             'hhadlt',
             'hh65up','hh5064','hh3549','hh2534','hh1824','hh1217','hhc511','hhchu5',
             'hhfull','hhpart','hhvehs',
             'hhinc',
             'gender',
             'age',
             'relat',
             'race',
             'employ',
             'educn',
             'ESR','WKHP','SCHG','TYPE','DIS']

        # for field in output_fields:
        #     try:
        #         assert(field in list(people.columns.values))
        #     except Exception as e:
        #         print e
        #         print field

        people = people.loc[:, output_fields]

        # This should be a test!
        # people_allages = people['hh65up']+people['hh5064']+people['hh3549']+ \
        #                  people['hh2534']+people['hh1824']+people['hh1217']+ \
        #                  people['hhc511']+people['hhchu5']
        # people_allages = people_allages.astype(np.int64)
        # from pandas.util.testing import assert_series_equal
        # assert_series_equal(people_allages,people['hhsize'])

        people.to_csv(self.per_csvfile,
                      index=False,
                      header=not self.wrote_pers_header,
                      float_format="%.3f")
        self.wrote_pers_header = True
        print "Wrote %d people" % len(people)
        return True
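A minimal sketch of driving the full starter, using only methods defined
above; the API key, file names, and the synthesis step itself are
hypothetical, and hh_controls/person_controls are assumed to be prepared
elsewhere in the class:

# hypothetical driver -- assumes 'import pandas as pd' and a census_helpers
# module providing Census at module level
starter = SFCTAStarter('my-census-api-key',
                       controls_csv='tazdata_controls.csv',
                       write_households_csv='households.csv',
                       write_persons_csv='persons.csv')

for geog_id in starter.get_available_geography_ids():
    hh_marg = starter.get_household_marginal_for_geography(geog_id)
    p_marg = starter.get_person_marginal_for_geography(geog_id)
    # ... run the population synthesizer for this TAZ (not shown) ...
    # starter.write_households(geog_id, households)
    # starter.write_persons(geog_id, people)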
Example #7
class SFCTAStarter(Starter):
    """
    The SFCTA starter takes the tazdata as input and formulates the marginal controls from there.

    Parameters
    ----------
    key : string
        Census API key for census_helpers.Census object
    
    Returns
    -------
    household_marginals : DataFrame
        Marginals per TAZ for the household data
    person_marginals : DataFrame
        Marginals per TAZ for the person data
    household_jointdist : DataFrame
        joint distributions for the households (from PUMS), one joint
        distribution for each PUMA (one row per PUMA)
    person_jointdist : DataFrame
        joint distributions for the persons (from PUMS), one joint
        distribution for each PUMA (one row per PUMA)
    """
    def __init__(self, key, controls_csv, tazset=None, puma_data_dir=None, fips_file=None,
                  write_households_csv=None, write_persons_csv=None, write_append=False,
                  start_hhid=1, start_persid=1):
        pd.options.display.width        = 200
        pd.options.display.float_format = '{:,.3f}'.format
        pd.options.display.max_columns  = 30

        # start ids here
        self.start_hhid     = start_hhid
        self.start_persid   = start_persid
        
        # Starter.__init__(self, key, '06', '075')
        self.c = Census(key, base_url=puma_data_dir, fips_url=fips_file)
        
        self.hh_csvfile = None
        if write_households_csv:
            self.hh_csvfile  = open(write_households_csv, 'a' if write_append else 'w')
        self.per_csvfile = None
        if write_persons_csv:
            self.per_csvfile = open(write_persons_csv, 'a' if write_append else 'w')
        # track whether the CSV headers have been written; when appending,
        # assume they already exist in the file
        self.wrote_hh_header = write_append
        self.wrote_pers_header = write_append

        # Read the control file
        print("\n\nReading the control file [%s]" % controls_csv)
        self.controls = pd.read_csv(controls_csv, index_col=False)

        # Limit to only the specified TAZs
        if tazset and len(tazset) > 0:
            print("Using only TAZs in %s" % str(tazset))
            self.controls = self.controls[self.controls.SFTAZ.isin(tazset)]

        self.tazToPUMA2010 = pd.read_csv(r"Q:\Model Development\Population Synthesizer\4. Geographic Work\Census 2010 PUMAs\TAZ2454_to_Census2010PUMAs.csv",
                                         index_col=0, converters = {'PUMA2010':str})
        
        self.state = '06'

        # for caching - indexed by puma
        self.h_pums         = {}
        self.jd_households  = {}
        self.p_pums         = {}
        self.jd_persons     = {}

    def get_geography_name(self):
        return "SFTAZ"

    def get_num_geographies(self):
        return len(self.controls)

    def get_available_geography_ids(self):
        # print "get_available_geography_ids"
        # return the ids of the geographies, in this case a state, county,
        # tract, block_group id tuple
        for tup in self.person_controls.index: # [:30]:
            yield pd.Series(tup, index=self.person_controls.index.names)

    def get_household_marginal_for_geography(self, ind):
        """

        Parameters
        ----------
        ind : Series
            Labels are from get_geography_name(), in our case, just SFTAZ
        
        Returns
        -------
        Series
            Household marginals for this geography.
        """
        if isinstance(self.hh_controls.index, pd.MultiIndex):
            return self.hh_controls.loc[tuple(ind.values)]
        return self.hh_controls.loc[ind.values[0]]

    
    def get_person_marginal_for_geography(self, ind):
        """"
        Parameters
        ----------
        ind : Series
            Labels are from get_geography_name()
        
        Returns
        -------
        Series
            Person marginals for this geography.
        """
        if isinstance(self.person_controls.index, pd.MultiIndex):
            return self.person_controls.loc[tuple(ind.values)]
        return self.person_controls.loc[ind.values[0]]


    def get_pums(self, puma):
        """
        Fetch the PUMA data for households and persons and set all kinds of variables up
        according to SFCTA defs.
        """
        
        # this is cached so won't download more than once
        h_pums = self.c.download_household_pums(self.state, puma)
        
        # Get some attributes from the persons in the households
        # Household age categories
        p_pums = self.c.download_population_pums(self.state, puma)
        p_pums['_hhadlt'] = p_pums['AGEP']>=16
        p_pums['_hh65up'] = p_pums['AGEP']>=65
        p_pums['_hh5064'] = (p_pums['AGEP']>=50)&(p_pums['AGEP']<=64)
        p_pums['_hh3549'] = (p_pums['AGEP']>=35)&(p_pums['AGEP']<=49)
        p_pums['_hh2534'] = (p_pums['AGEP']>=25)&(p_pums['AGEP']<=34)
        p_pums['_hh1824'] = (p_pums['AGEP']>=18)&(p_pums['AGEP']<=24)
        p_pums['_hh1217'] = (p_pums['AGEP']>=12)&(p_pums['AGEP']<=17)
        p_pums['_hhc511'] = (p_pums['AGEP']>= 5)&(p_pums['AGEP']<=11)
        p_pums['_hhchu5'] = (p_pums['AGEP']<  5)
            
        # RAC1P race code, with Hispanic of any race recoded to 10
        # (in PUMS, HISP > 1 means Hispanic origin)
        p_pums['race'] = p_pums['RAC1P']
        p_pums.loc[p_pums['HISP'] > 1, 'race'] = 10

        # worker: ESR (Employment Status Recode) in 1,2,4,5
        # full time: WKHP (usual hours worked per week) >= 35
        p_pums['_hhfull'] = ((p_pums['ESR']==1)|(p_pums['ESR']==2)|(p_pums['ESR']==4)|(p_pums['ESR']==5))&(p_pums['WKHP']>=35)
        # part time: WKHP < 35 
        p_pums['_hhpart'] = ((p_pums['ESR']==1)|(p_pums['ESR']==2)|(p_pums['ESR']==4)|(p_pums['ESR']==5))&(p_pums['WKHP']< 35)
        
        p_pums['employ']  = 5  # not employed
        p_pums.loc[p_pums._hhfull, 'employ'] = 1
        p_pums.loc[p_pums._hhpart, 'employ'] = 2
        # employed but Class of Worker = Self-employed
        p_pums.loc[(p_pums.employ<5)&((p_pums.COW==6)|(p_pums.COW==7)), 'employ'] += 2
        
        p_pums['educn'] = 0
        p_pums.loc[p_pums.SCHG==1, 'educn'] = 1    # Nursery school/preschool
        p_pums.loc[p_pums.SCHG==2, 'educn'] = 2    # Kindergarten
        p_pums.loc[(p_pums.SCHG>= 3)&(p_pums.SCHG<= 6), 'educn'] = 3 # Grade 1-4
        p_pums.loc[(p_pums.SCHG>= 7)&(p_pums.SCHG<=10), 'educn'] = 4 # Grade 5-8
        p_pums.loc[(p_pums.SCHG>=11)&(p_pums.SCHG<=14), 'educn'] = 5 # Grade 9-12
        p_pums.loc[p_pums.SCHG==15, 'educn'] = 6    # College undergraduate
        p_pums.loc[p_pums.SCHG==16, 'educn'] = 7    # Graduate or professional school
        
        # recode RELP (relationship to householder) into relat:
        # RELP 0-10 map to 1-11; RELP 11+ map to 17+ (codes 12-16 are unused)
        p_pums['relat'] = -1
        p_pums.loc[p_pums.RELP <= 10, 'relat'] = p_pums.RELP + 1
        p_pums.loc[p_pums.RELP >= 11, 'relat'] = p_pums.RELP + 6
        assert len(p_pums.loc[p_pums.relat < 1]) == 0

        # age of head of household
        p_pums['hhage'] = 0
        p_pums.loc[(p_pums.RELP==0) | (p_pums.RELP==16) | (p_pums.RELP==17), 'hhage'] = p_pums.AGEP
        
        # flag whether a group-quarters person is a child
        p_pums['gqchild'] = 0
        p_pums.loc[p_pums.AGEP < 18, 'gqchild'] = 1

        # group them to household unit serial number and sum
        people_grouped = p_pums.loc[:,['serialno',
                                       '_hhadlt','_hh65up','_hh5064',
                                       '_hh3549','_hh2534','_hh1824',
                                       '_hh1217','_hhc511','_hhchu5',
                                       '_hhfull','_hhpart','hhage','PINCP','gqchild']].groupby(['serialno'])
        people_grouped_sum = people_grouped.sum()
        people_grouped_sum.rename(columns={'_hhadlt':'hhadlt',
                                           '_hh65up':'hh65up',
                                           '_hh5064':'hh5064',
                                           '_hh3549':'hh3549',
                                           '_hh2534':'hh2534',
                                           '_hh1824':'hh1824',
                                           '_hh1217':'hh1217',
                                           '_hhc511':'hhc511',
                                           '_hhchu5':'hhchu5',
                                           '_hhfull':'hhfull',
                                           '_hhpart':'hhpart'}, inplace=True)
        people_grouped_sum.reset_index(inplace=True)
        
        # These shouldn't be floats but pandas is summing bools that way
        # https://github.com/pydata/pandas/issues/7001
        cols = ['hhadlt','hh65up','hh5064','hh3549','hh2534','hh1824',
                'hh1217','hhc511','hhchu5','hhfull','hhpart','hhage','gqchild']

        people_grouped_sum[cols] = people_grouped_sum[cols].astype(int)
        
        h_pums = h_pums.merge(people_grouped_sum, how='left')
        h_pums['workers'] = h_pums['hhfull']+h_pums['hhpart']
        
        return h_pums, p_pums
    
    def write_households(self, geog_id, households):
        if self.hh_csvfile is None:
            return False
            
        # store the households for write_persons (zero-person households are
        # expected to have been filtered out already)
        self.households = households
        self.hh_geog_id = geog_id

        # add TAZ
        self.households['taz'] = geog_id.SFTAZ 
        self.households.index.name = 'hh_id'           
        
        # hhid = sequential.  hh_id = original
        self.households['hhid'] = range(self.start_hhid, self.start_hhid+len(self.households))
        self.start_hhid = self.start_hhid + len(self.households)

        self.households.to_csv(self.hh_csvfile, index=False, header=not self.wrote_hh_header)
        self.wrote_hh_header = True
        print "Wrote %d households" % len(households)
        return True
        
    def write_persons(self, geog_id, people):
        if self.per_csvfile is None:
            return False

        print("Will write %d people" % len(people))
        # print self.households
        
        # get rid of extraneous columns
        people = people.loc[:,['race','employ','educn','relat','serialno',
                               'SPORDER','PUMA00','PUMA10',
                               'NP','AGEP','TYPE','ESR','WKHP','COW','SEX',
                               'RAC1P','HISP','SCHG','DIS','cat_id','hh_id']]
            
        # we want the taz column
        people['taz'] = geog_id.SFTAZ

        # get some columns from households
        hhs = self.households.loc[:,
                                  ['serialno','hhid',
                                   'hhadlt',
                                   'hh65up','hh5064','hh3549',
                                   'hh2534','hh1824','hh1217',
                                   'hhc511','hhchu5',
                                   'hhfull','hhpart','workers',
                                   'VEH',
                                   'hhinc','income_cat']]
        # make the hh_id an actual column (not the index) for joining
        hhs.reset_index(drop=False, inplace=True)
        people = people.merge(hhs, how='left', on='hh_id')

        # rename some of these
        people.rename(columns={'NP':'hhsize',
                               'VEH':'hhvehs',
                               'SEX':'gender',
                               'AGEP':'age'}, inplace=True)
        # this might be blank so it's a float: make it an int
        people.hhvehs = people.hhvehs.fillna(0.0).astype(int)
        
        # calculate a few fields
        people.sort_values(by=['hh_id', 'SPORDER'], inplace=True)
        people['persid'] = range(self.start_persid, self.start_persid+len(people))
        self.start_persid = self.start_persid + len(people)
        
        # these are the columns we want
        # http://intranet/Modeling/DisaggregateInputOutputKey
        output_fields = \
            ['hhid',
             'persid',
             'taz',
             'hhsize',
             'hhadlt',
             'hh65up','hh5064','hh3549','hh2534','hh1824','hh1217','hhc511','hhchu5',
             'hhfull','hhpart','hhvehs',
             'hhinc',
             'gender',
             'age',
             'relat',
             'race',
             'employ',
             'educn',
             'ESR','WKHP','SCHG','TYPE','DIS']
            
        # for field in output_fields:
        #     try:
        #         assert(field in list(people.columns.values))
        #     except Exception as e:
        #         print e
        #         print field
            
        people = people.loc[:,output_fields]

        # This should be a test!
        # people_allages = people['hh65up']+people['hh5064']+people['hh3549']+ \
        #                  people['hh2534']+people['hh1824']+people['hh1217']+ \
        #                  people['hhc511']+people['hhchu5']
        # people_allages = people_allages.astype(np.int64)
        # from pandas.util.testing import assert_series_equal
        # assert_series_equal(people_allages,people['hhsize'])

        people.to_csv(self.per_csvfile, index=False, header=not self.wrote_pers_header, float_format="%.3f")
        self.wrote_pers_header = True
        print "Wrote %d people" % len(people)
        return True