def __init__(self, key, controls_csv, tazset=None, puma_data_dir=None, fips_file=None,
                  write_households_csv=None, write_persons_csv=None, write_append=False,
                  start_hhid=1, start_persid=1):
        SFCTAStarter.__init__(self, key, controls_csv, tazset, puma_data_dir, fips_file,
                              write_households_csv, write_persons_csv, write_append,
                              start_hhid, start_persid)

        # Remove 0-group quarters controls
        self.controls = self.controls[self.controls['GQPOP']>0]

        self.hh_controls     = cat.categorize(self.controls, 
            {("hhsize_cat","1"):"GQPOP"}, index_cols=['SFTAZ'])

        # cat_name  hhsize_cat
        # cat_value          1
        # SFTAZ
        # 1                  5
        # 2                  5
        # 3                  4
        # 4                  7
        # 5                  6
        # 6                 12
        # 7                  3
        # 8                  2
        # 9                  6
        # 10                29

        self.person_controls = cat.categorize(self.controls,
            {("gqworker_cat","1" ):"GQWKRS",
             ("gqworker_cat","0" ):"GQNONWKRS",
             ("gqage_cat", "0-64"):"GQAGE064",
             ("gqage_cat", "65+" ):"GQAGE65P" }, index_cols=['SFTAZ'])
    def __init__(self, key, controls_csv, tazset=None, puma_data_dir=None, fips_file=None,
                  write_households_csv=None, write_persons_csv=None, write_append=False,
                  start_hhid=1, start_persid=1):
        SFCTAStarter.__init__(self, key, controls_csv, tazset, puma_data_dir, fips_file,
                              write_households_csv, write_persons_csv, write_append,
                              start_hhid, start_persid)

        # Remove 0-household controls
        self.controls = self.controls[self.controls['HHLDS']>0]
        
        # self.controls = self.controls.iloc[:2,]
        print "Household controls has length %d" % len(self.controls)

        self.hh_controls = cat.categorize(self.controls, 
            {("income_cat", "0-25k"  ): "HHINCQ1",
             ("income_cat", "25-45k" ): "HHINCQ2",
             ("income_cat", "45-75k" ): "HHINCQ3",
             ("income_cat", "75k+"   ): "HHINCQ4",
             ("hhsize_cat", "1"      ): "SZ1_HHLDS",
             ("hhsize_cat", "2"      ): "SZ2_HHLDS",
             ("hhsize_cat", "3"      ): "SZ3_HHLDS",
             ("hhsize_cat", "4"      ): "SZ4_HHLDS",
             ("hhsize_cat", "5+"     ): "SZ5_HHLDS",
             ("workers_cat", "0"     ): "WKR0_HHLDS",
             ("workers_cat", "1"     ): "WKR1_HHLDS",
             ("workers_cat", "2"     ): "WKR2_HHLDS",
             ("workers_cat", "3+"    ): "WKR3_HHLDS",
             ("htype_cat", "HAGE1K0"    ): "HAGE1KIDS0",
             ("htype_cat", "HAGE1K1"    ): "HAGE1KIDS1",
             ("htype_cat", "HAGE65KALL"    ): "HAGE65KIDSWHATEV" },
                                          index_cols=['SFTAZ'])
        
        # print self.hh_controls.loc[1:10,:]

        # cat_name  hhsize_cat                               income_cat                        workers_cat
        # cat_value          1       2      3      4      5+      0-30k  100k+  30-60k 60-100k           0       1       2      3+
        # SFTAZ                                                                                                                  
        # 1             28.365  46.970 40.260 49.410 139.995     51.540 61.848 130.700  59.912      52.155  93.025  96.990  62.830
        # 2             36.663  54.540 45.147 51.207 115.443     54.976 73.970  83.514  90.540      48.783  81.507  93.627  79.083
        # 3             73.610 104.353 77.074 71.012 106.951    261.136 28.188 106.849  37.827     158.911 196.149  67.115  10.825
        # 4             45.133  67.140 55.577 63.037 142.113     67.002 90.663 102.678 111.657      60.053 100.337 115.257  97.353
        # 5             34.038  56.364 48.312 59.292 167.994     62.707 75.211 157.363  71.719      62.586 111.630 116.388  75.396
        # 6             65.403  94.302 78.078 85.176 184.041    144.312 50.742 138.288 173.658      84.669 148.551 159.705 114.075
        # 7             46.920  66.516 49.128 45.264  68.172    166.646 17.759  68.419  24.176     101.292 125.028  42.780   6.900
        # 8             54.282  74.883 56.898 53.301  87.636     67.002 74.937  95.038  90.023      61.476 126.222 104.967  34.335
        # 9             42.471  63.180 52.299 59.319 133.731     63.566 85.465  97.339 105.630      56.511  94.419 108.459  91.611
        # 10            55.480  77.140 60.800 62.700 123.880     90.195 64.092 101.710 124.003      68.780 134.520 122.740  53.960


        # todo: add HAGE1KIDS0, HAGE1KIDS1, HAGE1KIDSWHATEV
        self.person_controls = cat.categorize(self.controls,
            {("age_cat", "0-4"  ): "AGE0004",
             ("age_cat", "5-19" ): "AGE0519",
             ("age_cat", "20-44"): "AGE2044",
             ("age_cat", "45-64"): "AGE4564",
             ("age_cat", "65+"  ): "AGE65P"},
                                          index_cols=['SFTAZ'])
    def __init__(self,
                 key,
                 controls_csv,
                 tazset=None,
                 puma_data_dir=None,
                 fips_file=None,
                 write_households_csv=None,
                 write_persons_csv=None,
                 write_append=False,
                 start_hhid=1,
                 start_persid=1):
        SFCTAStarter.__init__(self, key, controls_csv, tazset, puma_data_dir,
                              fips_file, write_households_csv,
                              write_persons_csv, write_append, start_hhid,
                              start_persid)

        # Remove 0-group quarters controls
        self.controls = self.controls[self.controls['GQPOP'] > 0]

        self.hh_controls = cat.categorize(self.controls,
                                          {("hhsize_cat", "1"): "GQPOP"},
                                          index_cols=['SFTAZ'])

        # cat_name  hhsize_cat
        # cat_value          1
        # SFTAZ
        # 1                  5
        # 2                  5
        # 3                  4
        # 4                  7
        # 5                  6
        # 6                 12
        # 7                  3
        # 8                  2
        # 9                  6
        # 10                29

        self.person_controls = cat.categorize(self.controls, {
            ("gqworker_cat", "1"): "GQWKRS",
            ("gqworker_cat", "0"): "GQNONWKRS",
            ("gqage_cat", "0-64"): "GQAGE064",
            ("gqage_cat", "65+"): "GQAGE65P"
        },
                                              index_cols=['SFTAZ'])
    def get_household_joint_dist_for_geography(self, ind):
        
        # check the cache to see if we've done it already
        puma = self.tazToPUMA2010.loc[ind.SFTAZ,'PUMA2010']
        if puma in self.h_pums.keys():
            return self.h_pums[puma], self.jd_households[puma]

        # if not, get the superclass to do a bunch of variable setting
        h_pums, p_pums = SFCTAStarter.get_pums(self, puma)
        orig_len = len(h_pums)

        # filter to housing unit only with number of persons > 0
        h_pums = h_pums[h_pums['NP']>0]
        # Only Housing units
        h_pums = h_pums[h_pums['TYPE']==1]
        print "Filtered to %d households from %d originally" % (len(h_pums), orig_len)
                
        # Household income
        h_pums['hhinc_2012dollars'] = h_pums['HINCP']*(0.000001*h_pums['ADJINC'])  # ADJINC has 6 implied decimal places
        h_pums['hhinc_1989dollars'] = 0.54*h_pums['hhinc_2012dollars']
        
        h_pums['hhinc'] = h_pums['hhinc_1989dollars']/1000.0  # in thousands of dollars
        # print sum(h_pums.loc[:,'hhinc']<0)
        h_pums.loc[h_pums.loc[:,'hhinc']<0,  'hhinc'] = 0.0       # no negatives
        # print sum(h_pums.loc[:,'hhinc']>255)
        h_pums.loc[h_pums.loc[:,'hhinc']>255,'hhinc'] = 255.0   # max = 255
        
        # For the following, r is a pandas.Series
        # It's basically a row from h_pums, so any variables defined above will be available
        
        def hhsize_cat(r):
            # NP = number of persons
            if r.NP >=5:
                return "5+"
            elif r.NP == 4:
                return "4"
            elif r.NP == 3:
                return "3"
            elif r.NP == 2:
                return "2"
            elif r.NP == 1:
                return "1"
            return "1"

        def income_cat(r):
            if r.hhinc < 25.0:
                return "0-25k"
            elif r.hhinc < 45.0:
                return "25-45k"
            elif r.hhinc < 75.0:
                return "45-75k"
            else:
                return "75k+"

        def workers_cat(r):
            # hmm... WIF = Workers in Family.  What about non-family households?
            if r.workers >= 3:
                return "3+"
            elif r.workers == 2:
                return "2"
            elif r.workers == 1:
                return "1"
            return "0"
        
        def htype_cat(r):
            if r.hhage < 65 and r.NOC==0:
                return "HAGE1K0"
            elif r.hhage < 65 and r.NOC>0:
                return "HAGE1K1"
            else:
                return "HAGE65KALL"

        h_pums, jd_households = cat.joint_distribution(
            h_pums,
            cat.category_combinations(self.hh_controls.columns),
            {"hhsize_cat": hhsize_cat,
             "income_cat": income_cat,
             "workers_cat": workers_cat,
             "htype_cat": htype_cat}
        )
        # cache them
        self.h_pums[puma]           = h_pums
        self.jd_households[puma]    = jd_households

        return h_pums, jd_households
Exemple #5
0
    def __init__(self,
                 key,
                 controls_csv,
                 tazset=None,
                 puma_data_dir=None,
                 fips_file=None,
                 write_households_csv=None,
                 write_persons_csv=None,
                 write_append=False,
                 start_hhid=1,
                 start_persid=1):
        SFCTAStarter.__init__(self, key, controls_csv, tazset, puma_data_dir,
                              fips_file, write_households_csv,
                              write_persons_csv, write_append, start_hhid,
                              start_persid)

        # Remove 0-household controls
        self.controls = self.controls[self.controls['HHLDS'] > 0]

        # self.controls = self.controls.iloc[:2,]
        print "Household controls has length %d" % len(self.controls)

        self.hh_controls = cat.categorize(
            self.controls, {
                ("income_cat", "0-25k"): "HHINCQ1",
                ("income_cat", "25-45k"): "HHINCQ2",
                ("income_cat", "45-75k"): "HHINCQ3",
                ("income_cat", "75k+"): "HHINCQ4",
                ("hhsize_cat", "1"): "SZ1_HHLDS",
                ("hhsize_cat", "2"): "SZ2_HHLDS",
                ("hhsize_cat", "3"): "SZ3_HHLDS",
                ("hhsize_cat", "4"): "SZ4_HHLDS",
                ("hhsize_cat", "5+"): "SZ5_HHLDS",
                ("workers_cat", "0"): "WKR0_HHLDS",
                ("workers_cat", "1"): "WKR1_HHLDS",
                ("workers_cat", "2"): "WKR2_HHLDS",
                ("workers_cat", "3+"): "WKR3_HHLDS",
                ("htype_cat", "HAGE1K0"): "HAGE1KIDS0",
                ("htype_cat", "HAGE1K1"): "HAGE1KIDS1",
                ("htype_cat", "HAGE65KALL"): "HAGE65KIDSWHATEV"
            },
            index_cols=['SFTAZ'])

        # print self.hh_controls.loc[1:10,:]

        # cat_name  hhsize_cat                               income_cat                        workers_cat
        # cat_value          1       2      3      4      5+      0-30k  100k+  30-60k 60-100k           0       1       2      3+
        # SFTAZ
        # 1             28.365  46.970 40.260 49.410 139.995     51.540 61.848 130.700  59.912      52.155  93.025  96.990  62.830
        # 2             36.663  54.540 45.147 51.207 115.443     54.976 73.970  83.514  90.540      48.783  81.507  93.627  79.083
        # 3             73.610 104.353 77.074 71.012 106.951    261.136 28.188 106.849  37.827     158.911 196.149  67.115  10.825
        # 4             45.133  67.140 55.577 63.037 142.113     67.002 90.663 102.678 111.657      60.053 100.337 115.257  97.353
        # 5             34.038  56.364 48.312 59.292 167.994     62.707 75.211 157.363  71.719      62.586 111.630 116.388  75.396
        # 6             65.403  94.302 78.078 85.176 184.041    144.312 50.742 138.288 173.658      84.669 148.551 159.705 114.075
        # 7             46.920  66.516 49.128 45.264  68.172    166.646 17.759  68.419  24.176     101.292 125.028  42.780   6.900
        # 8             54.282  74.883 56.898 53.301  87.636     67.002 74.937  95.038  90.023      61.476 126.222 104.967  34.335
        # 9             42.471  63.180 52.299 59.319 133.731     63.566 85.465  97.339 105.630      56.511  94.419 108.459  91.611
        # 10            55.480  77.140 60.800 62.700 123.880     90.195 64.092 101.710 124.003      68.780 134.520 122.740  53.960

        # todo: add HAGE1KIDS0, HAGE1KIDS1, HAGE1KIDSWHATEV
        self.person_controls = cat.categorize(self.controls, {
            ("age_cat", "0-4"): "AGE0004",
            ("age_cat", "5-19"): "AGE0519",
            ("age_cat", "20-44"): "AGE2044",
            ("age_cat", "45-64"): "AGE4564",
            ("age_cat", "65+"): "AGE65P"
        },
                                              index_cols=['SFTAZ'])
Exemple #6
0
    def get_household_joint_dist_for_geography(self, ind):

        # check the cache to see if we've done it already
        puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
        if puma in self.h_pums.keys():
            return self.h_pums[puma], self.jd_households[puma]

        # if not, get the superclass to do a bunch of variable setting
        h_pums, p_pums = SFCTAStarter.get_pums(self, puma)
        orig_len = len(h_pums)

        # filter to housing unit only with number of persons > 0
        h_pums = h_pums[h_pums['NP'] > 0]
        # Only Housing units
        h_pums = h_pums[h_pums['TYPE'] == 1]
        print "Filtered to %d households from %d originally" % (len(h_pums),
                                                                orig_len)

        # Household income
        h_pums['hhinc_2012dollars'] = h_pums['HINCP'] * (
            0.000001 * h_pums['ADJINC'])  # ADJINC has 6 implied decimal places
        h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']

        h_pums['hhinc'] = h_pums[
            'hhinc_1989dollars'] / 1000.0  # in thousands of dollars
        # print sum(h_pums.loc[:,'hhinc']<0)
        h_pums.loc[h_pums.loc[:, 'hhinc'] < 0, 'hhinc'] = 0.0  # no negatives
        # print sum(h_pums.loc[:,'hhinc']>255)
        h_pums.loc[h_pums.loc[:, 'hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

        # For the following, r is a pandas.Series
        # It's basically a row from h_pums, so any variables defined above will be available

        def hhsize_cat(r):
            # NP = number of persons
            if r.NP >= 5:
                return "5+"
            elif r.NP == 4:
                return "4"
            elif r.NP == 3:
                return "3"
            elif r.NP == 2:
                return "2"
            elif r.NP == 1:
                return "1"
            return "1"

        def income_cat(r):
            if r.hhinc < 25.0:
                return "0-25k"
            elif r.hhinc < 45.0:
                return "25-45k"
            elif r.hhinc < 75.0:
                return "45-75k"
            else:
                return "75k+"

        def workers_cat(r):
            # hmm... WIF = Workers in Family.  What about non-family households?
            if r.workers >= 3:
                return "3+"
            elif r.workers == 2:
                return "2"
            elif r.workers == 1:
                return "1"
            return "0"

        def htype_cat(r):
            if r.hhage < 65 and r.NOC == 0:
                return "HAGE1K0"
            elif r.hhage < 65 and r.NOC > 0:
                return "HAGE1K1"
            else:
                return "HAGE65KALL"

        h_pums, jd_households = cat.joint_distribution(
            h_pums, cat.category_combinations(self.hh_controls.columns), {
                "hhsize_cat": hhsize_cat,
                "income_cat": income_cat,
                "workers_cat": workers_cat,
                "htype_cat": htype_cat
            })
        # cache them
        self.h_pums[puma] = h_pums
        self.jd_households[puma] = jd_households

        return h_pums, jd_households
    def get_household_joint_dist_for_geography(self, ind):
        
        # check the cache to see if we've done it already
        puma = self.tazToPUMA2010.loc[ind.SFTAZ,'PUMA2010']
        if puma in self.h_pums.keys():
            return self.h_pums[puma], self.jd_households[puma]

        # if not, get the superclass to do a bunch of variable setting
        h_pums, p_pums = SFCTAStarter.get_pums(self, puma)
        orig_len = len(h_pums)
        
        # Don't bother filter number of persons -- this should happen with TYPE filter
        # h_pums = h_pums[h_pums['NP']==1]
        
        # Only Non-Institutional Group Quarters
        h_pums = h_pums[h_pums['TYPE']>2]
        print "Filtered to %d GQ 'households' from %d originally" % (len(h_pums), orig_len)
        np_bad = (h_pums.NP != 1)
        assert(np_bad.sum() == 0)
        
        # Group quarters income -- use PINCP
        h_pums.loc[pd.isnull(h_pums.loc[:,'PINCP']),'PINCP'] = 0.0       # no null
        h_pums['hhinc_2012dollars'] = h_pums['PINCP']*(0.000001*h_pums['ADJINC'])  # ADJINC has 6 implied decimal places
        h_pums['hhinc_1989dollars'] = 0.54*h_pums['hhinc_2012dollars']
        
        h_pums['hhinc'] = h_pums['hhinc_1989dollars']/1000.0  # in thousands of dollars
        # print sum(h_pums.loc[:,'hhinc']<0)
        h_pums.loc[h_pums.loc[:,'hhinc']<0,  'hhinc'] = 0.0              # no negatives
        # print sum(h_pums.loc[:,'hhinc']>255)
        h_pums.loc[h_pums.loc[:,'hhinc']>255,'hhinc'] = 255.0   # max = 255
        
        # For the following, r is a pandas.Series
        # It's basically a row from h_pums, so any variables defined above will be available
        
        def hhsize_cat(r):
            # NP = number of persons
            if r.NP >=5:
                return "5+"
            elif r.NP == 4:
                return "4"
            elif r.NP == 3:
                return "3"
            elif r.NP == 2:
                return "2"
            elif r.NP == 1:
                return "1"
            return "1"
        
        def income_cat(r):
            if r.hhinc < 25.0:
                return "0-25k"
            elif r.hhinc < 45.0:
                return "25-45k"
            elif r.hhinc < 75.0:
                return "45-75k"
            else:
                return "75k+"

        def workers_cat(r):
            # hmm... WIF = Workers in Family.  What about non-family households?
            if r.workers >= 3:
                return "3+"
            elif r.workers == 2:
                return "2"
            elif r.workers == 1:
                return "1"
            return "0"
        
        def htype_cat(r):
            if r.hhage < 65 and r.gqchild==0:
                return "HAGE1K0"
            elif r.hhage < 65 and r.gqchild > 0:
                return "HAGE1K1"
            else:
                return "HAGE65KALL"

        category_df = pd.DataFrame({'cat_id':[0], 'hhsize_cat':["1"]})
        category_df.set_index(['hhsize_cat'], inplace=True)
        h_pums, jd_households = cat.joint_distribution(
            h_pums,
            category_df,
            {"hhsize_cat": hhsize_cat,
             "income_cat": income_cat,
             "workers_cat": workers_cat,
             "htype_cat": htype_cat}
        )

        # cache them
        self.h_pums[puma]           = h_pums
        self.jd_households[puma]    = jd_households

        return h_pums, jd_households
    def get_household_joint_dist_for_geography(self, ind):

        # check the cache to see if we've done it already
        puma = self.tazToPUMA2010.loc[ind.SFTAZ, 'PUMA2010']
        if puma in self.h_pums.keys():
            return self.h_pums[puma], self.jd_households[puma]

        # if not, get the superclass to do a bunch of variable setting
        h_pums, p_pums = SFCTAStarter.get_pums(self, puma)
        orig_len = len(h_pums)

        # Don't bother filter number of persons -- this should happen with TYPE filter
        # h_pums = h_pums[h_pums['NP']==1]

        # Only Non-Institutional Group Quarters
        h_pums = h_pums[h_pums['TYPE'] > 2]
        print "Filtered to %d GQ 'households' from %d originally" % (
            len(h_pums), orig_len)
        np_bad = (h_pums.NP != 1)
        assert (np_bad.sum() == 0)

        # Group quarters income -- use PINCP
        h_pums.loc[pd.isnull(h_pums.loc[:, 'PINCP']), 'PINCP'] = 0.0  # no null
        h_pums['hhinc_2012dollars'] = h_pums['PINCP'] * (
            0.000001 * h_pums['ADJINC'])  # ADJINC has 6 implied decimal places
        h_pums['hhinc_1989dollars'] = 0.54 * h_pums['hhinc_2012dollars']

        h_pums['hhinc'] = h_pums[
            'hhinc_1989dollars'] / 1000.0  # in thousands of dollars
        # print sum(h_pums.loc[:,'hhinc']<0)
        h_pums.loc[h_pums.loc[:, 'hhinc'] < 0, 'hhinc'] = 0.0  # no negatives
        # print sum(h_pums.loc[:,'hhinc']>255)
        h_pums.loc[h_pums.loc[:, 'hhinc'] > 255, 'hhinc'] = 255.0  # max = 255

        # For the following, r is a pandas.Series
        # It's basically a row from h_pums, so any variables defined above will be available

        def hhsize_cat(r):
            # NP = number of persons
            if r.NP >= 5:
                return "5+"
            elif r.NP == 4:
                return "4"
            elif r.NP == 3:
                return "3"
            elif r.NP == 2:
                return "2"
            elif r.NP == 1:
                return "1"
            return "1"

        def income_cat(r):
            if r.hhinc < 25.0:
                return "0-25k"
            elif r.hhinc < 45.0:
                return "25-45k"
            elif r.hhinc < 75.0:
                return "45-75k"
            else:
                return "75k+"

        def workers_cat(r):
            # hmm... WIF = Workers in Family.  What about non-family households?
            if r.workers >= 3:
                return "3+"
            elif r.workers == 2:
                return "2"
            elif r.workers == 1:
                return "1"
            return "0"

        def htype_cat(r):
            if r.hhage < 65 and r.gqchild == 0:
                return "HAGE1K0"
            elif r.hhage < 65 and r.gqchild > 0:
                return "HAGE1K1"
            else:
                return "HAGE65KALL"

        category_df = pd.DataFrame({'cat_id': [0], 'hhsize_cat': ["1"]})
        category_df.set_index(['hhsize_cat'], inplace=True)
        h_pums, jd_households = cat.joint_distribution(
            h_pums, category_df, {
                "hhsize_cat": hhsize_cat,
                "income_cat": income_cat,
                "workers_cat": workers_cat,
                "htype_cat": htype_cat
            })

        # cache them
        self.h_pums[puma] = h_pums
        self.jd_households[puma] = jd_households

        return h_pums, jd_households