def _generate(self, agg_data, country):
    """
    Microsynthesise individuals for one country from aggregate counts and
    add them to self.pop.

    agg_data: 2-d array of counts; axis 0 becomes the 5-year age band (AGE5)
              and axis 1 becomes SEX. The asserts below require 17 distinct
              age bands and 2 distinct sexes to be populated.
    country:  value written to the "Country" column of every generated row.

    Raises RuntimeError if humanleague reports a failure or non-convergence.
    Raises AssertionError if any age band, single year or sex ends up empty.
    """
    # humanleague.qis needs integer marginals
    agg_data = humanleague.integerise(agg_data)["result"]
    # split 5y groups: spread the total population evenly over the 5 single
    # years inside a band, as integer frequencies
    split = humanleague.prob2IntFreq(np.ones(5) / 5, int(agg_data.sum()))["freq"]

    # dims 0,1 come from agg_data (age band, sex); dim 2 is the year within the band
    msynth = humanleague.qis(
        [np.array([0, 1], dtype=int), np.array([2], dtype=int)],
        [agg_data, split])
    # a non-dict return value is formatted into the error message below
    if not isinstance(msynth, dict):
        raise RuntimeError("microsynthesis general failure: %s" % msynth)
    if not msynth["conv"]:
        raise RuntimeError("microsynthesis convergence failure")

    # one list of category indices per dimension, one entry per individual
    raw = humanleague.flatten(msynth["result"])
    pop = pd.DataFrame({"AGE5": raw[0], "AGE1": raw[2], "SEX": raw[1]})

    # could fail here if zero people in any category
    assert len(pop.AGE5.unique()) == 17
    assert len(pop.AGE1.unique()) == 5
    assert len(pop.SEX.unique()) == 2

    # construct single year of age
    pop["Country"] = country
    pop["AGE"] = pop.AGE5 * 5 + pop.AGE1

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    self.pop = pd.concat([self.pop, pop.drop(["AGE5", "AGE1"], axis=1)])
def _generate_from_total(self, agg_value, country):
    """
    Microsynthesise individuals for one country when only a total population
    is available, and add them to self.pop.

    agg_value: total population; truncated to int and split evenly over
               17 age categories and 2 sexes.
    country:   value written to the "Country" column of every generated row.

    Raises RuntimeError if humanleague reports a failure or non-convergence.
    Raises AssertionError if any age category or sex ends up empty.
    """
    # TODO improve distribution
    total = int(agg_value)
    sex_split = humanleague.prob2IntFreq(np.ones(2) / 2, total)["freq"]
    age_split = humanleague.prob2IntFreq(np.ones(17) / 17, total)["freq"]

    # dim 0 is age category, dim 1 is sex
    msynth = humanleague.qis(
        [np.array([0], dtype=int), np.array([1], dtype=int)],
        [age_split, sex_split])
    # a non-dict return value is formatted into the error message below
    if not isinstance(msynth, dict):
        raise RuntimeError(
            "microsynthesis (from total) general failure: %s" % msynth)
    if not msynth["conv"]:
        raise RuntimeError(
            "microsynthesis (from total) convergence failure")

    # one list of category indices per dimension, one entry per individual
    raw = humanleague.flatten(msynth["result"])
    pop = pd.DataFrame({"AGE": raw[0], "SEX": raw[1]})

    # could fail here if zero people in any category
    assert len(pop.AGE.unique()) == 17
    assert len(pop.SEX.unique()) == 2

    # NOTE(review): AGE here is the 0-16 category index from the 17-way split,
    # not a single year of age as computed in _generate - confirm this is intended
    pop["Country"] = country

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    self.pop = pd.concat([self.pop, pop], sort=False)
def test_QIS_dim_indexing(self):
    """
    Check that hl.qis converges when two multi-dimensional marginals share
    only their first dimension, for several orderings of the dimension indices.
    """
    # tricky array indexing - 1st dimension of d0 already sampled, remaining dimension
    # indices on slice of d0 need to be remapped
    four_d = np.ones([4, 6, 4, 4], dtype=int)
    three_d = np.ones([4, 4, 4], dtype=int) * 6

    scenarios = (
        ([np.array([0, 1, 2, 3]), np.array([0, 4, 5])], [four_d, three_d]),
        ([np.array([0, 4, 5]), np.array([0, 1, 2, 3])], [three_d, four_d]),
        ([np.array([0, 1, 2]), np.array([0, 3, 4, 5])], [three_d, four_d]),
    )
    for dim_indices, marginals in scenarios:
        outcome = hl.qis(dim_indices, marginals)
        self.assertTrue(outcome["conv"])
def microsynthesise_seed(dc1117, dc2101, dc6206):
    """
    Build a 4-d seed population by microsynthesising two census tables that
    share geography and sex: dc1117 (adds age) and dc2101 (adds ethnicity).
    dc6206 is accepted but not used. Returns the "result" entry of the
    hl.qis output.
    """
    geog_count = len(dc1117.GEOGRAPHY_CODE.unique())
    sex_count = 2  # fixed, rather than len(dc1117.C_SEX.unique())
    age_count = len(dc1117.C_AGE.unique())
    sex_age_shape = [geog_count, sex_count, age_count]
    sex_by_age = unlistify(
        dc1117, ["GEOGRAPHY_CODE", "C_SEX", "C_AGE"], sex_age_shape, "OBS_VALUE")

    eth_count = len(dc2101.C_ETHPUK11.unique())
    sex_eth_shape = [geog_count, sex_count, eth_count]
    sex_by_eth = unlistify(
        dc2101, ["GEOGRAPHY_CODE", "C_SEX", "C_ETHPUK11"], sex_eth_shape, "OBS_VALUE")

    # TODO use microdata (national or perhaps regional) Mistral/persistent_data/seed_ASE_EW.csv
    # - requires unified age structure

    # microsynthesise these two into a 4D seed (if this has a lot of zeros can have big impact on microsim)
    print("Synthesising 2011 seed population...", end='')
    # dims: 0 = geography, 1 = sex, 2 = age, 3 = ethnicity
    dim_indices = [np.array([0, 1, 2]), np.array([0, 1, 3])]
    synth = hl.qis(dim_indices, [sex_by_age, sex_by_eth])
    check_result(synth)
    print("OK")
    return synth["result"]
def test_QIS(self):
    """
    Exercise hl.qis with 1-d marginals in 2, 3 and 4 dimensions and with two
    overlapping 2-d marginals. Each case checks convergence, the fit
    statistics, the population and that the result sums back to every
    marginal. The 3-d case also checks hl.flatten.
    """
    # --- 2 dimensions, 1-d marginals ---
    rows = np.array([52, 48])
    cols = np.array([10, 77, 13])
    res = hl.qis([np.array([0]), np.array([1])], [rows, cols])
    print(res)
    self.assertTrue(res["conv"])
    self.assertLess(res["chiSq"], 0.04)
    self.assertGreater(res["pValue"], 0.9)
    #self.assertLess(res["degeneracy"], 0.04) TODO check the calculation
    self.assertEqual(res["pop"], 100.0)
    self.assertTrue(np.allclose(np.sum(res["result"], axis=0), cols))
    self.assertTrue(np.allclose(np.sum(res["result"], axis=1), rows))

    # --- 3 dimensions, 1-d marginals ---
    marginals = [
        np.array([52, 40, 4, 4]),
        np.array([87, 10, 3]),
        np.array([55, 15, 6, 12, 12]),
    ]
    res = hl.qis([np.array([dim]) for dim in range(3)], marginals)
    self.assertTrue(res["conv"])
    self.assertLess(res["chiSq"], 73.0)  # TODO seems a bit high (probably )
    self.assertGreater(res["pValue"], 0.0)  # TODO this is suspect
    self.assertEqual(res["pop"], 100.0)
    # summing out two axes must leave the marginal of the remaining one
    for summed_axes, kept in (((0, 1), 2), ((1, 2), 0), ((2, 0), 1)):
        self.assertTrue(
            np.allclose(np.sum(res["result"], axis=summed_axes), marginals[kept]))

    # Test flatten functionality
    table = hl.flatten(res["result"])
    # length is no of dims
    self.assertTrue(len(table) == 3)
    # length of element is pop
    self.assertTrue(len(table[0]) == res["pop"])
    # check consistent with marginals
    for dim, marginal in enumerate(marginals):
        for state, expected in enumerate(marginal):
            self.assertTrue(table[dim].count(state) == expected)

    # --- 4 dimensions, 1-d marginals ---
    marginals = [
        np.array([52, 48]),
        np.array([87, 13]),
        np.array([67, 33]),
        np.array([55, 45]),
    ]
    res = hl.qis([np.array([dim]) for dim in range(4)], marginals)
    self.assertTrue(res["conv"])
    self.assertLess(res["chiSq"], 10)
    self.assertGreater(res["pValue"], 0.002)  # TODO this looks suspect too
    self.assertEqual(res["pop"], 100)
    for summed_axes, kept in (((0, 1, 2), 3), ((1, 2, 3), 0),
                              ((2, 3, 0), 1), ((3, 0, 1), 2)):
        self.assertTrue(
            np.allclose(np.sum(res["result"], axis=summed_axes), marginals[kept]))

    # --- 3 dimensions, two 2-d marginals sharing the middle dimension ---
    joint = np.array([[10, 20, 10], [10, 10, 20], [20, 10, 10]])
    res = hl.qis([np.array([0, 1]), np.array([1, 2])], [joint, joint])
    self.assertTrue(res["conv"])
    self.assertLess(res["chiSq"], 10)
    self.assertGreater(res["pValue"], 0.27)
    self.assertEqual(res["pop"], 120)
    print(np.sum(res["result"], 2))
    self.assertTrue(np.allclose(np.sum(res["result"], axis=2), joint))
    self.assertTrue(np.allclose(np.sum(res["result"], axis=0), joint))
def __add_households(self, area, constraints):
    """
    Microsynthesise the households of one geography and append them to
    self.dwellings.

    area:        geography code; rows of each census table are selected where
                 GEOGRAPHY_CODE == area.
    constraints: seed array passed to humanleague.qisi for the first stage
                 (tenure, rooms, occupants, bedrooms, household type). If that
                 stage fails to converge it is retried with
                 seed.get_impossible_TROBH() instead.

    Two stages:
      p0 - qisi over LC4404, LC4405 and LC4408 (overall dims 0-4)
      p1 - qis joining p0 with LC4402, LC4202 and LC4605 (overall dims 5-9)
    When self.scotland is set, tables whose tenure dimension is not usable are
    summed over tenure before being used as marginals.
    """
    # TODO use actual values from tables
    # TODO make members?
    # Category value lookups; trailing comment is the dimension index in the
    # table (and, in brackets, in the overall synthesised population)
    tenure_map = self.lc4402.C_TENHUK11.unique()  # 0
    rooms_map = self.lc4404.C_ROOMS.unique()  # 1
    occupants_map = self.lc4404.C_SIZHUK11.unique()  # 2
    bedrooms_map = self.lc4405.C_BEDROOMS.unique()  # 3 [1,2,3,4] or [-1]
    hhtype_map = self.lc4408.C_AHTHUK11.unique()  # 4
    #
    ch_map = self.lc4402.C_CENHEATHUK11.unique()  # 1 (5)
    buildtype_map = self.lc4402.C_TYPACCOM.unique()  # 2 (6)
    eth_map = self.lc4202.C_ETHHUK11.unique()  # 3 (7)
    cars_map = self.lc4202.C_CARSNO.unique()  # 4 (8)
    econ_map = self.lc4605.C_NSSEC.unique()  # 5 (9)

    tenure_rooms_occ = self.lc4404.loc[self.lc4404.GEOGRAPHY_CODE == area].copy()
    # unmap indices
    # TODO might be quicker to unmap the entire table upfront?
    utils.unmap(tenure_rooms_occ.C_TENHUK11, tenure_map)
    utils.unmap(tenure_rooms_occ.C_ROOMS, rooms_map)
    utils.unmap(tenure_rooms_occ.C_SIZHUK11, occupants_map)

    m4404 = utils.unlistify(
        tenure_rooms_occ, ["C_TENHUK11", "C_ROOMS", "C_SIZHUK11"],
        [len(tenure_map), len(rooms_map), len(occupants_map)], "OBS_VALUE")

    # no bedroom info in Scottish data
    tenure_beds_occ = self.lc4405.loc[self.lc4405.GEOGRAPHY_CODE == area].copy()

    # unmap indices
    utils.unmap(tenure_beds_occ.C_BEDROOMS, bedrooms_map)
    utils.unmap(tenure_beds_occ.C_TENHUK11, tenure_map)
    utils.unmap(tenure_beds_occ.C_SIZHUK11, occupants_map)

    m4405 = utils.unlistify(
        tenure_beds_occ, ["C_TENHUK11", "C_BEDROOMS", "C_SIZHUK11"],
        [len(tenure_map), len(bedrooms_map), len(occupants_map)], "OBS_VALUE")

    tenure_accom = self.lc4408.loc[self.lc4408.GEOGRAPHY_CODE == area].copy()

    utils.unmap(tenure_accom.C_TENHUK11, tenure_map)
    utils.unmap(tenure_accom.C_AHTHUK11, hhtype_map)

    m4408 = utils.unlistify(
        tenure_accom, ["C_TENHUK11", "C_AHTHUK11"],
        [len(tenure_map), len(hhtype_map)], "OBS_VALUE")

    # TODO relax IPF tolerance and maxiters when used within QISI?
    m4408dim = np.array([0, 4])
    # collapse m4408 dim for scotland
    if self.scotland:
        m4408 = np.sum(m4408, axis=0)
        m4408dim = np.array([4])
    p0 = humanleague.qisi(
        constraints,
        [np.array([0, 1, 2]), np.array([0, 3, 2]), m4408dim],
        [m4404, m4405, m4408])

    # drop the survey seed if there are convergence problems
    # TODO check_humanleague_result needs complete refactoring
    if not isinstance(p0, dict) or not p0["conv"]:
        print("Dropping TROBH constraint due to convergence failure")
        p0 = humanleague.qisi(
            seed.get_impossible_TROBH(),
            [np.array([0, 1, 2]), np.array([0, 3, 2]), m4408dim],
            [m4404, m4405, m4408])
        utils.check_humanleague_result(p0, [m4404, m4405, m4408],
                                       seed.get_impossible_TROBH())
    else:
        utils.check_humanleague_result(p0, [m4404, m4405, m4408], constraints)

    tenure_ch_accom = self.lc4402.loc[self.lc4402.GEOGRAPHY_CODE == area].copy()
    utils.unmap(tenure_ch_accom.C_CENHEATHUK11, ch_map)
    utils.unmap(tenure_ch_accom.C_TENHUK11, tenure_map)
    utils.unmap(tenure_ch_accom.C_TYPACCOM, buildtype_map)

    m4402 = utils.unlistify(
        tenure_ch_accom, ["C_TENHUK11", "C_CENHEATHUK11", "C_TYPACCOM"],
        [len(tenure_map), len(ch_map), len(buildtype_map)], "OBS_VALUE")

    tenure_eth_car = self.lc4202.loc[self.lc4202.GEOGRAPHY_CODE == area].copy()
    utils.unmap(tenure_eth_car.C_ETHHUK11, eth_map)
    utils.unmap(tenure_eth_car.C_CARSNO, cars_map)
    utils.unmap(tenure_eth_car.C_TENHUK11, tenure_map)

    m4202 = utils.unlistify(
        tenure_eth_car, ["C_TENHUK11", "C_ETHHUK11", "C_CARSNO"],
        [len(tenure_map), len(eth_map), len(cars_map)], "OBS_VALUE")

    econ = self.lc4605.loc[self.lc4605.GEOGRAPHY_CODE == area].copy()
    utils.unmap(econ.C_NSSEC, econ_map)
    utils.unmap(econ.C_TENHUK11, tenure_map)

    # econ counts often slightly lower, need to tweak
    ##econ = utils.adjust(econ, tenure_eth_car)

    m4605 = utils.unlistify(
        econ, ["C_TENHUK11", "C_NSSEC"],
        [len(tenure_map), len(econ_map)], "OBS_VALUE")
    m4605_sum = np.sum(m4605)
    m4202_sum = np.sum(m4202)

    # rescale LC4605 so its total matches LC4202 before both are used as
    # marginals of the same population
    if m4605_sum != m4202_sum:
        print("LC4402: %d LC4605: %d -> %d " %
              (np.sum(m4402), m4605_sum, m4202_sum), end="")
        tenure_4202 = np.sum(m4202, axis=(1, 2))
        nssec_4605_adj = humanleague.prob2IntFreq(
            np.sum(m4605, axis=0) / m4605_sum, m4202_sum)["freq"]
        # m4605_adj = humanleague.qisi(m4605.astype(float), [np.array([0]), np.array([1])], [tenure_4202, nssec_4605_adj])
        # Convergence problems can occur when e.g. one of the tenure rows is zero yet the marginal total is nonzero,
        # Can get round this by adding a small number to the seed
        # effectively allowing zero states to be occupied with a finite probability
        # if not m4605_adj["conv"]:
        m4605_adj = humanleague.qisi(
            m4605.astype(float) + 1.0 / m4202_sum,
            [np.array([0]), np.array([1])],
            [tenure_4202, nssec_4605_adj])

        utils.check_humanleague_result(m4605_adj, [tenure_4202, nssec_4605_adj])
        m4605 = m4605_adj["result"]

    # no seed constraint so just use QIS
    if self.scotland:
        # tenures not mappable in LC4202
        m4202 = np.sum(m4202, axis=0)
        m4605 = np.sum(m4605, axis=0)
        p1 = humanleague.qis([
            np.array([0, 1, 2, 3, 4]),
            np.array([0, 5, 6]),
            np.array([7, 8]),
            np.array([9])
        ], [p0["result"], m4402, m4202, m4605])
    else:
        p1 = humanleague.qis([
            np.array([0, 1, 2, 3, 4]),
            np.array([0, 5, 6]),
            np.array([0, 7, 8]),
            np.array([0, 9])
        ], [p0["result"], m4402, m4202, m4605])
    utils.check_humanleague_result(p1, [p0["result"], m4402, m4202, m4605])

    # one list of category indices per overall dimension, one entry per household
    table = humanleague.flatten(p1["result"])

    chunk = pd.DataFrame(columns=self.dwellings.columns.values)
    chunk.Area = np.repeat(area, len(table[0]))
    chunk.LC4402_C_TENHUK11 = utils.remap(table[0], tenure_map)
    chunk.QS420_CELL = np.repeat(self.NOTAPPLICABLE, len(table[0]))
    chunk.LC4404_C_ROOMS = utils.remap(table[1], rooms_map)
    chunk.LC4404_C_SIZHUK11 = utils.remap(table[2], occupants_map)
    chunk.LC4405EW_C_BEDROOMS = utils.remap(table[3], bedrooms_map)
    chunk.LC4408_C_AHTHUK11 = utils.remap(table[4], hhtype_map)
    chunk.LC4402_C_CENHEATHUK11 = utils.remap(table[5], ch_map)
    chunk.LC4402_C_TYPACCOM = utils.remap(table[6], buildtype_map)
    chunk.CommunalSize = np.repeat(self.NOTAPPLICABLE, len(table[0]))
    chunk.LC4202_C_ETHHUK11 = utils.remap(table[7], eth_map)
    chunk.LC4202_C_CARSNO = utils.remap(table[8], cars_map)
    chunk.LC4605_C_NSSEC = utils.remap(table[9], econ_map)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    self.dwellings = pd.concat([self.dwellings, chunk], ignore_index=True)
def __get_census_data_sc(self):
    """
    Fetch the Scottish census tables through self.api_sc and store them on the
    instance under the attribute names lc4402, lc4404, lc4405, lc4408, lc1105,
    ks401, lc4202, lc4605 and communal.

    LC4404 and LC4408 are not fetched directly: they are microsynthesised
    here with humanleague.qis from the LC4402 tenure marginal and univariate
    tables (QS407/QS406 and QS116). LC4405 is derived from the synthesised
    LC4404 with every bedroom value set to Household.UNKNOWN.

    The total of LC4402 is used as a checksum; several tables are asserted to
    sum to the same value, so an AssertionError here means the source tables
    disagree on the number of households.
    """
    #print(self.api_sc.get_metadata("LC4404SC", self.resolution))

    self.lc4402 = self.api_sc.get_data("LC4402SC",
                                       self.region,
                                       self.resolution,
                                       category_filters={
                                           "LC4402SC_0_CODE": [2, 3, 5, 6],
                                           "LC4402SC_1_CODE": [2, 3, 4, 5],
                                           "LC4402SC_2_CODE": [1, 2]
                                       })
    # rename the table-specific code columns to the shared column names used
    # throughout this method
    self.lc4402.rename(
        {
            "LC4402SC_1_CODE": "C_TYPACCOM",
            "LC4402SC_2_CODE": "C_CENHEATHUK11",
            "LC4402SC_0_CODE": "C_TENHUK11"
        },
        axis=1,
        inplace=True)
    #print(self.lc4402.head())
    # ensure counts are consistent across tables
    checksum = self.lc4402.OBS_VALUE.sum()

    # construct a tenure marginal for synthesis of other tables unavailable in Scottish dataset
    ngeogs = len(self.lc4402.GEOGRAPHY_CODE.unique())
    ntenures = len(self.lc4402.C_TENHUK11.unique())
    tenure_table = self.lc4402.groupby([
        "GEOGRAPHY_CODE", "C_TENHUK11"
    ]).sum().reset_index().drop(["C_TYPACCOM", "C_CENHEATHUK11"], axis=1)
    m4402 = utils.unlistify(tenure_table, ["GEOGRAPHY_CODE", "C_TENHUK11"],
                            [ngeogs, ntenures], "OBS_VALUE")

    # synthesise LC4404 from QS407 and QS406
    # LC4404SC room categories are: 1, 2-3, 4-5, 6+ so not very useful, using univariate tables instead
    #print(self.api_sc.get_metadata("QS407SC", self.resolution))
    qs407 = self.api_sc.get_data(
        "QS407SC",
        self.region,
        self.resolution,
        category_filters={"QS407SC_0_CODE": range(1, 10)})
    qs407.rename({"QS407SC_0_CODE": "C_ROOMS"}, axis=1, inplace=True)
    # presumably folds room counts above 6 into the 6 category - confirm against utils.cap_value
    qs407 = utils.cap_value(qs407, "C_ROOMS", 6, "OBS_VALUE")
    #print(qs407.head())
    assert qs407.OBS_VALUE.sum() == checksum

    #print(self.api_sc.get_metadata("QS406SC", self.resolution))
    qs406 = self.api_sc.get_data(
        "QS406SC",
        self.region,
        self.resolution,
        category_filters={"QS406SC_0_CODE": range(1, 9)})
    qs406.rename({"QS406SC_0_CODE": "C_SIZHUK11"}, axis=1, inplace=True)
    # presumably folds household sizes above 4 into the 4 category - confirm against utils.cap_value
    qs406 = utils.cap_value(qs406, "C_SIZHUK11", 4, "OBS_VALUE")
    #print(qs406.head())
    assert qs406.OBS_VALUE.sum() == checksum

    nrooms = len(qs407.C_ROOMS.unique())
    nsizes = len(qs406.C_SIZHUK11.unique())

    m407 = utils.unlistify(qs407, ["GEOGRAPHY_CODE", "C_ROOMS"],
                           [ngeogs, nrooms], "OBS_VALUE")
    m406 = utils.unlistify(qs406, ["GEOGRAPHY_CODE", "C_SIZHUK11"],
                           [ngeogs, nsizes], "OBS_VALUE")

    # dims: 0 = geography (shared by all three marginals), 1 = tenure, 2 = rooms, 3 = size
    a4404 = humanleague.qis(
        [np.array([0, 1]), np.array([0, 2]), np.array([0, 3])],
        [m4402, m407, m406])
    utils.check_humanleague_result(a4404, [m4402, m407, m406])
    self.lc4404 = utils.listify(
        a4404["result"], "OBS_VALUE",
        ["GEOGRAPHY_CODE", "C_TENHUK11", "C_ROOMS", "C_SIZHUK11"])
    # translate the array indices back to the original category values
    self.lc4404.GEOGRAPHY_CODE = utils.remap(self.lc4404.GEOGRAPHY_CODE,
                                             qs406.GEOGRAPHY_CODE.unique())
    self.lc4404.C_TENHUK11 = utils.remap(self.lc4404.C_TENHUK11,
                                         tenure_table.C_TENHUK11.unique())
    self.lc4404.C_ROOMS = utils.remap(self.lc4404.C_ROOMS,
                                      qs407.C_ROOMS.unique())
    self.lc4404.C_SIZHUK11 = utils.remap(self.lc4404.C_SIZHUK11,
                                         qs406.C_SIZHUK11.unique())
    #print(self.lc4404.head())
    assert self.lc4404.OBS_VALUE.sum() == checksum

    # no bedroom info is available
    # for now randomly sample from survey on rooms
    # TODO microsynth using tenure/occs also?
    self.lc4405 = self.lc4404.copy()
    # self.lc4405.rename({"C_ROOMS": "C_BEDROOMS"}, axis=1, inplace=True)
    self.lc4405["C_BEDROOMS"] = Household.UNKNOWN
    # NOTE(review): room_bed_dist is only referenced by the commented-out
    # sampling code below, so it is currently computed but not used
    room_bed_dist = np.sum(seed.get_survey_TROBH(), axis=(0, 2, 4))
    #print(room_bed_dist)
    # c = [1,2,3,4]
    # for i in range(0,6):
    #   p = room_bed_dist[i]/np.sum(room_bed_dist[i])
    #   n = len(self.lc4405[self.lc4405.C_ROOMS == i+1])
    #   #print(np.random.choice(c, n, p=p))
    #   self.lc4405.loc[self.lc4405.C_ROOMS == i+1, "C_BEDROOMS"] = np.random.choice(c, n, p=p)

    #assert len(self.lc4405[self.lc4405.C_BEDROOMS == Household.UNKNOWN]) == 0
    # no household may have more bedrooms than rooms
    assert len(
        self.lc4405[self.lc4405.C_ROOMS < self.lc4405.C_BEDROOMS]) == 0

    # rooms is not part of LC4405, so drop it and re-aggregate the counts
    self.lc4405.drop("C_ROOMS", axis=1, inplace=True)
    self.lc4405 = self.lc4405.groupby(
        ["GEOGRAPHY_CODE", "C_TENHUK11", "C_SIZHUK11",
         "C_BEDROOMS"]).sum().reset_index()
    #print(self.lc4405)
    assert self.lc4405.OBS_VALUE.sum() == checksum

    # synthesise LC4408
    #print(self.api_sc.get_metadata("QS116SC", self.resolution))
    # 1'One person household',
    # 2'Married couple household: No dependent children',
    # 3'Married couple household: With dependent children',
    # 4'Same-sex civil partnership couple household',
    # 5'Cohabiting couple household: No dependent children',
    # 6'Cohabiting couple household: With dependent children',
    # 7'Lone parent household: No dependent children',
    # 8'Lone parent household: With dependent children',
    # 9'Multi-person household: All full-time students',
    # 10'Multi-person household: Other']}}
    qs116 = self.api_sc.get_data(
        "QS116SC",
        self.region,
        self.resolution,
        category_filters={"QS116SC_0_CODE": range(1, 11)})
    qs116.rename({"QS116SC_0_CODE": "C_AHTHUK11"}, axis=1, inplace=True)
    # map to lower-resolution household types
    # 1 -> 1 (single)
    # (2,3,4) -> 2 (married/civil couple)
    # (5,6) -> 3 (cohabiting couple)
    # (7,8) -> 4 (single parent)
    # (9,10) -> 5 (mixed)
    # the assignments run in ascending order so a value that has already been
    # remapped is never matched again by a later condition
    qs116.loc[(qs116.C_AHTHUK11 == 2) | (qs116.C_AHTHUK11 == 3) |
              (qs116.C_AHTHUK11 == 4), "C_AHTHUK11"] = 2
    qs116.loc[(qs116.C_AHTHUK11 == 5) | (qs116.C_AHTHUK11 == 6),
              "C_AHTHUK11"] = 3
    qs116.loc[(qs116.C_AHTHUK11 == 7) | (qs116.C_AHTHUK11 == 8),
              "C_AHTHUK11"] = 4
    qs116.loc[(qs116.C_AHTHUK11 == 9) | (qs116.C_AHTHUK11 == 10),
              "C_AHTHUK11"] = 5
    # ...and consolidate
    qs116 = qs116.groupby(["GEOGRAPHY_CODE",
                           "C_AHTHUK11"]).sum().reset_index()

    assert qs116.OBS_VALUE.sum() == checksum

    nhhtypes = len(qs116.C_AHTHUK11.unique())
    m116 = utils.unlistify(qs116, ["GEOGRAPHY_CODE", "C_AHTHUK11"],
                           [ngeogs, nhhtypes], "OBS_VALUE")

    # dims: 0 = geography (shared), 1 = tenure, 2 = household type
    a4408 = humanleague.qis(
        [np.array([0, 1]), np.array([0, 2])], [m4402, m116])
    utils.check_humanleague_result(a4408, [m4402, m116])

    self.lc4408 = utils.listify(
        a4408["result"], "OBS_VALUE",
        ["GEOGRAPHY_CODE", "C_TENHUK11", "C_AHTHUK11"])
    # translate the array indices back to the original category values
    self.lc4408.GEOGRAPHY_CODE = utils.remap(self.lc4408.GEOGRAPHY_CODE,
                                             qs116.GEOGRAPHY_CODE.unique())
    self.lc4408.C_TENHUK11 = utils.remap(self.lc4408.C_TENHUK11,
                                         self.lc4402.C_TENHUK11.unique())
    self.lc4408.C_AHTHUK11 = utils.remap(self.lc4408.C_AHTHUK11,
                                         qs116.C_AHTHUK11.unique())
    #print(self.lc4408.head())
    assert self.lc4408.OBS_VALUE.sum() == checksum

    # LC1105
    #print(self.api_sc.get_metadata("KS101SC", self.resolution))
    self.lc1105 = self.api_sc.get_data(
        "KS101SC",
        self.region,
        self.resolution,
        category_filters={"KS101SC_0_CODE": [3, 4]})
    self.lc1105.rename({"KS101SC_0_CODE": "C_RESIDENCE_TYPE"},
                       axis=1,
                       inplace=True)
    # 3->1, 4->2
    self.lc1105["C_RESIDENCE_TYPE"] = self.lc1105["C_RESIDENCE_TYPE"] - 2
    #print(self.lc1105.OBS_VALUE.sum(), checksum)

    # occupied vs unoccupied
    #print(self.api_sc.get_metadata("KS401SC", self.resolution))
    # 5'All household spaces: Occupied',
    # 6'All household spaces: Unoccupied: Second residence/holiday accommodation',
    # 7'All household spaces: Unoccupied: Vacant',
    self.ks401 = self.api_sc.get_data(
        "KS401SC",
        self.region,
        self.resolution,
        category_filters={"KS401SC_0_CODE": [5, 6, 7]})
    self.ks401.rename({"KS401SC_0_CODE": "CELL"}, axis=1, inplace=True)
    # presumably merges the two unoccupied categories (6, 7) into 6 - confirm against utils.cap_value
    self.ks401 = utils.cap_value(self.ks401, "CELL", 6, "OBS_VALUE")
    # occupied household spaces must match the household total from LC4402
    assert self.ks401[self.ks401.CELL == 5].OBS_VALUE.sum() == checksum

    #print(self.api_sc.get_metadata("LC4202SC", self.resolution))
    #{'table': 'LC4202SC', 'description': '', 'geography': 'OA11', 'fields': {'LC4202SC_1_CODE': [
    # 'All households:',
    # 'Owned:',
    # 'Social rented:',
    # 'Private rented or living rent free:'],
    # 'LC4202SC_2_CODE': [
    # 'Total',
    # 'Number of cars or vans in household: No cars or vans',
    # 'Number of cars or vans in household: One car or van',
    # 'Number of cars or vans in household:Two or more cars or vans'],
    # 'LC4202SC_0_CODE': [
    # 'All households',
    # 'White',
    # 'Mixed or multiple ethnic groups',
    # 'Asian Asian Scottish or Asian British',
    # 'African',
    # 'Caribbean or Black',
    # 'Other ethnic groups']}}
    self.lc4202 = self.api_sc.get_data("LC4202SC",
                                       self.region,
                                       self.resolution,
                                       category_filters={
                                           "LC4202SC_1_CODE": [1, 2, 3],
                                           "LC4202SC_2_CODE": [1, 2, 3],
                                           "LC4202SC_0_CODE":
                                           [1, 2, 3, 4, 5, 6]
                                       })
    self.lc4202.rename(
        {
            "LC4202SC_2_CODE": "C_CARSNO",
            "LC4202SC_1_CODE": "C_TENHUK11",
            "LC4202SC_0_CODE": "C_ETHHUK11"
        },
        axis=1,
        inplace=True)
    # TODO how to map tenure 1->2/3?
    # recode tenure onto the code values used by LC4402; done in descending
    # order so a freshly assigned code is not matched by a later line
    self.lc4202.loc[self.lc4202.C_TENHUK11 == 3, "C_TENHUK11"] = 6
    self.lc4202.loc[self.lc4202.C_TENHUK11 == 2, "C_TENHUK11"] = 5
    self.lc4202.loc[self.lc4202.C_TENHUK11 == 1, "C_TENHUK11"] = 3  # OR 2?
    assert self.lc4202.OBS_VALUE.sum() == checksum

    #print(self.api_sc.get_metadata("LC4605SC", self.resolution))
    #{'table': 'LC4605SC', 'description': '', 'geography': 'OA11', 'fields': {'LC4605SC_1_CODE': [
    # 'All HRPs aged 16 to 74',
    # 'Owned: Total',
    # 'Owned: Owned outright',
    # 'Owned: Owned witha mortgage or loan or shared ownership',
    # 'Rented or living rent free: Total',
    # 'Rented or living rent free: Social rented',
    # 'Rented or living rent free: Private rented or living rent free'],
    # 'LC4605SC_0_CODE': ['All HRPs aged 16 to 74',
    # '1. Higher managerial administrative and professional occupations',
    # '2. Lower managerial administrative and professional occupations',
    # '3. Intermediate occupations',
    # '4. Small employers and own account workers',
    # '5. Lower supervisory and technical occupations',
    # '6. Semi-routine occupations',
    # '7. Routine occupations',
    # '8. Never worked and long-term unemployed',
    # 'L15 Full-time students']}}
    self.lc4605 = self.api_sc.get_data("LC4605SC",
                                       self.region,
                                       self.resolution,
                                       category_filters={
                                           "LC4605SC_1_CODE": [2, 3, 5, 6],
                                           "LC4605SC_0_CODE": range(1, 10)
                                       })
    self.lc4605.rename(
        {
            "LC4605SC_1_CODE": "C_TENHUK11",
            "LC4605SC_0_CODE": "C_NSSEC"
        },
        axis=1,
        inplace=True)
    # TODO add retired?
    # unlike the other tables this total is only printed next to the
    # checksum, not asserted equal to it
    print(self.lc4605.OBS_VALUE.sum(), checksum, "TODO add retired")

    #print(self.api_sc.get_metadata("QS420SC", self.resolution))
    cats = [2, 6, 11, 14, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
    # merge the two communal tables (so we have establishment and people counts)
    self.communal = self.api_sc.get_data("QS420SC",
                                         self.region,
                                         self.resolution,
                                         category_filters={
                                             "QS420SC_0_CODE": cats
                                         }).rename(
                                             {"QS420SC_0_CODE": "CELL"},
                                             axis=1)
    qs421 = self.api_sc.get_data("QS421SC",
                                 self.region,
                                 self.resolution,
                                 category_filters={
                                     "QS421SC_0_CODE": cats
                                 }).rename({"OBS_VALUE": "CommunalSize"},
                                           axis=1)
    #print(qs421.head())
    # join on geography and category, then drop QS421's duplicate category column
    self.communal = self.communal.merge(
        qs421,
        left_on=["GEOGRAPHY_CODE", "CELL"],
        right_on=["GEOGRAPHY_CODE",
                  "QS421SC_0_CODE"]).drop("QS421SC_0_CODE", axis=1)