def _generate_from_total(self, agg_value, country):
    """Synthesise a population for `country` from a single aggregate total.

    The total is split uniformly over 17 age categories and 2 sex
    categories, the two 1-d marginals are combined with humanleague.qis,
    and the resulting individuals are appended to self.pop with columns
    AGE, SEX and Country.

    Args:
        agg_value: total population; truncated with int() before use.
        country: value stored in the "Country" column of every generated row.

    Raises:
        RuntimeError: if humanleague.qis does not return a dict, or reports
            that it did not converge.
        AssertionError: if any age or sex category ends up unpopulated.
    """
    # TODO improve distribution
    total = int(agg_value)
    sex_split = humanleague.prob2IntFreq(np.ones(2) / 2, total)["freq"]
    age_split = humanleague.prob2IntFreq(np.ones(17) / 17, total)["freq"]

    msynth = humanleague.qis(
        [np.array([0], dtype=int), np.array([1], dtype=int)],
        [age_split, sex_split])
    if not isinstance(msynth, dict):
        # wrap in a tuple so the message still formats if the failure value is itself a tuple
        raise RuntimeError(
            "microsynthesis (from total) general failure: %s" % (msynth,))
    if not msynth["conv"]:
        raise RuntimeError(
            "microsynthesis (from total) convergence failure")

    # flatten yields one list of category indices per dimension (age first, then sex)
    raw = humanleague.flatten(msynth["result"])
    pop = pd.DataFrame(columns=["AGE", "SEX"])
    pop["AGE"] = raw[0]
    pop["SEX"] = raw[1]

    # could fail here if zero people in any category
    assert len(pop.AGE.unique()) == 17
    assert len(pop.SEX.unique()) == 2

    # NOTE(review): unlike _generate, AGE is left as the 0..16 category index
    # here (no single-year-of-age expansion) - confirm this is intended.
    pop["Country"] = country

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent call
    self.pop = pd.concat([self.pop, pop], sort=False)
def _generate(self, agg_data, country):
    """Synthesise a single-year-of-age population for `country`.

    `agg_data` is integerised, then combined via humanleague.qis with a
    uniform 5-way split (one slot per year within an age group). The
    generated individuals are appended to self.pop with columns SEX,
    Country and AGE, where AGE = 5 * (age-group index) + (year within group).

    Args:
        agg_data: aggregate counts occupying dimensions 0 and 1 of the
            synthesis - presumably a 17 (age group) x 2 (sex) array, as
            implied by the assertions below; verify against callers.
        country: value stored in the "Country" column of every generated row.

    Raises:
        RuntimeError: if humanleague.qis does not return a dict, or reports
            that it did not converge.
        AssertionError: if any age-group, year-in-group or sex category ends
            up unpopulated.
    """
    agg_data = humanleague.integerise(agg_data)["result"]

    # split 5y groups
    split = humanleague.prob2IntFreq(np.ones(5) / 5, int(agg_data.sum()))["freq"]

    msynth = humanleague.qis(
        [np.array([0, 1], dtype=int), np.array([2], dtype=int)],
        [agg_data, split])
    if not isinstance(msynth, dict):
        # wrap in a tuple so the message still formats if the failure value is itself a tuple
        raise RuntimeError("microsynthesis general failure: %s" % (msynth,))
    if not msynth["conv"]:
        raise RuntimeError("microsynthesis convergence failure")

    # one list of category indices per dimension: 0 -> age group, 1 -> sex, 2 -> year within group
    raw = humanleague.flatten(msynth["result"])
    pop = pd.DataFrame(columns=["AGE5", "AGE1", "SEX"])
    pop["AGE5"] = raw[0]
    pop["AGE1"] = raw[2]
    pop["SEX"] = raw[1]

    # could fail here if zero people in any category
    assert len(pop.AGE5.unique()) == 17
    assert len(pop.AGE1.unique()) == 5
    assert len(pop.SEX.unique()) == 2

    # construct single year of age
    pop["Country"] = country
    pop["AGE"] = pop.AGE5 * 5 + pop.AGE1

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent call
    self.pop = pd.concat([self.pop, pop.drop(["AGE5", "AGE1"], axis=1)])
def __microsynthesise(self, year):  # LAD=self.region
    """Microsynthesise the population of self.region for `year`.

    The 4-d seed (axes: 0 area, 1 sex, 2 age, 3 ethnicity, matching the
    columns of the returned table) supplies area and ethnicity proportions.
    These are scaled to the projected population, combined into a 2-d
    area-by-ethnicity marginal, and used together with the projected
    age-by-sex marginal to constrain the full synthesis. self.seed is
    replaced by the result, so successive years build on one another.

    Args:
        year: the year to synthesise. Years before the SNPP start use MYE
            data; later years up to the NPP horizon use SNPP data, with the
            NPP variant applied once NPP data is available.

    Returns:
        A DataFrame with one row per person and columns Area,
        DC1117EW_C_SEX, DC1117EW_C_AGE and DC2101EW_C_ETHPUK11.

    Raises:
        ValueError: if `year` is beyond the NPP horizon.
        RuntimeError: if either synthesis step fails or does not converge.
    """
    # Census/seed proportions for geography and ethnicity
    seed_total = self.seed.sum()
    oa_prop = self.seed.sum((1, 2, 3)) / seed_total
    eth_prop = self.seed.sum((0, 1, 2)) / seed_total

    # pick the projection source appropriate to the year
    if year < self.snpp_api.min_year(self.region):
        projection = self.mye_api.filter(self.region, year)
    elif year <= self.npp_api.max_year():
        # Don't attempt to apply NPP variant if before the start of the NPP data
        if year < self.npp_api.min_year():
            projection = self.snpp_api.filter(self.region, year)
        else:
            projection = self.snpp_api.create_variant(self.variant, self.npp_api, self.region, year)
    else:
        # the message was previously passed unformatted, with the year as a second exception arg
        raise ValueError(
            "Cannot microsimulate past NPP horizon year ({})".format(self.npp_api.max_year()))
    age_sex = utils.create_age_sex_marginal(utils.adjust_pp_age(projection), self.region)

    # convert proportions/probabilities to integer frequencies
    population = age_sex.sum()
    oa = hl.prob2IntFreq(oa_prop, population)["freq"]
    eth = hl.prob2IntFreq(eth_prop, population)["freq"]

    # combine the above into a 2d marginal using QIS-I and census 2011 or later data as the seed
    oa_eth = hl.qisi(self.seed.sum((1, 2)), [np.array([0]), np.array([1])], [oa, eth])
    if not (isinstance(oa_eth, dict) and oa_eth["conv"]):
        raise RuntimeError("oa_eth did not converge")

    # now the full seeded microsynthesis
    dims = [np.array([0, 3]), np.array([1, 2])]
    if self.fast_mode:
        msynth = hl.ipf(self.seed, dims, [oa_eth["result"].astype(float), age_sex.astype(float)])
    else:
        msynth = hl.qisi(self.seed, dims, [oa_eth["result"], age_sex])
    # guard against a non-dict failure value, as is done for oa_eth above
    if not (isinstance(msynth, dict) and msynth["conv"]):
        print(msynth)
        raise RuntimeError("msynth did not converge")

    print("updating seed to", year, " ", end="")
    if self.fast_mode:
        # keep the unrounded result as the next seed, but round to whole people for the output
        self.seed = msynth["result"]
        msynth["result"] = np.around(msynth["result"]).astype(int)
    else:
        self.seed = msynth["result"].astype(float)

    # one list of category indices per dimension: area, sex, age, ethnicity
    rawtable = hl.flatten(msynth["result"])

    # col names and remapped values
    table = pd.DataFrame(columns=["Area", "DC1117EW_C_SEX", "DC1117EW_C_AGE", "DC2101EW_C_ETHPUK11"])
    table["Area"] = utils.remap(rawtable[0], self.geog_map)
    table["DC1117EW_C_SEX"] = utils.remap(rawtable[1], [1, 2])
    table["DC1117EW_C_AGE"] = utils.remap(rawtable[2], range(1, 87))
    table["DC2101EW_C_ETHPUK11"] = utils.remap(rawtable[3], self.eth_map)

    # consistency checks (in fast mode just report discrepancies)
    self.__check(table, age_sex, oa_eth["result"])

    return table
def test_QIS(self):
    """Exercise hl.qis on 2-, 3- and 4-dimensional problems with 1-d
    marginals, then on a 3-d problem constrained by two overlapping 2-d
    marginals, checking convergence, fit statistics, population and that
    the result reproduces every marginal."""

    # --- two dimensions, one 1-d marginal each ---
    rows = np.array([52, 48])
    cols = np.array([10, 77, 13])
    res = hl.qis([np.array([0]), np.array([1])], [rows, cols])
    print(res)
    self.assertTrue(res["conv"])
    self.assertLess(res["chiSq"], 0.04)
    self.assertGreater(res["pValue"], 0.9)
    # TODO check the degeneracy calculation before asserting on res["degeneracy"]
    self.assertEqual(res["pop"], 100.0)
    self.assertTrue(np.allclose(np.sum(res["result"], 0), cols))
    self.assertTrue(np.allclose(np.sum(res["result"], 1), rows))

    # --- three dimensions ---
    marginals = [
        np.array([52, 40, 4, 4]),
        np.array([87, 10, 3]),
        np.array([55, 15, 6, 12, 12]),
    ]
    res = hl.qis([np.array([d]) for d in range(3)], marginals)
    self.assertTrue(res["conv"])
    self.assertLess(res["chiSq"], 73.0)  # TODO seems a bit high
    self.assertGreater(res["pValue"], 0.0)  # TODO this is suspect
    self.assertEqual(res["pop"], 100.0)
    # summing out all but one axis must give back that axis's marginal
    for summed_axes, kept in (((0, 1), 2), ((1, 2), 0), ((2, 0), 1)):
        self.assertTrue(np.allclose(np.sum(res["result"], summed_axes), marginals[kept]))

    # Test flatten functionality
    flat = hl.flatten(res["result"])
    # length is no of dims
    self.assertTrue(len(flat) == 3)
    # length of element is pop
    self.assertTrue(len(flat[0]) == res["pop"])
    # check consistent with marginals
    for dim, marginal in enumerate(marginals):
        for state, expected in enumerate(marginal):
            self.assertTrue(flat[dim].count(state) == expected)

    # --- four dimensions ---
    marginals = [
        np.array([52, 48]),
        np.array([87, 13]),
        np.array([67, 33]),
        np.array([55, 45]),
    ]
    res = hl.qis([np.array([d]) for d in range(4)], marginals)
    self.assertTrue(res["conv"])
    self.assertLess(res["chiSq"], 10)
    self.assertGreater(res["pValue"], 0.002)  # TODO this looks suspect too
    self.assertEqual(res["pop"], 100)
    for summed_axes, kept in (((0, 1, 2), 3), ((1, 2, 3), 0), ((2, 3, 0), 1), ((3, 0, 1), 2)):
        self.assertTrue(np.allclose(np.sum(res["result"], summed_axes), marginals[kept]))

    # --- three dimensions, two overlapping 2-d marginals ---
    joint = np.array([[10, 20, 10], [10, 10, 20], [20, 10, 10]])
    res = hl.qis([np.array([0, 1]), np.array([1, 2])], [joint, joint])
    self.assertTrue(res["conv"])
    self.assertLess(res["chiSq"], 10)
    self.assertGreater(res["pValue"], 0.27)
    self.assertEqual(res["pop"], 120)
    print(np.sum(res["result"], 2))
    self.assertTrue(np.allclose(np.sum(res["result"], 2), joint))
    self.assertTrue(np.allclose(np.sum(res["result"], 0), joint))
def __add_households(self, area, constraints):
    """Synthesise the households of one area and append them to self.dwellings.

    Works in two stages:
      1. a seeded synthesis (humanleague.qisi) over dimensions 0-4 (tenure,
         rooms, occupants, bedrooms, household type) using LC4404, LC4405
         and LC4408 as marginals and `constraints` as the seed. If that
         fails or does not converge, it is retried with
         seed.get_impossible_TROBH() as the seed instead.
      2. an unseeded synthesis (humanleague.qis) that extends the stage-1
         result with central heating and building type (LC4402), ethnicity
         and cars (LC4202) and NS-SEC (LC4605), dimensions 5-9.

    Args:
        area: geography code; each table is filtered on GEOGRAPHY_CODE == area.
        constraints: seed array for the stage-1 synthesis - presumably
            5-dimensional to match dimensions 0-4; verify against caller.

    Side effects:
        Rebinds self.dwellings to a new frame with the generated rows added.
        Prints progress/diagnostic messages.
    """
    # TODO use actual values from tables
    # TODO make members?
    # category value maps; the trailing number is the dimension index in the final synthesis
    tenure_map = self.lc4402.C_TENHUK11.unique()  # 0
    rooms_map = self.lc4404.C_ROOMS.unique()  # 1
    occupants_map = self.lc4404.C_SIZHUK11.unique()  # 2
    bedrooms_map = self.lc4405.C_BEDROOMS.unique()  # 3 [1,2,3,4] or [-1]
    hhtype_map = self.lc4408.C_AHTHUK11.unique()  # 4
    ch_map = self.lc4402.C_CENHEATHUK11.unique()  # 5
    buildtype_map = self.lc4402.C_TYPACCOM.unique()  # 6
    eth_map = self.lc4202.C_ETHHUK11.unique()  # 7
    cars_map = self.lc4202.C_CARSNO.unique()  # 8
    econ_map = self.lc4605.C_NSSEC.unique()  # 9

    def area_marginal(census_table, columns, maps):
        """Return the OBS_VALUE array for this area, one axis per entry of `columns`."""
        subset = census_table.loc[census_table.GEOGRAPHY_CODE == area].copy()
        # unmap indices; utils.unmap is called for its side effect on the copied
        # column (presumably replacing category values by their position in the map)
        # TODO might be quicker to unmap the entire table upfront?
        for column, mapping in zip(columns, maps):
            utils.unmap(subset[column], mapping)
        return utils.unlistify(subset, columns, [len(mapping) for mapping in maps], "OBS_VALUE")

    m4404 = area_marginal(
        self.lc4404,
        ["C_TENHUK11", "C_ROOMS", "C_SIZHUK11"],
        [tenure_map, rooms_map, occupants_map])

    # no bedroom info in Scottish data
    m4405 = area_marginal(
        self.lc4405,
        ["C_TENHUK11", "C_BEDROOMS", "C_SIZHUK11"],
        [tenure_map, bedrooms_map, occupants_map])

    m4408 = area_marginal(
        self.lc4408,
        ["C_TENHUK11", "C_AHTHUK11"],
        [tenure_map, hhtype_map])

    # TODO relax IPF tolerance and maxiters when used within QISI?
    m4408dim = np.array([0, 4])
    # collapse m4408 dim for scotland
    if self.scotland:
        m4408 = np.sum(m4408, axis=0)
        m4408dim = np.array([4])

    p0_dims = [np.array([0, 1, 2]), np.array([0, 3, 2]), m4408dim]
    p0_marginals = [m4404, m4405, m4408]
    p0 = humanleague.qisi(constraints, p0_dims, p0_marginals)

    # drop the survey seed if there are convergence problems
    # TODO check_humanleague_result needs complete refactoring
    if not isinstance(p0, dict) or not p0["conv"]:
        print("Dropping TROBH constraint due to convergence failure")
        fallback_seed = seed.get_impossible_TROBH()
        p0 = humanleague.qisi(fallback_seed, p0_dims, p0_marginals)
        utils.check_humanleague_result(p0, p0_marginals, fallback_seed)
    else:
        utils.check_humanleague_result(p0, p0_marginals, constraints)

    m4402 = area_marginal(
        self.lc4402,
        ["C_TENHUK11", "C_CENHEATHUK11", "C_TYPACCOM"],
        [tenure_map, ch_map, buildtype_map])

    m4202 = area_marginal(
        self.lc4202,
        ["C_TENHUK11", "C_ETHHUK11", "C_CARSNO"],
        [tenure_map, eth_map, cars_map])

    # econ counts often slightly lower, need to tweak
    m4605 = area_marginal(
        self.lc4605,
        ["C_TENHUK11", "C_NSSEC"],
        [tenure_map, econ_map])

    m4605_sum = np.sum(m4605)
    m4202_sum = np.sum(m4202)

    if m4605_sum != m4202_sum:
        print("LC4402: %d LC4605: %d -> %d " % (np.sum(m4402), m4605_sum, m4202_sum), end="")
        # rescale LC4605 so its total matches LC4202: tenure totals come from
        # LC4202, the NS-SEC split keeps LC4605's proportions
        tenure_4202 = np.sum(m4202, axis=(1, 2))
        nssec_4605_adj = humanleague.prob2IntFreq(
            np.sum(m4605, axis=0) / m4605_sum, m4202_sum)["freq"]
        # Convergence problems can occur when e.g. one of the tenure rows is zero yet the marginal total is nonzero,
        # Can get round this by adding a small number to the seed
        # effectively allowing zero states to be occupied with a finite probability
        m4605_adj = humanleague.qisi(
            m4605.astype(float) + 1.0 / m4202_sum,
            [np.array([0]), np.array([1])],
            [tenure_4202, nssec_4605_adj])
        utils.check_humanleague_result(m4605_adj, [tenure_4202, nssec_4605_adj])
        m4605 = m4605_adj["result"]

    # no seed constraint so just use QIS
    if self.scotland:
        # tenures not mappable in LC4202
        m4202 = np.sum(m4202, axis=0)
        m4605 = np.sum(m4605, axis=0)
        p1_dims = [
            np.array([0, 1, 2, 3, 4]),
            np.array([0, 5, 6]),
            np.array([7, 8]),
            np.array([9]),
        ]
    else:
        p1_dims = [
            np.array([0, 1, 2, 3, 4]),
            np.array([0, 5, 6]),
            np.array([0, 7, 8]),
            np.array([0, 9]),
        ]
    p1_marginals = [p0["result"], m4402, m4202, m4605]
    p1 = humanleague.qis(p1_dims, p1_marginals)
    utils.check_humanleague_result(p1, p1_marginals)

    # one list of category indices per dimension, one entry per household
    table = humanleague.flatten(p1["result"])
    n_households = len(table[0])

    # attribute-style assignment is kept deliberately: it relies on each
    # name already being a column of self.dwellings
    chunk = pd.DataFrame(columns=self.dwellings.columns.values)
    chunk.Area = np.repeat(area, n_households)
    chunk.LC4402_C_TENHUK11 = utils.remap(table[0], tenure_map)
    chunk.QS420_CELL = np.repeat(self.NOTAPPLICABLE, n_households)
    chunk.LC4404_C_ROOMS = utils.remap(table[1], rooms_map)
    chunk.LC4404_C_SIZHUK11 = utils.remap(table[2], occupants_map)
    chunk.LC4405EW_C_BEDROOMS = utils.remap(table[3], bedrooms_map)
    chunk.LC4408_C_AHTHUK11 = utils.remap(table[4], hhtype_map)
    chunk.LC4402_C_CENHEATHUK11 = utils.remap(table[5], ch_map)
    chunk.LC4402_C_TYPACCOM = utils.remap(table[6], buildtype_map)
    chunk.CommunalSize = np.repeat(self.NOTAPPLICABLE, n_households)
    chunk.LC4202_C_ETHHUK11 = utils.remap(table[7], eth_map)
    chunk.LC4202_C_CARSNO = utils.remap(table[8], cars_map)
    chunk.LC4605_C_NSSEC = utils.remap(table[9], econ_map)

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent call
    self.dwellings = pd.concat([self.dwellings, chunk], ignore_index=True)