Ejemplo n.º 1
0
    def _generate_from_total(self, agg_value, country):
        """Microsynthesise a joint age-sex population from a single total.

        Splits ``agg_value`` uniformly into 2 sex and 17 age categories,
        combines the two marginals with QIS into a joint distribution, and
        appends the resulting individual-level population (with a "Country"
        column) to ``self.pop``.

        Args:
            agg_value: aggregate population count (coercible to int).
            country: value written to every row of the "Country" column.

        Raises:
            RuntimeError: if QIS fails outright or does not converge.
        """
        # TODO improve distribution — uniform age/sex split is a placeholder
        sex_split = humanleague.prob2IntFreq(np.ones(2) / 2,
                                             int(agg_value))["freq"]
        age_split = humanleague.prob2IntFreq(np.ones(17) / 17,
                                             int(agg_value))["freq"]

        msynth = humanleague.qis(
            [np.array([0], dtype=int),
             np.array([1], dtype=int)], [age_split, sex_split])
        # qis returns an error string rather than a dict on general failure
        if not isinstance(msynth, dict):
            raise RuntimeError(
                "microsynthesis (from total) general failure: %s" % msynth)
        if not msynth["conv"]:
            raise RuntimeError(
                "microsynthesis (from total) convergence failure")

        # flatten the multidimensional count table into per-person rows:
        # raw[0] is the age index, raw[1] the sex index
        raw = humanleague.flatten(msynth["result"])
        pop = pd.DataFrame(columns=["AGE", "SEX"])
        pop.AGE = raw[0]
        pop.SEX = raw[1]

        # could fail here if zero people in any category
        assert len(pop.AGE.unique()) == 17
        assert len(pop.SEX.unique()) == 2

        # construct single year of age
        pop["Country"] = country
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent (append was a thin wrapper around it)
        self.pop = pd.concat([self.pop, pop], sort=False)
Ejemplo n.º 2
0
    def _generate(self, agg_data, country):
        """Microsynthesise single-year-of-age population from 5-year bands.

        Integerises ``agg_data`` (a 2-D age-band x sex table, judging by the
        17/2 category asserts below — TODO confirm against caller), splits its
        total uniformly into 5 within-band single-year offsets, then uses QIS
        to produce a joint (age band, sex, year offset) population. Single
        year of age is reconstructed as ``AGE5 * 5 + AGE1`` and the result is
        appended to ``self.pop``.

        Args:
            agg_data: aggregate counts to integerise and disaggregate.
            country: value written to every row of the "Country" column.

        Raises:
            RuntimeError: if QIS fails outright or does not converge.
        """
        agg_data = humanleague.integerise(agg_data)["result"]
        # split 5y groups uniformly into single years
        split = humanleague.prob2IntFreq(np.ones(5) / 5,
                                         int(agg_data.sum()))["freq"]

        # agg_data constrains dims (0, 1); split constrains dim 2
        msynth = humanleague.qis(
            [np.array([0, 1], dtype=int),
             np.array([2], dtype=int)], [agg_data, split])
        # qis returns an error string rather than a dict on general failure
        if not isinstance(msynth, dict):
            raise RuntimeError("microsynthesis general failure: %s" % msynth)
        if not msynth["conv"]:
            raise RuntimeError("microsynthesis convergence failure")

        #neworder.log(pop["result"])
        # per-person rows: raw[0]=age band, raw[1]=sex, raw[2]=year offset
        raw = humanleague.flatten(msynth["result"])
        pop = pd.DataFrame(columns=["AGE5", "AGE1", "SEX"])
        pop.AGE5 = raw[0]
        pop.AGE1 = raw[2]
        pop.SEX = raw[1]

        # could fail here if zero people in any category
        assert len(pop.AGE5.unique()) == 17
        assert len(pop.AGE1.unique()) == 5
        assert len(pop.SEX.unique()) == 2

        # construct single year of age
        pop["Country"] = country
        pop["AGE"] = pop.AGE5 * 5 + pop.AGE1
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent (append was a thin wrapper around it)
        self.pop = pd.concat([self.pop, pop.drop(["AGE5", "AGE1"], axis=1)])
Ejemplo n.º 3
0
  def __microsynthesise(self, year): #LAD=self.region
    """Microsynthesise the region's population for one projection year.

    Builds an age-sex marginal from the appropriate population projection
    (MYE before the SNPP window, SNPP within it, SNPP+NPP variant after),
    derives geography (OA) and ethnicity marginals from census seed
    proportions, and runs a seeded microsynthesis (IPF in fast mode, QIS-I
    otherwise). The seed is updated for the next year as a side effect.

    Args:
      year: projection year; must not exceed the NPP horizon.

    Returns:
      DataFrame with columns Area, DC1117EW_C_SEX, DC1117EW_C_AGE and
      DC2101EW_C_ETHPUK11, one row per person.

    Raises:
      ValueError: if year is beyond the NPP horizon.
      RuntimeError: if either synthesis step fails to converge.
    """
    # Census/seed proportions for geography and ethnicity
    oa_prop = self.seed.sum((1, 2, 3)) / self.seed.sum()
    eth_prop = self.seed.sum((0, 1, 2)) / self.seed.sum()

    if year < self.snpp_api.min_year(self.region):
      age_sex = utils.create_age_sex_marginal(utils.adjust_pp_age(self.mye_api.filter(self.region, year)), self.region)
    elif year <= self.npp_api.max_year():
      # Don't attempt to apply NPP variant if before the start of the NPP data
      if year < self.npp_api.min_year():
        age_sex = utils.create_age_sex_marginal(utils.adjust_pp_age(self.snpp_api.filter(self.region, year)), self.region)
      else:
        age_sex = utils.create_age_sex_marginal(utils.adjust_pp_age(self.snpp_api.create_variant(self.variant, self.npp_api, self.region, year)), self.region)
    else:
      # BUG FIX: the original passed the year as a second exception arg
      # instead of formatting it into the message
      raise ValueError("Cannot microsimulate past NPP horizon year ({})".format(self.npp_api.max_year()))

    # convert proportions/probabilities to integer frequencies
    oa = hl.prob2IntFreq(oa_prop, age_sex.sum())["freq"]
    eth = hl.prob2IntFreq(eth_prop, age_sex.sum())["freq"]
    # combine the above into a 2d marginal using QIS-I and census 2011 or later data as the seed
    oa_eth = hl.qisi(self.seed.sum((1, 2)), [np.array([0]), np.array([1])], [oa, eth])
    if not (isinstance(oa_eth, dict) and oa_eth["conv"]):
      raise RuntimeError("oa_eth did not converge")

    # now the full seeded microsynthesis
    if self.fast_mode:
      msynth = hl.ipf(self.seed, [np.array([0, 3]), np.array([1, 2])], [oa_eth["result"].astype(float), age_sex.astype(float)])
    else:
      msynth = hl.qisi(self.seed, [np.array([0, 3]), np.array([1, 2])], [oa_eth["result"], age_sex])
    if not msynth["conv"]:
      print(msynth)
      raise RuntimeError("msynth did not converge")
    #print(msynth["pop"])
    print("updating seed to", year, " ", end="")
    if self.fast_mode:
      # IPF yields fractional counts: keep the float table as next year's
      # seed but round to integers for the output population
      self.seed = msynth["result"]
      msynth["result"] = np.around(msynth["result"]).astype(int)
    else:
      self.seed = msynth["result"].astype(float)
    rawtable = hl.flatten(msynth["result"]) #, c("OA", "SEX", "AGE", "ETH"))

    # col names and remapped values
    table = pd.DataFrame(columns=["Area", "DC1117EW_C_SEX", "DC1117EW_C_AGE", "DC2101EW_C_ETHPUK11"])
    table.Area = utils.remap(rawtable[0], self.geog_map)
    table.DC1117EW_C_SEX = utils.remap(rawtable[1], [1, 2])
    table.DC1117EW_C_AGE = utils.remap(rawtable[2], range(1, 87))
    table.DC2101EW_C_ETHPUK11 = utils.remap(rawtable[3], self.eth_map)

    # consistency checks (in fast mode just report discrepancies)
    self.__check(table, age_sex, oa_eth["result"])

    return table
Ejemplo n.º 4
0
  def test_QIS(self):
    """Exercise hl.qis with 2, 3 and 4 one-dimensional marginals, a pair of
    overlapping 2-d marginals, and the flatten() helper, checking convergence,
    fit statistics, total population and marginal consistency."""

    # m = np.array([[10,20,10],[10,10,20],[20,10,10]])
    # idx = [np.array([0,1]), np.array([1,2])]
    # r = hl.qis(idx, [m, m])
    # self.assertTrue(false)

    # --- two 1-d marginals over separate dimensions, both summing to 100 ---
    m0 = np.array([52, 48]) 
    m1 = np.array([10, 77, 13])
    i0 = np.array([0])
    i1 = np.array([1])

    p = hl.qis([i0, i1], [m0, m1])
    print(p)
    self.assertTrue(p["conv"])
    self.assertLess(p["chiSq"], 0.04) 
    self.assertGreater(p["pValue"], 0.9) 
    #self.assertLess(p["degeneracy"], 0.04) TODO check the calculation
    self.assertEqual(p["pop"], 100.0)
    # summing the result over each axis must reproduce the other marginal
    self.assertTrue(np.allclose(np.sum(p["result"], 0), m1))
    self.assertTrue(np.allclose(np.sum(p["result"], 1), m0))
    #self.assertTrue(np.array_equal(p["result"], np.array([[5, 40, 7],[5, 37, 6]])))

    # --- three 1-d marginals ---
    m0 = np.array([52, 40, 4, 4]) 
    m1 = np.array([87, 10, 3])
    m2 = np.array([55, 15, 6, 12, 12])
    i0 = np.array([0])
    i1 = np.array([1])
    i2 = np.array([2])

    p = hl.qis([i0, i1, i2], [m0, m1, m2])
    self.assertTrue(p["conv"])
    self.assertLess(p["chiSq"], 73.0) # TODO seems a bit high (probably )
    self.assertGreater(p["pValue"], 0.0) # TODO this is suspect
    self.assertEqual(p["pop"], 100.0)
    # collapse the two other dimensions to recover each marginal
    self.assertTrue(np.allclose(np.sum(p["result"], (0, 1)), m2))
    self.assertTrue(np.allclose(np.sum(p["result"], (1, 2)), m0))
    self.assertTrue(np.allclose(np.sum(p["result"], (2, 0)), m1))

    # Test flatten functionality
    table = hl.flatten(p["result"])

    # length is no of dims
    self.assertTrue(len(table) == 3)
    # length of element is pop
    self.assertTrue(len(table[0]) == p["pop"])
    # check consistent with marginals
    for i in range(0, len(m0)):
      self.assertTrue(table[0].count(i) == m0[i])
    for i in range(0, len(m1)):
      self.assertTrue(table[1].count(i) == m1[i])
    for i in range(0, len(m2)):
      self.assertTrue(table[2].count(i) == m2[i])


    # --- four 1-d marginals ---
    m0 = np.array([52, 48]) 
    m1 = np.array([87, 13])
    m2 = np.array([67, 33])
    m3 = np.array([55, 45])
    i0 = np.array([0])
    i1 = np.array([1])
    i2 = np.array([2])
    i3 = np.array([3])

    p = hl.qis([i0, i1, i2, i3], [m0, m1, m2, m3])
    self.assertTrue(p["conv"])
    self.assertLess(p["chiSq"], 10) 
    self.assertGreater(p["pValue"], 0.002) # TODO this looks suspect too
    self.assertEqual(p["pop"], 100)
    self.assertTrue(np.allclose(np.sum(p["result"], (0, 1, 2)), m3))
    self.assertTrue(np.allclose(np.sum(p["result"], (1, 2, 3)), m0))
    self.assertTrue(np.allclose(np.sum(p["result"], (2, 3, 0)), m1))
    self.assertTrue(np.allclose(np.sum(p["result"], (3, 0, 1)), m2))

    # --- two 2-d marginals sharing dimension 1 ---
    m = np.array([[10,20,10],[10,10,20],[20,10,10]])
    idx = [np.array([0,1]), np.array([1,2])]
    p = hl.qis(idx, [m, m])
    #print(p)
    self.assertTrue(p["conv"])
    self.assertLess(p["chiSq"], 10) 
    self.assertGreater(p["pValue"], 0.27) 
    self.assertEqual(p["pop"], 120)
    print(np.sum(p["result"], 2))
    self.assertTrue(np.allclose(np.sum(p["result"], 2), m))
    self.assertTrue(np.allclose(np.sum(p["result"], 0), m))
Ejemplo n.º 5
0
    def __add_households(self, area, constraints):
        """Microsynthesise households for one area and append to self.dwellings.

        Combines the census tables LC4402 (tenure/central heating/build type),
        LC4404 (tenure/rooms/occupants), LC4405 (tenure/bedrooms/occupants),
        LC4408 (tenure/household type), LC4202 (tenure/ethnicity/cars) and
        LC4605 (tenure/NS-SEC) for the given area in two QIS stages, then
        appends the flattened per-household table to ``self.dwellings``.

        Args:
            area: geography code used to filter each census table.
            constraints: seed array for the first QIS-I stage (dropped and
                replaced by an "impossible TROBH" seed on convergence failure).
        """
        # TODO use actual values from tables
        # TODO make members?                            # Dim (overall dim)
        tenure_map = self.lc4402.C_TENHUK11.unique()  # 0
        rooms_map = self.lc4404.C_ROOMS.unique()  # 1
        occupants_map = self.lc4404.C_SIZHUK11.unique()  # 2
        bedrooms_map = self.lc4405.C_BEDROOMS.unique()  # 3 [1,2,3,4] or [-1]
        hhtype_map = self.lc4408.C_AHTHUK11.unique()  # 4
        #
        ch_map = self.lc4402.C_CENHEATHUK11.unique()  # 1 (5)
        buildtype_map = self.lc4402.C_TYPACCOM.unique()  # 2 (6)
        eth_map = self.lc4202.C_ETHHUK11.unique()  # 3 (7)
        cars_map = self.lc4202.C_CARSNO.unique()  # 4 (8)
        econ_map = self.lc4605.C_NSSEC.unique()  # 5 (9)

        tenure_rooms_occ = self.lc4404.loc[self.lc4404.GEOGRAPHY_CODE ==
                                           area].copy()
        # unmap indices (category values -> 0-based positions)
        # TODO might be quicker to unmap the entire table upfront?
        utils.unmap(tenure_rooms_occ.C_TENHUK11, tenure_map)
        utils.unmap(tenure_rooms_occ.C_ROOMS, rooms_map)
        utils.unmap(tenure_rooms_occ.C_SIZHUK11, occupants_map)

        m4404 = utils.unlistify(
            tenure_rooms_occ, ["C_TENHUK11", "C_ROOMS", "C_SIZHUK11"],
            [len(tenure_map),
             len(rooms_map),
             len(occupants_map)], "OBS_VALUE")

        # no bedroom info in Scottish data
        tenure_beds_occ = self.lc4405.loc[self.lc4405.GEOGRAPHY_CODE ==
                                          area].copy()

        # unmap indices
        utils.unmap(tenure_beds_occ.C_BEDROOMS, bedrooms_map)
        utils.unmap(tenure_beds_occ.C_TENHUK11, tenure_map)
        utils.unmap(tenure_beds_occ.C_SIZHUK11, occupants_map)

        m4405 = utils.unlistify(
            tenure_beds_occ, ["C_TENHUK11", "C_BEDROOMS", "C_SIZHUK11"],
            [len(tenure_map),
             len(bedrooms_map),
             len(occupants_map)], "OBS_VALUE")
        #    print(m4405.shape)

        tenure_accom = self.lc4408.loc[self.lc4408.GEOGRAPHY_CODE ==
                                       area].copy()

        utils.unmap(tenure_accom.C_TENHUK11, tenure_map)
        utils.unmap(tenure_accom.C_AHTHUK11, hhtype_map)

        m4408 = utils.unlistify(
            tenure_accom, ["C_TENHUK11", "C_AHTHUK11"],
            [len(tenure_map), len(hhtype_map)], "OBS_VALUE")
        #print(np.sum(m4404), np.sum(m4405), np.sum(m4408))

        # TODO relax IPF tolerance and maxiters when used within QISI?
        m4408dim = np.array([0, 4])
        # collapse m4408 tenure dim for scotland (tenure not comparable)
        if self.scotland:
            m4408 = np.sum(m4408, axis=0)
            m4408dim = np.array([4])
        p0 = humanleague.qisi(
            constraints, [np.array([0, 1, 2]),
                          np.array([0, 3, 2]), m4408dim],
            [m4404, m4405, m4408])

        # drop the survey seed if there are convergence problems
        # TODO check_humanleague_result needs complete refactoring
        if not isinstance(p0, dict) or not p0["conv"]:
            print("Dropping TROBH constraint due to convergence failure")
            p0 = humanleague.qisi(
                seed.get_impossible_TROBH(),
                [np.array([0, 1, 2]),
                 np.array([0, 3, 2]), m4408dim], [m4404, m4405, m4408])
            utils.check_humanleague_result(p0, [m4404, m4405, m4408],
                                           seed.get_impossible_TROBH())
        else:
            utils.check_humanleague_result(p0, [m4404, m4405, m4408],
                                           constraints)

        #print("p0 ok")

        tenure_ch_accom = self.lc4402.loc[self.lc4402.GEOGRAPHY_CODE ==
                                          area].copy()
        utils.unmap(tenure_ch_accom.C_CENHEATHUK11, ch_map)
        utils.unmap(tenure_ch_accom.C_TENHUK11, tenure_map)
        utils.unmap(tenure_ch_accom.C_TYPACCOM, buildtype_map)

        m4402 = utils.unlistify(
            tenure_ch_accom, ["C_TENHUK11", "C_CENHEATHUK11", "C_TYPACCOM"],
            [len(tenure_map), len(ch_map),
             len(buildtype_map)], "OBS_VALUE")

        tenure_eth_car = self.lc4202.loc[self.lc4202.GEOGRAPHY_CODE ==
                                         area].copy()
        utils.unmap(tenure_eth_car.C_ETHHUK11, eth_map)
        utils.unmap(tenure_eth_car.C_CARSNO, cars_map)
        utils.unmap(tenure_eth_car.C_TENHUK11, tenure_map)

        m4202 = utils.unlistify(
            tenure_eth_car, ["C_TENHUK11", "C_ETHHUK11", "C_CARSNO"],
            [len(tenure_map), len(eth_map),
             len(cars_map)], "OBS_VALUE")

        econ = self.lc4605.loc[self.lc4605.GEOGRAPHY_CODE == area].copy()
        utils.unmap(econ.C_NSSEC, econ_map)
        utils.unmap(econ.C_TENHUK11, tenure_map)

        # econ counts often slightly lower, need to tweak
        ##econ = utils.adjust(econ, tenure_eth_car)

        m4605 = utils.unlistify(
            econ, ["C_TENHUK11", "C_NSSEC"],
            [len(tenure_map), len(econ_map)], "OBS_VALUE")

        m4605_sum = np.sum(m4605)
        m4202_sum = np.sum(m4202)

        # rescale LC4605 so its total matches LC4202 before the joint synthesis
        if m4605_sum != m4202_sum:
            print("LC4402: %d LC4605: %d -> %d " %
                  (np.sum(m4402), m4605_sum, m4202_sum),
                  end="")
            tenure_4202 = np.sum(m4202, axis=(1, 2))
            nssec_4605_adj = humanleague.prob2IntFreq(
                np.sum(m4605, axis=0) / m4605_sum, m4202_sum)["freq"]
            #      m4605_adj = humanleague.qisi(m4605.astype(float), [np.array([0]), np.array([1])], [tenure_4202, nssec_4605_adj])
            # Convergence problems can occur when e.g. one of the tenure rows is zero yet the marginal total is nonzero,
            # Can get round this by adding a small number to the seed
            # effectively allowing zero states to be occupied with a finite probability
            #      if not m4605_adj["conv"]:
            m4605_adj = humanleague.qisi(
                m4605.astype(float) + 1.0 / m4202_sum,
                [np.array([0]), np.array([1])], [tenure_4202, nssec_4605_adj])

            utils.check_humanleague_result(m4605_adj,
                                           [tenure_4202, nssec_4605_adj])
            m4605 = m4605_adj["result"]
            #print("econ adj ok")

        # print(np.sum(p0["result"], axis=(1,2,3,4)))
        # print(np.sum(m4402, axis=(1,2)))
        # print(np.sum(m4202, axis=(1,2)))
        # print(np.sum(m4605, axis=1))

        # no seed constraint so just use QIS
        if self.scotland:
            # tenures not mappable in LC4202
            m4202 = np.sum(m4202, axis=0)
            m4605 = np.sum(m4605, axis=0)
            p1 = humanleague.qis([
                np.array([0, 1, 2, 3, 4]),
                np.array([0, 5, 6]),
                np.array([7, 8]),
                np.array([9])
            ], [p0["result"], m4402, m4202, m4605])
            #p1 = humanleague.qis([np.array([0, 1, 2, 3]), np.array([0, 4, 5]), np.array([0, 6, 7])], [p0["result"], m4402, m4202])
        else:
            p1 = humanleague.qis([
                np.array([0, 1, 2, 3, 4]),
                np.array([0, 5, 6]),
                np.array([0, 7, 8]),
                np.array([0, 9])
            ], [p0["result"], m4402, m4202, m4605])
            #p1 = humanleague.qis([np.array([0, 1, 2, 3]), np.array([0, 4, 5]), np.array([0, 6, 7])], [p0["result"], m4402, m4202])
        utils.check_humanleague_result(p1, [p0["result"], m4402, m4202, m4605])
        #print("p1 ok")

        table = humanleague.flatten(p1["result"])

        # one row per household; hoist the row count used in every repeat
        nrows = len(table[0])
        chunk = pd.DataFrame(columns=self.dwellings.columns.values)
        chunk.Area = np.repeat(area, nrows)
        chunk.LC4402_C_TENHUK11 = utils.remap(table[0], tenure_map)
        chunk.QS420_CELL = np.repeat(self.NOTAPPLICABLE, nrows)
        chunk.LC4404_C_ROOMS = utils.remap(table[1], rooms_map)
        chunk.LC4404_C_SIZHUK11 = utils.remap(table[2], occupants_map)
        chunk.LC4405EW_C_BEDROOMS = utils.remap(table[3], bedrooms_map)
        chunk.LC4408_C_AHTHUK11 = utils.remap(table[4], hhtype_map)
        chunk.LC4402_C_CENHEATHUK11 = utils.remap(table[5], ch_map)
        chunk.LC4402_C_TYPACCOM = utils.remap(table[6], buildtype_map)
        chunk.CommunalSize = np.repeat(self.NOTAPPLICABLE, nrows)
        chunk.LC4202_C_ETHHUK11 = utils.remap(table[7], eth_map)
        chunk.LC4202_C_CARSNO = utils.remap(table[8], cars_map)
        chunk.LC4605_C_NSSEC = utils.remap(table[9], econ_map)
        #print(chunk.head())
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent (append was a thin wrapper around it)
        self.dwellings = pd.concat([self.dwellings, chunk], ignore_index=True)