Example 1
    def _generate(self, agg_data, country):
        agg_data = humanleague.integerise(agg_data)["result"]
        # split 5y groups
        split = humanleague.prob2IntFreq(np.ones(5) / 5,
                                         int(agg_data.sum()))["freq"]

        msynth = humanleague.qis(
            [np.array([0, 1], dtype=int),
             np.array([2], dtype=int)], [agg_data, split])
        if not isinstance(msynth, dict):
            raise RuntimeError("microsynthesis general failure: %s" % msynth)
        if not msynth["conv"]:
            raise RuntimeError("microsynthesis convergence failure")

        #neworder.log(pop["result"])
        raw = humanleague.flatten(msynth["result"])
        pop = pd.DataFrame(columns=["AGE5", "AGE1", "SEX"])
        pop.AGE5 = raw[0]
        pop.AGE1 = raw[2]
        pop.SEX = raw[1]

        # could fail here if zero people in any category
        assert len(pop.AGE5.unique()) == 17
        assert len(pop.AGE1.unique()) == 5
        assert len(pop.SEX.unique()) == 2

        # construct single year of age
        pop["Country"] = country
        pop["AGE"] = pop.AGE5 * 5 + pop.AGE1
        self.pop = self.pop.append(pop.drop(["AGE5", "AGE1"], axis=1))
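The same call pattern can be exercised in isolation. The sketch below uses made-up marginals (a 2x2 age-band/sex table and a uniform 5-way single-year split) rather than the model's data, and assumes the dict-style return values used throughout these examples (they may differ between humanleague versions):

# standalone sketch of the pattern above: integer split, QIS, then flatten into a person table
import numpy as np
import pandas as pd
import humanleague

agg_data = np.array([[30, 28], [26, 16]])   # toy age-band x sex counts (assumption, not model data)
total = int(agg_data.sum())

# split the total evenly over 5 single-year-of-age categories
split = humanleague.prob2IntFreq(np.ones(5) / 5, total)["freq"]

msynth = humanleague.qis([np.array([0, 1]), np.array([2])], [agg_data, split])
assert isinstance(msynth, dict) and msynth["conv"]

raw = humanleague.flatten(msynth["result"])  # one list of category indices per dimension
pop = pd.DataFrame({"AGE5": raw[0], "SEX": raw[1], "AGE1": raw[2]})
assert len(pop) == total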
Example 2
    def _generate_from_total(self, agg_value, country):
        # TODO improve distribution
        sex_split = humanleague.prob2IntFreq(np.ones(2) / 2,
                                             int(agg_value))["freq"]
        age_split = humanleague.prob2IntFreq(np.ones(17) / 17,
                                             int(agg_value))["freq"]

        msynth = humanleague.qis(
            [np.array([0], dtype=int),
             np.array([1], dtype=int)], [age_split, sex_split])
        if not isinstance(msynth, dict):
            raise RuntimeError(
                "microsynthesis (from total) general failure: %s" % msynth)
        if not msynth["conv"]:
            raise RuntimeError(
                "microsynthesis (from total) convergence failure")

        raw = humanleague.flatten(msynth["result"])
        pop = pd.DataFrame(columns=["AGE", "SEX"])
        pop.AGE = raw[0]
        pop.SEX = raw[1]

        # could fail here if zero people in any category
        assert len(pop.AGE.unique()) == 17
        assert len(pop.SEX.unique()) == 2

        # construct single year of age
        pop["Country"] = country
        self.pop = self.pop.append(pop, sort=False)
Example 3
  def test_QIS_dim_indexing(self):

    # tricky array indexing - 1st dimension of d0 already sampled, remaining dimension
    # indices on slice of d0 need to be remapped

    m0 = np.ones([4, 6, 4, 4], dtype=int)
    m1 = np.ones([4, 4, 4], dtype=int) * 6

    ms = hl.qis([np.array([0, 1, 2, 3]), np.array([0, 4, 5])], [m0, m1])
    self.assertTrue(ms["conv"])

    ms = hl.qis([np.array([0, 4, 5]), np.array([0, 1, 2, 3])], [m1, m0])
    self.assertTrue(ms["conv"])

    ms = hl.qis([np.array([0, 1, 2]), np.array([0, 3, 4, 5])], [m1, m0])
    self.assertTrue(ms["conv"])
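The index arrays state which axes of the joint result each marginal constrains, so the result's dimensionality is the union of the indexed axes, and summing over the unindexed axes should recover each marginal. A minimal check of the first call above (assuming, as the other tests here do, that the result comes back as a NumPy array):

# verify the index-to-axis mapping of the first qis call above
import numpy as np
import humanleague as hl

m0 = np.ones([4, 6, 4, 4], dtype=int)        # constrains axes 0,1,2,3
m1 = np.ones([4, 4, 4], dtype=int) * 6       # constrains axes 0,4,5

ms = hl.qis([np.array([0, 1, 2, 3]), np.array([0, 4, 5])], [m0, m1])
assert ms["conv"]
assert ms["result"].shape == (4, 6, 4, 4, 4, 4)            # union of the indexed axes
assert np.allclose(np.sum(ms["result"], axis=(4, 5)), m0)  # collapse the axes m0 does not constrain
assert np.allclose(np.sum(ms["result"], axis=(1, 2, 3)), m1)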
Example 4
def microsynthesise_seed(dc1117, dc2101, dc6206):
    """
    Microsynthesise a seed population from census data
    """
    n_geog = len(dc1117.GEOGRAPHY_CODE.unique())
    n_sex = 2  #len(dc1117.C_SEX.unique())
    n_age = len(dc1117.C_AGE.unique())
    cen11sa = unlistify(dc1117, ["GEOGRAPHY_CODE", "C_SEX", "C_AGE"],
                        [n_geog, n_sex, n_age], "OBS_VALUE")
    n_eth = len(dc2101.C_ETHPUK11.unique())
    cen11se = unlistify(dc2101, ["GEOGRAPHY_CODE", "C_SEX", "C_ETHPUK11"],
                        [n_geog, n_sex, n_eth], "OBS_VALUE")

    # TODO use microdata (national or perhaps regional) Mistral/persistent_data/seed_ASE_EW.csv
    # - requires unified age structure

    # microsynthesise these two into a 4D seed (if this has a lot of zeros can have big impact on microsim)
    print("Synthesising 2011 seed population...", end='')
    msynth = hl.qis(
        [np.array([0, 1, 2]), np.array([0, 1, 3])], [cen11sa, cen11se])
    check_result(msynth)
    print("OK")
    return msynth["result"]
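check_result is not shown in this snippet; a hypothetical equivalent, mirroring the failure handling in Examples 1 and 2, would look something like this (a sketch only, not the project's actual helper):

def check_result(msynth):
    # hypothetical helper: same checks as the _generate* methods above
    if not isinstance(msynth, dict):
        raise RuntimeError("microsynthesis general failure: %s" % msynth)
    if not msynth["conv"]:
        raise RuntimeError("microsynthesis convergence failure")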
Example 5
  def test_QIS(self):

    # m = np.array([[10,20,10],[10,10,20],[20,10,10]])
    # idx = [np.array([0,1]), np.array([1,2])]
    # r = hl.qis(idx, [m, m])
    # self.assertTrue(false)

    m0 = np.array([52, 48]) 
    m1 = np.array([10, 77, 13])
    i0 = np.array([0])
    i1 = np.array([1])

    p = hl.qis([i0, i1], [m0, m1])
    print(p)
    self.assertTrue(p["conv"])
    self.assertLess(p["chiSq"], 0.04) 
    self.assertGreater(p["pValue"], 0.9) 
    #self.assertLess(p["degeneracy"], 0.04) TODO check the calculation
    self.assertEqual(p["pop"], 100.0)
    self.assertTrue(np.allclose(np.sum(p["result"], 0), m1))
    self.assertTrue(np.allclose(np.sum(p["result"], 1), m0))
    #self.assertTrue(np.array_equal(p["result"], np.array([[5, 40, 7],[5, 37, 6]])))

    m0 = np.array([52, 40, 4, 4]) 
    m1 = np.array([87, 10, 3])
    m2 = np.array([55, 15, 6, 12, 12])
    i0 = np.array([0])
    i1 = np.array([1])
    i2 = np.array([2])

    p = hl.qis([i0, i1, i2], [m0, m1, m2])
    self.assertTrue(p["conv"])
    self.assertLess(p["chiSq"], 73.0) # TODO seems a bit high (probably )
    self.assertGreater(p["pValue"], 0.0) # TODO this is suspect
    self.assertEqual(p["pop"], 100.0)
    self.assertTrue(np.allclose(np.sum(p["result"], (0, 1)), m2))
    self.assertTrue(np.allclose(np.sum(p["result"], (1, 2)), m0))
    self.assertTrue(np.allclose(np.sum(p["result"], (2, 0)), m1))

    # Test flatten functionality
    table = hl.flatten(p["result"])

    # length is no of dims
    self.assertTrue(len(table) == 3)
    # length of element is pop
    self.assertTrue(len(table[0]) == p["pop"])
    # check consistent with marginals
    for i in range(0, len(m0)):
      self.assertTrue(table[0].count(i) == m0[i])
    for i in range(0, len(m1)):
      self.assertTrue(table[1].count(i) == m1[i])
    for i in range(0, len(m2)):
      self.assertTrue(table[2].count(i) == m2[i])


    m0 = np.array([52, 48]) 
    m1 = np.array([87, 13])
    m2 = np.array([67, 33])
    m3 = np.array([55, 45])
    i0 = np.array([0])
    i1 = np.array([1])
    i2 = np.array([2])
    i3 = np.array([3])

    p = hl.qis([i0, i1, i2, i3], [m0, m1, m2, m3])
    self.assertTrue(p["conv"])
    self.assertLess(p["chiSq"], 10) 
    self.assertGreater(p["pValue"], 0.002) # TODO this looks suspect too
    self.assertEqual(p["pop"], 100)
    self.assertTrue(np.allclose(np.sum(p["result"], (0, 1, 2)), m3))
    self.assertTrue(np.allclose(np.sum(p["result"], (1, 2, 3)), m0))
    self.assertTrue(np.allclose(np.sum(p["result"], (2, 3, 0)), m1))
    self.assertTrue(np.allclose(np.sum(p["result"], (3, 0, 1)), m2))

    m = np.array([[10,20,10],[10,10,20],[20,10,10]])
    idx = [np.array([0,1]), np.array([1,2])]
    p = hl.qis(idx, [m, m])
    #print(p)
    self.assertTrue(p["conv"])
    self.assertLess(p["chiSq"], 10) 
    self.assertGreater(p["pValue"], 0.27) 
    self.assertEqual(p["pop"], 120)
    print(np.sum(p["result"], 2))
    self.assertTrue(np.allclose(np.sum(p["result"], 2), m))
    self.assertTrue(np.allclose(np.sum(p["result"], 0), m))
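The flatten checks above can be seen on a tiny hand-computable case: each individual in the integer table becomes one entry in every per-dimension list, so the category counts in each list equal the corresponding marginal sums. The 2x2 table below is a made-up illustration, assuming flatten accepts any non-negative integer array (in these examples it is only ever applied to qis/qisi results):

# toy illustration of flatten: 6 individuals from a 2x2 count table
import numpy as np
import humanleague as hl

result = np.array([[2, 1],
                   [0, 3]])
table = hl.flatten(result)

assert len(table) == result.ndim          # one list per dimension
assert len(table[0]) == result.sum()      # one entry per individual
assert table[0].count(0) == 3 and table[0].count(1) == 3  # row sums
assert table[1].count(0) == 2 and table[1].count(1) == 4  # column sums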
Example 6
    def __add_households(self, area, constraints):

        # TODO use actual values from tables
        # TODO make members?                            # Dim (overall dim)
        tenure_map = self.lc4402.C_TENHUK11.unique()  # 0
        rooms_map = self.lc4404.C_ROOMS.unique()  # 1
        occupants_map = self.lc4404.C_SIZHUK11.unique()  # 2
        bedrooms_map = self.lc4405.C_BEDROOMS.unique()  # 3 [1,2,3,4] or [-1]
        hhtype_map = self.lc4408.C_AHTHUK11.unique()  # 4
        #
        ch_map = self.lc4402.C_CENHEATHUK11.unique()  # 1 (5)
        buildtype_map = self.lc4402.C_TYPACCOM.unique()  # 2 (6)
        eth_map = self.lc4202.C_ETHHUK11.unique()  # 3 (7)
        cars_map = self.lc4202.C_CARSNO.unique()  # 4 (8)
        econ_map = self.lc4605.C_NSSEC.unique()  # 5 (9)

        tenure_rooms_occ = self.lc4404.loc[self.lc4404.GEOGRAPHY_CODE ==
                                           area].copy()
        # unmap indices
        # TODO might be quicker to unmap the entire table upfront?
        utils.unmap(tenure_rooms_occ.C_TENHUK11, tenure_map)
        utils.unmap(tenure_rooms_occ.C_ROOMS, rooms_map)
        utils.unmap(tenure_rooms_occ.C_SIZHUK11, occupants_map)

        m4404 = utils.unlistify(
            tenure_rooms_occ, ["C_TENHUK11", "C_ROOMS", "C_SIZHUK11"],
            [len(tenure_map),
             len(rooms_map),
             len(occupants_map)], "OBS_VALUE")

        # no bedroom info in Scottish data
        tenure_beds_occ = self.lc4405.loc[self.lc4405.GEOGRAPHY_CODE ==
                                          area].copy()

        # unmap indices
        utils.unmap(tenure_beds_occ.C_BEDROOMS, bedrooms_map)
        utils.unmap(tenure_beds_occ.C_TENHUK11, tenure_map)
        utils.unmap(tenure_beds_occ.C_SIZHUK11, occupants_map)

        m4405 = utils.unlistify(
            tenure_beds_occ, ["C_TENHUK11", "C_BEDROOMS", "C_SIZHUK11"],
            [len(tenure_map),
             len(bedrooms_map),
             len(occupants_map)], "OBS_VALUE")
        #    print(m4405.shape)

        tenure_accom = self.lc4408.loc[self.lc4408.GEOGRAPHY_CODE ==
                                       area].copy()

        utils.unmap(tenure_accom.C_TENHUK11, tenure_map)
        utils.unmap(tenure_accom.C_AHTHUK11, hhtype_map)

        m4408 = utils.unlistify(
            tenure_accom, ["C_TENHUK11", "C_AHTHUK11"],
            [len(tenure_map), len(hhtype_map)], "OBS_VALUE")
        #print(np.sum(m4404), np.sum(m4405), np.sum(m4408))

        # TODO relax IPF tolerance and maxiters when used within QISI?
        m4408dim = np.array([0, 4])
        # collapse m4408 dim for scotland
        if self.scotland:
            m4408 = np.sum(m4408, axis=0)
            m4408dim = np.array([4])
        p0 = humanleague.qisi(
            constraints, [np.array([0, 1, 2]),
                          np.array([0, 3, 2]), m4408dim],
            [m4404, m4405, m4408])

        # drop the survey seed if there are convergence problems
        # TODO check_humanleague_result needs complete refactoring
        if not isinstance(p0, dict) or not p0["conv"]:
            print("Dropping TROBH constraint due to convergence failure")
            p0 = humanleague.qisi(
                seed.get_impossible_TROBH(),
                [np.array([0, 1, 2]),
                 np.array([0, 3, 2]), m4408dim], [m4404, m4405, m4408])
            utils.check_humanleague_result(p0, [m4404, m4405, m4408],
                                           seed.get_impossible_TROBH())
        else:
            utils.check_humanleague_result(p0, [m4404, m4405, m4408],
                                           constraints)

        #print("p0 ok")

        tenure_ch_accom = self.lc4402.loc[self.lc4402.GEOGRAPHY_CODE ==
                                          area].copy()
        utils.unmap(tenure_ch_accom.C_CENHEATHUK11, ch_map)
        utils.unmap(tenure_ch_accom.C_TENHUK11, tenure_map)
        utils.unmap(tenure_ch_accom.C_TYPACCOM, buildtype_map)

        m4402 = utils.unlistify(
            tenure_ch_accom, ["C_TENHUK11", "C_CENHEATHUK11", "C_TYPACCOM"],
            [len(tenure_map), len(ch_map),
             len(buildtype_map)], "OBS_VALUE")

        tenure_eth_car = self.lc4202.loc[self.lc4202.GEOGRAPHY_CODE ==
                                         area].copy()
        utils.unmap(tenure_eth_car.C_ETHHUK11, eth_map)
        utils.unmap(tenure_eth_car.C_CARSNO, cars_map)
        utils.unmap(tenure_eth_car.C_TENHUK11, tenure_map)

        m4202 = utils.unlistify(
            tenure_eth_car, ["C_TENHUK11", "C_ETHHUK11", "C_CARSNO"],
            [len(tenure_map), len(eth_map),
             len(cars_map)], "OBS_VALUE")

        econ = self.lc4605.loc[self.lc4605.GEOGRAPHY_CODE == area].copy()
        utils.unmap(econ.C_NSSEC, econ_map)
        utils.unmap(econ.C_TENHUK11, tenure_map)

        # econ counts often slightly lower, need to tweak
        ##econ = utils.adjust(econ, tenure_eth_car)

        m4605 = utils.unlistify(
            econ, ["C_TENHUK11", "C_NSSEC"],
            [len(tenure_map), len(econ_map)], "OBS_VALUE")

        m4605_sum = np.sum(m4605)
        m4202_sum = np.sum(m4202)

        if m4605_sum != m4202_sum:
            print("LC4402: %d LC4605: %d -> %d " %
                  (np.sum(m4402), m4605_sum, m4202_sum),
                  end="")
            tenure_4202 = np.sum(m4202, axis=(1, 2))
            nssec_4605_adj = humanleague.prob2IntFreq(
                np.sum(m4605, axis=0) / m4605_sum, m4202_sum)["freq"]
            #      m4605_adj = humanleague.qisi(m4605.astype(float), [np.array([0]), np.array([1])], [tenure_4202, nssec_4605_adj])
            # Convergence problems can occur when e.g. one of the tenure rows is zero yet the marginal total is nonzero;
            # this can be worked around by adding a small number to the seed,
            # effectively allowing zero states to be occupied with a finite probability
            #      if not m4605_adj["conv"]:
            m4605_adj = humanleague.qisi(
                m4605.astype(float) + 1.0 / m4202_sum,
                [np.array([0]), np.array([1])], [tenure_4202, nssec_4605_adj])

            utils.check_humanleague_result(m4605_adj,
                                           [tenure_4202, nssec_4605_adj])
            m4605 = m4605_adj["result"]
            #print("econ adj ok")

        # print(np.sum(p0["result"], axis=(1,2,3,4)))
        # print(np.sum(m4402, axis=(1,2)))
        # print(np.sum(m4202, axis=(1,2)))
        # print(np.sum(m4605, axis=1))

        # no seed constraint so just use QIS
        if self.scotland:
            # tenures not mappable in LC4202
            m4202 = np.sum(m4202, axis=0)
            m4605 = np.sum(m4605, axis=0)
            p1 = humanleague.qis([
                np.array([0, 1, 2, 3, 4]),
                np.array([0, 5, 6]),
                np.array([7, 8]),
                np.array([9])
            ], [p0["result"], m4402, m4202, m4605])
            #p1 = humanleague.qis([np.array([0, 1, 2, 3]), np.array([0, 4, 5]), np.array([0, 6, 7])], [p0["result"], m4402, m4202])
        else:
            p1 = humanleague.qis([
                np.array([0, 1, 2, 3, 4]),
                np.array([0, 5, 6]),
                np.array([0, 7, 8]),
                np.array([0, 9])
            ], [p0["result"], m4402, m4202, m4605])
            #p1 = humanleague.qis([np.array([0, 1, 2, 3]), np.array([0, 4, 5]), np.array([0, 6, 7])], [p0["result"], m4402, m4202])
        utils.check_humanleague_result(p1, [p0["result"], m4402, m4202, m4605])
        #print("p1 ok")

        table = humanleague.flatten(p1["result"])

        chunk = pd.DataFrame(columns=self.dwellings.columns.values)
        chunk.Area = np.repeat(area, len(table[0]))
        chunk.LC4402_C_TENHUK11 = utils.remap(table[0], tenure_map)
        chunk.QS420_CELL = np.repeat(self.NOTAPPLICABLE, len(table[0]))
        chunk.LC4404_C_ROOMS = utils.remap(table[1], rooms_map)
        chunk.LC4404_C_SIZHUK11 = utils.remap(table[2], occupants_map)
        chunk.LC4405EW_C_BEDROOMS = utils.remap(table[3], bedrooms_map)
        chunk.LC4408_C_AHTHUK11 = utils.remap(table[4], hhtype_map)
        chunk.LC4402_C_CENHEATHUK11 = utils.remap(table[5], ch_map)
        chunk.LC4402_C_TYPACCOM = utils.remap(table[6], buildtype_map)
        chunk.CommunalSize = np.repeat(self.NOTAPPLICABLE, len(table[0]))
        chunk.LC4202_C_ETHHUK11 = utils.remap(table[7], eth_map)
        chunk.LC4202_C_CARSNO = utils.remap(table[8], cars_map)
        chunk.LC4605_C_NSSEC = utils.remap(table[9], econ_map)
        #print(chunk.head())
        self.dwellings = self.dwellings.append(chunk, ignore_index=True)
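The seed-perturbation workaround mentioned in the comments above can be illustrated in isolation. The numbers below are toy values (not census data); the qisi call shape follows the one used in this example, and the consistency checks are only applied if the synthesis converges:

# sketch of the zero-seed workaround: a zero cell that the marginals force to be occupied
import numpy as np
import humanleague

seed = np.array([[10.0, 0.0],
                 [5.0, 5.0]])
rows = np.array([12, 8])      # row 0 needs 12 but column 0 holds only 10, so cell (0,1) must be occupied
cols = np.array([10, 10])

total = int(rows.sum())
perturbed = seed + 1.0 / total  # give the zero state a small but finite probability
adj = humanleague.qisi(perturbed, [np.array([0]), np.array([1])], [rows, cols])
if isinstance(adj, dict) and adj["conv"]:
    assert np.allclose(np.sum(adj["result"], axis=1), rows)
    assert np.allclose(np.sum(adj["result"], axis=0), cols)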
Example 7
    def __get_census_data_sc(self):
        #print(self.api_sc.get_metadata("LC4404SC", self.resolution))
        self.lc4402 = self.api_sc.get_data("LC4402SC",
                                           self.region,
                                           self.resolution,
                                           category_filters={
                                               "LC4402SC_0_CODE": [2, 3, 5, 6],
                                               "LC4402SC_1_CODE": [2, 3, 4, 5],
                                               "LC4402SC_2_CODE": [1, 2]
                                           })
        self.lc4402.rename(
            {
                "LC4402SC_1_CODE": "C_TYPACCOM",
                "LC4402SC_2_CODE": "C_CENHEATHUK11",
                "LC4402SC_0_CODE": "C_TENHUK11"
            },
            axis=1,
            inplace=True)
        #print(self.lc4402.head())
        # ensure counts are consistent across tables
        checksum = self.lc4402.OBS_VALUE.sum()

        # construct a tenure marginal for synthesis of other tables unavailable in Scottish dataset
        ngeogs = len(self.lc4402.GEOGRAPHY_CODE.unique())
        ntenures = len(self.lc4402.C_TENHUK11.unique())
        tenure_table = self.lc4402.groupby([
            "GEOGRAPHY_CODE", "C_TENHUK11"
        ]).sum().reset_index().drop(["C_TYPACCOM", "C_CENHEATHUK11"], axis=1)
        m4402 = utils.unlistify(tenure_table, ["GEOGRAPHY_CODE", "C_TENHUK11"],
                                [ngeogs, ntenures], "OBS_VALUE")

        # synthesise LC4404 from QS407 and QS406
        # LC4404SC room categories are: 1, 2-3, 4-5, 6+ so not very useful, using univariate tables instead
        #print(self.api_sc.get_metadata("QS407SC", self.resolution))
        qs407 = self.api_sc.get_data(
            "QS407SC",
            self.region,
            self.resolution,
            category_filters={"QS407SC_0_CODE": range(1, 10)})
        qs407.rename({"QS407SC_0_CODE": "C_ROOMS"}, axis=1, inplace=True)
        qs407 = utils.cap_value(qs407, "C_ROOMS", 6, "OBS_VALUE")
        #print(qs407.head())
        assert qs407.OBS_VALUE.sum() == checksum

        #print(self.api_sc.get_metadata("QS406SC", self.resolution))
        qs406 = self.api_sc.get_data(
            "QS406SC",
            self.region,
            self.resolution,
            category_filters={"QS406SC_0_CODE": range(1, 9)})
        qs406.rename({"QS406SC_0_CODE": "C_SIZHUK11"}, axis=1, inplace=True)
        qs406 = utils.cap_value(qs406, "C_SIZHUK11", 4, "OBS_VALUE")
        #print(qs406.head())
        assert qs406.OBS_VALUE.sum() == checksum

        nrooms = len(qs407.C_ROOMS.unique())
        nsizes = len(qs406.C_SIZHUK11.unique())

        m407 = utils.unlistify(qs407, ["GEOGRAPHY_CODE", "C_ROOMS"],
                               [ngeogs, nrooms], "OBS_VALUE")
        m406 = utils.unlistify(qs406, ["GEOGRAPHY_CODE", "C_SIZHUK11"],
                               [ngeogs, nsizes], "OBS_VALUE")

        a4404 = humanleague.qis(
            [np.array([0, 1]),
             np.array([0, 2]),
             np.array([0, 3])], [m4402, m407, m406])
        utils.check_humanleague_result(a4404, [m4402, m407, m406])
        self.lc4404 = utils.listify(
            a4404["result"], "OBS_VALUE",
            ["GEOGRAPHY_CODE", "C_TENHUK11", "C_ROOMS", "C_SIZHUK11"])
        self.lc4404.GEOGRAPHY_CODE = utils.remap(self.lc4404.GEOGRAPHY_CODE,
                                                 qs406.GEOGRAPHY_CODE.unique())
        self.lc4404.C_TENHUK11 = utils.remap(self.lc4404.C_TENHUK11,
                                             tenure_table.C_TENHUK11.unique())
        self.lc4404.C_ROOMS = utils.remap(self.lc4404.C_ROOMS,
                                          qs407.C_ROOMS.unique())
        self.lc4404.C_SIZHUK11 = utils.remap(self.lc4404.C_SIZHUK11,
                                             qs406.C_SIZHUK11.unique())

        #print(self.lc4404.head())

        assert self.lc4404.OBS_VALUE.sum() == checksum

        # no bedroom info is available
        # for now randomly sample from survey on rooms
        # TODO microsynth using tenure/occs also?
        self.lc4405 = self.lc4404.copy()
        #    self.lc4405.rename({"C_ROOMS": "C_BEDROOMS"}, axis=1, inplace=True)
        self.lc4405["C_BEDROOMS"] = Household.UNKNOWN
        room_bed_dist = np.sum(seed.get_survey_TROBH(), axis=(0, 2, 4))
        #print(room_bed_dist)
        # c = [1,2,3,4]
        # for i in range(0,6):
        #   p = room_bed_dist[i]/np.sum(room_bed_dist[i])
        #   n = len(self.lc4405[self.lc4405.C_ROOMS == i+1])
        #   #print(np.random.choice(c, n, p=p))
        #   self.lc4405.loc[self.lc4405.C_ROOMS == i+1, "C_BEDROOMS"] = np.random.choice(c, n, p=p)
        #assert len(self.lc4405[self.lc4405.C_BEDROOMS == Household.UNKNOWN]) == 0

        assert len(
            self.lc4405[self.lc4405.C_ROOMS < self.lc4405.C_BEDROOMS]) == 0
        self.lc4405.drop("C_ROOMS", axis=1, inplace=True)
        self.lc4405 = self.lc4405.groupby(
            ["GEOGRAPHY_CODE", "C_TENHUK11", "C_SIZHUK11",
             "C_BEDROOMS"]).sum().reset_index()
        #print(self.lc4405)
        assert self.lc4405.OBS_VALUE.sum() == checksum

        # synthesise LC4408

        #print(self.api_sc.get_metadata("QS116SC", self.resolution))
        # 1'One person household',
        # 2'Married couple household: No dependent children',
        # 3'Married couple household: With dependent children',
        # 4'Same-sex civil partnership couple household',
        # 5'Cohabiting couple household: No dependent children',
        # 6'Cohabiting couple household: With dependent children',
        # 7'Lone parent household: No dependent children',
        # 8'Lone parent household: With dependent children',
        # 9'Multi-person household: All full-time students',
        # 10'Multi-person household: Other']}}
        qs116 = self.api_sc.get_data(
            "QS116SC",
            self.region,
            self.resolution,
            category_filters={"QS116SC_0_CODE": range(1, 11)})
        qs116.rename({"QS116SC_0_CODE": "C_AHTHUK11"}, axis=1, inplace=True)
        # map to lower-resolution household types
        # 1 -> 1 (single)
        # (2,3,4) -> 2 (married/civil couple)
        # (5,6) -> 3 (cohabiting couple)
        # (7,8) -> 4 (single parent)
        # (9,10) -> 5 (mixed)
        qs116.loc[(qs116.C_AHTHUK11 == 2) | (qs116.C_AHTHUK11 == 3) |
                  (qs116.C_AHTHUK11 == 4), "C_AHTHUK11"] = 2
        qs116.loc[(qs116.C_AHTHUK11 == 5) | (qs116.C_AHTHUK11 == 6),
                  "C_AHTHUK11"] = 3
        qs116.loc[(qs116.C_AHTHUK11 == 7) | (qs116.C_AHTHUK11 == 8),
                  "C_AHTHUK11"] = 4
        qs116.loc[(qs116.C_AHTHUK11 == 9) | (qs116.C_AHTHUK11 == 10),
                  "C_AHTHUK11"] = 5
        # ...and consolidate
        qs116 = qs116.groupby(["GEOGRAPHY_CODE",
                               "C_AHTHUK11"]).sum().reset_index()

        assert qs116.OBS_VALUE.sum() == checksum

        nhhtypes = len(qs116.C_AHTHUK11.unique())
        m116 = utils.unlistify(qs116, ["GEOGRAPHY_CODE", "C_AHTHUK11"],
                               [ngeogs, nhhtypes], "OBS_VALUE")

        a4408 = humanleague.qis(
            [np.array([0, 1]), np.array([0, 2])], [m4402, m116])
        utils.check_humanleague_result(a4408, [m4402, m116])

        self.lc4408 = utils.listify(
            a4408["result"], "OBS_VALUE",
            ["GEOGRAPHY_CODE", "C_TENHUK11", "C_AHTHUK11"])
        self.lc4408.GEOGRAPHY_CODE = utils.remap(self.lc4408.GEOGRAPHY_CODE,
                                                 qs116.GEOGRAPHY_CODE.unique())
        self.lc4408.C_TENHUK11 = utils.remap(self.lc4408.C_TENHUK11,
                                             self.lc4402.C_TENHUK11.unique())
        self.lc4408.C_AHTHUK11 = utils.remap(self.lc4408.C_AHTHUK11,
                                             qs116.C_AHTHUK11.unique())
        #print(self.lc4408.head())
        assert self.lc4408.OBS_VALUE.sum() == checksum

        # LC1105
        #print(self.api_sc.get_metadata("KS101SC", self.resolution))
        self.lc1105 = self.api_sc.get_data(
            "KS101SC",
            self.region,
            self.resolution,
            category_filters={"KS101SC_0_CODE": [3, 4]})
        self.lc1105.rename({"KS101SC_0_CODE": "C_RESIDENCE_TYPE"},
                           axis=1,
                           inplace=True)
        # 3->1, 4->2
        self.lc1105["C_RESIDENCE_TYPE"] = self.lc1105["C_RESIDENCE_TYPE"] - 2
        #print(self.lc1105.OBS_VALUE.sum(), checksum)

        # occupied vs unoccupied
        #print(self.api_sc.get_metadata("KS401SC", self.resolution))
        # 5'All household spaces: Occupied',
        # 6'All household spaces: Unoccupied: Second residence/holiday accommodation',
        # 7'All household spaces: Unoccupied: Vacant',
        self.ks401 = self.api_sc.get_data(
            "KS401SC",
            self.region,
            self.resolution,
            category_filters={"KS401SC_0_CODE": [5, 6, 7]})
        self.ks401.rename({"KS401SC_0_CODE": "CELL"}, axis=1, inplace=True)
        self.ks401 = utils.cap_value(self.ks401, "CELL", 6, "OBS_VALUE")
        assert self.ks401[self.ks401.CELL == 5].OBS_VALUE.sum() == checksum

        #print(self.api_sc.get_metadata("LC4202SC", self.resolution))
        #{'table': 'LC4202SC', 'description': '', 'geography': 'OA11', 'fields': {'LC4202SC_1_CODE': [
        # 'All households:',
        # 'Owned:',
        # 'Social rented:',
        # 'Private rented or living rent free:'],
        # 'LC4202SC_2_CODE': [
        # 'Total',
        # 'Number of cars or vans in household: No cars or vans',
        # 'Number of cars or vans in household: One car or van',
        # 'Number of cars or vans in household:Two or more cars or vans'],
        # 'LC4202SC_0_CODE': [
        # 'All households',
        # 'White',
        # 'Mixed or multiple ethnic groups',
        # 'Asian Asian Scottish or Asian British',
        # 'African',
        # 'Caribbean or Black',
        # 'Other ethnic groups']}}
        self.lc4202 = self.api_sc.get_data("LC4202SC",
                                           self.region,
                                           self.resolution,
                                           category_filters={
                                               "LC4202SC_1_CODE": [1, 2, 3],
                                               "LC4202SC_2_CODE": [1, 2, 3],
                                               "LC4202SC_0_CODE":
                                               [1, 2, 3, 4, 5, 6]
                                           })
        self.lc4202.rename(
            {
                "LC4202SC_2_CODE": "C_CARSNO",
                "LC4202SC_1_CODE": "C_TENHUK11",
                "LC4202SC_0_CODE": "C_ETHHUK11"
            },
            axis=1,
            inplace=True)
        # TODO how to map tenure 1->2/3?
        self.lc4202.loc[self.lc4202.C_TENHUK11 == 3, "C_TENHUK11"] = 6
        self.lc4202.loc[self.lc4202.C_TENHUK11 == 2, "C_TENHUK11"] = 5
        self.lc4202.loc[self.lc4202.C_TENHUK11 == 1, "C_TENHUK11"] = 3  # OR 2?

        assert self.lc4202.OBS_VALUE.sum() == checksum

        #print(self.api_sc.get_metadata("LC4605SC", self.resolution))
        #{'table': 'LC4605SC', 'description': '', 'geography': 'OA11', 'fields': {'LC4605SC_1_CODE': [
        # 'All HRPs aged 16 to 74',
        # 'Owned: Total',
        # 'Owned: Owned outright',
        # 'Owned: Owned witha mortgage or loan or shared ownership',
        # 'Rented or living rent free: Total',
        # 'Rented or living rent free: Social rented',
        # 'Rented or living rent free: Private rented or living rent free'],
        # 'LC4605SC_0_CODE': ['All HRPs aged 16 to 74',
        # '1. Higher managerial administrative and professional occupations',
        # '2. Lower managerial administrative and professional occupations',
        # '3. Intermediate occupations',
        # '4. Small employers and own account workers',
        # '5. Lower supervisory and technical occupations',
        # '6. Semi-routine occupations',
        # '7. Routine occupations',
        # '8. Never worked and long-term unemployed',
        # 'L15 Full-time students']}}
        self.lc4605 = self.api_sc.get_data("LC4605SC",
                                           self.region,
                                           self.resolution,
                                           category_filters={
                                               "LC4605SC_1_CODE": [2, 3, 5, 6],
                                               "LC4605SC_0_CODE": range(1, 10)
                                           })
        self.lc4605.rename(
            {
                "LC4605SC_1_CODE": "C_TENHUK11",
                "LC4605SC_0_CODE": "C_NSSEC"
            },
            axis=1,
            inplace=True)
        # TODO add retired?
        print(self.lc4605.OBS_VALUE.sum(), checksum, "TODO add retired")

        #print(self.api_sc.get_metadata("QS420SC", self.resolution))
        cats = [2, 6, 11, 14, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]

        # merge the two communal tables (so we have establishment and people counts)
        self.communal = self.api_sc.get_data("QS420SC",
                                             self.region,
                                             self.resolution,
                                             category_filters={
                                                 "QS420SC_0_CODE": cats
                                             }).rename(
                                                 {"QS420SC_0_CODE": "CELL"},
                                                 axis=1)
        qs421 = self.api_sc.get_data("QS421SC",
                                     self.region,
                                     self.resolution,
                                     category_filters={
                                         "QS421SC_0_CODE": cats
                                     }).rename({"OBS_VALUE": "CommunalSize"},
                                               axis=1)
        #print(qs421.head())
        self.communal = self.communal.merge(
            qs421,
            left_on=["GEOGRAPHY_CODE", "CELL"],
            right_on=["GEOGRAPHY_CODE",
                      "QS421SC_0_CODE"]).drop("QS421SC_0_CODE", axis=1)