Beispiel #1
0
    def filter(self,
               geog_codes,
               years=None,
               ages=range(0, 91),
               genders=[1, 2]):
        """
        Return the rows of the loaded data that match every supplied filter.

        geog_codes: a single geography code (str) or a list-like of codes
        years: year(s) to keep; passed through utils.trim_range() together
               with the min/max year of the first country found
        ages: C_AGE values to keep
        genders: GENDER values to keep (the default list is only read, never mutated)

        Returns one DataFrame, indexed 0..n-1, holding the matching rows from
        the dataset of each country the codes belong to.
        """
        # a bare string must be wrapped so that isin() treats it as one code
        if isinstance(geog_codes, str):
            geog_codes = [geog_codes]

        countries = utils.country(geog_codes)

        # TODO fix incorrect assumption is that all countries have the same year range
        years = utils.trim_range(years, self.min_year(countries[0]),
                                 self.max_year(countries[0]))

        # loop over datasets as needed, collecting the filtered rows of each
        matches = []
        for country in countries:
            dataset = self.data[country]
            mask = (dataset.GEOGRAPHY_CODE.isin(geog_codes)
                    & dataset.PROJECTED_YEAR_NAME.isin(years)
                    & dataset.C_AGE.isin(ages)
                    & dataset.GENDER.isin(genders))
            matches.append(dataset[mask])

        # DataFrame.append no longer exists in pandas 2.x; a single concat is the
        # supported equivalent and avoids re-copying the result once per country
        return pd.concat(matches, ignore_index=True, sort=False)
Beispiel #2
0
 def max_year(self, code):
     """
     Return the largest PROJECTED_YEAR_NAME held for the given code.

     Expects a single LAD or country code, not a list.
     """
     # a code containing "0" is presumably a LAD code, so it is mapped to its
     # country first; anything else is used directly as the dataset key
     country = utils.country(code)[0] if "0" in code else code
     projected_years = self.data[country].PROJECTED_YEAR_NAME
     return max(projected_years.unique())
Beispiel #3
0
  def extrapolate(self, npp, geog_code, year_range):
    """
    Return data for geog_code over year_range, extending past the final
    available year by rescaling that final year's data.

    Years up to self.max_year(geog_code) come straight from self.filter().
    Every later year is the final year's rows with OBS_VALUE multiplied by
    npp.year_ratio("ppp", ...) and PROJECTED_YEAR_NAME overwritten with that year.

    Returns a single DataFrame; it is re-indexed 0..n-1 only when at least one
    year had to be extrapolated.
    """
    # the final year does not change inside the loop, so look it up once
    final_year = self.max_year(geog_code)

    (in_range, ex_range) = utils.split_range(year_range, final_year)

    all_years = self.filter(geog_code, in_range)

    extrapolated = []
    for year in ex_range:
      # a fresh frame per year: it is modified in place below
      data = self.filter([geog_code], [final_year])
      scaling = npp.year_ratio("ppp", utils.country(geog_code), final_year, year)
      # both frames need the same number of rows for the multiplication below
      assert len(data) == len(scaling)
      data.OBS_VALUE = data.OBS_VALUE * scaling.OBS_VALUE
      data.PROJECTED_YEAR_NAME = year
      extrapolated.append(data)

    # nothing beyond the available range: hand back the filtered data untouched
    if not extrapolated:
      return all_years

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead
    return pd.concat([all_years] + extrapolated, ignore_index=True)
Beispiel #4
0
    def filter(self,
               geog_codes,
               years=None,
               ages=range(0, 91),
               genders=[1, 2]):
        """
        Return the rows of the loaded data that match every supplied filter.

        geog_codes: a single geography code (str) or a list-like of codes
        years: year(s) to keep; None or an empty container is passed to
               utils.trim_range() as None
        ages: a single C_AGE value or a list-like of values to keep
        genders: a single GENDER value or a list-like of values to keep
                 (the default list is only read, never mutated)

        Returns one DataFrame, indexed 0..n-1, holding the matching rows from
        the dataset of each country the codes belong to.

        Raises ValueError if any requested geography code has no rows in the
        filtered result.
        """
        # convert inputs to arrays if single values supplied (for isin)
        if isinstance(geog_codes, str):
            geog_codes = [geog_codes]

        if np.isscalar(ages):
            ages = [ages]

        if np.isscalar(genders):
            genders = [genders]

        # An empty years container is normalised to None, which was causing a
        # problem in utils.trim_range() below otherwise. The size is tested
        # explicitly because truth-testing a multi-element numpy array raises.
        if years is not None and np.size(years) == 0:
            years = None

        countries = utils.country(geog_codes)

        # TODO fix incorrect assumption is that all countries have the same year range
        years = utils.trim_range(years, self.min_year(countries[0]),
                                 self.max_year(countries[0]))

        # loop over datasets as needed, collecting the filtered rows of each
        matches = []
        for country in countries:
            dataset = self.data[country]
            mask = (dataset.GEOGRAPHY_CODE.isin(geog_codes)
                    & dataset.PROJECTED_YEAR_NAME.isin(years)
                    & dataset.C_AGE.isin(ages)
                    & dataset.GENDER.isin(genders))
            matches.append(dataset[mask])

        # DataFrame.append no longer exists in pandas 2.x; concatenate once instead
        retval = pd.concat(matches, ignore_index=True, sort=False)

        # check for any codes requested that weren't present (this check is far easier to do on the result)
        invalid_codes = np.setdiff1d(geog_codes,
                                     retval.GEOGRAPHY_CODE.unique())
        if len(invalid_codes) > 0:
            raise ValueError(
                "Filter for LAD code(s): %s for years %s returned no data (check also age/gender filters)"
                % (str(invalid_codes), str(years)))

        return retval
Beispiel #5
0
  def filter(self, geog_codes, years=None, ages=range(0,91), genders=[1,2]):
    """
    Return the rows of one country's dataset that match every supplied filter.

    geog_codes: a single geography code (str) or a list-like of codes, all
                assumed to belong to the same country as the first one
    years: year(s) to keep, trimmed by utils.trim_range() to the min/max year
           reported for the first code
    ages: C_AGE values to keep
    genders: GENDER values to keep

    Returns the matching rows as a DataFrame with its index reset to 0..n-1.
    """
    # a lone code is wrapped in a list so that isin() matches it as a whole
    if isinstance(geog_codes, str):
      geog_codes = [geog_codes]

    # only the first code is inspected: every code is assumed to share its country
    first_code = geog_codes[0]
    country = utils.country(first_code)

    years = utils.trim_range(years, self.min_year(first_code), self.max_year(first_code))

    # build the row mask from the four column filters, then select
    dataset = self.data[country]
    in_geography = dataset.GEOGRAPHY_CODE.isin(geog_codes)
    in_years = dataset.PROJECTED_YEAR_NAME.isin(years)
    in_ages = dataset.C_AGE.isin(ages)
    in_genders = dataset.GENDER.isin(genders)
    selected = dataset[in_geography & in_years & in_ages & in_genders]
    return selected.reset_index(drop=True)
Beispiel #6
0
    def create_variant(self, variant_name, npp, geog_codes, year_range):
        """
        Apply NPP variant to SNPP: SNPP(v) = SNPP(0) * sum(a,g) [ NPP(v) / NPP(0) ]
        Preserves age-gender structure of SNPP data

        variant_name: name of the variant, passed to npp.variant_ratio()
        npp: object providing min_year() and variant_ratio()
        geog_codes: a single geography code (str) or a list-like of codes
        year_range: the years requested

        Years before npp.min_year() are returned unscaled, straight from
        self.filter(); a warning is printed when such rows exist.

        Returns one DataFrame with, for each geography in turn, the unscaled
        pre-NPP rows followed by the scaled rows. Row indices are not reset.
        An empty DataFrame is returned when there is nothing to report.
        """
        if isinstance(geog_codes, str):
            geog_codes = [geog_codes]

        pieces = []
        for geog_code in geog_codes:

            # split out any years prior to the NPP data (currently SNPP is 2014 based but NPP is 2016)
            (pre_range, in_range) = utils.split_range(year_range,
                                                      npp.min_year() - 1)

            # for any years prior to NPP we just use the SNPP data as-is (i.e. "ppp")
            pre_data = self.filter(geog_code, pre_range) if pre_range else None
            if pre_data is not None:
                # keep the pre-NPP rows whether or not anything falls in the NPP range
                pieces.append(pre_data)
                if len(pre_data) > 0:
                    print(
                        "WARNING: variant {} not applied for years {} that predate the NPP data"
                        .format(variant_name, pre_range))

            # move on to the next geography if there's nothing in the NPP range
            if not in_range:
                continue

            data = self.extrapolate(npp, geog_code, in_range).sort_values(
                ["C_AGE", "GENDER",
                 "PROJECTED_YEAR_NAME"]).reset_index(drop=True)

            scaling = npp.variant_ratio(variant_name, utils.country(geog_code),
                                        year_range).reset_index().sort_values([
                                            "C_AGE", "GENDER",
                                            "PROJECTED_YEAR_NAME"
                                        ])

            # both frames need the same number of rows for the multiplication below
            assert len(data) == len(scaling)
            # NOTE(review): the product aligns rows on index labels, and scaling keeps
            # its pre-sort labels after sort_values() - confirm that variant_ratio()
            # already returns rows in C_AGE/GENDER/PROJECTED_YEAR_NAME order
            data.OBS_VALUE = data.OBS_VALUE * scaling.OBS_VALUE

            pieces.append(data)

        if not pieces:
            return pd.DataFrame()

        # DataFrame.append no longer exists in pandas 2.x; concatenate once instead
        return pd.concat(pieces)
Beispiel #7
0
    def test_utils(self):
        """Exercise utils.split_range, trim_range, country, split_by_country and integerise."""
        # split_range: split the requested years at the final year reported by
        # self.snpp for utils.EN
        requested = range(2018, 2050)
        (within, beyond) = utils.split_range(requested, self.snpp.max_year(utils.EN))
        self.assertEqual(min(within), min(requested))
        self.assertEqual(max(within), 2029)
        self.assertEqual(min(beyond), 2030)
        self.assertEqual(max(beyond), max(requested))

        # trim_range: (arguments) -> expected list of years
        trim_cases = [
            ((2011, 1991, 2016), [2011]),
            ((2011.0, 1991, 2016), [2011]),
            (([2011], 1991, 2016), [2011]),
            (([2011.0], 1991, 2016), [2011]),
            ((np.array([1995, 2005, 2019]), 2001, 2011), [2005]),
            (([1969, 2111], 1991, 2016), []),
            ((range(1969, 2111), 2011, 2016), list(range(2011, 2017))),
        ]
        for arguments, expected in trim_cases:
            self.assertEqual(utils.trim_range(*arguments), expected)

        # country: code(s) -> expected list of country identifiers
        country_cases = [
            ("E09000001", ["en"]),
            (['E06000002', 'E09000001'], ["en"]),
            (['E06000002', 'N09000001', 'S12000033', 'W06000011'], ['en', 'ni', 'sc', 'wa']),
            (['E06000001', 'E06000002', 'N09000001', 'S12000033', 'W06000011'], ['en', 'ni', 'sc', 'wa']),
            (['E06000001', 'W06000011', 'X06000002', 'Y09000001', 'Z12000033'], ["en", "wa"]),
            ('A06000001', []),
        ]
        for codes, expected in country_cases:
            self.assertTrue(utils.country(codes) == expected)

        # split_by_country: each country key maps to the codes belonging to it
        mixed_codes = ['E06000001', 'E06000002', 'N09000001', 'S12000033', 'W06000011']
        by_country = utils.split_by_country(mixed_codes)
        expected_split = [
            (utils.EN, ['E06000001', 'E06000002']),
            (utils.WA, ['W06000011']),
            (utils.SC, ['S12000033']),
            (utils.NI, ['N09000001']),
        ]
        for key, expected in expected_split:
            self.assertTrue(by_country[key] == expected)

        # integerise: rounding every element down would give a total of 10, not 11
        rounded = utils.integerise(np.array([0.1, 0.2, 0.3, 0.4]) * 11)
        self.assertTrue(np.array_equal(rounded, [1, 2, 3, 5]))

        # the total 16.91 is rescaled to 17; 1.51 is expected to stay at 1
        # because 4.5 has the larger fractional part
        rounded = utils.integerise(np.array([1.1, 3.9, 4.5, 5.9, 1.51]))
        self.assertTrue(np.array_equal(rounded, [1, 4, 5, 6, 1]))

        # another example that preserves sum
        rounded = utils.integerise(np.array([1.01] * 100))
        self.assertTrue(sum(rounded) == 1.01 * 100)
        self.assertTrue(np.array_equal(np.unique(rounded), [1, 2]))
Beispiel #8
0
  def aggregate(self, geog_codes, years=None):
    """
    Returns aggregate counts of household for specified geographies and years

    geog_codes: a single geography code (str) or a list-like of codes
    years: year(s) to keep; passed through utils.trim_range() together with
           the min/max year of the first country found

    Returns a DataFrame with one row per (GEOGRAPHY_CODE, PROJECTED_YEAR_NAME)
    pair, holding the sum of the remaining columns over the matching rows.
    """
    # a bare string must be wrapped so that isin() treats it as one code
    if isinstance(geog_codes, str):
      geog_codes = [geog_codes]

    countries = utils.country(geog_codes)

    # TODO fix incorrect assumption is that all countries have the same year range
    years = utils.trim_range(years, self.min_year(countries[0]), self.max_year(countries[0]))

    # loop over datasets as needed, collecting the filtered rows of each
    matches = []
    for country in countries:
      dataset = self.data[country]
      mask = dataset.GEOGRAPHY_CODE.isin(geog_codes) & dataset.PROJECTED_YEAR_NAME.isin(years)
      matches.append(dataset[mask])

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead
    combined = pd.concat(matches, ignore_index=True, sort=False)
    return combined.groupby(["GEOGRAPHY_CODE", "PROJECTED_YEAR_NAME"]).sum().reset_index()
Beispiel #9
0
 def max_year(self, code):
   """
   Return the largest PROJECTED_YEAR_NAME in the dataset selected by the code.

   The dataset key is whatever utils.country(code) returns.
   """
   country = utils.country(code)
   projected_years = self.data[country].PROJECTED_YEAR_NAME
   return max(projected_years.unique())