Exemple #1
0
    def filter(self,
               geog_codes,
               years=None,
               ages=range(0, 91),
               genders=[1, 2]):
        """
        Get MYE detailed data for the requested years.

        Args:
            geog_codes: a single geography code (str) or a list-like of codes,
                matched against the GEOGRAPHY_CODE column.
            years: the year(s) wanted; passed through utils.trim_range together
                with self.min_year() and self.max_year() before being used.
            ages: a single age or a list-like of ages, matched against C_AGE
                (default range(0, 91)).
            genders: a single gender or a list-like of genders, matched against
                GENDER (default [1, 2]). The default list is never mutated.

        Returns:
            A new DataFrame containing the matching rows of every requested
            year, with PROJECTED_YEAR_NAME set to the year each row was taken
            from and a fresh 0..n-1 index. If no years remain after trimming,
            an empty DataFrame is returned.
        """
        # isin() only accepts list-likes, so wrap single values
        if isinstance(geog_codes, str):
            geog_codes = [geog_codes]
        if np.isscalar(ages):
            ages = [ages]
        if np.isscalar(genders):
            genders = [genders]

        years = utils.trim_range(years, self.min_year(), self.max_year())

        parts = []
        for year in years:
            # ensure the data is loaded
            self.__fetch_data(year)

            frame = self.data[year]
            mask = (frame.GEOGRAPHY_CODE.isin(geog_codes)
                    & frame.C_AGE.isin(ages)
                    & frame.GENDER.isin(genders))
            # take a copy so that adding the year column never modifies the
            # frame held in self.data
            part = frame[mask].copy()
            part["PROJECTED_YEAR_NAME"] = year
            parts.append(part)

        # DataFrame.append was removed in pandas 2.0: concatenate once instead.
        # pd.concat refuses an empty list, so fall back to an empty frame.
        result = pd.concat(parts) if parts else pd.DataFrame()
        return result.reset_index(drop=True)
Exemple #2
0
    def filter(self,
               geog_codes,
               years=None,
               ages=range(0, 91),
               genders=[1, 2]):
        """
        Return the rows matching the given geographies, years, ages and genders.

        Args:
            geog_codes: a single geography code (str) or a list-like of codes,
                matched against the GEOGRAPHY_CODE column.
            years: the year(s) wanted; passed through utils.trim_range using the
                min/max year of the first country returned by utils.country.
            ages: list-like of ages matched against C_AGE (default range(0, 91)).
            genders: list-like of genders matched against GENDER (default [1, 2]).

        Returns:
            A new DataFrame with a fresh 0..n-1 index, built from the matching
            rows of each country's dataset in the order utils.country returns
            the countries.

        NOTE(review): if utils.country returns an empty list, countries[0]
        raises IndexError before any filtering happens.
        """
        # isin() only accepts list-likes, so wrap a single code
        if isinstance(geog_codes, str):
            geog_codes = [geog_codes]

        countries = utils.country(geog_codes)

        # TODO fix incorrect assumption is that all countries have the same year range
        years = utils.trim_range(years, self.min_year(countries[0]),
                                 self.max_year(countries[0]))

        # apply the filters to each country's dataset in turn
        parts = []
        for country in countries:
            frame = self.data[country]
            mask = (frame.GEOGRAPHY_CODE.isin(geog_codes)
                    & frame.PROJECTED_YEAR_NAME.isin(years)
                    & frame.C_AGE.isin(ages)
                    & frame.GENDER.isin(genders))
            parts.append(frame[mask])

        # DataFrame.append was removed in pandas 2.0: concatenate once instead
        return pd.concat(parts, ignore_index=True, sort=False)
Exemple #3
0
    def detail(self,
               variant_name,
               geog,
               years=None,
               ages=range(0, 91),
               genders=[1, 2]):
        """
        Return a subset of the raw data for one variant.

        Args:
            variant_name: the variant to query; must be in NPPData.VARIANTS.
            geog: a single geography key (str) or a list-like of keys; each is
                translated to a geography code via utils.CODES.
            years: the year(s) wanted; passed through utils.trim_range with
                self.min_year() and self.max_year(), which silently removes
                invalid years.
            ages: list-like of ages matched against C_AGE (default range(0, 91)).
            genders: list-like of genders matched against GENDER (default [1, 2]).

        Returns:
            A DataFrame of the matching rows with a fresh 0..n-1 index.

        Raises:
            RuntimeError: if variant_name is not in NPPData.VARIANTS (custom
                variants are not yet implemented).
        """
        if variant_name not in NPPData.VARIANTS:
            # str() keeps the message usable even for a non-string name
            raise RuntimeError("invalid variant name: " + str(variant_name))
        # make years a valid range (this *silently* removes invalid years)
        years = utils.trim_range(years, self.min_year(), self.max_year())

        # the name was validated above, so a variant missing from self.data is
        # a standard variant that simply has not been loaded yet
        if variant_name not in self.data:
            self.__load_variant(variant_name)

        # apply filters
        if isinstance(geog, str):
            geog = [geog]
        geog_codes = [utils.CODES[g] for g in geog]

        frame = self.data[variant_name]
        mask = (frame.GEOGRAPHY_CODE.isin(geog_codes)
                & frame.PROJECTED_YEAR_NAME.isin(years)
                & frame.C_AGE.isin(ages)
                & frame.GENDER.isin(genders))
        return frame[mask].reset_index(drop=True)
Exemple #4
0
    def test_utils(self):
        """Check utils.split_range, trim_range, country, split_by_country and integerise."""
        # split_range: partition the years at self.snpp.max_year(utils.EN)
        year_range = range(2018, 2050)
        (in_range, ex_range) = utils.split_range(year_range, self.snpp.max_year(utils.EN))
        self.assertEqual(min(in_range), min(year_range))
        self.assertEqual(max(in_range), 2029)
        self.assertEqual(min(ex_range), 2030)
        self.assertEqual(max(ex_range), max(year_range))

        # trim_range: scalars, lists, arrays and ranges all come back as a list
        # restricted to the inclusive [min, max] interval
        self.assertEqual(utils.trim_range(2011, 1991, 2016), [2011])
        self.assertEqual(utils.trim_range(2011.0, 1991, 2016), [2011])
        self.assertEqual(utils.trim_range([2011], 1991, 2016), [2011])
        self.assertEqual(utils.trim_range([2011.0], 1991, 2016), [2011])
        self.assertEqual(utils.trim_range(np.array([1995, 2005, 2019]), 2001, 2011), [2005])
        self.assertEqual(utils.trim_range([1969, 2111], 1991, 2016), [])
        self.assertEqual(utils.trim_range(range(1969, 2111), 2011, 2016), list(range(2011, 2017)))

        # country: (input codes, expected list of countries)
        country_cases = [
            ("E09000001", ["en"]),
            (['E06000002', 'E09000001'], ["en"]),
            (['E06000002', 'N09000001', 'S12000033', 'W06000011'], ['en', 'ni', 'sc', 'wa']),
            (['E06000001', 'E06000002', 'N09000001', 'S12000033', 'W06000011'], ['en', 'ni', 'sc', 'wa']),
            (['E06000001', 'W06000011', 'X06000002', 'Y09000001', 'Z12000033'], ["en", "wa"]),
            ('A06000001', []),
        ]
        for codes, expected in country_cases:
            # assertEqual (rather than assertTrue(a == b)) reports both values on failure
            self.assertEqual(utils.country(codes), expected)

        # split_by_country: codes grouped under their country key
        codes = ['E06000001', 'E06000002', 'N09000001', 'S12000033', 'W06000011']
        split = utils.split_by_country(codes)
        self.assertEqual(split[utils.EN], ['E06000001', 'E06000002'])
        self.assertEqual(split[utils.WA], ['W06000011'])
        self.assertEqual(split[utils.SC], ['S12000033'])
        self.assertEqual(split[utils.NI], ['N09000001'])

        # naively, each element would be rounded down, making the total 10
        fractional = np.array([0.1, 0.2, 0.3, 0.4]) * 11
        integral = utils.integerise(fractional)
        self.assertTrue(np.array_equal(integral, [1, 2, 3, 5]))

        # 1.51 is NOT increased because 4.5 has a larger fractional part when total is rescaled to 17 from 16.91
        fractional = np.array([1.1, 3.9, 4.5, 5.9, 1.51])
        integral = utils.integerise(fractional)
        self.assertTrue(np.array_equal(integral, [1, 4, 5, 6, 1]))

        # another example that preserves sum
        fractional = np.array([1.01] * 100)
        integral = utils.integerise(fractional)
        self.assertEqual(sum(integral), 1.01 * 100)
        self.assertTrue(np.array_equal(np.unique(integral), [1, 2]))
Exemple #5
0
    def filter(self,
               geog_codes,
               years=None,
               ages=range(0, 91),
               genders=[1, 2]):
        """
        Return the rows matching the given geographies, years, ages and genders.

        Args:
            geog_codes: a single geography code (str) or a list-like of codes,
                matched against the GEOGRAPHY_CODE column.
            years: the year(s) wanted. None or an empty container is handed to
                utils.trim_range as None; otherwise the value is trimmed using
                the min/max year of the first country found.
            ages: a single age or a list-like of ages, matched against C_AGE
                (default range(0, 91)).
            genders: a single gender or a list-like of genders, matched against
                GENDER (default [1, 2]).

        Returns:
            A new DataFrame with a fresh 0..n-1 index, built from the matching
            rows of each country's dataset.

        Raises:
            ValueError: if no country is recognised for the requested codes, or
                if any requested code is absent from the filtered result.
        """
        # convert inputs to arrays if single values supplied (for isin)
        if isinstance(geog_codes, str):
            geog_codes = [geog_codes]

        if np.isscalar(ages):
            ages = [ages]

        if np.isscalar(genders):
            genders = [genders]

        # An empty years container is not recognised as "not specified" by
        # utils.trim_range, so normalise it to None. np.size is used instead of
        # truthiness because `not years` raises for multi-element numpy arrays.
        if years is not None and np.size(years) == 0:
            years = None

        countries = utils.country(geog_codes)

        # Without this guard an input with no recognised country would fail
        # with an IndexError on countries[0] rather than the ValueError this
        # method otherwise uses for invalid codes.
        if len(countries) == 0:
            raise ValueError(
                "Filter for LAD code(s): %s returned no data (no recognised country)"
                % str(geog_codes))

        # TODO fix incorrect assumption is that all countries have the same year range
        years = utils.trim_range(years, self.min_year(countries[0]),
                                 self.max_year(countries[0]))

        # apply the filters to each country's dataset in turn
        parts = []
        for country in countries:
            frame = self.data[country]
            mask = (frame.GEOGRAPHY_CODE.isin(geog_codes)
                    & frame.PROJECTED_YEAR_NAME.isin(years)
                    & frame.C_AGE.isin(ages)
                    & frame.GENDER.isin(genders))
            parts.append(frame[mask])

        # DataFrame.append was removed in pandas 2.0: concatenate once instead
        retval = pd.concat(parts, ignore_index=True, sort=False)

        # check for any codes requested that weren't present (this check is far easier to do on the result)
        invalid_codes = np.setdiff1d(geog_codes,
                                     retval.GEOGRAPHY_CODE.unique())
        if len(invalid_codes) > 0:
            raise ValueError(
                "Filter for LAD code(s): %s for years %s returned no data (check also age/gender filters)"
                % (str(invalid_codes), str(years)))

        return retval
Exemple #6
0
  def filter(self, geog_codes, years=None, ages=range(0,91), genders=[1,2]):
    """
    Return the rows of one country's dataset matching the given filters.

    geog_codes may be a single code (str) or a list-like of codes; all codes
    are assumed to belong to the same country as the first one. years is
    trimmed by utils.trim_range using the min/max year looked up for the first
    code. ages and genders are list-likes matched against C_AGE and GENDER.
    The result is a DataFrame with a fresh 0..n-1 index.

    NOTE(review): the value returned by utils.country is used directly as a
    key into self.data, so it must be a single hashable key here - confirm.
    """
    # isin() only accepts list-likes, so promote a lone code string to a list
    if isinstance(geog_codes, str):
      geog_codes = [geog_codes]

    # the first code stands in for the whole request (single-country assumption)
    first_code = geog_codes[0]
    country = utils.country(first_code)

    years = utils.trim_range(years, self.min_year(first_code), self.max_year(first_code))

    # build the combined row mask, then select and renumber
    frame = self.data[country]
    selected = (frame.GEOGRAPHY_CODE.isin(geog_codes)
                & frame.PROJECTED_YEAR_NAME.isin(years)
                & frame.C_AGE.isin(ages)
                & frame.GENDER.isin(genders))
    return frame[selected].reset_index(drop=True)
Exemple #7
0
  def aggregate(self, geog_codes, years=None):
    """
    Returns aggregate counts of household for specified geographies and years

    geog_codes may be a single code (str) or a list-like of codes, matched
    against GEOGRAPHY_CODE. years is trimmed by utils.trim_range using the
    min/max year of the first country returned by utils.country.

    The matching rows of every country's dataset are combined, grouped by
    GEOGRAPHY_CODE and PROJECTED_YEAR_NAME and summed; the group keys are
    returned as ordinary columns.

    NOTE(review): if utils.country returns an empty list, countries[0]
    raises IndexError before any filtering happens.
    """
    # isin() only accepts list-likes, so wrap a single code
    if isinstance(geog_codes, str):
      geog_codes = [geog_codes]

    countries = utils.country(geog_codes)

    # TODO fix incorrect assumption is that all countries have the same year range
    years = utils.trim_range(years, self.min_year(countries[0]), self.max_year(countries[0]))

    # apply the filters to each country's dataset in turn
    parts = []
    for country in countries:
      frame = self.data[country]
      mask = frame.GEOGRAPHY_CODE.isin(geog_codes) & frame.PROJECTED_YEAR_NAME.isin(years)
      parts.append(frame[mask])

    # DataFrame.append was removed in pandas 2.0: concatenate once instead
    combined = pd.concat(parts, ignore_index=True, sort=False)
    return combined.groupby(["GEOGRAPHY_CODE", "PROJECTED_YEAR_NAME"]).sum().reset_index()