def filter(self, geog_codes, years=None, ages=range(0, 91), genders=[1, 2]):
    """
    Get MYE detailed data for the given geographies, years, ages and genders.

    geog_codes: a single geography code (str) or a list-like of codes,
        matched against the GEOGRAPHY_CODE column
    years: requested year(s); passed through utils.trim_range together with
        self.min_year() and self.max_year() before use
    ages: scalar or list-like matched against the C_AGE column
    genders: scalar or list-like matched against the GENDER column

    Returns a new DataFrame built from copies of the matching rows of each
    year's dataset, with a PROJECTED_YEAR_NAME column holding the year each
    row came from and a freshly reset index. If no years remain after
    trimming, an empty DataFrame is returned.
    """
    # ensure array inputs (isin needs list-likes, not scalars)
    if isinstance(geog_codes, str):
        geog_codes = [geog_codes]
    if np.isscalar(ages):
        ages = [ages]
    if np.isscalar(genders):
        genders = [genders]

    years = utils.trim_range(years, self.min_year(), self.max_year())

    # Collect the per-year selections and concatenate once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0, and appending in a
    # loop re-copies the accumulated frame on every iteration.
    parts = []
    for year in years:
        # ensure the data is loaded
        self.__fetch_data(year)
        data = self.data[year]
        # ensure we return a copy!
        part = data[data.GEOGRAPHY_CODE.isin(geog_codes)
                    & data.C_AGE.isin(ages)
                    & data.GENDER.isin(genders)].copy()
        part["PROJECTED_YEAR_NAME"] = year
        parts.append(part)

    # pd.concat raises on an empty list, so keep the "no years" case explicit
    result = pd.concat(parts) if parts else pd.DataFrame()
    return result.reset_index(drop=True)
def filter(self, geog_codes, years=None, ages=range(0, 91), genders=[1, 2]):
    """
    Return the rows matching the given geographies, years, ages and genders.

    geog_codes: a single geography code (str) or a list-like of codes,
        matched against the GEOGRAPHY_CODE column
    years: requested year(s); trimmed by utils.trim_range to the year range
        of the first country found in geog_codes
    ages: list-like matched against the C_AGE column
    genders: list-like matched against the GENDER column

    Returns a single DataFrame, with a fresh index, holding the matching
    rows from each country's dataset.

    Raises IndexError if utils.country finds no country for geog_codes
    (countries[0] is read unconditionally).
    """
    # convert geog_codes to a list if a single value is supplied (for isin)
    if isinstance(geog_codes, str):
        geog_codes = [geog_codes]

    countries = utils.country(geog_codes)

    # TODO fix incorrect assumption is that all countries have the same year range
    years = utils.trim_range(years, self.min_year(countries[0]), self.max_year(countries[0]))

    # loop over datasets as needed, applying the filters to each
    frames = []
    for country in countries:
        data = self.data[country]
        frames.append(data[data.GEOGRAPHY_CODE.isin(geog_codes)
                           & data.PROJECTED_YEAR_NAME.isin(years)
                           & data.C_AGE.isin(ages)
                           & data.GENDER.isin(genders)])

    # One concat replaces the repeated DataFrame.append calls (removed in
    # pandas 2.0). countries has at least one entry here (countries[0] above
    # would already have raised), so frames is never empty.
    return pd.concat(frames, ignore_index=True, sort=False)
def detail(self, variant_name, geog, years=None, ages=range(0, 91), genders=[1, 2]):
    """
    Return a subset of the raw data for one variant.

    variant_name: must be one of NPPData.VARIANTS
    geog: a single key (str) or a list of keys into utils.CODES; each is
        translated to the geography code used in the GEOGRAPHY_CODE column
    years: requested year(s); trimmed to [self.min_year(), self.max_year()],
        years outside that range are silently dropped
    ages: list-like matched against the C_AGE column
    genders: list-like matched against the GENDER column

    Returns the matching rows with a freshly reset index.

    Raises RuntimeError if variant_name is not in NPPData.VARIANTS, and
    KeyError if an entry of geog is not a key of utils.CODES.
    """
    if variant_name not in NPPData.VARIANTS:
        # str() so that a non-string name still produces this error rather
        # than a TypeError from the concatenation
        raise RuntimeError("invalid variant name: " + str(variant_name))

    # make years a valid range (this *silently* removes invalid years)
    years = utils.trim_range(years, self.min_year(), self.max_year())

    # The name is known to be a standard variant at this point, so the only
    # remaining case is that its data has not been loaded yet.
    if variant_name not in self.data:
        self.__load_variant(variant_name)

    # apply filters
    if isinstance(geog, str):
        geog = [geog]
    geog_codes = [utils.CODES[g] for g in geog]

    data = self.data[variant_name]
    return data[data.GEOGRAPHY_CODE.isin(geog_codes)
                & data.PROJECTED_YEAR_NAME.isin(years)
                & data.C_AGE.isin(ages)
                & data.GENDER.isin(genders)].reset_index(drop=True)
def test_utils(self):
    """
    Check the utils helpers: split_range, trim_range, country,
    split_by_country and integerise.
    """
    # split_range: years up to the SNPP maximum for England go in the first
    # part, the remainder in the second
    year_range = range(2018, 2050)
    (in_range, ex_range) = utils.split_range(year_range, self.snpp.max_year(utils.EN))
    self.assertEqual(min(in_range), min(year_range))
    self.assertEqual(max(in_range), 2029)
    self.assertEqual(min(ex_range), 2030)
    self.assertEqual(max(ex_range), max(year_range))

    # trim_range: scalars, lists, arrays and ranges all come back as a list
    # restricted to the inclusive [min, max] bounds
    self.assertEqual(utils.trim_range(2011, 1991, 2016), [2011])
    self.assertEqual(utils.trim_range(2011.0, 1991, 2016), [2011])
    self.assertEqual(utils.trim_range([2011], 1991, 2016), [2011])
    self.assertEqual(utils.trim_range([2011.0], 1991, 2016), [2011])
    self.assertEqual(utils.trim_range(np.array([1995, 2005, 2019]), 2001, 2011), [2005])
    self.assertEqual(utils.trim_range([1969, 2111], 1991, 2016), [])
    self.assertEqual(utils.trim_range(range(1969, 2111), 2011, 2016), list(range(2011, 2017)))

    # country: distinct countries for the codes given, unrecognised codes ignored.
    # assertEqual (rather than assertTrue(a == b)) reports both values on failure.
    codes = "E09000001"
    self.assertEqual(utils.country(codes), ["en"])
    codes = ['E06000002', 'E09000001']
    self.assertEqual(utils.country(codes), ["en"])
    codes = ['E06000002', 'N09000001', 'S12000033', 'W06000011']
    self.assertEqual(utils.country(codes), ['en', 'ni', 'sc', 'wa'])
    codes = ['E06000001', 'E06000002', 'N09000001', 'S12000033', 'W06000011']
    self.assertEqual(utils.country(codes), ['en', 'ni', 'sc', 'wa'])
    codes = ['E06000001', 'W06000011', 'X06000002', 'Y09000001', 'Z12000033']
    self.assertEqual(utils.country(codes), ["en", "wa"])
    codes = 'A06000001'
    self.assertEqual(utils.country(codes), [])

    # split_by_country: codes grouped under their country key
    codes = ['E06000001', 'E06000002', 'N09000001', 'S12000033', 'W06000011']
    split = utils.split_by_country(codes)
    self.assertEqual(split[utils.EN], ['E06000001', 'E06000002'])
    self.assertEqual(split[utils.WA], ['W06000011'])
    self.assertEqual(split[utils.SC], ['S12000033'])
    self.assertEqual(split[utils.NI], ['N09000001'])

    # naively, each element would be rounded down, making the total 10
    fractional = np.array([0.1, 0.2, 0.3, 0.4]) * 11
    integral = utils.integerise(fractional)
    self.assertTrue(np.array_equal(integral, [1, 2, 3, 5]))

    # 1.51 is NOT increased because 4.5 has a larger fractional part
    # when total is rescaled to 17 from 16.91
    fractional = np.array([1.1, 3.9, 4.5, 5.9, 1.51])
    integral = utils.integerise(fractional)
    self.assertTrue(np.array_equal(integral, [1, 4, 5, 6, 1]))

    # another example that preserves sum
    fractional = np.array([1.01] * 100)
    integral = utils.integerise(fractional)
    # 1.01 * 100, written as an exact integer to avoid a float comparison
    self.assertEqual(sum(integral), 101)
    self.assertTrue(np.array_equal(np.unique(integral), [1, 2]))
def filter(self, geog_codes, years=None, ages=range(0, 91), genders=[1, 2]):
    """
    Return the rows matching the given geographies, years, ages and genders.

    geog_codes: a single geography code (str) or a list-like of codes,
        matched against the GEOGRAPHY_CODE column
    years: requested year(s), or None; an empty collection is treated like
        None. Trimmed by utils.trim_range to the year range of the first
        country found in geog_codes
    ages: scalar or list-like matched against the C_AGE column
    genders: scalar or list-like matched against the GENDER column

    Returns a single DataFrame, with a fresh index, holding the matching
    rows from each country's dataset.

    Raises ValueError if any requested code has no rows in the result, and
    IndexError if utils.country finds no country for geog_codes
    (countries[0] is read unconditionally).
    """
    # convert inputs to arrays if single values supplied (for isin)
    if isinstance(geog_codes, str):
        geog_codes = [geog_codes]
    if np.isscalar(ages):
        ages = [ages]
    if np.isscalar(genders):
        genders = [genders]

    # An empty collection of years is replaced by None, because an empty list
    # was causing a problem in utils.trim_range() below. The emptiness test is
    # explicit: plain truthiness (`not years`) raises for a numpy array with
    # more than one element.
    if years is not None and not np.isscalar(years) and len(years) == 0:
        years = None

    countries = utils.country(geog_codes)

    # TODO fix incorrect assumption is that all countries have the same year range
    years = utils.trim_range(years, self.min_year(countries[0]), self.max_year(countries[0]))

    # loop over datasets as needed, applying the filters to each
    frames = []
    for country in countries:
        data = self.data[country]
        frames.append(data[data.GEOGRAPHY_CODE.isin(geog_codes)
                           & data.PROJECTED_YEAR_NAME.isin(years)
                           & data.C_AGE.isin(ages)
                           & data.GENDER.isin(genders)])

    # One concat replaces the repeated DataFrame.append calls (removed in
    # pandas 2.0). countries has at least one entry here (countries[0] above
    # would already have raised), so frames is never empty.
    retval = pd.concat(frames, ignore_index=True, sort=False)

    # check for any codes requested that weren't present (this check is far
    # easier to do on the result)
    invalid_codes = np.setdiff1d(geog_codes, retval.GEOGRAPHY_CODE.unique())
    if len(invalid_codes) > 0:
        raise ValueError(
            "Filter for LAD code(s): %s for years %s returned no data (check also age/gender filters)"
            % (str(invalid_codes), str(years)))

    return retval
def filter(self, geog_codes, years=None, ages=range(0,91), genders=[1,2]):
    """
    Return the rows matching the given geographies, years, ages and genders.

    geog_codes: a single geography code (str) or a list-like of codes; all
        codes are assumed to belong to the same country as the first one
    years: requested year(s); trimmed by utils.trim_range to the range given
        by self.min_year / self.max_year for the first code
    ages: list-like matched against the C_AGE column
    genders: list-like matched against the GENDER column

    Returns the matching rows with a freshly reset index.
    """
    # a lone code is wrapped in a list so that isin can be applied to it
    if isinstance(geog_codes, str):
        geog_codes = [geog_codes]

    # the assumption is that all geog_codes are in same country,
    # so the first code selects both the dataset and the year range
    country = utils.country(geog_codes[0])
    earliest = self.min_year(geog_codes[0])
    latest = self.max_year(geog_codes[0])
    years = utils.trim_range(years, earliest, latest)

    # build the row selection column by column, then apply it in one go
    table = self.data[country]
    code_match = table.GEOGRAPHY_CODE.isin(geog_codes)
    year_match = table.PROJECTED_YEAR_NAME.isin(years)
    age_match = table.C_AGE.isin(ages)
    gender_match = table.GENDER.isin(genders)
    selected = table[code_match & year_match & age_match & gender_match]
    return selected.reset_index(drop=True)
def aggregate(self, geog_codes, years=None):
    """
    Returns aggregate counts of household for specified geographies and years

    geog_codes: a single geography code (str) or a list-like of codes,
        matched against the GEOGRAPHY_CODE column
    years: requested year(s); trimmed by utils.trim_range to the year range
        of the first country found in geog_codes

    Returns a DataFrame with one row per (GEOGRAPHY_CODE, PROJECTED_YEAR_NAME)
    pair, the remaining columns summed within each pair.

    Raises IndexError if utils.country finds no country for geog_codes
    (countries[0] is read unconditionally).
    """
    # convert geog_codes to a list if a single value is supplied (for isin)
    if isinstance(geog_codes, str):
        geog_codes = [geog_codes]

    countries = utils.country(geog_codes)

    # TODO fix incorrect assumption is that all countries have the same year range
    years = utils.trim_range(years, self.min_year(countries[0]), self.max_year(countries[0]))

    # loop over datasets as needed, applying the filters to each
    frames = []
    for country in countries:
        data = self.data[country]
        frames.append(data[data.GEOGRAPHY_CODE.isin(geog_codes)
                           & data.PROJECTED_YEAR_NAME.isin(years)])

    # One concat replaces the repeated DataFrame.append calls (removed in
    # pandas 2.0). countries has at least one entry here (countries[0] above
    # would already have raised), so frames is never empty.
    retval = pd.concat(frames, ignore_index=True, sort=False)

    return retval.groupby(["GEOGRAPHY_CODE", "PROJECTED_YEAR_NAME"]).sum().reset_index()