def test_age_group(self):
    """Check the labels `age_group` produces when called with bin_count=10 and max_age=100."""
    opts = {"bin_count": 10, "max_age": 100}

    # (expected label, input age) pairs; every age from 90 upwards is expected to land in
    # the open-ended "90-" group and a negative age is expected to yield None.
    expectations = [
        ("0-9", 0),
        ("0-9", 0.0),
        ("0-9", 9),
        ("10-19", 10),
        ("10-19", 19),
        ("90-", 90),
        ("90-", 100),
        ("90-", 1e9),
        (None, -1),
    ]
    for expected, age in expectations:
        self.assertEqual(expected, age_group(age, **opts))
def parse(self, sources: Dict[Any, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """
    Fetch the country, region and department level data for France and flatten the nested
    per-age / per-sex breakdowns of tests performed and positive tests into one row per
    key, date, age group and sex.

    Arguments:
        sources: Mapping whose entry `0` is handed to the download helpers as `url_tpl`
            (presumably a URL template -- confirm against the helpers).
        aux: Auxiliary tables; only `aux["metadata"]` is read, filtered to rows whose
            `country_code` is "FR".
        parse_opts: Accepted for interface compatibility; not used here.
    Returns:
        DataFrame: Columns `key`, `date`, `age`, `sex`, `new_tested` and `new_confirmed`.
    """
    url_tpl = sources[0]
    metadata = aux["metadata"]
    metadata = metadata[metadata["country_code"] == "FR"]

    fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
    fr_iso_map = dict(zip(fr_isos["iso_code"], fr_isos["region_code"]))
    fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
    regions_iter = fr_codes["subregion1_code"].unique()
    deps_iter = [record for _, record in fr_codes.iterrows()]

    column_adapter = {
        "key": "key",
        "date": "date",
        "testsRealisesDetails": "_breakdown_tested",
        "testsPositifsDetails": "_breakdown_confirmed",
    }

    # Get country level data
    country = _get_country(url_tpl, column_adapter)

    # Get region level data
    get_region_func = partial(_get_region, url_tpl, column_adapter, fr_iso_map)
    regions = concat(list(thread_map(get_region_func, regions_iter)))

    # Get department level data
    get_department_func = partial(_get_department, url_tpl, column_adapter)
    departments = concat(list(thread_map(get_department_func, deps_iter)))

    data = concat([country, regions, departments])
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

    # Assign the filled column back instead of calling `fillna(..., inplace=True)` on the
    # column selection: the chained in-place form operates on a temporary under pandas
    # Copy-on-Write and would leave the NaN values in `data`, which the loop below would
    # then try to iterate.
    for column in ("_breakdown_tested", "_breakdown_confirmed"):
        data[column] = data[column].fillna("")

    # Unpack every breakdown item into its own record, separately for each statistic
    records: Dict[str, List] = {"confirmed": [], "tested": []}
    for key, row in data.set_index("key").iterrows():
        for statistic in records:
            breakdown = row[f"_breakdown_{statistic}"]
            if breakdown != "":
                for item in breakdown:
                    records[statistic].append(
                        {
                            "key": key,
                            "date": row["date"],
                            "age": item["age"],
                            "sex": item.get("sexe"),
                            f"new_{statistic}": item["value"],
                        }
                    )

    df1 = DataFrame.from_records(records["tested"])
    df2 = DataFrame.from_records(records["confirmed"])
    data = df1.merge(df2, how="outer")

    # Drop the rows labelled with these age codes; copy so the column assignments below
    # write to an independent frame rather than a slice of the merged one.
    data = data[~data["age"].isin(["0", "A", "B", "C", "D", "E"])].copy()
    data["age"] = data["age"].apply(lambda x: age_group(safe_int_cast(x)))

    # Any sex label other than "h" / "f" (including a missing one) maps to "sex_unknown"
    sex_labels = {"h": "male", "f": "female"}
    data["sex"] = data["sex"].apply(lambda x: sex_labels.get(x, "sex_unknown"))
    return data
def test_age_group_different_bins(self):
    """Check `age_group` labels for a bin size of 10 combined with several age cutoffs."""
    # (expected label, input age, age_cutoff) triples
    cases = [
        ("0-9", 0, 10),
        ("0-9", 0, 70),
        ("0-9", 0, 100),
        ("10-19", 10, 70),
        ("10-19", 10, 100),
        # An age equal to the cutoff is expected to fall in the open-ended group
        ("10-", 10, 10),
        ("70-", 70, 70),
        ("100-", 100, 100),
    ]
    for expected, age, cutoff in cases:
        self.assertEqual(expected, age_group(age, bin_size=10, age_cutoff=cutoff))
def _default_age_adapter(value: Any) -> str: if isna(value): return "age_unknown" try: value = str(value) if re.match(r"\d+(\.\d*)?", value): return age_group(safe_int_cast(value)) if re.match(r"\d\d?\-\d*", value): return value except ValueError: pass return "age_unknown"
def _default_age_adapter(value: Any) -> str: if isna(value): return "age_unknown" try: value_int = safe_int_cast(value) if value_int is not None: return age_group(value_int) if re.match(r"\d+\-\d*", value): return value except ValueError: pass return "age_unknown"
def _default_age_adapter(value: Any) -> str: if isna(value): return "age_unknown" # If the value is already in the form of an age group, return as-is if isinstance(value, str) and re.match(r"^\d+\-\d*$", value): return value # Otherwise assume it's a number and return the corresponding age group try: value_int = safe_int_cast(value) if value_int is not None: return age_group(value_int) except ValueError: pass return "age_unknown"
def test_age_group(self):
    """Check the labels `age_group` produces when called with bin_count=10 and age_cutoff=90."""
    opts = {"bin_count": 10, "age_cutoff": 90}

    # (expected label, input age) pairs; ages at or above the cutoff are expected to share
    # the open-ended "90-" group, while negative and missing ages are expected to yield None.
    expectations = [
        ("0-9", 0),
        ("0-9", 0.0),
        ("0-9", 9),
        ("10-19", 10),
        ("10-19", 19),
        ("90-", 90),
        ("90-", 100),
        ("90-", 110),
        ("90-", 1e9),
        (None, -1),
        (None, None),
        (None, numpy.nan),
    ]
    for expected, age in expectations:
        self.assertEqual(expected, age_group(age, **opts))
def test_age_group_standard(self):
    """Check `age_group` with bin_size=10 and age_cutoff=90, including rejected inputs."""
    opts = {"bin_size": 10, "age_cutoff": 90}

    # (expected label, input age) pairs for ages that map onto a group
    expectations = [
        ("0-9", 0),
        ("0-9", 0.0),
        ("0-9", 9),
        ("10-19", 10),
        ("10-19", 19),
        ("90-", 90),
        ("90-", 100),
        ("90-", 110),
        ("90-", 1e9),
    ]
    for expected, age in expectations:
        self.assertEqual(expected, age_group(age, **opts))

    # Negative and missing ages are expected to be rejected with a ValueError
    for invalid_age in (-1, None, numpy.nan):
        with self.assertRaises(ValueError):
            age_group(invalid_age, **opts)
def _aggregate_population(data: Series) -> Dict[str, int]:
    """
    WorldPop data is "double stacked" by breaking down by age *and* sex, whereas we only want
    a breakdown of age *or* sex. This function converts the columns:
    `[ m_0, m_5, ... f_0, f_5, ... ]`
    Into:
    `[ population, population_male, population_female, population_age_00_09,
    population_age_10_19, ... ]`

    Arguments:
        data: A single record holding a `key` entry plus one `<sex>_<age>` entry per
            sex / age combination, where `<sex>` is "m" or "f".
    Returns:
        Dict[str, int]: The record's `key` together with the total, per-sex and per-age-bucket
        population counts.
    Raises:
        ValueError: If a column's sex label is neither "m" nor "f".
    """
    age_bucket_size = 10
    age_bucket_count = 10
    age_bucket_pairs = [
        (i * age_bucket_size, (i + 1) * age_bucket_size - 1) for i in range(age_bucket_count)
    ]

    aggregated_values = {
        "key": data["key"],
        "population": 0,
        "population_male": 0,
        "population_female": 0,
        **{f"population_age_{lo:02d}_{hi:02d}": 0 for lo, hi in age_bucket_pairs},
    }

    # `Series.items()` is used because `Series.iteritems()` no longer exists in pandas 2.x
    for col, val in data.items():
        # Skip over the key item
        if col == "key":
            continue

        # Total population is the sum of all populations
        aggregated_values["population"] += val

        # Get age/sex info from column name, splitting only on the first underscore
        sex, age = col.split("_", 1)

        # Add male and female populations separately
        if sex == "m":
            aggregated_values["population_male"] += val
        elif sex == "f":
            aggregated_values["population_female"] += val
        else:
            raise ValueError(f"Unexpected sex label encountered: {sex}")

        # Go over all the age buckets and add them separately
        # Since the WorldPop buckets are [0, 1-5, 5-9, 10-14, ...] we can just use the lower
        # range of the bucket as the age value to convert into our normalized buckets which are
        # [0-9, 10-19, 20-29, ...]
        age_bucket = age_group(int(age))

        # Make sure that the age buckets all follow the pattern \d\d_\d\d
        age_bucket = "_".join(f"{int(bound):02d}" for bound in age_bucket.split("-", 2))
        aggregated_values[f"population_age_{age_bucket}"] += val

    return aggregated_values
def _parse_age_bin(age_bin: str) -> str: try: return age_group(int(age_bin.split("-", 1)[0])) except: return "age_unknown"
def _age_adapter(age: str) -> str: try: age = safe_int_cast(str(age).replace("+", "-").split("-")[0]) return age_group(age, bin_size=10, age_cutoff=70) except: return "age_unknown"