Ejemplo n.º 1
0
 def test_age_group(self):
     """Check the labels returned by `age_group` with `bin_count=10` and `max_age=100`."""
     # (expected label, input age) pairs, evaluated in the listed order
     cases = [
         ("0-9", 0),
         ("0-9", 0.0),
         ("0-9", 9),
         ("10-19", 10),
         ("10-19", 19),
         ("90-", 90),
         ("90-", 100),
         ("90-", 1e9),
         (None, -1),
     ]
     for expected, age in cases:
         self.assertEqual(expected, age_group(age, bin_count=10, max_age=100))
Ejemplo n.º 2
0
    def parse(self, sources: Dict[Any, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        """
        Build a table of new tests and new confirmed cases stratified by age and sex.

        Country, region and department level data are fetched using the URL template found in
        `sources[0]`, then the per-record breakdown lists are flattened into one row per
        (key, date, age, sex) combination.

        Arguments:
            sources: Mapping whose entry `0` is the URL template passed to the fetch helpers.
            aux: Auxiliary tables; only `aux["metadata"]` is read, filtered to country code "FR".
            parse_opts: Unused by this method.
        Returns:
            DataFrame: Records with the columns `key`, `date`, `age`, `sex`, `new_tested` and
            `new_confirmed`.
        """
        url_tpl = sources[0]
        metadata = aux["metadata"]
        metadata = metadata[metadata["country_code"] == "FR"]

        fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
        fr_iso_map = dict(zip(fr_isos["iso_code"], fr_isos["region_code"]))
        fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
        regions_iter = fr_codes["subregion1_code"].unique()
        deps_iter = [record for _, record in fr_codes.iterrows()]

        column_adapter = {
            "key": "key",
            "date": "date",
            "testsRealisesDetails": "_breakdown_tested",
            "testsPositifsDetails": "_breakdown_confirmed",
        }

        # Get country level data
        country = _get_country(url_tpl, column_adapter)

        # Get region level data
        get_region_func = partial(_get_region, url_tpl, column_adapter, fr_iso_map)
        regions = concat(list(thread_map(get_region_func, regions_iter)))

        # Get department level data
        get_department_func = partial(_get_department, url_tpl, column_adapter)
        departments = concat(list(thread_map(get_department_func, deps_iter)))

        data = concat([country, regions, departments])
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

        # Assign the filled columns back instead of calling `fillna(..., inplace=True)` on a
        # column selection: the chained in-place form is not guaranteed to modify `data`
        data["_breakdown_tested"] = data["_breakdown_tested"].fillna("")
        data["_breakdown_confirmed"] = data["_breakdown_confirmed"].fillna("")

        # Flatten the breakdown lists into one record per item, separately for each statistic
        records: Dict[str, List] = {"confirmed": [], "tested": []}
        for key, row in data.set_index("key").iterrows():
            for statistic in records:
                breakdown = row[f"_breakdown_{statistic}"]
                # Missing breakdowns were replaced by an empty string above
                if breakdown != "":
                    for item in breakdown:
                        records[statistic].append(
                            {
                                "key": key,
                                "date": row["date"],
                                "age": item["age"],
                                "sex": item.get("sexe"),
                                f"new_{statistic}": item["value"],
                            }
                        )

        df1 = DataFrame.from_records(records["tested"])
        df2 = DataFrame.from_records(records["confirmed"])
        data = df1.merge(df2, how="outer")

        # Drop the non-numeric / aggregate age labels; take an explicit copy so that the column
        # assignments below operate on an independent frame rather than on a slice
        data = data[~data["age"].isin(["0", "A", "B", "C", "D", "E"])].copy()
        data["age"] = data["age"].apply(lambda x: age_group(safe_int_cast(x)))

        # Anything other than "h" or "f" (including a missing value) maps to "sex_unknown"
        sex_names = {"h": "male", "f": "female"}
        data["sex"] = data["sex"].apply(lambda x: sex_names.get(x, "sex_unknown"))
        return data
Ejemplo n.º 3
0
 def test_age_group_different_bins(self):
     """Check `age_group` labels for several `age_cutoff` values with a fixed `bin_size=10`."""
     # (expected label, input age, age cutoff) triples, evaluated in the listed order
     cases = [
         ("0-9", 0, 10),
         ("0-9", 0, 70),
         ("0-9", 0, 100),
         ("10-19", 10, 70),
         ("10-19", 10, 100),
         ("10-", 10, 10),
         ("70-", 70, 70),
         ("100-", 100, 100),
     ]
     for expected, age, cutoff in cases:
         self.assertEqual(expected, age_group(age, bin_size=10, age_cutoff=cutoff))
Ejemplo n.º 4
0
def _default_age_adapter(value: Any) -> str:
    if isna(value):
        return "age_unknown"

    try:
        value = str(value)
        if re.match(r"\d+(\.\d*)?", value):
            return age_group(safe_int_cast(value))
        if re.match(r"\d\d?\-\d*", value):
            return value
    except ValueError:
        pass

    return "age_unknown"
Ejemplo n.º 5
0
def _default_age_adapter(value: Any) -> str:
    if isna(value):
        return "age_unknown"

    try:
        value_int = safe_int_cast(value)
        if value_int is not None:
            return age_group(value_int)
        if re.match(r"\d+\-\d*", value):
            return value
    except ValueError:
        pass

    return "age_unknown"
Ejemplo n.º 6
0
def _default_age_adapter(value: Any) -> str:
    if isna(value):
        return "age_unknown"

    # If the value is already in the form of an age group, return as-is
    if isinstance(value, str) and re.match(r"^\d+\-\d*$", value):
        return value

    # Otherwise assume it's a number and return the corresponding age group
    try:
        value_int = safe_int_cast(value)
        if value_int is not None:
            return age_group(value_int)
    except ValueError:
        pass

    return "age_unknown"
Ejemplo n.º 7
0
 def test_age_group(self):
     """Check the labels returned by `age_group` with `bin_count=10` and `age_cutoff=90`."""
     # (expected label, input age) pairs, evaluated in the listed order
     cases = [
         ("0-9", 0),
         ("0-9", 0.0),
         ("0-9", 9),
         ("10-19", 10),
         ("10-19", 19),
         ("90-", 90),
         ("90-", 100),
         ("90-", 110),
         ("90-", 1e9),
         (None, -1),
         (None, None),
         (None, numpy.nan),
     ]
     for expected, age in cases:
         self.assertEqual(expected, age_group(age, bin_count=10, age_cutoff=90))
Ejemplo n.º 8
0
 def test_age_group_standard(self):
     """Check `age_group` labels and rejected inputs with `bin_size=10` and `age_cutoff=90`."""
     # (expected label, input age) pairs, evaluated in the listed order
     valid_cases = [
         ("0-9", 0),
         ("0-9", 0.0),
         ("0-9", 9),
         ("10-19", 10),
         ("10-19", 19),
         ("90-", 90),
         ("90-", 100),
         ("90-", 110),
         ("90-", 1e9),
     ]
     for expected, age in valid_cases:
         self.assertEqual(expected, age_group(age, bin_size=10, age_cutoff=90))

     # Negative, missing and NaN ages must all be rejected with a ValueError
     for invalid_age in (-1, None, numpy.nan):
         with self.assertRaises(ValueError):
             age_group(invalid_age, bin_size=10, age_cutoff=90)
Ejemplo n.º 9
0
def _aggregate_population(data: Series) -> Dict[str, int]:
    """
    WorldPop data is "double stacked" by breaking down by age *and* sex, whereas we only want
    a breakdown of age *or* sex. This function converts the columns:
    `[
        m_0,
        m_5,
        ...
        f_0,
        f_5,
        ...
    ]`

    Into:
    `[
        population,
        population_male,
        population_female,
        population_age_00_09,
        population_age_10_19,
        ...
    ]`
    """
    age_bucket_size = 10
    age_bucket_count = 10
    age_bucket_pairs = [(i * age_bucket_size, (i + 1) * age_bucket_size - 1)
                        for i in range(age_bucket_count)]

    aggregated_values = {
        "key": data["key"],
        "population": 0,
        "population_male": 0,
        "population_female": 0,
        **{
            f"population_age_{lo:02d}_{hi:02d}": 0
            for lo, hi in age_bucket_pairs
        },
    }
    for col, val in data.iteritems():
        # Skip over the key item
        if col == "key":
            continue

        # Total population is the sum of all populations
        aggregated_values["population"] += val

        # Get age/sex info from column name
        sex, age = col.split("_", 2)

        # Add male and female populations separately
        if sex == "m":
            aggregated_values["population_male"] += val
        elif sex == "f":
            aggregated_values["population_female"] += val
        else:
            raise ValueError(f"Unexpected sex label encountered: {sex}")

        # Go over all the age buckets and add them separately
        # Since the WorldPop buckets are [0, 1-5, 5-9, 10-14, ...] we can just use the lower
        # range of the bucket as the age value to convert into our normalized buckets which are
        # [0-9, 10-19, 20-29, ...]
        age_bucket = age_group(int(age))

        # Make sure that the age buckets all follow the pattern \d\d_\d\d
        age_bucket = "_".join(
            [f"{int(age):02d}" for age in age_bucket.split("-", 2)])

        aggregated_values[f"population_age_{age_bucket}"] += val

    return aggregated_values
Ejemplo n.º 10
0
def _parse_age_bin(age_bin: str) -> str:
    try:
        return age_group(int(age_bin.split("-", 1)[0]))
    except:
        return "age_unknown"
Ejemplo n.º 11
0
def _age_adapter(age: str) -> str:
    """
    Convert a raw age label into an age group with 10-year bins and a cutoff at 70.

    Arguments:
        age: Raw label; "+" is treated like "-" and only the text before the first "-" is
            used, so values like "20-29" and "70+" are reduced to their lower bound.
    Returns:
        str: The result of `age_group` for the lower bound, or "age_unknown" if the label
        cannot be parsed or grouped.
    """
    try:
        lower_bound = safe_int_cast(str(age).replace("+", "-").split("-")[0])
        return age_group(lower_bound, bin_size=10, age_cutoff=70)
    except Exception:
        # Best-effort parsing: any failure maps to the unknown bucket, but interpreter-level
        # signals such as KeyboardInterrupt are no longer swallowed
        return "age_unknown"