Example #1
def _process_partition(cases: DataFrame) -> DataFrame:
    cases = cases.copy()

    # Extract information about whether doses were first (partial immunization) or second (full)
    cases["date_new_persons_vaccinated"] = None
    cases["date_new_persons_fully_vaccinated"] = None
    first_dose_mask = cases["_dose_information"].str.strip().str.slice(
        0, 1) == "1"
    second_dose_mask = cases["_dose_information"].str.strip().str.slice(
        0, 1) == "2"
    cases.loc[first_dose_mask, "date_new_persons_vaccinated"] = cases.loc[
        first_dose_mask, "date_new_vaccine_doses_administered"]
    cases.loc[second_dose_mask,
              "date_new_persons_fully_vaccinated"] = cases.loc[
                  second_dose_mask, "date_new_vaccine_doses_administered"]

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str (astype(str) would turn
    # missing values into the literal string "nan", so guard against those)
    cases["subregion2_code"] = cases["subregion2_code"].apply(safe_int_cast)
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: None if isna(x) else str(int(x)))

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({
        "m": "male",
        "f": "female"
    }.get)

    # Convert to time series format
    data = convert_cases_to_time_series(
        cases, index_columns=["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(
        lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "BR"

    # Aggregate data by state
    state = (data.drop(columns=["subregion2_code"]).groupby(
        ["date", "subregion1_code", "age", "sex"]).sum().reset_index())
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2
    data = data[data["subregion2_code"].notna()
                & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data[
        "subregion2_code"]

    return concat([country, state, data])
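All of these examples funnel case-line records through a shared convert_cases_to_time_series helper whose source is not shown here. The following is a minimal sketch of the behavior implied by the call sites and by the unit tests further down (decade-wide age bands, *_unknown buckets, one count per date_new_* column); it is a reconstruction, not the library's actual implementation:

from typing import List, Optional
from pandas import DataFrame

def _bucket_age_sketch(age) -> str:
    # Ages come out in decade-wide bands such as "20-29"; floats like 25.0
    # land in their band and unparseable values become "age_unknown"
    try:
        decade = int(float(age)) // 10 * 10
        return f"{decade}-{decade + 9}"
    except (TypeError, ValueError):
        return "age_unknown"

def convert_cases_to_time_series_sketch(
    cases: DataFrame, index_columns: Optional[List[str]] = None
) -> DataFrame:
    index_columns = list(index_columns or ["key"])
    demographics = [col for col in ("age", "sex", "ethnicity") if col in cases.columns]
    if "age" in cases.columns:
        cases = cases.assign(age=cases["age"].apply(_bucket_age_sketch))
    for col in ("sex", "ethnicity"):
        if col in cases.columns:
            cases = cases.assign(**{col: cases[col].fillna(f"{col}_unknown")})
    keys = index_columns + demographics + ["date"]
    data = None
    # Each "date_new_<statistic>" column contributes one count per case record
    # to <statistic> on that date, grouped by location and demographics
    for col in [c for c in cases.columns if c.startswith("date_")]:
        statistic = col[len("date_"):]
        subset = cases[index_columns + demographics + [col]].rename(columns={col: "date"})
        counts = subset.dropna(subset=["date"]).groupby(keys).size()
        counts = counts.rename(statistic).reset_index()
        data = counts if data is None else data.merge(counts, on=keys, how="outer")
    return data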
Example #2
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename appropriate columns
        col = parse_opts["column_name"]
        cases = table_rename(dataframes[0], _column_adapter)
        cases = cases.rename(columns={"date": f"date_{col}"})
        cases = _parse_region_codes(cases).dropna(subset=[f"date_{col}"])

        # Rename the sex values
        cases["sex"] = cases["sex"].apply({"M": "male", "Z": "female"}.get)

        # Go from individual case records to key-grouped records in a flat table
        data = convert_cases_to_time_series(
            cases, index_columns=["subregion1_code", "subregion2_code"])

        # Make sure the region codes are strings before parsing them
        data["subregion1_code"] = data["subregion1_code"].astype(str)
        data["subregion2_code"] = data["subregion2_code"].astype(str)

        # Aggregate L2 + L3 data
        data = _aggregate_regions(data,
                                  ["date", "subregion1_code", "age", "sex"])

        # Remove bogus values
        data = data[data["key"] != "CZ_99"]
        data = data[data["key"] != "CZ_99_99Y"]

        # Convert all dates to ISO format
        data["date"] = (
            data["date"].astype(str).apply(lambda x: datetime_isoformat(
                x, "%d.%m.%Y" if "." in x else "%Y-%m-%d")))

        return data
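table_rename is another shared helper that is called throughout but never defined in this excerpt. A minimal sketch of what the call sites suggest (header normalization via remove_regex applied to both the raw headers and the adapter keys so they still match, renaming through the adapter, and dropping unmapped columns when drop is set); the normalization details are guesses:

import re
from typing import Dict, Optional
from pandas import DataFrame

def table_rename_sketch(
    data: DataFrame,
    column_adapter: Dict[str, str],
    drop: bool = False,
    remove_regex: Optional[str] = None,
) -> DataFrame:
    if remove_regex is not None:
        normalize = lambda name: re.sub(remove_regex, "", name.lower())
        data = data.rename(columns={col: normalize(col) for col in data.columns})
        column_adapter = {normalize(key): val for key, val in column_adapter.items()}
    data = data.rename(columns=column_adapter)
    if drop:
        # Keep only the columns the adapter maps to a non-empty name
        keep = [col for col in column_adapter.values() if col and col in data.columns]
        data = data[keep]
    return data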
Example #3
    def parse_dataframes(
        self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        tables = [df for name, df in dataframes.items() if name != "geo"]
        column_adapter = dict(_column_adapter, state="idxs", date="date_new_confirmed")
        data = table_rename(concat(tables), column_adapter=column_adapter, drop=True)

        # Correct data types where necessary
        data["idxs"] = data["idxs"].astype(str)
        data["age"] = data["age"].apply(lambda x: None if x < 0 else x)
        data["sex"] = data["sex"].apply({0: "female", 1: "male"}.get)

        # Convert to our preferred time series format
        data = convert_cases_to_time_series(data, ["idxs"])

        # Geo name lookup
        geo_col_adapter = {"state": "subregion1_name", "district": "subregion2_name"}
        geo = table_rename(dataframes["geo"], geo_col_adapter, drop=False)
        geo["idxs"] = geo["idxs"].astype(str)
        geo["subregion1_name"] = geo["subregion1_name"].str.replace("W.P. ", "")
        geo = geo.groupby(["subregion1_name", "idxs"]).first().reset_index()
        data = table_merge([data, geo], on=["idxs"], how="inner")

        # Since only the cases have district level data, ignore it
        data["country_code"] = "MY"
        data["subregion2_name"] = None
        return data
Example #4
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        cases = table_rename(dataframes[0], _srag_column_adapter, drop=True)
        covid_mask = cases["_classification"] == 5
        valid_mask = cases["_prognosis"].notna() & (cases["_prognosis"] != 9)
        cases = cases[covid_mask & valid_mask]

        # Record the date of death
        cases["date_new_deceased"] = None
        deceased_mask = cases["_prognosis"] == 2
        cases.loc[deceased_mask,
                  "date_new_deceased"] = cases.loc[deceased_mask,
                                                   "_date_prognosis"]

        # Convert ages to int, and translate sex (no "other" sex/gender reported)
        cases["age"] = cases["age"].apply(safe_int_cast)
        cases["sex"] = cases["sex"].apply({"M": "male", "F": "female"}.get)

        # Convert all dates to ISO format
        for col in filter(lambda x: x.startswith("date"), cases.columns):
            cases[col] = cases[col].apply(
                lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Parse subregion codes
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 5))

        # Convert to time series format
        data = convert_cases_to_time_series(cases,
                                            index_columns=["subregion2_code"])
        data["country_code"] = "BR"

        # Get rid of bogus records
        data = data.dropna(subset=["date"])
        data = data[data["date"] >= "2020-01-01"]
        data = data[data["date"] < date_today(offset=1)]

        # Aggregate by country level
        country = (data.drop(columns=["subregion2_code"]).groupby(
            ["date", "age", "sex"]).sum().reset_index())
        country["key"] = "BR"

        # Aggregate by state level
        data["subregion1_code"] = data["subregion2_code"].apply(
            lambda x: _IBGE_STATES.get(safe_int_cast(x[:2])))
        state = (data.drop(columns=["subregion2_code"]).dropna(
            subset=["subregion1_code"]).groupby(
                ["date", "subregion1_code", "age", "sex"]).sum().reset_index())
        state["key"] = "BR_" + state["subregion1_code"]

        # Derive the key from subregion codes
        data = data[data["subregion2_code"].notna()]
        data["key"] = "BR_" + data["subregion1_code"] + "_" + data[
            "subregion2_code"]

        return concat([country, state, data])
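numeric_code_as_string is used above (and again in later examples) to build fixed-width region codes. A plausible sketch, assuming it zero-pads a numeric value and yields None for non-numeric input, so that e.g. numeric_code_as_string_sketch("72.0", 5) == "00072":

from typing import Optional

def numeric_code_as_string_sketch(value, width: int) -> Optional[str]:
    # Cast to int and left-pad with zeroes to `width` digits; None when the
    # value is missing or not numeric (callers filter those rows out)
    try:
        return f"{int(float(value)):0{width}d}"
    except (TypeError, ValueError):
        return None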
Example #5
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename appropriate columns
        cases = table_rename(
            dataframes[0],
            {
                "Código DIVIPOLA municipio": "subregion2_code",
                "Fecha de notificación": "_date_notified",
                "Fecha de muerte": "date_new_deceased",
                "Fecha de diagnóstico": "date_new_confirmed",
                "Fecha de recuperación": "date_new_recovered",
                "edad": "age",
                "sexo": "sex",
                "Pertenencia etnica": "ethnicity",
            },
        )

        # Fall back to notification date when no confirmed date is available
        cases["date_new_confirmed"] = cases["date_new_confirmed"].fillna(
            cases["_date_notified"])

        # Clean up the subregion code
        cases.subregion2_code = cases.subregion2_code.apply(
            lambda x: "{0:05d}".format(int(x)))

        # Compute the key from the DIVIPOLA code
        cases["key"] = ("CO_" + cases.subregion2_code.apply(lambda x: x[:2]) +
                        "_" + cases.subregion2_code)

        # A few cases are at the l2 level
        cases["key"] = cases["key"].apply(lambda x: "CO_" + x[-2:]
                                          if x.startswith("CO_00_") else x)

        # Go from individual case records to key-grouped records in a flat table
        index_columns = ["key", "date", "sex", "age"]
        value_columns = ["new_confirmed", "new_deceased", "new_recovered"]
        data = convert_cases_to_time_series(cases)

        # Parse dates to ISO format.
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x.split(" ")[0], "%d/%m/%Y"))
        data.dropna(subset=["date"], inplace=True)

        # Group by level 1 region, and add the parts
        l1 = data.copy()
        l1["key"] = l1.key.apply(lambda x: "_".join(x.split("_")[:2]))
        l1 = l1.groupby(index_columns).sum().reset_index()

        # Group by country level
        country = l1.drop(columns=["key"]).groupby(
            index_columns[1:]).sum().reset_index()
        country["key"] = "CO"

        return concat([data, l1, country])[index_columns + value_columns]
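datetime_isoformat appears in nearly every example here. Judging by the strptime-style format arguments and the dropna calls that follow it, it parses a string with the given format and returns an ISO-formatted date or None; a sketch under those assumptions:

from datetime import datetime
from typing import Optional

def datetime_isoformat_sketch(value, fmt: str) -> Optional[str]:
    # Parse `value` with a strptime format and return the date portion in ISO
    # format; None on failure so callers can drop unparseable records
    try:
        return datetime.strptime(str(value), fmt).date().isoformat()
    except ValueError:
        return None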
Example #6
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename appropriate columns
        data = table_rename(
            dataframes[0],
            {
                "codigo divipola": "subregion2_code",
                "fecha de muerte": "date_new_deceased",
                "fecha diagnostico": "date_new_confirmed",
                "fecha recuperado": "date_new_recovered",
                "edad": "age",
                "sexo": "sex",
                "Pertenencia etnica": "ethnicity",
            },
        )

        # Clean up the subregion code
        data.subregion2_code = data.subregion2_code.apply(
            lambda x: "{0:05d}".format(int(x)))

        # Compute the key from the DIVIPOLA code
        data["key"] = ("CO_" + data.subregion2_code.apply(lambda x: x[:2]) +
                       "_" + data.subregion2_code)

        # A few cases are at the l2 level
        data["key"] = data["key"].apply(lambda x: "CO_" + x[-2:]
                                        if x.startswith("CO_00_") else x)

        # Go from individual case records to key-grouped records in a flat table
        index_columns = ["key", "date", "sex", "age"]
        value_columns = ["new_confirmed", "new_deceased", "new_recovered"]
        merged = convert_cases_to_time_series(data)

        # Some dates are badly formatted as 31/12/1899 in the raw data, so we drop those.
        merged = merged[(merged["date"] != datetime(1899, 12, 31))].dropna(
            subset=["date"])

        # Parse dates to ISO format.
        merged["date"] = merged["date"].apply(safe_datetime_parse)
        merged["date"] = merged["date"].apply(lambda x: x.date().isoformat())

        # Group by level 2 region, and add the parts
        l2 = merged.copy()
        l2["key"] = l2.key.apply(lambda x: "_".join(x.split("_")[:2]))
        l2 = l2.groupby(index_columns).sum().reset_index()

        # Group by country level, and add the parts
        l1 = l2.copy().drop(columns=["key"])
        l1 = l1.groupby(index_columns[1:]).sum().reset_index()
        l1["key"] = "CO"

        return concat([merged, l1, l2])[index_columns + value_columns]
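safe_datetime_parse, used just above, looks like a best-effort parser for loosely formatted timestamps. A minimal sketch assuming dateutil (which pandas already depends on):

from datetime import datetime
from typing import Optional
from dateutil.parser import parse as _parse_datetime

def safe_datetime_parse_sketch(value) -> Optional[datetime]:
    # Best-effort parse; None instead of an exception so callers can filter
    try:
        return _parse_datetime(str(value))
    except (ValueError, OverflowError):
        return None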
Example #7
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        cases = table_rename(
            dataframes[0],
            {
                # "no": "",
                "age": "age",
                "sex": "sex",
                # "nationality": "",
                # "province_of_isolation": "",
                # "notification_date": "date",
                "announce_date": "date_new_confirmed",
                "province_of_onset": "match_string",
                # "district_of_onset": "subregion2_name",
                # "quarantine": "",
            },
            drop=True,
            remove_regex=r"[^0-9a-z\s]",
        )

        # Convert date to ISO format
        cases["date_new_confirmed"] = cases["date_new_confirmed"].str.slice(
            0, 10)

        # Some dates are not properly parsed, so fix those manually
        for col in (col for col in cases.columns if col.startswith("date_")):
            cases[col] = cases[col].str.replace("1963-", "2020-")
            cases[col] = cases[col].str.replace("2563-", "2020-")
            cases[col] = cases[col].str.replace("15/15/2020", "2020-12-15")
            cases[col] = cases[col].str.replace("15/15/2021", "2020-12-15")

        # Translate sex labels; only male, female and unknown are given
        sex_adapter = lambda x: {
            "ชาย": "male",
            "หญิง": "female"
        }.get(x, "sex_unknown")
        cases["sex"] = cases["sex"].apply(sex_adapter)

        # Convert from cases to time-series format
        data = convert_cases_to_time_series(cases, ["match_string"])

        # Aggregate country-level data by adding all counties
        country = (data.drop(columns=["match_string"]).groupby(
            ["date", "age", "sex"]).sum().reset_index())
        country["key"] = "TH"

        # Drop bogus records from the data
        data = data[data["match_string"].notna()
                    & (data["match_string"] != "")]

        return concat([country, data])
Example #8
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        date_col = "date_new_confirmed"
        cases = table_rename(
            dataframes[0], {"cdc_report_dt": date_col, "sex": "sex", "age_group": "age"}, drop=True
        )

        cases["key"] = "US"
        cases["sex"] = cases["sex"].apply(lambda x: x.lower() if not isna(x) else None)
        cases["age"] = cases["age"].apply(
            lambda x: "-".join(x.replace(" Years", "").split(" - ")) if not isna(x) else None
        )
        cases[date_col] = cases[date_col].apply(lambda x: datetime_isoformat(x, "%Y/%m/%d"))

        if parse_opts["column"] == "age":
            data = convert_cases_to_time_series(cases.drop(columns=["sex"]))
        elif parse_opts["column"] == "sex":
            data = convert_cases_to_time_series(cases.drop(columns=["age"]))
        else:
            raise ValueError(f'Unknown column {parse_opts["column"]}')

        return data
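As a quick illustration of the age normalization in this example, the lambda above rewrites the CDC band labels like so (the input label is invented but follows the same pattern):

# "10 - 19 Years" -> "10-19"
label = "10 - 19 Years"
assert "-".join(label.replace(" Years", "").split(" - ")) == "10-19"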
Example #9
    def test_convert_cases_to_time_series_null_deaths(self):
        cases = read_csv(StringIO(CASE_LINE_DATA_NULL_DEATHS))
        table = convert_cases_to_time_series(cases)

        # There should be as many records as there are combinations of <key,age,sex,ethnicity,date>
        self.assertEqual(len(cases), len(table))

        # All lines in our test case indicate a confirmed case
        self.assertEqual(len(cases), table["new_confirmed"].sum())

        # No lines in our test case indicate a deceased case
        self.assertEqual(0, table["new_deceased"].sum())

        # Half of our cases are male, and the other half are female
        self.assertEqual(len(table[table.sex == "male"]),
                         len(table[table.sex == "female"]))
Example #10
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        with open(sources[0], "r") as fd:
            records = json.load(fd)["features"]

        cases = DataFrame.from_records(records)
        cases["date_new_confirmed"] = cases["ChartDate"].apply(
            lambda x: fromtimestamp(x // 1000).date().isoformat())

        # FL does not provide date for deceased or hospitalized, so we just copy it from confirmed
        deceased_mask = cases.Died == "Yes"
        hospitalized_mask = cases.Hospitalized == "YES"
        cases["date_new_deceased"] = None
        cases["date_new_hospitalized"] = None
        cases.loc[deceased_mask,
                  "date_new_deceased"] = cases.loc[deceased_mask,
                                                   "date_new_confirmed"]
        cases.loc[hospitalized_mask,
                  "date_new_hospitalized"] = cases.loc[hospitalized_mask,
                                                       "date_new_confirmed"]

        # Rename the sex labels
        sex_adapter = lambda x: {
            "male": "male",
            "female": "female"
        }.get(x, "sex_unknown")
        cases["sex"] = cases["Gender"].apply(sex_adapter)
        cases.drop(columns=["Gender"], inplace=True)

        # Make sure age is an integer
        cases["age"] = cases["Age"].apply(safe_int_cast)
        cases.drop(columns=["Age"], inplace=True)

        cases = cases.rename(columns={"County": "match_string"})
        data = convert_cases_to_time_series(cases, ["match_string"])
        data["country_code"] = "US"
        data["subregion1_code"] = "FL"

        # Aggregate to state level here, since some data locations are "Unknown"
        group_cols = ["country_code", "subregion1_code", "date", "age", "sex"]
        state = data.drop(
            columns=["match_string"]).groupby(group_cols).sum().reset_index()

        # Remove bogus data
        data = data[data.match_string != "Unknown"]

        return concat([state, data])
Example #11
    def parse_dataframes(
        self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        column_adapter = dict(_column_adapter, date="date_new_deceased")
        data = table_rename(dataframes[0], column_adapter=column_adapter, drop=True)

        data["sex"] = data["sex"].apply({0: "female", 1: "male"}.get)
        data["age"] = data["age"].apply(lambda x: None if x < 0 else x)
        data["subregion1_name"] = data["subregion1_name"].str.replace("W.P. ", "")
        data = convert_cases_to_time_series(data, ["subregion1_name"])

        # Remove records with no location
        data = data.dropna(subset=["subregion1_name"])

        data["country_code"] = "MY"
        data["subregion2_code"] = None
        return data
Example #12
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        cases = table_rename(
            dataframes[0],
            {
                # "Case no.": "",
                "Report date": "_date",
                # "Date of onset": "date_onset",
                "Gender": "sex",
                "Age": "age",
                # "Name of hospital admitted": "",
                "Hospitalised/Discharged/Deceased": "_status",
                # "HK/Non-HK resident": "",
                # "Case classification*": "",
                # "Confirmed/probable": "",
            },
            drop=True,
            remove_regex=r"[^0-9a-z\s]",
        )

        # Convert date to ISO format
        cases["_date"] = cases["_date"].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # All cases in the data are confirmed (or probable)
        cases["date_new_confirmed"] = cases["_date"]

        # Use confirmed date as estimate for deceased date
        cases["date_new_deceased"] = None
        deceased_mask = cases["_status"] == "Deceased"
        cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_date"]

        # Use confirmed date as estimate for hospitalization date
        cases["date_new_hospitalized"] = None
        hosp_mask = (cases["_status"] == "Discharged") | (cases["_status"] == "Hospitalized")
        cases.loc[hosp_mask, "date_new_hospitalized"] = cases.loc[hosp_mask, "_date"]

        # Translate sex labels; only male, female and unknown are given
        sex_adapter = lambda x: {"M": "male", "N": "female"}.get(x, "sex_unknown")
        cases["sex"] = cases["sex"].apply(sex_adapter)

        cases["key"] = "HK"
        return convert_cases_to_time_series(cases, ["key"])
Example #13
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        cases = table_rename(
            dataframes[0],
            {
                "Report Date": "date_new_confirmed",
                # "Case Status": "_status",
                "Sex": "sex",
                "Age group": "age",
                # "DHB": "",
                # "Overseas travel": "",
            },
            drop=True,
        )
        cases["key"] = "NZ"
        cases["age"] = cases["age"].str.slice(0, 2).str.replace(
            " ", "").apply(safe_int_cast)
        data = convert_cases_to_time_series(cases, ["key"])
        return data
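safe_int_cast shows up in most examples; presumably it is a tolerant integer cast. A sketch:

from typing import Optional

def safe_int_cast_sketch(value) -> Optional[int]:
    # Best-effort cast to int ("35", 35.0 and "35.0" all become 35); None for
    # missing or malformed values
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return None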
Example #14
    def test_convert_cases_to_time_series_simple(self):
        cases = read_csv(StringIO(CASE_LINE_DATA_SIMPLE))
        table = convert_cases_to_time_series(cases)
        confirmed = table[table.new_confirmed > 0]

        # There should be as many records as there are combinations of <key,age,sex,ethnicity,date>
        self.assertEqual(len(cases) * 2, len(table))

        # All lines in our test case indicate a confirmed case
        self.assertEqual(len(cases), table["new_confirmed"].sum())

        # All lines in our test case indicate a deceased case
        self.assertEqual(len(cases), table["new_deceased"].sum())

        # Half of our cases are male, and the other half are female
        self.assertEqual(len(table[table.sex == "male"]),
                         len(table[table.sex == "female"]))

        # 2 cases are 10-19 and 6 are 20-29
        self.assertEqual(2, len(confirmed[confirmed.age == "10-19"]))
        self.assertEqual(6, len(confirmed[confirmed.age == "20-29"]))
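The CASE_LINE_DATA_SIMPLE fixture itself is not part of this excerpt. A hypothetical CSV consistent with every assertion above (eight cases, an even male/female split, two cases in the 10-19 band and six in 20-29, and distinct dates so each case yields separate confirmed and deceased rows) could look like:

# Hypothetical fixture, invented for illustration only
CASE_LINE_DATA_SIMPLE = """key,age,sex,ethnicity,date_new_confirmed,date_new_deceased
US_FL,15,male,unknown,2020-03-01,2020-03-11
US_FL,16,female,unknown,2020-03-02,2020-03-12
US_FL,24,male,unknown,2020-03-03,2020-03-13
US_FL,25,female,unknown,2020-03-04,2020-03-14
US_FL,26,male,unknown,2020-03-05,2020-03-15
US_FL,27,female,unknown,2020-03-06,2020-03-16
US_FL,28,male,unknown,2020-03-07,2020-03-17
US_FL,29,female,unknown,2020-03-08,2020-03-18
"""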
Example #15
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        with open(sources[0], "r") as fd:
            cases = json.load(fd)["Data"]

        # {"ConfirmDate":"2021-01-09 00:00:00","No":"9876","Age":66,"Gender":"\u0e0a","GenderEn":"Male","Nation":"Thailand","NationEn":"Thailand","Province":"\u0e2d","ProvinceId":72,"District":"\u0e44","ProvinceEn":"Ang Thong","Detail":null,"StatQuarantine":1}
        cases = table_rename(
            DataFrame.from_records(cases),
            {
                "ConfirmDate": "date_new_confirmed",
                "Age": "age",
                "GenderEn": "sex",
                "ProvinceEn": "match_string",
            },
            drop=True,
        )

        # Convert dates to ISO format
        for col in cases.columns:
            if col.startswith("date_"):
                cases[col] = cases[col].str.slice(0, 10)

        # Parse age and sex fields
        cases["sex"] = cases["sex"].str.lower().apply({"male": "male", "female": "female"}.get)
        cases["age"] = cases["age"].fillna("age_unknown")
        cases["sex"] = cases["sex"].fillna("sex_unknown")

        # Convert to time series data
        data = convert_cases_to_time_series(cases, ["match_string"])

        # Aggregate by country level
        country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
        country["key"] = "TH"

        # Add country code and return data
        data["country_code"] = "TH"
        data = data[data["match_string"] != "Unknown"]

        return concat([country, data])
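aggregate_admin_level, called here and in a few other examples, seems to drop the location columns finer than the requested level and sum the value columns over the rest. A hedged sketch with an assumed ordering of location columns:

from typing import List
from pandas import DataFrame

# Assumed ordering of location columns, coarsest to finest
_ADMIN_COLUMNS = ["country_code", "subregion1_code", "subregion1_name",
                  "subregion2_code", "subregion2_name", "match_string"]

def aggregate_admin_level_sketch(
    data: DataFrame, group_columns: List[str], admin_level: str
) -> DataFrame:
    if admin_level == "country":
        level_columns = _ADMIN_COLUMNS[:1]
    elif admin_level == "subregion1":
        level_columns = _ADMIN_COLUMNS[:3]
    else:
        raise ValueError(f"Unsupported admin level: {admin_level}")
    # Keep identifiers at or above the requested level, drop the finer ones,
    # then sum the numeric value columns over what remains
    keep = [col for col in level_columns if col in data.columns]
    drop = [col for col in _ADMIN_COLUMNS if col in data.columns and col not in keep]
    return data.drop(columns=drop).groupby(keep + list(group_columns)).sum().reset_index()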
Example #16
    def test_convert_cases_to_time_series_age_float_values(self):
        cases = read_csv(StringIO(CASE_LINE_DATA_AGE_FLOATS))
        table = convert_cases_to_time_series(cases)
        self.assertSetEqual({"20-29"}, set(table.age))
Example #17
    def test_convert_cases_to_time_series_null_values(self):
        cases = read_csv(StringIO(CASE_LINE_DATA_NULL_VALUES))
        table = convert_cases_to_time_series(cases)
        self.assertSetEqual({"age_unknown"}, set(table.age))
        self.assertSetEqual({"sex_unknown"}, set(table.sex))
        self.assertSetEqual({"ethnicity_unknown"}, set(table.ethnicity))
Example #18
    def test_convert_cases_to_time_series_other_values(self):
        cases = read_csv(StringIO(CASE_LINE_DATA_OTHER))
        table = convert_cases_to_time_series(cases)
        self.assertSetEqual({"sex_other"}, set(table.sex))
        self.assertSetEqual({"ethnicity_other"}, set(table.ethnicity))
Example #19
def _process_partition(cases: DataFrame) -> DataFrame:
    cases = cases.copy()

    # Confirmed cases are only those with a confirmed positive test result
    cases["date_new_confirmed"] = None
    confirmed_mask = cases["_test_result"] == "Positivo"
    cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[confirmed_mask, "date_new_tested"]

    # Do not process deceased counts, since they are considered highly inaccurate

    # # Deceased cases have a specific label and the date is the "closing" date
    # cases["date_new_deceased"] = None
    # deceased_mask = cases["_prognosis"] == "Óbito"
    # cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_date_update"]

    # # Only count deceased cases from confirmed subjects
    # cases.loc[~confirmed_mask, "date_new_deceased"] = None

    # Recovered cases have a specific label and the date is the "closing" date
    cases["date_new_recovered"] = None
    recovered_mask = cases["_prognosis"] == "Cured"
    cases.loc[recovered_mask, "date_new_recovered"] = cases.loc[recovered_mask, "_date_update"]

    # Only count recovered cases from confirmed subjects
    cases.loc[~confirmed_mask, "date_new_recovered"] = None

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(safe_int_cast)
    # The last digit of the region code is actually not necessary
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: None if isna(x) else str(int(x))[:-1]
    )

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({"masculino": "male", "feminino": "female"}.get)

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = (
        data.drop(columns=["subregion1_code", "subregion2_code"])
        .groupby(["date", "age", "sex"])
        .sum()
        .reset_index()
    )
    country["key"] = "BR"

    # Aggregate data by state
    state = (
        data.drop(columns=["subregion2_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2
    data = data[data["subregion2_code"].notna() & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])
Example #20
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        cases = table_rename(
            dataframes[0],
            {
                # "FECHA_ACTUALIZACION": "",
                # "ID_REGISTRO": "",
                # "ORIGEN": "",
                # "SECTOR": "",
                # "ENTIDAD_UM": "",
                "SEXO": "sex",
                # "ENTIDAD_NAC": "",
                "ENTIDAD_RES": "subregion1_code",
                "MUNICIPIO_RES": "subregion2_code",
                "TIPO_PACIENTE": "_type",
                "FECHA_INGRESO": "date_new_confirmed",
                # "FECHA_SINTOMAS": "",
                "FECHA_DEF": "date_new_deceased",
                # "INTUBADO": "",
                # "NEUMONIA": "",
                "EDAD": "age",
                # "NACIONALIDAD": "",
                # "EMBARAZO": "",
                # "HABLA_LENGUA_INDIG": "",
                # "DIABETES": "",
                # "EPOC": "",
                # "ASMA": "",
                # "INMUSUPR": "",
                # "HIPERTENSION": "",
                # "OTRA_COM": "",
                # "CARDIOVASCULAR": "",
                # "OBESIDAD": "",
                # "RENAL_CRONICA": "",
                # "TABAQUISMO": "",
                # "OTRO_CASO": "",
                "RESULTADO": "_diagnosis",
                # "MIGRANTE": "",
                # "PAIS_NACIONALIDAD": "",
                # "PAIS_ORIGEN": "",
                "UCI": "_intensive_care",
            },
            drop=True,
        )

        # Null dates are coded as 9999-99-99
        for col in cases.columns:
            if col.startswith("date_"):
                cases.loc[cases[col] == "9999-99-99", col] = None

        # Discard all cases with negative test result
        cases = cases[cases["_diagnosis"] == 1]

        # Type 1 is normal, type 2 is hospitalized
        cases["date_new_hospitalized"] = None
        hospitalized_mask = cases["_type"] == 2
        cases.loc[hospitalized_mask,
                  "date_new_hospitalized"] = cases.loc[hospitalized_mask,
                                                       "date_new_confirmed"]

        # Parse region codes as strings
        cases["subregion1_code"] = cases["subregion1_code"].apply(
            lambda x: numeric_code_as_string(x, 2))
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 3))

        # Convert case line data to our time series format
        data = convert_cases_to_time_series(
            cases, ["subregion1_code", "subregion2_code"])

        # Convert date to ISO format
        data["date"] = data["date"].astype(str)

        # Unknown region codes are defined as "99+" instead of null
        data.loc[data["subregion1_code"] == "99", "subregion1_code"] = None
        data.loc[data["subregion2_code"] == "999", "subregion2_code"] = None

        # The subregion2 codes need to be composed
        invalid_region_mask = (data["subregion1_code"].isna()
                               | data["subregion2_code"].isna())
        data.loc[~invalid_region_mask, "subregion2_code"] = (
            data.loc[~invalid_region_mask, "subregion1_code"] +
            data.loc[~invalid_region_mask, "subregion2_code"])

        # Use proper ISO codes for the subregion1 level
        data["subregion1_code"] = data["subregion1_code"].apply(
            _SUBREGION1_CODE_MAP.get)

        # Translate sex labels; only male, female and unknown are given
        data["sex"] = data["sex"].apply(lambda x: {
            "hombre": "male",
            "mujer": "female"
        }.get(x.lower()))

        # Aggregate state-level data by adding all municipalities
        state = data.drop(columns=["subregion2_code"]).groupby(
            ["date", "subregion1_code"]).sum()
        state.reset_index(inplace=True)
        state["key"] = "MX_" + state["subregion1_code"]

        # Extract cities from the municipalities
        city = _extract_cities(data)

        # Country level is called "TOTAL" as a subregion1_code
        country_mask = data["subregion1_code"] == "TOTAL"
        country = data[country_mask]
        country["key"] = "MX"

        # We can build the key for the data directly from the subregion codes
        data["key"] = "MX_" + data["subregion1_code"] + "_" + data[
            "subregion2_code"]

        # Drop bogus records from the data
        data = data[~country_mask]
        state.dropna(subset=["subregion1_code"], inplace=True)
        data.dropna(subset=["subregion1_code", "subregion2_code"],
                    inplace=True)

        return concat([country, state, data, city])
Example #21
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename appropriate columns
        cases = table_rename(
            dataframes[0],
            {
                "ProvRes": "province",
                "RegionRes": "region",
                "CityMuniPSGC": "city",
                "DateDied": "date_new_deceased",
                "DateSpecimen": "date_new_confirmed",
                "DateRecover": "date_new_recovered",
                "daterepconf": "_date_estimate",
                "admitted": "_hospitalized",
                "removaltype": "_prognosis",
                "Age": "age",
                "Sex": "sex",
            },
            drop=True,
        )

        # When there is a case, but missing confirmed date, estimate it
        cases["date_new_confirmed"] = cases["date_new_confirmed"].fillna(
            cases["_date_estimate"])

        # When there is recovered removal, but missing recovery date, estimate it
        nan_recovered_mask = cases.date_new_recovered.isna() & (
            cases["_prognosis"] == "Recovered")
        cases.loc[nan_recovered_mask,
                  "date_new_recovered"] = cases.loc[nan_recovered_mask,
                                                    "_date_estimate"]

        # When there is deceased removal, but missing death date, estimate it
        nan_deceased_mask = cases.date_new_deceased.isna() & (
            cases["_prognosis"] == "Died")
        cases.loc[nan_deceased_mask,
                  "date_new_deceased"] = cases.loc[nan_deceased_mask,
                                                   "_date_estimate"]

        # Hospitalized is estimated as the same date as confirmed if admitted == yes
        cases["date_new_hospitalized"] = None
        hospitalized_mask = cases["_hospitalized"].str.lower() == "yes"
        cases.loc[hospitalized_mask,
                  "date_new_hospitalized"] = cases.loc[hospitalized_mask,
                                                       "date_new_confirmed"]

        # Rename the sex values
        cases["sex"] = cases["sex"].apply({
            "MALE": "male",
            "FEMALE": "female"
        }.get)

        # Drop columns which we have no use for
        cases = cases[[
            col for col in cases.columns if not col.startswith("_")
        ]]

        # NCR cases are broken down by city, not by province
        ncr_prov_mask = cases["region"] == "NCR"
        cases.loc[ncr_prov_mask,
                  "province"] = cases.loc[ncr_prov_mask,
                                          "city"].str.slice(2, -3)
        cases.drop(columns=["city"], inplace=True)

        # Go from individual case records to key-grouped records in a flat table
        data = convert_cases_to_time_series(
            cases, index_columns=["province", "region"])

        # Convert date to ISO format
        data["date"] = data["date"].apply(safe_datetime_parse)
        data = data[~data["date"].isna()]
        data["date"] = data["date"].apply(lambda x: x.date().isoformat())

        # Null values are known to be zero, since we have case-line data
        data = data.fillna(0)

        # Aggregate country level directly from base data
        country = (data.drop(columns=["province", "region"]).groupby(
            ["date", "age", "sex"]).sum().reset_index())
        country["key"] = "PH"

        # Aggregate regions and provinces separately
        l3 = data.rename(columns={"province": "match_string"})
        l2 = data.rename(columns={"region": "match_string"})
        l2["match_string"] = l2["match_string"].apply(
            lambda x: x.split(": ")[-1])

        # Ensure matching by flagging whether a record must be L2 or L3
        l3["subregion2_code"] = ""
        l2["subregion2_code"] = None
        l3["locality_code"] = None
        l2["locality_code"] = None

        data = concat([l2, l3]).dropna(subset=["match_string"])
        data["country_code"] = "PH"

        # Remove bogus records
        data = data[data["match_string"].notna()]
        data = data[data["match_string"] != ""]
        data = data[data["match_string"] != "REPATRIATE"]
        data = data[data["match_string"] != "CITY OF ISABELA (NOT A PROVINCE)"]
        data = data[data["match_string"] != "COTABATO CITY (NOT A PROVINCE)"]

        return concat([country, data])
Example #22
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        cases = table_rename(
            concat(dataframes.values()),
            {
                # "Patient Number": "",
                # "State Patient Number": "",
                "Date Announced": "date_new_confirmed",
                # "Estimated Onset Date": "",
                "Age Bracket": "age",
                "Gender": "sex",
                # "Detected City": "",
                "Detected District": "subregion2_name",
                "Detected State": "subregion1_name",
                # "State code": "subregion1_code",
                "Current Status": "_prognosis",
                # "Notes": "",
                # "Contracted from which Patient (Suspected)": "",
                # "Nationality": "",
                # "Type of transmission": "",
                "Status Change Date": "_change_date",
                # "Source_1": "",
                # "Source_2": "",
                # "Source_3": "",
                # "Backup Notes": "",
                "Num Cases": "new_confirmed",
                "Entry_ID": "",
            },
            drop=True,
        )

        # Convert dates to ISO format
        for col in [col for col in cases.columns if "date" in col]:
            cases[col] = cases[col].apply(
                lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        cases["age"] = cases["age"].astype(str)
        cases["age"] = cases["age"].str.lower()
        cases["age"] = cases["age"].str.replace("\.0", "")
        cases["age"] = cases["age"].str.replace(r"[\d\.]+ day(s)?", "1")
        cases["age"] = cases["age"].str.replace(r"[\d\.]+ month(s)?", "1")
        cases.loc[cases["age"].str.contains("-"), "age"] = None

        sex_adapter = lambda x: {
            "M": "male",
            "F": "female"
        }.get(x, "sex_unknown")
        cases["sex"] = cases["sex"].str.strip()
        cases["sex"] = cases["sex"].apply(sex_adapter)

        cases["date_new_deceased"] = None
        deceased_mask = cases["_prognosis"] == "Deceased"
        cases.loc[deceased_mask,
                  "date_new_deceased"] = cases.loc[deceased_mask,
                                                   "_change_date"]

        cases["date_new_hospitalized"] = None
        hosp_mask = cases["_prognosis"] == "Hospitalized"
        cases.loc[hosp_mask,
                  "date_new_hospitalized"] = cases.loc[hosp_mask,
                                                       "_change_date"]

        data = convert_cases_to_time_series(
            cases, ["subregion1_name", "subregion2_name"])
        data["country_code"] = "IN"

        # Aggregate country level and admin level 1
        country = aggregate_admin_level(data, ["date", "age", "sex"],
                                        "country")
        subregion1 = aggregate_admin_level(data, ["date", "age", "sex"],
                                           "subregion1")
        subregion1 = subregion1[
            subregion1["subregion1_name"].str.lower() != "state unassigned"]

        # Data for admin level 2 is too noisy and there are many mismatches, so we only return
        # the aggregated country level and admin level 1 data
        return concat([country, subregion1])
Example #23
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        cases_confirmed = table_rename(
            dataframes["confirmed"], _column_adapter,
            drop=True).rename(columns={"date": "date_new_confirmed"})
        cases_deceased = table_rename(
            dataframes["deceased"], _column_adapter,
            drop=True).rename(columns={"date": "date_new_deceased"})

        # Translate sex label
        for df in (cases_confirmed, cases_deceased):
            df["sex"] = df["sex"].apply({
                "MASCULINO": "male",
                "FEMENINO": "female"
            }.get)

        # Convert to time series
        index_columns = ["subregion1_name", "province_name", "subregion2_name"]
        data_confirmed = convert_cases_to_time_series(cases_confirmed,
                                                      index_columns)
        data_deceased = convert_cases_to_time_series(cases_deceased,
                                                     index_columns)

        # Join into a single dataset
        data = table_multimerge([data_confirmed, data_deceased], how="outer")

        # Remove bogus records
        data.dropna(subset=["date"], inplace=True)

        # Set country code and get date in ISO format
        data["country_code"] = "PE"
        data["date"] = data["date"].apply(safe_int_cast)
        data["date"] = data["date"].apply(safe_str_cast)
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y%m%d"))

        # Properly capitalize department to allow for exact matching
        data["subregion1_name"] = data["subregion1_name"].apply(
            lambda x: _department_map.get(x, x.title()))

        # Aggregate by admin level 1
        subregion1 = (data.drop(
            columns=["subregion2_name", "province_name"]).groupby(
                ["date", "country_code", "subregion1_name", "age",
                 "sex"]).sum().reset_index())
        subregion1["subregion2_name"] = None

        # Try to match based on subregion2_name using fuzzy matching, and set subregion2_name to
        # an empty string to turn off exact matching
        data = data.rename(columns={"subregion2_name": "match_string"})
        data["subregion2_name"] = ""

        # Convert other text fields to lowercase for consistent processing
        data["match_string"] = data["match_string"].apply(fuzzy_text)
        data["province_name"] = data["province_name"].apply(fuzzy_text)

        # Drop bogus records
        data = data[~data["match_string"].isna()]
        data = data[~data["match_string"].
                    isin(["", "eninvestigacion", "extranjero"])]

        # Because we are skipping provinces and going directly from region to district, there are
        # some name collisions which we have to disambiguate manually
        for province1, province2, district in [
            ("lima", "canete", "sanluis"),
            ("lima", "yauyos", "miraflores"),
            ("ica", "chincha", "pueblonuevo"),
            ("canete", "huarochiri", "sanantonio"),
            ("bolognesi", "huaylas", "huallanca"),
            ("lucanas", "huancasancos", "sancos"),
            ("santacruz", "cutervo", "santacruz"),
            ("yauli", "jauja", "yauli"),
            ("yauli", "jauja", "paccha"),
            ("huarochiri", "yauyos", "laraos"),
            ("elcollao", "melgar", "santarosa"),
        ]:
            for province in (province1, province2):
                mask = (data["province_name"]
                        == province) & (data["match_string"] == district)
                data.loc[mask, "match_string"] = f"{district}, {province}"

        # Output the results
        return concat([subregion1, data])
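fuzzy_text is what makes match strings like "eninvestigacion" possible: it evidently lowercases, strips accents, and removes everything but letters. A sketch:

import re
import unicodedata
from typing import Optional

def fuzzy_text_sketch(text) -> Optional[str]:
    # "En investigación" -> "eninvestigacion": lowercase, decompose accented
    # characters, drop the combining marks, then keep only letters
    if not isinstance(text, str):
        return None
    text = unicodedata.normalize("NFKD", text.lower())
    text = text.encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^a-z]", "", text)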
Example #24
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Subregion code comes from the parsing parameters
        subregion1_code = parse_opts["subregion1_code"]

        # Join all input data into a single table
        cases = table_rename(concat(dataframes.values()),
                             _column_adapter,
                             drop=True)

        # Keep only cases for a single state
        cases = cases[cases["_state_code"].apply(safe_int_cast) ==
                      _IBGE_STATES[subregion1_code]]

        # Confirmed cases are only those with a confirmed positive test result
        cases["date_new_confirmed"] = None
        confirmed_mask = cases["_test_result"] == "Positivo"
        cases.loc[confirmed_mask,
                  "date_new_confirmed"] = cases.loc[confirmed_mask,
                                                    "date_new_tested"]

        # Deceased cases have a specific label and the date is the "closing" date
        cases["date_new_deceased"] = None
        deceased_mask = cases["_prognosis"] == "Óbito"
        cases.loc[deceased_mask,
                  "date_new_deceased"] = cases.loc[deceased_mask,
                                                   "_date_update"]

        # Recovered cases have a specific label and the date is the "closing" date
        cases["date_new_recovered"] = None
        recovered_mask = cases["_prognosis"] == "Cured"
        cases.loc[recovered_mask,
                  "date_new_recovered"] = cases.loc[recovered_mask,
                                                    "_date_update"]

        # Drop columns which we have no use for
        cases = cases[[
            col for col in cases.columns if not col.startswith("_")
        ]]

        # Make sure our region code is of type str
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            safe_int_cast)
        # The last digit of the region code is actually not necessary
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: None if isna(x) else str(int(x))[:-1])

        # Null and unknown records are state only
        subregion2_null_mask = cases["subregion2_code"].isna()
        cases.loc[subregion2_null_mask, "key"] = "BR_" + subregion1_code

        # We can derive the key from subregion1 + subregion2
        cases.loc[~subregion2_null_mask, "key"] = (
            "BR_" + subregion1_code + "_" +
            cases.loc[~subregion2_null_mask, "subregion2_code"])

        # Convert ages to int, and translate sex (no "other" sex/gender reported)
        cases["age"] = cases["age"].apply(safe_int_cast)
        cases["sex"] = (cases["sex"].str.lower().apply({
            "masculino": "male",
            "feminino": "female",
            "indefinido": None
        }.get))

        # Convert to time series format
        data = convert_cases_to_time_series(cases, index_columns=["key"])

        # Convert date to ISO format
        data["date"] = data["date"].str.slice(0, 10)
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d"))

        # Get rid of bogus records
        data = data.dropna(subset=["date"])
        data = data[data["date"] >= "2020-01-01"]

        # Aggregate for the whole state
        state = data.drop(columns=["key"]).groupby(["date", "age", "sex"
                                                    ]).sum().reset_index()
        state["key"] = "BR_" + subregion1_code

        return concat([data, state])
Example #25
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename appropriate columns
        cases = table_rename(
            dataframes[0],
            {
                "residencia_provincia_id": "subregion1_code",
                "residencia_departamento_id": "subregion2_code",
                "fecha_fallecimiento": "date_new_deceased",
                "fecha_diagnostico": "_date_diagnosed",
                "fecha_internacion": "date_new_hospitalized",
                "fecha_cui_intensivo": "date_new_intensive_care",
                "clasificacion_resumen": "_classification",
                "edad": "age",
                "sexo": "sex",
            },
            drop=True,
        )

        # As long as a case is not labeled as "suspected", assume it has been tested
        cases["date_new_tested"] = None
        suspect_mask = cases["_classification"].str.lower().str.match(
            ".*sospechoso.*")
        cases.loc[~suspect_mask,
                  "date_new_tested"] = cases.loc[~suspect_mask,
                                                 "_date_diagnosed"]

        # Get rid of all the suspected cases, since we have nothing to tally for them
        cases = cases[~suspect_mask]

        # Confirmed cases use the label "confirmado"
        cases["date_new_tested"] = None
        confirmed_mask = cases["_classification"].str.lower().str.match(
            ".*confirmado.*")
        cases.loc[confirmed_mask,
                  "date_new_confirmed"] = cases.loc[confirmed_mask,
                                                    "_date_diagnosed"]

        # Clean up the subregion codes
        cases["subregion1_code"] = cases["subregion1_code"].apply(
            lambda x: None if x == 0 else numeric_code_as_string(x, 2))
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: None if x == 0 else numeric_code_as_string(x, 3))

        # Convert subregion1_code to the corresponding ISO code
        cases["subregion1_code"] = cases["subregion1_code"].apply(
            _ISO_CODE_MAP.get)

        # Remove unnecessary columns before converting to time series
        cases = cases.drop(
            columns=[col for col in cases.columns if col.startswith("_")])

        # Go from individual case records to key-grouped records in time series format
        data = convert_cases_to_time_series(
            cases, ["subregion1_code", "subregion2_code"])

        # Parse dates to ISO format.
        data["date"] = data["date"].astype(str)

        # Aggregate by province and report that separately
        provinces = (data.drop(columns=["subregion2_code"]).groupby(
            ["subregion1_code", "date", "age", "sex"]).sum().reset_index())

        # Aggregate to the country level and report that separately
        country = (data.drop(columns=["subregion1_code"]).groupby(
            ["date", "age", "sex"]).sum().reset_index())

        # Compute the key from the subregion codes
        country["key"] = "AR"
        provinces["key"] = "AR_" + provinces["subregion1_code"]
        data["key"] = "AR_" + data["subregion1_code"] + "_" + data[
            "subregion2_code"]

        # Remove bogus values
        for df in (country, provinces, data):
            df.drop(df[df["key"].str.endswith("_")].index, inplace=True)
            for nn_col in ("date", "subregion1_code", "subregion2_code"):
                if nn_col in df.columns:
                    df.drop(df[df[nn_col].isna() | (df[nn_col] == "")].index,
                            inplace=True)

        return concat([data, provinces, country])
Example #26
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename appropriate columns
        cases = table_rename(
            dataframes[0],
            {
                "residencia_provincia_id": "subregion1_code",
                "residencia_departamento_id": "subregion2_code",
                "fecha_fallecimiento": "date_new_deceased",
                "fecha_apertura": "_date_estimate",
                "fecha_diagnostico": "date_new_tested",
                "fecha_internacion": "date_new_hospitalized",
                "fecha_cui_intensivo": "date_new_intensive_care",
                "clasificacion_resumen": "_classification",
                "edad": "age",
                "sexo": "sex",
            },
            drop=True,
        )

        # Get rid of all the suspected cases, since we have nothing to tally for them
        cases = cases[~cases["_classification"].str.lower().str.
                      match(".*sospechoso.*")]

        # Confirmed cases use the label "confirmado"
        cases["date_new_confirmed"] = None
        confirmed_mask = cases["_classification"].str.lower().str.match(
            ".*confirmado.*")
        cases.loc[confirmed_mask,
                  "date_new_confirmed"] = cases.loc[confirmed_mask,
                                                    "date_new_tested"]

        # Estimate the confirmed date when none is available
        cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[
            confirmed_mask,
            "date_new_confirmed"].fillna(cases.loc[confirmed_mask,
                                                   "_date_estimate"])

        # Only count deaths from confirmed cases
        cases.loc[~confirmed_mask, "date_new_deceased"] = None

        # Remove unnecessary columns before converting to time series
        cases = cases.drop(
            columns=[col for col in cases.columns if col.startswith("_")])

        # Clean up the subregion codes
        cases["subregion1_code"] = cases["subregion1_code"].apply(
            lambda x: numeric_code_as_string(x, 2) or "00")
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 3) or "000")

        # Go from individual case records to key-grouped records in time series format
        data = convert_cases_to_time_series(
            cases, ["subregion1_code", "subregion2_code"])

        # Parse dates to ISO format.
        data["date"] = data["date"].astype(str)

        # Aggregate to the country level and report that separately
        country = (data.drop(
            columns=["subregion1_code", "subregion2_code"]).groupby(
                ["date", "age", "sex"]).sum().reset_index())

        # Convert subregion1_code to the corresponding ISO code
        data["subregion1_code"] = data["subregion1_code"].apply(
            _ISO_CODE_MAP.get)

        # Aggregate by province and report that separately
        provinces = (data.drop(columns=["subregion2_code"]).groupby(
            ["subregion1_code", "date", "age", "sex"]).sum().reset_index())

        # Drop regions without a code
        data = data[data["subregion2_code"] != "000"]
        data.dropna(subset=["subregion1_code", "subregion2_code"],
                    inplace=True)
        provinces.dropna(subset=["subregion1_code"], inplace=True)

        # Compute the key from the subregion codes
        country["key"] = "AR"
        provinces["key"] = "AR_" + provinces["subregion1_code"]
        data["key"] = "AR_" + data["subregion1_code"] + "_" + data[
            "subregion2_code"]

        return concat([data, provinces, country])
Example #27
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        # Rename appropriate columns
        cases = table_rename(
            dataframes[0],
            {
                "ProvRes": "match_string_province",
                "RegionRes": "match_string_region",
                "DateDied": "date_new_deceased",
                "DateSpecimen": "date_new_confirmed",
                "DateRecover": "date_new_recovered",
                "daterepconf": "_date_estimate",
                "admitted": "_hospitalized",
                "removaltype": "_prognosis",
                "Age": "age",
                "Sex": "sex",
            },
            drop=True,
        )

        # When there is recovered removal, but missing recovery date, estimate it
        nan_recovered_mask = cases.date_new_recovered.isna() & (cases["_prognosis"] == "Recovered")
        cases.loc[nan_recovered_mask, "date_new_recovered"] = cases.loc[
            nan_recovered_mask, "_date_estimate"
        ]

        # When there is deceased removal, but missing death date, estimate it
        nan_deceased_mask = cases.date_new_deceased.isna() & (cases["_prognosis"] == "Died")
        cases.loc[nan_deceased_mask, "date_new_deceased"] = cases.loc[
            nan_deceased_mask, "_date_estimate"
        ]

        # Hospitalized is estimated as the same date as confirmed if admitted == yes
        cases["date_new_hospitalized"] = None
        hospitalized_mask = cases["_hospitalized"].str.lower() == "yes"
        cases.loc[hospitalized_mask, "date_new_hospitalized"] = cases.loc[
            hospitalized_mask, "date_new_confirmed"
        ]

        # Create stratified age bands
        cases["age"] = cases["age"].apply(age_group)

        # Rename the sex values
        cases["sex"] = cases["sex"].apply({"MALE": "male", "FEMALE": "female"}.get)

        # Drop columns which we have no use for
        cases = cases[[col for col in cases.columns if not col.startswith("_")]]

        # Go from individual case records to key-grouped records in a flat table
        data = convert_cases_to_time_series(
            cases, index_columns=["match_string_province", "match_string_region"]
        )

        # Convert date to ISO format
        data["date"] = data["date"].apply(safe_datetime_parse)
        data = data[~data["date"].isna()]
        data["date"] = data["date"].apply(lambda x: x.date().isoformat())
        data = data.fillna(0)

        # Aggregate regions and provinces separately
        l3 = data.rename(columns={"match_string_province": "match_string"})
        l2 = data.rename(columns={"match_string_region": "match_string"})
        l2["match_string"] = l2["match_string"].apply(lambda x: x.split(": ")[-1])

        # Ensure matching by flagging whether a record must be L2 or L3
        l2["subregion2_code"] = None
        l3["subregion2_code"] = ""

        data = concat([l2, l3]).dropna(subset=["match_string"])
        data["country_code"] = "PH"

        # Remove bogus records
        data = data[data["match_string"].notna()]
        data = data[data["match_string"] != ""]
        data = data[data["match_string"] != "REPATRIATE"]
        data = data[data["match_string"] != "CITY OF ISABELA (NOT A PROVINCE)"]
        data = data[data["match_string"] != "COTABATO CITY (NOT A PROVINCE)"]

        return data
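One detail worth calling out above: the region field is cleaned with split(": ")[-1], which assumes raw values shaped like "<prefix>: <name>" and leaves values without the separator unchanged (split returns the whole string as its only element). A quick illustration with made-up inputs:

# Made-up raw values showing the region-name cleanup: the prefix before
# ": " is dropped, and values without the separator pass through as-is.
raw_regions = ["REGION IV-A: CALABARZON", "NCR", "REGION VII: CENTRAL VISAYAS"]
print([x.split(": ")[-1] for x in raw_regions])
# ['CALABARZON', 'NCR', 'CENTRAL VISAYAS']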
Example #28
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        cases = table_rename(
            dataframes[0],
            {
                "classificação_final": "confirmed_label",
                "dt_notific": "date_new_confirmed",
                # "dt_inicio_sintomas": "_date_onset",
                "bairro_resid__estadia": "match_string",
                # "ap_residencia_estadia": "_health_department_code",
                "sexo": "sex",
                "faixa_etária": "age",
                "evolução": "_state_label",
                "dt_óbito": "date_new_deceased",
                "raça/cor": "ethnicity",
                "Data_atualização": "_date_updated",
            },
            drop=True,
        )

        # Currently active cases are those labeled "ativo"; record them with the report's date
        cases["date_current_confirmed"] = None
        report_date = cases["_date_updated"].iloc[0]
        cases.loc[cases["_state_label"] == "ativo",
                  "date_current_confirmed"] = report_date

        # Drop columns which we have no use for
        cases = cases[[
            col for col in cases.columns if not col.startswith("_")
        ]]

        # Age is already in buckets
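        # e.g. "De 10 a 19" becomes "10-19"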
        cases["age"] = cases["age"].apply(
            lambda x: x.replace("De ", "").replace(" a ", "-"))

        # Make all unknown ages null
        cases.loc[cases["age"].str.contains("N/D"), "age"] = None

        # Ethnicity needs translation
        cases["ethnicity"] = cases["ethnicity"].apply(lambda x: {
            "preta": "black",
            "parda": "mixed",
            "branca": "white"
        }.get(str(x).lower(), "unknown"))

        data = convert_cases_to_time_series(cases,
                                            index_columns=["match_string"])
        data["country_code"] = "BR"
        data["subregion1_code"] = "RJ"

        # Convert date to ISO format
        # This data source keeps switching between YYYY and YY for the year, so try both
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(
            x, "%d/%m/%Y") or datetime_isoformat(x, "%d/%m/%y"))

        # The sum of all districts is the metropolitan area of Rio
        metro = data.groupby(["date", "age", "sex",
                              "ethnicity"]).sum().reset_index()
        metro["key"] = "BR_RJ_3304557"

        # Rio is both a subregion of the state and a "locality"
        city = metro.copy()
        city["key"] = "BR_RJ_GIG"

        # Remove bogus data
        data = data[data.match_string != "INDEFINIDO"]
        data = data[data.match_string != "FORA DO MUNICÍPIO"]

        # Return only city-level data for now
        # TODO(owahltinez): add the rest of the data once statewide districts are reported
        # return concat([city, metro, data])
        return city
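The two-format date parse works because datetime_isoformat returns None on failure, so the "or" expression falls through to the two-digit-year format. The stand-in below (named parse_date to avoid confusion with the real helper, which lives elsewhere in this codebase) mimics that contract; the year >= 1000 guard is demo-only, because strptime's %Y happily accepts two-digit years and would otherwise swallow the fallback.

# Stand-in sketch of a datetime_isoformat-style helper: parse with a given
# format, return the ISO date string, or None on failure.
from datetime import datetime

def parse_date(value, fmt):
    try:
        parsed = datetime.strptime(value, fmt)
        # Demo-only sanity check: %Y accepts "20" as year 20, which would
        # prevent the %y fallback from ever running.
        return parsed.date().isoformat() if parsed.year >= 1000 else None
    except (TypeError, ValueError):
        return None

for raw in ("21/06/2020", "21/06/20"):
    print(parse_date(raw, "%d/%m/%Y") or parse_date(raw, "%d/%m/%y"))
# 2020-06-21
# 2020-06-21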
Example #29
    def test_convert_cases_to_time_series_age_negative_values(self):
        cases = read_csv(StringIO(CASE_LINE_DATA_AGE_NEGATIVE))
        table = convert_cases_to_time_series(cases)
        self.assertSetEqual({"age_unknown"}, set(table.age))
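The assertion above pins down how the converter treats nonsense ages: they collapse into an "age_unknown" bucket instead of being dropped or raising. A hypothetical guard with that behavior might look like the sketch below; bucket_age is not the pipeline's helper, just an illustration.

# Hypothetical sketch of the behavior the test asserts: invalid or negative
# ages map to "age_unknown", valid ones to a ten-year bucket.
def bucket_age(age):
    try:
        age = int(age)
    except (TypeError, ValueError):
        return "age_unknown"
    if age < 0:
        return "age_unknown"
    low = (age // 10) * 10
    return f"{low}-{low + 9}"

print(bucket_age(-1))    # age_unknown
print(bucket_age("34"))  # 30-39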
Example #30
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        cases = table_rename(
            dataframes[0],
            {
                "sexoPaciente": "sex",
                "idadePaciente": "age",
                "codigoMunicipioPaciente": "subregion2_code",
                "dataResultadoExame": "date_new_tested",
                "dataObito": "date_new_deceased",
                "dataEntradaUtisSvep": "date_new_intensive_care",
                "evolucaoCasoSivep": "_prognosis",
                "dataInicioSintomas": "_date_onset",
                "dataEvolucaoCasoSivep": "_date_update",
                "resultadoFinalExame": "_test_result",
            },
            drop=True,
        )

        # Follow the procedure described in the data documentation to compute the confirmed cases:
        # https://drive.google.com/file/d/1DUwST2zcXUnCJmJauiM5zmpSVWqLiAYI/view
        cases["date_new_confirmed"] = None
        confirmed_mask = cases["_test_result"] == "Positivo"
        cases.loc[confirmed_mask,
                  "date_new_confirmed"] = cases.loc[confirmed_mask,
                                                    "date_new_tested"]

        # Only count intensive care patients if they had a positive test result
        cases.loc[~confirmed_mask, "date_new_intensive_care"] = None

        # Drop columns which we have no use for
        cases = cases[[
            col for col in cases.columns if not col.startswith("_")
        ]]

        # Make sure our region code is of type str
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: None if isna(x) else str(safe_int_cast(x)))

        # Convert ages to int, and translate sex (no "other" sex/gender reported)
        cases["age"] = cases["age"].apply(safe_int_cast)
        cases["sex"] = cases["sex"].apply({
            "MASCULINO": "male",
            "FEMENINO": "female"
        }.get)

        # Convert to time series format
        data = convert_cases_to_time_series(cases,
                                            index_columns=["subregion2_code"])

        # Convert date to ISO format
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

        # Aggregate state-level data by adding all municipalities
        state = data.drop(columns=["subregion2_code"]).groupby(
            ["date", "age", "sex"]).sum()
        state.reset_index(inplace=True)
        state["key"] = "BR_CE"

        # Fortaleza is both a subregion of the state and a "locality"
        city = data.loc[data["subregion2_code"] == "230440"].copy()
        city["key"] = "BR_CE_FOR"

        # Drop bogus records from the data
        data = data[~data["subregion2_code"].isna()
                    & (data["subregion2_code"] != "")]

        # We can build the key for the data directly from the subregion code
        data["key"] = "BR_CE_" + data["subregion2_code"]

        return concat([state, data, city])
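The lambda that stringifies subregion2_code has to survive both float-typed codes (a common CSV parsing artifact, e.g. 230440.0) and missing values. The sketch below uses a stand-in safe_int_cast with the same assumed contract as the pipeline's helper: best-effort integer conversion that returns None instead of raising.

# Sketch of the subregion2_code cleanup with a stand-in safe_int_cast.
from pandas import isna

def safe_int_cast(value):
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return None

for raw in (230440.0, "230440", float("nan")):
    print(None if isna(raw) else str(safe_int_cast(raw)))
# 230440
# 230440
# None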