Example #1
0
def _process_partition(cases: DataFrame) -> DataFrame:
    """Turn one partition of vaccination records into keyed time series.

    The input is left untouched; all work happens on a copy.

    Args:
        cases: Individual records. The columns read here are
            `_dose_information`, `date_new_vaccine_doses_administered`,
            `subregion1_code`, `subregion2_code`, `age` and `sex`.

    Returns:
        The concatenation of three tables: country-level rows (key `BR`),
        state-level rows (key `BR_<subregion1>`) and rows for records with a
        non-empty `subregion2_code` (key `BR_<subregion1>_<subregion2>`).
    """
    records = cases.copy()

    # The first character of the stripped dose description distinguishes first
    # doses ("1", partial immunization) from second doses ("2", full immunization)
    dose_digit = records["_dose_information"].str.strip().str.slice(0, 1)
    dose_columns = (
        ("1", "date_new_persons_vaccinated"),
        ("2", "date_new_persons_fully_vaccinated"),
    )
    for _, column in dose_columns:
        records[column] = None
    for digit, column in dose_columns:
        is_dose = dose_digit == digit
        records.loc[is_dose, column] = records.loc[
            is_dose, "date_new_vaccine_doses_administered"]

    # Columns prefixed with an underscore are scratch data and are not kept
    public_columns = [
        name for name in records.columns if not name.startswith("_")
    ]
    records = records[public_columns]

    # Region codes go through safe_int_cast and are then stored as str
    region_codes = records["subregion2_code"].apply(safe_int_cast)
    records["subregion2_code"] = region_codes.astype(str)

    # Ages become ints; sex is translated (no "other" sex/gender reported)
    records["age"] = records["age"].apply(safe_int_cast)
    sex_labels = {"m": "male", "f": "female"}
    records["sex"] = records["sex"].str.lower().apply(sex_labels.get)

    # Pivot the individual records into time series format
    data = convert_cases_to_time_series(
        records, index_columns=["subregion1_code", "subregion2_code"])

    # Keep only the first 10 characters of the date, then convert to ISO format
    data["date"] = data["date"].str.slice(0, 10).apply(
        lambda value: datetime_isoformat(value, "%Y-%m-%d"))

    # Discard bogus records: missing dates, dates before 2020, and dates on or
    # after date_today(offset=1)
    data = data.dropna(subset=["date"])
    recent_enough = data["date"] >= "2020-01-01"
    data = data[recent_enough]
    not_in_future = data["date"] < date_today(offset=1)
    data = data[not_in_future]

    # Country-level aggregate
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "BR"

    # State-level aggregate: sum everything once the subregion2 code is removed
    without_subregion2 = data.drop(columns=["subregion2_code"])
    state_groups = without_subregion2.groupby(
        ["date", "subregion1_code", "age", "sex"])
    state = state_groups.sum().reset_index()
    state["key"] = "BR_" + state["subregion1_code"]

    # For the remaining rows the key is derived from subregion1 + subregion2
    has_subregion2 = data["subregion2_code"].notna() & (
        data["subregion2_code"] != "")
    data = data[has_subregion2]
    data["key"] = ("BR_" + data["subregion1_code"] + "_" +
                   data["subregion2_code"])

    return concat([country, state, data])
    def parse_dataframes(self, dataframes: Dict[Any, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        """Produce country-level and region-level rows for Italy.

        Args:
            dataframes: Downloaded tables; only the entry under key `0` is used.
            aux: Auxiliary tables (not used by this method).
            **parse_opts: Extra parsing options (not used by this method).

        Returns:
            Country-level rows (key `IT`) followed by the rows that have a
            non-null `subregion1_code`.
        """
        renamed = table_rename(dataframes[0], _column_adapter, drop=True)
        data = renamed.sort_values("date")

        # Convert from the ITSA codes to our region codes
        converted_codes = data["subregion1_code"].apply(
            _subregion1_code_converter)
        data["subregion1_code"] = converted_codes

        # The country total is computed before filtering because some of the
        # codes are null (04 indicates either BZ/TN)
        country = aggregate_admin_level(data, ["date"], "country")
        country["key"] = "IT"

        # Keep only rows that can be matched with an IT subregion
        has_region = data["subregion1_code"].notna()
        data = data[has_region]
        data["country_code"] = "IT"
        for empty_column in ("subregion2_code", "locality_code"):
            data[empty_column] = None

        return concat([country, data])
Example #3
0
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        """Build case time series for Thailand from a JSON dump of individual cases.

        Args:
            sources: Paths of downloaded files; the file under key `0` is read.
            aux: Auxiliary tables (not used by this method).
            **parse_opts: Extra parsing options (not used by this method).

        Returns:
            Country-level rows (key `TH`) followed by per-province rows, which
            carry `country_code` and `match_string` and exclude the "Unknown"
            province.
        """
        with open(sources[0], "r") as fd:
            payload = json.load(fd)

        # Each record under "Data" looks like:
        # {"ConfirmDate":"2021-01-09 00:00:00","No":"9876","Age":66,"Gender":"\u0e0a",
        #  "GenderEn":"Male","Nation":"Thailand","NationEn":"Thailand","Province":"\u0e2d",
        #  "ProvinceId":72,"District":"\u0e44","ProvinceEn":"Ang Thong","Detail":null,
        #  "StatQuarantine":1}
        column_adapter = {
            "ConfirmDate": "date_new_confirmed",
            "Age": "age",
            "GenderEn": "sex",
            "ProvinceEn": "match_string",
        }
        cases = table_rename(DataFrame.from_records(payload["Data"]), column_adapter, drop=True)

        # Dates are reduced to their first 10 characters (the ISO date part)
        date_columns = [name for name in cases.columns if name.startswith("date_")]
        for name in date_columns:
            cases[name] = cases[name].str.slice(0, 10)

        # Normalize sex labels, then tag missing age and sex values as unknown
        sex_labels = {"male": "male", "female": "female"}
        cases["sex"] = cases["sex"].str.lower().apply(sex_labels.get)
        cases["sex"] = cases["sex"].fillna("sex_unknown")
        cases["age"] = cases["age"].fillna("age_unknown")

        # Pivot individual cases into time series data
        data = convert_cases_to_time_series(cases, ["match_string"])

        # The country aggregate is taken before the "Unknown" province is dropped
        country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
        country["key"] = "TH"

        # Tag the province rows with the country code and drop the unknown province
        data["country_code"] = "TH"
        known_province = data["match_string"].ne("Unknown")
        data = data[known_province]

        return concat([country, data])
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        """Build country and state level case time series for India.

        All input tables are concatenated, normalized into individual case
        records and pivoted into time series.

        Args:
            dataframes: Downloaded tables; every value is concatenated.
            aux: Auxiliary tables (not used by this method).
            **parse_opts: Extra parsing options (not used by this method).

        Returns:
            Country-level rows followed by admin level 1 rows, excluding the
            "state unassigned" pseudo-state.
        """
        cases = table_rename(
            concat(dataframes.values()),
            {
                # "Patient Number": "",
                # "State Patient Number": "",
                "Date Announced": "date_new_confirmed",
                # "Estimated Onset Date": "",
                "Age Bracket": "age",
                "Gender": "sex",
                # "Detected City": "",
                "Detected District": "subregion2_name",
                "Detected State": "subregion1_name",
                # "State code": "subregion1_code",
                "Current Status": "_prognosis",
                # "Notes": "",
                # "Contracted from which Patient (Suspected)": "",
                # "Nationality": "",
                # "Type of transmission": "",
                "Status Change Date": "_change_date",
                # "Source_1": "",
                # "Source_2": "",
                # "Source_3": "",
                # "Backup Notes": "",
                "Num Cases": "new_confirmed",
                "Entry_ID": "",
            },
            drop=True,
        )

        # Convert dates to ISO format (this includes `_change_date`)
        for col in [col for col in cases.columns if "date" in col]:
            cases[col] = cases[col].apply(
                lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Normalize ages. The patterns below are regular expressions, so
        # `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern
        # as a literal string by default, which would silently skip these
        # replacements.
        cases["age"] = cases["age"].astype(str).str.lower()
        cases["age"] = cases["age"].str.replace(r"\.0", "", regex=True)
        # Ages given in days or months are replaced by "1"
        cases["age"] = cases["age"].str.replace(
            r"[\d\.]+ day(s)?", "1", regex=True)
        cases["age"] = cases["age"].str.replace(
            r"[\d\.]+ month(s)?", "1", regex=True)
        # Ages containing a dash (e.g. ranges) are discarded
        cases.loc[cases["age"].str.contains("-"), "age"] = None

        # Translate sex labels; anything other than M/F becomes "sex_unknown"
        sex_labels = {"M": "male", "F": "female"}
        cases["sex"] = cases["sex"].str.strip()
        cases["sex"] = cases["sex"].apply(
            lambda x: sex_labels.get(x, "sex_unknown"))

        # The status change date is the event date for deceased cases...
        cases["date_new_deceased"] = None
        deceased_mask = cases["_prognosis"] == "Deceased"
        cases.loc[deceased_mask,
                  "date_new_deceased"] = cases.loc[deceased_mask,
                                                   "_change_date"]

        # ...and for hospitalized cases
        cases["date_new_hospitalized"] = None
        hosp_mask = cases["_prognosis"] == "Hospitalized"
        cases.loc[hosp_mask,
                  "date_new_hospitalized"] = cases.loc[hosp_mask,
                                                       "_change_date"]

        data = convert_cases_to_time_series(
            cases, ["subregion1_name", "subregion2_name"])
        data["country_code"] = "IN"

        # Aggregate country level and admin level 1
        country = aggregate_admin_level(data, ["date", "age", "sex"],
                                        "country")
        subregion1 = aggregate_admin_level(data, ["date", "age", "sex"],
                                           "subregion1")
        subregion1 = subregion1[
            subregion1["subregion1_name"].str.lower() != "state unassigned"]

        # Data for admin level 2 is too noisy and there are many mismatches, so we only return
        # the aggregated country level and admin level 1 data
        return concat([country, subregion1])
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        """Build vaccination time series for Argentina at three admin levels.

        Each input row is counted as one first or second dose, per-manufacturer
        copies of the counters are added, and the result is summed by date,
        region, sex and age.

        Args:
            dataframes: Downloaded tables; only the entry under key `0` is used.
            aux: Auxiliary tables (not used by this method).
            **parse_opts: Extra parsing options (not used by this method).

        Returns:
            Admin level 2 rows (key `AR_<subregion1>_<subregion2>`), then admin
            level 1 rows (key `AR_<subregion1>`), then country rows (key `AR`).
        """

        # Rename appropriate columns
        data = table_rename(
            dataframes[0],
            {
                "sexo": "sex",
                "grupo_etario": "age",
                # "jurisdiccion_residencia": "",
                "jurisdiccion_residencia_id": "subregion1_code",
                # "depto_residencia": "",
                "depto_residencia_id": "subregion2_code",
                # "jurisdiccion_aplicacion": "",
                # "jurisdiccion_aplicacion_id": "",
                # "depto_aplicacion": "",
                # "depto_aplicacion_id": "",
                "fecha_aplicacion": "date",
                "vacuna": "_manufacturer",
                # "condicion_aplicacion": "",
                "orden_dosis": "_dose_number",
                # "lote_vacuna": "",
            },
            drop=True,
        )

        # Dates are used as plain strings
        data["date"] = data["date"].astype(str)

        # Parse sex label into proper name
        data["sex"] = data["sex"].apply({"M": "male", "F": "female"}.get)

        # Parse the dose number assuming all vaccines have a 2-dose schedule.
        # Rows whose dose number is neither 1 nor 2 contribute zero to all
        # three counters.
        data["new_persons_vaccinated"] = data["_dose_number"].apply(
            lambda x: 1 if x == 1 else 0)
        data["new_persons_fully_vaccinated"] = data["_dose_number"].apply(
            lambda x: 1 if x == 2 else 0)
        data["new_vaccine_doses_administered"] = (
            data["new_persons_vaccinated"] +
            data["new_persons_fully_vaccinated"])

        # Add a column for each vaccine manufacturer. Missing manufacturer
        # values are skipped: they match no rows and have no name from which a
        # column suffix could be derived.
        counter_cols = [
            "new_persons_vaccinated",
            "new_persons_fully_vaccinated",
            "new_vaccine_doses_administered",
        ]
        for manufacturer in data["_manufacturer"].dropna().unique():
            mask = data["_manufacturer"] == manufacturer
            brand_name = manufacturer.lower()
            for col in counter_cols:
                new_col = f"{col}_{brand_name}"
                data[new_col] = None
                data.loc[mask, new_col] = data.loc[mask, col]

        # Clean up the subregion codes, falling back to an all-zero code
        data["subregion1_code"] = data["subregion1_code"].apply(
            lambda x: numeric_code_as_string(x, 2) or "00")
        data["subregion2_code"] = data["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 3) or "000")

        # Convert subregion1_code to the corresponding ISO code
        data["subregion1_code"] = data["subregion1_code"].apply(
            _ISO_CODE_MAP.get)

        # Group by indexable columns
        idx_cols = ["date", "subregion1_code", "subregion2_code", "sex", "age"]
        data = data.groupby(idx_cols).sum().reset_index()

        # Aggregate country level and admin level 1
        country = aggregate_admin_level(data, ["date", "age", "sex"],
                                        "country")
        subregion1 = aggregate_admin_level(data, ["date", "age", "sex"],
                                           "subregion1")

        # Drop regions without a code. The filtered frame is copied so that the
        # key column below is written to an independent frame, not to a slice.
        has_subregion2 = data["subregion2_code"] != "000"
        subregion2 = data[has_subregion2].dropna(
            subset=["subregion1_code", "subregion2_code"]).copy()
        subregion1.dropna(subset=["subregion1_code"], inplace=True)

        # Compute the key from the subregion codes, using each frame's own
        # columns
        country["key"] = "AR"
        subregion1["key"] = "AR_" + subregion1["subregion1_code"]
        subregion2["key"] = ("AR_" + subregion2["subregion1_code"] + "_" +
                             subregion2["subregion2_code"])

        return concat([subregion2, subregion1, country])