Example #1
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        data = table_rename(
            dataframes[0],
            {
                "e(0)": "life_expectancy",
                "STATE2KX": "state_code",
                "CNTY2KX": "county_code"
            },
            drop=True,
        )

        # Derive the FIPS subregion code from state and county codes
        data["state_code"] = data["state_code"].apply(
            lambda x: numeric_code_as_string(x, 2))
        data["county_code"] = data["county_code"].apply(
            lambda x: numeric_code_as_string(x, 3))
        data["subregion2_code"] = data["state_code"] + data["county_code"]

        # Data is more granular than county level, so use a crude average as the estimate
        data = (data.drop(columns=["state_code", "county_code"]).groupby(
            "subregion2_code").mean().reset_index())

        # Add country code to all records and return
        data["country_code"] = "US"
        return data
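
Every example here leans on numeric_code_as_string to zero-pad numeric region codes. The helper itself is not shown on this page; below is a minimal sketch of the assumed behavior (zero-padding, None on unparseable input, which would explain the `or "00"` fallbacks in later examples):

from typing import Any, Optional

def numeric_code_as_string(value: Any, digits: int) -> Optional[str]:
    """Assumed behavior: zero-pad a numeric code to `digits` characters,
    returning None when the value cannot be parsed."""
    try:
        code = int(float(value))
    except (TypeError, ValueError):
        return None
    if code < 0:
        return None
    return str(code).zfill(digits)

Under this sketch, numeric_code_as_string(8, 2) yields "08" and numeric_code_as_string("bogus", 2) yields None.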
Example #2
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = table_multimerge(
            [
                table_rename(
                    dataframes["confirmed"],
                    {
                        "Fecha": "date",
                        "Casos confirmados": "new_confirmed",
                        "Codigo region": "subregion1_code",
                        "Codigo comuna": "subregion2_code",
                    },
                    drop=True,
                ),
                table_rename(
                    dataframes["deceased"],
                    {
                        "Fecha": "date",
                        "Casos fallecidos": "total_deceased",
                        "Codigo region": "subregion1_code",
                        "Codigo comuna": "subregion2_code",
                    },
                    drop=True,
                ),
            ],
            how="outer",
        )

        # Convert date to ISO format
        data["date"] = data["date"].astype(str)

        # Parse region codes as strings
        data["subregion1_code"] = data["subregion1_code"].apply(
            lambda x: numeric_code_as_string(x, 2)
        )
        data["subregion2_code"] = data["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 5)
        )

        # Use proper ISO codes for the subregion1 level
        data["subregion1_code"] = data["subregion1_code"].apply(_SUBREGION1_CODE_MAP.get)

        # Extract cities from the municipalities
        city = _extract_cities(data)

        # We can build the key for the data directly from the subregion codes
        data["key"] = "CL_" + data["subregion1_code"] + "_" + data["subregion2_code"]

        # Drop bogus records from the data
        data.dropna(subset=["subregion1_code", "subregion2_code"], inplace=True)

        return concat([data, city])
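
table_rename appears in nearly every parser with a drop=True flag; a plausible sketch, assuming drop discards any column the adapter does not map:

from typing import Dict
from pandas import DataFrame

def table_rename(data: DataFrame, column_adapter: Dict[str, str],
                 drop: bool = False) -> DataFrame:
    """Assumed behavior: rename columns via the adapter and, when drop is
    set, keep only the columns the adapter maps to."""
    data = data.rename(columns=column_adapter)
    if drop:
        data = data[[col for col in data.columns
                     if col in column_adapter.values()]]
    return data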
Example #3
def _process_partition(cases: DataFrame) -> DataFrame:
    cases = cases.copy()

    # Extract information about whether doses were first (partial immunization) or second (full)
    cases["date_new_persons_vaccinated"] = None
    cases["date_new_persons_fully_vaccinated"] = None
    first_dose_mask = cases["_dose_information"].str.strip().str.slice(
        0, 1) == "1"
    second_dose_mask = cases["_dose_information"].str.strip().str.slice(
        0, 1) == "2"
    cases.loc[first_dose_mask, "date_new_persons_vaccinated"] = cases.loc[
        first_dose_mask, "date_new_vaccine_doses_administered"]
    cases.loc[second_dose_mask,
              "date_new_persons_fully_vaccinated"] = cases.loc[
                  second_dose_mask, "date_new_vaccine_doses_administered"]

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 6))

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({
        "m": "male",
        "f": "female"
    }.get)

    # Convert to time series format
    data = convert_cases_to_time_series(
        cases, index_columns=["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(
        lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "BR"

    # Aggregate data by state
    state = (data.drop(columns=["subregion2_code"]).groupby(
        ["date", "subregion1_code", "age", "sex"]).sum().reset_index())
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2
    data = data[data["subregion2_code"].notna()
                & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data[
        "subregion2_code"]

    return concat([country, state, data])
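
convert_cases_to_time_series is used above to turn one-row-per-case records into daily counts. A rough sketch, assuming each date_<statistic> column marks the date a case contributed to that statistic:

from pandas import DataFrame, concat

def convert_cases_to_time_series(cases: DataFrame,
                                 index_columns: list) -> DataFrame:
    """Assumed behavior: melt every date_<statistic> column into a daily
    count keyed by the index columns plus age and sex."""
    keys = list(index_columns) + ["age", "sex"]
    tables = []
    for col in [c for c in cases.columns if c.startswith("date_")]:
        statistic = col[len("date_"):]  # e.g. "new_confirmed"
        counts = (cases.dropna(subset=[col])
                  .groupby(keys + [col]).size()
                  .reset_index(name=statistic)
                  .rename(columns={col: "date"}))
        tables.append(counts.set_index(keys + ["date"]))
    # Outer-join the per-statistic tables on the shared index
    return concat(tables, axis=1).fillna(0).reset_index()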
Example #4
def _process_state(data: DataFrame) -> DataFrame:
    data["date"] = data["date"].apply(lambda x: str(x)[:10])
    data["subregion2_code"] = data["fips_code"].apply(
        lambda x: numeric_code_as_string(x, 5))
    data["key"] = "US_" + data["state"] + "_" + data["subregion2_code"]
    data.drop(
        columns=[
            "subregion2_code",
            "state",
            "fips_code",
            "county",
            "report_date_window_end",
            "report_date_window_start",
        ],
        inplace=True,
    )

    # Make sure the data is properly sorted, since we need to compute diffs
    data.sort_values(["key", "date"], inplace=True)

    # Get a mapping between rolling average column names and their daily counterparts
    col_prefixes = (
        "new_cases",
        "new_deaths",
        "new_test_results_reported",
        "admissions_covid_confirmed",
    )
    rolling_suffix = "_7_day_rolling_average"
    rolling_columns_map = {
        col + rolling_suffix: col.replace(rolling_suffix, "")
        for col in col_prefixes
    }

    # Seed the daily versions of the columns with empty values
    for name in rolling_columns_map.values():
        data[name] = None

    # Convert the rolling average columns to daily values one key at a time
    # This could probably be done with a clever grouping function instead, but
    # doing it iteratively is fast enough and works reliably.
    for key in pbar(data["key"].unique(),
                    desc="Computing daily values from rolling means"):
        mask = data["key"] == key
        for col, name in rolling_columns_map.items():
            subset = data.loc[mask, col].dropna()
            data.loc[subset.index, name] = recover_from_rolling_mean(subset, 7)

    # Get rid of unnecessary columns now that we have the daily values
    data.drop(columns=rolling_columns_map.keys(), inplace=True)

    return table_rename(
        data,
        {
            "new_cases": "new_confirmed",
            "new_deaths": "new_deceased",
            "new_test_results_reported": "new_tested",
            "admissions_covid_confirmed": "new_hospitalized",
        },
    )
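
recover_from_rolling_mean inverts the published 7-day averages back into daily values. If r[t] is the rolling mean over a window w, then w*r[t] - w*r[t-1] = x[t] - x[t-w], so daily values can be recovered recursively. A sketch under that identity, ignoring edge effects at the start of the series:

import numpy as np
from pandas import Series

def recover_from_rolling_mean(rolling: Series, window: int) -> Series:
    """Assumed behavior: invert a rolling mean via
    x[t] = w*(r[t] - r[t-1]) + x[t-w], treating values before the
    series start as zero."""
    totals = (rolling * window).to_numpy(dtype=float)
    daily = np.zeros_like(totals)
    for t in range(len(totals)):
        prev_total = totals[t - 1] if t > 0 else 0.0
        prev_daily = daily[t - window] if t >= window else 0.0
        daily[t] = totals[t] - prev_total + prev_daily
    return Series(daily, index=rolling.index)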
Example #5
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        cases = table_rename(dataframes[0], _srag_column_adapter, drop=True)
        covid_mask = cases["_classification"] == 5
        valid_mask = cases["_prognosis"].notna() & (cases["_prognosis"] != 9)
        cases = cases[covid_mask & valid_mask]

        # Record the date of death
        cases["date_new_deceased"] = None
        deceased_mask = cases["_prognosis"] == 2
        cases.loc[deceased_mask,
                  "date_new_deceased"] = cases.loc[deceased_mask,
                                                   "_date_prognosis"]

        # Convert ages to int, and translate sex (no "other" sex/gender reported)
        cases["age"] = cases["age"].apply(safe_int_cast)
        cases["sex"] = cases["sex"].apply({"M": "male", "F": "female"}.get)

        # Convert all dates to ISO format
        for col in filter(lambda x: x.startswith("date"), cases.columns):
            cases[col] = cases[col].apply(
                lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Parse subregion codes
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 5))

        # Convert to time series format
        data = convert_cases_to_time_series(cases,
                                            index_columns=["subregion2_code"])
        data["country_code"] = "BR"

        # Get rid of bogus records
        data = data.dropna(subset=["date"])
        data = data[data["date"] >= "2020-01-01"]
        data = data[data["date"] < date_today(offset=1)]

        # Aggregate by country level
        country = (data.drop(columns=["subregion2_code"]).groupby(
            ["date", "age", "sex"]).sum().reset_index())
        country["key"] = "BR"

        # Aggregate by state level
        data["subregion1_code"] = data["subregion2_code"].apply(
            lambda x: _IBGE_STATES.get(safe_int_cast(x[:2])))
        state = (data.drop(columns=["subregion2_code"]).dropna(
            subset=["subregion1_code"]).groupby(
                ["date", "subregion1_code", "age", "sex"]).sum().reset_index())
        state["key"] = "BR_" + state["subregion1_code"]

        # Derive the key from subregion codes
        data = data[data["subregion2_code"].notna()]
        data["key"] = "BR_" + data["subregion1_code"] + "_" + data[
            "subregion2_code"]

        return concat([country, state, data])
Example #6
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = table_rename(
            dataframes[0],
            {
                "TipusCasData": "date",
                # "ComarcaCodi": "comarca_code",
                # "ComarcaDescripcio": "comarca_name",
                "MunicipiCodi": "subregion2_code",
                "MunicipiDescripcio": "subregion2_name",
                "SexeCodi": "sex",
                # "SexeDescripcio": "sex",
                "TipusCasDescripcio": "_case_type",
                "NumCasos": "new_confirmed",
            },
            drop=True,
        )

        # Remove "suspect" cases
        data = data[data["_case_type"] != "Sospitós"].drop(
            columns=["_case_type"])

        # Use placeholder code for unknown values
        data.loc[data["subregion2_code"].isna(), "subregion2_code"] = "00000"

        # Region codes need cleaning up to match INE codes
        data["subregion2_code"] = data["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 5))

        # Derive key from subregion code
        data["key"] = "ES_CT_" + data["subregion2_code"]

        # Parse sex, date and numeric values
        sex_adapter = {"0": "male", "1": "female"}
        data["sex"] = data["sex"].apply(
            lambda x: sex_adapter.get(x, "sex_unknown"))
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%d/%m/%Y"))
        data["new_confirmed"] = data["new_confirmed"].apply(safe_int_cast)

        # Aggregate manually since some municipalities are clumped together if they are too small
        ccaa = data.drop(columns=["subregion2_code"]).groupby(
            ["date", "sex"]).sum().reset_index()
        ccaa["key"] = "ES_CT"

        # Remove unnecessary data
        data = data[data["key"] != "ES_CT_00000"]

        return concat([ccaa, data])
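
Several of these parsers normalize dates with datetime_isoformat(x, "%d/%m/%Y"). The assumed contract is strptime-parse-then-ISO, returning None on failure so that bogus records can be dropped later:

from datetime import datetime
from typing import Any, Optional

def datetime_isoformat(value: Any, date_format: str) -> Optional[str]:
    """Assumed behavior: parse value with date_format and return an
    ISO 8601 date string, or None when parsing fails."""
    try:
        return datetime.strptime(str(value), date_format).date().isoformat()
    except ValueError:
        return None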
Example #7
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = table_rename(
            dataframes[0],
            {
                "data": "date",
                "estado": "subregion1_code",
                "codmun": "subregion2_code",
                "municipio": "subregion2_name",
                "casosNovos": "new_confirmed",
                "obitosNovos": "new_deceased",
                "casosAcumulado": "total_confirmed",
                "obitosAcumulado": "total_deceased",
                "Recuperadosnovos": "total_recovered",
            },
            drop=True,
        )

        # Convert date to ISO format
        data["date"] = data["date"].astype(str)

        # Parse region codes as strings
        data["subregion2_code"] = data["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 6)
        )

        # Country-level data has null state
        data["key"] = None
        country_mask = data["subregion1_code"].isna()
        data.loc[country_mask, "key"] = "BR"

        # State-level data has null municipality
        state_mask = data["subregion2_code"].isna()
        data.loc[~country_mask & state_mask, "key"] = "BR_" + data["subregion1_code"]

        # We can derive the key from subregion1 + subregion2
        data.loc[~country_mask & ~state_mask, "key"] = (
            "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]
        )

        # Drop bogus data
        data = data[data["subregion2_code"].str.slice(-4) != "0000"]

        return data
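
The three masks above partition the rows by administrative level; for illustration (values assumed, not taken from the source), the resulting keys look like:

# subregion1_code  subregion2_code  ->  key
# NaN              NaN              ->  "BR"            (country row)
# "SP"             NaN              ->  "BR_SP"         (state row)
# "SP"             "355030"         ->  "BR_SP_355030"  (municipality row)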
Example #8
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = table_rename(
            dataframes[0],
            {
                "fecha_informe": "date",
                "municipio_distrito": "subregion2_name",
                "codigo_geometria": "subregion2_code",
                "casos_confirmados_totales": "total_confirmed",
            },
            drop=True,
        )

        # Use placeholder code for unknown values
        data.loc[data["subregion2_code"].isna(), "subregion2_code"] = "000000"

        # Region codes need cleaning up to match INE codes
        data["subregion2_code"] = data["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 6)
        )
        data["subregion2_code"] = data["subregion2_code"].apply(
            lambda x: "28" + ("079" + x[4:] if x.startswith("079") else x[2:5] + x[6:])
        )

        data["key"] = "ES_MD_" + data["subregion2_code"]
        data = data.drop(columns=["subregion2_code"])

        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x[:10], "%Y/%m/%d"))
        data["total_confirmed"] = data["total_confirmed"].apply(safe_int_cast)

        # Aggregate the entire autonomous community
        l1 = data.drop(columns=["key", "subregion2_name"]).groupby("date").sum().reset_index()
        l1["key"] = "ES_MD"

        # Sometimes the subregion code is not properly formatted, so we may need to do a string match
        data["country_code"] = "ES"
        data["subregion1_code"] = "MD"
        data["subregion2_name"] = data["subregion2_name"].str.replace("Madrid-", "")
        data.loc[data["key"] == "ES_MD_28000", "key"] = None

        return concat([data, l1])
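
safe_int_cast is the numeric counterpart of the date parser above; a minimal sketch assuming it swallows parse errors and returns None:

from typing import Any, Optional

def safe_int_cast(value: Any) -> Optional[int]:
    """Assumed behavior: best-effort integer conversion, None on failure."""
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return None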
Example #9
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        # Data is nested into multiple sheets
        tables = []
        for df in list(dataframes[0].values())[1:]:
            # Header has two rows, but we ignore them and use our own columns anyway
            df.columns = _columns
            df = df.iloc[2:].copy()

            # Make sure subregion code is numeric
            apply_func = lambda x: numeric_code_as_string(x, 2)
            df["subregion1_code"] = df["subregion1_code"].apply(apply_func)

            # Keep only new_confirmed
            df = df[["date", "subregion1_code"] + parse_opts["columns"]]

            # Keep only rows with indexable columns not null
            df.dropna(subset=["date", "subregion1_code"], inplace=True)

            # This data source is "complete" so all nulls are zeroes
            df = df.fillna(0)

            # Add to the tables including all subregions
            tables.append(df.iloc[1:])

        # Put all sheets together into a single DataFrame
        data = concat(tables)

        # Derive the key from country and region code
        data["key"] = parse_opts["country"] + "_" + data["subregion1_code"]
        data.drop(columns=["subregion1_code"], inplace=True)

        # Ensure date is in ISO format
        data["date"] = data["date"].apply(lambda x: str(x)[:10])

        # Make sure that all data is numeric
        for col in data.columns:
            if col not in ("date", "key"):
                data[col] = data[col].apply(safe_int_cast)

        # Output the results
        return data
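
This parser is driven entirely by parse_opts; an illustrative invocation (the option values below are assumptions, not taken from the source):

# Hypothetical configuration for this pipeline:
# parse_opts = {"country": "PH", "columns": ["new_confirmed"]}
# which yields keys like "PH_<subregion1_code>" with one numeric column.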
Example #10
def _rename_columns(data: DataFrame) -> DataFrame:
    column_adapter = {
        "date": "date",
        "country_region_code": "country_code",
        "sub_region_1": "subregion1_name",
        "sub_region_2": "subregion2_name",
        "sub_region_1_code": "subregion1_code",
        "sub_region_2_code": "subregion2_code",
    }
    data = data.rename(columns=column_adapter)
    data["subregion1_code"] = data["subregion1_code"].apply(
        lambda x: x.split("-")[-1] if x else None)
    data["subregion2_code"] = data["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 5))

    data.columns = [
        col if col in column_adapter.values() else "search_trends_" +
        col.lower().replace("symptom:", "").replace(" ", "_").replace("'", "")
        for col in data.columns
    ]

    return data
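
The final comprehension normalizes Google search-trends symptom columns; assuming typical symptom column names, the mapping looks like:

# "symptom:Abdominal pain"  ->  "search_trends_abdominal_pain"
# "symptom:Crohn's disease" ->  "search_trends_crohns_disease"
# Columns named in column_adapter (e.g. "date", "country_code") pass through unchanged.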
Example #11
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        cases = table_rename(
            dataframes[0],
            {
                # "FECHA_ACTUALIZACION": "",
                # "ID_REGISTRO": "",
                # "ORIGEN": "",
                # "SECTOR": "",
                # "ENTIDAD_UM": "",
                "SEXO": "sex",
                # "ENTIDAD_NAC": "",
                "ENTIDAD_RES": "subregion1_code",
                "MUNICIPIO_RES": "subregion2_code",
                "TIPO_PACIENTE": "_type",
                "FECHA_INGRESO": "date_new_confirmed",
                # "FECHA_SINTOMAS": "",
                "FECHA_DEF": "date_new_deceased",
                # "INTUBADO": "",
                # "NEUMONIA": "",
                "EDAD": "age",
                # "NACIONALIDAD": "",
                # "EMBARAZO": "",
                # "HABLA_LENGUA_INDIG": "",
                # "DIABETES": "",
                # "EPOC": "",
                # "ASMA": "",
                # "INMUSUPR": "",
                # "HIPERTENSION": "",
                # "OTRA_COM": "",
                # "CARDIOVASCULAR": "",
                # "OBESIDAD": "",
                # "RENAL_CRONICA": "",
                # "TABAQUISMO": "",
                # "OTRO_CASO": "",
                "RESULTADO": "_diagnosis",
                # "MIGRANTE": "",
                # "PAIS_NACIONALIDAD": "",
                # "PAIS_ORIGEN": "",
                "UCI": "_intensive_care",
            },
            drop=True,
        )

        # Null dates are coded as 9999-99-99
        for col in cases.columns:
            if col.startswith("date_"):
                cases.loc[cases[col] == "9999-99-99", col] = None

        # Discard all cases with negative test result
        cases = cases[cases["_diagnosis"] == 1]

        # Type 1 is normal, type 2 is hospitalized
        cases["date_new_hospitalized"] = None
        hospitalized_mask = cases["_type"] == 2
        cases.loc[hospitalized_mask,
                  "date_new_hospitalized"] = cases.loc[hospitalized_mask,
                                                       "date_new_confirmed"]

        # Parse region codes as strings
        cases["subregion1_code"] = cases["subregion1_code"].apply(
            lambda x: numeric_code_as_string(x, 2))
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 3))

        # Convert case line data to our time series format
        data = convert_cases_to_time_series(
            cases, ["subregion1_code", "subregion2_code"])

        # Convert date to ISO format
        data["date"] = data["date"].astype(str)

        # Unknown region codes are given as all nines ("99", "999") instead of null
        data.loc[data["subregion1_code"] == "99", "subregion1_code"] = None
        data.loc[data["subregion2_code"] == "999", "subregion2_code"] = None

        # The subregion2 codes need to be composed
        invalid_region_mask = (data["subregion1_code"].isna()
                               | data["subregion2_code"].isna())
        data.loc[~invalid_region_mask, "subregion2_code"] = (
            data.loc[~invalid_region_mask, "subregion1_code"] +
            data.loc[~invalid_region_mask, "subregion2_code"])

        # Use proper ISO codes for the subregion1 level
        data["subregion1_code"] = data["subregion1_code"].apply(
            _SUBREGION1_CODE_MAP.get)

        # Translate sex labels; only male, female and unknown are given
        data["sex"] = data["sex"].apply(lambda x: {
            "hombre": "male",
            "mujer": "female"
        }.get(x.lower()))

        # Aggregate state-level data by adding all municipalities
        state = data.drop(columns=["subregion2_code"]).groupby(
            ["date", "subregion1_code"]).sum()
        state.reset_index(inplace=True)
        state["key"] = "MX_" + state["subregion1_code"]

        # Extract cities from the municipalities
        city = _extract_cities(data)

        # Country level is called "TOTAL" as a subregion1_code
        country_mask = data["subregion1_code"] == "TOTAL"
        country = data[country_mask].copy()
        country["key"] = "MX"

        # We can build the key for the data directly from the subregion codes
        data["key"] = "MX_" + data["subregion1_code"] + "_" + data[
            "subregion2_code"]

        # Drop bogus records from the data
        data = data[~country_mask]
        state.dropna(subset=["subregion1_code"], inplace=True)
        data.dropna(subset=["subregion1_code", "subregion2_code"],
                    inplace=True)

        return concat([country, state, data, city])
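
_extract_cities (used in Examples #2 and #11) is not shown on this page. A speculative sketch, assuming a module-level mapping from composed municipality codes to city-level keys:

from pandas import DataFrame

# Hypothetical mapping; the real contents are not shown in the source
_CITY_MAP = {"09015": "MX_CMX_CUAUHTEMOC"}  # illustrative entry only

def _extract_cities(data: DataFrame) -> DataFrame:
    """Speculative sketch: copy municipality rows that correspond to known
    cities and assign them their city-level key."""
    city = data[data["subregion2_code"].isin(_CITY_MAP.keys())].copy()
    city["key"] = city["subregion2_code"].apply(_CITY_MAP.get)
    return city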
Example #12
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        # Rename appropriate columns
        data = table_rename(
            dataframes[0],
            {
                "town_code": "subregion2_code",
                "date": "date",
                "accumulated_tested": "total_tested",
                "new_tested_on_date": "_new_tested_flag",
                "accumulated_cases": "total_confirmed",
                "new_cases_on_date": "_new_confirmed_flag",
                "accumulated_recoveries": "total_recovered",
                "new_recoveries_on_date": "_new_recovered_flag",
                "accumulated_hospitalized": "total_hospitalized",
                "new_hospitalized_on_date": "_new_hospitalized_flag",
                "accumulated_deaths": "total_deceased",
                "new_deaths_on_date": "_new_deceased_flag",
                "accumulated_vaccination_first_dose": "total_persons_vaccinated",
                "accumulated_vaccination_second_dose": "total_persons_fully_vaccinated",
                "town": "match_string",
            },
            drop=True,
        )

        # Convert date to ISO format and sort the data
        data["date"] = data["date"].astype(str).str.slice(0, 10)
        data.sort_values("date", inplace=True)

        # Because low counts are masked, we assume <15 = 1 as a rough estimate
        for statistic in (
            "confirmed",
            "deceased",
            "tested",
            "recovered",
            "hospitalized",
            "persons_vaccinated",
            "persons_fully_vaccinated",
        ):
            col = f"total_{statistic}"
            if col in data.columns:
                low_count_mask = data[col] == "<15"
                data.loc[low_count_mask, col] = 1
                # We can fill the data with zeroes since every case should be recorded by source
                data[col] = data[col].apply(safe_int_cast).fillna(0)

        # Estimate total vaccine doses administered from first and second dose counts
        data["total_vaccine_doses_administered"] = (
            data["total_persons_vaccinated"] + data["total_persons_fully_vaccinated"]
        )
        # Properly format the region code and group by it
        data["subregion2_code"] = data["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 4)
        )
        data = data.groupby(["date", "subregion2_code", "match_string"]).sum().reset_index()

        # Aggregate to country level and drop unknown locations
        data["country_code"] = "IL"
        intra_country_columns = ["subregion2_code", "match_string"]
        country = data.drop(columns=intra_country_columns)
        country = country.groupby(["date", "country_code"]).sum().reset_index()
        data.dropna(subset=intra_country_columns, inplace=True)

        # Drop country-level confirmed and deceased since we have better sources of aggregated data
        country["key"] = country["country_code"]
        country.drop(columns=["total_confirmed", "total_deceased"], inplace=True)

        # Get the admin level 1 and key from metadata
        il = aux["metadata"][["key", "country_code", "subregion1_code", "subregion2_code"]]
        il = il[(il["country_code"] == "IL") & il["subregion2_code"].notna()]
        il["subregion2_code"] = il["subregion2_code"].apply(lambda x: numeric_code_as_string(x, 4))
        data = data.merge(il, how="left")

        # Aggregate by admin level 1
        admin_l1 = data.groupby(["date", "country_code", "subregion1_code"]).sum().reset_index()
        admin_l1["key"] = admin_l1["country_code"] + "_" + admin_l1["subregion1_code"]

        return concat([country, admin_l1, data])
Example #13
def _process_cache_file(file_map: Dict[str, str], date: str) -> DataFrame:
    data = table_rename(read_file(file_map[date]), _column_adapter, drop=True)
    data["subregion1_code"] = data["subregion1_code"].apply(
        lambda x: _ISO_CODE_MAP.get(numeric_code_as_string(x, 2) or "00"))
    data["date"] = date
    return data
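
A usage sketch for this helper, assuming file_map keys are ISO dates pointing at cached snapshot files:

from pandas import concat

# Hypothetical cache layout: one snapshot file per date
file_map = {
    "2021-01-01": "cache/2021-01-01.csv",
    "2021-01-02": "cache/2021-01-02.csv",
}
data = concat([_process_cache_file(file_map, date) for date in sorted(file_map)])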
Example #14
def _subregion1_code_converter(code: int):
    return _region_code_map.get(numeric_code_as_string(code, 2))
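
With the zero-padding behavior assumed earlier, an integer province code such as 5 becomes "05" before the lookup:

# Equivalent lookup, assuming the numeric_code_as_string sketch above:
# _subregion1_code_converter(5) == _region_code_map.get("05")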
Example #15
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename appropriate columns
        cases = table_rename(
            dataframes[0],
            {
                "residencia_provincia_id": "subregion1_code",
                "residencia_departamento_id": "subregion2_code",
                "fecha_fallecimiento": "date_new_deceased",
                "fecha_diagnostico": "_date_diagnosed",
                "fecha_internacion": "date_new_hospitalized",
                "fecha_cui_intensivo": "date_new_intensive_care",
                "clasificacion_resumen": "_classification",
                "edad": "age",
                "sexo": "sex",
            },
            drop=True,
        )

        # As long as a case is not labeled as "suspected", assume it has been tested
        cases["date_new_tested"] = None
        suspect_mask = cases["_classification"].str.lower().str.match(
            ".*sospechoso.*")
        cases.loc[~suspect_mask, "date_new_tested"] = cases.loc[
            ~suspect_mask, "_date_diagnosed"]

        # Get rid of all the suspected cases, since we have nothing to tally for them
        cases = cases[~suspect_mask]

        # Confirmed cases use the label "confirmado"
        cases["date_new_tested"] = None
        confirmed_mask = cases["_classification"].str.lower().str.match(
            ".*confirmado.*")
        cases.loc[confirmed_mask,
                  "date_new_confirmed"] = cases.loc[confirmed_mask,
                                                    "_date_diagnosed"]

        # Clean up the subregion codes
        cases["subregion1_code"] = cases["subregion1_code"].apply(
            lambda x: None if x == 0 else numeric_code_as_string(x, 2))
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: None if x == 0 else numeric_code_as_string(x, 3))

        # Convert subregion1_code to the corresponding ISO code
        cases["subregion1_code"] = cases["subregion1_code"].apply(
            _ISO_CODE_MAP.get)

        # Remove unnecessary columns before converting to time series
        cases = cases.drop(
            columns=[col for col in cases.columns if col.startswith("_")])

        # Go from individual case records to key-grouped records in time series format
        data = convert_cases_to_time_series(
            cases, ["subregion1_code", "subregion2_code"])

        # Parse dates to ISO format.
        data["date"] = data["date"].astype(str)

        # Aggregate by province and report that separately
        provinces = (data.drop(columns=["subregion2_code"]).groupby(
            ["subregion1_code", "date", "age", "sex"]).sum().reset_index())

        # Aggregate to the country level and report that separately
        country = (data.drop(columns=["subregion1_code"]).groupby(
            ["date", "age", "sex"]).sum().reset_index())

        # Compute the key from the subregion codes
        country["key"] = "AR"
        provinces["key"] = "AR_" + provinces["subregion1_code"]
        data["key"] = "AR_" + data["subregion1_code"] + "_" + data[
            "subregion2_code"]

        # Remove bogus values
        for df in (country, provinces, data):
            df.drop(df[df["key"].str.endswith("_")].index, inplace=True)
            for nn_col in ("date", "subregion1_code", "subregion2_code"):
                if nn_col in df.columns:
                    df.drop(df[df[nn_col].isna() | (df[nn_col] == "")].index,
                            inplace=True)

        return concat([data, provinces, country])
Example #16
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename appropriate columns
        data = table_rename(
            dataframes[0],
            {
                "sexo": "sex",
                "grupo_etario": "age",
                # "jurisdiccion_residencia": "",
                "jurisdiccion_residencia_id": "subregion1_code",
                # "depto_residencia": "",
                "depto_residencia_id": "subregion2_code",
                # "jurisdiccion_aplicacion": "",
                # "jurisdiccion_aplicacion_id": "",
                # "depto_aplicacion": "",
                # "depto_aplicacion_id": "",
                "fecha_aplicacion": "date",
                "vacuna": "_manufacturer",
                # "condicion_aplicacion": "",
                "orden_dosis": "_dose_number",
                # "lote_vacuna": "",
            },
            drop=True,
        )

        # Parse dates to ISO format.
        data["date"] = data["date"].astype(str)

        # Parse sex label into proper name
        data["sex"] = data["sex"].apply({"M": "male", "F": "female"}.get)

        # Parse the dose number assuming all vaccines have a 2-dose schedule
        data["new_persons_vaccinated"] = data["_dose_number"].apply(
            lambda x: 1 if x == 1 else 0)
        data["new_persons_fully_vaccinated"] = data["_dose_number"].apply(
            lambda x: 1 if x == 2 else 0)
        data["new_vaccine_doses_administered"] = (
            data["new_persons_vaccinated"] +
            data["new_persons_fully_vaccinated"])

        # Add a column for each vaccine manufacturer
        for manufacturer in data["_manufacturer"].unique():
            mask = data["_manufacturer"] == manufacturer
            brand_name = manufacturer.lower()

            cols = [f"new_persons_{mod}vaccinated" for mod in ["", "fully_"]]
            cols += [f"new_vaccine_doses_administered"]
            for col in cols:
                new_col = f"{col}_{brand_name}"
                data[new_col] = None
                data.loc[mask, new_col] = data.loc[mask, col]

        # Clean up the subregion codes
        data["subregion1_code"] = data["subregion1_code"].apply(
            lambda x: numeric_code_as_string(x, 2) or "00")
        data["subregion2_code"] = data["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 3) or "000")

        # Convert subregion1_code to the corresponding ISO code
        data["subregion1_code"] = data["subregion1_code"].apply(
            _ISO_CODE_MAP.get)

        # Group by indexable columns
        idx_cols = ["date", "subregion1_code", "subregion2_code", "sex", "age"]
        data = data.groupby(idx_cols).sum().reset_index()

        # Aggregate country level and admin level 1
        country = aggregate_admin_level(data, ["date", "age", "sex"],
                                        "country")
        subregion1 = aggregate_admin_level(data, ["date", "age", "sex"],
                                           "subregion1")
        subregion2 = data.copy()

        # Drop regions without a code
        subregion2 = subregion2[subregion2["subregion2_code"] != "000"]
        subregion2.dropna(subset=["subregion1_code", "subregion2_code"],
                          inplace=True)
        subregion1.dropna(subset=["subregion1_code"], inplace=True)

        # Compute the key from the subregion codes
        country["key"] = "AR"
        subregion1["key"] = "AR_" + subregion1["subregion1_code"]
        subregion2["key"] = "AR_" + subregion2["subregion1_code"] + "_" + data[
            "subregion2_code"]

        return concat([subregion2, subregion1, country])
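
aggregate_admin_level (also used in Example #3) rolls metrics up to a coarser administrative level; a sketch assuming it drops the finer-grained codes and sums the rest:

from typing import List
from pandas import DataFrame

def aggregate_admin_level(data: DataFrame, group_columns: List[str],
                          level: str) -> DataFrame:
    """Assumed behavior: drop codes finer than `level`, then group and sum."""
    drop_columns = {
        "country": ["subregion1_code", "subregion2_code"],
        "subregion1": ["subregion2_code"],
    }[level]
    keys = list(group_columns)
    if level == "subregion1":
        keys.append("subregion1_code")
    data = data.drop(columns=[c for c in drop_columns if c in data.columns])
    return data.groupby(keys).sum().reset_index()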
Example #17
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename appropriate columns
        cases = table_rename(
            dataframes[0],
            {
                "residencia_provincia_id": "subregion1_code",
                "residencia_departamento_id": "subregion2_code",
                "fecha_fallecimiento": "date_new_deceased",
                "fecha_apertura": "_date_estimate",
                "fecha_diagnostico": "date_new_tested",
                "fecha_internacion": "date_new_hospitalized",
                "fecha_cui_intensivo": "date_new_intensive_care",
                "clasificacion_resumen": "_classification",
                "edad": "age",
                "sexo": "sex",
            },
            drop=True,
        )

        # Get rid of all the suspected cases, since we have nothing to tally for them
        cases = cases[~cases["_classification"].str.lower().str.
                      match(".*sospechoso.*")]

        # Confirmed cases use the label "confirmado"
        cases["date_new_confirmed"] = None
        confirmed_mask = cases["_classification"].str.lower().str.match(
            ".*confirmado.*")
        cases.loc[confirmed_mask,
                  "date_new_confirmed"] = cases.loc[confirmed_mask,
                                                    "date_new_tested"]

        # Estimate the confirmed date when none is available
        cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[
            confirmed_mask,
            "date_new_confirmed"].fillna(cases.loc[confirmed_mask,
                                                   "_date_estimate"])

        # Only count deaths from confirmed cases
        cases.loc[~confirmed_mask, "date_new_deceased"] = None

        # Remove unnecessary columns before converting to time series
        cases = cases.drop(
            columns=[col for col in cases.columns if col.startswith("_")])

        # Clean up the subregion codes
        cases["subregion1_code"] = cases["subregion1_code"].apply(
            lambda x: numeric_code_as_string(x, 2) or "00")
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 3) or "000")

        # Go from individual case records to key-grouped records in time series format
        data = convert_cases_to_time_series(
            cases, ["subregion1_code", "subregion2_code"])

        # Parse dates to ISO format.
        data["date"] = data["date"].astype(str)

        # Aggregate to the country level and report that separately
        country = (data.drop(
            columns=["subregion1_code", "subregion2_code"]).groupby(
                ["date", "age", "sex"]).sum().reset_index())

        # Convert subregion1_code to the corresponding ISO code
        data["subregion1_code"] = data["subregion1_code"].apply(
            _ISO_CODE_MAP.get)

        # Aggregate by province and report that separately
        provinces = (data.drop(columns=["subregion2_code"]).groupby(
            ["subregion1_code", "date", "age", "sex"]).sum().reset_index())

        # Drop regions without a code
        data = data[data["subregion2_code"] != "000"]
        data.dropna(subset=["subregion1_code", "subregion2_code"],
                    inplace=True)
        provinces.dropna(subset=["subregion1_code"], inplace=True)

        # Compute the key from the subregion codes
        country["key"] = "AR"
        provinces["key"] = "AR_" + provinces["subregion1_code"]
        data["key"] = "AR_" + data["subregion1_code"] + "_" + data[
            "subregion2_code"]

        return concat([data, provinces, country])