Example #1
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        records = []
        sex_buckets = {"gender_male": "male", "gender_female": "female"}
        age_buckets = {
            col: col.replace("age_", "")
            for col in dataframes[0].columns if col.startswith("age_")
        }
        for _, row in dataframes[0].iterrows():
            for age_col, age_bucket in age_buckets.items():
                records.append({
                    "key": "ZA",
                    "date": datetime_isoformat(row.date, "%d-%m-%Y"),
                    "age": None if age_bucket == "unknown" else age_bucket,
                    "total_deceased": row[age_col],
                })
            for sex_col, sex_bucket in sex_buckets.items():
                records.append({
                    "key": "ZA",
                    "date": datetime_isoformat(row.date, "%d-%m-%Y"),
                    "sex": sex_bucket,
                    "total_deceased": row[sex_col],
                })

        return DataFrame.from_records(records)
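
Nearly every parser in these examples funnels raw date strings through datetime_isoformat. The helper is defined elsewhere in the pipeline; a minimal sketch of its presumed behavior, assuming it wraps datetime.strptime and returns None on failure so callers can dropna() unparseable rows:

from datetime import datetime
from typing import Optional

def datetime_isoformat(value: str, date_format: str) -> Optional[str]:
    # Hypothetical re-implementation: parse using the given format and
    # emit an ISO 8601 date string; None signals an unparseable value.
    try:
        return datetime.strptime(str(value), date_format).date().isoformat()
    except ValueError:
        return None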
Example #2
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        data = table_rename(
            dataframes[0],
            {
                "submission_date": "date",
                "state": "subregion1_code",
                "tot_cases": "total_confirmed",
                # "conf_cases": "total_confirmed",
                # "prob_cases": "",
                "new_case": "new_confirmed",
                # "pnew_case": "",
                "tot_death": "total_deceased",
                # "conf_death": "",
                # "prob_death": "",
                "new_death": "new_deceased",
                # "pnew_death": "",
                # "created_at": "",
                # "consent_cases": "",
                # "consent_deaths": "",
            },
            drop=True,
        )

        data["key"] = "US_" + data["subregion1_code"]
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

        # A few "states" are considered independent territories by our dataset or need correction
        data.loc[data["subregion1_code"] == "PW", "key"] = "PW"
        data.loc[data["subregion1_code"] == "FSM", "key"] = "FM"
        data.loc[data["subregion1_code"] == "RMI", "key"] = "MH"
        data.loc[data["subregion1_code"] == "NYC", "key"] = "US_NY_NYC"

        return data
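
table_rename appears in most of these parsers. A plausible minimal version, assuming it renames columns per the adapter and, with drop=True, discards everything not named in it (the real helper also appears to fuzzy-match headers via a remove_regex option, which this sketch ignores):

from typing import Dict
from pandas import DataFrame

def table_rename(data: DataFrame, column_adapter: Dict[str, str],
                 drop: bool = False, **kwargs) -> DataFrame:
    # Hypothetical sketch: map source column names onto schema names.
    data = data.rename(columns=column_adapter)
    if drop:
        # Keep only the destination columns named by the adapter.
        keep = [col for col in column_adapter.values() if col in data.columns]
        data = data[keep]
    return data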
Example #3
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        with open(sources[0], "r") as fd:
            data = json.load(fd)["Data"]

        # "Date":"01\/01\/2020","NewConfirmed":0,"NewRecovered":0,"NewHospitalized":0,"NewDeaths":0,"Confirmed":0,"Recovered":0,"Hospitalized":0,"Deaths":0
        data = table_rename(
            DataFrame.from_records(data),
            {
                "Date": "date",
                "NewConfirmed": "new_confirmed",
                "NewRecovered": "new_recovered",
                "NewHospitalized": "new_hospitalized",
                "NewDeaths": "new_deceased",
                "Confirmed": "total__confirmed",
                "Recovered": "total__recovered",
                "Hospitalized": "total__hospitalized",
                "Deaths": "total__deceased",
            },
            drop=True,
            remove_regex=r"[^0-9a-z\s]",
        )

        # Format date as ISO date
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

        # Add key and return data
        data["key"] = "TH"
        return data
Example #4
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:

        url_tpl = sources[0]
        metadata = aux["metadata"]
        metadata = metadata[metadata["country_code"] == "FR"]

        fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
        fr_iso_map = {
            iso: code
            for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])
        }
        fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
        regions_iter = fr_codes["subregion1_code"].unique()
        deps_iter = (record for _, record in fr_codes.iterrows())

        if parse_opts.get("country"):
            data = _get_country(url_tpl)

        else:
            get_region_func = partial(_get_region, url_tpl, fr_iso_map)
            regions = concat(list(thread_map(get_region_func, regions_iter)))

            get_department_func = partial(_get_department, url_tpl)
            departments = concat(
                list(
                    thread_map(get_department_func,
                               deps_iter,
                               total=len(fr_codes))))

            data = concat([regions, departments])

        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
        return data
Example #5
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        deceased = table_rename(dataframes["deceased"],
                                {"FECHA / CCAA": "date"})
        deceased = pivot_table(deceased.set_index("date"),
                               value_name="new_deceased",
                               pivot_name="match_string")

        # Convert dates to ISO format
        deceased["date"] = deceased["date"].apply(lambda x: str(x)[:10])
        deceased["date"] = deceased["date"].apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d"))

        # Add the country code to all records and declare matching as subregion1
        deceased["country_code"] = "ES"
        deceased["subregion2_code"] = None
        deceased["locality_code"] = None

        # Country level is declared as "espana"
        deceased["key"] = None
        deceased.loc[deceased["match_string"] == "espana", "key"] = "ES"

        # Output the results
        return deceased.dropna(subset=["date"])
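
Note that pivot_table above is the repo's own unpivot helper, not pandas.pivot_table. A guess at its shape, assuming it melts an index-keyed wide table (one column per region) into long format:

from pandas import DataFrame

def pivot_table(data: DataFrame, pivot_name: str = "pivot",
                value_name: str = "value") -> DataFrame:
    # Hypothetical sketch: one output row per (index value, column) pair.
    return data.reset_index().melt(
        id_vars=[data.index.name], var_name=pivot_name, value_name=value_name)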
Example #6
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        # The headers are a bit funny-looking, so we must manually manipulate them first
        data = dataframes[0]
        data.columns = [
            col.split("|")[0].split("~")[0] for col in data.iloc[0]
        ]
        data = data.iloc[1:]

        data = table_rename(
            data,
            {
                "Date": "date",
                "Nombre de personnes en soins intensifs":
                "current_intensive_care",
                "Nombre cumulé de décès": "total_deceased",
                "Nombre de personnes testées COVID+": "new_tested",
            },
            drop=True,
        )

        # Get date in ISO format
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Only country-level data is provided
        data["key"] = "LU"

        # Output the results
        return data
Example #7
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename the appropriate columns
        tables = []
        dataframes[0].columns = [
            fuzzy_text(col, remove_regex=r"[^0-9a-z\s_]", remove_spaces=False)
            for col in dataframes[0].columns
        ]
        for keyword, value_column in [
            ("confirmed", "total_confirmed"),
            ("death", "total_deceased"),
            ("recover", "total_recovered"),
        ]:
            data = dataframes[0][
                ["_name"] +
                [col for col in dataframes[0].columns if keyword in col]]
            data.columns = [col.split(" upto ", 2)[-1] for col in data.columns]
            data = data.set_index("_name")[data.columns[1:]]
            data = pivot_table_date_columns(data,
                                            pivot_name="date",
                                            value_name=value_column)
            data.date = data.date.apply(
                lambda x: datetime_isoformat(f"{x}-2020", "%d %B-%Y"))
            data = data.reset_index().rename(columns={"index": "match_string"})
            tables.append(data)

        # Aggregate all tables together
        data = concat(tables)

        # Make sure all records have the country code
        data["country_code"] = "BD"

        # Output the results
        return data
Example #8
    def parse_dataframes(
        self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = dataframes[0].rename(
            columns={
                "日付": "date",
                "都道府県名": "match_string",
                "患者数": "confirmed",
                "入院中": "hospitalized",
                "退院者": "recovered",
                "死亡者": "deceased",
            }
        )

        # Convert date to ISO format
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y%m%d"))

        # Add the country code to all records
        data["country_code"] = "JP"

        # Keep only columns we can process
        data = data[["date", "match_string", "confirmed", "hospitalized", "recovered", "deceased"]]

        # Aggregate the region-level data
        data = grouped_cumsum(data, ["country_code", "match_string", "date"])

        # Aggregate the country-level data
        data_country = data.groupby("date").sum().reset_index()
        data_country["key"] = "JP"

        # Output the results
        return concat([data_country, data])
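
grouped_cumsum converts daily values into running totals per group; a sketch under the assumption that every non-index column is a value column and gains new_/total_ prefixed outputs:

from typing import List
from pandas import DataFrame

def grouped_cumsum(data: DataFrame, index_columns: List[str]) -> DataFrame:
    # Hypothetical sketch: sort by date, then accumulate each value column
    # within its group, keeping both the daily and the cumulative counts.
    group_keys = [col for col in index_columns if col != "date"]
    value_columns = [col for col in data.columns if col not in index_columns]
    data = data.sort_values("date")
    for col in value_columns:
        data[f"new_{col}"] = data[col]
        data[f"total_{col}"] = data.groupby(group_keys)[col].cumsum()
    return data.drop(columns=value_columns)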
Example #9
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        data = table_merge([
            melt(dataframes[name],
                 id_vars=["Date"],
                 var_name="match_string",
                 value_name=value)
            for name, value in [(
                "confirmed", "new_confirmed"), ("deceased", "total_deceased")]
        ])

        data["country_code"] = "JP"

        # Get date in ISO format
        data = data.rename(columns={"Date": "date"})
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y/%m/%d"))

        # Country-level uses the label "ALL"
        country_mask = data["match_string"] == "ALL"
        country = data.loc[country_mask]
        data = data.loc[~country_mask]
        country["key"] = "JP"

        # Output the results
        return concat([country, data])
Example #10
    def parse(self, sources: Dict[Any, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        url_tpl = sources[0]
        metadata = aux["metadata"]
        metadata = metadata[metadata["country_code"] == "FR"]

        fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
        fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
        fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
        regions_iter = fr_codes["subregion1_code"].unique()
        deps_iter = [record for _, record in fr_codes.iterrows()]

        column_adapter = {
            "key": "key",
            "date": "date",
            "testsRealisesDetails": "_breakdown_tested",
            "testsPositifsDetails": "_breakdown_confirmed",
        }

        # Get country level data
        country = _get_country(url_tpl, column_adapter)

        # Get region level data
        get_region_func = partial(_get_region, url_tpl, column_adapter, fr_iso_map)
        regions = concat(list(thread_map(get_region_func, regions_iter)))

        # Get department level data
        get_department_func = partial(_get_department, url_tpl, column_adapter)
        departments = concat(list(thread_map(get_department_func, deps_iter)))

        data = concat([country, regions, departments])
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

        data["_breakdown_tested"].fillna("", inplace=True)
        data["_breakdown_confirmed"].fillna("", inplace=True)

        records: Dict[str, List] = {"confirmed": [], "tested": []}
        for key, row in data.set_index("key").iterrows():
            for statistic in records.keys():
                if row[f"_breakdown_{statistic}"] != "":
                    for item in row[f"_breakdown_{statistic}"]:
                        records[statistic].append(
                            {
                                "key": key,
                                "date": row["date"],
                                "age": item["age"],
                                "sex": item.get("sexe"),
                                f"new_{statistic}": item["value"],
                            }
                        )

        df1 = DataFrame.from_records(records["tested"])
        df2 = DataFrame.from_records(records["confirmed"])
        data = df1.merge(df2, how="outer")

        data = data[~data["age"].isin(["0", "A", "B", "C", "D", "E"])]
        data["age"] = data["age"].apply(lambda x: age_group(safe_int_cast(x)))

        sex_adapter = lambda x: {"h": "male", "f": "female"}.get(x, "sex_unknown")
        data["sex"] = data["sex"].apply(sex_adapter)
        return data
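
age_group buckets an integer age into the schema's ranges; a minimal sketch, assuming 10-year bins capped at 90:

from typing import Optional

def age_group(age: Optional[int], bin_size: int = 10, max_age: int = 90) -> str:
    # Hypothetical sketch: 34 -> "30-39", 97 -> "90-", None -> "age_unknown".
    if age is None or age < 0:
        return "age_unknown"
    if age >= max_age:
        return f"{max_age}-"
    lower = (age // bin_size) * bin_size
    return f"{lower}-{lower + bin_size - 1}"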
Example #11
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename the appropriate columns
        data = (dataframes[0].rename(
            columns={
                "Date": "date",
                "Province": "match_string",
                "Confirmed Cases": "total_confirmed",
            }).drop([0, 1]))

        # The spreadsheet contains the typo "heatlh" in one of its column names
        data = data.drop(axis=1,
                         columns=[
                             "Number of heatlh structures", "Affected",
                             "Source", "Probable cases"
                         ])

        # Data source sometimes uses different hyphenation from src/data/iso_3166_2_codes.csv
        data["match_string"].replace({"Haut  Katanga": "Haut-Katanga"},
                                     inplace=True)

        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d"))

        data["total_confirmed"] = (data["total_confirmed"].fillna(0).astype(
            {"total_confirmed": "int64"}))

        # Make sure all records have the country code
        data["country_code"] = "CD"

        # Output the results
        return data
Example #12
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = dataframes[0]

        data["date"] = data.REPORT_DATE.apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))
        # Add level1 keys
        subregion1s = country_subregion1s(aux["metadata"], "AU")
        data = table_merge([data, subregion1s], left_on="CODE", right_on="subregion1_code", how="left")
        # Country-level record has CODE AUS
        country_mask = data["CODE"] == "AUS"
        data.loc[country_mask, "key"] = "AU"
        # Only keep country and subregion1 rows
        data = data[data["key"].notna()]
        data = table_rename(
            data,
            {
                "date": "date",
                "key": "key",
                "VACC_DOSE_CNT": "total_vaccine_doses_administered",
                "VACC_PEOPLE_CNT": "total_persons_fully_vaccinated",
            },
            drop=True)
        # Remove rows without vaccination data
        data.dropna(subset=["total_vaccine_doses_administered", "total_persons_fully_vaccinated"], how="all", inplace=True)
        # Based on the assumption that two doses = fully vaccinated (since Australia is using Pfizer and AZ)
        data["total_persons_vaccinated"] = estimate_total_persons_vaccinated(data)

        return data
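
estimate_total_persons_vaccinated is defined elsewhere; following the two-dose assumption stated in the comment above, a plausible sketch is plain column arithmetic: people with at least one dose equal doses administered minus people fully vaccinated:

from pandas import DataFrame, Series

def estimate_total_persons_vaccinated(data: DataFrame) -> Series:
    # Hypothetical: under a strict two-dose regime,
    # doses = (people with >= 1 dose) + (people with 2 doses).
    return (
        data["total_vaccine_doses_administered"]
        - data["total_persons_fully_vaccinated"]
    )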
Example #13
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = table_rename(
            dataframes[0],
            {
                "TipusCasData": "date",
                "SexeCodi": "sex",
                # "SexeDescripcio": "sex",
                "EdatRang": "age",
                "TipusCasDescripcio": "_case_type",
                "NumCasos": "new_confirmed",
            },
            drop=True,
        )

        # Remove "suspect" cases
        data = data[data["_case_type"] != "Sospitós"].drop(
            columns=["_case_type"])

        # Derive key from subregion code
        data["key"] = "ES_CT"

        # Parse age, sex, date and numeric values
        sex_adapter = {"0": "male", "1": "female"}
        data["age"] = data["age"].str.replace("90\+", "90-")
        data["sex"] = data["sex"].apply(
            lambda x: sex_adapter.get(x, "sex_unknown"))
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%d/%m/%Y"))
        data["new_confirmed"] = data["new_confirmed"].apply(safe_int_cast)

        return data
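
safe_int_cast shows up wherever counts arrive as strings; a minimal sketch, assuming it returns None instead of raising:

from typing import Any, Optional

def safe_int_cast(value: Any) -> Optional[int]:
    # Hypothetical sketch: tolerate floats, numeric strings and thousands
    # separators; None marks anything unparseable.
    try:
        return int(float(str(value).replace(",", "")))
    except (ValueError, TypeError):
        return None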
Example #14
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = table_multimerge(
            [
                Covid19ZaCumulativeDataSource._parse_variable(df, name)
                for df, name in zip(
                    dataframes,
                    [
                        "total_confirmed", "total_deceased", "total_recovered",
                        "total_tested"
                    ],
                )
            ],
            how="outer",
        )

        # Convert date to ISO format
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%d-%m-%Y"))

        # Country-level records should have "total" region name
        country_mask = data["subregion1_code"] == "total"
        data.loc[country_mask, "key"] = "ZA"

        # All other records can provide their own key directly
        data.loc[~country_mask, "key"] = "ZA_" + data.subregion1_code

        # Output the results
        return data
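
table_multimerge folds a list of tables into one; a plausible one-liner, assuming it merges successive frames on their shared columns:

from functools import reduce
from typing import List
from pandas import DataFrame

def table_multimerge(dataframes: List[DataFrame], how: str = "inner",
                     **merge_opts) -> DataFrame:
    # Hypothetical sketch: pairwise merge, left to right, on common columns.
    return reduce(lambda left, right: left.merge(right, how=how, **merge_opts),
                  dataframes)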
Example #15
def _process_partition(cases: DataFrame) -> DataFrame:
    cases = cases.copy()

    # Extract information about whether doses were first (partial immunization) or second (full)
    cases["date_new_persons_vaccinated"] = None
    cases["date_new_persons_fully_vaccinated"] = None
    first_dose_mask = cases["_dose_information"].str.strip().str.slice(
        0, 1) == "1"
    second_dose_mask = cases["_dose_information"].str.strip().str.slice(
        0, 1) == "2"
    cases.loc[first_dose_mask, "date_new_persons_vaccinated"] = cases.loc[
        first_dose_mask, "date_new_vaccine_doses_administered"]
    cases.loc[second_dose_mask,
              "date_new_persons_fully_vaccinated"] = cases.loc[
                  second_dose_mask, "date_new_vaccine_doses_administered"]

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        safe_int_cast).astype(str)

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({
        "m": "male",
        "f": "female"
    }.get)

    # Convert to time series format
    data = convert_cases_to_time_series(
        cases, index_columns=["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(
        lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "BR"

    # Aggregate data by state
    state = (data.drop(columns=["subregion2_code"]).groupby(
        ["date", "subregion1_code", "age", "sex"]).sum().reset_index())
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2
    data = data[data["subregion2_code"].notna()
                & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data[
        "subregion2_code"]

    return concat([country, state, data])
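
convert_cases_to_time_series turns a line list (one row per case) into daily counts. A sketch of the presumed mechanics: each date_new_<statistic> column holds the date that event happened for one case, so counting rows per (region, age, sex, date) yields the new_<statistic> series:

from typing import List
from pandas import DataFrame

def convert_cases_to_time_series(cases: DataFrame,
                                 index_columns: List[str]) -> DataFrame:
    # Hypothetical sketch of the helper used above.
    keys = list(index_columns) + ["age", "sex"]
    tables = []
    for col in [c for c in cases.columns if c.startswith("date_new_")]:
        statistic = col[len("date_"):]  # e.g. "new_deceased"
        subset = cases.dropna(subset=[col]).rename(columns={col: "date"})
        counts = subset.groupby(keys + ["date"]).size()
        tables.append(counts.rename(statistic).reset_index())
    data = tables[0]
    for table in tables[1:]:
        data = data.merge(table, how="outer")
    return data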
Example #16
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        cases = {
            "date": "date",
            "areaCode": "areaCode",
            "newCasesBySpecimenDate": "newCasesBySpecimenDate",
            "cumCasesBySpecimenDate": "cumCasesBySpecimenDate",
        }

        api = Cov19API(filters=["areaType=utla"], structure=cases)
        data = api.get_dataframe()

        data.areaCode = data.areaCode.apply(_apply_area_code_map)
        data = data.groupby(["date", "areaCode"], as_index=False).sum()

        data = table_rename(
            data,
            {
                "areaCode": "subregion2_code",
                "newCasesBySpecimenDate": "new_confirmed",
                "cumCasesBySpecimenDate": "total_confirmed",
                "date": "date",
            },
            drop=True,
        )

        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%Y-%m-%d"))

        return data
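
Cov19API comes from the official uk_covid19 package; _apply_area_code_map is a module-level helper not shown here. A hypothetical stand-in that folds retired or merged UTLA codes into their successors before aggregation:

from typing import Dict

# Hypothetical mapping; the real table is defined elsewhere in the module.
_AREA_CODE_MAP: Dict[str, str] = {}

def _apply_area_code_map(area_code: str) -> str:
    # Replace superseded area codes with the codes used by our metadata.
    return _AREA_CODE_MAP.get(area_code, area_code)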
Example #17
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename the appropriate columns
        data = dataframes[0].rename(
            columns={
                "date": "date",
                "state": "subregion1_code",
                "positive": "confirmed",
                "death": "deceased",
                "total": "tested",
                "recovered": "recovered",
            })

        # Convert date to ISO format
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y%m%d"))

        # Derive the key and keep only columns we can process
        data["key"] = "US_" + data["subregion1_code"]
        data = data[["date", "key", "confirmed", "deceased", "tested", "recovered"]]

        # Output the results
        return grouped_diff(data, ["key", "date"])
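
grouped_diff looks like the inverse of grouped_cumsum: it differences cumulative totals per key to recover daily values. A sketch under the same column-naming assumption:

from typing import List
from pandas import DataFrame

def grouped_diff(data: DataFrame, index_columns: List[str]) -> DataFrame:
    # Hypothetical sketch: within each group, daily value = total minus the
    # previous total; the first observation keeps its own value.
    group_keys = [col for col in index_columns if col != "date"]
    value_columns = [col for col in data.columns if col not in index_columns]
    data = data.sort_values(index_columns)
    for col in value_columns:
        data[f"new_{col}"] = data.groupby(group_keys)[col].diff().fillna(data[col])
    return data.drop(columns=value_columns)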
Example #18
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename appropriate columns
        col = parse_opts["column_name"]
        cases = table_rename(dataframes[0], _column_adapter)
        cases = cases.rename(columns={"date": f"date_{col}"})
        cases = _parse_region_codes(cases).dropna(subset=[f"date_{col}"])

        # Rename the sex values
        cases["sex"] = cases["sex"].apply({"M": "male", "Z": "female"}.get)

        # Go from individual case records to key-grouped records in a flat table
        data = convert_cases_to_time_series(
            cases, index_columns=["subregion1_code", "subregion2_code"])

        # Make sure the region codes are strings before parsing them
        data["subregion1_code"] = data["subregion1_code"].astype(str)
        data["subregion2_code"] = data["subregion2_code"].astype(str)

        # Aggregate L2 + L3 data
        data = _aggregate_regions(data,
                                  ["date", "subregion1_code", "age", "sex"])

        # Remove bogus values
        data = data[data["key"] != "CZ_99"]
        data = data[data["key"] != "CZ_99_99Y"]

        # Convert all dates to ISO format
        data["date"] = (
            data["date"].astype(str).apply(lambda x: datetime_isoformat(
                x, "%d.%m.%Y" if "." in x else "%Y-%m-%d")))

        return data
Example #19
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:
        data = table_rename(
            dataframes["counties"],
            {
                "Date": "date",
                "County": "match_string",
                "Count": "total_confirmed",
                "Deaths": "total_deceased",
            },
        )

        # Convert date to ISO format
        data["date"] = data["date"].astype(str).apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

        # Drop bogus values
        data = data[data["match_string"] != "Unknown"]

        # Dukes and Nantucket are separate counties but reported as one, so drop them from the data
        data = data[data["match_string"] != "Dukes and Nantucket"]

        data["country_code"] = "US"
        data["subregion1_code"] = "MA"
        return data
Example #20
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = table_rename(
            dataframes[0],
            parse_opts.get(
                "column_adapter",
                {
                    "discharged_cumulative": "total_discharged",
                    "hospitalized_current": "current_hospitalized",
                    "number hospitalised": "current_hospitalized",
                    "hospitalized_cumulative": "total_hospitalized",
                    "icu_current": "current_intensive_care",
                    "number in icu": "current_intensive_care",
                    "icu_cumulative": "cumulative_intensive_care",
                    "ventilator_current": "current_ventilator",
                    "ventilator_cumulative": "cumulative_ventilator",
                    "new hospital admissions": "new_hospitalized",
                    "new intensive care admissions": "new_intensive_care",
                },
            ),
        )

        # Add key and parse date in ISO format
        data["key"] = parse_opts.get("key")
        data["date"] = data[parse_opts.get("date_column", "date")].astype(str)
        date_format = parse_opts.get("date_format", "%Y-%m-%d")
        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, date_format))

        return data
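
This parser is configuration-driven: everything source-specific (key, date column, date format, even the column adapter) arrives through parse_opts, so a single class can cover many hospitalization feeds. A hypothetical invocation, assuming parser is an instance of this data source class:

# An ISO-dated feed keyed to Ireland, using the default column adapter:
data_ie = parser.parse_dataframes(dataframes, aux, key="IE")
# A US-dated feed with a non-default date column:
data_fl = parser.parse_dataframes(
    dataframes, aux, key="US_FL", date_column="report_date",
    date_format="%m/%d/%Y")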
Example #21
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        # Rename the appropriate columns
        data = (dataframes[0].rename(
            columns={
                "prname": "subregion1_name",
                "numconf": "total_confirmed",
                "numtoday": "new_confirmed",
                "numdeaths": "total_deceased",
                "numtested": "total_tested",
                "numrecover": "total_recovered",
            }).drop(columns=["prnameFR"]))

        # Convert date to ISO format
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%d-%m-%Y"))

        # Make sure all records have the country code and match subregion1 only
        data["country_code"] = "CA"
        data["subregion2_code"] = None

        # Country-level records should have null region name
        country_mask = data["subregion1_name"] == "Canada"
        data.loc[country_mask, "subregion1_name"] = None

        # Remove bogus data
        data = data[~data["subregion1_name"].apply(lambda x: "traveller" in
                                                   (x or "").lower())]

        # Output the results
        return data
Example #22
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = table_merge(
            [
                dataframes["confirmed_deceased_recovered"].rename(
                    columns=COMMON_COLUMNS, ),
                dataframes["tested"].rename(columns={
                    "TestGesamt": "total_tested",
                    "MeldeDatum": "Time"
                })
            ],
            how="outer",
        )

        # Convert date to ISO format
        data["date"] = data["Time"].apply(
            lambda x: datetime_isoformat(x, "%d.%m.%Y %H:%M:%S"))

        # Create the key from the state ID
        data["key"] = data["BundeslandID"].apply(lambda x: f"AT_{x}")

        data.loc[data["key"] == "AT_10", "key"] = "AT"

        # Output the results
        return data
Example #23
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        # Rename the appropriate columns
        data = (
            dataframes[0]
            .rename(
                columns={
                    "Location": "match_string",
                    "Confirmed": "total_confirmed",
                    "Deaths": "total_deceased",
                    "Recoveries": "total_recovered",
                    "Date": "date",
                }
            )
            .drop(columns=["Active"])
        )

        # Convert date to ISO format
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

        # The first row is metadata info about column names - discard it
        data = data[data.match_string != "#loc+name"]

        # Parse integers, stripping thousands separators
        for column in ("total_confirmed", "total_deceased", "total_recovered"):
            data[column] = data[column].apply(lambda x: safe_int_cast(str(x).replace(",", "")))

        # Make sure all records have the country code
        data["country_code"] = "LY"

        # Output the results
        return data
Example #24
    def parse_dataframes(
        self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = dataframes[0]
        column_tokens = ["confirmed_", "deaths_", "recovered_"]
        data = data[[col for col in data.columns if any(token in col for token in column_tokens)]]
        data = data.drop(
            columns=["cases_confirmed_new", "cases_unconfirmed_new", "deaths_new", "recovered_new"]
        )
        data["date"] = dataframes[0].date.apply(lambda x: datetime_isoformat(x, "%d-%m-%Y"))

        subsets = []
        for token in column_tokens:
            df = data[["date"] + [col for col in data.columns if token in col]]
            df = pivot_table(df.set_index("date"), pivot_name="match_string")
            df.match_string = df.match_string.apply(lambda x: x.split("_", 2)[1])
            df = df.rename(columns={"value": token.split("_")[0]})
            subsets.append(df)

        data = subsets[0]
        for df in subsets[1:]:
            data = data.merge(df, how="outer")
        data = data.rename(columns={"deaths": "deceased"})

        data = data[data.match_string != "unconfirmed"]
        data = grouped_diff(data, ["match_string", "date"])
        data["country_code"] = "PT"
        return data
Example #25
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        hospitalizations = dataframes[0]
        icu = table_rename(
            hospitalizations.loc[hospitalizations["DPHCategory"] == "ICU"],
            {
                "reportDate": "date",
                "PatientCount": "current_intensive_care"
            },
            drop=True,
        )
        hosp = table_rename(
            hospitalizations.loc[hospitalizations["DPHCategory"] ==
                                 "Med/Surg"],
            {
                "reportDate": "date",
                "PatientCount": "current_hospitalized"
            },
            drop=True,
        )

        data = icu.merge(hosp, on="date")
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y/%m/%d"))
        data["key"] = "US_CA_SFO"
        return data
Example #26
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = table_multimerge(
            [
                table_rename(dataframes["confirmed"], _column_adapter, drop=True),
                table_rename(dataframes["deceased"], _column_adapter, drop=True),
            ],
            how="outer",
        )

        # Province names are sometimes codes (but not always compliant with ISO codes)
        data["subregion1_code"] = data["subregion1_name"].apply(_province_map.get)
        data.drop(columns=["subregion1_name"], inplace=True)

        # Convert date to ISO format
        data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%d-%m-%Y"))

        # Aggregate subregion1 level
        l1_index = ["date", "subregion1_code"]
        l1 = data.drop(columns=["match_string"]).groupby(l1_index).sum().reset_index()

        # Make sure all records have the country code and subregion2_name
        l1["country_code"] = "CA"
        l1["subregion2_name"] = None
        data["country_code"] = "CA"
        data["subregion2_name"] = ""

        # Remove bogus data
        data = data[data["match_string"] != "Not Reported"]

        # Output the results
        return concat([l1, data])
Example #27
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        data = dataframes[0]
        # Get all the states
        states = list(data.columns.difference(["Status", "Date"]))
        # Flatten the table
        data = melt(data,
                    id_vars=["Date", "Status"],
                    value_vars=states,
                    var_name="subregion1_code")
        # Convert numeric fields to integers
        data["value"] = data["value"].apply(safe_int_cast)
        # Pivot on Status to get flattened confirmed, deceased, recovered numbers
        data = data.pivot_table("value", ["Date", "subregion1_code"], "Status")
        data.reset_index(drop=False, inplace=True)
        data = data.reindex(
            ["Date", "subregion1_code", "Confirmed", "Deceased", "Recovered"],
            axis=1)

        data = data.rename(
            columns={
                "Confirmed": "new_confirmed",
                "Deceased": "new_deceased",
                "Recovered": "new_recovered",
                "Date": "date",
            })
        # No data is recorded against IN_DD, it is now a district of IN_DN
        data = data[data.subregion1_code != "DD"]
        data.date = data.date.apply(
            lambda x: datetime_isoformat(x, "%d-%b-%y"))
        data["key"] = "IN_" + data["subregion1_code"]

        return data
Example #28
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:

        column_map = {
            "date": "date",
            "state": "subregion1_code",
            "positive": "total_confirmed",
            "death": "total_deceased",
            "total": "total_tested",
            "recovered": "total_recovered",
            "hospitalizedCurrently": "current_hospitalized",
            "hospitalizedCumulative": "total_hospitalized",
            "inIcuCurrently": "current_intensive_care",
            "inIcuCumulative": "total_intensive_care",
            "onVentilatorCurrently": "current_ventilator",
            "onVentilatorCumulative": "total_ventilator",
        }

        # Rename the appropriate columns
        data = dataframes[0].rename(columns=column_map)

        # Convert date to ISO format
        data["date"] = data["date"].apply(
            lambda x: datetime_isoformat(x, "%Y%m%d"))

        # Keep only columns we can process
        data["key"] = "US_" + data["subregion1_code"]
        data = data[["key"] + list(column_map.values())].drop(
            columns=["subregion1_code"])

        # Output the results
        return data
Example #29
    def parse_dataframes(
        self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
    ) -> DataFrame:

        data = dataframes[0]
        # Get all the states
        states = list(data.columns.difference(["Status", "Date"]))
        # Flatten the table
        data = melt(data, id_vars=["Date", "Status"], value_vars=states, var_name="subregion1_code")
        # Pivot on Status to get flattened confirmed, deceased, recovered numbers
        data = data.pivot_table("value", ["Date", "subregion1_code"], "Status")
        data.reset_index(drop=False, inplace=True)
        data = data.reindex(
            ["Date", "subregion1_code", "Confirmed", "Deceased", "Recovered"], axis=1
        )

        data = data.rename(
            columns={
                "Confirmed": "new_confirmed",
                "Deceased": "new_deceased",
                "Recovered": "new_recovered",
                "Date": "date",
            }
        )

        data.date = data.date.apply(lambda x: datetime_isoformat(x, "%d-%b-%y"))

        data["country_code"] = "IN"

        return data
Example #30
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        cases = table_rename(dataframes[0], _srag_column_adapter, drop=True)
        covid_mask = cases["_classification"] == 5
        valid_mask = cases["_prognosis"].notna() & (cases["_prognosis"] != 9)
        cases = cases[covid_mask & valid_mask]

        # Record the date of death
        cases["date_new_deceased"] = None
        deceased_mask = cases["_prognosis"] == 2
        cases.loc[deceased_mask,
                  "date_new_deceased"] = cases.loc[deceased_mask,
                                                   "_date_prognosis"]

        # Convert ages to int, and translate sex (no "other" sex/gender reported)
        cases["age"] = cases["age"].apply(safe_int_cast)
        cases["sex"] = cases["sex"].apply({"M": "male", "F": "female"}.get)

        # Convert all dates to ISO format
        for col in filter(lambda x: x.startswith("date"), cases.columns):
            cases[col] = cases[col].apply(
                lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Parse subregion codes
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 5))

        # Convert to time series format
        data = convert_cases_to_time_series(cases,
                                            index_columns=["subregion2_code"])
        data["country_code"] = "BR"

        # Get rid of bogus records
        data = data.dropna(subset=["date"])
        data = data[data["date"] >= "2020-01-01"]
        data = data[data["date"] < date_today(offset=1)]

        # Aggregate by country level
        country = (data.drop(columns=["subregion2_code"]).groupby(
            ["date", "age", "sex"]).sum().reset_index())
        country["key"] = "BR"

        # Aggregate by state level
        data["subregion1_code"] = data["subregion2_code"].apply(
            lambda x: _IBGE_STATES.get(safe_int_cast(x[:2])))
        state = (data.drop(columns=["subregion2_code"]).dropna(
            subset=["subregion1_code"]).groupby(
                ["date", "subregion1_code", "age", "sex"]).sum().reset_index())
        state["key"] = "BR_" + state["subregion1_code"]

        # Derive the key from subregion codes
        data = data[data["subregion2_code"].notna()]
        data["key"] = "BR_" + data["subregion1_code"] + "_" + data[
            "subregion2_code"]

        return concat([country, state, data])
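
numeric_code_as_string (used above to normalize the municipality codes) presumably zero-pads a numeric code to a fixed width; a minimal sketch reusing the safe_int_cast sketch from earlier:

from typing import Any, Optional

def numeric_code_as_string(value: Any, digits: int) -> Optional[str]:
    # Hypothetical sketch: 1234 with digits=5 -> "01234"; unparseable -> None.
    cast = safe_int_cast(value)
    return None if cast is None else str(cast).zfill(digits)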