    def fetch(
        self,
        output_folder: Path,
        cache: Dict[str, str],
        fetch_opts: List[Dict[str, Any]],
        skip_existing: bool = False,
    ) -> Dict[str, str]:
        # Data is published as snapshots, so we guess the URL based on the date
        opts = dict(fetch_opts[0])
        url_tpl = opts.pop("url")

        # Keep trying URLs in reverse chronological order starting today until one works
        url = None
        date_start = "2021-08-31"
        date_end = date_today(offset=1)
        for date in reversed(list(date_range(date_start, date_end))):
            url = url_tpl.format(date=date.replace("-", ""))
            res = requests.head(url)
            if res.status_code == 200 and int(
                    res.headers.get("Content-Length", "0")) > 0:
                # Pass the actual URL down to fetch it
                url_opts = dict(url=url, **opts)
                return super().fetch(output_folder,
                                     cache, [url_opts],
                                     skip_existing=skip_existing)

        # Fail loudly if no working snapshot URL was found
        raise RuntimeError(f"No working URL found for template {url_tpl}")
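These fetchers lean on the repository's date helpers. A minimal sketch of the behaviour the snippets appear to assume (ISO "YYYY-MM-DD" strings, an inclusive range, and an optional day offset); this is an illustration, not the library's actual implementation:

from datetime import datetime, timedelta
from typing import Iterator


def date_range_sketch(start: str, end: str) -> Iterator[str]:
    # Assumed behaviour: yield ISO date strings from <start> to <end>, both inclusive.
    cursor = datetime.strptime(start, "%Y-%m-%d")
    stop = datetime.strptime(end, "%Y-%m-%d")
    while cursor <= stop:
        yield cursor.strftime("%Y-%m-%d")
        cursor += timedelta(days=1)


def date_today_sketch(offset: int = 0) -> str:
    # Assumed behaviour: today's date as an ISO string, shifted by <offset> days.
    return (datetime.utcnow() + timedelta(days=offset)).strftime("%Y-%m-%d")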
Example #2
def _process_partition(cases: DataFrame) -> DataFrame:
    cases = cases.copy()

    # Extract information about whether doses were first (partial immunization) or second (full)
    cases["date_new_persons_vaccinated"] = None
    cases["date_new_persons_fully_vaccinated"] = None
    first_dose_mask = cases["_dose_information"].str.strip().str.slice(
        0, 1) == "1"
    second_dose_mask = cases["_dose_information"].str.strip().str.slice(
        0, 1) == "2"
    cases.loc[first_dose_mask, "date_new_persons_vaccinated"] = cases.loc[
        first_dose_mask, "date_new_vaccine_doses_administered"]
    cases.loc[second_dose_mask,
              "date_new_persons_fully_vaccinated"] = cases.loc[
                  second_dose_mask, "date_new_vaccine_doses_administered"]

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        safe_int_cast).astype(str)

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({
        "m": "male",
        "f": "female"
    }.get)

    # Convert to time series format
    data = convert_cases_to_time_series(
        cases, index_columns=["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(
        lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "BR"

    # Aggregate data by state
    state = (data.drop(columns=["subregion2_code"]).groupby(
        ["date", "subregion1_code", "age", "sex"]).sum().reset_index())
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2
    data = data[data["subregion2_code"].notna()
                & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data[
        "subregion2_code"]

    return concat([country, state, data])
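convert_cases_to_time_series comes from the repository's utilities; judging from how it is used above, it pivots per-case "date_*" columns into daily counts keyed by the index columns plus age and sex. A rough, hypothetical sketch of that shape of transformation (not the repo's actual implementation):

from pandas import DataFrame, concat


def convert_cases_to_time_series_sketch(cases: DataFrame, index_columns) -> DataFrame:
    # For every "date_<statistic>" column, count how many cases fall on each date,
    # broken down by the index columns plus age and sex (assumed output shape).
    series = []
    for col in [c for c in cases.columns if c.startswith("date_")]:
        stat = col[len("date_"):]  # e.g. "new_confirmed"
        keys = list(index_columns) + [cases[col].rename("date"), "age", "sex"]
        series.append(cases.groupby(keys).size().rename(stat))
    return concat(series, axis=1).fillna(0).reset_index()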
    def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame],
              **parse_opts) -> DataFrame:
        # Data can only be retrieved one day at a time, and it starts on 2020-01-22
        first = "2020-01-22"
        map_iter = list(date_range(first, date_today()))
        records = sum(thread_map(_get_daily_records, map_iter), [])
        return DataFrame.from_records(records)
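The sum(..., []) call flattens the per-day record lists returned by thread_map into a single list; for long date ranges, itertools.chain does the same without repeated list copying. A small equivalent sketch:

from itertools import chain


def flatten(list_of_lists):
    # Equivalent to sum(list_of_lists, []) but linear in the total number of items.
    return list(chain.from_iterable(list_of_lists))


# e.g. flatten([[1, 2], [3], []]) == [1, 2, 3]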
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        # Data is nested into multiple sheets
        tables = []
        for df in list(dataframes[0].values()):
            # Header has two rows, but we ignore them and use our own columns anyway
            df.columns = _columns
            df = df.iloc[2:].copy()

            # Keep only rows with indexable columns not null
            df.dropna(subset=["date", "subregion2_name"], inplace=True)

            # Add the sheet (covering all subregions) to the tables, skipping the first remaining row
            tables.append(df.iloc[1:])

        # Put all sheets together into a single DataFrame
        data = concat(tables)

        # Ensure date is in ISO format
        data["date"] = data["date"].apply(lambda x: str(x)[:10])

        # Make sure that all data is numeric
        for col in data.columns:
            if col not in ("date", "subregion2_name"):
                data[col] = data[col].apply(safe_int_cast)

        # Filter out dates beyond today
        data = data[data["date"] < date_today(offset=1)]

        # Output the results
        data["country_code"] = "SL"
        return data
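safe_int_cast is another repository helper used throughout these parsers; the snippets rely on it returning None rather than raising when a value cannot be parsed. A hypothetical stand-in with that assumed contract:

def safe_int_cast_sketch(value):
    # Assumed contract: best-effort conversion to int, None on failure (not the repo's code).
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return None


# safe_int_cast_sketch("42") -> 42, safe_int_cast_sketch("n/a") -> None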
    def fetch(
        self,
        output_folder: Path,
        cache: Dict[str, str],
        fetch_opts: List[Dict[str, Any]],
        skip_existing: bool = False,
    ) -> Dict[str, str]:
        # Data is published as GitHub Releases, so we guess the URL based on the date
        opts = dict(fetch_opts[0])
        url_tpl = opts.pop("url")

        urls = []
        date_start = "2021-01-11"
        date_end = date_today(offset=1)
        for date in date_range(date_start, date_end):
            urls.append(
                dict(name=date,
                     url=url_tpl.format(date=date.replace("-", "")),
                     **opts))

        # Pass the actual URLs down to fetch it
        return super().fetch(output_folder,
                             cache,
                             urls,
                             skip_existing=skip_existing)
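For reference, the generated options are plain dicts whose url is the template with the dashes stripped from the ISO date. Illustrative only, using a made-up template:

url_tpl = "https://example.com/releases/download/{date}/data.csv"  # hypothetical template
date = "2021-01-11"
print(dict(name=date, url=url_tpl.format(date=date.replace("-", ""))))
# {'name': '2021-01-11', 'url': 'https://example.com/releases/download/20210111/data.csv'}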
Example #6
    def parse_dataframes(self, dataframes: Dict[str, DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
        cases = table_rename(dataframes[0], _srag_column_adapter, drop=True)
        covid_mask = cases["_classification"] == 5
        valid_mask = cases["_prognosis"].notna() & (cases["_prognosis"] != 9)
        cases = cases[covid_mask & valid_mask]

        # Record the date of death
        cases["date_new_deceased"] = None
        deceased_mask = cases["_prognosis"] == 2
        cases.loc[deceased_mask,
                  "date_new_deceased"] = cases.loc[deceased_mask,
                                                   "_date_prognosis"]

        # Convert ages to int, and translate sex (no "other" sex/gender reported)
        cases["age"] = cases["age"].apply(safe_int_cast)
        cases["sex"] = cases["sex"].apply({"M": "male", "F": "female"}.get)

        # Convert all dates to ISO format
        for col in filter(lambda x: x.startswith("date"), cases.columns):
            cases[col] = cases[col].apply(
                lambda x: datetime_isoformat(x, "%d/%m/%Y"))

        # Parse subregion codes
        cases["subregion2_code"] = cases["subregion2_code"].apply(
            lambda x: numeric_code_as_string(x, 5))

        # Convert to time series format
        data = convert_cases_to_time_series(cases,
                                            index_columns=["subregion2_code"])
        data["country_code"] = "BR"

        # Get rid of bogus records
        data = data.dropna(subset=["date"])
        data = data[data["date"] >= "2020-01-01"]
        data = data[data["date"] < date_today(offset=1)]

        # Aggregate by country level
        country = (data.drop(columns=["subregion2_code"]).groupby(
            ["date", "age", "sex"]).sum().reset_index())
        country["key"] = "BR"

        # Aggregate by state level
        data["subregion1_code"] = data["subregion2_code"].apply(
            lambda x: _IBGE_STATES.get(safe_int_cast(x[:2])))
        state = (data.drop(columns=["subregion2_code"]).dropna(
            subset=["subregion1_code"]).groupby(
                ["date", "subregion1_code", "age", "sex"]).sum().reset_index())
        state["key"] = "BR_" + state["subregion1_code"]

        # Derive the key from subregion codes
        data = data[data["subregion2_code"].notna()]
        data["key"] = "BR_" + data["subregion1_code"] + "_" + data[
            "subregion2_code"]

        return concat([country, state, data])
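numeric_code_as_string(x, 5) is used above to normalize municipality codes; the usage suggests it zero-pads a numeric code to a fixed number of digits and returns None for missing values. A hypothetical sketch of that assumed behaviour:

def numeric_code_as_string_sketch(value, digits: int):
    # Assumed behaviour: render a numeric code as a zero-padded string of <digits>
    # characters, or None when the value cannot be parsed (not the repo's code).
    try:
        return str(int(float(value))).zfill(digits)
    except (TypeError, ValueError):
        return None


# numeric_code_as_string_sketch(123, 5) -> "00123"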
Example #7
    def fetch(
        self,
        output_folder: Path,
        cache: Dict[str, str],
        fetch_opts: List[Dict[str, Any]],
        skip_existing: bool = False,
    ) -> Dict[str, str]:
        # Data is published as GitHub Releases, so we guess the URL based on the date
        opts = fetch_opts[0]
        url_tpl = opts["url"]

        # Go from <today + 1> until the last known date for which data is reported
        # NOTE: at the time of writing, last known date is October 20
        working_url = None
        last_known_date = "2020-10-20"
        latest_date = date_today(offset=1)
        for date in reversed(list(date_range(last_known_date, latest_date))):
            try:
                url_test = url_tpl.format(date=date.replace("-", "."))
                self.log_debug(f"Trying {url_test}")
                res = requests.get(url_test, timeout=60)
                if res.ok:
                    working_url = url_test
                    break
            except requests.RequestException:
                continue

        # Make sure that we found a working URL
        assert working_url is not None, "No working URL found for DXY data source"

        # Pass the actual URL down to fetch it
        return super().fetch(output_folder,
                             cache, [{
                                 **opts, "url": working_url
                             }],
                             skip_existing=skip_existing)
    def fetch(
        self,
        output_folder: Path,
        cache: Dict[str, str],
        fetch_opts: List[Dict[str, Any]],
        skip_existing: bool = False,
    ) -> Dict[str, str]:
        # Data is published as daily snapshots, so we guess the URL based on the date
        opts = dict(fetch_opts[0])
        url_tpl = opts.pop("url")

        urls = []
        date_start = "2020-05-06"
        date_end = date_today(offset=1)
        for date in date_range(date_start, date_end):
            datestr = "".join(reversed(date.split("-")))
            urls.append(
                dict(name=date, url=url_tpl.format(date=datestr), **opts))

        # Pass the actual URLs down to fetch it
        return super().fetch(output_folder,
                             cache,
                             urls,
                             skip_existing=skip_existing)
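The "".join(reversed(date.split("-"))) line turns the ISO date into the day-first form this source expects in its snapshot URLs; illustrative only:

date = "2020-05-06"
print("".join(reversed(date.split("-"))))  # "06052020" (DDMMYYYY)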
Example #9
def _process_partition(cases: DataFrame) -> DataFrame:
    cases = cases.copy()

    # Confirmed cases are only those with a confirmed positive test result
    cases["date_new_confirmed"] = None
    confirmed_mask = cases["_test_result"] == "Positivo"
    cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[confirmed_mask, "date_new_tested"]

    # Do not process deceased counts, since they are considered highly inaccurate

    # # Deceased cases have a specific label and the date is the "closing" date
    # cases["date_new_deceased"] = None
    # deceased_mask = cases["_prognosis"] == "Óbito"
    # cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_date_update"]

    # # Only count deceased cases from confirmed subjects
    # cases.loc[~confirmed_mask, "date_new_deceased"] = None

    # Recovered cases have a specific label and the date is the "closing" date
    cases["date_new_recovered"] = None
    recovered_mask = cases["_prognosis"] == "Cured"
    cases.loc[recovered_mask, "date_new_recovered"] = cases.loc[recovered_mask, "_date_update"]

    # Only count recovered cases from confirmed subjects
    cases.loc[~confirmed_mask, "date_new_recovered"] = None

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(safe_int_cast)
    # The last digit of the region code is actually not necessary
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: None if isna(x) else str(int(x))[:-1]
    )

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({"masculino": "male", "feminino": "female"}.get)

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = (
        data.drop(columns=["subregion1_code", "subregion2_code"])
        .groupby(["date", "age", "sex"])
        .sum()
        .reset_index()
    )
    country["key"] = "BR"

    # Aggregate data by state
    state = (
        data.drop(columns=["subregion2_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2
    data = data[data["subregion2_code"].notna() & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])
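The truncation step above drops the final digit of the municipality code before building keys; a small illustration with a made-up code, assuming the last digit is a check digit that the key does not need:

from pandas import isna

code = 3550308.0  # hypothetical 7-digit municipality code, read as a float
print(None if isna(code) else str(int(code))[:-1])  # "355030"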