def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename the appropriate columns
    data = dataframes[0].rename(
        columns={
            "日付": "date",
            "都道府県名": "match_string",
            "患者数": "confirmed",
            "入院中": "hospitalized",
            "退院者": "recovered",
            "死亡者": "deceased",
        }
    )

    # Convert date to ISO format
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y%m%d"))

    # Add the country code to all records
    data["country_code"] = "JP"

    # Keep only columns we can process, including the grouping key country_code
    data = data[
        ["date", "country_code", "match_string", "confirmed", "hospitalized", "recovered", "deceased"]
    ]

    # Compute the cumulative counts at the region level
    data = grouped_cumsum(data, ["country_code", "match_string", "date"])

    # Aggregate the country-level data
    data_country = data.groupby("date").sum().reset_index()
    data_country["key"] = "JP"

    # Output the results
    return concat([data_country, data])
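# These parsers lean on a `grouped_cumsum` helper that is not shown here. Below
# is a minimal sketch of what it might look like, assuming the last element of
# `keys` is the date column and every other non-key column is a daily count to
# accumulate; the `skip` parameter (used by the Slovenia parser below) would
# exclude columns from the cumulative sum:
from typing import List, Optional
from pandas import DataFrame


def grouped_cumsum(
    data: DataFrame, keys: List[str], skip: Optional[List[str]] = None
) -> DataFrame:
    skip = skip or []
    group_keys = keys[:-1]
    value_columns = [col for col in data.columns if col not in keys and col not in skip]
    # Sort by group keys and date so the cumulative sum runs chronologically
    data = data.sort_values(keys)
    data[value_columns] = data.groupby(group_keys)[value_columns].cumsum()
    return data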
def _parse(file_path: str, sheet_name: str, value_name: str):
    data = read_file(file_path, sheet_name=sheet_name)

    # The second row holds the header as "NHS <region> total" labels; strip the
    # boilerplate and keep only the region names
    data.columns = [col.replace("NHS ", "").replace(" total", "") for col in data.iloc[1]]
    data = data.iloc[2:].rename(columns={"Date": "date"})

    # Unpivot the regions, which are columns
    data = pivot_table(data.set_index("date"), pivot_name="match_string")
    data = data.rename(columns={"value": value_name})

    # Values may be censored with "*"; cast everything else to float
    data[value_name] = data[value_name].replace("*", None).apply(safe_float_cast).astype(float)

    # Get date in ISO format
    data.date = data.date.apply(lambda x: x.date().isoformat())

    # Compute the cumulative sum of the values
    data = grouped_cumsum(data, ["match_string", "date"])

    # Add metadata
    data["key"] = None
    data["country_code"] = "GB"
    data["subregion1_code"] = "SCT"
    l2_mask = data.match_string == "Scotland"
    data.loc[l2_mask, "key"] = "GB_SCT"
    return data
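# `pivot_table` here is the project's unpivot helper, not pandas' pivot_table.
# A plausible sketch, assuming the input is a wide table indexed by date with
# one column per region and the output is a long table with
# [date, <pivot_name>, value] columns:
from pandas import DataFrame


def pivot_table(data: DataFrame, pivot_name: str = "pivot") -> DataFrame:
    # Melt each region column into its own row, keeping the date index
    index_name = data.index.name
    return data.reset_index().melt(
        id_vars=[index_name], var_name=pivot_name, value_name="value"
    )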
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename appropriate columns
    data = dataframes[0].rename(
        columns={
            "Codigo DIVIPOLA": "subregion2_code",
            "Fecha de muerte": "date_deceased",
            "Fecha diagnostico": "date_confirmed",
            "Fecha recuperado": "date_recovered",
        }
    )

    # Clean up the subregion code by zero-padding it to 5 digits
    data.subregion2_code = data.subregion2_code.apply(lambda x: "{0:05d}".format(int(x)))

    # Compute the key from the DIVIPOLA code
    data["key"] = (
        "CO_" + data.subregion2_code.apply(lambda x: x[:2]) + "_" + data.subregion2_code
    )

    # A few records are reported at the department (level 2) granularity
    data.key = data.key.apply(lambda x: "CO_" + x[-2:] if x.startswith("CO_00_") else x)

    # Go from individual case records to key-grouped records in a flat table
    merged: DataFrame = None
    for value_column in ("confirmed", "deceased", "recovered"):
        subset = data.rename(columns={"date_{}".format(value_column): "date"})[["key", "date"]]
        subset = subset[~subset.date.isna() & (subset.date != "- -")].dropna()
        subset[value_column] = 1
        subset = subset.groupby(["key", "date"]).sum().reset_index()
        merged = subset if merged is None else merged.merge(subset, how="outer")

    # Convert date to ISO format, dropping values that cannot be parsed
    merged.date = merged.date.apply(safe_datetime_parse)
    merged = merged[~merged.date.isna()]
    merged.date = merged.date.apply(lambda x: x.date().isoformat())
    merged = merged.fillna(0)

    # Compute the cumulative counts
    data = grouped_cumsum(merged, ["key", "date"])

    # Aggregate by level 2 region by adding up the parts
    l2 = data.copy()
    l2["key"] = l2.key.apply(lambda x: "_".join(x.split("_")[:2]))
    l2 = l2.groupby(["key", "date"]).sum().reset_index()

    # Aggregate at the country level by adding up the parts
    l1 = l2.copy().drop(columns=["key"])
    l1 = l1.groupby("date").sum().reset_index()
    l1["key"] = "CO"

    # Output the results, including the aggregated levels
    return concat([l1, l2, data])
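# A quick worked example of the key derivation above, condensed into a single
# hypothetical function (the codes are illustrative):
def _derive_colombia_key(divipola_code: str) -> str:
    # Department-level records are padded with a leading "00"
    if divipola_code.startswith("00"):
        return "CO_" + divipola_code[-2:]
    return "CO_" + divipola_code[:2] + "_" + divipola_code


assert _derive_colombia_key("05001") == "CO_05_05001"  # municipality in department 05
assert _derive_colombia_key("00011") == "CO_11"  # department-level record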
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Parse the confirmed and deceased tables separately, then join them
    df1 = Jp2019NcovJapanByDate._parse_pivot(dataframes[0], "confirmed")
    df2 = Jp2019NcovJapanByDate._parse_pivot(dataframes[1], "deceased")

    # Keep only columns we can process
    data = merge(df1, df2)
    data = data[["date", "country_code", "match_string", "confirmed", "deceased"]]

    return grouped_cumsum(data, ["country_code", "match_string", "date"])
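# `Jp2019NcovJapanByDate._parse_pivot` is not shown here. Based on how it is
# used above, a plausible sketch, assuming each source table has a "date"
# column plus one column per prefecture (the column layout is an assumption,
# not the confirmed implementation):
from pandas import DataFrame


def _parse_pivot(data: DataFrame, value_name: str) -> DataFrame:
    # Unpivot prefecture columns into [date, match_string, <value_name>] rows
    data = data.melt(id_vars=["date"], var_name="match_string", value_name=value_name)
    data["country_code"] = "JP"
    return data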
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = read_file(sources[0], sheet_name="Antal intensivvårdade per dag").rename(
        columns={"Datum_vårdstart": "date", "Antal_intensivvårdade": "intensive_care"}
    )

    data["key"] = "SE"

    # Get date in ISO format
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

    return grouped_cumsum(data, ["key", "date"])
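# `datetime_isoformat` is used by several parsers above and below. A minimal
# sketch, assuming it parses a value with the given strptime format and returns
# an ISO 8601 date string, or None when parsing fails:
from datetime import datetime
from typing import Optional


def datetime_isoformat(value, date_format: str) -> Optional[str]:
    try:
        return datetime.strptime(str(value), date_format).date().isoformat()
    except ValueError:
        return None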
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = read_file(sources[0], error_bad_lines=False, encoding="ISO-8859-1", sep=";").rename(
        columns={
            "Date": "date",
            "Nombre de personnes en soins normaux": "current_hospitalized",
            "Nombre de personnes en soins intensifs (sans patients du Grand Est)": "current_intensive_care",
            "Nombre de décès - cumulé (sans patients du Grand Est)": "deceased",
            "Total patients COVID ayant quitté l'hôpital (hospitalisations stationnaires, données brutes)": "recovered",
            "Nombre de nouvelles personnes testées COVID+ par jour ": "tested",
        }
    )

    # Get date in ISO format
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Keep only columns we can process
    data = data[
        ["date", "current_hospitalized", "current_intensive_care", "deceased", "recovered", "tested"]
    ]

    # Convert recovered into a number, treating "-" as zero
    data.recovered = data.recovered.apply(lambda x: int(x.replace("-", "0")))

    # Deceased is reported as a cumulative total, so take the daily difference;
    # tested and recovered are daily counts, so take their cumulative sum
    data["key"] = "LU"
    data_new = grouped_diff(data[["key", "date", "deceased"]], ["key", "date"])
    data_cum = grouped_cumsum(data[["key", "date", "tested", "recovered"]], ["key", "date"])
    data_cur = data[["key", "date", "current_hospitalized", "current_intensive_care"]]
    data = data_new.merge(data_cum, how="outer").merge(data_cur, how="outer")

    # Output the results
    return data
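# `grouped_diff` is not shown either; since the deceased column in this source
# is a cumulative total, it presumably recovers the day-over-day differences.
# A minimal sketch under the same assumptions as the `grouped_cumsum` sketch
# above:
from typing import List
from pandas import DataFrame


def grouped_diff(data: DataFrame, keys: List[str]) -> DataFrame:
    group_keys = keys[:-1]
    value_columns = [col for col in data.columns if col not in keys]
    # Sort chronologically, then diff within each group; the first record of
    # each group has no predecessor and is dropped
    data = data.sort_values(keys)
    data[value_columns] = data.groupby(group_keys)[value_columns].diff()
    return data.dropna(subset=value_columns)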
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = read_file(sources[0], sheet_name="Antal per dag region").rename(
        columns={"Statistikdatum": "date"}
    )

    # Get date in ISO format
    data.date = data.date.astype(str)

    # Unpivot the regions, which are columns
    data.columns = [col.replace("_", " ") for col in data.columns]
    data = data.drop(columns=["Totalt antal fall"]).set_index("date")
    data = pivot_table(data, pivot_name="match_string")

    data["country_code"] = "SE"
    data = data.rename(columns={"value": "confirmed"})
    return grouped_cumsum(data, ["country_code", "match_string", "date"])
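# `read_file` is used throughout these parsers for both Excel and CSV sources.
# A plausible sketch, assuming it simply dispatches on the file extension and
# forwards reader-specific options such as `sheet_name` or `sep`:
from pandas import DataFrame, read_csv, read_excel


def read_file(path: str, **read_opts) -> DataFrame:
    if str(path).endswith((".xls", ".xlsx")):
        return read_excel(path, **read_opts)
    return read_csv(path, **read_opts)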
def parse_dataframes(
    self, dataframes: List[DataFrame], metadata: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    data = dataframes[0]
    metadata = metadata["metadata"]

    # Ensure the date field is handled as a string
    data["dateRep"] = data["dateRep"].astype(str)

    # Convert date to ISO format
    data["date"] = data["dateRep"].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Workaround for https://github.com/open-covid-19/data/issues/8
    # ECDC mistakenly labels Greece's country code as EL instead of GR
    data["geoId"] = data["geoId"].apply(lambda code: "GR" if code == "EL" else code)

    # Workaround for https://github.com/open-covid-19/data/issues/13
    # ECDC mistakenly labels Great Britain's country code as UK instead of GB
    data["geoId"] = data["geoId"].apply(lambda code: "GB" if code == "UK" else code)

    # Remove bogus entries (cruise ships, etc.) which use codes longer than 2 characters
    data = data[~data["geoId"].apply(lambda code: len(code) > 2)]
    data = data.rename(columns={"geoId": "key", "cases": "confirmed", "deaths": "deceased"})

    # Adjust the date of the records to match local reporting
    data = self._adjust_date(data, metadata)

    # Keep only the columns we can process
    data = data[["date", "key", "confirmed", "deceased"]]

    return grouped_cumsum(data, ["key", "date"])
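# `self._adjust_date` is not shown here. ECDC publishes counts the day after
# they occur for many countries, so one plausible sketch is a uniform one-day
# shift; the real helper presumably derives a per-country offset from the
# metadata table, and this simplification is an assumption:
from datetime import datetime, timedelta
from pandas import DataFrame


def _adjust_date(data: DataFrame, metadata: DataFrame) -> DataFrame:
    def shift_back(date_string: str) -> str:
        # Move each record back one day to match the local reporting date
        return (datetime.fromisoformat(date_string) - timedelta(days=1)).date().isoformat()

    data = data.copy()
    data["date"] = data["date"].apply(shift_back)
    return data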
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename the appropriate columns
    data = dataframes[0].rename(
        columns={
            "Date": "date",
            "Tested (all)": "total_tested",
            "Tested (daily)": "new_tested",
            "Positive (all)": "total_confirmed",
            "Positive (daily)": "new_confirmed",
            "All hospitalized on certain day": "current_hospitalized",
            "All persons in intensive care on certain day": "active_intensive_care",
            "Discharged": "recovered",
            "Deaths (all)": "total_deceased",
            "Deaths (daily)": "new_deceased",
        }
    )

    # Make sure all records have the country code
    data["country_code"] = "SI"

    # Make sure that the date column is a string
    data.date = data.date.astype(str)

    # Compute the cumulative counts, skipping columns that are already reported
    # as totals, daily values or point-in-time snapshots
    data = grouped_cumsum(
        data,
        ["country_code", "date"],
        skip=[
            col
            for col in data.columns
            if any(kword in col for kword in ("new", "total", "active"))
        ],
    )

    # Output the results
    return data
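# With the renamed columns above, the `skip` list expansion can be checked by
# hand; only "current_hospitalized" and "recovered" are left for the cumsum:
columns = [
    "date", "total_tested", "new_tested", "total_confirmed", "new_confirmed",
    "current_hospitalized", "active_intensive_care", "recovered",
    "total_deceased", "new_deceased", "country_code",
]
skip = [col for col in columns if any(kword in col for kword in ("new", "total", "active"))]
assert skip == [
    "total_tested", "new_tested", "total_confirmed", "new_confirmed",
    "active_intensive_care", "total_deceased", "new_deceased",
]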
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    data = None
    ordered_columns = ["confirmed", "deceased", "tested", "hospitalized", "intensive_care"]
    for column_name, df in zip(ordered_columns, dataframes):
        # Each dataframe holds a single statistic with one column per region
        df = df.rename(columns={"Fecha": "date"}).set_index("date")
        df = pivot_table(df, pivot_name="match_string").rename(columns={"value": column_name})
        data = df if data is None else data.merge(df, how="left")

    # Compute the cumulative sum of the data
    data = grouped_cumsum(data, ["match_string", "date"])
    data["country_code"] = "MX"

    # Country-level records have a specific label
    data.loc[data.match_string == "Nacional", "key"] = "MX"
    return data
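# Toy illustration of the merge loop above, showing how each single-statistic
# table joins into one wide table keyed on (match_string, date); values made up:
from pandas import DataFrame

confirmed = DataFrame({"date": ["2020-05-01"], "match_string": ["Nacional"], "confirmed": [10]})
deceased = DataFrame({"date": ["2020-05-01"], "match_string": ["Nacional"], "deceased": [1]})
# Left merge aligns on the shared [date, match_string] columns automatically
merged = confirmed.merge(deceased, how="left")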
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename the appropriate columns
    data = dataframes[0].rename(
        columns={
            "jour": "date",
            "dep": "subregion2_code",
            "p": "confirmed",
            "t": "tested",
            "incid_hosp": "hospitalized",
            "incid_dc": "deceased",
            "incid_rad": "recovered",
        }
    )

    # Add subregion1_code field to all records
    data["subregion1_code"] = ""

    # Adjust for the overseas departments, which map to subregion1 codes
    region_adjust_map = {"971": "GUA", "972": "MQ", "973": "GF", "974": "LRE", "976": "MAY"}
    for subregion2_code, subregion1_code in region_adjust_map.items():
        mask = data.subregion2_code == subregion2_code
        data.loc[mask, "subregion2_code"] = None
        data.loc[mask, "subregion1_code"] = subregion1_code

    # Get date in ISO format
    data.date = data.date.astype(str)

    # Get keys from the metadata auxiliary table
    data["country_code"] = "FR"
    subregion1_mask = data.subregion2_code.isna()
    data1 = data[subregion1_mask].merge(aux["metadata"], on=["subregion1_code", "subregion2_code"])
    data2 = data[~subregion1_mask].merge(aux["metadata"], on="subregion2_code")
    data = concat([data1, data2])

    # We only need to keep the key-date pair for identification, plus the values
    keep_columns = ["date", "key", "confirmed", "tested", "deceased", "hospitalized"]
    data = data[[col for col in data.columns if col in keep_columns]]

    # Compute the cumulative counts
    data = grouped_cumsum(data, ["key", "date"])

    # Aggregate by level 2 region by adding up the parts
    l2 = data.copy()
    l2["key"] = l2.key.apply(lambda x: "_".join(x.split("_")[:2]))
    l2 = l2.groupby(["key", "date"]).sum().reset_index()

    # Aggregate at the country level by adding up the parts
    l1 = l2.copy().drop(columns=["key"])
    l1 = l1.groupby("date").sum().reset_index()
    l1["key"] = "FR"

    # Output the results, including the aggregated levels
    return concat([l1, l2, data])
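# A quick check of the key truncation used for the level 2 aggregation above
# ("FR_ARA_01" is an illustrative department-level key):
assert "_".join("FR_ARA_01".split("_")[:2]) == "FR_ARA"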