def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse MY (Malaysia) case line lists into a time series with geo names attached."""
    # All tables except "geo" contain individual case records
    tables = [df for name, df in dataframes.items() if name != "geo"]
    # FIX: `f"date_new_confirmed"` was an f-string with no placeholders; plain literal.
    column_adapter = dict(_column_adapter, state="idxs", date="date_new_confirmed")
    data = table_rename(concat(tables), column_adapter=column_adapter, drop=True)

    # Correct data types where necessary
    data["idxs"] = data["idxs"].astype(str)
    # Negative ages are placeholders in the source; treat them as unknown
    data["age"] = data["age"].apply(lambda x: None if x < 0 else x)
    data["sex"] = data["sex"].apply({0: "female", 1: "male"}.get)

    # Convert to our preferred time series format
    data = convert_cases_to_time_series(data, ["idxs"])

    # Geo name lookup: map state indices to state (and district) names
    geo_col_adapter = {"state": "subregion1_name", "district": "subregion2_name"}
    geo = table_rename(dataframes["geo"], geo_col_adapter, drop=False)
    geo["idxs"] = geo["idxs"].astype(str)
    # "W.P." (federal territory) prefixes are not used by our region names
    geo["subregion1_name"] = geo["subregion1_name"].str.replace("W.P. ", "")
    geo = geo.groupby(["subregion1_name", "idxs"]).first().reset_index()
    data = table_merge([data, geo], on=["idxs"], how="inner")

    # Since only the cases have district level data, ignore it
    data["country_code"] = "MY"
    data["subregion2_name"] = None
    return data
def parse_dataframes( self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts ) -> DataFrame: data = table_multimerge( [ table_rename(dataframes["confirmed"], _column_adapter, drop=True), table_rename(dataframes["deceased"], _column_adapter, drop=True), ], how="outer", ) # Province names are sometimes codes (but not always compliant with ISO codes) data["subregion1_code"] = data["subregion1_name"].apply(_province_map.get) data.drop(columns=["subregion1_name"], inplace=True) # Convert date to ISO format data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%d-%m-%Y")) # Aggregate subregion1 level l1_index = ["date", "subregion1_code"] l1 = data.drop(columns=["match_string"]).groupby(l1_index).sum().reset_index() # Make sure all records have the country code and subregion2_name l1["country_code"] = "CA" l1["subregion2_name"] = None data["country_code"] = "CA" data["subregion2_name"] = "" # Remove bogus data data = data[data["match_string"] != "Not Reported"] # Output the results return concat([l1, data])
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse San Francisco (US_CA_SFO) hospitalization counts into daily ICU and
    med/surg occupancy series."""
    # Annotation improved from Dict[str, DataFrame]: the table is looked up by key 0.
    hospitalizations = dataframes[0]
    # ICU patient counts by report date
    icu = table_rename(
        hospitalizations.loc[hospitalizations["DPHCategory"] == "ICU"],
        {"reportDate": "date", "PatientCount": "current_intensive_care"},
        drop=True,
    )
    # Medical/surgical (non-ICU) patient counts by report date
    hosp = table_rename(
        hospitalizations.loc[hospitalizations["DPHCategory"] == "Med/Surg"],
        {"reportDate": "date", "PatientCount": "current_hospitalized"},
        drop=True,
    )
    # Inner merge keeps only dates reported in both categories
    data = icu.merge(hosp, on="date")
    # Source dates use slashes (YYYY/MM/DD); convert to ISO format
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y/%m/%d"))
    # All records belong to San Francisco county
    data["key"] = "US_CA_SFO"
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse CL (Chile) regional confirmed/deceased/tested tables into
    match-string records, plus city records extracted from the regions."""
    data = table_merge(
        [
            table_rename(
                dataframes["confirmed"],
                {"Fecha": "date", "Total": "new_confirmed", "Region": "match_string"},
                drop=True,
            ),
            table_rename(
                dataframes["deceased"],
                # The file name indicates the counts are cumulative, but they are not
                # NOTE(review): despite the comment above, the column maps to
                # total_deceased (a cumulative variable) — confirm which is correct.
                {"Fecha": "date", "Total": "total_deceased", "Region": "match_string"},
                drop=True,
            ),
            table_rename(
                dataframes["tested"],
                {"Fecha": "date", "numero": "new_tested", "Region": "match_string"},
                drop=True,
            ),
        ],
        how="outer",
    )
    # Convert date to ISO format (source dates are already ISO; just coerce to str)
    data["date"] = data["date"].astype(str)
    # Extract cities from the regions
    city = _extract_cities(data)
    # Make sure all records have country code and no subregion code or key
    data["country_code"] = "CL"
    data["key"] = None
    data["subregion2_code"] = None
    # Country is reported as "Total"
    data.loc[data["match_string"] == "Total", "key"] = "CL"
    # Drop bogus records from the data
    data.dropna(subset=["date", "match_string"], inplace=True)
    return concat([data, city])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse CL municipality-level confirmed/deceased tables into keyed records."""
    confirmed_adapter = {
        "Fecha": "date",
        "Casos confirmados": "new_confirmed",
        "Codigo region": "subregion1_code",
        "Codigo comuna": "subregion2_code",
    }
    deceased_adapter = {
        "Fecha": "date",
        "Casos fallecidos": "total_deceased",
        "Codigo region": "subregion1_code",
        "Codigo comuna": "subregion2_code",
    }
    confirmed = table_rename(dataframes["confirmed"], confirmed_adapter, drop=True)
    deceased = table_rename(dataframes["deceased"], deceased_adapter, drop=True)
    data = table_multimerge([confirmed, deceased], how="outer")

    # Dates arrive already in ISO layout; just coerce to plain strings
    data["date"] = data["date"].astype(str)

    # Region codes are numeric in the source; zero-pad them into fixed-width strings
    data["subregion1_code"] = data["subregion1_code"].apply(
        lambda code: numeric_code_as_string(code, 2)
    )
    data["subregion2_code"] = data["subregion2_code"].apply(
        lambda code: numeric_code_as_string(code, 5)
    )

    # Use proper ISO codes for the subregion1 level
    data["subregion1_code"] = data["subregion1_code"].apply(_SUBREGION1_CODE_MAP.get)

    # Extract cities from the municipalities
    city = _extract_cities(data)

    # Build the record key directly from the subregion codes
    data["key"] = "CL_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    # Drop bogus records from the data
    data.dropna(subset=["subregion1_code", "subregion2_code"], inplace=True)

    return concat([data, city])
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse Swiss (CH) vaccination tables into per-canton dose and person counts."""
    # Each source table contributes one cumulative column under the same join keys
    source_columns = {
        "vaccDosesAdministered": "total_vaccine_doses_administered",
        "fullyVaccPersons": "total_persons_fully_vaccinated",
    }
    renamed = [
        table_rename(
            dataframes[source_name],
            {"date": "date", "geoRegion": "subregion1_code", "sumTotal": output_column},
            drop=True,
        )
        for source_name, output_column in source_columns.items()
    ]
    data = table_merge(renamed, on=["date", "subregion1_code"], how="outer")

    # Assuming fully and partially vaccinated persons have 2 and 1 doses respectively,
    # total_persons_partially_vaccinated = total_vaccine_doses_administered - 2 * total_persons_fully_vaccinated
    # Therefore, total_persons_vaccinated = total_persons_partially_vaccinated + total_persons_fully_vaccinated
    #                                     = total_vaccine_doses_administered - total_persons_fully_vaccinated
    doses = data["total_vaccine_doses_administered"]
    fully = data["total_persons_fully_vaccinated"]
    data["total_persons_vaccinated"] = doses - fully

    # Make sure all records have the country code and match subregion1 only
    data["key"] = None
    data["country_code"] = "CH"
    data["subregion2_code"] = None
    data["locality_code"] = None

    # Country-level records have a known key
    data.loc[data["subregion1_code"] == "CH", "key"] = "CH"

    # Principality of Liechtenstein is not in CH but is in the data as FL
    data.loc[data["subregion1_code"] == "FL", "key"] = "LI"

    # Output the results
    return data
def parse_dataframes(self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame: data = table_rename(dataframes[0], _column_adapter, drop=True) # Convert data to int type for col in data.columns[2:]: data[col] = data[col].apply(safe_int_cast) # Match data with GB subregions data["key"] = None data["country_code"] = "GB" data["subregion2_code"] = None data["locality_code"] = None # East Of England data.loc[data["_location"] == "Total", "key"] = "GB_ENG" data.loc[data["_location"] == "East Of England", "key"] = "GB_UKH" data.loc[data["_location"] == "London", "key"] = "GB_UKI" # data.loc[data["_location"] == "Midlands", "key"] = "" # data.loc[data["_location"] == "North East And Yorkshire", "key"] = "" data.loc[data["_location"] == "North West", "key"] = "GB_UKD" data.loc[data["_location"] == "South East", "key"] = "GB_UKJ" data.loc[data["_location"] == "South West", "key"] = "GB_UKK" return data
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Split the Swiss vaccination export into one- and two-dose person counts."""
    raw = table_rename(
        dataframes["fullyVaccPersons"],
        {
            "date": "date",
            "geoRegion": "subregion1_code",
            "type": "_statistic",
            "entries": "_new_count",
            "sumTotal": "_total_count",
        },
        drop=True,
    )

    # The source packs several indicators into one table keyed by the "type" column;
    # extract each indicator and rename its count columns to the output variables.
    indicator_map = {
        "COVID19AtLeastOneDosePersons": "persons_vaccinated",
        "COVID19FullyVaccPersons": "persons_fully_vaccinated",
    }
    partials = []
    for statistic, variable in indicator_map.items():
        chunk = raw.loc[raw["_statistic"] == statistic].drop(columns=["_statistic"])
        chunk = chunk.rename(
            columns={"_new_count": f"new_{variable}", "_total_count": f"total_{variable}"}
        )
        partials.append(chunk)

    merged = table_merge(partials, on=["date", "subregion1_code"], how="outer")

    # Output the results
    return _output_ch_data(merged)
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the CH country-level spreadsheet whose real header sits on row 5."""
    raw = dataframes[0]
    # Row 5 holds the column names; the actual records start on row 6
    header = raw.iloc[5]
    records = raw.iloc[6:]
    records.columns = header
    adapter = {
        "Datum": "date",
        "Fallzahlen pro Tag": "new_confirmed",
        "Fallzahlen pro Tag, kumuliert": "total_confirmed",
        "Hospitalisationen pro Tag": "new_hospitalized",
        "Hospitalisationen pro Tag, Kumuliert": "total_hospitalized",
        "Todesfälle pro Tag": "new_deceased",
        "Todesfälle pro Tag, kumuliert": "total_deceased",
    }
    records = table_rename(records, adapter, drop=True)
    # Keep only the YYYY-MM-DD prefix of the timestamp
    records["date"] = records["date"].apply(lambda value: str(value)[:10])
    # The key is just the country code
    records["key"] = "CH"
    return records
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Fetch GB upper-tier local authority case counts via the UK Cov19API client."""
    # API query structure: new/cumulative cases by specimen date, per UTLA
    cases = {
        "date": "date",
        "areaCode": "areaCode",
        "newCasesBySpecimenDate": "newCasesBySpecimenDate",
        "cumCasesBySpecimenDate": "cumCasesBySpecimenDate",
    }
    api = Cov19API(filters=["areaType=utla"], structure=cases)
    data = api.get_dataframe()
    # Collapse area codes that map onto the same region, then re-aggregate counts
    data.areaCode = data.areaCode.apply(_apply_area_code_map)
    data = data.groupby(["date", "areaCode"], as_index=False).sum()
    data = table_rename(
        data,
        {
            "areaCode": "subregion2_code",
            "newCasesBySpecimenDate": "new_confirmed",
            "cumCasesBySpecimenDate": "total_confirmed",
            "date": "date",
        },
        drop=True,
    )
    # Normalize dates to ISO format
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))
    return data
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the WHO global daily report into per-country records.

    Source columns: Date_reported Country_code Country WHO_region New_cases
    Cumulative_cases New_deaths Cumulative_deaths
    """
    data = table_rename(
        dataframes[0],
        {
            "Date_reported": "date",
            "Country_code": "key",
            "New_cases": "new_confirmed",
            "Cumulative_cases": "total_confirmed",
            "New_deaths": "new_deceased",
            # FIX: was "total_deceases" (typo) — it did not match the output schema
            # used by the other parsers, so cumulative deaths went unrecognized.
            "Cumulative_deaths": "total_deceased",
        },
        drop=True,
    )

    # Convert date to ISO format (keep only the YYYY-MM-DD prefix)
    data["date"] = data["date"].astype(str).apply(lambda x: x[:10])

    # Adjust the date of the records to match local reporting
    data = _adjust_date(data, aux["metadata"])

    # Remove bogus entries (blank country codes)
    data = data[data["key"].str.strip() != ""]

    # We consider some countries as subregions of other countries
    data.loc[data["key"] == "BL", "key"] = "FR_BL"
    data.loc[data["key"] == "GP", "key"] = "FR_GUA"
    data.loc[data["key"] == "MF", "key"] = "FR_MF"
    data.loc[data["key"] == "PM", "key"] = "FR_PM"

    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse the Finnish ArcGIS feature export into age-binned country records."""
    with open(sources[0], "r") as fd:
        features = json.load(fd)["features"]
    records = [feature["attributes"] for feature in features]
    data = table_rename(
        DataFrame.from_records(records),
        _column_adapter,
        remove_regex=r"[^a-z\s\d]",
        drop=True,
    )

    # Add the age bins: nine decade-wide buckets, the last one open-ended
    labels = ("0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-")
    for idx, label in enumerate(labels):
        data[f"age_bin_{idx:02d}"] = label

    # Timestamps are epoch milliseconds; convert to ISO dates
    data = data.dropna(subset=["date"])
    data.date = data.date.apply(
        lambda ms: datetime.datetime.fromtimestamp(ms // 1000).date().isoformat()
    )

    data["key"] = "FI"
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse CZ line-list case records into region-keyed time series.

    parse_opts must include "column_name": the variable this file's dates belong
    to (e.g. "confirmed" -> the date column becomes "date_confirmed").
    """
    # Rename appropriate columns
    col = parse_opts["column_name"]
    cases = table_rename(dataframes[0], _column_adapter)
    cases = cases.rename(columns={"date": f"date_{col}"})
    cases = _parse_region_codes(cases).dropna(subset=[f"date_{col}"])
    # Rename the sex values (source uses M/Z — presumably muž/žena)
    cases["sex"] = cases["sex"].apply({"M": "male", "Z": "female"}.get)
    # Go from individual case records to key-grouped records in a flat table
    data = convert_cases_to_time_series(
        cases, index_columns=["subregion1_code", "subregion2_code"])
    # Make sure the region codes are strings before parsing them
    data["subregion1_code"] = data["subregion1_code"].astype(str)
    data["subregion2_code"] = data["subregion2_code"].astype(str)
    # Aggregate L2 + L3 data
    data = _aggregate_regions(data, ["date", "subregion1_code", "age", "sex"])
    # Remove bogus values (99 appears to be an "unknown region" placeholder — TODO confirm)
    data = data[data["key"] != "CZ_99"]
    data = data[data["key"] != "CZ_99_99Y"]
    # Convert all dates to ISO format; the source mixes DD.MM.YYYY and YYYY-MM-DD
    data["date"] = (
        data["date"].astype(str).apply(lambda x: datetime_isoformat(
            x, "%d.%m.%Y" if "." in x else "%Y-%m-%d")))
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the CDC national vaccination trends table into US country-level records."""
    column_map = {
        "Location": "key",
        "Date": "date",
        "Date_Type": "_date_type",
        "Administered_Daily": "new_vaccine_doses_administered",
        "Administered_Cumulative": "total_vaccine_doses_administered",
        "Admin_Dose_1_Daily": "new_persons_vaccinated",
        "Admin_Dose_1_Cumulative": "total_persons_vaccinated",
        "Admin_Dose_2_Daily": "new_persons_fully_vaccinated",
        "Admin_Dose_2_Cumulative": "total_persons_fully_vaccinated",
    }
    data = table_rename(
        dataframes["vaccination_trends_data"],
        column_map,
        drop=True,
        remove_regex=r"[^0-9a-z\s]",
    )

    # Keep only the national series reported by administration date
    national = data[data["key"] == "US"]
    national = national[national["_date_type"] == "Admin"]

    # Order chronologically and drop the helper column before returning
    national = national.sort_values("date")
    return national.drop(columns=["_date_type"])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse India district-level totals into match-string records."""
    data = dataframes[0]
    # FIX: removed the unused `states` local (computed from the column set, never read).
    data = table_rename(
        data,
        {
            "Confirmed": "total_confirmed",
            "Deceased": "total_deceased",
            "Recovered": "total_recovered",
            "Tested": "total_tested",
            "Date": "date",
            "District": "match_string",
            "State": "subregion1_name",
        },
        drop=True,
    )
    # Normalize district names, then drop districts known not to match our index
    data.match_string = data.match_string.apply(self._replace_subregion)
    data = data[~data.match_string.isin(L3_INDIA_REMOVE_SET)]
    data["country_code"] = "IN"
    return data
def _get_country(url_tpl: str):
    """Fetch France country-level data and rename columns, skipping testsPositifs."""
    country_data = read_file(url_tpl.format("FRA"))
    country_data["key"] = "FR"
    # For country level, there is no need to estimate confirmed from tests
    adapter = dict(_column_adapter)
    del adapter["testsPositifs"]
    return table_rename(country_data, adapter, drop=True)
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse CO (Colombia) vaccination counts, correcting known region-name typos."""
    data = table_rename(dataframes[0], _column_adapter, drop=True)

    # Dose counts must be integers; drop rows where they are missing
    int_cols = ["total_vaccine_doses_administered"]
    data = data.dropna(subset=int_cols)
    for col in int_cols:
        data[col] = data[col].apply(safe_int_cast)

    # Fix typos and merge subregions manually
    data["match_string"] = data["match_string_2"].fillna(data["match_string_1"])
    corrections = (
        ("Amazionas", "Amazonas"),
        ("Baranquilla", "Atlántico"),
        ("Benaventura", "Valle del Cauca"),
        ("Cartagena", "Bolivar"),
        ("Santa Marta", "Magdalena"),
    )
    for wrong, right in corrections:
        data["match_string"] = data["match_string"].str.replace(wrong, right)

    # Match string does not follow strict hierarchy
    data = data.groupby(["date", "match_string"]).sum().reset_index()

    # Make sure only subregion1 level is matched
    data["country_code"] = "CO"
    data["subregion2_code"] = None
    data["locality_code"] = None
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the CDC per-state daily report into US state-level records."""
    # Unmapped source columns: conf_cases, prob_cases, pnew_case, conf_death,
    # prob_death, pnew_death, created_at, consent_cases, consent_deaths.
    adapter = {
        "submission_date": "date",
        "state": "subregion1_code",
        "tot_cases": "total_confirmed",
        "new_case": "new_confirmed",
        "tot_death": "total_deceased",
        "new_death": "new_deceased",
    }
    data = table_rename(dataframes[0], adapter, drop=True)

    data["key"] = "US_" + data["subregion1_code"]
    data["date"] = data["date"].apply(lambda d: datetime_isoformat(d, "%m/%d/%Y"))

    # A few "states" are considered independent territories by our dataset or need correction
    overrides = {"PW": "PW", "FSM": "FM", "RMI": "MH", "NYC": "US_NY_NYC"}
    for state_code, key in overrides.items():
        data.loc[data["subregion1_code"] == state_code, "key"] = key

    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    """Parse Thailand's country-level JSON report into daily records."""
    with open(sources[0], "r") as fd:
        data = json.load(fd)["Data"]
    # Record shape:
    # "Date":"01\/01\/2020","NewConfirmed":0,"NewRecovered":0,"NewHospitalized":0,"NewDeaths":0,"Confirmed":0,"Recovered":0,"Hospitalized":0,"Deaths":0
    data = table_rename(
        DataFrame.from_records(data),
        {
            "Date": "date",
            "NewConfirmed": "new_confirmed",
            "NewRecovered": "new_recovered",
            "NewHospitalized": "new_hospitalized",
            "NewDeaths": "new_deceased",
            # FIX: the cumulative columns were named with a double underscore
            # ("total__confirmed" etc.), which does not match the output schema
            # (total_confirmed, total_recovered, ...) used by every other parser.
            "Confirmed": "total_confirmed",
            "Recovered": "total_recovered",
            "Hospitalized": "total_hospitalized",
            "Deaths": "total_deceased",
        },
        drop=True,
        remove_regex=r"[^0-9a-z\s]",
    )

    # Format date as ISO date
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

    # Add key and return data
    data["key"] = "TH"
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse CL intensive-care occupancy by region into match-string records."""
    renamed = table_rename(
        dataframes["intensive_care"],
        {"fecha": "date", "numero": "current_intensive_care", "Region": "match_string"},
        drop=True,
    )

    # Dates are already in ISO layout; coerce to plain strings
    renamed["date"] = renamed["date"].astype(str)

    # Extract cities from the regions
    city_records = _extract_cities(renamed)

    # Make sure all records have country code and no subregion code
    renamed["country_code"] = "CL"
    renamed["subregion2_code"] = None

    # Drop bogus records from the data
    renamed.dropna(subset=["date", "match_string"], inplace=True)

    return concat([renamed, city_records])
def _parse_summary(data: DataFrame) -> DataFrame:
    """Reshape the DC summary sheet (statistics as rows, dates as columns) into
    one row per date with one column per output statistic.

    The first raw column is discarded, the next holds the statistic name, and the
    remaining columns are one per date.
    """
    # Drop the first (unused) column and label the statistic-name column
    data = data[data.columns[1:]]
    data.columns = ["statistic"] + list(data.columns[1:])
    # Drop statistics that have no values at all
    data = data.dropna(subset=data.columns[1:], how="all")
    # Melt the date columns into rows: one (statistic, date, value) triple each
    data = pivot_table_date_columns(data.set_index("statistic"), value_name="statistic")
    data = data.reset_index().dropna(subset=["date"])
    # Values arrive as text; coerce to float
    data.statistic = data.statistic.apply(safe_float_cast).astype(float)
    # Pivot back so each statistic becomes a column, indexed by date
    data = data.pivot_table(index="date", columns=["index"], values="statistic")
    data = data.reset_index()
    # Map the source statistic names onto the output schema
    data = table_rename(
        data,
        {
            "date": "date",
            "Total Positives": "total_confirmed",
            "Number of Deaths": "total_deceased",
            "Total Overall Tested": "total_tested",
            "Cleared From Isolation": "total_recovered",
            "Total COVID-19 Patients in DC Hospitals": "total_hospitalized",
            "Total COVID-19 Patients in ICU": "total_intensive_care",
        },
        drop=True,
    )
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the ES deaths table (dates as rows, regions as columns) into long format."""
    deceased = table_rename(dataframes["deceased"], {"FECHA / CCAA": "date"})
    deceased = pivot_table(
        deceased.set_index("date"), value_name="new_deceased", pivot_name="match_string"
    )

    # Convert dates to ISO format: truncate the timestamp, then re-validate it
    deceased["date"] = deceased["date"].apply(lambda value: str(value)[:10])
    deceased["date"] = deceased["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Add the country code to all records and declare matching as subregion1
    deceased["country_code"] = "ES"
    deceased["subregion2_code"] = None
    deceased["locality_code"] = None

    # Country level is declared as "espana"
    deceased["key"] = None
    deceased.loc[deceased["match_string"] == "espana", "key"] = "ES"

    # Output the results
    return deceased.dropna(subset=["date"])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse the Luxembourg (LU) country-level table, cleaning its headers first."""
    # The headers are a bit funny-looking, so we must manually manipulate them first:
    # the real column names live in the first row, before any "|" or "~" suffix.
    raw = dataframes[0]
    raw.columns = [cell.split("|")[0].split("~")[0] for cell in raw.iloc[0]]
    data = raw.iloc[1:]
    data = table_rename(
        data,
        {
            "Date": "date",
            "Nombre de personnes en soins intensifs": "current_intensive_care",
            "Nombre cumulé de décès": "total_deceased",
            "Nombre de personnes testées COVID+": "new_tested",
        },
        drop=True,
    )

    # Get date in ISO format
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Only country-level data is provided
    data["key"] = "LU"

    # Output the results
    return data
def _get_department(record: Dict[str, str]):
    """Fetch and rename FR data for one department, keyed FR_<region>_<department>."""
    region_code = record["subregion1_code"]
    department_code = record["subregion2_code"]
    # The API addresses departments as "DEP-<code>"
    frame = read_file(_api_url_tpl.format(f"DEP-{department_code}"))
    frame["key"] = f"FR_{region_code}_{department_code}"
    return table_rename(frame, _column_adapter, drop=True)
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Generic hospitalization parser; adapter, key and date format come from parse_opts."""
    default_adapter = {
        "discharged_cumulative": "total_discharged",
        "hospitalized_current": "current_hospitalized",
        "number hospitalised": "current_hospitalized",
        "hospitalized_cumulative": "total_hospitalized",
        "icu_current": "current_intensive_care",
        "number in icu": "current_intensive_care",
        "icu_cumulative": "cumulative_intensive_care",
        "ventilator_current": "current_ventilator",
        "ventilator_cumulative": "cumulative_ventilator",
        "new hospital admissions": "new_hospitalized",
        "new intensive care admissions": "new_intensive_care",
    }
    data = table_rename(dataframes[0], parse_opts.get("column_adapter", default_adapter))

    # Add key and parse date in ISO format
    data["key"] = parse_opts.get("key")
    data["date"] = data[parse_opts.get("date_column", "date")].astype(str)
    date_format = parse_opts.get("date_format", "%Y-%m-%d")
    data.date = data.date.apply(lambda x: datetime_isoformat(x, date_format))
    return data
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse UN population projections into per-country population counts.

    parse_opts:
        year (int, optional): projection year to keep. Defaults to 2020,
        preserving the previous hard-coded behavior.
    """
    data = table_rename(
        dataframes[0],
        _column_adapter,
        drop=True,
    )

    # Keep only the requested year (generalized from the hard-coded 2020)
    year = parse_opts.get("year", 2020)
    data = data[data["date"] == year].drop(columns=["date"])

    # We only care about the medium-variant population indicators
    data = data[data["indicator"] == "Medium"]

    # Population counts are in thousands, convert back to single units
    for col in [col for col in data.columns if col.startswith("population")]:
        data[col] = data[col] * 1000

    # Derive key from our country names mapping
    names = aux["un_country_names"]
    data = data.merge(names, how="left")
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse Massachusetts county-level totals into match-string records."""
    county_map = {
        "Date": "date",
        "County": "match_string",
        "Count": "total_confirmed",
        "Deaths": "total_deceased",
    }
    data = table_rename(dataframes["counties"], county_map)

    # Convert date to ISO format
    data["date"] = data["date"].astype(str).apply(
        lambda value: datetime_isoformat(value, "%m/%d/%Y")
    )

    # Drop bogus values
    data = data[data["match_string"] != "Unknown"]

    # Dukes and Nantucket are separate counties but reported as one, so drop them from the data
    data = data[data["match_string"] != "Dukes and Nantucket"]

    data["country_code"] = "US"
    data["subregion1_code"] = "MA"
    return data
def _rename_columns(data: DataFrame, column_adapter: Dict[str, str]) -> DataFrame:
    """Promote the first row to header, clean the names, rename via the adapter and
    return only the adapter's target columns.

    "." cells are treated as missing values and replaced with NaN.
    """
    data.columns = data.iloc[0]
    # Header cells may contain newlines; flatten them into spaces
    data.columns = [str(col).replace("\n", " ") for col in data.columns]
    data = table_rename(data.iloc[1:].replace(".", numpy.nan), column_adapter)
    # FIX: index with a concrete list — pandas column selection expects a list-like
    # of labels, and a raw dict_values view is rejected by some pandas versions.
    return data[list(column_adapter.values())]
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse ES vaccination tables (one frame per report) into country + region records."""
    tables = []
    for df in dataframes.values():
        df = table_rename(df, _column_adapter, drop=True, remove_regex=r"[^a-z]")
        # Make sure the date is a timestamp
        df["date"] = df["date"].apply(safe_datetime_parse)
        df.dropna(subset=["date"], inplace=True)
        # Fill the date when blank
        # NOTE(review): this fillna is a no-op — the dropna above already removed all
        # null dates. It was probably meant to run *before* the dropna; confirm intent.
        df["date"] = df["date"].fillna(df["date"].max())
        df["date"] = df["date"].apply(lambda x: x.date().isoformat())
        # Correct the obvious date typos
        # NOTE(review): assumes the data only covers 2021, so any "2022" in the ISO
        # string is a typo — revisit once genuine 2022 data appears.
        df["date"] = df["date"].apply(lambda x: x.replace("2022", "2021"))
        tables.append(df)
    data = concat(tables)

    # Estimate first doses from total doses and second doses
    data["total_persons_vaccinated"] = (
        data["total_vaccine_doses_administered"] - data["total_persons_fully_vaccinated"]
    )

    # Match at subregion1 level only; the country total row ("Totales") gets a known key
    data["key"] = None
    data["country_code"] = "ES"
    data["subregion2_code"] = None
    data["locality_code"] = None
    data.loc[data["match_string"] == "Totales", "key"] = "ES"
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    """Parse US life-expectancy estimates into county-level (FIPS) records."""
    data = table_rename(
        dataframes[0],
        {"e(0)": "life_expectancy", "STATE2KX": "state_code", "CNTY2KX": "county_code"},
        drop=True,
    )

    # Derive the FIPS subregion code from state and county codes
    data["state_code"] = data["state_code"].apply(
        lambda code: numeric_code_as_string(code, 2)
    )
    data["county_code"] = data["county_code"].apply(
        lambda code: numeric_code_as_string(code, 3)
    )
    data["subregion2_code"] = data["state_code"] + data["county_code"]

    # Data is more granular than county level, use a crude average for estimate
    data = data.drop(columns=["state_code", "county_code"])
    data = data.groupby("subregion2_code").mean().reset_index()

    # Add country code to all records and return
    data["country_code"] = "US"
    return data