def _process_partition(cases: DataFrame) -> DataFrame:
    cases = cases.copy()

    # Extract information about whether doses were first (partial immunization) or second (full)
    cases["date_new_persons_vaccinated"] = None
    cases["date_new_persons_fully_vaccinated"] = None
    first_dose_mask = cases["_dose_information"].str.strip().str.slice(0, 1) == "1"
    second_dose_mask = cases["_dose_information"].str.strip().str.slice(0, 1) == "2"
    cases.loc[first_dose_mask, "date_new_persons_vaccinated"] = cases.loc[
        first_dose_mask, "date_new_vaccine_doses_administered"
    ]
    cases.loc[second_dose_mask, "date_new_persons_fully_vaccinated"] = cases.loc[
        second_dose_mask, "date_new_vaccine_doses_administered"
    ]

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(safe_int_cast).astype(str)

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({"m": "male", "f": "female"}.get)

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "BR"

    # Aggregate data by state
    state = (
        data.drop(columns=["subregion2_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2
    data = data[data["subregion2_code"].notna() & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])
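# A minimal sketch of how a partition processor like _process_partition might be driven over a
# large case-line file. This is an assumption for illustration only: the chunk size, the helper
# name and the source path are hypothetical and not part of this module.
def _parse_in_chunks_sketch(file_path: str) -> DataFrame:
    from pandas import read_csv

    partitions = []
    # Read the (potentially very large) case-line file in chunks so each partition fits in
    # memory, process each chunk independently, then concatenate the per-chunk time series.
    for chunk in read_csv(file_path, chunksize=256_000):
        partitions.append(_process_partition(chunk))
    return concat(partitions)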
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename appropriate columns
    col = parse_opts["column_name"]
    cases = table_rename(dataframes[0], _column_adapter)
    cases = cases.rename(columns={"date": f"date_{col}"})
    cases = _parse_region_codes(cases).dropna(subset=[f"date_{col}"])

    # Rename the sex values
    cases["sex"] = cases["sex"].apply({"M": "male", "Z": "female"}.get)

    # Go from individual case records to key-grouped records in a flat table
    data = convert_cases_to_time_series(cases, index_columns=["subregion1_code", "subregion2_code"])

    # Make sure the region codes are strings before parsing them
    data["subregion1_code"] = data["subregion1_code"].astype(str)
    data["subregion2_code"] = data["subregion2_code"].astype(str)

    # Aggregate L2 + L3 data
    data = _aggregate_regions(data, ["date", "subregion1_code", "age", "sex"])

    # Remove bogus values
    data = data[data["key"] != "CZ_99"]
    data = data[data["key"] != "CZ_99_99Y"]

    # Convert all dates to ISO format
    data["date"] = (
        data["date"]
        .astype(str)
        .apply(lambda x: datetime_isoformat(x, "%d.%m.%Y" if "." in x else "%Y-%m-%d"))
    )

    return data
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    tables = [df for name, df in dataframes.items() if name != "geo"]
    column_adapter = dict(_column_adapter, state="idxs", date="date_new_confirmed")
    data = table_rename(concat(tables), column_adapter=column_adapter, drop=True)

    # Correct data types where necessary
    data["idxs"] = data["idxs"].astype(str)
    data["age"] = data["age"].apply(lambda x: None if x < 0 else x)
    data["sex"] = data["sex"].apply({0: "female", 1: "male"}.get)

    # Convert to our preferred time series format
    data = convert_cases_to_time_series(data, ["idxs"])

    # Geo name lookup
    geo_col_adapter = {"state": "subregion1_name", "district": "subregion2_name"}
    geo = table_rename(dataframes["geo"], geo_col_adapter, drop=False)
    geo["idxs"] = geo["idxs"].astype(str)
    geo["subregion1_name"] = geo["subregion1_name"].str.replace("W.P. ", "")
    geo = geo.groupby(["subregion1_name", "idxs"]).first().reset_index()
    data = table_merge([data, geo], on=["idxs"], how="inner")

    # Since only the cases have district level data, ignore it
    data["country_code"] = "MY"
    data["subregion2_name"] = None

    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(dataframes[0], _srag_column_adapter, drop=True)
    covid_mask = cases["_classification"] == 5
    valid_mask = cases["_prognosis"].notna() & (cases["_prognosis"] != 9)
    cases = cases[covid_mask & valid_mask]

    # Record the date of death
    cases["date_new_deceased"] = None
    deceased_mask = cases["_prognosis"] == 2
    cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_date_prognosis"]

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].apply({"M": "male", "F": "female"}.get)

    # Convert all dates to ISO format
    for col in filter(lambda x: x.startswith("date"), cases.columns):
        cases[col] = cases[col].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Parse subregion codes
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 5)
    )

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion2_code"])
    data["country_code"] = "BR"

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate by country level
    country = (
        data.drop(columns=["subregion2_code"]).groupby(["date", "age", "sex"]).sum().reset_index()
    )
    country["key"] = "BR"

    # Aggregate by state level
    data["subregion1_code"] = data["subregion2_code"].apply(
        lambda x: _IBGE_STATES.get(safe_int_cast(x[:2]))
    )
    state = (
        data.drop(columns=["subregion2_code"])
        .dropna(subset=["subregion1_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # Derive the key from subregion codes
    data = data[data["subregion2_code"].notna()]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])
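# safe_int_cast is used throughout these parsers but defined elsewhere in the codebase. The
# sketch below is only an assumption of its behavior, for readers of this excerpt: convert a
# value to int when possible and return None instead of raising otherwise.
def _safe_int_cast_sketch(value):
    try:
        return int(float(value))
    except (TypeError, ValueError):
        return None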
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename appropriate columns
    cases = table_rename(
        dataframes[0],
        {
            "Código DIVIPOLA municipio": "subregion2_code",
            "Fecha de notificación": "_date_notified",
            "Fecha de muerte": "date_new_deceased",
            "Fecha de diagnóstico": "date_new_confirmed",
            "Fecha de recuperación": "date_new_recovered",
            "edad": "age",
            "sexo": "sex",
            "Pertenencia etnica": "ethnicity",
        },
    )

    # Fall back to notification date when no confirmed date is available
    cases["date_new_confirmed"] = cases["date_new_confirmed"].fillna(cases["_date_notified"])

    # Clean up the subregion code
    cases.subregion2_code = cases.subregion2_code.apply(lambda x: "{0:05d}".format(int(x)))

    # Compute the key from the DIVIPOLA code
    cases["key"] = (
        "CO_" + cases.subregion2_code.apply(lambda x: x[:2]) + "_" + cases.subregion2_code
    )

    # A few cases are at the l2 level
    cases["key"] = cases["key"].apply(lambda x: "CO_" + x[-2:] if x.startswith("CO_00_") else x)

    # Go from individual case records to key-grouped records in a flat table
    index_columns = ["key", "date", "sex", "age"]
    value_columns = ["new_confirmed", "new_deceased", "new_recovered"]
    data = convert_cases_to_time_series(cases)

    # Parse dates to ISO format.
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x.split(" ")[0], "%d/%m/%Y"))
    data.dropna(subset=["date"], inplace=True)

    # Group by level 1 region, and add the parts
    l1 = data.copy()
    l1["key"] = l1.key.apply(lambda x: "_".join(x.split("_")[:2]))
    l1 = l1.groupby(index_columns).sum().reset_index()

    # Group by country level
    country = l1.drop(columns=["key"]).groupby(index_columns[1:]).sum().reset_index()
    country["key"] = "CO"

    return concat([data, l1, country])[index_columns + value_columns]
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename appropriate columns
    data = table_rename(
        dataframes[0],
        {
            "codigo divipola": "subregion2_code",
            "fecha de muerte": "date_new_deceased",
            "fecha diagnostico": "date_new_confirmed",
            "fecha recuperado": "date_new_recovered",
            "edad": "age",
            "sexo": "sex",
            "Pertenencia etnica": "ethnicity",
        },
    )

    # Clean up the subregion code
    data.subregion2_code = data.subregion2_code.apply(lambda x: "{0:05d}".format(int(x)))

    # Compute the key from the DIVIPOLA code
    data["key"] = "CO_" + data.subregion2_code.apply(lambda x: x[:2]) + "_" + data.subregion2_code

    # A few cases are at the l2 level
    data["key"] = data["key"].apply(lambda x: "CO_" + x[-2:] if x.startswith("CO_00_") else x)

    # Go from individual case records to key-grouped records in a flat table
    index_columns = ["key", "date", "sex", "age"]
    value_columns = ["new_confirmed", "new_deceased", "new_recovered"]
    merged = convert_cases_to_time_series(data)

    # Some dates are badly formatted as 31/12/1899 in the raw data, so we can drop these
    merged = merged[merged["date"] != datetime(1899, 12, 31)].dropna(subset=["date"])

    # Parse dates to ISO format.
    merged["date"] = merged["date"].apply(safe_datetime_parse)
    merged["date"] = merged["date"].apply(lambda x: x.date().isoformat())

    # Group by level 2 region, and add the parts
    l2 = merged.copy()
    l2["key"] = l2.key.apply(lambda x: "_".join(x.split("_")[:2]))
    l2 = l2.groupby(index_columns).sum().reset_index()

    # Group by country level, and add the parts
    l1 = l2.copy().drop(columns=["key"])
    l1 = l1.groupby(index_columns[1:]).sum().reset_index()
    l1["key"] = "CO"

    return concat([merged, l1, l2])[index_columns + value_columns]
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(
        dataframes[0],
        {
            # "no": "",
            "age": "age",
            "sex": "sex",
            # "nationality": "",
            # "province_of_isolation": "",
            # "notification_date": "date",
            "announce_date": "date_new_confirmed",
            "province_of_onset": "match_string",
            # "district_of_onset": "subregion2_name",
            # "quarantine": "",
        },
        drop=True,
        remove_regex=r"[^0-9a-z\s]",
    )

    # Convert date to ISO format
    cases["date_new_confirmed"] = cases["date_new_confirmed"].str.slice(0, 10)

    # Some dates are not properly parsed, so fix those manually
    for col in (col for col in cases.columns if col.startswith("date_")):
        cases[col] = cases[col].str.replace("1963-", "2020-")
        cases[col] = cases[col].str.replace("2563-", "2020-")
        cases[col] = cases[col].str.replace("15/15/2020", "2020-12-15")
        cases[col] = cases[col].str.replace("15/15/2021", "2020-12-15")

    # Translate sex labels; only male, female and unknown are given
    sex_adapter = lambda x: {"ชาย": "male", "หญิง": "female"}.get(x, "sex_unknown")
    cases["sex"] = cases["sex"].apply(sex_adapter)

    # Convert from cases to time-series format
    data = convert_cases_to_time_series(cases, ["match_string"])

    # Aggregate country-level data by adding all provinces
    country = (
        data.drop(columns=["match_string"]).groupby(["date", "age", "sex"]).sum().reset_index()
    )
    country["key"] = "TH"

    # Drop bogus records from the data
    data = data[data["match_string"].notna() & (data["match_string"] != "")]

    return concat([country, data])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    date_col = "date_new_confirmed"
    cases = table_rename(
        dataframes[0], {"cdc_report_dt": date_col, "sex": "sex", "age_group": "age"}, drop=True
    )
    cases["key"] = "US"
    cases["sex"] = cases["sex"].apply(lambda x: x.lower() if not isna(x) else None)
    cases["age"] = cases["age"].apply(
        lambda x: "-".join(x.replace(" Years", "").split(" - ")) if not isna(x) else None
    )
    cases[date_col] = cases[date_col].apply(lambda x: datetime_isoformat(x, "%Y/%m/%d"))

    if parse_opts["column"] == "age":
        data = convert_cases_to_time_series(cases.drop(columns=["sex"]))
    elif parse_opts["column"] == "sex":
        data = convert_cases_to_time_series(cases.drop(columns=["age"]))
    else:
        raise ValueError(f'Unknown column {parse_opts["column"]}')

    return data
def test_convert_cases_to_time_series_null_deaths(self):
    cases = read_csv(StringIO(CASE_LINE_DATA_NULL_DEATHS))
    table = convert_cases_to_time_series(cases)

    # There should be as many records as there are combinations of <key,age,sex,ethnicity,date>
    self.assertEqual(len(cases), len(table))

    # All lines in our test case indicate a confirmed case
    self.assertEqual(len(cases), table["new_confirmed"].sum())

    # No lines in our test case indicate a deceased case
    self.assertEqual(0, table["new_deceased"].sum())

    # Half of our cases are male, and the other half are female
    self.assertEqual(len(table[table.sex == "male"]), len(table[table.sex == "female"]))
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    with open(sources[0], "r") as fd:
        records = json.load(fd)["features"]
    cases = DataFrame.from_records(records)
    cases["date_new_confirmed"] = cases["ChartDate"].apply(
        lambda x: fromtimestamp(x // 1000).date().isoformat()
    )

    # FL does not provide date for deceased or hospitalized, so we just copy it from confirmed
    deceased_mask = cases.Died == "Yes"
    hospitalized_mask = cases.Hospitalized == "YES"
    cases["date_new_deceased"] = None
    cases["date_new_hospitalized"] = None
    cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "date_new_confirmed"]
    cases.loc[hospitalized_mask, "date_new_hospitalized"] = cases.loc[
        hospitalized_mask, "date_new_confirmed"
    ]

    # Rename the sex labels
    sex_adapter = lambda x: {"male": "male", "female": "female"}.get(x, "sex_unknown")
    cases["sex"] = cases["Gender"].str.lower().apply(sex_adapter)
    cases.drop(columns=["Gender"], inplace=True)

    # Make sure age is an integer
    cases["age"] = cases["Age"].apply(safe_int_cast)
    cases.drop(columns=["Age"], inplace=True)

    cases = cases.rename(columns={"County": "match_string"})
    data = convert_cases_to_time_series(cases, ["match_string"])
    data["country_code"] = "US"
    data["subregion1_code"] = "FL"

    # Aggregate to state level here, since some data locations are "Unknown"
    group_cols = ["country_code", "subregion1_code", "date", "age", "sex"]
    state = data.drop(columns=["match_string"]).groupby(group_cols).sum().reset_index()

    # Remove bogus data
    data = data[data.match_string != "Unknown"]

    return concat([state, data])
def parse_dataframes(
    self, dataframes: Dict[Any, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    column_adapter = dict(_column_adapter, date="date_new_deceased")
    data = table_rename(dataframes[0], column_adapter=column_adapter, drop=True)
    data["sex"] = data["sex"].apply({0: "female", 1: "male"}.get)
    data["age"] = data["age"].apply(lambda x: None if x < 0 else x)
    data["subregion1_name"] = data["subregion1_name"].str.replace("W.P. ", "")
    data = convert_cases_to_time_series(data, ["subregion1_name"])

    # Remove records with no location
    data = data.dropna(subset=["subregion1_name"])

    data["country_code"] = "MY"
    data["subregion2_code"] = None
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(
        dataframes[0],
        {
            # "Case no.": "",
            "Report date": "_date",
            # "Date of onset": "date_onset",
            "Gender": "sex",
            "Age": "age",
            # "Name of hospital admitted": "",
            "Hospitalised/Discharged/Deceased": "_status",
            # "HK/Non-HK resident": "",
            # "Case classification*": "",
            # "Confirmed/probable": "",
        },
        drop=True,
        remove_regex=r"[^0-9a-z\s]",
    )

    # Convert date to ISO format
    cases["_date"] = cases["_date"].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # All cases in the data are confirmed (or probable)
    cases["date_new_confirmed"] = cases["_date"]

    # Use confirmed date as estimate for deceased date
    cases["date_new_deceased"] = None
    deceased_mask = cases["_status"] == "Deceased"
    cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_date"]

    # Use confirmed date as estimate for hospitalization date
    cases["date_new_hospitalized"] = None
    hosp_mask = (cases["_status"] == "Discharged") | (cases["_status"] == "Hospitalized")
    cases.loc[hosp_mask, "date_new_hospitalized"] = cases.loc[hosp_mask, "_date"]

    # Translate sex labels; only male, female and unknown are given
    sex_adapter = lambda x: {"M": "male", "F": "female"}.get(x, "sex_unknown")
    cases["sex"] = cases["sex"].apply(sex_adapter)

    cases["key"] = "HK"
    return convert_cases_to_time_series(cases, ["key"])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(
        dataframes[0],
        {
            "Report Date": "date_new_confirmed",
            # "Case Status": "_status",
            "Sex": "sex",
            "Age group": "age",
            # "DHB": "",
            # "Overseas travel": "",
        },
        drop=True,
    )
    cases["key"] = "NZ"
    cases["age"] = cases["age"].str.slice(0, 2).str.replace(" ", "").apply(safe_int_cast)
    data = convert_cases_to_time_series(cases, ["key"])
    return data
def test_convert_cases_to_time_series_simple(self):
    cases = read_csv(StringIO(CASE_LINE_DATA_SIMPLE))
    table = convert_cases_to_time_series(cases)
    confirmed = table[table.new_confirmed > 0]

    # There should be as many records as there are combinations of <key,age,sex,ethnicity,date>
    self.assertEqual(len(cases) * 2, len(table))

    # All lines in our test case indicate a confirmed case
    self.assertEqual(len(cases), table["new_confirmed"].sum())

    # All lines in our test case indicate a deceased case
    self.assertEqual(len(cases), table["new_deceased"].sum())

    # Half of our cases are male, and the other half are female
    self.assertEqual(len(table[table.sex == "male"]), len(table[table.sex == "female"]))

    # 2 cases are 10-19 and 6 are 20-29
    self.assertEqual(2, len(confirmed[confirmed.age == "10-19"]))
    self.assertEqual(6, len(confirmed[confirmed.age == "20-29"]))
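# The tests above exercise convert_cases_to_time_series, which is defined elsewhere in the
# codebase. The sketch below is only an assumption of its core behavior for readers of this
# excerpt: each "date_new_*" column in the case-line table is melted into (date, new_*) pairs,
# then counts are grouped by the index columns plus date and the demographic strata. The real
# helper also buckets numeric ages and fills unknown strata (e.g. "sex_unknown"), which this
# simplified sketch omits.
def _convert_cases_sketch(cases, index_columns=None):
    from pandas import concat

    index_columns = list(index_columns or [])
    strata = [col for col in ("age", "sex", "ethnicity") if col in cases.columns]
    parts = []
    for col in (c for c in cases.columns if c.startswith("date_")):
        statistic = col[len("date_"):]  # e.g. "new_confirmed"
        subset = cases[index_columns + strata + [col]].rename(columns={col: "date"})
        subset = subset.dropna(subset=["date"])
        subset[statistic] = 1  # each case line contributes one count on its event date
        parts.append(subset)
    return concat(parts).groupby(index_columns + ["date"] + strata).sum().reset_index()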
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    with open(sources[0], "r") as fd:
        cases = json.load(fd)["Data"]

    # {"ConfirmDate":"2021-01-09 00:00:00","No":"9876","Age":66,"Gender":"\u0e0a","GenderEn":"Male","Nation":"Thailand","NationEn":"Thailand","Province":"\u0e2d","ProvinceId":72,"District":"\u0e44","ProvinceEn":"Ang Thong","Detail":null,"StatQuarantine":1}
    cases = table_rename(
        DataFrame.from_records(cases),
        {
            "ConfirmDate": "date_new_confirmed",
            "Age": "age",
            "GenderEn": "sex",
            "ProvinceEn": "match_string",
        },
        drop=True,
    )

    # Convert dates to ISO format
    for col in cases.columns:
        if col.startswith("date_"):
            cases[col] = cases[col].str.slice(0, 10)

    # Parse age and sex fields
    cases["sex"] = cases["sex"].str.lower().apply({"male": "male", "female": "female"}.get)
    cases["age"] = cases["age"].fillna("age_unknown")
    cases["sex"] = cases["sex"].fillna("sex_unknown")

    # Convert to time series data
    data = convert_cases_to_time_series(cases, ["match_string"])

    # Aggregate by country level
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    country["key"] = "TH"

    # Add country code and return data
    data["country_code"] = "TH"
    data = data[data["match_string"] != "Unknown"]
    return concat([country, data])
def test_convert_cases_to_time_series_age_float_values(self):
    cases = read_csv(StringIO(CASE_LINE_DATA_AGE_FLOATS))
    table = convert_cases_to_time_series(cases)
    self.assertSetEqual({"20-29"}, set(table.age))

def test_convert_cases_to_time_series_null_values(self):
    cases = read_csv(StringIO(CASE_LINE_DATA_NULL_VALUES))
    table = convert_cases_to_time_series(cases)
    self.assertSetEqual({"age_unknown"}, set(table.age))
    self.assertSetEqual({"sex_unknown"}, set(table.sex))
    self.assertSetEqual({"ethnicity_unknown"}, set(table.ethnicity))

def test_convert_cases_to_time_series_other_values(self):
    cases = read_csv(StringIO(CASE_LINE_DATA_OTHER))
    table = convert_cases_to_time_series(cases)
    self.assertSetEqual({"sex_other"}, set(table.sex))
    self.assertSetEqual({"ethnicity_other"}, set(table.ethnicity))
def _process_partition(cases: DataFrame) -> DataFrame:
    cases = cases.copy()

    # Confirmed cases are only those with a confirmed positive test result
    cases["date_new_confirmed"] = None
    confirmed_mask = cases["_test_result"] == "Positivo"
    cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[confirmed_mask, "date_new_tested"]

    # Do not process deceased counts, since they are considered highly inaccurate
    # # Deceased cases have a specific label and the date is the "closing" date
    # cases["date_new_deceased"] = None
    # deceased_mask = cases["_prognosis"] == "Óbito"
    # cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_date_update"]
    # # Only count deceased cases from confirmed subjects
    # cases.loc[~confirmed_mask, "date_new_deceased"] = None

    # Recovered cases have a specific label and the date is the "closing" date
    cases["date_new_recovered"] = None
    recovered_mask = cases["_prognosis"] == "Cured"
    cases.loc[recovered_mask, "date_new_recovered"] = cases.loc[recovered_mask, "_date_update"]

    # Only count recovered cases from confirmed subjects
    cases.loc[~confirmed_mask, "date_new_recovered"] = None

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region codes are of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(safe_int_cast)

    # The last digit of the region code is actually not necessary
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: None if isna(x) else str(int(x))[:-1]
    )

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply({"masculino": "male", "feminino": "female"}.get)

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate data by country
    country = (
        data.drop(columns=["subregion1_code", "subregion2_code"])
        .groupby(["date", "age", "sex"])
        .sum()
        .reset_index()
    )
    country["key"] = "BR"

    # Aggregate data by state
    state = (
        data.drop(columns=["subregion2_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # We can derive the key from subregion1 + subregion2
    data = data[data["subregion2_code"].notna() & (data["subregion2_code"] != "")]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(
        dataframes[0],
        {
            # "FECHA_ACTUALIZACION": "",
            # "ID_REGISTRO": "",
            # "ORIGEN": "",
            # "SECTOR": "",
            # "ENTIDAD_UM": "",
            "SEXO": "sex",
            # "ENTIDAD_NAC": "",
            "ENTIDAD_RES": "subregion1_code",
            "MUNICIPIO_RES": "subregion2_code",
            "TIPO_PACIENTE": "_type",
            "FECHA_INGRESO": "date_new_confirmed",
            # "FECHA_SINTOMAS": "",
            "FECHA_DEF": "date_new_deceased",
            # "INTUBADO": "",
            # "NEUMONIA": "",
            "EDAD": "age",
            # "NACIONALIDAD": "",
            # "EMBARAZO": "",
            # "HABLA_LENGUA_INDIG": "",
            # "DIABETES": "",
            # "EPOC": "",
            # "ASMA": "",
            # "INMUSUPR": "",
            # "HIPERTENSION": "",
            # "OTRA_COM": "",
            # "CARDIOVASCULAR": "",
            # "OBESIDAD": "",
            # "RENAL_CRONICA": "",
            # "TABAQUISMO": "",
            # "OTRO_CASO": "",
            "RESULTADO": "_diagnosis",
            # "MIGRANTE": "",
            # "PAIS_NACIONALIDAD": "",
            # "PAIS_ORIGEN": "",
            "UCI": "_intensive_care",
        },
        drop=True,
    )

    # Null dates are coded as 9999-99-99
    for col in cases.columns:
        if col.startswith("date_"):
            cases.loc[cases[col] == "9999-99-99", col] = None

    # Discard all cases with negative test result
    cases = cases[cases["_diagnosis"] == 1]

    # Type 1 is normal, type 2 is hospitalized
    cases["date_new_hospitalized"] = None
    hospitalized_mask = cases["_type"] == 2
    cases.loc[hospitalized_mask, "date_new_hospitalized"] = cases.loc[
        hospitalized_mask, "date_new_confirmed"
    ]

    # Parse region codes as strings
    cases["subregion1_code"] = cases["subregion1_code"].apply(
        lambda x: numeric_code_as_string(x, 2)
    )
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 3)
    )

    # Convert case line data to our time series format
    data = convert_cases_to_time_series(cases, ["subregion1_code", "subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].astype(str)

    # Unknown region codes are defined as "99+" instead of null
    data.loc[data["subregion1_code"] == "99", "subregion1_code"] = None
    data.loc[data["subregion2_code"] == "999", "subregion2_code"] = None

    # The subregion2 codes need to be composed
    invalid_region_mask = data["subregion1_code"].isna() | data["subregion2_code"].isna()
    data.loc[~invalid_region_mask, "subregion2_code"] = (
        data.loc[~invalid_region_mask, "subregion1_code"]
        + data.loc[~invalid_region_mask, "subregion2_code"]
    )

    # Use proper ISO codes for the subregion1 level
    data["subregion1_code"] = data["subregion1_code"].apply(_SUBREGION1_CODE_MAP.get)

    # Translate sex labels; only male, female and unknown are given
    data["sex"] = data["sex"].apply(lambda x: {"hombre": "male", "mujer": "female"}.get(x.lower()))

    # Aggregate state-level data by adding all municipalities
    state = data.drop(columns=["subregion2_code"]).groupby(["date", "subregion1_code"]).sum()
    state.reset_index(inplace=True)
    state["key"] = "MX_" + state["subregion1_code"]

    # Extract cities from the municipalities
    city = _extract_cities(data)

    # Country level is called "TOTAL" as a subregion1_code
    country_mask = data["subregion1_code"] == "TOTAL"
    country = data[country_mask].copy()
    country["key"] = "MX"

    # We can build the key for the data directly from the subregion codes
    data["key"] = "MX_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    # Drop bogus records from the data
    data = data[~country_mask]
    state.dropna(subset=["subregion1_code"], inplace=True)
    data.dropna(subset=["subregion1_code", "subregion2_code"], inplace=True)

    return concat([country, state, data, city])
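# numeric_code_as_string is a shared helper used by several parsers above but defined elsewhere.
# The sketch below is only an assumption of its behavior for readers of this excerpt: zero-pad a
# numeric region code to a fixed width, returning None when the value cannot be parsed.
def _numeric_code_as_string_sketch(value, width):
    try:
        return str(int(float(value))).zfill(width)
    except (TypeError, ValueError):
        return None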
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename appropriate columns
    cases = table_rename(
        dataframes[0],
        {
            "ProvRes": "province",
            "RegionRes": "region",
            "CityMuniPSGC": "city",
            "DateDied": "date_new_deceased",
            "DateSpecimen": "date_new_confirmed",
            "DateRecover": "date_new_recovered",
            "daterepconf": "_date_estimate",
            "admitted": "_hospitalized",
            "removaltype": "_prognosis",
            "Age": "age",
            "Sex": "sex",
        },
        drop=True,
    )

    # When there is a case, but missing confirmed date, estimate it
    cases["date_new_confirmed"] = cases["date_new_confirmed"].fillna(cases["_date_estimate"])

    # When there is a recovered removal, but missing recovery date, estimate it
    nan_recovered_mask = cases.date_new_recovered.isna() & (cases["_prognosis"] == "Recovered")
    cases.loc[nan_recovered_mask, "date_new_recovered"] = cases.loc[
        nan_recovered_mask, "_date_estimate"
    ]

    # When there is a deceased removal, but missing deceased date, estimate it
    nan_deceased_mask = cases.date_new_deceased.isna() & (cases["_prognosis"] == "Died")
    cases.loc[nan_deceased_mask, "date_new_deceased"] = cases.loc[
        nan_deceased_mask, "_date_estimate"
    ]

    # Hospitalized is estimated as the same date as confirmed if admitted == yes
    cases["date_new_hospitalized"] = None
    hospitalized_mask = cases["_hospitalized"].str.lower() == "yes"
    cases.loc[hospitalized_mask, "date_new_hospitalized"] = cases.loc[
        hospitalized_mask, "date_new_confirmed"
    ]

    # Rename the sex values
    cases["sex"] = cases["sex"].apply({"MALE": "male", "FEMALE": "female"}.get)

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # NCR cases are broken down by city, not by province
    ncr_prov_mask = cases["region"] == "NCR"
    cases.loc[ncr_prov_mask, "province"] = cases.loc[ncr_prov_mask, "city"].str.slice(2, -3)
    cases.drop(columns=["city"], inplace=True)

    # Go from individual case records to key-grouped records in a flat table
    data = convert_cases_to_time_series(cases, index_columns=["province", "region"])

    # Convert date to ISO format
    data["date"] = data["date"].apply(safe_datetime_parse)
    data = data[~data["date"].isna()]
    data["date"] = data["date"].apply(lambda x: x.date().isoformat())

    # Null values are known to be zero, since we have case-line data
    data = data.fillna(0)

    # Aggregate country level directly from base data
    country = (
        data.drop(columns=["province", "region"]).groupby(["date", "age", "sex"]).sum().reset_index()
    )
    country["key"] = "PH"

    # Aggregate regions and provinces separately
    l3 = data.rename(columns={"province": "match_string"})
    l2 = data.rename(columns={"region": "match_string"})
    l2["match_string"] = l2["match_string"].apply(lambda x: x.split(": ")[-1])

    # Ensure matching by flagging whether a record must be L2 or L3
    l3["subregion2_code"] = ""
    l2["subregion2_code"] = None
    l3["locality_code"] = None
    l2["locality_code"] = None
    data = concat([l2, l3]).dropna(subset=["match_string"])
    data["country_code"] = "PH"

    # Remove bogus records
    data = data[data["match_string"].notna()]
    data = data[data["match_string"] != ""]
    data = data[data["match_string"] != "REPATRIATE"]
    data = data[data["match_string"] != "CITY OF ISABELA (NOT A PROVINCE)"]
    data = data[data["match_string"] != "COTABATO CITY (NOT A PROVINCE)"]

    return concat([country, data])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(
        concat(dataframes.values()),
        {
            # "Patient Number": "",
            # "State Patient Number": "",
            "Date Announced": "date_new_confirmed",
            # "Estimated Onset Date": "",
            "Age Bracket": "age",
            "Gender": "sex",
            # "Detected City": "",
            "Detected District": "subregion2_name",
            "Detected State": "subregion1_name",
            # "State code": "subregion1_code",
            "Current Status": "_prognosis",
            # "Notes": "",
            # "Contracted from which Patient (Suspected)": "",
            # "Nationality": "",
            # "Type of transmission": "",
            "Status Change Date": "_change_date",
            # "Source_1": "",
            # "Source_2": "",
            # "Source_3": "",
            # "Backup Notes": "",
            "Num Cases": "new_confirmed",
            # "Entry_ID": "",
        },
        drop=True,
    )

    # Convert dates to ISO format
    for col in [col for col in cases.columns if "date" in col]:
        cases[col] = cases[col].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    cases["age"] = cases["age"].astype(str)
    cases["age"] = cases["age"].str.lower()
    cases["age"] = cases["age"].str.replace(r"\.0", "")
    cases["age"] = cases["age"].str.replace(r"[\d\.]+ day(s)?", "1")
    cases["age"] = cases["age"].str.replace(r"[\d\.]+ month(s)?", "1")
    cases.loc[cases["age"].str.contains("-"), "age"] = None

    sex_adapter = lambda x: {"M": "male", "F": "female"}.get(x, "sex_unknown")
    cases["sex"] = cases["sex"].str.strip()
    cases["sex"] = cases["sex"].apply(sex_adapter)

    cases["date_new_deceased"] = None
    deceased_mask = cases["_prognosis"] == "Deceased"
    cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_change_date"]

    cases["date_new_hospitalized"] = None
    hosp_mask = cases["_prognosis"] == "Hospitalized"
    cases.loc[hosp_mask, "date_new_hospitalized"] = cases.loc[hosp_mask, "_change_date"]

    data = convert_cases_to_time_series(cases, ["subregion1_name", "subregion2_name"])
    data["country_code"] = "IN"

    # Aggregate country level and admin level 1
    country = aggregate_admin_level(data, ["date", "age", "sex"], "country")
    subregion1 = aggregate_admin_level(data, ["date", "age", "sex"], "subregion1")
    subregion1 = subregion1[subregion1["subregion1_name"].str.lower() != "state unassigned"]

    # Data for admin level 2 is too noisy and there are many mismatches, so we only return
    # the aggregated country level and admin level 1 data
    return concat([country, subregion1])
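# aggregate_admin_level is a shared helper used by several parsers above but defined elsewhere.
# The sketch below is only an assumption of its behavior, for readers of this excerpt: drop the
# location columns below the requested level and sum the remaining value columns, keeping the
# subregion1 identifiers when aggregating to admin level 1. Column names are assumed.
def _aggregate_admin_level_sketch(data, group_columns, level="country"):
    location_columns = [
        col for col in data.columns
        if col.startswith("subregion") or col in ("match_string", "locality_code")
    ]
    if level == "subregion1":
        # Keep admin level 1 identifiers so the grouping preserves them (assumption)
        keep = [col for col in location_columns if col.startswith("subregion1")]
        location_columns = [col for col in location_columns if col not in keep]
        group_columns = keep + list(group_columns)
    return data.drop(columns=location_columns).groupby(group_columns).sum().reset_index()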
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases_confirmed = table_rename(dataframes["confirmed"], _column_adapter, drop=True).rename(
        columns={"date": "date_new_confirmed"}
    )
    cases_deceased = table_rename(dataframes["deceased"], _column_adapter, drop=True).rename(
        columns={"date": "date_new_deceased"}
    )

    # Translate sex label
    for df in (cases_confirmed, cases_deceased):
        df["sex"] = df["sex"].apply({"MASCULINO": "male", "FEMENINO": "female"}.get)

    # Convert to time series
    index_columns = ["subregion1_name", "province_name", "subregion2_name"]
    data_confirmed = convert_cases_to_time_series(cases_confirmed, index_columns)
    data_deceased = convert_cases_to_time_series(cases_deceased, index_columns)

    # Join into a single dataset
    data = table_multimerge([data_confirmed, data_deceased], how="outer")

    # Remove bogus records
    data.dropna(subset=["date"], inplace=True)

    # Set country code and get date in ISO format
    data["country_code"] = "PE"
    data["date"] = data["date"].apply(safe_int_cast)
    data["date"] = data["date"].apply(safe_str_cast)
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y%m%d"))

    # Properly capitalize department to allow for exact matching
    data["subregion1_name"] = data["subregion1_name"].apply(
        lambda x: _department_map.get(x, x.title())
    )

    # Aggregate by admin level 1
    subregion1 = (
        data.drop(columns=["subregion2_name", "province_name"])
        .groupby(["date", "country_code", "subregion1_name", "age", "sex"])
        .sum()
        .reset_index()
    )
    subregion1["subregion2_name"] = None

    # Try to match based on subregion2_name using fuzzy matching, and set subregion2_name to
    # an empty string to turn off exact matching
    data = data.rename(columns={"subregion2_name": "match_string"})
    data["subregion2_name"] = ""

    # Convert other text fields to lowercase for consistent processing
    data["match_string"] = data["match_string"].apply(fuzzy_text)
    data["province_name"] = data["province_name"].apply(fuzzy_text)

    # Drop bogus records
    data = data[~data["match_string"].isna()]
    data = data[~data["match_string"].isin(["", "eninvestigacion", "extranjero"])]

    # Because we are skipping provinces and going directly from region to district, there are
    # some name collisions which we have to disambiguate manually
    for province1, province2, district in [
        ("lima", "canete", "sanluis"),
        ("lima", "yauyos", "miraflores"),
        ("ica", "chincha", "pueblonuevo"),
        ("canete", "huarochiri", "sanantonio"),
        ("bolognesi", "huaylas", "huallanca"),
        ("lucanas", "huancasancos", "sancos"),
        ("santacruz", "cutervo", "santacruz"),
        ("yauli", "jauja", "yauli"),
        ("yauli", "jauja", "paccha"),
        ("huarochiri", "yauyos", "laraos"),
        ("elcollao", "melgar", "santarosa"),
    ]:
        for province in (province1, province2):
            mask = (data["province_name"] == province) & (data["match_string"] == district)
            data.loc[mask, "match_string"] = f"{district}, {province}"

    # Output the results
    return concat([subregion1, data])
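# fuzzy_text is a shared normalization helper defined elsewhere. The sketch below is only an
# assumption of its behavior for readers of this excerpt: strip accents, lowercase, and drop
# non-alphabetic characters so that names like "El Collao" compare as "elcollao".
def _fuzzy_text_sketch(value):
    import re
    import unicodedata

    if value is None:
        return None
    text = unicodedata.normalize("NFKD", str(value)).encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^a-z]", "", text.lower()) or None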
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Subregion code comes from the parsing parameters
    subregion1_code = parse_opts["subregion1_code"]

    # Join all input data into a single table
    cases = table_rename(concat(dataframes.values()), _column_adapter, drop=True)

    # Keep only cases for a single state
    cases = cases[cases["_state_code"].apply(safe_int_cast) == _IBGE_STATES[subregion1_code]]

    # Confirmed cases are only those with a confirmed positive test result
    cases["date_new_confirmed"] = None
    confirmed_mask = cases["_test_result"] == "Positivo"
    cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[confirmed_mask, "date_new_tested"]

    # Deceased cases have a specific label and the date is the "closing" date
    cases["date_new_deceased"] = None
    deceased_mask = cases["_prognosis"] == "Óbito"
    cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_date_update"]

    # Recovered cases have a specific label and the date is the "closing" date
    cases["date_new_recovered"] = None
    recovered_mask = cases["_prognosis"] == "Cured"
    cases.loc[recovered_mask, "date_new_recovered"] = cases.loc[recovered_mask, "_date_update"]

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region code is of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(safe_int_cast)

    # The last digit of the region code is actually not necessary
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: None if isna(x) else str(int(x))[:-1]
    )

    # Null and unknown records are state only
    subregion2_null_mask = cases["subregion2_code"].isna()
    cases.loc[subregion2_null_mask, "key"] = "BR_" + subregion1_code

    # We can derive the key from subregion1 + subregion2
    cases.loc[~subregion2_null_mask, "key"] = (
        "BR_" + subregion1_code + "_" + cases.loc[~subregion2_null_mask, "subregion2_code"]
    )

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].str.lower().apply(
        {"masculino": "male", "feminino": "female", "indefinido": None}.get
    )

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["key"])

    # Convert date to ISO format
    data["date"] = data["date"].str.slice(0, 10)
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]

    # Aggregate for the whole state
    state = data.drop(columns=["key"]).groupby(["date", "age", "sex"]).sum().reset_index()
    state["key"] = "BR_" + subregion1_code

    return concat([data, state])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename appropriate columns
    cases = table_rename(
        dataframes[0],
        {
            "residencia_provincia_id": "subregion1_code",
            "residencia_departamento_id": "subregion2_code",
            "fecha_fallecimiento": "date_new_deceased",
            "fecha_diagnostico": "_date_diagnosed",
            "fecha_internacion": "date_new_hospitalized",
            "fecha_cui_intensivo": "date_new_intensive_care",
            "clasificacion_resumen": "_classification",
            "edad": "age",
            "sexo": "sex",
        },
        drop=True,
    )

    # As long as a case is not labeled as "suspected", assume it has been tested
    cases["date_new_tested"] = None
    suspect_mask = cases["_classification"].str.lower().str.match(".*sospechoso.*")
    cases.loc[~suspect_mask, "date_new_tested"] = cases.loc[~suspect_mask, "_date_diagnosed"]

    # Get rid of all the suspected cases, since we have nothing to tally for them
    cases = cases[~suspect_mask]

    # Confirmed cases use the label "confirmado"
    cases["date_new_confirmed"] = None
    confirmed_mask = cases["_classification"].str.lower().str.match(".*confirmado.*")
    cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[confirmed_mask, "_date_diagnosed"]

    # Clean up the subregion codes
    cases["subregion1_code"] = cases["subregion1_code"].apply(
        lambda x: None if x == 0 else numeric_code_as_string(x, 2)
    )
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: None if x == 0 else numeric_code_as_string(x, 3)
    )

    # Convert subregion1_code to the corresponding ISO code
    cases["subregion1_code"] = cases["subregion1_code"].apply(_ISO_CODE_MAP.get)

    # Remove unnecessary columns before converting to time series
    cases = cases.drop(columns=[col for col in cases.columns if col.startswith("_")])

    # Go from individual case records to key-grouped records in time series format
    data = convert_cases_to_time_series(cases, ["subregion1_code", "subregion2_code"])

    # Parse dates to ISO format.
    data["date"] = data["date"].astype(str)

    # Aggregate by province and report that separately
    provinces = (
        data.drop(columns=["subregion2_code"])
        .groupby(["subregion1_code", "date", "age", "sex"])
        .sum()
        .reset_index()
    )

    # Aggregate to the country level and report that separately
    country = (
        data.drop(columns=["subregion1_code"]).groupby(["date", "age", "sex"]).sum().reset_index()
    )

    # Compute the key from the subregion codes
    country["key"] = "AR"
    provinces["key"] = "AR_" + provinces["subregion1_code"]
    data["key"] = "AR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    # Remove bogus values
    for df in (country, provinces, data):
        df.drop(df[df["key"].str.endswith("_")].index, inplace=True)
        for nn_col in ("date", "subregion1_code", "subregion2_code"):
            if nn_col in df.columns:
                df.drop(df[df[nn_col].isna() | (df[nn_col] == "")].index, inplace=True)

    return concat([data, provinces, country])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename appropriate columns
    cases = table_rename(
        dataframes[0],
        {
            "residencia_provincia_id": "subregion1_code",
            "residencia_departamento_id": "subregion2_code",
            "fecha_fallecimiento": "date_new_deceased",
            "fecha_apertura": "_date_estimate",
            "fecha_diagnostico": "date_new_tested",
            "fecha_internacion": "date_new_hospitalized",
            "fecha_cui_intensivo": "date_new_intensive_care",
            "clasificacion_resumen": "_classification",
            "edad": "age",
            "sexo": "sex",
        },
        drop=True,
    )

    # Get rid of all the suspected cases, since we have nothing to tally for them
    cases = cases[~cases["_classification"].str.lower().str.match(".*sospechoso.*")]

    # Confirmed cases use the label "confirmado"
    cases["date_new_confirmed"] = None
    confirmed_mask = cases["_classification"].str.lower().str.match(".*confirmado.*")
    cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[confirmed_mask, "date_new_tested"]

    # Estimate the confirmed date when none is available
    cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[
        confirmed_mask, "date_new_confirmed"
    ].fillna(cases.loc[confirmed_mask, "_date_estimate"])

    # Only count deaths from confirmed cases
    cases.loc[~confirmed_mask, "date_new_deceased"] = None

    # Remove unnecessary columns before converting to time series
    cases = cases.drop(columns=[col for col in cases.columns if col.startswith("_")])

    # Clean up the subregion codes
    cases["subregion1_code"] = cases["subregion1_code"].apply(
        lambda x: numeric_code_as_string(x, 2) or "00"
    )
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 3) or "000"
    )

    # Go from individual case records to key-grouped records in time series format
    data = convert_cases_to_time_series(cases, ["subregion1_code", "subregion2_code"])

    # Parse dates to ISO format.
    data["date"] = data["date"].astype(str)

    # Aggregate to the country level and report that separately
    country = (
        data.drop(columns=["subregion1_code", "subregion2_code"])
        .groupby(["date", "age", "sex"])
        .sum()
        .reset_index()
    )

    # Convert subregion1_code to the corresponding ISO code
    data["subregion1_code"] = data["subregion1_code"].apply(_ISO_CODE_MAP.get)

    # Aggregate by province and report that separately
    provinces = (
        data.drop(columns=["subregion2_code"])
        .groupby(["subregion1_code", "date", "age", "sex"])
        .sum()
        .reset_index()
    )

    # Drop regions without a code
    data = data[data["subregion2_code"] != "000"]
    data.dropna(subset=["subregion1_code", "subregion2_code"], inplace=True)
    provinces.dropna(subset=["subregion1_code"], inplace=True)

    # Compute the key from the subregion codes
    country["key"] = "AR"
    provinces["key"] = "AR_" + provinces["subregion1_code"]
    data["key"] = "AR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([data, provinces, country])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename appropriate columns
    cases = table_rename(
        dataframes[0],
        {
            "ProvRes": "match_string_province",
            "RegionRes": "match_string_region",
            "DateDied": "date_new_deceased",
            "DateSpecimen": "date_new_confirmed",
            "DateRecover": "date_new_recovered",
            "daterepconf": "_date_estimate",
            "admitted": "_hospitalized",
            "removaltype": "_prognosis",
            "Age": "age",
            "Sex": "sex",
        },
        drop=True,
    )

    # When there is a recovered removal, but missing recovery date, estimate it
    nan_recovered_mask = cases.date_new_recovered.isna() & (cases["_prognosis"] == "Recovered")
    cases.loc[nan_recovered_mask, "date_new_recovered"] = cases.loc[
        nan_recovered_mask, "_date_estimate"
    ]

    # When there is a deceased removal, but missing deceased date, estimate it
    nan_deceased_mask = cases.date_new_deceased.isna() & (cases["_prognosis"] == "Died")
    cases.loc[nan_deceased_mask, "date_new_deceased"] = cases.loc[
        nan_deceased_mask, "_date_estimate"
    ]

    # Hospitalized is estimated as the same date as confirmed if admitted == yes
    cases["date_new_hospitalized"] = None
    hospitalized_mask = cases["_hospitalized"].str.lower() == "yes"
    cases.loc[hospitalized_mask, "date_new_hospitalized"] = cases.loc[
        hospitalized_mask, "date_new_confirmed"
    ]

    # Create stratified age bands
    cases["age"] = cases["age"].apply(age_group)

    # Rename the sex values
    cases["sex"] = cases["sex"].apply({"MALE": "male", "FEMALE": "female"}.get)

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Go from individual case records to key-grouped records in a flat table
    data = convert_cases_to_time_series(
        cases, index_columns=["match_string_province", "match_string_region"]
    )

    # Convert date to ISO format
    data["date"] = data["date"].apply(safe_datetime_parse)
    data = data[~data["date"].isna()]
    data["date"] = data["date"].apply(lambda x: x.date().isoformat())
    data = data.fillna(0)

    # Aggregate regions and provinces separately
    l3 = data.rename(columns={"match_string_province": "match_string"})
    l2 = data.rename(columns={"match_string_region": "match_string"})
    l2["match_string"] = l2["match_string"].apply(lambda x: x.split(": ")[-1])

    # Ensure matching by flagging whether a record must be L2 or L3
    l2["subregion2_code"] = None
    l3["subregion2_code"] = ""
    data = concat([l2, l3]).dropna(subset=["match_string"])
    data["country_code"] = "PH"

    # Remove bogus records
    data = data[data["match_string"].notna()]
    data = data[data["match_string"] != ""]
    data = data[data["match_string"] != "REPATRIATE"]
    data = data[data["match_string"] != "CITY OF ISABELA (NOT A PROVINCE)"]
    data = data[data["match_string"] != "COTABATO CITY (NOT A PROVINCE)"]

    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(
        dataframes[0],
        {
            "classificação_final": "confirmed_label",
            "dt_notific": "date_new_confirmed",
            # "dt_inicio_sintomas": "_date_onset",
            "bairro_resid__estadia": "match_string",
            # "ap_residencia_estadia": "_health_department_code",
            "sexo": "sex",
            "faixa_etária": "age",
            "evolução": "_state_label",
            "dt_óbito": "date_new_deceased",
            "raça/cor": "ethnicity",
            "Data_atualização": "_date_updated",
        },
        drop=True,
    )

    # Currently active cases are those which are labeled as "ativo" with the report's date
    cases["date_current_confirmed"] = None
    report_date = cases["_date_updated"].iloc[0]
    cases.loc[cases["_state_label"] == "ativo", "date_current_confirmed"] = report_date

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Age is already in buckets
    cases["age"] = cases["age"].apply(lambda x: x.replace("De ", "").replace(" a ", "-"))

    # Make all unknown ages null
    cases.loc[cases["age"].str.contains("N/D"), "age"] = None

    # Ethnicity needs translation
    cases["ethnicity"] = cases["ethnicity"].apply(
        lambda x: {"preta": "black", "parda": "mixed", "branca": "white"}.get(
            str(x).lower(), "unknown"
        )
    )

    data = convert_cases_to_time_series(cases, index_columns=["match_string"])
    data["country_code"] = "BR"
    data["subregion1_code"] = "RJ"

    # Convert date to ISO format
    # This data source keeps switching between YYYY and YY for the year, so try both
    data["date"] = data["date"].apply(
        lambda x: datetime_isoformat(x, "%d/%m/%Y") or datetime_isoformat(x, "%d/%m/%y")
    )

    # The sum of all districts is the metropolitan area of Rio
    metro = data.groupby(["date", "age", "sex", "ethnicity"]).sum().reset_index()
    metro["key"] = "BR_RJ_3304557"

    # Rio is both a subregion of the state and a "locality"
    city = metro.copy()
    city["key"] = "BR_RJ_GIG"

    # Remove bogus data
    data = data[data.match_string != "INDEFINIDO"]
    data = data[data.match_string != "FORA DO MUNICÍPIO"]

    # Return only city-level data for now
    # TODO(owahltinez): add the rest of the data once statewide districts are reported
    # return concat([city, metro, data])
    return city
def test_convert_cases_to_time_series_age_negative_values(self):
    cases = read_csv(StringIO(CASE_LINE_DATA_AGE_NEGATIVE))
    table = convert_cases_to_time_series(cases)
    self.assertSetEqual({"age_unknown"}, set(table.age))
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(
        dataframes[0],
        {
            "sexoPaciente": "sex",
            "idadePaciente": "age",
            "codigoMunicipioPaciente": "subregion2_code",
            "dataResultadoExame": "date_new_tested",
            "dataObito": "date_new_deceased",
            "dataEntradaUtisSvep": "date_new_intensive_care",
            "evolucaoCasoSivep": "_prognosis",
            "dataInicioSintomas": "_date_onset",
            "dataEvolucaoCasoSivep": "_date_update",
            "resultadoFinalExame": "_test_result",
        },
        drop=True,
    )

    # Follow the procedure described in the data documentation to compute the confirmed cases:
    # https://drive.google.com/file/d/1DUwST2zcXUnCJmJauiM5zmpSVWqLiAYI/view
    cases["date_new_confirmed"] = None
    confirmed_mask = cases["_test_result"] == "Positivo"
    cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[confirmed_mask, "date_new_tested"]

    # Only count intensive care patients if they had a positive test result
    cases.loc[~confirmed_mask, "date_new_intensive_care"] = None

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region code is of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: None if isna(x) else str(safe_int_cast(x))
    )

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].apply({"MASCULINO": "male", "FEMININO": "female"}.get)

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

    # Aggregate state-level data by adding all municipalities
    state = data.drop(columns=["subregion2_code"]).groupby(["date", "age", "sex"]).sum()
    state.reset_index(inplace=True)
    state["key"] = "BR_CE"

    # Fortaleza is both a subregion of the state and a "locality"
    city = data.loc[data["subregion2_code"] == "230440"].copy()
    city["key"] = "BR_CE_FOR"

    # Drop bogus records from the data
    data = data[~data["subregion2_code"].isna() & (data["subregion2_code"] != "")]

    # We can build the key for the data directly from the subregion code
    data["key"] = "BR_CE_" + data["subregion2_code"]

    return concat([state, data, city])