def parse_age_group(age_group: str):
    new_age_group = ""
    age_bins = age_group.strip().replace("+", "-").split("-", 1)

    age_lo = safe_int_cast(age_bins[0])
    new_age_group += f"{age_lo:02d}-"

    if len(age_bins) > 1 and age_bins[1]:
        age_hi = safe_int_cast(age_bins[1])
        new_age_group += f"{age_hi:02d}"

    return new_age_group
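# Illustrative usage sketch, not part of the pipeline source; it assumes the project's
# safe_int_cast helper casts numeric strings to int. The helper normalizes raw age labels
# into zero-padded "lo-hi" bins, leaving the upper bound empty for open-ended groups:
#
#   parse_age_group("1-9")   -> "01-09"
#   parse_age_group("10-19") -> "10-19"
#   parse_age_group("75+")   -> "75-"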
def _get_province_records(key: str) -> List[Dict[str, Any]]:
    records = []
    url = _api_url_tpl.format(key.replace("_", "-"))
    for record in requests.get(url, timeout=60).json():
        records.append(
            {
                "key": key,
                "date": datetime_isoformat(record["date"], "%d.%m.%Y"),
                "total_confirmed": safe_int_cast(record["sick"]),
                "total_deceased": safe_int_cast(record["died"]),
                "total_recovered": safe_int_cast(record["healed"]),
            }
        )
    return records
def update_table(
    table_name: str = None, job_group: str = None, parallel_jobs: int = 8
) -> Response:
    table_name = _get_request_param("table", table_name)
    job_group = _get_request_param("job_group", job_group)
    process_count = _get_request_param("parallel_jobs", parallel_jobs)
    # Default to 1 if invalid process count is given
    process_count = safe_int_cast(process_count) or 1

    # Early exit: table name not found
    if table_name not in list(get_table_names()):
        return Response(f"Invalid table name {table_name}", status=400)

    with temporary_directory() as workdir:
        (workdir / "snapshot").mkdir(parents=True, exist_ok=True)
        (workdir / "intermediate").mkdir(parents=True, exist_ok=True)

        # Load the pipeline configuration given its name
        pipeline_name = table_name.replace("-", "_")
        data_pipeline = DataPipeline.load(pipeline_name)

        # Limit the sources to only the job_group provided
        if job_group is not None:
            data_pipeline.data_sources = [
                data_source
                for data_source in data_pipeline.data_sources
                if data_source.config.get("automation", {}).get("job_group") == job_group
            ]

            # Early exit: job group contains no data sources
            if not data_pipeline.data_sources:
                return Response(
                    f"No data sources matched job group {job_group} for table {table_name}",
                    status=400,
                )

        # Log the data sources being extracted
        data_source_names = [src.config.get("name") for src in data_pipeline.data_sources]
        logger.log_info(f"Updating data sources: {data_source_names}")

        # When running the data pipeline, use as many parallel processes as allowed and avoid
        # downloading files multiple times.
        run_options = dict(process_count=process_count, skip_existing=True)

        # Produce the intermediate files from the data source
        intermediate_results = data_pipeline.parse(workdir, **run_options)
        data_pipeline._save_intermediate_results(workdir / "intermediate", intermediate_results)
        intermediate_files = list(map(str, (workdir / "intermediate").glob("*.csv")))
        logger.log_info(f"Created intermediate tables: {intermediate_files}")

        # Upload results to the test bucket because these are not prod files
        upload_folder(GCS_BUCKET_TEST, "snapshot", workdir / "snapshot")
        upload_folder(GCS_BUCKET_TEST, "intermediate", workdir / "intermediate")

    return Response("OK", status=200)
def parse(
    self, sources: Dict[Any, str], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    url_tpl = sources[0]
    metadata = aux["metadata"]
    metadata = metadata[metadata["country_code"] == "FR"]

    fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
    fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
    fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
    regions_iter = fr_codes["subregion1_code"].unique()
    deps_iter = [record for _, record in fr_codes.iterrows()]

    column_adapter = {
        "key": "key",
        "date": "date",
        "testsRealisesDetails": "_breakdown_tested",
        "testsPositifsDetails": "_breakdown_confirmed",
    }

    # Get country level data
    country = _get_country(url_tpl, column_adapter)

    # Get region level data
    get_region_func = partial(_get_region, url_tpl, column_adapter, fr_iso_map)
    regions = concat(list(thread_map(get_region_func, regions_iter)))

    # Get department level data
    get_department_func = partial(_get_department, url_tpl, column_adapter)
    departments = concat(list(thread_map(get_department_func, deps_iter)))

    data = concat([country, regions, departments])
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

    data["_breakdown_tested"].fillna("", inplace=True)
    data["_breakdown_confirmed"].fillna("", inplace=True)

    records: Dict[str, List] = {"confirmed": [], "tested": []}
    for key, row in data.set_index("key").iterrows():
        for statistic in records.keys():
            if row[f"_breakdown_{statistic}"] != "":
                for item in row[f"_breakdown_{statistic}"]:
                    records[statistic].append(
                        {
                            "key": key,
                            "date": row["date"],
                            "age": item["age"],
                            "sex": item.get("sexe"),
                            f"new_{statistic}": item["value"],
                        }
                    )

    df1 = DataFrame.from_records(records["tested"])
    df2 = DataFrame.from_records(records["confirmed"])
    data = df1.merge(df2, how="outer")

    data = data[~data["age"].isin(["0", "A", "B", "C", "D", "E"])]
    data["age"] = data["age"].apply(lambda x: age_group(safe_int_cast(x)))

    sex_adapter = lambda x: {"h": "male", "f": "female"}.get(x, "sex_unknown")
    data["sex"] = data["sex"].apply(sex_adapter)

    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename the appropriate columns
    data = (
        dataframes[0]
        .rename(
            columns={
                "Location": "match_string",
                "Confirmed": "total_confirmed",
                "Deaths": "total_deceased",
                "Recoveries": "total_recovered",
                "Date": "date",
            }
        )
        .drop(columns=["Active"])
    )

    # Convert date to ISO format
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

    # The first row is metadata info about column names - discard it
    data = data[data.match_string != "#loc+name"]

    # Convert string numbers to int
    for column in ("total_confirmed", "total_deceased", "total_recovered"):
        data[column] = data[column].apply(lambda x: safe_int_cast(str(x).replace(",", "")))

    # Make sure all records have the country code
    data["country_code"] = "LY"

    # Output the results
    return data
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename the appropriate columns
    data = (
        dataframes[0]
        .rename(
            columns={
                "Date": "date",
                "Province": "match_string",
                "Cases": "total_confirmed",
                "Deaths": "total_deceased",
                "Active Cases": "current_confirmed",
                "Recoveries": "total_recovered",
            }
        )
        .drop([0])
        .drop(axis=1, columns="current_confirmed")
    )

    # Parse integers
    for column in ("total_confirmed", "total_deceased", "total_recovered"):
        data[column] = data[column].apply(lambda x: safe_int_cast(str(x).replace(",", "")))

    # Make sure all records have the country code
    data["country_code"] = "AF"

    # Remove redundant info from names
    data.match_string = data.match_string.apply(lambda x: x.replace(" Province", ""))

    # Output the results
    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(dataframes[0], _srag_column_adapter, drop=True)

    # Keep only COVID-19 cases which have a known prognosis
    covid_mask = cases["_classification"] == 5
    valid_mask = cases["_prognosis"].notna() & (cases["_prognosis"] != 9)
    cases = cases[covid_mask & valid_mask]

    # Record the date of death
    cases["date_new_deceased"] = None
    deceased_mask = cases["_prognosis"] == 2
    cases.loc[deceased_mask, "date_new_deceased"] = cases.loc[deceased_mask, "_date_prognosis"]

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].apply({"M": "male", "F": "female"}.get)

    # Convert all dates to ISO format
    for col in filter(lambda x: x.startswith("date"), cases.columns):
        cases[col] = cases[col].apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Parse subregion codes
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: numeric_code_as_string(x, 5)
    )

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion2_code"])
    data["country_code"] = "BR"

    # Get rid of bogus records
    data = data.dropna(subset=["date"])
    data = data[data["date"] >= "2020-01-01"]
    data = data[data["date"] < date_today(offset=1)]

    # Aggregate by country level
    country = (
        data.drop(columns=["subregion2_code"]).groupby(["date", "age", "sex"]).sum().reset_index()
    )
    country["key"] = "BR"

    # Aggregate by state level
    data["subregion1_code"] = data["subregion2_code"].apply(
        lambda x: _IBGE_STATES.get(safe_int_cast(x[:2]))
    )
    state = (
        data.drop(columns=["subregion2_code"])
        .dropna(subset=["subregion1_code"])
        .groupby(["date", "subregion1_code", "age", "sex"])
        .sum()
        .reset_index()
    )
    state["key"] = "BR_" + state["subregion1_code"]

    # Derive the key from subregion codes
    data = data[data["subregion2_code"].notna()]
    data["key"] = "BR_" + data["subregion1_code"] + "_" + data["subregion2_code"]

    return concat([country, state, data])
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename the appropriate columns
    data = dataframes[0].rename(
        columns={
            # Sometimes the column names are in the local language
            "Dátum": "date",
            "Mintavételek száma (összesen)": "total_tested",
            "mintavételek száma": "new_tested",
            "pozitív esetek száma (összesen)": "total_confirmed",
            "napi pozitív esetszám": "new_confirmed",
            "hospitalizált": "current_hospitalized",
            "intenzív ellátásra szoruló": "current_intensive_care",
            "a kórházból elbocsátottak napi száma": "new_recovered",
            "elhunytak száma összesen": "total_deceased",
            "elhunytak": "new_deceased",
            # Sometimes the column names are in English
            "Date": "date",
            "Tested (all)": "total_tested",
            "Tested (daily)": "new_tested",
            "Positive (all)": "total_confirmed",
            "Positive (daily)": "new_confirmed",
            "All hospitalized on certain day": "current_hospitalized",
            "All persons in intensive care on certain day": "current_intensive_care",
            "Discharged": "new_recovered",
            "Deaths (all)": "total_deceased",
            "Deaths (daily)": "new_deceased",
        }
    )

    # It's only country-level data so we can compute the key directly
    data["key"] = "SI"

    # Make sure that the date column is an ISO-format string
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(str(x)[:10], "%Y-%m-%d"))

    # Drop bogus values
    data.dropna(subset=["date"], inplace=True)

    # Remove non-numeric markers from data fields
    value_columns = [
        col
        for col in data.columns
        if any(col.startswith(token) for token in ("new", "total", "current"))
    ]
    for col in value_columns:
        data[col] = data[col].apply(lambda x: safe_int_cast(str(x).replace("*", "")))

    # Output the results
    return data
def _default_age_adapter(value: Any) -> str:
    if isna(value):
        return "age_unknown"
    try:
        value_int = safe_int_cast(value)
        if value_int is not None:
            return age_group(value_int)
        if re.match(r"\d+\-\d*", value):
            return value
    except ValueError:
        pass
    return "age_unknown"
def _default_age_adapter(value: Any) -> str:
    if isna(value):
        return "age_unknown"
    try:
        value = str(value)
        if re.match(r"\d+(\.\d*)?", value):
            return age_group(safe_int_cast(value))
        if re.match(r"\d\d?\-\d*", value):
            return value
    except ValueError:
        pass
    return "age_unknown"
def parse_dataframes(
    self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Read data from GitHub repo
    data = {}
    for df, name in zip(dataframes, ("confirmed", "deceased", "recovered")):
        df.rename(columns={"ISO 3166-2 Code": "key"}, inplace=True)
        df.key = df.key.str.replace("-", "_")
        data[name] = df.set_index("key")

    # Transform the data from non-tabulated format to record format
    records = []
    for key in data["confirmed"].index.unique():
        date_columns = [
            col for col in data["confirmed"].columns if re.match(r"\d\d\d\d-\d\d-\d\d", col)
        ]
        for col in date_columns:
            records.append(
                {
                    **{"date": col, "key": key},
                    **{
                        f"total_{var}": safe_int_cast(df.loc[key, col])
                        for var, df in data.items()
                    },
                }
            )

    # Zeroes can be considered NaN in this data
    data = DataFrame.from_records(records).replace(0, None)

    # Since it's cumsum data, we can forward fill relatively safely
    data = data.sort_values(["key", "date"]).groupby("key").apply(lambda x: x.ffill())

    # We already have CO data directly from the authoritative source, ideally we would match
    # the keys from here to the source which uses DIVIPOLA codes. That's left as a TODO for now.
    data = data[~data.key.str.startswith("CO_")]

    # Correct the French colonies since we are using something different to the ISO 3166-2 code
    data.loc[data.key == "FR_GP", "key"] = "FR_GUA"

    # VE_W are "external territories" of Venezuela, which can't be mapped to any particular
    # geographical region and therefore we exclude them
    data = data[data.key != "VE_W"]

    return data
def _process_html_file(file_map: Dict[str, str], date: str) -> List[Dict[str, Any]]:
    records = []

    fname = file_map[date]
    with open(fname, "r") as fd:
        html_content = fd.read()

    # Make sure the tables are exactly what we expect
    age_groups = ["0-17", "18-34", "35-49", "50-64", "65-79", "80-"]
    tables = list(_extract_tables(html_content))[-6:]
    if len(tables) == len(age_groups):
        for age_group, table in zip(age_groups, tables):
            total = table.iloc[-1]
            records.append(
                {
                    "key": "US_CA",
                    "date": date,
                    "age": age_group,
                    "total_confirmed": safe_int_cast(total["No. Cases"]),
                    "total_deceased": safe_int_cast(total["No. Deaths"]),
                }
            )

    return records
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Partition dataframes based on the state the data is for
    partitions = {code: [] for code in _IBGE_STATES.values()}
    for df in dataframes.values():
        df = table_rename(df, _open_data_portal_column_adapter, drop=True)
        apply_func = lambda x: _IBGE_STATES.get(safe_int_cast(x))
        df["subregion1_code"] = df["_state_code"].apply(apply_func)
        for code, group in df.groupby("subregion1_code"):
            partitions[code].append(group)

    # Process each partition in a separate process
    map_opts = dict(desc="Processing Partitions", total=len(partitions))
    map_iter = (concat(chunks) for chunks in partitions.values())
    return concat(process_map(_process_partition, map_iter, **map_opts))
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = read_file(sources[0], error_bad_lines=False, encoding="ISO-8859-1")
    data = table_rename(
        data,
        {
            "Date": "date",
            "Nombre de personnes en soins normaux": "current_hospitalized",
            "Nombre de personnes en soins intensifs (sans patients du Grand Est)": "current_intensive_care",
            "Nombre de décès - cumulé (sans patients du Grand Est)": "total_deceased",
            "Total patients COVID ayant quitté l'hôpital (hospitalisations stationnaires, données brutes)": "new_recovered",
            "Nombre de nouvelles personnes testées COVID+ par jour ": "new_tested",
        },
    )

    # Get date in ISO format
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Keep only the columns we can process
    data = data[
        [
            "date",
            "current_hospitalized",
            "current_intensive_care",
            "total_deceased",
            "new_recovered",
            "new_tested",
        ]
    ]

    # Convert recovered into a number
    data.new_recovered = data.new_recovered.apply(lambda x: safe_int_cast(x.replace("-", "0")))

    # Only country-level data is provided
    data["key"] = "LU"

    # Output the results
    return data
def _default_age_adapter(value: Any) -> str:
    if isna(value):
        return "age_unknown"

    # If the value is already in the form of an age group, return as-is
    if isinstance(value, str) and re.match(r"^\d+\-\d*$", value):
        return value

    # Otherwise assume it's a number and return the corresponding age group
    try:
        value_int = safe_int_cast(value)
        if value_int is not None:
            return age_group(value_int)
    except ValueError:
        pass

    return "age_unknown"
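# Illustrative behavior sketch, not part of the source. Assuming the project's age_group
# helper with its default 10-year bins, this adapter would map raw values roughly as:
#
#   _default_age_adapter("20-29") -> "20-29"        (already an age group, returned as-is)
#   _default_age_adapter(34)      -> "30-39"        (assumed default binning)
#   _default_age_adapter(None)    -> "age_unknown"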
def test_cast_int(self):
    test_data = {
        "1": 1,
        "1.0": 1,
        "-1": -1,
        "-1.0": -1,
        "+1": 1,
        "1e3": 1000,
        "−1": -1,  # Uses the Unicode minus sign (U+2212) rather than an ASCII hyphen.
        **TEST_DATA_NULL,
    }
    for value, expected in test_data.items():
        result = safe_int_cast(value)
        self.assertEqual(result, expected, f"[{value}] Expected: {expected}. Found: {result}")
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Parse the data as a list of records
    records = []
    for _, row in dataframes[0].iterrows():
        row = row.to_dict()

        # Parse the start and end dates
        date_start = row["Effective Date"][:10]
        date_end = row["Valid Through Date"][:10]

        # Convert column name and delete unnecessary columns
        row["subregion1_name"] = row["Jurisdictions"]
        del row["Jurisdictions"]
        del row["Effective Date"]
        del row["Valid Through Date"]

        # Insert a record for each date in the range
        for date in date_range(date_start, date_end):
            record = {}
            record["date"] = date
            non_numeric_columns = ("date", "subregion1_name")
            for col, val in row.items():
                if col in non_numeric_columns:
                    record[col] = val
                else:
                    record[_convert_column_name(col)] = safe_int_cast(val)
            records.append(record)

    # Convert to DataFrame and add metadata for matching
    data = DataFrame.from_records(records)
    data["country_code"] = "US"
    data["subregion2_code"] = None
    data["locality_code"] = None

    return data
def _parse_value(val: Any) -> int:
    if isinstance(val, dict):
        return _parse_value(val["value"])
    else:
        return safe_int_cast(val)
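# Illustrative usage sketch, not part of the source: _parse_value recursively unwraps nested
# {"value": ...} payloads before casting, so all of the following yield the same integer
# (assuming the project's safe_int_cast casts numeric strings to int):
#
#   _parse_value("42")                      -> 42
#   _parse_value({"value": "42"})           -> 42
#   _parse_value({"value": {"value": 42}})  -> 42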
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Rename the appropriate columns
    data = table_rename(
        dataframes[0],
        {
            "date": "date",
            "age": "age",
            "cum.deaths": "total_deceased",
            "daily.deaths": "new_deceased",
            "code": "subregion1_code",
        },
        drop=True,
    )

    # Convert date to ISO format
    data["date"] = data["date"].apply(lambda x: str(x)[:10])

    # Correct an error in the age bins from data source
    data.loc[data["age"] == "-1-9", "age"] = "1-9"

    # Parse age to match our group names
    def parse_age_group(age_group: str):
        new_age_group = ""
        age_bins = age_group.strip().replace("+", "-").split("-", 1)

        age_lo = safe_int_cast(age_bins[0])
        new_age_group += f"{age_lo:02d}-"

        if len(age_bins) > 1 and age_bins[1]:
            age_hi = safe_int_cast(age_bins[1])
            new_age_group += f"{age_hi:02d}"

        return new_age_group

    data["age"] = data["age"].apply(parse_age_group)

    # Derive key from the subregion code
    data["key"] = "US_" + data["subregion1_code"]

    # Some of the places are not US states
    data.loc[data["subregion1_code"] == "NYC", "key"] = "US_NY_NYC"

    # Compute our own age groups since they are not uniform across states
    for idx in range(10):
        data[f"age_bin_{idx:02d}"] = None
        data[f"new_deceased_age_{idx:02d}"] = None
        data[f"total_deceased_age_{idx:02d}"] = None
    for key in data["key"].unique():
        mask = data["key"] == key
        age_bins = data.loc[mask, "age"].unique()
        sorted_age_bins = sorted(age_bins, key=lambda x: safe_int_cast(x.split("-")[0]))
        for idx, age_bin_val in enumerate(sorted_age_bins):
            data.loc[mask, f"age_bin_{idx:02d}"] = age_bin_val
            age_bin_mask = mask & (data["age"] == age_bin_val)
            data.loc[age_bin_mask, f"new_deceased_age_{idx:02d}"] = data.loc[
                age_bin_mask, "new_deceased"
            ]
            data.loc[age_bin_mask, f"total_deceased_age_{idx:02d}"] = data.loc[
                age_bin_mask, "total_deceased"
            ]

    # Output the results
    return data.drop(columns=["age", "subregion1_code"])
def fix_temp(value: int):
    value = safe_int_cast(value)
    return None if value is None else "%.1f" % (value / 10.0)
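# Illustrative usage sketch, not part of the source: the raw readings appear to be stored in
# tenths of a degree, so fix_temp renders them with one decimal place and passes values that
# cannot be cast through as None:
#
#   fix_temp("231") -> "23.1"
#   fix_temp(-42)   -> "-4.2"
#   fix_temp(None)  -> None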
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    cases = table_rename(
        dataframes[0],
        {
            "sexoPaciente": "sex",
            "idadePaciente": "age",
            "codigoMunicipioPaciente": "subregion2_code",
            "dataResultadoExame": "date_new_tested",
            "dataObito": "date_new_deceased",
            "dataEntradaUtisSvep": "date_new_intensive_care",
            "evolucaoCasoSivep": "_prognosis",
            "dataInicioSintomas": "_date_onset",
            "dataEvolucaoCasoSivep": "_date_update",
            "resultadoFinalExame": "_test_result",
        },
        drop=True,
    )

    # Follow the procedure described in the data documentation to compute the confirmed cases:
    # https://drive.google.com/file/d/1DUwST2zcXUnCJmJauiM5zmpSVWqLiAYI/view
    cases["date_new_confirmed"] = None
    confirmed_mask = cases["_test_result"] == "Positivo"
    cases.loc[confirmed_mask, "date_new_confirmed"] = cases.loc[confirmed_mask, "date_new_tested"]

    # Only count intensive care patients if they had a positive test result
    cases.loc[~confirmed_mask, "date_new_intensive_care"] = None

    # Drop columns which we have no use for
    cases = cases[[col for col in cases.columns if not col.startswith("_")]]

    # Make sure our region code is of type str
    cases["subregion2_code"] = cases["subregion2_code"].apply(
        lambda x: None if isna(x) else str(safe_int_cast(x))
    )

    # Convert ages to int, and translate sex (no "other" sex/gender reported)
    cases["age"] = cases["age"].apply(safe_int_cast)
    cases["sex"] = cases["sex"].apply({"MASCULINO": "male", "FEMENINO": "female"}.get)

    # Convert to time series format
    data = convert_cases_to_time_series(cases, index_columns=["subregion2_code"])

    # Convert date to ISO format
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

    # Aggregate state-level data by adding all municipalities
    state = data.drop(columns=["subregion2_code"]).groupby(["date", "age", "sex"]).sum()
    state.reset_index(inplace=True)
    state["key"] = "BR_CE"

    # Fortaleza is both a subregion of the state and a "locality"
    city = data.loc[data["subregion2_code"] == "230440"].copy()
    city["key"] = "BR_CE_FOR"

    # Drop bogus records from the data
    data = data[~data["subregion2_code"].isna() & (data["subregion2_code"] != "")]

    # We can build the key for the data directly from the subregion code
    data["key"] = "BR_CE_" + data["subregion2_code"]

    return concat([state, data, city])
def _age_adapter(age: str) -> str:
    try:
        age = safe_int_cast(str(age).replace("+", "-").split("-")[0])
        return age_group(age, bin_size=10, age_cutoff=70)
    except:
        return "age_unknown"
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    if parse_opts.get("debug"):
        print("File name:", sources[0])

    # Get the file contents from source
    html_content = open(sources[0]).read()

    # We need to set locale in order to parse dates properly
    locale.setlocale(locale.LC_TIME, parse_opts.get("locale", "en_US") + ".UTF-8")

    # Tables keep changing order, so iterate through all until one looks good
    table_count = count_html_tables(html_content, selector="table.wikitable")

    data: DataFrame = None
    for table_index in range(table_count):
        data = read_html(
            html_content,
            header=True,
            selector="table.wikitable",
            parser=wiki_html_cell_parser,
            table_index=table_index,
            skiprows=parse_opts.get("skiprows", 0),
        )

        if parse_opts.get("debug"):
            print("\n[%d] Data:" % (table_index + 1))
            print(data.columns)
            print(data.head(50))

        # Some of the tables are in Spanish
        data = data.rename(columns={"Fecha": "Date"})

        # Set first date column as index, drop other date columns
        columns_lowercase = [(col or "").lower() for col in data.columns]
        date_index = columns_lowercase.index("date") if "date" in columns_lowercase else 0
        del_index = [i for i, col in enumerate(columns_lowercase) if col == "date"][1:]
        data = data.iloc[:, [i for i, _ in enumerate(data.columns) if i not in del_index]]
        data = data.set_index(data.columns[date_index])
        # data = data.iloc[:, :-parse_opts.get('skipcols', 0)]

        if parse_opts.get("droprows") is not None:
            try:
                data = data.drop(parse_opts["droprows"].split(","))
            except:
                pass

        # Pivot the table to fit our preferred format
        data = pivot_table(data, pivot_name="subregion")
        data = data[~data["subregion"].isna()]

        if parse_opts.get("debug"):
            print("\n[%d] Pivoted:" % (table_index + 1))
            print(data.head(50))

        # Make sure all dates include year
        date_format = parse_opts["date_format"]
        if "%Y" not in date_format:
            date_format = date_format + "-%Y"
            data["date"] = data["date"].astype(str) + "-%d" % datetime.datetime.now().year

        # Parse into datetime object, drop if not possible
        data["date"] = data["date"].apply(lambda date: safe_datetime_parse(date, date_format))
        data = data[~data["date"].isna()]

        # If the dataframe is not empty, then we found a good one
        if len(data) > 10 and len(data["subregion"].unique()) > 3:
            break

    # Make sure we have *some* data
    assert data is not None and len(data) > 0

    # Convert all dates to ISO format
    data["date"] = data["date"].apply(lambda date: date.date().isoformat())

    # Get the confirmed and deaths data from the table
    data["confirmed"] = data["value"].apply(lambda x: safe_int_cast(self._parenthesis(x)[0]))
    data["deceased"] = data["value"].apply(lambda x: safe_int_cast(self._parenthesis(x)[1]))

    # Add up all the rows with same date and subregion
    data = data.sort_values(["date", "subregion"])
    data = (
        data.drop(columns=["value"])
        .groupby(["subregion", "date"])
        .agg(self._aggregate_region_values)
    )
    data = data.reset_index().sort_values(["date", "subregion"])

    # Add the appropriate columns to the data
    value_columns = ["confirmed", "deceased"]
    for col in value_columns:
        data["new_" + col] = None
        data["total_" + col] = None

    # Iterate over the individual subregions to process the values per group
    for region in data["subregion"].unique():
        mask = data["subregion"] == region

        for column in value_columns:
            # We can forward-fill values if data is cumsum
            if parse_opts.get("cumsum"):
                data.loc[mask, column] = data.loc[mask, column].ffill()

            # Fill NA with zero to allow for column-wide operations
            zero_filled = data.loc[mask, column].fillna(0)

            # Only perform operation if the column is not all NaN
            if sum(zero_filled) > 0:

                # Compute diff of the values region by region if required
                if parse_opts.get("cumsum"):
                    data.loc[mask, "new_" + column] = zero_filled.diff()
                    data.loc[mask, "total_" + column] = zero_filled

                # If values are already new daily counts, then empty most likely means zero
                else:
                    data.loc[mask, "new_" + column] = zero_filled
                    data.loc[mask, "total_" + column] = zero_filled.cumsum()

    # Get rid of rows which have all null values
    data = data.dropna(how="all", subset=value_columns)

    # Add the country code to all records
    data["country_code"] = parse_opts["country"]

    # Labels can be any arbitrary column name
    data = data.rename(columns={"subregion": "match_string"})

    # Drop columns if requested
    if "drop_column" in parse_opts:
        data = data.drop(columns=[parse_opts["drop_column"]])

    # Filter out the appropriate levels
    aggregation_level = parse_opts.get("aggregation_level", 1)
    if aggregation_level == 1:
        null_column = "subregion2_code"
    elif aggregation_level == 2:
        null_column = "subregion1_code"
    data[null_column] = None

    # Remove known values that are just noise
    data["_match_string"] = data["match_string"].apply(lambda x: x.lower())
    data = data[
        ~data["_match_string"].isin(
            [
                "cml",
                "new",
                "total",
                "tests",
                "deaths",
                "abroad",
                "airport",
                "current",
                "newcases",
                "acumulado",
                "repatriated",
                "totaltested",
                "confirmed cases",
                "unassigned\ncases",
                "airport screening",
            ]
        )
    ]
    data = data[~data["_match_string"].apply(lambda x: "princess" in x or "total" in x)]

    # Output the results
    if parse_opts.get("debug"):
        print("\nOutput:")
        print(data.head(50))

    return data
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Read data from GitHub repo
    data = {}
    for name, df in dataframes.items():
        df.rename(columns={"ISO 3166-2 Code": "key"}, inplace=True)
        df.key = df.key.str.replace("-", "_")
        data[name] = df.set_index("key")

    value_columns = [f"total_{statistic}" for statistic in data.keys()]

    # Transform the data from non-tabulated format to record format
    records = []
    for key in data["confirmed"].index.unique():
        date_columns = [
            col for col in data["confirmed"].columns if re.match(r"\d\d\d\d-\d\d-\d\d", col)
        ]
        for col in date_columns:
            records.append(
                {
                    **{"date": col, "key": key},
                    **{
                        f"total_{statistic}": safe_int_cast(df.loc[key, col])
                        for statistic, df in data.items()
                    },
                }
            )

    # Zeroes can be considered NaN in this data
    data = DataFrame.from_records(records).replace(0, None)

    # Remove all data without a proper date
    data = data.dropna(subset=["date"])

    # Since it's cumsum data, we can forward fill relatively safely
    data = data.sort_values(["key", "date"]).groupby("key").apply(lambda x: x.ffill())

    # We already have AR, BR, CL, CO, MX and PE data directly from the authoritative source
    for country_code in ("AR", "BR", "CL", "CO", "MX", "PE"):
        data = data[~data.key.str.startswith(f"{country_code}_")]

    # Correct the French colonies since we are using something different to the ISO 3166-2 code
    data.loc[data.key == "FR_GP", "key"] = "FR_GUA"

    # VE_W are "external territories" of Venezuela, which can't be mapped to any particular
    # geographical region and therefore we exclude them
    data = data[data.key != "VE_W"]

    # The data appears to be very unreliable prior to March 12
    data.loc[data["date"] <= "2020-03-12", value_columns] = None

    # In many cases, total counts go down dramatically; filter out all dates prior to that.
    # We lose a lot of data by setting such a low threshold, but this data source is unreliable
    skip_threshold = 0
    for key in data["key"].unique():
        rm_date = "2020-01-01"
        subset = data[data["key"] == key]
        diffs = subset.set_index("date")[value_columns].diff()
        for col in value_columns:
            max_date = diffs.loc[diffs[col] < skip_threshold].index.max()
            if not isna(max_date) and max_date > rm_date:
                rm_date = max_date
        data.loc[(data["key"] == key) & (data["date"] < rm_date), value_columns] = None

    return data