def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts):
    # Read all files in the eurostat folder and merge them together
    eurostat_directory = SRC / "data" / "eurostat"
    dataframes = [read_file(file_name) for file_name in eurostat_directory.glob("*.csv")]
    data = table_merge(dataframes, how="outer").dropna(subset=["key"])

    # Use only keys available in metadata
    return data.merge(aux["metadata"][["key"]], how="inner")
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = read_file(
        sources[0], error_bad_lines=False, encoding="ISO-8859-1", sep=";"
    ).rename(
        columns={
            "Date": "date",
            "Nombre de personnes en soins normaux": "current_hospitalized",
            "Nombre de personnes en soins intensifs (sans patients du Grand Est)": "current_intensive_care",
            "Nombre de décès - cumulé (sans patients du Grand Est)": "deceased",
            "Total patients COVID ayant quitté l'hôpital (hospitalisations stationnaires, données brutes)": "recovered",
            "Nombre de nouvelles personnes testées COVID+ par jour ": "tested",
        }
    )

    # Get date in ISO format
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Keep only columns we can process
    data = data[
        [
            "date",
            "current_hospitalized",
            "current_intensive_care",
            "deceased",
            "recovered",
            "tested",
        ]
    ]

    # Convert recovered into a number
    data.recovered = data.recovered.apply(lambda x: int(x.replace("-", "0")))

    # Compute the daily counts
    data["key"] = "LU"
    data_new = grouped_diff(data[["key", "date", "deceased"]], ["key", "date"])
    data_cum = grouped_cumsum(data[["key", "date", "tested", "recovered"]], ["key", "date"])
    data_cur = data[["key", "date", "current_hospitalized", "current_intensive_care"]]
    data = data_new.merge(data_cum, how="outer").merge(data_cur, how="outer")

    # Output the results
    return data
def test_key_build(self):
    skip_keys = ("UA_40", "UA_43")
    metadata = read_file(METADATA_PATH).set_index("key")
    localities = read_file(SRC / "data" / "localities.csv")["locality"].unique()
    for key, record in metadata.iterrows():
        msg = f"{key} does not match region codes in metadata"
        tokens = key.split("_")
        if key in skip_keys:
            continue
        elif len(tokens) == 1:
            self.assertEqual(key, record["country_code"], msg)
        elif key in localities or not isna(record["locality_code"]):
            self.assertEqual(tokens[-1], record["locality_code"], msg)
        elif len(tokens) == 2:
            self.assertEqual(tokens[0], record["country_code"], msg)
            self.assertEqual(tokens[1], record["subregion1_code"], msg)
        elif len(tokens) == 3:
            self.assertEqual(tokens[0], record["country_code"], msg)
            self.assertEqual(tokens[1], record["subregion1_code"], msg)
            self.assertEqual(tokens[2], record["subregion2_code"], msg)
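# Illustrative sketch (not from the source) of the key layout the test above assumes;
# the example keys are hypothetical except US_GA_ATL, which appears in the locality test data:
#   "US"           -> country_code only (1 token)
#   "US_CA"        -> country_code + subregion1_code (2 tokens)
#   "US_CA_06001"  -> country_code + subregion1_code + subregion2_code (3 tokens)
#   "US_GA_ATL"    -> locality keys end in locality_code and are checked before the 2/3-token cases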
def _process_cache_file(file_map: Dict[str, str], date: str) -> DataFrame:
    # Records are nested under the "vaccination_county_condensed_data" field
    data = read_file(file_map[date])["vaccination_county_condensed_data"].values.tolist()
    data = DataFrame.from_records(data)

    # Drop records without a known FIPS code
    data = data[data["FIPS"] != "UNK"]
    data = data.assign(
        key="US_" + data["StateAbbr"].str[:2] + "_" + data["FIPS"],
        Series_Complete_Yes=data["Series_Complete_Yes"].fillna(0).astype(int),
    )
    data = table_rename(data, _column_adapter, drop=True)
    data["date"] = date
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = read_file(sources[0], sheet_name=parse_opts.get("sheet_name"))

    # Process the individual sheet
    data = _sheet_processors[parse_opts.get("sheet_name")](data)

    # Fix up the date format
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

    # Add a key to all the records (state-level only)
    data["key"] = "US_DC"

    return data
def _read_main_table(path: Path) -> DataFrame:
    return read_file(
        path,
        dtype={
            "country_code": "category",
            "country_name": "category",
            "subregion1_code": "category",
            "subregion1_name": "category",
            "subregion2_code": "category",
            "subregion2_name": "category",
            "3166-1-alpha-2": "category",
            "3166-1-alpha-3": "category",
            "aggregation_level": "category",
        },
    )
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = read_file(sources[0], sheet_name="Antal intensivvårdade per dag").rename(
        columns={"Datum_vårdstart": "date", "Antal_intensivvårdade": "intensive_care"}
    )

    data["key"] = "SE"

    # Get date in ISO format
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))

    return grouped_cumsum(data, ["key", "date"])
def test_derive_localities(self):
    localities = read_file(SRC / "data" / "localities.csv")
    test_data = LOCALITY_TEST_DATA.copy()
    expected = DataFrame.from_records(
        [
            {"key": "BR_RJ_GIG", "date": "2020-01-01", "val": 1},
            {"key": "BR_RJ_GIG", "date": "2020-01-02", "val": 1},
            {"key": "US_GA_ATL", "date": "2020-01-01", "val": 4},
            {"key": "US_GA_ATL", "date": "2020-01-02", "val": 4},
        ]
    )
    columns = test_data.columns
    test_result = derive_localities(localities, test_data)[columns]
    self.assertEqual(test_result.to_csv(index=False), expected.to_csv(index=False))
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = read_file(sources[0], sheet_name=parse_opts.get("sheet_name"))

    # The column names are in the second row of the sheet, data starts on the third
    data.columns = data.iloc[1]
    data = table_rename(data.iloc[2:], _column_adapter, drop=True)

    # Get date in ISO format
    data["date"] = data["date"].astype(str).apply(lambda x: x[:10])
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))
    data = data.dropna(subset=["date"])

    if parse_opts.get("key"):
        data["key"] = parse_opts.get("key")

    return data
def subset_latest(output_folder: Path, csv_file: Path) -> None:
    """ Outputs latest data for each key """
    latest_folder = output_folder / "latest"
    latest_folder.mkdir(exist_ok=True)
    table = read_file(csv_file, low_memory=False)

    # Degenerate case: this table has no date
    if "date" not in table.columns or len(table.date.dropna()) == 0:
        export_csv(table, latest_folder / csv_file.name)
    else:
        non_null_columns = [col for col in table.columns if col not in ("key", "date")]
        table = table.dropna(subset=non_null_columns, how="all")
        table = table.sort_values("date").groupby("key").last().reset_index()
        export_csv(table, latest_folder / csv_file.name)
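# Illustrative usage sketch (not from the source); the folder and file names below are
# hypothetical. This would write <output>/latest/epidemiology.csv containing only the
# most recent non-empty record for each key.
# subset_latest(Path("output/v2"), Path("output/v2/epidemiology.csv"))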
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = read_file(sources[0], sheet_name="Antal per dag region").rename(
        columns={"Statistikdatum": "date"}
    )

    # Get date in ISO format
    data.date = data.date.astype(str)

    # Unpivot the regions which are columns
    data.columns = [col.replace("_", " ") for col in data.columns]
    data = data.drop(columns=["Totalt antal fall"]).set_index("date")
    data = pivot_table(data, pivot_name="match_string")

    data["country_code"] = "SE"
    return data.rename(columns={"value": "new_confirmed"})
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = read_file(sources[0], sheet_name="Antal intensivvårdade per dag").rename(
        columns={"Datum_vårdstart": "date", "Antal_intensivvårdade": "new_intensive_care"}
    )

    data["key"] = "SE"

    # Get date in ISO format
    # The source is actually %m/%d/%Y but pandas silently converts it to date object
    data["date"] = data["date"].astype(str).apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))

    return data
def _subset_last_days(output_folder: Path, days: int) -> None:
    """ Outputs last N days of data """
    n_days_folder = output_folder / str(days)
    n_days_folder.mkdir(exist_ok=True)
    for csv_file in (output_folder).glob("*.csv"):
        table = read_file(csv_file)

        # Degenerate case: this table has no date
        if "date" not in table.columns or len(table.date.dropna()) == 0:
            export_csv(table, n_days_folder / csv_file.name)
        else:
            last_date = datetime.date.fromisoformat(max(table.date))
            # Since APAC is almost always +1 days ahead, increase the window by 1
            first_date = last_date - datetime.timedelta(days=days + 1)
            export_csv(table[table.date >= first_date.isoformat()], n_days_folder / csv_file.name)
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = read_file(sources[0], error_bad_lines=False, encoding="ISO-8859-1")
    data = table_rename(
        data,
        {
            "Date": "date",
            "Nombre de personnes en soins normaux": "current_hospitalized",
            "Nombre de personnes en soins intensifs (sans patients du Grand Est)": "current_intensive_care",
            "Nombre de décès - cumulé (sans patients du Grand Est)": "total_deceased",
            "Total patients COVID ayant quitté l'hôpital (hospitalisations stationnaires, données brutes)": "new_recovered",
            "Nombre de nouvelles personnes testées COVID+ par jour ": "new_tested",
        },
    )

    # Get date in ISO format
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%d/%m/%Y"))

    # Keep only columns we can process
    data = data[
        [
            "date",
            "current_hospitalized",
            "current_intensive_care",
            "total_deceased",
            "new_recovered",
            "new_tested",
        ]
    ]

    # Convert recovered into a number
    data.new_recovered = data.new_recovered.apply(lambda x: safe_int_cast(x.replace("-", "0")))

    # Only country-level data is provided
    data["key"] = "LU"

    # Output the results
    return data
def _process_cache_file(file_map: Dict[str, str], date: str) -> DataFrame:
    # Records are nested under the "vaccination_data" field
    data = read_file(file_map[date])["vaccination_data"].values.tolist()
    data = DataFrame([list(v.values()) for v in data], columns=list(data[0].keys()))

    # Keep only known US locations and convert count columns to integers
    data = data.loc[data.Location.isin(us_states)]
    for col in set(_column_adapter.keys()).intersection(data.columns):
        data[col] = data[col].fillna(0).astype(int)

    # The "US" location is the country-level record, all others are states
    data["key"] = data["Location"].apply(lambda x: "US" if x == "US" else "US_" + x[:2])
    data = table_rename(data, _column_adapter, drop=True)
    data["date"] = date
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    url_tpl = sources[0]

    metadata = aux["metadata"]
    metadata = metadata[metadata["country_code"] == "FR"]

    fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
    fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
    fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
    regions_iter = fr_codes["subregion1_code"].unique()
    deps_iter = [record for _, record in fr_codes.iterrows()]

    # For country level, there is no need to estimate confirmed from tests
    column_adapter_country = dict(_column_adapter)
    column_adapter_country.pop("testsPositifs")

    # Get country level data
    country = _get_country(url_tpl, column_adapter_country)

    # Country level data has totals instead of diffs, so we compute the diffs by hand
    country.sort_values("date", inplace=True)
    country["new_confirmed"] = country["total_confirmed"].diff()
    country.drop(columns=["total_confirmed"], inplace=True)

    # For region level, we can only estimate confirmed from tests
    column_adapter_region = dict(_column_adapter)
    column_adapter_region.pop("casConfirmes")

    # Get region level data
    get_region_func = partial(_get_region, url_tpl, column_adapter_region, fr_iso_map)
    regions = concat(list(thread_map(get_region_func, regions_iter)))

    # Get department level data
    get_department_func = partial(_get_department, url_tpl, column_adapter_region)
    departments = concat(list(thread_map(get_department_func, deps_iter)))

    data = concat([country, regions, departments])
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
    return data.sort_values("date")
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    url_tpl = sources[0]

    metadata = aux["metadata"]
    metadata = metadata[metadata["country_code"] == "FR"]

    fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
    fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
    fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
    regions_iter = fr_codes["subregion1_code"].unique()
    deps_iter = (record for _, record in fr_codes.iterrows())

    if parse_opts.get("country"):
        # For country level, there is no need to estimate confirmed from tests
        _column_adapter_2 = dict(_column_adapter)
        _column_adapter_2.pop("testsPositifs")
        data = _get_country(url_tpl, _column_adapter_2)

    else:
        # For region level, we can only estimate confirmed from tests
        _column_adapter_2 = dict(_column_adapter)
        _column_adapter_2.pop("casConfirmes")

        get_region_func = partial(_get_region, url_tpl, _column_adapter_2, fr_iso_map)
        regions = concat(list(thread_map(get_region_func, regions_iter)))

        get_department_func = partial(_get_department, url_tpl, _column_adapter_2)
        departments = concat(list(thread_map(get_department_func, deps_iter, total=len(fr_codes))))

        data = concat([regions, departments])

    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Retrieve the CSV files from https://covid19.isciii.es
    data = (
        read_file(sources[0], error_bad_lines=False, encoding="ISO-8859-1")
        .rename(
            columns={
                "FECHA": "date",
                "CCAA": "subregion1_code",
                "Fallecidos": "total_deceased",
                "Hospitalizados": "total_hospitalized",
                "UCI": "total_intensive_care",
            }
        )
        .dropna(subset=["date"])
    )

    # Confirmed cases are split across 2 columns
    confirmed_columns = ["CASOS", "PCR+"]
    for col in confirmed_columns:
        data[col] = data[col].fillna(0)
    data["total_confirmed"] = data.apply(lambda x: sum([x[col] for col in confirmed_columns]), axis=1)

    # Convert dates to ISO format
    data["date"] = data["date"].apply(lambda date: datetime_isoformat(date, "%d/%m/%Y"))

    # Keep only the columns we can process
    data = data[
        [
            "date",
            "subregion1_code",
            "total_confirmed",
            "total_deceased",
            "total_hospitalized",
            "total_intensive_care",
        ]
    ]

    # Derive the key from the subregion code
    data["key"] = "ES_" + data["subregion1_code"]

    # Output the results
    return data
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Retrieve the CSV files from https://covid19.isciii.es
    df = (
        read_file(sources[0], error_bad_lines=False, encoding="ISO-8859-1")
        .rename(
            columns={
                "FECHA": "date",
                "CCAA": "subregion1_code",
                "Fallecidos": "deceased",
                "Hospitalizados": "hospitalized",
                "UCI": "ICU",
                "Recuperados": "recovered",
            }
        )
        .dropna(subset=["date"])
    )

    # Confirmed cases are split across 2 columns
    confirmed_columns = ["CASOS", "PCR+"]
    for col in confirmed_columns:
        df[col] = df[col].fillna(0)
    df["confirmed"] = df.apply(lambda x: sum([x[col] for col in confirmed_columns]), axis=1)

    # Convert dates to ISO format
    df["date"] = df["date"].apply(lambda date: datetime_isoformat(date, "%d/%m/%Y"))

    # Reported cases are cumulative, compute the diff
    df = grouped_diff(df, ["subregion1_code", "date"])

    # Add the country code to all records
    df["country_code"] = "ES"

    # Country-wide is the sum of all regions
    country_level = (
        df.drop(columns=["subregion1_code"]).groupby(["date", "country_code"]).sum().reset_index()
    )
    country_level["subregion1_code"] = None
    df = concat([country_level, df])

    # Output the results
    return df
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Retrieve the CSV files from https://covid19.isciii.es
    data = (
        read_file(sources[0], error_bad_lines=False, encoding="ISO-8859-1")
        .rename(
            columns={
                "FECHA": "date",
                "CCAA": "subregion1_code",
                "Fallecidos": "deceased",
                "Hospitalizados": "hospitalized",
                "UCI": "intensive_care",
            }
        )
        .dropna(subset=["date"])
    )

    # Confirmed cases are split across 2 columns
    confirmed_columns = ["CASOS", "PCR+"]
    for col in confirmed_columns:
        data[col] = data[col].fillna(0)
    data["confirmed"] = data.apply(lambda x: sum([x[col] for col in confirmed_columns]), axis=1)

    # Convert dates to ISO format
    data["date"] = data["date"].apply(lambda date: datetime_isoformat(date, "%d/%m/%Y"))

    # Keep only the columns we can process
    data = data[["date", "subregion1_code", "confirmed", "deceased", "hospitalized", "intensive_care"]]

    # Reported cases are cumulative, compute the diff
    data = grouped_diff(data, ["subregion1_code", "date"])

    # Add the country code to all records
    data["country_code"] = "ES"

    # Output the results
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Only forward the options recognized by read_file
    read_opts = {k: v for k, v in parse_opts.items() if k in READ_OPTS}

    dataframes = {}
    date_start = parse_opts.pop("date_start", None)
    date_end = parse_opts.pop("date_end", None)
    for cache_key, cache_urls in sources.items():
        daily_data = []
        for date, url in cache_urls.items():
            # Skip snapshots outside of the requested date range
            if date_start is not None and date < date_start:
                continue
            if date_end is not None and date > date_end:
                continue
            data = read_file(url, **read_opts)
            data["date"] = date
            daily_data.append(data)
        dataframes[cache_key] = concat(daily_data)

    return self.parse_dataframes(dataframes, aux, **parse_opts)
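# Illustrative sketch (not from the source) of the `sources` shape this parser iterates over:
# a mapping of cache key to {ISO date: snapshot URL}. The key name and URLs are hypothetical.
# sources = {
#     "vaccinations": {
#         "2021-03-01": "https://example.com/cache/2021-03-01/vaccinations.json",
#         "2021-03-02": "https://example.com/cache/2021-03-02/vaccinations.json",
#     }
# }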
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    metadata = read_file(SRC / "data" / "metadata.csv")
    za = metadata[metadata["country_code"] == "ZA"]
    provinces = za[za["key"].apply(lambda x: len(x.split("_")) == 2)]
    districts = za[za["key"].apply(lambda x: len(x.split("_")) == 3)]

    url_tpl = {opt["name"]: opt["url"] for opt in fetch_opts}
    opts = {"opts": {"ignore_failure": True}}

    fetch_list = []
    for key in provinces["key"]:
        key_ = key[3:].replace("_", "-")
        fetch_list.append({"url": url_tpl["provinces"].format(key=key_), "name": key, **opts})
    for key in districts["key"]:
        key_ = key[3:].replace("_", "-")
        fetch_list.append({"url": url_tpl["districts"].format(key=key_), "name": key, **opts})

    return super().fetch(output_folder, cache, fetch_list, skip_existing=skip_existing)
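# Illustrative sketch (not from the source) of the `fetch_opts` this method expects: one entry
# per URL template, keyed by name. The URLs below are hypothetical placeholders.
# fetch_opts = [
#     {"name": "provinces", "url": "https://example.com/za/provinces/{key}.csv"},
#     {"name": "districts", "url": "https://example.com/za/districts/{key}.csv"},
# ]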
def _parse(file_path: str, sheet_name: str, value_name: str):
    data = read_file(file_path, sheet_name=sheet_name)
    data.columns = [col.replace("NHS ", "").replace(" total", "") for col in data.iloc[1]]

    # Drop Golden Jubilee National Hospital - it has no hospitalizations and does not fit
    # any current matches in metadata.csv.
    data = data.drop(columns=["Golden Jubilee National Hospital"])

    data = data.iloc[2:].rename(columns={"Date": "date"})
    data = pivot_table(data.set_index("date"), pivot_name="match_string")
    data = data.rename(columns={"value": value_name})
    data[value_name] = data[value_name].replace("*", None).apply(safe_float_cast).astype(float)

    # Get date in ISO format
    data.date = data.date.apply(lambda x: x.date().isoformat())

    # Add metadata
    data["key"] = None
    data["country_code"] = "GB"
    data["subregion1_code"] = "SCT"
    l2_mask = data.match_string == "Scotland"
    data.loc[l2_mask, "key"] = "GB_SCT"
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    metadata = aux["metadata"]
    metadata = metadata[metadata["country_code"] == "FR"]

    fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
    fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
    fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
    regions_iter = fr_codes["subregion1_code"].unique()
    deps_iter = (record for _, record in fr_codes.iterrows())

    regions = concat(list(thread_map(partial(_get_region, fr_iso_map), regions_iter)))
    departments = concat(list(thread_map(_get_department, deps_iter, total=len(fr_codes))))

    data = concat([regions, departments])
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
    return data
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    sheets = []
    sheet_processors = {
        "Trends": TexasDataSource._parse_trends,
        "Tests by day": TexasDataSource._parse_tests,
        "Hospitalization by Day": TexasDataSource._parse_hospitalized,
    }
    for sheet_name, sheet_processor in sheet_processors.items():
        df = sheet_processor(read_file(sources[0], sheet_name=sheet_name))
        df = df.dropna(subset=["date"])
        df.date = df.date.astype(str)
        df.date = df.date.apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
        sheets.append(df)

    data = table_multimerge(sheets, how="outer")
    for col in data.columns:
        if col != "date":
            data[col] = data[col].apply(safe_float_cast).astype(float)

    data["key"] = "US_TX"
    return data
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import Any, Callable, Dict, List

from pandas import DataFrame, isna
from unidecode import unidecode

from lib.cast import age_group, safe_int_cast
from lib.constants import SRC
from lib.io import read_file
from lib.utils import get_or_default

STRATIFIED_VALUES = read_file(SRC / "data" / "stratified_values.csv").set_index("type")


def _default_adapter_factory(key: str) -> Callable[[str], str]:
    mapping = {"other": f"{key}_other", "unknown": f"{key}_unknown"}
    for value, alias in STRATIFIED_VALUES.loc[key].set_index("value")["alias"].iteritems():
        mapping[value] = value
        if not isna(alias):
            mapping[alias] = value

    def default_adapter(value: str):
        if isna(value):
            return mapping["unknown"]
        value = re.sub(r"[\s\-]", "_", unidecode(str(value).lower()))
from lib.utils import drop_na_records


def snake_to_camel_case(txt: str) -> str:
    """ Used to convert V2 column names to V1 column names for backwards compatibility """
    return re.sub(r"_(\w)", lambda m: m.group(1).upper(), txt.capitalize())


if __name__ == "__main__":

    # Create the folder which will be published
    public_folder = SRC / ".." / "output" / "public"
    public_folder.mkdir(exist_ok=True, parents=True)

    # Create the v1 data.csv file
    main_table = read_file(f"{URL_OUTPUTS_PROD}/main.csv", low_memory=False)
    data = main_table[main_table.aggregation_level < 2]
    rename_columns = {
        "date": "Date",
        "key": "Key",
        "country_code": "CountryCode",
        "country_name": "CountryName",
        "subregion1_code": "RegionCode",
        "subregion1_name": "RegionName",
        "total_confirmed": "Confirmed",
        "total_deceased": "Deaths",
        "latitude": "Latitude",
        "longitude": "Longitude",
        "population": "Population",
    }
    data = data[rename_columns.keys()].rename(columns=rename_columns)
# This script must be run from /src
sys.path.append(os.getcwd())
from lib.io import read_file
from lib.utils import ROOT

# Parse arguments from the command line
argparser = ArgumentParser()
argparser.add_argument("country_code", type=str)
argparser.add_argument("--nuts-level", type=int, default=2)
argparser.add_argument("--dc-api-key", type=str, default=os.environ["DATACOMMONS_API_KEY"])
args = argparser.parse_args()

# Get the country name
aux = read_file(ROOT / "src" / "data" / "metadata.csv").set_index("key")
country_name = aux.loc[args.country_code, "country_name"]

# Convert 2-letter to 3-letter country code
iso_codes = read_file(ROOT / "src" / "data" / "country_codes.csv").set_index("key")
country_code_alpha_3 = iso_codes.loc[args.country_code, "3166-1-alpha-3"]

dc.set_api_key(args.dc_api_key)
country = "country/{}".format(country_code_alpha_3)
nuts_name = "EurostatNUTS{}".format(args.nuts_level)
regions = dc.get_places_in([country], nuts_name)[country]
names = dc.get_property_values(regions, "name")
for key, name in names.items():
    region_name = name[0]
    region_code = key.split("/")[-1][2:]
def _process_cache_file(file_map: Dict[str, str], date: str) -> DataFrame:
    data = table_rename(read_file(file_map[date]), _column_adapter, drop=True)
    data["subregion1_code"] = data["subregion1_code"].apply(
        lambda x: _ISO_CODE_MAP.get(numeric_code_as_string(x, 2) or "00")
    )
    data["date"] = date
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Use a manager to handle memory accessed across processes
    manager = Manager()

    # Get all the weather stations with data up until last month from inventory
    today = datetime.date.today()
    min_date = (today - datetime.timedelta(days=30)).strftime("%Y%m%d")
    stations = read_file(sources["inventory"]).rename(
        columns={"LAT": "lat", "LON": "lon", "ELEV(M)": "elevation"}
    )
    stations = stations[stations.END > int(min_date)]
    stations["id"] = stations["USAF"] + stations["WBAN"].apply(lambda x: f"{x:05d}")

    # Open the station data as a compressed file
    with tarfile.open(sources["gsod"], mode="r:gz") as stations_tar:

        # Build the station cache by decompressing all files in memory
        station_cache = manager.dict()
        for member in pbar(stations_tar.getmembers(), desc="Decompressing"):
            if not member.name.endswith(".csv"):
                continue

            # Read the records from the provided station
            data = read_file(
                stations_tar.extractfile(member),
                file_type="csv",
                usecols=_COLUMN_MAPPING.keys(),
            ).rename(columns=_COLUMN_MAPPING)

            # Fix data types
            data["noaa_station"] = data["noaa_station"].astype(str)
            data["rainfall"] = data["rainfall"].apply(conv_dist)
            data["snowfall"] = data["snowfall"].apply(conv_dist)
            data["dew_point"] = data["dew_point"].apply(conv_temp)
            for temp_type in ("average", "minimum", "maximum"):
                col = f"{temp_type}_temperature"
                data[col] = data[col].apply(conv_temp)

            # Compute the relative humidity from the dew point and average temperature
            data["relative_humidity"] = data.apply(
                lambda x: relative_humidity(x["average_temperature"], x["dew_point"]), axis=1
            )

            station_cache[member.name.replace(".csv", "")] = data

    # Get all the POI from metadata and go through each key
    keep_columns = ["key", "latitude", "longitude"]
    metadata = read_file(sources["geography"])[keep_columns].dropna()

    # Only use keys present in the metadata table
    metadata = metadata.merge(aux["metadata"])[keep_columns]

    # Convert all coordinates to radians
    stations["lat"] = stations["lat"].apply(math.radians)
    stations["lon"] = stations["lon"].apply(math.radians)
    metadata["lat"] = metadata["latitude"].apply(math.radians)
    metadata["lon"] = metadata["longitude"].apply(math.radians)

    # Make sure the stations and the cache are sent to each function call
    map_func = partial(_process_location, station_cache, stations)

    # We don't care about the index while iterating over each metadata item
    map_iter = (record for _, record in metadata.iterrows())

    # Bottleneck is network so we can use lots of threads in parallel
    records = process_map(map_func, map_iter, total=len(metadata))

    return concat(records)
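# Illustrative sketch (not from the source) of the named `sources` this parser reads; the
# file paths below are hypothetical placeholders for the cached downloads.
# sources = {
#     "inventory": "cache/noaa_station_inventory.csv",
#     "gsod": "cache/noaa_gsod_archive.tar.gz",
#     "geography": "cache/geography.csv",
# }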