def parse(self, sources: Dict[Any, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    url_tpl = sources[0]
    metadata = aux["metadata"]
    metadata = metadata[metadata["country_code"] == "FR"]

    fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
    fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
    fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
    regions_iter = fr_codes["subregion1_code"].unique()
    deps_iter = [record for _, record in fr_codes.iterrows()]

    column_adapter = {
        "key": "key",
        "date": "date",
        "testsRealisesDetails": "_breakdown_tested",
        "testsPositifsDetails": "_breakdown_confirmed",
    }

    # Get country level data
    country = _get_country(url_tpl, column_adapter)

    # Get region level data
    get_region_func = partial(_get_region, url_tpl, column_adapter, fr_iso_map)
    regions = concat(list(thread_map(get_region_func, regions_iter)))

    # Get department level data
    get_department_func = partial(_get_department, url_tpl, column_adapter)
    departments = concat(list(thread_map(get_department_func, deps_iter)))

    data = concat([country, regions, departments])
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))

    data["_breakdown_tested"].fillna("", inplace=True)
    data["_breakdown_confirmed"].fillna("", inplace=True)

    records: Dict[str, List] = {"confirmed": [], "tested": []}
    for key, row in data.set_index("key").iterrows():
        for statistic in records.keys():
            if row[f"_breakdown_{statistic}"] != "":
                for item in row[f"_breakdown_{statistic}"]:
                    records[statistic].append(
                        {
                            "key": key,
                            "date": row["date"],
                            "age": item["age"],
                            "sex": item.get("sexe"),
                            f"new_{statistic}": item["value"],
                        }
                    )

    df1 = DataFrame.from_records(records["tested"])
    df2 = DataFrame.from_records(records["confirmed"])
    data = df1.merge(df2, how="outer")

    data = data[~data["age"].isin(["0", "A", "B", "C", "D", "E"])]
    data["age"] = data["age"].apply(lambda x: age_group(safe_int_cast(x)))

    sex_adapter = lambda x: {"h": "male", "f": "female"}.get(x, "sex_unknown")
    data["sex"] = data["sex"].apply(sex_adapter)

    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    url_tpl = sources[0]
    metadata = aux["metadata"]
    metadata = metadata[metadata["country_code"] == "FR"]

    fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
    fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
    fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
    regions_iter = fr_codes["subregion1_code"].unique()
    deps_iter = (record for _, record in fr_codes.iterrows())

    if parse_opts.get("country"):
        data = _get_country(url_tpl)
    else:
        get_region_func = partial(_get_region, url_tpl, fr_iso_map)
        regions = concat(list(thread_map(get_region_func, regions_iter)))

        get_department_func = partial(_get_department, url_tpl)
        departments = concat(list(thread_map(get_department_func, deps_iter, total=len(fr_codes))))

        data = concat([regions, departments])

    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts):
    url_tpl = sources[0]["url"]

    # Some states cannot be found in the dataset
    states_banlist = [
        "American Samoa",
        "District of Columbia",
        "Guam",
        "Northern Mariana Islands",
        "Puerto Rico",
        "Virgin Islands",
    ]

    states = aux["metadata"]
    states = states.loc[states["country_code"] == "US", "subregion1_name"].dropna().unique()
    states = [state for state in states if state not in states_banlist]

    states_url = [
        url_tpl.format(
            subregion1_name_path=state_name.replace(" ", "%20"),
            subregion1_name_file=state_name.replace(" ", "_"),
        )
        for state_name in states
    ]

    dataframes = {idx: df for idx, df in enumerate(thread_map(read_file, states_url))}
    return self.parse_dataframes(dataframes, aux, **parse_opts)
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Data can only be retrieved one day at a time, and it starts on 2020-01-22
    first = "2020-01-22"
    today = datetime.now().date().isoformat()
    map_iter = [str(date)[:10] for date in date_range(first, today)]
    records = sum(thread_map(_get_daily_records, map_iter), [])
    return DataFrame.from_records(records)
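# Note on the pattern above: `sum(list_of_lists, [])` concatenates the per-day record lists
# returned by each worker into a single flat list. Below is a minimal, self-contained sketch of
# the same fan-out/flatten idiom; the `fetch_day` helper and its return values are hypothetical
# and only illustrate the shape of the data, not the real API call.
from tqdm.contrib.concurrent import thread_map


def fetch_day(date: str) -> list:
    # Placeholder for a network call that returns a list of records for one day
    return [{"date": date, "value": 1}]


dates = ["2020-01-22", "2020-01-23", "2020-01-24"]
# thread_map returns one list per date; sum(..., []) flattens them into a single record list
records = sum(thread_map(fetch_day, dates), [])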
def _download_open_data(
    logger: ErrorLogger,
    url_tpl: str,
    output_folder: Path,
    ibge_code: str,
    max_volumes: int = 12,
    **download_opts,
) -> Dict[str, str]:
    logger.log_debug(f"Downloading Brazil data for {ibge_code}...")

    # Since we are guessing the URL, we forgive errors in the download
    output = {}
    download_opts = dict(download_opts, ignore_failure=True)
    map_func = partial(download_snapshot, output_folder=output_folder, **download_opts)
    map_iter = [url_tpl.format(f"{ibge_code}-{idx + 1}") for idx in range(max_volumes)]
    for idx, file_path in enumerate(thread_map(map_func, map_iter)):
        if file_path is not None:
            output[f"{ibge_code}-{idx + 1}"] = file_path

    # Filter out empty files, which can happen if download fails in an unexpected way
    output = {name: path for name, path in output.items() if Path(path).stat().st_size > 0}

    # If the output is not split into volumes, fall back to single file URL
    if output:
        return output
    else:
        url = url_tpl.format(ibge_code)
        return {ibge_code: download_snapshot(url, output_folder, **download_opts)}
def download_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _download_blob(local_folder: Path, blob: Blob) -> None:
        # Remove the prefix from the remote path
        rel_path = blob.name.split(f"{remote_path}/", 1)[-1]
        if filter_func is None or filter_func(Path(rel_path)):
            print(f"Downloading {rel_path} to {local_folder}/")
            file_path = local_folder / rel_path
            file_path.parent.mkdir(parents=True, exist_ok=True)
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    return blob.download_to_filename(str(file_path))
                except Exception as exc:
                    traceback.print_exc()
                    # Exponential back-off
                    time.sleep(2 ** i)
            raise IOError(f"Error downloading {rel_path}")

    map_func = partial(_download_blob, local_folder)
    _ = thread_map(map_func, bucket.list_blobs(prefix=remote_path), total=None, disable=True)
    list(_)  # consume the results
def upload_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _upload_file(remote_path: str, file_path: Path):
        target_path = file_path.relative_to(local_folder)
        if filter_func is None or filter_func(target_path):
            print(f"Uploading {target_path} to {remote_path}/")
            blob = bucket.blob(os.path.join(remote_path, target_path))
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    return blob.upload_from_filename(str(file_path))
                except Exception as exc:
                    traceback.print_exc()
                    # Exponential back-off
                    time.sleep(2 ** i)
            raise IOError(f"Error uploading {target_path}")

    map_func = partial(_upload_file, remote_path)
    _ = thread_map(map_func, local_folder.glob("**/*.*"), total=None, disable=True)
    list(_)  # consume the results
def cache_pull() -> str:
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        now = datetime.datetime.utcnow()
        output_folder = workdir / now.strftime("%Y-%m-%d-%H")
        output_folder.mkdir(parents=True, exist_ok=True)

        def _pull_source(cache_source: Dict[str, str]):
            url = cache_source.pop("url")
            output = cache_source.pop("output")
            buffer = BytesIO()
            try:
                download(url, buffer)
                with (output_folder / output).open("wb") as fd:
                    fd.write(buffer.getvalue())
            except:
                print(f"Cache pull failed for {url}")
                traceback.print_exc()

        # Pull each of the sources from the cache config
        with (SRC / "cache" / "config.json").open("r") as fd:
            cache_list = json.load(fd)
        list(thread_map(_pull_source, cache_list))

        # Upload all cached data to the bucket
        upload_folder(GCS_BUCKET_PROD, "cache", workdir)

        # Build the sitemap for all cached files
        print("Building sitemap")
        sitemap = cache_build_map()
        bucket = get_storage_bucket(GCS_BUCKET_PROD)
        blob = bucket.blob("cache/sitemap.json")
        blob.upload_from_string(json.dumps(sitemap))

    return "OK"
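# Hedged sketch of the shape of SRC / "cache" / "config.json" consumed by cache_pull() above:
# a JSON list of entries, each carrying at least a "url" and an "output" field, which is all
# that _pull_source pops. The values below are hypothetical, purely for illustration.
example_cache_list = [
    {"url": "https://example.com/source.csv", "output": "example_source.csv"},
    {"url": "https://example.com/source.json", "output": "example_source.json"},
]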
def upload_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _upload_file(remote_path: str, file_path: Path):
        target_path = file_path.relative_to(local_folder)
        if filter_func is None or filter_func(target_path):
            logger.log_debug(f"Uploading {target_path} to {remote_path}/")
            blob = bucket.blob(os.path.join(remote_path, target_path))
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    return blob.upload_from_filename(str(file_path))
                except Exception as exc:
                    log_message = f"Error uploading {target_path}."
                    logger.log_warning(log_message, traceback=traceback.format_exc())
                    # Exponential back-off
                    time.sleep(2 ** i)

            # If error persists, there must be something wrong with the network so we are better
            # off crashing the appengine server.
            error_message = f"Error uploading {target_path}"
            logger.log_error(error_message)
            raise IOError(error_message)

    map_func = partial(_upload_file, remote_path)
    map_iter = local_folder.glob("**/*.*")
    list(thread_map(map_func, map_iter, total=None, disable=True, max_workers=8))
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Data can only be retrieved one day at a time, and it starts on 2020-01-22
    first = "2020-01-22"
    map_iter = list(date_range(first, date_today()))
    records = sum(thread_map(_get_daily_records, map_iter), [])
    return DataFrame.from_records(records)
def wikidata_property(
    prop: str,
    entities: List[str],
    query: str = _default_query,
    error_logger: ErrorLogger = None,
    **tqdm_kwargs,
) -> Iterable[Tuple[str, Any]]:
    """
    Query a single property from Wikidata, and return all entities which are part of the provided
    list which contain that property.

    Arguments:
        prop: Wikidata property, for example P1082 for population.
        entities: List of Wikidata identifiers to query the desired property.
        query: [Optional] SPARQL query used to retrieve `prop`.
        error_logger: [Optional] ErrorLogger instance to use for logging.
    Returns:
        Iterable[Tuple[str, Any]]: Iterable of <Wikidata ID, property value>
    """
    # Limit parallelization to avoid hitting rate limits
    tqdm_kwargs["max_workers"] = 6
    map_func = partial(_query_property, prop, query=query, error_logger=error_logger)
    for entity, prop in zip(entities, thread_map(map_func, entities, **tqdm_kwargs)):
        yield entity, prop
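# Hedged usage sketch for wikidata_property() above, based only on its docstring: query the
# population property (P1082) for a handful of Wikidata entity IDs and keep the values that
# were found. The entity IDs below are illustrative examples; any iterable of Wikidata
# identifiers works. Assumes wikidata_property (and its _default_query) are in scope as defined
# above; extra keyword arguments such as `desc` are forwarded to thread_map.
population = {
    entity: value
    for entity, value in wikidata_property("P1082", ["Q30", "Q142"], desc="Population")
    if value is not None
}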
def convert_tables_to_json(csv_folder: Path, output_folder: Path) -> Iterable[Path]:
    def try_json_covert(schema: Dict[str, str], csv_file: Path) -> Path:
        # JSON output defaults to same as the CSV file but with extension swapped
        json_output = output_folder / str(csv_file.relative_to(csv_folder)).replace(".csv", ".json")
        json_output.parent.mkdir(parents=True, exist_ok=True)

        # Converting to JSON is not critical and it may fail in some corner cases
        # As long as the "important" JSON files are created, this should be OK
        try:
            print(f"Converting {csv_file} to JSON")
            convert_csv_to_json_records(schema, csv_file, json_output)
            return json_output
        except Exception as exc:
            print(f"Unable to convert CSV file {csv_file} to JSON: {exc}", file=sys.stderr)
            traceback.print_exc()
            return None

    # Convert all CSV files to JSON using values format
    map_iter = list(csv_folder.glob("**/*.csv"))
    map_func = partial(try_json_covert, get_schema())
    for json_output in thread_map(map_func, map_iter, max_workers=2, desc="JSON conversion"):
        if json_output is not None:
            yield json_output
def download_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _download_blob(local_folder: Path, blob: Blob) -> None:
        # Remove the prefix from the remote path
        rel_path = blob.name.split(f"{remote_path}/", 1)[-1]
        if filter_func is None or filter_func(Path(rel_path)):
            logger.log_debug(f"Downloading {rel_path} to {local_folder}/")
            file_path = local_folder / rel_path
            file_path.parent.mkdir(parents=True, exist_ok=True)
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    return blob.download_to_filename(str(file_path))
                except Exception as exc:
                    log_message = f"Error downloading {rel_path}."
                    logger.log_warning(log_message, traceback=traceback.format_exc())
                    # Exponential back-off
                    time.sleep(2 ** i)

            # If error persists, there must be something wrong with the network so we are better
            # off crashing the appengine server.
            error_message = f"Error downloading {rel_path}"
            logger.log_error(error_message)
            raise IOError(error_message)

    map_func = partial(_download_blob, local_folder)
    map_iter = bucket.list_blobs(prefix=remote_path)
    list(thread_map(map_func, map_iter, total=None, disable=True, max_workers=8))
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    url_tpl = sources[0]
    metadata = aux["metadata"]
    metadata = metadata[metadata["country_code"] == "FR"]

    fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
    fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
    fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
    regions_iter = fr_codes["subregion1_code"].unique()
    deps_iter = [record for _, record in fr_codes.iterrows()]

    # For country level, there is no need to estimate confirmed from tests
    column_adapter_country = dict(_column_adapter)
    column_adapter_country.pop("testsPositifs")

    # Get country level data
    country = _get_country(url_tpl, column_adapter_country)

    # Country level data has totals instead of diffs, so we compute the diffs by hand
    country.sort_values("date", inplace=True)
    country["new_confirmed"] = country["total_confirmed"].diff()
    country.drop(columns=["total_confirmed"], inplace=True)

    # For region level, we can only estimate confirmed from tests
    column_adapter_region = dict(_column_adapter)
    column_adapter_region.pop("casConfirmes")

    # Get region level data
    get_region_func = partial(_get_region, url_tpl, column_adapter_region, fr_iso_map)
    regions = concat(list(thread_map(get_region_func, regions_iter)))

    # Get department level data
    get_department_func = partial(_get_department, url_tpl, column_adapter_region)
    departments = concat(list(thread_map(get_department_func, deps_iter)))

    data = concat([country, regions, departments])
    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
    return data.sort_values("date")
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    google_keys = aux["google_key_map"].set_index("google_location_key")["key"].to_dict()
    data = concat(thread_map(_process_chunk, dataframes.values(), total=len(dataframes)))
    data[["key"]].drop_duplicates().to_csv("google_keys.csv", index=False)
    data["key"] = data["key"].apply(lambda x: google_keys.get(x, x))
    return data.dropna(subset=["key"])
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Ignore sources, we use an API for this data source
    keys = aux["metadata"].query('country_code == "RU"').key
    keys = [key for key in keys.values if len(key.split("_")) == 2]
    data = DataFrame.from_records(sum(thread_map(_get_province_records, keys), []))
    data = data[["key", "date", "total_confirmed", "total_deceased", "total_recovered"]]
    return data
def parse(self, sources: Dict[Any, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Get a list of all keys to query the API with
    keys = aux["metadata"].query('country_code == "RU"').key
    keys = [key for key in keys.values if len(key.split("_")) == 2]
    map_func = partial(_get_province_records, sources[0])
    data = DataFrame.from_records(sum(thread_map(map_func, keys), []))
    data = data[["key", "date", "total_confirmed", "total_deceased", "total_recovered"]]
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    url_tpl = sources[0]
    metadata = aux["metadata"]
    metadata = metadata[metadata["country_code"] == "FR"]

    fr_isos = read_file(SRC / "data" / "fr_iso_codes.csv")
    fr_iso_map = {iso: code for iso, code in zip(fr_isos["iso_code"], fr_isos["region_code"])}
    fr_codes = metadata[["subregion1_code", "subregion2_code"]].dropna()
    regions_iter = fr_codes["subregion1_code"].unique()
    deps_iter = (record for _, record in fr_codes.iterrows())

    if parse_opts.get("country"):
        # For country level, there is no need to estimate confirmed from tests
        _column_adapter_2 = dict(_column_adapter)
        _column_adapter_2.pop("testsPositifs")
        data = _get_country(url_tpl, _column_adapter_2)
    else:
        # For region level, we can only estimate confirmed from tests
        _column_adapter_2 = dict(_column_adapter)
        _column_adapter_2.pop("casConfirmes")

        get_region_func = partial(_get_region, url_tpl, _column_adapter_2, fr_iso_map)
        regions = concat(list(thread_map(get_region_func, regions_iter)))

        get_department_func = partial(_get_department, url_tpl, _column_adapter_2)
        departments = concat(list(thread_map(get_department_func, deps_iter, total=len(fr_codes))))

        data = concat([regions, departments])

    data["date"] = data["date"].apply(lambda x: datetime_isoformat(x, "%Y-%m-%d %H:%M:%S"))
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Ignore sources, we use an API for this data source
    url_tpl = sources[0]
    keys = aux["metadata"].query('(country_code == "ID") & subregion1_code.notna()')["key"]
    keys = [key for key in keys.values if len(key.split("_")) == 2 and len(key) == 5]
    map_func = partial(_get_province_records, url_tpl)
    data = DataFrame.from_records(sum(thread_map(map_func, keys), []))
    return data
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = aux["knowledge_graph"].merge(aux["metadata"])[["key", "wikidata"]]

    # Load wikidata using parallel processing
    map_func = partial(self._process_item, parse_opts)
    map_iter = data.dropna().set_index("key")["wikidata"].iteritems()
    records = thread_map(map_func, list(map_iter), desc="Wikidata Properties")

    # Return all records in DataFrame form
    return DataFrame.from_records(records)
def get_source_outputs(data_pipelines: Iterable[DataPipeline]) -> Iterable[Dict]:
    """Map a list of pipeline names to their source configs."""
    for data_pipeline in tqdm(list(data_pipelines)):
        # print(f"Processing {data_pipeline.name}")
        map_iter = data_pipeline.data_sources
        map_func = partial(read_source_output, data_pipeline)
        map_opts = dict(desc="Downloading data tables", leave=False)
        yield from thread_map(map_func, map_iter, **map_opts)
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    # Keep only dataframes which have data available in metadata
    keys = aux["metadata"]["key"]
    has_state = lambda state: keys.apply(lambda x: x.startswith(f"US_{state}")).any()
    dataframes = {state: df for state, df in dataframes.items() if has_state(state)}

    # Parallelize the work and process each state separately to speed up the work
    map_opts = dict(total=len(dataframes), desc="Processing states")
    return concat(thread_map(_process_state, dataframes.values(), **map_opts))
def _get_data(
    url_tpl: str,
    subregion_code_col: str,
    subregion_code_to_api_id_map: Dict[str, int],
    subregions: DataFrame,
) -> DataFrame:
    subregion_codes = subregions[subregion_code_col].values
    map_func = partial(_get_records, url_tpl, subregion_code_to_api_id_map)
    data = DataFrame.from_records(sum(thread_map(map_func, subregion_codes), []))
    data["date"] = data.apply(lambda r: _indonesian_date_to_isoformat(r.tgl), axis=1)

    # Add location keys
    data = table_merge(
        [data, subregions], left_on="subregion_code", right_on=subregion_code_col, how="left"
    )
    data = table_rename(data, _col_name_map, drop=True)
    return data
def _test_data_pipeline(pipeline_name: str, random_seed: int = 0):
    # Load the data pipeline to get the number of data sources
    data_pipeline = DataPipeline.load(pipeline_name)

    # Iterate over each data source and run it to get its output
    pipeline_count = len(data_pipeline.data_sources)
    map_func = partial(_test_data_source, pipeline_name, random_seed=random_seed)
    _ = thread_map(map_func, range(pipeline_count), total=pipeline_count, max_workers=4)

    # Consume the results
    list(_)
def parse_dataframes(self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts):
    # Get all the weather stations with data up until 2020
    stations = read_csv(
        _INVENTORY_URL,
        sep=r"\s+",
        names=("id", "lat", "lon", "measurement", "year_start", "year_end"),
    )
    stations = stations[stations.year_end == 2020][["id", "lat", "lon", "measurement"]]

    # Filter stations that at least provide max and min temps
    measurements = ["TMIN", "TMAX"]
    stations = stations.groupby(["id", "lat", "lon"]).agg(lambda x: "|".join(x))
    stations = stations[stations.measurement.apply(lambda x: all(m in x for m in measurements))]
    stations = stations.reset_index()

    # Get all the POI from metadata and go through each key
    keep_columns = ["key", "latitude", "longitude"]
    metadata = dataframes[0][keep_columns].dropna()

    # Only use keys present in the metadata table
    metadata = metadata.merge(aux["metadata"])[keep_columns]

    # Convert all coordinates to radians
    stations["lat"] = stations.lat.apply(math.radians)
    stations["lon"] = stations.lon.apply(math.radians)
    metadata["lat"] = metadata.latitude.apply(math.radians)
    metadata["lon"] = metadata.longitude.apply(math.radians)

    # Use a cache to avoid having to query the same station multiple times
    station_cache: Dict[str, DataFrame] = {}

    # Make sure the stations and the cache are sent to each function call
    map_func = partial(NoaaGhcnDataSource.station_records, station_cache, stations)

    # We don't care about the index while iterating over each metadata item
    map_iter = [record for _, record in metadata.iterrows()]

    # Shuffle the iterables to try to make better use of the caching
    shuffle(map_iter)

    # Bottleneck is network so we can use lots of threads in parallel
    records = thread_map(map_func, map_iter, total=len(metadata))

    return concat(records)
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = aux["knowledge_graph"].merge(aux["metadata"])[["key", "wikidata"]]
    entities = data.dropna().set_index("wikidata")

    # Load wikidata using parallel processing
    wikidata_props = {v: k for k, v in parse_opts.items()}
    map_func = partial(self._process_item, entities.index)
    for _, values in thread_map(map_func, wikidata_props.keys(), desc="Wikidata Properties"):
        values = ((x[0], *(x[1].split(",", 2) if x[1] else (None, None))) for x in values)
        df = DataFrame.from_records(values, columns=["wikidata", "latitude", "longitude"])
        entities = entities.join(df.set_index("wikidata"), how="outer")

    # Return all records in DataFrame form
    return entities
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts):
    buffer = BytesIO()
    download(sources[0], buffer, progress=True)

    data = None
    with zipfile.ZipFile(buffer) as zipped:
        data = zipped.read("WDIData.csv")
        data = read_csv(BytesIO(data))
    assert data is not None

    data = data.rename(
        columns={
            "Country Code": "3166-1-alpha-3",
            "Indicator Name": "indicator_name",
            "Indicator Code": "indicator_code",
        }
    )
    data = data.merge(aux["worldbank_indicators"]).merge(aux["country_codes"])
    data = data.drop(columns=["Country Name", "3166-1-alpha-2", "3166-1-alpha-3"])

    indicators = parse_opts.get("indicators", {code: code for code in data.indicator_code.values})
    min_year = int(parse_opts.get("min_year", 2015))
    data = data[data.indicator_code.isin(indicators.values())]

    # Index data by indicator code for performance optimization
    keys = data.key.unique()
    indexed = {key: data[data.key == key].set_index("indicator_code") for key in keys}

    # There is probably a fancy pandas function to do this more efficiently but this works for now
    map_func = partial(WorldbankDataSource._process_record, indexed, indicators, min_year)
    records = thread_map(map_func, keys, desc="WorldBank Indicators")

    # Some countries are better described as subregions
    data = DataFrame.from_records(records)
    data.loc[data.key == "MF", "key"] = "FR_MF"

    # Return all records in DataFrame form
    return data
def upload_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _upload_file(remote_path: str, file_path: Path):
        target_path = file_path.relative_to(local_folder)
        if filter_func is None or filter_func(target_path):
            logger.log_debug(f"Uploading {target_path} to {remote_path}/")
            blob = bucket.blob(os.path.join(remote_path, target_path))
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    name, suffix = file_path.name, file_path.suffix

                    # If it's an extension we should compress, upload compressed file
                    if suffix[1:] in COMPRESS_EXTENSIONS:
                        with temporary_directory() as workdir:
                            gzipped_file = workdir / name
                            gzip_file(file_path, gzipped_file)
                            blob.content_encoding = "gzip"
                            return blob.upload_from_filename(gzipped_file)

                    # Otherwise upload the file as-is
                    else:
                        return blob.upload_from_filename(file_path)

                except Exception as exc:
                    log_message = f"Error uploading {target_path}."
                    logger.log_warning(log_message, traceback=traceback.format_exc())
                    # Exponential back-off
                    time.sleep(2 ** i)

            # If error persists, there must be something wrong with the network so we are better
            # off crashing the appengine server.
            error_message = f"Error uploading {target_path}"
            logger.log_error(error_message)
            raise IOError(error_message)

    map_func = partial(_upload_file, remote_path)
    map_iter = local_folder.glob("**/*.*")
    list(thread_map(map_func, map_iter, total=None, disable=True, max_workers=8))
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    output = {}
    download_options = dict(fetch_opts[0], skip_existing=skip_existing)
    url_tpl = download_options.pop("url")
    map_opts = dict(desc="Downloading Brazil Open Data")
    map_iter = [code.lower() for code in _IBGE_STATES.values()]
    map_func = partial(_download_open_data, self, url_tpl, output_folder, **download_options)
    for partial_output in thread_map(map_func, map_iter, **map_opts):
        output.update(partial_output)
    return output
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    data = aux["knowledge_graph"].merge(aux["metadata"])[["key", "wikidata"]]
    entities = data.dropna().set_index("wikidata")

    # Load wikidata using parallel processing
    wikidata_props = {v: k for k, v in parse_opts.items()}
    map_func = partial(self._process_item, entities.index)
    map_opts = dict(desc="Wikidata Properties", total=len(wikidata_props))
    for _, values in thread_map(map_func, wikidata_props.keys(), **map_opts):
        values = _extract_coordinates(values)
        df = DataFrame.from_records(values, columns=["wikidata", "latitude", "longitude"])
        entities = entities.join(df.set_index("wikidata"), how="outer")

    # Return all records in DataFrame form
    return entities