def _download_open_data(
    logger: ErrorLogger,
    url_tpl: str,
    output_folder: Path,
    ibge_code: str,
    max_volumes: int = 12,
    **download_opts,
) -> Dict[str, str]:
    logger.log_debug(f"Downloading Brazil data for {ibge_code}...")

    # Since we are guessing the URL, we forgive errors in the download
    output = {}
    download_opts = dict(download_opts, ignore_failure=True)
    map_func = partial(download_snapshot, output_folder=output_folder, **download_opts)
    map_iter = [url_tpl.format(f"{ibge_code}-{idx + 1}") for idx in range(max_volumes)]
    for idx, file_path in enumerate(thread_map(map_func, map_iter)):
        if file_path is not None:
            output[f"{ibge_code}-{idx + 1}"] = file_path

    # Filter out empty files, which can happen if the download fails in an unexpected way
    output = {name: path for name, path in output.items() if Path(path).stat().st_size > 0}

    # If the output is not split into volumes, fall back to the single-file URL
    if output:
        return output
    else:
        url = url_tpl.format(ibge_code)
        return {ibge_code: download_snapshot(url, output_folder, **download_opts)}
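# Illustrative usage sketch (not part of the original pipeline): the helper above formats the
# URL template with "<ibge_code>-<volume>" to probe for split volumes, then falls back to the
# plain "<ibge_code>" URL. The template, output folder and IBGE code below are hypothetical.
if __name__ == "__main__":
    example_logger = ErrorLogger("brazil_example")
    example_tpl = "https://example.com/open-data/{}.csv"  # hypothetical URL template
    downloaded = _download_open_data(
        example_logger, example_tpl, Path("/tmp/snapshots"), ibge_code="3550308"
    )
    example_logger.log_debug(f"Downloaded {len(downloaded)} file(s)")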
def wikidata_property(
    prop: str,
    entities: List[str],
    query: str = _default_query,
    error_logger: ErrorLogger = ErrorLogger(),
    **tqdm_kwargs,
) -> Iterable[Tuple[str, Any]]:
    """
    Query a single property from Wikidata, and return all entities which are part of the
    provided list which contain that property.

    Arguments:
        prop: Wikidata property, for example P1082 for population.
        entities: List of Wikidata identifiers to query the desired property.
        query: [Optional] SPARQL query used to retrieve `prop`.
        error_logger: [Optional] ErrorLogger instance to use for logging.
    Returns:
        Iterable[Tuple[str, Any]]: Iterable of <Wikidata ID, property value>
    """
    # Limit parallelization to avoid hitting rate limits
    tqdm_kwargs["max_workers"] = 6
    map_func = partial(_query_property, prop, query=query, error_logger=error_logger)
    # Use `value` for the result to avoid shadowing the `prop` argument
    for entity, value in zip(entities, thread_map(map_func, entities, **tqdm_kwargs)):
        yield entity, value
def convert_tables_to_json(
    csv_folder: Path, output_folder: Path, logger: ErrorLogger = ErrorLogger()
) -> Iterable[Path]:
    def try_json_convert(schema: Dict[str, str], csv_file: Path) -> Path:
        # JSON output defaults to the same path as the CSV file but with the extension swapped
        json_output = output_folder / str(csv_file.relative_to(csv_folder)).replace(
            ".csv", ".json"
        )
        json_output.parent.mkdir(parents=True, exist_ok=True)

        # Converting to JSON is not critical and it may fail in some corner cases
        # As long as the "important" JSON files are created, this should be OK
        try:
            logger.log_debug(f"Converting {csv_file} to JSON")
            convert_csv_to_json_records(schema, csv_file, json_output)
            return json_output
        except Exception:
            error_message = f"Unable to convert CSV file {csv_file} to JSON"
            logger.log_error(error_message, traceback=traceback.format_exc())
            return None

    # Convert all CSV files to JSON using values format
    map_iter = list(csv_folder.glob("**/*.csv"))
    map_func = partial(try_json_convert, get_schema())
    for json_output in thread_map(map_func, map_iter, max_workers=2, desc="JSON conversion"):
        if json_output is not None:
            yield json_output
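# Illustrative usage sketch (not part of the original module): convert every CSV table under a
# folder into JSON records and collect the successfully converted paths. The folder names below
# are hypothetical placeholders.
if __name__ == "__main__":
    converted = list(convert_tables_to_json(Path("output/tables"), Path("output/json")))
    print(f"Converted {len(converted)} tables to JSON")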
def _query_property(
    prop: str, entity: str, query: str = _default_query, error_logger: ErrorLogger = None
) -> Any:
    # Time to wait before retry in case of failure
    wait_time = 8

    # Build the query from template
    query = query.format(prop=prop, entity=entity)

    # Keep trying the request until it succeeds, or _max_retries is reached
    for i in range(_max_retries):
        response = None
        try:
            params = {"query": query, "format": "json"}
            response = requests.get(_wikidata_url, headers=_request_header, params=params)
            data = response.json()

            # Return the first binding available (there should be only one)
            for item in data["results"]["bindings"]:
                return item["prop"]["value"]

        except Exception as exc:
            # If the retry limit is reached, log the error
            if i + 1 >= _max_retries:
                if error_logger is not None:
                    error_logger.log_error(response.text if response is not None else str(exc))
                else:
                    traceback.print_exc()
            # Otherwise use exponential backoff before retrying
            else:
                sleep(wait_time)
                wait_time *= 2

    return None
def wikidata_property(
    prop: str,
    entities: List[str],
    query_template: str = _WD_QUERY,
    logger: ErrorLogger = ErrorLogger(),
    offset: int = 0,
    **tqdm_kwargs,
) -> Any:
    """
    Query a single property from Wikidata, and return all entities which are part of the
    provided list which contain that property.

    Arguments:
        prop: Wikidata property, for example P1082 for population.
        entities: List of Wikidata identifiers to query the desired property.
        query_template: [Optional] SPARQL query used to retrieve `prop`.
        logger: [Optional] ErrorLogger instance to use for logging.
        offset: [Optional] Number of items to skip in the result set.
    Returns:
        Iterable[Tuple[str, Any]]: Iterable of <Wikidata ID, property value>
    """
    # Time to wait before retry in case of failure
    wait_time = _INIT_WAIT_TIME

    # Build the query from template
    tpl = query_template + " LIMIT {limit} OFFSET {offset}"
    query = tpl.format(prop=prop, limit=_LIMIT, offset=offset)

    # Keep trying the request until it succeeds, or _MAX_RETRIES is reached
    for i in range(_MAX_RETRIES):
        response = None
        try:
            start_time = time.monotonic()
            params = {"query": query, "format": "json"}
            req_opts = dict(headers=_REQUEST_HEADER, params=params, timeout=_WD_TIMEOUT)
            response = requests.get(_WD_URL, **req_opts)
            elapsed_time = time.monotonic() - start_time
            log_opts = dict(status=response.status_code, url=_WD_URL, time=elapsed_time, **params)
            logger.log_info("Wikidata SPARQL server response", **log_opts)
            data = response.json()

            # Yield the values for all bindings whose entity is part of the requested list
            for item in pbar(data["results"]["bindings"], **tqdm_kwargs):
                pid = item["pid"]["value"].split("/")[-1]
                if pid in entities:
                    yield pid, item["prop"]["value"]

            # If we got exactly `_LIMIT` results, keep increasing the offset until we run out of results
            if len(data["results"]["bindings"]) == _LIMIT:
                yield from wikidata_property(
                    prop,
                    entities,
                    query_template=query_template,
                    logger=logger,
                    offset=offset + _LIMIT,
                    **tqdm_kwargs,
                )

            # If no exceptions were thrown, we have reached the end
            logger.log_info("Wikidata SPARQL results end reached")
            return

        except Exception as exc:
            # If we have reached the retry limit, log and re-raise the error
            if i + 1 >= _MAX_RETRIES:
                msg = response.text if response is not None else "Unknown error"
                logger.log_error(msg, exc=exc, traceback=traceback.format_exc())
                raise exc

            # Otherwise use exponential backoff before retrying
            logger.log_info(f"({i + 1}) Request error. Retry in {wait_time} seconds...", exc=exc)
            time.sleep(wait_time)
            wait_time *= 2
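# Illustrative usage sketch (not part of the original module): the generator above yields
# <Wikidata ID, value> pairs and recursively paginates by `_LIMIT` until the result set is
# exhausted, so it can be collected directly into a dictionary. The entity IDs below are
# hypothetical placeholders.
if __name__ == "__main__":
    example_entities = ["Q2807", "Q1492"]  # hypothetical list of Wikidata identifiers
    population = dict(wikidata_property("P1082", example_entities, desc="Population"))
    print(population)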
    GCP_SELF_DESTRUCT_SCRIPT,
    GCS_BUCKET_PROD,
    GCS_BUCKET_TEST,
    SRC,
    V3_TABLE_LIST,
)
from lib.error_logger import ErrorLogger
from lib.gcloud import delete_instance, get_internal_ip, start_instance_from_image
from lib.io import export_csv, gzip_file, temporary_directory
from lib.memory_efficient import table_read_column
from lib.net import download
from lib.pipeline import DataPipeline
from lib.pipeline_tools import get_table_names

app = Flask(__name__)
logger = ErrorLogger("appengine")

BLOB_OP_MAX_RETRIES = 10
ENV_TOKEN = "GCP_TOKEN"
ENV_PROJECT = "GOOGLE_CLOUD_PROJECT"
ENV_SERVICE_ACCOUNT = "GCS_SERVICE_ACCOUNT"
COMPRESS_EXTENSIONS = ("json",)

# Used when parsing string parameters into boolean type
BOOL_STRING_MAP = {
    "true": True,
    "false": False,
    "1": True,
    "0": False,
    "": False,
    "null": False,
}
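# Illustrative sketch (not part of the original module): BOOL_STRING_MAP is intended for turning
# string request parameters into booleans. A hypothetical helper using it could look like this;
# `_parse_bool_param` is not a function from the original codebase.
def _parse_bool_param(value: str) -> bool:
    # Unknown or missing values default to False rather than raising, mirroring the map above
    return BOOL_STRING_MAP.get((value or "").strip().lower(), False)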
    table_breakout,
    table_concat,
    table_cross_product,
    table_drop_nan_columns,
    table_grouped_tail,
    table_join,
    table_merge,
    table_read_column,
    table_rename,
    table_sort,
)
from lib.pipeline_tools import get_schema
from lib.time import date_range

# Used for debugging purposes
_logger = ErrorLogger("publish")


def _subset_grouped_key(
    main_table_path: Path, output_folder: Path, desc: str = None
) -> Iterable[Path]:
    """ Outputs subsets of the table containing only the records for a particular key """

    # Read the header of the main file to get the columns
    with open(main_table_path, "r") as fd:
        header = next(fd)

    # Do a first sweep to get the number of keys so we can accurately report progress
    key_set = set()
    for line in read_lines(main_table_path, skip_empty=True):
        key, data = line.split(",", 1)
def make_main_table(
    tables_folder: Path, output_path: Path, logger: ErrorLogger = ErrorLogger()
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>, and write the
    result to `output_path`.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Output path for the flat table with all data combined.
        logger: [Optional] ErrorLogger instance to use for logging.
    """

    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        logger.log_info("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        date_list = date_range("2020-01-01", max_date)
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        logger.log_info("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        logger.log_info("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"], main_table_path, how="outer")
        logger.log_info("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")], desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on, temp_file_path, how="outer")
                shutil.move(temp_file_path, main_table_path)
                logger.log_info(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write to the output location
        table_sort(main_table_path, output_path)
        logger.log_info("Sorted main table")
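# Illustrative usage sketch (not part of the original module): build the combined main table
# from a folder of per-table CSV files. The paths below are hypothetical placeholders.
if __name__ == "__main__":
    make_main_table(Path("output/tables"), Path("output/main.csv"), logger=ErrorLogger("publish"))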