Example 1
def _download_open_data(
    logger: ErrorLogger,
    url_tpl: str,
    output_folder: Path,
    ibge_code: str,
    max_volumes: int = 12,
    **download_opts,
) -> Dict[str, str]:
    logger.log_debug(f"Downloading Brazil data for {ibge_code}...")

    # Since we are guessing the URL, we forgive errors in the download
    output = {}
    download_opts = dict(download_opts, ignore_failure=True)
    map_func = partial(download_snapshot, output_folder=output_folder, **download_opts)
    map_iter = [url_tpl.format(f"{ibge_code}-{idx + 1}") for idx in range(max_volumes)]
    for idx, file_path in enumerate(thread_map(map_func, map_iter)):
        if file_path is not None:
            output[f"{ibge_code}-{idx + 1}"] = file_path

    # Filter out empty files, which can happen if download fails in an unexpected way
    output = {name: path for name, path in output.items() if Path(path).stat().st_size > 0}

    # If the output is not split into volumes, fall back to single file URL
    if output:
        return output
    else:
        url = url_tpl.format(ibge_code)
        return {ibge_code: download_snapshot(url, output_folder, **download_opts)}
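
A minimal sketch of how this helper might be invoked from a data source's fetch step; the URL template, IBGE code, and output folder below are illustrative assumptions, not values taken from the repository.

from pathlib import Path

from lib.error_logger import ErrorLogger

# Hypothetical URL template and IBGE code, for illustration only
url_tpl = "https://example.org/open-data/{}.csv"
logger = ErrorLogger("brazil")

downloaded = _download_open_data(logger, url_tpl, Path("snapshot"), ibge_code="35")
for name, file_path in downloaded.items():
    logger.log_debug(f"Downloaded {name} to {file_path}")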
Example 2
def wikidata_property(prop: str,
                      entities: List[str],
                      query: str = _default_query,
                      error_logger: ErrorLogger = ErrorLogger(),
                      **tqdm_kwargs) -> Iterable[Tuple[str, Any]]:
    """
    Query a single property from Wikidata, and return all entities from the provided list which
    contain that property.

    Arguments:
        prop: Wikidata property, for example P1082 for population.
        entities: List of Wikidata identifiers to query the desired property.
        query: [Optional] SPARQL query used to retrieve `prop`.
        error_logger: [Optional] ErrorLogger instance to use for logging.
    Returns:
        Iterable[Tuple[str, Any]]: Iterable of <Wikidata ID, property value>
    """
    # Limit parallelization to avoid hitting rate limits
    tqdm_kwargs["max_workers"] = 6
    map_func = partial(_query_property,
                       prop,
                       query=query,
                       error_logger=error_logger)
    for entity, value in zip(entities,
                             thread_map(map_func, entities, **tqdm_kwargs)):
        yield entity, value
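
A hedged usage sketch: querying the population property (P1082) for two Wikidata entities. Q155 (Brazil) and Q96 (Mexico) are real Wikidata identifiers used here purely as examples, and `desc` is forwarded to the underlying progress bar.

entities = ["Q155", "Q96"]
for entity, value in wikidata_property("P1082", entities, desc="Population"):
    if value is not None:
        print(f"{entity}: {value}")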
Example 3
def convert_tables_to_json(
    csv_folder: Path, output_folder: Path,
    logger: ErrorLogger = ErrorLogger()) -> Iterable[Path]:
    def try_json_convert(schema: Dict[str, str], csv_file: Path) -> Path:
        # JSON output defaults to same as the CSV file but with extension swapped
        json_output = output_folder / str(
            csv_file.relative_to(csv_folder)).replace(".csv", ".json")
        json_output.parent.mkdir(parents=True, exist_ok=True)

        # Converting to JSON is not critical and it may fail in some corner cases
        # As long as the "important" JSON files are created, this should be OK
        try:
            logger.log_debug(f"Converting {csv_file} to JSON")
            convert_csv_to_json_records(schema, csv_file, json_output)
            return json_output
        except Exception as exc:
            error_message = f"Unable to convert CSV file {csv_file} to JSON"
            logger.log_error(error_message, traceback=traceback.format_exc())
            return None

    # Convert all CSV files to JSON using values format
    map_iter = list(csv_folder.glob("**/*.csv"))
    map_func = partial(try_json_convert, get_schema())
    for json_output in thread_map(map_func,
                                  map_iter,
                                  max_workers=2,
                                  desc="JSON conversion"):
        if json_output is not None:
            yield json_output
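
Because the converter is a generator, callers must iterate over it for any conversion to happen; a minimal sketch, with hypothetical input and output folders:

from pathlib import Path

from lib.error_logger import ErrorLogger

logger = ErrorLogger("convert")
csv_folder = Path("output/tables")  # hypothetical location of the CSV tables
json_folder = Path("output/json")   # hypothetical destination for the JSON output
json_files = list(convert_tables_to_json(csv_folder, json_folder, logger=logger))
logger.log_info(f"Converted {len(json_files)} tables to JSON")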
Example 4
def _query_property(prop: str,
                    entity: str,
                    query: str = _default_query,
                    error_logger: ErrorLogger = None) -> Any:
    # Time to wait before retry in case of failure
    wait_time = 8

    # Build the query from template
    query = query.format(prop=prop, entity=entity)

    # Keep trying request until succeeds, or _max_retries is reached
    for i in range(_max_retries):
        response = None

        try:
            params = {"query": query, "format": "json"}
            response = requests.get(_wikidata_url,
                                    headers=_request_header,
                                    params=params)
            data = response.json()

            # Return the first binding available (there should be only one)
            for item in data["results"]["bindings"]:
                return item["prop"]["value"]

        except Exception as exc:
            # If the retry limit is reached, log the error
            if i + 1 >= _max_retries:
                if error_logger is not None:
                    error_logger.log_error(
                        response.text if response is not None else str(exc))
                else:
                    traceback.print_exc()

            # Otherwise use exponential backoff before retrying
            else:
                sleep(wait_time)
                wait_time *= 2

    return None
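
A quick sketch of calling this low-level helper directly for a single entity; P1082 (population) and Q155 (Brazil) are real Wikidata identifiers used only for illustration, and the call returns None if every retry fails.

from lib.error_logger import ErrorLogger

population = _query_property("P1082", "Q155", error_logger=ErrorLogger("wikidata"))
print(population)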
Example 5
def wikidata_property(
    prop: str,
    entities: List[str],
    query_template: str = _WD_QUERY,
    logger: ErrorLogger = ErrorLogger(),
    offset: int = 0,
    **tqdm_kwargs,
) -> Iterable[Tuple[str, Any]]:
    """
    Query a single property from Wikidata, and return all entities from the provided list which
    contain that property.

    Arguments:
        prop: Wikidata property, for example P1082 for population.
        entities: List of Wikidata identifiers to query the desired property.
        query_template: [Optional] SPARQL query template used to retrieve `prop`.
        logger: [Optional] ErrorLogger instance to use for logging.
        offset: [Optional] Number of items to skip in the result set.
    Returns:
        Iterable[Tuple[str, Any]]: Iterable of <Wikidata ID, property value>
    """
    # Time to wait before retry in case of failure
    wait_time = _INIT_WAIT_TIME

    # Build the query from template
    tpl = query_template + " LIMIT {limit} OFFSET {offset}"
    query = tpl.format(prop=prop, limit=_LIMIT, offset=offset)

    # Keep trying request until succeeds, or _max_retries is reached
    for i in range(_MAX_RETRIES):
        response = None

        try:
            start_time = time.monotonic()
            params = {"query": query, "format": "json"}
            req_opts = dict(headers=_REQUEST_HEADER,
                            params=params,
                            timeout=_WD_TIMEOUT)
            response = requests.get(_WD_URL, **req_opts)
            elapsed_time = time.monotonic() - start_time
            log_opts = dict(status=response.status_code,
                            url=_WD_URL,
                            time=elapsed_time,
                            **params)
            logger.log_info(f"Wikidata SPARQL server response", **log_opts)
            data = response.json()

            # Yield each returned binding whose entity is part of the requested list
            for item in pbar(data["results"]["bindings"], **tqdm_kwargs):
                pid = item["pid"]["value"].split("/")[-1]
                if pid in entities:
                    yield pid, item["prop"]["value"]

            # If we got exactly `_LIMIT` results, keep increasing the offset until we run out of results
            if len(data["results"]["bindings"]) == _LIMIT:
                yield from wikidata_property(
                    prop,
                    entities,
                    query_template=query_template,
                    logger=logger,
                    offset=offset + _LIMIT,
                    **tqdm_kwargs,
                )

            # If no exceptions were thrown, we have reached the end
            logger.log_info(f"Wikidata SPARQL results end reached")
            return

        except Exception as exc:

            # If we have reached the error limit, log and re-raise the error
            if i + 1 >= _MAX_RETRIES:
                msg = response.text if response is not None else "Unknown error"
                logger.log_error(msg,
                                 exc=exc,
                                 traceback=traceback.format_exc())
                raise exc

            # Use exponential backoff in case of error
            logger.log_info(
                f"({i + 1}) Request error. Retry in {wait_time} seconds...",
                exc=exc)
            time.sleep(wait_time)
            wait_time *= 2
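
This paginated variant is consumed the same way as the earlier one, since it yields <Wikidata ID, property value> pairs while transparently walking the result set `_LIMIT` rows at a time; a hedged sketch:

entities = ["Q155", "Q96"]  # real Wikidata IDs (Brazil, Mexico), used only as examples
for pid, value in wikidata_property("P1082", entities, desc="Population"):
    print(pid, value)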
Example 6
    GCP_SELF_DESTRUCT_SCRIPT,
    GCS_BUCKET_PROD,
    GCS_BUCKET_TEST,
    SRC,
    V3_TABLE_LIST,
)
from lib.error_logger import ErrorLogger
from lib.gcloud import delete_instance, get_internal_ip, start_instance_from_image
from lib.io import export_csv, gzip_file, temporary_directory
from lib.memory_efficient import table_read_column
from lib.net import download
from lib.pipeline import DataPipeline
from lib.pipeline_tools import get_table_names

app = Flask(__name__)
logger = ErrorLogger("appengine")

BLOB_OP_MAX_RETRIES = 10
ENV_TOKEN = "GCP_TOKEN"
ENV_PROJECT = "GOOGLE_CLOUD_PROJECT"
ENV_SERVICE_ACCOUNT = "GCS_SERVICE_ACCOUNT"
COMPRESS_EXTENSIONS = ("json", )
# Used when parsing string parameters into boolean type
BOOL_STRING_MAP = {
    "true": True,
    "false": False,
    "1": True,
    "0": False,
    "": False,
    "null": False
}
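
BOOL_STRING_MAP is presumably used to coerce request parameters into booleans; a minimal sketch of such a helper (the function name is hypothetical, not part of the module shown above):

from typing import Optional

def _parse_bool(value: Optional[str]) -> bool:
    # Hypothetical helper: unknown or empty strings default to False
    return BOOL_STRING_MAP.get((value or "").strip().lower(), False)

assert _parse_bool("True") is True
assert _parse_bool("0") is False
assert _parse_bool(None) is False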
Example 7
    table_breakout,
    table_concat,
    table_cross_product,
    table_drop_nan_columns,
    table_grouped_tail,
    table_join,
    table_merge,
    table_read_column,
    table_rename,
    table_sort,
)
from lib.pipeline_tools import get_schema
from lib.time import date_range

# Used for debugging purposes
_logger = ErrorLogger("publish")


def _subset_grouped_key(main_table_path: Path,
                        output_folder: Path,
                        desc: str = None) -> Iterable[Path]:
    """ Outputs a subsets of the table with only records with a particular key """

    # Read the header of the main file to get the columns
    with open(main_table_path, "r") as fd:
        header = next(fd)

    # Do a first sweep to get the number of keys so we can accurately report progress
    key_set = set()
    for line in read_lines(main_table_path, skip_empty=True):
        key, data = line.split(",", 1)
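
Only the beginning of this helper is shown, but from its signature a caller simply iterates over the per-key subset files it yields; a hedged sketch with hypothetical paths:

from pathlib import Path

main_table = Path("v3/main.csv")     # hypothetical location of the combined table
subset_folder = Path("v3/subsets")   # hypothetical output folder
for subset_path in _subset_grouped_key(main_table, subset_folder, desc="Subsets"):
    _logger.log_info(f"Created subset {subset_path}")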
Example 8
def make_main_table(
    tables_folder: Path,
    output_path: Path,
    logger: ErrorLogger = ErrorLogger()) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
    Returns:
        DataFrame: Flat table with all data combined.
    """

    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        logger.log_info("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_list = date_range("2020-01-01", max_date)
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        logger.log_info("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        logger.log_info("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path,
                   tables_folder / "index.csv", ["key"],
                   main_table_path,
                   how="outer")
        logger.log_info("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")],
                                    desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path,
                           table_file_path,
                           join_on,
                           temp_file_path,
                           how="outer")
                shutil.move(temp_file_path, main_table_path)
                logger.log_info(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write it to the output location
        table_sort(main_table_path, output_path)
        logger.log_info("Sorted main table")