Beispiel #1
0
def _query_property(
    prop: str,
    entity: str,
    query: str = _default_query,
    error_logger: ErrorLogger = ErrorLogger()) -> Any:
    """
    Query a single property of a single entity from the Wikidata SPARQL endpoint.

    The `query` template is formatted with `prop` and `entity` and sent to `_wikidata_url`.
    Failed attempts are retried up to `_max_retries` times, doubling the wait between attempts.

    Arguments:
        prop: Value substituted for `{prop}` in the query template.
        entity: Value substituted for `{entity}` in the query template.
        query: [Optional] SPARQL query template used to retrieve `prop`.
        error_logger: [Optional] ErrorLogger instance used to report the final failure.
    Returns:
        Any: Value of the "prop" field of the first result binding, or None if the response
            contains no bindings or every attempt failed.
    """
    # Time to wait before retry in case of failure
    wait_time = 8

    # Build the query from template
    query = query.format(prop=prop, entity=entity)
    params = {"query": query, "format": "json"}

    # Keep trying request until succeeds, or _max_retries is reached
    for i in range(_max_retries):
        response = None

        try:
            response = requests.get(_wikidata_url,
                                    headers=_request_header,
                                    params=params)
            data = response.json()

            # Return the first binding available (there should be only one)
            for item in data["results"]["bindings"]:
                return item["prop"]["value"]

            # The request succeeded but produced no results; sending the same query again
            # would not change the outcome, so stop here instead of retrying
            return None

        except Exception as exc:
            # If the retry limit is reached, then log the error and give up
            if i + 1 >= _max_retries:
                error_logger.log_error(
                    response.text if response is not None else exc)

            # Otherwise use exponential backoff before the next attempt
            else:
                sleep(wait_time)
                wait_time *= 2

    return None
def _fetch_wikidata_page(query: str, logger: Any) -> Any:
    """
    Run a single SPARQL query against `_WD_URL` and return its rows as a list of
    <Wikidata ID, property value> tuples, one per result binding.

    The request and the parsing of its response are retried up to `_MAX_RETRIES` times with
    exponential backoff. Once the limit is reached, the error is logged and re-raised.
    """
    # Time to wait before retry in case of failure
    wait_time = _INIT_WAIT_TIME
    params = {"query": query, "format": "json"}

    for attempt in range(1, _MAX_RETRIES + 1):
        response = None

        try:
            start_time = time.monotonic()
            response = requests.get(_WD_URL,
                                    headers=_REQUEST_HEADER,
                                    params=params,
                                    timeout=_WD_TIMEOUT)
            elapsed_time = time.monotonic() - start_time
            logger.log_info("Wikidata SPARQL server response",
                            status=response.status_code,
                            url=_WD_URL,
                            time=elapsed_time,
                            **params)
            bindings = response.json()["results"]["bindings"]

            # Parse every row here so that a malformed response is retried as a whole,
            # before any of its rows have been handed to the caller
            return [(item["pid"]["value"].split("/")[-1], item["prop"]["value"])
                    for item in bindings]

        except Exception as exc:

            # If we have reached the error limit, log and re-raise the error
            if attempt >= _MAX_RETRIES:
                msg = response.text if response is not None else "Unknown error"
                logger.log_error(msg,
                                 exc=exc,
                                 traceback=traceback.format_exc())
                raise

            # Use exponential backoff in case of error
            logger.log_info(
                f"({attempt}) Request error. Retry in {wait_time} seconds...",
                exc=exc)
            time.sleep(wait_time)
            wait_time *= 2

    # Only reachable when `_MAX_RETRIES` allows no attempts at all
    return []
def wikidata_property(
    prop: str,
    entities: List[str],
    query_template: str = _WD_QUERY,
    logger: ErrorLogger = ErrorLogger(),
    offset: int = 0,
    **tqdm_kwargs,
) -> Any:
    """
    Query a single property from Wikidata, and yield all entities which are part of the provided
    list which contain that property.

    Results are requested in pages of `_LIMIT` rows, starting at `offset`, until a page comes
    back with a different number of rows. Each page is retried independently, so a failure on a
    later page never causes rows from earlier pages to be requested or yielded again.

    Arguments:
        prop: Wikidata property, for example P1082 for population.
        entities: List of Wikidata identifiers to query the desired property.
        query_template: [Optional] SPARQL query template used to retrieve `prop`; a
            `LIMIT ... OFFSET ...` clause is appended to it.
        logger: [Optional] ErrorLogger instance to use for logging.
        offset: [Optional] Number of items to skip in the result set.
        tqdm_kwargs: [Optional] Extra keyword arguments forwarded to `pbar` for every page.
    Yields:
        Tuple[str, Any]: <Wikidata ID, property value>
    Raises:
        Exception: Whatever error the last failed attempt produced, once a page has failed
            `_MAX_RETRIES` times.
    """
    # Build the lookup once so that membership tests do not scan the whole list for every row
    if isinstance(entities, (set, frozenset)):
        wanted = entities
    else:
        wanted = set(entities)

    # Build the query template with pagination
    tpl = query_template + " LIMIT {limit} OFFSET {offset}"

    while True:
        query = tpl.format(prop=prop, limit=_LIMIT, offset=offset)
        rows = _fetch_wikidata_page(query, logger)

        # Yield outside of the retry logic: rows are never emitted twice
        for pid, value in pbar(rows, **tqdm_kwargs):
            if pid in wanted:
                yield pid, value

        # Unless we got `_LIMIT` results, we have run out of results
        if len(rows) != _LIMIT:
            break

        # Otherwise keep adding offset and request the next page
        offset += _LIMIT

    logger.log_info("Wikidata SPARQL results end reached")