Example 1
async def request_raw(query: Query,
                      api_key: Optional[str] = None,
                      endpoint: Optional[str] = None,
                      *,
                      handle_retries: bool = True,
                      max_query_error_retries: int = 0,
                      session: Optional[aiohttp.ClientSession] = None,
                      agg_stats: Optional[AggStats] = None,
                      headers: Optional[Dict[str, str]] = None,
                      retrying: Optional[AsyncRetrying] = None) -> Result:
    """ Send a request to Scrapinghub AutoExtract API.

    ``query`` is a list of dicts or Request objects, as
    described in the API docs
    (see https://doc.scrapinghub.com/autoextract.html).

    ``api_key`` is your AutoExtract API key. If not set, it is
    taken from SCRAPINGHUB_AUTOEXTRACT_KEY environment variable.

    ``session`` is an optional aiohttp.ClientSession object;
    use it to enable HTTP Keep-Alive and to control connection
    pool size.

    This function retries HTTP 429 errors and network errors by default;
    this makes it possible to handle server-side throttling properly.
    Use ``handle_retries=False`` if you want to disable this behavior
    (e.g. to implement it yourself).

    Among other exceptions, this function can raise
    autoextract.errors.RequestError if the API returns a Request-level
    error after all attempts are exhausted.

    Throttling errors are retried indefinitely when handle_retries is True.

    When ``handle_retries=True``, Query-level errors can also be retried.
    Use ``max_query_error_retries > 0`` to enable this behavior.

    The ``agg_stats`` argument allows keeping track of various stats:
    pass an ``AggStats`` instance, and it will be updated.

    Additional ``headers`` for the API request can be provided. These
    headers are included only in the request made against the API
    endpoint: they won't be used in subsequent requests for fetching
    the URLs provided in the query.

    The default retry policy can be overridden by providing a custom
    ``retrying`` object of type :class:`tenacity.AsyncRetrying`, which can
    be built with the :class:`autoextract.retry.RetryFactory` class.
    The following example configures 3 attempts for server errors::

      factory = RetryFactory()
      factory.server_error_stop = stop_after_attempt(3)
      retrying = factory.build()

    See :func:`request_parallel_as_completed` for a higher-level
    interface to send requests in parallel.
    """
    endpoint = API_ENDPOINT if endpoint is None else endpoint
    retrying = retrying or autoextract_retrying

    if agg_stats is None:
        agg_stats = AggStats()  # dummy stats, to simplify code

    if max_query_error_retries and not handle_retries:
        warnings.warn(
            "You've specified a max number of Query-level error retries, "
            "but retries are disabled. Consider passing the handle_retries "
            "argument as True.",
            stacklevel=2)

    # Keep state between executions/retries
    request_processor = RequestProcessor(
        query=query,
        max_retries=max_query_error_retries if handle_retries else 0,
    )

    post = _post_func(session)
    auth = aiohttp.BasicAuth(get_apikey(api_key))
    headers = {'User-Agent': user_agent(aiohttp), **(headers or {})}

    response_stats = []
    start_global = time.perf_counter()

    async def request():
        stats = ResponseStats.create(start_global)
        agg_stats.n_attempts += 1

        post_kwargs = dict(
            url=endpoint,
            json=request_processor.pending_queries,
            auth=auth,
            headers=headers,
        )

        try:
            async with post(**post_kwargs) as resp:
                stats.status = resp.status
                stats.record_connected(agg_stats)
                if resp.status >= 400:
                    content = await resp.read()
                    resp.release()
                    stats.record_read()
                    stats.error = content
                    if resp.status == 429:
                        agg_stats.n_429 += 1
                    else:
                        agg_stats.n_errors += 1
                    raise RequestError(request_info=resp.request_info,
                                       history=resp.history,
                                       status=resp.status,
                                       message=resp.reason,
                                       headers=resp.headers,
                                       response_content=content)

                response = await resp.json()
                stats.record_read(agg_stats)
                return request_processor.process_results(response)
        except Exception as e:
            if not isinstance(e, RequestError):
                agg_stats.n_errors += 1
            raise
        finally:
            response_stats.append(stats)

    if handle_retries:
        request = retrying.wraps(request)

    try:
        # Try to make a batch request
        result = await request()
    except _QueryError:
        # If Tenacity fails to retry a _QueryError because the max number of
        # retries or a timeout was reached, get the latest results, combining
        # errors and successes, and consider that the final result.
        result = request_processor.get_latest_results()
    except Exception:
        agg_stats.n_fatal_errors += 1
        raise
    finally:
        agg_stats.n_input_queries += len(query)
        agg_stats.n_extracted_queries += (
            request_processor.extracted_queries_count())
        agg_stats.n_billable_query_responses += (
            request_processor.billable_query_responses_count())
        agg_stats.n_query_responses += (
            request_processor.query_responses_count())

    result = Result(result)
    result.response_stats = response_stats
    if handle_retries and hasattr(request, 'retry'):
        result.retry_stats = request.retry.statistics  # type: ignore

    agg_stats.n_results += 1
    return result
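A minimal usage sketch for the function above, combining it with the custom retry policy from its docstring. The import paths follow the docstring's references but are otherwise assumptions:

import asyncio

from tenacity import stop_after_attempt

from autoextract.aio import request_raw  # assumed import path
from autoextract.retry import RetryFactory  # path as given in the docstring


async def main():
    # Stop retrying server errors after 3 attempts, as in the docstring
    # example; other retry behavior keeps the factory defaults.
    factory = RetryFactory()
    factory.server_error_stop = stop_after_attempt(3)
    retrying = factory.build()

    query = [{"url": "http://example.com", "pageType": "article"}]
    result = await request_raw(query, retrying=retrying)
    print(result)


asyncio.run(main())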
def test_get_apikey(autoextract_env_variable):
    assert get_apikey('foo') == 'foo'
    assert get_apikey() == autoextract_env_variable
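The test above relies on an ``autoextract_env_variable`` fixture; a plausible sketch of it, assuming it simply sets the environment variable mentioned in the docstrings (the actual fixture body in the test suite may differ):

import pytest


@pytest.fixture
def autoextract_env_variable(monkeypatch):
    # get_apikey() falls back to SCRAPINGHUB_AUTOEXTRACT_KEY when no
    # explicit key is passed; set it and return the value so the test
    # can assert against it.
    value = "secret-api-key"
    monkeypatch.setenv("SCRAPINGHUB_AUTOEXTRACT_KEY", value)
    return value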
Example 3
async def request_raw(
    query: Query,
    api_key: Optional[str] = None,
    endpoint: str = API_ENDPOINT,
    *,
    handle_retries: bool = True,
    session: Optional[aiohttp.ClientSession] = None,
    agg_stats: Optional[AggStats] = None,
) -> Result:
    """ Send a request to Scrapinghub AutoExtract API.

    ``query`` is a list of dicts or Request objects, as
    described in the API docs
    (see https://doc.scrapinghub.com/autoextract.html).

    ``api_key`` is your AutoExtract API key. If not set, it is
    taken from SCRAPINGHUB_AUTOEXTRACT_KEY environment variable.

    ``session`` is an optional aiohttp.ClientSession object;
    use it to enable HTTP Keep-Alive.

    This function retries HTTP 429 errors and network errors by default;
    this makes it possible to handle server-side throttling properly.
    Use ``handle_retries=False`` if you want to disable this behavior
    (e.g. to implement it yourself).

    When handle_retries is True, this function can raise:

    1) autoextract.errors.ApiError, if there is an error returned by the API
       which is not a throttling response (e.g. it can be raised for an
       incorrect request).
    2) tenacity.RetryError, if a network-related error persists
       beyond the allowed retry period.

    Throttling errors are retried indefinitely when handle_retries is True.

    The ``agg_stats`` argument allows keeping track of various stats:
    pass an ``AggStats`` instance, and it will be updated.

    See :func:`request_parallel_as_completed` for a higher-level
    interface to send requests in parallel.
    """
    if agg_stats is None:
        agg_stats = AggStats()  # dummy stats, to simplify code
    post = _post_func(session)
    post_kwargs = dict(
        url=endpoint,
        json=query_as_dict_list(query),
        auth=aiohttp.BasicAuth(get_apikey(api_key)),
        headers={'User-Agent': user_agent(aiohttp)},
    )
    response_stats = []
    start_global = time.perf_counter()

    async def request():
        stats = ResponseStats.create(start_global)
        agg_stats.n_attempts += 1
        try:
            async with post(**post_kwargs) as resp:
                stats.status = resp.status
                stats.record_connected(agg_stats)
                if resp.status >= 400:
                    content = await resp.read()
                    resp.release()
                    stats.record_read()
                    stats.error = content
                    if resp.status == 429:
                        agg_stats.n_429 += 1
                    else:
                        agg_stats.n_errors += 1
                    raise ApiError(request_info=resp.request_info,
                                   history=resp.history,
                                   status=resp.status,
                                   message=resp.reason,
                                   headers=resp.headers,
                                   response_content=content)
                # good response
                response = await resp.json()
                stats.record_read(agg_stats)
                return response
        except Exception as e:
            if not isinstance(e, ApiError):
                agg_stats.n_errors += 1
            raise
        finally:
            response_stats.append(stats)

    if handle_retries:
        request = autoextract_retry(request)

    try:
        result = await request()
    except Exception:
        agg_stats.n_fatal_errors += 1
        raise

    result = Result(result)
    result.response_stats = response_stats
    if handle_retries:
        result.retry_stats = request.retry.statistics  # type: ignore
    agg_stats.n_results += 1
    return result
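A sketch of error handling around a call to this variant, based on the two exception types its docstring documents. The import paths are assumptions:

import asyncio

import tenacity

from autoextract.aio import request_raw  # assumed import path
from autoextract.errors import ApiError  # path as given in the docstring


async def fetch(query):
    try:
        return await request_raw(query)
    except ApiError as e:
        # Non-throttling API error, e.g. an incorrect request;
        # 429 responses are retried internally and don't surface here.
        print(f"API error: {e}")
    except tenacity.RetryError:
        # A network-related error persisted beyond the allowed retry period.
        print("Giving up after repeated network errors")


asyncio.run(fetch([{"url": "http://example.com", "pageType": "product"}]))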
def test_get_apikey_missing():
    with pytest.raises(NoApiKey):
        get_apikey()
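Together, the two tests pin down ``get_apikey``'s contract; the following is a sketch consistent with them and with the docstrings above, not the package's actual source:

import os
from typing import Optional


class NoApiKey(Exception):
    pass


def get_apikey(key: Optional[str] = None) -> str:
    # Prefer an explicitly passed key; otherwise fall back to the
    # environment variable named in the docstrings above.
    if key is not None:
        return key
    try:
        return os.environ["SCRAPINGHUB_AUTOEXTRACT_KEY"]
    except KeyError:
        raise NoApiKey(
            "Pass an API key explicitly or set the "
            "SCRAPINGHUB_AUTOEXTRACT_KEY environment variable.")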
async def request_raw(
    query: Query,
    api_key: Optional[str] = None,
    endpoint: Optional[str] = None,
    *,
    handle_retries: bool = True,
    max_query_error_retries: int = 0,
    session: Optional[aiohttp.ClientSession] = None,
    agg_stats: Optional[AggStats] = None,
) -> Result:
    """ Send a request to Scrapinghub AutoExtract API.

    ``query`` is a list of dicts or Request objects, as
    described in the API docs
    (see https://doc.scrapinghub.com/autoextract.html).

    ``api_key`` is your AutoExtract API key. If not set, it is
    taken from SCRAPINGHUB_AUTOEXTRACT_KEY environment variable.

    ``session`` is an optional aiohttp.ClientSession object;
    use it to enable HTTP Keep-Alive.

    This function retries HTTP 429 errors and network errors by default;
    this makes it possible to handle server-side throttling properly.
    Use ``handle_retries=False`` if you want to disable this behavior
    (e.g. to implement it yourself).

    When handle_retries is True, this function can raise:

    1) autoextract.errors.RequestError,
       if there is a Request-level error returned by the API
       which is not a throttling response
       (e.g. it can be raised for an incorrect request).
    2) tenacity.RetryError,
       if a network-related error persists
       beyond the allowed retry period.

    Throttling errors are retried indefinitely when handle_retries is True.

    When ``handle_retries=True``, Query-level errors can also be retried.
    Use ``max_query_error_retries > 0`` to enable this behavior.

    The ``agg_stats`` argument allows keeping track of various stats:
    pass an ``AggStats`` instance, and it will be updated.

    See :func:`request_parallel_as_completed` for a higher-level
    interface to send requests in parallel.
    """
    endpoint = API_ENDPOINT if endpoint is None else endpoint

    if agg_stats is None:
        agg_stats = AggStats()  # dummy stats, to simplify code

    if max_query_error_retries and not handle_retries:
        warnings.warn(
            "You've specified a max number of Query-level error retries, "
            "but retries are disabled. Consider passing the handle_retries "
            "argument as True.",
            stacklevel=2)

    # Keep state between executions/retries
    request_processor = RequestProcessor(
        query=query,
        max_retries=max_query_error_retries if handle_retries else 0,
    )

    post = _post_func(session)
    auth = aiohttp.BasicAuth(get_apikey(api_key))
    headers = {'User-Agent': user_agent(aiohttp)}

    response_stats = []
    start_global = time.perf_counter()

    async def request():
        stats = ResponseStats.create(start_global)
        agg_stats.n_attempts += 1

        post_kwargs = dict(
            url=endpoint,
            json=request_processor.pending_queries,
            auth=auth,
            headers=headers,
        )

        try:
            async with post(**post_kwargs) as resp:
                stats.status = resp.status
                stats.record_connected(agg_stats)
                if resp.status >= 400:
                    content = await resp.read()
                    resp.release()
                    stats.record_read()
                    stats.error = content
                    if resp.status == 429:
                        agg_stats.n_429 += 1
                    else:
                        agg_stats.n_errors += 1
                    raise RequestError(request_info=resp.request_info,
                                       history=resp.history,
                                       status=resp.status,
                                       message=resp.reason,
                                       headers=resp.headers,
                                       response_content=content)

                response = await resp.json()
                stats.record_read(agg_stats)
                return request_processor.process_results(response)
        except Exception as e:
            if not isinstance(e, RequestError):
                agg_stats.n_errors += 1
            raise
        finally:
            response_stats.append(stats)

    if handle_retries:
        # If handle_retries=True, the request method could raise
        # RetryError and QueryRetryError exceptions.
        #
        # These exceptions are raised when Tenacity is not able to
        # successfully retry failing requests.
        #
        # In addition to handle_retries=True, QueryRetryError also depends on
        # max_query_error_retries being greater than 0.
        request = autoextract_retry(request)

    try:
        # Try to make a batch request
        result = await request()
    except QueryRetryError:
        # If Tenacity fails to retry a _QueryError because the max number of
        # retries or a timeout was reached, get the latest results, combining
        # errors and successes, and consider that the final result.
        result = request_processor.get_latest_results()
    except Exception:
        agg_stats.n_fatal_errors += 1
        raise

    result = Result(result)
    result.response_stats = response_stats
    if handle_retries:
        result.retry_stats = request.retry.statistics  # type: ignore

    agg_stats.n_results += 1
    return result
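A usage sketch for this variant, enabling Query-level error retries and collecting aggregated stats. The import paths, in particular the one for ``AggStats``, are assumptions:

import asyncio

from autoextract.aio import request_raw  # assumed import path
from autoextract.stats import AggStats  # assumed import path


async def main():
    stats = AggStats()
    query = [
        {"url": "http://example.com/a", "pageType": "article"},
        {"url": "http://example.com/b", "pageType": "article"},
    ]
    # Retry Query-level errors up to twice per query; 429 responses and
    # network errors are already retried by default (handle_retries=True).
    result = await request_raw(query, max_query_error_retries=2,
                               agg_stats=stats)
    print(result.retry_stats)
    print(stats)


asyncio.run(main())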