async def request_raw(query: Query,
                      api_key: Optional[str] = None,
                      endpoint: Optional[str] = None,
                      *,
                      handle_retries: bool = True,
                      max_query_error_retries: int = 0,
                      session: Optional[aiohttp.ClientSession] = None,
                      agg_stats: Optional[AggStats] = None,
                      headers: Optional[Dict[str, str]] = None,
                      retrying: Optional[AsyncRetrying] = None,
                      ) -> Result:
    """ Send a request to Scrapinghub AutoExtract API.

    ``query`` is a list of dicts or Request objects, as described in the
    API docs (see https://doc.scrapinghub.com/autoextract.html).

    ``api_key`` is your AutoExtract API key. If not set, it is taken from
    the SCRAPINGHUB_AUTOEXTRACT_KEY environment variable.

    ``session`` is an optional aiohttp.ClientSession object; use it to
    enable HTTP Keep-Alive and to control connection pool size.

    This function retries HTTP 429 errors and network errors by default;
    this allows handling server-side throttling properly. Use
    ``handle_retries=False`` if you want to disable this behavior
    (e.g. to implement it yourself).

    Among others, this function can raise autoextract.errors.RequestError
    if there is a Request-level error returned by the API after all
    attempts were exhausted.

    Throttling errors are retried indefinitely when handle_retries is True.

    When ``handle_retries=True``, Query-level errors can also be retried.
    Use ``max_query_error_retries > 0`` to enable this behavior.

    The ``agg_stats`` argument allows keeping track of various stats;
    pass an ``AggStats`` instance, and it'll be updated.

    Additional ``headers`` for the API request can be provided. These
    headers are included in the request made to the API endpoint: they
    won't be used in subsequent requests for fetching the URLs provided
    in the query.

    The default retry policy can be overridden by providing a custom
    ``retrying`` object of type :class:`tenacity.AsyncRetrying`, which can
    be built with the class :class:`autoextract.retry.RetryFactory`. The
    following is an example that configures 3 attempts for server-type
    errors::

        factory = RetryFactory()
        factory.server_error_stop = stop_after_attempt(3)
        retrying = factory.build()

    See :func:`request_parallel_as_completed` for a higher-level
    interface to send requests in parallel.
    """
    endpoint = API_ENDPOINT if endpoint is None else endpoint
    retrying = retrying or autoextract_retrying

    if agg_stats is None:
        agg_stats = AggStats()  # dummy stats, to simplify code

    if max_query_error_retries and not handle_retries:
        warnings.warn(
            "You've specified a max number of Query-level error retries, "
            "but retries are disabled. Consider passing the handle_retries "
            "argument as True.",
            stacklevel=2,
        )

    # Keep state between executions/retries
    request_processor = RequestProcessor(
        query=query,
        max_retries=max_query_error_retries if handle_retries else 0,
    )

    post = _post_func(session)
    auth = aiohttp.BasicAuth(get_apikey(api_key))
    headers = {'User-Agent': user_agent(aiohttp), **(headers or {})}

    response_stats = []
    start_global = time.perf_counter()

    async def request():
        stats = ResponseStats.create(start_global)
        agg_stats.n_attempts += 1

        post_kwargs = dict(
            url=endpoint,
            json=request_processor.pending_queries,
            auth=auth,
            headers=headers,
        )

        try:
            async with post(**post_kwargs) as resp:
                stats.status = resp.status
                stats.record_connected(agg_stats)
                if resp.status >= 400:
                    content = await resp.read()
                    resp.release()
                    stats.record_read()
                    stats.error = content
                    if resp.status == 429:
                        agg_stats.n_429 += 1
                    else:
                        agg_stats.n_errors += 1
                    raise RequestError(request_info=resp.request_info,
                                       history=resp.history,
                                       status=resp.status,
                                       message=resp.reason,
                                       headers=resp.headers,
                                       response_content=content)

                response = await resp.json()
                stats.record_read(agg_stats)
                return request_processor.process_results(response)
        except Exception as e:
            if not isinstance(e, RequestError):
                agg_stats.n_errors += 1
            raise
        finally:
            response_stats.append(stats)

    if handle_retries:
        request = retrying.wraps(request)

    try:
        # Try to make a batch request
        result = await request()
    except _QueryError:
        # If Tenacity fails to retry a _QueryError because the max number of
        # retries or a timeout was reached, get the latest results combining
        # errors and successes and consider them the final result.
        result = request_processor.get_latest_results()
    except Exception:
        agg_stats.n_fatal_errors += 1
        raise
    finally:
        agg_stats.n_input_queries += len(query)
        agg_stats.n_extracted_queries += request_processor.extracted_queries_count()
        agg_stats.n_billable_query_responses += request_processor.billable_query_responses_count()
        agg_stats.n_query_responses += request_processor.query_responses_count()

    result = Result(result)
    result.response_stats = response_stats
    if handle_retries and hasattr(request, 'retry'):
        result.retry_stats = request.retry.statistics  # type: ignore

    agg_stats.n_results += 1
    return result
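# Usage sketch (not part of the library source): calling request_raw() above
# with a custom retry policy and extra API headers. Import paths, the example
# URL and the header name are assumptions; the RetryFactory snippet mirrors
# the example from the docstring above.
import asyncio

from tenacity import stop_after_attempt

from autoextract.aio import request_raw
from autoextract.retry import RetryFactory


async def fetch_article():
    # Stop retrying server-type errors after 3 attempts, as in the docstring example.
    factory = RetryFactory()
    factory.server_error_stop = stop_after_attempt(3)
    retrying = factory.build()

    query = [{'url': 'http://example.com', 'pageType': 'article'}]
    return await request_raw(
        query,
        headers={'X-Example': 'value'},  # sent to the API endpoint only
        retrying=retrying,
    )


if __name__ == '__main__':
    asyncio.run(fetch_article())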
def test_get_apikey(autoextract_env_variable):
    assert get_apikey('foo') == 'foo'
    assert get_apikey() == autoextract_env_variable
async def request_raw(query: Query,
                      api_key: Optional[str] = None,
                      endpoint: str = API_ENDPOINT,
                      *,
                      handle_retries: bool = True,
                      session: Optional[aiohttp.ClientSession] = None,
                      agg_stats: Optional[AggStats] = None,
                      ) -> Result:
    """ Send a request to Scrapinghub AutoExtract API.

    ``query`` is a list of dicts or Request objects, as described in the
    API docs (see https://doc.scrapinghub.com/autoextract.html).

    ``api_key`` is your AutoExtract API key. If not set, it is taken from
    the SCRAPINGHUB_AUTOEXTRACT_KEY environment variable.

    ``session`` is an optional aiohttp.ClientSession object; use it to
    enable HTTP Keep-Alive.

    This function retries HTTP 429 errors and network errors by default;
    this allows handling server-side throttling properly. Use
    ``handle_retries=False`` if you want to disable this behavior
    (e.g. to implement it yourself).

    When handle_retries is True, this function can raise

    1) autoextract.errors.ApiError, if the API returns an error which is
       not a throttling response (e.g. it can be raised for an incorrect
       request);
    2) tenacity.RetryError, if a network-related error persists for
       longer than the allowed time period.

    Throttling errors are retried indefinitely when handle_retries is True.

    The ``agg_stats`` argument allows keeping track of various stats;
    pass an ``AggStats`` instance, and it'll be updated.

    See :func:`request_parallel_as_completed` for a higher-level
    interface to send requests in parallel.
    """
    if agg_stats is None:
        agg_stats = AggStats()  # dummy stats, to simplify code

    post = _post_func(session)
    post_kwargs = dict(
        url=endpoint,
        json=query_as_dict_list(query),
        auth=aiohttp.BasicAuth(get_apikey(api_key)),
        headers={'User-Agent': user_agent(aiohttp)},
    )

    response_stats = []
    start_global = time.perf_counter()

    async def request():
        stats = ResponseStats.create(start_global)
        agg_stats.n_attempts += 1
        try:
            async with post(**post_kwargs) as resp:
                stats.status = resp.status
                stats.record_connected(agg_stats)
                if resp.status >= 400:
                    content = await resp.read()
                    resp.release()
                    stats.record_read()
                    stats.error = content
                    if resp.status == 429:
                        agg_stats.n_429 += 1
                    else:
                        agg_stats.n_errors += 1
                    raise ApiError(request_info=resp.request_info,
                                   history=resp.history,
                                   status=resp.status,
                                   message=resp.reason,
                                   headers=resp.headers,
                                   response_content=content)

                # good response
                response = await resp.json()
                stats.record_read(agg_stats)
                return response
        except Exception as e:
            if not isinstance(e, ApiError):
                agg_stats.n_errors += 1
            raise
        finally:
            response_stats.append(stats)

    if handle_retries:
        request = autoextract_retry(request)

    try:
        result = await request()
    except Exception:
        agg_stats.n_fatal_errors += 1
        raise

    result = Result(result)
    result.response_stats = response_stats
    if handle_retries:
        result.retry_stats = request.retry.statistics  # type: ignore

    agg_stats.n_results += 1
    return result
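# Usage sketch (not part of the library source): reusing one
# aiohttp.ClientSession across several request_raw() calls so that HTTP
# Keep-Alive is used, as suggested by the ``session`` parameter above.
# The import path and the query contents are assumptions.
import asyncio

import aiohttp

from autoextract.aio import request_raw


async def fetch_many(urls):
    results = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            result = await request_raw(
                [{'url': url, 'pageType': 'article'}],
                session=session,  # reuse the same connection pool for every call
            )
            results.append(result)
    return results


if __name__ == '__main__':
    asyncio.run(fetch_many(['http://example.com/a', 'http://example.com/b']))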
def test_get_apikey_missing():
    with pytest.raises(NoApiKey):
        get_apikey()
async def request_raw(query: Query,
                      api_key: Optional[str] = None,
                      endpoint: Optional[str] = None,
                      *,
                      handle_retries: bool = True,
                      max_query_error_retries: int = 0,
                      session: Optional[aiohttp.ClientSession] = None,
                      agg_stats: Optional[AggStats] = None,
                      ) -> Result:
    """ Send a request to Scrapinghub AutoExtract API.

    ``query`` is a list of dicts or Request objects, as described in the
    API docs (see https://doc.scrapinghub.com/autoextract.html).

    ``api_key`` is your AutoExtract API key. If not set, it is taken from
    the SCRAPINGHUB_AUTOEXTRACT_KEY environment variable.

    ``session`` is an optional aiohttp.ClientSession object; use it to
    enable HTTP Keep-Alive.

    This function retries HTTP 429 errors and network errors by default;
    this allows handling server-side throttling properly. Use
    ``handle_retries=False`` if you want to disable this behavior
    (e.g. to implement it yourself).

    When handle_retries is True, this function can raise

    1) autoextract.errors.RequestError, if there is a Request-level error
       returned by the API which is not a throttling response
       (e.g. it can be raised for an incorrect request);
    2) tenacity.RetryError, if a network-related error persists for
       longer than the allowed time period.

    Throttling errors are retried indefinitely when handle_retries is True.

    When ``handle_retries=True``, Query-level errors can also be retried.
    Use ``max_query_error_retries > 0`` to enable this behavior.

    The ``agg_stats`` argument allows keeping track of various stats;
    pass an ``AggStats`` instance, and it'll be updated.

    See :func:`request_parallel_as_completed` for a higher-level
    interface to send requests in parallel.
    """
    endpoint = API_ENDPOINT if endpoint is None else endpoint

    if agg_stats is None:
        agg_stats = AggStats()  # dummy stats, to simplify code

    if max_query_error_retries and not handle_retries:
        warnings.warn(
            "You've specified a max number of Query-level error retries, "
            "but retries are disabled. Consider passing the handle_retries "
            "argument as True.",
            stacklevel=2,
        )

    # Keep state between executions/retries
    request_processor = RequestProcessor(
        query=query,
        max_retries=max_query_error_retries if handle_retries else 0,
    )

    post = _post_func(session)
    auth = aiohttp.BasicAuth(get_apikey(api_key))
    headers = {'User-Agent': user_agent(aiohttp)}

    response_stats = []
    start_global = time.perf_counter()

    async def request():
        stats = ResponseStats.create(start_global)
        agg_stats.n_attempts += 1

        post_kwargs = dict(
            url=endpoint,
            json=request_processor.pending_queries,
            auth=auth,
            headers=headers,
        )

        try:
            async with post(**post_kwargs) as resp:
                stats.status = resp.status
                stats.record_connected(agg_stats)
                if resp.status >= 400:
                    content = await resp.read()
                    resp.release()
                    stats.record_read()
                    stats.error = content
                    if resp.status == 429:
                        agg_stats.n_429 += 1
                    else:
                        agg_stats.n_errors += 1
                    raise RequestError(request_info=resp.request_info,
                                       history=resp.history,
                                       status=resp.status,
                                       message=resp.reason,
                                       headers=resp.headers,
                                       response_content=content)

                response = await resp.json()
                stats.record_read(agg_stats)
                return request_processor.process_results(response)
        except Exception as e:
            if not isinstance(e, RequestError):
                agg_stats.n_errors += 1
            raise
        finally:
            response_stats.append(stats)

    if handle_retries:
        # If handle_retries=True, the request method could raise
        # RetryError and QueryRetryError exceptions.
        #
        # These exceptions are raised when Tenacity is not able to
        # successfully retry failing requests.
        #
        # In addition to handle_retries=True, QueryRetryError also depends
        # on max_query_error_retries being greater than 0.
        request = autoextract_retry(request)

    try:
        # Try to make a batch request
        result = await request()
    except QueryRetryError:
        # If Tenacity fails to retry a _QueryError because the max number of
        # retries or a timeout was reached, get the latest results combining
        # errors and successes and consider them the final result.
        result = request_processor.get_latest_results()
    except Exception:
        agg_stats.n_fatal_errors += 1
        raise

    result = Result(result)
    result.response_stats = response_stats
    if handle_retries:
        result.retry_stats = request.retry.statistics  # type: ignore

    agg_stats.n_results += 1
    return result
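# Usage sketch (not part of the library source): enabling Query-level error
# retries for the variant above and tracking attempts with AggStats.
# The import paths are assumptions; the AggStats attributes printed below
# (n_attempts, n_429, n_results) come from the function bodies above.
import asyncio

from autoextract.aio import request_raw
from autoextract.stats import AggStats  # assumed module path


async def fetch_with_query_retries(urls):
    agg_stats = AggStats()
    query = [{'url': url, 'pageType': 'article'} for url in urls]
    result = await request_raw(
        query,
        max_query_error_retries=3,  # retry Query-level errors up to 3 times
        agg_stats=agg_stats,        # updated in place by request_raw()
    )
    print('attempts:', agg_stats.n_attempts,
          'throttled:', agg_stats.n_429,
          'results:', agg_stats.n_results)
    return result


if __name__ == '__main__':
    asyncio.run(fetch_with_query_retries(['http://example.com/a', 'http://example.com/b']))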