async def test_add_trace_request_ctx(aiohttp_client, loop):
    """Check the trace context handed to ``on_request_start`` on every attempt.

    The caller-supplied ``trace_request_ctx`` must be passed through, extended
    with a ``current_attempt`` entry numbered from 1.
    """
    seen_contexts = []

    async def record_context(
        _: ClientSession,
        trace_config_ctx: SimpleNamespace,
        __: TraceRequestStartParams,
    ) -> None:
        # Keep every context object so the whole sequence can be compared.
        seen_contexts.append(trace_config_ctx)

    server = App()

    tracing = TraceConfig()
    tracing.on_request_start.append(record_context)  # type: ignore

    retry_client = RetryClient()
    retry_client._client = await aiohttp_client(
        server.get_app(),
        trace_configs=[tracing],
    )

    # One context per attempt, attempts numbered 1..3.
    expected_contexts = [
        SimpleNamespace(
            trace_request_ctx={
                'foo': 'bar',
                'current_attempt': attempt,
            },
        )
        for attempt in range(1, 4)
    ]

    async with retry_client.get('/sometimes_error', trace_request_ctx={'foo': 'bar'}):
        assert server.counter == 3
        assert seen_contexts == expected_contexts
async def test_internal_error(aiohttp_client, loop):
    """Request '/internal_error' with 5 attempts; expect a 500 and 5 handler hits."""
    max_attempts = 5

    server = TestApp()
    http_client = await aiohttp_client(server.get_app())

    retrying = RetryClient()
    retrying._client = http_client

    async with retrying.get('/internal_error', retry_attempts=max_attempts) as resp:
        assert resp.status == 500
        assert server.counter == max_attempts

    await retrying.close()
    await http_client.close()
async def test_not_found_error(aiohttp_client, loop):
    """With 404 listed in ``statuses``, expect a 404 response after 5 handler hits."""
    max_attempts = 5

    server = App()
    http_client = await aiohttp_client(server.get_app())

    retrying = RetryClient()
    retrying._client = http_client

    # 404 is explicitly listed as a status to retry on.
    options = RetryOptions(attempts=max_attempts, statuses={404})

    async with retrying.get('/not_found_error', options) as resp:
        assert resp.status == 404
        assert server.counter == max_attempts

    await retrying.close()
    await http_client.close()
async def _get_status_code(location: str, client: RetryClient, retry: int) -> int:
    """Return the HTTP status for ``location``, or 0 when no response was obtained.

    :param location: URL to request
    :param client: retrying client used to issue the GET
    :param retry: value passed to the client as ``retry_attempts``
    :return: the response status; 0 if ``TooManyRedirects`` or
        ``ClientConnectionError`` was raised before a status was read
    """
    _tracker.stats.inc_requests()

    # 0 marks "no status obtained"; it is only overwritten once a response arrives.
    result = 0
    try:
        _logger.debug("Requesting status code for %s", location)
        async with client.get(location, retry_attempts=retry) as reply:
            result = reply.status
    except TooManyRedirects:
        _logger.debug("Redirection Tango, danced enough with %s", location)
    except ClientConnectionError:
        _logger.debug("Connection Error occurred while getting %s", location)

    return result
async def test_hello(aiohttp_client, loop):
    """Request '/ping' once; expect 200, body 'Ok!' and a single handler hit."""
    server = TestApp()
    http_client = await aiohttp_client(server.get_app())

    retrying = RetryClient()
    retrying._client = http_client

    async with retrying.get('/ping') as resp:
        body = await resp.text()

        assert resp.status == 200
        assert body == 'Ok!'
        assert server.counter == 1

    await retrying.close()
    await http_client.close()
async def fetch(client: RetryClient, query_string: str, timeout: float, retries_count: int) -> Text:
    """
    Fetch the response body for a query, retrying on HTTP 429.

    :param client: Client with retry mechanism
    :param query_string: Full URL to request
    :param timeout: initial delay between retries (passed as ``retry_start_timeout``)
    :param retries_count: number of attempts (passed as ``retry_attempts``)
    :return: response body decoded as text
    """
    # Upper bound for the delay: the start timeout scaled by 2 ** retries_count.
    longest_delay = timeout * (2**retries_count)

    pending_request = client.get(
        url=query_string,
        retry_attempts=retries_count,
        retry_start_timeout=timeout,
        retry_factor=2,
        retry_max_timeout=longest_delay,
        retry_for_statuses=[429],
    )
    async with pending_request as response:
        body = await response.text()
    return body
async def test_sometimes_error_with_raise_for_status(aiohttp_client, loop):
    """Retry on ``ClientResponseError`` when the client is built with raise_for_status.

    Expects a final 200 with body 'Ok!' after 3 handler hits.
    """
    server = TestApp()
    http_client = await aiohttp_client(server.get_app(), raise_for_status=True)

    retrying = RetryClient()
    retrying._client = http_client

    request = retrying.get(
        '/sometimes_error',
        retry_attempts=5,
        retry_exceptions={ClientResponseError},
    )
    async with request as resp:
        body = await resp.text()

        assert resp.status == 200
        assert body == 'Ok!'
        assert server.counter == 3

    await retrying.close()
    await http_client.close()
async def test_sometimes_error(aiohttp_client, loop):
    """Request '/sometimes_error' with up to 5 attempts; expect 200 after 3 hits."""
    server = TestApp()
    http_client = await aiohttp_client(server.get_app())

    retrying = RetryClient()
    retrying._client = http_client

    async with retrying.get('/sometimes_error', retry_attempts=5) as resp:
        body = await resp.text()

        assert resp.status == 200
        assert body == 'Ok!'
        assert server.counter == 3

    await retrying.close()
    await http_client.close()
async def test_override_options(aiohttp_client, loop):
    """Per-request retry options take precedence over the client's defaults.

    The client is built with a single attempt, the request passes 5; the test
    expects a 200 after 3 handler hits.
    """
    server = App()
    http_client = await aiohttp_client(server.get_app())

    client_defaults = RetryOptions(attempts=1)
    retrying = RetryClient(retry_options=client_defaults)
    retrying._client = http_client

    per_request = RetryOptions(attempts=5)
    async with retrying.get('/sometimes_error', per_request) as resp:
        body = await resp.text()

        assert resp.status == 200
        assert body == 'Ok!'
        assert server.counter == 3

    await retrying.close()
    await http_client.close()
async def download_single(item: Union[MediaData, str], session: RetryClient, params: DownloadParams):
    """Download a single url to disk.

    Args:
        item (Dict or str): item dict (keys read: "url", "basename", "label",
            "subset") or a plain url.
        session (RetryClient): aiohttp session.
        params (DownloadParams): Download parameter dict (keys read: "root",
            "random_subsets", "overwrite", "proxy", "is_valid_file").

    Returns:
        bool: True when the file was written; False when a file with the same
        basename already exists (and overwriting is disabled), when the
        content type cannot be guessed, or when ``is_valid_file`` rejects
        the content.

    Raises:
        aiohttp.ClientResponseError: if the response status is not 200.
    """
    if isinstance(item, dict):
        url = item.get("url")
        basename = item.get("basename")
        label = item.get("label")
        subset = item.get("subset")
    else:
        url = item
        label, basename, subset = None, None, None

    # No explicit subset: draw one using the configured weights.
    if subset is None and params["random_subsets"] is not None:
        subset_choices = list(params["random_subsets"].keys())
        weights = list(params["random_subsets"].values())
        subset = random.choices(subset_choices, weights=weights, k=1)[0]

    label_path = Path(params["root"])
    if subset is not None:
        label_path /= Path(subset)
    # create subfolder when label is a single str
    if isinstance(label, str):
        label_path /= Path(label)
    label_path.mkdir(parents=True, exist_ok=True)

    if basename is None:
        # hash the url
        basename = hashlib.sha1(url.encode("utf-8")).hexdigest()

    # do not overwrite, skips based on base path
    if not params["overwrite"] and any(label_path.glob(basename + "*")):
        return False

    async with session.get(url, proxy=params["proxy"]) as res:
        content = await res.read()
        # guess mimetype and suffix from content
        kind = filetype.guess(content)
        if kind is None:
            return False
        suffix = "." + kind.extension
        # Check everything went well. The exception must be instantiated with
        # the request details: raising the bare class fails with a TypeError
        # because its constructor has required arguments.
        if res.status != 200:
            raise aiohttp.ClientResponseError(
                res.request_info,
                res.history,
                status=res.status,
                message=res.reason or "",
                headers=res.headers,
            )

    if params["is_valid_file"] is not None:
        if not params["is_valid_file"](content):
            print("File check failed")
            return False

    file_path = (label_path / basename).with_suffix(suffix)
    async with aiofiles.open(file_path, "+wb") as f:
        await f.write(content)

    if isinstance(label, dict):
        # Use the resolved basename (possibly the url hash) so the sidecar file
        # sits next to the media file even when the item carried no basename.
        json_path = (label_path / basename).with_suffix(".json")
        async with aiofiles.open(json_path, mode="+w") as fp:
            await fp.write(json.dumps(label))
    return True
class INatAPI:
    """Access the iNat API and assets via (api|static).inaturalist.org."""

    def __init__(self):
        # pylint: disable=unused-argument
        async def on_request_start(
            session: ClientSession,
            trace_config_ctx: SimpleNamespace,
            params: TraceRequestStartParams,
        ) -> None:
            # Only log retries; the first attempt is logged by _get_rate_limited.
            current_attempt = trace_config_ctx.trace_request_ctx["current_attempt"]
            if current_attempt > 1:
                LOG.info("iNat request attempt #%d: %s", current_attempt, repr(params))

        trace_config = TraceConfig()
        trace_config.on_request_start.append(on_request_start)
        self.session = RetryClient(
            raise_for_status=False,
            trace_configs=[trace_config],
        )
        self.request_time = time()
        self.places_cache = {}
        self.projects_cache = {}
        self.users_cache = {}
        self.users_login_cache = {}
        self.taxa_cache = {}

        # api_v1_limiter:
        # ---------------
        # - Allow a burst of 60 requests (i.e. equal to max_rate) in the initial
        #   seconds of the 60 second time_period before enforcing a rate limit of
        #   60 requests per minute (max_rate).
        # - This honours "try to keep it to 60 requests per minute or lower":
        #   - https://api.inaturalist.org/v1/docs/
        # - Since the iNat API doesn't throttle until 100 requests per minute,
        #   this should ensure we never get throttled.
        self.api_v1_limiter = AsyncLimiter(60, 60)

    async def _get_rate_limited(self, full_url, **kwargs):
        """Query API, respecting 60 requests per minute rate limit.

        Returns the decoded JSON body on HTTP 200. Any other status raises
        LookupError with a message taken from the JSON error body or, when the
        body is not JSON, from the text of the HTML response. Exceptions that
        are in the retry set are also converted to LookupError.
        """
        LOG.info('_get_rate_limited("%s", %s)', full_url, repr(kwargs))
        async with self.api_v1_limiter:
            # i.e. wait 0.1s, 0.2s, 0.4s, 0.8s, and finally give up
            retry_options = ExponentialRetry(
                attempts=4,
                exceptions=RETRY_EXCEPTIONS,
            )
            try:
                async with self.session.get(
                    full_url, params=kwargs, retry_options=retry_options
                ) as response:
                    if response.status == 200:
                        return await response.json()
                    try:
                        error_json = await response.json()
                        msg = f"{error_json.get('error')} ({error_json.get('status')})"
                    except ContentTypeError:
                        data = await response.text()
                        document = BeautifulSoup(data, "html.parser")
                        # Only use the body, if present
                        if document.body:
                            text = document.body.find().text
                        else:
                            text = document
                        # Treat as much as we can as markdown
                        markdown = html2markdown.convert(text)
                        # Punt the rest back to bs4 to drop unhandled tags
                        msg = BeautifulSoup(markdown, "html.parser").text
                    lookup_failed_msg = f"Lookup failed: {msg}"
                    LOG.error(lookup_failed_msg)
                    raise LookupError(lookup_failed_msg)
            except Exception as e:  # pylint: disable=broad-except,invalid-name
                # Retryable errors reaching this point mean every attempt failed.
                if isinstance(e, tuple(retry_options.exceptions)):
                    attempts = retry_options.attempts
                    msg = f"iNat not responding after {attempts} attempts. Please try again later."
                    LOG.error(msg)
                    raise LookupError(msg) from e
                raise
        return None

    async def get_controlled_terms(self, *args, **kwargs):
        """Query API for controlled terms."""
        endpoint = "/".join(("/v1/controlled_terms", *args))
        full_url = f"{API_BASE_URL}{endpoint}"
        return await self._get_rate_limited(full_url, **kwargs)

    # refresh_cache: Boolean
    # - Unlike places and projects which change infrequently, we usually want the
    #   latest, uncached taxon record.
    async def get_taxa(self, *args, refresh_cache=True, **kwargs):
        """Query API for taxa matching parameters.

        Parameters
        ----------
        *args
            - If first positional argument is given, it is passed through
              as-is, appended to the /v1/taxa endpoint.
            - If it's a number, the resulting record will be cached.

        refresh_cache: bool
            - Unlike places and projects which change infrequently, we
              usually want the latest, uncached taxon record, as changes
              are frequently made at the website (e.g. observations count).
            - Specify refresh_cache=False when the latest data from the site
              is not needed, e.g. to show names of ancestors for an existing
              taxon display.

        **kwargs
            - All kwargs are passed as params on the API call.
            - If kwargs["q"] is present (and kwargs["page"] is not), the
              /v1/taxa/autocomplete endpoint is selected, as that gives the
              best results, most closely matching the iNat web taxon lookup
              experience.
        """
        # Select endpoint based on call signature:
        # - /v1/taxa is needed for id# lookup (i.e. no kwargs["q"])
        endpoint = (
            "/v1/taxa/autocomplete"
            if "q" in kwargs and "page" not in kwargs
            else "/v1/taxa"
        )
        id_arg = f"/{args[0]}" if args else ""
        full_url = f"{API_BASE_URL}{endpoint}{id_arg}"

        # Cache lookup by id#, as those should be stable.
        # - note: we could support splitting a list of id#s and caching each
        #   one, but currently we don't make use of that call, so only cache
        #   when a single ID is specified
        if args and (isinstance(args[0], int) or args[0].isnumeric()):
            taxon_id = int(args[0])
            if refresh_cache or taxon_id not in self.taxa_cache:
                taxon = await self._get_rate_limited(full_url, **kwargs)
                if taxon:
                    self.taxa_cache[taxon_id] = taxon
            return self.taxa_cache.get(taxon_id)

        # Skip the cache for text queries which are not stable.
        return await self._get_rate_limited(full_url, **kwargs)

    async def get_observations(self, *args, **kwargs):
        """Query API for observations.

        Parameters
        ----------
        *args
            - If first positional argument is given, it is passed through
              as-is, appended to the /v1/observations endpoint.

        **kwargs
            - All kwargs are passed as params on the API call.
        """
        endpoint = "/v1/observations"
        id_arg = f"/{args[0]}" if args else ""
        full_url = f"{API_BASE_URL}{endpoint}{id_arg}"
        return await self._get_rate_limited(full_url, **kwargs)

    async def get_observation_bounds(self, taxon_ids):
        """Get the bounds for the specified observations.

        Returns the "total_bounds" entry of the response, or None if absent.
        """
        kwargs = {
            "return_bounds": "true",
            "verifiable": "true",
            "taxon_id": ",".join(map(str, taxon_ids)),
            "per_page": 0,
        }
        result = await self.get_observations(**kwargs)
        if result and "total_bounds" in result:
            return result["total_bounds"]
        return None

    # NOTE(review): the `obs_id=int` default looks like a mistyped annotation
    # (`obs_id: int`); it is kept as-is for signature compatibility. Callers
    # should always pass obs_id explicitly.
    async def get_obs_taxon_summary(self, obs_id=int, **kwargs):
        """Get an observation's taxon summary."""
        endpoint = f"/v1/observations/{obs_id}/taxon_summary"
        full_url = f"{API_BASE_URL}{endpoint}"
        return await self._get_rate_limited(full_url, **kwargs)

    async def get_places(self, query: Union[int, str, list], refresh_cache=False, **kwargs):
        """Get places for the specified ids or text query.

        - A list query returns a dict of {place_id: cached record} for each
          requested id that has a cached record after the lookup.
        - Any other query returns the cached record of the first place found,
          or None.
        """
        first_place_id = None
        if isinstance(query, list):
            cached = set(query).issubset(set(self.places_cache))
            request = f"/v1/places/{','.join(map(str, query))}"
        elif isinstance(query, int):
            cached = query in self.places_cache
            if cached:
                first_place_id = query
            request = f"/v1/places/{query}"
        else:
            # Text queries are never served from the cache.
            cached = False
            request = f"/v1/places/{query}"
        full_url = f"{API_BASE_URL}{request}"

        if refresh_cache or not cached:
            results = await self._get_rate_limited(full_url, **kwargs)
            if results:
                places = results.get("results") or []
                for place in places:
                    key = place.get("id")
                    if key:
                        if not first_place_id:
                            first_place_id = key
                        # Each place is cached as its own single-result page.
                        record = {
                            "total_results": 1,
                            "page": 1,
                            "per_page": 1,
                            "results": [place],
                        }
                        self.places_cache[key] = record

        if isinstance(query, list):
            # .get(): ids the API did not return are simply omitted instead of
            # raising KeyError.
            return {
                place_id: self.places_cache[place_id]
                for place_id in query
                if self.places_cache.get(place_id)
            }
        if first_place_id in self.places_cache:
            return self.places_cache[first_place_id]
        return None

    async def get_projects(self, query: Union[str, int, list], refresh_cache=False, **kwargs):
        """Get projects for the specified ids or text query.

        - A list query returns a dict of {project_id: cached record} for each
          requested id that has a cached record after the lookup.
        - Any other query returns the cached record of the first project
          found, or None.
        """
        first_project_id = None
        if isinstance(query, list):
            cached = set(query).issubset(set(self.projects_cache))
            request = f"/v1/projects/{','.join(map(str, query))}"
        elif isinstance(query, int):
            cached = query in self.projects_cache
            if cached:
                first_project_id = query
            request = f"/v1/projects/{query}"
        else:
            # Text queries are never served from the cache.
            cached = False
            request = f"/v1/projects/{query}"
        full_url = f"{API_BASE_URL}{request}"

        if refresh_cache or not cached:
            results = await self._get_rate_limited(full_url, **kwargs)
            if results:
                projects = results.get("results") or []
                for project in projects:
                    key = project.get("id")
                    if key:
                        if not first_project_id:
                            first_project_id = key
                        # Each project is cached as its own single-result page.
                        record = {
                            "total_results": 1,
                            "page": 1,
                            "per_page": 1,
                            "results": [project],
                        }
                        self.projects_cache[key] = record

        if isinstance(query, list):
            # .get(): ids the API did not return are simply omitted instead of
            # raising KeyError.
            return {
                project_id: self.projects_cache[project_id]
                for project_id in query
                if self.projects_cache.get(project_id)
            }
        if first_project_id in self.projects_cache:
            return self.projects_cache[first_project_id]
        return None

    async def get_observers_stats(self, **kwargs):
        """Query API for user counts & rankings."""
        request = "/v1/observations/observers"
        # TODO: validate kwargs includes project_id
        # TODO: support queries with > 500 observers (one page, default)
        full_url = f"{API_BASE_URL}{request}"
        return await self._get_rate_limited(full_url, **kwargs)

    async def get_search_results(self, **kwargs):
        """Get site search results.

        Uses /v1/taxa when is_active == "any", otherwise /v1/search.
        """
        if kwargs.get("is_active") == "any":
            full_url = f"{API_BASE_URL}/v1/taxa"
        else:
            full_url = f"{API_BASE_URL}/v1/search"
        return await self._get_rate_limited(full_url, **kwargs)

    async def get_users(
        self,
        query: Union[int, str],
        refresh_cache=False,
        by_login_id=False,
        **kwargs,
    ):
        """Get the users for the specified login, user_id, or query.

        - A numeric query is looked up by user id.
        - With by_login_id=True, a string query is looked up as a login.
        - Any other string goes to the autocomplete endpoint.

        Returns the cached JSON for the lookup key, or None if nothing matched.
        """
        request = f"/v1/users/{query}"
        if isinstance(query, int) or query.isnumeric():
            user_id = int(query)
            key = user_id
        elif by_login_id:
            user_id = None
            key = query
        else:
            user_id = None
            request = f"/v1/users/autocomplete?q={query}"
            key = query
        full_url = f"{API_BASE_URL}{request}"

        if refresh_cache or (
            key not in self.users_cache and key not in self.users_login_cache
        ):
            # TODO: provide means to expire the cache (other than reloading the cog).
            json_data = await self._get_rate_limited(full_url, **kwargs)
            if json_data:
                results = json_data.get("results")
                if not results:
                    return None
                if user_id is None:
                    if len(results) == 1:
                        # String query matched exactly one result; cache it:
                        user = results[0]
                        # The entry itself is put in the main cache, indexed by user_id.
                        self.users_cache[user["id"]] = json_data
                        # Lookaside by login stores only linkage to the
                        # entry just stored in the main cache.
                        self.users_login_cache[user["login"]] = user["id"]
                        # Additionally add an entry to the main cache for
                        # the query string, but only for other than an
                        # exact login id match as that would serve no
                        # purpose. This is slightly wasteful, but makes for
                        # simpler code.
                        if user["login"] != key:
                            self.users_cache[key] = json_data
                    else:
                        # Cache multiple results matched by string.
                        self.users_cache[key] = json_data
                        # Additional synthesized cache results per matched user, as
                        # if they were queried individually.
                        for user in results:
                            user_json = {}
                            user_json["results"] = [user]
                            self.users_cache[user["id"]] = user_json
                            # Only index the login in the lookaside cache if it
                            # isn't the query string itself, already indexed above
                            # in the main cache.
                            # - i.e. it's possible a search for a login matches
                            #   more than one entry (e.g. david, david99, etc.)
                            #   so retrieving it from cache must always return
                            #   all matching results, not just one for the login
                            #   itself
                            if user["login"] != key:
                                self.users_login_cache[user["login"]] = user["id"]
                else:
                    # i.e. lookup by user_id only returns one match
                    user = results[0]
                    if user:
                        self.users_cache[key] = json_data
                        self.users_login_cache[user["login"]] = key
                self.request_time = time()

        if key in self.users_cache:
            return self.users_cache[key]
        # - Lookaside for login is only consulted if not found in the main
        #   users_cache.
        # - This is important, since a lookup by user_id could prime the
        #   lookaside cache with the single login entry, and then a subsequent
        #   search by login could return multiple results into the main cache.
        #   From then on, searching for the login should return the cached
        #   multiple results from the main cache, not the single result that the
        #   lookaside users_login_cache supports.
        # - This shortcut seems like it would return incomplete results depending
        #   on the order in which lookups are performed. However, since the login
        #   lookaside is primarily in support of iNat login lookups from already
        #   cached project members, this is OK. The load of the whole project
        #   membership at once (get_observers_from_projects) for that use case
        #   ensures all relevant matches are already individually cached.
        if key in self.users_login_cache:
            user_id = self.users_login_cache[key]
            return self.users_cache[user_id]
        return None

    async def get_observers_from_projects(
        self,
        project_ids: Optional[List] = None,
        user_ids: Optional[List] = None,
    ):
        """Get observers for a list of project ids.

        Since the cache is filled as a side effect, this method can be used to
        prime the cache prior to fetching multiple users at once by id.

        Users may also be specified, and in that case, project ids may be omitted.
        The cache will then be primed from a list of user ids.

        Returns None when neither project_ids nor user_ids is given; otherwise
        a single-page result dict holding every user found.
        """
        if not (project_ids or user_ids):
            return None

        page = 1
        more = True
        users = []
        # Note: This will only handle up to 10,000 users. Anything more
        # needs to set id_above and id_below. With luck, we won't ever
        # need to deal with projects this big!
        while more:
            params = {"page": page}
            if project_ids:
                params["project_id"] = ",".join(map(str, project_ids))
            if user_ids:
                params["user_id"] = ",".join(map(str, user_ids))
            response = await self.get_observations("observers", **params)
            results = response.get("results") or []
            for observer in results:
                user = observer.get("user")
                if user:
                    user_id = user.get("id")
                    if user_id:
                        # Synthesize a single result as if returned by a get_users
                        # lookup of a single user_id, and cache it:
                        user_json = {}
                        user_json["results"] = [user]
                        users.append(user)
                        self.users_cache[user_id] = user_json
                        self.users_login_cache[user["login"]] = user_id
            # default values provided defensively to exit loop if missing
            per_page = response.get("per_page") or len(results)
            total_results = response.get("total_results") or len(results)
            if results and (page * per_page < total_results):
                page += 1
            else:
                more = False

        # return all user results as a single page
        return {
            "total_results": len(users),
            "pages": 1,
            "per_page": len(users),
            "results": users,
        }