def test_separate_rate_limited_groups_do_not_affect_each_other():
    """Interleaved calls in groups 'a' and 'b' are throttled independently.

    Each group is entered twice at 2 calls per second, so the whole run
    should take just over half a second rather than the time four calls
    sharing one limit would need.
    """
    began = datetime.utcnow()

    # Alternate between the two groups, exactly as four consecutive
    # ``with`` blocks would.
    for group_name in ('a', 'b', 'a', 'b'):
        with rate_limited(calls_per_second=2, group=group_name):
            1 + 1

    elapsed = (datetime.utcnow() - began).total_seconds()
    assert elapsed > 0.5
    assert elapsed < 0.55
def test_rate_limited():
    """Two calls at 2 calls per second must take more than half a second."""
    began = datetime.utcnow()

    for _ in range(2):
        with rate_limited(calls_per_second=2):
            1 + 1

    elapsed = (datetime.utcnow() - began).total_seconds()
    assert elapsed > 0.5
def timestamped_uri_to_version(self, dt, uri, *, url, maintainers=None,
                               tags=None, view_url=None):
    """
    Fetch version content and combine it with metadata to build a Version.

    Parameters
    ----------
    dt : datetime.datetime
        capture time
    uri : string
        URI of version
    url : string
        page URL
    maintainers : list of string, optional
        Entities responsible for maintaining the page, as a list of strings
    tags : list of string, optional
        Any arbitrary "tags" to apply to the page for categorization
    view_url : string, optional
        The archive.org URL for viewing the page (with rewritten links, etc.)

    Returns
    -------
    dict : Version
        suitable for passing to :class:`Client.add_versions`

    Raises
    ------
    MementoPlaybackError
        If the response has no ``memento-datetime`` header and either carries
        an ``X-Archive-Wayback-Runtime-Error`` header or is otherwise "ok".
        If the response has no ``memento-datetime`` header and is not "ok",
        whatever ``res.raise_for_status()`` raises is propagated instead.
    """
    with utils.rate_limited(group='timestamped_uri_to_version'):
        # Check to make sure we are actually getting a memento playback.
        res = utils.retryable_request('GET', uri, allow_redirects=False,
                                      session=self.session)
        if res.headers.get('memento-datetime') is None:
            message = res.headers.get('X-Archive-Wayback-Runtime-Error')
            if message:
                raise MementoPlaybackError(
                    f'Memento at {uri} could not be played: {message}')
            elif res.ok:
                raise MementoPlaybackError(
                    f'Memento at {uri} could not be played')
            else:
                res.raise_for_status()

        # If the playback includes a redirect, continue on.
        if 300 <= res.status_code < 400:
            original = res
            res = utils.retryable_request('GET',
                                          res.headers.get('location'),
                                          session=self.session)
            # Keep the manually-followed first hop in the history, and report
            # the request that started the chain as this response's request.
            res.history.insert(0, original)
            res.request = original.request

    version_hash = utils.hash_content(res.content)
    title = utils.extract_title(res.content)
    # Use .get() so a response without a Content-Type header yields an empty
    # media type instead of raising KeyError.
    content_type = (res.headers.get('content-type') or '').split(';', 1)

    # Get all headers from original response
    prefix = 'X-Archive-Orig-'
    original_headers = {
        k[len(prefix):]: v
        for k, v in res.headers.items()
        if k.startswith(prefix)
    }

    redirected_url = None
    redirects = None
    if res.url != uri:
        redirected_url = original_url_for_memento(res.url)
        redirects = [original_url_for_memento(response.url)
                     for response in res.history]
        redirects.append(redirected_url)

    return format_version(url=url, dt=dt, uri=uri,
                          version_hash=version_hash, title=title, tags=tags,
                          maintainers=maintainers, status=res.status_code,
                          mime_type=content_type[0], encoding=res.encoding,
                          headers=original_headers, view_url=view_url,
                          redirected_url=redirected_url, redirects=redirects)
def get_memento(self, url, exact=True, exact_redirects=None,
                target_window=24 * 60 * 60):
    """
    Fetch a memento from the Wayback Machine. This retrieves the content
    that was ultimately returned from a memento, following any redirects
    that were present at the time the memento was captured. (That is, if
    `http://example.com/a` redirected to `http://example.com/b`, this
    returns the memento for `/b` when you request `/a`.)

    Parameters
    ----------
    url : string
        URL of memento in Wayback (e.g.
        `http://web.archive.org/web/20180816111911id_/http://www.nws.noaa.gov/sp/`)
    exact : boolean, optional
        If false and the requested memento either doesn't exist or can't be
        played back, this returns the closest-in-time memento to the
        requested one, so long as it is within `target_window`.
        Default: True
    exact_redirects : boolean, optional
        If false and the requested memento is a redirect whose *target*
        doesn't exist or can't be played back, this returns the closest-
        in-time memento to the intended target, so long as it is within
        `target_window`. If unset, this will be the same as `exact`.
    target_window : int, optional
        Maximum number of seconds allowed between the originally requested
        capture time and the capture a non-memento redirect points to.
        (When `exact` is true, a non-memento redirect on the *originally
        requested* URL always raises a MementoPlaybackError, regardless of
        this window.) Defaults to 86,400 (24 hours).

    Returns
    -------
    requests.Response
        An HTTP response with the content of the memento, including a
        history of any redirects involved.

    Raises
    ------
    MementoPlaybackError
        If a non-memento response is not an acceptable redirect and either
        carries an ``X-Archive-Wayback-Runtime-Error`` header or is "ok",
        or if the chain of redirects is circular. For a non-acceptable,
        non-"ok" response, whatever ``response.raise_for_status()`` raises
        is propagated instead.
    """
    if exact_redirects is None:
        exact_redirects = exact

    with utils.rate_limited(calls_per_second=30, group='get_memento'):
        # Correctly following redirects is actually pretty complicated. In
        # the simplest case, a memento is a simple web page, and that's
        # no problem. However...
        #   1.  If the response was a >= 400 status, we have to determine
        #       whether that status is coming from the memento or from
        #       the Wayback Machine itself.
        #   2.  If the response was a 3xx status (a redirect) we have to
        #       determine the same thing, but it's a little more complex...
        #       a) If the redirect *is* the memento, its target may be an
        #          actual memento (see #1) or it may be a redirect (#2).
        #          The targeted URL is frequently captured anywhere from
        #          the same second to a few hours later, so it is likely
        #          the target will result in case 2b (below).
        #       b) If there is no memento for the requested time, but there
        #          are mementos for the same URL at another time, Wayback
        #          *may* redirect to that memento.
        #          - If this was on the original request, that's *not* ok
        #            because it means we're getting a different memento
        #            than we asked for.
        #          - If the redirect came from a URL that was the target
        #            of a memento redirect (2a), then this is expected.
        #            Before following the redirect, though, we first sanity
        #            check it to make sure the memento we are redirecting
        #            to actually came from nearby in time (sometimes
        #            Wayback will redirect to captures *months* away).
        history = []
        urls = set()
        previous_was_memento = False
        # Only the capture time of the requested memento is needed below.
        _, original_date = memento_url_data(url)
        response = self.session.request('GET', url, allow_redirects=False)
        protocol_and_www = re.compile(r'^https?://(www\d?\.)?')

        while True:
            is_memento = 'Memento-Datetime' in response.headers

            if not is_memento:
                # The exactness requirements for redirects from memento
                # playbacks and non-playbacks is different -- even with
                # strict matching, a memento that redirects to a non-
                # memento is normal and ok; the target of a redirect will
                # rarely have been captured at the same time as the
                # redirect itself. (See 2b)
                playable = False
                first_request = len(history) == 0
                if response.next and (
                    (first_request and not exact) or
                    (not first_request and
                     (previous_was_memento or not exact_redirects))
                ):
                    current_url = original_url_for_memento(response.url)
                    target_url, target_date = memento_url_data(
                        response.next.url)
                    # A non-memento redirect is generally taking us to the
                    # closest-in-time capture of the same URL. Note that is
                    # NOT the next capture -- i.e. the one that would have
                    # been produced by an earlier memento redirect -- it's
                    # just the *closest* one. The first job here is to make
                    # sure it fits within our target window.
                    #
                    # NOTE: this must be `total_seconds()`. `.seconds` is
                    # only the sub-day component of the timedelta, so a
                    # capture many days away could otherwise look "close".
                    distance = abs(target_date - original_date)
                    if distance.total_seconds() <= target_window:
                        # The redirect will point to the closest-in-time
                        # SURT URL, which will often not be an exact URL
                        # match. If we aren't looking for exact matches,
                        # then just assume wherever we're redirecting to is
                        # ok. Otherwise, try to sanity-check the URL.
                        if exact_redirects:
                            # FIXME: what should *really* happen here, if
                            # we want exactness, is a CDX search for the
                            # next-in-time capture of the exact URL we
                            # redirected to. I'm not totally sure how
                            # great that is (also it seems high overhead to
                            # do a search in the middle of this series of
                            # memento lookups), so just do a loose URL
                            # check for now.
                            current_nice_url = protocol_and_www.sub(
                                '', current_url).casefold()
                            target_nice_url = protocol_and_www.sub(
                                '', target_url).casefold()
                            playable = current_nice_url == target_nice_url
                        else:
                            playable = True

                if not playable:
                    message = response.headers.get(
                        'X-Archive-Wayback-Runtime-Error')
                    if message:
                        raise MementoPlaybackError(
                            f'Memento at {url} could not be played: {message}'
                        )
                    elif response.ok:
                        raise MementoPlaybackError(
                            f'Memento at {url} could not be played')
                    else:
                        response.raise_for_status()

            if response.next:
                previous_was_memento = is_memento
                urls.add(response.url)
                # Wayback sometimes has circular memento redirects.
                if response.next.url in urls:
                    raise MementoPlaybackError(
                        f'Memento at {url} is circular')
                history.append(response)
                response = self.session.send(response.next,
                                             allow_redirects=False)
            else:
                break

        response.history = history
        return response
def timestamped_uri_to_version(dt, uri, *, url, maintainers=None, tags=None,
                               view_url=None):
    """
    Fetch version content and combine it with metadata to build a Version.

    Parameters
    ----------
    dt : datetime.datetime
        capture time
    uri : string
        URI of version
    url : string
        page URL
    maintainers : list of string, optional
        Entities responsible for maintaining the page, as a list of strings
    tags : list of string, optional
        Any arbitrary "tags" to apply to the page for categorization
    view_url : string, optional
        The archive.org URL for viewing the page (with rewritten links, etc.)

    Returns
    -------
    dict : Version
        suitable for passing to :class:`Client.add_versions`

    Raises
    ------
    Whatever ``res.raise_for_status()`` raises, when the response is not
    "ok" and has no ``memento-datetime`` header.
    """
    with utils.rate_limited(group='timestamped_uri_to_version'):
        res = utils.retryable_request('GET', uri)

    # IA's memento server responds with the status of the original request, so
    # use the presence of the 'Memento-Datetime' header to determine if we
    # should use the response or there was an actual error.
    if not res.ok and not res.headers.get('memento-datetime'):
        res.raise_for_status()

    version_hash = utils.hash_content(res.content)
    title = utils.extract_title(res.content)
    # Use .get() so a response without a Content-Type header yields an empty
    # media type instead of raising KeyError.
    content_type = (res.headers.get('content-type') or '').split(';', 1)

    # Get all headers from original response
    prefix = 'X-Archive-Orig-'
    original_headers = {
        k[len(prefix):]: v
        for k, v in res.headers.items()
        if k.startswith(prefix)
    }

    redirected_url = None
    redirects = None
    if res.url != uri:
        redirected_url = original_url_for_memento(res.url)
        redirects = [original_url_for_memento(response.url)
                     for response in res.history]
        redirects.append(redirected_url)

    return format_version(url=url, dt=dt, uri=uri,
                          version_hash=version_hash, title=title, tags=tags,
                          maintainers=maintainers, status=res.status_code,
                          mime_type=content_type[0], encoding=res.encoding,
                          headers=original_headers, view_url=view_url,
                          redirected_url=redirected_url, redirects=redirects)