def _get_response(req):
    """Open ``req`` with ``urlopen``, retrying once if the first attempt fails.

    Returns whatever ``urlopen`` returns for the request.

    Raises WaybackError (with the second failure attached as ``__cause__``)
    when both attempts raise.
    """
    try:
        return urlopen(req)  # nosec
    except Exception:
        # The first failure is discarded; one more attempt is made before
        # giving up.
        try:
            return urlopen(req)  # nosec
        except Exception as second_failure:
            error = WaybackError("Error while retrieving %s" % req.full_url)
            error.__cause__ = second_failure
            raise error
def near(self, **kwargs):
    """
    Return the URL of the Wayback Machine archive closest to the time supplied.

    Supported keyword arguments are ``year``, ``month``, ``day``, ``hour``
    and ``minute``. Any component that is not supplied defaults to the
    matching component of the current UTC time.

    The returned archive URL always uses the ``https`` scheme.

    Raises WaybackError when the availability API reports no archived
    snapshot for the URL.
    """
    # Read the clock once so every defaulted component comes from the same
    # instant, even when the call straddles a minute/hour/day boundary.
    now = datetime.utcnow()
    year = kwargs.get("year", now.strftime('%Y'))
    month = kwargs.get("month", now.strftime('%m'))
    day = kwargs.get("day", now.strftime('%d'))
    hour = kwargs.get("hour", now.strftime('%H'))
    minute = kwargs.get("minute", now.strftime('%M'))
    timestamp = self.wayback_timestamp(
        year=year, month=month, day=day, hour=hour, minute=minute
    )
    # The two query parameters must be joined with "&"; otherwise the
    # timestamp is swallowed into the url value and never reaches the API.
    request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (
        self.clean_url(),
        str(timestamp),
    )
    hdr = {'User-Agent': '%s' % self.user_agent}
    req = Request(request_url, headers=hdr)  # nosec
    response = self.get_response(req)
    data = json.loads(response.read().decode("UTF-8"))
    if not data["archived_snapshots"]:
        # Report the URL that was actually queried.
        raise WaybackError("'%s' is not yet archived." % self.clean_url())
    archive_url = data["archived_snapshots"]["closest"]["url"]
    # The API sometimes answers with an http link; the same path is
    # reachable over https, so upgrade the scheme.
    archive_url = archive_url.replace(
        "http://web.archive.org/web/", "https://web.archive.org/web/", 1
    )
    return archive_url
def get_response(self, req):
    """Open ``req`` with ``urlopen``, retrying once if the first attempt fails.

    Returns whatever ``urlopen`` returns for the request.

    Raises WaybackError wrapping the second failure when both attempts raise.
    """
    try:
        return urlopen(req)  # nosec
    except Exception:
        # The first failure is discarded; one more attempt is made before
        # giving up.
        try:
            return urlopen(req)  # nosec
        except Exception as second_failure:
            raise WaybackError(second_failure)
def archive_url_parser(header):
    """Extract the archive location from a response header.

    The header is converted with ``str()`` and searched with two patterns in
    order; the first capture group of the first pattern that matches is
    returned.

    Raises WaybackError (including the header text) when neither pattern
    matches.
    """
    header_text = str(header)
    patterns = (
        # Regex1
        r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>",
        # Regex2
        r"X-Cache-Key:\shttps(.*)[A-Z]{2}",
    )
    for pattern in patterns:
        found = re.search(pattern, header_text)
        if found:
            return found.group(1)
    raise WaybackError(
        "No archive url found in the API response. Visit https://github.com/akamhy/waybackpy for latest version of waybackpy.\nHeader:\n%s"
        % header_text
    )
def near(self, year=None, month=None, day=None, hour=None, minute=None):
    """
    Look up the Wayback Machine archive closest to the time supplied.

    Supported params are year, month, day, hour and minute. Any parameter
    left as ``None`` defaults to the matching component of the current UTC
    time. Explicit zero values (for example ``hour=0`` or ``minute=0``) are
    honoured rather than being replaced by the current time.

    On success, stores the https archive URL in ``self.archive_url`` and the
    snapshot time (parsed from the API's ``%Y%m%d%H%M%S`` timestamp) in
    ``self.timestamp``, then returns ``self``.

    Raises WaybackError when the availability API reports no archived
    snapshot for the URL.
    """
    now = datetime.utcnow().timetuple()
    # Test against None, not truthiness: 0 is a valid hour and minute.
    timestamp = _wayback_timestamp(
        year=now.tm_year if year is None else year,
        month=now.tm_mon if month is None else month,
        day=now.tm_mday if day is None else day,
        hour=now.tm_hour if hour is None else hour,
        minute=now.tm_min if minute is None else minute,
    )
    # The two query parameters must be joined with "&"; otherwise the
    # timestamp is swallowed into the url value and never reaches the API.
    request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (
        self._clean_url(),
        timestamp,
    )
    hdr = {"User-Agent": "%s" % self.user_agent}
    req = Request(request_url, headers=hdr)  # nosec
    response = _get_response(req)
    data = json.loads(response.read().decode("UTF-8"))
    if not data["archived_snapshots"]:
        raise WaybackError(
            "Can not find archive for '%s' try later or use wayback.Url(url, user_agent).save() "
            "to create a new archive." % self._clean_url()
        )
    closest = data["archived_snapshots"]["closest"]
    # The API may answer with an http link; upgrade it to https.
    self.archive_url = closest["url"].replace(
        "http://web.archive.org/web/", "https://web.archive.org/web/", 1
    )
    self.timestamp = datetime.strptime(closest["timestamp"], '%Y%m%d%H%M%S')
    return self
def _archive_url_parser(header):
    """Extract the archive location from a response header.

    The header is converted with ``str()`` and searched with three patterns
    in order. A ``Content-Location`` match yields a path, which is returned
    with ``web.archive.org`` prepended; the other two patterns return their
    first capture group unchanged.

    Raises WaybackError (including the package version and the header text)
    when no pattern matches.
    """
    header_text = str(header)
    # (prefix, pattern) pairs, tried top to bottom.
    candidates = (
        # Regex1
        ("web.archive.org", r"Content-Location: (/web/[0-9]{14}/.*)"),
        # Regex2
        ("", r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"),
        # Regex3
        ("", r"X-Cache-Key:\shttps(.*)[A-Z]{2}"),
    )
    for prefix, pattern in candidates:
        found = re.search(pattern, header_text)
        if found:
            return prefix + found.group(1)
    raise WaybackError(
        "No archive URL found in the API response. "
        "This version of waybackpy (%s) is likely out of date. Visit "
        "https://github.com/akamhy/waybackpy for the latest version "
        "of waybackpy.\nHeader:\n%s" % (__version__, header_text)
    )