Exemple #1
0
def _get_response(req):
    """Open ``req`` with ``urlopen`` and return the response, retrying once.

    If the first attempt raises any ``Exception`` the request is tried a
    second time. If that also fails, a ``WaybackError`` naming
    ``req.full_url`` is raised with the second failure set as its cause.
    """
    try:
        return urlopen(req)  # nosec
    except Exception:
        # First attempt failed: give the request exactly one more try.
        try:
            return urlopen(req)  # nosec
        except Exception as second_failure:
            error = WaybackError("Error while retrieving %s" % req.full_url)
            # Attach the cause by hand instead of using ``raise ... from``.
            error.__cause__ = second_failure
            raise error
Exemple #2
0
 def near(self, **kwargs):
     """Return the archive URL closest to the supplied time.

     Supported keyword arguments are ``year``, ``month``, ``day``,
     ``hour`` and ``minute``. Any that are not supplied default to the
     corresponding component of the current UTC time.

     Returns:
         The ``url`` of the ``closest`` snapshot reported by the
         availability endpoint, with a leading
         ``http://web.archive.org/web/`` rewritten to ``https://``.

     Raises:
         WaybackError: if the response contains no archived snapshots.
     """
     # Read the clock once so every defaulted component describes the same
     # instant (separate reads could straddle a minute/hour rollover).
     now = datetime.utcnow()
     year = kwargs.get("year", now.strftime('%Y'))
     month = kwargs.get("month", now.strftime('%m'))
     day = kwargs.get("day", now.strftime('%d'))
     hour = kwargs.get("hour", now.strftime('%H'))
     minute = kwargs.get("minute", now.strftime('%M'))
     timestamp = self.wayback_timestamp(year=year,
                                        month=month,
                                        day=day,
                                        hour=hour,
                                        minute=minute)
     request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (
         self.clean_url(), str(timestamp))
     hdr = {'User-Agent': '%s' % self.user_agent}
     req = Request(request_url, headers=hdr)  # nosec
     response = self.get_response(req)
     data = json.loads(response.read().decode("UTF-8"))
     if not data["archived_snapshots"]:
         # The original formatted this message with an undefined name
         # ``url``, which raised NameError instead of WaybackError.
         raise WaybackError("'%s' is not yet archived." % self.clean_url())
     archive_url = data["archived_snapshots"]["closest"]["url"]
     # The Wayback Machine sometimes returns an http link even though https
     # is supported, so normalise the scheme of the archive prefix.
     archive_url = archive_url.replace("http://web.archive.org/web/",
                                       "https://web.archive.org/web/", 1)
     return archive_url
Exemple #3
0
 def get_response(self, req):
     """Open ``req`` with ``urlopen`` and return the response, retrying once.

     A second attempt is made if the first raises any ``Exception``; if the
     second also fails, that exception is wrapped in a ``WaybackError``.
     """
     try:
         return urlopen(req)  #nosec
     except Exception:
         # First attempt failed: try exactly one more time.
         try:
             return urlopen(req)  #nosec
         except Exception as second_failure:
             raise WaybackError(second_failure)
Exemple #4
0
 def archive_url_parser(header):
     """Extract the archive URL from a response header.

     The header is converted to text and searched with two patterns in
     order; the first capture group of the first pattern that matches is
     returned. ``WaybackError`` is raised when neither pattern matches.
     """
     header_text = str(header)
     patterns = (
         # A web.archive.org link that follows a rel="memento marker.
         r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>",
         # Fallback: text between "https" and two trailing capitals in the
         # X-Cache-Key header.
         r"X-Cache-Key:\shttps(.*)[A-Z]{2}",
     )
     for pattern in patterns:
         found = re.search(pattern, header_text)
         if found:
             return found.group(1)
     raise WaybackError(
         "No archive url found in the API response. Visit https://github.com/akamhy/waybackpy for latest version of waybackpy.\nHeader:\n%s"
         % header_text)
Exemple #5
0
    def near(self, year=None, month=None, day=None, hour=None, minute=None):
        """Return the closest Wayback Machine archive to the time supplied.

        Supported params are year, month, day, hour and minute. Any
        parameter that is not supplied defaults to the corresponding
        component of the current UTC time.

        On success ``self.archive_url`` is set to the closest snapshot's URL
        (with the ``http://web.archive.org/web/`` prefix rewritten to
        https) and ``self.timestamp`` to that snapshot's timestamp parsed as
        a ``datetime``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            WaybackError: if the response contains no archived snapshots.
        """
        now = datetime.utcnow().timetuple()
        timestamp = _wayback_timestamp(
            # Zero is never a valid year, month or day, so a plain
            # truthiness fallback is kept for these three.
            year=year if year else now.tm_year,
            month=month if month else now.tm_mon,
            day=day if day else now.tm_mday,
            # 0 is a legitimate hour (midnight) and minute (top of the
            # hour); only fall back to the current time when the caller
            # really omitted the value.
            hour=now.tm_hour if hour is None else hour,
            minute=now.tm_min if minute is None else minute,
        )

        request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (
            self._clean_url(),
            timestamp,
        )
        hdr = {"User-Agent": "%s" % self.user_agent}
        req = Request(request_url, headers=hdr)  # nosec
        response = _get_response(req)
        data = json.loads(response.read().decode("UTF-8"))
        if not data["archived_snapshots"]:
            raise WaybackError(
                "Can not find archive for '%s' try later or use wayback.Url(url, user_agent).save() "
                "to create a new archive." % self._clean_url()
            )

        closest = data["archived_snapshots"]["closest"]
        # Normalise an http archive link to https.
        self.archive_url = closest["url"].replace(
            "http://web.archive.org/web/", "https://web.archive.org/web/", 1
        )
        self.timestamp = datetime.strptime(closest["timestamp"], "%Y%m%d%H%M%S")

        return self
Exemple #6
0
def _archive_url_parser(header):
    """Parse out the archive from header."""
    # Regex1
    arch = re.search(
        r"Content-Location: (/web/[0-9]{14}/.*)", str(header)
    )
    if arch:
        return "web.archive.org" + arch.group(1)
    # Regex2
    arch = re.search(
        r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header)
    )
    if arch:
        return arch.group(1)
    # Regex3
    arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
    if arch:
        return arch.group(1)
    raise WaybackError(
        "No archive URL found in the API response. "
        "This version of waybackpy (%s) is likely out of date. Visit "
        "https://github.com/akamhy/waybackpy for the latest version "
        "of waybackpy.\nHeader:\n%s" % (__version__, str(header))
    )