Exemple #1
0
def iso_to_dt(date):
    """Convert an ISO-style timestamp string into a formatted date string.

    The fields are read from fixed character positions
    (``YYYY?MM?DD?HH?MM?SS``); the separator characters are not checked and
    anything after the seconds is ignored. The timestamp is taken to be UTC.

    :param date: Timestamp string with at least 19 characters.
    :return: Whatever ``date_str`` returns for the UTC-aware datetime.
    :raises ValueError: If a field is not an integer or is out of range.
    """
    fields = (int(date[:4]), int(date[5:7]), int(date[8:10]),
              int(date[11:13]), int(date[14:16]), int(date[17:19]))
    # Build the aware datetime directly. Going through time.mktime() would
    # interpret the fields in the server's *local* timezone before labelling
    # the result as UTC, shifting it on any host that does not run in UTC.
    return date_str(datetime(*fields, tzinfo=tzutc()))
Exemple #2
0
    def get_memento(self, req_uri, accept_datetime):
        """Query the wiki for the revision at or before ``accept_datetime``.

        The page at ``req_uri`` is fetched and scraped for its ``EditURI``
        ``<link>`` to locate the wiki API. The page title is read from the
        ``title`` query parameter or, failing that, from the last segment of
        the URI path.

        :param req_uri: URI of the wiki page.
        :param accept_datetime: Requested datetime; formatted with
            ``self.TIMESTAMPFMT`` and sent as ``rvstart``.
        :return: The result of ``self.query``, or ``None`` when fetching or
            parsing the page failed unexpectedly.
        :raises HandlerError: (404) If no title or no API link is found.
        """
        timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
        params = {
            'rvlimit': 1,  # Only need one
            'rvstart': timestamp,  # Start listing from here
            'rvdir': 'older'  # List in decreasing order
        }

        # Finds the API and title using scraping
        api_base_uri = None
        try:
            dom = self.get_xml(req_uri, html=True)
            for link in dom.xpath("//link"):
                # <link> elements lacking rel/href are skipped; indexing the
                # attributes directly raised KeyError and aborted the lookup.
                if link.attrib.get('rel', '').lower() != "edituri":
                    continue
                href = link.attrib.get('href')
                if not href:
                    continue
                api_base_uri = href.split("?")[0]
                if api_base_uri.startswith("//"):
                    # Protocol-relative URI: add the scheme in front only,
                    # leaving any later "//" in the path untouched.
                    api_base_uri = "http:" + api_base_uri

            parsed_url = urlparse.urlparse(req_uri)
            try:
                title = urlparse.parse_qs(parsed_url.query)['title'][0]
            except (KeyError, IndexError):
                # No usable "title" parameter: use the last path segment.
                title = parsed_url.path.split('/')[-1]
            logging.debug(
                "Mediawiki handler: API found: %s, page title parsed to: %s ",
                api_base_uri, title)
            if not title:
                raise HandlerError("Cannot find Title", 404)
            if not api_base_uri:
                raise HandlerError("Cannot find mediawiki API on page", 404)
            title = urllib2.unquote(title)

        except HandlerError:
            # Bare raise keeps the original traceback.
            raise
        except Exception as e:
            # Best effort: any other failure yields an empty response.
            logging.error(
                "MediaWikiHandler: querying and parsing page for title/api "
                "%s. handler will return empty response", e
            )
            return None

        base_uri = api_base_uri.replace("api.php", "index.php")

        return self.query(req_uri, params, title, api_base_uri, base_uri)
Exemple #3
0
    def get_memento(self, req_uri, accept_datetime):
        """Return the first revision and the one closest to a datetime.

        The page at ``req_uri`` is fetched and scraped for its ``EditURI``
        ``<link>`` to locate the wiki API. The page title is read from the
        ``title`` query parameter or, failing that, from the last segment of
        the URI path. The first revision is kept in ``self.inner_cache``,
        keyed by title.

        :param req_uri: URI of the wiki page.
        :param accept_datetime: Requested datetime; formatted with
            ``self.TIMESTAMPFMT`` and sent as ``rvstart``.
        :return: ``[first, memento]``, or ``None`` when fetching or parsing
            the page failed unexpectedly.
        :raises HandlerError: (404) If no title or no API link is found.
        """
        timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
        params = {
            'rvlimit': 1,  # Only need one
            'rvstart': timestamp,  # Start listing from here
            'rvdir': 'older'  # List in decreasing order
        }

        # Finds the API and title using scraping
        api_base_uri = None
        try:
            dom = self.get_xml(req_uri, html=True)
            for link in dom.xpath("//link"):
                # <link> elements lacking rel/href are skipped; indexing the
                # attributes directly raised KeyError and aborted the lookup.
                if link.attrib.get('rel', '').lower() != "edituri":
                    continue
                href = link.attrib.get('href')
                if not href:
                    continue
                api_base_uri = href.split("?")[0]
                if api_base_uri.startswith("//"):
                    # Protocol-relative URI: add the scheme in front only,
                    # leaving any later "//" in the path untouched.
                    api_base_uri = "http:" + api_base_uri

            parsed_url = urlparse.urlparse(req_uri)
            try:
                title = urlparse.parse_qs(parsed_url.query)['title'][0]
            except (KeyError, IndexError):
                # No usable "title" parameter: use the last path segment.
                title = parsed_url.path.split('/')[-1]
            logging.debug(
                "Mediawiki handler: API found: %s, page title parsed to: %s ",
                api_base_uri, title)
            if not title:
                raise HandlerError("Cannot find Title", 404)
            if not api_base_uri:
                raise HandlerError("Cannot find mediawiki API on page", 404)
            title = urllib2.unquote(title)

        except HandlerError:
            # Bare raise keeps the original traceback.
            raise
        except Exception as e:
            # Best effort: any other failure yields an empty response.
            logging.error(
                "MediaWikiHandler: querying and parsing page for title/api %s."
                " Handler will return empty response.", e)
            return None

        base_uri = api_base_uri.replace("api.php", "index.php")

        # The best Memento
        memento = self.query(req_uri, params, title, api_base_uri, base_uri)[0]

        # The first Memento
        if title in self.inner_cache and memento:
            logging.debug("Wiki Handler: found cached first for %s", title)
            first = self.inner_cache[title]
        else:
            logging.debug("Wiki Handler: Querying first for %s", title)
            first_params = {
                'rvlimit': 1,  # Only need one
                'rvstart': '19900101000000',  # Start listing from 1990
                'rvdir': 'newer'  # List in increasing order
            }
            first = self.query(req_uri, first_params, title, api_base_uri,
                               base_uri)[0]
            # Crude bound on memory use: drop the whole cache once it has
            # grown past the configured size.
            if len(self.inner_cache) > self.max_inner_cache_size:
                self.inner_cache = {}
            self.inner_cache[title] = first

        # This handler returns more than only the best Memento.
        # A Link with rel="first memento" will also be returned to the client.
        return [first, memento]
Exemple #4
0
    def get_memento(self, req_uri, accept_datetime):
        """Return the first revision and the one closest to a datetime.

        The page at ``req_uri`` is fetched and scraped for its ``EditURI``
        ``<link>`` to locate the wiki API. The page title is read from the
        ``title`` query parameter or, failing that, from the last segment of
        the URI path. The first revision is kept in ``self.inner_cache``,
        keyed by title.

        :param req_uri: URI of the wiki page.
        :param accept_datetime: Requested datetime; formatted with
            ``self.TIMESTAMPFMT`` and sent as ``rvstart``.
        :return: ``[first, memento]``, or ``None`` when fetching or parsing
            the page failed unexpectedly.
        :raises HandlerError: (404) If no title or no API link is found.
        """
        timestamp = date_str(accept_datetime, self.TIMESTAMPFMT)
        params = {
            "rvlimit": 1,  # Only need one
            "rvstart": timestamp,  # Start listing from here
            "rvdir": "older",  # List in decreasing order
        }

        # Finds the API and title using scraping
        api_base_uri = None
        try:
            dom = self.get_xml(req_uri, html=True)
            for link in dom.xpath("//link"):
                # <link> elements lacking rel/href are skipped; indexing the
                # attributes directly raised KeyError and aborted the lookup.
                if link.attrib.get("rel", "").lower() != "edituri":
                    continue
                href = link.attrib.get("href")
                if not href:
                    continue
                api_base_uri = href.split("?")[0]
                if api_base_uri.startswith("//"):
                    # Protocol-relative URI: add the scheme in front only,
                    # leaving any later "//" in the path untouched.
                    api_base_uri = "http:" + api_base_uri

            parsed_url = urlparse.urlparse(req_uri)
            try:
                title = urlparse.parse_qs(parsed_url.query)["title"][0]
            except (KeyError, IndexError):
                # No usable "title" parameter: use the last path segment.
                title = parsed_url.path.split("/")[-1]
            logging.debug("Mediawiki handler: API found: %s, page title parsed to: %s ", api_base_uri, title)
            if not title:
                raise HandlerError("Cannot find Title", 404)
            if not api_base_uri:
                raise HandlerError("Cannot find mediawiki API on page", 404)
            title = urllib2.unquote(title)

        except HandlerError:
            # Bare raise keeps the original traceback.
            raise
        except Exception as e:
            # Best effort: any other failure yields an empty response.
            logging.error(
                "MediaWikiHandler: querying and parsing page for title/api %s."
                " Handler will return empty response.",
                e,
            )
            return None

        base_uri = api_base_uri.replace("api.php", "index.php")

        # The best Memento
        memento = self.query(req_uri, params, title, api_base_uri, base_uri)[0]

        # The first Memento
        if title in self.inner_cache and memento:
            logging.debug("Wiki Handler: found cached first for %s", title)
            first = self.inner_cache[title]
        else:
            logging.debug("Wiki Handler: Querying first for %s", title)
            first_params = {
                "rvlimit": 1,  # Only need one
                "rvstart": "19900101000000",  # Start listing from 1990
                "rvdir": "newer",  # List in increasing order
            }
            first = self.query(req_uri, first_params, title, api_base_uri, base_uri)[0]
            # Crude bound on memory use: drop the whole cache once it has
            # grown past the configured size.
            if len(self.inner_cache) > self.max_inner_cache_size:
                self.inner_cache = {}
            self.inner_cache[title] = first

        # This handler returns more than only the best Memento.
        # A Link with rel="first memento" will also be returned to the client.
        return [first, memento]