Example #1
 def links(self):
     import _beautifulsoup
     bs = self._bs
     base_url = self._base_url
     encoding = self._encoding
     for ch in bs.recursiveChildGenerator():
         if (isinstance(ch, _beautifulsoup.Tag)
                 and ch.name in self.urltags.keys() + ["base"]):
             link = ch
             attrs = bs.unescape_attrs(link.attrs)
             attrs_dict = dict(attrs)
             if link.name == "base":
                 base_href = attrs_dict.get("href")
                 if base_href is not None:
                     base_url = base_href
                 continue
             url_attr = self.urltags[link.name]
             url = attrs_dict.get(url_attr)
             if not url:
                 continue
             url = _rfc3986.clean_url(url, encoding)
             text = link.fetchText(lambda t: True)
             if not text:
                 # follow _pullparser's weird behaviour rigidly
                 if link.name == "a":
                     text = ""
                 else:
                     text = None
             else:
                 text = self.compress_re.sub(" ", " ".join(text).strip())
             yield Link(base_url, url, text, link.name, attrs)
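A minimal sketch of consuming the generator above. The `doc` object and its construction are hypothetical stand-ins for an instance of whatever class defines this `links()` method; the attribute names mirror the `Link(base_url, url, text, tag, attrs)` constructor call seen above:

    # Hypothetical consumer; `doc` stands in for an instance of the
    # class that defines links().
    for link in doc.links():
        # Attribute names follow the Link(...) constructor arguments.
        print link.base_url, link.url, link.text, link.tag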
Example #2
 def links(self, urltags=None):
     if urltags is None:
         urltags = self.urltags
     bs = self._bs
     base_url = self._base_url
     encoding = self._encoding
     for ch in bs.recursiveChildGenerator():
         if (isinstance(ch, _beautifulsoup.Tag)
                 and ch.name in urltags.keys() + ["base"]):
             link = ch
             attrs = bs.unescape_attrs(link.attrs)
             attrs_dict = dict(attrs)
             if link.name == "base":
                 base_href = attrs_dict.get("href")
                 if base_href is not None:
                     base_url = base_href
                 continue
             url_attr = urltags[link.name]
             url = attrs_dict.get(url_attr)
             if not url:
                 continue
             url = _rfc3986.clean_url(url, encoding)
             text = link.fetchText(lambda t: True)
             if not text:
                 # follow _pullparser's weird behaviour rigidly
                 if link.name == "a":
                     text = ""
                 else:
                     text = None
             else:
                 text = self.compress_re.sub(" ", " ".join(text).strip())
             yield Link(base_url, url, text, link.name, attrs)
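The `urltags` parameter maps a tag name to the attribute that holds its URL, which is how `url_attr = urltags[link.name]` resolves above. A hedged sketch of such a mapping and a call restricted to anchors (the class's actual default mapping may differ, and `doc` is again a hypothetical instance):

    # Illustrative urltags mapping; each entry names the URL-bearing
    # attribute for that tag.  The class default may differ.
    anchor_only = {"a": "href"}
    for link in doc.links(urltags=anchor_only):
        print link.url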
Example #3
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers.getheaders("location")[0]
        elif "uri" in headers:
            newurl = headers.getheaders("uri")[0]
        else:
            return
        newurl = _rfc3986.clean_url(newurl, "latin-1")
        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(newurl, req, fp, code, msg, headers)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, "redirect_dict"):
            visited = new.redirect_dict = req.redirect_dict
            if visited.get(newurl, 0) >= self.max_repeats or len(visited) >= self.max_redirections:
                raise HTTPError(req.get_full_url(), code, self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)
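The loop detection above is plain per-URL visit counting stored on the outgoing request. A standalone sketch of the same bookkeeping, with illustrative values standing in for the handler's `max_repeats` and `max_redirections`:

    # Standalone sketch of the redirect bookkeeping above; the two
    # limits are illustrative, not the handler's real defaults.
    max_repeats = 4
    max_redirections = 10

    def allow_redirect(visited, newurl):
        # Refuse if this URL was already revisited max_repeats times,
        # or if the redirect chain as a whole is too long.
        if (visited.get(newurl, 0) >= max_repeats or
                len(visited) >= max_redirections):
            return False
        visited[newurl] = visited.get(newurl, 0) + 1
        return True

    visited = {}
    assert allow_redirect(visited, "http://example.com/a")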
Example #4
    def links(self, urltags=None):
        """Return an iterator that provides links of the document."""
        if urltags is None:
            urltags = self.urltags
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        response.seek(0)
        p = self.link_parser_class(response, encoding=encoding)

        try:
            for token in p.tags(*(urltags.keys() + ["base"])):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                #  provide one by other means
                # attr_encoding = attrs.get("charset")
                url = attrs.get(urltags[tag])  # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for e.g.
                    # <a href="blah"><b>Andy</b></a>
                    # text = p.get_compressed_text()

                yield Link(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError as exc:
            raise _form.ParseError(exc)
Example #5
    def links(self):
        """Return an iterator that provides links of the document."""
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)

        try:
            for token in p.tags(*(self.urltags.keys() + ["base"])):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                #  provide one by other means
                # attr_encoding = attrs.get("charset")
                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for e.g.
                    # <a href="blah"><b>Andy</b></a>
                    # text = p.get_compressed_text()

                yield Link(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError as exc:
            raise ParseError(exc)
Example #6
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        newurl = _rfc3986.clean_url(newurl, "latin-1")
        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(newurl, req, fp, code, msg, headers)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)
Example #7
def clean_refresh_url(url):
    # e.g. Firefox 1.5 does (something like) this
    if ((url.startswith('"') and url.endswith('"')) or
        (url.startswith("'") and url.endswith("'"))):
        url = url[1:-1]
    return _rfc3986.clean_url(url, "latin-1")  # XXX encoding
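A quick illustration of the quote-stripping above, mimicking the quoted URL some browsers emit in a Refresh header. It assumes the surrounding module context (so `_rfc3986` is importable); the URL itself is made up:

    # Both quote styles are stripped before the RFC 3986 clean-up.
    print clean_refresh_url('"http://example.com/next"')
    print clean_refresh_url("'http://example.com/next'")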