def title(self): import _pullparser p = _pullparser.TolerantPullParser(self._response, encoding=self._encoding) try: try: p.get_tag("title") except _pullparser.NoMoreTokensError: return None else: return self._get_title_text(p) except sgmllib.SGMLParseError, exc: raise _form.ParseError(exc)
def links(self): """Return an iterator that provides links of the document.""" response = self._response encoding = self._encoding base_url = self._base_url p = self.link_parser_class(response, encoding=encoding) try: for token in p.tags(*(self.urltags.keys() + ["base"])): if token.type == "endtag": continue if token.data == "base": base_href = dict(token.attrs).get("href") if base_href is not None: base_url = base_href continue attrs = dict(token.attrs) tag = token.data text = None # XXX use attr_encoding for ref'd doc if that doc does not # provide one by other means #attr_encoding = attrs.get("charset") url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL? if not url: # Probably an <A NAME="blah"> link or <AREA NOHREF...>. # For our purposes a link is something with a URL, so # ignore this. continue url = _rfc3986.clean_url(url, encoding) if tag == "a": if token.type != "startendtag": # hmm, this'd break if end tag is missing text = p.get_compressed_text(("endtag", tag)) # but this doesn't work for e.g. # <a href="blah"><b>Andy</b></a> #text = p.get_compressed_text() yield Link(base_url, url, text, tag, token.attrs) except sgmllib.SGMLParseError, exc: raise _form.ParseError(exc)