def whereIsYadis(resp):
    """Given a HTTPResponse, return the location of the Yadis document.

    May be the URL just retrieved, another URL, or None, if I can't
    find any.

    The lookup proceeds in three steps:

      1. If the response's content-type names the Yadis media type,
         the response itself is the document, so C{resp.final_url}
         is returned.
      2. Otherwise the Yadis location header of the response is used.
      3. If that header is missing or empty, the response body is
         scanned for an equivalent HTML meta tag.

    [non-blocking]

    @param resp: the fetched response; this function reads its
        C{headers} (looked up with lower-case keys), C{final_url}
        and C{body} attributes.

    @returns: str or None
    """
    # Attempt to find out where to go to discover the document
    # or if we already have it
    content_type = resp.headers.get('content-type')

    # According to the spec, the content-type header must be an exact
    # match, or else we have to look for an indirection.
    if content_type:
        # Only the media type matters; drop any parameters (such as
        # charset) and the optional whitespace that HTTP allows
        # around the ';' separator before comparing.
        media_type = content_type.split(';', 1)[0].strip().lower()
        if media_type == YADIS_CONTENT_TYPE:
            return resp.final_url

    # Try the header
    yadis_loc = resp.headers.get(YADIS_HEADER_NAME.lower())

    if not yadis_loc:
        # Parse as HTML if the header is missing.
        #
        # XXX: do we want to do something with content-type, like
        # have a whitelist or a blacklist (for detecting that it's
        # HTML)?
        try:
            yadis_loc = findHTMLMeta(StringIO(resp.body))
        except MetaNotFound:
            # No meta tag either: fall through and return whatever
            # the header lookup produced (None or an empty value).
            pass

    return yadis_loc
def handle_starttag(self, tag, attrs):
    """Track the document phase and look for the Yadis meta tag.

    The phase moves from TOP to HTML and/or HEAD as the matching
    start tags are seen.  While in the HEAD phase, a meta tag whose
    http-equiv attribute equals YADIS_HEADER_NAME (compared
    case-insensitively) ends the search: the phase is set to FOUND
    and ParseDone is raised carrying the entity-substituted content
    attribute.  A body tag, or a repeated html/head tag, calls
    self._terminate().

    The bracketed numbers in the comments are kept from the original
    code; they appear to refer to a numbered rule list documented
    elsewhere.

    @param tag: the name of the start tag; it is compared against
        lower-case literals, so it is assumed to arrive lower-cased
        (TODO confirm against the parser base class).

    @param attrs: a sequence of (name, value) pairs for the tag's
        attributes.  A value may be None when the attribute was
        written without a value.

    @raises ParseDone: when the Yadis meta tag is found.
    """
    # if we ever see a start body tag, bail out right away, since
    # we want to prevent the meta tag from appearing in the body
    # [2]
    if tag == 'body':
        self._terminate()

    if self.phase == self.TOP:
        # At the top level, allow a html tag or a head tag to move
        # to the head or html phase
        if tag == 'head':
            # [3]
            self.phase = self.HEAD
        elif tag == 'html':
            # [4]
            self.phase = self.HTML

    elif self.phase == self.HTML:
        # if we are in the html tag, allow a head tag to move to
        # the HEAD phase. If we get another html tag, then bail
        # out
        if tag == 'head':
            # [3]
            self.phase = self.HEAD
        elif tag == 'html':
            # [5]
            self._terminate()

    elif self.phase == self.HEAD:
        # If we are in the head phase, look for the appropriate
        # meta tag. If we get a head or body tag, bail out.
        if tag == 'meta':
            attrs_d = dict(attrs)

            # A valueless attribute (<meta http-equiv>) is reported
            # with value None, which has no .lower().  A missing or
            # empty http-equiv cannot name the Yadis header, so this
            # is simply not the tag we are looking for.
            http_equiv = attrs_d.get('http-equiv')
            if not http_equiv:
                return

            if http_equiv.lower() == YADIS_HEADER_NAME.lower():
                # NOTE(review): raw_attr is None when the content
                # attribute is absent or valueless -- confirm that
                # substituteEntities accepts None.
                raw_attr = attrs_d.get('content')
                yadis_loc = substituteEntities(raw_attr)
                # [6]
                self.phase = self.FOUND
                raise ParseDone(yadis_loc)

        elif tag in ['head', 'html']:
            # [5], [7]
            self._terminate()
def handle_starttag(self, tag, attrs):
    """Track the document phase and look for the Yadis meta tag.

    The phase moves from TOP to HTML and/or HEAD as the matching
    start tags are seen.  While in the HEAD phase, a meta tag whose
    http-equiv attribute equals YADIS_HEADER_NAME (compared
    case-insensitively) ends the search: the phase is set to FOUND
    and ParseDone is raised carrying the entity-substituted content
    attribute.  A body tag, or a repeated html/head tag, calls
    self._terminate().

    The bracketed numbers in the comments are kept from the original
    code; they appear to refer to a numbered rule list documented
    elsewhere.

    @param tag: the name of the start tag; it is compared against
        lower-case literals, so it is assumed to arrive lower-cased
        (TODO confirm against the parser base class).

    @param attrs: a sequence of (name, value) pairs for the tag's
        attributes.  A value may be None when the attribute was
        written without a value.

    @raises ParseDone: when the Yadis meta tag is found.
    """
    # if we ever see a start body tag, bail out right away, since
    # we want to prevent the meta tag from appearing in the body
    # [2]
    if tag == 'body':
        self._terminate()

    if self.phase == self.TOP:
        # At the top level, allow a html tag or a head tag to move
        # to the head or html phase
        if tag == 'head':
            # [3]
            self.phase = self.HEAD
        elif tag == 'html':
            # [4]
            self.phase = self.HTML

    elif self.phase == self.HTML:
        # if we are in the html tag, allow a head tag to move to
        # the HEAD phase. If we get another html tag, then bail
        # out
        if tag == 'head':
            # [3]
            self.phase = self.HEAD
        elif tag == 'html':
            # [5]
            self._terminate()

    elif self.phase == self.HEAD:
        # If we are in the head phase, look for the appropriate
        # meta tag. If we get a head or body tag, bail out.
        if tag == 'meta':
            attrs_d = dict(attrs)

            # A valueless attribute (<meta http-equiv>) is reported
            # with value None, which has no .lower().  A missing or
            # empty http-equiv cannot name the Yadis header, so this
            # is simply not the tag we are looking for.
            http_equiv = attrs_d.get('http-equiv')
            if not http_equiv:
                return

            if http_equiv.lower() == YADIS_HEADER_NAME.lower():
                # NOTE(review): raw_attr is None when the content
                # attribute is absent or valueless -- confirm that
                # substituteEntities accepts None.
                raw_attr = attrs_d.get('content')
                yadis_loc = substituteEntities(raw_attr)
                # [6]
                self.phase = self.FOUND
                raise ParseDone(yadis_loc)

        elif tag in ['head', 'html']:
            # [5], [7]
            self._terminate()