def info():
    """
    >> info

    Report information on current page.
    """
    current_url = browser.get_url()
    if current_url is None:
        logger.warning("We're not on a page!")
        return

    content_type = browser._browser._response.info().getheaders("content-type")
    check_html = is_html(content_type, current_url)
    code = browser.get_code()

    logger.info('\nPage information:')
    logger.info('\tURL: %s', current_url)
    logger.info('\tHTTP code: %s', code)
    # Append the "(HTML)" tag only when the content type was detected as HTML.
    suffix = '(HTML)' if check_html else ''
    logger.info('\tContent type: %s%s' % (content_type[0], suffix))

    if check_html:
        logger.info('\tPage title: %s', browser.get_title())
        forms = browser.get_all_forms()
        if len(forms):
            logger.info('\tThis page contains %d form(s)', len(forms))
    logger.info('')
def test_is_html(self):
    """Exercise mechanize's is_html() heuristic over content types and URLs."""
    from mechanize._headersutil import is_html
    for allow_xhtml in (False, True):
        cases = [
            (["text/html"], ".html", True),
            (["text/html", "text/plain"], ".html", True),
            # Content-type takes priority over file extension from URL
            (["text/html"], ".txt", True),
            (["text/plain"], ".html", False),
            # use extension if no Content-Type
            ([], ".html", True),
            ([], ".gif", False),
            # don't regard XHTML as HTML (unless user explicitly asks for it),
            # since we don't yet handle XML properly
            ([], ".xhtml", allow_xhtml),
            (["text/xhtml"], ".xhtml", allow_xhtml),
        ]
        for content_types, extension, expected in cases:
            url = "http://example.com/foo" + extension
            self.assertEqual(expected, is_html(content_types, url, allow_xhtml))
def urlinfo(self, url, maxback=2):
    """Fetch metadata about ``url``.

    Issues a HEAD request first (falling back to a full GET when the
    server rejects HEAD with a 501 or 405), retries with the trailing
    non-alphanumeric character stripped on a 404 (at most ``maxback``
    times), and, when the response is HTML, parses the document to
    extract its title.

    :param url: the URL to inspect
    :param maxback: how many trailing punctuation characters may be
        stripped while retrying after a 404
    :return: a ``(content_type, human_readable_size, title)`` tuple;
        the last two elements may be ``None``
    """
    # mobile.twitter.com pages are handled through the desktop domain.
    if urlparse.urlsplit(url).netloc == 'mobile.twitter.com':
        url = url.replace('mobile.twitter.com', 'twitter.com', 1)
    try:
        # Try a cheap HEAD request first; body=False means we have no content yet.
        r = self.openurl(HeadRequest(url), _tries=2, _delay=0.2)
        body = False
    except BrowserUnavailable as e:
        if u'HTTP Error 501' in unicode(e) or u'HTTP Error 405' in unicode(
                e):
            # Server does not support HEAD: fall back to a normal GET.
            r = self.openurl(url, _tries=2, _delay=0.2)
            body = True
        elif u'HTTP Error 404' in unicode(e) \
                and maxback and not url[-1].isalnum():
            # The URL may have picked up trailing punctuation (e.g. from
            # chat text); retry without the last character.
            return self.urlinfo(url[:-1], maxback - 1)
        else:
            raise e
    headers = r.info()
    content_type = headers.get('Content-Type')
    try:
        # Content-Length may be absent (None), in which case int() raises.
        size = int(headers.get('Content-Length'))
        hsize = self.human_size(size)
    except TypeError:
        size = None
        hsize = None
    is_html = headersutil.is_html([content_type], url, True)
    title = None
    if is_html:
        if not body:
            # We only did a HEAD so far; fetch the actual document.
            r = self.openurl(url, _tries=2, _delay=0.2)
        # update size as we might not have it from headers
        size = len(r.read())
        hsize = self.human_size(size)
        # NOTE(review): assumes self.openurl returns a seekable response.
        r.seek(0)
        # First pass: sniff the encoding, then refine it from <meta> tags.
        encoding = EncodingFinder('windows-1252').encoding(r).lower()
        try:
            h = self.get_document(r, parser='lxml', encoding=encoding)
            for meta in h.xpath('//head/meta'):
                # meta http-equiv=content-type content=...
                if meta.attrib.get('http-equiv', '').lower() == 'content-type':
                    for k, v in headersutil.split_header_words(
                            [meta.attrib.get('content', '')]):
                        if k == 'charset':
                            encoding = v
                # meta charset=...
                encoding = meta.attrib.get('charset', encoding).lower()
        except Exception as e:
            print e
        finally:
            # Rewind so the document can be parsed again below.
            r.seek(0)
        # lxml/libxml quirk workaround: prefer windows-1252 over its
        # iso-8859-1 subset, and over anything Python cannot decode.
        if encoding == 'iso-8859-1' or not encoding:
            encoding = 'windows-1252'
        try:
            codecs.lookup(encoding)
        except LookupError:
            encoding = 'windows-1252'
        try:
            # Second pass: parse with the refined encoding and grab the
            # last <title> text, whitespace-normalized.
            h = self.get_document(r, parser='lxml', encoding=encoding)
            for title in h.xpath('//head/title'):
                title = to_unicode(title.text_content()).strip()
                title = ' '.join(title.split())
            if urlparse.urlsplit(url).netloc.endswith('twitter.com'):
                # On Twitter, prefer the tweet text itself over the page title.
                for title in h.getroot().cssselect(
                        '.permalink-tweet .tweet-text'):
                    title = to_unicode(title.text_content()).strip()
                    title = ' '.join(title.splitlines())
        except AssertionError as e:
            # invalid HTML
            print e
    return content_type, hsize, title
def urlinfo(self, url, maxback=2): if urlparse.urlsplit(url).netloc == 'mobile.twitter.com': url = url.replace('mobile.twitter.com', 'twitter.com', 1) try: r = self.openurl(HeadRequest(url), _tries=2, _delay=0.2) body = False except BrowserUnavailable as e: if u'HTTP Error 501' in unicode(e) or u'HTTP Error 405' in unicode(e): r = self.openurl(url, _tries=2, _delay=0.2) body = True elif u'HTTP Error 404' in unicode(e) \ and maxback and not url[-1].isalnum(): return self.urlinfo(url[:-1], maxback-1) else: raise e headers = r.info() content_type = headers.get('Content-Type') try: size = int(headers.get('Content-Length')) hsize = self.human_size(size) except TypeError: size = None hsize = None is_html = headersutil.is_html([content_type], url, True) title = None if is_html: if not body: r = self.openurl(url, _tries=2, _delay=0.2) # update size has we might not have it from headers size = len(r.read()) hsize = self.human_size(size) r.seek(0) encoding = EncodingFinder('windows-1252').encoding(r).lower() try: h = self.get_document(r, parser='lxml', encoding=encoding) for meta in h.xpath('//head/meta'): # meta http-equiv=content-type content=... if meta.attrib.get('http-equiv', '').lower() == 'content-type': for k, v in headersutil.split_header_words([meta.attrib.get('content', '')]): if k == 'charset': encoding = v # meta charset=... 
encoding = meta.attrib.get('charset', encoding).lower() except Exception as e: print e finally: r.seek(0) if encoding == 'iso-8859-1' or not encoding: encoding = 'windows-1252' try: codecs.lookup(encoding) except LookupError: encoding = 'windows-1252' try: h = self.get_document(r, parser='lxml', encoding=encoding) for title in h.xpath('//head/title'): title = to_unicode(title.text_content()).strip() title = ' '.join(title.split()) if urlparse.urlsplit(url).netloc.endswith('twitter.com'): for title in h.getroot().cssselect('.permalink-tweet .tweet-text'): title = to_unicode(title.text_content()).strip() title = ' '.join(title.splitlines()) except AssertionError as e: # invalid HTML print e return content_type, hsize, title