Python is_html Exemples, mechanize._headersutil.is_html Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : commands.py Projet : brandizzi/retwill

def info():
    """
    >> info

    Report information on current page.
    """
    # Everything below is emitted through ``logger``: URL, HTTP code and
    # content type for any page, plus title and form count for HTML pages.
    current_url = browser.get_url()
    if current_url is None:
        # No page has been loaded yet, so there is nothing to describe.
        logger.warning("We're not on a page!")
        return

    content_type = browser._browser._response.info().getheaders("content-type")
    check_html = is_html(content_type, current_url)
    code = browser.get_code()

    logger.info('\nPage information:')
    logger.info('\tURL: %s', current_url)
    logger.info('\tHTTP code: %s', code)

    # The header list can be empty when the response carries no Content-Type
    # (is_html() copes with that); indexing it unconditionally would raise
    # IndexError, so fall back to an empty value in that case.
    first_content_type = content_type[0] if content_type else ''
    parts = ['\tContent type: ', first_content_type]
    if check_html:
        parts.append('(HTML)')
    logger.info("".join(parts))

    if check_html:
        title = browser.get_title()
        logger.info('\tPage title: %s', title)

        # Only mention forms when the page actually has some.
        form_count = len(browser.get_all_forms())
        if form_count:
            logger.info('\tThis page contains %d form(s)', form_count)

    logger.info('')

Exemple #2

0

Afficher le fichier

Fichier : test_headers.py Projet : suhelhammoud/eclipseworkspace

 def test_is_html(self):
     """Exercise is_html() over header/extension pairs, with XHTML both
     disallowed and allowed."""
     from mechanize._headersutil import is_html

     url_prefix = "http://example.com/foo"
     for xhtml_ok in (False, True):
         # (Content-Type headers, URL extension, expected verdict)
         table = [
             (["text/html"], ".html", True),
             (["text/html", "text/plain"], ".html", True),
             # Content-type takes priority over file extension from URL
             (["text/html"], ".txt", True),
             (["text/plain"], ".html", False),
             # use extension if no Content-Type
             ([], ".html", True),
             ([], ".gif", False),
             # don't regard XHTML as HTML (unless user explicitly asks for
             # it), since we don't yet handle XML properly
             ([], ".xhtml", xhtml_ok),
             (["text/xhtml"], ".xhtml", xhtml_ok),
         ]
         for headers, suffix, wanted in table:
             verdict = is_html(headers, url_prefix + suffix, xhtml_ok)
             self.assertEqual(wanted, verdict)

Exemple #3

0

Afficher le fichier

Fichier : boobot.py Projet : lissyx/weboob

    def urlinfo(self, url, maxback=2):
        if urlparse.urlsplit(url).netloc == 'mobile.twitter.com':
            url = url.replace('mobile.twitter.com', 'twitter.com', 1)
        try:
            r = self.openurl(HeadRequest(url), _tries=2, _delay=0.2)
            body = False
        except BrowserUnavailable as e:
            if u'HTTP Error 501' in unicode(e) or u'HTTP Error 405' in unicode(
                    e):
                r = self.openurl(url, _tries=2, _delay=0.2)
                body = True
            elif u'HTTP Error 404' in unicode(e) \
                    and maxback and not url[-1].isalnum():
                return self.urlinfo(url[:-1], maxback - 1)
            else:
                raise e
        headers = r.info()
        content_type = headers.get('Content-Type')
        try:
            size = int(headers.get('Content-Length'))
            hsize = self.human_size(size)
        except TypeError:
            size = None
            hsize = None
        is_html = headersutil.is_html([content_type], url, True)
        title = None
        if is_html:
            if not body:
                r = self.openurl(url, _tries=2, _delay=0.2)
            # update size has we might not have it from headers
            size = len(r.read())
            hsize = self.human_size(size)
            r.seek(0)

            encoding = EncodingFinder('windows-1252').encoding(r).lower()
            try:
                h = self.get_document(r, parser='lxml', encoding=encoding)
                for meta in h.xpath('//head/meta'):
                    # meta http-equiv=content-type content=...
                    if meta.attrib.get('http-equiv',
                                       '').lower() == 'content-type':
                        for k, v in headersutil.split_header_words(
                            [meta.attrib.get('content', '')]):
                            if k == 'charset':
                                encoding = v
                    # meta charset=...
                    encoding = meta.attrib.get('charset', encoding).lower()
            except Exception as e:
                print e
            finally:
                r.seek(0)
            if encoding == 'iso-8859-1' or not encoding:
                encoding = 'windows-1252'
            try:
                codecs.lookup(encoding)
            except LookupError:
                encoding = 'windows-1252'

            try:
                h = self.get_document(r, parser='lxml', encoding=encoding)
                for title in h.xpath('//head/title'):
                    title = to_unicode(title.text_content()).strip()
                    title = ' '.join(title.split())
                if urlparse.urlsplit(url).netloc.endswith('twitter.com'):
                    for title in h.getroot().cssselect(
                            '.permalink-tweet .tweet-text'):
                        title = to_unicode(title.text_content()).strip()
                        title = ' '.join(title.splitlines())
            except AssertionError as e:
                # invalid HTML
                print e

        return content_type, hsize, title

Exemple #4

0

Afficher le fichier

Fichier : boobot.py Projet : Boussadia/weboob

    def urlinfo(self, url, maxback=2):
        if urlparse.urlsplit(url).netloc == 'mobile.twitter.com':
            url = url.replace('mobile.twitter.com', 'twitter.com', 1)
        try:
            r = self.openurl(HeadRequest(url), _tries=2, _delay=0.2)
            body = False
        except BrowserUnavailable as e:
            if u'HTTP Error 501' in unicode(e) or u'HTTP Error 405' in unicode(e):
                r = self.openurl(url, _tries=2, _delay=0.2)
                body = True
            elif u'HTTP Error 404' in unicode(e) \
                    and maxback and not url[-1].isalnum():
                return self.urlinfo(url[:-1], maxback-1)
            else:
                raise e
        headers = r.info()
        content_type = headers.get('Content-Type')
        try:
            size = int(headers.get('Content-Length'))
            hsize = self.human_size(size)
        except TypeError:
            size = None
            hsize = None
        is_html = headersutil.is_html([content_type], url, True)
        title = None
        if is_html:
            if not body:
                r = self.openurl(url, _tries=2, _delay=0.2)
            # update size has we might not have it from headers
            size = len(r.read())
            hsize = self.human_size(size)
            r.seek(0)

            encoding = EncodingFinder('windows-1252').encoding(r).lower()
            try:
                h = self.get_document(r, parser='lxml', encoding=encoding)
                for meta in h.xpath('//head/meta'):
                    # meta http-equiv=content-type content=...
                    if meta.attrib.get('http-equiv', '').lower() == 'content-type':
                        for k, v in headersutil.split_header_words([meta.attrib.get('content', '')]):
                            if k == 'charset':
                                encoding = v
                    # meta charset=...
                    encoding = meta.attrib.get('charset', encoding).lower()
            except Exception as e:
                print e
            finally:
                r.seek(0)
            if encoding == 'iso-8859-1' or not encoding:
                encoding = 'windows-1252'
            try:
                codecs.lookup(encoding)
            except LookupError:
                encoding = 'windows-1252'

            try:
                h = self.get_document(r, parser='lxml', encoding=encoding)
                for title in h.xpath('//head/title'):
                    title = to_unicode(title.text_content()).strip()
                    title = ' '.join(title.split())
                if urlparse.urlsplit(url).netloc.endswith('twitter.com'):
                    for title in h.getroot().cssselect('.permalink-tweet .tweet-text'):
                        title = to_unicode(title.text_content()).strip()
                        title = ' '.join(title.splitlines())
            except AssertionError as e:
                # invalid HTML
                print e

        return content_type, hsize, title