    def test_split_header_words(self):
        from mechanize._headersutil import split_header_words

        tests = [
            ("foo", [[("foo", None)]]),
            ("foo=bar", [[("foo", "bar")]]),
            ("   foo   ", [[("foo", None)]]),
            ("   foo=   ", [[("foo", "")]]),
            ("   foo=", [[("foo", "")]]),
            ("   foo=   ; ", [[("foo", "")]]),
            ("   foo=   ; bar= baz ", [[("foo", ""), ("bar", "baz")]]),
            ("foo=bar bar=baz", [[("foo", "bar"), ("bar", "baz")]]),
            # doesn't really matter if this next fails, but it works ATM
            ("foo= bar=baz", [[("foo", "bar=baz")]]),
            ("foo=bar;bar=baz", [[("foo", "bar"), ("bar", "baz")]]),
            ('foo bar baz', [[("foo", None), ("bar", None), ("baz", None)]]),
            ("a, b, c", [[("a", None)], [("b", None)], [("c", None)]]),
            (r'foo; bar=baz, spam=, foo="\,\;\"", bar= ',
             [[("foo", None), ("bar", "baz")],
              [("spam", "")], [("foo", ',;"')], [("bar", "")]]),
            ]

        for arg, expect in tests:
            try:
                result = split_header_words([arg])
            except Exception:
                import traceback
                from io import StringIO
                f = StringIO()
                traceback.print_exc(None, f)
                result = "(error -- traceback follows)\n\n%s" % f.getvalue()
            assert result == expect, """
When parsing: '%s'
Expected:     '%s'
Got:          '%s'
""" % (arg, expect, result)
    def test_roundtrip(self):
        from mechanize._headersutil import split_header_words, join_header_words

        tests = [
            ("foo", "foo"),
            ("foo=bar", "foo=bar"),
            ("   foo   ", "foo"),
            ("foo=", 'foo=""'),
            ("foo=bar bar=baz", "foo=bar; bar=baz"),
            ("foo=bar;bar=baz", "foo=bar; bar=baz"),
            ('foo bar baz', "foo; bar; baz"),
            (r'foo="\"" bar="\\"', r'foo="\""; bar="\\"'),
            ('foo,,,bar', 'foo, bar'),
            ('foo=bar,bar=baz', 'foo=bar, bar=baz'),

            ('text/html; charset=iso-8859-1',
             'text/html; charset="iso-8859-1"'),

            ('foo="bar"; port="80,81"; discard, bar=baz',
             'foo=bar; port="80,81"; discard, bar=baz'),

            (r'Basic realm="\"foo\\\\bar\""',
             r'Basic; realm="\"foo\\\\bar\""')
            ]

        for arg, expect in tests:
            input = split_header_words([arg])
            res = join_header_words(input)
            assert res == expect, """
When parsing: '%s'
Expected:     '%s'
Got:          '%s'
Input was:    '%s'""" % (arg, expect, res, input)
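The round trip the test exercises can also serve to normalize a raw header value. A small sketch (normalize_header is a hypothetical name, not part of mechanize):

from mechanize._headersutil import split_header_words, join_header_words

def normalize_header(value):
    # Splitting and re-joining canonicalizes separators,
    # spacing and quoting, as the expected values above show.
    return join_header_words(split_header_words([value]))

print(normalize_header('foo=bar;bar=baz'))  # -> foo=bar; bar=baz
print(normalize_header('foo,,,bar'))        # -> foo, bar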
Example #5
    def urlinfo(self, url, maxback=2):
        if urlparse.urlsplit(url).netloc == 'mobile.twitter.com':
            url = url.replace('mobile.twitter.com', 'twitter.com', 1)
        try:
            r = self.openurl(HeadRequest(url), _tries=2, _delay=0.2)
            body = False
        except BrowserUnavailable as e:
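            # Servers that do not implement HEAD typically answer
            # 501 or 405; fall back to a plain GET in that case.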
            if u'HTTP Error 501' in unicode(e) or u'HTTP Error 405' in unicode(e):
                r = self.openurl(url, _tries=2, _delay=0.2)
                body = True
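            # On a 404, the URL may carry glued-on trailing punctuation;
            # strip one character and retry, at most `maxback` times.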
            elif u'HTTP Error 404' in unicode(e) \
                    and maxback and not url[-1].isalnum():
                return self.urlinfo(url[:-1], maxback - 1)
            else:
                raise  # bare raise preserves the original traceback
        headers = r.info()
        content_type = headers.get('Content-Type')
        try:
            size = int(headers.get('Content-Length'))
            hsize = self.human_size(size)
        except TypeError:
            size = None
            hsize = None
        is_html = headersutil.is_html([content_type], url, True)
        title = None
        if is_html:
            if not body:
                r = self.openurl(url, _tries=2, _delay=0.2)
            # update size as we might not have it from the headers
            size = len(r.read())
            hsize = self.human_size(size)
            r.seek(0)

            encoding = EncodingFinder('windows-1252').encoding(r).lower()
            try:
                h = self.get_document(r, parser='lxml', encoding=encoding)
                for meta in h.xpath('//head/meta'):
                    # meta http-equiv=content-type content=...
                    if meta.attrib.get('http-equiv',
                                       '').lower() == 'content-type':
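                        # split_header_words() returns a list of lists
                        # of (key, value) pairs, so walk both levels.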
                        for words in headersutil.split_header_words(
                                [meta.attrib.get('content', '')]):
                            for k, v in words:
                                if k == 'charset':
                                    encoding = v
                    # meta charset=...
                    encoding = meta.attrib.get('charset', encoding).lower()
            except Exception as e:
                print(e)
            finally:
                r.seek(0)
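            # Browsers effectively treat iso-8859-1 as its superset
            # windows-1252, so decode with the superset.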
            if encoding == 'iso-8859-1' or not encoding:
                encoding = 'windows-1252'
            try:
                codecs.lookup(encoding)
            except LookupError:
                encoding = 'windows-1252'

            try:
                h = self.get_document(r, parser='lxml', encoding=encoding)
                for title in h.xpath('//head/title'):
                    title = to_unicode(title.text_content()).strip()
                    title = ' '.join(title.split())
                if urlparse.urlsplit(url).netloc.endswith('twitter.com'):
                    for title in h.getroot().cssselect(
                            '.permalink-tweet .tweet-text'):
                        title = to_unicode(title.text_content()).strip()
                        title = ' '.join(title.splitlines())
            except AssertionError as e:
                # invalid HTML
                print(e)

        return content_type, hsize, title
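The meta-charset handling above amounts to walking the nested structure that split_header_words returns. A standalone sketch of just that step (charset_from_meta_content is a hypothetical helper, not part of mechanize):

from mechanize._headersutil import split_header_words

def charset_from_meta_content(content):
    # Outer list: one entry per comma-separated item; inner list:
    # the (key, value) parameter pairs of that item.
    for words in split_header_words([content]):
        for k, v in words:
            if k == 'charset':
                return v
    return None

print(charset_from_meta_content('text/html; charset=utf-8'))  # -> utf-8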