Ejemplo n.º 1
0
def test_get_gzip():
    """GET httpbin's gzip endpoint; the decoded body must report gzipped=True."""
    endpoint = 'http://httpbin.org/gzip'
    for index, readable in enumerate(URL(endpoint).get(decode_content=False)):
        response = Response.from_readable(readable)
        payload = json.load(utf8_reader(decode(response)))
        assert payload.get('gzipped')
    # enumerate index of the last (and only) response must be 0
    assert index == 0
Ejemplo n.º 2
0
def test_get_gzip():
    """The gzip endpoint should yield a single response whose JSON says gzipped."""
    url = 'http://httpbin.org/gzip'
    responses = URL(url).get(decode_content=False)
    for position, readable in enumerate(responses):
        parsed = json.load(utf8_reader(decode(Response.from_readable(readable))))
        assert parsed.get('gzipped')
    # exactly one response expected, so the final index is 0
    assert position == 0
Ejemplo n.º 3
0
def urls_from_urlset_or_sitemapindex(response):
    """ Yields URLs from ``<urlset>`` or ``<sitemapindex>`` elements as per
        `sitemaps.org <http://www.sitemaps.org/protocol.html>`_.

        Each item is a ``("url", URL)`` pair; entries found under a
        ``<sitemap>`` element get ``sitemap=True`` in their fragment dict
        to help downstream processing.
    """

    # Only attempt parsing when the URL fragment flags this as a sitemap
    # or the Content-Type subtype mentions xml (handles e.g. "rss+xml").
    sitemap = URL(response.url).fragment_dict.get('sitemap')
    content_subtypes = response.headers.get_content_subtype().split('+')
    if not sitemap and 'xml' not in content_subtypes:
        return

    root = None
    try:
        for _, elem in iterparse(decode(response)):

            if root is None:
                root = elem.getroottree().getroot()
                if not (root.tag.endswith('}sitemapindex')
                        or root.tag.endswith('}urlset')):
                    # root element has wrong tag - give up
                    break

            if elem.tag.endswith('}loc') and elem.text is not None:
                text = elem.text.strip()
                if text:
                    # http://www.sitemaps.org/protocol.html#locdef
                    url = URL(urljoin(response.url, text))
                    if elem.getparent().tag.endswith('}sitemap'):
                        # set sitemap=True to help downstream processing
                        url = url.update_fragment_dict(sitemap=True)
                    yield "url", url

            if elem.getparent() is root:
                # release memory for previous elements
                while elem.getprevious() is not None:
                    del root[0]

    except XMLSyntaxError:
        # Malformed XML is common in the wild; log and stop yielding
        # instead of propagating the parse error to the caller.
        log.debug("XMLSyntaxError in '%s' (%d)", response.url, response.code)
Ejemplo n.º 4
0
def test_get_gzip():
    """Fetch /gzip, confirm the gzip-magic header flag and the JSON body."""
    target = "http://httpbin.org/gzip"
    for n, readable in enumerate(URL(target).get(decode_content=False)):
        resp = Response.from_readable(readable)
        # the transport layer should have flagged the gzip magic bytes
        assert resp.headers.get("X-wex-has-gzip-magic") == "1"
        decoded = json.load(utf8_reader(decode(resp)))
        assert decoded.get("gzipped")
    # exactly one response, so the final enumerate index is 0
    assert n == 0
Ejemplo n.º 5
0
def get(url, **kw):
    """Fetch *url* and return the list of response codes seen, in order.

    For each 200 response the JSON body is sanity-checked for a
    'headers' key.
    """
    status_codes = []
    for readable in URL(url).get(**kw):
        resp = Response.from_readable(readable)
        status_codes.append(resp.code)
        if resp.code != 200:
            continue
        body = json.load(utf8_reader(decode(resp)))
        assert 'headers' in body
    return status_codes
Ejemplo n.º 6
0
def get(url, **kw):
    """GET *url*, collecting every response code; sanity-check 200 bodies."""
    collected = []
    for readable in URL(url).get(**kw):
        response = Response.from_readable(readable)
        code = response.code
        collected.append(code)
        if code == 200:
            # successful responses from httpbin echo the request headers
            payload = json.load(utf8_reader(decode(response)))
            assert 'headers' in payload
    return collected
Ejemplo n.º 7
0
def urls_from_urlset_or_sitemapindex(response):
    """ Yields URLs from ``<urlset>`` or ``<sitemapindex>`` elements as per
        `sitemaps.org <http://www.sitemaps.org/protocol.html>`_.

        Each item yielded is a ``("url", URL)`` pair.  Entries found under
        a ``<sitemap>`` element are tagged with ``sitemap=True`` in their
        fragment dict so downstream processing can recurse into them.
    """

    # Only attempt parsing when the URL fragment flags this as a sitemap
    # or the Content-Type subtype mentions xml (handles e.g. "rss+xml").
    sitemap = URL(response.url).fragment_dict.get('sitemap')
    content_subtypes = response.headers.get_content_subtype().split('+')
    if not sitemap and 'xml' not in content_subtypes:
        return

    root = None
    try:
        # iterparse streams the document, so large sitemaps are never
        # fully materialized in memory.
        for _, elem in iterparse(decode(response)):

            if root is None:
                root = elem.getroottree().getroot()
                if not (root.tag.endswith('}sitemapindex')
                        or root.tag.endswith('}urlset')):
                    # root element has wrong tag - give up
                    break

            if elem.tag.endswith('}loc') and elem.text is not None:
                text = elem.text.strip()
                if text:
                    # http://www.sitemaps.org/protocol.html#locdef
                    # <loc> values may be relative; resolve against the
                    # response URL before yielding.
                    url = URL(urljoin(response.url, text))
                    if elem.getparent().tag.endswith('}sitemap'):
                        # set sitemap=True to help downstream processing
                        url = url.update_fragment_dict(sitemap=True)
                    yield "url", url

            if elem.getparent() is root:
                # release memory for previous elements
                while elem.getprevious() is not None:
                    del root[0]

    except XMLSyntaxError:
        # Malformed XML is common in the wild - log and stop yielding
        # instead of propagating the parser error to the caller.
        log.debug("XMLSyntaxError in '%s' (%d)", response.url, response.code)