Example 1
def test_chained_does_seek_response():
    """Running the same extractor twice via chained() yields identical
    results, proving chained() rewinds the response between extractors."""
    stream = resource_stream(__name__, 'fixtures/robots_txt')
    response = Response.from_readable(stream)
    # chain the identical extractor back-to-back
    extractor = chained(extract_first_line, extract_first_line)
    results = list(extractor(response))
    # both passes see the first line, so chained() must have re-seeked to 0
    expected = (b'# /robots.txt\n',)
    assert results == [expected, expected]
Example 2
def test_chained_does_seek_response():
    """chained() must seek the response back to 0 before each extractor."""
    fixture = resource_stream(__name__, 'fixtures/robots_txt')
    response = Response.from_readable(fixture)
    # apply one extractor two times in a row
    twice = chained(extract_first_line, extract_first_line)
    collected = [item for item in twice(response)]
    # the second pass sees the same first line only if the stream was rewound
    first_line = (b'# /robots.txt\n',)
    assert collected == [first_line, first_line]
Example 3
def test_labelled_chained():
    """Regression test: labelled() wrapped around chained() must still
    prepend the label to each yielded item."""
    identity = lambda value: value
    extractor = labelled(identity, chained(extract_arg0))
    results = list(extractor("foo"))
    assert results == [("foo", "foo")]
Example 4
def test_chained_extractor_raises():
    """An error raised inside an extractor is surfaced by chained() as a
    one-tuple item rather than propagating out of iteration."""
    extractor = chained(extract_with_error)
    collected = list(extractor('foo'))
    assert collected == [(my_error,)]
Example 5
            if root is None:
                root = elem.getroottree().getroot()
                if not (root.tag.endswith('}sitemapindex') or
                        root.tag.endswith('}urlset')):
                    # root element has wrong tag - give up
                    break

            if elem.tag.endswith('}loc') and elem.text is not None:
                text = elem.text.strip()
                if text:
                    # http://www.sitemaps.org/protocol.html#locdef
                    url = URL(urljoin(response.url, text))
                    if elem.getparent().tag.endswith('}sitemap'):
                        # set sitemap=True to help downstream processing
                        url = url.update_fragment_dict(sitemap=True)
                    yield "url", url

            if elem.getparent() is root:
                # release memory for previous elements
                while elem.getprevious() is not None:
                    del root[0]

    except XMLSyntaxError:
        log.debug("XMLSyntaxError in '%s' (%d)", response.url, response.code)

#: Extractor that combines :func:`.urls_from_robots_txt` and
#: :func:`.urls_from_urlset_or_sitemapindex`, so a single extractor can
#: discover URLs from robots.txt as well as sitemap/sitemapindex XML.
#: (Extractors are presumably applied in the listed order — confirm
#: against the semantics of :func:`chained`.)
urls_from_sitemaps = chained(urls_from_robots_txt,
                             urls_from_urlset_or_sitemapindex)
Example 6
def test_labelled_chained():
    """Regression test for combining labelled() with chained(): the label
    computed from the input must be paired with the extracted value."""
    def passthrough(value):
        return value
    extractor = labelled(passthrough, chained(extract_arg0))
    assert list(extractor("foo")) == [("foo", "foo")]
Example 7
def test_chained_extractor_raises():
    """chained() converts an extractor's raised error into a yielded
    one-tuple instead of letting the exception escape."""
    failing = chained(extract_with_error)
    items = [item for item in failing('foo')]
    assert items == [(my_error,)]
Example 8
            if root is None:
                root = elem.getroottree().getroot()
                if not (root.tag.endswith('}sitemapindex')
                        or root.tag.endswith('}urlset')):
                    # root element has wrong tag - give up
                    break

            if elem.tag.endswith('}loc') and elem.text is not None:
                text = elem.text.strip()
                if text:
                    # http://www.sitemaps.org/protocol.html#locdef
                    url = URL(urljoin(response.url, text))
                    if elem.getparent().tag.endswith('}sitemap'):
                        # set sitemap=True to help downstream processing
                        url = url.update_fragment_dict(sitemap=True)
                    yield "url", url

            if elem.getparent() is root:
                # release memory for previous elements
                while elem.getprevious() is not None:
                    del root[0]

    except XMLSyntaxError:
        log.debug("XMLSyntaxError in '%s' (%d)", response.url, response.code)


#: Extractor that combines :func:`.urls_from_robots_txt` and
#: :func:`.urls_from_urlset_or_sitemapindex`, so a single extractor can
#: discover URLs from robots.txt as well as sitemap/sitemapindex XML.
#: (Extractors are presumably applied in the listed order — confirm
#: against the semantics of :func:`chained`.)
urls_from_sitemaps = chained(urls_from_robots_txt,
                             urls_from_urlset_or_sitemapindex)