Example #1
0
    def _parse_sitemap(self, response):
        """Record the URLs advertised by a robots.txt or sitemap response.

        A response whose URL ends in ``/robots.txt`` has its body handed to
        ``iter_urls_from_robots`` and the results appended to
        ``self._sitemap_urls``.  Any other response is treated as a sitemap:
        entries of a ``sitemapindex`` are appended to ``self._sitemap_urls``
        and entries of a ``urlset`` to ``self._site_urls``.  When
        ``get_sitemap_body`` returns ``None``, or the sitemap type is neither
        of those two, a WARNING is logged and nothing is recorded.

        Always returns an empty list; the findings are exposed only through
        the two instance lists.
        """
        # robots.txt: only a source of further sitemap locations.
        if response.url.endswith('/robots.txt'):
            self._sitemap_urls.extend(iter_urls_from_robots(response.body))
            return []

        body = get_sitemap_body(response)
        if body is None:
            log.msg(format='Invalid sitemap %(url)s',
                    level=log.WARNING,
                    url=response.url)
            return []

        kind = get_sitemap_type(body)

        if kind == 'sitemapindex':
            # An index lists other sitemaps, so its entries join the
            # sitemap list rather than the site URL list.
            log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                    level=log.DEBUG,
                    url=response.url)
            self._sitemap_urls.extend(iter_urls_from_sitemap(body))
            return []

        if kind == 'urlset':
            log.msg(format='Sitemap %(url)s is of type <urlset>',
                    level=log.DEBUG,
                    url=response.url)
            self._site_urls.extend(iter_urls_from_sitemap(body))
            return []

        log.msg(
            format='Unrecognized type of sitemap %(url)s: %(stype)s',
            level=log.WARNING,
            url=response.url,
            stype=kind)
        return []
Example #2
0
    def _parse_sitemap(self, response):
        """Harvest URLs from a robots.txt or sitemap response.

        * URL ending in ``/robots.txt``: everything yielded by
          ``iter_urls_from_robots(response.body)`` is appended to
          ``self._sitemap_urls``.
        * ``sitemapindex`` document: its entries are appended to
          ``self._sitemap_urls``.
        * ``urlset`` document: its entries are appended to
          ``self._site_urls``.
        * ``get_sitemap_body`` returning ``None`` or any other sitemap type:
          a WARNING is logged and no list is modified.

        The return value is an empty list on every path.
        """
        if response.url.endswith('/robots.txt'):
            self._sitemap_urls.extend(iter_urls_from_robots(response.body))
            return []

        xml_body = get_sitemap_body(response)
        if xml_body is None:
            log.msg(format='Invalid sitemap %(url)s',
                    level=log.WARNING, url=response.url)
            return []

        doc_type = get_sitemap_type(xml_body)
        if doc_type == 'sitemapindex':
            log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                    level=log.DEBUG, url=response.url)
            destination = self._sitemap_urls
        elif doc_type == 'urlset':
            log.msg(format='Sitemap %(url)s is of type <urlset>',
                    level=log.DEBUG, url=response.url)
            destination = self._site_urls
        else:
            log.msg(format='Unrecognized type of sitemap %(url)s: %(stype)s',
                    level=log.WARNING, url=response.url, stype=doc_type)
            return []

        # Both recognised types are read with the same helper; only the
        # list receiving the entries differs.
        destination.extend(iter_urls_from_sitemap(xml_body))
        return []
Example #3
0
    def test_iter_urls_from_robots(self):
        # The fixture mixes User-agent/Disallow directives and comment lines
        # with two "Sitemap:" entries; only those two URLs are expected
        # back, in the order they appear.
        robots_txt = '''User-agent: *
                    Disallow: /aff/
                    Disallow: /wl/

                    # Search and shopping refining
                    Disallow: /s*/*facet
                    Disallow: /s*/*tags

                    # Sitemap files
                    Sitemap: http://example.com/sitemap.xml
                    Sitemap: http://example.com/sitemap-product-index.xml

                    # Forums
                    Disallow: /forum/search/
                    Disallow: /forum/active/
                    '''
        found_urls = list(iter_urls_from_robots(robots_txt))
        expected_urls = [
            'http://example.com/sitemap.xml',
            'http://example.com/sitemap-product-index.xml',
        ]
        self.assertListEqual(found_urls, expected_urls)
    def test_iter_urls_from_robots(self):
        # Feed a robots.txt body containing two "Sitemap:" lines among other
        # directives and comments, and check that exactly those two URLs come
        # out of iter_urls_from_robots, in order.
        robots_body = '''User-agent: *
                    Disallow: /aff/
                    Disallow: /wl/

                    # Search and shopping refining
                    Disallow: /s*/*facet
                    Disallow: /s*/*tags

                    # Sitemap files
                    Sitemap: http://example.com/sitemap.xml
                    Sitemap: http://example.com/sitemap-product-index.xml

                    # Forums
                    Disallow: /forum/search/
                    Disallow: /forum/active/
                    '''
        sitemap_urls = list(iter_urls_from_robots(robots_body))
        self.assertListEqual(
            sitemap_urls,
            ['http://example.com/sitemap.xml',
             'http://example.com/sitemap-product-index.xml'])