Esempio n. 1
0
    def _parse_sitemap(self, response):
        """Record the URLs advertised by a robots.txt or sitemap response.

        A response whose URL ends in ``/robots.txt`` is handed to
        ``iter_urls_from_robots`` and the results are appended to
        ``self._sitemap_urls``.  Any other response is treated as a
        sitemap document:

        * ``sitemapindex`` -- its URLs are appended to ``self._sitemap_urls``
        * ``urlset`` -- its URLs are appended to ``self._site_urls``
        * anything else -- a warning is logged and nothing is stored

        If ``get_sitemap_body`` returns ``None`` the sitemap is reported as
        invalid and skipped.

        :param response: object exposing ``url`` and ``body``
        :returns: always an empty list; this method only accumulates URLs
                  on the instance and never produces follow-up requests
        """
        page_url = response.url

        # robots.txt only contributes entries to the sitemap queue.
        if page_url.endswith('/robots.txt'):
            self._sitemap_urls.extend(iter_urls_from_robots(response.body))
            return []

        body = get_sitemap_body(response)
        if body is None:
            log.msg(format='Invalid sitemap %(url)s',
                    level=log.WARNING,
                    url=page_url)
            return []

        kind = get_sitemap_type(body)

        if kind == 'sitemapindex':
            log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                    level=log.DEBUG,
                    url=page_url)
            self._sitemap_urls.extend(iter_urls_from_sitemap(body))
            return []

        if kind == 'urlset':
            log.msg(format='Sitemap %(url)s is of type <urlset>',
                    level=log.DEBUG,
                    url=page_url)
            self._site_urls.extend(iter_urls_from_sitemap(body))
            return []

        # Neither an index nor a URL set: report it and move on.
        log.msg(format='Unrecognized type of sitemap %(url)s: %(stype)s',
                level=log.WARNING,
                url=page_url,
                stype=kind)
        return []
Esempio n. 2
0
    def _parse_sitemap(self, response):
        """Sort the URLs found in ``response`` into the instance's queues.

        Responses for ``.../robots.txt`` feed ``self._sitemap_urls`` through
        ``iter_urls_from_robots``.  Every other response is parsed as a
        sitemap: a ``urlset`` extends ``self._site_urls``, a
        ``sitemapindex`` extends ``self._sitemap_urls``, and any other type
        is logged as unrecognized.  A sitemap whose body comes back as
        ``None`` from ``get_sitemap_body`` is logged as invalid.

        :param response: object exposing ``url`` and ``body``
        :returns: an empty list in every case (no requests are generated)
        """
        source_url = response.url

        if source_url.endswith('/robots.txt'):
            robots_entries = iter_urls_from_robots(response.body)
            self._sitemap_urls.extend(robots_entries)
            return []

        sitemap_xml = get_sitemap_body(response)
        if sitemap_xml is None:
            log.msg(format='Invalid sitemap %(url)s',
                    level=log.WARNING, url=source_url)
            return []

        detected = get_sitemap_type(sitemap_xml)
        if detected == 'urlset':
            # Leaf sitemap: its entries are site pages.
            log.msg(format='Sitemap %(url)s is of type <urlset>',
                    level=log.DEBUG, url=source_url)
            self._site_urls.extend(iter_urls_from_sitemap(sitemap_xml))
        elif detected == 'sitemapindex':
            # Index sitemap: its entries are further sitemaps.
            log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                    level=log.DEBUG, url=source_url)
            self._sitemap_urls.extend(iter_urls_from_sitemap(sitemap_xml))
        else:
            log.msg(format='Unrecognized type of sitemap %(url)s: %(stype)s',
                    level=log.WARNING, url=source_url, stype=detected)

        return []
 def test_iter_urls_from_sitemap(self):
     """Check the URLs extracted from the index and URL-set fixtures."""
     # The sitemap index fixture should list its two child sitemaps, in order.
     index_urls = list(iter_urls_from_sitemap(sitemapindex))
     self.assertListEqual(
         index_urls,
         ['http://www.example.com/sitemap1.xml.gz',
          'http://www.example.com/sitemap2.xml.gz'])

     # The URL set fixture should list its two page URLs, in order.
     page_urls = list(iter_urls_from_sitemap(urlset))
     self.assertListEqual(
         page_urls,
         ['http://www.example.com/',
          'http://www.example.com/Special-Offers.html'])
Esempio n. 4
0
 def test_iter_urls_from_sitemap(self):
     """Verify ``iter_urls_from_sitemap`` on both sitemap fixtures."""
     expected_from_index = [
         'http://www.example.com/sitemap1.xml.gz',
         'http://www.example.com/sitemap2.xml.gz',
     ]
     expected_from_urlset = [
         'http://www.example.com/',
         'http://www.example.com/Special-Offers.html',
     ]

     found_in_index = list(iter_urls_from_sitemap(sitemapindex))
     self.assertListEqual(found_in_index, expected_from_index)

     found_in_urlset = list(iter_urls_from_sitemap(urlset))
     self.assertListEqual(found_in_urlset, expected_from_urlset)