def _parse_sitemap(self, response):
    """Collect URLs from a robots.txt or sitemap response.

    A response whose URL ends with ``/robots.txt`` contributes the URLs
    yielded by ``iter_urls_from_robots`` to ``self._sitemap_urls``.  Any
    other response is treated as a sitemap document: a ``sitemapindex``
    adds its entries to ``self._sitemap_urls`` and a ``urlset`` adds its
    entries to ``self._site_urls``.  Invalid bodies and unrecognized
    sitemap types are logged at WARNING level and otherwise ignored.

    Returns an empty list in every case; discovered URLs are only
    accumulated on the instance.
    """
    page_url = response.url

    # robots.txt only points at further sitemaps.
    if page_url.endswith('/robots.txt'):
        self._sitemap_urls.extend(iter_urls_from_robots(response.body))
        return []

    body = get_sitemap_body(response)
    if body is None:
        log.msg(format='Invalid sitemap %(url)s',
                level=log.WARNING, url=page_url)
        return []

    kind = get_sitemap_type(body)
    if kind == 'sitemapindex':
        log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                level=log.DEBUG, url=page_url)
        self._sitemap_urls.extend(iter_urls_from_sitemap(body))
        return []

    if kind == 'urlset':
        log.msg(format='Sitemap %(url)s is of type <urlset>',
                level=log.DEBUG, url=page_url)
        self._site_urls.extend(iter_urls_from_sitemap(body))
        return []

    log.msg(format='Unrecognized type of sitemap %(url)s: %(stype)s',
            level=log.WARNING, url=page_url, stype=kind)
    return []
def _parse_sitemap(self, response):
    """Record the URLs advertised by a robots.txt or sitemap response.

    Sitemap locations (from robots.txt or from a ``sitemapindex``) are
    appended to ``self._sitemap_urls``; page locations (from a
    ``urlset``) are appended to ``self._site_urls``.  A body that
    ``get_sitemap_body`` rejects, or a sitemap type that is neither of
    the two above, is reported with a WARNING log message.

    The return value is always an empty list.
    """
    nothing_to_schedule = []
    is_robots_file = response.url.endswith('/robots.txt')

    if not is_robots_file:
        document = get_sitemap_body(response)
        if document is None:
            log.msg(
                format='Invalid sitemap %(url)s',
                level=log.WARNING,
                url=response.url,
            )
            return []

        document_type = get_sitemap_type(document)
        if document_type == 'sitemapindex':
            # An index lists further sitemaps.
            log.msg(
                format='Sitemap %(url)s is of type <sitemapindex>',
                level=log.DEBUG,
                url=response.url,
            )
            self._sitemap_urls.extend(iter_urls_from_sitemap(document))
        elif document_type == 'urlset':
            # A urlset lists the site's own pages.
            log.msg(
                format='Sitemap %(url)s is of type <urlset>',
                level=log.DEBUG,
                url=response.url,
            )
            self._site_urls.extend(iter_urls_from_sitemap(document))
        else:
            log.msg(
                format='Unrecognized type of sitemap %(url)s: %(stype)s',
                level=log.WARNING,
                url=response.url,
                stype=document_type,
            )
    else:
        self._sitemap_urls.extend(iter_urls_from_robots(response.body))

    return nothing_to_schedule
def test_iter_urls_from_sitemap(self):
    """iter_urls_from_sitemap yields the expected URLs for both fixtures."""
    expected_index_urls = [
        'http://www.example.com/sitemap1.xml.gz',
        'http://www.example.com/sitemap2.xml.gz',
    ]
    self.assertListEqual(
        list(iter_urls_from_sitemap(sitemapindex)),
        expected_index_urls,
    )

    expected_page_urls = [
        'http://www.example.com/',
        'http://www.example.com/Special-Offers.html',
    ]
    self.assertListEqual(
        list(iter_urls_from_sitemap(urlset)),
        expected_page_urls,
    )
def test_iter_urls_from_sitemap(self):
    """Check URL extraction from the sitemapindex and urlset fixtures."""
    # First the index fixture: its entries are nested sitemap locations.
    index_urls = list(iter_urls_from_sitemap(sitemapindex))
    self.assertListEqual(index_urls, [
        'http://www.example.com/sitemap1.xml.gz',
        'http://www.example.com/sitemap2.xml.gz',
    ])

    # Then the urlset fixture: its entries are page locations.
    page_urls = list(iter_urls_from_sitemap(urlset))
    self.assertListEqual(page_urls, [
        'http://www.example.com/',
        'http://www.example.com/Special-Offers.html',
    ])