def _parse_sitemap(self, response):
    self.logger.info(f"_parse_sitemap, response: {response.url}")
    if response.url.endswith("/robots.txt"):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning(
                "Ignoring invalid sitemap: %(response)s",
                {"response": response},
                extra={"spider": self},
            )
            return
        s = Sitemap(body)
        it = self.sitemap_filter(s)
        if s.type.lower() == "sitemapindex":
            for loc in iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == "urlset":
            for loc in iterloc(it, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break

def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            self.logger.info('Ignoring invalid sitemap: %s', response.url)
            return
        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            cnx = mysql.connector.connect(**config)
            cursor = cnx.cursor()
            # Empty the table before re-populating it from the sitemap
            cursor.execute("truncate table channel_sitemap")
            cnx.commit()
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        print(loc)
                        cursor.execute(
                            "insert into channel_sitemap (url) values (%s)", (loc,)
                        )
                        cnx.commit()
                        break

def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning("Ignoring invalid sitemap: %(response)s",
                           {'response': response}, extra={'spider': self})
            return
        s = Sitemap(body)
        it = self.sitemap_filter(s)
        if s.type == 'sitemapindex':
            for loc in iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(it, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break

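Most of these variants lean on a module-level iterloc helper that none of the snippets define. A minimal sketch consistent with Scrapy's own implementation in scrapy.spiders.sitemap, where alt controls whether xhtml:link alternate URLs are yielded alongside each <loc>:

def iterloc(it, alt=False):
    for d in it:
        yield d['loc']
        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            yield from d['alternate']
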
def _parse_sitemap(self, response):
    # This single function covers everything from the sitemap to emitting the first batch of requests
    if response.url.endswith('/robots.txt'):
        # sitemap_urls_from_robots essentially parses out the sitemap URLs line by line so they can be sent out
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning("Ignoring invalid sitemap: %(response)s",
                           {'response': response}, extra={'spider': self})
            return
        s = Sitemap(body)  # parses the response with lxml.etree.XMLParser, then extracts the URLs
        it = self.sitemap_filter(s)  # filters once more, using the subclass's hook
        if s.type == 'sitemapindex':
            for loc in iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(it, self.sitemap_alternate_links):  # iterloc is a generator
                for r, c in self._cbs:  # check the URL against each regex rule
                    if r.search(loc):  # the regex matched, so emit the request
                        yield Request(loc, callback=c)
                        break

def _parse_sitemap(self, response):
    logging.info("Parsing sitemap %s", response)
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logging.warning("Ignoring invalid sitemap: %s", response)
            return
        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for url in s:
                loc = url['loc']
                # Record the lastmod date alongside the URL
                lastmod = url.get('lastmod', None)
                if lastmod is not None:
                    lastmod = parse_w3c_datetime(lastmod)
                for r, c in self._cbs:
                    if r.search(loc):
                        self.urls.append({"url": loc, "lastmod": lastmod})
                        break

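parse_w3c_datetime here is this snippet's own helper, not part of Scrapy; its definition isn't shown. A plausible sketch, assuming lastmod values follow the W3C datetime profile used in sitemaps (e.g. 2005-01-01 or 2004-10-01T18:23:17+00:00):

from datetime import datetime

def parse_w3c_datetime(value):
    # Hypothetical helper: fromisoformat accepts both date-only and full
    # timestamp forms; a trailing "Z" needs rewriting on Python < 3.11.
    try:
        return datetime.fromisoformat(value.strip().replace("Z", "+00:00"))
    except ValueError:
        return None
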
def test_sitemap(self):
    s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url>
    <loc>http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>http://www.example.com/Special-Offers.html</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>""")
    assert s.type == 'urlset'
    self.assertEqual(list(s), [
        {'priority': '1', 'loc': 'http://www.example.com/',
         'lastmod': '2009-08-16', 'changefreq': 'daily'},
        {'priority': '0.8', 'loc': 'http://www.example.com/Special-Offers.html',
         'lastmod': '2009-08-16', 'changefreq': 'weekly'},
    ])

def test_alternate(self):
    s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
  <url>
    <loc>http://www.example.com/english/</loc>
    <xhtml:link rel="alternate" hreflang="de" href="http://www.example.com/deutsch/"/>
    <xhtml:link rel="alternate" hreflang="de-ch" href="http://www.example.com/schweiz-deutsch/"/>
    <xhtml:link rel="alternate" hreflang="en" href="http://www.example.com/english/"/>
    <xhtml:link rel="alternate" hreflang="en"/><!-- wrong tag without href -->
  </url>
</urlset>""")
    self.assertEqual(list(s), [{
        'loc': 'http://www.example.com/english/',
        'alternate': [
            'http://www.example.com/deutsch/',
            'http://www.example.com/schweiz-deutsch/',
            'http://www.example.com/english/',
        ],
    }])

def test_sitemap_strip(self):
    """Assert we can deal with leading/trailing whitespace inside <loc>
    tags - we've seen those
    """
    s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url>
    <loc> http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc> http://www.example.com/2</loc>
    <lastmod />
  </url>
</urlset>
""")
    self.assertEqual(list(s), [
        {'priority': '1', 'loc': 'http://www.example.com/',
         'lastmod': '2009-08-16', 'changefreq': 'daily'},
        {'loc': 'http://www.example.com/2', 'lastmod': ''},
    ])

def _parse_sitemap(self, response):
    body = self._get_sitemap_body(response)
    if body is None:
        self.logger.warning("Ignoring invalid sitemap: %(response)s",
                            {'response': response}, extra={'spider': self})
        return
    s = Sitemap(body)
    if s.type == 'urlset':
        for loc in iterloc(s):
            for r, c in self._cbs:
                if r.search(loc):
                    category_regex = r'toptenreviews\.com/(.*)/$'
                    match = re.search(category_regex, loc)
                    # the URL pattern must have changed if there is no category match
                    if not match:
                        break
                    category = CategoryItem()
                    category['category_path'] = match.group(1)
                    category['category_url'] = loc
                    if self.should_skip_category(category):
                        break
                    yield category
                    request = Request(loc, callback=c)
                    request.meta['category'] = category
                    yield request
                    break

def test_sitemap_wrong_ns(self):
    """We have seen sitemaps with a wrong namespace. Presumably, Google
    still works with these, though this is not 100% confirmed."""
    s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url xmlns="">
    <loc> http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url xmlns="">
    <loc> http://www.example.com/2</loc>
    <lastmod />
  </url>
</urlset>
""")
    self.assertEqual(list(s), [
        {'priority': '1', 'loc': 'http://www.example.com/',
         'lastmod': '2009-08-16', 'changefreq': 'daily'},
        {'loc': 'http://www.example.com/2', 'lastmod': ''},
    ])

def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            return
        s = Sitemap(body)
        it = self.sitemap_filter(s)
        if s.type == 'sitemapindex':
            for loc, lastmod in self._iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    request = Request(loc, callback=self._parse_sitemap)
                    request.meta['lastmod'] = lastmod
                    yield request
        elif s.type == 'urlset':
            for loc, lastmod in self._iterloc(it, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc):
                        if self.enable_page_limit and self.request_counter >= self.max_requests:
                            return
                        if not self._filter(loc):
                            # filtered-out URLs count toward the limit at a reduced weight
                            self.request_counter += 0.2
                            break
                        self.request_counter += 1
                        request = Request(loc, callback=c)
                        request.meta['lastmod'] = lastmod
                        yield request
                        break

from urllib.request import urlopen

from scrapy.utils.sitemap import Sitemap


def sitemap_to_array(url):
    """Fetch a sitemap and return its <loc> URLs as a list."""
    results = []
    body = urlopen(url).read()
    sitemap = Sitemap(body)
    for item in sitemap:
        results.append(item['loc'])
    return results

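A quick usage sketch; the sitemap URL is a placeholder:

if __name__ == "__main__":
    for loc in sitemap_to_array("https://www.example.com/sitemap.xml"):
        print(loc)
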
def parse(self, response):
    # The sitemap responses this spider requests are gzip-compressed
    body = gunzip(response.body)
    s = Sitemap(body)
    for sitelink in s:
        url = sitelink['loc']
        yield scrapy.Request(url, callback=self.parse_details)

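This parse assumes every response is gzipped and would fail on plain XML. A more defensive sketch that checks the gzip magic bytes first (the helper name is my own; scrapy.utils.gz.gunzip is real):

from scrapy.utils.gz import gunzip

def sitemap_body(response):
    # gzip streams start with the magic bytes 1f 8b; fall back to the raw body
    if response.body[:2] == b"\x1f\x8b":
        return gunzip(response.body)
    return response.body
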
def test_sitemap_blanklines(self):
    """Assert we can deal with starting blank lines before <xml> tag"""
    s = Sitemap(b"""\

<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <!-- cache: cached = yes name = sitemap_jspCache key = sitemap -->
    <sitemap>
        <loc>http://www.example.com/sitemap1.xml</loc>
        <lastmod>2013-07-15</lastmod>
    </sitemap>
    <sitemap>
        <loc>http://www.example.com/sitemap2.xml</loc>
        <lastmod>2013-07-15</lastmod>
    </sitemap>
    <sitemap>
        <loc>http://www.example.com/sitemap3.xml</loc>
        <lastmod>2013-07-15</lastmod>
    </sitemap>
    <!-- end cache -->
</sitemapindex>
""")
    self.assertEqual(list(s), [
        {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap1.xml'},
        {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap2.xml'},
        {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap3.xml'},
    ])

def _parse_sitemap(self, response):
    """get all urls and call for each one with SplashRequest"""
    urls_list = []
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning("Ignoring invalid sitemap: %(response)s",
                           {'response': response}, extra={'spider': self})
            return
        s = Sitemap(body)
        it = self.sitemap_filter(s)
        if s.type == 'sitemapindex':
            for loc in iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(it, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc):
                        urls_list.append(loc)
            # Fetch the collected URLs concurrently and yield the results
            pool = ThreadPool()
            for result in pool.map(self.get_response, urls_list):
                yield result
            pool.close()
            pool.join()

def parse_sitemap(self, response):
    sitemap = Sitemap(response.body)
    for site_url in sitemap:
        url = site_url['loc']
        # Nested sitemaps are parsed recursively; leaf URLs go to parse_page
        if "sitemap" in url and ".xml" in url:
            yield Request(url, self.parse_sitemap)
        else:
            yield Request(url, self.parse_page)

def test_comment(self):
    s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
  <url>
    <loc>http://www.example.com/</loc>
    <!-- this is a comment on which the parser might raise an exception
         if implemented incorrectly -->
  </url>
</urlset>""")
    self.assertEqual(list(s), [{'loc': 'http://www.example.com/'}])

def _parse_sitemap(self, response):
    if response.url.endswith("/robots.txt"):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning(
                "Ignoring invalid sitemap: %(response)s",
                {"response": response},
                extra={"spider": self},
            )
            return
        s = Sitemap(body)
        it = self.sitemap_filter(s)
        # This project's iterloc variant yields (loc, lastmod, changefreq, priority)
        if s.type == "sitemapindex":
            for loc, ts, freq, prio in iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == "urlset":
            for loc, ts, freq, prio in iterloc(it, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc):
                        ts = soscan.utils.parseDatetimeString(ts)
                        if self._count_only:
                            item = soscan.items.SitemapItem()
                            item["source"] = response.url
                            item["time_retrieved"] = soscan.utils.dtnow()
                            item["url"] = loc
                            item["time_loc"] = ts
                            item["changefreq"] = freq
                            item["priority"] = prio
                            logger.debug("Yield item: %s", item)
                            yield item
                        else:
                            req = Request(loc, callback=c, flags=[self._count_only])
                            req.meta["loc_timestamp"] = ts
                            req.meta["loc_source"] = response.url
                            req.meta["loc_changefreq"] = freq
                            req.meta["loc_priority"] = prio
                            yield req
                        break

def test_xml_entity_expansion(self):
    s = Sitemap(b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE foo [
  <!ELEMENT foo ANY >
  <!ENTITY xxe SYSTEM "file:///etc/passwd" >
]>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://127.0.0.1:8000/&xxe;</loc>
  </url>
</urlset>
""")
    self.assertEqual(list(s), [{'loc': 'http://127.0.0.1:8000/'}])

def test_sitemap_index(self):
    s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>http://www.example.com/sitemap1.xml.gz</loc>
    <lastmod>2004-10-01T18:23:17+00:00</lastmod>
  </sitemap>
  <sitemap>
    <loc>http://www.example.com/sitemap2.xml.gz</loc>
    <lastmod>2005-01-01</lastmod>
  </sitemap>
</sitemapindex>""")
    assert s.type == 'sitemapindex'
    self.assertEqual(list(s), [
        {'loc': 'http://www.example.com/sitemap1.xml.gz',
         'lastmod': '2004-10-01T18:23:17+00:00'},
        {'loc': 'http://www.example.com/sitemap2.xml.gz',
         'lastmod': '2005-01-01'},
    ])

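Outside a spider, walking a sitemap index down to its leaf URLs is a short recursion over the same API. A sketch, assuming a fetch(url) -> bytes helper of your own:

from scrapy.utils.sitemap import Sitemap

def walk_sitemap(url, fetch):
    # fetch(url) is an assumed helper returning the raw XML bytes for url
    s = Sitemap(fetch(url))
    for entry in s:
        if s.type == 'sitemapindex':
            yield from walk_sitemap(entry['loc'], fetch)
        else:
            yield entry['loc']
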
def parse_sitemap(self, response):
    # Errors
    if response.status != 200:
        raise CloseSpider('Bad response returned')
    # Parse sitemap.xml
    sitemap = Sitemap(response.body)
    if sitemap.type == 'urlset' or sitemap.type == 'sitemapindex':
        for url in sitemap:
            item = SearchResultItem()
            item['url'] = url['loc']
            item['cache'] = response.url
            yield item
            # Also consider alternate URLs (xhtml:link rel="alternate")
            if 'alternate' in url:
                for alt in url['alternate']:
                    item = SearchResultItem()
                    item['url'] = alt
                    item['cache'] = response.url
                    yield item

def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            log.msg("Ignoring invalid sitemap: %s" % response, log.WARNING)
            return
        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break

def _parse_sitemap(self, response):
    self.logger.info("Custom sitemap parsing function initiating")
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.text):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = response.body
        if body is None:
            logger.warning("Ignoring invalid sitemap: %(response)s",
                           {'response': response}, extra={'spider': self})
            return
        s = Sitemap(body)
        # Extract <loc> values directly with a regex instead of iterating the Sitemap
        loc_reg = r'<loc>(.*?)</loc>'
        if s.type == 'sitemapindex':
            for loc in re.findall(loc_reg, body.decode('utf-8')):
                print(loc)
                yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in re.findall(loc_reg, body.decode('utf-8')):
                yield Request(loc, callback=self.parse_wine)

def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            log.msg(format="Ignoring invalid sitemap: %(response)s",
                    level=log.WARNING, spider=self, response=response)
            return
        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, alt=self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s, since=self.sitemap_modified_since):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield PageRequest(loc, callback=c)
                        break

def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        if isinstance(response, XmlResponse):
            body = response.body
        elif is_gzipped(response):
            body = gunzip(response.body)
        else:
            log.msg("Ignoring non-XML sitemap: %s" % response, log.WARNING)
            return
        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
__author__ = "Sigai"

import requests

from scrapy.utils.sitemap import Sitemap

url = 'https://answers.microsoft.com/map/thread?day=1'
body = requests.get(url)
s = Sitemap(body.content)
for d in s:
    print(d)