def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            self.logger.warning("Ignoring invalid sitemap: %(response)s",
                                {'response': response}, extra={'spider': self})
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        if not doctor_exists(loc):
                            self.logger.debug("Doctor's url not found in db. Fetching data")
                            yield Request(loc, callback=c)
                        else:
                            self.logger.debug("Doctor's url found in db. Passing it on")
                        break
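# For reference: the `iterloc` helper that most snippets below rely on lives in
# scrapy.spiders.sitemap and looks roughly like this (sketch, not verbatim):
def iterloc(it, alt=False):
    for d in it:
        yield d['loc']

        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            yield from d['alternate']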
def parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self.parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning("Ignoring invalid sitemap: %(response)s",
                           {'response': response}, extra={'spider': self})
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self.parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                print(loc)
                # Tally URLs that contain the configured rule substring
                if self.rule in loc:
                    self.count += 1
                else:
                    self.other += 1
            print("Total Rule Matched:", self.count)
            print("Total Other Count:", self.other)
def test_sitemap_urls_from_robots(self):
    robots = """User-agent: *
Disallow: /aff/
Disallow: /wl/

# Search and shopping refining
Disallow: /s*/*facet
Disallow: /s*/*tags

# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
Sitemap: HTTP://example.com/sitemap-uppercase.xml
Sitemap: /sitemap-relative-url.xml

# Forums
Disallow: /forum/search/
Disallow: /forum/active/
"""
    self.assertEqual(
        list(sitemap_urls_from_robots(robots, base_url='http://example.com')),
        [
            'http://example.com/sitemap.xml',
            'http://example.com/sitemap-product-index.xml',
            'http://example.com/sitemap-uppercase.xml',
            'http://example.com/sitemap-relative-url.xml',
        ])
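# The behaviour exercised above matches Scrapy's sitemap_urls_from_robots,
# which is roughly the following (sketch; note that urljoin both resolves the
# relative URL against base_url and normalizes the uppercase scheme):
from urllib.parse import urljoin

def sitemap_urls_from_robots(robots_text, base_url=None):
    """Yield every sitemap URL declared in a robots.txt file."""
    for line in robots_text.splitlines():
        if line.lstrip().lower().startswith('sitemap:'):
            url = line.split(':', 1)[1].strip()
            yield urljoin(base_url, url)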
def _parse_sitemap(self, response):
    logging.info("Parsing sitemap %s" % response)
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logging.warning("Ignoring invalid sitemap: %s", response)
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for url in iter(s):
                loc = url['loc']
                # Add the lastmod date to the Request meta
                lastmod = url.get('lastmod', None)
                if lastmod is not None:
                    lastmod = parse_w3c_datetime(lastmod)
                for r, c in self._cbs:
                    if r.search(loc):
                        self.urls.append({"url": loc, "lastmod": lastmod})
                        break
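# `parse_w3c_datetime` above is project-specific and not shown in the source.
# A minimal stand-in (name and behavior assumed) using only the stdlib:
from datetime import datetime

def parse_w3c_datetime(value):
    # W3C sitemap dates are ISO 8601; map a trailing 'Z' to an explicit
    # offset so datetime.fromisoformat accepts it
    return datetime.fromisoformat(value.replace('Z', '+00:00'))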
def _parse_sitemap(self, response): """get all urls and call for each one with SplashRequest""" urls_list = [] if response.url.endswith('/robots.txt'): for url in sitemap_urls_from_robots(response.text, base_url=response.url): yield Request(url, callback=self._parse_sitemap) else: body = self._get_sitemap_body(response) if body is None: logger.warning("Ignoring invalid sitemap: %(response)s", {'response': response}, extra={'spider': self}) return s = Sitemap(body) it = self.sitemap_filter(s) if s.type == 'sitemapindex': for loc in iterloc(it, self.sitemap_alternate_links): if any(x.search(loc) for x in self._follow): yield Request(loc, callback=self._parse_sitemap) elif s.type == 'urlset': for loc in iterloc(it, self.sitemap_alternate_links): for r, c in self._cbs: if r.search(loc): urls_list.append(loc) pool = ThreadPool() for s in pool.map(self.get_response, urls_list): yield s pool.close() pool.join()
def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            self.logger.info('Ignoring invalid sitemap: %s', response.url)
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            cnx = mysql.connector.connect(**config)
            cursor = cnx.cursor()
            # Start from a clean slate before re-importing the sitemap
            cursor.execute("truncate table channel_sitemap")
            cnx.commit()
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        print(loc)
                        cursor.execute(
                            "insert into channel_sitemap (url) values (%s)",
                            (loc,))
                        cnx.commit()
                        break
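# The `config` mapping passed to mysql.connector.connect above is not shown in
# the source; a minimal example with illustrative credentials only:
config = {
    'user': 'scrapy',
    'password': 'secret',
    'host': '127.0.0.1',
    'database': 'channels',
}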
def _parse_sitemap(self, response):
    # This function takes us from the sitemap to issuing the first batch of requests
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(
                response.text, base_url=response.url):
            # Parses robots.txt line by line for sitemap URLs and sends them out
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning("Ignoring invalid sitemap: %(response)s",
                           {'response': response}, extra={'spider': self})
            return

        s = Sitemap(body)  # Parses the response with lxml.etree.XMLParser and extracts the URLs
        it = self.sitemap_filter(s)  # Lets the subclass filter the entries a second time
        if s.type == 'sitemapindex':
            for loc in iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(it, self.sitemap_alternate_links):  # Turns the entries into a generator of locations
                for r, c in self._cbs:  # Check whether the URL matches the corresponding regex rule
                    if r.search(loc):  # On a regex match, emit the request
                        yield Request(loc, callback=c)
                        break
def _parse_sitemap(self, response):
    self.logger.info(f"_parse_sitemap, response: {response.url}")
    if response.url.endswith("/robots.txt"):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning(
                "Ignoring invalid sitemap: %(response)s",
                {"response": response},
                extra={"spider": self},
            )
            return

        s = Sitemap(body)
        it = self.sitemap_filter(s)
        if s.type.lower() == "sitemapindex":
            for loc in iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == "urlset":
            for loc in iterloc(it, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break
def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            return

        s = Sitemap(body)
        it = self.sitemap_filter(s)
        if s.type == 'sitemapindex':
            for loc, lastmod in self._iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    request = Request(loc, callback=self._parse_sitemap)
                    request.meta['lastmod'] = lastmod
                    yield request
        elif s.type == 'urlset':
            for loc, lastmod in self._iterloc(it, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc):
                        # Stop entirely once the page budget is exhausted
                        if self.enable_page_limit and self.request_counter >= self.max_requests:
                            return
                        # Filtered-out URLs count only fractionally toward the limit
                        if not self._filter(loc):
                            self.request_counter += 0.2
                            break
                        self.request_counter += 1
                        request = Request(loc, callback=c)
                        request.meta['lastmod'] = lastmod
                        yield request
                        break
def parse_robots(self, response):
    for url in sitemap_urls_from_robots(response.body):
        if 'sitemap-products' in url:
            yield Request(url, callback=self._parse_sitemap,
                          errback=self.on_error, meta={'retry': 5})
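# The `on_error` errback referenced above is not shown in the source; a
# minimal version that just logs the failed URL might look like this:
def on_error(self, failure):
    self.logger.error('Sitemap request failed: %s', failure.request.url)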
def _parse_sitemap(self, response): """This is adapted from scrapy.spiders.sitemap""" if response.url.endswith('/robots.txt'): for url in sitemap_urls_from_robots(response.text, base_url=response.url): yield Request(url, callback=self._parse_sitemap) else: body = self._get_sitemap_body(response) if body is None: self.logger.warning("Ignoring invalid sitemap: %(response)s", {'response': response}, extra={'spider': self}) return s = NewsSitemap(body) if s.type == 'sitemapindex': for loc in iterloc(s, self.sitemap_alternate_links): if any(x.search(loc) for x in self._follow): yield self.url_to_request(loc, callback=self._parse_sitemap) elif s.type == 'urlset': for loc, meta in self.iterurlset(s): for r, c in self._cbs: if r.search(loc): try: self.logger.debug(f'Queuing {loc}') yield self.url_to_request(loc, callback=c, meta=meta) break except Exception as e: self.logger.error( f'Failed to queue {loc}: {e}')
def _parse_sitemap(self, response):
    logging.info("Parsing sitemap %s" % response)
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logging.warning("Ignoring invalid sitemap: %s", response)
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            urls = list(iter(s))
            logging.info("Checking {0} sitemap URLs".format(len(urls)))
            for url in urls:
                loc = url['loc']
                # Add the lastmod date to the Request meta
                lastmod = url.get('lastmod', None)
                if lastmod is not None:
                    lastmod = parse_w3c_datetime(lastmod)
                for r, c in self._cbs:
                    if r.search(loc):
                        self.urls.append({"url": loc, "lastmod": lastmod})
                        logging.info("Adding sitemap URL {0}".format(loc))
                        break
def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning("Ignoring invalid sitemap: %(response)s",
                           {'response': response}, extra={'spider': self})
            return

        s = Sitemap(body)
        it = self.sitemap_filter(s)
        if s.type == 'sitemapindex':
            for loc in iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(it, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break
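# The `sitemap_filter` hook used above is defined on SitemapSpider; the stock
# implementation is a pass-through that subclasses override, roughly:
def sitemap_filter(self, entries):
    """Override to filter sitemap entries, e.g. by their lastmod date."""
    for entry in entries:
        yield entry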
def _parse_sitemap(self, response):
    if response.url.endswith("/robots.txt"):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning(
                "Ignoring invalid sitemap: %(response)s",
                {"response": response},
                extra={"spider": self},
            )
            return

        s = Sitemap(body)
        it = self.sitemap_filter(s)
        if s.type == "sitemapindex":
            for (loc, ts, freq, prio) in iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == "urlset":
            for (loc, ts, freq, prio) in iterloc(it, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc):
                        ts = soscan.utils.parseDatetimeString(ts)
                        if self._count_only:
                            item = soscan.items.SitemapItem()
                            item["source"] = response.url
                            item["time_retrieved"] = soscan.utils.dtnow()
                            item["url"] = loc
                            item["time_loc"] = ts
                            item["changefreq"] = freq
                            item["priority"] = prio
                            logger.debug("Yield item: %s", item)
                            yield item
                        else:
                            req = Request(loc, callback=c, flags=[self._count_only])
                            req.meta["loc_timestamp"] = ts
                            req.meta["loc_source"] = response.url
                            req.meta["loc_changefreq"] = freq
                            req.meta["loc_priority"] = prio
                            yield req
                        break
def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        s = Sitemap(response.body)
        if s.type == 'sitemapindex':
            for sitemap in s:
                yield Request(sitemap['loc'], callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for url in s:
                loc = url['loc']
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break
def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap,
                          headers=self.sitemap_header)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning("Ignoring invalid sitemap: %(response)s",
                           {'response': response}, extra={'spider': self})
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap,
                                  headers=self.sitemap_header)
        elif s.type == 'urlset':
            # Use the Google News iterator when the news namespace is declared
            if ('news' in s._root.nsmap
                    and 'http://www.google.com/schemas/sitemap-news/' in s._root.nsmap['news']):
                for news in iternews(s, self.sitemap_alternate_links):
                    if self._index_filter(news):
                        for r, c in self._cbs:
                            if r.search(news['loc']):
                                yield Request(news['loc'], callback=c, meta=news,
                                              headers=self.sitemap_header)
                                break
            else:
                for item in iteritem(s):
                    if self._index_filter(item):
                        for r, c in self._cbs:
                            if r.search(item['loc']):
                                yield Request(item['loc'], callback=c, meta=item,
                                              headers=self.sitemap_header)
                                break
def test_sitemap_urls_from_robots(self):
    robots = """User-agent: *
Disallow: /aff/
Disallow: /wl/

# Search and shopping refining
Disallow: /s*/*facet
Disallow: /s*/*tags

# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml

# Forums
Disallow: /forum/search/
Disallow: /forum/active/
"""
    self.assertEqual(list(sitemap_urls_from_robots(robots)),
                     ['http://example.com/sitemap.xml',
                      'http://example.com/sitemap-product-index.xml'])
def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            log.msg("Ignoring invalid sitemap: %s" % response, log.WARNING)
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break
def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            log.msg(format="Ignoring invalid sitemap: %(response)s",
                    level=log.WARNING, spider=self, response=response)
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for d in s:
                for r, c in self._cbs:
                    if r.search(d['loc']):
                        yield Request(d['loc'], callback=c, meta=d)
                        break
def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning("Ignoring invalid sitemap: %(response)s",
                           {'response': response}, extra={'spider': self})
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break
def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            log.msg(format="Ignoring invalid sitemap: %(response)s",
                    level=log.WARNING, spider=self, response=response)
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, alt=self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s, since=self.sitemap_modified_since):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield PageRequest(loc, callback=c)
                        break
def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        if isinstance(response, XmlResponse):
            body = response.body
        elif is_gzipped(response):
            body = gunzip(response.body)
        else:
            log.msg("Ignoring non-XML sitemap: %s" % response, log.WARNING)
            return

        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break
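# Later Scrapy versions fold the gzip handling above into a helper on
# SitemapSpider; the modern implementation is roughly:
def _get_sitemap_body(self, response):
    """Return the sitemap body contained in the response, or None."""
    if isinstance(response, XmlResponse):
        return response.body
    if gzip_magic_number(response):
        return gunzip(response.body)
    # gzipped sitemaps are seen in the wild with all kinds of content
    # types, so fall back on the URL extension
    if response.url.endswith('.xml') or response.url.endswith('.xml.gz'):
        return response.body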