Code Example #1
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response}, extra={'spider': self})
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            if not doctor_exists(loc):
                                self.logger.debug("Doctor's url not found in db. Fetching data")
                                yield Request(loc, callback=c)
                            else:
                                self.logger.debug("Doctor's url found in db. Passing it on")
                            break
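
Code Example #1 gates each request on a `doctor_exists(loc)` helper that is not shown. A minimal sketch of what such a helper might look like, assuming a SQLite store with a hypothetical `doctors` table and `url` column:

import sqlite3

def doctor_exists(loc, db_path='doctors.db'):
    # Hypothetical helper: return True when this doctor's URL is already
    # stored, so the spider skips re-fetching known pages. The 'doctors'
    # table and 'url' column are assumptions, not part of the original.
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            'SELECT 1 FROM doctors WHERE url = ? LIMIT 1', (loc,)).fetchone()
        return row is not None
    finally:
        conn.close()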
Code Example #2
    def parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text, base_url=response.url):
                yield Request(url, callback=self.parse_sitemap)

        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response}, extra={'spider': self})
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self.parse_sitemap)

            elif s.type == 'urlset':
                for loc in iterloc(s):
                    print(loc)
                    if loc.count(self.rule):
                        self.count += 1
                    else:
                        self.other += 1
                print("Total Rule Matched:", self.count)
                print("Total Other Count:", self.other)
Code Example #3
File: test_utils_sitemap.py  Project: lucasqlm/scrapy
    def test_sitemap_urls_from_robots(self):
        robots = """User-agent: *
Disallow: /aff/
Disallow: /wl/

# Search and shopping refining
Disallow: /s*/*facet
Disallow: /s*/*tags

# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
Sitemap: HTTP://example.com/sitemap-uppercase.xml
Sitemap: /sitemap-relative-url.xml

# Forums
Disallow: /forum/search/
Disallow: /forum/active/
"""
        self.assertEqual(
            list(
                sitemap_urls_from_robots(robots,
                                         base_url='http://example.com')),
            [
                'http://example.com/sitemap.xml',
                'http://example.com/sitemap-product-index.xml',
                'http://example.com/sitemap-uppercase.xml',
                'http://example.com/sitemap-relative-url.xml'
            ])
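
This test exercises `sitemap_urls_from_robots` from `scrapy.utils.sitemap`, which looks roughly like the sketch below: it scans robots.txt for `Sitemap:` directives (case-insensitively) and resolves each URL against `base_url`, which is why the relative and uppercase-scheme cases above normalize cleanly.

from urllib.parse import urljoin

def sitemap_urls_from_robots(robots_text, base_url=None):
    # Yield every sitemap URL declared in a robots.txt body.
    for line in robots_text.splitlines():
        if line.lstrip().lower().startswith('sitemap:'):
            url = line.split(':', 1)[1].strip()
            yield urljoin(base_url, url)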
Code Example #4
    def _parse_sitemap(self, response):
        logging.info("Parsing sitemap %s" % response)

        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logging.warning("Ignoring invalid sitemap: %(response)s",
                                response=response)
                return
            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for url in iter(s):
                    loc = url['loc']
                    # Add the lastmod date to the Request meta
                    lastmod = url.get('lastmod', None)
                    if lastmod is not None:
                        lastmod = parse_w3c_datetime(lastmod)
                    for r, c in self._cbs:
                        if r.search(loc):
                            self.urls.append({"url": loc, "lastmod": lastmod})
                            break
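
Code Example #4 (and Code Example #12 below) convert `<lastmod>` values with a `parse_w3c_datetime` helper that is not shown. A minimal sketch, assuming the W3C datetime profiles commonly seen in sitemaps; only the name comes from the examples, the implementation is an assumption:

from datetime import datetime

def parse_w3c_datetime(value):
    # Hypothetical implementation: try the W3C datetime profiles used in
    # sitemap <lastmod> fields, from most to least specific; return None
    # when none of them match.
    for fmt in ('%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M%z', '%Y-%m-%d'):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None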
Code Example #5
    def _parse_sitemap(self, response):
        """get all urls and call for each one with SplashRequest"""

        urls_list = []

        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response},
                               extra={'spider': self})
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)

            if s.type == 'sitemapindex':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            urls_list.append(loc)

        pool = ThreadPool()
        for s in pool.map(self.get_response, urls_list):
            yield s
        pool.close()
        pool.join()
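
Code Example #5 fans the collected URLs out through a thread pool (presumably `multiprocessing.pool.ThreadPool`) and a `get_response` helper that is not shown; per the docstring it wraps each URL in a `SplashRequest` from scrapy_splash. A plausible sketch, with the rendering args and callback as assumptions:

    def get_response(self, url):
        # Hypothetical helper: build a Splash-rendered request for this URL
        # (requires scrapy_splash.SplashRequest). The wait time and parse
        # callback are assumptions, not part of the original example.
        return SplashRequest(url, callback=self.parse, args={'wait': 2.0})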
Code Example #6
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.info('Ignoring invalid sitemap: %s', response.url)
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                cnx = mysql.connector.connect(**config)
                cursor = cnx.cursor()
                truncate = "truncate table channel_sitemap"
                cursor.execute(truncate)
                cnx.commit()
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            print(loc)
                            cursor.execute("insert into channel_sitemap (url) values (%s)", (loc,))
                            cnx.commit()
                            break
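
Code Example #6 calls `mysql.connector.connect(**config)` without defining `config`; `mysql.connector` expects a mapping of connection keyword arguments along these lines (every value below is a placeholder):

import mysql.connector

# Hypothetical connection settings; all values are placeholders.
config = {
    'user': 'scraper',
    'password': 'changeme',
    'host': '127.0.0.1',
    'database': 'channels',
}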
Code Example #7
    def _parse_sitemap(self, response):  # this single function covers everything from the sitemap to sending out the first batch of requests
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(
                    response.text, base_url=response.url):  # essentially parse the URLs line by line and send them out
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response},
                               extra={'spider': self})
                return

            s = Sitemap(body)  # parse the response with lxml.etree.XMLParser, then filter out the URLs
            it = self.sitemap_filter(s)  # filter once more with the subclass's own method

            if s.type == 'sitemapindex':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(it, self.sitemap_alternate_links):  # becomes a generator
                    for r, c in self._cbs:  # check whether the URL matches the corresponding regex rule
                        if r.search(loc):  # it matches the regex rule, so emit the request
                            yield Request(loc, callback=c)
                            break
Code Example #8
    def _parse_sitemap(self, response):
        self.logger.info(f"_parse_sitemap, response: {response.url}")
        if response.url.endswith("/robots.txt"):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning(
                    "Ignoring invalid sitemap: %(response)s",
                    {"response": response},
                    extra={"spider": self},
                )
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)
            if s.type.lower() == "sitemapindex":
                for loc in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == "urlset":
                for loc in iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break
Code Example #9
    def _parse_sitemap(self, response):

        if response.url.endswith('/robots.txt'):

            for url in sitemap_urls_from_robots(response.text, base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)

        else:
            body = self._get_sitemap_body(response)
            if body is None:
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)

            if s.type == 'sitemapindex':
                for loc, lastmod in self._iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        request = Request(loc, callback=self._parse_sitemap)
                        request.meta['lastmod'] = lastmod
                        yield request
            elif s.type == 'urlset':
                for loc, lastmod in self._iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            if self.enable_page_limit and self.request_counter >= self.max_requests:
                                return
                            if not self._filter(loc):
                                self.request_counter += 0.2
                                break
                            self.request_counter += 1
                            request = Request(loc, callback=c)
                            request.meta['lastmod'] = lastmod
                            yield request
                            break
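
Code Example #9 iterates with a private `self._iterloc` that yields `(loc, lastmod)` pairs instead of bare URLs. A sketch of such a variant, modeled on the shape of Scrapy's `iterloc` helper; the implementation is an assumption:

    def _iterloc(self, it, alt=False):
        # Pair each location with its <lastmod> value (None when absent)
        # so it can ride along in the Request meta.
        for d in it:
            lastmod = d.get('lastmod')
            yield d['loc'], lastmod
            # Also consider alternate URLs (xhtml:link rel="alternate")
            if alt and 'alternate' in d:
                for link in d['alternate']:
                    yield link, lastmod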
Code Example #10
    def parse_robots(self, response):
        for url in sitemap_urls_from_robots(response.body):
            if 'sitemap-products' in url:
                yield Request(url,
                              callback=self._parse_sitemap,
                              errback=self.on_error,
                              meta={'retry': 5})
Code Example #11
    def _parse_sitemap(self, response):
        """This is adapted from scrapy.spiders.sitemap"""
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.warning("Ignoring invalid sitemap: %(response)s",
                                    {'response': response},
                                    extra={'spider': self})
                return

            s = NewsSitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield self.url_to_request(loc,
                                                  callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc, meta in self.iterurlset(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            try:
                                self.logger.debug(f'Queuing {loc}')
                                yield self.url_to_request(loc,
                                                          callback=c,
                                                          meta=meta)
                                break
                            except Exception as e:
                                self.logger.error(
                                    f'Failed to queue {loc}: {e}')
Code Example #12
    def _parse_sitemap(self, response):
        logging.info("Parsing sitemap %s" % response)

        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logging.warning("Ignoring invalid sitemap: %(response)s", response=response)
                return
            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                urls = list(iter(s))
                logging.info("Checking {0} sitemap URLs".format(len(urls)))
                for url in urls:
                    loc = url['loc']
                    # Add the lastmod date to the Request meta
                    lastmod = url.get('lastmod', None)
                    if lastmod is not None:
                        lastmod = parse_w3c_datetime(lastmod)
                    for r, c in self._cbs:
                        if r.search(loc):
                            self.urls.append({"url": loc, "lastmod": lastmod})
                            logging.info("Adding sitemap URL {0}".format(loc))
                            break
Code Example #13
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response},
                               extra={'spider': self})
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)

            if s.type == 'sitemapindex':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break
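
Code Example #13 matches the stock `SitemapSpider._parse_sitemap` in recent Scrapy releases. For reference, the `iterloc` helper that most of the examples on this page call lives in `scrapy.spiders.sitemap` and looks roughly like this:

def iterloc(it, alt=False):
    for d in it:
        yield d['loc']

        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            yield from d['alternate']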
Code Example #14
    def _parse_sitemap(self, response):
        if response.url.endswith("/robots.txt"):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning(
                    "Ignoring invalid sitemap: %(response)s",
                    {"response": response},
                    extra={"spider": self},
                )
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)

            if s.type == "sitemapindex":
                for (loc, ts, freq,
                     prio) in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == "urlset":
                for (loc, ts, freq,
                     prio) in iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            ts = soscan.utils.parseDatetimeString(ts)
                            if self._count_only:
                                item = soscan.items.SitemapItem()
                                item["source"] = response.url
                                item["time_retrieved"] = soscan.utils.dtnow()
                                item["url"] = loc
                                item["time_loc"] = ts
                                item["changefreq"] = freq
                                item["priority"] = prio
                                logger.debug("Yield item: %s", item)
                                yield item
                            else:
                                req = Request(
                                    loc,
                                    callback=c,
                                    flags=[
                                        self._count_only,
                                    ],
                                )
                                req.meta["loc_timestamp"] = ts
                                req.meta["loc_source"] = response.url
                                req.meta["loc_changefreq"] = freq
                                req.meta["loc_priority"] = prio
                                yield req
                            break
Code Example #15
File: sitemap.py  Project: saidimu/scrapy
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            s = Sitemap(response.body)
            if s.type == 'sitemapindex':
                for sitemap in s:
                    yield Request(sitemap['loc'], callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for url in s:
                    loc = url['loc']
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break
Code Example #16
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url,
                              callback=self._parse_sitemap,
                              headers=self.sitemap_header)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response},
                               extra={'spider': self})
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc,
                                      callback=self._parse_sitemap,
                                      headers=self.sitemap_header)
            elif s.type == 'urlset':
                news_ns = s._root.nsmap.get('news')
                if news_ns and 'http://www.google.com/schemas/sitemap-news/' in news_ns:
                    for news in iternews(s, self.sitemap_alternate_links):
                        if self._index_filter(news):
                            for r, c in self._cbs:
                                if r.search(news['loc']):
                                    yield Request(news['loc'],
                                                  callback=c,
                                                  meta=news,
                                                  headers=self.sitemap_header)
                                    break
                else:
                    for item in iteritem(s):
                        if self._index_filter(item):
                            for r, c in self._cbs:
                                if r.search(item['loc']):
                                    yield Request(item['loc'],
                                                  callback=c,
                                                  meta=item,
                                                  headers=self.sitemap_header)
                                    break
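
Code Example #16 depends on `iternews` and `iteritem` helpers (not shown) that yield whole urlset entries rather than bare locations, so the news metadata can travel into the Request meta. A purely speculative sketch of `iternews`:

def iternews(it, alt=False):
    # Hypothetical helper: yield each urlset entry as a dict so the
    # Google News fields stay attached to 'loc'; optionally re-yield
    # the entry once per alternate link.
    for entry in it:
        yield entry
        if alt and 'alternate' in entry:
            for link in entry['alternate']:
                alt_entry = dict(entry)
                alt_entry['loc'] = link
                yield alt_entry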
Code Example #17
File: test_utils_sitemap.py  Project: floppya/scrapy
    def test_sitemap_urls_from_robots(self):
        robots = """User-agent: *
Disallow: /aff/
Disallow: /wl/

# Search and shopping refining
Disallow: /s*/*facet
Disallow: /s*/*tags

# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml

# Forums 
Disallow: /forum/search/
Disallow: /forum/active/
"""
        self.assertEqual(
            list(sitemap_urls_from_robots(robots)),
            ['http://example.com/sitemap.xml',
             'http://example.com/sitemap-product-index.xml'])
Code Example #18
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                log.msg("Ignoring invalid sitemap: %s" % response, log.WARNING)
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break
Code Example #19
File: sitemap.py  Project: cbourjau/scrapy
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                log.msg(format="Ignoring invalid sitemap: %(response)s",
                        level=log.WARNING, spider=self, response=response)
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for d in s:
                    for r, c in self._cbs:
                        if r.search(d['loc']):
                            yield Request(d['loc'], callback=c, meta=d)
                            break
Code Example #20
File: sitemap.py  Project: ArturGaspar/scrapy
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text, base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response}, extra={'spider': self})
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break
Code Example #21
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                log.msg(format="Ignoring invalid sitemap: %(response)s",
                        level=log.WARNING,
                        spider=self,
                        response=response)
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, alt=self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s, since=self.sitemap_modified_since):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield PageRequest(loc, callback=c)
                            break
Code Example #22
File: sitemap.py  Project: Aaron1011/oh-mainline
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            if isinstance(response, XmlResponse):
                body = response.body
            elif is_gzipped(response):
                body = gunzip(response.body)
            else:
                log.msg("Ignoring non-XML sitemap: %s" % response, log.WARNING)
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break