Example #1
    def _parse_sitemap(self, response):
        self.logger.info(f"_parse_sitemap, response: {response.url}")
        if response.url.endswith("/robots.txt"):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning(
                    "Ignoring invalid sitemap: %(response)s",
                    {"response": response},
                    extra={"spider": self},
                )
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)
            if s.type.lower() == "sitemapindex":
                for loc in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == "urlset":
                for loc in iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break
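
Several of these examples call an iterloc helper without defining it. In Scrapy it lives in scrapy.spiders.sitemap; a minimal sketch of its behavior, assuming the entry dicts yielded by Sitemap, looks like this:

def iterloc(it, alt=False):
    # Yield the <loc> URL of each sitemap entry; with alt=True, also
    # yield any xhtml:link rel="alternate" URLs collected by Sitemap.
    for d in it:
        yield d['loc']
        if alt and 'alternate' in d:
            yield from d['alternate']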
Example #2
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.info('Ignoring invalid sitemap: %s', response.url)
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                cnx = mysql.connector.connect(**config)
                cursor = cnx.cursor()
                truncate = "truncate table channel_sitemap"
                cursor.execute(truncate)
                cnx.commit()
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            print(loc)
                            cursor.execute("insert into channel_sitemap (url) values (%s)", (loc,))
                            cnx.commit()
                            break
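
This example also assumes a mysql.connector setup that is not shown. A hypothetical config dict for the mysql.connector.connect(**config) call above might look like the following; every value is a placeholder, not part of the original example:

import mysql.connector

# Hypothetical connection settings for mysql.connector.connect(**config)
config = {
    'user': 'scrapy',
    'password': 'secret',
    'host': '127.0.0.1',
    'database': 'crawl',
}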
Example #3
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response},
                               extra={'spider': self})
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)

            if s.type == 'sitemapindex':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break
Example #4
    def _parse_sitemap(self, response):  # this function takes us from the sitemap to yielding the first batch of requests
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(
                    response.text, base_url=response.url):  # basically parse the URLs out line by line and send them off
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response},
                               extra={'spider': self})
                return

            s = Sitemap(body)  # parse the response with lxml.etree.XMLParser, then extract the URLs
            it = self.sitemap_filter(s)  # filter once more with the subclass's sitemap_filter method

            if s.type == 'sitemapindex':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(it, self.sitemap_alternate_links):  # turns the entries into a generator of URLs
                    for r, c in self._cbs:  # check each URL against the corresponding regex rule
                        if r.search(loc):  # it matches the regex rule, so yield a request
                            yield Request(loc, callback=c)
                            break
Example #5
    def _parse_sitemap(self, response):
        logging.info("Parsing sitemap %s", response)

        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logging.warning("Ignoring invalid sitemap: %(response)s",
                                {'response': response})
                return
            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for url in iter(s):
                    loc = url['loc']
                    # Add the lastmod date to the Request meta
                    lastmod = url.get('lastmod', None)
                    if lastmod is not None:
                        lastmod = parse_w3c_datetime(lastmod)
                    for r, c in self._cbs:
                        if r.search(loc):
                            self.urls.append({"url": loc, "lastmod": lastmod})
                            break
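
The parse_w3c_datetime helper used above is not part of Scrapy and is not defined here. A minimal sketch, assuming <lastmod> values are W3C datetimes (a subset of ISO 8601, as in the tests below), could be:

from datetime import datetime

def parse_w3c_datetime(value):
    # Minimal sketch of the assumed helper: handles values such as
    # '2005-01-01' and '2004-10-01T18:23:17+00:00'.
    return datetime.fromisoformat(value)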
Example #6
    def test_sitemap(self):
        s = Sitemap("""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url>
    <loc>http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>http://www.example.com/Special-Offers.html</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>""")
        assert s.type == 'urlset'
        self.assertEqual(list(s), [{
            'priority': '1',
            'loc': 'http://www.example.com/',
            'lastmod': '2009-08-16',
            'changefreq': 'daily'
        }, {
            'priority': '0.8',
            'loc': 'http://www.example.com/Special-Offers.html',
            'lastmod': '2009-08-16',
            'changefreq': 'weekly'
        }])
Example #7
    def test_alternate(self):
        s = Sitemap("""<?xml version="1.0" encoding="UTF-8"?>
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
        <url>
            <loc>http://www.example.com/english/</loc>
            <xhtml:link rel="alternate" hreflang="de"
                href="http://www.example.com/deutsch/"/>
            <xhtml:link rel="alternate" hreflang="de-ch"
                href="http://www.example.com/schweiz-deutsch/"/>
            <xhtml:link rel="alternate" hreflang="en"
                href="http://www.example.com/english/"/>
            <xhtml:link rel="alternate" hreflang="en"/><!-- wrong tag without href -->
        </url>
    </urlset>""")

        self.assertEqual(list(s), [{
            'loc':
            'http://www.example.com/english/',
            'alternate': [
                'http://www.example.com/deutsch/',
                'http://www.example.com/schweiz-deutsch/',
                'http://www.example.com/english/'
            ]
        }])
Example #8
    def test_sitemap_strip(self):
        """Assert we can deal with trailing spaces inside <loc> tags - we've
        seen those
        """
        s = Sitemap("""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url>
    <loc> http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc> http://www.example.com/2</loc>
    <lastmod />
  </url>
</urlset>
""")
        self.assertEqual(list(s), [
            {
                'priority': '1',
                'loc': 'http://www.example.com/',
                'lastmod': '2009-08-16',
                'changefreq': 'daily'
            },
            {
                'loc': 'http://www.example.com/2',
                'lastmod': ''
            },
        ])
Example #9
    def _parse_sitemap(self, response):
        body = self._get_sitemap_body(response)
        if body is None:
            self.logger.warning("Ignoring invalid sitemap: %(response)s",
                                {'response': response},
                                extra={'spider': self})
            return
        s = Sitemap(body)
        if s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        category_regex = r'toptenreviews\.com/(.*)/$'
                        match = re.search(category_regex, loc)

                        # skip URLs that do not match the category pattern
                        # (the pattern may need changing for other sites)
                        if not match:
                            break

                        category = CategoryItem()
                        category['category_path'] = match.group(1)
                        category['category_url'] = loc
                        if self.should_skip_category(category):
                            break

                        yield category
                        request = Request(loc, callback=c)
                        request.meta['category'] = category
                        yield request
                        break
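
The CategoryItem used above is project-specific and not defined here. A minimal declaration consistent with the fields it assigns would be:

import scrapy

class CategoryItem(scrapy.Item):
    # Hypothetical item matching the fields used in Example #9
    category_path = scrapy.Field()
    category_url = scrapy.Field()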
Example #10
    def test_sitemap_wrong_ns(self):
        """We have seen sitemaps with wrongs ns. Presumably, Google still works
        with these, though is not 100% confirmed"""
        s = Sitemap("""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url xmlns="">
    <loc> http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url xmlns="">
    <loc> http://www.example.com/2</loc>
    <lastmod />
  </url>
</urlset>
""")
        self.assertEqual(list(s), [
            {
                'priority': '1',
                'loc': 'http://www.example.com/',
                'lastmod': '2009-08-16',
                'changefreq': 'daily'
            },
            {
                'loc': 'http://www.example.com/2',
                'lastmod': ''
            },
        ])
Example #11
    def _parse_sitemap(self, response):

        if response.url.endswith('/robots.txt'):

            for url in sitemap_urls_from_robots(response.text, base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)

        else:
            body = self._get_sitemap_body(response)
            if body is None:
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)

            if s.type == 'sitemapindex':
                for loc, lastmod in self._iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        request = Request(loc, callback=self._parse_sitemap)
                        request.meta['lastmod'] = lastmod
                        yield request
            elif s.type == 'urlset':
                for loc, lastmod in self._iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            if self.enable_page_limit and self.request_counter >= self.max_requests:
                                return
                            if not self._filter(loc):
                                # count filtered-out URLs at a fractional
                                # weight toward the page limit
                                self.request_counter += 0.2
                                break
                            self.request_counter += 1
                            request = Request(loc, callback=c)
                            request.meta['lastmod'] = lastmod
                            yield request
                            break
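
This spider relies on a custom self._iterloc that yields (loc, lastmod) tuples rather than bare URLs. The helper is not shown; a sketch consistent with how it is consumed above might be:

    def _iterloc(self, it, alt=False):
        # Hypothetical helper matching the (loc, lastmod) tuples unpacked
        # above; lastmod is None when the sitemap omits <lastmod>.
        for d in it:
            yield d['loc'], d.get('lastmod')
            if alt and 'alternate' in d:
                for alt_loc in d['alternate']:
                    yield alt_loc, d.get('lastmod')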
Example #12
from urllib.request import urlopen  # urllib2's urlopen lives here in Python 3

from scrapy.utils.sitemap import Sitemap


def sitemap_to_array(url):
    """Fetch a sitemap and return the list of <loc> URLs it contains."""
    results = []
    body = urlopen(url).read()
    sitemap = Sitemap(body)
    for item in sitemap:
        results.append(item['loc'])
    return results
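
A hypothetical call (the sitemap URL is a placeholder):

urls = sitemap_to_array('https://www.example.com/sitemap.xml')
print(len(urls), 'URLs found')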
Example #13
    def parse(self, response):
        # the sitemap response arrives gzip-compressed, so decompress first
        body = gunzip(response.body)
        s = Sitemap(body)
        for sitelink in s:
            url = sitelink['loc']
            yield scrapy.Request(url, callback=self.parse_details)
Example #14
    def test_sitemap_blanklines(self):
        """Assert we can deal with starting blank lines before <xml> tag"""
        s = Sitemap(b"""\

<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">

<!-- cache: cached = yes name = sitemap_jspCache key = sitemap -->
<sitemap>
<loc>http://www.example.com/sitemap1.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

<sitemap>
<loc>http://www.example.com/sitemap2.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

<sitemap>
<loc>http://www.example.com/sitemap3.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

<!-- end cache -->
</sitemapindex>
""")
        self.assertEqual(list(s), [
            {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap1.xml'},
            {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap2.xml'},
            {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap3.xml'},
        ])
Example #15
    def _parse_sitemap(self, response):
        """Collect all URLs, then fetch each one with a SplashRequest."""

        urls_list = []

        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response},
                               extra={'spider': self})
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)

            if s.type == 'sitemapindex':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            urls_list.append(loc)

        # ThreadPool is assumed to be multiprocessing.pool.ThreadPool
        pool = ThreadPool()
        for s in pool.map(self.get_response, urls_list):
            yield s
        pool.close()
        pool.join()
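
The get_response method mapped over the pool is not shown. Given the docstring, a sketch using scrapy_splash (assuming it is installed and configured in the project settings) might be:

    def get_response(self, url):
        # Hypothetical helper: build a SplashRequest per URL, which
        # _parse_sitemap above then yields back to the engine.
        from scrapy_splash import SplashRequest
        return SplashRequest(url, callback=self.parse, args={'wait': 1.0})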
Example #16
    def parse_sitemap(self, response):
        sitemap = Sitemap(response.body)
        for site_url in sitemap:
            url = site_url['loc']
            if "sitemap" in url and ".xml" in url:
                yield Request(url, self.parse_sitemap)

            else:
                yield Request(url, self.parse_page)
Example #17
    def test_comment(self):
        s = Sitemap("""<?xml version="1.0" encoding="UTF-8"?>
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
        <url>
            <loc>http://www.example.com/</loc>
            <!-- this is a comment on which the parser might raise an exception if implemented incorrectly -->
        </url>
    </urlset>""")

        self.assertEqual(list(s), [{'loc': 'http://www.example.com/'}])
Example #18
    def _parse_sitemap(self, response):
        if response.url.endswith("/robots.txt"):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning(
                    "Ignoring invalid sitemap: %(response)s",
                    {"response": response},
                    extra={"spider": self},
                )
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)

            if s.type == "sitemapindex":
                for (loc, ts, freq,
                     prio) in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == "urlset":
                for (loc, ts, freq,
                     prio) in iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            ts = soscan.utils.parseDatetimeString(ts)
                            if self._count_only:
                                item = soscan.items.SitemapItem()
                                item["source"] = response.url
                                item["time_retrieved"] = soscan.utils.dtnow()
                                item["url"] = loc
                                item["time_loc"] = ts
                                item["changefreq"] = freq
                                item["priority"] = prio
                                logger.debug("Yield item: %s", item)
                                yield item
                            else:
                                req = Request(
                                    loc,
                                    callback=c,
                                    flags=[
                                        self._count_only,
                                    ],
                                )
                                req.meta["loc_timestamp"] = ts
                                req.meta["loc_source"] = response.url
                                req.meta["loc_changefreq"] = freq
                                req.meta["loc_priority"] = prio
                                yield req
                            break
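
Here iterloc has been customized to yield (loc, lastmod, changefreq, priority) tuples instead of bare URLs. A sketch consistent with the unpacking above, assuming the entry dicts produced by Sitemap:

def iterloc(it, alt=False):
    # Hypothetical variant of Scrapy's iterloc for this example: yields
    # one 4-tuple per sitemap entry, with None for missing fields.
    for d in it:
        yield (d['loc'], d.get('lastmod'), d.get('changefreq'),
               d.get('priority'))
        if alt and 'alternate' in d:
            for alt_loc in d['alternate']:
                yield (alt_loc, d.get('lastmod'), d.get('changefreq'),
                       d.get('priority'))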
Example #19
    def test_xml_entity_expansion(self):
        s = Sitemap("""<?xml version="1.0" encoding="utf-8"?>
          <!DOCTYPE foo [
          <!ELEMENT foo ANY >
          <!ENTITY xxe SYSTEM "file:///etc/passwd" >
          ]>
          <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
            <url>
              <loc>http://127.0.0.1:8000/&xxe;</loc>
            </url>
          </urlset>
        """)

        self.assertEqual(list(s), [{'loc': 'http://127.0.0.1:8000/'}])
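
Note what the assertion locks in: the external entity &xxe; is dropped rather than expanded, so the parser never reads /etc/passwd into the URL.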
Example #20
    def test_sitemap_index(self):
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
   <sitemap>
      <loc>http://www.example.com/sitemap1.xml.gz</loc>
      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
   </sitemap>
   <sitemap>
      <loc>http://www.example.com/sitemap2.xml.gz</loc>
      <lastmod>2005-01-01</lastmod>
   </sitemap>
</sitemapindex>""")
        assert s.type == 'sitemapindex'
        self.assertEqual(list(s), [{'loc': 'http://www.example.com/sitemap1.xml.gz', 'lastmod': '2004-10-01T18:23:17+00:00'}, {'loc': 'http://www.example.com/sitemap2.xml.gz', 'lastmod': '2005-01-01'}])
Example #21
    def parse_sitemap(self, response):
        # Errors
        if response.status != 200:
            raise CloseSpider('Bad response returned')

        # Parse sitemap.xml
        sitemap = Sitemap(response.body)
        if sitemap.type == 'urlset' or sitemap.type == 'sitemapindex':
            for url in sitemap:
                item = SearchResultItem()
                item['url'] = url['loc']
                item['cache'] = response.url
                yield item
                # Also consider alternate URLs (xhtml:link rel="alternate")
                if 'alternate' in url:
                    for alt in url['alternate']:
                        item = SearchResultItem()
                        item['url'] = alt
                        item['cache'] = response.url
                        yield item
Example #22
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                log.msg("Ignoring invalid sitemap: %s" % response, log.WARNING)
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break
Example #23
    def _parse_sitemap(self, response):
        self.logger.info("Custom sitemap parsing function initiating")
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = response.body
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response},
                               extra={'spider': self})
                return

            s = Sitemap(body)
            loc_reg = r'<loc>(.*?)</loc>'
            if s.type == 'sitemapindex':
                for loc in re.findall(loc_reg, body.decode('utf-8')):
                    print(loc)
                    yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in re.findall(loc_reg, body.decode('utf-8')):
                    yield Request(loc, callback=self.parse_wine)
Example #24
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                log.msg(format="Ignoring invalid sitemap: %(response)s",
                        level=log.WARNING,
                        spider=self,
                        response=response)
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, alt=self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s, since=self.sitemap_modified_since):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield PageRequest(loc, callback=c)
                            break
Example #25
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            if isinstance(response, XmlResponse):
                body = response.body
            elif is_gzipped(response):
                # gunzip and is_gzipped come from scrapy.utils.gz
                # (is_gzipped only exists in older Scrapy releases)
                body = gunzip(response.body)
            else:
                log.msg("Ignoring non-XML sitemap: %s" % response, log.WARNING)
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break
Example #26
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
__author__ = "Sigai"

import requests

from scrapy.utils.sitemap import Sitemap

url = 'https://answers.microsoft.com/map/thread?day=1'

body = requests.get(url)

s = Sitemap(body.content)

for d in s:
    print(d)
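
Each d printed here is a plain dict of the entry's fields, e.g. {'loc': ..., 'lastmod': ...}, matching the structures asserted in the tests above.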