Example no. 1
 def test_40(self):
     # Assumes: import string; from random import randint, sample;
     # and the iterloc generator under test.
     good_characters = ''.join([
         x for x in string.printable
         if x not in string.punctuation and x not in string.whitespace
     ])
     oracle_iterator = [
         {
             "loc": ''.join(sample(good_characters, randint(8, 16))),
             "alternate": [
                 ''.join(sample(good_characters, randint(8, 16)))
                 for _ in range(randint(0, 10))
             ]
         } for _ in range(randint(0, 100))  # Not too many tests
     ]
     # Let's build the correct results
     locations = sorted([x['loc'] for x in oracle_iterator])
     alternate = [alt for x in oracle_iterator for alt in x['alternate']]
     full = sorted(locations + alternate)
     # And get the iterloc ones
     result1 = sorted(iterloc(oracle_iterator))
     result2 = sorted(iterloc(oracle_iterator, alt=True))
     # Check the results
     self.assertEqual(result1, locations)
     self.assertEqual(result2, full)
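The tests above exercise iterloc without showing it. For reference, here is a minimal sketch of the generator they assume, mirroring scrapy.spiders.sitemap.iterloc; treat it as an illustration of the contract, not necessarily the exact code under test:

def iterloc(it, alt=False):
    # Yield the 'loc' URL of each sitemap entry; with alt=True, also
    # yield any alternate-language URLs attached to the entry
    # (taken from <xhtml:link rel="alternate" href="...">).
    for d in it:
        yield d["loc"]
        if alt and "alternate" in d:
            yield from d["alternate"]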
Example no. 2
    def _parse_sitemap(self, response):
        self.logger.info(f"_parse_sitemap, response: {response.url}")
        if response.url.endswith("/robots.txt"):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.warning(
                    "Ignoring invalid sitemap: %(response)s",
                    {"response": response},
                    extra={"spider": self},
                )
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)
            if s.type.lower() == "sitemapindex":
                for loc in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == "urlset":
                for loc in iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break
Example no. 3
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.warning("Ignoring invalid sitemap: %(response)s",
                                    {'response': response},
                                    extra={'spider': self})
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            if not doctor_exists(loc):
                                self.logger.debug("Doctor's url not found in db. Fetching data")
                                yield Request(loc, callback=c)
                            else:
                                self.logger.debug("Doctor's url found in db. Passing it on")
                            break
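Both variants above rely on sitemap_urls_from_robots to extract sitemap URLs from a robots.txt body. A minimal sketch of that helper, modeled on scrapy.utils.sitemap (the real function is slightly more defensive):

from urllib.parse import urljoin

def sitemap_urls_from_robots(robots_text, base_url=None):
    # Yield every URL listed on a "Sitemap:" line of robots.txt,
    # resolved against base_url in case the entry is relative.
    for line in robots_text.splitlines():
        if line.lstrip().lower().startswith('sitemap:'):
            url = line.split(':', 1)[1].strip()
            yield urljoin(base_url, url)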
Example no. 4
 def test_39(self):
     single_element_iterator = [{
         "loc": "location",
         "alternate": ["alternate_location"]
     }]
     result1 = list(iterloc(single_element_iterator))
     result2 = list(iterloc(single_element_iterator, alt=True))
     self.assertEqual(result1, ["location"])
     self.assertEqual(sorted(result2),
                      sorted(["location", "alternate_location"]))
Example no. 5
    def _parse_sitemap(self, response):
        logging.info("Parsing sitemap %s" % response)

        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logging.warning("Ignoring invalid sitemap: %(response)s",
                                response=response)
                return
            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for url in s:
                    loc = url['loc']
                    # Add the lastmod date to the Request meta
                    lastmod = url.get('lastmod', None)
                    if lastmod is not None:
                        lastmod = parse_w3c_datetime(lastmod)
                    for r, c in self._cbs:
                        if r.search(loc):
                            self.urls.append({"url": loc, "lastmod": lastmod})
                            break
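parse_w3c_datetime is not shown anywhere in these examples; the name suggests a small project helper for the W3C datetime profile that sitemap <lastmod> values use. A hypothetical sketch:

from datetime import datetime

def parse_w3c_datetime(value):
    # Hypothetical helper: <lastmod> uses the W3C datetime profile of
    # ISO 8601, e.g. "2004-12-23" or "2004-12-23T18:00:15+00:00".
    # fromisoformat accepts both; normalize a trailing "Z" first.
    return datetime.fromisoformat(value.replace("Z", "+00:00"))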
Example no. 6
    def _parse_sitemap(self, response):
        """This is adapted from scrapy.spiders.sitemap"""
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.warning("Ignoring invalid sitemap: %(response)s",
                                    {'response': response},
                                    extra={'spider': self})
                return

            s = NewsSitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield self.url_to_request(loc,
                                                  callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc, meta in self.iterurlset(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            try:
                                self.logger.debug(f'Queuing {loc}')
                                yield self.url_to_request(loc,
                                                          callback=c,
                                                          meta=meta)
                                break
                            except Exception as e:
                                self.logger.error(
                                    f'Failed to queue {loc}: {e}')
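NewsSitemap, iterurlset, and url_to_request are custom members of this spider whose definitions are not shown. Judging from the call sites, url_to_request is a thin wrapper around Request construction; a hypothetical sketch:

    def url_to_request(self, url, callback=None, meta=None):
        # Hypothetical wrapper matching the calls above: a single place
        # to attach shared headers, cookies, or meta before queuing.
        return Request(url, callback=callback, meta=meta or {})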
Example no. 7
    def _parse_sitemap(self, response):
        body = self._get_sitemap_body(response)
        if body is None:
            self.logger.warning("Ignoring invalid sitemap: %(response)s",
                                {'response': response},
                                extra={'spider': self})
            return
        s = Sitemap(body)
        if s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        category_regex = r'toptenreviews\.com/(.*)/$'
                        match = re.search(category_regex, loc)

                        # skip this URL if it doesn't match a category path
                        if not match:
                            break

                        category = CategoryItem()
                        category['category_path'] = match.group(1)
                        category['category_url'] = loc
                        if self.should_skip_category(category):
                            break

                        yield category
                        request = Request(loc, callback=c)
                        request.meta['category'] = category
                        yield request
                        break
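CategoryItem is project-specific and not shown. From the two fields assigned above, a matching scrapy Item would look like this sketch (the real item may define more fields):

import scrapy

class CategoryItem(scrapy.Item):
    # Fields inferred from the assignments in _parse_sitemap above.
    category_path = scrapy.Field()
    category_url = scrapy.Field()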
Example no. 8
    def _parse_sitemap(self, response):
        logging.info("Parsing sitemap %s" % response)

        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text,
                                                base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logging.warning("Ignoring invalid sitemap: %(response)s",
                                {'response': response})
                return
            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                urls = list(s)
                logging.info("Checking {0} sitemap URLs".format(len(urls)))
                for url in urls:
                    loc = url['loc']
                    # Add the lastmod date to the Request meta
                    lastmod = url.get('lastmod', None)
                    if lastmod is not None:
                        lastmod = parse_w3c_datetime(lastmod)
                    for r, c in self._cbs:
                        if r.search(loc):
                            self.urls.append({"url": loc, "lastmod": lastmod})
                            logging.info("Adding sitemap URL {0}".format(loc))
                            break
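For context on the 'loc'/'lastmod' lookups above: iterating a scrapy.utils.sitemap.Sitemap built from a urlset yields one dict per <url> element. A small usage sketch:

from scrapy.utils.sitemap import Sitemap

body = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/page</loc>
    <lastmod>2024-01-15</lastmod>
  </url>
</urlset>"""

s = Sitemap(body)
assert s.type == 'urlset'
for entry in s:
    # entry is {'loc': 'https://example.com/page', 'lastmod': '2024-01-15'}
    print(entry['loc'], entry.get('lastmod'))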
Example no. 9
 def test_38(self):
     empty_iterator = []
     result1 = list(iterloc(empty_iterator))
     result2 = list(iterloc(empty_iterator, alt=True))
     self.assertEqual(result1, [])
     self.assertEqual(result2, [])