Example 1
def __getitem__(self, key):
    value = self._domain_metadata[key]
    # Set-valued fields are persisted as plain lists; restore them to sets.
    for k, v in six.iteritems(value):
        if k in self._set_fields:
            value[k] = set(v)
    # Rebuild the robots.txt parser from the stored URL and raw body.
    if 'rp_url' in value and 'rp_body' in value:
        value['_rp'] = robotparser.RobotFileParser(value['rp_url'])
        value['_rp'].parse(value['rp_body'].splitlines())
    return value
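
Example 1 rebuilds the parser lazily from the stored 'rp_url' and 'rp_body' keys. A minimal standalone sketch of that rebuild-and-query round trip, using only the stdlib robotparser and the key names from the example above (the sample record and URLs are illustrative):

try:
    from urllib import robotparser   # Python 3
except ImportError:
    import robotparser               # Python 2

# Hypothetical metadata record shaped like the one __getitem__ reads back.
value = {
    'rp_url': 'http://example.com/robots.txt',
    'rp_body': 'User-agent: *\nDisallow: /private/',
}
rp = robotparser.RobotFileParser(value['rp_url'])
rp.parse(value['rp_body'].splitlines())
print(rp.can_fetch('*', 'http://example.com/private/page'))  # False
print(rp.can_fetch('*', 'http://example.com/index.html'))    # True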
Example 2
def _parse_robots(self, response):
    rp = robotparser.RobotFileParser(response.url)
    body = ''
    if hasattr(response, 'body_as_unicode'):
        body = response.body_as_unicode()
    else:  # last effort try
        try:
            body = response.body.decode('utf-8')
        except UnicodeDecodeError:
            # If we found garbage, disregard it,
            # but keep the lookup cached (in self._parsers).
            # Running rp.parse() will set rp state from
            # 'disallow all' to 'allow any'.
            pass
    rp.parse(body.splitlines())
    self._parsers[urlparse_cached(response).netloc] = rp
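
Example 2 only fills the self._parsers cache; it is consulted elsewhere before a page is downloaded. A hedged sketch of that lookup, assuming a dict keyed by netloc holding the RobotFileParser instances built above (the function name and the allow-by-default policy are assumptions):

def allowed(parsers, useragent, url, netloc):
    # Look up the parser that _parse_robots() cached for this host.
    rp = parsers.get(netloc)
    if rp is None:
        # robots.txt not fetched (or not parsed) yet: let the request through.
        return True
    return rp.can_fetch(useragent, url)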
Example 3
    def _process_robots_txt(self, response, domain):
        """Handle robots.txt successful response.

        The main logic is to create a RobotFileParser instance if the robots.txt
        content can be decoded and parsed, and to save it as a property of the
        domain so it can be reused later when deciding whether a given domain
        page needs to be scheduled.
        """
        netloc = response.meta[b'netloc']
        domain.setdefault('queued_pages', 0)
        try:
            body = response.body.decode('utf-8')  # TODO: use encoding from response.meta.get(b'encoding', 'utf-8')
        except UnicodeDecodeError:
            self.logger.warning("Error during robots.txt decoding at %s", response.url)
            update_domain_with_parser_data(domain, parser=None, url=response.url)
            self._schedule_home_page(netloc, domain)
            return
        robots_lines = body.splitlines()
        parser = robotparser.RobotFileParser(response.url)
        try:
            if not is_valid_robotstxt(robots_lines):
                raise SyntaxError("Robots.txt isn't valid")
            parser.parse(robots_lines)
        except Exception:
            self.logger.exception("Error during robots.txt parsing at %s", response.url)
            update_domain_with_parser_data(domain, parser=None, url=response.url)
            self._schedule_home_page(netloc, domain)
            return
        requests = set()
        for line in robots_lines:
            if line.startswith("Sitemap:"):
                _, _, url = line.partition(':')
                sitemap_url = urljoin(response.url, url.strip())
                meta = {b'seed': domain.get('seed'), b'sitemap': True,
                        b'scrapy_meta': {b'download_maxsize': SITEMAP_DOWNLOAD_MAXSIZE}}
                requests.add(self.create_request(sitemap_url, meta=meta, headers=DEFAULT_HEADERS))
        self.refresh_states(requests)
        # schedule sitemap requests
        self._schedule_requests(requests, domain, score=0.9)
        if not requests:
            self.logger.debug("Sitemap in robots.txt wasn't found for url %s", response.url)
        update_domain_with_parser_data(domain, parser=parser, url=response.url, body=body)
        # also always schedule home page regardless of scheduled sitemaps
        self._schedule_home_page(netloc, domain)
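
update_domain_with_parser_data() itself is not shown in Example 3. Judging from the 'rp_url', 'rp_body' and '_rp' keys that Example 1 reads back, it plausibly stores something like the sketch below; this is a guess based on those keys, not the project's actual helper:

def update_domain_with_parser_data(domain, parser, url, body=None):
    # Keep the live parser for immediate checks, plus the raw URL and body
    # so a fresh RobotFileParser can be rebuilt after deserialization,
    # mirroring the __getitem__ in Example 1.
    domain['_rp'] = parser
    domain['rp_url'] = url
    domain['rp_body'] = body if body is not None else ''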
Example 4
def check_robotstxt(url, useCache, cache_dir, userAgent=None):
    scheme, netloc, url_path, query, fragment = urlparse.urlsplit(url)
    robotstxt_url = urlparse.urlunsplit((
        scheme,
        netloc,
        '/robots.txt',
        '',
        '',
    ))

    key = generate_key(robotstxt_url)

    robots_parser = robotparser.RobotFileParser()
    cached_content = cache_get(cache_dir, key) if useCache else ''
    threshold = (time.time() - 86400 * 7)  # re-fetch cached robots.txt after 7 days

    if not cached_content or cache_info(cache_dir, key) < threshold:
        try:
            cached_content = fetch(robotstxt_url, userAgent=userAgent)
            if useCache:
                cache_set(cache_dir, key, cached_content)
        except HTTPError as he:
            # this block mimics the behaviour in the robotparser.read() method
            if he.code in (401, 403):
                robots_parser.disallow_all = True
            elif he.code >= 400:
                robots_parser.allow_all = True
            else:
                raise he
            cached_content = ''

    try:
        # Decode bytes content to text; if it is already a str, the
        # TypeError is swallowed and the content is used as-is.
        cached_content = str(cached_content, encoding='utf8')
    except TypeError:
        pass
    robots_parser.parse(cached_content.split('\n'))
    # Fall back to urllib's default User-agent header if none was supplied.
    default_useragent = None
    for k, v in OpenerDirector().addheaders:
        if k == "User-agent":
            default_useragent = v
            break

    return robots_parser.can_fetch(userAgent or default_useragent, url)
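
A possible call to Example 4's helper, with an illustrative cache directory and User-Agent string (both values are assumptions):

allowed = check_robotstxt(
    'http://example.com/some/page.html',
    useCache=True,
    cache_dir='/tmp/robots-cache',
    userAgent='MyCrawler/1.0',
)
print(allowed)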
Example 5
    def _parse_robots(self, response, netloc):
        rp = robotparser.RobotFileParser(response.url)
        body = ''
        if hasattr(response, 'text'):
            body = response.text
        else:  # last effort try
            try:
                body = response.body.decode('utf-8')
            except UnicodeDecodeError:
                # If we found garbage, disregard it,
                # but keep the lookup cached (in self._parsers).
                # Running rp.parse() will set rp state from
                # 'disallow all' to 'allow any'.
                pass
        # stdlib's robotparser expects native 'str';
        # with unicode input, decoding non-ASCII bytes fails in Python 2
        rp.parse(to_native_str(body).splitlines())

        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)
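
Example 5 replaces a pending Deferred with the parsed RobotFileParser and fires it. A hedged sketch of how the waiting side of such a cache is wired up with Twisted; only Deferred and RobotFileParser are real APIs here, the dict and URLs are assumptions:

from twisted.internet.defer import Deferred

parsers = {}
# Placeholder while robots.txt for this host is still being downloaded.
parsers['example.com'] = Deferred()
# Whoever needs a verdict attaches a callback and waits for the parser.
parsers['example.com'].addCallback(
    lambda rp: rp.can_fetch('mybot', 'http://example.com/page'))
# Later, _parse_robots(response, 'example.com') swaps in the parsed
# RobotFileParser and calls rp_dfd.callback(rp), which runs the queued
# can_fetch check above.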
Example 6
def _parse_robots(self, response):
    rp = robotparser.RobotFileParser(response.url)
    rp.parse(response.body.splitlines())
    self._parsers[urlparse_cached(response).netloc] = rp