def _download_request(self, request, info):
    """Issue the download of *request* (HEAD and PUT requests sent to
    Amazon S3).

    When a dedicated S3 spider is configured (``self.s3_spider``), the
    request is scheduled through it instead of downloaded directly;
    otherwise it is downloaded through the spider currently associated
    with *info*.
    """
    upload_spider = self.s3_spider
    if not upload_spider:
        return scrapyengine.download(request, info.spider)
    # schedule (rather than download) so the S3 spider's domain is
    # auto-opened by the engine
    return scrapyengine.schedule(request, upload_spider)
def robot_parser(self, request, spider):
    """Return the robots.txt parser for *request*'s netloc.

    Returns ``None`` while the robots.txt for that host has not yet been
    fetched and parsed. On first sight of a netloc, a ``None`` placeholder
    is stored and an asynchronous download of ``<scheme>://<netloc>/robots.txt``
    is started; ``self._parse_robots`` is attached as the callback, which
    presumably replaces the placeholder with the parsed result — confirm
    against ``_parse_robots``.
    """
    url = urlparse_cached(request)
    netloc = url.netloc
    if netloc not in self._parsers:
        # Placeholder entry: marks the fetch as in-flight so subsequent
        # requests for the same netloc do not trigger duplicate downloads.
        self._parsers[netloc] = None
        robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
        robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
        dfd = scrapyengine.download(robotsreq, spider)
        dfd.addCallback(self._parse_robots)
        # NOTE(review): records which netlocs this spider touched —
        # presumably for per-spider cleanup elsewhere; confirm.
        self._spider_netlocs[spider].add(netloc)
    return self._parsers[netloc]
def download(self, request, info):
    """Request the download of a media item.

    Raises the request's priority to ``self.DOWNLOAD_PRIORITY`` and hands
    it to the engine's downloader using *info*'s spider. Called only on a
    cache miss for the request's fingerprint; overriding should not be
    necessary.

    Note: mutates ``request.priority`` in place.
    """
    # media downloads get elevated priority before being handed off
    request.priority = self.DOWNLOAD_PRIORITY
    target_spider = info.spider
    return scrapyengine.download(request, target_spider)