def _get_spider_requests(*args):
    """Collect requests and spiders from the given arguments. Returns a dict of
    spider -> list of requests.
    """
    spider_requests = defaultdict(list)
    for arg in args:
        if isinstance(arg, tuple):
            # explicit (request, spider) pair: append to that spider's list
            request, spider = arg
            spider_requests[spider] += [request]
        elif isinstance(arg, Request):
            spider = spiders.fromurl(arg.url) or BaseSpider('default')
            if spider:
                spider_requests[spider] += [arg]
            else:
                log.msg('Could not find spider for request: %s' % arg, log.ERROR)
        elif isinstance(arg, BaseSpider):
            # a bare spider contributes its own start requests
            spider_requests[arg] += arg.start_requests()
        elif is_url(arg):
            spider = spiders.fromurl(arg) or BaseSpider('default')
            if spider:
                for req in arg_to_iter(spider.make_requests_from_url(arg)):
                    spider_requests[spider] += [req]
            else:
                log.msg('Could not find spider for url: %s' % arg, log.ERROR)
        elif isinstance(arg, basestring):
            # non-URL string: treat it as a spider domain name
            spider = spiders.fromdomain(arg)
            if spider:
                spider_requests[spider] += spider.start_requests()
            else:
                log.msg('Could not find spider for domain: %s' % arg, log.ERROR)
        else:
            raise TypeError("Unsupported argument: %r" % arg)
    return spider_requests
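# Illustrative sketch (not part of the original module) of how mixed argument
# types flow through _get_spider_requests(). The URLs and Request below are
# hypothetical; which spider each resolves to depends on the project's spider
# manager:
#
#     reqs = _get_spider_requests(
#         'http://example.com/page',         # bare URL -> spiders.fromurl()
#         'example.com',                     # domain string -> spiders.fromdomain()
#         Request('http://example.com/x'),   # Request -> fromurl() on its URL
#     )
#     for spider, requests in reqs.items():
#         print(spider.name, len(requests))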
    def _split_urls_and_names(self, args):
        urls = []
        names = []
        for arg in args:
            if is_url(arg):
                urls.append(arg)
            else:
                names.append(arg)
        return urls, names
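    # Example: the helper simply partitions positional arguments on is_url(),
    # e.g. _split_urls_and_names(['http://example.com/', 'myspider'])
    # returns (['http://example.com/'], ['myspider']).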
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            return False

        responses = []  # to collect downloaded responses
        request = Request(args[0], callback=responses.append)

        if opts.spider:
            try:
                spider = spiders.create(opts.spider)
            except KeyError:
                log.msg('Unable to find spider: %s' % opts.spider, log.ERROR)
                return
        else:
            spider = spiders.create_for_request(request)

        scrapymanager.configure()
        scrapymanager.queue.append_request(request, spider)
        scrapymanager.start()

        if not responses:
            log.msg('No response returned', log.ERROR, spider=spider)
            return

        # now process the response:
        #   - if callbacks are defined, call each one and print its results
        #   - if the --rules option is given, search for a matching spider rule
        #   - by default, print results using the spider's 'parse' callback
        response = responses[0]
        if self.callbacks:
            # apply each callback
            for callback in self.callbacks:
                items, links = self.run_callback(spider, response, callback,
                                                 args, opts)
                self.print_results(items, links, callback, opts)
        elif opts.rules:
            # search for a matching spider rule
            if hasattr(spider, 'rules') and spider.rules:
                items, links = [], []
                for rule in spider.rules:
                    if rule.link_extractor.matches(response.url) and rule.callback:
                        items, links = self.run_callback(spider, response,
                                                         rule.callback, args, opts)
                        self.print_results(items, links, rule.callback, opts)
                        # first matching rule wins, so stop after it fires
                        break
            else:
                log.msg('No rules found for spider "%s", please specify a '
                        'callback for parsing' % spider.name, log.ERROR)
        else:
            # default callback 'parse'
            items, links = self.run_callback(spider, response, 'parse',
                                             args, opts)
            self.print_results(items, links, 'parse', opts)
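    # For the opts.rules branch above, a spider would typically declare
    # CrawlSpider rules like the following (illustrative sketch; the URL
    # pattern and callback name are hypothetical). The first rule whose
    # link_extractor matches response.url wins, mirroring the break in the
    # loop above:
    #
    #     from scrapy.contrib.spiders import CrawlSpider, Rule
    #     from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    #
    #     class ExampleSpider(CrawlSpider):
    #         name = 'example.com'
    #         rules = (
    #             Rule(SgmlLinkExtractor(allow=r'/item/'), callback='parse_item'),
    #         )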
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()

        response, spider = self.get_response_and_spider(args[0], opts)
        if not response:
            return

        # pick the callback: an explicit --callback wins, then a matching
        # spider rule, and finally the spider's default 'parse'
        callback = None
        if opts.callback:
            callback = opts.callback
        elif opts.rules:
            callback = self.get_callback_from_rules(spider, response)

        items, requests = self.run_callback(spider, response,
                                            callback or 'parse', opts)
        self.print_results(items, requests, callback, opts)
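    # Typical invocations of this command (sketch; the exact entry point and
    # flag spellings depend on the Scrapy version this file belongs to):
    #
    #     scrapy parse http://example.com/item/1 --callback=parse_item
    #     scrapy parse http://example.com/item/1 --rules
    #
    # With neither option, the spider's default 'parse' callback is used.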
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()

        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)

        spider = None
        if opts.spider:
            try:
                spider = self.crawler.spiders.create(opts.spider)
            except KeyError:
                log.msg('Could not find spider: %s' % opts.spider, log.ERROR)

        self.crawler.queue.append_request(request, spider,
                                          default_spider=BaseSpider('default'))
        self.crawler.start()
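    # Design notes (sketch): dont_filter=True bypasses the duplicate-request
    # filter so the URL is fetched even if it was already seen, and passing
    # default_spider=BaseSpider('default') lets the queue fall back to a bare
    # spider when --spider is not given and none matches the URL, so the
    # download still runs with default settings.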