# Imports assume the legacy (pre-1.0) Scrapy layout; exact module paths may
# differ between old releases.
from collections import defaultdict

from twisted.internet import reactor, threads

from scrapy import log
from scrapy.core.engine import scrapyengine
from scrapy.http import Request, TextResponse
from scrapy.item import BaseItem
from scrapy.selector import XmlXPathSelector, HtmlXPathSelector
from scrapy.spider import spiders, BaseSpider
from scrapy.utils.fetch import fetch
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.response import open_in_browser
from scrapy.utils.url import is_url, parse_url


def _get_spider_requests(*args):
    """Collect requests and spiders from the given arguments. Returns a dict
    of spider -> list of requests.
    """
    spider_requests = defaultdict(list)
    for arg in args:
        if isinstance(arg, tuple):
            request, spider = arg
            # append instead of overwriting: the docstring promises a dict of
            # spider -> *list* of requests
            spider_requests[spider] += [request]
        elif isinstance(arg, Request):
            # the BaseSpider('default') fallback always yields a spider, so
            # the old "could not find spider" error branch was unreachable
            spider = spiders.fromurl(arg.url) or BaseSpider('default')
            spider_requests[spider] += [arg]
        elif isinstance(arg, BaseSpider):
            spider_requests[arg] += arg.start_requests()
        elif is_url(arg):
            # same fallback as above, so no error branch is needed here either
            spider = spiders.fromurl(arg) or BaseSpider('default')
            for req in arg_to_iter(spider.make_requests_from_url(arg)):
                spider_requests[spider] += [req]
        elif isinstance(arg, basestring):
            spider = spiders.fromdomain(arg)
            if spider:
                spider_requests[spider] += spider.start_requests()
            else:
                log.msg('Could not find spider for domain: %s' % arg, log.ERROR)
        else:
            raise TypeError("Unsupported argument: %r" % arg)
    return spider_requests
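# Hedged usage sketch for _get_spider_requests(): mixed argument types end up
# grouped under their matching spider. The URLs/domain and the example
# function itself are illustrative placeholders, not part of the original
# module.
def _example_get_spider_requests():
    grouped = _get_spider_requests(
        'http://example.com/page.html',            # bare URL string
        Request('http://example.com/other.html'),  # ready-made Request
        'example.com',                             # spider domain name
    )
    for spider, requests in grouped.items():
        print spider.domain_name, [r.url for r in requests]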
def run(self, args, opts):
    if not args:
        print "A URL is required"
        return
    for response in fetch(args):
        spider = spiders.fromurl(response.url)
        if not spider:
            log.msg('Cannot find spider for "%s"' % response.url)
            continue
        if self.callbacks:
            # explicit callbacks given on the command line take precedence
            for callback in self.callbacks:
                items, links = self.run_callback(spider, response, callback, args, opts)
                self.print_results(items, links, callback, opts)
        elif opts.rules:
            # otherwise fall back to the first CrawlSpider-style rule whose
            # link extractor matches the response URL (see the sketch below)
            rules = getattr(spider, "rules", None)
            if rules:
                items, links = [], []
                for rule in rules:
                    if rule.callback and rule.link_extractor.matches(response.url):
                        items, links = self.run_callback(spider, response, rule.callback, args, opts)
                        self.print_results(items, links, rule.callback, opts)
                        break
            else:
                log.msg('No rules found for spider "%s", please specify a callback for parsing' % spider.domain_name)
                continue
        else:
            items, links = self.run_callback(spider, response, "parse", args, opts)
            self.print_results(items, links, "parse", opts)
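# Hedged illustration of the --rules branch above: a CrawlSpider-style spider
# whose first rule matching the response URL supplies the callback. The
# import paths assume the legacy contrib layout, and the spider below is a
# placeholder for illustration, not part of the original module.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class _ExampleRulesSpider(CrawlSpider):
    domain_name = 'example.com'
    rules = (
        # run() would pick parse_item for any /item/ URL it fetched
        Rule(SgmlLinkExtractor(allow=r'/item/'), callback='parse_item'),
    )

    def parse_item(self, response):
        return []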
def fetch(self, request_or_url, print_help=False):
    if isinstance(request_or_url, Request):
        request = request_or_url
        url = request.url
    else:
        url = parse_url(request_or_url)
        request = Request(url)
    spider = spiders.fromurl(url) or BaseSpider('default')
    print "Fetching %s..." % request
    # schedule the request on the engine from the shell thread and block
    # until the response arrives
    response = threads.blockingCallFromThread(reactor, scrapyengine.schedule,
                                              request, spider)
    if response:
        self.populate_vars(url, response, request)
        if print_help:
            self.print_help()
        else:
            print "Done - use shelp() to see available objects"
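# Hedged usage sketch for fetch(): both call forms populate the shell
# namespace via populate_vars(). `shell` names a hypothetical instance of
# this class with the engine already running.
def _example_fetch(shell):
    shell.fetch('http://example.com/')               # from a plain URL
    shell.fetch(Request('http://example.com/page'))  # from a prepared Request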
def run_callback(self, spider, response, callback, args, opts):
    # prefer the spider resolved by the caller; only fall back to a URL
    # lookup when none was passed in (the old code shadowed the parameter
    # unconditionally, discarding the caller's spider)
    spider = spider or spiders.fromurl(response.url)
    if not spider:
        log.msg("Cannot find spider for url: %s" % response.url, level=log.ERROR)
        return (), ()
    if callback:
        callback_fcn = callback if callable(callback) else getattr(spider, callback, None)
        if not callback_fcn:
            log.msg("Cannot find callback %s in %s spider" % (callback, spider.domain_name))
            return (), ()
        # materialize the callback output once: spider callbacks commonly
        # return generators, and iterating one twice would leave the second
        # list comprehension empty
        result = list(callback_fcn(response) or [])
        links = [i for i in result if isinstance(i, Request)]
        items = [self.pipeline_process(i, spider, opts)
                 for i in result if isinstance(i, BaseItem)]
        return items, links
    return (), ()
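# Hedged sketch of consuming run_callback(): it splits a callback's mixed
# output into pipeline-processed items and follow-up links. `cmd`, `spider`,
# `response` and `opts` name hypothetical objects for illustration.
def _example_run_callback(cmd, spider, response, opts):
    items, links = cmd.run_callback(spider, response, 'parse', [], opts)
    print "%d items, %d links" % (len(items), len(links))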
def populate_vars(self, url=None, response=None, request=None):
    item = self.item_class()
    self.vars['item'] = item
    if url:
        if isinstance(response, TextResponse):
            self.vars['xxs'] = XmlXPathSelector(response)
            self.vars['hxs'] = HtmlXPathSelector(response)
        self.vars['url'] = url
        self.vars['response'] = response
        self.vars['request'] = request
        self.vars['spider'] = spiders.fromurl(url)
    if not self.nofetch:
        self.vars['fetch'] = self.fetch
    self.vars['view'] = open_in_browser
    self.vars['shelp'] = self.print_help
    if self.update_vars:
        self.update_vars(self.vars)
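# Hedged sketch of a shell session once populate_vars() has run; hxs/xxs only
# exist for text responses. `shell` names a hypothetical instance of this
# class.
def _example_shell_session(shell):
    shell.fetch('http://example.com/')
    hxs = shell.vars['hxs']
    print hxs.select('//title/text()').extract()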