Beispiel #1
0
def _get_spider_requests(*args):
    """Collect requests and spiders from the given arguments. Returns a dict of
    spider -> list of requests
    """
    spider_requests = defaultdict(list)
    for arg in args:
        if isinstance(arg, tuple):
            request, spider = arg
            spider_requests[spider] = request
        elif isinstance(arg, Request):
            spider = spiders.fromurl(arg.url) or BaseSpider('default')
            if spider:
                spider_requests[spider] += [arg]
            else:
                log.msg('Could not find spider for request: %s' % arg, log.ERROR)
        elif isinstance(arg, BaseSpider):
            spider_requests[arg] += arg.start_requests()
        elif is_url(arg):
            spider = spiders.fromurl(arg) or BaseSpider('default')
            if spider:
                for req in arg_to_iter(spider.make_requests_from_url(arg)):
                    spider_requests[spider] += [req]
            else:
                log.msg('Could not find spider for url: %s' % arg, log.ERROR)
        elif isinstance(arg, basestring):
            spider = spiders.fromdomain(arg)
            if spider:
                spider_requests[spider] += spider.start_requests()
            else:
                log.msg('Could not find spider for domain: %s' % arg, log.ERROR)
        else:
            raise TypeError("Unsupported argument: %r" % arg)
    return spider_requests
Beispiel #2
0
    def run(self, args, opts):
        """Fetch the given arguments and print what the spider extracts.

        For every response yielded by ``fetch(args)`` the spider is looked up
        from the response URL, and the callback to run is chosen as follows:

        * every callback in ``self.callbacks``, when that is non-empty;
        * otherwise, when ``opts.rules`` is set, the callback of the first
          spider rule that has one and whose link extractor matches the
          response URL;
        * otherwise the spider's ``"parse"`` callback.

        Results are reported through ``self.print_results``. Returns None.
        """
        if not args:
            # Call form with a single argument: prints the same text on
            # Python 2 and is also valid syntax on Python 3.
            print("An URL is required")
            return

        for response in fetch(args):
            spider = spiders.fromurl(response.url)
            if not spider:
                log.msg('Cannot find spider for "%s"' % response.url)
                continue

            if self.callbacks:
                for callback in self.callbacks:
                    items, links = self.run_callback(spider, response, callback, args, opts)
                    self.print_results(items, links, callback, opts)

            elif opts.rules:
                rules = getattr(spider, "rules", None)
                if not rules:
                    log.msg(
                        'No rules found for spider "%s", please specify a callback for parsing' % spider.domain_name
                    )
                    continue

                # Only the first matching rule with a callback is used.
                for rule in rules:
                    if rule.callback and rule.link_extractor.matches(response.url):
                        items, links = self.run_callback(spider, response, rule.callback, args, opts)
                        self.print_results(items, links, rule.callback, opts)
                        break

            else:
                items, links = self.run_callback(spider, response, "parse", args, opts)
                self.print_results(items, links, "parse", opts)
Beispiel #3
0
 def fetch(self, request_or_url, print_help=False):
     """Fetch a request or URL and expose the result through ``populate_vars``.

     ``request_or_url`` is either a ``Request`` (used as is) or a value that
     is passed through ``parse_url()`` and wrapped in a new ``Request``.
     The spider is looked up from the URL, falling back to
     ``BaseSpider('default')``.

     When a truthy response comes back, ``self.populate_vars`` is called
     with it and then either the help text is printed (``print_help=True``)
     or a short hint is shown. A falsy response is ignored silently.
     """
     if isinstance(request_or_url, Request):
         request = request_or_url
         url = request.url
     else:
         url = parse_url(request_or_url)
         request = Request(url)
     spider = spiders.fromurl(url) or BaseSpider('default')
     # Call form with a single argument: same output on Python 2, and valid
     # syntax on Python 3 as well.
     print("Fetching %s..." % request)
     # NOTE(review): presumably Twisted's blockingCallFromThread, i.e. this
     # blocks until the scheduled request completes -- confirm against the
     # file's imports.
     response = threads.blockingCallFromThread(
         reactor, scrapyengine.schedule, request, spider)
     if response:
         self.populate_vars(url, response, request)
         if print_help:
             self.print_help()
         else:
             print("Done - use shelp() to see available objects")
Beispiel #4
0
    def run_callback(self, spider, response, callback, args, opts):
        """Run ``callback`` on ``response`` and split its output.

        ``spider`` is the spider to use; when it is falsy the spider is
        looked up from ``response.url`` instead. ``callback`` is either a
        callable or the name of an attribute on the spider.

        Returns ``(items, links)``: ``items`` are the ``BaseItem`` results
        after ``self.pipeline_process``, ``links`` are the ``Request``
        results. Returns ``((), ())`` when no spider or callback can be
        resolved, or when ``callback`` is falsy.
        """
        # Honour the spider handed in by the caller; only fall back to the
        # URL lookup when none was supplied.
        if not spider:
            spider = spiders.fromurl(response.url)
        if not spider:
            log.msg("Cannot find spider for url: %s" % response.url, level=log.ERROR)
            return (), ()

        if not callback:
            return (), ()

        callback_fcn = callback if callable(callback) else getattr(spider, callback, None)
        if not callback_fcn:
            log.msg("Cannot find callback %s in %s spider" % (callback, spider.domain_name))
            return (), ()

        # Materialise the output once: a generator callback would be
        # exhausted by the first pass and leave ``items`` empty, and a
        # callback that returns nothing must not raise.
        result = callback_fcn(response)
        results = [] if result is None else list(result)
        links = [i for i in results if isinstance(i, Request)]
        items = [self.pipeline_process(i, spider, opts) for i in results if isinstance(i, BaseItem)]
        return items, links
Beispiel #5
0
    def populate_vars(self, url=None, response=None, request=None):
        """Refresh the entries stored in ``self.vars``.

        A new ``self.item_class()`` instance is always stored under
        ``'item'``. When ``url`` is truthy, the url, response, request and
        the spider found for the url are stored too, plus XML/HTML XPath
        selectors when the response is a ``TextResponse``. ``'fetch'`` is
        only exposed when ``self.nofetch`` is falsy. Finally the optional
        ``self.update_vars`` hook is called with the variables.
        """
        fresh_item = self.item_class()
        namespace = self.vars
        namespace['item'] = fresh_item

        if url:
            if isinstance(response, TextResponse):
                # Selectors are only built for text responses.
                selector_classes = (
                    ('xxs', XmlXPathSelector),
                    ('hxs', HtmlXPathSelector),
                )
                for key, selector_cls in selector_classes:
                    namespace[key] = selector_cls(response)
            fetched = (
                ('url', url),
                ('response', response),
                ('request', request),
            )
            for key, value in fetched:
                namespace[key] = value
            namespace['spider'] = spiders.fromurl(url)

        if not self.nofetch:
            namespace['fetch'] = self.fetch
        namespace['view'] = open_in_browser
        namespace['shelp'] = self.print_help

        # Optional hook that receives the variables last.
        hook = self.update_vars
        if hook:
            hook(namespace)