Example #1
def _get_spider_requests(*args):
    """Collect requests and spiders from the given arguments. Returns a dict of
    spider -> list of requests
    """
    spider_requests = defaultdict(list)
    for arg in args:
        if isinstance(arg, tuple):
            request, spider = arg
            spider_requests[spider] += [request]
        elif isinstance(arg, Request):
            spider = spiders.fromurl(arg.url) or BaseSpider('default')
            if spider:
                spider_requests[spider] += [arg]
            else:
                log.msg('Could not find spider for request: %s' % arg, log.ERROR)
        elif isinstance(arg, BaseSpider):
            spider_requests[arg] += arg.start_requests()
        elif is_url(arg):
            spider = spiders.fromurl(arg) or BaseSpider('default')
            if spider:
                for req in arg_to_iter(spider.make_requests_from_url(arg)):
                    spider_requests[spider] += [req]
            else:
                log.msg('Could not find spider for url: %s' % arg, log.ERROR)
        elif isinstance(arg, basestring):
            spider = spiders.fromdomain(arg)
            if spider:
                spider_requests[spider] += spider.start_requests()
            else:
                log.msg('Could not find spider for domain: %s' % arg, log.ERROR)
        else:
            raise TypeError("Unsupported argument: %r" % arg)
    return spider_requests
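A hypothetical usage sketch (not part of the original module) showing how the dispatcher above groups mixed arguments per spider; the URL and domain are placeholders, and spiders, log and the spider objects come from the surrounding Scrapy code:

# Hypothetical call - a URL and a plain domain name mixed in one call
requests_per_spider = _get_spider_requests(
    'http://example.com/some/page',   # routed through the is_url() branch
    'example.com',                    # plain string -> looked up by domain
)
for spider, requests in requests_per_spider.items():
    log.msg('%s: %d requests' % (spider.name, len(requests)))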
Example #2
 def _split_urls_and_names(self, args):
     urls = []
     names = []
     for arg in args:
         if is_url(arg):
             urls.append(arg)
         else:
             names.append(arg)
     return urls, names
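The split above leans entirely on is_url(). As a rough sketch of the behaviour being assumed here (the real helper lives in scrapy.utils.url and may differ in detail), anything carrying an explicit scheme counts as a URL and everything else falls through as a name:

# Rough sketch only - an assumption about what scrapy.utils.url.is_url checks
def is_url(text):
    return text.partition('://')[0] in ('http', 'https', 'file')

is_url('http://example.com/')   # True  -> collected into urls
is_url('myspider')              # False -> collected into names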
Example #3
    def run(self, args, opts):
        if not len(args) == 1 or not is_url(args[0]):
            return False

        responses = []  # to collect downloaded responses
        request = Request(args[0], callback=responses.append)

        if opts.spider:
            try:
                spider = spiders.create(opts.spider)
            except KeyError:
                log.msg("Unable to find spider: %s" % opts.spider, log.ERROR)
                return
        else:
            spider = spiders.create_for_request(request)

        scrapymanager.configure()
        scrapymanager.queue.append_request(request, spider)
        scrapymanager.start()

        if not responses:
            log.msg("No response returned", log.ERROR, spider=spider)
            return

        # now process the response:
        #   - if callbacks were given, call each one and print its results
        #   - if the --rules option was given, search for a matching spider rule
        #   - otherwise, fall back to the spider's default 'parse' callback
        response = responses[0]

        if self.callbacks:
            # apply each callback
            for callback in self.callbacks:
                items, links = self.run_callback(spider, response, callback, args, opts)
                self.print_results(items, links, callback, opts)
        elif opts.rules:
            # search for matching spider's rule
            if hasattr(spider, "rules") and spider.rules:
                items, links = [], []
                for rule in spider.rules:
                    if rule.link_extractor.matches(response.url) and rule.callback:
                        items, links = self.run_callback(spider, response, rule.callback, args, opts)
                        self.print_results(items, links, rule.callback, opts)
                        # stop after the first matching rule
                        break
            else:
                log.msg('No rules found for spider "%s", please specify '
                        'a callback for parsing' % spider.name, log.ERROR)
        else:
            # default callback 'parse'
            items, links = self.run_callback(spider, response, "parse", args, opts)
            self.print_results(items, links, "parse", opts)
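The opts.rules branch above only makes sense for CrawlSpider-style spiders, whose rules tuple pairs a link extractor with a callback; that is what rule.link_extractor.matches() and rule.callback are read from. A minimal sketch of such a spider, using the old contrib import paths as an assumption about the era of this code:

# Sketch of the kind of spider the --rules option expects (assumed imports)
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class ExampleSpider(CrawlSpider):
    name = 'example'
    start_urls = ['http://example.com/']
    rules = (
        # the first rule whose extractor matches response.url wins in the loop above
        Rule(SgmlLinkExtractor(allow=r'/items/'), callback='parse_item'),
    )

    def parse_item(self, response):
        pass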
Example #4
 def run(self, args, opts):
     if not len(args) == 1 or not is_url(args[0]):
         raise UsageError()
     response, spider = self.get_response_and_spider(args[0], opts)
     if not response:
         return
     callback = None
     if opts.callback:
         callback = opts.callback
     elif opts.rules:
         callback = self.get_callback_from_rules(spider, response)
     items, requests = self.run_callback(spider, response, callback or 'parse', \
         opts)
     self.print_results(items, requests, callback, opts)
Example #5
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)

        spider = None
        if opts.spider:
            try:
                spider = self.crawler.spiders.create(opts.spider)
            except KeyError:
                log.msg("Could not find spider: %s" % opts.spider, log.ERROR)

        self.crawler.queue.append_request(request, spider, \
            default_spider=BaseSpider('default'))
        self.crawler.start()
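For context on the callback wiring in this last example: the lambda just means the downloaded Response object is handed straight to the printer once the request completes. A minimal sketch of the same idea, with a hypothetical printer standing in for the command's _print_response:

# Hypothetical stand-in for _print_response, only to illustrate the wiring
def print_response(response):
    log.msg('%s (%d)' % (response.url, response.status))

request = Request('http://example.com/', callback=print_response,
                  dont_filter=True)  # dont_filter skips the duplicate filter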