Example #1
    def run(self, args, opts):
        if opts.list:
            self._list_templates()
            return

        if opts.dump:
            template_file = self._find_template(opts.template)
            if template_file:
                with open(template_file, 'r') as template:
                    print template.read()
            return

        if len(args) != 2:
            return False

        name = args[0]
        domain = args[1]

        module = sanitize_module_name(name)

        # halt if a spider with this name already exists, unless --force is given
        try:
            spider = spiders.create(name)
        except KeyError:
            pass
        else:
            if not opts.force:
                print "Spider '%s' already exists in module:" % name
                print "  %s" % spider.__module__
                sys.exit(1)

        template_file = self._find_template(opts.template)
        if template_file:
            self._genspider(module, name, domain, opts.template, template_file)
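
The _find_template helper used above is not shown. A minimal sketch of what it could look like, assuming spider templates are stored as <name>.tmpl files under a templates_dir attribute on the command (the attribute name and file layout are assumptions, not necessarily the real implementation):

    def _find_template(self, template):
        # assumes "import os" at module level and a self.templates_dir attribute
        template_file = os.path.join(self.templates_dir, '%s.tmpl' % template)
        if os.path.exists(template_file):
            return template_file
        print "Unable to find template: %s" % template

Returning None implicitly when the file is missing matches the "if template_file:" checks in run().
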
Example #2
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            return False

        responses = []  # to collect downloaded responses
        request = Request(args[0], callback=responses.append)

        if opts.spider:
            try:
                spider = spiders.create(opts.spider)
            except KeyError:
                log.msg("Unable to find spider: %s" % opts.spider, log.ERROR)
                return
        else:
            spider = spiders.create_for_request(request)

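        # run the engine to download the request; the callback collects the response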
        scrapymanager.configure()
        scrapymanager.queue.append_request(request, spider)
        scrapymanager.start()

        if not responses:
            log.msg("No response returned", log.ERROR, spider=spider)
            return

        # now process the response:
        #   - if callbacks were given, run each one and print its results
        #   - if the --rules option was given, look for a matching spider rule
        #   - otherwise, use the spider's default 'parse' callback
        response = responses[0]

        if self.callbacks:
            # apply each callback
            for callback in self.callbacks:
                items, links = self.run_callback(spider, response, callback, args, opts)
                self.print_results(items, links, callback, opts)
        elif opts.rules:
            # search for matching spider's rule
            if hasattr(spider, "rules") and spider.rules:
                items, links = [], []
                for rule in spider.rules:
                    if rule.link_extractor.matches(response.url) and rule.callback:
                        items, links = self.run_callback(spider, response, rule.callback, args, opts)
                        self.print_results(items, links, rule.callback, opts)
                        # only the first matching rule is applied
                        break
            else:
                log.msg('No rules found for spider "%s", please specify a callback for parsing' % spider.name, log.ERROR)
        else:
            # default callback 'parse'
            items, links = self.run_callback(spider, response, "parse", args, opts)
            self.print_results(items, links, "parse", opts)
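
The run_callback helper used throughout this example is not shown either. A rough sketch of one way it could work, assuming callbacks are passed by name and return an iterable mixing scraped items with follow-up Request objects (an illustration, not the actual implementation):

    def run_callback(self, spider, response, callback_name, args, opts):
        # look up the callback by name on the spider and invoke it
        callback = getattr(spider, callback_name)
        items, links = [], []
        for result in callback(response) or []:
            # split the callback output into scraped items and follow-up requests
            if isinstance(result, Request):
                links.append(result)
            else:
                items.append(result)
        return items, links
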
Example #3
    def run(self, args, opts):
        q = ExecutionQueue()
        urls, names = self._split_urls_and_names(args)
        for name in names:
            q.append_spider_name(name)

        if opts.spider:
            try:
                spider = spiders.create(opts.spider)
                for url in urls:
                    q.append_url(url, spider)
            except KeyError:
                log.msg('Unable to find spider: %s' % opts.spider, log.ERROR)
        else:
            for name, spider_urls in self._group_urls_by_spider(urls):
                spider = spiders.create(name)
                for url in spider_urls:
                    q.append_url(url, spider)

        scrapymanager.queue = q
        scrapymanager.start()
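
The _split_urls_and_names helper is assumed here; a plausible sketch, partitioning the positional arguments with the same is_url check the other commands use (illustrative only):

    def _split_urls_and_names(self, args):
        # anything that looks like a URL is a crawl target, the rest are spider names
        urls, names = [], []
        for arg in args:
            if is_url(arg):
                urls.append(arg)
            else:
                names.append(arg)
        return urls, names
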
Example #4
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            return False
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)

        spider = None
        if opts.spider:
            try:
                spider = spiders.create(opts.spider)
            except KeyError:
                log.msg("Could not find spider: %s" % opts.spider, log.ERROR)

        scrapymanager.configure()
        # pass a plain BaseSpider as the fallback in case no specific spider applies
        scrapymanager.queue.append_request(request, spider,
            default_spider=BaseSpider('default'))
        scrapymanager.start()
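
The _print_response callback referenced in Example #4 might look roughly like the following, assuming hypothetical headers and nobody options control what is printed (the option names are assumptions, not taken from the example):

    def _print_response(self, response, opts):
        # print headers and/or body depending on the command-line options
        if opts.headers:
            print response.headers
        if not opts.nobody:
            print response.body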