    def __init__(self, settings):
        super(BotSpiderManager, self).__init__(settings)
        # Backward compatibility: when given a list (old-style call), fall back
        # to the global settings object instead
        if isinstance(settings, list):
            from scrapy.conf import settings
        descriptor = Descriptor.from_settings(settings)

        for spider_data in descriptor.list():
            domain = get_domain(spider_data['url'])
            base_cls = self._spiders[spider_data['engine']]
            cls_name = sanitize_module_name(domain).encode('utf-8')
            spider_name = self.spider_name(spider_data['engine'],
                                           domain)
            kwargs = {
                'name': spider_name,
                'start_urls': [spider_data['url']],
                'allowed_domains': [domain],
            }
            kwargs.update(spider_data.get('args', {}))
            spider_cls = self._create_spider_cls(base_cls, cls_name, kwargs)
            self._spiders[spider_name] = spider_cls
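The `_create_spider_cls` helper isn't shown in this example; a minimal sketch of what it could look like, assuming it simply builds a subclass of the engine-specific base spider with `type()` and attaches the collected keyword arguments as class attributes:

    def _create_spider_cls(self, base_cls, cls_name, kwargs):
        # Hypothetical sketch, not part of the original example: create a new
        # spider class named after the domain, inheriting from the engine's
        # base spider, with name/start_urls/allowed_domains as class attributes.
        return type(cls_name, (base_cls,), kwargs)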
Example #2
    def run(self, args, opts):
        log.start()
        if len(args) != 1:
            raise UsageError()
        url = args[0]
        log.msg('Detecting: %s' % url)
        detector = Detector()
        try:
            result = detector.detect(url)
        except MultipleSitesDetected as e:
            log.msg('Multiple sites detected: %s' % e.results)
            return
        if not result:
            log.msg('Site not detected: %s' % url, level=log.ERROR)
            return
        engine, features = result
        spider = self._find_spider(engine, features)
        if not spider:
            log.msg('Site detected as "%s %s", but no spider with this name '
                    'has been found' % (engine, features), level=log.ERROR)
            return
        log.msg('Site detected: %s %s' % (engine, features))

        if opts.add:
            descriptor = Descriptor.from_settings(self.crawler_process.settings)
            descriptor.add(url, engine, {'features': features})
            descriptor.save()
            log.msg('Saved site to %s' % descriptor.path)

        elif not opts.detect_only:
            opts.spargs.update({
                'start_urls': [url],
                'allowed_domains': [get_domain(url)],
                'features': features,
            })
            self.crawler_process.crawl(spider.name, **opts.spargs)
            self.crawler_process.start()
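The `get_domain` helper used in both examples isn't shown either; a plausible stdlib-only version, assuming it just returns the host part of a URL (Python 2, matching the `scrapy.log`/`scrapy.conf` era of this code):

from urlparse import urlparse  # urllib.parse on Python 3

def get_domain(url):
    # Hypothetical helper, not part of the original examples:
    # 'http://example.com/shop' -> 'example.com'
    return urlparse(url).netloc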