def __init__(self, settings):
    super(BotSpiderManager, self).__init__(settings)
    # Backward compatibility: older callers passed a list of spider modules
    # instead of a settings object, so fall back to the global Scrapy settings.
    if isinstance(settings, list):
        from scrapy.conf import settings
    descriptor = Descriptor.from_settings(settings)
    # Register a dynamically generated spider class for each configured site.
    for spider_data in descriptor.list():
        domain = get_domain(spider_data['url'])
        base_cls = self._spiders[spider_data['engine']]
        cls_name = sanitize_module_name(domain).encode('utf-8')
        spider_name = self.spider_name(spider_data['engine'], domain)
        kwargs = {
            'name': spider_name,
            'start_urls': [spider_data['url']],
            'allowed_domains': [domain],
        }
        kwargs.update(spider_data.get('args', {}))
        spider_cls = self._create_spider_cls(base_cls, cls_name, kwargs)
        self._spiders[spider_name] = spider_cls
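# Illustrative sketch (not part of the original module): the loop above assumes
# each entry returned by descriptor.list() is a dict carrying a 'url', an
# 'engine' key matching a registered base spider, and optional extra spider
# 'args'. The concrete values below are made up for illustration only.
#
# example_entry = {
#     'url': 'http://example.com/',
#     'engine': 'some_engine',          # hypothetical engine name
#     'args': {'features': ['foo']},    # hypothetical extra spider arguments
# }
#
# For such an entry the code above would derive:
#   domain      -> 'example.com'
#   spider_name -> self.spider_name('some_engine', 'example.com')
# and register a subclass of self._spiders['some_engine'] under that name.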
def run(self, args, opts):
    log.start()
    if len(args) != 1:
        raise UsageError()
    url = args[0]
    log.msg('Detecting: %s' % url)
    detector = Detector()
    try:
        result = detector.detect(url)
    except MultipleSitesDetected as e:
        log.msg('Multiple sites detected: %s' % e.results)
        return
    if not result:
        log.msg('Site not detected: %s' % url, level=log.ERROR)
        return
    engine, features = result
    spider = self._find_spider(engine, features)
    if not spider:
        log.msg('Site detected as "%s %s", but no spider with this name '
                'has been found' % (engine, features), level=log.ERROR)
        return
    log.msg('Site detected: %s %s' % (engine, features))
    if opts.add:
        # Persist the detected site to the descriptor instead of crawling it.
        descriptor = Descriptor.from_settings(self.crawler_process.settings)
        descriptor.add(url, engine, {'features': features})
        descriptor.save()
        log.msg('Saved site to %s' % descriptor.path)
    elif not opts.detect_only:
        # Crawl the URL right away with the spider matching the detected engine.
        opts.spargs.update({
            'start_urls': [url],
            'allowed_domains': [get_domain(url)],
            'features': features,
        })
        self.crawler_process.crawl(spider.name, **opts.spargs)
        self.crawler_process.start()
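# Usage sketch (assumption, not taken from the original source): run() has the
# shape of a Scrapy command, so invocations would look roughly like the lines
# below. The command name 'detect' and the option spellings '--add' and
# '--detect-only' are guesses inferred from opts.add, opts.detect_only and
# opts.spargs; only the behaviour (detect, optionally save to the descriptor,
# otherwise crawl) comes from the code above.
#
#   scrapy detect http://example.com/                 # detect and crawl
#   scrapy detect http://example.com/ --detect-only   # detect only, do not crawl
#   scrapy detect http://example.com/ --add           # detect and save to the descriptor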