def _configure_plugins(self, settings, spec, schemas, extractors):
    """Instantiate and set up every plugin, keyed by its name.

    Plugin classes and plugin names are both loaded from ``settings`` and
    paired by position. Each class is instantiated without arguments and
    its ``setup_bot`` is called with the arguments given here.

    Args:
        settings: passed to ``load_plugins`` / ``load_plugin_names`` and to
            each plugin's ``setup_bot``.
        spec: passed through to each plugin's ``setup_bot``.
        schemas: passed through to each plugin's ``setup_bot``.
        extractors: passed through to each plugin's ``setup_bot``.

    Returns:
        An ``IndexedDict`` mapping plugin name to the configured instance,
        filled in the order the plugins were loaded.
    """
    # Classes are loaded before names, the same order the loaders were
    # originally invoked in.
    loaded_classes = load_plugins(settings)
    loaded_names = load_plugin_names(settings)

    configured = IndexedDict()
    # zip() stops at the shorter sequence, so an unpaired class or name is
    # silently ignored.
    for cls, label in zip(loaded_classes, loaded_names):
        plugin = cls()
        plugin.setup_bot(settings, spec, schemas, extractors)
        configured[label] = plugin
    return configured
def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw):
    """Build the spider from its spec, item schemas and extractors.

    Args:
        name: spider name, forwarded to the base class initialiser.
        spec: spider specification mapping. It is deep-copied before any
            modification, so the caller's object is left untouched.
        item_schemas: forwarded to every plugin's ``setup_bot``.
        all_extractors: forwarded to every plugin's ``setup_bot``.
        settings: object exposing ``get(key, default)``. ``None`` (the
            default) is treated as an empty mapping.
        **kw: forwarded to the base class initialiser and to
            ``GenericForm``; every entry also overrides the spec key of the
            same name.
    """
    # The signature allows settings=None, but the lookups below call
    # settings.get(); normalise once so the default does not raise
    # AttributeError.
    # NOTE(review): load_plugins/load_plugin_names then receive the empty
    # dict - confirm they tolerate it.
    if settings is None:
        settings = {}

    super(IblSpider, self).__init__(name, **kw)
    self._job_id = settings.get('JOB', '')

    # Work on a private copy: keyword arguments are merged into the spec.
    spec = deepcopy(spec)
    for key, val in kw.items():
        # String values for the keys in STRING_KEYS are split into a list
        # of lines; every other value is stored unchanged.
        if isinstance(val, six.string_types) and key in STRING_KEYS:
            val = val.splitlines()
        spec[key] = val

    # Item templates (page_type defaults to 'item'), ordered by the value
    # of their 'scrapes' entry.
    self._item_template_pages = sorted(
        ((t['scrapes'], t) for t in spec['templates']
         if t.get('page_type', 'item') == 'item'),
        key=itemgetter(0))
    self._templates = [templ for _, templ in self._item_template_pages]

    # Instantiate and configure each plugin, keyed by its name.
    self.plugins = IndexedDict()
    for plugin_class, plugin_name in zip(load_plugins(settings),
                                         load_plugin_names(settings)):
        instance = plugin_class()
        instance.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = instance

    # JS rendering is only considered when a Splash URL is configured.
    self.js_enabled = False
    self.SPLASH_HOST = None
    splash_url = settings.get('SPLASH_URL')
    if splash_url:
        self.SPLASH_HOST = urlparse(splash_url).hostname
        self.js_enabled = spec.get('js_enabled', False)
    # splash_auth is assigned only when JS is enabled and at least one
    # credential is configured; otherwise the attribute is not set here.
    if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                            settings.get('SPLASH_USER') is not None):
        self.splash_auth = basic_auth_header(
            settings.get('SPLASH_USER', ''),
            settings.get('SPLASH_PASS', ''))
    self._filter_js_urls = self._build_js_url_filter(spec)

    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)

    # The fallback domain list is computed from the templates even when
    # the spec provides 'allowed_domains' (dict.get evaluates its default
    # eagerly).
    self.allowed_domains = spec.get(
        'allowed_domains', self._get_allowed_domains(self._templates))
    self.page_actions = spec.get('page_actions', [])
    # An empty domain list is replaced by None.
    if not self.allowed_domains:
        self.allowed_domains = None
def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw):
    """Build the spider from its spec, item schemas and extractors.

    Args:
        name: spider name, forwarded to the base class initialiser.
        spec: spider specification mapping. It is deep-copied before any
            modification, so the caller's object is left untouched.
        item_schemas: forwarded to every plugin's ``setup_bot``.
        all_extractors: forwarded to every plugin's ``setup_bot``.
        settings: handed to ``load_plugins``, ``load_plugin_names`` and
            every plugin's ``setup_bot``.
        **kw: forwarded to the base class initialiser and to
            ``GenericForm``; every entry also overrides the spec key of the
            same name.
    """
    super(IblSpider, self).__init__(name, **kw)

    # Work on a private copy: keyword arguments are merged into the spec.
    spec = deepcopy(spec)
    for key, val in kw.items():
        # six.string_types rather than the Python 2-only ``basestring``
        # builtin, which raises NameError on Python 3. String values for
        # the keys in STRING_KEYS are split into a list of lines.
        if isinstance(val, six.string_types) and key in STRING_KEYS:
            val = val.splitlines()
        spec[key] = val

    # Item templates (page_type defaults to 'item'), ordered by the value
    # of their 'scrapes' entry.
    self._item_template_pages = sorted(
        ((t['scrapes'], t) for t in spec['templates']
         if t.get('page_type', 'item') == 'item'),
        key=itemgetter(0))
    self._templates = [templ for _, templ in self._item_template_pages]

    # Instantiate and configure each plugin, keyed by its name.
    self.plugins = IndexedDict()
    for plugin_class, plugin_name in zip(load_plugins(settings),
                                         load_plugin_names(settings)):
        instance = plugin_class()
        instance.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = instance

    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)

    # The fallback domain list is computed from the templates even when
    # the spec provides 'allowed_domains' (dict.get evaluates its default
    # eagerly).
    self.allowed_domains = spec.get(
        'allowed_domains', self._get_allowed_domains(self._templates))
    # An empty domain list is replaced by None.
    if not self.allowed_domains:
        self.allowed_domains = None