Ejemplo n.º 1
0
 def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
     """Load the project specs from ``datadir`` and prepare the settings.

     ``spider_cls`` is resolved with ``load_object`` when given; otherwise
     ``IblSpider`` is used. A copy of ``settings`` is unfrozen and the
     result of ``load_plugins`` is stored in it under ``'PLUGINS'``.
     Extra keyword arguments are accepted but not used here.
     """
     if settings is None:
         # The signature advertises ``settings=None`` but the code below
         # calls ``settings.copy()``; fall back to the project settings
         # instead of failing with AttributeError.
         from scrapy.utils.project import get_project_settings
         settings = get_project_settings()
     self.spider_cls = load_object(spider_cls) if spider_cls else IblSpider
     self._specs = open_project_from_dir(datadir)
     # Work on an unfrozen copy so new keys can be set.
     settings = settings.copy()
     settings.frozen = False
     settings.set('PLUGINS', load_plugins(settings))
     self.settings = settings
Ejemplo n.º 2
0
 def test_trained(self):
     # Starting from the offset=10 listing page, handle_html is expected
     # to yield requests for the pages at offsets 90, 80 and 70.
     url_template = 'http://www.daft.ie/ireland/houses-for-sale/?offset={}'
     start_url = url_template.format(10)
     spider_spec = {
         'start_urls': [start_url],
         'links_to_follow': 'auto',
         'respect_nofollow': False,
         'follow_patterns': [],
         'exclude_patterns': [],
         'init_requests': [],
         'templates': [daft_sample],
     }
     crawl_settings = Settings()
     crawl_settings.set('LOADED_PLUGINS', load_plugins(crawl_settings))
     spider = IblSpider('hn', spider_spec, {}, {}, settings=crawl_settings)
     start_request = Request(start_url)
     page = HtmlResponse(
         url=start_url, body=daft_body, request=start_request,
         encoding="utf-8")
     # Only Request objects are of interest; anything else is ignored.
     followed_urls = set()
     for result in spider.handle_html(page):
         if isinstance(result, Request):
             followed_urls.add(result.url)
     expected_urls = set(
         url_template.format(offset) for offset in (90, 80, 70))
     self.assertEqual(expected_urls, followed_urls)
Ejemplo n.º 3
0
    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        """Build the spider from ``spec`` and set up its plugins.

        ``spec`` is deep-copied before being modified. Every keyword
        argument is written into the copy, and string values whose key is
        in ``STRING_KEYS`` are split into lines first. ``item_schemas`` and
        ``all_extractors`` are handed to each plugin's ``setup_bot``.
        """
        super(IblSpider, self).__init__(name, **kw)
        spec = deepcopy(spec)
        for option, value in kw.items():
            split_lines = (isinstance(value, basestring)
                           and option in STRING_KEYS)
            spec[option] = value.splitlines() if split_lines else value

        # Keep only 'item' templates (the default page_type), ordered by
        # their 'scrapes' value.
        item_pages = [
            (template['scrapes'], template)
            for template in spec['templates']
            if template.get('page_type', 'item') == 'item'
        ]
        item_pages.sort(key=itemgetter(0))
        self._item_template_pages = item_pages

        self._templates = [page[1] for page in item_pages]

        self.plugins = IndexedDict()
        plugin_classes = load_plugins(settings)
        plugin_names = load_plugin_names(settings)
        for plugin_cls, plugin_key in zip(plugin_classes, plugin_names):
            plugin = plugin_cls()
            plugin.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_key] = plugin

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        domains = spec.get(
            'allowed_domains', self._get_allowed_domains(self._templates))
        # An empty value is normalized to None.
        self.allowed_domains = domains if domains else None
Ejemplo n.º 4
0
 def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
     """Load the project specs from ``datadir`` and prepare the settings.

     ``spider_cls`` is resolved with ``load_object`` when given; otherwise
     ``IblSpider`` is used. A copy of ``settings`` is unfrozen and the
     result of ``load_plugins`` is stored in it under ``'PLUGINS'``.
     Extra keyword arguments are accepted but not used here.
     """
     if settings is None:
         # ``settings`` defaults to None, yet ``settings.copy()`` is called
         # below; use the project settings rather than raising
         # AttributeError on the default argument.
         from scrapy.utils.project import get_project_settings
         settings = get_project_settings()
     self.spider_cls = load_object(spider_cls) if spider_cls else IblSpider
     self._specs = open_project_from_dir(datadir)
     # Work on an unfrozen copy so new keys can be set.
     settings = settings.copy()
     settings.frozen = False
     settings.set('PLUGINS', load_plugins(settings))
     self.settings = settings
Ejemplo n.º 5
0
 def _configure_plugins(self, settings, spec, schemas, extractors):
     """Instantiate the plugins for ``settings`` and return them by name.

     Each class from ``load_plugins(settings)`` is paired with the name at
     the same position in ``load_plugin_names(settings)``, instantiated,
     and initialized through ``setup_bot``.
     """
     registry = IndexedDict()
     classes = load_plugins(settings)
     names = load_plugin_names(settings)
     for cls, name in zip(classes, names):
         plugin = cls()
         plugin.setup_bot(settings, spec, schemas, extractors)
         registry[name] = plugin
     return registry
Ejemplo n.º 6
0
 def _configure_plugins(self, settings, spec, schemas, extractors):
     """Instantiate the plugins for ``settings`` and return them by name.

     Each class from ``load_plugins(settings)`` is paired with the name at
     the same position in ``load_plugin_names(settings)``. Every instance
     is initialized through ``setup_bot``, which also receives this
     object's ``logger``.
     """
     configured = IndexedDict()
     plugin_classes = load_plugins(settings)
     plugin_names = load_plugin_names(settings)
     for plugin_cls, key in zip(plugin_classes, plugin_names):
         plugin = plugin_cls()
         plugin.setup_bot(settings, spec, schemas, extractors, self.logger)
         configured[key] = plugin
     return configured
Ejemplo n.º 7
0
 def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
     """Load the project specs from ``datadir`` and prepare the settings.

     Logs the slybot version, uses ``get_project_settings()`` when no
     ``settings`` are given, and stores an unfrozen copy of the settings
     with the result of ``load_plugins`` under ``'LOADED_PLUGINS'``.
     ``spider_cls`` is resolved with ``load_object`` when given; otherwise
     ``IblSpider`` is used.
     """
     logging.info('Slybot %s Spider', slybot.__version__)
     base_settings = get_project_settings() if settings is None else settings
     if spider_cls:
         self.spider_cls = load_object(spider_cls)
     else:
         self.spider_cls = IblSpider
     self._specs = open_project_from_dir(datadir)
     # Modify a copy, not the object that was passed in.
     own_settings = base_settings.copy()
     own_settings.frozen = False
     own_settings.set('LOADED_PLUGINS', load_plugins(own_settings))
     self.settings = own_settings
 def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
     """Load the project specs from ``datadir`` and prepare the settings.

     The slybot version is logged first. When ``settings`` is ``None`` the
     project settings are used. The settings are copied, unfrozen, and
     extended with the loaded plugins under ``'LOADED_PLUGINS'``.
     """
     logging.info('Slybot %s Spider', slybot.__version__)
     if settings is None:
         settings = get_project_settings()
     if spider_cls:
         resolved_cls = load_object(spider_cls)
     else:
         resolved_cls = IblSpider
     self.spider_cls = resolved_cls
     self._specs = open_project_from_dir(datadir)
     # The copy is what gets modified and stored.
     writable = settings.copy()
     writable.frozen = False
     loaded = load_plugins(writable)
     writable.set('LOADED_PLUGINS', loaded)
     self.settings = writable
Ejemplo n.º 9
0
    def __init__(self,
                 name,
                 spec,
                 item_schemas,
                 all_extractors,
                 settings=None,
                 **kw):
        """Set up the spider from ``spec``, its plugins and Splash options.

        Args:
            name: spider name, forwarded to the parent constructor.
            spec: spider specification mapping; it is deep-copied before
                being modified.
            item_schemas: passed through to each plugin's ``setup_bot``.
            all_extractors: passed through to each plugin's ``setup_bot``.
            settings: settings object. Although the default is ``None``,
                ``settings.get`` is called unconditionally below, so a
                real settings object is required in practice.
            **kw: forwarded to the parent constructor and to
                ``GenericForm``, and also written into the copied spec.
        """
        super(IblSpider, self).__init__(name, **kw)
        self._job_id = settings.get('JOB', '')
        spec = deepcopy(spec)
        # Keyword arguments override spec entries; string values for keys
        # listed in STRING_KEYS are split into a list of lines first.
        for key, val in kw.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        # Keep only templates whose page_type is 'item' (the default when
        # the key is missing), ordered by their 'scrapes' value.
        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'),
            key=itemgetter(0))

        self._templates = [templ for _, templ in self._item_template_pages]

        # Instantiate each plugin class and register it under the name at
        # the same position (assumes both helpers return parallel
        # sequences -- TODO confirm).
        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        # js_enabled is only read from the spec when SPLASH_URL is set.
        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            self.js_enabled = spec.get('js_enabled', False)
        # NOTE: splash_auth is only assigned in this branch; when JS is
        # disabled or no credentials are set the attribute is not created.
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None
                                or settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self._filter_js_urls = self._build_js_url_filter(spec)
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        # The fallback is computed from the item templates even when the
        # spec provides 'allowed_domains' (the default argument is
        # evaluated eagerly).
        self.allowed_domains = spec.get(
            'allowed_domains', self._get_allowed_domains(self._templates))
        self.page_actions = spec.get('page_actions', [])
        # An empty allowed_domains value is normalized to None.
        if not self.allowed_domains:
            self.allowed_domains = None
Ejemplo n.º 10
0
    def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
        """Load the project specs from ``datadir`` and prepare the settings.

        ``datadir`` may be a zip archive; in that case it is extracted to a
        temporary directory, which is removed at interpreter exit, and the
        specs are read from there. When ``settings`` is ``None`` the
        project settings are used. An unfrozen copy of the settings is
        stored with the result of ``load_plugins`` under
        ``'LOADED_PLUGINS'``.
        """
        logging.info('Slybot %s Spider', slybot.__version__)
        if is_zipfile(datadir):
            tempdir = tempfile.mkdtemp(prefix='slybot-')
            # Register the cleanup before extracting so that a failed
            # extraction does not leave the temporary directory behind.
            atexit.register(shutil.rmtree, tempdir)
            # Close the archive handle deterministically instead of
            # leaving it open until garbage collection.
            with ZipFile(datadir) as archive:
                archive.extractall(tempdir)
            datadir = tempdir

        if settings is None:
            settings = get_project_settings()
        self.spider_cls = load_object(spider_cls) if spider_cls else IblSpider
        self._specs = open_project_from_dir(datadir)
        # Work on an unfrozen copy so new keys can be set.
        settings = settings.copy()
        settings.frozen = False
        settings.set('LOADED_PLUGINS', load_plugins(settings))
        self.settings = settings
Ejemplo n.º 11
0
    def __init__(self, datadir, spider_cls=None, settings=None, **kwargs):
        """Load the project specs from ``datadir`` and prepare the settings.

        ``datadir`` may be a zip archive; in that case it is extracted to a
        temporary directory, which is removed at interpreter exit, and the
        specs are read from there. When ``settings`` is ``None`` the
        project settings are used. An unfrozen copy of the settings is
        stored with the result of ``load_plugins`` under
        ``'LOADED_PLUGINS'``.
        """
        logging.info('Slybot %s Spider', slybot.__version__)
        if is_zipfile(datadir):
            tempdir = tempfile.mkdtemp(prefix='slybot-')
            # Cleanup is registered first so the directory is removed at
            # exit even if extraction raises part-way through.
            atexit.register(shutil.rmtree, tempdir)
            # The context manager closes the archive's file handle as soon
            # as extraction finishes.
            with ZipFile(datadir) as archive:
                archive.extractall(tempdir)
            datadir = tempdir

        if settings is None:
            settings = get_project_settings()
        self.spider_cls = load_object(spider_cls) if spider_cls else IblSpider
        self._specs = open_project_from_dir(datadir)
        # Work on an unfrozen copy so new keys can be set.
        settings = settings.copy()
        settings.frozen = False
        settings.set('LOADED_PLUGINS', load_plugins(settings))
        self.settings = settings
Ejemplo n.º 12
0
    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        """Set up the spider from ``spec``, its plugins and Splash options.

        Args:
            name: spider name, forwarded to the parent constructor.
            spec: spider specification mapping; it is deep-copied before
                being modified.
            item_schemas: passed through to each plugin's ``setup_bot``.
            all_extractors: passed through to each plugin's ``setup_bot``.
            settings: settings object. Although the default is ``None``,
                ``settings.get`` is called unconditionally below, so a
                real settings object is required in practice.
            **kw: forwarded to the parent constructor and to
                ``GenericForm``, and also written into the copied spec.
        """
        super(IblSpider, self).__init__(name, **kw)
        self._job_id = settings.get('JOB', '')
        spec = deepcopy(spec)
        # Keyword arguments override spec entries; string values for keys
        # listed in STRING_KEYS are split into a list of lines first.
        for key, val in kw.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        # Keep only templates whose page_type is 'item' (the default when
        # the key is missing), ordered by their 'scrapes' value.
        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'), key=itemgetter(0))

        self._templates = [templ for _, templ in self._item_template_pages]

        # Instantiate each plugin class and register it under the name at
        # the same position (assumes both helpers return parallel
        # sequences -- TODO confirm).
        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        # js_enabled is only read from the spec when SPLASH_URL is set.
        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            self.js_enabled = spec.get('js_enabled', False)
        # NOTE: splash_auth is only assigned in this branch; when JS is
        # disabled or no credentials are set the attribute is not created.
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                                settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self._filter_js_urls = self._build_js_url_filter(spec)
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        # The fallback is computed from the item templates even when the
        # spec provides 'allowed_domains' (the default argument is
        # evaluated eagerly).
        self.allowed_domains = spec.get(
            'allowed_domains',
            self._get_allowed_domains(self._templates)
        )
        self.page_actions = spec.get('page_actions', [])
        # An empty allowed_domains value is normalized to None.
        if not self.allowed_domains:
            self.allowed_domains = None
Ejemplo n.º 13
0
 def test_trained(self):
     # Starting from the offset=10 listing page, handle_html is expected
     # to yield requests for the pages at offsets 90, 80 and 70.
     url_template = 'http://www.daft.ie/ireland/houses-for-sale/?offset={}'
     start_url = url_template.format(10)
     spider_spec = {
         'start_urls': [start_url],
         'links_to_follow': 'auto',
         'respect_nofollow': False,
         'follow_patterns': [],
         'exclude_patterns': [],
         'init_requests': [],
         'templates': [daft_sample],
     }
     crawl_settings = Settings()
     crawl_settings.set('LOADED_PLUGINS', load_plugins(crawl_settings))
     spider = IblSpider('hn', spider_spec, {}, {}, settings=crawl_settings)
     start_request = Request(start_url)
     page = UTF8HtmlResponse(
         url=start_url, body=daft_body, request=start_request)
     # Only Request objects are of interest; anything else is ignored.
     followed_urls = set()
     for result in spider.handle_html(page):
         if isinstance(result, Request):
             followed_urls.add(result.url)
     expected_urls = set(
         url_template.format(offset) for offset in (90, 80, 70))
     self.assertEqual(expected_urls, followed_urls)
Ejemplo n.º 14
0
    def __init__(self,
                 name,
                 spec,
                 item_schemas,
                 all_extractors,
                 settings=None,
                 **kw):
        """Build the spider from ``spec`` and set up its plugins.

        ``spec`` is deep-copied before being modified. Every keyword
        argument is written into the copy, and string values whose key is
        in ``STRING_KEYS`` are split into lines first. ``item_schemas`` and
        ``all_extractors`` are handed to each plugin's ``setup_bot``.
        """
        super(IblSpider, self).__init__(name, **kw)
        spec = deepcopy(spec)
        for override_key, override in kw.items():
            if isinstance(override, basestring) and override_key in STRING_KEYS:
                spec[override_key] = override.splitlines()
            else:
                spec[override_key] = override

        # Keep only 'item' templates (the default page_type), ordered by
        # their 'scrapes' value.
        scrapes_and_templates = []
        for template in spec['templates']:
            if template.get('page_type', 'item') == 'item':
                scrapes_and_templates.append((template['scrapes'], template))
        self._item_template_pages = sorted(
            scrapes_and_templates, key=itemgetter(0))

        self._templates = [
            entry[1] for entry in self._item_template_pages]

        self.plugins = IndexedDict()
        plugin_classes = load_plugins(settings)
        plugin_names = load_plugin_names(settings)
        for plugin_cls, plugin_key in zip(plugin_classes, plugin_names):
            plugin = plugin_cls()
            plugin.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_key] = plugin

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        init_requests = spec.get("init_requests", [])
        self._create_init_requests(init_requests)
        self._process_start_urls(spec)
        template_domains = self._get_allowed_domains(self._templates)
        domains = spec.get('allowed_domains', template_domains)
        # An empty value is normalized to None.
        self.allowed_domains = domains if domains else None