def test_advanced_search_form_regex(self):
    url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
    body = open(join(_PATH, "data", "ebay_advanced_search.html")).read()
    form_descriptor = json.loads("""{
        "type": "form",
        "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
        "xpath": "//form[@name='adv_search_from']",
        "fields": [
            {"xpath": ".//*[@name='_nkw']", "type": "constants", "value": ["Cars"]},
            {"xpath": ".//*[@name='_in_kw']", "type": "iterate", "value": "[1-2]"}
        ]
    }""")
    generic_form = GenericForm()
    start_requests = list(
        generic_form.fill_generic_form(url, body, form_descriptor))
    expected_requests = [
        ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'),
          ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''),
          ('_ipg', '50'), ('_salic', '1'), ('_sasl', ''), ('_udlo', ''),
          ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''),
          ('_in_kw', '1'), ('_nkw', u'Cars'), ('_sacat', '0'), ('_oexkw', u''),
          ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')],
         'http://www.ebay.com/sch/i.html', 'GET'),
        ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'),
          ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''),
          ('_ipg', '50'), ('_salic', '1'), ('_sasl', ''), ('_udlo', ''),
          ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''),
          ('_in_kw', '2'), ('_nkw', u'Cars'), ('_sacat', '0'), ('_oexkw', u''),
          ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')],
         'http://www.ebay.com/sch/i.html', 'GET')]
    self.assertEqual(start_requests, expected_requests)
def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw):
    self.start_url_generators = {
        'start_urls': IdentityGenerator(),
        'generated_urls': UrlGenerator(settings, kw),
        'url': IdentityGenerator(),
        'feed': FeedGenerator(self.parse),
        'generated': FragmentGenerator(),
    }
    self.generic_form = GenericForm(**kw)
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    self._add_spider_args_to_spec(spec, kw)
    self._configure_js(spec, settings)
    self.plugins = self._configure_plugins(
        settings, spec, item_schemas, all_extractors)
    self.login_requests, self.form_requests = [], []
    self._start_urls = self._create_start_urls(spec)
    self._start_requests = self._create_start_requests(spec)
    self._create_init_requests(spec)
    self._add_allowed_domains(spec)
    self.page_actions = spec.get('page_actions', [])
def __init__(self, name, spec, item_schemas, all_extractors, **kw):
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, basestring) and key in ['start_urls', 'exclude_patterns',
                                                   'follow_patterns', 'allowed_domains']:
            val = val.splitlines()
        spec[key] = val
    self._item_template_pages = sorted((
        [t['scrapes'], dict_to_page(t, 'annotated_body'), t.get('extractors', [])]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda pair: pair[0])
    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates'] if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None
    self._ipages = [page for _, page, _ in self._item_template_pages]
    self.html_link_extractor = HtmlLinkExtractor()
    self.rss_link_extractor = RssLinkExtractor()
    self.build_url_filter(spec)
    self.itemcls_info = {}
    for itemclass_name, triplets in itertools.groupby(
            self._item_template_pages, operator.itemgetter(0)):
        page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
        schema = item_schemas[itemclass_name]
        item_cls = SlybotItem.create_iblitem_class(schema)
        page_descriptor_pairs = []
        for page, template_extractors in page_extractors_pairs:
            item_descriptor = create_slybot_item_descriptor(schema)
            apply_extractors(item_descriptor, template_extractors, all_extractors)
            page_descriptor_pairs.append((page, item_descriptor))
        extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
        self.itemcls_info[itemclass_name] = {
            'class': item_cls,
            'descriptor': item_descriptor,
            'extractor': extractor,
        }
    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get('allowed_domains',
                                    self._get_allowed_domains(self._ipages))
    if not self.allowed_domains:
        self.allowed_domains = None
def test_simple_search_form_with_file_type(self): url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc' body = open(join(_PATH, "data", "ebay_advanced_search.html")).read() form_descriptor = json.loads("""{ "type": "form", "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc", "xpath": "//form[@name='adv_search_from']", "fields": [ { "name": "my_param", "type": "inurl", "value": "file://%s/test_params.txt", "file_values": ["Cars", "Boats", "Houses", "Electronics"] } ] }""" % join(_PATH, "data")) generic_form = GenericForm() start_requests = list( generic_form.fill_generic_form(url, body, form_descriptor)) expected_requests = [ ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Cars'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Boats'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Houses'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Electronics'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET') ] self.assertEqual(request_to_set(start_requests), request_to_set(expected_requests))
def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw):
    super(IblSpider, self).__init__(name, **kw)
    self._job_id = settings.get('JOB', '')
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, six.string_types) and key in STRING_KEYS:
            val = val.splitlines()
        spec[key] = val
    self._item_template_pages = sorted(
        ((t['scrapes'], t) for t in spec['templates']
         if t.get('page_type', 'item') == 'item'),
        key=itemgetter(0))
    self._templates = [templ for _, templ in self._item_template_pages]
    self.plugins = IndexedDict()
    for plugin_class, plugin_name in zip(load_plugins(settings),
                                         load_plugin_names(settings)):
        instance = plugin_class()
        instance.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = instance
    self.js_enabled = False
    self.SPLASH_HOST = None
    if settings.get('SPLASH_URL'):
        self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
        self.js_enabled = spec.get('js_enabled', False)
    if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                            settings.get('SPLASH_USER') is not None):
        self.splash_auth = basic_auth_header(
            settings.get('SPLASH_USER', ''),
            settings.get('SPLASH_PASS', ''))
    self._filter_js_urls = self._build_js_url_filter(spec)
    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get(
        'allowed_domains', self._get_allowed_domains(self._templates))
    self.page_actions = spec.get('page_actions', [])
    if not self.allowed_domains:
        self.allowed_domains = None
def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw):
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, basestring) and key in STRING_KEYS:
            val = val.splitlines()
        spec[key] = val
    self._item_template_pages = sorted(
        ((t['scrapes'], t) for t in spec['templates']
         if t.get('page_type', 'item') == 'item'),
        key=itemgetter(0))
    self._templates = [templ for _, templ in self._item_template_pages]
    self.plugins = IndexedDict()
    for plugin_class, plugin_name in zip(load_plugins(settings),
                                         load_plugin_names(settings)):
        instance = plugin_class()
        instance.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = instance
    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get(
        'allowed_domains', self._get_allowed_domains(self._templates)
    )
    if not self.allowed_domains:
        self.allowed_domains = None
def __init__(self, name, spec, item_schemas, all_extractors, **kw): super(IblSpider, self).__init__(name, **kw) self._item_template_pages = sorted(( [t['scrapes'], dict_to_page(t, 'annotated_body'), t.get('extractors', [])] \ for t in spec['templates'] if t.get('page_type', 'item') == 'item' ), key=lambda pair: pair[0]) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self._ipages = [page for _, page, _ in self._item_template_pages] self.start_urls = self.start_urls or spec.get('start_urls') if isinstance(self.start_urls, basestring): self.start_urls = self.start_urls.splitlines() self.html_link_extractor = HtmlLinkExtractor() self.rss_link_extractor = RssLinkExtractor() self.allowed_domains = self._get_allowed_domains(self._ipages) self.build_url_filter(spec) self.itemcls_info = {} for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)): page_extractors_pairs = map(operator.itemgetter(1, 2), triplets) schema = item_schemas[itemclass_name] item_cls = get_iblitem_class(schema) page_descriptor_pairs = [] for page, template_extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) apply_extractors(item_descriptor, template_extractors, all_extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, } self.login_requests = [] self.form_requests = [] for rdata in spec.get("init_requests", []): if rdata["type"] == "login": request = Request(url=rdata.pop("loginurl"), meta=rdata, callback=self.parse_login_page, dont_filter=True) self.login_requests.append(request) elif rdata["type"] == "form": self.generic_form = GenericForm(**kw) self.form_requests.append(self.get_generic_form_start_request(rdata))
def __init__(self, name, spec, item_schemas, all_extractors, **kw): super(IblSpider, self).__init__(name, **kw) spec = deepcopy(spec) for key, val in kw.items(): if isinstance(val, basestring) and key in ['start_urls', 'exclude_patterns', 'follow_patterns', 'allowed_domains']: val = val.splitlines() spec[key] = val self.i = time.time() self.getProxyList() self._item_template_pages = sorted(( [t['scrapes'], dict_to_page(t, 'annotated_body'), t.get('extractors', [])] \ for t in spec['templates'] if t.get('page_type', 'item') == 'item' ), key=lambda pair: pair[0]) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self._ipages = [page for _, page, _ in self._item_template_pages] self.html_link_extractor = HtmlLinkExtractor() self.rss_link_extractor = RssLinkExtractor() self.build_url_filter(spec) self.itemcls_info = {} for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)): page_extractors_pairs = map(operator.itemgetter(1, 2), triplets) schema = item_schemas[itemclass_name] item_cls = SlybotItem.create_iblitem_class(schema) page_descriptor_pairs = [] for page, template_extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) apply_extractors(item_descriptor, template_extractors, all_extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, } self.login_requests = [] self.form_requests = [] self._start_requests = [] self.generic_form = GenericForm(**kw) self._create_init_requests(spec.get("init_requests", [])) self._process_start_urls(spec) self.allowed_domains = spec.get('allowed_domains', self._get_allowed_domains(self._ipages)) if not self.allowed_domains: self.allowed_domains = None
def test_simple_search_form_with_named_parameter(self):
    url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
    body = open(join(_PATH, "data", "ebay_advanced_search.html")).read()
    form_descriptor = json.loads("""{
        "type": "form",
        "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
        "xpath": "//form[@name='adv_search_from']",
        "fields": [
            {"name": "my_param", "type": "constants", "value": ["Cars"]}
        ]
    }""")
    generic_form = GenericForm()
    start_requests = list(
        generic_form.fill_generic_form(url, body, form_descriptor))
    expected_requests = [
        ([('_in_kw', '1'), ('_udlo', ''), ('_ex_kw', ''), ('_nkw', ''),
          ('_ipg', '50'), ('_adv', '1'), ('_salic', '1'), ('_dmd', '1'),
          ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_sop', '12'),
          (u'my_param', u'Cars'), ('_sasl', '')],
         'http://www.ebay.com/sch/i.html', 'GET')]
    self.assertEqual(start_requests, expected_requests)
def test_simple_search_form_with_file_type(self): url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc' body = open(join(_PATH, "data", "ebay_advanced_search.html")).read() form_descriptor = json.loads("""{ "type": "form", "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc", "xpath": "//form[@name='adv_search_from']", "fields": [ { "name": "my_param", "type": "inurl", "value": "file://%s/test_params.txt", "file_values": ["Cars", "Boats", "Houses", "Electronics"] } ] }""" % join(_PATH, "data")) generic_form = GenericForm() start_requests = list(generic_form.fill_generic_form(url, body, form_descriptor)) expected_requests = [([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Cars'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Boats'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Houses'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Electronics'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET')] self.assertEqual(start_requests, expected_requests)
def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw):
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, basestring) and key in STRING_KEYS:
            val = val.splitlines()
        spec[key] = val
    self._item_template_pages = sorted(
        ((t['scrapes'], t) for t in spec['templates']
         if t.get('page_type', 'item') == 'item'),
        key=itemgetter(0))
    self._templates = [templ for _, templ in self._item_template_pages]
    self.plugins = IndexedDict()
    for plugin_class, plugin_name in zip(load_plugins(settings),
                                         load_plugin_names(settings)):
        instance = plugin_class()
        instance.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = instance
    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get(
        'allowed_domains', self._get_allowed_domains(self._templates))
    if not self.allowed_domains:
        self.allowed_domains = None
def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw):
    self.start_url_generators = {
        'start_urls': IdentityGenerator(),
        'generated_urls': UrlGenerator(settings, kw),
        'url': IdentityGenerator(),
        'feed': FeedGenerator(self.parse),
        'generated': FragmentGenerator(),
    }
    self.generic_form = GenericForm(**kw)
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    self._add_spider_args_to_spec(spec, kw)
    # If actions are configured, set js_enabled to True and add the url of
    # each action to js_enable_patterns.
    self.actions = spec.get('actions', [])
    '''
    if len(self.actions):
        spec['js_enabled'] = True
        enable_patterns = spec.get('js_enable_patterns', [])
        for action in self.actions:
            enable_patterns.append(action.get('url'))
        spec['js_enable_patterns'] = enable_patterns
    '''
    self._configure_js(spec, settings)
    self.plugins = self._configure_plugins(
        settings, spec, item_schemas, all_extractors)
    self.login_requests, self.form_requests = [], []
    self._start_urls = self._create_start_urls(spec)
    self._start_requests = self._create_start_requests(spec)
    self._create_init_requests(spec)
    self._add_allowed_domains(spec)
    self.page_actions = spec.get('page_actions', [])
def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw):
    super(IblSpider, self).__init__(name, **kw)
    self._job_id = settings.get('JOB', '')
    spec = deepcopy(spec)
    for key, val in kw.items():
        if isinstance(val, six.string_types) and key in STRING_KEYS:
            val = val.splitlines()
        spec[key] = val
    self._item_template_pages = sorted(
        ((t['scrapes'], t) for t in spec['templates']
         if t.get('page_type', 'item') == 'item'),
        key=itemgetter(0))
    self._templates = [templ for _, templ in self._item_template_pages]
    self.plugins = IndexedDict()
    for plugin_class, plugin_name in zip(load_plugins(settings),
                                         load_plugin_names(settings)):
        instance = plugin_class()
        instance.setup_bot(settings, spec, item_schemas, all_extractors)
        self.plugins[plugin_name] = instance
    self.js_enabled = False
    self.SPLASH_HOST = None
    if settings.get('SPLASH_URL'):
        self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
        self.js_enabled = spec.get('js_enabled', False)
    if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                            settings.get('SPLASH_USER') is not None):
        self.splash_auth = basic_auth_header(
            settings.get('SPLASH_USER', ''),
            settings.get('SPLASH_PASS', ''))
    self._filter_js_urls = self._build_js_url_filter(spec)
    self.login_requests = []
    self.form_requests = []
    self._start_requests = []
    self.generic_form = GenericForm(**kw)
    self._create_init_requests(spec.get("init_requests", []))
    self._process_start_urls(spec)
    self.allowed_domains = spec.get(
        'allowed_domains', self._get_allowed_domains(self._templates)
    )
    self.page_actions = spec.get('page_actions', [])
    if not self.allowed_domains:
        self.allowed_domains = None
def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw):
    self.start_url_generators = {
        'start_urls': StartUrls(),
        'generated_urls': UrlGenerator(settings, kw)
    }
    self.generic_form = GenericForm(**kw)
    super(IblSpider, self).__init__(name, **kw)
    spec = deepcopy(spec)
    self._add_spider_args_to_spec(spec, kw)
    self.plugins = self._configure_plugins(
        settings, spec, item_schemas, all_extractors)
    self._configure_js(spec, settings)
    self.login_requests, self.form_requests = [], []
    self._start_requests = []
    self._create_init_requests(spec)
    self._process_start_urls(spec)
    self._add_allowed_domains(spec)
    self.page_actions = spec.get('page_actions', [])
class IblSpider(Spider): def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw): super(IblSpider, self).__init__(name, **kw) spec = deepcopy(spec) for key, val in kw.items(): if isinstance(val, basestring) and key in STRING_KEYS: val = val.splitlines() spec[key] = val self._item_template_pages = sorted( ((t['scrapes'], t) for t in spec['templates'] if t.get('page_type', 'item') == 'item'), key=itemgetter(0)) self._templates = [templ for _, templ in self._item_template_pages] self.plugins = IndexedDict() for plugin_class, plugin_name in zip(load_plugins(settings), load_plugin_names(settings)): instance = plugin_class() instance.setup_bot(settings, spec, item_schemas, all_extractors) self.plugins[plugin_name] = instance self.login_requests = [] self.form_requests = [] self._start_requests = [] self.generic_form = GenericForm(**kw) self._create_init_requests(spec.get("init_requests", [])) self._process_start_urls(spec) self.allowed_domains = spec.get( 'allowed_domains', self._get_allowed_domains(self._templates) ) if not self.allowed_domains: self.allowed_domains = None def _process_start_urls(self, spec): self.start_urls = spec.get('start_urls') for url in self.start_urls: self._start_requests.append(Request(url, callback=self.parse, dont_filter=True)) def _create_init_requests(self, spec): for rdata in spec: if rdata["type"] == "login": request = Request(url=rdata.pop("loginurl"), meta=rdata, callback=self.parse_login_page, dont_filter=True) self.login_requests.append(request) elif rdata["type"] == "form": self.form_requests.append( self.get_generic_form_start_request(rdata) ) elif rdata["type"] == "start": self._start_requests.append( self._create_start_request_from_specs(rdata) ) def parse_login_page(self, response): username = response.request.meta["username"] password = response.request.meta["password"] args, url, method = fill_login_form(response.url, response.body, username, password) return FormRequest(url, method=method, formdata=args, callback=self.after_login, dont_filter=True) def after_login(self, response): for result in self.parse(response): yield result for req in self._start_requests: yield req def get_generic_form_start_request(self, form_descriptor): file_fields = list(self.generic_form.get_url_field(form_descriptor)) if file_fields: (field_index, field_descriptor) = file_fields.pop(0) form_descriptor['field_index'] = field_index return FormRequest(self.generic_form.get_value(field_descriptor), meta=form_descriptor, callback=self.parse_field_url_page, dont_filter=True) else: return Request(url=form_descriptor.pop("form_url"), meta=form_descriptor, callback=self.parse_form_page, dont_filter=True) def parse_field_url_page(self, response): form_descriptor = response.request.meta field_index = form_descriptor['field_index'] field_descriptor = form_descriptor['fields'][field_index] self.generic_form.set_values_url_field(field_descriptor, response.body) yield self.get_generic_form_start_request(form_descriptor) def parse_form_page(self, response): fill_form = self.generic_form.fill_generic_form try: for (args, url, method) in fill_form(response.url, response.body, response.request.meta): yield FormRequest(url, method=method, formdata=args, callback=self.after_form_page, dont_filter=True) except Exception, e: self.log(str(e), log.WARNING) for req in self._start_requests: yield req
class IblSpider(BaseSpider): def __init__(self, name, spec, item_schemas, all_extractors, **kw): super(IblSpider, self).__init__(name, **kw) self._item_template_pages = sorted(( [t['scrapes'], dict_to_page(t, 'annotated_body'), t.get('extractors', [])] \ for t in spec['templates'] if t.get('page_type', 'item') == 'item' ), key=lambda pair: pair[0]) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self._ipages = [page for _, page, _ in self._item_template_pages] self.html_link_extractor = HtmlLinkExtractor() self.rss_link_extractor = RssLinkExtractor() self.build_url_filter(spec) self.itemcls_info = {} for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)): page_extractors_pairs = map(operator.itemgetter(1, 2), triplets) schema = item_schemas[itemclass_name] item_cls = SlybotItem.create_iblitem_class(schema) page_descriptor_pairs = [] for page, template_extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) apply_extractors(item_descriptor, template_extractors, all_extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, } self.login_requests = [] self.form_requests = [] self._start_requests = [] self.generic_form = GenericForm(**kw) self._create_init_requests(spec.get("init_requests", [])) self._process_start_urls(spec) self.allowed_domains = spec.get('allowed_domains', self._get_allowed_domains(self._ipages)) if not self.allowed_domains: self.allowed_domains = None def _process_start_urls(self, spec): self.start_urls = self.start_urls or spec.get('start_urls') if isinstance(self.start_urls, basestring): self.start_urls = self.start_urls.splitlines() for url in self.start_urls: self._start_requests.append(Request(url, callback=self.parse, dont_filter=True)) def _create_init_requests(self, spec): for rdata in spec: if rdata["type"] == "login": request = Request(url=rdata.pop("loginurl"), meta=rdata, callback=self.parse_login_page, dont_filter=True) self.login_requests.append(request) elif rdata["type"] == "form": self.form_requests.append(self.get_generic_form_start_request(rdata)) elif rdata["type"] == "start": self._start_requests.append(self._create_start_request_from_specs(rdata)) def parse_login_page(self, response): username = response.request.meta["username"] password = response.request.meta["password"] args, url, method = fill_login_form(response.url, response.body, username, password) return FormRequest(url, method=method, formdata=args, callback=self.after_login, dont_filter=True) def after_login(self, response): for result in self.parse(response): yield result for req in self._start_requests: yield req def get_generic_form_start_request(self, form_descriptor): file_fields = list(self.generic_form.get_url_field(form_descriptor)) if file_fields: (field_index, field_descriptor) = file_fields.pop(0) form_descriptor['field_index'] = field_index return FormRequest(self.generic_form.get_value(field_descriptor), meta=form_descriptor, callback=self.parse_field_url_page, dont_filter=True) else: return 
Request(url=form_descriptor.pop("form_url"), meta=form_descriptor, callback=self.parse_form_page, dont_filter=True) def parse_field_url_page(self, response): form_descriptor = response.request.meta field_index = form_descriptor['field_index'] field_descriptor = form_descriptor['fields'][field_index] self.generic_form.set_values_url_field(field_descriptor, response.body) yield self.get_generic_form_start_request(form_descriptor) def parse_form_page(self, response): try: for (args, url, method) in self.generic_form.fill_generic_form(response.url, response.body, response.request.meta): yield FormRequest(url, method=method, formdata=args, callback=self.after_form_page, dont_filter=True) except Exception, e: self.log(str(e), log.WARNING) for req in self._start_requests: yield req
def __init__(self, name, spec, item_schemas, all_extractors, **kw): super(IblSpider, self).__init__(name, **kw) self._item_template_pages = sorted(( [t['scrapes'], dict_to_page(t, 'annotated_body'), t.get('extractors', [])] \ for t in spec['templates'] if t.get('page_type', 'item') == 'item' ), key=lambda pair: pair[0]) # generate ibl extractor for links pages _links_pages = [dict_to_page(t, 'annotated_body') for t in spec['templates'] if t.get('page_type') == 'links'] _links_item_descriptor = create_slybot_item_descriptor({'fields': {}}) self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \ if _links_pages else None self._ipages = [page for _, page, _ in self._item_template_pages] self.start_urls = self.start_urls or spec.get('start_urls') if isinstance(self.start_urls, basestring): self.start_urls = self.start_urls.splitlines() self.html_link_extractor = HtmlLinkExtractor() self.rss_link_extractor = RssLinkExtractor() self.allowed_domains = spec.get('allowed_domains', self._get_allowed_domains(self._ipages)) if not self.allowed_domains: self.allowed_domains = None self.build_url_filter(spec) self.itemcls_info = {} for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)): page_extractors_pairs = map(operator.itemgetter(1, 2), triplets) schema = item_schemas[itemclass_name] item_cls = get_iblitem_class(schema) page_descriptor_pairs = [] for page, template_extractors in page_extractors_pairs: item_descriptor = create_slybot_item_descriptor(schema) apply_extractors(item_descriptor, template_extractors, all_extractors) page_descriptor_pairs.append((page, item_descriptor)) extractor = InstanceBasedLearningExtractor(page_descriptor_pairs) self.itemcls_info[itemclass_name] = { 'class': item_cls, 'descriptor': item_descriptor, 'extractor': extractor, } self.login_requests = [] self.form_requests = [] for rdata in spec.get("init_requests", []): if rdata["type"] == "login": request = Request(url=rdata.pop("loginurl"), meta=rdata, callback=self.parse_login_page, dont_filter=True) self.login_requests.append(request) elif rdata["type"] == "form": self.generic_form = GenericForm(**kw) self.form_requests.append(self.get_generic_form_start_request(rdata))
class IblSpider(SitemapSpider): def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw): self.start_url_generators = { 'start_urls': IdentityGenerator(), 'generated_urls': UrlGenerator(settings, kw), 'url': IdentityGenerator(), 'feed': FeedGenerator(self.parse), 'generated': FragmentGenerator(), } self.generic_form = GenericForm(**kw) super(IblSpider, self).__init__(name, **kw) spec = deepcopy(spec) self._add_spider_args_to_spec(spec, kw) self._configure_js(spec, settings) self.plugins = self._configure_plugins(settings, spec, item_schemas, all_extractors) self.login_requests, self.form_requests = [], [] self._start_urls = self._create_start_urls(spec) self._start_requests = self._create_start_requests(spec) self._create_init_requests(spec) self._add_allowed_domains(spec) self.page_actions = spec.get('page_actions', []) def _add_spider_args_to_spec(self, spec, args): for key, val in args.items(): if isinstance(val, six.string_types) and key in STRING_KEYS: val = val.splitlines() spec[key] = val def _create_start_urls(self, spec): url_type = spec.get('start_urls_type', 'start_urls') return StartUrlCollection( arg_to_iter(spec[url_type]), self.start_url_generators, ) def _create_start_requests(self, spec): init_requests = spec.get('init_requests', []) for rdata in init_requests: if rdata["type"] == "start": yield self._create_start_request_from_specs(rdata) for start_url in self._start_urls: if not isinstance(start_url, Request): start_url = Request(start_url, callback=self.parse, dont_filter=True) yield self._add_splash_meta(start_url) def _create_init_requests(self, spec): init_requests = spec.get('init_requests', []) for rdata in init_requests: if rdata["type"] == "login": request = Request(url=rdata.pop("loginurl"), meta=rdata, callback=self.parse_login_page, dont_filter=True) self._add_splash_meta(request) self.login_requests.append(request) elif rdata["type"] == "form": self.form_requests.append( self.get_generic_form_start_request(rdata)) def _add_allowed_domains(self, spec): self.allowed_domains = spec.get('allowed_domains', []) if self.allowed_domains is not None and not self.allowed_domains: self.allowed_domains = self._get_allowed_domains(spec) def parse_login_page(self, response): username = response.request.meta["username"] password = response.request.meta["password"] args, url, method = fill_login_form(response.url, response.body, username, password) return FormRequest(url, method=method, formdata=args, callback=self.after_login, dont_filter=True) def after_login(self, response): for result in self.parse(response): yield result for req in self._start_requests: yield req def get_generic_form_start_request(self, form_descriptor): file_fields = list(self.generic_form.get_url_field(form_descriptor)) if file_fields: (field_index, field_descriptor) = file_fields.pop(0) form_descriptor['field_index'] = field_index return FormRequest(self.generic_form.get_value(field_descriptor), meta=form_descriptor, callback=self.parse_field_url_page, dont_filter=True) else: return Request(url=form_descriptor.pop("form_url"), meta=form_descriptor, callback=self.parse_form_page, dont_filter=True) def parse_field_url_page(self, response): form_descriptor = response.request.meta field_index = form_descriptor['field_index'] field_descriptor = form_descriptor['fields'][field_index] self.generic_form.set_values_url_field(field_descriptor, response.text) yield self.get_generic_form_start_request(form_descriptor) def parse_form_page(self, response): fill_form = 
self.generic_form.fill_generic_form try: for (args, url, method) in fill_form(response.url, response.body, response.request.meta): yield FormRequest(url, method=method, formdata=args, callback=self.after_form_page, dont_filter=True) except Exception as e: self.logger.warning(str(e)) for req in self._start_requests: yield req def after_form_page(self, response): for result in self.parse(response): yield result def _get_allowed_domains(self, spec): urls = [x['url'] for x in spec['templates']] urls += [ x['url'] for x in spec.get('init_requests', []) if x['type'] == 'start' ] urls += self._start_urls.allowed_domains return [domain for scheme, domain in iter_unique_scheme_hostname(urls)] def start_requests(self): start_requests = [] if self.login_requests: start_requests = self.login_requests elif self.form_requests: start_requests = self.form_requests else: start_requests = self._start_requests for req in start_requests: yield req def _create_start_request_from_specs(self, info): url = info["url"] lspecs = info.get("link_extractor") if lspecs: linkextractor = create_linkextractor_from_specs(lspecs) def _callback(spider, response): for link in linkextractor.links_to_follow(response): request = Request(url=link.url, callback=spider.parse) yield self._add_splash_meta(request) request = Request(url=url, callback=_callback) return self._add_splash_meta(request) request = Request(url=url, callback=self.parse) return self._add_splash_meta(request) def parse(self, response): """Main handler for all downloaded responses""" request = response.request if (request and request.method == 'POST' and urlparse(request.url).hostname == self.SPLASH_HOST): url = json.loads(request.body.decode(request.encoding)).get('url') if url: response._url = url _type = content_type(response) if (isinstance(response, XmlResponse) or response.url.endswith( ('.xml', '.xml.gz')) or 'xml' in _type.subtype): sitemap_body = self._get_sitemap_body(response) if sitemap_body: response._set_body(self._get_sitemap_body(response)) return self.handle_xml(response) if isinstance(response, html_responses): return self.handle_html(response) self.logger.debug( "Ignoring page with content-type=%r: %s" % (response.headers.get('Content-Type', ''), response.url)) return [] def _configure_plugins(self, settings, spec, schemas, extractors): plugins = IndexedDict() for plugin_class, plugin_name in zip(load_plugins(settings), load_plugin_names(settings)): instance = plugin_class() instance.setup_bot(settings, self, spec, schemas, extractors, self.logger) plugins[plugin_name] = instance return plugins def _plugin_hook(self, name, *args): results = [] for plugin in self.plugins.values(): if hasattr(plugin, name): results.append(getattr(plugin, name)(*args)) return results def _handle(self, hook, response, *extrasrgs): generators = self._plugin_hook(hook, response, *extrasrgs) for item_or_request in itertools.chain(*generators): if isinstance(item_or_request, Request): self._plugin_hook('process_request', item_or_request, response) else: self._plugin_hook('process_item', item_or_request, response) if isinstance(item_or_request, Request): item_or_request = self._add_splash_meta(item_or_request) yield item_or_request def handle_xml(self, response): return self._handle('handle_xml', response, set([])) def handle_html(self, response): return self._handle('handle_html', response) def _configure_js(self, spec, settings): self.js_enabled = False self.SPLASH_HOST = None if settings.get('SPLASH_URL'): self.SPLASH_HOST = 
urlparse(settings.get('SPLASH_URL')).hostname self.js_enabled = spec.get('js_enabled', False) if self.js_enabled and (settings.get('SPLASH_PASS') is not None or settings.get('SPLASH_USER') is not None): self.splash_auth = basic_auth_header( settings.get('SPLASH_USER', ''), settings.get('SPLASH_PASS', '')) self.splash_wait = settings.getint('SPLASH_WAIT', 5) self.splash_timeout = settings.getint('SPLASH_TIMEOUT', 30) self.splash_js_source = settings.get('SPLASH_JS_SOURCE', 'function(){}') self.splash_lua_source = settings.get('SPLASH_LUA_SOURCE', DEFAULT_LUA_SOURCE) self._filter_js_urls = self._build_js_url_filter(spec) def _build_js_url_filter(self, spec): if not self.js_enabled: return lambda x: None enable_patterns = spec.get('js_enable_patterns') disable_patterns = spec.get('js_disable_patterns') return include_exclude_filter(enable_patterns, disable_patterns) def _add_splash_meta(self, request): if self.js_enabled and self._filter_js_urls(request.url): cleaned_url = urlparse(request.url)._replace(params='', query='', fragment='').geturl() request.meta['splash'] = { 'endpoint': 'execute', 'session_id': '{}-{}'.format(self.name, id(self)), 'args': { 'wait': self.splash_wait, 'timeout': self.splash_timeout, 'js_source': self.splash_js_source, 'lua_source': self.splash_lua_source, 'images': 0, 'url': request.url, 'baseurl': cleaned_url } } return request
class IblSpider(SitemapSpider): def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw): super(IblSpider, self).__init__(name, **kw) self._job_id = settings.get('JOB', '') spec = deepcopy(spec) for key, val in kw.items(): if isinstance(val, six.string_types) and key in STRING_KEYS: val = val.splitlines() spec[key] = val self._item_template_pages = sorted( ((t['scrapes'], t) for t in spec['templates'] if t.get('page_type', 'item') == 'item'), key=itemgetter(0)) self._templates = [templ for _, templ in self._item_template_pages] self.plugins = IndexedDict() for plugin_class, plugin_name in zip(load_plugins(settings), load_plugin_names(settings)): instance = plugin_class() instance.setup_bot(settings, spec, item_schemas, all_extractors) self.plugins[plugin_name] = instance self.js_enabled = False self.SPLASH_HOST = None if settings.get('SPLASH_URL'): self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname self.js_enabled = spec.get('js_enabled', False) if self.js_enabled and (settings.get('SPLASH_PASS') is not None or settings.get('SPLASH_USER') is not None): self.splash_auth = basic_auth_header( settings.get('SPLASH_USER', ''), settings.get('SPLASH_PASS', '')) self._filter_js_urls = self._build_js_url_filter(spec) self.login_requests = [] self.form_requests = [] self._start_requests = [] self.generic_form = GenericForm(**kw) self._create_init_requests(spec.get("init_requests", [])) self._process_start_urls(spec) self.allowed_domains = spec.get( 'allowed_domains', self._get_allowed_domains(self._templates)) self.page_actions = spec.get('page_actions', []) if not self.allowed_domains: self.allowed_domains = None def _process_start_urls(self, spec): self.start_urls = spec.get('start_urls') for url in self.start_urls: request = Request(url, callback=self.parse, dont_filter=True) self._add_splash_meta(request) self._start_requests.append(request) def _create_init_requests(self, spec): for rdata in spec: if rdata["type"] == "login": request = Request(url=rdata.pop("loginurl"), meta=rdata, callback=self.parse_login_page, dont_filter=True) self._add_splash_meta(request) self.login_requests.append(request) elif rdata["type"] == "form": self.form_requests.append( self.get_generic_form_start_request(rdata)) elif rdata["type"] == "start": self._start_requests.append( self._create_start_request_from_specs(rdata)) def parse_login_page(self, response): username = response.request.meta["username"] password = response.request.meta["password"] args, url, method = fill_login_form(response.url, response.body, username, password) return FormRequest(url, method=method, formdata=args, callback=self.after_login, dont_filter=True) def after_login(self, response): for result in self.parse(response): yield result for req in self._start_requests: yield req def get_generic_form_start_request(self, form_descriptor): file_fields = list(self.generic_form.get_url_field(form_descriptor)) if file_fields: (field_index, field_descriptor) = file_fields.pop(0) form_descriptor['field_index'] = field_index return FormRequest(self.generic_form.get_value(field_descriptor), meta=form_descriptor, callback=self.parse_field_url_page, dont_filter=True) else: return Request(url=form_descriptor.pop("form_url"), meta=form_descriptor, callback=self.parse_form_page, dont_filter=True) def parse_field_url_page(self, response): form_descriptor = response.request.meta field_index = form_descriptor['field_index'] field_descriptor = form_descriptor['fields'][field_index] 
self.generic_form.set_values_url_field(field_descriptor, response.body) yield self.get_generic_form_start_request(form_descriptor) def parse_form_page(self, response): fill_form = self.generic_form.fill_generic_form try: for (args, url, method) in fill_form(response.url, response.body, response.request.meta): yield FormRequest(url, method=method, formdata=args, callback=self.after_form_page, dont_filter=True) except Exception as e: self.logger.warning(str(e)) for req in self._start_requests: yield req def after_form_page(self, response): for result in self.parse(response): yield result def _get_allowed_domains(self, templates): urls = [x['url'] for x in templates] urls += [x.url for x in self._start_requests] return [x[1] for x in iter_unique_scheme_hostname(urls)] def start_requests(self): start_requests = [] if self.login_requests: start_requests = self.login_requests elif self.form_requests: start_requests = self.form_requests else: start_requests = self._start_requests for req in start_requests: yield req def _create_start_request_from_specs(self, info): url = info["url"] lspecs = info.get("link_extractor") if lspecs: linkextractor = create_linkextractor_from_specs(lspecs) def _callback(spider, response): for link in linkextractor.links_to_follow(response): request = Request(url=link.url, callback=spider.parse) yield self._add_splash_meta(request) request = Request(url=url, callback=_callback) return self._add_splash_meta(request) request = Request(url=url, callback=self.parse) return self._add_splash_meta(request) def parse(self, response): """Main handler for all downloaded responses""" request = response.request if (request and request.method == 'POST' and urlparse(request.url).hostname == self.SPLASH_HOST): url = (json.loads(request.body).get('url')) if url: response._url = url content_type = response.headers.get('Content-Type', '') if isinstance(response, HtmlResponse): return self.handle_html(response) if (isinstance(response, XmlResponse) or response.url.endswith( ('.xml', '.xml.gz'))): response._set_body(self._get_sitemap_body(response)) return self.handle_xml(response) self.logger.debug("Ignoring page with content-type=%r: %s" % (content_type, response.url)) return [] def _plugin_hook(self, name, *args): results = [] for plugin in self.plugins.values(): if hasattr(plugin, name): results.append(getattr(plugin, name)(*args)) return results def _handle(self, hook, response, *extrasrgs): generators = self._plugin_hook(hook, response, *extrasrgs) for item_or_request in itertools.chain(*generators): if isinstance(item_or_request, Request): self._plugin_hook('process_request', item_or_request, response) else: self._plugin_hook('process_item', item_or_request, response) if isinstance(item_or_request, Request): item_or_request = self._add_splash_meta(item_or_request) yield item_or_request def handle_xml(self, response): return self._handle('handle_xml', response, set([])) def handle_html(self, response): return self._handle('handle_html', response) def _build_js_url_filter(self, spec): if not self.js_enabled: return lambda x: None enable_patterns = spec.get('js_enable_patterns') disable_patterns = spec.get('js_disable_patterns') filterf = None enablef = None if enable_patterns: pattern = enable_patterns[0] if len(enable_patterns) == 1 else \ "(?:%s)" % '|'.join(enable_patterns) enablef = re.compile(pattern).search filterf = enablef if disable_patterns: pattern = disable_patterns[0] if len(disable_patterns) == 1 else \ "(?:%s)" % '|'.join(disable_patterns) disablef = 
re.compile(pattern).search if not enablef: filterf = lambda x: not disablef(x) else: filterf = lambda x: enablef(x) and not disablef(x) return filterf if filterf else lambda x: x def _add_splash_meta(self, request): if self.js_enabled and self._filter_js_urls(request.url): cleaned_url = urlparse(request.url)._replace(params='', query='', fragment='').geturl() request.meta['splash'] = { 'endpoint': 'render.html?job_id=%s' % self._job_id, 'args': { 'wait': 5, 'images': 0, 'url': request.url, 'baseurl': cleaned_url } } return request
class IblSpider(Spider): def __init__(self, name, spec, item_schemas, all_extractors, settings=None, **kw): super(IblSpider, self).__init__(name, **kw) self._job_id = settings.get('JOB', '') spec = deepcopy(spec) for key, val in kw.items(): if isinstance(val, six.string_types) and key in STRING_KEYS: val = val.splitlines() spec[key] = val self._item_template_pages = sorted( ((t['scrapes'], t) for t in spec['templates'] if t.get('page_type', 'item') == 'item'), key=itemgetter(0)) self._templates = [templ for _, templ in self._item_template_pages] self.plugins = IndexedDict() for plugin_class, plugin_name in zip(load_plugins(settings), load_plugin_names(settings)): instance = plugin_class() instance.setup_bot(settings, spec, item_schemas, all_extractors) self.plugins[plugin_name] = instance self.js_enabled = False self.SPLASH_HOST = None if settings.get('SPLASH_URL'): self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname self.js_enabled = spec.get('js_enabled', False) if self.js_enabled and (settings.get('SPLASH_PASS') is not None or settings.get('SPLASH_USER') is not None): self.splash_auth = basic_auth_header( settings.get('SPLASH_USER', ''), settings.get('SPLASH_PASS', '')) self._filter_js_urls = self._build_js_url_filter(spec) self.login_requests = [] self.form_requests = [] self._start_requests = [] self.generic_form = GenericForm(**kw) self._create_init_requests(spec.get("init_requests", [])) self._process_start_urls(spec) self.allowed_domains = spec.get( 'allowed_domains', self._get_allowed_domains(self._templates) ) if not self.allowed_domains: self.allowed_domains = None def _process_start_urls(self, spec): self.start_urls = spec.get('start_urls') for url in self.start_urls: request = Request(url, callback=self.parse, dont_filter=True) self._add_splash_meta(request) self._start_requests.append(request) def _create_init_requests(self, spec): for rdata in spec: if rdata["type"] == "login": request = Request(url=rdata.pop("loginurl"), meta=rdata, callback=self.parse_login_page, dont_filter=True) self._add_splash_meta(request) self.login_requests.append(request) elif rdata["type"] == "form": self.form_requests.append( self.get_generic_form_start_request(rdata) ) elif rdata["type"] == "start": self._start_requests.append( self._create_start_request_from_specs(rdata) ) def parse_login_page(self, response): username = response.request.meta["username"] password = response.request.meta["password"] args, url, method = fill_login_form(response.url, response.body, username, password) return FormRequest(url, method=method, formdata=args, callback=self.after_login, dont_filter=True) def after_login(self, response): for result in self.parse(response): yield result for req in self._start_requests: yield req def get_generic_form_start_request(self, form_descriptor): file_fields = list(self.generic_form.get_url_field(form_descriptor)) if file_fields: (field_index, field_descriptor) = file_fields.pop(0) form_descriptor['field_index'] = field_index return FormRequest(self.generic_form.get_value(field_descriptor), meta=form_descriptor, callback=self.parse_field_url_page, dont_filter=True) else: return Request(url=form_descriptor.pop("form_url"), meta=form_descriptor, callback=self.parse_form_page, dont_filter=True) def parse_field_url_page(self, response): form_descriptor = response.request.meta field_index = form_descriptor['field_index'] field_descriptor = form_descriptor['fields'][field_index] self.generic_form.set_values_url_field(field_descriptor, response.body) yield 
self.get_generic_form_start_request(form_descriptor) def parse_form_page(self, response): fill_form = self.generic_form.fill_generic_form try: for (args, url, method) in fill_form(response.url, response.body, response.request.meta): yield FormRequest(url, method=method, formdata=args, callback=self.after_form_page, dont_filter=True) except Exception as e: self.logger.warning(str(e)) for req in self._start_requests: yield req def after_form_page(self, response): for result in self.parse(response): yield result def _get_allowed_domains(self, templates): urls = [x['url'] for x in templates] urls += [x.url for x in self._start_requests] return [x[1] for x in iter_unique_scheme_hostname(urls)] def start_requests(self): start_requests = [] if self.login_requests: start_requests = self.login_requests elif self.form_requests: start_requests = self.form_requests else: start_requests = self._start_requests for req in start_requests: yield req def _create_start_request_from_specs(self, info): url = info["url"] lspecs = info.get("link_extractor") if lspecs: linkextractor = create_linkextractor_from_specs(lspecs) def _callback(spider, response): for link in linkextractor.links_to_follow(response): request = Request(url=link.url, callback=spider.parse) yield self._add_splash_meta(request) request = Request(url=url, callback=_callback) return self._add_splash_meta(request) request = Request(url=url, callback=self.parse) return self._add_splash_meta(request) def parse(self, response): """Main handler for all downloaded responses""" request = response.request if (request and request.method == 'POST' and urlparse(request.url).hostname == self.SPLASH_HOST): url = (json.loads(request.body).get('url')) if url: response._url = url content_type = response.headers.get('Content-Type', '') if isinstance(response, HtmlResponse): return self.handle_html(response) elif "application/rss+xml" in content_type: return self.handle_rss(response) else: self.logger.debug( "Ignoring page with content-type=%r: %s" % (content_type, response.url) ) return [] def _plugin_hook(self, name, *args): results = [] for plugin in self.plugins.values(): if hasattr(plugin, name): results.append(getattr(plugin, name)(*args)) return results def _handle(self, hook, response, *extrasrgs): generators = self._plugin_hook(hook, response, *extrasrgs) for item_or_request in itertools.chain(*generators): if isinstance(item_or_request, Request): self._plugin_hook('process_request', item_or_request, response) else: self._plugin_hook('process_item', item_or_request, response) if isinstance(item_or_request, Request): item_or_request = self._add_splash_meta(item_or_request) yield item_or_request def handle_rss(self, response): return self._handle('handle_rss', response, set([])) def handle_html(self, response): return self._handle('handle_html', response) def _build_js_url_filter(self, spec): if not self.js_enabled: return lambda x: None enable_patterns = spec.get('js_enable_patterns') disable_patterns = spec.get('js_disable_patterns') filterf = None enablef = None if enable_patterns: pattern = enable_patterns[0] if len(enable_patterns) == 1 else \ "(?:%s)" % '|'.join(enable_patterns) enablef = re.compile(pattern).search filterf = enablef if disable_patterns: pattern = disable_patterns[0] if len(disable_patterns) == 1 else \ "(?:%s)" % '|'.join(disable_patterns) disablef = re.compile(pattern).search if not enablef: filterf = lambda x: not disablef(x) else: filterf = lambda x: enablef(x) and not disablef(x) return filterf if filterf else lambda x: x def 
_add_splash_meta(self, request): if self.js_enabled and self._filter_js_urls(request.url): cleaned_url = urlparse(request.url)._replace(params='', query='', fragment='').geturl() request.meta['splash'] = { 'endpoint': 'render.html?job_id=%s' % self._job_id, 'args': { 'wait': 5, 'images': 0, 'url': request.url, 'baseurl': cleaned_url } } return request
class IblSpider(SitemapSpider):

    def __init__(self, name, spec, item_schemas, all_extractors,
                 settings=None, **kw):
        self.start_url_generators = {
            'start_urls': IdentityGenerator(),
            'generated_urls': UrlGenerator(settings, kw),
            'url': IdentityGenerator(),
            'generated': FragmentGenerator(),
            # 'feed_urls': FeedUrls(self, settings, kw)
        }
        self.generic_form = GenericForm(**kw)
        super(IblSpider, self).__init__(name, **kw)
        spec = deepcopy(spec)
        self._add_spider_args_to_spec(spec, kw)
        self.plugins = self._configure_plugins(
            settings, spec, item_schemas, all_extractors)
        self._configure_js(spec, settings)
        self.login_requests, self.form_requests = [], []
        self._start_urls = self._create_start_urls(spec)
        self._start_requests = self._create_start_requests(spec)
        self._create_init_requests(spec)
        self._add_allowed_domains(spec)
        self.page_actions = spec.get('page_actions', [])

    def _add_spider_args_to_spec(self, spec, args):
        for key, val in args.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

    def _create_start_urls(self, spec):
        url_type = spec.get('start_urls_type', 'start_urls')
        return StartUrlCollection(
            arg_to_iter(spec[url_type]),
            self.start_url_generators,
            url_type
        )

    def _create_start_requests(self, spec):
        init_requests = spec.get('init_requests', [])
        for rdata in init_requests:
            if rdata["type"] == "start":
                yield self._create_start_request_from_specs(rdata)
        for start_url in self._start_urls:
            if not isinstance(start_url, Request):
                start_url = Request(start_url, callback=self.parse,
                                    dont_filter=True)
            yield self._add_splash_meta(start_url)

    def _create_init_requests(self, spec):
        init_requests = spec.get('init_requests', [])
        for rdata in init_requests:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self._add_splash_meta(request)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata)
                )

    def _add_allowed_domains(self, spec):
        self.allowed_domains = spec.get('allowed_domains', [])
        if self.allowed_domains is not None and not self.allowed_domains:
            self.allowed_domains = self._get_allowed_domains(spec)

    def parse_login_page(self, response):
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body,
                                            username, password)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor),
                               meta=form_descriptor,
                               callback=self.parse_field_url_page,
                               dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor,
                           callback=self.parse_form_page, dont_filter=True)

    def parse_field_url_page(self, response):
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
        self.generic_form.set_values_url_field(field_descriptor,
                                               response.body)
        yield self.get_generic_form_start_request(form_descriptor)

    def parse_form_page(self, response):
        fill_form = self.generic_form.fill_generic_form
        try:
            for (args, url, method) in fill_form(response.url, response.body,
                                                 response.request.meta):
                yield FormRequest(url, method=method, formdata=args,
                                  callback=self.after_form_page,
                                  dont_filter=True)
        except Exception as e:
            self.logger.warning(str(e))
        for req in self._start_requests:
            yield req

    def after_form_page(self, response):
        for result in self.parse(response):
            yield result

    def _get_allowed_domains(self, spec):
        urls = [x['url'] for x in spec['templates']]
        urls += [x['url'] for x in spec.get('init_requests', [])
                 if x['type'] == 'start']
        urls += self._start_urls.allowed_domains
        return [domain for scheme, domain in
                iter_unique_scheme_hostname(urls)]

    def start_requests(self):
        start_requests = []
        if self.login_requests:
            start_requests = self.login_requests
        elif self.form_requests:
            start_requests = self.form_requests
        else:
            start_requests = self._start_requests
        for req in start_requests:
            yield req

    def _create_start_request_from_specs(self, info):
        url = info["url"]
        lspecs = info.get("link_extractor")
        if lspecs:
            linkextractor = create_linkextractor_from_specs(lspecs)

            def _callback(spider, response):
                for link in linkextractor.links_to_follow(response):
                    request = Request(url=link.url, callback=spider.parse)
                    yield self._add_splash_meta(request)

            request = Request(url=url, callback=_callback)
            return self._add_splash_meta(request)
        request = Request(url=url, callback=self.parse)
        return self._add_splash_meta(request)

    def parse(self, response):
        """Main handler for all downloaded responses"""
        request = response.request
        if (request and request.method == 'POST' and
                urlparse(request.url).hostname == self.SPLASH_HOST):
            url = json.loads(request.body).get('url')
            if url:
                response._url = url
        content_type = response.headers.get('Content-Type', '')
        if isinstance(response, HtmlResponse):
            return self.handle_html(response)
        if (isinstance(response, XmlResponse) or
                response.url.endswith(('.xml', '.xml.gz'))):
            response._set_body(self._get_sitemap_body(response))
            return self.handle_xml(response)
        self.logger.debug(
            "Ignoring page with content-type=%r: %s" % (content_type,
                                                        response.url)
        )
        return []

    def _configure_plugins(self, settings, spec, schemas, extractors):
        plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, schemas, extractors,
                               self.logger)
            plugins[plugin_name] = instance
        return plugins

    def _plugin_hook(self, name, *args):
        results = []
        for plugin in self.plugins.values():
            if hasattr(plugin, name):
                results.append(getattr(plugin, name)(*args))
        return results

    def _handle(self, hook, response, *extrasrgs):
        generators = self._plugin_hook(hook, response, *extrasrgs)
        for item_or_request in itertools.chain(*generators):
            if isinstance(item_or_request, Request):
                self._plugin_hook('process_request', item_or_request,
                                  response)
            else:
                self._plugin_hook('process_item', item_or_request, response)
            if isinstance(item_or_request, Request):
                item_or_request = self._add_splash_meta(item_or_request)
            yield item_or_request

    def handle_xml(self, response):
        return self._handle('handle_xml', response, set([]))

    def handle_html(self, response):
        return self._handle('handle_html', response)

    def _configure_js(self, spec, settings):
        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            self.js_enabled = spec.get('js_enabled', False)
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                                settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self.splash_wait = settings.getint('SPLASH_WAIT', 5)
        self.splash_timeout = settings.getint('SPLASH_TIMEOUT', 30)
        self.splash_js_source = settings.get(
            'SPLASH_JS_SOURCE', 'function(){}')
        self.splash_lua_source = settings.get('SPLASH_LUA_SOURCE', '')
        self._filter_js_urls = self._build_js_url_filter(spec)

    def _build_js_url_filter(self, spec):
        if not self.js_enabled:
            return lambda x: None
        enable_patterns = spec.get('js_enable_patterns')
        disable_patterns = spec.get('js_disable_patterns')
        return include_exclude_filter(enable_patterns, disable_patterns)

    def _add_splash_meta(self, request):
        if self.js_enabled and self._filter_js_urls(request.url):
            cleaned_url = urlparse(request.url)._replace(
                params='', query='', fragment='').geturl()
            endpoint = 'execute' if self.splash_lua_source else 'render.html'
            request.meta['splash'] = {
                'endpoint': endpoint,
                'args': {
                    'wait': self.splash_wait,
                    'timeout': self.splash_timeout,
                    'js_source': self.splash_js_source,
                    'lua_source': self.splash_lua_source,
                    'images': 0,
                    'url': request.url,
                    'baseurl': cleaned_url
                }
            }
        return request
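# NOTE (editor's sketch, an assumption rather than the real slybot helper):
# _build_js_url_filter above delegates to include_exclude_filter, whose
# implementation is not shown in this document. A regex-based predicate with
# the same shape could look like this:
import re


def include_exclude_filter(include_patterns=None, exclude_patterns=None):
    """Return a predicate accepting a URL only if it matches some include
    pattern (when any are given) and matches no exclude pattern."""
    includes = [re.compile(p) for p in (include_patterns or [])]
    excludes = [re.compile(p) for p in (exclude_patterns or [])]

    def _filter(url):
        if includes and not any(p.search(url) for p in includes):
            return False
        if any(p.search(url) for p in excludes):
            return False
        return True

    return _filter


# Example: render product pages with Splash, but never search result pages.
js_filter = include_exclude_filter(['/product/'], ['/search'])
assert js_filter('http://example.com/product/123')
assert not js_filter('http://example.com/search?q=tv')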
class IblSpider(BaseSpider):

    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)

        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = (
            InstanceBasedLearningExtractor(
                [(t, _links_item_descriptor) for t in _links_pages])
            if _links_pages else None)

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.allowed_domains = spec.get(
            'allowed_domains', self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None
        self.build_url_filter(spec)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(
                self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        for rdata in spec.get("init_requests", []):
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.generic_form = GenericForm(**kw)
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata))

    def parse_login_page(self, response):
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body,
                                            username, password)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        for result in self.parse(response):
            yield result
        for req in self._start_requests():
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor),
                               meta=form_descriptor,
                               callback=self.parse_field_url_page,
                               dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor,
                           callback=self.parse_form_page, dont_filter=True)

    def parse_field_url_page(self, response):
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
        self.generic_form.set_values_url_field(field_descriptor,
                                               response.body)
        yield self.get_generic_form_start_request(form_descriptor)

    def parse_form_page(self, response):
        try:
            for (args, url, method) in self.generic_form.fill_generic_form(
                    response.url, response.body, response.request.meta):
                yield FormRequest(url, method=method, formdata=args,
                                  callback=self.after_form_page,
                                  dont_filter=True)
        except Exception as e:
            self.log(str(e), log.WARNING)
        for req in self._start_requests():
            yield req
class IblSpider(Spider):

    def __init__(self, name, spec, item_schemas, all_extractors,
                 settings=None, **kw):
        super(IblSpider, self).__init__(name, **kw)
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, basestring) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'),
            key=itemgetter(0))
        self._templates = [templ for _, templ in self._item_template_pages]

        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get(
            'allowed_domains', self._get_allowed_domains(self._templates))
        if not self.allowed_domains:
            self.allowed_domains = None

    def _process_start_urls(self, spec):
        self.start_urls = spec.get('start_urls')
        for url in self.start_urls:
            self._start_requests.append(
                Request(url, callback=self.parse, dont_filter=True))

    def _create_init_requests(self, spec):
        for rdata in spec:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata))
            elif rdata["type"] == "start":
                self._start_requests.append(
                    self._create_start_request_from_specs(rdata))

    def parse_login_page(self, response):
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body,
                                            username, password)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor),
                               meta=form_descriptor,
                               callback=self.parse_field_url_page,
                               dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor,
                           callback=self.parse_form_page, dont_filter=True)

    def parse_field_url_page(self, response):
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
        self.generic_form.set_values_url_field(field_descriptor,
                                               response.body)
        yield self.get_generic_form_start_request(form_descriptor)

    def parse_form_page(self, response):
        fill_form = self.generic_form.fill_generic_form
        try:
            for (args, url, method) in fill_form(response.url, response.body,
                                                 response.request.meta):
                yield FormRequest(url, method=method, formdata=args,
                                  callback=self.after_form_page,
                                  dont_filter=True)
        except Exception as e:
            self.log(str(e), log.WARNING)
        for req in self._start_requests:
            yield req
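# NOTE (editor's illustration): all three IblSpider revisions above consume
# the same kind of `spec` mapping. The keys below are the ones actually read
# by the constructors and init-request handlers shown in this document; every
# value is invented for illustration only.
spec = {
    'start_urls': ['http://www.example.com/'],
    'allowed_domains': ['example.com'],
    'templates': [
        {
            'page_type': 'item',                    # only 'item' templates drive extraction
            'scrapes': 'product',                   # item class name used to group templates
            'url': 'http://www.example.com/item/1',
            'annotated_body': '<html>...</html>',   # annotated sample page
            'extractors': [],
        },
    ],
    'init_requests': [
        {
            'type': 'login',                        # handled by parse_login_page/after_login
            'loginurl': 'http://www.example.com/login',
            'username': 'user',
            'password': 'secret',
        },
        {
            'type': 'form',                         # handled by get_generic_form_start_request
            'form_url': 'http://www.example.com/search',
            'xpath': "//form[@name='search']",
            'fields': [],
        },
    ],
}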