Esempio n. 1
0
    def test_advanced_search_form_regex(self):
        """An 'iterate' field with a regex value ("[1-2]") must expand into
        one request per generated value, all other form defaults kept."""
        url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
        # Close the fixture file deterministically instead of leaking the
        # handle until garbage collection.
        with open(join(_PATH, "data", "ebay_advanced_search.html")) as html_file:
            body = html_file.read()
        form_descriptor = json.loads("""{
            "type": "form",
            "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
            "xpath": "//form[@name='adv_search_from']",
            "fields": [
                {
                  "xpath": ".//*[@name='_nkw']",
                  "type": "constants",
                  "value": ["Cars"]
                },
                {
                  "xpath": ".//*[@name='_in_kw']",
                  "type": "iterate",
                  "value": "[1-2]"
                }
            ]
        }""")

        generic_form = GenericForm()
        start_requests = list(generic_form.fill_generic_form(url, body, form_descriptor))
        # One expected request per expanded value of '_in_kw' ('1' and '2').
        expected_requests = [([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', u'Cars'), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '2'), ('_nkw', u'Cars'), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET')]
        self.assertEqual(start_requests, expected_requests)
Esempio n. 2
0
    def __init__(self,
                 name,
                 spec,
                 item_schemas,
                 all_extractors,
                 settings=None,
                 **kw):
        """Build the spider from a crawl *spec* dictionary.

        :param name: spider name, forwarded to the base spider.
        :param spec: crawl spec (templates, start urls, init requests, ...).
        :param item_schemas: item schema definitions used by the plugins.
        :param all_extractors: extractor definitions used by the plugins.
        :param settings: crawler settings, forwarded to the URL generator,
            JS configuration and plugin setup.
        :param kw: spider arguments, merged into the spec below.
        """
        # Map each start-url spec "type" to the generator that expands it
        # into concrete URLs/requests.
        self.start_url_generators = {
            'start_urls': IdentityGenerator(),
            'generated_urls': UrlGenerator(settings, kw),
            'url': IdentityGenerator(),
            'feed': FeedGenerator(self.parse),
            'generated': FragmentGenerator(),
        }
        # Created before the requests below, which may rely on it.
        self.generic_form = GenericForm(**kw)
        super(IblSpider, self).__init__(name, **kw)
        # Work on a private copy so merging spider args does not mutate the
        # caller's spec.
        spec = deepcopy(spec)
        self._add_spider_args_to_spec(spec, kw)
        self._configure_js(spec, settings)
        self.plugins = self._configure_plugins(settings, spec, item_schemas,
                                               all_extractors)

        self.login_requests, self.form_requests = [], []
        self._start_urls = self._create_start_urls(spec)
        self._start_requests = self._create_start_requests(spec)
        self._create_init_requests(spec)
        self._add_allowed_domains(spec)
        self.page_actions = spec.get('page_actions', [])
Esempio n. 3
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """Build the spider from its crawl spec (Python 2-era variant).

        :param name: spider name, forwarded to the base spider.
        :param spec: crawl spec with 'templates' and 'init_requests'.
        :param item_schemas: mapping of item class name -> schema.
        :param all_extractors: extractor definitions applied to descriptors.
        :param kw: spider arguments; string values for the list-valued keys
            below are split on newlines before being merged into the spec.
        """
        super(IblSpider, self).__init__(name, **kw)
        # Private copy: spider arguments are merged into the spec below.
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, basestring) and key in ['start_urls', 'exclude_patterns', 'follow_patterns', 'allowed_domains']:
                val = val.splitlines()
            spec[key] = val

        # [scrapes, page, extractors] triples for 'item' templates, sorted by
        # item class name (required for the groupby further down).
        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.build_url_filter(spec)

        # Per item-class extraction info, keyed by the item class name.
        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                # NOTE(review): this is the descriptor of the *last* page in
                # the group, leaked from the inner loop -- confirm intentional.
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None
Esempio n. 4
0
    def test_simple_search_form_with_file_type(self):
        """An 'inurl' field fetches its values from a file URL; one request
        is produced per value, compared order-insensitively."""
        url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
        # Close the fixture file deterministically instead of leaking the
        # handle until garbage collection.
        with open(join(_PATH, "data", "ebay_advanced_search.html")) as html_file:
            body = html_file.read()
        form_descriptor = json.loads("""{
            "type": "form",
            "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
            "xpath": "//form[@name='adv_search_from']",
            "fields": [
                {
                  "name": "my_param",
                  "type": "inurl",
                  "value": "file://%s/test_params.txt",
                  "file_values": ["Cars", "Boats", "Houses", "Electronics"]
                }
            ]
        }""" % join(_PATH, "data"))

        generic_form = GenericForm()
        start_requests = list(
            generic_form.fill_generic_form(url, body, form_descriptor))
        # One expected request per file value of 'my_param'.
        expected_requests = [
            ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'),
              ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'),
              ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'),
              (u'my_param', u'Cars'), ('_sasl', ''), ('_udlo', ''),
              ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'),
              ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'),
              ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'),
              ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'),
            ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'),
              ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'),
              ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'),
              (u'my_param', u'Boats'), ('_sasl', ''), ('_udlo', ''),
              ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'),
              ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'),
              ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'),
              ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'),
            ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'),
              ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'),
              ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'),
              (u'my_param', u'Houses'), ('_sasl', ''), ('_udlo', ''),
              ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'),
              ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'),
              ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'),
              ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'),
            ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'),
              ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'),
              ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'),
              (u'my_param', u'Electronics'), ('_sasl', ''), ('_udlo', ''),
              ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'),
              ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'),
              ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'),
              ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET')
        ]
        self.assertEqual(request_to_set(start_requests),
                         request_to_set(expected_requests))
Esempio n. 5
0
    def __init__(self,
                 name,
                 spec,
                 item_schemas,
                 all_extractors,
                 settings=None,
                 **kw):
        """Build the spider from its crawl spec, with Splash/JS support.

        :param settings: crawler settings; read for JOB and SPLASH_* values.
            NOTE(review): assumed non-None here -- a None ``settings`` would
            raise on ``settings.get('JOB', '')``.
        :param kw: spider arguments; string values for keys in STRING_KEYS
            are split on newlines before being merged into the spec.
        """
        super(IblSpider, self).__init__(name, **kw)
        self._job_id = settings.get('JOB', '')
        # Private copy: spider arguments are merged into the spec below.
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        # (scrapes, template) pairs for 'item' templates, sorted by the
        # scraped item class name.
        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'),
            key=itemgetter(0))

        self._templates = [templ for _, templ in self._item_template_pages]

        # Instantiate and initialise each configured plugin, preserving the
        # plugin order via IndexedDict.
        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        # JS rendering is only honoured when a Splash endpoint is configured.
        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            self.js_enabled = spec.get('js_enabled', False)
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None
                                or settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self._filter_js_urls = self._build_js_url_filter(spec)
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get(
            'allowed_domains', self._get_allowed_domains(self._templates))
        self.page_actions = spec.get('page_actions', [])
        if not self.allowed_domains:
            self.allowed_domains = None
Esempio n. 6
0
    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        """Set up the spider from its crawl spec.

        Spider arguments in *kw* are folded into a private copy of *spec*;
        string values for keys in STRING_KEYS become one item per line.
        Plugins are instantiated from the settings, then login/form/start
        requests are created from the spec.
        """
        super(IblSpider, self).__init__(name, **kw)
        spec = deepcopy(spec)
        # Merge spider arguments into the spec copy.
        for arg_name, arg_value in kw.items():
            needs_split = (arg_name in STRING_KEYS and
                           isinstance(arg_value, basestring))
            spec[arg_name] = arg_value.splitlines() if needs_split else arg_value

        # (scrapes, template) pairs for 'item' templates, ordered by the
        # scraped item class name.
        item_templates = ((t['scrapes'], t) for t in spec['templates']
                          if t.get('page_type', 'item') == 'item')
        self._item_template_pages = sorted(item_templates, key=itemgetter(0))

        self._templates = [tmpl for _, tmpl in self._item_template_pages]

        # Instantiate each configured plugin, keeping configuration order.
        self.plugins = IndexedDict()
        for plugin_cls, plugin_name in zip(load_plugins(settings),
                                           load_plugin_names(settings)):
            plugin = plugin_cls()
            plugin.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = plugin

        self.login_requests, self.form_requests = [], []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        default_domains = self._get_allowed_domains(self._templates)
        # An empty/falsy domain list means "no restriction" (None).
        self.allowed_domains = spec.get('allowed_domains', default_domains) or None
Esempio n. 7
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """Build the spider from its crawl spec (early variant).

        Builds per item-class extractors from the 'item' templates, a links
        extractor from the 'links' templates, and login/form requests from
        the spec's 'init_requests'.
        """
        super(IblSpider, self).__init__(name, **kw)

        # [scrapes, page, extractors] triples for 'item' templates, sorted by
        # item class name (required for the groupby further down).
        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        # start_urls may come from spider arguments or from the spec; a
        # newline-separated string becomes one URL per line.
        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)

        self.build_url_filter(spec)

        # Per item-class extraction info, keyed by the item class name.
        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                # NOTE(review): this is the descriptor of the *last* page in
                # the group, leaked from the inner loop -- confirm intentional.
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        for rdata in spec.get("init_requests", []):
            if rdata["type"] == "login":
                # 'loginurl' is popped so the remaining rdata can ride along
                # as request meta for parse_login_page.
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page, dont_filter=True)
                self.login_requests.append(request)

            elif rdata["type"] == "form":
                # NOTE(review): generic_form is (re)created once per 'form'
                # entry; only set at all when a form request exists.
                self.generic_form = GenericForm(**kw)
                self.form_requests.append(self.get_generic_form_start_request(rdata))
Esempio n. 8
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """Build the spider from its crawl spec (proxy-enabled variant).

        Records a start timestamp in ``self.i`` and loads a proxy list via
        ``self.getProxyList()`` before building the extractors.
        """
        super(IblSpider, self).__init__(name, **kw)

        # Private copy: spider arguments are merged into the spec below.
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, basestring) and key in ['start_urls', 'exclude_patterns', 'follow_patterns', 'allowed_domains']:
                val = val.splitlines()
            spec[key] = val
        # Indentation normalised to spaces: the original mixed tabs and
        # spaces here, which fails under ``python -tt`` and on Python 3.
        self.i = time.time()
        self.getProxyList()
        # [scrapes, page, extractors] triples for 'item' templates, sorted by
        # item class name (required for the groupby further down).
        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])
        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.build_url_filter(spec)

        # Per item-class extraction info, keyed by the item class name.
        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                # NOTE(review): descriptor of the *last* page in the group,
                # leaked from the inner loop -- confirm intentional.
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None
Esempio n. 9
0
    def test_simple_search_form_with_named_parameter(self):
        """A 'constants' field addressed by name ("my_param") is appended to
        the submitted form arguments."""
        url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
        # Close the fixture file deterministically instead of leaking the
        # handle until garbage collection.
        with open(join(_PATH, "data", "ebay_advanced_search.html")) as html_file:
            body = html_file.read()
        # NOTE(review): the original descriptor was corrupted by credential
        # redaction ("http://*****:*****@name=..."), which fused the
        # "form_url" and "xpath" entries; restored to match the sibling
        # tests in this file -- confirm against upstream history.
        form_descriptor = json.loads("""{
            "type": "form",
            "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
            "xpath": "//form[@name='adv_search_from']",
            "fields": [
                {
                  "name": "my_param",
                  "type": "constants",
                  "value": ["Cars"]
                }
            ]
        }""")

        generic_form = GenericForm()
        start_requests = list(generic_form.fill_generic_form(url, body, form_descriptor))
        expected_requests = [([('_in_kw', '1'), ('_udlo', ''), ('_ex_kw', ''), ('_nkw', ''), ('_ipg', '50'), ('_adv', '1'), ('_salic', '1'), ('_dmd', '1'), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_sop', '12'), (u'my_param', u'Cars'), ('_sasl', '')], 'http://www.ebay.com/sch/i.html', 'GET')]
        self.assertEqual(start_requests, expected_requests)
Esempio n. 10
0
    def test_simple_search_form_with_file_type(self):
        """An 'inurl' field fetches its values from a file URL; one request
        is produced per value, in file order."""
        url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
        # Close the fixture file deterministically instead of leaking the
        # handle until garbage collection.
        with open(join(_PATH, "data", "ebay_advanced_search.html")) as html_file:
            body = html_file.read()
        form_descriptor = json.loads("""{
            "type": "form",
            "form_url": "http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
            "xpath": "//form[@name='adv_search_from']",
            "fields": [
                {
                  "name": "my_param",
                  "type": "inurl",
                  "value": "file://%s/test_params.txt",
                  "file_values": ["Cars", "Boats", "Houses", "Electronics"]
                }
            ]
        }""" % join(_PATH, "data"))

        generic_form = GenericForm()
        start_requests = list(generic_form.fill_generic_form(url, body, form_descriptor))
        # One expected request per file value of 'my_param'.
        expected_requests = [([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Cars'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Boats'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Houses'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET'), ([('_adv', '1'), ('_ex_kw', ''), ('_ftrv', '1'), ('_ftrt', '901'), ('_sabdlo', u''), ('_sabdhi', u''), ('_sop', '12'), ('_samihi', u''), ('_ipg', '50'), ('_salic', '1'), (u'my_param', u'Electronics'), ('_sasl', ''), ('_udlo', ''), ('_okw', u''), ('_fsradio', '&LH_SpecificSeller=1'), ('_udhi', ''), ('_in_kw', '1'), ('_nkw', ''), ('_sacat', '0'), ('_oexkw', u''), ('_dmd', '1'), ('_saslop', '1'), ('_samilow', u'')], 'http://www.ebay.com/sch/i.html', 'GET')]
        self.assertEqual(start_requests, expected_requests)
Esempio n. 11
0
    def __init__(self,
                 name,
                 spec,
                 item_schemas,
                 all_extractors,
                 settings=None,
                 **kw):
        """Build the spider from its crawl spec (plugin-based variant).

        :param kw: spider arguments; string values for keys in STRING_KEYS
            are split on newlines before being merged into the spec.
        """
        super(IblSpider, self).__init__(name, **kw)
        # Private copy: spider arguments are merged into the spec below.
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, basestring) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        # (scrapes, template) pairs for 'item' templates, sorted by the
        # scraped item class name.
        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'),
            key=itemgetter(0))

        self._templates = [templ for _, templ in self._item_template_pages]

        # Instantiate each configured plugin, preserving configuration order.
        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        # Fall back to domains derived from the templates; an empty result
        # becomes None ("no restriction").
        self.allowed_domains = spec.get(
            'allowed_domains', self._get_allowed_domains(self._templates))
        if not self.allowed_domains:
            self.allowed_domains = None
Esempio n. 12
0
    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        """Build the spider from its crawl spec, with 'actions' support.

        :param settings: crawler settings, forwarded to the URL generator,
            JS configuration and plugin setup.
        :param kw: spider arguments, merged into a private copy of the spec.
        """
        # Map each start-url spec "type" to the generator that expands it
        # into concrete URLs/requests.
        self.start_url_generators = {
            'start_urls': IdentityGenerator(),
            'generated_urls': UrlGenerator(settings, kw),
            'url': IdentityGenerator(),
            'feed': FeedGenerator(self.parse),
            'generated': FragmentGenerator(),
        }
        # Created before the requests below, which may rely on it.
        self.generic_form = GenericForm(**kw)
        super(IblSpider, self).__init__(name, **kw)
        spec = deepcopy(spec)
        self._add_spider_args_to_spec(spec, kw)

        # Page actions configured in the spec.  (A commented-out experiment
        # that force-enabled JS for action URLs was removed as dead code.)
        self.actions = spec.get('actions', [])

        self._configure_js(spec, settings)
        self.plugins = self._configure_plugins(
            settings, spec, item_schemas, all_extractors)

        self.login_requests, self.form_requests = [], []
        self._start_urls = self._create_start_urls(spec)
        self._start_requests = self._create_start_requests(spec)
        self._create_init_requests(spec)
        self._add_allowed_domains(spec)
        self.page_actions = spec.get('page_actions', [])
Esempio n. 13
0
    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        """Build the spider from its crawl spec, with Splash/JS support.

        :param settings: crawler settings; read for JOB and SPLASH_* values.
            NOTE(review): assumed non-None here -- a None ``settings`` would
            raise on ``settings.get('JOB', '')``.
        """
        super(IblSpider, self).__init__(name, **kw)
        self._job_id = settings.get('JOB', '')
        # Private copy: spider arguments are merged into the spec below.
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        # (scrapes, template) pairs for 'item' templates, sorted by the
        # scraped item class name.
        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'), key=itemgetter(0))

        self._templates = [templ for _, templ in self._item_template_pages]

        # Instantiate each configured plugin, preserving configuration order.
        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        # JS rendering is only honoured when a Splash endpoint is configured.
        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            self.js_enabled = spec.get('js_enabled', False)
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                                settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self._filter_js_urls = self._build_js_url_filter(spec)
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get(
            'allowed_domains',
            self._get_allowed_domains(self._templates)
        )
        self.page_actions = spec.get('page_actions', [])
        if not self.allowed_domains:
            self.allowed_domains = None
Esempio n. 14
0
File: spider.py Progetto: 01-/portia
 def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
              **kw):
     """Build the spider from its crawl spec.

     NOTE(review): this snippet uses a nonstandard 1-space/5-space indent,
     preserved here as pasted -- confirm against the original file.
     """
     # Map each start-url spec "type" to the generator that expands it.
     self.start_url_generators = {
         'start_urls': StartUrls(),
         'generated_urls': UrlGenerator(settings, kw)
     }
     # Created before the init requests below, which may rely on it.
     self.generic_form = GenericForm(**kw)
     super(IblSpider, self).__init__(name, **kw)
     # Private copy: spider arguments are folded into the spec.
     spec = deepcopy(spec)
     self._add_spider_args_to_spec(spec, kw)
     self.plugins = self._configure_plugins(
         settings, spec, item_schemas, all_extractors)
     self._configure_js(spec, settings)
     self.login_requests, self.form_requests = [], []
     self._start_requests = []
     self._create_init_requests(spec)
     self._process_start_urls(spec)
     self._add_allowed_domains(spec)
     self.page_actions = spec.get('page_actions', [])
Esempio n. 15
0
class IblSpider(Spider):
    """Spider driven by a declarative crawl spec.

    The spec supplies templates, start URLs and 'init_requests' (logins and
    generic form submissions) which are turned into Scrapy requests here.
    NOTE(review): Python 2-only syntax (``except Exception, e``) below.
    """

    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        """Build the spider from its crawl spec.

        :param kw: spider arguments; string values for keys in STRING_KEYS
            are split on newlines before being merged into the spec.
        """
        super(IblSpider, self).__init__(name, **kw)
        # Private copy: spider arguments are merged into the spec below.
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, basestring) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        # (scrapes, template) pairs for 'item' templates, sorted by the
        # scraped item class name.
        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'), key=itemgetter(0))

        self._templates = [templ for _, templ in self._item_template_pages]

        # Instantiate each configured plugin, preserving configuration order.
        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        # Fall back to domains derived from the templates; an empty result
        # becomes None ("no restriction").
        self.allowed_domains = spec.get(
            'allowed_domains',
            self._get_allowed_domains(self._templates)
        )
        if not self.allowed_domains:
            self.allowed_domains = None

    def _process_start_urls(self, spec):
        """Queue one unfiltered request per start URL from the spec."""
        self.start_urls = spec.get('start_urls')
        for url in self.start_urls:
            self._start_requests.append(Request(url, callback=self.parse,
                                                dont_filter=True))

    def _create_init_requests(self, spec):
        """Sort each init-request descriptor into login/form/start queues.

        'loginurl' is popped so the remaining descriptor can ride along as
        request meta for parse_login_page.
        """
        for rdata in spec:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata)
                )
            elif rdata["type"] == "start":
                self._start_requests.append(
                    self._create_start_request_from_specs(rdata)
                )

    def parse_login_page(self, response):
        """Fill the login form on *response* and submit it.

        Credentials travel in the request meta set by _create_init_requests.
        """
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body,
                                            username, password)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        """Parse the post-login page, then release the queued start requests."""
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        """Return the first request needed to fill a generic form.

        If a field takes its values from a URL, fetch that URL first
        (parse_field_url_page); otherwise fetch the form page directly.
        """
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor),
                               meta=form_descriptor,
                               callback=self.parse_field_url_page,
                               dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor, callback=self.parse_form_page,
                           dont_filter=True)

    def parse_field_url_page(self, response):
        """Store the fetched field values, then continue the form flow."""
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
        self.generic_form.set_values_url_field(field_descriptor, response.body)
        yield self.get_generic_form_start_request(form_descriptor)

    def parse_form_page(self, response):
        """Submit one request per filled form variant, then the start requests.

        Form-filling errors are logged as warnings (best-effort) rather than
        aborting the crawl.
        """
        fill_form = self.generic_form.fill_generic_form
        try:
            for (args, url, method) in fill_form(response.url, response.body,
                                                 response.request.meta):
                yield FormRequest(url, method=method, formdata=args,
                                  callback=self.after_form_page,
                                  dont_filter=True)
        except Exception, e:
            self.log(str(e), log.WARNING)
        for req in self._start_requests:
            yield req
Esempio n. 16
0
class IblSpider(BaseSpider):
    """Slybot spider driven by a project spec (Python 2 variant).

    Builds an instance-based-learning extractor per item class from the
    spec's annotated template pages, and supports optional "init requests"
    (login, generic form submission, extra start requests) that run before
    the regular start URLs are crawled.
    """

    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """Build templates, per-class extractors and init requests from *spec*."""
        super(IblSpider, self).__init__(name, **kw)

        # Triples of [item class name, annotated page, extractor ids] for
        # templates marked (or defaulting to) page_type 'item', sorted by
        # item class name so itertools.groupby below sees contiguous groups.
        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.build_url_filter(spec)

        # Per item class: the generated item class, its descriptor, and the
        # IBL extractor trained on that class's template pages.
        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        # Requests that must run before normal crawling; populated from the
        # spec's "init_requests" entries and the configured start URLs.
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            # Falsy (empty) domains list means "no domain restriction".
            self.allowed_domains = None

    def _process_start_urls(self, spec):
        """Queue a dont_filter Request for every configured start URL."""
        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            # A newline-separated string is accepted as well as a list.
            self.start_urls = self.start_urls.splitlines()
        for url in self.start_urls:
            self._start_requests.append(Request(url, callback=self.parse, dont_filter=True))

    def _create_init_requests(self, spec):
        """Sort each init-request descriptor into the login/form/start queues."""
        for rdata in spec:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page, dont_filter=True)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(self.get_generic_form_start_request(rdata))
            elif rdata["type"] == "start":
                self._start_requests.append(self._create_start_request_from_specs(rdata))

    def parse_login_page(self, response):
        """Fill and submit the login form found on the fetched login page."""
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body, username, password)
        return FormRequest(url, method=method, formdata=args, callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        """Parse the post-login page, then release the deferred start requests."""
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        """Return the first request for a generic form descriptor.

        Fields whose values come from a URL are fetched first (the field
        index is stashed in the request meta); otherwise the form page
        itself is requested.
        """
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor), meta=form_descriptor,
                              callback=self.parse_field_url_page, dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"), meta=form_descriptor,
                                  callback=self.parse_form_page, dont_filter=True)

    def parse_field_url_page(self, response):
        """Store the fetched values for a URL-backed field and continue the flow."""
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
        self.generic_form.set_values_url_field(field_descriptor, response.body)
        yield self.get_generic_form_start_request(form_descriptor)

    def parse_form_page(self, response):
        """Submit every filled form combination, then the start requests."""
        fill_form = self.generic_form.fill_generic_form
        try:
            for (args, url, method) in fill_form(response.url, response.body,
                                                 response.request.meta):
                yield FormRequest(url, method=method, formdata=args,
                                  callback=self.after_form_page,
                                  dont_filter=True)
        except Exception, e:
            # Best effort: a malformed form only logs a warning; the start
            # requests are still released below.
            self.log(str(e), log.WARNING)
        for req in self._start_requests:
            yield req
Esempio n. 17
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """Build templates, per-class extractors and init requests from *spec*.

        NOTE(review): unlike the other spider variants in this file,
        ``self.generic_form`` is only created when a "form" init request is
        present — confirm no other code path relies on the attribute.
        """
        super(IblSpider, self).__init__(name, **kw)

        # Triples of [item class name, annotated page, extractor ids] for
        # templates marked (or defaulting to) page_type 'item', sorted by
        # item class name so itertools.groupby below sees contiguous groups.
        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        # Start URLs may be given as a list or a newline-separated string.
        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            # Falsy (empty) domains list means "no domain restriction".
            self.allowed_domains = None
        self.build_url_filter(spec)

        # Per item class: the generated item class, its descriptor, and the
        # IBL extractor trained on that class's template pages.
        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        # Requests that must run before normal crawling, built from the
        # spec's "init_requests" entries.
        self.login_requests = []
        self.form_requests = []
        for rdata in spec.get("init_requests", []):
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page, dont_filter=True)
                self.login_requests.append(request)

            elif rdata["type"] == "form":
                self.generic_form = GenericForm(**kw)
                self.form_requests.append(self.get_generic_form_start_request(rdata))
Esempio n. 18
0
class IblSpider(SitemapSpider):
    """Sitemap-aware slybot spider configured entirely from a project spec.

    Extraction is delegated to plugin objects loaded from settings; this
    class wires up start URLs, init requests (login / generic form),
    optional Splash (javascript) rendering, and allowed-domain derivation.
    """

    def __init__(self,
                 name,
                 spec,
                 item_schemas,
                 all_extractors,
                 settings=None,
                 **kw):
        """Configure the spider from *spec*, *settings* and spider args *kw*."""
        # Strategies for turning the spec's start-url entries into URLs or
        # Requests, keyed by the entry's declared type.
        self.start_url_generators = {
            'start_urls': IdentityGenerator(),
            'generated_urls': UrlGenerator(settings, kw),
            'url': IdentityGenerator(),
            'feed': FeedGenerator(self.parse),
            'generated': FragmentGenerator(),
        }
        self.generic_form = GenericForm(**kw)
        super(IblSpider, self).__init__(name, **kw)
        # Work on a private copy: spider args are merged into the spec below.
        spec = deepcopy(spec)
        self._add_spider_args_to_spec(spec, kw)
        self._configure_js(spec, settings)
        self.plugins = self._configure_plugins(settings, spec, item_schemas,
                                               all_extractors)

        self.login_requests, self.form_requests = [], []
        self._start_urls = self._create_start_urls(spec)
        self._start_requests = self._create_start_requests(spec)
        self._create_init_requests(spec)
        self._add_allowed_domains(spec)
        self.page_actions = spec.get('page_actions', [])

    def _add_spider_args_to_spec(self, spec, args):
        """Merge spider arguments into *spec*, splitting known string keys into lines."""
        for key, val in args.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

    def _create_start_urls(self, spec):
        """Wrap the spec's start-url entries in a StartUrlCollection."""
        url_type = spec.get('start_urls_type', 'start_urls')
        return StartUrlCollection(
            arg_to_iter(spec[url_type]),
            self.start_url_generators,
        )

    def _create_start_requests(self, spec):
        """Yield requests for 'start' init-requests followed by every start URL."""
        init_requests = spec.get('init_requests', [])
        for rdata in init_requests:
            if rdata["type"] == "start":
                yield self._create_start_request_from_specs(rdata)

        for start_url in self._start_urls:
            if not isinstance(start_url, Request):
                start_url = Request(start_url,
                                    callback=self.parse,
                                    dont_filter=True)
            # Attach Splash rendering meta when javascript is enabled.
            yield self._add_splash_meta(start_url)

    def _create_init_requests(self, spec):
        """Queue login and generic-form init requests declared in the spec."""
        init_requests = spec.get('init_requests', [])
        for rdata in init_requests:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"),
                                  meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self._add_splash_meta(request)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata))

    def _add_allowed_domains(self, spec):
        """Set allowed_domains from the spec, deriving them when the list is empty."""
        self.allowed_domains = spec.get('allowed_domains', [])
        # An explicit None disables the restriction; an empty list means
        # "derive from templates and start urls".
        if self.allowed_domains is not None and not self.allowed_domains:
            self.allowed_domains = self._get_allowed_domains(spec)

    def parse_login_page(self, response):
        """Fill and submit the login form found on the fetched login page."""
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body,
                                            username, password)
        return FormRequest(url,
                           method=method,
                           formdata=args,
                           callback=self.after_login,
                           dont_filter=True)

    def after_login(self, response):
        """Parse the post-login page, then release the deferred start requests."""
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        """Return the first request for a generic form descriptor.

        Fields whose values come from a URL are fetched first (the field
        index is stashed in the request meta); otherwise the form page
        itself is requested.
        """
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor),
                               meta=form_descriptor,
                               callback=self.parse_field_url_page,
                               dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor,
                           callback=self.parse_form_page,
                           dont_filter=True)

    def parse_field_url_page(self, response):
        """Store the fetched values for a URL-backed field and continue the flow."""
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
        self.generic_form.set_values_url_field(field_descriptor, response.text)
        yield self.get_generic_form_start_request(form_descriptor)

    def parse_form_page(self, response):
        """Submit every filled form combination, then the start requests."""
        fill_form = self.generic_form.fill_generic_form
        try:
            for (args, url, method) in fill_form(response.url, response.body,
                                                 response.request.meta):
                yield FormRequest(url,
                                  method=method,
                                  formdata=args,
                                  callback=self.after_form_page,
                                  dont_filter=True)
        except Exception as e:
            # Best effort: a malformed form only logs a warning; the start
            # requests are still released below.
            self.logger.warning(str(e))
        for req in self._start_requests:
            yield req

    def after_form_page(self, response):
        """Parse a page reached by submitting a generic form."""
        for result in self.parse(response):
            yield result

    def _get_allowed_domains(self, spec):
        """Derive allowed domains from template, start-request and start URLs."""
        urls = [x['url'] for x in spec['templates']]
        urls += [
            x['url'] for x in spec.get('init_requests', [])
            if x['type'] == 'start'
        ]
        urls += self._start_urls.allowed_domains
        return [domain for scheme, domain in iter_unique_scheme_hostname(urls)]

    def start_requests(self):
        """Begin with login requests if any, else form requests, else start requests."""
        start_requests = []
        if self.login_requests:
            start_requests = self.login_requests
        elif self.form_requests:
            start_requests = self.form_requests
        else:
            start_requests = self._start_requests
        for req in start_requests:
            yield req

    def _create_start_request_from_specs(self, info):
        """Build a Request from a 'start' init-request descriptor.

        When a link_extractor spec is present, the request's callback
        follows the extracted links instead of parsing the page directly.
        """
        url = info["url"]
        lspecs = info.get("link_extractor")
        if lspecs:
            linkextractor = create_linkextractor_from_specs(lspecs)

            def _callback(spider, response):
                # Follow every extracted link with the normal parse handler.
                for link in linkextractor.links_to_follow(response):
                    request = Request(url=link.url, callback=spider.parse)
                    yield self._add_splash_meta(request)

            request = Request(url=url, callback=_callback)
            return self._add_splash_meta(request)
        request = Request(url=url, callback=self.parse)
        return self._add_splash_meta(request)

    def parse(self, response):
        """Main handler for all downloaded responses"""
        request = response.request
        if (request and request.method == 'POST'
                and urlparse(request.url).hostname == self.SPLASH_HOST):
            # Splash responses carry the real target URL inside the POST
            # body; restore it so downstream logic sees the page URL.
            url = json.loads(request.body.decode(request.encoding)).get('url')
            if url:
                response._url = url
        _type = content_type(response)
        if (isinstance(response, XmlResponse) or response.url.endswith(
            ('.xml', '.xml.gz')) or 'xml' in _type.subtype):
            # Sitemaps may be gzip-compressed; swap in the decoded body.
            sitemap_body = self._get_sitemap_body(response)
            if sitemap_body:
                response._set_body(self._get_sitemap_body(response))
            return self.handle_xml(response)
        if isinstance(response, html_responses):
            return self.handle_html(response)
        self.logger.debug(
            "Ignoring page with content-type=%r: %s" %
            (response.headers.get('Content-Type', ''), response.url))
        return []

    def _configure_plugins(self, settings, spec, schemas, extractors):
        """Instantiate and set up every configured extraction plugin, by name."""
        plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, self, spec, schemas, extractors,
                               self.logger)
            plugins[plugin_name] = instance
        return plugins

    def _plugin_hook(self, name, *args):
        """Call hook *name* on every plugin that defines it; collect results."""
        results = []
        for plugin in self.plugins.values():
            if hasattr(plugin, name):
                results.append(getattr(plugin, name)(*args))
        return results

    def _handle(self, hook, response, *extrasrgs):
        """Run *hook* on all plugins and post-process the yielded items/requests.

        (sic: the *extrasrgs* parameter name is kept as-is.)
        """
        generators = self._plugin_hook(hook, response, *extrasrgs)
        for item_or_request in itertools.chain(*generators):
            if isinstance(item_or_request, Request):
                self._plugin_hook('process_request', item_or_request, response)
            else:
                self._plugin_hook('process_item', item_or_request, response)
            if isinstance(item_or_request, Request):
                item_or_request = self._add_splash_meta(item_or_request)
            yield item_or_request

    def handle_xml(self, response):
        """Dispatch an XML response to the plugins' handle_xml hooks."""
        return self._handle('handle_xml', response, set([]))

    def handle_html(self, response):
        """Dispatch an HTML response to the plugins' handle_html hooks."""
        return self._handle('handle_html', response)

    def _configure_js(self, spec, settings):
        """Read the Splash-related settings and build the js URL filter."""
        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            # js is only honoured when a Splash endpoint is configured.
            self.js_enabled = spec.get('js_enabled', False)
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None
                                or settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self.splash_wait = settings.getint('SPLASH_WAIT', 5)
        self.splash_timeout = settings.getint('SPLASH_TIMEOUT', 30)
        self.splash_js_source = settings.get('SPLASH_JS_SOURCE',
                                             'function(){}')
        self.splash_lua_source = settings.get('SPLASH_LUA_SOURCE',
                                              DEFAULT_LUA_SOURCE)
        self._filter_js_urls = self._build_js_url_filter(spec)

    def _build_js_url_filter(self, spec):
        """Return a predicate deciding whether a URL is rendered with Splash."""
        if not self.js_enabled:
            return lambda x: None
        enable_patterns = spec.get('js_enable_patterns')
        disable_patterns = spec.get('js_disable_patterns')
        return include_exclude_filter(enable_patterns, disable_patterns)

    def _add_splash_meta(self, request):
        """Attach Splash 'execute' meta to *request* when js rendering applies."""
        if self.js_enabled and self._filter_js_urls(request.url):
            # baseurl is the request URL stripped of params/query/fragment.
            cleaned_url = urlparse(request.url)._replace(params='',
                                                         query='',
                                                         fragment='').geturl()
            request.meta['splash'] = {
                'endpoint': 'execute',
                'session_id': '{}-{}'.format(self.name, id(self)),
                'args': {
                    'wait': self.splash_wait,
                    'timeout': self.splash_timeout,
                    'js_source': self.splash_js_source,
                    'lua_source': self.splash_lua_source,
                    'images': 0,
                    'url': request.url,
                    'baseurl': cleaned_url
                }
            }
        return request
Esempio n. 19
0
class IblSpider(SitemapSpider):
    """Sitemap-aware slybot spider (plugin-based variant with render.html Splash).

    Differs from the other SitemapSpider variant in this file by using the
    Splash 'render.html' endpoint with a job id and a hand-rolled regex
    URL filter for javascript rendering.
    """

    def __init__(self,
                 name,
                 spec,
                 item_schemas,
                 all_extractors,
                 settings=None,
                 **kw):
        """Configure the spider from *spec*, *settings* and spider args *kw*."""
        super(IblSpider, self).__init__(name, **kw)
        self._job_id = settings.get('JOB', '')
        # Work on a private copy: spider args are merged into the spec below.
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        # (item class name, template) pairs for item pages, sorted by class.
        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'),
            key=itemgetter(0))

        self._templates = [templ for _, templ in self._item_template_pages]

        # Extraction plugins loaded from settings, addressable by name.
        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            # js is only honoured when a Splash endpoint is configured.
            self.js_enabled = spec.get('js_enabled', False)
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None
                                or settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self._filter_js_urls = self._build_js_url_filter(spec)
        # Requests that must run before normal crawling, from init_requests.
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get(
            'allowed_domains', self._get_allowed_domains(self._templates))
        self.page_actions = spec.get('page_actions', [])
        if not self.allowed_domains:
            # Falsy (empty) domains list means "no domain restriction".
            self.allowed_domains = None

    def _process_start_urls(self, spec):
        """Queue a dont_filter Request (with Splash meta) per start URL."""
        self.start_urls = spec.get('start_urls')
        for url in self.start_urls:
            request = Request(url, callback=self.parse, dont_filter=True)
            self._add_splash_meta(request)
            self._start_requests.append(request)

    def _create_init_requests(self, spec):
        """Sort each init-request descriptor into the login/form/start queues."""
        for rdata in spec:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"),
                                  meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self._add_splash_meta(request)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata))
            elif rdata["type"] == "start":
                self._start_requests.append(
                    self._create_start_request_from_specs(rdata))

    def parse_login_page(self, response):
        """Fill and submit the login form found on the fetched login page."""
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body,
                                            username, password)
        return FormRequest(url,
                           method=method,
                           formdata=args,
                           callback=self.after_login,
                           dont_filter=True)

    def after_login(self, response):
        """Parse the post-login page, then release the deferred start requests."""
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        """Return the first request for a generic form descriptor.

        Fields whose values come from a URL are fetched first (the field
        index is stashed in the request meta); otherwise the form page
        itself is requested.
        """
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor),
                               meta=form_descriptor,
                               callback=self.parse_field_url_page,
                               dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor,
                           callback=self.parse_form_page,
                           dont_filter=True)

    def parse_field_url_page(self, response):
        """Store the fetched values for a URL-backed field and continue the flow."""
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
        self.generic_form.set_values_url_field(field_descriptor, response.body)
        yield self.get_generic_form_start_request(form_descriptor)

    def parse_form_page(self, response):
        """Submit every filled form combination, then the start requests."""
        fill_form = self.generic_form.fill_generic_form
        try:
            for (args, url, method) in fill_form(response.url, response.body,
                                                 response.request.meta):
                yield FormRequest(url,
                                  method=method,
                                  formdata=args,
                                  callback=self.after_form_page,
                                  dont_filter=True)
        except Exception as e:
            # Best effort: a malformed form only logs a warning; the start
            # requests are still released below.
            self.logger.warning(str(e))
        for req in self._start_requests:
            yield req

    def after_form_page(self, response):
        """Parse a page reached by submitting a generic form."""
        for result in self.parse(response):
            yield result

    def _get_allowed_domains(self, templates):
        """Derive allowed domains from template URLs and queued start requests."""
        urls = [x['url'] for x in templates]
        urls += [x.url for x in self._start_requests]
        return [x[1] for x in iter_unique_scheme_hostname(urls)]

    def start_requests(self):
        """Begin with login requests if any, else form requests, else start requests."""
        start_requests = []
        if self.login_requests:
            start_requests = self.login_requests
        elif self.form_requests:
            start_requests = self.form_requests
        else:
            start_requests = self._start_requests
        for req in start_requests:
            yield req

    def _create_start_request_from_specs(self, info):
        """Build a Request from a 'start' init-request descriptor.

        When a link_extractor spec is present, the request's callback
        follows the extracted links instead of parsing the page directly.
        """
        url = info["url"]
        lspecs = info.get("link_extractor")
        if lspecs:
            linkextractor = create_linkextractor_from_specs(lspecs)

            def _callback(spider, response):
                # Follow every extracted link with the normal parse handler.
                for link in linkextractor.links_to_follow(response):
                    request = Request(url=link.url, callback=spider.parse)
                    yield self._add_splash_meta(request)

            request = Request(url=url, callback=_callback)
            return self._add_splash_meta(request)
        request = Request(url=url, callback=self.parse)
        return self._add_splash_meta(request)

    def parse(self, response):
        """Main handler for all downloaded responses"""
        request = response.request
        if (request and request.method == 'POST'
                and urlparse(request.url).hostname == self.SPLASH_HOST):
            # Splash responses carry the real target URL inside the POST
            # body; restore it so downstream logic sees the page URL.
            url = (json.loads(request.body).get('url'))
            if url:
                response._url = url
        content_type = response.headers.get('Content-Type', '')
        if isinstance(response, HtmlResponse):
            return self.handle_html(response)
        if (isinstance(response, XmlResponse) or response.url.endswith(
            ('.xml', '.xml.gz'))):
            # Sitemaps may be gzip-compressed; swap in the decoded body.
            response._set_body(self._get_sitemap_body(response))
            return self.handle_xml(response)
        self.logger.debug("Ignoring page with content-type=%r: %s" %
                          (content_type, response.url))
        return []

    def _plugin_hook(self, name, *args):
        """Call hook *name* on every plugin that defines it; collect results."""
        results = []
        for plugin in self.plugins.values():
            if hasattr(plugin, name):
                results.append(getattr(plugin, name)(*args))
        return results

    def _handle(self, hook, response, *extrasrgs):
        """Run *hook* on all plugins and post-process the yielded items/requests.

        (sic: the *extrasrgs* parameter name is kept as-is.)
        """
        generators = self._plugin_hook(hook, response, *extrasrgs)
        for item_or_request in itertools.chain(*generators):
            if isinstance(item_or_request, Request):
                self._plugin_hook('process_request', item_or_request, response)
            else:
                self._plugin_hook('process_item', item_or_request, response)
            if isinstance(item_or_request, Request):
                item_or_request = self._add_splash_meta(item_or_request)
            yield item_or_request

    def handle_xml(self, response):
        """Dispatch an XML response to the plugins' handle_xml hooks."""
        return self._handle('handle_xml', response, set([]))

    def handle_html(self, response):
        """Dispatch an HTML response to the plugins' handle_html hooks."""
        return self._handle('handle_html', response)

    def _build_js_url_filter(self, spec):
        """Build a predicate deciding whether a URL is rendered with Splash.

        Enable patterns alone: match required.  Disable patterns alone:
        match forbidden.  Both: must match enable and not disable.  Neither:
        everything passes.
        """
        if not self.js_enabled:
            return lambda x: None
        enable_patterns = spec.get('js_enable_patterns')
        disable_patterns = spec.get('js_disable_patterns')
        filterf = None
        enablef = None
        if enable_patterns:
            # Multiple patterns are combined into one alternation regex.
            pattern = enable_patterns[0] if len(enable_patterns) == 1 else \
                "(?:%s)" % '|'.join(enable_patterns)
            enablef = re.compile(pattern).search
            filterf = enablef
        if disable_patterns:
            pattern = disable_patterns[0] if len(disable_patterns) == 1 else \
                "(?:%s)" % '|'.join(disable_patterns)
            disablef = re.compile(pattern).search
            if not enablef:
                filterf = lambda x: not disablef(x)
            else:
                filterf = lambda x: enablef(x) and not disablef(x)
        return filterf if filterf else lambda x: x

    def _add_splash_meta(self, request):
        """Attach Splash 'render.html' meta to *request* when js rendering applies."""
        if self.js_enabled and self._filter_js_urls(request.url):
            # baseurl is the request URL stripped of params/query/fragment.
            cleaned_url = urlparse(request.url)._replace(params='',
                                                         query='',
                                                         fragment='').geturl()
            request.meta['splash'] = {
                'endpoint': 'render.html?job_id=%s' % self._job_id,
                'args': {
                    'wait': 5,
                    'images': 0,
                    'url': request.url,
                    'baseurl': cleaned_url
                }
            }
        return request
Esempio n. 20
0
class IblSpider(Spider):
    """Slybot spider configured from a JSON spec.

    Builds login/form/start requests from the spec, delegates extraction to
    the configured plugins and optionally routes matching requests through
    Splash when JavaScript rendering is enabled.
    """

    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        super(IblSpider, self).__init__(name, **kw)
        self._job_id = settings.get('JOB', '')
        # Work on a copy so merging spider arguments does not mutate the
        # caller's spec.
        spec = deepcopy(spec)
        for key, val in kw.items():
            # Multi-line string arguments (e.g. start_urls) become lists.
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        # Item templates, sorted by the item class they scrape.
        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'), key=itemgetter(0))

        self._templates = [templ for _, templ in self._item_template_pages]

        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        # JS rendering is only enabled when a Splash endpoint is configured.
        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            self.js_enabled = spec.get('js_enabled', False)
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                                settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self._filter_js_urls = self._build_js_url_filter(spec)
        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get(
            'allowed_domains',
            self._get_allowed_domains(self._templates)
        )
        if not self.allowed_domains:
            self.allowed_domains = None

    def _process_start_urls(self, spec):
        """Queue one parse request per configured start URL."""
        self.start_urls = spec.get('start_urls')
        for url in self.start_urls:
            request = Request(url, callback=self.parse, dont_filter=True)
            self._add_splash_meta(request)
            self._start_requests.append(request)

    def _create_init_requests(self, spec):
        """Sort init-request specs into login, form and start buckets.

        Note: rdata doubles as the request's meta dict; "loginurl" is
        popped first so it does not end up in the meta payload.
        """
        for rdata in spec:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self._add_splash_meta(request)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata)
                )
            elif rdata["type"] == "start":
                self._start_requests.append(
                    self._create_start_request_from_specs(rdata)
                )

    def parse_login_page(self, response):
        """Fill the site's login form with spec credentials and submit it."""
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body,
                                            username, password)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        """Parse the post-login page, then release the queued start requests."""
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        """Return the first request needed to process a generic form spec.

        If a field's values come from a URL, fetch that URL first;
        otherwise request the form page directly.
        """
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor),
                               meta=form_descriptor,
                               callback=self.parse_field_url_page,
                               dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor, callback=self.parse_form_page,
                           dont_filter=True)

    def parse_field_url_page(self, response):
        """Record fetched values for a URL-backed form field, then continue."""
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
        self.generic_form.set_values_url_field(field_descriptor, response.body)
        yield self.get_generic_form_start_request(form_descriptor)

    def parse_form_page(self, response):
        """Submit every combination produced by the generic form filler.

        A failure to fill the form only logs a warning; the queued start
        requests are released either way.
        """
        fill_form = self.generic_form.fill_generic_form
        try:
            for (args, url, method) in fill_form(response.url, response.body,
                                                 response.request.meta):
                yield FormRequest(url, method=method, formdata=args,
                                  callback=self.after_form_page,
                                  dont_filter=True)
        except Exception as e:
            self.logger.warning(str(e))
        for req in self._start_requests:
            yield req

    def after_form_page(self, response):
        """Parse a page reached by a submitted form."""
        for result in self.parse(response):
            yield result

    def _get_allowed_domains(self, templates):
        """Derive allowed domains from template URLs and start requests."""
        urls = [x['url'] for x in templates]
        urls += [x.url for x in self._start_requests]
        return [x[1] for x in iter_unique_scheme_hostname(urls)]

    def start_requests(self):
        """Yield login requests first, else form requests, else start URLs."""
        start_requests = []
        if self.login_requests:
            start_requests = self.login_requests
        elif self.form_requests:
            start_requests = self.form_requests
        else:
            start_requests = self._start_requests
        for req in start_requests:
            yield req

    def _create_start_request_from_specs(self, info):
        """Build a start request, optionally following links from the page."""
        url = info["url"]
        lspecs = info.get("link_extractor")
        if lspecs:
            linkextractor = create_linkextractor_from_specs(lspecs)

            def _callback(spider, response):
                for link in linkextractor.links_to_follow(response):
                    request = Request(url=link.url, callback=spider.parse)
                    yield self._add_splash_meta(request)
            request = Request(url=url, callback=_callback)
            return self._add_splash_meta(request)
        request = Request(url=url, callback=self.parse)
        return self._add_splash_meta(request)

    def parse(self, response):
        """Main handler for all downloaded responses"""
        request = response.request
        # Splash responses are POSTs to the Splash host; restore the real
        # page URL from the request body so downstream logic sees it.
        if (request and request.method == 'POST' and
                urlparse(request.url).hostname == self.SPLASH_HOST):
            url = (json.loads(request.body).get('url'))
            if url:
                response._url = url
        content_type = response.headers.get('Content-Type', '')
        if isinstance(response, HtmlResponse):
            return self.handle_html(response)
        elif "application/rss+xml" in content_type:
            return self.handle_rss(response)
        else:
            self.logger.debug(
                "Ignoring page with content-type=%r: %s" % (content_type,
                                                            response.url)
            )
            return []

    def _plugin_hook(self, name, *args):
        """Call hook *name* on every plugin that implements it."""
        results = []
        for plugin in self.plugins.values():
            if hasattr(plugin, name):
                results.append(getattr(plugin, name)(*args))
        return results

    # NOTE(review): 'extrasrgs' looks like a typo for 'extra_args'; kept
    # as-is since it is positional-only for callers.
    def _handle(self, hook, response, *extrasrgs):
        """Run *hook* on all plugins, post-processing items and requests."""
        generators = self._plugin_hook(hook, response, *extrasrgs)
        for item_or_request in itertools.chain(*generators):
            if isinstance(item_or_request, Request):
                self._plugin_hook('process_request', item_or_request, response)
            else:
                self._plugin_hook('process_item', item_or_request, response)
            if isinstance(item_or_request, Request):
                item_or_request = self._add_splash_meta(item_or_request)
            yield item_or_request

    def handle_rss(self, response):
        """Run the 'handle_rss' plugin hook over an RSS response."""
        return self._handle('handle_rss', response, set([]))

    def handle_html(self, response):
        """Run the 'handle_html' plugin hook over an HTML response."""
        return self._handle('handle_html', response)

    def _build_js_url_filter(self, spec):
        """Build a predicate deciding whether a URL should be JS-rendered."""
        if not self.js_enabled:
            return lambda x: None
        enable_patterns = spec.get('js_enable_patterns')
        disable_patterns = spec.get('js_disable_patterns')
        filterf = None
        enablef = None
        if enable_patterns:
            pattern = enable_patterns[0] if len(enable_patterns) == 1 else \
                "(?:%s)" % '|'.join(enable_patterns)
            enablef = re.compile(pattern).search
            filterf = enablef
        if disable_patterns:
            pattern = disable_patterns[0] if len(disable_patterns) == 1 else \
                "(?:%s)" % '|'.join(disable_patterns)
            disablef = re.compile(pattern).search
            if not enablef:
                filterf = lambda x: not disablef(x)
            else:
                filterf = lambda x: enablef(x) and not disablef(x)
        # No patterns at all: allow everything (identity is truthy for URLs).
        return filterf if filterf else lambda x: x

    def _add_splash_meta(self, request):
        """Attach Splash rendering metadata to *request* when JS applies."""
        if self.js_enabled and self._filter_js_urls(request.url):
            cleaned_url = urlparse(request.url)._replace(params='',
                                                         query='',
                                                         fragment='').geturl()
            request.meta['splash'] = {
                'endpoint': 'render.html?job_id=%s' % self._job_id,
                'args': {
                    'wait': 5,
                    'images': 0,
                    'url': request.url,
                    'baseurl': cleaned_url
                }
            }
        return request
Esempio n. 21
0
class IblSpider(SitemapSpider):
    """Slybot spider (sitemap-aware variant) configured from a JSON spec.

    Expands start-URL specs through pluggable URL generators, delegates
    extraction to plugins and can route requests through Splash.
    """
    def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
                 **kw):
        # Generators that expand the spec's start-URL entries into URLs.
        self.start_url_generators = {
            'start_urls': IdentityGenerator(),
            'generated_urls': UrlGenerator(settings, kw),

            'url': IdentityGenerator(),
            'generated': FragmentGenerator(),
            # 'feed_urls': FeedUrls(self, settings, kw)
        }
        self.generic_form = GenericForm(**kw)
        super(IblSpider, self).__init__(name, **kw)
        # Work on a copy so merging spider arguments does not mutate the
        # caller's spec.
        spec = deepcopy(spec)
        self._add_spider_args_to_spec(spec, kw)
        self.plugins = self._configure_plugins(
            settings, spec, item_schemas, all_extractors)
        self._configure_js(spec, settings)

        self.login_requests, self.form_requests = [], []
        self._start_urls = self._create_start_urls(spec)
        # NOTE(review): _create_start_requests returns a generator; it is
        # iterated in start_requests/after_login/parse_form_page and a
        # generator is single-use — confirm it is never consumed twice.
        self._start_requests = self._create_start_requests(spec)
        self._create_init_requests(spec)
        self._add_allowed_domains(spec)
        self.page_actions = spec.get('page_actions', [])

    def _add_spider_args_to_spec(self, spec, args):
        """Merge spider arguments into the spec (splitting string lists)."""
        for key, val in args.items():
            if isinstance(val, six.string_types) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

    def _create_start_urls(self, spec):
        """Wrap the spec's start URLs in a StartUrlCollection."""
        url_type = spec.get('start_urls_type', 'start_urls')
        return StartUrlCollection(
            arg_to_iter(spec[url_type]),
            self.start_url_generators,
            url_type
        )

    def _create_start_requests(self, spec):
        """Yield requests for "start" init specs and all start URLs."""
        init_requests = spec.get('init_requests', [])
        for rdata in init_requests:
            if rdata["type"] == "start":
                yield self._create_start_request_from_specs(rdata)

        for start_url in self._start_urls:
            if not isinstance(start_url, Request):
                start_url = Request(start_url, callback=self.parse,
                                    dont_filter=True)
            yield self._add_splash_meta(start_url)

    def _create_init_requests(self, spec):
        """Queue login and form requests from the init-request specs.

        Note: rdata doubles as the request's meta dict; "loginurl" is
        popped first so it does not end up in the meta payload.
        """
        init_requests = spec.get('init_requests', [])
        for rdata in init_requests:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self._add_splash_meta(request)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata)
                )

    def _add_allowed_domains(self, spec):
        """Use spec-provided domains, or derive them when list is empty."""
        self.allowed_domains = spec.get('allowed_domains', [])
        if self.allowed_domains is not None and not self.allowed_domains:
            self.allowed_domains = self._get_allowed_domains(spec)

    def parse_login_page(self, response):
        """Fill the site's login form with spec credentials and submit it."""
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body,
                                            username, password)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        """Parse the post-login page, then release the queued start requests."""
        for result in self.parse(response):
            yield result
        for req in self._start_requests:
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        """Return the first request needed to process a generic form spec.

        If a field's values come from a URL, fetch that URL first;
        otherwise request the form page directly.
        """
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor),
                               meta=form_descriptor,
                               callback=self.parse_field_url_page,
                               dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor, callback=self.parse_form_page,
                           dont_filter=True)

    def parse_field_url_page(self, response):
        """Record fetched values for a URL-backed form field, then continue."""
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
        self.generic_form.set_values_url_field(field_descriptor, response.body)
        yield self.get_generic_form_start_request(form_descriptor)

    def parse_form_page(self, response):
        """Submit every combination produced by the generic form filler.

        A failure to fill the form only logs a warning; the queued start
        requests are released either way.
        """
        fill_form = self.generic_form.fill_generic_form
        try:
            for (args, url, method) in fill_form(response.url, response.body,
                                                 response.request.meta):
                yield FormRequest(url, method=method, formdata=args,
                                  callback=self.after_form_page,
                                  dont_filter=True)
        except Exception as e:
            self.logger.warning(str(e))
        for req in self._start_requests:
            yield req

    def after_form_page(self, response):
        """Parse a page reached by a submitted form."""
        for result in self.parse(response):
            yield result

    def _get_allowed_domains(self, spec):
        """Derive allowed domains from templates, init specs and start URLs."""
        urls = [x['url'] for x in spec['templates']]
        urls += [x['url'] for x in spec.get('init_requests', [])
                 if x['type'] == 'start']
        urls += self._start_urls.allowed_domains
        return [domain for scheme, domain in iter_unique_scheme_hostname(urls)]

    def start_requests(self):
        """Yield login requests first, else form requests, else start URLs."""
        start_requests = []
        if self.login_requests:
            start_requests = self.login_requests
        elif self.form_requests:
            start_requests = self.form_requests
        else:
            start_requests = self._start_requests
        for req in start_requests:
            yield req

    def _create_start_request_from_specs(self, info):
        """Build a start request, optionally following links from the page."""
        url = info["url"]
        lspecs = info.get("link_extractor")
        if lspecs:
            linkextractor = create_linkextractor_from_specs(lspecs)

            def _callback(spider, response):
                for link in linkextractor.links_to_follow(response):
                    request = Request(url=link.url, callback=spider.parse)
                    yield self._add_splash_meta(request)
            request = Request(url=url, callback=_callback)
            return self._add_splash_meta(request)
        request = Request(url=url, callback=self.parse)
        return self._add_splash_meta(request)

    def parse(self, response):
        """Main handler for all downloaded responses"""
        request = response.request
        # Splash responses are POSTs to the Splash host; restore the real
        # page URL from the request body so downstream logic sees it.
        if (request and request.method == 'POST' and
                urlparse(request.url).hostname == self.SPLASH_HOST):
            url = (json.loads(request.body).get('url'))
            if url:
                response._url = url
        content_type = response.headers.get('Content-Type', '')
        if isinstance(response, HtmlResponse):
            return self.handle_html(response)
        if (isinstance(response, XmlResponse) or
                response.url.endswith(('.xml', '.xml.gz'))):
            response._set_body(self._get_sitemap_body(response))
            return self.handle_xml(response)
        self.logger.debug(
            "Ignoring page with content-type=%r: %s" % (content_type,
                                                        response.url)
        )
        return []

    def _configure_plugins(self, settings, spec, schemas, extractors):
        """Instantiate and set up every configured extraction plugin."""
        plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, schemas, extractors, self.logger)
            plugins[plugin_name] = instance
        return plugins

    def _plugin_hook(self, name, *args):
        """Call hook *name* on every plugin that implements it."""
        results = []
        for plugin in self.plugins.values():
            if hasattr(plugin, name):
                results.append(getattr(plugin, name)(*args))
        return results

    # NOTE(review): 'extrasrgs' looks like a typo for 'extra_args'; kept
    # as-is since it is positional-only for callers.
    def _handle(self, hook, response, *extrasrgs):
        """Run *hook* on all plugins, post-processing items and requests."""
        generators = self._plugin_hook(hook, response, *extrasrgs)
        for item_or_request in itertools.chain(*generators):
            if isinstance(item_or_request, Request):
                self._plugin_hook('process_request', item_or_request, response)
            else:
                self._plugin_hook('process_item', item_or_request, response)
            if isinstance(item_or_request, Request):
                item_or_request = self._add_splash_meta(item_or_request)
            yield item_or_request

    def handle_xml(self, response):
        """Run the 'handle_xml' plugin hook over an XML response."""
        return self._handle('handle_xml', response, set([]))

    def handle_html(self, response):
        """Run the 'handle_html' plugin hook over an HTML response."""
        return self._handle('handle_html', response)

    def _configure_js(self, spec, settings):
        """Read Splash settings and enable JS rendering when configured."""
        self.js_enabled = False
        self.SPLASH_HOST = None
        if settings.get('SPLASH_URL'):
            self.SPLASH_HOST = urlparse(settings.get('SPLASH_URL')).hostname
            self.js_enabled = spec.get('js_enabled', False)
        if self.js_enabled and (settings.get('SPLASH_PASS') is not None or
                                settings.get('SPLASH_USER') is not None):
            self.splash_auth = basic_auth_header(
                settings.get('SPLASH_USER', ''),
                settings.get('SPLASH_PASS', ''))
        self.splash_wait = settings.getint('SPLASH_WAIT', 5)
        self.splash_timeout = settings.getint('SPLASH_TIMEOUT', 30)
        self.splash_js_source = settings.get(
            'SPLASH_JS_SOURCE', 'function(){}')
        self.splash_lua_source = settings.get('SPLASH_LUA_SOURCE', '')
        self._filter_js_urls = self._build_js_url_filter(spec)

    def _build_js_url_filter(self, spec):
        """Build a predicate deciding whether a URL should be JS-rendered."""
        if not self.js_enabled:
            return lambda x: None
        enable_patterns = spec.get('js_enable_patterns')
        disable_patterns = spec.get('js_disable_patterns')
        return include_exclude_filter(enable_patterns, disable_patterns)

    def _add_splash_meta(self, request):
        """Attach Splash rendering metadata to *request* when JS applies."""
        if self.js_enabled and self._filter_js_urls(request.url):
            cleaned_url = urlparse(request.url)._replace(params='', query='',
                                                         fragment='').geturl()
            # A Lua script switches Splash to the 'execute' endpoint.
            endpoint = 'execute' if self.splash_lua_source else 'render.html'
            request.meta['splash'] = {
                'endpoint': endpoint,
                'args': {
                    'wait': self.splash_wait,
                    'timeout': self.splash_timeout,
                    'js_source': self.splash_js_source,
                    'lua_source': self.splash_lua_source,
                    'images': 0,
                    'url': request.url,
                    'baseurl': cleaned_url
                }
            }
        return request
Esempio n. 22
0
class IblSpider(BaseSpider):
    """Early Slybot spider (Python 2 era) configured from a JSON spec.

    Builds instance-based-learning extractors per item class and queues
    login/form requests from the spec's init_requests.
    """

    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)

        # (item class name, annotated template page, extractor ids) triples,
        # sorted by item class name.
        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        # NOTE: basestring is Python-2-only; this class targets Python 2.
        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None
        self.build_url_filter(spec)

        # Per item class: its item class object, descriptor and IBL extractor.
        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        # Queue login and form requests; rdata doubles as the request's
        # meta dict ("loginurl" is popped so it stays out of the payload).
        self.login_requests = []
        self.form_requests = []
        for rdata in spec.get("init_requests", []):
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page, dont_filter=True)
                self.login_requests.append(request)

            elif rdata["type"] == "form":
                self.generic_form = GenericForm(**kw)
                self.form_requests.append(self.get_generic_form_start_request(rdata))

    def parse_login_page(self, response):
        """Fill the site's login form with spec credentials and submit it."""
        username = response.request.meta["username"]
        password = response.request.meta["password"]
        args, url, method = fill_login_form(response.url, response.body, username, password)
        return FormRequest(url, method=method, formdata=args, callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        """Parse the post-login page, then release the start requests."""
        for result in self.parse(response):
            yield result
        for req in self._start_requests():
            yield req

    def get_generic_form_start_request(self, form_descriptor):
        """Return the first request needed to process a generic form spec."""
        file_fields = list(self.generic_form.get_url_field(form_descriptor))
        if file_fields:
            (field_index, field_descriptor) = file_fields.pop(0)
            form_descriptor['field_index'] = field_index
            return FormRequest(self.generic_form.get_value(field_descriptor), meta=form_descriptor,
                              callback=self.parse_field_url_page, dont_filter=True)
        else:
            return Request(url=form_descriptor.pop("form_url"), meta=form_descriptor,
                                  callback=self.parse_form_page, dont_filter=True)

    def parse_field_url_page(self, response):
        """Record fetched values for a URL-backed form field, then continue."""
        form_descriptor = response.request.meta
        field_index = form_descriptor['field_index']
        field_descriptor = form_descriptor['fields'][field_index]
        self.generic_form.set_values_url_field(field_descriptor, response.body)
        yield self.get_generic_form_start_request(form_descriptor)

    def parse_form_page(self, response):
        """Submit every combination produced by the generic form filler.

        Note: 'except Exception, e' is Python-2-only syntax.
        """
        try:
            for (args, url, method) in self.generic_form.fill_generic_form(response.url,
                                                                           response.body,
                                                                           response.request.meta):
                yield FormRequest(url, method=method, formdata=args,
                                  callback=self.after_form_page, dont_filter=True)
        except Exception, e:
            self.log(str(e), log.WARNING)
        for req in self._start_requests():
            yield req
Esempio n. 23
0
class IblSpider(Spider):
    def __init__(self,
                 name,
                 spec,
                 item_schemas,
                 all_extractors,
                 settings=None,
                 **kw):
        """Configure the spider from a JSON spec plus spider arguments."""
        super(IblSpider, self).__init__(name, **kw)
        # Work on a copy so merging spider arguments does not mutate the
        # caller's spec.
        spec = deepcopy(spec)
        for key, val in kw.items():
            # NOTE: basestring is Python-2-only; this version targets Py2.
            if isinstance(val, basestring) and key in STRING_KEYS:
                val = val.splitlines()
            spec[key] = val

        # Item templates, sorted by the item class they scrape.
        self._item_template_pages = sorted(
            ((t['scrapes'], t) for t in spec['templates']
             if t.get('page_type', 'item') == 'item'),
            key=itemgetter(0))

        self._templates = [templ for _, templ in self._item_template_pages]

        self.plugins = IndexedDict()
        for plugin_class, plugin_name in zip(load_plugins(settings),
                                             load_plugin_names(settings)):
            instance = plugin_class()
            instance.setup_bot(settings, spec, item_schemas, all_extractors)
            self.plugins[plugin_name] = instance

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get(
            'allowed_domains', self._get_allowed_domains(self._templates))
        if not self.allowed_domains:
            self.allowed_domains = None

    def _process_start_urls(self, spec):
        """Queue one parse request per configured start URL."""
        self.start_urls = spec.get('start_urls')
        requests = [Request(u, callback=self.parse, dont_filter=True)
                    for u in self.start_urls]
        self._start_requests.extend(requests)

    def _create_init_requests(self, spec):
        """Sort init-request specs into login, form and start buckets.

        Note: rdata doubles as the request's meta dict; "loginurl" is
        popped first so it does not end up in the meta payload.
        """
        for rdata in spec:
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"),
                                  meta=rdata,
                                  callback=self.parse_login_page,
                                  dont_filter=True)
                self.login_requests.append(request)
            elif rdata["type"] == "form":
                self.form_requests.append(
                    self.get_generic_form_start_request(rdata))
            elif rdata["type"] == "start":
                self._start_requests.append(
                    self._create_start_request_from_specs(rdata))

    def parse_login_page(self, response):
        """Fill the site's login form with spec credentials and submit it."""
        meta = response.request.meta
        args, url, method = fill_login_form(response.url, response.body,
                                            meta["username"],
                                            meta["password"])
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        """Parse the post-login page, then release the queued start requests."""
        for produced in itertools.chain(self.parse(response),
                                        self._start_requests):
            yield produced

    def get_generic_form_start_request(self, form_descriptor):
        """Return the first request needed to process a generic form spec.

        If a field's values come from a URL, fetch that URL first;
        otherwise request the form page directly.
        """
        url_fields = list(self.generic_form.get_url_field(form_descriptor))
        if not url_fields:
            return Request(url=form_descriptor.pop("form_url"),
                           meta=form_descriptor,
                           callback=self.parse_form_page,
                           dont_filter=True)
        field_index, field_descriptor = url_fields[0]
        form_descriptor['field_index'] = field_index
        return FormRequest(self.generic_form.get_value(field_descriptor),
                           meta=form_descriptor,
                           callback=self.parse_field_url_page,
                           dont_filter=True)

    def parse_field_url_page(self, response):
        """Record fetched values for a URL-backed form field, then continue."""
        descriptor = response.request.meta
        idx = descriptor['field_index']
        field = descriptor['fields'][idx]
        self.generic_form.set_values_url_field(field, response.body)
        yield self.get_generic_form_start_request(descriptor)

    def parse_form_page(self, response):
        """Submit every combination produced by the generic form filler.

        A failure to fill the form only logs a warning (rather than
        aborting the crawl); the queued start requests are released
        either way.
        """
        fill_form = self.generic_form.fill_generic_form
        try:
            for (args, url, method) in fill_form(response.url, response.body,
                                                 response.request.meta):
                yield FormRequest(url,
                                  method=method,
                                  formdata=args,
                                  callback=self.after_form_page,
                                  dont_filter=True)
        # Fixed: 'except Exception, e' is Python-2-only syntax and a
        # SyntaxError under Python 3; 'as' works on Python 2.6+ and 3.x.
        except Exception as e:
            self.log(str(e), log.WARNING)
        for req in self._start_requests:
            yield req