Exemple #1
0
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin.

        Creates one item class per schema, builds a descriptor set for every
        item template and groups the templates into extractors by template
        version.

        :param settings: crawler settings; not read by this method.
        :param spec: spider specification; its ``templates`` list is read and
            the whole spec is passed to ``build_url_filter``.
        :param items: mapping of schema name to schema dict. A schema lacking
            a ``name`` gets its mapping key stored under ``name`` in place.
        :param extractors: extractor definitions forwarded to
            ``apply_extractors`` and ``add_extractors_to_descriptors``.
        """
        templates = spec['templates']
        # Each entry: [default schema name, page, template extractors,
        # version].  ``or ''`` keeps the sort usable when a template has no
        # 'scrapes' (None cannot be ordered against str on Python 3).
        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', []), t.get('version', '0.12.0')]
            for t in templates if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0] or '')
        self.item_classes = {}
        self.template_scrapes = {template.get('page_id'): template['scrapes']
                                 for template in templates}
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            # list() because dict views cannot be indexed on Python 3; the
            # first descriptor (or an empty dict) is the fallback default.
            descriptor = list(descriptors.values()) or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        # NOTE(review): versions are compared as plain strings, so e.g.
        # '0.9.0' sorts after '0.13.0' -- confirm this is acceptable.
        grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                           key=operator.itemgetter(2)),
                                    lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                # Templates older than 0.13.0 only use their default
                # descriptor.
                self.extractors.append(
                    InstanceBasedLearningExtractor(
                        [(page, scrapes['#default'])
                         for page, scrapes, _ in group]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in templates
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
Exemple #2
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """
        Build the spider's extractors and initial requests from ``spec``.

        :param name: spider name, forwarded to the parent constructor.
        :param spec: spider specification; ``templates``, ``start_urls`` and
            ``init_requests`` are read here.
        :param item_schemas: mapping of item class name to schema.
        :param all_extractors: extractor definitions forwarded to
            ``apply_extractors``.
        :param kw: extra keyword arguments, forwarded to the parent
            constructor and to ``GenericForm``.
        """
        super(IblSpider, self).__init__(name, **kw)

        # Triplets of [item class name, page, template extractors], sorted
        # by item class name so groupby below sees contiguous groups.
        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)

        self.build_url_filter(spec)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            # 'descriptor' is the descriptor of the last page in the group.
            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        for rdata in spec.get("init_requests", []):
            if rdata["type"] == "login":
                # Pop from a copy: popping from ``rdata`` itself would strip
                # "loginurl" from the caller's spec and break any later
                # spider built from the same spec.
                meta = dict(rdata)
                request = Request(url=meta.pop("loginurl"), meta=meta,
                                  callback=self.parse_login_page, dont_filter=True)
                self.login_requests.append(request)

            elif rdata["type"] == "form":
                self.generic_form = GenericForm(**kw)
                self.form_requests.append(self.get_generic_form_start_request(rdata))
Exemple #3
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """
        Set up pages, link extraction and per-item-class extractors.

        :param name: spider name, forwarded to the parent constructor.
        :param spec: spider specification; ``scrapes``, ``templates`` and
            ``start_urls`` are read here.
        :param item_schemas: mapping of item class name to schema; a falsy
            schema makes the default schema's class/descriptor be used.
        :param all_extractors: extractor definitions forwarded to
            ``apply_extractors``.
        :param kw: extra keyword arguments for the parent constructor.
        """
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec['scrapes']
        default_schema = item_schemas[default_item]
        self._default_schema = default_schema
        if not default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item,
                     log.WARNING)

        # [item class name, page, template extractors] per item template,
        # ordered by item class name for the groupby further down.
        item_triplets = [
            [t.get('scrapes', default_item),
             dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates']
            if t.get('page_type', 'item') == 'item'
        ]
        item_triplets.sort(key=operator.itemgetter(0))
        self._item_template_pages = item_triplets

        # IBL extractor for the 'links' templates, if there are any
        links_pages = [
            dict_to_page(t, 'annotated_body')
            for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        links_descriptor = create_slybot_item_descriptor(
            {'id': "_links", 'properties': ()})
        if links_pages:
            self._links_ibl_extractor = InstanceBasedLearningExtractor(
                [(page, links_descriptor) for page in links_pages])
        else:
            self._links_ibl_extractor = None

        self._ipages = [triplet[1] for triplet in item_triplets]
        self._fpages = [
            dict_to_page(t, 'annotated_body')
            for t in spec['templates']
            if t.get('page_type', 'item') == 'form'
        ]

        # A single string of start urls is split into one url per line.
        start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(start_urls, basestring):
            start_urls = start_urls.splitlines()
        self.start_urls = start_urls

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)

        self.build_url_filter(spec)

        default_item_cls = get_iblitem_class(default_schema)
        default_item_descriptor = create_slybot_item_descriptor(default_schema)

        self.itemcls_info = {}
        grouped = itertools.groupby(item_triplets, operator.itemgetter(0))
        for itemclass_name, triplets in grouped:
            schema = item_schemas[itemclass_name]
            if schema:
                item_cls = get_iblitem_class(schema)
            else:
                item_cls = default_item_cls

            page_descriptor_pairs = []
            for _, page, extractors in triplets:
                if schema:
                    item_descriptor = create_slybot_item_descriptor(schema)
                else:
                    item_descriptor = default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            # 'descriptor' ends up being the one built for the last page.
            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': InstanceBasedLearningExtractor(
                    page_descriptor_pairs),
            }
Exemple #4
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """
        Build the spider from a private copy of ``spec`` plus overrides.

        :param name: spider name, forwarded to the parent constructor.
        :param spec: spider specification; deep-copied before use so the
            caller's object is left untouched.
        :param item_schemas: mapping of item class name to schema.
        :param all_extractors: extractor definitions forwarded to
            ``apply_extractors``.
        :param kw: extra keyword arguments; forwarded to the parent
            constructor and ``GenericForm``, and every entry is also copied
            into the spec (overriding the key of the same name).
        """
        super(IblSpider, self).__init__(name, **kw)

        spec = deepcopy(spec)
        for key, val in kw.items():
            # These options may arrive as one newline-separated string.
            if isinstance(val, basestring) and key in ['start_urls', 'exclude_patterns', 'follow_patterns', 'allowed_domains']:
                val = val.splitlines()
            spec[key] = val
        # Indentation normalised to spaces (these lines used tabs).
        self.i = time.time()
        self.getProxyList()
        # Triplets of [item class name, page, template extractors], sorted
        # by item class name so groupby below sees contiguous groups.
        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])
        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.build_url_filter(spec)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            # 'descriptor' is the descriptor of the last page in the group.
            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        # An explicit 'allowed_domains' in the spec wins over the domains
        # derived from the item pages; an empty value is stored as None.
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None
Exemple #5
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """
        Configure the spider from a private copy of ``spec`` plus overrides.

        :param name: spider name, forwarded to the parent constructor.
        :param spec: spider specification; deep-copied before use.
        :param item_schemas: mapping of item class name to schema.
        :param all_extractors: extractor definitions forwarded to
            ``apply_extractors``.
        :param kw: extra keyword arguments; forwarded to the parent
            constructor and ``GenericForm``, and each one is also written
            into the copied spec.
        """
        super(IblSpider, self).__init__(name, **kw)
        spec = deepcopy(spec)
        # Options that may be supplied as a single newline-separated string.
        multiline_keys = ('start_urls', 'exclude_patterns',
                          'follow_patterns', 'allowed_domains')
        for key in kw:
            value = kw[key]
            if key in multiline_keys and isinstance(value, basestring):
                value = value.splitlines()
            spec[key] = value

        # [item class name, page, template extractors] per item template,
        # ordered by item class name for the groupby further down.
        item_triplets = [
            [t['scrapes'],
             dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates']
            if t.get('page_type', 'item') == 'item'
        ]
        item_triplets.sort(key=operator.itemgetter(0))
        self._item_template_pages = item_triplets

        # IBL extractor for the 'links' templates, if there are any
        links_pages = [
            dict_to_page(t, 'annotated_body')
            for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        links_descriptor = create_slybot_item_descriptor({'fields': {}})
        if links_pages:
            self._links_ibl_extractor = InstanceBasedLearningExtractor(
                [(page, links_descriptor) for page in links_pages])
        else:
            self._links_ibl_extractor = None

        self._ipages = [triplet[1] for triplet in item_triplets]

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.build_url_filter(spec)

        self.itemcls_info = {}
        grouped = itertools.groupby(item_triplets, operator.itemgetter(0))
        for itemclass_name, triplets in grouped:
            schema = item_schemas[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for _, page, template_extractors in triplets:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            # 'descriptor' ends up being the one built for the last page.
            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': InstanceBasedLearningExtractor(
                    page_descriptor_pairs),
            }

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        # The page-derived domains are always computed; an explicit
        # 'allowed_domains' in the spec takes precedence, empty means None.
        derived_domains = self._get_allowed_domains(self._ipages)
        self.allowed_domains = spec.get('allowed_domains',
                                        derived_domains) or None
Exemple #6
0
    def setup_bot(self, settings, spec, items, extractors):
        """
        Prepare the extraction state this plugin needs before crawling.

        :param settings: crawler settings; ``AUTO_PAGINATION`` selects the
            link extractor class.
        :param spec: spider specification; its ``templates`` list is read
            and the whole spec is passed to ``build_url_filter``.
        :param items: mapping of item class name to schema.
        :param extractors: extractor definitions forwarded to
            ``apply_extractors``.
        """
        # [item class name, page, template extractors] per item template,
        # ordered by item class name so groupby yields one group per class.
        item_triplets = [
            [t['scrapes'],
             dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates']
            if t.get('page_type', 'item') == 'item'
        ]
        item_triplets.sort(key=itemgetter(0))

        self.itemcls_info = {}
        link_extractor_cls = (PaginationExtractor
                              if settings.get('AUTO_PAGINATION')
                              else HtmlLinkExtractor)
        self.html_link_extractor = link_extractor_cls()

        for itemclass_name, triplets in groupby(item_triplets, itemgetter(0)):
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for _, page, template_extractors in triplets:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            # 'descriptor' ends up being the one built for the last page.
            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': InstanceBasedLearningExtractor(
                    page_descriptor_pairs),
            }

        # IBL extractor for the 'links' templates, if there are any
        links_pages = [
            dict_to_page(t, 'annotated_body')
            for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        links_descriptor = create_slybot_item_descriptor({'fields': {}})
        if links_pages:
            self._links_ibl_extractor = InstanceBasedLearningExtractor(
                [(page, links_descriptor) for page in links_pages])
        else:
            self._links_ibl_extractor = None

        self.build_url_filter(spec)
Exemple #7
0
    def setup_bot(self, settings, spec, items, extractors):
        """
        Initialise link extraction and per-item-class extractors.

        :param settings: crawler settings; a truthy ``AUTO_PAGINATION``
            switches to ``PaginationExtractor``.
        :param spec: spider specification; ``templates`` is read and the
            spec is handed to ``build_url_filter``.
        :param items: mapping of item class name to schema.
        :param extractors: extractor definitions forwarded to
            ``apply_extractors``.
        """
        # Collect [item class name, page, template extractors] for every
        # item template, then order them by item class name.
        collected = []
        for template in spec['templates']:
            if template.get('page_type', 'item') != 'item':
                continue
            collected.append([template['scrapes'],
                              dict_to_page(template, 'annotated_body'),
                              template.get('extractors', [])])
        by_item_class = sorted(collected, key=itemgetter(0))

        self.itemcls_info = {}
        use_pagination = settings.get('AUTO_PAGINATION')
        self.html_link_extractor = (
            PaginationExtractor() if use_pagination else HtmlLinkExtractor())

        for itemclass_name, members in groupby(by_item_class, itemgetter(0)):
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            pairs = []
            for member in members:
                page, template_extractors = member[1], member[2]
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(pairs)
            # 'descriptor' is the one created for the group's last page.
            info = {'class': item_cls}
            info['descriptor'] = item_descriptor
            info['extractor'] = extractor
            self.itemcls_info[itemclass_name] = info

        # Build the links extractor only when 'links' templates exist.
        links_pages = []
        for template in spec['templates']:
            if template.get('page_type') == 'links':
                links_pages.append(dict_to_page(template, 'annotated_body'))
        links_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = None
        if links_pages:
            self._links_ibl_extractor = InstanceBasedLearningExtractor(
                [(page, links_descriptor) for page in links_pages])

        self.build_url_filter(spec)
Exemple #8
0
    def setup_bot(self, settings, spec, items, extractors):
        """
        Set up link extractors and one IBL extractor per item class.

        :param settings: crawler settings; not read by this method.
        :param spec: spider specification; ``templates`` is read and the
            spec is handed to ``build_url_filter``.
        :param items: mapping of item class name to schema.
        :param extractors: extractor definitions forwarded to
            ``apply_extractors``.
        """
        all_templates = spec["templates"]

        # [item class name, page, template extractors] for item templates,
        # ordered by item class name so groupby yields one group per class.
        item_rows = [
            [t["scrapes"], dict_to_page(t, "annotated_body"), t.get("extractors", [])]
            for t in all_templates
            if t.get("page_type", "item") == "item"
        ]
        item_rows.sort(key=itemgetter(0))

        self.itemcls_info = {}
        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        for itemclass_name, rows in groupby(item_rows, itemgetter(0)):
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for _, page, template_extractors in rows:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            # "descriptor" ends up being the one built for the last page.
            self.itemcls_info[itemclass_name] = {
                "class": item_cls,
                "descriptor": item_descriptor,
                "extractor": InstanceBasedLearningExtractor(page_descriptor_pairs),
            }

        # IBL extractor for the "links" templates, if there are any
        links_pages = [
            dict_to_page(t, "annotated_body")
            for t in all_templates
            if t.get("page_type") == "links"
        ]
        links_descriptor = create_slybot_item_descriptor({"fields": {}})
        if links_pages:
            self._links_ibl_extractor = InstanceBasedLearningExtractor(
                [(page, links_descriptor) for page in links_pages]
            )
        else:
            self._links_ibl_extractor = None

        self.build_url_filter(spec)
Exemple #9
0
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin.

        Creates one item class per schema and a single ``SlybotIBLExtractor``
        over all item templates, each paired with its descriptor set.

        :param settings: crawler settings; not read by this method.
        :param spec: spider specification; its ``templates`` list is read and
            the whole spec is passed to ``build_url_filter``.
        :param items: mapping of schema name to schema dict. A schema lacking
            a ``name`` gets its mapping key stored under ``name`` in place.
        :param extractors: extractor definitions forwarded to
            ``apply_extractors``.
        """
        # Sort on the default schema name only.  Without a key, ties would
        # fall through to comparing the page objects and extractor lists.
        # ``or ''`` copes with templates that have no 'scrapes' (None cannot
        # be ordered against str on Python 3).
        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0] or '')
        self.item_classes = {}
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        for default, template, template_extractors in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            # list() because dict views cannot be indexed on Python 3; the
            # first descriptor (or an empty dict) is the fallback default.
            descriptor = list(descriptors.values()) or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            page_descriptor_pairs.append((template, descriptors))

        self.extractors = SlybotIBLExtractor(page_descriptor_pairs)

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
    def get_htmlpage_from_text2(text, url, headers):
        """
        Build a page object from already-decoded HTML text.

        The text is passed through ``fix_html`` and ``make_links_absolute``
        (using ``url`` as base), parsed with ``etree.HTMLParser`` and
        serialised again before being handed to ``dict_to_page``.

        :param text: HTML markup; must be a ``unicode`` instance.
        :param url: page url, also used as base url for the links.
        :param headers: response headers; copied into a plain dict.
        :return: whatever ``dict_to_page`` returns for the url/headers/body.
        """
        assert isinstance(
            text, unicode), "unicode expected, got %s" % type(text).__name__
        absolute_html = make_links_absolute(fix_html(text), base_url=url)
        # Round-trip through the HTML parser and back to a unicode string.
        tree = etree.fromstring(absolute_html, etree.HTMLParser())
        body = tostring(tree, encoding='unicode')

        return dict_to_page({
            'url': url,
            'headers': dict(headers),
            'body': body,
        })
Exemple #11
0
    def setup_bot(self, settings, spec, items, extractors, logger):
        """
        Perform any initialization needed for crawling using this plugin.

        Creates one item class per schema, builds a descriptor set for every
        item template, groups the templates into extractors by template
        version and optionally enables page clustering.

        :param settings: crawler settings; ``AUTO_PAGINATION`` and
            ``PAGE_CLUSTERING`` are read.
        :param spec: spider specification; ``templates`` and
            ``links_to_follow`` are read and the whole spec is passed to
            ``build_url_filter``.
        :param items: mapping of schema name to schema dict. A schema lacking
            a ``name`` gets its mapping key stored under ``name`` in place.
        :param extractors: extractor definitions forwarded to
            ``apply_extractors`` and ``add_extractors_to_descriptors``.
        :param logger: logger stored on ``self.logger`` and used for the
            clustering messages.
        """
        self.logger = logger
        # Materialise the result: ``templates`` is iterated three times
        # below, and on Python 3 a bare map() is exhausted after the first.
        templates = list(map(self._get_annotated_template, spec['templates']))

        # Each entry: [default schema name, page, template extractors,
        # version].  ``or ''`` keeps the sort usable when a template has no
        # 'scrapes' (None cannot be ordered against str on Python 3).
        _item_template_pages = sorted(([
            t.get('scrapes'),
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', []),
            t.get('version', '0.12.0')
        ] for t in templates if t.get('page_type', 'item') == 'item'),
                                      key=lambda x: x[0] or '')
        self.item_classes = {}
        self.template_scrapes = {
            template.get('page_id'): template['scrapes']
            for template in templates
        }
        if (settings.get('AUTO_PAGINATION')
                or spec.get('links_to_follow') == 'auto'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(
                    schema, schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            # list() because dict views cannot be indexed on Python 3; the
            # first descriptor (or an empty dict) is the fallback default.
            descriptor = list(descriptors.values()) or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        # NOTE(review): versions are compared as plain strings, so e.g.
        # '0.9.0' sorts after '0.13.0' -- confirm this is acceptable.
        grouped = itertools.groupby(
            sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
            lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                # Templates older than 0.13.0 only use their default
                # descriptor.
                self.extractors.append(
                    InstanceBasedLearningExtractor([
                        (page, scrapes['#default'])
                        for page, scrapes, _ in group
                    ]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in templates
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
        # Clustering
        self.template_names = [t.get('page_id') for t in spec['templates']]
        if settings.get('PAGE_CLUSTERING'):
            try:
                # Optional dependency: fall back to no clustering when the
                # package cannot be imported.
                import page_clustering
                self.clustering = page_clustering.kmeans_from_samples(
                    spec['templates'])
                self.logger.info("Clustering activated")
            except ImportError:
                self.clustering = None
                self.logger.warning(
                    "Clustering could not be used because it is not installed")
        else:
            self.clustering = None
Exemple #12
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """
        Build the spider's extractors and initial requests from ``spec``.

        :param name: spider name, forwarded to the parent constructor.
        :param spec: spider specification; ``scrapes``, ``templates``,
            ``start_urls`` and ``init_requests`` are read here.
        :param item_schemas: mapping of item class name to schema; a falsy
            schema makes the default schema's class/descriptor be used.
        :param all_extractors: extractor definitions forwarded to
            ``apply_extractors``.
        :param kw: extra keyword arguments for the parent constructor.
        """
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec['scrapes']
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item, \
                log.WARNING)

        # Triplets of [item class name, page, template extractors], sorted
        # by item class name so groupby below sees contiguous groups.
        self._item_template_pages = sorted((
            [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({
            'id': "_links",
            'properties': ()
        })
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)

        self.build_url_filter(spec)

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(
            self._default_schema)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(
                self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(
                schema) if schema else default_item_cls

            page_descriptor_pairs = []
            for page, extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(
                    schema) if schema else default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            # 'descriptor' is the descriptor of the last page in the group.
            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        for rdata in spec.get("init_requests", []):
            # The url key is popped from a copy: popping from ``rdata``
            # itself would strip it from the caller's spec and break any
            # later spider built from the same spec.
            if rdata["type"] == "login":
                meta = dict(rdata)
                request = Request(url=meta.pop("loginurl"),
                                  meta=meta,
                                  callback=self.parse_login_page)
                self.login_requests.append(request)

            elif rdata["type"] == "form":
                meta = dict(rdata)
                request = Request(url=meta.pop("form_url"),
                                  meta=meta,
                                  callback=self.parse_form_page)
                self.form_requests.append(request)
Exemple #13
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """
        Build the spider's extractors and initial requests from ``spec``.

        :param name: spider name, forwarded to the parent constructor.
        :param spec: spider specification; ``templates``, ``start_urls``,
            ``allowed_domains`` and ``init_requests`` are read here.
        :param item_schemas: mapping of item class name to schema.
        :param all_extractors: extractor definitions forwarded to
            ``apply_extractors``.
        :param kw: extra keyword arguments, forwarded to the parent
            constructor and to ``GenericForm``.
        """
        super(IblSpider, self).__init__(name, **kw)

        # Triplets of [item class name, page, template extractors], sorted
        # by item class name so groupby below sees contiguous groups.
        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        # An explicit 'allowed_domains' in the spec wins over the domains
        # derived from the item pages; an empty value is stored as None.
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None
        self.build_url_filter(spec)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            # 'descriptor' is the descriptor of the last page in the group.
            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        for rdata in spec.get("init_requests", []):
            if rdata["type"] == "login":
                # Pop from a copy: popping from ``rdata`` itself would strip
                # "loginurl" from the caller's spec and break any later
                # spider built from the same spec.
                meta = dict(rdata)
                request = Request(url=meta.pop("loginurl"), meta=meta,
                                  callback=self.parse_login_page, dont_filter=True)
                self.login_requests.append(request)

            elif rdata["type"] == "form":
                self.generic_form = GenericForm(**kw)
                self.form_requests.append(self.get_generic_form_start_request(rdata))
Exemple #14
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """Configure the spider from its ``spec``.

        Resolves the default item schema, collects item/links/form template
        pages, builds the link filter from the follow/exclude patterns and the
        nofollow flag, and creates one extractor per scraped item class.

        :param name: spider name, forwarded to the parent constructor.
        :param spec: spider specification dict; ``scrapes`` and ``templates``
            are required, ``start_urls``, ``respect_nofollow``,
            ``follow_patterns`` and ``exclude_patterns`` are optional.
        :param item_schemas: mapping indexed by item name; a falsy schema
            makes the default item class and descriptor be used instead.
        :param all_extractors: extractor definitions handed to
            ``apply_extractors`` along with each template's own list.
        :param kw: extra keyword arguments for the parent constructor.
        """
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec['scrapes']
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item,
                     log.WARNING)

        templates = spec['templates']

        # One [scrapes, page, extractors] triplet per item template; templates
        # that do not name an item scrape the default one.
        item_triplets = []
        for tmpl in templates:
            if tmpl.get('page_type', 'item') != 'item':
                continue
            item_triplets.append([
                tmpl.get('scrapes', default_item),
                dict_to_page(tmpl, 'annotated_body'),
                tmpl.get('extractors', []),
            ])
        # Sorted by item name so that groupby() below sees each name once.
        item_triplets.sort(key=operator.itemgetter(0))
        self._item_template_pages = item_triplets

        # generate ibl extractor for links pages
        links_pages = [dict_to_page(tmpl, 'annotated_body')
                       for tmpl in templates
                       if tmpl.get('page_type') == '_links']
        links_descriptor = create_slybot_item_descriptor(
            {'id': "_links", 'properties': ()})
        if links_pages:
            self._links_ibl_extractor = InstanceBasedLearningExtractor(
                [(links_page, links_descriptor) for links_page in links_pages])
        else:
            self._links_ibl_extractor = None

        self._ipages = [triplet[1] for triplet in item_triplets]
        self._fpages = [dict_to_page(tmpl, 'annotated_body')
                        for tmpl in templates
                        if tmpl.get('page_type', 'item') == 'form']

        self._start_urls = spec.get('start_urls')

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)

        def _join_patterns(regexes):
            # A lone pattern is used verbatim; several are OR-ed together.
            if len(regexes) == 1:
                return regexes[0]
            return "(?:%s)" % '|'.join(regexes)

        # make a filter for links
        respect_nofollow = spec.get('respect_nofollow', True)
        follow_patterns = spec.get('follow_patterns')
        if follow_patterns:
            follow_re = re.compile(_join_patterns(follow_patterns))
            if respect_nofollow:
                def base_filter(link):
                    return follow_re.search(link.url) and not link.nofollow
            else:
                def base_filter(link):
                    return follow_re.search(link.url)
        elif respect_nofollow:
            def base_filter(link):
                return not link.nofollow
        else:
            base_filter = bool

        # apply exclude patterns
        exclude_patterns = spec.get('exclude_patterns')
        if exclude_patterns:
            exclude_re = re.compile(_join_patterns(exclude_patterns))

            def excluding_filter(link):
                return not exclude_re.search(link.url) and base_filter(link)
            self.url_filterf = excluding_filter
        else:
            self.url_filterf = base_filter

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(
            self._default_schema)

        self.itemcls_info = {}
        by_item_class = itertools.groupby(item_triplets,
                                          operator.itemgetter(0))
        for itemclass_name, group in by_item_class:
            schema = item_schemas[itemclass_name]
            if schema:
                item_cls = get_iblitem_class(schema)
            else:
                item_cls = default_item_cls

            page_descriptor_pairs = []
            for _, page, template_extractors in group:
                # A falsy schema falls back to the shared default descriptor.
                if schema:
                    item_descriptor = create_slybot_item_descriptor(schema)
                else:
                    item_descriptor = default_item_descriptor
                apply_extractors(item_descriptor, template_extractors,
                                 all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            # 'descriptor' is the one used for the group's last template.
            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self._itemversion_cache = {}
Exemple #15
0
    def setup_bot(self, settings, spec, items, extractors, logger):
        """
        Perform any initialization needed for crawling using this plugin

        :param settings: object with a ``get`` method; ``AUTO_PAGINATION``
            and ``PAGE_CLUSTERING`` are read from it.
        :param spec: spider specification dict; ``templates`` is required and
            ``links_to_follow`` is consulted. It is also passed to
            ``build_url_filter``.
        :param items: mapping of schema name to schema dict. A schema without
            a ``name`` gets its schema name assigned in place.
        :param extractors: extractor definitions passed to
            ``apply_extractors`` and ``add_extractors_to_descriptors``.
        :param logger: logger stored on ``self.logger`` and used to report
            the clustering status.
        """
        self.logger = logger
        # Materialise the templates: they are iterated several times below,
        # and ``map`` only yields a one-shot iterator on Python 3.
        templates = list(map(self._get_annotated_template, spec['templates']))

        # One [scrapes, page, extractors, version] entry per item template;
        # templates without an explicit page_type count as item templates.
        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', []), t.get('version', '0.12.0')]
            for t in templates if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {template.get('page_id'): template['scrapes']
                                 for template in templates}
        if (settings.get('AUTO_PAGINATION') or
                spec.get('links_to_follow') == 'auto'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            # list() keeps the values subscriptable on Python 3, where
            # ``dict.values()`` is a view; with no schemas at all an empty
            # dict stands in as the fallback descriptor.
            descriptor = list(descriptors.values()) or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        # Split the templates into those older than 0.13.0 and the rest.
        # NOTE(review): versions are compared as plain strings here, so the
        # ordering is lexicographic - confirm that is acceptable.
        grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                           key=operator.itemgetter(2)),
                                    lambda x: x[2] < '0.13.0')
        self.extractors = []
        for is_legacy, group in grouped:
            if is_legacy:
                # Older templates only use their '#default' descriptor.
                self.extractors.append(
                    InstanceBasedLearningExtractor(
                        [(page, scrapes['#default'])
                         for page, scrapes, _version in group]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in templates if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
        # Clustering
        self.template_names = [t.get('page_id') for t in spec['templates']]
        if settings.get('PAGE_CLUSTERING'):
            try:
                # Optional dependency: imported lazily so that a missing
                # package only disables clustering.
                import page_clustering
                self.clustering = page_clustering.kmeans_from_samples(spec['templates'])
                self.logger.info("Clustering activated")
            except ImportError:
                self.clustering = None
                self.logger.warning(
                    "Clustering could not be used because it is not installed")
        else:
            self.clustering = None
Exemple #16
0
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        """Configure the spider from its ``spec``.

        Resolves the default item schema, collects item/links/form template
        pages, sets start URLs and allowed domains, builds the URL filter and
        creates one extractor per scraped item class.

        :param name: spider name, forwarded to the parent constructor.
        :param spec: spider specification dict; ``scrapes`` and ``templates``
            are required, ``start_urls`` is optional. It is also passed to
            ``build_url_filter``.
        :param item_schemas: mapping indexed by item name; a falsy schema
            makes the default item class and descriptor be used instead.
        :param all_extractors: extractor definitions handed to
            ``apply_extractors`` along with each template's own list.
        :param kw: extra keyword arguments for the parent constructor.
        """
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec["scrapes"]
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            warning = "Scraping unknown default item schema: %s" % default_item
            self.log(warning, log.WARNING)

        templates = spec["templates"]

        # One [scrapes, page, extractors] triplet per item template; templates
        # that do not name an item scrape the default one.
        item_triplets = []
        for tmpl in templates:
            if tmpl.get("page_type", "item") != "item":
                continue
            scraped_item = tmpl.get("scrapes", default_item)
            annotated_page = dict_to_page(tmpl, "annotated_body")
            item_triplets.append([scraped_item, annotated_page, tmpl.get("extractors", [])])
        # Sorted by item name so that groupby() below sees each name once.
        item_triplets.sort(key=operator.itemgetter(0))
        self._item_template_pages = item_triplets

        # generate ibl extractor for links pages
        links_pages = []
        for tmpl in templates:
            if tmpl.get("page_type") == "links":
                links_pages.append(dict_to_page(tmpl, "annotated_body"))
        links_descriptor = create_slybot_item_descriptor({"id": "_links", "properties": ()})
        if links_pages:
            links_pairs = [(links_page, links_descriptor) for links_page in links_pages]
            self._links_ibl_extractor = InstanceBasedLearningExtractor(links_pairs)
        else:
            self._links_ibl_extractor = None

        self._ipages = [triplet[1] for triplet in item_triplets]
        form_pages = []
        for tmpl in templates:
            if tmpl.get("page_type", "item") == "form":
                form_pages.append(dict_to_page(tmpl, "annotated_body"))
        self._fpages = form_pages

        # Start URLs already present on the spider win over the spec; a single
        # string is split into one URL per line.
        self.start_urls = self.start_urls or spec.get("start_urls")
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)

        self.build_url_filter(spec)

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(self._default_schema)

        self.itemcls_info = {}
        by_item_class = itertools.groupby(item_triplets, operator.itemgetter(0))
        for itemclass_name, group in by_item_class:
            schema = item_schemas[itemclass_name]
            if schema:
                item_cls = get_iblitem_class(schema)
            else:
                item_cls = default_item_cls

            page_descriptor_pairs = []
            for _, page, template_extractors in group:
                # A falsy schema falls back to the shared default descriptor.
                if schema:
                    item_descriptor = create_slybot_item_descriptor(schema)
                else:
                    item_descriptor = default_item_descriptor
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            # "descriptor" is the one used for the group's last template.
            self.itemcls_info[itemclass_name] = {
                "class": item_cls,
                "descriptor": item_descriptor,
                "extractor": extractor,
            }

        self._itemversion_cache = {}
Exemple #17
0
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin

        :param settings: not read by the code shown here.
        :param spec: spider specification dict; ``templates`` is required. It
            is also passed to ``build_url_filter``.
        :param items: mapping of schema name to schema dict. A schema without
            a ``name`` gets its schema name assigned in place.
        :param extractors: extractor definitions passed to
            ``apply_extractors`` and ``add_extractors_to_descriptors``.
        """
        # One [scrapes, page, extractors, version] entry per item template;
        # templates without an explicit page_type count as item templates.
        _item_template_pages = sorted(([
            t.get('scrapes'),
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', []),
            t.get('version', '0.12.0')
        ] for t in spec['templates'] if t.get('page_type', 'item') == 'item'),
                                      key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {
            template.get('page_id'): template['scrapes']
            for template in spec.get('templates')
        }
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(
                    schema, schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            # list() keeps the values subscriptable on Python 3, where
            # ``dict.values()`` is a view; with no schemas at all an empty
            # dict stands in as the fallback descriptor.
            descriptor = list(descriptors.values()) or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        # Split the templates into those older than 0.13.0 and the rest.
        # NOTE(review): versions are compared as plain strings here, so the
        # ordering is lexicographic - confirm that is acceptable.
        grouped = itertools.groupby(
            sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
            lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                # Older templates only use their '#default' descriptor.
                self.extractors.append(
                    InstanceBasedLearningExtractor([
                        (page, scrapes['#default'])
                        for page, scrapes, _version in group
                    ]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)