Example 1
 def _do_extract_items_from(self, htmlpage, extractor, response=None):
     # Try to predict template to use
     template_cluster, pref_template_id = self._cluster_page(htmlpage)
     extracted, template = extractor.extract(htmlpage, pref_template_id)
     extracted = extracted or []
     link_regions = []
     for ddict in extracted:
         link_regions.extend(arg_to_iter(ddict.pop("_links", [])))
     descriptor = None
     unprocessed = False
     if template is not None and hasattr(template, 'descriptor'):
         descriptor = template.descriptor()
         if hasattr(descriptor, 'name'):
             item_cls_name = descriptor.name
         elif hasattr(descriptor, 'get'):
             item_cls_name = descriptor.get('name',
                                            descriptor.get('display_name'))
         else:
             item_cls_name = ''
     else:
         unprocessed = True
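         # Look up the descriptor and item class by template id; fall back to
         # the first entry (sorted by key) when the template has no id or the
         # id is missing from the mappings.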
         try:
             descriptor = self.schema_descriptors[template.id]
             item_cls_name = self.template_scrapes[template.id]
         except (AttributeError, KeyError):
             descriptor = sorted(self.schema_descriptors.items())[0][1]
             item_cls_name = sorted(self.template_scrapes.items())[0][1]
     item_cls = self.item_classes.get(item_cls_name)
     items = []
     for processed_attributes in extracted:
         if processed_attributes.get('_type') in self.item_classes:
             _type = processed_attributes['_type']
             item = self.item_classes[_type](processed_attributes)
             item['_type'] = item.display_name()
         elif unprocessed:
             item = self._process_attributes(processed_attributes,
                                             descriptor, htmlpage)
             if item_cls:
                 item = item_cls(item)
         elif item_cls:
             item = item_cls(processed_attributes)
         else:
             item = dict(processed_attributes)
         item[u'url'] = htmlpage.url
         item[u'_template'] = str(template.id)
         item.setdefault('_type', item_cls_name)
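         # Plain dicts are wrapped in an item class generated from their keys,
         # with every field defaulting to a simple text attribute.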
         if not isinstance(item, SlybotItem):
             default_meta = {
                 'type': 'text',
                 'required': False,
                 'vary': False
             }
             item_cls = SlybotItem.create_iblitem_class(
                 {'fields': {k: default_meta
                             for k in item}})
             item = item_cls(**item)
         if self.clustering:
             item['_template_cluster'] = template_cluster
         items.append(item)
     return items, link_regions
Example 2
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', []), t.get('version', '0.12.0')]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {template.get('page_id'): template['scrapes']
                                 for template in spec.get('templates')}
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
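        # Build a descriptor per schema for each template and expose the schema
        # the template scrapes (or the first one) as '#default'.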
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

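        # Split templates by spec version: those older than 0.13.0 go through
        # InstanceBasedLearningExtractor, newer ones through SlybotIBLExtractor.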
        grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                           key=operator.itemgetter(2)),
                                    lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor(
                        [(page, scrapes['#default'])
                         for page, scrapes, version in group]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
Example 3
 def _do_extract_items_from(self, htmlpage, extractor, response=None):
     # Try to predict template to use
     template_cluster, pref_template_id = self._cluster_page(htmlpage)
     extracted, template = extractor.extract(htmlpage, pref_template_id)
     extracted = extracted or []
     link_regions = []
     for ddict in extracted:
         link_regions.extend(arg_to_iter(ddict.pop("_links", [])))
     descriptor = None
     unprocessed = False
     if template is not None and hasattr(template, 'descriptor'):
         descriptor = template.descriptor()
         if hasattr(descriptor, 'name'):
             item_cls_name = descriptor.name
         elif hasattr(descriptor, 'get'):
             item_cls_name = descriptor.get('name',
                                            descriptor.get('display_name'))
         else:
             item_cls_name = ''
     else:
         unprocessed = True
         try:
             descriptor = self.schema_descriptors[template.id]
             item_cls_name = self.template_scrapes[template.id]
         except (AttributeError, KeyError):
             try:
                 descriptor = sorted(self.schema_descriptors.items())[0][1]
                 item_cls_name = sorted(self.template_scrapes.items())[0][1]
             except IndexError:
                 descriptor, item_cls_name = None, None
     item_cls = self.item_classes.get(item_cls_name)
     items = []
     for processed_attributes in extracted:
         if processed_attributes.get('_type') in self.item_classes:
             _type = processed_attributes['_type']
             item = self.item_classes[_type](processed_attributes)
             item['_type'] = item.display_name()
         elif unprocessed:
             item = self._process_attributes(processed_attributes,
                                             descriptor, htmlpage)
             if item_cls:
                 item = item_cls(item)
         elif item_cls:
             item = item_cls(processed_attributes)
         else:
             item = dict(processed_attributes)
         item[u'url'] = htmlpage.url
         item[u'_template'] = str(template.id)
         item.setdefault('_type', item_cls_name)
         if not isinstance(item, SlybotItem):
             default_meta = {'type': 'text', 'required': False,
                             'vary': False}
             item_cls = SlybotItem.create_iblitem_class(
                 {'fields': {k: default_meta for k in item}}
             )
             item = item_cls(**item)
         if self.clustering:
             item['_template_cluster'] = template_cluster
         items.append(item)
     return items, link_regions
Example 4
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)

        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, basestring) and key in ['start_urls', 'exclude_patterns', 'follow_patterns', 'allowed_domains']:
                val = val.splitlines()
            spec[key] = val
        self.i = time.time()
        self.getProxyList()
        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])
        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.build_url_filter(spec)

        self.itemcls_info = {}
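        # Group templates by the item class they scrape and build one
        # InstanceBasedLearningExtractor per item class.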
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None
Example 5
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, basestring) and key in ['start_urls', 'exclude_patterns', 'follow_patterns', 'allowed_domains']:
                val = val.splitlines()
            spec[key] = val

        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.build_url_filter(spec)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None
Example 6
    def _do_extract_items_from(self, htmlpage, extractor):
        extracted_data, template = extractor.extract(htmlpage)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        descriptor = None
        unprocessed = False
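        # The template descriptor may be a plain dict (item name under 'name')
        # or an object exposing a 'name' attribute; handle both shapes below.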
        if template is not None and hasattr(template, 'descriptor'):
            descriptor = template.descriptor()
            if descriptor is None:
                item_cls_name = ''
            elif isinstance(descriptor, dict):
                item_cls_name = descriptor.get('name', '')
            else:
                item_cls_name = descriptor.name
        else:
            unprocessed = True
            try:
                descriptor = self.schema_descriptors[template.id]
                item_cls_name = self.template_scrapes[template.id]
            except AttributeError:
                descriptor = sorted(self.schema_descriptors.items())[0][1]
                item_cls_name = sorted(self.template_scrapes.items())[0][1]
        item_cls = self.item_classes.get(item_cls_name)
        items = []
        for processed_attributes in extracted_data or []:
            if processed_attributes.get('_type') in self.item_classes:
                _type = processed_attributes['_type']
                item = self.item_classes[_type](processed_attributes)
                item['_type'] = item.display_name()
            elif unprocessed:
                item = self._process_attributes(processed_attributes,
                                                descriptor, htmlpage)
                if item_cls:
                    item = item_cls(item)
            elif item_cls:
                item = item_cls(processed_attributes)
            else:
                item = dict(processed_attributes)
            item['url'] = htmlpage.url
            item['_template'] = str(template.id)
            item.setdefault('_type', item_cls_name)
            if not isinstance(item, SlybotItem):
                default_meta = {
                    'type': 'text',
                    'required': False,
                    'vary': False
                }
                item_cls = SlybotItem.create_iblitem_class(
                    {'fields': {k: default_meta
                                for k in item}})
                item = item_cls(**item)
            items.append(item)

        return items, link_regions
Example 7
    def _do_extract_items_from(self, htmlpage, extractor):
        extracted_data, template = extractor.extract(htmlpage)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        descriptor = None
        unprocessed = False
        if template is not None and hasattr(template, 'descriptor'):
            descriptor = template.descriptor()
            if descriptor is None:
                item_cls_name = ''
            elif isinstance(descriptor, dict):
                item_cls_name = descriptor.get('name', '')
            else:
                item_cls_name = descriptor.name
        else:
            unprocessed = True
            try:
                descriptor = self.schema_descriptors[template.id]
                item_cls_name = self.template_scrapes[template.id]
            except AttributeError:
                descriptor = sorted(self.schema_descriptors.items())[0][1]
                item_cls_name = sorted(self.template_scrapes.items())[0][1]
        item_cls = self.item_classes.get(item_cls_name)
        items = []
        for processed_attributes in extracted_data or []:
            if processed_attributes.get('_type') in self.item_classes:
                _type = processed_attributes['_type']
                item = self.item_classes[_type](processed_attributes)
                item['_type'] = item.display_name()
            elif unprocessed:
                item = self._process_attributes(processed_attributes,
                                                descriptor, htmlpage)
                if item_cls:
                    item = item_cls(item)
            elif item_cls:
                item = item_cls(processed_attributes)
            else:
                item = dict(processed_attributes)
            item['url'] = htmlpage.url
            item['_template'] = str(template.id)
            item.setdefault('_type', item_cls_name)
            if not isinstance(item, SlybotItem):
                default_meta = {'type': 'text', 'required': False,
                                'vary': False}
                item_cls = SlybotItem.create_iblitem_class(
                    {'fields': {k: default_meta for k in item}}
                )
                item = item_cls(**item)
            items.append(item)

        return items, link_regions
Example 8
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted(([
            t['scrapes'],
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])
        ] for t in spec['templates'] if t.get('page_type', 'item') == 'item'),
                                      key=lambda pair: pair[0])

        self.itemcls_info = {}
        if settings.get('AUTO_PAGINATION'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for itemclass_name, triplets in groupby(_item_template_pages,
                                                itemgetter(0)):
            page_extractors_pairs = map(itemgetter(1, 2), triplets)
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
Example 9
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        self.itemcls_info = {}
        if settings.get('AUTO_PAGINATION'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for itemclass_name, triplets in groupby(_item_template_pages,
                                                itemgetter(0)):
            page_extractors_pairs = map(itemgetter(1, 2), triplets)
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
Example 10
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted(
            (
                [t["scrapes"], dict_to_page(t, "annotated_body"), t.get("extractors", [])]
                for t in spec["templates"]
                if t.get("page_type", "item") == "item"
            ),
            key=lambda pair: pair[0],
        )

        self.itemcls_info = {}
        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        for itemclass_name, triplets in groupby(_item_template_pages, itemgetter(0)):
            page_extractors_pairs = map(itemgetter(1, 2), triplets)
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                "class": item_cls,
                "descriptor": item_descriptor,
                "extractor": extractor,
            }

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, "annotated_body") for t in spec["templates"] if t.get("page_type") == "links"]
        _links_item_descriptor = create_slybot_item_descriptor({"fields": {}})
        self._links_ibl_extractor = (
            InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages])
            if _links_pages
            else None
        )

        self.build_url_filter(spec)
Example 11
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ))
        self.item_classes = {}
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        for default, template, template_extractors in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            page_descriptor_pairs.append((template, descriptors))

        self.extractors = SlybotIBLExtractor(page_descriptor_pairs)

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
Example 12
    def setup_bot(self, settings, spec, items, extractors, logger):
        """
        Perform any initialization needed for crawling using this plugin
        """
        self.logger = logger
        templates = map(self._get_annotated_template, spec['templates'])

        _item_template_pages = sorted(([
            t.get('scrapes'),
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', []),
            t.get('version', '0.12.0')
        ] for t in templates if t.get('page_type', 'item') == 'item'),
                                      key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {
            template.get('page_id'): template['scrapes']
            for template in templates
        }
        if (settings.get('AUTO_PAGINATION')
                or spec.get('links_to_follow') == 'auto'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(
                    schema, schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(
            sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
            lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor([
                        (page, scrapes['#default'])
                        for page, scrapes, version in group
                    ]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in templates
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
        # Clustering
        self.template_names = [t.get('page_id') for t in spec['templates']]
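        # Page clustering is optional: fall back gracefully when the
        # page_clustering package is not installed.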
        if settings.get('PAGE_CLUSTERING'):
            try:
                import page_clustering
                self.clustering = page_clustering.kmeans_from_samples(
                    spec['templates'])
                self.logger.info("Clustering activated")
            except ImportError:
                self.clustering = None
                self.logger.warning(
                    "Clustering could not be used because it is not installed")
        else:
            self.clustering = None
Example 13
    def setup_bot(self, settings, spec, items, extractors, logger):
        """
        Perform any initialization needed for crawling using this plugin
        """
        self.logger = logger
        templates = map(self._get_annotated_template, spec['templates'])

        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', []), t.get('version', '0.12.0')]
            for t in templates if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {template.get('page_id'): template['scrapes']
                                 for template in templates}
        if (settings.get('AUTO_PAGINATION') or
                spec.get('links_to_follow') == 'auto'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                           key=operator.itemgetter(2)),
                                    lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor(
                        [(page, scrapes['#default'])
                         for page, scrapes, version in group]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in templates if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
        # Clustering
        self.template_names = [t.get('page_id') for t in spec['templates']]
        if settings.get('PAGE_CLUSTERING'):
            try:
                import page_clustering
                self.clustering = page_clustering.kmeans_from_samples(spec['templates'])
                self.logger.info("Clustering activated")
            except ImportError:
                self.clustering = None
                self.logger.warning(
                    "Clustering could not be used because it is not installed")
        else:
            self.clustering = None
Example 14
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)

        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None
        self.build_url_filter(spec)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
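        # Turn the spec's "init_requests" into login and form start requests.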
        for rdata in spec.get("init_requests", []):
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page, dont_filter=True)
                self.login_requests.append(request)

            elif rdata["type"] == "form":
                self.generic_form = GenericForm(**kw)
                self.form_requests.append(self.get_generic_form_start_request(rdata))
Example 15
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted(([
            t.get('scrapes'),
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', []),
            t.get('version', '0.12.0')
        ] for t in spec['templates'] if t.get('page_type', 'item') == 'item'),
                                      key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {
            template.get('page_id'): template['scrapes']
            for template in spec.get('templates')
        }
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(
                    schema, schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(
            sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
            lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor([
                        (page, scrapes['#default'])
                        for page, scrapes, version in group
                    ]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)