Example #1
 def scrape_page2(self, page, fields_spec):
     if self._ex is None:
         self._ex = InstanceBasedLearningExtractor(
             ((t, get_visual_tool_item_descriptor(fields_spec))
              for t in self._templates), False, True)
     res = self._ex.extract(page)[0]
     return res
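All of these snippets share the same calling pattern: InstanceBasedLearningExtractor is built from (template, item descriptor) pairs, and its extract() method returns a (data, template) pair, which is why the code above indexes the result with [0]. A minimal sketch of that flow, assuming scrapely is installed and that template_page and target_page are HtmlPage objects you already have:

    from scrapely.extraction import InstanceBasedLearningExtractor

    # template_page and target_page are assumed to be scrapely HtmlPage objects
    ex = InstanceBasedLearningExtractor([(template_page, None)])  # None = no item descriptor
    data, matched_template = ex.extract(target_page)  # data is a list of dicts, or None if nothing matched
    if data:
        print(data[0])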
Example #2
 def test_extractor_w_empty_string_extraction(self):
     schema = {
         "id": "test",
         "properties": [
             ('gender', {
                 'description': '',
                 'optional': True,
                 'type': 'text',
                 'vary': False,
             }),
             ('name', {
                 'description': '',
                 'optional': False,
                 'type': 'text',
                 'vary': False,
             }),
         ],
     }
     descriptor = create_slybot_item_descriptor(schema)
     extractors =  {
                 1: {
                     "_id": 2,
                     "field_name": "gender",
                     "regular_expression": "([0-9]+)"
                 }
     }
     apply_extractors(descriptor, [1], extractors)
     
     ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)])
     self.assertEqual(ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
Example #3
 def test_extractor_w_empty_string_extraction(self):
     schema = {
         'fields': {
             'gender': {
                 'required': False,
                 'type': 'text',
                 'vary': False,
             },
             'name': {
                 'required': True,
                 'type': 'text',
                 'vary': False,
             }
         }
     }
     descriptor = create_slybot_item_descriptor(schema)
     extractors =  {
                 1: {
                     "regular_expression": "([0-9]+)"
                 }
     }
     apply_extractors(descriptor, {"gender": [1]}, extractors)
     
     ibl_extractor = InstanceBasedLearningExtractor([(self.template2, descriptor)])
     self.assertEqual(ibl_extractor.extract(self.target2)[0][0], {u'name': [u'Name Olivia']})
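The point of this test is that the "([0-9]+)" extractor finds no digits in the extracted gender text, so that field is dropped and only name survives. A rough illustration of that filtering behaviour with plain re (slybot wraps this logic in its field descriptors, so this is a sketch, not its actual code):

    import re

    def apply_regex_extractor(pattern, value):
        # keep the first captured group when the pattern matches, otherwise drop the value
        m = re.search(pattern, value)
        return m.group(1) if m else None

    print(apply_regex_extractor(r"([0-9]+)", "Male"))                       # None -> gender is discarded
    print(apply_regex_extractor(r"Gender\s+(Male|Female)", "Gender Male"))  # 'Male'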
Example #4
    def scrape(self, url=None, html=None, encoding='utf-8'):
        ## not the version from https://github.com/scrapy/scrapely/blob/master/scrapely/extraction/pageparsing.py;
        ## may need to be replaced with the version from inspect.getsourcelines(Scraper.scrape)

        page = self._get_page(url, encoding, html)
        ex = InstanceBasedLearningExtractor(self.templates)
        return ex.extract(page)[0]
Example #5
    def _run_extraction(self, name, templates, page, extractors, expected_output):
        self.trace = None
        template_pages = [HtmlPage(None, {}, t) for t in templates]
        extractor = InstanceBasedLearningExtractor(template_pages, extractors, True)
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if not actual_output:
            if expected_output is None:
                return
            assert False, "failed to extract data for test '%s'" % name
        actual_output = actual_output[0]
        self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace', [])
        expected_names = set(expected_output.keys())
        actual_names = set(actual_output.keys())
        
        missing_in_output = filter(None, expected_names - actual_names)
        error = "attributes '%s' were expected but were not present in test '%s'" % \
                ("', '".join(missing_in_output), name)
        assert len(missing_in_output) == 0, error

        unexpected = actual_names - expected_names
        error = "unexpected attributes %s in test '%s'" % \
                (', '.join(unexpected), name)
        assert len(unexpected) == 0, error

        for k, v in expected_output.items():
            extracted = actual_output[k]
            assert v == extracted, "in test '%s' for attribute '%s', " \
                "expected value '%s' but got '%s'" % (name, k, v, extracted)
Example #6
    def scrape(self, url=None, html=None, encoding='utf-8'):
        ## not the version from https://github.com/scrapy/scrapely/blob/master/scrapely/extraction/pageparsing.py;
        ## may need to be replaced with the version from inspect.getsourcelines(Scraper.scrape)

        page = self._get_page(url, encoding, html)
        ex = InstanceBasedLearningExtractor(self.templates)
        return ex.extract(page)[0]
Example #7
 def test_annotate_multiple(self):
     tm = TemplateMaker(self.PAGE)
     tm.annotate('field1', best_match('text to annotate'), best_match=False)
     tpl = tm.get_template()
     ex = InstanceBasedLearningExtractor([(tpl, None)])
     self.assertEqual(ex.extract(self.PAGE)[0],
         [{u'field1': [u'Some text to annotate here', u'Another text to annotate there']}])
Example #8
    def test_type_extractor(self):
        schema = {
            "fields": {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {
                "type_extractor": "text"
            },
            2: {
                "regular_expression": "Gender\\s+(Male|Female)"
            }
        }
        apply_extractors(descriptor, {"gender": [1, 2]}, extractors)

        ibl_extractor = InstanceBasedLearningExtractor([(self.template,
                                                         descriptor)])
        self.assertEqual(
            ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
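Two extractors are attached to the same field here: extractor 1 overrides the declared number type with a text type, and extractor 2 then narrows the value with a regex. The chain behaves roughly like the plain-Python sketch below (illustrative only, not slybot's actual classes):

    import re

    def text_extractor(value):
        # accept the raw text instead of rejecting it for not being a number
        return value

    def regex_extractor(value, pattern=r"Gender\s+(Male|Female)"):
        m = re.search(pattern, value)
        return m.group(1) if m else None

    value = "Gender    Male"
    for step in (text_extractor, regex_extractor):
        value = step(value)
        if value is None:
            break
    print(value)  # 'Male'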
Example #9
 def test_annotate_ignore_unpaired(self):
     tm = TemplateMaker(self.PAGE)
     tm.annotate('field1', best_match("and that's"), best_match=False)
     tpl = tm.get_template()
     ex = InstanceBasedLearningExtractor([(tpl, None)])
     self.assertEqual(ex.extract(self.PAGE)[0],
         [{u'field1': [u"More text with unpaired tag <img />and that's it"]}])
Example #10
 def test_annotate_ignore_unpaired(self):
     tm = TemplateMaker(self.PAGE)
     tm.annotate('field1', best_match("and that's"), best_match=False)
     tpl = tm.get_template()
     ex = InstanceBasedLearningExtractor([(tpl, None)])
     self.assertEqual(ex.extract(self.PAGE)[0],
         [{u'field1': [u"More text with unpaired tag <img />and that's it"]}])
Example #11
 def test_annotate_multiple(self):
     tm = TemplateMaker(self.PAGE)
     tm.annotate('field1', best_match('text to annotate'), best_match=False)
     tpl = tm.get_template()
     ex = InstanceBasedLearningExtractor([(tpl, None)])
     self.assertEqual(ex.extract(self.PAGE)[0],
         [{u'field1': [u'Some text to annotate here', u'Another text to annotate there']}])
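The TemplateMaker tests above rely on a class-level self.PAGE fixture. A self-contained variant of the same round trip, with made-up HTML (the value in the final comment is what scrapely would be expected to return for this kind of single-annotation template):

    from scrapely.htmlpage import HtmlPage
    from scrapely.template import TemplateMaker, best_match
    from scrapely.extraction import InstanceBasedLearningExtractor

    train_html = u'<html><body><h1>Old Product</h1><p>Price: 10 EUR</p></body></html>'
    target_html = u'<html><body><h1>New Product</h1><p>Price: 20 EUR</p></body></html>'

    tm = TemplateMaker(HtmlPage(None, {}, train_html))
    tm.annotate('title', best_match('Old Product'))
    ex = InstanceBasedLearningExtractor([(tm.get_template(), None)])
    print(ex.extract(HtmlPage(None, {}, target_html))[0])
    # expected to yield something like [{u'title': [u'New Product']}]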
Example #12
    def test_extraction(self, name, templates, page, descriptor, expected_output):
        template_pages = [HtmlPage(None, {}, t) for t in templates]

        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))

        self.assertEqual(expected_output, actual_output and actual_output[0])
Example #13
 def test_type_extractor(self):
     schema = {
         "id": "test",
         "properties": [('gender', {
                 'description': '',
                 'optional': True,
                 'type': 'number',
                 'vary': False,
         })],
     }
     descriptor = create_slybot_item_descriptor(schema)
     extractors =  {
                 1: {
                     "_id": 1,
                     "field_name": "gender",
                     "type_extractor": "text"
                 },
                 2: {
                     "_id": 2,
                     "field_name": "gender",
                     "regular_expression": "Gender\\s+(Male|Female)"
                 }
     }
     apply_extractors(descriptor, [1, 2], extractors)
     
     ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
     self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Example #14
 def do_s(self, url):
     """s <url> - scrape url (uses encoding from templates)"""
     templates = self._load_templates()
     if assert_or_print(templates, "no templates available"):
         return
     page = get_page(url, templates[0].encoding)
     ex = InstanceBasedLearningExtractor(templates)
     pprint.pprint(ex.extract(page)[0])
Example #15
 def test_annotate_multiple(self):
     tm = TemplateMaker(self.PAGE)
     tm.annotate("field1", best_match("text to annotate"), best_match=False)
     tpl = tm.get_template()
     ex = InstanceBasedLearningExtractor([tpl])
     self.assertEqual(
         ex.extract(self.PAGE)[0], [{u"field1": [u"Some text to annotate here", u"Another text to annotate there"]}]
     )
Example #16
    def test_extraction(self, name, templates, page, descriptor,
                        expected_output):
        template_pages = [HtmlPage(None, {}, t) for t in templates]

        extractor = InstanceBasedLearningExtractor([(t, descriptor)
                                                    for t in template_pages])
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))

        self.assertEqual(expected_output, actual_output and actual_output[0])
Example #17
 def do_s(self, url):
     """s <url> - scrape url"""
     templates = self._load_templates()
     if assert_or_print(templates, "no templates available"):
         return
     # fall back to the template encoding if none is specified
     page = url_to_page(url, default_encoding=templates[0].encoding)
     ex = InstanceBasedLearningExtractor((t, None) for t in templates)
     pprint.pprint(ex.extract(page)[0])
Example #18
 def do_scrape(self, url):
     """scrape <url> - scrape url (alias: s)"""
     templates = self._load_templates()
     if assert_or_print(templates, "no templates available"):
         return
     # fall back to the template encoding if none is specified
     page = url_to_page(url, default_encoding=templates[0].encoding)
     ex = InstanceBasedLearningExtractor((t, None) for t in templates)
     pprint.pprint(ex.extract(page)[0])
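Outside the interactive shell, the same flow needs only a few lines. A sketch, assuming templates is a non-empty list of annotated HtmlPage templates:

    from scrapely.htmlpage import url_to_page
    from scrapely.extraction import InstanceBasedLearningExtractor

    def scrape_url(url, templates):
        # build (template, descriptor) pairs with no descriptor, as do_scrape does above
        ex = InstanceBasedLearningExtractor((t, None) for t in templates)
        # fall back to the first template's encoding, mirroring the command above
        page = url_to_page(url, default_encoding=templates[0].encoding)
        return ex.extract(page)[0]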
Example #19
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        spec = deepcopy(spec)
        for key, val in kw.items():
            if isinstance(val, basestring) and key in ['start_urls', 'exclude_patterns', 'follow_patterns', 'allowed_domains']:
                val = val.splitlines()
            spec[key] = val

        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.build_url_filter(spec)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        self._start_requests = []
        self.generic_form = GenericForm(**kw)
        self._create_init_requests(spec.get("init_requests", []))
        self._process_start_urls(spec)
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None
Example #20
    def test_default_type_extractor(self):
        schema = {'fields': {}}
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = InstanceBasedLearningExtractor([(self.template,
                                                         descriptor)])
        self.assertEqual(
            ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Example #21
 def do_s(self, line):
     """s <url> [--encoding ENCODING --useragent 'User-Agent'] - scrape url"""
     templates = self._load_templates()
     if assert_or_print(templates, "no templates available"):
         return
     opts, (url,) = parse_at_s(line)
     headers = { 'User-Agent' : opts.useragent or self.user_agent }
     url = urllib2.Request(url, headers=headers)
     # fall back to the template encoding if none is specified
     page = url_to_page(url, opts.encoding, templates[0].encoding)
     ex = InstanceBasedLearningExtractor((t, None) for t in templates)
     pprint.pprint(ex.extract(page)[0])
Example #22
 def test_default_type_extractor(self):
     schema = {
         'fields': {}
     }
     descriptor = create_slybot_item_descriptor(schema)
     extractors =  {
                 1: {
                     "regular_expression": "Gender\\s+(Male|Female)"
                 }
     }
     apply_extractors(descriptor, {"gender": [1]}, extractors)
     
     ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
     self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Example #23
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted(([
            t['scrapes'],
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])
        ] for t in spec['templates'] if t.get('page_type', 'item') == 'item'),
                                      key=lambda pair: pair[0])

        self.itemcls_info = {}
        if settings.get('AUTO_PAGINATION'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for itemclass_name, triplets in groupby(_item_template_pages,
                                                itemgetter(0)):
            page_extractors_pairs = map(itemgetter(1, 2), triplets)
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
Example #24
def get_extractor(site_id):
    sds = ScraperDescriptor.objects.filter(site__id=site_id)
    if not sds.exists():
        return

    tmpls = []

    for s in sds:
        items = s.items.filter(descriptor__target__symbol='ProductInfo')
        idesc = ItemDescriptor('', '', [
            FieldDescriptor(i.descriptor.symbol,
                            i.descriptor.desc,
                            extractor=types[i.descriptor.typ.symbol](i.value))
            for i in items
        ])
        ts = load_templates(s.id)
        tmpls += [(t, idesc) for _, t in ts]
    if tmpls:
        ex = InstanceBasedLearningExtractor(tmpls)

        def extractor(response):
            page = HtmlPage(response.url,
                            headers=response.headers,
                            body=response.body.decode(response.encoding),
                            encoding=response.encoding)
            extract = ex.extract(page)
            if extract[0] is not None:
                for e in extract[0]:
                    yield e

        return extractor
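A hedged sketch of how the closure returned by get_extractor might be used inside a Scrapy spider callback (the spider, its site_id attribute and parse_product are assumptions made for illustration):

    def parse_product(self, response):
        extractor = get_extractor(self.site_id)  # None when the site has no ScraperDescriptor
        if extractor is None:
            return
        for item in extractor(response):  # the closure yields extracted dicts
            yield item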
Example #25
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', []), t.get('version', '0.12.0')]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {template.get('page_id'): template['scrapes']
                                 for template in spec.get('templates')}
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                           key=operator.itemgetter(2)),
                                    lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor(
                        [(page, scrapes['#default'])
                         for page, scrapes, version in group]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
Example #26
 def test_default_type_extractor(self):
     schema = {
         "id": "test",
         "properties": [],
     }
     descriptor = create_slybot_item_descriptor(schema)
     extractors =  {
                 1: {
                     "_id": 1,
                     "field_name": "gender",
                     "regular_expression": "Gender\\s+(Male|Female)"
                 }
     }
     apply_extractors(descriptor, [1], extractors)
     
     ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
     self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Example #27
    def test_negative_hit_w_regex(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = InstanceBasedLearningExtractor([(self.template,
                                                         descriptor)])
        self.assertEqual(ibl_extractor.extract(self.target)[0], None)
Example #28
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec['scrapes']
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item, \
                log.WARNING)
        
        self._item_template_pages = sorted((
            [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'), 
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'id': "_links", 'properties': ()})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]
        self._fpages = [
            dict_to_page(t, 'annotated_body')
            for t in spec['templates'] if t.get('page_type', 'item') == 'form'
        ]
        
        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)
        
        self.build_url_filter(spec)

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(self._default_schema)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema) if schema else default_item_cls

            page_descriptor_pairs = []
            for page, extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }
Example #29
 def test_text_type_w_regex(self):
     schema = {
         "fields": {
             'gender': {
                 'required': False,
                 'type': 'text',
                 'vary': False,
             }
         }
     }
     descriptor = create_slybot_item_descriptor(schema)
     extractors =  {1: {
                     "regular_expression": "Gender\\s+(Male|Female)"
     }}
     apply_extractors(descriptor, {"gender": [1]}, extractors)
     
     ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
     self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Male']})
Example #30
    def test_raw_type_w_regex(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'raw',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors =  {1: {
                        "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)"
        }}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'<td >Male</td>']})
Example #31
    def test_text_type_w_regex_and_no_groups(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "Gender"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = InstanceBasedLearningExtractor([(self.template,
                                                         descriptor)])
        self.assertEqual(
            ibl_extractor.extract(self.target)[0][0], {u'gender': [u'Gender']})
Example #32
class ScraperXPath(Scraper):
    """
    This extractor considers 'xpath_annotations' when training
    """

    default_data_attrs = {}

    def _train_implementation(self,
                              htmlpage,
                              data,
                              data_attrs=None,
                              repeated=False):
        assert data, "Cannot train with empty data"
        if not repeated:
            best_match = True
        else:
            best_match = False
        if data_attrs is None:
            data_attrs = self.default_data_attrs
        # assume that `data` has xpathes for each field and annotate it with
        # 'xpath_annotation' html attributes
        htmlpage.body = prepare_html(htmlpage.body, data)
        # train using xpath annotations
        tm = TemplateMakerWithAttrs(htmlpage)
        for field, values in data.items():
            if not hasattr(values, '__iter__'):
                values = [values]
            if field in data_attrs:
                attr = data_attrs[field]
            else:
                attr = None
            if len(list(values)) > 0:
                tm.annotate(field,
                            best_match_xpath_annotation(field),
                            attr=attr,
                            best_match=best_match)
        template = tm.get_template()
        # remove xpath annotations from resulting html page,
        # cause they will interfere with IBL Extractor
        template = filter_annotation_from_template(template)
        self.add_template(template)

    def train_from_htmlpage(self, htmlpage, data, data_attrs=None):
        self._train_implementation(htmlpage, data, data_attrs, repeated=False)

    def train_from_htmlpage_repeated(self, htmlpage, data, data_attrs=None):
        self._train_implementation(htmlpage, data, data_attrs, repeated=True)

    def scrape_page2(self, page, fields_spec):
        if self._ex is None:
            self._ex = InstanceBasedLearningExtractor(
                ((t, get_visual_tool_item_descriptor(fields_spec))
                 for t in self._templates), False, True)
        res = self._ex.extract(page)[0]
        return res
Example #33
    def test_raw_type_w_regex(self):
        schema = {
            "id": "test",
            "properties": [('gender', {
                    'description': '',
                    'optional': True,
                    'type': 'raw',
                    'vary': False,
            })],
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors =  {1: {
                        "_id": 1,
                        "field_name": "gender",
                        "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)"
        }}
        apply_extractors(descriptor, [1], extractors)

        ibl_extractor = InstanceBasedLearningExtractor([(self.template, descriptor)])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0], {u'gender': [u'<td >Male</td>']})
Example #34
    def _run_extraction(self, name, templates, page, descriptor, expected_output):
        self.trace = None
        template_pages = [HtmlPage(None, {}, t) for t in templates]
        # extracts with trace enabled in order to generate traceback
        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True)
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if actual_output is not None:
            actual_output = actual_output[0]
            self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
        # extracts again with trace disabled in order to get the pure output
        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if actual_output is None:
            if expected_output is None:
                return
            assert False, "failed to extract data for test '%s'" % name
        else:
            actual_output = actual_output[0]
        expected_names = set(expected_output.keys())
        actual_names = set(actual_output.keys())
        
        missing_in_output = filter(None, expected_names - actual_names)
        error = "attributes '%s' were expected but were not present in test '%s'" % \
                ("', '".join(missing_in_output), name)
        assert len(missing_in_output) == 0, error

        unexpected = actual_names - expected_names
        error = "unexpected attributes %s in test '%s'" % \
                (', '.join(unexpected), name)
        assert len(unexpected) == 0, error

        for k, v in expected_output.items():
            extracted = actual_output[k]
            assert v == extracted, "in test '%s' for attribute '%s', " \
                "expected value '%s' but got '%s'" % (name, k, v, extracted)
Example #35
    def test_raw_type_w_regex(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'raw',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {
                "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)"
            }
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = InstanceBasedLearningExtractor([(self.template,
                                                         descriptor)])
        self.assertEqual(
            ibl_extractor.extract(self.target)[0][0],
            {u'gender': [u'<td >Male</td>']})
Example #36
class Scraper(object):

    def __init__(self, templates=None):
        """Initialize an empty scraper."""
        self._templates = templates or []
        self._ex = None

    @classmethod
    def fromfile(cls, file):
        """Initialize a scraper from a file previously stored by tofile()
        method.
        """
        templates = [HtmlPage(**x) for x in json.load(file)['templates']]
        return cls(templates)

    def tofile(self, file):
        """Store the scraper into the given file-like object"""
        tpls = [page_to_dict(x) for x in self._templates]
        json.dump({'templates': tpls}, file)

    def add_template(self, template):
        self._templates.append(template)
        self._ex = None

    def train_from_htmlpage(self, htmlpage, data):
        assert data, "Cannot train with empty data"
        tm = TemplateMaker(htmlpage)
        for field, values in data.items():
            if (isinstance(values, (bytes, str)) or
                    not hasattr(values, '__iter__')):
                values = [values]
            for value in values:
                value = str_to_unicode(value, htmlpage.encoding)
                tm.annotate(field, best_match(value))
        self.add_template(tm.get_template())

    def train(self, url, data, encoding=None):
        page = url_to_page(url, encoding)
        self.train_from_htmlpage(page, data)

    def scrape(self, url, encoding=None):
        page = url_to_page(url, encoding)
        return self.scrape_page(page)

    def scrape_page(self, page):
        if self._ex is None:
            self._ex = InstanceBasedLearningExtractor((t, None) for t in
                    self._templates)
        return self._ex.extract(page)[0]
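Typical use of this Scraper class, in line with the scrapely README (the URLs and field values are placeholders):

    from scrapely import Scraper

    s = Scraper()
    # train on one page by giving an example value for each field of interest
    s.train('http://example.com/products/1', {'name': 'Example product', 'price': '9.99'})
    # then scrape structurally similar pages with the learned template
    print(s.scrape('http://example.com/products/2'))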
Example #37
class Scraper(object):

    def __init__(self, templates=None):
        """Initialize an empty scraper."""
        self._templates = templates or []
        self._ex = None

    @classmethod
    def fromfile(cls, file):
        """Initialize a scraper from a file previously stored by tofile()
        method.
        """
        templates = [HtmlPage(**x) for x in json.load(file)['templates']]
        return cls(templates)

    def tofile(self, file):
        """Store the scraper into the given file-like object"""
        tpls = [page_to_dict(x) for x in self._templates]
        json.dump({'templates': tpls}, file)

    def add_template(self, template):
        self._templates.append(template)
        self._ex = None

    def train_from_htmlpage(self, htmlpage, data):
        assert data, "Cannot train with empty data"
        tm = TemplateMaker(htmlpage)
        for field, values in data.items():
            if not hasattr(values, '__iter__'):
                values = [values]
            for value in values:
                if isinstance(value, str):
                    value = value.decode(htmlpage.encoding or 'utf-8')
                tm.annotate(field, best_match(value))
        self.add_template(tm.get_template())

    def train(self, url, data, encoding=None):
        page = url_to_page(url, encoding)
        self.train_from_htmlpage(page, data)

    def scrape(self, url, encoding=None):
        page = url_to_page(url, encoding)
        return self.scrape_page(page)

    def scrape_page(self, page):
        if self._ex is None:
            self._ex = InstanceBasedLearningExtractor((t, None) for t in
                    self._templates)
        return self._ex.extract(page)[0]
Example #38
    def test_extractor_w_empty_string_extraction(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                },
                'name': {
                    'required': True,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "([0-9]+)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = InstanceBasedLearningExtractor([(self.template2,
                                                         descriptor)])
        self.assertEqual(
            ibl_extractor.extract(self.target2)[0][0],
            {u'name': [u'Name Olivia']})
Example #39
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        self.itemcls_info = {}
        if settings.get('AUTO_PAGINATION'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for itemclass_name, triplets in groupby(_item_template_pages,
                                                itemgetter(0)):
            page_extractors_pairs = map(itemgetter(1, 2), triplets)
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
Example #40
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ))
        self.item_classes = {}
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        for default, template, template_extractors in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            page_descriptor_pairs.append((template, descriptors))

        self.extractors = SlybotIBLExtractor(page_descriptor_pairs)

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
Example #41
        (u'MEASURE_PROD',u'1 g', '', None),
        ]

    items = list(
        map(
            lambda (n,s,desc): (n,s,u'%s'%(desc.name,), types[desc.typ.symbol]),
            ((n,s,KeyValueDescriptor.objects.get(symbol = n))  for n,s,_,_ in items if KeyValueDescriptor.objects.filter(symbol = n).exists() )
        )
    )
    idesc = ItemDescriptor('', '', [ FieldDescriptor(n, desc, extractor=fnc(s))  for n,s,desc,fnc in items ] )
    ts = load_templates('scraper.json', 'site-%d'%site.id)
    if not ts:
        ts = annotate(url, 'site-%d'%site.id, [ (n,s)  for n,s,_,_ in items ])
    tmpls += [ (tm, idesc) for tm in ts ]

ex = InstanceBasedLearningExtractor(tmpls)

urls = (
    'http://www.rc-chem.eu/doprava', # should fail
    'http://www.rc-chem.eu/produkty/2-fma',
    'http://www.rc-chem.eu/produkty/3-fmc',
    'http://www.rc-chem.eu/produkty/3-mmc-crystal',
    'http://www.rc-chem.eu/produkty/4-fa-crystal',
    'http://www.rc-chem.eu/produkty/dimethylone',
    'http://www.rc-chem.eu/produkty/ethylphenidate',
    'http://www.rc-chem.eu/produkty/mpa',
    'http://www.rc-chem.eu/produkty/neb',
    'http://www.rc-chem.eu/produkty/pentedrone-velky-crystal',
    'http://www.rc-chem.eu/produkty/thio-crystal',
    'http://www.rc-chem.eu/produkty/thio-velky-crystal',
    'http://mefedronprodej.webnode.cz/produkty-1/',
Example #42
 def scrape_page(self, page):
     if self._ex is None:
         self._ex = InstanceBasedLearningExtractor((t, None) for t in
                 self._templates)
     return self._ex.extract(page)[0]
Example #43
        map(
            lambda (n, s, desc): (n, s, u'%s' %
                                  (desc.name, ), types[desc.typ.symbol]),
            ((n, s, KeyValueDescriptor.objects.get(symbol=n))
             for n, s, _, _ in items
             if KeyValueDescriptor.objects.filter(symbol=n).exists())))
    idesc = ItemDescriptor('', '', [
        FieldDescriptor(n, desc, extractor=fnc(s)) for n, s, desc, fnc in items
    ])
    ts = load_templates('scraper.json', 'site-%d' % site.id)
    if not ts:
        ts = annotate(url, 'site-%d' % site.id,
                      [(n, s) for n, s, _, _ in items])
    tmpls += [(tm, idesc) for tm in ts]

ex = InstanceBasedLearningExtractor(tmpls)

urls = (
    'http://www.rc-chem.eu/doprava',  # should fail
    'http://www.rc-chem.eu/produkty/2-fma',
    'http://www.rc-chem.eu/produkty/3-fmc',
    'http://www.rc-chem.eu/produkty/3-mmc-crystal',
    'http://www.rc-chem.eu/produkty/4-fa-crystal',
    'http://www.rc-chem.eu/produkty/dimethylone',
    'http://www.rc-chem.eu/produkty/ethylphenidate',
    'http://www.rc-chem.eu/produkty/mpa',
    'http://www.rc-chem.eu/produkty/neb',
    'http://www.rc-chem.eu/produkty/pentedrone-velky-crystal',
    'http://www.rc-chem.eu/produkty/thio-crystal',
    'http://www.rc-chem.eu/produkty/thio-velky-crystal',
    'http://mefedronprodej.webnode.cz/produkty-1/',
Example #44
class IblSpider(BaseSpider):

    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec['scrapes']
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item, \
                log.WARNING)
        
        self._item_template_pages = sorted((
            [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'), 
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'id': "_links", 'properties': ()})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]
        self._fpages = [
            dict_to_page(t, 'annotated_body')
            for t in spec['templates'] if t.get('page_type', 'item') == 'form'
        ]
        
        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)
        
        self.build_url_filter(spec)

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(self._default_schema)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema) if schema else default_item_cls

            page_descriptor_pairs = []
            for page, extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

    def _get_allowed_domains(self, templates):
        urls = [x.url for x in templates]
        urls += self.start_urls
        return [x[1] for x in iter_unique_scheme_netloc(urls)]

    def _get_form_requests(self, templates):
        reqs = []
        # TODO: filter unique schema netlocs?
        for t in templates:
            # assume all templates are html and unicode
            response = HtmlResponse(t.url, encoding='utf-8',
                                    body=t.body, headers=t.headers)
            request = FormRequest.from_response(response,
                                                formname='SLYBOT-FORM',
                                                callback=self.parse,
                                                dont_filter=True)
            reqs.append(request)
        return reqs

    def _get_item_requests(self, templates):
        reqs = []
        urls = [x.url for x in templates]
        for scheme, netloc in iter_unique_scheme_netloc(urls):
            r = Request("%s://%s/" % (scheme, netloc), callback=self.parse, \
                dont_filter=True)
            reqs.append(r)
        return reqs
    
    def _requests_to_follow(self, htmlpage):
        requests = []
        if self._links_ibl_extractor is not None:
            extracted = self._links_ibl_extractor.extract(htmlpage)[0]
            if extracted:
                extracted_regions = extracted[0].get('_links', [])
                seen = set()
                for region in extracted_regions:
                    htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, region, encoding=htmlpage.encoding)
                    for request in self._request_to_follow_from_region(htmlregion):
                        if request.url in seen:
                            continue
                        seen.add(request.url)
                        requests.append(request)
        else:
            requests = self._request_to_follow_from_region(htmlpage)

        return requests
            
    def _request_to_follow_from_region(self, htmlregion):
        requests = []
        seen = set()

        for link in self.link_extractor.links_to_follow(htmlregion):
            url = link.url
            if self.url_filterf(link):
                # filter out duplicate urls, later we should handle link text
                if url in seen:
                    continue
                seen.add(url)
                request = Request(url)
                if link.text:
                    request.meta['link_text'] = link.text
                requests.append(request)
        return requests

    def start_requests(self):
        if self.start_urls:
            return [Request(r, callback=self.parse, dont_filter=True) \
                for r in self.start_urls]
        if self._fpages:
            return self._get_form_requests(self._fpages)
        return self._get_item_requests(self._ipages)

    def parse(self, response):
        """Main handler for all downloaded responses"""
        if isinstance(response, HtmlResponse):
            return self.handle_html(response)
        else:
            content_type = response.headers.get('Content-Type')
            self.log("Ignoring page with content-type=%r: %s" % (content_type, \
                response.url), level=log.DEBUG)

    def _process_link_regions(self, htmlpage, link_regions):
        """Process link regions if any, and generate requests"""
        requests_to_follow = []
        if link_regions:
            for link_region in link_regions:
                htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, \
                        link_region, encoding=htmlpage.encoding)
                requests_to_follow.extend(self._requests_to_follow(htmlregion))
        else:
            requests_to_follow = self._requests_to_follow(htmlpage)
        return requests_to_follow

    def handle_html(self, response):
        htmlpage = HtmlPage(response.url, response.headers, \
                    response.body_as_unicode(), encoding=response.encoding)
        items, link_regions = self.extract_items(htmlpage)
        requests_to_follow = self._process_link_regions(htmlpage, link_regions)
        return requests_to_follow + items
        
    def extract_items(self, htmlpage):
        """This method is also called from UI webservice to extract items"""
        items = []
        link_regions = []
        for item_cls_name, info in self.itemcls_info.iteritems():
            item_descriptor = info['descriptor']
            extractor = info['extractor']
            extracted, _link_regions = self._do_extract_items_from(
                    htmlpage,
                    item_descriptor,
                    extractor,
                    item_cls_name,
            )
            items.extend(extracted)
            link_regions.extend(_link_regions)
        return items, link_regions

    def _do_extract_items_from(self, htmlpage, item_descriptor, extractor, item_cls_name):
        extracted_data, template = extractor.extract(htmlpage)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        processed_data = _process_extracted_data(extracted_data, item_descriptor, htmlpage)
        items = []
        item_cls = self.itemcls_info[item_cls_name]['class']
        for processed_attributes in processed_data:
            item = item_cls(processed_attributes)
            item['url'] = htmlpage.url
            item['_type'] = item_cls_name
            item['_template'] = template.id
            items.append(item)

        return items, link_regions

    def build_url_filter(self, spec):
        """make a filter for links"""
        respect_nofollow = spec.get('respect_nofollow', True)
        patterns = spec.get('follow_patterns')
        if spec.get("links_to_follow") == "none":
            url_filterf = lambda x: False
        elif patterns:
            pattern = patterns[0] if len(patterns) == 1 else "(?:%s)" % '|'.join(patterns)
            follow_pattern = re.compile(pattern)
            if respect_nofollow:
                url_filterf = lambda x: follow_pattern.search(x.url) and not x.nofollow
            else:
                url_filterf = lambda x: follow_pattern.search(x.url)
        elif respect_nofollow:
            url_filterf = lambda x: not x.nofollow
        else:
            url_filterf = bool
        # apply exclude patterns
        exclude_patterns = spec.get('exclude_patterns')
        if exclude_patterns:
            pattern = exclude_patterns[0] if len(exclude_patterns) == 1 else "(?:%s)" % '|'.join(exclude_patterns)
            exclude_pattern = re.compile(pattern)
            self.url_filterf = lambda x: not exclude_pattern.search(x.url) and url_filterf(x)
        else:
            self.url_filterf = url_filterf
Example #45
class Annotations(object):
    """
    Base Class for adding plugins to Portia Web and Slybot.
    """
    def setup_bot(self, settings, spec, items, extractors, logger):
        """
        Perform any initialization needed for crawling using this plugin
        """
        self.logger = logger
        templates = map(self._get_annotated_template, spec['templates'])

        _item_template_pages = sorted(([
            t.get('scrapes'),
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', []),
            t.get('version', '0.12.0')
        ] for t in templates if t.get('page_type', 'item') == 'item'),
                                      key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {
            template.get('page_id'): template['scrapes']
            for template in templates
        }
        if (settings.get('AUTO_PAGINATION')
                or spec.get('links_to_follow') == 'auto'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(
                    schema, schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(
            sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
            lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor([
                        (page, scrapes['#default'])
                        for page, scrapes, version in group
                    ]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in templates
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
        # Clustering
        self.template_names = [t.get('page_id') for t in spec['templates']]
        if settings.get('PAGE_CLUSTERING'):
            try:
                import page_clustering
                self.clustering = page_clustering.kmeans_from_samples(
                    spec['templates'])
                self.logger.info("Clustering activated")
            except ImportError:
                self.clustering = None
                self.logger.warning(
                    "Clustering could not be used because it is not installed")
        else:
            self.clustering = None

    def _get_annotated_template(self, template):
        if template.get('version', '0.12.0') >= '0.13.0':
            _build_sample(template)
        return template

    def handle_html(self, response, seen=None):
        htmlpage = htmlpage_from_response(response)
        items, link_regions = self.extract_items(htmlpage)
        htmlpage.headers['n_items'] = len(items)
        try:
            response.meta['n_items'] = len(items)
        except AttributeError:
            pass  # response not tied to any request
        for item in items:
            yield item
        for request in self._process_link_regions(htmlpage, link_regions):
            yield request

    def extract_items(self, htmlpage):
        """This method is also called from UI webservice to extract items"""
        for extractor in self.extractors:
            items, links = self._do_extract_items_from(htmlpage, extractor)
            if items:
                return items, links
        return [], []

    def _do_extract_items_from(self, htmlpage, extractor):
        # Try to predict template to use
        pref_template_id = None
        template_cluster = _CLUSTER_NA
        if self.clustering:
            self.clustering.add_page(htmlpage)
            if self.clustering.is_fit:
                clt = self.clustering.classify(htmlpage)
                if clt != -1:
                    template_cluster = self.template_names[clt]
                    pref_template_id = template_cluster
                else:
                    template_cluster = _CLUSTER_OUTLIER
        extracted_data, template = extractor.extract(htmlpage,
                                                     pref_template_id)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        descriptor = None
        unprocessed = False
        if template is not None and hasattr(template, 'descriptor'):
            descriptor = template.descriptor()
            if hasattr(descriptor, 'name'):
                item_cls_name = descriptor.name
            elif hasattr(descriptor, 'get'):
                item_cls_name = descriptor.get('name',
                                               descriptor.get('display_name'))
            else:
                item_cls_name = ''
        else:
            unprocessed = True
            try:
                descriptor = self.schema_descriptors[template.id]
                item_cls_name = self.template_scrapes[template.id]
            except AttributeError:
                descriptor = sorted(self.schema_descriptors.items())[0][1]
                item_cls_name = sorted(self.template_scrapes.items())[0][1]
        item_cls = self.item_classes.get(item_cls_name)
        items = []
        for processed_attributes in extracted_data or []:
            if processed_attributes.get('_type') in self.item_classes:
                _type = processed_attributes['_type']
                item = self.item_classes[_type](processed_attributes)
                item['_type'] = item.display_name()
            elif unprocessed:
                item = self._process_attributes(processed_attributes,
                                                descriptor, htmlpage)
                if item_cls:
                    item = item_cls(item)
            elif item_cls:
                item = item_cls(processed_attributes)
            else:
                item = dict(processed_attributes)
            item['url'] = htmlpage.url
            item['_template'] = str(template.id)
            item.setdefault('_type', item_cls_name)
            if not isinstance(item, SlybotItem):
                default_meta = {
                    'type': 'text',
                    'required': False,
                    'vary': False
                }
                item_cls = SlybotItem.create_iblitem_class(
                    {'fields': {k: default_meta
                                for k in item}})
                item = item_cls(**item)
            if self.clustering:
                item['_template_cluster'] = template_cluster
            items.append(item)
        return items, link_regions

    def _process_attributes(self, item, descriptor, htmlpage):
        new_item = {}
        try:
            attr_map = descriptor.attribute_map
        except AttributeError:
            attr_map = {}
        page = getattr(htmlpage, 'htmlpage', htmlpage)
        for field, value in item.items():
            if field.startswith('_sticky'):
                continue
            if field == 'variants':
                value = [
                    self._process_attributes(v, descriptor, page)
                    for v in value
                ]
            elif field in attr_map:
                value = [attr_map[field].adapt(v, page) for v in value]
            new_item[field] = value
        return new_item

    def build_url_filter(self, spec):
        """make a filter for links"""
        respect_nofollow = spec.get('respect_nofollow', True)

        if spec.get("links_to_follow") == "none":
            url_filterf = lambda x: False
        elif spec.get("links_to_follow") == "all":
            if respect_nofollow:
                url_filterf = lambda x: not x.nofollow
            else:
                url_filterf = lambda x: True
        else:  # patterns
            patterns = spec.get('follow_patterns')
            excludes = spec.get('exclude_patterns')
            pattern_fn = include_exclude_filter(patterns, excludes)

            if respect_nofollow:
                url_filterf = lambda x: not x.nofollow and pattern_fn(x.url)
            else:
                url_filterf = lambda x: pattern_fn(x.url)

        self.url_filterf = url_filterf

    def _filter_link(self, link, seen):
        url = link.url
        if self.url_filterf(link):
            # filter out duplicate urls, later we should handle link text
            if url not in seen:
                seen.add(url)
                request = Request(url)
                if link.text:
                    request.meta['link_text'] = link.text
                return request

    def _process_link_regions(self, htmlpage, link_regions):
        """Process link regions if any, and generate requests"""
        if link_regions:
            for link_region in link_regions:
                htmlregion = HtmlPage(htmlpage.url,
                                      htmlpage.headers,
                                      link_region,
                                      encoding=htmlpage.encoding)
                for request in self._requests_to_follow(htmlregion):
                    yield request
        else:
            for request in self._requests_to_follow(htmlpage):
                yield request

    def _requests_to_follow(self, htmlpage):
        if self._links_ibl_extractor is not None:
            extracted = self._links_ibl_extractor.extract(htmlpage)[0]
            if extracted:
                extracted_regions = extracted[0].get('_links', [])
                seen = set()
                for region in extracted_regions:
                    htmlregion = HtmlPage(htmlpage.url,
                                          htmlpage.headers,
                                          region,
                                          encoding=htmlpage.encoding)
                    for request in self._request_to_follow_from_region(
                            htmlregion):
                        if request.url in seen:
                            continue
                        seen.add(request.url)
                        yield request
        else:
            for request in self._request_to_follow_from_region(htmlpage):
                yield request

    def _request_to_follow_from_region(self, htmlregion):
        seen = set()
        for link in self.html_link_extractor.links_to_follow(htmlregion):
            request = self._filter_link(link, seen)
            if request is not None:
                yield request

    def handle_xml(self, response, seen):
        _type = XML_APPLICATION_TYPE(response.headers.get('Content-Type', ''))
        _type = _type.groupdict()['type'] if _type else 'xml'
        try:
            link_extractor = create_linkextractor_from_specs({
                'type': _type,
                'value': ''
            })
        except ValueError:
            link_extractor = SitemapLinkExtractor()
        for link in link_extractor.links_to_follow(response):
            request = self._filter_link(link, seen)
            if request:
                yield request
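
One step in setup_bot worth isolating is the version split: the (template, descriptors, version) triples are sorted by version string, then itertools.groupby partitions them on a plain string comparison against '0.13.0', so older samples go to InstanceBasedLearningExtractor and newer ones to SlybotIBLExtractor. A self-contained sketch with placeholder strings instead of real templates and descriptors:

# Sketch of the version partitioning; values are placeholders, not templates.
import itertools
import operator

page_descriptor_pairs = [
    ('page_a', 'descriptors_a', '0.12.0'),
    ('page_b', 'descriptors_b', '0.13.0'),
    ('page_c', 'descriptors_c', '0.12.0'),
]
grouped = itertools.groupby(
    sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
    lambda x: x[2] < '0.13.0')
for is_legacy, group in grouped:
    # groupby only merges consecutive keys, hence the sort beforehand;
    # note the comparison is lexicographic, not a semantic version check
    print(is_legacy, [page for page, _, _ in group])
# True ['page_a', 'page_c']
# False ['page_b']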
Example #46
    def setup_bot(self, settings, spec, items, extractors, logger):
        """
        Perform any initialization needed for crawling using this plugin
        """
        self.logger = logger
        templates = map(self._get_annotated_template, spec['templates'])

        _item_template_pages = sorted(([
            t.get('scrapes'),
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', []),
            t.get('version', '0.12.0')
        ] for t in templates if t.get('page_type', 'item') == 'item'),
                                      key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {
            template.get('page_id'): template['scrapes']
            for template in templates
        }
        if (settings.get('AUTO_PAGINATION')
                or spec.get('links_to_follow') == 'auto'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(
                    schema, schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(
            sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
            lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor([
                        (page, scrapes['#default'])
                        for page, scrapes, version in group
                    ]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in templates
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
        # Clustering
        self.template_names = [t.get('page_id') for t in spec['templates']]
        if settings.get('PAGE_CLUSTERING'):
            try:
                import page_clustering
                self.clustering = page_clustering.kmeans_from_samples(
                    spec['templates'])
                self.logger.info("Clustering activated")
            except ImportError:
                self.clustering = None
                self.logger.warning(
                    "Clustering could not be used because it is not installed")
        else:
            self.clustering = None
Example #47
    def setup_bot(self, settings, spec, items, extractors, logger):
        """
        Perform any initialization needed for crawling using this plugin
        """
        self.logger = logger
        templates = map(self._get_annotated_template, spec['templates'])

        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', []), t.get('version', '0.12.0')]
            for t in templates if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {template.get('page_id'): template['scrapes']
                                 for template in templates}
        if (settings.get('AUTO_PAGINATION') or
                spec.get('links_to_follow') == 'auto'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                           key=operator.itemgetter(2)),
                                    lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor(
                        [(page, scrapes['#default'])
                         for page, scrapes, version in group]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in templates if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
        # Clustering
        self.template_names = [t.get('page_id') for t in spec['templates']]
        if settings.get('PAGE_CLUSTERING'):
            try:
                import page_clustering
                self.clustering = page_clustering.kmeans_from_samples(spec['templates'])
                self.logger.info("Clustering activated")
            except ImportError:
                self.clustering = None
                self.logger.warning(
                    "Clustering could not be used because it is not installed")
        else:
            self.clustering = None
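
The '#default' descriptor selection that appears in the setup_bot variants above picks the descriptor matching the template's 'scrapes' schema and falls back to the first descriptor when that schema is not defined in the project. A small sketch with plain dicts standing in for slybot item descriptors:

# Sketch of the '#default' fallback; plain dicts stand in for item descriptors.
from collections import OrderedDict

descriptors = OrderedDict([
    ('product', {'name': 'product'}),
    ('review', {'name': 'review'}),
])
fallback = list(descriptors.values()) or [{}]

default = 'review'                    # the template's 'scrapes' value
print(descriptors.get(default, fallback[0]))   # {'name': 'review'}

default = 'missing_schema'            # schema not among the project's items
print(descriptors.get(default, fallback[0]))   # {'name': 'product'}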
Example #48
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)

        self._item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        self.allowed_domains = spec.get('allowed_domains',
                                        self._get_allowed_domains(self._ipages))
        if not self.allowed_domains:
            self.allowed_domains = None
        self.build_url_filter(spec)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        for rdata in spec.get("init_requests", []):
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"), meta=rdata,
                                  callback=self.parse_login_page, dont_filter=True)
                self.login_requests.append(request)

            elif rdata["type"] == "form":
                self.generic_form = GenericForm(**kw)
                self.form_requests.append(self.get_generic_form_start_request(rdata))
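
The init_requests loop above turns each request spec into a Scrapy Request, keeping the remaining spec keys in meta so the callback can use them. A hedged sketch of the login case; the spec dict and the callback are illustrative stand-ins, not taken from a real project:

# Illustrative only: rdata contents and the callback are made-up stand-ins.
from scrapy import Request

def parse_login_page(response):
    pass  # stand-in for the spider's real callback

rdata = {"type": "login", "loginurl": "http://example.com/login",
         "username": "user", "password": "secret"}
if rdata["type"] == "login":
    request = Request(url=rdata.pop("loginurl"), meta=rdata,
                      callback=parse_login_page, dont_filter=True)
    print(request.url, request.meta["username"])
    # http://example.com/login user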
Example #49
class Annotations(object):
    """
    Base Class for adding plugins to Portia Web and Slybot.
    """

    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', []), t.get('version', '0.12.0')]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {template.get('page_id'): template['scrapes']
                                 for template in spec.get('templates')}
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))

        grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                           key=operator.itemgetter(2)),
                                    lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor(
                        [(page, scrapes['#default'])
                         for page, scrapes, version in group]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)

    def handle_html(self, response, seen=None):
        htmlpage = htmlpage_from_response(response)
        items, link_regions = self.extract_items(htmlpage)
        htmlpage.headers['n_items'] = len(items)
        try:
            response.meta['n_items'] = len(items)
        except AttributeError:
            pass # response not tied to any request
        for item in items:
            yield item
        for request in self._process_link_regions(htmlpage, link_regions):
            yield request

    def extract_items(self, htmlpage):
        """This method is also called from UI webservice to extract items"""
        for extractor in self.extractors:
            items, links = self._do_extract_items_from(htmlpage, extractor)
            if items:
                return items, links
        return [], []

    def _do_extract_items_from(self, htmlpage, extractor):
        extracted_data, template = extractor.extract(htmlpage)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        descriptor = None
        unprocessed = False
        if template is not None and hasattr(template, 'descriptor'):
            descriptor = template.descriptor()
            item_cls_name = descriptor.name if descriptor is not None else ''
        else:
            unprocessed = True
            try:
                descriptor = self.schema_descriptors[template.id]
                item_cls_name = self.template_scrapes[template.id]
            except AttributeError:
                descriptor = sorted(self.schema_descriptors.items())[0][1]
                item_cls_name = sorted(self.template_scrapes.items())[0][1]
        item_cls = self.item_classes.get(item_cls_name)
        items = []
        for processed_attributes in extracted_data or []:
            if processed_attributes.get('_type') in self.item_classes:
                _type = processed_attributes['_type']
                item = self.item_classes[_type](processed_attributes)
                item['_type'] = item.display_name()
            elif unprocessed:
                item = self._process_attributes(processed_attributes,
                                                descriptor, htmlpage)
                if item_cls:
                    item = item_cls(item)
            elif item_cls:
                item = item_cls(processed_attributes)
            else:
                item = dict(processed_attributes)
            item['url'] = htmlpage.url
            item['_template'] = str(template.id)
            item.setdefault('_type', item_cls_name)
            if not isinstance(item, SlybotItem):
                default_meta = {'type': 'text', 'required': False,
                                'vary': False}
                item_cls = SlybotItem.create_iblitem_class(
                    {'fields': {k: default_meta for k in item}}
                )
                item = item_cls(**item)
            items.append(item)

        return items, link_regions

    def _process_attributes(self, item, descriptor, htmlpage):
        new_item = {}
        attr_map = descriptor.attribute_map
        for field, value in item.items():
            if field.startswith('_sticky'):
                continue
            if field == 'variants':
                value = [self._process_attributes(v, descriptor, htmlpage)
                         for v in value]
            elif field in attr_map:
                value = [attr_map[field].adapt(v, htmlpage) for v in value]
            new_item[field] = value
        return new_item

    def build_url_filter(self, spec):
        """make a filter for links"""
        respect_nofollow = spec.get('respect_nofollow', True)

        if spec.get("links_to_follow") == "none":
            url_filterf = lambda x: False
        elif spec.get("links_to_follow") == "all":
            if respect_nofollow:
                url_filterf = lambda x: not x.nofollow
            else:
                url_filterf = lambda x: True
        else: # patterns
            patterns = spec.get('follow_patterns')
            excludes = spec.get('exclude_patterns')
            pattern_fn = include_exclude_filter(patterns, excludes)

            if respect_nofollow:
                url_filterf = lambda x: not x.nofollow and pattern_fn(x.url)
            else:
                url_filterf = lambda x: pattern_fn(x.url)

        self.url_filterf = url_filterf


    def _filter_link(self, link, seen):
        url = link.url
        if self.url_filterf(link):
            # filter out duplicate urls, later we should handle link text
            if url not in seen:
                seen.add(url)
                request = Request(url)
                if link.text:
                    request.meta['link_text'] = link.text
                return request

    def _process_link_regions(self, htmlpage, link_regions):
        """Process link regions if any, and generate requests"""
        if link_regions:
            for link_region in link_regions:
                htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                      link_region, encoding=htmlpage.encoding)
                for request in self._requests_to_follow(htmlregion):
                    yield request
        else:
            for request in self._requests_to_follow(htmlpage):
                yield request

    def _requests_to_follow(self, htmlpage):
        if self._links_ibl_extractor is not None:
            extracted = self._links_ibl_extractor.extract(htmlpage)[0]
            if extracted:
                extracted_regions = extracted[0].get('_links', [])
                seen = set()
                for region in extracted_regions:
                    htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                          region, encoding=htmlpage.encoding)
                    for request in self._request_to_follow_from_region(
                            htmlregion):
                        if request.url in seen:
                            continue
                        seen.add(request.url)
                        yield request
        else:
            for request in self._request_to_follow_from_region(htmlpage):
                yield request

    def _request_to_follow_from_region(self, htmlregion):
        seen = set()
        for link in self.html_link_extractor.links_to_follow(htmlregion):
            request = self._filter_link(link, seen)
            if request is not None:
                yield request

    def handle_xml(self, response, seen):
        _type = XML_APPLICATION_TYPE(response.headers.get('Content-Type', ''))
        _type = _type.groupdict()['type'] if _type else 'xml'
        try:
            link_extractor = create_linkextractor_from_specs({
                'type': _type, 'value': ''
            })
        except ValueError:
            link_extractor = SitemapLinkExtractor()
        for link in link_extractor.links_to_follow(response):
            request = self._filter_link(link, seen)
            if request:
                yield request
Example #50
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec['scrapes']
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item, \
                log.WARNING)

        self._item_template_pages = sorted((
            [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({
            'id': "_links",
            'properties': ()
        })
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]

        self.start_urls = self.start_urls or spec.get('start_urls')
        if isinstance(self.start_urls, basestring):
            self.start_urls = self.start_urls.splitlines()

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)

        self.build_url_filter(spec)

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(
            self._default_schema)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(
                self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(
                schema) if schema else default_item_cls

            page_descriptor_pairs = []
            for page, extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(
                    schema) if schema else default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self.login_requests = []
        self.form_requests = []
        for rdata in spec.get("init_requests", []):
            if rdata["type"] == "login":
                request = Request(url=rdata.pop("loginurl"),
                                  meta=rdata,
                                  callback=self.parse_login_page)
                self.login_requests.append(request)

            elif rdata["type"] == "form":
                request = Request(url=rdata.pop("form_url"),
                                  meta=rdata,
                                  callback=self.parse_form_page)
                self.form_requests.append(request)
Example #51
class Annotations(object):
    """
    Base Class for adding plugins to Portia Web and Slybot.
    """

    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t['scrapes'], dict_to_page(t, 'annotated_body'),
             t.get('extractors', [])]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        self.itemcls_info = {}
        if settings.get('AUTO_PAGINATION'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for itemclass_name, triplets in groupby(_item_template_pages,
                                                itemgetter(0)):
            page_extractors_pairs = map(itemgetter(1, 2), triplets)
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)

    def handle_html(self, response, seen=None):
        htmlpage = htmlpage_from_response(response)
        items, link_regions = self.extract_items(htmlpage)
        htmlpage.headers['n_items'] = len(items)
        try:
            response.meta['n_items'] = len(items)
        except AttributeError:
            pass # response not tied to any request
        for item in items:
            yield item
        for request in self._process_link_regions(htmlpage, link_regions):
            yield request

    def extract_items(self, htmlpage):
        """This method is also called from UI webservice to extract items"""
        items = []
        link_regions = []
        for item_cls_name, info in self.itemcls_info.items():
            item_descriptor = info['descriptor']
            extractor = info['extractor']
            extracted, _link_regions = self._do_extract_items_from(
                htmlpage,
                item_descriptor,
                extractor,
                item_cls_name,
            )
            items.extend(extracted)
            link_regions.extend(_link_regions)
        return items, link_regions

    def _do_extract_items_from(self, htmlpage, item_descriptor, extractor,
                               item_cls_name):
        extracted_data, template = extractor.extract(htmlpage)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        processed_data = _process_extracted_data(extracted_data,
                                                 item_descriptor,
                                                 htmlpage)
        items = []
        item_cls = self.itemcls_info[item_cls_name]['class']
        for processed_attributes in processed_data:
            item = item_cls(processed_attributes)
            item['url'] = htmlpage.url
            item['_type'] = item_cls_name
            item['_template'] = str(template.id)
            items.append(item)

        return items, link_regions

    def build_url_filter(self, spec):
        """make a filter for links"""
        respect_nofollow = spec.get('respect_nofollow', True)
        patterns = spec.get('follow_patterns')
        if spec.get("links_to_follow") == "none":
            url_filterf = lambda x: False
        elif patterns:
            pattern = patterns[0] if len(patterns) == 1 \
                else "(?:%s)" % '|'.join(patterns)
            follow_pattern = re.compile(pattern)
            if respect_nofollow:
                url_filterf = lambda x: follow_pattern.search(x.url) \
                    and not x.nofollow
            else:
                url_filterf = lambda x: follow_pattern.search(x.url)
        elif respect_nofollow:
            url_filterf = lambda x: not x.nofollow
        else:
            url_filterf = bool
        # apply exclude patterns
        excludes = spec.get('exclude_patterns')
        if excludes:
            pattern = excludes[0] if len(excludes) == 1 \
                else "(?:%s)" % '|'.join(excludes)
            exclude_pattern = re.compile(pattern)
            self.url_filterf = lambda x: not exclude_pattern.search(x.url) \
                and url_filterf(x)
        else:
            self.url_filterf = url_filterf

    def _filter_link(self, link, seen):
        url = link.url
        if self.url_filterf(link):
            # filter out duplicate urls, later we should handle link text
            if url not in seen:
                seen.add(url)
                request = Request(url)
                if link.text:
                    request.meta['link_text'] = link.text
                return request

    def _process_link_regions(self, htmlpage, link_regions):
        """Process link regions if any, and generate requests"""
        if link_regions:
            for link_region in link_regions:
                htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                      link_region, encoding=htmlpage.encoding)
                for request in self._requests_to_follow(htmlregion):
                    yield request
        else:
            for request in self._requests_to_follow(htmlpage):
                yield request

    def _requests_to_follow(self, htmlpage):
        if self._links_ibl_extractor is not None:
            extracted = self._links_ibl_extractor.extract(htmlpage)[0]
            if extracted:
                extracted_regions = extracted[0].get('_links', [])
                seen = set()
                for region in extracted_regions:
                    htmlregion = HtmlPage(htmlpage.url, htmlpage.headers,
                                          region, encoding=htmlpage.encoding)
                    for request in self._request_to_follow_from_region(
                            htmlregion):
                        if request.url in seen:
                            continue
                        seen.add(request.url)
                        yield request
        else:
            for request in self._request_to_follow_from_region(htmlpage):
                yield request

    def _request_to_follow_from_region(self, htmlregion):
        seen = set()
        for link in self.html_link_extractor.links_to_follow(htmlregion):
            request = self._filter_link(link, seen)
            if request is not None:
                yield request

    def handle_xml(self, response, seen):
        _type = XML_APPLICATION_TYPE(response.headers.get('Content-Type', ''))
        _type = _type.groupdict()['type'] if _type else 'xml'
        try:
            link_extractor = create_linkextractor_from_specs({
                'type': _type, 'value': ''
            })
        except ValueError:
            link_extractor = SitemapLinkExtractor()
        for link in link_extractor.links_to_follow(response):
            request = self._filter_link(link, seen)
            if request:
                yield request
Example #52
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted(([
            t.get('scrapes'),
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', []),
            t.get('version', '0.12.0')
        ] for t in spec['templates'] if t.get('page_type', 'item') == 'item'),
                                      key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {
            template.get('page_id'): template['scrapes']
            for template in spec.get('templates')
        }
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(
                    schema, schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = descriptors.values() or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(
            sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
            lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor([
                        (page, scrapes['#default'])
                        for page, scrapes, version in group
                    ]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
Example #53
    def __init__(self, name, spec, item_schemas, all_extractors, **kw):
        super(IblSpider, self).__init__(name, **kw)
        default_item = spec['scrapes']
        self._default_schema = item_schemas[default_item]
        if not self._default_schema:
            self.log("Scraping unknown default item schema: %s" % default_item, \
                log.WARNING)
        
        self._item_template_pages = sorted((
            [t.get('scrapes', default_item), dict_to_page(t, 'annotated_body'), 
            t.get('extractors', [])] \
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda pair: pair[0])

        # generate ibl extractor for links pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                for t in spec['templates'] if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'id': "_links", 'properties': ()})
        self._links_ibl_extractor = InstanceBasedLearningExtractor([(t, _links_item_descriptor) for t in _links_pages]) \
                if _links_pages else None

        self._ipages = [page for _, page, _ in self._item_template_pages]
        self._fpages = [
            dict_to_page(t, 'annotated_body')
            for t in spec['templates'] if t.get('page_type', 'item') == 'form'
        ]
        
        self._start_urls = spec.get('start_urls')

        self.link_extractor = LinkExtractor()
        self.allowed_domains = self._get_allowed_domains(self._ipages)
        
        # make a filter for links 
        respect_nofollow = spec.get('respect_nofollow', True)
        patterns = spec.get('follow_patterns')
        if patterns:
            pattern = patterns[0] if len(patterns) == 1 else "(?:%s)" % '|'.join(patterns)
            follow_pattern = re.compile(pattern)
            if respect_nofollow:
                url_filterf = lambda x: follow_pattern.search(x.url) and not x.nofollow
            else:
                url_filterf = lambda x: follow_pattern.search(x.url)
        elif respect_nofollow:
            url_filterf = lambda x: not x.nofollow
        else:
            url_filterf = bool
        # apply exclude patterns
        exclude_patterns = spec.get('exclude_patterns')
        if exclude_patterns:
            pattern = exclude_patterns[0] if len(exclude_patterns) == 1 else "(?:%s)" % '|'.join(exclude_patterns)
            exclude_pattern = re.compile(pattern)
            self.url_filterf = lambda x: not exclude_pattern.search(x.url) and url_filterf(x)
        else:
            self.url_filterf = url_filterf

        default_item_cls = get_iblitem_class(self._default_schema)
        default_item_descriptor = create_slybot_item_descriptor(self._default_schema)

        self.itemcls_info = {}
        for itemclass_name, triplets in itertools.groupby(self._item_template_pages, operator.itemgetter(0)):
            page_extractors_pairs = map(operator.itemgetter(1, 2), triplets)
            schema = item_schemas[itemclass_name]
            item_cls = get_iblitem_class(schema) if schema else default_item_cls

            page_descriptor_pairs = []
            for page, extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema) if schema else default_item_descriptor
                apply_extractors(item_descriptor, extractors, all_extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        self._itemversion_cache = {}
Example #54
 def scrape_page(self, page):
     if self._ex is None:
         self._ex = InstanceBasedLearningExtractor((t, None) for t in
                 self._templates)
     return self._ex.extract(page)[0]
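
scrape_page builds its extractor lazily: the InstanceBasedLearningExtractor is created from (template, None) pairs on the first call and then reused. A self-contained variant of the same pattern; MiniScraper is an illustrative wrapper, not slybot code, and assumes templates is a list of annotated scrapely HtmlPage objects:

# Hedged sketch of the lazy extractor construction used in scrape_page above.
from scrapely.extraction import InstanceBasedLearningExtractor


class MiniScraper(object):
    def __init__(self, templates):
        self._templates = templates   # annotated template pages
        self._ex = None

    def scrape_page(self, page):
        if self._ex is None:
            # built once on first use, then reused for every page
            self._ex = InstanceBasedLearningExtractor(
                (t, None) for t in self._templates)
        return self._ex.extract(page)[0]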
Example #55
class Annotations(object):
    """
    Base Class for adding plugins to Portia Web and Slybot.
    """
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted(([
            t['scrapes'],
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', [])
        ] for t in spec['templates'] if t.get('page_type', 'item') == 'item'),
                                      key=lambda pair: pair[0])

        self.itemcls_info = {}
        self.html_link_extractor = HtmlLinkExtractor()
        self.rss_link_extractor = RssLinkExtractor()
        for itemclass_name, triplets in groupby(_item_template_pages,
                                                itemgetter(0)):
            page_extractors_pairs = map(itemgetter(1, 2), triplets)
            schema = items[itemclass_name]
            item_cls = SlybotItem.create_iblitem_class(schema)

            page_descriptor_pairs = []
            for page, template_extractors in page_extractors_pairs:
                item_descriptor = create_slybot_item_descriptor(schema)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                page_descriptor_pairs.append((page, item_descriptor))

            extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)

            self.itemcls_info[itemclass_name] = {
                'class': item_cls,
                'descriptor': item_descriptor,
                'extractor': extractor,
            }

        # generate ibl extractor for links pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)

    def handle_html(self, response):
        htmlpage = htmlpage_from_response(response)
        items, link_regions = self.extract_items(htmlpage)
        for item in items:
            yield item
        for request in self._process_link_regions(htmlpage, link_regions):
            yield request

    def extract_items(self, htmlpage):
        """This method is also called from UI webservice to extract items"""
        items = []
        link_regions = []
        for item_cls_name, info in self.itemcls_info.iteritems():
            item_descriptor = info['descriptor']
            extractor = info['extractor']
            extracted, _link_regions = self._do_extract_items_from(
                htmlpage,
                item_descriptor,
                extractor,
                item_cls_name,
            )
            items.extend(extracted)
            link_regions.extend(_link_regions)
        return items, link_regions

    def _do_extract_items_from(self, htmlpage, item_descriptor, extractor,
                               item_cls_name):
        extracted_data, template = extractor.extract(htmlpage)
        link_regions = []
        for ddict in extracted_data or []:
            link_regions.extend(ddict.pop("_links", []))
        processed_data = _process_extracted_data(extracted_data,
                                                 item_descriptor, htmlpage)
        items = []
        item_cls = self.itemcls_info[item_cls_name]['class']
        for processed_attributes in processed_data:
            item = item_cls(processed_attributes)
            item['url'] = htmlpage.url
            item['_type'] = item_cls_name
            item['_template'] = str(template.id)
            items.append(item)

        return items, link_regions

    def build_url_filter(self, spec):
        """make a filter for links"""
        respect_nofollow = spec.get('respect_nofollow', True)
        patterns = spec.get('follow_patterns')
        if spec.get("links_to_follow") == "none":
            url_filterf = lambda x: False
        elif patterns:
            pattern = patterns[0] if len(patterns) == 1 \
                else "(?:%s)" % '|'.join(patterns)
            follow_pattern = re.compile(pattern)
            if respect_nofollow:
                url_filterf = lambda x: follow_pattern.search(x.url) \
                    and not x.nofollow
            else:
                url_filterf = lambda x: follow_pattern.search(x.url)
        elif respect_nofollow:
            url_filterf = lambda x: not x.nofollow
        else:
            url_filterf = bool
        # apply exclude patterns
        excludes = spec.get('exclude_patterns')
        if excludes:
            pattern = excludes[0] if len(excludes) == 1 \
                else "(?:%s)" % '|'.join(excludes)
            exclude_pattern = re.compile(pattern)
            self.url_filterf = lambda x: not exclude_pattern.search(x.url) \
                and url_filterf(x)
        else:
            self.url_filterf = url_filterf

    def _filter_link(self, link, seen):
        url = link.url
        if self.url_filterf(link):
            # filter out duplicate urls, later we should handle link text
            if url not in seen:
                seen.add(url)
                request = Request(url)
                if link.text:
                    request.meta['link_text'] = link.text
                return request

    def _process_link_regions(self, htmlpage, link_regions):
        """Process link regions if any, and generate requests"""
        if link_regions:
            for link_region in link_regions:
                htmlregion = HtmlPage(htmlpage.url,
                                      htmlpage.headers,
                                      link_region,
                                      encoding=htmlpage.encoding)
                for request in self._requests_to_follow(htmlregion):
                    yield request
        else:
            for request in self._requests_to_follow(htmlpage):
                yield request

    def _requests_to_follow(self, htmlpage):
        if self._links_ibl_extractor is not None:
            extracted = self._links_ibl_extractor.extract(htmlpage)[0]
            if extracted:
                extracted_regions = extracted[0].get('_links', [])
                seen = set()
                for region in extracted_regions:
                    htmlregion = HtmlPage(htmlpage.url,
                                          htmlpage.headers,
                                          region,
                                          encoding=htmlpage.encoding)
                    for request in self._request_to_follow_from_region(
                            htmlregion):
                        if request.url in seen:
                            continue
                        seen.add(request.url)
                        yield request
        else:
            for request in self._request_to_follow_from_region(htmlpage):
                yield request

    def _request_to_follow_from_region(self, htmlregion):
        seen = set()
        for link in self.html_link_extractor.links_to_follow(htmlregion):
            request = self._filter_link(link, seen)
            if request is not None:
                yield request

    def handle_rss(self, response, seen):
        for link in self.rss_link_extractor.links_to_follow(response):
            request = self._filter_link(link, seen)
            if request:
                yield request
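
A final detail shared by these plugins is the two-level duplicate filtering: _request_to_follow_from_region deduplicates within a single region via _filter_link's seen set, while _requests_to_follow keeps a page-level seen set so the same URL extracted from two link regions is only requested once. A toy sketch of that layering using plain URLs:

# Toy sketch of per-region plus per-page URL deduplication.
def urls_from_region(urls, seen):
    for url in urls:
        if url not in seen:
            seen.add(url)
            yield url

page_seen = set()
regions = [['http://example.com/1', 'http://example.com/2'],
           ['http://example.com/2', 'http://example.com/3']]
for region in regions:
    for url in urls_from_region(region, set()):   # per-region dedup
        if url in page_seen:                       # cross-region dedup
            continue
        page_seen.add(url)
        print(url)
# http://example.com/1
# http://example.com/2
# http://example.com/3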