Example #1
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', []), t.get('version', '0.12.0')]
            for t in spec['templates'] if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {template.get('page_id'): template['scrapes']
                                 for template in spec.get('templates')}
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            # list() so the fallback below can index it on Python 3
            descriptor = list(descriptors.values()) or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                           key=operator.itemgetter(2)),
                                    lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor(
                        [(page, scrapes['#default'])
                         for page, scrapes, version in group]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # Generate the IBL extractor for 'links' pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in spec['templates']
                        if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
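For orientation, here is a minimal sketch of the inputs this method consumes, inferred only from the key lookups in the code above; the concrete values ('product', annotation id '1', and so on) are illustrative placeholders, not taken from the source.

# Hypothetical inputs, shaped after the keys setup_bot reads.
spec = {
    'templates': [{
        'page_id': 'template-1',               # keys self.schema_descriptors
        'page_type': 'item',                   # only 'item' templates are trained on
        'scrapes': 'product',                  # name of the schema this template extracts
        'annotated_body': '<html>...</html>',  # annotated sample page
        'extractors': ['1'],                   # ids into the extractors argument
        'version': '0.13.0',                   # '0.13.0'+ selects SlybotIBLExtractor
    }],
}
items = {'product': {'fields': {'name': {'type': 'text', 'required': True,
                                         'vary': False}}}}
extractors = {'1': {'regular_expression': r'Name: (.*)'}}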
Example #2
 def test_per_annotation_extractors(self):
     schema = {
         'fields': {
             'url': {
                 'required': False,
                 'type': 'text',
                 'vary': False,
             },
             'name': {
                 'required': True,
                 'type': 'text',
                 'vary': False,
             }
         }
     }
     extractors = {
         '1': {
             'type_extractor': 'url'
         },
         '2': {
             'regular_expression': r'(.*)\.html'
         },
         '3': {
             'regular_expression': 'Name: (.*)'
         },
         '4': {
             'type_extractor': 'text'
         },
         '5': {
             'type_extractor': 'price'
         },
         '6': {
             'type_extractor': 'number'
         },
         '7': {
             'type_extractor': 'date'
         },
         '8': {
             'regular_expression': r'(\d+)-'
         }
     }
     descriptors = {'#default': create_slybot_item_descriptor(schema)}
     add_extractors_to_descriptors(descriptors, extractors)
     ibl_extractor = SlybotIBLExtractor([(self.template3, descriptors,
                                          '0.13.0')])
     result = {
         u'_template': '6223d000057491040e4f411cf1f0734ea802eeb6',
         'name': [u'Olivia'],
         'url': [u'http://www.test.com/olivia'],
         'title': [u'Name: Olivia'],
         'price': [u'2016'],
         'date': [datetime(2016, 3, 17, 20, 25)]
     }
     data = ibl_extractor.extract(self.target3)[0][0]
     self.assertEqual(data, result)
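The expected values in this test are consistent with each regular-expression extractor keeping the first capture group of its match. A stand-alone illustration with plain re (not the slybot machinery), reusing the patterns above:

import re

# Group 1 of each pattern is what ends up in the extracted field.
re.search(r'Name: (.*)', 'Name: Olivia').group(1)  # -> 'Olivia' (extractor '3')
re.search(r'(.*)\.html', 'olivia.html').group(1)   # -> 'olivia' (extractor '2')
re.search(r'(\d+)-', '2016-03-17').group(1)        # -> '2016'   (extractor '8')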
Example #3
 def test_per_annotation_extractors(self):
     schema = {
         'fields': {
             'url': {
                 'required': False,
                 'type': 'text',
                 'vary': False,
             },
             'name': {
                 'required': True,
                 'type': 'text',
                 'vary': False,
             }
         }
     }
     extractors = {
         '1': {
             'type_extractor': 'url'
         },
         '2': {
             'regular_expression': r'(.*)\.html'
         },
         '3': {
             'regular_expression': 'Name: (.*)'
         },
         '4': {
             'type_extractor': 'text'
         },
         '5': {
             'type_extractor': 'price'
         },
         '6': {
             'type_extractor': 'number'
         },
         '7': {
             'type_extractor': 'date'
         },
         '8': {
             'regular_expression': r'(\d+)-'
         }
     }
     descriptors = {'#default': create_slybot_item_descriptor(schema)}
     add_extractors_to_descriptors(descriptors, extractors)
     ibl_extractor = SlybotIBLExtractor([
         (self.template3, descriptors, '0.13.0')
     ])
     result = {'name': [u'Olivia'], 'url': [u'http://www.test.com/olivia'],
               'title': [u'Name: Olivia'], 'price': [u'2016'],
               'date': [datetime(2016, 3, 17, 20, 25)]}
     data = ibl_extractor.extract(self.target3)[0][0]
     del data['_template']
     self.assertEqual(data, result)
Example #4
 def test_extract_single_attribute_to_multiple_fields(self):
     extractors = {'1': {'regular_expression': r'(.*)\s'},
                   '2': {'regular_expression': r'\s(.*)'}}
     descriptors = {'#default': create_slybot_item_descriptor({'fields': {
         'full_name': {'type': 'text', 'required': False, 'vary': False},
         'first_name': {'type': 'text', 'required': False, 'vary': False,
                        'name': u'prénom'},
         'last_name': {'type': 'text', 'required': False, 'vary': False,
                       'name': 'nom'},
         'address': {'type': 'text', 'required': False, 'vary': False}}})}
     add_extractors_to_descriptors(descriptors, extractors)
     extractor = SlybotIBLExtractor([(sample_411, descriptors, '0.13.0')])
     data = extractor.extract(page_411)[0][1]
     self.assertEqual(data['full_name'], [u'Joe Smith'])
     self.assertEqual(data[u'prénom'], [u'Joe'])
     self.assertEqual(data['nom'], [u'Smith'])
Example #5
 def test_extract_single_attribute_to_multiple_fields(self):
     extractors = {'1': {'regular_expression': r'(.*)\s'},
                   '2': {'regular_expression': r'\s(.*)'}}
     descriptors = {'#default': create_slybot_item_descriptor({'fields': {
         'full_name': {'type': 'text', 'required': False, 'vary': False},
         'first_name': {'type': 'text', 'required': False, 'vary': False,
                        'name': u'prénom'},
         'last_name': {'type': 'text', 'required': False, 'vary': False,
                       'name': 'nom'},
         'address': {'type': 'text', 'required': False, 'vary': False}}})}
     add_extractors_to_descriptors(descriptors, extractors)
     extractor = SlybotIBLExtractor([(sample_411, descriptors, '0.13.0')])
     data = extractor.extract(page_411)[0]
     self.assertEqual(data[1]['full_name'], [u'Joe Smith'])
     self.assertEqual(data[1][u'prénom'], [u'Joe'])
     self.assertEqual(data[1]['nom'], [u'Smith'])
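Both variants of this test route one annotated value through two different regular expressions, one per target field. The splitting can be checked in isolation with plain re and the same patterns:

import re

full_name = 'Joe Smith'
re.search(r'(.*)\s', full_name).group(1)  # -> 'Joe',   mapped to u'prénom'
re.search(r'\s(.*)', full_name).group(1)  # -> 'Smith', mapped to 'nom'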
Example #6
    def setup_bot(self, settings, spec, items, extractors, logger):
        """
        Perform any initialization needed for crawling using this plugin
        """
        self.logger = logger
        # Materialize the map so templates can be iterated more than once (Py3)
        templates = list(map(self._get_annotated_template, spec['templates']))

        _item_template_pages = sorted(([
            t.get('scrapes'),
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', []),
            t.get('version', '0.12.0')
        ] for t in templates if t.get('page_type', 'item') == 'item'),
                                      key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {
            template.get('page_id'): template['scrapes']
            for template in templates
        }
        if (settings.get('AUTO_PAGINATION')
                or spec.get('links_to_follow') == 'auto'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(
                    schema, schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = list(descriptors.values()) or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(
            sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
            lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor([
                        (page, scrapes['#default'])
                        for page, scrapes, version in group
                    ]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # Generate the IBL extractor for 'links' pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in templates
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
        # Clustering
        self.template_names = [t.get('page_id') for t in spec['templates']]
        if settings.get('PAGE_CLUSTERING'):
            try:
                import page_clustering
                self.clustering = page_clustering.kmeans_from_samples(
                    spec['templates'])
                self.logger.info("Clustering activated")
            except ImportError:
                self.clustering = None
                self.logger.warning(
                    "Clustering could not be used because it is not installed")
        else:
            self.clustering = None
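The link-extractor choice above is driven purely by configuration. A hedged sketch of the two switches, based only on the keys this method reads (values are illustrative):

# Either a crawler setting...
settings = {'AUTO_PAGINATION': True}
# ...or a per-spider flag in the spec:
spec = {'links_to_follow': 'auto', 'templates': []}
# Either one makes setup_bot pick PaginationExtractor over HtmlLinkExtractor.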
Example #7
            'description': {'required': False, 'vary': False, 'type': 'text'},
            'rank': {'required': False, 'vary': False, 'type': 'price'}}
    }
}

simple_template = HtmlPage(url="http://www.test.com/a",
                           body=apply_annotations(annotations, html))
target1 = base_page('\n'.join(item_template(idx=i, rank=1)
                              for i in range(1, 11)))
target2 = base_page('\n'.join(item_template(idx=i, rank=i if i % 2 else '')
                              for i in range(1, 11)))
target1 = HtmlPage(url="http://www.test.com/a", body=target1)
target2 = HtmlPage(url="http://www.test.com/a", body=target2)
simple_descriptors = {k: create_slybot_item_descriptor(v)
                      for k, v in schemas.items()}
add_extractors_to_descriptors(simple_descriptors, {})


td = TokenDict()
html_page = HtmlPage(body=open_spec('stack_overflow.html').decode('utf-8'))
extraction_page = parse_extraction_page(td, html_page)
with open('%s/data/SampleProject/items.json' % PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
Example #8


sample = {'plugins': {'annotations-plugin': {'extracts': annotations}},
          'original_body': html}
simple_template = HtmlPage(url="http://www.test.com/a",
                           body=Annotations(sample).apply())
target1 = base_page('\n'.join(item_template(idx=i, rank=1)
                              for i in range(1, 11)))
target2 = base_page('\n'.join(item_template(idx=i, rank=i if i % 2 else '')
                              for i in range(1, 11)))
target1 = HtmlPage(url="http://www.test.com/a", body=target1)
target2 = HtmlPage(url="http://www.test.com/a", body=target2)
simple_descriptors = {k: create_slybot_item_descriptor(v)
                      for k, v in schemas.items()}
add_extractors_to_descriptors(simple_descriptors, {})


td = TokenDict()
html_page = HtmlPage(body=open_spec('stack_overflow.html'))
extraction_page = parse_extraction_page(td, html_page)
with open('%s/data/SampleProject/items.json' % PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
Example #9
    def setup_bot(self, settings, spec, items, extractors, logger):
        """
        Perform any initialization needed for crawling using this plugin
        """
        self.logger = logger
        templates = list(map(self._get_annotated_template, spec['templates']))

        _item_template_pages = sorted((
            [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
             t.get('extractors', []), t.get('version', '0.12.0')]
            for t in templates if t.get('page_type', 'item') == 'item'
        ), key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {template.get('page_id'): template['scrapes']
                                 for template in templates}
        if (settings.get('AUTO_PAGINATION') or
                spec.get('links_to_follow') == 'auto'):
            self.html_link_extractor = PaginationExtractor()
        else:
            self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(schema,
                                                                schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = list(descriptors.values()) or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                           key=operator.itemgetter(2)),
                                    lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor(
                        [(page, scrapes['#default'])
                         for page, scrapes, version in group]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # Generate the IBL extractor for 'links' pages
        _links_pages = [dict_to_page(t, 'annotated_body')
                        for t in templates if t.get('page_type') == 'links']
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)
        # Clustering
        self.template_names = [t.get('page_id') for t in spec['templates']]
        if settings.get('PAGE_CLUSTERING'):
            try:
                import page_clustering
                self.clustering = page_clustering.kmeans_from_samples(
                    spec['templates'])
                self.logger.info("Clustering activated")
            except ImportError:
                self.clustering = None
                self.logger.warning(
                    "Clustering could not be used because it is not installed")
        else:
            self.clustering = None
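The grouping step sorts by version string and partitions with itertools.groupby, so all pre-0.13.0 templates feed one legacy InstanceBasedLearningExtractor and the rest feed one SlybotIBLExtractor. Note the comparison is lexicographic; it works for '0.12.0' versus '0.13.0' but is not a general version compare. A stand-alone sketch with placeholder tuples:

import itertools
import operator

pairs = [('page_a', {}, '0.12.0'), ('page_b', {}, '0.13.0'),
         ('page_c', {}, '0.13.0')]
grouped = itertools.groupby(sorted(pairs, key=operator.itemgetter(2)),
                            lambda x: x[2] < '0.13.0')
for is_legacy, group in grouped:
    print(is_legacy, [page for page, _, _ in group])
# True ['page_a']            -> legacy InstanceBasedLearningExtractor
# False ['page_b', 'page_c'] -> a single SlybotIBLExtractor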
Example #10
    def setup_bot(self, settings, spec, items, extractors):
        """
        Perform any initialization needed for crawling using this plugin
        """
        _item_template_pages = sorted(([
            t.get('scrapes'),
            dict_to_page(t, 'annotated_body'),
            t.get('extractors', []),
            t.get('version', '0.12.0')
        ] for t in spec['templates'] if t.get('page_type', 'item') == 'item'),
                                      key=lambda x: x[0])
        self.item_classes = {}
        self.template_scrapes = {
            template.get('page_id'): template['scrapes']
            for template in spec.get('templates')
        }
        self.html_link_extractor = HtmlLinkExtractor()
        for schema_name, schema in items.items():
            if schema_name not in self.item_classes:
                if not schema.get('name'):
                    schema['name'] = schema_name
                item_cls = SlybotItem.create_iblitem_class(schema)
                self.item_classes[schema_name] = item_cls

        # Create descriptors and apply additional extractors to fields
        page_descriptor_pairs = []
        self.schema_descriptors = {}
        for default, template, template_extractors, v in _item_template_pages:
            descriptors = OrderedDict()
            for schema_name, schema in items.items():
                item_descriptor = create_slybot_item_descriptor(
                    schema, schema_name)
                apply_extractors(item_descriptor, template_extractors,
                                 extractors)
                descriptors[schema_name] = item_descriptor
            descriptor = list(descriptors.values()) or [{}]
            descriptors['#default'] = descriptors.get(default, descriptor[0])
            self.schema_descriptors[template.page_id] = descriptors['#default']
            page_descriptor_pairs.append((template, descriptors, v))
            add_extractors_to_descriptors(descriptors, extractors)

        grouped = itertools.groupby(
            sorted(page_descriptor_pairs, key=operator.itemgetter(2)),
            lambda x: x[2] < '0.13.0')
        self.extractors = []
        for version, group in grouped:
            if version:
                self.extractors.append(
                    InstanceBasedLearningExtractor([
                        (page, scrapes['#default'])
                        for page, scrapes, version in group
                    ]))
            else:
                self.extractors.append(SlybotIBLExtractor(list(group)))

        # Generate the IBL extractor for 'links' pages
        _links_pages = [
            dict_to_page(t, 'annotated_body') for t in spec['templates']
            if t.get('page_type') == 'links'
        ]
        _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
        self._links_ibl_extractor = InstanceBasedLearningExtractor(
            [(t, _links_item_descriptor) for t in _links_pages]) \
            if _links_pages else None

        self.build_url_filter(spec)