def setup_bot(self, settings, spec, items, extractors):
    """
    Perform any initialization needed for crawling using this plugin
    """
    _item_template_pages = sorted((
        [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
         t.get('extractors', []), t.get('version', '0.12.0')]
        for t in spec['templates'] if t.get('page_type', 'item') == 'item'
    ), key=lambda x: x[0])
    self.item_classes = {}
    self.template_scrapes = {template.get('page_id'): template['scrapes']
                             for template in spec.get('templates')}
    self.html_link_extractor = HtmlLinkExtractor()
    for schema_name, schema in items.items():
        if schema_name not in self.item_classes:
            if not schema.get('name'):
                schema['name'] = schema_name
            item_cls = SlybotItem.create_iblitem_class(schema)
            self.item_classes[schema_name] = item_cls

    # Create descriptors and apply additional extractors to fields
    page_descriptor_pairs = []
    self.schema_descriptors = {}
    for default, template, template_extractors, v in _item_template_pages:
        descriptors = OrderedDict()
        for schema_name, schema in items.items():
            item_descriptor = create_slybot_item_descriptor(schema,
                                                            schema_name)
            apply_extractors(item_descriptor, template_extractors,
                             extractors)
            descriptors[schema_name] = item_descriptor
        # list() so the fallback below can be indexed on Python 3, where
        # dict.values() returns a non-indexable view
        descriptor = list(descriptors.values()) or [{}]
        descriptors['#default'] = descriptors.get(default, descriptor[0])
        self.schema_descriptors[template.page_id] = descriptors['#default']
        page_descriptor_pairs.append((template, descriptors, v))
        add_extractors_to_descriptors(descriptors, extractors)

    # Templates older than 0.13.0 are handled by scrapely's extractor,
    # newer ones by slybot's own IBL extractor
    grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                       key=operator.itemgetter(2)),
                                lambda x: x[2] < '0.13.0')
    self.extractors = []
    for version, group in grouped:
        if version:
            self.extractors.append(
                InstanceBasedLearningExtractor(
                    [(page, scrapes['#default'])
                     for page, scrapes, version in group]))
        else:
            self.extractors.append(SlybotIBLExtractor(list(group)))

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in spec['templates']
                    if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self.build_url_filter(spec)

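# The itertools.groupby call above buckets template/descriptor triples by
# whether their version string sorts before '0.13.0' (a lexicographic
# comparison, which is valid for the '0.12.0'/'0.13.0' scheme used here).
# A minimal, self-contained sketch of that grouping, with hypothetical
# page/descriptor placeholders standing in for the real
# page_descriptor_pairs:
import itertools
import operator

pairs = [
    ('page_a', {'#default': 'desc_a'}, '0.12.0'),
    ('page_b', {'#default': 'desc_b'}, '0.13.0'),
    ('page_c', {'#default': 'desc_c'}, '0.12.0'),
]
# groupby only merges adjacent keys, so the input must be sorted by
# version first -- exactly what setup_bot does.
grouped = itertools.groupby(sorted(pairs, key=operator.itemgetter(2)),
                            lambda x: x[2] < '0.13.0')
for is_legacy, group in grouped:
    print(is_legacy, [page for page, _, _ in group])
# True ['page_a', 'page_c']  -> InstanceBasedLearningExtractor
# False ['page_b']           -> SlybotIBLExtractor
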
def test_per_annotation_extractors(self):
    schema = {
        'fields': {
            'url': {
                'required': False,
                'type': 'text',
                'vary': False,
            },
            'name': {
                'required': True,
                'type': 'text',
                'vary': False,
            }
        }
    }
    extractors = {
        '1': {'type_extractor': 'url'},
        '2': {'regular_expression': r'(.*)\.html'},
        '3': {'regular_expression': 'Name: (.*)'},
        '4': {'type_extractor': 'text'},
        '5': {'type_extractor': 'price'},
        '6': {'type_extractor': 'number'},
        '7': {'type_extractor': 'date'},
        '8': {'regular_expression': r'(\d+)-'}
    }
    descriptors = {'#default': create_slybot_item_descriptor(schema)}
    add_extractors_to_descriptors(descriptors, extractors)
    ibl_extractor = SlybotIBLExtractor([
        (self.template3, descriptors, '0.13.0')
    ])
    result = {
        u'_template': '6223d000057491040e4f411cf1f0734ea802eeb6',
        'name': [u'Olivia'],
        'url': [u'http://www.test.com/olivia'],
        'title': [u'Name: Olivia'],
        'price': [u'2016'],
        'date': [datetime(2016, 3, 17, 20, 25)]
    }
    data = ibl_extractor.extract(self.target3)[0][0]
    self.assertEqual(data, result)

def test_per_annotation_extractors(self):
    schema = {
        'fields': {
            'url': {
                'required': False,
                'type': 'text',
                'vary': False,
            },
            'name': {
                'required': True,
                'type': 'text',
                'vary': False,
            }
        }
    }
    extractors = {
        '1': {'type_extractor': 'url'},
        '2': {'regular_expression': r'(.*)\.html'},
        '3': {'regular_expression': 'Name: (.*)'},
        '4': {'type_extractor': 'text'},
        '5': {'type_extractor': 'price'},
        '6': {'type_extractor': 'number'},
        '7': {'type_extractor': 'date'},
        '8': {'regular_expression': r'(\d+)-'}
    }
    descriptors = {'#default': create_slybot_item_descriptor(schema)}
    add_extractors_to_descriptors(descriptors, extractors)
    ibl_extractor = SlybotIBLExtractor([
        (self.template3, descriptors, '0.13.0')
    ])
    result = {'name': [u'Olivia'],
              'url': [u'http://www.test.com/olivia'],
              'title': [u'Name: Olivia'],
              'price': [u'2016'],
              'date': [datetime(2016, 3, 17, 20, 25)]}
    data = ibl_extractor.extract(self.target3)[0][0]
    del data['_template']
    self.assertEqual(data, result)

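# The 'regular_expression' extractors exercised above keep the first
# capture group of the raw annotated value. A plain-re illustration of
# that behaviour (illustrative only; apply_regex_extractor is a
# hypothetical helper, not slybot's implementation):
import re

def apply_regex_extractor(pattern, value):
    """Return the first capture group if the pattern matches, else None."""
    match = re.search(pattern, value)
    return match.group(1) if match else None

print(apply_regex_extractor('Name: (.*)', 'Name: Olivia'))  # Olivia
print(apply_regex_extractor(r'(\d+)-', '2016-03-17'))       # 2016
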
def test_extract_single_attribute_to_multiple_fields(self):
    extractors = {'1': {'regular_expression': r'(.*)\s'},
                  '2': {'regular_expression': r'\s(.*)'}}
    descriptors = {'#default': create_slybot_item_descriptor({'fields': {
        'full_name': {'type': 'text', 'required': False, 'vary': False},
        'first_name': {'type': 'text', 'required': False, 'vary': False,
                       'name': u'prénom'},
        'last_name': {'type': 'text', 'required': False, 'vary': False,
                      'name': 'nom'},
        'address': {'type': 'text', 'required': False, 'vary': False}}})}
    add_extractors_to_descriptors(descriptors, extractors)
    extractor = SlybotIBLExtractor([(sample_411, descriptors, '0.13.0')])
    data = extractor.extract(page_411)[0][1]
    self.assertEqual(data['full_name'], [u'Joe Smith'])
    self.assertEqual(data[u'prénom'], [u'Joe'])
    self.assertEqual(data['nom'], [u'Smith'])

def test_extract_single_attribute_to_multiple_fields(self):
    extractors = {'1': {'regular_expression': r'(.*)\s'},
                  '2': {'regular_expression': r'\s(.*)'}}
    descriptors = {'#default': create_slybot_item_descriptor({'fields': {
        'full_name': {'type': 'text', 'required': False, 'vary': False},
        'first_name': {'type': 'text', 'required': False, 'vary': False,
                       'name': u'prénom'},
        'last_name': {'type': 'text', 'required': False, 'vary': False,
                      'name': 'nom'},
        'address': {'type': 'text', 'required': False, 'vary': False}}})}
    add_extractors_to_descriptors(descriptors, extractors)
    extractor = SlybotIBLExtractor([(sample_411, descriptors, '0.13.0')])
    data = extractor.extract(page_411)[0]
    self.assertEqual(data[1]['full_name'], [u'Joe Smith'])
    self.assertEqual(data[1][u'prénom'], [u'Joe'])
    self.assertEqual(data[1]['nom'], [u'Smith'])

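# The two extractors above split a single annotated attribute into
# separate first/last-name fields by pairing a greedy prefix regex with a
# suffix regex. A toy demonstration of just the regexes (not the slybot
# pipeline):
import re

full_name = u'Joe Smith'
first = re.search(r'(.*)\s', full_name).group(1)  # text before the last space
last = re.search(r'\s(.*)', full_name).group(1)   # text after the first space
assert (first, last) == (u'Joe', u'Smith')
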
def setup_bot(self, settings, spec, items, extractors, logger):
    """
    Perform any initialization needed for crawling using this plugin
    """
    self.logger = logger
    # list() so templates can be iterated more than once; on Python 3
    # map() returns a single-use iterator
    templates = list(map(self._get_annotated_template, spec['templates']))
    _item_template_pages = sorted((
        [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
         t.get('extractors', []), t.get('version', '0.12.0')]
        for t in templates if t.get('page_type', 'item') == 'item'
    ), key=lambda x: x[0])
    self.item_classes = {}
    self.template_scrapes = {template.get('page_id'): template['scrapes']
                             for template in templates}
    if (settings.get('AUTO_PAGINATION') or
            spec.get('links_to_follow') == 'auto'):
        self.html_link_extractor = PaginationExtractor()
    else:
        self.html_link_extractor = HtmlLinkExtractor()
    for schema_name, schema in items.items():
        if schema_name not in self.item_classes:
            if not schema.get('name'):
                schema['name'] = schema_name
            item_cls = SlybotItem.create_iblitem_class(schema)
            self.item_classes[schema_name] = item_cls

    # Create descriptors and apply additional extractors to fields
    page_descriptor_pairs = []
    self.schema_descriptors = {}
    for default, template, template_extractors, v in _item_template_pages:
        descriptors = OrderedDict()
        for schema_name, schema in items.items():
            item_descriptor = create_slybot_item_descriptor(schema,
                                                            schema_name)
            apply_extractors(item_descriptor, template_extractors,
                             extractors)
            descriptors[schema_name] = item_descriptor
        # list() so the fallback below can be indexed on Python 3
        descriptor = list(descriptors.values()) or [{}]
        descriptors['#default'] = descriptors.get(default, descriptor[0])
        self.schema_descriptors[template.page_id] = descriptors['#default']
        page_descriptor_pairs.append((template, descriptors, v))
        add_extractors_to_descriptors(descriptors, extractors)

    # Templates older than 0.13.0 are handled by scrapely's extractor,
    # newer ones by slybot's own IBL extractor
    grouped = itertools.groupby(sorted(page_descriptor_pairs,
                                       key=operator.itemgetter(2)),
                                lambda x: x[2] < '0.13.0')
    self.extractors = []
    for version, group in grouped:
        if version:
            self.extractors.append(
                InstanceBasedLearningExtractor(
                    [(page, scrapes['#default'])
                     for page, scrapes, version in group]))
        else:
            self.extractors.append(SlybotIBLExtractor(list(group)))

    # generate ibl extractor for links pages
    _links_pages = [dict_to_page(t, 'annotated_body')
                    for t in templates if t.get('page_type') == 'links']
    _links_item_descriptor = create_slybot_item_descriptor({'fields': {}})
    self._links_ibl_extractor = InstanceBasedLearningExtractor(
        [(t, _links_item_descriptor) for t in _links_pages]) \
        if _links_pages else None

    self.build_url_filter(spec)

    # Clustering
    self.template_names = [t.get('page_id') for t in spec['templates']]
    if settings.get('PAGE_CLUSTERING'):
        try:
            import page_clustering
            self.clustering = page_clustering.kmeans_from_samples(
                spec['templates'])
            self.logger.info("Clustering activated")
        except ImportError:
            self.clustering = None
            self.logger.warning(
                "Clustering could not be used because it is not installed")
    else:
        self.clustering = None

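# The clustering block above uses the usual optional-dependency guard:
# import at the point of use and degrade gracefully when the package is
# missing. A generic sketch of the pattern ('some_optional_lib' is a
# hypothetical module name):
def load_optional_feature(logger):
    """Return the optional module if installed, else None."""
    try:
        import some_optional_lib
    except ImportError:
        logger.warning('feature disabled: some_optional_lib not installed')
        return None
    return some_optional_lib
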
        'description': {'required': False, 'vary': False, 'type': 'text'},
        'rank': {'required': False, 'vary': False, 'type': 'price'}}
    }
}
simple_template = HtmlPage(url="http://www.test.com/a",
                           body=apply_annotations(annotations, html))
target1 = base_page('\n'.join(item_template(idx=i, rank=1)
                              for i in range(1, 11)))
target2 = base_page('\n'.join(item_template(idx=i, rank=i if i % 2 else '')
                              for i in range(1, 11)))
target1 = HtmlPage(url="http://www.test.com/a", body=target1)
target2 = HtmlPage(url="http://www.test.com/a", body=target2)
simple_descriptors = {k: create_slybot_item_descriptor(v)
                      for k, v in schemas.items()}
add_extractors_to_descriptors(simple_descriptors, {})

td = TokenDict()
html_page = HtmlPage(body=open_spec('stack_overflow.html').decode('utf-8'))
extraction_page = parse_extraction_page(td, html_page)
with open('%s/data/SampleProject/items.json' % PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]

}
sample = {'plugins': {'annotations-plugin': {'extracts': annotations}},
          'original_body': html}
simple_template = HtmlPage(url="http://www.test.com/a",
                           body=Annotations(sample).apply())
target1 = base_page('\n'.join(item_template(idx=i, rank=1)
                              for i in range(1, 11)))
target2 = base_page('\n'.join(item_template(idx=i, rank=i if i % 2 else '')
                              for i in range(1, 11)))
target1 = HtmlPage(url="http://www.test.com/a", body=target1)
target2 = HtmlPage(url="http://www.test.com/a", body=target2)
simple_descriptors = {k: create_slybot_item_descriptor(v)
                      for k, v in schemas.items()}
add_extractors_to_descriptors(simple_descriptors, {})

td = TokenDict()
html_page = HtmlPage(body=open_spec('stack_overflow.html'))
extraction_page = parse_extraction_page(td, html_page)
with open('%s/data/SampleProject/items.json' % PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]