def _build_sample(sample):
    from slybot.plugins.scrapely_annotations.builder import Annotations
    data = sample.get('plugins', {}).get('annotations-plugin')
    if data:
        Annotations().save_extraction_data(data, sample)
    sample['page_id'] = sample.get('page_id') or sample.get('id') or ""
    return sample
def open_sample_and_page(name):
    sample_spec = open_spec(name)
    url = sample_spec['url']
    return (HtmlPage(url=url, body=Annotations(sample_spec).apply()),
            HtmlPage(url=url, body=sample_spec['original_body']))
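# Usage sketch for the helper above. 'sample.json' is a hypothetical spec
# name; open_spec resolves it against the test data directory, and the
# descriptors would come from the sample's schema:
#
#     annotated, original = open_sample_and_page('sample.json')
#     extractor = SlybotIBLExtractor([(annotated, descriptors, '0.13.0')])
#     items = extractor.extract(original)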
def _build_sample(sample, legacy=False):
    from slybot.plugins.scrapely_annotations.builder import Annotations
    Annotations(sample, legacy=legacy).build()
    sample['annotated'] = True
    return sample
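# Usage sketch (hypothetical sample dict), assuming the builder fills in the
# sample's annotated body in place from the annotations-plugin data, as the
# surrounding helpers suggest:
#
#     sample = {
#         'url': 'http://example.com/',
#         'original_body': u'<html>...</html>',
#         'plugins': {'annotations-plugin': {'extracts': []}},
#     }
#     sample = _build_sample(sample, legacy=False)
#     assert sample['annotated']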
import six

REQUIRED_FILES = {
    'setup.py', 'scrapy.cfg', 'extractors.json', 'items.json',
    'project.json', 'spiders/__init__.py', 'spiders/settings.py'
}
FILE_TEMPLATES = {
    'extractors.json': '{}',
    'items.json': '{}',
    'project.json': templates['PROJECT'],
    'scrapy.cfg': templates['SCRAPY'],
    'setup.py': templates['SETUP'],
    'spiders/__init__.py': '',
    'spiders/settings.py': templates['SETTINGS']
}
apply_annotations = Annotations().save_extraction_data


class ProjectArchiver(object):

    required_files = frozenset(REQUIRED_FILES)
    file_templates = FILE_TEMPLATES

    def __init__(self, project, version=None, required_files=None):
        if version is None:
            version = (0, 10)
        self.separator = os.path.sep
        self.version = version
        self.project = project
        if required_files is not None:
            self.required_files = required_files
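# Instantiation sketch: `project` stands in for whatever project object the
# archiver wraps (hypothetical here). The version tuple defaults to (0, 10),
# and the file whitelist can be narrowed per instance:
#
#     archiver = ProjectArchiver(project, version=(0, 10),
#                                required_files={'items.json', 'project.json'})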
class ExtractorTest(TestCase):

    annotated = u"""
<table>
<tr data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;gender&quot;}}">
<th class="item-key">Gender</th>
<td >Male</td></tr>
</table>"""
    _target = u"""
<table>
<tr>
<th class="item-key">Gender</th>
<td >Male</td></tr>
</table>"""
    annotated2 = u"""
<table>
<tr data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">
<th class="item-key">Name</th>
<td >John</td></tr>
<span data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;gender&quot;}}">Male</span>
</table>"""
    _target2 = u"""
<body>
<tr>
<th class="item-key">Name</th><td>Olivia</td></tr>
<span></span>
</body>"""
    annotations = _clean_annotation_data([{
        'id': 'annotation1',
        'selector': 'td > a',
        'container_id': 'parent',
        'data': {
            1: {'attribute': 'content', 'field': 'title', 'required': False,
                'extractors': []},
            2: {'attribute': 'content', 'field': 'name', 'required': False,
                'extractors': ['3']},
            3: {'attribute': 'href', 'field': 'url', 'required': False,
                'extractors': ['1', '2']}
        }
    }, {
        'id': 'annotation2',
        'selector': 'span',
        'container_id': 'parent',
        'data': {
            1: {'attribute': 'content', 'field': 'price', 'required': False,
                'extractors': ['8', '4', '5', '6']},
            2: {'attribute': 'content', 'field': 'date', 'required': False,
                'extractors': ['4', '7']}
        }
    }, {
        'id': 'parent',
        'item_container': True,
        'selector': 'body'
    }])
    target3 = u"""
<html>
<body>
<tr>
<th class="item-key">Name</th>
<td>
<a href="/olivia.html">Name: Olivia</a>
</td>
</tr><span>2016-03-17 20:25</span>
</body></html>"""
    template = HtmlPage(url="http://www.test.com/", body=annotated)
    target = HtmlPage(url="http://www.test.com/", body=_target)
    template2 = HtmlPage(url="http://www.test.com/", body=annotated2)
    target2 = HtmlPage(url="http://www.test.com/a", body=_target2)
    sample3 = {
        'plugins': {'annotations-plugin': {'extracts': annotations}},
        'original_body': target3
    }
    template3 = HtmlPage(url="http://www.test.com/a",
                         body=Annotations(sample3).apply())
    target3 = HtmlPage(url="http://www.test.com/a", body=target3)

    def test_regex_extractor(self):
        extractor = create_regex_extractor(r"(\d+).*(\.\d+)")
        extracted = extractor(
            u"The price of this product is <div>45</div> "
            u"</div class='small'>.50</div> pounds")
        self.assertEqual(extracted, u"45.50")
        processor = TextFieldTypeProcessor()
        self.assertEqual(processor.adapt(extracted, None), u"45.50")

    def test_raw_type_w_regex(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'raw',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": r"Gender.*(<td\s*>(?:Male|Female)</td>)"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'<td >Male</td>'])

    def test_negative_hit_w_regex(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": r"Gender\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0], None)

    def test_text_type_w_regex(self):
        schema = {
            "fields": {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": r"Gender\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Male'])

    def test_type_extractor(self):
        schema = {
            "fields": {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"type_extractor": "text"},
            2: {"regular_expression": r"Gender\s+(Male|Female)"}
        }
        apply_extractors(descriptor, {"gender": [1, 2]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Male'])

    def test_default_type_extractor(self):
        schema = {'fields': {}}
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": r"Gender\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Male'])

    def test_text_type_w_regex_and_no_groups(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "Gender"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Gender'])

    def test_extractor_w_empty_string_extraction(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                },
                'name': {
                    'required': True,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "([0-9]+)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template2, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target2)[0][0]['name'],
                         [u'Name Olivia'])

    def test_per_annotation_extractors(self):
        schema = {
            'fields': {
                'url': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                },
                'name': {
                    'required': True,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        extractors = {
            '1': {'type_extractor': 'url'},
            '2': {'regular_expression': r'(.*)\.html'},
            '3': {'regular_expression': 'Name: (.*)'},
            '4': {'type_extractor': 'text'},
            '5': {'type_extractor': 'price'},
            '6': {'type_extractor': 'number'},
            '7': {'type_extractor': 'date'},
            '8': {'regular_expression': r'(\d+)-'}
        }
        descriptors = {'#default': create_slybot_item_descriptor(schema)}
        add_extractors_to_descriptors(descriptors, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template3, descriptors, '0.13.0')])
        result = {
            u'_template': '6223d000057491040e4f411cf1f0734ea802eeb6',
            'name': [u'Olivia'],
            'url': [u'http://www.test.com/olivia'],
            'title': [u'Name: Olivia'],
            'price': [u'2016'],
            'date': [datetime(2016, 3, 17, 20, 25)]
        }
        data = ibl_extractor.extract(self.target3)[0][0]
        self.assertEqual(data, result)
                'type': 'price'
            }
        }
    }
}
sample = {
    'plugins': {'annotations-plugin': {'extracts': annotations}},
    'original_body': html
}
simple_template = HtmlPage(url="http://www.test.com/a",
                           body=Annotations(sample).apply())
target1 = base_page('\n'.join(item_template(idx=i, rank=1)
                              for i in range(1, 11)))
target2 = base_page('\n'.join(item_template(idx=i, rank=i if i % 2 else '')
                              for i in range(1, 11)))
target1 = HtmlPage(url="http://www.test.com/a", body=target1)
target2 = HtmlPage(url="http://www.test.com/a", body=target2)
simple_descriptors = {k: create_slybot_item_descriptor(v)
                      for k, v in schemas.items()}
add_extractors_to_descriptors(simple_descriptors, {})

td = TokenDict()
html_page = HtmlPage(body=open_spec('stack_overflow.html'))
extraction_page = parse_extraction_page(td, html_page)
        lextractor = create_linkextractor_from_specs(specs)
        response = UTF8TextResponse(url='http://www.example.com/',
                                    body=csvfeed3)
        links = list(lextractor.links_to_follow(response))
        self.assertEqual(len(links), 2)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[1].url, 'http://www.example.com/path2')


html = """
<a href="http://www.example.com/path">Click here</a>
"""
_PATH = dirname(__file__)
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    daft_sample = json.load(f)
daft_body = Annotations(daft_sample).apply()
daft_sample['annotated_body'] = daft_body


class Test_HtmlLinkExtractor(TestCase):
    def test_simple(self):
        specs = {"type": "html", "value": None}
        lextractor = create_linkextractor_from_specs(specs)
        response = UTF8HtmlResponse(url='http://www.example.com/', body=html)
        links = list(lextractor.links_to_follow(response))
        self.assertEqual(len(links), 1)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[0].text, 'Click here')


class Test_PaginationExtractor(TestCase):
def generate_from_samples(page_items, path='./slybot-project',
                          spider_name='aile', min_item_fields=2,
                          max_item_fields=None):
    """Generate a full slybot project.

    Parameters
    ----------
    page_items : List[(page, items)]
        `page` is an HtmlPage to which tagids attributes have been added;
        `items` is a List[Item] extracted from that page.
    path : str
        Directory in which to store the project.
    spider_name : str
        Name used for the generated spider and its template directory.
    min_item_fields : int or None
        Discard items with fewer fields than this number.
    max_item_fields : int or None
        Discard items with more fields than this number.

    Returns
    -------
    None
    """
    if not os.path.exists(path):
        os.mkdir(path)

    # project.json
    with open(os.path.join(path, 'project.json'), 'w') as project_file:
        json.dump(generate_project(), project_file, indent=4, sort_keys=True)

    # items.json
    all_items = collections.defaultdict(dict)
    for _, items in page_items:
        for item in items:
            for field_name, field_dict in item.dict['fields'].iteritems():
                all_items[item.name][field_name] = field_dict
    with open(os.path.join(path, 'items.json'), 'w') as items_file:
        json.dump(
            {item_name: {'fields': fields}
             for item_name, fields in all_items.iteritems()},
            items_file, indent=4, sort_keys=True)

    # extractors.json
    with open(os.path.join(path, 'extractors.json'), 'w') as extractors_file:
        json.dump({}, extractors_file, indent=4, sort_keys=True)

    # spiders/
    spiders_dir = os.path.join(path, 'spiders')
    if not os.path.exists(spiders_dir):
        os.mkdir(spiders_dir)
    spider_dir = os.path.join(spiders_dir, spider_name)
    if not os.path.exists(spider_dir):
        os.mkdir(spider_dir)

    templates = []
    for i, (page, items) in enumerate(page_items):
        template = generate_empty_template(page)
        annotations = []
        for j, item in enumerate(filter(item_is_tag, items)):
            if (min_item_fields is not None and
                    len(item.fields) < min_item_fields):
                continue
            if (max_item_fields is not None and
                    len(item.fields) > max_item_fields):
                continue
            annotations += merge_tagid_annotations(
                generate_item_annotations(item))
        annotations = merge_containers(annotations)
        template['plugins'] = {'annotations-plugin': {'extracts': annotations}}
        Annotations().save_extraction_data({'extracts': annotations}, template)

        template_name = 'template-{0}'.format(i)
        template['name'] = template['id'] = template_name
        template_path = os.path.join(spider_dir,
                                     '{0}.json'.format(template_name))
        with open(template_path, 'w') as template_file:
            json.dump(template, template_file, indent=4, sort_keys=True)
        html_path = os.path.join(spider_dir, template_name + '-annotated.html')
        with open(html_path, 'w') as template_annotated:
            template_annotated.write(
                template['annotated_body'].encode('utf-8'))
        templates.append(template_name)

    spider_path = os.path.join(spiders_dir, '{0}.json'.format(spider_name))
    with open(spider_path, 'w') as spider_file:
        json.dump(generate_spider(page.url, templates), spider_file,
                  indent=4, sort_keys=True)
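# Usage sketch (hypothetical data): `page` and `items` would normally come
# from aile's item-extraction step, so the names below are illustrative only,
# not a runnable snippet on their own:
#
#     page = HtmlPage(url='http://example.com/list', body=body_with_tagids)
#     items = detect_items(page)  # hypothetical upstream helper
#     generate_from_samples([(page, items)], path='./slybot-project',
#                           spider_name='example', min_item_fields=2)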