Ejemplo n.º 1
0
    def test_extractor_w_empty_string_extraction(self):
        """Check ``name`` extraction when ``gender`` has a digits-only regex.

        Attaches the ``([0-9]+)`` regex extractor to the optional ``gender``
        field and asserts that ``name`` in the first extracted item equals
        ``[u'Name Olivia']``.
        """
        gender_field = {'required': False, 'type': 'text', 'vary': False}
        name_field = {'required': True, 'type': 'text', 'vary': False}
        descriptor = create_slybot_item_descriptor(
            {'fields': {'gender': gender_field, 'name': name_field}})
        digits_regex = {"regular_expression": "([0-9]+)"}
        apply_extractors(descriptor, {"gender": [1]}, {1: digits_regex})

        templates = [(self.template2, {'#default': descriptor}, '0.12.0')]
        ibl_extractor = SlybotIBLExtractor(templates)
        first_item = ibl_extractor.extract(self.target2)[0][0]
        self.assertEqual(first_item['name'], [u'Name Olivia'])
Ejemplo n.º 2
0
    def test_type_extractor(self):
        """Check a ``text`` type extractor combined with a regex on ``gender``.

        The schema declares ``gender`` as ``number``; extractor 1 (type
        ``text``) and extractor 2 (a Male/Female regex) are both applied to
        it, and the first extracted item must have ``gender == [u'Male']``.
        """
        number_field = {'required': False, 'type': 'number', 'vary': False}
        descriptor = create_slybot_item_descriptor(
            {"fields": {'gender': number_field}})
        as_text = {"type_extractor": "text"}
        gender_regex = {"regular_expression": "Gender\\s+(Male|Female)"}
        apply_extractors(descriptor, {"gender": [1, 2]},
                         {1: as_text, 2: gender_regex})

        templates = [(self.template, {'#default': descriptor}, '0.12.0')]
        ibl_extractor = SlybotIBLExtractor(templates)
        first_item = ibl_extractor.extract(self.target)[0][0]
        self.assertEqual(first_item['gender'], [u'Male'])
Ejemplo n.º 3
0
    def test_extractor_w_empty_string_extraction(self):
        """Extract ``name`` while ``gender`` carries a digits-only regex.

        ``gender`` is optional and gets the ``([0-9]+)`` regex extractor;
        the assertion only inspects ``name`` in the first extracted item.
        """
        fields = {
            'gender': {'required': False, 'type': 'text', 'vary': False},
            'name': {'required': True, 'type': 'text', 'vary': False},
        }
        descriptor = create_slybot_item_descriptor({'fields': fields})
        extractors = {1: {"regular_expression": "([0-9]+)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        template_spec = (self.template2, {'#default': descriptor}, '0.12.0')
        extraction = SlybotIBLExtractor([template_spec]).extract(self.target2)
        self.assertEqual(extraction[0][0]['name'], [u'Name Olivia'])
 def test_extract_missing_schema(self):
     """With empty descriptors, the three name fields hold the raw HTML.

     Inspects the second extracted item and expects ``full_name``,
     ``first_name`` and ``last_name`` to each contain the same markup.
     """
     templates = [(sample_411, {}, '0.13.0')]
     second_item = SlybotIBLExtractor(templates).extract(page_411)[0][1]
     expected = [
         '<span itemprop="name"><span itemprop="givenName">Joe'
         '</span> <span itemprop="familyName">Smith</span></span>'
     ]
     for field in ('full_name', 'first_name', 'last_name'):
         self.assertEqual(second_item[field], expected)
 def test_extract_missing_schema(self):
     """Extract with no descriptors and compare name fields to raw markup."""
     name_markup = ''.join([
         '<span itemprop="name"><span itemprop="givenName">Joe',
         '</span> <span itemprop="familyName">Smith</span></span>',
     ])
     extractor = SlybotIBLExtractor([(sample_411, {}, '0.13.0')])
     items = extractor.extract(page_411)[0]
     record = items[1]
     self.assertEqual(record['full_name'], [name_markup])
     self.assertEqual(record['first_name'], [name_markup])
     self.assertEqual(record['last_name'], [name_markup])
Ejemplo n.º 6
0
 def test_per_annotation_extractors(self):
     """Run the numbered per-annotation extractors against ``template3``.

     Registers type and regex extractors under the string ids '1' to '8',
     attaches them to the default descriptor and compares the first
     extracted item (including its ``_template`` id) with the expected
     dictionary.
     """
     schema = {
         'fields': {
             'url': {
                 'required': False,
                 'type': 'text',
                 'vary': False,
             },
             'name': {
                 'required': True,
                 'type': 'text',
                 'vary': False,
             }
         }
     }
     # Regex patterns are raw strings so that escapes such as \. and \d
     # reach the regex engine as written instead of depending on Python
     # passing unknown string escapes through (deprecated behaviour).
     extractors = {
         '1': {
             'type_extractor': 'url'
         },
         '2': {
             'regular_expression': r'(.*)\.html'
         },
         '3': {
             'regular_expression': 'Name: (.*)'
         },
         '4': {
             'type_extractor': 'text'
         },
         '5': {
             'type_extractor': 'price'
         },
         '6': {
             'type_extractor': 'number'
         },
         '7': {
             'type_extractor': 'date'
         },
         '8': {
             'regular_expression': r'(\d+)-'
         }
     }
     descriptors = {'#default': create_slybot_item_descriptor(schema)}
     add_extractors_to_descriptors(descriptors, extractors)
     ibl_extractor = SlybotIBLExtractor([(self.template3, descriptors,
                                          '0.13.0')])
     result = {
         u'_template': '6223d000057491040e4f411cf1f0734ea802eeb6',
         'name': [u'Olivia'],
         'url': [u'http://www.test.com/olivia'],
         'title': [u'Name: Olivia'],
         'price': [u'2016'],
         'date': [datetime(2016, 3, 17, 20, 25)]
     }
     data = ibl_extractor.extract(self.target3)[0][0]
     self.assertEqual(data, result)
Ejemplo n.º 7
0
    def test_default_type_extractor(self):
        """Apply a regex extractor to a field that the schema does not list.

        The schema has no fields; the Male/Female regex is attached to
        ``gender`` and the first extracted item must have
        ``gender == [u'Male']``.
        """
        descriptor = create_slybot_item_descriptor({'fields': {}})
        gender_regex = {"regular_expression": "Gender\\s+(Male|Female)"}
        apply_extractors(descriptor, {"gender": [1]}, {1: gender_regex})

        templates = [(self.template, {'#default': descriptor}, '0.12.0')]
        first_item = SlybotIBLExtractor(templates).extract(self.target)[0][0]
        self.assertEqual(first_item['gender'], [u'Male'])
Ejemplo n.º 8
0
 def test_per_annotation_extractors(self):
     """Run the numbered per-annotation extractors against ``template3``.

     Registers type and regex extractors under the string ids '1' to '8',
     attaches them to the default descriptor, drops the ``_template`` key
     from the first extracted item and compares the remainder with the
     expected dictionary.
     """
     schema = {
         'fields': {
             'url': {
                 'required': False,
                 'type': 'text',
                 'vary': False,
             },
             'name': {
                 'required': True,
                 'type': 'text',
                 'vary': False,
             }
         }
     }
     # Regex patterns are raw strings so that escapes such as \. and \d
     # reach the regex engine as written instead of depending on Python
     # passing unknown string escapes through (deprecated behaviour).
     extractors = {
         '1': {
             'type_extractor': 'url'
         },
         '2': {
             'regular_expression': r'(.*)\.html'
         },
         '3': {
             'regular_expression': 'Name: (.*)'
         },
         '4': {
             'type_extractor': 'text'
         },
         '5': {
             'type_extractor': 'price'
         },
         '6': {
             'type_extractor': 'number'
         },
         '7': {
             'type_extractor': 'date'
         },
         '8': {
             'regular_expression': r'(\d+)-'
         }
     }
     descriptors = {'#default': create_slybot_item_descriptor(schema)}
     add_extractors_to_descriptors(descriptors, extractors)
     ibl_extractor = SlybotIBLExtractor([
         (self.template3, descriptors, '0.13.0')
     ])
     result = {'name': [u'Olivia'], 'url': [u'http://www.test.com/olivia'],
               'title': [u'Name: Olivia'], 'price': [u'2016'],
               'date': [datetime(2016, 3, 17, 20, 25)]}
     data = ibl_extractor.extract(self.target3)[0][0]
     # The template id is not part of the expected payload.
     del data['_template']
     self.assertEqual(data, result)
Ejemplo n.º 9
0
    def test_default_type_extractor(self):
        """Regex extractor on ``gender`` with an empty schema yields Male."""
        empty_schema = {'fields': {}}
        descriptor = create_slybot_item_descriptor(empty_schema)
        regex_by_id = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, regex_by_id)

        template_spec = (self.template, {'#default': descriptor}, '0.12.0')
        ibl_extractor = SlybotIBLExtractor([template_spec])
        extraction = ibl_extractor.extract(self.target)
        self.assertEqual(extraction[0][0]['gender'], [u'Male'])
 def test_required_annotation(self):
     """Every extracted item must have truthy ``rank`` and ``description``.

     Expects 10 items from ``target1`` and 5 items from ``target2``.
     """
     ibl_extractor = SlybotIBLExtractor(
         [(simple_template, simple_descriptors, '0.13.0')])
     for page, expected_count in ((target1, 10), (target2, 5)):
         items, _ = ibl_extractor.extract(page)
         self.assertEqual(len(items), expected_count)
         for field in ('rank', 'description'):
             self.assertTrue(
                 all(field in item and item[field] for item in items))
 def test_required_annotation(self):
     """Check item counts and presence of ``rank``/``description`` values."""
     def every_item_has(items, field):
         # True only when each item contains a truthy value for ``field``.
         return all(field in item and item[field] for item in items)

     template_spec = (simple_template, simple_descriptors, '0.13.0')
     ibl_extractor = SlybotIBLExtractor([template_spec])

     first_batch, _ = ibl_extractor.extract(target1)
     self.assertEqual(len(first_batch), 10)
     self.assertTrue(every_item_has(first_batch, 'rank'))
     self.assertTrue(every_item_has(first_batch, 'description'))

     second_batch, _ = ibl_extractor.extract(target2)
     self.assertEqual(len(second_batch), 5)
     self.assertTrue(every_item_has(second_batch, 'rank'))
     self.assertTrue(every_item_has(second_batch, 'description'))
 def test_extract_single_attribute_to_multiple_fields(self):
     """Check the name fields extracted from ``page_411`` with regexes.

     Two regex extractors are registered and added to the descriptors.
     ``first_name`` and ``last_name`` declare a ``name`` in the schema, and
     the assertions read the second extracted item under those names.
     """
     # Raw strings: the whitespace escape must reach the regex engine as
     # written rather than rely on Python tolerating an unknown string
     # escape (deprecated behaviour).
     extractors = {'1': {'regular_expression': r'(.*)\s'},
                   '2': {'regular_expression': r'\s(.*)'}}
     descriptors = {'#default': create_slybot_item_descriptor({'fields': {
         'full_name': {'type': 'text', 'required': False, 'vary': False},
         'first_name': {'type': 'text', 'required': False, 'vary': False,
                        'name': u'prénom'},
         'last_name': {'type': 'text', 'required': False, 'vary': False,
                       'name': 'nom'},
         'address': {'type': 'text', 'required': False, 'vary': False}}})}
     add_extractors_to_descriptors(descriptors, extractors)
     extractor = SlybotIBLExtractor([(sample_411, descriptors, '0.13.0')])
     data = extractor.extract(page_411)[0][1]
     self.assertEqual(data['full_name'], [u'Joe Smith'])
     self.assertEqual(data[u'prénom'], [u'Joe'])
     self.assertEqual(data['nom'], [u'Smith'])
 def test_extract_single_attribute_to_multiple_fields(self):
     """Check the name fields extracted from ``page_411`` with regexes.

     Two regex extractors are registered and added to the descriptors.
     ``first_name`` and ``last_name`` declare a ``name`` in the schema, and
     the assertions read the second extracted item under those names.
     """
     # Raw strings: the whitespace escape must reach the regex engine as
     # written rather than rely on Python tolerating an unknown string
     # escape (deprecated behaviour).
     extractors = {'1': {'regular_expression': r'(.*)\s'},
                   '2': {'regular_expression': r'\s(.*)'}}
     descriptors = {'#default': create_slybot_item_descriptor({'fields': {
         'full_name': {'type': 'text', 'required': False, 'vary': False},
         'first_name': {'type': 'text', 'required': False, 'vary': False,
                        'name': u'prénom'},
         'last_name': {'type': 'text', 'required': False, 'vary': False,
                       'name': 'nom'},
         'address': {'type': 'text', 'required': False, 'vary': False}}})}
     add_extractors_to_descriptors(descriptors, extractors)
     extractor = SlybotIBLExtractor([(sample_411, descriptors, '0.13.0')])
     data = extractor.extract(page_411)[0]
     self.assertEqual(data[1]['full_name'], [u'Joe Smith'])
     self.assertEqual(data[1][u'prénom'], [u'Joe'])
     self.assertEqual(data[1]['nom'], [u'Smith'])
Ejemplo n.º 14
0
    def test_negative_hit_w_regex(self):
        """Expect no items when a Male/Female regex feeds a ``number`` field.

        ``gender`` is declared as ``number`` and given the Male/Female regex
        extractor; the first element of the extraction result must be
        ``None``.
        """
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        # assertIsNone checks identity and reports a clearer failure message
        # than comparing against None with assertEqual.
        self.assertIsNone(ibl_extractor.extract(self.target)[0])
Ejemplo n.º 15
0
    def test_text_type_w_regex(self):
        """A ``text`` field with the Male/Female regex must yield ``Male``."""
        text_field = {'required': False, 'type': 'text', 'vary': False}
        descriptor = create_slybot_item_descriptor(
            {"fields": {'gender': text_field}})
        gender_regex = {"regular_expression": "Gender\\s+(Male|Female)"}
        apply_extractors(descriptor, {"gender": [1]}, {1: gender_regex})

        templates = [(self.template, {'#default': descriptor}, '0.12.0')]
        ibl_extractor = SlybotIBLExtractor(templates)
        first_item = ibl_extractor.extract(self.target)[0][0]
        self.assertEqual(first_item['gender'], [u'Male'])
Ejemplo n.º 16
0
    def test_raw_type_w_regex(self):
        """A ``raw`` field with a regex must yield the matched ``<td>`` markup.

        The first extracted item is expected to have
        ``gender == [u'<td >Male</td>']``.
        """
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'raw',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        # Raw string so the whitespace escape reaches the regex engine as
        # written instead of relying on an unknown string escape surviving.
        extractors = {1: {
            "regular_expression": r"Gender.*(<td\s*>(?:Male|Female)</td>)"
        }}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([(self.template, {'#default': descriptor})])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'<td >Male</td>'])
Ejemplo n.º 17
0
    def test_raw_type_w_regex(self):
        """A ``raw`` field with a regex must yield the matched ``<td>`` markup.

        The first extracted item is expected to have
        ``gender == [u'<td >Male</td>']``.
        """
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'raw',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        # Raw string so the whitespace escape reaches the regex engine as
        # written instead of relying on an unknown string escape surviving.
        extractors = {
            1: {"regular_expression": r"Gender.*(<td\s*>(?:Male|Female)</td>)"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'<td >Male</td>'])
Ejemplo n.º 18
0
    def test_text_type_w_regex_and_no_groups(self):
        """A group-less ``Gender`` regex on a ``text`` field yields ``Gender``.

        The first extracted item must have ``gender == [u'Gender']``.
        """
        text_field = {'required': False, 'type': 'text', 'vary': False}
        descriptor = create_slybot_item_descriptor(
            {'fields': {'gender': text_field}})
        groupless_regex = {"regular_expression": "Gender"}
        apply_extractors(descriptor, {"gender": [1]}, {1: groupless_regex})

        templates = [(self.template, {'#default': descriptor}, '0.12.0')]
        ibl_extractor = SlybotIBLExtractor(templates)
        first_item = ibl_extractor.extract(self.target)[0][0]
        self.assertEqual(first_item['gender'], [u'Gender'])
Ejemplo n.º 19
0
 def test_required_annotation(self):
     """Compare extraction from ``page_daft`` with two different templates.

     With ``sample_daft`` the page must yield 5 items that all contain
     ``ber``, ``address`` and ``price_change``. With
     ``sample_daft_no_requireds`` it must yield 8 items that all contain
     ``ber`` and ``address``, at least one of which lacks ``price_change``.
     """
     extractor = SlybotIBLExtractor([(sample_daft, {}, '0.13.0')])
     data = extractor.extract(page_daft)[0]
     self.assertEqual(len(data), 5)
     # unittest assertions are used instead of bare ``assert`` statements,
     # which are removed when Python runs with -O.
     self.assertTrue(all('ber' in house for house in data))
     self.assertTrue(all('address' in house for house in data))
     self.assertTrue(all('price_change' in house for house in data))
     extractor = SlybotIBLExtractor([(sample_daft_no_requireds, {},
                                      '0.13.0')])
     data = extractor.extract(page_daft)[0]
     self.assertEqual(len(data), 8)
     self.assertTrue(all('ber' in house for house in data))
     self.assertTrue(all('address' in house for house in data))
     self.assertTrue(any('price_change' not in house for house in data))
Ejemplo n.º 20
0
 def test_extract_missing_schema(self):
     """With empty descriptors, every name field equals ``[u'Joe Smith']``.

     Inspects the second extracted item from ``page_411``.
     """
     templates = [(sample_411, {}, '0.13.0')]
     items = SlybotIBLExtractor(templates).extract(page_411)[0]
     for field in ('full_name', 'first_name', 'last_name'):
         self.assertEqual(items[1][field], [u'Joe Smith'])
Ejemplo n.º 21
0
        self.legacy = legacy
        self.modifiers = {}


# Module-level fixtures shared by the tests: a validating schema wrapper plus,
# for each template page, the parsed HtmlPage, a SlybotIBLExtractor trained on
# it and a Selector built from its body.
schema = FakeContainer(descriptors['#default'])
validate = schema._validate_and_adapt_item
# Short names under which some templates are stored in the dictionaries below.
_names_map = {'daft_ie': 'daft', 'patchofland': 'pol'}
ibl_extractors = {}
ibl_pages = {}
selector_pages = {}
for template_name in ('daft_ie', 'hn', 'patchofland'):
    template_path = '%s/data/templates/%s.html' % (_PATH, template_name)
    # Read bytes explicitly: calling .decode() on the result of a text-mode
    # read only works on Python 2, while a binary read works on 2 and 3.
    with open(template_path, 'rb') as f:
        html_page = HtmlPage(body=f.read().decode('utf-8'))
    name = _names_map.get(template_name, template_name)
    ibl_pages[name] = html_page
    ibl_extractors[name] = SlybotIBLExtractor([(html_page, descriptors,
                                                '0.13.0')])
    selector_pages[name] = Selector(text=html_page.body)


class TestExtractionSpeed(TestCase):
    """Repeat parse-and-extract over every loaded page, ITERATIONS times."""

    def test_parsel_parse_and_extract(self):
        """Build a fresh ``Selector`` per page and run ``extract`` on it."""
        for _ in range(ITERATIONS):
            for name, page in ibl_pages.items():
                selector = Selector(text=page.body)
                extract(parsel_extractors[name], selector)

    def test_slybot_parse_and_extract(self):
        """Build a fresh ``HtmlPage`` per page and run its IBL extractor."""
        for _ in range(ITERATIONS):
            for name, page in ibl_pages.items():
                fresh_page = HtmlPage(body=page.body)
                ibl_extractors[name].extract(fresh_page)