def test_required_annotation(self):
    """Check item counts and mandatory fields for both sample targets.

    A single extractor is built from ``simple_template`` and
    ``simple_descriptors``. ``target1`` must yield 10 items and ``target2``
    must yield 5, and every item must carry a truthy ``rank`` and
    ``description``.
    """
    templates = [(simple_template, simple_descriptors, '0.13.0')]
    ibl_extractor = SlybotIBLExtractor(templates)
    for page, expected_count in ((target1, 10), (target2, 5)):
        items, _ = ibl_extractor.extract(page)
        self.assertEqual(len(items), expected_count)
        for field in ('rank', 'description'):
            self.assertTrue(
                all(field in item and item[field] for item in items))
def test_required_annotation(self):
    """Compare extraction results of two templates on the same page.

    With ``sample_daft`` the page must yield 5 items, each containing
    ``ber``, ``address`` and ``price_change``. With
    ``sample_daft_no_requireds`` it must yield 8 items that all contain
    ``ber`` and ``address``, with at least one lacking ``price_change``.
    """
    extractor = SlybotIBLExtractor([(sample_daft, {}, '0.13.0')])
    data = extractor.extract(page_daft)[0]
    self.assertEqual(len(data), 5)
    # unittest assertions are used instead of bare ``assert`` so the checks
    # still run when Python is started with -O.
    self.assertTrue(all('ber' in house for house in data))
    self.assertTrue(all('address' in house for house in data))
    self.assertTrue(all('price_change' in house for house in data))

    extractor = SlybotIBLExtractor(
        [(sample_daft_no_requireds, {}, '0.13.0')])
    data = extractor.extract(page_daft)[0]
    self.assertEqual(len(data), 8)
    self.assertTrue(all('ber' in house for house in data))
    self.assertTrue(all('address' in house for house in data))
    self.assertTrue(any('price_change' not in house for house in data))
def test_type_extractor(self):
    """Apply a text type extractor plus a regex to a ``number`` field.

    ``gender`` is declared with type ``number`` in the schema, and
    extractors 1 (type extractor ``text``) and 2 (a regular expression)
    are both applied to it. The first extracted item must then have
    ``gender`` equal to ``[u'Male']``.
    """
    gender_field = {'required': False, 'type': 'number', 'vary': False}
    descriptor = create_slybot_item_descriptor(
        {"fields": {'gender': gender_field}})
    extractors = {
        1: {"type_extractor": "text"},
        2: {"regular_expression": "Gender\\s+(Male|Female)"},
    }
    apply_extractors(descriptor, {"gender": [1, 2]}, extractors)

    templates = [(self.template, {'#default': descriptor}, '0.12.0')]
    ibl_extractor = SlybotIBLExtractor(templates)
    first_item = ibl_extractor.extract(self.target)[0][0]
    self.assertEqual(first_item['gender'], [u'Male'])
def test_extractor_w_empty_string_extraction(self):
    """Check ``name`` is extracted when a digit regex is set on ``gender``.

    The schema has an optional ``gender`` and a required ``name`` field,
    and only ``gender`` gets the ``([0-9]+)`` regular expression. The first
    item extracted from ``self.target2`` must have ``name`` equal to
    ``[u'Name Olivia']``.
    """
    fields = {
        'gender': {'required': False, 'type': 'text', 'vary': False},
        'name': {'required': True, 'type': 'text', 'vary': False},
    }
    descriptor = create_slybot_item_descriptor({'fields': fields})
    digit_extractor = {1: {"regular_expression": "([0-9]+)"}}
    apply_extractors(descriptor, {"gender": [1]}, digit_extractor)

    templates = [(self.template2, {'#default': descriptor}, '0.12.0')]
    ibl_extractor = SlybotIBLExtractor(templates)
    first_item = ibl_extractor.extract(self.target2)[0][0]
    self.assertEqual(first_item['name'], [u'Name Olivia'])
def test_extract_missing_schema(self):
    """Extract with an empty descriptor mapping and expect raw HTML.

    No descriptors are supplied for ``sample_411``. In the second extracted
    item, ``full_name``, ``first_name`` and ``last_name`` must each equal
    the same raw markup of the name element.
    """
    templates = [(sample_411, {}, '0.13.0')]
    second_item = SlybotIBLExtractor(templates).extract(page_411)[0][1]
    raw_html = ('<span itemprop="name"><span itemprop="givenName">Joe'
                '</span> <span itemprop="familyName">Smith</span></span>')
    for field in ('full_name', 'first_name', 'last_name'):
        self.assertEqual(second_item[field], [raw_html])
def test_per_annotation_extractors(self):
    """Run ``template3`` with per-annotation extractors and check the item.

    The extractor table mixes type extractors and regular expressions and
    is attached to the descriptors with ``add_extractors_to_descriptors``.
    The first extracted item, including its ``_template`` key, is compared
    against the expected dict.
    """
    schema = {
        'fields': {
            'url': {'required': False, 'type': 'text', 'vary': False},
            'name': {'required': True, 'type': 'text', 'vary': False},
        }
    }
    # Regex patterns are raw strings: ``\.`` and ``\d`` are not valid
    # escape sequences in ordinary string literals and trigger warnings
    # on modern Python versions. The runtime values are unchanged.
    extractors = {
        '1': {'type_extractor': 'url'},
        '2': {'regular_expression': r'(.*)\.html'},
        '3': {'regular_expression': 'Name: (.*)'},
        '4': {'type_extractor': 'text'},
        '5': {'type_extractor': 'price'},
        '6': {'type_extractor': 'number'},
        '7': {'type_extractor': 'date'},
        '8': {'regular_expression': r'(\d+)-'},
    }
    descriptors = {'#default': create_slybot_item_descriptor(schema)}
    add_extractors_to_descriptors(descriptors, extractors)
    ibl_extractor = SlybotIBLExtractor(
        [(self.template3, descriptors, '0.13.0')])
    result = {
        u'_template': '6223d000057491040e4f411cf1f0734ea802eeb6',
        'name': [u'Olivia'],
        'url': [u'http://www.test.com/olivia'],
        'title': [u'Name: Olivia'],
        'price': [u'2016'],
        'date': [datetime(2016, 3, 17, 20, 25)],
    }
    data = ibl_extractor.extract(self.target3)[0][0]
    self.assertEqual(data, result)
def test_default_type_extractor(self):
    """Apply a regex to a field that is absent from the schema.

    The schema declares no fields, yet a regular expression is applied to
    ``gender``. The first extracted item must still have ``gender`` equal
    to ``[u'Male']``.
    """
    descriptor = create_slybot_item_descriptor({'fields': {}})
    gender_regex = {"regular_expression": "Gender\\s+(Male|Female)"}
    apply_extractors(descriptor, {"gender": [1]}, {1: gender_regex})

    descriptors = {'#default': descriptor}
    ibl_extractor = SlybotIBLExtractor(
        [(self.template, descriptors, '0.12.0')])
    extracted_items = ibl_extractor.extract(self.target)[0]
    self.assertEqual(extracted_items[0]['gender'], [u'Male'])
def test_per_annotation_extractors(self):
    """Run ``template3`` with per-annotation extractors and check the item.

    The extractor table mixes type extractors and regular expressions and
    is attached to the descriptors with ``add_extractors_to_descriptors``.
    The ``_template`` key is removed from the first extracted item before
    it is compared against the expected dict.
    """
    schema = {
        'fields': {
            'url': {'required': False, 'type': 'text', 'vary': False},
            'name': {'required': True, 'type': 'text', 'vary': False},
        }
    }
    # Regex patterns are raw strings: ``\.`` and ``\d`` are not valid
    # escape sequences in ordinary string literals and trigger warnings
    # on modern Python versions. The runtime values are unchanged.
    extractors = {
        '1': {'type_extractor': 'url'},
        '2': {'regular_expression': r'(.*)\.html'},
        '3': {'regular_expression': 'Name: (.*)'},
        '4': {'type_extractor': 'text'},
        '5': {'type_extractor': 'price'},
        '6': {'type_extractor': 'number'},
        '7': {'type_extractor': 'date'},
        '8': {'regular_expression': r'(\d+)-'},
    }
    descriptors = {'#default': create_slybot_item_descriptor(schema)}
    add_extractors_to_descriptors(descriptors, extractors)
    ibl_extractor = SlybotIBLExtractor([
        (self.template3, descriptors, '0.13.0')
    ])
    result = {
        'name': [u'Olivia'],
        'url': [u'http://www.test.com/olivia'],
        'title': [u'Name: Olivia'],
        'price': [u'2016'],
        'date': [datetime(2016, 3, 17, 20, 25)],
    }
    data = ibl_extractor.extract(self.target3)[0][0]
    # ``_template`` is dropped before comparing; ``result`` does not
    # include it.
    del data['_template']
    self.assertEqual(data, result)
def test_default_type_extractor(self):
    """Verify regex extraction for a field with no schema entry.

    An empty ``fields`` schema is used and extractor 1 (a regular
    expression) is applied to ``gender``. The first item extracted from
    ``self.target`` must have ``gender`` equal to ``[u'Male']``.
    """
    empty_schema = {'fields': {}}
    descriptor = create_slybot_item_descriptor(empty_schema)
    regex_extractors = {
        1: {"regular_expression": "Gender\\s+(Male|Female)"},
    }
    field_to_extractor_ids = {"gender": [1]}
    apply_extractors(descriptor, field_to_extractor_ids, regex_extractors)

    template_spec = (self.template, {'#default': descriptor}, '0.12.0')
    ibl_extractor = SlybotIBLExtractor([template_spec])
    first_item = ibl_extractor.extract(self.target)[0][0]
    self.assertEqual(first_item['gender'], [u'Male'])
def test_extract_single_attribute_to_multiple_fields(self):
    """Check one source value split across several fields via regexes.

    Two regular-expression extractors are registered and the schema gives
    ``first_name`` and ``last_name`` explicit ``name`` values. The second
    extracted item must expose the full name under ``full_name`` and the
    two parts under those ``name`` values.
    """
    # Raw strings keep ``\s`` intact for the regex engine; in ordinary
    # literals it is an invalid escape sequence. Runtime values are
    # unchanged.
    extractors = {'1': {'regular_expression': r'(.*)\s'},
                  '2': {'regular_expression': r'\s(.*)'}}
    descriptors = {'#default': create_slybot_item_descriptor({'fields': {
        'full_name': {'type': 'text', 'required': False, 'vary': False},
        'first_name': {'type': 'text', 'required': False, 'vary': False,
                       'name': u'prénom'},
        'last_name': {'type': 'text', 'required': False, 'vary': False,
                      'name': 'nom'},
        'address': {'type': 'text', 'required': False, 'vary': False}}})}
    add_extractors_to_descriptors(descriptors, extractors)
    extractor = SlybotIBLExtractor([(sample_411, descriptors, '0.13.0')])
    data = extractor.extract(page_411)[0][1]
    self.assertEqual(data['full_name'], [u'Joe Smith'])
    self.assertEqual(data[u'prénom'], [u'Joe'])
    self.assertEqual(data['nom'], [u'Smith'])
def test_extract_single_attribute_to_multiple_fields(self):
    """Check one source value split across several fields via regexes.

    Two regular-expression extractors are registered and the schema gives
    ``first_name`` and ``last_name`` explicit ``name`` values. The item at
    index 1 of the extracted list must expose the full name under
    ``full_name`` and the two parts under those ``name`` values.
    """
    # Raw strings keep ``\s`` intact for the regex engine; in ordinary
    # literals it is an invalid escape sequence. Runtime values are
    # unchanged.
    extractors = {'1': {'regular_expression': r'(.*)\s'},
                  '2': {'regular_expression': r'\s(.*)'}}
    descriptors = {'#default': create_slybot_item_descriptor({'fields': {
        'full_name': {'type': 'text', 'required': False, 'vary': False},
        'first_name': {'type': 'text', 'required': False, 'vary': False,
                       'name': u'prénom'},
        'last_name': {'type': 'text', 'required': False, 'vary': False,
                      'name': 'nom'},
        'address': {'type': 'text', 'required': False, 'vary': False}}})}
    add_extractors_to_descriptors(descriptors, extractors)
    extractor = SlybotIBLExtractor([(sample_411, descriptors, '0.13.0')])
    data = extractor.extract(page_411)[0]
    self.assertEqual(data[1]['full_name'], [u'Joe Smith'])
    self.assertEqual(data[1][u'prénom'], [u'Joe'])
    self.assertEqual(data[1]['nom'], [u'Smith'])
def test_negative_hit_w_regex(self):
    """Expect no extracted items when a regex feeds a ``number`` field.

    ``gender`` is declared with type ``number`` and only a regular
    expression capturing ``Male``/``Female`` is applied to it. The first
    element of the extraction result must be ``None``.
    """
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'number',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = SlybotIBLExtractor([
        (self.template, {'#default': descriptor}, '0.12.0')])
    # Presumably the captured text cannot be read as a number, so nothing
    # is extracted -- confirm against the extractor implementation.
    # assertIsNone checks identity with None and gives a clearer failure
    # message than assertEqual(..., None).
    self.assertIsNone(ibl_extractor.extract(self.target)[0])
def test_text_type_w_regex(self):
    """Extract a ``text`` field through a capturing regular expression.

    ``gender`` is declared as ``text`` and the regex captures ``Male`` or
    ``Female`` after the ``Gender`` label. The first extracted item must
    have ``gender`` equal to ``[u'Male']``.
    """
    gender_field = {'required': False, 'type': 'text', 'vary': False}
    descriptor = create_slybot_item_descriptor(
        {"fields": {'gender': gender_field}})
    gender_regex = {"regular_expression": "Gender\\s+(Male|Female)"}
    apply_extractors(descriptor, {"gender": [1]}, {1: gender_regex})

    template_spec = (self.template, {'#default': descriptor}, '0.12.0')
    ibl_extractor = SlybotIBLExtractor([template_spec])
    extracted_items = ibl_extractor.extract(self.target)[0]
    self.assertEqual(extracted_items[0]['gender'], [u'Male'])
def test_raw_type_w_regex(self):
    """Extract a ``raw`` field through a regex that captures markup.

    ``gender`` is declared as ``raw`` and the regex captures the whole
    ``<td>`` cell containing ``Male`` or ``Female``. The first extracted
    item must have ``gender`` equal to ``[u'<td >Male</td>']``.
    """
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'raw',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    # Raw string: ``\s`` is an invalid escape sequence in an ordinary
    # literal. The runtime value of the pattern is unchanged.
    extractors = {1: {
        "regular_expression": r"Gender.*(<td\s*>(?:Male|Female)</td>)"
    }}
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    # NOTE(review): no version string is passed in this template tuple,
    # unlike the three-element tuples used by other tests in this file --
    # confirm this is intentional.
    ibl_extractor = SlybotIBLExtractor(
        [(self.template, {'#default': descriptor})])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                     [u'<td >Male</td>'])
def test_raw_type_w_regex(self):
    """Extract a ``raw`` field through a regex that captures markup.

    ``gender`` is declared as ``raw`` and the regex captures the whole
    ``<td>`` cell containing ``Male`` or ``Female``. The first extracted
    item must have ``gender`` equal to ``[u'<td >Male</td>']``.
    """
    schema = {
        'fields': {
            'gender': {
                'required': False,
                'type': 'raw',
                'vary': False,
            }
        }
    }
    descriptor = create_slybot_item_descriptor(schema)
    # Raw string: ``\s`` is an invalid escape sequence in an ordinary
    # literal. The runtime value of the pattern is unchanged.
    extractors = {
        1: {"regular_expression": r"Gender.*(<td\s*>(?:Male|Female)</td>)"}
    }
    apply_extractors(descriptor, {"gender": [1]}, extractors)
    ibl_extractor = SlybotIBLExtractor([
        (self.template, {'#default': descriptor}, '0.12.0')])
    self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                     [u'<td >Male</td>'])
def test_text_type_w_regex_and_no_groups(self):
    """Extract a ``text`` field with a regex that has no capture group.

    The pattern is the bare word ``Gender``. The first extracted item must
    have ``gender`` equal to ``[u'Gender']``.
    """
    gender_field = {'required': False, 'type': 'text', 'vary': False}
    descriptor = create_slybot_item_descriptor(
        {'fields': {'gender': gender_field}})
    groupless_regex = {"regular_expression": "Gender"}
    apply_extractors(descriptor, {"gender": [1]}, {1: groupless_regex})

    descriptors = {'#default': descriptor}
    ibl_extractor = SlybotIBLExtractor(
        [(self.template, descriptors, '0.12.0')])
    first_item = ibl_extractor.extract(self.target)[0][0]
    self.assertEqual(first_item['gender'], [u'Gender'])
def test_extract_missing_schema(self):
    """Extract with an empty descriptor mapping and expect plain text.

    No descriptors are supplied for ``sample_411``. In the item at index 1,
    ``full_name``, ``first_name`` and ``last_name`` must each equal
    ``[u'Joe Smith']``.
    """
    templates = [(sample_411, {}, '0.13.0')]
    items = SlybotIBLExtractor(templates).extract(page_411)[0]
    second_item = items[1]
    for field in ('full_name', 'first_name', 'last_name'):
        self.assertEqual(second_item[field], [u'Joe Smith'])