# Ejemplo n.º 1
# 0
def _open_sample_and_page(name):
    """Load sample *name* and return (annotated page, original page)."""
    spec = _open_spec(name)
    extracts = spec['plugins']['annotations-plugin']['extracts']
    body = apply_annotations(_clean_annotation_data(extracts),
                             spec['original_body'])
    page_url = spec['url']
    annotated_page = HtmlPage(url=page_url, body=body)
    original_page = HtmlPage(url=page_url, body=spec['original_body'])
    return annotated_page, original_page
def _open_sample_and_page(name):
    """Open the named sample spec and build annotated/original HtmlPages."""
    spec = _open_spec(name)
    raw_annotations = spec['plugins']['annotations-plugin']['extracts']
    cleaned = _clean_annotation_data(raw_annotations)
    annotated_body = apply_annotations(cleaned, spec['original_body'])
    return (HtmlPage(url=spec['url'], body=annotated_body),
            HtmlPage(url=spec['url'], body=spec['original_body']))
# Ejemplo n.º 3
# 0
class ExtractorTest(TestCase):
    """Tests for slybot field extraction: regex extractors, type
    extractors, and per-annotation extractor chains."""

    # Template page: one table row annotated with a 'gender' field.
    annotated = u"""
<table>
<tr data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;gender&quot;}}">
<th class="item-key">Gender</th>
<td >Male</td></tr>
</table>"""
    # Unannotated page matching `annotated`.
    _target = u"""
<table>
<tr>
<th class="item-key">Gender</th>
<td >Male</td></tr>
</table>"""
    # Template with two annotated fields: 'name' (row) and 'gender' (span).
    annotated2 = u"""
<table>
<tr data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">
<th class="item-key">Name</th>
<td >John</td></tr>
<span data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;gender&quot;}}">Male</span>
</table>"""
    # Target for `annotated2`: the span is empty, exercising the
    # empty-string extraction path.
    _target2 = u"""
<body>
<tr>
<th class="item-key">Name</th><td>Olivia</td></tr>
<span></span>
</body>"""

    # Annotation data with per-annotation extractor chains ('extractors'
    # lists reference the extractor ids used in
    # test_per_annotation_extractors).
    annotations = _clean_annotation_data([{
        'id': 'annotation',
        'selector': 'td > a',
        'container_id': 'parent',
        'data': {
            1: {
                'attribute': 'content',
                'field': 'title',
                'required': False,
                'extractors': []
            },
            2: {
                'attribute': 'content',
                'field': 'name',
                'required': False,
                'extractors': ['3']
            },
            3: {
                'attribute': 'href',
                'field': 'url',
                'required': False,
                'extractors': ['1', '2']
            }
        }
    }, {
        'id': 'annotation',
        'selector': 'span',
        'container_id': 'parent',
        'data': {
            1: {
                'attribute': 'content',
                'field': 'price',
                'required': False,
                'extractors': ['8', '4', '5', '6']
            },
            2: {
                'attribute': 'content',
                'field': 'date',
                'required': False,
                'extractors': ['4', '7']
            }
        }
    }, {
        'id': 'parent',
        'item_container': True,
        'selector': 'body'
    }])
    # Raw page used both to build template3 (annotated) and as target3.
    target3 = u"""
    <html>
    <body>
    <tr>
        <th class="item-key">Name</th>
        <td>
            <a href="/olivia.html">Name: Olivia</a>
        </td>
    </tr><span>2016-03-17 20:25</span>
    </body></html>"""

    template = HtmlPage(url="http://www.test.com/", body=annotated)
    target = HtmlPage(url="http://www.test.com/", body=_target)
    template2 = HtmlPage(url="http://www.test.com/", body=annotated2)
    target2 = HtmlPage(url="http://www.test.com/a", body=_target2)
    template3 = HtmlPage(url="http://www.test.com/a",
                         body=apply_annotations(annotations, target3))
    target3 = HtmlPage(url="http://www.test.com/a", body=target3)

    def test_regex_extractor(self):
        """A regex with multiple groups concatenates the group matches."""
        # NOTE: raw string — "\d" in a plain literal is an invalid escape
        # (DeprecationWarning, SyntaxError in future Python versions).
        extractor = create_regex_extractor(r"(\d+).*(\.\d+)")
        extracted = extractor(u"The price of this product is <div>45</div> </div class='small'>.50</div> pounds")
        self.assertEqual(extracted, u"45.50")
        processor = TextFieldTypeProcessor()
        self.assertEqual(processor.adapt(extracted, None), u"45.50")

    def test_raw_type_w_regex(self):
        """'raw' fields keep HTML markup captured by the regex group."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'raw',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": r"Gender.*(<td\s*>(?:Male|Female)</td>)"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'<td >Male</td>'])

    def test_negative_hit_w_regex(self):
        """A regex hit that fails the field type check yields no item."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0], None)

    def test_text_type_w_regex(self):
        """'text' fields return the regex group stripped of markup."""
        schema = {
            "fields": {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])

    def test_type_extractor(self):
        """An explicit type extractor overrides the schema field type."""
        schema = {
            "fields": {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"type_extractor": "text"},
            2: {"regular_expression": "Gender\\s+(Male|Female)"}
        }
        apply_extractors(descriptor, {"gender": [1, 2]}, extractors)

        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])

    def test_default_type_extractor(self):
        """Fields absent from the schema fall back to the default type."""
        schema = {
            'fields': {}
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": "Gender\\s+(Male|Female)"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])

    def test_text_type_w_regex_and_no_groups(self):
        """A group-less regex returns the whole match."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": "Gender"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'], [u'Gender'])

    def test_extractor_w_empty_string_extraction(self):
        """An empty extraction for an optional field does not block the
        required field from being extracted."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                },
                'name': {
                    'required': True,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {
                "regular_expression": "([0-9]+)"
            }
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([
            (self.template2, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target2)[0][0]['name'], [u'Name Olivia'])

    def test_per_annotation_extractors(self):
        """Extractor chains attached per annotation are applied in order
        (regex then type extractors) for each extracted field."""
        schema = {
            'fields': {
                'url': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                },
                'name': {
                    'required': True,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        extractors = {
            '1': {
                'type_extractor': 'url'
            },
            '2': {
                'regular_expression': r'(.*)\.html'
            },
            '3': {
                'regular_expression': 'Name: (.*)'
            },
            '4': {
                'type_extractor': 'text'
            },
            '5': {
                'type_extractor': 'price'
            },
            '6': {
                'type_extractor': 'number'
            },
            '7': {
                'type_extractor': 'date'
            },
            '8': {
                'regular_expression': r'(\d+)-'
            }
        }
        descriptors = {'#default': create_slybot_item_descriptor(schema)}
        add_extractors_to_descriptors(descriptors, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template3, descriptors, '0.13.0')
        ])
        result = {'name': [u'Olivia'], 'url': [u'http://www.test.com/olivia'],
                  'title': [u'Name: Olivia'], 'price': [u'2016'],
                  'date': [datetime(2016, 3, 17, 20, 25)]}
        data = ibl_extractor.extract(self.target3)[0][0]
        del data['_template']
        self.assertEqual(data, result)
    {'id': 'repeated_parent', 'item_container': True, 'container_id': 'parent',
     'selector': 'li', 'repeated': True}])
# Item schemas used to build slybot item descriptors for the tests.
schemas = {
    '#default': {
        'name': 'default_item',
        'fields': {},
    },
    'data': {
        'name': 'data_item',
        'fields': {
            'title': {'type': 'text', 'required': False, 'vary': False},
            'url': {'type': 'url', 'required': False, 'vary': False},
            'description': {'type': 'text', 'required': False, 'vary': False},
            'rank': {'type': 'price', 'required': False, 'vary': False},
        },
    },
}

# Build the annotated template and two rendered target pages: target1 has
# a constant rank on every item, target2 has a rank only on odd items.
simple_template = HtmlPage(url="http://www.test.com/a",
                           body=apply_annotations(annotations, html))
target1 = HtmlPage(
    url="http://www.test.com/a",
    body=base_page('\n'.join(item_template(idx=i, rank=1)
                             for i in range(1, 11))))
target2 = HtmlPage(
    url="http://www.test.com/a",
    body=base_page('\n'.join(item_template(idx=i, rank=i if i % 2 else '')
                             for i in range(1, 11))))
simple_descriptors = {k: create_slybot_item_descriptor(v)
                      for k, v in schemas.items()}
add_extractors_to_descriptors(simple_descriptors, {})


# Shared fixtures: tokenize the stack_overflow sample page once for reuse.
td = TokenDict()
html_page = HtmlPage(body=open_spec('stack_overflow.html').decode('utf-8'))
extraction_page = parse_extraction_page(td, html_page)
# Ejemplo n.º 5
# 0
# Load the sample project's item definitions. (The scrape markers above
# had been inserted into the middle of this `with` block; they are kept
# here as comments so the statement parses again.)
with open('%s/data/SampleProject/items.json' % PATH) as f:
    items = json.load(f)
# Parse the annotated sample into a validated template plus an
# unvalidated copy (empty descriptor dict skips validation).
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
# Split the template's extractors by position: first an annotation outside
# any container, then the root and child containers, then the rest.
basic_extractors = BasicTypeExtractor.create(template.annotations)
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
child_container = basic_extractors[2]
child_annotations = basic_extractors[3:]

# Load the 411_list sample: build an annotated page and the original page.
with open('%s/data/templates/411_list.json' % _PATH) as f:
    sample = json.load(f)
annotations = sample['plugins']['annotations-plugin']['extracts']
annotated = apply_annotations(_clean_annotation_data(annotations),
                              sample['original_body'])
sample_411 = HtmlPage(url=sample['url'], body=annotated)
page_411 = HtmlPage(url=sample['url'],
                    body=sample['original_body'])
# Load the daft_list sample the same way.
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    sample = json.load(f)
annotations = sample['plugins']['annotations-plugin']['extracts']
annotated = apply_annotations(_clean_annotation_data(annotations),
                              sample['original_body'])
sample_daft = HtmlPage(url=sample['url'], body=annotated)
page_daft = HtmlPage(url=sample['url'],
                     body=sample['original_body'])
# Mark every annotated attribute as optional before re-annotating.
for annotation in annotations:
    for attribute in annotation.get('data', {}).values():
        attribute['required'] = False
annotated = apply_annotations(_clean_annotation_data(annotations),
# Ejemplo n.º 6
# 0
        lextractor = create_linkextractor_from_specs(specs)
        response = TextResponse(url='http://www.example.com/', body=csvfeed3)
        links = list(lextractor.links_to_follow(response))
        self.assertEqual(len(links), 2)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[1].url, 'http://www.example.com/path2')


# Minimal HTML fixture containing a single link for the extractor tests.
html = """
<a href="http://www.example.com/path">Click here</a>
"""
_PATH = dirname(__file__)
# Pre-annotate the daft_list sample so link-extractor tests can use it.
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    daft_sample = json.load(f)
    annotations = daft_sample['plugins']['annotations-plugin']['extracts']
    daft_body = apply_annotations(_clean_annotation_data(annotations),
                                  daft_sample['original_body'])
    daft_sample['annotated_body'] = daft_body


class Test_HtmlLinkExtractor(TestCase):
    """Checks that the HTML link extractor finds anchors in a page."""

    def test_simple(self):
        extractor = create_linkextractor_from_specs(
            {"type": "html", "value": None})
        page = HtmlResponse(url='http://www.example.com/', body=html)
        followed = [link for link in extractor.links_to_follow(page)]
        self.assertEqual(len(followed), 1)
        first = followed[0]
        self.assertEqual(first.url, 'http://www.example.com/path')
        self.assertEqual(first.text, 'Click here')


class Test_PaginationExtractor(TestCase):
# Ejemplo n.º 7
# 0
        specs = {"type": "column", "value": 1}
        lextractor = create_linkextractor_from_specs(specs)
        response = TextResponse(url='http://www.example.com/', body=csvfeed3)
        links = list(lextractor.links_to_follow(response))
        self.assertEqual(len(links), 2)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[1].url, 'http://www.example.com/path2')

# Single-anchor HTML fixture used by the link-extractor tests below.
html = """
<a href="http://www.example.com/path">Click here</a>
"""
_PATH = dirname(__file__)
# Annotate the daft_list sample body and stash it on the sample dict.
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    daft_sample = json.load(f)
    annotations = daft_sample['plugins']['annotations-plugin']['extracts']
    daft_body = apply_annotations(_clean_annotation_data(annotations),
                                  daft_sample['original_body'])
    daft_sample['annotated_body'] = daft_body


class Test_HtmlLinkExtractor(TestCase):
    """Verifies link discovery on a minimal single-anchor HTML page."""

    def test_simple(self):
        link_extractor = create_linkextractor_from_specs(
            {"type": "html", "value": None})
        response = HtmlResponse(url='http://www.example.com/', body=html)
        links = list(link_extractor.links_to_follow(response))
        self.assertEqual(len(links), 1)
        self.assertEqual(
            (links[0].url, links[0].text),
            ('http://www.example.com/path', 'Click here'))


class Test_PaginationExtractor(TestCase):
    {'id': 'repeated_parent', 'item_container': True, 'container_id': 'parent',
     'selector': 'li', 'repeated': True}])
# Item schemas: an empty default item plus a four-field 'data' item.
schemas = {
    '#default': {'fields': {}, 'name': 'default_item'},
    'data': {
        'name': 'data_item',
        'fields': dict(
            title={'required': False, 'vary': False, 'type': 'text'},
            url={'required': False, 'vary': False, 'type': 'url'},
            description={'required': False, 'vary': False, 'type': 'text'},
            rank={'required': False, 'vary': False, 'type': 'price'},
        ),
    },
}

# Annotated template plus two rendered targets: every item in target1 has
# rank 1; in target2 only odd-indexed items carry a rank.
simple_template = HtmlPage(url="http://www.test.com/a",
                           body=apply_annotations(annotations, html))
_target1_body = base_page('\n'.join(item_template(idx=i, rank=1)
                                    for i in range(1, 11)))
_target2_body = base_page('\n'.join(
    item_template(idx=i, rank=i if i % 2 else '') for i in range(1, 11)))
target1 = HtmlPage(url="http://www.test.com/a", body=_target1_body)
target2 = HtmlPage(url="http://www.test.com/a", body=_target2_body)
simple_descriptors = {k: create_slybot_item_descriptor(v)
                      for k, v in schemas.items()}
add_extractors_to_descriptors(simple_descriptors, {})
_PATH = dirname(__file__)


def _open_spec(name):
    use_json = True if name.endswith('.json') else False
    with open('%s/data/templates/%s' % (_PATH, name)) as f: