Example #1
    def __init__(self, template_descriptor_pairs, trace=False,
                 apply_extrarequired=True):
        self.token_dict = TokenDict()
        parsed_templates = []
        template_versions = []
        for template, descriptors, version in template_descriptor_pairs:
            parsed = parse_template(self.token_dict, template, descriptors)
            parsed_templates.append(parsed)
            template_versions.append(version)
            if _annotation_count(parsed):
                # tokenizing the page populates self.token_dict as a side effect
                parse_extraction_page(self.token_dict, template)

        for parsed in parsed_templates:
            default_schema = getattr(parsed, '_default_schema', None)
            descriptor = parsed.descriptors.get(default_schema)
            if descriptor is not None and apply_extrarequired:
                descriptor = descriptor.copy()
                parsed.descriptors[default_schema] = descriptor
                parsed.descriptors['#default'] = descriptor

        # templates with more attributes are considered first; sort the
        # versions together with their templates so each pair stays aligned
        ranked = sorted(
            zip(parsed_templates, template_versions),
            key=lambda pair: _annotation_count(pair[0]), reverse=True
        )
        self.extraction_trees = [
            self.build_extraction_tree(p, None, trace, legacy=v < '0.13.0')
            for p, v in ranked
        ]
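
One caveat in the variant above: legacy=v < '0.13.0' compares version strings lexicographically, so multi-digit components misorder ('0.9.0' < '0.13.0' is False, because '9' sorts after '1' character by character). A minimal sketch of a numeric comparison; version_lt is a hypothetical helper, not part of this codebase:

def version_lt(a, b):
    # hypothetical helper: compare dotted version strings numerically
    as_tuple = lambda v: tuple(int(part) for part in v.split('.'))
    return as_tuple(a) < as_tuple(b)

assert not ('0.9.0' < '0.13.0')       # lexicographic comparison misorders
assert version_lt('0.9.0', '0.13.0')  # numeric comparison gets it right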
Example #2
    def __init__(self, template_descriptor_pairs, trace=False,
                 apply_extrarequired=True):
        self.token_dict = TokenDict()
        parsed_templates = []
        template_versions = []
        for template, descriptors, version in template_descriptor_pairs:
            parsed = parse_template(self.token_dict, template, descriptors)
            parsed_templates.append(parsed)
            template_versions.append(version)
            if _annotation_count(parsed):
                # tokenizing the page populates self.token_dict as a side effect
                parse_extraction_page(self.token_dict, template)

        for parsed in parsed_templates:
            default_schema = getattr(parsed, '_default_schema', None)
            descriptor = parsed.descriptors.get(default_schema)
            if descriptor is not None and apply_extrarequired:
                descriptor = descriptor.copy()
                parsed.descriptors[default_schema] = descriptor
                parsed.descriptors['#default'] = descriptor

        # templates with more attributes are considered first
        parsed_templates = sorted(
            parsed_templates, key=_annotation_count, reverse=True
        )
        self.extraction_trees = [
            self.build_extraction_tree(p, None, trace)
            for p in parsed_templates
        ]
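
For context, a minimal usage sketch for either constructor variant, assuming the enclosing class is slybot's IBL extractor (here called SlybotIBLExtractor, an assumption) and reusing the simple_template / simple_descriptors fixtures built further down in this file:

# Sketch only: SlybotIBLExtractor is assumed; the triples match the
# (template, descriptors, version) unpacking at the top of __init__.
extractor = SlybotIBLExtractor(
    [(simple_template, simple_descriptors, '0.13.0')]
)
items, matched_template = extractor.extract(target1)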
Example #3
    def extract(self, html, pref_template_id=None):
        """Extract data from an html page.

        If pref_template_id is specified, the template with that id will be
        used first.
        """
        extraction_page = parse_extraction_page(self.token_dict, html)
        extraction_trees = self.extraction_trees
        if pref_template_id is not None:
            extraction_trees = sorted(
                self.extraction_trees,
                key=lambda x: x.template.id != pref_template_id)
        for extraction_tree in extraction_trees:
            template_id = extraction_tree.template.id
            extracted = extraction_tree.extract(extraction_page)
            correctly_extracted = []
            for item in extracted:
                if u'_type' in item or not hasattr(self, 'validated'):
                    correctly_extracted.append(item)
                else:
                    validated = self.validated[template_id]([item])
                    if validated:
                        correctly_extracted.append(validated)
            if len(correctly_extracted) > 0:
                return correctly_extracted, extraction_tree.template
        return None, None
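
The sort key lambda x: x.template.id != pref_template_id works because False orders before True and Python's sort is stable: the preferred tree moves to the front while every other tree keeps its relative position. A standalone illustration:

ids = ['t1', 't2', 't3', 't4']
# False (0) sorts before True (1); stability preserves the remaining order.
sorted(ids, key=lambda t: t != 't3')  # -> ['t3', 't1', 't2', 't4']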
Example #4
    def extract(self, html, pref_template_id=None):
        """Extract data from an html page.

        If pref_template_id is specified, the template with that id will be
        used first.
        """
        extraction_page = parse_extraction_page(self.token_dict, html)
        extraction_trees = self.extraction_trees
        if pref_template_id is not None:
            extraction_trees = sorted(
                self.extraction_trees,
                key=lambda x: x.template.id != pref_template_id)
        for extraction_tree in extraction_trees:
            template_id = extraction_tree.template.id
            extracted = extraction_tree.extract(extraction_page)
            correctly_extracted = []
            for item in extracted:
                if (isinstance(item, ItemProcessor) or
                        not hasattr(self, 'validated')):
                    correctly_extracted.append(item)
                else:
                    validated = self.validated[template_id]([item])
                    if validated:
                        correctly_extracted.append(validated)
            if len(correctly_extracted) > 0:
                return correctly_extracted, extraction_tree.template
        return None, None
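
From its use above, self.validated is assumed to map template ids to callables that take a one-item list and return the validated item, or something falsy on failure. A hypothetical hook with that shape, purely illustrative (the field name is made up):

def make_validator(required_fields):
    # matches how self.validated[template_id]([item]) is invoked above
    def validate(items):
        item = items[0]
        return item if all(f in item for f in required_fields) else None
    return validate

# attach to the extractor sketched earlier; the template id is the test
# fixture id assigned near the bottom of this file
extractor.validated = {u'stack_overflow_test': make_validator([u'title'])}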
Example #5
    def extract(self, html, pref_template_id=None):
        """Extract data from an html page.

        If pref_template_id is specified, the template with that id will be
        used first.
        """
        extraction_page = parse_extraction_page(self.token_dict, html)
        extraction_trees = self.extraction_trees
        if pref_template_id is not None:
            extraction_trees = sorted(
                self.extraction_trees,
                key=lambda x: x.template.id != pref_template_id)
        sel = Selector(text=html.body)
        for extraction_tree in extraction_trees:
            template_id = extraction_tree.template.id
            extracted = extraction_tree.extract(extraction_page)
            correctly_extracted = []
            for item in extracted:
                if (isinstance(item, ItemProcessor) or
                        not hasattr(self, 'validated')):
                    if hasattr(item, 'process'):
                        item = item.process(sel)
                else:
                    item = self.validated[template_id]([item])
                if item:
                    correctly_extracted.append(item)
            if len(correctly_extracted) > 0:
                return correctly_extracted, extraction_tree.template
        return None, None
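
A hedged usage sketch for this variant, reusing the extractor from the earlier sketch: html must expose a .body attribute (an HtmlPage, as in the fixtures below), since it is passed to both parse_extraction_page and Selector:

items, matched = extractor.extract(
    target2, pref_template_id=u'stack_overflow_test'
)
if items is None:
    pass  # no extraction tree produced a validated item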
simple_template = HtmlPage(url="http://www.test.com/a",
                           body=apply_annotations(annotations, html))
target1 = base_page('\n'.join(item_template(idx=i, rank=1)
                              for i in range(1, 11)))
target2 = base_page('\n'.join(item_template(idx=i, rank=i if i % 2 else '')
                              for i in range(1, 11)))
target1 = HtmlPage(url="http://www.test.com/a", body=target1)
target2 = HtmlPage(url="http://www.test.com/a", body=target2)
simple_descriptors = {k: create_slybot_item_descriptor(v)
                      for k, v in schemas.items()}
add_extractors_to_descriptors(simple_descriptors, {})


td = TokenDict()
html_page = HtmlPage(body=open_spec('stack_overflow.html').decode('utf-8'))
extraction_page = parse_extraction_page(td, html_page)
with open('%s/data/SampleProject/items.json' % PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
child_container = basic_extractors[2]
child_annotations = basic_extractors[3:]

sample_411, page_411 = open_sample_and_page('411_list.json')
xceed_spider = open_spec('xceed.json')
import json
from os.path import dirname

from slybot.extractors import add_extractors_to_descriptors
from slybot.item import create_slybot_item_descriptor
from slybot.plugins.scrapely_annotations.builder import (
    apply_annotations, _clean_annotation_data
)
from scrapely.extraction.pageobjects import TokenDict
from scrapely.htmlpage import HtmlPage, HtmlTagType
from scrapely.extraction.regionextract import BasicTypeExtractor
from scrapely.extraction.pageparsing import (
    parse_extraction_page, parse_template
)

_PATH = dirname(__file__)
td = TokenDict()
with open('%s/data/templates/stack_overflow.html' % _PATH) as f:
    html_page = HtmlPage(body=f.read().decode('utf-8'))
extraction_page = parse_extraction_page(td, html_page)
with open('%s/data/SampleProject/items.json' % _PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
child_container = basic_extractors[2]
child_annotations = basic_extractors[3:]

with open('%s/data/templates/411_list.json' % _PATH) as f:
    sample = json.load(f)