Example #1
    def __init__(self, template_descriptor_pairs, trace=False,
                 apply_extrarequired=True):
        self.token_dict = TokenDict()
        parsed_templates = []
        template_versions = []
        for template, descriptors, version in template_descriptor_pairs:
            parsed = parse_template(self.token_dict, template, descriptors)
            parsed_templates.append(parsed)
            template_versions.append(version)
            if _annotation_count(parsed):
                # tokenizing the page registers its tags in the shared
                # token dict; the return value itself is not needed here
                parse_extraction_page(self.token_dict, template)

        for parsed in parsed_templates:
            default_schema = getattr(parsed, '_default_schema', None)
            descriptor = parsed.descriptors.get(default_schema)
            if descriptor is not None and apply_extrarequired:
                # copy so the shared descriptor is not mutated, then attach
                # the template's extra required attributes
                descriptor = descriptor.copy()
                descriptor.extra_requires = getattr(
                    parsed, 'extra_required_attrs', [])
                parsed.descriptors[default_schema] = descriptor
                parsed.descriptors['#default'] = descriptor

        # templates with more annotations are considered first; sort the
        # (template, version) pairs together so versions stay aligned
        sorted_pairs = sorted(
            zip(parsed_templates, template_versions),
            key=lambda pair: _annotation_count(pair[0]), reverse=True
        )
        self.extraction_trees = [
            self.build_extraction_tree(parsed, None, trace)
            for parsed, _version in sorted_pairs
        ]
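
The class statement is omitted above; this constructor appears to belong to slybot's SlybotIBLExtractor. A minimal call sketch, assuming an annotated HtmlPage template and a prebuilt item descriptor (the names and the version string are illustrative):

# Hypothetical call site: one (template, descriptors, version) triple.
pairs = [(template_page, {'#default': item_descriptor}, '0.13.0')]
extractor = SlybotIBLExtractor(pairs, trace=False)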
Example #2
def parse_strings(template_html, extraction_html):
    """Create a template and extraction page from raw strings

    this is useful for testing purposes
    """
    t = TokenDict()
    template_page = HtmlPage(body=template_html)
    extraction_page = HtmlPage(body=extraction_html)
    return (parse_template(t, template_page),
            parse_extraction_page(t, extraction_page))
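
Because parse_strings works on raw strings, a quick smoke test needs no fixtures. A sketch, assuming scrapely's data-scrapy-annotate markup (the field name and markup values are illustrative):

# Illustrative only: annotate one element in the template, then parse both.
template_html = (u'<p data-scrapy-annotate='
                 u'\'{"annotations": {"content": "name"}}\'>Olivia</p>')
extraction_html = u'<p>Emma</p>'
parsed_template, parsed_page = parse_strings(template_html, extraction_html)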
Example #3
    def test_site_pages(self):
        """
        Tests built from real pages: more reliable, and easier to write
        for complicated structures.
        """
        for source, annotations in iter_samples('pageparsing'):
            template = HtmlPage(body=source)
            parser = TemplatePageParser(TokenDict())
            parser.feed(template)
            for annotation in parser.annotations:
                test_annotation = annotations.pop(0)
                for s in annotation.__slots__:
                    if s == "tag_attributes":
                        for pair in getattr(annotation, s):
                            self.assertEqual(list(pair), test_annotation[s].pop(0))
                    else:
                        self.assertEqual(getattr(annotation, s), test_annotation[s])
            self.assertEqual(annotations, [])
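
The 'pageparsing' fixtures consumed by iter_samples are not shown here. Judging from the loop above, each sample pairs a saved page source with a JSON list of expected annotations keyed by the Annotation slot names; the shape below is assumed and the values are made up:

# Assumed shape of one expected annotation in the fixture JSON
# (keys mirror the parser's Annotation __slots__; values are illustrative):
expected_annotation = {
    "surrounds_attribute": "name",         # field filled from the tag's content
    "tag_attributes": [["src", "image"]],  # (attribute, field) pairs
    "start_index": 1,
    "end_index": 2,
}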
Example #4
    for row in item_selectors:
        # extract every configured field for this row, then drop empty results
        item = {k: row.xpath(xpath).extract()
                for k, xpath in extractor.selectors.items()}
        item = {k: v for k, v in item.items() if v}
        validated = validate(item, html_page)
        if not validated:
            continue
        if hasattr(validated, 'dump'):
            validated = validated.dump()
        validated['_template'] = None
        items.append(validated)
    items = list(filter(bool, items))
    # keep only items that were assigned a type during validation
    return [i for i in items if '_type' in i]

import json
from os.path import dirname

_PATH = dirname(__file__)
td = TokenDict()
with open('%s/data/SampleProject/items.json' % _PATH) as f:
    items = json.load(f)
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}


class FakeContainer(BaseContainerExtractor):
    def __init__(self, schema, legacy=False):
        self.schema = schema
        self.extra_requires = []
        self.legacy = legacy
        self.modifiers = {}


schema = FakeContainer(descriptors['#default'])
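
The selector loop at the top of this example treats extractor.selectors as a field-name-to-XPath mapping. A hypothetical value, with made-up field names and paths:

# Hypothetical mapping consumed by `row.xpath(xpath).extract()` above.
selectors = {
    'title': './/h1/text()',
    'price': './/*[@class="price"]/text()',
}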
Example #5
def _parse_page(parser_class, pagetext):
    # positional args: url=None, headers={}, body=pagetext
    htmlpage = HtmlPage(None, {}, pagetext)
    parser = parser_class(TokenDict())
    parser.feed(htmlpage)
    return parser
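
A call site for this helper might look like the following, reusing TemplatePageParser from example 3 (the HTML body is illustrative):

# Sketch: parse an in-memory page and inspect what the parser collected.
parser = _parse_page(TemplatePageParser, u'<html><body><p>hi</p></body></html>')
print(parser.annotations)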