def __init__(self, template_descriptor_pairs, trace=False,
             apply_extrarequired=True):
    self.token_dict = TokenDict()
    parsed_templates = []
    template_versions = []
    for template, descriptors, version in template_descriptor_pairs:
        parsed = parse_template(self.token_dict, template, descriptors)
        parsed_templates.append(parsed)
        template_versions.append(version)
        if _annotation_count(parsed):
            parse_extraction_page(self.token_dict, template)
    for parsed in parsed_templates:
        default_schema = getattr(parsed, '_default_schema', None)
        descriptor = parsed.descriptors.get(default_schema)
        if descriptor is not None and apply_extrarequired:
            descriptor = descriptor.copy()
            parsed.descriptors[default_schema] = descriptor
            parsed.descriptors['#default'] = descriptor
    # templates with more attributes are considered first
    parsed_templates = sorted(
        parsed_templates, key=_annotation_count, reverse=True
    )
    self.extraction_trees = [
        self.build_extraction_tree(p, None, trace)
        for p, v in zip(parsed_templates, template_versions)
    ]
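# A minimal construction sketch for the extractor above. HtmlPage is
# scrapely's page type; the class name IBLExtractor, the empty descriptor
# dict, and the version string are hypothetical stand-ins for whatever the
# surrounding project actually defines.
from scrapely.htmlpage import HtmlPage

template = HtmlPage(url='http://example.com/template',
                    body=u'<html><body><p>sample</p></body></html>')
extractor = IBLExtractor([(template, {}, '0.13')])  # class name assumed
print(len(extractor.extraction_trees))  # one tree per parsed template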
def parse_strings(template_html, extraction_html):
    """Create a template and an extraction page from raw strings.

    This is useful for testing purposes.
    """
    t = TokenDict()
    template_page = HtmlPage(body=template_html)
    extraction_page = HtmlPage(body=extraction_html)
    return (parse_template(t, template_page),
            parse_extraction_page(t, extraction_page))
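# A short usage sketch for parse_strings(); the data-scrapy-annotate markup
# follows scrapely's template annotation convention, and the "title" field
# is just an illustrative name.
template, page = parse_strings(
    u'<p data-scrapy-annotate=\'{"annotations": {"content": "title"}}\'>'
    u'Old title</p>',
    u'<p>New title</p>')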
def test_site_pages(self):
    """Tests from real pages. More reliable, and easier to build, for more
    complicated structures.
    """
    for source, annotations in iter_samples('pageparsing'):
        template = HtmlPage(body=source)
        parser = TemplatePageParser(TokenDict())
        parser.feed(template)
        for annotation in parser.annotations:
            test_annotation = annotations.pop(0)
            for s in annotation.__slots__:
                if s == "tag_attributes":
                    for pair in getattr(annotation, s):
                        self.assertEqual(list(pair),
                                         test_annotation[s].pop(0))
                else:
                    self.assertEqual(getattr(annotation, s),
                                     test_annotation[s])
        self.assertEqual(annotations, [])
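# A hypothetical sketch of the iter_samples() fixture contract assumed by the
# test above; the directory layout and file naming are assumptions, not the
# project's actual helper. Each sample pairs a raw page body with the expected
# annotation dicts, which the test consumes in document order via pop(0).
import glob
import json
import os


def iter_samples_sketch(prefix):
    for html_path in sorted(glob.glob('samples/%s_*.html' % prefix)):
        json_path = os.path.splitext(html_path)[0] + '.json'
        with open(html_path) as f:
            source = f.read()
        with open(json_path) as f:
            annotations = json.load(f)
        yield source, annotations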
for row in item_selectors:
    item = {k: row.xpath(xpath).extract()
            for k, xpath in extractor.selectors.items()}
    item = {k: v for k, v in item.items() if v}
    validated = validate(item, html_page)
    if not validated:
        continue
    if hasattr(validated, 'dump'):
        validated = validated.dump()
    validated['_template'] = None
    items.append(validated)
items = list(filter(bool, items))
return [i for i in items if '_type' in i]


_PATH = dirname(__file__)
td = TokenDict()
with open('%s/data/SampleProject/items.json' % _PATH) as f:
    items = json.load(f)
descriptors = {
    '#default': create_slybot_item_descriptor(items['default'], 'default')
}


class FakeContainer(BaseContainerExtractor):
    def __init__(self, schema, legacy=False):
        self.schema = schema
        self.extra_requires = []
        self.legacy = legacy
        self.modifiers = {}


schema = FakeContainer(descriptors['#default'])
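# A minimal, self-contained sketch of the selector loop above, using parsel
# directly. The markup, XPaths, field names, and the step that drops empty
# values mirror the snippet; none of it is the project's actual test data.
from parsel import Selector

selectors = {'name': './/span[@class="name"]/text()',
             'price': './/span[@class="price"]/text()'}
sel = Selector(text=u'<div class="item"><span class="name">A</span></div>'
                    u'<div class="item"><span class="price">9</span></div>')
items = []
for row in sel.xpath('//div[@class="item"]'):
    item = {k: row.xpath(xpath).extract() for k, xpath in selectors.items()}
    item = {k: v for k, v in item.items() if v}  # drop fields with no matches
    items.append(item)
# items == [{'name': [u'A']}, {'price': [u'9']}]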
def _parse_page(parser_class, pagetext):
    htmlpage = HtmlPage(None, {}, pagetext)
    parser = parser_class(TokenDict())
    parser.feed(htmlpage)
    return parser
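# A quick usage sketch for the helper above, reusing the TemplatePageParser
# exercised in the page-parsing tests; the page body is hypothetical.
parser = _parse_page(TemplatePageParser,
                     u'<html><body><p>no annotations here</p></body></html>')
print(parser.annotations)  # -> [] when the page carries no annotations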