def test_site_pages(self):
    """Check template annotation parsing against samples from real pages.

    Real pages are more reliable and easier to build than hand-written
    fixtures for complicated structures.

    Sample pairs ``samples_pageparsing_<n>.html`` / ``samples_pageparsing_<n>.json``
    are processed for n = 0, 1, 2, ... until the ``.json`` file for the next
    index does not exist. For every annotation produced by the parser, each
    attribute named in its ``__slots__`` is compared with the corresponding
    recorded entry; the recorded list must be fully consumed afterwards.
    """
    SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_pageparsing")
    count = 0
    fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
    while os.path.exists(fname):
        # Context managers close each sample file as soon as it has been
        # read, instead of leaking one handle per file per iteration.
        with open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb") as html_file:
            source = html_file.read().decode('utf-8')
        with open(fname, "rb") as json_file:
            annotations = json.loads(json_file.read().decode('utf-8'))

        template = HtmlPage(body=source)
        parser = TemplatePageParser(TokenDict())
        parser.feed(template)

        for annotation in parser.annotations:
            # Expected entries are consumed in order, one per parsed annotation.
            test_annotation = annotations.pop(0)
            for s in annotation.__slots__:
                if s == "tag_attributes":
                    # Compared pairwise: each parsed pair is converted to a
                    # list before being checked against the recorded entry.
                    for pair in getattr(annotation, s):
                        self.assertEqual(list(pair), test_annotation[s].pop(0))
                else:
                    self.assertEqual(getattr(annotation, s), test_annotation[s])

        # Every recorded annotation must have been matched by a parsed one.
        self.assertEqual(annotations, [])
        count += 1
        fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
def annotations(self):
    """Return all annotations contained in the template.

    Scans ``self.htmlpage.parsed_body`` for opening tags that carry a
    non-empty ``data-scrapy-annotate`` attribute and decodes that attribute
    as JSON.

    Returns:
        A list of ``(annotation, index)`` tuples, where ``annotation`` is the
        decoded JSON value and ``index`` is the position of the tag within
        ``parsed_body``.

    Raises:
        ValueError: if an annotation attribute is not valid JSON after
            unescaping (raised by ``json.loads``).
    """
    anlist = []
    for i, f in enumerate(self.htmlpage.parsed_body):
        if isinstance(f, HtmlTag) and f.tag_type == HtmlTagType.OPEN_TAG:
            at = f.attributes.get("data-scrapy-annotate")
            if at:
                # The JSON may be stored with its double quotes written as
                # the HTML entity &quot;; turn them back into real quotes
                # before decoding.
                an = json.loads(at.replace("&quot;", '"'))
                anlist.append((an, i))
    return anlist
def test_site_samples(self):
    """Test parse_html against samples taken from real cases.

    Sample pairs ``<SAMPLES_FILE_PREFIX>_<n>.html`` / ``.json`` are processed
    for n = 0, 1, 2, ... until the ``.json`` file for the next index does not
    exist. The JSON file is decoded with ``_decode_element`` as the
    ``object_hook`` and handed, together with the HTML source and the sample
    index, to ``self._test_sample``.
    """
    count = 0
    fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
    while os.path.exists(fname):
        # Context managers close each sample file as soon as it has been
        # read, instead of leaking one handle per file per iteration.
        with open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb") as html_file:
            source = html_file.read().decode('utf-8')
        with open(fname, "rb") as json_file:
            parsed = json.loads(json_file.read().decode('utf-8'),
                                object_hook=_decode_element)
        self._test_sample(source, parsed, count)
        count += 1
        fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
def annotations(self):
    """Return all annotations contained in the template.

    Scans ``self.htmlpage.parsed_body`` for opening tags that carry a
    non-empty ``data-scrapy-annotate`` attribute and decodes that attribute
    as JSON.

    Returns:
        A list of ``(annotation, index)`` tuples, where ``annotation`` is the
        decoded JSON value and ``index`` is the position of the tag within
        ``parsed_body``.

    Raises:
        ValueError: if an annotation attribute is not valid JSON after
            unescaping (raised by ``json.loads``).
    """
    anlist = []
    for i, f in enumerate(self.htmlpage.parsed_body):
        if isinstance(f, HtmlTag) and f.tag_type == HtmlTagType.OPEN_TAG:
            at = f.attributes.get('data-scrapy-annotate')
            if at:
                # Replacing '"' with itself did nothing; the text to replace
                # is the HTML entity &quot;, which must become a real double
                # quote before the value can be decoded as JSON.
                an = json.loads(at.replace('&quot;', '"'))
                anlist.append((an, i))
    return anlist
def _read_template_annotation(html_tag):
    """Decode the JSON annotation stored on a template tag.

    Args:
        html_tag: object whose ``attributes`` mapping may contain a
            ``data-scrapy-annotate`` entry.

    Returns:
        The decoded JSON value, or ``None`` when the tag has no
        ``data-scrapy-annotate`` attribute.

    Raises:
        ValueError: if the attribute is present but is not valid JSON after
            unescaping (raised by ``json.loads``); this includes an empty
            attribute value.
    """
    template_attr = html_tag.attributes.get("data-scrapy-annotate")
    if template_attr is None:
        return None
    # The JSON may be stored with its double quotes written as the HTML
    # entity &quot;; turn them back into real quotes before decoding.
    unescaped = template_attr.replace("&quot;", '"')
    return json.loads(unescaped)
def _read_template_annotation(html_tag):
    """Decode the JSON annotation stored on a template tag.

    Args:
        html_tag: object whose ``attributes`` mapping may contain a
            ``data-scrapy-annotate`` entry.

    Returns:
        The decoded JSON value, or ``None`` when the tag has no
        ``data-scrapy-annotate`` attribute.

    Raises:
        ValueError: if the attribute is present but is not valid JSON after
            unescaping (raised by ``json.loads``); this includes an empty
            attribute value.
    """
    template_attr = html_tag.attributes.get('data-scrapy-annotate')
    if template_attr is None:
        return None
    # Replacing '"' with itself did nothing; the text to replace is the HTML
    # entity &quot;, which must become a real double quote before the value
    # can be decoded as JSON.
    unescaped = template_attr.replace('&quot;', '"')
    return json.loads(unescaped)