Example #1
0
 def test_site_pages(self):
     """
     Check the page parser against annotated samples taken from real pages.

     Samples are numbered file pairs sharing the prefix
     ``samples/samples_pageparsing``: ``<prefix>_<n>.html`` holds the page
     source fed to the parser and ``<prefix>_<n>.json`` holds the list of
     expected annotations. Numbering starts at 0 and the test stops at the
     first index whose JSON file does not exist.

     For each sample, every slot of every parsed annotation must equal the
     corresponding expected entry, and no expected annotation may remain
     unmatched once the parsed ones are exhausted.
     """
     SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_pageparsing")
     count = 0
     fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
     while os.path.exists(fname):
         # Context managers make sure the sample files are closed even when
         # reading or decoding raises.
         html_name = "%s_%d.html" % (SAMPLES_FILE_PREFIX, count)
         with open(html_name, "rb") as html_file:
             source = str_to_unicode(html_file.read())
         with open(fname, "rb") as json_file:
             annotations = json.loads(str_to_unicode(json_file.read()))

         template = HtmlPage(body=source)
         parser = TemplatePageParser(TokenDict())
         parser.feed(template)

         for annotation in parser.annotations:
             # Report a surplus parsed annotation as a test failure instead
             # of letting pop(0) raise a bare IndexError.
             self.assertTrue(
                 annotations,
                 "parser produced more annotations than expected in %s"
                 % fname)
             test_annotation = annotations.pop(0)
             for s in annotation.__slots__:
                 if s == "tag_attributes":
                     # Parsed pairs are converted with list() so they compare
                     # equal to the JSON-decoded lists; pairs are consumed
                     # from the front of the expected list in order.
                     # NOTE(review): expected pairs left over after this loop
                     # are not checked -- confirm whether that is intended.
                     for pair in getattr(annotation, s):
                         self.assertEqual(list(pair),
                                          test_annotation[s].pop(0))
                 else:
                     self.assertEqual(getattr(annotation, s),
                                      test_annotation[s])
         # Anything still in the list was expected but never parsed.
         self.assertEqual(annotations, [])
         count += 1
         fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
Example #2
0
 def test_site_pages(self):
     """
     Check the page parser against annotated samples taken from real pages.

     Samples are read from the gzip-compressed file
     ``samples_pageparsing.json.gz``, one JSON document per line. Each
     document provides the page source under ``"annotated"`` and the list of
     expected annotations under ``"annotations"``.

     For each sample, every slot of every parsed annotation must equal the
     corresponding expected entry, and no expected annotation may remain
     unmatched once the parsed ones are exhausted.
     """
     samples_path = os.path.join(path, "samples_pageparsing.json.gz")
     # Gzip data is binary, so the file is opened in "rb"; the context
     # managers close both the archive reader and the underlying file.
     # GzipFile reads straight from the open file, so no in-memory copy of
     # the compressed payload is needed.
     with open(samples_path, "rb") as samples_file:
         with GzipFile(fileobj=samples_file) as gz:
             samples = [json.loads(line) for line in gz.readlines()]

     for sample in samples:
         source = sample["annotated"]
         annotations = sample["annotations"]
         template = HtmlPage(body=str_to_unicode(source))
         parser = TemplatePageParser(TokenDict())
         parser.feed(template)

         for annotation in parser.annotations:
             # Report a surplus parsed annotation as a test failure instead
             # of letting pop(0) raise a bare IndexError.
             self.assertTrue(
                 annotations,
                 "parser produced more annotations than expected")
             test_annotation = annotations.pop(0)
             for s in annotation.__slots__:
                 if s == "tag_attributes":
                     # Parsed pairs are converted with list() so they compare
                     # equal to the JSON-decoded lists; pairs are consumed
                     # from the front of the expected list in order.
                     # NOTE(review): expected pairs left over after this loop
                     # are not checked -- confirm whether that is intended.
                     for pair in getattr(annotation, s):
                         self.assertEqual(list(pair),
                                          test_annotation[s].pop(0))
                 else:
                     self.assertEqual(getattr(annotation, s),
                                      test_annotation[s])
         # Anything still in the list was expected but never parsed.
         self.assertEqual(annotations, [])
 def test_site_pages(self):
     """
     Check the page parser against annotated samples taken from real pages.

     Sample files share the prefix ``samples/samples_pageparsing`` and come
     in numbered pairs: ``<prefix>_<n>.html`` is the page source fed to the
     parser, ``<prefix>_<n>.json`` is the list of expected annotations. The
     loop starts at index 0 and ends at the first index with no JSON file.

     Each parsed annotation is compared slot by slot with the next expected
     entry; after all parsed annotations are consumed, the expected list
     must be empty.
     """
     SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_pageparsing")
     count = 0
     fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
     while os.path.exists(fname):
         # Read both fixture files under "with" so their handles are
         # released deterministically instead of leaking until collection.
         with open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb") as page:
             source = str_to_unicode(page.read())
         with open(fname, "rb") as expected:
             annotations = json.loads(str_to_unicode(expected.read()))

         template = HtmlPage(body=source)
         parser = TemplatePageParser(TokenDict())
         parser.feed(template)

         for annotation in parser.annotations:
             # Fail with a readable message (rather than an IndexError from
             # pop) when the parser yields more annotations than the fixture.
             self.assertTrue(
                 annotations,
                 "parser produced more annotations than expected in %s"
                 % fname)
             test_annotation = annotations.pop(0)
             for s in annotation.__slots__:
                 if s == "tag_attributes":
                     # Each parsed pair is turned into a list to match the
                     # JSON-decoded form and checked against the next
                     # expected pair, in order.
                     # NOTE(review): surplus expected pairs are not detected
                     # here -- confirm whether that is intended.
                     for pair in getattr(annotation, s):
                         self.assertEqual(list(pair),
                                          test_annotation[s].pop(0))
                 else:
                     self.assertEqual(getattr(annotation, s),
                                      test_annotation[s])
         # Leftover entries are expected annotations the parser never found.
         self.assertEqual(annotations, [])
         count += 1
         fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)