Ejemplo n.º 1
0
 def test_site_pages(self):
     """
     Check parser output against the 'pageparsing' samples taken from real pages.

     Each sample supplies a page source and a list of expected annotations.
     The annotations produced by TemplatePageParser are compared, in order
     and slot by slot, with that list; the list must be fully consumed
     once the parser's annotations have all been checked.
     """
     for page_source, expected in iter_samples('pageparsing'):
         page = HtmlPage(body=page_source)
         page_parser = TemplatePageParser(TokenDict())
         page_parser.feed(page)
         for parsed in page_parser.annotations:
             # Expected annotations are consumed front-to-back, one per
             # parsed annotation.
             wanted = expected.pop(0)
             for slot in parsed.__slots__:
                 actual = getattr(parsed, slot)
                 if slot != "tag_attributes":
                     self.assertEqual(actual, wanted[slot])
                     continue
                 # Parsed pairs are normalised to lists before comparison.
                 # NOTE(review): entries left over in the expected
                 # tag_attributes list are not asserted on.
                 for pair in actual:
                     self.assertEqual(list(pair), wanted[slot].pop(0))
         # Every expected annotation must have been matched by the parser.
         self.assertEqual(expected, [])
Ejemplo n.º 2
0
 def test_site_pages(self):
     """
     Run the template page parser over every 'pageparsing' sample.

     For each sample the parser's annotations are matched, in order,
     against the expected annotations that come with the sample. All
     slots are compared directly except "tag_attributes", whose pairs
     are compared one by one as lists.
     """
     for html, remaining in iter_samples('pageparsing'):
         template_page = HtmlPage(body=html)
         template_parser = TemplatePageParser(TokenDict())
         template_parser.feed(template_page)
         for found in template_parser.annotations:
             reference = remaining.pop(0)
             for name in found.__slots__:
                 if name == "tag_attributes":
                     expected_pairs = reference[name]
                     for pair in getattr(found, name):
                         # Each parsed pair consumes the next expected
                         # entry from the front of the list.
                         self.assertEqual(list(pair),
                                          expected_pairs.pop(0))
                 else:
                     value = getattr(found, name)
                     self.assertEqual(value, reference[name])
         # No expected annotation may remain unmatched.
         self.assertEqual(remaining, [])
Ejemplo n.º 3
0
    def test_extraction(self):
        """
        Train a Scraper on one sample page and check extraction on another.

        The 'scraper_loadstore' samples must contain exactly two
        (html, data) pairs: the first is used for training, the second
        for scraping. The same check is repeated on a Scraper rebuilt
        through tofile()/fromfile().
        """
        encoding = 'latin1'
        samples = list(iter_samples(
            'scraper_loadstore', html_encoding=encoding))
        (train_html, train_data), (target_html, expected) = samples

        scraper = Scraper()
        train_page = HtmlPage(body=train_html, encoding=encoding)
        scraper.train_from_htmlpage(train_page, train_data)

        target_page = HtmlPage(body=target_html, encoding=encoding)
        self._assert_extracted(scraper.scrape_page(target_page), expected)

        # The scraper must give the same result after being written out
        # and loaded back.
        buf = StringIO()
        scraper.tofile(buf)
        buf.seek(0)
        restored = Scraper.fromfile(buf)
        self._assert_extracted(restored.scrape_page(target_page), expected)
Ejemplo n.º 4
0
 def test_site_samples(self):
     """Check parse_html against every 'htmlpage' sample from real cases.

     Samples are loaded with _decode_element as the object hook and each
     one is handed to _test_sample together with its zero-based index.
     """
     samples = iter_samples('htmlpage', object_hook=_decode_element)
     for index, sample in enumerate(samples):
         page_source, expected = sample
         self._test_sample(page_source, expected, index)
Ejemplo n.º 5
0
 def test_site_samples(self):
     """Exercise parse_html on the real-case 'htmlpage' samples.

     Every (source, parsed) sample is passed to _test_sample along with
     its position in the sample sequence, counted from zero.
     """
     position = 0
     for html, expected in iter_samples('htmlpage',
                                        object_hook=_decode_element):
         self._test_sample(html, expected, position)
         position += 1