def setUp(self):
    """Build the logger, the HTML fixture and a ready-to-use extractor.

    The fixture page and the (empty) list of div ids are stored on the
    instance as ``self.html`` and ``self.div_ids`` in addition to being fed
    to the extractor, because tests such as ``test_extract_from_bad_page``
    read those attributes to build their own extractor.
    """
    self.logger = logging.getLogger("TestStructuredExtractor")

    # Adjacent string literals inside parentheses are concatenated by the
    # compiler, so no backslash line continuations are needed.
    self.html = (
        ' <html> <div id="header"><h1>hello world</h1>'
        '</div><div id="content"><p>this is important</p>'
        '<p> study computing it is fun</p></div>'
        '<div id="footer"> <h2>byes</h2></div> '
        '<div id="post"> stay <div id="sub-post">should be gone</div>'
        '</div><footer class="myfoot">at the bottom</footer></html> '
    )
    # No div ids are configured for the default extractor.
    self.div_ids = []

    self.extractor = PositionContentExtractor(div_ids=self.div_ids)
    self.extractor.process_html_page(self.html)
def test_extract_from_bad_page(self):
    """Smoke-test the extractor: process a page, then swap the div ids.

    A fresh extractor is created from ``self.div_ids``, given ``self.html``
    to process, and finally handed a new list of div ids. Nothing is
    asserted; the test passes as long as no call raises.

    NOTE(review): relies on ``self.html`` and ``self.div_ids`` being
    provided by the fixture -- confirm that ``setUp`` sets them.
    """
    extractor = PositionContentExtractor(div_ids=self.div_ids)
    self.extractor = extractor
    extractor.process_html_page(self.html)

    # TODO: pass if no errors?
    replacement_ids = ['related', 'skiplink-container']
    extractor.set_div_ids(replacement_ids)