def test_html(self): shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') p = web.HTMLParser('./tests/data/clean.html') meta = p.get_meta() self.assertEqual(meta['author'], 'jvoisin') ret = p.remove_all() self.assertTrue(ret) p = web.HTMLParser('./tests/data/clean.cleaned.html') self.assertEqual(p.get_meta(), {}) self.assertTrue(p.remove_all()) os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.cleaned.html') os.remove('./tests/data/clean.cleaned.cleaned.html') with open('./tests/data/clean.html', 'w') as f: f.write('<title><title><pouet/><meta/></title></title><test/>') p = web.HTMLParser('./tests/data/clean.html') self.assertTrue(p.remove_all()) with open('./tests/data/clean.cleaned.html', 'r') as f: self.assertEqual(f.read(), '<title></title><test/>') os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.cleaned.html') with open('./tests/data/clean.html', 'w') as f: f.write('<test><title>Some<b>metadata</b><br/></title></test>') p = web.HTMLParser('./tests/data/clean.html') self.assertTrue(p.remove_all()) with open('./tests/data/clean.cleaned.html', 'r') as f: self.assertEqual(f.read(), '<test><title></title></test>') os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.cleaned.html') with open('./tests/data/clean.html', 'w') as f: f.write('<meta><meta/><!----><!-- test--></meta>') p = web.HTMLParser('./tests/data/clean.html') self.assertTrue(p.remove_all()) with open('./tests/data/clean.cleaned.html', 'r') as f: self.assertEqual(f.read(), '') os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.cleaned.html')
def test_html(self):
    """Check how web.HTMLParser reacts to malformed HTML input.

    Covers, in order: mismatched open/close tags appended to the dirty
    fixture (constructor must raise ValueError), a meta tag with broken
    attribute quoting (must still be cleaned successfully), a lone
    closing tag (constructor must raise ValueError), and documents
    ending in an unclosed tag (both get_meta() and remove_all() must
    raise ValueError). Files created for each scenario are removed
    before the next one starts.
    """
    target = './tests/data/clean.html'
    cleaned = './tests/data/clean.cleaned.html'

    shutil.copy('./tests/data/dirty.html', target)
    with open(target, 'a') as handle:
        handle.write('<open>but not</closed>')
    with self.assertRaises(ValueError):
        web.HTMLParser(target)
    os.remove(target)

    # Yes, we're able to deal with malformed html :/
    shutil.copy('./tests/data/dirty.html', target)
    with open(target, 'a') as handle:
        handle.write('<meta name=\'this" is="weird"/>')
    parser = web.HTMLParser(target)
    self.assertTrue(parser.remove_all())
    parser = web.HTMLParser(cleaned)
    self.assertEqual(parser.get_meta(), {})
    os.remove(target)
    os.remove(cleaned)

    with open(target, 'w') as handle:
        handle.write('</meta>')
    with self.assertRaises(ValueError):
        web.HTMLParser(target)
    os.remove(target)

    # Documents whose last tag is never closed: parsing succeeds, but
    # both metadata extraction and cleaning are expected to fail.
    unclosed_documents = (
        '<meta><a>test</a><set/></meta><title></title><meta>',
        '<doctitle><br/></doctitle><br/><notclosed>',
    )
    for markup in unclosed_documents:
        with open(target, 'w') as handle:
            handle.write(markup)
        parser = web.HTMLParser(target)
        with self.assertRaises(ValueError):
            parser.get_meta()
        # A fresh parser is built before exercising remove_all().
        parser = web.HTMLParser(target)
        with self.assertRaises(ValueError):
            parser.remove_all()
        os.remove(target)