def test_render_html(self):
    # Checks the three call forms of render_html against the same document:
    # the default call is compared with the UTF-8 encoded markup,
    # make_unicode=True with the original text, and encoding='cp1251' with
    # the markup encoded as cp1251.
    markup = u'<html><body><p>фыва</p></body></html>'
    expected_default = markup.encode('utf-8')
    doc = fromstring(markup)

    self.assertEqual(expected_default, render_html(doc))
    self.assertEqual(markup, render_html(doc, make_unicode=True))

    expected_cp1251 = markup.encode('cp1251')
    self.assertEqual(expected_cp1251, render_html(doc, encoding='cp1251'))
def test_replace_node_with_text(self):
    # Exercises replace_node_with_text on four layouts. Expected values are
    # bytes literals because render_html is compared against encoded bytes
    # when called without an encoding; a plain str literal would never
    # compare equal to bytes on Python 3. assertEqual (expected first) is
    # used so a failure shows both values.

    # replace span
    HTML = """ <div><p><span>span</span><a href="#">link</a></p></div> """
    tree = fromstring(HTML)
    replace_node_with_text(tree, './/span', 'FOO')
    self.assertEqual(
        b'<div><p>FOO<a href="#">link</a></p></div>',
        render_html(tree))

    # replace span and keep its tail
    HTML = """ <div><p><span>span</span>BAR<a href="#">link</a></p></div> """
    tree = fromstring(HTML)
    replace_node_with_text(tree, './/span', 'FOO')
    self.assertEqual(
        b'<div><p>FOOBAR<a href="#">link</a></p></div>',
        render_html(tree))

    # replace p which is the only child of the parent div
    HTML = """ <div><p><span>span</span>BAR<a href="#">link</a></p></div> """
    tree = fromstring(HTML)
    replace_node_with_text(tree, './/p', 'FOO')
    self.assertEqual(b'<div>FOO</div>', render_html(tree))

    # replace span and keep the tail of its preceding sibling element
    HTML = """ <div><p><strong>str</strong>!<span>span</span>BAR<a href="#">link</a></p></div> """
    tree = fromstring(HTML)
    replace_node_with_text(tree, './/span', 'FOO')
    self.assertEqual(
        b'<div><p><strong>str</strong>!FOOBAR<a href="#">link</a></p></div>',
        render_html(tree))
def test_replace_node_with_text(self):
    # Exercises replace_node_with_text on four layouts. Every case parses
    # the markup, replaces the node selected by the XPath with the text
    # 'FOO', and compares the rendered bytes with the expected output.

    def check(markup, xpath, expected):
        # assertEqual (expected first) reports both values on failure,
        # unlike assertTrue(a == b); the traceback still points at the
        # calling line below, so the failing case stays identifiable.
        tree = fromstring(markup)
        replace_node_with_text(tree, xpath, 'FOO')
        self.assertEqual(expected, render_html(tree))

    # replace span
    check(""" <div><p><span>span</span><a href="#">link</a></p></div>""",
          './/span',
          b'<div><p>FOO<a href="#">link</a></p></div>')

    # replace span and keep its tail
    check(""" <div><p><span>span</span>BAR<a href="#">link</a></p></div>""",
          './/span',
          b'<div><p>FOOBAR<a href="#">link</a></p></div>')

    # replace p which is the only child of the parent div
    check(""" <div><p><span>span</span>BAR<a href="#">link</a></p></div>""",
          './/p',
          b'<div>FOO</div>')

    # replace span and keep the tail of its preceding sibling element
    check(""" <div><p><strong>str</strong>!<span>span</span>BAR<a href="#">link</a></p></div>""",
          './/span',
          b'<div><p><strong>str</strong>!FOOBAR<a href="#">link</a></p></div>')
def test_drop_node(self):
    # Exercises drop_node with and without keep_content. assertEqual
    # (expected first) is used instead of assertTrue(a == b) so a failure
    # shows both the expected and the rendered bytes.
    HTML = """ <div><p>text<span>span</span><a href="#">link</a></p>tail</div>"""

    # Default call: the <p> element goes away together with its text and
    # children, while the text following it ("tail") stays in the div.
    tree = fromstring(HTML)
    drop_node(tree, './/p')
    self.assertEqual(b'<div>tail</div>', render_html(tree))

    # keep_content=True: only the <span> tags disappear; the text they
    # wrapped is merged into the surrounding paragraph text.
    tree = fromstring(HTML)
    drop_node(tree, './/span', keep_content=True)
    self.assertEqual(
        b'<div><p>textspan<a href="#">link</a></p>tail</div>',
        render_html(tree))
def test_drop_node(self):
    # Exercises drop_node with and without keep_content. Expected values
    # are bytes literals because render_html is compared against encoded
    # bytes when called without an encoding; a plain str literal would
    # never compare equal to bytes on Python 3. assertEqual (expected
    # first) is used so a failure shows both values.
    HTML = """ <div><p>text<span>span</span><a href="#">link</a></p>tail</div> """

    # Default call: the <p> element goes away together with its text and
    # children, while the text following it ("tail") stays in the div.
    tree = fromstring(HTML)
    drop_node(tree, './/p')
    self.assertEqual(b'<div>tail</div>', render_html(tree))

    # keep_content=True: only the <span> tags disappear; the text they
    # wrapped is merged into the surrounding paragraph text.
    tree = fromstring(HTML)
    drop_node(tree, './/span', keep_content=True)
    self.assertEqual(
        b'<div><p>textspan<a href="#">link</a></p>tail</div>',
        render_html(tree))
def parse_project_description(self, root):
    """Return the project description found under ``root`` as plain text.

    Steps, in order:

    1. Append a newline to the tail of every element matched by ``//br``
       (this mutates the tree in place), presumably so that line breaks
       survive once the tags are removed.
    2. Render ``root`` with ``encoding="unicode"``, pass the result through
       ``decode_entities`` and then ``strip_tags`` with
       ``normalize_space=False``.
    3. Keep only the text before the first "Posted On" marker, then only
       the text before the first "Budget :" marker, stripping surrounding
       whitespace after each cut.
    """
    # NOTE(review): "//br" is an absolute XPath, so it is not limited to the
    # subtree of ``root`` -- confirm whether ".//br" was intended.
    for br in root.xpath("//br"):
        br.tail = (br.tail + "\n") if br.tail else "\n"

    rendered = render_html(root, encoding="unicode")
    decoded = decode_entities(rendered)
    text = strip_tags(decoded, normalize_space=False)

    # Everything from a marker onwards is discarded.
    for marker in (u"Posted On", u"Budget :"):
        text = text.partition(marker)[0].strip()
    return text
def parse_project_description(self, root):
    """Return the project description found under ``root`` as plain text.

    A newline is appended to the tail of every element matched by ``//br``
    (the tree is mutated in place), presumably so that line breaks survive
    once the tags are removed. ``root`` is then rendered with
    ``encoding='unicode'``, run through ``decode_entities`` and
    ``strip_tags`` (with ``normalize_space=False``), and cut off at the
    first "Category:" marker. The result is stripped of surrounding
    whitespace.
    """
    # NOTE(review): '//br' is an absolute XPath, so it is not limited to the
    # subtree of ``root`` -- confirm whether './/br' was intended.
    for line_break in root.xpath('//br'):
        existing_tail = line_break.tail or ''
        line_break.tail = existing_tail + '\n'

    markup = render_html(root, encoding='unicode')
    plain = strip_tags(decode_entities(markup), normalize_space=False)

    # Only the part before the first marker is needed, so split once.
    before_category = plain.split(u'Category:', 1)[0]
    return before_category.strip()
def html(self, encoding='unicode'):
    """Render ``self.node`` through ``render_html`` and return the result.

    ``encoding`` is forwarded unchanged to ``render_html``. The default
    ``'unicode'`` presumably yields a text string rather than encoded
    bytes -- confirm against ``render_html``.
    """
    rendered = render_html(self.node, encoding=encoding)
    return rendered