def test_get_node_text(self):
        # Each pair is (markup handed to lxml.html.fromstring, text that
        # DomTreeHelper.get_node_text is expected to return for the parsed
        # node). The samples cover nested tags, text surrounding the
        # outermost tag, and bare text without any tag.
        samples = (
            ("<a>hello <b>HELLO <s>xyz</s></b> world</a>",
             "hello HELLO xyz world"),
            ("head <a>hello <b>HELLO <s>xyz</s></b> world</a> tail",
             "head hello HELLO xyz world tail"),
            ("abc", "abc"),
        )
        for markup, expected in samples:
            node = lxml.html.fromstring(markup)
            self.assertEqual(DomTreeHelper.get_node_text(node), expected)
    def test_get_anchor_text(self):
        # Each pair is (anchor markup, text DomTreeHelper.get_anchor_text is
        # expected to return). Per the expectations: nested text is included,
        # an anchor carrying only a title attribute yields that title, and an
        # empty anchor yields the empty string.
        samples = (
            ("<a>hello world <b>xxx</b></a>", "hello world xxx"),
            ("<a title='hello'/>", "hello"),
            ("<a/>", ""),
        )
        for markup, expected in samples:
            anchor = lxml.html.fromstring(markup)
            self.assertEqual(DomTreeHelper.get_anchor_text(anchor), expected)
    def test_get_node_html(self):
        # Each pair is (markup handed to lxml.html.fromstring, serialized HTML
        # DomTreeHelper.get_node_html is expected to return). Per the
        # expectations, input that is not a single element comes back wrapped
        # in a <p> element.
        samples = (
            ("<a>hello <b>HELLO <s>xyz</s></b> world</a>",
             "<a>hello <b>HELLO <s>xyz</s></b> world</a>"),
            ("head <a>hello <b>HELLO <s>xyz</s></b> world</a> tail",
             "<p>head <a>hello <b>HELLO <s>xyz</s></b> world</a> tail</p>"),
            ("abc", "<p>abc</p>"),
        )
        for markup, expected in samples:
            node = lxml.html.fromstring(markup)
            self.assertEqual(DomTreeHelper.get_node_html(node), expected)
 def test_get_tag_count(self):
     # Counts the "a" tags of a loaded page and prints how long the count
     # took.
     # NOTE(review): presumably load_doc fetches the live page, so this test
     # needs network access and the 500-tag threshold depends on the site's
     # current markup -- confirm.
     # NOTE(review): the meaning of load_doc's second argument is not visible
     # from here -- confirm against misc.load_doc.
     dom = misc.load_doc("http://www.taobao.com", False)
     started = datetime.datetime.now()
     count = DomTreeHelper.get_tag_count(dom, "a")
     # Stop the clock before asserting so the printed duration covers only
     # the get_tag_count call and is reported even when the assertion fails.
     elapsed = datetime.datetime.now() - started
     # Parenthesised single-argument form is valid on Python 3 and prints the
     # same text on Python 2.
     print(elapsed)
     self.assertTrue(count > 500,
                     "expected more than 500 'a' tags, got %r" % (count,))
 def test_get_leaf_count(self):
     # Checks that DomTreeHelper.get_leaf_count reports more than 500 for the
     # loaded page.
     # NOTE(review): presumably load_doc fetches the live page, so this test
     # needs network access and the threshold depends on the site's current
     # markup -- confirm.
     # NOTE(review): the meaning of load_doc's second argument is not visible
     # from here -- confirm against misc.load_doc.
     dom = misc.load_doc("http://www.taobao.com", True)
     leaf_count = DomTreeHelper.get_leaf_count(dom)
     # Report the actual count on failure instead of a bare "False is not
     # true".
     self.assertTrue(leaf_count > 500,
                     "expected more than 500 leaves, got %r" % (leaf_count,))