def test_dom_simi():
    """Print the similarity computed for the two module-level sample pages.

    Parses ``_html1`` and ``_html2`` with ``minidom.parse_xml_to_document``
    and prints whatever ``domsimi.compute_simi`` returns for the two
    ``documentElement`` roots. Nothing is returned or asserted.
    """
    first_doc = minidom.parse_xml_to_document(_html1)
    second_doc = minidom.parse_xml_to_document(_html2)

    first_root = first_doc.documentElement
    second_root = second_doc.documentElement

    # The original also carried disabled calls to domsimi.stm(...) and
    # domsimi.nstm(...) on the same two roots; only compute_simi is active.
    # A single parenthesised expression prints identically under the
    # Python 2 print statement.
    print(domsimi.compute_simi(first_root, second_root))
def do_template_extract(url, template_id):
    """Run a stored template against the page at *url* and print the result.

    Args:
        url: Address handed to ``template_extractor.load_page``.
        template_id: Identifier handed to ``template_extractor.load_template``.

    The function returns nothing; every item yielded by
    ``template_extractor.extract`` is printed on its own line between a
    header and a dashed footer.
    """
    # Fetch the page and turn it into a document.
    raw_page = template_extractor.load_page(url)
    document = minidom.parse_xml_to_document(minidom.formalize_html(raw_page))

    # Fetch the template only after the page has been parsed, as before.
    chosen_template = template_extractor.load_template(template_id)

    # Apply the template to the parsed document.
    field_results = template_extractor.extract(document, chosen_template)

    # Report. Single-argument print(...) matches the Python 2 print
    # statement output exactly.
    print("page url:%s" % url)
    print("extract result:-------")
    for field in field_results:
        print(field)
    # NOTE(review): the collapsed source does not show whether this footer
    # was inside the loop; it is placed after the loop here -- confirm.
    print("----------------------")
def extract(url):
    """Print the list item infos found on the page at *url*.

    Args:
        url: Address handed to ``load_page``.

    The page is loaded, passed through ``minidom.formalize_html`` and
    ``minidom.parse_xml_to_document``, then narrowed step by step with
    ``get_list_candidate_nodes``, ``get_list_node`` and
    ``get_list_item_info``. Results are printed; nothing is returned.
    """
    # Fetch the page and turn it into a document.
    raw_page = load_page(url)
    document = minidom.parse_xml_to_document(minidom.formalize_html(raw_page))

    # Candidate list nodes -> the single node picked by get_list_node
    # -> the item infos taken from that node.
    candidates = get_list_candidate_nodes(document)
    chosen_list = get_list_node(candidates)
    item_infos = get_list_item_info(chosen_list)

    # Report. Single-argument print(...) matches the Python 2 print
    # statement output exactly.
    print("page url : %s" % url)
    print("extract list item infos : ")
    for item in item_infos:
        print(item)
    # NOTE(review): the collapsed source does not show whether this footer
    # was inside the loop; it is placed after the loop here -- confirm.
    print("-------------")