def test_dom_simi():
    '''
    Print the similarity of the two sample documents.

    Parses the fixtures ``_html1`` and ``_html2`` (defined outside this
    function) with ``minidom.parse_xml_to_document`` and prints whatever
    ``domsimi.compute_simi`` returns for their root elements.
    '''
    doc1 = minidom.parse_xml_to_document(_html1)
    doc2 = minidom.parse_xml_to_document(_html2)
    # NOTE: domsimi.stm and domsimi.nstm were previously tried on the same
    # pair of root elements; those calls are intentionally left disabled.
    # Parenthesised single-expression print behaves the same on Python 2 and 3.
    print(domsimi.compute_simi(doc1.documentElement, doc2.documentElement))
def get_list_candidate_nodes(doc, simi_threshold=0.8, min_items=5):
    '''
    Find all the list candidate nodes in the web page DOM tree.

    Every node yielded by ``minidom.postorder_dfs_walk_iterator`` starting at
    ``doc.documentElement`` is inspected. Its element children are scanned in
    iteration order: the first child is always kept, and each later child is
    kept only when ``domsimi.compute_simi(last_kept_child, child)`` is
    strictly greater than ``simi_threshold``. A node with at least
    ``min_items`` kept children is reported as a list candidate.

    Args:
        doc: parsed document; only its ``documentElement`` attribute is read.
        simi_threshold: similarity score a child must exceed (strictly) to be
            kept as a list item. Defaults to 0.8.
        min_items: minimum number of kept children for a node to count as a
            list candidate. Defaults to 5.

    Returns:
        A list of dicts, one per candidate, in walk order. Each dict has the
        keys ``"list"`` (the parent node) and ``"items"`` (the list of kept
        child nodes).
    '''
    list_candidate_nodes = []
    # Walk the whole DOM tree; any node may turn out to be a list container.
    for node in minidom.postorder_dfs_walk_iterator(doc.documentElement):
        item_nodes = []
        for child in minidom.element_child_iterator(node):
            # The first child seeds the run. Later children are compared with
            # the last *kept* child (not the immediately preceding sibling),
            # so a dissimilar child is skipped without ending the run.
            if (not item_nodes
                    or domsimi.compute_simi(item_nodes[-1], child) > simi_threshold):
                item_nodes.append(child)
        # Only nodes with enough similar children qualify as list candidates.
        if len(item_nodes) >= min_items:
            list_candidate_nodes.append({"list": node, "items": item_nodes})
    return list_candidate_nodes