def stm(root1, root2):
    """Simple tree matching score between the trees rooted at two nodes.

    Returns 0 when ``minidom.is_same(root1, root2)`` is false. Otherwise
    returns 1 (for the matched roots) plus the best score obtainable by
    matching the first-level sub-trees of ``root1`` against those of
    ``root2`` in order, where each pair of sub-trees is scored by a
    recursive call to ``stm``.

    root1, root2 -- DOM nodes accepted by the ``minidom`` helper module.
    """
    if not minidom.is_same(root1, root2):
        return 0

    # k / n: number of first-level sub-trees of root1 / root2.
    k = minidom.element_child_count(root1)
    n = minidom.element_child_count(root2)

    # m[i][j] holds the best score using the first i sub-trees of root1 and
    # the first j sub-trees of root2; row 0 and column 0 stay at 0.
    m = [[0] * (n + 1) for _ in range(k + 1)]

    # The children of root2 are the same for every outer iteration, so they
    # are collected once instead of re-creating the iterator each time.
    # NOTE(review): assumes the minidom helpers do not mutate the tree.
    subtrees_b = list(minidom.element_child_iterator(root2))

    for i, subtree_a in enumerate(minidom.element_child_iterator(root1), 1):
        for j, subtree_b in enumerate(subtrees_b, 1):
            # Either skip one sub-tree on one side, or pair the two current
            # sub-trees and add their own matching score.
            m[i][j] = max(
                m[i][j - 1],
                m[i - 1][j],
                m[i - 1][j - 1] + stm(subtree_a, subtree_b),
            )

    # +1 accounts for the matched pair of roots.
    return m[k][n] + 1
def get_list_candidate_nodes(doc, simi_threshold=0.8, min_items=5):
    """Find all list candidate nodes in the web page DOM tree.

    Every node yielded by ``minidom.postorder_dfs_walk_iterator`` starting
    at ``doc.documentElement`` is examined. Its element children are
    scanned in order: the first child is always kept, and each later child
    is kept only if ``domsimi.compute_simi`` between the most recently kept
    child and this child is strictly greater than ``simi_threshold``. A
    node with at least ``min_items`` kept children is reported.

    doc            -- document object exposing ``documentElement``.
    simi_threshold -- similarity a child must exceed to count as a list
                      item (default 0.8).
    min_items      -- minimum number of kept children for a node to be
                      reported as a list candidate (default 5).

    Returns a list of dicts of the form
    ``{"list": node, "items": [child, ...]}``, in traversal order.
    """
    list_candidate_nodes = []

    for node in minidom.postorder_dfs_walk_iterator(doc.documentElement):
        items = []
        for child in minidom.element_child_iterator(node):
            # The first child seeds the run; each later child is compared
            # with the last *accepted* item, not with its direct sibling.
            if not items or domsimi.compute_simi(items[-1], child) > simi_threshold:
                items.append(child)

        # Enough mutually similar children: treat the node as a list.
        if len(items) >= min_items:
            list_candidate_nodes.append({"list": node, "items": items})

    return list_candidate_nodes