Esempio n. 1
0
def stm(root1, root2):
	'''
		Simple tree matching score for the trees rooted at root1 and root2.

		Returns 0 when minidom.is_same() rejects the pair of roots.
		Otherwise returns 1 (for the accepted root pair) plus the best
		total obtained by pairing the roots' element children, where the
		pairing is computed with the recurrence

			table[r][c] = max(table[r][c-1],
			                  table[r-1][c],
			                  table[r-1][c-1] + stm(child_a, child_b))

		over the r-th element child of root1 and the c-th element child of
		root2. Row 0 and column 0 of the table stay at 0.
		'''
	# Guard clause: roots that are not "the same" contribute nothing, and
	# their sub-trees are never inspected.
	if not minidom.is_same(root1, root2):
		return 0

	# Number of first-level sub-trees under each root.
	rows = minidom.element_child_count(root1)
	cols = minidom.element_child_count(root2)

	# (rows + 1) x (cols + 1) table of zeros; the extra row and column are
	# the "no children considered yet" base cases of the recurrence.
	table = [[0] * (cols + 1) for _ in range(rows + 1)]

	# Indices start at 1 so that r-1 / c-1 address the base row / column.
	for r, child_a in enumerate(minidom.element_child_iterator(root1), 1):
		for c, child_b in enumerate(minidom.element_child_iterator(root2), 1):
			table[r][c] = max(
				table[r][c - 1],
				table[r - 1][c],
				table[r - 1][c - 1] + stm(child_a, child_b),
			)

	# +1 accounts for the root pair accepted by the guard above.
	return table[rows][cols] + 1
def get_list_candidate_nodes(doc, simi_threshold=0.8, min_items=5):
	'''
		Find all the list candidate nodes in the web page DOM tree.

		Every node produced by
		minidom.postorder_dfs_walk_iterator(doc.documentElement) is
		examined. Its element children are scanned in iteration order: the
		first child always starts the run of item candidates, and each
		later child joins the run only if its domsimi.compute_simi() score
		against the most recently accepted item is strictly greater than
		simi_threshold. Children that fail the comparison are skipped; they
		do not end the run.

		Parameters:
			doc            -- document object exposing documentElement
			simi_threshold -- score a child must exceed to be accepted as
			                  an item (default 0.8, the value that used to
			                  be hard-coded)
			min_items      -- minimum number of accepted items for a node
			                  to be reported (default 5, i.e. the former
			                  "more than 4" rule)

		Returns a list of dicts, one per reported node, of the form
		{"list": <node>, "items": [<accepted children>]}, in walk order.
		'''
	list_candidate_nodes = []

	# Walk the DOM tree. The loop variable is deliberately not called
	# "next", so the builtin next() stays usable inside this function.
	for node in minidom.postorder_dfs_walk_iterator(doc.documentElement):
		item_candidates = []

		for child in minidom.element_child_iterator(node):
			# An empty run is seeded unconditionally; otherwise compare with
			# the last accepted item (not the immediately preceding sibling).
			if not item_candidates:
				item_candidates.append(child)
			elif domsimi.compute_simi(item_candidates[-1], child) > simi_threshold:
				item_candidates.append(child)

		# Report the node only when enough similar children were found.
		if len(item_candidates) >= min_items:
			list_candidate_nodes.append({"list": node, "items": item_candidates})

	return list_candidate_nodes