コード例 #1
0
def get_list_item_info(list_node):
	"""
	Collect the most valuable outlink of every item in a list node.

	For each entry of ``list_node["items"]`` the anchors matched by the XPath
	``.//a`` are examined, and the ``href`` of the anchor with the longest
	anchor text is kept. Items without any anchor that has non-empty text
	contribute nothing to the result.

	Args:
		list_node: mapping whose ``"items"`` entry is an iterable of list
			item nodes.

	Returns:
		A list of ``href`` attribute values, at most one per list item, in
		item order.
	"""
	list_item_infos = []
	for list_item_node in list_node["items"]:
		anchors = minidom.find_all_nodes_by_xpath(list_item_node, ".//a")
		max_length = 0
		important_anchor = None
		for anchor in anchors:
			text_length = len(minidom.get_node_text(anchor))
			# Strict '>' keeps the first anchor on ties and ignores anchors
			# whose text is empty.
			if text_length > max_length:
				max_length = text_length
				important_anchor = anchor.getAttribute("href")
		# Append once per item, after all anchors have been compared;
		# appending inside the loop would also record every intermediate
		# "longest so far" anchor.
		if important_anchor is not None:
			list_item_infos.append(important_anchor)

	return list_item_infos
コード例 #2
0
def get_list_node(list_candidate_nodes):
	"""
	Pick the most important list candidate node.

	Importance is the length of the text of a candidate's ``"list"`` node as
	returned by ``minidom.get_node_text``. As a side effect, every candidate
	gets that value stored under its ``"length"`` key.

	Args:
		list_candidate_nodes: iterable of dict-like candidates, each with a
			``"list"`` entry.

	Returns:
		The candidate with the greatest text length (the first one on ties),
		or None when there are no candidates.
	"""
	# Materialize once so the input is walked a single time, whatever
	# kind of iterable it is.
	candidates = list(list_candidate_nodes)
	for candidate in candidates:
		candidate["length"] = len(minidom.get_node_text(candidate["list"]))

	if not candidates:
		return None

	# max() with a key function works on Python 2 and 3 alike (the cmp=
	# argument of sorted() was removed in Python 3) and returns the first
	# maximal element, matching a stable descending sort.
	return max(candidates, key=lambda candidate: candidate["length"])