def get_list_item_info(list_node):
    '''
    Collect the wanted info for every item of a list node.

    The wanted info is currently the most valuable outlink of an item:
    the "href" attribute of the anchor (found with the ".//a" xpath) whose
    node text is the longest.

    list_node -- mapping whose "items" entry is an iterable of item nodes.

    Returns a list with one entry per item, in iteration order. An entry is
    the empty string when the item has no anchor with non-empty text.
    '''
    list_item_infos = []
    for list_item_node in list_node["items"]:
        # Get all the anchors below this item.
        anchors = minidom.find_all_nodes_by_xpath(list_item_node, ".//a")
        max_length = 0
        important_anchor = ""
        for anchor in anchors:
            # Measure the anchor text once instead of extracting it twice.
            text_length = len(minidom.get_node_text(anchor))
            # Strict comparison: the first of several equally long anchors
            # wins, and anchors with empty text are never selected.
            if text_length > max_length:
                max_length = text_length
                important_anchor = anchor.getAttribute("href")
        list_item_infos.append(important_anchor)
    # TODO: the most valuable text info is not collected yet.
    return list_item_infos
def get_list_node(list_candidate_nodes):
    '''
    Find the most important list candidate node and return it as the
    result list node.

    Importance is measured by the length of the node text of each
    candidate's "list" entry, as returned by minidom.get_node_text.
    As a side effect every candidate gets a "length" entry holding
    that value.

    list_candidate_nodes -- iterable of dict-like candidates, each with a
                            "list" entry.

    Returns the candidate with the greatest text length (the earliest one
    when several are equally long), or None when there are no candidates.
    '''
    # Materialize first so that one-shot iterables can be both annotated
    # and searched.
    candidates = list(list_candidate_nodes)
    for candidate in candidates:
        candidate["length"] = len(minidom.get_node_text(candidate["list"]))
    if not candidates:
        return None
    # max() returns the first maximal element, which is the same element a
    # stable descending sort would put in front. Unlike sorted(cmp=...), it
    # works on both Python 2 and Python 3.
    return max(candidates, key=lambda candidate: candidate["length"])