def _build_clusters(self, element):
    """
    Build candidate clusters from the nodes yielded by
    ``descendants_of_body(element)``.

    Nodes that pass the sibling-count, text-length and similarity filters are
    grouped by their ``parent_selector``; groups whose selector is a string
    prefix of another group's selector are then dropped, and the remaining
    groups are handed to ``cluster_dict``.

    :param element: element passed to ``descendants_of_body``
    :return: whatever ``cluster_dict`` returns for the surviving groups
    """
    grouped = defaultdict(list)
    for node in descendants_of_body(element):
        # A node is rejected as soon as one of these checks fails; the checks
        # are evaluated lazily, in this order.
        rejected = (
            # counting itself, the node needs at least min_number siblings
            node.number_of_siblings + 1 < self.min_number
            # shortest grouped text is already longer than the allowed max
            or node.a_descendants_group_text_min_length > self.max_length
            # longest grouped text is still shorter than the required min
            or node.a_descendants_group_text_max_length < self.min_length
            # siblings are not similar enough to each other
            or node.similarity_with_siblings < self.similarity_threshold
        )
        if rejected:
            continue
        grouped[node.parent_selector].append(node)

    tree = dict(grouped)

    # Prune parent blocks. Walking the sorted selectors backwards, each
    # selector is compared with the one that follows it in sorted order; in a
    # sorted list a string that is a prefix of any other entry is always a
    # prefix of its immediate successor, so this single comparison suffices.
    following = None
    for current in reversed(sorted(tree)):
        if following and current and following.startswith(current):
            del tree[current]
        # Updated even when `current` was removed, so chains of ancestors
        # are pruned one after another.
        following = current

    return cluster_dict(tree)
def process(self, element: Element):
    """
    Extract the main text content from an HTML element tree.

    The element is first passed to ``preprocess4content``. Every node
    returned by ``descendants_of_body`` then gets a ``density_score``
    attribute, and the text found under the ``<p>`` descendants of the
    highest-scoring node is returned.

    :param element: root element to extract content from
    :return: the non-empty, stripped paragraph texts joined with newlines,
        or ``None`` when there are no candidate nodes
    """
    preprocess4content(element)

    # Materialise once: the nodes are traversed several times below.
    descendants = list(descendants_of_body(element))

    # Nothing to score. Returning here avoids calling np.std on an empty
    # sequence, which only produces NaN plus RuntimeWarnings.
    if not descendants:
        return None

    # Sample standard deviation of text density over all candidates. It is
    # the same for every node, so its log is computed once.
    # NOTE: with a single candidate, ddof=1 makes this NaN; that node is
    # still the only candidate and is therefore the one returned.
    density_of_text_std = np.std(
        [descendant.density_of_text for descendant in descendants],
        ddof=1,
    )
    log_density_std = np.log(density_of_text_std)

    for descendant in descendants:
        descendant.density_score = (
            log_density_std
            * descendant.density_of_text
            # +2 keeps the log10 factor positive for nodes without <p>
            * np.log10(descendant.number_of_p_descendants + 2)
            * np.log(descendant.density_of_punctuation)
        )

    # Stable descending sort: among equal scores the earliest node wins.
    best = sorted(
        descendants, key=lambda node: node.density_score, reverse=True
    )[0]

    # Strip every text node and drop the ones that end up empty.
    stripped = (
        paragraph.strip()
        for paragraph in best.xpath('.//p//text()')
        if paragraph
    )
    return '\n'.join(paragraph for paragraph in stripped if paragraph)