Example #1
0
    def _build_clusters(self, element):
        """
        build candidate clusters according to element
        :return:
        """
        descendants_tree = defaultdict(list)
        descendants = descendants_of_body(element)
        for descendant in descendants:
            # if one element does not have enough siblings, it can not become a child of candidate element
            if descendant.number_of_siblings + 1 < self.min_number:
                continue
            # if min length is larger than specified max length, it can not become a child of candidate element
            if descendant.a_descendants_group_text_min_length > self.max_length:
                continue
            # if max length is smaller than specified min length, it can not become a child of candidate element
            if descendant.a_descendants_group_text_max_length < self.min_length:
                continue
            # descendant element must have same siblings which their similarity should not below similarity_threshold
            if descendant.similarity_with_siblings < self.similarity_threshold:
                continue
            descendants_tree[descendant.parent_selector].append(descendant)
        descendants_tree = dict(descendants_tree)

        # cut tree, remove parent block
        selectors = sorted(list(descendants_tree.keys()))
        last_selector = None
        for selector in selectors[::-1]:
            # if later selector
            if last_selector and selector and last_selector.startswith(
                    selector):
                del descendants_tree[selector]
            last_selector = selector
        clusters = cluster_dict(descendants_tree)
        return clusters
Example #2
0
    def process(self, element: Element):
        """
        extract content from html
        :param element:
        :return:
        """
        # preprocess
        preprocess4content(element)

        # start to evaluate every child element
        element_infos = []
        descendants = descendants_of_body(element)

        # get std of density_of_text among all elements
        density_of_text = [
            descendant.density_of_text for descendant in descendants
        ]
        density_of_text_std = np.std(density_of_text, ddof=1)

        # get density_score of every element
        for descendant in descendants:
            score = np.log(density_of_text_std) * \
                    descendant.density_of_text * \
                    np.log10(descendant.number_of_p_descendants + 2) * \
                    np.log(descendant.density_of_punctuation)
            descendant.density_score = score

        # sort element info by density_score
        descendants = sorted(descendants,
                             key=lambda x: x.density_score,
                             reverse=True)
        descendant_first = descendants[0] if descendants else None
        if descendant_first is None:
            return None
        paragraphs = descendant_first.xpath('.//p//text()')
        paragraphs = [
            paragraph.strip() if paragraph else '' for paragraph in paragraphs
        ]
        paragraphs = list(filter(lambda x: x, paragraphs))
        text = '\n'.join(paragraphs)
        text = text.strip()
        return text