コード例 #1
0
ファイル: extractors.py プロジェクト: seinaville/BoilerPy3
class ArticleExtractor(Extractor):
    """
    Full-text extractor tuned for news articles.

    For article-like HTML it achieves higher accuracy than DefaultExtractor and works very well on most pages of
    that kind.
    """

    # Shared, class-level chain; it is handed to the base Extractor in __init__.
    # NOTE(review): the filters are listed in the order they are presumably applied - confirm against FilterChain.
    _filter_chain = filters.FilterChain(
        [
            filters.TerminatingBlocksFinder(),
            filters.DocumentTitleMatchClassifier(None, True),
            filters.NumWordsRulesClassifier(),
            filters.IgnoreBlocksAfterContentFilter(),
            filters.BlockProximityFusion(1, False, False),
            filters.BoilerplateBlockFilter(),
            filters.BlockProximityFusion(1, True, False),
            filters.KeepLargestBlockFilter(),
            filters.ExpandTitleToContentFilter(),
        ]
    )

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor with the article filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
        """

        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)
コード例 #2
0
ファイル: extractors.py プロジェクト: sourcepirate/BoilerPy3
class DefaultExtractor(Extractor):
    """
    Quite generic full-text extractor.

    Simpler than ArticleExtractor (no heuristics), but usually worse.
    """

    # Shared, class-level chain; it is handed to the base Extractor in __init__.
    _filter_chain = filters.FilterChain(
        [
            filters.SimpleBlockFusionProcessor(),
            filters.BlockProximityFusion(1, False, False),
            filters.DensityRulesClassifier(),
        ]
    )

    def __init__(self):
        """Set up the extractor with the default filter chain."""
        chain = self._filter_chain
        super().__init__(chain)
コード例 #3
0
ファイル: extractors.py プロジェクト: sourcepirate/BoilerPy3
class ArticleSentencesExtractor(Extractor):
    """
    Full-text extractor tuned towards pulling sentences out of news articles.
    """

    # Reuses ArticleExtractor's chain as the first stage, then adds SplitParagraphBlocksFilter and
    # MinClauseWordsFilter on top of it.
    _filter_chain = filters.FilterChain(
        [
            ArticleExtractor._filter_chain,
            filters.SplitParagraphBlocksFilter(),
            filters.MinClauseWordsFilter(),
        ]
    )

    def __init__(self):
        """Set up the extractor with the sentence-oriented filter chain."""
        chain = self._filter_chain
        super().__init__(chain)
コード例 #4
0
ファイル: extractors.py プロジェクト: sourcepirate/BoilerPy3
class LargestContentExtractor(Extractor):
    """
    Full-text extractor that extracts the largest text component of a page.

    Like DefaultExtractor, but keeps the largest text block only. For news articles it may perform better than
    DefaultExtractor, but usually worse than ArticleExtractor.
    """

    # Shared, class-level chain; it is handed to the base Extractor in __init__.
    _filter_chain = filters.FilterChain(
        [
            filters.NumWordsRulesClassifier(),
            filters.BlockProximityFusion(1, False, False),
            filters.KeepLargestBlockFilter(),
        ]
    )

    def __init__(self):
        """Set up the extractor with the largest-block filter chain."""
        chain = self._filter_chain
        super().__init__(chain)
コード例 #5
0
ファイル: extractors.py プロジェクト: seinaville/BoilerPy3
class ArticleSentencesExtractor(Extractor):
    """
    Full-text extractor tuned towards pulling sentences out of news articles.
    """

    # Reuses ArticleExtractor's chain as the first stage, then adds SplitParagraphBlocksFilter and
    # MinClauseWordsFilter on top of it.
    _filter_chain = filters.FilterChain(
        [
            ArticleExtractor._filter_chain,
            filters.SplitParagraphBlocksFilter(),
            filters.MinClauseWordsFilter(),
        ]
    )

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor with the sentence-oriented filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
        """

        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)
コード例 #6
0
ファイル: extractors.py プロジェクト: seinaville/BoilerPy3
class DefaultExtractor(Extractor):
    """
    Quite generic full-text extractor.

    Simpler than ArticleExtractor (no heuristics), but usually worse.
    """

    # Shared, class-level chain; it is handed to the base Extractor in __init__.
    _filter_chain = filters.FilterChain(
        [
            filters.SimpleBlockFusionProcessor(),
            filters.BlockProximityFusion(1, False, False),
            filters.DensityRulesClassifier(),
        ]
    )

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor with the default filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
        """

        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)
コード例 #7
0
ファイル: extractors.py プロジェクト: sourcepirate/BoilerPy3
class ArticleExtractor(Extractor):
    """
    Full-text extractor tuned for news articles.

    For article-like HTML it achieves higher accuracy than DefaultExtractor and works very well on most pages of
    that kind.
    """

    # Shared, class-level chain; it is handed to the base Extractor in __init__.
    # NOTE(review): the filters are listed in the order they are presumably applied - confirm against FilterChain.
    _filter_chain = filters.FilterChain(
        [
            filters.TerminatingBlocksFinder(),
            filters.DocumentTitleMatchClassifier(None, True),
            filters.NumWordsRulesClassifier(),
            filters.IgnoreBlocksAfterContentFilter(),
            filters.BlockProximityFusion(1, False, False),
            filters.BoilerplateBlockFilter(),
            filters.BlockProximityFusion(1, True, False),
            filters.KeepLargestBlockFilter(),
            filters.ExpandTitleToContentFilter(),
        ]
    )

    def __init__(self):
        """Set up the extractor with the article filter chain."""
        chain = self._filter_chain
        super().__init__(chain)
コード例 #8
0
ファイル: extractors.py プロジェクト: seinaville/BoilerPy3
class LargestContentExtractor(Extractor):
    """
    Full-text extractor that extracts the largest text component of a page.

    Like DefaultExtractor, but keeps the largest text block only. For news articles it may perform better than
    DefaultExtractor, but usually worse than ArticleExtractor.
    """

    # Shared, class-level chain; it is handed to the base Extractor in __init__.
    _filter_chain = filters.FilterChain(
        [
            filters.NumWordsRulesClassifier(),
            filters.BlockProximityFusion(1, False, False),
            filters.KeepLargestBlockFilter(),
        ]
    )

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor with the largest-block filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
        """

        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)