Exemple #1
0
class ArticleExtractor(Extractor):
    """
    Full-text extractor tuned towards news articles.

    In the news-article scenario it achieves higher accuracy than DefaultExtractor, and it works very well for most
    types of article-like HTML.
    """

    # Class-level chain; the filters are handed to FilterChain in exactly this order.
    _filter_chain = filters.FilterChain([
        filters.TerminatingBlocksFinder(),
        filters.DocumentTitleMatchClassifier(None, True),
        filters.NumWordsRulesClassifier(),
        filters.IgnoreBlocksAfterContentFilter(),
        filters.BlockProximityFusion(1, False, False),
        filters.BoilerplateBlockFilter(),
        filters.BlockProximityFusion(1, True, False),
        filters.KeepLargestBlockFilter(),
        filters.ExpandTitleToContentFilter(),
    ])

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor with the class-level filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
        """
        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)
Exemple #2
0
class DefaultExtractor(Extractor):
    """
    A quite generic full-text extractor.

    Simpler than ArticleExtractor (no heuristics), but usually worse.
    """

    # Class-level chain; the filters are handed to FilterChain in exactly this order.
    _filter_chain = filters.FilterChain([
        filters.SimpleBlockFusionProcessor(),
        filters.BlockProximityFusion(1, False, False),
        filters.DensityRulesClassifier()
    ])

    def __init__(self):
        """
        Set up the extractor with the class-level filter chain.
        """
        chain = self._filter_chain
        super().__init__(chain)
Exemple #3
0
class ArticleSentencesExtractor(Extractor):
    """
    Full-text extractor tuned towards extracting sentences from news articles.
    """

    # The first element is ArticleExtractor's own chain object, followed by two further filters.
    _filter_chain = filters.FilterChain([
        ArticleExtractor._filter_chain,
        filters.SplitParagraphBlocksFilter(),
        filters.MinClauseWordsFilter()
    ])

    def __init__(self):
        """
        Set up the extractor with the class-level filter chain.
        """
        chain = self._filter_chain
        super().__init__(chain)
Exemple #4
0
class LargestContentExtractor(Extractor):
    """
    Full-text extractor which extracts the largest text component of a page.

    Like DefaultExtractor, but keeps the largest text block only. For news articles it may perform better than
    DefaultExtractor, but usually worse than ArticleExtractor.
    """

    # Class-level chain; the filters are handed to FilterChain in exactly this order.
    _filter_chain = filters.FilterChain([
        filters.NumWordsRulesClassifier(),
        filters.BlockProximityFusion(1, False, False),
        filters.KeepLargestBlockFilter()
    ])

    def __init__(self):
        """
        Set up the extractor with the class-level filter chain.
        """
        chain = self._filter_chain
        super().__init__(chain)
Exemple #5
0
class ArticleSentencesExtractor(Extractor):
    """
    Full-text extractor tuned towards extracting sentences from news articles.
    """

    # The first element is ArticleExtractor's own chain object, followed by two further filters.
    _filter_chain = filters.FilterChain([
        ArticleExtractor._filter_chain,
        filters.SplitParagraphBlocksFilter(),
        filters.MinClauseWordsFilter(),
    ])

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor with the class-level filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
        """
        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)
Exemple #6
0
class DefaultExtractor(Extractor):
    """
    A quite generic full-text extractor.

    Simpler than ArticleExtractor (no heuristics), but usually worse.
    """

    # Class-level chain; the filters are handed to FilterChain in exactly this order.
    _filter_chain = filters.FilterChain([
        filters.SimpleBlockFusionProcessor(),
        filters.BlockProximityFusion(1, False, False),
        filters.DensityRulesClassifier(),
    ])

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor with the class-level filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
        """
        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)
Exemple #7
0
class ArticleExtractor(Extractor):
    """
    Full-text extractor tuned towards news articles.

    In the news-article scenario it achieves higher accuracy than DefaultExtractor, and it works very well for most
    types of article-like HTML.
    """

    # Class-level chain; the filters are handed to FilterChain in exactly this order.
    _filter_chain = filters.FilterChain([
        filters.TerminatingBlocksFinder(),
        filters.DocumentTitleMatchClassifier(None, True),
        filters.NumWordsRulesClassifier(),
        filters.IgnoreBlocksAfterContentFilter(),
        filters.BlockProximityFusion(1, False, False),
        filters.BoilerplateBlockFilter(),
        filters.BlockProximityFusion(1, True, False),
        filters.KeepLargestBlockFilter(),
        filters.ExpandTitleToContentFilter()
    ])

    def __init__(self):
        """
        Set up the extractor with the class-level filter chain.
        """
        chain = self._filter_chain
        super().__init__(chain)
Exemple #8
0
class LargestContentExtractor(Extractor):
    """
    Full-text extractor which extracts the largest text component of a page.

    Like DefaultExtractor, but keeps the largest text block only. For news articles it may perform better than
    DefaultExtractor, but usually worse than ArticleExtractor.
    """

    # Class-level chain; the filters are handed to FilterChain in exactly this order.
    _filter_chain = filters.FilterChain([
        filters.NumWordsRulesClassifier(),
        filters.BlockProximityFusion(1, False, False),
        filters.KeepLargestBlockFilter(),
    ])

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor with the class-level filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is encountered.
        """
        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)