class ArticleExtractor(Extractor):
    """
    Full-text extractor tuned towards news articles.

    For that scenario it achieves higher accuracy than DefaultExtractor, and it works very well for most
    types of article-like HTML.
    """

    _filter_chain = filters.FilterChain(
        [
            filters.TerminatingBlocksFinder(),
            filters.DocumentTitleMatchClassifier(None, True),
            filters.NumWordsRulesClassifier(),
            filters.IgnoreBlocksAfterContentFilter(),
            filters.BlockProximityFusion(1, False, False),
            filters.BoilerplateBlockFilter(),
            filters.BlockProximityFusion(1, True, False),
            filters.KeepLargestBlockFilter(),
            filters.ExpandTitleToContentFilter(),
        ]
    )

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor around the class-level filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is
            encountered.
        """
        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)
class DefaultExtractor(Extractor):
    """
    A quite generic full-text extractor.

    Usually worse than ArticleExtractor, but simpler/no heuristics.
    """

    _filter_chain = filters.FilterChain([
        filters.SimpleBlockFusionProcessor(),
        filters.BlockProximityFusion(1, False, False),
        filters.DensityRulesClassifier(),
    ])

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Initialize extractor

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is
            encountered.
        """
        # Forward the flag explicitly so this constructor has the same signature as the other
        # extractors that expose raise_on_failure.
        super().__init__(self._filter_chain, raise_on_failure)
class ArticleSentencesExtractor(Extractor):
    """
    A full-text extractor which is tuned towards extracting sentences from news articles.
    """

    # The ArticleExtractor chain is nested as the first element of this chain.
    _filter_chain = filters.FilterChain([
        ArticleExtractor._filter_chain,
        filters.SplitParagraphBlocksFilter(),
        filters.MinClauseWordsFilter(),
    ])

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Initialize extractor

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is
            encountered.
        """
        # Forward the flag explicitly so this constructor has the same signature as the other
        # extractors that expose raise_on_failure.
        super().__init__(self._filter_chain, raise_on_failure)
class LargestContentExtractor(Extractor):
    """
    A full-text extractor which extracts the largest text component of a page.

    For news articles, it may perform better than the DefaultExtractor, but usually worse than
    ArticleExtractor. Like DefaultExtractor, but keeps the largest text block only.
    """

    _filter_chain = filters.FilterChain([
        filters.NumWordsRulesClassifier(),
        filters.BlockProximityFusion(1, False, False),
        filters.KeepLargestBlockFilter(),
    ])

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Initialize extractor

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is
            encountered.
        """
        # Forward the flag explicitly so this constructor has the same signature as the other
        # extractors that expose raise_on_failure.
        super().__init__(self._filter_chain, raise_on_failure)
class ArticleSentencesExtractor(Extractor):
    """
    Full-text extractor tuned towards pulling individual sentences out of news articles.
    """

    # The ArticleExtractor chain is nested as the first element of this chain.
    _filter_chain = filters.FilterChain(
        [
            ArticleExtractor._filter_chain,
            filters.SplitParagraphBlocksFilter(),
            filters.MinClauseWordsFilter(),
        ]
    )

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor around the class-level filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is
            encountered.
        """
        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)
class DefaultExtractor(Extractor):
    """
    Quite generic full-text extractor.

    Usually worse than ArticleExtractor, but simpler/no heuristics.
    """

    _filter_chain = filters.FilterChain(
        [
            filters.SimpleBlockFusionProcessor(),
            filters.BlockProximityFusion(1, False, False),
            filters.DensityRulesClassifier(),
        ]
    )

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor around the class-level filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is
            encountered.
        """
        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)
class ArticleExtractor(Extractor):
    """
    A full-text extractor which is tuned towards news articles.

    In this scenario it achieves higher accuracy than DefaultExtractor. Works very well for most types of
    Article-like HTML.
    """

    _filter_chain = filters.FilterChain([
        filters.TerminatingBlocksFinder(),
        filters.DocumentTitleMatchClassifier(None, True),
        filters.NumWordsRulesClassifier(),
        filters.IgnoreBlocksAfterContentFilter(),
        filters.BlockProximityFusion(1, False, False),
        filters.BoilerplateBlockFilter(),
        filters.BlockProximityFusion(1, True, False),
        filters.KeepLargestBlockFilter(),
        filters.ExpandTitleToContentFilter(),
    ])

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Initialize extractor

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is
            encountered.
        """
        # Forward the flag explicitly so this constructor has the same signature as the other
        # extractors that expose raise_on_failure.
        super().__init__(self._filter_chain, raise_on_failure)
class LargestContentExtractor(Extractor):
    """
    Full-text extractor that extracts the largest text component of a page.

    On news articles it may do better than DefaultExtractor, but is usually worse than ArticleExtractor.
    Like DefaultExtractor, but keeps the largest text block only.
    """

    _filter_chain = filters.FilterChain(
        [
            filters.NumWordsRulesClassifier(),
            filters.BlockProximityFusion(1, False, False),
            filters.KeepLargestBlockFilter(),
        ]
    )

    def __init__(self, raise_on_failure: bool = True) -> None:
        """
        Set up the extractor around the class-level filter chain.

        :param raise_on_failure: whether or not to raise an exception if a text extraction failure is
            encountered.
        """
        chain = self._filter_chain
        super().__init__(chain, raise_on_failure)