def auto_excerpt(self):
    """
    Attempts to detect the text of this page (ignoring all navigation
    and other clutter), returning a list of strings. Each string
    represents a paragraph.
    """
    from ebdata.textmining.treeutils import make_tree
    page_tree = make_tree(self.html)

    # Full-entry RSS feeds already contain just the article content,
    # so no clutter detection is needed.
    if self.seed.rss_full_entry:
        from ebdata.templatemaker.textlist import html_to_paragraph_list
        return html_to_paragraph_list(page_tree)

    if self.seed.strip_noise:
        from ebdata.templatemaker.clean import strip_template
        try:
            companion_html = self.companion_page().html
        except IndexError:
            # No companion page available -- skip template stripping.
            pass
        else:
            strip_template(page_tree, make_tree(companion_html))

    if self.seed.guess_article_text:
        from ebdata.templatemaker.articletext import article_text
        return article_text(page_tree)

    from ebdata.templatemaker.textlist import html_to_paragraph_list
    return html_to_paragraph_list(page_tree)
def assertStrips(self, html1, html2, expected, num_removals, check_ids=False):
    """
    Asserts that strip_template(html1, html2) will result in the expected
    HTML string, and that the return value is num_removals.
    """
    # The test strings should *not* have <html> and <body> tags, for the
    # sake of brevity.
    wrapper = '<html><body>%s</body></html>'
    doc1 = document_fromstring(wrapper % html1)
    doc2 = document_fromstring(wrapper % html2)
    removals = strip_template(doc1, doc2, check_ids=check_ids)
    self.assertEqual(etree.tostring(doc1, method='html'), wrapper % expected)
    self.assertEqual(removals, num_removals)