Example #1
0
 def test_very_simple2(self):
     s = """<div>some text <i>in italic</i> and something else
                 <pre> <div>skip this</div> </pre>
                 <b>bold stuff</b> after the script
            </div>"""
     blocks = KohlschuetterBase.blockify(s)
     self.block_output_tokens(blocks, [['some', 'text', 'in', 'italic', 'and', 'something', 'else', 'bold', 'stuff', 'after', 'the', 'script']])
Example #2
0
 def test_unicode(self):
     s = u"""<div><div><a href="."> the registered trademark \xae</a></div></div"""
     blocks = KohlschuetterBase.blockify(s)
     self.block_output_tokens(blocks,
         [['the', 'registered', 'trademark', u'\xae']])
     self.link_output_tokens(blocks,
         [['the', 'registered', 'trademark', u'\xae']])
Example #3
0
 def test_simple_two_blocks(self):
     s = """<h1>A title <i>with italics</i> and other words</h1>
            some text outside the h1
            <div>a div <span class="test"> with a span </span> more </div>"""
     blocks = KohlschuetterBase.blockify(s)
     self.block_output_tokens(blocks,
            [['A', 'title', 'with', 'italics', 'and', 'other', 'words', 'some', 'text', 'outside', 'the', 'h1'],
             ['a', 'div', 'with', 'a', 'span', 'more']])
Example #4
0
 def test_very_simple(self):
     """test_very_simple"""
     s = """<div>some text
                 <script> skip this </script>
                 more text here
            </div>"""
     blocks = KohlschuetterBase.blockify(s)
     self.block_output_tokens(blocks, [['some', 'text', 'more', 'text', 'here']])
Example #5
0
 def test_empty_blocks(self):
     s = """<div> .! </div>
             some text
            <h1> in an h1 </h1>
            <p> ! _ </p>
         """
     blocks = KohlschuetterBase.blockify(s)
     self.block_output_tokens(blocks,
                 [['.!', 'some', 'text'], ['in', 'an', 'h1']])
Example #6
0
 def test_comment(self):
     s = """<H1>h1 tag word</H1>
            <!-- a comment -->
            orphaned text
            <TABLE><tr><td>table data</td></tr><tr><td>second row</td></tr></TABLE>
            final
            """
     blocks = KohlschuetterBase.blockify(s)
     self.block_output_tokens(blocks,
             [['h1', 'tag', 'word', 'orphaned', 'text'],
              ['table', 'data', 'second', 'row', 'final']])
 def test_all_non_english(self):
     s = u"""<div> <div> \u03b4\u03bf\u03b3 </div> <div> <a href="summer">\xe9t\xe9</a> </div>
      <div> \u62a5\u9053\u4e00\u51fa </div> </div>"""
     blocks = KohlschuetterBase.blockify(s)
     self.block_output_tokens(blocks,
         [[u'\u03b4\u03bf\u03b3'],
         [u'\xe9t\xe9'],
         [u'\u62a5\u9053\u4e00\u51fa']])
     self.link_output_tokens(blocks,
         [[],
          [u'\xe9t\xe9'],
          []])
Example #8
0
 def test_nested_blocks(self):
     s = """initial text
         <div>div <p> with paragraph </p>
         after Paragraph
         <div> nested div <div> and again </div>here</div>
         </div>
         final
         <div> <i> italic </i> before <h1>tag</h1></div>"""
     blocks = KohlschuetterBase.blockify(s)
     self.block_output_tokens(blocks,
             [['initial', 'text'],
             ['div'],
             ['with', 'paragraph', 'after', 'Paragraph'],
             ['nested', 'div'],
             ['and', 'again', 'here', 'final'],
             ['italic', 'before'],
             ['tag']])
Example #9
0
    def test_anchors(self):
        s = """<a href=".">anchor text</a>
               more
               <div>text <a href=".">123</a><div>MORE!</div></div>
               an img link<a href="."><img src="."></a>there
               <table><tr><td><a href=".">WILL <img src="."> THIS PASS <b>THE TEST</b> ??</a></tr></td></table>"""
        blocks = KohlschuetterBase.blockify(s)

        self.block_output_tokens(blocks,
              [['anchor', 'text', 'more'],
              ['text', '123'],
              ['MORE!', 'an', 'img', 'link', 'there'],
              ['WILL', 'THIS', 'PASS', 'THE', 'TEST', '??']])

        self.link_output_tokens(blocks,
            [['anchor', 'text'],
             ['123'],
             [],
             ['WILL', 'THIS', 'PASS', 'THE', 'TEST', '??']])
Example #10
0
    def test_big_html(self):
        s = """
<html>

<body>
<h1>Inside the h1 tag </h1>
<div id="content">
    <b>First line of the content in bold</b>
    <p>A paragraph with <a href="link_target.html">a link</a> and some 

    additional words.

    <p>Second paragraph

    <blockquote>Insert a block quote here</blockquote>

    <div class="image_css" id="image1"><img src="img.jpg"></div>
    
    <p>Some more text after the image
    <h2>An h2 tag just for kicks</h2>
    <p>Finally more text at the end of the content
</div>


<div class="begin_comments">
    <div id="comment1">
        <p>This is a comment</p>
        <p>with two paragraphs <a href="spam_link.html">and some comment spam</a>
    </div>
    <div id="comment2">
        <p>Second comment</p>
    </div>
</div>

<div class="footer"><a href="footer_link.html"><img src="footer_image.jpg" alt="image as anchor text"></a>Footer text
</div>

</html>
"""
        blocks = KohlschuetterBase.blockify(s)

        self.block_output_tokens(blocks,
        [['Inside', 'the', 'h1', 'tag'],
        ['First', 'line', 'of', 'the', 'content', 'in', 'bold'],
        ['A', 'paragraph', 'with', 'a', 'link', 'and', 'some', 'additional', 'words.'],
        ['Second', 'paragraph', 'Insert', 'a', 'block', 'quote', 'here'],
        ['Some', 'more', 'text', 'after', 'the', 'image'],
        ['An', 'h2', 'tag', 'just', 'for', 'kicks'],
        ['Finally', 'more', 'text', 'at', 'the', 'end', 'of', 'the', 'content'],
        ['This', 'is', 'a', 'comment'],
        ['with', 'two', 'paragraphs', 'and', 'some', 'comment', 'spam'],
        ['Second', 'comment'],
        ['Footer', 'text']])

        self.link_output_tokens(blocks,
            [[],
            [],
            ['a', 'link'],
            [],
            [],
            [],
            [],
            [],
            ['and', 'some', 'comment', 'spam'],
            [],
            []])