Beispiel #1
0
 def setUp(self):
     """Store a newly built HTMLVisitor on ``self.visitor``."""
     fresh_visitor = HTMLVisitor()
     self.visitor = fresh_visitor
 def setUp(self):
     """Store a newly built HTMLVisitor on ``self.visitor``."""
     fresh_visitor = HTMLVisitor()
     self.visitor = fresh_visitor
Beispiel #3
0
class TestVisitor(TestCase):
    """Exercise HTMLVisitor against parse trees produced by html_grammar."""

    def setUp(self):
        """Give each test its own HTMLVisitor as ``self.visitor``."""
        self.visitor = HTMLVisitor()

    def visit_rule(self, rule, text):
        """Parse ``text`` with grammar rule ``rule`` and visit the result.

        Returns whatever ``self.visitor.visit`` returns for the parse tree.
        """
        parsed = html_grammar[rule].parse(text)
        return self.visitor.visit(parsed)

    def assert_attrs(self, text, expected_attrs):
        """Assert ``text`` parsed as 'attrs' visits to ``expected_attrs``.

        The visited result is compared through its ``as_dict()`` form.
        """
        out = self.visit_rule('attrs', text)
        self.assertEqual(out.as_dict(), expected_attrs)

    def test_attrs_empty(self):
        """An empty attribute string yields an empty dict."""
        self.assert_attrs('', {})

    def test_attrs_mixed(self):
        """Double-quoted, single-quoted and bare values are all accepted."""
        text = 'foo="bar" key=\'value\' selected=1'
        # The unquoted 1 is expected back as an int, not the string '1'.
        expected = {'foo': 'bar', 'key': 'value', 'selected': 1}
        self.assert_attrs(text, expected)

    def test_option_selected(self):
        """An <option> with a valueless ``selected`` attribute is visited."""
        text = '<option value="value2" selected>Value 2</option>'
        out = self.visit_rule('option_element', text)
        self.assertIsInstance(out, HTMLElement)

    def test_open(self):
        """An opening tag reports its span, tag name and attributes."""
        text = '<a href="http://example.com">'
        out = self.visit_rule('a_open', text)
        self.assertEqual(out.start, 0)
        self.assertEqual(out.end, len(text))
        self.assertEqual(out.tag, 'a')
        self.assertEqual(
            out.attributes.as_dict(), {'href': 'http://example.com'})

    def test_br(self):
        """A <br> element reports its span and tag name."""
        text = '<br>'
        out = self.visit_rule('br_element', text)
        self.assertEqual(out.start, 0)
        self.assertEqual(out.end, len(text))
        self.assertEqual(out.tag, 'br')

    def test_element(self):
        """A <p> element exposes its open tag, close tag and text child."""
        text = '<p>This is a simple paragraph.</p>'
        out = self.visit_rule('p_element', text)
        # The element as a whole spans the full input.
        self.assertEqual(out.start, 0)
        self.assertEqual(out.end, len(text))
        self.assertEqual(out.tag, 'p')
        # The open tag ends just past the first '>'.
        self.assertEqual(out.open_tag.tag, 'p')
        self.assertEqual(out.open_tag.start, 0)
        self.assertEqual(out.open_tag.end, text.index('>') + 1)
        # The close tag runs from '</p>' to the end of the input.
        self.assertEqual(out.close_tag.tag, 'p')
        self.assertEqual(out.close_tag.start, text.index('</p>'))
        self.assertEqual(out.close_tag.end, len(text))
        self.assertEqual(len(out.children), 1)
        self.assertEqual(
            out.children[0].raw, 'This is a simple paragraph.')

    def test_text_block(self):
        """Plain text parsed as 'text_block' yields one item over the input."""
        text = 'This is a simple paragraph.'
        out = self.visit_rule('text_block', text)
        self.assertEqual(len(out), 1)
        self.assertEqual(out[0].start, 0)
        self.assertEqual(out[0].end, len(text))
        self.assertEqual(out[0].raw, 'This is a simple paragraph.')

    def test_html_simple_element(self):
        """A lone <p> parsed as 'html' yields one HTMLElement."""
        text = '<p>Simple Paragraph</p>'
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        self.assertIsInstance(out[0], HTMLElement)
        self.assertEqual(out[0].tag, 'p')

    def test_html_simple_text(self):
        """Bare text parsed as 'html' yields one HTMLInterval starting at 0."""
        text = 'Simple Text'
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        self.assertIsInstance(out[0], HTMLInterval)
        self.assertEqual(out[0].raw, 'Simple Text')
        self.assertEqual(out[0].start, 0)

    def test_html_simple_text_with_offset(self):
        """The visitor's ``offset`` shifts the reported start position."""
        text = 'Simple Text'
        # Parsing does not involve the visitor, so the offset only has to be
        # in place before the tree is visited.
        self.visitor.offset = 100
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        self.assertEqual(out[0].raw, 'Simple Text')
        self.assertEqual(out[0].start, 100)

    def test_html_complex(self):
        """Whitespace around elements is kept as HTMLText rendering as ''."""
        text = '''
<p>
    Paragraph 1
</p>
<p>
    Paragraph 2
</p>
'''
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 5)
        # Rendered with text_type (not str) to match the other tests here.
        expected = [
            (HTMLText, ''),
            (HTMLElement, '<p>Paragraph 1</p>'),
            (HTMLText, ''),
            (HTMLElement, '<p>Paragraph 2</p>'),
            (HTMLText, ''),
        ]
        for item, (expected_class, expected_text) in zip(out, expected):
            self.assertIsInstance(item, expected_class)
            self.assertEqual(text_type(item), expected_text)

    def test_html_with_code(self):
        """Inline <code> splits a paragraph into text, element and text."""
        text = '<p>Here is <code>code</code>.</p>'
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        p_elem = out[0]
        self.assertIsInstance(p_elem, HTMLElement)
        self.assertEqual('p', p_elem.tag)
        text1, code, text2 = p_elem.children
        self.assertEqual(text_type(text1), 'Here is')
        self.assertEqual(text_type(code), '<code>code</code>')
        self.assertEqual(text_type(text2), '.')

    def test_html_simple_table(self):
        """Nested table/tr/td elements are visited as nested children."""
        text = '<table><tr><td>A very dumb table</td></tr></table>'
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        table = out[0]
        self.assertEqual(table.tag, 'table')
        self.assertEqual(len(table.children), 1)
        tr = table.children[0]
        self.assertEqual(tr.tag, 'tr')
        self.assertEqual(len(tr.children), 1)
        td = tr.children[0]
        self.assertEqual(td.tag, 'td')
        self.assertEqual(len(td.children), 1)
        cell_text = td.children[0]
        self.assertEqual(text_type(cell_text), 'A very dumb table')

    def test_html_empty_tag(self):
        """An empty <td> still has one child, which renders as ''."""
        text = '<td></td>'
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        td = out[0]
        self.assertEqual(td.tag, 'td')
        self.assertEqual(len(td.children), 1)
        cell_text = td.children[0]
        self.assertEqual(text_type(cell_text), '')

    def test_add_issue(self):
        """add_issue appends an entry for the element to ``visitor.issues``."""
        text = '<p>A paragraph</p>'
        out = self.visit_rule('html', text)
        self.assertFalse(self.visitor.issues)
        self.assertEqual(len(out), 1)
        p_elem = out[0]
        self.visitor.add_issue('halt_import', p_elem)
        # 0 and 18 match the element's extent (len(text) == 18); presumably
        # these are its start/end and the dict holds extra issue parameters.
        self.assertEqual(self.visitor.issues, [('halt_import', 0, 18, {})])

    def test_html_headers(self):
        """An <h1> is visited as an HnElement, followed by text and a <p>."""
        text = """\
<h1>An H1 Header</h1>
<p>This is in the h1 section</p>
"""
        out = self.visit_rule('html', text)
        self.assertFalse(self.visitor.issues)
        h1_elem, text1, p_elem, text2 = out
        self.assertIsInstance(h1_elem, HnElement)
        self.assertEqual(text_type(h1_elem), '<h1>An H1 Header</h1>')
        self.assertIsInstance(text1, HTMLText)
        self.assertEqual(text_type(text1), '')
        self.assertIsInstance(p_elem, HTMLElement)
        self.assertEqual(
            text_type(p_elem), '<p>This is in the h1 section</p>')
        self.assertIsInstance(text2, HTMLText)
        self.assertEqual(text_type(text2), '')

    def test_double_quoted_text(self):
        """Surrounding double quotes are dropped from a 'text' value."""
        out = self.visit_rule('text', '"text"')
        self.assertEqual(['text'], out)

    def test_double_quoted_escaped_text(self):
        """Backslash-escaped double quotes come back unescaped."""
        out = self.visit_rule('text', '"the \\"text\\"."')
        self.assertEqual(['the "text".'], out)

    def test_single_quoted_escaped_text(self):
        """A backslash-escaped single quote comes back unescaped."""
        out = self.visit_rule('text', "'I don\\'t like escaped text'")
        self.assertEqual(["I don't like escaped text"], out)
class TestVisitor(TestCase):
    """Exercise HTMLVisitor against parse trees produced by html_grammar."""

    def setUp(self):
        """Give each test its own HTMLVisitor as ``self.visitor``."""
        self.visitor = HTMLVisitor()

    def visit_rule(self, rule, text):
        """Parse ``text`` with grammar rule ``rule`` and visit the result.

        Returns whatever ``self.visitor.visit`` returns for the parse tree.
        """
        parsed = html_grammar[rule].parse(text)
        return self.visitor.visit(parsed)

    def assert_attrs(self, text, expected_attrs):
        """Assert ``text`` parsed as 'attrs' visits to ``expected_attrs``.

        The visited result is compared through its ``as_dict()`` form.
        """
        out = self.visit_rule('attrs', text)
        self.assertEqual(out.as_dict(), expected_attrs)

    def test_attrs_empty(self):
        """An empty attribute string yields an empty dict."""
        self.assert_attrs('', {})

    def test_attrs_mixed(self):
        """Double-quoted, single-quoted and bare values are all accepted."""
        text = 'foo="bar" key=\'value\' selected=1'
        # The unquoted 1 is expected back as an int, not the string '1'.
        expected = {'foo': 'bar', 'key': 'value', 'selected': 1}
        self.assert_attrs(text, expected)

    def test_option_selected(self):
        """An <option> with a valueless ``selected`` attribute is visited."""
        text = '<option value="value2" selected>Value 2</option>'
        out = self.visit_rule('option_element', text)
        self.assertIsInstance(out, HTMLElement)

    def test_open(self):
        """An opening tag reports its span, tag name and attributes."""
        text = '<a href="http://example.com">'
        out = self.visit_rule('a_open', text)
        self.assertEqual(out.start, 0)
        self.assertEqual(out.end, len(text))
        self.assertEqual(out.tag, 'a')
        self.assertEqual(
            out.attributes.as_dict(), {'href': 'http://example.com'})

    def test_br(self):
        """A <br> element reports its span and tag name."""
        text = '<br>'
        out = self.visit_rule('br_element', text)
        self.assertEqual(out.start, 0)
        self.assertEqual(out.end, len(text))
        self.assertEqual(out.tag, 'br')

    def test_element(self):
        """A <p> element exposes its open tag, close tag and text child."""
        text = '<p>This is a simple paragraph.</p>'
        out = self.visit_rule('p_element', text)
        # The element as a whole spans the full input.
        self.assertEqual(out.start, 0)
        self.assertEqual(out.end, len(text))
        self.assertEqual(out.tag, 'p')
        # The open tag ends just past the first '>'.
        self.assertEqual(out.open_tag.tag, 'p')
        self.assertEqual(out.open_tag.start, 0)
        self.assertEqual(out.open_tag.end, text.index('>') + 1)
        # The close tag runs from '</p>' to the end of the input.
        self.assertEqual(out.close_tag.tag, 'p')
        self.assertEqual(out.close_tag.start, text.index('</p>'))
        self.assertEqual(out.close_tag.end, len(text))
        self.assertEqual(len(out.children), 1)
        self.assertEqual(
            out.children[0].raw, 'This is a simple paragraph.')

    def test_text_block(self):
        """Plain text parsed as 'text_block' yields one item over the input."""
        text = 'This is a simple paragraph.'
        out = self.visit_rule('text_block', text)
        self.assertEqual(len(out), 1)
        self.assertEqual(out[0].start, 0)
        self.assertEqual(out[0].end, len(text))
        self.assertEqual(out[0].raw, 'This is a simple paragraph.')

    def test_html_simple_element(self):
        """A lone <p> parsed as 'html' yields one HTMLElement."""
        text = '<p>Simple Paragraph</p>'
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        self.assertIsInstance(out[0], HTMLElement)
        self.assertEqual(out[0].tag, 'p')

    def test_html_simple_text(self):
        """Bare text parsed as 'html' yields one HTMLInterval starting at 0."""
        text = 'Simple Text'
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        self.assertIsInstance(out[0], HTMLInterval)
        self.assertEqual(out[0].raw, 'Simple Text')
        self.assertEqual(out[0].start, 0)

    def test_html_simple_text_with_offset(self):
        """The visitor's ``offset`` shifts the reported start position."""
        text = 'Simple Text'
        # Parsing does not involve the visitor, so the offset only has to be
        # in place before the tree is visited.
        self.visitor.offset = 100
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        self.assertEqual(out[0].raw, 'Simple Text')
        self.assertEqual(out[0].start, 100)

    def test_html_complex(self):
        """Whitespace around elements is kept as HTMLText rendering as ''."""
        text = '''
<p>
    Paragraph 1
</p>
<p>
    Paragraph 2
</p>
'''
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 5)
        # Rendered with text_type (not str) to match the other tests here.
        expected = [
            (HTMLText, ''),
            (HTMLElement, '<p>Paragraph 1</p>'),
            (HTMLText, ''),
            (HTMLElement, '<p>Paragraph 2</p>'),
            (HTMLText, ''),
        ]
        for item, (expected_class, expected_text) in zip(out, expected):
            self.assertIsInstance(item, expected_class)
            self.assertEqual(text_type(item), expected_text)

    def test_html_with_code(self):
        """Inline <code> splits a paragraph into text, element and text."""
        text = '<p>Here is <code>code</code>.</p>'
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        p_elem = out[0]
        self.assertIsInstance(p_elem, HTMLElement)
        self.assertEqual('p', p_elem.tag)
        text1, code, text2 = p_elem.children
        self.assertEqual(text_type(text1), 'Here is')
        self.assertEqual(text_type(code), '<code>code</code>')
        self.assertEqual(text_type(text2), '.')

    def test_html_simple_table(self):
        """Nested table/tr/td elements are visited as nested children."""
        text = '<table><tr><td>A very dumb table</td></tr></table>'
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        table = out[0]
        self.assertEqual(table.tag, 'table')
        self.assertEqual(len(table.children), 1)
        tr = table.children[0]
        self.assertEqual(tr.tag, 'tr')
        self.assertEqual(len(tr.children), 1)
        td = tr.children[0]
        self.assertEqual(td.tag, 'td')
        self.assertEqual(len(td.children), 1)
        cell_text = td.children[0]
        self.assertEqual(text_type(cell_text), 'A very dumb table')

    def test_html_empty_tag(self):
        """An empty <td> still has one child, which renders as ''."""
        text = '<td></td>'
        out = self.visit_rule('html', text)
        self.assertEqual(len(out), 1)
        td = out[0]
        self.assertEqual(td.tag, 'td')
        self.assertEqual(len(td.children), 1)
        cell_text = td.children[0]
        self.assertEqual(text_type(cell_text), '')

    def test_add_issue(self):
        """add_issue appends an entry for the element to ``visitor.issues``."""
        text = '<p>A paragraph</p>'
        out = self.visit_rule('html', text)
        self.assertFalse(self.visitor.issues)
        self.assertEqual(len(out), 1)
        p_elem = out[0]
        self.visitor.add_issue('halt_import', p_elem)
        # 0 and 18 match the element's extent (len(text) == 18); presumably
        # these are its start/end and the dict holds extra issue parameters.
        self.assertEqual(self.visitor.issues, [('halt_import', 0, 18, {})])

    def test_html_headers(self):
        """An <h1> is visited as an HnElement, followed by text and a <p>."""
        text = """\
<h1>An H1 Header</h1>
<p>This is in the h1 section</p>
"""
        out = self.visit_rule('html', text)
        self.assertFalse(self.visitor.issues)
        h1_elem, text1, p_elem, text2 = out
        self.assertIsInstance(h1_elem, HnElement)
        self.assertEqual(text_type(h1_elem), '<h1>An H1 Header</h1>')
        self.assertIsInstance(text1, HTMLText)
        self.assertEqual(text_type(text1), '')
        self.assertIsInstance(p_elem, HTMLElement)
        self.assertEqual(
            text_type(p_elem), '<p>This is in the h1 section</p>')
        self.assertIsInstance(text2, HTMLText)
        self.assertEqual(text_type(text2), '')

    def test_double_quoted_text(self):
        """Surrounding double quotes are dropped from a 'text' value."""
        out = self.visit_rule('text', '"text"')
        self.assertEqual(['text'], out)

    def test_double_quoted_escaped_text(self):
        """Backslash-escaped double quotes come back unescaped."""
        out = self.visit_rule('text', '"the \\"text\\"."')
        self.assertEqual(['the "text".'], out)

    def test_single_quoted_escaped_text(self):
        """A backslash-escaped single quote comes back unescaped."""
        out = self.visit_rule('text', "'I don\\'t like escaped text'")
        self.assertEqual(["I don't like escaped text"], out)