Beispiel #1
0
def test_exotic_tags(xmloutput=False):
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags',
                            xml_flag=xmloutput,
                            tei_output=True)
    assert 'Teletype text' in result and 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    with open(filepath) as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    )
    # misformed HTML declaration
    htmlstring = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd"><html><head></head><body><p>ABC</p></body></html>'
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring, config=ZERO_CONFIG)
    # quotes
    assert handle_quotes(etree.Element('quote')) is None
    assert handle_table(etree.Element('table')) is None
    # p within p
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # delete last <lb>
    third = etree.Element('lb')
    element.append(third)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
Beispiel #2
0
def test_exotic_tags(xmloutput=False):
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags',
                            xml_flag=xmloutput,
                            tei_output=True)
    assert 'Teletype text' in result and 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    with open(filepath) as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    )
    # misformed HTML declaration
    htmlstring = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd"><html><head></head><body><p>ABC</p></body></html>'
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring, config=ZERO_CONFIG)
    # quotes
    assert handle_quotes(etree.Element('quote'), False, ZERO_CONFIG) is None
    assert handle_table(etree.Element('table'), False, ZERO_CONFIG) is None
    # p within p
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    # delete last <lb>
    element.append(etree.Element('lb'))
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # malformed lists (common error)
    result = etree.tostring(
        handle_lists(
            etree.fromstring(
                '<list>Description of the list:<item>List item 1</item><item>List item 2</item><item>List item 3</item></list>'
            ), False, ZERO_CONFIG))
    assert result.count(b'List item') == 3
    assert b"Description" in result
    # HTML5: <details>
    htmlstring = '<html><body><article><details><summary>Epcot Center</summary><p>Epcot is a theme park at Walt Disney World Resort featuring exciting attractions, international pavilions, award-winning fireworks and seasonal special events.</p></details></article></body></html>'
    my_result = extract(htmlstring, no_fallback=True, config=ZERO_CONFIG)
    print(my_result)
    assert 'Epcot Center' in my_result and 'award-winning fireworks' in my_result
    my_result = extract(htmlstring, no_fallback=False, config=ZERO_CONFIG)
    assert 'Epcot Center' in my_result and 'award-winning fireworks' in my_result
Beispiel #3
0
def test_exotic_tags(xmloutput=False):
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xml_flag=xmloutput, tei_output=True)
    assert 'Teletype text' in result and 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'resources', 'exotic_tags_tei.html')
    with open(filepath) as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>')
    # misformed HTML declaration
    htmlstring = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd"><html><head></head><body><p>ABC</p></body></html>'
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring, config=ZERO_CONFIG)
    # quotes
    assert handle_quotes(etree.Element('quote'), False, ZERO_CONFIG) is None
    assert handle_table(etree.Element('table'), TAG_CATALOG, False, ZERO_CONFIG) is None
    # p within p
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    # delete last <lb>
    element.append(etree.Element('lb'))
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # naked div with <lb>
    assert '1.\n2.\n3.' in extract('<html><body><main><div>1.<br/>2.<br/>3.<br/></div></main></body></html>', no_fallback=True, config=ZERO_CONFIG)
    # malformed lists (common error)
    result = etree.tostring(handle_lists(etree.fromstring('<list>Description of the list:<item>List item 1</item><item>List item 2</item><item>List item 3</item></list>'), False, ZERO_CONFIG))
    assert result.count(b'List item') == 3
    assert b"Description" in result
    # HTML5: <details>
    htmlstring = '<html><body><article><details><summary>Epcot Center</summary><p>Epcot is a theme park at Walt Disney World Resort featuring exciting attractions, international pavilions, award-winning fireworks and seasonal special events.</p></details></article></body></html>'
    my_result = extract(htmlstring, no_fallback=True, config=ZERO_CONFIG)
    assert 'Epcot Center' in my_result and 'award-winning fireworks' in my_result
    my_result = extract(htmlstring, no_fallback=False, config=ZERO_CONFIG)
    assert 'Epcot Center' in my_result and 'award-winning fireworks' in my_result
    # tables with nested elements
    htmlstring = '''<html><body><article>
<table>
<tr><td><b>Present Tense</b></td>
<td>I buy</td>
<td>you buy</td>
<td>he/she/it buys</td>
<td>we buy</td>
<td>you buy</td>
<td>they buy</td>
</tr>
    </table></article></body></html>'''
    my_result = extract(htmlstring, no_fallback=True, output_format='xml', include_formatting=True, config=ZERO_CONFIG)
    assert '''<row>
        <cell>
          <hi>Present Tense</hi>
        </cell>
        <cell>I buy</cell>
        <cell>you buy</cell>
        <cell>he/she/it buys</cell>
        <cell>we buy</cell>
        <cell>you buy</cell>
        <cell>they buy</cell>
      </row>''' in my_result
    # nested list
    htmlstring = '''<html><body><article>
<ul>
  <li>Coffee</li>
  <li>Tea
    <ul>
      <li>Black tea</li>
      <li>Green tea</li>
    </ul>
  </li>
  <li>Milk</li>
</ul>
</article></body></html>'''
    my_result = extract(htmlstring, no_fallback=True, output_format='xml', config=ZERO_CONFIG)
    assert '''
    <list>
      <item>Coffee</item>
      <item>
        <item>Tea</item>
        <list>
          <item>Black tea</item>
          <item>Green tea</item>
        </list>
      </item>
      <item>Milk</item>
    </list>''' in my_result
    # table with links
    # todo: further tests and adjustsments
    htmlstring = '<html><body><article><table><tr><td><a href="test.html">' + 'ABCD'*100 + '</a></td></tr></table></article></body></html>'
    result = extract(htmlstring, no_fallback=True, output_format='xml', config=ZERO_CONFIG, include_tables=True, include_links=True)
    assert 'ABCD' not in result
    # nested table
    htmlstring = '<html><body><article><table><th>1</th><table><tr><td>2</td></tr></table></table></article></body></html>'
    result = extract(htmlstring, no_fallback=True, output_format='xml', config=ZERO_CONFIG, include_tables=True)
    # todo: all elements are there, but output not nested
    # todo: th conversion
    assert '<cell>1</cell>' in result and '<cell>2</cell>' in result