def test_exotic_tags(xmloutput=False):
    """Cover extraction edge cases using a crafted page and hand-built elements.

    Args:
        xmloutput: forwarded to ``load_mock_page`` as ``xml_flag``.
    """
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xml_flag=xmloutput, tei_output=True)
    assert 'Teletype text' in result and 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    with open(filepath) as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>')
    # malformed HTML declaration
    htmlstring = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd"><html><head></head><body><p>ABC</p></body></html>'
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring, config=ZERO_CONFIG)
    # quotes: an empty <quote> element yields nothing
    assert handle_quotes(etree.Element('quote'), False, ZERO_CONFIG) is None
    # NOTE(review): handle_table is called with three arguments here; confirm
    # this matches the signature of the handle_table under test.
    assert handle_table(etree.Element('table'), False, ZERO_CONFIG) is None
    # p within p: the expected output joins both texts in a single <p>
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # delete last <lb>: the <lb> has to be in the tree *before* the conversion,
    # otherwise the assertion only re-checks the previous result. A fresh input
    # element is built so the first conversion cannot influence this one.
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    element.append(etree.Element('lb'))
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # malformed lists (common error): text placed directly inside <list>
    result = etree.tostring(handle_lists(etree.fromstring(
        '<list>Description of the list:<item>List item 1</item><item>List item 2</item><item>List item 3</item></list>'),
        False, ZERO_CONFIG))
    assert result.count(b'List item') == 3
    assert b"Description" in result
def test_exotic_tags(xmloutput=False):
    """Cover extraction edge cases using crafted documents and hand-built elements.

    Args:
        xmloutput: forwarded to ``load_mock_page`` as ``xml_flag``.
    """
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xml_flag=xmloutput, tei_output=True)
    assert 'Teletype text' in result and 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    with open(filepath) as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    )
    # malformed HTML declaration
    htmlstring = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd"><html><head></head><body><p>ABC</p></body></html>'
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring, config=ZERO_CONFIG)
    # quotes and tables: empty elements yield nothing
    assert handle_quotes(etree.Element('quote'), False, ZERO_CONFIG) is None
    assert handle_table(etree.Element('table'), TAG_CATALOG, False, ZERO_CONFIG) is None
    # p within p, followed by a trailing <lb> that must not show up in the output
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    # delete last <lb>
    element.append(etree.Element('lb'))
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # malformed lists (common error): text placed directly inside <list>
    result = etree.tostring(
        handle_lists(
            etree.fromstring(
                '<list>Description of the list:<item>List item 1</item><item>List item 2</item><item>List item 3</item></list>'
            ),
            False,
            ZERO_CONFIG,
        )
    )
    assert result.count(b'List item') == 3
    assert b"Description" in result
    # HTML5: <details> -- same expectations with and without fallback
    htmlstring = '<html><body><article><details><summary>Epcot Center</summary><p>Epcot is a theme park at Walt Disney World Resort featuring exciting attractions, international pavilions, award-winning fireworks and seasonal special events.</p></details></article></body></html>'
    my_result = extract(htmlstring, no_fallback=True, config=ZERO_CONFIG)
    assert 'Epcot Center' in my_result and 'award-winning fireworks' in my_result
    my_result = extract(htmlstring, no_fallback=False, config=ZERO_CONFIG)
    assert 'Epcot Center' in my_result and 'award-winning fireworks' in my_result
    # tables with nested elements
    htmlstring = (
        '<html><body><article> <table> '
        '<tr><td><b>Present Tense</b></td> '
        '<td>I buy</td> '
        '<td>you buy</td> '
        '<td>he/she/it buys</td> '
        '<td>we buy</td> '
        '<td>you buy</td> '
        '<td>they buy</td> '
        '</tr> </table></article></body></html>'
    )
    my_result = extract(
        htmlstring,
        no_fallback=True,
        output_format='xml',
        include_formatting=True,
        config=ZERO_CONFIG,
    )
    expected_row = (
        '<row> <cell> <hi>Present Tense</hi> </cell> '
        '<cell>I buy</cell> '
        '<cell>you buy</cell> '
        '<cell>he/she/it buys</cell> '
        '<cell>we buy</cell> '
        '<cell>you buy</cell> '
        '<cell>they buy</cell> </row>'
    )
    # Collapse every whitespace run in the output to a single space before
    # comparing, so the check pins element order and content without depending
    # on the line breaks and indentation chosen by the XML serializer.
    assert expected_row in ' '.join(my_result.split())