def test_links():
    '''Check that link targets only reach the output when links are requested.

    Covers the low-level handlers, an in-memory document and the
    ``http_sample.html`` fixture, in text and XML output.
    '''
    # keyword arguments shared by every extraction expected to keep links
    link_options = {'include_links': True, 'no_fallback': True, 'config': ZERO_CONFIG}

    # handle_textelem is expected to return None for an empty <ref> element
    assert handle_textelem(etree.Element('ref'), [], False, DEFAULT_CONFIG) is None
    # an anchor carrying text must survive the formatting handler
    anchor = html.fromstring('<a href="testlink.html">Test link text.</a>')
    assert handle_formatting(anchor) is not None

    # in-memory document: the target is absent by default, present on request
    mydoc = html.fromstring('<html><body><p><a href="testlink.html">Test link text.</a></p></body></html>')
    assert 'testlink.html' not in extract(mydoc)
    assert 'testlink.html' in extract(mydoc, **link_options)

    # fixture file: read with an explicit encoding so that the decoding does
    # not depend on the locale of the machine running the tests
    resources_dir = os.path.join(TEST_DIR, 'resources')
    with open(os.path.join(resources_dir, 'http_sample.html'), encoding='utf-8') as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    # text output is expected to use the [text](target) notation
    assert '[link](testlink.html)' in extract(teststring, **link_options)
    # XML output is expected to use a <ref> element with a target attribute
    assert '<ref target="testlink.html">link</ref>' in extract(teststring, output_format='xml', **link_options)
def test_links():
    '''Check link extraction: empty links, links with and without target, license links.

    Uses in-memory documents and the ``http_sample.html`` fixture, in text
    and XML output.
    '''
    # keyword arguments shared by every extraction expected to keep links
    link_options = {'include_links': True, 'no_fallback': True, 'config': ZERO_CONFIG}

    # handle_textelem is expected to return None for an empty <ref> element
    assert handle_textelem(etree.Element('ref'), [], False, DEFAULT_CONFIG) is None
    # an anchor carrying text must survive the formatting handler
    anchor = html.fromstring('<a href="testlink.html">Test link text.</a>')
    assert handle_formatting(anchor, dedupbool=False, config=ZERO_CONFIG) is not None

    # empty link
    mydoc = html.fromstring('<html><body><p><a></a><b>Some text.</b></p></body></html>')
    assert extract(mydoc) is not None

    # link with target
    mydoc = html.fromstring('<html><body><p><a href="testlink.html">Test link text.</a> This part of the text has to be long enough.</p></body></html>')
    assert 'testlink.html' not in extract(mydoc)
    assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(mydoc, **link_options)

    # link without target
    mydoc = html.fromstring('<html><body><p><a>Test link text.</a> This part of the text has to be long enough.</p></body></html>')
    assert '[Test link text.] This part of the text has to be long enough.' in extract(mydoc, **link_options)

    # links outside of paragraphs and inside headings
    mydoc = html.fromstring('<html><body><article><a>Segment 1</a><h1><a>Segment 2</a></h1><p>Segment 3</p></article></body></html>')
    result = extract(mydoc, output_format='xml', **link_options)
    # NOTE(review): single digits are a loose check and could match other
    # parts of the XML output; consider asserting on 'Segment N' once the
    # exact expected serialization is confirmed
    assert '1' in result and '2' in result and '3' in result

    # fixture file: read with an explicit encoding so that the decoding does
    # not depend on the locale of the machine running the tests
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), encoding='utf-8') as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(teststring, **link_options)
    assert '<ref target="testlink.html">link</ref>' in extract(teststring, output_format='xml', **link_options)

    # test license link
    mydoc = html.fromstring('<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>')
    assert 'license="CC BY-SA license"' in extract(mydoc, output_format='xml', **link_options)
def test_formatting():
    '''Verify how inline HTML formatting is converted during extraction.'''
    # keyword arguments for XML extraction with formatting preserved
    xml_formatting = {'output_format': 'xml', 'include_formatting': True, 'config': ZERO_CONFIG}

    # plain bold text
    tree = html.fromstring('<html><body><p><b>This here is in bold font.</b></p></body></html>')
    xml_output = extract(tree, **xml_formatting)
    assert '<hi rend="#b">This here is in bold font.</hi>' in xml_output

    # italic nested in bold: only the outer markup is expected to remain
    tree = html.fromstring('<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>')
    xml_output = extract(tree, **xml_formatting)
    assert '<hi rend="#b">This here is in bold and italic font.</hi>' in xml_output

    # formatting tags without any text yield an empty <main/>
    tree = html.fromstring('<html><body><p><b><i></i></b></p></body></html>')
    xml_output = extract(tree, **xml_formatting)
    assert '<main/>' in xml_output

    # formatted text directly inside a <div>
    tree = html.fromstring('<html><body><article><div><strong>Wild text</strong></div></article></body></html>')
    xml_output = extract(tree, **xml_formatting)
    assert '<p>' in xml_output
    assert '<hi>Wild text</hi>' in xml_output  # no rend so far
    text_output = extract(tree, config=ZERO_CONFIG)
    assert text_output == 'Wild text'

    # link text is kept when links are not requested
    tree = html.fromstring('<html><body><p><a href="">Link text</a></p></body></html>')
    text_output = extract(tree, config=ZERO_CONFIG)
    assert text_output == 'Link text'

    # a lone line break produces no text
    tree = html.fromstring('<html><body><p><br/></p></body></html>')
    text_output = extract(tree, config=ZERO_CONFIG)
    assert text_output == ''
    # a leading line break does not alter the text that follows
    tree = html.fromstring('<html><body><p><br/>Here is the text.</p></body></html>')
    text_output = extract(tree, config=ZERO_CONFIG)
    assert text_output == 'Here is the text.'

    # a formatting element with a tail is expected to be wrapped in a <p>
    hi_elem = etree.Element('hi')
    hi_elem.text = 'Here is the text.'
    hi_elem.tail = 'And a tail.'
    wrapped = handle_formatting(hi_elem)
    serialized = etree.tostring(wrapped)
    assert serialized == b'<p><hi>Here is the text.</hi>And a tail.</p>'
def test_links():
    '''Check link extraction for links with and without target and license links.

    Uses in-memory documents and the ``http_sample.html`` fixture, in text
    and XML output.
    '''
    # keyword arguments shared by every extraction expected to keep links
    link_options = {'include_links': True, 'no_fallback': True, 'config': ZERO_CONFIG}

    # handle_textelem is expected to return None for an empty <ref> element
    assert handle_textelem(etree.Element('ref'), [], False, DEFAULT_CONFIG) is None
    # an anchor carrying text must survive the formatting handler
    anchor = html.fromstring('<a href="testlink.html">Test link text.</a>')
    assert handle_formatting(anchor, dedupbool=False, config=ZERO_CONFIG) is not None

    # link with target
    mydoc = html.fromstring('<html><body><p><a href="testlink.html">Test link text.</a></p></body></html>')
    assert 'testlink.html' not in extract(mydoc)
    assert '[Test link text.](testlink.html)' in extract(mydoc, **link_options)

    # link without target
    mydoc = html.fromstring('<html><body><p><a>Test link text.</a></p></body></html>')
    assert '[Test link text.]' in extract(mydoc, **link_options)

    # fixture file: read with an explicit encoding so that the decoding does
    # not depend on the locale of the machine running the tests
    resources_dir = os.path.join(TEST_DIR, 'resources')
    with open(os.path.join(resources_dir, 'http_sample.html'), encoding='utf-8') as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(teststring, **link_options)
    assert '<ref target="testlink.html">link</ref>' in extract(teststring, output_format='xml', **link_options)

    # test license link
    mydoc = html.fromstring('<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>')
    assert 'license="CC BY-SA license"' in extract(mydoc, output_format='xml', **link_options)
def test_formatting():
    '''Verify how inline HTML formatting is rendered in XML and text output.'''
    # keyword arguments for XML extraction with formatting preserved
    xml_formatting = {'output_format': 'xml', 'include_formatting': True, 'config': ZERO_CONFIG}

    # plain bold text
    tree = html.fromstring('<html><body><p><b>This here is in bold font.</b></p></body></html>')
    output = extract(tree, **xml_formatting)
    assert '<hi rend="#b">This here is in bold font.</hi>' in output

    # headings and bold text rendered as markup in text output
    tree = html.fromstring('<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b></p></article></body></html>')
    output = extract(tree, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
    assert output == '=== Title ===\n**This here is in bold font.**'

    # italic nested in bold: only the outer markup is expected to remain
    tree = html.fromstring('<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>')
    output = extract(tree, **xml_formatting)
    assert '<hi rend="#b">This here is in bold and italic font.</hi>' in output

    # formatting tags without any text yield an empty <main/>
    tree = html.fromstring('<html><body><p><b><i></i></b></p></body></html>')
    output = extract(tree, **xml_formatting)
    assert '<main/>' in output

    # formatted text directly inside a <div>
    tree = html.fromstring('<html><body><article><div><strong>Wild text</strong></div></article></body></html>')
    output = extract(tree, **xml_formatting)
    assert '<p>' in output
    assert '<hi rend="#b">Wild text</hi>' in output
    output = extract(tree, config=ZERO_CONFIG)
    assert output == 'Wild text'

    # link text is kept when links are not requested
    tree = html.fromstring('<html><body><p><a href="">Link text</a></p></body></html>')
    output = extract(tree, config=ZERO_CONFIG)
    assert output == 'Link text'

    # a lone line break produces no text
    tree = html.fromstring('<html><body><p><br/></p></body></html>')
    output = extract(tree, config=ZERO_CONFIG)
    assert output == ''
    # a leading line break does not alter the text that follows
    tree = html.fromstring('<html><body><p><br/>Here is the text.</p></body></html>')
    output = extract(tree, config=ZERO_CONFIG)
    assert output == 'Here is the text.'

    # a formatting element with a tail is expected to be wrapped in a <p>
    hi_elem = etree.Element('hi')
    hi_elem.text = 'Here is the text.'
    hi_elem.tail = 'And a tail.'
    wrapped = handle_formatting(hi_elem, dedupbool=False, config=ZERO_CONFIG)
    serialized = etree.tostring(wrapped)
    assert serialized == b'<p><hi>Here is the text.</hi>And a tail.</p>'

    # whitespace-only elements are left out of the result
    tree = html.fromstring('<html><body><div>\t\n</div><div>There is text here.</div></body></html>')
    output = extract(tree, output_format='xml', config=ZERO_CONFIG)
    assert '<main>\n <p>There is text here.</p>\n </main>' in output

    # a link inside a list item
    tree = html.fromstring('<html><body><article><ul><li>Number 1</li><li>Number <a href="test.html">2</a></li><li>Number 3</li><p>Test</p></article></body></html>')
    output = extract(tree, output_format='xml', include_links=True, config=ZERO_CONFIG)
    assert '<item>Number <ref target="test.html">2</ref></item>' in output

    # XML and Markdown rendering of several inline styles within one paragraph
    tree = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a>.</p></body></html>')
    # XML, formatting only: the link is reduced to its text
    output = extract(tree, output_format='xml', no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
    assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, link.</p>' in output
    assert all(marker in output for marker in ('rend="#b"', 'rend="#i"', 'rend="#t"', 'rend="#u"', '<del>'))
    # XML, formatting and links: the link becomes a <ref> element
    output = extract(tree, output_format='xml', include_formatting=True, include_links=True, no_fallback=True, config=ZERO_CONFIG)
    assert all(marker in output for marker in ('<hi rend="#t">tt</hi>', '<del>deleted</del>', '<ref target="test.html">link</ref>.'))
    assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, <ref target="test.html">link</ref>.</p>' in output
    # text output with formatting
    output = extract(tree, output_format='txt', no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
    assert all(marker in output for marker in ('**bold**', '*italics*', '`tt`', '~~deleted~~', '__underlined__'))
    assert output == '**bold**, *italics*, `tt`,\n~~deleted~~, __underlined__, link.'

    # TODO: nested <p> elements are not covered yet, e.g.
    # '<html><body><p>AAA, <p>BBB</p>, CCC.</p></body></html>';
    # could be solved by keeping the elements instead of reconstructing them

    # a line break directly after a formatted segment
    tree = html.fromstring('<html><body><article><p><strong>Staff Review of the Financial Situation</strong><br>Domestic financial conditions remained accommodative over the intermeeting period.</p></article></body></html>')
    output = extract(tree, output_format='txt', no_fallback=True, config=ZERO_CONFIG)
    assert output == 'Staff Review of the Financial Situation\nDomestic financial conditions remained accommodative over the intermeeting period.'

    # inline code inside a heading and inside a paragraph
    tree = html.fromstring('<html><body><article><h4 id="1theinoperator">1) The <code>in</code> Operator</h4><p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p></article></body></html>')
    output = extract(tree, output_format='xml', no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
    expected_head = '<head rend="h4">1) The <code>in</code> Operator</head>'
    expected_paragraph = '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>'
    assert expected_head in output
    assert expected_paragraph in output
def test_formatting():
    '''Verify how inline HTML formatting is converted during extraction.

    Covers bold and nested markup, empty elements, line breaks, formatting
    tails, links inside lists and several inline styles within a paragraph.
    '''
    # keyword arguments for XML extraction with formatting preserved
    xml_formatting = {'output_format': 'xml', 'include_formatting': True, 'config': ZERO_CONFIG}

    # simple
    my_document = html.fromstring('<html><body><p><b>This here is in bold font.</b></p></body></html>')
    my_result = extract(my_document, **xml_formatting)
    assert '<hi rend="#b">This here is in bold font.</hi>' in my_result

    # nested: only the outer markup is expected to remain
    my_document = html.fromstring('<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>')
    my_result = extract(my_document, **xml_formatting)
    assert '<hi rend="#b">This here is in bold and italic font.</hi>' in my_result

    # empty: formatting tags without text yield an empty <main/>
    my_document = html.fromstring('<html><body><p><b><i></i></b></p></body></html>')
    my_result = extract(my_document, **xml_formatting)
    assert '<main/>' in my_result

    # wild div: formatted text directly inside a <div>
    my_document = html.fromstring('<html><body><article><div><strong>Wild text</strong></div></article></body></html>')
    my_result = extract(my_document, **xml_formatting)
    assert '<p>' in my_result and '<hi>Wild text</hi>' in my_result  # no rend so far
    my_result = extract(my_document, config=ZERO_CONFIG)
    assert my_result == 'Wild text'

    # links: the text is kept when links are not requested
    doc = html.fromstring('<html><body><p><a href="">Link text</a></p></body></html>')
    my_result = extract(doc, config=ZERO_CONFIG)
    assert my_result == 'Link text'

    # line-breaks
    doc = html.fromstring('<html><body><p><br/></p></body></html>')
    my_result = extract(doc, config=ZERO_CONFIG)
    assert my_result == ''
    doc = html.fromstring('<html><body><p><br/>Here is the text.</p></body></html>')
    my_result = extract(doc, config=ZERO_CONFIG)
    assert my_result == 'Here is the text.'

    # handle formatting tails: the element is expected to be wrapped in a <p>
    element = etree.Element("hi")
    element.text = 'Here is the text.'
    element.tail = 'And a tail.'
    converted = handle_formatting(element)
    assert etree.tostring(converted) == b'<p><hi>Here is the text.</hi>And a tail.</p>'

    # empty elements: whitespace-only elements are left out of the result
    my_document = html.fromstring('<html><body><div>\t\n</div><div>There is text here.</div></body></html>')
    my_result = extract(my_document, output_format='xml', config=ZERO_CONFIG)
    assert '<main>\n <p>There is text here.</p>\n </main>' in my_result

    # lists with links
    my_document = html.fromstring('<html><body><article><ul><li>Number 1</li><li>Number <a href="test.html">2</a></li><li>Number 3</li><p>Test</p></article></body></html>')
    my_result = extract(my_document, output_format='xml', include_links=True, config=ZERO_CONFIG)
    assert '<item>Number <ref target="test.html">2</ref></item>' in my_result

    # formatting within <p>-tag
    # TODO: also cover output_format='txt' (markdown) once the <del>-element
    # is handled there
    my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a>.</p></body></html>')
    my_result = extract(my_document, output_format='xml', include_formatting=True, include_links=True, no_fallback=True, config=ZERO_CONFIG)
    assert '<hi rend="#t">tt</hi>' in my_result and '<del>deleted</del>' in my_result and '<ref target="test.html">link</ref>.' in my_result