Python examples of trafilatura.core.handle_formatting

Example #1

0

Show file

File: unit_tests.py Project: vkuberan/trafilatura

def test_links():
    '''Test link extraction function'''
    # an empty <ref> element is expected to yield nothing
    assert handle_textelem(etree.Element('ref'), [], False,
                           DEFAULT_CONFIG) is None
    assert handle_formatting(
        html.fromstring(
            '<a href="testlink.html">Test link text.</a>')) is not None
    mydoc = html.fromstring(
        '<html><body><p><a href="testlink.html">Test link text.</a></p></body></html>'
    )
    # the link target only shows up when include_links is requested
    assert 'testlink.html' not in extract(mydoc)
    assert 'testlink.html' in extract(mydoc,
                                      include_links=True,
                                      no_fallback=True,
                                      config=ZERO_CONFIG)
    resources_dir = os.path.join(TEST_DIR, 'resources')
    # explicit encoding: otherwise decoding depends on the platform locale
    # NOTE(review): assumes the sample file is UTF-8 -- confirm
    with open(os.path.join(resources_dir, 'http_sample.html'),
              encoding='utf-8') as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    # without an explicit output format the link is rendered as [text](target)
    assert '[link](testlink.html)' in extract(teststring,
                                              include_links=True,
                                              no_fallback=True,
                                              config=ZERO_CONFIG)
    # in XML output the link becomes a <ref> element with a target attribute
    assert '<ref target="testlink.html">link</ref>' in extract(
        teststring,
        include_links=True,
        no_fallback=True,
        output_format='xml',
        config=ZERO_CONFIG)

Example #2

0

Show file

def test_links():
    '''Test link extraction function'''
    # an empty <ref> element is expected to yield nothing
    assert handle_textelem(etree.Element('ref'), [], False, DEFAULT_CONFIG) is None
    assert handle_formatting(html.fromstring('<a href="testlink.html">Test link text.</a>'), dedupbool=False, config=ZERO_CONFIG) is not None
    # empty link
    mydoc = html.fromstring('<html><body><p><a></a><b>Some text.</b></p></body></html>')
    assert extract(mydoc) is not None
    # link with target: only rendered when include_links is requested
    mydoc = html.fromstring('<html><body><p><a href="testlink.html">Test link text.</a> This part of the text has to be long enough.</p></body></html>')
    assert 'testlink.html' not in extract(mydoc)
    assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
    # link without target
    mydoc = html.fromstring('<html><body><p><a>Test link text.</a> This part of the text has to be long enough.</p></body></html>')
    assert '[Test link text.] This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
    # links in different positions: all three segments have to survive
    mydoc = html.fromstring('<html><body><article><a>Segment 1</a><h1><a>Segment 2</a></h1><p>Segment 3</p></article></body></html>')
    result = extract(mydoc, output_format='xml', include_links=True, no_fallback=True, config=ZERO_CONFIG)
    assert '1' in result and '2' in result and '3' in result
    # explicit encoding: otherwise decoding depends on the platform locale
    # NOTE(review): assumes the sample file is UTF-8 -- confirm
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), encoding='utf-8') as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(teststring, include_links=True, no_fallback=True, config=ZERO_CONFIG)
    assert '<ref target="testlink.html">link</ref>' in extract(teststring, include_links=True, no_fallback=True, output_format='xml', config=ZERO_CONFIG)
    # test license link: its text is expected in a license attribute of the XML output
    mydoc = html.fromstring('<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>')
    assert 'license="CC BY-SA license"' in extract(mydoc, include_links=True, no_fallback=True, output_format='xml', config=ZERO_CONFIG)

Example #3

0

Show file

File: unit_tests.py Project: vkuberan/trafilatura

def test_formatting():
    '''Check how inline HTML formatting is converted and extracted'''
    # keyword sets shared by the extract() calls below
    xml_formatted = dict(output_format='xml',
                         include_formatting=True,
                         config=ZERO_CONFIG)
    plain = dict(config=ZERO_CONFIG)

    # a single bold element
    tree = html.fromstring(
        '<html><body><p><b>This here is in bold font.</b></p></body></html>')
    xml_output = extract(tree, **xml_formatted)
    assert '<hi rend="#b">This here is in bold font.</hi>' in xml_output

    # italics nested in bold: only the outer markup is expected
    tree = html.fromstring(
        '<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>'
    )
    xml_output = extract(tree, **xml_formatted)
    assert '<hi rend="#b">This here is in bold and italic font.</hi>' in xml_output

    # formatting elements without any text leave an empty <main/>
    tree = html.fromstring('<html><body><p><b><i></i></b></p></body></html>')
    xml_output = extract(tree, **xml_formatted)
    assert '<main/>' in xml_output

    # <strong> directly inside a <div>: wrapped in <p>, <hi> without rend
    tree = html.fromstring(
        '<html><body><article><div><strong>Wild text</strong></div></article></body></html>'
    )
    xml_output = extract(tree, **xml_formatted)
    assert '<p>' in xml_output
    assert '<hi>Wild text</hi>' in xml_output
    text_output = extract(tree, **plain)
    assert text_output == 'Wild text'

    # link text is kept in plain output
    tree = html.fromstring(
        '<html><body><p><a href="">Link text</a></p></body></html>')
    text_output = extract(tree, **plain)
    assert text_output == 'Link text'

    # a lone line-break yields no text at all
    tree = html.fromstring('<html><body><p><br/></p></body></html>')
    text_output = extract(tree, **plain)
    assert text_output == ''
    # a leading line-break does not affect the following text
    tree = html.fromstring(
        '<html><body><p><br/>Here is the text.</p></body></html>')
    text_output = extract(tree, **plain)
    assert text_output == 'Here is the text.'

    # tail text of a formatting element ends up next to it inside a <p>
    node = etree.Element("hi")
    node.text = 'Here is the text.'
    node.tail = 'And a tail.'
    serialized = etree.tostring(handle_formatting(node))
    assert serialized == b'<p><hi>Here is the text.</hi>And a tail.</p>'

Example #4

0

Show file

def test_links():
    '''Test link extraction function'''
    # an empty <ref> element is expected to yield nothing
    assert handle_textelem(etree.Element('ref'), [], False,
                           DEFAULT_CONFIG) is None
    assert handle_formatting(
        html.fromstring('<a href="testlink.html">Test link text.</a>'),
        dedupbool=False,
        config=ZERO_CONFIG) is not None
    # link with target: only rendered when include_links is requested
    mydoc = html.fromstring(
        '<html><body><p><a href="testlink.html">Test link text.</a></p></body></html>'
    )
    assert 'testlink.html' not in extract(mydoc)
    assert '[Test link text.](testlink.html)' in extract(mydoc,
                                                         include_links=True,
                                                         no_fallback=True,
                                                         config=ZERO_CONFIG)
    # link without target
    mydoc = html.fromstring(
        '<html><body><p><a>Test link text.</a></p></body></html>')
    assert '[Test link text.]' in extract(mydoc,
                                          include_links=True,
                                          no_fallback=True,
                                          config=ZERO_CONFIG)
    resources_dir = os.path.join(TEST_DIR, 'resources')
    # explicit encoding: otherwise decoding depends on the platform locale
    # NOTE(review): assumes the sample file is UTF-8 -- confirm
    with open(os.path.join(resources_dir, 'http_sample.html'),
              encoding='utf-8') as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(teststring,
                                              include_links=True,
                                              no_fallback=True,
                                              config=ZERO_CONFIG)
    assert '<ref target="testlink.html">link</ref>' in extract(
        teststring,
        include_links=True,
        no_fallback=True,
        output_format='xml',
        config=ZERO_CONFIG)
    # test license link: its text is expected in a license attribute of the XML output
    mydoc = html.fromstring(
        '<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>'
    )
    assert 'license="CC BY-SA license"' in extract(mydoc,
                                                   include_links=True,
                                                   no_fallback=True,
                                                   output_format='xml',
                                                   config=ZERO_CONFIG)

Example #5

0

Show file

File: unit_tests.py Project: vbarbaresi/trafilatura

def test_formatting():
    '''Check how inline HTML formatting is converted and extracted'''
    # keyword sets shared by several extract() calls below
    plain = dict(config=ZERO_CONFIG)
    xml_formatted = dict(output_format='xml',
                         include_formatting=True,
                         config=ZERO_CONFIG)
    xml_formatted_strict = dict(xml_formatted, no_fallback=True)
    txt_formatted_strict = dict(output_format='txt',
                                no_fallback=True,
                                include_formatting=True,
                                config=ZERO_CONFIG)

    # a single bold element
    tree = html.fromstring(
        '<html><body><p><b>This here is in bold font.</b></p></body></html>')
    output = extract(tree, **xml_formatted)
    assert '<hi rend="#b">This here is in bold font.</hi>' in output

    # headings and bold text rendered as markup in text output
    tree = html.fromstring(
        '<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b></p></article></body></html>'
    )
    output = extract(tree,
                     output_format='txt',
                     include_formatting=True,
                     config=ZERO_CONFIG)
    assert output == '=== Title ===\n**This here is in bold font.**'

    # italics nested in bold: only the outer markup is expected
    tree = html.fromstring(
        '<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>'
    )
    output = extract(tree, **xml_formatted)
    assert '<hi rend="#b">This here is in bold and italic font.</hi>' in output

    # formatting elements without any text leave an empty <main/>
    tree = html.fromstring('<html><body><p><b><i></i></b></p></body></html>')
    output = extract(tree, **xml_formatted)
    assert '<main/>' in output

    # <strong> directly inside a <div>: wrapped in <p> and marked as bold
    tree = html.fromstring(
        '<html><body><article><div><strong>Wild text</strong></div></article></body></html>'
    )
    output = extract(tree, **xml_formatted)
    assert '<p>' in output
    assert '<hi rend="#b">Wild text</hi>' in output
    output = extract(tree, **plain)
    assert output == 'Wild text'

    # link text is kept in plain output
    tree = html.fromstring(
        '<html><body><p><a href="">Link text</a></p></body></html>')
    output = extract(tree, **plain)
    assert output == 'Link text'

    # a lone line-break yields no text at all
    tree = html.fromstring('<html><body><p><br/></p></body></html>')
    output = extract(tree, **plain)
    assert output == ''
    # a leading line-break does not affect the following text
    tree = html.fromstring(
        '<html><body><p><br/>Here is the text.</p></body></html>')
    output = extract(tree, **plain)
    assert output == 'Here is the text.'

    # tail text of a formatting element ends up next to it inside a <p>
    node = etree.Element("hi")
    node.text = 'Here is the text.'
    node.tail = 'And a tail.'
    serialized = etree.tostring(
        handle_formatting(node, dedupbool=False, config=ZERO_CONFIG))
    assert serialized == b'<p><hi>Here is the text.</hi>And a tail.</p>'

    # whitespace-only elements are dropped
    tree = html.fromstring(
        '<html><body><div>\t\n</div><div>There is text here.</div></body></html>'
    )
    output = extract(tree, output_format='xml', config=ZERO_CONFIG)
    assert '<main>\n    <p>There is text here.</p>\n  </main>' in output

    # a link inside a list item is kept within the item
    tree = html.fromstring(
        '<html><body><article><ul><li>Number 1</li><li>Number <a href="test.html">2</a></li><li>Number 3</li><p>Test</p></article></body></html>'
    )
    output = extract(tree,
                     output_format='xml',
                     include_links=True,
                     config=ZERO_CONFIG)
    assert '<item>Number <ref target="test.html">2</ref></item>' in output

    # several kinds of inline markup within one <p>, as XML and as text
    tree = html.fromstring(
        '<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a>.</p></body></html>'
    )
    # XML with formatting but without links: the link is reduced to its text
    output = extract(tree, **xml_formatted_strict)
    assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, link.</p>' in output
    assert all(marker in output for marker in (
        'rend="#b"', 'rend="#i"', 'rend="#t"', 'rend="#u"', '<del>'))
    # XML with formatting and links: the link becomes a <ref> element
    output = extract(tree, include_links=True, **xml_formatted_strict)
    assert all(snippet in output for snippet in (
        '<hi rend="#t">tt</hi>',
        '<del>deleted</del>',
        '<ref target="test.html">link</ref>.'))
    assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, <ref target="test.html">link</ref>.</p>' in output
    # text output with formatting
    output = extract(tree, **txt_formatted_strict)
    assert all(snippet in output for snippet in (
        '**bold**', '*italics*', '`tt`', '~~deleted~~', '__underlined__'))
    assert output == '**bold**, *italics*, `tt`,\n~~deleted~~, __underlined__, link.'

    # Known limitation, not asserted here: a <p> nested in another <p>, e.g.
    # <p>AAA, <p>BBB</p>, CCC.</p>; it could be solved by keeping the
    # elements instead of reconstructing them.

    # a line-break right after a formatting element starts a new line
    tree = html.fromstring(
        '<html><body><article><p><strong>Staff Review of the Financial Situation</strong><br>Domestic financial conditions remained accommodative over the intermeeting period.</p></article></body></html>'
    )
    output = extract(tree,
                     output_format='txt',
                     no_fallback=True,
                     config=ZERO_CONFIG)
    assert output == 'Staff Review of the Financial Situation\nDomestic financial conditions remained accommodative over the intermeeting period.'

    # inline <code> is preserved in the heading as well as in the paragraph
    tree = html.fromstring(
        '<html><body><article><h4 id="1theinoperator">1) The <code>in</code> Operator</h4><p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p></article></body></html>'
    )
    output = extract(tree, **xml_formatted_strict)
    assert '<head rend="h4">1) The <code>in</code> Operator</head>' in output
    assert '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in output

Example #6

0

Show file

def test_formatting():
    '''Test HTML formatting conversion and extraction'''
    # simple: a single bold element is expected as <hi rend="#b">
    my_document = html.fromstring(
        '<html><body><p><b>This here is in bold font.</b></p></body></html>')
    my_result = extract(my_document,
                        output_format='xml',
                        include_formatting=True,
                        config=ZERO_CONFIG)
    # NOTE(review): debugging output, not needed by the assertion below
    print(my_result)
    assert '<hi rend="#b">This here is in bold font.</hi>' in my_result
    # nested: italics inside bold, only the outer markup is expected
    my_document = html.fromstring(
        '<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>'
    )
    my_result = extract(my_document,
                        output_format='xml',
                        include_formatting=True,
                        config=ZERO_CONFIG)
    assert '<hi rend="#b">This here is in bold and italic font.</hi>' in my_result
    # empty: formatting elements without text leave an empty <main/>
    my_document = html.fromstring(
        '<html><body><p><b><i></i></b></p></body></html>')
    my_result = extract(my_document,
                        output_format='xml',
                        include_formatting=True,
                        config=ZERO_CONFIG)
    assert '<main/>' in my_result
    # wild div: <strong> directly inside a <div>
    my_document = html.fromstring(
        '<html><body><article><div><strong>Wild text</strong></div></article></body></html>'
    )
    my_result = extract(my_document,
                        output_format='xml',
                        include_formatting=True,
                        config=ZERO_CONFIG)
    assert '<p>' in my_result and '<hi>Wild text</hi>' in my_result  # no rend so far
    my_result = extract(my_document, config=ZERO_CONFIG)
    assert my_result == 'Wild text'
    # links: the link text is kept in plain output
    doc = html.fromstring(
        '<html><body><p><a href="">Link text</a></p></body></html>')
    my_result = extract(doc, config=ZERO_CONFIG)
    assert my_result == 'Link text'
    # line-breaks: a lone <br/> yields an empty result
    doc = html.fromstring('<html><body><p><br/></p></body></html>')
    my_result = extract(doc, config=ZERO_CONFIG)
    assert my_result == ''
    # a leading <br/> does not affect the following text
    doc = html.fromstring(
        '<html><body><p><br/>Here is the text.</p></body></html>')
    my_result = extract(doc, config=ZERO_CONFIG)
    assert my_result == 'Here is the text.'
    # handle formatting tails: the tail text ends up next to the element inside a <p>
    element = etree.Element("hi")
    element.text = 'Here is the text.'
    element.tail = 'And a tail.'
    converted = handle_formatting(element)
    assert etree.tostring(
        converted) == b'<p><hi>Here is the text.</hi>And a tail.</p>'
    # empty elements: the whitespace-only <div> is dropped
    my_document = html.fromstring(
        '<html><body><div>\t\n</div><div>There is text here.</div></body></html>'
    )
    my_result = extract(my_document, output_format='xml', config=ZERO_CONFIG)
    assert '<main>\n    <p>There is text here.</p>\n  </main>' in my_result
    # lists with links: the link is kept inside its list item
    my_document = html.fromstring(
        '<html><body><article><ul><li>Number 1</li><li>Number <a href="test.html">2</a></li><li>Number 3</li><p>Test</p></article></body></html>'
    )
    my_result = extract(my_document,
                        output_format='xml',
                        include_links=True,
                        config=ZERO_CONFIG)
    assert '<item>Number <ref target="test.html">2</ref></item>' in my_result
    # (markdown) formatting within <p>-tag
    my_document = html.fromstring(
        '<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a>.</p></body></html>'
    )
    # disabled check of the plain-text output, kept for reference:
    #my_result = extract(my_document, output_format='txt', no_fallback=True, config=ZERO_CONFIG)
    # todo: handling <del>-element
    #print(my_result)
    # only the XML output (formatting and links included) is verified here
    my_result = extract(my_document,
                        output_format='xml',
                        include_formatting=True,
                        include_links=True,
                        no_fallback=True,
                        config=ZERO_CONFIG)
    assert '<hi rend="#t">tt</hi>' in my_result and '<del>deleted</del>' in my_result and '<ref target="test.html">link</ref>.' in my_result