Esempio n. 1
0
def test_external():
    """Check sanitize_tree() on small documents and, if enabled, language filtering."""
    # a body holding nothing but a footer: the third returned value must be 0
    footer_doc = html.fromstring(
        '<html><body><footer>Test text</footer></body></html>')
    _, _, footer_len = sanitize_tree(footer_doc)
    assert footer_len == 0

    # a body holding a table: the third returned value must be positive
    table_doc = html.fromstring(
        '<html><body><table><th>Test text</th><tr><td>Test</td></tr></table></body></html>'
    )
    _, _, table_len = sanitize_tree(table_doc)
    assert table_len > 0

    # unknown <fancy> tag plus a link and an image, both options switched off:
    # len() of the returned tree must be exactly 1
    plain_doc = html.fromstring(
        '<html><body><p>Text here <fancy>Test text</fancy><a href="">with a link</a>.</p><img src="test.jpg"/></body></html>'
    )
    plain_tree, _, _ = sanitize_tree(
        plain_doc, include_links=False, include_images=False)
    assert len(plain_tree) == 1

    # same markup (parsed afresh), both options switched on:
    # 'graphic' and 'ref' tags must both occur in the returned tree
    rich_doc = html.fromstring(
        '<html><body><p>Text here <fancy>Test text</fancy><a href="">with a link</a>.</p><img src="test.jpg"/></body></html>'
    )
    rich_tree, _, _ = sanitize_tree(
        rich_doc, include_links=True, include_images=True)
    found_tags = {node.tag for node in rich_tree.iter()}
    assert {'graphic', 'ref'} <= found_tags

    # language check, only run when LANGID_FLAG is exactly True:
    # extract() with target_language='en' must return None for this input
    if LANGID_FLAG is True:
        paragraphs = '<p>Non è inglese.</p>' * 20
        doc = html.fromstring('<html><body>' + paragraphs + '</body></html>')
        outcome = extract(
            doc, no_fallback=False, target_language='en', deduplicate=False)
        assert outcome is None
Esempio n. 2
0
def test_external():
    """Check sanitize_tree() on small documents and call extract() with a target language."""
    # a body holding nothing but a footer: the third returned value must be 0
    footer_doc = html.fromstring('<html><body><footer>Test text</footer></body></html>')
    _, _, footer_len = sanitize_tree(footer_doc)
    assert footer_len == 0

    # paragraph containing an unknown <fancy> tag:
    # len() of the returned tree must be exactly 1
    fancy_doc = html.fromstring('<html><body><p>Text here <fancy>Test text</fancy></p></body></html>')
    cleaned, _, _ = sanitize_tree(fancy_doc)
    assert len(cleaned) == 1

    # justext stoplist: the call is made unconditionally and its return value
    # is not asserted, so this only verifies that extract() does not raise
    paragraphs = '<p>abc</p>'*10
    doc = html.fromstring('<html><body>' + paragraphs + '</body></html>')
    extract(doc, no_fallback=False, target_language='en')
Esempio n. 3
0
def test_external():
    """Check sanitize_tree() on small documents and, if enabled, language filtering."""
    # a body holding nothing but a footer: the third returned value must be 0
    footer_doc = html.fromstring('<html><body><footer>Test text</footer></body></html>')
    _, _, footer_len = sanitize_tree(footer_doc)
    assert footer_len == 0

    # a body holding a table: the third returned value must be positive
    table_doc = html.fromstring('<html><body><table><th>Test text</th><tr><td>Test</td></tr></table></body></html>')
    _, _, table_len = sanitize_tree(table_doc)
    assert table_len > 0

    # paragraph containing an unknown <fancy> tag:
    # len() of the returned tree must be exactly 1
    fancy_doc = html.fromstring('<html><body><p>Text here <fancy>Test text</fancy></p></body></html>')
    cleaned, _, _ = sanitize_tree(fancy_doc)
    assert len(cleaned) == 1

    # language check, only run when LANGID_FLAG is exactly True:
    # extract() with target_language='en' must return None for this input
    if LANGID_FLAG is True:
        paragraphs = '<p>Non è inglese.</p>' * 20
        doc = html.fromstring('<html><body>' + paragraphs + '</body></html>')
        outcome = extract(doc, no_fallback=False, target_language='en', deduplicate=False)
        assert outcome is None