Code example #1
def test_external():
    '''Test external components'''
    # remove unwanted elements
    mydoc = html.fromstring('<html><body><footer>Test text</footer></body></html>')
    _, _, mylen = sanitize_tree(mydoc)
    assert mylen == 0
    mydoc = html.fromstring('<html><body><table><th>Test text</th><tr><td>Test</td></tr></table></body></html>')
    _, _, mylen = sanitize_tree(mydoc)
    assert mylen > 0
    # strip fancy tags while including links and images
    mydoc = html.fromstring('<html><body><p>Text here <fancy>Test text</fancy><a href="">with a link</a>.</p><img src="test.jpg"/></body></html>')
    mytree, _, _ = sanitize_tree(mydoc, include_links=False, include_images=False)
    assert len(mytree) == 1
    mydoc = html.fromstring('<html><body><p>Text here <fancy>Test text</fancy><a href="">with a link</a>.</p><img src="test.jpg"/></body></html>')
    mytree, _, _ = sanitize_tree(mydoc, include_links=True, include_images=True)
    myelems = {element.tag for element in set(mytree.iter())}
    assert 'graphic' in myelems and 'ref' in myelems
    # test langid
    if LANGID_FLAG is True:
        doc = html.fromstring('<html><body>' + '<p>Non è inglese.</p>'*20 + '</body></html>')
        assert extract(doc, no_fallback=False, target_language='en', deduplicate=False) is None
    # no tables
    with open(os.path.join(RESOURCES_DIR, 'apache.html')) as f:
        teststring = f.read()
    assert 'localhost:80' in extract(teststring, no_fallback=False, include_tables=True)
    assert 'localhost:80' not in extract(teststring, no_fallback=False, include_tables=False)
    with open(os.path.join(RESOURCES_DIR, 'scam.html')) as f:
        teststring = f.read()
    assert extract(teststring, no_fallback=True, include_tables=False) == ''
    assert extract(teststring, no_fallback=False, include_tables=False) == ''
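For quick reference, a minimal usage sketch of the include_tables toggle exercised above; it assumes the extract() keywords shown in these tests, and the input file name is hypothetical.

from trafilatura import extract

# hypothetical input file; any HTML string works the same way
with open('apache_status_page.html', encoding='utf-8') as f:
    html_source = f.read()

with_tables = extract(html_source, no_fallback=False, include_tables=True)
without_tables = extract(html_source, no_fallback=False, include_tables=False)
# table cell content (e.g. 'localhost:80' in the test resource) only survives in the first call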
Code example #2
def test_images():
    '''Test image extraction function'''
    assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
    assert handle_image(html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>')) is not None
    assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None
    assert utils.is_image_file('test.jpg') is True
    assert utils.is_image_file('test.txt') is False
    assert handle_textelem(etree.Element('graphic'), [], False, DEFAULT_CONFIG) is None
    resources_dir = os.path.join(TEST_DIR, 'resources')
    with open(os.path.join(resources_dir, 'http_sample.html')) as f:
        teststring = f.read()
    assert 'test.jpg Example image' not in extract(teststring)
    assert 'test.jpg Example image' in extract(teststring, include_images=True, no_fallback=True)
    assert '<graphic src="test.jpg" title="Example image"/>' in extract(teststring, include_images=True,
                                                                        no_fallback=True, output_format='xml',
                                                                        config=ZERO_CONFIG)
    # CNN example
    mydoc = html.fromstring(
        '<img class="media__image media__image--responsive" alt="Harry and Meghan last March, in their final royal engagement." data-src-mini="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-169.jpg" data-src-xsmall="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-medium-plus-169.jpg" data-src-small="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-large-169.jpg" data-src-medium="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-src-large="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-super-169.jpg" data-src-full16x9="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-full-169.jpg" data-src-mini1x1="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-11.jpg" data-demand-load="loaded" data-eq-pts="mini: 0, xsmall: 221, small: 308, medium: 461, large: 781" src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-eq-state="mini xsmall small medium" data-src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg">')
    myimage = handle_image(mydoc)
    assert myimage is not None and 'alt' in myimage.attrib and 'src' in myimage.attrib
    # modified CNN example
    mydoc = html.fromstring(
        '<img class="media__image media__image--responsive" alt="Harry and Meghan last March, in their final royal engagement." data-src-mini="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-169.jpg" data-src-xsmall="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-medium-plus-169.jpg" data-src-small="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-large-169.jpg" data-src-medium="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-src-large="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-super-169.jpg" data-src-full16x9="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-full-169.jpg" data-src-mini1x1="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-11.jpg" data-demand-load="loaded" data-eq-pts="mini: 0, xsmall: 221, small: 308, medium: 461, large: 781">')
    myimage = handle_image(mydoc)
    assert myimage is not None and 'alt' in myimage.attrib and 'src' in myimage.attrib and myimage.get(
        'src').startswith('http')
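As a companion to the tests above, a minimal sketch of image handling; the HTML string is made up and, with the default configuration, very short inputs may still be filtered out (the tests use a zero-threshold configuration to disable length checks).

from trafilatura import extract

html_source = '<html><body><p>Some paragraph long enough to be kept.</p><img src="test.jpg" title="Example image"/></body></html>'
plain = extract(html_source, include_images=True, no_fallback=True)
as_xml = extract(html_source, include_images=True, no_fallback=True, output_format='xml')
# in the XML output, images are serialized as <graphic src="..." title="..."/> elements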
Code example #3
def test_tei():
    '''test TEI-related functions'''
    # open local resources to avoid redownloading at each run
    resources_dir = os.path.join(TEST_DIR, 'resources')
    with open(os.path.join(resources_dir, 'httpbin_sample.html')) as f:
        teststring = f.read()
    # download, parse and validate simple html file
    result = extract(teststring,
                     "mocked",
                     no_fallback=True,
                     tei_output=True,
                     tei_validation=False)
    assert result is not None
    assert xml.validate_tei(etree.fromstring(result)) is True
    assert xml.validate_tei(etree.fromstring(teststring)) is False
    # test with another file
    with open(os.path.join(resources_dir, 'http_sample.html')) as f:
        teststring = f.read()
    # download, parse and validate simple html file
    result = extract(teststring,
                     "mocked",
                     no_fallback=True,
                     tei_output=True,
                     tei_validation=False)
    assert result is not None
    assert xml.validate_tei(etree.fromstring(result)) is True
    # include ID in metadata
    result = extract(teststring,
                     "mocked",
                     no_fallback=True,
                     tei_output=True,
                     tei_validation=False,
                     record_id='0001')
    assert result is not None
    assert xml.validate_tei(etree.fromstring(result)) is True
Code example #4
File: unit_tests.py, project: dominhhai/trafilatura
def test_links():
    '''Test link extraction function'''
    assert handle_textelem(etree.Element('ref'), [], False,
                           DEFAULT_CONFIG) is None
    assert handle_formatting(
        html.fromstring(
            '<a href="testlink.html">Test link text.</a>')) is not None
    mydoc = html.fromstring(
        '<html><body><p><a href="testlink.html">Test link text.</a></p></body></html>'
    )
    assert 'testlink.html' not in extract(mydoc)
    assert 'testlink.html' in extract(mydoc,
                                      include_links=True,
                                      no_fallback=True,
                                      config=ZERO_CONFIG)
    resources_dir = os.path.join(TEST_DIR, 'resources')
    with open(os.path.join(resources_dir, 'http_sample.html')) as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(teststring,
                                              include_links=True,
                                              no_fallback=True,
                                              config=ZERO_CONFIG)
    assert '<ref target="testlink.html">link</ref>' in extract(
        teststring,
        include_links=True,
        no_fallback=True,
        output_format='xml',
        config=ZERO_CONFIG)
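A minimal sketch of link handling based on the behaviour asserted above; note that the tests pass ZERO_CONFIG to disable minimum-length filters, so a short input like this one may otherwise return None.

from trafilatura import extract

html_source = '<html><body><p><a href="testlink.html">Test link text.</a> This part of the text has to be long enough.</p></body></html>'
plain = extract(html_source)                                              # link target is dropped
with_links = extract(html_source, include_links=True, no_fallback=True)  # '[Test link text.](testlink.html) ...'
as_xml = extract(html_source, include_links=True, no_fallback=True,
                 output_format='xml')                                     # <ref target="testlink.html">...</ref>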
Code example #5
def test_htmlprocessing():
    '''test html-related functions'''
    assert trafilatura.htmlprocessing.tree_cleaning(etree.Element('html'), True) is not None
    assert trafilatura.htmlprocessing.prune_html(etree.Element('unwanted')) is not None
    mydoc = html.fromstring('<html><body><table><a href="">Link</a></table><img src="test.jpg"/><u>Underlined</u><tt>True Type</tt><sub>Text</sub><sup>Text</sup></body></html>')
    myconverted = trafilatura.htmlprocessing.convert_tags(mydoc, include_formatting=True, include_tables=True, include_images=True, include_links=True)
    assert myconverted.xpath('.//ref') and myconverted.xpath('.//graphic') and myconverted.xpath('.//hi[@rend="#t"]') and myconverted.xpath('.//table')
    myconverted = trafilatura.htmlprocessing.tree_cleaning(mydoc, include_tables=False, include_images=True)
    assert myconverted.xpath('.//graphic') and not myconverted.xpath('.//table')
    mydoc = html.fromstring('<html><body><article><h1>Test headline</h1><p>Test</p></article></body></html>')
    assert '<head rend="h1">Test headline</head>' in extract(mydoc, output_format='xml', config=ZERO_CONFIG, no_fallback=True)
    assert '<fw rend="h1" type="header">Test headline</fw>' in extract(mydoc, output_format='xmltei', config=ZERO_CONFIG, no_fallback=True)
    # merge with parent function
    element = etree.Element('test')
    xml.merge_with_parent(element)
    mydoc = html.fromstring('<html><body><p><span>A</span><span>B</span><span>C</span></p></body></html>')
    for element in mydoc.iter('span'):
        xml.merge_with_parent(element)
    assert b'<p>A B C</p>' in etree.tostring(mydoc)
    mydoc = html.fromstring('<html><body><p><span>A</span><span>B</span> tail<span>C</span></p></body></html>')
    for element in mydoc.iter('span'):
        xml.merge_with_parent(element)
    assert b'<p>A B tail C</p>' in etree.tostring(mydoc)
    # paywalls
    my_html = '<html><body><main><p>1</p><p id="paywall">2</p><p>3</p></main></body></html>'
    assert extract(my_html, config=ZERO_CONFIG, no_fallback=True) == '1\n3'
    assert extract(my_html, config=ZERO_CONFIG, no_fallback=False) == '1\n3'
Code example #6
File: unit_tests.py, project: vbarbaresi/trafilatura
def test_txttocsv():
    mymeta = dict.fromkeys(METADATA_LIST)
    assert utils.txttocsv('', '',
                          mymeta) == 'None\tNone\tNone\tNone\tNone\t\t\tNone\n'
    mymeta['title'] = 'Test title'
    mymeta['url'] = 'https://example.org'
    mymeta['hostname'] = 'example.org'
    mymeta['id'] = '1'
    mymeta['license'] = 'CC BY-SA'
    assert utils.txttocsv(
        'Test text', 'Test comment', mymeta
    ) == '1\thttps://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\tCC BY-SA\n'
    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(mystring, output_format='csv',
                   config=ZERO_CONFIG) is not None
    assert extract(mystring,
                   output_format='csv',
                   include_comments=False,
                   config=ZERO_CONFIG).endswith('\tNone\n')
    # test json
    result = extract(mystring, output_format='json', config=ZERO_CONFIG)
    assert result.endswith('}') and '"fingerprint":' in result
    # bare extraction for python
    result = bare_extraction(mystring, config=ZERO_CONFIG)
    assert isinstance(result, dict) and len(result) == 14
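A minimal sketch of the output formats exercised above. The keyword names depend on the trafilatura version: the older API uses csv_output=True and json_output=True (code examples #10 and #24), the newer one output_format='csv' or 'json', and bare_extraction() may need as_dict=True to return a plain dict (code example #25).

from trafilatura import bare_extraction, extract

html_source = '<html><body><p>Some article text that is long enough to pass the default filters.</p></body></html>'
as_csv = extract(html_source, output_format='csv')    # tab-separated metadata and text
as_json = extract(html_source, output_format='json')  # JSON string ending in '}'
as_data = bare_extraction(html_source)                # Python mapping for further processing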
Code example #7
File: unit_tests.py, project: vkuberan/trafilatura
def test_images():
    '''Test image extraction function'''
    mydoc = html.fromstring('<html><body><img src="test.jpg"/></body></html>')
    assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
    assert handle_image(
        html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>'
                        )) is not None
    assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None
    assert utils.is_image_file('test.jpg') is True
    assert utils.is_image_file('test.txt') is False
    assert handle_textelem(etree.Element('graphic'), [], False,
                           DEFAULT_CONFIG) is None
    resources_dir = os.path.join(TEST_DIR, 'resources')
    with open(os.path.join(resources_dir, 'http_sample.html')) as f:
        teststring = f.read()
    assert 'test.jpg Example image' not in extract(teststring)
    assert 'test.jpg Example image' in extract(teststring,
                                               include_images=True,
                                               no_fallback=True)
    assert '<graphic src="test.jpg" title="Example image"/>' in extract(
        teststring,
        include_images=True,
        no_fallback=True,
        output_format='xml',
        config=ZERO_CONFIG)
Code example #8
File: unit_tests.py, project: vbarbaresi/trafilatura
def test_htmlprocessing():
    '''test html-related functions'''
    assert trafilatura.htmlprocessing.tree_cleaning(etree.Element('html'),
                                                    True) is not None
    assert trafilatura.htmlprocessing.prune_html(
        etree.Element('unwanted')) is not None
    mydoc = html.fromstring(
        '<html><body><table><a href="">Link</a></table><img src="test.jpg"/><u>Underlined</u><tt>True Type</tt><sub>Text</sub><sup>Text</sup></body></html>'
    )
    myconverted = trafilatura.htmlprocessing.convert_tags(
        mydoc,
        include_formatting=True,
        include_tables=True,
        include_images=True)
    assert myconverted.xpath('.//ref') and myconverted.xpath(
        './/graphic') and myconverted.xpath(
            './/hi[@rend="#t"]') and myconverted.xpath('.//table')
    myconverted = trafilatura.htmlprocessing.tree_cleaning(
        mydoc, include_tables=False, include_images=True)
    assert myconverted.xpath(
        './/graphic') and not myconverted.xpath('.//table')
    mydoc = html.fromstring(
        '<html><body><article><h1>Test headline</h1><p>Test</p></article></body></html>'
    )
    assert '<head rend="h1">Test headline</head>' in extract(
        mydoc, output_format='xml', config=ZERO_CONFIG, no_fallback=True)
    assert '<fw rend="h1" type="header">Test headline</fw>' in extract(
        mydoc, output_format='xmltei', config=ZERO_CONFIG, no_fallback=True)
Code example #9
File: unit_tests.py, project: vbarbaresi/trafilatura
def test_precision_recall():
    '''test precision- and recall-oriented settings'''
    # the test cases could be better
    my_document = html.fromstring(
        '<html><body><p>This here is the text.</p></body></html>')
    assert extract(my_document, favor_precision=True,
                   config=ZERO_CONFIG) is not None
    assert extract(my_document, favor_recall=True,
                   config=ZERO_CONFIG) is not None
Code example #10
def test_txttocsv():
    mymeta = dict.fromkeys(METADATA_LIST)
    assert utils.txttocsv('', '', mymeta) == 'None\tNone\tNone\tNone\tNone\t\t\n'
    mymeta['title'] = 'Test title'
    mymeta['url'] = 'https://example.org'
    mymeta['hostname'] = 'example.org'
    assert utils.txttocsv('Test text', 'Test comment', mymeta) == 'https://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\n'
    assert extract('<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>', csv_output=True) is not None
    assert extract('<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>', csv_output=True, include_comments=False).endswith('\t\n')
    # test json
    assert extract('<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>', json_output=True).endswith('}')
Code example #11
def test_formatting():
    '''Test HTML formatting conversion and extraction'''
    # simple
    my_document = html.fromstring('<html><body><p><b>This here is in bold font.</b></p></body></html>')
    my_result = extract(my_document, xml_output=True, include_formatting=True)
    assert '<hi rend="#b">This here is in bold font.</hi>' in my_result
    # nested
    my_document = html.fromstring('<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>')
    my_result = extract(my_document, xml_output=True, include_formatting=True)
    assert '<hi rend="#b">This here is in bold and italic font.</hi>' in my_result
    # empty
    my_document = html.fromstring('<html><body><p><b><i></i></b></p></body></html>')
    my_result = extract(my_document, xml_output=True, include_formatting=True)
    assert '<main/>' in my_result
    # wild div
    my_document = html.fromstring('<html><body><article><div><strong>Wild text</strong></div></article></body></html>')
    my_result = extract(my_document, xml_output=True, include_formatting=True)
    assert '<p>' in my_result and '<hi>Wild text</hi>' in my_result  # no rend so far
    my_result = extract(my_document)
    assert my_result == 'Wild text'
    # links
    doc = html.fromstring('<html><body><p><a href="">Link text</a></p></body></html>')
    my_result = extract(doc)
    assert my_result == 'Link text'
    # line-breaks
    doc = html.fromstring('<html><body><p><br/></p></body></html>')
    my_result = extract(doc)
    assert my_result == ''
    doc = html.fromstring('<html><body><p><br/>Here is the text.</p></body></html>')
    my_result = extract(doc)
    assert my_result == 'Here is the text.'
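A minimal sketch of formatting preservation; depending on the version, XML output is requested with xml_output=True (as in this example) or output_format='xml' (as in code example #16).

from trafilatura import extract

html_source = '<html><body><p><b>This here is in bold font.</b></p></body></html>'
as_xml = extract(html_source, output_format='xml', include_formatting=True)
# bold and italic spans come out as <hi rend="#b">...</hi> and <hi rend="#i">...</hi>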
Code example #12
File: unit_tests.py, project: vbarbaresi/trafilatura
def test_tei():
    '''test TEI-related functions'''
    # open local resources to avoid redownloading at each run
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html')) as f:
        teststring = f.read()
    # download, parse and validate simple html file
    result1 = extract(teststring,
                      "mocked",
                      no_fallback=True,
                      output_format='xmltei',
                      tei_validation=False)
    result2 = extract(teststring,
                      "mocked",
                      no_fallback=True,
                      output_format='xmltei',
                      tei_validation=True)
    assert result1 is not None and result1 == result2
    assert xml.validate_tei(etree.fromstring(result1)) is True
    assert xml.validate_tei(etree.fromstring(teststring)) is False
    # test with another file
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html')) as f:
        teststring = f.read()
    # download, parse and validate simple html file
    result = extract(teststring,
                     "mocked",
                     no_fallback=True,
                     output_format='xmltei',
                     tei_validation=False)
    assert result is not None  # and '<p>license</p>' in result
    assert xml.validate_tei(etree.fromstring(result)) is True
    # include ID in metadata
    result = extract(teststring,
                     "mocked",
                     no_fallback=True,
                     output_format='xmltei',
                     tei_validation=False,
                     record_id='0001')
    assert result is not None
    assert xml.validate_tei(etree.fromstring(result)) is True
    # test header + metadata
    tei = etree.Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
    header = etree.SubElement(tei, 'teiHeader')
    docmeta = dict.fromkeys(METADATA_LIST)
    docmeta['categories'], docmeta['tags'] = [], []
    docmeta['title'] = 'Title'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta['sitename'] = 'Site Name'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta['hostname'], docmeta['sitename'] = 'hostname', None
    assert xml.write_fullheader(header, docmeta) is not None
Code example #13
def test_filters():
    '''Test content filtering'''
    if LANGID_FLAG is True:
        # main text
        assert trafilatura.filters.language_filter('Hier ist ein Text auf Deutsch', '', 'de', SAMPLE_META) is False
        assert trafilatura.filters.language_filter('Hier ist ein Text auf Deutsch', '', 'en', SAMPLE_META) is True
        # comments
        assert trafilatura.filters.language_filter('Hier ist ein Text.', 'Die Kommentare sind aber etwas länger.', 'de', SAMPLE_META) is False
    else:
        # no detection
        assert trafilatura.filters.language_filter('Hier ist ein Text.', '', 'en', SAMPLE_META) is False
    # test URL blacklist
    assert trafilatura.extract('<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>', output_format='xml', url_blacklist={'https://example.org'}) is None
    ## recursion limit
    my_p = '<p>abc</p>'
    doc = html.fromstring('<html><body>' + my_p*50 + '</body></html>')
    assert extract(doc, max_tree_size=500) is not None
    doc = html.fromstring('<html><body>' + my_p*(501) + '</body></html>')
    assert extract(doc, max_tree_size=500) is None
    my_p = '<p><hi rend="#i">abc</hi></p>'
    doc = html.fromstring('<html><body>' + my_p*(501) + '</body></html>')
    assert extract(doc, include_formatting=True, max_tree_size=500) is None
    doc = html.fromstring('<html><body>' + my_p*(499) + '</body></html>')
    assert extract(doc, include_formatting=True, max_tree_size=500) is not None
    ## deduplication
    doc = html.fromstring('<html><body>' + my_p*50 + '</body></html>')
    lru_test = LRUCache(maxsize=2)
    trafilatura.filters.LRU_TEST = lru_test
    assert extract(doc, deduplicate=True) is not None
    assert extract(doc, deduplicate=True) is not None
    assert extract(doc, deduplicate=True) is not None
    assert extract(doc, deduplicate=True) is None
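A minimal sketch of the deduplicate option tested above; it relies on a module-level LRU cache of previously seen text, so repeated extraction of identical content eventually returns None.

from trafilatura import extract

page = '<html><body>' + '<p>The same paragraph, repeated many times.</p>' * 50 + '</body></html>'
first = extract(page, deduplicate=True)
# once the internal cache has seen this content often enough, further calls return None
again = extract(page, deduplicate=True)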
Code example #14
def test_links():
    '''Test link extraction function'''
    assert handle_textelem(etree.Element('ref'), [], False, DEFAULT_CONFIG) is None
    assert handle_formatting(html.fromstring('<a href="testlink.html">Test link text.</a>'), dedupbool=False, config=ZERO_CONFIG) is not None
    # empty link
    mydoc = html.fromstring('<html><body><p><a></a><b>Some text.</b></p></body></html>')
    assert extract(mydoc) is not None
    # link with target
    mydoc = html.fromstring('<html><body><p><a href="testlink.html">Test link text.</a> This part of the text has to be long enough.</p></body></html>')
    assert 'testlink.html' not in extract(mydoc)
    assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
    # link without target
    mydoc = html.fromstring('<html><body><p><a>Test link text.</a> This part of the text has to be long enough.</p></body></html>')
    assert '[Test link text.] This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
    mydoc = html.fromstring('<html><body><article><a>Segment 1</a><h1><a>Segment 2</a></h1><p>Segment 3</p></article></body></html>')
    result = extract(mydoc, output_format='xml', include_links=True, no_fallback=True, config=ZERO_CONFIG)
    assert '1' in result and '2' in result and '3' in result
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html')) as f:
        teststring = f.read()
    assert 'testlink.html' not in extract(teststring, config=ZERO_CONFIG)
    assert '[link](testlink.html)' in extract(teststring, include_links=True, no_fallback=True, config=ZERO_CONFIG)
    assert '<ref target="testlink.html">link</ref>' in extract(teststring, include_links=True, no_fallback=True, output_format='xml', config=ZERO_CONFIG)
    # test license link
    mydoc = html.fromstring('<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>')
    assert 'license="CC BY-SA license"' in extract(mydoc, include_links=True, no_fallback=True, output_format='xml', config=ZERO_CONFIG)
Code example #15
def test_filters():
    '''Test content filtering'''
    if LANGID_FLAG is True:
        # main text
        assert trafilatura.filters.language_filter('Hier ist ein Text auf Deutsch', '', 'de', SAMPLE_META) is False
        assert trafilatura.filters.language_filter('Hier ist ein Text auf Deutsch', '', 'en', SAMPLE_META) is True
        # comments
        assert trafilatura.filters.language_filter('Hier ist ein Text.', 'Die Kommentare sind aber etwas länger.', 'de',
                                                   SAMPLE_META) is False
    else:
        # no detection
        assert trafilatura.filters.language_filter('Hier ist ein Text.', '', 'en', SAMPLE_META) is False
    # test URL blacklist
    assert trafilatura.extract(
        '<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>',
        output_format='xml', url_blacklist={'https://example.org'}) is None
    ## recursion limit
    my_p = '<p>abc</p>'
    doc = html.fromstring('<html><body>' + my_p * 50 + '</body></html>')
    assert extract(doc, max_tree_size=500) is not None
    doc = html.fromstring('<html><body>' + my_p * 501 + '</body></html>')
    assert extract(doc, max_tree_size=500) is None
    my_p = '<p><hi rend="#i">abc</hi></p>'
    doc = html.fromstring('<html><body>' + my_p * 501 + '</body></html>')
    assert extract(doc, include_formatting=True, max_tree_size=500) is None
    doc = html.fromstring('<html><body>' + my_p * 499 + '</body></html>')
    assert extract(doc, include_formatting=True, max_tree_size=500) is not None
    ## deduplication
    doc = html.fromstring('<html><body>' + my_p * 50 + '</body></html>')
    lru_test = LRUCache(maxsize=2)
    trafilatura.filters.LRU_TEST = lru_test
    assert extract(doc, deduplicate=True) is not None
    assert extract(doc, deduplicate=True) is not None
    assert extract(doc, deduplicate=True) is not None
    assert extract(doc, deduplicate=True) is None
    # HTML lang filter
    my_p = '<p>In sleep a king, but waking no such matter.</p>'
    assert extract(html.fromstring('<html lang="en-US"><body>' + my_p * 50 + '</body></html>'),
                   target_language='en') is not None
    assert extract(html.fromstring('<html lang="en-US"><body>' + my_p * 50 + '</body></html>'),
                   target_language='de') is None
    assert check_html_lang(html.fromstring('<html lang="de_DE, en_US"><body></body></html>'),
                           target_language='de') is True
    assert check_html_lang(html.fromstring('<html lang="en"><body></body></html>'), target_language='it') is False
    assert check_html_lang(
        html.fromstring('<html><head><meta http-equiv="content-language" content="en"></head><body></body></html>'),
        target_language='en') is True
    assert check_html_lang(
        html.fromstring('<html><head><meta http-equiv="content-language" content="en"></head><body></body></html>'),
        target_language='de') is False
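A minimal sketch of the target_language filter; the HTML lang attribute check works out of the box, while text-based detection requires an optional language identification package (checked via LANGID_FLAG in the tests).

from trafilatura import extract

german_page = '<html lang="de"><body>' + '<p>Hier ist ein Text auf Deutsch.</p>' * 20 + '</body></html>'
rejected = extract(german_page, target_language='en')  # None: declared language does not match
accepted = extract(german_page, target_language='de')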
Code example #16
File: unit_tests.py, project: vkuberan/trafilatura
def test_formatting():
    '''Test HTML formatting conversion and extraction'''
    # simple
    my_document = html.fromstring(
        '<html><body><p><b>This here is in bold font.</b></p></body></html>')
    my_result = extract(my_document,
                        output_format='xml',
                        include_formatting=True,
                        config=ZERO_CONFIG)
    assert '<hi rend="#b">This here is in bold font.</hi>' in my_result
    # nested
    my_document = html.fromstring(
        '<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>'
    )
    my_result = extract(my_document,
                        output_format='xml',
                        include_formatting=True,
                        config=ZERO_CONFIG)
    assert '<hi rend="#b">This here is in bold and italic font.</hi>' in my_result
    # empty
    my_document = html.fromstring(
        '<html><body><p><b><i></i></b></p></body></html>')
    my_result = extract(my_document,
                        output_format='xml',
                        include_formatting=True,
                        config=ZERO_CONFIG)
    assert '<main/>' in my_result
    # wild div
    my_document = html.fromstring(
        '<html><body><article><div><strong>Wild text</strong></div></article></body></html>'
    )
    my_result = extract(my_document,
                        output_format='xml',
                        include_formatting=True,
                        config=ZERO_CONFIG)
    assert '<p>' in my_result and '<hi>Wild text</hi>' in my_result  # no rend so far
    my_result = extract(my_document, config=ZERO_CONFIG)
    assert my_result == 'Wild text'
    # links
    doc = html.fromstring(
        '<html><body><p><a href="">Link text</a></p></body></html>')
    my_result = extract(doc, config=ZERO_CONFIG)
    assert my_result == 'Link text'
    # line-breaks
    doc = html.fromstring('<html><body><p><br/></p></body></html>')
    my_result = extract(doc, config=ZERO_CONFIG)
    assert my_result == ''
    doc = html.fromstring(
        '<html><body><p><br/>Here is the text.</p></body></html>')
    my_result = extract(doc, config=ZERO_CONFIG)
    assert my_result == 'Here is the text.'
    # handle formatting tails
    element = etree.Element("hi")
    element.text = 'Here is the text.'
    element.tail = 'And a tail.'
    converted = handle_formatting(element)
    assert etree.tostring(
        converted) == b'<p><hi>Here is the text.</hi>And a tail.</p>'
Code example #17
def test_tei():
    '''test TEI-related functions'''
    # open local resources to avoid redownloading at each run
    with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html')) as f:
        teststring = f.read()
    # download, parse and validate simple html file
    result1 = extract(teststring, "mocked", no_fallback=True, output_format='xmltei', tei_validation=False)
    result2 = extract(teststring, "mocked", no_fallback=True, output_format='xmltei', tei_validation=True)
    assert result1 is not None and result1 == result2
    assert xml.validate_tei(etree.fromstring(result1)) is True
    assert xml.validate_tei(etree.fromstring(teststring)) is False
    # test with another file
    with open(os.path.join(RESOURCES_DIR, 'http_sample.html')) as f:
        teststring = f.read()
    # download, parse and validate simple html file
    result = extract(teststring, "mocked", no_fallback=True, include_comments=True, output_format='xmltei', tei_validation=False)
    assert result is not None # and '<p>license</p>' in result
    assert xml.validate_tei(etree.fromstring(result)) is True
    result = extract(teststring, "mocked", no_fallback=True, include_comments=False, output_format='xmltei', tei_validation=False)
    assert result is not None # and '<p>license</p>' in result
    assert xml.validate_tei(etree.fromstring(result)) is True
    # include ID in metadata
    result = extract(teststring, "mocked", no_fallback=True, output_format='xmltei', tei_validation=False, record_id='0001')
    assert result is not None
    assert xml.validate_tei(etree.fromstring(result)) is True
    # test header + metadata
    tei = etree.Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
    header = etree.SubElement(tei, 'teiHeader')
    docmeta = Document()
    docmeta.categories, docmeta.tags = [], []
    docmeta.title = 'Title'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.sitename = 'Site Name'
    docmeta.date = '2021-01-01'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.date = None
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.hostname = 'hostname'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.sitename = None
    docmeta.license = 'CC BY-SA'
    docmeta.url = 'https://test.org/'
    docmeta.categories = ['cat1', 'cat2']
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.date = '2021-01-01'
    assert xml.write_fullheader(header, docmeta) is not None
    docmeta.title, docmeta.sitename = None, None
    assert xml.write_fullheader(header, docmeta) is not None
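A minimal sketch of TEI output and validation based on the calls above; the URL is hypothetical, and the result may be None for inputs below the configured minimum length.

from lxml import etree
from trafilatura import extract, xml

html_source = '<html><body><article><h1>Title</h1><p>Body text of the article, long enough to be extracted.</p></article></body></html>'
tei_string = extract(html_source, url='https://example.org/article',  # hypothetical URL
                     no_fallback=True, output_format='xmltei', tei_validation=False)
if tei_string is not None:
    print(xml.validate_tei(etree.fromstring(tei_string)))  # True when the output validates against the TEI schema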
Code example #18
File: unit_tests.py, project: vkuberan/trafilatura
def test_exotic_tags(xmloutput=False):
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags',
                            xml_flag=xmloutput,
                            tei_output=True)
    assert 'Teletype text' in result and 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    with open(filepath) as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    )
    # misformed HTML declaration
    htmlstring = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd"><html><head></head><body><p>ABC</p></body></html>'
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring, config=ZERO_CONFIG)
    # quotes
    assert handle_quotes(etree.Element('quote')) is None
    assert handle_table(etree.Element('table')) is None
    # p within p
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # delete last <lb>
    third = etree.Element('lb')
    element.append(third)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
Code example #19
def test_fetch():
    '''test URL fetching'''
    assert fetch_url('1234') == ''
    assert fetch_url('https://httpbin.org/status/404') is None
    assert decode_response(b'\x1f\x8babcdef') is not None
    assert fetch_url('https://expired.badssl.com/', no_ssl=True) is not None
    # no decoding
    response = fetch_url('https://httpbin.org/status/200', decode=False)
    assert response == ''
    # response object
    url = 'https://httpbin.org/encoding/utf8'
    response = _send_request(url, False, DEFAULT_CONFIG)
    myobject = _handle_response(url, response, False, DEFAULT_CONFIG)
    assert myobject.data.startswith(b'<h1>Unicode Demo</h1>')
    # straight handling of response object
    assert load_html(response) is not None
    # nothing to see here
    assert extract(response, url=response.geturl(), config=ZERO_CONFIG) is None
    # default config is none
    assert _parse_config(DEFAULT_CONFIG) == (None, None)
    # default user-agent
    default = _determine_headers(DEFAULT_CONFIG)
    assert default['User-Agent'] == USER_AGENT
    assert 'Cookie' not in default
    # user-agents rotation
    assert _parse_config(UA_CONFIG) == ([
        'Firefox', 'Chrome'
    ], 'yummy_cookie=choco; tasty_cookie=strawberry')
    custom = _determine_headers(UA_CONFIG)
    assert custom['User-Agent'] in ['Chrome', 'Firefox']
    assert custom['Cookie'] == 'yummy_cookie=choco; tasty_cookie=strawberry'
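A minimal end-to-end sketch combining fetch_url() and extract() as exercised above; the URL is hypothetical and the call performs a real network request.

from trafilatura import extract, fetch_url

url = 'https://example.org/'  # hypothetical target
downloaded = fetch_url(url)   # decoded HTML string, or None / '' on failure
if downloaded:
    print(extract(downloaded, url=url))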
Code example #20
def load_mock_page(url, xml_flag=False, langcheck=None, tei_output=False):
    '''load mock page from samples'''
    try:
        with open(os.path.join(TEST_DIR, 'cache', MOCK_PAGES[url]),
                  'r') as inputf:
            htmlstring = inputf.read()
    # encoding/windows fix for the tests
    except UnicodeDecodeError:
        # read as binary
        with open(os.path.join(TEST_DIR, 'cache', MOCK_PAGES[url]),
                  'rb') as inputf:
            htmlbinary = inputf.read()
        guessed_encoding = chardet.detect(htmlbinary)['encoding']
        if guessed_encoding is not None:
            try:
                htmlstring = htmlbinary.decode(guessed_encoding)
            except UnicodeDecodeError:
                htmlstring = htmlbinary
        else:
            print('Encoding error')
    result = extract(htmlstring,
                     url,
                     record_id='0000',
                     no_fallback=False,
                     xml_output=xml_flag,
                     tei_output=tei_output,
                     target_language=langcheck)
    return result
Code example #21
File: unit_tests.py, project: vbarbaresi/trafilatura
def test_input():
    '''test if loaded strings/trees are handled properly'''
    assert utils.load_html(123) is None
    assert utils.load_html('<html><body>ÄÖÜ</body></html>') is not None
    assert utils.load_html(
        b'<html><body>\x2f\x2e\x9f</body></html>') is not None
    assert utils.load_html(
        '<html><body>\x2f\x2e\x9f</body></html>'.encode('latin-1')) is not None
    #assert utils.load_html(b'0'*int(10e3)) is None
    assert extract(None, 'url', '0000', target_language=None) is None
    # GZip
    with open(os.path.join(RESOURCES_DIR, 'webpage.html.gz'), 'rb') as gzfile:
        myinput = gzfile.read()
    assert 'Long story short,' in extract(myinput)
    # legacy
    assert process_record(None, 'url', '0000', target_language=None) is None
Code example #22
def test_external():
    '''Test external components'''
    # remove unwanted elements
    mydoc = html.fromstring(
        '<html><body><footer>Test text</footer></body></html>')
    _, _, mylen = sanitize_tree(mydoc)
    assert mylen == 0
    mydoc = html.fromstring(
        '<html><body><table><th>Test text</th><tr><td>Test</td></tr></table></body></html>'
    )
    _, _, mylen = sanitize_tree(mydoc)
    assert mylen > 0
    # strip fancy tags while including links and images
    mydoc = html.fromstring(
        '<html><body><p>Text here <fancy>Test text</fancy><a href="">with a link</a>.</p><img src="test.jpg"/></body></html>'
    )
    mytree, _, _ = sanitize_tree(mydoc,
                                 include_links=False,
                                 include_images=False)
    assert len(mytree) == 1
    mydoc = html.fromstring(
        '<html><body><p>Text here <fancy>Test text</fancy><a href="">with a link</a>.</p><img src="test.jpg"/></body></html>'
    )
    mytree, _, _ = sanitize_tree(mydoc,
                                 include_links=True,
                                 include_images=True)
    myelems = {element.tag for element in mytree.iter()}
    assert 'graphic' in myelems and 'ref' in myelems
    # test langid
    if LANGID_FLAG is True:
        doc = html.fromstring('<html><body>' + '<p>Non è inglese.</p>' * 20 +
                              '</body></html>')
        assert extract(
            doc, no_fallback=False, target_language='en',
            deduplicate=False) is None
Code example #23
def test_input():
    '''test if loaded strings/trees are handled properly'''
    assert utils.load_html(123) is None
    assert utils.load_html('<html><body>XYZ</body></html>') is not None
    #assert utils.load_html(b'0'*int(10e3)) is None
    assert extract(None, 'url', '0000', xml_output=False, tei_output=False, target_language=None) is None
    # legacy
    assert process_record(None, 'url', '0000', xml_output=False, tei_output=False, target_language=None) is None
Code example #24
def test_txttocsv():
    mymeta = dict.fromkeys(METADATA_LIST)
    assert utils.txttocsv('', '',
                          mymeta) == 'None\tNone\tNone\tNone\tNone\t\t\n'
    mymeta['title'] = 'Test title'
    mymeta['url'] = 'https://example.org'
    mymeta['hostname'] = 'example.org'
    assert utils.txttocsv(
        'Test text', 'Test comment', mymeta
    ) == 'https://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\n'
    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(mystring, csv_output=True) is not None
    assert extract(mystring, csv_output=True,
                   include_comments=False).endswith('\t\n')
    # test json
    assert extract(mystring, json_output=True).endswith('}')
    # bare extraction for python
    result = bare_extraction(mystring)
    assert isinstance(result, dict) and len(result) == 13
Code example #25
def test_txttocsv():
    mymeta = Document()
    assert utils.txttocsv('', '', mymeta) == 'None\tNone\tNone\tNone\tNone\t\t\tNone\n'
    mymeta.title = 'Test title'
    mymeta.url = 'https://example.org'
    mymeta.hostname = 'example.org'
    mymeta.id = '1'
    mymeta.license = 'CC BY-SA'
    assert utils.txttocsv('Test text', 'Test comment', mymeta) == '1\thttps://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\tCC BY-SA\n'
    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(mystring, output_format='csv', config=ZERO_CONFIG) is not None
    assert extract(mystring, output_format='csv', include_comments=False, config=ZERO_CONFIG).endswith('\tNone\n')
    # test json
    result = extract(mystring, output_format='json', config=ZERO_CONFIG)
    assert result.endswith('}') and '"fingerprint":' in result
    assert extract(mystring, output_format='json', include_comments=False, config=ZERO_CONFIG).endswith('}')
    # bare extraction for python
    result = bare_extraction(mystring, config=ZERO_CONFIG, as_dict=True)
    assert isinstance(result, dict) and len(result) == 17
Code example #26
def test_exotic_tags(xmloutput=False):
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags',
                            xml_flag=xmloutput,
                            tei_output=True)
    assert 'Teletype text' in result and 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    with open(filepath) as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(
        b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>'
    )
    # misformed HTML declaration
    htmlstring = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd"><html><head></head><body><p>ABC</p></body></html>'
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring, config=ZERO_CONFIG)
    # quotes
    assert handle_quotes(etree.Element('quote'), False, ZERO_CONFIG) is None
    assert handle_table(etree.Element('table'), False, ZERO_CONFIG) is None
    # p within p
    element, second = etree.Element('p'), etree.Element('p')
    element.text, second.text = '1st part.', '2nd part.'
    element.append(second)
    # delete last <lb>
    element.append(etree.Element('lb'))
    converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
    assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
    # malformed lists (common error)
    result = etree.tostring(
        handle_lists(
            etree.fromstring(
                '<list>Description of the list:<item>List item 1</item><item>List item 2</item><item>List item 3</item></list>'
            ), False, ZERO_CONFIG))
    assert result.count(b'List item') == 3
    assert b"Description" in result
    # HTML5: <details>
    htmlstring = '<html><body><article><details><summary>Epcot Center</summary><p>Epcot is a theme park at Walt Disney World Resort featuring exciting attractions, international pavilions, award-winning fireworks and seasonal special events.</p></details></article></body></html>'
    my_result = extract(htmlstring, no_fallback=True, config=ZERO_CONFIG)
    print(my_result)
    assert 'Epcot Center' in my_result and 'award-winning fireworks' in my_result
    my_result = extract(htmlstring, no_fallback=False, config=ZERO_CONFIG)
    assert 'Epcot Center' in my_result and 'award-winning fireworks' in my_result
Code example #27
def test_extraction_options():
    '''Test the different parameters available in extract() and bare_extraction()'''
    my_html = '<html><head><meta http-equiv="content-language" content="EN"/></head><body><div="article-body"><p>Text.<!-- comment --></p></div></body></html>'
    with pytest.raises(NameError) as err:
        extract(my_html, json_output=True)
    assert extract(my_html, config=NEW_CONFIG) is None
    assert extract(my_html, config=ZERO_CONFIG) is not None
    assert extract(my_html, with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
    assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
    assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
    assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
Code example #28
def test_exotic_tags(xmloutput=False):
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xmloutput, tei_output=True)
    assert 'Teletype text' in result and 'My new car is silver.' in result
    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
    with open(filepath) as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
    assert etree.tostring(res).startswith(b'<html>\n<text>\n<body>\n<div>\n\n<hi rend="uppercase">Hello</hi>\n<p>Teletype text</p>')
    # misformed HTML declaration
    htmlstring = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd"><html><head></head><body><p>ABC</p></body></html>'
    # outputs '012"http://www.w3.org/TR/html4/loose.dtd">\nABC'
    assert 'ABC' in extract(htmlstring)
Code example #29
def test_external():
    '''Test external components'''
    # remove unwanted elements
    mydoc = html.fromstring('<html><body><footer>Test text</footer></body></html>')
    _, _, mylen = sanitize_tree(mydoc)
    assert mylen == 0
    # strip fancy tags
    mydoc = html.fromstring('<html><body><p>Text here <fancy>Test text</fancy></p></body></html>')
    mytree, _, _ = sanitize_tree(mydoc)
    assert len(mytree) == 1
    # justext stoplist
    # if LANGID_FLAG is True:
    doc = html.fromstring('<html><body>' + '<p>abc</p>'*10 + '</body></html>')
    result = extract(doc, no_fallback=False, target_language='en')
Code example #30
def test_precision_recall():
    '''test precision- and recall-oriented settings'''
    # the test cases could be better
    my_document = html.fromstring('<html><body><p>This here is the text.</p></body></html>')
    assert extract(my_document, favor_precision=True, config=ZERO_CONFIG) is not None
    assert extract(my_document, favor_recall=True, config=ZERO_CONFIG) is not None
    my_document = html.fromstring('<html><body><div class="article-body"><div class="teaser-content"><p>This here is a teaser text.</p></div><div><p>This here is the text.</p></div></body></html>')
    assert 'teaser text' in extract(my_document, favor_recall=True, config=ZERO_CONFIG)
    assert 'teaser text' not in extract(my_document, config=ZERO_CONFIG)
    assert 'teaser text' not in extract(my_document, favor_precision=True, config=ZERO_CONFIG)
    my_document = html.fromstring('<html><body><article><div><p><a href="test.html">1.</a><br/><a href="test2.html">2.</a></p></div></article></body></html>')
    assert '1' not in extract(my_document, favor_recall=True, config=ZERO_CONFIG)
    assert '1' not in extract(my_document, favor_precision=True, config=ZERO_CONFIG)
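A minimal sketch of the precision/recall trade-off demonstrated above; without the zero-threshold test configuration, inputs this short may be filtered out entirely.

from trafilatura import extract

page = '<html><body><div class="article-body"><div class="teaser-content"><p>This here is a teaser text.</p></div><div><p>This here is the text.</p></div></body></html>'
more_text = extract(page, favor_recall=True)      # keeps borderline parts such as the teaser
less_noise = extract(page, favor_precision=True)  # drops them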