Esempio n. 1
0
def test_filters():
    '''Exercise the language filter, the URL blacklist, the tree size limit and deduplication.'''
    language_filter = trafilatura.filters.language_filter

    def build_tree(snippet, repeats):
        # wrap the repeated paragraph markup in a minimal HTML document
        return html.fromstring('<html><body>' + snippet*repeats + '</body></html>')

    if LANGID_FLAG is True:
        # language detection available: main text only
        german_sentence = 'Hier ist ein Text auf Deutsch'
        assert language_filter(german_sentence, '', 'de', SAMPLE_META) is False
        assert language_filter(german_sentence, '', 'en', SAMPLE_META) is True
        # short main text combined with a longer comments section
        assert language_filter('Hier ist ein Text.', 'Die Kommentare sind aber etwas länger.', 'de', SAMPLE_META) is False
    else:
        # no language detection: the filter is expected to answer False
        assert language_filter('Hier ist ein Text.', '', 'en', SAMPLE_META) is False

    # URL blacklist: the canonical link of the page is part of the blacklist
    blacklisted_page = '<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>'
    outcome = trafilatura.extract(blacklisted_page, output_format='xml', url_blacklist={'https://example.org'})
    assert outcome is None

    # tree size limit: 501 paragraphs against max_tree_size=500 give None,
    # the smaller documents are expected to be processed
    plain_paragraph = '<p>abc</p>'
    assert extract(build_tree(plain_paragraph, 50), max_tree_size=500) is not None
    assert extract(build_tree(plain_paragraph, 501), max_tree_size=500) is None
    formatted_paragraph = '<p><hi rend="#i">abc</hi></p>'
    assert extract(build_tree(formatted_paragraph, 501), include_formatting=True, max_tree_size=500) is None
    assert extract(build_tree(formatted_paragraph, 499), include_formatting=True, max_tree_size=500) is not None

    # deduplication: with a fresh two-slot cache the same tree is extracted
    # three times successfully, the fourth attempt is expected to give None
    repeated_tree = build_tree(formatted_paragraph, 50)
    trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
    for _ in range(3):
        assert extract(repeated_tree, deduplicate=True) is not None
    assert extract(repeated_tree, deduplicate=True) is None
Esempio n. 2
0
def test_lrucache():
    '''Test basic duplicate detection backed by a small LRU cache.

    A two-slot LRUCache is installed as trafilatura.filters.LRU_TEST and
    duplicate_test() is called repeatedly on three different paragraphs
    (and on the body elements wrapping them). The assertions pin the
    expected answer for every single call, so the order of the calls
    matters: changing it changes the cache state the later checks rely on.
    '''
    # fresh cache with room for two entries, shared through the module global
    lru_test = LRUCache(maxsize=2)
    trafilatura.filters.LRU_TEST = lru_test
    my_body = etree.Element('body')
    ### element too short (disabled check)
    #my_element = html.fromstring('<p>AAAA BBBB</p>')
    #my_body.append(my_element)
    #put_in_cache(my_body)
    #assert duplicate_test(my_element) is False
    ### cached element
    my_element = html.fromstring(
        '<p>AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB AAAA BBBB</p>'
    )
    my_body.append(my_element)
    # three negative answers are expected before the fourth check reports a
    # duplicate; the check on my_body apparently counts toward the same entry
    # as its only child (presumably keyed on text content -- TODO confirm
    # against duplicate_test)
    assert duplicate_test(my_element) is False
    assert duplicate_test(my_element) is False
    assert duplicate_test(my_body) is False
    assert duplicate_test(my_element) is True
    # second text: same pattern, alternating between the body and its child
    other_body = etree.Element('body')
    other_element = html.fromstring(
        '<p>CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD CCCC DDDD</p>'
    )
    other_body.append(other_element)
    assert duplicate_test(other_body) is False
    assert duplicate_test(other_element) is False
    assert duplicate_test(other_body) is False
    assert duplicate_test(other_element) is True
    # third text: only three checks here, all of them expected to be negative
    yet_another_body = etree.Element('body')
    yet_another_element = html.fromstring(
        '<p>EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF EEEE FFFF</p>'
    )
    yet_another_body.append(yet_another_element)
    assert duplicate_test(yet_another_body) is False
    assert duplicate_test(yet_another_body) is False
    assert duplicate_test(yet_another_body) is False
    # the cache holds 2 entries: the two most recent texts are still flagged
    # as duplicates, while the first text is expected to be gone again
    # (presumably evicted when the third one was stored -- TODO confirm)
    # print(LRU_TEST.maxsize, LRU_TEST.full)
    assert duplicate_test(other_element) is True
    assert duplicate_test(yet_another_element) is True
    assert duplicate_test(my_element) is False
    # clear the cache: a previously flagged element is no longer a duplicate
    lru_test.clear()
    assert duplicate_test(other_element) is False
    # looking up an unknown key is expected to yield -1
    assert lru_test.get('tralala') == -1
Esempio n. 3
0
def test_filters():
    '''Exercise language filtering, URL blacklisting, tree size limits, deduplication and HTML language hints.'''
    language_filter = trafilatura.filters.language_filter

    def repeated_page(snippet, count):
        # parse a minimal HTML document holding `count` copies of the snippet
        return html.fromstring('<html><body>' + snippet * count + '</body></html>')

    if LANGID_FLAG is True:
        # language detection available: main text only
        german_sentence = 'Hier ist ein Text auf Deutsch'
        assert language_filter(german_sentence, '', 'de', SAMPLE_META) is False
        assert language_filter(german_sentence, '', 'en', SAMPLE_META) is True
        # short main text combined with a longer comments section
        verdict = language_filter('Hier ist ein Text.', 'Die Kommentare sind aber etwas länger.', 'de', SAMPLE_META)
        assert verdict is False
    else:
        # no language detection: the filter is expected to answer False
        assert language_filter('Hier ist ein Text.', '', 'en', SAMPLE_META) is False

    # URL blacklist: the canonical link of the page is part of the blacklist
    blacklisted_page = '<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>'
    assert trafilatura.extract(blacklisted_page, output_format='xml', url_blacklist={'https://example.org'}) is None

    # tree size limit: 501 paragraphs against max_tree_size=500 give None,
    # the smaller documents are expected to be processed
    plain_paragraph = '<p>abc</p>'
    assert extract(repeated_page(plain_paragraph, 50), max_tree_size=500) is not None
    assert extract(repeated_page(plain_paragraph, 501), max_tree_size=500) is None
    formatted_paragraph = '<p><hi rend="#i">abc</hi></p>'
    assert extract(repeated_page(formatted_paragraph, 501), include_formatting=True, max_tree_size=500) is None
    assert extract(repeated_page(formatted_paragraph, 499), include_formatting=True, max_tree_size=500) is not None

    # deduplication: with a fresh two-slot cache the same tree is extracted
    # three times successfully, the fourth attempt is expected to give None
    repeated_tree = repeated_page(formatted_paragraph, 50)
    trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
    for _ in range(3):
        assert extract(repeated_tree, deduplicate=True) is not None
    assert extract(repeated_tree, deduplicate=True) is None

    # HTML lang filter on a full extraction: a new tree is parsed for each call
    verse = '<p>In sleep a king, but waking no such matter.</p>'
    english_markup = '<html lang="en-US"><body>' + verse * 50 + '</body></html>'
    assert extract(html.fromstring(english_markup), target_language='en') is not None
    assert extract(html.fromstring(english_markup), target_language='de') is None

    # direct checks of the language hints: (markup, target language, expected answer)
    meta_english = '<html><head><meta http-equiv="content-language" content="en"></head><body></body></html>'
    hint_cases = (
        ('<html lang="de_DE, en_US"><body></body></html>', 'de', True),
        ('<html lang="en"><body></body></html>', 'it', False),
        (meta_english, 'en', True),
        (meta_english, 'de', False),
    )
    for markup, wanted, expected in hint_cases:
        assert check_html_lang(html.fromstring(markup), target_language=wanted) is expected
Esempio n. 4
0
def test_filters():
    '''Exercise language filtering, URL blacklisting, tree size limits, deduplication and HTML language hints.'''
    language_filter = trafilatura.filters.language_filter

    def repeated_page(snippet, count):
        # parse a minimal HTML document holding `count` copies of the snippet
        return html.fromstring('<html><body>' + snippet * count + '</body></html>')

    if LANGID_FLAG is True:
        # language detection available: main text only
        german_sentence = 'Hier ist ein Text auf Deutsch'
        assert language_filter(german_sentence, '', 'de', SAMPLE_META) is False
        assert language_filter(german_sentence, '', 'en', SAMPLE_META) is True
        # short main text combined with a longer comments section
        verdict = language_filter('Hier ist ein Text.', 'Die Kommentare sind aber etwas länger.', 'de', SAMPLE_META)
        assert verdict is False
        # language detection on the extracted content; the same tree is used for both calls
        verse_tree = html.fromstring('<html><body><article><p>How many ages hence/Shall this our lofty scene be acted over,/In states unborn and accents yet unknown!</p></article></body></html>')
        assert extract(verse_tree, config=ZERO_CONFIG, target_language='de') is None
        assert extract(verse_tree, config=ZERO_CONFIG, target_language='en') is not None
    else:
        # no language detection: the filter is expected to answer False
        assert language_filter('Hier ist ein Text.', '', 'en', SAMPLE_META) is False

    # URL blacklist: the canonical link of the page is part of the blacklist
    blacklisted_page = '<html><head><link rel="canonical" href="https://example.org"/></head><body></body></html>'
    assert trafilatura.extract(blacklisted_page, output_format='xml', url_blacklist={'https://example.org'}) is None

    # tree size limit: 501 paragraphs against max_tree_size=500 give None,
    # the smaller documents are expected to be processed
    plain_paragraph = '<p>abc</p>'
    assert extract(repeated_page(plain_paragraph, 50), max_tree_size=500) is not None
    assert extract(repeated_page(plain_paragraph, 501), max_tree_size=500) is None
    formatted_paragraph = '<p><hi rend="#i">abc</hi></p>'
    assert extract(repeated_page(formatted_paragraph, 501), include_formatting=True, max_tree_size=500) is None
    assert extract(repeated_page(formatted_paragraph, 499), include_formatting=True, max_tree_size=500) is not None

    # deduplication: with a fresh two-slot cache the same tree is extracted
    # three times successfully, the fourth attempt is expected to give None
    repeated_tree = repeated_page(formatted_paragraph, 50)
    trafilatura.filters.LRU_TEST = LRUCache(maxsize=2)
    for _ in range(3):
        assert extract(repeated_tree, deduplicate=True) is not None
    assert extract(repeated_tree, deduplicate=True) is None
    # paragraph level (disabled checks, kept for reference)
    #lru_test = LRUCache(maxsize=2)
    #trafilatura.filters.LRU_TEST = lru_test
    #my_p = etree.fromstring('<p>abc</p>')
    #assert trafilatura.htmlprocessing.process_node(my_p) is not None
    #assert trafilatura.htmlprocessing.process_node(my_p) is not None
    #assert trafilatura.htmlprocessing.process_node(my_p) is not None
    #assert trafilatura.htmlprocessing.process_node(my_p) is None

    # HTML lang filter
    # a document without any language hint is accepted
    assert check_html_lang(html.fromstring('<html><body></body></html>'), target_language='en') is True
    # text + lang attribute through a full extraction
    verse = '<p>In sleep a king, but waking no such matter.</p>'
    english_tree = html.fromstring('<html lang="en-US"><body>' + verse * 50 + '</body></html>')
    assert extract(english_tree, target_language='en') is not None
    # disabled check, kept for reference:
    #assert extract(html.fromstring('<html lang="en-US"><body>' + my_p*50 + '</body></html>'), target_language='de') is None

    # direct checks of the language hints, run in this exact order:
    # (markup, keyword arguments for check_html_lang, expected answer)
    multi_lang = '<html lang="de_DE, en_US"><body></body></html>'
    meta_english = '<html><head><meta http-equiv="content-language" content="en"></head><body></body></html>'
    meta_german_upper = '<html><head><meta http-equiv="content-language" content="DE"></head><body></body></html>'
    og_locale_german = '<html lang="en-US"><head><meta property="og:locale" content="de_DE" /></head><body></body></html>'
    lang_english = '<html lang="en"><body></body></html>'
    hint_cases = (
        (multi_lang, {'target_language': 'de'}, True),
        (multi_lang, {'target_language': 'en'}, True),
        (multi_lang, {'target_language': 'de', 'strict': True}, True),
        (multi_lang, {'target_language': 'en', 'strict': True}, True),
        (meta_english, {'target_language': 'en'}, True),
        (meta_english, {'target_language': 'de'}, False),
        (meta_german_upper, {'target_language': 'de'}, True),
        # html lang attribute superseded by og:locale
        (og_locale_german, {'target_language': 'de'}, True),
        (og_locale_german, {'target_language': 'en'}, False),
        # same markup, opposite answers depending on the strict flag
        (lang_english, {'target_language': 'it', 'strict': True}, False),
        (lang_english, {'target_language': 'it', 'strict': False}, True),
        (og_locale_german, {'target_language': 'de', 'strict': False}, True),
        (og_locale_german, {'target_language': 'de', 'strict': True}, True),
    )
    for markup, options, expected in hint_cases:
        # a new tree is parsed for every check, as each case is independent
        assert check_html_lang(html.fromstring(markup), **options) is expected