Python extract_links Examples

Programming Language: Python

Namespace/Package Name: trafilatura.feeds

Method/Function: extract_links

Examples at hotexamples.com: 7

Python extract_links – 7 examples found. These are the top-rated real-world Python examples of trafilatura.feeds.extract_links, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: feeds_tests.py Project: phongtnit/trafilatura

def test_atom_extraction():
    '''Test link extraction from an Atom feed.

    The Atom fixture is expected to yield at least one link, while a lone
    self-referencing feed link is expected to yield none.
    '''
    filepath = os.path.join(RESOURCES_DIR, 'feed1.atom')
    # Decode explicitly instead of relying on the locale-dependent default;
    # assumes the fixture is stored as UTF-8 -- TODO confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    assert len(feeds.extract_links(teststring)) > 0
    # a rel="self" link on its own must not be reported
    self_link = '<link type="application/atom+xml" rel="self" href="https://www.dwds.de/api/feed/themenglossar/Corona"/>'
    assert len(feeds.extract_links(self_link)) == 0

Example #2

Show file

def test_atom_extraction():
    '''Test link extraction from an Atom feed.

    Covers an HTML document without any feed content, the Atom fixture
    file, a self-referencing feed link and a link with a malformed URL.
    '''
    # an empty HTML page contains nothing to extract
    assert len(feeds.extract_links('<html></html>', 'example.org', 'https://example.org', '')) == 0
    filepath = os.path.join(RESOURCES_DIR, 'feed1.atom')
    # Decode explicitly instead of relying on the locale-dependent default;
    # assumes the fixture is stored as UTF-8 -- TODO confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    assert len(feeds.extract_links(teststring, 'example.org', 'https://example.org', '')) > 0
    # a rel="self" link on its own must not be reported
    self_link = XMLDECL + '<link type="application/atom+xml" rel="self" href="https://www.dwds.de/api/feed/themenglossar/Corona"/>'
    assert len(feeds.extract_links(self_link, 'dwds.de', 'https://www.dwds.de', '')) == 0
    # a malformed URL must not be reported either
    broken_link = XMLDECL + '<link type="application/atom+xml" rel="self" href="123://api.exe"/>'
    assert len(feeds.extract_links(broken_link, 'example.org', 'https://example.org', '')) == 0

Example #3

Show file

def test_rss_extraction():
    '''Test link extraction from an RSS feed.

    Checks a single article link, two inputs expected to yield nothing,
    the resolution of a relative link and the RSS fixture file.
    '''
    # a single article link is returned
    assert len(feeds.extract_links(XMLDECL + '<link>http://example.org/article1/</link>', 'example.org', 'http://example.org/', '')) == 1
    # a link to the homepage itself yields nothing
    assert len(feeds.extract_links(XMLDECL + '<link>http://example.org/</link>', 'example.org', 'http://example.org', 'http://example.org')) == 0
    # a rel="self" link yields nothing
    assert len(feeds.extract_links(XMLDECL + '<link rel="self">http://example.org/article1/</link>', 'example.org', 'http://example.org/', '')) == 0
    # a relative link comes back as an absolute URL
    relative_feed = XMLDECL + '<link>/api/feed/themenglossar/Corona</link>'
    expected = ['https://www.dwds.de/api/feed/themenglossar/Corona']
    assert feeds.extract_links(relative_feed, 'www.dwds.de', 'https://www.dwds.de', 'https://www.dwds.de') == expected
    filepath = os.path.join(RESOURCES_DIR, 'feed2.rss')
    # Decode explicitly instead of relying on the locale-dependent default;
    # assumes the fixture is stored as UTF-8 -- TODO confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    assert len(feeds.extract_links(teststring, 'example.com', 'https://example.org', '')) > 0

Example #4

Show file

File: feeds_tests.py Project: adbar/trafilatura

def test_rss_extraction():
    '''Test link extraction from an RSS feed.

    Checks a plain link, a CDATA-wrapped link, a link padded with
    whitespace, two homepage links expected to yield nothing, the
    resolution of a relative link and the RSS fixture file.
    '''
    # a single article link is returned
    plain_feed = XMLDECL + '<link>http://example.org/article1/</link>'
    assert len(
        feeds.extract_links(plain_feed, 'example.org', 'http://example.org/',
                            '')) == 1
    # CDATA
    cdata_feed = XMLDECL + '<link><![CDATA[http://example.org/article1/]]></link>'
    assert feeds.extract_links(
        cdata_feed, 'example.org', 'http://example.org/',
        '') == ['http://example.org/article1/']
    # spaces
    padded_feed = (
        XMLDECL +
        '<link>\r\n    https://www.ak-kurier.de/akkurier/www/artikel/108815-sinfonisches-blasorchester-spielt-1500-euro-fuer-kinder-in-drk-krankenhaus-kirchen-ein    </link>'
    )
    assert len(
        feeds.extract_links(padded_feed, 'ak-kurier.de',
                            'https://www.ak-kurier.de/', '')) == 1
    # links to the homepage itself yield nothing
    assert len(
        feeds.extract_links(XMLDECL + '<link>http://example.org/</link>',
                            'example.org', 'http://example.org',
                            'http://example.org')) == 0
    assert len(
        feeds.extract_links(XMLDECL + '<link>https://example.org</link>',
                            'example.org', 'http://example.org/', '')) == 0
    # a relative link comes back as an absolute URL
    relative_feed = XMLDECL + '<link>/api/feed/themenglossar/Corona</link>'
    assert feeds.extract_links(
        relative_feed, 'www.dwds.de', 'https://www.dwds.de',
        'https://www.dwds.de') == [
            'https://www.dwds.de/api/feed/themenglossar/Corona'
        ]
    filepath = os.path.join(RESOURCES_DIR, 'feed2.rss')
    # Decode explicitly instead of relying on the locale-dependent default;
    # assumes the fixture is stored as UTF-8 -- TODO confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    assert len(
        feeds.extract_links(teststring, 'example.com', 'https://example.org',
                            '')) > 0

Example #5

Show file

File: feeds_tests.py Project: adbar/trafilatura

def test_json_extraction():
    '''Test link extraction from a JSON feed.

    First checks that a JSON feed advertised in an HTML page is detected,
    then that links are extracted from the JSON fixture file and from an
    item that only carries an "id" field.
    '''
    # find link
    html_page = '<html><meta><link rel="alternate" type="application/json" title="JSON Feed" href="https://www.jsonfeed.org/feed.json" />></meta><body/></html>'
    assert len(
        feeds.determine_feed(html_page, 'jsonfeed.org',
                             'https://www.jsonfeed.org')) == 1
    # extract data
    filepath = os.path.join(RESOURCES_DIR, 'feed.json')
    # Decode explicitly instead of relying on the locale-dependent default;
    # assumes the fixture is stored as UTF-8 -- TODO confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    links = feeds.extract_links(teststring, 'npr.org', 'https://npr.org', '')
    assert len(links) == 25
    # id as a backup
    id_only_feed = r'{"version":"https:\/\/jsonfeed.org\/version\/1","items":[{"id":"https://www.example.org/1","title":"Test"}]}'
    links = feeds.extract_links(id_only_feed, 'example.org',
                                'https://example.org', '')
    assert len(links) == 1

Example #6

Show file

File: feeds_tests.py Project: phongtnit/trafilatura

def test_rss_extraction():
    '''Test link extraction from an RSS feed.

    The RSS fixture is expected to yield at least one link.
    '''
    filepath = os.path.join(RESOURCES_DIR, 'feed2.rss')
    # Decode explicitly instead of relying on the locale-dependent default;
    # assumes the fixture is stored as UTF-8 -- TODO confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    links = feeds.extract_links(teststring)
    assert len(links) > 0

Example #7

Show file

File: feeds_tests.py Project: scroobiustrip/trafilatura

def test_atom_extraction():
    '''Test link extraction from an Atom feed.

    The Atom fixture is expected to yield at least one link.
    '''
    filepath = os.path.join(RESOURCES_DIR, 'feed1.atom')
    # Decode explicitly instead of relying on the locale-dependent default;
    # assumes the fixture is stored as UTF-8 -- TODO confirm.
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    links = feeds.extract_links(teststring)
    assert len(links) > 0