Python extract_sitemap_links Examples, trafilatura.sitemaps.extract_sitemap_links Python Examples

Example #1

0

Show file

def test_extraction():
    """Test simple link extraction.

    Exercises the repair of partial URLs, the parsing of a regular sitemap
    file, the parsing of a sitemap pointing to nested sitemaps and the
    handling of input that is not a sitemap.
    """

    def read_resource(filename):
        """Return the text content of a file in the test resources directory."""
        # Explicit encoding so the test does not depend on the platform's
        # locale default (sitemap files are expected to be UTF-8 encoded).
        with open(os.path.join(RESOURCES_DIR, filename), encoding='utf-8') as f:
            return f.read()

    url, domain = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org'

    # fixing partial URLs
    assert sitemaps.fix_relative_urls(
        'example.org',
        'https://example.org/test.html') == 'https://example.org/test.html'
    assert sitemaps.fix_relative_urls(
        'example.org', '/test.html') == 'http://example.org/test.html'

    # parsing a file: only page links, no further sitemaps
    sitemapurls, linklist = sitemaps.extract_sitemap_links(
        read_resource('sitemap.xml'), url, domain)
    # separate assertions so that a failure names the offending value
    assert sitemapurls == []
    assert len(linklist) == 84

    # nested sitemaps: only sitemap URLs, no page links
    sitemapurls, linklist = sitemaps.extract_sitemap_links(
        read_resource('sitemap2.xml'), url, domain)
    assert len(sitemapurls) == 2
    assert linklist == []

    # invalid input yields two empty lists
    assert sitemaps.extract_sitemap_links(
        '<html>\n</html>', url, domain) == ([], [])

Example #2

0

Show file

def test_extraction():
    """Test simple link extraction.

    Exercises the repair of partial URLs, link handling, sitemap sanity
    checks, extraction from plain, nested, hreflang and GZ-compressed
    sitemaps, plain-text link lists and URL filtering.
    """

    def read_resource(filename):
        """Return the text content of a file in the test resources directory."""
        # Explicit encoding so the test does not depend on the platform's
        # locale default (sitemap files are expected to be UTF-8 encoded).
        with open(os.path.join(RESOURCES_DIR, filename), encoding='utf-8') as f:
            return f.read()

    url, domain, baseurl = (
        'https://www.sitemaps.org/sitemap.xml',
        'sitemaps.org',
        'https://www.sitemaps.org',
    )

    # fixing partial URLs
    expected = 'https://example.org/test.html'
    assert fix_relative_urls('https://example.org', 'https://example.org/test.html') == expected
    assert fix_relative_urls('https://example.org', '/test.html') == expected
    assert fix_relative_urls('https://example.org', 'test.html') == expected
    assert fix_relative_urls('https://example.org', '../../test.html') == expected

    # link handling
    assert sitemaps.handle_link(url, url, domain, baseurl, None) == (url, '0')
    assert sitemaps.handle_link(
        'https://mydomain.wordpress.com/1',
        'https://example.org/sitemap.xml',
        'example.org',
        'https://example.org',
        None,
    ) == ('https://mydomain.wordpress.com/1', 'link')

    # safety belts
    assert sitemaps.check_sitemap(
        'http://example.org/sitemap.xml.gz', b'\x1f\x8bABC') is None
    assert sitemaps.check_sitemap('http://example.org/sitemap.xml', 'ABC') is None

    # parsing a file
    teststring = read_resource('sitemap.xml')
    contents = sitemaps.check_sitemap('http://example.org/sitemap.xml', teststring)
    sitemapurls, linklist = sitemaps.extract_sitemap_links(
        contents, url, domain, baseurl, None)
    # separate assertions so that a failure names the offending value
    assert len(sitemapurls) == 0
    assert len(linklist) == 84

    # hreflang: nothing is returned for this file
    assert sitemaps.extract_sitemap_langlinks(
        teststring, url, domain, baseurl, None) == ([], [])

    # nested sitemaps
    url, domain, baseurl = (
        'http://www.example.com/sitemap.xml',
        'example.com',
        'http://www.example.com',
    )
    teststring = read_resource('sitemap2.xml')
    sitemapurls, linklist = sitemaps.extract_sitemap_links(
        teststring, url, domain, baseurl, None)
    assert sitemapurls == [
        'http://www.example.com/sitemap1.xml.gz',
        'http://www.example.com/sitemap2.xml.gz',
    ]
    assert linklist == []

    # invalid
    assert sitemaps.extract_sitemap_links(
        '<html>\n</html>', url, domain, baseurl, None) == ([], [])

    # hreflang
    teststring = read_resource('sitemap-hreflang.xml')
    sitemapurls, linklist = sitemaps.extract_sitemap_langlinks(
        teststring, url, domain, baseurl, target_lang='de')
    assert sitemapurls == ['http://www.example.com/sitemap-de.xml.gz']
    assert len(linklist) > 0

    # GZ-compressed sitemaps
    url, domain, baseurl = (
        'https://www.sitemaps.org/sitemap.xml',
        'sitemaps.org',
        'https://www.sitemaps.org',
    )
    # binary mode: the raw bytes are handed to decode_response() below
    with open(os.path.join(RESOURCES_DIR, 'sitemap.xml.gz'), 'rb') as f:
        teststring = f.read()
    teststring = decode_response(teststring)
    contents = sitemaps.check_sitemap('http://example.org/sitemap.xml.gz', teststring)
    sitemapurls, linklist = sitemaps.extract_sitemap_links(
        contents, url, domain, baseurl, None)
    assert len(sitemapurls) == 0
    assert len(linklist) == 84

    # check contents
    assert sitemaps.check_sitemap(
        'http://example.org/sitemap.xml.gz?value=1', teststring) is not None

    # TXT links
    assert sitemaps.process_sitemap(
        'https://test.org/sitemap',
        'test.org',
        'https://test.org/',
        'Tralala\nhttps://test.org/1\nhttps://test.org/2',
    ) == ([], ['https://test.org/1', 'https://test.org/2'])

    # unique and sorted URLs
    urlfilter = 'category'
    myurls = ['/category/xyz', '/category/abc', '/cat/test', '/category/abc']
    assert filter_urls(myurls, urlfilter) == ['/category/abc', '/category/xyz']

Example #3

0

Show file

File: sitemaps_tests.py Project: phongtnit/trafilatura

def test_extraction():
    """Test simple link extraction.

    Exercises the repair of partial URLs and the extraction of links from
    plain, nested, invalid and hreflang sitemaps.
    """

    def read_resource(filename):
        """Return the text content of a file in the test resources directory."""
        # Explicit encoding so the test does not depend on the platform's
        # locale default (sitemap files are expected to be UTF-8 encoded).
        with open(os.path.join(RESOURCES_DIR, filename), encoding='utf-8') as f:
            return f.read()

    url, domain = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org'

    # fixing partial URLs
    assert sitemaps.fix_relative_urls(
        'example.org',
        'https://example.org/test.html') == 'https://example.org/test.html'
    assert sitemaps.fix_relative_urls(
        'example.org', '/test.html') == 'http://example.org/test.html'

    # link handling
    # assert sitemaps.handle_link(url, domain, url) == (url, '0')

    # parsing a file
    teststring = read_resource('sitemap.xml')
    sitemapurls, linklist = sitemaps.extract_sitemap_links(
        teststring, url, domain)
    # separate assertions so that a failure names the offending value
    assert len(sitemapurls) == 0
    assert len(linklist) == 84

    # hreflang: nothing is returned for this file
    assert sitemaps.extract_sitemap_langlinks(
        teststring, url, domain) == ([], [])

    # nested sitemaps
    sitemapurls, linklist = sitemaps.extract_sitemap_links(
        read_resource('sitemap2.xml'), url, domain)
    assert len(sitemapurls) == 2
    assert linklist == []

    # invalid
    assert sitemaps.extract_sitemap_links(
        '<html>\n</html>', url, domain) == ([], [])

    # hreflang
    _, linklist = sitemaps.extract_sitemap_langlinks(
        read_resource('sitemap-hreflang.xml'), url, domain, target_lang='de')
    assert len(linklist) > 0

Example #4

0

Show file

File: sitemaps_tests.py Project: EiffelFly/trafilatura

def test_extraction():
    """Test simple link extraction.

    Exercises the repair of partial URLs, link handling, sitemap sanity
    checks and the extraction of links from plain, nested, invalid,
    hreflang and GZ-compressed sitemaps.
    """

    def read_resource(filename):
        """Return the text content of a file in the test resources directory."""
        # Explicit encoding so the test does not depend on the platform's
        # locale default (sitemap files are expected to be UTF-8 encoded).
        with open(os.path.join(RESOURCES_DIR, filename), encoding='utf-8') as f:
            return f.read()

    url, domain, baseurl = (
        'https://www.sitemaps.org/sitemap.xml',
        'sitemaps.org',
        'https://www.sitemaps.org',
    )

    # fixing partial URLs
    expected = 'https://example.org/test.html'
    assert fix_relative_urls(
        'https://example.org', 'https://example.org/test.html') == expected
    assert fix_relative_urls('https://example.org', '/test.html') == expected
    assert fix_relative_urls('https://example.org', 'test.html') == expected

    # link handling
    assert sitemaps.handle_link(url, url, domain, baseurl) == (url, '0')

    # safety belts
    assert sitemaps.check_sitemap(
        'http://example.org/sitemap.xml.gz', b'\x1f\x8bABC') is None
    assert sitemaps.check_sitemap('http://example.org/sitemap.xml', 'ABC') is None

    # parsing a file
    teststring = read_resource('sitemap.xml')
    contents = sitemaps.check_sitemap('http://example.org/sitemap.xml', teststring)
    sitemapurls, linklist = sitemaps.extract_sitemap_links(
        contents, url, domain, baseurl)
    # separate assertions so that a failure names the offending value
    assert len(sitemapurls) == 0
    assert len(linklist) == 84

    # hreflang: nothing is returned for this file
    assert sitemaps.extract_sitemap_langlinks(
        teststring, url, domain, baseurl) == ([], [])

    # nested sitemaps
    sitemapurls, linklist = sitemaps.extract_sitemap_links(
        read_resource('sitemap2.xml'), url, domain, baseurl)
    assert len(sitemapurls) == 2
    assert linklist == []

    # invalid
    assert sitemaps.extract_sitemap_links(
        '<html>\n</html>', url, domain, baseurl) == ([], [])

    # hreflang
    _, linklist = sitemaps.extract_sitemap_langlinks(
        read_resource('sitemap-hreflang.xml'), url, domain, baseurl,
        target_lang='de')
    assert len(linklist) > 0

    # GZ-compressed sitemaps
    # binary mode: the raw bytes are passed to check_sitemap() as read
    with open(os.path.join(RESOURCES_DIR, 'sitemap.xml.gz'), 'rb') as f:
        teststring = f.read()
    contents = sitemaps.check_sitemap(
        'http://example.org/sitemap.xml.gz', teststring)
    sitemapurls, linklist = sitemaps.extract_sitemap_links(
        contents, url, domain, baseurl)
    assert len(sitemapurls) == 0
    assert len(linklist) == 84

    # check contents
    assert sitemaps.check_sitemap(
        'http://example.org/sitemap.xml.gz?value=1', teststring) is not None