Ejemplo n.º 1
0
def test_navigation():
    """Check URL classification by is_navigation_page and is_not_crawlable."""
    # (url, expected result) pairs for the navigation-page check
    navigation_cases = (
        ('https://test.org/', False),
        ('https://test.org/page/1', True),
        ('https://test.org/?p=11', True),
    )
    for url, expected in navigation_cases:
        assert is_navigation_page(url) is expected

    # (url, expected result) pairs for the not-crawlable check:
    # the login variants are expected to be rejected, a plain page is not
    crawlability_cases = (
        ('https://test.org/login', True),
        ('https://test.org/login/', True),
        ('https://test.org/login.php', True),
        ('https://test.org/page', False),
    )
    for url, expected in crawlability_cases:
        assert is_not_crawlable(url) is expected
Ejemplo n.º 2
0
def find_new_links(htmlstring,
                   base_url,
                   known_links,
                   language=None,
                   rules=None):
    """Collect the links of a page that are neither known nor excluded.

    Args:
        htmlstring: HTML content passed to the text and link extractors.
        base_url: URL handed to ``extract_links`` together with the HTML.
        known_links: collection of already seen links; it must support
            ``add()`` and is updated in place with every accepted link.
        language: optional target language code. When given (and
            ``LANGID_FLAG`` is True) the page text is run through the
            language identifier first, and a page detected as another
            language yields no links at all.
        rules: optional object exposing ``can_fetch(useragent, url)``
            (presumably a ``urllib.robotparser.RobotFileParser`` -- confirm
            against callers); links it disallows are skipped.

    Returns:
        A tuple ``(new_links, known_links)``: the list of newly accepted
        links in extraction order, and the (mutated) ``known_links``.
    """
    # Optional language gate: extract the text and identify its language.
    # A missing detection result (None) does not block the page.
    if language is not None and LANGID_FLAG is True:
        _, page_text, _ = baseline(htmlstring)
        detected = cld3.get_language(page_text)
        if detected is not None and detected.language != language:
            return [], known_links

    accepted = []
    candidates = extract_links(
        htmlstring, base_url, False, language=language, with_nav=True
    )
    for url in candidates:
        # robots.txt rules, if any, take precedence over the other checks
        permitted = rules is None or rules.can_fetch("*", url)
        if not permitted:
            continue
        # keep only links that are both unseen and crawlable; known_links
        # grows during the loop so duplicates within one page are dropped
        if is_known_link(url, known_links) is not True and not is_not_crawlable(url):
            accepted.append(url)
            known_links.add(url)
    return accepted, known_links