Ejemplo n.º 1
0
def handle_link(link, sitemapurl, domainname, baseurl, target_lang):
    '''Classify a link found in a sitemap.

    The link is resolved against baseurl, cleaned, and then checked
    against the target language and the expected domain.

    Returns a (link, state) tuple where state is one of:
      - 'sitemap': the link looks like a further XML sitemap,
      - 'link': the link is accepted as a regular page,
      - '0': the link is rejected (same as the current sitemap, dropped
        by cleaning, wrong language, unknown or diverging domain).
    The returned link is the processed one and may be None if the
    cleaning step discarded it.'''
    rejected = '0'
    # a sitemap pointing to itself would cause endless recursion
    if link == sitemapurl:
        return link, rejected
    # resolve relative URLs first, then clean and normalize
    link = clean_url(fix_relative_urls(baseurl, link), target_lang)
    if link is None or lang_filter(link, target_lang) is not True:
        return link, rejected
    newdomain = extract_domain(link)
    if newdomain is None:
        LOGGER.error("Couldn't extract domain: %s", link)
        return link, rejected
    # foreign domains are refused unless they belong to a whitelisted platform
    same_domain = newdomain == domainname
    if not (same_domain or WHITELISTED_PLATFORMS.search(newdomain)):
        LOGGER.warning('Diverging domain names: %s %s', domainname,
                       newdomain)
        return link, rejected
    # ".xml" at the end or followed by '.', '?' or '#' marks a nested sitemap
    looks_like_sitemap = re.search(r'\.xml$|\.xml[.?#]', link)
    return link, ('sitemap' if looks_like_sitemap else 'link')
Ejemplo n.º 2
0
def test_lang_filter():
    '''Check lang_filter against URLs carrying a language code in their path.'''
    french_story = 'https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377'
    british_page = 'https://www.sitemaps.org/en_GB/protocol.html'
    # (url, target language, expected outcome)
    expectations = (
        (french_story, None, True),
        (french_story, 'de', False),
        (french_story, 'fr', True),
        (french_story, 'en', False),
        (french_story, 'es', False),
        (british_page, 'en', True),
        (british_page, 'de', False),
    )
    for url, target_lang, expected in expectations:
        assert lang_filter(url, target_lang) is expected
Ejemplo n.º 3
0
def test_lang_filter():
    '''Check lang_filter on path-based and host-based language hints,
       with and without the strict keyword argument.'''
    french_story = 'https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377'
    british_page = 'https://www.sitemaps.org/en_GB/protocol.html'
    musclefood = 'http://de.musclefood.com/neu/neue-nahrungsergaenzungsmittel.html'
    swiss_zip = 'http://ch.postleitzahl.org/sankt_gallen/liste-T.html'
    tu_berlin = 'http://ig.cs.tu-berlin.de/oldstatic/w2000/ir1/aufgabe2/ir1-auf2-gr16.html'
    berlin_kino = 'http://bz.berlin1.de/kino/050513/fans.html'
    # each entry: (url, target language, extra keyword arguments, expected)
    # an empty dict means lang_filter is called without the strict keyword
    expectations = [
        (french_story, None, {}, True),
        (french_story, 'de', {}, False),
        (french_story, 'fr', {}, True),
        (french_story, 'en', {}, False),
        (french_story, 'es', {}, False),
        (british_page, 'en', {}, True),
        (british_page, 'de', {}, False),
        ('https://en.wikipedia.org/', 'de', {'strict': True}, False),
        ('https://en.wikipedia.org/', 'de', {'strict': False}, True),
        ('https://de.wikipedia.org/', 'de', {'strict': True}, True),
        (musclefood, 'de', {'strict': True}, True),
        (musclefood, 'fr', {'strict': True}, False),
        (swiss_zip, 'fr', {}, True),
        (swiss_zip, 'de', {}, True),
        # to complete when language mappings are more extensive:
        # (swiss_zip, 'es', {}, False),
        # disturbing path sub-elements
        ('http://www.uni-rostock.de/fakult/philfak/fkw/iph/thies/mythos.html',
         'de', {}, True),
        ('http://stifter.literature.at/witiko/htm/h15-22b.html',
         'de', {}, True),
        ('http://stifter.literature.at/doc/witiko/h15-22b.html',
         'de', {}, True),
        ('http://stifter.literature.at/nl/witiko/h15-22b.html',
         'de', {}, False),
        ('http://stifter.literature.at/de_DE/witiko/h15-22b.html',
         'de', {}, True),
        ('http://stifter.literature.at/en_US/witiko/h15-22b.html',
         'de', {}, False),
        ('http://www.stiftung.koerber.de/bg/recherche/de/beitrag.php?id=15132&refer=',
         'de', {}, True),
        ('http://www.solingen-internet.de/si-hgw/eiferer.htm',
         'de', {}, True),
        (tu_berlin, 'de', {'strict': True}, True),
        (tu_berlin, 'de', {'strict': False}, True),
        (berlin_kino, 'de', {'strict': False}, True),
        (berlin_kino, 'de', {'strict': True}, False),
    ]
    for url, target_lang, options, expected in expectations:
        assert lang_filter(url, target_lang, **options) is expected