Example #1
def test_validate():
    assert validate_url('http://www.test[.org/test')[0] is False
    # assert validate_url('http://www.test.org:7ERT/test')[0] is False
    assert validate_url('ntp://www.test.org/test')[0] is False
    assert validate_url('ftps://www.test.org/test')[0] is False
    assert validate_url('http://t.g/test')[0] is False
    assert validate_url('http://test.org/test')[0] is True
Example #2
def test_examples():
    '''test README examples'''
    assert check_url('https://github.com/adbar/courlan') == ('https://github.com/adbar/courlan', 'github.com')
    assert check_url('https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.org', strict=True) == ('https://httpbin.org/redirect-to', 'httpbin.org')
    assert clean_url('HTTPS://WWW.DWDS.DE:80/') == 'https://www.dwds.de'
    assert validate_url('http://1234') == (False, None)
    assert validate_url('http://www.example.org/')[0] is True
    assert normalize_url('http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment', strict=True) == 'http://test.net/foo.html?page=2&post=abc'
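The two tests above exercise courlan's public helpers. For running the same calls outside a test file, here is a minimal standalone sketch; it assumes these four functions are importable from the courlan package, and the expected values in the comments are taken from the assertions above.

# minimal usage sketch, assuming check_url, clean_url, normalize_url and
# validate_url are importable from the courlan package
from courlan import check_url, clean_url, normalize_url, validate_url

print(validate_url('http://www.example.org/')[0])
# True
print(check_url('https://github.com/adbar/courlan'))
# ('https://github.com/adbar/courlan', 'github.com')
print(clean_url('HTTPS://WWW.DWDS.DE:80/'))
# 'https://www.dwds.de'
print(normalize_url('http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment', strict=True))
# 'http://test.net/foo.html?page=2&post=abc'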
Example #3
def test_extension_filter():
    validation_test, parsed_url = validate_url(
        'http://www.example.org/test.js')
    assert extension_filter(parsed_url.path) is False
    validation_test, parsed_url = validate_url(
        'http://goodbasic.com/GirlInfo.aspx?Pseudo=MilfJanett')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url(
        'https://www.familienrecht-allgaeu.de/de/vermoegensrecht.amp')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url(
        'http://www.example.org/test.shtml')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url(
        'http://de.artsdot.com/ADC/Art.nsf/O/8EWETN')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url(
        'http://de.artsdot.com/ADC/Art.nsf?param1=test')
    assert extension_filter(parsed_url.path) is False
    validation_test, parsed_url = validate_url(
        'http://www.example.org/test.xhtml?param1=this')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url(
        'http://www.example.org/test.php5')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url(
        'http://www.example.org/test.php6')
    assert extension_filter(parsed_url.path) is True
Example #4
def load_input_dict(filename, blacklist):
    '''Read input list of URLs to process and build domain-aware processing dictionary'''
    inputdict = defaultdict(list)
    try:
        # optional: errors='strict', buffering=1
        with open(filename, mode='r', encoding='utf-8') as inputfile:
            for line in inputfile:
                # control input validity
                url_match = re.match(r'https?://[^\s]+', line)
                if url_match:
                    url = url_match.group(0)
                    # validation
                    if validate_url(url)[0] is False:
                        LOGGER.warning('Invalid URL, discarding line: %s', line)
                        continue
                    # control blacklist
                    if blacklist:
                        if re.sub(r'^https?://', '', url) in blacklist:
                            continue
                    # segment URL and add to domain dictionary
                    try:
                        _, hostinfo, urlpath = HOSTINFO.split(url)
                        inputdict[hostinfo].append(urlpath)
                    except ValueError:
                        LOGGER.warning('Could not parse URL, discarding line: %s', line)
                else:
                    LOGGER.warning('Not a URL, discarding line: %s', line)
    except UnicodeDecodeError:
        sys.exit('ERROR: system, file type or buffer encoding')
    # deduplicate
    for hostname in inputdict:
        inputdict[hostname] = list(OrderedDict.fromkeys(inputdict[hostname]))
    return inputdict
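HOSTINFO is a precompiled regular expression that splits each URL into a host part and a path part; the resulting dictionary maps hosts to lists of paths. A minimal sketch of that segmentation and the dictionary shape it produces (the pattern below is an assumption for illustration, not necessarily the one used above):

import re
from collections import defaultdict

HOSTINFO = re.compile(r'(https?://[^/]+)')  # assumed stand-in for the pattern used above

inputdict = defaultdict(list)
for url in ('https://example.org/page1', 'https://example.org/page2',
            'https://other.net/start'):
    _, hostinfo, urlpath = HOSTINFO.split(url)
    inputdict[hostinfo].append(urlpath)

print(dict(inputdict))
# {'https://example.org': ['/page1', '/page2'], 'https://other.net': ['/start']}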
Example #5
def convert_inputlist(blacklist, inputlist, url_filter=None, inputdict=None):
    '''Add input URLs to domain-aware processing dictionary'''
    # control
    if inputdict is None:
        inputdict = defaultdict(list)
    # filter
    if blacklist:
        inputlist = [u for u in inputlist if re.sub(r'https?://', '', u) not in blacklist]
    if url_filter:
        filtered_list = []
        while inputlist:
            u = inputlist.pop()
            for f in url_filter:
                if f in u:
                    filtered_list.append(u)
                    break
        inputlist = filtered_list
    # validate
    inputlist = [u for u in inputlist if validate_url(u)[0] is True]
    # deduplicate
    for url in list(OrderedDict.fromkeys(inputlist)):
        # segment URL and add to domain dictionary
        try:
            _, hostinfo, urlpath = HOSTINFO.split(url)
            inputdict[hostinfo].append(urlpath)
        except ValueError:
            LOGGER.warning('Could not parse URL, discarding: %s', url)
    return inputdict
Example #6
def load_blacklist(filename):
    '''Read list of unwanted URLs'''
    blacklist = set()
    with open(filename, mode='r', encoding='utf-8') as inputfh:
        for line in inputfh:
            url = line.strip()
            if validate_url(url)[0] is True:
                blacklist.add(re.sub(r'^https?://', '', url))
    return blacklist
Example #7
def url_processing_checks(blacklist, input_urls):
    '''Filter and deduplicate input URLs'''
    # control blacklist
    if blacklist:
        input_urls = [u for u in input_urls if u not in blacklist]
    # check for invalid URLs
    if input_urls:
        input_urls = [u for u in input_urls if validate_url(u)[0] is True]
    # deduplicate
    if input_urls:
        return list(OrderedDict.fromkeys(input_urls))
    LOGGER.error('No URLs to process, invalid or blacklisted input')
    return []
Example #8
def determine_feed(htmlstring, baseurl, reference):
    '''Try to extract the feed URL from the home page.
       Adapted from http://www.aaronsw.com/2002/feedfinder/'''
    # parse the page to look for feeds
    tree = load_html(htmlstring)
    # safeguard
    if tree is None:
        LOGGER.debug('Invalid HTML/Feed page: %s', baseurl)
        return []
    feed_urls = []
    for linkelem in tree.xpath('//link[@rel="alternate"]'):
        # discard elements without links
        if 'href' not in linkelem.attrib:
            continue
        # most common case
        if 'type' in linkelem.attrib and linkelem.get('type') in FEED_TYPES:
            feed_urls.append(linkelem.get('href'))
        # websites like geo.de
        elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get('href'):
            feed_urls.append(linkelem.get('href'))
    # backup
    if not feed_urls:
        for linkelem in tree.xpath('//a[@href]'):
            if linkelem.get('href')[-4:].lower() in ('.rss', '.rdf', '.xml'):
                feed_urls.append(linkelem.get('href'))
            elif linkelem.get('href')[-5:].lower() == '.atom':
                feed_urls.append(linkelem.get('href'))
            elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get(
                    'href'):
                feed_urls.append(linkelem.get('href'))
    # refine
    output_urls = []
    for link in sorted(set(feed_urls)):
        link = fix_relative_urls(baseurl, link)
        link = clean_url(link)
        if link == reference or validate_url(link)[0] is False:
            continue
        if BLACKLIST.search(link):
            continue
        output_urls.append(link)
    # log result
    LOGGER.debug('Feed URLs found: %s of which %s valid', len(feed_urls),
                 len(output_urls))
    return output_urls
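The loop over //link[@rel="alternate"] is the core of the discovery step; FEED_TYPES presumably holds feed MIME types such as application/rss+xml and application/atom+xml. A tiny self-contained illustration of that lookup, using lxml directly instead of the load_html helper:

from lxml import html

page = html.fromstring(
    '<html><head>'
    '<link rel="alternate" type="application/rss+xml" href="/feed.xml"/>'
    '<link rel="alternate" href="/newsletter"/>'
    '</head><body></body></html>'
)
for linkelem in page.xpath('//link[@rel="alternate"]'):
    if 'href' in linkelem.attrib and 'xml' in linkelem.get('type', ''):
        print(linkelem.get('href'))  # prints /feed.xml only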
Example #9
def add_to_compressed_dict(inputlist,
                           blacklist=None,
                           url_filter=None,
                           inputdict=None):
    '''Filter, convert input URLs and add them to domain-aware processing dictionary'''
    # init
    if inputdict is None:
        inputdict = defaultdict(deque)
    # deduplicate while keeping order
    inputlist = list(OrderedDict.fromkeys(inputlist))
    # filter
    if blacklist:
        inputlist = [
            u for u in inputlist
            if re.sub(r'https?://', '', u) not in blacklist
        ]
    if url_filter:
        filtered_list = []
        while inputlist:
            u = inputlist.pop()
            for f in url_filter:
                if f in u:
                    filtered_list.append(u)
                    break
        inputlist = filtered_list
    # validate and store in dict
    for url in inputlist:
        # validate URL
        if validate_url(url)[0] is False:
            continue
        # segment URL and add to domain dictionary
        try:
            hostinfo, urlpath = get_host_and_path(url)
            inputdict[hostinfo].append(urlpath)
        except ValueError:
            LOGGER.warning('Could not parse URL, discarding: %s', url)
    return inputdict
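Compared with the earlier variants, this version delegates segmentation to get_host_and_path and stores paths in a deque. A minimal sketch of that segmentation step, assuming get_host_and_path is importable from the courlan package:

from courlan import get_host_and_path  # assumption: part of courlan's public API

hostinfo, urlpath = get_host_and_path('https://example.org/blog/post.html?page=2')
print(hostinfo)  # host part, e.g. 'https://example.org'
print(urlpath)   # remainder of the URL (path and query)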
Example #10
def determine_feed(htmlstring, baseurl, reference):
    '''Try to extract the feed URL from the home page'''
    feed_urls = []
    # try to find RSS URL
    for feed_url in re.findall(
            r'<link[^<>]+?type="application/rss\+xml"[^<>]+?href="(.+?)"',
            htmlstring):
        feed_urls.append(feed_url)
    for feed_url in re.findall(
            r'<link[^<>]+?href="(.+?)"[^<>]+?type="application/rss\+xml"',
            htmlstring):
        feed_urls.append(feed_url)
    # try to find Atom URL
    if len(feed_urls) == 0:
        for feed_url in re.findall(
                r'<link[^<>]+?type="application/atom\+xml"[^<>]+?href="(.+?)"',
                htmlstring):
            feed_urls.append(feed_url)
        for feed_url in re.findall(
                r'<link[^<>]+?href="(.+?)"[^<>]+?type="application/atom\+xml"',
                htmlstring):
            feed_urls.append(feed_url)
    # refine
    output_urls = []
    for link in sorted(set(feed_urls)):
        link = fix_relative_urls(baseurl, link)
        link = clean_url(link)
        if link == reference or validate_url(link)[0] is False:
            continue
        if 'comments' in link:
            continue
        output_urls.append(link)
    # log result
    LOGGER.debug('Feed URLs found: %s of which %s valid', len(feed_urls),
                 len(output_urls))
    return output_urls
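Since this variant works on the raw HTML string, the two regular expressions per feed type cover both attribute orders (type before href and href before type). A quick self-contained check of the first RSS pattern on a sample header:

import re

htmlstring = ('<head><link rel="alternate" type="application/rss+xml" '
              'href="https://example.org/feed"/></head>')
pattern = r'<link[^<>]+?type="application/rss\+xml"[^<>]+?href="(.+?)"'
print(re.findall(pattern, htmlstring))
# ['https://example.org/feed']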