def test_validate():
    assert validate_url('http://www.test[.org/test')[0] is False
    # assert validate_url('http://www.test.org:7ERT/test')[0] is False
    assert validate_url('ntp://www.test.org/test')[0] is False
    assert validate_url('ftps://www.test.org/test')[0] is False
    assert validate_url('http://t.g/test')[0] is False
    assert validate_url('http://test.org/test')[0] is True
def test_examples():
    '''test README examples'''
    assert check_url('https://github.com/adbar/courlan') == ('https://github.com/adbar/courlan', 'github.com')
    assert check_url('https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.org', strict=True) == ('https://httpbin.org/redirect-to', 'httpbin.org')
    assert clean_url('HTTPS://WWW.DWDS.DE:80/') == 'https://www.dwds.de'
    assert validate_url('http://1234') == (False, None)
    assert validate_url('http://www.example.org/')[0] is True
    assert normalize_url('http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment', strict=True) == 'http://test.net/foo.html?page=2&post=abc'
def test_extension_filter():
    validation_test, parsed_url = validate_url('http://www.example.org/test.js')
    assert extension_filter(parsed_url.path) is False
    validation_test, parsed_url = validate_url('http://goodbasic.com/GirlInfo.aspx?Pseudo=MilfJanett')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url('https://www.familienrecht-allgaeu.de/de/vermoegensrecht.amp')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url('http://www.example.org/test.shtml')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url('http://de.artsdot.com/ADC/Art.nsf/O/8EWETN')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url('http://de.artsdot.com/ADC/Art.nsf?param1=test')
    assert extension_filter(parsed_url.path) is False
    validation_test, parsed_url = validate_url('http://www.example.org/test.xhtml?param1=this')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url('http://www.example.org/test.php5')
    assert extension_filter(parsed_url.path) is True
    validation_test, parsed_url = validate_url('http://www.example.org/test.php6')
    assert extension_filter(parsed_url.path) is True
def load_input_dict(filename, blacklist):
    '''Read input list of URLs to process and build domain-aware processing dictionary'''
    inputdict = defaultdict(list)
    try:
        # optional: errors='strict', buffering=1
        with open(filename, mode='r', encoding='utf-8') as inputfile:
            for line in inputfile:
                # control input validity
                url_match = re.match(r'https?://[^\s]+', line)
                if url_match:
                    url = url_match.group(0)
                    # validation
                    if validate_url(url)[0] is False:
                        LOGGER.warning('Invalid URL, discarding line: %s', line)
                        continue
                    # control blacklist
                    if blacklist:
                        if re.sub(r'^https?://', '', url) in blacklist:
                            continue
                    # segment URL and add to domain dictionary
                    try:
                        _, hostinfo, urlpath = HOSTINFO.split(url)
                        inputdict[hostinfo].append(urlpath)
                    except ValueError:
                        LOGGER.warning('Could not parse URL, discarding line: %s', line)
                else:
                    LOGGER.warning('Not an URL, discarding line: %s', line)
    except UnicodeDecodeError:
        sys.exit('ERROR: system, file type or buffer encoding')
    # deduplicate
    for hostname in inputdict:
        inputdict[hostname] = list(OrderedDict.fromkeys(inputdict[hostname]))
    return inputdict
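# Illustrative sketch, not part of the original code base: it assumes HOSTINFO splits
# 'https://www.example.org/page' into ('', 'https://www.example.org', '/page'), so a
# small input file is grouped by host as shown in the comment below.
def _example_load_input_dict():
    '''Hypothetical usage of load_input_dict on a small temporary file.'''
    from tempfile import NamedTemporaryFile
    with NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as tmpfile:
        tmpfile.write('https://www.example.org/page1\nhttps://www.example.org/page2\nnot-a-url\n')
        filename = tmpfile.name
    urldict = load_input_dict(filename, blacklist=set())
    # expected shape: {'https://www.example.org': ['/page1', '/page2']}
    return urldict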
def convert_inputlist(blacklist, inputlist, url_filter=None, inputdict=None):
    '''Add input URLs to domain-aware processing dictionary'''
    # control
    if inputdict is None:
        inputdict = defaultdict(list)
    # filter
    if blacklist:
        inputlist = [u for u in inputlist if re.sub(r'https?://', '', u) not in blacklist]
    if url_filter:
        filtered_list = []
        while inputlist:
            u = inputlist.pop()
            for f in url_filter:
                if f in u:
                    filtered_list.append(u)
                    break
        inputlist = filtered_list
    # validate
    inputlist = [u for u in inputlist if validate_url(u)[0] is True]
    # deduplicate
    for url in list(OrderedDict.fromkeys(inputlist)):
        # segment URL and add to domain dictionary
        try:
            _, hostinfo, urlpath = HOSTINFO.split(url)
            inputdict[hostinfo].append(urlpath)
        except ValueError:
            LOGGER.warning('Could not parse URL, discarding: %s', url)
    return inputdict
def load_blacklist(filename):
    '''Read list of unwanted URLs'''
    blacklist = set()
    with open(filename, mode='r', encoding='utf-8') as inputfh:
        for line in inputfh:
            url = line.strip()
            if validate_url(url)[0] is True:
                blacklist.add(re.sub(r'^https?://', '', url))
    return blacklist
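# Minimal sketch, added for illustration only: one URL per line in the input file;
# the scheme is stripped so that http and https variants of a page match later on.
def _example_load_blacklist():
    '''Hypothetical usage of load_blacklist on a small temporary file.'''
    from tempfile import NamedTemporaryFile
    with NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as tmpfile:
        tmpfile.write('https://www.example.org/unwanted\nhttp://www.example.org/other\n')
        filename = tmpfile.name
    blacklist = load_blacklist(filename)
    # expected: {'www.example.org/unwanted', 'www.example.org/other'}
    return blacklist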
def url_processing_checks(blacklist, input_urls):
    '''Filter and deduplicate input urls'''
    # control blacklist
    if blacklist:
        input_urls = [u for u in input_urls if u not in blacklist]
    # check for invalid URLs
    if input_urls:
        input_urls = [u for u in input_urls if validate_url(u)[0] is True]
    # deduplicate
    if input_urls:
        return list(OrderedDict.fromkeys(input_urls))
    LOGGER.error('No URLs to process, invalid or blacklisted input')
    return []
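# Hedged example, not from the original code base: blacklisted, invalid and duplicate
# entries are dropped while the order of the surviving URLs is preserved. Note that
# this function compares full URLs against the blacklist, without stripping the scheme.
def _example_url_processing_checks():
    '''Hypothetical call showing filtering and deduplication.'''
    blacklist = {'http://www.example.org/spam'}
    urls = [
        'http://www.example.org/1',
        'http://www.example.org/1',      # duplicate, removed
        'http://www.example.org/spam',   # blacklisted, removed
        'http://www.test[.org/test',     # invalid, removed by validate_url
    ]
    return url_processing_checks(blacklist, urls)
    # expected: ['http://www.example.org/1']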
def determine_feed(htmlstring, baseurl, reference):
    '''Try to extract the feed URL from the home page.
       Adapted from http://www.aaronsw.com/2002/feedfinder/'''
    # parse the page to look for feeds
    tree = load_html(htmlstring)
    # safeguard
    if tree is None:
        LOGGER.debug('Invalid HTML/Feed page: %s', baseurl)
        return []
    feed_urls = []
    for linkelem in tree.xpath('//link[@rel="alternate"]'):
        # discard elements without links
        if 'href' not in linkelem.attrib:
            continue
        # most common case
        if 'type' in linkelem.attrib and linkelem.get('type') in FEED_TYPES:
            feed_urls.append(linkelem.get('href'))
        # websites like geo.de
        elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get('href'):
            feed_urls.append(linkelem.get('href'))
    # backup
    if not feed_urls:
        for linkelem in tree.xpath('//a[@href]'):
            if linkelem.get('href')[-4:].lower() in ('.rss', '.rdf', '.xml'):
                feed_urls.append(linkelem.get('href'))
            elif linkelem.get('href')[-5:].lower() == '.atom':
                feed_urls.append(linkelem.get('href'))
            elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get('href'):
                feed_urls.append(linkelem.get('href'))
    # refine
    output_urls = []
    for link in sorted(set(feed_urls)):
        link = fix_relative_urls(baseurl, link)
        link = clean_url(link)
        if link == reference or validate_url(link)[0] is False:
            continue
        if BLACKLIST.search(link):
            continue
        output_urls.append(link)
    # log result
    LOGGER.debug('Feed URLs found: %s of which %s valid', len(feed_urls), len(output_urls))
    return output_urls
def add_to_compressed_dict(inputlist, blacklist=None, url_filter=None, inputdict=None):
    '''Filter, convert input URLs and add them to domain-aware processing dictionary'''
    # init
    if inputdict is None:
        inputdict = defaultdict(deque)
    # deduplicate while keeping order
    inputlist = list(OrderedDict.fromkeys(inputlist))
    # filter
    if blacklist:
        inputlist = [u for u in inputlist if re.sub(r'https?://', '', u) not in blacklist]
    if url_filter:
        filtered_list = []
        while inputlist:
            u = inputlist.pop()
            for f in url_filter:
                if f in u:
                    filtered_list.append(u)
                    break
        inputlist = filtered_list
    # validate and store in dict
    for url in inputlist:
        # validate URL
        if validate_url(url)[0] is False:
            continue
        # segment URL and add to domain dictionary
        try:
            hostinfo, urlpath = get_host_and_path(url)
            inputdict[hostinfo].append(urlpath)
        except ValueError:
            LOGGER.warning('Could not parse URL, discarding: %s', url)
    return inputdict
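# Sketch under assumptions, not part of the original code base: get_host_and_path is
# taken to return e.g. ('https://www.example.org', '/page1'), so the result maps each
# host to a deque of paths ready for domain-aware processing.
def _example_add_to_compressed_dict():
    '''Hypothetical call combining deduplication and substring filtering.'''
    urls = [
        'https://www.example.org/page1',
        'https://www.example.org/page1',       # duplicate, removed
        'https://www.example.org/category/x',  # kept by url_filter
        'https://www.other.org/page2',         # filtered out (no match)
    ]
    return add_to_compressed_dict(urls, blacklist=set(), url_filter=['example.org'])
    # expected: {'https://www.example.org': deque(['/category/x', '/page1'])}
    # (the pop-based filter reverses the input order)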
def determine_feed(htmlstring, baseurl, reference):
    '''Try to extract the feed URL from the home page'''
    feed_urls = []
    # try to find RSS URL
    for feed_url in re.findall(r'<link[^<>]+?type="application/rss\+xml"[^<>]+?href="(.+?)"', htmlstring):
        feed_urls.append(feed_url)
    for feed_url in re.findall(r'<link[^<>]+?href="(.+?)"[^<>]+?type="application/rss\+xml"', htmlstring):
        feed_urls.append(feed_url)
    # try to find Atom URL
    if len(feed_urls) == 0:
        for feed_url in re.findall(r'<link[^<>]+?type="application/atom\+xml"[^<>]+?href="(.+?)"', htmlstring):
            feed_urls.append(feed_url)
        for feed_url in re.findall(r'<link[^<>]+?href="(.+?)"[^<>]+?type="application/atom\+xml"', htmlstring):
            feed_urls.append(feed_url)
    # refine
    output_urls = []
    for link in sorted(list(set(feed_urls))):
        link = fix_relative_urls(baseurl, link)
        link = clean_url(link)
        if link == reference or validate_url(link)[0] is False:
            continue
        if 'comments' in link:
            continue
        output_urls.append(link)
    # log result
    LOGGER.debug('Feed URLs found: %s of which %s valid', len(feed_urls), len(output_urls))
    return output_urls
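# Hedged sketch, illustration only: a minimal home page carrying one RSS <link>
# element; fix_relative_urls, clean_url and validate_url are assumed to be the
# helpers used elsewhere in this code.
def _example_determine_feed():
    '''Hypothetical call to the regex-based determine_feed variant.'''
    htmlstring = (
        '<html><head>'
        '<link rel="alternate" type="application/rss+xml" href="/feed.xml"/>'
        '</head><body></body></html>'
    )
    return determine_feed(htmlstring, 'https://www.example.org', 'https://www.example.org')
    # expected: ['https://www.example.org/feed.xml']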