def test_urlutils():
    '''Test URL manipulation tools'''
    # domain extraction
    assert extract_domain('h') is None
    assert extract_domain('https://httpbin.org/') == 'httpbin.org'
    # url parsing
    result = _parse('https://httpbin.org/')
    assert isinstance(result, ParseResult)
    newresult = _parse(result)
    assert isinstance(newresult, ParseResult)
    with pytest.raises(TypeError):
        result = _parse(float(1.23))
    assert get_base_url('https://example.org/path') == 'https://example.org'
    with pytest.raises(ValueError):
        assert get_host_and_path('123') is None
    assert get_host_and_path('https://example.org/path') == ('https://example.org', '/path')
    assert get_host_and_path('https://example.org/') == ('https://example.org', '/')
    assert get_host_and_path('https://example.org') == ('https://example.org', '/')
    assert get_hostinfo('https://httpbin.org/') == ('httpbin.org', 'https://httpbin.org')
    assert get_hostinfo('https://example.org/path') == ('example.org', 'https://example.org')
    # keeping track of known URLs
    known_links = {'https://test.org'}
    assert is_known_link('https://test.org/1', known_links) is False
    assert is_known_link('https://test.org', known_links) is True
    assert is_known_link('http://test.org', known_links) is True
    assert is_known_link('http://test.org/', known_links) is True
    assert is_known_link('https://test.org/', known_links) is True

def handle_link(link, sitemapurl, domainname, baseurl, target_lang):
    '''Examine a link and determine if it's valid and if it leads
       to a sitemap or a web page.'''
    state = '0'
    # safety net: recursivity
    if link == sitemapurl:
        return link, state
    # fix and check
    link = fix_relative_urls(baseurl, link)
    # clean and normalize
    link = clean_url(link, target_lang)
    if link is not None:
        if lang_filter(link, target_lang) is True:
            newdomain = extract_domain(link)
            if newdomain is not None:
                # don't take links from another domain and make an exception for main platforms
                if newdomain != domainname and not WHITELISTED_PLATFORMS.search(newdomain):
                    LOGGER.warning('Diverging domain names: %s %s', domainname, newdomain)
                else:
                    if re.search(r'\.xml$|\.xml[.?#]', link):
                        state = 'sitemap'
                    else:
                        state = 'link'
            else:
                LOGGER.error("Couldn't extract domain: %s", link)
    return link, state

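# A minimal, self-contained illustration of the sitemap-vs-page distinction used in
# handle_link above (the regex is copied from the function; the sample links are hypothetical).
import re

for candidate in ('https://example.org/sitemap.xml',
                  'https://example.org/sitemap.xml?page=2',
                  'https://example.org/blog/post'):
    kind = 'sitemap' if re.search(r'\.xml$|\.xml[.?#]', candidate) else 'link'
    print(candidate, '->', kind)
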
def sitemap_search(url, target_lang=None):
    'Look for sitemaps for the given URL and gather links.'
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    if url.endswith('.xml') or url.endswith('.gz') or url.endswith('sitemap'):
        sitemapurl = url
    else:
        sitemapurl = url.rstrip('/') + '/sitemap.xml'
    sitemapurls, linklist = process_sitemap(sitemapurl, domainname, baseurl, target_lang)
    if sitemapurls == [] and len(linklist) > 0:
        return linklist
    if sitemapurls == [] and linklist == []:
        for sitemapurl in find_robots_sitemaps(url, baseurl):
            tmp_sitemapurls, tmp_linklist = process_sitemap(sitemapurl, domainname, baseurl, target_lang)
            sitemapurls.extend(tmp_sitemapurls)
            linklist.extend(tmp_linklist)
    while sitemapurls:
        tmp_sitemapurls, tmp_linklist = process_sitemap(sitemapurls.pop(), domainname, baseurl, target_lang)
        sitemapurls.extend(tmp_sitemapurls)
        linklist.extend(tmp_linklist)
    LOGGER.debug('%s links found for %s', len(linklist), domainname)
    return linklist

def draw_backoff_url(domain_dict, backoff_dict, sleeptime, i):
    '''Select a random URL from the domains pool and apply backoff rule'''
    host = random.choice(list(domain_dict))
    domain = extract_domain(host)
    # safeguard
    if domain in backoff_dict and \
        (datetime.now() - backoff_dict[domain]).total_seconds() < sleeptime:
        i += 1
        if i >= len(domain_dict)*3:
            LOGGER.debug('spacing request for domain name %s', domain)
            sleep(sleeptime)
            i = 0
    # draw URL
    url = host + domain_dict[host].pop()
    # clean registries
    if not domain_dict[host]:
        del domain_dict[host]
        try:
            del backoff_dict[domain]
        except KeyError:
            pass
    # register backoff
    else:
        backoff_dict[domain] = datetime.now()
    return url, domain_dict, backoff_dict, i

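# A minimal usage sketch for draw_backoff_url (hypothetical data): as implied by
# `url = host + domain_dict[host].pop()`, the pool maps a host prefix to path suffixes.
# Assumes the helpers used inside (extract_domain, LOGGER, sleep, datetime, random)
# are available from the surrounding module.
domain_dict = {'https://example.org': ['/page2', '/page1']}
backoff_dict, i = dict(), 0
while domain_dict:
    url, domain_dict, backoff_dict, i = draw_backoff_url(domain_dict, backoff_dict, 2, i)
    print(url)  # e.g. 'https://example.org/page1', then 'https://example.org/page2'
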
def find_feed_urls(url, target_lang=None):
    """Try to find feed URLs.

    Args:
        url: Homepage or feed URL as string.
        target_lang: Define a language to filter URLs based on heuristics
            (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as a list (sorted list of unique links).

    """
    url = url.rstrip('/')
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    downloaded = fetch_url(url)
    if downloaded is None:
        LOGGER.warning('Could not download web page: %s', url)
        return []
    # assume it's a web page
    feed_links = []
    for feed in determine_feed(downloaded, baseurl, url):
        sleep(SLEEP_TIME)
        feed_string = fetch_url(feed)
        feed_links.extend(extract_links(feed_string, domainname, baseurl, url, target_lang))
    feed_links = sorted(list(set(feed_links)))
    LOGGER.debug('%s feed links found for %s', len(feed_links), domainname)
    return feed_links

def url_processing_pipeline(args, input_urls, sleeptime):
    '''Aggregated functions to show a list and download and process an input list'''
    input_urls = url_processing_checks(args.blacklist, input_urls)
    # print list without further processing
    if args.list:
        for url in input_urls:
            write_result(url, args)  # print('\n'.join(input_urls))
        return None
    # initialize file counter if necessary
    if len(input_urls) > MAX_FILES_PER_DIRECTORY:
        counter = 0
    else:
        counter = None
    # build domain-aware processing list
    domain_dict = dict()
    while input_urls:
        url = input_urls.pop()
        domain_name = extract_domain(url)
        if domain_name not in domain_dict:
            domain_dict[domain_name] = []
        domain_dict[domain_name].append(url)
    if len(domain_dict) <= 5:
        backoff_dict = dict()
        single_threaded_processing(domain_dict, backoff_dict, args, sleeptime, counter)
    else:
        multi_threaded_processing(domain_dict, args, sleeptime, counter)

def test_urlutils():
    '''Test URL manipulation tools'''
    assert extract_domain('https://httpbin.org/') == 'httpbin.org'
    assert get_base_url('https://example.org/path') == 'https://example.org'
    assert get_host_and_path('https://example.org/path') == ('https://example.org', '/path')
    assert get_hostinfo('https://example.org/path') == ('example.org', 'https://example.org')
    assert get_hostinfo('https://httpbin.org/') == ('httpbin.org', 'https://httpbin.org')

def find_feed_urls(url, target_lang=None):
    """Try to find feed URLs.

    Args:
        url: Webpage or feed URL as string.
            Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
            (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as a list (sorted list of unique links).

    """
    url = url.rstrip('/')
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    urlfilter = None
    downloaded = fetch_url(url)
    if downloaded is not None:
        # assume it's a feed
        feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang)
        if len(feed_links) == 0:
            # assume it's a web page
            for feed in determine_feed(downloaded, baseurl, url):
                feed_string = fetch_url(feed)
                feed_links.extend(extract_links(feed_string, domainname, baseurl, url, target_lang))
            # filter triggered, prepare it
            if len(url) > len(baseurl) + 2:
                urlfilter = url
        # return links found
        if len(feed_links) > 0:
            feed_links = filter_urls(feed_links, urlfilter)
            LOGGER.debug('%s feed links found for %s', len(feed_links), domainname)
            return feed_links
    else:
        LOGGER.warning('Could not download web page: %s', url)
    # try alternative: Google News
    if target_lang is not None:
        url = 'https://news.google.com/rss/search?q=site:' + baseurl + '&hl=' + target_lang + '&scoring=n&num=100'
        downloaded = fetch_url(url)
        if downloaded is not None:
            feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang)
            feed_links = filter_urls(feed_links, urlfilter)
            LOGGER.debug('%s feed links found for %s', len(feed_links), domainname)
            return feed_links
        LOGGER.warning('Could not download web page: %s', url)
    return []

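# A small, self-contained illustration of the Google News fallback URL assembled above
# (baseurl and target_lang values are hypothetical; the query string is taken from the code).
baseurl, target_lang = 'https://example.org', 'en'
gnews_url = 'https://news.google.com/rss/search?q=site:' + baseurl + '&hl=' + target_lang + '&scoring=n&num=100'
print(gnews_url)
# https://news.google.com/rss/search?q=site:https://example.org&hl=en&scoring=n&num=100
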
def sitemap_search(url, target_lang=None):
    """Look for sitemaps for the given URL and gather links.

    Args:
        url: Webpage or sitemap URL as string.
            Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
            (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as a list (sorted list of unique links).

    """
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    urlfilter = None
    # determine sitemap URL
    if url.endswith('.xml') or url.endswith('.gz') or url.endswith('sitemap'):
        sitemapurl = url
    else:
        sitemapurl = baseurl + '/sitemap.xml'
        # filter triggered, prepare it
        if len(url) > len(baseurl) + 2:
            urlfilter = url
    sitemapurls, linklist = download_and_process_sitemap(sitemapurl, domainname, baseurl, target_lang)
    if sitemapurls == [] and len(linklist) > 0:
        linklist = filter_urls(linklist, urlfilter)
        LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
        return linklist
    # try sitemaps in robots.txt file if nothing has been found
    if sitemapurls == [] and linklist == []:
        sitemapurls = find_robots_sitemaps(baseurl)
        # try additional URLs just in case
        if sitemapurls == []:
            sitemapurls = [''.join([baseurl, '/', g]) for g in GUESSES]
    # iterate through nested sitemaps and results
    i = 1
    while sitemapurls:
        sitemapurls, linklist = download_and_process_sitemap(sitemapurls.pop(), domainname, baseurl, target_lang, sitemapurls, linklist)
        # counter and safeguard
        i += 1
        if i > MAX_SITEMAPS_SEEN:
            break
    linklist = filter_urls(linklist, urlfilter)
    LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
    return linklist

def multi_threaded_processing(domain_dict, args, sleeptime, counter):
    '''Implement a multi-threaded processing algorithm'''
    i = 0
    backoff_dict = dict()
    download_threads = args.parallel or DOWNLOAD_THREADS
    while domain_dict:
        # the remaining list is too small, process it differently
        if len({x for v in domain_dict.values() for x in v}) < download_threads:
            single_threaded_processing(domain_dict, backoff_dict, args, sleeptime, counter)
            return
        # populate buffer
        bufferlist, bufferdomains = list(), set()
        while len(bufferlist) < download_threads:
            domain = random.choice(list(domain_dict))
            if domain not in backoff_dict or \
                (datetime.now() - backoff_dict[domain]).total_seconds() > sleeptime:
                bufferlist.append(domain_dict[domain].pop())
                bufferdomains.add(domain)
                backoff_dict[domain] = datetime.now()
            # safeguard
            else:
                i += 1
                if i > len(domain_dict) * 3:
                    LOGGER.debug('spacing request for domain name %s', domain)
                    sleep(sleeptime)
                    i = 0
        # start several threads
        with ThreadPoolExecutor(max_workers=download_threads) as executor:
            future_to_url = {executor.submit(fetch_url, url): url for url in bufferlist}
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                # register in backoff dictionary to ensure time between requests
                domain = extract_domain(url)
                backoff_dict[domain] = datetime.now()
                # handle result
                counter = process_result(future.result(), args, url, counter)
        # clean registries
        for domain in bufferdomains:
            if not domain_dict[domain]:
                del domain_dict[domain]
                del backoff_dict[domain]

def find_feed_urls(url, target_lang=None):
    '''Try to find feed URLs'''
    url = url.rstrip('/')
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    downloaded = fetch_url(url)
    if downloaded is None:
        LOGGER.warning('Could not download web page: %s', url)
        return []
    # assume it's a web page
    feed_links = []
    for feed in determine_feed(downloaded, baseurl, url):
        sleep(SLEEP_TIME)
        feed_string = fetch_url(feed)
        feed_links.extend(extract_links(feed_string, domainname, baseurl, url, target_lang))
    return sorted(list(set(feed_links)))

def sitemap_search(url, target_lang=None):
    """Look for sitemaps for the given URL and gather links.

    Args:
        url: Homepage or sitemap URL as string.
        target_lang: Define a language to filter URLs based on heuristics
            (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as a list (sorted list of unique links).

    """
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    if url.endswith('.xml') or url.endswith('.gz') or url.endswith('sitemap'):
        sitemapurl = url
    else:
        sitemapurl = url.rstrip('/') + '/sitemap.xml'
    sitemapurls, linklist = download_and_process_sitemap(sitemapurl, domainname, baseurl, target_lang)
    if sitemapurls == [] and len(linklist) > 0:
        return linklist
    # try sitemaps in robots.txt file if nothing has been found
    if sitemapurls == [] and linklist == []:
        sitemapurls = find_robots_sitemaps(url, baseurl)
    # iterate through nested sitemaps and results
    i = 1
    while sitemapurls:
        sitemapurls, linklist = download_and_process_sitemap(sitemapurls.pop(), domainname, baseurl, target_lang, sitemapurls, linklist)
        # counter and safeguard
        i += 1
        if i > MAX_SITEMAPS_SEEN:
            break
    linklist = sorted(list(set(linklist)))
    LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
    return linklist

def url_processing_pipeline(args, input_urls, sleeptime):
    '''Aggregated functions to show a list and download and process an input list'''
    input_urls = url_processing_checks(args.blacklist, input_urls)
    # print list without further processing
    if args.list:
        for url in input_urls:
            write_result(url, args)  # print('\n'.join(input_urls))
        return None
    # initialize file counter if necessary
    if len(input_urls) > MAX_FILES_PER_DIRECTORY:
        counter = 0
    else:
        counter = None
    # build domain-aware processing list
    domain_dict = dict()
    while input_urls:
        url = input_urls.pop()
        domain_name = extract_domain(url)
        if domain_name not in domain_dict:
            domain_dict[domain_name] = []
        domain_dict[domain_name].append(url)
    if len(domain_dict) <= 5:
        errors, counter = single_threaded_processing(domain_dict, dict(), args, sleeptime, counter)
    else:
        errors, counter = multi_threaded_processing(domain_dict, args, sleeptime, counter)
    LOGGER.debug('%s URLs could not be found', len(errors))
    # option to retry
    if args.archived is True:
        domain_dict = dict()
        domain_dict['archive.org'] = ['https://web.archive.org/web/20/' + e for e in errors]
        archived_errors, _ = single_threaded_processing(domain_dict, dict(), args, sleeptime, counter)
        LOGGER.debug('%s archived URLs out of %s could not be found', len(archived_errors), len(errors))

def sitemap_search(url, target_lang=None):
    'Look for sitemaps for the given URL and gather links.'
    domain = extract_domain(url)
    if url.endswith('.xml') or url.endswith('sitemap'):
        sitemapurl = url
    else:
        sitemapurl = url.rstrip('/') + '/sitemap.xml'
    sitemapurls, linklist = process_sitemap(sitemapurl, domain, target_lang)
    if sitemapurls == [] and len(linklist) > 0:
        return linklist
    if sitemapurls == [] and linklist == []:
        for sitemapurl in find_robots_sitemaps(url):
            tmp_sitemapurls, tmp_linklist = process_sitemap(sitemapurl, domain, target_lang)
            sitemapurls.extend(tmp_sitemapurls)
            linklist.extend(tmp_linklist)
    while sitemapurls:
        tmp_sitemapurls, tmp_linklist = process_sitemap(sitemapurls.pop(), domain, target_lang)
        sitemapurls.extend(tmp_sitemapurls)
        linklist.extend(tmp_linklist)
    LOGGER.debug('%s links found for %s', len(linklist), domain)
    return linklist

def handle_link(link, sitemapurl, domainname, baseurl, target_lang=None):
    '''Examine a link and determine if it's valid and if it leads
       to a sitemap or a web page.'''
    state = '0'
    # safety net: recursivity
    if link == sitemapurl:
        return link, state
    # fix and check
    link = fix_relative_urls(baseurl, link)
    # clean and normalize
    link = clean_url(link, target_lang)
    if link is not None:
        if lang_filter(link, target_lang) is True:
            newdomain = extract_domain(link)
            if newdomain != domainname:
                LOGGER.warning('Diverging domain names: %s %s', domainname, newdomain)
            else:
                if re.search(r'\.xml$|\.xml[.?#]', link):
                    state = 'sitemap'
                else:
                    state = 'link'
    return link, state

def get_hostinfo(url):
    """Extract domain and host info (protocol + host/domain) from a URL."""
    domainname = extract_domain(url)
    parsed_url = urlparse(url)
    host = parsed_url._replace(path='', params='', query='', fragment='')
    return domainname, host.geturl()

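# Minimal usage sketch for get_hostinfo, mirroring the expectations in the tests above
# (assumes extract_domain and urlparse are available in the surrounding module).
print(get_hostinfo('https://example.org/path'))  # ('example.org', 'https://example.org')
print(get_hostinfo('https://httpbin.org/'))      # ('httpbin.org', 'https://httpbin.org')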