def test_fix_relative():
    '''Test the conversion of relative and partial URLs to absolute URLs.'''
    assert fix_relative_urls('https://example.org', 'page.html') == 'https://example.org/page.html'
    assert fix_relative_urls('http://example.org', '//example.org/page.html') == 'http://example.org/page.html'
    assert fix_relative_urls('https://example.org', './page.html') == 'https://example.org/page.html'
    assert fix_relative_urls('https://example.org', '/page.html') == 'https://example.org/page.html'
    # fixing partial URLs
    assert fix_relative_urls('https://example.org', 'https://example.org/test.html') == 'https://example.org/test.html'
    assert fix_relative_urls('https://example.org', '/test.html') == 'https://example.org/test.html'
    assert fix_relative_urls('https://example.org', '//example.org/test.html') == 'https://example.org/test.html'
    assert fix_relative_urls('http://example.org', '//example.org/test.html') == 'http://example.org/test.html'
    assert fix_relative_urls('https://example.org', 'test.html') == 'https://example.org/test.html'
    assert fix_relative_urls('https://example.org', '../../test.html') == 'https://example.org/test.html'
def refresh_detection(htmlstring, homepage):
    "Check if there could be a redirection by meta-refresh tag."
    if '"refresh"' in htmlstring or '"REFRESH"' in htmlstring:
        try:
            html_tree = load_html(htmlstring)
            # test meta-refresh redirection
            # https://stackoverflow.com/questions/2318446/how-to-follow-meta-refreshes-in-python
            attr = html_tree.xpath(
                '//meta[@http-equiv="refresh"]/@content|//meta[@http-equiv="REFRESH"]/@content'
            )[0]
            _, text = attr.split(';')
            text = text.strip()
            # match "url=" case-insensitively without lowercasing the target URL itself
            if text.lower().startswith('url='):
                url2 = text[4:]
                if not url2.startswith('http'):
                    # relative URL: derive the base from the page being processed
                    _, base_url = get_hostinfo(homepage)
                    url2 = fix_relative_urls(base_url, url2)
                # second fetch
                newhtmlstring = fetch_url(url2)
                if newhtmlstring is None:
                    logging.warning('failed redirect: %s', url2)
                    return None, None
                htmlstring, homepage = newhtmlstring, url2
                logging.info('successful redirect: %s', url2)
        except (IndexError, etree.ParserError, etree.XMLSyntaxError, etree.XPathEvalError) as err:
            logging.info('no redirect found: %s %s', homepage, err)
    return htmlstring, homepage
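
# Illustrative sketch (not part of the original module, name is hypothetical):
# a page whose meta-refresh points to a relative URL. refresh_detection should
# resolve it against the homepage and attempt a second fetch of
# https://example.org/new-page.html; the result depends on that network fetch.
def example_refresh_detection():
    htmlstring = ('<html><head>'
                  '<meta http-equiv="refresh" content="0; url=/new-page.html"/>'
                  '</head><body/></html>')
    return refresh_detection(htmlstring, 'https://example.org')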
def handle_link(link, sitemapurl, domainname, baseurl, target_lang):
    '''Examine a link and determine if it's valid and if it leads to
       a sitemap or a web page.'''
    state = '0'
    # safety net: recursivity
    if link == sitemapurl:
        return link, state
    # fix and check
    link = fix_relative_urls(baseurl, link)
    # clean and normalize
    link = clean_url(link, target_lang)
    if link is not None:
        if lang_filter(link, target_lang) is True:
            newdomain = extract_domain(link)
            if newdomain is not None:
                # don't take links from another domain and make an exception for main platforms
                if newdomain != domainname and not WHITELISTED_PLATFORMS.search(newdomain):
                    LOGGER.warning('Diverging domain names: %s %s', domainname, newdomain)
                else:
                    if DETECT_SITEMAP_LINK.search(link):
                        state = 'sitemap'
                    else:
                        state = 'link'
            else:
                LOGGER.error("Couldn't extract domain: %s", link)
    return link, state
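
# Illustrative sketch (not part of the original module, name is hypothetical):
# a relative link found inside a sitemap that itself points to another sitemap.
# Assuming clean_url and lang_filter pass an unproblematic URL through and
# DETECT_SITEMAP_LINK matches it, the expected result is along the lines of
# ('https://example.org/sitemap-posts.xml', 'sitemap').
def example_handle_link():
    return handle_link('/sitemap-posts.xml', 'https://example.org/sitemap.xml',
                       'example.org', 'https://example.org', None)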
def determine_feed(htmlstring, baseurl, reference):
    '''Try to extract the feed URL from the home page.
       Adapted from http://www.aaronsw.com/2002/feedfinder/'''
    # parse the page to look for feeds
    tree = load_html(htmlstring)
    # safeguard
    if tree is None:
        LOGGER.debug('Invalid HTML/Feed page: %s', baseurl)
        return []
    feed_urls = []
    for linkelem in tree.xpath('//link[@rel="alternate"]'):
        # discard elements without links
        if 'href' not in linkelem.attrib:
            continue
        # most common case
        if 'type' in linkelem.attrib and linkelem.get('type') in FEED_TYPES:
            feed_urls.append(linkelem.get('href'))
        # websites like geo.de
        elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get('href'):
            feed_urls.append(linkelem.get('href'))
    # backup
    if not feed_urls:
        for linkelem in tree.xpath('//a[@href]'):
            if linkelem.get('href')[-4:].lower() in ('.rss', '.rdf', '.xml'):
                feed_urls.append(linkelem.get('href'))
            elif linkelem.get('href')[-5:].lower() == '.atom':
                feed_urls.append(linkelem.get('href'))
            elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get('href'):
                feed_urls.append(linkelem.get('href'))
    # refine
    output_urls = []
    for link in sorted(set(feed_urls)):
        link = fix_relative_urls(baseurl, link)
        link = clean_url(link)
        if link == reference or validate_url(link)[0] is False:
            continue
        if BLACKLIST.search(link):
            continue
        output_urls.append(link)
    # log result
    LOGGER.debug('Feed URLs found: %s of which %s valid', len(feed_urls), len(output_urls))
    return output_urls
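
# Illustrative sketch (not part of the original module, name is hypothetical):
# a homepage advertising an RSS feed through a link element. Assuming
# 'application/rss+xml' is listed in FEED_TYPES and the URL passes clean_url,
# validate_url and the BLACKLIST filter, the expected result is along the
# lines of ['https://example.org/feed.xml'].
def example_determine_feed():
    htmlstring = ('<html><head>'
                  '<link rel="alternate" type="application/rss+xml" href="/feed.xml"/>'
                  '</head><body/></html>')
    return determine_feed(htmlstring, 'https://example.org', 'https://example.org/old-feed.xml')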
def handle_link_list(linklist, domainname, baseurl, target_lang=None):
    '''Examine links to determine if they are valid and lead to a web page.'''
    output_links = []
    # sort and uniq
    for item in sorted(set(linklist)):
        # fix and check
        link = fix_relative_urls(baseurl, item)
        # control output for validity
        checked = check_url(link, language=target_lang)
        if checked is not None:
            output_links.append(checked[0])
            if checked[1] != domainname:
                LOGGER.warning('Diverging domain names: %s %s', domainname, checked[1])
        # Feedburner/Google feeds
        elif 'feedburner' in item or 'feedproxy' in item:
            output_links.append(item)
    return output_links
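
# Illustrative sketch (not part of the original module, name is hypothetical):
# mixing a relative link and a Feedburner/Feedproxy address. Assuming check_url
# returns a (url, domain) pair for acceptable links, the relative link is kept
# in resolved form, and the Feedproxy address is kept even if check_url rejects
# it, via the dedicated branch above.
def example_handle_link_list():
    links = ['blog/post-1.html', 'http://feedproxy.google.com/example-feed']
    return handle_link_list(links, 'example.org', 'https://example.org')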
def extract_robots_sitemaps(robotstxt, baseurl):
    'Read a robots.txt file and find sitemap links.'
    # sanity check on length (cause: redirections)
    if robotstxt is None or len(robotstxt) > 10000:
        return []
    sitemapurls = []
    # source: https://github.com/python/cpython/blob/3.8/Lib/urllib/robotparser.py
    for line in robotstxt.splitlines():
        # remove optional comment and strip line
        i = line.find('#')
        if i >= 0:
            line = line[:i]
        line = line.strip()
        if not line:
            continue
        line = line.split(':', 1)
        if len(line) == 2:
            line[0] = line[0].strip().lower()
            if line[0] == "sitemap":
                # urllib.parse.unquote(line[1].strip())
                candidate = fix_relative_urls(baseurl, line[1].strip())
                sitemapurls.append(candidate)
    LOGGER.debug('%s sitemaps found in robots.txt', len(sitemapurls))
    return sitemapurls
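
# Illustrative sketch (not part of the original module, name is hypothetical):
# a minimal robots.txt with a relative sitemap entry. Given the behavior of
# fix_relative_urls shown in the tests above, the expected result is
# ['https://example.org/sitemap.xml'].
def example_extract_robots_sitemaps():
    robotstxt = "User-agent: *\nDisallow: /private/\nSitemap: /sitemap.xml"
    return extract_robots_sitemaps(robotstxt, 'https://example.org')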