def _check_internal(self, tested_url):
    """Check a site-internal URL and record status/message on this link.

    Handles empty links, ``mailto:``, same-page fragments, media files on
    disk, self-referential hash anchors, and server-relative paths (fetched
    through the Django test client). Always stamps ``last_checked`` and
    saves the model at the end.
    """
    from linkcheck.utils import LinkCheckHandler
    if not tested_url:
        self.message = 'Empty link'
    elif tested_url.startswith('mailto:'):
        self.status = None
        self.message = 'Email link (not automatically checked)'
    elif tested_url.startswith('#'):
        self.status = None
        self.message = 'Link to within the same page (not automatically checked)'
    elif tested_url.startswith(MEDIA_PREFIX):
        # TODO Assumes a direct mapping from media url to local filesystem
        # path. This will break quite easily for alternate setups.
        path = settings.MEDIA_ROOT + urlunquote(tested_url)[len(MEDIA_PREFIX) - 1:]
        decoded_path = html_decode(path)
        # Try both the raw and the HTML-entity-decoded path on disk.
        if os.path.exists(path) or os.path.exists(decoded_path):
            self.message = 'Working file link'
            self.status = True
        else:
            self.message = 'Missing Document'
    elif getattr(self, '_internal_hash', False) and getattr(self, '_instance', None):
        # This is a hash link pointing to itself.
        from linkcheck import parse_anchors
        hash = self._internal_hash
        instance = self._instance
        if hash == '#':  # special case, point to #
            self.message = 'Working internal hash anchor'
            self.status = True
        else:
            hash = hash[1:]  # '#something' => 'something'
            html_content = ''
            for field in instance._linklist.html_fields:
                html_content += getattr(instance, field, '')
            names = parse_anchors(html_content)
            if hash in names:
                self.message = 'Working internal hash anchor'
                self.status = True
            else:
                self.message = 'Broken internal hash anchor'
    elif tested_url.startswith('/'):
        # Temporarily disable PREPEND_WWW so the test client request is not
        # redirected away from the test server.
        old_prepend_setting = settings.PREPEND_WWW
        settings.PREPEND_WWW = False
        try:
            c = Client()
            c.handler = LinkCheckHandler()
            response = c.get(tested_url)
            if USE_REVERSION:
                # using test client will clear the RevisionContextManager stack.
                revision_context_manager.start()
            if response.status_code == 200:
                self.message = 'Working internal link'
                self.status = True
                # See if the internal link points at an anchor.
                if tested_url[-1] == '#':  # special case, point to #
                    self.message = 'Working internal hash anchor'
                elif tested_url.count('#'):
                    anchor = tested_url.split('#')[1]
                    from linkcheck import parse_anchors
                    names = parse_anchors(response.content)
                    if anchor in names:
                        self.message = 'Working internal hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken internal hash anchor'
                        self.status = False
            elif response.status_code in (301, 302):
                self.status = None
                self.message = 'This link redirects: code %d (not automatically checked)' % (response.status_code,)
            else:
                self.message = 'Broken internal link'
        finally:
            # Bug fix: restore the setting even if the test client raises,
            # so a failed check cannot leave PREPEND_WWW permanently False.
            settings.PREPEND_WWW = old_prepend_setting
    else:
        self.message = 'Invalid URL'
    self.last_checked = now()
    self.save()
def _check_external(self, tested_url, external_recheck_interval):
    """Check an external URL via urllib and record status/message.

    Skips the check when it was already performed within
    ``external_recheck_interval`` minutes (returns the cached status).
    Uses a HEAD request when no fragment needs verifying, falling back to
    GET when HEAD is rejected. Stamps ``last_checked`` and saves.
    """
    logger.info('checking external link: %s' % tested_url)
    external_recheck_datetime = now() - timedelta(minutes=external_recheck_interval)
    if self.last_checked and (self.last_checked > external_recheck_datetime):
        return self.status
    opener = build_opener(RedirectHandler)
    # Remove URL fragment identifiers
    url = tested_url.rsplit('#')[0]
    # Check that non-ascii chars are properly encoded
    try:
        url.encode('ascii')
    except UnicodeEncodeError:
        url = iri_to_uri(url)
    try:
        if tested_url.count('#'):
            # We have to get the content so we can check the anchors
            response = opener.open(url, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
        else:
            # Might as well just do a HEAD request
            req = HeadRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
            try:
                response = opener.open(req, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
            except (ValueError, HTTPError) as error:
                # ...except sometimes it triggers a bug in urllib2
                if hasattr(error, 'code') and error.code == METHOD_NOT_ALLOWED:
                    req = GetRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                else:
                    req = url
                response = opener.open(req, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
        self.message = ' '.join([str(response.code), response.msg])
        self.status = True
        if tested_url.count('#'):
            anchor = tested_url.split('#')[1]
            from linkcheck import parse_anchors
            try:
                names = parse_anchors(response.read())
                if anchor in names:
                    self.message = 'Working external hash anchor'
                    self.status = True
                else:
                    self.message = 'Broken external hash anchor'
                    self.status = False
            except Exception:
                # Bug fix: was a bare ``except:`` which also swallowed
                # SystemExit/KeyboardInterrupt.
                # The external web page is mal-formatted, or there are other
                # parse errors like encoding. A broken anchor on an otherwise
                # good URL should count as a pass.
                self.message = "Page OK but anchor can't be checked"
                self.status = True
    except http_client.BadStatusLine:
        self.message = "Bad Status Line"
    except HTTPError as e:
        if hasattr(e, 'code') and hasattr(e, 'msg'):
            self.message = ' '.join([str(e.code), e.msg])
        else:
            self.message = "Unknown Error"
    except URLError as e:
        if hasattr(e, 'reason'):
            self.message = 'Unreachable: ' + str(e.reason)
        elif hasattr(e, 'code') and e.code != 301:
            self.message = 'Error: ' + str(e.code)
        else:
            self.message = 'Redirect. Check manually: ' + str(e.code)
    except Exception as e:
        self.message = 'Other Error: %s' % e
    else:
        # Remember permanent redirects so the new target can be suggested.
        if response.getcode() == 301 and response.geturl() != url:
            self.redirect_to = response.geturl()
        elif self.redirect_to:
            self.redirect_to = ''
    self.last_checked = now()
    self.save()
def _check_external(self, tested_url, external_recheck_interval):
    """Check an external URL via ``requests`` and record status/message.

    Skips the check when it was already performed within
    ``external_recheck_interval`` minutes (returns the cached status).
    HEAD is tried first; GET is used when a fragment must be verified or
    when HEAD returns an error status. Stamps ``last_checked`` and saves.
    """
    logger.info('checking external link: %s' % tested_url)
    external_recheck_datetime = now() - timedelta(minutes=external_recheck_interval)
    if self.last_checked and (self.last_checked > external_recheck_datetime):
        return self.status
    # Remove URL fragment identifiers
    url = tested_url.rsplit('#')[0]
    # Check that non-ascii chars are properly encoded
    try:
        url.encode('ascii')
    except UnicodeEncodeError:
        url = iri_to_uri(url)
    request_params = {
        # NOTE(review): SSL certificate verification is deliberately disabled
        # so links behind invalid certificates still get checked — confirm
        # this is acceptable for your deployment.
        'verify': False,
        'allow_redirects': True,
        'headers': {'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN},
        'timeout': LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT,
    }
    try:
        if tested_url.count('#'):
            # We have to get the content so we can check the anchors
            response = requests.get(url, **request_params)
        else:
            # Might as well just do a HEAD request
            response = requests.head(url, **request_params)
            if response.status_code >= 400:
                # If HEAD is not allowed, let's try with GET
                response = requests.get(url, **request_params)
    except ReadTimeout:
        self.message = 'Other Error: The read operation timed out'
        self.status = False
    except Exception as e:
        self.message = 'Other Error: %s' % e
        self.status = False
    else:
        self.message = ' '.join([str(response.status_code), response.reason])
        self.status = 200 <= response.status_code < 400
        if tested_url.count('#'):
            anchor = tested_url.split('#')[1]
            from linkcheck import parse_anchors
            try:
                names = parse_anchors(response.text)
                if anchor in names:
                    self.message = 'Working external hash anchor'
                    self.status = True
                else:
                    self.message = 'Broken external hash anchor'
                    self.status = False
            except Exception:
                # Bug fix: was a bare ``except:`` which also swallowed
                # SystemExit/KeyboardInterrupt.
                # The external web page is mal-formatted, or there are other
                # parse errors like encoding. A broken anchor on an otherwise
                # good URL should count as a pass.
                self.message = "Page OK but anchor can't be checked"
                self.status = True
        if response.status_code in REDIRECT_STATI:
            # This means it could not follow the redirection
            self.status = False
        elif response.status_code < 300 and response.history:
            # A redirect was followed: report the original redirect code and
            # remember the final destination.
            self.message = ' '.join([
                str(response.history[0].status_code),
                response.history[0].reason
            ])
            self.redirect_to = response.url
    self.last_checked = now()
    self.save()
def check_url(self, check_internal=True, check_external=True, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL):
    """Check this link (internal or external) and record status/message.

    Internal links are fetched with the Django test client; external links
    via urllib2 (HEAD first, GET fallback on METHOD_NOT_ALLOWED). External
    links already checked within ``external_recheck_interval`` minutes are
    skipped and the cached status is returned.
    """
    from linkcheck.utils import LinkCheckHandler
    external_recheck_datetime = now() - timedelta(minutes=external_recheck_interval)
    self.status = False
    # Remove current domain from URLs as the test client chokes when trying
    # to test them during a page save. They shouldn't generally exist but
    # occasionally slip through.
    # If settings.SITE_DOMAINS isn't set then use settings.SITE_DOMAIN,
    # but also check for variants: example.org, www.example.org, test.example.org
    original_url = None  # used to restore the original url afterwards
    if SITE_DOMAINS:  # if the setting is present
        internal_exceptions = SITE_DOMAINS
    else:  # try using SITE_DOMAIN
        root_domain = settings.SITE_DOMAIN
        if root_domain.startswith('www.'):
            root_domain = root_domain[4:]
        elif root_domain.startswith('test.'):
            root_domain = root_domain[5:]
        internal_exceptions = ['http://' + root_domain, 'http://www.' + root_domain, 'http://test.' + root_domain]
    for ex in internal_exceptions:
        if ex and self.url.startswith(ex):
            original_url = self.url
            self.url = self.url.replace(ex, '', 1)
    if check_internal and (not self.external):
        if not self.url:
            self.message = 'Empty link'
        elif self.url.startswith('mailto:'):
            self.status = None
            self.message = 'Email link (not automatically checked)'
        elif self.url.startswith('#'):
            self.status = None
            self.message = 'Link to within the same page (not automatically checked)'
        elif self.url.startswith(MEDIA_PREFIX):
            # TODO Assumes a direct mapping from media url to local filesystem
            # path. This will break quite easily for alternate setups.
            path = settings.MEDIA_ROOT + self.url_unquoted()[len(MEDIA_PREFIX) - 1:]
            decoded_path = html_decode(path)
            if os.path.exists(path) or os.path.exists(decoded_path):
                self.message = 'Working file link'
                self.status = True
            else:
                self.message = 'Missing Document'
        elif getattr(self, '_internal_hash', False) and getattr(self, '_instance', None):
            # This is a hash link pointing to itself.
            from linkcheck import parse_anchors
            hash = self._internal_hash
            instance = self._instance
            if hash == '#':  # special case, point to #
                self.message = 'Working internal hash anchor'
                self.status = True
            else:
                hash = hash[1:]  # '#something' => 'something'
                html_content = ''
                for field in instance._linklist.html_fields:
                    html_content += getattr(instance, field, '')
                names = parse_anchors(html_content)
                if hash in names:
                    self.message = 'Working internal hash anchor'
                    self.status = True
                else:
                    self.message = 'Broken internal hash anchor'
        elif self.url.startswith('/'):
            old_prepend_setting = settings.PREPEND_WWW
            settings.PREPEND_WWW = False
            c = Client()
            c.handler = LinkCheckHandler()
            response = c.get(self.url, follow=True)
            # using test client will clear the RevisionContextManager stack.
            from reversion.revisions import revision_context_manager
            revision_context_manager.start()
            if response.status_code == 200:
                self.message = 'Working internal link'
                self.status = True
                # see if the internal link points an anchor
                if self.url[-1] == '#':  # special case, point to #
                    self.message = 'Working internal hash anchor'
                elif self.url.count('#'):
                    anchor = self.url.split('#')[1]
                    from linkcheck import parse_anchors
                    names = parse_anchors(response.content)
                    if anchor in names:
                        self.message = 'Working internal hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken internal hash anchor'
                        self.status = False
            elif response.status_code in (301, 302):
                self.status = None
                self.message = 'This link redirects: code %d (not automatically checked)' % (response.status_code,)
            else:
                self.message = 'Broken internal link'
            settings.PREPEND_WWW = old_prepend_setting
        else:
            self.message = 'Invalid URL'
        if original_url:  # restore the original url before saving
            self.url = original_url
        self.last_checked = now()
        self.save()
    elif check_external and self.external:
        logger.info('checking external link: %s' % self.url)
        if self.last_checked and (self.last_checked > external_recheck_datetime):
            return self.status
        try:
            # Remove URL fragment identifiers
            url = self.url.rsplit('#')[0]
            if self.url.count('#'):
                # We have to get the content so we can check the anchors
                if TIMEOUT_SUPPORT:
                    response = urllib2.urlopen(url, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                else:
                    response = urllib2.urlopen(url)
            else:
                # Might as well just do a HEAD request
                req = HeadRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                try:
                    if TIMEOUT_SUPPORT:
                        response = urllib2.urlopen(req, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                    else:
                        response = urllib2.urlopen(req)
                except (ValueError, urllib2.HTTPError):
                    _, error, _ = sys.exc_info()
                    # ...except sometimes it triggers a bug in urllib2
                    if hasattr(error, 'code') and error.code == METHOD_NOT_ALLOWED:
                        req = GetRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                    else:
                        req = url
                    if TIMEOUT_SUPPORT:
                        response = urllib2.urlopen(req, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                    else:
                        response = urllib2.urlopen(req)
            self.message = ' '.join([str(response.code), response.msg])
            self.status = True
            if self.url.count('#'):
                anchor = self.url.split('#')[1]
                from linkcheck import parse_anchors
                try:
                    names = parse_anchors(response.read())
                    if anchor in names:
                        self.message = 'Working external hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken external hash anchor'
                        self.status = False
                except Exception:
                    # Bug fix: was a bare ``except:``.
                    # The external web page is mal-formatted, or there are
                    # other parse errors like encoding. A broken anchor on an
                    # otherwise good URL should count as a pass.
                    self.message = "Page OK but anchor can't be checked"
                    self.status = True
        except BadStatusLine:
            self.message = "Bad Status Line"
        except urllib2.HTTPError as e:
            # Bug fix: Python 2-only ``except X, e`` syntax replaced with
            # ``as e`` (valid on Python 2.6+ and Python 3).
            if hasattr(e, 'code') and hasattr(e, 'msg'):
                self.message = ' '.join([str(e.code), e.msg])
            else:
                self.message = "Unknown Error"
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                self.message = 'Unreachable: ' + str(e.reason)
            elif hasattr(e, 'code') and e.code != 301:
                self.message = 'Error: ' + str(e.code)
            else:
                self.message = 'Redirect. Check manually: ' + str(e.code)
def _check_internal(self, tested_url):
    """Check a site-internal URL and record status/message on this link.

    Variant that fetches server-relative paths over real HTTP against
    ``settings.ALLOWED_HOSTS[0]`` (instead of the Django test client) and
    also recognizes ``tel:`` links. Stamps ``last_checked`` and saves.
    """
    if not tested_url:
        self.message = 'Empty link'
    elif tested_url.startswith('mailto:'):
        self.status = None
        self.message = 'Email link (not automatically checked)'
    elif tested_url.startswith('tel:'):
        self.status = None
        self.message = 'Phone number (not automatically checked)'
    elif tested_url.startswith('#'):
        self.status = None
        self.message = 'Link to within the same page (not automatically checked)'
    elif tested_url.startswith(MEDIA_PREFIX):
        # TODO Assumes a direct mapping from media url to local filesystem
        # path. This will break quite easily for alternate setups.
        path = settings.MEDIA_ROOT + unquote(tested_url)[len(MEDIA_PREFIX) - 1:]
        decoded_path = html_decode(path)
        if os.path.exists(path) or os.path.exists(decoded_path):
            self.message = 'Working file link'
            self.status = True
        else:
            self.message = 'Missing Document'
    elif getattr(self, '_internal_hash', False) and getattr(self, '_instance', None):
        # This is a hash link pointing to itself.
        from linkcheck import parse_anchors
        hash = self._internal_hash
        instance = self._instance
        if hash == '#':  # special case, point to #
            self.message = 'Working internal hash anchor'
            self.status = True
        else:
            hash = hash[1:]  # '#something' => 'something'
            html_content = ''
            for field in instance._linklist.html_fields:
                html_content += getattr(instance, field, '')
            try:
                names = parse_anchors(html_content)
                if hash in names:
                    self.message = 'Working internal hash anchor'
                    self.status = True
                else:
                    self.message = 'Broken internal hash anchor'
            except UnicodeDecodeError:
                self.message = 'Failed to parse HTML for anchor'
    elif tested_url.startswith('/'):
        old_prepend_setting = settings.PREPEND_WWW
        settings.PREPEND_WWW = False
        # Fetch over real HTTP against the first allowed host (this replaced
        # an earlier implementation based on the Django test client).
        tested_url = 'http://' + settings.ALLOWED_HOSTS[0] + tested_url
        response = requests.get(tested_url, verify=True)
        # NOTE(review): requests follows redirects by default, so the
        # 301/302 branch below only fires when the *final* response is
        # itself a redirect — confirm this is the intended behaviour.
        if response.status_code == 200:
            self.message = 'Working internal link'
            self.status = True
            # see if the internal link points an anchor
            if tested_url[-1] == '#':  # special case, point to #
                self.message = 'Working internal hash anchor'
            elif tested_url.count('#'):
                anchor = tested_url.split('#')[1]
                from linkcheck import parse_anchors
                try:
                    names = parse_anchors(str(response.content))
                    if anchor in names:
                        self.message = 'Working internal hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken internal hash anchor'
                        self.status = False
                except UnicodeDecodeError:
                    self.message = 'Failed to parse HTML for anchor'
        elif response.status_code == 302 or response.status_code == 301:
            # Follow the redirect to decide whether it ultimately works.
            redir_response = requests.get(tested_url, allow_redirects=True)
            if redir_response.status_code == 200:
                redir_state = 'Working redirect'
                self.status = True
            else:
                redir_state = 'Broken redirect'
                self.status = False
            self.message = 'This link redirects: code %d (%s)' % (response.status_code, redir_state)
        else:
            self.message = 'Broken internal link'
        settings.PREPEND_WWW = old_prepend_setting
    else:
        self.message = 'Invalid URL'
    if USE_REVERSION:
        # using test client will clear the RevisionContextManager stack.
        revision_context_manager.start()
    self.last_checked = now()
    self.save()
def check(self, check_internal=True, check_external=True, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL):
    """Check this link (internal or external) and record status/message.

    Internal links use the Django test client; external links use urllib2
    (HEAD first, GET fallback on METHOD_NOT_ALLOWED). External links
    checked within ``external_recheck_interval`` minutes are skipped and
    the cached status is returned.
    """
    from linkcheck.utils import LinkCheckHandler
    external_recheck_datetime = now() - timedelta(minutes=external_recheck_interval)
    self.status = False
    # Remove current domain from URLs as the test client chokes when trying
    # to test them during a page save. They shouldn't generally exist but
    # occasionally slip through.
    # If settings.SITE_DOMAINS isn't set then use settings.SITE_DOMAIN,
    # but also check for variants: example.org, www.example.org, test.example.org
    original_url = None  # used to restore the original url afterwards
    if SITE_DOMAINS:  # if the setting is present
        internal_exceptions = SITE_DOMAINS
    else:  # try using SITE_DOMAIN
        root_domain = settings.SITE_DOMAIN
        if root_domain.startswith('www.'):
            root_domain = root_domain[4:]
        elif root_domain.startswith('test.'):
            root_domain = root_domain[5:]
        internal_exceptions = [
            'http://' + root_domain,
            'http://www.' + root_domain,
            'http://test.' + root_domain
        ]
    for ex in internal_exceptions:
        if ex and self.url.startswith(ex):
            original_url = self.url
            self.url = self.url.replace(ex, '', 1)
    if check_internal and (not self.external):
        if not self.url:
            self.message = 'Empty link'
        elif self.url.startswith('mailto:'):
            self.status = None
            self.message = 'Email link (not automatically checked)'
        elif self.url.startswith('#'):
            self.status = None
            self.message = 'Link to within the same page (not automatically checked)'
        elif self.url.startswith(MEDIA_PREFIX):
            # TODO Assumes a direct mapping from media url to local filesystem
            # path. This will break quite easily for alternate setups.
            if os.path.exists(settings.MEDIA_ROOT + self.url_unquoted()[len(MEDIA_PREFIX) - 1:]):
                self.message = 'Working file link'
                self.status = True
            else:
                self.message = 'Missing Document'
        elif getattr(self, '_internal_hash', False) and getattr(self, '_instance', None):
            # This is a hash link pointing to itself.
            from linkcheck import parse_anchors
            hash = self._internal_hash
            instance = self._instance
            if hash == '#':  # special case, point to #
                self.message = 'Working internal hash anchor'
                self.status = True
            else:
                hash = hash[1:]  # '#something' => 'something'
                html_content = ''
                for field in instance._linklist.html_fields:
                    html_content += getattr(instance, field, '')
                names = parse_anchors(html_content)
                if hash in names:
                    self.message = 'Working internal hash anchor'
                    self.status = True
                else:
                    self.message = 'Broken internal hash anchor'
        elif self.url.startswith('/'):
            old_prepend_setting = settings.PREPEND_WWW
            settings.PREPEND_WWW = False
            c = Client()
            c.handler = LinkCheckHandler()
            response = c.get(self.url, follow=True)
            if response.status_code == 200:
                self.message = 'Working internal link'
                self.status = True
                # see if the internal link points an anchor
                if self.url[-1] == '#':  # special case, point to #
                    self.message = 'Working internal hash anchor'
                elif self.url.count('#'):
                    anchor = self.url.split('#')[1]
                    from linkcheck import parse_anchors
                    names = parse_anchors(response.content)
                    if anchor in names:
                        self.message = 'Working internal hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken internal hash anchor'
                        self.status = False
            elif response.status_code in (301, 302):
                self.status = None
                self.message = 'This link redirects: code %d (not automatically checked)' % (response.status_code,)
            else:
                self.message = 'Broken internal link'
            settings.PREPEND_WWW = old_prepend_setting
        else:
            self.message = 'Invalid URL'
        if original_url:  # restore the original url before saving
            self.url = original_url
        self.last_checked = now()
        self.save()
    elif check_external and self.external:
        logger.info('checking external link: %s' % self.url)
        if self.last_checked and (self.last_checked > external_recheck_datetime):
            return self.status
        try:
            # Remove URL fragment identifiers
            url = self.url.rsplit('#')[0]
            if self.url.count('#'):
                # We have to get the content so we can check the anchors
                if TIMEOUT_SUPPORT:
                    response = urllib2.urlopen(url, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                else:
                    response = urllib2.urlopen(url)
            else:
                # Might as well just do a HEAD request
                req = HeadRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                try:
                    if TIMEOUT_SUPPORT:
                        response = urllib2.urlopen(req, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                    else:
                        response = urllib2.urlopen(req)
                except (ValueError, urllib2.HTTPError) as error:
                    # ...except sometimes it triggers a bug in urllib2
                    if hasattr(error, 'code') and error.code == METHOD_NOT_ALLOWED:
                        req = GetRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                    else:
                        req = url
                    if TIMEOUT_SUPPORT:
                        response = urllib2.urlopen(req, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                    else:
                        response = urllib2.urlopen(req)
            self.message = ' '.join([str(response.code), response.msg])
            self.status = True
            if self.url.count('#'):
                anchor = self.url.split('#')[1]
                from linkcheck import parse_anchors
                try:
                    names = parse_anchors(response.read())
                    if anchor in names:
                        self.message = 'Working external hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken external hash anchor'
                        self.status = False
                except Exception:
                    # Bug fix: was a bare ``except:``.
                    # The external web page is mal-formatted, or there are
                    # other parse errors like encoding. A broken anchor on an
                    # otherwise good URL should count as a pass.
                    self.message = "Page OK but anchor can't be checked"
                    self.status = True
        except BadStatusLine:
            self.message = "Bad Status Line"
        except urllib2.HTTPError as e:
            # Bug fix: Python 2-only ``except X, e`` syntax replaced with
            # ``as e`` (valid on Python 2.6+ and Python 3).
            if hasattr(e, 'code') and hasattr(e, 'msg'):
                self.message = ' '.join([str(e.code), e.msg])
            else:
                self.message = "Unknown Error"
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                self.message = 'Unreachable: ' + str(e.reason)
            elif hasattr(e, 'code') and e.code != 301:
                self.message = 'Error: ' + str(e.code)
            else:
                self.message = 'Redirect. Check manually: ' + str(e.code)
def _check_external(self, tested_url, external_recheck_interval):
    """Check an external URL via urllib, with a ``requests`` fallback for
    invalid SSL certificates, and record status/message.

    Skips the check when it was already performed within
    ``external_recheck_interval`` minutes. Stamps ``last_checked`` and saves.
    """
    logger.info('checking external link: %s' % tested_url)
    external_recheck_datetime = now() - timedelta(minutes=external_recheck_interval)
    if self.last_checked and (self.last_checked > external_recheck_datetime):
        return self.status
    opener = build_opener(RedirectHandler)
    # Remove URL fragment identifiers
    url = tested_url.rsplit('#')[0]
    # Check that non-ascii chars are properly encoded
    try:
        url.encode('ascii')
    except UnicodeEncodeError:
        url = iri_to_uri(url)
    try:
        if tested_url.count('#'):
            # We have to get the content so we can check the anchors
            response = opener.open(url, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
        else:
            # Might as well just do a HEAD request
            req = HeadRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
            try:
                response = opener.open(req, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
            except (ValueError, HTTPError) as error:
                # Bug fix: this handler must come BEFORE the URLError one —
                # HTTPError subclasses URLError, so with the original order
                # the METHOD_NOT_ALLOWED GET-retry below was unreachable.
                # ...except sometimes it triggers a bug in urllib2
                if hasattr(error, 'code') and error.code == METHOD_NOT_ALLOWED:
                    req = GetRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                else:
                    req = url
                response = opener.open(req, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
            except URLError as e:
                # When we get CERTIFICATE_VERIFY_FAILED] certificate verify
                # failed (_ssl.c:579) error we try the link using requests,
                # and ignore SSL verification error.
                if hasattr(e, 'reason') and 'certificate verify failed' in str(e.reason):
                    response = requests.head(url, verify=False, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                    # Adapt the requests response to the urllib attributes
                    # the code below reads.
                    response.code = response.status_code
                    response.msg = ''
                else:
                    raise
        self.message = ' '.join([str(response.code), response.msg])
        self.status = True
        if tested_url.count('#'):
            anchor = tested_url.split('#')[1]
            from linkcheck import parse_anchors
            try:
                names = parse_anchors(response.read())
                if anchor in names:
                    self.message = 'Working external hash anchor'
                    self.status = True
                else:
                    self.message = 'Broken external hash anchor'
                    self.status = False
            except Exception:
                # Bug fix: was a bare ``except:``.
                # The external web page is mal-formatted, or there are other
                # parse errors like encoding. A broken anchor on an otherwise
                # good URL should count as a pass.
                self.message = "Page OK but anchor can't be checked"
                self.status = True
    except http_client.BadStatusLine:
        self.message = "Bad Status Line"
    except HTTPError as e:
        if hasattr(e, 'code') and hasattr(e, 'msg'):
            self.message = ' '.join([str(e.code), e.msg])
        else:
            self.message = "Unknown Error"
    except URLError as e:
        if hasattr(e, 'reason'):
            self.message = 'Unreachable: ' + str(e.reason)
        elif hasattr(e, 'code') and e.code != 301:
            self.message = 'Error: ' + str(e.code)
        else:
            self.message = 'Redirect. Check manually: ' + str(e.code)
    except Exception as e:
        self.message = 'Other Error: %s' % e
    else:
        # The requests fallback response has no getcode(), hence the getattr guard.
        if getattr(response, 'getcode', False) and response.getcode() == 301 and response.geturl() != url:
            self.redirect_to = response.geturl()
        elif self.redirect_to:
            self.redirect_to = ''
    self.last_checked = now()
    self.save()
def check(self, check_internal=True, check_external=True, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL):
    """Check this link (internal or external) and record status/message.

    Older variant: no timeout support, no logging, and a plain-GET retry
    when the urllib2 HEAD request raises ValueError. External links checked
    within ``external_recheck_interval`` minutes are skipped.
    """
    from linkcheck.utils import LinkCheckHandler
    external_recheck_datetime = datetime.now() - timedelta(minutes=external_recheck_interval)
    self.status = False
    # Remove current domain from URLs as the test client chokes when trying
    # to test them during a page save. They shouldn't generally exist but
    # occasionally slip through.
    # If settings.SITE_DOMAINS isn't set then use settings.SITE_DOMAIN,
    # but also check for variants: example.org, www.example.org, test.example.org
    original_url = None  # used to restore the original url afterwards
    if SITE_DOMAINS:  # if the setting is present
        internal_exceptions = SITE_DOMAINS
    else:  # try using SITE_DOMAIN
        root_domain = settings.SITE_DOMAIN
        if root_domain.startswith('www.'):
            root_domain = root_domain[4:]
        elif root_domain.startswith('test.'):
            root_domain = root_domain[5:]
        internal_exceptions = ['http://' + root_domain, 'http://www.' + root_domain, 'http://test.' + root_domain]
    for ex in internal_exceptions:
        if ex and self.url.startswith(ex):
            original_url = self.url
            self.url = self.url.replace(ex, '', 1)
    if check_internal and (not self.external):
        if not self.url:
            self.message = 'Empty link'
        elif self.url.startswith('mailto:'):
            self.status = None
            self.message = 'Email link (not automatically checked)'
        elif self.url.startswith('#'):
            self.status = None
            self.message = 'Link to within the same page (not automatically checked)'
        elif self.url.startswith(MEDIA_PREFIX):
            # TODO Assumes a direct mapping from media url to local filesystem
            # path. This will break quite easily for alternate setups.
            if os.path.exists(settings.MEDIA_ROOT + self.url_unquoted()[len(MEDIA_PREFIX) - 1:]):
                self.message = 'Working file link'
                self.status = True
            else:
                self.message = 'Missing Document'
        elif self.url.startswith('/'):
            old_prepend_setting = settings.PREPEND_WWW
            settings.PREPEND_WWW = False
            c = Client()
            c.handler = LinkCheckHandler()
            response = c.get(self.url, follow=True)
            if response.status_code == 200:
                self.message = 'Working internal link'
                self.status = True
                # see if the internal link points an anchor
                if self.url[-1] == '#':  # special case, point to #
                    self.message = 'Working internal hash anchor'
                elif self.url.count('#'):
                    anchor = self.url.split('#')[1]
                    from linkcheck import parse_anchors
                    names = parse_anchors(response.content)
                    if anchor in names:
                        self.message = 'Working internal hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken internal hash anchor'
                        self.status = False
            elif response.status_code in (301, 302):
                self.status = None
                self.message = 'This link redirects: code %d (not automatically checked)' % (response.status_code,)
            else:
                self.message = 'Broken internal link'
            settings.PREPEND_WWW = old_prepend_setting
        else:
            self.message = 'Invalid URL'
        if original_url:  # restore the original url before saving
            self.url = original_url
        self.last_checked = datetime.now()
        self.save()
    elif check_external and self.external:
        if self.last_checked and (self.last_checked > external_recheck_datetime):
            return self.status
        try:
            # Remove URL fragment identifiers
            url = self.url.rsplit('#')[0]
            if self.url.count('#'):
                # We have to get the content so we can check the anchors
                response = urllib2.urlopen(url)
            else:
                # Might as well just do a HEAD request
                req = HeadRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                try:
                    response = urllib2.urlopen(req)
                except ValueError:
                    # ...except sometimes it triggers a bug in urllib2
                    response = urllib2.urlopen(url)
            self.message = ' '.join([str(response.code), response.msg])
            self.status = True
            if self.url.count('#'):
                anchor = self.url.split('#')[1]
                from linkcheck import parse_anchors
                try:
                    names = parse_anchors(response.read())
                    if anchor in names:
                        self.message = 'Working external hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken external hash anchor'
                        self.status = False
                except HTMLParseError:
                    # The external web page is mal-formatted.
                    # A broken anchor on an otherwise good URL should count as a pass.
                    self.message = "Page OK but anchor can't be checked"
                    self.status = True
        except BadStatusLine:
            self.message = "Bad Status Line"
        except urllib2.HTTPError as e:
            # Bug fix: Python 2-only ``except X, e`` syntax replaced with
            # ``as e`` (valid on Python 2.6+ and Python 3).
            if hasattr(e, 'code') and hasattr(e, 'msg'):
                self.message = ' '.join([str(e.code), e.msg])
            else:
                self.message = "Unknown Error"
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                self.message = 'Unreachable: ' + str(e.reason)
            elif hasattr(e, 'code') and e.code != 301:
                self.message = 'Error: ' + str(e.code)
            else:
                self.message = 'Redirect. Check manually: ' + str(e.code)
def check(self, recheck_interval=RECHECK_INTERVAL):
    """Check this link and record status/message.

    Oldest variant: special cases (empty, mailto, fragment, media file,
    self-referential hash) are classified first; everything else is
    fetched over HTTP via urllib2, with server-relative paths prefixed by
    ``settings.SITE_DOMAIN``. Links checked within ``recheck_interval``
    minutes return the cached status.
    """
    external_recheck_datetime = datetime.now() - timedelta(minutes=recheck_interval)
    self.status = False
    original_url = None  # used to restore the original url afterwards
    if not self.url:
        self.status = True
        self.message = 'Empty link'
    elif self.url.startswith('mailto:'):
        self.status = None
        self.message = 'Email link (not automatically checked)'
    elif self.url.startswith('#'):
        self.status = None
        self.message = 'Link to within the same page (not automatically checked)'
    elif self.url.startswith(MEDIA_PREFIX):
        # TODO Assumes a direct mapping from media url to local filesystem
        # path. This will break quite easily for alternate setups.
        if os.path.exists(settings.MEDIA_ROOT + self.url_unquoted()[len(MEDIA_PREFIX) - 1:]):
            self.message = 'Working file link'
            self.status = True
        else:
            self.message = 'Missing Document'
    elif getattr(self, '_internal_hash', False) and getattr(self, '_instance', None):
        # This is a hash link pointing to itself.
        from linkcheck import parse_anchors
        hash = self._internal_hash
        instance = self._instance
        if hash == '#':  # special case, point to #
            self.message = 'Working internal hash anchor'
            self.status = True
        else:
            hash = hash[1:]  # '#something' => 'something'
            html_content = ''
            for field in instance._linklist.html_fields:
                html_content += getattr(instance, field, '')
            names = parse_anchors(html_content)
            if hash in names:
                self.message = 'Working internal hash anchor'
                self.status = True
            else:
                self.message = 'Broken internal hash anchor'
    logger.info('checking external link: %s' % self.url)
    if self.last_checked and (self.last_checked > external_recheck_datetime):
        return self.status
    else:
        if self.url.startswith("/"):
            # append site_domain to path
            root_domain = settings.SITE_DOMAIN
            self.url = "http://%s%s" % (root_domain, self.url)
        try:
            # Remove URL fragment identifiers
            url = self.url.rsplit('#')[0]
            if self.url.count('#'):
                # We have to get the content so we can check the anchors
                if TIMEOUT:
                    response = urllib2.urlopen(url, timeout=TIMEOUT)
                else:
                    response = urllib2.urlopen(url)
            else:
                # Might as well just do a HEAD request
                req = HeadRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                try:
                    if TIMEOUT:
                        response = urllib2.urlopen(req, timeout=TIMEOUT)
                    else:
                        response = urllib2.urlopen(req)
                except Exception:
                    # Bug fix: was a bare ``except:``.
                    # ...except sometimes it triggers a bug in urllib2,
                    # so retry with a plain GET.
                    if TIMEOUT:
                        response = urllib2.urlopen(url, timeout=TIMEOUT)
                    else:
                        response = urllib2.urlopen(url)
            self.message = ' '.join([str(response.code), response.msg])
            self.status = True
            if self.url.count('#'):
                anchor = self.url.split('#')[1]
                from linkcheck import parse_anchors
                try:
                    names = parse_anchors(response.read())
                    if anchor in names:
                        self.message = 'Working hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken hash anchor'
                        self.status = False
                except Exception:
                    # Bug fix: was a bare ``except:``.
                    # The external web page is mal-formatted, or there are
                    # other parse errors like encoding. A broken anchor on an
                    # otherwise good URL should count as a pass.
                    self.message = "Page OK but anchor can't be checked"
                    self.status = True
        except BadStatusLine:
            self.message = "Bad Status Line"
        except urllib2.HTTPError as e:
            # Bug fix: Python 2-only ``except X, e`` syntax replaced with
            # ``as e`` (valid on Python 2.6+ and Python 3).
            if hasattr(e, 'code') and hasattr(e, 'msg'):
                self.message = ' '.join([str(e.code), e.msg])
            else:
                self.message = "Unknown Error"
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                self.message = 'Unreachable: ' + str(e.reason)
            elif hasattr(e, 'code') and e.code != 301:
                self.message = 'Error: ' + str(e.code)
            else:
                self.message = 'Redirect. Check manually: ' + str(e.code)