def test_broken_500_handler_with_middleware(self):
    """Check that a failing view plus a failing 500 handler yields two events.

    The request is sent through ``MockSentryMiddleware`` with
    ``BREAK_THAT_500`` switched on. The first captured event must describe
    the view's ``Exception('view exception')`` and the second one the
    ``ValueError('handler500')`` attributed to ``handler500``.
    """
    with Settings(BREAK_THAT_500=True, INSTALLED_APPS=['raven.contrib.django']):
        client = TestClient(REMOTE_ADDR='127.0.0.1')
        client.handler = MockSentryMiddleware(MockClientHandler())

        self.assertRaises(Exception, client.get, reverse('sentry-raise-exc'))

        self.assertEqual(len(self.raven.events), 2)

        # First event: the exception raised by the view itself.
        event = self.raven.events.pop(0)
        self.assertIn('sentry.interfaces.Exception', event)
        exc = event['sentry.interfaces.Exception']
        self.assertEqual(exc['type'], 'Exception')
        self.assertEqual(exc['value'], 'view exception')
        self.assertEqual(event['level'], logging.ERROR)
        self.assertEqual(event['message'], 'Exception: view exception')
        self.assertEqual(event['culprit'], 'tests.contrib.django.views in raise_exc')

        # Second event: the error raised while handling the first one.
        event = self.raven.events.pop(0)
        self.assertIn('sentry.interfaces.Exception', event)
        exc = event['sentry.interfaces.Exception']
        self.assertEqual(exc['type'], 'ValueError')
        self.assertEqual(exc['value'], 'handler500')
        self.assertEqual(event['level'], logging.ERROR)
        self.assertEqual(event['message'], 'ValueError: handler500')
        self.assertEqual(event['culprit'], 'tests.contrib.django.urls in handler500')
def test_broken_500_handler_with_middleware(self):
    """Check that a failing view plus a failing 500 handler yields two events.

    The request is sent through ``MockSentryMiddleware`` with
    ``BREAK_THAT_500`` switched on. Events are consumed in capture order:
    the view's ``Exception`` first, then the ``ValueError`` attributed to
    ``handler500``.
    """
    with Settings(BREAK_THAT_500=True):
        client = TestClient(REMOTE_ADDR='127.0.0.1')
        client.handler = MockSentryMiddleware(MockClientHandler())

        self.assertRaises(Exception, client.get, reverse('sentry-raise-exc'))

        self.assertEqual(len(self.raven.events), 2)

        # First event: the exception raised by the view itself.
        event = self.raven.events.pop(0)
        self.assertIn('sentry.interfaces.Exception', event)
        exc = event['sentry.interfaces.Exception']
        self.assertEqual(exc['type'], 'Exception')
        self.assertEqual(exc['value'], 'view exception')
        self.assertEqual(event['level'], logging.ERROR)
        self.assertEqual(event['message'], 'Exception: view exception')
        self.assertEqual(event['culprit'], 'tests.contrib.django.views.raise_exc')

        # Second event: the error raised while handling the first one.
        event = self.raven.events.pop(0)
        self.assertIn('sentry.interfaces.Exception', event)
        exc = event['sentry.interfaces.Exception']
        self.assertEqual(exc['type'], 'ValueError')
        self.assertEqual(exc['value'], 'handler500')
        self.assertEqual(event['level'], logging.ERROR)
        self.assertEqual(event['message'], 'ValueError: handler500')
        self.assertEqual(event['culprit'], 'tests.contrib.django.urls.handler500')
def test_broken_500_handler_with_middleware(self):
    """Check that a failing view plus a failing 500 handler yields two events.

    The request is sent through ``MockSentryMiddleware`` with
    ``BREAK_THAT_500`` switched on. Events are consumed in capture order:
    the view's ``Exception`` first, then the ``ValueError`` attributed to
    ``handler500``.
    """
    with Settings(BREAK_THAT_500=True):
        client = TestClient(REMOTE_ADDR="127.0.0.1")
        client.handler = MockSentryMiddleware(MockClientHandler())

        self.assertRaises(Exception, client.get, reverse("sentry-raise-exc"))

        self.assertEqual(len(self.raven.events), 2)

        # First event: the exception raised by the view itself.
        event = self.raven.events.pop(0)
        self.assertIn("sentry.interfaces.Exception", event)
        exc = event["sentry.interfaces.Exception"]
        self.assertEqual(exc["type"], "Exception")
        self.assertEqual(exc["value"], "view exception")
        self.assertEqual(event["level"], logging.ERROR)
        self.assertEqual(event["message"], "Exception: view exception")
        self.assertEqual(event["culprit"], "tests.contrib.django.views.raise_exc")

        # Second event: the error raised while handling the first one.
        event = self.raven.events.pop(0)
        self.assertIn("sentry.interfaces.Exception", event)
        exc = event["sentry.interfaces.Exception"]
        self.assertEqual(exc["type"], "ValueError")
        self.assertEqual(exc["value"], "handler500")
        self.assertEqual(event["level"], logging.ERROR)
        self.assertEqual(event["message"], "ValueError: handler500")
        self.assertEqual(event["culprit"], "tests.contrib.django.urls.handler500")
def test_broken_500_handler_with_middleware(self):
    """Check that a failing view plus a failing 500 handler yields two events.

    The request is sent through ``MockOpbeatMiddleware`` with
    ``BREAK_THAT_500`` switched on. Events are consumed in capture order:
    the view's ``Exception`` first, then the ``ValueError`` attributed to
    ``handler500``.
    """
    with self.settings(BREAK_THAT_500=True):
        client = TestClient(REMOTE_ADDR='127.0.0.1')
        client.handler = MockOpbeatMiddleware(MockClientHandler())

        self.assertRaises(Exception, client.get, reverse('opbeat-raise-exc'))

        self.assertEqual(len(self.opbeat.events), 2)

        # First event: the exception raised by the view itself.
        event = self.opbeat.events.pop(0)
        self.assertIn('exception', event)
        exc = event['exception']
        self.assertEqual(exc['type'], 'Exception')
        self.assertEqual(exc['value'], 'view exception')
        self.assertEqual(event['level'], 'error')
        self.assertEqual(event['message'], 'Exception: view exception')
        self.assertEqual(event['culprit'], 'tests.contrib.django.testapp.views.raise_exc')

        # Second event: the error raised while handling the first one.
        event = self.opbeat.events.pop(0)
        self.assertIn('exception', event)
        exc = event['exception']
        self.assertEqual(exc['type'], 'ValueError')
        self.assertEqual(exc['value'], 'handler500')
        self.assertEqual(event['level'], 'error')
        self.assertEqual(event['message'], 'ValueError: handler500')
        self.assertEqual(event['culprit'], 'tests.contrib.django.testapp.urls.handler500')
def test_broken_500_handler_with_middleware(self):
    """Check that a failing view plus a failing 500 handler are both captured.

    The request is sent through ``MockSentryMiddleware`` with
    ``BREAK_THAT_500`` switched on. The first two captured events must be
    the view's ``Exception`` followed by the ``ValueError('handler500')``.
    """
    with Settings(BREAK_THAT_500=True, INSTALLED_APPS=['raven.contrib.django']):
        client = DjangoTestClient(REMOTE_ADDR='127.0.0.1')
        client.handler = MockSentryMiddleware(MockClientHandler())

        self.assertRaises(Exception, client.get, reverse('sentry-raise-exc'))

        # ``== 2 or 4`` parses as ``(len == 2) or 4`` and is always truthy;
        # a membership test actually enforces "two or four events".
        assert len(self.raven.events) in (2, 4)  # TODO: ash remove duplicate client events

        # First event: the exception raised by the view itself.
        event = self.raven.events.pop(0)
        assert 'exception' in event
        exc = event['exception']['values'][-1]
        assert exc['type'] == 'Exception'
        assert exc['value'] == 'view exception'
        assert event['level'] == logging.ERROR
        assert event['message'] == 'Exception: view exception'

        # Second event: the error raised while handling the first one.
        event = self.raven.events.pop(0)
        assert 'exception' in event
        exc = event['exception']['values'][-1]
        assert exc['type'] == 'ValueError'
        assert exc['value'] == 'handler500'
        assert event['level'] == logging.ERROR
        assert event['message'] == 'ValueError: handler500'
def test_broken_500_handler_with_middleware(self):
    """Check that a failing view plus a failing 500 handler yields two events.

    The request is sent through ``MockSentryMiddleware`` with
    ``BREAK_THAT_500`` switched on. For each event the first entry of
    ``event['exception']['values']`` is inspected: the view's ``Exception``
    first, then the ``ValueError`` attributed to ``handler500``.
    """
    with Settings(BREAK_THAT_500=True, INSTALLED_APPS=['raven.contrib.django']):
        client = TestClient(REMOTE_ADDR='127.0.0.1')
        client.handler = MockSentryMiddleware(MockClientHandler())

        self.assertRaises(Exception, client.get, reverse('sentry-raise-exc'))

        self.assertEqual(len(self.raven.events), 2)

        # First event: the exception raised by the view itself.
        event = self.raven.events.pop(0)
        self.assertIn('exception', event)
        exc = event['exception']['values'][0]
        self.assertEqual(exc['type'], 'Exception')
        self.assertEqual(exc['value'], 'view exception')
        self.assertEqual(event['level'], logging.ERROR)
        self.assertEqual(event['message'], 'Exception: view exception')
        self.assertEqual(event['culprit'], 'tests.contrib.django.views in raise_exc')

        # Second event: the error raised while handling the first one.
        event = self.raven.events.pop(0)
        self.assertIn('exception', event)
        exc = event['exception']['values'][0]
        self.assertEqual(exc['type'], 'ValueError')
        self.assertEqual(exc['value'], 'handler500')
        self.assertEqual(event['level'], logging.ERROR)
        self.assertEqual(event['message'], 'ValueError: handler500')
        self.assertEqual(event['culprit'], 'tests.contrib.django.urls in handler500')
def test_broken_500_handler_with_middleware(self):
    """Expect two captured events when both the view and the 500 handler fail.

    Events are popped in capture order and the last entry of each
    ``event['exception']['values']`` list is compared to the expectation.
    """
    # (exception type, exception value, event message), in capture order.
    expected_events = (
        ('Exception', 'view exception', 'Exception: view exception'),
        ('ValueError', 'handler500', 'ValueError: handler500'),
    )

    with Settings(BREAK_THAT_500=True, INSTALLED_APPS=['raven.contrib.django']):
        test_client = DjangoTestClient(REMOTE_ADDR='127.0.0.1')
        test_client.handler = MockSentryMiddleware(MockClientHandler())

        self.assertRaises(Exception, test_client.get, reverse('sentry-raise-exc'))

        assert len(self.raven.events) == 2

        for exc_type, exc_value, message in expected_events:
            captured = self.raven.events.pop(0)
            assert 'exception' in captured
            last_value = captured['exception']['values'][-1]
            assert last_value['type'] == exc_type
            assert last_value['value'] == exc_value
            assert captured['level'] == logging.ERROR
            assert captured['message'] == message
def client():
    """Build a Django test client whose handler is a ``TestClienHandler``."""
    from django.test.client import Client

    test_client = Client()
    # Swap in the project's handler in place of the client's default one.
    test_client.handler = TestClienHandler()
    return test_client
def test_broken_500_handler_with_middleware(self):
    """Expect two captured events when both the view and the 500 handler fail.

    Events are consumed in capture order; for each one the first entry of
    ``event["exception"]["values"]`` plus the level, message and culprit
    are compared to the expectation.
    """

    def verify_next_event(exc_type, exc_value, message, culprit):
        # Pop the oldest captured event and compare it to the expectation.
        captured = self.raven.events.pop(0)
        assert "exception" in captured
        first_value = captured["exception"]["values"][0]
        assert first_value["type"] == exc_type
        assert first_value["value"] == exc_value
        assert captured["level"] == logging.ERROR
        assert captured["message"] == message
        assert captured["culprit"] == culprit

    with Settings(BREAK_THAT_500=True, INSTALLED_APPS=["raven.contrib.django"]):
        test_client = TestClient(REMOTE_ADDR="127.0.0.1")
        test_client.handler = MockSentryMiddleware(MockClientHandler())

        self.assertRaises(Exception, test_client.get, reverse("sentry-raise-exc"))

        assert len(self.raven.events) == 2

        verify_next_event(
            "Exception",
            "view exception",
            "Exception: view exception",
            "tests.contrib.django.views in raise_exc",
        )
        verify_next_event(
            "ValueError",
            "handler500",
            "ValueError: handler500",
            "tests.contrib.django.urls in handler500",
        )
def admin_client(db, admin_user):
    """Build a Django test client and log it in with ``admin_user``'s username.

    ``db`` is accepted but not used in the body.
    """
    from django.test.client import Client

    logged_in_client = Client()
    # Swap in the project's handler before authenticating.
    logged_in_client.handler = TestClienHandler()
    credentials = {'username': admin_user.username, 'password': '******'}
    logged_in_client.login(**credentials)
    return logged_in_client
def test_broken_500_handler_with_middleware(self):
    """Check that a failing view plus a failing 500 handler yields two events.

    The request is sent through ``MockSentryMiddleware`` with
    ``BREAK_THAT_500`` switched on. Each event is checked through its flat
    ``class_name`` / ``level`` / ``message`` / ``view`` keys.
    """
    with Settings(BREAK_THAT_500=True):
        client = TestClient(REMOTE_ADDR='127.0.0.1')
        client.handler = MockSentryMiddleware(MockClientHandler())

        self.assertRaises(Exception, client.get, reverse('sentry-raise-exc'))

        self.assertEqual(len(self.raven.events), 2)

        # First event: the exception raised by the view itself.
        event = self.raven.events.pop(0)
        self.assertEqual(event['class_name'], 'Exception')
        self.assertEqual(event['level'], logging.ERROR)
        self.assertEqual(event['message'], 'view exception')
        self.assertEqual(event['view'], 'tests.contrib.django.views.raise_exc')

        # Second event: the error raised while handling the first one.
        event = self.raven.events.pop(0)
        self.assertEqual(event['class_name'], 'ValueError')
        self.assertEqual(event['level'], logging.ERROR)
        self.assertEqual(event['message'], 'handler500')
        self.assertEqual(event['view'], 'tests.contrib.django.urls.handler500')
def _check_internal(self, tested_url):
    """Check an internal URL and store the verdict on this object.

    Sets ``self.message`` (and, depending on the branch, ``self.status``)
    according to the kind of link, then stamps ``self.last_checked`` and
    calls ``self.save()``.

    Args:
        tested_url: the URL string to examine. Paths starting with ``/``
            are requested through the Django test client.
    """
    from linkcheck.utils import LinkCheckHandler

    if not tested_url:
        self.message = 'Empty link'

    elif tested_url.startswith('mailto:'):
        self.status = None
        self.message = 'Email link (not automatically checked)'

    elif tested_url.startswith('#'):
        self.status = None
        self.message = 'Link to within the same page (not automatically checked)'

    elif tested_url.startswith(MEDIA_PREFIX):
        # TODO Assumes a direct mapping from media url to local filesystem path. This will break quite easily for alternate setups
        path = settings.MEDIA_ROOT + urlunquote(tested_url)[len(MEDIA_PREFIX) - 1:]
        decoded_path = html_decode(path)
        if os.path.exists(path) or os.path.exists(decoded_path):
            self.message = 'Working file link'
            self.status = True
        else:
            self.message = 'Missing Document'

    elif getattr(self, '_internal_hash', False) and getattr(self, '_instance', None):
        # This is a hash link pointing to itself
        from linkcheck import parse_anchors

        internal_hash = self._internal_hash
        instance = self._instance
        if internal_hash == '#':  # special case, point to #
            self.message = 'Working internal hash anchor'
            self.status = True
        else:
            anchor_name = internal_hash[1:]  # '#something' => 'something'
            # Search the concatenation of all HTML fields of the instance.
            html_content = ''.join(
                getattr(instance, field, '')
                for field in instance._linklist.html_fields
            )
            names = parse_anchors(html_content)
            if anchor_name in names:
                self.message = 'Working internal hash anchor'
                self.status = True
            else:
                self.message = 'Broken internal hash anchor'

    elif tested_url.startswith('/'):
        old_prepend_setting = settings.PREPEND_WWW
        settings.PREPEND_WWW = False
        try:
            c = Client()
            c.handler = LinkCheckHandler()
            response = c.get(tested_url)
            if USE_REVERSION:
                # using test client will clear the RevisionContextManager stack.
                revision_context_manager.start()

            if response.status_code == 200:
                self.message = 'Working internal link'
                self.status = True
                # see if the internal link points an anchor
                if tested_url[-1] == '#':  # special case, point to #
                    self.message = 'Working internal hash anchor'
                elif tested_url.count('#'):
                    anchor = tested_url.split('#')[1]
                    from linkcheck import parse_anchors
                    names = parse_anchors(response.content)
                    if anchor in names:
                        self.message = 'Working internal hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken internal hash anchor'
                        self.status = False

            elif response.status_code == 302 or response.status_code == 301:
                self.status = None
                self.message = 'This link redirects: code %d (not automatically checked)' % (response.status_code, )
            else:
                self.message = 'Broken internal link'
        finally:
            # Restore the global setting even when the request or the
            # anchor parsing raises; otherwise it would stay False.
            settings.PREPEND_WWW = old_prepend_setting
    else:
        self.message = 'Invalid URL'

    self.last_checked = now()
    self.save()
def check_url(self, check_internal=True, check_external=True, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL):
    """Check this URL and record the outcome in ``status`` / ``message``.

    ``self.status`` is reset to ``False`` first. A known site-domain prefix
    is stripped from ``self.url`` so the test client can request it.

    Args:
        check_internal: run the internal checks when the URL is not external.
        check_external: run the HTTP check when the URL is external.
        external_recheck_interval: minutes; an external URL checked more
            recently than this is not fetched again.

    Returns:
        ``self.status`` when an external URL was checked within the recheck
        interval. No other ``return`` statement is visible in this block.
    """
    from linkcheck.utils import LinkCheckHandler

    def _open(target):
        # Pass the connection timeout only when TIMEOUT_SUPPORT is set.
        if TIMEOUT_SUPPORT:
            return urllib2.urlopen(
                target,
                timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT
            )
        return urllib2.urlopen(target)

    external_recheck_datetime = now() - timedelta(minutes=external_recheck_interval)
    self.status = False

    # Remove current domain from URLs as the test client chokes when trying to test them during a page save
    # They shouldn't generally exist but occasionally slip through
    # If settings.SITE_DOMAINS isn't set then use settings.SITE_DOMAIN
    # but also check for variants: example.org, www.example.org, test.example.org

    original_url = None  # used to restore the original url afterwards

    if SITE_DOMAINS:  # if the setting is present
        internal_exceptions = SITE_DOMAINS

    else:  # try using SITE_DOMAIN
        root_domain = settings.SITE_DOMAIN
        if root_domain.startswith('www.'):
            root_domain = root_domain[4:]
        elif root_domain.startswith('test.'):
            root_domain = root_domain[5:]
        internal_exceptions = ['http://' + root_domain, 'http://www.' + root_domain, 'http://test.' + root_domain]

    for ex in internal_exceptions:
        if ex and self.url.startswith(ex):
            original_url = self.url
            self.url = self.url.replace(ex, '', 1)

    if check_internal and (not self.external):
        if not self.url:
            self.message = 'Empty link'

        elif self.url.startswith('mailto:'):
            self.status = None
            self.message = 'Email link (not automatically checked)'

        elif self.url.startswith('#'):
            self.status = None
            self.message = 'Link to within the same page (not automatically checked)'

        elif self.url.startswith(MEDIA_PREFIX):
            # TODO Assumes a direct mapping from media url to local filesystem path. This will break quite easily for alternate setups
            path = settings.MEDIA_ROOT + self.url_unquoted()[len(MEDIA_PREFIX) - 1:]
            decoded_path = html_decode(path)
            if os.path.exists(path) or os.path.exists(decoded_path):
                self.message = 'Working file link'
                self.status = True
            else:
                self.message = 'Missing Document'

        elif getattr(self, '_internal_hash', False) and getattr(self, '_instance', None):
            # This is a hash link pointing to itself
            from linkcheck import parse_anchors

            internal_hash = self._internal_hash
            instance = self._instance
            if internal_hash == '#':  # special case, point to #
                self.message = 'Working internal hash anchor'
                self.status = True
            else:
                anchor_name = internal_hash[1:]  # '#something' => 'something'
                # Search the concatenation of all HTML fields of the instance.
                html_content = ''.join(
                    getattr(instance, field, '')
                    for field in instance._linklist.html_fields
                )
                names = parse_anchors(html_content)
                if anchor_name in names:
                    self.message = 'Working internal hash anchor'
                    self.status = True
                else:
                    self.message = 'Broken internal hash anchor'

        elif self.url.startswith('/'):
            old_prepend_setting = settings.PREPEND_WWW
            settings.PREPEND_WWW = False
            try:
                c = Client()
                c.handler = LinkCheckHandler()
                response = c.get(self.url, follow=True)
                # using test client will clear the RevisionContextManager stack.
                from reversion.revisions import revision_context_manager
                revision_context_manager.start()

                if response.status_code == 200:
                    self.message = 'Working internal link'
                    self.status = True
                    # see if the internal link points an anchor
                    if self.url[-1] == '#':  # special case, point to #
                        self.message = 'Working internal hash anchor'
                    elif self.url.count('#'):
                        anchor = self.url.split('#')[1]
                        from linkcheck import parse_anchors
                        names = parse_anchors(response.content)
                        if anchor in names:
                            self.message = 'Working internal hash anchor'
                            self.status = True
                        else:
                            self.message = 'Broken internal hash anchor'
                            self.status = False

                elif (response.status_code == 302 or response.status_code == 301):
                    self.status = None
                    self.message = 'This link redirects: code %d (not automatically checked)' % (response.status_code, )
                else:
                    self.message = 'Broken internal link'
            finally:
                # Restore the global setting even when the request or the
                # anchor parsing raises; otherwise it would stay False.
                settings.PREPEND_WWW = old_prepend_setting
        else:
            self.message = 'Invalid URL'

        if original_url:  # restore the original url before saving
            self.url = original_url

        self.last_checked = now()
        self.save()

    elif check_external and self.external:
        logger.info('checking external link: %s' % self.url)
        if self.last_checked and (self.last_checked > external_recheck_datetime):
            return self.status

        try:
            # Remove URL fragment identifiers
            url = self.url.rsplit('#')[0]

            if self.url.count('#'):
                # We have to get the content so we can check the anchors
                response = _open(url)
            else:
                # Might as well just do a HEAD request
                req = HeadRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                try:
                    response = _open(req)
                except (ValueError, urllib2.HTTPError) as error:
                    # ...except sometimes it triggers a bug in urllib2
                    if hasattr(error, 'code') and error.code == METHOD_NOT_ALLOWED:
                        # HEAD was rejected, so retry the same URL with GET.
                        req = GetRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                    else:
                        req = url
                    response = _open(req)

            self.message = ' '.join([str(response.code), response.msg])
            self.status = True

            if self.url.count('#'):
                anchor = self.url.split('#')[1]
                from linkcheck import parse_anchors
                try:
                    names = parse_anchors(response.read())
                    if anchor in names:
                        self.message = 'Working external hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken external hash anchor'
                        self.status = False

                except Exception:
                    # The external web page is mal-formatted #or maybe other parse errors like encoding
                    # I reckon a broken anchor on an otherwise good URL should count as a pass
                    # (Exception, not a bare except, so KeyboardInterrupt and
                    # SystemExit still propagate.)
                    self.message = "Page OK but anchor can't be checked"
                    self.status = True

        except BadStatusLine:
            self.message = "Bad Status Line"

        except urllib2.HTTPError as e:
            if hasattr(e, 'code') and hasattr(e, 'msg'):
                self.message = ' '.join([str(e.code), e.msg])
            else:
                self.message = "Unknown Error"

        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                self.message = 'Unreachable: ' + str(e.reason)
            elif hasattr(e, 'code') and e.code != 301:
                self.message = 'Error: ' + str(e.code)
            else:
                self.message = 'Redirect. Check manually: ' + str(e.code)

        # NOTE(review): in the code visible here the external branch neither
        # updates last_checked nor calls save() - confirm whether that is
        # handled after this point or by the caller.
def check(self, check_internal=True, check_external=True, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL):
    """Check this URL and record the outcome in ``status`` / ``message``.

    ``self.status`` is reset to ``False`` first. A known site-domain prefix
    is stripped from ``self.url`` so the test client can request it.

    Args:
        check_internal: run the internal checks when the URL is not external.
        check_external: run the HTTP check when the URL is external.
        external_recheck_interval: minutes; an external URL checked more
            recently than this is not fetched again.

    Returns:
        ``self.status`` when an external URL was checked within the recheck
        interval. No other ``return`` statement is visible in this block.
    """
    from linkcheck.utils import LinkCheckHandler

    def _open(target):
        # Pass the connection timeout only when TIMEOUT_SUPPORT is set.
        if TIMEOUT_SUPPORT:
            return urllib2.urlopen(
                target,
                timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
        return urllib2.urlopen(target)

    external_recheck_datetime = now() - timedelta(
        minutes=external_recheck_interval)
    self.status = False

    # Remove current domain from URLs as the test client chokes when trying to test them during a page save
    # They shouldn't generally exist but occasionally slip through
    # If settings.SITE_DOMAINS isn't set then use settings.SITE_DOMAIN
    # but also check for variants: example.org, www.example.org, test.example.org

    original_url = None  # used to restore the original url afterwards

    if SITE_DOMAINS:  # if the setting is present
        internal_exceptions = SITE_DOMAINS

    else:  # try using SITE_DOMAIN
        root_domain = settings.SITE_DOMAIN
        if root_domain.startswith('www.'):
            root_domain = root_domain[4:]
        elif root_domain.startswith('test.'):
            root_domain = root_domain[5:]
        internal_exceptions = [
            'http://' + root_domain,
            'http://www.' + root_domain,
            'http://test.' + root_domain
        ]

    for ex in internal_exceptions:
        if ex and self.url.startswith(ex):
            original_url = self.url
            self.url = self.url.replace(ex, '', 1)

    if check_internal and (not self.external):
        if not self.url:
            self.message = 'Empty link'

        elif self.url.startswith('mailto:'):
            self.status = None
            self.message = 'Email link (not automatically checked)'

        elif self.url.startswith('#'):
            self.status = None
            self.message = 'Link to within the same page (not automatically checked)'

        elif self.url.startswith(MEDIA_PREFIX):
            # TODO Assumes a direct mapping from media url to local filesystem path. This will break quite easily for alternate setups
            if os.path.exists(settings.MEDIA_ROOT + self.url_unquoted()[len(MEDIA_PREFIX) - 1:]):
                self.message = 'Working file link'
                self.status = True
            else:
                self.message = 'Missing Document'

        elif getattr(self, '_internal_hash', False) and getattr(
                self, '_instance', None):
            # This is a hash link pointing to itself
            from linkcheck import parse_anchors

            internal_hash = self._internal_hash
            instance = self._instance
            if internal_hash == '#':  # special case, point to #
                self.message = 'Working internal hash anchor'
                self.status = True
            else:
                anchor_name = internal_hash[1:]  # '#something' => 'something'
                # Search the concatenation of all HTML fields of the instance.
                html_content = ''.join(
                    getattr(instance, field, '')
                    for field in instance._linklist.html_fields
                )
                names = parse_anchors(html_content)
                if anchor_name in names:
                    self.message = 'Working internal hash anchor'
                    self.status = True
                else:
                    self.message = 'Broken internal hash anchor'

        elif self.url.startswith('/'):
            old_prepend_setting = settings.PREPEND_WWW
            settings.PREPEND_WWW = False
            try:
                c = Client()
                c.handler = LinkCheckHandler()
                response = c.get(self.url, follow=True)

                if response.status_code == 200:
                    self.message = 'Working internal link'
                    self.status = True
                    # see if the internal link points an anchor
                    if self.url[-1] == '#':  # special case, point to #
                        self.message = 'Working internal hash anchor'
                    elif self.url.count('#'):
                        anchor = self.url.split('#')[1]
                        from linkcheck import parse_anchors
                        names = parse_anchors(response.content)
                        if anchor in names:
                            self.message = 'Working internal hash anchor'
                            self.status = True
                        else:
                            self.message = 'Broken internal hash anchor'
                            self.status = False

                elif (response.status_code == 302
                      or response.status_code == 301):
                    self.status = None
                    self.message = 'This link redirects: code %d (not automatically checked)' % (
                        response.status_code, )
                else:
                    self.message = 'Broken internal link'
            finally:
                # Restore the global setting even when the request or the
                # anchor parsing raises; otherwise it would stay False.
                settings.PREPEND_WWW = old_prepend_setting
        else:
            self.message = 'Invalid URL'

        if original_url:  # restore the original url before saving
            self.url = original_url

        self.last_checked = now()
        self.save()

    elif check_external and self.external:
        logger.info('checking external link: %s' % self.url)
        if self.last_checked and (self.last_checked >
                                  external_recheck_datetime):
            return self.status

        try:
            # Remove URL fragment identifiers
            url = self.url.rsplit('#')[0]

            if self.url.count('#'):
                # We have to get the content so we can check the anchors
                response = _open(url)
            else:
                # Might as well just do a HEAD request
                req = HeadRequest(url,
                                  headers={
                                      'User-Agent':
                                      "http://%s Linkchecker" %
                                      settings.SITE_DOMAIN
                                  })
                try:
                    response = _open(req)
                except (ValueError, urllib2.HTTPError) as error:
                    # ...except sometimes it triggers a bug in urllib2
                    if hasattr(
                            error,
                            'code') and error.code == METHOD_NOT_ALLOWED:
                        # HEAD was rejected, so retry the same URL with GET.
                        req = GetRequest(url,
                                         headers={
                                             'User-Agent':
                                             "http://%s Linkchecker" %
                                             settings.SITE_DOMAIN
                                         })
                    else:
                        req = url
                    response = _open(req)

            self.message = ' '.join([str(response.code), response.msg])
            self.status = True

            if self.url.count('#'):
                anchor = self.url.split('#')[1]
                from linkcheck import parse_anchors
                try:
                    names = parse_anchors(response.read())
                    if anchor in names:
                        self.message = 'Working external hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken external hash anchor'
                        self.status = False

                except Exception:
                    # The external web page is mal-formatted #or maybe other parse errors like encoding
                    # I reckon a broken anchor on an otherwise good URL should count as a pass
                    # (Exception, not a bare except, so KeyboardInterrupt and
                    # SystemExit still propagate.)
                    self.message = "Page OK but anchor can't be checked"
                    self.status = True

        except BadStatusLine:
            self.message = "Bad Status Line"

        except urllib2.HTTPError as e:
            if hasattr(e, 'code') and hasattr(e, 'msg'):
                self.message = ' '.join([str(e.code), e.msg])
            else:
                self.message = "Unknown Error"

        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                self.message = 'Unreachable: ' + str(e.reason)
            elif hasattr(e, 'code') and e.code != 301:
                self.message = 'Error: ' + str(e.code)
            else:
                self.message = 'Redirect. Check manually: ' + str(e.code)

        # NOTE(review): in the code visible here the external branch neither
        # updates last_checked nor calls save() - confirm whether that is
        # handled after this point or by the caller.
def _check_internal(self, tested_url):
    """Check an internal URL and store the verdict on this object.

    Sets ``self.message`` (and, depending on the branch, ``self.status``)
    according to the kind of link, then stamps ``self.last_checked`` and
    calls ``self.save()``.

    Args:
        tested_url: the URL string to examine. Paths starting with ``/``
            are requested through the Django test client; a 301/302 answer
            is requested again with ``follow=True`` to judge the target.
    """
    from linkcheck.utils import LinkCheckHandler

    if not tested_url:
        self.message = 'Empty link'

    elif tested_url.startswith('mailto:'):
        self.status = None
        self.message = 'Email link (not automatically checked)'

    elif tested_url.startswith('tel:'):
        self.status = None
        self.message = 'Phone number (not automatically checked)'

    elif tested_url.startswith('#'):
        self.status = None
        self.message = 'Link to within the same page (not automatically checked)'

    elif tested_url.startswith(MEDIA_PREFIX):
        # TODO Assumes a direct mapping from media url to local filesystem path. This will break quite easily for alternate setups
        path = settings.MEDIA_ROOT + urlunquote(
            tested_url)[len(MEDIA_PREFIX) - 1:]
        decoded_path = html_decode(path)
        if os.path.exists(path) or os.path.exists(decoded_path):
            self.message = 'Working file link'
            self.status = True
        else:
            self.message = 'Missing Document'

    elif getattr(self, '_internal_hash', False) and getattr(
            self, '_instance', None):
        # This is a hash link pointing to itself
        from linkcheck import parse_anchors

        internal_hash = self._internal_hash
        instance = self._instance
        if internal_hash == '#':  # special case, point to #
            self.message = 'Working internal hash anchor'
            self.status = True
        else:
            anchor_name = internal_hash[1:]  # '#something' => 'something'
            # Search the concatenation of all HTML fields of the instance.
            html_content = ''.join(
                getattr(instance, field, '')
                for field in instance._linklist.html_fields
            )
            try:
                names = parse_anchors(html_content)
                if anchor_name in names:
                    self.message = 'Working internal hash anchor'
                    self.status = True
                else:
                    self.message = 'Broken internal hash anchor'
            except UnicodeDecodeError:
                self.message = 'Failed to parse HTML for anchor'

    elif tested_url.startswith('/'):
        old_prepend_setting = settings.PREPEND_WWW
        settings.PREPEND_WWW = False
        try:
            c = Client()
            c.handler = LinkCheckHandler()
            response = c.get(tested_url)
            if response.status_code == 200:
                self.message = 'Working internal link'
                self.status = True
                # see if the internal link points an anchor
                if tested_url[-1] == '#':  # special case, point to #
                    self.message = 'Working internal hash anchor'
                elif tested_url.count('#'):
                    anchor = tested_url.split('#')[1]
                    from linkcheck import parse_anchors
                    try:
                        names = parse_anchors(response.content)
                        if anchor in names:
                            self.message = 'Working internal hash anchor'
                            self.status = True
                        else:
                            self.message = 'Broken internal hash anchor'
                            self.status = False
                    except UnicodeDecodeError:
                        self.message = 'Failed to parse HTML for anchor'

            elif response.status_code == 302 or response.status_code == 301:
                # Follow the redirect chain to see whether it ends on a 200.
                redir_response = c.get(tested_url, follow=True)
                if redir_response.status_code == 200:
                    redir_state = 'Working redirect'
                    self.status = True
                else:
                    redir_state = 'Broken redirect'
                    self.status = False
                self.message = 'This link redirects: code %d (%s)' % (
                    response.status_code, redir_state)
            else:
                self.message = 'Broken internal link'
        finally:
            # Restore the global setting even when a request or the anchor
            # parsing raises; otherwise it would stay False.
            settings.PREPEND_WWW = old_prepend_setting
    else:
        self.message = 'Invalid URL'

    if USE_REVERSION:
        # using test client will clear the RevisionContextManager stack.
        revision_context_manager.start()

    self.last_checked = now()
    self.save()
def check(self, check_internal=True, check_external=True, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL):
    """Check this URL and record the outcome in ``status`` / ``message``.

    ``self.status`` is reset to ``False`` first. A known site-domain prefix
    is stripped from ``self.url`` so the test client can request it.

    Args:
        check_internal: run the internal checks when the URL is not external.
        check_external: run the HTTP check when the URL is external.
        external_recheck_interval: minutes; an external URL checked more
            recently than this is not fetched again.

    Returns:
        ``self.status`` when an external URL was checked within the recheck
        interval. No other ``return`` statement is visible in this block.
    """
    from linkcheck.utils import LinkCheckHandler
    external_recheck_datetime = datetime.now() - timedelta(minutes=external_recheck_interval)
    self.status = False

    # Remove current domain from URLs as the test client chokes when trying to test them during a page save
    # They shouldn't generally exist but occasionally slip through
    # If settings.SITE_DOMAINS isn't set then use settings.SITE_DOMAIN
    # but also check for variants: example.org, www.example.org, test.example.org

    original_url = None  # used to restore the original url afterwards

    if SITE_DOMAINS:  # if the setting is present
        internal_exceptions = SITE_DOMAINS

    else:  # try using SITE_DOMAIN
        root_domain = settings.SITE_DOMAIN
        if root_domain.startswith('www.'):
            root_domain = root_domain[4:]
        elif root_domain.startswith('test.'):
            root_domain = root_domain[5:]
        internal_exceptions = ['http://' + root_domain, 'http://www.' + root_domain, 'http://test.' + root_domain]

    for ex in internal_exceptions:
        if ex and self.url.startswith(ex):
            original_url = self.url
            self.url = self.url.replace(ex, '', 1)

    if check_internal and (not self.external):
        if not self.url:
            self.message = 'Empty link'

        elif self.url.startswith('mailto:'):
            self.status = None
            self.message = 'Email link (not automatically checked)'

        elif self.url.startswith('#'):
            self.status = None
            self.message = 'Link to within the same page (not automatically checked)'

        elif self.url.startswith(MEDIA_PREFIX):
            # TODO Assumes a direct mapping from media url to local filesystem path. This will break quite easily for alternate setups
            if os.path.exists(settings.MEDIA_ROOT + self.url_unquoted()[len(MEDIA_PREFIX) - 1:]):
                self.message = 'Working file link'
                self.status = True
            else:
                self.message = 'Missing Document'

        elif self.url.startswith('/'):
            old_prepend_setting = settings.PREPEND_WWW
            settings.PREPEND_WWW = False
            try:
                c = Client()
                c.handler = LinkCheckHandler()
                response = c.get(self.url, follow=True)

                if response.status_code == 200:
                    self.message = 'Working internal link'
                    self.status = True
                    # see if the internal link points an anchor
                    if self.url[-1] == '#':  # special case, point to #
                        self.message = 'Working internal hash anchor'
                    elif self.url.count('#'):
                        anchor = self.url.split('#')[1]
                        from linkcheck import parse_anchors
                        names = parse_anchors(response.content)
                        if anchor in names:
                            self.message = 'Working internal hash anchor'
                            self.status = True
                        else:
                            self.message = 'Broken internal hash anchor'
                            self.status = False

                elif (response.status_code == 302 or response.status_code == 301):
                    self.status = None
                    self.message = 'This link redirects: code %d (not automatically checked)' % (response.status_code, )
                else:
                    self.message = 'Broken internal link'
            finally:
                # Restore the global setting even when the request or the
                # anchor parsing raises; otherwise it would stay False.
                settings.PREPEND_WWW = old_prepend_setting
        else:
            self.message = 'Invalid URL'

        if original_url:  # restore the original url before saving
            self.url = original_url

        self.last_checked = datetime.now()
        self.save()

    elif check_external and self.external:
        if self.last_checked and (self.last_checked > external_recheck_datetime):
            return self.status

        try:
            # Remove URL fragment identifiers
            url = self.url.rsplit('#')[0]

            if self.url.count('#'):
                # We have to get the content so we can check the anchors
                response = urllib2.urlopen(url)
            else:
                # Might as well just do a HEAD request
                req = HeadRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                try:
                    response = urllib2.urlopen(req)
                except ValueError:
                    # ...except sometimes it triggers a bug in urllib2
                    response = urllib2.urlopen(url)

            self.message = ' '.join([str(response.code), response.msg])
            self.status = True

            if self.url.count('#'):
                anchor = self.url.split('#')[1]
                from linkcheck import parse_anchors
                try:
                    names = parse_anchors(response.read())
                    if anchor in names:
                        self.message = 'Working external hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken external hash anchor'
                        self.status = False

                except HTMLParseError:
                    # The external web page is mal-formatted
                    # I reckon a broken anchor on an otherwise good URL should count as a pass
                    self.message = "Page OK but anchor can't be checked"
                    self.status = True

        except BadStatusLine:
            self.message = "Bad Status Line"

        except urllib2.HTTPError as e:
            if hasattr(e, 'code') and hasattr(e, 'msg'):
                self.message = ' '.join([str(e.code), e.msg])
            else:
                self.message = "Unknown Error"

        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                self.message = 'Unreachable: ' + str(e.reason)
            elif hasattr(e, 'code') and e.code != 301:
                self.message = 'Error: ' + str(e.code)
            else:
                self.message = 'Redirect. Check manually: ' + str(e.code)

        # NOTE(review): in the code visible here the external branch neither
        # updates last_checked nor calls save() - confirm whether that is
        # handled after this point or by the caller.