Example 1
    def _check_internal(self, tested_url):
        """Check an internal (same-site) URL and save the outcome.

        Sets ``self.status`` (True on success, None for links that are not
        automatically checked, False otherwise) and ``self.message``, then
        stamps ``self.last_checked`` and saves the instance.
        """
        from linkcheck.utils import LinkCheckHandler

        if not tested_url:
            self.message = 'Empty link'

        elif tested_url.startswith('mailto:'):
            self.status = None
            self.message = 'Email link (not automatically checked)'

        elif tested_url.startswith('#'):
            self.status = None
            self.message = 'Link to within the same page (not automatically checked)'

        elif tested_url.startswith(MEDIA_PREFIX):
            # TODO Assumes a direct mapping from media url to local filesystem
            # path. This will break quite easily for alternate setups.
            path = settings.MEDIA_ROOT + urlunquote(tested_url)[len(MEDIA_PREFIX) - 1:]
            decoded_path = html_decode(path)
            if os.path.exists(path) or os.path.exists(decoded_path):
                self.message = 'Working file link'
                self.status = True
            else:
                self.message = 'Missing Document'

        elif getattr(self, '_internal_hash', False) and getattr(self, '_instance', None):
            # This is a hash link pointing to itself.
            from linkcheck import parse_anchors

            anchor = self._internal_hash  # renamed from `hash`: don't shadow the builtin
            instance = self._instance
            if anchor == '#':  # special case, points to the top of the page
                self.message = 'Working internal hash anchor'
                self.status = True
            else:
                anchor = anchor[1:]  # '#something' => 'something'
                # Concatenate every HTML field of the owning instance and
                # look for a matching anchor name.
                html_content = ''
                for field in instance._linklist.html_fields:
                    html_content += getattr(instance, field, '')
                names = parse_anchors(html_content)
                if anchor in names:
                    self.message = 'Working internal hash anchor'
                    self.status = True
                else:
                    self.message = 'Broken internal hash anchor'

        elif tested_url.startswith('/'):
            old_prepend_setting = settings.PREPEND_WWW
            settings.PREPEND_WWW = False
            # BUGFIX: restore PREPEND_WWW even if the test client raises,
            # so a failed check no longer leaks the changed global setting.
            try:
                c = Client()
                c.handler = LinkCheckHandler()
                response = c.get(tested_url)
                if USE_REVERSION:
                    # using test client will clear the RevisionContextManager stack.
                    revision_context_manager.start()

                if response.status_code == 200:
                    self.message = 'Working internal link'
                    self.status = True
                    # see if the internal link points at an anchor
                    if tested_url[-1] == '#':  # special case, point to #
                        self.message = 'Working internal hash anchor'
                    elif tested_url.count('#'):
                        anchor = tested_url.split('#')[1]
                        from linkcheck import parse_anchors
                        names = parse_anchors(response.content)
                        if anchor in names:
                            self.message = 'Working internal hash anchor'
                            self.status = True
                        else:
                            self.message = 'Broken internal hash anchor'
                            self.status = False

                elif response.status_code in (301, 302):
                    self.status = None
                    self.message = 'This link redirects: code %d (not automatically checked)' % (response.status_code, )
                else:
                    self.message = 'Broken internal link'
            finally:
                settings.PREPEND_WWW = old_prepend_setting
        else:
            self.message = 'Invalid URL'

        self.last_checked = now()
        self.save()
Example 2
    def _check_external(self, tested_url, external_recheck_interval):
        """Check an external URL over HTTP (urllib) and save the outcome.

        Returns the cached ``self.status`` when the link was already checked
        within the last ``external_recheck_interval`` minutes.  Otherwise
        performs a HEAD request (or a GET when an anchor fragment must be
        inspected, or when HEAD is rejected), then updates ``self.status``,
        ``self.message`` and ``self.redirect_to`` before saving.
        """
        logger.info('checking external link: %s' % tested_url)
        external_recheck_datetime = now() - timedelta(minutes=external_recheck_interval)

        if self.last_checked and (self.last_checked > external_recheck_datetime):
            return self.status

        opener = build_opener(RedirectHandler)
        # Remove URL fragment identifiers
        url = tested_url.rsplit('#')[0]
        # Check that non-ascii chars are properly encoded
        try:
            url.encode('ascii')
        except UnicodeEncodeError:
            url = iri_to_uri(url)

        try:
            if tested_url.count('#'):
                # We have to get the content so we can check the anchors
                response = opener.open(
                    url,
                    timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT
                )
            else:
                # Might as well just do a HEAD request
                req = HeadRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                try:
                    response = opener.open(
                        req,
                        timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT
                    )
                except (ValueError, HTTPError) as error:
                    # ...except sometimes it triggers a bug in urllib2
                    if hasattr(error, 'code') and error.code == METHOD_NOT_ALLOWED:
                        # HEAD rejected with 405: retry the same URL as GET.
                        req = GetRequest(url, headers={'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN})
                    else:
                        req = url
                    response = opener.open(
                        req,
                        timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT
                    )

            self.message = ' '.join([str(response.code), response.msg])
            self.status = True

            if tested_url.count('#'):

                anchor = tested_url.split('#')[1]
                from linkcheck import parse_anchors
                try:
                    names = parse_anchors(response.read())
                    if anchor in names:
                        self.message = 'Working external hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken external hash anchor'
                        self.status = False

                except Exception:
                    # BUGFIX: was a bare ``except:`` which also swallowed
                    # KeyboardInterrupt/SystemExit.  The external page may be
                    # mal-formatted or badly encoded; a broken anchor on an
                    # otherwise good URL counts as a pass.
                    self.message = "Page OK but anchor can't be checked"
                    self.status = True

        except http_client.BadStatusLine:
            self.message = "Bad Status Line"

        except HTTPError as e:
            if hasattr(e, 'code') and hasattr(e, 'msg'):
                self.message = ' '.join([str(e.code), e.msg])
            else:
                self.message = "Unknown Error"

        except URLError as e:
            if hasattr(e, 'reason'):
                self.message = 'Unreachable: ' + str(e.reason)
            elif hasattr(e, 'code') and e.code != 301:
                self.message = 'Error: ' + str(e.code)
            else:
                # BUGFIX: ``e`` may lack a ``code`` attribute here, which
                # previously raised AttributeError inside the handler.
                self.message = 'Redirect. Check manually: ' + str(getattr(e, 'code', 'unknown'))
        except Exception as e:
            self.message = 'Other Error: %s' % e
        else:
            # Remember (or clear) a permanent-redirect target.
            if response.getcode() == 301 and response.geturl() != url:
                self.redirect_to = response.geturl()
            elif self.redirect_to:
                self.redirect_to = ''

        self.last_checked = now()
        self.save()
Example 3
    def _check_external(self, tested_url, external_recheck_interval):
        """Check an external URL with ``requests`` and save the outcome.

        Returns the cached ``self.status`` when the link was already checked
        within the last ``external_recheck_interval`` minutes.  Otherwise
        issues a HEAD request (a GET when an anchor must be inspected, or as
        a fallback when HEAD yields >= 400), then updates ``self.status``,
        ``self.message`` and ``self.redirect_to`` before saving.
        """
        logger.info('checking external link: %s' % tested_url)
        external_recheck_datetime = now() - timedelta(
            minutes=external_recheck_interval)

        if self.last_checked and (self.last_checked >
                                  external_recheck_datetime):
            return self.status

        # Remove URL fragment identifiers
        url = tested_url.rsplit('#')[0]
        # Check that non-ascii chars are properly encoded
        try:
            url.encode('ascii')
        except UnicodeEncodeError:
            url = iri_to_uri(url)

        request_params = {
            # SECURITY NOTE: TLS certificate verification is disabled, so
            # broken/invalid certificates will not be reported as errors.
            'verify': False,
            'allow_redirects': True,
            'headers': {
                'User-Agent': "http://%s Linkchecker" % settings.SITE_DOMAIN
            },
            'timeout': LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT,
        }
        try:
            if tested_url.count('#'):
                # We have to get the content so we can check the anchors
                response = requests.get(url, **request_params)
            else:
                # Might as well just do a HEAD request
                response = requests.head(url, **request_params)

            if response.status_code >= 400:
                # If HEAD is not allowed, let's try with GET
                response = requests.get(url, **request_params)
        except ReadTimeout:
            self.message = 'Other Error: The read operation timed out'
            self.status = False
        except Exception as e:
            self.message = 'Other Error: %s' % e
            self.status = False
        else:
            self.message = ' '.join(
                [str(response.status_code), response.reason])
            self.status = 200 <= response.status_code < 400

            if tested_url.count('#'):
                anchor = tested_url.split('#')[1]
                from linkcheck import parse_anchors
                try:
                    names = parse_anchors(response.text)
                    if anchor in names:
                        self.message = 'Working external hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken external hash anchor'
                        self.status = False

                except Exception:
                    # BUGFIX: was a bare ``except:`` which also swallowed
                    # KeyboardInterrupt/SystemExit.  The external page may be
                    # mal-formatted or badly encoded; a broken anchor on an
                    # otherwise good URL counts as a pass.
                    self.message = "Page OK but anchor can't be checked"
                    self.status = True

            if response.status_code in REDIRECT_STATI:
                # Still a redirect status even though redirects were
                # followed: the redirection could not be resolved.
                self.status = False
            elif response.status_code < 300 and response.history:
                # Surface the original redirect status and remember the
                # final destination URL.
                self.message = ' '.join([
                    str(response.history[0].status_code),
                    response.history[0].reason
                ])
                self.redirect_to = response.url

        self.last_checked = now()
        self.save()
    def check_url(self, check_internal=True, check_external=True, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL):
        """Check this link (internal or external) and save the outcome.

        Sets ``self.status`` (True/False, or None for links that are not
        automatically checked) and ``self.message``.  External links checked
        within the last ``external_recheck_interval`` minutes are skipped and
        the cached ``self.status`` is returned instead.

        NOTE(review): this variant uses Python 2-only syntax
        (``except X, e:``) and ``urllib2``; it will not run under Python 3.
        """
        from linkcheck.utils import LinkCheckHandler
        external_recheck_datetime = now() - timedelta(minutes=external_recheck_interval)
        self.status  = False

        # Remove current domain from URLs as the test client chokes when trying to test them during a page save
        # They shouldn't generally exist but occasionally slip through
        # If settings.SITE_DOMAINS isn't set then use settings.SITE_DOMAIN
        # but also check for variants: example.org, www.example.org, test.example.org

        original_url = None # used to restore the original url afterwards

        if SITE_DOMAINS: # if the setting is present
            internal_exceptions = SITE_DOMAINS

        else: # try using SITE_DOMAIN, deriving the www./test. variants
            root_domain = settings.SITE_DOMAIN
            if root_domain.startswith('www.'):
                root_domain = root_domain[4:]
            elif root_domain.startswith('test.'):
                root_domain = root_domain[5:]
            internal_exceptions = ['http://'+root_domain, 'http://www.'+root_domain, 'http://test.'+root_domain]

        # Strip a matching own-domain prefix so the URL is treated as internal.
        for ex in internal_exceptions:
            if ex and self.url.startswith(ex):
                original_url = self.url
                self.url = self.url.replace(ex, '', 1)

        if check_internal and (not self.external):
            if not(self.url):
                self.message = 'Empty link'

            elif self.url.startswith('mailto:'):
                self.status = None
                self.message = 'Email link (not automatically checked)'

            elif self.url.startswith('#'):
                self.status = None
                self.message = 'Link to within the same page (not automatically checked)'

            elif self.url.startswith(MEDIA_PREFIX):
                # TODO Assumes a direct mapping from media url to local filesystem path. This will break quite easily for alternate setups
                path = settings.MEDIA_ROOT + self.url_unquoted()[len(MEDIA_PREFIX)-1:]
                decoded_path = html_decode(path)
                if os.path.exists(path) or os.path.exists(decoded_path):
                    self.message = 'Working file link'
                    self.status = True
                else:
                    self.message = 'Missing Document'

            elif getattr(self, '_internal_hash', False) and getattr(self, '_instance', None):
                # This is a hash link pointing to itself
                from linkcheck import parse_anchors

                hash = self._internal_hash
                instance = self._instance
                if hash == '#': # special case, point to #
                    self.message = 'Working internal hash anchor'
                    self.status = True
                else:
                    hash = hash[1:] # '#something' => 'something'
                    # Concatenate every HTML field of the owning instance
                    # and look for a matching anchor name.
                    html_content = ''
                    for field in instance._linklist.html_fields:
                        html_content += getattr(instance, field, '')
                    names = parse_anchors(html_content)
                    if hash in names:
                        self.message = 'Working internal hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken internal hash anchor'

            elif self.url.startswith('/'):
                old_prepend_setting = settings.PREPEND_WWW
                settings.PREPEND_WWW = False
                c = Client()
                c.handler = LinkCheckHandler()
                response = c.get(self.url, follow=True)
                # Using the test client clears the RevisionContextManager
                # stack, so restart it afterwards.
                from reversion.revisions import revision_context_manager
                revision_context_manager.start()

                if response.status_code == 200:
                    self.message = 'Working internal link'
                    self.status = True
                    # see if the internal link points at an anchor
                    if self.url[-1] == '#': # special case, point to #
                        self.message = 'Working internal hash anchor'
                    elif self.url.count('#'):
                        anchor = self.url.split('#')[1]
                        from linkcheck import parse_anchors
                        names = parse_anchors(response.content)
                        if anchor in names:
                            self.message = 'Working internal hash anchor'
                            self.status = True
                        else:
                            self.message = 'Broken internal hash anchor'
                            self.status = False

                elif (response.status_code == 302 or response.status_code == 301):
                    self.status = None
                    self.message = 'This link redirects: code %d (not automatically checked)' % (response.status_code, )
                else:
                    self.message = 'Broken internal link'
                settings.PREPEND_WWW = old_prepend_setting
            else:
                self.message = 'Invalid URL'

            if original_url: # restore the original url before saving
                self.url = original_url

            self.last_checked  = now()
            self.save()

        elif check_external and self.external:
            logger.info('checking external link: %s' % self.url)
            # Skip the check when the link was rechecked recently.
            if self.last_checked and (self.last_checked > external_recheck_datetime):
                return self.status

            try:

                # Remove URL fragment identifiers
                url = self.url.rsplit('#')[0]

                if self.url.count('#'):
                    # We have to get the content so we can check the anchors
                    if TIMEOUT_SUPPORT:
                        response = urllib2.urlopen(
                            url,
                            timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT
                        )
                    else:
                        response = urllib2.urlopen(url)
                else:
                    # Might as well just do a HEAD request
                    req = HeadRequest(url, headers={'User-Agent' : "http://%s Linkchecker" % settings.SITE_DOMAIN})
                    try:
                        if TIMEOUT_SUPPORT:
                            response = urllib2.urlopen(
                                req,
                                timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT
                            )
                        else:
                            response = urllib2.urlopen(req)
                    except (ValueError, urllib2.HTTPError):
                        _, error, _ = sys.exc_info()
                        # ...except sometimes it triggers a bug in urllib2
                        if hasattr(error, 'code') and error.code == METHOD_NOT_ALLOWED:
                            # HEAD rejected with 405: retry the same URL as a GET.
                            req = GetRequest(url, headers={'User-Agent' : "http://%s Linkchecker" % settings.SITE_DOMAIN})
                        else:
                            req = url
                        if TIMEOUT_SUPPORT:
                            response = urllib2.urlopen(
                                req,
                                timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT
                            )
                        else:
                            response = urllib2.urlopen(req)

                self.message = ' '.join([str(response.code), response.msg])
                self.status = True

                if self.url.count('#'):

                    anchor = self.url.split('#')[1]
                    from linkcheck import parse_anchors
                    try:
                        names = parse_anchors(response.read())
                        if anchor in names:
                            self.message = 'Working external hash anchor'
                            self.status = True
                        else:
                            self.message = 'Broken external hash anchor'
                            self.status = False

                    except:
                        # The external web page is mal-formatted #or maybe other parse errors like encoding
                        # I reckon a broken anchor on an otherwise good URL should count as a pass
                        self.message = "Page OK but anchor can't be checked"
                        self.status = True

            except BadStatusLine:
                    self.message = "Bad Status Line"

            except urllib2.HTTPError, e:
                if hasattr(e, 'code') and hasattr(e, 'msg'):
                    self.message = ' '.join([str(e.code), e.msg])
                else:
                    self.message = "Unknown Error"

            except urllib2.URLError, e:
                if hasattr(e, 'reason'):
                    self.message = 'Unreachable: '+str(e.reason)
                elif hasattr(e, 'code') and e.code!=301:
                    self.message = 'Error: '+str(e.code)
                else:
                    # NOTE(review): ``e`` may lack ``code`` here, which would
                    # raise AttributeError — looks like a latent bug; confirm.
                    self.message = 'Redirect. Check manually: '+str(e.code)
Example 5
    def _check_internal(self, tested_url):
        """Check an internal (same-site) URL and save the outcome.

        Internal links starting with ``/`` are checked with a real HTTP
        request against ``settings.ALLOWED_HOSTS[0]`` instead of the Django
        test client.  Sets ``self.status``/``self.message``, then stamps
        ``self.last_checked`` and saves.
        """
        if not tested_url:
            self.message = 'Empty link'

        elif tested_url.startswith('mailto:'):
            self.status = None
            self.message = 'Email link (not automatically checked)'

        elif tested_url.startswith('tel:'):
            self.status = None
            self.message = 'Phone number (not automatically checked)'

        elif tested_url.startswith('#'):
            self.status = None
            self.message = 'Link to within the same page (not automatically checked)'

        elif tested_url.startswith(MEDIA_PREFIX):
            # TODO Assumes a direct mapping from media url to local filesystem
            # path. This will break quite easily for alternate setups.
            path = settings.MEDIA_ROOT + unquote(tested_url)[len(MEDIA_PREFIX) - 1:]
            decoded_path = html_decode(path)
            if os.path.exists(path) or os.path.exists(decoded_path):
                self.message = 'Working file link'
                self.status = True
            else:
                self.message = 'Missing Document'

        elif getattr(self, '_internal_hash', False) and getattr(
                self, '_instance', None):
            # This is a hash link pointing to itself.
            from linkcheck import parse_anchors

            anchor = self._internal_hash  # renamed from `hash`: don't shadow the builtin
            instance = self._instance
            if anchor == '#':  # special case, points to the top of the page
                self.message = 'Working internal hash anchor'
                self.status = True
            else:
                anchor = anchor[1:]  # '#something' => 'something'
                # Concatenate every HTML field of the owning instance and
                # look for a matching anchor name.
                html_content = ''
                for field in instance._linklist.html_fields:
                    html_content += getattr(instance, field, '')
                try:
                    names = parse_anchors(html_content)
                    if anchor in names:
                        self.message = 'Working internal hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken internal hash anchor'
                except UnicodeDecodeError:
                    self.message = 'Failed to parse HTML for anchor'

        elif tested_url.startswith('/'):
            old_prepend_setting = settings.PREPEND_WWW
            settings.PREPEND_WWW = False
            # BUGFIX: restore PREPEND_WWW even if the HTTP request raises,
            # so a failed check no longer leaks the changed global setting.
            try:
                # A real HTTP request replaces the Django test client here.
                tested_url = 'http://' + settings.ALLOWED_HOSTS[0] + tested_url
                # NOTE(review): no timeout is passed, so a stalled server can
                # block this call indefinitely — consider adding one.
                response = requests.get(tested_url, verify=True)
                if response.status_code == 200:
                    self.message = 'Working internal link'
                    self.status = True
                    # see if the internal link points at an anchor
                    if tested_url[-1] == '#':  # special case, point to #
                        self.message = 'Working internal hash anchor'
                    elif tested_url.count('#'):
                        anchor = tested_url.split('#')[1]
                        from linkcheck import parse_anchors
                        try:
                            names = parse_anchors(str(response.content))
                            if anchor in names:
                                self.message = 'Working internal hash anchor'
                                self.status = True
                            else:
                                self.message = 'Broken internal hash anchor'
                                self.status = False
                        except UnicodeDecodeError:
                            self.message = 'Failed to parse HTML for anchor'

                elif response.status_code == 302 or response.status_code == 301:
                    # Follow the redirect to find out whether it resolves.
                    redir_response = requests.get(tested_url, allow_redirects=True)
                    if redir_response.status_code == 200:
                        redir_state = 'Working redirect'
                        self.status = True
                    else:
                        redir_state = 'Broken redirect'
                        self.status = False
                    self.message = 'This link redirects: code %d (%s)' % (
                        response.status_code, redir_state)
                else:
                    self.message = 'Broken internal link'
            finally:
                settings.PREPEND_WWW = old_prepend_setting
        else:
            self.message = 'Invalid URL'

        if USE_REVERSION:
            # using the test client will clear the RevisionContextManager stack.
            revision_context_manager.start()

        self.last_checked = now()
        self.save()
Example 6
    def check(self,
              check_internal=True,
              check_external=True,
              external_recheck_interval=EXTERNAL_RECHECK_INTERVAL):

        from linkcheck.utils import LinkCheckHandler
        external_recheck_datetime = now() - timedelta(
            minutes=external_recheck_interval)
        self.status = False

        # Remove current domain from URLs as the test client chokes when trying to test them during a page save
        # They shouldn't generally exist but occasionally slip through
        # If settings.SITE_DOMAINS isn't set then use settings.SITE_DOMAIN
        # but also check for variants: example.org, www.example.org, test.example.org

        original_url = None  # used to restore the original url afterwards

        if SITE_DOMAINS:  #if the setting is present
            internal_exceptions = SITE_DOMAINS

        else:  # try using SITE_DOMAIN
            root_domain = settings.SITE_DOMAIN
            if root_domain.startswith('www.'):
                root_domain = root_domain[4:]
            elif root_domain.startswith('test.'):
                root_domain = root_domain[5:]
            internal_exceptions = [
                'http://' + root_domain, 'http://www.' + root_domain,
                'http://test.' + root_domain
            ]

        for ex in internal_exceptions:
            if ex and self.url.startswith(ex):
                original_url = self.url
                self.url = self.url.replace(ex, '', 1)

        if check_internal and (not self.external):
            if not (self.url):
                self.message = 'Empty link'

            elif self.url.startswith('mailto:'):
                self.status = None
                self.message = 'Email link (not automatically checked)'

            elif self.url.startswith('#'):
                self.status = None
                self.message = 'Link to within the same page (not automatically checked)'

            elif self.url.startswith(MEDIA_PREFIX):
                #TODO Assumes a direct mapping from media url to local filesystem path. This will break quite easily for alternate setups
                if os.path.exists(settings.MEDIA_ROOT +
                                  self.url_unquoted()[len(MEDIA_PREFIX) - 1:]):
                    self.message = 'Working file link'
                    self.status = True
                else:
                    self.message = 'Missing Document'

            elif getattr(self, '_internal_hash', False) and getattr(
                    self, '_instance', None):
                # This is a hash link pointing to itself
                from linkcheck import parse_anchors

                hash = self._internal_hash
                instance = self._instance
                if hash == '#':  # special case, point to #
                    self.message = 'Working internal hash anchor'
                    self.status = True
                else:
                    hash = hash[1:]  #'#something' => 'something'
                    html_content = ''
                    for field in instance._linklist.html_fields:
                        html_content += getattr(instance, field, '')
                    names = parse_anchors(html_content)
                    if hash in names:
                        self.message = 'Working internal hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken internal hash anchor'

            elif self.url.startswith('/'):
                old_prepend_setting = settings.PREPEND_WWW
                settings.PREPEND_WWW = False
                c = Client()
                c.handler = LinkCheckHandler()
                response = c.get(self.url, follow=True)
                if response.status_code == 200:
                    self.message = 'Working internal link'
                    self.status = True
                    # see if the internal link points an anchor
                    if self.url[-1] == '#':  # special case, point to #
                        self.message = 'Working internal hash anchor'
                    elif self.url.count('#'):
                        anchor = self.url.split('#')[1]
                        from linkcheck import parse_anchors
                        names = parse_anchors(response.content)
                        if anchor in names:
                            self.message = 'Working internal hash anchor'
                            self.status = True
                        else:
                            self.message = 'Broken internal hash anchor'
                            self.status = False

                elif (response.status_code == 302
                      or response.status_code == 301):
                    self.status = None
                    self.message = 'This link redirects: code %d (not automatically checked)' % (
                        response.status_code, )
                else:
                    self.message = 'Broken internal link'
                settings.PREPEND_WWW = old_prepend_setting
            else:
                self.message = 'Invalid URL'

            if original_url:  # restore the original url before saving
                self.url = original_url

            self.last_checked = now()
            self.save()

        elif check_external and self.external:
            logger.info('checking external link: %s' % self.url)
            if self.last_checked and (self.last_checked >
                                      external_recheck_datetime):
                return self.status

            try:

                # Remove URL fragment identifiers
                url = self.url.rsplit('#')[0]

                if self.url.count('#'):
                    # We have to get the content so we can check the anchors
                    if TIMEOUT_SUPPORT:
                        response = urllib2.urlopen(
                            url, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                    else:
                        response = urllib2.urlopen(url)
                else:
                    # Might as well just do a HEAD request
                    req = HeadRequest(url,
                                      headers={
                                          'User-Agent':
                                          "http://%s Linkchecker" %
                                          settings.SITE_DOMAIN
                                      })
                    try:
                        if TIMEOUT_SUPPORT:
                            response = urllib2.urlopen(
                                req,
                                timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                        else:
                            response = urllib2.urlopen(req)
                    except (ValueError, urllib2.HTTPError) as error:
                        # ...except sometimes it triggers a bug in urllib2
                        if hasattr(
                                error,
                                'code') and error.code == METHOD_NOT_ALLOWED:
                            req = GetRequest(url,
                                             headers={
                                                 'User-Agent':
                                                 "http://%s Linkchecker" %
                                                 settings.SITE_DOMAIN
                                             })
                        else:
                            req = url
                        if TIMEOUT_SUPPORT:
                            response = urllib2.urlopen(
                                req,
                                timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                        else:
                            response = urllib2.urlopen(req)

                self.message = ' '.join([str(response.code), response.msg])
                self.status = True

                if self.url.count('#'):

                    anchor = self.url.split('#')[1]
                    from linkcheck import parse_anchors
                    try:
                        names = parse_anchors(response.read())
                        if anchor in names:
                            self.message = 'Working external hash anchor'
                            self.status = True
                        else:
                            self.message = 'Broken external hash anchor'
                            self.status = False

                    except:
                        # The external web page is mal-formatted #or maybe other parse errors like encoding
                        # I reckon a broken anchor on an otherwise good URL should count as a pass
                        self.message = "Page OK but anchor can't be checked"
                        self.status = True

            except BadStatusLine:
                self.message = "Bad Status Line"

            except urllib2.HTTPError, e:
                if hasattr(e, 'code') and hasattr(e, 'msg'):
                    self.message = ' '.join([str(e.code), e.msg])
                else:
                    self.message = "Unknown Error"

            except urllib2.URLError, e:
                if hasattr(e, 'reason'):
                    self.message = 'Unreachable: ' + str(e.reason)
                elif hasattr(e, 'code') and e.code != 301:
                    self.message = 'Error: ' + str(e.code)
                else:
                    self.message = 'Redirect. Check manually: ' + str(e.code)
Esempio n. 7
0
    def _check_external(self, tested_url, external_recheck_interval):
        """Check an external URL and record the outcome on this link.

        Issues a HEAD request for plain URLs (falling back to GET when the
        server answers 405, or retrying via ``requests`` with SSL
        verification disabled on certificate errors) or a full GET when the
        URL carries a ``#fragment`` so the body can be scanned for a
        matching anchor.  Updates ``self.status``, ``self.message``,
        ``self.redirect_to`` and ``self.last_checked``, then saves the
        model.

        Returns the cached ``self.status`` unchanged when the link was
        already checked within the last ``external_recheck_interval``
        minutes.
        """
        logger.info('checking external link: %s' % tested_url)
        external_recheck_datetime = now() - timedelta(
            minutes=external_recheck_interval)

        # Re-use the previous verdict while it is still fresh enough.
        if self.last_checked and (self.last_checked >
                                  external_recheck_datetime):
            return self.status

        opener = build_opener(RedirectHandler)
        # Remove URL fragment identifiers
        url = tested_url.rsplit('#')[0]
        # Check that non-ascii chars are properly encoded
        try:
            url.encode('ascii')
        except UnicodeEncodeError:
            url = iri_to_uri(url)

        try:
            if tested_url.count('#'):
                # A fragment is present: we have to fetch the content so the
                # anchors can be verified below.
                response = opener.open(
                    url, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
            else:
                # No fragment: might as well just do a HEAD request.
                req = HeadRequest(url,
                                  headers={
                                      'User-Agent':
                                      "http://%s Linkchecker" %
                                      settings.SITE_DOMAIN
                                  })
                try:
                    response = opener.open(
                        req, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                except URLError as e:
                    # When we get CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:579) error
                    # we try the link using requests, and ignore SSL verification error.
                    if hasattr(
                            e,
                            'reason') and 'certificate verify failed' in str(
                                e.reason):
                        response = requests.head(
                            url,
                            verify=False,
                            timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)
                        # Mimic the urllib response attributes read below.
                        response.code = response.status_code
                        response.msg = ''
                    else:
                        raise

                except (ValueError, HTTPError) as error:
                    # ...except sometimes it triggers a bug in urllib2.
                    # A 405 means the server rejects HEAD: retry with GET;
                    # otherwise retry with the bare URL string.
                    if hasattr(error,
                               'code') and error.code == METHOD_NOT_ALLOWED:
                        req = GetRequest(url,
                                         headers={
                                             'User-Agent':
                                             "http://%s Linkchecker" %
                                             settings.SITE_DOMAIN
                                         })
                    else:
                        req = url
                    response = opener.open(
                        req, timeout=LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT)

            self.message = ' '.join([str(response.code), response.msg])
            self.status = True

            if tested_url.count('#'):

                anchor = tested_url.split('#')[1]
                from linkcheck import parse_anchors
                try:
                    names = parse_anchors(response.read())
                    if anchor in names:
                        self.message = 'Working external hash anchor'
                        self.status = True
                    else:
                        self.message = 'Broken external hash anchor'
                        self.status = False

                except Exception:
                    # The external web page is mal-formatted #or maybe other parse errors like encoding
                    # I reckon a broken anchor on an otherwise good URL should count as a pass
                    # (narrowed from a bare ``except`` so KeyboardInterrupt
                    # and SystemExit are not swallowed).
                    self.message = "Page OK but anchor can't be checked"
                    self.status = True

        except http_client.BadStatusLine:
            self.message = "Bad Status Line"

        except HTTPError as e:
            if hasattr(e, 'code') and hasattr(e, 'msg'):
                self.message = ' '.join([str(e.code), e.msg])
            else:
                self.message = "Unknown Error"

        except URLError as e:
            if hasattr(e, 'reason'):
                self.message = 'Unreachable: ' + str(e.reason)
            elif hasattr(e, 'code') and e.code != 301:
                self.message = 'Error: ' + str(e.code)
            else:
                # Permanent redirect -- or an error carrying neither
                # ``reason`` nor ``code``; getattr guards against an
                # AttributeError in that case.
                self.message = 'Redirect. Check manually: ' + str(
                    getattr(e, 'code', 'unknown'))
        except Exception as e:
            self.message = 'Other Error: %s' % e
        else:
            # Record (or clear) the target of a permanent redirect.
            if getattr(response, 'getcode', False) and response.getcode(
            ) == 301 and response.geturl() != url:
                self.redirect_to = response.geturl()
            elif self.redirect_to:
                self.redirect_to = ''

        self.last_checked = now()
        self.save()
Esempio n. 8
0
    def check(self, check_internal=True, check_external=True, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL):
        """Check this link and record the result on the model.

        Internal links are exercised through Django's test client; external
        links are fetched with urllib2 (HEAD where possible).  Sets
        ``self.status`` (True / False / None for "not automatically
        checked"), ``self.message`` and -- for internal links --
        ``self.last_checked`` before saving.  External links checked within
        the last ``external_recheck_interval`` minutes return the cached
        status.
        """

        from linkcheck.utils import LinkCheckHandler
        external_recheck_datetime = datetime.now() - timedelta(minutes=external_recheck_interval)
        self.status  = False

        # Remove current domain from URLs as the test client chokes when trying to test them during a page save
        # They shouldn't generally exist but occasionally slip through
        # If settings.SITE_DOMAINS isn't set then use settings.SITE_DOMAIN
        # but also check for variants: example.org, www.example.org, test.example.org

        original_url = None # used to restore the original url afterwards

        if SITE_DOMAINS: #if the setting is present
            internal_exceptions = SITE_DOMAINS

        else: # try using SITE_DOMAIN
            root_domain = settings.SITE_DOMAIN
            if root_domain.startswith('www.'):
                root_domain = root_domain[4:]
            elif root_domain.startswith('test.'):
                root_domain = root_domain[5:]
            internal_exceptions = ['http://'+root_domain, 'http://www.'+root_domain, 'http://test.'+root_domain]

        # Strip a matching absolute prefix so the URL is tested as internal.
        for ex in internal_exceptions:
            if ex and self.url.startswith(ex):
                original_url = self.url
                self.url = self.url.replace(ex, '', 1)

        if check_internal and (not self.external):
            if not(self.url):
                self.message = 'Empty link'

            elif self.url.startswith('mailto:'):
                self.status = None
                self.message = 'Email link (not automatically checked)'

            elif self.url.startswith('#'):
                self.status = None
                self.message = 'Link to within the same page (not automatically checked)'

            elif self.url.startswith(MEDIA_PREFIX):
                #TODO Assumes a direct mapping from media url to local filesystem path. This will break quite easily for alternate setups
                if os.path.exists(settings.MEDIA_ROOT + self.url_unquoted()[len(MEDIA_PREFIX)-1:]):
                    self.message = 'Working file link'
                    self.status = True
                else:
                    self.message = 'Missing Document'

            elif self.url.startswith('/'):
                # Exercise the path through Django's test client.
                # PREPEND_WWW is disabled temporarily so the client does not
                # redirect, then restored below.
                old_prepend_setting = settings.PREPEND_WWW
                settings.PREPEND_WWW = False
                c = Client()
                c.handler = LinkCheckHandler()
                response = c.get(self.url, follow=True)
                if response.status_code == 200:
                    self.message = 'Working internal link'
                    self.status = True
                    # see if the internal link points an anchor
                    if self.url[-1] == '#': # special case, point to #
                        self.message = 'Working internal hash anchor'
                    elif self.url.count('#'):
                        anchor = self.url.split('#')[1]
                        from linkcheck import parse_anchors
                        names = parse_anchors(response.content)
                        if anchor in names:
                            self.message = 'Working internal hash anchor'
                            self.status = True
                        else:
                            self.message = 'Broken internal hash anchor'
                            self.status = False

                elif (response.status_code == 302 or response.status_code == 301):
                    self.status = None
                    self.message = 'This link redirects: code %d (not automatically checked)' % (response.status_code, )
                else:
                    self.message = 'Broken internal link'
                settings.PREPEND_WWW = old_prepend_setting
            else:
                self.message = 'Invalid URL'

            if original_url: # restore the original url before saving
                self.url = original_url

            self.last_checked  = datetime.now()
            self.save()

        elif check_external and self.external:

            # Re-use a recent verdict instead of re-fetching the URL.
            if self.last_checked and (self.last_checked > external_recheck_datetime):
                return self.status

            # NOTE(review): unlike the internal branch, this branch never
            # updates last_checked or calls save() -- confirm intentional.
            try:

                # Remove URL fragment identifiers
                url = self.url.rsplit('#')[0]

                if self.url.count('#'):
                    # We have to get the content so we can check the anchors
                    response = urllib2.urlopen(url)
                else:
                    # Might as well just do a HEAD request
                    req = HeadRequest(url, headers={'User-Agent' : "http://%s Linkchecker" % settings.SITE_DOMAIN})
                    try:
                        response = urllib2.urlopen(req)
                    except ValueError:
                        # ...except sometimes it triggers a bug in urllib2
                        response = urllib2.urlopen(url)

                self.message = ' '.join([str(response.code), response.msg])
                self.status = True

                if self.url.count('#'):

                    anchor = self.url.split('#')[1]
                    from linkcheck import parse_anchors
                    try:
                        names = parse_anchors(response.read())
                        if anchor in names:
                            self.message = 'Working external hash anchor'
                            self.status = True
                        else:
                            self.message = 'Broken external hash anchor'
                            self.status = False

                    except HTMLParseError:
                        # The external web page is mal-formatted
                        # I reckon a broken anchor on an otherwise good URL should count as a pass
                        self.message = "Page OK but anchor can't be checked"
                        self.status = True

            except BadStatusLine:
                    self.message = "Bad Status Line"

            except urllib2.HTTPError, e:
                if hasattr(e, 'code') and hasattr(e, 'msg'):
                    self.message = ' '.join([str(e.code), e.msg])
                else:
                    self.message = "Unknown Error"

            except urllib2.URLError, e:
                if hasattr(e, 'reason'):
                    self.message = 'Unreachable: '+str(e.reason)
                elif hasattr(e, 'code') and e.code!=301:
                    self.message = 'Error: '+str(e.code)
                else:
                    self.message = 'Redirect. Check manually: '+str(e.code)
Esempio n. 9
0
    def check(self, recheck_interval=RECHECK_INTERVAL):
        """Check this link and record the result on the model.

        Handles empty / mailto / same-page / media-file links and
        self-referencing hash anchors locally; everything else (including
        site-relative paths, which get the site domain prepended) is fetched
        over HTTP with urllib2.  Sets ``self.status`` (True / False / None
        for "not automatically checked") and ``self.message``.
        """
        from linkcheck.utils import LinkCheckHandler
        external_recheck_datetime = datetime.now() - timedelta(minutes=recheck_interval)
        self.status = False

        original_url = None # used to restore the original url afterwards

        if not(self.url):
            self.status = True
            self.message = 'Empty link'

        elif self.url.startswith('mailto:'):
            self.status = None
            self.message = 'Email link (not automatically checked)'

        elif self.url.startswith('#'):
            self.status = None
            self.message = 'Link to within the same page (not automatically checked)'

        elif self.url.startswith(MEDIA_PREFIX):
            #TODO Assumes a direct mapping from media url to local filesystem path. This will break quite easily for alternate setups
            if os.path.exists(settings.MEDIA_ROOT + self.url_unquoted()[len(MEDIA_PREFIX)-1:]):
                self.message = 'Working file link'
                self.status = True
            else:
                self.message = 'Missing Document'

        elif getattr(self, '_internal_hash', False) and getattr(self, '_instance', None):
            # This is a hash link pointing to itself
            from linkcheck import parse_anchors

            hash = self._internal_hash
            instance = self._instance
            if hash == '#': # special case, point to #
                self.message = 'Working internal hash anchor'
                self.status = True
            else:
                hash = hash[1:] #'#something' => 'something'
                html_content = ''
                # Concatenate every html field declared on the linked model
                # and look the anchor up in the combined markup.
                for field in instance._linklist.html_fields:
                    html_content += getattr(instance, field, '')
                names = parse_anchors(html_content)
                if hash in names:
                    self.message = 'Working internal hash anchor'
                    self.status = True
                else:
                    self.message = 'Broken internal hash anchor'
                    # NOTE(review): the two statements below look like
                    # misplaced external-recheck logic (possibly a bad
                    # merge) -- they run only for broken anchors and return
                    # the cached status.  Confirm whether they belong here.
                    logger.info('checking external link: %s' % self.url)
                    if self.last_checked and (self.last_checked > external_recheck_datetime):
                        return self.status

        else:
          if self.url.startswith("/"):
              # append site_domain to path
              root_domain = settings.SITE_DOMAIN
              self.url = "http://%s%s" % (root_domain, self.url)

          try:
              # Remove URL fragment identifiers
              url = self.url.rsplit('#')[0]

              if self.url.count('#'):
                  # We have to get the content so we can check the anchors
                  if TIMEOUT:
                      response = urllib2.urlopen(url, timeout=TIMEOUT)
                  else:
                      response = urllib2.urlopen(url)
              else:
                  # Might as well just do a HEAD request
                  req = HeadRequest(url, headers={'User-Agent' : "http://%s Linkchecker" % settings.SITE_DOMAIN})
                  try:
                      if TIMEOUT:
                          response = urllib2.urlopen(req, timeout=TIMEOUT)
                      else:
                          response = urllib2.urlopen(req)
                  except:
                      # ...except sometimes it triggers a bug in urllib2
                      # NOTE(review): bare except retries with a plain GET on
                      # *any* failure, including KeyboardInterrupt -- confirm.
                      if TIMEOUT:
                          response = urllib2.urlopen(url, timeout=TIMEOUT)
                      else:
                          response = urllib2.urlopen(url)

              self.message = ' '.join([str(response.code), response.msg])
              self.status = True

              if self.url.count('#'):

                  anchor = self.url.split('#')[1]
                  from linkcheck import parse_anchors
                  try:
                      names = parse_anchors(response.read())
                      if anchor in names:
                          self.message = 'Working hash anchor'
                          self.status = True
                      else:
                          self.message = 'Broken hash anchor'
                          self.status = False

                  except:
                      # The external web page is mal-formatted #or maybe other parse errors like encoding
                      # I reckon a broken anchor on an otherwise good URL should count as a pass
                      self.message = "Page OK but anchor can't be checked"
                      self.status = True

          except BadStatusLine:
                  self.message = "Bad Status Line"

          except urllib2.HTTPError, e:
              if hasattr(e, 'code') and hasattr(e, 'msg'):
                  self.message = ' '.join([str(e.code), e.msg])
              else:
                  self.message = "Unknown Error"

          except urllib2.URLError, e:
              if hasattr(e, 'reason'):
                  self.message = 'Unreachable: '+str(e.reason)
              elif hasattr(e, 'code') and e.code!=301:
                  self.message = 'Error: '+str(e.code)
              else:
                  self.message = 'Redirect. Check manually: '+str(e.code)