Beispiel #1
0
def end_request(event):
    """Record the status of a finished request and register its links.

    ``event`` must expose the finished request as ``event.request``.  The
    handler returns early, doing nothing, when any of these holds:

    * the user agent is ``'Bobo'`` (internal request);
    * the request does not provide ``ILayer``;
    * no site is annotated on the request under ``TRAVERSAL_KEY``;
    * the ``portal_linkcheck`` tool is missing or not available;
    * the response has a Content-Type that is not ``text/html``;
    * ``ACTUAL_URL`` does not start with the site URL.

    Otherwise the response status is passed to ``tool.update`` for the
    site-relative path.  For status 200 responses (except the link check
    control panel) the body is decoded, its links are collected via
    ``iter_links`` and handed to ``tool.register``, and the transaction
    is committed.  A ``ConflictError`` on commit aborts the transaction
    and is otherwise ignored.
    """
    request = event.request

    # Ignore internal requests.
    if request.get('HTTP_USER_AGENT') == 'Bobo':
        return

    # Must have add-on browser layer.
    if not ILayer.providedBy(request):
        return

    try:
        site = IAnnotations(request)[TRAVERSAL_KEY]
    except KeyError:
        return

    try:
        tool = getToolByName(site, 'portal_linkcheck')
    except AttributeError as exc:
        logger.warning("Did not find tool: %s.", exc)
        return

    # Must be HTML.  A response without any Content-Type is let through.
    response = request.response
    content_type = response.getHeader('Content-Type')
    if content_type and not content_type.startswith('text/html'):
        return

    # Update the status of the present request.
    status = response.getStatus()

    if not tool.is_available():
        logger.warning("Tool not available; please run update step.")
        return

    # Compute path given the actual URL, relative to the site root.
    base_url = site.absolute_url()
    actual_url = request.get('ACTUAL_URL', '')
    if not actual_url.startswith(base_url):
        return

    path = actual_url[len(base_url):]
    tool.update(path, status)

    # Must be good response.
    if status != 200:
        return

    # Skip control panel view.
    if '@@linkcheck-controlpanel' in request['PATH_INFO']:
        return

    # Take the charset from the Content-Type parameters.  Fall back to
    # latin-1 when the header or its charset parameter is missing, so
    # that a bare "text/html" is never mistaken for a codec name.
    encoding = "latin-1"
    header = response.headers.get('content-type') or ''
    for param in header.split(';')[1:]:
        name, _, value = param.partition('=')
        if name.strip().lower() == 'charset':
            encoding = value.strip().strip('"\'') or encoding
            break

    body = response.body
    if response.headers.get('content-encoding') == 'gzip':
        try:
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
        except Exception as exc:
            # Best effort: a body that cannot be unpacked is skipped.
            logger.warning(exc)
            return

    try:
        document = body.decode(encoding, 'ignore')
    except (LookupError, UnicodeDecodeError) as exc:
        # LookupError: the response declared a charset Python does not know.
        logger.warning(exc)
        return

    hrefs = set()
    for href in iter_links(document):
        # Ignore anchors and javascript.
        if href.startswith(('#', 'javascript:')):
            continue

        # Internal URLs are stored site-relative.  Require a path
        # boundary after the site URL so that a sibling whose URL merely
        # shares the prefix (".../site-old") is not treated as internal.
        if href.startswith(base_url):
            remainder = href[len(base_url):]
            if not remainder or remainder.startswith('/'):
                href = "/" + remainder[1:].rstrip("/")

        # Add trailing slash to bare domain.
        if href.startswith(('http://', 'https://')) and href.count('/') == 2:
            href = href.rstrip('/') + '/'

        hrefs.add(href)

    # We want all the hyperlinks in the document to be checked unless
    # it's already in the queue or it has been checked recently.
    one_day_ago = datetime.datetime.now() - datetime.timedelta(days=1)
    yesterday = int(time.mktime(one_day_ago.timetuple()))

    # The referer is simply the actual URL; HTTP_HOST and PATH_INFO give
    # wrong URLs under virtual hosting.
    tool.register(hrefs, actual_url, yesterday)

    # We always commit the transaction; if no changes were made, this
    # is a NOOP. Note that conflict errors are possible with
    # concurrent requests. We ignore them.
    try:
        transaction.commit()
    except ConflictError:
        transaction.abort()
Beispiel #2
0
def end_request(event):
    """Record the status of a finished request and register its links.

    ``event`` must expose the finished request as ``event.request``.  The
    handler returns early, doing nothing, when any of these holds:

    * the request path contains the link check control panel view;
    * the user agent is ``'Bobo'`` (internal request);
    * the request does not provide ``ILayer``;
    * no site is annotated on the request under ``TRAVERSAL_KEY``;
    * the ``portal_linkcheck`` tool is missing or not available;
    * the ``check_on_request`` registry setting is false;
    * the response has a Content-Type that is not ``text/html``;
    * ``ACTUAL_URL`` does not start with the site URL.

    Otherwise the response status is passed to ``tool.update`` for the
    site-relative path.  For status 200 responses with a non-empty body,
    the body is decoded, its links are collected via ``iter_links`` and
    handed to ``tool.register``, and the transaction is committed.  A
    ``ConflictError`` on commit aborts the transaction and is otherwise
    ignored.
    """
    request = event.request

    # Skip control panel view.
    if '@@linkcheck-controlpanel' in request['PATH_INFO']:
        return

    # Ignore internal requests.
    if request.get('HTTP_USER_AGENT') == 'Bobo':
        return

    # Must have add-on browser layer.
    if not ILayer.providedBy(request):
        return

    try:
        site = IAnnotations(request)[TRAVERSAL_KEY]
    except KeyError:
        return

    try:
        tool = getToolByName(site, 'portal_linkcheck')
    except AttributeError as exc:
        logger.warning("Did not find tool: %s.", exc)
        return

    # No processing if 'check_on_request' setting is false
    registry = getUtility(IRegistry, context=site)
    settings = registry.forInterface(ISettings)
    if not settings.check_on_request:
        return

    # Must be HTML.  A response without any Content-Type is let through.
    response = request.response
    content_type = response.getHeader('Content-Type')
    if content_type and not content_type.startswith('text/html'):
        return

    # Update the status of the present request.
    status = response.getStatus()

    if not tool.is_available():
        logger.warning("Tool not available; please run update step.")
        return

    # Compute path given the actual URL, relative to the site root.
    base_url = site.absolute_url()
    actual_url = request.get('ACTUAL_URL', '')
    if not actual_url.startswith(base_url):
        return

    path = actual_url[len(base_url):]

    tool.update(path, status)

    # Must be good response.
    if status != 200:
        return

    # Take the charset from the Content-Type parameters.  Fall back to
    # latin-1 when the header or its charset parameter is missing, so
    # that a bare "text/html" is never mistaken for a codec name.
    encoding = "latin-1"
    header = response.headers.get('content-type') or ''
    for param in header.split(';')[1:]:
        name, _, value = param.partition('=')
        if name.strip().lower() == 'charset':
            encoding = value.strip().strip('"\'') or encoding
            break

    body = response.body
    if not body:
        return

    if response.headers.get('content-encoding') == 'gzip':
        try:
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
        except Exception as exc:
            # Best effort: a body that cannot be unpacked is skipped.
            logger.warning(exc)
            return

    try:
        document = body.decode(encoding, 'ignore')
    except (LookupError, UnicodeDecodeError) as exc:
        # LookupError: the response declared a charset Python does not know.
        logger.warning(exc)
        return

    hrefs = set()

    for href in iter_links(document):

        # Ignore anchors, javascript and mailto links.
        if href.startswith(('#', 'javascript:', 'mailto:')):
            continue

        # Handle relative URLs by appending them to the actual URL.
        # NOTE(review): this is plain concatenation, not RFC 3986
        # resolution ("../x" is not collapsed), and scheme-only links
        # such as "tel:..." also take this branch -- confirm intended.
        if href.startswith('.') or (not href.startswith('/')
                                    and '://' not in href):
            href = '/'.join((actual_url, href))

        # Internal URLs are stored site-relative.  Require a path
        # boundary after the site URL so that a sibling whose URL merely
        # shares the prefix (".../site-old") is not treated as internal.
        if href.startswith(base_url):
            remainder = href[len(base_url):]
            if not remainder or remainder.startswith('/'):
                href = "/" + remainder[1:].rstrip("/")

        # Add trailing slash to bare domain.
        if href.startswith(('http://', 'https://')) and href.count('/') == 2:
            href = href.rstrip('/') + '/'

        hrefs.add(href)

    # We want all the hyperlinks in the document to be checked unless
    # it's already in the queue or it has been checked recently.
    one_day_ago = datetime.datetime.now() - datetime.timedelta(days=1)
    yesterday = int(time.mktime(one_day_ago.timetuple()))

    # The referer is simply the actual URL; HTTP_HOST and PATH_INFO give
    # wrong URLs under virtual hosting.
    tool.register(hrefs, actual_url, yesterday)

    # We always commit the transaction; if no changes were made, this
    # is a NOOP. Note that conflict errors are possible with
    # concurrent requests. We ignore them.
    try:
        transaction.commit()
    except ConflictError:
        transaction.abort()