Beispiel #1
0
    def load(self, url, offset=0, length=-1):
        """Return a file-like object over the contents of ``url``.

        The contents are looked up in the Django cache first. On a cache
        miss they are fetched from the LOCKSS mirrors (only when
        ``settings.USE_LOCKSS_REPLAY`` is truthy), falling back to the parent
        loader's ``load(url).read()``, and are then cached for 60 seconds.

        As a side effect, ``thread_local_data.wbrequest.mirror_name`` is set
        to the name of the mirror that served the contents, or ``''`` when
        they did not come from a mirror (or the mirror name is unknown).

        :param url: location of the file to load.
        :param offset: position to seek to before returning; only applied
            when greater than 0.
        :param length: when >= 0, the result is wrapped in
            ``LimitReader(afile, length)``; otherwise the whole buffer is
            returned.
        :return: a ``StringIO`` over the contents, or a ``LimitReader``
            wrapping it.
        """

        # first try to fetch url contents from cache
        cache_key = Link.get_warc_cache_key(url.split(settings.MEDIA_ROOT, 1)[-1])

        mirror_name_cache_key = cache_key + '-mirror-name'
        mirror_name = ''

        file_contents = django_cache.get(cache_key)

        if file_contents is None:
            # url wasn't in cache -- load contents

            with opbeat_trace('file-loader-cache-miss'):

                # try fetching from each mirror in the LOCKSS network, in random order
                if settings.USE_LOCKSS_REPLAY:

                    mirrors = Mirror.get_cached_mirrors()
                    random.shuffle(mirrors)

                    for mirror in mirrors:
                        lockss_key = url.replace('file://', '').replace(WARC_STORAGE_PATH, 'https://' + settings.HOST + '/lockss/fetch')
                        lockss_url = urljoin(mirror['content_url'], 'ServeContent')
                        logging.info("Fetching from %s?url=%s", lockss_url, lockss_key)

                        # NOTE(review): no timeout is passed to requests.get, so a
                        # hung mirror can block this request -- consider adding one.
                        try:
                            response = requests.get(lockss_url, params={'url': lockss_key})
                        except (requests.ConnectionError, requests.Timeout) as e:
                            logging.info("Couldn't get from lockss: %s", e)
                            continue

                        # Explicit check instead of `assert`: asserts are stripped
                        # under `python -O`, which would let an error response be
                        # accepted (and cached) as file contents.
                        if not response.ok:
                            logging.info("Couldn't get from lockss: HTTP %s", response.status_code)
                            continue

                        file_contents = response.content
                        mirror_name = mirror['name']
                        logging.info("Got content from lockss")
                        # One good copy is enough -- don't query the remaining mirrors.
                        break

                # If url wasn't in LOCKSS yet or LOCKSS is disabled, fetch from local storage using super()
                if file_contents is None:
                    file_contents = super(CachedLoader, self).load(url).read()
                    logging.info("Got content from local disk")

                # cache file contents
                # use a short timeout so large warcs don't evict everything else in the cache
                django_cache.set(cache_key, file_contents, timeout=60)
                django_cache.set(mirror_name_cache_key, mirror_name, timeout=60)

        else:
            # The name is stored under its own key, so it can be absent even
            # though the contents were found; fall back to '' (the same value
            # used when the contents did not come from a mirror).
            mirror_name = django_cache.get(mirror_name_cache_key, '')

        # set wbrequest.mirror_name so it can be displayed in template later
        thread_local_data.wbrequest.mirror_name = mirror_name

        # turn string contents of url into file-like object
        afile = StringIO.StringIO(file_contents)

        # --- from here down is taken from super() ---
        if offset > 0:
            afile.seek(offset)

        if length >= 0:
            return LimitReader(afile, length)
        else:
            return afile
Beispiel #2
0
    def load(self, url, offset=0, length=-1):
        """Return a file-like object over the contents of ``url``.

        The contents are looked up in the Django cache first. On a cache
        miss they are fetched from the LOCKSS mirrors (only when
        ``settings.USE_LOCKSS_REPLAY`` is truthy), falling back to the parent
        loader's ``load(url).read()``, and are then cached for 60 seconds.

        As a side effect, ``thread_local_data.wbrequest.mirror_name`` is set
        to the name of the mirror that served the contents, or ``''`` when
        they did not come from a mirror (or the mirror name is unknown).

        :param url: location of the file to load.
        :param offset: position to seek to before returning; only applied
            when greater than 0.
        :param length: when >= 0, the result is wrapped in
            ``LimitReader(afile, length)``; otherwise the whole buffer is
            returned.
        :return: a ``StringIO`` over the contents, or a ``LimitReader``
            wrapping it.
        """

        # first try to fetch url contents from cache
        cache_key = Link.get_warc_cache_key(
            url.split(settings.MEDIA_ROOT, 1)[-1])

        mirror_name_cache_key = cache_key + '-mirror-name'
        mirror_name = ''

        file_contents = django_cache.get(cache_key)

        if file_contents is None:
            # url wasn't in cache -- load contents

            with opbeat_trace('file-loader-cache-miss'):

                # try fetching from each mirror in the LOCKSS network, in random order
                if settings.USE_LOCKSS_REPLAY:

                    mirrors = Mirror.get_cached_mirrors()
                    random.shuffle(mirrors)

                    for mirror in mirrors:
                        lockss_key = url.replace('file://', '').replace(
                            WARC_STORAGE_PATH,
                            'https://' + settings.HOST + '/lockss/fetch')
                        lockss_url = urljoin(mirror['content_url'],
                                             'ServeContent')
                        logging.info("Fetching from %s?url=%s",
                                     lockss_url, lockss_key)

                        # NOTE(review): no timeout is passed to requests.get, so a
                        # hung mirror can block this request -- consider adding one.
                        try:
                            response = requests.get(lockss_url,
                                                    params={'url': lockss_key})
                        except (requests.ConnectionError,
                                requests.Timeout) as e:
                            logging.info("Couldn't get from lockss: %s", e)
                            continue

                        # Explicit check instead of `assert`: asserts are stripped
                        # under `python -O`, which would let an error response be
                        # accepted (and cached) as file contents.
                        if not response.ok:
                            logging.info("Couldn't get from lockss: HTTP %s",
                                         response.status_code)
                            continue

                        file_contents = response.content
                        mirror_name = mirror['name']
                        logging.info("Got content from lockss")
                        # One good copy is enough -- don't query the remaining mirrors.
                        break

                # If url wasn't in LOCKSS yet or LOCKSS is disabled, fetch from local storage using super()
                if file_contents is None:
                    file_contents = super(CachedLoader, self).load(url).read()
                    logging.debug("Got content from local disk")

                # cache file contents
                # use a short timeout so large warcs don't evict everything else in the cache
                django_cache.set(cache_key, file_contents, timeout=60)
                django_cache.set(mirror_name_cache_key,
                                 mirror_name,
                                 timeout=60)

        else:
            # The name is stored under its own key, so it can be absent even
            # though the contents were found; fall back to '' (the same value
            # used when the contents did not come from a mirror).
            mirror_name = django_cache.get(mirror_name_cache_key, '')

        # set wbrequest.mirror_name so it can be displayed in template later
        thread_local_data.wbrequest.mirror_name = mirror_name

        # turn string contents of url into file-like object
        afile = StringIO.StringIO(file_contents)

        # --- from here down is taken from super() ---
        if offset > 0:
            afile.seek(offset)

        if length >= 0:
            return LimitReader(afile, length)
        else:
            return afile
Beispiel #3
0
    def apply_filters(self, wbrequest, matcher):
        """Look up the CDX lines for the GUID in the URL and attach them to ``wbrequest``.

        The GUID is taken from the first group of ``matcher``. A per-GUID
        lookup (urlkey -> list of raw CDX lines, plus an ``'is_private'``
        flag) is read from the Django cache, or rebuilt from the database and
        cached when it is missing.

        For a request without ``wbrequest.wb_url`` only ``custom_params['guid']``
        and ``custom_params['url']`` are set and the method returns early.
        Otherwise ``custom_params['lines']`` and ``custom_params['guid']`` are
        set and the replay timestamp is taken from the first matching line.

        Raises ``CustomTemplateException`` when a private link is requested
        without a cookie named after the GUID; calls ``raise_not_found`` when
        the link, a valid access token, or the requested URL is missing.
        """
        guid = matcher.group(1)
        cache_key = Link.get_cdx_cache_key(guid)
        cdx_index = django_cache.get(cache_key)

        # Bare requests (no wb_url) always go to the database because they
        # need the link's URL, which is not part of the cached lookup.
        needs_db_lookup = cdx_index is None or not wbrequest.wb_url
        if needs_db_lookup:
            with opbeat_trace('cdx-cache-miss'), close_database_connection():
                try:
                    # This will filter out links that have user_deleted=True
                    link = Link.objects.get(guid=guid)
                except Link.DoesNotExist:
                    raise_not_found(wbrequest.wb_url)

                if not wbrequest.wb_url:
                    # Bare request to /warc/1234-5678/ -- hand the GUID and URL
                    # back so PermaGUIDHandler can forward to the submitted URL.
                    params = wbrequest.custom_params
                    params['guid'] = guid
                    params['url'] = link.safe_url
                    return

                # Legacy archives didn't generate CDXLines during capture, so
                # create them on demand when none exist for this link yet.
                stored_lines = CDXLine.objects.filter(link_id=link.guid)
                if not stored_lines:
                    stored_lines = CDXLine.objects.create_all_from_link(link)

                # Index every raw line by urlkey, e.g.
                # {'urlkey1': ['raw1', 'raw2'], 'urlkey2': ['raw3', 'raw4']}
                cdx_index = defaultdict(list)
                for cdx_line in stored_lines:
                    cdx_index[cdx_line.urlkey].append(str(cdx_line.raw))

                # Where a URL has both redirect and non-redirect captures, keep
                # only the non-redirects to prevent redirect loops.
                is_redirect = re.compile(r' 30[1-7] ').search
                for key, raw_lines in cdx_index.iteritems():
                    if len(raw_lines) <= 1:
                        continue
                    kept = [raw for raw in raw_lines if not is_redirect(raw)]
                    if kept:
                        cdx_index[key] = kept

                # Remember privacy alongside the lines so permissions can be
                # enforced on cache hits too.
                cdx_index['is_private'] = link.is_private

                django_cache.set(cache_key, cdx_index)

        # enforce permissions
        if cdx_index.get('is_private'):
            # A user allowed to see this private link carries a cookie like
            # GUID=<token>, checked via link.validate_access_token().
            # NOTE(review): SimpleCookie may raise on a malformed Cookie
            # header; that case is not handled here -- confirm intended.
            jar = Cookie.SimpleCookie(wbrequest.env.get('HTTP_COOKIE'))
            access_cookie = jar.get(guid)
            if not access_cookie:
                raise CustomTemplateException(
                    status='400 Bad Request',
                    template_path='archive/missing-cookie.html',
                    template_kwargs={'content_host': settings.WARC_HOST})
            token_ok = Link(pk=guid).validate_access_token(access_cookie.value, 3600)
            if not token_ok:
                raise_not_found(wbrequest.wb_url)

        # check whether archive contains the requested URL
        requested_key = surt(wbrequest.wb_url.url)
        matching_lines = cdx_index.get(requested_key)
        if not matching_lines:
            raise_not_found(wbrequest.wb_url)

        # Stash the lines for PermaCDXSource so the DB isn't hit again.
        wbrequest.custom_params['lines'] = matching_lines
        wbrequest.custom_params['guid'] = guid

        # Adds the Memento-Datetime header. Normally done in
        # MementoReqMixin#_parse_extra, but the GUID needed for the lookup
        # isn't parsed from the URL until this point.
        replay_timestamp = CDXLine(raw=matching_lines[0]).timestamp
        wbrequest.wb_url.set_replay_timestamp(replay_timestamp)
Beispiel #4
0
    def apply_filters(self, wbrequest, matcher):
        """Look up the CDX lines for the GUID in the URL and attach them to ``wbrequest``.

        The GUID is taken from the first group of ``matcher``. A per-GUID
        lookup (urlkey -> list of raw CDX lines, plus an ``'is_private'``
        flag) is read from the Django cache, or rebuilt from the database and
        cached when it is missing.

        For a request without ``wbrequest.wb_url`` only ``custom_params['guid']``
        and ``custom_params['url']`` are set and the method returns early.
        Otherwise ``custom_params['lines']`` and ``custom_params['guid']`` are
        set and the replay timestamp is taken from the first matching line.

        Raises ``CustomTemplateException`` when a private link is requested
        without a cookie named after the GUID; calls ``raise_not_found`` when
        the link, a valid access token, or the requested URL is missing.
        """
        guid = matcher.group(1)
        cache_key = Link.get_cdx_cache_key(guid)
        cdx_index = django_cache.get(cache_key)

        # Bare requests (no wb_url) always go to the database because they
        # need the link's URL, which is not part of the cached lookup.
        needs_db_lookup = cdx_index is None or not wbrequest.wb_url
        if needs_db_lookup:
            with opbeat_trace('cdx-cache-miss'):
                try:
                    # This will filter out links that have user_deleted=True
                    link = Link.objects.get(guid=guid)
                except Link.DoesNotExist:
                    raise_not_found(wbrequest.wb_url)

                if not wbrequest.wb_url:
                    # Bare request to /warc/1234-5678/ -- hand the GUID and URL
                    # back so PermaGUIDHandler can forward to the submitted URL.
                    params = wbrequest.custom_params
                    params['guid'] = guid
                    params['url'] = link.ascii_safe_url
                    return

                # Legacy archives didn't generate CDXLines during capture, so
                # create them on demand when none exist for this link yet.
                stored_lines = CDXLine.objects.filter(link_id=link.guid)
                if not stored_lines:
                    stored_lines = CDXLine.objects.create_all_from_link(link)

                # Index every raw line by urlkey, e.g.
                # {'urlkey1': ['raw1', 'raw2'], 'urlkey2': ['raw3', 'raw4']}
                cdx_index = defaultdict(list)
                for cdx_line in stored_lines:
                    cdx_index[cdx_line.urlkey].append(str(cdx_line.raw))

                # Where a URL has both redirect and non-redirect captures, keep
                # only the non-redirects to prevent redirect loops.
                is_redirect = re.compile(r' 30[1-7] ').search
                for key, raw_lines in cdx_index.iteritems():
                    if len(raw_lines) <= 1:
                        continue
                    kept = [raw for raw in raw_lines if not is_redirect(raw)]
                    if kept:
                        cdx_index[key] = kept

                # Remember privacy alongside the lines so permissions can be
                # enforced on cache hits too.
                cdx_index['is_private'] = link.is_private

                django_cache.set(cache_key, cdx_index)

        # enforce permissions
        if cdx_index.get('is_private'):
            # A user allowed to see this private link carries a cookie like
            # GUID=<token>, checked via link.validate_access_token().
            # NOTE(review): SimpleCookie may raise on a malformed Cookie
            # header; that case is not handled here -- confirm intended.
            jar = Cookie.SimpleCookie(wbrequest.env.get('HTTP_COOKIE'))
            access_cookie = jar.get(guid)
            if not access_cookie:
                raise CustomTemplateException(
                    status='400 Bad Request',
                    template_path='archive/missing-cookie.html',
                    template_kwargs={'content_host': settings.WARC_HOST})
            token_ok = Link(pk=guid).validate_access_token(access_cookie.value, 3600)
            if not token_ok:
                raise_not_found(wbrequest.wb_url)

        # check whether archive contains the requested URL
        requested_key = surt(wbrequest.wb_url.url)
        matching_lines = cdx_index.get(requested_key)
        if not matching_lines:
            raise_not_found(wbrequest.wb_url)

        # Stash the lines for PermaCDXSource so the DB isn't hit again.
        wbrequest.custom_params['lines'] = matching_lines
        wbrequest.custom_params['guid'] = guid

        # Adds the Memento-Datetime header. Normally done in
        # MementoReqMixin#_parse_extra, but the GUID needed for the lookup
        # isn't parsed from the URL until this point.
        replay_timestamp = CDXLine(raw=matching_lines[0]).timestamp
        wbrequest.wb_url.set_replay_timestamp(replay_timestamp)