Exemple #1
0
    def handle_timegate(self, params, timestamp):
        """Build a single cdx entry for ``params['url']`` via a timegate HEAD request.

        The timegate url is formatted from ``self.timegate_url`` and requested
        with ``self.sesh.head``. When the response is truthy and carries a
        ``Memento-Datetime`` header, the url, timestamp and load url are
        replaced with the values extracted from its ``Location`` (status >= 300)
        or ``Content-Location`` header.

        :param dict params: query params; ``params['url']`` is required
        :param timestamp: the requested timestamp, formatted into the timegate url
        :return: an iterator over exactly one cdx object
        :raises NotFoundException: if the request fails, the status is >= 400,
            or the location header can not be parsed
        """
        url = params['url']
        load_url = self.timegate_url.format(url=url, timestamp=timestamp)

        res = None
        try:
            headers = self._get_headers(params)
            res = self.sesh.head(load_url, headers=headers)
        except Exception:
            no_except_close(res)
            raise NotFoundException(url)

        if res and res.headers.get('Memento-Datetime'):
            if res.status_code >= 400:
                no_except_close(res)
                raise NotFoundException(url)

            # redirects carry the memento in Location, direct answers in
            # Content-Location
            if res.status_code >= 300:
                location_header = 'Location'
            else:
                location_header = 'Content-Location'

            url, timestamp, load_url = self._extract_location(
                url, res.headers.get(location_header))

        cdx = CDXObject()
        cdx['urlkey'] = canonicalize(url)
        cdx['timestamp'] = timestamp
        cdx['url'] = url
        cdx['load_url'] = load_url

        if 'Referer' in headers:
            cdx['set_referrer'] = headers['Referer']

        return iter([cdx])
Exemple #2
0
    def load_index(self, params):
        """Loads the xml query index based on the supplied params

        :param dict[str, str] params: The query params. Keys read here are
            ``url``, ``matchType`` (``exact`` by default), ``closest`` and ``limit``
        :return: A list or generator of cdx objects
        :raises NotFoundException: If the query url is not found
            or the results of the query returns no cdx entries
        :raises BadRequestException: If the match type is not exact or prefix
        """
        closest = params.get('closest')

        url = params.get('url', '')

        matchType = params.get('matchType', 'exact')

        if matchType == 'exact':
            query = self.EXACT_QUERY
        elif matchType == 'prefix':
            query = self.PREFIX_QUERY
        else:
            # '{0}' is a positional field, so the value must be passed
            # positionally; passing it by keyword raised IndexError instead
            # of the intended BadRequestException
            raise BadRequestException('matchType={0} is not supported'.format(matchType))

        try:
            limit = params.get('limit')
            if limit:
                query = 'limit:{0} '.format(limit) + query

            # OpenSearch API requires double-escaping
            # TODO: add option to not double escape if needed
            query_url = self.query_api_url + '?q=' + quote_plus(query + quote_plus(url))
            self.logger.debug("Running query: %s", query_url)
            response = self.session.get(query_url)
            response.raise_for_status()

            results = etree.fromstring(response.content)

            items = results.find('results')

        except Exception:
            # any failure while querying or parsing is reported as not found;
            # the traceback is only printed when debugging
            if self.logger.getEffectiveLevel() == logging.DEBUG:
                import traceback
                traceback.print_exc()

            raise NotFoundException('url {0} not found'.format(url))

        # explicit None / empty check: truth-testing an element is deprecated
        # and only reflects whether it has children
        if items is None or len(items) == 0:
            raise NotFoundException('url {0} not found'.format(url))

        items = items.findall('result')

        if matchType == 'exact':
            cdx_iter = [self.convert_to_cdx(item) for item in items]
            if closest:
                cdx_iter = cdx_sort_closest(closest, cdx_iter, limit=10000)

        else:
            cdx_iter = self.prefix_query_iter(items)

        return cdx_iter
Exemple #3
0
    def _extract_location(self, url, location):
        """Parse a replay location header into its url and timestamp.

        :param url: the originally requested url, used only for error reporting
        :param location: the header value; must start with ``self.prefix``
        :return: tuple of (url, timestamp, replay url) where the replay url
            comes from ``self._get_replay_url``
        :raises NotFoundException: if ``location`` is empty, does not start
            with ``self.prefix`` or does not match ``self.WBURL_MATCH``
        """
        prefix = self.prefix
        if not (location and location.startswith(prefix)):
            raise NotFoundException(url)

        match = self.WBURL_MATCH.search(location[len(prefix):])
        if not match:
            raise NotFoundException(url)

        # group 1 is the timestamp, group 2 the target url
        timestamp, target_url = match.group(1), match.group(2)
        replay_url = self._get_replay_url(timestamp, target_url)
        return target_url, timestamp, replay_url
    def _do_load(self, cdx, params):
        """Generator yielding ``cdx`` once its resource has been loaded.

        The result of ``self.loader.load_resource`` is stored on the cdx under
        ``_cached_result`` before the cdx is yielded.

        :raises NotFoundException: (on first iteration) if the loader returns
            a falsy result
        """
        loaded = self.loader.load_resource(cdx, params)
        if loaded:
            cdx['_cached_result'] = loaded
            yield cdx
            return

        raise NotFoundException('Not a memento: ' + cdx['url'])
Exemple #5
0
    def __call__(self, wbrequest):
        """Serve a static file addressed by the request url.

        The query string is dropped from ``wbrequest.wb_url_str`` and the rest
        is appended to ``self.static_path``. The content type is guessed from
        the path, defaulting to ``application/octet-stream``.

        :param wbrequest: request exposing ``wb_url_str`` and ``env``
        :return: the result of ``WbResponse.text_stream``
        :raises NotFoundException: if loading the file raises ``IOError``
        """
        url = wbrequest.wb_url_str.split('?')[0]
        full_path = self.static_path + url

        try:
            data = self.block_loader.load(full_path)

            # Content-Length is only sent when the stream is seekable
            try:
                data.seek(0, 2)
                size = data.tell()
                data.seek(0)
            except IOError:
                headers = None
            else:
                headers = [('Content-Length', str(size))]

            # NOTE(review): `reader` is built but `data` is what gets passed
            # to text_stream below -- confirm whether `reader` was intended
            env = wbrequest.env
            if 'wsgi.file_wrapper' in env:
                reader = env['wsgi.file_wrapper'](data)
            else:
                reader = iter(lambda: data.read(), '')

            guessed_type = mimetypes.guess_type(full_path)[0]
            content_type = guessed_type or 'application/octet-stream'

            return WbResponse.text_stream(data,
                                          content_type=content_type,
                                          headers=headers)

        except IOError:
            raise NotFoundException('Static File Not Found: ' +
                                    wbrequest.wb_url_str)
Exemple #6
0
    def _check_cdx_iter(self, cdx_iter, query):
        """ Check cdx iter semantics

        If `cdx_iter` has matches it is returned (as wrapped by
        ``self.peek_iter``). Otherwise, if fuzzy matching is configured,
        allowed by the query and the query is exact, the fuzzy query is tried.
        If neither produces a result,
        :exc:`~pywb.utils.wbexception.NotFoundException` is raised.
        """
        peeked = self.peek_iter(cdx_iter)
        if peeked:
            return peeked

        # fuzzy lookup only applies to exact-match queries that allow it
        can_try_fuzzy = (self.fuzzy_query and
                         query.allow_fuzzy and
                         query.is_exact)

        if can_try_fuzzy:
            fuzzy_query_params = self.fuzzy_query(query)
            if fuzzy_query_params:
                return self.load_cdx(**fuzzy_query_params)

        if query.is_exact:
            msg = 'No Captures found for: ' + query.url
        else:
            msg = ('No Captures found for: ' + query.url +
                   ' (' + query.match_type + ' query)')

        raise NotFoundException(msg, url=query.url)
Exemple #7
0
    def timemap_query(self, url, closest='1'):
        """Query the timemap endpoint for the mementos of ``url``.

        :param str url: the (unquoted) url to look up
        :param str closest: value placed in the endpoint path before the url
        :return: the ``mementos`` entry of the json result, or
            ``{"list": []}`` when the failed request had status 404
        :raises NotFoundException: if the request or json decoding fails for
            any other reason
        """
        quoted_url = urllib.quote(url, ':/')
        full = self.timemap_endpoint + closest + '/' + quoted_url
        r = None
        try:
            r = self.session.get(full)
            result = r.json()
        except Exception as e:
            logging.debug(e)
            # compare against None rather than truth-testing: requests-style
            # responses evaluate as False for 4xx/5xx, which made the status
            # specific branches unreachable
            if r is None:
                msg = 'No response'
            elif r.status_code == 503:
                msg = 'No Mementos Currently Available: <br/>'
                msg += r.text
            elif r.status_code == 404:
                return {"list": []}
            else:
                msg = 'Unknown response with: ' + str(r.status_code)

            raise NotFoundException(msg, url=quoted_url)

        mementos = result.get('mementos')
        # if got timemap_index, just cached the timemap, so need to query again
        # TODO: revisit this..
        if not mementos and result.get('timemap_index'):
            # retry with the unquoted url so it is not quoted a second time
            return self.timemap_query(url, closest)

        return mementos
Exemple #8
0
    def load_index(self, params):
        """Create a single 'live' cdx entry for ``params['url']``.

        When a ``filter`` param is present and no ``content_type`` was given,
        a HEAD request to the load url is attempted (best effort) to fill in
        the status and mime type.

        :param dict params: query params; ``url`` and ``key`` are required
        :return: an iterator over exactly one cdx object
        :raises NotFoundException: if ``is_fuzzy`` is set, as there is no
            fuzzy match for live resources
        """
        if params.get('is_fuzzy'):
            raise NotFoundException(params['url'] + '*')

        cdx = CDXObject()
        cdx['urlkey'] = params.get('key').decode('utf-8')
        cdx['timestamp'] = timestamp_now()
        cdx['url'] = params['url']
        cdx['load_url'] = self.get_load_url(params)
        cdx['is_live'] = 'true'

        mime = params.get('content_type', '')
        needs_probe = params.get('filter') and not mime

        if needs_probe:
            try:
                probe = self.sesh.head(cdx['load_url'])

                # 405 is not recorded as the status
                if probe.status_code != 405:
                    cdx['status'] = str(probe.status_code)

                header_value = probe.headers.get('Content-Type')
                if header_value:
                    # keep only the media type, dropping any ';' parameters
                    mime = header_value.split(';')[0]

            except Exception:
                # probing is best effort: fall through with what we have
                pass

        cdx['mime'] = mime

        return iter([cdx])
Exemple #9
0
    def load_cdx(self, **params):
        """Query the warcbase endpoint for captures of ``params['url']``.

        A ``prefix:`` entry in the ``filter`` param selects the collection
        prefix. With ``listColls`` and ``output == 'text'`` the collection
        list is returned instead, joined by newlines.

        :return: an iterator from ``self.iter_cdx`` (wrapped by
            ``self.iter_text`` for text output), or the collection list string
        :raises WbException: if the request itself fails
        :raises BadRequestException: on a status other than 200 or 500
        :raises NotFoundException: if the response body is empty
        """
        is_text = (params.get('output') == 'text')

        # lookup collection prefix; the last 'prefix:' filter wins
        prefix = ''
        for entry in (params.get('filter') or ()):
            if entry.startswith('prefix:'):
                prefix = entry[len('prefix:'):]

        # special path for list all
        if params.get('listColls') and is_text:
            return '\n'.join(self._load_colls())

        url = params['url']

        # force http prefix
        https_prefix = self.HTTPS_PREFIX
        http_prefix = self.HTTP_PREFIX
        if url.startswith(https_prefix):
            url = http_prefix + url[len(https_prefix):]
        elif not url.startswith(http_prefix):
            url = http_prefix + url

        request_uri = self.warcbase_path + prefix + '*/' + url

        try:
            response = requests.get(request_uri)
        except Exception:
            raise WbException('Error reading from: ' + request_uri)

        status = response.status_code
        if status == 500:
            self._invalid_collection(prefix)
        elif status != 200:
            raise BadRequestException(('Invalid status code: {0}'.
                                       format(status)))

        if len(response.content) == 0:
            msg = ('No captures found for <b>{0}</b> in collection <i>{1}</i>'.
                   format(url, prefix.strip('/')))

            raise NotFoundException(msg, url=url)

        lines = response.content.rstrip().split('\n')

        # each line is expected to have three tab-separated fields
        first_fields = lines[0].split('\t')
        if len(first_fields) != 3:
            self._invalid_collection(prefix)

        resp_iter = self.iter_cdx(lines, url)
        return self.iter_text(resp_iter) if is_text else resp_iter
Exemple #10
0
    def _iter_sources(self, params):
        """Return the list of sources loaded from the templated directory.

        The directory is ``self.base_dir`` expanded with ``params`` and joined
        onto ``self.base_prefix``.

        :raises NotFoundException: if loading the files raises any exception
        """
        relative_dir = res_template(self.base_dir, params)
        full_dir = os.path.join(self.base_prefix, relative_dir)

        try:
            return list(self._load_files(full_dir))
        except Exception:
            raise NotFoundException(full_dir)
Exemple #11
0
    def get_timegate_links(self, params, timestamp):
        """Return the ``Link`` header of a timegate HEAD request.

        The timegate url is ``self.timegate_url`` expanded with ``params``;
        the timestamp is sent as an http date in ``Accept-Datetime``.

        :raises NotFoundException: if the request fails or returns an error
            status, or if the response has no ``Link`` header
        """
        timegate = res_template(self.timegate_url, params)
        accept_dt = timestamp_to_http_date(timestamp)

        try:
            request_headers = self._get_headers(params)
            request_headers['Accept-Datetime'] = accept_dt
            res = self.sesh.head(timegate, headers=request_headers)
            res.raise_for_status()
        except Exception as e:
            self.logger.debug('FAILED: ' + str(e))
            raise NotFoundException(timegate)

        link_header = res.headers.get('Link')
        if link_header:
            return link_header

        raise NotFoundException(timegate)
Exemple #12
0
def raise_not_found(url):
    """Raise the not-found error for ``url``; this function never returns.

    :raises NotFoundException: when ``settings.LOG_PLAYBACK_404`` is set
    :raises CustomTemplateException: otherwise, rendering the archive error
        template with a 404 status
    """
    if settings.LOG_PLAYBACK_404:
        raise NotFoundException('No Captures found for: %s' % url, url=url)

    # use a custom error to skip pywb printing of exceptions
    template_kwargs = {
        'content_host': settings.WARC_HOST,
        'err_url': url
    }
    raise CustomTemplateException(
        status='404 Not Found',
        template_path='archive/archive-error.html',
        template_kwargs=template_kwargs)
Exemple #13
0
    def __call__(self, wbrequest):
        """Run the permission check for the url of ``wbrequest``.

        :return: the result of ``self.check_single_url``
        :raises NotFoundException: if the request has no ``wb_url``
        """
        perms_checker = self.perms_policy(wbrequest)

        # only single-url checks are handled; anything else is not found
        if not wbrequest.wb_url:
            raise NotFoundException(NOT_FOUND)

        return self.check_single_url(wbrequest, perms_checker)
Exemple #14
0
    def load_index(self, params):
        """Dispatch to the timegate or timemap handler.

        A truthy ``closest`` param selects ``self.handle_timegate``; otherwise
        ``self.handle_timemap`` is used.

        :raises NotFoundException: if ``is_fuzzy`` is set, as fuzzy matching
            can't be done via memento
        """
        if params.get('is_fuzzy'):
            raise NotFoundException(params['url'] + '*')

        timestamp = params.get('closest')
        if timestamp:
            return self.handle_timegate(params, timestamp)

        return self.handle_timemap(params)
Exemple #15
0
    def apply_filters(self, wbrequest, matcher):
        """Parse the GUID and find the CDXLine in the DB

        The matching lines and the guid are stored in
        ``wbrequest.custom_params`` and the replay timestamp of the request
        url is set from the first line.

        :raises NotFoundException: if the link does not exist or no cdx line
            can be found or generated for the requested url
        """
        guid = matcher.group(1)
        urlkey = surt(wbrequest.wb_url.url)

        try:
            # This will filter out links that have user_deleted=True
            Link.objects.get(guid=guid)
        except Link.DoesNotExist:
            raise NotFoundException()

        def find_lines():
            return CDXLine.objects.filter(urlkey=urlkey, asset__link_id=guid)

        lines = find_lines()

        # Legacy archives didn't generate CDXLines during
        # capture so generate them on demand if not found, unless
        # A: the warc capture hasn't been generated OR
        # B: we know other cdx lines have already been generated
        #    and the requested line is simply missing
        if lines.count() == 0:
            asset = Asset.objects.get(link_id=guid)

            capture_unavailable = asset.warc_capture in [
                Asset.CAPTURE_STATUS_PENDING, Asset.CAPTURE_STATUS_FAILED
            ]
            if capture_unavailable or asset.cdx_lines.count() > 0:
                raise NotFoundException()

            CDXLine.objects.create_all_from_asset(asset)
            lines = find_lines()
            if not len(lines):
                raise NotFoundException()

        # Store the line for use in PermaCDXSource
        # so we don't need to hit the DB again
        custom_params = wbrequest.custom_params
        custom_params['lines'] = lines
        custom_params['guid'] = guid

        # Adds the Memento-Datetime header
        # Normally this is done in MementoReqMixin#_parse_extra
        # but we need the GUID to make the DB query and that
        # isn't parsed from the url until this point
        first_line = lines.first()
        wbrequest.wb_url.set_replay_timestamp(first_line.timestamp)
Exemple #16
0
    def load_cdx(self, **params):
        """Run an opensearch query and return the matching cdx entries.

        With a ``closest`` param the closest query is used and results are
        sorted around that timestamp; otherwise the timemap query is used and
        results are sorted from ``EARLIEST_DATE``.

        :param params: query params; ``url`` is read, plus the optional
            ``closest``, ``output`` (``text`` by default, or ``json``) and ``fl``
        :return: an iterator over the cdx entries, formatted per ``output``
        :raises WbException: if the request or reading the response fails
        :raises NotFoundException: if the result contains no items
        """
        closest = params.get('closest')

        self.check_url(params)

        if closest:
            query = self._get_closest_query(params)
        else:
            query = self._get_timemap_query(params)

        query = quote_plus(query) + self.CLOSEST_QUERY_FIXED
        full_url = self.opensearch_query + '?query=' + query
        print('QUERY', full_url)

        output = params.get('output', 'text')
        url = params.get('url')
        urlkey = canonicalize(url)

        try:
            response = requests.get(full_url, stream=True)
            try:
                buff = response.raw.read()
            finally:
                # release the connection even when the read fails
                response.raw.close()
        except Exception as e:
            import traceback
            # print_exc() reports the exception being handled; its first
            # positional parameter is `limit`, not the exception
            traceback.print_exc()
            raise WbException(e)

        results = etree.fromstring(buff)

        # a result without a <channel> element is treated as having no items
        channel = results.find('channel')
        items = channel.findall('item') if channel is not None else []

        cdx_list = [self.convert_to_cdx(item, urlkey, url) for item in items]

        if not cdx_list:
            raise NotFoundException('url {0} not found'.format(url))

        if closest:
            cdx_list = cdx_sort_closest(closest, cdx_list, limit=10000)
        else:
            cdx_list = cdx_sort_closest(EARLIEST_DATE, cdx_list, limit=10000)

        if output == 'text':
            cdx_list = [str(cdx) + '\n' for cdx in cdx_list]
        elif output == 'json':
            fields = params.get('fl', '').split(',')
            cdx_list = [cdx.to_json(fields) for cdx in cdx_list]

        return iter(cdx_list)
Exemple #17
0
    def load_index(self, params):
        """Lazily yield cdx objects from the templated index file.

        The file name is ``self.filename_template`` expanded with ``params``.
        The file is opened immediately; it is read, and closed again, by the
        returned generator.

        :return: a generator of ``CDXObject`` for the lines produced by
            ``iter_range`` for ``params['key']`` / ``params['end_key']``
        :raises NotFoundException: if the file can not be opened
        """
        filename = res_template(self.filename_template, params)

        try:
            fh = open(filename, 'rb')
        except IOError:
            raise NotFoundException(filename)

        def cdx_objects(stream):
            # the file stays open only while the generator is being consumed
            with stream:
                matching = iter_range(stream, params['key'], params['end_key'])
                for line in matching:
                    yield CDXObject(line)

        return cdx_objects(fh)
Exemple #18
0
        def mock_func(self, params, closest):
            """Stand-in for ``get_timegate_links`` used by the tests.

            When ``load`` is set, the original method is called and its result
            is printed and returned. Otherwise the recorded link header for
            this test and timegate url is returned after a short sleep.

            :raises NotFoundException: if no recorded data can be looked up
            """
            if not load:
                try:
                    recorded = cls.link_header_data[test_name][self.timegate_url]
                    time.sleep(0.2)
                except Exception as err:
                    print(err)
                    msg = self.timegate_url.format(url=params['url'])
                    raise NotFoundException(msg)

                return recorded

            live = cls.orig_get_timegate_links(self, params, closest)
            print(test_name + ': ')
            print("    '{0}': '{1}'".format(self.timegate_url, live))
            return live
Exemple #19
0
    def render_content(self, wbrequest, cdx_lines, cdx_loader):
        """Return the first response that can be produced from ``cdx_lines``.

        Each cdx is tried in order until one yields a truthy response.
        ``CaptureException`` and ``ArchiveLoadFailed`` are logged and the next
        cdx is tried. The redirect check runs until it has completed once
        without raising.

        :return: a redirect response or the replay response
        :raises: the last capture/load error, or ``NotFoundException`` if no
            error was recorded and no response was produced
        """
        # List of already failed w/arcs
        failed_files = []

        last_error = None
        redirect_pending = True
        response = None

        # Iterate over the cdx until find one that works
        # The cdx should already be sorted in
        # closest-to-timestamp order (from the cdx server)
        for cdx in cdx_lines:
            try:
                # optimize: can detect if redirect is needed just from the cdx,
                # no need to load w/arc data if requiring exact match
                if redirect_pending:
                    redir_response = self._redirect_if_needed(wbrequest, cdx)
                    if redir_response:
                        return redir_response

                    redirect_pending = False

                response = self.cached_replay_capture(wbrequest, cdx,
                                                      cdx_loader, failed_files)

            except (CaptureException, ArchiveLoadFailed) as ce:
                logging.debug(ce)
                last_error = ce

            else:
                if response:
                    return response

        if last_error:
            raise last_error

        # can only get here if cdx_lines is empty somehow
        # should be filtered out before hand, but if not
        msg = 'No Captures found for: ' + wbrequest.wb_url.url
        raise NotFoundException(msg)
Exemple #20
0
    def timegate_query(self, timestamp, url):
        """Query the api endpoint for the mementos of ``url`` at ``timestamp``.

        :param str timestamp: placed in the endpoint path before the url
        :param str url: the url to look up; it is quoted before use
        :return: the ``mementos`` entry of the json result
        :raises NotFoundException: if the request fails, returns an error
            status or the body is not valid json
        """
        url = urllib.quote(url, ':/')
        full = self.api_endpoint + timestamp + '/' + url
        r = None
        try:
            r = self.session.get(full)
            r.raise_for_status()
            result = r.json()
        except Exception:
            # compare against None rather than truth-testing: requests-style
            # responses evaluate as False for 4xx/5xx, so the traceback was
            # never printed for error statuses
            if r is not None and r.status_code != 404:
                import traceback
                # print_exc() reports the exception being handled; its first
                # positional parameter is `limit`, not the exception
                traceback.print_exc()

            msg = 'No Mementos Found'
            raise NotFoundException(msg, url=url)

        return result['mementos']
Exemple #21
0
def raise_not_found(url, timestamp=None):
    """Raise the not-found error for ``url``; this function never returns.

    :param url: the url that was not found
    :param timestamp: timestamp passed to the error template; when falsy the
        current time is used, formatted as ``%Y%m%d%H%M%S``
    :raises NotFoundException: when ``settings.LOG_PLAYBACK_404`` is set
    :raises CustomTemplateException: otherwise, rendering the archive error
        template with a 404 status
    """
    if settings.LOG_PLAYBACK_404:
        raise NotFoundException('No Captures found for: %s' % url, url=url)

    # use a custom error to skip pywb printing of exceptions
    if not timestamp:
        # if timestamp is not available, fall back on today's date
        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

    template_kwargs = {
        'content_host': settings.WARC_HOST,
        'err_url': url,
        'timestamp': timestamp,
    }
    raise CustomTemplateException(
        status='404 Not Found',
        template_path='archive/archive-error.html',
        template_kwargs=template_kwargs)
Exemple #22
0
    def __call__(self, environ, url_str):
        """Serve a static file addressed by ``url_str``.

        The query string is dropped and ``index.html`` is appended to urls
        ending in ``/``. The file is looked up in ``environ['pywb.static_dir']``
        first (when set and the file exists there), then in
        ``self.static_path``. The content type is guessed from the path,
        defaulting to ``application/octet-stream``.

        :param dict environ: the request environ
        :param str url_str: the requested static url
        :return: the result of ``WbResponse.bin_stream``
        :raises NotFoundException: if loading or seeking the file raises
            ``IOError``
        """
        url = url_str.split('?')[0]

        if url.endswith('/'):
            url += 'index.html'

        full_path = environ.get('pywb.static_dir')
        if full_path:
            full_path = os.path.join(full_path, url)
            if not os.path.isfile(full_path):
                full_path = None

        if not full_path:
            full_path = os.path.join(self.static_path, url)

        try:
            data = self.block_loader.load(full_path)

            data.seek(0, 2)
            size = data.tell()
            data.seek(0)
            headers = [('Content-Length', str(size))]

            reader = None

            if 'wsgi.file_wrapper' in environ:
                try:
                    reader = environ['wsgi.file_wrapper'](data)
                except Exception:
                    # the file wrapper is optional: fall back to the plain
                    # reader below (no longer swallows KeyboardInterrupt /
                    # SystemExit as the bare except did)
                    pass

            if not reader:
                reader = iter(lambda: data.read(), b'')

            content_type = 'application/octet-stream'

            guessed = mimetypes.guess_type(full_path)
            if guessed[0]:
                content_type = guessed[0]

            return WbResponse.bin_stream(reader,
                                         content_type=content_type,
                                         headers=headers)

        except IOError:
            raise NotFoundException('Static File Not Found: ' + url_str)
Exemple #23
0
    def handle_timemap(self, params):
        """Fetch the timemap for ``params`` and convert it to cdx objects.

        The timemap url is ``self.timemap_url`` expanded with ``params``.

        :param dict params: query params; ``_timeout`` is used as the request
            timeout when present
        :return: the result of ``self.links_to_cdxobject`` for the response text
        :raises NotFoundException: if the request fails, returns an error
            status or the response text is empty
        """
        url = res_template(self.timemap_url, params)
        headers = self._get_headers(params)
        try:
            res = self.sesh.get(url,
                                headers=headers,
                                timeout=params.get('_timeout'))

            res.raise_for_status()

            # explicit check instead of `assert`, which is stripped when
            # Python runs with -O and would let an empty timemap through
            if not res.text:
                raise ValueError('empty timemap response')

        except Exception as e:
            self.logger.debug('FAILED: ' + str(e))
            raise NotFoundException(url)

        links = res.text
        return self.links_to_cdxobject(links, 'timemap')
Exemple #24
0
    def load_index(self, params):
        """Fetch cdx lines from the remote api and yield them as cdx objects.

        The request is made immediately; the returned generator converts each
        non-empty line of the response into a ``CDXObject`` and passes it to
        ``self._set_load_url``.

        :param dict params: query params; ``_timeout`` is used as the request
            timeout when present
        :raises NotFoundException: if the request fails or returns an error
            status
        """
        api_url = self._get_api_url(params)

        try:
            r = self.sesh.get(api_url, timeout=params.get('_timeout'))
            r.raise_for_status()
        except Exception as e:
            self.logger.debug('FAILED: ' + str(e))
            raise NotFoundException(api_url)

        raw_lines = r.content.strip().split(b'\n')

        def cdx_objects(entries):
            for entry in entries:
                # blank lines carry no cdx data
                if entry:
                    cdx = CDXObject(entry)
                    self._set_load_url(cdx, params)
                    yield cdx

        return cdx_objects(raw_lines)
Exemple #25
0
    def handle_methods(self, env, start_response):
        """Route the request and invoke the resulting response.

        When the router returns nothing, the request is passed to
        ``self.fallback_app`` if one is set. Exceptions are converted into
        responses by ``self.handle_exception``; its last argument is True
        for exceptions that are not a ``WbException``.

        :return: the result of calling the response (or the fallback app)
            with ``env`` and ``start_response``
        """
        router = self.wb_router
        response = None

        try:
            response = router(env)

            if not response:
                if not self.fallback_app:
                    msg = 'No handler for "{0}".'.format(env['REL_REQUEST_URI'])
                    raise NotFoundException(msg)

                return self.fallback_app(env, start_response)

        except WbException as wbe:
            response = self.handle_exception(env, wbe, False)

        except Exception as exc:
            response = self.handle_exception(env, exc, True)

        return response(env, start_response)
Exemple #26
0
    def handle_methods(self, env, start_response):
        """Route the request and invoke the resulting response.

        ``env['REL_REQUEST_URI']`` is set first: from ``REQUEST_URI`` when
        that is present and no ``SCRIPT_NAME`` is set, otherwise from
        ``rel_request_uri(env)``. Exceptions are converted into responses by
        ``self.handle_exception``; its last argument is True for exceptions
        that are not a ``WbException``.

        :return: the result of calling the response with ``env`` and
            ``start_response``
        """
        if not env.get('SCRIPT_NAME') and env.get('REQUEST_URI'):
            env['REL_REQUEST_URI'] = env['REQUEST_URI']
        else:
            env['REL_REQUEST_URI'] = rel_request_uri(env)

        router = self.wb_router
        response = None

        try:
            response = router(env)

            if not response:
                raise NotFoundException(
                    'No handler for "{0}".'.format(env['REL_REQUEST_URI']))

        except WbException as wbe:
            response = self.handle_exception(env, wbe, False)

        except Exception as exc:
            response = self.handle_exception(env, exc, True)

        return response(env, start_response)
Exemple #27
0
    def render_content(self, wb_url, kwargs, environ):
        """Produce the response for the archival url ``wb_url``.

        Handles, in order: the ``Prefer``-based modifier (possibly redirecting
        to it), timemap / query / custom responses, the upstream request made
        through ``self._do_req``, the trailing-slash and exact-timestamp
        redirects, and finally rewriting of the parsed record.

        :param str wb_url: the raw archival url string; ``#`` is escaped and
            the result is wrapped in a ``WbUrl``
        :param dict kwargs: request options; keys read here include
            ``output``, ``no_timegate_check``, ``coll``, ``metadata`` and
            ``cache``
        :param dict environ: the request environ;
            ``HTTP_X_WOMBAT_HISTORY_PAGE`` is popped from it
        :return: a response object (``WbResponse`` or the result of one of
            the delegated handlers / redirect helpers)
        :raises NotFoundException: if the upstream response status is 404
        :raises UpstreamException: for any other upstream status >= 400
        """
        # escape '#' before the string is wrapped in a WbUrl
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        # a history-page header overrides the url and forces ajax handling
        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        self.prepare_env(environ)

        host_prefix = environ['pywb.host_prefix']
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        pywb_static_prefix = environ['pywb.static_prefix'] + '/'
        is_proxy = ('wsgiprox.proxy_host' in environ)

        # if OPTIONS in proxy mode, just generate the proxy response
        if is_proxy and self.is_preflight(environ):
            return WbResponse.options_response(environ)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        # no redirects if in proxy
        redirect_to_exact = self.redirect_to_exact and not is_proxy

        # Check Prefer
        pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                      content_rw, is_proxy)

        response = None
        keep_frame_response = False

        # prefer overrides custom response?
        if pref_mod is not None:
            # fast-redirect to preferred
            if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
                new_url = full_prefix + wb_url.to_str(mod=pref_mod)
                headers = [('Preference-Applied', pref_applied),
                           ('Vary', 'Prefer')]

                return WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect',
                                                 headers=headers)
            else:
                wb_url.mod = pref_mod
        else:
            if kwargs.get('output'):
                response = self.handle_timemap(wb_url, kwargs, full_prefix)

            elif wb_url.is_query():
                response = self.handle_query(environ, wb_url, kwargs,
                                             full_prefix)

            else:
                response = self.handle_custom_response(environ, wb_url,
                                                       full_prefix,
                                                       host_prefix, kwargs)

                keep_frame_response = (not kwargs.get('no_timegate_check')
                                       and is_timegate
                                       and not is_proxy) or redirect_to_exact

        # a response produced above is returned right away unless it has to
        # wait for the cdx timestamp (keep_frame_response)
        if response and not keep_frame_response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        # a url without a path is redirected to '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        # upstream error: read the error body for the exception details,
        # always closing the raw stream
        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            if r.status_code == 404:
                raise NotFoundException(url=wb_url.url, msg=details)

            else:
                raise UpstreamException(r.status_code,
                                        url=wb_url.url,
                                        details=details)

        # the matched cdx entry is read from the Warcserver-Cdx response header
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        # only redirect to exact if not live, otherwise set to false
        redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

        # return top-frame timegate response, with timestamp from cdx
        if response and keep_frame_response and (not redirect_to_exact
                                                 or not is_timegate):
            no_except_close(r.raw)
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy,
                                        cdx['timestamp'])

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redirect to exact timestamp (only set if not live)
        if redirect_to_exact:
            if set_content_loc or is_timegate or wb_url.timestamp != cdx.get(
                    'timestamp'):
                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri,
                                                full_prefix,
                                                memento_dt,
                                                cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate,
                                                is_proxy,
                                                pref_applied=pref_applied,
                                                mod=pref_mod,
                                                is_memento=False)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        # a range applied to the record switches the url to the 'id_' modifier
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                metadata=kwargs.get('metadata', {}),
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        # history-page requests only record the page title and return it as json
        if history_page:
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # statusline without a space (no reason phrase): append a placeholder
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'],
                                    full_prefix,
                                    memento_dt,
                                    cdx['timestamp'],
                                    status_headers,
                                    is_timegate,
                                    is_proxy,
                                    cdx.get('source-coll'),
                                    mod=pref_mod,
                                    pref_applied=pref_applied)

            set_content_loc = True

        if set_content_loc and not redirect_to_exact and not is_proxy:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))

        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        if is_proxy and environ.get('HTTP_ORIGIN'):
            response.add_access_control_headers(environ)

        # long-lived caching only for 200 responses when 'cache' is 'always'
        # and the request carries a referer
        if r.status_code == 200 and kwargs.get(
                'cache') == 'always' and environ.get('HTTP_REFERER'):
            response.status_headers[
                'Cache-Control'] = 'public, max-age=31536000, immutable'

        return response
Exemple #28
0
 def _invalid_collection(self, prefix):
     """Always raise ``NotFoundException`` naming the invalid collection.

     :param str prefix: the collection prefix; surrounding ``/`` are stripped
         before it is placed in the message
     """
     collection = prefix.strip('/')
     template = 'Sorry, <i>{0}</i> is not a valid collection. '
     raise NotFoundException(template.format(collection))
Exemple #29
0
def raise_not_found(url):
    """Always raise ``NotFoundException`` for ``url``.

    :param url: the url that has no captures; included in the message and
        passed on as the exception's ``url``
    """
    message = 'No Captures found for: %s' % url
    raise NotFoundException(message, url=url)
Exemple #30
0
 def _do_open(self, filename):
     """Open ``filename`` for binary reading and return the file object.

     :raises NotFoundException: if opening the file raises ``IOError``
     """
     try:
         handle = open(filename, 'rb')
     except IOError:
         raise NotFoundException(filename)

     return handle