Ejemplo n.º 1
0
    def __init__(self, is_framed_replay=False, proxies=None):
        """Create the content rewriter and record the proxy settings.

        :param is_framed_replay: forwarded to ``RewriteContent``.
        :param proxies: falsy for direct fetching; a single string is
            expanded to a mapping used for both ``http`` and ``https``;
            any other truthy value is stored unchanged.
        """
        self.rewriter = RewriteContent(is_framed_replay=is_framed_replay)
        self.proxies = proxies
        self.live_request = live_request

        if not self.proxies:
            logging.debug('Live Rewrite Direct (no proxy)')
            return

        logging.debug('Live Rewrite via proxy ' + str(proxies))

        # a bare proxy string applies to both schemes
        if isinstance(proxies, str):
            self.proxies = {scheme: proxies for scheme in ('http', 'https')}
Ejemplo n.º 2
0
    def __init__(self, config):
        """Initialise the handler from ``config``.

        Reads ``upstream_url`` and ``framed_replay`` from ``config`` and
        creates the record loader and the content rewriter.
        """
        super(PlatformHandler, self).__init__(config)

        self.upstream_url = config.get('upstream_url')
        self.loader = ArcWarcRecordLoader()
        self.content_rewriter = RewriteContent(
            is_framed_replay=config.get('framed_replay'))
Ejemplo n.º 3
0
    def __init__(self, content_loader, config):
        """Configure the view from ``config``.

        :param content_loader: stored as ``self.content_loader``.
        :param config: mapping queried with ``get`` for ``framed_replay``,
            ``buffer_response``, ``redir_to_exact``, ``enable_memento``
            and ``reporter``.
        """
        self.content_loader = content_loader
        self.content_rewriter = RewriteContent(
            is_framed_replay=config.get('framed_replay'))
        self.head_insert_view = HeadInsertView.init_from_config(config)

        self.buffer_response = config.get('buffer_response', True)
        self.redir_to_exact = config.get('redir_to_exact', True)

        # memento support selects the response class
        use_memento = config.get('enable_memento', False)
        self.response_class = MementoResponse if use_memento else WbResponse

        self._reporter = config.get('reporter')
Ejemplo n.º 4
0
    def _buffer_response(status_headers, iterator):
        """Spool ``iterator`` into a temporary file and fix Content-Length.

        Every chunk is written to a ``SpooledTemporaryFile`` created with
        ``ProxyRouter.BUFF_RESPONSE_MEM_SIZE`` as its in-memory limit.  Once
        the iterator is exhausted, the ``Content-Length`` header of
        ``status_headers`` is replaced with the number of bytes written.

        :param status_headers: object exposing ``replace_header(name, value)``.
        :param iterator: iterable of chunks supporting ``len()``.
        :returns: ``RewriteContent.stream_to_gen`` applied to the rewound
            temporary file.
        :raises: whatever the iterator or header update raises; the
            temporary file is closed before the error propagates.
        """
        out = SpooledTemporaryFile(ProxyRouter.BUFF_RESPONSE_MEM_SIZE)

        try:
            size = 0
            for buff in iterator:
                size += len(buff)
                out.write(buff)

            # replace any existing content length with the buffered size
            status_headers.replace_header('Content-Length', str(size))

            out.seek(0)
            return RewriteContent.stream_to_gen(out)
        except BaseException:
            # on failure the file is never handed to the caller, so it must
            # be closed here to avoid leaking the spooled buffer
            out.close()
            raise
Ejemplo n.º 5
0
    def _buffer_response(status_headers, iterator):
        """Buffer the whole response and set an exact Content-Length.

        Chunks from ``iterator`` are written to a ``SpooledTemporaryFile``
        (memory limit ``ProxyRouter.BUFF_RESPONSE_MEM_SIZE``) while their
        lengths are summed.  The total then replaces the ``Content-Length``
        header on ``status_headers``.

        :param status_headers: object exposing ``replace_header(name, value)``.
        :param iterator: iterable of chunks supporting ``len()``.
        :returns: ``RewriteContent.stream_to_gen`` applied to the temporary
            file after seeking back to its start.
        :raises: any error from the iterator or header update, after the
            temporary file has been closed.
        """
        out = SpooledTemporaryFile(ProxyRouter.BUFF_RESPONSE_MEM_SIZE)
        size = 0

        try:
            for buff in iterator:
                size += len(buff)
                out.write(buff)

            # overwrite any existing content length with the real size
            status_headers.replace_header('Content-Length', str(size))
            out.seek(0)
        except BaseException:
            # the buffer would otherwise stay open until garbage collection
            out.close()
            raise

        return RewriteContent.stream_to_gen(out)
Ejemplo n.º 6
0
    def __init__(self, content_loader, config):
        """Configure the view from ``config``.

        :param content_loader: stored as ``self.content_loader``.
        :param config: mapping queried with ``get`` for the replay options
            (framing, buffering, exact redirects, memento, ranges, reporter).
        """
        self.content_loader = content_loader
        self.content_rewriter = RewriteContent(
            is_framed_replay=config.get("framed_replay"))
        self.head_insert_view = HeadInsertView.init_from_config(config)

        # response buffering options
        self.buffer_response = config.get("buffer_response", True)
        self.buffer_max_size = config.get("buffer_max_size", 16384)

        self.redir_to_exact = config.get("redir_to_exact", True)

        if config.get("enable_memento", False):
            self.response_class = MementoResponse
        else:
            self.response_class = WbResponse

        self.enable_range_cache = config.get("enable_ranges", True)
        self._reporter = config.get("reporter")
Ejemplo n.º 7
0
class LiveRewriter(object):
    def __init__(self, is_framed_replay=False, proxies=None):
        """Set up the rewriter, the request callable and the proxy mapping.

        :param is_framed_replay: forwarded to ``RewriteContent``.
        :param proxies: falsy means no proxy; a string is expanded to a
            dict with the same value for ``http`` and ``https``; any other
            truthy value is kept as given.
        """
        self.rewriter = RewriteContent(is_framed_replay=is_framed_replay)
        self.live_request = live_request
        self.proxies = proxies

        if not self.proxies:
            logging.debug('Live Rewrite Direct (no proxy)')
            return

        logging.debug('Live Rewrite via proxy ' + str(proxies))

        if isinstance(proxies, str):
            # one proxy url serves both schemes
            self.proxies = dict(http=proxies, https=proxies)

    def is_recording(self):
        return self.proxies is not None

    def fetch_local_file(self, uri):
        """Load ``uri`` through ``LocalFileLoader`` with synthetic headers.

        The content type is guessed from ``uri`` with
        ``mimetypes.guess_type`` and wrapped in a ``200 OK`` status.

        :returns: ``(status_headers, stream)`` tuple.
        """
        stream = LocalFileLoader().load(uri)

        guessed_type = mimetypes.guess_type(uri)[0]

        # create fake headers for local file
        fake_headers = [('Content-Type', guessed_type)]
        status_headers = StatusAndHeaders('200 OK', fake_headers)

        return (status_headers, stream)

    def translate_headers(self, url, urlkey, env):
        """Build outgoing request headers from the entries of ``env``.

        ``Host``, ``Origin`` and ``X-Forwarded-Proto`` are derived from
        ``url``; the cookie header goes through ``_req_cookie_rewrite``;
        the referer and pywb requested-with entries are dropped; remaining
        ``HTTP_*`` keys and the content length/type keys are converted to
        header-style names.  Entries with a falsy value are omitted.

        :returns: dict mapping header name to value.
        """
        parts = urlsplit(url)
        result = {}
        saw_cookie = False

        for key, val in six.iteritems(env):
            # these are never forwarded
            if key in ('HTTP_REFERER', 'HTTP_X_PYWB_REQUESTED_WITH'):
                continue

            if key == 'HTTP_HOST':
                key, val = 'Host', parts.netloc

            elif key == 'HTTP_ORIGIN':
                key, val = 'Origin', (parts.scheme + '://' + parts.netloc)

            elif key == 'HTTP_X_CSRFTOKEN':
                key = 'X-CSRFToken'
                # prefer the token from the client cookie when there is one
                val = extract_client_cookie(env, 'csrftoken') or val

            elif key == 'HTTP_X_FORWARDED_PROTO':
                key, val = 'X-Forwarded-Proto', parts.scheme

            elif key == 'HTTP_COOKIE':
                key = 'Cookie'
                val = self._req_cookie_rewrite(urlkey, val)
                saw_cookie = True

            elif key.startswith('HTTP_'):
                key = key[5:].title().replace('_', '-')

            elif key in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                key = key.title().replace('_', '-')

            elif key == 'REL_REFERER':
                key = 'Referer'

            else:
                # anything else in env is not a request header
                continue

            if val:
                result[key] = val

        if not saw_cookie:
            # rewrite rules still get a chance to produce a cookie
            cookie = self._req_cookie_rewrite(urlkey, '')
            if cookie:
                result['Cookie'] = cookie

        return result

    def _req_cookie_rewrite(self, urlkey, value):
        rule = self.rewriter.ruleset.get_first_match(urlkey)
        if not rule or not rule.req_cookie_rewrite:
            return value

        for cr in rule.req_cookie_rewrite:
            try:
                value = cr['rx'].sub(cr['replace'], value)
            except KeyError:
                pass

        return value

    def fetch_http(self,
                   url,
                   urlkey=None,
                   env=None,
                   req_headers=None,
                   follow_redirects=False,
                   skip_recording=False,
                   verify=True):

        method = 'GET'
        data = None

        proxies = None
        if not skip_recording:
            proxies = self.proxies

        if not req_headers:
            req_headers = {}

        if env is not None:
            method = env['REQUEST_METHOD'].upper()
            input_ = env['wsgi.input']

            req_headers.update(self.translate_headers(url, urlkey, env))

            if method in ('POST', 'PUT'):
                len_ = env.get('CONTENT_LENGTH')
                if len_:
                    data = LimitReader(input_, int(len_))
                else:
                    data = input_

        response = self.live_request(method=method,
                                     url=url,
                                     data=data,
                                     headers=req_headers,
                                     allow_redirects=follow_redirects,
                                     proxies=proxies,
                                     stream=True,
                                     verify=verify)

        statusline = str(response.status_code) + ' ' + response.reason

        headers = response.headers.items()

        stream = response.raw

        try:  #pragma: no cover
            #PY 3
            headers = stream._original_response.headers._headers
        except:  #pragma: no cover
            #PY 2
            headers = []
            resp_headers = stream._original_response.msg.headers
            for h in resp_headers:
                n, v = h.split(':', 1)
                n = n.strip()
                v = v.strip()
                headers.append((n, v))

        status_headers = StatusAndHeaders(statusline, headers)

        return (status_headers, stream)

    def fetch_request(self,
                      url,
                      urlrewriter,
                      head_insert_func=None,
                      urlkey=None,
                      env=None,
                      req_headers={},
                      timestamp=None,
                      follow_redirects=False,
                      skip_recording=False,
                      verify=True,
                      remote_only=True):

        ts_err = url.split('///')

        # fixup for accidental erroneous rewrite which has ///
        # (unless file:///)
        if len(ts_err) > 1 and ts_err[0] != 'file:':
            url = 'http://' + ts_err[1]

        if url.startswith('//'):
            url = 'http:' + url

        if remote_only or is_http(url):
            is_remote = True
        else:
            is_remote = False
            if not url.startswith('file:'):
                url = to_file_url(url)

        # explicit urlkey may be passed in (say for testing)
        if not urlkey:
            urlkey = canonicalize(url)

        if is_remote:
            (status_headers, stream) = self.fetch_http(url, urlkey, env,
                                                       req_headers,
                                                       follow_redirects,
                                                       skip_recording, verify)
        else:
            (status_headers, stream) = self.fetch_local_file(url)

        if timestamp is None:
            timestamp = timestamp_now()

        cdx = {
            'urlkey': urlkey,
            'timestamp': timestamp,
            'url': url,
            'status': status_headers.get_statuscode(),
            'mime': status_headers.get_header('Content-Type'),
            'is_live': True,
        }

        result = (self.rewriter.rewrite_content(
            urlrewriter,
            status_headers,
            stream,
            head_insert_func=head_insert_func,
            urlkey=urlkey,
            cdx=cdx))

        if env:
            env['pywb.cdx'] = cdx

        return result

    def fetch_async(self, url, headers):
        resp = self.live_request(method='GET',
                                 url=url,
                                 headers=headers,
                                 proxies=self.proxies,
                                 verify=False,
                                 stream=True)

        # don't actually read whole response,
        # proxy response for writing it
        resp.close()

    def add_metadata(self, url, headers, data):
        return self.live_request(method='PUTMETA',
                                 url=url,
                                 data=data,
                                 headers=headers,
                                 proxies=self.proxies,
                                 verify=False)

    def get_rewritten(self, *args, **kwargs):
        result = self.fetch_request(*args, **kwargs)

        status_headers, gen, is_rewritten = result

        buff = b''.join(gen)

        return (status_headers, buff)

    def get_video_info(self, url):
        """Return the result of ``youtubedl.extract_info`` for ``url``."""
        info = youtubedl.extract_info(url)
        return info
Ejemplo n.º 8
0
class ReplayView(object):
    STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$')

    def __init__(self, content_loader, config):
        """Configure the replay view from ``config``.

        :param content_loader: stored as ``self.content_loader``.
        :param config: mapping queried with ``get`` for ``framed_replay``,
            ``buffer_response``, ``redir_to_exact``, ``enable_memento``
            and ``reporter``.
        """
        self.content_loader = content_loader
        self.content_rewriter = RewriteContent(
            is_framed_replay=config.get('framed_replay'))
        self.head_insert_view = HeadInsertView.init_from_config(config)

        self.buffer_response = config.get('buffer_response', True)
        self.redir_to_exact = config.get('redir_to_exact', True)

        # memento support selects the response class
        if config.get('enable_memento', False):
            self.response_class = MementoResponse
        else:
            self.response_class = WbResponse

        self._reporter = config.get('reporter')

    def render_content(self, wbrequest, cdx_lines, cdx_loader):
        last_e = None
        first = True

        #cdx_lines = args[0]
        #cdx_loader = args[1]

        # List of already failed w/arcs
        failed_files = []

        response = None

        # Iterate over the cdx until find one that works
        # The cdx should already be sorted in
        # closest-to-timestamp order (from the cdx server)
        for cdx in cdx_lines:
            try:
                # optimize: can detect if redirect is needed just from the cdx,
                # no need to load w/arc data if requiring exact match
                if first:
                    redir_response = self._redirect_if_needed(wbrequest, cdx)
                    if redir_response:
                        return redir_response

                    first = False

                response = self.replay_capture(wbrequest, cdx, cdx_loader,
                                               failed_files)

            except (CaptureException, ArchiveLoadFailed) as ce:
                import traceback
                traceback.print_exc()
                last_e = ce
                pass

            if response:
                return response

        if not last_e:
            # can only get here if cdx_lines is empty somehow
            # should be filtered out before hand, but if not
            msg = 'No Captures found for: ' + wbrequest.wb_url.url
            last_e = NotFoundException(msg)

        raise last_e

    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        """Load, rewrite and wrap the capture described by ``cdx``.

        The steps are: resolve headers and payload through
        ``self.content_loader``, reject self-redirects, return a redirect
        response if one is needed, rewrite the content, optionally buffer
        it, build ``self.response_class`` and notify the reporter.

        :returns: a redirect response or the replay response.
        """
        loaded = self.content_loader.resolve_headers_and_payload(
            cdx, failed_files, cdx_loader)
        status_headers, stream = loaded

        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        redirect = self._redirect_if_needed(wbrequest, cdx)
        if redirect:
            return redirect

        stream = LimitReader.wrap_stream(
            stream, status_headers.get_header('content-length'))

        # TODO: evaluate a referrer-based self-redirect check here;
        # refreshing in the browser may sometimes set the referrer to the
        # same page, which would incorrectly skip a capture
        # self._reject_referrer_self_redirect(wbrequest)

        urlrewriter = wbrequest.urlrewriter

        # if using url rewriter, use original url for rewriting purposes
        if wbrequest and wbrequest.wb_url:
            wbrequest.wb_url.url = cdx['original']

        if self.head_insert_view:
            insert_func = self.head_insert_view.create_insert_func(wbrequest)
        else:
            insert_func = None

        status_headers, body_iter, _ = self.content_rewriter.rewrite_content(
            urlrewriter,
            headers=status_headers,
            stream=stream,
            head_insert_func=insert_func,
            urlkey=cdx['urlkey'],
            cdx=cdx)

        # buffer response if buffering enabled
        if self.buffer_response:
            body_iter = self.buffered_response(status_headers, body_iter)

        response = self.response_class(status_headers,
                                       body_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)

        # notify reporter callback, if any
        if self._reporter:
            self._reporter(wbrequest, cdx, response)

        return response

    # Buffer rewrite iterator and return a response from a string
    def buffered_response(self, status_headers, iterator):
        out = BytesIO()

        try:
            for buff in iterator:
                out.write(bytes(buff))

        finally:
            content = out.getvalue()

            content_length_str = str(len(content))

            # remove existing content length
            status_headers.replace_header('Content-Length', content_length_str)
            out.close()

        return [content]

    def _redirect_if_needed(self, wbrequest, cdx):
        if wbrequest.options['is_proxy']:
            return None

        redir_needed = (wbrequest.options.get('is_timegate', False))

        if not redir_needed and self.redir_to_exact:
            redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)

        if not redir_needed:
            return None

        new_url = (wbrequest.urlrewriter.get_new_url(
            timestamp=cdx['timestamp'], url=cdx['original']))

        if wbrequest.method == 'POST':
            #   FF shows a confirm dialog, so can't use 307 effectively
            #            statusline = '307 Same-Method Internal Redirect'
            return None
        else:
            statusline = '302 Internal Redirect'

        status_headers = StatusAndHeaders(statusline, [('Location', new_url)])

        # don't include cdx to indicate internal redirect
        return self.response_class(status_headers, wbrequest=wbrequest)

    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop
        """
        if not status_headers.statusline.startswith('3'):
            return

        # skip all 304s
        if (status_headers.statusline.startswith('304')
                and not wbrequest.wb_url.is_identity):

            raise CaptureException('Skipping 304 Modified: ' + str(cdx))

        request_url = wbrequest.wb_url.url.lower()
        location_url = status_headers.get_header('Location')
        if not location_url:
            return

        location_url = location_url.lower()
        if location_url.startswith('/'):
            host = urlsplit(cdx['original']).netloc
            location_url = host + location_url

        if (ReplayView.strip_scheme(request_url) == ReplayView.strip_scheme(
                location_url)):
            raise CaptureException('Self Redirect: ' + str(cdx))

    # TODO: reevaluate this, as it may reject valid refreshes of a page
    def _reject_referrer_self_redirect(self, wbrequest):  # pragma: no cover
        """
        Perform final check for referrer based self-redirect.
        This method should be called after verifying that
        the request timestamp == capture timestamp

        If referrer is same as current url,
        reject this response and try another capture.
        """
        if not wbrequest.referrer:
            return

        # build full url even if using relative-rewriting
        request_url = (wbrequest.host_prefix + wbrequest.rel_prefix +
                       str(wbrequest.wb_url))

        if (ReplayView.strip_scheme(request_url) == ReplayView.strip_scheme(
                wbrequest.referrer)):
            raise CaptureException('Self Redirect via Referrer: ' +
                                   str(wbrequest.wb_url))

    @staticmethod
    def strip_scheme(url):
        """
        >>> ReplayView.strip_scheme('https://example.com') ==\
            ReplayView.strip_scheme('http://example.com')
        True

        >>> ReplayView.strip_scheme('https://example.com') ==\
            ReplayView.strip_scheme('http:/example.com')
        True

        >>> ReplayView.strip_scheme('https://example.com') ==\
            ReplayView.strip_scheme('example.com')
        True

        >>> ReplayView.strip_scheme('about://example.com') ==\
            ReplayView.strip_scheme('example.com')
        True

        >>> ReplayView.strip_scheme('http://') ==\
            ReplayView.strip_scheme('')
        True

        >>> ReplayView.strip_scheme('#!@?') ==\
            ReplayView.strip_scheme('#!@?')
        True
        """
        m = ReplayView.STRIP_SCHEME.match(url)
        match = m.group(2)
        return match
Ejemplo n.º 9
0
def test_type_detect_2():
    """A 'js' hint with an 'html' mod stays 'js' for script content."""
    payload = b' function() { return 0; }'
    text_type, stream = RewriteContent._resolve_text_type(
        'js', 'html', BytesIO(payload))

    assert text_type == 'js'
    # the stream must still yield the complete payload
    assert stream.read() == payload
Ejemplo n.º 10
0
def test_type_detect_1():
    """A 'js' hint with an 'html' mod resolves to 'html' for markup."""
    payload = b' <html></html>'
    text_type, stream = RewriteContent._resolve_text_type(
        'js', 'html', BytesIO(payload))

    assert text_type == 'html'
    # the stream must still yield the complete payload
    assert stream.read() == payload
Ejemplo n.º 11
0
class ReplayView(object):
    STRIP_SCHEME_WWW = re.compile('^([\w]+:[/]*(?:www[\d]*\.)?)?(.*?)$',
                                  re.MULTILINE)

    def __init__(self, content_loader, config):
        """Configure the replay view from ``config``.

        :param content_loader: stored as ``self.content_loader``.
        :param config: mapping queried with ``get`` for the replay options
            (framing, buffering, exact redirects, memento, ranges, reporter).
        """
        self.content_loader = content_loader
        self.content_rewriter = RewriteContent(
            is_framed_replay=config.get('framed_replay'))
        self.head_insert_view = HeadInsertView.init_from_config(config)

        # response buffering options
        self.buffer_response = config.get('buffer_response', True)
        self.buffer_max_size = config.get('buffer_max_size', 16384)

        self.redir_to_exact = config.get('redir_to_exact', True)

        use_memento = config.get('enable_memento', False)
        self.response_class = MementoResponse if use_memento else WbResponse

        self.enable_range_cache = config.get('enable_ranges', True)
        self._reporter = config.get('reporter')

    def render_content(self, wbrequest, cdx_lines, cdx_loader):
        last_e = None
        first = True

        #cdx_lines = args[0]
        #cdx_loader = args[1]

        # List of already failed w/arcs
        failed_files = []

        response = None

        # Iterate over the cdx until find one that works
        # The cdx should already be sorted in
        # closest-to-timestamp order (from the cdx server)
        for cdx in cdx_lines:
            try:
                # optimize: can detect if redirect is needed just from the cdx,
                # no need to load w/arc data if requiring exact match
                if first:
                    redir_response = self._redirect_if_needed(wbrequest, cdx)
                    if redir_response:
                        return redir_response

                    first = False

                response = self.cached_replay_capture(wbrequest, cdx,
                                                      cdx_loader, failed_files)

            except (CaptureException, ArchiveLoadFailed) as ce:
                #import traceback
                #traceback.print_exc()
                logging.debug(ce)
                last_e = ce
                pass

            if response:
                return response

        if not last_e:
            # can only get here if cdx_lines is empty somehow
            # should be filtered out before hand, but if not
            msg = 'No Captures found for: ' + wbrequest.wb_url.url
            last_e = NotFoundException(msg)

        raise last_e

    def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        def get_capture():
            return self.replay_capture(wbrequest, cdx, cdx_loader,
                                       failed_files)

        if not self.enable_range_cache:
            return get_capture()

        range_info = wbrequest.extract_range()

        if not range_info:
            return get_capture()

        range_status, range_iter = (range_cache.handle_range(
            wbrequest, cdx.get('digest', cdx['urlkey']), get_capture,
            *range_info))

        response = self.response_class(range_status,
                                       range_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)
        return response

    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        (status_headers,
         stream) = (self.content_loader(cdx, failed_files, cdx_loader,
                                        wbrequest))

        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        redir_response = self._redirect_if_needed(wbrequest, cdx)
        if redir_response:
            return redir_response

        #length = status_headers.get_header('content-length')
        #stream = LimitReader.wrap_stream(stream, length)

        # one more check for referrer-based self-redirect
        # TODO: evaluate this, as refreshing in browser may sometimes cause
        # referrer to be set to the same page, incorrectly skipping a capture
        # self._reject_referrer_self_redirect(wbrequest)

        urlrewriter = wbrequest.urlrewriter

        # if using url rewriter, use original url for rewriting purposes
        if wbrequest and wbrequest.wb_url:
            wbrequest.wb_url.url = cdx['url']

        if wbrequest.options['is_ajax']:
            wbrequest.urlrewriter.rewrite_opts['is_ajax'] = True

        head_insert_func = None
        if self.head_insert_view:
            head_insert_func = (
                self.head_insert_view.create_insert_func(wbrequest))

        result = (self.content_rewriter.rewrite_content(
            urlrewriter,
            status_headers=status_headers,
            stream=stream,
            head_insert_func=head_insert_func,
            urlkey=cdx['urlkey'],
            cdx=cdx,
            env=wbrequest.env))

        (status_headers, response_iter, is_rewritten) = result

        # buffer response if buffering enabled
        if self.buffer_response:
            content_len = status_headers.get_header('content-length')
            try:
                content_len = int(content_len)
            except:
                content_len = 0

            if content_len <= 0:
                max_size = self.buffer_max_size
                response_iter = self.buffered_response(status_headers,
                                                       response_iter, max_size)

        # Set Content-Location if not exact capture
        if not self.redir_to_exact:
            mod = wbrequest.options.get('replay_mod', wbrequest.wb_url.mod)
            canon_url = (wbrequest.urlrewriter.get_new_url(
                timestamp=cdx['timestamp'], url=cdx['url'], mod=mod))

            status_headers.headers.append(('Content-Location', canon_url))

        if wbrequest.wb_url.mod == 'vi_':
            status_headers.headers.append(('access-control-allow-origin', '*'))

        response = self.response_class(status_headers,
                                       response_iter,
                                       wbrequest=wbrequest,
                                       cdx=cdx)

        # notify reporter callback, if any
        if self._reporter:
            self._reporter(wbrequest, cdx, response)

        return response

    # Buffer rewrite iterator and return a response from a string
    def buffered_response(self, status_headers, iterator, max_size):
        out = BytesIO()
        size = 0
        read_all = True

        try:
            for buff in iterator:
                buff = bytes(buff)
                size += len(buff)
                out.write(buff)
                if max_size > 0 and size > max_size:
                    read_all = False
                    break

        finally:
            content = out.getvalue()
            out.close()

        if read_all:
            content_length_str = str(len(content))

            # remove existing content length
            status_headers.replace_header('Content-Length', content_length_str)
            return [content]
        else:
            status_headers.remove_header('Content-Length')
            return chain(iter([content]), iterator)

    def _redirect_if_needed(self, wbrequest, cdx):
        if not self.redir_to_exact:
            return None

        if wbrequest.options['is_proxy']:
            return None

        if wbrequest.custom_params.get('noredir'):
            return None

        is_timegate = (wbrequest.options.get('is_timegate', False))
        if not is_timegate:
            is_timegate = wbrequest.wb_url.is_latest_replay()

        redir_needed = is_timegate or (cdx['timestamp'] !=
                                       wbrequest.wb_url.timestamp)

        if not redir_needed:
            return None

        if self.enable_range_cache and wbrequest.extract_range():
            return None

        #if is_timegate:
        #    timestamp = timestamp_now()
        #else:
        timestamp = cdx['timestamp']

        new_url = (wbrequest.urlrewriter.get_new_url(timestamp=timestamp,
                                                     url=cdx['url']))

        if wbrequest.method == 'POST':
            #   FF shows a confirm dialog, so can't use 307 effectively
            #   was: statusline = '307 Same-Method Internal Redirect'
            return None
        elif is_timegate:
            statusline = '302 Found'
        else:
            # clear cdx line to indicate internal redirect
            statusline = '302 Internal Redirect'
            cdx = None

        status_headers = StatusAndHeaders(statusline, [('Location', new_url)])

        return self.response_class(status_headers,
                                   wbrequest=wbrequest,
                                   cdx=cdx,
                                   memento_is_redir=True)

    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop
        """
        if not status_headers.statusline.startswith('3'):
            return

        # skip all 304s
        if (status_headers.statusline.startswith('304')
                and not wbrequest.wb_url.is_identity):

            raise CaptureException('Skipping 304 Modified: ' + str(cdx))

        request_url = wbrequest.wb_url.url.lower()
        location_url = status_headers.get_header('Location')
        if not location_url:
            return

        location_url = location_url.lower()
        if location_url.startswith('/'):
            host = urlsplit(cdx['url']).netloc
            location_url = host + location_url

        if (ReplayView.strip_scheme_www(request_url) ==
                ReplayView.strip_scheme_www(location_url)):
            raise CaptureException('Self Redirect: ' + str(cdx))

    # TODO: reevaluate this, as it may reject valid refreshes of a page
    def _reject_referrer_self_redirect(self, wbrequest):  # pragma: no cover
        """
        Perform final check for referrer based self-redirect.
        This method should be called after verifying that
        the request timestamp == capture timestamp

        If referrer is same as current url,
        reject this response and try another capture.
        """
        if not wbrequest.referrer:
            return

        # build full url even if using relative-rewriting
        request_url = (wbrequest.host_prefix + wbrequest.rel_prefix +
                       str(wbrequest.wb_url))

        if (ReplayView.strip_scheme_www(request_url) ==
                ReplayView.strip_scheme_www(wbrequest.referrer)):
            raise CaptureException('Self Redirect via Referrer: ' +
                                   str(wbrequest.wb_url))

    @staticmethod
    def strip_scheme_www(url):
        """
        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http://example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http:/example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http://www2.example.com')
        True

        >>> ReplayView.strip_scheme_www('about://example.com') ==\
            ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('http://') ==\
            ReplayView.strip_scheme_www('')
        True

        >>> ReplayView.strip_scheme_www('#!@?') ==\
            ReplayView.strip_scheme_www('#!@?')
        True
        """
        m = ReplayView.STRIP_SCHEME_WWW.match(url)
        match = m.group(2)
        return match
Ejemplo n.º 12
0
class ReplayView(object):
    STRIP_SCHEME_WWW = re.compile("^([\w]+:[/]*(?:www[\d]*\.)?)?(.*?)$")

    def __init__(self, content_loader, config):
        """
        Set up the replay view from a content loader and a config mapping.

        Config keys read (with their defaults): ``framed_replay``,
        ``buffer_response`` (True), ``buffer_max_size`` (16384),
        ``redir_to_exact`` (True), ``enable_memento`` (False),
        ``enable_ranges`` (True) and ``reporter``.

        :param content_loader: callable stored as ``self.content_loader``
        :param config: mapping exposing ``get``
        """
        self.content_loader = content_loader

        self.content_rewriter = RewriteContent(is_framed_replay=config.get("framed_replay"))
        self.head_insert_view = HeadInsertView.init_from_config(config)

        # response buffering settings
        self.buffer_response = config.get("buffer_response", True)
        self.buffer_max_size = config.get("buffer_max_size", 16384)

        self.redir_to_exact = config.get("redir_to_exact", True)

        # memento support selects a different response class
        self.response_class = MementoResponse if config.get("enable_memento", False) else WbResponse

        self.enable_range_cache = config.get("enable_ranges", True)
        self._reporter = config.get("reporter")

    def render_content(self, wbrequest, cdx_lines, cdx_loader):
        last_e = None
        first = True

        # cdx_lines = args[0]
        # cdx_loader = args[1]

        # List of already failed w/arcs
        failed_files = []

        response = None

        # Iterate over the cdx until find one that works
        # The cdx should already be sorted in
        # closest-to-timestamp order (from the cdx server)
        for cdx in cdx_lines:
            try:
                # optimize: can detect if redirect is needed just from the cdx,
                # no need to load w/arc data if requiring exact match
                if first:
                    redir_response = self._redirect_if_needed(wbrequest, cdx)
                    if redir_response:
                        return redir_response

                    first = False

                response = self.cached_replay_capture(wbrequest, cdx, cdx_loader, failed_files)

            except (CaptureException, ArchiveLoadFailed) as ce:
                # import traceback
                # traceback.print_exc()
                logging.debug(ce)
                last_e = ce
                pass

            if response:
                return response

        if not last_e:
            # can only get here if cdx_lines is empty somehow
            # should be filtered out before hand, but if not
            msg = "No Captures found for: " + wbrequest.wb_url.url
            last_e = NotFoundException(msg)

        raise last_e

    def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        def get_capture():
            return self.replay_capture(wbrequest, cdx, cdx_loader, failed_files)

        if not self.enable_range_cache:
            return get_capture()

        range_info = wbrequest.extract_range()

        if not range_info:
            return get_capture()

        range_status, range_iter = range_cache.handle_range(
            wbrequest, cdx.get("digest", cdx["urlkey"]), get_capture, *range_info
        )

        response = self.response_class(range_status, range_iter, wbrequest=wbrequest, cdx=cdx)
        return response

    def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
        (status_headers, stream) = self.content_loader(cdx, failed_files, cdx_loader, wbrequest)

        # check and reject self-redirect
        self._reject_self_redirect(wbrequest, cdx, status_headers)

        # check if redir is needed
        redir_response = self._redirect_if_needed(wbrequest, cdx)
        if redir_response:
            return redir_response

        # length = status_headers.get_header('content-length')
        # stream = LimitReader.wrap_stream(stream, length)

        # one more check for referrer-based self-redirect
        # TODO: evaluate this, as refreshing in browser may sometimes cause
        # referrer to be set to the same page, incorrectly skipping a capture
        # self._reject_referrer_self_redirect(wbrequest)

        urlrewriter = wbrequest.urlrewriter

        # if using url rewriter, use original url for rewriting purposes
        if wbrequest and wbrequest.wb_url:
            wbrequest.wb_url.url = cdx["url"]

        head_insert_func = None
        if self.head_insert_view:
            head_insert_func = self.head_insert_view.create_insert_func(wbrequest)

        result = self.content_rewriter.rewrite_content(
            urlrewriter,
            status_headers=status_headers,
            stream=stream,
            head_insert_func=head_insert_func,
            urlkey=cdx["urlkey"],
            cdx=cdx,
        )

        (status_headers, response_iter, is_rewritten) = result

        # buffer response if buffering enabled
        if self.buffer_response:
            content_len = status_headers.get_header("content-length")
            try:
                content_len = int(content_len)
            except:
                content_len = 0

            if content_len <= 0:
                max_size = self.buffer_max_size
                response_iter = self.buffered_response(status_headers, response_iter, max_size)

        # Set Content-Location if not exact capture
        if not self.redir_to_exact:
            mod = wbrequest.options.get("replay_mod", wbrequest.wb_url.mod)
            canon_url = wbrequest.urlrewriter.get_new_url(timestamp=cdx["timestamp"], url=cdx["url"], mod=mod)

            status_headers.headers.append(("Content-Location", canon_url))

        if wbrequest.wb_url.mod == "vi_":
            status_headers.headers.append(("access-control-allow-origin", "*"))

        response = self.response_class(status_headers, response_iter, wbrequest=wbrequest, cdx=cdx)

        # notify reporter callback, if any
        if self._reporter:
            self._reporter(wbrequest, cdx, response)

        return response

    # Buffer rewrite iterator and return a response from a string
    def buffered_response(self, status_headers, iterator, max_size):
        out = BytesIO()
        size = 0
        read_all = True

        try:
            for buff in iterator:
                buff = bytes(buff)
                size += len(buff)
                out.write(buff)
                if max_size > 0 and size > max_size:
                    read_all = False
                    break

        finally:
            content = out.getvalue()
            out.close()

        if read_all:
            content_length_str = str(len(content))

            # remove existing content length
            status_headers.replace_header("Content-Length", content_length_str)
            return [content]
        else:
            status_headers.remove_header("Content-Length")
            return chain(iter([content]), iterator)

    def _redirect_if_needed(self, wbrequest, cdx):
        if not self.redir_to_exact:
            return None

        if wbrequest.options["is_proxy"]:
            return None

        if wbrequest.custom_params.get("noredir"):
            return None

        is_timegate = wbrequest.options.get("is_timegate", False)
        if not is_timegate:
            is_timegate = wbrequest.wb_url.is_latest_replay()

        redir_needed = is_timegate or (cdx["timestamp"] != wbrequest.wb_url.timestamp)

        if not redir_needed:
            return None

        if self.enable_range_cache and wbrequest.extract_range():
            return None

        # if is_timegate:
        #    timestamp = timestamp_now()
        # else:
        timestamp = cdx["timestamp"]

        new_url = wbrequest.urlrewriter.get_new_url(timestamp=timestamp, url=cdx["url"])

        if wbrequest.method == "POST":
            #   FF shows a confirm dialog, so can't use 307 effectively
            #   was: statusline = '307 Same-Method Internal Redirect'
            return None
        elif is_timegate:
            statusline = "302 Found"
        else:
            # clear cdx line to indicate internal redirect
            statusline = "302 Internal Redirect"
            cdx = None

        status_headers = StatusAndHeaders(statusline, [("Location", new_url)])

        return self.response_class(status_headers, wbrequest=wbrequest, cdx=cdx, memento_is_redir=True)

    def _reject_self_redirect(self, wbrequest, cdx, status_headers):
        """
        Check if response is a 3xx redirect to the same url
        If so, reject this capture to avoid causing redirect loop
        """
        if not status_headers.statusline.startswith("3"):
            return

        # skip all 304s
        if status_headers.statusline.startswith("304") and not wbrequest.wb_url.is_identity:

            raise CaptureException("Skipping 304 Modified: " + str(cdx))

        request_url = wbrequest.wb_url.url.lower()
        location_url = status_headers.get_header("Location")
        if not location_url:
            return

        location_url = location_url.lower()
        if location_url.startswith("/"):
            host = urlsplit(cdx["url"]).netloc
            location_url = host + location_url

        if ReplayView.strip_scheme_www(request_url) == ReplayView.strip_scheme_www(location_url):
            raise CaptureException("Self Redirect: " + str(cdx))

    # TODO: reevaluate this, as it may reject valid refreshes of a page
    def _reject_referrer_self_redirect(self, wbrequest):  # pragma: no cover
        """
        Perform final check for referrer based self-redirect.
        This method should be called after verifying that
        the request timestamp == capture timestamp

        If referrer is same as current url,
        reject this response and try another capture.
        """
        if not wbrequest.referrer:
            return

        # build full url even if using relative-rewriting
        request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)

        if ReplayView.strip_scheme_www(request_url) == ReplayView.strip_scheme_www(wbrequest.referrer):
            raise CaptureException("Self Redirect via Referrer: " + str(wbrequest.wb_url))

    @staticmethod
    def strip_scheme_www(url):
        """
        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http://example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http:/example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('https://example.com') ==\
            ReplayView.strip_scheme_www('http://www2.example.com')
        True

        >>> ReplayView.strip_scheme_www('about://example.com') ==\
            ReplayView.strip_scheme_www('example.com')
        True

        >>> ReplayView.strip_scheme_www('http://') ==\
            ReplayView.strip_scheme_www('')
        True

        >>> ReplayView.strip_scheme_www('#!@?') ==\
            ReplayView.strip_scheme_www('#!@?')
        True
        """
        m = ReplayView.STRIP_SCHEME_WWW.match(url)
        match = m.group(2)
        return match
Ejemplo n.º 13
0
class PlatformHandler(RewriteHandler):
    def __init__(self, config):
        """
        Configure the handler from ``config``.

        Reads ``upstream_url`` and ``framed_replay`` from the config, and
        creates the record loader and the content rewriter used by
        ``render_content``.

        :param config: mapping exposing ``get``; also passed to the
                       parent initializer
        """
        super(PlatformHandler, self).__init__(config)

        # url template for the upstream service
        self.upstream_url = config.get('upstream_url')

        self.loader = ArcWarcRecordLoader()
        self.content_rewriter = RewriteContent(is_framed_replay=config.get('framed_replay'))

    def render_content(self, wbrequest):
        """
        Fetch the requested url from the upstream service and rewrite it.

        Video info requests (``mod == 'vi_'``) are delegated to
        ``_get_video_info``. Otherwise the incoming request is
        reconstructed and POSTed to ``self.upstream_url`` (formatted with
        the quoted url, the ``closest`` timestamp or ``'now'``, and the
        request's ``matchdict``). The raw upstream response is parsed as a
        record, its content is rewritten, and the result is handed to
        ``_make_response``.

        Side effect: when a referrer wburl can be extracted,
        ``wbrequest.env['HTTP_REFERER']`` is replaced with its plain url.

        :param wbrequest: the current request
        :return: the value of ``_get_video_info`` or ``_make_response``
        :raises requests.HTTPError: from ``raise_for_status`` on an error
                                    status from upstream
        """
        if wbrequest.wb_url.mod == 'vi_':
            return self._get_video_info(wbrequest)

        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            wbrequest.env['HTTP_REFERER'] = WbUrl(ref_wburl_str).url

        urlkey = canonicalize(wbrequest.wb_url.url)
        url = wbrequest.wb_url.url

        inputreq = RewriteInputRequest(wbrequest.env, urlkey, url,
                                       self.content_rewriter)

        req_data = inputreq.reconstruct_request(url)

        # header values must be strings: requests rejects an int
        # Content-Length with InvalidHeader
        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if wbrequest.wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wbrequest.wb_url.timestamp

        upstream_url = self.upstream_url.format(url=quote(url),
                                                closest=closest,
                                                **wbrequest.matchdict)

        # stream=True: the body is read lazily from r.raw by the loader
        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True,
                          allow_redirects=False)

        r.raise_for_status()

        record = self.loader.parse_record_stream(r.raw)

        # minimal cdx describing the capture returned by upstream
        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(r.headers.get('Memento-Datetime'))
        cdx['url'] = url

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
        result = self.content_rewriter.rewrite_content(wbrequest.urlrewriter,
                                               record.status_headers,
                                               record.stream,
                                               head_insert_func,
                                               urlkey,
                                               cdx)

        # result is passed on positionally to _make_response
        return self._make_response(wbrequest, *result)