Esempio n. 1
0
    def put_custom_record(self, environ, coll):
        """PUT the request body as a custom WARC ``resource`` record.

        The body is read from ``wsgi.input`` and forwarded, together with the
        request's Content-Type, to the url built from
        ``self.custom_record_path``.

        :param dict environ: The WSGI environment dictionary for the request;
            the query string supplies ``url`` (required) and ``timestamp``
            (optional)
        :param str coll: The collection name substituted into the PUT url
        :return: JSON response wrapping the upstream JSON reply, or a
            ``400 Bad Request`` JSON error when ``url`` is missing
        :rtype: WbResponse
        """
        # Drain the request body; read() is repeated until it yields nothing.
        chunks = []
        while True:
            buff = environ["wsgi.input"].read()
            if not buff:
                break

            chunks.append(buff)

        data = b"".join(chunks)

        params = dict(parse_qsl(environ.get("QUERY_STRING")))

        rec_type = "resource"

        headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

        target_uri = params.get("url")

        if not target_uri:
            # A missing required parameter is a client error, so report it
            # with a 4xx status instead of the default success status.
            return WbResponse.json_response({"error": "no url"},
                                            status="400 Bad Request")

        timestamp = params.get("timestamp")
        if timestamp:
            headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

        put_url = self.custom_record_path.format(url=target_uri,
                                                 coll=coll,
                                                 rec_type=rec_type)
        res = requests.put(put_url, headers=headers, data=data)

        res = res.json()

        return WbResponse.json_response(res)
Esempio n. 2
0
    def get_wacz(self, environ, coll):
        """Run ``wacz_main`` over every file in the collection's archive dir.

        The argument list passed to ``wacz_main`` consists of the paths of all
        entries in ``collections/<coll>/archive``, followed by ``-o`` and the
        fixed output path, and ``--url <url>`` when the query string has a
        ``url`` parameter.

        :param dict environ: The WSGI environment dictionary for the request
        :param str coll: The name of the collection whose archive is packaged
        :return: JSON response ``{"done": <output path>}`` on success, or a
            ``500`` JSON error response if ``wacz_main`` raises
        :rtype: WbResponse
        """
        # if self.pending_count != 0 or self.pending_size != 0:
        #    return WbResponse.json_response(
        #        {"error": "not_ready"}, status="404 Not Found"
        #    )

        params = dict(parse_qsl(environ.get("QUERY_STRING")))

        output_path = "/tmp/out/archive.wacz"

        archive_dir = os.path.join("collections", coll, "archive")
        wacz_args = [
            os.path.join(archive_dir, name) for name in os.listdir(archive_dir)
        ]
        wacz_args += ["-o", output_path]

        url = params.get("url")
        if url:
            wacz_args += ["--url", url]

        try:
            wacz_main(wacz_args)
        except Exception as e:
            # Previously the failure was only printed and the caller was still
            # told the archive was "done"; report the failure instead.
            print(e)
            return WbResponse.json_response(
                {"error": str(e)}, status="500 Internal Server Error"
            )

        return WbResponse.json_response({"done": output_path})
Esempio n. 3
0
    def serve_listing(self, environ):
        """Return a JSON response listing the warcserver's routes.

        :param dict environ: The WSGI environment dictionary for the request
        :return: JSON response with ``fixed`` and ``dynamic`` route listings
        """
        fixed_routes = self.warcserver.list_fixed_routes()
        dynamic_routes = self.warcserver.list_dynamic_routes()

        return WbResponse.json_response(
            dict(fixed=fixed_routes, dynamic=dynamic_routes))
Esempio n. 4
0
    def exit(self, environ=None):
        """Send SIGTERM to the uwsgi master process and return an empty JSON response.

        :param dict environ: The WSGI environment dictionary (unused)
        :return: the empty JSON response, created before the signal is sent
        """
        import signal
        import uwsgi

        # The response object is built first, then the master is signalled.
        empty_response = WbResponse.json_response({})
        master_pid = uwsgi.masterpid()
        os.kill(master_pid, signal.SIGTERM)
        return empty_response
Esempio n. 5
0
def test_wbresponse_json_response():
    """json_response sets a 200 status, a JSON content type and a JSON body."""
    expected = {'pywb': 1, 'wr': 2}
    response = WbResponse.json_response(expected)

    sh = response.status_headers
    json_content_type = ('Content-Type', 'application/json; charset=utf-8')

    assert sh.statusline == '200 OK'
    assert json_content_type in sh.headers
    assert json.loads(response.body[0]) == expected
Esempio n. 6
0
    def put_custom_record(self, environ, coll="$root"):
        """Forward the request body as a custom WARC ``resource`` record.

        The body is read from ``wsgi.input`` and PUT to the url built from
        ``self.put_custom_record_path``.
        (Available only when recording)

        :param dict environ: The WSGI environment dictionary for the request
        :param str coll: The name of the collection the record is to be served from
        :return: JSON response with the upstream reply, or a 400 JSON error
            when the ``url`` query parameter is missing
        """
        # Drain the input stream: keep reading until read() yields nothing.
        body_parts = []
        piece = environ["wsgi.input"].read()
        while piece:
            body_parts.append(piece)
            piece = environ["wsgi.input"].read()

        data = b"".join(body_parts)

        query = dict(parse_qsl(environ.get("QUERY_STRING")))
        target_uri = query.get("url")

        headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

        if not target_uri:
            return WbResponse.json_response({"error": "no url"},
                                            status="400 Bad Request")

        timestamp = query.get("timestamp")
        if timestamp:
            headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

        put_url = self.put_custom_record_path.format(rec_type="resource",
                                                     coll=coll,
                                                     url=target_uri)

        upstream = requests.put(put_url, headers=headers, data=data)

        return WbResponse.json_response(upstream.json())
Esempio n. 7
0
    def serve_listing(self, environ):
        """Respond with the WARCServer fixed and dynamic route listings.

        :param dict environ: The WSGI environment dictionary for the request
        :return: WbResponse with a JSON body holding ``fixed`` and ``dynamic`` listings
        :rtype: WbResponse
        """
        listing = {}
        listing['fixed'] = self.warcserver.list_fixed_routes()
        listing['dynamic'] = self.warcserver.list_dynamic_routes()

        return WbResponse.json_response(listing)
Esempio n. 8
0
    def serve_listing(self, environ):
        """Build the JSON listing of WARCServer fixed and dynamic routes.

        :param dict environ: The WSGI environment dictionary for the request
        :return: WbResponse whose JSON body maps ``fixed`` and ``dynamic`` to the route listings
        :rtype: WbResponse
        """
        server = self.warcserver
        fixed = server.list_fixed_routes()
        dynamic = server.list_dynamic_routes()

        return WbResponse.json_response({'fixed': fixed, 'dynamic': dynamic})
Esempio n. 9
0
    def put_screenshot(self, environ, coll):
        """PUT the request body to the screenshot recorder for ``coll``.

        :param dict environ: The WSGI environment dictionary for the request;
            the query string must carry a ``target_uri`` parameter
        :param str coll: The collection name substituted into the PUT url
        :return: JSON response wrapping the upstream JSON reply, or a
            ``400 Bad Request`` JSON error when ``target_uri`` is missing
        :rtype: WbResponse
        """
        self.ensure_coll_exists(coll)

        headers = {'Content-Type': environ.get('CONTENT_TYPE', 'text/plain')}

        query_data = parse_qs(environ.get('QUERY_STRING'))

        # parse_qs maps each key to a list of values; only the first is used.
        target_values = query_data.get('target_uri')
        url = target_values[0] if target_values else None

        if not url:
            # A missing required parameter is a client error, so report it
            # with a 4xx status instead of the default success status.
            return WbResponse.json_response({'error': 'no target_uri'},
                                            status='400 Bad Request')

        put_url = self.screenshot_recorder_path.format(url=url, coll=coll)

        # The input stream object itself is handed to requests as the body.
        res = requests.put(put_url,
                           headers=headers,
                           data=environ['wsgi.input'])

        res = res.json()
        return WbResponse.json_response(res)
Esempio n. 10
0
    def put_record(self, environ, coll, target_uri_format, rec_type, params, data):
        """PUT ``data`` as a custom record of type ``rec_type`` for ``coll``.

        :param dict environ: The WSGI environment dictionary for the request
            (only ``CONTENT_TYPE`` is read here)
        :param str coll: The collection name substituted into the PUT url
        :param str target_uri_format: format string with a ``{url}``
            placeholder used to derive the record's target uri
        :param str rec_type: record type substituted into the PUT url
        :param dict params: request parameters; ``url`` is required and
            ``timestamp`` is optional
        :param data: request body passed through to ``requests.put``
        :return: JSON response wrapping the upstream JSON reply, or a
            ``400 Bad Request`` JSON error when ``url`` is missing
        :rtype: WbResponse
        """
        self.ensure_coll_exists(coll)

        headers = {'Content-Type': environ.get('CONTENT_TYPE', 'text/plain')}

        url = params.get('url')

        if not url:
            # A missing required parameter is a client error, so report it
            # with a 4xx status instead of the default success status.
            return WbResponse.json_response({'error': 'no url'},
                                            status='400 Bad Request')

        timestamp = params.get('timestamp')
        if timestamp:
            headers['WARC-Date'] = timestamp_to_iso_date(timestamp)

        target_uri = target_uri_format.format(url=url)
        put_url = self.custom_record_path.format(
            url=target_uri, coll=coll, rec_type=rec_type
        )
        res = requests.put(put_url, headers=headers, data=data)

        res = res.json()

        return WbResponse.json_response(res)
Esempio n. 11
0
def test_wbresponse_callable():
    """Calling a WbResponse as a WSGI app passes status and headers to start_response."""
    expected_body = {'pywb': 1, 'wr': 2}
    res = WbResponse.json_response(expected_body)

    # Records what the response hands to start_response.
    captured = {'status_line': None, 'headers': None}

    def start_response(status_line, headers):
        captured.update(status_line=status_line, headers=headers)

    body = res({'REQUEST_METHOD': 'GET'}, start_response)

    assert json.loads(body[0]) == expected_body
    assert captured == {
        'status_line': '200 OK',
        'headers': [('Content-Type', 'application/json; charset=utf-8'),
                    ('Content-Length', '17')],
    }
Esempio n. 12
0
 def get_pending(self, environ):
     """Return a JSON response with the current pending count and size."""
     pending = {"count": self.pending_count}
     pending["size"] = self.pending_size
     return WbResponse.json_response(pending)
Esempio n. 13
0
    def render_content(self, wb_url, kwargs, environ):
        """Build the response for a replay request of ``wb_url``.

        Depending on the request, this returns a formatted
        timemap/query/custom response, a redirect, a JSON ``title``
        response (history-page requests), or the rewritten content of the
        record fetched through ``self._do_req``.

        :param str wb_url: the requested url string; ``#`` is
            percent-encoded before it is parsed into a ``WbUrl``
        :param dict kwargs: request options; keys read here include
            ``output``, ``no_timegate_check``, ``coll``, ``metadata`` and
            ``cache``
        :param dict environ: The WSGI environment dictionary for the request
        :return: a response object (a ``WbResponse`` on the paths that are
            constructed directly in this method)
        :raises NotFoundException: if the upstream request returns 404
        :raises UpstreamException: if the upstream request returns any other
            status >= 400
        """
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        # A history-page request (header is removed from environ) replaces
        # the target url and is always handled as an ajax request.
        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        self.prepare_env(environ)

        host_prefix = environ['pywb.host_prefix']
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        pywb_static_prefix = environ['pywb.static_prefix'] + '/'
        is_proxy = ('wsgiprox.proxy_host' in environ)

        # if OPTIONS in proxy mode, just generate the proxy response
        if is_proxy and self.is_preflight(environ):
            return WbResponse.options_response(environ)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        # no redirects if in proxy
        redirect_to_exact = self.redirect_to_exact and not is_proxy

        # Check Prefer
        pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                      content_rw, is_proxy)

        response = None
        keep_frame_response = False

        # prefer overrides custom response?
        if pref_mod is not None:
            # fast-redirect to preferred
            if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
                new_url = full_prefix + wb_url.to_str(mod=pref_mod)
                headers = [('Preference-Applied', pref_applied),
                           ('Vary', 'Prefer')]

                return WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect',
                                                 headers=headers)
            else:
                wb_url.mod = pref_mod
        else:
            if kwargs.get('output'):
                response = self.handle_timemap(wb_url, kwargs, full_prefix)

            elif wb_url.is_query():
                response = self.handle_query(environ, wb_url, kwargs,
                                             full_prefix)

            else:
                response = self.handle_custom_response(environ, wb_url,
                                                       full_prefix,
                                                       host_prefix, kwargs)

                # when set, a custom response is held back until the cdx
                # lookup below has supplied a timestamp
                keep_frame_response = (not kwargs.get('no_timegate_check')
                                       and is_timegate
                                       and not is_proxy) or redirect_to_exact

        if response and not keep_frame_response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        # a url with an empty path is redirected to '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        # upstream failure: read whatever error body is available (read
        # errors are ignored), always close the raw stream, then raise
        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            if r.status_code == 404:
                raise NotFoundException(url=wb_url.url, msg=details)

            else:
                raise UpstreamException(r.status_code,
                                        url=wb_url.url,
                                        details=details)

        # NOTE(review): assumes the 'Warcserver-Cdx' header is always present
        # on a successful response; a missing header would fail on .encode()
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        # only redirect to exact if not live, otherwise set to false
        redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

        # return top-frame timegate response, with timestamp from cdx
        if response and keep_frame_response and (not redirect_to_exact
                                                 or not is_timegate):
            no_except_close(r.raw)
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy,
                                        cdx['timestamp'])

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redirect to exact timestamp (only set if not live)
        if redirect_to_exact:
            if set_content_loc or is_timegate or wb_url.timestamp != cdx.get(
                    'timestamp'):
                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri,
                                                full_prefix,
                                                memento_dt,
                                                cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate,
                                                is_proxy,
                                                pref_applied=pref_applied,
                                                mod=pref_mod,
                                                is_memento=False)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        # when _add_range reports a range was applied, switch to the 'id_' mod
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        # ajax requests get no head insert
        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                metadata=kwargs.get('metadata', {}),
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        # history-page requests only record the page title and return it as
        # JSON; title fallbacks: extracted title, then the
        # X-Wombat-History-Title header, then the history page url itself
        if history_page:
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # a statusline without a space gets ' None' appended -- presumably to
        # stand in for a missing reason phrase; TODO confirm
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'],
                                    full_prefix,
                                    memento_dt,
                                    cdx['timestamp'],
                                    status_headers,
                                    is_timegate,
                                    is_proxy,
                                    cdx.get('source-coll'),
                                    mod=pref_mod,
                                    pref_applied=pref_applied)

            set_content_loc = True

        if set_content_loc and not redirect_to_exact and not is_proxy:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))

        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        if is_proxy and environ.get('HTTP_ORIGIN'):
            response.add_access_control_headers(environ)

        # long-lived immutable caching only for 200 responses when the
        # 'cache' option is 'always' and the request carries a Referer
        if r.status_code == 200 and kwargs.get(
                'cache') == 'always' and environ.get('HTTP_REFERER'):
            response.status_headers[
                'Cache-Control'] = 'public, max-age=31536000, immutable'

        return response
Esempio n. 14
0
    def render_content(self, wb_url, kwargs, environ):
        """Build the response for a replay request of ``wb_url``.

        Depending on the request, this returns a formatted custom response,
        a redirect, a JSON ``title`` response (history-page requests), or
        the rewritten content of the record fetched through
        ``self._do_req``.

        :param str wb_url: the requested url string; ``#`` is
            percent-encoded before it is parsed into a ``WbUrl``
        :param dict kwargs: request options; ``coll`` is read here and the
            dict is passed on to several helpers
        :param dict environ: The WSGI environment dictionary for the request
        :return: a response object (a ``WbResponse`` on the paths that are
            constructed directly in this method)
        :raises UpstreamException: if the upstream request returns a
            status >= 400
        """
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        # X-Forwarded-Proto, or else self.force_scheme, overrides the WSGI
        # url scheme when it has a truthy value
        proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)

        if proto:
            environ['wsgi.url_scheme'] = proto

        # A history-page request (header is removed from environ) replaces
        # the target url and is always handled as an ajax request.
        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix
        environ['pywb.host_prefix'] = host_prefix
        pywb_static_prefix = host_prefix + environ.get(
            'pywb.app_prefix', '') + environ.get('pywb.static_prefix',
                                                 '/static/')
        is_proxy = ('wsgiprox.proxy_host' in environ)

        response = self.handle_custom_response(environ, wb_url, full_prefix,
                                               host_prefix, kwargs)

        # a custom response, when one is produced, is returned right away
        if response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        # a url with an empty path is redirected to '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        # upstream failure: read whatever error body is available (read
        # errors are ignored), always close the raw stream, then raise
        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code,
                                    url=wb_url.url,
                                    details=details)

        # NOTE(review): assumes the 'Warcserver-Cdx' header is always present
        # on a successful response; a missing header would fail on .encode()
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redir to exact, redir if url or ts are different
        if self.redirect_to_exact:
            if (set_content_loc or (wb_url.timestamp != cdx.get('timestamp')
                                    and not cdx.get('is_live'))):

                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        # when _add_range reports a range was applied, switch to the 'id_' mod
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        # ajax requests get no head insert
        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        # history-page requests only record the page title and return it as
        # JSON; title fallbacks: extracted title, then the
        # X-Wombat-History-Title header, then the history page url itself
        if history_page:
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # a statusline without a space gets ' None' appended -- presumably to
        # stand in for a missing reason phrase; TODO confirm
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix, memento_dt,
                                    cdx['timestamp'], status_headers,
                                    is_timegate, is_proxy,
                                    cdx.get('source-coll'))

            set_content_loc = True

        if set_content_loc and not self.redirect_to_exact:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))
        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        return response
Esempio n. 15
0
    def page_search(self, environ, coll):
        """Run a solr query for ``coll`` using the request's query parameters.

        :param dict environ: The WSGI environment dictionary for the request
        :param str coll: The collection name passed to the solr query
        :return: JSON response wrapping the result of ``query_solr``
        """
        query_pairs = parse_qsl(environ.get('QUERY_STRING'))
        search_result = self.solr_ingester.query_solr(coll, dict(query_pairs))
        return WbResponse.json_response(search_result)