Exemple #1
0
    def __init__(self,
                 upstream_url_resolver,
                 proxy_magic='pywb.proxy',
                 magic_fwd='http://localhost/',
                 assets_path=None,
                 is_rw=True):
        """Store the resolver and build the loader, views and rewriter.

        :param upstream_url_resolver: kept as ``self.upstream_url_resolver``.
        :param proxy_magic: kept as ``self.proxy_magic``.
        :param magic_fwd: url split by ``parse()`` into the ``fwd_scheme``,
            ``fwd_host``, ``fwd_port`` and ``fwd_path`` attributes.
        :param assets_path: forwarded to ``JinjaEnv``.
        :param is_rw: when true a ``Rewriter`` is created, otherwise
            ``content_rewriter`` is set to ``None``.
        """
        self.upstream_url_resolver = upstream_url_resolver
        self.proxy_magic = proxy_magic
        self.loader = ArcWarcRecordLoader()

        # Scheme, host and path are decoded from latin-1; the port is
        # stored exactly as parse() returned it.
        scheme, host, port, path = parse(magic_fwd)
        self.fwd_scheme = scheme.decode('latin-1')
        self.fwd_host = host.decode('latin-1')
        self.fwd_port = port
        self.fwd_path = path.decode('latin-1')

        env = JinjaEnv(assets_path=assets_path)
        self.jinja_env = env
        self.head_insert_view = HeadInsertView(env,
                                               'head_insert.html',
                                               'banner.html')
        self.error_view = BaseInsertView(env, 'error.html')
        self.home_redir_view = BaseInsertView(env, 'home.html')

        self.content_rewriter = (Rewriter(is_framed_replay=False)
                                 if is_rw else None)
Exemple #2
0
    def __init__(self,
                 fileobj,
                 no_record_parse=False,
                 verify_http=False,
                 arc2warc=False,
                 ensure_http_headers=False,
                 block_size=BUFF_SIZE):
        """Prepare to iterate over the records read from ``fileobj``.

        :param fileobj: input file object; ``tell()`` is called on it to
            record the starting offset.
        :param no_record_parse: stored as ``self.no_record_parse``.
        :param verify_http: forwarded to ``ArcWarcRecordLoader``.
        :param arc2warc: forwarded to ``ArcWarcRecordLoader`` and also
            stored as ``self.mixed_arc_warc``.
        :param ensure_http_headers: stored as ``self.ensure_http_headers``.
        :param block_size: forwarded to ``DecompressingBufferedReader``.
        """
        # Parsing options.
        self.no_record_parse = no_record_parse
        self.ensure_http_headers = ensure_http_headers
        self.mixed_arc_warc = arc2warc
        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)

        # Iteration state, all reset to "nothing read yet".
        self.known_format = None
        self.member_info = None
        self.next_line = None
        self.record = None
        self.err_count = 0

        # Input: wrap the file first, then note where it currently stands.
        self.fh = fileobj
        self.reader = DecompressingBufferedReader(fileobj,
                                                  block_size=block_size)
        self.offset = fileobj.tell()

        self.the_iter = self._iterate_records()
def get_rendered_original_stream(warc_filename,
                                 warc_offset,
                                 compressedendoffset,
                                 payload_only=True):
    """
    Fetch one WARC record over WebHDFS and return its content.

    :param warc_filename: path appended to ``WEBHDFS_PREFIX``; ``None``
        means the resource was not found.
    :param warc_offset: sent as the ``offset`` query parameter.
    :param compressedendoffset: sent as the ``length`` query parameter when
        it converts to a positive integer, otherwise omitted.
    :param payload_only: when true, parse the record and return its content
        stream; otherwise return the bytes read from the decompressing
        reader.
    :return: ``(None, None)`` when ``warc_filename`` is ``None``; otherwise
        ``(content_stream, content_type)`` or
        ``(record_bytes, 'application/warc')``.
    """
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset and int(compressedendoffset) > 0:
        url = "%s&length=%s" % (url, compressedendoffset)
    r = requests.get(url, stream=True)
    # We handle decoding etc.
    r.raw.decode_content = False
    # Pass arguments to the logger so formatting only happens when the
    # debug level is enabled.
    logger.debug("Loading from: %s", r.url)
    logger.debug("Got status code %s", r.status_code)

    # Return the payload, or the record:
    if payload_only:
        # Parse the WARC, return the payload. The returned stream reads
        # from the open response, so the response must stay open here.
        rl = ArcWarcRecordLoader()
        record = rl.parse_record_stream(
            DecompressingBufferedReader(stream=r.raw))
        return record.content_stream(), record.content_type

    try:
        # This makes sure we only get the first GZip chunk:
        s = DecompressingBufferedReader(stream=r.raw)
        warc_record = s.read()
    finally:
        # Nothing can read from the response after we return bytes, so
        # release the connection instead of leaving it to the GC.
        r.close()
    return warc_record, 'application/warc'
Exemple #4
0
    def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None):
        """Build the rewriters, template views and config-derived settings.

        :param framed_replay: selects the frame/replay modifiers
            (``''`` / ``'mp_'`` when true, ``None`` / ``''`` otherwise).
        :param jinja_env: template environment; a default ``JinjaEnv`` is
            created when not supplied.
        :param config: optional settings mapping; ``None`` is treated as an
            empty mapping for every lookup done here.
        :param paths: optional mapping stored as ``self.paths``
            (``{}`` when not supplied).
        """
        self.loader = ArcWarcRecordLoader()

        self.config = config or {}
        self.paths = paths or {}

        self.framed_replay = framed_replay

        if framed_replay:
            self.frame_mod = ''
            self.replay_mod = 'mp_'
        else:
            self.frame_mod = None
            self.replay_mod = ''

        # NOTE(review): the raw ``config`` argument is still forwarded here
        # unchanged; whether DefaultRewriter accepts None is not visible
        # from this block -- confirm.
        self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
                                          config=config)

        self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)

        if not jinja_env:
            jinja_env = JinjaEnv(globals={'static_path': 'static'})

        self.jinja_env = jinja_env

        # All lookups below go through self.config: the raw ``config``
        # argument defaults to None and calling ``.get`` on it would raise
        # AttributeError.
        self.redirect_to_exact = self.config.get('redirect_to_exact')

        self.banner_view = BaseInsertView(self.jinja_env, self._html_templ('banner_html'))

        self.head_insert_view = HeadInsertView(self.jinja_env,
                                               self._html_templ('head_insert_html'),
                                               self.banner_view)

        self.frame_insert_view = TopFrameView(self.jinja_env,
                                               self._html_templ('frame_insert_html'),
                                               self.banner_view)

        self.error_view = BaseInsertView(self.jinja_env, self._html_templ('error_html'))
        self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
        self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html'))

        self.use_js_obj_proxy = self.config.get('use_js_obj_proxy', True)

        self.cookie_tracker = None

        self.enable_memento = self.config.get('enable_memento')

        # A falsy 'csp-header' setting disables the header entirely.
        csp_header = self.config.get('csp-header', self.DEFAULT_CSP)
        if csp_header:
            self.csp_header = ('Content-Security-Policy', csp_header)
        else:
            self.csp_header = None

        # deprecated: Use X-Forwarded-Proto header instead!
        self.force_scheme = self.config.get('force_scheme')
Exemple #5
0
def lookupRecord(url):
    """Look up *url* in ``urlmap`` and parse the record it points to.

    ``urlmap`` maps a url to a ``(filename, offset, length)`` triple. The
    ``length`` bytes found at ``offset`` in ``filename`` are read into
    memory and parsed as a single record.

    :param url: key to look up in ``urlmap``.
    :return: the parsed record, or ``None`` when *url* is not in ``urlmap``.
    """
    # Guard only the index lookup, so a KeyError raised while reading or
    # parsing the record is not mistaken for "url not found".
    try:
        filename, offset, length = urlmap[url]
    except KeyError:
        return None

    with open(filename, 'rb') as stream:
        stream.seek(offset, 0)
        buf = BytesIO(stream.read(length))

    # The record bytes are fully buffered, so the file can be closed
    # before parsing.
    loader = ArcWarcRecordLoader()
    return loader.parse_record_stream(DecompressingBufferedReader(buf))
Exemple #6
0
    def test_generate_record(self, record_sampler, is_gzip, builder_factory):
        """Write a sample record and compare its raw, decompressed and
        re-parsed forms against the expected record text."""
        record_maker, record_string = record_sampler

        writer = FixedTestWARCWriter(gzip=is_gzip)
        record = record_maker(builder_factory(writer))
        writer.write_record(record)

        written = writer.get_contents()
        self._validate_record_content_len(BytesIO(written))

        # Decompressed output is larger than the gzipped bytes, and the
        # same size when no gzip was applied.
        plain = DecompressingBufferedReader(writer.get_stream()).read()
        if is_gzip:
            assert len(plain) > len(written)
        else:
            assert len(plain) == len(written)

        assert plain.decode('utf-8') == record_string

        # assert parsing record matches as well
        reparsed = ArcWarcRecordLoader().parse_record_stream(
            DecompressingBufferedReader(writer.get_stream()))
        plain_writer = FixedTestWARCWriter(gzip=False)
        plain_writer.write_record(reparsed)
        assert plain_writer.get_contents().decode('utf-8') == record_string

        # verify parts of record
        parsed_record = ArcWarcRecordLoader().parse_record_stream(
            DecompressingBufferedReader(writer.get_stream()))

        content = parsed_record.content_stream().read().decode('utf-8')
        assert content in record_string

        rec_type = parsed_record.rec_type

        # http headers must match the original record
        assert record.http_headers == parsed_record.http_headers

        if parsed_record.http_headers:
            assert rec_type in ('response', 'request', 'revisit')
        elif rec_type == 'revisit':
            # empty revisit
            assert len(content) == 0
        else:
            assert len(content) == parsed_record.length
Exemple #7
0
def fetch_warc_record(capture, warc_download_prefix):
    """Download and parse the WARC record described by *capture*.

    :param capture: mapping with ``url``, ``filename``, ``offset`` and
        ``length`` entries.
    :param warc_download_prefix: joined with ``'/'`` and the capture's
        filename to form the download url.
    :return: the parsed record, with ``WARC-Source-URI`` and
        ``WARC-Source-Range`` headers set.
    :raises ValueError: if *capture* lacks one of the required entries.
    """
    required = ('url', 'filename', 'offset', 'length')
    missing = next((name for name in required if name not in capture), None)
    if missing is not None:  # pragma: no cover
        raise ValueError('capture must contain '+missing)

    url = capture['url']
    warc_url = warc_download_prefix + '/' + capture['filename']

    # The same inclusive byte range is used for the request and the header.
    first = int(capture['offset'])
    last = first + int(capture['length']) - 1
    byte_range = 'bytes={}-{}'.format(first, last)

    resp = myrequests_get(warc_url, headers={'Range': byte_range})
    stream = DecompressingBufferedReader(BytesIO(resp.content))
    record = ArcWarcRecordLoader().parse_record_stream(stream)
    rec_headers = record.rec_headers

    for header in ('WARC-Source-URI', 'WARC-Source-Range'):
        if rec_headers.get_header(header):  # pragma: no cover
            print('Surprised that {} was already set in this WARC record'.format(header), file=sys.stderr)

    warc_target_uri = rec_headers.get_header('WARC-Target-URI')
    if url != warc_target_uri:  # pragma: no cover
        print('Surprised that WARC-Target-URI {} is not the capture url {}'.format(warc_target_uri, url), file=sys.stderr)

    record.rec_headers.replace_header('WARC-Source-URI', warc_url)
    record.rec_headers.replace_header('WARC-Source-Range', byte_range)
    return record
Exemple #8
0
    def test_record_skip_all_cookies_header(self):
        """Cookie headers appear in the returned record but not in the
        stored request/response pair."""
        coll_dir = '/warcs/cookieskip/'
        warc_path = to_path(self.root_dir + coll_dir)

        header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
        writer = PerRecordWARCWriter(warc_path, header_filter=header_filter)
        recorder_app = RecorderApp(self.upstream_url, writer,
                                   accept_colls='live')

        resp = self._test_warc_write(
            recorder_app, 'httpbin.org',
            '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
        assert b'HTTP/1.1 302' in resp.body

        set_cookies = (('Set-Cookie', 'name=value; Path=/'),
                       ('Set-Cookie', 'foo=bar; Path=/'))

        # The record sent back still carries both cookies ...
        record = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body))
        for cookie in set_cookies:
            assert cookie in record.http_headers.headers

        # ... while the stored response has them filtered out.
        stored_req, stored_resp = self._load_resp_req(warc_path)
        for cookie in set_cookies:
            assert cookie not in stored_resp.http_headers.headers

        assert ('X-Other', 'foo') in stored_req.http_headers.headers
        assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

        self._test_all_warcs(coll_dir, 1)
Exemple #9
0
    def test_record_skip_http_only_cookies_header(self):
        """HttpOnly cookies appear in the returned record but are dropped
        from the stored response; other cookies are kept in both."""
        warc_path = to_path(self.root_dir + '/warcs/cookieskip_httponly/')
        header_filter = ExcludeHttpOnlyCookieHeaders()
        recorder_app = RecorderApp(self.upstream_url,
                                   PerRecordWARCWriter(
                                       warc_path, header_filter=header_filter),
                                   accept_colls='live')

        resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
        assert b'HTTP/1.1 302' in resp.body

        buff = BytesIO(resp.body)
        record = ArcWarcRecordLoader().parse_record_stream(buff)

        non_http_only, http_only = self._get_http_only_cookies(record)
        # both httponly and other cookies
        # (identity checks: comparing to None with ==/!= goes through the
        # operand's __eq__, which is not what is meant here)
        assert http_only is not None
        assert non_http_only is not None

        stored_req, stored_resp = self._load_resp_req(warc_path)

        non_http_only, http_only = self._get_http_only_cookies(stored_resp)
        # no httponly cookies
        assert http_only is None
        assert non_http_only is not None

        assert ('X-Other', 'foo') in stored_req.http_headers.headers
        assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

        self._test_all_warcs('/warcs/cookieskip_httponly/', 1)
Exemple #10
0
    def test_warcinfo_record(self):
        """Create a warcinfo record, write it, and verify the parsed
        headers and body as well as the full serialized output."""
        simplewriter = FixedTestWARCWriter(gzip=False)
        params = OrderedDict([('software', 'recorder test'),
                              ('format', 'WARC File Format 1.0'),
                              ('invalid', ''),
                              ('json-metadata', json.dumps({'foo': 'bar'}))])

        record = simplewriter.create_warcinfo_record('testfile.warc.gz',
                                                     params)
        simplewriter.write_record(record)
        buff = simplewriter.get_contents()
        assert isinstance(buff, bytes)

        buff = BytesIO(buff)
        parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)

        assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
        assert parsed_record.rec_headers.get_header(
            'Content-Type') == 'application/warc-fields'
        assert parsed_record.rec_headers.get_header(
            'WARC-Filename') == 'testfile.warc.gz'

        raw = parsed_record.raw_stream.read()

        length = parsed_record.rec_headers.get_header('Content-Length')

        # Content-Length counts bytes, so compare against the raw bytes
        # rather than the decoded text (the two differ for non-ASCII data).
        assert len(raw) == int(length)

        buff = raw.decode('utf-8')

        assert 'json-metadata: {"foo": "bar"}\r\n' in buff
        assert 'format: WARC File Format 1.0\r\n' in buff

        assert simplewriter.get_contents().decode('utf-8') == WARCINFO_RECORD
Exemple #11
0
    def test_record_video_metadata(self):
        """Record youtube-dl metadata and check the stored record's WARC
        headers. Skipped when ``youtube_dl`` is not importable."""
        pytest.importorskip('youtube_dl')

        metadata_type = 'application/vnd.youtube-dl_formats+json'
        warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

        dedup_index = self._get_dedup_index()
        writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        params = {'param.recorder.user': '******',
                  'param.recorder.coll': 'VIDEO',
                  'content_type': metadata_type}

        # Only the side effect (the written WARC) is inspected below.
        self._test_warc_write(
            recorder_app,
            'www.youtube.com', '/v/BfBgWtAIbRc', '&' + urlencode(params),
            link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

        redis = FakeStrictRedis.from_url('redis://localhost/2')
        warcs = redis.hgetall('USER:VIDEO:warc')
        assert len(warcs) == 1

        filename = list(warcs.values())[0]
        with open(filename, 'rb') as fh:
            record = ArcWarcRecordLoader().parse_record_stream(
                DecompressingBufferedReader(fh))

        rec_headers = record.rec_headers
        assert rec_headers.get_header('WARC-Type') == 'metadata'
        assert rec_headers.get_header('Content-Type') == metadata_type

        block_digest = rec_headers.get_header('WARC-Block-Digest')
        assert block_digest != ''
        assert rec_headers.get_header('WARC-Block-Digest') == rec_headers.get_header('WARC-Payload-Digest')
Exemple #12
0
    def test_record_custom_record(self):
        """PUT a custom 'resource' record and verify the stored WARC
        headers, payload and generated http headers."""
        payload = b'Some Data'

        dedup_index = self._get_dedup_index(user=False)
        warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')
        writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
        recorder_app = RecorderApp(self.upstream_url, writer)

        req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'
        testapp = webtest.TestApp(recorder_app)
        resp = testapp.put(
            req_url,
            headers={'content-type': 'text/plain', 'WARC-Custom': 'foo'},
            params=payload)

        assert resp.json['success'] == 'true'
        assert resp.json['WARC-Date'] != ''

        self._test_all_warcs('/warcs/meta', 1)

        redis = FakeStrictRedis.from_url('redis://localhost/2')
        warcs = redis.hgetall('META:warc')
        assert len(warcs) == 1

        warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')
        with open(warcs[warc_key], 'rb') as fh:
            record = ArcWarcRecordLoader().parse_record_stream(
                DecompressingBufferedReader(fh), ensure_http_headers=True)

        # --- WARC record headers ---
        rec_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert rec_headers.get_header('WARC-Type') == 'resource'
        assert rec_headers.get_header(
            'WARC-Target-URI') == 'custom://httpbin.org'
        for name in ('WARC-Record-ID', 'WARC-Date', 'WARC-Block-Digest'):
            assert rec_headers.get_header(name) != ''
        assert rec_headers.get_header(
            'WARC-Block-Digest') == rec_headers.get_header(
                'WARC-Payload-Digest')
        assert rec_headers.get_header('Content-Type') == 'text/plain'
        assert rec_headers.get_header('Content-Length') == str(len(payload))
        assert rec_headers.get_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == payload

        # --- http headers ---
        http_headers = record.http_headers
        assert len(record.http_headers.headers) == 2
        assert http_headers.get_header('Content-Type') == 'text/plain'
        assert http_headers.get_header('Content-Length') == str(len(payload))

        writer.close()
        assert len(writer.fh_cache) == 0
Exemple #13
0
    def test_agg_select_mem_unrewrite_headers(self):
        """Fetch a resource from the 'ia-cdx' source and check that the
        parsed record reports a 200 status."""
        query = '/cdx_api/resource?closest=20161103124134&url=http://iana.org/'
        resp = self.testapp.get(query)

        assert resp.headers['Warcserver-Source-Coll'] == 'ia-cdx'

        record = ArcWarcRecordLoader().parse_record_stream(
            BytesIO(resp.body), no_record_parse=False)

        http_headers = record.http_headers
        print(http_headers)
        assert http_headers.get_statuscode() == '200'
Exemple #14
0
def get_rendered_original_stream(warc_filename, warc_offset,
                                 compressedendoffset):
    """Fetch one WARC record over WebHDFS and return its raw stream.

    :param warc_filename: path appended to ``WEBHDFS_PREFIX``; ``None``
        means the resource was not found.
    :param warc_offset: sent as the ``offset`` query parameter.
    :param compressedendoffset: sent as the ``length`` query parameter when
        truthy, otherwise omitted.
    :return: ``(None, None)`` when ``warc_filename`` is ``None``, otherwise
        ``(record.raw_stream, record.content_type)``. The stream reads from
        the still-open HTTP response.
    """
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset:
        url = "%s&length=%s" % (url, compressedendoffset)
    r = requests.get(url, stream=True)
    # Pass the argument to the logger so formatting only happens when the
    # debug level is enabled.
    logger.debug("Loading from: %s", r.url)
    # Decompression is done by DecompressingBufferedReader below, so the
    # raw response must not decode the content itself.
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    record = rl.parse_record_stream(DecompressingBufferedReader(stream=r.raw))

    return record.raw_stream, record.content_type
Exemple #15
0
    def test_read_from_stream_no_content_length(self, record_sampler, is_gzip,
                                                builder_factory):
        """Parse a record that was converted by
        ``_conv_to_streaming_record`` and check that writing it back
        yields the expected full record text."""
        record_maker, record_string = record_sampler

        writer = FixedTestWARCWriter(gzip=is_gzip)
        full_record = record_maker(builder_factory(writer))

        source = BytesIO()
        record_no_cl = self._conv_to_streaming_record(record_string,
                                                      full_record.rec_type)
        encoded = record_no_cl.encode('utf-8')

        if is_gzip:
            gzip_stream = GzippingWrapper(source)
            gzip_stream.write(encoded)
            gzip_stream.flush()
        else:
            source.write(encoded)

        # parse to verify http headers + payload matches sample record
        # but not rec headers (missing content-length)
        source.seek(0)
        loader = ArcWarcRecordLoader()
        parsed_record = loader.parse_record_stream(
            DecompressingBufferedReader(source))

        if 'Content-Disposition' not in record_string:
            assert full_record.http_headers == parsed_record.http_headers
        assert full_record.raw_stream.read() == parsed_record.raw_stream.read()
        assert full_record.rec_headers != parsed_record.rec_headers

        # parse and write
        source.seek(0)
        reparsed = ArcWarcRecordLoader().parse_record_stream(
            DecompressingBufferedReader(source))
        writer.write_record(reparsed)

        # assert written record matches expected response record
        # with content-length, digests computed
        written = DecompressingBufferedReader(writer.get_stream()).read()
        assert written.decode('utf-8') == record_string
Exemple #16
0
    def create_warc_record(self,
                           uri,
                           record_type,
                           payload=None,
                           length=None,
                           warc_content_type='',
                           warc_headers_dict=None,
                           warc_headers=None,
                           http_headers=None):
        """Assemble an ``ArcWarcRecord`` of type ``record_type`` for ``uri``.

        :param payload: stream holding the record body; an empty
            ``BytesIO`` with length 0 is substituted when falsy.
        :param length: payload length; reduced by the stream position
            after http headers were loaded from the payload.
        :param warc_content_type: explicit content type; when empty it is
            taken from the WARC headers or from ``self.WARC_RECORDS``.
        :param warc_headers_dict: passed to ``_init_warc_headers`` when
            ``warc_headers`` is not given.
        :param warc_headers: prepared WARC headers, used as-is if truthy.
        :param http_headers: prepared http headers; when missing they are
            loaded from the payload.
        :return: the record, after ``ensure_digest`` has been applied.
        """
        if warc_headers_dict is None:
            warc_headers_dict = {}

        if payload and not http_headers:
            http_headers = ArcWarcRecordLoader().load_http_headers(
                record_type, uri, payload, length)
            # The header bytes were consumed from the payload, so they no
            # longer count towards the remaining length.
            if http_headers and length is not None:
                length -= payload.tell()

        if not payload:
            payload, length = BytesIO(), 0

        if not warc_headers:
            warc_headers = self._init_warc_headers(uri, record_type,
                                                   warc_headers_dict)

        # compute Content-Type: explicit value, then the WARC header, then
        # the per-record-type default
        if not warc_content_type:
            warc_content_type = (
                warc_headers.get_header('Content-Type')
                or self.WARC_RECORDS.get(record_type,
                                         'application/warc-record'))

        record = ArcWarcRecord('warc', record_type, warc_headers, payload,
                               http_headers, warc_content_type, length)
        record.payload_length = length

        self.ensure_digest(record, block=False, payload=True)
        return record
Exemple #17
0
    def test_warcinfo_record(self, is_gzip):
        """Write the sample warcinfo record and verify the parsed headers
        and body fields."""
        writer = FixedTestWARCWriter(gzip=is_gzip)
        writer.write_record(sample_warcinfo(writer))

        parsed_record = ArcWarcRecordLoader().parse_record_stream(
            DecompressingBufferedReader(writer.get_stream()))

        expected_headers = (('WARC-Type', 'warcinfo'),
                            ('Content-Type', 'application/warc-fields'),
                            ('WARC-Filename', 'testfile.warc.gz'))
        for name, value in expected_headers:
            assert parsed_record.rec_headers.get_header(name) == value

        body = parsed_record.content_stream().read().decode('utf-8')

        assert 'json-metadata: {"foo": "bar"}\r\n' in body
        assert 'format: WARC File Format 1.0\r\n' in body
Exemple #18
0
    def test_live_1(self):
        """Request a live resource and parse the streamed response as a
        record."""
        live_url = self.base_url + '/live/resource?url=http://httpbin.org/get'
        resp = requests.get(live_url, stream=True)
        assert resp.headers['Warcserver-Source-Coll'] == 'live'

        loader = ArcWarcRecordLoader()
        record = loader.parse_record_stream(resp.raw, no_record_parse=False)

        target_uri = record.rec_headers.get_header('WARC-Target-URI')
        assert target_uri == 'http://httpbin.org/get'
        assert record.http_headers.get_header('Date') != ''
Exemple #19
0
    def test_upstream_1(self):
        """Request a resource through the upstream route and parse the
        response body as a record."""
        resp = self.testapp.get(
            '/upstream/resource?url=http://httpbin.org/get')
        assert resp.headers['Warcserver-Source-Coll'] == 'upstream:live'

        loader = ArcWarcRecordLoader()
        record = loader.parse_record_stream(BytesIO(resp.body),
                                            no_record_parse=False)

        target_uri = record.rec_headers.get_header('WARC-Target-URI')
        assert target_uri == 'http://httpbin.org/get'
        assert record.http_headers.get_header('Date') != ''
Exemple #20
0
    def test_warcinfo_record(self, is_gzip, builder_factory):
        """Write the sample warcinfo record built via ``builder_factory``
        and verify the parsed headers, block digest and body fields."""
        writer = FixedTestWARCWriter(gzip=is_gzip)
        record = sample_warcinfo(builder_factory(writer))
        writer.write_record(record)

        parsed_record = ArcWarcRecordLoader().parse_record_stream(
            DecompressingBufferedReader(writer.get_stream()))

        expected_headers = (
            ('WARC-Type', 'warcinfo'),
            ('Content-Type', 'application/warc-fields'),
            ('WARC-Filename', 'testfile.warc.gz'),
            ('WARC-Block-Digest', 'sha1:GAD6P5BTZPRU57ICXEYUJZGCURZYABID'),
        )
        for name, value in expected_headers:
            assert parsed_record.rec_headers.get_header(name) == value

        body = parsed_record.content_stream().read().decode('utf-8')

        assert 'json-metadata: {"foo": "bar"}\r\n' in body
        assert 'format: WARC File Format 1.0\r\n' in body
Exemple #21
0
    def test_utf8_rewrite_content_adjust(self):
        """Parse a response record whose http headers contain non-ASCII
        text, write it back out, and compare the output and the resulting
        record length with the expected values."""
        # The literals below use backslash line continuations and start at
        # column 0 so that no indentation ends up inside the strings.
        UTF8_PAYLOAD = u'\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Disposition: attachment; filename="испытание.txt"\r\n\
Custom-Header: somevalue\r\n\
Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n\
\r\n\
some\n\
text'

        # Content-Length is the size of the payload in UTF-8 bytes, not
        # in characters.
        content_length = len(UTF8_PAYLOAD.encode('utf-8'))

        UTF8_RECORD = u'\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:KMUABC6URWIQ7QXCZDQ5FS6WIBBFRORR\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: {0}\r\n\
\r\n\
{1}\r\n\
\r\n\
'.format(content_length, UTF8_PAYLOAD)

        assert (content_length == 226)

        record = ArcWarcRecordLoader().parse_record_stream(
            BytesIO(UTF8_RECORD.encode('utf-8')))

        # Round-trip the parsed record through an uncompressed writer.
        writer = BufferWARCWriter(gzip=False)
        writer.write_record(record)

        raw_buff = writer.get_contents()
        assert raw_buff.decode('utf-8') == RESPONSE_RECORD_UNICODE_HEADERS

        # The written record is expected to be longer (268) than the
        # input payload (226).
        # NOTE: the loop variable rebinds ``record`` from above.
        for record in ArchiveIterator(writer.get_stream()):
            assert record.length == 268
    def _write_one(self):
        """Take one entry from ``self.write_queue`` and write it out.

        A 'response' record is written together with a request record
        created from the queued request payload; any other record type is
        written on its own. Both payload streams are closed afterwards,
        whether or not writing succeeded.
        """
        req_pay = None
        resp_pay = None
        try:
            entry = self.write_queue.get()
            req_head, req_pay, resp_head, resp_pay, params = entry

            # The stream position is read before rewinding for parsing.
            resp_length = resp_pay.tell()
            resp_pay.seek(0)
            resp = ArcWarcRecordLoader().parse_record_stream(resp_pay)

            if resp.rec_type != 'response':
                self.writer.write_record(resp, params)
                return

            uri = resp.rec_headers.get_header('WARC-Target-Uri')
            req_length = req_pay.tell()
            req_pay.seek(0)
            req = self.writer.create_warc_record(
                uri=uri,
                record_type='request',
                payload=req_pay,
                length=req_length,
                warc_headers_dict=req_head)

            self.writer.write_request_response_pair(req, resp, params)

        finally:
            # Cleanup problems are reported but never raised.
            try:
                for stream in (req_pay, resp_pay):
                    if stream:
                        no_except_close(stream)
            except Exception:
                traceback.print_exc()
Exemple #23
0
class DirectUpstream(object):
    def __init__(self,
                 upstream_url_resolver,
                 proxy_magic='pywb.proxy',
                 magic_fwd='http://localhost/',
                 assets_path=None,
                 is_rw=True):
        """Store the resolver and build the loader, views and rewriter.

        :param upstream_url_resolver: kept as ``self.upstream_url_resolver``
            and called by ``_set_request_url``.
        :param proxy_magic: host name compared against the request host in
            ``_set_request_url``.
        :param magic_fwd: url split by ``parse()`` into the ``fwd_scheme``,
            ``fwd_host``, ``fwd_port`` and ``fwd_path`` attributes.
        :param assets_path: forwarded to ``JinjaEnv``.
        :param is_rw: when true a ``Rewriter`` is created, otherwise
            ``content_rewriter`` is set to ``None``.
        """
        self.upstream_url_resolver = upstream_url_resolver
        self.proxy_magic = proxy_magic
        self.loader = ArcWarcRecordLoader()

        # Scheme, host and path are decoded from latin-1; the port is
        # stored exactly as parse() returned it.
        scheme, host, port, path = parse(magic_fwd)
        self.fwd_scheme = scheme.decode('latin-1')
        self.fwd_host = host.decode('latin-1')
        self.fwd_port = port
        self.fwd_path = path.decode('latin-1')

        env = JinjaEnv(assets_path=assets_path)
        self.jinja_env = env
        self.head_insert_view = HeadInsertView(env,
                                               'head_insert.html',
                                               'banner.html')
        self.error_view = BaseInsertView(env, 'error.html')
        self.home_redir_view = BaseInsertView(env, 'home.html')

        self.content_rewriter = (Rewriter(is_framed_replay=False)
                                 if is_rw else None)

    def request(self, flow):
        """Handle a request flow by delegating to ``_set_request_url``.

        The value returned by ``_set_request_url`` is discarded.
        """
        self._set_request_url(flow)

    def _set_request_url(self, flow, postreq=''):
        """Point ``flow.request`` at the url chosen by the resolver.

        Requests addressed to ``self.proxy_magic`` are either treated as
        homepage redirects (for the two special paths) or sent to the
        forward host unchanged.

        :param flow: flow whose request is inspected and modified.
        :param postreq: passed through to ``upstream_url_resolver``.
        :return: ``True`` when the request was re-targeted at the resolved
            url, ``False`` when it was forwarded or answered as a homepage
            redirect.
        """
        request = flow.request
        host = request.headers.get('host') or request.host

        homepage_redirect = None

        if host == self.proxy_magic:
            if request.path in (H_REFRESH_PATH, H_REDIR_PATH):
                homepage_redirect = request.path
            else:
                # Any other request to the magic host goes to the forward
                # target, tagged with the client address.
                request.host = self.fwd_host
                request.scheme = self.fwd_scheme
                request.port = self.fwd_port
                request.headers['X-Proxy-For'] = str(
                    flow.client_conn.address.host)
                return False

        if host:
            origin = request.scheme + '://' + host
        else:
            origin = hostport(request.scheme, request.host, request.port)

        req_url = origin + request.path

        # Remember the original url and scheme before the request is
        # rewritten below.
        request.req_url = req_url
        request.req_scheme = request.scheme

        full_url, extra_data = self.upstream_url_resolver(
            url=quote_plus(req_url),
            headers=request.headers,
            address=flow.client_conn.address,
            postreq=postreq)

        if homepage_redirect:
            url = extra_data.get('url')
            if url:
                if homepage_redirect == H_REFRESH_PATH:
                    self.homepage_refresh(flow, url)
                elif homepage_redirect == H_REDIR_PATH:
                    self.homepage_redir(flow, url)

                return False

        scheme, host, port, path = parse(full_url)

        request.scheme = scheme
        request.host = host
        request.port = port
        request.path = path

        flow.extra_data = extra_data
        return True

    def responseheaders(self, flow):
        if flow.request.host == self.fwd_host:
            return

        if hasattr(flow, 'direct_response'):
            return

        if flow.response.status_code == 200:
            flow.response.stream = True

    def response(self, flow):
        if flow.request.host == self.fwd_host:
            return

        if hasattr(flow, 'direct_response'):
            return

        if flow.response.status_code != 200:
            url = flow.request.req_url
            err_status = 400
            err_msg = 'Proxy Error'
            if flow.response.status_code == 404:
                err_status = 404
                err_msg = 'Not Found'

            self.send_error(flow, url, err_status, err_msg)
            return

        an_iter = flow.live.read_response_body(flow.request, flow.response)
        stream = IterIO(an_iter)

        try:
            self._set_response(flow, stream)
        except Exception as e:
            if hasattr(flow.request, 'req_url'):
                print(flow.request.req_url)
            print(type(e), e)
            import traceback
            traceback.print_exc()

    def homepage_redir(self, flow, redir_url):
        """Answer the flow with a 303 redirect to ``redir_url``.

        The request host is set to the forward host first. Always returns
        ``True``.
        """
        flow.request.host = self.fwd_host
        redirect_headers = {'Location': redir_url}
        flow.response = HTTPResponse.make(303, b'', redirect_headers)
        return True

    def homepage_refresh(self, flow, url):
        """Answer the flow with the rendered home template for ``url``.

        Sets ``flow.direct_response`` and replaces ``flow.response`` with
        a 200 html response. Always returns ``True``.
        """
        flow.direct_response = True

        environ = {'webrec.template_params': {'url': url}}
        page = self.home_redir_view.render_to_string(environ)

        flow.response = HTTPResponse.make(
            200,
            page.encode('utf-8'),
            {'Content-Type': 'text/html; charset=utf-8'})
        return True

    def send_error(self, flow, url, status, reason):
        """Overwrite the flow's response with a rendered error page.

        When the flow carries a non-empty ``extra_data`` dict it is used
        (and extended in place) as the template parameters.

        :param flow: The proxy flow whose response is replaced
        :param str url: The URL the error refers to
        :param int status: Status code to set on the response
        :param str reason: Reason phrase to set on the response
        """
        extra = getattr(flow, 'extra_data', None)
        template_params = extra if extra else {}

        template_params['url'] = url
        template_params['cdx'] = {'url': url}
        template_params['proxy_magic'] = self.proxy_magic
        template_params['wbrequest'] = {
            'host_prefix': flow.request.req_scheme + '://' + self.proxy_magic
        }

        environ = {
            'pywb_proxy_magic': self.proxy_magic,
            'webrec.template_params': template_params
        }
        body = self.error_view.render_to_string(environ).encode('utf-8')

        resp = flow.response
        resp.content = body
        resp.status_code = status
        resp.reason = reason
        # start from an empty header set, then describe the error body
        resp.headers = Headers()
        resp.headers['Content-Type'] = 'text/html; charset=utf-8'
        resp.headers['Content-Length'] = str(len(body))

    def process_record(self, record, flow):
        """Rewrite a parsed record for delivery through the proxy.

        Without a content rewriter the record's HTTP headers and raw stream
        are passed through untouched. Otherwise the content is rewritten,
        the ``Content-Security-Policy`` header is dropped, and the body
        framing is decided: a positive integer ``content-length`` is kept as
        is, HTTP/1.1 requests get ``Transfer-Encoding: chunked`` in place of
        ``content-length``, and any other request is passed through
        ``buffer_iter``.

        :param record: The parsed record (``http_headers`` and
            ``raw_stream`` are read)
        :param flow: The proxy flow the record belongs to
        :return: A tuple of the status/headers object and the wrapped body
        :rtype: tuple
        """
        headers = flow.response.headers
        url = flow.request.req_url
        scheme = flow.request.req_scheme

        if not self.content_rewriter:
            return record.http_headers, StreamIO(record.raw_stream)

        cookie_rewriter = None

        template_params = flow.extra_data

        environ = {
            'pywb_proxy_magic': self.proxy_magic,
            'webrec.template_params': template_params
        }

        wb_url = WbUrl(url)
        wb_prefix = ''
        host_prefix = scheme + '://' + self.proxy_magic
        urlrewriter = SchemeOnlyUrlRewriter(wb_url, '')

        if flow.request.headers.get('X-Requested-With',
                                    '').lower() == 'xmlhttprequest':
            urlrewriter.rewrite_opts['is_ajax'] = True

        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url, wb_prefix, host_prefix, url, environ, False))

        urlkey = canonicalize(wb_url.url)

        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(
            headers.get('Memento-Datetime'))
        cdx['url'] = wb_url.url
        if headers.get('Webagg-Source-Coll') == 'live':
            cdx['is_live'] = 'true'

        result = self.content_rewriter.rewrite_content(
            urlrewriter, record.http_headers, record.raw_stream,
            head_insert_func, urlkey, cdx, cookie_rewriter, environ)

        status_headers, gen, _ = result

        status_headers.remove_header('Content-Security-Policy')

        # A usable content-length means the body can be sent as is. Only the
        # parse itself is guarded: a missing header (None) or a non-numeric
        # value falls through to the chunk/buffer handling below.
        res = status_headers.get_header('content-length')
        try:
            has_length = int(res) > 0
        except (TypeError, ValueError):
            has_length = False

        if has_length:
            return status_headers, IterIdent(gen)

        # need to either chunk or buffer to get content-length
        if flow.request.http_version == 'HTTP/1.1':
            status_headers.remove_header('content-length')
            status_headers.headers.append(('Transfer-Encoding', 'chunked'))
            # NOTE(review): the body generator is not chunk-encoded here;
            # presumably the proxy layer applies the encoding - confirm.
        else:
            gen = buffer_iter(status_headers, gen)

        return status_headers, IterIdent(gen)

    def _set_response(self, flow, stream):
        record = self.loader.parse_record_stream(stream)

        status_headers, gen = self.process_record(record, flow)

        if status_headers:
            headers_bytes = [(n.encode('iso-8859-1'), v.encode('iso-8859-1'))
                             for n, v in status_headers.headers]

            flow.response.headers = Headers(headers_bytes)

        protocol = status_headers.protocol
        status, reason = status_headers.statusline.split(' ', 1)

        flow.response.status_code = int(status)
        flow.response.reason = reason

        flow.response.stream = gen

    def serverconnect(self, server_conn):
        """Server-connect hook; intentionally does nothing.

        :param server_conn: The server connection (unused)
        """
        return None

    def error(self, flow):
        """Print an ``ERROR`` line for the failed flow.

        :param flow: The failed proxy flow; its request's ``req_url`` is
            printed when present, otherwise an empty string
        """
        url = getattr(flow.request, 'req_url', '')
        print('ERROR', url)
Exemple #24
0
class RewriterApp(object):
    """Primary application for rewriting the content served by pywb (if it is to be rewritten).

    This class is also responsible for rendering the archive templates.
    """
    # Content type requested from upstream when the url modifier is 'vi_'
    VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'

    # Content-Security-Policy value used when the config has no 'csp-header'
    DEFAULT_CSP = "default-src 'unsafe-eval' 'unsafe-inline' 'self' data: blob: mediastream: ws: wss: ; form-action 'self'"

    def __init__(self,
                 framed_replay=False,
                 jinja_env=None,
                 config=None,
                 paths=None):
        """Initialize a new instance of RewriterApp

        :param bool framed_replay: Is rewriting happening in framed replay mode
        :param JinjaEnv|None jinja_env: Optional JinjaEnv instance to be used for
            rendering static files
        :param dict|None config: Optional config dictionary
        :param dict|None paths: Optional dictionary containing a mapping
            of path names to URLs
        """
        self.loader = ArcWarcRecordLoader()

        # Normalised copies: every option lookup below goes through
        # self.config so that the default config=None works.
        self.config = config or {}
        self.paths = paths or {}

        self.framed_replay = framed_replay

        if framed_replay:
            self.frame_mod = ''
            self.replay_mod = 'mp_'
        else:
            self.frame_mod = None
            self.replay_mod = ''

        self.enable_prefer = self.config.get('enable_prefer', False)

        self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
                                          config=config)

        self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)

        if not jinja_env:
            jinja_env = JinjaEnv(
                globals={'static_path': 'static'},
                extensions=['jinja2.ext.i18n', 'jinja2.ext.with_'])
            jinja_env.jinja_env.install_null_translations()

        self.jinja_env = jinja_env
        self.loc_map = {}

        self.jinja_env.init_loc(self.config.get('locales_root_dir'),
                                self.config.get('locales'), self.loc_map,
                                self.config.get('default_locale'))

        self.redirect_to_exact = self.config.get('redirect_to_exact')

        self.banner_view = BaseInsertView(self.jinja_env,
                                          self._html_templ('banner_html'))

        self.head_insert_view = HeadInsertView(
            self.jinja_env, self._html_templ('head_insert_html'),
            self.banner_view)

        self.frame_insert_view = TopFrameView(
            self.jinja_env, self._html_templ('frame_insert_html'),
            self.banner_view)

        self.error_view = BaseInsertView(self.jinja_env,
                                         self._html_templ('error_html'))
        self.not_found_view = BaseInsertView(
            self.jinja_env, self._html_templ('not_found_html'))
        self.query_view = BaseInsertView(self.jinja_env,
                                         self._html_templ('query_html'))

        self.use_js_obj_proxy = self.config.get('use_js_obj_proxy', True)

        self.cookie_tracker = self._init_cookie_tracker()

        self.enable_memento = self.config.get('enable_memento')

        self.static_prefix = self.config.get('static_prefix', 'static')

        # an explicitly empty 'csp-header' disables the CSP header
        csp_header = self.config.get('csp-header', self.DEFAULT_CSP)
        if csp_header:
            self.csp_header = ('Content-Security-Policy', csp_header)
        else:
            self.csp_header = None

        # deprecated: Use X-Forwarded-Proto header instead!
        self.force_scheme = self.config.get('force_scheme')

    def _init_cookie_tracker(self, redis=None):
        """Build the CookieTracker used by this app

        :param redis: Optional redis instance backing the tracker;
        a new FakeStrictRedis is created when omitted
        :return: The initialized cookie tracker
        :rtype: CookieTracker
        """
        backend = FakeStrictRedis() if redis is None else redis
        return CookieTracker(backend)

    def add_csp_header(self, wb_url, status_headers):
        """Append the configured Content-Security-Policy header to the
        supplied StatusAndHeaders instance, but only when a CSP header
        is configured and the wb_url's mod equals the replay mod

        :param WbUrl wb_url: The WbUrl for the URL being operated on
        :param warcio.StatusAndHeaders status_headers: The status and
        headers instance for the reply to the URL
        """
        if not self.csp_header:
            return

        if wb_url.mod == self.replay_mod:
            status_headers.headers.append(self.csp_header)

    def _html_templ(self, name):
        """Returns the html file name for the supplied
        html template name.

        :param str name: The name of the html template
        :return: The file name for the template
        :rtype: str|None
        """
        value = self.config.get(name)
        if not value:
            value = name.replace('_html', '.html')
        return value

    def is_framed_replay(self, wb_url):
        """Tell whether the supplied WbUrl addresses the outer replay frame.

        That is the case when the app runs in framed replay mode, the
        url's mod equals the frame mod and the url is a replay url.

        :param WbUrl wb_url: The WbUrl instance to check
        :return: T/F if in framed replay mode
        :rtype: bool
        """
        if not self.framed_replay:
            return self.framed_replay

        mod_matches = wb_url.mod == self.frame_mod
        if not mod_matches:
            return mod_matches

        return wb_url.is_replay()

    def _check_accept_dt(self, wb_url, environ):
        """Returns T/F indicating if the supplied WbUrl instance
        is for a timegate request

        :param WbUrl wb_url: The URL to be checked
        :param dict environ: The wsgi environment object for the request
        :return: T/F indicating if the WbUrl is for timegate request
        :rtype: bool
        """
        is_timegate = False
        if wb_url.is_latest_replay():
            accept_dt = environ.get('HTTP_ACCEPT_DATETIME')
            is_timegate = True
            if accept_dt:
                try:
                    wb_url.timestamp = http_date_to_timestamp(accept_dt)
                except Exception:
                    raise UpstreamException(400,
                                            url=wb_url.url,
                                            details='Invalid Accept-Datetime')
                    # return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')

                wb_url.type = wb_url.REPLAY

            elif 'pywb_proxy_default_timestamp' in environ:
                wb_url.timestamp = environ['pywb_proxy_default_timestamp']
                wb_url.type = wb_url.REPLAY

        return is_timegate

    def _get_prefer_mod(self, wb_url, environ, content_rw, is_proxy):
        """Returns the default rewrite modifier and rewrite modifier based on the
        value of the Prefer HTTP header if it is present

        :param WbUrl wb_url: The WbUrl for the URL being rewritten
        :param dict environ: The WSGI environment dictionary for the request
        :param content_rw: The content rewriter instance
        :param bool is_proxy: Is the rewrite operating in proxy mode
        :return: A tuple containing the default rewrite modifier and rewrite modifier based
        on the  value of the Prefer HTTP header if it is present
        :rtype: tuple[str|None, str|None]
        """
        if not self.enable_prefer:
            return None, None

        prefer = environ.get('HTTP_PREFER')
        if not prefer:
            return None, content_rw.mod_to_prefer(wb_url.mod)

        mod = content_rw.prefer_to_mod(prefer)

        if mod is None:
            raise UpstreamException(400,
                                    url=wb_url.url,
                                    details='Invalid Prefer: ' + prefer)

        if is_proxy and mod == self.replay_mod:
            mod = 'bn_'
            prefer = content_rw.mod_to_prefer('bn_')

        return mod, prefer

    def _check_range(self, inputreq, wb_url):
        """Checks the input request if it is a range request returning
        the start and end of the range as well as T/F if the request should
        be skipped as a tuple.

        :param RewriteInputRequest inputreq: The input request to check range
        :param WbUrl wb_url: The WbUrl associated with the request
        :return: A tuple with the start, end, and T/F should skip request
        :rtype: tuple[int|None, int|None, bool]
        """
        skip_record = False
        range_start = None
        range_end = None

        rangeres = inputreq.extract_range()

        if not rangeres:
            return range_start, range_end, skip_record

        mod_url, start, end, use_206 = rangeres

        # remove the range and still proxy
        if not use_206:
            return range_start, range_end, skip_record

        wb_url.url = mod_url
        inputreq.url = mod_url

        range_start = start
        range_end = end

        # if start with 0, load from upstream, but add range after
        if start == 0:
            del inputreq.env['HTTP_RANGE']
        else:
            skip_record = True

        return range_start, range_end, skip_record

    def _add_range(self, record, wb_url, range_start, range_end):
        if range_end is None and range_start is None:
            return

        if record.http_headers.get_statuscode() != '200':
            return

        content_length = (record.http_headers.get_header('Content-Length'))

        if content_length is None:
            return

        content_length = content_length.split(',')[0]

        try:
            content_length = int(content_length)
            if not range_end:
                range_end = content_length - 1

            if range_start >= content_length or range_end >= content_length:
                details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(
                    range_start, range_end, content_length)
                raise UpstreamException(416, url=wb_url.url, details=details)

            range_len = range_end - range_start + 1
            record.http_headers.add_range(range_start, range_len,
                                          content_length)

            record.http_headers.replace_header('Content-Length',
                                               str(range_len))

            record.raw_stream = OffsetLimitReader(record.raw_stream,
                                                  range_start, range_len)
            return True

        except (ValueError, TypeError):
            pass

    def send_redirect(self, new_path, url_parts, urlrewriter):
        """Build a 307 redirect to the same url with its path replaced.

        :param str new_path: The path to use in the redirect target
        :param url_parts: The (scheme, netloc, path, query, fragment) parts
            of the url being redirected
        :param urlrewriter: Rewriter used to rewrite the redirect target
        :return: The redirect response; carries an 'original' Link header
            when memento support is enabled
        :rtype: WbResponse
        """
        scheme, netloc, _old_path, query, frag = url_parts
        target = urlunsplit((scheme, netloc, new_path, query, frag))

        redirect = WbResponse.redir_response(urlrewriter.rewrite(target),
                                             '307 Temporary Redirect')

        if self.enable_memento:
            redirect.status_headers['Link'] = MementoUtils.make_link(
                target, 'original')

        return redirect

    def prepare_env(self, environ):
        """Populate the pywb prefix entries (and scheme) in the environ.

        Does nothing when 'pywb.host_prefix' is already present. The
        X-Forwarded-Proto header, falling back to the configured forced
        scheme, overrides 'wsgi.url_scheme' when set.

        :param dict environ: The WSGI environment, updated in place
        """
        if 'pywb.host_prefix' in environ:
            return

        proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)
        if proto:
            environ['wsgi.url_scheme'] = proto

        host_prefix = self.get_host_prefix(environ)
        app_prefix = environ.get('SCRIPT_NAME', '')

        environ['pywb.host_prefix'] = host_prefix
        environ['pywb.app_prefix'] = app_prefix
        environ['pywb.static_prefix'] = (host_prefix + app_prefix + '/' +
                                         self.static_prefix)

    def render_content(self, wb_url, kwargs, environ):
        """Produce the response for a wayback url: a timemap, query or
        custom (frame) response, a redirect, or the rewritten replay of
        the record fetched from the upstream server.

        :param str wb_url: The raw wayback url string; it is parsed into a
            WbUrl after percent-encoding any '#'
        :param dict kwargs: Per-request options; keys read here are
            'output', 'no_timegate_check', 'coll', 'metadata' and 'cache'
            (it is also passed on to several helper methods)
        :param dict environ: The WSGI environment dictionary for the request
        :return: The response for the request
        :rtype: WbResponse
        :raises NotFoundException: if the upstream request returns a 404
        :raises UpstreamException: if the upstream request returns any other
            status >= 400 (helpers called here may raise it as well)
        """
        # percent-encode '#' before the string is parsed as a WbUrl
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        # a history-page request overrides the target url and is always
        # handled as an ajax request
        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        self.prepare_env(environ)

        host_prefix = environ['pywb.host_prefix']
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        pywb_static_prefix = environ['pywb.static_prefix'] + '/'
        # proxy mode is signalled by this key being present in the environ
        is_proxy = ('wsgiprox.proxy_host' in environ)

        # if OPTIONS in proxy mode, just generate the proxy response
        if is_proxy and self.is_preflight(environ):
            return WbResponse.options_response(environ)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        # no redirects if in proxy
        redirect_to_exact = self.redirect_to_exact and not is_proxy

        # Check Prefer
        pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                      content_rw, is_proxy)

        response = None
        keep_frame_response = False

        # prefer overrides custom response?
        if pref_mod is not None:
            # fast-redirect to preferred
            if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
                new_url = full_prefix + wb_url.to_str(mod=pref_mod)
                headers = [('Preference-Applied', pref_applied),
                           ('Vary', 'Prefer')]

                return WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect',
                                                 headers=headers)
            else:
                wb_url.mod = pref_mod
        else:
            if kwargs.get('output'):
                response = self.handle_timemap(wb_url, kwargs, full_prefix)

            elif wb_url.is_query():
                response = self.handle_query(environ, wb_url, kwargs,
                                             full_prefix)

            else:
                response = self.handle_custom_response(environ, wb_url,
                                                       full_prefix,
                                                       host_prefix, kwargs)

                # hold the custom response back until the upstream lookup
                # below has supplied the cdx (timestamp, is_live)
                keep_frame_response = (not kwargs.get('no_timegate_check')
                                       and is_timegate
                                       and not is_proxy) or redirect_to_exact

        if response and not keep_frame_response:
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        # a url with an empty path is redirected to the same url with '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(
            inputreq, wb_url)

        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(
                    wb_url.url, urlrewriter, cookie_key,
                    environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            # read the upstream error body (best effort) and always close
            # the upstream stream before raising
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            if r.status_code == 404:
                raise NotFoundException(url=wb_url.url, msg=details)

            else:
                raise UpstreamException(r.status_code,
                                        url=wb_url.url,
                                        details=details)

        # NOTE(review): assumes the upstream always sends a Warcserver-Cdx
        # header on success; a missing header would fail here - confirm
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith(
                '/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        # only redirect to exact if not live, otherwise set to false
        redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

        # return top-frame timegate response, with timestamp from cdx
        if response and keep_frame_response and (not redirect_to_exact
                                                 or not is_timegate):
            no_except_close(r.raw)
            return self.format_response(response, wb_url, full_prefix,
                                        is_timegate, is_proxy,
                                        cdx['timestamp'])

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redirect to exact timestamp (only set if not live)
        if redirect_to_exact:
            if set_content_loc or is_timegate or wb_url.timestamp != cdx.get(
                    'timestamp'):
                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri,
                                                full_prefix,
                                                memento_dt,
                                                cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate,
                                                is_proxy,
                                                pref_applied=pref_applied,
                                                mod=pref_mod,
                                                is_memento=False)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(
                            target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        # a record cut down to a byte range is served with the 'id_' mod
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        # ajax responses get no head insert
        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url,
                full_prefix,
                host_prefix,
                top_url,
                environ,
                framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                metadata=kwargs.get('metadata', {}),
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None

            cookie_rewriter = self.cookie_tracker.get_rewriter(
                urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter,
                            head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        # history-page requests only record the page title and answer
        # with it as json instead of the replayed content
        if history_page:
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # make sure the status line has a second, space-separated part
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'],
                                    full_prefix,
                                    memento_dt,
                                    cdx['timestamp'],
                                    status_headers,
                                    is_timegate,
                                    is_proxy,
                                    cdx.get('source-coll'),
                                    mod=pref_mod,
                                    pref_applied=pref_applied)

            set_content_loc = True

        if set_content_loc and not redirect_to_exact and not is_proxy:
            status_headers.headers.append(
                ('Content-Location',
                 urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                         url=cdx['url'])))

        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        if is_proxy and environ.get('HTTP_ORIGIN'):
            response.add_access_control_headers(environ)

        # with cache == 'always', a 200 upstream response requested with a
        # Referer header is marked cacheable for a year and immutable
        if r.status_code == 200 and kwargs.get(
                'cache') == 'always' and environ.get('HTTP_REFERER'):
            response.status_headers[
                'Cache-Control'] = 'public, max-age=31536000, immutable'

        return response

    def format_response(self,
                        response,
                        wb_url,
                        full_prefix,
                        is_timegate,
                        is_proxy,
                        timegate_closest_ts=None):
        """Wrap a handler result as a WbResponse and add memento links.

        :param response: Either a WbResponse, or text that is wrapped in an
            html text response
        :param WbUrl wb_url: The WbUrl for the request
        :param str full_prefix: The replay prefix
        :param bool is_timegate: Is this a timegate request
        :param bool is_proxy: Are we operating in proxy mode
        :param str|None timegate_closest_ts: Timestamp preferred over the
            wb_url's own for the memento link of a framed-replay response
        :return: The response, with memento links added when memento support
            is enabled and the status line starts with '200'
        :rtype: WbResponse
        """
        memento_ts = None

        if not isinstance(response, WbResponse):
            if self.is_framed_replay(wb_url):
                content_type = 'text/html'
                memento_ts = timegate_closest_ts or wb_url.timestamp
            else:
                # if not replay outer frame, specify utf-8 charset
                content_type = 'text/html; charset=utf-8'

            response = WbResponse.text_response(response,
                                                content_type=content_type)

        status_headers = response.status_headers
        if self.enable_memento and status_headers.statusline.startswith(
                '200'):
            self._add_memento_links(wb_url.url,
                                    full_prefix,
                                    None,
                                    memento_ts,
                                    response.status_headers,
                                    is_timegate,
                                    is_proxy,
                                    is_memento=not is_timegate)

        return response

    def _add_memento_links(self,
                           url,
                           full_prefix,
                           memento_dt,
                           memento_ts,
                           status_headers,
                           is_timegate,
                           is_proxy,
                           coll=None,
                           pref_applied=None,
                           mod=None,
                           is_memento=True):
        """Adds the memento link headers to supplied StatusAndHeaders instance

        :param str url: The URI-R being rewritten
        :param str full_prefix: The replay prefix
        :param str|None memento_dt: The memento datetime for the URI-R being rewritten
        :param str memento_ts: The memento timestamp
        :param warcio.StatusAndHeaders status_headers:
        :param bool is_timegate: Are we returning a response for a timegate
        :param bool is_proxy: Are we operating in proxy mode
        :param str|None coll: The collection the URI-R is from
        :param str|None pref_applied:
        :param str|None mod: The rewrite modifier
        :param bool is_memento:
        :rtype: None
        """

        replay_mod = mod or self.replay_mod

        # memento url + header
        if not memento_dt and memento_ts:
            memento_dt = timestamp_to_http_date(memento_ts)

        if memento_dt:
            if is_memento:
                status_headers.headers.append(('Memento-Datetime', memento_dt))

            if is_proxy:
                memento_url = url
            else:
                memento_url = full_prefix + memento_ts + replay_mod
                memento_url += '/' + url
        else:
            memento_url = None

        timegate_url, timemap_url = self._get_timegate_timemap(
            url, full_prefix, mod)

        link = []
        if not is_proxy:
            link.append(MementoUtils.make_link(url, 'original'))
            link.append(MementoUtils.make_link(timegate_url, 'timegate'))
            link.append(MementoUtils.make_link(timemap_url, 'timemap'))

        if memento_dt:
            link.append(
                MementoUtils.make_memento_link(memento_url, 'memento',
                                               memento_dt, coll))

        link_str = ', '.join(link)

        status_headers.headers.append(('Link', link_str))

        vary = ''
        if is_timegate:
            vary = 'accept-datetime'

        if pref_applied:
            vary = 'Prefer' if not vary else vary + ', Prefer'
            status_headers.headers.append(('Preference-Applied', pref_applied))

        if vary:
            status_headers.headers.append(('Vary', vary))

    def _get_timegate_timemap(self, url, full_prefix, mod):
        # timegate url
        timegate_url = full_prefix
        mod = ''
        if mod:
            timegate_url += mod + '/'

        timegate_url += url

        # timemap url
        timemap_url = full_prefix + 'timemap/link/' + url
        return timegate_url, timemap_url

    def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
        """Return the top-frame url: the prefix plus the WbUrl with no mod.

        :param str full_prefix: The replay prefix
        :param WbUrl wb_url: The WbUrl for the request
        :param cdx: Unused here
        :param dict kwargs: Unused here
        :rtype: str
        """
        return full_prefix + wb_url.to_str(mod='')

    def handle_error(self, environ, wbe):
        """Render the response for an exception raised while serving.

        :param dict environ: The WSGI environment for the request
        :param wbe: The exception; a NotFoundException gets the not-found
            page, anything else the generic error page
        :return: The rendered error response
        """
        if not isinstance(wbe, NotFoundException):
            return self._error_response(environ, wbe)

        return self._not_found_response(environ, wbe.url)

    def _not_found_response(self, environ, url):
        """Render the not-found template as a 404 html response.

        :param dict environ: The WSGI environment for the request
        :param str url: The url that was not found
        :return: The 404 response
        """
        page = self.not_found_view.render_to_string(environ,
                                                    url=url,
                                                    err_msg="Not Found")

        return WbResponse.text_response(page,
                                        status='404 Not Found',
                                        content_type='text/html')

    def _error_response(self, environ, wbe):
        """Render the error view for ``wbe`` as an HTML response.

        The response status line is taken from ``wbe.status()``; the
        template receives the exception's ``url``, ``msg`` and
        ``status_code``.
        """
        status_line = wbe.status()

        template_args = dict(err_msg=wbe.url,
                             err_details=wbe.msg,
                             err_status=wbe.status_code)
        body = self.error_view.render_to_string(environ, **template_args)

        return WbResponse.text_response(
            body, status=status_line, content_type='text/html')

    def _do_req(self, inputreq, wb_url, kwargs, skip_record):
        """POST the reconstructed client request to the upstream URL.

        :param inputreq: Request wrapper providing ``reconstruct_request``
            and ``warcserver_headers``
        :param wb_url: The url being requested (``url``, ``timestamp``,
            ``mod`` and ``is_latest_replay()`` are read)
        :param dict kwargs: Passed through to ``get_upstream_url``
        :param bool skip_record: When true, a ``Recorder-Skip: 1`` header
            is added to the upstream request
        :return: The streaming upstream response
        """
        payload = inputreq.reconstruct_request(wb_url.url)

        req_headers = dict([
            ('Content-Length', str(len(payload))),
            ('Content-Type', 'application/request'),
        ])

        # request-specific headers may override the two defaults above
        req_headers.update(inputreq.warcserver_headers)

        if skip_record:
            req_headers['Recorder-Skip'] = '1'

        closest = 'now' if wb_url.is_latest_replay() else wb_url.timestamp

        params = dict(url=wb_url.url, closest=closest, matchType='exact')

        if wb_url.mod == 'vi_':
            params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE

        target = self.get_upstream_url(wb_url, kwargs, params)

        return requests.post(target,
                             data=BytesIO(payload),
                             headers=req_headers,
                             stream=True)

    def do_query(self, wb_url, kwargs):
        """Run the timemap/index query for the supplied WbUrl.

        The upstream URL is derived from ``get_upstream_url`` with
        ``/resource/postreq`` swapped for ``/index``.

        :param WbUrl wb_url: The WbUrl to be queried
        :param dict kwargs: Optional keyword arguments; ``output``,
            ``memento_format`` and ``limit`` are forwarded when present
        :return: The query response
        :rtype: requests.Response
        """
        params = dict([
            ('url', wb_url.url),
            ('output', kwargs.get('output', 'json')),
            ('from', wb_url.timestamp),
            ('to', wb_url.end_timestamp),
        ])

        # optional pass-through parameters
        for optional in ('memento_format', 'limit'):
            if optional in kwargs:
                params[optional] = kwargs[optional]

        index_url = self.get_upstream_url(wb_url, kwargs, params)
        index_url = index_url.replace('/resource/postreq', '/index')

        return requests.get(index_url)

    def make_timemap(self, wb_url, res, full_prefix, output):
        """Wrap an index query response as a timemap text response.

        ``wb_url.type`` is switched to ``wb_url.QUERY``. An empty body is
        reported as ``404 Not Found``; otherwise the upstream status and
        reason are reused. For a 200 response with ``output == 'link'``
        the body is wrapped with the link-format timemap header.

        :param wb_url: The queried url (mutated: ``type`` is set)
        :param res: The upstream response (``headers``, ``text``,
            ``status_code`` and ``reason`` are read)
        :param str full_prefix: Prefix used for the timegate/timemap links
        :param str output: Requested output format
        """
        wb_url.type = wb_url.QUERY

        content_type = res.headers.get('Content-Type')
        body = res.text

        if not res.text:
            status = '404 Not Found'

        elif res.status_code:
            status = ' '.join((str(res.status_code), res.reason))

            wants_link_header = (res.status_code == 200 and output == 'link')
            if wants_link_header:
                timegate, timemap = self._get_timegate_timemap(
                    wb_url.url, full_prefix, wb_url.mod)

                body = MementoUtils.wrap_timemap_header(
                    wb_url.url, timegate, timemap, res.text)

        return WbResponse.text_response(body,
                                        content_type=content_type,
                                        status=status)

    def handle_timemap(self, wb_url, kwargs, full_prefix):
        """Run a timemap query and format its response.

        ``kwargs['memento_format']`` is set (in place) to the memento URL
        template built from ``full_prefix`` and ``self.replay_mod`` before
        the query is issued.
        """
        output = kwargs.get('output')

        memento_template = ''.join(
            (full_prefix, '{timestamp}', self.replay_mod, '/{url}'))
        kwargs['memento_format'] = memento_template

        query_result = self.do_query(wb_url, kwargs)
        return self.make_timemap(wb_url, query_result, full_prefix, output)

    def handle_query(self, environ, wb_url, kwargs, full_prefix):
        """Render the query view for ``wb_url``.

        The prefix handed to the template is recomputed from ``environ``
        via ``get_full_prefix``; the ``full_prefix`` and ``kwargs``
        arguments are not used here.
        """
        template_args = {
            'url': wb_url.url,
            'prefix': self.get_full_prefix(environ),
        }

        return self.query_view.render_to_string(environ, **template_args)

    def get_host_prefix(self, environ):
        """Return ``scheme://host[:port]`` for the current request.

        The host is taken, in order of preference, from
        ``wsgiprox.proxy_host``, then ``HTTP_HOST``, and finally from
        ``SERVER_NAME`` / ``SERVER_PORT`` (the port is omitted when it is
        the default for the scheme: 443 for https, 80 otherwise).
        """
        url_scheme = environ['wsgi.url_scheme']
        prefix = url_scheme + '://'

        # proxy host first, then the regular Host header
        for host_key in ('wsgiprox.proxy_host', 'HTTP_HOST'):
            named_host = environ.get(host_key)
            if named_host:
                return prefix + named_host

        # no host supplied: fall back to the server name and port
        server = environ['SERVER_NAME']
        default_port = '443' if url_scheme == 'https' else '80'
        if environ['SERVER_PORT'] != default_port:
            server += ':' + environ['SERVER_PORT']

        return prefix + server

    def get_rel_prefix(self, environ):
        """Return the application mount path followed by ``/``.

        :param dict environ: The WSGI environ
        :return: ``SCRIPT_NAME`` plus a trailing slash; just ``/`` when
            ``SCRIPT_NAME`` is missing or empty
        :rtype: str
        """
        # WSGI servers may omit SCRIPT_NAME entirely when it is empty, so
        # treat a missing (or None) value as '' instead of failing on
        # ``None + '/'``.
        script_name = environ.get('SCRIPT_NAME') or ''
        return script_name + '/'

    def get_full_prefix(self, environ):
        """Return the host prefix joined with the relative prefix."""
        host_part = self.get_host_prefix(environ)
        rel_part = self.get_rel_prefix(environ)
        return host_part + rel_part

    def unrewrite_referrer(self, environ, full_prefix):
        """Strip ``full_prefix`` from the Referer header, in place.

        When ``HTTP_REFERER`` starts with ``full_prefix`` and something
        remains after the prefix, the remainder is parsed as a ``WbUrl``
        and ``environ['HTTP_REFERER']`` is replaced by its ``url``.

        :return: ``True`` if the header was rewritten, ``False`` otherwise
        :rtype: bool
        """
        referrer = environ.get('HTTP_REFERER')
        if not referrer or not referrer.startswith(full_prefix):
            return False

        remainder = referrer[len(full_prefix):]
        if not remainder:
            return False

        environ['HTTP_REFERER'] = WbUrl(remainder).url
        return True

    def is_ajax(self, environ):
        """Tell whether the request looks like an ajax / fetch request.

        A request qualifies when ``X-Requested-With`` (or the
        ``X-Pywb-Requested-With`` fallback) equals ``xmlhttprequest``
        case-insensitively. In proxy mode only (``wsgiprox.proxy_host``
        present in ``environ``), a ``Sec-Fetch-Mode`` of ``cors`` also
        qualifies.
        """
        requested_with = (environ.get('HTTP_X_REQUESTED_WITH') or
                          environ.get('HTTP_X_PYWB_REQUESTED_WITH'))
        if requested_with and requested_with.lower() == 'xmlhttprequest':
            return True

        # the remaining check applies to proxy mode only
        if 'wsgiprox.proxy_host' not in environ:
            return False

        # Sec-Fetch-Mode of 'cors' indicates a fetch / ajax request
        return environ.get('HTTP_SEC_FETCH_MODE') == 'cors'

    def is_preflight(self, environ):
        """Tell whether the request is a CORS preflight request.

        That requires an ``OPTIONS`` method, a non-empty ``Origin`` header
        and at least one of ``Access-Control-Request-Method`` or
        ``Access-Control-Request-Headers``.
        """
        if environ.get('REQUEST_METHOD') != 'OPTIONS':
            return False

        if not environ.get('HTTP_ORIGIN'):
            return False

        cors_request = (environ.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD') or
                        environ.get('HTTP_ACCESS_CONTROL_REQUEST_HEADERS'))
        return bool(cors_request)

    def get_base_url(self, wb_url, kwargs):
        """Return the configured path for ``kwargs['type']``.

        The entry from ``self.paths`` is a format string that is filled in
        with ``kwargs``. ``wb_url`` is not used by this implementation.
        """
        path_template = self.paths[kwargs.get('type')]
        return path_template.format(**kwargs)

    def get_upstream_url(self, wb_url, kwargs, params):
        """Return the base URL with ``params`` appended as a query string.

        Sequence values in ``params`` are expanded into repeated keys. The
        query is joined with ``&`` when the base URL already contains a
        ``?``, otherwise with ``?``. With no params the base URL is
        returned unchanged.
        """
        upstream = self.get_base_url(wb_url, kwargs)
        query = urlencode(params, True)
        if not query:
            return upstream

        separator = '&' if '?' in upstream else '?'
        return upstream + separator + query

    def get_cookie_key(self, kwargs):
        """Return the cookie key for live / record requests, else ``None``.

        The key is ``'cookie:'`` followed by the collection name.
        """
        # note: currently this is per-collection, so enabled only for live or recording
        # to support multiple users recording/live, would need per user cookie
        live_or_record = (kwargs.get('index') == '$live' or
                          kwargs.get('type') == 'record')
        if not live_or_record:
            return None

        return 'cookie:' + kwargs['coll']

    def _add_history_page(self, cdx, kwargs, doc_title):
        pass

    def _add_custom_params(self, cdx, headers, kwargs, record):
        pass

    def get_top_frame_params(self, wb_url, kwargs):
        """Return the extra parameters for the top frame.

        The result holds a single ``metadata`` entry taken from ``kwargs``
        (an empty dict when absent). ``wb_url`` is not used here.
        """
        metadata = kwargs.get('metadata', {})
        return dict(metadata=metadata)

    def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix,
                               kwargs):
        """Return the top-frame page for framed replay, else ``None``.

        When ``is_framed_replay(wb_url)`` holds, the top frame is rendered
        through ``frame_insert_view.get_top_frame`` with an empty ``coll``
        and the extra params from ``get_top_frame_params``.
        """
        if not self.is_framed_replay(wb_url):
            return None

        frame_params = self.get_top_frame_params(wb_url, kwargs)
        render_top_frame = self.frame_insert_view.get_top_frame
        return render_top_frame(wb_url,
                                full_prefix,
                                host_prefix,
                                environ,
                                self.frame_mod,
                                self.replay_mod,
                                coll='',
                                extra_params=frame_params)
Exemple #25
0
    def __init__(self,
                 framed_replay=False,
                 jinja_env=None,
                 config=None,
                 paths=None):
        """Initialize a new instance of RewriterApp

        :param bool framed_replay: Is rewriting happening in framed replay mode
        :param JinjaEnv|None jinja_env: Optional JinjaEnv instance to be used for
            rendering static files
        :param dict|None config: Optional config dictionary
        :param dict|None paths: Optional dictionary containing a mapping
            of path names to URLs
        """
        self.loader = ArcWarcRecordLoader()

        # normalised copies: every lookup below goes through self.config so
        # that the default ``config=None`` does not fail on ``None.get(...)``
        self.config = config or {}
        self.paths = paths or {}

        self.framed_replay = framed_replay

        if framed_replay:
            self.frame_mod = ''
            self.replay_mod = 'mp_'
        else:
            self.frame_mod = None
            self.replay_mod = ''

        self.enable_prefer = self.config.get('enable_prefer', False)

        # the rewriter still receives the caller's original ``config`` value
        self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
                                          config=config)

        self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)

        if not jinja_env:
            jinja_env = JinjaEnv(
                globals={'static_path': 'static'},
                extensions=['jinja2.ext.i18n', 'jinja2.ext.with_'])
            jinja_env.jinja_env.install_null_translations()

        self.jinja_env = jinja_env
        self.loc_map = {}

        self.jinja_env.init_loc(self.config.get('locales_root_dir'),
                                self.config.get('locales'), self.loc_map,
                                self.config.get('default_locale'))

        self.redirect_to_exact = self.config.get('redirect_to_exact')

        self.banner_view = BaseInsertView(self.jinja_env,
                                          self._html_templ('banner_html'))

        self.head_insert_view = HeadInsertView(
            self.jinja_env, self._html_templ('head_insert_html'),
            self.banner_view)

        self.frame_insert_view = TopFrameView(
            self.jinja_env, self._html_templ('frame_insert_html'),
            self.banner_view)

        self.error_view = BaseInsertView(self.jinja_env,
                                         self._html_templ('error_html'))
        self.not_found_view = BaseInsertView(
            self.jinja_env, self._html_templ('not_found_html'))
        self.query_view = BaseInsertView(self.jinja_env,
                                         self._html_templ('query_html'))

        self.use_js_obj_proxy = self.config.get('use_js_obj_proxy', True)

        self.cookie_tracker = self._init_cookie_tracker()

        self.enable_memento = self.config.get('enable_memento')

        self.static_prefix = self.config.get('static_prefix', 'static')

        # an explicitly empty 'csp-header' setting disables the header
        csp_header = self.config.get('csp-header', self.DEFAULT_CSP)
        if csp_header:
            self.csp_header = ('Content-Security-Policy', csp_header)
        else:
            self.csp_header = None

        # deprecated: Use X-Forwarded-Proto header instead!
        self.force_scheme = self.config.get('force_scheme')
Exemple #26
0
 def parse(row):
     """Parse ``row`` as a single WARC record.

     Returns a dict with the parsed record under ``"warc"`` and the
     string form of ``row`` under ``"raw"``.
     """
     loader = ArcWarcRecordLoader()
     parsed = loader.parse_record_stream(StringIO(row), known_format="warc")
     return {"warc": parsed, "raw": str(row)}
Exemple #27
0
def parse_stream_error(**params):
    """Parse a record stream, printing the exception class on failure.

    ``params`` are forwarded to ``parse_record_stream``. On success the
    parsed record is returned; when any ``Exception`` is raised, its class
    name is printed and ``None`` is returned.
    """
    try:
        loader = ArcWarcRecordLoader()
        return loader.parse_record_stream(**params)
    except Exception as exc:
        print('Exception: ' + type(exc).__name__)
Exemple #28
0
class ArchiveIterator(six.Iterator):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.

    After a record has been fully read (see ``read_to_end``),
    ``member_info`` holds an ``(offset, length)`` tuple for that record;
    it is reset to ``None`` each time a new record is parsed.

    """

    # Error text raised by _raise_invalid_gzip_err; only the ``{1}``
    # placeholder (upper-cased format name) is referenced by the template.
    GZIP_ERR_MSG = """
    ERROR: non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-member gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip member and concatenated together.

    This file is likely still valid and can be fixed by running:

    warcio recompress <path/to/file> <path/to/new_file>

"""

    # Warning written to stderr by _consume_blanklines:
    # {0} is the computed offset, {1} the unexpected non-blank line.
    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
"""

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False, arc2warc=False,
                 ensure_http_headers=False, block_size=BUFF_SIZE):
        """ Set up the reader, loader and record generator for ``fileobj``

        :param fileobj: file-like object to read from; ``tell()`` is
            called on it to track offsets
        :param no_record_parse: forwarded to the loader's
            ``parse_record_stream`` for every record
        :param verify_http: forwarded to ``ArcWarcRecordLoader``
        :param arc2warc: forwarded to ``ArcWarcRecordLoader``; when set,
            the detected record format is not remembered between records
        :param ensure_http_headers: forwarded to ``parse_record_stream``
        :param block_size: block size for the decompressing reader
        """

        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        # format of the first parsed record, reused as a hint for later ones
        self.known_format = None

        self.mixed_arc_warc = arc2warc

        # (offset, length) of the last fully-read record, else None
        self.member_info = None
        self.no_record_parse = no_record_parse
        self.ensure_http_headers = ensure_http_headers

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        # offset at which the next record starts
        self.offset = self.fh.tell()
        # first non-blank line already read past the previous record, if any
        self.next_line = None

        # number of "record not followed by newline" warnings emitted
        self.err_count = 0

        # single shared generator backing both __iter__ and __next__
        self.the_iter = self._iterate_records()

    def __iter__(self):
        """ Return the underlying record generator
        """
        return self.the_iter

    def __next__(self):
        """ Return the next record from the underlying generator
        """
        return six.next(self.the_iter)

    def _iterate_records(self):
        """ iterate over each record

        Generator yielding each parsed record. After the consumer resumes
        the generator, the rest of the current record is read via
        ``read_to_end`` before the next one is parsed.

        Raises ``ArchiveLoadFailed`` (via ``_raise_invalid_gzip_err``) when
        a further record is parsed from a gzip stream without a new gzip
        member having been started.
        """
        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                record = self._next_record(self.next_line)
                # flag set on the previous pass: a record followed another
                # one inside the same gzip member
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                # the loader found nothing more to parse
                empty_record = True

            # NOTE: ``record`` is not reset on EOFError, so this may be the
            # previous record; read_to_end returns early in that case
            # because member_info is already set
            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user

        Always raises ``ArchiveLoadFailed`` with ``GZIP_ERR_MSG`` formatted
        for the known format ('warc/arc' when the format is not known).
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise ArchiveLoadFailed(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

          count empty_size so that it can be subtracted from
          the record length for uncompressed

          if first line read is not blank, likely error in WARC/ARC,
          display a warning

        Returns a ``(line, empty_size)`` tuple: ``line`` is the first
        non-blank line found after the first line read, or ``None`` when
        the reader is exhausted; ``empty_size`` is the total length of the
        lines consumed before it.
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            # empty read: nothing left in the reader
            if len(line) == 0:
                return None, empty_size

            stripped = line.rstrip()

            # the first line is always consumed and counted, blank or not
            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))
                    self.err_count += 1

                first_line = False
                continue

            return line, empty_size

    def read_to_end(self, record):
        """ Read and discard the remainder of the record's raw stream,
        consume the blank lines that follow it, then store the record's
        ``(offset, length)`` in ``self.member_info`` and advance
        ``self.offset`` to the start of the next record.

        Does nothing (returns None) if ``member_info`` is already set.
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        curr_offset = self.offset

        # drain whatever is left of the record body
        while True:
            b = record.raw_stream.read(BUFF_SIZE)
            if not b:
                break

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        # file position minus the reader's rem_length() value
        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        # the line read ahead belongs to the next record, so back it out
        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        # uncompressed: separator lines are not counted in the record length
        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records

        ``next_line`` is the line already read ahead by the previous
        ``read_to_end`` call (or None). Resets ``member_info`` and returns
        the parsed record.
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse,
                                                 self.ensure_http_headers)

        self.member_info = None

        # Track known format for faster parsing of other records
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record
Exemple #29
0
class RewriterApp(object):
    VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'

    DEFAULT_CSP = "default-src 'unsafe-eval' 'unsafe-inline' 'self' data: blob: mediastream: ws: wss: ; form-action 'self'"

    def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None):
        """Set up rewriters, views and header settings for the app.

        :param bool framed_replay: Whether replay happens in framed mode;
            selects the frame / replay modifiers
        :param jinja_env: Optional ``JinjaEnv``; a default one is created
            when omitted
        :param dict|None config: Optional config dictionary
        :param dict|None paths: Optional mapping of path names to URLs
        """
        self.loader = ArcWarcRecordLoader()

        # normalised copies: every lookup below goes through self.config so
        # that the default ``config=None`` does not fail on ``None.get(...)``
        self.config = config or {}
        self.paths = paths or {}

        self.framed_replay = framed_replay

        if framed_replay:
            self.frame_mod = ''
            self.replay_mod = 'mp_'
        else:
            self.frame_mod = None
            self.replay_mod = ''

        # the rewriter still receives the caller's original ``config`` value
        self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
                                          config=config)

        self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)

        if not jinja_env:
            jinja_env = JinjaEnv(globals={'static_path': 'static'})

        self.jinja_env = jinja_env

        self.redirect_to_exact = self.config.get('redirect_to_exact')

        self.banner_view = BaseInsertView(self.jinja_env, self._html_templ('banner_html'))

        self.head_insert_view = HeadInsertView(self.jinja_env,
                                               self._html_templ('head_insert_html'),
                                               self.banner_view)

        self.frame_insert_view = TopFrameView(self.jinja_env,
                                               self._html_templ('frame_insert_html'),
                                               self.banner_view)

        self.error_view = BaseInsertView(self.jinja_env, self._html_templ('error_html'))
        self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
        self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html'))

        self.use_js_obj_proxy = self.config.get('use_js_obj_proxy', True)

        self.cookie_tracker = None

        self.enable_memento = self.config.get('enable_memento')

        # an explicitly empty 'csp-header' setting disables the header
        csp_header = self.config.get('csp-header', self.DEFAULT_CSP)
        if csp_header:
            self.csp_header = ('Content-Security-Policy', csp_header)
        else:
            self.csp_header = None

    def add_csp_header(self, wb_url, status_headers):
        """Append the CSP header when one is configured and the request
        modifier equals the replay modifier.
        """
        if not self.csp_header:
            return

        if wb_url.mod == self.replay_mod:
            status_headers.headers.append(self.csp_header)

    def _html_templ(self, name):
        value = self.config.get(name)
        if not value:
            value = name.replace('_html', '.html')
        return value

    def is_framed_replay(self, wb_url):
        """Tell whether ``wb_url`` addresses the outer replay frame.

        Holds when framed replay is enabled, the url's modifier equals the
        frame modifier and the url is a replay url.
        """
        if not self.framed_replay:
            # hand back the falsy setting itself
            return self.framed_replay

        if wb_url.mod != self.frame_mod:
            return False

        return wb_url.is_replay()

    def _check_accept_dt(self, wb_url, environ):
        is_timegate = False
        if wb_url.is_latest_replay():
            accept_dt = environ.get('HTTP_ACCEPT_DATETIME')
            is_timegate = True
            if accept_dt:
                try:
                    wb_url.timestamp = http_date_to_timestamp(accept_dt)
                except:
                    raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime')
                    #return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')

                wb_url.type = wb_url.REPLAY

        return is_timegate

    def _check_range(self, inputreq, wb_url):
        skip_record = False
        range_start = None
        range_end = None

        rangeres = inputreq.extract_range()

        if not rangeres:
            return range_start, range_end, skip_record

        mod_url, start, end, use_206 = rangeres

        # remove the range and still proxy
        if not use_206:
            return range_start, range_end, skip_record

        wb_url.url = mod_url
        inputreq.url = mod_url

        range_start = start
        range_end = end

        #if start with 0, load from upstream, but add range after
        if start == 0:
            del inputreq.env['HTTP_RANGE']
        else:
            skip_record = True

        return range_start, range_end, skip_record

    def _add_range(self, record, wb_url, range_start, range_end):
        if range_end is None and range_start is None:
            return

        if record.http_headers.get_statuscode() != '200':
            return

        content_length = (record.http_headers.
                          get_header('Content-Length'))
        try:
            content_length = int(content_length)
            if not range_end:
                range_end = content_length - 1

            if range_start >= content_length or range_end >= content_length:
                details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length)
                try:
                    r.raw.close()
                except:
                    pass

                raise UpstreamException(416, url=wb_url.url, details=details)

            range_len = range_end - range_start + 1
            record.http_headers.add_range(range_start, range_len,
                                          content_length)

            record.http_headers.replace_header('Content-Length', str(range_len))

            record.raw_stream = OffsetLimitReader(record.raw_stream, range_start, range_len)
            return True

        except (ValueError, TypeError):
            pass

    def render_content(self, wb_url, kwargs, environ):
        """Fetch the record for ``wb_url`` upstream and return it rewritten.

        :param str wb_url: The requested url string; parsed into a
            ``WbUrl`` after escaping ``#``
        :param dict kwargs: Routing arguments, passed on to the helper
            methods (custom response, cookie key, upstream request)
        :param dict environ: The WSGI environ
        :return: A ``WbResponse`` with the rewritten content, a redirect,
            or the result of ``format_response`` for a custom response
        :raises UpstreamException: when the upstream request returns a
            status of 400 or above
        """
        # escape '#' so it stays part of the url when parsed
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)
        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix

        is_proxy = ('wsgiprox.proxy_host' in environ)

        # a custom response (e.g. the top frame) short-circuits the replay
        response = self.handle_custom_response(environ, wb_url,
                                               full_prefix, host_prefix,
                                               kwargs)

        if response:
            return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

        if is_proxy:
            # proxy mode: identity url rewriter, never framed
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix)

            framed_replay = self.framed_replay

        # url without a path: redirect to the same url with path '/'
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            scheme, netloc, path, query, frag = url_parts
            path = '/'
            url = urlunsplit((scheme, netloc, path, query, frag))
            resp = WbResponse.redir_response(urlrewriter.rewrite(url),
                                             '307 Temporary Redirect')

            if self.enable_memento:
                resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')

            return resp

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        # pick the content rewriter according to the js proxy setting
        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(inputreq, wb_url)

        setcookie_headers = None
        # NOTE: cookie_key is only bound when a cookie tracker is set; it is
        # used again below under the same condition
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            error = None
            # best effort: read the upstream error body for the details
            try:
                error = r.raw.read()
                r.raw.close()
            except:
                pass

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url, details=details)

        # parse the upstream response body as a record
        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # NOTE(review): fails with AttributeError if the 'Warcserver-Cdx'
        # header is missing -- confirm upstream always sends it
        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        #cdx['urlkey'] = urlkey
        #cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        #cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redir to exact, redir if url or ts are different
        if self.redirect_to_exact:
            if (set_content_loc or
                (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):

                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs)

        # a served byte range switches the modifier to 'id_'
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        is_ajax = self.is_ajax(environ)

        # ajax requests get no head insert
        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.
                                    create_insert_func(wb_url,
                                                       full_prefix,
                                                       host_prefix,
                                                       top_url,
                                                       environ,
                                                       framed_replay,
                                                       config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
                                                               cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        # run the content rewriter over the record
        result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # status line without a reason phrase: append a placeholder
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix,
                                    memento_dt, cdx['timestamp'], status_headers,
                                    is_timegate, is_proxy, cdx.get('source-coll'))

            set_content_loc = True

        if set_content_loc and not self.redirect_to_exact:
            status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                                                       url=cdx['url'])))
        # the CSP header is only considered outside proxy mode
        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        return response

    def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
        """Wrap a custom response and add memento links when enabled.

        A ``response`` that is not already a ``WbResponse`` is turned into
        an HTML text response; the utf-8 charset is specified unless the
        page is the outer replay frame, in which case the url timestamp is
        used as the memento timestamp instead.
        """
        memento_ts = None

        if not isinstance(response, WbResponse):
            if self.is_framed_replay(wb_url):
                content_type = 'text/html'
                memento_ts = wb_url.timestamp
            else:
                # if not replay outer frame, specify utf-8 charset
                content_type = 'text/html' + '; charset=utf-8'

            response = WbResponse.text_response(response, content_type=content_type)

        if self.enable_memento:
            self._add_memento_links(wb_url.url, full_prefix, None, memento_ts,
                                    response.status_headers, is_timegate, is_proxy)

        return response

    def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
                           status_headers, is_timegate, is_proxy, coll=None):
        """Append memento-related headers to ``status_headers.headers``.

        A ``Memento-Datetime`` header is added when a memento datetime is
        available (given directly, or derived from ``memento_ts``), a
        ``Link`` header is always added, and ``Vary: accept-datetime`` is
        added for timegate responses.

        :param url: the original url
        :param full_prefix: prefix used to build memento/timegate/timemap urls
        :param memento_dt: http-date of the memento, if already known
        :param memento_ts: timestamp of the memento, if known
        :param status_headers: object whose ``headers`` list is appended to
        :param is_timegate: when true, add the ``Vary`` header
        :param is_proxy: when true, omit original/timegate/timemap links and
            use ``url`` itself as the memento url
        :param coll: passed through to ``MementoUtils.make_memento_link``
        """
        headers = status_headers.headers

        # derive the http-date from the timestamp only if none was supplied
        if memento_ts and not memento_dt:
            memento_dt = timestamp_to_http_date(memento_ts)

        memento_url = None
        if memento_dt:
            headers.append(('Memento-Datetime', memento_dt))

            if is_proxy:
                memento_url = url
            else:
                memento_url = (full_prefix + memento_ts + self.replay_mod +
                               '/' + url)

        timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix)

        links = []
        if not is_proxy:
            for target, rel in ((url, 'original'),
                                (timegate_url, 'timegate'),
                                (timemap_url, 'timemap')):
                links.append(MementoUtils.make_link(target, rel))

        if memento_dt:
            links.append(MementoUtils.make_memento_link(memento_url, 'memento',
                                                        memento_dt, coll))

        headers.append(('Link', ', '.join(links)))

        if is_timegate:
            headers.append(('Vary', 'accept-datetime'))

    def _get_timegate_timemap(self, url, full_prefix):
        """Build the timegate and timemap urls for ``url``.

        :param url: the original url
        :param full_prefix: prefix both urls start with
        :return: ``(timegate_url, timemap_url)`` tuple
        """
        # the replay modifier, when set, becomes its own path segment
        mod_segment = self.replay_mod + '/' if self.replay_mod else ''

        return (full_prefix + mod_segment + url,
                full_prefix + 'timemap/link/' + url)

    def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
        """Return ``full_prefix`` followed by ``wb_url`` rendered with an empty modifier.

        ``cdx`` and ``kwargs`` are accepted but not used here.
        """
        return full_prefix + wb_url.to_str(mod='')

    def handle_error(self, environ, ue):
        """Turn the error ``ue`` into an error response.

        A 404 status is rendered through ``_not_found_response``; any other
        status code is rendered through ``_error_response`` with a status line
        built from the code and its entry in ``HTTP_STATUS_CODES``
        (``'Unknown Error'`` when the code has no entry).

        :param environ: the WSGI environ of the request
        :param ue: error object exposing ``status_code``, ``url`` and ``msg``
        """
        code = ue.status_code
        if code == 404:
            return self._not_found_response(environ, ue.url)

        reason = HTTP_STATUS_CODES.get(code, 'Unknown Error')
        status_line = str(code) + ' ' + reason
        return self._error_response(environ, ue.url, ue.msg,
                                    status=status_line)

    def _not_found_response(self, environ, url):
        """Render the not-found view for ``url`` as a ``404 Not Found`` html response."""
        body = self.not_found_view.render_to_string(environ, url=url)
        return WbResponse.text_response(body,
                                        status='404 Not Found',
                                        content_type='text/html')

    def _error_response(self, environ, msg='', details='', status='404 Not Found'):
        """Render the error view as an html response with the given status.

        :param environ: the WSGI environ of the request
        :param msg: passed to the template as ``err_msg``
        :param details: passed to the template as ``err_details``
        :param status: status line of the resulting response
        """
        template_args = {'err_msg': msg, 'err_details': details}
        body = self.error_view.render_to_string(environ, **template_args)
        return WbResponse.text_response(body,
                                        status=status,
                                        content_type='text/html')


    def _do_req(self, inputreq, wb_url, kwargs, skip_record):
        """POST the reconstructed request to the upstream url and return the response.

        :param inputreq: object able to ``reconstruct_request`` for a url
        :param wb_url: the parsed url of the current request
        :param kwargs: passed through to ``get_upstream_url``
        :param skip_record: when true, send a ``Recorder-Skip: 1`` header
        :return: the streaming response returned by ``requests.post``
        """
        payload = inputreq.reconstruct_request(wb_url.url)

        req_headers = {'Content-Length': str(len(payload)),
                       'Content-Type': 'application/request'}
        if skip_record:
            req_headers['Recorder-Skip'] = '1'

        # a "latest" replay asks for the capture closest to now
        closest = 'now' if wb_url.is_latest_replay() else wb_url.timestamp

        query = {'url': wb_url.url,
                 'closest': closest,
                 'matchType': 'exact'}
        if wb_url.mod == 'vi_':
            query['content_type'] = self.VIDEO_INFO_CONTENT_TYPE

        target = self.get_upstream_url(wb_url, kwargs, query)

        return requests.post(target,
                             data=BytesIO(payload),
                             headers=req_headers,
                             stream=True)

    def do_query(self, wb_url, kwargs):
        """GET the upstream index for ``wb_url`` and return the response.

        The upstream url is built as for a replay request, then its
        ``/resource/postreq`` part is swapped for ``/index``.

        :param wb_url: the parsed url; supplies url and timestamp range
        :param kwargs: may carry ``output`` (defaults to ``'json'``)
        :return: the response returned by ``requests.get``
        """
        query = {'url': wb_url.url,
                 'output': kwargs.get('output', 'json'),
                 'from': wb_url.timestamp,
                 'to': wb_url.end_timestamp}

        replay_url = self.get_upstream_url(wb_url, kwargs, query)
        index_url = replay_url.replace('/resource/postreq', '/index')

        return requests.get(index_url)

    def make_timemap(self, wb_url, res, full_prefix, output):
        """Convert the upstream index response ``res`` into a text response.

        Marks ``wb_url`` as a query url. An empty upstream body yields a
        ``404 Not Found`` status; otherwise the upstream status code and
        reason are reused. For a 200 response with ``output == 'link'`` the
        body is wrapped via ``MementoUtils.wrap_timemap_header``.

        NOTE(review): if the body is non-empty but ``res.status_code`` is
        falsy, ``status`` is never assigned and the final call raises
        ``UnboundLocalError`` -- confirm whether that can occur.

        :param wb_url: the parsed url of the current request
        :param res: upstream response (``text``, ``headers``, ``status_code``,
            ``reason``)
        :param full_prefix: prefix for the timegate/timemap urls
        :param output: requested output format
        """
        wb_url.type = wb_url.QUERY

        body = res.text
        code = res.status_code

        if not body:
            status = '404 Not Found'
        elif code:
            status = str(code) + ' ' + res.reason

            if output == 'link' and code == 200:
                timegate, timemap = self._get_timegate_timemap(wb_url.url,
                                                               full_prefix)
                body = MementoUtils.wrap_timemap_header(wb_url.url,
                                                        timegate,
                                                        timemap,
                                                        body)

        return WbResponse.text_response(body,
                                        content_type=res.headers.get('Content-Type'),
                                        status=status)

    def handle_timemap(self, wb_url, kwargs, full_prefix):
        """Query the upstream index and format the result as a timemap response.

        :param wb_url: the parsed url of the current request
        :param kwargs: request params; ``output`` selects the timemap format
        :param full_prefix: prefix for the timegate/timemap urls
        """
        requested_output = kwargs.get('output')
        index_response = self.do_query(wb_url, kwargs)
        return self.make_timemap(wb_url, index_response, full_prefix,
                                 requested_output)

    def handle_query(self, environ, wb_url, kwargs, full_prefix):
        """Render the query view for ``wb_url``.

        The prefix handed to the template is recomputed from ``environ``;
        the ``kwargs`` and ``full_prefix`` arguments are not used here.
        """
        prefix = self.get_full_prefix(environ)
        return self.query_view.render_to_string(environ,
                                                url=wb_url.url,
                                                prefix=prefix)

    def get_host_prefix(self, environ):
        """Return ``scheme://host[:port]`` for the current request.

        The host is taken from the first non-empty of
        ``wsgiprox.proxy_host`` and ``HTTP_HOST``. When neither is set it is
        assembled from ``SERVER_NAME`` plus ``SERVER_PORT``, the port being
        left out when it is the default for the scheme (443 for https,
        80 otherwise).

        :param environ: the WSGI environ of the request
        """
        scheme = environ['wsgi.url_scheme']
        prefix = scheme + '://'

        # proxy host takes precedence over the regular Host header
        for key in ('wsgiprox.proxy_host', 'HTTP_HOST'):
            host = environ.get(key)
            if host:
                return prefix + host

        # no host header at all: fall back to the server name/port
        host = environ['SERVER_NAME']
        default_port = '443' if scheme == 'https' else '80'
        if environ['SERVER_PORT'] != default_port:
            host += ':' + environ['SERVER_PORT']

        return prefix + host

    def get_rel_prefix(self, environ):
        """Return the application's mount path with a trailing slash.

        :param environ: the WSGI environ of the request
        :return: ``SCRIPT_NAME`` followed by ``'/'``; just ``'/'`` when
            ``SCRIPT_NAME`` is missing or empty
        """
        # PEP 3333 lets a server omit SCRIPT_NAME when its value would be
        # empty, so fall back to '' instead of concatenating None
        return (environ.get('SCRIPT_NAME') or '') + '/'

    def get_full_prefix(self, environ):
        """Return the host prefix joined with the relative prefix for ``environ``."""
        host_part = self.get_host_prefix(environ)
        rel_part = self.get_rel_prefix(environ)
        return host_part + rel_part

    def unrewrite_referrer(self, environ, full_prefix):
        """Replace a prefixed ``HTTP_REFERER`` with the url parsed from it.

        When the referrer starts with ``full_prefix`` and something follows
        the prefix, the remainder is parsed as a ``WbUrl`` and its ``url``
        is written back into ``environ['HTTP_REFERER']``.

        :param environ: the WSGI environ of the request (modified in place)
        :param full_prefix: prefix to strip from the referrer
        :return: ``True`` if the referrer was replaced, ``False`` otherwise
        """
        referrer = environ.get('HTTP_REFERER')
        if not referrer or not referrer.startswith(full_prefix):
            return False

        remainder = referrer[len(full_prefix):]
        if not remainder:
            return False

        environ['HTTP_REFERER'] = WbUrl(remainder).url
        return True

    def is_ajax(self, environ):
        """Return ``True`` if the request declares itself an XMLHttpRequest.

        Looks at ``HTTP_X_REQUESTED_WITH`` first and falls back to
        ``HTTP_X_PYWB_REQUESTED_WITH``; the comparison ignores case.
        """
        requested_with = (environ.get('HTTP_X_REQUESTED_WITH') or
                          environ.get('HTTP_X_PYWB_REQUESTED_WITH'))

        return bool(requested_with) and requested_with.lower() == 'xmlhttprequest'

    def get_base_url(self, wb_url, kwargs):
        """Return the ``self.paths`` template selected by ``kwargs['type']``, formatted with ``kwargs``.

        ``wb_url`` is accepted but not used here.
        """
        path_key = kwargs.get('type')
        template = self.paths[path_key]
        return template.format(**kwargs)

    def get_upstream_url(self, wb_url, kwargs, params):
        """Return the base upstream url with ``params`` appended as a query string.

        :param wb_url: passed through to ``get_base_url``
        :param kwargs: passed through to ``get_base_url``
        :param params: query params; sequence values are expanded into
            repeated keys
        :return: the base url, extended with ``?`` or ``&`` plus the encoded
            params when there are any
        """
        upstream = self.get_base_url(wb_url, kwargs)
        query = urlencode(params, True)
        if not query:
            return upstream

        # extend an existing query string rather than starting a second one
        separator = '&' if '?' in upstream else '?'
        return upstream + separator + query

    def get_cookie_key(self, kwargs):
        """Placeholder with no implementation in this class.

        Presumably meant to be overridden by subclasses -- no override is
        visible here.

        :param kwargs: request params (unused by this placeholder)
        :raises NotImplementedError: always
        """
        # ``NotImplemented`` is a non-callable constant meant for binary
        # operator fallbacks; the exception type is NotImplementedError
        raise NotImplementedError()

    def _add_custom_params(self, cdx, headers, kwargs):
        """Do nothing; this implementation is a no-op that returns ``None``."""

    def get_top_frame_params(self, wb_url, kwargs):
        """Return extra params for the top frame; this implementation has none.

        The result is handed to the frame view as ``extra_params`` by
        ``handle_custom_response``.
        """
        extra_params = None
        return extra_params

    def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
        """Dispatch requests that are answered without replaying content.

        Checked in order:

        * an ``output`` entry in ``kwargs`` -> timemap response
        * a query url -> rendered query view
        * framed replay -> top frame from the frame insert view

        :return: the response for the first matching case, or ``None`` when
            none of them applies
        """
        if kwargs.get('output'):
            return self.handle_timemap(wb_url, kwargs, full_prefix)

        if wb_url.is_query():
            return self.handle_query(environ, wb_url, kwargs, full_prefix)

        if not self.is_framed_replay(wb_url):
            return None

        top_frame_params = self.get_top_frame_params(wb_url, kwargs)
        return self.frame_insert_view.get_top_frame(wb_url,
                                                    full_prefix,
                                                    host_prefix,
                                                    environ,
                                                    self.frame_mod,
                                                    self.replay_mod,
                                                    coll='',
                                                    extra_params=top_frame_params)