Example #1
0
    def handle_range(self, wbrequest, key, wbresponse_func, url, start, end,
                     use_206):
        """Serve a byte range of a response, spooling the full body to disk.

        On a cache miss the full response is obtained from
        ``wbresponse_func`` (after setting ``noredir`` on the request), its
        body is written to a temporary file under ``self.temp_dir`` and a
        YAML spec holding the file name and headers is stored in
        ``self.cache[key]``. On a cache hit that spec is loaded back. In
        both cases the requested range is then streamed from the spooled
        file.

        :param wbrequest: request object; ``custom_params['noredir']`` is
            set to ``True`` before the full response is fetched.
        :param key: cache key; must be truthy.
        :param wbresponse_func: zero-argument callable returning a response
            exposing ``status_headers`` and an iterable ``body``.
        :param url: not used by this method.
        :param start: offset of the first byte to serve.
        :param end: inclusive offset of the last byte to serve; any falsy
            value (including ``0``) means "through the end of the file".
        :param use_206: when true, ``StatusAndHeaders.add_range`` is applied
            to the returned headers.
        :return: ``(status_headers, body_iterator)``. If the fetched
            response's status code does not start with ``'200'``, its own
            ``status_headers`` and ``body`` are returned unchanged and
            nothing is cached.
        """
        # key must be set
        assert key
        if key not in self.cache:
            wbrequest.custom_params['noredir'] = True
            response = wbresponse_func()

            # only cache 200 responses
            if not response.status_headers.get_statuscode().startswith('200'):
                return response.status_headers, response.body

            # Lazily create the spool directory on first use.
            # NOTE: no size check is performed on an already existing
            # directory (the _check_dir_size call here was disabled).
            if not self.temp_dir:
                self.temp_dir = mkdtemp(prefix='_pywbcache')

            # delete=False: the file must outlive this block, it is read
            # back by read_range() on this and on later (cached) requests.
            with NamedTemporaryFile(delete=False, dir=self.temp_dir) as fh:
                for obj in response.body:
                    fh.write(obj)

            spec = dict(name=fh.name, headers=response.status_headers.headers)

            self.cache[key] = yaml.dump(spec)
        else:
            # Pass the Loader explicitly: PyYAML >= 6 raises TypeError when
            # it is omitted (5.1+ warns). yaml.Loader is the historical
            # default, so previously cached specs keep loading.
            # NOTE(security): yaml.Loader can construct arbitrary Python
            # objects; if self.cache may hold untrusted data, move both
            # dump and load to the safe_* variants instead.
            spec = yaml.load(self.cache[key], Loader=yaml.Loader)

            # Normalise headers to tuples in case they came back as lists.
            spec['headers'] = [tuple(x) for x in spec['headers']]

        filelen = os.path.getsize(spec['name'])

        # Bytes available from `start` to the end of the spooled file.
        maxlen = filelen - start

        if end:
            # `end` is inclusive, hence the +1.
            maxlen = min(maxlen, end - start + 1)

        def read_range():
            # Lazily stream at most `maxlen` bytes beginning at `start`.
            with open(spec['name'], 'rb') as fh:
                fh.seek(start)
                limited = LimitReader.wrap_stream(fh, maxlen)
                while True:
                    buf = limited.read()
                    if not buf:
                        break

                    yield buf

        status_headers = StatusAndHeaders('200 OK', spec['headers'])

        if use_206:
            StatusAndHeaders.add_range(status_headers, start, maxlen, filelen)

        status_headers.replace_header('Content-Length', str(maxlen))

        return status_headers, read_range()
Example #2
0
def handle_range(self, wbrequest, key, wbresponse_func, url, start, end,
                 use_206):
    """Serve a byte range of a response directly from memory.

    The full response is fetched via ``wbresponse_func`` on every call
    (after setting ``noredir`` on the request), its body chunks are
    concatenated into one bytestring and the requested range is streamed
    from that bytestring. Nothing is written to disk or cached.

    :param self: unused here; presumably present so the function can
        replace the upstream method of the same name -- TODO confirm.
    :param wbrequest: request object; ``custom_params['noredir']`` is set
        to ``True`` before the full response is fetched.
    :param key: not used by this implementation.
    :param wbresponse_func: zero-argument callable returning a response
        exposing ``status_headers`` and an iterable ``body`` of bytes.
    :param url: not used by this implementation.
    :param start: offset of the first byte to serve.
    :param end: inclusive offset of the last byte to serve; any falsy value
        (including ``0``) means "through the end of the body".
    :param use_206: when true, ``StatusAndHeaders.add_range`` is applied to
        the returned headers.
    :return: ``(status_headers, body_iterator)``.
    """
    # adapt handle_range so it reads ranges directly from the response bytestring
    # rather than creating cache files on disk:
    # we cache warcs in redis, and these files can be huge and don't seem to be reliably cleaned up
    # https://github.com/harvard-lil/perma/issues/2428
    # original: https://github.com/webrecorder/pywb/blob/0.32.0/pywb/webapp/rangecache.py#L27

    # begin Perma changes 1
    wbrequest.custom_params['noredir'] = True
    response = wbresponse_func()
    # Concatenate the chunks back-to-back, exactly as the file-based
    # original wrote them. Joining with b"\n" would inject extra bytes
    # between chunks, corrupting the payload and inflating the length.
    joined = b"".join(response.body)
    filelen = len(joined)
    # end Perma changes 1

    # Bytes available from `start` to the end of the body.
    maxlen = filelen - start

    if end:
        # `end` is inclusive, hence the +1.
        maxlen = min(maxlen, end - start + 1)

    def read_range():
        # Lazily stream at most `maxlen` bytes beginning at `start`.
        with io.BytesIO(
                joined
        ) as fh:  # Perma changes 2: replaced real file w/ BytesIO
            fh.seek(start)
            limited = LimitReader.wrap_stream(fh, maxlen)
            while True:
                buf = limited.read()
                if not buf:
                    break

                yield buf

    # begin Perma changes 3
    status_headers = StatusAndHeaders('200 OK',
                                      response.status_headers.headers)
    # end Perma changes 3

    if use_206:
        StatusAndHeaders.add_range(status_headers, start, maxlen, filelen)

    status_headers.replace_header('Content-Length', str(maxlen))

    return status_headers, read_range()
Example #3
0
    def handle_range(self, wbrequest, key, wbresponse_func,
                     url, start, end, use_206):
        """Return headers and a body iterator for a byte range of a response.

        The full response body is spooled once to a temporary file and a
        YAML spec (file name plus headers) is kept in ``self.cache`` under
        ``key``; later calls with the same key reuse the spooled file.

        :param wbrequest: request object; ``custom_params['noredir']`` is
            set to ``True`` before the full response is fetched.
        :param key: cache key; must be truthy.
        :param wbresponse_func: zero-argument callable returning a response
            exposing ``status_headers`` and an iterable ``body``.
        :param url: not used by this method.
        :param start: offset of the first byte to serve.
        :param end: inclusive offset of the last byte to serve; any falsy
            value (including ``0``) means "through the end of the file".
        :param use_206: when true, ``StatusAndHeaders.add_range`` is applied
            to the returned headers.
        :return: ``(status_headers, body_iterator)``. A fetched response
            whose status code does not start with ``'200'`` is passed
            through unchanged (its own headers and body) and is not cached.
        """
        # key must be set
        assert key
        if key not in self.cache:
            wbrequest.custom_params['noredir'] = True
            response = wbresponse_func()

            # only cache 200 responses
            if not response.status_headers.get_statuscode().startswith('200'):
                return response.status_headers, response.body

            # Create the spool directory on first use. An existing
            # directory is used as-is: the _check_dir_size call that used
            # to sit here was disabled.
            if not self.temp_dir:
                self.temp_dir = mkdtemp(prefix='_pywbcache')

            # delete=False keeps the file around after the with-block so
            # read_range() (and later cache hits) can reopen it by name.
            with NamedTemporaryFile(delete=False, dir=self.temp_dir) as fh:
                for obj in response.body:
                    fh.write(obj)

            spec = dict(name=fh.name,
                        headers=response.status_headers.headers)

            self.cache[key] = yaml.dump(spec)
        else:
            # An explicit Loader is required by PyYAML >= 6 (omitting it
            # raises TypeError; 5.1+ emits a warning). yaml.Loader matches
            # the old implicit default, so existing cache entries still load.
            # NOTE(security): yaml.Loader can instantiate arbitrary Python
            # objects; use safe_dump/safe_load if the cache backend could
            # ever contain untrusted content.
            spec = yaml.load(self.cache[key], Loader=yaml.Loader)

            # Headers may come back as lists; callers expect tuples.
            spec['headers'] = [tuple(x) for x in spec['headers']]

        filelen = os.path.getsize(spec['name'])

        # Number of bytes from `start` to the end of the spooled file.
        maxlen = filelen - start

        if end:
            # `end` is an inclusive offset, hence the +1.
            maxlen = min(maxlen, end - start + 1)

        def read_range():
            # Generator: yields up to `maxlen` bytes starting at `start`.
            with open(spec['name'], 'rb') as fh:
                fh.seek(start)
                limited = LimitReader.wrap_stream(fh, maxlen)
                while True:
                    buf = limited.read()
                    if not buf:
                        break

                    yield buf

        status_headers = StatusAndHeaders('200 OK', spec['headers'])

        if use_206:
            StatusAndHeaders.add_range(status_headers, start,
                                       maxlen,
                                       filelen)

        status_headers.replace_header('Content-Length', str(maxlen))

        return status_headers, read_range()