Beispiel #1
0
    def __str__(self):
        """Return this entry as text.

        A stored raw line is returned as-is (converted to a native string).
        Without one, the entry is rendered from its current fields: as JSON
        when it was built from JSON, otherwise as space-joined values.
        """
        raw_line = self.cdxline
        if raw_line:
            return to_native_str(raw_line, 'utf-8')

        # No raw line available: serialize from the fields themselves.
        if self._from_json:
            return json_encode(self)

        rendered_values = [str(field) for field in six.itervalues(self)]
        return ' '.join(rendered_values)
Beispiel #2
0
    def __call__(self, filename, cdx):
        """Yield the paths recorded for ``filename`` in the path index file.

        Each line produced by ``iter_exact`` is split on tabs; the first
        column is skipped and every remaining column is yielded as a native
        string. ``cdx`` is accepted but not used by this lookup.
        """
        with open(self.pathindex_file, 'rb') as index_fh:
            lookup_key = filename.encode('utf-8')

            for matched_line in iter_exact(index_fh, lookup_key, b'\t'):
                # Column 0 is skipped; only the columns after it are paths.
                for raw_path in matched_line.split(b'\t')[1:]:
                    yield to_native_str(raw_path, 'utf-8')
Beispiel #3
0
    def __init__(self, cdxline=b''):
        """Populate the entry from a raw CDX line given as bytes.

        An empty line leaves the mapping unpopulated so fields can be filled
        in later. After splitting on the first two spaces, a line whose last
        part begins with ``{`` is read as CDX JSON; any other line is split
        on every space and matched against ``CDX_FORMATS`` by field count.

        :param bytes cdxline: raw index line; trailing whitespace is dropped
        :raises CDXException: if no known format has the line's field count
        """
        OrderedDict.__init__(self)

        cdxline = cdxline.rstrip()
        self._from_json = False
        self._cached_json = None

        # Allows for filling the fields later or in a custom way
        if not cdxline:
            self.cdxline = cdxline
            return

        parts = cdxline.split(b' ', 2)
        trailing = parts[-1]

        if trailing.startswith(b'{'):
            # CDX JSON layout: the two leading parts are key and timestamp,
            # the remainder is a JSON object holding the other fields.
            self[URLKEY] = to_native_str(parts[0], 'utf-8')
            self[TIMESTAMP] = to_native_str(parts[1], 'utf-8')

            decoded = json_decode(to_native_str(trailing, 'utf-8'))
            for key, value in six.iteritems(decoded):
                key = to_native_str(key, 'utf-8')
                key = self.CDX_ALT_FIELDS.get(key, key)

                if key == 'url':
                    # Percent-encode urls that are not plain ascii.
                    try:
                        value.encode('ascii')
                    except UnicodeEncodeError:
                        value = quote(value.encode('utf-8'), safe=':/')

                if key != 'filename':
                    value = to_native_str(value, 'utf-8')

                self[key] = value

            self.cdxline = cdxline
            self._from_json = True
            return

        # Plain CDX: break the remaining part up on every space.
        parts = parts[:-1] + trailing.split(b' ')
        field_count = len(parts)

        # No early exit: when several formats share a length, the last wins.
        matched_format = None
        for candidate in self.CDX_FORMATS:
            if len(candidate) == field_count:
                matched_format = candidate

        if not matched_format:
            raise CDXException(
                'unknown {0}-field cdx format'.format(field_count))

        for header, raw_value in zip(matched_format, parts):
            self[header] = to_native_str(raw_value, 'utf-8')

        self.cdxline = cdxline
Beispiel #4
0
    def __call__(self, filename, cdx):
        """Look up ``filename`` in the redis hash named by the key template.

        If ``cdx`` carries a truthy ``_formatter``, it formats the key
        template and supplies the scan parameters. A key containing ``*`` is
        expanded through ``scan_keys`` and the first truthy hit is used.

        :return: the stored value converted with ``to_native_str``
        """
        key_pattern = self.redis_key_template
        scan_params = {}

        formatter = getattr(cdx, '_formatter', None)
        if formatter:
            key_pattern = formatter.format(key_pattern)
            scan_params = formatter.params

        if '*' not in key_pattern:
            found = self.redis.hget(key_pattern, filename)
        else:
            found = None
            for candidate_key in self.scan_keys(key_pattern, scan_params):
                found = self.redis.hget(candidate_key, filename)
                if found:
                    break

        return to_native_str(found, 'utf-8')
Beispiel #5
0
    def __init__(self, idxline):
        """Populate the entry from a tab-separated idx line given as bytes.

        Columns are paired with ``self.FORMAT``; ``offset`` and ``length``
        are converted to ``int``, as is ``lineno`` when present and truthy.

        :raises CDXException: if fewer than ``NUM_REQ_FIELDS`` columns exist
        """
        OrderedDict.__init__(self)

        idxline = idxline.rstrip()
        columns = idxline.split(b'\t')
        found = len(columns)

        if found < self.NUM_REQ_FIELDS:
            template = 'invalid idx format: {0} fields found, {1} required'
            raise CDXException(template.format(found, self.NUM_REQ_FIELDS))

        for name, raw_value in zip(self.FORMAT, columns):
            self[name] = to_native_str(raw_value, 'utf-8')

        # These two are always required to be numeric.
        for numeric_field in ('offset', 'length'):
            self[numeric_field] = int(self[numeric_field])

        lineno = self.get('lineno')
        if lineno:
            self['lineno'] = int(lineno)

        self.idxline = idxline
Beispiel #6
0
    def serve_content(self, environ, coll='$root', url='', timemap_output='', record=False):
        """Render the content for a URL in a collection via the rewriter app.

        The URL to render is taken from ``REQUEST_URI`` when it starts with
        ``SCRIPT_NAME + '/'``; otherwise it is built from ``url`` plus the
        query string, if any.

        :param dict environ: The WSGI environment dictionary for the request
        :param str coll: The name of the collection the record is to be served from
        :param str url: The URL of the record, used when REQUEST_URI cannot be used
        :param str timemap_output: When set, stored as the ``output`` metadata
            value and the ``timemap/<timemap_output>/`` segment is removed from the URL
        :param bool record: When true, the metadata ``type`` is set to ``record``
        :return: The response produced by the rewriter app
        :rtype: WbResponse
        :raises HTTPException: wrapping the error response if an
            UpstreamException occurs while rendering
        """
        if not self.is_valid_coll(coll):
            self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))

        self.setup_paths(environ, coll, record)

        request_uri = environ.get('REQUEST_URI')
        script_name = environ.get('SCRIPT_NAME', '') + '/'

        from_request_uri = bool(request_uri) and request_uri.startswith(script_name)
        if from_request_uri:
            wb_url_str = request_uri[len(script_name):]
        else:
            wb_url_str = to_native_str(url)

            query_string = environ.get('QUERY_STRING')
            if query_string:
                wb_url_str += '?' + query_string

        metadata = self.get_metadata(coll)
        if record:
            metadata['type'] = 'record'

        if timemap_output:
            metadata['output'] = timemap_output
            # ensure that the timemap path information is not included
            timemap_prefix = 'timemap/{0}/'.format(timemap_output)
            wb_url_str = wb_url_str.replace(timemap_prefix, '')

        try:
            return self.rewriterapp.render_content(wb_url_str, metadata, environ)
        except UpstreamException as ue:
            error_response = self.rewriterapp.handle_error(environ, ue)
            raise HTTPException(response=error_response)
Beispiel #7
0
    def __call__(self, cdx, params):
        """Load the record for ``cdx`` and build response headers and body.

        :param cdx: index entry; ``to_cdxj()`` is called on it and
            ``recorder_skip`` may be set on it
        :param params: request parameters; ``compress`` and
            ``recorder_skip`` are read here
        :return: ``(None, None)`` when ``load_resource`` returns nothing,
            otherwise ``(out_headers, body_iterator)``
        """
        entry = self.load_resource(cdx, params)
        if not entry:
            return None, None

        want_gzip = params.get('compress') == 'gzip'

        warc_headers, other_headers, stream = entry

        source = self._get_source_id(cdx)

        out_headers = {
            'Warcserver-Type': 'warc',
            'Content-Type': 'application/warc-record',
        }

        if params.get('recorder_skip'):
            out_headers['Recorder-Skip'] = '1'
            cdx['recorder_skip'] = '1'

        out_headers['Warcserver-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
        out_headers['Warcserver-Source-Coll'] = to_native_str(source)

        if not warc_headers:
            # No WARC headers: copy selected headers from the other set.
            if other_headers:
                copied_names = ['Link', 'Memento-Datetime']
                if not want_gzip:
                    copied_names.append('Content-Length')

                for header_name in copied_names:
                    out_headers[header_name] = other_headers.get(header_name)

            return out_headers, StreamIter(stream, closer=call_release_conn)

        target_uri = warc_headers.get_header('WARC-Target-URI')
        out_headers['WARC-Target-URI'] = target_uri
        out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

        warc_date = warc_headers.get_header('WARC-Date')
        memento_dt = iso_date_to_datetime(warc_date)
        out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)

        warc_headers_buff = warc_headers.to_bytes()

        # Content length is only set for uncompressed output. The result is
        # kept in ``lenset`` for the disabled chunked-encoding fallback below.
        if want_gzip:
            lenset = False
        else:
            record_length = warc_headers.get_header('Content-Length')
            lenset = self._set_content_len(record_length,
                                           out_headers,
                                           len(warc_headers_buff))

        body_iter = StreamIter(stream,
                               header1=warc_headers_buff,
                               header2=other_headers,
                               closer=call_release_conn)

        if want_gzip:
            body_iter = compress_gzip_iter(body_iter)
            out_headers['Content-Encoding'] = 'gzip'

        #if not lenset:
        #    out_headers['Transfer-Encoding'] = 'chunked'
        #    streamiter = chunk_encode_iter(streamiter)

        return out_headers, body_iter
Beispiel #8
0
    def parse(self, stream, full_statusline=None):
        """
        Read a status line and headers from ``stream`` and return a
        StatusAndHeaders object.

        Header lines that start with a space or tab are treated as
        continuations of the previous header value. Lines without a colon
        are skipped. If ``full_statusline`` is given it is used instead of
        reading the first line from the stream.

        Raises EOFError when the status line has zero length, and
        StatusAndHeadersParserException when ``self.verify`` is set and the
        status line does not start with one of ``self.statuslist``.
        """
        def next_line():
            return to_native_str(stream.readline())

        # status line w newlines intact
        if full_statusline is not None:
            full_statusline = to_native_str(full_statusline)
        else:
            full_statusline = next_line()

        statusline, total_read = _strip_count(full_statusline, 0)

        headers = []

        # at end of stream
        if total_read == 0:
            raise EOFError()

        if not statusline:
            return StatusAndHeaders(statusline=statusline,
                                    headers=headers,
                                    protocol='',
                                    total_len=total_read)

        # validate only if verify is set
        if not self.verify:
            protocol_status = statusline.split(' ', 1)
        else:
            protocol_status = self.split_prefix(statusline, self.statuslist)

            if not protocol_status:
                template = 'Expected Status Line starting with {0} - Found: {1}'
                raise StatusAndHeadersParserException(
                    template.format(self.statuslist, statusline),
                    full_statusline)

        line, total_read = _strip_count(next_line(), total_read)
        while line:
            name, colon, remainder = line.partition(':')
            if colon:
                name = name.rstrip(' \t')
                value = remainder.lstrip()
            else:
                # No colon: not a header, but continuation lines that
                # follow it still have to be consumed.
                value = None

            following, total_read = _strip_count(next_line(), total_read)

            # append continuation lines, if any
            while following and following.startswith((' ', '\t')):
                if value is not None:
                    value += following
                following, total_read = _strip_count(next_line(), total_read)

            if value is not None:
                headers.append((name, value))

            line = following

        status_text = ''
        if len(protocol_status) > 1:
            status_text = protocol_status[1].strip()

        return StatusAndHeaders(statusline=status_text,
                                headers=headers,
                                protocol=protocol_status[0],
                                total_len=total_read)
Beispiel #9
0
 def handle_binary(query):
     """Return ``query`` base64-encoded behind a ``__wb_post_data=`` prefix."""
     encoded = to_native_str(base64.b64encode(query))
     return '__wb_post_data=' + encoded
Beispiel #10
0
    def __init__(self, method, mime, length, stream,
                       buffered_stream=None,
                       environ=None):
        """
        Build ``self.query`` from the request method and, for POST, its body.

        For HEAD/OPTIONS the query is ``__pywb_method=<method>``.
        For POST, up to ``length`` bytes are read from ``stream`` and
        converted to query params where possible:
        application/x-www-form-urlencoded is unquoted, multipart/* is parsed
        and url-encoded, application/x-amf goes through ``self.amf_parse``,
        and anything else is base64-encoded as ``__wb_post_data=<b64>``.
        A multipart body that cannot be parsed (e.g. the Content-Type lacks
        boundary info) also falls back to the base64 form.

        For any other method, or a missing/invalid/non-positive ``length``,
        ``self.query`` is left as ``b''``.

        :param method: HTTP method name (case-insensitive)
        :param mime: request Content-Type, may be None or empty
        :param length: request Content-Length, anything ``int()`` accepts
        :param stream: object with ``read(n)`` returning the body bytes
        :param buffered_stream: if given, the body read is written to it and
            it is rewound to position 0
        :param environ: passed through to ``self.amf_parse``
        """
        self.query = b''

        method = method.upper()

        if method in ('OPTIONS', 'HEAD'):
            self.query = '__pywb_method=' + method.lower()
            return

        if method != 'POST':
            return

        try:
            length = int(length)
        except (ValueError, TypeError):
            return

        if length <= 0:
            return

        # Collect chunks and join once; repeated bytes ``+=`` copies the
        # whole buffer on every read.
        chunks = []

        while length > 0:
            buff = stream.read(length)
            length -= len(buff)

            if not buff:
                break

            chunks.append(buff)

        query = b''.join(chunks)

        if buffered_stream:
            buffered_stream.write(query)
            buffered_stream.seek(0)

        if not mime:
            mime = ''

        def handle_binary(data):
            # Opaque body: carry it base64-encoded in a single param.
            return '__wb_post_data=' + to_native_str(base64.b64encode(data))

        if mime.startswith('application/x-www-form-urlencoded'):
            query = to_native_str(query)
            query = unquote_plus(query)

        elif mime.startswith('multipart/'):
            env = {'REQUEST_METHOD': 'POST',
                   'CONTENT_TYPE': mime,
                   'CONTENT_LENGTH': len(query)}

            args = dict(fp=BytesIO(query),
                        environ=env,
                        keep_blank_values=True)

            if PY3:
                args['encoding'] = 'utf-8'

            try:
                data = cgi.FieldStorage(**args)
            except ValueError:
                # Content-Type multipart/form-data may lack "boundary" info;
                # treat the body as opaque instead of failing the request.
                query = handle_binary(query)
            else:
                values = [(item.name, item.value) for item in data.list]
                query = urlencode(values, True)

        elif mime.startswith('application/x-amf'):
            query = self.amf_parse(query, environ)

        else:
            query = handle_binary(query)

        self.query = query
Beispiel #11
0
 def __str__(self):
     """Return ``<type_>:<digest>`` with the digest base32-encoded."""
     prefix = self.type_ + ':'
     encoded_digest = base64.b32encode(self.digester.digest())
     return prefix + to_native_str(encoded_digest)
Beispiel #12
0
    def __init__(self, method, mime, length, stream,
                       buffered_stream=None,
                       environ=None):
        """
        Derive ``self.query`` from the request method and, for POST, its body.

        HEAD/OPTIONS produce ``__pywb_method=<method>``.
        For POST, at most ``min(length, MAX_POST_SIZE)`` bytes are read from
        ``stream`` and turned into query params when possible:
        application/x-www-form-urlencoded and multipart/* are decoded,
        application/x-amf is passed to ``self.amf_parse``, and everything
        else (including bodies that fail to decode) is base64-encoded as
        ``__wb_post_data=<b64>``.

        Any other method, or an invalid or non-positive ``length``, leaves
        ``self.query`` as ``b''``.
        """
        self.query = b''

        method = method.upper()

        if method in ('OPTIONS', 'HEAD'):
            self.query = '__pywb_method=' + method.lower()
            return

        if method != 'POST':
            return

        try:
            remaining = int(length)
        except (ValueError, TypeError):
            return

        if remaining <= 0:
            return

        # max POST query allowed, for size considerations, only read upto this size
        remaining = min(remaining, self.MAX_POST_SIZE)

        chunks = []
        while remaining > 0:
            chunk = stream.read(remaining)
            if not chunk:
                break

            remaining -= len(chunk)
            chunks.append(chunk)

        query = b''.join(chunks)

        if buffered_stream:
            buffered_stream.write(query)
            buffered_stream.seek(0)

        mime = mime or ''

        def as_post_data(raw):
            # Opaque body: carry it base64-encoded in a single param.
            return '__wb_post_data=' + to_native_str(base64.b64encode(raw))

        if mime.startswith('application/x-www-form-urlencoded'):
            try:
                query = to_native_str(query.decode('utf-8'))
                query = unquote_plus(query)
            except UnicodeDecodeError:
                query = as_post_data(query)

        elif mime.startswith('multipart/'):
            field_args = {
                'fp': BytesIO(query),
                'environ': {'REQUEST_METHOD': 'POST',
                            'CONTENT_TYPE': mime,
                            'CONTENT_LENGTH': len(query)},
                'keep_blank_values': True,
            }

            if PY3:
                field_args['encoding'] = 'utf-8'

            try:
                form = cgi.FieldStorage(**field_args)
            except ValueError:
                # Content-Type multipart/form-data may lack "boundary" info
                query = as_post_data(query)
            else:
                pairs = [(item.name, item.value) for item in form.list]
                query = urlencode(pairs, True)

        elif mime.startswith('application/x-amf'):
            query = self.amf_parse(query, environ)

        else:
            query = as_post_data(query)

        self.query = query
Beispiel #13
0
 def __str__(self):
     """Return the stored idx line as a native string (UTF-8)."""
     raw_line = self.idxline
     return to_native_str(raw_line, 'utf-8')
Beispiel #14
0
    def __call__(self, cdx, params):
        """Resolve ``cdx`` to a record and return ``(headers, body_iter)``.

        ``(None, None)`` is returned when ``load_resource`` finds nothing.
        If the loaded entry has no WARC headers, selected headers are copied
        from the entry's other headers and the raw stream is returned.
        Otherwise headers are derived from the WARC headers, and the body is
        gzip-compressed when ``params['compress'] == 'gzip'``.
        """
        loaded = self.load_resource(cdx, params)
        if not loaded:
            return None, None

        compress = params.get('compress') == 'gzip'

        warc_headers, other_headers, stream = loaded

        source = self._get_source_id(cdx)

        out_headers = {}
        out_headers['Warcserver-Type'] = 'warc'
        out_headers['Content-Type'] = 'application/warc-record'

        skip_recording = params.get('recorder_skip')
        if skip_recording:
            out_headers['Recorder-Skip'] = '1'
            cdx['recorder_skip'] = '1'

        cdxj_line = cdx.to_cdxj().rstrip()
        out_headers['Warcserver-Cdx'] = to_native_str(cdxj_line)
        out_headers['Warcserver-Source-Coll'] = to_native_str(source)

        if warc_headers:
            target_uri = warc_headers.get_header('WARC-Target-URI')

            out_headers['WARC-Target-URI'] = target_uri
            out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

            memento_dt = iso_date_to_datetime(
                warc_headers.get_header('WARC-Date'))
            out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)

            warc_headers_buff = warc_headers.to_bytes()

            # Only uncompressed responses get a content length. ``lenset`` is
            # kept for the disabled chunked-encoding fallback below.
            lenset = False if compress else self._set_content_len(
                warc_headers.get_header('Content-Length'),
                out_headers,
                len(warc_headers_buff))

            streamiter = StreamIter(stream,
                                    header1=warc_headers_buff,
                                    header2=other_headers,
                                    closer=call_release_conn)

            if compress:
                streamiter = compress_gzip_iter(streamiter)
                out_headers['Content-Encoding'] = 'gzip'

            #if not lenset:
            #    out_headers['Transfer-Encoding'] = 'chunked'
            #    streamiter = chunk_encode_iter(streamiter)

            return out_headers, streamiter

        # No WARC headers: forward what the other headers provide.
        if other_headers:
            out_headers['Link'] = other_headers.get('Link')
            out_headers['Memento-Datetime'] = other_headers.get(
                'Memento-Datetime')
            if not compress:
                out_headers['Content-Length'] = other_headers.get(
                    'Content-Length')

        return out_headers, StreamIter(stream, closer=call_release_conn)
Beispiel #15
0
 def __str__(self):
     """Give the raw idx line back as native text, using UTF-8."""
     stored_line = self.idxline
     return to_native_str(stored_line, 'utf-8')
Beispiel #16
0
def post_query_extract(mime, length, stream):
    """
    Read a POST/PUT body from ``stream`` and convert it to a query string.

    Up to ``length`` bytes are read. application/x-www-form-urlencoded
    bodies are unquoted and multipart/* bodies are parsed and url-encoded.
    Any other body is base64-encoded as ``__warc_post_data=<b64>``; the same
    fallback is used for a multipart body that cannot be parsed (e.g. the
    Content-Type lacks boundary info).

    :param mime: request Content-Type, may be None or empty
    :param length: content length, anything ``int()`` accepts
    :param stream: object with ``read(n)`` returning the body bytes
    :return: the query string, or None when ``length`` is not a valid
        positive integer
    """
    try:
        length = int(length)
    except (ValueError, TypeError):
        return

    if length <= 0:
        return

    # Collect chunks and join once; repeated bytes ``+=`` copies the whole
    # buffer on every read.
    chunks = []

    while length > 0:
        buff = stream.read(length)
        length -= len(buff)

        if not buff:
            break

        chunks.append(buff)

    post_query = b''.join(chunks)

    if not mime:
        mime = ''

    def _as_binary(data):
        # Opaque body: carry it base64-encoded in a single param.
        return '__warc_post_data=' + to_native_str(base64.b64encode(data))

    if mime.startswith('application/x-www-form-urlencoded'):
        post_query = to_native_str(post_query)
        post_query = unquote_plus(post_query)

    elif mime.startswith('multipart/'):
        env = {
            'REQUEST_METHOD': 'POST',
            'CONTENT_TYPE': mime,
            'CONTENT_LENGTH': len(post_query)
        }

        args = dict(fp=BytesIO(post_query),
                    environ=env,
                    keep_blank_values=True)

        if six.PY3:
            args['encoding'] = 'utf-8'

        try:
            data = cgi.FieldStorage(**args)
        except ValueError:
            # Content-Type multipart/form-data may lack "boundary" info;
            # treat the body as opaque instead of raising to the caller.
            post_query = _as_binary(post_query)
        else:
            values = [(item.name, item.value) for item in data.list]
            post_query = urlencode(values, True)

    else:
        post_query = _as_binary(post_query)

    return post_query
Beispiel #17
0
def do_encode(m, encoding='UTF-8'):
    """Return ``*=<encoding>''`` followed by the quoted first group of ``m``."""
    prefix = "*={0}''".format(encoding)
    quoted_value = quote(to_native_str(m.group(1)))
    return prefix + quoted_value
Beispiel #18
0
 def readline():
     """Read one line from the enclosing ``stream`` as a native string."""
     raw_line = stream.readline()
     return to_native_str(raw_line)
Beispiel #19
0
def query_extract(mime, length, stream, url):
    """
    Read a request body from ``stream`` and convert it to a query string.

    Up to ``length`` bytes are read (8192 when ``length`` is None). The body
    is converted according to ``mime``:

    - application/x-www-form-urlencoded: decoded as UTF-8 and unquoted
    - multipart/*: parsed and url-encoded
    - application/json: passed to ``json_parse``; on failure the body is
      reported on stderr and an empty query is returned
    - text/plain: passed to ``json_parse``, falling back to base64
    - application/x-amf: passed to ``amf_parse``
    - anything else, or a body that fails to decode: base64-encoded as
      ``__wb_post_data=<b64>``

    A non-empty result is truncated to ``MAX_QUERY_LENGTH``.

    :param mime: request Content-Type, may be None or empty
    :param length: content length; None means "unknown"
    :param stream: object with ``read(n)`` returning the body bytes
    :param url: accepted for interface compatibility; not used here
    :return: the query string, or None when ``length`` is neither None nor
        convertible to an integer
    """
    try:
        length = int(length)
    except (ValueError, TypeError):
        if length is None:
            length = 8192
        else:
            return

    # Collect chunks and join once; repeated bytes ``+=`` copies the whole
    # buffer on every read.
    chunks = []

    while length > 0:
        buff = stream.read(length)

        length -= len(buff)

        if not buff:
            break

        chunks.append(buff)

    query_data = b"".join(chunks)

    if not mime:
        mime = ""

    query = ""

    def handle_binary(query_data):
        query = base64.b64encode(query_data)
        query = to_native_str(query)
        query = "__wb_post_data=" + query
        return query

    if mime.startswith("application/x-www-form-urlencoded"):
        try:
            query = to_native_str(query_data.decode("utf-8"))
            query = unquote_plus(query)
        except UnicodeDecodeError:
            query = handle_binary(query_data)

    elif mime.startswith("multipart/"):
        env = {
            "REQUEST_METHOD": "POST",
            "CONTENT_TYPE": mime,
            "CONTENT_LENGTH": len(query_data),
        }

        args = dict(fp=BytesIO(query_data),
                    environ=env,
                    keep_blank_values=True)

        args["encoding"] = "utf-8"

        try:
            data = cgi.FieldStorage(**args)
        except ValueError:
            # Content-Type multipart/form-data may lack "boundary" info
            query = handle_binary(query_data)
        else:
            values = [(item.name, item.value) for item in data.list]
            query = urlencode(values, True)

    elif mime.startswith("application/json"):
        try:
            query = json_parse(query_data)
        except Exception:
            if query_data:
                # Best-effort diagnostic only. ``except Exception`` keeps it
                # from failing the request while still letting
                # KeyboardInterrupt/SystemExit propagate.
                try:
                    sys.stderr.write("Error parsing: " +
                                     query_data.decode("utf-8") + "\n")
                except Exception:
                    pass

            query = ""

    elif mime.startswith("text/plain"):
        try:
            query = json_parse(query_data)
        except Exception:
            query = handle_binary(query_data)

    elif mime.startswith("application/x-amf"):
        query = amf_parse(query_data)
    else:
        query = handle_binary(query_data)

    if query:
        query = query[:MAX_QUERY_LENGTH]

    return query