def __str__(self):
    """Render this CDX entry as a native string.

    Prefers the raw cdxline when one is present; otherwise emits JSON
    for entries built from CDXJ, or a space-joined field list for
    plain CDX entries.
    """
    if self.cdxline:
        return to_native_str(self.cdxline, 'utf-8')

    if self._from_json:
        return json_encode(self)

    return ' '.join(str(value) for value in six.itervalues(self))
def __call__(self, filename, cdx):
    """Yield every path mapped to *filename* in the path index file.

    Matching index lines are tab-separated: the filename followed by
    one or more paths. The *cdx* argument is accepted for resolver
    interface compatibility but is not consulted.
    """
    key = filename.encode('utf-8')
    with open(self.pathindex_file, 'rb') as reader:
        for line in iter_exact(reader, key, b'\t'):
            for entry in line.split(b'\t')[1:]:
                yield to_native_str(entry, 'utf-8')
def __init__(self, cdxline=b''):
    """Parse a single CDX line (plain 'N b a ...' or CDXJ) into an
    ordered field mapping.

    :param bytes cdxline: raw CDX(J) line; empty allows fields to be
        filled in later by the caller
    :raises CDXException: when the field count matches no known format
    """
    OrderedDict.__init__(self)

    cdxline = cdxline.rstrip()
    self._from_json = False
    self._cached_json = None

    # Allows for filling the fields later or in a custom way
    if not cdxline:
        self.cdxline = cdxline
        return

    fields = cdxline.split(b' ' , 2)

    # Check for CDX JSON: CDXJ lines are '<urlkey> <timestamp> {json}'
    if fields[-1].startswith(b'{'):
        self[URLKEY] = to_native_str(fields[0], 'utf-8')
        self[TIMESTAMP] = to_native_str(fields[1], 'utf-8')
        json_fields = json_decode(to_native_str(fields[-1], 'utf-8'))
        for n, v in six.iteritems(json_fields):
            n = to_native_str(n, 'utf-8')
            # map alternate JSON field names onto canonical CDX names
            n = self.CDX_ALT_FIELDS.get(n, n)

            if n == 'url':
                try:
                    v.encode('ascii')
                except UnicodeEncodeError:
                    # percent-encode non-ascii urls, keeping ':' and '/'
                    v = quote(v.encode('utf-8'), safe=':/')

            # 'filename' is left as-is (may be bytes); all others
            # normalized to native str
            if n != 'filename':
                v = to_native_str(v, 'utf-8')

            self[n] = v

        self.cdxline = cdxline
        self._from_json = True
        return

    # plain CDX: re-split the remainder into individual fields
    more_fields = fields.pop().split(b' ')
    fields.extend(more_fields)

    # pick the known CDX format with a matching field count
    cdxformat = None
    for i in self.CDX_FORMATS:
        if len(i) == len(fields):
            cdxformat = i

    if not cdxformat:
        msg = 'unknown {0}-field cdx format'.format(len(fields))
        raise CDXException(msg)

    for header, field in zip(cdxformat, fields):
        self[header] = to_native_str(field, 'utf-8')

    self.cdxline = cdxline
def __call__(self, filename, cdx):
    """Resolve *filename* to a path via a redis hash lookup.

    The hash key comes from redis_key_template, optionally expanded
    through the cdx record's _formatter. A '*' in the key triggers a
    scan over all matching keys, returning the first hit.
    """
    key_pattern = self.redis_key_template
    format_params = {}

    formatter = getattr(cdx, '_formatter', None)
    if formatter:
        key_pattern = formatter.format(key_pattern)
        format_params = formatter.params

    found = None
    if '*' not in key_pattern:
        found = self.redis.hget(key_pattern, filename)
    else:
        for candidate in self.scan_keys(key_pattern, format_params):
            found = self.redis.hget(candidate, filename)
            if found:
                break

    return to_native_str(found, 'utf-8')
def __init__(self, idxline):
    """Parse one tab-delimited idx line into named fields.

    Raises CDXException when fewer than NUM_REQ_FIELDS columns are
    present. Numeric columns ('offset', 'length', optional 'lineno')
    are converted to int; the stripped raw line is kept on idxline.
    """
    OrderedDict.__init__(self)

    stripped = idxline.rstrip()
    columns = stripped.split(b'\t')

    if len(columns) < self.NUM_REQ_FIELDS:
        msg = 'invalid idx format: {0} fields found, {1} required'
        raise CDXException(msg.format(len(columns), self.NUM_REQ_FIELDS))

    for name, raw in zip(self.FORMAT, columns):
        self[name] = to_native_str(raw, 'utf-8')

    for numeric in ('offset', 'length'):
        self[numeric] = int(self[numeric])

    line_number = self.get('lineno')
    if line_number:
        self['lineno'] = int(line_number)

    self.idxline = stripped
def serve_content(self, environ, coll='$root', url='', timemap_output='', record=False):
    """Serve the contents of a URL/Record rewriting the contents of the response when applicable.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is to be served from
    :param str url: The URL for the corresponding record to be served if it exists
    :param str timemap_output: The contents of the timemap included in the link header of the response
    :param bool record: Should the content being served by recorded (save to a warc). Only valid in record mode
    :return: WbResponse containing the contents of the record/URL
    :rtype: WbResponse
    """
    if not self.is_valid_coll(coll):
        self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))

    self.setup_paths(environ, coll, record)

    request_uri = environ.get('REQUEST_URI')
    script_name = environ.get('SCRIPT_NAME', '') + '/'

    # prefer the raw REQUEST_URI (preserves original percent-encoding)
    # when it is under this app's mount point; otherwise rebuild the
    # target from the routed url plus the query string
    if request_uri and request_uri.startswith(script_name):
        wb_url_str = request_uri[len(script_name):]
    else:
        wb_url_str = to_native_str(url)

        if environ.get('QUERY_STRING'):
            wb_url_str += '?' + environ.get('QUERY_STRING')

    metadata = self.get_metadata(coll)
    if record:
        metadata['type'] = 'record'

    if timemap_output:
        metadata['output'] = timemap_output
        # ensure that the timemap path information is not included
        wb_url_str = wb_url_str.replace('timemap/{0}/'.format(timemap_output), '')

    try:
        response = self.rewriterapp.render_content(wb_url_str, metadata, environ)
    except UpstreamException as ue:
        # convert upstream failures into an error response delivered
        # via HTTPException so the framework can surface it
        response = self.rewriterapp.handle_error(environ, ue)
        raise HTTPException(response=response)

    return response
def __call__(self, cdx, params):
    """Load the WARC record for *cdx* and build the warcserver
    response headers plus a body stream iterator.

    :return: (out_headers, stream_iter) tuple, or (None, None) when
        the underlying resource could not be loaded
    """
    entry = self.load_resource(cdx, params)
    if not entry:
        return None, None

    compress = params.get('compress') == 'gzip'

    warc_headers, other_headers, stream = entry

    source = self._get_source_id(cdx)

    out_headers = {}
    out_headers['Warcserver-Type'] = 'warc'
    out_headers['Content-Type'] = 'application/warc-record'

    if params.get('recorder_skip'):
        out_headers['Recorder-Skip'] = '1'
        cdx['recorder_skip'] = '1'

    out_headers['Warcserver-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
    out_headers['Warcserver-Source-Coll'] = to_native_str(source)

    # no parsed WARC headers: pass upstream headers through as-is
    if not warc_headers:
        if other_headers:
            out_headers['Link'] = other_headers.get('Link')
            out_headers['Memento-Datetime'] = other_headers.get('Memento-Datetime')
            if not compress:
                out_headers['Content-Length'] = other_headers.get('Content-Length')

        return out_headers, StreamIter(stream, closer=call_release_conn)

    target_uri = warc_headers.get_header('WARC-Target-URI')

    out_headers['WARC-Target-URI'] = target_uri
    out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

    memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
    out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)

    warc_headers_buff = warc_headers.to_bytes()

    # content length only meaningful when the body is not re-encoded
    if not compress:
        lenset = self._set_content_len(warc_headers.get_header('Content-Length'),
                                       out_headers,
                                       len(warc_headers_buff))
    else:
        lenset = False

    streamiter = StreamIter(stream,
                            header1=warc_headers_buff,
                            header2=other_headers,
                            closer=call_release_conn)

    if compress:
        streamiter = compress_gzip_iter(streamiter)
        out_headers['Content-Encoding'] = 'gzip'

    #if not lenset:
    #    out_headers['Transfer-Encoding'] = 'chunked'
    #    streamiter = chunk_encode_iter(streamiter)

    return out_headers, streamiter
def parse(self, stream, full_statusline=None):
    """ parse stream for status line and headers
    return a StatusAndHeaders object

    support continuation headers starting with space or tab

    :param stream: readable stream positioned at the status line
    :param full_statusline: optional pre-read status line; when None
        the first line is read from the stream
    :raises EOFError: when the stream is already at end-of-stream
    :raises StatusAndHeadersParserException: when verify is set and
        the status line does not start with an expected protocol
    """

    def readline():
        # read next line from the stream, decoded to a native str
        return to_native_str(stream.readline())

    # status line w newlines intact
    if full_statusline is None:
        full_statusline = readline()
    else:
        full_statusline = to_native_str(full_statusline)

    statusline, total_read = _strip_count(full_statusline, 0)

    headers = []

    # at end of stream
    if total_read == 0:
        raise EOFError()
    elif not statusline:
        # blank line: no status / headers present
        return StatusAndHeaders(statusline=statusline,
                                headers=headers,
                                protocol='',
                                total_len=total_read)

    # validate only if verify is set
    if self.verify:
        protocol_status = self.split_prefix(statusline, self.statuslist)

        if not protocol_status:
            msg = 'Expected Status Line starting with {0} - Found: {1}'
            msg = msg.format(self.statuslist, statusline)
            raise StatusAndHeadersParserException(msg, full_statusline)
    else:
        protocol_status = statusline.split(' ', 1)

    line, total_read = _strip_count(readline(), total_read)
    while line:
        # split 'Name: value'; a line without ':' yields a valueless
        # name which is dropped below
        result = line.split(':', 1)
        if len(result) == 2:
            name = result[0].rstrip(' \t')
            value = result[1].lstrip()
        else:
            name = result[0]
            value = None

        next_line, total_read = _strip_count(readline(), total_read)

        # append continuation lines, if any
        while next_line and next_line.startswith((' ', '\t')):
            if value is not None:
                value += next_line
            next_line, total_read = _strip_count(readline(), total_read)

        if value is not None:
            header = (name, value)
            headers.append(header)

        line = next_line

    # status text is everything after the protocol token, if present
    if len(protocol_status) > 1:
        statusline = protocol_status[1].strip()
    else:
        statusline = ''

    return StatusAndHeaders(statusline=statusline,
                            headers=headers,
                            protocol=protocol_status[0],
                            total_len=total_read)
def handle_binary(query):
    """Base64-encode raw POST bytes into a '__wb_post_data=' param."""
    encoded = to_native_str(base64.b64encode(query))
    return '__wb_post_data=' + encoded
def __init__(self, method, mime, length, stream,
             buffered_stream=None,
             environ=None):
    """
    Append the method for HEAD/OPTIONS as __pywb_method=<method>

    For POST requests, requests extract a url-encoded form from stream
    read content length and convert to query params, if possible
    Attempt to decode application/x-www-form-urlencoded or multipart/*,
    otherwise read whole block and b64encode

    :param str method: HTTP method of the request
    :param str mime: Content-Type of the request body
    :param length: Content-Length (int or str), body read only if > 0
    :param stream: readable stream containing the request body
    :param buffered_stream: optional buffer; the raw body is written to
        it and it is rewound, so the body can be replayed downstream
    :param dict environ: WSGI environ, passed through to amf_parse
    """
    self.query = b''

    method = method.upper()

    # HEAD/OPTIONS carry no body; record only the method itself
    if method in ('OPTIONS', 'HEAD'):
        self.query = '__pywb_method=' + method.lower()
        return

    if method != 'POST':
        return

    try:
        length = int(length)
    except (ValueError, TypeError):
        return

    if length <= 0:
        return

    # accumulate chunks in a list to avoid quadratic bytes concatenation
    chunks = []
    while length > 0:
        buff = stream.read(length)
        length -= len(buff)

        if not buff:
            break

        chunks.append(buff)

    query = b''.join(chunks)

    if buffered_stream:
        buffered_stream.write(query)
        buffered_stream.seek(0)

    if not mime:
        mime = ''

    def handle_binary(query):
        # fallback: store raw body base64-encoded
        query = base64.b64encode(query)
        query = to_native_str(query)
        query = '__wb_post_data=' + query
        return query

    if mime.startswith('application/x-www-form-urlencoded'):
        try:
            query = to_native_str(query.decode('utf-8'))
            query = unquote_plus(query)
        except UnicodeDecodeError:
            # body is not valid utf-8 text; keep it as base64 instead
            # of raising out of the constructor
            query = handle_binary(query)

    elif mime.startswith('multipart/'):
        env = {'REQUEST_METHOD': 'POST',
               'CONTENT_TYPE': mime,
               'CONTENT_LENGTH': len(query)}

        args = dict(fp=BytesIO(query),
                    environ=env,
                    keep_blank_values=True)

        if PY3:
            args['encoding'] = 'utf-8'

        try:
            data = cgi.FieldStorage(**args)
        except ValueError:
            # Content-Type multipart/form-data may lack "boundary" info
            query = handle_binary(query)
        else:
            values = []
            for item in data.list:
                values.append((item.name, item.value))

            query = urlencode(values, True)

    elif mime.startswith('application/x-amf'):
        query = self.amf_parse(query, environ)

    else:
        query = handle_binary(query)

    self.query = query
def __str__(self):
    """Return the digest formatted as '<type>:<base32-digest>'."""
    digest_b32 = base64.b32encode(self.digester.digest())
    return self.type_ + ':' + to_native_str(digest_b32)
def __init__(self, method, mime, length, stream,
             buffered_stream=None,
             environ=None):
    """
    Append the method for HEAD/OPTIONS as __pywb_method=<method>

    For POST requests, requests extract a url-encoded form from stream
    read content length and convert to query params, if possible
    Attempt to decode application/x-www-form-urlencoded or multipart/*,
    otherwise read whole block and b64encode

    :param str method: HTTP method of the request
    :param str mime: Content-Type of the request body
    :param length: Content-Length (int or str); body read only if > 0
    :param stream: readable stream containing the request body
    :param buffered_stream: optional buffer; the raw body is written to
        it and it is rewound, so the body can be replayed downstream
    :param dict environ: WSGI environ, passed through to amf_parse
    """
    self.query = b''

    method = method.upper()

    # HEAD/OPTIONS carry no body; record only the method itself
    if method in ('OPTIONS', 'HEAD'):
        self.query = '__pywb_method=' + method.lower()
        return

    if method != 'POST':
        return

    try:
        length = int(length)
    except (ValueError, TypeError):
        return

    if length <= 0:
        return

    # max POST query allowed, for size considerations, only read upto this size
    length = min(length, self.MAX_POST_SIZE)

    query = []
    while length > 0:
        buff = stream.read(length)
        length -= len(buff)

        if not buff:
            break

        query.append(buff)

    query = b''.join(query)

    if buffered_stream:
        buffered_stream.write(query)
        buffered_stream.seek(0)

    if not mime:
        mime = ''

    def handle_binary(query):
        # fallback: store raw body base64-encoded
        query = base64.b64encode(query)
        query = to_native_str(query)
        query = '__wb_post_data=' + query
        return query

    if mime.startswith('application/x-www-form-urlencoded'):
        try:
            query = to_native_str(query.decode('utf-8'))
            query = unquote_plus(query)
        except UnicodeDecodeError:
            # body is not valid utf-8 text; keep as base64
            query = handle_binary(query)

    elif mime.startswith('multipart/'):
        env = {'REQUEST_METHOD': 'POST',
               'CONTENT_TYPE': mime,
               'CONTENT_LENGTH': len(query)}

        args = dict(fp=BytesIO(query),
                    environ=env,
                    keep_blank_values=True)

        if PY3:
            args['encoding'] = 'utf-8'

        try:
            data = cgi.FieldStorage(**args)
        except ValueError:
            # Content-Type multipart/form-data may lack "boundary" info
            query = handle_binary(query)
        else:
            values = []
            for item in data.list:
                values.append((item.name, item.value))

            query = urlencode(values, True)

    elif mime.startswith('application/x-amf'):
        query = self.amf_parse(query, environ)

    else:
        query = handle_binary(query)

    self.query = query
def __str__(self):
    """Render the raw idx line as a native utf-8 string."""
    raw_line = self.idxline
    return to_native_str(raw_line, 'utf-8')
def __call__(self, cdx, params):
    """Load the WARC record for *cdx* and build the warcserver
    response headers plus a body stream iterator.

    :return: (out_headers, stream_iter) tuple, or (None, None) when
        the underlying resource could not be loaded
    """
    entry = self.load_resource(cdx, params)
    if not entry:
        return None, None

    compress = params.get('compress') == 'gzip'

    warc_headers, other_headers, stream = entry

    source = self._get_source_id(cdx)

    out_headers = {}
    out_headers['Warcserver-Type'] = 'warc'
    out_headers['Content-Type'] = 'application/warc-record'

    if params.get('recorder_skip'):
        out_headers['Recorder-Skip'] = '1'
        cdx['recorder_skip'] = '1'

    out_headers['Warcserver-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
    out_headers['Warcserver-Source-Coll'] = to_native_str(source)

    # no parsed WARC headers: pass upstream headers through as-is
    if not warc_headers:
        if other_headers:
            out_headers['Link'] = other_headers.get('Link')
            out_headers['Memento-Datetime'] = other_headers.get(
                'Memento-Datetime')
            if not compress:
                out_headers['Content-Length'] = other_headers.get(
                    'Content-Length')

        return out_headers, StreamIter(stream, closer=call_release_conn)

    target_uri = warc_headers.get_header('WARC-Target-URI')

    out_headers['WARC-Target-URI'] = target_uri
    out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

    memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
    out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)

    warc_headers_buff = warc_headers.to_bytes()

    # content length only meaningful when the body is not re-encoded
    if not compress:
        lenset = self._set_content_len(
            warc_headers.get_header('Content-Length'),
            out_headers,
            len(warc_headers_buff))
    else:
        lenset = False

    streamiter = StreamIter(stream,
                            header1=warc_headers_buff,
                            header2=other_headers,
                            closer=call_release_conn)

    if compress:
        streamiter = compress_gzip_iter(streamiter)
        out_headers['Content-Encoding'] = 'gzip'

    #if not lenset:
    #    out_headers['Transfer-Encoding'] = 'chunked'
    #    streamiter = chunk_encode_iter(streamiter)

    return out_headers, streamiter
def __str__(self):
    """Decode and return the stored idx line as a native string."""
    return to_native_str(self.idxline, 'utf-8')
def post_query_extract(mime, length, stream):
    """
    Extract a url-encoded form POST/PUT from stream
    content length, return None
    Attempt to decode application/x-www-form-urlencoded or multipart/*,
    otherwise read whole block and b64encode

    :param str mime: Content-Type of the request body
    :param length: Content-Length (int or str); returns None if it is
        missing, unparseable, or <= 0
    :param stream: readable stream containing the request body
    :return: decoded query string, or None when there is no body
    """
    try:
        length = int(length)
    except (ValueError, TypeError):
        return

    if length <= 0:
        return

    # accumulate chunks in a list to avoid quadratic bytes concatenation
    chunks = []
    while length > 0:
        buff = stream.read(length)
        length -= len(buff)

        if not buff:
            break

        chunks.append(buff)

    post_query = b''.join(chunks)

    if not mime:
        mime = ''

    def handle_binary(post_query):
        # fallback: store raw body base64-encoded
        post_query = base64.b64encode(post_query)
        post_query = to_native_str(post_query)
        post_query = '__warc_post_data=' + post_query
        return post_query

    if mime.startswith('application/x-www-form-urlencoded'):
        try:
            post_query = to_native_str(post_query)
            post_query = unquote_plus(post_query)
        except UnicodeDecodeError:
            # body is not valid utf-8 text; keep it as base64 instead
            # of raising
            post_query = handle_binary(post_query)

    elif mime.startswith('multipart/'):
        env = {'REQUEST_METHOD': 'POST',
               'CONTENT_TYPE': mime,
               'CONTENT_LENGTH': len(post_query)}

        args = dict(fp=BytesIO(post_query),
                    environ=env,
                    keep_blank_values=True)

        if six.PY3:
            args['encoding'] = 'utf-8'

        try:
            data = cgi.FieldStorage(**args)
        except ValueError:
            # Content-Type multipart/form-data may lack "boundary" info
            post_query = handle_binary(post_query)
        else:
            values = []
            for item in data.list:
                values.append((item.name, item.value))

            post_query = urlencode(values, True)

    else:
        post_query = handle_binary(post_query)

    return post_query
def do_encode(m, encoding='UTF-8'):
    """Re-emit the matched value in RFC 2231 style: *=<encoding>''<quoted>."""
    prefix = "*={0}''".format(encoding)
    quoted_value = quote(to_native_str(m.group(1)))
    return prefix + quoted_value
def readline():
    # read the next line from the enclosing scope's stream, decoded
    # to a native str (closure over the free variable `stream`)
    return to_native_str(stream.readline())
def query_extract(mime, length, stream, url):
    """
    Extract a url-encoded form POST/PUT from stream
    content length, return None
    Attempt to decode application/x-www-form-urlencoded or multipart/*,
    otherwise read whole block and b64encode

    :param str mime: Content-Type of the request body
    :param length: Content-Length (int or str); when None, up to 8192
        bytes are read; any other unparseable value returns None
    :param stream: readable stream containing the request body
    :param url: request url (unused here; kept for caller interface)
    :return: decoded query string, truncated to MAX_QUERY_LENGTH
    """
    # accumulate chunks in a list to avoid quadratic bytes concatenation
    chunks = []

    try:
        length = int(length)
    except (ValueError, TypeError):
        if length is None:
            length = 8192
        else:
            return

    while length > 0:
        buff = stream.read(length)
        length -= len(buff)

        if not buff:
            break

        chunks.append(buff)

    query_data = b"".join(chunks)

    if not mime:
        mime = ""

    query = ""

    def handle_binary(query_data):
        # fallback: store raw body base64-encoded
        query = base64.b64encode(query_data)
        query = to_native_str(query)
        query = "__wb_post_data=" + query
        return query

    if mime.startswith("application/x-www-form-urlencoded"):
        try:
            query = to_native_str(query_data.decode("utf-8"))
            query = unquote_plus(query)
        except UnicodeDecodeError:
            query = handle_binary(query_data)

    elif mime.startswith("multipart/"):
        env = {
            "REQUEST_METHOD": "POST",
            "CONTENT_TYPE": mime,
            "CONTENT_LENGTH": len(query_data),
        }

        args = dict(fp=BytesIO(query_data), environ=env, keep_blank_values=True)
        args["encoding"] = "utf-8"

        try:
            data = cgi.FieldStorage(**args)
        except ValueError:
            # Content-Type multipart/form-data may lack "boundary" info
            query = handle_binary(query_data)
        else:
            values = []
            for item in data.list:
                values.append((item.name, item.value))

            query = urlencode(values, True)

    elif mime.startswith("application/json"):
        try:
            query = json_parse(query_data)
        except Exception:
            # narrow from bare except: never swallow SystemExit /
            # KeyboardInterrupt while logging the parse failure
            if query_data:
                try:
                    sys.stderr.write(
                        "Error parsing: " + query_data.decode("utf-8") + "\n"
                    )
                except Exception:
                    # best-effort logging only; body may not be utf-8
                    pass
            query = ""

    elif mime.startswith("text/plain"):
        try:
            query = json_parse(query_data)
        except Exception:
            query = handle_binary(query_data)

    elif mime.startswith("application/x-amf"):
        query = amf_parse(query_data)

    else:
        query = handle_binary(query_data)

    if query:
        # NOTE(review): assumes query is sliceable (a str) here —
        # json_parse presumably yields a string; verify
        query = query[:MAX_QUERY_LENGTH]

    return query