def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
    """Load the capture described by ``cdx`` and build the replay response.

    The headers and payload are resolved through ``self.content_loader``,
    passed through ``self.content_rewriter`` and wrapped in
    ``self.response_class``.  If ``_redirect_if_needed()`` yields a
    response, that response is returned instead and nothing is rewritten.

    :param wbrequest: the current request; its ``urlrewriter`` and
        ``wb_url`` attributes are used
    :param cdx: index entry for the capture; ``cdx['original']`` and
        ``cdx['urlkey']`` are read
    :param cdx_loader: forwarded to the content loader
    :param failed_files: forwarded to the content loader
    :return: the redirect response, or the newly built replay response
    """
    loaded = self.content_loader.resolve_headers_and_payload(
        cdx, failed_files, cdx_loader)
    status_headers, stream = loaded

    # check for, and reject, a self-redirect
    self._reject_self_redirect(wbrequest, cdx, status_headers)

    # a redirect, when one is needed, short-circuits the replay
    redir_response = self._redirect_if_needed(wbrequest, cdx)
    if redir_response:
        return redir_response

    # wrap the payload using the recorded content-length header
    stream = LimitReader.wrap_stream(
        stream, status_headers.get_header('content-length'))

    # TODO: evaluate one more check for referrer-based self-redirect
    # (self._reject_referrer_self_redirect(wbrequest)); it is disabled
    # because refreshing in the browser may sometimes cause the referrer
    # to be set to the same page, incorrectly skipping a capture

    urlrewriter = wbrequest.urlrewriter

    # if using url rewriter, use original url for rewriting purposes
    if wbrequest and wbrequest.wb_url:
        wbrequest.wb_url.url = cdx['original']

    head_insert_func = (
        self.head_insert_view.create_insert_func(wbrequest)
        if self.head_insert_view else None)

    status_headers, response_iter, _is_rewritten = (
        self.content_rewriter.rewrite_content(
            urlrewriter,
            headers=status_headers,
            stream=stream,
            head_insert_func=head_insert_func,
            urlkey=cdx['urlkey'],
            cdx=cdx))

    # buffer the response body when buffering is enabled
    if self.buffer_response:
        response_iter = self.buffered_response(status_headers,
                                               response_iter)

    response = self.response_class(status_headers,
                                   response_iter,
                                   wbrequest=wbrequest,
                                   cdx=cdx)

    # notify the reporter callback, if any
    if self._reporter:
        self._reporter(wbrequest, cdx, response)

    return response
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
    """Replay a single capture: load it, rewrite it, return a response.

    Steps, in order: resolve headers and payload via
    ``self.content_loader``; run the self-redirect check; return early
    with a redirect response if ``_redirect_if_needed()`` produces one;
    wrap the payload stream with the recorded ``content-length``; rewrite
    the content via ``self.content_rewriter``; optionally buffer the
    body; build the ``self.response_class`` instance and notify
    ``self._reporter`` if one is set.

    :param wbrequest: the current request (``urlrewriter`` and ``wb_url``
        are read; ``wb_url.url`` is overwritten with ``cdx['original']``)
    :param cdx: index entry for the capture being replayed
    :param cdx_loader: passed through to the content loader
    :param failed_files: passed through to the content loader
    :return: a redirect response or the replay response
    """
    headers_and_payload = self.content_loader.resolve_headers_and_payload(
        cdx, failed_files, cdx_loader)
    (status_headers, stream) = headers_and_payload

    # check and reject self-redirect
    self._reject_self_redirect(wbrequest, cdx, status_headers)

    # check if a redirect is needed; if so it replaces the replay
    redir_response = self._redirect_if_needed(wbrequest, cdx)
    if redir_response:
        return redir_response

    content_length = status_headers.get_header('content-length')
    stream = LimitReader.wrap_stream(stream, content_length)

    # TODO: evaluate the additional referrer-based self-redirect check
    # (self._reject_referrer_self_redirect(wbrequest)). It stays off for
    # now, as refreshing in browser may sometimes cause referrer to be
    # set to the same page, incorrectly skipping a capture.

    urlrewriter = wbrequest.urlrewriter

    # if using url rewriter, use original url for rewriting purposes
    if wbrequest and wbrequest.wb_url:
        wbrequest.wb_url.url = cdx['original']

    if self.head_insert_view:
        head_insert_func = self.head_insert_view.create_insert_func(
            wbrequest)
    else:
        head_insert_func = None

    rewritten = self.content_rewriter.rewrite_content(
        urlrewriter,
        headers=status_headers,
        stream=stream,
        head_insert_func=head_insert_func,
        urlkey=cdx['urlkey'],
        cdx=cdx)
    (status_headers, body_iter, _is_rewritten) = rewritten

    # buffer response if buffering enabled
    if self.buffer_response:
        body_iter = self.buffered_response(status_headers, body_iter)

    response = self.response_class(
        status_headers, body_iter, wbrequest=wbrequest, cdx=cdx)

    # notify reporter callback, if any
    if self._reporter:
        self._reporter(wbrequest, cdx, response)

    return response
def read_range():
    """Yield the chunks read from a limited region of ``spec['name']``.

    The file is opened in binary mode, positioned at ``start`` and wrapped
    with ``LimitReader.wrap_stream(..., maxlen)``; every non-empty result
    of ``read()`` is yielded until an empty one is returned.

    NOTE(review): ``spec``, ``start`` and ``maxlen`` are not defined here;
    presumably they come from the enclosing scope -- confirm.
    """
    with open(spec['name'], 'rb') as source:
        source.seek(start)
        limited = LimitReader.wrap_stream(source, maxlen)

        chunk = limited.read()
        while chunk:
            yield chunk
            chunk = limited.read()
def read_range():
    """Generate the contents of a byte range of the file ``spec['name']``.

    Seeks to ``start``, wraps the open file with
    ``LimitReader.wrap_stream(..., maxlen)`` and yields each non-empty
    buffer returned by ``read()``; iteration ends on the first empty
    buffer.

    NOTE(review): ``spec``, ``start`` and ``maxlen`` are free variables,
    presumably bound by the enclosing function -- confirm.
    """
    with open(spec['name'], 'rb') as raw_file:
        raw_file.seek(start)
        reader = LimitReader.wrap_stream(raw_file, maxlen)

        data = reader.read()
        while data:
            yield data
            data = reader.read()
def read_range():
    """Yield the chunks read from a limited region of the ``joined`` bytes.

    An in-memory ``io.BytesIO`` over ``joined`` is positioned at ``start``
    and wrapped with ``LimitReader.wrap_stream(..., maxlen)``; every
    non-empty result of ``read()`` is yielded until an empty one is
    returned.

    NOTE(review): ``joined``, ``start`` and ``maxlen`` are not defined
    here; presumably they come from the enclosing scope -- confirm.
    """
    # Perma changes 2: replaced real file w/ BytesIO
    with io.BytesIO(joined) as buffer:
        buffer.seek(start)
        limited = LimitReader.wrap_stream(buffer, maxlen)

        chunk = limited.read()
        while chunk:
            yield chunk
            chunk = limited.read()
def parse_record_stream(self, stream,
                        statusline=None,
                        known_format=None,
                        no_record_parse=False):
    """Parse one ARC/WARC record from a file-like stream.

    The record headers are loaded with ``_detect_type_load_headers()``
    (``statusline`` and ``known_format`` are forwarded to it to
    facilitate parsing). When a usable length is declared, the stream is
    limited to that length. Unless ``no_record_parse`` is set, a status
    line and headers are then derived: a synthetic 204 for empty records,
    parsed HTTP headers for response/revisit/request records whose uri
    starts with one of ``self.HTTP_SCHEMES``, and a synthetic 200
    carrying the content type (and length, if known) for anything else.

    :return: an ArcWarcRecord built from the format, record type, record
        headers, (possibly limited) stream, status headers, content type
        and computed length
    """
    the_format, rec_headers = self._detect_type_load_headers(
        stream, statusline, known_format)

    if the_format == 'arc':
        uri = rec_headers.get_header('uri')
        length = rec_headers.get_header('length')
        content_type = rec_headers.get_header('content-type')
        sub_len = rec_headers.total_len
        # ARC file-description records get their own record type
        rec_type = ('arc_header'
                    if uri and uri.startswith('filedesc://')
                    else 'response')

    elif the_format in ('warc', 'arc2warc'):
        rec_type = rec_headers.get_header('WARC-Type')
        uri = rec_headers.get_header('WARC-Target-URI')
        length = rec_headers.get_header('Content-Length')
        content_type = rec_headers.get_header('Content-Type')
        if the_format != 'warc':
            # arc2warc: subtract total_len, then report as plain warc
            sub_len = rec_headers.total_len
            the_format = 'warc'
        else:
            sub_len = 0

    # a declared length that is unparseable or negative is an error
    is_err = False
    if length is not None:
        try:
            length = int(length) - sub_len
            if length < 0:
                is_err = True
        except (ValueError, TypeError):
            is_err = True

    # err condition
    if is_err:
        length = 0

    # limit stream to the length for all valid records
    if length is not None and length >= 0:
        stream = LimitReader.wrap_stream(stream, length)

    if no_record_parse:
        # don't parse the http record at all
        status_headers = None

    elif length == 0:
        # empty record (error or otherwise): set status to 204
        msg = '204 Possible Error' if is_err else '204 No Content'
        status_headers = StatusAndHeaders(msg, [])

    elif (rec_type in ('response', 'revisit') and
          uri.startswith(self.HTTP_SCHEMES)):
        # response record or non-empty revisit: parse HTTP status/headers
        status_headers = self.http_parser.parse(stream)

    elif rec_type == 'request' and uri.startswith(self.HTTP_SCHEMES):
        # request record: parse request
        status_headers = self.http_req_parser.parse(stream)

    else:
        # everything else: create a no-status entry, set content-type
        fallback_headers = [('Content-Type', content_type)]
        if length is not None and length >= 0:
            fallback_headers.append(('Content-Length', str(length)))
        status_headers = StatusAndHeaders('200 OK', fallback_headers)

    return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                         status_headers, content_type, length)
def parse_record_stream(self, stream,
                        statusline=None,
                        known_format=None,
                        no_record_parse=False):
    """Parse one ARC or WARC record from a file-like stream.

    Record headers come from ``_detect_type_load_headers()``, which also
    receives ``statusline`` and ``known_format`` to facilitate parsing.
    If the record declares a usable length, the stream is limited to it.
    With ``no_record_parse`` set no status headers are produced;
    otherwise empty records get a synthetic 204 status,
    response/revisit/request records whose uri starts with one of
    ``self.HTTP_SCHEMES`` have their HTTP headers parsed, and every other
    record gets a synthetic 200 status carrying its content type (and
    length, if known).

    :return: an ArcWarcRecord wrapping the format, record type, record
        headers, stream, status headers, content type and length
    """
    the_format, rec_headers = self._detect_type_load_headers(
        stream, statusline, known_format)

    if the_format == 'arc':
        uri = rec_headers.get_header('uri')
        length = rec_headers.get_header('length')
        content_type = rec_headers.get_header('content-type')
        sub_len = rec_headers.total_len
        # ARC file-description records get their own record type
        is_filedesc = uri and uri.startswith('filedesc://')
        rec_type = 'arc_header' if is_filedesc else 'response'

    elif the_format == 'warc':
        rec_type = rec_headers.get_header('WARC-Type')
        uri = rec_headers.get_header('WARC-Target-URI')
        length = rec_headers.get_header('Content-Length')
        content_type = rec_headers.get_header('Content-Type')
        sub_len = 0

    # a declared length that is unparseable or negative is an error
    is_err = False
    if length is not None:
        try:
            length = int(length) - sub_len
            if length < 0:
                is_err = True
        except (ValueError, TypeError):
            is_err = True

    # err condition
    if is_err:
        length = 0

    # limit stream to the length for all valid records
    has_length = length is not None and length >= 0
    if has_length:
        stream = LimitReader.wrap_stream(stream, length)

    if no_record_parse:
        # don't parse the http record at all
        status_headers = None

    elif length == 0:
        # empty record (error or otherwise): set status to 204
        if is_err:
            status_headers = StatusAndHeaders('204 Possible Error', [])
        else:
            status_headers = StatusAndHeaders('204 No Content', [])

    elif (rec_type in ('response', 'revisit') and
          uri.startswith(self.HTTP_SCHEMES)):
        # response record or non-empty revisit: parse HTTP status/headers
        status_headers = self.http_parser.parse(stream)

    elif rec_type == 'request' and uri.startswith(self.HTTP_SCHEMES):
        # request record: parse request
        status_headers = self.http_req_parser.parse(stream)

    else:
        # everything else: create a no-status entry, set content-type
        fallback_headers = [('Content-Type', content_type)]
        if has_length:
            fallback_headers.append(('Content-Length', str(length)))
        status_headers = StatusAndHeaders('200 OK', fallback_headers)

    return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                         status_headers, content_type, length)
def parse_record_stream(self, stream, statusline=None, known_format=None):
    """Parse one ARC or WARC record from a file-like stream.

    Record headers are loaded with ``_detect_type_load_headers()``;
    ``statusline`` and ``known_format`` are forwarded to it to facilitate
    parsing. The stream is then limited to the record's computed length
    and, depending on the record type and uri, the HTTP status line and
    headers are parsed from it.

    A declared length that is missing, non-numeric, or negative after
    subtracting ``rec_headers.total_len`` (ARC records only) marks the
    record as an error: its length is forced to 0 and it is reported with
    a ``'- None'`` status, exactly like a genuinely empty record.

    :param stream: file-like object positioned at the start of a record
    :param statusline: optional, passed to ``_detect_type_load_headers()``
    :param known_format: optional, passed to
        ``_detect_type_load_headers()``
    :return: an ArcWarcRecord wrapping the format, record type, record
        headers, length-limited stream, status headers, content type and
        computed length
    """
    (the_format, rec_headers) = self._detect_type_load_headers(
        stream, statusline, known_format)

    if the_format == 'arc':
        uri = rec_headers.get_header('uri')
        length = rec_headers.get_header('length')
        content_type = rec_headers.get_header('content-type')
        sub_len = rec_headers.total_len
        # ARC file-description records get their own record type
        if uri and uri.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'

    elif the_format == 'warc':
        rec_type = rec_headers.get_header('WARC-Type')
        uri = rec_headers.get_header('WARC-Target-URI')
        length = rec_headers.get_header('Content-Length')
        content_type = rec_headers.get_header('Content-Type')
        sub_len = 0

    is_err = False

    try:
        length = int(length) - sub_len
        if length < 0:
            is_err = True
    except (ValueError, TypeError):
        # ValueError: the header is present but not a number.
        # TypeError: the header is missing (None), which int() rejects;
        # treat it as a malformed record instead of crashing.
        is_err = True

    # err condition
    if is_err:
        length = 0

    # limit stream to the length for all valid records
    stream = LimitReader.wrap_stream(stream, length)

    # if empty record (error or otherwise) set status to -
    if length == 0:
        status_headers = StatusAndHeaders('- None', [])

    # response record or non-empty revisit: parse HTTP status and headers!
    elif (rec_type in ('response', 'revisit') and
          not uri.startswith(('dns:', 'whois:'))):
        status_headers = self.http_parser.parse(stream)

    # request record: parse request
    elif ((rec_type == 'request') and
          not uri.startswith(('dns:', 'whois:'))):
        status_headers = self.http_req_parser.parse(stream)

    # everything else: create a no-status entry, set content-type
    else:
        content_type_header = [('Content-Type', content_type)]
        status_headers = StatusAndHeaders('- OK', content_type_header)

    return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                         status_headers, content_type, length)