def parse_record_stream(self, stream, statusline=None, known_format=None, no_record_parse=False):
    """Parse a file-like stream and return an ArcWarcRecord.

    The returned record bundles the record headers, the http headers
    (if any), and a stream limited to the remainder of the record.

    :param stream: file-like object positioned at the start of a record.
    :param statusline: passed through to ``_detect_type_load_headers()``
        to facilitate parsing.
    :param known_format: passed through to ``_detect_type_load_headers()``
        to facilitate parsing.
    :param no_record_parse: if true, the record payload is not parsed
        and the record's http headers are ``None``.
    :return: ``ArcWarcRecord(format, rec_type, rec_headers, stream,
        http_headers, content_type, length)``. ``'arc2warc'`` input is
        reported with format ``'warc'``. ``length`` is ``None`` when the
        record declares no length, and ``0`` when the declared length is
        unparsable or negative after subtracting the header size.
    """
    (the_format, rec_headers) = self._detect_type_load_headers(
        stream, statusline, known_format)

    # NOTE(review): any other format value leaves rec_type/uri/length
    # unbound below -- presumably _detect_type_load_headers() only ever
    # returns the formats handled here; confirm.
    if the_format == 'arc':
        uri = rec_headers.get_header('uri')
        length = rec_headers.get_header('length')
        content_type = rec_headers.get_header('content-type')
        sub_len = rec_headers.total_len
        if uri and uri.startswith('filedesc://'):
            rec_type = 'arc_header'
        else:
            rec_type = 'response'

    elif the_format in ('warc', 'arc2warc'):
        rec_type = rec_headers.get_header('WARC-Type')
        uri = rec_headers.get_header('WARC-Target-URI')
        length = rec_headers.get_header('Content-Length')
        content_type = rec_headers.get_header('Content-Type')
        if the_format == 'warc':
            sub_len = 0
        else:
            # arc2warc: the declared length includes the header block
            sub_len = rec_headers.total_len
            the_format = 'warc'

    is_err = False

    try:
        if length is not None:
            length = int(length) - sub_len
            if length < 0:
                is_err = True
    except (ValueError, TypeError):
        is_err = True

    # err condition: treat the record as empty
    if is_err:
        length = 0

    # limit stream to the length for all valid records
    # (length is either None or a non-negative int at this point)
    if length is not None:
        stream = LimitReader.wrap_stream(stream, length)

    if no_record_parse:
        # don't parse the http record at all
        http_headers = None

    elif length == 0:
        # empty record (error or otherwise): empty status line, no headers
        http_headers = StatusAndHeaders('', [])

    elif (rec_type in ('response', 'revisit') and
          uri and uri.startswith(self.HTTP_SCHEMES)):
        # response record or non-empty revisit: parse HTTP status and
        # headers. The truthiness check on uri keeps a record with a
        # missing URI header from raising AttributeError; it falls
        # through to the default branch instead.
        http_headers = self.http_parser.parse(stream)

    elif (rec_type == 'request' and
          uri and uri.startswith(self.HTTP_SCHEMES)):
        # request record: parse request
        http_headers = self.http_req_parser.parse(stream)

    else:
        # everything else: synthesize a '200 OK' entry carrying the
        # record's content-type (and content-length when known)
        content_type_header = [('Content-Type', content_type)]

        if length is not None:
            content_type_header.append(('Content-Length', str(length)))

        http_headers = StatusAndHeaders('200 OK', content_type_header)

    return ArcWarcRecord(the_format, rec_type,
                         rec_headers, stream, http_headers,
                         content_type, length)
def test_limit_reader_invalid_wrap(self):
    """wrap_stream() with a non-numeric limit must hand back the input stream."""
    source = BytesIO(b'some data')
    wrapped = LimitReader.wrap_stream(source, 'abc')
    assert wrapped == source
def parse_record_stream(self, stream, statusline=None, known_format=None, no_record_parse=False, ensure_http_headers=False):
    """Read one record from a file-like stream and wrap it in an ArcWarcRecord.

    The result carries the record headers, the http headers (if any),
    and a stream limited to the remainder of the record.

    ``statusline`` and ``known_format`` are forwarded to
    ``_detect_type_load_headers()`` to facilitate parsing.  With
    ``no_record_parse`` set, ``load_http_headers()`` is skipped.  With
    ``ensure_http_headers`` set, ``default_http_headers()`` supplies
    headers whenever none were loaded (e.g. for replay).
    """
    fmt, headers = self._detect_type_load_headers(stream, statusline, known_format)

    if fmt == 'arc':
        uri = headers.get_header('uri')
        length = headers.get_header('length')
        content_type = headers.get_header('content-type')
        sub_len = headers.total_len
        is_filedesc = uri and uri.startswith('filedesc://')
        rec_type = 'arc_header' if is_filedesc else 'response'

    elif fmt in ('warc', 'arc2warc'):
        rec_type = headers.get_header('WARC-Type')
        uri = self._ensure_target_uri_format(headers)
        length = headers.get_header('Content-Length')
        content_type = headers.get_header('Content-Type')
        # only arc2warc subtracts the header block from the declared length
        sub_len = 0 if fmt == 'warc' else headers.total_len
        # both variants are reported as plain 'warc'
        fmt = 'warc'

    # Normalise the declared length: unparsable or negative values
    # collapse to 0, a missing length stays None.
    if length is not None:
        try:
            length = max(int(length) - sub_len, 0)
        except (ValueError, TypeError):
            length = 0

    # limit stream to the length for all valid records
    if length is not None and length >= 0:
        stream = LimitReader.wrap_stream(stream, length)

    # load http headers only when parsing is requested
    if no_record_parse:
        http_headers = None
    else:
        http_headers = self.load_http_headers(rec_type, uri, stream, length)

    # generate valid http headers when none were loaded (e.g. for replay)
    if not http_headers and ensure_http_headers:
        http_headers = self.default_http_headers(length, content_type)

    return ArcWarcRecord(
        fmt,
        rec_type,
        headers,
        stream,
        http_headers,
        content_type,
        length,
    )
def test_limit_reader_3(self):
    """Re-wrapping a LimitReader returns the same reader with the tighter limit applied."""
    payload = BytesIO(b'abcdefghjiklmnopqrstuvwxyz')
    reader = LimitReader(payload, 8)

    rewrapped = LimitReader.wrap_stream(reader, 4)

    # the existing reader is handed back rather than a second wrapper
    assert reader == rewrapped
    # only four bytes come back even though 26 were requested
    assert b'abcd' == rewrapped.readline(26)