Exemple #1
0
    def parse_record_stream(self,
                            stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        statusline and known_format are passed through unchanged to
        _detect_type_load_headers() to facilitate parsing.

        If no_record_parse is set, the record payload is not parsed and
        the http headers of the returned record are None.

        A declared length that is not an integer, or that is negative
        once the header length has been subtracted, is treated as an
        empty (zero length) record.
        """
        (the_format, rec_headers) = (self._detect_type_load_headers(
            stream, statusline, known_format))

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # arc2warc: subtract the header length, as for plain arc,
                # but report the record as warc from here on
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition: treat the record as empty
        if is_err:
            length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            http_headers = None

        # empty record (error or otherwise): empty status and no headers
        elif length == 0:
            http_headers = StatusAndHeaders('', [])

        # response record or non-empty revisit: parse HTTP status and headers!
        # (a missing uri must not raise here, it falls through to the
        # default branch below)
        elif (rec_type in ('response', 'revisit')
              and uri and uri.startswith(self.HTTP_SCHEMES)):
            http_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif (rec_type == 'request'
              and uri and uri.startswith(self.HTTP_SCHEMES)):
            http_headers = self.http_req_parser.parse(stream)

        # everything else: create a '200 OK' entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            http_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             http_headers, content_type, length)
Exemple #2
0
 def test_limit_reader_invalid_wrap(self):
     """wrap_stream() with a non-numeric limit must give back a result
     that compares equal to the stream it was handed."""
     source = BytesIO(b'some data')
     wrapped = LimitReader.wrap_stream(source, 'abc')
     assert wrapped == source
Exemple #3
0
    def parse_record_stream(self,
                            stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False,
                            ensure_http_headers=False):
        """ Read one record from a file-like stream and return it as an
        ArcWarcRecord holding the record headers, the http headers
        (if any) and a stream limited to the rest of the record.

        statusline and known_format are handed unchanged to
        _detect_type_load_headers() to help with format detection.

        With no_record_parse set, load_http_headers() is not called.
        With ensure_http_headers set, default_http_headers() supplies
        the http headers whenever none were loaded.
        """
        the_format, rec_headers = self._detect_type_load_headers(
            stream, statusline, known_format)

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            is_filedesc = uri and uri.startswith('filedesc://')
            rec_type = 'arc_header' if is_filedesc else 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = self._ensure_target_uri_format(rec_headers)
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            # only arc2warc subtracts the header length; either way the
            # record is reported as plain warc from here on
            sub_len = 0 if the_format == 'warc' else rec_headers.total_len
            the_format = 'warc'

        if length is not None:
            try:
                remaining = int(length) - sub_len
                # a negative remainder is an error: treat record as empty
                length = 0 if remaining < 0 else remaining
            except (ValueError, TypeError):
                # unparseable length is an error: treat record as empty
                length = 0

            # cap the stream at the record length
            stream = LimitReader.wrap_stream(stream, length)

        # http headers are only loaded when record parsing is enabled
        if no_record_parse:
            http_headers = None
        else:
            http_headers = self.load_http_headers(rec_type, uri, stream,
                                                  length)

        # fall back to generated http headers on request (eg. for replay)
        if not http_headers and ensure_http_headers:
            http_headers = self.default_http_headers(length, content_type)

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             http_headers, content_type, length)
Exemple #4
0
 def test_limit_reader_3(self):
     """Wrapping an existing LimitReader with a smaller limit must compare
     equal to the original reader and cap readline() at the new limit."""
     outer = LimitReader(BytesIO(b'abcdefghjiklmnopqrstuvwxyz'), 8)
     rewrapped = LimitReader.wrap_stream(outer, 4)
     assert outer == rewrapped
     line = rewrapped.readline(26)
     assert b'abcd' == line