Beispiel #1
0
    def test_limit_reader_multiple_read(self):
        """Successive reads share one 10-byte budget; the last read gets the rest."""
        limited = LimitReader(BytesIO(b'abcdefghjiklmnopqrstuvwxyz'), 10)

        # Two reads of 2 bytes, then an over-sized request for 20 bytes.
        chunks = [limited.read(size) for size in (2, 2, 20)]

        # Only the result of the final read is checked.
        assert chunks[-1] == b'efghji'
Beispiel #2
0
 def __init__(self, stream, offset, length):
     """Set up a reader limited to ``length`` that first skips ``offset``.

     :param stream: underlying file-like object
     :param offset: stored on the instance; when greater than zero a
         ``LimitReader(stream, offset)`` is kept as the skip reader
     :param length: limit passed to the parent constructor
     """
     super(OffsetLimitReader, self).__init__(stream, length)
     self.offset = offset
     # A skip reader is only needed when there is something to skip.
     self._skip_reader = LimitReader(stream, offset) if offset > 0 else None
Beispiel #3
0
    def load(self, url, offset=0, length=-1):
        """
        Load a file-like reader from the local file system.

        ``url`` is first converted with ``from_file_url()``. If opening it as
        a local file fails and the value could only have been a file path
        (it starts with ``/`` or ``.``, or the conversion changed it), the
        ``IOError`` is re-raised. Otherwise loading is delegated to the
        parent class.

        :param url: file path or file URL to open
        :param offset: if greater than zero, seek to this position first
        :param length: if zero or greater, wrap the file in a
            ``LimitReader`` with this limit; otherwise return the file itself
        :return: the opened binary file, a ``LimitReader`` around it, or
            whatever the parent ``load()`` returns
        :raises IOError: when a file-only path cannot be opened
        """

        # if starting with . or /, can only be a file path..
        file_only = url.startswith(('/', '.'))

        # convert to filename
        filename = from_file_url(url)
        if filename != url:
            file_only = True
            url = filename

        afile = None
        try:
            # first, try as file
            afile = open(url, 'rb')

        except IOError:
            # open() raised before the assignment, so afile is still None here
            no_except_close(afile)
            if file_only:
                raise

            return super(LocalFileLoader, self).load(url, offset, length)

        if offset > 0:
            try:
                afile.seek(offset)
            except Exception:
                # do not leak the open handle when the seek fails
                no_except_close(afile)
                raise

        if length >= 0:
            return LimitReader(afile, length)

        return afile
Beispiel #4
0
    def do_upload(self, upload_key, filename, stream, user, coll, rec, offset,
                  length):
        """Upload a slice of ``stream`` as a recording via an HTTP PUT.

        The stream is positioned at ``offset`` and capped to ``length`` bytes
        with a ``LimitReader`` before being sent as the request body.

        :param str upload_key: upload Redis key, used as ``upid`` in the URL
        :param str filename: WARC archive filename (not used in this method)
        :param stream: seekable file object
        :param User user: user
        :param str coll: collection ID
        :param str rec: record ID
        :param int offset: offset to start of stream
        :param int length: length of recording, also sent as Content-Length
        """
        stream.seek(offset)

        message = 'do_upload(): {0} offset: {1}: len: {2}'.format(
            rec, offset, length)
        logger.debug(message)

        # Cap the body so the PUT sends exactly the advertised length.
        limited_body = LimitReader(stream, length)
        put_headers = {'Content-Length': str(length)}

        url_fields = {
            'record_host': self.record_host,
            'user': user,
            'coll': coll,
            'rec': rec,
            'upid': upload_key,
        }
        target_url = self.upload_path.format(**url_fields)

        requests.put(target_url, headers=put_headers, data=limited_body)
Beispiel #5
0
    def get_req_body(self):
        """Return a readable object for the request body, or None.

        With a truthy content length, ``wsgi.input`` is wrapped in a
        ``LimitReader`` capped to that length. Otherwise, if a
        Transfer-Encoding header is present, the raw input is returned.
        With neither, the result is None.
        """
        body_stream = self.env['wsgi.input']
        content_length = self._get_content_length()
        transfer_encoding = self._get_header('Transfer-Encoding')

        if content_length:
            return LimitReader(body_stream, int(content_length))

        if transfer_encoding:
            return body_stream

        return None
Beispiel #6
0
    def do_upload(self, upload_key, filename, stream, user, coll, rec, offset,
                  length):
        """PUT ``length`` bytes of ``stream``, starting at ``offset``.

        The upload URL is built from ``self.upload_path`` using the record
        host, user, collection, recording and upload key. ``filename`` is
        accepted but not used in this method.
        """
        stream.seek(offset)

        debug_line = 'do_upload(): {0} offset: {1}: len: {2}'.format(
            rec, offset, length)
        logger.debug(debug_line)

        # Restrict the request body to the requested number of bytes.
        body = LimitReader(stream, length)

        destination = self.upload_path.format(
            record_host=self.record_host,
            user=user,
            coll=coll,
            rec=rec,
            upid=upload_key,
        )

        requests.put(destination,
                     headers={'Content-Length': str(length)},
                     data=body)
Beispiel #7
0
class OffsetLimitReader(LimitReader):
    """LimitReader variant that consumes a leading offset before reading.

    When ``offset`` is positive, a second ``LimitReader`` over the same
    stream is drained on the first ``read()`` or ``readline()`` call.
    """

    def __init__(self, stream, offset, length):
        """Store ``offset`` and prepare the skip reader if one is needed."""
        super(OffsetLimitReader, self).__init__(stream, length)
        self.offset = offset
        self._skip_reader = LimitReader(stream, offset) if offset > 0 else None

    def _skip(self):
        """Drain the skip reader until it returns no data, then drop it."""
        while self._skip_reader:
            if not self._skip_reader.read():
                # Nothing left to skip; later calls become no-ops.
                self._skip_reader = None

    def read(self, length=None):
        """Skip any pending offset, then delegate to the parent ``read``."""
        self._skip()
        parent = super(OffsetLimitReader, self)
        return parent.read(length)

    def readline(self, length=None):
        """Skip any pending offset, then delegate to the parent ``readline``."""
        self._skip()
        parent = super(OffsetLimitReader, self)
        return parent.readline(length)
Beispiel #8
0
    def parse_record_stream(self,
                            stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to _detect_type_load_headers()
        to facilitate parsing.

        :param stream: file-like object positioned at the start of a record
        :param statusline: forwarded to ``_detect_type_load_headers()``
        :param known_format: forwarded to ``_detect_type_load_headers()``
        :param no_record_parse: if true, the payload is not parsed and the
            record's http headers are ``None``
        :return: ``ArcWarcRecord`` built from the format, record type, record
            headers, (possibly length-limited) stream, http headers, content
            type and computed length
        """
        the_format, rec_headers = self._detect_type_load_headers(
            stream, statusline, known_format)

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = rec_headers.get_header('WARC-Target-URI')
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # arc2warc: subtract the header length and report as warc
                sub_len = rec_headers.total_len
                the_format = 'warc'

        # Normalise the declared length: unparsable or negative values are
        # treated as an empty record (length 0); a missing length stays None.
        if length is not None:
            try:
                length = max(int(length) - sub_len, 0)
            except (ValueError, TypeError):
                length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # don't parse the http record at all
        if no_record_parse:
            http_headers = None

        # empty record (error or otherwise): headers with an empty status line
        elif length == 0:
            http_headers = StatusAndHeaders('', [])

        # response record or non-empty revisit: parse HTTP status and headers!
        # uri may be None when the header is absent, so guard before startswith
        elif (rec_type in ('response', 'revisit')
              and uri and uri.startswith(self.HTTP_SCHEMES)):
            http_headers = self.http_parser.parse(stream)

        # request record: parse request
        elif (rec_type == 'request'
              and uri and uri.startswith(self.HTTP_SCHEMES)):
            http_headers = self.http_req_parser.parse(stream)

        # everything else: create a no-status entry, set content-type
        else:
            content_type_header = [('Content-Type', content_type)]

            if length is not None and length >= 0:
                content_type_header.append(('Content-Length', str(length)))

            http_headers = StatusAndHeaders('200 OK', content_type_header)

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             http_headers, content_type, length)
Beispiel #9
0
    def parse_record_stream(self,
                            stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False,
                            ensure_http_headers=False):
        """Read one record from ``stream`` and wrap it in an ArcWarcRecord.

        The record headers are loaded via ``_detect_type_load_headers()``
        (which receives ``statusline`` and ``known_format``). The stream is
        limited to the record's computed length. HTTP headers are loaded
        unless ``no_record_parse`` is set. When ``ensure_http_headers`` is
        set and no headers were obtained, default headers are generated.
        """
        the_format, rec_headers = self._detect_type_load_headers(
            stream, statusline, known_format)

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            is_filedesc = uri and uri.startswith('filedesc://')
            rec_type = 'arc_header' if is_filedesc else 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = self._ensure_target_uri_format(rec_headers)
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'arc2warc':
                # converted arc record: subtract header length, report as warc
                sub_len = rec_headers.total_len
                the_format = 'warc'
            else:
                sub_len = 0

        # A length that cannot be parsed, or that ends up negative, is an
        # error condition and is treated as an empty record.
        if length is not None:
            try:
                length = int(length) - sub_len
            except (ValueError, TypeError):
                length = 0
            else:
                if length < 0:
                    length = 0

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)

        # load http headers only if parsing was requested
        if no_record_parse:
            http_headers = None
        else:
            http_headers = self.load_http_headers(rec_type, uri, stream,
                                                  length)

        # generate valid http headers when none were found (eg. for replay)
        if not http_headers and ensure_http_headers:
            http_headers = self.default_http_headers(length, content_type)

        return ArcWarcRecord(the_format, rec_type, rec_headers, stream,
                             http_headers, content_type, length)
Beispiel #10
0
 def test_limit_reader_1(self):
     """A read larger than the limit returns only the first 10 bytes."""
     source = BytesIO(b'abcdefghjiklmnopqrstuvwxyz')
     limited = LimitReader(source, 10)
     assert limited.read(26) == b'abcdefghji'
Beispiel #11
0
 def test_limit_reader_close(self):
     """The reader works under ``closing()`` and ``tell()`` reports bytes read."""
     limited = LimitReader(BytesIO(b'abcdefg'), 3)
     with closing(limited) as handle:
         data = handle.read(10)
         assert data == b'abc'
         assert handle.tell() == 3
Beispiel #12
0
 def test_limit_reader_invalid_wrap(self):
     """``wrap_stream`` with a non-numeric limit gives back the original stream."""
     original = BytesIO(b'some data')
     wrapped = LimitReader.wrap_stream(original, 'abc')
     assert wrapped == original
Beispiel #13
0
 def test_limit_reader_zero(self):
     """A zero limit and a zero-length ``readline`` yield empty bytes."""
     limited = LimitReader(BytesIO(b'a'), 0)
     assert limited.readline(0) == b''
Beispiel #14
0
 def test_limit_reader_3(self):
     """Re-wrapping a LimitReader with a smaller limit keeps the same reader."""
     outer = LimitReader(BytesIO(b'abcdefghjiklmnopqrstuvwxyz'), 8)
     rewrapped = LimitReader.wrap_stream(outer, 4)

     # The reader compares equal to what wrap_stream returned ...
     assert outer == rewrapped
     # ... and the tighter limit of 4 applies.
     assert rewrapped.readline(26) == b'abcd'
Beispiel #15
0
 def test_limit_reader_2(self):
     """``readline`` is capped by the 8-byte limit."""
     source = BytesIO(b'abcdefghjiklmnopqrstuvwxyz')
     limited = LimitReader(source, 8)
     assert limited.readline(26) == b'abcdefgh'