Example #1
    def __init__(self, key_id, secret, stream_name, region, partition, chunk_size=DEFAULT_CHUNK_SIZE, encoder='utf-8',
                 workers=2):

        self.kinesis = client('kinesis', region_name=region, aws_access_key_id=key_id,
                              aws_secret_access_key=secret)
        self.chunk_size = chunk_size
        self.stream_name = stream_name
        self.region = region
        self.tasks = queue.Queue()
        self.partition = partition
        self.encoder = encoder

        try:
            stream_desc = self.kinesis.describe_stream(StreamName=self.stream_name)
            if stream_desc['StreamDescription']['StreamStatus'] != 'ACTIVE':
                raise AssertionError
        except Exception:
            raise ValueError('Kinesis stream %s does not exist, is inactive, or permissions are insufficient' % stream_name)

        self.workers = [threading.Thread(target=task_worker, args=(self.tasks,))
                        for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1)]
        for worker in self.workers:
            worker.start()  # start here so the list keeps Thread objects, not None
        self._stream = BytesIO()

        self._is_open = True

        BufferedIOBase.__init__(self)
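Each of these initializers hands a task_worker callable to its background threads without showing it. A minimal sketch of what such a worker might look like, assuming queued tasks are zero-argument callables and None is used as a shutdown sentinel (both assumptions, not the libraries' actual implementation):

import queue


def task_worker(tasks: queue.Queue):
    # Drain tasks from the queue until a None sentinel arrives.
    # Each queued task is assumed to be a zero-argument callable.
    while True:
        task = tasks.get()
        if task is None:
            tasks.task_done()
            break
        try:
            task()
        finally:
            # Mark the item done so a later queue.join() (e.g. join_tasks()) can unblock.
            tasks.task_done()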
Example #2
    def __init__(self,
                 name,
                 mode='r',
                 buffer_size=None,
                 max_buffers=0,
                 max_workers=None,
                 **kwargs):

        if 'a' in mode:
            # TODO: Implement append mode and remove this exception
            raise NotImplementedError('Not implemented yet in Pycosio')

        BufferedIOBase.__init__(self)
        ObjectIOBase.__init__(self, name, mode=mode)
        WorkerPoolBase.__init__(self, max_workers)

        # Instantiate raw IO
        self._raw = self._RAW_CLASS(name, mode=mode, **kwargs)
        self._raw._is_raw_of_buffered = True

        # Link to RAW methods
        self._mode = self._raw.mode
        self._name = self._raw.name
        self._client_kwargs = self._raw._client_kwargs

        # Initialize buffer
        if not buffer_size or buffer_size < 0:
            self._buffer_size = self.DEFAULT_BUFFER_SIZE
        elif buffer_size < self.MINIMUM_BUFFER_SIZE:
            self._buffer_size = self.MINIMUM_BUFFER_SIZE
        elif (self.MAXIMUM_BUFFER_SIZE
              and buffer_size > self.MAXIMUM_BUFFER_SIZE):
            self._buffer_size = self.MAXIMUM_BUFFER_SIZE
        else:
            self._buffer_size = buffer_size

        # Initialize write mode
        if self._writable:
            self._max_buffers = max_buffers
            self._buffer_seek = 0
            self._write_buffer = bytearray(self._buffer_size)
            self._seekable = False
            self._write_futures = []
            self._raw_flush = self._raw._flush

            # Size used only with random write access
            # Value will be lazily evaluated later if needed.
            self._size_synched = False
            self._size = 0
            self._size_lock = Lock()

        # Initialize read mode
        else:
            self._size = self._raw._size
            self._read_range = self.raw._read_range
            if max_buffers:
                self._max_buffers = max_buffers
            else:
                self._max_buffers = ceil(self._size / self._buffer_size)
            self._read_queue = dict()
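The buffer-size branch above clamps the requested size into the class's allowed range. The same rule as a standalone function, with hypothetical constants standing in for DEFAULT_BUFFER_SIZE, MINIMUM_BUFFER_SIZE and MAXIMUM_BUFFER_SIZE:

DEFAULT_BUFFER_SIZE = 8 * 1024 * 1024   # hypothetical values, for illustration only
MINIMUM_BUFFER_SIZE = 5 * 1024 * 1024
MAXIMUM_BUFFER_SIZE = 0                  # a falsy value means "no upper bound"


def clamp_buffer_size(buffer_size):
    # Fall back to the default when unset or negative, then clamp to the allowed range.
    if not buffer_size or buffer_size < 0:
        return DEFAULT_BUFFER_SIZE
    if buffer_size < MINIMUM_BUFFER_SIZE:
        return MINIMUM_BUFFER_SIZE
    if MAXIMUM_BUFFER_SIZE and buffer_size > MAXIMUM_BUFFER_SIZE:
        return MAXIMUM_BUFFER_SIZE
    return buffer_size


print(clamp_buffer_size(1024))  # clamped up to MINIMUM_BUFFER_SIZE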
Example #3
    def __init__(self, bucket, key, *, chunk_size=DEFAULT_CHUNK_SIZE,
                 max_file_log_time=DEFAULT_ROTATION_TIME_SECS, max_file_size_bytes=MAX_FILE_SIZE_BYTES,
                 encoder='utf-8', workers=2, compress=False, key_id=None, secret=None, token=None):

        self.session = Session(aws_access_key_id=key_id, aws_secret_access_key=secret, aws_session_token=token)
        self.s3 = self.session.resource('s3')
        self.start_time = int(time.time())
        self.key = key.strip('/')
        self.chunk_size = chunk_size
        self.max_file_log_time = max_file_log_time
        self.max_file_size_bytes = max_file_size_bytes
        self.current_file_name = "{}_{}".format(key, int(time.time()))
        if compress:
            self.current_file_name = "{}.gz".format(self.current_file_name)
        self.encoder = encoder

        try:
            self.s3.meta.client.head_bucket(Bucket=bucket)
        except Exception:
            raise ValueError('Bucket %s does not exist, or permissions are missing' % bucket)

        self._bucket = self.s3.Bucket(bucket)
        self._current_object = self._get_stream_object(self.current_file_name)
        self.workers = [threading.Thread(target=task_worker, args=(self._rotation_queue,))
                        for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1)]
        self.stream_bg_workers = [threading.Thread(target=task_worker, args=(self._stream_buffer_queue,))
                                  for _ in range(max(int(max(workers, MIN_WORKERS_NUM) / 2), 1))]
        for worker in self.workers + self.stream_bg_workers:
            worker.start()  # start here so the lists keep Thread objects, not None

        self._is_open = True
        self.compress = compress

        BufferedIOBase.__init__(self)
Example #4
    def __init__(self,
                 name,
                 mode="r",
                 buffer_size=None,
                 max_buffers=0,
                 max_workers=None,
                 **kwargs):

        if "a" in mode:
            raise NotImplementedError('"a" mode not implemented yet')

        BufferedIOBase.__init__(self)
        ObjectIOBase.__init__(self, name, mode=mode)
        WorkerPoolBase.__init__(self, max_workers)

        self._raw = self._RAW_CLASS(name, mode=mode, **kwargs)
        self._raw._is_raw_of_buffered = True
        self._mode = self._raw.mode
        self._name = self._raw.name
        self._client_kwargs = self._raw._client_kwargs

        if not buffer_size or buffer_size < 0:
            self._buffer_size = self.DEFAULT_BUFFER_SIZE
        elif buffer_size < self.MINIMUM_BUFFER_SIZE:
            self._buffer_size = self.MINIMUM_BUFFER_SIZE
        elif self.MAXIMUM_BUFFER_SIZE and buffer_size > self.MAXIMUM_BUFFER_SIZE:
            self._buffer_size = self.MAXIMUM_BUFFER_SIZE
        else:
            self._buffer_size = buffer_size

        if self._writable:
            self._max_buffers = max_buffers
            self._buffer_seek = 0
            self._write_buffer = bytearray(self._buffer_size)
            self._seekable = False
            self._write_futures = []
            self._raw_flush = self._raw._flush

            # Size used only with random write access
            # Value will be lazily evaluated later if needed.
            self._size_synched = False
            self._size = 0
            self._size_lock = Lock()

        else:
            self._size = self._raw._size
            self._read_range = self.raw._read_range
            self._seekable = self.raw._seekable
            if max_buffers:
                self._max_buffers = max_buffers
            else:
                self._max_buffers = ceil(self._size / self._buffer_size)
            self._read_queue = dict()
Example #5
    def __init__(self,
                 stream_name: str,
                 partition_key: str,
                 *,
                 chunk_size: int = DEFAULT_CHUNK_SIZE,
                 encoder: str = 'utf-8',
                 workers: int = 1,
                 **boto_session_kwargs):
        """

        :param stream_name: Name of the Kinesis stream
        :type stream_name: str
        :param partition_key: Kinesis partition key used to group data by shards
        :type partition_key: str
        :param chunk_size: the size of a chunk of records used as the rotation threshold (default 524288)
        :type chunk_size: int
        :param encoder: the encoder to be used for log records (default 'utf-8')
        :type encoder: str
        :param workers: the number of background workers that rotate log records (default 1)
        :type workers: int
        :param boto_session_kwargs: additional keyword arguments for the AWS Kinesis client
        :type boto_session_kwargs: boto3 client keyword arguments
        """

        self._client = client('kinesis', **boto_session_kwargs)
        self.chunk_size = chunk_size
        self.stream_name = stream_name
        self.tasks = Queue()
        self.partition_key = partition_key
        self.encoder = encoder

        try:
            stream_desc = self._client.describe_stream(
                StreamName=self.stream_name)
            if stream_desc['StreamDescription']['StreamStatus'] != 'ACTIVE':
                raise AssertionError
        except Exception:
            raise ValueError(
                'Kinesis stream %s does not exist, is inactive, or permissions are insufficient'
                % stream_name)

        self.workers = [
            threading.Thread(target=task_worker,
                             args=(self.tasks, ),
                             daemon=True)
            for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1)
        ]
        for worker in self.workers:
            worker.start()  # start here so the list keeps Thread objects, not None
        self._stream = BytesIO()

        self._is_open = True

        BufferedIOBase.__init__(self)
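Because these stream objects subclass BufferedIOBase, a natural way to use one is as the stream behind a standard logging.StreamHandler. A usage sketch, assuming the class from Example #5 is exposed as KinesisStream in a kinesis_stream module (both names are assumptions) and that its write() accepts str log records, encoding them with the configured encoder:

import logging

# Hypothetical module and class name for the Kinesis stream shown in Example #5.
from kinesis_stream import KinesisStream

stream = KinesisStream('my-log-stream', partition_key='app-1', workers=1)
handler = logging.StreamHandler(stream=stream)
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s'))

logger = logging.getLogger('app')
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.info('shipped to Kinesis')

# logging.shutdown() flushes and closes every handler, which in turn
# closes the underlying stream and lets its worker queue drain.
logging.shutdown()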
Example #6
    def __init__(self, bucket: str, key: str, *, chunk_size: int = DEFAULT_CHUNK_SIZE,
                 max_file_log_time: int = DEFAULT_ROTATION_TIME_SECS, max_file_size_bytes: int = MAX_FILE_SIZE_BYTES,
                 encoder: str = 'utf-8', workers: int = 1, compress: bool = False, **boto_session_kwargs):
        """

        :param bucket: name of the s3 bucket
        :type bucket: str
        :param key: s3 key path
        :type key: str
        :param chunk_size: size of multipart upload chunk size (default 5MB)
        :type chunk_size: int
        :param max_file_log_time: time threshold, in seconds, before the log file is rotated (default 12 hours)
        :type max_file_log_time: int
        :param max_file_size_bytes: file size threshold for rotation, in bytes (default 100MB)
        :type max_file_size_bytes: int
        :param encoder: the encoder to be used for log records (default 'utf-8')
        :type encoder: str
        :param workers: the number of background workers that rotate log records (default 1)
        :type workers: int
        :param compress: whether to compress (gzip) the file content
        :type compress: bool
        :param boto_session_kwargs: additional keyword arguments for the AWS S3 resource
        :type boto_session_kwargs: boto3 resource keyword arguments
        """

        self._session = Session()
        self.s3 = self._session.resource('s3', **boto_session_kwargs)
        self.start_time = int(datetime.utcnow().strftime('%s'))
        self.key = key
        self.chunk_size = chunk_size
        self.max_file_log_time = max_file_log_time
        self.max_file_size_bytes = max_file_size_bytes
        self.current_file_name = "{}_{}".format(key, int(datetime.utcnow().strftime('%s')))
        if compress:
            self.current_file_name = "{}.gz".format(self.current_file_name)
        self.encoder = encoder

        self.bucket = bucket
        self._current_object = self._get_stream_object(self.current_file_name)
        self.workers = [threading.Thread(target=task_worker, args=(self._rotation_queue,), daemon=True)
                        for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1)]
        self._stream_bg_workers = [threading.Thread(target=task_worker, args=(self._stream_buffer_queue,), daemon=True)
                                   for _ in range(max(int(max(workers, MIN_WORKERS_NUM) / 2), 1))]
        for worker in self.workers + self._stream_bg_workers:
            worker.start()  # start here so the lists keep Thread objects, not None

        self._is_open = True
        self.compress = compress

        BufferedIOBase.__init__(self)
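The S3 examples build each object name from the base key plus the epoch start time, adding a .gz suffix when compression is enabled (see current_file_name above and get_filename in Example #8). The naming rule on its own, as a small illustrative helper:

import time


def rotated_key(key: str, compress: bool = False) -> str:
    # "<key>_<epoch seconds>", plus ".gz" when the content is compressed.
    name = "{}_{}".format(key.strip('/'), int(time.time()))
    return "{}.gz".format(name) if compress else name


print(rotated_key('logs/app', compress=True))  # e.g. logs/app_1700000000.gz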
Example #7
    def __init__(self):
        self.coroutine_result = None
        self.buffer = bytes()
        BufferedIOBase.__init__(self)
Example #8
        try:
            s3_resource.meta.client.head_bucket(Bucket=bucket)
        except Exception:
            raise ValueError('Bucket %s does not exist, or permissions are missing' % bucket)

        self._bucket = s3_resource.Bucket(bucket)
        self._current_object = self._get_stream_object(self.current_file_name)
        self.workers = [threading.Thread(target=task_worker, args=(self._rotation_queue,))
                        for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1)]
        self.stream_bg_workers = [threading.Thread(target=task_worker, args=(self._stream_buffer_queue,))
                                  for _ in range(max(int(max(workers, MIN_WORKERS_NUM) / 2), 1))]
        for worker in self.workers + self.stream_bg_workers:
            worker.start()  # start here so the lists keep Thread objects, not None

        self._is_open = True
        self.compress = compress

        BufferedIOBase.__init__(self)

    def get_filename(self):
        filename = '{}_{}'.format(self.key, self.start_time)
        if not self.compress:
            return filename
        return '{}.gz'.format(filename)

    def add_task(self, task):
        self._rotation_queue.put(task)

    def join_tasks(self):
        self._rotation_queue.join()

    def _get_stream_object(self, filename):
        try: