def __init__(self, key_id, secret, stream_name, region, partition,
             chunk_size=DEFAULT_CHUNK_SIZE, encoder='utf-8', workers=2):
    self.kinesis = client('kinesis', region_name=region,
                          aws_access_key_id=key_id,
                          aws_secret_access_key=secret)
    self.chunk_size = chunk_size
    self.stream_name = stream_name
    self.region = region
    self.tasks = queue.Queue()
    self.partition = partition
    self.encoder = encoder
    try:
        stream_desc = self.kinesis.describe_stream(StreamName=self.stream_name)
        if stream_desc['StreamDescription']['StreamStatus'] != 'ACTIVE':
            raise AssertionError
    except Exception:
        raise ValueError('Kinesis stream %s does not exist or is inactive, '
                         'or permissions are insufficient' % stream_name)
    # Start the worker threads and keep the Thread objects; storing the
    # result of start() (as the original one-liner did) keeps a list of None.
    self.workers = []
    for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1):
        worker = threading.Thread(target=task_worker, args=(self.tasks,))
        worker.start()
        self.workers.append(worker)
    self._stream = BytesIO()
    self._is_open = True
    BufferedIOBase.__init__(self)
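# These snippets hand an undefined `task_worker` callable to their threads.
# A minimal sketch of such a worker, assuming the queue carries zero-argument
# callables and a None sentinel requests shutdown (the real implementation is
# not shown in these snippets):
def task_worker(tasks):
    # Drain the queue forever; mark each item done so Queue.join() can
    # unblock callers waiting on a flush.
    while True:
        task = tasks.get()
        if task is None:
            tasks.task_done()
            break
        try:
            task()
        finally:
            tasks.task_done()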
def __init__(self, name, mode='r', buffer_size=None, max_buffers=0,
             max_workers=None, **kwargs):
    if 'a' in mode:
        # TODO: Implement append mode and remove this exception
        raise NotImplementedError('Not implemented yet in Pycosio')

    BufferedIOBase.__init__(self)
    ObjectIOBase.__init__(self, name, mode=mode)
    WorkerPoolBase.__init__(self, max_workers)

    # Instantiate raw IO
    self._raw = self._RAW_CLASS(name, mode=mode, **kwargs)
    self._raw._is_raw_of_buffered = True

    # Link to RAW methods
    self._mode = self._raw.mode
    self._name = self._raw.name
    self._client_kwargs = self._raw._client_kwargs

    # Initialize buffer, clamping the requested size into the
    # [MINIMUM_BUFFER_SIZE, MAXIMUM_BUFFER_SIZE] range
    if not buffer_size or buffer_size < 0:
        self._buffer_size = self.DEFAULT_BUFFER_SIZE
    elif buffer_size < self.MINIMUM_BUFFER_SIZE:
        self._buffer_size = self.MINIMUM_BUFFER_SIZE
    elif self.MAXIMUM_BUFFER_SIZE and buffer_size > self.MAXIMUM_BUFFER_SIZE:
        self._buffer_size = self.MAXIMUM_BUFFER_SIZE
    else:
        self._buffer_size = buffer_size

    # Initialize write mode
    if self._writable:
        self._max_buffers = max_buffers
        self._buffer_seek = 0
        self._write_buffer = bytearray(self._buffer_size)
        self._seekable = False
        self._write_futures = []
        self._raw_flush = self._raw._flush

        # Size used only with random write access.
        # Value will be lazily evaluated later if needed.
        self._size_synched = False
        self._size = 0
        self._size_lock = Lock()

    # Initialize read mode
    else:
        self._size = self._raw._size
        self._read_range = self._raw._read_range
        if max_buffers:
            self._max_buffers = max_buffers
        else:
            self._max_buffers = ceil(self._size / self._buffer_size)
        self._read_queue = dict()
def __init__(self, bucket, key, *, chunk_size=DEFAULT_CHUNK_SIZE,
             max_file_log_time=DEFAULT_ROTATION_TIME_SECS,
             max_file_size_bytes=MAX_FILE_SIZE_BYTES, encoder='utf-8',
             workers=2, compress=False, key_id=None, secret=None, token=None):
    self.session = Session(aws_access_key_id=key_id,
                           aws_secret_access_key=secret,
                           aws_session_token=token)
    self.s3 = self.session.resource('s3')
    self.start_time = int(time.time())
    self.key = key.strip('/')
    self.chunk_size = chunk_size
    self.max_file_log_time = max_file_log_time
    self.max_file_size_bytes = max_file_size_bytes
    self.current_file_name = "{}_{}".format(key, int(time.time()))
    if compress:
        self.current_file_name = "{}.gz".format(self.current_file_name)
    self.encoder = encoder
    try:
        self.s3.meta.client.head_bucket(Bucket=bucket)
    except Exception:
        raise ValueError('Bucket %s does not exist, or permissions are missing' % bucket)
    self._bucket = self.s3.Bucket(bucket)
    self._current_object = self._get_stream_object(self.current_file_name)
    # Start rotation and stream-upload workers; keep the Thread objects
    # rather than the None returned by start().
    self.workers = []
    for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1):
        worker = threading.Thread(target=task_worker, args=(self._rotation_queue,))
        worker.start()
        self.workers.append(worker)
    self.stream_bg_workers = []
    for _ in range(max(int(max(workers, MIN_WORKERS_NUM) / 2), 1)):
        worker = threading.Thread(target=task_worker, args=(self._stream_buffer_queue,))
        worker.start()
        self.stream_bg_workers.append(worker)
    self._is_open = True
    self.compress = compress
    BufferedIOBase.__init__(self)
def __init__(self, name, mode="r", buffer_size=None, max_buffers=0,
             max_workers=None, **kwargs):
    if "a" in mode:
        raise NotImplementedError('"a" mode not implemented yet')

    BufferedIOBase.__init__(self)
    ObjectIOBase.__init__(self, name, mode=mode)
    WorkerPoolBase.__init__(self, max_workers)

    self._raw = self._RAW_CLASS(name, mode=mode, **kwargs)
    self._raw._is_raw_of_buffered = True
    self._mode = self._raw.mode
    self._name = self._raw.name
    self._client_kwargs = self._raw._client_kwargs

    if not buffer_size or buffer_size < 0:
        self._buffer_size = self.DEFAULT_BUFFER_SIZE
    elif buffer_size < self.MINIMUM_BUFFER_SIZE:
        self._buffer_size = self.MINIMUM_BUFFER_SIZE
    elif self.MAXIMUM_BUFFER_SIZE and buffer_size > self.MAXIMUM_BUFFER_SIZE:
        self._buffer_size = self.MAXIMUM_BUFFER_SIZE
    else:
        self._buffer_size = buffer_size

    if self._writable:
        self._max_buffers = max_buffers
        self._buffer_seek = 0
        self._write_buffer = bytearray(self._buffer_size)
        self._seekable = False
        self._write_futures = []
        self._raw_flush = self._raw._flush
        # Size used only with random write access.
        # Value will be lazily evaluated later if needed.
        self._size_synched = False
        self._size = 0
        self._size_lock = Lock()
    else:
        self._size = self._raw._size
        self._read_range = self._raw._read_range
        self._seekable = self._raw._seekable
        if max_buffers:
            self._max_buffers = max_buffers
        else:
            self._max_buffers = ceil(self._size / self._buffer_size)
        self._read_queue = dict()
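# The buffer-size clamping above can be checked with concrete numbers. A
# standalone sketch of the same branch cascade, using illustrative
# (hypothetical) limits rather than the class constants:
from math import ceil

DEFAULT_BUFFER_SIZE = 8 * 1024 * 1024   # assumed value for illustration
MINIMUM_BUFFER_SIZE = 5 * 1024 * 1024   # assumed value for illustration
MAXIMUM_BUFFER_SIZE = 0                 # falsy value disables the upper bound


def clamp_buffer_size(buffer_size):
    # Mirrors the if/elif cascade in the __init__ above.
    if not buffer_size or buffer_size < 0:
        return DEFAULT_BUFFER_SIZE
    if buffer_size < MINIMUM_BUFFER_SIZE:
        return MINIMUM_BUFFER_SIZE
    if MAXIMUM_BUFFER_SIZE and buffer_size > MAXIMUM_BUFFER_SIZE:
        return MAXIMUM_BUFFER_SIZE
    return buffer_size


assert clamp_buffer_size(None) == DEFAULT_BUFFER_SIZE
assert clamp_buffer_size(1024) == MINIMUM_BUFFER_SIZE
# In read mode, a 20 MiB object with 8 MiB buffers needs ceil(20/8) = 3 buffers:
assert ceil(20 * 1024 * 1024 / (8 * 1024 * 1024)) == 3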
def __init__(self, stream_name: str, partition_key: str, *,
             chunk_size: int = DEFAULT_CHUNK_SIZE, encoder: str = 'utf-8',
             workers: int = 1, **boto_session_kwargs):
    """
    :param stream_name: name of the Kinesis stream
    :type stream_name: str
    :param partition_key: Kinesis partition key used to group data by shards
    :type partition_key: str
    :param chunk_size: size of a chunk of records used as the rotation threshold (default 524288)
    :type chunk_size: int
    :param encoder: encoder used for log records (default 'utf-8')
    :type encoder: str
    :param workers: number of background workers that rotate log records (default 1)
    :type workers: int
    :param boto_session_kwargs: additional keyword arguments passed to the boto3 Kinesis client
    :type boto_session_kwargs: boto3 client keyword arguments
    """
    self._client = client('kinesis', **boto_session_kwargs)
    self.chunk_size = chunk_size
    self.stream_name = stream_name
    self.tasks = Queue()
    self.partition_key = partition_key
    self.encoder = encoder
    try:
        stream_desc = self._client.describe_stream(StreamName=self.stream_name)
        if stream_desc['StreamDescription']['StreamStatus'] != 'ACTIVE':
            raise AssertionError
    except Exception:
        raise ValueError('Kinesis stream %s does not exist or is inactive, '
                         'or permissions are insufficient' % stream_name)
    # Keep the Thread objects; start() returns None.
    self.workers = []
    for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1):
        worker = threading.Thread(target=task_worker, args=(self.tasks,), daemon=True)
        worker.start()
        self.workers.append(worker)
    self._stream = BytesIO()
    self._is_open = True
    BufferedIOBase.__init__(self)
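# A hedged usage sketch for the constructor above. The class name
# KinesisStreamObject is hypothetical (the snippet does not show it), and
# write()/close() are assumed to follow the usual BufferedIOBase contract:
stream = KinesisStreamObject('my-log-stream', 'my-partition-key',
                             workers=1, region_name='us-east-1')
stream.write(b'{"event": "example"}\n')  # buffered into the internal BytesIO
stream.close()  # assumed to flush pending records to Kinesis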
def __init__(self, bucket: str, key: str, *, chunk_size: int = DEFAULT_CHUNK_SIZE,
             max_file_log_time: int = DEFAULT_ROTATION_TIME_SECS,
             max_file_size_bytes: int = MAX_FILE_SIZE_BYTES,
             encoder: str = 'utf-8', workers: int = 1, compress: bool = False,
             **boto_session_kwargs):
    """
    :param bucket: name of the S3 bucket
    :type bucket: str
    :param key: S3 key path
    :type key: str
    :param chunk_size: size of a multipart upload chunk (default 5MB)
    :type chunk_size: int
    :param max_file_log_time: threshold period for log accumulation before file rotation (default 12 hours)
    :type max_file_log_time: int
    :param max_file_size_bytes: threshold in bytes for file rotation (default 100MB)
    :type max_file_size_bytes: int
    :param encoder: encoder used for log records (default 'utf-8')
    :type encoder: str
    :param workers: number of background workers that rotate log records (default 1)
    :type workers: int
    :param compress: whether to gzip-compress the file content
    :type compress: bool
    :param boto_session_kwargs: additional keyword arguments passed to the boto3 S3 resource
    :type boto_session_kwargs: boto3 resource keyword arguments
    """
    self._session = Session()
    self.s3 = self._session.resource('s3', **boto_session_kwargs)
    # Note: strftime('%s') is platform-dependent (works on Linux/macOS).
    self.start_time = int(datetime.utcnow().strftime('%s'))
    self.key = key
    self.chunk_size = chunk_size
    self.max_file_log_time = max_file_log_time
    self.max_file_size_bytes = max_file_size_bytes
    self.current_file_name = "{}_{}".format(key, int(datetime.utcnow().strftime('%s')))
    if compress:
        self.current_file_name = "{}.gz".format(self.current_file_name)
    self.encoder = encoder
    self.bucket = bucket
    self._current_object = self._get_stream_object(self.current_file_name)
    # Keep the Thread objects; start() returns None.
    self.workers = []
    for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1):
        worker = threading.Thread(target=task_worker, args=(self._rotation_queue,), daemon=True)
        worker.start()
        self.workers.append(worker)
    self._stream_bg_workers = []
    for _ in range(max(int(max(workers, MIN_WORKERS_NUM) / 2), 1)):
        worker = threading.Thread(target=task_worker, args=(self._stream_buffer_queue,), daemon=True)
        worker.start()
        self._stream_bg_workers.append(worker)
    self._is_open = True
    self.compress = compress
    BufferedIOBase.__init__(self)
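# The matching sketch for the S3 variant; S3StreamObject is likewise a
# hypothetical name, and AWS credentials are assumed to come from the
# standard boto3 lookup chain:
s3_stream = S3StreamObject('my-bucket', 'logs/app', compress=True, workers=1)
s3_stream.write(b'first log line\n')  # rotation and upload run on worker threads
s3_stream.close()  # assumed to upload the final part and seal the object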
def __init__(self):
    self.coroutine_result = None
    self.buffer = bytes()
    # __init__ must return None; no need to return the superclass call.
    BufferedIOBase.__init__(self)
    try:
        s3_resource.meta.client.head_bucket(Bucket=bucket)
    except Exception:
        raise ValueError('Bucket %s does not exist, or permissions are missing' % bucket)
    self._bucket = s3_resource.Bucket(bucket)
    self._current_object = self._get_stream_object(self.current_file_name)
    # Keep the Thread objects; start() returns None.
    self.workers = []
    for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1):
        worker = threading.Thread(target=task_worker, args=(self._rotation_queue,))
        worker.start()
        self.workers.append(worker)
    self.stream_bg_workers = []
    for _ in range(max(int(max(workers, MIN_WORKERS_NUM) / 2), 1)):
        worker = threading.Thread(target=task_worker, args=(self._stream_buffer_queue,))
        worker.start()
        self.stream_bg_workers.append(worker)
    self._is_open = True
    self.compress = compress
    BufferedIOBase.__init__(self)

def get_filename(self):
    filename = '{}_{}'.format(self.key, self.start_time)
    if not self.compress:
        return filename
    return '{}.gz'.format(filename)

def add_task(self, task):
    self._rotation_queue.put(task)

def join_tasks(self):
    self._rotation_queue.join()

def _get_stream_object(self, filename):
    try:
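# add_task()/join_tasks() form a flush barrier over the rotation queue.
# A minimal demonstration with a bare queue.Queue and the task_worker sketch
# from earlier (names otherwise hypothetical):
import queue
import threading

rotation_queue = queue.Queue()
threading.Thread(target=task_worker, args=(rotation_queue,), daemon=True).start()

rotation_queue.put(lambda: print('rotating part'))  # what add_task() does
rotation_queue.join()  # what join_tasks() does: block until every task is done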