import queue
import threading
from datetime import datetime
from io import BufferedIOBase, BytesIO

from boto3 import Session, client

# DEFAULT_CHUNK_SIZE, MAX_CHUNK_SIZE, MAX_FILE_SIZE_BYTES, DEFAULT_ROTATION_TIME_SECS,
# MIN_WORKERS_NUM, Task, task_worker and StreamObject are module-level constants and
# helpers defined elsewhere in this module.


class KinesisStreamer(BufferedIOBase):
    """
    The stream interface used by the handler which binds to Kinesis and utilizes the object class
    """
    _stream_buffer_queue = queue.Queue()

    def __init__(self, key_id, secret, stream_name, region, partition, chunk_size=DEFAULT_CHUNK_SIZE,
                 encoder='utf-8', workers=2):
        self.kinesis = client('kinesis', region_name=region, aws_access_key_id=key_id,
                              aws_secret_access_key=secret)
        self.chunk_size = chunk_size
        self.stream_name = stream_name
        self.region = region
        self.tasks = queue.Queue()
        self.partition = partition
        self.encoder = encoder

        try:
            stream_desc = self.kinesis.describe_stream(StreamName=self.stream_name)
            if stream_desc['StreamDescription']['StreamStatus'] != 'ACTIVE':
                raise AssertionError
        except Exception:
            raise ValueError('Kinesis stream %s does not exist, is inactive, or permissions are insufficient'
                             % stream_name)

        # Keep references to the worker threads; Thread.start() returns None, so storing its
        # return value would leave a list of Nones.
        self.workers = [threading.Thread(target=task_worker, args=(self.tasks,))
                        for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1)]
        for worker in self.workers:
            worker.start()

        self._stream = BytesIO()
        self._is_open = True

        BufferedIOBase.__init__(self)

    def add_task(self, task):
        self.tasks.put(task)

    def join_tasks(self):
        self.tasks.join()

    def _rotate_chunk(self, run_async=True):
        """
        Send the accumulated records to the stream and clear the buffer
        :param run_async: Indicates whether the rotation should be asynchronous on a different thread
        :type run_async: bool
        :return:
        """
        assert self._stream, "Stream object not found"

        buffer = self._stream
        self._stream = BytesIO()
        if buffer.tell() > MAX_CHUNK_SIZE:
            # We are limited to a size of 1 MB per stream upload command, so carry anything
            # beyond MAX_CHUNK_SIZE over to the fresh buffer and trim this one before upload.
            buffer.seek(MAX_CHUNK_SIZE)
            self._stream.write(buffer.read())
            buffer.truncate(MAX_CHUNK_SIZE)
        buffer.seek(0)

        if run_async:
            self.add_task(Task(self._upload_part, buffer))
        else:
            self._upload_part(buffer)
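# --- Illustrative usage (not part of the original module) ---------------------------------
# A minimal sketch of wiring a Kinesis-backed stream into the standard logging machinery,
# assuming the full KinesisStreamer class (not shown in this excerpt) implements the
# write()/flush()/close() methods of BufferedIOBase. The helper name, stream name and
# placeholder credentials below are hypothetical.
def _example_kinesis_logger():
    import logging

    stream = KinesisStreamer(key_id='<AWS_ACCESS_KEY_ID>', secret='<AWS_SECRET_ACCESS_KEY>',
                             stream_name='application-logs', region='us-east-1',
                             partition='service-a')
    handler = logging.StreamHandler(stream=stream)  # StreamHandler writes each record to the stream
    logger = logging.getLogger('kinesis_example')
    logger.addHandler(handler)
    logger.warning('log record shipped to Kinesis via KinesisStreamer')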
def _rotate_chunk(self, run_async=True): """ Send the accumulated records to the stream and clear the buffer :param run_async: Indicates whether the rotation should by asynchronous on a different thread :type run_async: bool :return: """ assert self._stream, "Stream object not found" buffer = self._stream self._stream = BytesIO() if buffer.tell() > MAX_CHUNK_SIZE: # We are limited to a size of 1 MB per stream upload command so we need to enforce it chunk_delta = MAX_CHUNK_SIZE - buffer.tell() buffer.seek(chunk_delta) self._stream.write(buffer.read()) buffer.seek(0) if run_async: self.add_task(Task(self._upload_part, buffer)) else: self._upload_part(buffer)
class S3Streamer(BufferedIOBase):
    """
    The stream interface used by the handler which binds to S3 and utilizes the object class
    """
    _stream_buffer_queue = queue.Queue()
    _rotation_queue = queue.Queue()

    def __init__(self, bucket, key_id, secret, key, chunk_size=DEFAULT_CHUNK_SIZE,
                 max_file_log_time=DEFAULT_ROTATION_TIME_SECS, max_file_size_bytes=MAX_FILE_SIZE_BYTES,
                 encoder='utf-8', workers=2, compress=False):
        self.session = Session(key_id, secret)
        self.s3 = self.session.resource('s3')
        self.start_time = int(datetime.utcnow().strftime('%s'))
        self.key = key.strip('/')
        self.chunk_size = chunk_size
        self.max_file_log_time = max_file_log_time
        self.max_file_size_bytes = max_file_size_bytes
        self.current_file_name = "{}_{}".format(key, int(datetime.utcnow().strftime('%s')))
        if compress:
            self.current_file_name = "{}.gz".format(self.current_file_name)
        self.encoder = encoder

        try:
            self.s3.meta.client.head_bucket(Bucket=bucket)
        except Exception:
            raise ValueError('Bucket %s does not exist, or missing permissions' % bucket)

        self._bucket = self.s3.Bucket(bucket)
        self._current_object = self._get_stream_object(self.current_file_name)

        # Keep references to the worker threads; Thread.start() returns None, so storing its
        # return value would leave lists of Nones.
        self.workers = [threading.Thread(target=task_worker, args=(self._rotation_queue,))
                        for _ in range(int(max(workers, MIN_WORKERS_NUM) / 2) + 1)]
        self.stream_bg_workers = [threading.Thread(target=task_worker, args=(self._stream_buffer_queue,))
                                  for _ in range(max(int(max(workers, MIN_WORKERS_NUM) / 2), 1))]
        for worker in self.workers + self.stream_bg_workers:
            worker.start()

        self._is_open = True
        self.compress = compress

        BufferedIOBase.__init__(self)

    def get_filename(self):
        filename = "{}_{}".format(self.key, self.start_time)
        if not self.compress:
            return filename
        return "{}.gz".format(filename)

    def add_task(self, task):
        self._rotation_queue.put(task)

    def join_tasks(self):
        self._rotation_queue.join()

    def _get_stream_object(self, filename):
        try:
            return StreamObject(self.s3, self._bucket.name, filename, self._stream_buffer_queue)
        except Exception:
            raise RuntimeError('Failed to open new S3 stream object')

    def _rotate_chunk(self, run_async=True):
        assert self._current_object, "Stream object not found"

        part_num = self._current_object.chunk_count + 1
        part = self._current_object.uploader.Part(part_num)
        buffer = self._current_object.buffer
        self._current_object.buffer = BytesIO()
        buffer.seek(0)

        if run_async:
            self._current_object.add_task(Task(self._upload_part, self._current_object, part, part_num, buffer))
        else:
            self._upload_part(self._current_object, part, part_num, buffer)

        self._current_object.chunk_count += 1
    @staticmethod
    def _upload_part(s3_object, part, part_num, buffer):
        upload = part.upload(Body=buffer)
        s3_object.parts.append({'ETag': upload['ETag'], 'PartNumber': part_num})

    def _rotate_file(self):
        if self._current_object.buffer.tell() > 0:
            self._rotate_chunk()

        temp_object = self._current_object
        self.add_task(Task(self._close_stream, stream_object=temp_object))
        self.start_time = int(datetime.utcnow().strftime('%s'))
        new_filename = self.get_filename()
        self._current_object = self._get_stream_object(new_filename)
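# --- Illustrative usage (not part of the original module) ---------------------------------
# A minimal sketch of attaching an S3-backed stream to a logger, under the same assumption
# that the complete S3Streamer implements the BufferedIOBase write()/flush()/close() methods
# elsewhere in the module. The bucket, key and helper name below are hypothetical.
def _example_s3_logger():
    import logging

    stream = S3Streamer(bucket='my-log-bucket', key_id='<AWS_ACCESS_KEY_ID>',
                        secret='<AWS_SECRET_ACCESS_KEY>', key='logs/service-a', compress=True)
    handler = logging.StreamHandler(stream=stream)  # records are buffered and uploaded as multipart chunks
    logger = logging.getLogger('s3_example')
    logger.addHandler(handler)
    logger.error('log record staged for multipart upload to S3 via S3Streamer')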