class DataBackend(_DataBackend):
    """ A DataBackend which stores in S3 compatible storages. The files are
    stored in a configurable bucket. """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    _SUPPORTS_PARTIAL_READS = False
    _SUPPORTS_PARTIAL_WRITES = False

    fatal_error = None

    def __init__(self, config):
        aws_access_key_id = config.get('aws_access_key_id')
        aws_secret_access_key = config.get('aws_secret_access_key')
        host = config.get('host')
        port = config.getint('port')
        is_secure = config.getboolean('is_secure')
        bucket_name = config.get('bucket_name', 'backy2')
        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        calling_format = boto.s3.connection.OrdinaryCallingFormat()

        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)
        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

        self.conn = boto.connect_s3(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            host=host,
            port=port,
            is_secure=is_secure,
            calling_format=calling_format)

        # create our bucket
        try:
            self.bucket = self.conn.create_bucket(bucket_name)
        except boto.exception.S3CreateError:
            # exists...
            pass
        except OSError as e:
            # no route to host
            self.fatal_error = e
            logger.error('Fatal error, dying: {}'.format(e))
            print('Fatal error: {}'.format(e))
            exit(10)

        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)

        self._writer_threads = []
        self._reader_threads = []
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i,))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i,))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)

    def _writer(self, id_):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None or self.fatal_error:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, data = entry
            time.sleep(self.write_throttling.consume(len(data)))
            t1 = time.time()
            key = self.bucket.new_key(uid)
            try:
                r = key.set_contents_from_string(data)
            except (
                    OSError,
                    boto.exception.BotoServerError,
                    boto.exception.S3ResponseError,
            ) as e:
                # OSError happens when the S3 host is gone (i.e. network died,
                # host down, ...). boto tries hard to recover, however after
                # several attempts it will give up and raise.
                # BotoServerError happens when there is no server.
                # S3ResponseError sometimes happens when the cluster is about
                # to shut down. Hard to reproduce because the writer must write
                # in exactly this moment.
                # We let the backup job die here fatally.
                self.fatal_error = e
                logger.error('Fatal error, dying: {}'.format(e))
                #exit('Fatal error: {}'.format(e))  # this only raises SystemExit
                os._exit(11)
            t2 = time.time()
            assert r == len(data)
            self._write_queue.task_done()
            logger.debug(
                'Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'
                .format(id_, uid, t2 - t1, self._write_queue.qsize()))

    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.fatal_error:
                logger.debug("Reader {} finishing.".format(id_))
                break
            t1 = time.time()
            try:
                data = self.read_raw(block.uid)
            except FileNotFoundError:
                self._read_data_queue.put((block, None))  # catch this!
            else:
                self._read_data_queue.put((block, data))
            t2 = time.time()
            self._read_queue.task_done()
            logger.debug(
                'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def read_raw(self, block_uid):
        key = self.bucket.get_key(block_uid)
        if not key:
            raise FileNotFoundError('UID {} not found.'.format(block_uid))
        while True:
            try:
                data = key.get_contents_as_string()
            except socket.timeout:
                logger.error('Timeout while fetching from s3, trying again.')
            else:
                break
        time.sleep(self.read_throttling.consume(len(data)))
        return data

    def _uid(self):
        # 32 chars are allowed and we need to spread the first few chars so
        # that blobs are distributed nicely. And we want to avoid hash collisions.
        # So we create a real base57-encoded uuid (22 chars) and prefix it with
        # its own md5 hash[:10].
        suuid = shortuuid.uuid()
        hash = hashlib.md5(suuid.encode('ascii')).hexdigest()
        return hash[:10] + suuid

    def save(self, data, _sync=False):
        if self.fatal_error:
            raise self.fatal_error
        uid = self._uid()
        self._write_queue.put((uid, data))
        if _sync:
            self._write_queue.join()
        return uid

    def rm(self, uid):
        key = self.bucket.get_key(uid)
        if not key:
            raise FileNotFoundError('UID {} not found.'.format(uid))
        self.bucket.delete_key(uid)

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        errors = self.bucket.delete_keys(uids, quiet=True)
        if errors.errors:
            # unable to test this. ceph object gateway doesn't return errors.
            # raise FileNotFoundError('UIDS {} not found.'.format(errors.errors))
            return errors.errors  # TODO: which should be a list of uids.

    def read(self, block, sync=False):
        self._read_queue.put(block)
        if sync:
            rblock, offset, length, data = self.read_get()
            if rblock.id != block.id:
                raise RuntimeError('Do not mix threaded reading with sync reading!')
            if data is None:
                raise FileNotFoundError('UID {} not found.'.format(block.uid))
            return data

    def read_get(self):
        block, data = self._read_data_queue.get()
        offset = 0
        length = len(data)
        self._read_data_queue.task_done()
        return block, offset, length, data

    def read_queue_size(self):
        return self._read_queue.qsize()

    def get_all_blob_uids(self, prefix=None):
        return [k.name for k in self.bucket.list(prefix)]

    def close(self):
        for _writer_thread in self._writer_threads:
            self._write_queue.put(None)  # ends the thread
        for _writer_thread in self._writer_threads:
            _writer_thread.join()
        for _reader_thread in self._reader_threads:
            self._read_queue.put(None)  # ends the thread
        for _reader_thread in self._reader_threads:
            _reader_thread.join()
        self.conn.close()
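# Illustrative usage sketch (not part of the backend above): how the asynchronous
# save/read API is typically driven. The `config` object and the `block` with
# `.id`/`.uid` attributes are assumptions for this example; the real objects come
# from the surrounding backy2 code.
def _example_s3_backend_usage(config, block):
    backend = DataBackend(config)
    # Synchronous write: save() queues the data and join()s the write queue.
    uid = backend.save(b'\x00' * 4096, _sync=True)
    # Synchronous read: read() enqueues the block and waits for the result.
    data = backend.read(block, sync=True)
    # Asynchronous read: enqueue now, collect the result later via read_get().
    backend.read(block)
    rblock, offset, length, data = backend.read_get()
    # close() sends one None per worker thread and joins them.
    backend.close()
    return uid, data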
class DataBackend(_DataBackend):
    """ A DataBackend which stores in S3 compatible storages. The files are
    stored in a configurable bucket. """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    last_exception = None

    def __init__(self, config):
        aws_access_key_id = config.get('aws_access_key_id')
        if aws_access_key_id is None:
            aws_access_key_id_file = config.get('aws_access_key_id_file')
            with open(aws_access_key_id_file, 'r', encoding="ascii") as f:
                aws_access_key_id = f.read().rstrip()
        aws_secret_access_key = config.get('aws_secret_access_key')
        if aws_secret_access_key is None:
            aws_secret_access_key_file = config.get('aws_secret_access_key_file')
            with open(aws_secret_access_key_file, 'r', encoding="ascii") as f:
                aws_secret_access_key = f.read().rstrip()
        region_name = config.get('region_name', '')
        endpoint_url = config.get('endpoint_url', '')
        use_ssl = config.get('use_ssl', '')
        self._bucket_name = config.get('bucket_name', '')
        addressing_style = config.get('addressing_style', '')
        signature_version = config.get('signature_version', '')
        self._disable_encoding_type = config.get('disable_encoding_type', '')

        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)
        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

        self._resource_config = {
            'aws_access_key_id': aws_access_key_id,
            'aws_secret_access_key': aws_secret_access_key,
        }
        if region_name:
            self._resource_config['region_name'] = region_name
        if endpoint_url:
            self._resource_config['endpoint_url'] = endpoint_url
        if use_ssl:
            self._resource_config['use_ssl'] = use_ssl

        resource_config = {}
        if addressing_style:
            resource_config['s3'] = {'addressing_style': addressing_style}
        if signature_version:
            resource_config['signature_version'] = signature_version
        # TODO
        #resource_config['max_pool_connections'] = 100
        #resource_config['parameter_validation'] = False
        #resource_config['use_accelerate_endpoint'] = True
        self._resource_config['config'] = BotoCoreClientConfig(**resource_config)

        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)

        self.bucket = self._get_bucket()  # for read_raw

        self._writer_threads = []
        self._reader_threads = []
        self.reader_thread_status = {}
        self.writer_thread_status = {}
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i,))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
            self.writer_thread_status[i] = STATUS_NOTHING
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i,))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)
            self.reader_thread_status[i] = STATUS_NOTHING

    def _get_bucket(self):
        session = boto3.session.Session()
        if self._disable_encoding_type:
            session.events.unregister('before-parameter-build.s3.ListObjects',
                                      set_list_objects_encoding_type_url)
        resource = session.resource('s3', **self._resource_config)
        bucket = resource.Bucket(self._bucket_name)
        return bucket

    def _get_client(self):
        session = boto3.session.Session()
        if self._disable_encoding_type:
            session.events.unregister('before-parameter-build.s3.ListObjects',
                                      set_list_objects_encoding_type_url)
        client = session.client('s3', **self._resource_config)
        return client

    def _writer(self, id_):
        """ A threaded background writer """
        #bucket = None
        client = None
        while True:
            self.writer_thread_status[id_] = STATUS_QUEUE
            entry = self._write_queue.get()
            self.writer_thread_status[id_] = STATUS_NOTHING
            if entry is None or self.last_exception:
                logger.debug("Writer {} finishing.".format(id_))
                break
            if client is None:
                client = self._get_client()
            uid, data, callback = entry

            self.writer_thread_status[id_] = STATUS_THROTTLING
            time.sleep(self.write_throttling.consume(len(data)))
            self.writer_thread_status[id_] = STATUS_NOTHING

            try:
                self.writer_thread_status[id_] = STATUS_WRITING
                client.put_object(Body=data, Key=uid, Bucket=self._bucket_name)
                #client.upload_fileobj(io.BytesIO(data), Key=uid, Bucket=self._bucket_name)
                self.writer_thread_status[id_] = STATUS_NOTHING
                #if random.random() > 0.9:
                #    raise ValueError("This is a test")
            except Exception as e:
                self.last_exception = e
            else:
                if callback:
                    callback(uid)

            self._write_queue.task_done()

    def _reader(self, id_):
        """ A threaded background reader """
        bucket = None
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.last_exception:
                logger.debug("Reader {} finishing.".format(id_))
                break
            if bucket is None:
                bucket = self._get_bucket()
            t1 = time.time()
            try:
                self.reader_thread_status[id_] = STATUS_READING
                data = self.read_raw(block.uid, bucket)
                self.reader_thread_status[id_] = STATUS_NOTHING
            #except FileNotFoundError:
            except Exception as e:
                self.last_exception = e
            else:
                self._read_data_queue.put((block, data))
            t2 = time.time()
            self._read_queue.task_done()
            logger.debug(
                'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def read_raw(self, block_uid, _bucket=None):
        if not _bucket:
            _bucket = self.bucket
        while True:
            obj = _bucket.Object(block_uid)
            try:
                data_dict = obj.get()
                data = data_dict['Body'].read()
            except ClientError as e:
                if e.response['Error']['Code'] == 'NoSuchKey' or e.response['Error']['Code'] == '404':
                    raise FileNotFoundError('Key {} not found.'.format(block_uid)) from None
                else:
                    raise
            except socket.timeout:
                logger.error('Timeout while fetching from s3, trying again.')
            except OSError as e:
                # TODO: This is new and currently untested code. I'm not sure
                # why this happens instead of socket.timeout and also if it
                # might be better to abort the whole restore/backup/scrub if
                # this happens, because I can't tell if the s3 lib is able to
                # recover from this situation and continue or not. We will see
                # this in the logs next time s3 is generating timeouts.
                logger.error('Timeout while fetching from s3 - error is "{}", trying again.'.format(str(e)))
            else:
                break
        time.sleep(self.read_throttling.consume(len(data)))  # TODO: Need throttling in thread statistics!
        return data

    def rm(self, uid):
        obj = self.bucket.Object(uid)
        try:
            obj.load()
        except ClientError as e:
            if e.response['Error']['Code'] == 'NoSuchKey' or e.response['Error']['Code'] == '404':
                raise FileNotFoundError('Key {} not found.'.format(uid)) from None
            else:
                raise
        else:
            obj.delete()

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        for uid in uids:
            self.rm(uid)
        # TODO: maybe use delete_objects

    def get_all_blob_uids(self, prefix=None):
        if prefix is None:
            objects_iterable = self.bucket.objects.all()
        else:
            objects_iterable = self.bucket.objects.filter(Prefix=prefix)
        return [o.key for o in objects_iterable]
class DataBackend(_DataBackend):
    """ A DataBackend which stores in S3 compatible storages. The files are
    stored in a configurable bucket. """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    last_exception = None

    def __init__(self, config, encryption_key, encryption_version=None):
        super().__init__(config, encryption_key, encryption_version)

        self.aws_access_key_id = config.get('aws_access_key_id')
        if self.aws_access_key_id is None:
            aws_access_key_id_file = config.get('aws_access_key_id_file')
            with open(aws_access_key_id_file, 'r', encoding="ascii") as f:
                self.aws_access_key_id = f.read().rstrip()
        self.aws_secret_access_key = config.get('aws_secret_access_key')
        if self.aws_secret_access_key is None:
            aws_secret_access_key_file = config.get('aws_secret_access_key_file')
            with open(aws_secret_access_key_file, 'r', encoding="ascii") as f:
                self.aws_secret_access_key = f.read().rstrip()
        self.region_name = config.get('region_name', '')
        self.host = config.get('host', '')
        self.secure = config.getboolean('secure', False)
        self.bucket_name = config.get('bucket_name', '')

        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)
        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)

        self.client = self._get_client()  # for read_raw, rm, ...

        self._writer_threads = []
        self._reader_threads = []
        self.reader_thread_status = {}
        self.writer_thread_status = {}
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i,))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
            self.writer_thread_status[i] = STATUS_NOTHING
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i,))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)
            self.reader_thread_status[i] = STATUS_NOTHING

    def _get_client(self):
        client = Minio(self.host,
                       access_key=self.aws_access_key_id,
                       secret_key=self.aws_secret_access_key,
                       secure=self.secure)
        return client

    def _writer(self, id_):
        """ A threaded background writer """
        #bucket = None
        client = None
        while True:
            self.writer_thread_status[id_] = STATUS_QUEUE
            entry = self._write_queue.get()
            self.writer_thread_status[id_] = STATUS_NOTHING
            if entry is None or self.last_exception:
                logger.debug("Writer {} finishing.".format(id_))
                break
            if client is None:
                client = self._get_client()
            uid, enc_envkey, enc_version, enc_nonce, data, callback = entry

            self.writer_thread_status[id_] = STATUS_THROTTLING
            time.sleep(self.write_throttling.consume(len(data)))
            self.writer_thread_status[id_] = STATUS_NOTHING

            try:
                self.writer_thread_status[id_] = STATUS_WRITING
                client.put_object(self.bucket_name, uid, io.BytesIO(data), len(data))
                #client.upload_fileobj(io.BytesIO(data), Key=uid, Bucket=self._bucket_name)
                self.writer_thread_status[id_] = STATUS_NOTHING
                #if random.random() > 0.9:
                #    raise ValueError("This is a test")
            except Exception as e:
                self.last_exception = e
            else:
                if callback:
                    callback(uid, enc_envkey, enc_version, enc_nonce)

            self._write_queue.task_done()

    def _reader(self, id_):
        """ A threaded background reader """
        client = None
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.last_exception:
                logger.debug("Reader {} finishing.".format(id_))
                break
            if client is None:
                client = self._get_client()
            t1 = time.time()
            try:
                self.reader_thread_status[id_] = STATUS_READING
                data = self.read_raw(block, client)
                self.reader_thread_status[id_] = STATUS_NOTHING
            #except FileNotFoundError:
            except Exception as e:
                self.last_exception = e
            else:
                self._read_data_queue.put((block, data))
            t2 = time.time()
            self._read_queue.task_done()
            logger.debug(
                'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def read_raw(self, block, _client=None):
        if not _client:
            _client = self._get_client()
        data = _client.get_object(self.bucket_name, block.uid).read()
        time.sleep(self.read_throttling.consume(len(data)))  # TODO: Need throttling in thread statistics!
        return data

    def rm(self, uid):
        try:
            self.client.remove_object(self.bucket_name, uid)
        except ResponseError as e:
            raise

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        try:
            for del_err in self.client.remove_objects(self.bucket_name, uids):
                logger.error("S3 Object Deletion Error: {}".format(del_err))
        except ResponseError as err:
            raise

    def get_all_blob_uids(self, prefix=None):
        objects = self.client.list_objects(self.bucket_name, prefix)
        return [o.object_name for o in objects]
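# Illustrative sketch mirroring the Minio calls the class above uses (put_object
# with an explicit length, get_object().read() for the payload). The uid and data
# values are placeholders for this example only.
def _example_minio_roundtrip(backend, uid, data):
    client = backend._get_client()
    # Store the blob under `uid`, exactly as the writer threads do.
    client.put_object(backend.bucket_name, uid, io.BytesIO(data), len(data))
    # Fetch it back, as read_raw() does for a block.
    return client.get_object(backend.bucket_name, uid).read()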
class DataBackend(_DataBackend):
    """ A DataBackend which stores in files. The files are stored in directories
    starting with the bytes of the generated uid. The depth of this structure
    is configurable via the DEPTH parameter, which defaults to 2. """

    DEPTH = 2
    SPLIT = 2
    SUFFIX = '.blob'
    WRITE_QUEUE_LENGTH = 10
    READ_QUEUE_LENGTH = 20

    last_exception = None

    def __init__(self, config):
        self.path = config.get('path')
        simultaneous_writes = config.getint('simultaneous_writes')
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH

        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)
        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)
        self._writer_threads = []
        self._reader_threads = []
        self.reader_thread_status = {}
        self.writer_thread_status = {}
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i,))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
            self.writer_thread_status[i] = STATUS_NOTHING
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i,))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)
            self.reader_thread_status[i] = STATUS_NOTHING

    def _writer(self, id_=0):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None or self.last_exception:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, data, callback = entry
            # TODO: encrypt, compress data before throttling.
            path = os.path.join(self.path, self._path(uid))
            filename = self._filename(uid)

            self.writer_thread_status[id_] = STATUS_THROTTLING
            time.sleep(self.write_throttling.consume(len(data)))
            self.writer_thread_status[id_] = STATUS_NOTHING

            t1 = time.time()
            try:
                try:
                    self.writer_thread_status[id_] = STATUS_WRITING
                    with open(filename, 'wb') as f:
                        r = f.write(data)
                    self.writer_thread_status[id_] = STATUS_NOTHING
                except FileNotFoundError:
                    self.writer_thread_status[id_] = STATUS_WRITING
                    makedirs(path)
                    with open(filename, 'wb') as f:
                        r = f.write(data)
                    self.writer_thread_status[id_] = STATUS_NOTHING
                assert r == len(data)
            except Exception as e:
                self.last_exception = e
            else:
                t2 = time.time()
                if callback:
                    callback(uid)

            self._write_queue.task_done()
            #logger.debug('Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, uid, t2-t1, self._write_queue.qsize()))

    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            block = self._read_queue.get()
            if block is None:
                logger.debug("Reader {} finishing.".format(id_))
                break
            t1 = time.time()
            try:
                self.reader_thread_status[id_] = STATUS_READING
                data = self.read_raw(block.uid)
                self.reader_thread_status[id_] = STATUS_NOTHING
            #except FileNotFoundError:
            except Exception as e:
                self.last_exception = e
            else:
                self._read_data_queue.put((block, data))
            t2 = time.time()
            self._read_queue.task_done()
            logger.debug(
                'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def _path(self, uid):
        """ Returns a generated path (depth = self.DEPTH) from a uid.
        Example uid=831bde887afc11e5b45aa44e314f9270 and depth=2, then it
        returns "83/1b".
        If depth is larger than available bytes, then available bytes
        are returned only as path."""

        parts = [uid[i:i + self.SPLIT] for i in range(0, len(uid), self.SPLIT)]
        return os.path.join(*parts[:self.DEPTH])

    def _filename(self, uid):
        path = os.path.join(self.path, self._path(uid))
        return os.path.join(path, uid + self.SUFFIX)

    def update(self, uid, data, offset=0):
        with open(self._filename(uid), 'r+b') as f:
            f.seek(offset)
            return f.write(data)

    def rm(self, uid):
        filename = self._filename(uid)
        if not os.path.exists(filename):
            raise FileNotFoundError('File {} not found.'.format(filename))
        os.unlink(filename)

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        _no_del = []
        for uid in uids:
            try:
                self.rm(uid)
            except FileNotFoundError:
                _no_del.append(uid)
        return _no_del

    def read_raw(self, uid):
        filename = self._filename(uid)
        if not os.path.exists(filename):
            raise FileNotFoundError('File {} not found.'.format(filename))
        # TODO: Decrypt, uncompress data
        data = open(filename, 'rb').read()
        time.sleep(self.read_throttling.consume(len(data)))
        return data

    def get_all_blob_uids(self, prefix=None):
        if prefix:
            raise RuntimeError('prefix is not supported on file backends.')
        matches = []
        for root, dirnames, filenames in os.walk(self.path):
            for filename in fnmatch.filter(filenames, '*.blob'):
                uid = filename.split('.')[0]
                matches.append(uid)
        return matches
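# Illustrative sketch of the directory layout produced by _path()/_filename()
# with DEPTH=2 and SPLIT=2: a uid like '831bde887afc11e5b45aa44e314f9270' ends up
# as '<self.path>/83/1b/831bde887afc11e5b45aa44e314f9270.blob'. The uid below is
# simply the docstring's example value.
def _example_file_layout(backend):
    uid = '831bde887afc11e5b45aa44e314f9270'
    assert backend._path(uid) == os.path.join('83', '1b')
    return backend._filename(uid)  # <self.path>/83/1b/<uid>.blob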
class DataBackend(_DataBackend):
    """ A DataBackend which stores in AzureBlob compatible storages. The files
    are stored in a configurable bucket. """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    _SUPPORTS_PARTIAL_READS = False
    _SUPPORTS_PARTIAL_WRITES = False

    fatal_error = None

    def __init__(self, config):
        azure_access_key_id = config.get('azure_access_key_id')
        azure_secret_access_key = config.get('azure_secret_access_key')
        container_name = config.get('bucket_name', 'backy2')
        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)

        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)
        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

        self.container_name = container_name
        # print('Databackend: Azure blob')
        # print('azure_access_key_id: ', azure_access_key_id)
        # print('azure_secret_access_key: ', azure_secret_access_key)
        # print('container_name: ', container_name)

        self.conn = BlockBlobService(
            account_name=azure_access_key_id,
            account_key=azure_secret_access_key
        )

        # create our container
        try:
            self.conn.create_container(container_name)
        # except boto.exception.S3CreateError:
        #     # exists...
        #     pass
        except (OSError, Exception) as e:
            # no route to host
            self.fatal_error = e
            logger.error('Fatal error, dying: {}'.format(e))
            print('Fatal error: {}'.format(e))
            exit(10)

        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)
        self._writer_threads = []
        self._reader_threads = []
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i,))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i,))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)

    def _writer(self, id_):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None or self.fatal_error:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, data = entry
            time.sleep(self.write_throttling.consume(len(data)))
            t1 = time.time()
            try:
                # res = self.conn.create_blob_from_text(
                #     container_name=self.container_name,
                #     blob_name=uid,
                #     text=data,
                #     validate_content=True,
                #     encoding='ascii'
                # )
                string_data = data
                if not isinstance(string_data, bytes):
                    string_data = string_data.encode("utf-8")
                fp = BytesIO(string_data)
                res = self.conn.create_blob_from_bytes(
                    container_name=self.container_name,
                    blob_name=uid,
                    blob=fp.getvalue(),
                    validate_content=True,
                )
            except (OSError, Exception) as e:
                # We let the backup job die here fatally.
                self.fatal_error = e
                logger.error('Fatal error, dying: {}'.format(e))
                print('Error on Write File', e)
                #exit('Fatal error: {}'.format(e))  # this only raises SystemExit
                os._exit(11)
            t2 = time.time()
            self._write_queue.task_done()
            logger.debug(
                'Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'
                .format(id_, uid, t2 - t1, self._write_queue.qsize()))

    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.fatal_error:
                logger.debug("Reader {} finishing.".format(id_))
                break
            t1 = time.time()
            try:
                data = self.read_raw(block.uid)
            except FileNotFoundError:
                self._read_data_queue.put((block, None))  # catch this!
            else:
                self._read_data_queue.put((block, data))
            t2 = time.time()
            self._read_queue.task_done()
            logger.debug(
                'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def read_raw(self, block_uid):
        while True:
            try:
                data = self.conn.get_blob_to_bytes(
                    container_name=self.container_name,
                    blob_name=block_uid,
                    validate_content=True,
                )
                data = data.content
            except (OSError, Exception) as e:
                # TODO: Check which exact exception is thrown here so we can
                # tell real errors apart from transient timeouts.
                logger.error('Timeout while fetching from azure - error is "{}"'.format(str(e)))
            else:
                break
        time.sleep(self.read_throttling.consume(len(data)))
        return data

    def _uid(self):
        # 32 chars are allowed and we need to spread the first few chars so
        # that blobs are distributed nicely. And we want to avoid hash collisions.
        # So we create a real base57-encoded uuid (22 chars) and prefix it with
        # its own md5 hash[:10].
        suuid = shortuuid.uuid()
        hash = hashlib.md5(suuid.encode('ascii')).hexdigest()
        return hash[:10] + suuid

    def _remove_many(self, uids):
        resultErrors = []
        for uid in uids:
            try:
                self.rm(uid)
            except Exception as e:
                print('Remove Many Exception -> UID:', uid, ' Exception: ', e)
                resultErrors.append(uid)
        return resultErrors

    def save(self, data, _sync=False):
        if self.fatal_error:
            print('error fatal self')
            raise self.fatal_error
        uid = self._uid()
        self._write_queue.put((uid, data))
        if _sync:
            self._write_queue.join()
        return uid

    def rm(self, uid):
        try:
            self.conn.delete_blob(self.container_name, uid)
        except (OSError, Exception) as e:
            raise FileNotFoundError('UID {} not found.'.format(uid))

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        errors = self._remove_many(uids)
        if len(errors) > 0:
            return errors

    def read(self, block, sync=False):
        self._read_queue.put(block)
        if sync:
            rblock, offset, length, data = self.read_get()
            if rblock.id != block.id:
                raise RuntimeError('Do not mix threaded reading with sync reading!')
            if data is None:
                raise FileNotFoundError('UID {} not found.'.format(block.uid))
            return data

    def read_get(self):
        block, data = self._read_data_queue.get()
        offset = 0
        length = len(data)
        self._read_data_queue.task_done()
        return block, offset, length, data

    def read_queue_size(self):
        return self._read_queue.qsize()

    def get_all_blob_uids(self, prefix=None):
        return self.conn.list_blob_names(self.container_name, prefix)

    def close(self):
        for _writer_thread in self._writer_threads:
            self._write_queue.put(None)  # ends the thread
        for _writer_thread in self._writer_threads:
            _writer_thread.join()
        for _reader_thread in self._reader_threads:
            self._read_queue.put(None)  # ends the thread
        for _reader_thread in self._reader_threads:
            _reader_thread.join()
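# Illustrative sketch of the azure-storage calls the class above relies on
# (create_blob_from_bytes / get_blob_to_bytes with content validation). The uid
# and data values are placeholders for this example only.
def _example_azure_roundtrip(backend, uid, data):
    backend.conn.create_blob_from_bytes(
        container_name=backend.container_name,
        blob_name=uid,
        blob=data,
        validate_content=True,
    )
    blob = backend.conn.get_blob_to_bytes(
        container_name=backend.container_name,
        blob_name=uid,
        validate_content=True,
    )
    return blob.content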
class DataBackend(_DataBackend): """ A DataBackend which stores in files. The files are stored in directories starting with the bytes of the generated uid. The depth of this structure is configurable via the DEPTH parameter, which defaults to 2. """ DEPTH = 2 SPLIT = 2 SUFFIX = '.blob' WRITE_QUEUE_LENGTH = 10 READ_QUEUE_LENGTH = 20 _SUPPORTS_PARTIAL_READS = True _SUPPORTS_PARTIAL_WRITES = True def __init__(self, config): self.path = config.get('path') simultaneous_writes = config.getint('simultaneous_writes') simultaneous_reads = config.getint('simultaneous_reads', 1) self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH bandwidth_read = config.getint('bandwidth_read', 0) bandwidth_write = config.getint('bandwidth_write', 0) self.read_throttling = TokenBucket() self.read_throttling.set_rate(bandwidth_read) # 0 disables throttling self.write_throttling = TokenBucket() self.write_throttling.set_rate(bandwidth_write) # 0 disables throttling self._write_queue = queue.Queue(self.write_queue_length) self._read_queue = queue.Queue() self._read_data_queue = queue.Queue(self.read_queue_length) self._writer_threads = [] self._reader_threads = [] for i in range(simultaneous_writes): _writer_thread = threading.Thread(target=self._writer, args=(i,)) _writer_thread.daemon = True _writer_thread.start() self._writer_threads.append(_writer_thread) for i in range(simultaneous_reads): _reader_thread = threading.Thread(target=self._reader, args=(i,)) _reader_thread.daemon = True _reader_thread.start() self._reader_threads.append(_reader_thread) def _writer(self, id_=0): """ A threaded background writer """ while True: entry = self._write_queue.get() if entry is None: logger.debug("Writer {} finishing.".format(id_)) break uid, data = entry path = os.path.join(self.path, self._path(uid)) filename = self._filename(uid) time.sleep(self.write_throttling.consume(len(data))) t1 = time.time() try: with open(filename, 'wb') as f: r = f.write(data) except FileNotFoundError: makedirs(path) with open(filename, 'wb') as f: r = f.write(data) t2 = time.time() assert r == len(data) self._write_queue.task_done() logger.debug('Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, uid, t2-t1, self._write_queue.qsize())) def _reader(self, id_): """ A threaded background reader """ while True: d = self._read_queue.get() # contains block, offset, length if d is None: logger.debug("Reader {} finishing.".format(id_)) break block, offset, length = d t1 = time.time() try: data = self.read_raw(block.uid, offset, length) except FileNotFoundError: self._read_data_queue.put((block, offset, length, None)) # catch this! else: self._read_data_queue.put((block, offset, length, data)) t2 = time.time() self._read_queue.task_done() logger.debug('Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, block.uid, t2-t1, self._read_queue.qsize())) def _uid(self): # 32 chars are allowed and we need to spread the first few chars so # that blobs are distributed nicely. And want to avoid hash collisions. # So we create a real base57-encoded uuid (22 chars) and prefix it with # its own md5 hash[:10]. suuid = shortuuid.uuid() hash = hashlib.md5(suuid.encode('ascii')).hexdigest() return hash[:10] + suuid def _path(self, uid): """ Returns a generated path (depth = self.DEPTH) from a uid. Example uid=831bde887afc11e5b45aa44e314f9270 and depth=2, then it returns "83/1b". 
If depth is larger than available bytes, then available bytes are returned only as path.""" parts = [uid[i:i+self.SPLIT] for i in range(0, len(uid), self.SPLIT)] return os.path.join(*parts[:self.DEPTH]) def _filename(self, uid): path = os.path.join(self.path, self._path(uid)) return os.path.join(path, uid + self.SUFFIX) def save(self, data, _sync=False): uid = self._uid() self._write_queue.put((uid, data)) if _sync: self._write_queue.join() return uid def update(self, uid, data, offset=0): with open(self._filename(uid), 'r+b') as f: f.seek(offset) return f.write(data) def rm(self, uid): filename = self._filename(uid) if not os.path.exists(filename): raise FileNotFoundError('File {} not found.'.format(filename)) os.unlink(filename) def rm_many(self, uids): """ Deletes many uids from the data backend and returns a list of uids that couldn't be deleted. """ _no_del = [] for uid in uids: try: self.rm(uid) except FileNotFoundError: _no_del.append(uid) return _no_del def read(self, block, sync=False, offset=0, length=None): self._read_queue.put((block, offset, length)) if sync: rblock, offset, length, data = self.read_get() assert offset == offset assert length == length if rblock.id != block.id: raise RuntimeError('Do not mix threaded reading with sync reading!') if data is None: raise FileNotFoundError('UID {} not found.'.format(block.uid)) return data def read_get(self): block, offset, length, data = self._read_data_queue.get() self._read_data_queue.task_done() return block, offset, length, data def read_queue_size(self): return self._read_queue.qsize() def read_raw(self, uid, offset=0, length=None): filename = self._filename(uid) if not os.path.exists(filename): raise FileNotFoundError('File {} not found.'.format(filename)) if offset==0 and length is None: return open(filename, 'rb').read() else: with open(filename, 'rb') as f: if offset: f.seek(offset) if length: data = f.read(length) else: data = f.read() time.sleep(self.read_throttling.consume(len(data))) return data def get_all_blob_uids(self, prefix=None): if prefix: raise RuntimeError('prefix is not supported on file backends.') matches = [] for root, dirnames, filenames in os.walk(self.path): for filename in fnmatch.filter(filenames, '*.blob'): uid = filename.split('.')[0] matches.append(uid) return matches def close(self): for _writer_thread in self._writer_threads: self._write_queue.put(None) # ends the thread for _writer_thread in self._writer_threads: _writer_thread.join() for _reader_thread in self._reader_threads: self._read_queue.put(None) # ends the thread for _reader_thread in self._reader_threads: _reader_thread.join()
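# Illustrative sketch of the partial-read path advertised by
# _SUPPORTS_PARTIAL_READS above: read() accepts offset/length and read_raw()
# seeks before reading. The `block` object with .id/.uid attributes and the
# offsets are assumptions for this example.
def _example_partial_read(backend, block):
    # Synchronously read 4096 bytes starting at offset 8192 of the stored blob.
    data = backend.read(block, sync=True, offset=8192, length=4096)
    # Or go through the async queue and collect the result later.
    backend.read(block, offset=0, length=None)
    rblock, offset, length, data = backend.read_get()
    return data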
class DataBackend(_DataBackend):
    """ A DataBackend for performance testing. It reads and writes to NULL.
    DO NOT USE IN PRODUCTION.
    This essentially implements /dev/null
    """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    last_exception = None

    def __init__(self, config):
        self.default_block_size = int([
            value for key, value in config.items('DEFAULTS')
            if key == 'block_size'
        ][0])

        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)
        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)

        self._writer_threads = []
        self._reader_threads = []
        self.reader_thread_status = {}
        self.writer_thread_status = {}
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i,))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
            self.writer_thread_status[i] = STATUS_NOTHING
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i,))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)
            self.reader_thread_status[i] = STATUS_NOTHING

    def _writer(self, id_):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None or self.last_exception:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, data, callback = entry

            self.writer_thread_status[id_] = STATUS_THROTTLING
            time.sleep(self.write_throttling.consume(len(data)))
            self.writer_thread_status[id_] = STATUS_NOTHING

            t1 = time.time()
            try:
                # storing data to key uid
                self.writer_thread_status[id_] = STATUS_WRITING
                #time.sleep(.1)
                self.writer_thread_status[id_] = STATUS_NOTHING
            except Exception as e:
                self.last_exception = e
            else:
                t2 = time.time()
                # assert r == len(data)
                if callback:
                    callback(uid)

            self._write_queue.task_done()
            #logger.debug('Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, uid, t2-t1, self._write_queue.qsize()))
            #if random.random() > 0.9:
            #    raise ValueError("This is a test")

    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.last_exception:
                logger.debug("Reader {} finishing.".format(id_))
                break
            t1 = time.time()
            try:
                self.reader_thread_status[id_] = STATUS_READING
                data = self.read_raw(block.id, block.size)
                self.reader_thread_status[id_] = STATUS_THROTTLING
            except Exception as e:
                self.last_exception = e
            else:
                time.sleep(self.read_throttling.consume(len(data)))
                self.reader_thread_status[id_] = STATUS_NOTHING
                #time.sleep(.5)
                self._read_data_queue.put((block, data))
            t2 = time.time()
            self._read_queue.task_done()
            logger.debug(
                'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def read_raw(self, block_id, block_size):
        return generate_block(block_id, block_size)

    def rm(self, uid):
        # Don't delete anything
        pass

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        # Don't delete anything

    def get_all_blob_uids(self, prefix=None):
        return []
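# Illustrative sketch of using the null backend for write-throughput testing,
# which is its stated purpose. It feeds the write queue directly with the
# (uid, data, callback) tuples the writer threads expect; the config handling
# and the block_count/block_size values are assumptions for this example.
def _example_null_write_benchmark(config, block_count=1024, block_size=4 * 1024 * 1024):
    backend = DataBackend(config)
    t1 = time.time()
    for i in range(block_count):
        backend._write_queue.put(('%032x' % i, generate_block(i, block_size), None))
    backend._write_queue.join()  # wait for the writer threads to drain the queue
    seconds = time.time() - t1
    return block_count * block_size / seconds  # bytes per second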
class DataBackend(_DataBackend): """ A DataBackend which stores in files. The files are stored in directories starting with the bytes of the generated uid. The depth of this structure is configurable via the DEPTH parameter, which defaults to 2. """ DEPTH = 2 SPLIT = 2 SUFFIX = '.blob' WRITE_QUEUE_LENGTH = 10 READ_QUEUE_LENGTH = 20 _SUPPORTS_PARTIAL_READS = True _SUPPORTS_PARTIAL_WRITES = True def __init__(self, config): self.path = config.get('path') simultaneous_writes = config.getint('simultaneous_writes') simultaneous_reads = config.getint('simultaneous_reads', 1) self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH bandwidth_read = config.getint('bandwidth_read', 0) bandwidth_write = config.getint('bandwidth_write', 0) self.read_throttling = TokenBucket() self.read_throttling.set_rate(bandwidth_read) # 0 disables throttling self.write_throttling = TokenBucket() self.write_throttling.set_rate( bandwidth_write) # 0 disables throttling self._write_queue = queue.Queue(self.write_queue_length) self._read_queue = queue.Queue() self._read_data_queue = queue.Queue(self.read_queue_length) self._writer_threads = [] self._reader_threads = [] for i in range(simultaneous_writes): _writer_thread = threading.Thread(target=self._writer, args=(i, )) _writer_thread.daemon = True _writer_thread.start() self._writer_threads.append(_writer_thread) for i in range(simultaneous_reads): _reader_thread = threading.Thread(target=self._reader, args=(i, )) _reader_thread.daemon = True _reader_thread.start() self._reader_threads.append(_reader_thread) def _writer(self, id_=0): """ A threaded background writer """ while True: entry = self._write_queue.get() if entry is None: logger.debug("Writer {} finishing.".format(id_)) break uid, data = entry path = os.path.join(self.path, self._path(uid)) filename = self._filename(uid) time.sleep(self.write_throttling.consume(len(data))) t1 = time.time() try: with open(filename, 'wb') as f: r = f.write(data) except FileNotFoundError: makedirs(path) with open(filename, 'wb') as f: r = f.write(data) t2 = time.time() assert r == len(data) self._write_queue.task_done() logger.debug( 'Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})' .format(id_, uid, t2 - t1, self._write_queue.qsize())) def _reader(self, id_): """ A threaded background reader """ while True: d = self._read_queue.get() # contains block, offset, length if d is None: logger.debug("Reader {} finishing.".format(id_)) break block, offset, length = d t1 = time.time() try: data = self.read_raw(block.uid, offset, length) except FileNotFoundError: self._read_data_queue.put( (block, offset, length, None)) # catch this! else: self._read_data_queue.put((block, offset, length, data)) t2 = time.time() self._read_queue.task_done() logger.debug( 'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})' .format(id_, block.uid, t2 - t1, self._read_queue.qsize())) def _uid(self): # 32 chars are allowed and we need to spread the first few chars so # that blobs are distributed nicely. And want to avoid hash collisions. # So we create a real base57-encoded uuid (22 chars) and prefix it with # its own md5 hash[:10]. suuid = shortuuid.uuid() hash = hashlib.md5(suuid.encode('ascii')).hexdigest() return hash[:10] + suuid def _path(self, uid): """ Returns a generated path (depth = self.DEPTH) from a uid. Example uid=831bde887afc11e5b45aa44e314f9270 and depth=2, then it returns "83/1b". 
If depth is larger than available bytes, then available bytes are returned only as path.""" parts = [uid[i:i + self.SPLIT] for i in range(0, len(uid), self.SPLIT)] return os.path.join(*parts[:self.DEPTH]) def _filename(self, uid): path = os.path.join(self.path, self._path(uid)) return os.path.join(path, uid + self.SUFFIX) def save(self, data, _sync=False): uid = self._uid() self._write_queue.put((uid, data)) if _sync: self._write_queue.join() return uid def update(self, uid, data, offset=0): with open(self._filename(uid), 'r+b') as f: f.seek(offset) return f.write(data) def rm(self, uid): filename = self._filename(uid) if not os.path.exists(filename): raise FileNotFoundError('File {} not found.'.format(filename)) os.unlink(filename) def rm_many(self, uids): """ Deletes many uids from the data backend and returns a list of uids that couldn't be deleted. """ _no_del = [] for uid in uids: try: self.rm(uid) except FileNotFoundError: _no_del.append(uid) return _no_del def read(self, block, sync=False, offset=0, length=None): self._read_queue.put((block, offset, length)) if sync: rblock, offset, length, data = self.read_get() assert offset == offset assert length == length if rblock.id != block.id: raise RuntimeError( 'Do not mix threaded reading with sync reading!') if data is None: raise FileNotFoundError('UID {} not found.'.format(block.uid)) return data def read_get(self): block, offset, length, data = self._read_data_queue.get() self._read_data_queue.task_done() return block, offset, length, data def read_queue_size(self): return self._read_queue.qsize() def read_raw(self, uid, offset=0, length=None): filename = self._filename(uid) if not os.path.exists(filename): raise FileNotFoundError('File {} not found.'.format(filename)) if offset == 0 and length is None: return open(filename, 'rb').read() else: with open(filename, 'rb') as f: if offset: f.seek(offset) if length: data = f.read(length) else: data = f.read() time.sleep(self.read_throttling.consume(len(data))) return data def get_all_blob_uids(self, prefix=None): if prefix: raise RuntimeError('prefix is not supported on file backends.') matches = [] for root, dirnames, filenames in os.walk(self.path): for filename in fnmatch.filter(filenames, '*.blob'): uid = filename.split('.')[0] matches.append(uid) return matches def close(self): for _writer_thread in self._writer_threads: self._write_queue.put(None) # ends the thread for _writer_thread in self._writer_threads: _writer_thread.join() for _reader_thread in self._reader_threads: self._read_queue.put(None) # ends the thread for _reader_thread in self._reader_threads: _reader_thread.join()
class DataBackend(_DataBackend): """ A DataBackend for performance testing. It reads and writes to NULL. DO NOT USE IN PRODUCTION. This essentially implements /dev/null """ WRITE_QUEUE_LENGTH = 20 READ_QUEUE_LENGTH = 20 last_exception = None def __init__(self, config, encryption_key, encryption_version=None): super().__init__(config, encryption_key, encryption_version) self.default_block_size = int([ value for key, value in config.items('DEFAULTS') if key == 'block_size' ][0]) simultaneous_writes = config.getint('simultaneous_writes', 1) simultaneous_reads = config.getint('simultaneous_reads', 1) bandwidth_read = config.getint('bandwidth_read', 0) bandwidth_write = config.getint('bandwidth_write', 0) self.read_throttling = TokenBucket() self.read_throttling.set_rate(bandwidth_read) # 0 disables throttling self.write_throttling = TokenBucket() self.write_throttling.set_rate( bandwidth_write) # 0 disables throttling self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH self._write_queue = queue.Queue(self.write_queue_length) self._read_queue = queue.Queue() self._read_data_queue = queue.Queue(self.read_queue_length) self._writer_threads = [] self._reader_threads = [] self.reader_thread_status = {} self.writer_thread_status = {} for i in range(simultaneous_writes): _writer_thread = threading.Thread(target=self._writer, args=(i, )) _writer_thread.daemon = True _writer_thread.start() self._writer_threads.append(_writer_thread) self.writer_thread_status[i] = STATUS_NOTHING for i in range(simultaneous_reads): _reader_thread = threading.Thread(target=self._reader, args=(i, )) _reader_thread.daemon = True _reader_thread.start() self._reader_threads.append(_reader_thread) self.reader_thread_status[i] = STATUS_NOTHING def _writer(self, id_): """ A threaded background writer """ while True: entry = self._write_queue.get() if entry is None or self.last_exception: logger.debug("Writer {} finishing.".format(id_)) break uid, enc_envkey, enc_version, enc_nonce, data, callback = entry self.writer_thread_status[id_] = STATUS_THROTTLING time.sleep(self.write_throttling.consume(len(data))) self.writer_thread_status[id_] = STATUS_NOTHING t1 = time.time() try: # storing data to key uid self.writer_thread_status[id_] = STATUS_WRITING #time.sleep(.1) self.writer_thread_status[id_] = STATUS_NOTHING except Exception as e: self.last_exception = e else: t2 = time.time() # assert r == len(data) if callback: callback(uid, enc_envkey, enc_version, enc_nonce) self._write_queue.task_done() #logger.debug('Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, uid, t2-t1, self._write_queue.qsize())) #if random.random() > 0.9: # raise ValueError("This is a test") def _reader(self, id_): """ A threaded background reader """ while True: block = self._read_queue.get() # contains block if block is None or self.last_exception: logger.debug("Reader {} finishing.".format(id_)) break t1 = time.time() try: self.reader_thread_status[id_] = STATUS_READING data = self.read_raw(block) self.reader_thread_status[id_] = STATUS_THROTTLING except Exception as e: self.last_exception = e else: time.sleep(self.read_throttling.consume(len(data))) self.reader_thread_status[id_] = STATUS_NOTHING #time.sleep(.5) self._read_data_queue.put((block, data)) t2 = time.time() self._read_queue.task_done() logger.debug( 'Reader {} read data async. 
uid {} in {:.2f}s (Queue size is {})' .format(id_, block.uid, t2 - t1, self._read_queue.qsize())) def read_raw(self, block): # We have to fake encrypted blocks here or the restore won't work. # This is bad as performance is not really comparable but this is our # only chance to restore from null://. raw_data = generate_block(block.id, block.size) data, enc_envkey = self.cc_latest.encrypt( raw_data, self.cc_latest.unwrap_key(binascii.unhexlify(block.enc_envkey)), raw_data[:16], # use the first 16 bytes as nonce ) #data = generate_block(block_id, block_size) return data # Overwriting save here as we need weak encryption for the null data backend. def save(self, data, _sync=False, callback=None): """ Saves data, returns unique ID """ if self.last_exception: raise self.last_exception uid = self._uid() # Important: It's important to call this from the main thread because # zstandard IS NOT THREAD SAFE as stated at https://pypi.org/project/zstandard/: # """ Unless specified otherwise, assume that no two methods of # ZstdCompressor instances can be called from multiple Python threads # simultaneously. In other words, assume instances are not thread safe # unless stated otherwise.""" #blob, enc_envkey = self.cc_latest.encrypt(data) blob, enc_envkey, enc_nonce = self.cc_latest.encrypt( data, None, data[:16]) # use the first 16 bytes as nonce enc_version = self.cc_latest.VERSION self._write_queue.put( (uid, enc_envkey, enc_version, enc_nonce, blob, callback)) if _sync: self._write_queue.join() return uid def rm(self, uid): # Don't delete anything pass def rm_many(self, uids): """ Deletes many uids from the data backend and returns a list of uids that couldn't be deleted. """ # Don't delete anything def get_all_blob_uids(self, prefix=None): return []
class DataBackend(_DataBackend): """ A DataBackend which stores in S3 compatible storages. The files are stored in a configurable bucket. """ WRITE_QUEUE_LENGTH = 20 READ_QUEUE_LENGTH = 20 _SUPPORTS_PARTIAL_READS = False _SUPPORTS_PARTIAL_WRITES = False fatal_error = None def __init__(self, config): aws_access_key_id = config.get('aws_access_key_id') aws_secret_access_key = config.get('aws_secret_access_key') host = config.get('host') port = config.getint('port') is_secure = config.getboolean('is_secure') bucket_name = config.get('bucket_name', 'backy2') simultaneous_writes = config.getint('simultaneous_writes', 1) simultaneous_reads = config.getint('simultaneous_reads', 1) calling_format=boto.s3.connection.OrdinaryCallingFormat() bandwidth_read = config.getint('bandwidth_read', 0) bandwidth_write = config.getint('bandwidth_write', 0) self.read_throttling = TokenBucket() self.read_throttling.set_rate(bandwidth_read) # 0 disables throttling self.write_throttling = TokenBucket() self.write_throttling.set_rate(bandwidth_write) # 0 disables throttling self.conn = boto.connect_s3( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, host=host, port=port, is_secure=is_secure, calling_format=calling_format ) # create our bucket try: self.bucket = self.conn.create_bucket(bucket_name) except boto.exception.S3CreateError: # exists... pass except OSError as e: # no route to host self.fatal_error = e logger.error('Fatal error, dying: {}'.format(e)) print('Fatal error: {}'.format(e)) exit(10) self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH self._write_queue = queue.Queue(self.write_queue_length) self._read_queue = queue.Queue() self._read_data_queue = queue.Queue(self.read_queue_length) self._writer_threads = [] self._reader_threads = [] for i in range(simultaneous_writes): _writer_thread = threading.Thread(target=self._writer, args=(i,)) _writer_thread.daemon = True _writer_thread.start() self._writer_threads.append(_writer_thread) for i in range(simultaneous_reads): _reader_thread = threading.Thread(target=self._reader, args=(i,)) _reader_thread.daemon = True _reader_thread.start() self._reader_threads.append(_reader_thread) def _writer(self, id_): """ A threaded background writer """ while True: entry = self._write_queue.get() if entry is None or self.fatal_error: logger.debug("Writer {} finishing.".format(id_)) break uid, data = entry time.sleep(self.write_throttling.consume(len(data))) t1 = time.time() key = self.bucket.new_key(uid) try: r = key.set_contents_from_string(data) except ( OSError, boto.exception.BotoServerError, boto.exception.S3ResponseError, ) as e: # OSError happens when the S3 host is gone (i.e. network died, # host down, ...). boto tries hard to recover, however after # several attempts it will give up and raise. # BotoServerError happens, when there is no server. # S3ResponseError sometimes happens, when the cluster is about # to shutdown. Hard to reproduce because the writer must write # in exactly this moment. # We let the backup job die here fataly. self.fatal_error = e logger.error('Fatal error, dying: {}'.format(e)) #exit('Fatal error: {}'.format(e)) # this only raises SystemExit os._exit(11) t2 = time.time() assert r == len(data) self._write_queue.task_done() logger.debug('Writer {} wrote data async. 
uid {} in {:.2f}s (Queue size is {})'.format(id_, uid, t2-t1, self._write_queue.qsize())) def _reader(self, id_): """ A threaded background reader """ while True: block = self._read_queue.get() # contains block if block is None or self.fatal_error: logger.debug("Reader {} finishing.".format(id_)) break t1 = time.time() try: data = self.read_raw(block.uid) except FileNotFoundError: self._read_data_queue.put((block, None)) # catch this! else: self._read_data_queue.put((block, data)) t2 = time.time() self._read_queue.task_done() logger.debug('Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, block.uid, t2-t1, self._read_queue.qsize())) def read_raw(self, block_uid): key = self.bucket.get_key(block_uid) if not key: raise FileNotFoundError('UID {} not found.'.format(block_uid)) data = key.get_contents_as_string() time.sleep(self.read_throttling.consume(len(data))) return data def _uid(self): # 32 chars are allowed and we need to spread the first few chars so # that blobs are distributed nicely. And want to avoid hash collisions. # So we create a real base57-encoded uuid (22 chars) and prefix it with # its own md5 hash[:10]. suuid = shortuuid.uuid() hash = hashlib.md5(suuid.encode('ascii')).hexdigest() return hash[:10] + suuid def save(self, data, _sync=False): if self.fatal_error: raise self.fatal_error uid = self._uid() self._write_queue.put((uid, data)) if _sync: self._write_queue.join() return uid def rm(self, uid): key = self.bucket.get_key(uid) if not key: raise FileNotFoundError('UID {} not found.'.format(uid)) self.bucket.delete_key(uid) def rm_many(self, uids): """ Deletes many uids from the data backend and returns a list of uids that couldn't be deleted. """ errors = self.bucket.delete_keys(uids, quiet=True) if errors.errors: # unable to test this. ceph object gateway doesn't return errors. # raise FileNotFoundError('UIDS {} not found.'.format(errors.errors)) return errors.errors # TODO: which should be a list of uids. def read(self, block, sync=False): self._read_queue.put(block) if sync: rblock, data = self.read_get() if rblock.id != block.id: raise RuntimeError('Do not mix threaded reading with sync reading!') if data is None: raise FileNotFoundError('UID {} not found.'.format(block.uid)) return data def read_get(self): block, data = self._read_data_queue.get() offset = 0 length = len(data) self._read_data_queue.task_done() return block, offset, length, data def read_queue_size(self): return self._read_queue.qsize() def get_all_blob_uids(self, prefix=None): return [k.name for k in self.bucket.list(prefix)] def close(self): for _writer_thread in self._writer_threads: self._write_queue.put(None) # ends the thread for _writer_thread in self._writer_threads: _writer_thread.join() for _reader_thread in self._reader_threads: self._read_queue.put(None) # ends the thread for _reader_thread in self._reader_threads: _reader_thread.join() self.conn.close()