Example #1
class DataBackend(_DataBackend):
    """ A DataBackend which stores in S3 compatible storages. The files are
    stored in a configurable bucket. """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    _SUPPORTS_PARTIAL_READS = False
    _SUPPORTS_PARTIAL_WRITES = False
    fatal_error = None

    def __init__(self, config):
        aws_access_key_id = config.get('aws_access_key_id')
        aws_secret_access_key = config.get('aws_secret_access_key')
        host = config.get('host')
        port = config.getint('port')
        is_secure = config.getboolean('is_secure')
        bucket_name = config.get('bucket_name', 'backy2')
        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        calling_format = boto.s3.connection.OrdinaryCallingFormat()
        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(
            bandwidth_write)  # 0 disables throttling

        self.conn = boto.connect_s3(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            host=host,
            port=port,
            is_secure=is_secure,
            calling_format=calling_format)
        # create our bucket
        try:
            self.bucket = self.conn.create_bucket(bucket_name)
        except boto.exception.S3CreateError:
            # exists...
            pass
        except OSError as e:
            # no route to host
            self.fatal_error = e
            logger.error('Fatal error, dying: {}'.format(e))
            print('Fatal error: {}'.format(e))
            exit(10)

        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)
        self._writer_threads = []
        self._reader_threads = []
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i, ))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i, ))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)

    def _writer(self, id_):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None or self.fatal_error:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, data = entry
            time.sleep(self.write_throttling.consume(len(data)))
            t1 = time.time()
            key = self.bucket.new_key(uid)
            try:
                r = key.set_contents_from_string(data)
            except (
                    OSError,
                    boto.exception.BotoServerError,
                    boto.exception.S3ResponseError,
            ) as e:
                # OSError happens when the S3 host is gone (i.e. network died,
                # host down, ...). boto tries hard to recover; however, after
                # several attempts it will give up and raise.
                # BotoServerError happens when there is no server.
                # S3ResponseError sometimes happens when the cluster is about
                # to shut down. This is hard to reproduce because the writer
                # must write at exactly this moment.
                # We let the backup job die here fatally.
                self.fatal_error = e
                logger.error('Fatal error, dying: {}'.format(e))
                #exit('Fatal error: {}'.format(e))  # this only raises SystemExit
                os._exit(11)
            t2 = time.time()
            assert r == len(data)
            self._write_queue.task_done()
            logger.debug(
                'Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'
                .format(id_, uid, t2 - t1, self._write_queue.qsize()))

    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.fatal_error:
                logger.debug("Reader {} finishing.".format(id_))
                break
            t1 = time.time()
            try:
                data = self.read_raw(block.uid)
            except FileNotFoundError:
                self._read_data_queue.put((block, None))  # catch this!
            else:
                self._read_data_queue.put((block, data))
                t2 = time.time()
                self._read_queue.task_done()
                logger.debug(
                    'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                    .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def read_raw(self, block_uid):
        key = self.bucket.get_key(block_uid)
        if not key:
            raise FileNotFoundError('UID {} not found.'.format(block_uid))
        while True:
            try:
                data = key.get_contents_as_string()
            except socket.timeout:
                logger.error('Timeout while fetching from s3, trying again.')
                pass
            else:
                break
        time.sleep(self.read_throttling.consume(len(data)))
        return data

    def _uid(self):
        # 32 chars are allowed and we need to spread the first few chars so
        # that blobs are distributed nicely. We also want to avoid hash
        # collisions, so we create a real base57-encoded uuid (22 chars) and
        # prefix it with its own md5 hash[:10].
        suuid = shortuuid.uuid()
        hash = hashlib.md5(suuid.encode('ascii')).hexdigest()
        return hash[:10] + suuid

    def save(self, data, _sync=False):
        if self.fatal_error:
            raise self.fatal_error
        uid = self._uid()
        self._write_queue.put((uid, data))
        if _sync:
            self._write_queue.join()
        return uid

    def rm(self, uid):
        key = self.bucket.get_key(uid)
        if not key:
            raise FileNotFoundError('UID {} not found.'.format(uid))
        self.bucket.delete_key(uid)

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        errors = self.bucket.delete_keys(uids, quiet=True)
        if errors.errors:
            # unable to test this. ceph object gateway doesn't return errors.
            # raise FileNotFoundError('UIDS {} not found.'.format(errors.errors))
            return errors.errors  # TODO: which should be a list of uids.

    def read(self, block, sync=False):
        self._read_queue.put(block)
        if sync:
            rblock, offset, length, data = self.read_get()
            if rblock.id != block.id:
                raise RuntimeError(
                    'Do not mix threaded reading with sync reading!')
            if data is None:
                raise FileNotFoundError('UID {} not found.'.format(block.uid))
            return data

    def read_get(self):
        block, data = self._read_data_queue.get()
        offset = 0
        length = len(data) if data is not None else 0
        self._read_data_queue.task_done()
        return block, offset, length, data

    def read_queue_size(self):
        return self._read_queue.qsize()

    def get_all_blob_uids(self, prefix=None):
        return [k.name for k in self.bucket.list(prefix)]

    def close(self):
        for _writer_thread in self._writer_threads:
            self._write_queue.put(None)  # ends the thread
        for _writer_thread in self._writer_threads:
            _writer_thread.join()
        for _reader_thread in self._reader_threads:
            self._read_queue.put(None)  # ends the thread
        for _reader_thread in self._reader_threads:
            _reader_thread.join()
        self.conn.close()
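
Every backend in these examples throttles bandwidth through a TokenBucket whose consume() call returns the number of seconds the caller should sleep, with a rate of 0 meaning throttling is disabled. The snippet below is a minimal sketch of that interface written for illustration only; it is an assumption about the shape of the class, not the project's actual implementation.

import time


class TokenBucket:
    """Illustrative token bucket: consume(n) returns seconds to sleep."""

    def __init__(self):
        self.rate = 0          # bytes per second; 0 means throttling disabled
        self.tokens = 0.0
        self.last = time.time()

    def set_rate(self, rate):
        self.rate = rate
        self.tokens = float(rate)

    def consume(self, n):
        if self.rate == 0:     # throttling disabled
            return 0.0
        now = time.time()
        # refill with elapsed time, never holding more than one second's worth
        self.tokens = min(float(self.rate),
                          self.tokens + (now - self.last) * self.rate)
        self.last = now
        self.tokens -= n
        if self.tokens >= 0:
            return 0.0
        return -self.tokens / self.rate
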
Example #2
class DataBackend(_DataBackend):
    """ A DataBackend which stores in S3 compatible storages. The files are
    stored in a configurable bucket. """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    last_exception = None

    def __init__(self, config):
        aws_access_key_id = config.get('aws_access_key_id')
        if aws_access_key_id is None:
            aws_access_key_id_file = config.get('aws_access_key_id_file')
            with open(aws_access_key_id_file, 'r', encoding="ascii") as f:
                aws_access_key_id = f.read().rstrip()

        aws_secret_access_key = config.get('aws_secret_access_key')
        if aws_secret_access_key is None:
            aws_secret_access_key_file = config.get('aws_secret_access_key_file')
            with open(aws_secret_access_key_file, 'r', encoding="ascii") as f:
                aws_secret_access_key = f.read().rstrip()

        region_name = config.get('region_name', '')
        endpoint_url = config.get('endpoint_url', '')
        use_ssl = config.get('use_ssl', '')
        self._bucket_name = config.get('bucket_name', '')
        addressing_style = config.get('addressing_style', '')
        signature_version = config.get('signature_version', '')
        self._disable_encoding_type = config.get('disable_encoding_type', '')

        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling


        self._resource_config = {
            'aws_access_key_id': aws_access_key_id,
            'aws_secret_access_key': aws_secret_access_key,
        }

        if region_name:
            self._resource_config['region_name'] = region_name

        if endpoint_url:
            self._resource_config['endpoint_url'] = endpoint_url

        if use_ssl:
            self._resource_config['use_ssl'] = use_ssl

        resource_config = {}
        if addressing_style:
            resource_config['s3'] = {'addressing_style': addressing_style}

        if signature_version:
            resource_config['signature_version'] = signature_version

        # TODO
        #resource_config['max_pool_connections'] = 100
        #resource_config['parameter_validation'] = False
        #resource_config['use_accelerate_endpoint'] = True

        self._resource_config['config'] = BotoCoreClientConfig(**resource_config)


        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)

        self.bucket = self._get_bucket()  # for read_raw

        self._writer_threads = []
        self._reader_threads = []
        self.reader_thread_status = {}
        self.writer_thread_status = {}
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i,))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
            self.writer_thread_status[i] = STATUS_NOTHING
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i,))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)
            self.reader_thread_status[i] = STATUS_NOTHING


    def _get_bucket(self):
        session = boto3.session.Session()
        if self._disable_encoding_type:
            session.events.unregister('before-parameter-build.s3.ListObjects', set_list_objects_encoding_type_url)
        resource = session.resource('s3', **self._resource_config)
        bucket = resource.Bucket(self._bucket_name)
        return bucket


    def _get_client(self):
        session = boto3.session.Session()
        if self._disable_encoding_type:
            session.events.unregister('before-parameter-build.s3.ListObjects', set_list_objects_encoding_type_url)
        client = session.client('s3', **self._resource_config)
        return client


    def _writer(self, id_):
        """ A threaded background writer """
        #bucket = None
        client = None
        while True:
            self.writer_thread_status[id_] = STATUS_QUEUE
            entry = self._write_queue.get()
            self.writer_thread_status[id_] = STATUS_NOTHING
            if entry is None or self.last_exception:
                logger.debug("Writer {} finishing.".format(id_))
                break
            if client is None:
                client = self._get_client()
            uid, data, callback = entry

            self.writer_thread_status[id_] = STATUS_THROTTLING
            time.sleep(self.write_throttling.consume(len(data)))
            self.writer_thread_status[id_] = STATUS_NOTHING

            try:
                self.writer_thread_status[id_] = STATUS_WRITING
                client.put_object(Body=data, Key=uid, Bucket=self._bucket_name)
                #client.upload_fileobj(io.BytesIO(data), Key=uid, Bucket=self._bucket_name)
                self.writer_thread_status[id_] = STATUS_NOTHING
                #if random.random() > 0.9:
                #    raise ValueError("This is a test")
            except Exception as e:
                self.last_exception = e
            else:
                if callback:
                    callback(uid)
                self._write_queue.task_done()


    def _reader(self, id_):
        """ A threaded background reader """
        bucket = None
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.last_exception:
                logger.debug("Reader {} finishing.".format(id_))
                break
            if bucket is None:
                bucket = self._get_bucket()
            t1 = time.time()
            try:
                self.reader_thread_status[id_] = STATUS_READING
                data = self.read_raw(block.uid, bucket)
                self.reader_thread_status[id_] = STATUS_NOTHING
                #except FileNotFoundError:
            except Exception as e:
                self.last_exception = e
            else:
                self._read_data_queue.put((block, data))
                t2 = time.time()
                self._read_queue.task_done()
                logger.debug('Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, block.uid, t2-t1, self._read_queue.qsize()))


    def read_raw(self, block_uid, _bucket=None):
        if not _bucket:
            _bucket = self.bucket

        while True:
            obj = _bucket.Object(block_uid)
            try:
                data_dict = obj.get()
                data = data_dict['Body'].read()
            except ClientError as e:
                if e.response['Error']['Code'] == 'NoSuchKey' or e.response['Error']['Code'] == '404':
                    raise FileNotFoundError('Key {} not found.'.format(block_uid)) from None
                else:
                    raise
            except socket.timeout:
                logger.error('Timeout while fetching from s3, trying again.')
                pass
            except OSError as e:
                # TODO: This is new and currently untested code. I'm not sure
                # why this happens instead of socket.timeout, or whether it
                # might be better to abort the whole restore/backup/scrub when
                # it does, because I can't tell if the s3 lib is able to
                # recover from this situation and continue or not. We will see
                # this in the logs next time s3 is generating timeouts.
                logger.error('Timeout while fetching from s3 - error is "{}", trying again.'.format(str(e)))
                pass
            else:
                break
        time.sleep(self.read_throttling.consume(len(data)))  # TODO: Need throttling in thread statistics!
        return data


    def rm(self, uid):
        obj = self.bucket.Object(uid)
        try:
            obj.load()
        except ClientError as e:
            if e.response['Error']['Code'] == 'NoSuchKey' or e.response['Error']['Code'] == '404':
                raise FileNotFoundError('Key {} not found.'.format(uid)) from None
            else:
                raise
        else:
            obj.delete()


    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        for uid in uids:
            self.rm(uid)
        # TODO: maybe use delete_objects


    def get_all_blob_uids(self, prefix=None):
        if prefix is None:
            objects_iterable = self.bucket.objects.all()
        else:
            objects_iterable = self.bucket.objects.filter(Prefix=prefix)

        return [o.key for o in objects_iterable]
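
Example #2 reads all of its settings through config.get()/config.getint(), which suggests a configparser-style section object. The sketch below shows one hedged way such a backend could be fed from an INI snippet; the section name 's3', the _SectionConfig adapter, and every value are assumptions made purely for illustration.

import configparser


class _SectionConfig:
    """Tiny adapter exposing get()/getint() with positional defaults."""

    def __init__(self, section):
        self._section = section

    def get(self, key, default=None):
        return self._section.get(key, default)

    def getint(self, key, default=0):
        return int(self._section.get(key, default))


parser = configparser.ConfigParser()
parser.read_string("""
[s3]
aws_access_key_id = EXAMPLEKEY
aws_secret_access_key = EXAMPLESECRET
endpoint_url = https://s3.example.com:9000
bucket_name = backy2
simultaneous_writes = 5
simultaneous_reads = 5
bandwidth_read = 0
bandwidth_write = 0
""")

config = _SectionConfig(parser['s3'])
print(config.get('bucket_name', ''), config.getint('simultaneous_writes', 1))
# backend = DataBackend(config)  # would start the reader/writer threads
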
Example #3
class DataBackend(_DataBackend):
    """ A DataBackend which stores in S3 compatible storages. The files are
    stored in a configurable bucket. """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    last_exception = None

    def __init__(self, config, encryption_key, encryption_version=None):
        super().__init__(config, encryption_key, encryption_version)
        self.aws_access_key_id = config.get('aws_access_key_id')
        if self.aws_access_key_id is None:
            aws_access_key_id_file = config.get('aws_access_key_id_file')
            with open(aws_access_key_id_file, 'r', encoding="ascii") as f:
                self.aws_access_key_id = f.read().rstrip()

        self.aws_secret_access_key = config.get('aws_secret_access_key')
        if self.aws_secret_access_key is None:
            aws_secret_access_key_file = config.get(
                'aws_secret_access_key_file')
            with open(aws_secret_access_key_file, 'r', encoding="ascii") as f:
                self.aws_secret_access_key = f.read().rstrip()

        self.region_name = config.get('region_name', '')
        self.host = config.get('host', '')
        self.secure = config.getboolean('secure', False)
        self.bucket_name = config.get('bucket_name', '')

        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(
            bandwidth_write)  # 0 disables throttling

        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)

        self.client = self._get_client()  # for read_raw, rm, ...

        self._writer_threads = []
        self._reader_threads = []
        self.reader_thread_status = {}
        self.writer_thread_status = {}
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i, ))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
            self.writer_thread_status[i] = STATUS_NOTHING
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i, ))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)
            self.reader_thread_status[i] = STATUS_NOTHING

    def _get_client(self):
        client = Minio(self.host,
                       access_key=self.aws_access_key_id,
                       secret_key=self.aws_secret_access_key,
                       secure=self.secure)
        return client

    def _writer(self, id_):
        """ A threaded background writer """
        #bucket = None
        client = None
        while True:
            self.writer_thread_status[id_] = STATUS_QUEUE
            entry = self._write_queue.get()
            self.writer_thread_status[id_] = STATUS_NOTHING
            if entry is None or self.last_exception:
                logger.debug("Writer {} finishing.".format(id_))
                break
            if client is None:
                client = self._get_client()
            uid, enc_envkey, enc_version, enc_nonce, data, callback = entry

            self.writer_thread_status[id_] = STATUS_THROTTLING
            time.sleep(self.write_throttling.consume(len(data)))
            self.writer_thread_status[id_] = STATUS_NOTHING

            try:
                self.writer_thread_status[id_] = STATUS_WRITING
                client.put_object(self.bucket_name, uid, io.BytesIO(data),
                                  len(data))
                #client.upload_fileobj(io.BytesIO(data), Key=uid, Bucket=self._bucket_name)
                self.writer_thread_status[id_] = STATUS_NOTHING
                #if random.random() > 0.9:
                #    raise ValueError("This is a test")
            except Exception as e:
                self.last_exception = e
            else:
                if callback:
                    callback(uid, enc_envkey, enc_version, enc_nonce)
                self._write_queue.task_done()

    def _reader(self, id_):
        """ A threaded background reader """
        client = None
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.last_exception:
                logger.debug("Reader {} finishing.".format(id_))
                break
            if client is None:
                client = self._get_client()
            t1 = time.time()
            try:
                self.reader_thread_status[id_] = STATUS_READING
                data = self.read_raw(block, client)
                self.reader_thread_status[id_] = STATUS_NOTHING
                #except FileNotFoundError:
            except Exception as e:
                self.last_exception = e
            else:
                self._read_data_queue.put((block, data))
                t2 = time.time()
                self._read_queue.task_done()
                logger.debug(
                    'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                    .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def read_raw(self, block, _client=None):
        if not _client:
            _client = self._get_client()
        data = _client.get_object(self.bucket_name, block.uid).read()
        time.sleep(self.read_throttling.consume(
            len(data)))  # TODO: Need throttling in thread statistics!
        return data

    def rm(self, uid):
        try:
            self.client.remove_object(self.bucket_name, uid)
        except ResponseError as e:
            raise

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        try:
            for del_err in self.client.remove_objects(self.bucket_name, uids):
                logger.error("S3 Object Deletion Error: {}".format(del_err))
        except ResponseError as err:
            raise

    def get_all_blob_uids(self, prefix=None):
        objects = self.client.list_objects(self.bucket_name, prefix)
        return [o.object_name for o in objects]
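
Example #3 delegates the actual storage calls to the minio Python SDK. The following is a minimal, self-contained sketch of the same put_object/get_object round trip that the writer and reader threads perform, assuming the bucket already exists and the endpoint is reachable; the endpoint, credentials, bucket, and object name are placeholders.

import io

from minio import Minio

client = Minio('s3.example.com:9000',
               access_key='EXAMPLEKEY',
               secret_key='EXAMPLESECRET',
               secure=True)

payload = b'some block data'
client.put_object('backy2', 'block-uid', io.BytesIO(payload), len(payload))

response = client.get_object('backy2', 'block-uid')
try:
    data = response.read()
finally:
    response.close()
    response.release_conn()

assert data == payload
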
Example #4
class DataBackend(_DataBackend):
    """ A DataBackend which stores in files. The files are stored in directories
    starting with the bytes of the generated uid. The depth of this structure
    is configurable via the DEPTH parameter, which defaults to 2. """

    DEPTH = 2
    SPLIT = 2
    SUFFIX = '.blob'
    WRITE_QUEUE_LENGTH = 10
    READ_QUEUE_LENGTH = 20

    last_exception = None

    def __init__(self, config):
        self.path = config.get('path')
        simultaneous_writes = config.getint('simultaneous_writes')
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH

        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(
            bandwidth_write)  # 0 disables throttling

        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)
        self._writer_threads = []
        self._reader_threads = []
        self.reader_thread_status = {}
        self.writer_thread_status = {}
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i, ))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
            self.writer_thread_status[i] = STATUS_NOTHING
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i, ))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)
            self.reader_thread_status[i] = STATUS_NOTHING

    def _writer(self, id_=0):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None or self.last_exception:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, data, callback = entry

            # TODO: encrypt, compress data before throttling.

            path = os.path.join(self.path, self._path(uid))
            filename = self._filename(uid)
            self.writer_thread_status[id_] = STATUS_THROTTLING
            time.sleep(self.write_throttling.consume(len(data)))
            self.writer_thread_status[id_] = STATUS_NOTHING
            t1 = time.time()
            try:
                try:
                    self.writer_thread_status[id_] = STATUS_WRITING
                    with open(filename, 'wb') as f:
                        r = f.write(data)
                    self.writer_thread_status[id_] = STATUS_NOTHING
                except FileNotFoundError:
                    self.writer_thread_status[id_] = STATUS_WRITING
                    makedirs(path)
                    with open(filename, 'wb') as f:
                        r = f.write(data)
                    self.writer_thread_status[id_] = STATUS_NOTHING
                assert r == len(data)
            except Exception as e:
                self.last_exception = e
            else:
                t2 = time.time()
                if callback:
                    callback(uid)
                self._write_queue.task_done()
                #logger.debug('Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, uid, t2-t1, self._write_queue.qsize()))

    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            block = self._read_queue.get()
            if block is None:
                logger.debug("Reader {} finishing.".format(id_))
                break
            t1 = time.time()
            try:
                self.reader_thread_status[id_] = STATUS_READING
                data = self.read_raw(block.uid)
                self.reader_thread_status[id_] = STATUS_NOTHING
                #except FileNotFoundError:
            except Exception as e:
                self.last_exception = e
            else:
                self._read_data_queue.put((block, data))
                t2 = time.time()
                self._read_queue.task_done()
                logger.debug(
                    'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                    .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def _path(self, uid):
        """ Returns a generated path (depth = self.DEPTH) from a uid.
        Example uid=831bde887afc11e5b45aa44e314f9270 and depth=2, then
        it returns "83/1b".
        If depth is larger than available bytes, then available bytes
        are returned only as path."""

        parts = [uid[i:i + self.SPLIT] for i in range(0, len(uid), self.SPLIT)]
        return os.path.join(*parts[:self.DEPTH])

    def _filename(self, uid):
        path = os.path.join(self.path, self._path(uid))
        return os.path.join(path, uid + self.SUFFIX)

    def update(self, uid, data, offset=0):
        with open(self._filename(uid), 'r+b') as f:
            f.seek(offset)
            return f.write(data)

    def rm(self, uid):
        filename = self._filename(uid)
        if not os.path.exists(filename):
            raise FileNotFoundError('File {} not found.'.format(filename))
        os.unlink(filename)

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        _no_del = []
        for uid in uids:
            try:
                self.rm(uid)
            except FileNotFoundError:
                _no_del.append(uid)
        return _no_del

    def read_raw(self, uid):
        filename = self._filename(uid)
        if not os.path.exists(filename):
            raise FileNotFoundError('File {} not found.'.format(filename))
        # TODO: Decrypt, uncompress data
        with open(filename, 'rb') as f:
            data = f.read()
        time.sleep(self.read_throttling.consume(len(data)))
        return data

    def get_all_blob_uids(self, prefix=None):
        if prefix:
            raise RuntimeError('prefix is not supported on file backends.')
        matches = []
        for root, dirnames, filenames in os.walk(self.path):
            for filename in fnmatch.filter(filenames, '*.blob'):
                uid = filename.split('.')[0]
                matches.append(uid)
        return matches
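
The file backend maps a uid to a nested directory path via _path() and _filename() (DEPTH=2, SPLIT=2, SUFFIX='.blob'). The short sketch below reproduces that mapping stand-alone, using the uid from the docstring's own example; the base directory is a placeholder.

import os

DEPTH = 2
SPLIT = 2
SUFFIX = '.blob'


def blob_filename(base_path, uid):
    parts = [uid[i:i + SPLIT] for i in range(0, len(uid), SPLIT)]
    return os.path.join(base_path, *parts[:DEPTH], uid + SUFFIX)


print(blob_filename('/var/lib/backy2', '831bde887afc11e5b45aa44e314f9270'))
# -> /var/lib/backy2/83/1b/831bde887afc11e5b45aa44e314f9270.blob
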
Example #5
class DataBackend(_DataBackend):
    """ A DataBackend which stores in AzureBlob compatible storages. The files are
    stored in a configurable bucket. """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    _SUPPORTS_PARTIAL_READS = False
    _SUPPORTS_PARTIAL_WRITES = False
    fatal_error = None

    def __init__(self, config):

        azure_access_key_id = config.get('azure_access_key_id')
        azure_secret_access_key = config.get('azure_secret_access_key')
        container_name = config.get('bucket_name', 'backy2')
        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling
        self.container_name = container_name

        # print('Databackend: Azure blob')
        # print('azure_access_key_id: ', azure_access_key_id)
        # print('azure_secret_access_key: ', azure_secret_access_key)
        # print('container_name: ', container_name)

        self.conn = BlockBlobService(
            account_name=azure_access_key_id,
            account_key=azure_secret_access_key
        )

        # create our bucket
        try:
            self.conn.create_container(container_name)
        # except boto.exception.S3CreateError:
        #     # exists...
        #     pass
        except (OSError, Exception) as e:
            # no route to host
            self.fatal_error = e
            logger.error('Fatal error, dying: {}'.format(e))
            print('Fatal error: {}'.format(e))
            exit(10)

        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)
        self._writer_threads = []
        self._reader_threads = []
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i,))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i,))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)


    def _writer(self, id_):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None or self.fatal_error:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, data = entry
            time.sleep(self.write_throttling.consume(len(data)))
            t1 = time.time()

            try:
                # res = self.conn.create_blob_from_text(
                #     container_name=self.container_name,
                #     blob_name=uid,
                #     text=data,
                #     validate_content=True,
                #     encoding='ascii'
                # )
                string_data = data
                if not isinstance(string_data, bytes):
                    string_data = string_data.encode("utf-8")
                fp = BytesIO(string_data)
                res = self.conn.create_blob_from_bytes(
                    container_name=self.container_name,
                    blob_name=uid,
                    blob=fp.getvalue(),
                    validate_content=True,
                )
            except (OSError, Exception) as e:
                # We let the backup job die here fatally.
                self.fatal_error = e
                logger.error('Fatal error, dying: {}'.format(e))
                print('Error on Write File', e)
                #exit('Fatal error: {}'.format(e))  # this only raises SystemExit
                os._exit(11)
            t2 = time.time()
            self._write_queue.task_done()
            logger.debug('Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, uid, t2-t1, self._write_queue.qsize()))


    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.fatal_error:
                logger.debug("Reader {} finishing.".format(id_))
                break
            t1 = time.time()
            try:
                data = self.read_raw(block.uid)
            except FileNotFoundError:
                self._read_data_queue.put((block, None))  # catch this!
            else:
                self._read_data_queue.put((block, data))
                t2 = time.time()
                self._read_queue.task_done()
                logger.debug('Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, block.uid, t2-t1, self._read_queue.qsize()))


    def read_raw(self, block_uid):
        while True:
            try:
                data = self.conn.get_blob_to_bytes(
                    container_name=self.container_name,
                    blob_name=block_uid,
                    validate_content=True,
                )
                data = data.content
            except (OSError, Exception) as e:
                # TODO: Check which exact exception is thrown here to decide whether this is an error
                logger.error('Timeout while fetching from azure - error is "{}"'.format(str(e)))
                pass
            else:
                break
        time.sleep(self.read_throttling.consume(len(data)))
        return data

    def _uid(self):
        # 32 chars are allowed and we need to spread the first few chars so
        # that blobs are distributed nicely. We also want to avoid hash
        # collisions, so we create a real base57-encoded uuid (22 chars) and
        # prefix it with its own md5 hash[:10].
        suuid = shortuuid.uuid()
        hash = hashlib.md5(suuid.encode('ascii')).hexdigest()
        return hash[:10] + suuid

    def _remove_many(self, uids):
        resultErrors = []
        for uid in uids:
            try:
                self.rm(uid)
            except Exception as e:
                print('Remove Many Exception -> UID:', uid, ' Exception: ', e)
                resultErrors.append(uid)
        return resultErrors


    def save(self, data, _sync=False):
        if self.fatal_error:
            print('error fatal self')
            raise self.fatal_error
        uid = self._uid()
        self._write_queue.put((uid, data))
        if _sync:
            self._write_queue.join()
        return uid


    def rm(self, uid):
        try:
            self.conn.delete_blob(self.container_name, uid)
        except (OSError, Exception) as e:
            raise FileNotFoundError('UID {} not found.'.format(uid))

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        errors = self._remove_many(uids)
        if len(errors) > 0:
            return errors


    def read(self, block, sync=False):
        self._read_queue.put(block)
        if sync:
            rblock, offset, length, data = self.read_get()
            if rblock.id != block.id:
                raise RuntimeError('Do not mix threaded reading with sync reading!')
            if data is None:
                raise FileNotFoundError('UID {} not found.'.format(block.uid))
            return data


    def read_get(self):
        block, data = self._read_data_queue.get()
        offset = 0
        length = len(data) if data is not None else 0
        self._read_data_queue.task_done()
        return block, offset, length, data


    def read_queue_size(self):
        return self._read_queue.qsize()


    def get_all_blob_uids(self, prefix=None):
        return self.conn.list_blob_names(self.container_name, prefix)


    def close(self):
        for _writer_thread in self._writer_threads:
            self._write_queue.put(None)  # ends the thread
        for _writer_thread in self._writer_threads:
            _writer_thread.join()
        for _reader_thread in self._reader_threads:
            self._read_queue.put(None)  # ends the thread
        for _reader_thread in self._reader_threads:
            _reader_thread.join()
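
Example #5 builds on the legacy azure-storage-blob SDK (the 2.x BlockBlobService API). The sketch below shows the same create/upload/download calls the class wraps, in isolation, assuming that legacy SDK is installed; account name, key, and container are placeholders.

from azure.storage.blob import BlockBlobService  # azure-storage-blob 2.x

service = BlockBlobService(account_name='exampleaccount',
                           account_key='examplekey')
service.create_container('backy2')

service.create_blob_from_bytes('backy2', 'block-uid', b'some block data',
                               validate_content=True)
blob = service.get_blob_to_bytes('backy2', 'block-uid', validate_content=True)
assert blob.content == b'some block data'
service.delete_blob('backy2', 'block-uid')
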
Example #6
class DataBackend(_DataBackend):
    """ A DataBackend which stores in files. The files are stored in directories
    starting with the bytes of the generated uid. The depth of this structure
    is configurable via the DEPTH parameter, which defaults to 2. """

    DEPTH = 2
    SPLIT = 2
    SUFFIX = '.blob'
    WRITE_QUEUE_LENGTH = 10
    READ_QUEUE_LENGTH = 20

    _SUPPORTS_PARTIAL_READS = True
    _SUPPORTS_PARTIAL_WRITES = True


    def __init__(self, config):
        self.path = config.get('path')
        simultaneous_writes = config.getint('simultaneous_writes')
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH

        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)
        self._writer_threads = []
        self._reader_threads = []
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i,))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i,))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)


    def _writer(self, id_=0):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, data = entry
            path = os.path.join(self.path, self._path(uid))
            filename = self._filename(uid)
            time.sleep(self.write_throttling.consume(len(data)))
            t1 = time.time()
            try:
                with open(filename, 'wb') as f:
                    r = f.write(data)
            except FileNotFoundError:
                makedirs(path)
                with open(filename, 'wb') as f:
                    r = f.write(data)
            t2 = time.time()
            assert r == len(data)
            self._write_queue.task_done()
            logger.debug('Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, uid, t2-t1, self._write_queue.qsize()))


    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            d = self._read_queue.get()  # contains block, offset, length
            if d is None:
                logger.debug("Reader {} finishing.".format(id_))
                break
            block, offset, length = d
            t1 = time.time()
            try:
                data = self.read_raw(block.uid, offset, length)
            except FileNotFoundError:
                self._read_data_queue.put((block, offset, length, None))  # catch this!
            else:
                self._read_data_queue.put((block, offset, length, data))
                t2 = time.time()
                self._read_queue.task_done()
                logger.debug('Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, block.uid, t2-t1, self._read_queue.qsize()))


    def _uid(self):
        # 32 chars are allowed and we need to spread the first few chars so
        # that blobs are distributed nicely. We also want to avoid hash
        # collisions, so we create a real base57-encoded uuid (22 chars) and
        # prefix it with its own md5 hash[:10].
        suuid = shortuuid.uuid()
        hash = hashlib.md5(suuid.encode('ascii')).hexdigest()
        return hash[:10] + suuid


    def _path(self, uid):
        """ Returns a generated path (depth = self.DEPTH) from a uid.
        Example uid=831bde887afc11e5b45aa44e314f9270 and depth=2, then
        it returns "83/1b".
        If depth is larger than available bytes, then available bytes
        are returned only as path."""

        parts = [uid[i:i+self.SPLIT] for i in range(0, len(uid), self.SPLIT)]
        return os.path.join(*parts[:self.DEPTH])


    def _filename(self, uid):
        path = os.path.join(self.path, self._path(uid))
        return os.path.join(path, uid + self.SUFFIX)


    def save(self, data, _sync=False):
        uid = self._uid()
        self._write_queue.put((uid, data))
        if _sync:
            self._write_queue.join()
        return uid


    def update(self, uid, data, offset=0):
        with open(self._filename(uid), 'r+b') as f:
            f.seek(offset)
            return f.write(data)


    def rm(self, uid):
        filename = self._filename(uid)
        if not os.path.exists(filename):
            raise FileNotFoundError('File {} not found.'.format(filename))
        os.unlink(filename)


    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        _no_del = []
        for uid in uids:
            try:
                self.rm(uid)
            except FileNotFoundError:
                _no_del.append(uid)
        return _no_del


    def read(self, block, sync=False, offset=0, length=None):
        self._read_queue.put((block, offset, length))
        if sync:
            rblock, roffset, rlength, data = self.read_get()
            assert roffset == offset
            assert rlength == length
            if rblock.id != block.id:
                raise RuntimeError('Do not mix threaded reading with sync reading!')
            if data is None:
                raise FileNotFoundError('UID {} not found.'.format(block.uid))
            return data


    def read_get(self):
        block, offset, length, data = self._read_data_queue.get()
        self._read_data_queue.task_done()
        return block, offset, length, data


    def read_queue_size(self):
        return self._read_queue.qsize()


    def read_raw(self, uid, offset=0, length=None):
        filename = self._filename(uid)
        if not os.path.exists(filename):
            raise FileNotFoundError('File {} not found.'.format(filename))
        if offset == 0 and length is None:
            with open(filename, 'rb') as f:
                return f.read()
        else:
            with open(filename, 'rb') as f:
                if offset:
                    f.seek(offset)
                if length:
                    data = f.read(length)
                else:
                    data = f.read()
            time.sleep(self.read_throttling.consume(len(data)))
            return data


    def get_all_blob_uids(self, prefix=None):
        if prefix:
            raise RuntimeError('prefix is not supported on file backends.')
        matches = []
        for root, dirnames, filenames in os.walk(self.path):
            for filename in fnmatch.filter(filenames, '*.blob'):
                uid = filename.split('.')[0]
                matches.append(uid)
        return matches


    def close(self):
        for _writer_thread in self._writer_threads:
            self._write_queue.put(None)  # ends the thread
        for _writer_thread in self._writer_threads:
            _writer_thread.join()
        for _reader_thread in self._reader_threads:
            self._read_queue.put(None)  # ends the thread
        for _reader_thread in self._reader_threads:
            _reader_thread.join()
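
All of these backends shut down their worker pools the same way: close() puts one None sentinel per thread on the queue and then joins the threads. The snippet below is a minimal stand-alone sketch of that sentinel pattern; the names and the worker body are illustrative, not taken from the project.

import queue
import threading

work_queue = queue.Queue()


def worker(id_):
    while True:
        item = work_queue.get()
        if item is None:              # sentinel: this worker is finished
            break
        # ... process item here ...
        work_queue.task_done()


threads = [threading.Thread(target=worker, args=(i,), daemon=True)
           for i in range(4)]
for t in threads:
    t.start()

for _ in threads:                     # one sentinel per worker thread
    work_queue.put(None)
for t in threads:
    t.join()
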
Example #7
class DataBackend(_DataBackend):
    """ A DataBackend for performance testing. It reads and writes to NULL.
    DO NOT USE IN PRODUCTION.
    This essentially implements /dev/null
    """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    last_exception = None

    def __init__(self, config):
        self.default_block_size = int([
            value for key, value in config.items('DEFAULTS')
            if key == 'block_size'
        ][0])

        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)

        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(
            bandwidth_write)  # 0 disables throttling

        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)
        self._writer_threads = []
        self._reader_threads = []
        self.reader_thread_status = {}
        self.writer_thread_status = {}
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i, ))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
            self.writer_thread_status[i] = STATUS_NOTHING
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i, ))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)
            self.reader_thread_status[i] = STATUS_NOTHING

    def _writer(self, id_):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None or self.last_exception:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, data, callback = entry
            self.writer_thread_status[id_] = STATUS_THROTTLING
            time.sleep(self.write_throttling.consume(len(data)))
            self.writer_thread_status[id_] = STATUS_NOTHING
            t1 = time.time()
            try:
                # storing data to key uid
                self.writer_thread_status[id_] = STATUS_WRITING
                #time.sleep(.1)
                self.writer_thread_status[id_] = STATUS_NOTHING
            except Exception as e:
                self.last_exception = e
            else:
                t2 = time.time()
                # assert r == len(data)
                if callback:
                    callback(uid)
                self._write_queue.task_done()
                #logger.debug('Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, uid, t2-t1, self._write_queue.qsize()))
                #if random.random() > 0.9:
                #    raise ValueError("This is a test")

    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.last_exception:
                logger.debug("Reader {} finishing.".format(id_))
                break
            t1 = time.time()
            try:
                self.reader_thread_status[id_] = STATUS_READING
                data = self.read_raw(block.id, block.size)
                self.reader_thread_status[id_] = STATUS_THROTTLING
            except Exception as e:
                self.last_exception = e
            else:
                time.sleep(self.read_throttling.consume(len(data)))
                self.reader_thread_status[id_] = STATUS_NOTHING
                #time.sleep(.5)
                self._read_data_queue.put((block, data))
                t2 = time.time()
                self._read_queue.task_done()
                logger.debug(
                    'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                    .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def read_raw(self, block_id, block_size):
        return generate_block(block_id, block_size)

    def rm(self, uid):
        # Don't delete anything
        pass

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        # Don't delete anything

    def get_all_blob_uids(self, prefix=None):
        return []
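
The null backend's read_raw() relies on a generate_block(block_id, block_size) helper that is not shown in this excerpt. A hypothetical deterministic generator, good enough for performance testing, could look like the sketch below; it is only an assumption about what such a helper might do.

def generate_block(block_id, block_size):
    """Hypothetical: return block_size reproducible bytes for block_id."""
    seed = block_id.to_bytes(8, 'big')
    repeats = block_size // len(seed) + 1
    return (seed * repeats)[:block_size]


assert len(generate_block(42, 4096)) == 4096
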
Example #8
class DataBackend(_DataBackend):
    """ A DataBackend which stores in files. The files are stored in directories
    starting with the bytes of the generated uid. The depth of this structure
    is configurable via the DEPTH parameter, which defaults to 2. """

    DEPTH = 2
    SPLIT = 2
    SUFFIX = '.blob'
    WRITE_QUEUE_LENGTH = 10
    READ_QUEUE_LENGTH = 20

    _SUPPORTS_PARTIAL_READS = True
    _SUPPORTS_PARTIAL_WRITES = True

    def __init__(self, config):
        self.path = config.get('path')
        simultaneous_writes = config.getint('simultaneous_writes')
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH

        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(
            bandwidth_write)  # 0 disables throttling

        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)
        self._writer_threads = []
        self._reader_threads = []
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i, ))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i, ))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)

    def _writer(self, id_=0):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, data = entry
            path = os.path.join(self.path, self._path(uid))
            filename = self._filename(uid)
            time.sleep(self.write_throttling.consume(len(data)))
            t1 = time.time()
            try:
                with open(filename, 'wb') as f:
                    r = f.write(data)
            except FileNotFoundError:
                makedirs(path)
                with open(filename, 'wb') as f:
                    r = f.write(data)
            t2 = time.time()
            assert r == len(data)
            self._write_queue.task_done()
            logger.debug(
                'Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'
                .format(id_, uid, t2 - t1, self._write_queue.qsize()))

    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            d = self._read_queue.get()  # contains block, offset, length
            if d is None:
                logger.debug("Reader {} finishing.".format(id_))
                break
            block, offset, length = d
            t1 = time.time()
            try:
                data = self.read_raw(block.uid, offset, length)
            except FileNotFoundError:
                self._read_data_queue.put(
                    (block, offset, length, None))  # catch this!
            else:
                self._read_data_queue.put((block, offset, length, data))
                t2 = time.time()
                self._read_queue.task_done()
                logger.debug(
                    'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                    .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def _uid(self):
        # 32 chars are allowed, and we need to spread the first few chars so
        # that blobs are distributed nicely while avoiding hash collisions.
        # So we create a real base57-encoded uuid (22 chars) and prefix it
        # with the first 10 hex chars of its own md5 hash.
        suuid = shortuuid.uuid()
        hash = hashlib.md5(suuid.encode('ascii')).hexdigest()
        return hash[:10] + suuid
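    # Illustration of the uid layout: the first 10 hex characters of
    # md5(suuid) plus the 22-character base57 shortuuid give exactly 32
    # characters, and the hashed prefix is what _path() below spreads
    # across directories.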

    def _path(self, uid):
        """ Returns a generated path (depth = self.DEPTH) from a uid.
        Example uid=831bde887afc11e5b45aa44e314f9270 and depth=2, then
        it returns "83/1b".
        If depth is larger than available bytes, then available bytes
        are returned only as path."""

        parts = [uid[i:i + self.SPLIT] for i in range(0, len(uid), self.SPLIT)]
        return os.path.join(*parts[:self.DEPTH])

    def _filename(self, uid):
        path = os.path.join(self.path, self._path(uid))
        return os.path.join(path, uid + self.SUFFIX)
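    # With the docstring's example uid and DEPTH = 2, SPLIT = 2 and
    # SUFFIX = '.blob', _filename() yields something like
    # <path>/83/1b/831bde887afc11e5b45aa44e314f9270.blob.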

    def save(self, data, _sync=False):
        uid = self._uid()
        self._write_queue.put((uid, data))
        if _sync:
            self._write_queue.join()
        return uid

    def update(self, uid, data, offset=0):
        with open(self._filename(uid), 'r+b') as f:
            f.seek(offset)
            return f.write(data)
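    # update() is what backs _SUPPORTS_PARTIAL_WRITES above: blobs are plain
    # files, so a region can be patched in place via seek() + write().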

    def rm(self, uid):
        filename = self._filename(uid)
        if not os.path.exists(filename):
            raise FileNotFoundError('File {} not found.'.format(filename))
        os.unlink(filename)

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        _no_del = []
        for uid in uids:
            try:
                self.rm(uid)
            except FileNotFoundError:
                _no_del.append(uid)
        return _no_del

    def read(self, block, sync=False, offset=0, length=None):
        self._read_queue.put((block, offset, length))
        if sync:
            rblock, roffset, rlength, data = self.read_get()
            assert roffset == offset
            assert rlength == length
            if rblock.id != block.id:
                raise RuntimeError(
                    'Do not mix threaded reading with sync reading!')
            if data is None:
                raise FileNotFoundError('UID {} not found.'.format(block.uid))
            return data

    def read_get(self):
        block, offset, length, data = self._read_data_queue.get()
        self._read_data_queue.task_done()
        return block, offset, length, data

    def read_queue_size(self):
        return self._read_queue.qsize()

    def read_raw(self, uid, offset=0, length=None):
        filename = self._filename(uid)
        if not os.path.exists(filename):
            raise FileNotFoundError('File {} not found.'.format(filename))
        if offset == 0 and length is None:
            with open(filename, 'rb') as f:
                return f.read()
        else:
            with open(filename, 'rb') as f:
                if offset:
                    f.seek(offset)
                if length:
                    data = f.read(length)
                else:
                    data = f.read()
            time.sleep(self.read_throttling.consume(len(data)))
            return data

    def get_all_blob_uids(self, prefix=None):
        if prefix:
            raise RuntimeError('prefix is not supported on file backends.')
        matches = []
        for root, dirnames, filenames in os.walk(self.path):
            for filename in fnmatch.filter(filenames, '*.blob'):
                uid = filename.split('.')[0]
                matches.append(uid)
        return matches

    def close(self):
        for _writer_thread in self._writer_threads:
            self._write_queue.put(None)  # ends the thread
        for _writer_thread in self._writer_threads:
            _writer_thread.join()
        for _reader_thread in self._reader_threads:
            self._read_queue.put(None)  # ends the thread
        for _reader_thread in self._reader_threads:
            _reader_thread.join()
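
The file backend above needs very little scaffolding to try out. The sketch below is a hypothetical smoke test, not backy2 code: the DictConfig shim only mimics the get()/getint() calls made by __init__, and it assumes the module-level names the class relies on (queue, threading, os, logger, shortuuid, TokenBucket, makedirs) are already imported as in the original module.

import tempfile

class DictConfig:
    """Stand-in for the config object expected by DataBackend.__init__."""
    def __init__(self, values):
        self._values = values

    def get(self, key, default=None):
        return self._values.get(key, default)

    def getint(self, key, default=None):
        return int(self._values.get(key, default))

if __name__ == '__main__':
    config = DictConfig({
        'path': tempfile.mkdtemp(),
        'simultaneous_writes': 2,
        'simultaneous_reads': 2,
        'bandwidth_read': 0,   # 0 disables throttling
        'bandwidth_write': 0,
    })
    backend = DataBackend(config)
    uid = backend.save(b'some block data', _sync=True)  # waits for the writer
    assert backend.read_raw(uid) == b'some block data'
    backend.close()
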
Example #9
0
class DataBackend(_DataBackend):
    """ A DataBackend for performance testing. It reads and writes to NULL.
    DO NOT USE IN PRODUCTION.
    This essentially implements /dev/null
    """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    last_exception = None

    def __init__(self, config, encryption_key, encryption_version=None):
        super().__init__(config, encryption_key, encryption_version)
        # block_size lives in the DEFAULTS section of the config; pick it out.
        self.default_block_size = int([
            value for key, value in config.items('DEFAULTS')
            if key == 'block_size'
        ][0])

        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)

        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(
            bandwidth_write)  # 0 disables throttling

        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)
        self._writer_threads = []
        self._reader_threads = []
        self.reader_thread_status = {}
        self.writer_thread_status = {}
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i, ))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
            self.writer_thread_status[i] = STATUS_NOTHING
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i, ))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)
            self.reader_thread_status[i] = STATUS_NOTHING

    def _writer(self, id_):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None or self.last_exception:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, enc_envkey, enc_version, enc_nonce, data, callback = entry

            self.writer_thread_status[id_] = STATUS_THROTTLING
            time.sleep(self.write_throttling.consume(len(data)))
            self.writer_thread_status[id_] = STATUS_NOTHING
            t1 = time.time()
            try:
                # Null backend: nothing is actually stored under key uid; we
                # only cycle the thread status so monitoring stays meaningful.
                self.writer_thread_status[id_] = STATUS_WRITING
                #time.sleep(.1)
                self.writer_thread_status[id_] = STATUS_NOTHING
            except Exception as e:
                self.last_exception = e
            else:
                t2 = time.time()
                # assert r == len(data)
                if callback:
                    callback(uid, enc_envkey, enc_version, enc_nonce)
                self._write_queue.task_done()
                #logger.debug('Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'.format(id_, uid, t2-t1, self._write_queue.qsize()))
                #if random.random() > 0.9:
                #    raise ValueError("This is a test")

    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.last_exception:
                logger.debug("Reader {} finishing.".format(id_))
                break
            t1 = time.time()
            try:
                self.reader_thread_status[id_] = STATUS_READING
                data = self.read_raw(block)
                self.reader_thread_status[id_] = STATUS_THROTTLING
            except Exception as e:
                self.last_exception = e
            else:
                time.sleep(self.read_throttling.consume(len(data)))
                self.reader_thread_status[id_] = STATUS_NOTHING
                #time.sleep(.5)
                self._read_data_queue.put((block, data))
                t2 = time.time()
                self._read_queue.task_done()
                logger.debug(
                    'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                    .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))

    def read_raw(self, block):
        # We have to fake encrypted blocks here or the restore won't work.
        # This is bad because performance is then not really comparable, but
        # it is our only chance to restore from null://.
        raw_data = generate_block(block.id, block.size)
        data, enc_envkey = self.cc_latest.encrypt(
            raw_data,
            self.cc_latest.unwrap_key(binascii.unhexlify(block.enc_envkey)),
            raw_data[:16],  # use the first 16 bytes as nonce
        )
        #data = generate_block(block_id, block_size)
        return data

    # Overwriting save here as we need weak encryption for the null data backend.
    def save(self, data, _sync=False, callback=None):
        """ Saves data, returns unique ID """
        if self.last_exception:
            raise self.last_exception
        uid = self._uid()

        # Important: call this from the main thread, because zstandard
        # IS NOT THREAD SAFE, as stated at https://pypi.org/project/zstandard/:
        # """ Unless specified otherwise, assume that no two methods of
        # ZstdCompressor instances can be called from multiple Python threads
        # simultaneously. In other words, assume instances are not thread safe
        # unless stated otherwise."""
        #blob, enc_envkey = self.cc_latest.encrypt(data)
        blob, enc_envkey, enc_nonce = self.cc_latest.encrypt(
            data, None, data[:16])  # use the first 16 bytes as nonce
        enc_version = self.cc_latest.VERSION

        self._write_queue.put(
            (uid, enc_envkey, enc_version, enc_nonce, blob, callback))
        if _sync:
            self._write_queue.join()
        return uid
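    # Note: compressing/encrypting here in the caller's thread, rather than in
    # _writer(), is what keeps the shared compressor instance safe per the
    # zstandard thread-safety note above.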

    def rm(self, uid):
        # Don't delete anything
        pass

    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        # Don't delete anything, but honour the documented return value.
        return []

    def get_all_blob_uids(self, prefix=None):
        return []
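
Every backend in these examples throttles by sleeping for whatever TokenBucket.consume(len(data)) returns, and set_rate(0) disables throttling entirely. The real TokenBucket is not part of this excerpt, so the following is only a rough sketch of how such a bucket might behave (consume() returning the number of seconds to wait), not the backy2 implementation.

import time

class SketchTokenBucket:
    """Illustrative token bucket: consume(n) returns how long to sleep so
    that no more than roughly `rate` units per second pass through."""

    def __init__(self):
        self.rate = 0            # units (here: bytes) per second, 0 = unlimited
        self.tokens = 0.0
        self.last = time.time()

    def set_rate(self, rate):
        self.rate = rate
        self.tokens = rate       # start with a full bucket

    def consume(self, n):
        if self.rate == 0:       # throttling disabled
            return 0
        now = time.time()
        # Refill proportionally to the elapsed time, capped at one second's
        # worth of tokens.
        self.tokens = min(self.rate,
                          self.tokens + (now - self.last) * self.rate)
        self.last = now
        self.tokens -= n
        if self.tokens >= 0:
            return 0
        return -self.tokens / self.rate  # seconds until the deficit is repaid

A writer thread would then call time.sleep(bucket.consume(len(data))), which keeps the long-term write rate at about `rate` bytes per second.
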
Example #10
0
class DataBackend(_DataBackend):
    """ A DataBackend which stores in S3 compatible storages. The files are
    stored in a configurable bucket. """

    WRITE_QUEUE_LENGTH = 20
    READ_QUEUE_LENGTH = 20

    _SUPPORTS_PARTIAL_READS = False
    _SUPPORTS_PARTIAL_WRITES = False
    fatal_error = None

    def __init__(self, config):
        aws_access_key_id = config.get('aws_access_key_id')
        aws_secret_access_key = config.get('aws_secret_access_key')
        host = config.get('host')
        port = config.getint('port')
        is_secure = config.getboolean('is_secure')
        bucket_name = config.get('bucket_name', 'backy2')
        simultaneous_writes = config.getint('simultaneous_writes', 1)
        simultaneous_reads = config.getint('simultaneous_reads', 1)
        calling_format = boto.s3.connection.OrdinaryCallingFormat()
        bandwidth_read = config.getint('bandwidth_read', 0)
        bandwidth_write = config.getint('bandwidth_write', 0)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

        self.conn = boto.connect_s3(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            host=host,
            port=port,
            is_secure=is_secure,
            calling_format=calling_format)
        # create our bucket
        try:
            self.bucket = self.conn.create_bucket(bucket_name)
        except boto.exception.S3CreateError:
            # exists...
            pass
        except OSError as e:
            # no route to host
            self.fatal_error = e
            logger.error('Fatal error, dying: {}'.format(e))
            print('Fatal error: {}'.format(e))
            exit(10)

        self.write_queue_length = simultaneous_writes + self.WRITE_QUEUE_LENGTH
        self.read_queue_length = simultaneous_reads + self.READ_QUEUE_LENGTH
        self._write_queue = queue.Queue(self.write_queue_length)
        self._read_queue = queue.Queue()
        self._read_data_queue = queue.Queue(self.read_queue_length)
        self._writer_threads = []
        self._reader_threads = []
        for i in range(simultaneous_writes):
            _writer_thread = threading.Thread(target=self._writer, args=(i,))
            _writer_thread.daemon = True
            _writer_thread.start()
            self._writer_threads.append(_writer_thread)
        for i in range(simultaneous_reads):
            _reader_thread = threading.Thread(target=self._reader, args=(i,))
            _reader_thread.daemon = True
            _reader_thread.start()
            self._reader_threads.append(_reader_thread)


    def _writer(self, id_):
        """ A threaded background writer """
        while True:
            entry = self._write_queue.get()
            if entry is None or self.fatal_error:
                logger.debug("Writer {} finishing.".format(id_))
                break
            uid, data = entry
            time.sleep(self.write_throttling.consume(len(data)))
            t1 = time.time()
            key = self.bucket.new_key(uid)
            try:
                r = key.set_contents_from_string(data)
            except (
                    OSError,
                    boto.exception.BotoServerError,
                    boto.exception.S3ResponseError,
                    ) as e:
                # OSError happens when the S3 host is gone (i.e. network died,
                # host down, ...). boto tries hard to recover, however after
                # several attempts it will give up and raise.
                # BotoServerError happens when there is no server.
                # S3ResponseError sometimes happens when the cluster is about
                # to shut down. Hard to reproduce because the writer must write
                # at exactly this moment.
                # We let the backup job die here fatally.
                self.fatal_error = e
                logger.error('Fatal error, dying: {}'.format(e))
                #exit('Fatal error: {}'.format(e))  # this only raises SystemExit
                os._exit(11)
            t2 = time.time()
            assert r == len(data)
            self._write_queue.task_done()
            logger.debug(
                'Writer {} wrote data async. uid {} in {:.2f}s (Queue size is {})'
                .format(id_, uid, t2 - t1, self._write_queue.qsize()))


    def _reader(self, id_):
        """ A threaded background reader """
        while True:
            block = self._read_queue.get()  # contains block
            if block is None or self.fatal_error:
                logger.debug("Reader {} finishing.".format(id_))
                break
            t1 = time.time()
            try:
                data = self.read_raw(block.uid)
            except FileNotFoundError:
                # Signal the missing blob by passing None as data; the
                # consumer (read/read_get) must check for it and raise.
                self._read_data_queue.put((block, None))
                self._read_queue.task_done()
            else:
                self._read_data_queue.put((block, data))
                t2 = time.time()
                self._read_queue.task_done()
                logger.debug(
                    'Reader {} read data async. uid {} in {:.2f}s (Queue size is {})'
                    .format(id_, block.uid, t2 - t1, self._read_queue.qsize()))


    def read_raw(self, block_uid):
        key = self.bucket.get_key(block_uid)
        if not key:
            raise FileNotFoundError('UID {} not found.'.format(block_uid))
        data = key.get_contents_as_string()
        time.sleep(self.read_throttling.consume(len(data)))
        return data


    def _uid(self):
        # 32 chars are allowed, and we need to spread the first few chars so
        # that blobs are distributed nicely while avoiding hash collisions.
        # So we create a real base57-encoded uuid (22 chars) and prefix it
        # with the first 10 hex chars of its own md5 hash.
        suuid = shortuuid.uuid()
        hash = hashlib.md5(suuid.encode('ascii')).hexdigest()
        return hash[:10] + suuid


    def save(self, data, _sync=False):
        if self.fatal_error:
            raise self.fatal_error
        uid = self._uid()
        self._write_queue.put((uid, data))
        if _sync:
            self._write_queue.join()
        return uid


    def rm(self, uid):
        key = self.bucket.get_key(uid)
        if not key:
            raise FileNotFoundError('UID {} not found.'.format(uid))
        self.bucket.delete_key(uid)


    def rm_many(self, uids):
        """ Deletes many uids from the data backend and returns a list
        of uids that couldn't be deleted.
        """
        errors = self.bucket.delete_keys(uids, quiet=True)
        if errors.errors:
            # unable to test this. ceph object gateway doesn't return errors.
            # raise FileNotFoundError('UIDS {} not found.'.format(errors.errors))
            return errors.errors  # TODO: which should be a list of uids.


    def read(self, block, sync=False):
        self._read_queue.put(block)
        if sync:
            rblock, data = self.read_get()
            if rblock.id != block.id:
                raise RuntimeError('Do not mix threaded reading with sync reading!')
            if data is None:
                raise FileNotFoundError('UID {} not found.'.format(block.uid))
            return data


    def read_get(self):
        block, data = self._read_data_queue.get()
        offset = 0
        # data is None when the uid was not found; report a zero length then.
        length = len(data) if data is not None else 0
        self._read_data_queue.task_done()
        return block, offset, length, data


    def read_queue_size(self):
        return self._read_queue.qsize()


    def get_all_blob_uids(self, prefix=None):
        return [k.name for k in self.bucket.list(prefix)]


    def close(self):
        for _writer_thread in self._writer_threads:
            self._write_queue.put(None)  # ends the thread
        for _writer_thread in self._writer_threads:
            _writer_thread.join()
        for _reader_thread in self._reader_threads:
            self._read_queue.put(None)  # ends the thread
        for _reader_thread in self._reader_threads:
            _reader_thread.join()
        self.conn.close()
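
All three backends share the same skeleton: a bounded write queue and an unbounded read queue drained by daemon threads, with close() pushing one None sentinel per thread and then joining them. Stripped of backend specifics, that pattern looks roughly like the sketch below (a generic illustration, not backy2 API; the names make_pool and close_pool are invented here).

import queue
import threading

def make_pool(worker, count, queue_size):
    """Start `count` daemon threads that drain a bounded queue with `worker`."""
    q = queue.Queue(queue_size)

    def run():
        while True:
            item = q.get()
            if item is None:      # sentinel: one per thread, see close_pool()
                break
            try:
                worker(item)
            finally:
                q.task_done()     # keeps q.join() usable for synchronous saves

    threads = [threading.Thread(target=run, daemon=True) for _ in range(count)]
    for t in threads:
        t.start()
    return q, threads

def close_pool(q, threads):
    """Mirrors DataBackend.close(): one sentinel per thread, then join."""
    for _ in threads:
        q.put(None)
    for t in threads:
        t.join()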