def _backend_property(self, function, dynamic):
    """
    Resolve a dynamic property, serving it from the volatile cache when possible.

    The value is stored under '<self._key>_<dynamic.name>'. On a cache miss the
    getter is invoked (it receives the dynamic when its signature declares a
    'dynamic' argument), a non-None result is validated against
    dynamic.return_type, and the result is cached when dynamic.timeout > 0.

    :param function: Getter that loads the property value
    :param dynamic: Descriptor of the dynamic property (name, locked, return_type, timeout)
    :return: The cached or freshly loaded value
    :raises TypeError: If the loaded value does not match dynamic.return_type
    """
    property_name = dynamic.name
    cache_key = '{0}_{1}'.format(self._key, property_name)
    mutex = volatile_mutex(cache_key)
    try:
        value = self._volatile.get(cache_key)
        if value is not None:
            return value
        if dynamic.locked:
            # Second lookup once the lock is held
            mutex.acquire()
            value = self._volatile.get(cache_key)
            if value is not None:
                return value
        accepts_dynamic = 'dynamic' in inspect.getargspec(function).args
        # Load data from backend
        value = function(dynamic=dynamic) if accepts_dynamic else function()
        if value is not None:
            correct, allowed_types, given_type = Toolbox.check_type(value, dynamic.return_type)
            if not correct:
                raise TypeError('Dynamic property {0} allows types {1}. {2} given'.format(
                    property_name, str(allowed_types), given_type
                ))
        if dynamic.timeout > 0:
            self._volatile.set(cache_key, value, dynamic.timeout)
        return value
    finally:
        # Released unconditionally, also when the lock was never acquired
        mutex.release()
def pulse():
    """
    Update the process heartbeat of the local StorageRouter and clear the ARP
    cache entry of every other StorageRouter whose heartbeat is too old.

    :return: None
    """
    logger = LogHandler.get('extensions', name='heartbeat')
    machine_id = System.get_my_machine_id()
    current_time = int(time.time())
    for node in StorageRouterList.get_storagerouters():
        is_local = node.machine_id == machine_id
        if is_local:
            mutex_name = 'storagerouter_heartbeat_{0}'.format(node.guid)
            with volatile_mutex(mutex_name):
                node_save = StorageRouter(node.guid)
                node_save.heartbeats['process'] = current_time
                node_save.save()
            ping_signature = StorageRouterController.ping.s(node.guid, current_time)
            ping_signature.apply_async(routing_key='sr.{0}'.format(machine_id))
            continue
        try:
            # check timeout of other nodes and clear arp cache
            if node.heartbeats and 'process' in node.heartbeats and current_time - node.heartbeats['process'] >= HeartBeat.ARP_TIMEOUT:
                # Single quotes in the name are escaped for the shell
                escaped_name = node.name.replace(r"'", r"'\''")
                check_output("/usr/sbin/arp -d '{0}'".format(escaped_name), shell=True)
        except CalledProcessError:
            logger.exception('Error clearing ARP cache')
def get_unused_arakoon_metadata_and_claim(cluster_type, locked=True):
    """
    Find the first unused, non-internal arakoon cluster of the given type and claim it
    :param cluster_type: Type of arakoon cluster (See ServiceType.ARAKOON_CLUSTER_TYPES)
    :type cluster_type: str
    :param locked: Execute this in a locked context
    :type locked: bool
    :return: The claimed cluster metadata, or None when '/ovs/arakoon' does not exist or nothing matches
    :rtype: ArakoonClusterMetadata
    :raises ValueError: If the cluster type is not supported
    """
    cluster_type = cluster_type.upper()
    if cluster_type not in ServiceType.ARAKOON_CLUSTER_TYPES:
        raise ValueError('Unsupported arakoon cluster type provided. Please choose from {0}'.format(', '.join(ServiceType.ARAKOON_CLUSTER_TYPES)))
    if not EtcdConfiguration.dir_exists('/ovs/arakoon'):
        return None

    use_lock = locked is True
    mutex = volatile_mutex('claim_arakoon_metadata', wait=10)
    try:
        if use_lock:
            mutex.acquire()

        for cluster_name in EtcdConfiguration.list('/ovs/arakoon'):
            metadata = ArakoonClusterMetadata(cluster_id=cluster_name)
            metadata.load_metadata()
            claimable = metadata.cluster_type == cluster_type and metadata.in_use is False and metadata.internal is False
            if claimable:
                metadata.claim()
                return metadata
        return None
    finally:
        if use_lock:
            mutex.release()
def new_function(*args, **kwargs):
    """
    Wrapped function: applies the rate limit per decorated function and per
    client IP (HTTP_X_REAL_IP) before delegating to the wrapped call.

    :raises HttpTooManyRequestsException: While a timeout is active or when the limit is exceeded
    """
    request = _find_request(args)
    now = time.time()
    key = 'ovs_api_limit_{0}.{1}_{2}'.format(
        f.__module__, f.__name__,
        request.META['HTTP_X_REAL_IP']
    )
    client = VolatileFactory.get_client()
    with volatile_mutex(key):
        rate_info = client.get(key, {'calls': [], 'timeout': None})
        active_timeout = rate_info['timeout']
        if active_timeout is not None:
            if active_timeout > now:
                remaining = active_timeout - now
                logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, remaining))
                raise HttpTooManyRequestsException(error='rate_limit_timeout',
                                                   error_description='Rate limit timeout ({0}s remaining)'.format(round(remaining, 2)))
            # The timeout expired
            rate_info['timeout'] = None
        # Only keep calls inside the window, then register this call
        recent_calls = [call for call in rate_info['calls'] if call > (now - per)]
        recent_calls.append(now)
        rate_info['calls'] = recent_calls
        calls = len(recent_calls)
        if calls > amount:
            rate_info['timeout'] = now + timeout
            client.set(key, rate_info)
            logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, timeout))
            raise HttpTooManyRequestsException(error='rate_limit_reached',
                                               error_description='Rate limit reached ({0} in last {1}s)'.format(calls, per))
        client.set(key, rate_info)
    return f(*args, **kwargs)
def _backend_property(self, function, dynamic):
    """
    Resolve a dynamic property, serving it from the volatile cache when possible.

    The value is stored under '<self._key>_<dynamic.name>'. On a cache miss the
    getter is invoked (it receives the dynamic when its signature declares a
    'dynamic' argument), a non-None result is validated against
    dynamic.return_type, and the result is cached when dynamic.timeout > 0.

    :param function: Getter that loads the property value
    :param dynamic: Descriptor of the dynamic property (name, locked, return_type, timeout)
    :return: The cached or freshly loaded value
    :raises TypeError: If the loaded value does not match dynamic.return_type
    """
    property_name = dynamic.name
    cache_key = '{0}_{1}'.format(self._key, property_name)
    mutex = volatile_mutex(cache_key)
    try:
        value = self._volatile.get(cache_key)
        if value is not None:
            return value
        if dynamic.locked:
            # Second lookup once the lock is held
            mutex.acquire()
            value = self._volatile.get(cache_key)
            if value is not None:
                return value
        accepts_dynamic = 'dynamic' in inspect.getargspec(function).args
        # Load data from backend
        value = function(dynamic=dynamic) if accepts_dynamic else function()
        if value is not None:
            correct, allowed_types, given_type = Toolbox.check_type(value, dynamic.return_type)
            if not correct:
                raise TypeError('Dynamic property {0} allows types {1}. {2} given'.format(
                    property_name, str(allowed_types), given_type
                ))
        if dynamic.timeout > 0:
            self._volatile.set(cache_key, value, dynamic.timeout)
        return value
    finally:
        # Released unconditionally, also when the lock was never acquired
        mutex.release()
def update_value(key, append, value_to_update=None):
    """
    Store the specified value in the PersistentFactory
    :param key: Key to store the value for
    :param append: If True, the specified value will be appended else element at index 0 will be popped
    :param value_to_update: Value to append to the list or remove from the list
    :return: Updated value
    """
    with volatile_mutex(name=key, wait=5):
        if persistent_client.exists(key):
            val = persistent_client.get(key)
            if append is True and value_to_update is not None:
                val['values'].append(value_to_update)
            elif append is False and value_to_update is not None:
                # Remove the first matching entry only
                for value_item in val['values']:
                    if value_item == value_to_update:
                        val['values'].remove(value_item)
                        break
            elif append is False and len(val['values']) > 0:
                val['values'].pop(0)
        else:
            # Log the key that is actually being initialised (the 'key' argument)
            log_message('Setting initial value for key {0}'.format(key))
            val = {'mode': mode, 'values': []}
        persistent_client.set(key, val)
    return val
def get_unused_arakoon_metadata_and_claim(cluster_type, locked=True):
    """
    Find the first unused, non-internal arakoon cluster of the given type and claim it
    :param cluster_type: Type of arakoon cluster (See ServiceType.ARAKOON_CLUSTER_TYPES)
    :type cluster_type: str
    :param locked: Execute this in a locked context
    :type locked: bool
    :return: The claimed cluster metadata, or None when '/ovs/arakoon' does not exist or nothing matches
    :rtype: ArakoonClusterMetadata
    :raises ValueError: If the cluster type is not supported
    """
    cluster_type = cluster_type.upper()
    if cluster_type not in ServiceType.ARAKOON_CLUSTER_TYPES:
        raise ValueError('Unsupported arakoon cluster type provided. Please choose from {0}'.format(', '.join(ServiceType.ARAKOON_CLUSTER_TYPES)))
    if not EtcdConfiguration.dir_exists('/ovs/arakoon'):
        return None

    use_lock = locked is True
    mutex = volatile_mutex('claim_arakoon_metadata', wait=10)
    try:
        if use_lock:
            mutex.acquire()

        for cluster_name in EtcdConfiguration.list('/ovs/arakoon'):
            metadata = ArakoonClusterMetadata(cluster_id=cluster_name)
            metadata.load_metadata()
            claimable = metadata.cluster_type == cluster_type and metadata.in_use is False and metadata.internal is False
            if claimable:
                metadata.claim()
                return metadata
        return None
    finally:
        if use_lock:
            mutex.release()
def extend_cluster(master_ip, new_ip, cluster_name, base_dir, locked=True):
    """
    Extends a cluster to a given new node
    :param master_ip: IP of one of the already existing nodes
    :type master_ip: str
    :param new_ip: IP address of the node to be added
    :type new_ip: str
    :param cluster_name: Name of the cluster to be extended
    :type cluster_name: str
    :param base_dir: Base directory that will hold the db and tlogs
    :type base_dir: str
    :param locked: Indicates whether the extend should run in a locked context (e.g. to prevent port conflicts)
    :type locked: bool
    :return: Ports used by arakoon cluster
    :rtype: dict
    """
    ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2}'.format(cluster_name, master_ip, new_ip))
    base_dir = base_dir.rstrip('/')

    config = ArakoonClusterConfig(cluster_name)
    config.load_config()

    client = SSHClient(new_ip, username=ArakoonInstaller.SSHCLIENT_USER)
    node_name = System.get_my_machine_id(client)

    home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
    log_dir = ArakoonInstaller.ARAKOON_LOG_DIR.format(cluster_name)
    tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
    ArakoonInstaller.clean_leftover_arakoon_data(new_ip, {log_dir: True,
                                                          home_dir: False,
                                                          tlog_dir: False})

    port_mutex = None
    try:
        if locked is True:
            from ovs.extensions.generic.volatilemutex import volatile_mutex
            port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(new_ip))
            port_mutex.acquire(wait=60)
        ports = ArakoonInstaller._get_free_ports(client)
        known_node_names = [node.name for node in config.nodes]
        if node_name not in known_node_names:
            new_node = ArakoonNodeConfig(name=node_name,
                                         ip=new_ip,
                                         client_port=ports[0],
                                         messaging_port=ports[1],
                                         log_dir=log_dir,
                                         home=home_dir,
                                         tlog_dir=tlog_dir)
            config.nodes.append(new_node)
        ArakoonInstaller._deploy(config)
    finally:
        # Only release when the lock object was created
        if port_mutex is not None:
            port_mutex.release()

    ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2} completed'.format(cluster_name, master_ip, new_ip))
    return {'client_port': ports[0],
            'messaging_port': ports[1]}
def get_unused_arakoon_metadata_and_claim(cluster_type, locked=True):
    """
    Find the first unused, non-internal arakoon cluster of the given type, mark it in use and return its metadata
    :param cluster_type: Type of arakoon cluster (See ServiceType.ARAKOON_CLUSTER_TYPES)
    :type cluster_type: str
    :param locked: Execute this in a locked context
    :type locked: bool
    :return: Metadata of the arakoon cluster, or None when the config root does not exist or nothing matches
    :rtype: dict
    :raises ValueError: If the cluster type is not supported
    """
    cluster_type = cluster_type.upper()
    if cluster_type not in ServiceType.ARAKOON_CLUSTER_TYPES:
        raise ValueError('Unsupported arakoon cluster type provided. Please choose from {0}'.format(', '.join(ServiceType.ARAKOON_CLUSTER_TYPES)))
    if not Configuration.dir_exists(ArakoonInstaller.CONFIG_ROOT):
        return None

    use_lock = locked is True
    mutex = volatile_mutex('claim_arakoon_metadata', wait=10)
    try:
        if use_lock:
            mutex.acquire()

        for cluster_name in Configuration.list(ArakoonInstaller.CONFIG_ROOT):
            config = ArakoonClusterConfig(cluster_id=cluster_name, filesystem=False)
            config.load_config()
            arakoon_client = ArakoonInstaller.build_client(config)
            if not arakoon_client.exists(ArakoonInstaller.METADATA_KEY):
                continue
            metadata = json.loads(arakoon_client.get(ArakoonInstaller.METADATA_KEY))
            claimable = metadata['cluster_type'] == cluster_type and metadata['in_use'] is False and metadata['internal'] is False
            if claimable:
                # Persist the claim before handing the metadata out
                metadata['in_use'] = True
                arakoon_client.set(ArakoonInstaller.METADATA_KEY, json.dumps(metadata, indent=4))
                return metadata
        return None
    finally:
        if use_lock:
            mutex.release()
def new_function(*args, **kwargs):
    """
    Wrapped function: applies the rate limit per decorated function and per
    client IP (HTTP_X_REAL_IP) before delegating to the wrapped call.

    :raises HttpTooManyRequestsException: While a timeout is active or when the limit is exceeded
    """
    request = _find_request(args)
    now = time.time()
    key = 'ovs_api_limit_{0}.{1}_{2}'.format(
        f.__module__, f.__name__,
        request.META['HTTP_X_REAL_IP']
    )
    client = VolatileFactory.get_client()
    with volatile_mutex(key):
        rate_info = client.get(key, {'calls': [], 'timeout': None})
        active_timeout = rate_info['timeout']
        if active_timeout is not None:
            if active_timeout > now:
                remaining = active_timeout - now
                logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, remaining))
                raise HttpTooManyRequestsException(error='rate_limit_timeout',
                                                   error_description='Rate limit timeout ({0}s remaining)'.format(round(remaining, 2)))
            # The timeout expired
            rate_info['timeout'] = None
        # Only keep calls inside the window, then register this call
        recent_calls = [call for call in rate_info['calls'] if call > (now - per)]
        recent_calls.append(now)
        rate_info['calls'] = recent_calls
        calls = len(recent_calls)
        if calls > amount:
            rate_info['timeout'] = now + timeout
            client.set(key, rate_info)
            logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, timeout))
            raise HttpTooManyRequestsException(error='rate_limit_reached',
                                               error_description='Rate limit reached ({0} in last {1}s)'.format(calls, per))
        client.set(key, rate_info)
    return f(*args, **kwargs)
def update_value(key, append, value_to_update=None):
    """
    Store the specified value in the PersistentFactory
    :param key: Key to store the value for
    :param append: If True, the specified value will be appended else element at index 0 will be popped
    :param value_to_update: Value to append to the list or remove from the list
    :return: Updated value
    """
    with volatile_mutex(name=key, wait=5):
        if not persistent_client.exists(key):
            log_message('Setting initial value for key {0}'.format(key))
            val = {'mode': mode, 'values': []}
        else:
            val = persistent_client.get(key)
            has_value = value_to_update is not None
            if append is True and has_value:
                val['values'].append(value_to_update)
            elif append is False and has_value:
                # Remove the first matching entry only
                for value_item in val['values']:
                    if value_item == value_to_update:
                        val['values'].remove(value_item)
                        break
            elif append is False and len(val['values']) > 0:
                val['values'].pop(0)
            log_message('Amount of jobs pending for key {0}: {1}'.format(key, len(val['values'])))
            for kwarg in val['values']:
                log_message(' KWARGS: {0}'.format(kwarg['kwargs']))
        persistent_client.set(key, val)
    return val
def wrapped(*args, **kwargs):
    """
    Run the decorated function under a local (file) or cluster (volatile) lock.

    When the lock is obtained, information about the executing StorageRouter is
    cached under the lock key and the decorated function is executed.
    When the lock is not available, nothing is executed unless a callback was
    given: the callback is then invoked with the executor information as
    keyword arguments.

    :return: Result of the decorated function, result of the callback, or None
    :raises ValueError: On an unsupported lock type, or when the executor information could not be fetched within 5 seconds
    :raises TypeError: If the callback expects a result_handler and none was passed
    """
    if lock_type == 'local':
        _mutex = file_mutex(key)
    elif lock_type == 'cluster':
        _mutex = volatile_mutex(key)
    else:
        raise ValueError('Lock type {0} is not supported!'.format(lock_type))
    try:
        _mutex.acquire(wait=0.005)
        local_sr = System.get_my_storagerouter()
        CacheHelper.set(key=key, item={'ip': local_sr.ip, 'hostname': local_sr.name}, expire_time=60)
        return func(*args, **kwargs)
    except (NoFileLockAvailableException, NoVolatileLockAvailableException):
        if callback is None:
            return
        else:
            executor_info = None
            start = time.time()
            while executor_info is None:
                # Calculated guesswork. If a callback function would be expected, the acquire has happened for another executor the volatilekey should be set eventually
                # However by setting it after the acquire, the callback executor and original method executor can race between fetch and set
                # A better implementation would be relying on the fwk ensure_single_decorator as they check for various races themselves
                # This is just a poor mans, temporary implementation
                # Elapsed time is 'now - start'; the reversed subtraction is never positive and would never time out
                if time.time() - start > 5:
                    raise ValueError('Timed out after 5 seconds while fetching the information about the executor.')
                try:
                    executor_info = CacheHelper.get(key=key)
                except Exception:
                    # Best effort: retry until the timeout, without swallowing KeyboardInterrupt/SystemExit
                    pass
            callback_func = callback.__func__ if isinstance(callback, staticmethod) else callback
            argnames = inspect.getargspec(callback_func)[0]
            arguments = list(args)
            kwargs.update({'test_name': func.__name__})
            if executor_info is not None:
                kwargs.update(executor_info)
                if 'result_handler' in argnames:
                    result_handler = kwargs.get('result_handler')
                    # A positional result collector is moved to the keyword arguments
                    for index, arg in enumerate(arguments):
                        if isinstance(arg, HCResults.HCResultCollector):
                            result_handler = arguments.pop(index)
                            break
                    if result_handler is None:
                        # Report the expected class itself, not the type of that class
                        raise TypeError('Expected an instance of {}'.format(HCResults.HCResultCollector))
                    kwargs['result_handler'] = result_handler
            return callback_func(*tuple(arguments), **kwargs)
    finally:
        _mutex.release()
def _clean_cache():
    """
    Celery startup script: drop every ensure-single registration whose task is
    no longer active. Keys left without registrations are deleted.
    """
    ovs_logger.info('Executing celery "clear_cache" startup script...')
    from ovs.lib.helpers.decorators import ENSURE_SINGLE_KEY

    # Collect the ids of all tasks reported as active
    active = inspect().active()
    active_tasks = []
    if active is not None:
        for tasks in active.itervalues():
            active_tasks.extend([task['id'] for task in tasks])

    cache = PersistentFactory.get_client()
    for key in cache.prefix(ENSURE_SINGLE_KEY):
        try:
            with volatile_mutex(name=key, wait=5):
                entry = cache.get(key)
                new_values = []
                for registration in entry.get('values', []):
                    task_id = registration.get('task_id')
                    if task_id is None or task_id not in active_tasks:
                        continue
                    new_values.append(registration)
                if new_values:
                    entry['values'] = new_values
                    cache.set(key, entry)
                    ovs_logger.info('Updated key {0}'.format(key))
                else:
                    cache.delete(key)
                    ovs_logger.info('Deleted key {0}'.format(key))
        except KeyNotFoundException:
            pass
    ovs_logger.info('Executing celery "clear_cache" startup script... done')
def pulse():
    """
    Update the process heartbeat of the local StorageRouter and clear the ARP
    cache entry of every other StorageRouter whose heartbeat is too old.

    :return: None
    """
    logger = Logger('extensions-generic')
    machine_id = System.get_my_machine_id()
    current_time = int(time.time())
    for node in StorageRouterList.get_storagerouters():
        is_local = node.machine_id == machine_id
        if is_local:
            mutex_name = 'storagerouter_heartbeat_{0}'.format(node.guid)
            with volatile_mutex(mutex_name):
                node_save = StorageRouter(node.guid)
                node_save.heartbeats['process'] = current_time
                node_save.save()
            ping_signature = StorageRouterController.ping.s(node.guid, current_time)
            ping_signature.apply_async(routing_key='sr.{0}'.format(machine_id))
            continue
        try:
            # check timeout of other nodes and clear arp cache
            if node.heartbeats and 'process' in node.heartbeats and current_time - node.heartbeats['process'] >= HeartBeat.ARP_TIMEOUT:
                # Single quotes in the name are escaped for the shell
                escaped_name = node.name.replace(r"'", r"'\''")
                check_output("/usr/sbin/arp -d '{0}'".format(escaped_name), shell=True)
        except CalledProcessError:
            logger.exception('Error clearing ARP cache')
def _unregister_task(self, task_data=None, delete=False):
    # type: (dict, bool) -> None
    """
    Unregisters the execution of the task
    :param task_data: Dict containing all information about the task to unregister
    :type task_data: dict
    :param delete: Delete the registration key
    :type delete: bool
    :return: None
    """
    # @todo use transaction
    with volatile_mutex(self.persistent_key, wait=5):
        if task_data:
            current_registrations, _ = self.get_task_registrations()
            try:
                current_registrations.remove(task_data)
                self.persistent_client.set(self.task_registration_key, current_registrations)
            except ValueError:
                # Registration was already removed
                pass
        if delete:
            deletion_message = 'Deleting key {0}'.format(self.task_registration_key)
            self.logger.info(self.message.format(deletion_message))
            self.persistent_client.delete(self.task_registration_key, must_exist=False)
def new_function(self, request, *args, **kwargs):
    """
    Wrapped function: applies the rate limit per decorated function and per
    client IP (HTTP_X_REAL_IP). A throttled call returns a
    (HttpResponse, payload, 429) tuple instead of executing the wrapped call.
    """
    now = time.time()
    key = 'ovs_api_limit_{0}.{1}_{2}'.format(
        f.__module__, f.__name__,
        request.META['HTTP_X_REAL_IP']
    )
    client = VolatileFactory.get_client()
    mutex = volatile_mutex(key)
    try:
        mutex.acquire()
        rate_info = client.get(key, {'calls': [], 'timeout': None})
        active_timeout = rate_info['timeout']
        if active_timeout is not None:
            if active_timeout > now:
                remaining = active_timeout - now
                logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, remaining))
                payload = {'error_code': 'rate_limit_timeout',
                           'error': 'Rate limit timeout ({0}s remaining)'.format(round(remaining, 2))}
                return HttpResponse, payload, 429
            # The timeout expired
            rate_info['timeout'] = None
        # Only keep calls inside the window, then register this call
        recent_calls = [call for call in rate_info['calls'] if call > (now - per)]
        recent_calls.append(now)
        rate_info['calls'] = recent_calls
        calls = len(recent_calls)
        if calls > amount:
            rate_info['timeout'] = now + timeout
            client.set(key, rate_info)
            logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, timeout))
            payload = {'error_code': 'rate_limit_reached',
                       'error': 'Rate limit reached ({0} in last {1}s)'.format(calls, per)}
            return HttpResponse, payload, 429
        client.set(key, rate_info)
    finally:
        mutex.release()
    return f(self, request, *args, **kwargs)
def new_function(*args, **kwargs):
    """
    Wrapped function: applies the rate limit per decorated function and per
    client IP (HTTP_X_REAL_IP) before delegating to the wrapped call.

    :raises Throttled: While a timeout is active or when the limit is exceeded
    """
    request = _find_request(args)
    now = time.time()
    key = 'ovs_api_limit_{0}.{1}_{2}'.format(
        f.__module__, f.__name__,
        request.META['HTTP_X_REAL_IP']
    )
    client = VolatileFactory.get_client()
    with volatile_mutex(key):
        rate_info = client.get(key, {'calls': [], 'timeout': None})
        active_timeout = rate_info['timeout']
        if active_timeout is not None:
            if active_timeout > now:
                remaining = active_timeout - now
                logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, remaining))
                raise Throttled(wait=remaining)
            # The timeout expired
            rate_info['timeout'] = None
        # Only keep calls inside the window, then register this call
        recent_calls = [call for call in rate_info['calls'] if call > (now - per)]
        recent_calls.append(now)
        rate_info['calls'] = recent_calls
        calls = len(recent_calls)
        if calls > amount:
            rate_info['timeout'] = now + timeout
            client.set(key, rate_info)
            logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, timeout))
            raise Throttled(wait=timeout)
        client.set(key, rate_info)
    return f(*args, **kwargs)
def new_function(*args, **kwargs):
    """
    Wrapped function: applies the rate limit per decorated function and per
    client IP (HTTP_X_REAL_IP) before delegating to the wrapped call.

    :raises Throttled: While a timeout is active or when the limit is exceeded
    """
    request = _find_request(args)
    now = time.time()
    key = 'ovs_api_limit_{0}.{1}_{2}'.format(
        f.__module__, f.__name__,
        request.META['HTTP_X_REAL_IP']
    )
    client = VolatileFactory.get_client()
    with volatile_mutex(key):
        rate_info = client.get(key, {'calls': [], 'timeout': None})
        active_timeout = rate_info['timeout']
        if active_timeout is not None:
            if active_timeout > now:
                remaining = active_timeout - now
                logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, remaining))
                raise Throttled(wait=remaining)
            # The timeout expired
            rate_info['timeout'] = None
        # Only keep calls inside the window, then register this call
        recent_calls = [call for call in rate_info['calls'] if call > (now - per)]
        recent_calls.append(now)
        rate_info['calls'] = recent_calls
        calls = len(recent_calls)
        if calls > amount:
            rate_info['timeout'] = now + timeout
            client.set(key, rate_info)
            logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, timeout))
            raise Throttled(wait=timeout)
        client.set(key, rate_info)
    return f(*args, **kwargs)
def extend_cluster(master_ip, new_ip, cluster_name, base_dir, locked=True):
    """
    Extends a cluster to a given new node
    :param master_ip: IP of one of the already existing nodes
    :type master_ip: str
    :param new_ip: IP address of the node to be added
    :type new_ip: str
    :param cluster_name: Name of the cluster to be extended
    :type cluster_name: str
    :param base_dir: Base directory that will hold the db and tlogs
    :type base_dir: str
    :param locked: Indicates whether the extend should run in a locked context (e.g. to prevent port conflicts)
    :type locked: bool
    :return: Ports used by arakoon cluster
    :rtype: dict
    """
    ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2}'.format(cluster_name, master_ip, new_ip))
    base_dir = base_dir.rstrip('/')

    config = ArakoonClusterConfig(cluster_name)
    config.load_config()

    client = SSHClient(new_ip, username=ArakoonInstaller.SSHCLIENT_USER)
    node_name = System.get_my_machine_id(client)

    home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
    tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
    ArakoonInstaller.clean_leftover_arakoon_data(new_ip, [home_dir, tlog_dir])

    port_mutex = None
    try:
        if locked is True:
            from ovs.extensions.generic.volatilemutex import volatile_mutex
            port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(new_ip))
            port_mutex.acquire(wait=60)
        ports = ArakoonInstaller._get_free_ports(client)
        known_node_names = [node.name for node in config.nodes]
        if node_name not in known_node_names:
            new_node = ArakoonNodeConfig(name=node_name,
                                         ip=new_ip,
                                         client_port=ports[0],
                                         messaging_port=ports[1],
                                         log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                         crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                         home=home_dir,
                                         tlog_dir=tlog_dir)
            config.nodes.append(new_node)
        ArakoonInstaller._deploy(config)
    finally:
        # Only release when the lock object was created
        if port_mutex is not None:
            port_mutex.release()

    ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2} completed'.format(cluster_name, master_ip, new_ip))
    return {'client_port': ports[0],
            'messaging_port': ports[1]}
def __init__(self, *args, **kwargs):
    """
    Initializes the distributed scheduler: logger, persistent client,
    namespace and the 'celery_beat' mutex are set up before the parent
    constructor is invoked.
    """
    self._has_lock = False
    self._namespace = 'ovs_celery_beat'
    self._logger = LogHandler.get('celery', name='celery beat')
    self._persistent = PersistentFactory.get_client()
    self._mutex = volatile_mutex('celery_beat', 10)
    super(DistributedScheduler, self).__init__(*args, **kwargs)
    self._logger.debug('DS init')
def __init__(self, *args, **kwargs):
    """
    Initializes the distributed scheduler: logger, persistent client,
    namespace and the 'celery_beat' mutex are set up before the parent
    constructor is invoked.
    """
    self._has_lock = False
    self._namespace = 'ovs_celery_beat'
    self._logger = LogHandler.get('celery', name='celery beat')
    self._persistent = PersistentFactory.get_client()
    self._mutex = volatile_mutex('celery_beat', 10)
    super(DistributedScheduler, self).__init__(*args, **kwargs)
    self._logger.debug('DS init')
def create(self):
    """
    Prepares a new Storagedriver for a given vPool and Storagerouter
    Free ports are selected and the StorageDriver and its ALBA proxy services are modeled
    :return: None
    :rtype: NoneType
    :raises RuntimeError: If no StorageRouterInstaller instance is set
    """
    if self.sr_installer is None:
        raise RuntimeError('No StorageRouterInstaller instance found')

    machine_id = System.get_my_machine_id(client=self.sr_installer.root_client)
    port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
    storagerouter = self.sr_installer.storagerouter
    with volatile_mutex('add_vpool_get_free_ports_{0}'.format(machine_id), wait=30):
        # Ports already modeled for this StorageRouter must not be handed out again
        model_ports_in_use = []
        for sd in StorageDriverList.get_storagedrivers():
            if sd.storagerouter_guid == storagerouter.guid:
                model_ports_in_use.extend(sd.ports.values())
                for proxy in sd.alba_proxies:
                    model_ports_in_use.append(proxy.service.ports[0])
        ports = System.get_free_ports(selected_range=port_range,
                                      exclude=model_ports_in_use,
                                      amount=4 + self.sr_installer.requested_proxies,
                                      client=self.sr_installer.root_client)

        vpool = self.vp_installer.vpool
        vrouter_id = '{0}{1}'.format(vpool.name, machine_id)
        storagedriver = StorageDriver()
        storagedriver.name = vrouter_id.replace('_', ' ')
        storagedriver.ports = {'management': ports[0],
                               'xmlrpc': ports[1],
                               'dtl': ports[2],
                               'edge': ports[3]}
        storagedriver.vpool = vpool
        storagedriver.cluster_ip = Configuration.get('/ovs/framework/hosts/{0}/ip'.format(machine_id))
        storagedriver.storage_ip = self.storage_ip
        storagedriver.mountpoint = '/mnt/{0}'.format(vpool.name)
        storagedriver.description = storagedriver.name
        storagedriver.storagerouter = storagerouter
        storagedriver.storagedriver_id = vrouter_id
        storagedriver.save()

        # ALBA Proxies: the ports after the first four are used, one per proxy
        proxy_service_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ALBA_PROXY)
        for proxy_id in xrange(self.sr_installer.requested_proxies):
            service = Service()
            service.storagerouter = storagerouter
            service.ports = [ports[4 + proxy_id]]
            service.name = 'albaproxy_{0}_{1}'.format(vpool.name, proxy_id)
            service.type = proxy_service_type
            service.save()

            alba_proxy = AlbaProxy()
            alba_proxy.service = service
            alba_proxy.storagedriver = storagedriver
            alba_proxy.save()
    self.storagedriver = storagedriver
def __init__(self, *args, **kwargs):
    """
    Initializes the distributed scheduler: the 'celery_beat' mutex, logger,
    persistent client and bookkeeping attributes are set up before the parent
    constructor is invoked.
    """
    self._has_lock = False
    self._lock_name = 'ovs_celery_beat_lock'
    self._entry_name = 'ovs_celery_beat_entries'
    self._schedule_info = {}
    self._mutex = volatile_mutex('celery_beat', 10)
    self._logger = Logger('celery')
    self._persistent = PersistentFactory.get_client()
    super(DistributedScheduler, self).__init__(*args, **kwargs)
    self._logger.debug('DS init')
def new_function(*args, **kw):
    """
    Executes the decorated function in a locked context
    The 'messaging' file mutex is taken first, then the 'messaging' volatile
    mutex; both are released in reverse order
    """
    local_lock = file_mutex('messaging')
    try:
        local_lock.acquire(wait=60)
        cluster_lock = volatile_mutex('messaging')
        try:
            cluster_lock.acquire(wait=60)
            result = f(*args, **kw)
            return result
        finally:
            cluster_lock.release()
    finally:
        local_lock.release()
def new_function(*args, **kw):
    """
    Executes the decorated function in a locked context
    The 'messaging' file mutex is taken first, then the 'messaging' volatile
    mutex; both are released in reverse order
    """
    local_lock = file_mutex('messaging')
    try:
        local_lock.acquire(wait=60)
        cluster_lock = volatile_mutex('messaging')
        try:
            cluster_lock.acquire(wait=60)
            result = f(*args, **kw)
            return result
        finally:
            cluster_lock.release()
    finally:
        local_lock.release()
def new_function(self, request, *args, **kwargs):
    """
    Wrapped function: applies the rate limit per decorated function and per
    client IP (HTTP_X_REAL_IP). A throttled call returns a
    (HttpResponse, payload, 429) tuple instead of executing the wrapped call.
    """
    now = time.time()
    key = 'ovs_api_limit_{0}.{1}_{2}'.format(
        f.__module__, f.__name__,
        request.META['HTTP_X_REAL_IP']
    )
    client = VolatileFactory.get_client()
    mutex = volatile_mutex(key)
    try:
        mutex.acquire()
        rate_info = client.get(key, {'calls': [], 'timeout': None})
        active_timeout = rate_info['timeout']
        if active_timeout is not None:
            if active_timeout > now:
                remaining = active_timeout - now
                logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, remaining))
                payload = {'error_code': 'rate_limit_timeout',
                           'error': 'Rate limit timeout ({0}s remaining)'.format(round(remaining, 2))}
                return HttpResponse, payload, 429
            # The timeout expired
            rate_info['timeout'] = None
        # Only keep calls inside the window, then register this call
        recent_calls = [call for call in rate_info['calls'] if call > (now - per)]
        recent_calls.append(now)
        rate_info['calls'] = recent_calls
        calls = len(recent_calls)
        if calls > amount:
            rate_info['timeout'] = now + timeout
            client.set(key, rate_info)
            logger.warning('Call {0} is being throttled with a wait of {1}'.format(key, timeout))
            payload = {'error_code': 'rate_limit_reached',
                       'error': 'Rate limit reached ({0} in last {1}s)'.format(calls, per)}
            return HttpResponse, payload, 429
        client.set(key, rate_info)
    finally:
        mutex.release()
    return f(self, request, *args, **kwargs)
def ping(storagerouter_guid, timestamp):
    """
    Update a StorageRouter's celery heartbeat
    The heartbeat is only stored when the given timestamp is newer than the stored one
    :param storagerouter_guid: Guid of the StorageRouter to update
    :type storagerouter_guid: str
    :param timestamp: Timestamp to compare to
    :type timestamp: float
    :return: None
    :rtype: NoneType
    """
    mutex_name = 'storagerouter_heartbeat_{0}'.format(storagerouter_guid)
    with volatile_mutex(mutex_name):
        storagerouter = StorageRouter(storagerouter_guid)
        last_celery_beat = storagerouter.heartbeats.get('celery', 0)
        if timestamp > last_celery_beat:
            storagerouter.heartbeats['celery'] = timestamp
            storagerouter.save()
def lock_default_mode(self):
    # type: () -> None
    """
    Lock function racing for default ensure single mode
    Checks all possible task names

    This is a generator that yields exactly once: everything before the yield
    is the acquire phase (validation and registration under the volatile
    mutex), everything after it is the release phase (unregistration).
    NOTE(review): presumably wrapped as a context manager by the caller - confirm.

    :raises: EnsureSingleTaskDiscarded: If the task was discarded
    :raises: EnsureSingleDoCallBack: If the task is required to call the callback function instead of the task function
    """
    # @todo instead of volatile mutex, do asserts within transaction
    # Acquire
    try:
        self.run_hook('before_validation')
        with volatile_mutex(self.persistent_key, wait=5):
            # A registration under any of the task names means a task is already registered
            for task in self.ensure_single_container.task_names:
                key_to_check = self.generate_key_for_task_with_mode(task)
                if self.persistent_client.exists(key_to_check):
                    # Raising errors as potential long functions should not be invoked in the acquire phase
                    if self.async_task or not self.ensure_single_container.callback:
                        self.discard_task()
                    else:
                        self.callback_task()
            self.logger.info(self.message.format('Setting key {0}'.format(self.persistent_key)))
            self._register_task({'task_id': self.task_id})
    finally:
        # The hook runs whether or not validation succeeded
        self.run_hook('after_validation')

    success = True
    self.unittest_set_state_executing()
    self.run_hook('before_execution')
    try:
        # Control is handed to the caller here
        yield
    # Release
    except:
        # Remember the failure so the success log is skipped, then propagate
        success = False
        raise
    finally:
        if success:
            self.unittest_set_state_finished()
            self.logger.info(self.message.format('Task {0} finished successfully'.format(self.ensure_single_container.task_name)))
        # Release
        self._unregister_task(delete=True)
        self.run_hook('after_execution')
def wrapped(*args, **kwargs):
    """
    Run the decorated function under a local (file) or cluster (volatile) lock.

    When the lock is obtained, information about the executing StorageRouter is
    cached under the lock key and the decorated function is executed.
    When the lock is not available, nothing is executed unless a callback was
    given: the callback is then invoked with keyword arguments only, including
    the executor information.

    :return: Result of the decorated function, result of the callback, or None
    :raises ValueError: On an unsupported lock type, or when the executor information could not be fetched within 5 seconds
    :raises TypeError: If the callback expects a result_handler and none was passed
    """
    if lock_type == 'local':
        _mutex = file_mutex(key)
    elif lock_type == 'cluster':
        _mutex = volatile_mutex(key)
    else:
        raise ValueError('Lock type {0} is not supported!'.format(lock_type))
    try:
        _mutex.acquire(wait=0.005)
        local_sr = System.get_my_storagerouter()
        CacheHelper.set(key=key, item={'ip': local_sr.ip, 'hostname': local_sr.name}, expire_time=60)
        return func(*args, **kwargs)
    except (NoFileLockAvailableException, NoVolatileLockAvailableException):
        if callback is None:
            return
        else:
            executor_info = None
            start = time.time()
            while executor_info is None:
                # Calculated guesswork. If a callback function would be expected, the acquire has happened for another executor the volatilekey should be set eventually
                # However by setting it after the acquire, the callback executor and original method executor can race between fetch and set
                # A better implementation would be relying on the fwk ensure_single_decorator as they check for various races themselves
                # This is just a poor mans, temporary implementation
                # Elapsed time is 'now - start'; the reversed subtraction is never positive and would never time out
                if time.time() - start > 5:
                    raise ValueError('Timed out after 5 seconds while fetching the information about the executor.')
                try:
                    executor_info = CacheHelper.get(key=key)
                except Exception:
                    # Best effort: retry until the timeout, without swallowing KeyboardInterrupt/SystemExit
                    pass
            callback_func = callback.__func__ if isinstance(callback, staticmethod) else callback
            argnames = inspect.getargspec(callback_func)[0]
            arguments = list(args)
            kwargs.update({'test_name': func.__name__})
            if executor_info is not None:
                kwargs.update(executor_info)
                if 'result_handler' in argnames:
                    result_handler = kwargs.get('result_handler')
                    # A positional result collector is moved to the keyword arguments
                    for index, arg in enumerate(arguments):
                        if isinstance(arg, HCResults.HCResultCollector):
                            result_handler = arguments.pop(index)
                            break
                    if result_handler is None:
                        raise TypeError('Expected an instance of {0}'.format(HCResults.HCResultCollector))
                    kwargs['result_handler'] = result_handler
            return callback_func(**kwargs)
    finally:
        _mutex.release()
def _register_task(self, registration_data):
    # type: (any) -> list
    """
    Registers the execution of the task
    The registration is appended to the current registrations and stored under the task registration key
    :param registration_data: Dict containing all arguments as key word arguments
    :type registration_data: any
    :return: All registrations
    :rtype: list
    """
    # @todo use transactions instead
    with volatile_mutex(self.persistent_key, wait=5):
        registrations, _ = self.get_task_registrations()
        registrations.append(registration_data)
        self.persistent_client.set(self.task_registration_key, registrations)
    return registrations
def invalidate_dynamics(self, properties=None):
    """
    Remove the cached values of dynamic properties from the volatile store.
    Use with caution, as this action can introduce a short performance hit.
    :param properties: Name(s) of the dynamic properties to invalidate. A single non-list value is treated as a
                       list with one element, None invalidates all dynamic properties
    """
    if not (properties is None or isinstance(properties, list)):
        properties = [properties]
    for dynamic in self._dynamics:
        if properties is not None and dynamic.name not in properties:
            continue
        cache_key = '{0}_{1}'.format(self._key, dynamic.name)
        lock = volatile_mutex(cache_key)
        try:
            # Only dynamics flagged as locked take the mutex before their cache entry is removed
            if dynamic.locked:
                lock.acquire()
            self._volatile.delete(cache_key)
        finally:
            lock.release()
def invalidate_dynamics(self, properties=None):
    """
    Remove the cached values of dynamic properties from the volatile store.
    Use with caution, as this action can introduce a short performance hit.
    :param properties: Name(s) of the dynamic properties to invalidate. A single non-list value is treated as a
                       list with one element, None invalidates all dynamic properties
    """
    if not (properties is None or isinstance(properties, list)):
        properties = [properties]
    for dynamic in self._dynamics:
        if properties is not None and dynamic.name not in properties:
            continue
        cache_key = '{0}_{1}'.format(self._key, dynamic.name)
        lock = volatile_mutex(cache_key)
        try:
            # Only dynamics flagged as locked take the mutex before their cache entry is removed
            if dynamic.locked:
                lock.acquire()
            self._volatile.delete(cache_key)
        finally:
            lock.release()
def get_unused_arakoon_metadata_and_claim(cluster_type, locked=True):
    """
    Find the first unused, externally managed arakoon cluster of the requested type and mark it as in use
    :param cluster_type: Type of arakoon cluster (See ServiceType.ARAKOON_CLUSTER_TYPES)
    :type cluster_type: str
    :param locked: Execute this in a locked context
    :type locked: bool
    :return: Metadata of the claimed arakoon cluster, None when the configuration root does not exist or when no
             matching cluster could be claimed
    :rtype: dict
    """
    cluster_type = cluster_type.upper()
    if cluster_type not in ServiceType.ARAKOON_CLUSTER_TYPES:
        supported_types = ', '.join(ServiceType.ARAKOON_CLUSTER_TYPES)
        raise ValueError('Unsupported arakoon cluster type provided. Please choose from {0}'.format(supported_types))
    if not Configuration.dir_exists(ArakoonInstaller.CONFIG_ROOT):
        return None

    claim_mutex = volatile_mutex('claim_arakoon_metadata', wait=10)
    try:
        if locked is True:
            claim_mutex.acquire()

        for name in Configuration.list(ArakoonInstaller.CONFIG_ROOT):
            cluster_config = ArakoonClusterConfig(cluster_id=name, filesystem=False)
            cluster_config.load_config()
            client = ArakoonInstaller.build_client(cluster_config)
            if not client.exists(ArakoonInstaller.METADATA_KEY):
                continue
            metadata = json.loads(client.get(ArakoonInstaller.METADATA_KEY))
            claimable = (metadata['cluster_type'] == cluster_type and
                         metadata['in_use'] is False and
                         metadata['internal'] is False)
            if not claimable:
                continue
            # Claim the cluster by persisting the updated metadata before handing it to the caller
            metadata['in_use'] = True
            client.set(ArakoonInstaller.METADATA_KEY, json.dumps(metadata, indent=4))
            return metadata
    finally:
        if locked is True:
            claim_mutex.release()
def update_vmachine_name(instance_id, old_name, new_name):
    """
    Rename a vMachine: look it up through the management centers by instance id and store the new name
    :param instance_id: ID for the virtual machine known by management center
    :param old_name: Old name of the virtual machine (used for logging and for the mutex key)
    :param new_name: New name for the virtual machine
    """
    vmachine = None
    for center in MgmtCenterList.get_mgmtcenters():
        mgmt = Factory.get_mgmtcenter(mgmt_center=center)
        try:
            info = mgmt.get_vmachine_device_info(instance_id)
            file_name = info['file_name']
            host_name = info['host_name']
            vpool_name = info['vpool_name']
            router = StorageRouterList.get_by_name(host_name)
            device_name = '{0}/{1}'.format(router.machine_id, file_name)
            vp = VPoolList.get_vpool_by_name(vpool_name)
            # First try with the vPool, then fall back to a lookup without vPool
            vmachine = (VMachineList.get_by_devicename_and_vpool(device_name, vp) or
                        VMachineList.get_by_devicename_and_vpool(device_name, None))
            if vmachine:
                break
        except Exception as ex:
            VMachineController._logger.info('Trying to get mgmt center failed for vmachine {0}. {1}'.format(old_name, ex))
    if not vmachine:
        VMachineController._logger.error('No vmachine found for name {0}'.format(old_name))
        return

    vpool = vmachine.vpool
    if vpool is not None:
        key_suffix = vpool.guid
    else:
        key_suffix = 'none'
    mutex = volatile_mutex('{0}_{1}'.format(old_name, key_suffix))
    try:
        mutex.acquire(wait=5)
        vmachine.name = new_name
        vmachine.save()
    finally:
        mutex.release()
def update_vmachine_name(instance_id, old_name, new_name):
    """
    Rename a vMachine: look it up through the management centers by instance id and store the new name
    :param instance_id: ID for the virtual machine known by management center
    :param old_name: Old name of the virtual machine (used for logging and for the mutex key)
    :param new_name: New name for the virtual machine
    """
    vmachine = None
    for center in MgmtCenterList.get_mgmtcenters():
        mgmt = Factory.get_mgmtcenter(mgmt_center=center)
        try:
            info = mgmt.get_vmachine_device_info(instance_id)
            file_name = info['file_name']
            host_name = info['host_name']
            vpool_name = info['vpool_name']
            router = StorageRouterList.get_by_name(host_name)
            device_name = '{0}/{1}'.format(router.machine_id, file_name)
            vp = VPoolList.get_vpool_by_name(vpool_name)
            # First try with the vPool, then fall back to a lookup without vPool
            vmachine = (VMachineList.get_by_devicename_and_vpool(device_name, vp) or
                        VMachineList.get_by_devicename_and_vpool(device_name, None))
            if vmachine:
                break
        except Exception as ex:
            VMachineController._logger.info('Trying to get mgmt center failed for vmachine {0}. {1}'.format(old_name, ex))
    if not vmachine:
        VMachineController._logger.error('No vmachine found for name {0}'.format(old_name))
        return

    vpool = vmachine.vpool
    if vpool is not None:
        key_suffix = vpool.guid
    else:
        key_suffix = 'none'
    mutex = volatile_mutex('{0}_{1}'.format(old_name, key_suffix))
    try:
        mutex.acquire(wait=5)
        vmachine.name = new_name
        vmachine.save()
    finally:
        mutex.release()
def create_cluster(cluster_name, cluster_type, ip, base_dir, plugins=None, locked=True, internal=True, claim=False):
    """
    Always creates a cluster but marks it's usage according to the internal flag
    When the arakoon service for this cluster is already running and one of its configured nodes has the given IP,
    no new cluster is deployed and only the ports of that node are returned
    :param cluster_name: Name of the cluster
    :type cluster_name: str
    :param cluster_type: Type of the cluster (See ServiceType.ARAKOON_CLUSTER_TYPES)
    :type cluster_type: str
    :param ip: IP address of the first node of the new cluster
    :type ip: str
    :param base_dir: Base directory that should contain the data and tlogs
    :type base_dir: str
    :param plugins: Plugins that should be added to the configuration file
    :type plugins: list
    :param locked: Indicates whether the create should run in a locked context (e.g. to prevent port conflicts)
    :type locked: bool
    :param internal: Is cluster internally managed by OVS
    :type internal: bool
    :param claim: Claim the cluster right away
    :type claim: bool
    :return: Ports used by arakoon cluster (client_port, messaging_port) and, for a newly deployed cluster,
             its metadata
    :rtype: dict
    """
    supported_types = ServiceType.ARAKOON_CLUSTER_TYPES
    if cluster_type not in supported_types:
        raise ValueError('Cluster type {0} is not supported. Please choose from {1}'.format(cluster_type, ', '.join(supported_types)))
    config_dir = '/ovs/arakoon/{0}'.format(cluster_name)
    if EtcdConfiguration.dir_exists(config_dir):
        raise ValueError('An Arakoon cluster with name "{0}" already exists'.format(cluster_name))

    ArakoonInstaller._logger.debug('Creating cluster {0} on {1}'.format(cluster_name, ip))
    base_dir = base_dir.rstrip('/')

    client = SSHClient(ip, username=ArakoonInstaller.SSHCLIENT_USER)
    if ArakoonInstaller.is_running(cluster_name, client):
        ArakoonInstaller._logger.info('Arakoon service running for cluster {0}'.format(cluster_name))
        running_config = ArakoonClusterConfig(cluster_name, plugins)
        running_config.load_config()
        for running_node in running_config.nodes:
            if running_node.ip == ip:
                return {'client_port': running_node.client_port,
                        'messaging_port': running_node.messaging_port}

    node_name = System.get_my_machine_id(client)

    home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
    tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
    ArakoonInstaller.clean_leftover_arakoon_data(ip, [home_dir, tlog_dir])

    port_mutex = None
    try:
        if locked is True:
            # Imported locally, as in the original implementation
            from ovs.extensions.generic.volatilemutex import volatile_mutex
            port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(ip))
            port_mutex.acquire(wait=60)

        ports = ArakoonInstaller._get_free_ports(client)
        new_config = ArakoonClusterConfig(cluster_name, plugins)
        new_config.nodes.append(ArakoonNodeConfig(name=node_name,
                                                  ip=ip,
                                                  client_port=ports[0],
                                                  messaging_port=ports[1],
                                                  log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                                  crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                                  home=home_dir,
                                                  tlog_dir=tlog_dir))
        ArakoonInstaller._deploy(new_config)

        metadata = ArakoonClusterMetadata(cluster_id=cluster_name)
        metadata.internal = internal
        metadata.cluster_type = cluster_type.upper()
        metadata.write()
        if claim is True:
            metadata.claim()
    finally:
        if port_mutex is not None:
            port_mutex.release()

    ArakoonInstaller._logger.debug('Creating cluster {0} on {1} completed'.format(cluster_name, ip))
    return {'metadata': metadata,
            'client_port': ports[0],
            'messaging_port': ports[1]}
def __init__(self, guid=None, data=None, datastore_wins=False, volatile=False, _hook=None):
    """
    Loads an object with a given guid. If no guid is given, a new object is generated with a new guid.
    * guid: The guid indicating which object should be loaded
    * data: Optional initial data. For an existing object a deep copy of it is used instead of loading the data
            from the volatile/persistent store. Properties present in it are also set through setattr at the end
    * datastoreWins: Optional boolean indicating save conflict resolve management.
    ** True: when saving, external modified fields will not be saved
    ** False: when saving, all changed data will be saved, regardless of external updates
    ** None: in case changed field were also changed externally, an error will be raised
    * volatile: Stored as-is on the public 'volatile' field
    * _hook: Optional callable, invoked without arguments after all properties were added and before the
             object is (re-)cached
    Raises an ObjectNotFoundException when an existing object cannot be found in the persistent store
    """

    # Initialize super class
    super(DataObject, self).__init__()

    # Initialize internal fields
    self._frozen = False
    self._datastore_wins = datastore_wins
    self._guid = None  # Guid identifier of the object
    self._original = {}  # Original data copy
    self._metadata = {}  # Some metadata, mainly used for unit testing
    self._data = {}  # Internal data storage
    self._objects = {}  # Internal objects storage

    # Initialize public fields
    self.dirty = False
    self.volatile = volatile

    # Worker fields/objects
    self._classname = self.__class__.__name__.lower()

    # Rebuild _relation types
    # A relation's foreign type is replaced when the hybrid structure maps its identifier to another identifier
    hybrid_structure = HybridRunner.get_hybrids()
    for relation in self._relations:
        if relation.foreign_type is not None:
            identifier = Descriptor(relation.foreign_type).descriptor['identifier']
            if identifier in hybrid_structure and identifier != hybrid_structure[identifier]['identifier']:
                relation.foreign_type = Descriptor().load(hybrid_structure[identifier]).get_object()

    # Init guid
    self._new = False
    if guid is None:
        self._guid = str(uuid.uuid4())
        self._new = True
    else:
        self._guid = str(guid)

    # Build base keys
    self._key = '{0}_{1}_{2}'.format(DataObject.NAMESPACE, self._classname, self._guid)

    # Worker mutexes
    self._mutex_version = volatile_mutex('ovs_dataversion_{0}_{1}'.format(self._classname, self._guid))

    # Load data from cache or persistent backend where appropriate
    # self._metadata['cache'] ends up as: None (new object or data passed in), True (loaded from the volatile
    # store) or False (loaded from the persistent store)
    self._volatile = VolatileFactory.get_client()
    self._persistent = PersistentFactory.get_client()
    self._metadata['cache'] = None
    if self._new:
        self._data = {}
    else:
        if data is not None:
            self._data = copy.deepcopy(data)
            self._metadata['cache'] = None
        else:
            self._data = self._volatile.get(self._key)
            if self._data is None:
                self._metadata['cache'] = False
                try:
                    self._data = self._persistent.get(self._key)
                except KeyNotFoundException:
                    raise ObjectNotFoundException('{0} with guid \'{1}\' could not be found'.format(
                        self.__class__.__name__, self._guid
                    ))
            else:
                self._metadata['cache'] = True

    # Set default values on new fields
    for prop in self._properties:
        if prop.name not in self._data:
            self._data[prop.name] = prop.default
        self._add_property(prop)

    # Load relations
    for relation in self._relations:
        if relation.name not in self._data:
            # A relation without foreign type points to the object's own class
            if relation.foreign_type is None:
                cls = self.__class__
            else:
                cls = relation.foreign_type
            self._data[relation.name] = Descriptor(cls).descriptor
        self._add_relation_property(relation)

    # Add wrapped properties
    for dynamic in self._dynamics:
        self._add_dynamic_property(dynamic)

    # Load foreign keys
    relations = RelationMapper.load_foreign_relations(self.__class__)
    if relations is not None:
        for key, info in relations.iteritems():
            self._objects[key] = {'info': info,
                                  'data': None}
            self._add_list_property(key, info['list'])

    if _hook is not None and hasattr(_hook, '__call__'):
        _hook()

    if not self._new:
        # Re-cache the object, if required
        if self._metadata['cache'] is False:
            # The data wasn't loaded from the cache, so caching is required now
            try:
                self._mutex_version.acquire(30)
                # Only cache when the loaded version still equals the version in the persistent store
                this_version = self._data['_version']
                store_version = self._persistent.get(self._key)['_version']
                if this_version == store_version:
                    self._volatile.set(self._key, self._data)
            except KeyNotFoundException:
                raise ObjectNotFoundException('{0} with guid \'{1}\' could not be found'.format(
                    self.__class__.__name__, self._guid
                ))
            finally:
                self._mutex_version.release()

    # Freeze property creation
    self._frozen = True

    # Optionally, initialize some fields
    if data is not None:
        for prop in self._properties:
            if prop.name in data:
                setattr(self, prop.name, data[prop.name])

    # Store original data
    self._original = copy.deepcopy(self._data)
def new_function(self, *args, **kwargs):
    """
    Wrapped function
    The values mode, task_name, extra_task_names, callback, global_timeout, logger and the wrapped function f are
    taken from the enclosing scope. Depending on the mode:
    * DEFAULT: When the persistent key of the task (or of one of the extra task names) exists, the execution is
               discarded (async task or no callback) or the callback is executed. Otherwise the key is set, the
               wrapped function is executed and the key is deleted afterwards
    * DEDUPED: The call is registered together with its parameters. When a 2nd registration with identical
               parameters is found, the call is discarded (async), the callback is executed, or the call waits
               until that registration disappears. Otherwise the call is queued and executed as soon as it is
               the only registration with these parameters
    * CHAINED: Like DEDUPED, but only the registrations after the 1st one are checked for identical parameters and
               the call is executed as soon as its own registration is the 1st in the list
    For DEDUPED and CHAINED the keyword argument 'ensure_single_timeout' is removed from kwargs and used as timeout
    :param self: With bind=True, the celery task result itself is passed in
    :param args: Arguments without default values
    :param kwargs: Arguments with default values
    :return: Output of the wrapped function, output of the callback, or None when the execution was discarded or
             became obsolete while waiting
    :raises RuntimeError: When the task is not bound (no 'request' attribute)
    :raises ValueError: When the mode is unsupported or extra task names are specified for DEDUPED or CHAINED mode
    :raises EnsureSingleTimeoutReached: When the task could not be started within the timeout
    """
    def log_message(message, level='info'):
        """
        Log a message with some additional information
        :param message: Message to log
        :param level: Log level
        :return: None
        """
        if level not in ('info', 'warning', 'debug', 'error', 'exception'):
            raise ValueError('Unsupported log level "{0}" specified'.format(level))
        if unittest_mode is False:
            complete_message = 'Ensure single {0} mode - ID {1} - {2}'.format(mode, now, message)
        else:
            complete_message = 'Ensure single {0} mode - ID {1} - {2} - {3}'.format(mode, now, threading.current_thread().getName(), message)
        getattr(logger, level)(complete_message)

    def update_value(key, append, value_to_update=None):
        """
        Store the specified value in the PersistentFactory
        :param key: Key to store the value for
        :param append: If True, the specified value will be appended else element at index 0 will be popped
        :param value_to_update: Value to append to the list or remove from the list
        :return: Updated value
        """
        with volatile_mutex(name=key, wait=5):
            vals = list(persistent_client.get_multi([key], must_exist=False))
            if vals[0] is not None:
                val = vals[0]
                if append is True and value_to_update is not None:
                    val['values'].append(value_to_update)
                elif append is False and value_to_update is not None:
                    # Remove the first entry equal to the specified value
                    for value_item in val['values']:
                        if value_item == value_to_update:
                            val['values'].remove(value_item)
                            break
                elif append is False and len(val['values']) > 0:
                    val['values'].pop(0)
            else:
                log_message('Setting initial value for key {0}'.format(key))
                val = {'mode': mode,
                       'values': []}
            persistent_client.set(key, val)
        return val

    if not hasattr(self, 'request'):
        raise RuntimeError('The decorator ensure_single can only be applied to bound tasks (with bind=True argument)')

    # Identifier of this call: timestamp in seconds followed by 10 random alphanumeric characters
    now = '{0}_{1}'.format(int(time.time()), ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(10)))
    task_id = self.request.id
    async_task = task_id is not None  # Async tasks have an ID, inline executed tasks have None as ID
    task_names = [task_name] if extra_task_names is None else [task_name] + extra_task_names
    thread_name = threading.current_thread().getName()
    unittest_mode = os.environ.get('RUNNING_UNITTESTS') == 'True'
    persistent_key = '{0}_{1}'.format(ENSURE_SINGLE_KEY, task_name)
    persistent_client = PersistentFactory.get_client()

    if mode == 'DEFAULT':
        with volatile_mutex(persistent_key, wait=5):
            for task in task_names:
                key_to_check = '{0}_{1}'.format(ENSURE_SINGLE_KEY, task)
                if persistent_client.exists(key_to_check):
                    if async_task is True or callback is None:
                        log_message('Execution of task {0} discarded'.format(task_name))
                        if unittest_mode is True:
                            Decorators.unittest_thread_info_by_name[thread_name] = ('DISCARDED', None)
                        return None
                    else:
                        log_message('Execution of task {0} in progress, executing callback function'.format(task_name))
                        if unittest_mode is True:
                            Decorators.unittest_thread_info_by_name[thread_name] = ('CALLBACK', None)
                        return callback(*args, **kwargs)

            log_message('Setting key {0}'.format(persistent_key))
            persistent_client.set(persistent_key, {'mode': mode,
                                                   'values': [{'task_id': task_id}]})

        try:
            if unittest_mode is True:
                Decorators.unittest_thread_info_by_name[thread_name] = ('EXECUTING', None)
            output = f(*args, **kwargs)
            if unittest_mode is True:
                Decorators.unittest_thread_info_by_name[thread_name] = ('FINISHED', None)
                Decorators.unittest_thread_info_by_state['FINISHED'].append(thread_name)
            log_message('Task {0} finished successfully'.format(task_name))
            return output
        finally:
            # The key is removed regardless of whether the wrapped function succeeded
            with volatile_mutex(persistent_key, wait=5):
                log_message('Deleting key {0}'.format(persistent_key))
                persistent_client.delete(persistent_key, must_exist=False)

    elif mode == 'DEDUPED':
        if extra_task_names is not None:
            log_message('Extra tasks are not allowed in this mode', level='error')
            raise ValueError('Ensure single {0} mode - ID {1} - Extra tasks are not allowed in this mode'.format(mode, now))

        # Update kwargs with args
        sleep = 1 if unittest_mode is False else 0.1
        timeout = kwargs.pop('ensure_single_timeout', 10 if unittest_mode is True else global_timeout)
        function_info = inspect.getargspec(f)
        kwargs_dict = {}
        for index, arg in enumerate(args):
            kwargs_dict[function_info.args[index]] = arg
        kwargs_dict.update(kwargs)
        params_info = 'with params {0}'.format(kwargs_dict) if kwargs_dict else 'with default params'

        # Set the key in arakoon if non-existent
        value = update_value(key=persistent_key,
                             append=True)

        # Validate whether another job with same params is being executed
        job_counter = 0
        for item in value['values']:
            if item['kwargs'] == kwargs_dict:
                job_counter += 1
                if job_counter == 2:  # 1st job with same params is being executed, 2nd is scheduled for execution ==> Discard current
                    if async_task is True:  # Not waiting for other jobs to finish since asynchronously
                        log_message('Execution of task {0} {1} discarded because of identical parameters'.format(task_name, params_info))
                        if unittest_mode is True:
                            Decorators.unittest_thread_info_by_name[thread_name] = ('DISCARDED', None)
                        return None

                    # If executed inline (sync), execute callback if any provided
                    if callback is not None:
                        log_message('Execution of task {0} {1} in progress, executing callback function'.format(task_name, params_info))
                        if unittest_mode is True:
                            Decorators.unittest_thread_info_by_name[thread_name] = ('CALLBACK', None)
                        return callback(*args, **kwargs)

                    # Let's wait for 2nd job in queue to have finished if no callback provided
                    slept = 0
                    while slept < timeout:
                        log_message('Task {0} {1} is waiting for similar tasks to finish - ({2})'.format(task_name, params_info, slept + sleep))
                        values = list(persistent_client.get_multi([persistent_key], must_exist=False))
                        if values[0] is None:
                            if unittest_mode is True:
                                Decorators.unittest_thread_info_by_name[thread_name] = ('WAITED', None)
                            return None  # All pending jobs have been deleted in the meantime, no need to wait
                        if item['timestamp'] not in [value['timestamp'] for value in values[0]['values']]:
                            if unittest_mode is True:
                                Decorators.unittest_thread_info_by_name[thread_name] = ('WAITED', None)
                            return None  # Similar tasks have been executed, so sync task currently waiting can return without having been executed
                        slept += sleep
                        time.sleep(sleep)
                        if slept >= timeout:
                            log_message('Task {0} {1} waited {2}s for similar tasks to finish, but timeout was reached'.format(task_name, params_info, slept), level='error')
                            if unittest_mode is True:
                                Decorators.unittest_thread_info_by_name[thread_name] = ('EXCEPTION', 'Could not start within timeout of {0}s while waiting for other tasks'.format(timeout))
                            raise EnsureSingleTimeoutReached('Ensure single {0} mode - ID {1} - Task {2} could not be started within timeout of {3}s'.format(mode, now, task_name, timeout))

                        if unittest_mode is True:
                            if thread_name not in Decorators.unittest_thread_info_by_state['WAITING']:
                                Decorators.unittest_thread_info_by_state['WAITING'].append(thread_name)

        log_message('New task {0} {1} scheduled for execution'.format(task_name, params_info))
        update_value(key=persistent_key,
                     append=True,
                     value_to_update={'kwargs': kwargs_dict,
                                      'task_id': task_id,
                                      'timestamp': now})

        # Poll the arakoon to see whether this call is the only in list, if so --> execute, else wait
        slept = 0
        while slept < timeout:
            values = list(persistent_client.get_multi([persistent_key], must_exist=False))
            if values[0] is not None:
                queued_jobs = [v for v in values[0]['values'] if v['kwargs'] == kwargs_dict]
                if len(queued_jobs) != 1:
                    if unittest_mode is True:
                        Decorators.unittest_thread_info_by_name[thread_name] = ('WAITING', None)
                        if thread_name not in Decorators.unittest_thread_info_by_state['WAITING']:
                            Decorators.unittest_thread_info_by_state['WAITING'].append(thread_name)
                else:
                    try:
                        if slept != 0:
                            log_message('Task {0} {1} had to wait {2} seconds before being able to start'.format(task_name, params_info, slept))
                        if unittest_mode is True:
                            Decorators.unittest_thread_info_by_name[thread_name] = ('EXECUTING', None)
                        output = f(*args, **kwargs)
                        if unittest_mode is True:
                            Decorators.unittest_thread_info_by_name[thread_name] = ('FINISHED', None)
                            Decorators.unittest_thread_info_by_state['FINISHED'].append(thread_name)
                        log_message('Task {0} finished successfully'.format(task_name))
                        return output
                    finally:
                        # Remove the own registration, regardless of the outcome of the wrapped function
                        update_value(key=persistent_key,
                                     append=False,
                                     value_to_update=queued_jobs[0])
            slept += sleep
            time.sleep(sleep)
            if slept >= timeout:
                update_value(key=persistent_key,
                             append=False,
                             value_to_update={'kwargs': kwargs_dict,
                                              'task_id': task_id,
                                              'timestamp': now})
                log_message('Could not start task {0} {1}, within expected time ({2}s). Removed it from queue'.format(task_name, params_info, timeout), level='error')
                if unittest_mode is True:
                    Decorators.unittest_thread_info_by_name[thread_name] = ('EXCEPTION', 'Could not start within timeout of {0}s while queued'.format(timeout))
                raise EnsureSingleTimeoutReached('Ensure single {0} mode - ID {1} - Task {2} could not be started within timeout of {3}s'.format(mode, now, task_name, timeout))

    elif mode == 'CHAINED':
        if extra_task_names is not None:
            log_message('Extra tasks are not allowed in this mode', level='error')
            raise ValueError('Ensure single {0} mode - ID {1} - Extra tasks are not allowed in this mode'.format(mode, now))

        # Update kwargs with args
        sleep = 1 if unittest_mode is False else 0.1
        timeout = kwargs.pop('ensure_single_timeout', 10 if unittest_mode is True else global_timeout)
        function_info = inspect.getargspec(f)
        kwargs_dict = {}
        for index, arg in enumerate(args):
            kwargs_dict[function_info.args[index]] = arg
        kwargs_dict.update(kwargs)
        params_info = 'with params {0}'.format(kwargs_dict) if kwargs_dict else 'with default params'

        # Set the key in arakoon if non-existent
        value = update_value(key=persistent_key,
                             append=True)

        # Validate whether another job with same params is being executed, skip if so
        for item in value['values'][1:]:  # 1st element is processing job, we check all other queued jobs for identical params
            if item['kwargs'] == kwargs_dict:
                if async_task is True:  # Not waiting for other jobs to finish since asynchronously
                    log_message('Execution of task {0} {1} discarded because of identical parameters'.format(task_name, params_info))
                    if unittest_mode is True:
                        Decorators.unittest_thread_info_by_name[thread_name] = ('DISCARDED', None)
                    return None

                # If executed inline (sync), execute callback if any provided
                if callback is not None:
                    log_message('Execution of task {0} {1} in progress, executing callback function'.format(task_name, params_info))
                    if unittest_mode is True:
                        Decorators.unittest_thread_info_by_name[thread_name] = ('CALLBACK', None)
                    return callback(*args, **kwargs)

                # Let's wait for 2nd job in queue to have finished if no callback provided
                slept = 0
                while slept < timeout:
                    log_message('Task {0} {1} is waiting for similar tasks to finish - ({2})'.format(task_name, params_info, slept + sleep))
                    values = list(persistent_client.get_multi([persistent_key], must_exist=False))
                    if values[0] is None:
                        if unittest_mode is True:
                            Decorators.unittest_thread_info_by_name[thread_name] = ('WAITED', None)
                        return None  # All pending jobs have been deleted in the meantime, no need to wait
                    if item['timestamp'] not in [value['timestamp'] for value in values[0]['values']]:
                        if unittest_mode is True:
                            Decorators.unittest_thread_info_by_name[thread_name] = ('WAITED', None)
                        return None  # Similar tasks have been executed, so sync task currently waiting can return without having been executed
                    slept += sleep
                    time.sleep(sleep)
                    if slept >= timeout:
                        log_message('Task {0} {1} waited {2}s for similar tasks to finish, but timeout was reached'.format(task_name, params_info, slept), level='error')
                        if unittest_mode is True:
                            Decorators.unittest_thread_info_by_name[thread_name] = ('EXCEPTION', 'Could not start within timeout of {0}s while waiting for other tasks'.format(timeout))
                        raise EnsureSingleTimeoutReached('Ensure single {0} mode - ID {1} - Task {2} could not be started within timeout of {3}s'.format(mode, now, task_name, timeout))

                    if unittest_mode is True:
                        if thread_name not in Decorators.unittest_thread_info_by_state['WAITING']:
                            Decorators.unittest_thread_info_by_state['WAITING'].append(thread_name)

        log_message('New task {0} {1} scheduled for execution'.format(task_name, params_info))
        update_value(key=persistent_key,
                     append=True,
                     value_to_update={'kwargs': kwargs_dict,
                                      'task_id': task_id,
                                      'timestamp': now})

        # Poll the arakoon to see whether this call is the first in list, if so --> execute, else wait
        first_element = None
        slept = 0
        while slept < timeout:
            values = list(persistent_client.get_multi([persistent_key], must_exist=False))
            if values[0] is not None:
                value = values[0]
                first_element = value['values'][0]['timestamp'] if len(value['values']) > 0 else None

            if first_element == now:
                try:
                    if slept > 0:
                        log_message('Task {0} {1} had to wait {2} seconds before being able to start'.format(task_name, params_info, slept))
                    if unittest_mode is True:
                        Decorators.unittest_thread_info_by_name[thread_name] = ('EXECUTING', None)
                    output = f(*args, **kwargs)
                    if unittest_mode is True:
                        Decorators.unittest_thread_info_by_name[thread_name] = ('FINISHED', None)
                        Decorators.unittest_thread_info_by_state['FINISHED'].append(thread_name)
                    log_message('Task {0} finished successfully'.format(task_name))
                    return output
                finally:
                    # Pop the 1st element (the own registration), regardless of the outcome of the wrapped function
                    update_value(key=persistent_key,
                                 append=False)
            else:
                if unittest_mode is True:
                    if thread_name not in Decorators.unittest_thread_info_by_state['WAITING']:
                        Decorators.unittest_thread_info_by_name[thread_name] = ('WAITING', None)
                        Decorators.unittest_thread_info_by_state['WAITING'].append(thread_name)

            slept += sleep
            time.sleep(sleep)
            if slept >= timeout:
                update_value(key=persistent_key,
                             append=False,
                             value_to_update={'kwargs': kwargs_dict,
                                              'task_id': task_id,
                                              'timestamp': now})
                log_message('Could not start task {0} {1}, within expected time ({2}s). Removed it from queue'.format(task_name, params_info, timeout), level='error')
                if unittest_mode is True:
                    Decorators.unittest_thread_info_by_name[thread_name] = ('EXCEPTION', 'Could not start within timeout of {0}s while queued'.format(timeout))
                raise EnsureSingleTimeoutReached('Ensure single {0} mode - ID {1} - Task {2} could not be started within timeout of {3}s'.format(mode, now, task_name, timeout))
    else:
        raise ValueError('Unsupported mode "{0}" provided'.format(mode))
def __init__(self, guid=None, data=None, datastore_wins=False, volatile=False, _hook=None):
    """
    Loads an object with a given guid. If no guid is given, a new object is generated with a new guid.
    * guid: The guid indicating which object should be loaded
    * data: Optional initial data. For an existing object a deep copy of it is used instead of loading the data
            from the volatile/persistent store. Properties present in it are also set through setattr at the end
    * datastoreWins: Optional boolean indicating save conflict resolve management.
    ** True: when saving, external modified fields will not be saved
    ** False: when saving, all changed data will be saved, regardless of external updates
    ** None: in case changed field were also changed externally, an error will be raised
    * volatile: Stored as-is on the public 'volatile' field
    * _hook: Optional callable, invoked without arguments after all properties were added and before the
             object is (re-)cached
    Raises an ObjectNotFoundException when an existing object cannot be found in the persistent store
    """

    # Initialize super class
    super(DataObject, self).__init__()

    # Initialize internal fields
    self._frozen = False
    self._datastore_wins = datastore_wins
    self._guid = None  # Guid identifier of the object
    self._original = {}  # Original data copy
    self._metadata = {}  # Some metadata, mainly used for unit testing
    self._data = {}  # Internal data storage
    self._objects = {}  # Internal objects storage

    # Initialize public fields
    self.dirty = False
    self.volatile = volatile

    # Worker fields/objects
    self._classname = self.__class__.__name__.lower()

    # Rebuild _relation types
    # A relation's foreign type is replaced when the hybrid structure maps its identifier to another identifier
    hybrid_structure = HybridRunner.get_hybrids()
    for relation in self._relations:
        if relation.foreign_type is not None:
            identifier = Descriptor(relation.foreign_type).descriptor['identifier']
            if identifier in hybrid_structure and identifier != hybrid_structure[identifier]['identifier']:
                relation.foreign_type = Descriptor().load(hybrid_structure[identifier]).get_object()

    # Init guid
    self._new = False
    if guid is None:
        self._guid = str(uuid.uuid4())
        self._new = True
    else:
        self._guid = str(guid)

    # Build base keys
    self._key = '{0}_{1}_{2}'.format(DataObject.NAMESPACE, self._classname, self._guid)

    # Worker mutexes
    self._mutex_version = volatile_mutex('ovs_dataversion_{0}_{1}'.format(self._classname, self._guid))

    # Load data from cache or persistent backend where appropriate
    # self._metadata['cache'] ends up as: None (new object or data passed in), True (loaded from the volatile
    # store) or False (loaded from the persistent store)
    self._volatile = VolatileFactory.get_client()
    self._persistent = PersistentFactory.get_client()
    self._metadata['cache'] = None
    if self._new:
        self._data = {}
    else:
        if data is not None:
            self._data = copy.deepcopy(data)
            self._metadata['cache'] = None
        else:
            self._data = self._volatile.get(self._key)
            if self._data is None:
                self._metadata['cache'] = False
                try:
                    self._data = self._persistent.get(self._key)
                except KeyNotFoundException:
                    raise ObjectNotFoundException('{0} with guid \'{1}\' could not be found'.format(
                        self.__class__.__name__, self._guid
                    ))
            else:
                self._metadata['cache'] = True

    # Set default values on new fields
    for prop in self._properties:
        if prop.name not in self._data:
            self._data[prop.name] = prop.default
        self._add_property(prop)

    # Load relations
    for relation in self._relations:
        if relation.name not in self._data:
            # A relation without foreign type points to the object's own class
            if relation.foreign_type is None:
                cls = self.__class__
            else:
                cls = relation.foreign_type
            self._data[relation.name] = Descriptor(cls).descriptor
        self._add_relation_property(relation)

    # Add wrapped properties
    for dynamic in self._dynamics:
        self._add_dynamic_property(dynamic)

    # Load foreign keys
    relations = RelationMapper.load_foreign_relations(self.__class__)
    if relations is not None:
        for key, info in relations.iteritems():
            self._objects[key] = {'info': info,
                                  'data': None}
            self._add_list_property(key, info['list'])

    if _hook is not None and hasattr(_hook, '__call__'):
        _hook()

    if not self._new:
        # Re-cache the object, if required
        if self._metadata['cache'] is False:
            # The data wasn't loaded from the cache, so caching is required now
            try:
                self._mutex_version.acquire(30)
                # Only cache when the loaded version still equals the version in the persistent store
                this_version = self._data['_version']
                store_version = self._persistent.get(self._key)['_version']
                if this_version == store_version:
                    self._volatile.set(self._key, self._data)
            except KeyNotFoundException:
                raise ObjectNotFoundException('{0} with guid \'{1}\' could not be found'.format(
                    self.__class__.__name__, self._guid
                ))
            finally:
                self._mutex_version.release()

    # Freeze property creation
    self._frozen = True

    # Optionally, initialize some fields
    if data is not None:
        for prop in self._properties:
            if prop.name in data:
                setattr(self, prop.name, data[prop.name])

    # Store original data
    self._original = copy.deepcopy(self._data)
def clone(machineguid, timestamp, name):
    """
    Clone a vMachine, using the disk snapshots that were taken at a given timestamp

    The new machine is modelled first, then every vDisk of the source machine is cloned (sorted on
    its 'order' property) and finally the hypervisor is asked to clone the VM with the cloned disks.
    When cloning the disks or the VM fails, the newly modelled machine is deleted and the error is re-raised

    :param machineguid: Guid of the machine to clone
    :param timestamp: Timestamp of the disk snapshots to use for the clone
    :param name: Name for the new machine
    :return: Guid of the newly created machine
    :raises RuntimeError: When the timestamp does not match any snapshot of the machine
    """
    source = VMachine(machineguid)
    timestamp = str(timestamp)
    if not any(timestamp == snap['timestamp'] for snap in source.snapshots):
        raise RuntimeError('Invalid timestamp provided, not a valid snapshot of this vmachine.')

    # The vPool is only looked up for VMware, the StorageRouter for every hypervisor type
    vpool = None
    if source.pmachine is not None and source.pmachine.hvtype == 'VMWARE':
        vpool = next((vdisk.vpool for vdisk in source.vdisks if vdisk.vpool is not None), None)
    storagerouter = next((StorageRouter(vdisk.storagerouter_guid)
                          for vdisk in source.vdisks if vdisk.storagerouter_guid), None)

    hypervisor = Factory.get(source.pmachine)
    machine_id = '' if storagerouter is None else storagerouter.machine_id
    vm_path = hypervisor.get_vmachine_path(name, machine_id)
    # mutex in sync_with_hypervisor uses "None" for KVM hvtype
    mutex_name = '{0}_{1}'.format(hypervisor.clean_vmachine_filename(vm_path),
                                  'none' if vpool is None else vpool.guid)
    mutex = volatile_mutex(mutex_name)

    # Map every disk guid on the snapshot guid that belongs to the requested timestamp
    snapshot_per_disk = {}
    for snapshot in source.snapshots:
        if snapshot['timestamp'] != timestamp:
            continue
        for disk_guid, snapshot_guid in snapshot['snapshots'].iteritems():
            snapshot_per_disk[disk_guid] = snapshot_guid

    # Model the new machine in a locked context
    try:
        mutex.acquire(wait=120)
        new_machine = VMachine()
        new_machine.copy(source)
        new_machine.name = name
        new_machine.devicename = hypervisor.clean_vmachine_filename(vm_path)
        new_machine.pmachine = source.pmachine
        new_machine.save()
    finally:
        mutex.release()

    def _rollback(template, error):
        """
        Log the failure and delete the machine that was modelled for this clone
        """
        VMachineController._logger.error(template.format(error))
        VMachineController.delete(machineguid=new_machine.guid)

    new_disk_guids = []
    vm_disks = []
    mountpoint = None
    ordered_disks = sorted(source.vdisks, key=lambda disk: disk.order)
    try:
        for vdisk in ordered_disks:
            # Templates carry their own snapshot, regular machines use the one of the requested timestamp
            if source.is_vtemplate and vdisk.templatesnapshot:
                snapshotid = vdisk.templatesnapshot
            else:
                snapshotid = snapshot_per_disk[vdisk.guid]
            prefix = '%s-clone' % vdisk.name
            result = VDiskController.clone(diskguid=vdisk.guid,
                                           snapshotid=snapshotid,
                                           devicename=prefix,
                                           pmachineguid=new_machine.pmachine_guid,
                                           machinename=new_machine.name,
                                           machineguid=new_machine.guid)
            new_disk_guids.append(result['diskguid'])
            mountpoint = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id).mountpoint
            vm_disks.append(result)
    except Exception as ex:
        _rollback('Failed to clone disks. {0}', ex)
        raise

    try:
        result = hypervisor.clone_vm(source.hypervisor_id, name, vm_disks, mountpoint)
    except Exception as ex:
        _rollback('Failed to clone vm. {0}', ex)
        raise

    # Store the identifier handed out by the hypervisor, again in a locked context
    try:
        mutex.acquire(wait=120)
        new_machine.hypervisor_id = result
        new_machine.save()
    finally:
        mutex.release()
    return new_machine.guid
def new_function(*args, **kwargs):
    """
    Wrapped function: executes the decorated function according to the configured 'ensure single' mode

    The mode, task names, global timeout, logger and decorated function are taken from the enclosing scope.
    Supported modes:
      * DEFAULT: the call is discarded when a key exists for the task or for one of the extra tasks
      * DEDUPED: the call is discarded when a key exists for one of the extra tasks or when 2 jobs with
                 identical parameters are already registered; otherwise it waits until it is the only
                 registered job with these parameters
      * CHAINED: the call is discarded when a queued job with identical parameters exists; otherwise it
                 waits until its own entry is the first one in the list
    The optional keyword argument 'ensure_single_timeout' is removed from kwargs and overrides the global
    timeout (DEDUPED and CHAINED mode only)

    :param args: Arguments without default values
    :param kwargs: Arguments with default values
    :return: Output of the decorated function, None when the execution was discarded
    :raises ValueError: Unsupported mode, or extra task names specified in CHAINED mode
    :raises EnsureSingleTimeoutReached: The task could not be started within the timeout
    """
    def log_message(message, level='info'):
        """
        Log a message with some additional information
        :param message: Message to log
        :param level: Log level
        :return: None
        """
        if level not in ('info', 'warning', 'debug', 'error'):
            raise ValueError('Unsupported log level "{0}" specified'.format(level))
        complete_message = 'Ensure single {0} mode - ID {1} - {2}'.format(mode, now, message)
        getattr(logger, level)(complete_message)

    def update_value(key, append, value_to_update=None):
        """
        Store the specified value in the PersistentFactory
        :param key: Key to store the value for
        :param append: If True, the specified value will be appended else element at index 0 will be popped
        :param value_to_update: Value to append to the list or remove from the list
        :return: Updated value
        """
        with volatile_mutex(name=key, wait=5):
            if persistent_client.exists(key):
                val = persistent_client.get(key)
                if append is True and value_to_update is not None:
                    val['values'].append(value_to_update)
                elif append is False and value_to_update is not None:
                    # Remove the first matching entry only
                    for value_item in val['values']:
                        if value_item == value_to_update:
                            val['values'].remove(value_item)
                            break
                elif append is False and len(val['values']) > 0:
                    val['values'].pop(0)
            else:
                log_message('Setting initial value for key {0}'.format(key))
                val = {'mode': mode, 'values': []}
            persistent_client.set(key, val)
        return val

    # Unique identifier of this call: <epoch>_<10 random characters>
    now = '{0}_{1}'.format(int(time.time()), ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(10)))
    task_names = [task_name] if extra_task_names is None else [task_name] + extra_task_names
    persistent_key = '{0}_{1}'.format(ENSURE_SINGLE_KEY, task_name)
    persistent_client = PersistentFactory.get_client()

    if mode == 'DEFAULT':
        with volatile_mutex(persistent_key, wait=5):
            for task in task_names:
                key_to_check = '{0}_{1}'.format(ENSURE_SINGLE_KEY, task)
                if persistent_client.exists(key_to_check):
                    log_message('Execution of task {0} discarded'.format(task_name))
                    return None
            log_message('Setting key {0}'.format(persistent_key))
            persistent_client.set(persistent_key, {'mode': mode})

        try:
            output = function(*args, **kwargs)
            log_message('Task {0} finished successfully'.format(task_name))
            return output
        finally:
            with volatile_mutex(persistent_key, wait=5):
                if persistent_client.exists(persistent_key):
                    log_message('Deleting key {0}'.format(persistent_key))
                    persistent_client.delete(persistent_key)

    elif mode == 'DEDUPED':
        with volatile_mutex(persistent_key, wait=5):
            if extra_task_names is not None:
                for task in extra_task_names:
                    key_to_check = '{0}_{1}'.format(ENSURE_SINGLE_KEY, task)
                    if persistent_client.exists(key_to_check):
                        log_message('Execution of task {0} discarded'.format(task_name))
                        return None
            log_message('Setting key {0}'.format(persistent_key))

        # Update kwargs with args
        timeout = kwargs.pop('ensure_single_timeout', global_timeout)
        function_info = inspect.getargspec(function)
        kwargs_dict = {}
        for index, arg in enumerate(args):
            kwargs_dict[function_info.args[index]] = arg
        kwargs_dict.update(kwargs)
        params_info = 'with params {0}'.format(kwargs_dict) if kwargs_dict else 'with default params'

        # Set the key in arakoon if non-existent
        value = update_value(key=persistent_key, append=True)

        # Validate whether another job with same params is being executed
        job_counter = 0
        for item in value['values']:
            if item['kwargs'] == kwargs_dict:
                job_counter += 1
                if job_counter == 2:  # 1st job with same params is being executed, 2nd is scheduled for execution ==> Discard current
                    log_message('Execution of task {0} {1} discarded because of identical parameters'.format(task_name, params_info))
                    return None
        log_message('New task {0} {1} scheduled for execution'.format(task_name, params_info))
        update_value(key=persistent_key, append=True, value_to_update={'kwargs': kwargs_dict})

        # Poll the arakoon to see whether this call is the only in list, if so --> execute, else wait
        counter = 0
        while counter < timeout:
            if persistent_client.exists(persistent_key):
                values = persistent_client.get(persistent_key)['values']
                queued_jobs = [v for v in values if v['kwargs'] == kwargs_dict]
                if len(queued_jobs) == 1:
                    try:
                        if counter != 0:
                            current_time = int(time.time())
                            starting_time = int(now.split('_')[0])
                            log_message('Task {0} {1} had to wait {2} seconds before being able to start'.format(task_name, params_info, current_time - starting_time))
                        output = function(*args, **kwargs)
                        log_message('Task {0} finished successfully'.format(task_name))
                        return output
                    finally:
                        update_value(key=persistent_key, append=False, value_to_update={'kwargs': kwargs_dict})
            counter += 1
            time.sleep(1)
            if counter == timeout:
                update_value(key=persistent_key, append=False, value_to_update={'kwargs': kwargs_dict})
                log_message('Could not start task {0} {1}, within expected time ({2}s). Removed it from queue'.format(task_name, params_info, timeout), level='error')
                raise EnsureSingleTimeoutReached('Ensure single {0} mode - ID {1} - Task {2} could not be started within timeout of {3}s'.format(mode, now, task_name, timeout))

    elif mode == 'CHAINED':
        if extra_task_names is not None:
            log_message('Extra tasks are not allowed in this mode', level='error')
            raise ValueError('Ensure single {0} mode - ID {1} - Extra tasks are not allowed in this mode'.format(mode, now))

        # Create key to be stored in arakoon and update kwargs with args
        timeout = kwargs.pop('ensure_single_timeout', global_timeout)
        function_info = inspect.getargspec(function)
        kwargs_dict = {}
        for index, arg in enumerate(args):
            kwargs_dict[function_info.args[index]] = arg
        kwargs_dict.update(kwargs)
        params_info = 'with params {0}'.format(kwargs_dict) if kwargs_dict else 'with default params'

        # Set the key in arakoon if non-existent
        value = update_value(key=persistent_key, append=True)

        # Validate whether another job with same params is being executed, skip if so
        for item in value['values'][1:]:  # 1st element is processing job, we check all other queued jobs for identical params
            if item['kwargs'] == kwargs_dict:
                log_message('Execution of task {0} {1} discarded because of identical parameters'.format(task_name, params_info))
                return None
        log_message('New task {0} {1} scheduled for execution'.format(task_name, params_info))
        queue_entry = {'kwargs': kwargs_dict, 'timestamp': now}
        update_value(key=persistent_key, append=True, value_to_update=queue_entry)

        # Poll the arakoon to see whether this call is the first in list, if so --> execute, else wait
        first_element = None
        counter = 0
        while first_element != now and counter < timeout:
            if persistent_client.exists(persistent_key):
                value = persistent_client.get(persistent_key)
                # An empty list is not an error, it just means this call is not (yet) the first in line
                first_element = value['values'][0]['timestamp'] if len(value['values']) > 0 else None

            if first_element == now:
                try:
                    if counter != 0:
                        current_time = int(time.time())
                        starting_time = int(now.split('_')[0])
                        log_message('Task {0} {1} had to wait {2} seconds before being able to start'.format(task_name, params_info, current_time - starting_time))
                    output = function(*args, **kwargs)
                    log_message('Task {0} finished successfully'.format(task_name))
                finally:
                    # This call is the first element, so popping index 0 removes its own entry
                    update_value(key=persistent_key, append=False)
                return output
            counter += 1
            time.sleep(1)
            if counter == timeout:
                # Remove the entry of THIS call. Popping index 0 here would drop the entry of the job which
                # is still being executed and would allow the next queued job to start concurrently
                update_value(key=persistent_key, append=False, value_to_update=queue_entry)
                log_message('Could not start task {0} {1}, within expected time ({2}s). Removed it from queue'.format(task_name, params_info, timeout), level='error')
                raise EnsureSingleTimeoutReached('Ensure single {0} mode - ID {1} - Task {2} could not be started within timeout of {3}s'.format(mode, now, task_name, timeout))
    else:
        raise ValueError('Unsupported mode "{0}" provided'.format(mode))
def new_function(*args, **kwargs):
    """
    Wrapped function: executes the decorated function according to the configured 'ensure single' mode

    The mode, task names, global timeout, logger and decorated function are taken from the enclosing scope.
    Supported modes:
      * DEFAULT: the call is discarded when a key exists for the task or for one of the extra tasks
      * DEDUPED: the call is discarded when a key exists for one of the extra tasks or when 2 jobs with
                 identical parameters are already registered; otherwise it waits until it is the only
                 registered job with these parameters
      * CHAINED: the call is discarded when a queued job with identical parameters exists; otherwise it
                 waits until its own entry is the first one in the list
    The optional keyword argument 'ensure_single_timeout' is removed from kwargs and overrides the global
    timeout (DEDUPED and CHAINED mode only)

    :param args: Arguments without default values
    :param kwargs: Arguments with default values
    :return: Output of the decorated function, None when the execution was discarded
    :raises ValueError: Unsupported mode, or extra task names specified in CHAINED mode
    :raises EnsureSingleTimeoutReached: The task could not be started within the timeout
    """
    def log_message(message, level='info'):
        """
        Log a message with some additional information
        :param message: Message to log
        :param level: Log level
        :return: None
        """
        if level not in ('info', 'warning', 'debug', 'error', 'exception'):
            raise ValueError('Unsupported log level "{0}" specified'.format(level))
        complete_message = 'Ensure single {0} mode - ID {1} - {2}'.format(mode, now, message)
        getattr(logger, level)(complete_message)

    def update_value(key, append, value_to_update=None):
        """
        Store the specified value in the PersistentFactory
        :param key: Key to store the value for
        :param append: If True, the specified value will be appended else element at index 0 will be popped
        :param value_to_update: Value to append to the list or remove from the list
        :return: Updated value
        """
        with volatile_mutex(name=key, wait=5):
            if persistent_client.exists(key):
                val = persistent_client.get(key)
                if append is True and value_to_update is not None:
                    val['values'].append(value_to_update)
                elif append is False and value_to_update is not None:
                    # Remove the first matching entry only
                    for value_item in val['values']:
                        if value_item == value_to_update:
                            val['values'].remove(value_item)
                            break
                elif append is False and len(val['values']) > 0:
                    val['values'].pop(0)
                log_message('Amount of jobs pending for key {0}: {1}'.format(key, len(val['values'])))
                for kwarg in val['values']:
                    log_message(' KWARGS: {0}'.format(kwarg['kwargs']))
            else:
                log_message('Setting initial value for key {0}'.format(key))
                val = {'mode': mode, 'values': []}
            persistent_client.set(key, val)
        return val

    # Unique identifier of this call: <epoch>_<10 random characters>
    now = '{0}_{1}'.format(int(time.time()), ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(10)))
    task_names = [task_name] if extra_task_names is None else [task_name] + extra_task_names
    persistent_key = '{0}_{1}'.format(ENSURE_SINGLE_KEY, task_name)
    persistent_client = PersistentFactory.get_client()

    if mode == 'DEFAULT':
        with volatile_mutex(persistent_key, wait=5):
            for task in task_names:
                key_to_check = '{0}_{1}'.format(ENSURE_SINGLE_KEY, task)
                if persistent_client.exists(key_to_check):
                    log_message('Execution of task {0} discarded'.format(task_name))
                    return None
            log_message('Setting key {0}'.format(persistent_key))
            persistent_client.set(persistent_key, {'mode': mode})

        try:
            output = function(*args, **kwargs)
            log_message('Task {0} finished successfully'.format(task_name))
            return output
        finally:
            with volatile_mutex(persistent_key, wait=5):
                if persistent_client.exists(persistent_key):
                    log_message('Deleting key {0}'.format(persistent_key))
                    persistent_client.delete(persistent_key)

    elif mode == 'DEDUPED':
        with volatile_mutex(persistent_key, wait=5):
            if extra_task_names is not None:
                for task in extra_task_names:
                    key_to_check = '{0}_{1}'.format(ENSURE_SINGLE_KEY, task)
                    if persistent_client.exists(key_to_check):
                        log_message('Execution of task {0} discarded'.format(task_name))
                        return None
            log_message('Setting key {0}'.format(persistent_key))

        # Update kwargs with args
        timeout = kwargs.pop('ensure_single_timeout', global_timeout)
        function_info = inspect.getargspec(function)
        kwargs_dict = {}
        for index, arg in enumerate(args):
            kwargs_dict[function_info.args[index]] = arg
        kwargs_dict.update(kwargs)
        params_info = 'with params {0}'.format(kwargs_dict) if kwargs_dict else 'with default params'

        # Set the key in arakoon if non-existent
        value = update_value(key=persistent_key, append=True)

        # Validate whether another job with same params is being executed
        job_counter = 0
        for item in value['values']:
            if item['kwargs'] == kwargs_dict:
                job_counter += 1
                if job_counter == 2:  # 1st job with same params is being executed, 2nd is scheduled for execution ==> Discard current
                    log_message('Execution of task {0} {1} discarded because of identical parameters'.format(task_name, params_info))
                    return None
        log_message('New task {0} {1} scheduled for execution'.format(task_name, params_info))
        update_value(key=persistent_key, append=True, value_to_update={'kwargs': kwargs_dict})

        # Poll the arakoon to see whether this call is the only in list, if so --> execute, else wait
        counter = 0
        while counter < timeout:
            if persistent_client.exists(persistent_key):
                values = persistent_client.get(persistent_key)['values']
                queued_jobs = [v for v in values if v['kwargs'] == kwargs_dict]
                if len(queued_jobs) == 1:
                    try:
                        if counter != 0:
                            current_time = int(time.time())
                            starting_time = int(now.split('_')[0])
                            log_message('Task {0} {1} had to wait {2} seconds before being able to start'.format(task_name, params_info, current_time - starting_time))
                        output = function(*args, **kwargs)
                        log_message('Task {0} finished successfully'.format(task_name))
                        return output
                    finally:
                        update_value(key=persistent_key, append=False, value_to_update={'kwargs': kwargs_dict})
            counter += 1
            time.sleep(1)
            if counter == timeout:
                update_value(key=persistent_key, append=False, value_to_update={'kwargs': kwargs_dict})
                log_message('Could not start task {0} {1}, within expected time ({2}s). Removed it from queue'.format(task_name, params_info, timeout), level='error')
                raise EnsureSingleTimeoutReached('Ensure single {0} mode - ID {1} - Task {2} could not be started within timeout of {3}s'.format(mode, now, task_name, timeout))

    elif mode == 'CHAINED':
        if extra_task_names is not None:
            log_message('Extra tasks are not allowed in this mode', level='error')
            raise ValueError('Ensure single {0} mode - ID {1} - Extra tasks are not allowed in this mode'.format(mode, now))

        # Create key to be stored in arakoon and update kwargs with args
        timeout = kwargs.pop('ensure_single_timeout', global_timeout)
        function_info = inspect.getargspec(function)
        kwargs_dict = {}
        for index, arg in enumerate(args):
            kwargs_dict[function_info.args[index]] = arg
        kwargs_dict.update(kwargs)
        params_info = 'with params {0}'.format(kwargs_dict) if kwargs_dict else 'with default params'

        # Set the key in arakoon if non-existent
        value = update_value(key=persistent_key, append=True)

        # Validate whether another job with same params is being executed, skip if so
        for item in value['values'][1:]:  # 1st element is processing job, we check all other queued jobs for identical params
            if item['kwargs'] == kwargs_dict:
                log_message('Execution of task {0} {1} discarded because of identical parameters'.format(task_name, params_info))
                return None
        log_message('New task {0} {1} scheduled for execution'.format(task_name, params_info))
        queue_entry = {'kwargs': kwargs_dict, 'timestamp': now}
        update_value(key=persistent_key, append=True, value_to_update=queue_entry)

        # Poll the arakoon to see whether this call is the first in list, if so --> execute, else wait
        first_element = None
        counter = 0
        while counter < timeout:
            if persistent_client.exists(persistent_key):
                value = persistent_client.get(persistent_key)
                first_element = value['values'][0]['timestamp'] if len(value['values']) > 0 else None

            if first_element == now:
                output = None
                try:
                    if counter != 0:
                        current_time = int(time.time())
                        starting_time = int(now.split('_')[0])
                        log_message('Task {0} {1} had to wait {2} seconds before being able to start'.format(task_name, params_info, current_time - starting_time))
                    output = function(*args, **kwargs)
                    log_message('Task {0} finished successfully'.format(task_name))
                except Exception:
                    log_message('Task {0} {1} failed'.format(task_name, params_info), level='exception')
                    raise
                finally:
                    # This call is the first element, so popping index 0 removes its own entry
                    update_value(key=persistent_key, append=False)
                return output
            counter += 1
            time.sleep(1)
            if counter == timeout:
                # Remove the entry of THIS call. Popping index 0 here would drop the entry of the job which
                # is still being executed and would allow the next queued job to start concurrently
                update_value(key=persistent_key, append=False, value_to_update=queue_entry)
                log_message('Could not start task {0} {1}, within expected time ({2}s). Removed it from queue'.format(task_name, params_info, timeout), level='error')
                raise EnsureSingleTimeoutReached('Ensure single {0} mode - ID {1} - Task {2} could not be started within timeout of {3}s'.format(mode, now, task_name, timeout))
    else:
        raise ValueError('Unsupported mode "{0}" provided'.format(mode))
def create_cluster(cluster_name, cluster_type, ip, base_dir, plugins=None, locked=True, internal=True, claim=False):
    """
    Always creates a cluster but marks it's usage according to the internal flag

    When an Arakoon service for this cluster is already running and its configuration contains a node
    with the specified IP, nothing is deployed and the ports of that node are returned
    :param cluster_name: Name of the cluster
    :type cluster_name: str
    :param cluster_type: Type of the cluster (See ServiceType.ARAKOON_CLUSTER_TYPES)
    :type cluster_type: str
    :param ip: IP address of the first node of the new cluster
    :type ip: str
    :param base_dir: Base directory that should contain the data and tlogs
    :type base_dir: str
    :param plugins: Plugins that should be added to the configuration file
    :type plugins: list
    :param locked: Indicates whether the create should run in a locked context (e.g. to prevent port conflicts)
    :type locked: bool
    :param internal: Is cluster internally managed by OVS
    :type internal: bool
    :param claim: Claim the cluster right away
    :type claim: bool
    :return: Ports used by arakoon cluster, together with the cluster metadata when the cluster was newly deployed
    :rtype: dict
    :raises ValueError: When the cluster type is not supported or the cluster name is already in use
    """
    supported_types = ServiceType.ARAKOON_CLUSTER_TYPES
    if cluster_type not in supported_types:
        raise ValueError('Cluster type {0} is not supported. Please choose from {1}'.format(cluster_type, ', '.join(ServiceType.ARAKOON_CLUSTER_TYPES)))
    if EtcdConfiguration.dir_exists('/ovs/arakoon/{0}'.format(cluster_name)):
        raise ValueError('An Arakoon cluster with name "{0}" already exists'.format(cluster_name))

    ArakoonInstaller._logger.debug('Creating cluster {0} on {1}'.format(cluster_name, ip))
    base_dir = base_dir.rstrip('/')

    client = SSHClient(ip, username=ArakoonInstaller.SSHCLIENT_USER)
    if ArakoonInstaller.is_running(cluster_name, client):
        ArakoonInstaller._logger.info('Arakoon service running for cluster {0}'.format(cluster_name))
        running_config = ArakoonClusterConfig(cluster_name, plugins)
        running_config.load_config()
        for existing_node in running_config.nodes:
            if existing_node.ip == ip:
                return {'client_port': existing_node.client_port,
                        'messaging_port': existing_node.messaging_port}

    node_name = System.get_my_machine_id(client)

    home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
    log_dir = ArakoonInstaller.ARAKOON_LOG_DIR.format(cluster_name)
    tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
    leftovers = {log_dir: True,
                 home_dir: False,
                 tlog_dir: False}
    ArakoonInstaller.clean_leftover_arakoon_data(ip, leftovers)

    port_mutex = None
    try:
        if locked is True:
            # Imported here, like the original code does, instead of at module level
            from ovs.extensions.generic.volatilemutex import volatile_mutex
            port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(ip))
            port_mutex.acquire(wait=60)

        ports = ArakoonInstaller._get_free_ports(client)
        new_node = ArakoonNodeConfig(name=node_name,
                                     ip=ip,
                                     client_port=ports[0],
                                     messaging_port=ports[1],
                                     log_dir=log_dir,
                                     home=home_dir,
                                     tlog_dir=tlog_dir)
        config = ArakoonClusterConfig(cluster_name, plugins)
        config.nodes.append(new_node)
        ArakoonInstaller._deploy(config)

        metadata = ArakoonClusterMetadata(cluster_id=cluster_name)
        metadata.internal = internal
        metadata.cluster_type = cluster_type.upper()
        metadata.write()
        if claim is True:
            metadata.claim()
    finally:
        if port_mutex is not None:
            port_mutex.release()

    ArakoonInstaller._logger.debug('Creating cluster {0} on {1} completed'.format(cluster_name, ip))
    return {'metadata': metadata,
            'client_port': ports[0],
            'messaging_port': ports[1]}
def update_from_voldrv(name, storagedriver_id):
    """
    This method will update/create a vmachine based on a given vmx/xml file

    Only the VMWARE and KVM hypervisor types are handled, for any other type nothing is done.
    When the file cannot be found (it is checked up to 6 times, sleeping 1 second in between), an already
    modelled vMachine is removed again. For KVM, the vMachine is additionally synced with the hypervisor
    and its status is set to 'SYNC' or 'SYNC_NOK', depending on the outcome of that sync
    :param name: Name of the vmx
    :param storagedriver_id: Storage Driver hosting the vmachine
    :return: None
    """
    pmachine = PMachineList.get_by_storagedriver_id(storagedriver_id)
    if pmachine.hvtype not in ['VMWARE', 'KVM']:
        return

    hypervisor = Factory.get(pmachine)
    name = hypervisor.clean_vmachine_filename(name)
    storagedriver = StorageDriverList.get_by_storagedriver_id(storagedriver_id)
    vpool = storagedriver.vpool
    # Dedicated loop variable: under Python 2 a list comprehension leaks its variable into this scope, which
    # would replace 'storagedriver' by the last StorageDriver of the vPool for the file checks below
    machine_ids = [vpool_sd.storagerouter.machine_id for vpool_sd in vpool.storagedrivers]

    if not hypervisor.should_process(name, machine_ids=machine_ids):
        VMachineController._logger.info('Ignored invalid file {0}'.format(name))
        return

    # Only VMware machines are linked to a vPool
    if pmachine.hvtype != 'VMWARE':
        vpool = None
    pmachine = PMachineList.get_by_storagedriver_id(storagedriver_id)
    mutex = volatile_mutex('{0}_{1}'.format(name, vpool.guid if vpool is not None else 'none'))

    # Make sure the file exists, give it some time to appear
    try:
        mutex.acquire(wait=120)
        limit = 5
        exists = hypervisor.file_exists(storagedriver, name)
        while limit > 0 and exists is False:
            time.sleep(1)
            exists = hypervisor.file_exists(storagedriver, name)
            limit -= 1
        if exists is False:
            # vpool is None for KVM, so its name cannot be dereferenced unconditionally
            vpool_name = vpool.name if vpool is not None else None
            VMachineController._logger.info('Could not locate vmachine with name {0} on vpool {1}'.format(name, vpool_name))
            vmachine = VMachineList.get_by_devicename_and_vpool(name, vpool)
            if vmachine is not None:
                VMachineController.delete_from_voldrv(name, storagedriver_id=storagedriver_id)
            return
    finally:
        mutex.release()

    # Look up the vMachine (by devicename and vPool first, by name next) or model a new one
    try:
        mutex.acquire(wait=5)
        vmachine = VMachineList.get_by_devicename_and_vpool(name, vpool)
        if not vmachine:
            vmachines = VMachineList.get_vmachine_by_name(name)
            if vmachines is not None:
                vmachine = vmachines[0]
        if not vmachine:
            vmachine = VMachine()
            vmachine.vpool = vpool
            vmachine.pmachine = pmachine
            vmachine.status = 'CREATED'
            vmachine.devicename = name
            vmachine.save()
    finally:
        mutex.release()

    if pmachine.hvtype == 'KVM':
        try:
            mutex.acquire(wait=120)
            VMachineController.sync_with_hypervisor(vmachine.guid, storagedriver_id=storagedriver_id)
            vmachine.status = 'SYNC'
        except Exception:
            # Best effort: the failure is reflected in the status, but no longer swallowed silently
            VMachineController._logger.exception('Error syncing vmachine {0} with the hypervisor'.format(name))
            vmachine.status = 'SYNC_NOK'
        finally:
            mutex.release()
        vmachine.save()
def update_vmachine_config(vmachine, vm_object, pmachine=None):
    """
    Update a vMachine configuration with a given vMachine configuration

    The name, hypervisor id and devicename of the vMachine are overwritten with the values of vm_object,
    every disk of vm_object that is located on a datastore served by a StorageDriver is (created and)
    linked to the vMachine and every vDisk of the vMachine that is not part of vm_object is unlinked.
    Events are fired for a created/renamed vMachine and for attached/detached vDisks
    :param vmachine: Virtual Machine to update
    :param vm_object: New virtual machine info
    :param pmachine: Physical machine of the virtual machine
    :return: None
    """
    try:
        new_name = vm_object['name']
        current_name = vmachine.name
        event = None
        if current_name is None:
            event = {'type': 'vmachine_created',
                     'metadata': {'name': new_name}}
        elif current_name != new_name:
            event = {'type': 'vmachine_renamed',
                     'metadata': {'old_name': current_name,
                                  'new_name': new_name}}
        if event is not None:
            MessageController.fire(MessageController.Type.EVENT, event)

        if pmachine is not None:
            vmachine.pmachine = pmachine
        vmachine.name = new_name
        vmachine.hypervisor_id = vm_object['id']
        vmachine.devicename = vm_object['backing']['filename']
        vmachine.save()

        # Updating and linking disks
        known_datastores = dict(('{0}:{1}'.format(sd.storage_ip, sd.mountpoint), sd)
                                for sd in StorageDriverList.get_storagedrivers())
        linked_guids = []
        mutex = volatile_mutex('{0}_{1}'.format(vmachine.name, vmachine.devicename))
        for disk in vm_object['disks']:
            if disk['datastore'] not in vm_object['datastores']:
                continue
            datastore = vm_object['datastores'][disk['datastore']]
            if datastore not in known_datastores:
                continue

            newly_created = False
            try:
                mutex.acquire(wait=10)
                disk_vpool = known_datastores[datastore].vpool
                vdisk = VDiskList.get_by_devicename_and_vpool(disk['filename'], disk_vpool)
                if vdisk is None:
                    # The disk couldn't be located, but is in our datastore. We might be in a recovery scenario
                    vdisk = VDisk()
                    vdisk.vpool = disk_vpool
                    vdisk.reload_client()
                    vdisk.devicename = disk['filename']
                    vdisk.volume_id = vdisk.storagedriver_client.get_volume_id(str(disk['backingfilename']))
                    vdisk.size = vdisk.info['volume_size']
                    vdisk.metadata = {'lba_size': vdisk.info['lba_size'],
                                      'cluster_multiplier': vdisk.info['cluster_multiplier']}
                    # Create the disk in a locked context, but don't execute long running-task in same context
                    vdisk.save()
                    newly_created = True
            finally:
                mutex.release()
            if newly_created:
                MDSServiceController.ensure_safety(vdisk)
                VDiskController.dtl_checkup(vdisk_guid=vdisk.guid)

            # Update the disk with information from the hypervisor
            if vdisk.vmachine is None:
                MessageController.fire(MessageController.Type.EVENT,
                                       {'type': 'vdisk_attached',
                                        'metadata': {'vmachine_name': vmachine.name,
                                                     'vdisk_name': disk['name']}})
            vdisk.vmachine = vmachine
            vdisk.name = disk['name']
            vdisk.order = disk['order']
            vdisk.save()
            linked_guids.append(vdisk.guid)

        # Unlink every vDisk that is no longer part of the hypervisor configuration
        for vdisk in vmachine.vdisks:
            if vdisk.guid in linked_guids:
                continue
            MessageController.fire(MessageController.Type.EVENT,
                                   {'type': 'vdisk_detached',
                                    'metadata': {'vmachine_name': vmachine.name,
                                                 'vdisk_name': vdisk.name}})
            vdisk.vmachine = None
            vdisk.save()

        VMachineController._logger.info('Updating vMachine finished (name {0}, {1} vdisks (re)linked)'.format(
            vmachine.name, len(linked_guids)
        ))
    except Exception as ex:
        VMachineController._logger.info('Error during vMachine update: {0}'.format(str(ex)))
        raise
def update_from_voldrv(name, storagedriver_id):
    """
    This method will update/create a vmachine based on a given vmx/xml file

    Only the VMWARE and KVM hypervisor types are handled, for any other type nothing is done.
    When the file cannot be found (it is checked up to 6 times, sleeping 1 second in between), an already
    modelled vMachine is removed again. For KVM, the vMachine is additionally synced with the hypervisor
    and its status is set to 'SYNC' or 'SYNC_NOK', depending on the outcome of that sync
    :param name: Name of the vmx
    :param storagedriver_id: Storage Driver hosting the vmachine
    :return: None
    """
    pmachine = PMachineList.get_by_storagedriver_id(storagedriver_id)
    if pmachine.hvtype not in ['VMWARE', 'KVM']:
        return

    hypervisor = Factory.get(pmachine)
    name = hypervisor.clean_vmachine_filename(name)
    storagedriver = StorageDriverList.get_by_storagedriver_id(storagedriver_id)
    vpool = storagedriver.vpool
    # Dedicated loop variable: under Python 2 a list comprehension leaks its variable into this scope, which
    # would replace 'storagedriver' by the last StorageDriver of the vPool for the file checks below
    machine_ids = [vpool_sd.storagerouter.machine_id for vpool_sd in vpool.storagedrivers]

    if not hypervisor.should_process(name, machine_ids=machine_ids):
        VMachineController._logger.info('Ignored invalid file {0}'.format(name))
        return

    # Only VMware machines are linked to a vPool
    if pmachine.hvtype != 'VMWARE':
        vpool = None
    pmachine = PMachineList.get_by_storagedriver_id(storagedriver_id)
    mutex = volatile_mutex('{0}_{1}'.format(name, vpool.guid if vpool is not None else 'none'))

    # Make sure the file exists, give it some time to appear
    try:
        mutex.acquire(wait=120)
        limit = 5
        exists = hypervisor.file_exists(storagedriver, name)
        while limit > 0 and exists is False:
            time.sleep(1)
            exists = hypervisor.file_exists(storagedriver, name)
            limit -= 1
        if exists is False:
            # vpool is None for KVM, so its name cannot be dereferenced unconditionally
            vpool_name = vpool.name if vpool is not None else None
            VMachineController._logger.info('Could not locate vmachine with name {0} on vpool {1}'.format(name, vpool_name))
            vmachine = VMachineList.get_by_devicename_and_vpool(name, vpool)
            if vmachine is not None:
                VMachineController.delete_from_voldrv(name, storagedriver_id=storagedriver_id)
            return
    finally:
        mutex.release()

    # Look up the vMachine (by devicename and vPool first, by name next) or model a new one
    try:
        mutex.acquire(wait=5)
        vmachine = VMachineList.get_by_devicename_and_vpool(name, vpool)
        if not vmachine:
            vmachines = VMachineList.get_vmachine_by_name(name)
            if vmachines is not None:
                vmachine = vmachines[0]
        if not vmachine:
            vmachine = VMachine()
            vmachine.vpool = vpool
            vmachine.pmachine = pmachine
            vmachine.status = 'CREATED'
            vmachine.devicename = name
            vmachine.save()
    finally:
        mutex.release()

    if pmachine.hvtype == 'KVM':
        try:
            mutex.acquire(wait=120)
            VMachineController.sync_with_hypervisor(vmachine.guid, storagedriver_id=storagedriver_id)
            vmachine.status = 'SYNC'
        except Exception:
            # Best effort: the failure is reflected in the status, but no longer swallowed silently
            VMachineController._logger.exception('Error syncing vmachine {0} with the hypervisor'.format(name))
            vmachine.status = 'SYNC_NOK'
        finally:
            mutex.release()
        vmachine.save()
def extend_cluster(master_ip, new_ip, cluster_name, base_dir, locked=True, filesystem=False, ports=None):
    """
    Extends a cluster to a given new node

    The configuration is loaded from the master node, a node entry for the new node is added when no node
    with the same name is configured yet, and the resulting configuration is deployed
    :param master_ip: IP of one of the already existing nodes
    :type master_ip: str
    :param new_ip: IP address of the node to be added
    :type new_ip: str
    :param cluster_name: Name of the cluster to be extended
    :type cluster_name: str
    :param base_dir: Base directory that will hold the db and tlogs
    :type base_dir: str
    :param locked: Indicates whether the extend should run in a locked context (e.g. to prevent port conflicts)
    :type locked: bool
    :param filesystem: Indicates whether the configuration should be on the filesystem or in a configuration cluster
    :type filesystem: bool
    :param ports: A list of ports to be used for this cluster's node
    :type ports: list
    :return: Ports used by arakoon cluster, the service metadata of the new node and the IPs of all nodes
    :rtype: dict
    """
    ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2}'.format(cluster_name, master_ip, new_ip))
    base_dir = base_dir.rstrip('/')

    config = ArakoonClusterConfig(cluster_name, filesystem)
    config.load_config(master_ip)
    cluster_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=config.cluster_id,
                                                                             filesystem=filesystem,
                                                                             ip=master_ip)
    cluster_type = cluster_metadata['cluster_type']

    client = SSHClient(new_ip, username=ArakoonInstaller.SSHCLIENT_USER)
    node_name = System.get_my_machine_id(client)

    home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
    tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
    ArakoonInstaller.clean_leftover_arakoon_data(new_ip, [home_dir, tlog_dir])

    port_mutex = None
    try:
        if locked is True:
            # Imported here, like the original code does, instead of at module level
            from ovs.extensions.generic.volatilemutex import volatile_mutex
            port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(new_ip))
            port_mutex.acquire(wait=60)

        if ports is None:
            ports = ArakoonInstaller._get_free_ports(client)

        already_configured = any(node_name == node.name for node in config.nodes)
        if not already_configured:
            new_node = ArakoonNodeConfig(name=node_name,
                                         ip=new_ip,
                                         client_port=ports[0],
                                         messaging_port=ports[1],
                                         log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                         crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                         home=home_dir,
                                         tlog_dir=tlog_dir)
            config.nodes.append(new_node)

        delay_registration = cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.CFG
        deploy_result = ArakoonInstaller._deploy(config=config,
                                                 filesystem=filesystem,
                                                 delay_service_registration=delay_registration)
        service_metadata = deploy_result[new_ip]
    finally:
        if port_mutex is not None:
            port_mutex.release()

    ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2} completed'.format(cluster_name, master_ip, new_ip))
    return {'client_port': ports[0],
            'messaging_port': ports[1],
            'service_metadata': service_metadata,
            'ips': [node.ip for node in config.nodes]}
def _deploy_stack_and_scrub(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info
    The work is done in 3 steps:
        * Deploy (or re-use) an ALBA proxy service on the StorageRouter and store its configuration
        * Spawn the threads which execute GenericController._execute_scrub and wait for them to finish
        * Remove the scrub directory, the proxy service and its scrub configuration again
    Failures are not raised, they are logged and appended to error_messages
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information:
                       `scrub_path` with the path where to scrub
                       `storage_router` with the StorageRouter that needs to do the work
                       `partition_guid` which is used in the names of the proxy service, directory and config keys
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled (by reference)
    :type error_messages: list
    :return: None
    :rtype: NoneType
    """
    if len(vpool.storagedrivers) == 0 or not vpool.storagedrivers[0].storagedriver_id:
        error_messages.append('vPool {0} does not have any valid StorageDrivers configured'.format(vpool.name))
        return

    service_manager = ServiceFactory.get_manager()
    client = None
    lock_time = 5 * 60
    storagerouter = scrub_info['storage_router']
    partition_guid = scrub_info['partition_guid']
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_{2}_scrub'.format(vpool.name, storagerouter.name, partition_guid)
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, partition_guid)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, partition_guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, partition_guid)

    # Deploy a proxy
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0o777)  # Celery task executed by 'ovs' user and should be able to write in it
            if service_manager.has_service(name=alba_proxy_service, client=client) is True and service_manager.get_service_status(name=alba_proxy_service, client=client) == 'active':
                GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                with volatile_mutex('deploy_proxy_for_scrub_{0}'.format(storagerouter.guid), wait=30):
                    port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                # The scrub proxy configuration is the generic scrub configuration of the vPool + port and transport
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path(alba_proxy_service),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                service_manager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                service_manager.start_service(name=alba_proxy_service, client=client)
                GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            # Point the backend connection manager (or each of its dict sections for type MULTI) to the local scrub proxy
            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            if backend_config.get('backend_type') != 'MULTI':
                backend_config['alba_connection_host'] = '127.0.0.1'
                backend_config['alba_connection_port'] = scrub_config['port']
            else:
                for value in backend_config.itervalues():
                    if isinstance(value, dict):
                        value['alba_connection_host'] = '127.0.0.1'
                        value['alba_connection_port'] = scrub_config['port']
            # Copy backend connection manager information in separate key
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        # NOTE(review): a failed deployment is recorded and cleaned up, but the function continues with the
        # scrubbing below - confirm this is intended
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        GenericController._logger.exception(message)
        if client is not None and service_manager.has_service(name=alba_proxy_service, client=client) is True:
            if service_manager.get_service_status(name=alba_proxy_service, client=client) == 'active':
                service_manager.stop_service(name=alba_proxy_service, client=client)
            service_manager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)

    # Execute the actual scrubbing
    threads = []
    threads_key = '/ovs/framework/hosts/{0}/config|scrub_stack_threads'.format(storagerouter.machine_id)
    amount_threads = Configuration.get(key=threads_key) if Configuration.exists(key=threads_key) else 2
    if not isinstance(amount_threads, int):
        # Scrubbing is skipped, but no early return: the proxy deployed above must still be removed below
        error_messages.append('Amount of threads to spawn must be an integer for StorageRouter with ID {0}'.format(storagerouter.machine_id))
    else:
        amount_threads = max(amount_threads, 1)  # Make sure amount_threads is at least 1
        amount_threads = min(min(queue.qsize(), amount_threads), 20)  # Make sure amount threads is max 20
        GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Spawning {2} threads for proxy service {3}'.format(vpool.name, storagerouter.name, amount_threads, alba_proxy_service))
        for index in range(amount_threads):
            thread = Thread(name='execute_scrub_{0}_{1}_{2}'.format(vpool.guid, partition_guid, index),
                            target=GenericController._execute_scrub,
                            args=(queue, vpool, scrub_info, scrub_directory, error_messages))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if service_manager.has_service(alba_proxy_service, client=client):
                service_manager.stop_service(alba_proxy_service, client=client)
                service_manager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        GenericController._logger.exception(message)
def create_cluster(cluster_name, cluster_type, ip, base_dir, plugins=None, locked=True, internal=True, filesystem=False, ports=None):
    """
    Deploy the first node of a new arakoon cluster and build the metadata describing that cluster
    The returned metadata always has 'in_use' set to False; 'internal' reflects the internal flag
    :param cluster_name: Name of the cluster
    :type cluster_name: str
    :param cluster_type: Type of the cluster (See ServiceType.ARAKOON_CLUSTER_TYPES)
    :type cluster_type: str
    :param ip: IP address of the first node of the new cluster
    :type ip: str
    :param base_dir: Base directory that should contain the data and tlogs
    :type base_dir: str
    :param plugins: Plugins that should be added to the configuration file
    :type plugins: dict
    :param locked: Indicates whether the create should run in a locked context (e.g. to prevent port conflicts)
    :type locked: bool
    :param internal: Is cluster internally managed by OVS
    :type internal: bool
    :param filesystem: Indicates whether the configuration should be on the filesystem or in a configuration cluster
    :type filesystem: bool
    :param ports: A list of ports to be used for this cluster's node (free ports are looked up when omitted)
    :type ports: list
    :raises ValueError: When the cluster type is not supported or a cluster with the given name already exists
    :return: Dict with the keys 'metadata', 'client_port', 'messaging_port' and 'service_metadata'
    :rtype: dict
    """
    if cluster_type not in ServiceType.ARAKOON_CLUSTER_TYPES:
        supported_types = ', '.join(ServiceType.ARAKOON_CLUSTER_TYPES)
        raise ValueError('Cluster type {0} is not supported. Please choose from {1}'.format(cluster_type, supported_types))

    # The existence check depends on where the cluster configuration is stored
    client = SSHClient(ip, username=ArakoonInstaller.SSHCLIENT_USER)
    if filesystem is True:
        config_file = ArakoonClusterConfig.CONFIG_FILE.format(cluster_name)
        exists = client.file_exists(config_file)
    else:
        config_dir = '/ovs/arakoon/{0}'.format(cluster_name)
        exists = Configuration.dir_exists(config_dir)
    if exists is True:
        raise ValueError('An Arakoon cluster with name "{0}" already exists'.format(cluster_name))

    logger = ArakoonInstaller._logger
    logger.debug('Creating cluster {0} on {1}'.format(cluster_name, ip))

    node_name = System.get_my_machine_id(client)
    base_dir = base_dir.rstrip('/')
    home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
    tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
    ArakoonInstaller.clean_leftover_arakoon_data(ip, [home_dir, tlog_dir])

    port_mutex = None
    try:
        if locked is True:
            from ovs.extensions.generic.volatilemutex import volatile_mutex
            port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(ip))
            port_mutex.acquire(wait=60)

        if ports is None:
            ports = ArakoonInstaller._get_free_ports(client)

        plugin_names = None if plugins is None else plugins.keys()
        config = ArakoonClusterConfig(cluster_name, filesystem, plugin_names)
        first_node = ArakoonNodeConfig(name=node_name,
                                       ip=ip,
                                       client_port=ports[0],
                                       messaging_port=ports[1],
                                       log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                       crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                       home=home_dir,
                                       tlog_dir=tlog_dir)
        config.nodes.append(first_node)

        metadata = {'internal': internal,
                    'cluster_name': cluster_name,
                    'cluster_type': cluster_type.upper(),
                    'in_use': False}

        plugin_values = None if plugins is None else plugins.values()
        deploy_result = ArakoonInstaller._deploy(config=config,
                                                 filesystem=filesystem,
                                                 plugins=plugin_values,
                                                 delay_service_registration=cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.CFG)
        service_metadata = deploy_result[ip]
    finally:
        # The mutex object only exists when a locked context was requested
        if port_mutex is not None:
            port_mutex.release()

    logger.debug('Creating cluster {0} on {1} completed'.format(cluster_name, ip))
    result = {'metadata': metadata,
              'client_port': ports[0],
              'messaging_port': ports[1],
              'service_metadata': service_metadata}
    return result
def add_vpool(cls, parameters):
    """
    Add a vPool to the machine this task is running on
    The flow consists of:
        * Validation of the parameters, the vPool and the StorageDriver configurations
        * Creation of the vPool (or marking it EXTENDING) and modeling of the StorageDriver and its partitions
        * Arakoon checkup for the StorageDriver cluster and configuration of the cluster registry
        * Configuration and start of the services, followed by a refresh of the StorageDriver configurations
        * Best-effort DTL and MDS safety checkups
    When a step fails, the vPool status is reverted or set to FAILURE and the exception is re-raised
    :param parameters: Parameters for vPool creation
    :type parameters: dict
    :raises ValueError: When parameters is not a dict
    :raises RuntimeError: When the StorageRouter cannot be found or reached, or the arakoon checkup did not succeed
    :return: None
    :rtype: NoneType
    """
    # TODO: Add logging
    cls._logger.debug('Adding vpool. Parameters: {}'.format(parameters))
    # VALIDATIONS
    if not isinstance(parameters, dict):
        raise ValueError('Parameters passed to create a vPool should be of type dict')

    # Check StorageRouter existence
    storagerouter = StorageRouterList.get_by_ip(ip=parameters.get('storagerouter_ip'))
    if storagerouter is None:
        raise RuntimeError('Could not find StorageRouter')

    # Validate requested vPool configurations
    vp_installer = VPoolInstaller(name=parameters.get('vpool_name'))
    vp_installer.validate(storagerouter=storagerouter)

    # Validate requested StorageDriver configurations
    cls._logger.info('vPool {0}: Validating StorageDriver configurations'.format(vp_installer.name))
    sd_installer = StorageDriverInstaller(vp_installer=vp_installer,
                                          configurations={'storage_ip': parameters.get('storage_ip'),
                                                          'caching_info': parameters.get('caching_info'),
                                                          'backend_info': {'main': parameters.get('backend_info'),
                                                                           StorageDriverConfiguration.CACHE_BLOCK: parameters.get('backend_info_bc'),
                                                                           StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('backend_info_fc')},
                                                          'connection_info': {'main': parameters.get('connection_info'),
                                                                              StorageDriverConfiguration.CACHE_BLOCK: parameters.get('connection_info_bc'),
                                                                              StorageDriverConfiguration.CACHE_FRAGMENT: parameters.get('connection_info_fc')},
                                                          'sd_configuration': parameters.get('config_params')})

    partitions_mutex = volatile_mutex('add_vpool_partitions_{0}'.format(storagerouter.guid))
    try:
        # VPOOL CREATION
        # Create the vPool as soon as possible in the process to be displayed in the GUI (INSTALLING/EXTENDING state)
        if vp_installer.is_new is True:
            vp_installer.create(rdma_enabled=sd_installer.rdma_enabled)
            vp_installer.configure_mds(config=parameters.get('mds_config_params', {}))
        else:
            vp_installer.update_status(status=VPool.STATUSES.EXTENDING)

        # ADDITIONAL VALIDATIONS
        # Check StorageRouter connectivity
        cls._logger.info('vPool {0}: Validating StorageRouter connectivity'.format(vp_installer.name))
        linked_storagerouters = [storagerouter]
        if vp_installer.is_new is False:
            linked_storagerouters += [sd.storagerouter for sd in vp_installer.vpool.storagedrivers]

        sr_client_map = SSHClient.get_clients(endpoints=linked_storagerouters, user_names=['ovs', 'root'])
        offline_nodes = sr_client_map.pop('offline')
        if storagerouter in offline_nodes:
            raise RuntimeError('Node on which the vPool is being {0} is not reachable'.format('created' if vp_installer.is_new is True else 'extended'))

        sr_installer = StorageRouterInstaller(root_client=sr_client_map[storagerouter]['root'],
                                              sd_installer=sd_installer,
                                              vp_installer=vp_installer,
                                              storagerouter=storagerouter)

        # When 2 or more jobs simultaneously run on the same StorageRouter, we need to check and create the StorageDriver partitions in locked context
        partitions_mutex.acquire(wait=60)
        sr_installer.partition_info = StorageRouterController.get_partition_info(storagerouter_guid=storagerouter.guid)
        sr_installer.validate_vpool_extendable()
        sr_installer.validate_global_write_buffer(requested_size=parameters.get('writecache_size', 0))
        sr_installer.validate_local_cache_size(requested_proxies=parameters.get('parallelism', {}).get('proxies', 2))

        # MODEL STORAGEDRIVER AND PARTITION JUNCTIONS
        sd_installer.create()
        sd_installer.create_partitions()
        partitions_mutex.release()

        vp_installer.refresh_metadata()
    except Exception:
        cls._logger.exception('Something went wrong during the validation or modeling of vPool {0} on StorageRouter {1}'.format(vp_installer.name, storagerouter.name))
        partitions_mutex.release()
        vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
        raise

    # Arakoon setup
    # The checkup is retried once per second for at most 300 attempts, until it returns True
    counter = 0
    while counter < 300:
        try:
            if StorageDriverController.manual_voldrv_arakoon_checkup() is True:
                break
        except Exception:
            cls._logger.exception('Arakoon checkup for voldrv cluster failed')
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
            raise
        counter += 1
        time.sleep(1)
    if counter == 300:
        vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
        raise RuntimeError('Arakoon checkup for the StorageDriver cluster could not be started')

    # Cluster registry
    try:
        vp_installer.configure_cluster_registry(allow_raise=True)
    except Exception:
        if vp_installer.is_new is True:
            vp_installer.revert_vpool(status=VPool.STATUSES.RUNNING)
        else:
            vp_installer.revert_vpool(status=VPool.STATUSES.FAILURE)
        raise

    try:
        sd_installer.setup_proxy_configs()
        sd_installer.configure_storagedriver_service()
        DiskController.sync_with_reality(storagerouter.guid)
        MDSServiceController.prepare_mds_service(storagerouter=storagerouter, vpool=vp_installer.vpool)

        # Update the MDS safety if changed via API (vpool.configuration will be available at this point also for the newly added StorageDriver)
        vp_installer.vpool.invalidate_dynamics('configuration')
        if vp_installer.mds_safety is not None and vp_installer.vpool.configuration['mds_config']['mds_safety'] != vp_installer.mds_safety:
            Configuration.set(key='/ovs/vpools/{0}/mds_config|mds_safety'.format(vp_installer.vpool.guid), value=vp_installer.mds_safety)

        sd_installer.start_services()  # Create and start watcher volumedriver, DTL, proxies and StorageDriver services

        # Post creation/extension checkups
        mds_config_set = MDSServiceController.get_mds_storagedriver_config_set(vpool=vp_installer.vpool, offline_nodes=offline_nodes)
        for sr, clients in sr_client_map.iteritems():
            for current_storagedriver in [sd for sd in sr.storagedrivers if sd.vpool_guid == vp_installer.vpool.guid]:
                storagedriver_config = StorageDriverConfiguration(vpool_guid=vp_installer.vpool.guid, storagedriver_id=current_storagedriver.storagedriver_id)
                if storagedriver_config.config_missing is False:
                    # Filesystem section in StorageDriver configuration are all parameters used for vDisks created directly on the filesystem
                    # So when a vDisk gets created on the filesystem, these MDSes will be assigned to them
                    storagedriver_config.configure_filesystem(fs_metadata_backend_mds_nodes=mds_config_set[sr.guid])
                    storagedriver_config.save(client=clients['ovs'])

        # Everything's reconfigured, refresh new cluster configuration
        for current_storagedriver in vp_installer.vpool.storagedrivers:
            if current_storagedriver.storagerouter not in sr_client_map:
                continue
            vp_installer.vpool.storagedriver_client.update_cluster_node_configs(str(current_storagedriver.storagedriver_id), req_timeout_secs=10)
    except Exception:
        cls._logger.exception('vPool {0}: Creation failed'.format(vp_installer.name))
        vp_installer.update_status(status=VPool.STATUSES.FAILURE)
        raise

    # When a node is offline, we can run into errors, but also when 1 or more volumes are not running
    # Scheduled tasks below, so don't really care whether they succeed or not
    # Only Exception is caught (SystemExit/KeyboardInterrupt still propagate) and failures are logged instead of silently dropped
    try:
        VDiskController.dtl_checkup(vpool_guid=vp_installer.vpool.guid, ensure_single_timeout=600)
    except Exception:
        cls._logger.exception('vPool {0}: DTL checkup failed'.format(vp_installer.name))
    for vdisk in vp_installer.vpool.vdisks:
        try:
            MDSServiceController.ensure_safety(vdisk_guid=vdisk.guid)
        except Exception:
            cls._logger.exception('vPool {0}: Ensuring MDS safety failed for vDisk {1}'.format(vp_installer.name, vdisk.guid))
    vp_installer.update_status(status=VPool.STATUSES.RUNNING)
    cls._logger.info('Add vPool {0} ended successfully'.format(vp_installer.name))
def clone(machineguid, timestamp, name):
    """
    Clone a vMachine, using the vDisk snapshots which were taken at the given snapshot timestamp
    A new vMachine is modeled first, then every vDisk is cloned and finally the hypervisor is asked to clone the vm.
    When cloning the vDisks or the vm fails, the newly modeled vMachine is deleted again and the error is re-raised
    :param machineguid: guid of the machine to clone
    :param timestamp: timestamp of the disk snapshots to use for the clone
    :param name: name for the new machine
    :raises RuntimeError: When the timestamp does not match a snapshot of the vMachine
    :return: guid of the newly created vMachine
    """
    machine = VMachine(machineguid)
    timestamp = str(timestamp)
    known_timestamps = [snap['timestamp'] for snap in machine.snapshots]
    if timestamp not in known_timestamps:
        raise RuntimeError('Invalid timestamp provided, not a valid snapshot of this vmachine.')

    # The vPool is only looked up for VMWARE, the StorageRouter for every hypervisor type
    vpool = None
    if machine.pmachine is not None and machine.pmachine.hvtype == 'VMWARE':
        vpool = next((vdisk.vpool for vdisk in machine.vdisks if vdisk.vpool is not None), None)
    storagerouter = None
    for vdisk in machine.vdisks:
        if not vdisk.storagerouter_guid:
            continue
        storagerouter = StorageRouter(vdisk.storagerouter_guid)
        break

    hv = Factory.get(machine.pmachine)
    if storagerouter is not None:
        vm_path = hv.get_vmachine_path(name, storagerouter.machine_id)
    else:
        vm_path = hv.get_vmachine_path(name, '')
    # mutex in sync_with_hypervisor uses "None" for KVM hvtype
    mutex_suffix = vpool.guid if vpool is not None else 'none'
    mutex = volatile_mutex('{0}_{1}'.format(hv.clean_vmachine_filename(vm_path), mutex_suffix))

    # Map every vDisk guid onto its snapshot guid for the requested timestamp
    disks = {}
    for snapshot in machine.snapshots:
        if snapshot['timestamp'] != timestamp:
            continue
        for diskguid, snapshotguid in snapshot['snapshots'].iteritems():
            disks[diskguid] = snapshotguid

    # Model the new vMachine in a locked context
    try:
        mutex.acquire(wait=120)
        new_machine = VMachine()
        new_machine.copy(machine)
        new_machine.name = name
        new_machine.devicename = hv.clean_vmachine_filename(vm_path)
        new_machine.pmachine = machine.pmachine
        new_machine.save()
    finally:
        mutex.release()

    new_disk_guids = []
    vm_disks = []
    mountpoint = None
    disks_by_order = sorted(machine.vdisks, key=lambda x: x.order)
    try:
        for source_disk in disks_by_order:
            # vTemplates use the template snapshot of the disk when there is one
            use_template_snapshot = machine.is_vtemplate and source_disk.templatesnapshot
            if use_template_snapshot:
                snapshotid = source_disk.templatesnapshot
            else:
                snapshotid = disks[source_disk.guid]
            prefix = '%s-clone' % source_disk.name

            result = VDiskController.clone(diskguid=source_disk.guid,
                                           snapshotid=snapshotid,
                                           devicename=prefix,
                                           pmachineguid=new_machine.pmachine_guid,
                                           machinename=new_machine.name,
                                           machineguid=new_machine.guid)
            new_disk_guids.append(result['diskguid'])
            mountpoint = StorageDriverList.get_by_storagedriver_id(source_disk.storagedriver_id).mountpoint
            vm_disks.append(result)
    except Exception as ex:
        VMachineController._logger.error('Failed to clone disks. {0}'.format(ex))
        VMachineController.delete(machineguid=new_machine.guid)
        raise

    try:
        result = hv.clone_vm(machine.hypervisor_id, name, vm_disks, mountpoint)
    except Exception as ex:
        VMachineController._logger.error('Failed to clone vm. {0}'.format(ex))
        VMachineController.delete(machineguid=new_machine.guid)
        raise

    # Store the value returned by the hypervisor as hypervisor_id, again in a locked context
    try:
        mutex.acquire(wait=120)
        new_machine.hypervisor_id = result
        new_machine.save()
    finally:
        mutex.release()
    return new_machine.guid
def update_vmachine_config(vmachine, vm_object, pmachine=None):
    """
    Update a vMachine configuration with a given vMachine configuration
    Besides the vMachine itself, the vDisks listed in vm_object['disks'] which are located on a known datastore
    are linked to the vMachine; vDisks of the vMachine which are no longer listed get detached.
    Events are fired when the vMachine is created or renamed and when a vDisk is attached or detached
    :param vmachine: Virtual Machine to update
    :param vm_object: New virtual machine info (reads the keys 'name', 'id', 'backing', 'disks' and 'datastores')
    :param pmachine: Physical machine of the virtual machine (left untouched when None)
    :return: None
    """
    try:
        vdisks_synced = 0
        if vmachine.name is None:
            MessageController.fire(MessageController.Type.EVENT,
                                   {'type': 'vmachine_created',
                                    'metadata': {'name': vm_object['name']}})
        elif vmachine.name != vm_object['name']:
            MessageController.fire(MessageController.Type.EVENT,
                                   {'type': 'vmachine_renamed',
                                    'metadata': {'old_name': vmachine.name,
                                                 'new_name': vm_object['name']}})
        if pmachine is not None:
            vmachine.pmachine = pmachine
        vmachine.name = vm_object['name']
        vmachine.hypervisor_id = vm_object['id']
        vmachine.devicename = vm_object['backing']['filename']
        vmachine.save()

        # Updating and linking disks
        # Datastores are identified by '<storage_ip>:<mountpoint>' of their StorageDriver
        datastores = {'{0}:{1}'.format(storagedriver.storage_ip, storagedriver.mountpoint): storagedriver
                      for storagedriver in StorageDriverList.get_storagedrivers()}
        vdisk_guids = set()  # Only used for membership tests when detaching below
        mutex = volatile_mutex('{0}_{1}'.format(vmachine.name, vmachine.devicename))
        for disk in vm_object['disks']:
            ensure_safety = False
            if disk['datastore'] not in vm_object['datastores']:
                continue
            datastore = vm_object['datastores'][disk['datastore']]
            if datastore not in datastores:
                continue
            try:
                mutex.acquire(wait=10)
                vdisk = VDiskList.get_by_devicename_and_vpool(disk['filename'], datastores[datastore].vpool)
                if vdisk is None:
                    # The disk couldn't be located, but is in our datastore. We might be in a recovery scenario
                    vdisk = VDisk()
                    vdisk.vpool = datastores[datastore].vpool
                    vdisk.reload_client()
                    vdisk.devicename = disk['filename']
                    vdisk.volume_id = vdisk.storagedriver_client.get_volume_id(str(disk['backingfilename']))
                    vdisk.size = vdisk.info['volume_size']
                    vdisk.metadata = {'lba_size': vdisk.info['lba_size'],
                                      'cluster_multiplier': vdisk.info['cluster_multiplier']}
                    # Create the disk in a locked context, but don't execute long running-task in same context
                    vdisk.save()
                    ensure_safety = True
            finally:
                mutex.release()
            if ensure_safety:
                MDSServiceController.ensure_safety(vdisk)
                VDiskController.dtl_checkup(vdisk_guid=vdisk.guid)

            # Update the disk with information from the hypervisor
            if vdisk.vmachine is None:
                MessageController.fire(MessageController.Type.EVENT,
                                       {'type': 'vdisk_attached',
                                        'metadata': {'vmachine_name': vmachine.name,
                                                     'vdisk_name': disk['name']}})
            vdisk.vmachine = vmachine
            vdisk.name = disk['name']
            vdisk.order = disk['order']
            vdisk.save()
            vdisk_guids.add(vdisk.guid)
            vdisks_synced += 1

        # Detach the vDisks which were not (re)linked above
        for vdisk in vmachine.vdisks:
            if vdisk.guid not in vdisk_guids:
                MessageController.fire(MessageController.Type.EVENT,
                                       {'type': 'vdisk_detached',
                                        'metadata': {'vmachine_name': vmachine.name,
                                                     'vdisk_name': vdisk.name}})
                vdisk.vmachine = None
                vdisk.save()

        VMachineController._logger.info('Updating vMachine finished (name {0}, {1} vdisks (re)linked)'.format(
            vmachine.name, vdisks_synced
        ))
    except Exception as ex:
        # A failed update is logged as an error before the exception is re-raised to the caller
        VMachineController._logger.error('Error during vMachine update: {0}'.format(str(ex)))
        raise