def extend_cluster(master_ip, new_ip, cluster_name, base_dir, locked=True, filesystem=False, ports=None):
    """
    Extends a cluster to a given new node

    When the new node is already part of the loaded cluster configuration, no node entry is added and the
    ports registered for that node are returned (a value passed in `ports` is ignored in that case).
    :param master_ip: IP of one of the already existing nodes
    :type master_ip: str
    :param new_ip: IP address of the node to be added
    :type new_ip: str
    :param cluster_name: Name of the cluster to be extended
    :type cluster_name: str
    :param base_dir: Base directory that will hold the db and tlogs
    :type base_dir: str
    :param locked: Indicates whether the extend should run in a locked context (e.g. to prevent port conflicts)
    :type locked: bool
    :param filesystem: Indicates whether the configuration should be on the filesystem or in a configuration cluster
    :type filesystem: bool
    :param ports: A list of ports to be used for this cluster's node (client port first, messaging port second)
    :type ports: list
    :return: Ports used by arakoon cluster for the node and the IPs of all nodes in the configuration
    :rtype: dict
    """
    ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2}'.format(cluster_name, master_ip, new_ip))
    base_dir = base_dir.rstrip('/')

    config = ArakoonClusterConfig(cluster_name, filesystem)
    config.load_config(master_ip)

    client = SSHClient(new_ip, username=ArakoonInstaller.SSHCLIENT_USER)
    node_name = System.get_my_machine_id(client)

    home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
    tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
    ArakoonInstaller.clean_leftover_arakoon_data(new_ip, [home_dir, tlog_dir])

    port_mutex = None
    try:
        if locked is True:
            from ovs.extensions.generic.volatilemutex import volatile_mutex
            port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(new_ip))
            port_mutex.acquire(wait=60)

        existing_node = next((node for node in config.nodes if node.name == node_name), None)
        if existing_node is not None:
            # The configuration is left untouched for a known node, so the ports reported to the caller
            # must be the ones already registered, not newly discovered (or requested) ones
            ports = [existing_node.client_port, existing_node.messaging_port]
        else:
            if ports is None:
                ports = ArakoonInstaller._get_free_ports(client)
            config.nodes.append(ArakoonNodeConfig(name=node_name,
                                                  ip=new_ip,
                                                  client_port=ports[0],
                                                  messaging_port=ports[1],
                                                  log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                                  crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                                  home=home_dir,
                                                  tlog_dir=tlog_dir))
        ArakoonInstaller._deploy(config, filesystem=filesystem)
    finally:
        if port_mutex is not None:
            port_mutex.release()

    ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2} completed'.format(cluster_name, master_ip, new_ip))
    return {'client_port': ports[0],
            'messaging_port': ports[1],
            'ips': [node.ip for node in config.nodes]}
def extend_cluster(master_ip, new_ip, cluster_name, base_dir, locked=True):
    """
    Extends a cluster to a given new node

    When the new node is already part of the loaded cluster configuration, no node entry is added and the
    ports registered for that node are returned instead of newly discovered free ports.
    :param master_ip: IP of one of the already existing nodes
    :type master_ip: str
    :param new_ip: IP address of the node to be added
    :type new_ip: str
    :param cluster_name: Name of the cluster to be extended
    :type cluster_name: str
    :param base_dir: Base directory that will hold the db and tlogs
    :type base_dir: str
    :param locked: Indicates whether the extend should run in a locked context (e.g. to prevent port conflicts)
    :type locked: bool
    :return: Ports used by arakoon cluster for the node
    :rtype: dict
    """
    ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2}'.format(cluster_name, master_ip, new_ip))
    base_dir = base_dir.rstrip('/')

    config = ArakoonClusterConfig(cluster_name)
    config.load_config()

    client = SSHClient(new_ip, username=ArakoonInstaller.SSHCLIENT_USER)
    node_name = System.get_my_machine_id(client)

    home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
    tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
    ArakoonInstaller.clean_leftover_arakoon_data(new_ip, [home_dir, tlog_dir])

    port_mutex = None
    try:
        if locked is True:
            from ovs.extensions.generic.volatilemutex import volatile_mutex
            port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(new_ip))
            port_mutex.acquire(wait=60)

        existing_node = next((node for node in config.nodes if node.name == node_name), None)
        if existing_node is not None:
            # The configuration is left untouched for a known node, so the ports reported to the caller
            # must be the ones already registered, not newly discovered ones
            ports = [existing_node.client_port, existing_node.messaging_port]
        else:
            ports = ArakoonInstaller._get_free_ports(client)
            config.nodes.append(ArakoonNodeConfig(name=node_name,
                                                  ip=new_ip,
                                                  client_port=ports[0],
                                                  messaging_port=ports[1],
                                                  log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                                  crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                                  home=home_dir,
                                                  tlog_dir=tlog_dir))
        ArakoonInstaller._deploy(config)
    finally:
        if port_mutex is not None:
            port_mutex.release()

    ArakoonInstaller._logger.debug('Extending cluster {0} from {1} to {2} completed'.format(cluster_name, master_ip, new_ip))
    return {'client_port': ports[0],
            'messaging_port': ports[1]}
def create_cluster(cluster_name, cluster_type, ip, base_dir, plugins=None, locked=True, internal=True, claim=False):
    """
    Always creates a cluster but marks its usage according to the internal flag

    If the Arakoon service for this cluster is already running on the node and the stored configuration contains
    a node with the given IP, the ports of that node are returned and nothing is created.
    :param cluster_name: Name of the cluster
    :type cluster_name: str
    :param cluster_type: Type of the cluster (See ServiceType.ARAKOON_CLUSTER_TYPES)
    :type cluster_type: str
    :param ip: IP address of the first node of the new cluster
    :type ip: str
    :param base_dir: Base directory that should contain the data and tlogs
    :type base_dir: str
    :param plugins: Plugins that should be added to the configuration file
    :type plugins: list
    :param locked: Indicates whether the create should run in a locked context (e.g. to prevent port conflicts)
    :type locked: bool
    :param internal: Is cluster internally managed by OVS
    :type internal: bool
    :param claim: Claim the cluster right away
    :type claim: bool
    :return: Ports used by arakoon cluster (and the cluster metadata when a new cluster was created)
    :rtype: dict
    :raises ValueError: When the cluster type is not supported or a cluster with this name already exists
    """
    # Input validation
    if cluster_type not in ServiceType.ARAKOON_CLUSTER_TYPES:
        supported_types = ', '.join(ServiceType.ARAKOON_CLUSTER_TYPES)
        raise ValueError('Cluster type {0} is not supported. Please choose from {1}'.format(cluster_type, supported_types))
    etcd_dir = '/ovs/arakoon/{0}'.format(cluster_name)
    if EtcdConfiguration.dir_exists(etcd_dir):
        raise ValueError('An Arakoon cluster with name "{0}" already exists'.format(cluster_name))

    logger = ArakoonInstaller._logger
    logger.debug('Creating cluster {0} on {1}'.format(cluster_name, ip))
    base_dir = base_dir.rstrip('/')

    ssh_client = SSHClient(ip, username=ArakoonInstaller.SSHCLIENT_USER)
    if ArakoonInstaller.is_running(cluster_name, ssh_client):
        # Service is already up: report the ports of the node registered for this IP, if any
        logger.info('Arakoon service running for cluster {0}'.format(cluster_name))
        running_config = ArakoonClusterConfig(cluster_name, plugins)
        running_config.load_config()
        for known_node in running_config.nodes:
            if known_node.ip != ip:
                continue
            return {'client_port': known_node.client_port,
                    'messaging_port': known_node.messaging_port}

    machine_id = System.get_my_machine_id(ssh_client)
    home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(base_dir, cluster_name)
    tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(base_dir, cluster_name)
    ArakoonInstaller.clean_leftover_arakoon_data(ip, [home_dir, tlog_dir])

    ports_lock = None
    try:
        if locked is True:
            from ovs.extensions.generic.volatilemutex import volatile_mutex
            ports_lock = volatile_mutex('arakoon_install_ports_{0}'.format(ip))
            ports_lock.acquire(wait=60)

        ports = ArakoonInstaller._get_free_ports(ssh_client)
        config = ArakoonClusterConfig(cluster_name, plugins)
        first_node = ArakoonNodeConfig(name=machine_id,
                                       ip=ip,
                                       client_port=ports[0],
                                       messaging_port=ports[1],
                                       log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                       crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                       home=home_dir,
                                       tlog_dir=tlog_dir)
        config.nodes.append(first_node)
        ArakoonInstaller._deploy(config)

        # Register how the cluster is going to be used
        metadata = ArakoonClusterMetadata(cluster_id=cluster_name)
        metadata.internal = internal
        metadata.cluster_type = cluster_type.upper()
        metadata.write()
        if claim is True:
            metadata.claim()
    finally:
        if ports_lock is not None:
            ports_lock.release()

    logger.debug('Creating cluster {0} on {1} completed'.format(cluster_name, ip))
    return {'metadata': metadata,
            'client_port': ports[0],
            'messaging_port': ports[1]}
def _voldrv_arakoon_checkup(create_cluster):
    """
    Makes sure the 'voldrv' Arakoon cluster is registered in the model and, when internally managed,
    extended to every master StorageRouter that has a DB role partition

    Steps, in order:
        * Collect the Arakoon services already modelled for the cluster name stored under
          '/ovs/framework/arakoon_clusters' -> 'voldrv'
        * When create_cluster is True and no such service exists: claim an unused cluster of type SD or,
          when none is available, create a new 'voldrv' cluster on a master StorageRouter with a DB role
        * When the cluster is internal and not yet present on all eligible masters: extend it to them
    :param create_cluster: Only when exactly True a cluster is claimed/created if no service is modelled yet
    :type create_cluster: bool
    :return: None
    :rtype: NoneType
    :raises RuntimeError: When a cluster must be created but no master StorageRouter has a DB role
    """
    def _add_service(service_storagerouter, arakoon_ports, service_name):
        """
        Models and saves a new Arakoon service
        :param service_storagerouter: StorageRouter the service runs on (None is passed for a claimed, externally managed cluster)
        :param arakoon_ports: Ports used by the service
        :param service_name: Name of the service
        :return: The newly saved Service
        """
        new_service = Service()
        new_service.name = service_name
        new_service.type = service_type  # Taken from the enclosing scope, resolved before the first call
        new_service.ports = arakoon_ports
        new_service.storagerouter = service_storagerouter
        new_service.save()
        return new_service

    # Inventory of what is modelled right now for the configured cluster name
    current_ips = []  # IPs of the StorageRouters on which an internal service of the cluster is modelled
    current_services = []  # All modelled services (internal or not) matching the cluster's service name
    service_type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
    cluster_name = Configuration.get('/ovs/framework/arakoon_clusters').get('voldrv')
    if cluster_name is not None:
        arakoon_service_name = ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)
        for service in service_type.services:
            if service.name == arakoon_service_name:
                current_services.append(service)
                if service.is_internal is True:
                    current_ips.append(service.storagerouter.ip)

    # NOTE(review): all_sr_ips is filled with slave and master IPs but is not read again within this function
    all_sr_ips = [storagerouter.ip for storagerouter in StorageRouterList.get_slaves()]
    # Maps every master StorageRouter having at least 1 DB role partition to the first of those partitions
    available_storagerouters = {}
    for storagerouter in StorageRouterList.get_masters():
        storagerouter.invalidate_dynamics(['partition_config'])
        if len(storagerouter.partition_config[DiskPartition.ROLES.DB]) > 0:
            available_storagerouters[storagerouter] = DiskPartition(storagerouter.partition_config[DiskPartition.ROLES.DB][0])
        all_sr_ips.append(storagerouter.ip)

    if create_cluster is True and len(current_services) == 0:  # Create new cluster
        metadata = ArakoonInstaller.get_unused_arakoon_metadata_and_claim(cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD)
        if metadata is None:  # No externally managed cluster found, we create 1 ourselves
            if not available_storagerouters:
                raise RuntimeError('Could not find any Storage Router with a DB role')

            # First entry of the dict; which StorageRouter that is depends on dict ordering
            storagerouter, partition = available_storagerouters.items()[0]
            arakoon_voldrv_cluster = 'voldrv'
            arakoon_installer = ArakoonInstaller(cluster_name=arakoon_voldrv_cluster)
            arakoon_installer.create_cluster(cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD,
                                             ip=storagerouter.ip,
                                             base_dir=partition.folder,
                                             log_sinks=LogHandler.get_sink_path('arakoon-server_{0}'.format(arakoon_voldrv_cluster)),
                                             crash_log_sinks=LogHandler.get_sink_path('arakoon-server-crash_{0}'.format(arakoon_voldrv_cluster)))
            arakoon_installer.start_cluster()
            ports = arakoon_installer.ports[storagerouter.ip]
            metadata = arakoon_installer.metadata
            current_ips.append(storagerouter.ip)
        else:
            # A cluster was claimed: the service is modelled without ports and without StorageRouter
            ports = []
            storagerouter = None

        # Store the cluster name, configure the volumedriver and model the (first) service
        cluster_name = metadata['cluster_name']
        Configuration.set('/ovs/framework/arakoon_clusters|voldrv', cluster_name)
        StorageDriverController._logger.info('Claiming {0} managed arakoon cluster: {1}'.format('externally' if storagerouter is None else 'internally', cluster_name))
        StorageDriverController._configure_arakoon_to_volumedriver(cluster_name=cluster_name)
        current_services.append(_add_service(service_storagerouter=storagerouter,
                                             arakoon_ports=ports,
                                             service_name=ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name)))

    # Re-read the cluster name, it may just have been stored above
    cluster_name = Configuration.get('/ovs/framework/arakoon_clusters').get('voldrv')
    if cluster_name is None:
        return
    metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=cluster_name)
    # Only internal clusters which do not yet cover every eligible master are extended
    if 0 < len(current_services) < len(available_storagerouters) and metadata['internal'] is True:
        for storagerouter, partition in available_storagerouters.iteritems():
            if storagerouter.ip in current_ips:
                continue  # Cluster already has a node on this StorageRouter
            arakoon_installer = ArakoonInstaller(cluster_name=cluster_name)
            arakoon_installer.load()
            arakoon_installer.extend_cluster(new_ip=storagerouter.ip,
                                             base_dir=partition.folder,
                                             log_sinks=LogHandler.get_sink_path('arakoon-server_{0}'.format(cluster_name)),
                                             crash_log_sinks=LogHandler.get_sink_path('arakoon-server-crash_{0}'.format(cluster_name)))
            _add_service(service_storagerouter=storagerouter,
                         arakoon_ports=arakoon_installer.ports[storagerouter.ip],
                         service_name=ArakoonInstaller.get_service_name_for_cluster(cluster_name=cluster_name))
            current_ips.append(storagerouter.ip)
            arakoon_installer.restart_cluster_after_extending(new_ip=storagerouter.ip)
        # Push the (possibly extended) cluster layout to the volumedriver configuration
        StorageDriverController._configure_arakoon_to_volumedriver(cluster_name=cluster_name)
def execute_scrub_work(queue, vpool, scrub_info, error_messages):
    """
    Deploys (or re-uses) a scrub ALBA proxy on the StorageRouter, scrubs every vDisk found in the queue and
    removes the proxy again afterwards
    Failures are logged and appended to error_messages, they are not raised to the caller
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information: `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter that needs to do the work
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled (by reference)
    :type error_messages: list
    :return: None
    :rtype: NoneType
    """
    def _load_mds_configs(target_vdisk):
        # Refresh the 'info' dynamic and return the MDS configuration list, which may not be empty
        target_vdisk.invalidate_dynamics('info')
        mds_configs = target_vdisk.info['metadata_backend_config']
        if len(mds_configs) == 0:
            raise RuntimeError('Could not load MDS configuration')
        return mds_configs

    logger = ScheduledTaskController._logger
    mutex_name = 'ovs_albaproxy_scrub'
    mutex_timeout = 5 * 60
    root_client = None
    storagerouter = scrub_info['storage_router']
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

    # Phase 1: deploy a proxy
    try:
        with file_mutex(name=mutex_name, wait=mutex_timeout):
            logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            root_client = SSHClient(storagerouter, 'root')
            root_client.dir_create(scrub_directory)
            root_client.dir_chmod(scrub_directory, 0o777)  # Celery task executed by 'ovs' user and should be able to write in it
            proxy_is_running = ServiceManager.has_service(name=alba_proxy_service, client=root_client) is True and \
                ServiceManager.get_service_status(name=alba_proxy_service, client=root_client) is True
            if not proxy_is_running:
                machine_id = System.get_my_machine_id(root_client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                free_ports = System.get_free_ports(selected_range=port_range, nr=1, client=root_client)
                port = free_ports[0]
                # Scrub config
                # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                #  u'fragment_cache': [u'none'],
                #  u'ips': [u'127.0.0.1'],
                #  u'log_level': u'info',
                #  u'manifest_cache_size': 17179869184,
                #  u'port': 0,
                #  u'transport': u'tcp'}

                # Backend config
                # {u'alba_connection_host': u'10.100.193.155',
                #  u'alba_connection_port': 26204,
                #  u'alba_connection_preset': u'preset',
                #  u'alba_connection_timeout': 15,
                #  u'alba_connection_transport': u'TCP',
                #  u'backend_interface_retries_on_error': 5,
                #  u'backend_interface_retry_backoff_multiplier': 2.0,
                #  u'backend_interface_retry_interval_secs': 1,
                #  u'backend_type': u'ALBA'}
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                ServiceManager.add_service(name='ovs-albaproxy', params=params, client=root_client, target_name=alba_proxy_service)
                ServiceManager.start_service(name=alba_proxy_service, client=root_client)
                logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            else:
                logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)

            # Point a copy of the backend connection settings to the local scrub proxy
            host_config_key = 'ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id)
            backend_config = Configuration.get(host_config_key)['backend_connection_manager']
            backend_config['alba_connection_host'] = '127.0.0.1'
            backend_config['alba_connection_port'] = scrub_config['port']
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        logger.exception(message)
        # Roll back whatever part of the proxy got deployed
        if root_client is not None and ServiceManager.has_service(name=alba_proxy_service, client=root_client) is True:
            if ServiceManager.get_service_status(name=alba_proxy_service, client=root_client) is True:
                ServiceManager.stop_service(name=alba_proxy_service, client=root_client)
            ServiceManager.remove_service(name=alba_proxy_service, client=root_client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)

    # Phase 2: empty the queue with vDisks to scrub
    try:
        with remote(storagerouter.ip, [VDisk]) as rem:
            while True:
                vdisk = None
                vdisk_guid = queue.get(False)  # Raises Empty once the queue is drained, which ends the loop
                try:
                    # Check MDS master is local. Trigger MDS handover if necessary
                    vdisk = rem.VDisk(vdisk_guid)
                    logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                    mds_configs = _load_mds_configs(vdisk)
                    storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                    if mds_configs[0].get('ip') != storagedriver.storagerouter.ip:
                        logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                        MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                        mds_configs = _load_mds_configs(vdisk)
                        if mds_configs[0].get('ip') != storagedriver.storagerouter.ip:
                            logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                            continue

                    # Do the actual scrubbing
                    with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                        logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                        work_units = locked_client.get_scrubbing_workunits()
                        for unit in work_units:
                            scrub_result = locked_client.scrub(work_unit=unit,
                                                               scratch_dir=scrub_directory,
                                                               log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                               backend_config=Configuration.get_configuration_path(backend_config_key))
                            locked_client.apply_scrubbing_result(scrubbing_work_result=scrub_result)
                        if not work_units:
                            logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                        else:
                            logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                except Exception:
                    # vdisk still being None means the lookup of the vDisk itself failed
                    if vdisk is not None:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                    else:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                    error_messages.append(message)
                    logger.exception(message)
    except Empty:  # Raised when all items have been fetched from the queue
        logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
        error_messages.append(message)
        logger.exception(message)

    # Phase 3: delete the proxy again
    try:
        with file_mutex(name=mutex_name, wait=mutex_timeout):
            logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            root_client = SSHClient(storagerouter, 'root')
            root_client.dir_delete(scrub_directory)
            if ServiceManager.has_service(alba_proxy_service, client=root_client):
                ServiceManager.stop_service(alba_proxy_service, client=root_client)
                ServiceManager.remove_service(alba_proxy_service, client=root_client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        logger.exception(message)
def _deploy_stack_and_scrub(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info
    Deploys (or re-uses) a scrub ALBA proxy, spawns the scrub threads, waits for them and removes the proxy again
    Failures are logged and appended to error_messages, they are not raised to the caller
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information:
                           `scrub_path` with the path where to scrub
                           `storage_router` with the StorageRouter that needs to do the work
                           `partition_guid` used to make the proxy service, scrub directory and configuration keys unique
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled (by reference)
    :type error_messages: list
    :return: None
    :rtype: NoneType
    """
    if len(vpool.storagedrivers) == 0 or not vpool.storagedrivers[0].storagedriver_id:
        error_messages.append('vPool {0} does not have any valid StorageDrivers configured'.format(vpool.name))
        return

    service_manager = ServiceFactory.get_manager()
    client = None
    lock_time = 5 * 60
    storagerouter = scrub_info['storage_router']
    partition_guid = scrub_info['partition_guid']
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_{2}_scrub'.format(vpool.name, storagerouter.name, partition_guid)
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, partition_guid)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, partition_guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, partition_guid)

    # Validate the configured amount of threads BEFORE anything gets deployed. Bailing out on an invalid value
    # after the deployment would leave the proxy service, the scrub directory and the configuration keys behind
    threads_key = '/ovs/framework/hosts/{0}/config|scrub_stack_threads'.format(storagerouter.machine_id)
    amount_threads = Configuration.get(key=threads_key) if Configuration.exists(key=threads_key) else 2
    if not isinstance(amount_threads, int):
        error_messages.append('Amount of threads to spawn must be an integer for StorageRouter with ID {0}'.format(storagerouter.machine_id))
        return

    # Deploy a proxy
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            client.dir_chmod(scrub_directory, 0o777)  # Celery task executed by 'ovs' user and should be able to write in it
            if service_manager.has_service(name=alba_proxy_service, client=client) is True and service_manager.get_service_status(name=alba_proxy_service, client=client) == 'active':
                GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                # The mutex is held until the service is started, so the selected port is effectively in use
                # before another deployment on this StorageRouter gets to look for a free port
                with volatile_mutex('deploy_proxy_for_scrub_{0}'.format(storagerouter.guid), wait=30):
                    port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                    scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                    scrub_config['port'] = port
                    scrub_config['transport'] = 'tcp'
                    Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                    params = {'VPOOL_NAME': vpool.name,
                              'LOG_SINK': LogHandler.get_sink_path(alba_proxy_service),
                              'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                    service_manager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                    service_manager.start_service(name=alba_proxy_service, client=client)
                GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            # Redirect every ALBA connection of the backend connection manager to the local scrub proxy
            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            if backend_config.get('backend_type') != 'MULTI':
                backend_config['alba_connection_host'] = '127.0.0.1'
                backend_config['alba_connection_port'] = scrub_config['port']
            else:
                # For type MULTI every dict value gets redirected (presumably 1 dict per underlying connection)
                for value in backend_config.itervalues():
                    if isinstance(value, dict):
                        value['alba_connection_host'] = '127.0.0.1'
                        value['alba_connection_port'] = scrub_config['port']
            # Copy backend connection manager information in separate key
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        GenericController._logger.exception(message)
        # Roll back whatever part of the proxy got deployed
        if client is not None and service_manager.has_service(name=alba_proxy_service, client=client) is True:
            if service_manager.get_service_status(name=alba_proxy_service, client=client) == 'active':
                service_manager.stop_service(name=alba_proxy_service, client=client)
            service_manager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)
        # NOTE(review): scrub threads are still spawned after a failed deployment (unchanged behavior) - confirm this is intended

    # Execute the actual scrubbing
    threads = []
    amount_threads = max(amount_threads, 1)  # Make sure amount_threads is at least 1
    amount_threads = min(min(queue.qsize(), amount_threads), 20)  # Make sure amount threads is max 20
    GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Spawning {2} threads for proxy service {3}'.format(vpool.name, storagerouter.name, amount_threads, alba_proxy_service))
    for index in range(amount_threads):
        thread = Thread(name='execute_scrub_{0}_{1}_{2}'.format(vpool.guid, partition_guid, index),
                        target=GenericController._execute_scrub,
                        args=(queue, vpool, scrub_info, scrub_directory, error_messages))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if service_manager.has_service(alba_proxy_service, client=client):
                service_manager.stop_service(alba_proxy_service, client=client)
                service_manager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        GenericController._logger.exception(message)
def _execute_scrub(queue, vpool, scrub_info, scrub_dir, error_messages):
    """
    Worker which scrubs the vDisks found in the queue, one by one, until the queue is empty
    Failures are logged and appended to error_messages, they are not raised to the caller
    :param queue: a Queue with vDisk guids that need to be scrubbed
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information, `storage_router` and `partition_guid` are used here
    :type scrub_info: dict
    :param scrub_dir: Directory handed to the scrubber as scratch directory
    :type scrub_dir: str
    :param error_messages: A list of error messages to be filled (by reference)
    :type error_messages: list
    :return: None
    :rtype: NoneType
    """
    def _verify_mds_config(current_vdisk):
        # Refresh the 'info' dynamic and return the MDS configuration list, which may not be empty
        current_vdisk.invalidate_dynamics('info')
        vdisk_configs = current_vdisk.info['metadata_backend_config']
        if len(vdisk_configs) == 0:
            raise RuntimeError('Could not load MDS configuration')
        return vdisk_configs

    storagerouter = scrub_info['storage_router']
    partition_guid = scrub_info['partition_guid']
    volatile_client = VolatileFactory.get_client()
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, partition_guid)
    try:
        # Empty the queue with vDisks to scrub
        with remote(storagerouter.ip, [VDisk]) as rem:
            while True:
                vdisk = None
                lock_acquired = False  # Whether THIS worker registered the vDisk as being scrubbed
                vdisk_guid = queue.get(False)  # Raises Empty Exception when queue is empty, so breaking the while True loop
                volatile_key = 'ovs_scrubbing_vdisk_{0}'.format(vdisk_guid)
                try:
                    # Check MDS master is local. Trigger MDS handover if necessary
                    vdisk = rem.VDisk(vdisk_guid)
                    GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_dir))
                    configs = _verify_mds_config(current_vdisk=vdisk)
                    storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                    if configs[0].get('ip') != storagedriver.storagerouter.ip:
                        GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                        MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            GenericController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                            continue

                    # Check if vDisk is already being scrubbed
                    if volatile_client.add(key=volatile_key, value=volatile_key, time=24 * 60 * 60) is False:
                        GenericController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because vDisk is already being scrubbed'.format(vpool.name, storagerouter.name, vdisk.name))
                        continue
                    lock_acquired = True

                    # Do the actual scrubbing
                    with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                        GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                        work_units = locked_client.get_scrubbing_workunits()
                        for work_unit in work_units:
                            res = locked_client.scrub(work_unit=work_unit,
                                                      scratch_dir=scrub_dir,
                                                      log_sinks=[LogHandler.get_sink_path('scrubber_{0}'.format(vpool.name), allow_override=True, forced_target_type='file')],
                                                      backend_config=Configuration.get_configuration_path(backend_config_key))
                            locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                        if work_units:
                            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                        else:
                            GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                except Exception:
                    # vdisk still being None means the lookup of the vDisk itself failed
                    if vdisk is None:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                    else:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                    error_messages.append(message)
                    GenericController._logger.exception(message)
                finally:
                    # Remove vDisk from volatile memory, but only when this worker put it there. Deleting the key
                    # unconditionally would drop the marker of another worker which is still scrubbing this vDisk
                    if lock_acquired is True:
                        volatile_client.delete(volatile_key)
    except Empty:  # Raised when all items have been fetched from the queue
        GenericController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
        error_messages.append(message)
        GenericController._logger.exception(message)
def test_node_config_checkup(self):
    """
    Checks StorageDriverController.cluster_registry_checkup against a 2-node vPool

    Three consecutive checkups are executed:
        * Initial run: both StorageDrivers get a node config, distance to each other is NEAR
        * Second run: nothing is recorded and no changes are reported
        * Third run: the domain junction of StorageRouter 1 points to domain 2 and an exception is
          registered for 'server_revision' on the config path of host 1; only node '2' is recorded
          and the distances become INFINITE
    """
    base_structure = {'1': {'vrouter_id': '1',
                            'message_host': '10.0.1.1',
                            'message_port': 1,
                            'xmlrpc_host': '10.0.0.1',
                            'xmlrpc_port': 2,
                            'failovercache_host': '10.0.1.1',
                            'failovercache_port': 3,
                            'network_server_uri': 'tcp://10.0.1.1:4',
                            'node_distance_map': None},
                      '2': {'vrouter_id': '2',
                            'message_host': '10.0.1.2',
                            'message_port': 1,
                            'xmlrpc_host': '10.0.0.2',
                            'xmlrpc_port': 2,
                            'failovercache_host': '10.0.1.2',
                            'failovercache_port': 3,
                            'network_server_uri': 'tcp://10.0.1.2:4',
                            'node_distance_map': None}}
    compared_fields = ['vrouter_id', 'message_host', 'message_port', 'xmlrpc_host', 'xmlrpc_port',
                       'failovercache_host', 'failovercache_port', 'network_server_uri', 'node_distance_map']

    def _validate_node_config(_config, _expected_map):
        # Compare every relevant attribute of the node config with the base structure,
        # in which only the distance map differs per scenario
        node_id = _config.vrouter_id
        wanted = copy.deepcopy(base_structure[node_id])
        wanted['node_distance_map'] = _expected_map[node_id]
        found = dict((field, getattr(_config, field)) for field in compared_fields)
        self.assertDictEqual(wanted, found)

    def _checkup_and_verify(_changes, _recorded_ids, _distance):
        # Reset the recordings, run a checkup and verify result, recordings and stored node configs
        StorageRouterClient.node_config_recordings = []
        outcome = StorageDriverController.cluster_registry_checkup()
        self.assertDictEqual(outcome, {vpool.guid: {'success': True, 'changes': _changes}})
        self.assertListEqual(sorted(StorageRouterClient.node_config_recordings), _recorded_ids)
        distance_map = {'1': {'2': _distance},
                        '2': {'1': _distance}}
        for node_config in vpool.clusterregistry_client.get_node_configs():
            _validate_node_config(node_config, distance_map)

    structure = DalHelper.build_dal_structure(
        {'vpools': [1],
         'domains': [1, 2],
         'storagerouters': [1, 2],
         'storagedrivers': [(1, 1, 1), (2, 1, 2)],  # (<id>, <vpool_id>, <storagerouter_id>)
         'storagerouter_domains': [(1, 1, 1, False), (2, 2, 1, False)]}  # (<id>, <storagerouter_id>, <domain_id>, <backup>)
    )
    vpool = structure['vpools'][1]
    storagerouters = structure['storagerouters']

    voldrv_installer = ArakoonInstaller(cluster_name='voldrv')
    voldrv_installer.create_cluster(cluster_type=ServiceType.ARAKOON_CLUSTER_TYPES.SD,
                                    ip=storagerouters[1].ip,
                                    base_dir='/tmp',
                                    log_sinks=LogHandler.get_sink_path('arakoon-server_voldrv'),
                                    crash_log_sinks=LogHandler.get_sink_path('arakoon-server-crash_voldrv'))

    # Initial run, it will now be configured
    _checkup_and_verify(True, ['1', '2'], StorageDriver.DISTANCES.NEAR)

    # Running it again should not change anything
    _checkup_and_verify(False, [], StorageDriver.DISTANCES.NEAR)

    # Validate some error paths
    other_domain = structure['domains'][2]
    junction = storagerouters[1].domains[0]
    junction.domain = other_domain
    junction.save()
    vpool_config_path = 'file://opt/OpenvStorage/config/framework.json?key=/ovs/vpools/{0}/hosts/1/config'.format(vpool.guid)
    StorageRouterClient.exceptions['server_revision'] = {vpool_config_path: Exception('ClusterNotReachableException')}
    _checkup_and_verify(True, ['2'], StorageDriver.DISTANCES.INFINITE)
def create_cluster(cluster_name, cluster_type, ip, base_dir, plugins=None, locked=True, internal=True, filesystem=False, ports=None):
    """
    Creates a new single-node Arakoon cluster; the internal flag is only reflected in the returned metadata
    :param cluster_name: Name of the cluster
    :type cluster_name: str
    :param cluster_type: Type of the cluster (See ServiceType.ARAKOON_CLUSTER_TYPES)
    :type cluster_type: str
    :param ip: IP address of the first node of the new cluster
    :type ip: str
    :param base_dir: Base directory that should contain the data and tlogs (trailing slashes are stripped)
    :type base_dir: str
    :param plugins: Plugins that should be added to the configuration file (keys go to the config, values to the deployment)
    :type plugins: dict
    :param locked: Indicates whether the create should run in a locked context (e.g. to prevent port conflicts)
    :type locked: bool
    :param internal: Is cluster internally managed by OVS
    :type internal: bool
    :param filesystem: Indicates whether the configuration should be on the filesystem or in a configuration cluster
    :type filesystem: bool
    :param ports: A list of ports to be used for this cluster's node; free ports are looked up when omitted
    :type ports: list
    :raises ValueError: When the cluster type is not supported or a cluster with this name already exists
    :return: Dict with keys 'metadata', 'client_port', 'messaging_port' and 'service_metadata'
    :rtype: dict
    """
    supported_types = ServiceType.ARAKOON_CLUSTER_TYPES
    if cluster_type not in supported_types:
        raise ValueError('Cluster type {0} is not supported. Please choose from {1}'.format(cluster_type, ', '.join(supported_types)))

    client = SSHClient(ip, username=ArakoonInstaller.SSHCLIENT_USER)
    # The existence check depends on where the configuration is stored
    if filesystem is True:
        already_present = client.file_exists(ArakoonClusterConfig.CONFIG_FILE.format(cluster_name))
    else:
        already_present = Configuration.dir_exists('/ovs/arakoon/{0}'.format(cluster_name))
    if already_present is True:
        raise ValueError('An Arakoon cluster with name "{0}" already exists'.format(cluster_name))

    ArakoonInstaller._logger.debug('Creating cluster {0} on {1}'.format(cluster_name, ip))

    node_name = System.get_my_machine_id(client)
    stripped_base_dir = base_dir.rstrip('/')
    home_dir = ArakoonInstaller.ARAKOON_HOME_DIR.format(stripped_base_dir, cluster_name)
    tlog_dir = ArakoonInstaller.ARAKOON_TLOG_DIR.format(stripped_base_dir, cluster_name)
    ArakoonInstaller.clean_leftover_arakoon_data(ip, [home_dir, tlog_dir])

    port_mutex = None
    try:
        if locked is True:
            from ovs.extensions.generic.volatilemutex import volatile_mutex
            port_mutex = volatile_mutex('arakoon_install_ports_{0}'.format(ip))
            port_mutex.acquire(wait=60)

        if ports is None:
            ports = ArakoonInstaller._get_free_ports(client)

        plugin_names = None if plugins is None else plugins.keys()
        config = ArakoonClusterConfig(cluster_name, filesystem, plugin_names)
        first_node = ArakoonNodeConfig(name=node_name,
                                       ip=ip,
                                       client_port=ports[0],
                                       messaging_port=ports[1],
                                       log_sinks=LogHandler.get_sink_path('arakoon_server'),
                                       crash_log_sinks=LogHandler.get_sink_path('arakoon_server_crash'),
                                       home=home_dir,
                                       tlog_dir=tlog_dir)
        config.nodes.append(first_node)

        metadata = {'internal': internal,
                    'cluster_name': cluster_name,
                    'cluster_type': cluster_type.upper(),
                    'in_use': False}

        plugin_values = None if plugins is None else plugins.values()
        is_config_cluster = cluster_type == ServiceType.ARAKOON_CLUSTER_TYPES.CFG
        deployed = ArakoonInstaller._deploy(config=config,
                                            filesystem=filesystem,
                                            plugins=plugin_values,
                                            delay_service_registration=is_config_cluster)
        service_metadata = deployed[ip]
    finally:
        # Only release when the locked context was actually entered
        if port_mutex is not None:
            port_mutex.release()

    ArakoonInstaller._logger.debug('Creating cluster {0} on {1} completed'.format(cluster_name, ip))
    return {'metadata': metadata,
            'client_port': ports[0],
            'messaging_port': ports[1],
            'service_metadata': service_metadata}
def promote_node(cluster_ip, master_ip, ip_client_map, unique_id, configure_memcached, configure_rabbitmq):
    """
    Promotes a given node to master
    Steps, in order:
        * Optionally validate that the memcache servers can be reached
        * Mark the StorageRouter as 'MASTER'
        * Extend the Arakoon 'config' cluster to the node (only when no external config is set)
        * Extend the Arakoon OVS DB cluster to the node (only when it is internally managed)
        * Optionally configure memcached and RabbitMQ and register their endpoints
        * Start/restart the services, run the 'nodetype'/'promote' hooks and flag the promote as completed
    :param cluster_ip: IP of the node being promoted (must be a key of ip_client_map)
    :type cluster_ip: str
    :param master_ip: IP of an existing master node (must be a key of ip_client_map)
    :type master_ip: str
    :param ip_client_map: Mapping of IP to client object; the clients are used to run commands and handle files
    :type ip_client_map: dict
    :param unique_id: Machine ID used to look up the StorageRouter that gets promoted
    :type unique_id: str
    :param configure_memcached: Whether memcached should be validated, configured and added to the endpoints
    :type configure_memcached: bool
    :param configure_rabbitmq: Whether RabbitMQ should be configured, clustered and added to the endpoints
    :type configure_rabbitmq: bool
    :raises RuntimeError: When not all memcache nodes can be reached or when no other master node is found
    :return: None
    :rtype: NoneType
    """
    from ovs.dal.lists.storagerouterlist import StorageRouterList
    from ovs.dal.lists.servicetypelist import ServiceTypeList
    from ovs.dal.lists.servicelist import ServiceList
    from ovs.dal.hybrids.service import Service

    Toolbox.log(logger=NodeTypeController._logger, messages='Promoting node', title=True)
    service_manager = ServiceFactory.get_manager()
    if configure_memcached is True:
        if NodeTypeController._validate_local_memcache_servers(ip_client_map) is False:
            raise RuntimeError('Not all memcache nodes can be reached which is required for promoting a node.')

    target_client = ip_client_map[cluster_ip]
    machine_id = System.get_my_machine_id(target_client)
    node_name, _ = target_client.get_hostname()
    master_client = ip_client_map[master_ip]

    storagerouter = StorageRouterList.get_by_machine_id(unique_id)
    storagerouter.node_type = 'MASTER'
    storagerouter.save()

    # Only join the 'config' Arakoon cluster when the configuration is not managed externally
    external_config = Configuration.get('/ovs/framework/external_config')
    if external_config is None:
        Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon configuration cluster')
        arakoon_installer = ArakoonInstaller(cluster_name='config')
        arakoon_installer.load(ip=master_ip)
        arakoon_installer.extend_cluster(new_ip=cluster_ip,
                                         base_dir=Configuration.get('/ovs/framework/paths|ovsdb'),
                                         log_sinks=LogHandler.get_sink_path('arakoon-server_config'),
                                         crash_log_sinks=LogHandler.get_sink_path('arakoon-server-crash_config'))
        arakoon_installer.restart_cluster_after_extending(new_ip=cluster_ip)
        service_manager.register_service(node_name=machine_id,
                                         service_metadata=arakoon_installer.service_metadata[cluster_ip])

    # Find other (arakoon) master nodes
    arakoon_cluster_name = str(Configuration.get('/ovs/framework/arakoon_clusters|ovsdb'))
    arakoon_metadata = ArakoonInstaller.get_arakoon_metadata_by_cluster_name(cluster_name=arakoon_cluster_name)
    config = ArakoonClusterConfig(cluster_id=arakoon_cluster_name)
    master_node_ips = [node.ip for node in config.nodes]
    if cluster_ip in master_node_ips:
        master_node_ips.remove(cluster_ip)
    if len(master_node_ips) == 0:
        raise RuntimeError('There should be at least one other master node')

    arakoon_ports = []
    if arakoon_metadata['internal'] is True:
        Toolbox.log(logger=NodeTypeController._logger, messages='Joining Arakoon OVS DB cluster')
        arakoon_installer = ArakoonInstaller(cluster_name=arakoon_cluster_name)
        arakoon_installer.load()
        arakoon_installer.extend_cluster(new_ip=cluster_ip,
                                         base_dir=Configuration.get('/ovs/framework/paths|ovsdb'),
                                         log_sinks=LogHandler.get_sink_path('arakoon-server_{0}'.format(arakoon_cluster_name)),
                                         crash_log_sinks=LogHandler.get_sink_path('arakoon-server-crash_{0}'.format(arakoon_cluster_name)))
        arakoon_installer.restart_cluster_after_extending(new_ip=cluster_ip)
        arakoon_ports = arakoon_installer.ports[cluster_ip]

    if configure_memcached is True:
        NodeTypeController.configure_memcached(client=target_client, logger=NodeTypeController._logger)
    NodeTypeController.add_services(client=target_client, node_type='master', logger=NodeTypeController._logger)

    Toolbox.log(logger=NodeTypeController._logger, messages='Update configurations')
    if configure_memcached is True:
        endpoints = Configuration.get('/ovs/framework/memcache|endpoints')
        endpoint = '{0}:11211'.format(cluster_ip)
        if endpoint not in endpoints:
            endpoints.append(endpoint)
            Configuration.set('/ovs/framework/memcache|endpoints', endpoints)
    if configure_rabbitmq is True:
        endpoints = Configuration.get('/ovs/framework/messagequeue|endpoints')
        endpoint = '{0}:5672'.format(cluster_ip)
        if endpoint not in endpoints:
            endpoints.append(endpoint)
            Configuration.set('/ovs/framework/messagequeue|endpoints', endpoints)

    if arakoon_metadata['internal'] is True:
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting master node services')
        # Reset the cached stores (NOTE(review): presumably so new clients pick up the extended cluster - confirm)
        PersistentFactory.store = None
        VolatileFactory.store = None

        # Model the 'arakoon-ovsdb' service for this StorageRouter when it is not modelled yet
        if 'arakoon-ovsdb' not in [s.name for s in ServiceList.get_services() if s.is_internal is False or s.storagerouter.ip == cluster_ip]:
            service = Service()
            service.name = 'arakoon-ovsdb'
            service.type = ServiceTypeList.get_by_name(ServiceType.SERVICE_TYPES.ARAKOON)
            service.ports = arakoon_ports
            service.storagerouter = storagerouter
            service.save()

    if configure_rabbitmq is True:
        NodeTypeController.configure_rabbitmq(client=target_client, logger=NodeTypeController._logger)
        # Copy rabbitmq cookie
        rabbitmq_cookie_file = '/var/lib/rabbitmq/.erlang.cookie'

        Toolbox.log(logger=NodeTypeController._logger, messages='Copying RabbitMQ cookie')
        contents = master_client.file_read(rabbitmq_cookie_file)
        master_hostname, _ = master_client.get_hostname()
        target_client.dir_create(os.path.dirname(rabbitmq_cookie_file))
        target_client.file_write(rabbitmq_cookie_file, contents)
        # Owner read-only; the '0o' prefix is valid on both Python 2.6+ and Python 3 (plain '0400' is Python 2 only)
        target_client.file_chmod(rabbitmq_cookie_file, mode=0o400)
        # Join the RabbitMQ cluster of the master node, with a pause after every command
        target_client.run(['rabbitmq-server', '-detached'])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'stop_app'])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'join_cluster', 'rabbit@{0}'.format(master_hostname)])
        time.sleep(5)
        target_client.run(['rabbitmqctl', 'stop'])
        time.sleep(5)

        # Enable HA for the rabbitMQ queues
        Toolbox.change_service_state(target_client, 'rabbitmq-server', 'start', NodeTypeController._logger)
        NodeTypeController.check_rabbitmq_and_enable_ha_mode(client=target_client, logger=NodeTypeController._logger)

    NodeTypeController._configure_amqp_to_volumedriver()

    Toolbox.log(logger=NodeTypeController._logger, messages='Starting services')
    services = ['memcached', 'arakoon-ovsdb', 'rabbitmq-server']
    if arakoon_metadata['internal'] is True:
        # The internally managed cluster has already been restarted after extending it
        services.remove('arakoon-ovsdb')
    for service in services:
        if service_manager.has_service(service, client=target_client):
            Toolbox.change_service_state(target_client, service, 'start', NodeTypeController._logger)

    Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
    NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger)

    # Restart once more when the promote hooks report that they did something
    if Toolbox.run_hooks(component='nodetype',
                         sub_component='promote',
                         logger=NodeTypeController._logger,
                         cluster_ip=cluster_ip,
                         master_ip=master_ip):
        Toolbox.log(logger=NodeTypeController._logger, messages='Restarting services')
        NodeTypeController.restart_framework_and_memcache_services(clients=ip_client_map, logger=NodeTypeController._logger)

    if NodeTypeController.avahi_installed(client=target_client, logger=NodeTypeController._logger) is True:
        NodeTypeController.configure_avahi(client=target_client, node_name=node_name, node_type='master', logger=NodeTypeController._logger)
    Configuration.set('/ovs/framework/hosts/{0}/type'.format(machine_id), 'MASTER')
    target_client.run(['chown', '-R', 'ovs:ovs', '/opt/OpenvStorage/config'])
    Configuration.set('/ovs/framework/hosts/{0}/promotecompleted'.format(machine_id), True)

    if target_client.file_exists('/tmp/ovs_rollback'):
        target_client.file_delete('/tmp/ovs_rollback')

    Toolbox.log(logger=NodeTypeController._logger, messages='Promote complete')
def execute_scrub_work(queue, vpool, scrub_info, error_messages):
    """
    Executes scrub work for a given vDisk queue and vPool, based on scrub_info
    The work is done in 3 phases, each of them catching and reporting its own errors:
        * Deploy (or re-use) an ALBA proxy service on the StorageRouter and store its scrub and backend config
        * Empty the queue: per vDisk make sure the MDS master is local and retrieve, scrub and apply the work units
        * Remove the scrub directory, the ALBA proxy service and its scrub configuration again
    Failures are not raised; they are logged and appended to the error_messages list passed in
    :param queue: a Queue with vDisk guids that need to be scrubbed (they should only be member of a single vPool)
    :type queue: Queue
    :param vpool: the vPool object of the vDisks
    :type vpool: VPool
    :param scrub_info: A dict containing scrub information:
                       `scrub_path` with the path where to scrub and `storage_router` with the StorageRouter that needs to do the work
    :type scrub_info: dict
    :param error_messages: A list of error messages to be filled (modified in place)
    :type error_messages: list
    :return: None, the errors are reported through the error_messages argument
    :rtype: NoneType
    """
    def _verify_mds_config(current_vdisk):
        # Reload the 'info' dynamic and return the MDS configs, the first entry being treated as the master
        current_vdisk.invalidate_dynamics('info')
        vdisk_configs = current_vdisk.info['metadata_backend_config']
        if len(vdisk_configs) == 0:
            raise RuntimeError('Could not load MDS configuration')
        return vdisk_configs

    client = None
    lock_time = 5 * 60
    storagerouter = scrub_info['storage_router']
    scrub_directory = '{0}/scrub_work_{1}_{2}'.format(scrub_info['scrub_path'], vpool.name, storagerouter.name)
    scrub_config_key = 'ovs/vpools/{0}/proxies/scrub/scrub_config_{1}'.format(vpool.guid, storagerouter.guid)
    backend_config_key = 'ovs/vpools/{0}/proxies/scrub/backend_config_{1}'.format(vpool.guid, storagerouter.guid)
    alba_proxy_service = 'ovs-albaproxy_{0}_{1}_scrub'.format(vpool.name, storagerouter.name)

    # Deploy a proxy
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_create(scrub_directory)
            # Celery task executed by 'ovs' user and should be able to write in it
            # The '0o' prefix is valid on both Python 2.6+ and Python 3 (plain '0777' is Python 2 only)
            client.dir_chmod(scrub_directory, 0o777)
            if ServiceManager.has_service(name=alba_proxy_service, client=client) is True and ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Re-using existing proxy service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
                scrub_config = Configuration.get(scrub_config_key)
            else:
                machine_id = System.get_my_machine_id(client)
                port_range = Configuration.get('/ovs/framework/hosts/{0}/ports|storagedriver'.format(machine_id))
                port = System.get_free_ports(selected_range=port_range, nr=1, client=client)[0]
                # Scrub config
                # {u'albamgr_cfg_url': u'arakoon://config/ovs/vpools/71e2f717-f270-4a41-bbb0-d4c8c084d43e/proxies/64759516-3471-4321-b912-fb424568fc5b/config/abm?ini=%2Fopt%2FOpenvStorage%2Fconfig%2Farakoon_cacc.ini',
                #  u'fragment_cache': [u'none'],
                #  u'ips': [u'127.0.0.1'],
                #  u'log_level': u'info',
                #  u'manifest_cache_size': 17179869184,
                #  u'port': 0,
                #  u'transport': u'tcp'}

                # Backend config
                # {u'alba_connection_host': u'10.100.193.155',
                #  u'alba_connection_port': 26204,
                #  u'alba_connection_preset': u'preset',
                #  u'alba_connection_timeout': 15,
                #  u'alba_connection_transport': u'TCP',
                #  u'backend_interface_retries_on_error': 5,
                #  u'backend_interface_retry_backoff_multiplier': 2.0,
                #  u'backend_interface_retry_interval_secs': 1,
                #  u'backend_type': u'ALBA'}
                scrub_config = Configuration.get('ovs/vpools/{0}/proxies/scrub/generic_scrub'.format(vpool.guid))
                scrub_config['port'] = port
                scrub_config['transport'] = 'tcp'
                Configuration.set(scrub_config_key, json.dumps(scrub_config, indent=4), raw=True)

                params = {'VPOOL_NAME': vpool.name,
                          'LOG_SINK': LogHandler.get_sink_path('alba_proxy'),
                          'CONFIG_PATH': Configuration.get_configuration_path(scrub_config_key)}
                ServiceManager.add_service(name='ovs-albaproxy', params=params, client=client, target_name=alba_proxy_service)
                ServiceManager.start_service(name=alba_proxy_service, client=client)
                ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Deployed ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))

            # Point a copy of the backend connection settings of the first StorageDriver to the local proxy
            backend_config = Configuration.get('ovs/vpools/{0}/hosts/{1}/config'.format(vpool.guid, vpool.storagedrivers[0].storagedriver_id))['backend_connection_manager']
            backend_config['alba_connection_host'] = '127.0.0.1'
            backend_config['alba_connection_port'] = scrub_config['port']
            Configuration.set(backend_config_key, json.dumps({"backend_connection_manager": backend_config}, indent=4), raw=True)
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - An error occurred deploying ALBA proxy {2}'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)
        # Roll back whatever got deployed; a running service is stopped before it is removed
        if client is not None and ServiceManager.has_service(name=alba_proxy_service, client=client) is True:
            if ServiceManager.get_service_status(name=alba_proxy_service, client=client) is True:
                ServiceManager.stop_service(name=alba_proxy_service, client=client)
            ServiceManager.remove_service(name=alba_proxy_service, client=client)
        if Configuration.exists(scrub_config_key):
            Configuration.delete(scrub_config_key)

    try:
        # Empty the queue with vDisks to scrub
        with remote(storagerouter.ip, [VDisk]) as rem:
            while True:
                vdisk = None
                # Non-blocking get: raises Empty once the queue is drained, which ends the loop
                vdisk_guid = queue.get(False)
                try:
                    # Check MDS master is local. Trigger MDS handover if necessary
                    vdisk = rem.VDisk(vdisk_guid)
                    ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Started scrubbing at location {3}'.format(vpool.name, storagerouter.name, vdisk.name, scrub_directory))
                    configs = _verify_mds_config(current_vdisk=vdisk)
                    storagedriver = StorageDriverList.get_by_storagedriver_id(vdisk.storagedriver_id)
                    if configs[0].get('ip') != storagedriver.storagerouter.ip:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - MDS master is not local, trigger handover'.format(vpool.name, storagerouter.name, vdisk.name))
                        MDSServiceController.ensure_safety(VDisk(vdisk_guid))  # Do not use a remote VDisk instance here
                        configs = _verify_mds_config(current_vdisk=vdisk)
                        if configs[0].get('ip') != storagedriver.storagerouter.ip:
                            ScheduledTaskController._logger.warning('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Skipping because master MDS still not local'.format(vpool.name, storagerouter.name, vdisk.name))
                            continue

                    # Do the actual scrubbing
                    with vdisk.storagedriver_client.make_locked_client(str(vdisk.volume_id)) as locked_client:
                        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Retrieve and apply scrub work'.format(vpool.name, storagerouter.name, vdisk.name))
                        work_units = locked_client.get_scrubbing_workunits()
                        for work_unit in work_units:
                            res = locked_client.scrub(work_unit=work_unit,
                                                      scratch_dir=scrub_directory,
                                                      log_sinks=[LogHandler.get_sink_path('scrubber', allow_override=True)],
                                                      backend_config=Configuration.get_configuration_path(backend_config_key))
                            locked_client.apply_scrubbing_result(scrubbing_work_result=res)
                        if work_units:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - {3} work units successfully applied'.format(vpool.name, storagerouter.name, vdisk.name, len(work_units)))
                        else:
                            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - No scrubbing required'.format(vpool.name, storagerouter.name, vdisk.name))
                except Exception:
                    # A failure for a single vDisk is recorded, the remaining vDisks are still processed
                    if vdisk is None:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk with guid {2} could not be found'.format(vpool.name, storagerouter.name, vdisk_guid)
                    else:
                        message = 'Scrubber - vPool {0} - StorageRouter {1} - vDisk {2} - Scrubbing failed'.format(vpool.name, storagerouter.name, vdisk.name)
                    error_messages.append(message)
                    ScheduledTaskController._logger.exception(message)

    except Empty:  # Raised when all items have been fetched from the queue
        ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Queue completely processed'.format(vpool.name, storagerouter.name))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Scrubbing failed'.format(vpool.name, storagerouter.name)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)

    # Delete the proxy again
    try:
        with file_mutex(name='ovs_albaproxy_scrub', wait=lock_time):
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removing service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
            client = SSHClient(storagerouter, 'root')
            client.dir_delete(scrub_directory)
            if ServiceManager.has_service(alba_proxy_service, client=client):
                ServiceManager.stop_service(alba_proxy_service, client=client)
                ServiceManager.remove_service(alba_proxy_service, client=client)
            if Configuration.exists(scrub_config_key):
                Configuration.delete(scrub_config_key)
            ScheduledTaskController._logger.info('Scrubber - vPool {0} - StorageRouter {1} - Removed service {2}'.format(vpool.name, storagerouter.name, alba_proxy_service))
    except Exception:
        message = 'Scrubber - vPool {0} - StorageRouter {1} - Removing service {2} failed'.format(vpool.name, storagerouter.name, alba_proxy_service)
        error_messages.append(message)
        ScheduledTaskController._logger.exception(message)