def setUp(self):
    super(TestClientLock, self).setUp()
    self.locker = etcd.Lock(self.client, 'test_lock')
def get_lock(self, *args, **kwargs):
    return etcd.Lock(self, *args, **kwargs)
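A hedged usage sketch for a helper like the one above: assuming get_lock() is defined on a client subclass or mixin (the enclosing class is not shown), call sites can take the lock as a context manager. The lock name is a placeholder.

# Hypothetical call site; client and the lock name are assumptions.
lock = client.get_lock('test_lock')
with lock:
    pass  # protected work; the lock is released on exiting the block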
def run(self):
    args = {}
    vol = NS.gluster.objects.Volume(
        vol_id=self.parameters['Volume.vol_id']).load()
    if self.parameters.get('Volume.replica_count') is not None:
        args.update(
            {"replica_count": self.parameters.get('Volume.replica_count')})
        if vol.replica_count != self.parameters.get('Volume.replica_count'):
            args.update({"decrease_replica_count": True})
    elif self.parameters.get('Volume.disperse_count') is not None:
        args.update({
            "disperse_count": self.parameters.get('Volume.disperse_count')
        })
    else:
        if int(vol.replica_count) > 1:
            args.update({"replica_count": vol.replica_count})
        elif int(vol.disperse_count) > 1:
            args.update({"disperse_count": vol.disperse_count})
    if self.parameters.get('Volume.force') is not None:
        args.update({"force": self.parameters.get('Volume.force')})
    action = self.parameters.get('Volume.action')
    logger.log(
        "info",
        NS.publisher_id,
        {
            "message": "Shrinking the volume %s" %
            self.parameters['Volume.volname']
        },
        job_id=self.parameters["job_id"],
        flow_id=self.parameters["flow_id"],
        integration_id=NS.tendrl_context.integration_id)
    if NS.gdeploy_plugin.shrink_volume(
            self.parameters.get('Volume.volname'),
            self.parameters.get('Volume.bricks'),
            action,
            **args):
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Shrunk the volume %s" %
                self.parameters['Volume.volname']
            },
            job_id=self.parameters["job_id"],
            flow_id=self.parameters["flow_id"],
            integration_id=NS.tendrl_context.integration_id)
        if action != "commit" and "decrease_replica_count" not in args:
            return True
        try:
            # Delete the bricks from the central store.
            # Acquire the lock before deleting the bricks from etcd;
            # we block until we acquire the lock. The lock lives for
            # 60 sec, after which it is released.
            lock = etcd.Lock(NS._int.wclient, 'volume')
            while not lock.is_acquired:
                try:
                    # With a ttl set, acquire() blocks for at most
                    # 60 sec, after which it raises EtcdLockExpired.
                    # If that is raised, we retry the lock.
                    lock.acquire(blocking=True, lock_ttl=60)
                    if lock.is_acquired:
                        # Renew the lock, as we do not know how long we
                        # were blocked before the lock was granted.
                        # NOTE: time spent blocked also counts against
                        # the ttl.
                        lock.acquire(lock_ttl=60)
                except etcd.EtcdLockExpired:
                    continue
            for sub_vol in self.parameters.get('Volume.bricks'):
                for b in sub_vol:
                    brick_name = b.keys()[0] + ":" + \
                        b.values()[0].replace("/", "_")
                    try:
                        NS._int.wclient.delete(
                            "clusters/%s/Volumes/%s/Bricks/%s" % (
                                NS.tendrl_context.integration_id,
                                self.parameters['Volume.vol_id'],
                                brick_name),
                            recursive=True)
                    except etcd.EtcdKeyNotFound:
                        continue
        except Exception:
            raise
        finally:
            lock.release()
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Deleted bricks for volume %s"
                           " from central store" %
                self.parameters['Volume.volname']
            },
            job_id=self.parameters["job_id"],
            flow_id=self.parameters["flow_id"],
            integration_id=NS.tendrl_context.integration_id)
        return True
    else:
        logger.log(
            "error",
            NS.publisher_id,
            {
                "message": "Volume shrink failed for volume %s" %
                self.parameters['Volume.volname']
            },
            job_id=self.parameters["job_id"],
            flow_id=self.parameters["flow_id"],
            integration_id=NS.tendrl_context.integration_id)
        return False
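The acquire/renew/retry idiom in the snippet above can be isolated into a minimal standalone sketch. This assumes a reachable etcd server at 127.0.0.1:2379 and the python-etcd client; the lock name and critical-section body are placeholders.

import etcd

client = etcd.Client(host='127.0.0.1', port=2379)
lock = etcd.Lock(client, 'volume')  # placeholder lock name
while not lock.is_acquired:
    try:
        # With a ttl set, acquire() blocks for at most lock_ttl seconds
        # and raises EtcdLockExpired if the ttl elapses first.
        lock.acquire(blocking=True, lock_ttl=60)
        if lock.is_acquired:
            # Renew immediately: time spent blocked counts against the
            # ttl, so the remaining window may be short.
            lock.acquire(lock_ttl=60)
    except etcd.EtcdLockExpired:
        continue  # retry until the lock is granted
try:
    pass  # critical section: mutate shared etcd keys here
finally:
    lock.release()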
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import etcd

# Initialize the lock object:
# NOTE: this does not acquire a lock yet
client = etcd.Client(host='127.0.0.1', port=2379)
# Or you can set a custom lock prefix; the default is '/_locks/' if you
# are using HEAD
client = etcd.Client(lock_prefix='/my_etcd_root/_locks')
lock = etcd.Lock(client, 'my_lock_name')

# Use the lock object:
lock.acquire(blocking=True,  # will block until the lock is acquired
             lock_ttl=None)  # lock will live until we release it
lock.is_acquired  # True
lock.acquire(lock_ttl=60)  # renew a lock
lock.release()  # release an existing lock
lock.is_acquired  # False

# The lock object may also be used as a context manager:
client = etcd.Client()
with etcd.Lock(client, 'customer1') as my_lock:
    do_stuff()  # i.e., do some work while holding the lock
    my_lock.is_acquired  # True
    my_lock.acquire(lock_ttl=60)
my_lock.is_acquired  # False (released on exiting the with block)
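A non-blocking attempt is also possible; a minimal sketch, assuming the same client as above. With blocking=False the call returns immediately, and is_acquired reports whether the lock was granted.

lock = etcd.Lock(client, 'my_lock_name')
lock.acquire(blocking=False, lock_ttl=60)  # returns immediately
if lock.is_acquired:
    try:
        pass  # we hold the lock; do protected work here
    finally:
        lock.release()
else:
    pass  # someone else holds the lock; back off or retry later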
def __init__(self,
             cluster_controller_started_event=None,
             terminate_event=None):
    """Initialise the ClusterController.

    This sets some variables, registers as a member with ETCD and
    starts the run loop.

    :param cluster_controller_started_event: set once startup finishes
    :param terminate_event: signals the controller to shut down
    """
    self.logger = create_logger(
        name=multiprocessing.current_process().name)
    self.logger.info("Starting Cluster Controller")
    self.state = 'started'
    self.container = os.uname()[1]
    self.instance_id = str(uuid.uuid4())
    self.terminate_event = terminate_event
    self.cluster_controller_started_event = \
        cluster_controller_started_event

    # Check required variables
    if self.etcd_port:
        try:
            self.etcd_port = int(self.etcd_port)
        except ValueError:
            self.logger.error(
                'ETCD port should be a valid integer value.',
                extra={'stack': True})
            self.state = 'stopping'
    if not isinstance(self.etcd_hosts, list) or not isinstance(
            self.etcd_port, int):
        self.logger.error(
            f'No valid ETCD hosts and/or port specified: '
            f'{self.etcd_hosts}:{self.etcd_port}',
            extra={'stack': True})
        self.state = 'stopping'
    if not self.environment or not self.service:
        self.logger.error('Environment and/or service is not set.',
                          extra={'stack': True})
        self.state = 'stopping'
    if self.state == 'stopping':
        self.terminate_controller(exitcode=0)

    # Connect to ETCD
    connected = False
    timeout = time() + 30
    while time() < timeout and not connected:
        for host in self.etcd_hosts:
            self.logger.info(f'Trying to connect to ETCD host {host}',
                             extra={'stack': True})
            try:
                self.etcd_client = func_timeout(
                    5,
                    etcd.Client,
                    args=(),
                    kwargs={
                        'host': host,
                        'port': int(self.etcd_port),
                        'allow_reconnect': True
                    })
                machines = self.etcd_client.machines
                if len(machines) >= 1:
                    connected = True
                    self.logger.info(
                        f'Connected to ETCD machines: {machines}',
                        extra={'stack': True})
                    break
            except FunctionTimedOut:
                self.logger.info(
                    f'Timeout while connecting to ETCD host {host}',
                    extra={'stack': True})
            except etcd.EtcdException as error:
                self.logger.info(f'Unable to connect to ETCD: {error}',
                                 extra={'stack': True})
        sleep(1)

    if not connected:
        self.logger.warning('Unable to connect to ETCD, giving up...',
                            extra={'stack': True})
        self.state = 'stopping'
        self.terminate_controller(exitcode=1)
    else:
        # Set the ETCD lock name, members dir and master key location
        self.lock_name = self.environment + '_' + self.service
        self.members_dir = f"/{self.environment}/{self.service}/members"
        self.master_key = f"/{self.environment}/{self.service}/master"
        self.member_dir = f"{self.members_dir}/{self.instance_id}"
        self.member_state_key = f"{self.member_dir}/state"
        self.member_container_key = f"{self.member_dir}/container"
        self.member_role_key = f"{self.member_dir}/role"
        self.master_lock = etcd.Lock(self.etcd_client, self.lock_name)

        # Start the schedule thread
        self.terminate_schedule_event = self.run_schedule_continously(
            schedule=schedule, interval=1)

        # Try to acquire the master lock.
        master = self.acquire_master_lock()

        # Start the run loop
        self.terminate_run_event = self.run()

        # Run start() to include any child-class startup logic and
        # raise the started event when finished.
        self.start()
        self.cluster_controller_started_event.set()

        if master:
            self.started_as_master()
        else:
            self.started_as_slave()

        # Register in ETCD
        self.etcd_client.write(self.member_state_key, self.state, ttl=60)
        self.etcd_client.write(self.member_role_key, self.role, ttl=60)
        self.etcd_client.write(self.member_container_key,
                               self.container, ttl=60)

        # Keep the main process alive while the terminate event is
        # not set.
        while not self.terminate_event.is_set():
            self.check_active(ports=self.ports)
            sleep(1)
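A minimal sketch of the leader-election idiom the controller uses: whoever acquires the shared lock becomes master, and each member advertises liveness through TTL'd keys that must be re-written before they expire. The host, environment/service name, and key paths below are placeholders.

import etcd

client = etcd.Client(host='127.0.0.1', port=2379, allow_reconnect=True)
master_lock = etcd.Lock(client, 'prod_myservice')  # placeholder name
master_lock.acquire(blocking=False, lock_ttl=60)
role = 'master' if master_lock.is_acquired else 'slave'

# The membership key expires unless refreshed; re-write it from the
# run loop well before the ttl elapses to signal liveness.
client.write('/prod/myservice/members/instance-1/role', role, ttl=60)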
def brick_status_alert(hostname):
    try:
        # fetch brick details of the disconnected node
        lock = None
        path = "clusters/%s/Bricks/all/%s" % (
            NS.tendrl_context.integration_id,
            hostname
        )
        lock = etcd.Lock(
            NS._int.client,
            path
        )
        lock.acquire(
            blocking=True,
            lock_ttl=60
        )
        if lock.is_acquired:
            bricks = NS.gluster.objects.Brick(
                fqdn=hostname
            ).load_all()
            for brick in bricks:
                if brick.status.lower() == BRICK_STARTED:
                    # raise an alert for the brick
                    msg = (
                        "Status of brick: %s "
                        "under volume %s in cluster %s "
                        "changed from %s to %s") % (
                        brick.brick_path,
                        brick.vol_name,
                        NS.tendrl_context.integration_id,
                        BRICK_STARTED.title(),
                        BRICK_STOPPED.title()
                    )
                    instance = "volume_%s|brick_%s" % (
                        brick.vol_name,
                        brick.brick_path,
                    )
                    event_utils.emit_event(
                        "brick_status",
                        BRICK_STOPPED.title(),
                        msg,
                        instance,
                        'WARNING',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": brick.vol_name,
                              "node_id": brick.node_id,
                              "fqdn": brick.hostname}
                    )
                    # Update brick status as stopped
                    brick.status = BRICK_STOPPED.title()
                    brick.save()
            lock.release()
    except (
        etcd.EtcdException,
        KeyError,
        ValueError,
        AttributeError
    ) as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Unable to raise a brick status "
                               "alert for host %s" % hostname,
                    "exception": ex
                }
            )
        )
    finally:
        if isinstance(lock, etcd.lock.Lock) and lock.is_acquired:
            lock.release()
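The finally block above guards against releasing a lock that was never created or already released. The same defensive pattern in isolation, with a placeholder lock name and critical section:

import etcd

client = etcd.Client(host='127.0.0.1', port=2379)
lock = None
try:
    lock = etcd.Lock(client, 'bricks_example')  # placeholder name
    lock.acquire(blocking=True, lock_ttl=60)
    if lock.is_acquired:
        pass  # read-modify-write the shared state here
finally:
    # Release only if the lock object exists and is still held.
    if isinstance(lock, etcd.lock.Lock) and lock.is_acquired:
        lock.release()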
def run(self):
    vol_id = self.parameters['Volume.vol_id']
    if NS.gdeploy_plugin.stop_volume(
            self.parameters.get('Volume.volname')):
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Stopped the volume %s before delete" %
                    self.parameters['Volume.volname']
                },
                job_id=self.parameters["job_id"],
                flow_id=self.parameters["flow_id"],
                cluster_id=NS.tendrl_context.integration_id,
            ))
    else:
        Event(
            Message(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not stop volume %s before delete" %
                    self.parameters['Volume.volname']
                },
                job_id=self.parameters["job_id"],
                flow_id=self.parameters["flow_id"],
                cluster_id=NS.tendrl_context.integration_id,
            ))
        return False

    args = {}
    if self.parameters.get('Volume.format_bricks') is not None:
        args.update(
            {"format_bricks": self.parameters.get('Volume.format_bricks')})
    if NS.gdeploy_plugin.delete_volume(
            self.parameters.get('Volume.volname'), **args):
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Deleted the volume %s" %
                    self.parameters['Volume.volname']
                },
                job_id=self.parameters["job_id"],
                flow_id=self.parameters["flow_id"],
                cluster_id=NS.tendrl_context.integration_id,
            ))
    else:
        Event(
            Message(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Failed to delete volume %s" %
                    self.parameters['Volume.volname']
                },
                job_id=self.parameters["job_id"],
                flow_id=self.parameters["flow_id"],
                cluster_id=NS.tendrl_context.integration_id,
            ))
        return False

    while True:
        try:
            # Acquire the lock before deleting the volume from etcd;
            # we block until we acquire the lock. The lock lives for
            # 60 sec, after which it is released.
            lock = etcd.Lock(NS._int.wclient, 'volume')
            while not lock.is_acquired:
                try:
                    # With a ttl set, acquire() blocks for at most
                    # 60 sec, after which it raises EtcdLockExpired.
                    # If that is raised, we retry the lock.
                    lock.acquire(blocking=True, lock_ttl=60)
                    if lock.is_acquired:
                        # Renew the lock, as we do not know how long we
                        # were blocked before the lock was granted.
                        # NOTE: time spent blocked also counts against
                        # the ttl.
                        lock.acquire(lock_ttl=60)
                except etcd.EtcdLockExpired:
                    continue
            NS._int.wclient.delete(
                "clusters/%s/Volumes/%s" % (
                    NS.tendrl_context.integration_id,
                    self.parameters['Volume.vol_id']),
                recursive=True)
        except (etcd.EtcdKeyNotFound, KeyError):
            Event(
                Message(
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Deleted the volume %s" %
                        self.parameters['Volume.volname']
                    },
                    job_id=self.parameters["job_id"],
                    flow_id=self.parameters["flow_id"],
                    cluster_id=NS.tendrl_context.integration_id,
                ))
        finally:
            lock.release()
        return True
def s3kv_get(self, bucket_name, obj_key):
    # remove leading and trailing slashes
    obj_key = re.sub(r'^/*(.+?)/*$', r'\g<1>', obj_key)
    # lock name: <bucket_name>/<obj_key>
    lock_name = re.sub(r'^/*(.+?)/*$', r'\g<1>/', bucket_name) + obj_key
    # etcd name: /<bucket_name>/<obj_key>
    etcd_name = re.sub(r'^/*(.+?)/*$', r'/\g<1>/', bucket_name) + obj_key

    lock = etcd.Lock(self.etcd_client, lock_name)
    # Acquire the lock over the full path of the data object
    lock.acquire(blocking=True, lock_ttl=None)

    # Redis read 1: fetch the cached object from Redis
    val_obj = redis.get(obj_key)
    if val_obj is not None:
        lock.release()  # release the lock
        return val_obj

    obj_data = None
    try:
        # On a Redis cache miss, read the latest hash value h of the
        # data object from etcd
        h = self.etcd_client.read(etcd_name).value
        while True:
            try:
                # keep fetching the object from S3 until a body is
                # returned; this also avoids interference from
                # outside writers
                response = self.s3_client.get_object(
                    Bucket=bucket_name,
                    Key=obj_key,
                    # VersionId=version_id
                )
                # read the body exactly once; a second read() on the
                # same streaming body would return an empty string
                obj_data = response.get('Body').read()
                if obj_data is not None:
                    break
            except Exception:
                response = None
    except etcd.EtcdKeyNotFound:
        response = None
    except Exception:
        logging.error(traceback.format_exc())

    lock.release()  # release the lock

    # double check that the hash of the fetched object matches h
    if obj_data is not None:
        m = hashlib.md5()
        m.update(obj_data)
        if m.hexdigest() == h:
            redis.set(obj_key, obj_data)
            return obj_data
    return None
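The validation step above is a content check: etcd stores the authoritative MD5 digest of the latest object version, and a fetched S3 body is only cached and returned if its digest matches. A minimal sketch of just that check, with hypothetical names:

import hashlib

def matches_etcd_hash(obj_data, expected_hex):
    # obj_data: bytes fetched from S3; expected_hex: the digest string
    # read from etcd for this object path (both names are placeholders)
    return hashlib.md5(obj_data).hexdigest() == expected_hex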
def run(self):
    vol_id = self.parameters['Volume.vol_id']
    if NS.gdeploy_plugin.stop_volume(
            self.parameters.get('Volume.volname')):
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Stopped the volume %s before delete" %
                    self.parameters['Volume.volname']
                },
                job_id=self.parameters["job_id"],
                flow_id=self.parameters["flow_id"],
                cluster_id=NS.tendrl_context.integration_id,
            ))
    else:
        Event(
            Message(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not stop volume %s before delete" %
                    self.parameters['Volume.volname']
                },
                job_id=self.parameters["job_id"],
                flow_id=self.parameters["flow_id"],
                cluster_id=NS.tendrl_context.integration_id,
            ))
        return False

    args = {}
    if self.parameters.get('Volume.format_bricks') is not None:
        args.update(
            {"format_bricks": self.parameters.get('Volume.format_bricks')})
    if NS.gdeploy_plugin.delete_volume(
            self.parameters.get('Volume.volname'), **args):
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Deleted the volume %s" %
                    self.parameters['Volume.volname']
                },
                job_id=self.parameters["job_id"],
                flow_id=self.parameters["flow_id"],
                cluster_id=NS.tendrl_context.integration_id,
            ))
    else:
        Event(
            Message(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Failed to delete volume %s" %
                    self.parameters['Volume.volname']
                },
                job_id=self.parameters["job_id"],
                flow_id=self.parameters["flow_id"],
                cluster_id=NS.tendrl_context.integration_id,
            ))
        return False

    while True:
        try:
            # Acquire the lock before deleting the volume from etcd;
            # we block until we acquire the lock
            lock = etcd.Lock(NS.etcd_orm.client, 'volume')
            lock.acquire(blocking=True, lock_ttl=None)
            NS.etcd_orm.client.delete(
                "clusters/%s/Volumes/%s" % (
                    NS.tendrl_context.integration_id,
                    self.parameters['Volume.vol_id']),
                recursive=True)
        except (etcd.EtcdKeyNotFound, KeyError):
            Event(
                Message(
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Deleted the volume %s" %
                        self.parameters['Volume.volname']
                    },
                    job_id=self.parameters["job_id"],
                    flow_id=self.parameters["flow_id"],
                    cluster_id=NS.tendrl_context.integration_id,
                ))
        finally:
            lock.release()
        return True