def _clean_iscsi_config(self, disk_id, path_index, iqn):
    logger.debug("Move action, start cleaning disk {} path {}.".format(
        disk_id, path_index))
    lio_api = LioAPI()
    try:
        # Get tpgs for iqn.
        tpgs = lio_api.get_iqns_with_enabled_tpgs().get(iqn, None)
        if not iqn or not tpgs or len(tpgs) == 0:
            logger.info("Move action, could not find ips for %s" % disk_id)
        # Disable the tpg that corresponds to this path index.
        elif tpgs and len(tpgs) > 0:
            for tpg, ips in tpgs.iteritems():
                if tpg == str(path_index + 1):
                    lio_api.disable_path(iqn, tpg)
                    logger.info(
                        "Move action, cleaned disk {} path {}.".format(
                            disk_id, path_index))
                    break
    except Exception as e:
        logger.error("Move action, could not clean disk path for %s" % disk_id)
        logger.exception(e)
        return False
    logger.debug("Move action, end clean disk {} path {}.".format(
        disk_id, path_index))
    return True
def get_next_partition_index(dev):
    """
    Get the next free partition index on a given device.

    :return: Index number (> 1 if there is already a partition on the
             device) or 1 if there is no partition table.
    """
    try:
        output, err = exec_command('parted --machine -- {} print'.format(dev))
        lines = output
    except subprocess.CalledProcessError as e:
        logger.info('cannot read partition index; assume it '
                    'isn\'t present\n (Error: %s)' % e)
        return 1

    if not lines:
        logger.error('parted failed to output anything')
        raise Exception('parted failed to output anything')

    logger.debug('get_free_partition_index: analyzing ' + lines)
    if ('CHS;' not in lines and
            'CYL;' not in lines and
            'BYT;' not in lines):
        raise Exception('parted output expected to contain one of '
                        'CHS; CYL; or BYT; : ' + lines)
    if os.path.realpath(dev) not in lines:
        raise Exception('parted output expected to contain ' + dev + ': ' + lines)
    _, partitions = lines.split(os.path.realpath(dev))
    numbers_as_strings = re.findall(r'^\d+', partitions, re.MULTILINE)
    partition_numbers = map(int, numbers_as_strings)
    if partition_numbers:
        return max(partition_numbers) + 1
    else:
        return 1
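# A minimal, runnable sketch of the parsing done above, for reference. The
# sample parted --machine output below is illustrative (a hypothetical
# /dev/sdb with two partitions), not captured from a real run.
def _demo_next_partition_index():
    import re
    sample = ('BYT;\n'
              '/dev/sdb:17.2GB:scsi:512:512:gpt:QEMU HARDDISK;\n'
              '1:1049kB:106MB:105MB:fat32::boot;\n'
              '2:106MB:17.2GB:17.1GB:ext4::;\n')
    # Everything after the device line holds one partition per line,
    # each starting with its index.
    _, partitions = sample.split('/dev/sdb')
    numbers = map(int, re.findall(r'^\d+', partitions, re.MULTILINE))
    return max(numbers) + 1 if numbers else 1  # returns 3 for this sample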
def __process(self):
    logger.debug("Start process, node session id is {}.".format(
        self.__session))
    self.__last_acquire_succeeded = True
    self.__ignored_acquire_paths = dict()
    while not self.__do_process():
        pass
    logger.debug("End process.")
def __read_resources_consul(self):
    logger.debug("Start read resources consul.")
    self.__paths_per_session = {}
    self.__total_cluster_paths = 0
    unlock_kvs = set()
    self.__paths_consul_locked_node = dict()
    try:
        disk_kvs = ConsulAPI().get_disk_kvs()
        for kv in disk_kvs:
            key = str(kv.Key).replace(
                self.__app_conf.get_consul_disks_path(), "")
            disk_id = str(key).split('/')[0]
            if disk_id in self.__disk_consul_stopped:
                continue
            if kv.Value == "disk":
                self.__paths_per_disk_local[disk_id] = 0
                if str(kv.Flags) == "1":
                    self.__disk_consul_stopped.add(disk_id)
                continue
            # Count paths in the cluster.
            self.__total_cluster_paths += 1
            if hasattr(kv, "Session"):
                # Per-session map of disk id -> locked path count.
                disks = self.__paths_consul_locked_node.get(
                    kv.Session, dict())
                paths = disks.get(disk_id, 0)
                disks[disk_id] = paths + 1
                self.__paths_consul_locked_node[kv.Session] = disks
                # The count of paths for each session.
                count = self.__paths_per_session.get(kv.Session, 0)
                self.__paths_per_session[kv.Session] = count + 1
                if kv.Session == self.__session:
                    # Count paths this node holds, per disk.
                    disk_paths_count = self.__paths_per_disk_local.get(
                        disk_id, 0) + 1
                    self.__paths_per_disk_local[disk_id] = disk_paths_count
            # unlocked paths
            else:
                unlock_kvs.add(kv)
        # Split unlocked paths into siblings of locally held disks and
        # first paths of disks with no local presence.
        for kv in unlock_kvs:
            key = str(kv.Key).replace(
                self.__app_conf.get_consul_disks_path(), "")
            disk_id = str(key).split('/')[0]
            if self.__paths_per_disk_local.get(disk_id, 0) > 0:
                self.__paths_consul_unlocked_siblings[key] = kv.CreateIndex
            else:
                self.__paths_consul_unlocked_firstborn[key] = kv.CreateIndex
    except Exception as e:
        logger.error("Could not read consul resources.")
        logger.exception(e)
        raise e
    logger.debug("End read resources consul.")
def _send_graphite(key, val):
    # Push one metric to the graphite server via the plaintext protocol:
    # "<metric.path> <value> <timestamp>" piped to nc on port 2003.
    cmd = 'echo "PetaSAN.NodeStats.{node}.{key} {val} `date +%s` " | nc -q0 {server} 2003'.\
        format(node=node_name, key=key, val=val, server=leader_ip)
    logger.debug(cmd)
    # exec_command_ex2 is used so stderr can be checked.
    ret, stdout, stderr = exec_command_ex2(cmd)
    if stderr is not None and len(stderr) > 0:
        raise Exception("Error running echo command: " + cmd)
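# A minimal sketch of the same send without shelling out to nc, using only
# the standard library. It assumes a graphite plaintext listener on port
# 2003; node_name and leader_ip are the same module-level values used above,
# and the helper name is illustrative.
def _send_graphite_socket(key, val):
    import socket
    import time
    line = 'PetaSAN.NodeStats.{node}.{key} {val} {ts}\n'.format(
        node=node_name, key=key, val=val, ts=int(time.time()))
    sock = socket.create_connection((leader_ip, 2003), timeout=5)
    try:
        sock.sendall(line)
    finally:
        sock.close()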
def arping(args):
    i = 0
    cmd = "arping -A {} -I {} -c {}".format(args.ip, args.eth, args.c)
    logger.debug(cmd)
    while i < args.t:
        i += 1
        call_cmd(cmd)
        sleep(args.i)
def __clean_unused_ips(self):
    ips = Network().get_all_configured_ips()
    for ip, eth_name in ips.iteritems():
        ip, netmask = str(ip).split("/")
        if ip not in self.__local_ips and ip != self.__node_info.backend_1_ip and \
                ip != self.__node_info.backend_2_ip and ip != self.__node_info.management_ip:
            NetworkAPI().delete_ip(ip, eth_name, netmask)
            logger.debug("Clean unused ip {} on interface {}.".format(
                ip, eth_name))
def __clean_unused_rbd_images(self):
    ceph_api = CephAPI()
    rbd_images = ceph_api.get_mapped_images()
    if rbd_images is None:
        return
    for image, mapped_count in rbd_images.iteritems():
        if image not in self.__backstore and int(mapped_count) > 0:
            # Unmap once per mapping so no stale mappings remain.
            for i in range(int(mapped_count)):
                ceph_api.unmap_image(image)
            logger.debug("Unmapped unused image {}.".format(image))
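# A minimal sketch of how mapped image counts could be gathered directly
# from the rbd CLI, as an illustration of what get_mapped_images() returns.
# The JSON shape of `rbd showmapped` varies across Ceph releases (a dict
# keyed by id in older releases, a list in newer ones), so both are handled.
def _get_mapped_image_counts():
    import json
    import subprocess
    out = subprocess.check_output(['rbd', 'showmapped', '--format', 'json'])
    data = json.loads(out)
    entries = data.values() if isinstance(data, dict) else data
    counts = {}
    for entry in entries:
        name = entry.get('name')
        counts[name] = counts.get(name, 0) + 1
    return counts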
def __clean_unused_iqns(self):
    status = False
    lio_api = LioAPI()
    for iqn in lio_api.get_unused_iqns():
        disk_id = str(iqn).split(":")[1]
        image_name = self.__image_name_prefix + str(disk_id)
        lio_api.delete_target(image_name, iqn)
        CephAPI().unmap_image(image_name)
        status = True
        logger.debug("Clean unused iqn {}.".format(iqn))
    return status
def __unlock_consul_path(self, path):
    try:
        logger.debug("Unlock {} path locked by session {}.".format(
            path, self.__session))
        consul_api = ConsulAPI()
        consul_api.release_disk_path(
            self.__app_conf.get_consul_disks_path() + path,
            self.__session, None)
        logger.info("Unlock path %s" % path)
    except Exception as e:
        logger.error("Could not unlock path %s" % path)
        raise e
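# A minimal sketch of what a release looks like against the raw Consul HTTP
# API: a KV PUT with ?release=<session>, mirroring what release_disk_path
# wraps. The helper name and the use of urllib2 here are illustrative, not
# part of ConsulAPI.
def _release_kv(key, session_id, consul_addr='http://127.0.0.1:8500'):
    import urllib2
    url = '{}/v1/kv/{}?release={}'.format(consul_addr, key, session_id)
    req = urllib2.Request(url, data='')
    req.get_method = lambda: 'PUT'
    # Consul answers the literal string "true" on success.
    return urllib2.urlopen(req).read().strip() == 'true'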
def unmount(
    path,
    do_rm=True,
):
    """
    Unmounts and removes the given mount point.
    """
    try:
        logger.debug('Unmounting %s', path)
        exec_command_ex('/bin/umount -- ' + path)
    except subprocess.CalledProcessError as e:
        raise Exception('Error unmounting disk.', e)
    if not do_rm:
        return
    os.rmdir(path)
def mount(
    dev,
    fstype,
    options,
):
    """
    Mounts a device with the given filesystem type and mount options
    to a tempfile path under /var/lib/ceph/tmp.
    """
    # sanity check: none of the arguments are None
    if dev is None:
        raise ValueError('dev may not be None')
    if fstype is None:
        raise ValueError('fstype may not be None')

    # pick best-of-breed mount options based on fs type
    if options is None:
        options = "noatime"

    myTemp = STATEDIR + '/tmp'
    # mkdtemp expects 'dir' to exist on the system,
    # so make sure it's always the case.
    if not os.path.exists(myTemp):
        os.makedirs(myTemp)

    # mount
    path = tempfile.mkdtemp(
        prefix='mnt.',
        dir=myTemp,
    )
    try:
        logger.debug('Mounting %s on %s with options %s', dev, path, options)
        cmd = 'mount -t ' + fstype + ' -o ' + options + ' -- ' + dev + ' ' + path
        exec_command_ex(cmd)
        if which('restorecon'):
            cmd = 'restorecon ' + path
            exec_command_ex(cmd)
    except subprocess.CalledProcessError as e:
        try:
            os.rmdir(path)
        except (OSError, IOError):
            pass
        raise Exception('Error mounting disk.', e)
    return path
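# A short usage sketch for the mount/unmount pair above: mount a device,
# read one file, always unmount. The device path and filename here are
# illustrative only.
def _read_fsid(dev='/dev/sdb1'):
    tpath = mount(dev=dev, fstype='xfs', options='noatime')
    try:
        with open(os.path.join(tpath, 'fsid')) as f:
            return f.read().strip()
    finally:
        unmount(tpath)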
def new_mon_keyring(cluster):
    logger.debug('Creating a random mon key...')
    mon_keyring = '[mon.]\nkey = %s\ncaps mon = allow *\n' % generate_auth_key()
    keypath = '{name}.mon.keyring'.format(
        name=cluster,
    )
    # Restrict the keyring to owner read/write while it is written out.
    oldmask = os.umask(077)
    logger.debug('Writing monitor keyring to %s...', keypath)
    try:
        tmp = '%s.tmp' % keypath
        with open(tmp, 'w') as f:
            f.write(mon_keyring)
        # Atomically move the temp file into place.
        os.rename(tmp, keypath)
    finally:
        os.umask(oldmask)
def __read_resources_local(self):
    logger.debug("Start read local resources.")
    lio_api = LioAPI()
    try:
        self.__backstore = lio_api.get_backstore_image_names()
        self.__iqn_tpgs = lio_api.get_iqns_with_enabled_tpgs()
        for iqn, tpgs in self.__iqn_tpgs.iteritems():
            disk_id = str(iqn).split(":")[1]
            for tpg_index, ips in tpgs.iteritems():
                self.__paths_local.add("/".join([disk_id, str(tpg_index)]))
                if ips and len(ips) > 0:
                    for ip in ips:
                        self.__local_ips.add(ip)
    except Exception as e:
        logger.error("Could not read local resources.")
        raise e
    logger.debug("End read local resources.")
def __clean_unused_rbd_backstore(self):
    status = False
    iqns = self.__iqn_tpgs.keys()
    for rbd_backstore in self.__backstore:
        rbd_backstore_disk_id = str(rbd_backstore).replace(
            self.__image_name_prefix, "")
        is_used = False
        for iqn in iqns:
            disk_id = str(iqn).split(":")[1]
            if disk_id == rbd_backstore_disk_id:
                is_used = True
                break
        if not is_used:
            LioAPI().delete_backstore_image(rbd_backstore)
            logger.debug(
                "Clean unused lio backstore {}.".format(rbd_backstore))
            status = True
    return status
def __clean_local_path(self, path):
    disk_id, path_index = str(path).split("/")
    logger.debug("Start clean disk path {}.".format(path))
    image_name = self.__image_name_prefix + str(disk_id)
    lio_api = LioAPI()
    network_api = NetworkAPI()
    try:
        # Get iqn.
        logger.debug("Start get disk meta to clean path {}.".format(path))
        iqn = self._get_iqn_by_disk(disk_id)
        logger.debug("End get disk meta to clean path {}.".format(path))
        # Get tpgs for iqn.
        tpgs = self.__iqn_tpgs.get(iqn, None)
        if not iqn or not tpgs or len(tpgs) == 0:
            logger.info("Could not find ips for %s " % image_name)
        # Remove the assigned ips from our interfaces.
        elif tpgs and len(tpgs) > 0:
            # Get assigned ips for each path.
            for tpg, ips in tpgs.iteritems():
                if tpg == path_index:
                    for ip in ips:
                        logger.debug(
                            "Delete ip {} to clean path {}.".format(ip, path))
                        # Try the first iscsi interface, fall back to the second.
                        if not network_api.delete_ip(
                                ip, self.__cluster_info.iscsi_1_eth_name):
                            network_api.delete_ip(
                                ip, self.__cluster_info.iscsi_2_eth_name)
                    lio_api.disable_path(iqn, path_index)
                    logger.info("Cleaned disk path {}.".format(path))
                    break
    except Exception as e:
        logger.error("Could not clean disk path for %s" % image_name)
        raise e
    logger.debug("End clean disk path {}.".format(path))
    return
def build_consul():
    try:
        # Generate a security key.
        keygen = PetaSAN.core.common.cmd.exec_command('consul keygen')[0]
        keygen = str(keygen).splitlines()[0]
        logger.debug('keygen: ' + keygen)
        conf = configuration()
        cluster_info = conf.get_cluster_info()
        cluster_name = cluster_info.name
        logger.info('cluster_name: ' + cluster_name)
        local_node_info = conf.get_node_info()
        logger.info("local_node_info.name: " + local_node_info.name)
        __create_leader_conf_locally(keygen)
        continue_building_cluster = __create_leader_conf_remotely(
            keygen, cluster_info, local_node_info)
        if continue_building_cluster is True:
            __start_leader_remotely(cluster_info, local_node_info)
            __start_leader_locally()
        else:
            logger.error('Error building Consul cluster')
            consul_status_report = StatusReport()
            consul_status_report.success = False
            consul_status_report.failed_tasks.append(
                'core_consul_deploy_build_error_build_consul_cluster')
            return consul_status_report
        consul_status_report = __test_leaders()
        logger.debug(consul_status_report)
        return consul_status_report
    except Exception as ex:
        logger.exception(ex.message)
        consul_status_report = StatusReport()
        consul_status_report.success = False
        consul_status_report.failed_tasks.append(
            'core_consul_deploy_build_error_build_consul_cluster')
        return consul_status_report
def __wait_before_lock(self, path=None):
    wait_time = 0
    if path:
        disk_id, path_index = str(path).split("/")
        # 1- Calc wait time if path has siblings.
        wait_time = int(self.__app_conf.get_siblings_paths_delay()) * int(
            self.__paths_per_disk_local.get(disk_id, 0))
        logger.debug("Wait time for siblings is {}.".format(wait_time))
    total_nodes = len(ConsulAPI().get_consul_members())
    # 2- Calc average paths per node.
    average_node_paths = float(
        self.__total_cluster_paths) / float(total_nodes)
    # Calc the percent of local paths relative to the average.
    percent = float(self.__paths_per_session.get(self.__session, 0)) / average_node_paths
    # 3- Calc total wait time.
    if self.__last_acquire_succeeded:
        wait_time += int(
            self.__app_conf.get_average_delay_before_lock()) * percent
    else:
        logger.debug("Skipping wait time for average delay.")
    logger.debug(
        "Wait time depending on average and siblings is {}.".format(
            math.ceil(wait_time)))
    sleep(math.ceil(wait_time))
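# A worked example of the back-off arithmetic above, with made-up numbers:
# 3 consul members, 12 paths cluster-wide, 5 paths held by this session,
# 2 sibling paths of the same disk held locally, siblings delay 3s, average
# delay 5s. Nodes already holding more than the cluster average wait longer
# before grabbing another path, which spreads paths across nodes.
def _demo_wait_before_lock():
    import math
    siblings_delay, local_siblings = 3, 2
    avg_delay, session_paths = 5, 5
    total_cluster_paths, total_nodes = 12, 3
    wait = siblings_delay * local_siblings            # 6
    percent = float(session_paths) / (
        float(total_cluster_paths) / total_nodes)     # 5 / 4.0 = 1.25
    wait += avg_delay * percent                       # 6 + 6.25 = 12.25
    return math.ceil(wait)                            # 13.0 seconds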
def list_devices():
    partmap = list_all_partitions()

    uuid_map = {}
    space_map = {}
    for base, parts in sorted(partmap.items()):
        for p in parts:
            dev = get_dev_path(p)
            part_name = get_dev_name(dev)
            part_uuid = get_partition_uuid(part_name)
            if part_uuid:
                uuid_map[part_uuid] = dev
            ptype = get_partition_type(part_name)
            logger.debug("main_list: " + dev + " ptype = " + str(ptype) +
                         " uuid = " + str(part_uuid))
            if ptype == PTYPE['osd']:
                dev_to_mount = dev
                fs_type = get_dev_fs(dev_to_mount)
                if fs_type is not None:
                    try:
                        tpath = mount(dev=dev_to_mount, fstype=fs_type,
                                      options='noatime')
                        try:
                            for name in Space.NAMES:
                                space_uuid = get_oneliner(tpath,
                                                          name + '_uuid')
                                if space_uuid:
                                    space_map[space_uuid.lower()] = dev
                        finally:
                            unmount(tpath)
                    except Exception:
                        logger.exception('Mounting filesystem failed')

    logger.debug("main_list: " + str(partmap) + ", uuid_map = " +
                 str(uuid_map) + ", space_map = " + str(space_map))

    devices = []
    for base, parts in sorted(partmap.items()):
        if parts:
            disk = {'path': get_dev_path(base)}
            partitions = []
            for p in sorted(parts):
                partitions.append(list_dev(get_dev_path(p), uuid_map,
                                           space_map))
            disk['partitions'] = partitions
            devices.append(disk)
        else:
            device = list_dev(get_dev_path(base), uuid_map, space_map)
            device['path'] = get_dev_path(base)
            devices.append(device)
    logger.debug("list_devices: " + str(devices))
    return devices
def manager(self, test_type, duration_sec, threads, clients, pool, cleanup):
    logger.debug("Benchmark manager request.")
    logger.debug(clients)
    try:
        self.type = int(test_type)
        # Duration of the write and read stress tests.
        self.stress_duration = duration_sec / 2
        self.threads = threads
        # Non-storage nodes.
        self.clients = clients
        # The span of time to wait between running the rados test and
        # collecting the state of the storage nodes.
        self.wait_for_collect_state = self.stress_duration / 4
        # Duration of collecting node state.
        self.state_duration = self.stress_duration / 2
        # pool
        self.pool = pool
        # Running cleanup before the test: without a cleanup file of written
        # objects it would iterate over all objects in the pool, which is
        # very slow, so it stays disabled.
        # self.__cleanup()
        nodes = ManageNode().get_node_list()
        # Get available storage nodes.
        for node in nodes:
            if not node.name in clients and node.status == NodeStatus.up \
                    and node.is_storage:
                self.storage_nodes.append(str(node.name))
        logger.debug(self.storage_nodes)
        if len(self.storage_nodes) == 0 and \
                (self.type == RadosBenchmarkType.four_mg_Throughput or
                 self.type == RadosBenchmarkType.four_kb_iops):
            raise Exception("Cannot complete rados benchmark. No storage "
                            "nodes available to run the test.")
        if self.type == RadosBenchmarkType.four_mg_Throughput or \
                self.type == RadosBenchmarkType.four_kb_iops:
            self.report = BenchmarkResult()
            logger.info("Benchmark start rados write.")
            self.__write()
            logger.info("Benchmark start rados read.")
            self.__read()
            logger.info("Benchmark finished.")
            return self.report
        else:
            # TODO
            pass
    except Exception as e:
        logger.exception(e.message)
    finally:
        if cleanup:
            self.__cleanup()
def handle_cluster_startup(self):
    i = 0
    consul_api = ConsulAPI()
    logger.debug("Check cluster startup.")
    while True:
        try:
            current_node_name = self.__node_info.name
            result = consul_api.set_leader_startup_time(
                current_node_name, str(i))
            if i == 0 and not result:
                sleep(2)
                continue
            elif result:
                # A value was returned: consul is up and running.
                sleep(2)
                number_of_started_nodes = 0
                for kv in consul_api.get_leaders_startup_times():
                    node_name = str(kv.Key).replace(
                        ConfigAPI().get_consul_leaders_path(), "")
                    if node_name != current_node_name:
                        if int(kv.Value) == 0:
                            number_of_started_nodes += 1
                logger.debug("Number of started nodes = {}.".format(
                    number_of_started_nodes))
                # Another management node is just starting.
                if i == 0 and number_of_started_nodes > 0:
                    logger.info("Cluster is just starting, system will "
                                "delete all active disk resources")
                    consul_api.delete_disk(
                        ConfigAPI().get_consul_disks_path(), recurse=True)
                i += 1
                sleep(58)
        except Exception as ex:
            logger.debug("Start up error")
            logger.exception(ex)
            # Maybe other management nodes are starting;
            # give them a chance to start.
            if i == 0:
                sleep(2)
            else:
                i += 1
                sleep(58)
def __start_leader_remotely(cluster_info, local_node_info):
    logger.info('Start consul leaders remotely.')
    ssh_exec = ssh()
    for cluster_node in cluster_info.management_nodes:
        remote_node_info = NodeInfo()
        remote_node_info.load_json(json.dumps(cluster_node))
        logger.debug('local_node_info.backend_1_ip: ' +
                     local_node_info.backend_1_ip)
        logger.debug('remote_node_info.backend_1_ip: ' +
                     remote_node_info.backend_1_ip)
        if local_node_info.backend_1_ip != remote_node_info.backend_1_ip:
            cmd = ('python ' +
                   ConfigAPI().get_consul_start_up_script_path() +
                   ' -retry-join {} '.format(local_node_info.backend_1_ip))
            logger.debug('Sending: ' + cmd)
            ssh_exec.exec_command(remote_node_info.backend_1_ip, cmd)
    return
def __read(self):
    # Run the rados benchmark on the selected client nodes.
    for node in self.clients:
        cmd = "python " + ConfigAPI().get_node_stress_job_script_path() + \
              " -d {} -t {} -m r -p {}".format(self.stress_duration,
                                               self.threads, self.pool)
        logger.info("Run rados read cmd on node {} : ".format(node) + cmd)
        out, err = ssh().exec_command(node, cmd)
        # Get the job id from the output and assign it to its node.
        if not err:
            self.read_jobs[int(out)] = node

    logger.info("Wait time before collect node state.")
    sleep(self.wait_for_collect_state)

    # Get the state of the storage nodes.
    for node in self.storage_nodes:
        cmd = "python " + ConfigAPI().get_storage_load_job_script_path() + \
              " -d {} ".format(self.state_duration)
        out, err = ssh().exec_command(node, cmd)
        logger.info("Run sar state cmd on node {} : ".format(node) + cmd)
        if not err:
            self.read_jobs[int(out)] = node

    # Wait for all jobs to complete.
    sleep(self.stress_duration - self.wait_for_collect_state)

    # Check the completed jobs and collect their output.
    while len(self.read_jobs) > 0:
        remove_job_ids = []
        for job_id, node_name in self.read_jobs.iteritems():
            cmd = "python " + ConfigAPI().get_job_info_script_path() + \
                  " -id {} -t {}".format(job_id, 1)
            out, err = ssh().exec_command(node_name, cmd)
            # Job completed.
            if int(out) == 1:
                remove_job_ids.append(job_id)
                cmd = "python " + ConfigAPI().get_job_info_script_path() + \
                      " -id {} -t {}".format(job_id, 2)
                out, err = ssh().exec_command(node_name, cmd)
                logger.debug("Get job output by cmd {} from node {} ".format(
                    cmd, node_name))
                logger.debug("Output is {} ".format(out))
                # Job passed and returned our output.
                if out.find(self.output_split_text) > -1:
                    out = out.split(self.output_split_text)[1]
                else:
                    continue
                # Get rados IOPs output.
                if node_name in self.clients:
                    rados_rs = RadosResult()
                    if out:
                        rados_rs.load_json(out)
                        self.report.read_iops += rados_rs.iops
                        self.report.read_throughput += rados_rs.throughput
                elif node_name in self.storage_nodes:
                    # Get sar output.
                    sar_rs = SarResult()
                    if out:
                        sar_rs.load_json(out)
                        self.report.read_nodes.append(sar_rs)
        # Remove completed jobs.
        for i in remove_job_ids:
            self.read_jobs.pop(i)
        if len(self.read_jobs) > 0:
            sleep(5)
def process(self):
    logger.info("Start process reassignments paths.")
    max_retry = 100
    current_reassignments = self.get_current_reassignment()
    config = configuration()
    assignment_script_path = ConfigAPI().get_assignment_script_path()
    if current_reassignments is None:
        return
    for ip, path_assignment_info in current_reassignments.iteritems():
        logger.info("process path {} and its status is {}".format(
            ip, path_assignment_info.status))
        if path_assignment_info.status == ReassignPathStatus.pending:
            logger.info(
                "Move action, try clean disk {} path {} remotely on node {}."
                .format(path_assignment_info.disk_name,
                        path_assignment_info.disk_id,
                        path_assignment_info.node))
            status = False
            try:
                cmd = "python {} path_host -ip {} -disk_id {}".format(
                    assignment_script_path, path_assignment_info.ip,
                    path_assignment_info.disk_id)
                out, err = ssh().exec_command(path_assignment_info.node, cmd)
                logger.info(cmd)
            except Exception as ex:
                logger.exception(ex.message)
                out = ""
            if str(out).strip() == "0":
                logger.info("Move action passed")
                status = True

            current_path_assignment_info = None
            if status:
                for i in xrange(0, max_retry):
                    logger.debug("Wait to update status of path {}.".format(
                        path_assignment_info.ip))
                    sleep(0.25)
                    reassignments = self.get_current_reassignment()
                    if reassignments:
                        current_path_assignment_info = reassignments.get(
                            path_assignment_info.ip)
                    if current_path_assignment_info and \
                            current_path_assignment_info.status == ReassignPathStatus.moving:
                        continue
                    logger.info(
                        "Process completed for path {} with status {}."
                        .format(path_assignment_info.ip,
                                current_path_assignment_info.status
                                if current_path_assignment_info
                                else "unknown"))
                    break
                if current_path_assignment_info and \
                        current_path_assignment_info.status == ReassignPathStatus.moving:
                    self.update_path(current_path_assignment_info,
                                     ReassignPathStatus.failed)
                    logger.info(
                        "Move action failed, disk {} path {} on node {}."
                        .format(path_assignment_info.disk_name,
                                path_assignment_info.disk_id,
                                path_assignment_info.node))
            else:
                self.update_path(path_assignment_info,
                                 ReassignPathStatus.failed)
                logger.info(
                    "Move action, failed to clean disk {} path {} remotely "
                    "on node {}.".format(path_assignment_info.disk_name,
                                         path_assignment_info.disk_id,
                                         path_assignment_info.node))
    sleep(10)  # wait so the status can be displayed to the user if needed
    logger.info("Process completed.")
    self.remove_assignment()
    ConsulAPI().drop_all_node_sessions(
        self.__app_conf.get_consul_assignment_path(),
        config.get_node_name())
def __read_resources_consul(self):
    logger.debug("Start read resources consul.")
    self.__paths_per_session = {}
    self.__total_cluster_paths = 0
    unlock_kvs = set()
    consul_api = ConsulAPI()
    try:
        disk_kvs = consul_api.get_disk_kvs()
        for kv in disk_kvs:
            key = str(kv.Key).replace(
                self.__app_conf.get_consul_disks_path(), "")
            disk_id = str(key).split('/')[0]
            if disk_id in self.__disk_consul_stopped:
                continue
            if kv.Value == "disk":
                self.__paths_per_disk_local[disk_id] = 0
                if str(kv.Flags) == "1":
                    self.__disk_consul_stopped.add(disk_id)
                continue
            # Count paths in the cluster.
            self.__total_cluster_paths += 1
            if hasattr(kv, "Session"):
                # locked paths
                if kv.Session == self.__session:
                    self.__paths_consul_locked_node.add(key)
                    disk_paths_count = self.__paths_per_disk_local.get(
                        disk_id, 0) + 1
                    self.__paths_per_disk_local[disk_id] = disk_paths_count
                # Total count of paths for each session.
                count = self.__paths_per_session.get(kv.Session, 0)
                self.__paths_per_session[kv.Session] = count + 1
            # unlocked paths
            else:
                unlock_kvs.add(kv)
        # Filter unlocked paths.
        reassignments = None
        if len(unlock_kvs) > 0:
            reassignments = MangePathAssignment().get_forced_paths()
        for kv in unlock_kvs:
            key = str(kv.Key).replace(
                self.__app_conf.get_consul_disks_path(), "")
            if reassignments:
                path_assignment_info = reassignments.get(key)
                if path_assignment_info and \
                        path_assignment_info.target_node == self.__node_info.name:
                    self.__force_acquire_paths[key] = kv
                else:
                    self.__ignored_acquire_paths[key] = kv
                continue
            disk_id = str(key).split('/')[0]
            if self.__paths_per_disk_local.get(disk_id, 0) > 0:
                self.__paths_consul_unlocked_siblings[key] = kv
            else:
                self.__paths_consul_unlocked_firstborn[key] = kv
    except Exception as e:
        logger.error("Could not read consul resources.")
        logger.exception(e)
        raise e
    logger.debug("End read resources consul.")
def __acquire_path(self, path, consul_kv):
    if self.__ignored_acquire_paths.get(path):
        logger.info("Ignore forced path {}".format(path))
        return
    logger.debug("Start acquire path {} by node session {}.".format(
        path, self.__session))
    consul_api = ConsulAPI()
    ceph_api = CephAPI()
    lio_api = LioAPI()
    network_api = NetworkAPI()
    config = configuration()
    try:
        disk_id, path_index = str(path).split("/")
        pool = self._get_pool_by_disk(disk_id)
        if not pool:
            logger.error('Could not find pool for disk ' + disk_id)
            return
        image_name = self.__image_name_prefix + disk_id
        logger.debug(
            "Start read image meta for acquire path {}.".format(path))
        all_image_meta = ceph_api.read_image_metadata(image_name, pool)
        petasan_meta = all_image_meta.get(
            self.__app_conf.get_image_meta_key())
        disk_meta = DiskMeta()
        disk_meta.load_json(petasan_meta)
        logger.debug(
            "End read image meta for acquire path {}.".format(path))

        logger.debug("Try to acquire path {}.".format(path))
        node_name = config.get_node_name()
        result = consul_api.lock_disk_path(
            self.__app_conf.get_consul_disks_path() + path,
            self.__session, node_name, str(consul_kv.CreateIndex))
        if not result:
            logger.info("Could not lock path {} with session {}.".format(
                path, self.__session))
        elif result:
            if consul_kv.Value is not None and \
                    len(str(consul_kv.Value)) > 0 and \
                    node_name != str(consul_kv.Value):
                logger.info("The path {} was locked by {}.".format(
                    path, str(consul_kv.Value)))
                logger.debug("Node {} will kill node {}.".format(
                    config.get_node_name(), str(consul_kv.Value)))
                self.__fencing(str(consul_kv.Value))
            # We locked it.
            if disk_meta.paths:
                # If lio already has the image in its backstore,
                # do not map the rbd image again.
                if image_name not in self.__backstore:
                    # map_iamge is the CephAPI method name as defined (sic).
                    status = ceph_api.map_iamge(image_name, pool)
                else:
                    status = Status.done
                if Status.done == status:
                    # Get path info from metadata.
                    path_obj = disk_meta.get_paths()[int(path_index) - 1]
                    # Add path ips to our network interfaces.
                    network_api.add_ip(path_obj.ip, path_obj.subnet_mask,
                                       path_obj.eth, path_obj.vlan_id)
                    # Update neighbors arp table.
                    network_api.update_neighbors_arp(path_obj.ip,
                                                     path_obj.eth)
                    # Add new target in lio if not there already.
                    if not lio_api.is_backstore_image_found(image_name):
                        # Give ceph map image time to complete its job.
                        sleep(3)
                        # Add rbd backstore and target.
                        status = lio_api.add_target(disk_meta,
                                                    disk_meta.pool)
                    if Status.done == status:
                        # Enable the path we locked.
                        self.__last_acquire_succeeded = True
                        lio_api.enable_path(disk_meta.iqn, path_index, True)
                        logger.info("Path %s acquired successfully" % path)
                        if self.__acquire_warning_counter > 2:
                            logger.info(
                                "PetaSAN finally succeeded to acquire path "
                                "after retrying {} times.".format(
                                    self.__acquire_warning_counter))
                            self.__acquire_warning_counter = 0
                        path_assignment_info = \
                            self.__force_acquire_paths.get(path)
                        if path_assignment_info:
                            MangePathAssignment().update_path(
                                path_obj.ip, ReassignPathStatus.succeeded)
                    else:
                        path_assignment_info = \
                            self.__force_acquire_paths.get(path)
                        if path_assignment_info:
                            logger.info(
                                "Failed to acquire forced path {}".format(
                                    path))
                            MangePathAssignment().update_path(
                                path_obj.ip, ReassignPathStatus.failed)
                        self.__last_acquire_succeeded = False
                        if self.__acquire_warning_counter > 2:
                            logger.warning(
                                "PetaSAN failed to acquire path after {} "
                                "times.".format(
                                    self.__acquire_warning_counter))
                        self.__acquire_warning_counter += 1
                        logger.error(
                            "Error could not acquire path %s" % path)
            else:
                self.__unlock_consul_path(path)
    except Exception as e:
        logger.info("---------------------------------")
        logger.error(str(e.message) + "\n")
        logger.exception(e)
        if str(e.message).find("invalid session") > -1:
            logger.error("Session is invalid")
            try:
                logger.info("Trying to create new session id")
                self.__session = ConsulAPI().get_new_session_ID(
                    self.__session_name, self.__node_info.name)
                logger.info("New session id is {}".format(self.__session))
                logger.info("Cleaning all mapped disks from old session")
                self.__clean()
            except Exception as ex:
                logger.exception(ex)
        logger.exception("Could not acquire path %s" % path)
        raise e
    logger.debug("End acquire path {}.".format(path))
    return