def run(self):
    '''
    This function will be executed when we call the start method of any
    object in our PoolCheckerThread class
    '''
    # Get which ceph user is using this function & get his keyring file path #
    # ======================================================================= #
    ceph_auth = CephAuthenticator()

    cmd = 'ceph pg ls-by-pool {} --format json-pretty {} --cluster {}'.format(
        self.pool, ceph_auth.get_authentication_string(), self.cluster_name)
    ret, stdout, stderr = exec_command_ex(cmd)

    if ret != 0:
        if stderr and ('Connection timed out' in stderr or 'error connecting' in stderr):
            logger.error('Error in Ceph Connection cmd:' + cmd)
            raise CephException(CephException.CONNECTION_TIMEOUT, 'ConnectionTimeError')

        logger.error('General error in Ceph cmd:' + cmd)
        raise CephException(CephException.GENERAL_EXCEPTION, 'GeneralCephException')

    # Parse the pg listing to count active PGs and OSDs:
    output = stdout
    pgdp = PGDumpParser()
    pgdp.parse(output)

    self.active_pgs_num = pgdp.active_pgs
    self.active_osds_num = pgdp.active_osds
    return
def do_connect(self):
    try:
        conf_api = ConfigAPI()

        # Get which ceph user is using this function #
        # ========================================== #
        users = Users()
        user_name = users.get_current_system_user().strip()
        if user_name == "root":
            user_name = "admin"

        # Get ceph user's keyring file path #
        # ================================= #
        ceph_auth = CephAuthenticator()

        cluster_name = configuration().get_cluster_name()
        cluster = rados.Rados(conffile=conf_api.get_ceph_conf_path(cluster_name),
                              conf=dict(keyring=ceph_auth.get_keyring_path()),
                              rados_id=user_name)
        cluster.connect()
        return cluster

    except Exception as e:
        logger.error("do_connect() Cannot connect to ceph cluster.")
        logger.exception(str(e))
        try:
            cluster.shutdown()
        except Exception:
            pass
        return -1
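# Usage sketch (illustrative, not part of the module): assuming do_connect()
# is exposed on CephAPI -- plausibly what the connect() calls below wrap, an
# assumption -- and that a pool named "rbd" exists:
#
#     ceph_api = CephAPI()
#     cluster = ceph_api.do_connect()
#     if cluster == -1:
#         sys.exit(-1)  # connection failed; do_connect() already logged it
#     try:
#         io_ctx = cluster.open_ioctx("rbd")   # open an I/O context on a pool
#         io_ctx.close()
#     finally:
#         cluster.shutdown()                   # always release the cluster handle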
def set_disk_metadata(args):
    io_ctx = None
    ceph_api = CephAPI()
    cluster = None

    try:
        cluster = ceph_api.connect()
        io_ctx = cluster.open_ioctx(args.pool)

        # Get which ceph user is using this function & get his keyring file path #
        ceph_auth = CephAuthenticator()

        config = configuration()
        cluster_name = config.get_cluster_name()

        # Read the new metadata from a file if given, else from stdin:
        if args.file:
            with open(str(args.file), 'r') as file:
                disk_metadata_str = file.read()
        else:
            disk_metadata = sys.stdin.readlines()
            disk_metadata_str = ''.join(str(line) for line in disk_metadata)  # converting list to string

        # read object meta :
        cmd = "rbd info " + args.pool + "/" + str(args.image) + " " \
              + ceph_auth.get_authentication_string() + " --cluster " + cluster_name + " | grep rbd_data"
        ret, stdout, stderr = exec_command_ex(cmd)

        if ret != 0:
            if stderr:
                cluster.shutdown()
                print("Cannot get image meta object from rbd header.")
                sys.exit(-1)

        rbd_data = stdout.rstrip().strip()
        dot_indx = rbd_data.rfind(".")
        image_id = rbd_data[(dot_indx + 1):]

        meta_object = "rbd_header." + image_id
        attr_object = meta_object

        io_ctx.set_xattr(str(attr_object), str(ConfigAPI().get_image_meta_key()), disk_metadata_str)

        io_ctx.close()
        cluster.shutdown()
        sys.exit(0)

    except Exception as e:
        print("Error in executing script function : set_disk_metadata , " + str(e))
        if io_ctx:
            io_ctx.close()
        if cluster:
            cluster.shutdown()
        sys.exit(-1)
def read_disks_metadata(args):
    io_ctx = None
    ceph_api = CephAPI()
    cluster = None

    try:
        cluster = ceph_api.connect()
        io_ctx = cluster.open_ioctx(args.pool)

        # Get which ceph user is using this function & get his keyring file path #
        ceph_auth = CephAuthenticator()

        config = configuration()
        cluster_name = config.get_cluster_name()

        # Find the rbd_data prefix of the image, from which its id is derived:
        cmd = "rbd info " + args.pool + "/" + str(args.image) + " " \
              + ceph_auth.get_authentication_string() + " --cluster " + cluster_name + " | grep rbd_data"
        ret, stdout, stderr = exec_command_ex(cmd)

        if ret != 0:
            if stderr:
                cluster.shutdown()
                print("Cannot get image meta object from rbd header.")
                sys.exit(-1)

        rbd_data = stdout.rstrip().strip()
        dot_indx = rbd_data.rfind(".")
        image_id = rbd_data[(dot_indx + 1):]

        rbd_header_object = "rbd_header." + image_id

        try:
            ret = io_ctx.get_xattr(rbd_header_object, meta_key)
        except Exception:
            # Retry without the trailing character of the object name:
            ret = io_ctx.get_xattr(rbd_header_object[:-1], meta_key)

        io_ctx.close()
        cluster.shutdown()

        if ret:
            print(ret)
            sys.stdout.flush()
            sys.exit(0)
        else:
            # Non-PetaSAN Disk :
            sys.exit(-1)

    except Exception as e:
        print("Error in executing script function : read_disks_metadata , " + str(e))
        if io_ctx:
            io_ctx.close()
        if cluster:
            cluster.shutdown()
        sys.exit(-1)
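# Usage sketch (illustrative): read_disks_metadata prints the metadata to
# stdout and set_disk_metadata reads stdin when --file is not given, so the
# two subcommands can be piped together; clear_disk() below builds the same
# pipeline in its stage 5. The image and pool names here are placeholders:
#
#     metadata_script_file = ConfigAPI().get_disk_meta_script_path()
#     cmd = metadata_script_file + " read --image image-00001 --pool rbd" + \
#           " | " + metadata_script_file + " write --image tmp_disk_00001 --pool rbd"
#     call_cmd(cmd)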
def rollback_to_snapshot(self, pool_name, image_name, snap_name):
    # Get which ceph user is using this function & get his keyring file path #
    ceph_auth = CephAuthenticator()

    config = configuration()
    cluster_name = config.get_cluster_name()

    cmd = 'rbd snap rollback {}/{}@{} {} --cluster {}'.format(
        pool_name, image_name, snap_name,
        ceph_auth.get_authentication_string(), cluster_name)
    ret, stdout, stderr = exec_command_ex(cmd)

    if ret != 0:
        logger.error('General error in Ceph cmd : ' + cmd)
        raise CephException(CephException.GENERAL_EXCEPTION, 'GeneralCephException')

    return True
def get_all_images(self, pool_name):
    # Get which ceph user is using this function & get his keyring file path #
    ceph_auth = CephAuthenticator()

    images = []
    config = configuration()
    cluster_name = config.get_cluster_name()

    cmd = 'rbd ls {} {} --cluster {}'.format(
        pool_name, ceph_auth.get_authentication_string(), cluster_name)
    ret, stdout, stderr = exec_command_ex(cmd)

    if ret != 0:
        logger.error('General error in Ceph cmd : ' + cmd)
        raise CephException(CephException.GENERAL_EXCEPTION, 'GeneralCephException')

    ls = stdout.splitlines()
    for image in ls:
        images.append(image)

    return images
def readImageMetaData(ioctx, image, pool):
    ret = None

    # Get which ceph user is using this function & get his keyring file path #
    ceph_auth = CephAuthenticator()

    config = configuration()
    cluster_name = config.get_cluster_name()

    try:
        cmd = "rbd info " + pool + "/" + str(image) + " " \
              + ceph_auth.get_authentication_string() + " --cluster " + cluster_name + " | grep rbd_data"
        ret, stdout, stderr = exec_command_ex(cmd)

        if ret != 0:
            if stderr:
                logger.error("Cannot get image meta object from rbd header.")
                return None

        rbd_data = stdout.rstrip().strip()
        dot_indx = rbd_data.rfind(".")
        image_id = rbd_data[(dot_indx + 1):]

        rbd_header_object = "rbd_header." + image_id

        try:
            ret = ioctx.get_xattr(rbd_header_object, meta_key)
        except Exception:
            # Retry without the trailing character of the object name:
            ret = ioctx.get_xattr(rbd_header_object[:-1], meta_key)
    except Exception:
        return None

    return ret
def clear_disk(args):
    disk_id = args.disk_id
    image_name = "image-" + disk_id

    try:
        # Get which ceph user is using this function & get his keyring file path #
        # ----------------------------------------------------------------------- #
        ceph_auth = CephAuthenticator()

        config = configuration()
        cluster_name = config.get_cluster_name()

        # Get disk metadata :
        # -------------------
        ceph_api = CephAPI()
        disk_metadata = ceph_api.get_diskmeta(disk_id)

        # Get pool name :
        # ---------------
        pool_name = disk_metadata.pool
        data_pool = ""

        # Check if disk has been created on a replicated pool or an erasure pool :
        # -------------------------------------------------------------------------
        if len(disk_metadata.data_pool) > 0:
            data_pool = disk_metadata.data_pool

        tmp_image_name = "tmp_disk_" + disk_metadata.id

        # (1.) Check if a previous tmp image for this disk still exists :
        # ===============================================================
        images_list = ceph_api.get_all_images(pool_name)

        for image in images_list:
            if tmp_image_name in image:
                # Delete the leftover tmp image:
                cmd = "rbd rm {}/{} {} --cluster {}".format(
                    pool_name, image, ceph_auth.get_authentication_string(), cluster_name)
                if not call_cmd(cmd):
                    print("Error : clear_disk.py script : cannot remove tmp image ,\ncmd : " + cmd)
                    sys.exit(-1)

        print("Stage 1 :\n\tCheck if a previous tmp image for this disk still exists > (Completed)")
        logger.info("Stage 1 :\n\tCheck if a previous tmp image for this disk still exists > (Completed)")

        # (2.) Stop old disk :
        # ====================
        consul_api = ConsulAPI()
        kv = consul_api.find_disk(disk_id)
        if kv is not None:
            manage_disk = ManageDisk()
            status = manage_disk.stop(disk_id)
            if status != Status.done:
                print('Error : Cannot stop disk , id = ' + disk_id)
                sys.exit(-1)

            print("Stage 2 :\n\tStop old disk > (Completed)")
            logger.info("Stage 2 :\n\tStop old disk > (Completed)")
            time.sleep(3)

            # (3.) Check if old disk is stopped or not :
            # ==========================================
            if len(data_pool) > 0:
                pool_type = "erasure"
                _confirm_disk_stopped(data_pool, disk_id, pool_type)
            else:
                pool_type = "replicated"
                _confirm_disk_stopped(pool_name, disk_id, pool_type)

            print("Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)")
            logger.info("Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)")
        else:
            print("Stage 2 :\n\tStop old disk > (Completed)")
            logger.info("Stage 2 :\n\tStop old disk > (Completed)")
            print("Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)")
            logger.info("Stage 3 :\n\tConfirm that disk is completely stopped > (Completed)")
            print('\tclear_disk.py script : disk {} is already stopped'.format(disk_id))

        # (4.) Create a tmp image (not a PetaSAN image) :
        # ===============================================
        # Generate a random number between 1 and 100000:
        random_no = str(random.randint(1, 100000))
        tmp_image_name = tmp_image_name + "_" + random_no
        image_size = disk_metadata.size * 1024

        if len(data_pool) > 0:
            cmd = "rbd create {}/{} --size {} --data-pool {} {} --cluster {}".format(
                pool_name, tmp_image_name, image_size, data_pool,
                ceph_auth.get_authentication_string(), cluster_name)
        else:
            cmd = "rbd create {}/{} --size {} {} --cluster {}".format(
                pool_name, tmp_image_name, image_size,
                ceph_auth.get_authentication_string(), cluster_name)

        if not call_cmd(cmd):
            print("Error : clear_disk.py script : cannot create new tmp image ,\ncmd : " + cmd)
            sys.exit(-1)

        print("Stage 4 :\n\tCreate a tmp image called ( " + tmp_image_name + " ) > (Completed)")
        logger.info("Stage 4 :\n\tCreate a tmp image called ( " + tmp_image_name + " ) > (Completed)")

        # (5.) Run script to copy "old disk" metadata to the new "tmp_disk" :
        # ===================================================================
        metadata_script_file = ConfigAPI().get_disk_meta_script_path()

        # Function : read_disks_metadata :
        parser_key_1 = "read"
        arg_1 = "--image"
        arg_2 = "--pool"

        # Function : set_disk_metadata :
        parser_key_2 = "write"
        arg_3 = "--file"  # (not used in the pipeline below)

        cmd = metadata_script_file + " " + parser_key_1 + " " + arg_1 + " " + image_name + " " + arg_2 + " " + pool_name + \
              " | " + metadata_script_file + " " + parser_key_2 + " " + arg_1 + " " + tmp_image_name + " " + arg_2 + " " + pool_name

        if not call_cmd(cmd):
            print("Error : clear_disk.py script : cannot copy metadata from old disk to new tmp image ,\ncmd : " + cmd)
            sys.exit(-1)

        print("Stage 5 :\n\tRun script to copy 'old disk' metadata to new 'tmp_disk' > (Completed)")
        logger.info("Stage 5 :\n\tRun script to copy 'old disk' metadata to new 'tmp_disk' > (Completed)")

        time.sleep(3)

        # (6.) Remove metadata of old disk :
        # ==================================
        old_image_name = str(ceph_api.conf_api.get_image_name_prefix() + disk_metadata.id)
        confirm = ceph_api.remove_disk_metadata(old_image_name, disk_metadata.pool)

        if not confirm:
            print("Error : clear_disk.py script : cannot remove metadata of old disk")
            # sys.exit(-1)

        print("Stage 6 :\n\tRemove metadata of old disk > (Completed)")
        logger.info("Stage 6 :\n\tRemove metadata of old disk > (Completed)")

        # (7.) Rename old disk image to "deleted-" + disk_id + random_no :
        # ================================================================
        new_image_name = "deleted-" + disk_metadata.id + "-" + random_no
        cmd = "rbd mv {}/{} {} {} --cluster {}".format(
            pool_name, image_name, new_image_name,
            ceph_auth.get_authentication_string(), cluster_name)

        if not call_cmd(cmd):
            print("Error : clear_disk.py script : cannot rename old image from {} to {} ,\ncmd : {}".format(
                image_name, new_image_name, cmd))
            sys.exit(-1)

        print("Stage 7 :\n\tRename old disk image to ( " + new_image_name + " ) > (Completed)")
        logger.info("Stage 7 :\n\tRename old disk image to ( " + new_image_name + " ) > (Completed)")

        time.sleep(5)

        # (8.) Rename "tmp_disk" to the old disk image name :
        # ===================================================
        cmd = "rbd mv {}/{} {} {} --cluster {}".format(
            pool_name, tmp_image_name, image_name,
            ceph_auth.get_authentication_string(), cluster_name)

        if not call_cmd(cmd):
            print("Error : clear_disk.py script : cannot rename \"tmp_disk\" from {} to {} ,\ncmd : {}".format(
                tmp_image_name, image_name, cmd))
            sys.exit(-1)

        print("Stage 8 :\n\tRename 'tmp_disk' to old disk image name > (Completed)")
        logger.info("Stage 8 :\n\tRename 'tmp_disk' to old disk image name > (Completed)")

        time.sleep(5)

        # (9.) Start a job to remove the old (renamed) disk image :
        # =========================================================
        jm = JobManager()
        job_id = jm.add_job(JobType.DELETE_DISK, new_image_name + ' ' + pool_name)

        print("Stage 9 :\n\tStart a job to remove old disk image , job id = " + str(job_id))
        logger.info("Stage 9 :\n\tStart a job to remove old disk image , job id = " + str(job_id))

        sys.exit(0)

    except PoolException as e:
        print("Error : PoolException , {}".format(e.message))
        logger.error("Clear Disk Error : PoolException , {}".format(e.message))
        sys.exit(-1)

    except DiskListException as e:
        print("Error : DiskListException , {}".format(e.message))
        logger.error("Clear Disk Error : DiskListException , {}".format(e.message))
        sys.exit(-1)

    except CephException as e:
        if e.id == CephException.GENERAL_EXCEPTION:
            print("Error : CephException , {}".format(e.message))
        logger.error("Clear Disk Error : CephException , {}".format(e.message))
        sys.exit(-1)

    except MetadataException as e:
        print("Error : MetadataException , {}".format(e.message))
        logger.error("Clear Disk Error : MetadataException , {}".format(e.message))
        sys.exit(-1)

    except Exception as e:
        print("Error : Exception , {}".format(str(e)))
        logger.error("Clear Disk Error : Exception , {}".format(str(e)))
        sys.exit(-1)
def _read_file_lines(self, backup=False):
    # Get which ceph user is using this function & get his keyring file path #
    ceph_auth = CephAuthenticator()

    call_cmd('mkdir -p ' + self.CRUSH_SAVE_PATH)
    cluster_name = configuration().get_cluster_name()
    rand = self._get_rand_string(6)
    bin_file = self.CRUSH_SAVE_PATH + 'crushmap-tmp-' + rand + '.bin'
    txt_file = self.CRUSH_SAVE_PATH + 'crushmap-tmp-' + rand + '.txt'

    cmd = 'ceph osd getcrushmap -o ' + bin_file + ' ' \
          + ceph_auth.get_authentication_string() + ' --cluster ' + cluster_name
    ret, stdout, stderr = exec_command_ex(cmd)

    if ret != 0:
        if stderr and ('Connection timed out' in stderr or 'error connecting' in stderr):
            logger.error('Error in Ceph Connection cmd:' + cmd)
            raise CephException(CephException.CONNECTION_TIMEOUT, 'Connection Timeout Error')

        logger.error('General error in Ceph cmd:' + cmd + ' error:' + stderr)
        raise CephException(CephException.GENERAL_EXCEPTION, 'General Ceph Error')

    cmd = 'crushtool -d ' + bin_file + ' -o ' + txt_file
    if not call_cmd(cmd):
        raise CrushException(CrushException.DECOMPILE, 'Crush Decompile Error')

    with open(txt_file, 'r') as f:
        lines = f.readlines()
    lines = [line.strip() for line in lines]

    section = 'start'
    # for section tags see src/crush/CrushCompiler.cc decompile
    for line in lines:
        if len(line) == 0:
            continue
        if line.startswith('# begin crush map'):
            section = 'tunables'
            continue
        elif line.startswith('# devices'):
            section = 'devices'
            continue
        elif line.startswith('# types'):
            section = 'types'
            continue
        elif line.startswith('# buckets'):
            section = 'buckets'
            continue
        elif line.startswith('# rules'):
            section = 'rules'
            continue
        elif line.startswith('# choose_args'):
            section = 'end'
            break
        elif line.startswith('# end crush map'):
            section = 'end'
            break

        if section == 'tunables':
            self.lines_tunables.append(line)
        elif section == 'devices':
            self.lines_devices.append(line)
        elif section == 'types':
            self.lines_types.append(line)
        elif section == 'buckets':
            self.lines_buckets.append(line)
        elif section == 'rules':
            self.lines_rules.append(line)

    if backup:
        self._backup(txt_file)

    call_cmd('rm ' + txt_file)
    call_cmd('rm ' + bin_file)
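# For reference, the '# ...' markers this parser keys on appear in a
# decompiled crush map roughly as follows (a trimmed, illustrative sample;
# real maps carry many more entries per section):
#
#     # begin crush map
#     tunable choose_local_tries 0
#     # devices
#     device 0 osd.0 class hdd
#     # types
#     type 0 osd
#     type 1 host
#     # buckets
#     host node-01 {
#         id -2
#         item osd.0 weight 1.000
#     }
#     # rules
#     rule replicated_rule {
#         id 0
#         type replicated
#         step take default
#         step chooseleaf firstn 0 type host
#         step emit
#     }
#     # end crush map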
def get_active_pools(self):
    active_pools = []
    ceph_api = CephAPI()
    cluster = None

    try:
        cluster_name = configuration().get_cluster_name()
        ceph_auth = CephAuthenticator()

        # Connect to the cluster; resolving the ceph user and its keyring
        # is handled inside CephAPI.connect():
        cluster = ceph_api.connect()

        # Get the list of all pools:
        pools = cluster.list_pools()
        if not pools:
            pools = []

        # Create one checker thread per pool and start them all:
        threads = []
        for pool in pools:
            thread = PoolCheckerThread(cluster_name, pool)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)

        # Give the threads up to self.timeout seconds in total to finish:
        end_time = time() + self.timeout
        for thread in threads:
            wait = end_time - time()
            if wait < 0:
                break
            thread.join(wait)

        for thread in threads:
            # Get pg_num for the current thread's pool:
            cmd = 'ceph osd pool get {} pg_num {} --cluster {}'.format(
                thread.pool, ceph_auth.get_authentication_string(), thread.cluster_name)
            ret, stdout, stderr = exec_command_ex(cmd)

            if ret != 0:
                # The outer except handler shuts the cluster down:
                if stderr and ('Connection timed out' in stderr or 'error connecting' in stderr):
                    logger.error('Error in Ceph Connection cmd:' + cmd)
                    raise CephException(CephException.CONNECTION_TIMEOUT, 'ConnectionTimeError')

                logger.error('General error in Ceph cmd:' + cmd)
                raise CephException(CephException.GENERAL_EXCEPTION, 'GeneralCephException')

            # Output has the form "pg_num: <N>"; take the second token:
            output_ls = stdout.split()
            pool_pg_num = output_ls[1]

            # A pool counts as active only if all of its PGs are active:
            if not thread.is_alive() and thread.active_pgs_num > 0:
                if thread.active_pgs_num == int(pool_pg_num):
                    active_pools.append(thread.pool)

        active_pools.sort()
        cluster.shutdown()

    except Exception as e:
        logger.error("PoolChecker error : " + str(e))
        if cluster:
            cluster.shutdown()

    return active_pools