def do_connect(self):
    try:
        conf_api = ConfigAPI()

        # Get which ceph user is using this function #
        # ========================================== #
        users = Users()
        user_name = users.get_current_system_user().strip()
        if user_name == "root":
            user_name = "admin"

        # Get ceph user's keyring file path #
        # ================================= #
        ceph_auth = CephAuthenticator()

        cluster_name = configuration().get_cluster_name()
        cluster = rados.Rados(conffile=conf_api.get_ceph_conf_path(cluster_name),
                              conf=dict(keyring=ceph_auth.get_keyring_path()),
                              rados_id=user_name)
        cluster.connect()
        return cluster

    except Exception as e:
        logger.error("do_connect() Cannot connect to ceph cluster.")
        logger.exception(e.message)
        try:
            cluster.shutdown()
        except Exception:
            pass
        return -1
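
# Hypothetical usage sketch (not part of the original module): how a caller might
# consume the handle returned by do_connect(). The conf/keyring paths and the
# "admin" rados_id are illustrative assumptions; only standard python-rados calls
# (connect, list_pools, get_cluster_stats, shutdown) are relied upon.
import rados

def _example_cluster_summary(conf_path="/etc/ceph/ceph.conf",
                             keyring_path="/etc/ceph/ceph.client.admin.keyring"):
    cluster = rados.Rados(conffile=conf_path,
                          conf=dict(keyring=keyring_path),
                          rados_id="admin")
    try:
        cluster.connect()
        stats = cluster.get_cluster_stats()   # kb, kb_used, kb_avail, num_objects
        print("pools: %s" % cluster.list_pools())
        print("used kb: %s / total kb: %s" % (stats['kb_used'], stats['kb']))
    finally:
        cluster.shutdown()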
def manager(args):
    try:
        logger.info("Benchmark manager cmd. ")
        clients = args.c.split(',')
        if len(clients) < 1:
            print "No clients set."
            sys.exit(-1)
        cleanup = True
        if args.cleanup == "0":
            cleanup = False
        result = Benchmark().manager(args.type, args.d, args.t, clients, args.p, cleanup)
        result = result.write_json()
        # Write job passed flag
        sys.stdout.write(Benchmark().output_split_text)
        # Write output
        sys.stdout.write(result)
    except Exception as ex:
        logger.exception(ex.message)
        sys.exit(-1)
    sys.exit(0)
def run(self):
    try:
        result = Result()
        ceph_api = CephAPI()
        cluster_status = ceph_api.get_ceph_cluster_status()
        if cluster_status is not None:
            cluster_status = json.loads(cluster_status)
            available_size = 0
            used_size = 0
            if cluster_status['pgmap']['bytes_total'] > 0:
                available_size = cluster_status['pgmap']['bytes_avail'] * 100.0 / cluster_status['pgmap']['bytes_total']
                used_size = cluster_status['pgmap']['bytes_used'] * 100.0 / cluster_status['pgmap']['bytes_total']
            notify_cluster_space_percent = ConfigAPI().get_notify_cluster_used_space_percent()
            if float(used_size) > float(notify_cluster_space_percent):
                check_state = self.__context.state.get(self.get_plugin_name(), False)
                if check_state == False:
                    result.title = gettext("core_message_notify_title_cluster_out_space")
                    result.message = '\n'.join(gettext("core_message_notify_cluster_out_space").split("\\n")).format(int(available_size))
                    # logger.warning(result.message)
                    result.plugin_name = str(self.get_plugin_name())
                    self.__context.results.append(result)
                    self.__context.state[self.get_plugin_name()] = True
                    logger.warning("Cluster is running out of disk space")
                    return
        self.__context.state[self.get_plugin_name()] = False
    except:
        logger.exception("Error occurred while getting cluster state.")
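
# Minimal sketch of the threshold check above, run against a hard-coded pgmap
# sample instead of a live "ceph status" call. The sample numbers and the 85%
# default threshold are illustrative assumptions, not values from a real cluster.
def _example_used_space_alert(pgmap, threshold_percent=85):
    if pgmap['bytes_total'] <= 0:
        return False
    used_percent = pgmap['bytes_used'] * 100.0 / pgmap['bytes_total']
    return used_percent > threshold_percent

# e.g. _example_used_space_alert({'bytes_total': 1000, 'bytes_used': 900,
#                                 'bytes_avail': 100})  ->  True (90% > 85%)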
def run(self):
    try:
        status = False
        consul = ConsulAPI()
        failed_jobs = consul.get_replication_failed_jobs()
        if len(failed_jobs) > 0:
            failed_jobs_str = ""
            for job_id, job_info in failed_jobs.iteritems():
                failed_jobs_str += "\n job id: " + job_id + " job name: " + job_info.job_name
                status = consul.delete_failed_job(job_id)
            result = Result()
            result.plugin_name = self.get_plugin_name()
            result.title = gettext("core_message_notify_failed_jobs_title")
            result.message = '\n'.join(gettext("core_message_notify_failed_jobs_body").split("\\n")).format(failed_jobs_str)
            self.__context.results.append(result)
            logger.info(result.message)
            logger.info("status of deleting failed jobs from consul is " + str(status))
    except Exception as e:
        logger.exception(e)
        logger.error("An error occurred while ReplicationNotificationPlugin was running.")
def __sync_cluster_config_file(self):
    try:
        manage_conf = configuration()
        current_node_name = manage_conf.get_node_info().name
        cluster_info = manage_conf.get_cluster_info()
        config_api = ConfigAPI()

        for i in cluster_info.management_nodes:
            node_info = NodeInfo()
            node_info.load_json(json.dumps(i))
            if node_info.name != current_node_name:
                ssh_obj = ssh()
                if not ssh_obj.copy_file_to_host(node_info.management_ip,
                                                 config_api.get_cluster_info_file_path()):
                    logger.error("Could not copy configuration file to {} server.".format(node_info.name))
                    self.__status_report.success = False
                    self.__status_report.failed_tasks.append("core_cluster_deploy_couldnt_sync_config_file")
                    return False
    except Exception as ex:
        logger.exception(ex.message)
        self.__status_report.success = False
        self.__status_report.failed_tasks.append("core_cluster_deploy_couldnt_sync_config_file")
        return False

    # copy_file_to_host
    return True
def start(self, disk_id, pool):
    try:
        ceph_api = CephAPI()
        consul_api = ConsulAPI()
        attr = ceph_api.read_image_metadata(ConfigAPI().get_image_name_prefix() + disk_id, pool)
        petasan_meta = attr.get(ConfigAPI().get_image_meta_key())
        disk_meta = DiskMeta()
        if petasan_meta:
            disk_meta.load_json(petasan_meta)
        else:
            return Status.error

        consul_api.add_disk_resource(disk_meta.id, "disk")
        consul_api.add_disk_pool(disk_meta.id, pool)
        i = 0
        for p in disk_meta.paths:
            i += 1
            consul_api.add_disk_resource("/".join(["", disk_meta.id, str(i)]), None)
    except Exception as e:
        logger.error("Can not start disk %s" % disk_id)
        logger.exception(e.message)
        return Status.error
    return Status.done
def __process(self):
    while True:
        try:
            self.__do_check()
            self.__do_notify()
        except Exception as ex:
            logger.exception(ex.message)
        sleep(self.__sleep_time)
def __read_resources_consul(self):
    logger.debug("Start read resources consul.")
    self.__paths_per_session = {}
    self.__total_cluster_paths = 0
    unlock_kvs = set()
    self.__paths_consul_locked_node = dict()
    try:
        disk_kvs = ConsulAPI().get_disk_kvs()
        for kv in disk_kvs:
            key = str(kv.Key).replace(self.__app_conf.get_consul_disks_path(), "")
            disk_id = str(key).split('/')[0]
            if disk_id in self.__disk_consul_stopped:
                continue
            if kv.Value == "disk":
                disk_id = str(key).split('/')[0]
                self.__paths_per_disk_local[disk_id] = 0
                if str(kv.Flags) == "1":
                    self.__disk_consul_stopped.add(disk_id)
                continue
            # Count paths in the cluster.
            self.__total_cluster_paths += 1
            if hasattr(kv, "Session"):
                disk_id = str(key).split('/')[0]
                disks = self.__paths_consul_locked_node.get(kv.Session, dict())
                paths = disks.get(disk_id, 0)
                disks[disk_id] = paths + 1
                self.__paths_consul_locked_node[kv.Session] = disks

                # The count of paths for each session
                if self.__paths_per_session.has_key(kv.Session):
                    count = self.__paths_per_session.get(kv.Session)
                    self.__paths_per_session[kv.Session] = count + 1
                else:
                    self.__paths_per_session[kv.Session] = 1

                if kv.Session == self.__session:
                    self.__paths_consul_locked_node.add(key)
                    disk_paths_count = self.__paths_per_disk_local.get(disk_id, 0) + 1
                    self.__paths_per_disk_local[disk_id] = disk_paths_count
            # unlocked paths
            elif not hasattr(kv, "Session"):
                unlock_kvs.add(kv)

        # Filter unlocked paths
        for kv in unlock_kvs:
            key = str(kv.Key).replace(self.__app_conf.get_consul_disks_path(), "")
            disk_id = str(key).split('/')[0]
            if self.__paths_per_disk_local.get(disk_id, 0) > 0:
                self.__paths_consul_unlocked_siblings[key] = kv.CreateIndex
            else:
                self.__paths_consul_unlocked_firstborn[key] = kv.CreateIndex
    except Exception as e:
        logger.error("Could not read consul resources.")
        logger.exception(e)
        raise e
    logger.debug("End read resources consul.")
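
# A rough sketch, assuming the python-consul client, of how the per-session path
# counts above could be derived straight from the KV store. The "PetaSAN/Disks/"
# prefix is an assumption for illustration; PetaSAN's own ConsulAPI wrapper
# exposes entries as objects (kv.Key, kv.Session) rather than the plain dicts
# that python-consul returns.
import consul

def _example_paths_per_session(prefix="PetaSAN/Disks/"):
    c = consul.Consul()
    index, entries = c.kv.get(prefix, recurse=True)
    paths_per_session = {}
    for entry in (entries or []):
        if entry.get('Value') == 'disk':       # disk root key, not a path
            continue
        session = entry.get('Session')         # present only while the path is locked
        if session:
            paths_per_session[session] = paths_per_session.get(session, 0) + 1
    return paths_per_session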
def manager(self, test_type, duration_sec, threads, clients, pool, cleanup):
    # CephAPI().create_rados_test_pool()
    logger.debug("Benchmark manager request.")
    logger.debug(clients)
    try:
        self.type = int(test_type)
        # Duration of write and read stress test
        self.stress_duration = duration_sec / 2
        self.threads = threads
        # Non-storage nodes
        self.clients = clients
        # The span of time to wait between running the rados test and collecting the state of storage nodes
        self.wait_for_collect_state = self.stress_duration / 4
        # Duration of collecting node state
        self.state_duration = self.stress_duration / 2
        # pool
        self.pool = pool

        # If cleanup ran before the test, there would be no cleanup file with the written objects,
        # so cleanup would iterate over all objects in the pool, which is very slow.
        # self.__cleanup()

        nodes = ManageNode().get_node_list()
        # Get available storage nodes
        for node in nodes:
            if not node.name in clients and node.status == NodeStatus.up and node.is_storage:
                self.storage_nodes.append(str(node.name))
        print self.storage_nodes
        if len(self.storage_nodes) == 0 and \
                (self.type == RadosBenchmarkType.four_mg_Throughput or self.type == RadosBenchmarkType.four_kb_iops):
            raise Exception("Cannot complete rados benchmark. No storage nodes are available to run the test.")
        logger.debug(self.storage_nodes)

        if self.type == RadosBenchmarkType.four_mg_Throughput or self.type == RadosBenchmarkType.four_kb_iops:
            self.report = BenchmarkResult()
            logger.info("Benchmark start rados write.")
            self.__write()
            logger.info("Benchmark start rados read.")
            self.__read()
            logger.info("Benchmark finished.")
            return self.report
        else:
            # TODO
            pass
    except Exception as e:
        logger.exception(e.message)
    finally:
        # CephAPI().delete_rados_test_pool()
        if cleanup:
            self.__cleanup()
def notify(self):
    try:
        messages = self.__context.state.get(self.get_plugin_name(), None)
        if not messages:
            messages = []
        config_api = ConfigAPI()
        smtp_config = config_api.read_app_config()
        if smtp_config.email_notify_smtp_server == "" and smtp_config.email_notify_smtp_email == "":
            # logger.warning("SMTP configuration not set.")
            return
        followers = self._get_followers()
        for result in self.__context.results:
            if hasattr(result, "is_email_process") and result.is_email_process:
                continue
            for user in followers:
                fromaddr = smtp_config.email_notify_smtp_email
                toaddr = user.email
                # msg = MIMEMultipart()
                msg = email_utils.create_msg(fromaddr, toaddr, result.title, result.message)
                msg.smtp_server = smtp_config.email_notify_smtp_server
                msg.smtp_server_port = smtp_config.email_notify_smtp_port
                msg.email_password = smtp_config.email_notify_smtp_password
                msg.retry_counter = 0
                msg.full_msg = result.title + "\n" + result.message
                msg.security = smtp_config.email_notify_smtp_security
                messages.append(msg)
            result.is_email_process = True
            result.count_of_notify_plugins += 1

        unsent_messages = []
        for msg in messages:
            # Note: on any error, log and continue.
            # Note: if the send succeeds, the message is not re-queued.
            status = email_utils.send_email(msg.smtp_server, msg.smtp_server_port, msg,
                                            msg.email_password, msg.security)
            if not status.success:
                msg.retry_counter += 1
                if msg.retry_counter < 1440:
                    unsent_messages.append(msg)
                if msg.retry_counter in (1, 120, 480, 960, 1440):
                    logger.error("PetaSAN tried to send this email {} times, cannot send this message: {}."
                                 .format(msg.retry_counter, msg.full_msg))
                    logger.exception(status.exception)
        self.__context.state[self.get_plugin_name()] = unsent_messages
    except Exception as ex:
        logger.exception(ex)
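
# A minimal sketch, assuming a plain smtplib backend, of what a send_email()
# helper like the one used above might boil down to. The function name, the
# "tls" security flag, and the argument order are illustrative assumptions;
# PetaSAN's email_utils may differ.
import smtplib
from email.mime.text import MIMEText

def _example_send_email(server, port, fromaddr, toaddr, subject, body,
                        password=None, security="tls"):
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = fromaddr
    msg['To'] = toaddr
    smtp = smtplib.SMTP(server, port)
    try:
        if security == "tls":
            smtp.starttls()
        if password:
            smtp.login(fromaddr, password)
        smtp.sendmail(fromaddr, [toaddr], msg.as_string())
    finally:
        smtp.quit()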
def clean(args):
    try:
        logger.info("Benchmark clean cmd. ")
        pool = args.p
        CephAPI().rados_benchmark_clean(pool)
    except Exception as ex:
        logger.exception(ex.message)
        sys.exit(-1)
    sys.exit(0)
def server(args):
    try:
        logger.info("Reassignment paths script invoked to run process action.")
        MangePathAssignment().process()
    except Exception as ex:
        logger.error("Error processing reassignment actions.")
        logger.exception(ex.message)
        print(-1)
        sys.exit(-1)
def get_node_status(self):
    config_api = ConfigAPI()
    root_path = config_api.get_consul_nodes_path()
    node_status = JoiningStatus.not_joined

    if os.path.exists(config_api.get_replace_file_path()):
        return node_status

    try:
        # Config
        try:
            config = configuration()
            node_name = config.get_node_info().name
        except Exception as config_exc:
            logger.exception("Config file error. The PetaSAN OS may have just been installed.")
            return node_status

        try:
            cluster_info = config.get_cluster_info().management_nodes
            mgmt_nodes_count = len(cluster_info)
            if mgmt_nodes_count < 3:
                raise Exception("Cluster is not complete, PetaSAN will check node join status.")
            consul_base = BaseAPI()
            data = consul_base.read_value(root_path + node_name)
            if data is not None and configuration().are_all_mgt_nodes_in_cluster_config():
                return JoiningStatus.node_joined
            else:
                return JoiningStatus.not_joined
        except Exception as exc:
            cluster_info = config.get_cluster_info().management_nodes
            mgmt_nodes_count = len(cluster_info)
            logger.exception(exc.message)
            if mgmt_nodes_count < 3 and not config.is_node_in_cluster_config():
                return JoiningStatus.not_joined
            elif mgmt_nodes_count == 3 and config.is_node_in_cluster_config():
                return JoiningStatus.node_joined
            if mgmt_nodes_count == 1:
                return JoiningStatus.one_node_exists
            elif mgmt_nodes_count == 2:
                return JoiningStatus.two_node_exist
    except Exception as e:
        return JoiningStatus.not_joined

    return node_status
def __commit_cluster_info(self):
    confi_api = ConfigAPI()
    consul_base_api = BaseAPI()
    cluster_conf = configuration()
    try:
        consul_base_api.write_value(confi_api.get_consul_cluster_info_path(),
                                    cluster_conf.get_cluster_info().write_json())
    except Exception as ex:
        logger.exception(ex.message)
        return False
    return True
def build_crontab(self):
    directory_path = "/opt/petasan/tmp/"
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)

    replication = ReplicationCronTab(self.node_name)
    replication.generate_jobs("#Replication Jobs")

    try:
        shutil.move(self.tmp_file, self.dest_file)
    except Exception as e:
        logger.error("Can't generate Cron file")
        logger.exception(e.message)
def run(self):
    try:
        global old_pools
        current_pools = {}
        pools_used_space = {}

        notify_pool_size = ConfigAPI().get_notify_pool_used_space_percent()

        ## Get the percentage of used storage for each pool ##
        ## ------------------------------------------------- ##
        pools_used_space = self.get_pools_used_space()

        ## Loop and run checks for each pool ##
        ## --------------------------------- ##
        for pool_name, pool_space in pools_used_space.iteritems():
            # Check whether the pool's used space has reached the notification threshold #
            if pool_space >= notify_pool_size:
                # Skip the pool if the same name with the same used space already exists in "old_pools" #
                key, value = pool_name, pool_space
                if key in old_pools and value == old_pools[key]:
                    continue
                current_pools[pool_name] = pool_space
            # If the pool's used space is below the threshold --> remove the pool from "old_pools" if it exists #
            else:
                if pool_name in old_pools:
                    del old_pools[pool_name]

        ## Notify user if length of "current_pools" > 0 ##
        ## -------------------------------------------- ##
        if len(current_pools) > 0:
            self.__context.state[self.get_plugin_name()] = current_pools
            result = Result()
            result.plugin_name = self.get_plugin_name()
            result.title = gettext("core_message_notify_pool_used_space_title")
            result.message = '\n'.join(gettext("core_message_notify_pool_used_space_body").split("\\n")).format(
                ''.join('\n - pool : {} , used space = {}%'.format(key, val)
                        for key, val in current_pools.iteritems()))
            # logger.warning(result.message)
            self.__context.results.append(result)

            # Update the dictionary "old_pools" with the items of "current_pools" #
            old_pools.update(current_pools)

            ## Empty the dictionary current_pools ##
            ## ----------------------------------- ##
            current_pools = dict()
    except Exception as e:
        logger.exception(e)
        logger.error("An error occurred while PoolSizePlugin was running.")
def collect_all(self):
    self.collect_local_node_state()
    self.collect_remote_nodes_state()
    cluster_name = configuration().get_cluster_name()
    cluster_file = '/opt/petasan/log/{}'.format(cluster_name)
    try:
        os.system("tar -cPvf {}.tar {}".format(cluster_file, ConfigAPI().get_collect_state_dir()))
        logger.info("Collected management nodes state successfully.")
        os.system("rm -rf {}".format(ConfigAPI().get_collect_state_dir()))
        return True
    except:
        logger.exception("Error compressing state files.")
        return False
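
# Alternative sketch of the same archiving step using the standard library instead
# of shelling out to tar. The default state_dir is an assumption standing in for
# ConfigAPI().get_collect_state_dir(); unlike os.system, failures here raise and
# would therefore actually be caught by a surrounding try/except.
import shutil
import tarfile

def _example_archive_state_dir(cluster_name, state_dir="/opt/petasan/collect_state/"):
    archive_path = "/opt/petasan/log/{}.tar".format(cluster_name)
    with tarfile.open(archive_path, "w") as tar:
        tar.add(state_dir)           # adds the directory recursively
    shutil.rmtree(state_dir, ignore_errors=True)
    return archive_path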
def write_file(self, file_path, value):
    try:
        # logger.info("Trying to write file " + file_path)
        if os.path.isdir('/' + file_path):
            return
        # Try to create a new file or open an existing one
        file = open(file_path.replace(self.root_path, ''), 'w')
        file.write(value)
        file.close()
    except Exception as e:
        logger.exception(e.message)
        return
def __get_down_node_list(self):
    down_node_list = []
    try:
        con_api = ConsulAPI()
        node_list = con_api.get_node_list()
        consul_members = con_api.get_consul_members()
        for i in node_list:
            if i.name not in consul_members:
                i.status = NodeStatus.down
                down_node_list.append(i.name)
        return down_node_list
    except Exception as e:
        logger.exception("Error getting down node list.")
        return down_node_list
def sync(self, node_key=None):
    # logger.info("Begin sync")
    base = BaseAPI()
    # async poll for updates
    current_index = 0  # None
    while True:
        try:
            if node_key is not None:
                index, data = base.watch(node_key, current_index)
            else:
                index, data = base.watch(self.root_path, current_index)
            current_index = index
            if data is not None:
                for data_obj in data:
                    file_path = data_obj['Key'].replace(self.root_path, "")
                    # logger.info(data_obj['Key'])
                    # logger.info(data_obj['Value'])
                    self.write_file("/" + data_obj['Key'], base64.b64decode(data_obj['Value']))
        except Exception as e:
            logger.exception(e.message)
        sleep(2)
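
# A rough sketch, assuming the python-consul client, of the blocking-watch loop
# that sync() drives through BaseAPI.watch(). Note that python-consul already
# returns KV values base64-decoded, whereas the BaseAPI wrapper above hands back
# raw base64 (hence the b64decode in write_file). The handle_update callback is
# an illustrative stand-in for the file-writing step.
import consul
from time import sleep

def _example_watch_prefix(prefix, handle_update):
    c = consul.Consul()
    index = None
    while True:
        try:
            # Long-polls until the prefix changes or the request times out.
            index, entries = c.kv.get(prefix, index=index, recurse=True)
            if entries:
                for entry in entries:
                    handle_update(entry['Key'], entry['Value'])
        except Exception:
            sleep(2)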
def reset_cluster_interface(self):
    try:
        config = configuration()
        cluster_info = config.get_cluster_info()
        cluster_info.bonds = []
        cluster_info.jumbo_frames = []
        config.set_cluster_network_info(cluster_info)
        logger.info("Updated cluster interface successfully.")
    except Exception as ex:
        logger.exception(ex.message)
        return Status().error
    return Status().done
def __commit_remote_nodes(self):
    confi_api = ConfigAPI()
    consul_base_api = BaseAPI()
    cluster_conf = configuration()
    try:
        for node in cluster_conf.get_remote_nodes_config(cluster_conf.get_node_name()):
            consul_base_api.write_value(confi_api.get_consul_nodes_path() + node.name,
                                        node.write_json())
    except Exception as ex:
        logger.exception(ex.message)
        return False
    return True
def storage(args):
    try:
        logger.info("Benchmark storage cmd. ")
        result = Benchmark().sar_stats(args.d)
        result = result.write_json()
        # Write job passed flag
        sys.stdout.write(Benchmark().output_split_text)
        # Write output
        sys.stdout.write(result)
    except Exception as ex:
        logger.exception(ex.message)
        sys.exit(-1)
    sys.exit(0)
def __clean(self):
    logger.info("Cleaning unused configurations. ")
    logger.info("Cleaning all mapped disks")
    ceph_api = CephAPI()
    lio_api = LioAPI()
    network_api = NetworkAPI()

    # Get tpgs of each iqn
    for iqn, tpgs in lio_api.get_iqns_with_tpgs().iteritems():
        try:
            disk_id = str(iqn).split(":")[1]
            # Get assigned ips for each tpg
            for tpg, ips in tpgs.iteritems():
                if ips and len(ips) > 0:
                    for ip in ips:
                        # 1- Remove ip from network interface.
                        if not network_api.delete_ip(ip, self.__cluster_info.iscsi_1_eth_name):
                            network_api.delete_ip(ip, self.__cluster_info.iscsi_2_eth_name)
            # 2- Delete iqn, delete image from rbd backstore and unmap image.
            image_name = self.__image_name_prefix + str(disk_id)
            lio_api.delete_target(image_name, iqn)
            ceph_api.unmap_image(image_name)
        except Exception as e:
            logger.error("Error cleaning all mapped disks, disk %s " % image_name)
            logger.exception(e.message)

    # 3- From backstore
    for image_name in lio_api.get_backstore_image_names():
        try:
            lio_api.delete_backstore_image(image_name)
            ceph_api.unmap_image(image_name)
        except Exception as e:
            logger.error("Error cleaning all mapped disks, disk %s " % image_name)

    logger.info("Cleaning unused rbd images.")
    try:
        self.__clean_unused_rbd_images()
    except:
        logger.error("Error cleaning unused rbd images.")

    logger.info("Cleaning unused ips.")
    try:
        self.__local_ips = set()
        self.__clean_unused_ips()
    except:
        logger.error("Error cleaning unused ips.")
def is_valid_network_setting(self):
    config = configuration().get_cluster_info()
    net = Network()
    eths = net.get_node_interfaces()
    try:
        if config.eth_count != len(eths):
            return False
        elif config.management_eth_name != net.get_node_management_interface():
            return False
        else:
            return True
    except Exception as ex:
        logger.exception(ex.message)
        return False
def state_all():
    try:
        StateUtil().collect_all()
        cluster_name = configuration().get_cluster_name()
        cluster_file = '/opt/petasan/log/' + cluster_name + '.tar'
        manage_node = ManageNode()
        return Response(
            stream_with_context(manage_node.read_file(cluster_file)),
            mimetype="application/x-tar",
            headers={
                "Content-Disposition": "attachment; filename={}".format(cluster_name + '.tar')
            })
    except Exception as e:
        logger.exception("Error downloading the state-all file.")
def run(self):
    try:
        result = Result()
        ceph_status_overall = ""
        ceph_api = CephAPI()
        cluster_status = ceph_api.get_ceph_cluster_status()   # ceph status --format json-pretty
        if cluster_status is not None:
            cluster_status = json.loads(cluster_status)

            # Ceph 12:
            if "overall_status" in cluster_status["health"] and cluster_status["health"]["overall_status"] is not None:
                ceph_status_overall = cluster_status["health"]["overall_status"]
            else:
                ceph_status_overall = cluster_status["health"]["status"]

            if ceph_status_overall == "HEALTH_ERR":
                prv_err = self.__context.state.get(self.get_plugin_name(), False)
                if not prv_err:
                    ceph_health_obj = cluster_status["health"]
                    summary_messages = ""
                    summary_messages_ls = []
                    if "checks" in ceph_health_obj:
                        for key in ceph_health_obj["checks"]:
                            if ceph_health_obj["checks"][key] is not None:
                                msg = ceph_health_obj["checks"][key]["summary"]["message"]
                                summary_messages_ls.append(msg)
                        summary_messages = '\n '.join(summary_messages_ls)
                    result.title = gettext("core_message_notify_cluster_status_title")
                    result.message = '\n'.join(gettext("core_message_notify_cluster_status_body").split("\\n")).format(summary_messages)
                    result.plugin_name = str(self.get_plugin_name())
                    self.__context.results.append(result)
                    self.__context.state[self.get_plugin_name()] = True
                    logger.warning("Cluster overall health status is HEALTH_ERR")
                    return
        self.__context.state[self.get_plugin_name()] = False
    except Exception as e:
        logger.exception(e)
        logger.error("An error occurred while ClusterStatusPlugin was running.")
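
# A minimal sketch, assuming the ceph CLI is on PATH, of what
# get_ceph_cluster_status() plus the health extraction above boil down to. The
# command and the "overall_status"/"status" fallback mirror the logic in run();
# the rest of CephAPI is left out.
import json
import subprocess

def _example_ceph_overall_health():
    raw = subprocess.check_output(["ceph", "status", "--format", "json-pretty"])
    status = json.loads(raw)
    health = status["health"]
    # Older releases report "overall_status"; newer ones only report "status".
    return health.get("overall_status") or health.get("status")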
def commit_file(self, file_path):
    try:
        logger.info("Begin commit_file, key: " + file_path)
        base = BaseAPI()
        logger.info("Begin reading key: " + file_path)
        file = open(file_path, 'r')
        str_value = file.read()
        file.close()
        base.write_value(self.root_path + file_path, base64.b64encode(str_value))
        logger.info("Committed")
        return True
    except Exception as e:
        logger.error("Could not push key ['" + file_path + "'] to consul.")
        logger.exception(e.message)
        return False
def kill_petasan_console(self, remote=True):
    cluster_conf = configuration()
    ssh_obj = ssh()
    exec_command("python {} ".format(ConfigAPI().get_kill_console_script_path()))
    if not remote:
        return
    try:
        for ip in cluster_conf.get_remote_ips(cluster_conf.get_node_name()):
            ssh_obj.exec_command(ip, "python {} ".format(ConfigAPI().get_kill_console_script_path()))
    except Exception as ex:
        logger.exception(ex.message)
        raise ex
def state():
    try:
        StateUtil().collect_local_node_state()
        node_name = configuration().get_node_name()
        collected_path = ConfigAPI().get_collect_state_dir() + node_name + '.tar'
        manage_node = ManageNode()
        return Response(
            stream_with_context(manage_node.read_file(collected_path)),
            mimetype="application/x-tar",
            headers={
                "Content-Disposition": "attachment; filename={}".format(node_name + '.tar')
            })
    except Exception as e:
        logger.exception("Error downloading the state file.")