def save_current_tunings(self, ceph, lio, post_script, storage_engine):
    config_api = ConfigAPI()
    path = config_api.get_current_tunings_path()
    if not os.path.exists(path):
        os.makedirs(path)
    with open(path + config_api.get_ceph_tunings_file_name(), 'w') as f:
        f.write(ceph)
    with open(path + config_api.get_lio_tunings_file_name(), 'w') as f:
        f.write(lio)
    with open(path + config_api.get_post_deploy_script_file_name(), 'w') as f:
        f.write(post_script)
    logger.info("Current tuning configurations saved.")

    # Save "storage_engine" in cluster info:
    try:
        ci = self.get_cluster_info()
        ci.storage_engine = storage_engine
        self.set_cluster_network_info(ci)
    except Exception as ex:
        logger.error("Cannot add storage engine to cluster info, {}".format(ex.message))
def _clean_iscsi_config(self, disk_id, path_index, iqn):
    logger.debug("Move action, start clean disk {} path {}.".format(disk_id, path_index))
    lio_api = LioAPI()
    try:
        # Get tpgs for iqn.
        tpgs = lio_api.get_iqns_with_enabled_tpgs().get(iqn, None)
        if not iqn or not tpgs or len(tpgs) == 0:
            logger.info("Move action, could not find ips for %s" % disk_id)
        # Remove the assigned ips from our interfaces.
        elif tpgs and len(tpgs) > 0:
            # Get assigned ips for each path.
            for tpg, ips in tpgs.iteritems():
                if tpg == str(path_index + 1):
                    lio_api.disable_path(iqn, tpg)
                    logger.info("Move action, cleaned disk {} path {}.".format(disk_id, path_index))
                    break
    except Exception as e:
        logger.error("Move action, could not clean disk path for %s" % disk_id)
        return False
    logger.debug("Move action, end clean disk {} path {}.".format(disk_id, path_index))
    return True
def path_host(args):
    logger.info("Reassignment paths script invoked to run clean action.")
    if MangePathAssignment().clean_source_node(args.ip, args.disk_id):
        print "0"
        return
    print "-1"
def run(self):
    try:
        status = False
        consul = ConsulAPI()
        failed_jobs = consul.get_replication_failed_jobs()
        if len(failed_jobs) > 0:
            failed_jobs_str = ""
            for job_id, job_info in failed_jobs.iteritems():
                failed_jobs_str += "\n job id: " + job_id + " job name: " + job_info.job_name
                status = consul.delete_failed_job(job_id)
            result = Result()
            result.plugin_name = self.get_plugin_name()
            result.title = gettext("core_message_notify_failed_jobs_title")
            result.message = '\n'.join(gettext("core_message_notify_failed_jobs_body").split("\\n")).format(failed_jobs_str)
            self.__context.results.append(result)
            logger.info(result.message)
            logger.info("status of deleting failed jobs from consul is " + str(status))
    except Exception as e:
        logger.exception(e)
        logger.error("An error occurred while ReplicationNotificationPlugin was running.")
def delete_cache(self, node_name, disk_name):
    ssh_obj = ssh()
    cmd = "python {} -disk_name {}".format(ConfigAPI().get_admin_delete_cache_job_script(), disk_name)
    stdout, stderr = ssh_obj.exec_command(node_name, cmd)
    logger.info("Start delete cache job {}".format(stdout))
    return stdout
def manager(args):
    try:
        logger.info("Benchmark manager cmd.")
        clients = args.c.split(',')
        if len(clients) < 1:
            print "No clients set."
            sys.exit(-1)
        cleanup = True
        if args.cleanup == "0":
            cleanup = False
        result = Benchmark().manager(args.type, args.d, args.t, clients, args.p, cleanup)
        result = result.write_json()
        # Write job passed flag
        sys.stdout.write(Benchmark().output_split_text)
        # Write output
        sys.stdout.write(result)
    except Exception as ex:
        logger.exception(ex.message)
        sys.exit(-1)
    sys.exit(0)
def delete_osd_from_crush_map(osd_id):
    cluster_name = configuration().get_cluster_name()
    logger.info("Start remove osd.{} from crush map".format(osd_id))
    is_executing_without_err = True
    if not call_cmd("ceph --cluster {} osd out osd.{}".format(cluster_name, osd_id)):
        logger.error("Error executing ceph osd out osd.{}".format(osd_id))
        is_executing_without_err = False
    if not call_cmd("ceph --cluster {} osd crush remove osd.{}".format(cluster_name, osd_id)):
        logger.error("Error executing ceph osd crush remove osd.{}".format(osd_id))
        is_executing_without_err = False
    if not call_cmd("ceph --cluster {} auth del osd.{}".format(cluster_name, osd_id)):
        logger.error("Error executing ceph auth del osd.{}".format(osd_id))
        is_executing_without_err = False
    # Try to delete the osd completely from ceph; if the osd is still up, the next command will not execute.
    if not call_cmd("ceph --cluster {} osd rm osd.{}".format(cluster_name, osd_id)):
        logger.warning("The osd is still up, you need to stop the osd service of osd.{}".format(osd_id))
    if is_executing_without_err:
        logger.info("osd.{} is removed from crush map".format(osd_id))
    else:
        logger.warning("osd.{} was not fully removed from crush map".format(osd_id))
def test_active_clean_old():
    cluster_name = configuration().get_cluster_name()
    sleeps = [10, 15, 20, 25, 30, 40]
    tries = 5
    while tries:
        status = False
        try:
            out, err = exec_command("ceph --cluster {} -f json pg stat".format(cluster_name))
            ceph_pg_stat = str(out).replace("'", '')
            ceph_pg_stat = json.loads(ceph_pg_stat)
            logger.info("Ceph status is " + ceph_pg_stat['num_pg_by_state'][0]['name'])
            if str(ceph_pg_stat['num_pg_by_state'][0]['name']) == 'active+clean':
                status = True
            else:
                status = False
        except Exception as e:
            logger.error("Get ceph status returned error.\n" + e.message)
        if not status:
            tries -= 1
            sleep_seconds = sleeps.pop()
            logger.warning('waiting %s seconds before retrying to check active+clean status', sleep_seconds)
            time.sleep(sleep_seconds)
        else:
            # Nautilus calls pool init when active:
            call_cmd('rbd pool init rbd')
            break
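# For reference (hedged; the exact fields can vary by Ceph release): the check above
# assumes `ceph -f json pg stat` returns JSON shaped roughly like
#   {"num_pg_by_state": [{"name": "active+clean", "num": 128}], ...}
# so ceph_pg_stat['num_pg_by_state'][0]['name'] is the first listed PG state.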
def check_mount(self):
    while True:
        if 2 < len(configuration().get_cluster_info().management_nodes):
            break
        sleep(30)
    cluster_info = configuration().get_cluster_info()
    # ip1 = cluster_info.management_nodes[0]['management_ip']
    # ip2 = cluster_info.management_nodes[1]['management_ip']
    ip1 = cluster_info.management_nodes[0]['backend_1_ip']
    ip2 = cluster_info.management_nodes[1]['backend_1_ip']
    cmd_mount = 'mount -t glusterfs -o backupvolfile-server=' + ip2
    cmd_mount += ' ' + ip1 + ':' + GFS_VOL_NAME
    cmd_mount += ' ' + GFS_MOUNT_PATH + ' >/dev/null 2>&1'
    cmd_mount_test = 'mount | grep ' + GFS_MOUNT_PATH + ' >/dev/null 2>&1'
    while True:
        if subprocess.call(cmd_mount_test, shell=True) != 0:
            logger.info("GlusterFS mount attempt")
            subprocess.call(cmd_mount, shell=True)
        sleep(30)
    return
def get_next_partition_index(dev):
    """
    Get the next free partition index on a given device.

    :return: Index number (> 1 if there is already a partition on the
             device) or 1 if there is no partition table.
    """
    try:
        output, err = exec_command('parted --machine -- {} print'.format(dev))
        lines = output
    except subprocess.CalledProcessError as e:
        logger.info('cannot read partition index; assume it '
                    'isn\'t present\n (Error: %s)' % e)
        return 1

    if not lines:
        logger.error('parted failed to output anything')
        raise Exception('parted failed to output anything')
    logger.debug('get_free_partition_index: analyzing ' + lines)
    if ('CHS;' not in lines and
            'CYL;' not in lines and
            'BYT;' not in lines):
        logger.error('parted output expected to contain one of '
                     'CHS; CYL; or BYT; : ' + lines)
        raise Exception('parted output expected to contain one of CHS; CYL; or BYT;')
    if os.path.realpath(dev) not in lines:
        logger.error('parted output expected to contain ' + dev + ': ' + lines)
        raise Exception('parted output expected to contain ' + dev)
    _, partitions = lines.split(os.path.realpath(dev))
    numbers_as_strings = re.findall('^\d+', partitions, re.MULTILINE)
    partition_numbers = map(int, numbers_as_strings)
    if partition_numbers:
        return max(partition_numbers) + 1
    else:
        return 1
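# Illustrative sketch (not part of the original module): how the parsing in
# get_next_partition_index() behaves on typical `parted --machine -- <dev> print`
# output. The device path and partition sizes below are hypothetical sample data;
# it reuses the `re` module imported for the function above.
def _example_next_partition_index():
    sample_output = ("BYT;\n"
                     "/dev/sdb:107GB:scsi:512:512:gpt:QEMU HARDDISK:;\n"
                     "1:1049kB:1074MB:1073MB:xfs::;\n"
                     "2:1074MB:5369MB:4295MB:::;\n")
    # Split on the device path, then collect the digit run that starts each
    # remaining line: [1, 2] -> the next free partition index is 3.
    _, partitions = sample_output.split('/dev/sdb')
    numbers = re.findall('^\d+', partitions, re.MULTILINE)
    return max(map(int, numbers)) + 1  # -> 3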
def create_osds_remote(remote_mons_ips_ls):
    config_api = ConfigAPI()
    remote_status = StatusReport()
    for remot_mon in remote_mons_ips_ls:
        ssh_obj = ssh()
        status = StatusReport()
        out, err = ssh_obj.exec_command(remot_mon, " python {} ".format(config_api.get_node_create_osd_script_path()))
        logger.info(" ".join([remot_mon, out]))
        if "/report/" in out:  # To avoid -- IndexError: list index out of range
            status.load_json(str(out.split("/report/")[1]))
        else:
            if err:
                status.load_json("Status Report Error , error : {}".format(str(err)))
            else:
                status.load_json("Connection Error.")
        remote_status.failed_tasks.extend(status.failed_tasks)
        if not status.success:
            logger.error("Cannot create osd for remote node {}".format(remot_mon))
            remote_status.success = False
            return remote_status
    return remote_status
def auto(self, type=1):
    logger.info("User started auto reassignment of paths.")
    assignments_stats = self.get_assignments_stats()
    if assignments_stats.is_reassign_busy:
        logger.error("There is already a reassignment running.")
        raise Exception("There is already a reassignment running.")
    ConsulAPI().drop_all_node_sessions(self.__app_conf.get_consul_assignment_path(), configuration().get_node_name())
    sleep(3)
    assignments_stats.paths = [path for path in assignments_stats.paths
                               if len(path.node.strip()) > 0 and path.status == -1]
    self.__context.paths = assignments_stats.paths
    self.__context.nodes = assignments_stats.nodes
    for plugin in self._get_new_plugins_instances(auto_plugins):
        if plugin.is_enable() and plugin.get_plugin_id() == type:
            paths_assignments = plugin.get_new_assignments()
            if len(paths_assignments) == 0:
                logger.info("There is no node under average.")
                return
            self.set_new_assignments(paths_assignments)
            break
    self.run()
def delete_osd(self, node_name, disk_name, osd_id):
    ssh_obj = ssh()
    cmd = "python {} -id {} -disk_name {}".format(ConfigAPI().get_admin_delete_osd_job_script(), osd_id, disk_name)
    # stdout, stderr = exec_command(cmd)
    stdout, stderr = ssh_obj.exec_command(node_name, cmd)
    logger.info("Start delete osd job {}".format(stdout))
    return stdout
def collect_local_node_state(self):
    script_path = ConfigAPI().get_collect_state_script()
    node_name = configuration().get_node_name()
    command = "python {}".format(script_path)
    if call_cmd(command):
        logger.info("execute collect script on {}".format(node_name))
        return True
    return False
def run_post_deploy_script(self):
    config_api = ConfigAPI()
    path = config_api.get_current_tunings_path() + config_api.get_post_deploy_script_file_name()
    if os.path.exists(path):
        call_cmd("chmod +x {}".format(path))
        logger.info("Run post deploy script.")
        call_cmd(path)
def add_journal(self, node_name, disk_name):
    ssh_obj = ssh()
    cmd = "python {} -disk_name {}".format(ConfigAPI().get_admin_add_journal_job_script(), disk_name)
    # stdout, stderr = exec_command(cmd)  # for local testing
    stdout, stderr = ssh_obj.exec_command(node_name, cmd)
    logger.info("Start add journal job {}".format(stdout))
    return stdout
def clean_ceph():
    cluster_conf = configuration()
    current_node_info = cluster_conf.get_node_info()
    current_node_name = current_node_info.name
    remote_mons_ips = cluster_conf.get_remote_ips(current_node_name)
    logger.info("Starting clean_ceph")
    clean_ceph_local()
    clean_ceph_remote(remote_mons_ips)
def clean_consul_remote():
    conf = configuration()
    ssh_exec = ssh()
    for ip in conf.get_remote_ips(conf.get_node_name()):
        logger.info("Trying to clean Consul on {}".format(ip))
        ssh_exec.call_command(ip, 'python ' + ConfigAPI().get_consul_stop_script_path())
        ssh_exec.call_command(ip, 'python ' + ConfigAPI().get_consul_clean_script_path())
def set_cluster_interface(bonds=[]):
    if bonds is None or len(bonds) == 0:
        return
    config = configuration()
    cluster_info = config.get_cluster_info()
    cluster_info.bonds = bonds
    config.set_cluster_network_info(cluster_info)
    logger.info("Updated cluster bonds to 1.3 successfully.")
def start_action(self):
    logger.info('ClusterLeader start action')
    SharedFS().block_till_mounted()
    logger.info('ClusterLeader starting services')
    subprocess.call('/opt/petasan/scripts/stats-setup.sh', shell=True)
    subprocess.call('/opt/petasan/scripts/stats-start.sh', shell=True)
    subprocess.call('systemctl start petasan-notification', shell=True)
    return
def manage_disk_add_disk(name):
    manage_disk = ManageDisk()
    disk_meta = DiskMeta()
    disk_meta.disk_name = "sanatech" + str(name)
    disk_meta.size = 1
    # disk_meta.password = "******"
    # disk_meta.user = "******"
    status = manage_disk.add_disk(disk_meta, None, PathType.both, 2)
    logger.info(status)
def clean(args):
    try:
        logger.info("Benchmark clean cmd.")
        pool = args.p
        CephAPI().rados_benchmark_clean(pool)
    except Exception as ex:
        logger.exception(ex.message)
        sys.exit(-1)
    sys.exit(0)
def __get_wwn(self, disk_id):
    wwn = disk_id
    app_config = ConfigAPI().read_app_config()
    if app_config.wwn_fsid_tag:
        logger.info('include_wwn_fsid_tag() is true')
        fsid = ceph_disk.get_fsid(configuration().get_cluster_name())
        fsid_split = fsid[:8]
        wwn = fsid_split + disk_id
        logger.info('add disk wwn is ' + wwn)
    return wwn
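# Worked example (illustration only; the fsid and disk id values are hypothetical):
# with wwn_fsid_tag enabled, the wwn is the first 8 characters of the cluster fsid
# concatenated with the disk id.
#
#   fsid    = 'a7f64266-0894-4f1e-a635-d0aeaca0e993'
#   disk_id = '00001'
#   wwn     = fsid[:8] + disk_id   # -> 'a7f6426600001'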
def get_list_status():
    ceph_manage = ManageDisk()
    ceph_manage.get_disks_meta()
    for i in ceph_manage.get_disks_meta():
        if "mostafa" == i.user:
            logger.info("disk found")
            try:
                print i.user, i.disk_name, i.ip, i.ip2, i.subnet1, i.subnet2, i.password, i.id, i.status, i.iqn, i.size
            except Exception as x:
                pass
def server(args):
    try:
        logger.info("Reassignment paths script invoked to run process action.")
        MangePathAssignment().process()
    except Exception as ex:
        logger.error("Error processing reassignment actions.")
        logger.exception(ex.message)
        print(-1)
        sys.exit(-1)
def check_process_id(process_id):
    if request.method == 'GET':
        try:
            benchmark = Benchmark()
            is_complete = benchmark.is_test_complete(process_id)
            if is_complete:
                logger.info("Benchmark Test Completed")
            json_data = json.dumps(is_complete)
            return json_data
        except Exception as e:
            return False
def write_maintenance_status(self, maintenance_status):
    try:
        j = maintenance_status.write_json()
        cons = BaseAPI()
        cons.write_value(CONSUL_MAINTENANCE_STATUS_PATH, j)
        logger.info("Successfully saved maintenance status.")
    except Exception as e:
        logger.error("Error saving maintenance status " + e.message)
        raise ConfigWriteException("Error saving maintenance status")
    return
def _get_node_session(self, node_name):
    logger.info(self.__node_session_dict)
    if self.__session_dict:
        session = self.__node_session_dict.get(node_name)
        if session is not None:
            return session
        else:
            for sess, node in self.__session_dict.iteritems():
                if node.Node == node_name:
                    # Cache the session under the node name so the lookup above finds it next time.
                    self.__node_session_dict[node_name] = sess
                    return sess
def __test_leaders():
    sleeps = [15, 15, 10, 10, 5, 5]
    tries = 5
    leaders_in_cluster = []
    cluster_members = []
    cluster_conf = configuration()
    current_cluster_info = cluster_conf.get_cluster_info()
    current_node_info = cluster_conf.get_node_info()
    cluster_members.append(current_node_info.name)
    for i in current_cluster_info.management_nodes:
        node_info = NodeInfo()
        node_info.load_json(json.dumps(i))
        cluster_members.append(node_info.name)
    status_report = StatusReport()
    for host in cluster_members:
        while tries:
            status = None
            try:
                status = _leader_status_check_(host)
            except Exception as exc:
                logger.error("Error Connecting to consul for leader check.")
            # if not has_reached_quorum:
            if not status:
                tries -= 1
                sleep_seconds = sleeps.pop()
                logger.warning('waiting %s seconds before retrying', sleep_seconds)
                # time.sleep(sleep_seconds)
                sleep(sleep_seconds)
                status_report.success = False
            else:
                leaders_in_cluster.append(host)
                logger.info('Cluster Node {} joined the cluster and is alive'.format(host))
                status_report.success = True
                break
        if status_report.success is False:
            status_report.failed_tasks.append('core_consul_deploy_build_node_fail_join_cluster_not_alive' + "%" + str(host))
    if leaders_in_cluster == cluster_members:
        logger.info("Consul leaders are ready")
        status_report.success = True
        return status_report
    else:
        logger.error("Consul leaders are not ready")
        return status_report
def storage(args):
    job_manager = JobManager()
    params = '-d {} '.format(args.d)
    for j in job_manager.get_running_job_list():
        if j.type == JobType.STORAGELOAD:
            logger.info("Cannot start storage load job for 'sar', a job is already running.")
            print("-1")
            return
    print(job_manager.add_job(JobType.STORAGELOAD, params))
    logger.info("Start storage load job for 'sar'")
    sys.exit(0)