def serve_rpc():
    plugin = manager.NeutronManager.get_plugin()

    # If 0 < rpc_workers then start_rpc_listeners would be called in a
    # subprocess and we cannot simply catch the NotImplementedError. It is
    # simpler to check this up front by testing whether the plugin supports
    # multiple RPC workers.
    if not plugin.rpc_workers_supported():
        LOG.debug("Active plugin doesn't implement start_rpc_listeners")
        if 0 < cfg.CONF.rpc_workers:
            LOG.error(("'rpc_workers = %d' ignored because "
                       "start_rpc_listeners is not implemented."),
                      cfg.CONF.rpc_workers)
        raise NotImplementedError()

    try:
        rpc = RpcWorker(plugin)

        if cfg.CONF.rpc_workers < 1:
            rpc.start()
            return rpc
        else:
            # dispose the whole pool before os.fork, otherwise there will
            # be shared DB connections in child processes which may cause
            # DB errors.
            session.get_engine().pool.dispose()
            launcher = common_service.ProcessLauncher(wait_interval=1.0)
            launcher.launch_service(rpc, workers=cfg.CONF.rpc_workers)
            return launcher
    except Exception:
        with excutils.save_and_reraise_exception():
            LOG.exception(('Unrecoverable error: please check log for '
                           'details.'))
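# Illustrative sketch only: serve_rpc() above disposes the SQLAlchemy pool
# before os.fork so child processes do not share DB connections. This is a
# minimal, hedged version of that pattern with plain SQLAlchemy and a POSIX
# fork; the engine URL and worker count are assumptions, not Neutron's values.
import os

from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///example.db")  # hypothetical database URL


def launch_workers(worker_count):
    # Dispose pooled connections in the parent so forked children do not
    # inherit shared DB sockets; each child lazily opens its own connection.
    engine.pool.dispose()
    for _ in range(worker_count):
        pid = os.fork()
        if pid == 0:  # child process
            with engine.connect() as conn:
                conn.execute(text("SELECT 1"))
            os._exit(0)
    for _ in range(worker_count):
        os.wait()


if __name__ == "__main__":
    launch_workers(2)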
def handle_pending_instances(self):
    """
    Recover VMs whose recovery was still outstanding when the
    recovery-controller started up.
    """
    try:
        db_engine = dbapi.get_engine()
        session = dbapi.get_session(db_engine)
        self._update_old_records_vm_list(session)
        result = self._find_reprocessing_records_vm_list(session)

        # [recover_starter] section
        recover_starter_dic = self.rc_config.get_value("recover_starter")
        semaphore_multiplicity = recover_starter_dic.get(
            "semaphore_multiplicity")

        # Set multiplicity by semaphore_multiplicity
        sem = threading.Semaphore(int(semaphore_multiplicity))

        # Execute vm_recovery_worker
        if len(result) > 0:
            # Execute the required number
            for row in result:
                vm_uuid = row.uuid
                primary_id = row.id
                self.rc_util.syslogout_ex("RecoveryControllerStarter_0032",
                                          syslog.LOG_INFO)
                msg = "Run thread rc_worker.recovery_instance." \
                    + " vm_uuid=" + vm_uuid \
                    + " primary_id=" + str(primary_id)
                self.rc_util.syslogout(msg, syslog.LOG_INFO)
                threading.Thread(target=self.rc_worker.recovery_instance,
                                 args=(vm_uuid, primary_id, sem)).start()
        # Imperfect_recover
        else:
            return

        return

    except KeyError:
        self.rc_util.syslogout_ex("RecoveryControllerStarter_0020",
                                  syslog.LOG_ERR)
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        self.rc_util.syslogout(error_type, syslog.LOG_ERR)
        self.rc_util.syslogout(error_value, syslog.LOG_ERR)
        for tb in tb_list:
            self.rc_util.syslogout(tb, syslog.LOG_ERR)
        return
    except:
        self.rc_util.syslogout_ex("RecoveryControllerStarter_0021",
                                  syslog.LOG_ERR)
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        self.rc_util.syslogout(error_type, syslog.LOG_ERR)
        self.rc_util.syslogout(error_value, syslog.LOG_ERR)
        for tb in tb_list:
            self.rc_util.syslogout(tb, syslog.LOG_ERR)
        return
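# Illustrative sketch only: handle_pending_instances() above caps concurrent
# recoveries with a threading.Semaphore sized from semaphore_multiplicity and
# hands it to every worker thread. A standalone version of that pattern
# follows; the row data and the worker body are placeholders, not the real
# recovery logic.
import threading
import time


def recovery_instance(vm_uuid, primary_id, sem):
    # "with sem" mirrors the worker's sem.acquire() on entry and
    # sem.release() in its finally block.
    with sem:
        time.sleep(0.1)  # stand-in for the actual recovery work
        print("recovered vm_uuid=%s primary_id=%s" % (vm_uuid, primary_id))


def handle_pending(rows, multiplicity):
    sem = threading.Semaphore(int(multiplicity))
    threads = [threading.Thread(target=recovery_instance,
                                args=(row["uuid"], row["id"], sem))
               for row in rows]
    for th in threads:
        th.start()
    for th in threads:
        th.join()


if __name__ == "__main__":
    handle_pending([{"uuid": "vm-1", "id": 1}, {"uuid": "vm-2", "id": 2}], 1)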
def host_maintenance_mode(self, notification_id, hostname,
                          update_progress):
    """
    Disable or enable the nova-compute service on the target host.

    :param notification_id: Notification ID included in the notification
    :param hostname: Host name of the target host
    """
    try:
        self.rc_config.set_request_context()
        db_engine = dbapi.get_engine(self.rc_config)
        session = dbapi.get_session(db_engine)
        self.rc_util_api.disable_host_status(hostname)

        if update_progress is True:
            self.rc_util_db.update_notification_list_db(
                session, 'progress', 2, notification_id)

    except KeyError:
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        LOG.error(error_type)
        LOG.error(error_value)
        for tb in tb_list:
            LOG.error(tb)
        return
    except:
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        LOG.error(error_type)
        LOG.error(error_value)
        for tb in tb_list:
            LOG.error(tb)
        return
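# Illustrative sketch only: every handler in these snippets logs failures via
# sys.exc_info() plus traceback.format_tb(). This standard-library-only
# example exercises that idiom in isolation; the logger name and the failing
# function are made up for demonstration.
import logging
import sys
import traceback

logging.basicConfig(level=logging.ERROR)
LOG = logging.getLogger("recovery_controller_sketch")


def risky():
    raise KeyError("missing config key")


try:
    risky()
except KeyError:
    error_type, error_value, traceback_ = sys.exc_info()
    LOG.error(error_type)
    LOG.error(error_value)
    for tb in traceback.format_tb(traceback_):
        LOG.error(tb.rstrip())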
def handle_pending_instances(self): """ method description. recovery-controller I do the recovery of outstanding recovery VM at startup. """ try: self.rc_config.set_request_context() db_engine = dbapi.get_engine(self.rc_config) session = dbapi.get_session(db_engine) self._update_old_records_vm_list(session) result = self._find_reprocessing_records_vm_list(session) # [recover_starter]section recover_starter_dic = self.rc_config.get_value("recover_starter") semaphore_multiplicity = recover_starter_dic.get( "semaphore_multiplicity") # Set multiplicity by semaphore_multiplicity sem = threading.Semaphore(int(semaphore_multiplicity)) # Execute vm_recovery_worker if len(result) > 0: # Execute the required number for row in result: vm_uuid = row.uuid primary_id = row.id msg = "Run thread rc_worker.recovery_instance." \ + " vm_uuid=" + vm_uuid \ + " primary_id=" + str(primary_id) LOG.info(msg) thread_name = self.rc_util.make_thread_name( VM_LIST, primary_id) threading.Thread( target=self.rc_worker.recovery_instance, name=thread_name, args=(vm_uuid, primary_id, sem)).start() # Imperfect_recover else: return return except KeyError: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return except: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return
def host_maintenance_mode(self, notification_id, hostname,
                          update_progress):
    """
    Disable or enable the nova-compute service on the target host.

    :param notification_id: Notification ID included in the notification
    :param hostname: Host name of the target host
    """
    try:
        db_engine = dbapi.get_engine()
        session = dbapi.get_session(db_engine)
        self.rc_util_api.disable_host_status(hostname)

        if update_progress is True:
            self.rc_util_db.update_notification_list_db(
                session, 'progress', 2, notification_id)

    except KeyError:
        self.rc_util.syslogout_ex("RecoveryControllerWorker_0031",
                                  syslog.LOG_ERR)
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        self.rc_util.syslogout(error_type, syslog.LOG_ERR)
        self.rc_util.syslogout(error_value, syslog.LOG_ERR)
        for tb in tb_list:
            self.rc_util.syslogout(tb, syslog.LOG_ERR)
        return
    except:
        self.rc_util.syslogout_ex("RecoveryControllerWorker_0032",
                                  syslog.LOG_ERR)
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        self.rc_util.syslogout(error_type, syslog.LOG_ERR)
        self.rc_util.syslogout(error_value, syslog.LOG_ERR)
        for tb in tb_list:
            self.rc_util.syslogout(tb, syslog.LOG_ERR)
        return
def add_failed_instance(self, notification_id, notification_uuid, retry_mode): """ VM recover start thread : This thread starts the VM recover execution thread. :param notification_id: The notification ID included in the notification :param notification_uuid: The recovery target VM UUID of which are included in the notification :param retry_mode: Set True in the re-processing time of call, Set the False in the normal processing time of call """ try: db_engine = dbapi.get_engine() session = dbapi.get_session(db_engine) # Get primary id of vm_list primary_id = self._create_vm_list_db_for_failed_instance( session, notification_id, notification_uuid) # update record in notification_list self.rc_util_db.update_notification_list_db( session, 'progress', 2, notification_id) # create semaphore (Multiplicity = 1) sem_recovery_instance = threading.Semaphore(1) # create and start thread if primary_id: if retry_mode == True: # Skip recovery_instance. # Will delegate to handle_pending_instances self.rc_util.syslogout_ex("RecoveryControllerStarter_0027", syslog.LOG_INFO) msg = "RETRY MODE. Skip recovery_instance thread" \ + " vm_uuide=" + notification_uuid \ + " notification_id=" + notification_id self.rc_util.syslogout(msg, syslog.LOG_INFO) else: self.rc_util.syslogout_ex("RecoveryControllerStarter_0029", syslog.LOG_INFO) msg = "Run thread rc_worker.recovery_instance." \ + " notification_uuid=" + notification_uuid \ + " primary_id=" + str(primary_id) self.rc_util.syslogout(msg, syslog.LOG_INFO) threading.Thread(target=self.rc_worker.recovery_instance, args=(notification_uuid, primary_id, sem_recovery_instance)).start() return except KeyError: self.rc_util.syslogout_ex("RecoveryControllerStarter_0012", syslog.LOG_ERR) error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) return except: self.rc_util.syslogout_ex("RecoveryControllerStarter_0013", syslog.LOG_ERR) error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) return
try:
    conf_db_dic = config.get_value('db')
    host = conf_db_dic.get("host")
    db = conf_db_dic.get("name")
    user = conf_db_dic.get("user")
    passwd = conf_db_dic.get("passwd")
    charset = conf_db_dic.get("charset")
except Exception as e:
    # error handling
    print "failed to load configuration parameters."
    print "Exception: ", e
    sys.exit(1)

print "host:", host, "db:", db, "user:", user, \
    "passwd:", passwd, "charset:", charset

try:
    # Create an engine to store data in the database
    engine = dbapi.get_engine()
    # Create database if not exists
    if not database_exists(engine.url):
        create_database(engine.url)
    # Create all tables in the engine
    Base.metadata.create_all(engine)
except Exception as e:
    # error handling
    print "failed to create tables."
    print "Exception: ", e
    sys.exit(2)

print "Successfully created tables"
sys.exit(0)
def _create_notification_list_db(self, jsonData): ret_dic = {} # Get DB from here and pass it to _check_retry_notification try: # Get session for db db_engine = dbapi.get_engine() session = dbapi.get_session(db_engine) if self._check_retry_notification(jsonData, session): self.rc_util.syslogout_ex( "RecoveryController_0030", syslog.LOG_INFO) msg = "Duplicate notifications. id:" + jsonData.get("id") self.rc_util.syslogout(msg, syslog.LOG_INFO) self.rc_util.syslogout(jsonData, syslog.LOG_INFO) # Node Recovery(processing A) elif jsonData.get("type") == "rscGroup" and \ str(jsonData.get("eventID")) == "1" and \ str(jsonData.get("eventType")) == "2" and \ str(jsonData.get("detail")) == "2": tdatetime = datetime.datetime.strptime( jsonData.get("time"), '%Y%m%d%H%M%S') if not self._check_repeated_notify(tdatetime, jsonData.get("hostname"), session): recover_by = 0 # node recovery ret_dic = self.rc_util_db.insert_notification_list_db( jsonData, recover_by, session) self.rc_util.syslogout_ex( "RecoveryController_0014", syslog.LOG_INFO) self.rc_util.syslogout(jsonData, syslog.LOG_INFO) else: # Duplicate notifications. self.rc_util.syslogout_ex( "RecoveryController_0015", syslog.LOG_INFO) msg = "Duplicate notifications. id:" + jsonData.get("id") self.rc_util.syslogout(msg, syslog.LOG_INFO) self.rc_util.syslogout(jsonData, syslog.LOG_INFO) # Node is up elif jsonData.get("type") == "rscGroup" and \ str(jsonData.get("eventID")) == "1" and \ str(jsonData.get("eventType")) == "1" and \ str(jsonData.get("detail")) == "1": self.rc_worker.mark_host_up_pf9(jsonData.get('hostname')) # VM Recovery(processing G) elif jsonData.get("type") == 'VM' and \ str(jsonData.get("eventID")) == '0' and \ str(jsonData.get("eventType")) == '5' and \ str(jsonData.get("detail")) == '5': recover_by = 1 # VM recovery ret_dic = self.rc_util_db.insert_notification_list_db( jsonData, recover_by, session) self.rc_util.syslogout_ex( "RecoveryController_0019", syslog.LOG_INFO) self.rc_util.syslogout(jsonData, syslog.LOG_INFO) # Node Lock(processing D and F) # Node will be locked. elif (jsonData.get("type") == 'nodeStatus') or \ ((jsonData.get("type") == 'rscGroup' and str(jsonData.get("eventID")) == '1' and str(jsonData.get("eventType")) == '2') and (str(jsonData.get("detail")) == '3' or str(jsonData.get("detail")) == '4')): tdatetime = datetime.datetime.strptime( jsonData.get("time"), '%Y%m%d%H%M%S') if not self._check_repeated_notify(tdatetime, jsonData.get("hostname"), session): recover_by = 2 # NODE lock ret_dic = self.rc_util_db.insert_notification_list_db( jsonData, recover_by, session) self.rc_util.syslogout_ex( "RecoveryController_0021", syslog.LOG_INFO) self.rc_util.syslogout(jsonData, syslog.LOG_INFO) else: # Duplicate notifications. self.rc_util.syslogout_ex( "RecoveryController_0036", syslog.LOG_INFO) msg = "Duplicate notifications. id:" + jsonData.get("id") self.rc_util.syslogout(msg, syslog.LOG_INFO) self.rc_util.syslogout(jsonData, syslog.LOG_INFO) # Do not recover(Excuted Stop API) elif jsonData.get("type") == "VM" and \ str(jsonData.get("eventID")) == "0" and \ str(jsonData.get("eventType")) == "5" and \ str(jsonData.get("detail")) == "1": self.rc_util.syslogout_ex( "RecoveryController_0022", syslog.LOG_INFO) self.rc_util.syslogout(jsonData, syslog.LOG_INFO) msg = "Do not recover instance.(Excuted Stop API)" self.rc_util.syslogout(msg, syslog.LOG_INFO) # Notification of starting node. 
elif jsonData.get("type") == "rscGroup" and \ str(jsonData.get("eventID")) == "1" and \ str(jsonData.get("eventType")) == "1" and \ str(jsonData.get("detail")) == "1": self.rc_util.syslogout_ex( "RecoveryController_0023", syslog.LOG_INFO) self.rc_util.syslogout(jsonData, syslog.LOG_INFO) msg = "Recieved notification of node starting. Node:" + \ jsonData['hostname'] self.rc_util.syslogout(msg, syslog.LOG_INFO) # Ignore notification else: self.rc_util.syslogout_ex( "RecoveryController_0024", syslog.LOG_INFO) self.rc_util.syslogout(jsonData, syslog.LOG_INFO) msg = "Ignore notification. Notification:" + str(jsonData) self.rc_util.syslogout(msg, syslog.LOG_INFO) except Exception: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout_ex( "RecoveryController_0046", syslog.LOG_ERR) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) raise return ret_dic
def _create_notification_list_db(self, jsonData): ret_dic = {} # Get DB from here and pass it to _check_retry_notification try: # Get session for db db_engine = dbapi.get_engine(self.rc_config) session = dbapi.get_session(db_engine) if self._check_retry_notification(jsonData, session): msg = "Duplicate notifications. id:" + jsonData.get("id") LOG.info(msg) LOG.info(jsonData) # Node Recovery(processing A) elif ( jsonData.get("type") == "rscGroup" and str(jsonData.get("eventID")) == "1" and str(jsonData.get("eventType")) == "2" and str(jsonData.get("detail")) == "2" ): tdatetime = datetime.datetime.strptime(jsonData.get("time"), "%Y%m%d%H%M%S") if not self._check_repeated_notify(tdatetime, jsonData.get("hostname"), session): recover_by = 0 # node recovery ret_dic = self.rc_util_db.insert_notification_list_db(jsonData, recover_by, session) LOG.info(jsonData) else: # Duplicate notifications. msg = "Duplicate notifications. id:" + jsonData.get("id") LOG.info(msg) LOG.info(jsonData) # VM Recovery(processing G) elif ( jsonData.get("type") == "VM" and str(jsonData.get("eventID")) == "0" and str(jsonData.get("eventType")) == "5" and str(jsonData.get("detail")) == "5" ): recover_by = 1 # VM recovery ret_dic = self.rc_util_db.insert_notification_list_db(jsonData, recover_by, session) LOG.info(jsonData) # Node Lock(processing D and F) # Node will be locked. elif (jsonData.get("type") == "nodeStatus") or ( ( jsonData.get("type") == "rscGroup" and str(jsonData.get("eventID")) == "1" and str(jsonData.get("eventType")) == "2" ) and (str(jsonData.get("detail")) == "3" or str(jsonData.get("detail")) == "4") ): tdatetime = datetime.datetime.strptime(jsonData.get("time"), "%Y%m%d%H%M%S") if not self._check_repeated_notify(tdatetime, jsonData.get("hostname"), session): recover_by = 2 # NODE lock ret_dic = self.rc_util_db.insert_notification_list_db(jsonData, recover_by, session) LOG.info(jsonData) else: # Duplicate notifications. msg = "Duplicate notifications. id:" + jsonData.get("id") LOG.info(msg) LOG.info(jsonData) # Do not recover(Excuted Stop API) elif ( jsonData.get("type") == "VM" and str(jsonData.get("eventID")) == "0" and str(jsonData.get("eventType")) == "5" and str(jsonData.get("detail")) == "1" ): LOG.info(jsonData) msg = "Do not recover instance.(Excuted Stop API)" LOG.info(msg) # Notification of starting node. elif ( jsonData.get("type") == "rscGroup" and str(jsonData.get("eventID")) == "1" and str(jsonData.get("eventType")) == "1" and str(jsonData.get("detail")) == "1" ): LOG.info(jsonData) msg = "Recieved notification of node starting. Node:" + jsonData["hostname"] LOG.info(msg) # Ignore notification else: LOG.info(jsonData) msg = "Ignore notification. Notification:" + str(jsonData) LOG.info(msg) except Exception: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) raise return ret_dic
def add_failed_instance(self, notification_id, notification_uuid, retry_mode): """ VM recover start thread : This thread starts the VM recover execution thread. :param notification_id: The notification ID included in the notification :param notification_uuid: The recovery target VM UUID of which are included in the notification :param retry_mode: Set True in the re-processing time of call, Set the False in the normal processing time of call """ try: self.rc_config.set_request_context() db_engine = dbapi.get_engine(self.rc_config) session = dbapi.get_session(db_engine) # Get primary id of vm_list primary_id = self._create_vm_list_db_for_failed_instance( session, notification_id, notification_uuid) # update record in notification_list self.rc_util_db.update_notification_list_db( session, 'progress', 2, notification_id) # create semaphore (Multiplicity = 1) sem_recovery_instance = threading.Semaphore(1) # create and start thread if primary_id: if retry_mode is True: # Skip recovery_instance. # Will delegate to handle_pending_instances msg = "RETRY MODE. Skip recovery_instance thread" \ + " vm_uuide=" + notification_uuid \ + " notification_id=" + notification_id LOG.info(msg) else: msg = "Run thread rc_worker.recovery_instance." \ + " notification_uuid=" + notification_uuid \ + " primary_id=" + str(primary_id) LOG.info(msg) thread_name = self.rc_util.make_thread_name( VM_LIST, primary_id) threading.Thread(target=self.rc_worker.recovery_instance, name=thread_name, args=(notification_uuid, primary_id, sem_recovery_instance)).start() return except KeyError: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return except: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return
def masakari(self): """ RecoveryController class main processing: This processing checks the VM list table of DB. If an unprocessed VM exists, and start thread to execute the recovery process. Then, the processing starts the wsgi server and waits for the notification. """ try: LOG.info("masakari START.") # Get a session and do not pass it to other threads db_engine = dbapi.get_engine(self.rc_config) session = dbapi.get_session(db_engine) self._update_old_records_notification_list(session) result = self._find_reprocessing_records_notification_list(session) preprocessing_count = len(result) if preprocessing_count > 0: for row in result: if row.recover_by == 0: # node recovery event msg = "Run thread rc_worker.host_maintenance_mode." \ + " notification_id=" + row.notification_id \ + " notification_hostname=" \ + row.notification_hostname \ + " update_progress=False" LOG.info(msg) thread_name = self.rc_util.make_thread_name( NOTIFICATION_LIST, row.notification_id) th = threading.Thread( target=self.rc_worker.host_maintenance_mode, name=thread_name, args=( row.notification_id, row.notification_hostname, False, )) th.start() # Sleep until updating nova-compute service status # down. dic = self.rc_config.get_value('recover_starter') node_err_wait = dic.get("node_err_wait") msg = ("Sleeping %s sec before starting node recovery" "thread, until updateing nova-compute" "service status." % (node_err_wait)) LOG.info(msg) greenthread.sleep(int(node_err_wait)) # Start add_failed_host thread # TODO(sampath): # Avoid create thread here, # insted call rc_starter.add_failed_host retry_mode = True msg = "Run thread rc_starter.add_failed_host." \ + " notification_id=" + row.notification_id \ + " notification_hostname=" \ + row.notification_hostname \ + " notification_cluster_port=" \ + row.notification_cluster_port \ + " retry_mode=" + str(retry_mode) LOG.info(msg) thread_name = self.rc_util.make_thread_name( NOTIFICATION_LIST, row.notification_id) th = threading.Thread( target=self.rc_starter.add_failed_host, name=thread_name, args=( row.notification_id, row.notification_hostname, row.notification_cluster_port, retry_mode, )) th.start() elif row.recover_by == 1: # instance recovery event # TODO(sampath): # Avoid create thread here, # insted call rc_starter.add_failed_instance msg = "Run thread rc_starter.add_failed_instance." \ + " notification_id=" + row.notification_id \ + " notification_uuid=" \ + row.notification_uuid LOG.info(msg) thread_name = self.rc_util.make_thread_name( NOTIFICATION_LIST, row.notification_id) th = threading.Thread( target=self.rc_starter.add_failed_instance, name=thread_name, args=( row.notification_id, row.notification_uuid, )) th.start() else: # maintenance mode event msg = "Run thread rc_starter.host_maintenance_mode." \ + " notification_id=" + row.notification_id \ + " notification_hostname=" \ + row.notification_hostname \ + "update_progress=True" LOG.info(msg) thread_name = self.rc_util.make_thread_name( NOTIFICATION_LIST, row.notification_id) th = threading.Thread( target=self.rc_worker.host_maintenance_mode, name=thread_name, args=( row.notification_id, row.notification_hostname, True, )) th.start() # Start handle_pending_instances thread # TODO(sampath): # Avoid create thread here, # insted call rc_starter.handle_pending_instances() msg = "Run thread rc_starter.handle_pending_instances." 
LOG.info(msg) thread_name = "Thread:handle_pending_instances" th = threading.Thread( target=self.rc_starter.handle_pending_instances, name=thread_name) th.start() # Start reciever process for notification conf_wsgi_dic = self.rc_config.get_value('wsgi') wsgi.server( eventlet.listen(('', int(conf_wsgi_dic['server_port']))), self._notification_reciever) except exc.SQLAlchemyError: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.critical(error_type) LOG.critical(error_value) for tb in tb_list: LOG.critical(tb) sys.exit() except KeyError: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.critical(error_type) LOG.critical(error_value) for tb in tb_list: LOG.critical(tb) sys.exit() except: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.critical(error_type) LOG.critical(error_value) for tb in tb_list: LOG.critical(tb) sys.exit()
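# Illustrative sketch only: masakari() above finishes by starting an eventlet
# WSGI server that feeds notifications to self._notification_reciever. This
# is a minimal runnable version of that receiver loop; the port and the
# response body are assumptions, not values from the original configuration.
import eventlet
from eventlet import wsgi


def notification_receiver(env, start_response):
    # Acknowledge every request; the real handler parses the notification
    # JSON and inserts it into notification_list.
    start_response("200 OK", [("Content-Type", "text/plain")])
    return [b"accepted"]


if __name__ == "__main__":
    wsgi.server(eventlet.listen(("", 8080)), notification_receiver)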
def add_failed_host(self, notification_id, notification_hostname, notification_cluster_port, retry_mode): """ Node recover start thread : This thread starts the VM recover execution thread, only the number of existing vm in the recovery target node. :param notification_id: The notification ID included in the notification :param notification_hostname: The host name of the failure node that is included in the notification """ try: db_engine = dbapi.get_engine() session = dbapi.get_session(db_engine) conf_dict = self.rc_config.get_value('recover_starter') recovery_max_retry_cnt = conf_dict.get('recovery_max_retry_cnt') recovery_retry_interval = conf_dict.get('recovery_retry_interval') vm_list = self.rc_util_api.fetch_servers_on_hypervisor( notification_hostname) # Count vm_list if len(vm_list) == 0: self.rc_util.syslogout_ex("RecoveryControllerStarter_0014", syslog.LOG_INFO) msg = "There is no instance in " + notification_hostname + "." self.rc_util.syslogout(msg, syslog.LOG_INFO) # update record in notification_list self.rc_util_db.update_notification_list_db( session, 'progress', 2, notification_id) return else: result = dbapi.get_all_notification_list_by_id_for_update( session, notification_id) recover_to = result.pop().recover_to if retry_mode is False: cnt = dbapi.get_all_reserve_list_by_hostname_not_deleted( session, recover_to) if not cnt: cnt = dbapi.\ get_one_reserve_list_by_cluster_port_for_update( session, notification_cluster_port, notification_hostname ) if not cnt: self.rc_util.syslogout_ex( "RecoveryControllerStarter_0022", syslog.LOG_WARNING) msg = "The reserve node not exist in " \ "reserve_list DB, " \ "so do not recover instances." self.rc_util.syslogout(msg, syslog.LOG_WARNING) self.rc_util_db.update_notification_list_db( 'progress', 3, notification_id) return result = cnt.pop() recover_to = result.hostname update_at = datetime.datetime.now() dbapi.update_notification_list_by_notification_id_recover_to( session, notification_id, update_at, recover_to) self.rc_util.syslogout_ex( "RecoveryControllerStarter_0024", syslog.LOG_INFO) self.rc_util.syslogout_ex("RecoveryControllerStarter_0015", syslog.LOG_INFO) delete_at = datetime.datetime.now() dbapi.update_reserve_list_by_hostname_as_deleted( session, recover_to, delete_at) # create semaphore (Multiplicity is get from config.) conf_dict = self.rc_config.get_value('recover_starter') sem_recovery_instance = threading.Semaphore( int(conf_dict.get('semaphore_multiplicity'))) incomplete_list = [] for i in range(0, int(recovery_max_retry_cnt)): incomplete_list = [] for vm_uuid in vm_list: primary_id = self._create_vm_list_db_for_failed_host( session, notification_id, vm_uuid) if primary_id: if retry_mode == True: # Skip recovery_instance thread. Will delegate to # ... msg = "RETRY MODE. Skip recovery_instance thread" \ + " vm_uuide=" + vm_uuid \ + " notification_id=" + notification_id self.rc_util.syslogout(msg, syslog.LOG_INFO) else: msg = "Run thread rc_worker.recovery_instance." \ + " vm_uuid=" + vm_uuid \ + " primary_id=" + str(primary_id) self.rc_util.syslogout(msg, syslog.LOG_INFO) threading.Thread( target=self.rc_worker.recovery_instance, args=(vm_uuid, primary_id, sem_recovery_instance)).start() else: if retry_mode == True: continue else: incomplete_list.append(vm_uuid) if incomplete_list: vm_list = incomplete_list greenthread.sleep(int(recovery_retry_interval)) else: break for vm_uuid in incomplete_list: primary_id = self.rc_util_db.insert_vm_list_db( session, notification_id, vm_uuid, 0) # Skip recovery_instance thread. 
Will delegate to ... self.rc_util.syslogout_ex("RecoveryControllerStarter_0031", syslog.LOG_INFO) msg = "Run thread rc_worker.recovery_instance." \ + " vm_uuid=" + vm_uuid \ + " primary_id=" + str(primary_id) self.rc_util.syslogout(msg, syslog.LOG_INFO) threading.Thread(target=self.rc_worker.recovery_instance, args=(vm_uuid, primary_id, sem_recovery_instance)).start() # update record in notification_list self.rc_util_db.update_notification_list_db( session, 'progress', 2, notification_id) return except KeyError: self.rc_util.syslogout_ex("RecoveryControllerStarter_0017", syslog.LOG_ERR) error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) return except: self.rc_util.syslogout_ex("RecoveryControllerStarter_0018", syslog.LOG_ERR) error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) return
def recovery_instance(self, uuid, primary_id, sem): """ Execute VM recovery. :param uuid: Recovery target VM UUID :param primary_id: Unique ID of the vm_list table :param sem: Semaphore """ try: sem.acquire() self.rc_config.set_request_context() db_engine = dbapi.get_engine(self.rc_config) session = dbapi.get_session(db_engine) # Initlize status. status = self.STATUS_NORMAL # Update vmha recovery status. self.rc_util_db.update_vm_list_db( session, 'progress', 1, primary_id) # Get vm infomation. vm_info = self._get_vm_param(uuid) HA_Enabled = vm_info.metadata.get('HA-Enabled') if HA_Enabled: HA_Enabled = HA_Enabled.upper() if HA_Enabled != 'OFF': HA_Enabled = 'ON' # Set recovery parameter. exe_param = {} exe_param['vm_state'] = getattr(vm_info, 'OS-EXT-STS:vm_state') exe_param['HA-Enabled'] = HA_Enabled recover_by, recover_to = self._get_vmha_param( session, uuid, primary_id) exe_param['recover_by'] = recover_by exe_param['recover_to'] = recover_to # Execute. status = self._execute_recovery(session, uuid, exe_param.get("vm_state"), exe_param.get("HA-Enabled"), exe_param.get("recover_by"), exe_param.get("recover_to")) except EnvironmentError: status = self.STATUS_ERROR error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return except KeyError: status = self.STATUS_ERROR error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return except: status = self.STATUS_ERROR error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return finally: try: # Successful execution. if status == self.STATUS_NORMAL: self.rc_util_db.update_vm_list_db( session, 'progress', 2, primary_id) msg = "Recovery process has been completed successfully." LOG.info(msg) # Abnormal termination. else: self.rc_util_db.update_vm_list_db( session, 'progress', 3, primary_id) msg = "Recovery process has been terminated abnormally." LOG.info(msg) # Release semaphore if sem: sem.release() except: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return
def masakari(self): """ RecoveryController class main processing: This processing checks the VM list table of DB. If an unprocessed VM exists, and start thread to execute the recovery process. Then, the processing starts the wsgi server and waits for the notification. """ try: self.rc_util.syslogout_ex("RecoveryController_0004", syslog.LOG_INFO) self.rc_util.syslogout("masakari START.", syslog.LOG_INFO) # Get a session and do not pass it to other threads db_engine = dbapi.get_engine() session = dbapi.get_session(db_engine) self._update_old_records_notification_list(session) result = self._find_reprocessing_records_notification_list(session) preprocessing_count = len(result) if preprocessing_count > 0: for row in result: if row.recover_by == 0: # node recovery event th = threading.Thread( target=self.rc_worker.host_maintenance_mode, args=( row.notification_id, row.notification_hostname, False, )) th.start() # Sleep until updating nova-compute service status # down. self.rc_util.syslogout_ex("RecoveryController_0035", syslog.LOG_INFO) dic = self.rc_config.get_value('recover_starter') node_err_wait = dic.get("node_err_wait") msg = ("Sleeping %s sec before starting node recovery" "thread, until updateing nova-compute" "service status." % (node_err_wait)) self.rc_util.syslogout(msg, syslog.LOG_INFO) greenthread.sleep(int(node_err_wait)) # Start add_failed_host thread # TODO(sampath): # Avoid create thread here, # insted call rc_starter.add_failed_host retry_mode = True th = threading.Thread( target=self.rc_starter.add_failed_host, args=( row.notification_id, row.notification_hostname, row.notification_cluster_port, retry_mode, )) th.start() elif row.recover_by == 1: # instance recovery event # TODO(sampath): # Avoid create thread here, # insted call rc_starter.add_failed_instance th = threading.Thread( target=self.rc_starter.add_failed_instance, args=( row.notification_id, row.notification_uuid, )) th.start() else: # maintenance mode event th = threading.Thread( target=self.rc_worker.host_maintenance_mode, args=( row.notification_id, row.notification_hostname, True, )) th.start() # Start handle_pending_instances thread # TODO(sampath): # Avoid create thread here, # insted call rc_starter.handle_pending_instances() th = threading.Thread( target=self.rc_starter.handle_pending_instances) th.start() # Start reciever process for notification conf_wsgi_dic = self.rc_config.get_value('wsgi') wsgi.server( eventlet.listen(('', int(conf_wsgi_dic['server_port']))), self._notification_reciever) except exc.SQLAlchemyError: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout_ex("RecoveryController_0005", syslog.LOG_ERR) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) sys.exit() except KeyError: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout_ex("RecoveryController_0006", syslog.LOG_ERR) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) sys.exit() except: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout_ex("RecoveryController_0007", syslog.LOG_ERR) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: 
self.rc_util.syslogout(tb, syslog.LOG_ERR) sys.exit()
def add_failed_host(self, notification_id, notification_hostname, notification_cluster_port, retry_mode): """ Node recover start thread : This thread starts the VM recover execution thread, only the number of existing vm in the recovery target node. :param notification_id: The notification ID included in the notification :param notification_hostname: The host name of the failure node that is included in the notification """ try: self.rc_config.set_request_context() db_engine = dbapi.get_engine(self.rc_config) session = dbapi.get_session(db_engine) conf_dict = self.rc_config.get_value('recover_starter') recovery_max_retry_cnt = conf_dict.get('recovery_max_retry_cnt') recovery_retry_interval = conf_dict.get('recovery_retry_interval') vm_list = self.rc_util_api.fetch_servers_on_hypervisor( notification_hostname) # Count vm_list if len(vm_list) == 0: msg = "There is no instance in " + notification_hostname + "." LOG.info(msg) # update record in notification_list self.rc_util_db.update_notification_list_db( session, 'progress', 2, notification_id) return else: msg = "Do get_all_notification_list_by_id_for_update." LOG.info(msg) result = dbapi.get_all_notification_list_by_id_for_update( session, notification_id) msg = "Succeeded in " \ + "get_all_notification_list_by_id_for_update. " \ + "Return_value = " + str(result) LOG.info(msg) recover_to = result.pop().recover_to if retry_mode is False: msg = "Do get_all_reserve_list_by_hostname_not_deleted." LOG.info(msg) cnt = dbapi.get_all_reserve_list_by_hostname_not_deleted( session, recover_to) msg = "Succeeded in " \ + "get_all_reserve_list_by_hostname_not_deleted. " \ + "Return_value = " + str(cnt) LOG.info(msg) if not cnt: msg = "Do " \ + "get_one_reserve_list_by_cluster_port_for_update." LOG.info(msg) cnt = dbapi.\ get_one_reserve_list_by_cluster_port_for_update( session, notification_cluster_port, notification_hostname ) msg = "Succeeded in " \ + "get_one_reserve_list_by_cluster_port_for_update. " \ + "Return_value = " + str(cnt) LOG.info(msg) if not cnt: msg = "The reserve node not exist in " \ "reserve_list DB, " \ "so do not recover instances." LOG.warning(msg) self.rc_util_db.update_notification_list_db( 'progress', 3, notification_id) return result = cnt.pop() recover_to = result.hostname update_at = datetime.datetime.now() msg = "Do " \ + "update_notification_list_by_notification_id_recover_to." LOG.info(msg) dbapi.update_notification_list_by_notification_id_recover_to( session, notification_id, update_at, recover_to ) msg = "Succeeded in " \ + "update_notification_list_by_notification_id_recover_to." LOG.info(msg) delete_at = datetime.datetime.now() msg = "Do update_reserve_list_by_hostname_as_deleted." LOG.info(msg) dbapi.update_reserve_list_by_hostname_as_deleted( session, recover_to, delete_at) msg = "Succeeded in " \ + "update_reserve_list_by_hostname_as_deleted." LOG.info(msg) # create semaphore (Multiplicity is get from config.) conf_dict = self.rc_config.get_value('recover_starter') sem_recovery_instance = threading.Semaphore( int(conf_dict.get('semaphore_multiplicity'))) incomplete_list = [] for i in range(0, int(recovery_max_retry_cnt)): incomplete_list = [] for vm_uuid in vm_list: primary_id = self._create_vm_list_db_for_failed_host( session, notification_id, vm_uuid) if primary_id: if retry_mode is True: # Skip recovery_instance thread. Will delegate to # ... msg = "RETRY MODE. 
Skip recovery_instance thread" \ + " vm_uuide=" + vm_uuid \ + " notification_id=" + notification_id LOG.info(msg) else: msg = "Run thread rc_worker.recovery_instance." \ + " vm_uuid=" + vm_uuid \ + " primary_id=" + str(primary_id) LOG.info(msg) thread_name = self.rc_util.make_thread_name( VM_LIST, primary_id) threading.Thread( target=self.rc_worker.recovery_instance, name=thread_name, args=(vm_uuid, primary_id, sem_recovery_instance)).start() else: if retry_mode is True: continue else: incomplete_list.append(vm_uuid) if incomplete_list: vm_list = incomplete_list greenthread.sleep(int(recovery_retry_interval)) else: break for vm_uuid in incomplete_list: primary_id = self.rc_util_db.insert_vm_list_db( session, notification_id, vm_uuid, 0) # Skip recovery_instance thread. Will delegate to ... msg = "Run thread rc_worker.recovery_instance." \ + " vm_uuid=" + vm_uuid \ + " primary_id=" + str(primary_id) LOG.info(msg) thread_name = self.rc_util.make_thread_name( VM_LIST, primary_id) threading.Thread( target=self.rc_worker.recovery_instance, name=thread_name, args=(vm_uuid, primary_id, sem_recovery_instance)).start() # update record in notification_list self.rc_util_db.update_notification_list_db( session, 'progress', 2, notification_id) return except KeyError: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return except: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return
def handle_pending_instances(self): """ method description. recovery-controller I do the recovery of outstanding recovery VM at startup. """ try: db_engine = dbapi.get_engine() session = dbapi.get_session(db_engine) self._update_old_records_vm_list(session) result = self._find_reprocessing_records_vm_list(session) # [recover_starter]section recover_starter_dic = self.rc_config.get_value("recover_starter") semaphore_multiplicity = recover_starter_dic.get( "semaphore_multiplicity") # Set multiplicity by semaphore_multiplicity sem = threading.Semaphore(int(semaphore_multiplicity)) # Execute vm_recovery_worker if len(result) > 0: # Execute the required number for row in result: vm_uuid = row.uuid primary_id = row.id self.rc_util.syslogout_ex("RecoveryControllerStarter_0032", syslog.LOG_INFO) msg = "Run thread rc_worker.recovery_instance." \ + " vm_uuid=" + vm_uuid \ + " primary_id=" + str(primary_id) self.rc_util.syslogout(msg, syslog.LOG_INFO) threading.Thread(target=self.rc_worker.recovery_instance, args=(vm_uuid, primary_id, sem)).start() # Imperfect_recover else: return return except KeyError: self.rc_util.syslogout_ex("RecoveryControllerStarter_0020", syslog.LOG_ERR) error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) return except: self.rc_util.syslogout_ex("RecoveryControllerStarter_0021", syslog.LOG_ERR) error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) return
def add_failed_instance(self, notification_id, notification_uuid, retry_mode): """ VM recover start thread : This thread starts the VM recover execution thread. :param notification_id: The notification ID included in the notification :param notification_uuid: The recovery target VM UUID of which are included in the notification :param retry_mode: Set True in the re-processing time of call, Set the False in the normal processing time of call """ try: db_engine = dbapi.get_engine() session = dbapi.get_session(db_engine) # Get primary id of vm_list primary_id = self._create_vm_list_db_for_failed_instance(session, notification_id, notification_uuid) # update record in notification_list self.rc_util_db.update_notification_list_db(session, "progress", 2, notification_id) # create semaphore (Multiplicity = 1) sem_recovery_instance = threading.Semaphore(1) # create and start thread if primary_id: if retry_mode == True: # Skip recovery_instance. # Will delegate to handle_pending_instances self.rc_util.syslogout_ex("RecoveryControllerStarter_0027", syslog.LOG_INFO) msg = ( "RETRY MODE. Skip recovery_instance thread" + " vm_uuide=" + notification_uuid + " notification_id=" + notification_id ) self.rc_util.syslogout(msg, syslog.LOG_INFO) else: self.rc_util.syslogout_ex("RecoveryControllerStarter_0029", syslog.LOG_INFO) msg = ( "Run thread rc_worker.recovery_instance." + " notification_uuid=" + notification_uuid + " primary_id=" + str(primary_id) ) self.rc_util.syslogout(msg, syslog.LOG_INFO) threading.Thread( target=self.rc_worker.recovery_instance, args=(notification_uuid, primary_id, sem_recovery_instance), ).start() return except KeyError: self.rc_util.syslogout_ex("RecoveryControllerStarter_0012", syslog.LOG_ERR) error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) return except: self.rc_util.syslogout_ex("RecoveryControllerStarter_0013", syslog.LOG_ERR) error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) return
def _create_notification_list_db(self, jsonData): ret_dic = {} # Get DB from here and pass it to _check_retry_notification try: # Get session for db db_engine = dbapi.get_engine(self.rc_config) session = dbapi.get_session(db_engine) if self._check_retry_notification(jsonData, session): msg = "Duplicate notifications. id:" + jsonData.get("id") LOG.info(msg) LOG.info(jsonData) # Node Recovery(processing A) elif jsonData.get("type") == "rscGroup" and \ str(jsonData.get("eventID")) == "1" and \ str(jsonData.get("eventType")) == "2" and \ str(jsonData.get("detail")) == "2": tdatetime = datetime.datetime.strptime(jsonData.get("time"), '%Y%m%d%H%M%S') if not self._check_repeated_notify( tdatetime, jsonData.get("hostname"), session): recover_by = 0 # node recovery ret_dic = self.rc_util_db.insert_notification_list_db( jsonData, recover_by, session) LOG.info(jsonData) else: # Duplicate notifications. msg = "Duplicate notifications. id:" + jsonData.get("id") LOG.info(msg) LOG.info(jsonData) # VM Recovery(processing G) elif jsonData.get("type") == 'VM' and \ str(jsonData.get("eventID")) == '0' and \ str(jsonData.get("eventType")) == '5' and \ str(jsonData.get("detail")) == '5': recover_by = 1 # VM recovery ret_dic = self.rc_util_db.insert_notification_list_db( jsonData, recover_by, session) LOG.info(jsonData) # Node Lock(processing D and F) # Node will be locked. elif (jsonData.get("type") == 'nodeStatus') or \ ((jsonData.get("type") == 'rscGroup' and str(jsonData.get("eventID")) == '1' and str(jsonData.get("eventType")) == '2') and (str(jsonData.get("detail")) == '3' or str(jsonData.get("detail")) == '4')): tdatetime = datetime.datetime.strptime(jsonData.get("time"), '%Y%m%d%H%M%S') if not self._check_repeated_notify( tdatetime, jsonData.get("hostname"), session): recover_by = 2 # NODE lock ret_dic = self.rc_util_db.insert_notification_list_db( jsonData, recover_by, session) LOG.info(jsonData) else: # Duplicate notifications. msg = "Duplicate notifications. id:" + jsonData.get("id") LOG.info(msg) LOG.info(jsonData) # Do not recover(Excuted Stop API) elif jsonData.get("type") == "VM" and \ str(jsonData.get("eventID")) == "0" and \ str(jsonData.get("eventType")) == "5" and \ str(jsonData.get("detail")) == "1": LOG.info(jsonData) msg = "Do not recover instance.(Excuted Stop API)" LOG.info(msg) # Notification of starting node. elif jsonData.get("type") == "rscGroup" and \ str(jsonData.get("eventID")) == "1" and \ str(jsonData.get("eventType")) == "1" and \ str(jsonData.get("detail")) == "1": LOG.info(jsonData) msg = "Recieved notification of node starting. Node:" + \ jsonData['hostname'] LOG.info(msg) # Ignore notification else: LOG.info(jsonData) msg = "Ignore notification. Notification:" + str(jsonData) LOG.info(msg) except Exception: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) raise return ret_dic
def add_failed_host(self, notification_id, notification_hostname, notification_cluster_port, retry_mode): """ Node recover start thread : This thread starts the VM recover execution thread, only the number of existing vm in the recovery target node. :param notification_id: The notification ID included in the notification :param notification_hostname: The host name of the failure node that is included in the notification """ try: db_engine = dbapi.get_engine() session = dbapi.get_session(db_engine) conf_dict = self.rc_config.get_value("recover_starter") recovery_max_retry_cnt = conf_dict.get("recovery_max_retry_cnt") recovery_retry_interval = conf_dict.get("recovery_retry_interval") vm_list = self.rc_util_api.fetch_servers_on_hypervisor(notification_hostname) # Count vm_list if len(vm_list) == 0: self.rc_util.syslogout_ex("RecoveryControllerStarter_0014", syslog.LOG_INFO) msg = "There is no instance in " + notification_hostname + "." self.rc_util.syslogout(msg, syslog.LOG_INFO) # update record in notification_list self.rc_util_db.update_notification_list_db(session, "progress", 2, notification_id) return else: result = dbapi.get_all_notification_list_by_id_for_update(session, notification_id) recover_to = result.pop().recover_to if retry_mode is False: cnt = dbapi.get_all_reserve_list_by_hostname_not_deleted(session, recover_to) if not cnt: cnt = dbapi.get_one_reserve_list_by_cluster_port_for_update( session, notification_cluster_port, notification_hostname ) if not cnt: self.rc_util.syslogout_ex("RecoveryControllerStarter_0022", syslog.LOG_WARNING) msg = "The reserve node not exist in " "reserve_list DB, " "so do not recover instances." self.rc_util.syslogout(msg, syslog.LOG_WARNING) self.rc_util_db.update_notification_list_db("progress", 3, notification_id) return result = cnt.pop() recover_to = result.hostname update_at = datetime.datetime.now() dbapi.update_notification_list_by_notification_id_recover_to( session, notification_id, update_at, recover_to ) self.rc_util.syslogout_ex("RecoveryControllerStarter_0024", syslog.LOG_INFO) self.rc_util.syslogout_ex("RecoveryControllerStarter_0015", syslog.LOG_INFO) delete_at = datetime.datetime.now() dbapi.update_reserve_list_by_hostname_as_deleted(session, recover_to, delete_at) # create semaphore (Multiplicity is get from config.) conf_dict = self.rc_config.get_value("recover_starter") sem_recovery_instance = threading.Semaphore(int(conf_dict.get("semaphore_multiplicity"))) incomplete_list = [] for i in range(0, int(recovery_max_retry_cnt)): incomplete_list = [] for vm_uuid in vm_list: primary_id = self._create_vm_list_db_for_failed_host(session, notification_id, vm_uuid) if primary_id: if retry_mode == True: # Skip recovery_instance thread. Will delegate to # ... msg = ( "RETRY MODE. Skip recovery_instance thread" + " vm_uuide=" + vm_uuid + " notification_id=" + notification_id ) self.rc_util.syslogout(msg, syslog.LOG_INFO) else: msg = ( "Run thread rc_worker.recovery_instance." + " vm_uuid=" + vm_uuid + " primary_id=" + str(primary_id) ) self.rc_util.syslogout(msg, syslog.LOG_INFO) threading.Thread( target=self.rc_worker.recovery_instance, args=(vm_uuid, primary_id, sem_recovery_instance), ).start() else: if retry_mode == True: continue else: incomplete_list.append(vm_uuid) if incomplete_list: vm_list = incomplete_list greenthread.sleep(int(recovery_retry_interval)) else: break for vm_uuid in incomplete_list: primary_id = self.rc_util_db.insert_vm_list_db(session, notification_id, vm_uuid, 0) # Skip recovery_instance thread. 
Will delegate to ... self.rc_util.syslogout_ex("RecoveryControllerStarter_0031", syslog.LOG_INFO) msg = ( "Run thread rc_worker.recovery_instance." + " vm_uuid=" + vm_uuid + " primary_id=" + str(primary_id) ) self.rc_util.syslogout(msg, syslog.LOG_INFO) threading.Thread( target=self.rc_worker.recovery_instance, args=(vm_uuid, primary_id, sem_recovery_instance) ).start() # update record in notification_list self.rc_util_db.update_notification_list_db(session, "progress", 2, notification_id) return except KeyError: self.rc_util.syslogout_ex("RecoveryControllerStarter_0017", syslog.LOG_ERR) error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) return except: self.rc_util.syslogout_ex("RecoveryControllerStarter_0018", syslog.LOG_ERR) error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) self.rc_util.syslogout(error_type, syslog.LOG_ERR) self.rc_util.syslogout(error_value, syslog.LOG_ERR) for tb in tb_list: self.rc_util.syslogout(tb, syslog.LOG_ERR) return
def add_failed_host(self, notification_id, notification_hostname, notification_cluster_port, retry_mode): """ Node recover start thread : This thread starts the VM recover execution thread, only the number of existing vm in the recovery target node. :param notification_id: The notification ID included in the notification :param notification_hostname: The host name of the failure node that is included in the notification """ try: self.rc_config.set_request_context() db_engine = dbapi.get_engine(self.rc_config) session = dbapi.get_session(db_engine) conf_dict = self.rc_config.get_value('recover_starter') recovery_max_retry_cnt = conf_dict.get('recovery_max_retry_cnt') recovery_retry_interval = conf_dict.get('recovery_retry_interval') vm_list = self.rc_util_api.fetch_servers_on_hypervisor( notification_hostname) # Count vm_list if len(vm_list) == 0: msg = "There is no instance in " + notification_hostname + "." LOG.info(msg) # update record in notification_list self.rc_util_db.update_notification_list_db( session, 'progress', 2, notification_id) return else: msg = "Do get_all_notification_list_by_id_for_update." LOG.info(msg) result = dbapi.get_all_notification_list_by_id_for_update( session, notification_id) msg = "Succeeded in " \ + "get_all_notification_list_by_id_for_update. " \ + "Return_value = " + str(result) LOG.info(msg) recover_to = result.pop().recover_to if retry_mode is False: msg = "Do get_all_reserve_list_by_hostname_not_deleted." LOG.info(msg) cnt = dbapi.get_all_reserve_list_by_hostname_not_deleted( session, recover_to) msg = "Succeeded in " \ + "get_all_reserve_list_by_hostname_not_deleted. " \ + "Return_value = " + str(cnt) LOG.info(msg) if not cnt: msg = "Do " \ + "get_one_reserve_list_by_cluster_port_for_update." LOG.info(msg) cnt = dbapi.\ get_one_reserve_list_by_cluster_port_for_update( session, notification_cluster_port, notification_hostname ) msg = "Succeeded in " \ + "get_one_reserve_list_by_cluster_port_for_update. " \ + "Return_value = " + str(cnt) LOG.info(msg) if not cnt: msg = "The reserve node not exist in " \ "reserve_list DB, " \ "so do not recover instances." LOG.warning(msg) self.rc_util_db.update_notification_list_db( 'progress', 3, notification_id) return result = cnt.pop() recover_to = result.hostname update_at = datetime.datetime.now() msg = "Do " \ + "update_notification_list_by_notification_id_recover_to." LOG.info(msg) dbapi.update_notification_list_by_notification_id_recover_to( session, notification_id, update_at, recover_to) msg = "Succeeded in " \ + "update_notification_list_by_notification_id_recover_to." LOG.info(msg) delete_at = datetime.datetime.now() msg = "Do update_reserve_list_by_hostname_as_deleted." LOG.info(msg) dbapi.update_reserve_list_by_hostname_as_deleted( session, recover_to, delete_at) msg = "Succeeded in " \ + "update_reserve_list_by_hostname_as_deleted." LOG.info(msg) # create semaphore (Multiplicity is get from config.) conf_dict = self.rc_config.get_value('recover_starter') sem_recovery_instance = threading.Semaphore( int(conf_dict.get('semaphore_multiplicity'))) incomplete_list = [] for i in range(0, int(recovery_max_retry_cnt)): incomplete_list = [] for vm_uuid in vm_list: primary_id = self._create_vm_list_db_for_failed_host( session, notification_id, vm_uuid) if primary_id: if retry_mode is True: # Skip recovery_instance thread. Will delegate to # ... msg = "RETRY MODE. 
Skip recovery_instance thread" \ + " vm_uuide=" + vm_uuid \ + " notification_id=" + notification_id LOG.info(msg) else: msg = "Run thread rc_worker.recovery_instance." \ + " vm_uuid=" + vm_uuid \ + " primary_id=" + str(primary_id) LOG.info(msg) thread_name = self.rc_util.make_thread_name( VM_LIST, primary_id) threading.Thread( target=self.rc_worker.recovery_instance, name=thread_name, args=(vm_uuid, primary_id, sem_recovery_instance)).start() else: if retry_mode is True: continue else: incomplete_list.append(vm_uuid) if incomplete_list: vm_list = incomplete_list greenthread.sleep(int(recovery_retry_interval)) else: break for vm_uuid in incomplete_list: primary_id = self.rc_util_db.insert_vm_list_db( session, notification_id, vm_uuid, 0) # Skip recovery_instance thread. Will delegate to ... msg = "Run thread rc_worker.recovery_instance." \ + " vm_uuid=" + vm_uuid \ + " primary_id=" + str(primary_id) LOG.info(msg) thread_name = self.rc_util.make_thread_name( VM_LIST, primary_id) threading.Thread(target=self.rc_worker.recovery_instance, name=thread_name, args=(vm_uuid, primary_id, sem_recovery_instance)).start() # update record in notification_list self.rc_util_db.update_notification_list_db( session, 'progress', 2, notification_id) return except KeyError: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return except: error_type, error_value, traceback_ = sys.exc_info() tb_list = traceback.format_tb(traceback_) LOG.error(error_type) LOG.error(error_value) for tb in tb_list: LOG.error(tb) return
def masakari(self):
    """
    RecoveryController class main processing:
    This processing checks the VM list table of DB.
    If an unprocessed VM exists, it starts a thread to execute the
    recovery process.
    Then, the processing starts the wsgi server and waits for the
    notification.
    """
    try:
        self.rc_util.syslogout_ex(
            "RecoveryController_0004", syslog.LOG_INFO)
        self.rc_util.syslogout(
            "masakari START.", syslog.LOG_INFO)

        # Get a session and do not pass it to other threads
        db_engine = dbapi.get_engine()
        session = dbapi.get_session(db_engine)
        self._update_old_records_notification_list(session)
        result = self._find_reprocessing_records_notification_list(session)
        preprocessing_count = len(result)

        if preprocessing_count > 0:
            for row in result:
                if row.recover_by == 0:
                    # node recovery event
                    th = threading.Thread(
                        target=self.rc_worker.host_maintenance_mode,
                        args=(row.notification_id,
                              row.notification_hostname,
                              False,))
                    th.start()

                    # PF9 begin
                    """
                    # Sleep until updating nova-compute service status
                    # down.
                    self.rc_util.syslogout_ex(
                        "RecoveryController_0035", syslog.LOG_INFO)
                    dic = self.rc_config.get_value('recover_starter')
                    node_err_wait = dic.get("node_err_wait")
                    msg = ("Sleeping %s sec before starting node recovery "
                           "thread, until updating nova-compute "
                           "service status." % (node_err_wait))
                    self.rc_util.syslogout(msg, syslog.LOG_INFO)
                    greenthread.sleep(int(node_err_wait))
                    """
                    # Mark nova-compute service as 'down' to immediately
                    # start evacuation
                    self.rc_worker.mark_host_down_pf9(
                        row.notification_hostname)
                    # PF9 end

                    # Start add_failed_host thread
                    # TODO(sampath):
                    # Avoid creating a thread here,
                    # instead call rc_starter.add_failed_host
                    retry_mode = True
                    th = threading.Thread(
                        target=self.rc_starter.add_failed_host,
                        args=(row.notification_id,
                              row.notification_hostname,
                              row.notification_cluster_port,
                              retry_mode, ))
                    th.start()

                elif row.recover_by == 1:
                    # instance recovery event
                    # TODO(sampath):
                    # Avoid creating a thread here,
                    # instead call rc_starter.add_failed_instance
                    th = threading.Thread(
                        target=self.rc_starter.add_failed_instance,
                        args=(row.notification_id,
                              row.notification_uuid, ))
                    th.start()

                else:
                    # maintenance mode event
                    th = threading.Thread(
                        target=self.rc_worker.host_maintenance_mode,
                        args=(row.notification_id,
                              row.notification_hostname,
                              True, ))
                    th.start()

        # Start handle_pending_instances thread
        # TODO(sampath):
        # Avoid creating a thread here,
        # instead call rc_starter.handle_pending_instances()
        th = threading.Thread(
            target=self.rc_starter.handle_pending_instances)
        th.start()

        # PF9 move this to __call__ for wsgi-fy app
        # Start receiver process for notification
        """
        conf_wsgi_dic = self.rc_config.get_value('wsgi')
        wsgi.server(
            eventlet.listen(('', int(conf_wsgi_dic['server_port']))),
            self._notification_reciever)
        """
        # PF9 end

    except exc.SQLAlchemyError:
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        self.rc_util.syslogout_ex(
            "RecoveryController_0005", syslog.LOG_ERR)
        self.rc_util.syslogout(error_type, syslog.LOG_ERR)
        self.rc_util.syslogout(error_value, syslog.LOG_ERR)
        for tb in tb_list:
            self.rc_util.syslogout(tb, syslog.LOG_ERR)
        sys.exit()
    except KeyError:
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        self.rc_util.syslogout_ex(
            "RecoveryController_0006", syslog.LOG_ERR)
        self.rc_util.syslogout(error_type, syslog.LOG_ERR)
        self.rc_util.syslogout(error_value, syslog.LOG_ERR)
        for tb in tb_list:
            self.rc_util.syslogout(tb, syslog.LOG_ERR)
        sys.exit()
    except:
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        self.rc_util.syslogout_ex(
            "RecoveryController_0007", syslog.LOG_ERR)
        self.rc_util.syslogout(error_type, syslog.LOG_ERR)
        self.rc_util.syslogout(error_value, syslog.LOG_ERR)
        for tb in tb_list:
            self.rc_util.syslogout(tb, syslog.LOG_ERR)
        sys.exit()
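# The masakari() loop above fans out on recover_by: 0 starts node recovery,
# 1 starts single-instance recovery, and any other value switches the host
# into maintenance mode. A compact, self-contained sketch of that dispatch;
# the Row type and handler functions below are illustrative stand-ins, not
# part of the original module.
import collections

Row = collections.namedtuple(
    "Row", ["recover_by", "notification_id", "notification_hostname",
            "notification_uuid"])

def handle_node_failure(notification_id, hostname):
    print("node recovery: %s on %s" % (notification_id, hostname))

def handle_instance_failure(notification_id, uuid):
    print("instance recovery: %s for %s" % (notification_id, uuid))

def handle_maintenance(notification_id, hostname):
    print("maintenance mode: %s on %s" % (notification_id, hostname))

def dispatch(row):
    if row.recover_by == 0:
        handle_node_failure(row.notification_id, row.notification_hostname)
    elif row.recover_by == 1:
        handle_instance_failure(row.notification_id, row.notification_uuid)
    else:
        handle_maintenance(row.notification_id, row.notification_hostname)

dispatch(Row(0, "n-0001", "compute01", None))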
def recovery_instance(self, uuid, primary_id, sem):
    """
    Execute VM recovery.
    :param uuid: Recovery target VM UUID
    :param primary_id: Unique ID of the vm_list table
    :param sem: Semaphore
    """
    try:
        sem.acquire()

        db_engine = dbapi.get_engine()
        session = dbapi.get_session(db_engine)

        # Initialize status.
        status = self.STATUS_NORMAL

        # Update vmha recovery status.
        self.rc_util_db.update_vm_list_db(session, 'progress',
                                          1, primary_id)

        # Get VM information.
        vm_info = self._get_vm_param(uuid)
        HA_Enabled = vm_info.metadata.get('HA-Enabled')
        if HA_Enabled:
            HA_Enabled = HA_Enabled.upper()
        if HA_Enabled != 'OFF':
            HA_Enabled = 'ON'

        # Set recovery parameter.
        exe_param = {}
        exe_param['vm_state'] = getattr(vm_info, 'OS-EXT-STS:vm_state')
        exe_param['HA-Enabled'] = HA_Enabled
        recover_by, recover_to = self._get_vmha_param(
            session, uuid, primary_id)
        exe_param['recover_by'] = recover_by
        exe_param['recover_to'] = recover_to

        # Execute.
        status = self._execute_recovery(session,
                                        uuid,
                                        exe_param.get("vm_state"),
                                        exe_param.get("HA-Enabled"),
                                        exe_param.get("recover_by"),
                                        exe_param.get("recover_to"))

    except EnvironmentError:
        self.rc_util.syslogout_ex("RecoveryControllerWorker_0034",
                                  syslog.LOG_ERR)
        status = self.STATUS_ERROR
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        self.rc_util.syslogout(error_type, syslog.LOG_ERR)
        self.rc_util.syslogout(error_value, syslog.LOG_ERR)
        for tb in tb_list:
            self.rc_util.syslogout(tb, syslog.LOG_ERR)
        return
    except KeyError:
        self.rc_util.syslogout_ex("RecoveryControllerWorker_0035",
                                  syslog.LOG_ERR)
        status = self.STATUS_ERROR
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        self.rc_util.syslogout(error_type, syslog.LOG_ERR)
        self.rc_util.syslogout(error_value, syslog.LOG_ERR)
        for tb in tb_list:
            self.rc_util.syslogout(tb, syslog.LOG_ERR)
        return
    except:
        self.rc_util.syslogout_ex("RecoveryControllerWorker_0037",
                                  syslog.LOG_ERR)
        status = self.STATUS_ERROR
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        self.rc_util.syslogout(error_type, syslog.LOG_ERR)
        self.rc_util.syslogout(error_value, syslog.LOG_ERR)
        for tb in tb_list:
            self.rc_util.syslogout(tb, syslog.LOG_ERR)
        return
    finally:
        try:
            # Successful execution.
            if status == self.STATUS_NORMAL:
                self.rc_util_db.update_vm_list_db(session, 'progress',
                                                  2, primary_id)
            # Abnormal termination.
            else:
                self.rc_util_db.update_vm_list_db(session, 'progress',
                                                  3, primary_id)
            # Release semaphore
            if sem:
                sem.release()
        except:
            self.rc_util.syslogout_ex("RecoveryControllerWorker_0039",
                                      syslog.LOG_ERR)
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in tb_list:
                self.rc_util.syslogout(tb, syslog.LOG_ERR)
            return
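# recovery_instance() above follows a strict shape: mark the row as
# "in progress", attempt the recovery, then always record a final progress
# value and release the semaphore in the finally block, even on failure.
# A condensed sketch of that shape; do_recover and update_progress are
# hypothetical stand-ins for _execute_recovery and update_vm_list_db.
import threading

STATUS_NORMAL = 0
STATUS_ERROR = 1

def guarded_recovery(uuid, sem, do_recover, update_progress):
    status = STATUS_NORMAL
    sem.acquire()
    try:
        update_progress(uuid, 1)      # 1 = recovery in progress
        do_recover(uuid)
    except Exception:
        status = STATUS_ERROR
    finally:
        # 2 = recovered, 3 = failed; the semaphore slot is freed either way.
        final = 2 if status == STATUS_NORMAL else 3
        update_progress(uuid, final)
        sem.release()

# Example run with no-op callbacks:
guarded_recovery("uuid-1", threading.Semaphore(1),
                 do_recover=lambda u: None,
                 update_progress=lambda u, p: None)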
def masakari(self):
    """
    RecoveryController class main processing:
    This processing checks the VM list table of DB.
    If an unprocessed VM exists, it starts a thread to execute the
    recovery process.
    Then, the processing starts the wsgi server and waits for the
    notification.
    """
    try:
        LOG.info("masakari START.")

        # Get a session and do not pass it to other threads
        db_engine = dbapi.get_engine(self.rc_config)
        session = dbapi.get_session(db_engine)
        self._update_old_records_notification_list(session)
        result = self._find_reprocessing_records_notification_list(session)
        preprocessing_count = len(result)

        if preprocessing_count > 0:
            for row in result:
                if row.recover_by == 0:
                    # node recovery event
                    msg = ("Run thread rc_worker.host_maintenance_mode."
                           + " notification_id=" + row.notification_id
                           + " notification_hostname="
                           + row.notification_hostname
                           + " update_progress=False")
                    LOG.info(msg)
                    thread_name = self.rc_util.make_thread_name(
                        NOTIFICATION_LIST, row.notification_id)
                    th = threading.Thread(
                        target=self.rc_worker.host_maintenance_mode,
                        name=thread_name,
                        args=(row.notification_id,
                              row.notification_hostname,
                              False))
                    th.start()

                    # Sleep until updating nova-compute service status
                    # down.
                    dic = self.rc_config.get_value("recover_starter")
                    node_err_wait = dic.get("node_err_wait")
                    msg = ("Sleeping %s sec before starting node recovery "
                           "thread, until updating nova-compute "
                           "service status." % (node_err_wait))
                    LOG.info(msg)
                    greenthread.sleep(int(node_err_wait))

                    # Start add_failed_host thread
                    # TODO(sampath):
                    # Avoid creating a thread here,
                    # instead call rc_starter.add_failed_host
                    retry_mode = True
                    msg = ("Run thread rc_starter.add_failed_host."
                           + " notification_id=" + row.notification_id
                           + " notification_hostname="
                           + row.notification_hostname
                           + " notification_cluster_port="
                           + row.notification_cluster_port
                           + " retry_mode=" + str(retry_mode))
                    LOG.info(msg)
                    thread_name = self.rc_util.make_thread_name(
                        NOTIFICATION_LIST, row.notification_id)
                    th = threading.Thread(
                        target=self.rc_starter.add_failed_host,
                        name=thread_name,
                        args=(row.notification_id,
                              row.notification_hostname,
                              row.notification_cluster_port,
                              retry_mode,))
                    th.start()

                elif row.recover_by == 1:
                    # instance recovery event
                    # TODO(sampath):
                    # Avoid creating a thread here,
                    # instead call rc_starter.add_failed_instance
                    msg = ("Run thread rc_starter.add_failed_instance."
                           + " notification_id=" + row.notification_id
                           + " notification_uuid=" + row.notification_uuid)
                    LOG.info(msg)
                    thread_name = self.rc_util.make_thread_name(
                        NOTIFICATION_LIST, row.notification_id)
                    th = threading.Thread(
                        target=self.rc_starter.add_failed_instance,
                        name=thread_name,
                        args=(row.notification_id,
                              row.notification_uuid))
                    th.start()

                else:
                    # maintenance mode event
                    msg = ("Run thread rc_worker.host_maintenance_mode."
                           + " notification_id=" + row.notification_id
                           + " notification_hostname="
                           + row.notification_hostname
                           + " update_progress=True")
                    LOG.info(msg)
                    thread_name = self.rc_util.make_thread_name(
                        NOTIFICATION_LIST, row.notification_id)
                    th = threading.Thread(
                        target=self.rc_worker.host_maintenance_mode,
                        name=thread_name,
                        args=(row.notification_id,
                              row.notification_hostname,
                              True))
                    th.start()

        # Start handle_pending_instances thread
        # TODO(sampath):
        # Avoid creating a thread here,
        # instead call rc_starter.handle_pending_instances()
        msg = "Run thread rc_starter.handle_pending_instances."
        LOG.info(msg)
        thread_name = "Thread:handle_pending_instances"
        th = threading.Thread(
            target=self.rc_starter.handle_pending_instances,
            name=thread_name)
        th.start()

        # Start receiver process for notification
        conf_wsgi_dic = self.rc_config.get_value("wsgi")
        wsgi.server(
            eventlet.listen(("", int(conf_wsgi_dic["server_port"]))),
            self._notification_reciever)

    except exc.SQLAlchemyError:
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        LOG.critical(error_type)
        LOG.critical(error_value)
        for tb in tb_list:
            LOG.critical(tb)
        sys.exit()
    except KeyError:
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        LOG.critical(error_type)
        LOG.critical(error_value)
        for tb in tb_list:
            LOG.critical(tb)
        sys.exit()
    except:
        error_type, error_value, traceback_ = sys.exc_info()
        tb_list = traceback.format_tb(traceback_)
        LOG.critical(error_type)
        LOG.critical(error_value)
        for tb in tb_list:
            LOG.critical(tb)
        sys.exit()
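# The refactored masakari() above ends by blocking in eventlet's WSGI server,
# handing every request to self._notification_reciever. A standalone sketch of
# that setup with a trivial WSGI app; the notification_app name and the port
# are illustrative examples, not the module's real configuration.
import eventlet
from eventlet import wsgi

def notification_app(environ, start_response):
    # A real receiver would parse the JSON notification body and hand it to
    # the recovery starter; here the request is only acknowledged.
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return [b'accepted']

if __name__ == '__main__':
    # Listen on all interfaces; the port value is an arbitrary example.
    wsgi.server(eventlet.listen(('', 8080)), notification_app)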