Beispiel #1
0
def serve_rpc():
    """Start RPC handling for the active Neutron plugin.

    :returns: the started ``RpcWorker`` when ``rpc_workers`` is below 1,
        otherwise the ``ProcessLauncher`` that launched the workers
    :raises NotImplementedError: when the plugin reports that RPC workers
        are not supported
    """
    active_plugin = manager.NeutronManager.get_plugin()

    # Support is tested up front: with rpc_workers > 0 the call to
    # start_rpc_listeners would happen in a subprocess, where its
    # NotImplementedError cannot simply be caught.
    if not active_plugin.rpc_workers_supported():
        LOG.debug("Active plugin doesn't implement start_rpc_listeners")
        if cfg.CONF.rpc_workers > 0:
            LOG.error("'rpc_workers = %d' ignored because "
                      "start_rpc_listeners is not implemented.",
                      cfg.CONF.rpc_workers)
        raise NotImplementedError()

    try:
        worker = RpcWorker(active_plugin)
        worker_count = cfg.CONF.rpc_workers

        if worker_count < 1:
            # No separate worker processes requested: start in-process.
            worker.start()
            return worker

        # Dispose of the whole pool before os.fork; otherwise child
        # processes would share DB connections, which may cause DB errors.
        session.get_engine().pool.dispose()
        process_launcher = common_service.ProcessLauncher(wait_interval=1.0)
        process_launcher.launch_service(worker, workers=worker_count)
        return process_launcher
    except Exception:
        with excutils.save_and_reraise_exception():
            LOG.exception('Unrecoverable error: '
                          'please check log for details.')
Beispiel #2
0
    def handle_pending_instances(self):
        """Restart recovery for VMs whose recovery was still pending.

        Refreshes old vm_list records, fetches the records that must be
        reprocessed and starts one ``rc_worker.recovery_instance`` thread
        per record.  Every thread receives the same semaphore, sized from
        ``semaphore_multiplicity`` in the ``recover_starter`` configuration
        section.

        Failures are reported through ``rc_util.syslogout`` at ``LOG_ERR``
        and are not propagated to the caller.
        """
        def _report_failure(message_id):
            # Emit the message id, then the type, value and formatted
            # traceback frames of the exception currently being handled.
            self.rc_util.syslogout_ex(message_id, syslog.LOG_ERR)
            error_type, error_value, traceback_ = sys.exc_info()
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in traceback.format_tb(traceback_):
                self.rc_util.syslogout(tb, syslog.LOG_ERR)

        try:
            db_engine = dbapi.get_engine()
            session = dbapi.get_session(db_engine)

            self._update_old_records_vm_list(session)
            result = self._find_reprocessing_records_vm_list(session)

            # [recover_starter] section
            recover_starter_dic = self.rc_config.get_value("recover_starter")
            semaphore_multiplicity = recover_starter_dic.get(
                "semaphore_multiplicity")

            # One semaphore, handed to every worker thread started below.
            sem = threading.Semaphore(int(semaphore_multiplicity))

            # An empty result simply starts no thread.
            for row in result:
                vm_uuid = row.uuid
                primary_id = row.id
                self.rc_util.syslogout_ex(
                    "RecoveryControllerStarter_0032", syslog.LOG_INFO)
                # %-formatting keeps a non-string uuid from raising and
                # aborting the remaining records.
                msg = ("Run thread rc_worker.recovery_instance."
                       " vm_uuid=%s primary_id=%s" % (vm_uuid, primary_id))
                self.rc_util.syslogout(msg, syslog.LOG_INFO)
                threading.Thread(
                    target=self.rc_worker.recovery_instance,
                    args=(vm_uuid, primary_id, sem)).start()
        except KeyError:
            _report_failure("RecoveryControllerStarter_0020")
        except Exception:
            # Exception rather than a bare except, so SystemExit and
            # KeyboardInterrupt are not swallowed.
            _report_failure("RecoveryControllerStarter_0021")
Beispiel #3
0
    def host_maintenance_mode(self, notification_id, hostname,
                              update_progress):
        """Disable the host's status and optionally record the progress.

        :param notification_id: Notification ID included in the notification
        :param hostname: Host name passed to ``disable_host_status``
        :param update_progress: When exactly ``True``, ``progress`` is set
            to 2 for ``notification_id`` in the notification list

        Failures are logged with ``LOG.error`` and are not propagated to
        the caller.
        """
        try:
            self.rc_config.set_request_context()
            db_engine = dbapi.get_engine(self.rc_config)
            session = dbapi.get_session(db_engine)
            self.rc_util_api.disable_host_status(hostname)

            if update_progress is True:
                self.rc_util_db.update_notification_list_db(
                    session, 'progress', 2, notification_id)

        except Exception:
            # A single handler: KeyError needs no special treatment here.
            # Exception rather than a bare except, so SystemExit and
            # KeyboardInterrupt are not swallowed.
            error_type, error_value, traceback_ = sys.exc_info()
            LOG.error(error_type)
            LOG.error(error_value)
            for tb in traceback.format_tb(traceback_):
                LOG.error(tb)
Beispiel #4
0
    def handle_pending_instances(self):
        """Restart recovery for VMs whose recovery was still pending.

        Sets the request context, refreshes old vm_list records, fetches
        the records that must be reprocessed and starts one named
        ``rc_worker.recovery_instance`` thread per record.  Every thread
        receives the same semaphore, sized from ``semaphore_multiplicity``
        in the ``recover_starter`` configuration section.

        Failures are logged with ``LOG.error`` and are not propagated to
        the caller.
        """
        try:
            self.rc_config.set_request_context()
            db_engine = dbapi.get_engine(self.rc_config)
            session = dbapi.get_session(db_engine)

            self._update_old_records_vm_list(session)
            result = self._find_reprocessing_records_vm_list(session)

            # [recover_starter] section
            recover_starter_dic = self.rc_config.get_value("recover_starter")
            semaphore_multiplicity = recover_starter_dic.get(
                "semaphore_multiplicity")

            # One semaphore, handed to every worker thread started below.
            sem = threading.Semaphore(int(semaphore_multiplicity))

            # An empty result simply starts no thread.
            for row in result:
                vm_uuid = row.uuid
                primary_id = row.id
                # Logger %-args keep a non-string uuid from raising and
                # aborting the remaining records.
                LOG.info("Run thread rc_worker.recovery_instance."
                         " vm_uuid=%s primary_id=%s", vm_uuid, primary_id)
                thread_name = self.rc_util.make_thread_name(
                    VM_LIST, primary_id)
                threading.Thread(
                    target=self.rc_worker.recovery_instance,
                    name=thread_name,
                    args=(vm_uuid, primary_id, sem)).start()
        except Exception:
            # A single handler: KeyError needs no special treatment here.
            # Exception rather than a bare except, so SystemExit and
            # KeyboardInterrupt are not swallowed.
            error_type, error_value, traceback_ = sys.exc_info()
            LOG.error(error_type)
            LOG.error(error_value)
            for tb in traceback.format_tb(traceback_):
                LOG.error(tb)
Beispiel #5
0
    def host_maintenance_mode(self, notification_id, hostname,
                              update_progress):
        """Disable the host's status and optionally record the progress.

        :param notification_id: Notification ID included in the notification
        :param hostname: Host name passed to ``disable_host_status``
        :param update_progress: When exactly ``True``, ``progress`` is set
            to 2 for ``notification_id`` in the notification list

        Failures are reported through ``rc_util.syslogout`` at ``LOG_ERR``
        and are not propagated to the caller.
        """
        def _report_failure(message_id):
            # Emit the message id, then the type, value and formatted
            # traceback frames of the exception currently being handled.
            self.rc_util.syslogout_ex(message_id, syslog.LOG_ERR)
            error_type, error_value, traceback_ = sys.exc_info()
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in traceback.format_tb(traceback_):
                self.rc_util.syslogout(tb, syslog.LOG_ERR)

        try:
            db_engine = dbapi.get_engine()
            session = dbapi.get_session(db_engine)
            self.rc_util_api.disable_host_status(hostname)

            if update_progress is True:
                self.rc_util_db.update_notification_list_db(
                    session, 'progress', 2, notification_id)

        except KeyError:
            _report_failure("RecoveryControllerWorker_0031")
        except Exception:
            # Exception rather than a bare except, so SystemExit and
            # KeyboardInterrupt are not swallowed.
            _report_failure("RecoveryControllerWorker_0032")
Beispiel #6
0
    def host_maintenance_mode(self, notification_id, hostname,
                              update_progress):
        """Disable the host's status and optionally record the progress.

        :param notification_id: Notification ID included in the notification
        :param hostname: Host name passed to ``disable_host_status``
        :param update_progress: When exactly ``True``, ``progress`` is set
            to 2 for ``notification_id`` in the notification list

        Failures are reported through ``rc_util.syslogout`` at ``LOG_ERR``
        and are not propagated to the caller.
        """
        try:
            db_engine = dbapi.get_engine()
            session = dbapi.get_session(db_engine)
            self.rc_util_api.disable_host_status(hostname)

            if update_progress is True:
                self.rc_util_db.update_notification_list_db(
                    session, 'progress', 2, notification_id)

        except Exception as exc:
            # Exception rather than a bare except, so SystemExit and
            # KeyboardInterrupt are not swallowed.  KeyError keeps its own
            # message id; everything else shares the generic one.
            if isinstance(exc, KeyError):
                message_id = "RecoveryControllerWorker_0031"
            else:
                message_id = "RecoveryControllerWorker_0032"
            self.rc_util.syslogout_ex(message_id, syslog.LOG_ERR)

            error_type, error_value, traceback_ = sys.exc_info()
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in traceback.format_tb(traceback_):
                self.rc_util.syslogout(tb, syslog.LOG_ERR)
Beispiel #7
0
    def add_failed_instance(self, notification_id, notification_uuid,
                            retry_mode):
        """Register a failed VM and start its recovery thread.

        Creates the vm_list record for the VM, sets ``progress`` to 2 for
        the notification and, unless in retry mode, starts an
        ``rc_worker.recovery_instance`` thread for the VM.  No thread is
        started when no primary id was obtained.

        :param notification_id: The notification ID included in the
         notification
        :param notification_uuid: The recovery target VM UUID included in
         the notification
        :param retry_mode: True when called for re-processing (the thread
         is skipped), False for normal processing

        Failures are reported through ``rc_util.syslogout`` at ``LOG_ERR``
        and are not propagated to the caller.
        """
        def _report_failure(message_id):
            # Emit the message id, then the type, value and formatted
            # traceback frames of the exception currently being handled.
            self.rc_util.syslogout_ex(message_id, syslog.LOG_ERR)
            error_type, error_value, traceback_ = sys.exc_info()
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in traceback.format_tb(traceback_):
                self.rc_util.syslogout(tb, syslog.LOG_ERR)

        try:
            db_engine = dbapi.get_engine()
            session = dbapi.get_session(db_engine)

            # Get primary id of vm_list
            primary_id = self._create_vm_list_db_for_failed_instance(
                session, notification_id, notification_uuid)
            # update record in notification_list
            self.rc_util_db.update_notification_list_db(
                session, 'progress', 2, notification_id)
            # create semaphore (Multiplicity = 1)
            sem_recovery_instance = threading.Semaphore(1)

            if not primary_id:
                return

            if retry_mode == True:
                # Skip recovery_instance.
                # Will delegate to handle_pending_instances
                self.rc_util.syslogout_ex("RecoveryControllerStarter_0027",
                                          syslog.LOG_INFO)
                # %-formatting so a non-string id cannot raise here.
                msg = ("RETRY MODE. Skip recovery_instance thread"
                       " vm_uuide=%s notification_id=%s"
                       % (notification_uuid, notification_id))
                self.rc_util.syslogout(msg, syslog.LOG_INFO)
                return

            self.rc_util.syslogout_ex("RecoveryControllerStarter_0029",
                                      syslog.LOG_INFO)
            msg = ("Run thread rc_worker.recovery_instance."
                   " notification_uuid=%s primary_id=%s"
                   % (notification_uuid, primary_id))
            self.rc_util.syslogout(msg, syslog.LOG_INFO)

            threading.Thread(target=self.rc_worker.recovery_instance,
                             args=(notification_uuid, primary_id,
                                   sem_recovery_instance)).start()

        except KeyError:
            _report_failure("RecoveryControllerStarter_0012")
        except Exception:
            # Exception rather than a bare except, so SystemExit and
            # KeyboardInterrupt are not swallowed.
            _report_failure("RecoveryControllerStarter_0013")
    conf_db_dic = config.get_value('db')
    host = conf_db_dic.get("host")
    db = conf_db_dic.get("name")
    user = conf_db_dic.get("user")
    passwd = conf_db_dic.get("passwd")
    charset = conf_db_dic.get("charset")
except Exception as e:
    # error handling
    print "failed to load configuration parameters."
    print "Exception: ", e
    sys.exit(1)

print "host:", host, "db:", db, "user:"******"passwd:", passwd, "charset:", charset

try:
    # Create an engine to store data in the database
    engine = dbapi.get_engine()
    # Create database if not exists
    if not database_exists(engine.url):
        create_database(engine.url)
    # Create all tables in the engine
    Base.metadata.create_all(engine)
except Exception as e:
    # error handling
    print "failed to create tables."
    print "Exception: ", e
    sys.exit(2)

print "Successfully created tables"
sys.exit(0)
Beispiel #9
0
    conf_db_dic = config.get_value('db')
    host = conf_db_dic.get("host")
    db = conf_db_dic.get("name")
    user = conf_db_dic.get("user")
    passwd = conf_db_dic.get("passwd")
    charset = conf_db_dic.get("charset")
except Exception as e:
    # error handling
    print "failed to load configuration parameters."
    print "Exception: ", e
    sys.exit(1)

print "host:", host, "db:", db, "user:"******"passwd:", passwd, "charset:", charset

try:
    # Create an engine to store data in the database
    engine = dbapi.get_engine()
    # Create database if not exists
    if not database_exists(engine.url):
        create_database(engine.url)
    # Create all tables in the engine
    Base.metadata.create_all(engine)
except Exception as e:
    # error handling
    print "failed to create tables."
    print "Exception: ", e
    sys.exit(2)

print "Successfully created tables"
sys.exit(0)
    def _create_notification_list_db(self, jsonData):
        """Classify a notification and store it when recovery is needed.

        The notification is matched on its ``type``, ``eventID``,
        ``eventType`` and ``detail`` fields:

        * rscGroup/1/2/2 -> inserted with ``recover_by`` 0 (node recovery)
        * rscGroup/1/1/1 -> host is marked up and the event is logged
        * VM/0/5/5 -> inserted with ``recover_by`` 1 (VM recovery)
        * nodeStatus, rscGroup/1/2/3 or rscGroup/1/2/4 -> inserted with
          ``recover_by`` 2 (node lock)
        * VM/0/5/1 and everything else -> only logged

        Retried or repeated notifications are logged as duplicates and are
        not inserted.

        :param jsonData: Notification mapping (read with ``.get``)
        :returns: The value returned by ``insert_notification_list_db``
            when a record was inserted, otherwise an empty dict
        :raises Exception: Any error is logged and then re-raised
        """
        ret_dic = {}

        def _matches(type_, event_id, event_type, detail):
            # Evaluated left to right and short-circuited, like a chained
            # ``and`` written inline.
            return (jsonData.get("type") == type_ and
                    str(jsonData.get("eventID")) == event_id and
                    str(jsonData.get("eventType")) == event_type and
                    str(jsonData.get("detail")) == detail)

        def _is_repeated():
            # Ask whether a notification for this host and time is known.
            tdatetime = datetime.datetime.strptime(
                jsonData.get("time"), '%Y%m%d%H%M%S')
            return self._check_repeated_notify(
                tdatetime, jsonData.get("hostname"), session)

        def _insert(recover_by, message_id):
            # Store the notification, then log it under message_id.
            inserted = self.rc_util_db.insert_notification_list_db(
                jsonData, recover_by, session)
            self.rc_util.syslogout_ex(message_id, syslog.LOG_INFO)
            self.rc_util.syslogout(jsonData, syslog.LOG_INFO)
            return inserted

        def _log_duplicate(message_id):
            self.rc_util.syslogout_ex(message_id, syslog.LOG_INFO)
            # %-formatting so a missing or non-string id cannot turn a
            # log line into a TypeError.
            msg = "Duplicate notifications. id:%s" % (jsonData.get("id"),)
            self.rc_util.syslogout(msg, syslog.LOG_INFO)
            self.rc_util.syslogout(jsonData, syslog.LOG_INFO)

        # Get DB from here and pass it to _check_retry_notification
        try:
            # Get session for db
            db_engine = dbapi.get_engine()
            session = dbapi.get_session(db_engine)

            if self._check_retry_notification(jsonData, session):
                _log_duplicate("RecoveryController_0030")

            # Node Recovery(processing A)
            elif _matches("rscGroup", "1", "2", "2"):
                if _is_repeated():
                    _log_duplicate("RecoveryController_0015")
                else:
                    ret_dic = _insert(0, "RecoveryController_0014")

            # Node is up / notification of starting node.
            elif _matches("rscGroup", "1", "1", "1"):
                hostname = jsonData.get('hostname')
                self.rc_worker.mark_host_up_pf9(hostname)
                # Logged here because this is the only branch that can
                # match rscGroup/1/1/1.
                self.rc_util.syslogout_ex(
                    "RecoveryController_0023", syslog.LOG_INFO)
                self.rc_util.syslogout(jsonData, syslog.LOG_INFO)
                msg = ("Recieved notification of node starting. Node:%s"
                       % (hostname,))
                self.rc_util.syslogout(msg, syslog.LOG_INFO)

            # VM Recovery(processing G)
            elif _matches("VM", "0", "5", "5"):
                ret_dic = _insert(1, "RecoveryController_0019")

            # Node Lock(processing D and F)
            # Node will be locked.
            elif (jsonData.get("type") == 'nodeStatus' or
                  _matches("rscGroup", "1", "2", "3") or
                  _matches("rscGroup", "1", "2", "4")):
                if _is_repeated():
                    _log_duplicate("RecoveryController_0036")
                else:
                    ret_dic = _insert(2, "RecoveryController_0021")

            # Do not recover(Excuted Stop API)
            elif _matches("VM", "0", "5", "1"):
                self.rc_util.syslogout_ex(
                    "RecoveryController_0022", syslog.LOG_INFO)
                self.rc_util.syslogout(jsonData, syslog.LOG_INFO)
                msg = "Do not recover instance.(Excuted Stop API)"
                self.rc_util.syslogout(msg, syslog.LOG_INFO)

            # Ignore notification
            else:
                self.rc_util.syslogout_ex(
                    "RecoveryController_0024", syslog.LOG_INFO)
                self.rc_util.syslogout(jsonData, syslog.LOG_INFO)
                msg = "Ignore notification. Notification:" + str(jsonData)
                self.rc_util.syslogout(msg, syslog.LOG_INFO)
        except Exception:
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            self.rc_util.syslogout_ex(
                "RecoveryController_0046", syslog.LOG_ERR)
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in tb_list:
                self.rc_util.syslogout(tb, syslog.LOG_ERR)
            raise
        return ret_dic
Beispiel #11
0
    def _create_notification_list_db(self, jsonData):
        """Classify a notification and store it when recovery is needed.

        The notification is matched on its ``type``, ``eventID``,
        ``eventType`` and ``detail`` fields.  Node recovery, VM recovery
        and node lock notifications are inserted into the notification
        list with ``recover_by`` 0, 1 and 2; all others are only logged.
        Retried or repeated notifications are logged as duplicates.

        :param jsonData: Notification mapping (read with ``.get``)
        :returns: The value returned by ``insert_notification_list_db``
            when a record was inserted, otherwise an empty dict
        :raises Exception: Any error is logged and then re-raised
        """
        ret_dic = {}

        def _event_is(type_, event_id, event_type, detail):
            # Short-circuits left to right, like an inline ``and`` chain.
            return (jsonData.get("type") == type_
                    and str(jsonData.get("eventID")) == event_id
                    and str(jsonData.get("eventType")) == event_type
                    and str(jsonData.get("detail")) == detail)

        def _seen_before():
            # Ask whether a notification for this host and time is known.
            notified_at = datetime.datetime.strptime(
                jsonData.get("time"), "%Y%m%d%H%M%S")
            return self._check_repeated_notify(
                notified_at, jsonData.get("hostname"), session)

        def _store(recover_by):
            # Insert the notification, then log its payload.
            stored = self.rc_util_db.insert_notification_list_db(
                jsonData, recover_by, session)
            LOG.info(jsonData)
            return stored

        def _log_duplicate():
            text = "Duplicate notifications. id:" + jsonData.get("id")
            LOG.info(text)
            LOG.info(jsonData)

        # The session is created here and handed to every check below.
        try:
            db_engine = dbapi.get_engine(self.rc_config)
            session = dbapi.get_session(db_engine)

            if self._check_retry_notification(jsonData, session):
                _log_duplicate()

            # Node recovery (processing A)
            elif _event_is("rscGroup", "1", "2", "2"):
                if _seen_before():
                    _log_duplicate()
                else:
                    ret_dic = _store(0)

            # VM recovery (processing G)
            elif _event_is("VM", "0", "5", "5"):
                ret_dic = _store(1)

            # Node lock (processing D and F): the node will be locked.
            elif (jsonData.get("type") == "nodeStatus"
                  or _event_is("rscGroup", "1", "2", "3")
                  or _event_is("rscGroup", "1", "2", "4")):
                if _seen_before():
                    _log_duplicate()
                else:
                    ret_dic = _store(2)

            # No recovery: the stop API was executed.
            elif _event_is("VM", "0", "5", "1"):
                LOG.info(jsonData)
                LOG.info("Do not recover instance.(Excuted Stop API)")

            # Notification that a node is starting.
            elif _event_is("rscGroup", "1", "1", "1"):
                LOG.info(jsonData)
                text = ("Recieved notification of node starting. Node:"
                        + jsonData["hostname"])
                LOG.info(text)

            # Anything else is ignored.
            else:
                LOG.info(jsonData)
                LOG.info("Ignore notification. Notification:"
                         + str(jsonData))
        except Exception:
            exc_type, exc_value, exc_tb = sys.exc_info()
            frames = traceback.format_tb(exc_tb)
            LOG.error(exc_type)
            LOG.error(exc_value)
            for frame in frames:
                LOG.error(frame)
            raise

        return ret_dic
Beispiel #12
0
    def add_failed_instance(self, notification_id, notification_uuid,
                            retry_mode):
        """Register a failed VM and start its recovery thread.

        Sets the request context, creates the vm_list record for the VM,
        sets ``progress`` to 2 for the notification and, unless in retry
        mode, starts a named ``rc_worker.recovery_instance`` thread for the
        VM.  No thread is started when no primary id was obtained.

        :param notification_id: The notification ID included in the
         notification
        :param notification_uuid: The recovery target VM UUID included in
         the notification
        :param retry_mode: True when called for re-processing (the thread
         is skipped), False for normal processing

        Failures are logged with ``LOG.error`` and are not propagated to
        the caller.
        """
        try:
            self.rc_config.set_request_context()
            db_engine = dbapi.get_engine(self.rc_config)
            session = dbapi.get_session(db_engine)

            # Get primary id of vm_list
            primary_id = self._create_vm_list_db_for_failed_instance(
                session, notification_id, notification_uuid)
            # update record in notification_list
            self.rc_util_db.update_notification_list_db(
                session, 'progress', 2, notification_id)
            # create semaphore (Multiplicity = 1)
            sem_recovery_instance = threading.Semaphore(1)

            if not primary_id:
                return

            if retry_mode is True:
                # Skip recovery_instance.
                # Will delegate to handle_pending_instances
                # Logger %-args so a non-string id cannot raise here.
                LOG.info("RETRY MODE. Skip recovery_instance thread"
                         " vm_uuide=%s notification_id=%s",
                         notification_uuid, notification_id)
                return

            LOG.info("Run thread rc_worker.recovery_instance."
                     " notification_uuid=%s primary_id=%s",
                     notification_uuid, primary_id)
            thread_name = self.rc_util.make_thread_name(
                VM_LIST, primary_id)
            threading.Thread(target=self.rc_worker.recovery_instance,
                             name=thread_name,
                             args=(notification_uuid, primary_id,
                                   sem_recovery_instance)).start()

        except Exception:
            # A single handler: KeyError needs no special treatment here.
            # Exception rather than a bare except, so SystemExit and
            # KeyboardInterrupt are not swallowed.
            error_type, error_value, traceback_ = sys.exc_info()
            LOG.error(error_type)
            LOG.error(error_value)
            for tb in traceback.format_tb(traceback_):
                LOG.error(tb)
Beispiel #13
0
    def masakari(self):
        """
        RecoveryController class main processing.

        Re-runs the unfinished rows of the notification list table, then
        starts the wsgi server and waits for notifications:

        1. Open a DB session that is used by this thread only.
        2. Call ``_update_old_records_notification_list`` and
           ``_find_reprocessing_records_notification_list``. For every row
           returned, start worker thread(s) selected by ``row.recover_by``:

           * ``0`` (node recovery event): start
             ``rc_worker.host_maintenance_mode`` with update_progress=False,
             sleep ``node_err_wait`` seconds (``recover_starter`` config
             section), then start ``rc_starter.add_failed_host`` with
             retry_mode=True.
           * ``1`` (instance recovery event): start
             ``rc_starter.add_failed_instance``.
           * any other value (maintenance mode event): start
             ``rc_worker.host_maintenance_mode`` with update_progress=True.
        3. Start the ``rc_starter.handle_pending_instances`` thread.
        4. Serve ``_notification_reciever`` on the port given by
           ``server_port`` in the ``wsgi`` config section.

        Whatever exception escapes these steps is logged at CRITICAL level
        (type, value and traceback lines) and the process exits through
        ``sys.exit()``.
        """
        try:
            LOG.info("masakari START.")

            # Get a session and do not pass it to other threads
            db_engine = dbapi.get_engine(self.rc_config)
            session = dbapi.get_session(db_engine)

            self._update_old_records_notification_list(session)
            result = self._find_reprocessing_records_notification_list(session)

            for row in result:
                if row.recover_by == 0:
                    # node recovery event
                    # %-formatting keeps the text identical for str fields
                    # and does not raise when a field is not a str.
                    msg = ("Run thread rc_worker.host_maintenance_mode."
                           " notification_id=%s"
                           " notification_hostname=%s"
                           " update_progress=False"
                           % (row.notification_id,
                              row.notification_hostname))
                    LOG.info(msg)
                    thread_name = self.rc_util.make_thread_name(
                        NOTIFICATION_LIST, row.notification_id)
                    th = threading.Thread(
                        target=self.rc_worker.host_maintenance_mode,
                        name=thread_name,
                        args=(
                            row.notification_id,
                            row.notification_hostname,
                            False,
                        ))
                    th.start()

                    # Sleep until updating nova-compute service status
                    # down.
                    dic = self.rc_config.get_value('recover_starter')
                    node_err_wait = dic.get("node_err_wait")
                    # Each fragment ends with a space so the implicitly
                    # concatenated literals do not run together.
                    msg = ("Sleeping %s sec before starting node recovery "
                           "thread, until updateing nova-compute "
                           "service status." % (node_err_wait,))
                    LOG.info(msg)
                    greenthread.sleep(int(node_err_wait))

                    # Start add_failed_host thread
                    # TODO(sampath):
                    # Avoid creating a thread here,
                    # instead call rc_starter.add_failed_host
                    retry_mode = True
                    msg = ("Run thread rc_starter.add_failed_host."
                           " notification_id=%s"
                           " notification_hostname=%s"
                           " notification_cluster_port=%s"
                           " retry_mode=%s"
                           % (row.notification_id,
                              row.notification_hostname,
                              row.notification_cluster_port,
                              retry_mode))
                    LOG.info(msg)
                    thread_name = self.rc_util.make_thread_name(
                        NOTIFICATION_LIST, row.notification_id)
                    th = threading.Thread(
                        target=self.rc_starter.add_failed_host,
                        name=thread_name,
                        args=(
                            row.notification_id,
                            row.notification_hostname,
                            row.notification_cluster_port,
                            retry_mode,
                        ))
                    th.start()

                elif row.recover_by == 1:
                    # instance recovery event
                    # TODO(sampath):
                    # Avoid creating a thread here,
                    # instead call rc_starter.add_failed_instance
                    # NOTE(review): only two positional arguments are
                    # passed; confirm rc_starter.add_failed_instance does
                    # not require an additional retry_mode argument.
                    msg = ("Run thread rc_starter.add_failed_instance."
                           " notification_id=%s"
                           " notification_uuid=%s"
                           % (row.notification_id,
                              row.notification_uuid))
                    LOG.info(msg)
                    thread_name = self.rc_util.make_thread_name(
                        NOTIFICATION_LIST, row.notification_id)
                    th = threading.Thread(
                        target=self.rc_starter.add_failed_instance,
                        name=thread_name,
                        args=(
                            row.notification_id,
                            row.notification_uuid,
                        ))
                    th.start()

                else:
                    # maintenance mode event
                    # The thread target lives on rc_worker, so the message
                    # names rc_worker as well.
                    msg = ("Run thread rc_worker.host_maintenance_mode."
                           " notification_id=%s"
                           " notification_hostname=%s"
                           " update_progress=True"
                           % (row.notification_id,
                              row.notification_hostname))
                    LOG.info(msg)
                    thread_name = self.rc_util.make_thread_name(
                        NOTIFICATION_LIST, row.notification_id)
                    th = threading.Thread(
                        target=self.rc_worker.host_maintenance_mode,
                        name=thread_name,
                        args=(
                            row.notification_id,
                            row.notification_hostname,
                            True,
                        ))
                    th.start()

            # Start handle_pending_instances thread
            # TODO(sampath):
            # Avoid creating a thread here,
            # instead call rc_starter.handle_pending_instances()
            msg = "Run thread rc_starter.handle_pending_instances."
            LOG.info(msg)
            thread_name = "Thread:handle_pending_instances"
            th = threading.Thread(
                target=self.rc_starter.handle_pending_instances,
                name=thread_name)
            th.start()

            # Start receiver process for notification
            conf_wsgi_dic = self.rc_config.get_value('wsgi')
            wsgi.server(
                eventlet.listen(('', int(conf_wsgi_dic['server_port']))),
                self._notification_reciever)

        except BaseException:
            # Catch-all on purpose: database errors, missing config keys and
            # anything else that stops start-up or the wsgi server are all
            # reported the same way before the process exits.
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            LOG.critical(error_type)
            LOG.critical(error_value)
            for tb in tb_list:
                LOG.critical(tb)

            sys.exit()
Beispiel #14
0
    def add_failed_host(self, notification_id, notification_hostname,
                        notification_cluster_port, retry_mode):
        """
        Node recover start thread :
            This thread starts the VM recover execution thread,
            only the number of existing vm in the recovery target node.

        Steps:
            1. Fetch the servers on ``notification_hostname``. If there are
               none, set the notification ``progress`` to 2 and return.
            2. Read ``recover_to`` from the notification row. When
               ``retry_mode`` is False and that host is not an undeleted
               reserve host, pick one by cluster port; if none exists, set
               ``progress`` to 3 and return.
            3. Mark the ``recover_to`` reserve host as deleted.
            4. Register every VM in vm_list and, unless in retry mode, start
               one ``rc_worker.recovery_instance`` thread per VM. VMs whose
               registration failed are retried up to
               ``recovery_max_retry_cnt`` times, sleeping
               ``recovery_retry_interval`` seconds between passes.
            5. Set the notification ``progress`` to 2.

        Exceptions are written to syslog and swallowed; nothing is raised
        to the caller.

        :param notification_id: The notification ID included in the
         notification
        :param notification_hostname: The host name of the failure node that
         is included in the notification
        :param notification_cluster_port: Cluster port used to look up a
         reserve host when the notification has no usable ``recover_to``
        :param retry_mode: True when called for re-processing (reserve host
         selection, retries and recovery threads are skipped), False in
         normal processing
        """

        def log_failure(message_id):
            # Write the message ID followed by the type, value and traceback
            # lines of the exception currently being handled.
            self.rc_util.syslogout_ex(message_id, syslog.LOG_ERR)
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in tb_list:
                self.rc_util.syslogout(tb, syslog.LOG_ERR)

        try:
            db_engine = dbapi.get_engine()
            session = dbapi.get_session(db_engine)
            conf_dict = self.rc_config.get_value('recover_starter')
            recovery_max_retry_cnt = conf_dict.get('recovery_max_retry_cnt')
            recovery_retry_interval = conf_dict.get('recovery_retry_interval')

            vm_list = self.rc_util_api.fetch_servers_on_hypervisor(
                notification_hostname)

            # Count vm_list
            if len(vm_list) == 0:
                self.rc_util.syslogout_ex("RecoveryControllerStarter_0014",
                                          syslog.LOG_INFO)
                msg = "There is no instance in " + notification_hostname + "."
                self.rc_util.syslogout(msg, syslog.LOG_INFO)

                # update record in notification_list
                self.rc_util_db.update_notification_list_db(
                    session, 'progress', 2, notification_id)

                return

            result = dbapi.get_all_notification_list_by_id_for_update(
                session, notification_id)
            recover_to = result.pop().recover_to

            if retry_mode is False:
                cnt = dbapi.get_all_reserve_list_by_hostname_not_deleted(
                    session, recover_to)

                if not cnt:
                    # recover_to is not an available reserve host: pick one
                    # by cluster port instead.
                    cnt = dbapi.\
                        get_one_reserve_list_by_cluster_port_for_update(
                            session,
                            notification_cluster_port,
                            notification_hostname
                        )

                    if not cnt:
                        self.rc_util.syslogout_ex(
                            "RecoveryControllerStarter_0022",
                            syslog.LOG_WARNING)
                        msg = "The reserve node not exist in " \
                              "reserve_list DB, " \
                              "so do not recover instances."
                        self.rc_util.syslogout(msg, syslog.LOG_WARNING)
                        # The session must be passed like at every other
                        # call site; without it the call fails and progress
                        # is never set to 3.
                        self.rc_util_db.update_notification_list_db(
                            session, 'progress', 3, notification_id)
                        return

                    result = cnt.pop()
                    recover_to = result.hostname
                    update_at = datetime.datetime.now()
                    dbapi.update_notification_list_by_notification_id_recover_to(
                        session, notification_id, update_at, recover_to)

                    self.rc_util.syslogout_ex(
                        "RecoveryControllerStarter_0024", syslog.LOG_INFO)

            self.rc_util.syslogout_ex("RecoveryControllerStarter_0015",
                                      syslog.LOG_INFO)

            delete_at = datetime.datetime.now()
            dbapi.update_reserve_list_by_hostname_as_deleted(
                session, recover_to, delete_at)

            # create semaphore (Multiplicity is get from config.)
            sem_recovery_instance = threading.Semaphore(
                int(conf_dict.get('semaphore_multiplicity')))

            # Same comparison the per-VM checks used, evaluated once.
            in_retry_mode = (retry_mode == True)  # noqa: E712

            incomplete_list = []
            for _ in range(0, int(recovery_max_retry_cnt)):
                incomplete_list = []

                for vm_uuid in vm_list:
                    primary_id = self._create_vm_list_db_for_failed_host(
                        session, notification_id, vm_uuid)

                    if primary_id:
                        if in_retry_mode:
                            # Skip recovery_instance thread. Will delegate to
                            # ...
                            msg = "RETRY MODE. Skip recovery_instance thread" \
                                + " vm_uuide=" + vm_uuid \
                                + " notification_id=" + notification_id
                            self.rc_util.syslogout(msg, syslog.LOG_INFO)
                        else:
                            msg = "Run thread rc_worker.recovery_instance." \
                                + " vm_uuid=" + vm_uuid \
                                + " primary_id=" + str(primary_id)
                            self.rc_util.syslogout(msg, syslog.LOG_INFO)

                            threading.Thread(
                                target=self.rc_worker.recovery_instance,
                                args=(vm_uuid, primary_id,
                                      sem_recovery_instance)).start()
                    elif not in_retry_mode:
                        # Registration failed: try this VM again on the
                        # next pass (never in retry mode).
                        incomplete_list.append(vm_uuid)

                if incomplete_list:
                    vm_list = incomplete_list
                    greenthread.sleep(int(recovery_retry_interval))
                else:
                    break

            # VMs still unregistered after the last pass are inserted
            # directly and their recovery threads are started.
            for vm_uuid in incomplete_list:
                primary_id = self.rc_util_db.insert_vm_list_db(
                    session, notification_id, vm_uuid, 0)

                self.rc_util.syslogout_ex("RecoveryControllerStarter_0031",
                                          syslog.LOG_INFO)
                msg = "Run thread rc_worker.recovery_instance." \
                    + " vm_uuid=" + vm_uuid \
                    + " primary_id=" + str(primary_id)
                self.rc_util.syslogout(msg, syslog.LOG_INFO)
                threading.Thread(target=self.rc_worker.recovery_instance,
                                 args=(vm_uuid, primary_id,
                                       sem_recovery_instance)).start()

            # update record in notification_list
            self.rc_util_db.update_notification_list_db(
                session, 'progress', 2, notification_id)

            return

        except KeyError:
            log_failure("RecoveryControllerStarter_0017")
            return
        except BaseException:
            # Catch-all on purpose: this runs as a thread target and must
            # report every failure instead of raising.
            log_failure("RecoveryControllerStarter_0018")
            return
Beispiel #15
0
    def recovery_instance(self, uuid, primary_id, sem):
        """
        Execute VM recovery.

        Acquires ``sem``, sets the vm_list ``progress`` of ``primary_id``
        to 1, collects the VM state, its ``HA-Enabled`` metadata and the
        recover_by/recover_to values, and calls ``_execute_recovery``.
        Afterwards ``progress`` is set to 2 when the resulting status equals
        ``STATUS_NORMAL`` and to 3 otherwise. Exceptions are logged and
        swallowed; nothing is raised to the caller.

        The semaphore is released whenever it was acquired, even if the
        final progress update fails.

        :param uuid: Recovery target VM UUID
        :param primary_id: Unique ID of the vm_list table
        :param sem: Semaphore acquired before the recovery starts and
         released when it ends
        """

        def log_current_exception():
            # Log type, value and traceback lines of the exception that is
            # currently being handled.
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            LOG.error(error_type)
            LOG.error(error_value)
            for tb in tb_list:
                LOG.error(tb)

        # Bound up front so the finally clause can rely on them even when
        # the failure happens before the session exists.
        session = None
        sem_acquired = False
        status = self.STATUS_ERROR

        try:
            sem.acquire()
            sem_acquired = True
            self.rc_config.set_request_context()
            db_engine = dbapi.get_engine(self.rc_config)
            session = dbapi.get_session(db_engine)

            # Initialize status.
            status = self.STATUS_NORMAL

            # Update vmha recovery status.
            self.rc_util_db.update_vm_list_db(
                session, 'progress', 1, primary_id)

            # Get vm information.
            vm_info = self._get_vm_param(uuid)
            HA_Enabled = vm_info.metadata.get('HA-Enabled')
            if HA_Enabled:
                HA_Enabled = HA_Enabled.upper()
            # Everything except an explicit 'OFF' counts as enabled.
            if HA_Enabled != 'OFF':
                HA_Enabled = 'ON'

            # Set recovery parameter.
            vm_state = getattr(vm_info, 'OS-EXT-STS:vm_state')
            recover_by, recover_to = self._get_vmha_param(
                session, uuid, primary_id)

            # Execute.
            status = self._execute_recovery(session,
                                            uuid,
                                            vm_state,
                                            HA_Enabled,
                                            recover_by,
                                            recover_to)

        except BaseException:
            # Catch-all on purpose: this runs as a thread target and every
            # failure is recorded as an abnormal end.
            status = self.STATUS_ERROR
            log_current_exception()
            return
        finally:
            try:
                if session is None:
                    # No session was obtained, so the vm_list record cannot
                    # be updated.
                    LOG.error("Recovery process has been terminated "
                              "abnormally before a DB session was obtained.")

                # Successful execution.
                elif status == self.STATUS_NORMAL:
                    self.rc_util_db.update_vm_list_db(
                        session, 'progress', 2, primary_id)

                    msg = "Recovery process has been completed successfully."
                    LOG.info(msg)

                # Abnormal termination.
                else:
                    self.rc_util_db.update_vm_list_db(
                        session, 'progress', 3, primary_id)

                    msg = "Recovery process has been terminated abnormally."
                    LOG.info(msg)

            except BaseException:
                log_current_exception()
            finally:
                # Release semaphore. Done in its own finally so a failing
                # progress update cannot leak the permit and stall the
                # other recovery threads sharing this semaphore.
                if sem_acquired:
                    sem.release()
    def masakari(self):
        """
        RecoveryController class main processing.

        Re-runs the unfinished rows of the notification list table, then
        starts the wsgi server and waits for notifications:

        1. Open a DB session that is used by this thread only.
        2. Call ``_update_old_records_notification_list`` and
           ``_find_reprocessing_records_notification_list``. For every row
           returned, start worker thread(s) selected by ``row.recover_by``:

           * ``0`` (node recovery event): start
             ``rc_worker.host_maintenance_mode`` with update_progress=False,
             sleep ``node_err_wait`` seconds (``recover_starter`` config
             section), then start ``rc_starter.add_failed_host`` with
             retry_mode=True.
           * ``1`` (instance recovery event): start
             ``rc_starter.add_failed_instance``.
           * any other value (maintenance mode event): start
             ``rc_worker.host_maintenance_mode`` with update_progress=True.
        3. Start the ``rc_starter.handle_pending_instances`` thread.
        4. Serve ``_notification_reciever`` on the port given by
           ``server_port`` in the ``wsgi`` config section.

        Whatever exception escapes these steps is written to syslog with a
        message ID that depends on its kind (SQLAlchemy error, KeyError,
        anything else) and the process exits through ``sys.exit()``.
        """

        def report_fatal(message_id):
            # Write the message ID followed by the type, value and traceback
            # lines of the exception currently being handled.
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            self.rc_util.syslogout_ex(message_id, syslog.LOG_ERR)
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in tb_list:
                self.rc_util.syslogout(tb, syslog.LOG_ERR)

        try:
            self.rc_util.syslogout_ex("RecoveryController_0004",
                                      syslog.LOG_INFO)
            self.rc_util.syslogout("masakari START.", syslog.LOG_INFO)

            # Get a session and do not pass it to other threads
            db_engine = dbapi.get_engine()
            session = dbapi.get_session(db_engine)

            self._update_old_records_notification_list(session)
            result = self._find_reprocessing_records_notification_list(session)

            for row in result:
                if row.recover_by == 0:
                    # node recovery event
                    th = threading.Thread(
                        target=self.rc_worker.host_maintenance_mode,
                        args=(
                            row.notification_id,
                            row.notification_hostname,
                            False,
                        ))
                    th.start()

                    # Sleep until updating nova-compute service status
                    # down.
                    self.rc_util.syslogout_ex("RecoveryController_0035",
                                              syslog.LOG_INFO)
                    dic = self.rc_config.get_value('recover_starter')
                    node_err_wait = dic.get("node_err_wait")
                    # Each fragment ends with a space so the implicitly
                    # concatenated literals do not run together.
                    msg = ("Sleeping %s sec before starting node recovery "
                           "thread, until updateing nova-compute "
                           "service status." % (node_err_wait,))
                    self.rc_util.syslogout(msg, syslog.LOG_INFO)
                    greenthread.sleep(int(node_err_wait))

                    # Start add_failed_host thread
                    # TODO(sampath):
                    # Avoid creating a thread here,
                    # instead call rc_starter.add_failed_host
                    retry_mode = True
                    th = threading.Thread(
                        target=self.rc_starter.add_failed_host,
                        args=(
                            row.notification_id,
                            row.notification_hostname,
                            row.notification_cluster_port,
                            retry_mode,
                        ))
                    th.start()

                elif row.recover_by == 1:
                    # instance recovery event
                    # TODO(sampath):
                    # Avoid creating a thread here,
                    # instead call rc_starter.add_failed_instance
                    th = threading.Thread(
                        target=self.rc_starter.add_failed_instance,
                        args=(
                            row.notification_id,
                            row.notification_uuid,
                        ))
                    th.start()

                else:
                    # maintenance mode event
                    th = threading.Thread(
                        target=self.rc_worker.host_maintenance_mode,
                        args=(
                            row.notification_id,
                            row.notification_hostname,
                            True,
                        ))
                    th.start()

            # Start handle_pending_instances thread
            # TODO(sampath):
            # Avoid creating a thread here,
            # instead call rc_starter.handle_pending_instances()
            th = threading.Thread(
                target=self.rc_starter.handle_pending_instances)
            th.start()

            # Start receiver process for notification
            conf_wsgi_dic = self.rc_config.get_value('wsgi')
            wsgi.server(
                eventlet.listen(('', int(conf_wsgi_dic['server_port']))),
                self._notification_reciever)
        except exc.SQLAlchemyError:
            report_fatal("RecoveryController_0005")
            sys.exit()
        except KeyError:
            report_fatal("RecoveryController_0006")
            sys.exit()
        except BaseException:
            # Catch-all on purpose: anything else that stops start-up or
            # the wsgi server is reported before the process exits.
            report_fatal("RecoveryController_0007")
            sys.exit()
Beispiel #17
0
    def add_failed_host(self,
                        notification_id,
                        notification_hostname,
                        notification_cluster_port,
                        retry_mode):
        """
        Node recover start thread :
            This thread starts the VM recover execution thread,
            only the number of existing vm in the recovery target node.

        Steps:
            1. Fetch the servers on ``notification_hostname``. If there are
               none, set the notification ``progress`` to 2 and return.
            2. Read ``recover_to`` from the notification row. When
               ``retry_mode`` is False and that host is not an undeleted
               reserve host, pick one by cluster port; if none exists, set
               ``progress`` to 3 and return.
            3. Mark the ``recover_to`` reserve host as deleted.
            4. Register every VM in vm_list and, unless in retry mode, start
               one ``rc_worker.recovery_instance`` thread per VM. VMs whose
               registration failed are retried up to
               ``recovery_max_retry_cnt`` times, sleeping
               ``recovery_retry_interval`` seconds between passes.
            5. Set the notification ``progress`` to 2.

        Exceptions are logged and swallowed; nothing is raised to the
        caller.

        :param notification_id: The notification ID included in the
         notification
        :param notification_hostname: The host name of the failure node that
         is included in the notification
        :param notification_cluster_port: Cluster port used to look up a
         reserve host when the notification has no usable ``recover_to``
        :param retry_mode: True when called for re-processing (reserve host
         selection, retries and recovery threads are skipped), False in
         normal processing
        """

        try:
            self.rc_config.set_request_context()
            db_engine = dbapi.get_engine(self.rc_config)
            session = dbapi.get_session(db_engine)
            conf_dict = self.rc_config.get_value('recover_starter')
            recovery_max_retry_cnt = conf_dict.get('recovery_max_retry_cnt')
            recovery_retry_interval = conf_dict.get('recovery_retry_interval')

            vm_list = self.rc_util_api.fetch_servers_on_hypervisor(
                notification_hostname)

            # Count vm_list
            if len(vm_list) == 0:
                msg = "There is no instance in " + notification_hostname + "."
                LOG.info(msg)

                # update record in notification_list
                self.rc_util_db.update_notification_list_db(
                    session, 'progress', 2, notification_id)

                return

            msg = "Do get_all_notification_list_by_id_for_update."
            LOG.info(msg)
            result = dbapi.get_all_notification_list_by_id_for_update(
                session, notification_id)
            msg = "Succeeded in " \
                + "get_all_notification_list_by_id_for_update. " \
                + "Return_value = " + str(result)
            LOG.info(msg)
            recover_to = result.pop().recover_to

            if retry_mode is False:
                msg = "Do get_all_reserve_list_by_hostname_not_deleted."
                LOG.info(msg)
                cnt = dbapi.get_all_reserve_list_by_hostname_not_deleted(
                    session,
                    recover_to)
                msg = "Succeeded in " \
                    + "get_all_reserve_list_by_hostname_not_deleted. " \
                    + "Return_value = " + str(cnt)
                LOG.info(msg)

                if not cnt:
                    # recover_to is not an available reserve host: pick one
                    # by cluster port instead.
                    msg = "Do " \
                        + "get_one_reserve_list_by_cluster_port_for_update."
                    LOG.info(msg)
                    cnt = dbapi.\
                        get_one_reserve_list_by_cluster_port_for_update(
                            session,
                            notification_cluster_port,
                            notification_hostname
                        )
                    msg = "Succeeded in " \
                        + "get_one_reserve_list_by_cluster_port_for_update. " \
                        + "Return_value = " + str(cnt)
                    LOG.info(msg)

                    if not cnt:
                        msg = "The reserve node not exist in " \
                              "reserve_list DB, " \
                              "so do not recover instances."
                        LOG.warning(msg)
                        # The session must be passed like at every other
                        # call site; without it the call fails and progress
                        # is never set to 3.
                        self.rc_util_db.update_notification_list_db(
                            session, 'progress', 3, notification_id)

                        return

                    result = cnt.pop()
                    recover_to = result.hostname
                    update_at = datetime.datetime.now()
                    msg = "Do " \
                        + "update_notification_list_by_notification_id_recover_to."
                    LOG.info(msg)
                    dbapi.update_notification_list_by_notification_id_recover_to(
                        session,
                        notification_id,
                        update_at,
                        recover_to
                    )
                    msg = "Succeeded in " \
                        + "update_notification_list_by_notification_id_recover_to."
                    LOG.info(msg)

            delete_at = datetime.datetime.now()

            msg = "Do update_reserve_list_by_hostname_as_deleted."
            LOG.info(msg)
            dbapi.update_reserve_list_by_hostname_as_deleted(
                session, recover_to, delete_at)
            msg = "Succeeded in " \
                + "update_reserve_list_by_hostname_as_deleted."
            LOG.info(msg)

            # create semaphore (Multiplicity is get from config.)
            sem_recovery_instance = threading.Semaphore(
                int(conf_dict.get('semaphore_multiplicity')))

            incomplete_list = []
            for _ in range(0, int(recovery_max_retry_cnt)):
                incomplete_list = []

                for vm_uuid in vm_list:
                    primary_id = self._create_vm_list_db_for_failed_host(
                        session, notification_id, vm_uuid)

                    if primary_id:
                        if retry_mode is True:
                            # Skip recovery_instance thread. Will delegate to
                            # ...
                            msg = "RETRY MODE. Skip recovery_instance thread" \
                                + " vm_uuide=" + vm_uuid \
                                + " notification_id=" + notification_id
                            LOG.info(msg)
                        else:
                            msg = "Run thread rc_worker.recovery_instance." \
                                + " vm_uuid=" + vm_uuid \
                                + " primary_id=" + str(primary_id)
                            LOG.info(msg)

                            thread_name = self.rc_util.make_thread_name(
                                VM_LIST, primary_id)
                            threading.Thread(
                                target=self.rc_worker.recovery_instance,
                                name=thread_name,
                                args=(vm_uuid, primary_id,
                                      sem_recovery_instance)).start()
                    elif retry_mode is not True:
                        # Registration failed: try this VM again on the
                        # next pass (never in retry mode).
                        incomplete_list.append(vm_uuid)

                if incomplete_list:
                    vm_list = incomplete_list
                    greenthread.sleep(int(recovery_retry_interval))
                else:
                    break

            # VMs still unregistered after the last pass are inserted
            # directly and their recovery threads are started.
            for vm_uuid in incomplete_list:
                primary_id = self.rc_util_db.insert_vm_list_db(
                    session, notification_id, vm_uuid, 0)

                msg = "Run thread rc_worker.recovery_instance." \
                    + " vm_uuid=" + vm_uuid \
                    + " primary_id=" + str(primary_id)
                LOG.info(msg)
                thread_name = self.rc_util.make_thread_name(
                    VM_LIST, primary_id)
                threading.Thread(
                    target=self.rc_worker.recovery_instance,
                    name=thread_name,
                    args=(vm_uuid, primary_id,
                          sem_recovery_instance)).start()

            # update record in notification_list
            self.rc_util_db.update_notification_list_db(
                session, 'progress', 2, notification_id)

            return

        except BaseException:
            # Catch-all on purpose: this runs as a thread target and must
            # report every failure (KeyError included) instead of raising.
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            LOG.error(error_type)
            LOG.error(error_value)
            for tb in tb_list:
                LOG.error(tb)
            return
Beispiel #18
0
    def add_failed_instance(self, notification_id,
                            notification_uuid, retry_mode):
        """
        Entry point of the VM recovery starter.

        Creates the vm_list record for the failed instance, sets the
        notification's 'progress' to 2 and, unless retry_mode is True,
        launches a thread running rc_worker.recovery_instance for it.
        Every exception is logged and swallowed, so the method always
        returns None.

        :param notification_id: The notification ID included in the
         notification
        :param notification_uuid: UUID of the VM to recover, as carried by
         the notification
        :param retry_mode: True when called during re-processing (no
         recovery thread is started here), False during normal processing
        """
        try:
            self.rc_config.set_request_context()
            session = dbapi.get_session(dbapi.get_engine(self.rc_config))

            # Primary id of the vm_list record for this instance.
            vm_list_id = self._create_vm_list_db_for_failed_instance(
                session, notification_id, notification_uuid)
            # Update the record in notification_list.
            self.rc_util_db.update_notification_list_db(
                session, 'progress', 2, notification_id)
            # Semaphore with multiplicity 1, handed to the worker thread.
            worker_sem = threading.Semaphore(1)

            if not vm_list_id:
                return

            if retry_mode is True:
                # No thread is started here; the original comment says the
                # work is delegated to handle_pending_instances.
                LOG.info("RETRY MODE. Skip recovery_instance thread"
                         + " vm_uuide=" + notification_uuid
                         + " notification_id=" + notification_id)
                return

            LOG.info("Run thread rc_worker.recovery_instance."
                     + " notification_uuid=" + notification_uuid
                     + " primary_id=" + str(vm_list_id))
            worker_name = self.rc_util.make_thread_name(VM_LIST, vm_list_id)
            worker = threading.Thread(
                target=self.rc_worker.recovery_instance,
                name=worker_name,
                args=(notification_uuid, vm_list_id, worker_sem))
            worker.start()

        except:
            # One catch-all handler: a KeyError is reported exactly like
            # any other failure (type, value, then each traceback entry).
            exc_class, exc_obj, exc_tb = sys.exc_info()
            frames = traceback.format_tb(exc_tb)
            LOG.error(exc_class)
            LOG.error(exc_obj)
            for frame in frames:
                LOG.error(frame)
Beispiel #19
0
    def handle_pending_instances(self):
        """
        Start recovery threads for the vm_list records awaiting
        re-processing.

        Calls _update_old_records_vm_list, fetches the records returned by
        _find_reprocessing_records_vm_list and starts one
        rc_worker.recovery_instance thread per record. All threads share a
        single semaphore whose multiplicity is the
        "semaphore_multiplicity" value of the "recover_starter" config
        section. Every exception is reported to syslog and swallowed; the
        method always returns None.
        """
        def report_failure(message_id):
            # Emit the message id, then exception type, value and each
            # traceback entry at LOG_ERR level.
            self.rc_util.syslogout_ex(message_id, syslog.LOG_ERR)
            exc_class, exc_obj, exc_tb = sys.exc_info()
            frames = traceback.format_tb(exc_tb)
            self.rc_util.syslogout(exc_class, syslog.LOG_ERR)
            self.rc_util.syslogout(exc_obj, syslog.LOG_ERR)
            for frame in frames:
                self.rc_util.syslogout(frame, syslog.LOG_ERR)

        try:
            session = dbapi.get_session(dbapi.get_engine())

            self._update_old_records_vm_list(session)
            pending_rows = self._find_reprocessing_records_vm_list(session)

            # [recover_starter] section: multiplicity of the shared semaphore
            starter_conf = self.rc_config.get_value("recover_starter")
            multiplicity = starter_conf.get("semaphore_multiplicity")
            shared_sem = threading.Semaphore(int(multiplicity))

            # Nothing pending: done.
            if not len(pending_rows) > 0:
                return

            # One worker thread per pending record.
            for record in pending_rows:
                vm_uuid = record.uuid
                primary_id = record.id
                self.rc_util.syslogout_ex("RecoveryControllerStarter_0032",
                                          syslog.LOG_INFO)
                self.rc_util.syslogout(
                    "Run thread rc_worker.recovery_instance."
                    + " vm_uuid=" + vm_uuid
                    + " primary_id=" + str(primary_id),
                    syslog.LOG_INFO)
                worker = threading.Thread(
                    target=self.rc_worker.recovery_instance,
                    args=(vm_uuid, primary_id, shared_sem))
                worker.start()

        except KeyError:
            report_failure("RecoveryControllerStarter_0020")
        except:
            report_failure("RecoveryControllerStarter_0021")
Beispiel #20
0
    def add_failed_instance(self, notification_id, notification_uuid, retry_mode):
        """
        Entry point of the VM recovery starter.

        Creates the vm_list record for the failed instance, sets the
        notification's "progress" to 2 and, unless retry_mode equals True,
        launches a thread running rc_worker.recovery_instance for it.
        Every exception is reported to syslog and swallowed, so the method
        always returns None.

        :param notification_id: The notification ID included in the
         notification
        :param notification_uuid: UUID of the VM to recover, as carried by
         the notification
        :param retry_mode: True when called during re-processing (no
         recovery thread is started here), False during normal processing
        """

        def report_failure(message_id):
            # Emit the message id, then exception type, value and each
            # traceback entry at LOG_ERR level.
            self.rc_util.syslogout_ex(message_id, syslog.LOG_ERR)
            exc_class, exc_obj, exc_tb = sys.exc_info()
            frames = traceback.format_tb(exc_tb)
            self.rc_util.syslogout(exc_class, syslog.LOG_ERR)
            self.rc_util.syslogout(exc_obj, syslog.LOG_ERR)
            for frame in frames:
                self.rc_util.syslogout(frame, syslog.LOG_ERR)

        try:
            session = dbapi.get_session(dbapi.get_engine())

            # Primary id of the vm_list record for this instance.
            vm_list_id = self._create_vm_list_db_for_failed_instance(session, notification_id, notification_uuid)
            # Update the record in notification_list.
            self.rc_util_db.update_notification_list_db(session, "progress", 2, notification_id)
            # Semaphore with multiplicity 1, handed to the worker thread.
            worker_sem = threading.Semaphore(1)

            if not vm_list_id:
                return

            if retry_mode == True:
                # No thread is started here; the original comment says the
                # work is delegated to handle_pending_instances.
                self.rc_util.syslogout_ex("RecoveryControllerStarter_0027", syslog.LOG_INFO)
                skip_msg = (
                    "RETRY MODE. Skip recovery_instance thread"
                    + " vm_uuide=" + notification_uuid
                    + " notification_id=" + notification_id
                )
                self.rc_util.syslogout(skip_msg, syslog.LOG_INFO)
                return

            self.rc_util.syslogout_ex("RecoveryControllerStarter_0029", syslog.LOG_INFO)
            run_msg = (
                "Run thread rc_worker.recovery_instance."
                + " notification_uuid=" + notification_uuid
                + " primary_id=" + str(vm_list_id)
            )
            self.rc_util.syslogout(run_msg, syslog.LOG_INFO)

            worker = threading.Thread(
                target=self.rc_worker.recovery_instance,
                args=(notification_uuid, vm_list_id, worker_sem),
            )
            worker.start()

        except KeyError:
            report_failure("RecoveryControllerStarter_0012")
        except:
            report_failure("RecoveryControllerStarter_0013")
Beispiel #21
0
    def _create_notification_list_db(self, jsonData):
        """
        Classify a notification and, when it calls for recovery, insert it
        into notification_list.

        The notification is classified by its "type", "eventID",
        "eventType" and "detail" values. Node recovery (recover_by 0) and
        node lock (recover_by 2) notifications are inserted only when
        _check_repeated_notify reports no repeat; VM recovery
        (recover_by 1) notifications are always inserted. Everything else
        is only logged.

        :param jsonData: The notification; read through get() for the keys
         "id", "type", "eventID", "eventType", "detail", "time" and
         "hostname"
        :returns: The value returned by
         rc_util_db.insert_notification_list_db when a record was
         inserted, otherwise an empty dict
        :raises Exception: Any error is logged and then re-raised
        """
        inserted = {}

        try:
            # The DB session is obtained here and handed to the checks.
            session = dbapi.get_session(dbapi.get_engine(self.rc_config))

            if self._check_retry_notification(jsonData, session):
                LOG.info("Duplicate notifications. id:" + jsonData.get("id"))
                LOG.info(jsonData)
                return inserted

            kind = jsonData.get("type")
            event = (str(jsonData.get("eventID")),
                     str(jsonData.get("eventType")),
                     str(jsonData.get("detail")))

            # recover_by: 0 = node recovery (processing A),
            #             1 = VM recovery (processing G),
            #             2 = node lock (processing D and F),
            #             None = nothing to insert.
            if kind == "rscGroup" and event == ("1", "2", "2"):
                recover_by = 0
            elif kind == "VM" and event == ("0", "5", "5"):
                recover_by = 1
            elif kind == "nodeStatus" or (
                    kind == "rscGroup"
                    and event in (("1", "2", "3"), ("1", "2", "4"))):
                recover_by = 2
            else:
                recover_by = None

            if recover_by == 1:
                inserted = self.rc_util_db.insert_notification_list_db(
                    jsonData, recover_by, session)
                LOG.info(jsonData)

            elif recover_by is not None:
                # Node recovery / node lock: skip repeated notifications.
                notified_at = datetime.datetime.strptime(
                    jsonData.get("time"), '%Y%m%d%H%M%S')
                if self._check_repeated_notify(
                        notified_at, jsonData.get("hostname"), session):
                    LOG.info(
                        "Duplicate notifications. id:" + jsonData.get("id"))
                    LOG.info(jsonData)
                else:
                    inserted = self.rc_util_db.insert_notification_list_db(
                        jsonData, recover_by, session)
                    LOG.info(jsonData)

            elif kind == "VM" and event == ("0", "5", "1"):
                # Do not recover (stop API was executed).
                LOG.info(jsonData)
                LOG.info("Do not recover instance.(Excuted Stop API)")

            elif kind == "rscGroup" and event == ("1", "1", "1"):
                # Notification of a starting node.
                LOG.info(jsonData)
                LOG.info("Recieved notification of node starting. Node:"
                         + jsonData['hostname'])

            else:
                # Anything else is ignored.
                LOG.info(jsonData)
                LOG.info("Ignore notification. Notification:" + str(jsonData))

        except Exception:
            exc_class, exc_obj, exc_tb = sys.exc_info()
            frames = traceback.format_tb(exc_tb)
            LOG.error(exc_class)
            LOG.error(exc_obj)
            for frame in frames:
                LOG.error(frame)
            raise

        return inserted
Beispiel #22
0
    def add_failed_host(self, notification_id, notification_hostname, notification_cluster_port, retry_mode):
        """
        Node recover start thread :
            This thread starts the VM recover execution thread,
            only the number of existing vm in the recovery target node.

        Every exception is reported to syslog and swallowed, so the method
        always returns None.

        :param notification_id: The notification ID included in the
         notification
        :param notification_hostname: The host name of the failure node that
         is included in the notification
        :param notification_cluster_port: Passed, together with the host
         name, to the reserve_list lookup used when the notification's
         current recover_to host is not found in the not-deleted reserve
         list
        :param retry_mode: When False, a reserve node is looked up and a
         recovery_instance thread is started per VM. When True, the
         reserve lookup is skipped and no thread is started for VMs whose
         vm_list record was created
        """

        try:
            db_engine = dbapi.get_engine()
            session = dbapi.get_session(db_engine)
            conf_dict = self.rc_config.get_value("recover_starter")
            recovery_max_retry_cnt = conf_dict.get("recovery_max_retry_cnt")
            recovery_retry_interval = conf_dict.get("recovery_retry_interval")

            vm_list = self.rc_util_api.fetch_servers_on_hypervisor(notification_hostname)

            # Count vm_list
            if len(vm_list) == 0:
                self.rc_util.syslogout_ex("RecoveryControllerStarter_0014", syslog.LOG_INFO)
                msg = "There is no instance in " + notification_hostname + "."
                self.rc_util.syslogout(msg, syslog.LOG_INFO)

                # update record in notification_list
                self.rc_util_db.update_notification_list_db(session, "progress", 2, notification_id)

                return
            else:
                result = dbapi.get_all_notification_list_by_id_for_update(session, notification_id)
                recover_to = result.pop().recover_to

                if retry_mode is False:
                    cnt = dbapi.get_all_reserve_list_by_hostname_not_deleted(session, recover_to)

                    if not cnt:
                        # recover_to is not an available reserve node; try to
                        # pick one by cluster port instead.
                        cnt = dbapi.get_one_reserve_list_by_cluster_port_for_update(
                            session, notification_cluster_port, notification_hostname
                        )

                        if not cnt:
                            self.rc_util.syslogout_ex("RecoveryControllerStarter_0022", syslog.LOG_WARNING)
                            msg = "The reserve node not exist in " "reserve_list DB, " "so do not recover instances."
                            self.rc_util.syslogout(msg, syslog.LOG_WARNING)
                            # The session must be the first argument, as in
                            # the other update_notification_list_db calls of
                            # this method; without it the call raised and
                            # progress was never set to 3.
                            self.rc_util_db.update_notification_list_db(session, "progress", 3, notification_id)
                            return

                        result = cnt.pop()
                        recover_to = result.hostname
                        update_at = datetime.datetime.now()
                        dbapi.update_notification_list_by_notification_id_recover_to(
                            session, notification_id, update_at, recover_to
                        )

                        self.rc_util.syslogout_ex("RecoveryControllerStarter_0024", syslog.LOG_INFO)
                self.rc_util.syslogout_ex("RecoveryControllerStarter_0015", syslog.LOG_INFO)

                delete_at = datetime.datetime.now()
                dbapi.update_reserve_list_by_hostname_as_deleted(session, recover_to, delete_at)
            # create semaphore (Multiplicity is get from config.)
            conf_dict = self.rc_config.get_value("recover_starter")
            sem_recovery_instance = threading.Semaphore(int(conf_dict.get("semaphore_multiplicity")))

            # Try up to recovery_max_retry_cnt times to create a vm_list
            # record for every VM; VMs without a record are retried after
            # sleeping recovery_retry_interval seconds.
            incomplete_list = []
            for _attempt in range(0, int(recovery_max_retry_cnt)):
                incomplete_list = []

                for vm_uuid in vm_list:
                    primary_id = self._create_vm_list_db_for_failed_host(session, notification_id, vm_uuid)

                    if primary_id:
                        if retry_mode == True:
                            # Skip recovery_instance thread. Will delegate to
                            # ...
                            msg = (
                                "RETRY MODE. Skip recovery_instance thread"
                                + " vm_uuide="
                                + vm_uuid
                                + " notification_id="
                                + notification_id
                            )
                            self.rc_util.syslogout(msg, syslog.LOG_INFO)
                        else:
                            msg = (
                                "Run thread rc_worker.recovery_instance."
                                + " vm_uuid="
                                + vm_uuid
                                + " primary_id="
                                + str(primary_id)
                            )
                            self.rc_util.syslogout(msg, syslog.LOG_INFO)

                            threading.Thread(
                                target=self.rc_worker.recovery_instance,
                                args=(vm_uuid, primary_id, sem_recovery_instance),
                            ).start()
                    else:
                        if retry_mode == True:
                            continue
                        else:
                            incomplete_list.append(vm_uuid)

                if incomplete_list:
                    vm_list = incomplete_list
                    greenthread.sleep(int(recovery_retry_interval))
                else:
                    break

            # Retries exhausted: insert the vm_list record directly through
            # insert_vm_list_db and start the recovery thread for each VM
            # that is still incomplete.
            for vm_uuid in incomplete_list:
                primary_id = self.rc_util_db.insert_vm_list_db(session, notification_id, vm_uuid, 0)

                self.rc_util.syslogout_ex("RecoveryControllerStarter_0031", syslog.LOG_INFO)
                msg = (
                    "Run thread rc_worker.recovery_instance." + " vm_uuid=" + vm_uuid + " primary_id=" + str(primary_id)
                )
                self.rc_util.syslogout(msg, syslog.LOG_INFO)
                threading.Thread(
                    target=self.rc_worker.recovery_instance, args=(vm_uuid, primary_id, sem_recovery_instance)
                ).start()

            # update record in notification_list
            self.rc_util_db.update_notification_list_db(session, "progress", 2, notification_id)

            return

        except KeyError:
            self.rc_util.syslogout_ex("RecoveryControllerStarter_0017", syslog.LOG_ERR)
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in tb_list:
                self.rc_util.syslogout(tb, syslog.LOG_ERR)
            return
        except:
            # Catch-all kept from the original: failures are reported to
            # syslog instead of propagating to the caller.
            self.rc_util.syslogout_ex("RecoveryControllerStarter_0018", syslog.LOG_ERR)
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in tb_list:
                self.rc_util.syslogout(tb, syslog.LOG_ERR)
            return
Beispiel #23
0
    def add_failed_host(self, notification_id, notification_hostname,
                        notification_cluster_port, retry_mode):
        """
        Node recover start thread :
            This thread starts the VM recover execution thread,
            only the number of existing vm in the recovery target node.

        Every exception is logged and swallowed, so the method always
        returns None.

        :param notification_id: The notification ID included in the
         notification
        :param notification_hostname: The host name of the failure node that
         is included in the notification
        :param notification_cluster_port: Passed, together with the host
         name, to the reserve_list lookup used when the notification's
         current recover_to host is not found in the not-deleted reserve
         list
        :param retry_mode: When False, a reserve node is looked up and a
         recovery_instance thread is started per VM. When True, the
         reserve lookup is skipped and no thread is started for VMs whose
         vm_list record was created
        """

        try:
            self.rc_config.set_request_context()
            db_engine = dbapi.get_engine(self.rc_config)
            session = dbapi.get_session(db_engine)
            conf_dict = self.rc_config.get_value('recover_starter')
            recovery_max_retry_cnt = conf_dict.get('recovery_max_retry_cnt')
            recovery_retry_interval = conf_dict.get('recovery_retry_interval')

            vm_list = self.rc_util_api.fetch_servers_on_hypervisor(
                notification_hostname)

            # Count vm_list
            if len(vm_list) == 0:
                msg = "There is no instance in " + notification_hostname + "."
                LOG.info(msg)

                # update record in notification_list
                self.rc_util_db.update_notification_list_db(
                    session, 'progress', 2, notification_id)

                return
            else:
                msg = "Do get_all_notification_list_by_id_for_update."
                LOG.info(msg)
                result = dbapi.get_all_notification_list_by_id_for_update(
                    session, notification_id)
                msg = "Succeeded in " \
                    + "get_all_notification_list_by_id_for_update. " \
                    + "Return_value = " + str(result)
                LOG.info(msg)
                recover_to = result.pop().recover_to

                if retry_mode is False:
                    msg = "Do get_all_reserve_list_by_hostname_not_deleted."
                    LOG.info(msg)
                    cnt = dbapi.get_all_reserve_list_by_hostname_not_deleted(
                        session, recover_to)
                    msg = "Succeeded in " \
                        + "get_all_reserve_list_by_hostname_not_deleted. " \
                        + "Return_value = " + str(cnt)
                    LOG.info(msg)

                    if not cnt:
                        # recover_to is not an available reserve node; try to
                        # pick one by cluster port instead.
                        msg = "Do " \
                            + "get_one_reserve_list_by_cluster_port_for_update."
                        LOG.info(msg)
                        cnt = dbapi.\
                            get_one_reserve_list_by_cluster_port_for_update(
                                session,
                                notification_cluster_port,
                                notification_hostname
                            )
                        msg = "Succeeded in " \
                            + "get_one_reserve_list_by_cluster_port_for_update. " \
                            + "Return_value = " + str(cnt)
                        LOG.info(msg)

                        if not cnt:
                            msg = "The reserve node not exist in " \
                                  "reserve_list DB, " \
                                  "so do not recover instances."
                            LOG.warning(msg)
                            # The session must be the first argument, as in
                            # the other update_notification_list_db calls of
                            # this method; without it the call raised and
                            # progress was never set to 3.
                            self.rc_util_db.update_notification_list_db(
                                session, 'progress', 3, notification_id)

                            return

                        result = cnt.pop()
                        recover_to = result.hostname
                        update_at = datetime.datetime.now()
                        msg = "Do " \
                            + "update_notification_list_by_notification_id_recover_to."
                        LOG.info(msg)
                        dbapi.update_notification_list_by_notification_id_recover_to(
                            session, notification_id, update_at, recover_to)
                        msg = "Succeeded in " \
                            + "update_notification_list_by_notification_id_recover_to."
                        LOG.info(msg)

                delete_at = datetime.datetime.now()

                msg = "Do update_reserve_list_by_hostname_as_deleted."
                LOG.info(msg)
                dbapi.update_reserve_list_by_hostname_as_deleted(
                    session, recover_to, delete_at)
                msg = "Succeeded in " \
                    + "update_reserve_list_by_hostname_as_deleted."
                LOG.info(msg)
            # create semaphore (Multiplicity is get from config.)
            conf_dict = self.rc_config.get_value('recover_starter')
            sem_recovery_instance = threading.Semaphore(
                int(conf_dict.get('semaphore_multiplicity')))

            # Try up to recovery_max_retry_cnt times to create a vm_list
            # record for every VM; VMs without a record are retried after
            # sleeping recovery_retry_interval seconds.
            incomplete_list = []
            for _attempt in range(0, int(recovery_max_retry_cnt)):
                incomplete_list = []

                for vm_uuid in vm_list:
                    primary_id = self._create_vm_list_db_for_failed_host(
                        session, notification_id, vm_uuid)

                    if primary_id:
                        if retry_mode is True:
                            # Skip recovery_instance thread. Will delegate to
                            # ...
                            msg = "RETRY MODE. Skip recovery_instance thread" \
                                + " vm_uuide=" + vm_uuid \
                                + " notification_id=" + notification_id
                            LOG.info(msg)
                        else:
                            msg = "Run thread rc_worker.recovery_instance." \
                                + " vm_uuid=" + vm_uuid \
                                + " primary_id=" + str(primary_id)
                            LOG.info(msg)

                            thread_name = self.rc_util.make_thread_name(
                                VM_LIST, primary_id)
                            threading.Thread(
                                target=self.rc_worker.recovery_instance,
                                name=thread_name,
                                args=(vm_uuid, primary_id,
                                      sem_recovery_instance)).start()
                    else:
                        if retry_mode is True:
                            continue
                        else:
                            incomplete_list.append(vm_uuid)

                if incomplete_list:
                    vm_list = incomplete_list
                    greenthread.sleep(int(recovery_retry_interval))
                else:
                    break

            # Retries exhausted: insert the vm_list record directly through
            # insert_vm_list_db and start the recovery thread for each VM
            # that is still incomplete.
            for vm_uuid in incomplete_list:
                primary_id = self.rc_util_db.insert_vm_list_db(
                    session, notification_id, vm_uuid, 0)

                msg = "Run thread rc_worker.recovery_instance." \
                    + " vm_uuid=" + vm_uuid \
                    + " primary_id=" + str(primary_id)
                LOG.info(msg)
                thread_name = self.rc_util.make_thread_name(
                    VM_LIST, primary_id)
                threading.Thread(target=self.rc_worker.recovery_instance,
                                 name=thread_name,
                                 args=(vm_uuid, primary_id,
                                       sem_recovery_instance)).start()

            # update record in notification_list
            self.rc_util_db.update_notification_list_db(
                session, 'progress', 2, notification_id)

            return

        except KeyError:
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            LOG.error(error_type)
            LOG.error(error_value)
            for tb in tb_list:
                LOG.error(tb)
            return
        except:
            # Catch-all kept from the original: failures are logged instead
            # of propagating to the caller.
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            LOG.error(error_type)
            LOG.error(error_value)
            for tb in tb_list:
                LOG.error(tb)
            return
    def masakari(self):
        """
        RecoveryController class main processing:
        This processing checks the VM list table of DB.
        If an unprocessed VM exists, and start thread to execute the recovery
        process.
        Then, the processing starts the wsgi server and waits for the
        notification.
        """

        try:
            self.rc_util.syslogout_ex(
                "RecoveryController_0004", syslog.LOG_INFO)
            self.rc_util.syslogout(
                "masakari START.", syslog.LOG_INFO)

            # Get a session and do not pass it to other threads
            db_engine = dbapi.get_engine()
            session = dbapi.get_session(db_engine)

            self._update_old_records_notification_list(session)
            result = self._find_reprocessing_records_notification_list(session)
            preprocessing_count = len(result)

            if preprocessing_count > 0:
                for row in result:
                    if row.recover_by == 0:
                        # node recovery event
                        th = threading.Thread(
                            target=self.rc_worker.host_maintenance_mode,
                            args=(row.notification_id,
                                  row.notification_hostname,
                                  False,))
                        th.start()

                        # PF9 begin
                        """
                        # Sleep until updating nova-compute service status
                        # down.
                        self.rc_util.syslogout_ex(
                            "RecoveryController_0035", syslog.LOG_INFO)
                        dic = self.rc_config.get_value('recover_starter')
                        node_err_wait = dic.get("node_err_wait")
                        msg = ("Sleeping %s sec before starting node recovery"
                               "thread, until updateing nova-compute"
                               "service status." % (node_err_wait))
                        self.rc_util.syslogout(msg, syslog.LOG_INFO)
                        greenthread.sleep(int(node_err_wait))
                        """
                        # Mark nova compute service as 'down' to immediately start evacuation
                        self.rc_worker.mark_host_down_pf9(row.notification_hostname)
                        # PF9 end

                        # Start add_failed_host thread
                        # TODO(sampath):
                        # Avoid create thread here,
                        # insted call rc_starter.add_failed_host
                        retry_mode = True
                        th = threading.Thread(
                            target=self.rc_starter.add_failed_host,
                            args=(row.notification_id,
                                  row.notification_hostname,
                                  row.notification_cluster_port,
                                  retry_mode, ))
                        th.start()

                    elif row.recover_by == 1:
                        # instance recovery event
                        # TODO(sampath):
                        # Avoid create thread here,
                        # insted call rc_starter.add_failed_instance
                        th = threading.Thread(
                            target=self.rc_starter.add_failed_instance,
                            args=(row.notification_id,
                                  row.notification_uuid, ))
                        th.start()

                    else:
                        # maintenance mode event
                        th = threading.Thread(
                            target=self.rc_worker.host_maintenance_mode,
                            args=(row.notification_id,
                                  row.notification_hostname,
                                  True, ))
                        th.start()

            # Start handle_pending_instances thread
            # TODO(sampath):
            # Avoid create thread here,
            # insted call rc_starter.handle_pending_instances()
            th = threading.Thread(
                target=self.rc_starter.handle_pending_instances)
            th.start()

            # PF9 move this to __call__ for wsgi-fy app
            # Start reciever process for notification
            """
            conf_wsgi_dic = self.rc_config.get_value('wsgi')
            wsgi.server(
                eventlet.listen(('', int(conf_wsgi_dic['server_port']))),
                self._notification_reciever)
            """
            # PF9 end
        except exc.SQLAlchemyError:
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            self.rc_util.syslogout_ex(
                "RecoveryController_0005", syslog.LOG_ERR)
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in tb_list:
                self.rc_util.syslogout(tb, syslog.LOG_ERR)
            sys.exit()
        except KeyError:
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            self.rc_util.syslogout_ex(
                "RecoveryController_0006", syslog.LOG_ERR)
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in tb_list:
                self.rc_util.syslogout(tb, syslog.LOG_ERR)
            sys.exit()
        except:
            error_type, error_value, traceback_ = sys.exc_info()
            tb_list = traceback.format_tb(traceback_)
            self.rc_util.syslogout_ex(
                "RecoveryController_0007", syslog.LOG_ERR)
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in tb_list:
                self.rc_util.syslogout(tb, syslog.LOG_ERR)
            sys.exit()
Beispiel #25
0
    def recovery_instance(self, uuid, primary_id, sem):
        """
        Execute VM recovery.

        Acquires ``sem``, sets the ``progress`` column of the vm_list record
        to 1, collects the VM state, its ``HA-Enabled`` metadata and the
        recovery parameters, and runs ``_execute_recovery``.  Afterwards the
        ``progress`` column is set to 2 when the resulting status equals
        ``STATUS_NORMAL`` and to 3 otherwise.

        Exceptions raised by these steps are written to syslog instead of
        being propagated.  The semaphore is released whenever it was
        acquired, even if opening the DB session or the final DB update
        fails.

        :param uuid: Recovery target VM UUID
        :param primary_id: Unique ID of the vm_list table
        :param sem: Semaphore
        :returns: None
        """
        def log_failure(message_id):
            # Write the message ID, then the type, value and traceback of
            # the exception currently being handled.
            error_type, error_value, traceback_ = sys.exc_info()
            self.rc_util.syslogout_ex(message_id, syslog.LOG_ERR)
            self.rc_util.syslogout(error_type, syslog.LOG_ERR)
            self.rc_util.syslogout(error_value, syslog.LOG_ERR)
            for tb in traceback.format_tb(traceback_):
                self.rc_util.syslogout(tb, syslog.LOG_ERR)

        # Both are initialised up front so the clean-up code can tell how
        # far the set-up got before a failure.
        session = None
        acquired = False
        try:
            sem.acquire()
            acquired = True
            db_engine = dbapi.get_engine()
            session = dbapi.get_session(db_engine)

            # Initialize status.
            status = self.STATUS_NORMAL

            # Update vmha recovery status.
            self.rc_util_db.update_vm_list_db(session, 'progress', 1,
                                              primary_id)

            # Get vm information.
            vm_info = self._get_vm_param(uuid)
            # Anything other than an explicit (case-insensitive) 'OFF',
            # including a missing key, counts as 'ON'.
            ha_enabled = vm_info.metadata.get('HA-Enabled')
            if ha_enabled:
                ha_enabled = ha_enabled.upper()
            if ha_enabled != 'OFF':
                ha_enabled = 'ON'

            # Set recovery parameter.
            vm_state = getattr(vm_info, 'OS-EXT-STS:vm_state')
            recover_by, recover_to = self._get_vmha_param(
                session, uuid, primary_id)

            # Execute.
            status = self._execute_recovery(session, uuid, vm_state,
                                            ha_enabled, recover_by,
                                            recover_to)

        except EnvironmentError:
            status = self.STATUS_ERROR
            log_failure("RecoveryControllerWorker_0034")
        except KeyError:
            status = self.STATUS_ERROR
            log_failure("RecoveryControllerWorker_0035")
        except:
            # Catch-all kept on purpose: every failure is logged and
            # recorded as an abnormal termination rather than raised.
            status = self.STATUS_ERROR
            log_failure("RecoveryControllerWorker_0037")
        finally:
            try:
                # Without a session there is nothing to write to; the
                # failure that prevented it has already been logged.
                if session is not None:
                    # 2: successful execution, 3: abnormal termination.
                    progress = 2 if status == self.STATUS_NORMAL else 3
                    self.rc_util_db.update_vm_list_db(session, 'progress',
                                                      progress, primary_id)
            except:
                log_failure("RecoveryControllerWorker_0039")
            finally:
                # Release semaphore, regardless of how the DB update went.
                if acquired:
                    sem.release()
Beispiel #26
0
    def masakari(self):
        """
        RecoveryController class main processing.

        Refreshes old notification list records, then looks up the records
        that need reprocessing.  For each of them a thread is started
        according to ``recover_by``:

        * 0 (node recovery event): ``rc_worker.host_maintenance_mode`` is
          started, the method sleeps for the configured ``node_err_wait``
          seconds, then ``rc_starter.add_failed_host`` is started in retry
          mode.
        * 1 (instance recovery event): ``rc_starter.add_failed_instance``.
        * anything else (maintenance mode event):
          ``rc_worker.host_maintenance_mode`` with progress update enabled.

        Afterwards a thread running ``rc_starter.handle_pending_instances``
        is started and the wsgi server is started on the configured
        ``server_port`` to wait for notifications.

        Any exception is logged at critical level together with its
        traceback, after which ``sys.exit()`` is called.
        """
        def start_notification_thread(target, notification_id, args):
            # The thread name is derived from the notification ID through
            # rc_util.make_thread_name.
            thread_name = self.rc_util.make_thread_name(
                NOTIFICATION_LIST, notification_id)
            th = threading.Thread(target=target, name=thread_name, args=args)
            th.start()

        try:
            LOG.info("masakari START.")

            # Get a session and do not pass it to other threads
            db_engine = dbapi.get_engine(self.rc_config)
            session = dbapi.get_session(db_engine)

            self._update_old_records_notification_list(session)
            result = self._find_reprocessing_records_notification_list(session)

            for row in result:
                if row.recover_by == 0:
                    # node recovery event
                    # %-formatting is used for all messages so that a
                    # non-string column value cannot abort start-up.
                    LOG.info(
                        "Run thread rc_worker.host_maintenance_mode."
                        " notification_id=%s notification_hostname=%s"
                        " update_progress=False"
                        % (row.notification_id, row.notification_hostname))
                    start_notification_thread(
                        self.rc_worker.host_maintenance_mode,
                        row.notification_id,
                        (row.notification_id, row.notification_hostname,
                         False))

                    # Sleep until updating nova-compute service status
                    # down.
                    dic = self.rc_config.get_value("recover_starter")
                    node_err_wait = dic.get("node_err_wait")
                    LOG.info(
                        "Sleeping %s sec before starting node recovery"
                        " thread, until updating nova-compute"
                        " service status." % (node_err_wait,))
                    greenthread.sleep(int(node_err_wait))

                    # Start add_failed_host thread
                    # TODO(sampath):
                    # Avoid creating a thread here,
                    # instead call rc_starter.add_failed_host
                    retry_mode = True
                    LOG.info(
                        "Run thread rc_starter.add_failed_host."
                        " notification_id=%s notification_hostname=%s"
                        " notification_cluster_port=%s retry_mode=%s"
                        % (row.notification_id, row.notification_hostname,
                           row.notification_cluster_port, retry_mode))
                    start_notification_thread(
                        self.rc_starter.add_failed_host,
                        row.notification_id,
                        (row.notification_id, row.notification_hostname,
                         row.notification_cluster_port, retry_mode))

                elif row.recover_by == 1:
                    # instance recovery event
                    # TODO(sampath):
                    # Avoid creating a thread here,
                    # instead call rc_starter.add_failed_instance
                    LOG.info(
                        "Run thread rc_starter.add_failed_instance."
                        " notification_id=%s notification_uuid=%s"
                        % (row.notification_id, row.notification_uuid))
                    start_notification_thread(
                        self.rc_starter.add_failed_instance,
                        row.notification_id,
                        (row.notification_id, row.notification_uuid))

                else:
                    # maintenance mode event
                    LOG.info(
                        "Run thread rc_worker.host_maintenance_mode."
                        " notification_id=%s notification_hostname=%s"
                        " update_progress=True"
                        % (row.notification_id, row.notification_hostname))
                    start_notification_thread(
                        self.rc_worker.host_maintenance_mode,
                        row.notification_id,
                        (row.notification_id, row.notification_hostname,
                         True))

            # Start handle_pending_instances thread
            # TODO(sampath):
            # Avoid creating a thread here,
            # instead call rc_starter.handle_pending_instances()
            LOG.info("Run thread rc_starter.handle_pending_instances.")
            th = threading.Thread(
                target=self.rc_starter.handle_pending_instances,
                name="Thread:handle_pending_instances")
            th.start()

            # Start receiver process for notification
            conf_wsgi_dic = self.rc_config.get_value("wsgi")
            wsgi.server(
                eventlet.listen(("", int(conf_wsgi_dic["server_port"]))),
                self._notification_reciever)

        except:
            # Catch-all kept on purpose: database errors, missing
            # configuration keys and every other failure are handled the
            # same way -- log the details at critical level and exit.
            error_type, error_value, traceback_ = sys.exc_info()
            LOG.critical(error_type)
            LOG.critical(error_value)
            for tb in traceback.format_tb(traceback_):
                LOG.critical(tb)

            sys.exit()