def _receive_blocking(self, timeout_in_secs=5):
    """
    Receive a message from the TCP connection (blocking)

    Waits until a complete message is available, the socket is torn down
    (self._socket becomes None) or timeout_in_secs seconds have elapsed
    since this call started.

    Returns the received message, or None if no message arrived in time.
    """
    start_ms = timers.get_monotonic_timestamp_in_ms()

    # Time left to wait; always derived from the original timeout and the
    # total elapsed time so that repeated wake-ups never shorten the overall
    # wait (the elapsed time must not be subtracted more than once).
    remaining_secs = timeout_in_secs

    while self._socket is not None:
        sock_fd = self._socket.fileno()

        try:
            readable, _writeable, _in_error = select.select(
                [sock_fd], [], [], remaining_secs)

            if sock_fd in readable:
                msg = self._receive_non_blocking()
                if msg is not None:
                    return msg

        except (OSError, socket.error, select.error):
            # Best effort: an interrupted or failed wait is retried until
            # the overall timeout expires.
            pass

        now_ms = timers.get_monotonic_timestamp_in_ms()
        secs_expired = (now_ms - start_ms) / 1000
        if timeout_in_secs <= secs_expired:
            DLOG.info("Timed out waiting for a message.")
            break

        remaining_secs = timeout_in_secs - secs_expired

    return None
def auto_commit(self):
    """
    Timer coroutine that commits the session when the auto-commit timer
    fires; a timer identifier other than the pending one is ignored
    """
    fired_timer_id = (yield)
    if fired_timer_id != self._commit_timer_id:
        return

    began_ms = timers.get_monotonic_timestamp_in_ms()
    self._session.commit()
    took_ms = timers.get_monotonic_timestamp_in_ms() - began_ms

    # Commit duration is recorded in units of 100 milliseconds.
    histogram.add_histogram_data("database-commits (periodic)",
                                 took_ms / 100, "decisecond")

    # Allow the next commit() call to arm a new timer.
    self._commit_timer_id = None
def commit(self):
    """
    Commit the session now when inline commits are enabled, otherwise make
    sure a deferred auto-commit timer is pending
    """
    if not self._commit_inline:
        # Deferred mode: arm the timer only if one is not already pending.
        if self._commit_timer_id is None:
            self._commit_timer_id = timers.timers_create_timer(
                'db-auto-commit', 1, 1, self.auto_commit)
        return

    began_ms = timers.get_monotonic_timestamp_in_ms()
    self._session.commit()
    took_ms = timers.get_monotonic_timestamp_in_ms() - began_ms

    # Commit duration is recorded in units of 100 milliseconds.
    histogram.add_histogram_data("database-commits (inline)",
                                 took_ms / 100, "decisecond")
def handle_event(self, host, event, event_data=None):
    """
    Handle event while in the deleting state

    Returns the name of the state the host should be in after the event.
    """
    def _ensure_delete_task_running():
        # Start a fresh delete task when none is running, or restart the
        # running one if it has failed or timed out.
        if not host.task.inprogress():
            host.task = DeleteHostTask(host)
            host.task.start()
        elif host.task.is_failed() or host.task.timed_out():
            host.task.start()

    if HOST_EVENT.DELETE == event:
        _ensure_delete_task_running()

    elif HOST_EVENT.TASK_COMPLETED == event:
        return HOST_STATE.DELETED

    elif HOST_EVENT.TASK_FAILED == event:
        DLOG.info("Delete failed for %s." % host.name)
        return HOST_STATE.DELETING_FAILED

    elif HOST_EVENT.AUDIT == event:
        # Maximum time to stay in this state; configurable, 60s by default.
        max_wait = 60
        if config.section_exists('host-configuration'):
            section = config.CONF['host-configuration']
            max_wait = int(section.get('max_host_deleting_wait_in_secs', 60))

        if not host.fsm_start_time:
            host.fsm_start_time = timers.get_monotonic_timestamp_in_ms()

        now_ms = timers.get_monotonic_timestamp_in_ms()
        secs_expired = (now_ms - host.fsm_start_time) / 1000

        if max_wait > secs_expired:
            _ensure_delete_task_running()
        else:
            DLOG.info("Timed out waiting for delete completion of %s."
                      % host.name)
            return HOST_STATE.CONFIGURE

    elif HOST_EVENT.TASK_TIMEOUT == event:
        DLOG.info("Delete timed out for %s." % host.name)

    else:
        DLOG.verbose("Ignoring %s event for %s." % (event, host.name))

    return self.name
def _task_coroutine_with_timer(future, arg1, callback):
    """
    Test coroutine: arm a 2 second timer on the future and report
    "FUNCTION PASSED" to the callback only if that timer is what resumed the
    coroutine and more than 2 seconds elapsed; otherwise report None
    """
    assert (arg1 == 'arg1')

    timer_id = future.timer('timer-test', 2)

    start_ms = timers.get_monotonic_timestamp_in_ms()
    future.result = (yield)
    end_ms = timers.get_monotonic_timestamp_in_ms()

    timer_fired = (future.result.is_complete() and
                   future.result.is_timer and
                   future.result.data == timer_id)

    if timer_fired and 2 < (end_ms - start_ms) / 1000:
        callback.send("FUNCTION PASSED")
        return

    callback.send(None)
def _state_change_callback(self, prev_state, state, event):
    """
    Host state change callback

    Resets the time-in-state bookkeeping, clears a lock/unlock action once
    the NFVI host reports the matching administrative state, persists the
    host and notifies the host director.
    """
    from nfv_vim import directors

    DLOG.info("Host %s FSM State-Change: prev_state=%s, state=%s, event=%s."
              % (self.name, prev_state, state, event))

    self._elapsed_time_in_state = 0
    self._last_state_timestamp = timers.get_monotonic_timestamp_in_ms()

    # A lock is finished once the host is disabled and reported locked.
    if self.is_locking() and host_fsm.HOST_STATE.DISABLED == self.state \
            and nfvi.objects.v1.HOST_ADMIN_STATE.LOCKED \
            == self.nfvi_host.admin_state:
        self._action = self._ACTION_NONE

    # An unlock is finished once the host is reported unlocked.
    if self.is_unlocking() \
            and nfvi.objects.v1.HOST_ADMIN_STATE.UNLOCKED \
            == self.nfvi_host.admin_state:
        self._action = self._ACTION_NONE

    self._persist()

    directors.get_host_director().host_state_change_notify(self)
def stall_elapsed_secs(self):
    """
    Returns the elapsed time in seconds that the thread has been stalled,
    or 0 when no stall is currently recorded
    """
    stalled_since_ms = self._stall_timestamp_ms
    if stalled_since_ms is None:
        return 0

    now_ms = timers.get_monotonic_timestamp_in_ms()
    return int((now_ms - stalled_since_ms) / 1000)
def enter(self, instance):
    """
    Entering cold migrate state

    Records the start time and source host on the action state machine and
    kicks off the cold-migrate task.
    """
    DLOG.info("Entering state (%s) for %s." % (self.name, instance.name))

    action_fsm = instance.action_fsm
    action_fsm.start_time = timers.get_monotonic_timestamp_in_ms()
    action_fsm.wait_time = 0
    action_fsm.from_host_name = instance.host_name

    instance.task = ColdMigrateTask(instance)
    instance.task.start()
def _audit_dump_debug_info(do_dump=True):
    """
    Dump Audit Debug Information

    With do_dump set, the histogram data is displayed at most once every
    30 seconds plus the current back-off, and each dump grows the back-off
    by 20 seconds (capped at 600 seconds).  With do_dump cleared, the
    back-off shrinks by 20 seconds (never below zero).
    """
    global _audit_debug_dump_back_off_ms, _last_audit_debug_dump_ms

    elapsed_ms = \
        timers.get_monotonic_timestamp_in_ms() - _last_audit_debug_dump_ms

    if not do_dump:
        _audit_debug_dump_back_off_ms = max(
            0, _audit_debug_dump_back_off_ms - 20000)
        return

    if 30000 + _audit_debug_dump_back_off_ms <= elapsed_ms:
        histogram.display_histogram_data(pretty_format=False)
        _last_audit_debug_dump_ms = timers.get_monotonic_timestamp_in_ms()
        _audit_debug_dump_back_off_ms = min(
            600000, _audit_debug_dump_back_off_ms + 20000)
def enter(self, host):
    """
    Entering deleting state

    Stamps the state entry time, clears any previous failure reason and
    starts a new delete task for the host.
    """
    DLOG.info("Entering state (%s) for %s." % (self.name, host.name))

    host.fsm_start_time = timers.get_monotonic_timestamp_in_ms()
    host.clear_reason()

    delete_task = DeleteHostTask(host)
    host.task = delete_task
    host.task.start()
def elapsed_time_in_state(self):
    """
    Returns the elapsed time this host has been in the current state

    The stored total is extended by the whole seconds since the last state
    timestamp, unless that timestamp is zero.
    """
    total_secs = self._elapsed_time_in_state
    if 0 == self._last_state_timestamp:
        return total_secs

    now_ms = timers.get_monotonic_timestamp_in_ms()
    return total_secs + int((now_ms - self._last_state_timestamp) / 1000)
def __init__(self, nfvi_host, initial_state=None, action=None,
             elapsed_time_in_state=0, upgrade_inprogress=False,
             recover_instances=True, host_services_locked=False):
    """
    Create a host object wrapping the given NFVI host

    initial_state defaults to the INITIAL host state and action to no
    action.  Each configured host service starts as enabled when the host
    is enabled and as disabled otherwise.
    """
    super(Host, self).__init__('1.0.0')

    if initial_state is None:
        initial_state = host_fsm.HOST_STATE.INITIAL

    if action is None:
        action = self._ACTION_NONE

    self._elapsed_time_in_state = int(elapsed_time_in_state)
    self._task = state_machine.StateTask('EmptyTask', list())
    self._action = action
    self._reason = ''
    self._upgrade_inprogress = upgrade_inprogress
    self._recover_instances = recover_instances
    self._host_services_locked = host_services_locked
    self._nfvi_host = nfvi_host
    self._fsm = host_fsm.HostStateMachine(self, initial_state)
    self._fsm.register_state_change_callback(self._state_change_callback)
    self._last_state_timestamp = timers.get_monotonic_timestamp_in_ms()
    self._fail_notification_required = False
    self._fsm_start_time = None

    # Seed the per-service state for every service configured on the host.
    self._host_service_state = dict()
    for service in (HOST_SERVICES.COMPUTE, HOST_SERVICES.NETWORK,
                    HOST_SERVICES.GUEST, HOST_SERVICES.CONTAINER):
        if self.host_service_configured(service):
            if self.is_enabled():
                self._host_service_state[service] = HOST_SERVICE_STATE.ENABLED
            else:
                self._host_service_state[service] = \
                    HOST_SERVICE_STATE.DISABLED

    self._alarms = list()
    self._events = list()
def _dor_timer():
    """
    DOR timer

    Coroutine resumed on each timer tick until DOR has completed.  DOR
    completes when the completion marker file exists or the local uptime
    exceeds the configured completion uptime (in which case the marker file
    is created).  Until then the stabilized flag is raised once the process
    has run long enough, and a system state query is issued when none is in
    progress and none has been gathered.
    """
    global _alarm_data
    global _dor_stabilized, _dor_completed
    global _system_state_get_inprogress

    while not _dor_completed:
        (yield)
        if _dor_completed:
            break

        marker_present = os.path.exists(NFV_VIM_DOR_COMPLETE_FILE)
        if marker_present or local_uptime_in_secs() > _dor_complete_uptime:
            if not marker_present:
                # Leave a marker so later checks see DOR as complete.
                open(NFV_VIM_DOR_COMPLETE_FILE, 'w').close()

            _dor_stabilized = True
            _dor_completed = True

            if _alarm_data is not None:
                alarm.clear_general_alarm(_alarm_data)
                event_log.issue_general_log(
                    event_log.EVENT_ID.MULTI_NODE_RECOVERY_MODE_EXIT)
                _alarm_data = None

            DLOG.info("DOR completed.")
            break

        now_ms = timers.get_monotonic_timestamp_in_ms()
        elapsed_secs = (now_ms - _process_start_timestamp_ms) / 1000

        if not _dor_stabilized and elapsed_secs > _dor_stabilize_uptime:
            _dor_stabilized = True
            DLOG.info("DOR stabilized.")

        if not (_system_state_get_inprogress or _system_state_gathered):
            nfvi.nfvi_get_system_state(_system_state_query_callback())
            _system_state_get_inprogress = True
def get_task_work_result(self):
    """
    Returns the result of task work completed

    Also records the worker execution time (when the result carries one)
    and the time elapsed since the work was created in the histograms.
    """
    work_result = self._worker.get_result()

    if hasattr(work_result.ancillary_result_data, 'execution_time'):
        histogram.add_histogram_data(
            work_result.name + ' [worker-execution-time]',
            work_result.ancillary_result_data.execution_time, 'secs')

    # Time from work creation until its result was collected.
    now_ms = timers.get_monotonic_timestamp_in_ms()
    total_secs = (now_ms - work_result.create_timestamp_ms) / 1000
    histogram.add_histogram_data(work_result.name + ' [execution-time]',
                                 total_secs, 'secs')

    return work_result
def __init__(self, timeout, target, *args, **kwargs):
    """
    Create task work

    timeout is the work timeout in seconds, target the callable to run and
    args / kwargs the arguments kept for it.  The work takes the next
    identifier from the class-wide TaskWork._id counter.
    """
    self._id = TaskWork._id
    self._name = target.__name__
    self._task_id = None
    self._target = target
    self._timeout_in_secs = timeout
    self._args = list(args)
    self._kwargs = dict(kwargs)
    self._result = None
    self._ancillary_result_data = None
    self._create_timestamp_ms = timers.get_monotonic_timestamp_in_ms()

    created_msg = ("TaskWork created, id=%s, name=%s, timeout_in_secs=%i."
                   % (self._id, self._name, self._timeout_in_secs))
    DLOG.debug(created_msg)

    # Advance the shared counter for the next piece of work.
    TaskWork._id += 1
def do_check(self):
    """
    Check the Thread for progress

    Coroutine resumed on every check.  If the progress marker has not
    changed since the previous check the thread is reported as stalled
    (the first stalled check records the stall timestamp); otherwise the
    stall timestamp is cleared.
    """
    while True:
        (yield)

        previous_value = self._last_marker_value
        if previous_value is not None:
            if previous_value != self._progress_marker.value:
                # Progress was made since the last check.
                self._stall_timestamp_ms = None
            else:
                if self._stall_timestamp_ms is None:
                    self._stall_timestamp_ms = \
                        timers.get_monotonic_timestamp_in_ms()

                DLOG.error("Thread %s stalled, progress_marker=%s, "
                           "elapsed_secs=%s."
                           % (self._name, self._progress_marker.value,
                              self.stall_elapsed_secs))

        self._last_marker_value = self._progress_marker.value
def selobj_dispatch(timeout_in_ms):
    """
    Dispatch selection objects that have become readable or writeable
    within the given time period

    Each ready selection object is sent to its registered callback
    coroutine; a callback that finishes (StopIteration) is unregistered.
    The time spent in every callback is recorded in the histograms.
    """
    from nfv_common import histogram
    from nfv_common import timers

    global _read_callbacks, _write_callbacks, _error_callbacks

    def _invoke(registry, selobj, label):
        # Run the callback registered for selobj (if any) and time it.
        handler = registry.get(selobj, None)
        if handler is None:
            return

        began_ms = timers.get_monotonic_timestamp_in_ms()
        try:
            handler.send(selobj)
        except StopIteration:
            registry.pop(selobj)
        spent_ms = timers.get_monotonic_timestamp_in_ms() - began_ms

        histogram.add_histogram_data(label + handler.__name__,
                                     spent_ms / 100, "decisecond")

    read_objs = _read_callbacks.keys()
    write_objs = _write_callbacks.keys()

    try:
        readable, writeable, in_error = select.select(
            read_objs, write_objs, [], timeout_in_ms / 1000.0)

        for selobj in readable:
            _invoke(_read_callbacks, selobj, "selobj read: ")

        for selobj in writeable:
            _invoke(_write_callbacks, selobj, "selobj write: ")

        for selobj in in_error:
            _invoke(_error_callbacks, selobj, "selobj error: ")

            # An object in error no longer gets read / write dispatches.
            if selobj in _read_callbacks.keys():
                _read_callbacks.pop(selobj)

            if selobj in _write_callbacks.keys():
                _write_callbacks.pop(selobj)

    except (OSError, socket.error, select.error) as e:
        # Select failures, interrupted or not, are ignored.
        if errno.EINTR == e.args[0]:
            pass
def handle_event(self, instance, event, event_data=None):
    """
    Handle event while in the cold migrate state

    The event is first offered to the in-progress task; if the task
    consumes it the state is unchanged.  Returns the name of the state the
    instance should be in after the event.
    """
    from nfv_vim import directors

    instance_director = directors.get_instance_director()

    if event_data is not None:
        reason = event_data.get('reason', '')
    else:
        reason = ''

    def _fail_on_timeout():
        # Fail the action and report the timed-out migrate to the director.
        instance.fail_action(instance.action_fsm_action_type, 'timeout')
        instance_director.instance_migrate_complete(
            instance, instance.action_fsm.from_host_name,
            failed=False, timed_out=True)
        return INSTANCE_STATE.INITIAL

    if instance.task.inprogress():
        if instance.task.handle_event(event, event_data):
            return self.name

    if INSTANCE_EVENT.TASK_STOP == event:
        return INSTANCE_STATE.INITIAL

    elif INSTANCE_EVENT.NFVI_RESIZED == event:
        from_host_name = instance.action_fsm.from_host_name
        instance_director.instance_migrate_complete(
            instance, from_host_name)
        return INSTANCE_STATE.COLD_MIGRATE_CONFIRM

    elif INSTANCE_EVENT.TASK_COMPLETED == event:
        if instance.action_fsm is not None:
            action_data = instance.action_fsm_data
            if action_data is not None:
                if action_data.initiated_from_cli():
                    DLOG.debug("Cold-Migrate complete for %s, initiated "
                               "from cli." % instance.name)
                    return INSTANCE_STATE.INITIAL

        DLOG.debug("Cold-Migrate inprogress for %s." % instance.name)

    elif INSTANCE_EVENT.TASK_FAILED == event:
        DLOG.info("Cold-Migrate failed for %s." % instance.name)
        instance.fail_action(instance.action_fsm_action_type, reason)
        from_host_name = instance.action_fsm.from_host_name
        instance_director.instance_migrate_complete(instance,
                                                    from_host_name,
                                                    failed=True)
        return INSTANCE_STATE.INITIAL

    elif INSTANCE_EVENT.TASK_TIMEOUT == event:
        DLOG.info("Cold-Migrate timed out for %s." % instance.name)

    elif INSTANCE_EVENT.AUDIT == event:
        if not (instance.task.inprogress() or instance.is_resizing()):
            # Nothing is running any more: allow 60 seconds (measured from
            # the first audit that notices this) before giving up.
            if 0 == instance.action_fsm.wait_time:
                instance.action_fsm.wait_time \
                    = timers.get_monotonic_timestamp_in_ms()

            now_ms = timers.get_monotonic_timestamp_in_ms()
            secs_expired = (now_ms - instance.action_fsm.wait_time) / 1000
            if 60 <= secs_expired:
                # Reported as a migrate completion, like every other path
                # in this state (it used to be reported as an evacuate
                # completion).
                return _fail_on_timeout()

        else:
            now_ms = timers.get_monotonic_timestamp_in_ms()
            secs_expired = (now_ms - instance.action_fsm.start_time) / 1000
            if instance.max_cold_migrate_wait_in_secs <= secs_expired:
                return _fail_on_timeout()

            elif instance.task.timed_out():
                return _fail_on_timeout()

    else:
        DLOG.verbose("Ignoring %s event for %s." % (event, instance.name))

    return self.name
def _rest_api_request(token_id, method, api_cmd, api_cmd_headers=None,
                      api_cmd_payload=None):
    """
    Internal: make a rest-api request

    token_id is sent as the X-Auth-Token header, method is the HTTP method,
    api_cmd the URL, api_cmd_headers an optional mapping of extra request
    headers and api_cmd_payload an optional request body (text is sent
    UTF-8 encoded).

    Returns a Result holding the decoded JSON response (an empty dict for
    an empty body) together with the status code, the end-to-end response
    headers, the raw response and the execution time in seconds.  An HTTP
    302 (FOUND) is returned as a Result rather than raised.

    Raises OpenStackRestAPIException for any other HTTP error and
    OpenStackException when the server cannot be reached.
    """
    headers_per_hop = [
        'connection', 'keep-alive', 'proxy-authenticate',
        'proxy-authorization', 'te', 'trailers', 'transfer-encoding',
        'upgrade'
    ]

    def _end_to_end_headers(message):
        # List of (name, value) tuples without the hop-by-hop headers.
        # Header names are case-insensitive, so compare them lower-cased.
        kept = list()
        for key, value in message.items():
            if key.lower() not in headers_per_hop:
                cap_key = '-'.join(ck.capitalize() for ck in key.split('-'))
                kept.append((cap_key, value))
        return kept

    def _fault_reason(body):
        # Best-effort extraction of the fault message from a JSON error
        # body; returns '' when no message can be found.
        try:
            response = json.loads(body)

            for fault_key in ('computeFault', 'badRequest'):
                fault = response.get(fault_key, None)
                if fault is not None:
                    message = fault.get('message', None)
                    if message is not None:
                        text = str(message.lower().rstrip('.'))
                        if text:
                            return text

            error_message = response.get('error_message', None)
            if error_message is not None:
                # The error_message value is itself a JSON document.
                error_message = json.loads(error_message)
                message = error_message.get('faultstring', None)
                if message is not None:
                    return str(message.lower().rstrip('.'))

        except ValueError:
            pass

        return ''

    start_ms = timers.get_monotonic_timestamp_in_ms()

    try:
        request_info = urllib.request.Request(api_cmd)
        request_info.get_method = lambda: method
        request_info.add_header("X-Auth-Token", token_id)
        request_info.add_header("Accept", "application/json")

        if api_cmd_headers is not None:
            for header_type, header_value in api_cmd_headers.items():
                request_info.add_header(header_type, header_value)

        if api_cmd_payload is not None:
            # Request.add_data() no longer exists in Python 3; the body is
            # set through the data attribute and must be bytes.
            if isinstance(api_cmd_payload, bytes):
                request_info.data = api_cmd_payload
            else:
                request_info.data = api_cmd_payload.encode('utf-8')

        DLOG.verbose("Rest-API method=%s, api_cmd=%s, api_cmd_headers=%s, "
                     "api_cmd_payload=%s" % (method, api_cmd, api_cmd_headers,
                                             api_cmd_payload))

        request = urllib.request.urlopen(request_info)
        try:
            headers = _end_to_end_headers(request.info())
            response_raw = request.read()
        finally:
            # Release the connection even if reading the body fails.
            request.close()

        # The body is bytes, so test for emptiness rather than comparing
        # against an empty text string.
        if not response_raw:
            response = dict()
        else:
            response = json.loads(response_raw)

        now_ms = timers.get_monotonic_timestamp_in_ms()
        elapsed_ms = now_ms - start_ms
        elapsed_secs = elapsed_ms / 1000

        DLOG.verbose("Rest-API code=%s, headers=%s, response=%s"
                     % (request.code, headers, response))

        log_info(
            "Rest-API status=%s, %s, %s, hdrs=%s, payload=%s, elapsed_ms=%s"
            % (request.code, method, api_cmd, api_cmd_headers,
               api_cmd_payload, int(elapsed_ms)))

        return Result(
            response,
            Object(status_code=request.code, headers=headers,
                   response=response_raw, execution_time=elapsed_secs))

    except urllib.error.HTTPError as e:
        headers = list()
        response_raw = dict()

        if e.fp is not None:
            headers = _end_to_end_headers(e.fp.info())
            response_raw = e.fp.read()

        now_ms = timers.get_monotonic_timestamp_in_ms()
        elapsed_ms = now_ms - start_ms

        log_error(
            "Rest-API status=%s, %s, %s, hdrs=%s, payload=%s, elapsed_ms=%s"
            % (e.code, method, api_cmd, api_cmd_headers, api_cmd_payload,
               int(elapsed_ms)))

        if httplib.FOUND == e.code:
            return Result(
                response_raw,
                Object(status_code=e.code, headers=headers,
                       response=response_raw))

        # Attempt to get the reason for the http error from the response
        reason = ''
        for header, value in headers:
            if 'Content-Type' == header:
                if 'application/json' == value.split(';')[0]:
                    reason = _fault_reason(response_raw) or reason

        raise OpenStackRestAPIException(method, api_cmd, api_cmd_headers,
                                        api_cmd_payload, e.code, str(e),
                                        str(e), headers, response_raw,
                                        reason)

    except urllib.error.URLError as e:
        now_ms = timers.get_monotonic_timestamp_in_ms()
        elapsed_ms = now_ms - start_ms

        log_error(
            "Rest-API status=ERR, %s, %s, hdrs=%s, payload=%s, elapsed_ms=%s"
            % (method, api_cmd, api_cmd_headers, api_cmd_payload,
               int(elapsed_ms)))

        raise OpenStackException(method, api_cmd, api_cmd_headers,
                                 api_cmd_payload, str(e), str(e))
from nfv_vim import tables DLOG = debug.debug_get_logger('nfv_vim.dor') _alarm_data = None _minimum_hosts = 0 _dor_stabilized = False _dor_completed = False _dor_process_uptime = 0 _dor_stabilize_uptime = 0 _dor_complete_uptime = 0 _dor_complete_percentage = 0 _system_state_get_inprogress = False _system_state_gathered = False _process_start_timestamp_ms = timers.get_monotonic_timestamp_in_ms() NFV_VIM_DOR_COMPLETE_FILE = '/var/run/.nfv-vim.dor_complete' @coroutine def _system_state_query_callback(): """ System state query callback """ global _alarm_data global _minimum_hosts, _dor_stabilized, _dor_completed global _dor_complete_percentage global _system_state_get_inprogress, _system_state_gathered response = (yield)
def handle_event(self, instance, event, event_data=None):
    """
    Handle event while in the evacuate state

    The event is first offered to the in-progress task; if the task
    consumes it the state is unchanged.  Returns the name of the state the
    instance should be in after the event.
    """
    from nfv_vim import directors

    instance_director = directors.get_instance_director()

    reason = '' if event_data is None else event_data.get('reason', '')

    def _left_source_host():
        # True once the instance sits on another host and is not rebuilding.
        return (instance.action_fsm.from_host_name != instance.host_name
                and not instance.is_rebuilding())

    def _fail_on_timeout():
        # Fail the action and report the timed-out evacuate to the director.
        instance.fail_action(instance.action_fsm_action_type, 'timeout')
        instance_director.instance_evacuate_complete(
            instance, instance.action_fsm.from_host_name,
            failed=False, timed_out=True)
        return INSTANCE_STATE.INITIAL

    if instance.task.inprogress():
        if instance.task.handle_event(event, event_data):
            return self.name

    if INSTANCE_EVENT.TASK_STOP == event:
        return INSTANCE_STATE.INITIAL

    elif event in [INSTANCE_EVENT.NFVI_ENABLED,
                   INSTANCE_EVENT.NFVI_DISABLED,
                   INSTANCE_EVENT.NFVI_HOST_CHANGED]:
        if _left_source_host():
            instance_director.instance_evacuate_complete(
                instance, instance.action_fsm.from_host_name)
            return INSTANCE_STATE.INITIAL

        elif INSTANCE_EVENT.NFVI_DISABLED == event:
            if instance.is_rebuilding():
                if not instance._evacuate_started:
                    DLOG.info("Evacuate starting for %s." % instance.name)
                    # Evacuate has started
                    instance._evacuate_started = True

            elif instance._evacuate_started and \
                    instance.action_fsm.from_host_name == instance.host_name:
                DLOG.info("Evacuate no longer in progress for %s."
                          % instance.name)
                # The evacuate was in progress once but no longer is, and
                # the host has not changed (for example, no destination
                # host could be scheduled).  Tell the instance director
                # that the evacuate failed so it can update any host
                # operation that may be in progress.
                instance_director.instance_evacuate_complete(
                    instance, instance.action_fsm.from_host_name,
                    failed=True)
                return INSTANCE_STATE.INITIAL

    elif INSTANCE_EVENT.TASK_COMPLETED == event:
        DLOG.debug("Evacuate inprogress for %s." % instance.name)

    elif INSTANCE_EVENT.TASK_FAILED == event:
        DLOG.info("Evacuate failed for %s." % instance.name)
        instance.fail_action(instance.action_fsm_action_type, reason)
        instance_director.instance_evacuate_complete(
            instance, instance.action_fsm.from_host_name, failed=True)
        return INSTANCE_STATE.INITIAL

    elif INSTANCE_EVENT.TASK_TIMEOUT == event:
        DLOG.info("Evacuate timed out for %s." % instance.name)

    elif INSTANCE_EVENT.AUDIT == event:
        if _left_source_host():
            instance_director.instance_evacuate_complete(
                instance, instance.action_fsm.from_host_name)
            return INSTANCE_STATE.INITIAL

        elif not (instance.task.inprogress() or instance.is_rebuilding()):
            # Nothing is running any more: allow 120 seconds (measured from
            # the first audit that notices this) before giving up.
            if 0 == instance.action_fsm.wait_time:
                instance.action_fsm.wait_time \
                    = timers.get_monotonic_timestamp_in_ms()

            now_ms = timers.get_monotonic_timestamp_in_ms()
            secs_expired = (now_ms - instance.action_fsm.wait_time) / 1000
            if 120 <= secs_expired:
                return _fail_on_timeout()

        else:
            now_ms = timers.get_monotonic_timestamp_in_ms()
            secs_expired = (now_ms - instance.action_fsm.start_time) / 1000
            if instance.max_evacuate_wait_in_secs <= secs_expired:
                return _fail_on_timeout()

            elif instance.task.timed_out():
                return _fail_on_timeout()

    else:
        DLOG.verbose("Ignoring %s event for %s." % (event, instance.name))

    return self.name
def nfvi_host_state_change(self, nfvi_admin_state, nfvi_oper_state,
                           nfvi_avail_status, nfvi_data=None):
    """
    NFVI Host State Change

    Stores the reported NFVI data and states, notifies the host director
    when the host has just gone offline and drives the host state machine
    with the matching event.  Reports carrying an unknown administrative or
    operational state are ignored.
    """
    if nfvi_data is not None:
        self._nfvi_host.nfvi_data = nfvi_data
        self._persist()

    if nfvi.objects.v1.HOST_ADMIN_STATE.UNKNOWN == nfvi_admin_state:
        DLOG.info("Ignoring unknown administrative state change for %s."
                  % self._nfvi_host.name)
        return

    if nfvi.objects.v1.HOST_OPER_STATE.UNKNOWN == nfvi_oper_state:
        DLOG.info("Ignoring unknown operation state change for %s."
                  % self._nfvi_host.name)
        return

    state_changed = (nfvi_admin_state != self._nfvi_host.admin_state or
                     nfvi_oper_state != self._nfvi_host.oper_state or
                     nfvi_avail_status != self._nfvi_host.avail_status)

    if state_changed:
        debug_values = (nfvi_admin_state, self._nfvi_host.admin_state,
                        nfvi_oper_state, self._nfvi_host.oper_state,
                        nfvi_avail_status, self._nfvi_host.avail_status,
                        self.is_locking(), self.is_unlocking(),
                        self._fsm.current_state.name, self._nfvi_host.name)
        DLOG.debug("Host State-Change detected: nfvi_admin_state=%s "
                   "host_admin_state=%s, nfvi_oper_state=%s "
                   "host_oper_state=%s, nfvi_avail_state=%s "
                   "host_avail_status=%s, locking=%s unlocking=%s "
                   "fsm current_state=%s for %s." % debug_values)

        # Only a transition into offline is reported to the director; this
        # has to be decided before the stored status is overwritten.
        notify_offline = (
            nfvi.objects.v1.HOST_AVAIL_STATUS.OFFLINE == nfvi_avail_status
            and nfvi.objects.v1.HOST_AVAIL_STATUS.OFFLINE
            != self._nfvi_host.avail_status)

        self._nfvi_host.admin_state = nfvi_admin_state
        self._nfvi_host.oper_state = nfvi_oper_state
        self._nfvi_host.avail_status = nfvi_avail_status
        self._persist()
        self._nfvi_host_handle_state_change()

        if notify_offline:
            from nfv_vim import directors
            host_director = directors.get_host_director()
            host_director.host_offline(self)

    elif host_fsm.HOST_STATE.INITIAL == self._fsm.current_state.name:
        self._fsm.handle_event(host_fsm.HOST_EVENT.ADD)
        return

    elif host_fsm.HOST_STATE.CONFIGURE == self._fsm.current_state.name:
        self._fsm.handle_event(host_fsm.HOST_EVENT.ADD)
        return

    elif host_fsm.HOST_STATE.ENABLED == self._fsm.current_state.name \
            and nfvi.objects.v1.HOST_OPER_STATE.DISABLED == nfvi_oper_state:
        self._fsm.handle_event(host_fsm.HOST_EVENT.DISABLE)
        return

    elif host_fsm.HOST_STATE.DISABLED == self._fsm.current_state.name \
            and nfvi.objects.v1.HOST_OPER_STATE.ENABLED == nfvi_oper_state:
        self._fsm.handle_event(host_fsm.HOST_EVENT.ENABLE)
        return

    else:
        # Fold the time spent in the current state into the persisted
        # total at most once every 30 seconds.
        now_ms = timers.get_monotonic_timestamp_in_ms()
        secs_expired = (now_ms - self._last_state_timestamp) / 1000
        if 30 <= secs_expired:
            if 0 != self._last_state_timestamp:
                self._elapsed_time_in_state += int(secs_expired)
            self._last_state_timestamp = now_ms
            self._persist()

    self._fsm.handle_event(host_fsm.HOST_EVENT.AUDIT)
def handle_event(self, instance, event, event_data=None):
    """
    Handle event while in the live migrate state

    The event is first offered to the in-progress task; if the task
    consumes it the state is unchanged.  Returns the name of the state the
    instance should be in after the event.
    """
    from nfv_vim import directors

    instance_director = directors.get_instance_director()

    reason = '' if event_data is None else event_data.get('reason', '')

    def _state_after_migrate(guest_services):
        # Guest services, when provisioned, need the finish state.
        if guest_services.are_provisioned():
            return INSTANCE_STATE.LIVE_MIGRATE_FINISH
        return INSTANCE_STATE.INITIAL

    def _fail_on_timeout():
        # Fail the action and report the timed-out migrate to the director.
        instance.fail_action(instance.action_fsm_action_type, 'timeout')
        instance_director.instance_migrate_complete(
            instance, instance.action_fsm.from_host_name,
            failed=False, timed_out=True)
        return INSTANCE_STATE.INITIAL

    if instance.task.inprogress():
        if instance.task.handle_event(event, event_data):
            return self.name

    if INSTANCE_EVENT.TASK_STOP == event:
        return INSTANCE_STATE.INITIAL

    elif INSTANCE_EVENT.NFVI_HOST_CHANGED == event:
        if instance.action_fsm.from_host_name != instance.host_name:
            DLOG.info("Live-Migrate for %s from host %s to host %s."
                      % (instance.name, instance.action_fsm.from_host_name,
                         instance.host_name))

            instance_director.instance_migrate_complete(
                instance, instance.action_fsm.from_host_name)

            return _state_after_migrate(instance.guest_services)

    elif INSTANCE_EVENT.LIVE_MIGRATE_ROLLBACK == event:
        DLOG.info("Live-Migrate rollback for %s." % instance.name)

        guest_services = instance.guest_services

        # Tell the instance director that the live migrate failed so it
        # can update any host operation that may be in progress.
        instance_director.instance_migrate_complete(
            instance, instance.action_fsm.from_host_name, failed=True)

        return _state_after_migrate(guest_services)

    elif INSTANCE_EVENT.TASK_COMPLETED == event:
        DLOG.debug("Live-Migrate inprogress for %s." % instance.name)

    elif INSTANCE_EVENT.TASK_FAILED == event:
        DLOG.info("Live-Migrate failed for %s." % instance.name)
        instance.fail_action(instance.action_fsm_action_type, reason)
        instance_director.instance_migrate_complete(
            instance, instance.action_fsm.from_host_name, failed=True)
        return INSTANCE_STATE.INITIAL

    elif INSTANCE_EVENT.TASK_TIMEOUT == event:
        DLOG.info("Live-Migrate timed out for %s." % instance.name)

    elif INSTANCE_EVENT.NFVI_ENABLED == event:
        if instance.is_migrating():
            if not instance._live_migration_started:
                DLOG.info("Live-Migrate starting for %s." % instance.name)
                # Live migration has started
                instance._live_migration_started = True

        elif instance._live_migration_started and \
                instance.action_fsm.from_host_name == instance.host_name:
            DLOG.info("Live-Migrate no longer in progress for %s."
                      % instance.name)
            # The live migration was in progress once but no longer is, and
            # the host has not changed (for example, no destination host
            # could be scheduled).  Tell the instance director that the
            # live migrate failed so it can update any host operation that
            # may be in progress.
            guest_services = instance.guest_services

            instance_director.instance_migrate_complete(
                instance, instance.action_fsm.from_host_name, failed=True)

            return _state_after_migrate(guest_services)

    elif INSTANCE_EVENT.AUDIT == event:
        if instance.action_fsm.from_host_name != instance.host_name:
            instance_director.instance_migrate_complete(
                instance, instance.action_fsm.from_host_name)

            return _state_after_migrate(instance.guest_services)

        elif not (instance.task.inprogress() or instance.is_migrating()):
            # Nothing is running any more: allow 60 seconds (measured from
            # the first audit that notices this) before giving up.
            if 0 == instance.action_fsm.wait_time:
                instance.action_fsm.wait_time \
                    = timers.get_monotonic_timestamp_in_ms()

            now_ms = timers.get_monotonic_timestamp_in_ms()
            secs_expired = (now_ms - instance.action_fsm.wait_time) / 1000
            if 60 <= secs_expired:
                return _fail_on_timeout()

        else:
            now_ms = timers.get_monotonic_timestamp_in_ms()
            secs_expired = (now_ms - instance.action_fsm.start_time) / 1000
            max_live_migrate_wait_in_secs = \
                instance.max_live_migrate_wait_in_secs
            if 0 != max_live_migrate_wait_in_secs:
                # Add 60 seconds buffer on top of nova timeout value
                max_wait = max_live_migrate_wait_in_secs + 60
                if max_wait <= secs_expired:
                    return _fail_on_timeout()

                elif instance.task.timed_out():
                    return _fail_on_timeout()

    else:
        DLOG.verbose("Ignoring %s event for %s." % (event, instance.name))

    return self.name
def handle_event(self, instance, event, event_data=None):
    """
    Handle event while in the start state

    The event is first offered to the in-progress task; if the task
    consumes it the state is unchanged.  Returns the name of the state the
    instance should be in after the event.
    """
    from nfv_vim import directors

    instance_director = directors.get_instance_director()

    reason = '' if event_data is None else event_data.get('reason', '')

    def _fail_on_timeout():
        # Fail the action and report the timed-out start to the director.
        instance.fail_action(instance.action_fsm_action_type, 'timeout')
        instance_director.instance_start_complete(
            instance, instance.host_name, failed=False, timed_out=True)
        return INSTANCE_STATE.INITIAL

    if instance.task.inprogress():
        if instance.task.handle_event(event, event_data):
            return self.name

    if INSTANCE_EVENT.TASK_STOP == event:
        instance_director.instance_start_complete(
            instance, instance.host_name, failed=False, timed_out=False,
            cancelled=True)
        return INSTANCE_STATE.INITIAL

    elif INSTANCE_EVENT.TASK_COMPLETED == event:
        DLOG.debug("Start inprogress for %s." % instance.name)
        # The wait for the instance to become enabled starts now.
        instance.action_fsm.wait_time = \
            timers.get_monotonic_timestamp_in_ms()

    elif INSTANCE_EVENT.TASK_FAILED == event:
        DLOG.info("Start failed for %s." % instance.name)
        instance.fail_action(instance.action_fsm_action_type, reason)
        instance_director.instance_start_complete(
            instance, instance.host_name, failed=True)
        return INSTANCE_STATE.INITIAL

    elif INSTANCE_EVENT.TASK_TIMEOUT == event:
        DLOG.info("Start timed out for %s." % instance.name)
        return _fail_on_timeout()

    elif INSTANCE_EVENT.NFVI_ENABLED == event:
        instance_director.instance_start_complete(instance,
                                                  instance.host_name)
        return INSTANCE_STATE.INITIAL

    elif INSTANCE_EVENT.AUDIT == event:
        if not instance.task.inprogress():
            if instance.is_enabled():
                instance_director.instance_start_complete(
                    instance, instance.host_name)
                return INSTANCE_STATE.INITIAL

            now_ms = timers.get_monotonic_timestamp_in_ms()
            secs_expired = \
                (now_ms - instance.action_fsm.wait_time) / 1000
            # Only wait 60 seconds for the instance to start.
            if 60 <= secs_expired:
                return _fail_on_timeout()

    else:
        DLOG.verbose("Ignoring %s event for %s." % (event, instance.name))

    return self.name
def process_main():
    """
    Virtual Infrastructure Manager - Main

    Installs the signal handlers, loads the configuration, initializes the
    process and runs the dispatch loop until the process is asked to stop
    or a subsystem is no longer sane.  A failed initialization is retried
    every 10 seconds for up to 3 minutes, after which the process exits
    with status 200.  On the way out the not-running marker file is written
    and the process is finalized under a 10 second watchdog.
    """
    def _force_exit(signum=None, frame=None):
        # Signal handlers are called with the signal number and the current
        # stack frame; both are accepted (and ignored) so that the watchdog
        # alarm really performs the hard exit.
        os._exit(-1)

    global do_reload, dump_data_captured, reset_data_captured

    process_start_time = timers.get_monotonic_timestamp_in_ms()

    try:
        # signal.signal(signal.SIGTERM, process_signal_handler)
        signal.signal(signal.SIGINT, process_signal_handler)
        signal.signal(signal.SIGHUP, process_signal_handler)
        signal.signal(signal.SIGUSR1, process_signal_handler)
        signal.signal(signal.SIGUSR2, process_signal_handler)

        parser = argparse.ArgumentParser()
        parser.add_argument('-c', '--config', help='configuration file')
        parser.add_argument('-t', '--tox', action="store_true",
                            help='tox test environment')
        args = parser.parse_args()
        config.load(args.config)

        if args.tox:
            # Append the tox root directory to the system path to get
            # the config.ini and debug.ini files.
            debug_ini = sys.prefix + '/' + config.CONF['debug']['config_file']
            config.CONF['debug']['config_file'] = debug_ini

        init_complete = process_initialize()
        last_init_time = timers.get_monotonic_timestamp_in_ms()

        DLOG.info("Started")

        while stay_on:
            selobj.selobj_dispatch(PROCESS_TICK_INTERVAL_IN_MS)
            timers.timers_schedule()

            if not alarm.alarm_subsystem_sane():
                DLOG.error("Alarm subsystem is not sane, exiting")
                break

            if not event_log.event_log_subsystem_sane():
                DLOG.error("Event-Log subsystem is not sane, exiting")
                break

            if do_reload:
                DLOG.info("Reload signalled.")
                debug.debug_reload_config()
                DLOG.info("Reload complete.")
                do_reload = False

            if dump_data_captured:
                DLOG.info("Dump captured data signalled.")
                histogram.display_histogram_data()
                profiler.profile_memory_dump()
                DLOG.info("Dump captured data complete.")
                dump_data_captured = False

            if reset_data_captured:
                DLOG.info("Reset captured data signalled.")
                histogram.reset_histogram_data()
                profiler.profile_memory_set_reference()
                DLOG.info("Reset captured data complete.")
                reset_data_captured = False

            if not init_complete:
                # Retry initialization for up to 3 minutes.
                now_ms = timers.get_monotonic_timestamp_in_ms()
                secs_expired = (now_ms - process_start_time) / 1000
                if secs_expired < 180:
                    time_since_init = (now_ms - last_init_time) / 1000
                    # Reattempt initialization every 10 seconds.
                    if time_since_init > 10:
                        init_complete = process_reinitialize()
                        last_init_time = \
                            timers.get_monotonic_timestamp_in_ms()
                else:
                    DLOG.warn("Initialization failed - exiting.")
                    sys.exit(200)

    except KeyboardInterrupt:
        print("Keyboard Interrupt received.")

    except Exception as e:
        DLOG.exception("%s" % e)
        sys.exit(200)

    finally:
        open(PROCESS_NOT_RUNNING_FILE, 'w').close()
        # Allow up to 10 seconds for the process to shut down. If the
        # process_finalize hangs, we will do a hard exit.
        signal.signal(signal.SIGALRM, _force_exit)
        signal.alarm(10)
        process_finalize()