def test_resets_status_expires(self):
    """A curtin install event pushes `status_expires` forward again."""
    node = factory.make_Node(
        status=NODE_STATUS.DEPLOYING,
        status_expires=factory.make_date(),
        with_empty_script_sets=True)
    install_stage = random.choice([
        'cmd-install',
        'cmd-install/stage-early',
        'cmd-install/stage-late',
    ])
    payload = {
        'event_type': random.choice(['start', 'finish']),
        'origin': 'curtin',
        'name': install_stage,
        'description': 'Installing',
        'timestamp': datetime.utcnow(),
    }
    self.processMessage(node, payload)
    node = reload_object(node)
    # now() is evaluated at a slightly different moment inside
    # reset_status_expires than it is here, so an exact comparison would
    # be flaky. Allow a one-minute window on either side instead.
    window = timedelta(minutes=1)
    expected_expiry = now() + timedelta(
        minutes=get_node_timeout(NODE_STATUS.DEPLOYING))
    self.assertGreaterEqual(node.status_expires, expected_expiry - window)
    self.assertLessEqual(node.status_expires, expected_expiry + window)
def test__resets_status_expires(self):
    """Fetching boot config resets `status_expires` on a monitored node."""
    rack_controller = factory.make_RackController()
    local_ip = factory.make_ip_address()
    remote_ip = factory.make_ip_address()
    monitored_status = random.choice(MONITORED_STATUSES)
    node = self.make_node(
        status=monitored_status, status_expires=factory.make_date())
    boot_mac = node.get_boot_interface().mac_address
    get_config(
        rack_controller.system_id, local_ip, remote_ip, mac=boot_mac)
    node = reload_object(node)
    # now() differs between reset_status_expires and this assertion, so
    # comparing exact timestamps would be flaky. Pad by a minute on each
    # side to confirm the reset without depending on precise timing.
    window = timedelta(minutes=1)
    expected_expiry = now() + timedelta(
        minutes=get_node_timeout(monitored_status))
    self.assertGreaterEqual(node.status_expires, expected_expiry - window)
    self.assertLessEqual(node.status_expires, expected_expiry + window)
def mark_nodes_failed_after_expiring(now, node_timeout):
    """Mark all nodes in that database as failed where the status did not
    transition in time.

    `status_expires` is checked on the node to see if the current time is
    newer than the expired time.

    :param now: Current time; nodes whose `status_expires` is at or before
        this moment are considered expired.
    :param node_timeout: Default timeout passed through to
        `get_node_timeout` (only used to report the timeout duration).
    """
    expired = Node.objects.filter(
        status__in=MONITORED_STATUSES,
        status_expires__isnull=False,
        status_expires__lte=now,
    )
    for node in expired:
        timeout_minutes = get_node_timeout(node.status, node_timeout)
        status_name = NODE_STATUS_CHOICES_DICT[node.status]
        maaslog.info(
            "%s: Operation '%s' timed out after %s minutes." % (
                node.hostname, status_name, timeout_minutes))
        node.mark_failed(
            comment="Node operation '%s' timed out after %s minutes." % (
                status_name, timeout_minutes),
            script_result_status=SCRIPT_STATUS.ABORTED,
        )
def test_sets_status_expires_when_flatlined_with_may_reboot_script(self):
    """A flatlined node running a may_reboot script is assumed to be
    rebooting: it gets `status_expires` set instead of being failed."""
    node, script_set = self.make_node()
    current_time = now()
    if self.status == NODE_STATUS.COMMISSIONING:
        script_type = SCRIPT_TYPE.COMMISSIONING
    else:
        script_type = SCRIPT_TYPE.TESTING
    script = factory.make_Script(script_type=script_type, may_reboot=True)
    factory.make_ScriptResult(
        script=script, script_set=script_set,
        status=SCRIPT_STATUS.RUNNING)
    # Push the heartbeat 11 days into the past so the node is
    # unambiguously flatlined.
    script_set.last_ping = current_time - timedelta(11)
    script_set.save()
    mark_nodes_failed_after_missing_script_timeout(current_time, 20)
    node = reload_object(node)
    # The production code computes
    # current_time - (current_time - last_ping) + timeout, which is
    # exactly last_ping + timeout: the boot timeout counts from the
    # last heartbeat. (assertEquals is a deprecated alias, removed in
    # Python 3.12 — use assertEqual.)
    self.assertEqual(
        script_set.last_ping + timedelta(
            minutes=get_node_timeout(self.status, 20)),
        node.status_expires)
def mark_nodes_failed_after_missing_script_timeout(now, node_timeout):
    """Check on the status of commissioning or testing nodes.

    For any node currently commissioning or testing check that a region is
    still receiving its heartbeat and no running script has gone past its
    run limit. If the node fails either condition its put into a failed
    status.

    :param now: The current time, used for every expiry comparison.
    :param node_timeout: Minutes without a heartbeat before the node is
        considered dead; also the default boot timeout when a rebooting
        node's expiry is reset.
    """
    # maas-run-remote-scripts sends a heartbeat every two minutes. If we
    # haven't received a heartbeat within node_timeout(20 min by default)
    # it's dead.
    heartbeat_expired = now - timedelta(minutes=node_timeout)
    # Get the list of nodes currently running testing. status_expires is used
    # while the node is booting. Once MAAS receives the signal that testing
    # has begun it resets status_expires and checks for the heartbeat instead.
    qs = Node.objects.filter(
        status__in=[NODE_STATUS.COMMISSIONING, NODE_STATUS.TESTING],
        status_expires=None,
    )
    # Prefetch both script sets with their results (deferring the bulky
    # output columns) so the loop below issues no per-node queries.
    qs = qs.prefetch_related(
        Prefetch(
            "current_commissioning_script_set",
            ScriptSet.objects.prefetch_related(
                Prefetch(
                    "scriptresult_set",
                    ScriptResult.objects.defer(
                        "output", "stdout", "stderr",
                        "result").prefetch_related(
                            Prefetch(
                                "script",
                                Script.objects.only(
                                    "script_type", "name", "may_reboot",
                                    "timeout"),
                            )),
                )),
        ),
        Prefetch(
            "current_testing_script_set",
            ScriptSet.objects.prefetch_related(
                Prefetch(
                    "scriptresult_set",
                    ScriptResult.objects.defer(
                        "output", "stdout", "stderr",
                        "result").prefetch_related(
                            Prefetch(
                                "script",
                                Script.objects.only(
                                    "script_type", "name", "may_reboot",
                                    "timeout"),
                            )),
                )),
        ),
    )
    for node in qs:
        if node.status == NODE_STATUS.COMMISSIONING:
            script_set = node.current_commissioning_script_set
        elif node.status == NODE_STATUS.TESTING:
            script_set = node.current_testing_script_set
        script_results = [
            script_result
            for script_result in script_set
            if script_result.status == SCRIPT_STATUS.RUNNING
        ]
        # A script flagged may_reboot is allowed to take the node down,
        # which makes a missing heartbeat ambiguous.
        maybe_rebooting = False
        for script_result in script_results:
            if script_result.script and script_result.script.may_reboot:
                maybe_rebooting = True
                break
        flatlined = (
            script_set.last_ping is not None and
            script_set.last_ping < heartbeat_expired)
        if maybe_rebooting and flatlined:
            # If the script currently running may_reboot and the nodes
            # heartbeat has flatlined assume the node is rebooting. Set the
            # node.status_expires time to the boot timeout minus what has
            # already passed.
            minutes = get_node_timeout(node.status, node_timeout)
            node.status_expires = (
                now - (now - script_set.last_ping) +
                timedelta(minutes=minutes))
            node.save(update_fields=["status_expires"])
            continue
        elif flatlined:
            maaslog.info(
                "%s: Has not been heard from for the last %s minutes" % (
                    node.hostname, node_timeout))
            node.mark_failed(
                comment=(
                    "Node has not been heard from for the last %s minutes" %
                    node_timeout),
                script_result_status=SCRIPT_STATUS.TIMEDOUT,
            )
            if not node.enable_ssh:
                maaslog.info(
                    "%s: Stopped because SSH is disabled" % node.hostname)
                node.stop(comment="Node stopped because SSH is disabled")
            continue
        # Check for scripts which have gone past their timeout.
        for script_result in script_results:
            timeout = None
            for param in script_result.parameters.values():
                if param.get("type") == "runtime":
                    timeout = param.get("value")
                    break
            # NOTE(review): when a runtime parameter set `timeout` above,
            # neither branch below matches and `else` skips the expiry
            # check entirely, so runtime-parameter timeouts are never
            # enforced. Left as-is because the parameter value's type
            # (seconds int vs timedelta) can't be confirmed from here —
            # enforcing it blindly could raise in `started + timeout`.
            #
            # BUG FIX below: `timeout.seconds` only reads the sub-day
            # remainder of the timedelta, so a timeout of exactly N days
            # reported 0 and the script was never expired; use
            # total_seconds() to measure the whole duration.
            if (timeout is None and script_result.name in NODE_INFO_SCRIPTS and
                    "timeout" in NODE_INFO_SCRIPTS[script_result.name]):
                timeout = NODE_INFO_SCRIPTS[script_result.name]["timeout"]
            elif (timeout is None and script_result.script is not None and
                    script_result.script.timeout.total_seconds() > 0):
                timeout = script_result.script.timeout
            else:
                continue
            # The node running the scripts checks if the script has run past
            # its time limit. The node will try to kill the script and move on
            # by signaling the region. If after 5 minutes past the timeout the
            # region hasn't received the signal mark_failed and stop the node.
            # (assumes `started` is set on every RUNNING result — TODO
            # confirm against where SCRIPT_STATUS.RUNNING is assigned)
            script_expires = (
                script_result.started + timeout + timedelta(minutes=5))
            if script_expires < now:
                script_result.status = SCRIPT_STATUS.TIMEDOUT
                script_result.save(update_fields=["status"])
                maaslog.info(
                    "%s: %s has run past it's timeout(%s)" % (
                        node.hostname, script_result.name, str(timeout)))
                node.mark_failed(
                    comment="%s has run past it's timeout(%s)" % (
                        script_result.name, str(timeout)),
                    script_result_status=SCRIPT_STATUS.ABORTED,
                )
                if not node.enable_ssh:
                    maaslog.info(
                        "%s: Stopped because SSH is disabled" %
                        node.hostname)
                    node.stop(
                        comment="Node stopped because SSH is disabled")
                break