def _terminate_if_down(scheduler_module, config, asg_name, instance_id, max_wait):
    """Check that node is correctly attached to scheduler otherwise terminate the instance."""
    asg_client = boto3.client("autoscaling", region_name=config.region, config=config.proxy_config)

    # Keep polling while the scheduler still reports the node as down; the retry
    # gives up (RetryError) once max_wait has elapsed without a healthy result.
    @retry(wait_fixed=seconds(10), retry_on_result=lambda result: result is True, stop_max_delay=max_wait)
    def _poll_wait_for_node_ready():
        node_down = scheduler_module.is_node_down()
        if node_down:
            log.warning("Node reported as down")
        return node_down

    try:
        _poll_wait_for_node_ready()
    except RetryError:
        log.error(
            "Node is marked as down by scheduler or not attached correctly. Terminating..."
        )
        _dump_logs(instance_id)
        # jobwatcher already has the logic to request a new host in case of down nodes,
        # which is done in order to speed up cluster recovery.
        keep_current_size = _maintain_size(asg_name, asg_client)
        _self_terminate(asg_client, instance_id, decrement_desired=not keep_current_size)
def run_remote_command(self, command, timeout=seconds(5), log_error=True, fail_on_error=True):
    """
    Execute remote command on the configured host.

    :param command: command to execute, as a string or as a list of tokens
        (a list is joined with single spaces before execution).
    :param timeout: max time to wait for the command execution to complete.
    :param log_error: log stderr/stdout of a failed execution.
    :param fail_on_error: raise RemoteCommandExecutionError on non-zero exit status.
    :return: RemoteCommandResult with return code, stdout and stderr of the execution.
    :raises RemoteCommandExecutionError: if fail_on_error is set and the command
        exits with a non-zero status.
    """
    if isinstance(command, list):
        command = " ".join(command)
    # NOTE: fixed duplicated word in the original message ("command command").
    logging.info("Executing remote command on {0}: {1}".format(
        self.__user_at_hostname, command))
    result = None
    try:
        # stdin handle is not needed; discard it.
        _, stdout, stderr = self.__ssh_client.exec_command(
            command, get_pty=True)
        self._wait_for_command_execution(timeout, stdout)
        result = RemoteCommandResult(
            return_code=stdout.channel.recv_exit_status(),
            stdout="\n".join(stdout.read().decode().splitlines()),
            stderr="\n".join(stderr.read().decode().splitlines()),
        )
        if result.return_code != 0 and fail_on_error:
            raise RemoteCommandExecutionError(result)
        return result
    except Exception:
        # result is still None when the failure happened before the command
        # output could be collected (e.g. a timeout) — nothing useful to log then.
        if log_error and result:
            logging.error(
                "Command {0} failed with error:\n{1}\nand output:\n{2}".
                format(command, result.stderr, result.stdout))
        raise
messages = _retrieve_all_sqs_messages(queue) update_events = _parse_sqs_messages(messages, table) _process_sqs_messages( update_events, scheduler_module, sqs_config, table, queue, max_cluster_size, instance_properties, force_cluster_update, ) time.sleep(LOOP_TIME) @retry(wait_fixed=seconds(LOOP_TIME)) def main(): logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s [%(module)s:%(funcName)s] %(message)s" ) log.info("sqswatcher startup") try: config = _get_config() queue = _get_sqs_queue(config.region, config.sqsqueue, config.proxy_config) table = _get_ddb_table(config.region, config.table_name, config.proxy_config) asg_name = get_asg_name(config.stack_name, config.region, config.proxy_config)
_poll_wait_for_node_ready() except RetryError: log.error( "Node is marked as down by scheduler or not attached correctly. Terminating..." ) _dump_logs(instance_id) # jobwatcher already has the logic to request a new host in case of down nodes, # which is done in order to speed up cluster recovery. _self_terminate( asg_client, instance_id, decrement_desired=not _maintain_size(asg_name, asg_client)) @retry( wait_exponential_multiplier=seconds(1), wait_exponential_max=seconds(10), retry_on_result=lambda result: result is False, stop_max_delay=minutes(10), ) def _wait_for_stack_ready(stack_name, region, proxy_config): """ Verify if the Stack is in one of the *_COMPLETE states. :param stack_name: Stack to query for :param region: AWS region :param proxy_config: Proxy configuration :return: true if the stack is in the *_COMPLETE status """ log.info("Waiting for stack %s to be ready", stack_name) cfn_client = boto3.client("cloudformation",
"Values": [stack_name] }]) asg_name = response.get("Tags")[0].get("ResourceId") log.info("ASG %s found for the stack %s", asg_name, stack_name) return asg_name except IndexError: log.warning("Unable to get ASG for stack %s", stack_name) raise except Exception as e: raise CriticalError( "Unable to get ASG for stack {0}. Failed with exception: {1}". format(stack_name, e)) @retry(stop_max_attempt_number=5, wait_exponential_multiplier=seconds(0.5), wait_exponential_max=seconds(10)) def get_asg_settings(region, proxy_config, asg_name): try: asg_client = boto3.client("autoscaling", region_name=region, config=proxy_config) asg = asg_client.describe_auto_scaling_groups( AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0] min_size = asg.get("MinSize") desired_capacity = asg.get("DesiredCapacity") max_size = asg.get("MaxSize") log.info("ASG min/desired/max: %d/%d/%d" % (min_size, desired_capacity, max_size)) return min_size, desired_capacity, max_size
_poll_wait_for_node_ready() except RetryError: log.error( "Node is marked as down by scheduler or not attached correctly. Terminating..." ) _dump_logs(instance_id) # decrement asg desired only if not reached the min. # jobwatcher already has the logic to request a new host in case of down nodes, # which is done in order to speed up cluster recovery. _self_terminate( asg_client, instance_id, decrement_desired=not _maintain_size(asg_name, asg_client)) @retry(wait_fixed=seconds(10), retry_on_result=lambda result: result is False, stop_max_delay=minutes(10)) def _wait_for_stack_ready(stack_name, region, proxy_config): """ Verify if the Stack is in one of the *_COMPLETE states. :param stack_name: Stack to query for :param region: AWS region :param proxy_config: Proxy configuration :return: true if the stack is in the *_COMPLETE status """ log.info("Waiting for stack %s to be ready", stack_name) cfn_client = boto3.client("cloudformation", region_name=region, config=proxy_config)