def __init_sections_from_cfn(self, cluster_name):
    """
    Initialize the cluster section from the CloudFormation stack of an existing cluster.

    The retrieved stack is kept in ``self.cfn_stack``. When version enforcement is enabled and the
    stack version differs from the installed one, ``self.error`` is invoked. The cluster section is
    created from the inferred cluster model and populated with the stack parameters, the JSON
    configuration (unless loading it is disabled) and the stack tags.

    :param cluster_name: name of the cluster whose stack must be read
    """
    try:
        self.cfn_stack = get_stack(get_stack_name(cluster_name))

        if self.__enforce_version:
            if get_stack_version(self.cfn_stack) != get_installed_version():
                mismatch_template = (
                    "The cluster {0} was created with a different version of ParallelCluster: {1}. "
                    "Installed version is {2}. This operation may only be performed using the same ParallelCluster "
                    "version used to create the cluster."
                )
                self.error(
                    mismatch_template.format(
                        cluster_name, get_stack_version(self.cfn_stack), get_installed_version()
                    )
                )

        cfn_params = self.cfn_stack.get("Parameters")
        if self.__skip_load_json_config:
            json_params = None
        else:
            json_params = self.__load_json_config(self.cfn_stack)
        cfn_tags = self.cfn_stack.get("Tags")

        # The cluster model decides which section definition describes this stack
        cluster_model = infer_cluster_model(cfn_stack=self.cfn_stack)
        section_definition = cluster_model.get_cluster_section_definition()
        section = ClusterCfnSection(section_definition=section_definition, pcluster_config=self)

        # The section is registered first and populated afterwards
        self.add_section(section)
        section.from_storage(StorageData(cfn_params, json_params, cfn_tags))
    except ClientError as e:
        aws_message = e.response.get("Error").get("Message")
        self.error(
            "Unable to retrieve the configuration of the cluster '{0}'.\n{1}".format(cluster_name, aws_message)
        )
def __retrieve_cluster_config(self, bucket, artifact_directory):
    """
    Load the cluster JSON configuration stored in S3.

    The S3 object version to read is looked up in the DynamoDB table named after the cluster stack
    (record with Id "CLUSTER_CONFIG"). When no version is recorded, the latest object version is read.

    :param bucket: name of the S3 bucket holding the cluster artifacts
    :param artifact_directory: key prefix under which "configs/cluster-config.json" is stored
    :return: the parsed configuration as an OrderedDict; failures are reported through ``self.error``
    """
    table = boto3.resource("dynamodb").Table(get_stack_name(self.cluster_name))
    config_version = None  # Use latest if not found
    try:
        config_version_item = table.get_item(ConsistentRead=True, Key={"Id": "CLUSTER_CONFIG"})
        # Both conditions must hold: a response without "Item" means no version was recorded,
        # and must fall back to the latest version instead of raising KeyError.
        if config_version_item and "Item" in config_version_item:
            config_version = config_version_item["Item"].get("Version")
    except Exception as e:
        self.error("Failed when retrieving cluster config version from DynamoDB with error {0}".format(e))

    try:
        # Only pin the S3 object version when one was found in DynamoDB
        config_version_args = {"VersionId": config_version} if config_version else {}
        s3_object = boto3.resource("s3").Object(
            bucket, "{prefix}/configs/cluster-config.json".format(prefix=artifact_directory)
        )
        json_str = s3_object.get(**config_version_args)["Body"].read().decode("utf-8")
        # OrderedDict keeps the key order of the stored document
        return json.loads(json_str, object_pairs_hook=OrderedDict)
    except Exception as e:
        self.error(
            "Unable to load configuration from bucket '{bucket}/{prefix}'.\n{error}".format(
                bucket=bucket, prefix=artifact_directory, error=e
            )
        )
def start(args):
    """
    Restore ASG limits or awsbatch CE to min/max/desired.

    :param args: command line arguments; cluster_name and config_file are read
    """
    stack_name = utils.get_stack_name(args.cluster_name)
    pcluster_config = PclusterConfig(config_file=args.config_file, cluster_name=args.cluster_name)
    cluster_section = pcluster_config.get_section("cluster")

    if cluster_section.get_param_value("scheduler") == "awsbatch":
        # AWS Batch: re-enable the compute environment with the configured vcpus
        LOGGER.info("Enabling AWS Batch compute environment : %s", args.cluster_name)
        max_vcpus = cluster_section.get_param_value("max_vcpus")
        desired_vcpus = cluster_section.get_param_value("desired_vcpus")
        min_vcpus = cluster_section.get_param_value("min_vcpus")
        _start_batch_ce(
            ce_name=_get_batch_ce(stack_name),
            min_vcpus=min_vcpus,
            desired_vcpus=desired_vcpus,
            max_vcpus=max_vcpus,
        )
        return

    # Other schedulers: restore the limits of the Auto Scaling Group
    LOGGER.info("Starting compute fleet : %s", args.cluster_name)
    max_queue_size = cluster_section.get_param_value("max_queue_size")
    if cluster_section.get_param_value("maintain_initial_size"):
        min_desired_size = cluster_section.get_param_value("initial_queue_size")
    else:
        min_desired_size = 0
    _set_asg_limits(
        asg_name=_get_asg_name(stack_name),
        min=min_desired_size,
        max=max_queue_size,
        desired=min_desired_size,
    )
def status(args):  # noqa: C901 FIXME!!!
    """
    Print the status of the cluster stack, optionally waiting for a final state.

    When ``args.nowait`` is not set, the stack is polled every 5 seconds until it reaches one of the
    final states; then the stack outputs and compute fleet status are printed on success, or the
    failed stack events are logged on failure.

    :param args: command line arguments; cluster_name, config_file and nowait are read
    """
    successful_states = ("CREATE_COMPLETE", "UPDATE_COMPLETE", "UPDATE_ROLLBACK_COMPLETE")
    failed_states = ("ROLLBACK_COMPLETE", "CREATE_FAILED", "DELETE_FAILED")
    failed_event_states = ("CREATE_FAILED", "DELETE_FAILED", "UPDATE_FAILED")
    final_states = successful_states + failed_states

    stack_name = utils.get_stack_name(args.cluster_name)

    # Parse configuration file to read the AWS section
    PclusterConfig.init_aws(config_file=args.config_file)

    cfn = boto3.client("cloudformation")
    try:
        stack = utils.get_stack(stack_name, cfn)
        sys.stdout.write("\rStatus: %s" % stack.get("StackStatus"))
        sys.stdout.flush()

        if args.nowait:
            sys.stdout.write("\n")
            sys.stdout.flush()
        else:
            # Poll until the stack settles, showing the most recent stack event on the same line
            while stack.get("StackStatus") not in final_states:
                time.sleep(5)
                stack = utils.get_stack(stack_name, cfn)
                latest_event = utils.get_stack_events(stack_name)[0]
                progress = "Status: %s - %s" % (
                    latest_event.get("LogicalResourceId"),
                    latest_event.get("ResourceStatus"),
                )
                sys.stdout.write("\r%s" % progress.ljust(80))
                sys.stdout.flush()

            sys.stdout.write("\rStatus: %s\n" % stack.get("StackStatus"))
            sys.stdout.flush()

            if stack.get("StackStatus") in successful_states:
                state = _poll_head_node_state(stack_name)
                if state == "running":
                    _print_stack_outputs(stack)
                _print_compute_fleet_status(args.cluster_name, stack)
            elif stack.get("StackStatus") in failed_states:
                # Report every failed resource event of the stack
                for event in utils.get_stack_events(stack_name):
                    if event.get("ResourceStatus") not in failed_event_states:
                        continue
                    LOGGER.info(
                        "%s %s %s %s %s",
                        event.get("Timestamp"),
                        event.get("ResourceStatus"),
                        event.get("ResourceType"),
                        event.get("LogicalResourceId"),
                        event.get("ResourceStatusReason"),
                    )
    except ClientError as e:
        LOGGER.critical(e.response.get("Error").get("Message"))
        sys.stdout.flush()
        sys.exit(1)
    except KeyboardInterrupt:
        LOGGER.info("\nExiting...")
        sys.exit(0)
def start(self, args, pcluster_config):
    """
    Start the compute fleet.

    The AWS Batch compute environment of the cluster is started with the min, desired and max vcpus
    read from the cluster section.

    :param args: command line arguments; cluster_name is read
    :param pcluster_config: configuration object providing the "cluster" section
    """
    LOGGER.info("Enabling AWS Batch compute environment : %s", args.cluster_name)

    stack_name = utils.get_stack_name(args.cluster_name)
    cluster_section = pcluster_config.get_section("cluster")

    vcpus = {}
    for param_key in ("max_vcpus", "desired_vcpus", "min_vcpus"):
        vcpus[param_key] = cluster_section.get_param_value(param_key)

    self._start_batch_ce(
        ce_name=utils.get_batch_ce(stack_name),
        min_vcpus=vcpus["min_vcpus"],
        desired_vcpus=vcpus["desired_vcpus"],
        max_vcpus=vcpus["max_vcpus"],
    )
def delete(args):
    """
    Delete the CloudFormation stack of the cluster and report the deletion progress.

    Unless ``args.nowait`` is set, the stack is polled every 5 seconds while it is in
    DELETE_IN_PROGRESS. A "does not exist" error raised after the deletion was requested is treated
    as a successful deletion.

    :param args: command line arguments; cluster_name, config_file and nowait are read
    """
    saw_update = False
    LOGGER.info("Deleting: %s", args.cluster_name)
    stack_name = utils.get_stack_name(args.cluster_name)

    # Parse configuration file to read the AWS section
    PclusterConfig.init_aws(config_file=args.config_file)

    cfn = boto3.client("cloudformation")
    try:
        # delete_stack does not raise an exception if stack does not exist
        # Use describe_stacks to explicitly check if the stack exists
        cfn.describe_stacks(StackName=stack_name)
        cfn.delete_stack(StackName=stack_name)
        saw_update = True

        stack_status = utils.get_stack(stack_name, cfn).get("StackStatus")
        sys.stdout.write("\rStatus: %s" % stack_status)
        sys.stdout.flush()
        LOGGER.debug("Status: %s", stack_status)

        if args.nowait:
            sys.stdout.write("\n")
            sys.stdout.flush()
        else:
            while stack_status == "DELETE_IN_PROGRESS":
                time.sleep(5)
                stack_status = utils.get_stack(stack_name, cfn).get("StackStatus")
                latest_event = cfn.describe_stack_events(StackName=stack_name).get("StackEvents")[0]
                progress = "Status: %s - %s" % (
                    latest_event.get("LogicalResourceId"),
                    latest_event.get("ResourceStatus"),
                )
                sys.stdout.write("\r%s" % progress.ljust(80))
                sys.stdout.flush()
            sys.stdout.write("\rStatus: %s\n" % stack_status)
            sys.stdout.flush()
            LOGGER.debug("Status: %s", stack_status)

        if stack_status == "DELETE_FAILED":
            LOGGER.info("Cluster did not delete successfully. Run 'pcluster delete %s' again", args.cluster_name)
    except ClientError as e:
        message = e.response.get("Error").get("Message")
        # Once the deletion was requested, a missing stack means the deletion completed
        if message.endswith("does not exist") and saw_update:
            LOGGER.info("\nCluster deleted successfully.")
            sys.exit(0)
        LOGGER.critical(message)
        sys.stdout.flush()
        sys.exit(1)
    except KeyboardInterrupt:
        LOGGER.info("\nExiting...")
        sys.exit(0)
def _delete_cluster(cluster_name, nowait):
    """
    Delete cluster described by cluster_name.

    Unless ``nowait`` is set, the stack is polled every 5 seconds while it is in DELETE_IN_PROGRESS
    and, on the way out, the cluster nodes are terminated (this step is skipped on keyboard interrupt).

    :param cluster_name: name of the cluster to delete
    :param nowait: when true, return right after requesting the deletion
    """
    cfn = boto3.client("cloudformation")
    saw_update = False
    terminate_compute_fleet = not nowait
    stack_name = utils.get_stack_name(cluster_name)
    try:
        cfn.delete_stack(StackName=stack_name)
        saw_update = True

        stack_status = utils.get_stack(stack_name, cfn).get("StackStatus")
        sys.stdout.write("\rStatus: %s" % stack_status)
        sys.stdout.flush()
        LOGGER.debug("Status: %s", stack_status)

        if nowait:
            sys.stdout.write("\n")
            sys.stdout.flush()
        else:
            while stack_status == "DELETE_IN_PROGRESS":
                time.sleep(5)
                stack_status = utils.get_stack(stack_name, cfn, raise_on_error=True).get("StackStatus")
                latest_event = utils.get_stack_events(stack_name, raise_on_error=True)[0]
                progress = "Status: %s - %s" % (
                    latest_event.get("LogicalResourceId"),
                    latest_event.get("ResourceStatus"),
                )
                sys.stdout.write("\r%s" % progress.ljust(80))
                sys.stdout.flush()
            sys.stdout.write("\rStatus: %s\n" % stack_status)
            sys.stdout.flush()
            LOGGER.debug("Status: %s", stack_status)

        if stack_status == "DELETE_FAILED":
            LOGGER.info("Cluster did not delete successfully. Run 'pcluster delete %s' again", cluster_name)
    except ClientError as e:
        message = e.response.get("Error").get("Message")
        # Once the deletion was requested, a missing stack means the deletion completed
        if message.endswith("does not exist") and saw_update:
            LOGGER.info("\nCluster deleted successfully.")
            sys.exit(0)
        LOGGER.critical(message)
        sys.stdout.flush()
        sys.exit(1)
    except KeyboardInterrupt:
        # Interrupted by the user: leave the cluster nodes alone
        terminate_compute_fleet = False
        LOGGER.info("\nExiting...")
        sys.exit(0)
    finally:
        # Runs even when one of the handlers above calls sys.exit
        if terminate_compute_fleet:
            _terminate_cluster_nodes(stack_name)
def start(self, args, pcluster_config):
    """
    Start the compute fleet.

    The Auto Scaling Group limits are set to max = max_queue_size and min = desired =
    initial_queue_size when maintain_initial_size is set, 0 otherwise.

    :param args: command line arguments; cluster_name is read
    :param pcluster_config: configuration object providing the "cluster" section
    """
    LOGGER.info("Starting compute fleet: %s", args.cluster_name)
    cluster_section = pcluster_config.get_section("cluster")
    stack_name = utils.get_stack_name(args.cluster_name)

    max_queue_size = cluster_section.get_param_value("max_queue_size")
    if cluster_section.get_param_value("maintain_initial_size"):
        min_desired_size = cluster_section.get_param_value("initial_queue_size")
    else:
        min_desired_size = 0

    utils.set_asg_limits(
        asg_name=utils.get_asg_name(stack_name),
        min=min_desired_size,
        max=max_queue_size,
        desired=min_desired_size,
    )
def __init_sections_from_cfn(self, cluster_name):
    """
    Initialize the cluster section from the CloudFormation parameters of an existing cluster.

    :param cluster_name: name of the cluster whose stack must be read
    """
    try:
        stack = get_stack(get_stack_name(cluster_name))

        # The section class is declared by the CLUSTER definition itself
        section_type = CLUSTER.get("type")
        empty_section = section_type(section_definition=CLUSTER, pcluster_config=self)
        section = empty_section.from_cfn_params(cfn_params=stack.get("Parameters", []))

        self.add_section(section)
    except ClientError as e:
        aws_message = e.response.get("Error").get("Message")
        self.error(
            "Unable to retrieve the configuration of the cluster '{0}'.\n{1}".format(cluster_name, aws_message)
        )
def delete(args):
    """
    Delete the cluster, optionally persisting its CloudWatch log groups.

    When the stack does not exist a warning is emitted and the process exits with code 0.

    :param args: command line arguments; cluster_name, config_file, keep_logs and nowait are read
    """
    PclusterConfig.init_aws(config_file=args.config_file)
    LOGGER.info("Deleting: %s", args.cluster_name)
    stack_name = utils.get_stack_name(args.cluster_name)

    if utils.stack_exists(stack_name):
        if args.keep_logs:
            _persist_cloudwatch_log_groups(args.cluster_name)
    else:
        if args.keep_logs:
            utils.warn(
                "Stack for {0} does not exist. Cannot prevent its log groups from being deleted.".format(
                    args.cluster_name
                )
            )
        utils.warn("Cluster {0} has already been deleted.".format(args.cluster_name))
        sys.exit(0)

    _delete_cluster(args.cluster_name, args.nowait)
def stop(args):
    """
    Set ASG limits or awsbatch ce to min/max/desired = 0/0/0.

    :param args: command line arguments; cluster_name and config_file are read
    """
    stack_name = utils.get_stack_name(args.cluster_name)
    pcluster_config = PclusterConfig(config_file=args.config_file, cluster_name=args.cluster_name)
    cluster_section = pcluster_config.get_section("cluster")

    if cluster_section.get_param_value("scheduler") == "awsbatch":
        LOGGER.info("Disabling AWS Batch compute environment : %s", args.cluster_name)
        _stop_batch_ce(ce_name=_get_batch_ce(stack_name))
        return

    LOGGER.info("Stopping compute fleet : %s", args.cluster_name)
    _set_asg_limits(asg_name=_get_asg_name(stack_name), min=0, max=0, desired=0)
def instances(args):
    """
    Log the instances of the cluster, one per line.

    The instances returned by _get_ec2_instances are always listed; those returned by
    _get_asg_instances are added unless the scheduler is awsbatch, in which case a hint to
    the awsbhosts command is logged instead.

    :param args: command line arguments; cluster_name and config_file are read
    """
    stack_name = utils.get_stack_name(args.cluster_name)
    pcluster_config = PclusterConfig(config_file=args.config_file, cluster_name=args.cluster_name)
    cluster_section = pcluster_config.get_section("cluster")

    cluster_instances = list(_get_ec2_instances(stack_name))
    if cluster_section.get_param_value("scheduler") != "awsbatch":
        cluster_instances += _get_asg_instances(stack_name)

    for entry in cluster_instances:
        LOGGER.info("%s %s", entry[0], entry[1])

    if cluster_section.get_param_value("scheduler") == "awsbatch":
        LOGGER.info("Run 'awsbhosts --cluster %s' to list the compute instances", args.cluster_name)
def dcv_connect(args):
    """
    Execute pcluster dcv connect command.

    The DCV session URL is retrieved through an ssh command executed on the head node (up to 4
    attempts); it is then either logged (``args.show_url``) or opened in the web browser.

    :param args: pcluster cli arguments.
    """
    # Parse configuration file to read the AWS section
    PclusterConfig.init_aws()  # FIXME it always searches for the default configuration file

    # Prepare ssh command to execute in the head node instance
    stack = get_stack(get_stack_name(args.cluster_name))
    shared_dir = get_cfn_param(stack.get("Parameters"), "SharedDir")
    head_node_ip, username = get_head_node_ip_and_username(args.cluster_name)

    if args.key_path:
        key_option = "-i {0}".format(args.key_path)
    else:
        key_option = ""
    ssh_template = 'ssh {CFN_USER}@{HEAD_NODE_IP} {KEY} "{REMOTE_COMMAND} {DCV_SHARED_DIR}"'
    cmd = ssh_template.format(
        CFN_USER=username,
        HEAD_NODE_IP=head_node_ip,
        KEY=key_option,
        REMOTE_COMMAND=DCV_CONNECT_SCRIPT,
        DCV_SHARED_DIR=shared_dir,
    )

    try:
        url = retry(_retrieve_dcv_session_url, func_args=[cmd, args.cluster_name, head_node_ip], attempts=4)
        url_message = "Please use the following one-time URL in your browser within 30 seconds:\n{0}".format(url)
    except DCVConnectionError as e:
        failure_template = (
            "Something went wrong during DCV connection.\n{0}"
            "Please check the logs in the /var/log/parallelcluster/ folder "
            "of the head node and submit an issue {1}\n"
        )
        error(failure_template.format(e, PCLUSTER_ISSUES_LINK))

    if args.show_url:
        # The caller only asked for the URL: do not open the browser
        LOGGER.info(url_message)
        return

    try:
        browser_opened = webbrowser.open_new(url)
        if not browser_opened:
            raise webbrowser.Error("Unable to open the Web browser.")
    except webbrowser.Error as e:
        LOGGER.info("{0}\n{1}".format(e, url_message))
def instances(args):
    """
    Log the instances of the cluster, one per line.

    The head node is listed as "MasterServer" when found; compute instances are added unless the
    scheduler read from the stack parameters is awsbatch, in which case a hint to the awsbhosts
    command is logged instead.

    :param args: command line arguments; cluster_name and config_file are read
    """
    stack_name = utils.get_stack_name(args.cluster_name)
    PclusterConfig.init_aws(config_file=args.config_file)
    cfn_stack = utils.get_stack(stack_name)
    scheduler = utils.get_cfn_param(cfn_stack.get("Parameters"), "Scheduler")
    is_batch = scheduler == "awsbatch"

    cluster_instances = []
    head_node_server = utils.describe_cluster_instances(stack_name, node_type=utils.NodeType.head_node)
    if head_node_server:
        head_node_id = head_node_server[0].get("InstanceId")
        cluster_instances.append(("MasterServer", head_node_id))

    if not is_batch:
        cluster_instances += _get_compute_instances(stack_name)

    for entry in cluster_instances:
        LOGGER.info("%s %s", entry[0], entry[1])

    if is_batch:
        LOGGER.info("Run 'awsbhosts --cluster %s' to list the compute instances", args.cluster_name)
def dcv_connect(args):
    """
    Execute pcluster dcv connect command.

    An ssh command is executed on the master instance to prepare a DCV session; the port, session id
    and session token parsed from its output are used to build the URL opened in the web browser.

    :param args: pcluster cli arguments.
    """
    # Parse configuration file to read the AWS section
    PclusterConfig.init_aws()  # FIXME it always searches for the default configuration file

    # Prepare ssh command to execute in the master instance
    stack = get_stack(get_stack_name(args.cluster_name))
    shared_dir = get_cfn_param(stack.get("Parameters"), "SharedDir")
    master_ip, username = get_master_ip_and_username(args.cluster_name)

    if args.key_path:
        key_option = "-i {0}".format(args.key_path)
    else:
        key_option = ""
    ssh_template = 'ssh {CFN_USER}@{MASTER_IP} {KEY} "{REMOTE_COMMAND} {DCV_SHARED_DIR}"'
    cmd = ssh_template.format(
        CFN_USER=username,
        MASTER_IP=master_ip,
        KEY=key_option,
        REMOTE_COMMAND=DCV_CONNECT_SCRIPT,
        DCV_SHARED_DIR=shared_dir,
    )

    # Connect by ssh to the master instance and prepare DCV session
    try:
        LOGGER.debug("SSH command: {0}".format(cmd))
        output = _check_command_output(cmd)
        # At first ssh connection, the ssh command alerts it is adding the host to the known hosts list
        first_connection = re.search("Permanently added .* to the list of known hosts.", output)
        if first_connection:
            output = _check_command_output(cmd)

        dcv_parameters = re.search(
            r"PclusterDcvServerPort=([\d]+) PclusterDcvSessionId=([\w]+) PclusterDcvSessionToken=([\w-]+)", output
        )
        if not dcv_parameters:
            error(
                "Something went wrong during DCV connection. Please manually execute the command:\n{0}\n"
                "If the problem persists, please check the logs in the /var/log/parallelcluster/ folder "
                "of the master instance and submit an issue {1}.".format(cmd, PCLUSTER_ISSUES_LINK)
            )
        else:
            # The pattern has exactly three groups: port, session id, session token
            dcv_server_port, dcv_session_id, dcv_session_token = dcv_parameters.groups()
    except sub.CalledProcessError as e:
        missing_script_marker = "{0}: No such file or directory".format(DCV_CONNECT_SCRIPT)
        if missing_script_marker in e.output:
            error(
                "The cluster {0} has been created with an old version of ParallelCluster "
                "without the DCV support.".format(args.cluster_name)
            )
        else:
            error("Something went wrong during DCV connection.\n{0}".format(e.output))

    # Open web browser
    url = "https://{IP}:{PORT}?authToken={TOKEN}#{SESSION_ID}".format(
        IP=master_ip, PORT=dcv_server_port, TOKEN=dcv_session_token, SESSION_ID=dcv_session_id
    )
    try:
        webbrowser.open_new(url)
    except webbrowser.Error:
        LOGGER.info(
            "Unable to open the Web browser. "
            "Please use the following URL in your browser within 30 seconds:\n{0}".format(url)
        )
def create(args):  # noqa: C901 FIXME!!!
    """
    Create the CloudFormation stack of a new cluster.

    The configuration is built and validated from the command line arguments, converted to
    CloudFormation parameters and submitted through create_stack. With the awsbatch scheduler a
    bucket with the batch resources is created first; it is deleted if the creation fails.

    :param args: command line arguments; cluster_name, config_file, cluster_template, template_url,
        tags, extra_parameters, norollback and nowait are read
    """
    LOGGER.info("Beginning cluster creation for cluster: %s", args.cluster_name)
    LOGGER.debug("Building cluster config based on args %s", str(args))

    # Build the config based on args
    pcluster_config = PclusterConfig(
        config_file=args.config_file, cluster_label=args.cluster_template, fail_on_file_absence=True
    )
    pcluster_config.validate()

    # get CFN parameters, template url and tags from config
    cluster_section = pcluster_config.get_section("cluster")
    cfn_params = pcluster_config.to_cfn()

    _check_for_updates(pcluster_config)

    batch_temporary_bucket = None

    def _cleanup_and_fail():
        # Remove the temporary batch bucket, if any, then exit with an error code
        if batch_temporary_bucket:
            utils.delete_s3_bucket(bucket_name=batch_temporary_bucket)
        sys.exit(1)

    try:
        cfn_client = boto3.client("cloudformation")
        stack_name = utils.get_stack_name(args.cluster_name)

        # If scheduler is awsbatch create bucket with resources
        if cluster_section.get_param_value("scheduler") == "awsbatch":
            batch_resources = pkg_resources.resource_filename(__name__, "resources/batch")
            batch_temporary_bucket = _create_bucket_with_batch_resources(
                stack_name=stack_name, resources_dir=batch_resources, region=pcluster_config.region
            )
            cfn_params["ResourcesS3Bucket"] = batch_temporary_bucket

        LOGGER.info("Creating stack named: %s", stack_name)
        LOGGER.debug(cfn_params)

        # determine the CloudFormation Template URL to use
        template_url = _evaluate_pcluster_template_url(pcluster_config, preferred_template_url=args.template_url)

        # merge tags from configuration, command-line and internal ones
        tags = _evaluate_tags(pcluster_config, preferred_tags=args.tags)

        # append extra parameters from command-line
        if args.extra_parameters:
            LOGGER.debug("Adding extra parameters to the CFN parameters")
            cfn_params.update(dict(args.extra_parameters))

        # prepare input parameters for stack creation and create the stack
        LOGGER.debug(cfn_params)
        params = []
        for key, value in cfn_params.items():
            params.append({"ParameterKey": key, "ParameterValue": value})

        stack = cfn_client.create_stack(
            StackName=stack_name,
            TemplateURL=template_url,
            Parameters=params,
            Capabilities=["CAPABILITY_IAM"],
            DisableRollback=args.norollback,
            Tags=tags,
        )
        LOGGER.debug("StackId: %s", stack.get("StackId"))

        if args.nowait:
            stack_status = utils.get_stack(stack_name, cfn_client).get("StackStatus")
            LOGGER.info("Status: %s", stack_status)
        else:
            utils.verify_stack_creation(stack_name, cfn_client)
            LOGGER.info("")
            result_stack = utils.get_stack(stack_name, cfn_client)
            _print_stack_outputs(result_stack)
    except ClientError as e:
        LOGGER.critical(e.response.get("Error").get("Message"))
        sys.stdout.flush()
        _cleanup_and_fail()
    except KeyboardInterrupt:
        LOGGER.info("\nExiting...")
        sys.exit(0)
    except KeyError as e:
        LOGGER.critical("ERROR: KeyError - reason:")
        LOGGER.critical(e)
        _cleanup_and_fail()
    except Exception as e:
        LOGGER.critical(e)
        _cleanup_and_fail()
def stack_name(self):
    """
    Get the name of the stack this patch is referred to.

    :return: the stack name derived from the base config cluster name, or None when the base config
        has no cluster_name attribute
    """
    if not hasattr(self.base_config, "cluster_name"):
        return None
    return utils.get_stack_name(self.base_config.cluster_name)
def stop(self, args, pcluster_config):
    """
    Stop the compute fleet.

    The limits of the cluster Auto Scaling Group are set to min/max/desired = 0/0/0.

    :param args: command line arguments; cluster_name is read
    :param pcluster_config: not used by this implementation
    """
    cluster_name = args.cluster_name
    LOGGER.info("Stopping compute fleet: %s", cluster_name)
    asg_name = utils.get_asg_name(utils.get_stack_name(cluster_name))
    utils.set_asg_limits(asg_name=asg_name, min=0, max=0, desired=0)
def create(args):  # noqa: C901 FIXME!!!
    """
    Create the CloudFormation stack of a new cluster.

    The cluster name is validated, the configuration is built, validated and converted (SIT -> HIT
    when needed), the S3 resources are set up and the stack is created through create_stack.
    Unless ``args.nowait`` is set, the stack creation is verified and the stack outputs are printed.
    The S3 resources set up for the cluster are cleaned up when the creation fails; on keyboard
    interrupt they are cleaned up only if the stack does not exist yet.

    :param args: command line arguments; cluster_name, config_file, cluster_template, template_url,
        tags, extra_parameters, norollback and nowait are read
    """
    LOGGER.info("Beginning cluster creation for cluster: %s", args.cluster_name)
    LOGGER.debug("Building cluster config based on args %s", str(args))

    _validate_cluster_name(args.cluster_name)

    # Build the config based on args
    pcluster_config = PclusterConfig(
        config_file=args.config_file, cluster_label=args.cluster_template, fail_on_file_absence=True
    )
    pcluster_config.validate()

    # Automatic SIT -> HIT conversion, if needed
    HitConverter(pcluster_config).convert()

    # get CFN parameters, template url and tags from config
    storage_data = pcluster_config.to_storage()
    cfn_params = storage_data.cfn_params

    _check_for_updates(pcluster_config)

    bucket_name = None
    artifact_directory = None
    cleanup_bucket = False
    # Bound before entering the try block: the KeyboardInterrupt handler reads it, and an interrupt
    # raised before the assignment would otherwise end in an UnboundLocalError.
    stack_name = utils.get_stack_name(args.cluster_name)
    try:
        cfn_client = boto3.client("cloudformation")

        # merge tags from configuration, command-line and internal ones
        tags = _evaluate_tags(pcluster_config, preferred_tags=args.tags)

        bucket_name, artifact_directory, cleanup_bucket = _setup_bucket_with_resources(
            pcluster_config, storage_data, stack_name, tags
        )
        cfn_params["ResourcesS3Bucket"] = bucket_name
        cfn_params["ArtifactS3RootDirectory"] = artifact_directory
        cfn_params["RemoveBucketOnDeletion"] = str(cleanup_bucket)

        LOGGER.info("Creating stack named: %s", stack_name)

        # determine the CloudFormation Template URL to use
        template_url = evaluate_pcluster_template_url(pcluster_config, preferred_template_url=args.template_url)

        # append extra parameters from command-line
        if args.extra_parameters:
            LOGGER.debug("Adding extra parameters to the CFN parameters")
            cfn_params.update(dict(args.extra_parameters))

        # prepare input parameters for stack creation and create the stack
        LOGGER.debug(cfn_params)
        params = [{"ParameterKey": key, "ParameterValue": value} for key, value in cfn_params.items()]
        stack = cfn_client.create_stack(
            StackName=stack_name,
            TemplateURL=template_url,
            Parameters=params,
            Capabilities=["CAPABILITY_IAM"],
            DisableRollback=args.norollback,
            Tags=tags,
        )
        LOGGER.debug("StackId: %s", stack.get("StackId"))

        if not args.nowait:
            verified = utils.verify_stack_creation(stack_name, cfn_client)
            LOGGER.info("")
            result_stack = utils.get_stack(stack_name, cfn_client)
            _print_stack_outputs(result_stack)
            # Outputs are printed even when the verification failed; only the exit code changes
            if not verified:
                sys.exit(1)
        else:
            stack_status = utils.get_stack(stack_name, cfn_client).get("StackStatus")
            LOGGER.info("Status: %s", stack_status)
    except ClientError as e:
        LOGGER.critical(e.response.get("Error").get("Message"))
        sys.stdout.flush()
        utils.cleanup_s3_resources(bucket_name, artifact_directory, cleanup_bucket)
        sys.exit(1)
    except KeyboardInterrupt:
        LOGGER.info("\nExiting...")
        if not utils.stack_exists(stack_name):
            # Cleanup S3 artifacts if stack is not created yet
            utils.cleanup_s3_resources(bucket_name, artifact_directory, cleanup_bucket)
        sys.exit(0)
    except KeyError as e:
        LOGGER.critical("ERROR: KeyError - reason:\n%s", e)
        utils.cleanup_s3_resources(bucket_name, artifact_directory, cleanup_bucket)
        sys.exit(1)
    except Exception as e:
        LOGGER.critical(e)
        utils.cleanup_s3_resources(bucket_name, artifact_directory, cleanup_bucket)
        sys.exit(1)
def stop(self, args, pcluster_config):
    """
    Stop the compute fleet.

    The AWS Batch compute environment of the cluster is stopped.

    :param args: command line arguments; cluster_name is read
    :param pcluster_config: not used by this implementation
    """
    cluster_name = args.cluster_name
    LOGGER.info("Disabling AWS Batch compute environment : %s", cluster_name)
    batch_ce_name = utils.get_batch_ce(utils.get_stack_name(cluster_name))
    self._stop_batch_ce(ce_name=batch_ce_name)
def update(args):  # noqa: C901 FIXME!!!
    """
    Update the CloudFormation stack of the cluster with the parameters of the configuration file.

    For non-awsbatch schedulers the current desired capacity of the Auto Scaling Group is preserved
    unless ``args.reset_desired`` is set; for awsbatch the ResourcesS3Bucket parameter of the
    existing stack is preserved. Unless ``args.nowait`` is set, the stack is polled while it is in
    UPDATE_IN_PROGRESS.

    :param args: command line arguments; cluster_name, config_file, cluster_template, reset_desired,
        extra_parameters and nowait are read
    """
    LOGGER.info("Updating: %s", args.cluster_name)
    stack_name = utils.get_stack_name(args.cluster_name)
    pcluster_config = PclusterConfig(
        config_file=args.config_file, cluster_label=args.cluster_template, fail_on_file_absence=True
    )
    pcluster_config.validate()
    cfn_params = pcluster_config.to_cfn()

    cluster_section = pcluster_config.get_section("cluster")
    cfn = boto3.client("cloudformation")

    if cluster_section.get_param_value("scheduler") == "awsbatch":
        if args.reset_desired:
            LOGGER.info("reset_desired flag does not work with awsbatch scheduler")
        # Carry over the resources bucket of the existing stack
        for parameter in utils.get_stack(stack_name, cfn).get("Parameters"):
            if parameter.get("ParameterKey") == "ResourcesS3Bucket":
                cfn_params["ResourcesS3Bucket"] = parameter.get("ParameterValue")
    elif not args.reset_desired:
        # Keep the capacity the Auto Scaling Group currently has
        asg_name = _get_asg_name(stack_name)
        asg_description = boto3.client("autoscaling").describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name])
        desired_capacity = asg_description.get("AutoScalingGroups")[0].get("DesiredCapacity")
        cfn_params["DesiredSize"] = str(desired_capacity)

    try:
        LOGGER.debug(cfn_params)
        if args.extra_parameters:
            LOGGER.debug("Adding extra parameters to the CFN parameters")
            cfn_params.update(dict(args.extra_parameters))

        stack_parameters = []
        for key, value in cfn_params.items():
            stack_parameters.append({"ParameterKey": key, "ParameterValue": value})
        cfn_params = stack_parameters

        LOGGER.info("Calling update_stack")
        cfn.update_stack(
            StackName=stack_name, UsePreviousTemplate=True, Parameters=cfn_params, Capabilities=["CAPABILITY_IAM"]
        )
        stack_status = utils.get_stack(stack_name, cfn).get("StackStatus")

        if args.nowait:
            stack_status = utils.get_stack(stack_name, cfn).get("StackStatus")
            LOGGER.info("Status: %s", stack_status)
        else:
            while stack_status == "UPDATE_IN_PROGRESS":
                stack_status = utils.get_stack(stack_name, cfn).get("StackStatus")
                latest_event = cfn.describe_stack_events(StackName=stack_name).get("StackEvents")[0]
                progress = "Status: %s - %s" % (
                    latest_event.get("LogicalResourceId"),
                    latest_event.get("ResourceStatus"),
                )
                sys.stdout.write("\r%s" % progress.ljust(80))
                sys.stdout.flush()
                time.sleep(5)
    except ClientError as e:
        LOGGER.critical(e.response.get("Error").get("Message"))
        sys.exit(1)
    except KeyboardInterrupt:
        LOGGER.info("\nExiting...")
        sys.exit(0)
def test_get_stack_name():
    """Test utils.get_stack_name."""
    actual_stack_name = utils.get_stack_name(FAKE_CLUSTER_NAME)
    # The stack name is the cluster name with the "parallelcluster-" prefix
    assert_that(actual_stack_name).is_equal_to("parallelcluster-{0}".format(FAKE_CLUSTER_NAME))
"""This module provides unit tests for the functions in the pcluster.utils module.""" import json import pytest import pcluster.utils as utils from assertpy import assert_that from tests.common import MockedBoto3Request FAKE_CLUSTER_NAME = "cluster_name" FAKE_STACK_NAME = utils.get_stack_name(FAKE_CLUSTER_NAME) STACK_TYPE = "AWS::CloudFormation::Stack" @pytest.fixture() def boto3_stubber_path(): """Specify that boto3_mocker should stub calls to boto3 for the pcluster.utils module.""" return "pcluster.utils.boto3" def test_get_stack_name(): """Test utils.get_stack_name.""" expected_stack_name = "parallelcluster-{0}".format(FAKE_CLUSTER_NAME) assert_that(utils.get_stack_name(FAKE_CLUSTER_NAME)).is_equal_to( expected_stack_name) @pytest.mark.parametrize( "template_body,error_message", [
def execute(args):
    """
    Execute the update of a cluster.

    The configuration currently deployed (read from CloudFormation) is compared with the one from the
    configuration file; when the cluster models and the changes are accepted, the cluster resources
    are uploaded to the cluster S3 bucket and the stack is updated. Otherwise the update is aborted
    with exit code 1.

    :param args: command line arguments; cluster_name, config_file and cluster_template are read
    """
    LOGGER.info("Retrieving configuration from CloudFormation for cluster {0}...".format(args.cluster_name))
    base_config = PclusterConfig(config_file=args.config_file, cluster_name=args.cluster_name)
    stack_status = base_config.cfn_stack.get("StackStatus")
    if "IN_PROGRESS" in stack_status:
        utils.error("Cannot execute update while stack is in {} status.".format(stack_status))

    LOGGER.info("Validating configuration file {0}...".format(args.config_file or ""))
    stack_name = utils.get_stack_name(args.cluster_name)
    target_config = PclusterConfig(
        config_file=args.config_file, cluster_label=args.cluster_template, fail_on_file_absence=True
    )
    target_config.validate()

    update_accepted = _check_cluster_models(base_config, target_config, args.cluster_template) and _check_changes(
        args, base_config, target_config
    )
    if not update_accepted:
        LOGGER.info("Update aborted.")
        sys.exit(1)
        return

    # Update base config settings
    base_config.update(target_config)

    cfn_params = base_config.to_cfn()
    cfn_client = boto3.client("cloudformation")
    _restore_cfn_only_params(cfn_client, args, cfn_params, stack_name, target_config)

    s3_bucket_name = cfn_params["ResourcesS3Bucket"]
    tags = _get_target_config_tags_list(target_config)
    artifact_directory = cfn_params["ArtifactS3RootDirectory"]

    is_hit = utils.is_hit_enabled_cluster(base_config.cfn_stack)
    template_url = None
    if is_hit:
        try:
            upload_hit_resources(
                s3_bucket_name, artifact_directory, target_config, target_config.to_storage().json_params, tags
            )
        except Exception:
            utils.error("Failed when uploading resources to cluster S3 bucket {0}".format(s3_bucket_name))
        template_url = evaluate_pcluster_template_url(target_config)

    try:
        upload_dashboard_resource(
            s3_bucket_name,
            artifact_directory,
            target_config,
            target_config.to_storage().json_params,
            target_config.to_storage().cfn_params,
        )
    except Exception:
        utils.error("Failed when uploading the dashboard resource to cluster S3 bucket {0}".format(s3_bucket_name))

    # The previous template is reused unless the cluster is HIT enabled
    _update_cluster(
        args,
        cfn_client,
        cfn_params,
        stack_name,
        use_previous_template=not is_hit,
        template_url=template_url,
        tags=tags,
    )