def get_compute_nodes_allocation(scheduler_commands, region, stack_name, max_monitoring_time):
    """
    Watch periodically the number of compute nodes in the cluster.

    :return: (asg_capacity_time_series, compute_nodes_time_series, timestamps): three lists describing
        the variation over time in the number of compute nodes and the timestamp when these fluctuations occurred.
        asg_capacity_time_series describes the variation in the desired asg capacity. compute_nodes_time_series
        describes the variation in the number of compute nodes seen by the scheduler. timestamps describes the
        time since epoch when the variations occurred.
    """
    asg_capacity_time_series = []
    compute_nodes_time_series = []
    timestamps = []

    @retry(
        # Retry until ASG and Scheduler capacities scale down to 0
        # Also make sure cluster scaled up before scaling down
        retry_on_result=lambda _: asg_capacity_time_series[-1] != 0
        or compute_nodes_time_series[-1] != 0
        or max(asg_capacity_time_series) == 0
        or max(compute_nodes_time_series) == 0,
        wait_fixed=seconds(20),
        stop_max_delay=max_monitoring_time,
    )
    def _watch_compute_nodes_allocation():
        compute_nodes = scheduler_commands.compute_nodes_count()
        asg_capacity = _get_desired_asg_capacity(region, stack_name)
        timestamp = time.time()

        # add values only if there is a transition.
        if (
            len(asg_capacity_time_series) == 0
            or asg_capacity_time_series[-1] != asg_capacity
            or compute_nodes_time_series[-1] != compute_nodes
        ):
            asg_capacity_time_series.append(asg_capacity)
            compute_nodes_time_series.append(compute_nodes)
            timestamps.append(timestamp)

    try:
        _watch_compute_nodes_allocation()
    except RetryError:
        # ignoring this error in order to perform assertions on the collected data.
        pass

    logging.info(
        "Monitoring completed: %s, %s, %s",
        "asg_capacity_time_series [" + " ".join(map(str, asg_capacity_time_series)) + "]",
        "compute_nodes_time_series [" + " ".join(map(str, compute_nodes_time_series)) + "]",
        "timestamps [" + " ".join(map(str, timestamps)) + "]",
    )
    return asg_capacity_time_series, compute_nodes_time_series, timestamps
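# Usage sketch (illustrative helper, not part of the original suite): assert that the cluster
# scaled up and then back down to zero using the time series collected above.
def _assert_scaled_up_then_down(scheduler_commands, region, stack_name, max_monitoring_time):
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands, region, stack_name, max_monitoring_time
    )
    assert_that(max(asg_capacity_time_series)).is_greater_than(0)
    assert_that(max(compute_nodes_time_series)).is_greater_than(0)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)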
class TorqueCommands(SchedulerCommands):
    """Implement commands for torque scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(
        retry_on_result=lambda result: "job_state = C" not in result,
        wait_fixed=seconds(3),
        stop_max_delay=minutes(12),
    )
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qstat -f {0}".format(job_id))
        return result.stdout

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qstat -f {0}".format(job_id))
        match = re.search(r"exit_status = (\d+)", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, qsub_output):  # noqa: D102
        __tracebackhide__ = True
        # qsub_output is the id of the job in case of successful submissions
        id = qsub_output
        # check that the job exists
        self._remote_command_executor.run_remote_command("qstat -f {0}".format(id))
        return id

    def submit_command(self, command, nodes=1, slots=None, after_ok=None):  # noqa: D102
        flags = "-l nodes={0}:ppn={1}".format(nodes or 1, slots or 1)
        if after_ok:
            flags += " -W depend=afterok:{0}".format(after_ok)
        return self._remote_command_executor.run_remote_command(
            "echo '{0}' | qsub {1}".format(command, flags), raise_on_error=False
        )

    def submit_script(self, script, script_args=None, nodes=1, slots=None, additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        script_name = os.path.basename(script)
        additional_files.append(script)
        flags = "-l nodes={0}:ppn={1}".format(nodes or 1, slots or 1)
        if script_args:
            flags += ' -F "{0}"'.format(" ".join(script_args))
        return self._remote_command_executor.run_remote_command(
            "qsub {0} {1}".format(flags, script_name), additional_files=additional_files
        )

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "echo $(( $(/opt/torque/bin/pbsnodes -l all | wc -l) - 1))"
        )
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "pbsnodes -l all | grep -v $(hostname) | awk '{print $1}'"
        )
        return result.stdout.splitlines()

    @retry(retry_on_result=lambda result: "offline" not in result, wait_fixed=seconds(5), stop_max_delay=minutes(5))
    def wait_for_locked_node(self):  # noqa: D102
        # discard the first node since that is the master server
        return self._remote_command_executor.run_remote_command(
            r'pbsnodes | grep -e "\sstate = " | tail -n +2'
        ).stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command("pbsnodes | tail -n +10")
        return re.search(r"np = (\d+)", result.stdout).group(1)
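# Usage sketch (illustrative): submit a simple job through TorqueCommands and verify it
# completes successfully; the sleep command and node/slot counts are placeholder values.
def _run_simple_torque_job(remote_command_executor):
    torque_commands = TorqueCommands(remote_command_executor)
    result = torque_commands.submit_command("sleep 60", nodes=1, slots=2)
    job_id = torque_commands.assert_job_submitted(result.stdout)
    torque_commands.wait_job_completed(job_id)
    torque_commands.assert_job_succeeded(job_id)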
class SlurmCommands(SchedulerCommands):
    """Implement commands for slurm scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(
        retry_on_result=lambda result: "JobState" not in result
        or any(value in result for value in ["EndTime=Unknown", "JobState=RUNNING", "JobState=COMPLETING"]),
        wait_fixed=seconds(3),
        stop_max_delay=minutes(7),
    )
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id), raise_on_error=False
        )
        return result.stdout

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"ExitCode=(.+?) ", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, sbatch_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Submitted batch job ([0-9]+)", sbatch_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1, slots=None, host=None, after_ok=None, other_options=None):  # noqa: D102
        submission_command = "sbatch --wrap='{0}'".format(command)
        if nodes > 0:
            submission_command += " -N {0}".format(nodes)
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if after_ok:
            submission_command += " -d afterok:{0}".format(after_ok)
        if other_options:
            submission_command += " {0}".format(other_options)
        return self._remote_command_executor.run_remote_command(submission_command)

    def submit_script(self, script, script_args=None, nodes=1, slots=None, host=None, additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        if not script_args:
            script_args = []
        additional_files.append(script)
        script_name = os.path.basename(script)
        submission_command = "sbatch"
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if nodes > 1:
            submission_command += " -N {0}".format(nodes)
        # only the script name and its arguments are appended to the sbatch command
        submission_command += " {0} {1}".format(script_name, " ".join(script_args))
        return self._remote_command_executor.run_remote_command(
            submission_command, additional_files=additional_files
        )

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        assert_that(result.stdout).contains("JobState=COMPLETED")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("sinfo --Node --noheader | grep compute | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "sinfo --Node --noheader | grep compute | awk '{print $1}'"
        )
        return result.stdout.splitlines()

    @retry(retry_on_result=lambda result: "drain" not in result, wait_fixed=seconds(3), stop_max_delay=minutes(5))
    def wait_for_locked_node(self):  # noqa: D102
        return self._remote_command_executor.run_remote_command("/opt/slurm/bin/sinfo -h -o '%t'").stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command("/opt/slurm/bin/sinfo -o '%c' -h")
        return re.search(r"(\d+)", result.stdout).group(1)

    def get_job_info(self, job_id):
        """Return job details from slurm."""
        return self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)).stdout
class SgeCommands(SchedulerCommands):
    """Implement commands for sge scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result != 0, wait_fixed=seconds(3), stop_max_delay=minutes(7))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qacct -j {0}".format(job_id), raise_on_error=False)
        return result.return_code

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qacct -j {0}".format(job_id))
        match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
        assert_that(match).is_not_none()
        return match.group(1)

    def assert_job_submitted(self, qsub_output, is_array=False):  # noqa: D102
        __tracebackhide__ = True
        if is_array:
            regex = r"Your job-array ([0-9]+)\.[0-9\-:]+ \(.+\) has been submitted"
        else:
            regex = r"Your job ([0-9]+) \(.+\) has been submitted"
        match = re.search(regex, qsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1, slots=None, hold=False, after_ok=None):  # noqa: D102
        flags = ""
        if nodes > 1:
            slots = nodes * slots
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        if hold:
            flags += "-h "
        if after_ok:
            flags += "-hold_jid {0} ".format(after_ok)
        return self._remote_command_executor.run_remote_command(
            "echo '{0}' | qsub {1}".format(command, flags), raise_on_error=False
        )

    def submit_script(self, script, script_args=None, nodes=1, slots=None, additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        if not script_args:
            script_args = []
        additional_files.append(script)
        flags = ""
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        script_name = os.path.basename(script)
        return self._remote_command_executor.run_remote_command(
            "qsub {0} {1} {2}".format(flags, script_name, " ".join(script_args)), additional_files=additional_files
        )

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qhost | grep -o ip- | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qhost | grep ip- | awk '{print $1}'")
        return result.stdout.splitlines()

    @retry(
        retry_on_result=lambda result: "<state>d</state>" not in result,
        wait_fixed=seconds(3),
        stop_max_delay=minutes(5),
    )
    def wait_for_locked_node(self):  # noqa: D102
        return self._remote_command_executor.run_remote_command("qstat -f -xml").stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command("qhost -F | grep hl:m_core")
        return re.search(r"hl:m_core=(\d+).000000", result.stdout).group(1)
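# Usage sketch (illustrative): note how SgeCommands.submit_command handles slots for
# multi-node jobs, multiplying slots by the node count before building the qsub flags.
# The command and counts below are placeholder values.
def _submit_two_node_sge_job(remote_command_executor):
    sge_commands = SgeCommands(remote_command_executor)
    # results in: echo 'mpirun hostname' | qsub -pe mpi 8
    result = sge_commands.submit_command("mpirun hostname", nodes=2, slots=4)
    job_id = sge_commands.assert_job_submitted(result.stdout)
    sge_commands.wait_job_completed(job_id)
    return job_id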
class EBSSnapshotsFactory:
    """Manage creation and destruction of volume snapshots."""

    def __init__(self):
        self.config = None
        self.instance = None
        self.volume = None
        self.snapshot = None
        self.security_group_id = None
        self.ec2 = None
        self.boto_client = None

    def create_snapshot(self, request, subnet_id, region):
        """
        Create a snapshot in a given region.

        :param request: The current request
        :param subnet_id: The subnet id where to get the snapshot
        :param region: The region where to get the snapshot
        """
        # Only one snapshot creation per factory allowed
        if self.snapshot:
            raise Exception("Snapshot already created")
        self.ec2 = boto3.resource("ec2", region_name=region)
        self.boto_client = boto3.client("ec2", region_name=region)
        snapshot_config = SnapshotConfig(
            request.config.getoption("key_path"),
            request.config.getoption("key_name"),
            self.ec2.Subnet(subnet_id).vpc_id,
            subnet_id,
        )
        self.snapshot = self._create_snapshot(region, snapshot_config)
        return self.snapshot.id

    def create_existing_volume(self, request, subnet_id, region):
        """
        Create a volume in a given region.

        :param request: The current request
        :param subnet_id: The subnet id where to get the snapshot
        :param region: The region where to get the snapshot
        """
        # Only one volume creation per factory allowed
        if self.volume:
            raise Exception("Volume already created")
        self.ec2 = boto3.resource("ec2", region_name=region)
        self.boto_client = boto3.client("ec2", region_name=region)
        volume_config = SnapshotConfig(
            request.config.getoption("key_path"),
            request.config.getoption("key_name"),
            self.ec2.Subnet(subnet_id).vpc_id,
            subnet_id,
        )
        self._create_volume_process(region, volume_config)
        return self.volume.id

    def _create_volume_process(self, region, snapshot_config):
        self.config = snapshot_config
        ami_id = self._get_amazonlinux2_ami()
        self.security_group_id = self._get_security_group_id()
        subnet = self.ec2.Subnet(self.config.head_node_subnet_id)
        # Create a new volume and attach to the instance
        self.volume = self._create_volume(subnet)
        self.instance = self._launch_instance(ami_id, subnet)
        self._attach_volume()
        # Open ssh connection
        self.ssh_conn = self._open_ssh_connection()
        # Partitions the disk with a gpt table and 1 single partition inside
        self._format_volume(self.ssh_conn)
        # Stops the instance before taking the snapshot
        self._release_instance()

    def _create_snapshot(self, region, snapshot_config):
        self._create_volume_process(region, snapshot_config)
        self.snapshot = self._create_volume_snapshot()
        return self.snapshot

    def _create_volume_snapshot(self):
        logging.info("creating snapshot...")
        snapshot = self.ec2.create_snapshot(Description="parallelcluster-test-snapshot", VolumeId=self.volume.id)
        while snapshot.state == "pending":
            time.sleep(10)
            snapshot = self.ec2.Snapshot(snapshot.id)
        logging.info("Snapshot ready: %s" % snapshot.id)
        return snapshot

    def _format_volume(self, ssh_conn):
        logging.info("Partitioning device...")
        ssh_conn.run(
            "sudo sh -c 'echo -e \"g\nn\np\n1\n\n\nw\" | fdisk /dev/sdf'", warn=True, pty=False, hide=False
        )
        # Finds out the device name of the volume
        logging.info("Finding device name...")
        device_name = ssh_conn.run("readlink -f /dev/sdf").stdout.strip()
        # formats the 1st partition of disk
        logging.info("Formatting 1st partition...")
        ssh_conn.run("sudo sh -c 'mkfs.ext4 {}1'".format(device_name))
        logging.info("Mounting partition...")
        ssh_conn.run("sudo mkdir /mnt/tmp")
        ssh_conn.run("sudo mount {}1 /mnt/tmp".format(device_name))
        logging.info("Writing test data...")
        ssh_conn.run("echo 'hello world' | sudo tee -a /mnt/tmp/test.txt")
        logging.info("Device ready")

    def _open_ssh_connection(self):
        tries = 5
        logging.info("Connecting to instance %s " % self.instance.public_ip_address)
        logging.info("ssh_key: %s " % self.config.ssh_key)
        ssh_conn = None
        while tries > 0:
            try:
                ssh_conn = Connection(
                    host=self.instance.public_ip_address,
                    user="******",
                    forward_agent=False,
                    connect_kwargs={"key_filename": [self.config.ssh_key]},
                )
                ssh_conn.open()
                tries = 0
            except BaseException:
                logging.info("SSH connection error - retrying...")
                tries -= 1
                time.sleep(20)
        if (ssh_conn is None) or (not ssh_conn.is_connected):
            raise ConnectionError()
        return ssh_conn

    @retry(retry_on_result=lambda state: state != "attached", wait_fixed=seconds(2), stop_max_delay=minutes(5))
    def _wait_volume_attached(self):
        vol = self.ec2.Volume(self.volume.id)
        attachment_state = next(
            (attachment["State"] for attachment in vol.attachments if attachment["InstanceId"] == self.instance.id),
            "",
        )
        return attachment_state

    def _attach_volume(self):
        result = self.volume.attach_to_instance(InstanceId=self.instance.id, Device="/dev/sdf")
        logging.info("Attach Volume Result: %s", result)
        self._wait_volume_attached()
        logging.info("Volume attached")

    def _create_volume(self, subnet):
        vol = self.ec2.create_volume(
            Size=10,
            Encrypted=False,
            AvailabilityZone=subnet.availability_zone,
            TagSpecifications=[
                {"ResourceType": "volume", "Tags": [{"Key": "name", "Value": "parallel-cluster-test-volume"}]}
            ],
        )
        logging.info("Volume Id: %s" % vol.id)
        # We can check if the volume is now ready and available:
        logging.info("Waiting for the volume to be ready...")
        while vol.state == "creating":
            vol = self.ec2.Volume(vol.id)
            time.sleep(2)
        logging.info("Volume ready")
        return vol

    def _get_security_group_id(self):
        security_group_id = self.boto_client.create_security_group(
            Description="security group for snapshot instance node",
            GroupName="snapshot-" + random_alphanumeric(),
            VpcId=self.config.vpc_id,
        )["GroupId"]
        self.boto_client.authorize_security_group_ingress(
            GroupId=security_group_id,
            IpPermissions=[
                {"IpProtocol": "tcp", "FromPort": 22, "ToPort": 22, "IpRanges": [{"CidrIp": "0.0.0.0/0"}]}
            ],
        )
        return security_group_id

    def _launch_instance(self, ami_id, subnet):
        instance = self.ec2.create_instances(
            ImageId=ami_id,
            KeyName=self.config.key_name,
            MinCount=1,
            MaxCount=1,
            InstanceType="t2.micro",
            NetworkInterfaces=[
                {
                    "SubnetId": subnet.id,
                    "DeviceIndex": 0,
                    "AssociatePublicIpAddress": True,
                    "Groups": [self.security_group_id],
                }
            ],
            TagSpecifications=[
                {"ResourceType": "instance", "Tags": [{"Key": "Name", "Value": "pcluster-snapshot-instance"}]}
            ],
        )[0]
        logging.info("Waiting for instance to be running...")
        while instance.state["Name"] == "pending":
            time.sleep(10)
            instance = self.ec2.Instance(instance.id)
        logging.info("Instance state: %s" % instance.state)
        logging.info("Public dns: %s" % instance.public_dns_name)
        return instance

    def _get_amazonlinux2_ami(self):
        # Finds most recent alinux2 ami in region
        response = self.boto_client.describe_images(
            Owners=["amazon"],
            Filters=[
                {"Name": "name", "Values": ["amzn2-ami-hvm-*"]},
                {"Name": "description", "Values": ["Amazon Linux 2 AMI*"]},
                {"Name": "architecture", "Values": ["x86_64"]},
                {"Name": "root-device-type", "Values": ["ebs"]},
                {"Name": "state", "Values": ["available"]},
            ],
        )
        amis = sorted(response["Images"], key=lambda x: x["CreationDate"], reverse=True)
        return amis[0]["ImageId"]

    def release_all(self):
        """Release all resources."""
        self._release_instance()
        self._release_volume()
        self._release_snapshot()
        self._release_security_group()

    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_snapshot(self):
        if self.snapshot:
            logging.info("Deleting snapshot %s" % self.snapshot.id)
            self.snapshot.delete()

    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_instance(self):
        if self.instance:
            self.instance.terminate()
            logging.info("Waiting for instance to be terminated...")
            while self.instance.state["Name"] != "terminated":
                time.sleep(10)
                self.instance = self.ec2.Instance(self.instance.id)
            logging.info("Instance terminated")
        self.instance = None

    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_volume(self):
        if self.volume:
            logging.info("Deleting volume %s" % self.volume.id)
            self.volume.delete()
        self.volume = None

    def _release_security_group(self):
        if self.security_group_id:
            logging.info("Deleting security group %s" % self.security_group_id)
            self.boto_client.delete_security_group(GroupId=self.security_group_id)
        self.security_group_id = None
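# Usage sketch (illustrative): a pytest-style fixture wrapping EBSSnapshotsFactory so that
# tests create snapshots or volumes on demand and all AWS resources are released at teardown.
# Assumes pytest is imported in the module hosting the fixture.
@pytest.fixture()
def snapshots_factory():
    factory = EBSSnapshotsFactory()
    yield factory
    factory.release_all()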
    _wait_instance_running(ec2_client, [instance_id])
    # Wait instance to complete cloud-init
    _wait_compute_cloudinit_done(command_executor, compute_node)

    return compute_node


def _wait_instance_running(ec2_client, instance_ids):
    """Wait EC2 instance to go running."""
    logging.info(f"Waiting for {instance_ids} to be running")
    ec2_client.get_waiter("instance_running").wait(
        InstanceIds=instance_ids, WaiterConfig={"Delay": 60, "MaxAttempts": 5}
    )


@retry(wait_fixed=seconds(10), stop_max_delay=minutes(3))
def _wait_compute_cloudinit_done(command_executor, compute_node):
    """Wait till cloud-init complete on a given compute node."""
    compute_node_private_ip = compute_node.get("privateIpAddress")
    compute_cloudinit_status_output = command_executor.run_remote_command(
        f"ssh -q {compute_node_private_ip} sudo cloud-init status"
    ).stdout
    assert_that(compute_cloudinit_status_output).contains("status: done")


def _test_event_handler_execution(cluster, region, os, architecture, command_executor, head_node, compute_node):
    """Test event handler execution and environment."""
    head_scheduler_plugin_log_output = command_executor.run_remote_command(
        f"cat {SCHEDULER_PLUGIN_LOG_OUT_PATH}"
    ).stdout
    python_root = command_executor.run_remote_command(f"sudo su - {SCHEDULER_PLUGIN_USER} -c 'which python'").stdout[
    remote_command_executor.run_remote_command(
        "aws s3 cp s3://{bucket_name}/export_dir/file_to_export ./file_to_export".format(bucket_name=bucket_name)
    )
    result = remote_command_executor.run_remote_command("cat ./file_to_export")
    assert_that(result.stdout).is_equal_to("Exported by FSx Lustre")


def _assert_job_submitted(qsub_output):
    __tracebackhide__ = True
    match = re.search(r"Your job ([0-9]+) \(.+\) has been submitted", qsub_output)
    assert_that(match).is_not_none()
    return match.group(1)


@retry(retry_on_result=lambda result: result != 0, wait_fixed=seconds(7), stop_max_delay=minutes(5))
def _wait_job_completed(remote_command_executor, job_id):
    result = remote_command_executor.run_remote_command("qacct -j {0}".format(job_id), raise_on_error=False)
    return result.return_code


def _get_job_exit_status(remote_command_executor, job_id):
    result = remote_command_executor.run_remote_command("qacct -j {0}".format(job_id))
    match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
    assert_that(match).is_not_none()
    return match.group(1)
        raise
    except AttributeError as e:
        LOGGER.critical("Error no attribute {0} in dict: {1}".format(os, e))
        raise
    except IndexError as e:
        LOGGER.critical("Error no ami retrieved: {0}".format(e))
        raise


@retry(stop_max_attempt_number=3, wait_fixed=5000)
def fetch_instance_slots(region, instance_type):
    return get_instance_info(instance_type, region).get("VCpuInfo").get("DefaultVCpus")


@retry(stop_max_attempt_number=10, wait_fixed=seconds(50))
def _assert_ami_is_available(region, ami_id):
    LOGGER.info("Asserting the ami is available")
    ami_state = boto3.client("ec2", region_name=region).describe_images(ImageIds=[ami_id]).get("Images")[0].get("State")
    assert_that(ami_state).is_equal_to("available")


def get_installed_parallelcluster_version():
    """Get the version of the installed aws-parallelcluster package."""
    return pkg_resources.get_distribution("aws-parallelcluster").version


def get_sts_endpoint(region):
    """Get regionalized STS endpoint."""
    return "https://sts.{0}.{1}".format(
    scheduler_commands.submit_script(str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler)))
    instance_id = wait_compute_log(remote_command_executor)

    _assert_compute_logs(remote_command_executor, instance_id)
    assert_instance_replaced_or_terminating(instance_id, region)
    # verify that desired capacity is still 1
    assert_that(get_desired_asg_capacity(region, cluster.cfn_name)).is_equal_to(1)
    _assert_nodes_removed_from_scheduler(scheduler_commands, compute_nodes)

    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])


@retry(wait_fixed=seconds(20), stop_max_delay=minutes(5))
def _assert_nodes_removed_from_scheduler(scheduler_commands, nodes):
    assert_that(scheduler_commands.get_compute_nodes()).does_not_contain(*nodes)


def _assert_compute_logs(remote_command_executor, instance_id):
    remote_command_executor.run_remote_command(
        "tar -xf /home/logs/compute/{0}.tar.gz --directory /tmp".format(instance_id)
    )
    remote_command_executor.run_remote_command("test -f /tmp/var/log/nodewatcher")
    messages_log = remote_command_executor.run_remote_command("cat /tmp/var/log/nodewatcher", hide=True).stdout
    assert_that(messages_log).contains(
        "Node is marked as down by scheduler or not attached correctly. Terminating..."
    image_builder = boto3.client("imagebuilder")
    image_builder.start_image_pipeline_execution(
        imagePipelineArn=image_builder_pipeline,
    )
    response = image_builder.list_image_pipeline_images(
        imagePipelineArn=image_builder_pipeline,
    )
    assert_that(response["imageSummaryList"]).is_length(1)

    image = _wait_for_image_build(image_builder_pipeline)
    logging.info("Image %s", image)
    assert_that(image["state"]["status"]).is_equal_to("AVAILABLE")

    # Wait for 2 minutes for the Lambda to be updated
    time.sleep(120)

    lambda_client = boto3.client("lambda")
    lambda_resource = lambda_client.get_function(FunctionName=lambda_name)
    logging.info("API Lambda %s", lambda_resource)
    assert_that(lambda_resource["Code"]["ImageUri"]).is_equal_to(
        image["outputResources"]["containers"][0]["imageUris"][0]
    )


@retry(
    retry_on_result=lambda result: result["state"]["status"] not in {"AVAILABLE", "CANCELLED", "FAILED", "DELETED"},
    wait_fixed=seconds(10),
    stop_max_delay=minutes(15),
)
def _wait_for_image_build(image_builder_pipeline):
    image_builder = boto3.client("imagebuilder")
    return image_builder.list_image_pipeline_images(
        imagePipelineArn=image_builder_pipeline,
    )["imageSummaryList"][0]
"echo 'Exported by FSx Lustre' > {mount_dir}/file_to_export".format( mount_dir=mount_dir)) remote_command_executor.run_remote_command( "sudo lfs hsm_archive {mount_dir}/file_to_export && sleep 5".format( mount_dir=mount_dir)) remote_command_executor.run_remote_command( "aws s3 cp s3://{bucket_name}/export_dir/file_to_export ./file_to_export" .format(bucket_name=bucket_name)) result = remote_command_executor.run_remote_command("cat ./file_to_export") assert_that(result.stdout).is_equal_to("Exported by FSx Lustre") @retry( retry_on_result=lambda result: result.get("Lifecycle") in ["PENDING", "EXECUTING", "CANCELLING"], wait_fixed=seconds(5), stop_max_delay=minutes(7), ) def poll_on_data_export(task, fsx): logging.info("Data Export Task {task_id}: {status}".format( task_id=task.get("TaskId"), status=task.get("Lifecycle"))) return fsx.describe_data_repository_tasks( TaskIds=[task.get("TaskId")]).get("DataRepositoryTasks")[0] def _test_data_repository_task(remote_command_executor, mount_dir, bucket_name, fsx_fs_id, region): logging.info("Testing fsx lustre data repository task") file_contents = "Exported by FSx Lustre" remote_command_executor.run_remote_command( "echo '{file_contents}' > {mount_dir}/file_to_export".format(
# with the License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
from remote_command_executor import RemoteCommandExecutionError
from retrying import retry
from time_utils import minutes, seconds


@retry(
    retry_on_exception=lambda exception: isinstance(exception, RemoteCommandExecutionError),
    wait_fixed=seconds(30),
    stop_max_delay=minutes(15),
)
def wait_compute_log(remote_command_executor, expected_num_nodes=1):
    """Return list of compute node instance_ids in case of failure."""
    remote_command_executor.run_remote_command("test -d /home/logs/compute", log_error=False)
    output = remote_command_executor.run_remote_command("ls /home/logs/compute/", log_error=False).stdout
    # sample output: "i-049ce596aa69ac988.tar.gz i-064f07c373d926ba4.tar.gz"
    instance_ids = [instance.replace(".tar.gz", "") for instance in output.split()]
    # make sure we got all the expected failing compute nodes
    if len(instance_ids) != expected_num_nodes:
        raise RemoteCommandExecutionError(
class SlurmCommands(SchedulerCommands):
    """Implement commands for slurm scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    def wait_job_completed(self, job_id, timeout=None):  # noqa: D102
        if not timeout:
            timeout = 12

        @retry(
            retry_on_result=lambda result: "JobState" not in result
            or any(
                value in result
                for value in ["EndTime=Unknown", "JobState=RUNNING", "JobState=COMPLETING", "JobState=CONFIGURING"]
            ),
            wait_fixed=seconds(10),
            stop_max_delay=minutes(timeout),
        )
        def _job_status_retryer():
            result = self._remote_command_executor.run_remote_command(
                "scontrol show jobs -o {0}".format(job_id), raise_on_error=False
            )
            return result.stdout

        return _job_status_retryer()

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"ExitCode=(.+?) ", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, sbatch_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Submitted batch job ([0-9]+)", sbatch_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(
        self,
        command,
        nodes=0,
        slots=None,
        host=None,
        after_ok=None,
        partition=None,
        constraint=None,
        other_options=None,
        raise_on_error=True,
    ):
        """Submit job with command."""
        job_submit_command = "--wrap='{0}'".format(command)

        return self._submit_batch_job(
            job_submit_command,
            nodes,
            slots,
            host,
            after_ok,
            partition,
            constraint,
            other_options,
            raise_on_error=raise_on_error,
        )

    def submit_script(
        self,
        script,
        script_args=None,
        nodes=0,
        slots=None,
        host=None,
        after_ok=None,
        partition=None,
        constraint=None,
        other_options=None,
        additional_files=None,
        raise_on_error=True,
    ):
        """Submit job with script."""
        if not additional_files:
            additional_files = []
        if not script_args:
            script_args = []
        additional_files.append(script)
        script_name = os.path.basename(script)
        job_submit_command = " {0} {1}".format(script_name, " ".join(script_args))

        return self._submit_batch_job(
            job_submit_command,
            nodes,
            slots,
            host,
            after_ok,
            partition,
            constraint,
            other_options,
            additional_files,
            raise_on_error=raise_on_error,
        )

    def _submit_batch_job(
        self,
        job_submit_command,
        nodes=0,
        slots=None,
        host=None,
        after_ok=None,
        partition=None,
        constraint=None,
        other_options=None,
        additional_files=None,
        raise_on_error=True,
    ):
        submission_command = "sbatch"
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if nodes > 0:
            submission_command += " -N {0}".format(nodes)
        if after_ok:
            submission_command += " -d afterok:{0}".format(after_ok)
        if partition:
            submission_command += " -p {0}".format(partition)
        if constraint:
            submission_command += " -C '{0}'".format(constraint)
        if other_options:
            submission_command += " {0}".format(other_options)
        submission_command += " {0}".format(job_submit_command)

        if additional_files:
            return self._remote_command_executor.run_remote_command(
                submission_command, additional_files=additional_files, raise_on_error=raise_on_error
            )
        else:
            return self._remote_command_executor.run_remote_command(submission_command, raise_on_error=raise_on_error)

    def _dump_job_output(self, job_info):
        params = re.split(r"\s+", job_info)
        stderr = None
        stdout = None
        for param in params:
            match_stderr = re.match(r"StdErr=(.*)?", param)
            match_stdout = re.match(r"StdOut=(.*)?", param)
            if match_stderr:
                stderr = match_stderr.group(1)
                logging.info("stderr:" + stderr)
            if match_stdout:
                stdout = match_stdout.group(1)
                logging.info("stdout:" + stdout)
        if stderr is not None or stdout is not None:
            if stderr == stdout:
                result = self._remote_command_executor.run_remote_command(f'echo "stderr/stdout:" && cat {stderr}')
                logging.error(result.stdout)
            else:
                if stderr is not None:
                    stderr_result = self._remote_command_executor.run_remote_command(f'echo "stderr" && cat {stderr}')
                    logging.error(stderr_result.stdout)
                if stdout is not None:
                    stdout_result = self._remote_command_executor.run_remote_command(f'echo "stdout" && cat {stdout}')
                    logging.error(stdout_result.stdout)
        else:
            logging.error("Unable to retrieve job output.")

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        try:
            assert_that(result.stdout).contains("JobState=COMPLETED")
        except AssertionError:
            self._dump_job_output(result.stdout)
            raise

    def compute_nodes_count(self, filter_by_partition=None):  # noqa: D102
        return len(self.get_compute_nodes(filter_by_partition))

    def get_compute_nodes(self, filter_by_partition=None):  # noqa: D102
        command = "sinfo --Node --noheader --responding"
        if filter_by_partition:
            command += " --partition {}".format(filter_by_partition)
        # Print first and fourth columns to get nodename and state only (default partition contains *)
        # Filter out nodes that are not responding or in power saving states
        command += " | awk '{print $1, $4}' | grep -v '[*#~%]' | awk '{print $1}'"
        result = self._remote_command_executor.run_remote_command(command)
        return result.stdout.splitlines()

    @retry(retry_on_result=lambda result: "drain" not in result, wait_fixed=seconds(3), stop_max_delay=minutes(5))
    def wait_for_locked_node(self):  # noqa: D102
        return self._remote_command_executor.run_remote_command("/opt/slurm/bin/sinfo -h -o '%t'").stdout

    def get_node_cores(self, partition=None):
        """Return number of slots from the scheduler."""
        check_core_cmd = "/opt/slurm/bin/sinfo -o '%c' -h"
        if partition:
            check_core_cmd += " -p {}".format(partition)
        result = self._remote_command_executor.run_remote_command(check_core_cmd)
        return re.search(r"(\d+)", result.stdout).group(1)

    def get_job_info(self, job_id):
        """Return job details from slurm."""
        return self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)).stdout

    def cancel_job(self, job_id):
        """Cancel a job."""
        return self._remote_command_executor.run_remote_command("scancel {}".format(job_id))

    def set_nodes_state(self, compute_nodes, state):
        """Put nodes into a state."""
        self._remote_command_executor.run_remote_command(
            "sudo /opt/slurm/bin/scontrol update NodeName={} state={} reason=testing".format(
                ",".join(compute_nodes), state
            )
        )

    def set_partition_state(self, partition, state):
        """Put partition into a state."""
        self._remote_command_executor.run_remote_command(
            "sudo /opt/slurm/bin/scontrol update partition={} state={}".format(partition, state)
        )

    def get_nodes_status(self, filter_by_nodes=None):
        """Retrieve node state/status from scheduler."""
        result = self._remote_command_executor.run_remote_command(
            "/opt/slurm/bin/sinfo -N --long -h | awk '{print$1, $4}'"
        ).stdout.splitlines()
        current_node_states = {}
        for entry in result:
            nodename, state = entry.split()
            current_node_states[nodename] = state
        return (
            {node: current_node_states.get(node, "Unable to retrieve state") for node in filter_by_nodes}
            if filter_by_nodes
            else current_node_states
        )

    def get_node_addr_host(self):
        """Return a list of nodename, nodeaddr, nodehostname entries."""
        # q1-dy-c5xlarge-1 172.31.4.241 q1-dy-c5xlarge-1
        # q1-dy-c5xlarge-2 172.31.4.136 q1-dy-c5xlarge-2
        # q1-dy-c5xlarge-3 q1-dy-c5xlarge-3 q1-dy-c5xlarge-3
        return self._remote_command_executor.run_remote_command(
            "/opt/slurm/bin/sinfo -O NodeList:' ',NodeAddr:' ',NodeHost:' ' -N -h | awk '{print$1, $2, $3}'"
        ).stdout.splitlines()

    def submit_command_and_assert_job_accepted(self, submit_command_args):
        """Submit a command and assert the job is accepted by scheduler."""
        result = self.submit_command(**submit_command_args)
        return self.assert_job_submitted(result.stdout)

    def get_partition_state(self, partition):
        """Get the state of the partition."""
        return self._remote_command_executor.run_remote_command(
            f'/opt/slurm/bin/scontrol show partition={partition} | grep -oP "State=\\K(\\S+)"'
        ).stdout

    @retry(wait_fixed=seconds(20), stop_max_delay=minutes(8))
    def wait_job_running(self, job_id):
        """Wait till job starts running."""
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        assert_that(result.stdout).contains("JobState=RUNNING")
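# Usage sketch (illustrative): submit a job to a specific partition through
# submit_command_and_assert_job_accepted, wait for it to start, then cancel it.
# The partition name and command below are placeholder values.
def _run_and_cancel_slurm_job(remote_command_executor):
    slurm_commands = SlurmCommands(remote_command_executor)
    job_id = slurm_commands.submit_command_and_assert_job_accepted(
        submit_command_args={"command": "sleep 300", "nodes": 1, "partition": "queue1"}
    )
    slurm_commands.wait_job_running(job_id)
    slurm_commands.cancel_job(job_id)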
def publish_compute_nodes_metric(scheduler_commands, max_monitoring_time, region, cluster_name):
    logging.info("Monitoring scheduler status and publishing metrics")
    cw_client = boto3.client("cloudwatch", region_name=region)
    compute_nodes_time_series = []
    ec2_nodes_time_series = []
    timestamps = [datetime.datetime.utcnow()]

    @retry(
        # Retry until EC2 and Scheduler capacities scale down to 0
        # Also make sure cluster scaled up before scaling down
        retry_on_result=lambda _: ec2_nodes_time_series[-1] != 0
        or compute_nodes_time_series[-1] != 0
        or max(ec2_nodes_time_series) == 0
        or max(compute_nodes_time_series) == 0,
        wait_fixed=seconds(20),
        stop_max_delay=max_monitoring_time,
    )
    def _watch_compute_nodes_allocation():
        try:
            compute_nodes = scheduler_commands.compute_nodes_count()
            logging.info("Publishing scheduler compute metric: count={0}".format(compute_nodes))
            cw_client.put_metric_data(
                Namespace="ParallelCluster/benchmarking/{cluster_name}".format(cluster_name=cluster_name),
                MetricData=[{"MetricName": "ComputeNodesCount", "Value": compute_nodes, "Unit": "Count"}],
            )
            ec2_instances_count = len(
                _describe_cluster_instances(cluster_name, region, filter_by_node_type="Compute")
            )
            logging.info("Publishing EC2 compute metric: count={0}".format(ec2_instances_count))
            cw_client.put_metric_data(
                Namespace="ParallelCluster/benchmarking/{cluster_name}".format(cluster_name=cluster_name),
                MetricData=[{"MetricName": "EC2NodesCount", "Value": ec2_instances_count, "Unit": "Count"}],
            )

            # add values only if there is a transition.
            if (
                len(ec2_nodes_time_series) == 0
                or ec2_nodes_time_series[-1] != ec2_instances_count
                or compute_nodes_time_series[-1] != compute_nodes
            ):
                ec2_nodes_time_series.append(ec2_instances_count)
                compute_nodes_time_series.append(compute_nodes)
                timestamps.append(datetime.datetime.utcnow())
        except Exception as e:
            logging.warning("Failed while watching nodes allocation with exception: %s", e)
            raise

    try:
        _watch_compute_nodes_allocation()
    except RetryError:
        # ignoring this error in order to perform assertions on the collected data.
        pass

    end_time = datetime.datetime.utcnow()
    logging.info(
        "Monitoring completed: compute_nodes_time_series [ %s ], timestamps [ %s ]",
        " ".join(map(str, compute_nodes_time_series)),
        " ".join(map(str, timestamps)),
    )
    logging.info("Sleeping for 3 minutes to wait for the metrics to propagate...")
    sleep(180)
    return compute_nodes_time_series, timestamps, end_time
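# Usage sketch (illustrative): read back one of the published benchmarking metrics with
# CloudWatch GetMetricStatistics over the monitored window; start_time/end_time could come
# from the timestamps returned above. The 60-second period and the Maximum statistic are
# assumptions for the example.
def _get_max_compute_nodes_metric(region, cluster_name, start_time, end_time):
    cw_client = boto3.client("cloudwatch", region_name=region)
    datapoints = cw_client.get_metric_statistics(
        Namespace="ParallelCluster/benchmarking/{0}".format(cluster_name),
        MetricName="ComputeNodesCount",
        StartTime=start_time,
        EndTime=end_time,
        Period=60,
        Statistics=["Maximum"],
    )["Datapoints"]
    return max((point["Maximum"] for point in datapoints), default=0)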
class AWSBatchCommands(SchedulerCommands):
    """Implement commands for awsbatch scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(
        retry_on_result=lambda result: "FAILED" not in result and any(status != "SUCCEEDED" for status in result),
        wait_fixed=seconds(7),
        stop_max_delay=minutes(15),
    )
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("awsbstat -d {0}".format(job_id), log_output=True)
        return re.findall(r"status\s+: (.+)", result.stdout)

    def get_job_exit_status(self, job_id):  # noqa: D102
        return self.wait_job_completed(job_id)

    def assert_job_submitted(self, awsbsub_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Job ([a-z0-9\-]{36}) \(.+\) has been submitted.", awsbsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1, slots=None):  # noqa: D102
        return self._remote_command_executor.run_remote_command('echo "{0}" | awsbsub -n {1}'.format(command, nodes))

    def submit_script(self, script, script_args=None, nodes=1, additional_files=None, slots=None):  # noqa: D102
        raise NotImplementedError

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_length(1 + children_number)
        assert_that(status).contains_only("SUCCEEDED")

    def compute_nodes_count(self):  # noqa: D102
        raise NotImplementedError

    def get_compute_nodes(self):  # noqa: D102
        raise NotImplementedError

    def wait_for_locked_node(self):  # noqa: D102
        raise NotImplementedError

    def get_node_cores(self):  # noqa: D102
        raise NotImplementedError

    def set_nodes_state(self, compute_nodes, state):
        """Not implemented."""
        raise NotImplementedError

    def get_nodes_status(self):
        """Not implemented."""
        raise NotImplementedError
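# Usage sketch (illustrative): submit a single-node job through awsbatch and inspect the
# per-job status list returned by wait_job_completed; the command is a placeholder value.
def _run_simple_awsbatch_job(remote_command_executor):
    awsbatch_commands = AWSBatchCommands(remote_command_executor)
    result = awsbatch_commands.submit_command("env")
    job_id = awsbatch_commands.assert_job_submitted(result.stdout)
    statuses = awsbatch_commands.wait_job_completed(job_id)
    logging.info("awsbstat statuses: %s", statuses)
    awsbatch_commands.assert_job_succeeded(job_id)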