def get_nodes_testbed(testbed):
    """
    Return the node information reported by ``sinfo -a`` for a SLURM testbed.

    The command is executed without a server when the testbed protocol is
    ``Testbed.protocol_local`` and with ``server=testbed.endpoint`` when it
    is ``Testbed.protocol_ssh``. The raw command output is handed to
    ``parse_sinfo_partitions`` and the parsed result is returned.

    An empty list is returned when the testbed is not of the SLURM category
    or when its protocol is neither local nor ssh.
    """
    if testbed.category != Testbed.slurm_category:
        return []

    # Both protocols run the very same command; only the target differs
    sinfo_call = {"command": "sinfo", "params": ["-a"]}

    if testbed.protocol == Testbed.protocol_local:
        raw_output = shell.execute_command(**sinfo_call)
    elif testbed.protocol == Testbed.protocol_ssh:
        raw_output = shell.execute_command(server=testbed.endpoint,
                                           **sinfo_call)
    else:
        return []

    return parse_sinfo_partitions(raw_output)
def get_cpuinfo_node(testbed, node):
    """
    Retrieve the CPU information of a node by reading its /proc/cpuinfo.

    The node is reached with ``ssh <node.name> 'cat /proc/cpuinfo'``, run
    without a server for local testbeds and with ``server=testbed.endpoint``
    for ssh testbeds. The output is parsed by ``parse_cpu_info`` and its
    result is returned.

    An empty list is returned when the node does not belong to the testbed,
    when it is disabled, when the testbed protocol is not supported, or when
    the command fails with ``subprocess.CalledProcessError``.
    """
    try:
        if node not in testbed.nodes or node.disabled:
            return []

        cat_cpuinfo = [node.name, "'cat", "/proc/cpuinfo'"]

        if Testbed.protocol_local == testbed.protocol:
            raw_cpu_info = shell.execute_command(command="ssh",
                                                 params=cat_cpuinfo)
        elif Testbed.protocol_ssh == testbed.protocol:
            raw_cpu_info = shell.execute_command(command="ssh",
                                                 server=testbed.endpoint,
                                                 params=cat_cpuinfo)
        else:
            logging.info(
                "Tesbed protocol: %s not supported to get node information",
                testbed.protocol)
            return []

        return parse_cpu_info(raw_cpu_info)
    except subprocess.CalledProcessError:
        logging.error("Exception trying to get the node cpu info")
        return []
def upload_deployment(executable, testbed, app_folder='/tmp'):
    """
    Upload the singularity image of an executable to a testbed.

    Only acts when the executable is of the singularity PM type, the testbed
    is a SLURM one that lists 'SINGULARITY' among its package formats, and
    the testbed protocol is ssh. In that case a folder with a random UUID
    name is created on the testbed endpoint, the image file is copied into
    it, and the matching Deployment row gets its path and status updated
    and committed.

    ``app_folder`` is currently not used.
    """
    # TODO app_folder needs to go via configuration.
    # TODO upload the executable
    # TODO Updates the status of the deployment
    is_singularity_on_slurm = (
        executable.compilation_type == Executable.__type_singularity_pm__
        and testbed.category == Testbed.slurm_category
        and 'SINGULARITY' in testbed.package_formats)
    if not is_singularity_on_slurm:
        return

    remote_folder = str(uuid.uuid4())

    # TODO for local protocol
    if testbed.protocol != Testbed.protocol_ssh:
        return

    deployment = db.session.query(Deployment).filter_by(
        executable_id=executable.id, testbed_id=testbed.id).first()

    # The folder is created with its bare name; the trailing slash is only
    # needed afterwards for the stored path and the copy destination
    shell.execute_command('mkdir', testbed.endpoint, [remote_folder])
    image_name = os.path.basename(executable.singularity_image_file)
    remote_folder = remote_folder + "/"
    deployment.path = os.path.join(remote_folder, image_name)

    # Uploading the file to the testbed
    shell.scp_file(executable.singularity_image_file, testbed.endpoint,
                   remote_folder)

    deployment.status = Deployment.__status_uploaded_updated__
    db.session.commit()
def unzip_src(executable, connection_url, destination_folder):
    """
    Unzip the source code file of the executable inside destination_folder.

    Runs ``unzip <destination_folder>/<source_code_file> -d
    <destination_folder>`` through ``shell.execute_command`` using
    connection_url as the server.
    """
    archive = os.path.join(destination_folder, executable.source_code_file)
    unzip_args = [archive, '-d', destination_folder]

    shell.execute_command('unzip', connection_url, unzip_args)
def create_singularity_image(configuration, connection_url, image_file):
    """
    Create a singularity image in the compilation node.

    Runs ``singularity create -F --size <size> <image_file>`` where the size
    is read from ``configuration['singularity_image_size']``.
    """
    create_args = [
        'create',
        '-F',
        '--size',
        configuration['singularity_image_size'],
        image_file,
    ]

    shell.execute_command('singularity', connection_url, create_args)
def test_ssh_command(self, mock_subprocess):
    """
    Test that a command is executed through ssh when a server is given
    """
    # NOTE(review): the server literal and the address inside the expected
    # command do not match each other; they look redacted - confirm the
    # real values before relying on this test.
    ls_arguments = ["-la", "."]
    expected_command = " ssh [email protected] ls -la ."

    shell.execute_command(command="ls",
                          server="*****@*****.**",
                          params=ls_arguments)

    # The whole command line must reach check_output as a single string
    mock_subprocess.check_output.assert_called_with(expected_command,
                                                    shell=True)
def test_check_port_notation(self, mock_subprocess):
    """
    Test that a ``:port`` suffix in the server is turned into ``-p port``
    """
    # NOTE(review): the address literals below look redacted - confirm the
    # real values before relying on this test.
    ls_arguments = ["-la", "."]
    expected_command = " ssh -p 2222 [email protected] ls -la ."

    shell.execute_command(command="ls",
                          server="[email protected]:2222",
                          params=ls_arguments)

    # The port must have been extracted and passed with the -p flag
    mock_subprocess.check_output.assert_called_with(expected_command,
                                                    shell=True)
def _execute_comparator(execution, endpoint, path):
    """
    Run the ``post_run_processing.sh`` script located in path for an
    execution.

    The script receives the slurm sbatch id of the execution, the name of
    its application and the id of its execution configuration, in that
    order.
    """
    script = os.path.join(path, 'post_run_processing.sh')
    script_args = [
        str(execution.slurm_sbatch_id),
        execution.execution_configuration.application.name,
        str(execution.execution_configuration.id),
    ]

    shell.execute_command(script, endpoint, script_args)
def cancel_execution(execution, url):
    """
    Cancel an execution, and its running children, with ``scancel``.

    Nothing is done unless the execution is in running status and its type
    is one of the supported ones (singularity PM, PM, slurm sbatch,
    singularity srun or slurm srun).

    Running children are cancelled first using their own batch id, then the
    execution itself is cancelled.

    :param execution: execution to cancel
    :param url: server where the scancel command is executed
    """
    cancellable_types = (
        execute_type_singularity_pm,
        Executable.__type_pm__,
        Executable.__type_slurm_sbatch__,
        execute_type_singularity_srun,
        execute_type_slurm_srun,
    )

    if execution.execution_type not in cancellable_types:
        return
    if execution.status != Execution.__status_running__:
        return

    if execution.children is not None:
        for child in execution.children:
            if child.status == Execution.__status_running__:
                shell.execute_command('scancel', url, [str(child.batch_id)])

    shell.execute_command('scancel', url, [str(execution.batch_id)])
def create_random_folder(connection_url):
    """
    Create a folder with a random UUID name on the given server.

    Returns the name of the created folder.
    """
    random_name = str(uuid.uuid4())
    shell.execute_command('mkdir', connection_url, [random_name])

    return random_name
def upload_zip_file_application(executable, connection_url,
                                destination_folder, upload_folder):
    """
    Copy the zip file of the application to the destination folder.

    The file is taken from upload_folder. It is sent with ``shell.scp_file``
    when connection_url is not the empty string; otherwise it is copied
    with a plain ``cp``. The destination is always expressed relative to
    the current directory ('.').
    """
    local_zip = os.path.join(upload_folder, executable.source_code_file)
    target = os.path.join('.', destination_folder)

    if connection_url == '':
        shell.execute_command('cp', params=[local_zip, target])
        return

    shell.scp_file(local_zip, connection_url, target)
def stop_execution(execution_id, endpoint):
    """
    Suspend an execution with scontrol, e.g.:

        scontrol suspend 7993

    The command is sent to endpoint when one is given (truthy); otherwise it
    is executed without a server.
    """
    call_arguments = {
        "command": "scontrol",
        "params": ["suspend", str(execution_id)],
    }
    if endpoint:
        call_arguments["server"] = endpoint

    shell.execute_command(**call_arguments)
def idle_a_node(node_id):
    """
    Change a node to idle state so it can execute jobs:

        scontrol update NodeName=<node name> State=idle

    The node is looked up in the database by its id and the command is sent
    to the endpoint of the testbed the node belongs to.
    """
    node = db.session.query(Node).filter_by(id=int(node_id)).first()
    endpoint = node.testbed.endpoint

    update_args = ['update', 'NodeName=' + node.name, 'State=idle']

    shell.execute_command("scontrol", endpoint, update_args)
def test_non_ssh_command(self, mock_subprocess):
    """
    Test that a command is executed without using ssh when no server is
    given
    """
    # We setup the mock
    mock_subprocess.check_output.return_value = "It is ok"

    output = shell.execute_command(command="ls")

    # We verify this simple command works. assertEqual is used because the
    # assertEquals alias is deprecated and no longer exists in Python 3.12
    self.assertEqual("It is ok", output)
    mock_subprocess.check_output.assert_called_with("ls", shell=True)

    # We verify a more complex scenario with several params
    shell.execute_command(command="ls", params=["-la", "."])

    # We verify that the params are passed in the correct way
    mock_subprocess.check_output.assert_called_with(" ls -la .", shell=True)
def _parse_sacct_output(id, url):
    """
    Execute sacct for a job id and extract its status from the output.

    Returns '?' when no id is given or when the output has two line breaks
    or fewer. Otherwise returns the first of RUNNING, FAILED, COMPLETED,
    TIMEOUT or CANCELLED found in the output (checked in that order), or
    'UNKNOWN' when none of them appears.
    """
    if id is None or id == '':
        return '?'

    sacct_args = [
        '-j', id, '-o',
        'JobID,NNodes,State,ExitCode,DerivedExitcode,Comment'
    ]
    output = shell.execute_command('sacct', server=url, params=sacct_args)

    if output.count(b'\n') <= 2:
        return '?'

    # Order matters: the first state present in the output wins
    states_by_priority = (
        (b'RUNNING', 'RUNNING'),
        (b'FAILED', 'FAILED'),
        (b'COMPLETED', 'COMPLETED'),
        (b'TIMEOUT', 'TIMEOUT'),
        (b'CANCELLED', 'CANCELLED'),
    )
    for marker, status in states_by_priority:
        if output.count(marker) >= 1:
            return status

    return 'UNKNOWN'
def find_first_node(sbatch_id, url):
    """
    Find the first node of a job using the squeue command:

        [garciad@ns54 ~]$ squeue -j 7035 -o %N
        NODELIST
        ns51

    The last non-empty line of the output is used. Its first comma
    separated entry is returned; when that entry contains a bracket (e.g.
    ``ns[51-53``) the text before the bracket is joined with the text
    between the bracket and the first '-'.

    An AttributeError is raised when the output has no non-empty line.
    """
    output = shell.execute_command("squeue", url,
                                   ['-j', sbatch_id, '-o', '%N'])

    rows = [row for row in output.decode('utf-8').split("\n") if row]
    # Stays None for an empty output, which makes the strip() below fail
    last_row = rows[-1] if rows else None

    first_entry = last_row.strip().split(',')[0]

    if '[' not in first_entry:
        return first_entry

    pieces = first_entry.split('[')
    return pieces[0] + pieces[1].split('-')[0]
def __launch_execution__(command, endpoint, params, execution_configuration,
                         child_execution=None):
    """
    Run the given command and pass its output to ``__parse_output__``
    together with the endpoint, the execution configuration and the
    optional child execution.
    """
    __parse_output__(shell.execute_command(command, endpoint, params),
                     endpoint,
                     execution_configuration,
                     child_execution)
def get_nodes_testbed(testbed, command, params, parse_func):
    """
    Get the list of nodes of a testbed by executing command + params
    according to the testbed protocol and parsing the output with
    parse_func.

    An empty list is returned when the protocol is neither local nor ssh.

    This function is intended to be called by the get_nodes_testbed of a
    workload manager implementation.
    """
    if testbed.protocol == Testbed.protocol_local:
        return parse_func(
            shell.execute_command(command=command, params=params))

    if testbed.protocol == Testbed.protocol_ssh:
        return parse_func(
            shell.execute_command(command=command,
                                  server=testbed.endpoint,
                                  params=params))

    return []
def add_resource(execution):
    """
    Add resources to a running SINGULARITY_PM execution.

    The following command is executed in the testbed endpoint:

        source <enqueue_env_file> ; adapt_compss_resources <master_node>
            <master_job_id> CREATE SLURM-Cluster default <singularity_image>

    The name of the new job is parsed from the command output and its job
    id is then looked up with squeue. When a job id is found, a child
    execution in running status is attached to the execution, committed,
    and the nodes it uses are added to it.

    Nothing is done, apart from logging, when:
      - the execution is not of type SINGULARITY_PM,
      - the execution is not running,
      - the application scaling upper bound has been reached (a bound of
        None or 0 is not enforced),
      - no job id could be found for the adaptation job.

    :param execution: the running execution that has to be extended
    """
    if execution.execution_type != execute_type_singularity_pm:
        # Lazy formatting also copes with an execution without type
        logging.info(
            "Execution: %s it is not compatible with add resource action",
            execution.execution_type)
        return

    logging.info(
        "Executing type corresponds with SINGULARITY_PM, trying adaptation")

    if execution.status != Execution.__status_running__:
        logging.info(
            "Execution is not in RUNNING status, no action can be done")
        return

    configuration = execution.execution_configuration
    url = configuration.testbed.endpoint
    scaling_upper_bound = configuration.application.scaling_upper_bound
    enqueue_env_file = configuration.testbed.extra_config['enqueue_env_file']
    singularity_image_file = configuration.executable.singularity_image_file
    sbatch_id = execution.batch_id

    bound_is_set = scaling_upper_bound is not None and scaling_upper_bound != 0
    if bound_is_set and scaling_upper_bound <= execution.get_number_extra_jobs():
        logging.info(
            'Execution already reached its maximum number of extra jobs, '
            'no adaptation possible')
        return

    node = find_first_node(sbatch_id, url)

    params = [
        enqueue_env_file,
        ";",
        "adapt_compss_resources",
        node,
        sbatch_id,
        'CREATE SLURM-Cluster default',
        singularity_image_file,
    ]
    output = shell.execute_command("source", url, params)

    job_name = parse_add_resource_output(output)
    logging.info("Adaptation job name: %s", job_name)

    # Wait a moment before asking squeue for the new job
    time.sleep(2)
    extra_job_id = get_job_id_after_adaptation(job_name, url)
    logging.info("Adaptation job id: %s", extra_job_id)

    # Both conditions must hold: a child without a job id can not be
    # tracked nor cancelled later on
    if extra_job_id is None or extra_job_id == '':
        logging.info("No job id found for adaptation job: %s", job_name)
        return

    child = Execution()
    child.status = Execution.__status_running__
    child.execution_type = execute_type_singularity_pm
    child.batch_id = extra_job_id
    execution.children.append(child)
    db.session.commit()

    # Wait before querying which nodes the new job is using
    time.sleep(5)
    __add_nodes_to_execution__(child, url)
def drain_a_node(node_id, reason):
    """
    Drain a node:

        scontrol update NodeName=<node name> State=drain Reason="<reason>"

    The node is looked up in the database by its id and the command is sent
    to the endpoint of the testbed the node belongs to.
    """
    node = db.session.query(Node).filter_by(id=int(node_id)).first()
    endpoint = node.testbed.endpoint

    drain_args = [
        'update',
        'NodeName=' + node.name,
        'State=drain',
        'Reason="' + reason + '"',
    ]

    shell.execute_command("scontrol", endpoint, drain_args)
def get_job_id_after_adaptation(job_name, url):
    """
    Get the id of a job from its name by executing:

        squeue --name=job_name -h -o %A

    Returns the first line of the output without surrounding whitespace.
    """
    squeue_args = ['--name=' + job_name, '-h', '-o', '%A']
    raw_output = shell.execute_command("squeue", url, squeue_args)

    first_line = raw_output.decode('utf-8').split("\n")[0]
    return first_line.strip()
def remove_resource(execution):
    """
    Remove resources from a running SINGULARITY_PM execution.

    The following command is executed in the testbed endpoint:

        source <enqueue_env_file> ; adapt_compss_resources <master_node>
            <master_job_id> REMOVE SLURM-Cluster <node_to_delete>

    e.g. adapt_compss_resources ns51 7262 REMOVE SLURM-Cluster ns50

    The node to delete is the first node of the last child of the
    execution. When the adaptation is verified as correct, that child is
    marked as cancelled and the change is committed; otherwise the command
    output is logged.

    Nothing is done, apart from logging, when the execution is not of type
    SINGULARITY_PM, is not running, or has no children.

    :param execution: the running execution that has to be reduced
    """
    if execution.execution_type != execute_type_singularity_pm:
        # Lazy formatting also copes with an execution without type
        logging.info(
            "Execution: %s it is not compatible with remove resource action",
            execution.execution_type)
        return

    logging.info(
        "Executing type corresponds with SINGULARITY_PM, trying adaptation")

    if execution.status != Execution.__status_running__:
        logging.info(
            "Execution is not in RUNNING status, no action can be done")
        return

    testbed = execution.execution_configuration.testbed
    url = testbed.endpoint
    enqueue_env_file = testbed.extra_config['enqueue_env_file']
    sbatch_id = execution.slurm_sbatch_id

    if not execution.children:
        logging.info("No extra jobs to be able to delete")
        return

    # The most recently added extra job is the one that gets removed
    execution_to_remove = execution.children[-1]

    node = find_first_node(sbatch_id, url)
    node_job_to_remove = find_first_node(
        execution_to_remove.slurm_sbatch_id, url)

    params = [
        enqueue_env_file,
        ";",
        "adapt_compss_resources",
        node,
        sbatch_id,
        'REMOVE SLURM-Cluster',
        node_job_to_remove,
    ]
    output = shell.execute_command("source", url, params)

    if verify_adaptation_went_ok(output):
        logging.info("Adaptation performed ok")
        execution_to_remove.status = Execution.__status_cancelled__
        db.session.commit()
    else:
        logging.info("There was an error in the adaptation:")
        logging.info("%s", output.decode('utf-8'))
def get_nodes_information(testbed, command, params, parse_func):
    """
    Return a list of dictionaries, one per node in the testbed.

    Dictionary keys must be normalized according to the constants defined
    in constants.py (e.g., the key for the node name is constants.NAME).

    The information is retrieved from the workload manager by executing
    command + params; the output is parsed and normalized by parse_func.
    An empty list is returned when the testbed protocol is neither local
    nor ssh.

    This function is intended to be called by the get_nodes_testbed of a
    workload manager implementation.
    """
    call_arguments = {"command": command, "params": params}

    if testbed.protocol == Testbed.protocol_local:
        pass  # executed without a server
    elif testbed.protocol == Testbed.protocol_ssh:
        call_arguments["server"] = testbed.endpoint
    else:
        return []

    return parse_func(shell.execute_command(**call_arguments))
def find_squeue_job_status(command_output):
    """
    It finds the status of a squeue job:

    PENDING (PD), RUNNING (R), SUSPENDED (S), STOPPED (ST),
    COMPLETING (CG), COMPLETED (CD), CONFIGURING (CF), CANCELLED (CA),
    FAILED (F), TIMEOUT (TO), PREEMPTED (PR), BOOT_FAIL (BF),
    NODE_FAIL (NF), REVOKED (RV), and SPECIAL_EXIT (SE)

    it returns "UNKNOWN" if it was not in the command output

    NOTE(review): the body below does not implement this contract yet. It
    never reads command_output, it does not look for any of the states
    listed above and it ends without a return statement, so the result is
    always None.
    """
    # NOTE(review): `testbed` is neither a parameter nor a local variable
    # of this function; unless a module level `testbed` exists this line
    # raises NameError - confirm where the endpoint should come from.
    output = shell.execute_command('squeue', testbed.endpoint, [])
    # The raw bytes are decoded and split into lines...
    lines = output.decode('utf-8')
    lines = lines.split("\n")
    # ...but `lines` is not used afterwards.
    pass
def execute_srun(testbed, execution_configuration, executable, deployment,
                 singularity=False):
    """
    Launch an application with srun in the testbed endpoint and return the
    output of the command.

    The whole command line is wrapped between ``' (`` and ``'``: srun is
    sent to the background with its output redirected to allout.txt and,
    after one second, squeue is executed.

    When singularity is True the application is run with
    ``singularity run <deployment.path>``; otherwise the executable file of
    the executable is used.
    """
    # Preparing the command to be executed
    command = "' ("
    endpoint = testbed.endpoint

    params = ["srun"]

    if execution_configuration.num_nodes:
        params += ["-N", str(execution_configuration.num_nodes)]

    if execution_configuration.num_gpus_per_node:
        params.append("--gres=gpu:" +
                      str(execution_configuration.num_gpus_per_node))

    params += ["-n", str(execution_configuration.num_cpus_per_node)]

    if execution_configuration.srun_config:
        params.append(execution_configuration.srun_config)

    if singularity:
        params += ['singularity', 'run', deployment.path]
    else:
        params.append(executable.executable_file)

    # Application arguments, output redirection and the trailing squeue
    params += [
        execution_configuration.command,
        ">", "allout.txt", "2>&1", "&", ")", ";",
        "sleep", "1;",
        "squeue",
        "'",
    ]

    message = ("Launching execution of application: command: " + command +
               " | endpoint: " + endpoint + " | params: " + str(params))
    logging.info(message)

    return shell.execute_command(command, endpoint, params)
def execute_application_type_slurm_sbatch(execution, identifier):
    """
    Execute an application with sbatch in a SLURM testbed.

    The given execution is marked as failed (and committed) when the
    testbed is not of the SLURM category or when it is off-line.

    Otherwise ``sbatch <executable file>`` is launched in the testbed
    endpoint and a new Execution in running status, carrying the sbatch id
    extracted from the output, is added to the execution configuration and
    committed. Finally the nodes used by the job are added to it.
    """
    execution_configuration, testbed, deployment, executable = __get_srun_info__(
        execution, identifier)

    if testbed.category != Testbed.slurm_category:
        # If the category is not SLURM we can not execute the app
        execution.status = execute_status_failed
        execution.output = "Testbed does not support " + execute_type_slurm_sbatch + " applications"
        db.session.commit()
        return

    if not testbed.on_line:
        # If the testbed is off-line we can not execute the app
        execution.status = execute_status_failed
        execution.output = "Testbed is off-line"
        db.session.commit()
        return

    # Preparing the command to be executed
    command = "sbatch"
    endpoint = testbed.endpoint
    params = [executable.executable_file]

    message = ("Launching execution of application: command: " + command +
               " | endpoint: " + endpoint + " | params: " + str(params))
    logging.info(message)

    output = shell.execute_command(command, endpoint, params)
    print(output)

    # A new execution object is the one registered for the launched job
    launched = Execution()
    sbatch_id = __extract_id_from_sbatch__(output)
    launched.execution_type = execution_configuration.execution_type
    launched.status = Execution.__status_running__
    execution_configuration.executions.append(launched)
    launched.slurm_sbatch_id = sbatch_id
    db.session.commit()

    # Add nodes
    __add_nodes_to_execution__(launched, endpoint)
def _expand_slurm_nodelist(nodelist):
    """
    Expand a squeue node list such as ``ns[51-53,60],gpu01`` into the list
    of node names ['ns51', 'ns52', 'ns53', 'ns60', 'gpu01'].
    """
    # Commas separate entries only when they are outside of the brackets
    entries = []
    current = []
    depth = 0
    for char in nodelist:
        if char == ',' and depth == 0:
            entries.append(''.join(current))
            current = []
            continue
        if char == '[':
            depth += 1
        elif char == ']':
            depth = max(depth - 1, 0)
        current.append(char)
    entries.append(''.join(current))

    names = []
    for entry in entries:
        entry = entry.strip()
        if not entry:
            continue
        if '[' not in entry:
            names.append(entry)
            continue

        prefix, _, remainder = entry.partition('[')
        ranges, _, suffix = remainder.partition(']')
        for item in ranges.split(','):
            # An item is either a single number or a "first-last" range
            first, _, last = item.partition('-')
            last = last or first
            # Keep the zero padding used in the list (e.g. 01 -> ns01)
            width = len(first)
            for number in range(int(first), int(last) + 1):
                names.append(prefix + str(number).zfill(width) + suffix)

    return names


def __add_nodes_to_execution__(execution, url):
    """
    Take the squeue id of an execution and add to it the nodes that are
    being used by the job.

        [garciad@ns54 ~]$ squeue -j 7286 -h -o "%N"
        ns51

    Nothing is done when the execution is not running, when it has no slurm
    sbatch id or when squeue reports no nodes. Node names that are not
    found in the database are logged and skipped.

    :param execution: execution whose nodes have to be updated
    :param url: server where the squeue command is executed
    """
    if execution.status != Execution.__status_running__:
        return
    if execution.slurm_sbatch_id is None:
        return

    command_output = shell.execute_command(
        "squeue", url,
        ['-j ' + str(execution.slurm_sbatch_id), '-h -o "%N"'])

    # Only the first line of the output carries the node list
    nodes_string = command_output.decode('utf-8').split('\n')[0].strip()
    if not nodes_string:
        return

    nodes = []
    for node_name in _expand_slurm_nodelist(nodes_string):
        node = db.session.query(Node).filter_by(name=node_name).first()
        if node is None:
            logging.warning("Node %s reported by squeue not found",
                            node_name)
            continue
        nodes.append(node)

    execution.nodes = nodes
    db.session.commit()
def execute_batch(execution, identifier):
    """
    Execute an application with qsub in a TORQUE testbed.

    The execution is marked as failed (and committed) when the testbed is
    not of the TORQUE category or when it is off-line.

    Otherwise ``qsub <executable file>`` is launched in the testbed
    endpoint; the execution is set to running status, receives the id
    extracted from the qsub output as batch id, is added to the execution
    configuration and the change is committed.
    """
    execution_configuration, testbed, deployment, executable = executor_common.get_db_info(
        execution, identifier)

    if testbed.category != Testbed.torque_category:
        # If the category is not TORQUE we can not execute the app
        execution.status = Execution.__status_failed__
        execution.output = "Testbed does not support " + Executable.__type_torque_qsub__ + " applications"
        db.session.commit()
        return

    if not testbed.on_line:
        # If the testbed is off-line we can not execute the app
        execution.status = Execution.__status_failed__
        execution.output = "Testbed is off-line"
        db.session.commit()
        return

    # Preparing the command to be executed
    command = "qsub"
    endpoint = testbed.endpoint
    params = [executable.executable_file]

    message = ("Launching execution of application: command: " + command +
               " | endpoint: " + endpoint + " | params: " + str(params))
    logging.info(message)

    qsub_id = extract_id_from_qsub(
        shell.execute_command(command, endpoint, params))

    # The execution received as parameter is the one that gets updated
    execution.execution_type = execution_configuration.execution_type
    execution.status = Execution.__status_running__
    execution_configuration.executions.append(execution)
    execution.batch_id = qsub_id
    db.session.commit()
def build_singularity_container(connection_url, template, image_file,
                                upload_folder, become=True):
    """
    Build a singularity container following a specific definition and
    return the local path where the image ends up:

        [sudo] singularity build -F <image_file> <template>

    The command is prefixed with sudo when become is True. When
    connection_url is not the empty string only the base name of the
    template is used and the image is transferred with ``shell.scp_file``;
    otherwise the image is moved with ``mv``. The local file gets a random
    UUID name with '.img' extension inside upload_folder.
    """
    local_filename = os.path.join(upload_folder, str(uuid.uuid4()) + '.img')
    is_remote = connection_url != ''

    if is_remote:
        template = os.path.basename(template)

    build_args = ['build', '-F', image_file, template]

    if become:
        logging.info("Executing [%s], 'sudo singulary build -F %s %s'",
                     connection_url, image_file, template)
        shell.execute_command('sudo', connection_url,
                              ['singularity'] + build_args)
    else:
        logging.info("Executing [%s], 'singulary build -F %s %s'",
                     connection_url, image_file, template)
        shell.execute_command('singularity', connection_url, build_args)

    if is_remote:
        logging.info("Downloading image from %s", connection_url)
        shell.scp_file(local_filename, connection_url, image_file, False)
    else:
        logging.info("Moving image to final destination")
        shell.execute_command('mv', connection_url,
                              [image_file, local_filename])

    return local_filename
def __execute_pm_applications__(execution, identifier, create_profile,
                                use_storage_profile, profile_folder,
                                singularity):
    """
    Launch a PM application with enqueue_compss in the testbed of the
    execution configuration identified by identifier.

    The command has the form:

        source <enqueue_env_file> ; enqueue_compss <options> <compss_config>
            <command>

    When singularity is True the container image of the deployment is
    passed to enqueue_compss. When create_profile is True a new profile
    file with a random UUID name inside profile_folder is requested as
    output profile and stored in the execution configuration. When
    use_storage_profile is True the profile file of the execution
    configuration is passed as input profile.

    A new Execution in running status, carrying the sbatch id extracted
    from the command output, is added to the execution configuration and
    committed. After five seconds the nodes used by the job are added to
    it.
    """
    # If create_profile = True we need to create a profile and associate
    # it with the execution
    profile_file = (profile_folder + '/' + str(uuid.uuid4()) + '.profile'
                    if create_profile else '')

    # Everything is queried again by id to avoid reusing objects from
    # other thread
    execution_configuration = db.session.query(
        ExecutionConfiguration).filter_by(id=identifier).first()
    testbed = db.session.query(Testbed).filter_by(
        id=execution_configuration.testbed_id).first()
    deployment = db.session.query(Deployment).filter_by(
        executable_id=execution_configuration.executable_id,
        testbed_id=testbed.id).first()
    executable = db.session.query(Executable).filter_by(
        id=execution_configuration.executable_id).first()

    # Preparing the command to be executed
    command = "source"
    endpoint = testbed.endpoint
    params = [
        testbed.extra_config['enqueue_env_file'],
        ";",
        "enqueue_compss",
        "--sc_cfg=" + testbed.extra_config['enqueue_compss_sc_cfg'],
        "--num_nodes=" + str(execution_configuration.num_nodes),
        "--gpus_per_node=" + str(execution_configuration.num_gpus_per_node),
        "--cpus_per_node=" + str(execution_configuration.num_cpus_per_node),
    ]

    if singularity:
        # TODO the container paths are hard-coded, this has to be fixed
        params.extend([
            "--container_image=" + deployment.path,
            "--container_compss_path=/opt/TANGO/TANGO_ProgrammingModel/COMPSs/",
            "--appdir=/apps/application/",
        ])
    else:
        params.append("--appdir=" + executable.singularity_app_folder)

    params.append("--exec_time=" + str(execution_configuration.exec_time))

    # If create profile
    if create_profile:
        params.append("--output_profile=" + profile_file)

    # If we use a stored profile
    if use_storage_profile:
        params.append("--input_profile=" +
                      execution_configuration.profile_file)

    params.extend([
        execution_configuration.compss_config,
        execution_configuration.command,
    ])

    message = ("Launching execution of application: command: " + command +
               " | endpoint: " + endpoint + " | params: " + str(params))
    logging.info(message)

    output = shell.execute_command(command, endpoint, params)
    sbatch_id = __extract_id_from_sigularity_pm_app__(output)

    # A new execution object is the one registered for the launched job
    launched = Execution()
    launched.execution_type = execution_configuration.execution_type
    launched.status = Execution.__status_running__
    execution_configuration.executions.append(launched)

    # if we create the profile, we add it to the execution configuration
    if create_profile:
        execution_configuration.profile_file = profile_file

    launched.slurm_sbatch_id = sbatch_id
    db.session.commit()

    # Add nodes
    time.sleep(5)
    __add_nodes_to_execution__(launched, endpoint)