def generate_hosts(hosts_input):
    """Generate a list of hosts from the given input.

    Args:
      hosts_input: The path of a file containing the hosts to be used, or a
        comma separated list of site:job_id, or an oargrid_job_id. If a file
        is used, each host should be on its own line. Repeated hosts are
        pruned and blank lines are ignored.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.

    Raises:
      ValueError: If the input is not an existing file and cannot be parsed
        as ``site:job_id`` pairs or as an integer oargrid job id.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        # Use a context manager so the file handle is released even if
        # building a Host fails half-way through the file.
        with open(hosts_input) as hosts_file:
            for line in hosts_file:
                address = line.rstrip()
                if not address:
                    # A trailing or stray blank line is not a host.
                    continue
                h = Host(address)
                # Order-preserving de-duplication.
                if h not in hosts:
                    hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the input is a number, we assume this is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    logger.debug(
        'Hosts list: \n%s',
        ' '.join(style.host(host.address.split('.')[0]) for host in hosts))
    return hosts
def get_oargrid_job_nodes(oargrid_job_id,
                          frontend_connection_params=None,
                          timeout=False):
    """Return a list of `execo.host.Host` containing the hosts of an oargrid job.

    Duplicate hosts are removed; the order of the returned list is
    therefore unspecified.

    :param oargrid_job_id: the oargrid job id.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.

    :raises ProcessesFailed: if the ``oargridstat`` process did not
      complete successfully.
    """
    # Only the literal False selects the configured default; None and
    # numeric values (including 0) are passed through untouched.
    if timeout is False:
        timeout = g5k_configuration.get('default_timeout')
    # Try the "-wl" form first and fall back to "-l" if it fails.
    process = get_process(
        "oargridstat -wl %i 2>/dev/null || oargridstat -l %i 2>/dev/null" %
        (oargrid_job_id, oargrid_job_id),
        host=get_frontend_host(),
        connection_params=make_connection_params(
            frontend_connection_params, default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    if process.ok:
        # Every whitespace-separated token of the output is a host address.
        # Raw string: "\S" is not a valid escape in a plain string literal.
        host_addresses = re.findall(r"(\S+)", process.stdout, re.MULTILINE)
        return list(
            set([Host(host_address) for host_address in host_addresses]))
    else:
        raise ProcessesFailed([process])
def test_build_roles_less_deployed_nodes(self):
    # With five deployed nodes, build_roles() is expected to hand exactly
    # one node to each of the five roles checked below.
    # NOTE(review): the engine configuration comes from test setup that is
    # not visible here - confirm it requests more compute nodes than this.
    #
    # A real list (rather than a lazy map object) is used so the engine can
    # take len() of, or iterate more than once over, the deployed nodes.
    self.engine.deployed_nodes = [
        Host(name) for name in ["a-1", "a-2", "a-3", "a-4", "a-5"]
    ]
    roles = self.engine.build_roles()
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(1, len(roles["controller"]))
    self.assertEqual(1, len(roles["storage"]))
    self.assertEqual(1, len(roles["compute"]))
    self.assertEqual(1, len(roles["network"]))
    self.assertEqual(1, len(roles["util"]))
def get_frontend_host(frontend=None):
    """Given a frontend name, or None, and based on the global
    configuration, returns the frontend to connect to or None.

    :param frontend: name of the frontend. If None, the default frontend
      (as returned by ``get_default_frontend()``) is used.

    :return: a `Host` for the frontend, or None when the frontend is the
      default one and the ``no_ssh_for_local_frontend`` configuration
      option is set (or when the resolved frontend name is empty).
    """
    if frontend is None:
        frontend = get_default_frontend()
    # The comparison to True is intentionally an equality test, so that the
    # configuration value keeps exactly the semantics it always had
    # (True or 1 enable the option, other truthy values do not).
    no_ssh_for_local = g5k_configuration.get(
        'no_ssh_for_local_frontend') == True  # noqa: E712
    if no_ssh_for_local and frontend == get_default_frontend():
        # Already on the default frontend: no remote connection is needed.
        frontend = None
    if frontend:
        frontend = Host(frontend)
    return frontend
def get_oar_job_nodes(oar_job_id=None,
                      frontend=None,
                      frontend_connection_params=None,
                      timeout=False):
    """Return a list of `execo.host.Host` containing the hosts of an oar job.

    This method waits for the job start (the list of nodes isn't fixed
    until the job start).

    :param oar_job_id: the oar job id. If None given, will try to get
      it from ``OAR_JOB_ID`` environment variable.

    :param frontend: the frontend of the oar job. If None given, use
      default frontend.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.

    :raises ValueError: if no job id is given and ``OAR_JOB_ID`` is
      missing from the environment, or is not an integer.

    :raises ProcessesFailed: if the process listing the job nodes did
      not complete successfully.
    """
    # Only the literal False selects the configured default; None and
    # numeric values (including 0) are passed through untouched.
    if timeout is False:
        timeout = g5k_configuration.get('default_timeout')
    if oar_job_id is None:
        if 'OAR_JOB_ID' in os.environ:
            # Environment values are strings, but the command below is
            # built with the "%i" conversion, which requires a number.
            oar_job_id = int(os.environ['OAR_JOB_ID'])
        else:
            raise ValueError(
                "no oar job id given and no OAR_JOB_ID environment variable found"
            )
    # A single countdown is shared by the wait and by the node listing, so
    # that the overall call honours the requested timeout.
    countdown = Timer(timeout)
    wait_oar_job_start(oar_job_id, frontend, frontend_connection_params,
                       countdown.remaining())
    # Raw string: the backslashes of the grep alternation must reach the
    # shell unchanged and are not valid Python escape sequences.
    process = get_process(
        r"(oarstat -sj %(oar_job_id)i | grep 'Running\|Terminated\|Error') > /dev/null 2>&1 && oarstat -pj %(oar_job_id)i | oarprint host -f -"
        % {'oar_job_id': oar_job_id},
        host=get_frontend_host(frontend),
        connection_params=make_connection_params(
            frontend_connection_params, default_frontend_connection_params))
    process.timeout = countdown.remaining()
    process.shell = process.pty = True
    process.run()
    if process.ok:
        # Every whitespace-separated token of the output is a host address.
        host_addresses = re.findall(r"(\S+)", process.stdout, re.MULTILINE)
        return [Host(host_address) for host_address in host_addresses]
    else:
        raise ProcessesFailed([process])
def test_build_roles_with_multiple_clusters(self):
    # Resources are requested on two clusters: "a" provides one node for
    # each role plus two compute nodes, "b" provides two more compute nodes.
    self.engine.config = {
        "resources": {
            "a": {
                "controller": 1,
                "compute": 2,
                "network": 1,
                "storage": 1,
                "util": 1
            },
            "b": {
                "compute": 2
            }
        }
    }
    # A real list (rather than a lazy map object) is used so the engine can
    # take len() of, or iterate more than once over, the deployed nodes.
    self.engine.deployed_nodes = [
        Host(name)
        for name in ["a-1", "a-2", "a-3", "a-4", "a-5", "a-6", "b-1", "b-2"]
    ]
    roles = self.engine.build_roles()
    # The compute role must aggregate the nodes of both clusters (2 + 2).
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(1, len(roles["controller"]))
    self.assertEqual(1, len(roles["storage"]))
    self.assertEqual(4, len(roles["compute"]))
    self.assertEqual(1, len(roles["network"]))
    self.assertEqual(1, len(roles["util"]))
def deploy(deployment,
           check_deployed_command=True,
           node_connection_params={'user': '******'},
           num_tries=1,
           check_enough_func=None,
           frontend_connection_params=None,
           deploy_timeout=None,
           check_timeout=30,
           stdout_handlers=None,
           stderr_handlers=None):
    """Deploy nodes, many times if needed, checking which of these nodes
    are already deployed with a user-supplied command. If no command
    given for checking if nodes deployed, rely on kadeploy to know
    which nodes are deployed.

    - loop `num_tries` times:

      - if ``check_deployed_command`` given, try to connect to these
        hosts using the supplied `node_connection_params` (or the
        default ones), and to execute ``check_deployed_command``. If
        connection succeeds and the command returns 0, the host is
        assumed to be deployed, else it is assumed to be undeployed.

      - optionally call user-supplied ``check_enough_func``, passing
        to it the deployed and undeployed hosts, to let user code
        decide if enough nodes deployed. Otherwise, try as long as
        there are undeployed nodes.

      - deploy the undeployed nodes

    returns a tuple ``(deployed_hosts, undeployed_hosts)``. Both
    elements are sets; the undeployed one contains host addresses.

    When checking correctly deployed nodes with
    ``check_deployed_command``, and if the deployment is using the
    kavlan option, this function will try to contact the nodes using
    the appropriate DNS hostnames in the new vlan.

    :param deployment: instance of `execo.kadeploy.Deployment` class
      describing the intended kadeployment.

    :param check_deployed_command: command to perform remotely to
      check node deployment. May be a String, True, False or None. If
      String: the actual command to be used (This command should
      return 0 if the node is correctly deployed, or another value
      otherwise). If True, the default command value will be used
      (from `execo_g5k.config.g5k_configuration`). If None or False,
      no check is made and deployed/undeployed status will be taken
      from kadeploy's output.

    :param node_connection_params: a dict similar to
      `execo.config.default_connection_params` whose values will
      override those in `execo.config.default_connection_params` when
      connecting to check node deployment with
      ``check_deployed_command`` (see above). The default dict is
      shared between calls and is never modified by this function.

    :param num_tries: number of deploy tries

    :param check_enough_func: a function taking as parameter the
      deployed hosts and the undeployed hosts, which will be called
      before each deployment iteration, and that should return a
      boolean indicating if there is already enough nodes (in this
      case, no further deployment will be attempted).

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param deploy_timeout: timeout for deployment. Default is None,
      which means no timeout.

    :param check_timeout: timeout for node deployment checks. Default
      is 30 seconds.

    :param stdout_handlers: iterable of `ProcessOutputHandlers`
      which will be passed to the actual deploy processes.

    :param stderr_handlers: iterable of `ProcessOutputHandlers`
      which will be passed to the actual deploy processes.
    """

    if check_enough_func is None:
        # Default policy: keep trying until nothing is left undeployed.
        check_enough_func = lambda deployed, undeployed: len(undeployed) == 0

    # The parameter is tri-state (command string / True / False-or-None),
    # so only the literal True selects the configured default command.
    if check_deployed_command is True:
        check_deployed_command = g5k_configuration.get(
            'check_deployed_command')

    def check_update_deployed(undeployed_hosts, check_deployed_command,
                              node_connection_params, vlan):  #IGNORE:W0613
        # Run the check command on every undeployed host and return the
        # list of hosts on which it succeeded.
        logger.debug(
            style.emph("check which hosts are already deployed among:") +
            " %s", undeployed_hosts)
        # Maps the name actually contacted (the in-vlan name when a kavlan
        # is used) back to the original host address.
        deployment_hostnames_mapping = dict()
        if vlan:
            for host in undeployed_hosts:
                deployment_hostnames_mapping[get_kavlan_host_name(
                    host, vlan)] = host
        else:
            for host in undeployed_hosts:
                deployment_hostnames_mapping[host] = host
        deployed_check = get_remote(check_deployed_command,
                                    list(deployment_hostnames_mapping),
                                    connection_params=node_connection_params)
        for p in deployed_check.processes:
            # A failing check simply means "not deployed"; do not let the
            # process layer log it as an error.
            p.nolog_exit_code = True
            p.nolog_timeout = True
            p.nolog_error = True
            p.timeout = check_timeout
        deployed_check.run()
        newly_deployed = list()
        for process in deployed_check.processes:
            logger.debug(
                style.emph("check on %s:" % (process.host, )) + " %s\n" %
                (process, ) + style.emph("stdout:") + "\n%s\n" %
                (process.stdout) + style.emph("stderr:") + "\n%s\n" %
                (process.stderr))
            if (process.ok):
                newly_deployed.append(
                    deployment_hostnames_mapping[process.host.address])
                logger.debug(
                    "OK %s",
                    deployment_hostnames_mapping[process.host.address])
            else:
                logger.debug(
                    "KO %s",
                    deployment_hostnames_mapping[process.host.address])
        return newly_deployed

    start_time = time.time()
    deployed_hosts = set()
    undeployed_hosts = set([Host(host).address for host in deployment.hosts])
    my_newly_deployed = []
    if check_deployed_command:
        # Initial check: some hosts may already be deployed before any
        # kadeploy run.
        my_newly_deployed = check_update_deployed(undeployed_hosts,
                                                  check_deployed_command,
                                                  node_connection_params,
                                                  deployment.vlan)
        deployed_hosts.update(my_newly_deployed)
        undeployed_hosts.difference_update(my_newly_deployed)
    num_tries_done = 0
    elapsed = time.time() - start_time
    last_time = time.time()
    deploy_stats = list()  # contains tuples ( timestamp,
    #                   num attempted deploys,
    #                   len(kadeployer.deployed_hosts),
    #                   len(my_newly_deployed),
    #                   len(deployed_hosts),
    #                   len(undeployed_hosts )
    deploy_stats.append((elapsed, None, None, len(my_newly_deployed),
                         len(deployed_hosts), len(undeployed_hosts)))
    while (not check_enough_func(deployed_hosts, undeployed_hosts)
           and num_tries_done < num_tries):
        num_tries_done += 1
        logger.debug(
            style.emph("try %i, deploying on:" % (num_tries_done, )) + " %s",
            undeployed_hosts)
        tmp_deployment = copy.copy(deployment)
        tmp_deployment.hosts = undeployed_hosts
        # tmp_deployment.hosts is the very same set object that is shrunk
        # below once hosts are found deployed, so the number of attempted
        # deploys has to be recorded now, before the set is modified.
        num_attempted = len(undeployed_hosts)
        kadeployer = Kadeployer(
            tmp_deployment,
            frontend_connection_params=frontend_connection_params,
            stdout_handlers=stdout_handlers,
            stderr_handlers=stderr_handlers)
        kadeployer.timeout = deploy_timeout
        kadeployer.run()
        my_newly_deployed = []
        if check_deployed_command:
            # Trust the user-supplied check rather than kadeploy's report.
            my_newly_deployed = check_update_deployed(
                undeployed_hosts, check_deployed_command,
                node_connection_params, deployment.vlan)
            deployed_hosts.update(my_newly_deployed)
            undeployed_hosts.difference_update(my_newly_deployed)
        else:
            deployed_hosts.update(kadeployer.deployed_hosts)
            undeployed_hosts.difference_update(kadeployer.deployed_hosts)
        logger.debug(
            style.emph("kadeploy reported newly deployed hosts:") + " %s",
            kadeployer.deployed_hosts)
        logger.debug(
            style.emph("check reported newly deployed hosts:") + " %s",
            my_newly_deployed)
        logger.debug(
            style.emph("all deployed hosts:") + " %s", deployed_hosts)
        logger.debug(
            style.emph("still undeployed hosts:") + " %s", undeployed_hosts)
        elapsed = time.time() - last_time
        last_time = time.time()
        deploy_stats.append(
            (elapsed, num_attempted, len(kadeployer.deployed_hosts),
             len(my_newly_deployed), len(deployed_hosts),
             len(undeployed_hosts)))

    logger.detail(
        style.emph("deploy finished") + " in %i tries, %s", num_tries_done,
        format_seconds(time.time() - start_time))
    logger.detail(
        "deploy duration attempted deployed deployed total total"
    )
    logger.detail(
        " deploys as reported as reported already still"
    )
    logger.detail(
        " by kadeploy by check deployed undeployed"
    )
    logger.detail(
        "---------------------------------------------------------------------------"
    )
    # Row 0 is the initial check; rows 1..n are the deployment tries.
    for (deploy_index, deploy_stat) in enumerate(deploy_stats):
        logger.detail(
            "#%-5.5s %-8.8s %-9.9s %-11.11s %-11.11s %-8.8s %-10.10s",
            deploy_index, format_seconds(deploy_stat[0]), deploy_stat[1],
            deploy_stat[2], deploy_stat[3], deploy_stat[4], deploy_stat[5])
    logger.debug(style.emph("deployed hosts:") + " %s", deployed_hosts)
    logger.debug(style.emph("undeployed hosts:") + " %s", undeployed_hosts)

    return (deployed_hosts, undeployed_hosts)
def test_not_enough_nodes(self):
    # Only four nodes are deployed; build_roles() is expected to refuse to
    # build the roles and raise.
    # NOTE(review): the engine configuration comes from test setup that is
    # not visible here - presumably it needs at least five nodes.
    #
    # A real list is used instead of a lazy map object: with a map object,
    # a TypeError raised by len() inside the engine would satisfy
    # assertRaises(Exception) and let the test pass for the wrong reason.
    self.engine.deployed_nodes = [
        Host(name) for name in ["a-1", "a-2", "a-3", "a-4"]
    ]
    with self.assertRaises(Exception):
        self.engine.build_roles()