def generate_hosts(hosts_input):
    """Generate a list of hosts from the given input.

    Args:
      hosts_input: One of the following, tested in this order:
        - The path of an existing file containing the hosts to be used, one
          host per line. Repeated hosts are pruned.
          Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.
        - A comma separated list of site:job_id.
        - An oargrid_job_id.

    Return:
      list of Host: The list of hosts.

    Raises:
      ValueError: If the input is not an existing file and a job id cannot
        be converted to an integer (or a site:job_id item is malformed).
    """
    hosts = []
    if os.path.isfile(hosts_input):
        # The context manager guarantees the file handle is released, even
        # if building a Host fails half-way through the file.
        with open(hosts_input) as hosts_file:
            for line in hosts_file:
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the input is a number, we assume this is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    # Only the short name (text before the first dot) of each host is logged.
    logger.debug(
        'Hosts list: \n%s',
        ' '.join(style.host(host.address.split('.')[0]) for host in hosts))
    return hosts
def concretize_resources(resources, gridjob, reservation_type):
    """Bind the resource description to the nodes and vlans of a job.

    Args:
      resources: Mapping describing the requested resources. Its optional
        "machines" and "networks" entries are read to detect the site when
        the reservation is a plain OAR job.
      gridjob: Identifier of the job (an OAR job id when reservation_type is
        "oar", an oargrid job id otherwise).
      reservation_type: "oar" for a single-site OAR job; any other value is
        handled as an oargrid job.

    Raises:
      ValueError: If reservation_type is "oar" and the machines/networks
        descriptions do not resolve to exactly one site.
    """
    is_oar = reservation_type == "oar"

    if is_oar:
        nodes = ex5.get_oar_job_nodes(gridjob)
    else:
        nodes = ex5.get_oargrid_job_nodes(gridjob)
    concretize_nodes(resources, nodes)

    if is_oar:
        # A plain OAR job does not carry its site, so it is inferred from
        # the clusters of the machines and the sites of the networks.
        site_candidates = [
            ex5.get_cluster_site(machine_description.get("cluster"))
            for machine_description in resources.get("machines", [])
        ]
        site_candidates += [
            network_description.get("site", "unknown")
            for network_description in resources.get("networks", [])
        ]
        if len(set(site_candidates)) != 1:
            # Raising a bare string is invalid and would surface as an
            # unrelated TypeError; raise a real exception instead.
            raise ValueError(
                "Could not detect the g5k site of the oarjob %s" % gridjob)
        job_sites = [(gridjob, site_candidates[0])]
    else:
        job_sites = ex5.get_oargrid_job_oar_jobs(gridjob)

    vlans = []
    for (job_id, site) in job_sites:
        vlan_ids = ex5.get_oar_job_kavlan(job_id, site)
        vlans.extend(
            {"site": site, "vlan_id": vlan_id} for vlan_id in vlan_ids)
    concretize_networks(resources, vlans)
def _get_jobs_and_vlans(self, conf):
    """Get the hosts from an existing job (if any) or from a new job.

    This will perform a reservation if necessary.

    Args:
      conf: Configuration mapping. Reads conf['provider'] (keys 'name' and
        'role_distribution') and conf['resources'].

    Return:
      tuple: (jobs, vlans, nodes) where jobs is a list of (site, job_id),
        vlans is a list of (site, vlan_id) for the jobs that have a kavlan,
        and nodes is the list of job nodes sorted by address.
    """
    provider_conf = conf['provider']

    # Look if there is a running job or make a new reservation
    gridjob, _ = EX5.planning.get_job_by_name(provider_conf['name'])
    if gridjob is None:
        gridjob = self._make_reservation(conf)
    else:
        logging.info("Using running oargrid job %s", gridjob)

    # Wait for the job to start
    EX5.wait_oargrid_job_start(gridjob)

    nodes = sorted(EX5.get_oargrid_job_nodes(gridjob),
                   key=lambda n: n.address)

    # Checking the number of nodes given the distribution policy
    self._check_nodes(nodes=nodes,
                      resources=conf['resources'],
                      mode=provider_conf['role_distribution'])

    # vlans information
    jobs = []
    vlans = []
    for (job_id, site) in EX5.get_oargrid_job_oar_jobs(gridjob):
        jobs.append((site, job_id))
        # Query the kavlan once and reuse the answer instead of issuing a
        # second identical request for the same job.
        vlan_id = EX5.get_oar_job_kavlan(job_id, site)
        if vlan_id is not None:
            vlans.append((site, vlan_id))
    return (jobs, vlans, nodes)
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given input.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an oargrid_job_id.
        If a file is used, each host should be in a different line and
        repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.

    Raises:
      ValueError: If the input is not an existing file and a job id cannot
        be converted to an integer (or a site:job_id item is malformed).
    """
    hosts = []
    if os.path.isfile(hosts_input):
        # Close the file deterministically rather than relying on garbage
        # collection of the anonymous file object.
        with open(hosts_input) as hosts_file:
            for line in hosts_file:
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the input is a number, we assume this is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    # Only the short name (text before the first dot) of each host is logged.
    logger.debug('Hosts list: \n%s',
                 ' '.join(style.host(host.address.split('.')[0])
                          for host in hosts))
    return hosts
def grid_reload_from_id(gridjob):
    """Reload nodes, vlans and subnets of an existing oargrid job.

    Args:
      gridjob: The oargrid job id (converted with int()).

    Return:
      tuple: (nodes, vlans, subnets). vlans is a list of
        {"site", "vlan_id"} dicts; subnets is a list of dicts holding
        "site", "ipmac", the subnet info entries and a "network" key.
    """
    logger.info("Reloading the resources from oargrid job %s", gridjob)
    gridjob = int(gridjob)
    nodes = ex5.get_oargrid_job_nodes(gridjob)

    vlans, subnets = [], []
    for job_id, site in ex5.get_oargrid_job_oar_jobs(gridjob):
        for vlan_id in ex5.get_oar_job_kavlan(job_id, site):
            vlans.append({"site": site, "vlan_id": vlan_id})

        # NOTE(msimonin): this currently returned only one subnet
        # even if several are reserved
        # We'll need to patch execo the same way it has been patched for vlans
        ipmac, info = ex5.get_oar_job_subnets(job_id, site)
        if ipmac:
            entry = {"site": site, "ipmac": ipmac}
            entry.update(info)
            # Mandatory key when it comes to concretize resources
            entry["network"] = info["ip_prefix"]
            subnets.append(entry)
        else:
            logger.debug("No subnet information found for this job")

    return nodes, vlans, subnets
def make_reservation(job_name=JOB_NAME, job_type='allow_classic_ssh'):
    """Reuse the oargrid job named job_name, or submit a new one.

    Args:
      job_name: Name used to look up an existing job and to label a new one.
      job_type: Job type forwarded to the oargrid submission.

    Return:
      tuple: (first host of the job, second element of the job's first
        (job_id, site) pair, used as the frontend).
    """
    planner = ex5.planning
    horizon = ex.time_utils.format_date(time.time() + 12600)
    logging.basicConfig(level=logging.DEBUG)

    oargrid_job_id, _ = planner.get_job_by_name(job_name)
    if oargrid_job_id is None:
        logging.info("Starting a new job")
        slots = planner.compute_slots(planner.get_planning(endtime=horizon),
                                      walltime=WALLTIME,
                                      excluded_elements=excluded)
        startdate, enddate, resources = planner.find_free_slot(
            slots, {'grid5000': 1})
        logging.info("startdate = %s, enddate = %s resources = %s" %
                     (startdate, enddate, resources))
        resources = planner.distribute_hosts(resources, {'grid5000': 1},
                                             excluded_elements=excluded)
        # shuffling to load balance load across nodes
        random.shuffle(resources)
        specs = planner.get_jobs_specs(resources, excluded_elements=excluded)
        # Only the first spec is named; its paired frontend is not needed
        # here because the frontend is read back from the job below.
        spec, _ = specs[0]
        spec.name = job_name
        logging.info("specs = %s" % spec)
        oargrid_job_id, _ = ex5.oargridsub(specs,
                                           job_type=job_type,
                                           walltime=WALLTIME)

    logging.info("Using running oargrid job %s" % oargrid_job_id)

    # Get the frontend
    oar_jobs = ex5.oargrid.get_oargrid_job_oar_jobs(
        oargrid_job_id=oargrid_job_id)
    _, frontend = oar_jobs[0]

    # Get the host
    hosts = ex5.get_oargrid_job_nodes(oargrid_job_id)
    slave = hosts[0]
    logging.info("The slave will be running on %s,%s" % (slave, frontend))
    return slave, frontend
def grid_reload_from_id(gridjob):
    """Reload nodes, vlans and subnets of an existing oargrid job.

    Args:
      gridjob: The oargrid job id (converted with int()).

    Return:
      tuple: (nodes, vlans, subnets), where vlans and subnets are the
        values accumulated through get_network_info_from_job_id over every
        (job_id, site) pair of the oargrid job.
    """
    logger.info("Reloading the resources from oargrid job %s", gridjob)
    gridjob = int(gridjob)
    nodes = ex5.get_oargrid_job_nodes(gridjob)

    vlans, subnets = [], []
    for job_id, site in ex5.get_oargrid_job_oar_jobs(gridjob):
        # The helper receives the running lists and hands back the updated
        # pair, which is threaded into the next iteration.
        vlans, subnets = get_network_info_from_job_id(
            job_id, site, vlans, subnets)

    return nodes, vlans, subnets
def _get_job(self):
    """Get the hosts from an existing job (if any) or from a new job.

    This will perform a reservation if necessary.

    Sets the following attributes: gridjob, nodes (sorted by address),
    start_date and user (None when absent from the job info), jobs (list
    of (site, job_id)) and vlans (list of (site, vlan_id)).
    """
    # Look if there is a running job or make a new reservation
    self.gridjob, _ = EX5.planning.get_job_by_name(self.config['name'])
    if self.gridjob is None:
        # NOTE(review): presumably _make_reservation() sets self.gridjob;
        # confirm, since its return value is not used here.
        self._make_reservation()
    else:
        logging.info("Using running oargrid job %s", self.gridjob)

    # Wait for the job to start
    EX5.wait_oargrid_job_start(self.gridjob)

    self.nodes = sorted(EX5.get_oargrid_job_nodes(self.gridjob),
                        key=lambda n: n.address)

    # XXX check already done into `_deploy`.
    self._check_nodes(nodes=self.nodes,
                      resources=self.config['resources'],
                      mode=self.config['role_distribution'])

    # The job info is fetched once and serves both start_date and user.
    job_info = EX5.get_oargrid_job_info(self.gridjob)

    # XXX(Ad_rien_) Start_date is never used, deadcode? - August
    # 11th 2016
    self.start_date = (job_info['start_date']
                       if 'start_date' in job_info else None)

    # filling some information about the jobs here
    self.user = job_info['user'] if 'user' in job_info else None

    # vlans information
    self.jobs = []
    self.vlans = []
    for (job_id, site) in EX5.get_oargrid_job_oar_jobs(self.gridjob):
        self.jobs.append((site, job_id))
        # Query the kavlan once and reuse the answer instead of issuing a
        # second identical request for the same job.
        vlan_id = EX5.get_oar_job_kavlan(job_id, site)
        if vlan_id is not None:
            self.vlans.append((site, vlan_id))
def concretize_resources(resources, gridjob):
    """Bind the resource description to the nodes and vlans of an oargrid job.

    Args:
      resources: The resource description handed to concretize_nodes and
        concretize_networks.
      gridjob: The oargrid job id whose nodes and kavlans are looked up.
    """
    job_nodes = ex5.get_oargrid_job_nodes(gridjob)
    concretize_nodes(resources, job_nodes)

    # One {"site", "vlan_id"} entry per kavlan of every OAR job of the grid job.
    vlans = []
    for job_id, site in ex5.get_oargrid_job_oar_jobs(gridjob):
        for vlan_id in ex5.get_oar_job_kavlan(job_id, site):
            vlans.append({"site": site, "vlan_id": vlan_id})

    concretize_networks(resources, vlans)
def get_job(self):
    """Get the hosts from an existing job (if any) or from a new job.

    This will perform a reservation if necessary.

    Sets the following attributes: gridjob, nodes (sorted by address),
    start_date and user (None when absent from the job info), jobs (list
    of (site, job_id)) and vlans (list of (site, vlan_id)).

    Return:
      The oargrid job id (self.gridjob).
    """
    # Look if there is a running job or make a new reservation
    self.gridjob, _ = EX5.planning.get_job_by_name(self.config['name'])
    if self.gridjob is None:
        # NOTE(review): presumably _make_reservation() sets self.gridjob;
        # confirm, since its return value is not used here.
        self._make_reservation()
    else:
        logger.info("Using running oargrid job %s" % style.emph(self.gridjob))

    # Wait for the job to start
    EX5.wait_oargrid_job_start(self.gridjob)

    # Retry the node lookup up to MAX_ATTEMPTS times. The None check must
    # happen before sorting: sorted() never returns None and raises on a
    # None input, which made the original retry loop ineffective.
    attempts = 0
    self.nodes = None
    while self.nodes is None and attempts < MAX_ATTEMPTS:
        job_nodes = EX5.get_oargrid_job_nodes(self.gridjob)
        if job_nodes is not None:
            self.nodes = sorted(job_nodes, key=lambda n: n.address)
        attempts += 1

    check_nodes(nodes=self.nodes,
                resources=self.config['resources'],
                mode=self.config['role_distribution'])

    # The job info is fetched once and serves both start_date and user.
    job_info = EX5.get_oargrid_job_info(self.gridjob)

    # TODO - Start_date is never used, deadcode ? Ad_rien_ - August 11th 2016
    self.start_date = (job_info['start_date']
                       if 'start_date' in job_info else None)

    # filling some information about the jobs here
    self.user = job_info['user'] if 'user' in job_info else None

    # vlans information
    self.jobs = []
    self.vlans = []
    for (job_id, site) in EX5.get_oargrid_job_oar_jobs(self.gridjob):
        self.jobs.append((site, job_id))
        # Query the kavlan once and reuse the answer instead of issuing a
        # second identical request for the same job.
        vlan_id = EX5.get_oar_job_kavlan(job_id, site)
        if vlan_id is not None:
            self.vlans.append((site, vlan_id))

    return self.gridjob
def get_hosts_list(self, hosts_str):
    """Generate a list of hosts from the given string.

    Args:
      hosts_str (str): The following options are supported, tested in
        this order:

        - The path of the file containing the hosts to be used. Each host
          should be in a different line. Repeated hosts are pruned.
          Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.
        - A comma-separated list of site:job_id
        - A comma-separated list of hosts. Repeated hosts are pruned.
        - An oargrid_job_id
        - Anything else is taken as the name of a single host.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_str):
        # The context manager guarantees the file handle is released.
        with open(hosts_str) as hosts_file:
            for line in hosts_file:
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
    elif ':' in hosts_str:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_str.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    elif "," in hosts_str:
        # We assume the string is a comma separated list of hosts
        for hstr in hosts_str.split(','):
            # strip() rather than rstrip(): "a, b" must yield "b", not " b"
            h = Host(hstr.strip())
            if h not in hosts:
                hosts.append(h)
    elif hosts_str.isdigit():
        # If the input is a number, we assume this is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_str))
    else:
        # If not any of the previous, we assume is a single-host cluster
        # where the given input is the only host
        hosts = [Host(hosts_str.rstrip())]
    # Only the short name (text before the first dot) of each host is logged.
    logger.debug('Hosts list: \n%s',
                 ' '.join(style.host(host.address.split('.')[0])
                          for host in hosts))
    return hosts
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given input.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id, or a comma separated
        list of hosts, or an oargrid_job_id, or a single host name.
        If a file is used, each host should be in a different line.
        Repeated hosts are pruned for the file and the host-list forms.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        # The context manager guarantees the file handle is released.
        with open(hosts_input) as hosts_file:
            for line in hosts_file:
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
    elif ":" in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(","):
            site, job_id = job.split(":")
            hosts += get_oar_job_nodes(int(job_id), site)
    elif "," in hosts_input:
        # We assume the string is a comma separated list of hosts
        for hstr in hosts_input.split(","):
            # strip() rather than rstrip(): "a, b" must yield "b", not " b"
            h = Host(hstr.strip())
            if h not in hosts:
                hosts.append(h)
    elif hosts_input.isdigit():
        # If the input is a number, we assume this is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    else:
        # If not any of the previous, we assume is a single-host cluster where
        # the given input is the only host
        hosts = [Host(hosts_input.rstrip())]
    # Only the short name (text before the first dot) of each host is logged.
    logger.debug("Hosts list: \n%s",
                 " ".join(style.host(host.address.split(".")[0])
                          for host in hosts))
    return hosts
def run(self):
    """Run every remaining parameter combination on the hosts of an oargrid job.

    The job id is taken from self.options.oargrid_job_id when set; otherwise
    self.make_reservation() is called. While the sweeper still has remaining
    combinations, the job hosts are deployed and prepared, and one daemon
    thread running self.workflow is started per combination on an available
    host slot. When the job is found dead, the job id is reset so that the
    next outer iteration makes a new reservation.

    On exit (normal or through an exception) the job is deleted unless
    self.options.keep_alive is set or the job id has been reset.
    """
    if self.options.oargrid_job_id:
        self.oargrid_job_id = self.options.oargrid_job_id
    else:
        self.oargrid_job_id = None
    try:
        # Creation of the main iterator which is used for the first control loop.
        self.define_parameters()
        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for the experiments
            # NOTE(review): presumably make_reservation() sets
            # self.oargrid_job_id - confirm.
            if self.oargrid_job_id is None:
                self.make_reservation()
            # Wait that the job starts
            logger.info('Waiting that the job start')
            wait_oargrid_job_start(self.oargrid_job_id)
            # Retrieving the hosts and subnets parameters
            self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
            # Hosts deployment and configuration
            default_connection_params['user'] = '******'
            logger.info("Start hosts configuration")
            ex_log.setLevel('INFO')
            deployment = Deployment(
                hosts=self.hosts,
                env_file='/home/sirimie/env/mywheezy-x64-base.env')
            # deploy() returns two values; only the first is kept and it
            # replaces the host list used from here on.
            self.hosts, _ = deploy(deployment)
            # Remove files left on the hosts before uploading fresh copies.
            Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*",
                   self.hosts).run()
            Remote(
                "rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml",
                self.hosts).run()
            Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml",
                   self.hosts).run()
            # Upload the scripts and configuration files to every host.
            Put(self.hosts, [
                "run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                "platform_aws.xml", "cloud_ec2.xml"
            ],
                remote_location="/home/Work/sgcbntier/paasage_demo/").run(
                )
            logger.info("Done")
            # NOTE(review): this emptiness check runs only after the Remote
            # and Put actions above have already been issued.
            if len(self.hosts) == 0:
                break
            # Initializing the resources and threads
            # Each host is listed once per unit of its 'smt_size' attribute,
            # so a host can be handed out that many times concurrently.
            available_hosts = [
                host for host in self.hosts for i in range(
                    get_host_attributes(host)['architecture']['smt_size'])
            ]
            # Maps each started Thread to {'host': <host it runs on>}.
            threads = {}
            # Creating the unique folder for storing the results
            comb_dir = self.result_dir + '/csv_results'
            if not os.path.exists(comb_dir):
                os.mkdir(comb_dir)
            # Checking that the job is running and not in Error
            while self.is_job_alive() or len(threads.keys()) > 0:
                job_is_dead = False
                # Wait until at least n_nodes host slots are free, reclaiming
                # the slots of finished threads every 5 seconds.
                while self.options.n_nodes > len(available_hosts):
                    # Iterate over a copy because entries are deleted from
                    # `threads` inside the loop.
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.append(tmp_threads[t]['host'])
                            del threads[t]
                    sleep(5)
                    if not self.is_job_alive():
                        job_is_dead = True
                        break
                if job_is_dead:
                    break
                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    # Nothing left to start: wait for the running threads to
                    # finish, polling every 20 seconds, then leave the loop.
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break
                # Take the first free slot and run the combination on it.
                host = available_hosts[0]
                available_hosts = available_hosts[1:]
                t = Thread(target=self.workflow,
                           args=(comb, host, comb_dir))
                threads[t] = {'host': host}
                t.daemon = True
                t.start()
            if not self.is_job_alive():
                job_is_dead = True
            # Forget the dead job so the next outer iteration reserves a new
            # one and the finally clause does not try to delete it.
            if job_is_dead:
                self.oargrid_job_id = None
    finally:
        if self.oargrid_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oargriddel([self.oargrid_job_id])
            else:
                logger.info('Keeping job alive for debugging')