def _get_jobs_and_vlans(self, conf):
    """Return the OAR jobs, the KaVLANs and the nodes of the oargrid job.

    An oargrid job named ``conf['provider']['name']`` is reused when
    ``EX5.planning.get_job_by_name`` finds one; otherwise a reservation is
    made through ``self._make_reservation(conf)``. The call then waits for
    the job to start before collecting its nodes.

    Args:
        conf: configuration mapping. The keys read here are
            ``conf['provider']['name']``,
            ``conf['provider']['role_distribution']`` and
            ``conf['resources']``.

    Returns:
        A ``(jobs, vlans, nodes)`` tuple where ``jobs`` is a list of
        ``(site, job_id)`` pairs, ``vlans`` is a list of ``(site, vlan_id)``
        pairs for the jobs whose KaVLAN lookup is not ``None``, and
        ``nodes`` is the list of job nodes sorted by their ``address``.
    """
    provider_conf = conf['provider']

    # Look if there is a running job or make a new reservation
    gridjob, _ = EX5.planning.get_job_by_name(provider_conf['name'])
    if gridjob is None:
        gridjob = self._make_reservation(conf)
    else:
        logging.info("Using running oargrid job %s", gridjob)

    # Wait for the job to start
    EX5.wait_oargrid_job_start(gridjob)
    nodes = sorted(EX5.get_oargrid_job_nodes(gridjob),
                   key=lambda n: n.address)

    # Check the number of nodes given the distribution policy
    self._check_nodes(nodes=nodes,
                      resources=conf['resources'],
                      mode=provider_conf['role_distribution'])

    # vlans information
    jobs = []
    vlans = []
    for job_id, site in EX5.get_oargrid_job_oar_jobs(gridjob):
        jobs.append((site, job_id))
        vlan_id = EX5.get_oar_job_kavlan(job_id, site)
        if vlan_id is not None:
            # Reuse the id fetched above rather than querying it again.
            vlans.append((site, vlan_id))

    return (jobs, vlans, nodes)
def get_or_create_job(resources, job_name, walltime):
    """Return the oargrid job named *job_name*, reserving it when absent.

    The job is looked up by name; if the lookup yields ``None`` a new
    reservation is made with ``make_reservation(resources, job_name,
    walltime)``. In both cases the function waits for the job to start
    before returning it.
    """
    found, _ = ex5.planning.get_job_by_name(job_name)
    # The conditional expression only reserves when nothing was found.
    job = (make_reservation(resources, job_name, walltime)
           if found is None else found)

    message = "Waiting for oargridjob %s to start" % job
    logging.info(message)
    ex5.wait_oargrid_job_start(job)
    return job
def grid_get_or_create_job(job_name, walltime, reservation_date, queue,
                           job_type, machines, networks):
    """Return the oargrid job named *job_name*, creating it if necessary.

    When no job is found under that name, ``grid_make_reservation`` is
    called with all the arguments received here. The function waits for
    the job to start and then returns its identifier.
    """
    job, _ = ex5.planning.get_job_by_name(job_name)
    job_missing = job is None
    if job_missing:
        job = grid_make_reservation(
            job_name,
            walltime,
            reservation_date,
            queue,
            job_type,
            machines,
            networks,
        )

    waiting_message = "Waiting for oargridjob %s to start" % job
    logger.info(waiting_message)
    ex5.wait_oargrid_job_start(job)
    return job
def _get_job(self):
    """Attach to an existing oargrid job or reserve a new one.

    The job is looked up by ``self.config['name']``. When none is found,
    ``self._make_reservation()`` is called; its return value is not used
    here, so it presumably stores the new job id in ``self.gridjob``
    (TODO confirm). The method then waits for the job to start.

    Attributes set on ``self``:
        gridjob: the oargrid job id.
        nodes: the job nodes, sorted by their ``address``.
        start_date: ``job_info['start_date']`` when present, else ``None``.
        user: ``job_info['user']`` when present, else ``None``.
        jobs: list of ``(site, job_id)`` pairs.
        vlans: list of ``(site, vlan_id)`` pairs for the jobs whose
            KaVLAN lookup is not ``None``.
    """
    # Look if there is a running job or make a new reservation
    self.gridjob, _ = EX5.planning.get_job_by_name(self.config['name'])
    if self.gridjob is None:
        self._make_reservation()
    else:
        logging.info("Using running oargrid job %s", self.gridjob)

    # Wait for the job to start
    EX5.wait_oargrid_job_start(self.gridjob)

    self.nodes = sorted(EX5.get_oargrid_job_nodes(self.gridjob),
                        key=lambda n: n.address)

    # XXX check already done into `_deploy`.
    self._check_nodes(nodes=self.nodes,
                      resources=self.config['resources'],
                      mode=self.config['role_distribution'])

    # Query the job information once and read both fields from it.
    job_info = EX5.get_oargrid_job_info(self.gridjob)
    # XXX(Ad_rien_) start_date is never used, dead code? - August 11th 2016
    self.start_date = job_info.get('start_date')
    self.user = job_info.get('user')

    # vlans information
    self.jobs = []
    self.vlans = []
    for job_id, site in EX5.get_oargrid_job_oar_jobs(self.gridjob):
        self.jobs.append((site, job_id))
        vlan_id = EX5.get_oar_job_kavlan(job_id, site)
        if vlan_id is not None:
            # Reuse the id fetched above rather than querying it again.
            self.vlans.append((site, vlan_id))
def get_or_create_job(resources, job_name, walltime, reservation_date,
                      queue, reservation_type):
    """Return the job named *job_name*, reserving it first when absent.

    When the lookup by name yields ``None``, ``make_reservation`` is called
    with all the arguments received here. The function then waits for the
    job to start: with ``ex5.wait_oar_job_start`` when *reservation_type*
    equals ``"oar"``, with ``ex5.wait_oargrid_job_start`` otherwise.
    """
    job, _ = ex5.planning.get_job_by_name(job_name)
    if job is None:
        job = make_reservation(
            resources,
            job_name,
            walltime,
            reservation_date,
            queue,
            reservation_type,
        )

    # Single-site OAR reservation: handled first, then return early.
    if reservation_type == "oar":
        logger.info("Waiting for oarjob %s to start" % job)
        ex5.wait_oar_job_start(job)
        return job

    # Any other reservation type is waited on as an oargrid job.
    logger.info("Waiting for oargridjob %s to start" % job)
    ex5.wait_oargrid_job_start(job)
    return job
def get_job(self):
    """Attach to an existing oargrid job or reserve a new one.

    The job is looked up by ``self.config['name']``. When none is found,
    ``self._make_reservation()`` is called; its return value is not used
    here, so it presumably stores the new job id in ``self.gridjob``
    (TODO confirm). The method then waits for the job to start.

    Attributes set on ``self``:
        gridjob: the oargrid job id.
        nodes: the job nodes, sorted by their ``address``.
        start_date: ``job_info['start_date']`` when present, else ``None``.
        user: ``job_info['user']`` when present, else ``None``.
        jobs: list of ``(site, job_id)`` pairs.
        vlans: list of ``(site, vlan_id)`` pairs for the jobs whose
            KaVLAN lookup is not ``None``.

    Returns:
        The oargrid job id (``self.gridjob``).
    """
    # Look if there is a running job or make a new reservation
    self.gridjob, _ = EX5.planning.get_job_by_name(self.config['name'])
    if self.gridjob is None:
        self._make_reservation()
    else:
        logger.info("Using running oargrid job %s" % style.emph(self.gridjob))

    # Wait for the job to start
    EX5.wait_oargrid_job_start(self.gridjob)

    # A single call is enough: the former retry loop was guarded by
    # `self.nodes is None`, but sorted() always returns a list, so that
    # loop could never run a second time.
    self.nodes = sorted(EX5.get_oargrid_job_nodes(self.gridjob),
                        key=lambda n: n.address)

    check_nodes(nodes=self.nodes,
                resources=self.config['resources'],
                mode=self.config['role_distribution'])

    # Query the job information once and read both fields from it.
    job_info = EX5.get_oargrid_job_info(self.gridjob)
    # TODO - start_date is never used, dead code? Ad_rien_ - August 11th 2016
    self.start_date = job_info.get('start_date')
    self.user = job_info.get('user')

    # vlans information
    self.jobs = []
    self.vlans = []
    for job_id, site in EX5.get_oargrid_job_oar_jobs(self.gridjob):
        self.jobs.append((site, job_id))
        vlan_id = EX5.get_oar_job_kavlan(job_id, site)
        if vlan_id is not None:
            # Reuse the id fetched above rather than querying it again.
            self.vlans.append((site, vlan_id))

    return self.gridjob
def get_oargrid_job_vm5k_resources(oargrid_job_id):
    """Build the vm5k resources dict of an oargrid job.

    Waits for the job to start, then passes the job's ``(oar_job_id, site)``
    pairs to ``get_oar_job_vm5k_resources``. If one site entry has a
    ``'kavlan'`` value of 10 or more, a ``'global'`` entry is added with
    that kavlan id, the site's ``'ip_mac'`` list and the site name.
    NOTE(review): ids >= 10 presumably designate a kavlan-global - confirm.

    Args:
        oargrid_job_id: the oargrid job id; converted with ``int()``.

    Returns:
        The resources dict keyed by site, plus an optional ``'global'`` key.
    """
    oargrid_job_id = int(oargrid_job_id)
    logger.info('Waiting job start')
    wait_oargrid_job_start(oargrid_job_id)
    resources = get_oar_job_vm5k_resources(
        [(oar_job_id, site)
         for oar_job_id, site in get_oargrid_job_oar_jobs(oargrid_job_id)])

    kavlan_global = None
    # .items() works on both Python 2 and 3, unlike .iteritems().
    for site, res in resources.items():
        kavlan = res['kavlan']
        # Guard against None: ordering None against an int raises TypeError
        # on Python 3, whereas Python 2 silently evaluated it to False.
        if kavlan is not None and kavlan >= 10:
            kavlan_global = {'kavlan': kavlan,
                             'ip_mac': res['ip_mac'],
                             'site': site}
            break

    # Added after the loop so the dict is never resized while iterating.
    if kavlan_global:
        resources['global'] = kavlan_global
    return resources
def run(self):
    """Run every remaining sweeper combination on the hosts of an oargrid job.

    The job id comes from ``self.options.oargrid_job_id`` when it is set;
    otherwise ``self.make_reservation()`` is called inside the loop. For
    each job, the hosts are deployed and prepared, then one daemon thread
    running ``self.workflow`` is started per combination for as long as
    host slots are available and the job is alive. If the job dies, its id
    is reset so the outer loop goes through the reservation step again.

    On exit (normal or through an exception) a job that is still
    referenced is deleted, unless ``self.options.keep_alive`` is set.
    """
    if self.options.oargrid_job_id:
        self.oargrid_job_id = self.options.oargrid_job_id
    else:
        self.oargrid_job_id = None
    try:
        # Creation of the main iterator which is used for the first control loop.
        self.define_parameters()
        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for the experiments
            # NOTE(review): make_reservation() presumably sets
            # self.oargrid_job_id, which is read right below - confirm.
            if self.oargrid_job_id is None:
                self.make_reservation()
            # Wait for the job to start
            logger.info('Waiting that the job start')
            wait_oargrid_job_start(self.oargrid_job_id)
            # Retrieving the hosts and subnets parameters
            self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
            # Hosts deployment and configuration
            default_connection_params['user'] = '******'
            logger.info("Start hosts configuration")
            ex_log.setLevel('INFO')
            deployment = Deployment(
                hosts=self.hosts,
                env_file='/home/sirimie/env/mywheezy-x64-base.env')
            # Keep only the first element returned by deploy() as the host list.
            self.hosts, _ = deploy(deployment)
            # Remove previous result/config files, then upload the scripts
            # and XML files to every host.
            Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*",
                   self.hosts).run()
            Remote(
                "rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml",
                self.hosts).run()
            Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml",
                   self.hosts).run()
            Put(self.hosts, [
                "run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                "platform_aws.xml", "cloud_ec2.xml"
            ], remote_location="/home/Work/sgcbntier/paasage_demo/").run(
            )
            logger.info("Done")
            # NOTE(review): this emptiness check only happens after the
            # Remote/Put commands above have already been run.
            if len(self.hosts) == 0:
                break
            # Initializing the resources and threads
            # Each host is listed once per unit of its 'smt_size' attribute,
            # so a host can be handed to several threads at the same time.
            available_hosts = [
                host for host in self.hosts for i in range(
                    get_host_attributes(host)['architecture']['smt_size'])
            ]
            # Maps each running Thread to {'host': <host it runs on>}.
            threads = {}
            # Creating the unique folder for storing the results
            comb_dir = self.result_dir + '/csv_results'
            if not os.path.exists(comb_dir):
                os.mkdir(comb_dir)
            # Checking that the job is running and not in Error
            while self.is_job_alive() or len(threads.keys()) > 0:
                job_is_dead = False
                # Wait until at least n_nodes slots are free, reclaiming the
                # hosts of finished threads every 5 seconds.
                while self.options.n_nodes > len(available_hosts):
                    # Iterate over a copy because entries are deleted from
                    # `threads` inside the loop.
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.append(tmp_threads[t]['host'])
                            del threads[t]
                    sleep(5)
                    if not self.is_job_alive():
                        job_is_dead = True
                        break
                if job_is_dead:
                    break
                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    # Nothing left to start: wait for the running threads to
                    # finish, polling every 20 seconds, then leave the loop.
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break
                # Take the first free slot and run the combination on it.
                host = available_hosts[0]
                available_hosts = available_hosts[1:]
                t = Thread(target=self.workflow,
                           args=(comb, host, comb_dir))
                threads[t] = {'host': host}
                t.daemon = True
                t.start()
            if not self.is_job_alive():
                job_is_dead = True
            # Forget a dead job so the next outer iteration makes a new
            # reservation and the finally clause does not try to delete it.
            if job_is_dead:
                self.oargrid_job_id = None
    finally:
        if self.oargrid_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oargriddel([self.oargrid_job_id])
            else:
                logger.info('Keeping job alive for debugging')