def run_xp(self):
    """Iterate over the parameters and execute the bench"""
    while len(self.sweeper.get_remaining()) > 0:
        comb = self.sweeper.get_next()
        if comb['n_core'] > get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size'] * self.n_nodes:
            self.sweeper.skip(comb)
            continue
        logger.info('Processing new combination %s' % (comb,))
        site = get_cluster_site(comb['cluster'])
        jobs = oarsub([(OarSubmission(resources="{cluster='" + comb['cluster'] + "'}/nodes=" + str(self.n_nodes),
                                      job_type='allow_classic_ssh',
                                      walltime='0:10:00'), site)])
        if jobs[0][0]:
            try:
                wait_oar_job_start(*jobs[0])
                nodes = get_oar_job_nodes(*jobs[0])
                bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                    ",".join([node.address for node in nodes]),
                    comb['n_core'],
                    get_mpi_opts(comb['cluster']),
                    comb['size'],
                    comb['n_core'])
                lu_bench = SshProcess(bench_cmd, nodes[0])
                lu_bench.stdout_handlers.append(self.result_dir + '/' + slugify(comb) + '.out')
                lu_bench.run()
                if lu_bench.ok:
                    logger.info("comb ok: %s" % (comb,))
                    self.sweeper.done(comb)
                    continue
            finally:
                oardel(jobs)
        logger.info("comb NOT ok: %s" % (comb,))
        self.sweeper.cancel(comb)
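run_xp above follows the reserve / wait / run / release lifecycle that recurs throughout these examples: submit with oarsub, wait for the job, run the workload, and release the reservation with oardel in a finally block so it is freed even on failure. A minimal standalone sketch of that lifecycle, using only the execo_g5k calls seen in these snippets (the site name, walltime, and command are placeholder values):

from execo import SshProcess
from execo_g5k import (OarSubmission, oarsub, oardel,
                       wait_oar_job_start, get_oar_job_nodes)

# Placeholder reservation: one node on a hypothetical site.
jobs = oarsub([(OarSubmission(resources="nodes=1",
                              job_type='allow_classic_ssh',
                              walltime='0:10:00'), 'rennes')])
job_id, site = jobs[0]
if job_id:
    try:
        wait_oar_job_start(job_id, site)
        nodes = get_oar_job_nodes(job_id, site)
        SshProcess('hostname', nodes[0]).run()
    finally:
        # Always release the reservation, even if the workload failed.
        oardel([(job_id, site)])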
def worker_start(self, cluster, site, oarsubmission, data, worker_index):
    th = current_thread()
    th.cluster = cluster
    th.site = site
    th.worker_index = worker_index
    th.jobid = None
    try:
        with th.oarsublock:
            if th.willterminate:
                return
            worker_log.detail("submit oar job")
            ((th.jobid, _),) = oarsub([(oarsubmission, site)])
        if not th.jobid:
            worker_log.detail("job submission failed")
            self.worker(cluster, site, data, None, worker_index, oarsubmission, None)
            return  # no job to wait for
        worker_log.detail("job submitted - wait job start")
        wait_oar_job_start(th.jobid, site,
                           prediction_callback=lambda ts: worker_log.detail(
                               "job start prediction: %s" % (format_date(ts),)))
        th.waiting = False
        worker_log.detail("job started - get job nodes")
        nodes = get_oar_job_nodes(th.jobid, site)
        worker_log.detail("got %i nodes" % (len(nodes),))
        self.worker(cluster, site, data, nodes, worker_index, oarsubmission, th.jobid)
    finally:
        with th.oarsublock:
            if th.jobid:
                worker_log.detail("delete oar job")
                oardel([(th.jobid, site)])
                th.jobid = None
        worker_log.detail("exit")
def signal_handler(signal, frame):
    global interrupted, workers, jobid, site
    if interrupted:
        print('\n Releasing nodes')
        execo_g5k.oardel([(jobid, site)])
        sys.exit(1)
    else:
        print('\n Press Ctrl+C again to exit')
        interrupted = True
        if workers is not None:
            workers.kill()
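A handler like this only takes effect once it is registered; a minimal sketch using the standard library, with the module-level state the handler expects (the names mirror the globals above, the initial values are assumptions):

import signal
import sys

import execo_g5k

interrupted = False
workers = None
jobid, site = None, None  # to be filled in once oarsub has returned

signal.signal(signal.SIGINT, signal_handler)  # route Ctrl+C through the handler above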
def run(self): logger.debug("Parse and convert configs for G5K provisioner") self.configs = parse_config_file(self.args.config_file_path) kube_master_site = self.create_configs() logger.info("""Your topology: %s""" % self.configs["exp_env"]["antidote_clusters"]) # Logarithmic scale interval of latency if self.configs["parameters"][ "latency_interval"] == "logarithmic scale": start, end = self.configs["parameters"]["latency"] latency = [start, end] log_start = int(math.ceil(math.log(start))) log_end = int(math.ceil(math.log(end))) for i in range(log_start, log_end): latency.append(int(math.exp(i))) latency.append(int(math.exp(i + 0.5))) del self.configs["parameters"]["latency_interval"] self.configs["parameters"]["latency"] = list(set(latency)) if self.configs["parameters"]["benchmarks"] == "performance": self.configs["parameters"]["n_nodes_run_per_dc"] = list( range( 1, self.configs["exp_env"]["antidote_clusters"][0] ["n_antidotedb_per_dc"] + 1)) sweeper = create_combs_queue( result_dir=self.configs["exp_env"]["results_dir"], parameters=self.configs["parameters"], ) kube_namespace = "elmerfs-exp" oar_job_ids = None while len(sweeper.get_remaining()) > 0: if oar_job_ids is None: oar_job_ids, kube_master, elmerfs_hosts = self.setup_env( kube_master_site, kube_namespace) comb = sweeper.get_next() sweeper = self.run_workflow( elmerfs_hosts=elmerfs_hosts, kube_master=kube_master, kube_namespace=kube_namespace, comb=comb, sweeper=sweeper, ) if not is_job_alive(oar_job_ids): oardel(oar_job_ids) oar_job_ids = None logger.info("Finish the experiment!!!")
def get_cpu_topology(cluster, xpdir=None):
    """Return the CPU topology of a node of the given cluster,
    reading it from a cached XML file in xpdir when one exists."""
    logger.info('Determining the architecture of cluster ' +
                style.emph(cluster))
    root = None
    # Try to read the topology from a cache directory
    if xpdir:
        fname = xpdir + '/topo_' + cluster + '.xml'
        try:
            tree = parse(fname)
            root = tree.getroot()
        except:
            logger.info('No cache file found, will reserve a node and '
                        'determine topology from virsh capabilities')
    if root is None:
        frontend = get_cluster_site(cluster)
        submission = OarSubmission(resources="{cluster='" + cluster + "'}/nodes=1",
                                   walltime="0:02:00",
                                   job_type="allow_classic_ssh")
        ((job_id, _),) = oarsub([(submission, frontend)])
        wait_oar_job_start(job_id, frontend)
        host = get_oar_job_nodes(job_id, frontend)[0]
        capa = SshProcess('virsh capabilities', host,
                          connection_params={'user': default_frontend_connection_params['user']}).run()
        oardel([(job_id, frontend)])
        root = fromstring(capa.stdout)
        if xpdir is not None:
            # cache the freshly retrieved topology
            tree = ElementTree(root)
            tree.write(fname)
    cpu_topology = []
    i_cell = 0
    for cell in root.findall('.//cell'):
        cpu_topology.append([])
        for cpu in cell.findall('.//cpu'):
            cpu_topology[i_cell].append(int(cpu.attrib['id']))
        i_cell += 1
    logger.info(pformat(cpu_topology))
    return cpu_topology
def run(self): """ Main engine method to perform the experiment """ self.define_parameters() while len(self.sweeper.get_remaining()) > 0: # Getting the next combination comb = self.sweeper.get_next() logger.info(style.host(slugify(comb)) + ' has been started') self.get_nodes(comb) # If the job is broken, the program is stopped if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': break try: self.workflow(comb) # Process all combinations that can use the same submission while True: # Find the next combination combinations that can use the same submission subcomb = self.sweeper.get_next(lambda r: filter( lambda x: x['cores'] == comb['cores'] and x['cluster'] == comb['cluster'], r)) if not subcomb: logger.info( 'No more combination for cluster=%s and cores=%s', comb['cluster'], comb['cores']) break else: logger.info( style.host(slugify(subcomb)) + ' has been started') if get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error': self.workflow(subcomb) else: break # Whatever happens (errors, end of loop), the job is deleted finally: logger.info('Deleting job...') oardel([(self.oar_job_id, self.frontend)])
def run(self):
    logger.debug('Parse and convert configs for G5K provisioner')
    self.configs = parse_config_file(self.args.config_file_path)

    # Add the number of Antidote DCs as a parameter
    self.configs['parameters']['n_dc'] = len(self.configs['exp_env']['clusters'])

    logger.debug('Normalize the parameter space')
    self.normalized_parameters = define_parameters(self.configs['parameters'])

    logger.debug('Normalize the given configs')
    kube_master_site = self.create_configs()
    logger.info('''Your largest topology:
                    Antidote DCs: %s
                    n_antidotedb_per_DC: %s''' %
                (len(self.configs['exp_env']['clusters']),
                 max(self.normalized_parameters['n_nodes_per_dc'])))

    sweeper = create_combs_queue(
        result_dir=self.configs['exp_env']['results_dir'],
        parameters=self.configs['parameters'],
    )

    kube_namespace = 'elmerfs-exp'
    oar_job_ids = None
    while len(sweeper.get_remaining()) > 0:
        if oar_job_ids is None:
            oar_job_ids, kube_master = self.setup_env(kube_master_site, kube_namespace)

        comb = sweeper.get_next()
        sweeper = self.run_workflow(
            kube_master=kube_master,
            kube_namespace=kube_namespace,
            comb=comb,
            sweeper=sweeper,
        )

        if not is_job_alive(oar_job_ids):
            oardel(oar_job_ids)
            oar_job_ids = None
    logger.info('Finished the experiment.')
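Several of these cloudal-style engines (this one and the ones above and below) rely on a project-level is_job_alive helper and an oar_job_ids structure; neither is part of execo_g5k itself. A plausible minimal sketch, assuming oar_job_ids is a list of (job_id, site) pairs in the shape oardel expects, built on the get_oar_job_info call used in the other examples:

from execo_g5k import get_oar_job_info

def is_job_alive(oar_job_ids):
    """Return True while every (job_id, site) reservation is still usable."""
    for job_id, site in oar_job_ids:
        state = get_oar_job_info(job_id, site).get('state')
        if state in ('Error', 'Terminated'):
            return False
    return True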
def prepare_bench(self):
    """bench configuration and compilation, copy binaries to frontends

    return True if preparation is ok
    """
    logger.info("preparation: configure and compile benchmark")
    # the involved sites. We will do the compilation on the first of these.
    sites = list(set(map(get_cluster_site, self.parameters['cluster'])))
    # generate the bench compilation configuration
    bench_list = '\n'.join(['lu\t%s\t%s' % (size, n_core)
                            for n_core in self.parameters['n_core']
                            for size in self.parameters['size']])
    # Reserving a node because compiling on the frontend is forbidden
    # and because we need mpif77
    jobs = oarsub([(OarSubmission(resources="nodes=1",
                                  job_type='allow_classic_ssh',
                                  walltime='0:10:00'), sites[0])])
    if not jobs[0][0]:
        return False  # submission failed, nothing to compile or copy
    try:
        logger.info("copying bench archive to %s" % (sites[0],))
        copy_bench = Put([sites[0]], ['NPB3.3-MPI.tar.bz2']).run()
        logger.info("extracting bench archive on %s" % (sites[0],))
        extract_bench = Remote('tar -xjf NPB3.3-MPI.tar.bz2', [sites[0]]).run()
        logger.info("waiting job start %s" % (jobs[0],))
        wait_oar_job_start(*jobs[0], prediction_callback=pred_cb)
        logger.info("getting nodes of %s" % (jobs[0],))
        nodes = get_oar_job_nodes(*jobs[0])
        logger.info("configure bench compilation")
        conf_bench = Remote('echo "%s" > ~/NPB3.3-MPI/config/suite.def' % bench_list,
                            nodes).run()
        logger.info("compile bench")
        compilation = Remote('cd NPB3.3-MPI && make clean && make suite', nodes).run()
        logger.info("compilation finished")
    except:
        logger.error("unable to compile bench")
        return False
    finally:
        oardel(jobs)
    # Copying binaries to all other frontends
    frontends = sites[1:]
    rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI',
                   [get_host_site(nodes[0])] * len(frontends))
    rsync.run()
    return compilation.ok and rsync.ok
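wait_oar_job_start above is given a pred_cb callback that is defined elsewhere in the original tutorial; a minimal reconstruction, consistent with the inline prediction callbacks used in the other snippets here:

from execo.log import logger
from execo.time_utils import format_date

def pred_cb(ts):
    # Log OAR's estimated start date for the reservation.
    logger.info("job start prediction: %s" % (format_date(ts),))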
def run(self): """ Main engine method to perform the experiment """ self.define_parameters() while len(self.sweeper.get_remaining()) > 0: # Getting the next combination comb = self.sweeper.get_next() logger.info(style.host(slugify(comb)) + ' has been started') self.get_nodes(comb) # If the job is broken, the program is stopped if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': break try: self.workflow(comb) # Process all combinations that can use the same submission while True: # Find the next combination combinations that can use the same submission subcomb = self.sweeper.get_next(lambda r: filter(lambda x: x['cores'] == comb['cores'] and x['cluster'] == comb['cluster'], r)) if not subcomb: logger.info('No more combination for cluster=%s and cores=%s', comb['cluster'], comb['cores']) break else: logger.info(style.host(slugify(subcomb)) + ' has been started') if get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error': self.workflow(subcomb) else: break # Whatever happens (errors, end of loop), the job is deleted finally: logger.info('Deleting job...') oardel([(self.oar_job_id, self.frontend)])
def tear_down(self):
    # Destroy the Rally deployment
    try:
        if self.rally_deployed:
            logger.info("Destroying Rally deployment " + self.config['deployment-name'])
            self._run_or_abort(
                'rally deployment destroy %s' % self.config['deployment-name'],
                self.host,
                'Could not destroy the Rally deployment. This will likely '
                'cause errors when the node is used again.',
                False, {'user': '******'})
    except AttributeError:
        pass  # self.host has not been defined yet, and that's ok

    # Kill the job
    try:
        if not self.options.keep_alive and self.job_id:
            logger.info("Killing job " + str(self.job_id))
            EX5.oardel([(self.job_id, self.site)])
    except AttributeError:
        pass  # self.job_id has not been defined either, and that's ok too
def launch_bench(oarsubmission, site, folder):
    """Copy required files on frontend(s) and compile bench suite."""
    logger.info("Reserving a node.")
    jobs = oarsub([(oarsubmission, site)])
    (job_id, site) = jobs[0]
    logger.info(jobs)
    if job_id:
        try:
            logger.info("Node reserved.")
            wait_oar_job_start(job_id, site)
            logger.info("Deploying environment.")
            node = deploy_node(job_id, site, oarsubmission)
            logger.info("Compiling Bots.")
            setup_node(node)
            logger.info("Starting benchs.")
            run_bench(folder, node)
        except:
            logger.error("Unable to deploy & compile Bench.")
            oardel(jobs)
            return False
        logger.info("Benchs completed. Deleting jobs.")
        oardel(jobs)
        return True
def run(self):
    logger.debug('Parse and convert configs for G5K provisioner')
    self.configs = parse_config_file(self.args.config_file_path)
    kube_master_site = self.create_configs()

    logger.info('''Your topology:
                    Antidote DCs: %s
                    n_antidotedb_per_DC: %s
                    n_fmke_per_DC: %s
                    n_fmke_client_per_DC: %s''' % (
        len(self.configs['exp_env']['clusters']),
        self.configs['exp_env']['n_antidotedb_per_dc'],
        self.configs['exp_env']['n_fmke_app_per_dc'],
        self.configs['exp_env']['n_fmke_client_per_dc']))

    logger.debug('Creating the combination list')
    sweeper = create_combs_queue(result_dir=self.configs['exp_env']['results_dir'],
                                 parameters=self.configs['parameters'])

    kube_namespace = 'fmke-exp'
    oar_job_ids = None
    while len(sweeper.get_remaining()) > 0:
        if oar_job_ids is None:
            kube_master, oar_job_ids = self.setup_env(kube_master_site, kube_namespace)

        comb = sweeper.get_next()
        sweeper = self.run_workflow(kube_namespace=kube_namespace,
                                    kube_master=kube_master,
                                    comb=comb,
                                    sweeper=sweeper)

        if not is_job_alive(oar_job_ids):
            oardel(oar_job_ids)
            oar_job_ids = None
    logger.info('Finished the experiment.')
def run(self):
    # Go to the result folder before everything
    os.chdir(self.result_dir)
    jobs = [(_jobID, _site)]
    # Get nodes
    nodes = get_oar_job_nodes(_jobID, _site)
    try:
        logger.info("Creating hostfiles for all combinations...")
        for nbr_node in _nbrNodes:
            hostfile_filename = self.result_dir + '/' + 'hostfile-' + nbr_node
            with open(hostfile_filename, 'w') as hostfile:
                for node in nodes[:int(nbr_node)]:
                    print>>hostfile, node.address
        spack_process = Process('spack install -v chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt')
        spack_process.start()
        spack_process.wait()
        spack_process.kill()
    finally:
        logger.info("Delete job: {}".format(jobs))
        oardel(jobs)
def run(self):
    sweeper = self.create_paramsweeper()

    while True:
        comb = sweeper.get_next()
        if not comb:
            break
        comb_dir = self.result_dir + '/' + slugify(comb)
        if not os.path.isdir(comb_dir):
            os.mkdir(comb_dir)
        comb_file = comb_dir + '/trace'
        g5k_configuration['kadeploy3'] = comb['version']
        logger.info('Treating combination %s', pformat(comb))
        get_version = SshProcess(comb['version'] + ' -v', comb['site'],
                                 connection_params=default_frontend_connection_params).run()
        logger.info(get_version.stdout)

        resources = ""
        if comb['kavlan']:
            resources += "{type='kavlan'}/vlan=1+"
        resources += "nodes=" + str(comb['n_nodes'])
        sub = OarSubmission(resources=resources,
                            job_type='deploy',
                            walltime="0:30:00",
                            name='Kadeploy_Tests')
        logger.info('Performing submission of %s on site %s', resources, comb['site'])
        jobs = oarsub([(sub, comb['site'])])

        if jobs[0][0]:
            try:
                logger.info('Waiting for job to start')
                wait_oar_job_start(jobs[0][0], jobs[0][1])
                hosts = get_oar_job_nodes(jobs[0][0], jobs[0][1])
                logger.info('Deployment of %s', ' '.join([host.address for host in hosts]))
                kavlan = get_oar_job_kavlan(jobs[0][0], jobs[0][1])
                if kavlan:
                    logger.info('In kavlan %s', kavlan)
                deployment = Deployment(hosts, env_name=comb['env'], vlan=kavlan)
                deployed, undeployed = deploy(deployment,
                                              stdout_handlers=[comb_file],
                                              stderr_handlers=[comb_file])
            finally:
                logger.info('Destroying job %s on %s', str(jobs[0][0]), jobs[0][1])
                oardel([(jobs[0][0], jobs[0][1])])
        else:
            deployed, undeployed = [], []  # submission failed: nothing was deployed

        if len(deployed) == 0:
            logger.error('%s is KO', slugify(comb))
        elif len(undeployed) == 0:
            logger.info('%s is OK', slugify(comb))
        else:
            logger.warning('%s encountered problems with some hosts', slugify(comb))
        sweeper.done(comb)
def run(self): """Inherited method, put here the code for running the engine""" self.define_parameters() self.cluster = self.args[0] self.site = get_cluster_site(self.cluster) if self.options.oar_job_id: self.oar_job_id = self.options.oar_job_id else: self.oar_job_id = None try: # Creation of the main iterator which is used for the first control loop. # You need have a method called define_parameters, that returns a list of parameter dicts self.define_parameters() job_is_dead = False # While they are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if job_is_dead or self.oar_job_id is None: self.make_reservation() # Retrieving the hosts and subnets parameters self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend) # Hosts deployment deployed, undeployed = deploy( Deployment(self.hosts, env_file="/home/mliroz/deploys/hadoop6.env")) logger.info("%i deployed, %i undeployed" % (len(deployed), len(undeployed))) if len(deployed) == 0: break # Configuration du systeme => look at the execo_g5k.topology module attr = get_host_attributes(self.cluster + '-1') ## SETUP FINISHED # Getting the next combination comb = self.sweeper.get_next() self.prepare_dataset(comb) self.xp(comb) # subloop over the combinations that have the same sizes while True: newcomb = self.sweeper.get_next(lambda r: filter( lambda subcomb: subcomb['sizes'] == comb['sizes'], r)) if newcomb: try: self.xp(newcomb) except: break else: break if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([(self.oar_job_id, self.frontend)]) else: logger.info('Keeping job alive for debugging')
execo.sleep(1)
print 'sending command: ' + line
workers = execo.Remote(line, cores).start()

app = App()
if jobid:
    try:
        print 'Waiting for job to start'
        execo_g5k.wait_oar_job_start(jobid, site)
        print 'Retrieving nodes'
        nodes = execo_g5k.get_oar_job_nodes(jobid, site)
        # Setup nodes
        print 'Preparing workers with cmd: ' + setup_cmd
        workers = execo.Remote(setup_cmd, nodes).start()
        workers.expect('Worker Setup Completed')
        workers.kill()
        # Possibly open more than one connection per machine
        cores = nodes * args.nb_cores
        print cores
        print 'Example cmd: %s' % (workers_cmd)
        app.prompt = '%s (%d node(s), %d core(s)/node)> ' % (
            site, args.volunteers, args.nb_cores)
        app.cmdloop()
        # execo.sleep(600)
        # print 'Workers done'
    finally:
        execo_g5k.oardel([(jobid, site)])
def run(self):
    # Go to the result folder before everything
    os.chdir(self.result_dir)
    # OARSUB
    jobs = oarsub([(OarSubmission(resources='nodes=' + _nbrNodes.__str__(),
                                  job_type='deploy',
                                  walltime=_walltime,
                                  sql_properties=_properties), _site)])
    job_id, site = jobs[0]
    try:
        # KADEPLOY
        logger.info("Waiting job start %s on %s" % (job_id, site))
        wait_oar_job_start(job_id, site, prediction_callback=prediction_callback)
        logger.info("getting nodes of %s on %s" % (job_id, site))
        nodes = get_oar_job_nodes(job_id, site)
        deployed, undeployed = deploy(Deployment(nodes, env_name=env_name),
                                      check_deployed_command=already_configured)
        if undeployed:
            logger.warn("NOT deployed nodes: {}".format(str(undeployed)))
            raise RuntimeError('Deployment failed')

        # STARPU INSTALLATION
        spack_spec = 'chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt'
        spack_command = 'spack install -v' + ' ' + spack_spec
        logger.info("Starting StarPU installation...")
        spack_process = Process(spack_command).start()
        spack_process.wait()
        logger.info("StarPU installation DONE...")
        self.checkProcess(spack_process)
        spack_process.kill()

        # STARPU DIRECTORY
        logger.info("Searching and going to StarPU installation directory...")
        # Note: the original ran Process(spack_spec), which would execute the
        # spec itself as a shell command; querying the install path with
        # 'spack location -i' is presumably what was intended.
        starpu_location_process = Process('spack location -i ' + spack_spec).start()
        starpu_location_process.wait()
        self.checkProcess(starpu_location_process)
        starpu_cd_process = Process('cd ' + starpu_location_process.stdout.strip() +
                                    '/lib/chameleon').start()
        starpu_cd_process.wait()
        self.checkProcess(starpu_cd_process)
        starpu_location_process.kill()
        starpu_cd_process.kill()

        # RUNNING EXPERIMENT
        logger.info("Starting StarPU experiment...")
        starpu_experiment_process = Process("""
            export STARPU_WORKER_STATS=1
            export STARPU_CALIBRATE=2
            ./timing/time_spotrf_tile --warmup --gpus=3 --threads=9 --nb=960 --ib=96 --n_range=48000:48000:9600
            """)
        # create output file for StarPU
        starpu_experiment_process.stdout_handlers.append(self.result_dir + '/' + 'StarPU.out')
        starpu_experiment_process.start()
        starpu_experiment_process.wait()
        logger.info("StarPU experiment DONE...")
        self.checkProcess(starpu_experiment_process)
        starpu_experiment_process.kill()
    finally:
        logger.info("Delete job: {}".format(jobs))
        oardel(jobs)
def oar_destroy_from_id(oarjob, site):
    """Destroy the job."""
    if oarjob is not None and site is not None:
        oarjob = int(oarjob)  # cast after the None check so a missing id does not crash
        ex5.oardel([[oarjob, site]])
        logger.info("Killing the job %s" % oarjob)
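Throughout these examples oardel takes a list of (job_id, site) pairs, one per reservation, so several jobs can be released in a single call. A trivial illustration (the job ids and site names are placeholders):

from execo_g5k import oardel

# Release two reservations at once; tuples and two-element lists both appear above.
oardel([(1234567, 'rennes'), (7654321, 'nancy')])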
def run(self): """Inherited method, put here the code for running the engine""" self.define_parameters() self.cluster = self.args[0] self.site = get_cluster_site(self.cluster) if self.options.oar_job_id: self.oar_job_id = self.options.oar_job_id else: self.oar_job_id = None try: # Creation of the main iterator which is used for the first control loop. # You need have a method called define_parameters, that returns a list of parameter dicts self.define_parameters() job_is_dead = False # While they are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if job_is_dead or self.oar_job_id is None: self.make_reservation() # Retrieving the hosts and subnets parameters self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend) # Hosts deployment deployed, undeployed = deploy(Deployment(self.hosts, env_file="/home/mliroz/deploys/hadoop6.env")) logger.info("%i deployed, %i undeployed" % (len(deployed), len(undeployed))) if len(deployed) == 0: break # Configuration du systeme => look at the execo_g5k.topology module attr = get_host_attributes(self.cluster + '-1') ## SETUP FINISHED # Getting the next combination comb = self.sweeper.get_next() self.prepare_dataset(comb) self.xp(comb) # subloop over the combinations that have the same sizes while True: newcomb = self.sweeper.get_next(lambda r: filter(lambda subcomb: subcomb['sizes'] == comb['sizes'], r)) if newcomb: try: self.xp(newcomb) except: break else: break if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([(self.oar_job_id, self.frontend)]) else: logger.info('Keeping job alive for debugging')
def run(self):
    num_total_workers = 0
    sites_clusters_threads = {}  # dict: keys = sites, values =
                                 #   dict: keys = clusters, values =
                                 #     list: threads
    try:
        while True:
            t = Timer()
            clusters_to_submit = set()
            for clusterspec in self.get_clusters():
                cluster, _, site = clusterspec.partition(".")
                if site == "":
                    site = get_cluster_site(cluster)
                clusters_to_submit.add((cluster, site))
            for site in sites_clusters_threads.keys():
                for cluster in sites_clusters_threads[site].keys():
                    sites_clusters_threads[site][cluster] = [
                        th for th in sites_clusters_threads[site][cluster]
                        if th.is_alive()]
                    if len(sites_clusters_threads[site][cluster]) == 0:
                        del sites_clusters_threads[site][cluster]
                if len(sites_clusters_threads[site]) == 0:
                    del sites_clusters_threads[site]
            all_involved_sites = set(sites_clusters_threads.keys())
            all_involved_sites.update([s for (c, s) in clusters_to_submit])
            no_submissions = True
            for site in all_involved_sites:
                all_involved_clusters = set()
                if sites_clusters_threads.has_key(site):
                    all_involved_clusters.update(sites_clusters_threads[site].keys())
                all_involved_clusters.update([c for (c, s) in clusters_to_submit if s == site])
                for cluster in all_involved_clusters:
                    num_workers = 0
                    num_waiting = 0
                    if sites_clusters_threads.has_key(site) and sites_clusters_threads[site].has_key(cluster):
                        num_workers = len(sites_clusters_threads[site][cluster])
                        num_waiting = len([th for th in sites_clusters_threads[site][cluster]
                                           if th.waiting])
                    num_max_new_workers = min(self.options.max_workers - num_workers,
                                              self.options.max_waiting - num_waiting)
                    logger.trace("rescheduling on cluster %s@%s: num_workers = %s / num_waiting = %s / num_max_new_workers = %s"
                                 % (cluster, site, num_workers, num_waiting, num_max_new_workers))
                    if num_max_new_workers > 0:
                        for worker_index in range(0, num_max_new_workers):
                            jobdata = self.get_job(cluster)
                            if not jobdata:
                                break
                            no_submissions = False
                            logger.detail("spawning worker %i on %s@%s"
                                          % (num_total_workers, cluster, site))
                            (oarsubmission, data) = jobdata
                            th = Thread(target=self.worker_start,
                                        args=(cluster, site, oarsubmission,
                                              data, num_total_workers,))
                            th.waiting = True
                            th.daemon = True
                            th.oarsublock = Lock()
                            th.willterminate = False
                            th.start()
                            num_total_workers += 1
                            if not sites_clusters_threads.has_key(site):
                                sites_clusters_threads[site] = {}
                            if not sites_clusters_threads[site].has_key(cluster):
                                sites_clusters_threads[site][cluster] = []
                            sites_clusters_threads[site][cluster].append(th)
            if no_submissions and len(sites_clusters_threads) == 0:
                break
            sleep(self.options.schedule_delay)
        logger.detail("no more combinations to explore. exit schedule loop")
    finally:
        for site in sites_clusters_threads.keys():
            for cluster in sites_clusters_threads[site].keys():
                for th in sites_clusters_threads[site][cluster]:
                    with th.oarsublock:
                        th.willterminate = True
                        if th.jobid:
                            logger.detail("cleaning: delete job %i of worker #%i on %s"
                                          % (th.jobid, th.worker_index, site))
                            oardel([(th.jobid, site)])
                            th.jobid = None
def delete_job(self):
    EX5.oardel([self.gridjob])
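delete_job above passes a one-element list, which suggests self.gridjob already holds a (job_id, site) pair in the shape oardel expects. For reservations made with oargridsub, a minimal sketch of the grid-level cleanup, assuming the oargriddel helper from execo_g5k.oargrid, which takes a list of oargrid job ids (the id below is a placeholder):

from execo_g5k import oargriddel

# Release an oargrid reservation by its grid job id (placeholder value).
oargriddel([42424])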
def run(self): """The main experimental workflow, as described in ``Using the Execo toolkit to perform ...`` """ self.force_options() # The argument is a cluster self.cluster = self.args[0] self.frontend = get_cluster_site(self.cluster) # Analyzing options if self.options.oar_job_id: self.oar_job_id = self.options.oar_job_id else: self.oar_job_id = None try: # Creation of the main iterator which is used for the first control loop. # You need have a method called define_parameters, that returns a list of parameter dicts self.create_paramsweeper() job_is_dead = False # While they are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if self.oar_job_id is None: self.make_reservation() # Retrieving the hosts and subnets parameters self.get_resources() # Hosts deployment and configuration if not self.options.no_hosts_setup: self.setup_hosts() if len(self.hosts) == 0: break # Initializing the resources and threads available_hosts = list(self.hosts) available_ip_mac = list(self.ip_mac) threads = {} # Checking that the job is running and not in Error while self.is_job_alive()['state'] != 'Error' \ or len(threads.keys()) > 0: # while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \ # or len(threads.keys()) > 0: job_is_dead = False while self.options.n_nodes > len(available_hosts): tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): available_hosts.extend(tmp_threads[t]['hosts']) available_ip_mac.extend(tmp_threads[t]['ip_mac']) del threads[t] sleep(5) if self.is_job_alive()['state'] == 'Error': job_is_dead = True break if job_is_dead: break # Getting the next combination comb = self.sweeper.get_next() if not comb: while len(threads.keys()) > 0: tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): del threads[t] logger.info('Waiting for threads to complete') sleep(20) break used_hosts = available_hosts[0:self.options.n_nodes] available_hosts = available_hosts[self.options.n_nodes:] n_vm = self.comb_nvm(comb) used_ip_mac = available_ip_mac[0:n_vm] available_ip_mac = available_ip_mac[n_vm:] t = Thread(target=self.workflow, args=(comb, used_hosts, used_ip_mac)) threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac} logger.debug('Threads: %s', len(threads)) t.daemon = True t.start() # if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': if self.is_job_alive()['state'] == 'Error': job_is_dead = True if job_is_dead: self.oar_job_id = None finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([(self.oar_job_id, self.frontend)]) else: logger.info('Keeping job alive for debugging')
def run(self): """ """ if self.options.oargrid_job_id is not None: self.oar_job_id = self.options.oargrid_job_id else: self.oar_job_id = None self.list_of_clusters = [ 'parasilo', 'paravance', 'parapluie', 'paranoia' ] try: # Creation of the main iterator which is used for the first control loop. self.define_parameters() self.working_dir = '/data/jorouzaudcornabas_' + str( self.options.storage5k_job_id) job_is_dead = False # While there are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if self.oar_job_id is None: self.submit_all_available_best_effort( self.list_of_clusters, self.options.walltime) # self.make_reservation_local() # Wait that the job starts logger.info('Waiting that the job start ' + str(self.oar_job_id)) wait_oar_job_start(self.oar_job_id) # Retrieving the hosts and subnets parameters self.hosts = get_oar_job_nodes(self.oar_job_id) # Hosts deployment and configuration default_connection_params['user'] = '******' logger.info("Start hosts configuration") ex_log.setLevel('INFO') #=============================================================== # deployment = Deployment(hosts = self.hosts, # env_file='/home/sirimie/env/mywheezy-x64-base.env') # self.hosts, _ = deploy(deployment) #=============================================================== if len(self.hosts) == 0: break # Initializing the resources and threads available_hosts = self.hosts threads = {} # Creating the unique folder for storing the results comb_dir = self.result_dir + '/logs' if not os.path.exists(comb_dir): os.mkdir(comb_dir) logger.info("Starting the thread " + str(self.is_job_alive()) + " " + str(len(threads.keys()))) # Checking that the job is running and not in Error while self.is_job_alive() or len(threads.keys()) > 0: job_is_dead = False while self.options.n_nodes > len(available_hosts): tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): available_hosts.append(tmp_threads[t]['host']) del threads[t] sleep(5) if not self.is_job_alive(): job_is_dead = True break if job_is_dead: break # Getting the next combination comb = self.sweeper.get_next() if not comb: while len(threads.keys()) > 0: tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): del threads[t] logger.info('Waiting for threads to complete') sleep(20) break host = available_hosts[0] available_hosts = available_hosts[1:] logger.info("Launching thread") t = Thread(target=self.workflow, args=(comb, host, comb_dir)) threads[t] = {'host': host} t.daemon = True t.start() if not self.is_job_alive(): job_is_dead = True if job_is_dead: self.oar_job_id = None finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([self.oar_job_id]) else: logger.info('Keeping job alive for debugging')
def run(self): """Run the experiment""" already_configured = self.options.already_configured reservation_job_id = int(self.options.reservation_id) \ if self.options.reservation_id is not None else None is_a_test = self.options.is_a_test if is_a_test: logger.warn('THIS IS A TEST! This run will use only a few ' 'resources') # make the result folder writable for all os.chmod(self.result_dir, 0o777) # Import configuration with open(self.args[0]) as config_file: config = json.load(config_file) # backup configuration copy(self.args[0], self.result_dir) site = config["grid5000_site"] resources = config["resources"] nb_experiment_nodes = config["nb_experiment_nodes"] walltime = str(config["walltime"]) env_name = config["kadeploy_env_name"] workloads = config["workloads"] # check if workloads exists (Suppose that the same NFS mount point # is present on the remote and the local environment for workload_file in workloads: with open(workload_file): pass # copy the workloads files to the results dir copy(workload_file, self.result_dir) # define the workloads parameters self.parameters = {'workload_filename': workloads} logger.info('Workloads: {}'.format(workloads)) # define the iterator over the parameters combinations self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweep(self.parameters)) # Due to previous (using -c result_dir) run skip some combination logger.info('Skipped parameters:' + '{}'.format(str(self.sweeper.get_skipped()))) logger.info('Number of parameters combinations {}'.format( str(len(self.sweeper.get_remaining())))) logger.info('combinations {}'.format(str( self.sweeper.get_remaining()))) if reservation_job_id is not None: jobs = [(reservation_job_id, site)] else: jobs = oarsub([(OarSubmission(resources=resources, job_type='deploy', walltime=walltime), site)]) job_id, site = jobs[0] if job_id: try: logger.info("waiting job start %s on %s" % (job_id, site)) wait_oar_job_start(job_id, site, prediction_callback=prediction_callback) logger.info("getting nodes of %s on %s" % (job_id, site)) nodes = get_oar_job_nodes(job_id, site) # sort the nodes nodes = sorted(nodes, key=lambda node: node.address) # get only the necessary nodes under the switch if nb_experiment_nodes > len(nodes): raise RuntimeError('The number of given node in the ' 'reservation ({}) do not match the ' 'requested resources ' '({})'.format(len(nodes), nb_experiment_nodes)) nodes = nodes[:nb_experiment_nodes] logger.info("deploying nodes: {}".format(str(nodes))) deployed, undeployed = deploy( Deployment(nodes, env_name=env_name), check_deployed_command=already_configured) if undeployed: logger.warn("NOT deployed nodes: {}".format( str(undeployed))) raise RuntimeError('Deployement failed') if not already_configured: # install OAR install_cmd = "apt-get update; apt-get install -y " node_packages = "oar-node" logger.info("installing OAR nodes: {}".format( str(nodes[1:]))) install_oar_nodes = Remote( install_cmd + node_packages, nodes[1:], connection_params={'user': '******'}) install_oar_nodes.start() server_packages = ( "oar-server oar-server-pgsql oar-user " "oar-user-pgsql postgresql python3-pip " "libjson-perl postgresql-server-dev-all") install_oar_sched_cmd = """ mkdir -p /opt/oar_sched; \ cd /opt/oar_sched; \ git clone https://github.com/oar-team/oar3.git; \ cd oar3; \ git checkout dce942bebc2; \ pip3 install -e .; \ cd /usr/lib/oar/schedulers; \ ln -s /usr/local/bin/kamelot; \ pip3 install psycopg2 """ logger.info("installing OAR server node: {}".format( str(nodes[0]))) install_master = 
SshProcess( install_cmd + server_packages + ";" + install_oar_sched_cmd, nodes[0], connection_params={'user': '******'}) install_master.run() install_oar_nodes.wait() if not install_master.ok: Report(install_master) configure_oar_cmd = """ sed -i \ -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \ -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \ -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \ -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \ -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \ -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \ -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \ -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \ -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \ -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \ -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \ -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \ -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \ -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \ /etc/oar/oar.conf """.format(result_dir=self.result_dir) configure_oar = Remote(configure_oar_cmd, nodes, connection_params={'user': '******'}) configure_oar.run() logger.info("OAR is configured on all nodes") # Configure server create_db = "oar-database --create --db-is-local" config_oar_sched = ( "oarnotify --remove-queue default;" "oarnotify --add-queue default,1,kamelot") start_oar = "systemctl start oar-server.service" logger.info("configuring OAR database: {}".format( str(nodes[0]))) config_master = SshProcess( create_db + ";" + config_oar_sched + ";" + start_oar, nodes[0], connection_params={'user': '******'}) config_master.run() # propagate SSH keys logger.info("configuring OAR SSH") oar_key = "/tmp/.ssh" Process('rm -rf ' + oar_key).run() Process( 'scp -o BatchMode=yes -o PasswordAuthentication=no ' '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ' '-o ConnectTimeout=20 -rp -o User=root ' + nodes[0].address + ":/var/lib/oar/.ssh" ' ' + oar_key).run() # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run() Put(nodes[1:], [oar_key], "/var/lib/oar/", connection_params={ 'user': '******' }).run() add_resources_cmd = """ oarproperty -a cpu || true; \ oarproperty -a core || true; \ oarproperty -c -a host || true; \ oarproperty -a mem || true; \ """ for node in nodes[1:]: add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format( node=node.address) add_resources = SshProcess( add_resources_cmd, nodes[0], connection_params={'user': '******'}) add_resources.run() if add_resources.ok: logger.info("oar is now configured!") else: raise RuntimeError( "error in the OAR configuration: Abort!") # TODO backup de la config de OAR # Do the replay logger.info('begining the replay') while len(self.sweeper.get_remaining()) > 0: combi = self.sweeper.get_next() workload_file = os.path.basename( combi['workload_filename']) oar_replay = SshProcess( script_path + "/oar_replay.py " + combi['workload_filename'] + " " + self.result_dir + " oar_gant_" + workload_file, nodes[0]) oar_replay.stdout_handlers.append(self.result_dir + '/' + workload_file + '.out') logger.info("replaying workload: {}".format(combi)) oar_replay.run() if oar_replay.ok: logger.info("Replay workload OK: {}".format(combi)) self.sweeper.done(combi) else: logger.info("Replay workload NOT OK: {}".format(combi)) self.sweeper.cancel(combi) raise RuntimeError("error in the OAR replay: Abort!") except: traceback.print_exc() ipdb.set_trace() finally: if is_a_test: ipdb.set_trace() if reservation_job_id is None: logger.info("delete job: 
{}".format(jobs)) oardel(jobs)
def run(self): """Run the experiment""" already_configured = self.options.already_configured reservation_job_id = int(self.options.reservation_id) \ if self.options.reservation_id is not None else None is_a_test = self.options.is_a_test if is_a_test: logger.warn('THIS IS A TEST! This run will use only a few ' 'resources') # make the result folder writable for all os.chmod(self.result_dir, 0o777) # Import configuration with open(self.args[0]) as config_file: config = json.load(config_file) # backup configuration copy(self.args[0], self.result_dir) site = config["grid5000_site"] resources = config["resources"] nb_experiment_nodes = config["nb_experiment_nodes"] walltime = str(config["walltime"]) env_name = config["kadeploy_env_name"] workloads = config["workloads"] # check if workloads exists (Suppose that the same NFS mount point # is present on the remote and the local environment for workload_file in workloads: with open(workload_file): pass # copy the workloads files to the results dir copy(workload_file, self.result_dir) # define the workloads parameters self.parameters = { 'workload_filename': workloads } logger.info('Workloads: {}'.format(workloads)) # define the iterator over the parameters combinations self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweep(self.parameters)) # Due to previous (using -c result_dir) run skip some combination logger.info('Skipped parameters:' + '{}'.format(str(self.sweeper.get_skipped()))) logger.info('Number of parameters combinations {}'.format( str(len(self.sweeper.get_remaining())))) logger.info('combinations {}'.format( str(self.sweeper.get_remaining()))) if reservation_job_id is not None: jobs = [(reservation_job_id, site)] else: jobs = oarsub([(OarSubmission(resources=resources, job_type='deploy', walltime=walltime), site)]) job_id, site = jobs[0] if job_id: try: logger.info("waiting job start %s on %s" % (job_id, site)) wait_oar_job_start( job_id, site, prediction_callback=prediction_callback) logger.info("getting nodes of %s on %s" % (job_id, site)) nodes = get_oar_job_nodes(job_id, site) # sort the nodes nodes = sorted(nodes, key=lambda node: node.address) # get only the necessary nodes under the switch if nb_experiment_nodes > len(nodes): raise RuntimeError('The number of given node in the ' 'reservation ({}) do not match the ' 'requested resources ' '({})'.format(len(nodes), nb_experiment_nodes)) nodes = nodes[:nb_experiment_nodes] logger.info("deploying nodes: {}".format(str(nodes))) deployed, undeployed = deploy( Deployment(nodes, env_name=env_name), check_deployed_command=already_configured) if undeployed: logger.warn( "NOT deployed nodes: {}".format(str(undeployed))) raise RuntimeError('Deployement failed') if not already_configured: # install OAR install_cmd = "apt-get update; apt-get install -y " node_packages = "oar-node" logger.info( "installing OAR nodes: {}".format(str(nodes[1:]))) install_oar_nodes = Remote( install_cmd + node_packages, nodes[1:], connection_params={'user': '******'}) install_oar_nodes.start() server_packages = ("oar-server oar-server-pgsql oar-user " "oar-user-pgsql postgresql python3-pip " "libjson-perl postgresql-server-dev-all") install_oar_sched_cmd = """ mkdir -p /opt/oar_sched; \ cd /opt/oar_sched; \ git clone https://github.com/oar-team/oar3.git; \ cd oar3; \ git checkout dce942bebc2; \ pip3 install -e .; \ cd /usr/lib/oar/schedulers; \ ln -s /usr/local/bin/kamelot; \ pip3 install psycopg2 """ logger.info("installing OAR server node: {}".format(str(nodes[0]))) install_master = 
SshProcess(install_cmd + server_packages + ";" + install_oar_sched_cmd, nodes[0], connection_params={'user': '******'}) install_master.run() install_oar_nodes.wait() if not install_master.ok: Report(install_master) configure_oar_cmd = """ sed -i \ -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \ -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \ -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \ -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \ -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \ -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \ -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \ -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \ -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \ -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \ -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \ -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \ -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \ -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \ /etc/oar/oar.conf """.format(result_dir=self.result_dir) configure_oar = Remote(configure_oar_cmd, nodes, connection_params={'user': '******'}) configure_oar.run() logger.info("OAR is configured on all nodes") # Configure server create_db = "oar-database --create --db-is-local" config_oar_sched = ("oarnotify --remove-queue default;" "oarnotify --add-queue default,1,kamelot") start_oar = "systemctl start oar-server.service" logger.info( "configuring OAR database: {}".format(str(nodes[0]))) config_master = SshProcess(create_db + ";" + config_oar_sched + ";" + start_oar, nodes[0], connection_params={'user': '******'}) config_master.run() # propagate SSH keys logger.info("configuring OAR SSH") oar_key = "/tmp/.ssh" Process('rm -rf ' + oar_key).run() Process('scp -o BatchMode=yes -o PasswordAuthentication=no ' '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ' '-o ConnectTimeout=20 -rp -o User=root ' + nodes[0].address + ":/var/lib/oar/.ssh" ' ' + oar_key).run() # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run() Put(nodes[1:], [oar_key], "/var/lib/oar/", connection_params={'user': '******'}).run() add_resources_cmd = """ oarproperty -a cpu || true; \ oarproperty -a core || true; \ oarproperty -c -a host || true; \ oarproperty -a mem || true; \ """ for node in nodes[1:]: add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(node=node.address) add_resources = SshProcess(add_resources_cmd, nodes[0], connection_params={'user': '******'}) add_resources.run() if add_resources.ok: logger.info("oar is now configured!") else: raise RuntimeError("error in the OAR configuration: Abort!") # TODO backup de la config de OAR # Do the replay logger.info('begining the replay') while len(self.sweeper.get_remaining()) > 0: combi = self.sweeper.get_next() workload_file = os.path.basename(combi['workload_filename']) oar_replay = SshProcess(script_path + "/oar_replay.py " + combi['workload_filename'] + " " + self.result_dir + " oar_gant_" + workload_file, nodes[0]) oar_replay.stdout_handlers.append(self.result_dir + '/' + workload_file + '.out') logger.info("replaying workload: {}".format(combi)) oar_replay.run() if oar_replay.ok: logger.info("Replay workload OK: {}".format(combi)) self.sweeper.done(combi) else: logger.info("Replay workload NOT OK: {}".format(combi)) self.sweeper.cancel(combi) raise RuntimeError("error in the OAR replay: Abort!") except: traceback.print_exc() ipdb.set_trace() finally: if is_a_test: ipdb.set_trace() if reservation_job_id is None: logger.info("delete job: 
{}".format(jobs)) oardel(jobs)
def run(self): """The main experimental workflow, as described in ``Using the Execo toolkit to perform ...`` """ self.force_options() # The argument is a cluster self.cluster = self.args[0] self.frontend = get_cluster_site(self.cluster) # Analyzing options if self.options.oar_job_id: self.oar_job_id = self.options.oar_job_id else: self.oar_job_id = None try: # Creation of the main iterator which is used for the first control loop. # You need have a method called define_parameters, that returns a list of parameter dicts self.create_paramsweeper() job_is_dead = False # While they are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if self.oar_job_id is None: self.make_reservation() # Retrieving the hosts and subnets parameters self.get_resources() # Hosts deployment and configuration if not self.options.no_hosts_setup: self.setup_hosts() if len(self.hosts) == 0: break # Initializing the resources and threads available_hosts = list(self.hosts) available_ip_mac = list(self.ip_mac) threads = {} # Checking that the job is running and not in Error while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \ or len(threads.keys()) > 0: job_is_dead = False while self.options.n_nodes > len(available_hosts): tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): available_hosts.extend(tmp_threads[t]['hosts']) available_ip_mac.extend( tmp_threads[t]['ip_mac']) del threads[t] sleep(5) if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True break if job_is_dead: break # Getting the next combination comb = self.sweeper.get_next() if not comb: while len(threads.keys()) > 0: tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): del threads[t] logger.info('Waiting for threads to complete') sleep(20) break used_hosts = available_hosts[0:self.options.n_nodes] available_hosts = available_hosts[self.options.n_nodes:] n_vm = self.comb_nvm(comb) used_ip_mac = available_ip_mac[0:n_vm] available_ip_mac = available_ip_mac[n_vm:] t = Thread(target=self.workflow, args=(comb, used_hosts, used_ip_mac)) threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac} logger.debug('Threads: %s', len(threads)) t.daemon = True t.start() if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True if job_is_dead: self.oar_job_id = None finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([(self.oar_job_id, self.frontend)]) else: logger.info('Keeping job alive for debugging')
job_name="cloudal_docker") provisioner.provisioning() hosts = provisioner.hosts self.oar_result = provisioner.oar_result logger.info("Provisioning nodes: DONE") logger.info("Starting configure Docker on nodes") self.config_host(hosts) logger.info("Configuring Docker on nodes: DONE") if __name__ == "__main__": logger.info("Init engine in %s" % __file__) engine = config_docker_env_g5k() try: logger.info("Start engine in %s" % __file__) engine.start() except Exception as e: logger.info('Program is terminated by the following exception:') traceback.print_exc() except KeyboardInterrupt: logger.info('Program is terminated by keyboard interrupt.') if not engine.args.keep_alive: logger.info('Deleting reservation') oardel(engine.oar_result) logger.info('Reservation deleted') else: logger.info('Reserved nodes are kept alive for inspection purpose.')
def run(self):
    rtt_file = self.result_dir + "/rtt.csv"
    resolver = None
    client = 'tcpclient' if self.args.mode == 'tcp' else 'udpclient'
    try:
        logger.debug("Experiment ID: {}".format(self.exp_id))
        if self.multi_site():
            logger.info("Running in multi-site mode")
        if not self.multi_site():
            self.reserve_resources_singlejob()
            logger.debug("Waiting for OAR job to start...")
            g5k.wait_oar_job_start(*self.vmhosts_job)
            self.prepare_subnet()
            logger.debug("Prepared subnet")

        # Dependencies (besides the obvious ones):
        # - deploy_server depends on prepare_global_vlan
        # - prepare_server depends on deploy_server
        # - prepare_server depends on prepare_subnet
        # - prepare_vm depends on deploy_server
        if self.multi_site():
            self.reserve_global_vlan()
            g5k.wait_oar_job_start(*self.globalvlan_job)
            logger.debug("Waiting for global VLAN job to start...")
            self.prepare_global_vlan()

        self.log_experimental_conditions()

        logger.debug("Deploying VM hosts...")
        machines_deploy_process = self.start_deploy_vmhosts()
        logger.debug("Deploying server image...")
        server_deploy_process = self.start_deploy_server()
        machines_deploy_process.wait()
        logger.debug("Finishing deploying VM hosts...")
        self.finish_deploy_vmhosts(machines_deploy_process)
        logger.debug("Setting up VM hosts...")
        machines_setup_process = self.prepare_vmhosts()
        machines_setup_process.wait()
        logger.debug("VM hosts are setup.")
        server_deploy_process.wait()
        logger.debug("Finishing deploying server...")
        self.finish_deploy_server(server_deploy_process)
        logger.debug("Server is deployed.")
        self.vm_process = self.start_all_vm()
        # Ensure VMs are killed when we exit
        with self.vm_process:
            server_setup_process = self.prepare_server()
            self.wait_until_vm_ready()
            vm_setup_process = self.prepare_vm()
            server_setup_process.wait()
            self.log_output(server_setup_process, "server_setup_process")
            if not server_setup_process.ok:
                logger.error("Error while preparing server, please check logs for 'server_setup_process'")
                raise Exception
            logger.debug("Prepared server: {}".format(self.server.address))
            vm_setup_process.wait()
            self.log_output(vm_setup_process, "vm_setup_process")
            if not vm_setup_process.ok:
                logger.error("Error while preparing VMs, please check logs for 'vm_setup_process'")
                raise Exception
            logger.debug("Prepared VM")
            logger.info("Started {} VMs.".format(len(self.vm)))
            cpunetlog_vms = self.start_cpunetlog(self.vm)
            cpunetlog_server = self.start_cpunetlog([self.server],
                                                    self.server_conn_params)
            resolver = self.start_dns_server()
            logger.info("Started resolver ({}) on {}.".format(self.resolver_name,
                                                              self.server.address))

            # Leave time for the resolver to start
            if self.args.resolver_slots_per_thread < 1000000:
                execo.sleep(15)
            else:
                execo.sleep(60)
            logger.info("Starting {} on all VMs...".format(client))
            clients = self.start_client_vm()
            clients.wait()
            logger.info("{} finished!".format(client))
            logger.info("Writing cpunetlog output to disk.")
            cpunetlog_server.kill().wait()
            cpunetlog_vms.kill().wait()
            self.log_output(cpunetlog_server, "cpunetlog_server")
            self.log_output(cpunetlog_vms, "cpunetlog_vms")

            logger.info("writing {} results to disk.".format(client))
            self.log_output(clients, "clients", log_stdout=False)
            with open(rtt_file, 'w') as rtt_output:
                need_header = True
                rtt = csv.writer(rtt_output)
                for client_id, client in enumerate(clients.processes):
                    first_line = True
                    for line in iter(client.stdout.splitlines()):
                        # Skip anything that does not look like CSV
                        if ',' not in line:
                            continue
                        if need_header:
                            # Take the CSV header from the first client and add a column
                            data = line.split(",")
                            data.insert(0, "vm_id")
                            rtt.writerow(data)
                            need_header = False
                            first_line = False
                        elif first_line:
                            # Skip the first line of subsequent clients
                            first_line = False
                        else:
                            # Add a column with the VM ID
                            data = line.split(",")
                            data.insert(0, client_id)
                            rtt.writerow(data)
    except Exception as e:
        logger.error("Exception raised: {}\n{}".format(e, format_exc()))
    finally:
        #self.kill_all_vm()
        if self.vm_process:
            self.vm_process.kill()
        if resolver:
            resolver.kill()
            logger.debug("Waiting for resolver to exit")
            resolver.wait()
            self.log_output(resolver, "resolver")
        if self.vm_process:
            logger.debug("Waiting for VM to exit")
            self.vm_process.wait()
            logger.info("Resolver and all VMs are shut down")
            self.log_output(self.vm_process, "vm_process")
            print(execo.Report([self.vm_process]).to_string())
            #for s in self.vm_process.processes:
            #    print("\n%s\nstdout:\n%s\nstderr:\n%s\n" % (s, s.stdout, s.stderr))
        g5k.oardel([self.vmhosts_job])