Example #1
 def run_xp(self):
     """Iterate over the parameters and execute the bench"""
     while len(self.sweeper.get_remaining()) > 0:
         comb = self.sweeper.get_next()
         if comb['n_core'] > get_host_attributes(comb['cluster']+'-1')['architecture']['smt_size'] * self.n_nodes: 
             self.sweeper.skip(comb)
             continue
         logger.info('Processing new combination %s' % (comb,))
         site = get_cluster_site(comb['cluster'])
         jobs = oarsub([(OarSubmission(resources = "{cluster='" + comb['cluster']+"'}/nodes=" + str(self.n_nodes),
                                       job_type = 'allow_classic_ssh', 
                                       walltime ='0:10:00'), 
                         site)])
         if jobs[0][0]:
             try:
                 wait_oar_job_start(*jobs[0])
                 nodes = get_oar_job_nodes(*jobs[0])
                 bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                     ",".join([node.address for node in nodes]),
                     comb['n_core'],
                     get_mpi_opts(comb['cluster']),
                     comb['size'],
                     comb['n_core'])
                 lu_bench = SshProcess(bench_cmd, nodes[0])
                 lu_bench.stdout_handlers.append(self.result_dir + '/' + slugify(comb) + '.out')
                 lu_bench.run()
                 if lu_bench.ok:
                     logger.info("comb ok: %s" % (comb,))
                     self.sweeper.done(comb)
                     continue
             finally:
                 oardel(jobs)
         logger.info("comb NOT ok: %s" % (comb,))
         self.sweeper.cancel(comb)
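
The recurring pattern in these examples: reserve nodes with oarsub, wait for the job to start, use the nodes, then always release the reservation with oardel in a finally block. A minimal sketch, assuming only the execo_g5k API already imported by these projects ('mysite' is a placeholder):

from execo_g5k import (OarSubmission, oarsub, oardel,
                       wait_oar_job_start, get_oar_job_nodes)

jobs = oarsub([(OarSubmission(resources="nodes=1",
                              walltime="0:10:00"), "mysite")])
if jobs[0][0]:  # (job_id, site) pair; job_id is falsy if the submission failed
    try:
        wait_oar_job_start(*jobs[0])
        nodes = get_oar_job_nodes(*jobs[0])
        # ... run the experiment on the reserved nodes ...
    finally:
        oardel(jobs)  # release the reservation whatever happened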
Example #2
 def worker_start(self, cluster, site, oarsubmission, data, worker_index):
     th = current_thread()
     th.cluster = cluster
     th.site = site
     th.worker_index = worker_index
     th.jobid = None
     try:
         with th.oarsublock:
             if th.willterminate:
                 return
             worker_log.detail("submit oar job")
             ((th.jobid, _),) = oarsub([(oarsubmission, site)])
         if not th.jobid:
             worker_log.detail("job submission failed")
             self.worker(cluster, site, data, None, worker_index, oarsubmission, None)
         worker_log.detail("job submitted - wait job start")
         wait_oar_job_start(th.jobid, site,
                            prediction_callback = lambda ts:
                                worker_log.detail("job start prediction: %s" % (format_date(ts),)))
         th.waiting = False
         worker_log.detail("job started - get job nodes")
         nodes = get_oar_job_nodes(th.jobid, site)
         worker_log.detail("got %i nodes" % (len(nodes),))
         self.worker(cluster, site, data, nodes, worker_index, oarsubmission, th.jobid)
     finally:
         with th.oarsublock:
             if th.jobid:
                 worker_log.detail("delete oar job")
                 oardel([(th.jobid, site)])
                 th.jobid = None
         worker_log.detail("exit")
Example #3
 def worker_start(self, cluster, site, oarsubmission, data, worker_index):
     th = current_thread()
     th.cluster = cluster
     th.site = site
     th.worker_index = worker_index
     th.jobid = None
     try:
         with th.oarsublock:
             if th.willterminate:
                 return
             worker_log.detail("submit oar job")
             ((th.jobid, _), ) = oarsub([(oarsubmission, site)])
         if not th.jobid:
             worker_log.detail("job submission failed")
             self.worker(cluster, site, data, None, worker_index,
                         oarsubmission, None)
         worker_log.detail("job submitted - wait job start")
         wait_oar_job_start(th.jobid, site,
                            prediction_callback=lambda ts:
                                worker_log.detail("job start prediction: %s"
                                                  % (format_date(ts),)))
         th.waiting = False
         worker_log.detail("job started - get job nodes")
         nodes = get_oar_job_nodes(th.jobid, site)
         worker_log.detail("got %i nodes" % (len(nodes), ))
         self.worker(cluster, site, data, nodes, worker_index,
                     oarsubmission, th.jobid)
     finally:
         with th.oarsublock:
             if th.jobid:
                 worker_log.detail("delete oar job")
                 oardel([(th.jobid, site)])
                 th.jobid = None
         worker_log.detail("exit")
Example #4
def signal_handler(signal, frame):
    global interrupted, workers, jobid, site
    if interrupted:
        print('\n Releasing nodes')
        execo_g5k.oardel([(jobid, site)])
        sys.exit(1)
    else:
        print('\n  Press Ctrl+C again to exit')
        interrupted = True
        if workers is not None:
            workers.kill()
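
For the handler above to take effect it must be registered for SIGINT; a minimal sketch using the standard signal module (the first Ctrl+C kills the workers, the second releases the OAR job):

import signal
signal.signal(signal.SIGINT, signal_handler)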
Example #5
    def run(self):
        logger.debug("Parse and convert configs for G5K provisioner")
        self.configs = parse_config_file(self.args.config_file_path)
        kube_master_site = self.create_configs()

        logger.info("""Your topology:  %s""" %
                    self.configs["exp_env"]["antidote_clusters"])

        # Logarithmic scale interval of latency
        if self.configs["parameters"][
                "latency_interval"] == "logarithmic scale":
            start, end = self.configs["parameters"]["latency"]
            latency = [start, end]
            log_start = int(math.ceil(math.log(start)))
            log_end = int(math.ceil(math.log(end)))
            for i in range(log_start, log_end):
                latency.append(int(math.exp(i)))
                latency.append(int(math.exp(i + 0.5)))
            del self.configs["parameters"]["latency_interval"]
            self.configs["parameters"]["latency"] = list(set(latency))
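            # Worked illustration (hypothetical values): with latency = [5, 100],
            # log_start = ceil(log(5)) = 2 and log_end = ceil(log(100)) = 5, so the
            # loop adds exp(2)~7, exp(2.5)~12, exp(3)~20, exp(3.5)~33, exp(4)~54,
            # exp(4.5)~90, giving the sweep {5, 7, 12, 20, 33, 54, 90, 100}.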

        if self.configs["parameters"]["benchmarks"] == "performance":
            self.configs["parameters"]["n_nodes_run_per_dc"] = list(
                range(
                    1, self.configs["exp_env"]["antidote_clusters"][0]
                    ["n_antidotedb_per_dc"] + 1))

        sweeper = create_combs_queue(
            result_dir=self.configs["exp_env"]["results_dir"],
            parameters=self.configs["parameters"],
        )
        kube_namespace = "elmerfs-exp"
        oar_job_ids = None
        while len(sweeper.get_remaining()) > 0:
            if oar_job_ids is None:
                oar_job_ids, kube_master, elmerfs_hosts = self.setup_env(
                    kube_master_site, kube_namespace)

            comb = sweeper.get_next()
            sweeper = self.run_workflow(
                elmerfs_hosts=elmerfs_hosts,
                kube_master=kube_master,
                kube_namespace=kube_namespace,
                comb=comb,
                sweeper=sweeper,
            )

            if not is_job_alive(oar_job_ids):
                oardel(oar_job_ids)
                oar_job_ids = None
        logger.info("Finish the experiment!!!")
Example #6
def get_cpu_topology(cluster, xpdir=None):
    """ """
    logger.info('Determining the architecture of cluster ' + \
                style.emph(cluster))
    root = None
    # Trying to read topology from a directory
    if xpdir:
        fname = xpdir + '/topo_' + cluster + '.xml'
        try:
            tree = parse(fname)
            root = tree.getroot()
        except:
            logger.info('No cache file found, will reserve a node and ' + \
                        'determine topology from virsh capabilities')
            pass

    if root is None:
        frontend = get_cluster_site(cluster)
        submission = OarSubmission(resources="{cluster='" + cluster +
                                   "'}/nodes=1",
                                   walltime="0:02:00",
                                   job_type="allow_classic_ssh")
        ((job_id, _), ) = oarsub([(submission, frontend)])
        wait_oar_job_start(job_id, frontend)
        host = get_oar_job_nodes(job_id, frontend)[0]
        capa = SshProcess('virsh capabilities',
                          host,
                          connection_params={
                              'user':
                              default_frontend_connection_params['user']
                          }).run()
        oardel([(job_id, frontend)])
        root = fromstring(capa.stdout)
        if xpdir is not None:
            tree = ElementTree(root)
            tree.write(fname)

    cpu_topology = []
    i_cell = 0
    for cell in root.findall('.//cell'):
        cpu_topology.append([])
        for cpu in cell.findall('.//cpu'):
            cpu_topology[i_cell].append(int(cpu.attrib['id']))
        i_cell += 1
    logger.info(pformat(cpu_topology))
    return cpu_topology
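
A usage note for the helper above: it returns one list of CPU ids per NUMA cell, so on a hypothetical two-cell node the result could look like [[0, 2, 4, 6], [1, 3, 5, 7]]:

topo = get_cpu_topology('mycluster', xpdir='/tmp/xp')  # placeholder arguments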
Example #7
    def run(self):
        """
            Main engine method to perform the experiment
        """
        self.define_parameters()

        while len(self.sweeper.get_remaining()) > 0:
            # Getting the next combination
            comb = self.sweeper.get_next()
            logger.info(style.host(slugify(comb)) + ' has been started')
            self.get_nodes(comb)

            # If the job is broken, the program is stopped
            if get_oar_job_info(self.oar_job_id,
                                self.frontend)['state'] == 'Error':
                break

            try:
                self.workflow(comb)

                # Process all combinations that can use the same submission
                while True:
                    # Find the next combination that can use the same submission
                    subcomb = self.sweeper.get_next(
                        lambda r: filter(lambda x:
                                         x['cores'] == comb['cores'] and
                                         x['cluster'] == comb['cluster'],
                                         r))

                    if not subcomb:
                        logger.info(
                            'No more combination for cluster=%s and cores=%s',
                            comb['cluster'], comb['cores'])
                        break
                    else:
                        logger.info(
                            style.host(slugify(subcomb)) + ' has been started')

                        if get_oar_job_info(self.oar_job_id,
                                            self.frontend)['state'] != 'Error':
                            self.workflow(subcomb)
                        else:
                            break

            # Whatever happens (errors, end of loop), the job is deleted
            finally:
                logger.info('Deleting job...')
                oardel([(self.oar_job_id, self.frontend)])
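
The filter passed to sweeper.get_next above can also be written as a list comprehension, which keeps the same semantics under Python 3, where filter returns a lazy iterator; a sketch under that assumption:

subcomb = self.sweeper.get_next(
    lambda r: [x for x in r
               if x['cores'] == comb['cores']
               and x['cluster'] == comb['cluster']])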
Example #8
    def run(self):
        logger.debug('Parse and convert configs for G5K provisioner')
        self.configs = parse_config_file(self.args.config_file_path)

        # Add the number of Antidote DC as a parameter
        self.configs['parameters']['n_dc'] = len(
            self.configs['exp_env']['clusters'])

        logger.debug('Normalize the parameter space')
        self.normalized_parameters = define_parameters(
            self.configs['parameters'])

        logger.debug('Normalize the given configs')
        kube_master_site = self.create_configs()

        logger.info('''Your largest topology:
                        Antidote DCs: %s
                        n_antidotedb_per_DC: %s  ''' %
                    (len(self.configs['exp_env']['clusters']),
                     max(self.normalized_parameters['n_nodes_per_dc'])))

        sweeper = create_combs_queue(
            result_dir=self.configs['exp_env']['results_dir'],
            parameters=self.configs['parameters'],
        )
        kube_namespace = 'elmerfs-exp'
        oar_job_ids = None
        while len(sweeper.get_remaining()) > 0:
            if oar_job_ids is None:
                oar_job_ids, kube_master = self.setup_env(
                    kube_master_site, kube_namespace)

            comb = sweeper.get_next()
            sweeper = self.run_workflow(
                kube_master=kube_master,
                kube_namespace=kube_namespace,
                comb=comb,
                sweeper=sweeper,
            )

            if not is_job_alive(oar_job_ids):
                oardel(oar_job_ids)
                oar_job_ids = None
        logger.info('Finish the experiment!!!')
Example #9
def get_cpu_topology(cluster, xpdir=None):
    """ """
    logger.info('Determining the architecture of cluster ' + \
                style.emph(cluster))
    root = None
    # Trying to read topology from a directory
    if xpdir:
        fname = xpdir + '/topo_' + cluster + '.xml'
        try:
            tree = parse(fname)
            root = tree.getroot()
        except:
            logger.info('No cache file found, will reserve a node and ' + \
                        'determine topology from virsh capabilities')
            pass

    if root is None:
        frontend = get_cluster_site(cluster)
        submission = OarSubmission(
            resources="{cluster='" + cluster + "'}/nodes=1",
            walltime="0:02:00",
            job_type="allow_classic_ssh")
        ((job_id, _), ) = oarsub([(submission, frontend)])
        wait_oar_job_start(job_id, frontend)
        host = get_oar_job_nodes(job_id, frontend)[0]
        capa = SshProcess('virsh capabilities', host,
            connection_params={'user': default_frontend_connection_params['user']}
            ).run()
        oardel([(job_id, frontend)])
        root = fromstring(capa.stdout)
        if xpdir is not None:
            tree = ElementTree(root)
            tree.write(fname)

    cpu_topology = []
    i_cell = 0
    for cell in root.findall('.//cell'):
        cpu_topology.append([])
        for cpu in cell.findall('.//cpu'):
            cpu_topology[i_cell].append(int(cpu.attrib['id']))
        i_cell += 1
    logger.info(pformat(cpu_topology))
    return cpu_topology
Example #10
 def prepare_bench(self):
     """bench configuration and compilation, copy binaries to frontends
     
     return True if preparation is ok
     """
     logger.info("preparation: configure and compile benchmark")
     # the involved sites. We will do the compilation on the first of these.
     sites = list(set(map(get_cluster_site, self.parameters['cluster'])))
     # generate the bench compilation configuration
     bench_list = '\n'.join([ 'lu\t%s\t%s' % (size, n_core)
                              for n_core in self.parameters['n_core']
                              for size in self.parameters['size'] ])
     # Reserving a node because compiling on the frontend is forbidden
     # and because we need mpif77
     jobs = oarsub([(OarSubmission(resources = "nodes=1",
                                   job_type = 'allow_classic_ssh',
                                   walltime ='0:10:00'), sites[0])])
     if jobs[0][0]:
         try:
             logger.info("copying bench archive to %s" % (sites[0],))
             copy_bench = Put([sites[0]], ['NPB3.3-MPI.tar.bz2']).run()
             logger.info("extracting bench archive on %s" % (sites[0],))
             extract_bench = Remote('tar -xjf NPB3.3-MPI.tar.bz2', [sites[0]]).run()
             logger.info("waiting job start %s" % (jobs[0],))
             wait_oar_job_start(*jobs[0], prediction_callback = pred_cb)
             logger.info("getting nodes of %s" % (jobs[0],))
             nodes = get_oar_job_nodes(*jobs[0])
             logger.info("configure bench compilation")
             conf_bench = Remote('echo "%s" > ~/NPB3.3-MPI/config/suite.def' % bench_list, nodes).run()
             logger.info("compil bench")
             compilation = Remote('cd NPB3.3-MPI && make clean && make suite', nodes).run()
             logger.info("compil finished")
         except:
             logger.error("unable to compile bench")
             return False
         finally:
             oardel(jobs)
     # Copying binaries to all other frontends
     frontends = sites[1:]
     rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI', 
                    [get_host_site(nodes[0])] * len(frontends)) 
     rsync.run()
     return compilation.ok and rsync.ok
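
The {{frontends}} in the rsync command above is an execo substitution: the expression in braces is evaluated and indexed by each host's position in the Remote's host list. A minimal sketch of the mechanism (hypothetical host and target names):

from execo import Remote
targets = ['frontend-a', 'frontend-b']
hosts = ['node-1', 'node-2']
# host i runs the command with {{targets}} replaced by targets[i]
r = Remote('echo syncing to {{targets}}', hosts).run()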
Example #11
    def run(self):
        """
            Main engine method to perform the experiment
        """
        self.define_parameters()
        
        while len(self.sweeper.get_remaining()) > 0:
            # Getting the next combination
            comb = self.sweeper.get_next()
            logger.info(style.host(slugify(comb)) + ' has been started')
            self.get_nodes(comb)

            # If the job is broken, the program is stopped
            if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': 
                break

            try:
                self.workflow(comb)

                # Process all combinations that can use the same submission
                while True:
                    # Find the next combination that can use the same submission
                    subcomb = self.sweeper.get_next(lambda r: 
                        filter(lambda x: x['cores'] == comb['cores']
                                        and x['cluster'] == comb['cluster'], r))

                    if not subcomb: 
                        logger.info('No more combination for cluster=%s and cores=%s',
                            comb['cluster'], comb['cores'])
                        break
                    else:
                        logger.info(style.host(slugify(subcomb)) + ' has been started')

                        if get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error':
                            self.workflow(subcomb)
                        else:
                            break
            
            # Whatever happens (errors, end of loop), the job is deleted
            finally:
                logger.info('Deleting job...')
                oardel([(self.oar_job_id, self.frontend)])
Example #12
    def tear_down(self):
        # Destroy the Rally deployment
        try:
            if self.rally_deployed:
                logger.info("Destroying Rally deployment " +
                            self.config['deployment-name'])
                self._run_or_abort(
                    'rally deployment destroy %s' %
                    self.config['deployment-name'], self.host,
                    'Could not destroy the Rally deployment. This will likely '
                    'cause errors when the node is used again.', False,
                    {'user': '******'})
        except AttributeError:
            pass  # self.host has not been defined yet, and that's ok

        # Kill the job
        try:
            if not self.options.keep_alive and self.job_id:
                logger.info("Killing job " + str(self.job_id))
                EX5.oardel([(self.job_id, self.site)])
        except AttributeError:
            pass  # self.job_id has not been defined either, and that's ok too
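
The AttributeError guards above can equivalently use getattr; a sketch of the same job cleanup (hypothetical variant, same names as above):

if not self.options.keep_alive and getattr(self, 'job_id', None):
    logger.info("Killing job " + str(self.job_id))
    EX5.oardel([(self.job_id, self.site)])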
Example #13
def launch_bench(oarsubmission, site, folder):
        """Copy required files on frontend(s) and compile bench suite."""
        logger.info("Reserving a node.")
        jobs = oarsub([(oarsubmission, site)])
        (job_id, site) = jobs[0]
        logger.info(jobs)
        if job_id:
            try:
                logger.info("Node reserved.")
                wait_oar_job_start(job_id, site)
                logger.info("Deploying environment.")
                node = deploy_node(job_id, site, oarsubmission)
                logger.info("Compiling Bots.")
                setup_node(node)
                logger.info("Starting benchs.")
                run_bench(folder, node)
            except:
                logger.error("Unable to deploy & compile Bench.")
                oardel(jobs)
                return False
        logger.info("Benchs completed. Deleting jobs.")
        oardel(jobs)
        return True
Example #14
    def run(self):
        logger.debug('Parse and convert configs for G5K provisioner')
        self.configs = parse_config_file(self.args.config_file_path)
        kube_master_site = self.create_configs()

        logger.info('''Your topology:
                        Antidote DCs: %s
                        n_antidotedb_per_DC: %s
                        n_fmke_per_DC: %s
                        n_fmke_client_per_DC: %s ''' % (
            len(self.configs['exp_env']['clusters']),
            self.configs['exp_env']['n_antidotedb_per_dc'],
            self.configs['exp_env']['n_fmke_app_per_dc'],
            self.configs['exp_env']['n_fmke_client_per_dc'])
        )

        logger.debug('Creating the combination list')
        sweeper = create_combs_queue(result_dir=self.configs['exp_env']['results_dir'],
                                     parameters=self.configs['parameters'])

        kube_namespace = 'fmke-exp'
        oar_job_ids = None
        while len(sweeper.get_remaining()) > 0:
            if oar_job_ids is None:
                kube_master, oar_job_ids = self.setup_env(kube_master_site, kube_namespace)

            comb = sweeper.get_next()
            sweeper = self.run_workflow(kube_namespace=kube_namespace,
                                        kube_master=kube_master,
                                        comb=comb,
                                        sweeper=sweeper)

            if not is_job_alive(oar_job_ids):
                oardel(oar_job_ids)
                oar_job_ids = None
        logger.info('Finish the experiment!!!')
Example #15
    def run(self):
        # Go to the result folder before everything
        os.chdir(self.result_dir)

        jobs = [(_jobID, _site)]
        # Get nodes
        nodes = get_oar_job_nodes(_jobID, _site)

        try:
            logger.info("Creating hostfiles for all combinations...")
            for nbr_node in _nbrNodes:
                hostfile_filename = self.result_dir + '/' + 'hostfile-' + nbr_node
                with open(hostfile_filename, 'w') as hostfile:
                    for node in nodes[:int(nbr_node)]:
                        hostfile.write(node.address + '\n')

            spack_process = Process('spack install -v chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt')            
            spack_process.start()
            spack_process.wait()
            spack_process.kill()

        finally:
            logger.info("Delete job: {}".format(jobs))
            oardel(jobs)
Example #16
    def run(self):
        sweeper = self.create_paramsweeper()

        while True:
            comb = sweeper.get_next()
            if not comb:
                break
            comb_dir = self.result_dir + '/' + slugify(comb)
            if not os.path.isdir(comb_dir):
                os.mkdir(comb_dir)
            comb_file = comb_dir + '/trace'
            g5k_configuration['kadeploy3'] = comb['version']
            logger.info('Treating combination %s', pformat(comb))
            get_version = SshProcess(
                comb['version'] + ' -v',
                comb['site'],
                connection_params=default_frontend_connection_params).run()
            logger.info(get_version.stdout)

            resources = ""
            if comb['kavlan']:
                resources += "{type='kavlan'}/vlan=1+"
            resources += "nodes=" + str(comb['n_nodes'])
            sub = OarSubmission(resources=resources,
                                job_type='deploy',
                                walltime="0:30:00",
                                name='Kadeploy_Tests')
            logger.info('Performing submission of %s on site %s', resources,
                        comb['site'])
            jobs = oarsub([(sub, comb['site'])])

            if jobs[0][0]:
                try:
                    logger.info('Waiting for job to start')
                    wait_oar_job_start(jobs[0][0], jobs[0][1])
                    hosts = get_oar_job_nodes(jobs[0][0], jobs[0][1])
                    logger.info('Deployment of %s',
                                ' '.join([host.address for host in hosts]))
                    kavlan = get_oar_job_kavlan(jobs[0][0], jobs[0][1])
                    if kavlan:
                        logger.info('In kavlan %s', kavlan)
                    deployment = Deployment(hosts,
                                            env_name=comb['env'],
                                            vlan=kavlan)
                    deployed, undeployed = deploy(deployment,
                                                  stdout_handlers=[comb_file],
                                                  stderr_handlers=[comb_file])

                finally:
                    logger.info('Destroying job %s on %s', str(jobs[0][0]),
                                jobs[0][1])
                    oardel([(jobs[0][0], jobs[0][1])])
            else:
                deployed, undeployed = [], []  # failed submission: nothing deployed

            if len(undeployed) == 0:
                logger.info('%s is OK', slugify(comb))
            elif len(deployed) == 0:
                logger.error('%s is KO', slugify(comb))
            else:
                logger.warning('%s encountered problems with some hosts',
                               slugify(comb))

            sweeper.done(comb)
Example #17
    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        self.cluster = self.args[0]
        self.site = get_cluster_site(self.cluster)
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
                # Hosts deployment
                deployed, undeployed = deploy(
                    Deployment(self.hosts,
                               env_file="/home/mliroz/deploys/hadoop6.env"))
                logger.info("%i deployed, %i undeployed" %
                            (len(deployed), len(undeployed)))
                if len(deployed) == 0:
                    break
                # System configuration => look at the execo_g5k.topology module
                attr = get_host_attributes(self.cluster + '-1')

                ## SETUP FINISHED

                # Getting the next combination
                comb = self.sweeper.get_next()
                self.prepare_dataset(comb)
                self.xp(comb)
                # subloop over the combinations that have the same sizes
                while True:
                    newcomb = self.sweeper.get_next(lambda r: filter(
                        lambda subcomb: subcomb['sizes'] == comb['sizes'], r))
                    if newcomb:
                        try:
                            self.xp(newcomb)
                        except:
                            break
                    else:
                        break

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
Example #18
        execo.sleep(1)
        print 'sending command: ' + line
        workers = execo.Remote(line, cores).start()


app = App()

if jobid:
    try:
        print 'Waiting for job to start'
        execo_g5k.wait_oar_job_start(jobid, site)
        print 'Retrieving nodes'
        nodes = execo_g5k.get_oar_job_nodes(jobid, site)
        # Setup nodes
        print 'Preparing workers with cmd: ' + setup_cmd
        workers = execo.Remote(setup_cmd, nodes).start()
        workers.expect('Worker Setup Completed')
        workers.kill()
        # Possibly open more than one connection per machine
        cores = nodes * args.nb_cores
        print cores
        print 'Example cmd: %s' % (workers_cmd)
        app.prompt = '%s (%d node(s), %d core(s)/node)> ' % (
            site, args.volunteers, args.nb_cores)
        app.cmdloop()
        # execo.sleep(600)
        # print 'Workers done'

    finally:
        execo_g5k.oardel([(jobid, site)])
Example #19
    def run(self):
        # Go to the result folder before everything
        os.chdir(self.result_dir)

        # OARSUB
        jobs = oarsub([(OarSubmission(resources='nodes=' + _nbrNodes.__str__(), 
                                      job_type='deploy', 
                                      walltime=_walltime, 
                                      sql_properties=_properties), _site)])
        
        job_id, site = jobs[0]
        try:
            # KADEPLOY
            logger.info("Waiting job start %s on %s" % (job_id, site))
            wait_oar_job_start(job_id, site, prediction_callback=prediction_callback)
            logger.info("getting nodes of %s on %s" % (job_id, site))
            nodes = get_oar_job_nodes(job_id, site)

            deployed, undeployed = deploy(Deployment(nodes, env_name=env_name),
                                          check_deployed_command=already_configured)
            if undeployed:
                logger.warn(
                    "NOT deployed nodes : {}".format(str(undeployed)))
                raise RuntimeError('Deployment failed')

            # STARPU INSTALLATION
            spack_spec = 'chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt'
            spack_command = 'spack install -v' + ' ' + spack_spec 

            logger.info("Starting StarPU installation...")
            spack_process = Process(spack_command).start()
            spack_process.wait()

            logger.info("StarPU installation DONE...")
            self.checkProcess(spack_process)
            spack_process.kill()

            # STARPU DIRECTORY
            logger.info("Searching and going to StarPU installation directory...")

            # assumption: 'spack location -i <spec>' prints the spec's install prefix
            starpu_location_process = Process('spack location -i ' + spack_spec).start()
            starpu_location_process.wait()
            self.checkProcess(starpu_location_process)

            starpu_cd_process = Process('cd ' + starpu_location_process.stdout + '/lib/chameleon').start()
            starpu_cd_process.wait()
            self.checkProcess(starpu_cd_process)

            starpu_location_process.kill()
            starpu_cd_process.kill()

            # RUNNING EXPERIMENT
            logger.info("Starting StarPU experiment...")
            starpu_experiment_process = Process(""" export STARPU_WORKER_STATS=1
                                                    export STARPU_CALIBRATE=2
                                                    ./timing/time_spotrf_tile --warmup --gpus=3 --threads=9 --nb=960 --ib=96 --n_range=48000:48000:9600 """)
            starpu_experiment_process.stdout_handlers.append(self.result_dir + '/' + 'StarPU.out') # create output file for StarPU    
            starpu_experiment_process.start()
            starpu_experiment_process.wait()

            logger.info("StarPU experiment DONE...")
            self.checkProcess(starpu_experiment_process)        
            starpu_experiment_process.kill()

        finally:
	        logger.info("Delete job : {}".format(jobs))
            oardel(jobs)
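
Note that execo's Process runs a local subprocess while SshProcess runs the command on a remote host, so the spack invocations above execute on the machine running the engine. A minimal contrast (hypothetical host name):

from execo import Process, SshProcess
local = Process('hostname').run()                 # runs where the engine runs
remote = SshProcess('hostname', 'node-1').run()   # runs on node-1 over ssh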
Example #20
def oar_destroy_from_id(oarjob, site):
    """Destroy the job."""
    if oarjob is not None and site is not None:
        oarjob = int(oarjob)
        ex5.oardel([[oarjob, site]])
        logger.info("Killing the job %s" % oarjob)
Example #21
    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        self.cluster = self.args[0]
        self.site = get_cluster_site(self.cluster)
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
                # Hosts deployment
                deployed, undeployed = deploy(Deployment(self.hosts, 
                    env_file="/home/mliroz/deploys/hadoop6.env"))
                logger.info("%i deployed, %i undeployed" % (len(deployed), 
                                                            len(undeployed)))
                if len(deployed) == 0:
                    break
                # System configuration => look at the execo_g5k.topology module
                attr = get_host_attributes(self.cluster + '-1')
                
                ## SETUP FINISHED
                
                # Getting the next combination
                comb = self.sweeper.get_next()
                self.prepare_dataset(comb)
                self.xp(comb)
                # subloop over the combinations that have the same sizes
                while True:
                    newcomb = self.sweeper.get_next(lambda r:
                            filter(lambda subcomb: subcomb['sizes'] == comb['sizes'], r))
                    if newcomb:
                        try:
                            self.xp(newcomb)
                        except:
                            break
                    else:
                        break

                if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging') 
Example #22
 def run(self):
     num_total_workers = 0
     sites_clusters_threads = {}  # dict: keys = sites, values =
     # dict: keys = clusters, values =
     # list: threads
     try:
         while True:
             t = Timer()
             clusters_to_submit = set()
             for clusterspec in self.get_clusters():
                 cluster, _, site = clusterspec.partition(".")
                 if site == "":
                     site = get_cluster_site(cluster)
                 clusters_to_submit.add((cluster, site))
             for site in sites_clusters_threads.keys():
                 for cluster in sites_clusters_threads[site].keys():
                     sites_clusters_threads[site][cluster] = [
                         th for th in sites_clusters_threads[site][cluster]
                         if th.is_alive()
                     ]
                     if len(sites_clusters_threads[site][cluster]) == 0:
                         del sites_clusters_threads[site][cluster]
                 if len(sites_clusters_threads[site]) == 0:
                     del sites_clusters_threads[site]
             all_involved_sites = set(sites_clusters_threads.keys())
             all_involved_sites.update([s for (c, s) in clusters_to_submit])
             no_submissions = True
             for site in all_involved_sites:
                 all_involved_clusters = set()
                 if sites_clusters_threads.has_key(site):
                     all_involved_clusters.update(
                         sites_clusters_threads[site].keys())
                 all_involved_clusters.update(
                     [c for (c, s) in clusters_to_submit if s == site])
                 for cluster in all_involved_clusters:
                     num_workers = 0
                     num_waiting = 0
                     if sites_clusters_threads.has_key(
                             site) and sites_clusters_threads[site].has_key(
                                 cluster):
                         num_workers = len(
                             sites_clusters_threads[site][cluster])
                         num_waiting = len([
                             th
                             for th in sites_clusters_threads[site][cluster]
                             if th.waiting
                         ])
                     num_max_new_workers = min(
                         self.options.max_workers - num_workers,
                         self.options.max_waiting - num_waiting)
                     logger.trace(
                         "rescheduling on cluster %s@%s: num_workers = %s / num_waiting = %s / num_max_new_workers = %s"
                         % (cluster, site, num_workers, num_waiting,
                            num_max_new_workers))
                     if num_max_new_workers > 0:
                         for worker_index in range(0, num_max_new_workers):
                             jobdata = self.get_job(cluster)
                             if not jobdata:
                                 break
                             no_submissions = False
                             logger.detail(
                                 "spawning worker %i on %s@%s" %
                                 (num_total_workers, cluster, site))
                             (oarsubmission, data) = jobdata
                             th = Thread(target=self.worker_start,
                                         args=(
                                             cluster,
                                             site,
                                             oarsubmission,
                                             data,
                                             num_total_workers,
                                         ))
                             th.waiting = True
                             th.daemon = True
                             th.oarsublock = Lock()
                             th.willterminate = False
                             th.start()
                             num_total_workers += 1
                             if not sites_clusters_threads.has_key(site):
                                 sites_clusters_threads[site] = {}
                             if not sites_clusters_threads[site].has_key(
                                     cluster):
                                 sites_clusters_threads[site][cluster] = []
                             sites_clusters_threads[site][cluster].append(
                                 th)
             if no_submissions and len(sites_clusters_threads) == 0:
                 break
             sleep(self.options.schedule_delay)
         logger.detail(
             "no more combinations to explore. exit schedule loop")
     finally:
         for site in sites_clusters_threads.keys():
             for cluster in sites_clusters_threads[site].keys():
                 for th in sites_clusters_threads[site][cluster]:
                     with th.oarsublock:
                         th.willterminate = True
                         if th.jobid:
                             logger.detail(
                                 "cleaning: delete job %i of worker #%i on %s"
                                 % (th.jobid, th.worker_index, site))
                             oardel([(th.jobid, site)])
                             th.jobid = None
Example #23
    def run(self):
        sweeper = self.create_paramsweeper()

        while True:
            comb = sweeper.get_next()
            if not comb:
                break
            comb_dir = self.result_dir + '/' + slugify(comb)
            if not os.path.isdir(comb_dir):
                os.mkdir(comb_dir)
            comb_file = comb_dir + '/trace'
            g5k_configuration['kadeploy3'] = comb['version']
            logger.info('Treating combination %s', pformat(comb))
            get_version = SshProcess(comb['version'] + ' -v',
                                     comb['site'],
                                     connection_params=default_frontend_connection_params).run()
            logger.info(get_version.stdout)

            resources = ""
            if comb['kavlan']:
                resources += "{type='kavlan'}/vlan=1+"
            resources += "nodes=" + str(comb['n_nodes'])
            sub = OarSubmission(resources=resources,
                                job_type='deploy',
                                walltime="0:30:00",
                                name='Kadeploy_Tests')
            logger.info('Performing submission of %s on site %s',
                        resources, comb['site'])
            jobs = oarsub([(sub, comb['site'])])

            if jobs[0][0]:
                try:
                    logger.info('Waiting for job to start')
                    wait_oar_job_start(jobs[0][0], jobs[0][1])
                    hosts = get_oar_job_nodes(jobs[0][0], jobs[0][1])
                    logger.info('Deployment of %s',
                                ' '.join([host.address for host in hosts]))
                    kavlan = get_oar_job_kavlan(jobs[0][0], jobs[0][1])
                    if kavlan:
                        logger.info('In kavlan %s', kavlan)
                    deployment = Deployment(hosts, env_name=comb['env'],
                                            vlan=kavlan)
                    deployed, undeployed = deploy(deployment,
                                                  stdout_handlers=[comb_file],
                                                  stderr_handlers=[comb_file])

                finally:
                    logger.info('Destroying job %s on %s', str(jobs[0][0]),
                                jobs[0][1])
                    oardel([(jobs[0][0], jobs[0][1])])
            else:
                deployed, undeployed = [], []  # failed submission: nothing deployed

            if len(undeployed) == 0:
                logger.info('%s is OK', slugify(comb))
            elif len(deployed) == 0:
                logger.error('%s is KO', slugify(comb))
            else:
                logger.warning('%s encountered problems with some hosts',
                               slugify(comb))

            sweeper.done(comb)
Example #24
 def delete_job(self):
     EX5.oardel([self.gridjob])
Example #25
 def run(self):
     num_total_workers = 0
     sites_clusters_threads = {} # dict: keys = sites, values =
                                 # dict: keys = clusters, values =
                                 # list: threads
     try:
         while True:
             t = Timer()
             clusters_to_submit = set()
             for clusterspec in self.get_clusters():
                 cluster, _, site = clusterspec.partition(".")
                 if site == "":
                     site = get_cluster_site(cluster)
                 clusters_to_submit.add((cluster, site))
             for site in sites_clusters_threads.keys():
                 for cluster in sites_clusters_threads[site].keys():
                     sites_clusters_threads[site][cluster] = [
                         th
                         for th in sites_clusters_threads[site][cluster]
                         if th.is_alive() ]
                     if len(sites_clusters_threads[site][cluster]) == 0:
                         del sites_clusters_threads[site][cluster]
                 if len(sites_clusters_threads[site]) == 0:
                     del sites_clusters_threads[site]
             all_involved_sites = set(sites_clusters_threads.keys())
             all_involved_sites.update([ s for (c, s) in clusters_to_submit ])
             no_submissions = True
             for site in all_involved_sites:
                 all_involved_clusters = set()
                 if sites_clusters_threads.has_key(site):
                     all_involved_clusters.update(sites_clusters_threads[site].keys())
                 all_involved_clusters.update([ c for (c, s) in clusters_to_submit if s == site ])
                 for cluster in all_involved_clusters:
                     num_workers = 0
                     num_waiting = 0
                     if sites_clusters_threads.has_key(site) and sites_clusters_threads[site].has_key(cluster):
                         num_workers = len(sites_clusters_threads[site][cluster])
                         num_waiting = len([
                                 th
                                 for th in sites_clusters_threads[site][cluster]
                                 if th.waiting ])
                     num_max_new_workers = min(self.options.max_workers - num_workers,
                                               self.options.max_waiting - num_waiting)
                     logger.trace(
                         "rescheduling on cluster %s@%s: num_workers = %s / num_waiting = %s / num_max_new_workers = %s" %
                         (cluster, site, num_workers, num_waiting, num_max_new_workers))
                     if num_max_new_workers > 0:
                         for worker_index in range(0, num_max_new_workers):
                             jobdata = self.get_job(cluster)
                             if not jobdata:
                                 break
                             no_submissions = False
                             logger.detail(
                                 "spawning worker %i on %s@%s" % (
                                     num_total_workers,
                                     cluster, site))
                             (oarsubmission, data) = jobdata
                             th = Thread(target = self.worker_start,
                                         args = (cluster, site,
                                                 oarsubmission, data,
                                                 num_total_workers,))
                             th.waiting = True
                             th.daemon = True
                             th.oarsublock = Lock()
                             th.willterminate = False
                             th.start()
                             num_total_workers += 1
                             if not sites_clusters_threads.has_key(site):
                                 sites_clusters_threads[site] = {}
                             if not sites_clusters_threads[site].has_key(cluster):
                                 sites_clusters_threads[site][cluster] = []
                             sites_clusters_threads[site][cluster].append(th)
             if no_submissions and len(sites_clusters_threads) == 0:
                 break
             sleep(self.options.schedule_delay)
         logger.detail("no more combinations to explore. exit schedule loop")
     finally:
         for site in sites_clusters_threads.keys():
             for cluster in sites_clusters_threads[site].keys():
                 for th in sites_clusters_threads[site][cluster]:
                     with th.oarsublock:
                         th.willterminate = True
                         if th.jobid:
                             logger.detail("cleaning: delete job %i of worker #%i on %s" % (
                                     th.jobid, th.worker_index, site))
                             oardel([(th.jobid, site)])
                             th.jobid = None
Example #26
    def run(self):
        """The main experimental workflow, as described in
        ``Using the Execo toolkit to perform ...``
        """
        self.force_options()

        # The argument is a cluster
        self.cluster = self.args[0]
        self.frontend = get_cluster_site(self.cluster)
        # Analyzing options
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.create_paramsweeper()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.get_resources()
                # Hosts deployment and configuration
                if not self.options.no_hosts_setup:
                    self.setup_hosts()
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = list(self.hosts)
                available_ip_mac = list(self.ip_mac)
                threads = {}

                # Checking that the job is running and not in Error
                while self.is_job_alive()['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                # while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \
                #     or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.extend(tmp_threads[t]['hosts'])
                                available_ip_mac.extend(tmp_threads[t]['ip_mac'])
                                del threads[t]
                        sleep(5)
                        if self.is_job_alive()['state'] == 'Error':
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    used_hosts = available_hosts[0:self.options.n_nodes]
                    available_hosts = available_hosts[self.options.n_nodes:]

                    n_vm = self.comb_nvm(comb)
                    used_ip_mac = available_ip_mac[0:n_vm]
                    available_ip_mac = available_ip_mac[n_vm:]

                    t = Thread(target=self.workflow,
                               args=(comb, used_hosts, used_ip_mac))
                    threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                    logger.debug('Threads: %s', len(threads))
                    t.daemon = True
                    t.start()

                # if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                if self.is_job_alive()['state'] == 'Error':
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
Example #27
    def run(self):
        """ """
        if self.options.oargrid_job_id is not None:
            self.oar_job_id = self.options.oargrid_job_id
        else:
            self.oar_job_id = None

        self.list_of_clusters = [
            'parasilo', 'paravance', 'parapluie', 'paranoia'
        ]

        try:
            # Creation of the main iterator which is used for the first control loop.
            self.define_parameters()
            self.working_dir = '/data/jorouzaudcornabas_' + str(
                self.options.storage5k_job_id)

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.submit_all_available_best_effort(
                        self.list_of_clusters, self.options.walltime)
                    # self.make_reservation_local()
                # Wait that the job starts
                logger.info('Waiting for job ' + str(self.oar_job_id) +
                            ' to start')
                wait_oar_job_start(self.oar_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id)
                # Hosts deployment and configuration
                default_connection_params['user'] = '******'

                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')
                #===============================================================
                # deployment = Deployment(hosts = self.hosts,
                #             env_file='/home/sirimie/env/mywheezy-x64-base.env')
                # self.hosts, _ = deploy(deployment)
                #===============================================================
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = self.hosts

                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/logs'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                logger.info("Starting the thread " + str(self.is_job_alive()) +
                            " " + str(len(threads.keys())))
                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False

                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]
                    logger.info("Launching thread")
                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([self.oar_job_id])
                else:
                    logger.info('Keeping job alive for debugging')
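
    # The loop above relies on self.is_job_alive(), which is not shown in this
    # snippet. A minimal sketch, assuming the execo_g5k API (get_oar_job_info
    # returns a dict with a 'state' key); an assumption, not the original code:
    def is_job_alive(self):
        """Return True while the OAR job is neither in Error nor Terminated."""
        state = get_oar_job_info(self.oar_job_id)['state']
        return state not in ('Error', 'Terminated')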
    def run(self):
        """Run the experiment"""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check that the workload files exist (assumes the same NFS mount
        # point is present on both the remote and the local environment)
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {'workload_filename': workloads}
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # A previous run (resumed with -c result_dir) may have skipped some
        # combinations
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))

        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(str(
            self.sweeper.get_remaining())))

        if reservation_job_id is not None:
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(job_id,
                                   site,
                                   prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of nodes in the '
                                       'reservation ({}) does not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn("NOT deployed nodes: {}".format(
                        str(undeployed)))
                    raise RuntimeError('Deployment failed')

                if not already_configured:

                    # install OAR
                    install_cmd = "apt-get update; apt-get install -y "
                    node_packages = "oar-node"
                    logger.info("installing OAR nodes: {}".format(
                        str(nodes[1:])))
                    install_oar_nodes = Remote(
                        install_cmd + node_packages,
                        nodes[1:],
                        connection_params={'user': '******'})
                    install_oar_nodes.start()

                    server_packages = (
                        "oar-server oar-server-pgsql oar-user "
                        "oar-user-pgsql postgresql python3-pip "
                        "libjson-perl postgresql-server-dev-all")
                    install_oar_sched_cmd = """
                    mkdir -p /opt/oar_sched; \
                    cd /opt/oar_sched; \
                    git clone https://github.com/oar-team/oar3.git; \
                    cd oar3; \
                    git checkout dce942bebc2; \
                    pip3 install -e .; \
                    cd /usr/lib/oar/schedulers; \
                    ln -s /usr/local/bin/kamelot; \
                    pip3 install psycopg2
                    """
                    logger.info("installing OAR server node: {}".format(
                        str(nodes[0])))
                    install_master = SshProcess(
                        install_cmd + server_packages + ";" +
                        install_oar_sched_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    install_master.run()
                    install_oar_nodes.wait()

                    if not install_master.ok:
                        logger.error(Report([install_master]).to_string())

                    configure_oar_cmd = """
                    sed -i \
                        -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                        -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                        -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                        -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                        -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                        -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                        -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                        -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                        /etc/oar/oar.conf
                    """.format(result_dir=self.result_dir)
                    configure_oar = Remote(configure_oar_cmd,
                                           nodes,
                                           connection_params={'user': '******'})
                    configure_oar.run()
                    logger.info("OAR is configured on all nodes")

                    # Configure server
                    create_db = "oar-database --create --db-is-local"
                    config_oar_sched = (
                        "oarnotify --remove-queue default;"
                        "oarnotify --add-queue default,1,kamelot")
                    start_oar = "systemctl start oar-server.service"
                    logger.info("configuring OAR database: {}".format(
                        str(nodes[0])))
                    config_master = SshProcess(
                        create_db + ";" + config_oar_sched + ";" + start_oar,
                        nodes[0],
                        connection_params={'user': '******'})
                    config_master.run()

                    # propagate SSH keys
                    logger.info("configuring OAR SSH")
                    oar_key = "/tmp/.ssh"
                    Process('rm -rf ' + oar_key).run()
                    Process(
                        'scp -o BatchMode=yes -o PasswordAuthentication=no '
                        '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                        '-o ConnectTimeout=20 -rp -o User=root ' +
                        nodes[0].address + ":/var/lib/oar/.ssh"
                        ' ' + oar_key).run()
                    # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                    Put(nodes[1:], [oar_key],
                        "/var/lib/oar/",
                        connection_params={
                            'user': '******'
                        }).run()
                    add_resources_cmd = """
                    oarproperty -a cpu || true; \
                    oarproperty -a core || true; \
                    oarproperty -c -a host || true; \
                    oarproperty -a mem || true; \
                    """
                    for node in nodes[1:]:
                        add_resources_cmd += (
                            "oarnodesetting -a -h {node} -p host={node} "
                            "-p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n"
                        ).format(node=node.address)

                    add_resources = SshProcess(
                        add_resources_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    add_resources.run()

                    if add_resources.ok:
                        logger.info("oar is now configured!")
                    else:
                        raise RuntimeError(
                            "error in the OAR configuration: Abort!")

                # TODO: back up the OAR configuration

                # Do the replay
                logger.info('beginning the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(
                        combi['workload_filename'])
                    oar_replay = SshProcess(
                        script_path + "/oar_replay.py " +
                        combi['workload_filename'] + " " + self.result_dir +
                        " oar_gant_" + workload_file, nodes[0])
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")

            except Exception:
                traceback.print_exc()
                ipdb.set_trace()

            finally:
                if is_a_test:
                    ipdb.set_trace()
                if reservation_job_id is None:
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
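
# prediction_callback is passed to wait_oar_job_start above but is not defined
# in this snippet. A minimal sketch, assuming execo.time_utils.format_date
# (an assumption, not the original helper):
def prediction_callback(ts):
    """Log OAR's predicted start time while waiting for the job."""
    logger.info("job start prediction: %s" % (format_date(ts),))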
Ejemplo n.º 30
0
    def run(self):
        """The main experimental workflow, as described in
        ``Using the Execo toolkit to perform ...``
        """
        self.force_options()

        # The argument is a cluster
        self.cluster = self.args[0]
        self.frontend = get_cluster_site(self.cluster)
        # Analyzing options
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Create the main iterator used by the outer control loop.
            # You need a method called define_parameters that returns a dict
            # of parameter lists (one list of candidate values per parameter)
            self.create_paramsweeper()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.get_resources()
                # Hosts deployment and configuration
                if not self.options.no_hosts_setup:
                    self.setup_hosts()
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = list(self.hosts)
                available_ip_mac = list(self.ip_mac)
                threads = {}

                # Checking that the job is running and not in Error
                while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.extend(tmp_threads[t]['hosts'])
                                available_ip_mac.extend(
                                    tmp_threads[t]['ip_mac'])
                                del threads[t]
                        sleep(5)
                        if get_oar_job_info(self.oar_job_id,
                                            self.frontend)['state'] == 'Error':
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    used_hosts = available_hosts[0:self.options.n_nodes]
                    available_hosts = available_hosts[self.options.n_nodes:]

                    n_vm = self.comb_nvm(comb)
                    used_ip_mac = available_ip_mac[0:n_vm]
                    available_ip_mac = available_ip_mac[n_vm:]

                    t = Thread(target=self.workflow,
                               args=(comb, used_hosts, used_ip_mac))
                    threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                    logger.debug('Threads: %s', len(threads))
                    t.daemon = True
                    t.start()

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
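
    # create_paramsweeper() is called at the top of run() but not shown. A
    # minimal sketch, assuming define_parameters() returns a dict of parameter
    # lists (an assumption based on the comment above, not the original code):
    def create_paramsweeper(self):
        """Build the persistent sweeper over all parameter combinations."""
        parameters = self.define_parameters()
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of combinations: %s', len(sweeps))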
Ejemplo n.º 31
0
                                      job_name="cloudal_docker")
        provisioner.provisioning()
        hosts = provisioner.hosts
        self.oar_result = provisioner.oar_result
        logger.info("Provisioning nodes: DONE")

        logger.info("Starting configure Docker on nodes")
        self.config_host(hosts)
        logger.info("Configuring Docker on nodes: DONE")


if __name__ == "__main__":
    logger.info("Init engine in %s" % __file__)
    engine = config_docker_env_g5k()

    try:
        logger.info("Start engine in %s" % __file__)
        engine.start()
    except Exception:
        logger.error('Program terminated by the following exception:')
        traceback.print_exc()
    except KeyboardInterrupt:
        logger.info('Program terminated by keyboard interrupt.')

    if not engine.args.keep_alive:
        logger.info('Deleting reservation')
        oardel(engine.oar_result)
        logger.info('Reservation deleted')
    else:
        logger.info('Reserved nodes are kept alive for inspection purposes.')
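
# config_host() is provided by the engine subclass and is not shown here. A
# minimal sketch of what it might do, using execo's Remote and the Docker
# convenience installer (both the command and the root user are assumptions):
def config_host(hosts):
    """Install Docker on every provisioned node over SSH."""
    cmd = "which docker || (curl -fsSL https://get.docker.com | sh)"
    Remote(cmd, hosts, connection_params={'user': 'root'}).run()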
Ejemplo n.º 32
0
    def run(self):
        rtt_file = self.result_dir + "/rtt.csv"
        resolver = None
        client = 'tcpclient' if self.args.mode == 'tcp' else 'udpclient'
        try:
            logger.debug("Experiment ID: {}".format(self.exp_id))
            if self.multi_site():
                logger.info("Running in multi-site mode")
            if not self.multi_site():
                self.reserve_resources_singlejob()
                logger.debug("Waiting for OAR job to start...")
                g5k.wait_oar_job_start(*self.vmhosts_job)
                self.prepare_subnet()
                logger.debug("Prepared subnet")
            # Dependencies (besides the obvious ones):
            # - deploy_server depends on prepare_global_vlan
            # - prepare_server depends on deploy_server
            # - prepare_server depends on prepare_subnet
            # - prepare_vm depends on deploy_server
            if self.multi_site():
                self.reserve_global_vlan()
                logger.debug("Waiting for global VLAN job to start...")
                g5k.wait_oar_job_start(*self.globalvlan_job)
                self.prepare_global_vlan()
            self.log_experimental_conditions()
            logger.debug("Deploying VM hosts...")
            machines_deploy_process = self.start_deploy_vmhosts()
            logger.debug("Deploying server image...")
            server_deploy_process = self.start_deploy_server()
            machines_deploy_process.wait()
            logger.debug("Finishing deploying VM hosts...")
            self.finish_deploy_vmhosts(machines_deploy_process)
            logger.debug("Setting up VM hosts...")
            machines_setup_process = self.prepare_vmhosts()
            machines_setup_process.wait()
            logger.debug("VM hosts are setup.")
            server_deploy_process.wait()
            logger.debug("Finishing deploying server...")
            self.finish_deploy_server(server_deploy_process)
            logger.debug("Server is deployed.")
            self.vm_process = self.start_all_vm()
            # Ensure VM are killed when we exit
            with self.vm_process:
                server_setup_process = self.prepare_server()
                self.wait_until_vm_ready()
                vm_setup_process = self.prepare_vm()
                server_setup_process.wait()
                self.log_output(server_setup_process, "server_setup_process")
                if not server_setup_process.ok:
                    logger.error(
                        "Error while preparing server, please check logs for 'server_setup_process'"
                    )
                    raise Exception("server setup failed")
                logger.debug("Prepared server: {}".format(self.server.address))
                vm_setup_process.wait()
                self.log_output(vm_setup_process, "vm_setup_process")
                if not vm_setup_process.ok:
                    logger.error(
                        "Error while preparing VMs, please check logs for 'vm_setup_process'"
                    )
                    raise Exception("VM setup failed")
                logger.debug("Prepared VM")
                logger.info("Started {} VMs.".format(len(self.vm)))
                cpunetlog_vms = self.start_cpunetlog(self.vm)
                cpunetlog_server = self.start_cpunetlog(
                    [self.server], self.server_conn_params)
                resolver = self.start_dns_server()
                logger.info("Started resolver ({}) on {}.".format(
                    self.resolver_name, self.server.address))
                # Leave time for resolver to start
                if self.args.resolver_slots_per_thread < 1000000:
                    execo.sleep(15)
                else:
                    execo.sleep(60)
                logger.info("Starting {} on all VMs...".format(client))
                clients = self.start_client_vm()
                clients.wait()
                logger.info("{} finished!".format(client))
                logger.info("Writing cpunetlog output to disk.")
                cpunetlog_server.kill().wait()
                cpunetlog_vms.kill().wait()
                self.log_output(cpunetlog_server, "cpunetlog_server")
                self.log_output(cpunetlog_vms, "cpunetlog_vms")
                logger.info("writing {} results to disk.".format(client))
                self.log_output(clients, "clients", log_stdout=False)
                with open(rtt_file, 'w') as rtt_output:
                    need_header = True
                    rtt = csv.writer(rtt_output)
                    for client_id, client in enumerate(clients.processes):
                        first_line = True
                        for line in client.stdout.splitlines():
                            # Skip anything that does not look like CSV
                            if ',' not in line:
                                continue
                            if need_header:
                                # Take CSV header from first client and add a column
                                data = line.split(",")
                                data.insert(0, "vm_id")
                                rtt.writerow(data)
                                need_header = False
                                first_line = False
                            elif first_line:
                                # Skip first line of subsequent clients
                                first_line = False
                            else:
                                # Add column with VM ID
                                data = line.split(",")
                                data.insert(0, client_id)
                                rtt.writerow(data)

        except Exception as e:
            logger.error("Exception raised: {}\n{}".format(e, format_exc()))
        finally:
            #self.kill_all_vm()
            if self.vm_process:
                self.vm_process.kill()
            if resolver:
                resolver.kill()
                logger.debug("Waiting for resolver to exit")
                resolver.wait()
                self.log_output(resolver, "resolver")
            if self.vm_process:
                logger.debug("Waiting for VM to exit")
                self.vm_process.wait()
                logger.info("Resolver and all VMs are shut down")
                self.log_output(self.vm_process, "vm_process")
                print(execo.Report([self.vm_process]).to_string())
            #for s in self.vm_process.processes:
            #    print("\n%s\nstdout:\n%s\nstderr:\n%s\n" % (s, s.stdout, s.stderr))
            g5k.oardel([self.vmhosts_job])
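
# The CSV-merging loop above can be factored into a standalone helper; a
# minimal sketch of the same logic, for illustration only (merge_client_rtt is
# a hypothetical name, not part of the original code):
import csv

def merge_client_rtt(processes, rtt_file):
    """Merge per-VM client stdout into one CSV, prefixing rows with vm_id."""
    with open(rtt_file, 'w') as rtt_output:
        rtt = csv.writer(rtt_output)
        need_header = True
        for client_id, client in enumerate(processes):
            first_line = True
            for line in client.stdout.splitlines():
                if ',' not in line:
                    continue  # skip anything that does not look like CSV
                data = line.split(",")
                if need_header:
                    rtt.writerow(["vm_id"] + data)  # header from first client
                    need_header = False
                    first_line = False
                elif first_line:
                    first_line = False  # drop later clients' header rows
                else:
                    rtt.writerow([client_id] + data)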