Code Example #1
File: rally-g5k.py Project: asimonet/rally-g5k
    def get_host(self):
        """Returns the hosts from an existing reservation (if any), or from
		a new reservation"""

        # Look if there is a running job
        self.site = get_cluster_site(self.config['cluster'])
        jobs = EX5.get_current_oar_jobs([self.site])

        self.job_id = None
        for t in jobs:
            if EX5.get_oar_job_info(
                    t[0], self.site)['name'] == self.options.job_name:
                self.job_id = t[0]
                break

        if self.job_id:
            logger.info('Using job %s' % style.emph(self.job_id))
        else:
            logger.info('Making a new reservation')
            self._make_reservation(self.site)

        if not self.job_id:
            logger.error("Could not get a reservation for the job")
            exit(6)

        EX5.wait_oar_job_start(self.job_id, self.site)

        nodes = EX5.get_oar_job_nodes(self.job_id, self.site)
        pp(nodes)
        return nodes[0]
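The reuse-or-reserve pattern above boils down to a handful of execo_g5k calls. Below is a minimal standalone sketch of the same logic; the resource string, walltime default and error handling are illustrative assumptions, not taken from the original project:

import execo_g5k as EX5

def reuse_or_reserve_nodes(cluster, job_name, walltime="1:00:00"):
    """Reuse a running OAR job named job_name, or submit a new one."""
    site = EX5.get_cluster_site(cluster)
    # look for a running job carrying the expected name
    job_id = None
    for jid, job_site in EX5.get_current_oar_jobs([site]):
        if EX5.get_oar_job_info(jid, job_site).get('name') == job_name:
            job_id = jid
            break
    if job_id is None:
        submission = EX5.OarSubmission(resources="nodes=1",  # assumed resources
                                       walltime=walltime, name=job_name)
        ((job_id, _),) = EX5.oarsub([(submission, site)])
        if job_id is None:
            raise RuntimeError("could not get a reservation")
    EX5.wait_oar_job_start(job_id, site)
    return EX5.get_oar_job_nodes(job_id, site)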
Code Example #2
File: os-distri-db.py Project: Marie-Donnie/misc
    def run(self):
        """Perform experiment"""
        try:
            if self.job_id is None:
                # make a reservation                                 
                logger.info("Making a reservation for %s nodes, on site: %s, during %s" % (str(self.machines), self.site, self.duration))
                job = ex5.oarsub([( ex5.OarSubmission(resources = "/cluster=1/nodes="+str(self.machines), walltime=self.duration, job_type='deploy'), self.site)])
                job_id, site = job[0]
                if job_id is None:
                    raise ValueError("Could not find a slot for the requested resources.")
                logger.info("Using new oar job : %s, on site : %s" % (job_id, site))
                logger.info("Waiting for job to start")
                self.nodes = ex5.get_oar_job_nodes(job_id, site)
                logger.info("Reservation done")
                if self.resa_only:
                    sys.exit(0)
            else:
                if self.job_id.isdigit():
                    self.nodes = ex5.get_oar_job_nodes(int(self.job_id), self.job_site)
                else:
                    raise ValueError("Use a number for job id")
            logger.info("Using nodes : %s" % self.nodes)
            
        except Exception as e:
            t, value, tb = sys.exc_info()
            print(str(t) + " " + str(value))
            traceback.print_tb(tb)
            logger.info(__doc__)
            sys.exit(3)

        try:
            # this will be useful to create a result folder
            self.dt = datetime.datetime.now().strftime('%Y%m%d_%H%M')

            # run tests for each implementation
            for impl in self.implementation:
                logger.info("Using %s for implementation" % impl)
                if impl not in {"mysql", "disco"}:
                    raise ValueError("Use only mysql or disco arguments")                    
                self.deploy_ubuntu()
                self._preparation(impl)
                self._deploy_disco_vag()
                self._rally()
                self._get_files(impl)
                    
        except Exception as e:
            t, value, tb = sys.exc_info()
            print(str(t) + " " + str(value))
            traceback.print_tb(tb)
            logger.info(__doc__)
            sys.exit(4)
Code Example #3
File: util.py Project: sarlam/hadoop_g5k
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an oargrid_job_id.
        If a file is used, each host should be on a separate line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        for line in open(hosts_input):
            h = Host(line.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the input is a number, we assume it is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    logger.debug(
        'Hosts list: \n%s',
        ' '.join(style.host(host.address.split('.')[0]) for host in hosts))
    return hosts
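For reference, the three accepted input forms translate to calls like these (the job ids are hypothetical placeholders, not real reservations):

import os

hosts = generate_hosts(os.environ['OAR_NODEFILE'])     # node file of the running job
hosts = generate_hosts('rennes:1839472,nancy:998311')  # site:job_id pairs (ids made up)
hosts = generate_hosts('56870')                        # an oargrid_job_id (made up)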
Code Example #4
 def worker_start(self, cluster, site, oarsubmission, data, worker_index):
     th = current_thread()
     th.cluster = cluster
     th.site = site
     th.worker_index = worker_index
     th.jobid = None
     try:
         with th.oarsublock:
             if th.willterminate:
                 return
             worker_log.detail("submit oar job")
             ((th.jobid, _), ) = oarsub([(oarsubmission, site)])
         if not th.jobid:
             worker_log.detail("job submission failed")
             self.worker(cluster, site, data, None, worker_index,
                         oarsubmission, None)
             return
         worker_log.detail("job submitted - wait job start")
         wait_oar_job_start(th.jobid,
                            site,
                            prediction_callback=lambda ts: worker_log.
                            detail("job start prediction: %s" %
                                   (format_date(ts), )))
         th.waiting = False
         worker_log.detail("job started - get job nodes")
         nodes = get_oar_job_nodes(th.jobid, site)
         worker_log.detail("got %i nodes" % (len(nodes), ))
         self.worker(cluster, site, data, nodes, worker_index,
                     oarsubmission, th.jobid)
     finally:
         with th.oarsublock:
             if th.jobid:
                 worker_log.detail("delete oar job")
                 oardel([(th.jobid, site)])
                 th.jobid = None
         worker_log.detail("exit")
Code Example #5
File: utils.py Project: badock/enoslib
def concretize_resources(resources, gridjob, reservation_type):
    if reservation_type == "oar":
        nodes = ex5.get_oar_job_nodes(gridjob)
    else:
        nodes = ex5.get_oargrid_job_nodes(gridjob)

    concretize_nodes(resources, nodes)

    if reservation_type == "oar":
        # This block is in charge of detecting the site of the oar reservation
        site_candidates = []
        for network_description in resources.get("machines", []):
            cluster = network_description.get("cluster")
            site_candidates += [ex5.get_cluster_site(cluster)]
        for network_description in resources.get("networks", []):
            site_candidates += [network_description.get("site", "unknown")]
        if len(set(site_candidates)) == 1:
            site = site_candidates[0]
        else:
            raise "Could not detect the g5k site of the oarjob %s" % gridjob
        job_sites = [(gridjob, site)]
    else:
        job_sites = ex5.get_oargrid_job_oar_jobs(gridjob)
    vlans = []
    for (job_id, site) in job_sites:
        vlan_ids = ex5.get_oar_job_kavlan(job_id, site)
        vlans.extend([{
            "site": site,
            "vlan_id": vlan_id
        } for vlan_id in vlan_ids])

    concretize_networks(resources, vlans)
Code Example #6
 def run_xp(self):
     """Iterate over the parameters and execute the bench"""
     while len(self.sweeper.get_remaining()) > 0:
         comb = self.sweeper.get_next()
         if comb['n_core'] > get_host_attributes(comb['cluster']+'-1')['architecture']['smt_size'] * self.n_nodes: 
             self.sweeper.skip(comb)
             continue
         logger.info('Processing new combination %s' % (comb,))
         site = get_cluster_site(comb['cluster'])
         jobs = oarsub([(OarSubmission(resources = "{cluster='" + comb['cluster']+"'}/nodes=" + str(self.n_nodes),
                                       job_type = 'allow_classic_ssh', 
                                       walltime ='0:10:00'), 
                         site)])
         if jobs[0][0]:
             try:
                 wait_oar_job_start(*jobs[0])
                 nodes = get_oar_job_nodes(*jobs[0])
                 bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                     ",".join([node.address for node in nodes]),
                     comb['n_core'],
                     get_mpi_opts(comb['cluster']),
                     comb['size'],
                     comb['n_core'])
                 lu_bench = SshProcess(bench_cmd, nodes[0])
                 lu_bench.stdout_handlers.append(self.result_dir + '/' + slugify(comb) + '.out')
                 lu_bench.run()
                 if lu_bench.ok:
                     logger.info("comb ok: %s" % (comb,))
                     self.sweeper.done(comb)
                     continue
             finally:
                 oardel(jobs)
         logger.info("comb NOT ok: %s" % (comb,))
         self.sweeper.cancel(comb)
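The skip/done/cancel calls above are the three possible outcomes of a ParamSweeper combination. A minimal sketch of that lifecycle, using the constructor form seen in Code Example #28; the parameter space and the two helper functions are illustrative placeholders:

from execo_engine import ParamSweeper, sweep

sweeper = ParamSweeper('./sweeps',                      # persistence directory
                       sweep({'cluster': ['paravance'], # illustrative space
                              'n_core': [4, 8],
                              'size': ['A', 'B']}))
while len(sweeper.get_remaining()) > 0:
    comb = sweeper.get_next()   # claim an unprocessed combination
    if not is_feasible(comb):   # is_feasible: placeholder predicate
        sweeper.skip(comb)      # infeasible, never retry it
        continue
    if run_bench(comb):         # run_bench: placeholder for the experiment
        sweeper.done(comb)      # success, mark as processed
    else:
        sweeper.cancel(comb)    # failure, put it back for a later retry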
Code Example #7
    def get_resources(self):
        """Retrieve the hosts address list and (ip, mac) list from a list of oar_result and
        return the resources which is a dict needed by g5k_provisioner
        """
        logger.info("Getting resources specs")
        self.resources = dict()
        self.hosts = list()

        for oar_job_id, site in self.oar_result:
            logger.info('Waiting for the reserved nodes on %s to be up' % site)
            if not wait_oar_job_start(oar_job_id, site):
                logger.error('The reserved resources cannot be used.\nThe program is terminated.')
                exit()

        for oar_job_id, site in self.oar_result:
            logger.info('Retrieving resource information on %s' % site)
            logger.debug('Retrieving hosts')
            hosts = [host.address for host in get_oar_job_nodes(oar_job_id, site)]

            # if len(hosts) != self.clusters[site]:

            logger.debug('Retrieving subnet')
            ip_mac, _ = get_oar_job_subnets(oar_job_id, site)
            kavlan = None
            if len(ip_mac) == 0:
                logger.debug('Retrieving kavlan')
                kavlan = get_oar_job_kavlan(oar_job_id, site)
                if kavlan:
                    ip_mac = self.get_kavlan_ip_mac(kavlan, site)
            self.resources[site] = {'hosts': hosts,
                                    'ip_mac': ip_mac,
                                    'kavlan': kavlan}

        for site, resource in self.resources.items():
            self.hosts += resource['hosts']
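The resulting structure looks like the following (all values are made up for illustration):

# self.resources = {
#     'rennes': {'hosts': ['paravance-3.rennes.grid5000.fr', ...],
#                'ip_mac': [('10.158.0.1', '00:16:3e:aa:bb:01'), ...],
#                'kavlan': None},
# }
# self.hosts then concatenates the per-site 'hosts' lists.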
Code Example #8
 def start_deploy_vmhosts(self):
     hosts = g5k.get_oar_job_nodes(*self.vmhosts_job)
     if self.multi_site():
         self.vm_hosts = hosts
     else:
         # Take all but the first host
         self.vm_hosts = sorted(hosts, key=lambda node: node.address)[1:]
     if os.path.isfile(self.args.vmhosts_env):
         deploy_opts = {"env_file": self.args.vmhosts_env}
     else:
         deploy_opts = {
             "env_name": self.args.vmhosts_env,
             "user": self.args.vmhosts_kadeploy_user
         }
     if self.multi_site():
         deploy_opts["vlan"] = self.global_vlan
         logger.debug(
             "Deploying environment '{}' on {} VM hosts in VLAN {}...".
             format(self.args.vmhosts_env, len(self.vm_hosts),
                    self.global_vlan))
     else:
         logger.debug("Deploying environment '{}' on {} VM hosts...".format(
             self.args.vmhosts_env, len(self.vm_hosts)))
     d = g5k.Deployment(self.vm_hosts, **deploy_opts)
     return g5k.kadeploy.Kadeployer(d).start()
Code Example #10
 def start_deploy_server(self):
     """Deploy the server with the given Kadeploy environment.  Blocks until
     deployment is done"""
     # Sort hosts by name and take the first one: this is useful when
     # using a single reservation for all nodes, since we will always
     # pick the same host as server.
     self.server = sorted(g5k.get_oar_job_nodes(*self.server_job),
                          key=lambda node: node.address)[0]
     if os.path.isfile(self.args.server_env):
         deploy_opts = {"env_file": self.args.server_env}
     else:
         deploy_opts = {
             "env_name": self.args.server_env,
             "user": self.args.kadeploy_user
         }
     if self.multi_site():
         deploy_opts["vlan"] = self.global_vlan
         logger.debug(
             "Deploying environment '{}' on server {} in VLAN {}...".format(
                 self.args.server_env, self.server.address,
                 self.global_vlan))
     else:
         logger.debug("Deploying environment '{}' on server {}...".format(
             self.args.server_env, self.server.address))
     d = g5k.Deployment([self.server], **deploy_opts)
     return g5k.kadeploy.Kadeployer(d).start()
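Note that the method returns a started, asynchronous Kadeployer action. A caller wanting the blocking behaviour used elsewhere in these examples could go through the deploy() helper instead; a sketch reusing the deploy_opts computed above:

deployed, undeployed = g5k.deploy(g5k.Deployment([self.server], **deploy_opts))
if undeployed:
    logger.warn("server deployment failed: %s" % str(undeployed))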
Code Example #11
File: utils.py Project: lpouillo/vm5k
def get_oar_job_vm5k_resources(jobs):
    """Retrieve the hosts list and (ip, mac) list from a list of oar_job and
    return the resources dict needed by vm5k_deployment """
    resources = {}
    for oar_job_id, site in jobs:
        logger.detail('Retrieving resources from %s:%s',
                      style.emph(site), oar_job_id)
        oar_job_id = int(oar_job_id)
        wait_oar_job_start(oar_job_id, site)
        logger.debug('Retrieving hosts')
        hosts = [host.address for host in get_oar_job_nodes(oar_job_id, site)]
        logger.debug('Retrieving subnet')
        ip_mac, _ = get_oar_job_subnets(oar_job_id, site)
        kavlan = None
        if len(ip_mac) == 0:
            logger.debug('Retrieving kavlan')
            kavlan = get_oar_job_kavlan(oar_job_id, site)
            if kavlan:
                assert(len(kavlan) == 1)
                kavlan = kavlan[0]
                ip_mac = get_kavlan_ip_mac(kavlan, site)
        resources[site] = {'hosts': hosts,
                           'ip_mac': ip_mac[300:],
                           'kavlan': kavlan}
    return resources
Code Example #14
File: utils.py Project: jrbalderrama/enoslib
def oar_reload_from_id(oarjob, site):
    logger.info("Reloading the resources from oar job %s", oarjob)
    job_id = int(oarjob)
    nodes = ex5.get_oar_job_nodes(job_id, site)

    vlans = []
    subnets = []
    vlans, subnets = get_network_info_from_job_id(job_id, site, vlans, subnets)
    return nodes, vlans, subnets
Code Example #15
 def log_experimental_conditions(self):
     logger.info("Random seed: {}".format(self.args.random_seed))
     logger.info("Subnet [job {}]: {}".format(self.subnet_job[0],
                                              self.subnet))
     all_hosts = sorted(
         [s.address for s in g5k.get_oar_job_nodes(*self.vmhosts_job)])
     logger.info("{} machines [job {}]: {}".format(len(all_hosts),
                                                   self.vmhosts_job[0],
                                                   ' '.join(all_hosts)))
Code Example #16
File: launch.py Project: p-jacquot/unikernel-tools
def deploy_node(job_id, site, submission):
    node = get_oar_job_nodes(job_id, site)[0]
    deployment = Deployment(hosts = [node],
                    env_file = "unikernels/hermit/debian10-x64-nfs-hermit.env",
                    user = "******",
                    other_options = "-k")
    node.user = "******"
    deploy(deployment)
    return node
Code Example #17
    def get_hosts_list(self, hosts_str):
        """Generate a list of hosts from the given string.

        Args:
          hosts_str (str): The following options are supported

            - The path of the file containing the hosts to be used. Each host
              should be on a separate line. Repeated hosts are pruned.
              Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

            - A comma-separated list of site:job_id.

            - A comma-separated list of hosts.

            - An oargrid_job_id.

        Return:
          list of Host: The list of hosts.
        """
        hosts = []
        if os.path.isfile(hosts_str):
            for line in open(hosts_str):
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
        elif ':' in hosts_str:
            # We assume the string is a comma separated list of site:job_id
            for job in hosts_str.split(','):
                site, job_id = job.split(':')
                hosts += get_oar_job_nodes(int(job_id), site)
        elif "," in hosts_str:
            # We assume the string is a comma separated list of hosts
            for hstr in hosts_str.split(','):
                h = Host(hstr.rstrip())
                if h not in hosts:
                    hosts.append(h)
        elif hosts_str.isdigit():
            # If the input is a number, we assume it is an oargrid_job_id
            hosts = get_oargrid_job_nodes(int(hosts_str))
        else:
            # If none of the above match, we assume the input is a single host
            hosts = [Host(hosts_str.rstrip())]

        logger.debug('Hosts list: \n%s',
                     ' '.join(style.host(host.address.split('.')[0])
                              for host in hosts))
        return hosts
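The two input forms not shown in the generate_hosts example above would be exercised like this; engine stands for an instance of the enclosing class and the hostnames are hypothetical:

hosts = engine.get_hosts_list('graphene-10.nancy.grid5000.fr,'
                              'graphene-11.nancy.grid5000.fr')  # host list
hosts = engine.get_hosts_list('graphene-10.nancy.grid5000.fr')  # single host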
Code Example #19
File: l2c_fft.py Project: lpouillo/execo-g5k-tools
 def get_nodes(self, comb):
     """
         Perform a submission for a given comb and 
         retrieve the submission node list
     """
     logger.info('Performing submission')
     n_core = get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size']
     submission = OarSubmission(resources="nodes=%d" % (max(1, comb['cores']/n_core), ), 
                sql_properties="cluster='%s'"%comb['cluster'],
                job_type="besteffort", 
                name="l2c_fft_eval")
     self.oar_job_id, self.frontend = oarsub([(submission, get_cluster_site(comb['cluster']))])[0]
     logger.info("Waiting for job start")
     wait_oar_job_start(self.oar_job_id, self.frontend)
     logger.info("Retrieving hosts list")
     nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
     self.hosts = [host for host in nodes for i in range(n_core)]
Code Example #20
 def prepare_bench(self):
     """bench configuration and compilation, copy binaries to frontends
     
     return True if preparation is ok
     """
     logger.info("preparation: configure and compile benchmark")
     # the involved sites. We will do the compilation on the first of these.
     sites = list(set(map(get_cluster_site, self.parameters['cluster'])))
     # generate the bench compilation configuration
     bench_list = '\n'.join([ 'lu\t%s\t%s' % (size, n_core)
                              for n_core in self.parameters['n_core']
                              for size in self.parameters['size'] ])
     # Reserving a node because compiling on the frontend is forbidden
     # and because we need mpif77
     jobs = oarsub([(OarSubmission(resources = "nodes=1",
                                   job_type = 'allow_classic_ssh',
                                   walltime ='0:10:00'), sites[0])])
     if jobs[0][0]:
         try:
             logger.info("copying bench archive to %s" % (sites[0],))
             copy_bench = Put([sites[0]], ['NPB3.3-MPI.tar.bz2']).run()
             logger.info("extracting bench archive on %s" % (sites[0],))
             extract_bench = Remote('tar -xjf NPB3.3-MPI.tar.bz2', [sites[0]]).run()
             logger.info("waiting job start %s" % (jobs[0],))
             wait_oar_job_start(*jobs[0], prediction_callback = pred_cb)
             logger.info("getting nodes of %s" % (jobs[0],))
             nodes = get_oar_job_nodes(*jobs[0])
             logger.info("configure bench compilation")
             conf_bench = Remote('echo "%s" > ~/NPB3.3-MPI/config/suite.def' % bench_list, nodes).run()
             logger.info("compil bench")
             compilation = Remote('cd NPB3.3-MPI && make clean && make suite', nodes).run()
             logger.info("compil finished")
         except Exception:
             logger.error("unable to compile bench")
             return False
         finally:
             oardel(jobs)
     else:
         # submission failed: no node to compile on
         return False
     # Copying binaries to all other frontends
     frontends = sites[1:]
     rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI', 
                    [get_host_site(nodes[0])] * len(frontends)) 
     rsync.run()
     return compilation.ok and rsync.ok
Code Example #21
File: engine.py Project: badock/vm5k
def get_cpu_topology(cluster, xpdir=None):
    """ """
    logger.info('Determining the architecture of cluster ' + \
                style.emph(cluster))
    root = None
    # Trying to read the topology from a cache directory
    if xpdir:
        fname = xpdir + '/topo_' + cluster + '.xml'
        try:
            tree = parse(fname)
            root = tree.getroot()
        except Exception:
            logger.info('No cache file found, will reserve a node and ' +
                        'determine topology from virsh capabilities')

    if root is None:
        frontend = get_cluster_site(cluster)
        submission = OarSubmission(
            resources="{cluster='" + cluster + "'}/nodes=1",
            walltime="0:02:00",
            job_type="allow_classic_ssh")
        ((job_id, _), ) = oarsub([(submission, frontend)])
        wait_oar_job_start(job_id, frontend)
        host = get_oar_job_nodes(job_id, frontend)[0]
        capa = SshProcess('virsh capabilities', host,
            connection_params={'user': default_frontend_connection_params['user']}
            ).run()
        oardel([(job_id, frontend)])
        root = fromstring(capa.stdout)
        if xpdir is not None:
            tree = ElementTree(root)
            tree.write(fname)

    cpu_topology = []
    i_cell = 0
    for cell in root.findall('.//cell'):
        cpu_topology.append([])
        for cpu in cell.findall('.//cpu'):
            cpu_topology[i_cell].append(int(cpu.attrib['id']))
        i_cell += 1
    logger.info(pformat(cpu_topology))
    return cpu_topology
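An illustrative call and result; the cluster name is an assumption and the cpu ids describe a hypothetical eight-core node whose cores alternate between two NUMA cells:

topology = get_cpu_topology('paravance', xpdir='/tmp/topo_cache')
# topology == [[0, 2, 4, 6], [1, 3, 5, 7]]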
Code Example #22
File: util.py Project: djamelinfo/hadoop_g5k
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        a comma-separated list of site:job_id, a comma-separated list of
        hosts, or an oargrid_job_id.
        If a file is used, each host should be on a separate line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        for line in open(hosts_input):
            h = Host(line.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif ":" in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(","):
            site, job_id = job.split(":")
            hosts += get_oar_job_nodes(int(job_id), site)
    elif "," in hosts_input:
        # We assume the string is a comma separated list of hosts
        for hstr in hosts_input.split(","):
            h = Host(hstr.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif hosts_input.isdigit():
        # If the input is a number, we assume it is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    else:
        # If none of the above match, we assume the input is a single host
        hosts = [Host(hosts_input.rstrip())]

    logger.debug("Hosts list: \n%s", " ".join(style.host(host.address.split(".")[0]) for host in hosts))
    return hosts
Code Example #24
    def run(self):
        # Go to the result folder before everything
        os.chdir(self.result_dir)

        jobs = [(_jobID, _site)]
        # Get nodes
        nodes = get_oar_job_nodes(_jobID, _site)

        try:
            logger.info("Creating hostfiles for all combinations...")
            for nbr_node in _nbrNodes:
                hostfile_filename = self.result_dir + '/' + 'hostfile-' + nbr_node
                with open(hostfile_filename, 'w') as hostfile:
                    for node in nodes[:int(nbr_node)]:
                        hostfile.write(node.address + '\n')

            spack_process = Process('spack install -v chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt')            
            spack_process.start()
            spack_process.wait()
            spack_process.kill()

        finally:
            logger.info("Delete job: {}".format(jobs))
            oardel(jobs)
Code Example #26
File: test-g5k.py Project: omerjerk/pando-computing
        interrupted = False
        print('interrupting previous command')
        workers.kill()
        execo.sleep(1)
        print('sending command: ' + line)
        workers = execo.Remote(line, cores).start()


app = App()

if jobid:
    try:
        print('Waiting for job to start')
        execo_g5k.wait_oar_job_start(jobid, site)
        print('Retrieving nodes')
        nodes = execo_g5k.get_oar_job_nodes(jobid, site)
        # Setup nodes
        print('Preparing workers with cmd: ' + setup_cmd)
        workers = execo.Remote(setup_cmd, nodes).start()
        workers.expect('Worker Setup Completed')
        workers.kill()
        # Possibly open more than one connection per machine
        cores = nodes * args.nb_cores
        print(cores)
        print('Example cmd: %s' % (workers_cmd))
        app.prompt = '%s (%d node(s), %d core(s)/node)> ' % (
            site, args.volunteers, args.nb_cores)
        app.cmdloop()
        # execo.sleep(600)
        # print 'Workers done'
Code Example #27
                'size': 0.004
            },  # 300*15
            #     {'number': 4500, 'size': 0.256},
            #     {'number': 300, 'size': 1},
            #     {'number': 10000, 'size': 0.032},
            #     {'number': 100, 'size': 1},
            #     {'number': 200, 'size': 0.256},
            #     {'number': 200, 'size': 1},
            #     {'number': 100, 'size': 10},
            #     {'number': 1000, 'size': 0.256},
            #     {'number': 1000, 'size': 1},
        ]
    }

    nodes = []
    for node in get_oar_job_nodes(oar_job_id=int(sys.argv[1])):
        ip_vlan, ip = get_ip(node)
        #    if ip not in ['172.16.96.4']:
        if ip not in []:
            nodes.append({'node': node, 'ip_vlan': ip_vlan, 'ip_5k': ip})
    nodes.sort()
    topology = map_nodes(nodes)
    pprint.pprint(topology)

    raw_input('go')
    #  prepare_dummy(topology['client'][0])

    #  install_bench()
    #  set_ip()
    #  install_nuttcp()
    #  install_curl()
Code Example #28
    def run(self):
        """Run the experiment"""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check that the workload files exist (assumes that the same NFS
        # mount point is present on the remote and the local environment)
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {'workload_filename': workloads}
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # A previous run (with -c result_dir) may have already skipped some combinations
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))

        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(str(
            self.sweeper.get_remaining())))

        if reservation_job_id is not None:
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(job_id,
                                   site,
                                   prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of nodes in the '
                                       'reservation ({}) does not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn("NOT deployed nodes: {}".format(
                        str(undeployed)))
                    raise RuntimeError('Deployment failed')

                if not already_configured:

                    # install OAR
                    install_cmd = "apt-get update; apt-get install -y "
                    node_packages = "oar-node"
                    logger.info("installing OAR nodes: {}".format(
                        str(nodes[1:])))
                    install_oar_nodes = Remote(
                        install_cmd + node_packages,
                        nodes[1:],
                        connection_params={'user': '******'})
                    install_oar_nodes.start()

                    server_packages = (
                        "oar-server oar-server-pgsql oar-user "
                        "oar-user-pgsql postgresql python3-pip "
                        "libjson-perl postgresql-server-dev-all")
                    install_oar_sched_cmd = """
                    mkdir -p /opt/oar_sched; \
                    cd /opt/oar_sched; \
                    git clone https://github.com/oar-team/oar3.git; \
                    cd oar3; \
                    git checkout dce942bebc2; \
                    pip3 install -e .; \
                    cd /usr/lib/oar/schedulers; \
                    ln -s /usr/local/bin/kamelot; \
                    pip3 install psycopg2
                    """
                    logger.info("installing OAR server node: {}".format(
                        str(nodes[0])))
                    install_master = SshProcess(
                        install_cmd + server_packages + ";" +
                        install_oar_sched_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    install_master.run()
                    install_oar_nodes.wait()

                    if not install_master.ok:
                        Report(install_master)

                    configure_oar_cmd = """
                    sed -i \
                        -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                        -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                        -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                        -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                        -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                        -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                        -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                        -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                        /etc/oar/oar.conf
                    """.format(result_dir=self.result_dir)
                    configure_oar = Remote(configure_oar_cmd,
                                           nodes,
                                           connection_params={'user': '******'})
                    configure_oar.run()
                    logger.info("OAR is configured on all nodes")

                    # Configure server
                    create_db = "oar-database --create --db-is-local"
                    config_oar_sched = (
                        "oarnotify --remove-queue default;"
                        "oarnotify --add-queue default,1,kamelot")
                    start_oar = "systemctl start oar-server.service"
                    logger.info("configuring OAR database: {}".format(
                        str(nodes[0])))
                    config_master = SshProcess(
                        create_db + ";" + config_oar_sched + ";" + start_oar,
                        nodes[0],
                        connection_params={'user': '******'})
                    config_master.run()

                    # propagate SSH keys
                    logger.info("configuring OAR SSH")
                    oar_key = "/tmp/.ssh"
                    Process('rm -rf ' + oar_key).run()
                    Process(
                        'scp -o BatchMode=yes -o PasswordAuthentication=no '
                        '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                        '-o ConnectTimeout=20 -rp -o User=root ' +
                        nodes[0].address + ":/var/lib/oar/.ssh"
                        ' ' + oar_key).run()
                    # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                    Put(nodes[1:], [oar_key],
                        "/var/lib/oar/",
                        connection_params={
                            'user': '******'
                        }).run()
                    add_resources_cmd = """
                    oarproperty -a cpu || true; \
                    oarproperty -a core || true; \
                    oarproperty -c -a host || true; \
                    oarproperty -a mem || true; \
                    """
                    for node in nodes[1:]:
                        add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(
                            node=node.address)

                    add_resources = SshProcess(
                        add_resources_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    add_resources.run()

                    if add_resources.ok:
                        logger.info("oar is now configured!")
                    else:
                        raise RuntimeError(
                            "error in the OAR configuration: Abort!")

                # TODO backup de la config de OAR

                # Do the replay
                logger.info('beginning the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(
                        combi['workload_filename'])
                    oar_replay = SshProcess(
                        script_path + "/oar_replay.py " +
                        combi['workload_filename'] + " " + self.result_dir +
                        "  oar_gant_" + workload_file, nodes[0])
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")

            except:
                traceback.print_exc()
                ipdb.set_trace()

            finally:
                if is_a_test:
                    ipdb.set_trace()
                if reservation_job_id is None:
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
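For reference, each iteration of the oarnodesetting loop above appends one line per node to add_resources_cmd; for a hypothetical node parasilo-2.rennes.grid5000.fr the generated fragment is:

# oarnodesetting -a -h parasilo-2.rennes.grid5000.fr -p host=parasilo-2.rennes.grid5000.fr -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \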
Code Example #29
    def run(self):
        sweeper = self.create_paramsweeper()

        while True:
            comb = sweeper.get_next()
            if not comb:
                break
            comb_dir = self.result_dir + '/' + slugify(comb)
            if not os.path.isdir(comb_dir):
                os.mkdir(comb_dir)
            comb_file = comb_dir + '/trace'
            g5k_configuration['kadeploy3'] = comb['version']
            logger.info('Treating combination %s', pformat(comb))
            get_version = SshProcess(
                comb['version'] + ' -v',
                comb['site'],
                connection_params=default_frontend_connection_params).run()
            logger.info(get_version.stdout)

            resources = ""
            if comb['kavlan']:
                resources += "{type='kavlan'}/vlan=1+"
            resources += "nodes=" + str(comb['n_nodes'])
            sub = OarSubmission(resources=resources,
                                job_type='deploy',
                                walltime="0:30:00",
                                name='Kadeploy_Tests')
            logger.info('Performing submission of %s on site %s', resources,
                        comb['site'])
            jobs = oarsub([(sub, comb['site'])])

            if jobs[0][0]:
                try:
                    logger.info('Waiting for job to start')
                    wait_oar_job_start(jobs[0][0], jobs[0][1])
                    hosts = get_oar_job_nodes(jobs[0][0], jobs[0][1])
                    logger.info('Deployment of %s',
                                ' '.join([host.address for host in hosts]))
                    kavlan = get_oar_job_kavlan(jobs[0][0], jobs[0][1])
                    if kavlan:
                        logger.info('In kavlan %s', kavlan)
                    deployment = Deployment(hosts,
                                            env_name=comb['env'],
                                            vlan=kavlan)
                    deployed, undeployed = deploy(deployment,
                                                  stdout_handlers=[comb_file],
                                                  stderr_handlers=[comb_file])

                finally:
                    logger.info('Destroying job %s on %s', str(jobs[0][0]),
                                jobs[0][1])
                    oardel([(jobs[0][0], jobs[0][1])])
            else:
                logger.error('Job submission failed on %s', comb['site'])
                sweeper.cancel(comb)
                continue

            if len(undeployed) == 0:
                logger.info('%s is OK', slugify(comb))
            elif len(deployed) == 0:
                logger.error('%s is KO', slugify(comb))
            else:
                logger.warning('%s encountered problems with some hosts',
                               slugify(comb))

            sweeper.done(comb)
Code Example #30
def host_rewrite_func(host):
    return re.sub(r"\.grid5000\.fr$", ".g5k", host)

# sites = EX5.get_g5k_sites()
# sites.remove('bordeaux')


EX.logger.setLevel('INFO')
jobs = EX5.get_current_oar_jobs(['reims'])

if len(jobs) == 0:
    jobs = EX5.oarsub([(EX5.OarSubmission(resources="{type='kavlan'}/vlan=1+nodes=2", walltime="3:00:00", job_type='deploy'), "reims")])
    EX5.wait_oar_job_start(oar_job_id=jobs[0][0], frontend=jobs[0][1])

print(jobs)
hosts = EX5.get_oar_job_nodes(jobs[0][0], jobs[0][1])
print(hosts)
kavlan_id = EX5.get_oar_job_kavlan(jobs[0][0], jobs[0][1])
print(kavlan_id)
deployment = EX5.Deployment(hosts=hosts, env_name="ubuntu-x64-1204", vlan=kavlan_id)

deployed_hosts, undeployed_hosts = EX5.deploy(deployment)
#deployed_hosts, undeployed_hosts = EX5.deploy(deployment, num_tries=0,check_deployed_command=True)

if kavlan_id is not None:
    hosts = [EX5.get_kavlan_host_name(host, kavlan_id) for host in deployed_hosts]
print(hosts[0])


def get_kavlan_network(kavlan, site):
    """Retrieve the network parameters for a given kavlan from the API"""
Code Example #31
import os

from execo_g5k import get_oar_job_nodes
from hadoop_g5k import HadoopV2Cluster
from hadoop_g5k.ecosystem.spark import YARN_MODE, SparkCluster, \
    JavaOrScalaSparkJob

# Parameters
hosts = get_oar_job_nodes(int(os.environ["OAR_JOB_ID"]), None)

hadoop_tar_file = "/home/mliroz/public/sw/hadoop/hadoop-2.6.0.tar.gz"
spark_tar_file = "/home/mliroz/public/sw/spark/spark-1.5.1-bin-hadoop2.6.tgz"

jar_path = "/home/mliroz/public/sw/spark/spark-examples-1.5.1-hadoop2.6.0.jar"

# Create and configure Hadoop cluster
hc = HadoopV2Cluster(hosts)
hc.bootstrap(hadoop_tar_file)
hc.initialize()
hc.start_and_wait()

# Create and configure Spark cluster
sc = SparkCluster(YARN_MODE, hadoop_cluster=hc)
sc.bootstrap(spark_tar_file)
sc.initialize()
sc.start()

# Execute job
main_class = "org.apache.spark.examples.SparkPi"
params = []
Code Example #32
    def make_reservation(self):
        """Perform a reservation of the required number of nodes."""
        if self.oar_result:
            message = "Validated OAR_JOB_ID:"
            for job_id, site in self.oar_result:
                message += "\n%s: %s" % (site, job_id)
            logger.info(message)
            message = "The list of hosts:"
            for job_id, site in self.oar_result:
                hosts = get_oar_job_nodes(oar_job_id=job_id, frontend=site)
                message += "\n--- %s: %s nodes ---" % (site, len(hosts))
                for host in hosts:
                    message += "\n%s" % (host.address)
            logger.info(message)
            return

        # format the walltime (in seconds) as H:M:S for the log message;
        # time.strftime would wrap hours past 24 (e.g. 90000 s is 25:00:00)
        walltime = '%d:%02d:%02d' % (self.configs['walltime'] // 3600,
                                     (self.configs['walltime'] % 3600) // 60,
                                     self.configs['walltime'] % 60)
        message = 'You are requesting %s nodes for %s:' % (sum(self.clusters.values()), walltime)

        for cluster, n_nodes in self.clusters.items():
            message += "\n%s: %s nodes" % (cluster, n_nodes)
        logger.info(message)

        logger.info('Performing reservation .......')
        if 'starttime' not in self.configs or self.configs['starttime'] is None:
            self.configs['starttime'] = int(
                time.time() + timedelta_to_seconds(datetime.timedelta(minutes=1)))

        starttime = int(get_unixts(self.configs['starttime']))
        endtime = int(
            starttime + timedelta_to_seconds(datetime.timedelta(days=3, minutes=1)))
        startdate = self._get_nodes(starttime, endtime)

        while startdate is None:
            logger.info('Not enough nodes found between %s and %s, '
                        'increasing the window time...',
                        format_date(starttime), format_date(endtime))
            starttime = endtime
            endtime = int(
                starttime + timedelta_to_seconds(datetime.timedelta(days=3, minutes=1)))

            startdate = self._get_nodes(starttime, endtime)
            if starttime > int(self.configs['starttime'] + timedelta_to_seconds(datetime.timedelta(weeks=6))):
                logger.error(
                    'What a pity! There is no slot which satisfies your request until %s :(' % format_date(endtime))
                exit()

        jobs_specs = get_jobs_specs(self.clusters, name=self.job_name)
        for job_spec, site_name in jobs_specs:
            tmp = str(job_spec.resources).replace('\\', '')
            job_spec.resources = 'slash_22=4+' + tmp.replace('"', '')
            job_spec.walltime = self.configs['walltime']
            # -t deploy to reserve node without deploying OS
            job_spec.additional_options = '-t deploy'
            job_spec.reservation_date = startdate + 10

        self.oar_result = oarsub(jobs_specs)

        for oar_job_id, _ in self.oar_result:
            if oar_job_id is None:
                logger.info('Performing reservation FAILED')
                exit()

        message = "Reserved nodes successfully!!! \nOAR JOB ID:\n"
        for each in self.oar_result:
            message += "%s:%s," % (each[1], each[0])
        logger.info(message)
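For reference, self.oar_result (as filled by oarsub(jobs_specs)) is a list of (oar_job_id, site) pairs, e.g. (values made up):

# [(1839472, 'rennes'), (998311, 'nancy')]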
Code Example #33
File: fp_hadoop.py Project: lpouillo/execo-g5k-tools
    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        self.cluster = self.args[0]
        self.site = get_cluster_site(self.cluster)
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns
            # a list of parameter dicts
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
                # Hosts deployment
                deployed, undeployed = deploy(Deployment(self.hosts, 
                    env_file="/home/mliroz/deploys/hadoop6.env"))
                logger.info("%i deployed, %i undeployed" % (len(deployed), 
                                                            len(undeployed)))
                if len(deployed) == 0:
                    break
                # System configuration => look at the execo_g5k.topology module
                attr = get_host_attributes(self.cluster + '-1')
                
                ## SETUP FINISHED
                
                # Getting the next combination
                comb = self.sweeper.get_next()
                self.prepare_dataset(comb)
                self.xp(comb)
                # subloop over the combinations that have the same sizes
                while True:
                    newcomb = self.sweeper.get_next(lambda r:
                            filter(lambda subcomb: subcomb['sizes'] == comb['sizes'], r))
                    if newcomb:
                        try:
                            self.xp(newcomb)
                        except Exception:
                            break
                    else:
                        break

                if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging') 
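
The subloop above relies on ParamSweeper.get_next accepting a filter over the remaining combinations. A minimal sketch of the same idiom, written as a named function for readability (the function name is illustrative, not from the original project):

# hedged sketch: draw only combinations sharing the current 'sizes' value
def same_sizes(remaining):
    return [c for c in remaining if c['sizes'] == comb['sizes']]

newcomb = self.sweeper.get_next(same_sizes)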
コード例 #34
    def run(self):
        sweeper = self.create_paramsweeper()

        while True:
            comb = sweeper.get_next()
            if not comb:
                break
            comb_dir = self.result_dir + '/' + slugify(comb)
            if not os.path.isdir(comb_dir):
                os.mkdir(comb_dir)
            comb_file = comb_dir + '/trace'
            g5k_configuration['kadeploy3'] = comb['version']
            logger.info('Treating combination %s', pformat(comb))
            get_version = SshProcess(comb['version'] + ' -v',
                                     comb['site'],
                                     connection_params=default_frontend_connection_params).run()
            logger.info(get_version.stdout)

            resources = ""
            if comb['kavlan']:
                resources += "{type='kavlan'}/vlan=1+"
            resources += "nodes=" + str(comb['n_nodes'])
            sub = OarSubmission(resources=resources,
                                job_type='deploy',
                                walltime="0:30:00",
                                name='Kadeploy_Tests')
            logger.info('Performing submission of %s on site %s',
                        resources, comb['site'])
            jobs = oarsub([(sub, comb['site'])])

            if jobs[0][0]:
                try:
                    logger.info('Waiting for job to start')
                    wait_oar_job_start(jobs[0][0], jobs[0][1])
                    hosts = get_oar_job_nodes(jobs[0][0], jobs[0][1])
                    logger.info('Deployment of %s',
                                ' '.join([host.address for host in hosts]))
                    kavlan = get_oar_job_kavlan(jobs[0][0], jobs[0][1])
                    if kavlan:
                        logger.info('In kavlan %s', kavlan)
                    deployment = Deployment(hosts, env_name=comb['env'],
                                            vlan=kavlan)
                    deployed, undeployed = deploy(deployment,
                                                  stdout_handlers=[comb_file],
                                                  stderr_handlers=[comb_file])

                finally:
                    logger.info('Destroying job %s on %s', str(jobs[0][0]),
                                jobs[0][1])
                    oardel([(jobs[0][0], jobs[0][1])])
            else:
                deployed, undeployed = [], []

            # check deployed first: with no job, nothing was deployed at all
            if len(deployed) == 0:
                logger.error('%s is KO', slugify(comb))
            elif len(undeployed) == 0:
                logger.info('%s is OK', slugify(comb))
            else:
                logger.warning('%s encountered problems with some hosts',
                               slugify(comb))

            sweeper.done(comb)
コード例 #35
    def run(self):
        # Go to the result folder before everything
        os.chdir(self.result_dir)

        # OARSUB
        jobs = oarsub([(OarSubmission(resources='nodes=' + str(_nbrNodes),
                                      job_type='deploy', 
                                      walltime=_walltime, 
                                      sql_properties=_properties), _site)])
        
        job_id, site = jobs[0]
        try:
            # KADEPLOY
            logger.info("Waiting job start %s on %s" % (job_id, site))
            wait_oar_job_start(job_id, site, prediction_callback=prediction_callback)
            logger.info("getting nodes of %s on %s" % (job_id, site))
            nodes = get_oar_job_nodes(job_id, site)

            deployed, undeployed = deploy(Deployment(nodes, env_name=env_name),
                                          check_deployed_command=already_configured)
            if undeployed:
                logger.warn(
                    "NOT deployed nodes: {}".format(str(undeployed)))
                raise RuntimeError('Deployment failed')

            # STARPU INSTALLATION
            spack_spec = 'chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt'
            spack_command = 'spack install -v' + ' ' + spack_spec 

            logger.info("Starting StarPU installation...")
            spack_process = Process(spack_command).start()
            spack_process.wait()

            logger.info("StarPU installation DONE...")
            self.checkProcess(spack_process)
            spack_process.kill()

            # STARPU DIRECTORY
            logger.info("Searching and going to StarPU installation directory...")

            # The original ran Process(spack_spec), which would execute the spec
            # string itself; presumably `spack location --install-dir` was intended.
            starpu_location_process = Process('spack location --install-dir ' + spack_spec).start()
            starpu_location_process.wait()
            self.checkProcess(starpu_location_process)
            starpu_location = starpu_location_process.stdout.strip()

            starpu_cd_process = Process('cd ' + starpu_location + '/lib/chameleon').start()
            starpu_cd_process.wait()
            self.checkProcess(starpu_cd_process)

            starpu_location_process.kill()
            starpu_cd_process.kill()

            # RUNNING EXPERIMENT
            logger.info("Starting StarPU experiment...")
            starpu_experiment_process = Process(""" export STARPU_WORKER_STATS=1
                                                    export STARPU_CALIBRATE=2
                                                    ./timing/time_spotrf_tile --warmup --gpus=3 --threads=9 --nb=960 --ib=96 --n_range=48000:48000:9600 """)
            starpu_experiment_process.stdout_handlers.append(self.result_dir + '/' + 'StarPU.out') # create output file for StarPU    
            starpu_experiment_process.start()
            starpu_experiment_process.wait()

            logger.info("StarPU experiment DONE...")
            self.checkProcess(starpu_experiment_process)        
            starpu_experiment_process.kill()

        finally:
            logger.info("Delete job: {}".format(jobs))
            oardel(jobs)
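
As flagged in the inline comment, the directory lookup above was repaired under the assumption that `spack location` was the intended command; a standalone sketch of that lookup with execo:

from execo import Process

# hedged sketch: resolve the install prefix of a spack spec
loc = Process('spack location --install-dir ' + spack_spec).run()
install_prefix = loc.stdout.strip()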
コード例 #36
    
if oargrid_job_id < 0:
    print oargrid_job_id
    logger.info("No resources available")
    logger.info("End of program")
    sys.exit(0)

logger.info("Wait for job to start...")
wait_oar_job_start(oargrid_job_id)  # or wait_oargrid_job_start(oargrid_job_id) for an oargrid job
print oargrid_job_id
print ssh_key
nodes = get_oar_job_nodes(oargrid_job_id)  # or get_oargrid_job_nodes(oargrid_job_id)
logger.info("Job has started")
 
print nodes

logger.info("Deployment started")
#logger.setLevel(1)
deploy_nodes, ko_nodes = deploy(Deployment(hosts=nodes, env_name="wheezy-x64-diet",
                                           user="******", other_options='-d -V4'),
                                out=True, check_deployed_command=True)
logger.info("Deployment completed")
 
if not deploy_nodes:
    logger.info("No nodes were correctly deployed")
    logger.info("End of program")
コード例 #37
ファイル: jobdeployssh.py プロジェクト: Marie-Donnie/misc
try:
    # makes a reservation
    print("Making a reservation")
    planning = plan.get_planning(endtime=end)
    slots = plan.compute_slots(planning, walltime="03:00:00", excluded_elements=excluded)
    startdate, enddate, resources = plan.find_free_slot(slots, {'grid5000':1})                    
    resources = plan.distribute_hosts(resources, {'grid5000':1}, excluded_elements=excluded)
    specs = plan.get_jobs_specs(resources, excluded_elements=excluded)
    sub, site = specs[0]
    sub.additional_options = "-t deploy"
    sub.reservation_date = startdate
    sub.walltime = "03:00:00"
    job = ex5.oarsub([(sub, site)])
    job_id = job[0][0]
    job_site = job[0][1]
    host = ex5.get_oar_job_nodes(job_id, job_site)
    
except Exception as e:
    t, value, tb = sys.exc_info()
    print str(t) + " " + str(value)
    traceback.print_tb(tb)

try:
    # deploys
    print("Deploying monubuntu at " + job_site)
    deployment = ex5.kadeploy.Deployment(hosts=host, env_file=envfile)
    deployed_hosts, _ = ex5.deploy(deployment)
    # print("Deployed on " + deployed_host[0])
    
    if len(deployed_hosts) != 0:
        # commented because the slave agent is already downloaded
コード例 #38
def get_job_info(site, job_id):
	"""Return the first node of job_id if it is among the site's current jobs."""
	jobs = EX5.get_current_oar_jobs([site])
	for t in jobs:
		# the original returned on the first job unconditionally; match job_id instead
		if t[0] == job_id:
			return EX5.get_oar_job_nodes(job_id, site)[0]
	return None
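
A usage sketch for the helper above (the site name and job id are illustrative):

# hedged sketch: fetch the head node of an already-running job
head = get_job_info('rennes', 123456)
if head is None:
	print "job 123456 is not among the current jobs on rennes"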
コード例 #39
    def run(self):
        """Run the experiment"""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check that the workload files exist (assumes the same NFS mount
        # point is present on the remote and the local environment)
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {
            'workload_filename': workloads
        }
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # Skip combinations already treated in a previous run (when using -c result_dir)
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))

        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(
            str(self.sweeper.get_remaining())))

        if reservation_job_id is not None:
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(
                    job_id, site, prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of nodes in the '
                                       'reservation ({}) does not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn(
                        "NOT deployed nodes: {}".format(str(undeployed)))
                    raise RuntimeError('Deployment failed')

                if not already_configured:

                    # install OAR
                    install_cmd = "apt-get update; apt-get install -y "
                    node_packages = "oar-node"
                    logger.info(
                        "installing OAR nodes: {}".format(str(nodes[1:])))
                    install_oar_nodes = Remote(
                        install_cmd + node_packages,
                        nodes[1:],
                        connection_params={'user': '******'})
                    install_oar_nodes.start()

                    server_packages = ("oar-server oar-server-pgsql oar-user "
                                       "oar-user-pgsql postgresql python3-pip "
                                       "libjson-perl postgresql-server-dev-all")
                    install_oar_sched_cmd = """
                    mkdir -p /opt/oar_sched; \
                    cd /opt/oar_sched; \
                    git clone https://github.com/oar-team/oar3.git; \
                    cd oar3; \
                    git checkout dce942bebc2; \
                    pip3 install -e .; \
                    cd /usr/lib/oar/schedulers; \
                    ln -s /usr/local/bin/kamelot; \
                    pip3 install psycopg2
                    """
                    logger.info("installing OAR server node: {}".format(str(nodes[0])))
                    install_master = SshProcess(install_cmd + server_packages +
                                                ";" + install_oar_sched_cmd, nodes[0],
                                                connection_params={'user': '******'})
                    install_master.run()
                    install_oar_nodes.wait()

                    if not install_master.ok:
                        Report(install_master)

                    configure_oar_cmd = """
                    sed -i \
                        -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                        -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                        -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                        -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                        -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                        -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                        -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                        -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                        /etc/oar/oar.conf
                    """.format(result_dir=self.result_dir)
                    configure_oar = Remote(configure_oar_cmd, nodes,
                                           connection_params={'user': '******'})
                    configure_oar.run()
                    logger.info("OAR is configured on all nodes")

                    # Configure server
                    create_db = "oar-database --create --db-is-local"
                    config_oar_sched = ("oarnotify --remove-queue default;"
                                        "oarnotify --add-queue default,1,kamelot")
                    start_oar = "systemctl start oar-server.service"
                    logger.info(
                        "configuring OAR database: {}".format(str(nodes[0])))
                    config_master = SshProcess(create_db + ";" + config_oar_sched + ";" + start_oar,
                                               nodes[0],
                                               connection_params={'user': '******'})
                    config_master.run()

                    # propagate SSH keys
                    logger.info("configuring OAR SSH")
                    oar_key = "/tmp/.ssh"
                    Process('rm -rf ' + oar_key).run()
                    Process('scp -o BatchMode=yes -o PasswordAuthentication=no '
                            '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                            '-o ConnectTimeout=20 -rp -o User=root ' +
                            nodes[0].address + ":/var/lib/oar/.ssh"
                            ' ' + oar_key).run()
                    # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                    Put(nodes[1:], [oar_key], "/var/lib/oar/", connection_params={'user': '******'}).run()
                    add_resources_cmd = """
                    oarproperty -a cpu || true; \
                    oarproperty -a core || true; \
                    oarproperty -c -a host || true; \
                    oarproperty -a mem || true; \
                    """
                    for node in nodes[1:]:
                        add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(node=node.address)

                    add_resources = SshProcess(add_resources_cmd, nodes[0],
                                               connection_params={'user': '******'})
                    add_resources.run()

                    if add_resources.ok:
                        logger.info("oar is now configured!")
                    else:
                        raise RuntimeError("error in the OAR configuration: Abort!")

                # TODO backup de la config de OAR

                # Do the replay
                logger.info('beginning the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(combi['workload_filename'])
                    oar_replay = SshProcess(script_path + "/oar_replay.py " +
                                            combi['workload_filename'] + " " +
                                            self.result_dir + "  oar_gant_" +
                                            workload_file,
                                            nodes[0])
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")

            except:
                traceback.print_exc()
                ipdb.set_trace()

            finally:
                if is_a_test:
                    ipdb.set_trace()
                if reservation_job_id is None:
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
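
Several examples here (#35 and #39 among them) pass a prediction_callback to wait_oar_job_start without showing it. A minimal sketch of such a callback, assuming execo's format_date and a module-level logger (the body is an assumption, not the original code):

from execo.time_utils import format_date

def prediction_callback(ts):
    # wait_oar_job_start calls this with the predicted start date (unix timestamp)
    logger.info("Job start prediction: %s", format_date(ts))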
コード例 #41
    def run(self):
        """ """
        if self.options.oargrid_job_id is not None:
            self.oar_job_id = self.options.oargrid_job_id
        else:
            self.oar_job_id = None

        self.list_of_clusters = [
            'parasilo', 'paravance', 'parapluie', 'paranoia'
        ]

        try:
            # Creation of the main iterator which is used for the first control loop.
            self.define_parameters()
            self.working_dir = '/data/jorouzaudcornabas_' + str(
                self.options.storage5k_job_id)

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.submit_all_available_best_effort(
                        self.list_of_clusters, self.options.walltime)
                    # self.make_reservation_local()
                # Wait that the job starts
                logger.info('Waiting for the job to start: ' +
                            str(self.oar_job_id))
                wait_oar_job_start(self.oar_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id)
                # Hosts deployment and configuration
                default_connection_params['user'] = '******'

                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')
                #===============================================================
                # deployment = Deployment(hosts = self.hosts,
                #             env_file='/home/sirimie/env/mywheezy-x64-base.env')
                # self.hosts, _ = deploy(deployment)
                #===============================================================
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = self.hosts

                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/logs'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                logger.info("Starting the thread " + str(self.is_job_alive()) +
                            " " + str(len(threads.keys())))
                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False

                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]
                    logger.info("Launching thread")
                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([self.oar_job_id])
                else:
                    logger.info('Keeping job alive for debugging')
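
The control loop above leans on self.is_job_alive, which is not shown. A plausible sketch based on get_oar_job_info (the exact set of states is an assumption):

    def is_job_alive(self):
        # hedged sketch: consider the job alive while OAR reports it
        # waiting or running, i.e. not in a terminal or error state
        info = get_oar_job_info(self.oar_job_id)
        return info.get('state') in ('Waiting', 'Launching', 'Running')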
コード例 #42
    def run(self):
        # Defining experiment parameters
        self.parameters = {
            'n_clients': [400, 450, 500, 550, 600],
            'n_transitions': [10000]
        }
        cluster = 'griffon'
        sweeps = sweep(self.parameters)
        sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
        server_out_path = os.path.join(self.result_dir, "server.out")

        self._updateStat(sweeper.stats())

        # Loop on the number of nodes
        while True:
            # Taking the next parameter combinations
            comb = sweeper.get_next()
            if not comb: break

            # Performing the submission on G5K
            site = get_cluster_site(cluster)
            self._log("Output will go to " + self.result_dir)

            n_nodes = int(
                math.ceil(
                    float(comb['n_clients']) / EX5.get_host_attributes(
                        cluster + '-1')['architecture']['smt_size'])) + 1
            self._log("Reserving {0} nodes on {1}".format(n_nodes, site))

            resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes)
            submission = EX5.OarSubmission(resources=resources,
                                           job_type='allow_classic_ssh',
                                           walltime='00:10:00')

            job = EX5.oarsub([(submission, site)])
            self.__class__._job = job

            # Sometimes oarsub fails silently
            if job[0][0] is None:
                print("\nError: no job was created")
                sys.exit(1)

            # Wait for the job to start
            self._log(
                "Waiting for job {0} to start...\n".format(BOLD_MAGENTA +
                                                           str(job[0][0]) +
                                                           NORMAL))
            EX5.wait_oar_job_start(job[0][0],
                                   job[0][1],
                                   prediction_callback=prediction)
            nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1])

            # Deploying nodes
            #deployment = EX5.Deployment(hosts = nodes, env_file='path_to_env_file')
            #run_deploy = EX5.deploy(deployment)
            #nodes_deployed = run_deploy.hosts[0]

            # Copying active_data program on all deployed hosts
            EX.Put([nodes[0]],
                   '../dist/active-data-lib-0.1.2.jar',
                   connexion_params={
                       'user': '******'
                   }).run()
            EX.Put([nodes[0]],
                   '../server.policy',
                   connexion_params={
                       'user': '******'
                   }).run()

            # Loop on the number of requests per client process
            while True:
                # Split the nodes
                clients = nodes[1:]
                server = nodes[0]

                self._log(
                    "Running experiment with {0} nodes and {1} transitions per client"
                    .format(len(clients), comb['n_transitions']))

                # Launching Server on one node
                out_handler = FileOutputHandler(server_out_path)
                launch_server = EX.Remote(
                    'java -jar active-data-lib-0.1.2.jar', [server],
                    stdout_handler=out_handler,
                    stderr_handler=out_handler).start()
                self._log("Server started on " + server.address)
                time.sleep(2)

                # Launching clients
                rank = 0
                n_cores = EX5.get_host_attributes(
                    clients[0])['architecture']['smt_size']
                # the original used `nodes`, which would also schedule client
                # processes on the server node; presumably `clients` was intended
                cores = clients * n_cores
                cores = cores[
                    0:comb['n_clients']]  # Cut out the additional cores

                client_connection_params = {
                    'taktuk_gateway': 'lyon.grid5000.fr',
                    'host_rewrite_func': None
                }

                self._log("Launching {0} clients...".format(len(cores)))

                client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \
                    "{0} {1} {2} {3} {4}".format(server.address, 1200, "{{range(len(cores))}}", len(cores), comb['n_transitions'])
                client_out_handler = FileOutputHandler(
                    os.path.join(self.result_dir, "clients.out"))
                client_request = EX.TaktukRemote(client_cmd, cores, connexion_params = client_connection_params, \
                     stdout_handler = client_out_handler, stderr_handler = client_out_handler)

                client_request.run()

                if not client_request.ok():
                    # Some client failed, please panic
                    self._log(
                        "One or more client process failed. Enjoy reading their outputs."
                    )
                    self._log(
                        "OUTPUT STARTS -------------------------------------------------\n"
                    )
                    for process in client_request.processes():
                        print("----- {0} returned {1}".format(
                            process.host().address, process.exit_code()))
                        if not process.stdout() == "":
                            print(GREEN + process.stdout() + NORMAL)
                        if not process.stderr() == "":
                            print(RED + process.stderr() + NORMAL)
                        print("")
                    self._log(
                        "OUTPUT ENDS ---------------------------------------------------\n"
                    )
                    sweeper.skip(comb)
                    launch_server.kill()
                    launch_server.wait()
                else:
                    # Waiting for server to end
                    launch_server.wait()

                    # Getting log files
                    distant_path = OUT_FILE_FORMAT.format(
                        len(cores), comb['n_transitions'])
                    local_path = distant_path

                    EX.Get([server], distant_path).run()

                    EX.Local('mv ' + distant_path + ' ' +
                             os.path.join(self.result_dir, local_path)).run()

                    # .run() added: these two actions were created but never started
                    EX.Get([server],
                           'client_*.out',
                           local_location=self.result_dir).run()
                    EX.Remote('rm -f client_*.out', [server]).run()

                    self._log(
                        "Finishing experiment with {0} clients and {1} transitions per client"
                        .format(comb['n_clients'], comb['n_transitions']))

                    sweeper.done(comb)

                sub_comb = sweeper.get_next(filtr=lambda r: filter(
                    lambda s: s["n_clients"] == comb['n_clients'], r))
                self._updateStat(sweeper.stats())

                if not sub_comb:
                    # Killing job
                    EX5.oar.oardel(job)
                    self.__class__._job = None
                    break
                else:
                    comb = sub_comb

        print ""