Code example #1
File: engine.py Project: badock/vm5k
 def get_resources(self):
     """Retrieve the resources for the vm5k_deployment and define
     the list of hosts and ip_mac.
     """
     self.resources = get_oar_job_vm5k_resources([(self.oar_job_id,
                                                   self.frontend)])
     self.hosts = self.resources[get_cluster_site(self.cluster)]['hosts']
     self.ip_mac = self.resources[get_cluster_site(self.cluster)]['ip_mac']
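
Both attributes set above assume that get_oar_job_vm5k_resources returns a dict keyed by site name, which is why get_cluster_site is used as the lookup key. A minimal sketch of that assumed shape, with illustrative values only (the site, host and ip/mac entries are not taken from the Grid'5000 API):

# Hypothetical shape of the dict returned by get_oar_job_vm5k_resources,
# inferred from how it is indexed above; all values are illustrative.
resources = {
    "rennes": {
        "hosts": ["paravance-1.rennes.grid5000.fr"],
        "ip_mac": [("10.158.0.1", "00:16:3e:00:00:01")],
    }
}
site = "rennes"                  # in the example: get_cluster_site(self.cluster)
hosts = resources[site]["hosts"]
ip_mac = resources[site]["ip_mac"]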
Code example #2
 def get_resources(self):
     """Retrieve the resources for the vm5k_deployment and define
     the list of hosts and ip_mac.
     """
     self.resources = get_oar_job_vm5k_resources([(self.oar_job_id,
                                                   self.frontend)])
     self.hosts = self.resources[get_cluster_site(self.cluster)]['hosts']
     self.ip_mac = self.resources[get_cluster_site(self.cluster)]['ip_mac']
Code example #3
 def run_xp(self):
     """Iterate over the parameters and execute the bench"""
     while len(self.sweeper.get_remaining()) > 0:
         comb = self.sweeper.get_next()
         if comb['n_core'] > get_host_attributes(comb['cluster']+'-1')['architecture']['smt_size'] * self.n_nodes: 
             self.sweeper.skip(comb)
             continue
         logger.info('Processing new combination %s' % (comb,))
         site = get_cluster_site(comb['cluster'])
         jobs = oarsub([(OarSubmission(resources = "{cluster='" + comb['cluster']+"'}/nodes=" + str(self.n_nodes),
                                       job_type = 'allow_classic_ssh', 
                                       walltime ='0:10:00'), 
                         site)])
         if jobs[0][0]:
             try:
                 wait_oar_job_start(*jobs[0])
                 nodes = get_oar_job_nodes(*jobs[0])
                 bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                     ",".join([node.address for node in nodes]),
                     comb['n_core'],
                     get_mpi_opts(comb['cluster']),
                     comb['size'],
                     comb['n_core'])
                 lu_bench = SshProcess(bench_cmd, nodes[0])
                 lu_bench.stdout_handlers.append(self.result_dir + '/' + slugify(comb) + '.out')
                 lu_bench.run()
                 if lu_bench.ok:
                     logger.info("comb ok: %s" % (comb,))
                     self.sweeper.done(comb)
                     continue
             finally:
                 oardel(jobs)
         logger.info("comb NOT ok: %s" % (comb,))
         self.sweeper.cancel(comb)
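
Example #3 above follows the usual execo_g5k reservation lifecycle: resolve the site with get_cluster_site, submit with oarsub, wait for the job to start, fetch the nodes, run the benchmark, and always release the job with oardel in a finally block. A minimal, self-contained sketch of that pattern, assuming the execo / execo_g5k imports shown below (the cluster name, walltime and command are placeholders):

# Minimal sketch of the reserve / run / clean-up pattern used above.
from execo import SshProcess
from execo_g5k import (OarSubmission, get_cluster_site, get_oar_job_nodes,
                       oardel, oarsub, wait_oar_job_start)

cluster = "paravance"                  # placeholder cluster name
site = get_cluster_site(cluster)       # Grid'5000 site hosting the cluster
jobs = oarsub([(OarSubmission(resources="{cluster='%s'}/nodes=1" % cluster,
                              job_type="allow_classic_ssh",
                              walltime="0:10:00"),
                site)])
job_id, frontend = jobs[0]
if job_id:
    try:
        wait_oar_job_start(job_id, frontend)
        nodes = get_oar_job_nodes(job_id, frontend)
        SshProcess("hostname", nodes[0]).run()   # placeholder command
    finally:
        oardel([(job_id, frontend)])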
Code example #4
def get_clusters_interfaces(clusters, extra_cond=lambda nic: True):
    """ Returns for each cluster the available cluster interfaces

    Args:
        clusters (list): list of cluster names
        extra_cond (lambda): extra predicate used to filter the network cards
            retrieved from the API. E.g. lambda nic: not nic['mounted'] will
            retrieve all the usable network cards that are not mounted by
            default.

    Returns:
        dict of clusters with their associated nic names

    Examples:
        .. code-block:: python

            # pseudo code
            actual = get_clusters_interfaces(["paravance"])
            expected = {"paravance": ["eth0", "eth1"]}
            assertDictEquals(expected, actual)
    """
    interfaces = {}
    for cluster in clusters:
        site = EX5.get_cluster_site(cluster)
        nics = EX5.get_resource_attributes("/sites/%s/clusters/%s/nodes" %
                                           (site, cluster))
        nics = nics['items'][0]['network_adapters']
        nics = [
            nic['device'] for nic in nics
            if nic['mountable'] and nic['interface'] == 'Ethernet'
            and not nic['management'] and extra_cond(nic)
        ]
        nics = sorted(nics)
        interfaces.setdefault(cluster, nics)
    return interfaces
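
The docstring example above can be exercised roughly as follows; the cluster name and the expected NIC names come from that docstring, and a working connection to the Grid'5000 API is assumed:

# Illustrative usage of get_clusters_interfaces defined above; values mirror
# the docstring example rather than a live API call.
interfaces = get_clusters_interfaces(["paravance"],
                                     extra_cond=lambda nic: not nic['mounted'])
# expected shape: {"paravance": ["eth0", "eth1"]}
for cluster, nics in interfaces.items():
    print(cluster, nics)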
Code example #5
File: utils.py Project: badock/enoslib
def concretize_resources(resources, gridjob, reservation_type):
    if reservation_type == "oar":
        nodes = ex5.get_oar_job_nodes(gridjob)
    else:
        nodes = ex5.get_oargrid_job_nodes(gridjob)

    concretize_nodes(resources, nodes)

    if reservation_type == "oar":
        # This block is in charge of detecting the site of the oar reservation
        site_candidates = []
        for network_description in resources.get("machines", []):
            cluster = network_description.get("cluster")
            site_candidates += [ex5.get_cluster_site(cluster)]
        for network_description in resources.get("networks", []):
            site_candidates += [network_description.get("site", "unknown")]
        if len(set(site_candidates)) == 1:
            site = site_candidates[0]
        else:
            raise Exception("Could not detect the g5k site of the oarjob %s" % gridjob)
        job_sites = [(gridjob, site)]
    else:
        job_sites = ex5.get_oargrid_job_oar_jobs(gridjob)
    vlans = []
    for (job_id, site) in job_sites:
        vlan_ids = ex5.get_oar_job_kavlan(job_id, site)
        vlans.extend([{
            "site": site,
            "vlan_id": vlan_id
        } for vlan_id in vlan_ids])

    concretize_networks(resources, vlans)
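
concretize_resources walks a resources description; judging from the .get() calls above, its assumed shape is roughly the following sketch (only the keys actually read in this function are shown, everything else is omitted):

# Hypothetical resources description, limited to the keys accessed above.
resources = {
    "machines": [
        {"cluster": "paravance"},   # used to derive the site of the oar job
    ],
    "networks": [
        {"site": "rennes"},         # candidate site for the oar job
    ],
}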
Code example #6
File: utils.py Project: msimonin/deploy5k
def get_cluster_interfaces(cluster, extra_cond=lambda nic: True):
    site = ex5.get_cluster_site(cluster)
    nics = ex5.get_resource_attributes(
        "/sites/%s/clusters/%s/nodes" % (site, cluster))
    nics = nics['items'][0]['network_adapters']
    nics = [nic['device'] for nic in nics
           if nic['mountable'] and
           nic['interface'] == 'Ethernet' and
           not nic['management'] and extra_cond(nic)]
    nics = sorted(nics)
    return nics
Code example #7
File: g5k.py Project: SmartInfrastructures/enos
    def _mount_cluster_nics(self, conf, cluster, nodes, kavlan_nodes, vlans):
        """Get the NIC devices of the reserved cluster.

        :param nodes: List of hostnames unmodified by the vlan
        """
        provider_conf = conf['provider']
        # XXX: this only works if all nodes are on the same cluster,
        # or if nodes from different clusters have the same devices
        site = EX5.get_cluster_site(cluster)
        nics = EX5.get_resource_attributes(
            "/sites/%s/clusters/%s/nodes" % (site, cluster)
            )['items'][0]['network_adapters']

        interfaces = [nic['device'] for nic in nics
                                    if nic['mountable'] and
                                    nic['interface'] == 'Ethernet']

        network_interface = str(interfaces[0])
        external_interface = None

        if len(interfaces) > 1 and not provider_conf['single_interface']:
            external_interface = str(interfaces[1])
            _, vlan = self._get_primary_vlan(vlans)
            api.set_nodes_vlan(site,
                               map(lambda d: EX.Host(d), nodes),
                               external_interface,
                               vlan)

            self._exec_command_on_nodes(
                kavlan_nodes,
                "ifconfig %s up && dhclient -nw %s" % (
                    external_interface, external_interface),
                'mounting secondary interface')
        else:
            # TODO(msimonin) fix the network in this case as well.
            external_interface = 'veth0'
            if provider_conf['single_interface']:
                logging.warning("Forcing the use of a single network interface")
            else:
                logging.warning("%s has only one NIC. The same interface "
                                "will be used for network_interface and "
                                "neutron_external_interface."
                                % conf['resources'].keys()[0])

            self._exec_command_on_nodes(
                kavlan_nodes,
                'ip link show veth0 || ip link add type veth peer',
                'Creating a veth')

        return (network_interface, external_interface)
Code example #8
File: utils.py Project: badock/enoslib
def get_cluster_interfaces(cluster, extra_cond=lambda nic: True):
    site = ex5.get_cluster_site(cluster)
    nics = ex5.get_resource_attributes("/sites/%s/clusters/%s/nodes" %
                                       (site, cluster))
    nics = nics['items'][0]['network_adapters']
    # NOTE(msimonin): Since 05/18 nics on g5k nodes have predictable names, but
    # the api description keeps both the legacy name (device key) and the new
    # predictable name (name key). The legacy name is still used for api
    # requests to the vlan endpoint. This should be fixed in
    # https://intranet.grid5000.fr/bugzilla/show_bug.cgi?id=9272
    # When it's fixed we should be able to only use the new predictable name.
    nics = [(nic['device'], nic['name']) for nic in nics
            if nic['mountable'] and nic['interface'] == 'Ethernet'
            and not nic['management'] and extra_cond(nic)]
    nics = sorted(nics)
    return nics
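
Unlike the deploy5k variant above, this one returns (legacy_device, predictable_name) pairs rather than bare device names; a small sketch of how a caller might keep both around (the pairs are illustrative, not from a real node):

# Hypothetical result mirroring the (device, name) tuples built above.
nics = [("eth0", "eno1"), ("eth1", "eno2")]

# The legacy device name is what the vlan endpoint still expects (see the NOTE
# above), while the predictable name is what the node exposes locally.
device_to_name = dict(nics)
print(device_to_name["eth0"])   # -> eno1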
Code example #9
def get_cpu_topology(cluster, xpdir=None):
    """Determine the CPU topology of a cluster, i.e. the core ids grouped by
    NUMA cell, reading a cached XML file from xpdir when available, otherwise
    reserving a node and parsing the output of `virsh capabilities`."""
    logger.info('Determining the architecture of cluster ' + \
                style.emph(cluster))
    root = None
    # Try to read a cached topology file from the experiment directory
    if xpdir:
        fname = xpdir + '/topo_' + cluster + '.xml'
        try:
            tree = parse(fname)
            root = tree.getroot()
        except:
            logger.info('No cache file found, will reserve a node and ' + \
                        'determine topology from virsh capabilities')
            pass

    if root is None:
        frontend = get_cluster_site(cluster)
        submission = OarSubmission(resources="{cluster='" + cluster +
                                   "'}/nodes=1",
                                   walltime="0:02:00",
                                   job_type="allow_classic_ssh")
        ((job_id, _), ) = oarsub([(submission, frontend)])
        wait_oar_job_start(job_id, frontend)
        host = get_oar_job_nodes(job_id, frontend)[0]
        capa = SshProcess('virsh capabilities',
                          host,
                          connection_params={
                              'user':
                              default_frontend_connection_params['user']
                          }).run()
        oardel([(job_id, frontend)])
        root = fromstring(capa.stdout)
        if xpdir is not None:
            tree = ElementTree(root)
            tree.write(fname)

    cpu_topology = []
    i_cell = 0
    for cell in root.findall('.//cell'):
        cpu_topology.append([])
        for cpu in cell.findall('.//cpu'):
            cpu_topology[i_cell].append(int(cpu.attrib['id']))
        i_cell += 1
    logger.info(pformat(cpu_topology))
    return cpu_topology
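
The parsing loop above yields one list of core ids per NUMA cell; a hypothetical return value for a dual-socket node (the ids are illustrative, not read from a real cluster):

# Illustrative get_cpu_topology result for a node with two NUMA cells.
cpu_topology = [
    [0, 2, 4, 6],   # core ids of NUMA cell 0
    [1, 3, 5, 7],   # core ids of NUMA cell 1
]
n_cells = len(cpu_topology)             # 2
cores_per_cell = len(cpu_topology[0])   # 4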
Code example #10
File: l2c_fft.py Project: lpouillo/execo-g5k-tools
 def get_nodes(self, comb):
     """
         Perform a submission for a given comb and 
         retrieve the submission node list
     """
     logger.info('Performing submission')
     n_core = get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size']
     submission = OarSubmission(resources="nodes=%d" % (max(1, comb['cores']/n_core), ), 
                sql_properties="cluster='%s'"%comb['cluster'],
                job_type="besteffort", 
                name="l2c_fft_eval")
     self.oar_job_id, self.frontend = oarsub([(submission, get_cluster_site(comb['cluster']))])[0]
     logger.info("Waiting for job start")
     wait_oar_job_start(self.oar_job_id, self.frontend)
     logger.info("Retrieving hosts list")
     nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
     self.hosts = [host for host in nodes for i in range(n_core)]
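
The last line of get_nodes repeats every node once per core so that self.hosts ends up with one entry per MPI slot; a tiny illustration with made-up hostnames:

# Two placeholder nodes with 4 cores each yield 8 host entries (one per slot).
nodes = ["paravance-1", "paravance-2"]
n_core = 4
hosts = [host for host in nodes for i in range(n_core)]
assert len(hosts) == len(nodes) * n_core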
Code example #11
File: engine.py Project: badock/vm5k
def get_cpu_topology(cluster, xpdir=None):
    """Determine the CPU topology of a cluster, i.e. the core ids grouped by
    NUMA cell, reading a cached XML file from xpdir when available, otherwise
    reserving a node and parsing the output of `virsh capabilities`."""
    logger.info('Determining the architecture of cluster ' + \
                style.emph(cluster))
    root = None
    # Try to read a cached topology file from the experiment directory
    if xpdir:
        fname = xpdir + '/topo_' + cluster + '.xml'
        try:
            tree = parse(fname)
            root = tree.getroot()
        except:
            logger.info('No cache file found, will reserve a node and ' + \
                        'determine topology from virsh capabilities')
            pass

    if root is None:
        frontend = get_cluster_site(cluster)
        submission = OarSubmission(
            resources="{cluster='" + cluster + "'}/nodes=1",
            walltime="0:02:00",
            job_type="allow_classic_ssh")
        ((job_id, _), ) = oarsub([(submission, frontend)])
        wait_oar_job_start(job_id, frontend)
        host = get_oar_job_nodes(job_id, frontend)[0]
        capa = SshProcess('virsh capabilities', host,
            connection_params={'user': default_frontend_connection_params['user']}
            ).run()
        oardel([(job_id, frontend)])
        root = fromstring(capa.stdout)
        if xpdir is not None:
            tree = ElementTree(root)
            tree.write(fname)

    cpu_topology = []
    i_cell = 0
    for cell in root.findall('.//cell'):
        cpu_topology.append([])
        for cpu in cell.findall('.//cpu'):
            cpu_topology[i_cell].append(int(cpu.attrib['id']))
        i_cell += 1
    logger.info(pformat(cpu_topology))
    return cpu_topology
Code example #12
 def get_nodes(self, comb):
     """
         Perform a submission for a given comb and 
         retrieve the submission node list
     """
     logger.info('Performing submission')
     n_core = get_host_attributes(comb['cluster'] +
                                  '-1')['architecture']['smt_size']
     submission = OarSubmission(
         resources="nodes=%d" % (max(1, comb['cores'] / n_core), ),
         sql_properties="cluster='%s'" % comb['cluster'],
         job_type="besteffort",
         name="l2c_fft_eval")
     self.oar_job_id, self.frontend = oarsub([
         (submission, get_cluster_site(comb['cluster']))
     ])[0]
     logger.info("Waiting for job start")
     wait_oar_job_start(self.oar_job_id, self.frontend)
     logger.info("Retrieving hosts list")
     nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
     self.hosts = [host for host in nodes for i in range(n_core)]
Code example #13
 def get_cluster_nics(self, cluster):
     site = EX5.get_cluster_site(cluster)
     nics = EX5.get_resource_attributes('/sites/%s/clusters/%s/nodes' % (site, cluster))['items'][0]['network_adapters']
     return [nic['device'] for nic in nics if nic['mountable']]
Code example #14
    def run(self):
        """The main experimental workflow, as described in
        ``Using the Execo toolkit to perform ...``
        """
        self.force_options()

        # The argument is a cluster
        self.cluster = self.args[0]
        self.frontend = get_cluster_site(self.cluster)
        # Analyzing options
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.create_paramsweeper()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.get_resources()
                # Hosts deployment and configuration
                if not self.options.no_hosts_setup:
                    self.setup_hosts()
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = list(self.hosts)
                available_ip_mac = list(self.ip_mac)
                threads = {}

                # Checking that the job is running and not in Error
                while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.extend(tmp_threads[t]['hosts'])
                                available_ip_mac.extend(
                                    tmp_threads[t]['ip_mac'])
                                del threads[t]
                        sleep(5)
                        if get_oar_job_info(self.oar_job_id,
                                            self.frontend)['state'] == 'Error':
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    used_hosts = available_hosts[0:self.options.n_nodes]
                    available_hosts = available_hosts[self.options.n_nodes:]

                    n_vm = self.comb_nvm(comb)
                    used_ip_mac = available_ip_mac[0:n_vm]
                    available_ip_mac = available_ip_mac[n_vm:]

                    t = Thread(target=self.workflow,
                               args=(comb, used_hosts, used_ip_mac))
                    threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                    logger.debug('Threads: %s', len(threads))
                    t.daemon = True
                    t.start()

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
Code example #15
 def run(self):
     num_total_workers = 0
     sites_clusters_threads = {} # dict: keys = sites, values =
                                 # dict: keys = clusters, values =
                                 # list: threads
     try:
         while True:
             t = Timer()
             clusters_to_submit = set()
             for clusterspec in self.get_clusters():
                 cluster, _, site = clusterspec.partition(".")
                 if site == "":
                     site = get_cluster_site(cluster)
                 clusters_to_submit.add((cluster, site))
             for site in sites_clusters_threads.keys():
                 for cluster in sites_clusters_threads[site].keys():
                     sites_clusters_threads[site][cluster] = [
                         th
                         for th in sites_clusters_threads[site][cluster]
                         if th.is_alive() ]
                     if len(sites_clusters_threads[site][cluster]) == 0:
                         del sites_clusters_threads[site][cluster]
                 if len(sites_clusters_threads[site]) == 0:
                     del sites_clusters_threads[site]
             all_involved_sites = set(sites_clusters_threads.keys())
             all_involved_sites.update([ s for (c, s) in clusters_to_submit ])
             no_submissions = True
             for site in all_involved_sites:
                 all_involved_clusters = set()
                 if sites_clusters_threads.has_key(site):
                     all_involved_clusters.update(sites_clusters_threads[site].keys())
                 all_involved_clusters.update([ c for (c, s) in clusters_to_submit if s == site ])
                 for cluster in all_involved_clusters:
                     num_workers = 0
                     num_waiting = 0
                     if sites_clusters_threads.has_key(site) and sites_clusters_threads[site].has_key(cluster):
                         num_workers = len(sites_clusters_threads[site][cluster])
                         num_waiting = len([
                                 th
                                 for th in sites_clusters_threads[site][cluster]
                                 if th.waiting ])
                     num_max_new_workers = min(self.options.max_workers - num_workers,
                                               self.options.max_waiting - num_waiting)
                     logger.trace(
                         "rescheduling on cluster %s@%s: num_workers = %s / num_waiting = %s / num_max_new_workers = %s" %
                         (cluster, site, num_workers, num_waiting, num_max_new_workers))
                     if num_max_new_workers > 0:
                         for worker_index in range(0, num_max_new_workers):
                             jobdata = self.get_job(cluster)
                             if not jobdata:
                                 break
                             no_submissions = False
                             logger.detail(
                                 "spawning worker %i on %s@%s" % (
                                     num_total_workers,
                                     cluster, site))
                             (oarsubmission, data) = jobdata
                             th = Thread(target = self.worker_start,
                                         args = (cluster, site,
                                                 oarsubmission, data,
                                                 num_total_workers,))
                             th.waiting = True
                             th.daemon = True
                             th.oarsublock = Lock()
                             th.willterminate = False
                             th.start()
                             num_total_workers += 1
                             if not sites_clusters_threads.has_key(site):
                                 sites_clusters_threads[site] = {}
                             if not sites_clusters_threads[site].has_key(cluster):
                                 sites_clusters_threads[site][cluster] = []
                             sites_clusters_threads[site][cluster].append(th)
             if no_submissions and len(sites_clusters_threads) == 0:
                 break
             sleep(self.options.schedule_delay)
         logger.detail("no more combinations to explore. exit schedule loop")
     finally:
         for site in sites_clusters_threads.keys():
             for cluster in sites_clusters_threads[site].keys():
                 for th in sites_clusters_threads[site][cluster]:
                     with th.oarsublock:
                         th.willterminate = True
                         if th.jobid:
                             logger.detail("cleaning: delete job %i of worker #%i on %s" % (
                                     th.jobid, th.worker_index, site))
                             oardel([(th.jobid, site)])
                             th.jobid = None
Code example #16
File: utils.py Project: jrbalderrama/enoslib
def get_cluster_site(cluster):
    return ex5.get_cluster_site(cluster)
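
get_cluster_site simply maps a Grid'5000 cluster name to the site that hosts it; a minimal usage sketch, where the "paravance" -> "rennes" mapping is an assumption about the current Grid'5000 topology:

# Minimal usage sketch of execo_g5k's get_cluster_site.
import execo_g5k as ex5

site = ex5.get_cluster_site("paravance")
print(site)   # expected: "rennes"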
Code example #17
File: g5k.py Project: asimonet/enos
 def _get_cluster_nics(self, cluster):
     site = EX5.get_cluster_site(cluster)
     nics = EX5.get_resource_attributes(
         '/sites/%s/clusters/%s/nodes' %
         (site, cluster))['items'][0]['network_adapters']
     return [nic['device'] for nic in nics if nic['mountable']]
Code example #18
    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        self.cluster = self.args[0]
        self.site = get_cluster_site(self.cluster)
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
                # Hosts deployment
                deployed, undeployed = deploy(
                    Deployment(self.hosts,
                               env_file="/home/mliroz/deploys/hadoop6.env"))
                logger.info("%i deployed, %i undeployed" %
                            (len(deployed), len(undeployed)))
                if len(deployed) == 0:
                    break
                # System configuration => look at the execo_g5k.topology module
                attr = get_host_attributes(self.cluster + '-1')

                ## SETUP FINISHED

                # Getting the next combination
                comb = self.sweeper.get_next()
                self.prepare_dataset(comb)
                self.xp(comb)
                # subloop over the combinations that have the same sizes
                while True:
                    newcomb = self.sweeper.get_next(lambda r: filter(
                        lambda subcomb: subcomb['sizes'] == comb['sizes'], r))
                    if newcomb:
                        try:
                            self.xp(newcomb)
                        except:
                            break
                    else:
                        break

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
Code example #19
File: engine.py Project: badock/vm5k
    def run(self):
        """The main experimental workflow, as described in
        ``Using the Execo toolkit to perform ...``
        """
        self.force_options()

        # The argument is a cluster
        self.cluster = self.args[0]
        self.frontend = get_cluster_site(self.cluster)
        # Analyzing options
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.create_paramsweeper()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.get_resources()
                # Hosts deployment and configuration
                if not self.options.no_hosts_setup:
                    self.setup_hosts()
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = list(self.hosts)
                available_ip_mac = list(self.ip_mac)
                threads = {}

                # Checking that the job is running and not in Error
                while self.is_job_alive()['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                # while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \
                #     or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.extend(tmp_threads[t]['hosts'])
                                available_ip_mac.extend(tmp_threads[t]['ip_mac'])
                                del threads[t]
                        sleep(5)
                        if self.is_job_alive()['state'] == 'Error':
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    used_hosts = available_hosts[0:self.options.n_nodes]
                    available_hosts = available_hosts[self.options.n_nodes:]

                    n_vm = self.comb_nvm(comb)
                    used_ip_mac = available_ip_mac[0:n_vm]
                    available_ip_mac = available_ip_mac[n_vm:]

                    t = Thread(target=self.workflow,
                               args=(comb, used_hosts, used_ip_mac))
                    threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                    logger.debug('Threads: %s', len(threads))
                    t.daemon = True
                    t.start()

                # if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                if self.is_job_alive()['state'] == 'Error':
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
Code example #20
File: fp_hadoop.py Project: lpouillo/execo-g5k-tools
    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        self.cluster = self.args[0]
        self.site = get_cluster_site(self.cluster)
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
                # Hosts deployment
                deployed, undeployed = deploy(Deployment(self.hosts, 
                    env_file="/home/mliroz/deploys/hadoop6.env"))
                logger.info("%i deployed, %i undeployed" % (len(deployed), 
                                                            len(undeployed)))
                if len(deployed) == 0:
                    break
                # System configuration => look at the execo_g5k.topology module
                attr = get_host_attributes(self.cluster + '-1')
                
                ## SETUP FINISHED
                
                # Getting the next combination
                comb = self.sweeper.get_next()
                self.prepare_dataset(comb)
                self.xp(comb)
                # subloop over the combinations that have the same sizes
                while True:
                    newcomb = self.sweeper.get_next(lambda r:
                            filter(lambda subcomb: subcomb['sizes'] == comb['sizes'], r))
                    if newcomb:
                        try:
                            self.xp(newcomb)
                        except:
                            break
                    else:
                        break

                if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging') 
Code example #21
 def run(self):
     num_total_workers = 0
     sites_clusters_threads = {}  # dict: keys = sites, values =
     # dict: keys = clusters, values =
     # list: threads
     try:
         while True:
             t = Timer()
             clusters_to_submit = set()
             for clusterspec in self.get_clusters():
                 cluster, _, site = clusterspec.partition(".")
                 if site == "":
                     site = get_cluster_site(cluster)
                 clusters_to_submit.add((cluster, site))
             for site in sites_clusters_threads.keys():
                 for cluster in sites_clusters_threads[site].keys():
                     sites_clusters_threads[site][cluster] = [
                         th for th in sites_clusters_threads[site][cluster]
                         if th.is_alive()
                     ]
                     if len(sites_clusters_threads[site][cluster]) == 0:
                         del sites_clusters_threads[site][cluster]
                 if len(sites_clusters_threads[site]) == 0:
                     del sites_clusters_threads[site]
             all_involved_sites = set(sites_clusters_threads.keys())
             all_involved_sites.update([s for (c, s) in clusters_to_submit])
             no_submissions = True
             for site in all_involved_sites:
                 all_involved_clusters = set()
                 if sites_clusters_threads.has_key(site):
                     all_involved_clusters.update(
                         sites_clusters_threads[site].keys())
                 all_involved_clusters.update(
                     [c for (c, s) in clusters_to_submit if s == site])
                 for cluster in all_involved_clusters:
                     num_workers = 0
                     num_waiting = 0
                     if sites_clusters_threads.has_key(
                             site) and sites_clusters_threads[site].has_key(
                                 cluster):
                         num_workers = len(
                             sites_clusters_threads[site][cluster])
                         num_waiting = len([
                             th
                             for th in sites_clusters_threads[site][cluster]
                             if th.waiting
                         ])
                     num_max_new_workers = min(
                         self.options.max_workers - num_workers,
                         self.options.max_waiting - num_waiting)
                     logger.trace(
                         "rescheduling on cluster %s@%s: num_workers = %s / num_waiting = %s / num_max_new_workers = %s"
                         % (cluster, site, num_workers, num_waiting,
                            num_max_new_workers))
                     if num_max_new_workers > 0:
                         for worker_index in range(0, num_max_new_workers):
                             jobdata = self.get_job(cluster)
                             if not jobdata:
                                 break
                             no_submissions = False
                             logger.detail(
                                 "spawning worker %i on %s@%s" %
                                 (num_total_workers, cluster, site))
                             (oarsubmission, data) = jobdata
                             th = Thread(target=self.worker_start,
                                         args=(
                                             cluster,
                                             site,
                                             oarsubmission,
                                             data,
                                             num_total_workers,
                                         ))
                             th.waiting = True
                             th.daemon = True
                             th.oarsublock = Lock()
                             th.willterminate = False
                             th.start()
                             num_total_workers += 1
                             if not sites_clusters_threads.has_key(site):
                                 sites_clusters_threads[site] = {}
                             if not sites_clusters_threads[site].has_key(
                                     cluster):
                                 sites_clusters_threads[site][cluster] = []
                             sites_clusters_threads[site][cluster].append(
                                 th)
             if no_submissions and len(sites_clusters_threads) == 0:
                 break
             sleep(self.options.schedule_delay)
         logger.detail(
             "no more combinations to explore. exit schedule loop")
     finally:
         for site in sites_clusters_threads.keys():
             for cluster in sites_clusters_threads[site].keys():
                 for th in sites_clusters_threads[site][cluster]:
                     with th.oarsublock:
                         th.willterminate = True
                         if th.jobid:
                             logger.detail(
                                 "cleaning: delete job %i of worker #%i on %s"
                                 % (th.jobid, th.worker_index, site))
                             oardel([(th.jobid, site)])
                             th.jobid = None