Example #1
    def _enable_bridge(self, name='br0'):
        """We need a bridge to have automatic DHCP configuration for the VM."""
        logger.detail('Configuring the bridge')
        hosts_br = self._get_bridge(self.hosts)
        nobr_hosts = []
        for host, br in hosts_br.iteritems():
            if br is None:
                logger.debug('No bridge on host %s', style.host(host))
                nobr_hosts.append(host)
            elif br != name:
                logger.debug('Wrong bridge on host %s, destroying it',
                             style.host(host))
                SshProcess('ip link set ' + br + ' down ; brctl delbr ' + br,
                           host).run()
                nobr_hosts.append(host)
            else:
                logger.debug('Bridge %s is present on host %s',
                             style.emph(name), style.host(host))

        nobr_hosts = map(lambda x: x.address
                         if isinstance(x, Host) else x, nobr_hosts)

        if len(nobr_hosts) > 0:
            logger.debug('Creating bridge on %s', hosts_list(nobr_hosts))
            script = 'export br_if=`ip route |grep default |cut -f 5 -d " "`; \n' + \
                     'ifdown $br_if ; \n' + \
                     'sed -i "s/$br_if inet dhcp/$br_if inet manual/g" /etc/network/interfaces ; \n' + \
                     'sed -i "s/auto $br_if//g" /etc/network/interfaces ; \n' + \
                     'echo " " >> /etc/network/interfaces ; \n' + \
                     'echo "auto ' + name + '" >> /etc/network/interfaces ; \n' + \
                     'echo "iface ' + name + ' inet dhcp" >> /etc/network/interfaces ; \n' + \
                     'echo "  bridge_ports $br_if" >> /etc/network/interfaces ; \n' + \
                     'echo "  bridge_stp off" >> /etc/network/interfaces ; \n' + \
                     'echo "  bridge_maxwait 0" >> /etc/network/interfaces ; \n' + \
                     'echo "  bridge_fd 0" >> /etc/network/interfaces ; \n' + \
                     'ifup ' + name
            fd, br_script = mkstemp(dir='/tmp/', prefix='create_br_')
            f = fdopen(fd, 'w')
            f.write(script)
            f.close()

            self.fact.get_fileput(nobr_hosts, [br_script]).run()
            self.fact.get_remote('nohup sh ' + br_script.split('/')[-1],
                                 nobr_hosts).run()

            logger.debug('Waiting for network restart')
            if_up = False
            nmap_tries = 0
            while (not if_up) and nmap_tries < 20:
                sleep(20)
                nmap_tries += 1
                nmap = Process('nmap ' + ' '.join(nobr_hosts) +
                               ' -p 22').run()
                for line in nmap.stdout.split('\n'):
                    if 'Nmap done' in line:
                        if_up = line.split()[2] == line.split()[5].replace(
                            '(', '')
            logger.debug('Network has been restarted')
        logger.detail('All hosts have the bridge %s', style.emph(name))
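For reference, the script above moves DHCP from the physical interface to the bridge. On a host whose default route goes through eth0 (a hypothetical interface name), and with the default name='br0', the stanza appended to /etc/network/interfaces would read:

auto br0
iface br0 inet dhcp
  bridge_ports eth0
  bridge_stp off
  bridge_maxwait 0
  bridge_fd 0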
Example #3
 def default(self, line):
     global interrupted, workers, cores
     interrupted = False
     print 'interrupting previous command'
     workers.kill()
     execo.sleep(1)
     print 'sending command: ' + line
     workers = execo.Remote(line, cores).start()
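default() is the hook that Python's cmd.Cmd calls for any input line that does not match a do_* command, so every raw line typed into the shell becomes a command broadcast to the workers. A minimal sketch of the surrounding class, assuming that idiom (the class name and prompt are hypothetical):

import cmd

class WorkerShell(cmd.Cmd):        # hypothetical class name
    prompt = 'workers> '

    def default(self, line):
        # called for any input that is not a do_* command; e.g. kill the
        # previous execo.Remote and restart it with the new command line
        pass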
Example #4
    def workflow(self, comb):
        self.create_par_file(comb)
        job_id = self.submit_job(comb)
        logger.info('Combination %s will be treated by job %s',
                    slugify(comb), str(job_id))

        while self.is_job_running(job_id):
            sleep(10)

        self.sweeper.done(comb)
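This workflow assumes a persistent sweeper such as execo_engine's ParamSweeper (also used by the run() methods further down), which hands out parameter combinations and records which ones are done. A minimal sketch of that setup, with hypothetical parameter names:

from execo_engine import ParamSweeper, sweep

parameters = {'n_nodes': [1, 2, 4], 'blas': ['atlas', 'mkl']}  # hypothetical
sweeper = ParamSweeper('sweeps', sweeps=sweep(parameters))
comb = sweeper.get_next()   # check out one combination
# ... run the experiment for comb ...
sweeper.done(comb)          # persisted on disk, so interrupted runs can resume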
Example #6
File: utils.py Project: badock/vm5k
def wait_hosts_up(hosts, timeout=300):
    """ """
    down_hosts = map(lambda x: x.address if isinstance(x, Host) else x,
                     hosts)
    fd, hosts_file = mkstemp(dir='/tmp/', prefix='hosts_')
    f = fdopen(fd, 'w')
    f.write('\n' + '\n'.join(down_hosts))
    f.close()
    timer = Timer()
    while len(down_hosts) > 0 and timer.elapsed() < timeout:
        nmap = Process("nmap -v -oG - -i %s -p 22 |grep Host|grep Status" %
                       (hosts_file, ), shell=True).run()
        logger.debug('timer: %s \nnmap output: \n%s', timer.elapsed(),
                     nmap.stdout.strip())
        for line in nmap.stdout.strip().split('\n'):
            s = line.split()[2]
            host = s[s.find("(") + 1:s.find(")")]
            if host in down_hosts:
                logger.detail('%s is up', host)
                down_hosts.remove(host)
    Process('rm ' + hosts_file).run()
    sleep(3)
    return len(down_hosts) == 0
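A minimal usage sketch for wait_hosts_up, with hypothetical host names:

hosts = ['node-1.example.org', 'node-2.example.org']
if not wait_hosts_up(hosts, timeout=600):
    logger.error('Some hosts are still down after the timeout')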
Example #7
    def run(self):
        """Reserve hosts, configure them, and run parameter combinations
        in worker threads until the sweep is exhausted."""
        if self.options.oargrid_job_id is not None:
            self.oar_job_id = self.options.oargrid_job_id
        else:
            self.oar_job_id = None

        self.list_of_clusters = [
            'parasilo', 'paravance', 'parapluie', 'paranoia'
        ]

        try:
            # Creation of the main iterator which is used for the first control loop.
            self.define_parameters()
            self.working_dir = '/data/jorouzaudcornabas_' + str(
                self.options.storage5k_job_id)

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.submit_all_available_best_effort(
                        self.list_of_clusters, self.options.walltime)
                    # self.make_reservation_local()
                # Wait for the job to start
                logger.info('Waiting for job %s to start', self.oar_job_id)
                wait_oar_job_start(self.oar_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id)
                # Hosts deployment and configuration
                default_connection_params['user'] = '******'

                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')
                #===============================================================
                # deployment = Deployment(hosts = self.hosts,
                #             env_file='/home/sirimie/env/mywheezy-x64-base.env')
                # self.hosts, _ = deploy(deployment)
                #===============================================================
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = self.hosts

                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/logs'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                logger.info("Starting the thread " + str(self.is_job_alive()) +
                            " " + str(len(threads.keys())))
                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False

                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]
                    logger.info("Launching thread")
                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([self.oar_job_id])
                else:
                    logger.info('Keeping job alive for debugging')
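The run() methods in this and the following examples share one scheduling idiom: a pool of free hosts plus a dict mapping worker threads to the resources they hold, with finished threads returning their host to the pool. Distilled to its core (all names illustrative):

from threading import Thread
from time import sleep

available_hosts = ['host-1', 'host-2']        # hypothetical free-host pool
threads = {}                                  # Thread -> {'host': ...}

def workflow(comb, host):
    pass                                      # placeholder experiment body

for comb in [{'param': 1}, {'param': 2}, {'param': 3}]:
    while not available_hosts:                # wait until a host is freed
        for t in list(threads):
            if not t.is_alive():
                available_hosts.append(threads.pop(t)['host'])
        sleep(5)
    host = available_hosts.pop(0)
    t = Thread(target=workflow, args=(comb, host))
    t.daemon = True
    threads[t] = {'host': host}
    t.start()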
Example #8
    def run(self):
        """ """
        if self.options.oargrid_job_id:
            self.oargrid_job_id = self.options.oargrid_job_id
        else:
            self.oargrid_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oargrid_job_id is None:
                    self.make_reservation()
                # Wait for the job to start
                logger.info('Waiting for the job to start')
                wait_oargrid_job_start(self.oargrid_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
                # Hosts deployment and configuration

                default_connection_params['user'] = '******'

                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')
                deployment = Deployment(
                    hosts=self.hosts,
                    env_file='/home/sirimie/env/mywheezy-x64-base.env')
                self.hosts, _ = deploy(deployment)

                Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*",
                       self.hosts).run()
                Remote(
                    "rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml",
                    self.hosts).run()
                Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml",
                       self.hosts).run()

                Put(self.hosts, [
                    "run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                    "platform_aws.xml", "cloud_ec2.xml"
                ],
                    remote_location="/home/Work/sgcbntier/paasage_demo/").run(
                    )
                logger.info("Done")

                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = [
                    host for host in self.hosts for i in range(
                        get_host_attributes(host)['architecture']['smt_size'])
                ]

                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/csv_results'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]

                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    self.oargrid_job_id = None

        finally:
            if self.oargrid_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oargriddel([self.oargrid_job_id])
                else:
                    logger.info('Keeping job alive for debugging')
Example #9
 def run(self):
     num_total_workers = 0
     sites_clusters_threads = {}  # dict: keys = sites, values =
     # dict: keys = clusters, values =
     # list: threads
     try:
         while True:
             t = Timer()
             clusters_to_submit = set()
             for clusterspec in self.get_clusters():
                 cluster, _, site = clusterspec.partition(".")
                 if site == "":
                     site = get_cluster_site(cluster)
                 clusters_to_submit.add((cluster, site))
             for site in sites_clusters_threads.keys():
                 for cluster in sites_clusters_threads[site].keys():
                     sites_clusters_threads[site][cluster] = [
                         th for th in sites_clusters_threads[site][cluster]
                         if th.is_alive()
                     ]
                     if len(sites_clusters_threads[site][cluster]) == 0:
                         del sites_clusters_threads[site][cluster]
                 if len(sites_clusters_threads[site]) == 0:
                     del sites_clusters_threads[site]
             all_involved_sites = set(sites_clusters_threads.keys())
             all_involved_sites.update([s for (c, s) in clusters_to_submit])
             no_submissions = True
             for site in all_involved_sites:
                 all_involved_clusters = set()
                 if sites_clusters_threads.has_key(site):
                     all_involved_clusters.update(
                         sites_clusters_threads[site].keys())
                 all_involved_clusters.update(
                     [c for (c, s) in clusters_to_submit if s == site])
                 for cluster in all_involved_clusters:
                     num_workers = 0
                     num_waiting = 0
                     if sites_clusters_threads.has_key(
                             site) and sites_clusters_threads[site].has_key(
                                 cluster):
                         num_workers = len(
                             sites_clusters_threads[site][cluster])
                         num_waiting = len([
                             th
                             for th in sites_clusters_threads[site][cluster]
                             if th.waiting
                         ])
                     num_max_new_workers = min(
                         self.options.max_workers - num_workers,
                         self.options.max_waiting - num_waiting)
                     logger.trace(
                         "rescheduling on cluster %s@%s: num_workers = %s / num_waiting = %s / num_max_new_workers = %s"
                         % (cluster, site, num_workers, num_waiting,
                            num_max_new_workers))
                     if num_max_new_workers > 0:
                         for worker_index in range(0, num_max_new_workers):
                             jobdata = self.get_job(cluster)
                             if not jobdata:
                                 break
                             no_submissions = False
                             logger.detail(
                                 "spawning worker %i on %s@%s" %
                                 (num_total_workers, cluster, site))
                             (oarsubmission, data) = jobdata
                             th = Thread(target=self.worker_start,
                                         args=(
                                             cluster,
                                             site,
                                             oarsubmission,
                                             data,
                                             num_total_workers,
                                         ))
                             th.waiting = True
                             th.daemon = True
                             th.oarsublock = Lock()
                             th.willterminate = False
                             th.start()
                             num_total_workers += 1
                             if not sites_clusters_threads.has_key(site):
                                 sites_clusters_threads[site] = {}
                             if not sites_clusters_threads[site].has_key(
                                     cluster):
                                 sites_clusters_threads[site][cluster] = []
                             sites_clusters_threads[site][cluster].append(
                                 th)
             if no_submissions and len(sites_clusters_threads) == 0:
                 break
             sleep(self.options.schedule_delay)
         logger.detail(
             "no more combinations to explore. exit schedule loop")
     finally:
         for site in sites_clusters_threads.keys():
             for cluster in sites_clusters_threads[site].keys():
                 for th in sites_clusters_threads[site][cluster]:
                     with th.oarsublock:
                         th.willterminate = True
                         if th.jobid:
                             logger.detail(
                                 "cleaning: delete job %i of worker #%i on %s"
                                 % (th.jobid, th.worker_index, site))
                             oardel([(th.jobid, site)])
                             th.jobid = None
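Note the per-worker state: the scheduler attaches attributes directly to each Thread object (waiting, oarsublock, willterminate, jobid), which works because Thread instances accept arbitrary attributes. A minimal sketch of that idiom, with comments reflecting how the scheduler above uses them:

from threading import Thread, Lock

th = Thread(target=lambda: None)
th.waiting = True            # worker is still waiting for its job to start
th.oarsublock = Lock()       # serializes oarsub/oardel for this worker
th.willterminate = False     # set during cleanup so the worker stops resubmitting
th.jobid = None              # OAR job id, filled in once submitted
th.start()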
Example #10
    def run(self):
        """The main experimental workflow, as described in
        ``Using the Execo toolkit to perform ...``
        """
        self.force_options()

        # The argument is a cluster
        self.cluster = self.args[0]
        self.frontend = get_cluster_site(self.cluster)
        # Analyzing options
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.create_paramsweeper()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.get_resources()
                # Hosts deployment and configuration
                if not self.options.no_hosts_setup:
                    self.setup_hosts()
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = list(self.hosts)
                available_ip_mac = list(self.ip_mac)
                threads = {}

                # Checking that the job is running and not in Error
                while self.is_job_alive()['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                # while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \
                #     or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.extend(tmp_threads[t]['hosts'])
                                available_ip_mac.extend(tmp_threads[t]['ip_mac'])
                                del threads[t]
                        sleep(5)
                        if self.is_job_alive()['state'] == 'Error':
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    used_hosts = available_hosts[0:self.options.n_nodes]
                    available_hosts = available_hosts[self.options.n_nodes:]

                    n_vm = self.comb_nvm(comb)
                    used_ip_mac = available_ip_mac[0:n_vm]
                    available_ip_mac = available_ip_mac[n_vm:]

                    t = Thread(target=self.workflow,
                               args=(comb, used_hosts, used_ip_mac))
                    threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                    logger.debug('Threads: %s', len(threads))
                    t.daemon = True
                    t.start()

                # if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                if self.is_job_alive()['state'] == 'Error':
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
Example #11
def check_hosts_up(hosts,
                   timeout=None,
                   connection_params=None,
                   polling_interval=5):
    """Check that a list of host are joinable with ssh.

    Checks that all hosts of the list are joinable with ssh. Retry
    continuously to connect to them every <polling_interval> seconds,
    until either all are reachable or the timeout is reached. Returns
    the list of hosts which are joinable.

    :param hosts: list of hosts

    :param timeout: timeout of the checks. No timeout if None.

    :param connection_params: to connect to the hosts. Note that the
      ssh_options entry of the connection_params is overwritten by this
      function

    :param polling_interval: tries to connect each <polling_interval>
      seconds.

    :returns: list of reachable hosts
    """

    start_ts = time.time()
    if timeout is not None:
        completion_ts = start_ts + timeout
    remaining_hosts = set(hosts)
    if connection_params is not None:
        real_connection_params = connection_params
    else:
        real_connection_params = {}
    while len(remaining_hosts) > 0 and (timeout is None
                                        or time.time() <= completion_ts):
        #print('remaining_hosts=%s' % (remaining_hosts,))
        if timeout is not None:
            next_poll_ts = min(time.time() + polling_interval, completion_ts)
        else:
            next_poll_ts = time.time() + polling_interval
        poll_timeout = max(0, next_poll_ts - time.time())
        real_connection_params.update({
            'ssh_options':
            ('-tt', '-o', 'BatchMode=yes', '-o', 'PasswordAuthentication=no',
             '-o', 'StrictHostKeyChecking=no', '-o',
             'UserKnownHostsFile=/dev/null', '-o',
             'ConnectTimeout=%s' % (int(poll_timeout), ))
        })
        check = execo.Remote('true',
                             remaining_hosts,
                             connection_params=real_connection_params,
                             process_args={
                                 'timeout': poll_timeout,
                                 'nolog_exit_code': True,
                                 'nolog_timeout': True
                             }).run()
        hosts_up = [p.host for p in check.processes if p.finished_ok]
        #print('hosts_up=%s' %(hosts_up,))
        remaining_hosts = remaining_hosts.difference(hosts_up)
        if len(remaining_hosts) > 0:
            execo.sleep(max(0, next_poll_ts - time.time()))
    return list(set(hosts).difference(remaining_hosts))
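A minimal usage sketch for check_hosts_up, with hypothetical host names:

hosts = ['node-1.example.org', 'node-2.example.org']
up = check_hosts_up(hosts, timeout=120, polling_interval=10)
if len(up) < len(hosts):
    print('still unreachable: %s' % (set(hosts) - set(up),))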
Example #12
    def run(self):
        rtt_file = self.result_dir + "/rtt.csv"
        resolver = None
        client = 'tcpclient' if self.args.mode == 'tcp' else 'udpclient'
        try:
            logger.debug("Experiment ID: {}".format(self.exp_id))
            if self.multi_site():
                logger.info("Running in multi-site mode")
            if not self.multi_site():
                self.reserve_resources_singlejob()
                logger.debug("Waiting for OAR job to start...")
                g5k.wait_oar_job_start(*self.vmhosts_job)
                self.prepare_subnet()
                logger.debug("Prepared subnet")
            # Dependencies (besides the obvious ones):
            # - deploy_server depends on prepare_global_vlan
            # - prepare_server depends on deploy_server
            # - prepare_server depends on prepare_subnet
            # - prepare_vm depends on deploy_server
            if self.multi_site():
                self.reserve_global_vlan()
                logger.debug("Waiting for global VLAN job to start...")
                g5k.wait_oar_job_start(*self.globalvlan_job)
                self.prepare_global_vlan()
            self.log_experimental_conditions()
            logger.debug("Deploying VM hosts...")
            machines_deploy_process = self.start_deploy_vmhosts()
            logger.debug("Deploying server image...")
            server_deploy_process = self.start_deploy_server()
            machines_deploy_process.wait()
            logger.debug("Finishing deploying VM hosts...")
            self.finish_deploy_vmhosts(machines_deploy_process)
            logger.debug("Setting up VM hosts...")
            machines_setup_process = self.prepare_vmhosts()
            machines_setup_process.wait()
            logger.debug("VM hosts are setup.")
            server_deploy_process.wait()
            logger.debug("Finishing deploying server...")
            self.finish_deploy_server(server_deploy_process)
            logger.debug("Server is deployed.")
            self.vm_process = self.start_all_vm()
            # Ensure VM are killed when we exit
            with self.vm_process:
                server_setup_process = self.prepare_server()
                self.wait_until_vm_ready()
                vm_setup_process = self.prepare_vm()
                server_setup_process.wait()
                self.log_output(server_setup_process, "server_setup_process")
                if not server_setup_process.ok:
                    logger.error(
                        "Error while preparing server, please check logs for 'server_setup_process'"
                    )
                    raise Exception
                logger.debug("Prepared server: {}".format(self.server.address))
                vm_setup_process.wait()
                self.log_output(vm_setup_process, "vm_setup_process")
                if not vm_setup_process.ok:
                    logger.error(
                        "Error while preparing VMs, please check logs for 'vm_setup_process'"
                    )
                    raise Exception
                logger.debug("Prepared VM")
                logger.info("Started {} VMs.".format(len(self.vm)))
                cpunetlog_vms = self.start_cpunetlog(self.vm)
                cpunetlog_server = self.start_cpunetlog(
                    [self.server], self.server_conn_params)
                resolver = self.start_dns_server()
                logger.info("Started resolver ({}) on {}.".format(
                    self.resolver_name, self.server.address))
                # Leave time for resolver to start
                if self.args.resolver_slots_per_thread < 1000000:
                    execo.sleep(15)
                else:
                    execo.sleep(60)
                logger.info("Starting {} on all VMs...".format(client))
                clients = self.start_client_vm()
                clients.wait()
                logger.info("{} finished!".format(client))
                logger.info("Writing cpunetlog output to disk.")
                cpunetlog_server.kill().wait()
                cpunetlog_vms.kill().wait()
                self.log_output(cpunetlog_server, "cpunetlog_server")
                self.log_output(cpunetlog_vms, "cpunetlog_vms")
                logger.info("writing {} results to disk.".format(client))
                self.log_output(clients, "clients", log_stdout=False)
                with open(rtt_file, 'w') as rtt_output:
                    need_header = True
                    rtt = csv.writer(rtt_output)
                    for client_id, client in enumerate(clients.processes):
                        first_line = True
                        for line in client.stdout.splitlines():
                            # Skip anything that does not look like CSV
                            if ',' not in line:
                                continue
                            if need_header:
                                # Take CSV header from first client and add a column
                                data = line.split(",")
                                data.insert(0, "vm_id")
                                rtt.writerow(data)
                                need_header = False
                                first_line = False
                            elif first_line:
                                # Skip first line of subsequent clients
                                first_line = False
                            else:
                                # Add column with VM ID
                                data = line.split(",")
                                data.insert(0, client_id)
                                rtt.writerow(data)

        except Exception as e:
            logger.error("Exception raised: {}\n{}".format(e, format_exc()))
        finally:
            #self.kill_all_vm()
            if self.vm_process:
                self.vm_process.kill()
            if resolver:
                resolver.kill()
                logger.debug("Waiting for resolver to exit")
                resolver.wait()
                self.log_output(resolver, "resolver")
            if self.vm_process:
                logger.debug("Waiting for VM to exit")
                self.vm_process.wait()
                logger.info("Resolver and all VMs are shut down")
                self.log_output(self.vm_process, "vm_process")
                print(execo.Report([self.vm_process]).to_string())
            #for s in self.vm_process.processes:
            #    print("\n%s\nstdout:\n%s\nstderr:\n%s\n" % (s, s.stdout, s.stderr))
            g5k.oardel([self.vmhosts_job])
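The CSV-merging loop above uses a simple idiom: take the header from the first source only, and prepend an identifier column to every data row. Distilled (the column names and sample strings are illustrative, not real measurements):

import csv

# hypothetical per-VM client outputs, each with its own header line
sources = {0: 'seq,rtt\n1,102\n2,99', 1: 'seq,rtt\n1,87'}
with open('merged.csv', 'w') as out:
    w = csv.writer(out)
    need_header = True
    for vm_id, text in sources.items():
        for i, line in enumerate(text.splitlines()):
            if need_header:
                w.writerow(['vm_id'] + line.split(','))  # header once, tagged
                need_header = False
            elif i > 0:
                w.writerow([vm_id] + line.split(','))    # skip later headers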
Example #14
    def run(self):
        """The main experimental workflow, as described in
        ``Using the Execo toolkit to perform ...``
        """
        self.force_options()

        # The argument is a cluster
        self.cluster = self.args[0]
        self.frontend = get_cluster_site(self.cluster)
        # Analyzing options
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.create_paramsweeper()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.get_resources()
                # Hosts deployment and configuration
                if not self.options.no_hosts_setup:
                    self.setup_hosts()
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = list(self.hosts)
                available_ip_mac = list(self.ip_mac)
                threads = {}

                # Checking that the job is running and not in Error
                while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.extend(tmp_threads[t]['hosts'])
                                available_ip_mac.extend(
                                    tmp_threads[t]['ip_mac'])
                                del threads[t]
                        sleep(5)
                        if get_oar_job_info(self.oar_job_id,
                                            self.frontend)['state'] == 'Error':
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    used_hosts = available_hosts[0:self.options.n_nodes]
                    available_hosts = available_hosts[self.options.n_nodes:]

                    n_vm = self.comb_nvm(comb)
                    used_ip_mac = available_ip_mac[0:n_vm]
                    available_ip_mac = available_ip_mac[n_vm:]

                    t = Thread(target=self.workflow,
                               args=(comb, used_hosts, used_ip_mac))
                    threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                    logger.debug('Threads: %s', len(threads))
                    t.daemon = True
                    t.start()

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
Example #15
worker_cmd = 'node pando.js/test/volunteer.js %s'
params = execo_g5k.default_oarsh_oarcp_params

if jobid:
    try:
        print 'Waiting for job to start'
        execo_g5k.wait_oar_job_start(jobid, site)
        print 'Retrieving nodes'
        nodes = execo_g5k.get_oar_job_nodes(jobid, site)
        # Open one connection per core (there are 8 cores per node in grenoble)
        cores = nodes * 8
        if len(cores) >= 2:
            print 'Starting server'
            server = execo.TaktukRemote(server_cmd, cores[0])
            with server.start():
                execo.sleep(0.5)
                (h, i, m) = server.expect(r'^(\/ip4\/172.*)')[0]
                multiaddr = m.group()
                print 'Starting workers with cmd: ' + worker_cmd % (multiaddr)
                workers = execo.TaktukRemote(worker_cmd % (multiaddr),
                                             cores[1:]).start()
                workers.expect('Node ready')
                print 'Workers ready'
                start_time = time.time()
                print 'Started processing'
                server.expect('done')
                stop_time = time.time()
                print 'Processing done in %fs' % (stop_time - start_time)
                print execo.Report([server, workers]).to_string()
                for index, p in enumerate(server.processes):
                    with open('server-out.log', 'w') as f:
                        f.write(p.stdout)