Example #1
def wait_hosts_down(hosts, timeout=300):
    """ """
    timer = Timer()
    up_hosts = [x.address if isinstance(x, Host) else x for x in hosts]
    fd, hosts_file = mkstemp(dir='/tmp/', prefix='hosts_')
    with fdopen(fd, 'w') as f:
        f.write('\n' + '\n'.join(up_hosts))

    while len(up_hosts) > 0 and timer.elapsed() < timeout:
        nmap = Process("nmap -v -oG - -i %s -p 22 |grep Host|grep Status" %
                       (hosts_file, ),
                       shell=True).run()
        logger.debug('timer: %s \nnmap output: \n%s', timer.elapsed(),
                     nmap.stdout.strip())
        for line in nmap.stdout.strip().split('\n'):
            if 'Down' in line:
                ip = line.split()[1]
                get_host = Process('host ' + ip + '| cut -f 5 -d " "',
                                   shell=True).run()
                host = get_host.stdout.strip()[0:-1]
                if host in up_hosts:
                    logger.detail(host + ' is down')
                    up_hosts.remove(host)
    Process('rm ' + hosts_file).run()

    return len(up_hosts) == 0
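A minimal usage sketch (hypothetical node names; the helper accepts execo.Host objects or plain addresses):

    nodes = [Host('node-1.example.grid5000.fr'), Host('node-2.example.grid5000.fr')]
    if wait_hosts_down(nodes, timeout=600):
        logger.info('All nodes are down')
    else:
        logger.warning('Some nodes are still up after the timeout')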
Example #2
    def run(self):
        # Go to the result folder before everything
        os.chdir(self.result_dir)

        # jobs = [(_jobID, _site)]
        # Get nodes
        # nodes = get_oar_job_nodes(_jobID, _site)

        try:
            # logger.info("Creating hostfiles for all combinations...")
            # for nbr_node in _nbrNodes:
            #     hostfile_filename = self.result_dir + '/' + 'hostfile-' + nbr_node
            #     with open(hostfile_filename, 'w') as hostfile:
            #         for node in nodes[:int(nbr_node)]:
            #             print>>hostfile, node.address

            spack_command = 'spack install -v chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt'
            # spack_process = Remote(spack_command, nodes)
            logger.info("Starting StarPU installation...")
            spack_process = Process(spack_command).start()            

            spack_process.wait()
            logger.info("StarPU installation DONE...")
            if not spack_process.ok:
                logger.error("Error: " + spack_process.error_reason)
            else:
                logger.info("spack stdout: {}".format(spack_process.stdout))
            spack_process.kill()

            # Steering
        except:
            traceback.print_exc()
        finally:
	        logger.info("Fin...")
Example #3
def _munin_server(server, clients):
    """Install the monitoring service munin. Must be executed inside Grid'5000
    to be able to resolve the server and clients IP.

    :param server: an execo.Host

    :param clients: a list of execo.Hosts

    """
    logger.info(
        'Munin monitoring service installation, server = %s, clients = \n %s',
        server.address, [host.address for host in clients])

    logger.debug('Configuring munin server %s', style.host(server.address))
    cmd = 'export DEBIAN_FRONTEND=noninteractive ; apt-get update && apt-get install -y munin'
    inst_munin_server = SshProcess(cmd, server).run()

    logger.debug('Creating configuration files for server')
    fd, server_conf = mkstemp(dir='/tmp/', prefix='munin-nodes_')
    f = fdopen(fd, 'w')
    for host in clients:
        get_ip = Process('host ' + host.address).run()
        ip = get_ip.stdout.strip().split(' ')[3]
        f.write('[' + host.address + ']\n    address ' + ip +
                '\n   use_node_name yes\n\n')
    f.close()

    Put([server], [server_conf], remote_location='/etc/').run()
    SshProcess('cd /etc && cp ' + server_conf.split('/')[-1] + ' munin.conf',
               server).run()
    Process('rm ' + server_conf).run()
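A hedged call sketch (hypothetical hosts; as the docstring notes, this must run inside Grid'5000 so the names resolve):

    server = Host('node-1.example.grid5000.fr')
    clients = [Host('node-%d.example.grid5000.fr' % i) for i in range(2, 6)]
    _munin_server(server, clients)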
Example #4
    def workflow(self, comb):
        """Compile one aevol binary for a given parameter combination."""
        comb_ok = False
        logger.info(slugify(comb) + ' starts to compile')
        try:
            export = "source /opt/intel/bin/compilervars.sh intel64; "

            src_directory = "/home/arrouan/workspace/aevol/git/world/aevol/"

            bin_directory = "/home/arrouan/workspace/aevol/compiled_binary/"

            configure_option = "--with-tracing --without-x"

            if comb['parallel'] == 'tbb':
                configure_option += " --with-tbb"

            if comb['blas'] == 'openblas':
                configure_option += " --with-blas"
            elif comb['blas'] == 'mkl':
                configure_option += " --with-mkl"
            elif comb['blas'] == 'atlas':
                configure_option += " --with-atlas"

            if comb['experiment'] == 'raevol':
                configure_option += " --with-raevol"

            if comb['compilator'] == 'intel':
                configure_option += " CXX=icc"

            full_bin_directory = bin_directory + comb['experiment'] + '_' + \
                comb['compilator'] + '_' + comb['parallel'] + '_' + comb['blas']

            try:
                os.mkdir(full_bin_directory)
            except OSError:
                # The directory already exists: empty it before reuse.
                for f in os.listdir(full_bin_directory):
                    os.remove(full_bin_directory + "/" + f)

            p = Process(export + 'cd ' + src_directory +
                        '; autoreconf; ./configure ' + configure_option +
                        '; make clean; make; cp src/aevol_run ' +
                        full_bin_directory + '/; cp src/aevol_create ' +
                        full_bin_directory + '/')
            p.shell = True
            p.run()

            print p.stdout

            comb_ok = True
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(slugify(comb) + ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(slugify(comb) + ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
Example #5
def get_server_ip(host):
    """Get the server IP"""
    if isinstance(host, Host):
        host = host.address
    logger.debug('Retrieving IP from %s', style.host(host))
    get_ip = Process('host ' + host + ' |cut -d \' \' -f 4')
    get_ip.shell = True
    get_ip.run()
    ip = get_ip.stdout.strip()
    return ip
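For instance, with a hypothetical node name:

    ip = get_server_ip(Host('node-1.example.grid5000.fr'))
    logger.info('Server IP is %s', ip)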
Example #7
    def clean_archi(self):
        """Delete all files related to an existing DIET archi"""
        logger.info("Clean DIET architecture")
        process = Process("./dietg/clean_archi_diet.sh")
        process.run()
        process = Process("./dietg/clean.sh")
        process.run()
        # The if/then syntax needs a shell to be interpreted
        process = Process("if [ -e ./tmp ]; then rm ./tmp; fi", shell=True)
        process.run()
Example #8
    def create_diet_architecture_files(self):
        logger.info("Create a DIET architecture")
        # Architecture without LA
        process = Process("./dietg/set_archi_diet_4.sh gridnodes " +
                          str(self.nodes_service[0]))
        process.run()

        MA_file = "./dietg/cfgs/MA1.cfg"
        SeD_file = ['./dietg/cfgs/server.cfg']
        logger.info("Create MA file")
        set_scheduler(MA_file, self.scheduler)
        logger.info("Create SeD files")
        for file2 in SeD_file:
            # print file2
            set_parallel_jobs(file2, self.concLimit)
Example #9
    def _enable_bridge(self, name='br0'):
        """We need a bridge to have automatic DHCP configuration for the VM."""
        logger.detail('Configuring the bridge')
        hosts_br = self._get_bridge(self.hosts)
        nobr_hosts = []
        for host, br in hosts_br.iteritems():
            if br is None:
                logger.debug('No bridge on host %s', style.host(host))
                nobr_hosts.append(host)
            elif br != name:
                logger.debug('Wrong bridge on host %s, destroying it',
                             style.host(host))
                SshProcess('ip link set ' + br + ' down ; brctl delbr ' + br,
                           host).run()
                nobr_hosts.append(host)
            else:
                logger.debug('Bridge %s is present on host %s',
                             style.emph(name), style.host(host))

        nobr_hosts = [x.address if isinstance(x, Host) else x
                      for x in nobr_hosts]

        if len(nobr_hosts) > 0:
            logger.debug('Creating bridge on %s', hosts_list(nobr_hosts))
            script = 'export br_if=`ip route |grep default |cut -f 5 -d " "`; \n' + \
    'ifdown $br_if ; \n' + \
    'sed -i "s/$br_if inet dhcp/$br_if inet manual/g" /etc/network/interfaces ; \n' + \
    'sed -i "s/auto $br_if//g" /etc/network/interfaces ; \n' + \
    'echo " " >> /etc/network/interfaces ; \n' + \
    'echo "auto ' + name + '" >> /etc/network/interfaces ; \n' + \
    'echo "iface ' + name + ' inet dhcp" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_ports $br_if" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_stp off" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_maxwait 0" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_fd 0" >> /etc/network/interfaces ; \n' + \
    'ifup ' + name
            fd, br_script = mkstemp(dir='/tmp/', prefix='create_br_')
            f = fdopen(fd, 'w')
            f.write(script)
            f.close()

            self.fact.get_fileput(nobr_hosts, [br_script]).run()
            self.fact.get_remote('nohup sh ' + br_script.split('/')[-1],
                                 nobr_hosts).run()

            logger.debug('Waiting for network restart')
            if_up = False
            nmap_tries = 0
            while (not if_up) and nmap_tries < 20:
                sleep(20)
                nmap_tries += 1
                nmap = Process('nmap ' + ' '.join(nobr_hosts) +
                               ' -p 22').run()
                for line in nmap.stdout.split('\n'):
                    if 'Nmap done' in line:
                        if_up = line.split()[2] == line.split()[5].replace(
                            '(', '')
            logger.debug('Network has been restarted')
        logger.detail('All hosts have the bridge %s', style.emph(name))
Example #10
    def _start_disk_copy(self, disks=None):
        """ """
        disks_copy = []
        if not disks:
            disks = self.backing_files
        for bf in disks:
            logger.info('Treating ' + style.emph(bf))
            logger.debug("Checking frontend disk vs host disk")
            raw_disk = '/tmp/orig_' + bf.split('/')[-1]
            f_disk = Process('md5sum -b ' + bf).run()
            disk_hash = f_disk.stdout.split(' ')[0]
            cmd = 'if [ -f ' + raw_disk + ' ]; ' + \
                'then md5sum  -b ' + raw_disk + '; fi'
            h_disk = self.fact.get_remote(cmd, self.hosts).run()
            disk_ok = True
            for p in h_disk.processes:
                if p.stdout.split(' ')[0] != disk_hash:
                    disk_ok = False
                    break
            if disk_ok:
                logger.info("Disk " + style.emph(bf) +
                            " is already present, skipping copy")
            else:
                disks_copy.append(
                    self.fact.get_fileput(self.hosts, [bf],
                                          remote_location="/tmp"))
        if len(disks_copy) > 0:
            self.copy_actions = ParallelActions(disks_copy).start()
        else:
            self.copy_actions = Remote('ls', self.hosts[0]).run()
Example #11
def dnsmasq_server(server, clients=None, vms=None, dhcp=True):
    """Configure a DHCP server with dnsmasq

    :param server: host where the server will be installed

    :param clients: list of hosts that will be declared in dnsmasq

    :param vms: list of virtual machines

    :param dhcp: if True, also configure DHCP (sysctl tuning and dnsmasq.conf)

    """
    logger.debug('Installing and configuring a DNS/DHCP server on %s', server)

    test_running = Process('nmap ' + server + ' -p 53 | grep domain')
    test_running.shell = True
    test_running.run()
    if 'open' in test_running.stdout:
        logger.info('DNS server already running, updating configuration')
    else:
        cmd = 'killall dnsmasq; export DEBIAN_FRONTEND=noninteractive ; ' + \
            'apt-get update ; apt-get -y purge dnsmasq-base ; ' + \
            'apt-get install -t wheezy -o Dpkg::Options::="--force-confdef" ' + \
            '-o Dpkg::Options::="--force-confnew" ' + \
            '-y dnsmasq; echo 1 > /proc/sys/net/ipv4/ip_forward '
        SshProcess(cmd, server).run()

    sites = list(
        set([
            get_host_site(client)
            for client in clients if get_host_site(client)
        ] + [get_host_site(server)]))
    add_vms(vms, server)
    if clients:
        kill_dnsmasq = TaktukRemote('killall dnsmasq', clients)
        for p in kill_dnsmasq.processes:
            p.ignore_exit_code = p.nolog_exit_code = True
        kill_dnsmasq.run()
        resolv_conf(server, clients, sites)

    if dhcp:
        sysctl_conf(server, vms)
        dhcp_conf(server, vms, sites)

    logger.debug('Restarting service ...')
    cmd = 'service dnsmasq stop ; rm /var/lib/misc/dnsmasq.leases ; ' + \
        'service dnsmasq start'
    SshProcess(cmd, server).run()
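A hedged usage sketch; the vm dictionaries follow the shape read by add_vms and dhcp_conf (keys 'id', 'ip', 'mac'), and every name below is hypothetical:

    server = 'node-1.example.grid5000.fr'
    clients = ['node-2.example.grid5000.fr', 'node-3.example.grid5000.fr']
    vms = [{'id': 'vm-%d' % i,
            'ip': '10.158.0.%d' % i,
            'mac': '00:16:3e:00:00:%02x' % i}
           for i in range(1, 11)]
    dnsmasq_server(server, clients=clients, vms=vms, dhcp=True)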
Example #12
    def get_logs_from_server(self):
        distant_file = "/root/MA.stat"
        local_folder = "./results_" + str(self.oargrid_job_id) + "_" + \
            self.scheduler + "/"

        nb_files = 0
        # Get(self.servers, [distant_file])
        for host in self.servers:
            local_file = local_folder + host + "_" + self.scheduler + "_SeD.stat"
            process = Process("scp root@" + host + ":" + distant_file +
                              " " + local_file)
            process.run()
            try:  # if the file exists
                with open(local_file):
                    nb_files += 1
            except IOError:  # if the file does not exist
                pass
        return nb_files
Example #13
    def get_nb_tasks_server(self):
        distant_file = "/root/dietg/log/total.jobs"
        local_file = "./task_counter"

        # Get(self.servers, [distant_file])
        for host in self.servers:
            process = Process("scp root@" + host + ":" + distant_file +
                              " " + local_file)
            process.run()
            try:  # if the file exists
                with open(local_file):
                    self.nb_tasks[host] = get_nb_tasks(local_file)
            except IOError:  # if the file does not exist
                self.nb_tasks[host] = 0
            try:
                os.remove(local_file)
            except OSError:
                pass
Example #15
def resolv_conf(server, clients, sites):
    """Generate the resolv.conf with dhcp parameters and put it on the server
    """
    fd, resolv = mkstemp(dir='/tmp/', prefix='resolv_')
    f = fdopen(fd, 'w')
    f.write('domain grid5000.fr\nsearch grid5000.fr ' +
            ' '.join([site + '.grid5000.fr' for site in sites]) +
            '\nnameserver ' + get_server_ip(server))
    f.close()
    TaktukPut(clients, [resolv], remote_location='/etc/').run()
    TaktukRemote('cd /etc && cp ' + resolv.split('/')[-1] + ' resolv.conf',
                 clients).run()
    Process('rm ' + resolv).run()
Example #16
def wait_hosts_up(hosts, timeout=300):
    """ """
    down_hosts = [x.address if isinstance(x, Host) else x for x in hosts]
    fd, hosts_file = mkstemp(dir='/tmp/', prefix='hosts_')
    f = fdopen(fd, 'w')
    f.write('\n' + '\n'.join(down_hosts))
    f.close()
    timer = Timer()
    while len(down_hosts) > 0 and timer.elapsed() < timeout:
        nmap = Process("nmap -v -oG - -i %s -p 22 |grep Host|grep Status" %
                       (hosts_file, ),
                       shell=True).run()
        logger.debug('timer: %s \nnmap output: \n%s', timer.elapsed(),
                     nmap.stdout.strip())
        for line in nmap.stdout.strip().split('\n'):
            s = line.split()[2]
            host = s[s.find("(") + 1:s.find(")")]
            if host in down_hosts:
                logger.detail('%s is up', host)
                down_hosts.remove(host)
    Process('rm ' + hosts_file).run()
    sleep(3)
    return len(down_hosts) == 0
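A hedged sketch of the reboot pattern this helper supports together with wait_hosts_down above (hypothetical nodes list):

    Remote('reboot', nodes).run()
    if wait_hosts_down(nodes) and wait_hosts_up(nodes, timeout=600):
        logger.info('All nodes rebooted successfully')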
Example #17
def add_vms(vms, server):
    """Generate the list of virtual machines """
    logger.debug('Adding the VM in /etc/hosts ...')
    fd, vms_list = mkstemp(dir='/tmp/', prefix='vms_')
    f = fdopen(fd, 'w')
    f.write('\n' + '\n'.join([vm['ip'] + ' \t ' + vm['id'] for vm in vms]))
    f.close()
    Put([server], [vms_list], remote_location='/etc/').run()
    SshProcess(
        '[ -f /etc/hosts.bak ] && cp /etc/hosts.bak /etc/hosts || ' +
        ' cp /etc/hosts /etc/hosts.bak', server).run()
    Remote('cat /etc/' + vms_list.split('/')[-1] + ' >> /etc/hosts',
           [server]).run()
    Process('rm ' + vms_list).run()
Example #18
def sysctl_conf(server, vms):
    """Change the default value of net.ipv4.neigh.default.gc_thresh*
    to handle large number of IP"""
    val = int(2**ceil(log(len(vms), 2)))
    conf = "\nnet.ipv4.neigh.default.gc_thresh3 = " + str(3 * val) + \
        "\nnet.ipv4.neigh.default.gc_thresh2 = " + str(2 * val) + \
        "\nnet.ipv4.neigh.default.gc_thresh1 = " + str(val)
    fd, sysctl = mkstemp(dir='/tmp/', prefix='sysctl_')
    f = fdopen(fd, 'w')
    f.write(conf)
    f.close()
    Put([server], [sysctl], remote_location='/etc/').run()
    SshProcess(
        'cd /etc && cat ' + sysctl.split('/')[-1] +
        ' >> sysctl.conf && sysctl -p', server).run()
    Process('rm ' + sysctl).run()
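The thresholds scale with the next power of two above the VM count; for example, 600 VMs give val = 2**ceil(log2(600)) = 1024, hence gc_thresh1/2/3 of 1024, 2048 and 3072. A pure-arithmetic check:

    from math import ceil, log
    val = int(2 ** ceil(log(600, 2)))
    assert (val, 2 * val, 3 * val) == (1024, 2048, 3072)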
Example #19
    def import_from_kaenv(self, env, remote=None):
        """Import a kadeploy environment from a kadeploy database"""
        env = re.match(
            r"^(?P<name>[-_.\w]+)(?:@(?P<user>[_.\w]+))?(?::(?P<version>[_.\w]+))?$",
            env).groupdict("")
        if env['user']:
            env['user'] = "******" + env['user']
        if env['version']:
            env['version'] = " --env-version " + env['version']
        kaenv_cmd = "kaenv3{user}{version} -p {name}".format(**env)
        if remote:
            remote = re.match(
                r"^(?:(?P<user>[-_.\w]+)@)?(?P<address>[-_.\w]+)(?::(?P<port>\d{1,5}))?$",
                remote).groupdict()
            p = SshProcess(kaenv_cmd, Host(**remote))
        else:
            p = Process(kaenv_cmd, shell=True)
        p.run()
        self.desc = yaml.load(p.stdout)
        return self
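A hedged usage sketch (MyDeployment stands in for whatever class carries this method; the environment string follows name[@user][:version]):

    desc = MyDeployment().import_from_kaenv('debian11-nfs:2').desc
    logger.info('Imported environment description: %s', desc)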
Example #20
def dhcp_conf(server, vms, sites):
    """Generate the dnsmasq.conf with dhcp parameters and
    put it on the server"""
    logger.debug('Creating dnsmasq.conf')
    ip_mac = [(vm['ip'], vm['mac']) for vm in vms]
    dhcp_lease = 'dhcp-lease-max=10000\n'
    dhcp_range = 'dhcp-range=' + ip_mac[0][0] + ',' + ip_mac[-1][0] + ',12h\n'
    dhcp_router = 'dhcp-option=option:router,' + get_server_ip(server) + '\n'
    dhcp_hosts = '\n'.join(['dhcp-host=' + ':' + ip_mac[i][1] + ',' +
                            vms[i]['id'] + ',' + ip_mac[i][0]
                            for i in range(len(vms))])
    dhcp_option = 'dhcp-option=option:domain-search,grid5000.fr,' + \
        ','.join([site + '.grid5000.fr' for site in sites]) + '\n'
    fd, dnsmasq = mkstemp(dir='/tmp/', prefix='dnsmasq_')
    f = fdopen(fd, 'w')
    f.write(dhcp_lease + dhcp_range + dhcp_router + dhcp_hosts + '\n' +
            dhcp_option)
    f.close()
    Put([server], [dnsmasq], remote_location='/etc/').run()
    SshProcess('cd /etc && cp ' + dnsmasq.split('/')[-1] + ' dnsmasq.conf',
               server).run()
    Process('rm ' + dnsmasq).run()
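For two hypothetical VMs on the rennes site, with get_server_ip(server) returning 10.158.0.254, the generated dnsmasq.conf would contain:

    dhcp-lease-max=10000
    dhcp-range=10.158.0.1,10.158.0.2,12h
    dhcp-option=option:router,10.158.0.254
    dhcp-host=:00:16:3e:00:00:01,vm-1,10.158.0.1
    dhcp-host=:00:16:3e:00:00:02,vm-2,10.158.0.2
    dhcp-option=option:domain-search,grid5000.fr,rennes.grid5000.fr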
Example #21
    def run(self):
        # Go to the result folder before everything
        os.chdir(self.result_dir)

        jobs = [(_jobID, _site)]
        # Get nodes
        nodes = get_oar_job_nodes(_jobID, _site)

        try:
            logger.info("Creating hostfiles for all combinations...")
            for nbr_node in _nbrNodes:
                hostfile_filename = self.result_dir + '/' + 'hostfile-' + nbr_node
                with open(hostfile_filename, 'w') as hostfile:
                    for node in nodes[:int(nbr_node)]:
                        print>>hostfile, node.address

            spack_process = Process('spack install -v chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt')            
            spack_process.start()
            spack_process.wait()
            spack_process.kill()

        finally:
            logger.info("Delete job: {}".format(jobs))
            oardel(jobs)
Example #22
    def run(self):
        """Run the experiment"""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check if the workloads exist (supposes that the same NFS mount point
        # is present on the remote and the local environment)
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {'workload_filename': workloads}
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # A previous run (using -c result_dir) may have already skipped some combinations
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))

        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(str(
            self.sweeper.get_remaining())))

        if reservation_job_id is not None:
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(job_id,
                                   site,
                                   prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of nodes in the '
                                       'reservation ({}) does not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn("NOT deployed nodes: {}".format(
                        str(undeployed)))
                    raise RuntimeError('Deployment failed')

                if not already_configured:

                    # install OAR
                    install_cmd = "apt-get update; apt-get install -y "
                    node_packages = "oar-node"
                    logger.info("installing OAR nodes: {}".format(
                        str(nodes[1:])))
                    install_oar_nodes = Remote(
                        install_cmd + node_packages,
                        nodes[1:],
                        connection_params={'user': '******'})
                    install_oar_nodes.start()

                    server_packages = (
                        "oar-server oar-server-pgsql oar-user "
                        "oar-user-pgsql postgresql python3-pip "
                        "libjson-perl postgresql-server-dev-all")
                    install_oar_sched_cmd = """
                    mkdir -p /opt/oar_sched; \
                    cd /opt/oar_sched; \
                    git clone https://github.com/oar-team/oar3.git; \
                    cd oar3; \
                    git checkout dce942bebc2; \
                    pip3 install -e .; \
                    cd /usr/lib/oar/schedulers; \
                    ln -s /usr/local/bin/kamelot; \
                    pip3 install psycopg2
                    """
                    logger.info("installing OAR server node: {}".format(
                        str(nodes[0])))
                    install_master = SshProcess(
                        install_cmd + server_packages + ";" +
                        install_oar_sched_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    install_master.run()
                    install_oar_nodes.wait()

                    if not install_master.ok:
                        Report(install_master)

                    configure_oar_cmd = """
                    sed -i \
                        -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                        -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                        -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                        -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                        -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                        -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                        -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                        -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                        /etc/oar/oar.conf
                    """.format(result_dir=self.result_dir)
                    configure_oar = Remote(configure_oar_cmd,
                                           nodes,
                                           connection_params={'user': '******'})
                    configure_oar.run()
                    logger.info("OAR is configured on all nodes")

                    # Configure server
                    create_db = "oar-database --create --db-is-local"
                    config_oar_sched = (
                        "oarnotify --remove-queue default;"
                        "oarnotify --add-queue default,1,kamelot")
                    start_oar = "systemctl start oar-server.service"
                    logger.info("configuring OAR database: {}".format(
                        str(nodes[0])))
                    config_master = SshProcess(
                        create_db + ";" + config_oar_sched + ";" + start_oar,
                        nodes[0],
                        connection_params={'user': '******'})
                    config_master.run()

                    # propagate SSH keys
                    logger.info("configuring OAR SSH")
                    oar_key = "/tmp/.ssh"
                    Process('rm -rf ' + oar_key).run()
                    Process(
                        'scp -o BatchMode=yes -o PasswordAuthentication=no '
                        '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                        '-o ConnectTimeout=20 -rp -o User=root ' +
                        nodes[0].address + ":/var/lib/oar/.ssh"
                        ' ' + oar_key).run()
                    # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                    Put(nodes[1:], [oar_key],
                        "/var/lib/oar/",
                        connection_params={
                            'user': '******'
                        }).run()
                    add_resources_cmd = """
                    oarproperty -a cpu || true; \
                    oarproperty -a core || true; \
                    oarproperty -c -a host || true; \
                    oarproperty -a mem || true; \
                    """
                    for node in nodes[1:]:
                        add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(
                            node=node.address)

                    add_resources = SshProcess(
                        add_resources_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    add_resources.run()

                    if add_resources.ok:
                        logger.info("oar is now configured!")
                    else:
                        raise RuntimeError(
                            "error in the OAR configuration: Abort!")

                # TODO: back up the OAR configuration

                # Do the replay
                logger.info('beginning the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(
                        combi['workload_filename'])
                    oar_replay = SshProcess(
                        script_path + "/oar_replay.py " +
                        combi['workload_filename'] + " " + self.result_dir +
                        "  oar_gant_" + workload_file, nodes[0])
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")

            except:
                traceback.print_exc()
                ipdb.set_trace()

            finally:
                if is_a_test:
                    ipdb.set_trace()
                if reservation_job_id is None:
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
Example #23
    def run(self):
        # Go to the result folder before everything
        os.chdir(self.result_dir)

        # STARPU INSTALLATION
        spack_spec = 'chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt'
        spack_command = 'spack install -v ' + spack_spec
        
        logger.info("Starting StarPU installation...")
        spack_process = Process(spack_command).start()
        spack_process.wait()

        logger.info("StarPU installation DONE...")
        checkProcess(spack_process)
        spack_process.kill()

        # STARPU DIRECTORY
        logger.info("Searching and going to StarPU installation directory...")

        # 'spack location -i' prints the install prefix of the spec
        starpu_location_process = Process('spack location -i ' + spack_spec).start()
        starpu_location_process.wait()
        checkProcess(starpu_location_process)

        starpu_cd_process = Process('cd ' + starpu_location_process.stdout + '/lib/chameleon').start()
        starpu_cd_process.wait()
        checkProcess(starpu_cd_process)
        
        starpu_location_process.kill()
        starpu_cd_process.kill()

        # RUNNING EXPERIMENT
        logger.info("Starting StarPU experiment...")
        starpu_experiment_process = Process(""" export STARPU_WORKER_STATS=1
                                                export STARPU_CALIBRATE=2
                                                ./timing/time_spotrf_tile --warmup --gpus=3 --threads=9 --nb=960 --ib=96 --n_range=48000:48000:9600 """)
        starpu_experiment_process.stdout_handlers.append(self.result_dir + '/' + 'StarPU.out') # create output file for StarPU    
        starpu_experiment_process.start()
        starpu_experiment_process.wait()

        logger.info("StarPU experiment DONE...")
        checkProcess(starpu_experiment_process)        
        starpu_experiment_process.kill()
Example #24
def create_subdir(base_dir, sub_dir):
    dir_path="{}/{}".format(base_dir, sub_dir)
    mkdir_cmd="mkdir -p {}".format(dir_path)
    Process(mkdir_cmd).run()
    return dir_path
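For instance:

    logs_dir = create_subdir('/tmp/experiment', 'logs')  # returns '/tmp/experiment/logs'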
Example #25
def wait_vms_have_started(vms, restart=True):
    """Scan port 22 on all vms, distributed on hosts"""
    # Creating file with list of VMs ip
    fd, tmpfile = tempfile.mkstemp(prefix='vmips')
    f = fdopen(fd, 'w')
    for vm in vms:
        f.write(vm['ip'] + '\n')
    f.close()
    # getting the list of host
    hosts = list(set([vm['host'] for vm in vms]))
    hosts.sort()
    # Pushing file on all hosts
    TaktukPut(hosts, [tmpfile]).run()
    logger.debug(pformat(hosts))
    # Splitting nmap scan
    n_vm_scan = ceil(len(vms) / len(hosts)) + 1
    cmds = []
    for i in range(len(hosts)):
        start = str(int(i * n_vm_scan))
        end = str(int((i + 1) * n_vm_scan))
        cmds.append("awk 'NR>=" + start + " && NR<" + end + "' " +
                    tmpfile.split('/')[-1] + " > nmap_file ; " +
                    "nmap -v -oG - -i nmap_file -p 22")
    logger.debug('%s', pformat(cmds))
    nmap = TaktukRemote('{{cmds}}', hosts)
    nmap_tries = 0
    all_up = False
    started_vms = []
    old_started = started_vms[:]
    while (not all_up) and nmap_tries < 10:
        sleep(15)
        logger.detail('nmap_tries %s', nmap_tries)
        nmap.run()
        for p in nmap.processes:
            for line in p.stdout.split('\n'):
                if 'Status' in line:
                    split_line = line.split(' ')
                    ip = split_line[1]
                    state = split_line[3].strip()
                    if state == 'Up':
                        vm = [vm for vm in vms if vm['ip'] == ip]
                        if len(vm) > 0:
                            vm[0]['state'] = 'OK'

        started_vms = [vm for vm in vms if vm['state'] == 'OK']
        all_up = len(started_vms) == len(vms)
        if started_vms != old_started:
            old_started = started_vms
        else:
            if restart:
                restart_vms([vm for vm in vms if vm['state'] == 'KO'])
            nmap_tries += 1
        if nmap_tries == 1:
            activate_vms([vm for vm in vms if vm['state'] == 'KO'])
        if not all_up:
            logger.info(
                str(nmap_tries) + ': ' + str(len(started_vms)) + '/' +
                str(len(vms)))
        nmap.reset()

    TaktukRemote('rm ' + tmpfile.split('/')[-1], hosts).run()
    Process('rm ' + tmpfile).run()
    if all_up:
        logger.info('All VMs have been started')
        return True
    else:
        logger.error('Not all VMs have been started')
        return False
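A hedged sketch (vm dictionaries as in the other examples, plus the 'host' and 'state' keys this function reads; names hypothetical):

    vms = [{'id': 'vm-1', 'ip': '10.158.0.1', 'state': 'KO',
            'host': 'node-1.example.grid5000.fr'}]
    if wait_vms_have_started(vms, restart=True):
        logger.info('All VMs answer on port 22')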
Example #26
    def workflow(self, comb):
        """
            Compute one application launch 
            using a given parameter group
        """
        comb_ok = False
        try:
            # Generate configuration file needed by MPI processes
            logger.info("Generating assembly file...")
            py = comb['cores'] / comb['px']
            prepare = Process(
                'cd %s && python %s %d %d %d %d %d %s app.lad' %
                (self.workingPath, self.genLadScript, comb['datasize'],
                 comb['datasize'], comb['datasize'], comb['px'], py,
                 comb['transposition']))
            prepare.shell = True
            prepare.run()

            # Generate the MPI host file
            mfile = self.generate_machine_file()

            # Start L2C
            lad = "./app.lad"
            logger.info("Computing...")
            res = Process(
                "export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && l2c_loader -M,-machinefile,%s --mpi -c %d %s"
                % (self.workingPath, mfile, comb['cores'], lad))
            res.shell = True
            res.stdout_handlers.append(
                os.path.join(self.result_dir,
                             slugify(comb) + '.out'))
            res.stdout_handlers.append(sys.stdout)
            res.stderr_handlers.append(
                os.path.join(self.result_dir,
                             slugify(comb) + '.err'))
            res.stderr_handlers.append(sys.stderr)
            res.run()
            if not res.ok:
                logger.error('Bad L2C termination')
                raise Exception('Bad L2C termination')
            if len(res.stderr) > 0:
                # WARNING: happens when L2C cannot find the LAD file or
                # something similarly strange
                logger.warning('Not empty error output')

            # Clean configuration files
            logger.info("Removing assembly files...")
            res = Process('cd %s && rm -f app.lad*' % self.workingPath)
            res.shell = True
            res.run()

            comb_ok = True
        except Exception:
            pass
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(style.host(slugify(comb)) + ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(
                    style.host(slugify(comb)) + ' has been canceled')

            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))
Example #27
    tmp_vms.append(p.host.address)
    processes.append(p.stdout.strip())
    vms_proc[p.host.address] = p.stdout.strip()
init_limit = TaktukRemote('cpulimit -p {{processes}} -l 1', tmp_vms)
for p in init_limit.processes:
    p.nolog_exit_code = p.ignore_exit_code = True
init_limit.start()

logger.info('Configuring events generator')
n_host = 0
f = open('hosts.list')
for line in f:
    n_host += 1
f.close()
sed_time = Process('sed -i "s/simulator.duration.*/simulator.duration = ' +
                   str(1000) + '/g"' +
                   ' load_events_generator/config/simulator.properties').run()
sed_nodes = Process(
    'sed -i "s/nodes.number.*/nodes.number = ' + str(n_host) + '/g"' +
    ' load_events_generator/config/simulator.properties').run()
sed_vms = Process('sed -i "s/vm.number.*/vm.number = ' + str(len(vms)) +
                  '/g"' +
                  ' load_events_generator/config/simulator.properties').run()

logger.info('Generating events list')
gen_events = Process(
    'cd load_events_generator ; ' +
    'java -jar load_events_generator.jar vms.list > ../events_load.xml')
gen_events.shell = True
gen_events.run()
Example #28
    def run(self):
        # Go to the result folder before everything
        os.chdir(self.result_dir)

        # OARSUB
        jobs = oarsub([(OarSubmission(resources='nodes=' + str(_nbrNodes),
                                      job_type='deploy', 
                                      walltime=_walltime, 
                                      sql_properties=_properties), _site)])
        
        job_id, site = jobs[0]
        try:
            # KADEPLOY
            logger.info("Waiting job start %s on %s" % (job_id, site))
            wait_oar_job_start(job_id, site, prediction_callback=prediction_callback)
            logger.info("getting nodes of %s on %s" % (job_id, site))
            nodes = get_oar_job_nodes(job_id, site)

            deployed, undeployed = deploy(Deployment(nodes, env_name=env_name),
                                          check_deployed_command=already_configured)
            if undeployed:
                logger.warn(
                    "NOT deployed nodes: {}".format(str(undeployed)))
                raise RuntimeError('Deployment failed')

            # STARPU INSTALLATION
            spack_spec = 'chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt'
            spack_command = 'spack install -v ' + spack_spec

            logger.info("Starting StarPU installation...")
            spack_process = Process(spack_command).start()
            spack_process.wait()

            logger.info("StarPU installation DONE...")
            self.checkProcess(spack_process)
            spack_process.kill()

            # STARPU DIRECTORY
            logger.info("Searching and going to StarPU installation directory...")

            # 'spack location -i' prints the install prefix of the spec
            starpu_location_process = Process('spack location -i ' + spack_spec).start()
            starpu_location_process.wait()
            self.checkProcess(starpu_location_process)

            starpu_cd_process = Process('cd ' + starpu_location_process.stdout + '/lib/chameleon').start()
            starpu_cd_process.wait()
            self.checkProcess(starpu_cd_process)

            starpu_location_process.kill()
            starpu_cd_process.kill()

            # RUNNING EXPERIMENT
            logger.info("Starting StarPU experiment...")
            starpu_experiment_process = Process(""" export STARPU_WORKER_STATS=1
                                                    export STARPU_CALIBRATE=2
                                                    ./timing/time_spotrf_tile --warmup --gpus=3 --threads=9 --nb=960 --ib=96 --n_range=48000:48000:9600 """)
            starpu_experiment_process.stdout_handlers.append(self.result_dir + '/' + 'StarPU.out') # create output file for StarPU    
            starpu_experiment_process.start()
            starpu_experiment_process.wait()

            logger.info("StarPU experiment DONE...")
            self.checkProcess(starpu_experiment_process)        
            starpu_experiment_process.kill()

        finally:
	        logger.info("Delete job : {}".format(jobs))
            oardel(jobs)
Example #29
logger.info('Configuring events generator')
n_host = 0
f = open('hosts.list')
for line in f:
    n_host += 1
f.close()
sed_time = Process('sed -i "s/simulator.duration.*/simulator.duration = ' + str(1000) + '/g"' + 
      ' load_events_generator/config/simulator.properties').run()
sed_nodes = Process('sed -i "s/nodes.number.*/nodes.number = ' + str(n_host) + '/g"' + 
      ' load_events_generator/config/simulator.properties').run()
sed_vms = Process('sed -i "s/vm.number.*/vm.number = ' + str(len(vms)) + '/g"' + 
      ' load_events_generator/config/simulator.properties').run()

logger.info('Generating events list')
gen_events = Process('cd load_events_generator ; ' +
      'java -jar load_events_generator.jar vms.list > ../events_load.xml')
gen_events.shell = True
gen_events.run()

tree = ET.parse('events_load.xml') 
root = tree.getroot()
events = {}
for event in root.findall('./event'):
    events[int(round(float(event.get('time'))))] = {'vm': event.get('target'),
                                'load': event.get('value')}

 
def set_cpu_load(load, vm_ip, pid):
    """Use cpulimit to change process intensity on vm"""
    logger.info('kill cpu_limit on %s and set it to %s', vm_ip, load)
    kill_cpu_limit = SshProcess('ps aux| grep "cpulimit" | grep -v "grep" '
                                '| awk \'{print $2}\' | xargs -r kill -9',
                                vm_ip).run()
Example #30
    def run(self):
        csvr = RevisionsReader(csv_file) # Launch CSV reader
        csvr_abstract = RevisionsReader(csv_file_abstract)

        os.chdir(self.result_dir) # Go to result directory before everything

        while True:
            try:
                csvr.next()
                csvr_abstract.next()

                chameleon_name = csvr_abstract.name() + '_' + csvr_abstract.chameleonBranch() + '_' + csvr_abstract.chameleonRevision() + '_' + csvr_abstract.command()
                starpu_name = csvr_abstract.name() + '_' + csvr_abstract.starpuBranch() + '_' + csvr_abstract.starpuRevision() + '_' + csvr_abstract.command()
                global_name = csvr_abstract.name() \
                                + '_chameleon_' + csvr_abstract.chameleonBranch() + '_' + csvr_abstract.chameleonRevision() \
                                + '_starpu_' + csvr_abstract.starpuBranch() + '_' + csvr_abstract.starpuRevision() \
                                + '_' + csvr_abstract.command()
                
                logger.info("Starting experiment %s" % (global_name))

                spack_spec = 'chameleon@' + chameleon_name + ' +starpu+fxt ^starpu@' + starpu_name + ' +fxt'

                # FOLDER CREATION
                folder_name = 'chameleon_' + csvr_abstract.chameleonBranch() + '_' + csvr_abstract.chameleonRevision() \
                                + '_starpu_' + csvr_abstract.starpuBranch() + '_' + csvr_abstract.starpuRevision()
                folder = self.result_dir + '/' + folder_name

                try:
                    os.mkdir(folder, 0o764)
                except OSError as exc:
                    if exc.errno != errno.EEXIST:
                        raise

                # STARPU INSTALLATION
                logger.info("Starting StarPU installation")
                spack_process = Process('spack -d install ' + spack_spec)


                if (not os.path.isfile(folder + '/' + 'compil_' + folder_name)):
                    spack_process.stdout_handlers.append(folder + '/' + 'compil_' + folder_name) # create output file for StarPU installation
                spack_process.start()
                spack_process.wait()

                logger.info("StarPU installation DONE")
                is_ok = self.checkProcess(spack_process)
                spack_process.kill()

                if (not is_ok):
                    continue # stop this experiment

                # STARPU DIRECTORY
                logger.info("Searching and going to StarPU installation directory")

                starpu_location_process = Process('spack location -i ' + spack_spec).start()
                starpu_location_process.wait()
                is_ok = self.checkProcess(starpu_location_process)
            
                starpu_path = starpu_location_process.stdout.replace("\n", "") # remove end_of_line
                starpu_cd = 'cd' + ' ' + starpu_path + '/lib/chameleon/'
                starpu_location_process.kill()

                if (not is_ok):
                    continue # stop this experiment
                
                # RUNNING EXPERIMENT
                logger.info("Starting StarPU experiment")

                starpu_experiment_process = Process(starpu_cd + '\n' + csvr.command(), shell=True)
                           
                starpu_experiment_process.stdout_handlers.append(folder + '/' + 'stdout_' + global_name) # create output file for StarPU execution
                starpu_experiment_process.stderr_handlers.append(folder + '/' + 'stderr_' + global_name) # create error file for StarPU execution
                starpu_experiment_process.start()
                starpu_experiment_process.wait()

                logger.info("StarPU experiment DONE")
                is_ok = self.checkProcess(starpu_experiment_process)        
                starpu_experiment_process.kill()

                if (not is_ok):
                    continue # stop this experiment

            except StopIteration:
                break