def wait_hosts_down(hosts, timeout=300):
    """Wait until all the given hosts stop answering on port 22 (SSH),
    or until timeout (in seconds) is reached.

    Return True if every host went down before the timeout."""
    timer = Timer()
    up_hosts = [x.address if isinstance(x, Host) else x for x in hosts]
    fd, hosts_file = mkstemp(dir='/tmp/', prefix='hosts_')
    with fdopen(fd, 'w') as f:
        f.write('\n' + '\n'.join(up_hosts))

    while len(up_hosts) > 0 and timer.elapsed() < timeout:
        nmap = Process("nmap -v -oG - -iL %s -p 22 |grep Host|grep Status"
                       % (hosts_file, ), shell=True).run()
        logger.debug('timer: %s \nnmap output: \n%s',
                     timer.elapsed(), nmap.stdout.strip())
        for line in nmap.stdout.strip().split('\n'):
            if 'Down' in line:
                ip = line.split()[1]
                # Resolve the IP back to a hostname to match the up_hosts entries
                get_host = Process('host ' + ip + '| cut -f 5 -d " "',
                                   shell=True).run()
                host = get_host.stdout.strip()[0:-1]
                if host in up_hosts:
                    logger.detail(host + ' is down')
                    up_hosts.remove(host)

    Process('rm ' + hosts_file).run()
    return len(up_hosts) == 0
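# Minimal usage sketch for wait_hosts_down (assumptions: it is called after
# powering hosts off, e.g. with kapower3, from inside Grid'5000; the host
# names below are placeholders, not real nodes).
#
# hosts = ['node-1.site.grid5000.fr', 'node-2.site.grid5000.fr']
# if wait_hosts_down(hosts, timeout=600):
#     logger.info('All hosts are down, safe to proceed')
# else:
#     logger.warning('Some hosts are still answering on port 22')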
def run(self):
    # Go to the result folder before everything
    os.chdir(self.result_dir)
    # jobs = [(_jobID, _site)]
    # Get nodes
    # nodes = get_oar_job_nodes(_jobID, _site)
    try:
        # logger.info("Creating hostfiles for all combinations...")
        # for nbr_node in _nbrNodes:
        #     hostfile_filename = self.result_dir + '/' + 'hostfile-' + nbr_node
        #     with open(hostfile_filename, 'w') as hostfile:
        #         for node in nodes[:int(nbr_node)]:
        #             print>>hostfile, node.address

        spack_command = 'spack install -v chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt'
        # spack_process = Remote(spack_command, nodes)
        logger.info("Starting StarPU installation...")
        spack_process = Process(spack_command).start()
        spack_process.wait()
        logger.info("StarPU installation DONE...")
        if not spack_process.ok:
            logger.error("Error: " + spack_process.error_reason)
        else:
            logger.info("spack stdout: {}".format(spack_process.stdout))
        spack_process.kill()

        # Orchestration
    except:
        traceback.print_exc()
    finally:
        logger.info("Done...")
def _munin_server(server, clients):
    """Install the munin monitoring service.

    Must be executed inside Grid'5000 to be able to resolve the server
    and clients IP.

    :param server: an execo.Host

    :param clients: a list of execo.Host
    """
    logger.info('Munin monitoring service installation, '
                'server = %s, clients = \n %s',
                server.address, [host.address for host in clients])

    logger.debug('Configuring munin server %s', style.host(server.address))
    cmd = 'export DEBIAN_FRONTEND=noninteractive ; apt-get update && apt-get install -y munin'
    inst_munin_server = SshProcess(cmd, server).run()

    logger.debug('Creating configuration files for server')
    fd, server_conf = mkstemp(dir='/tmp/', prefix='munin-nodes_')
    f = fdopen(fd, 'w')
    for host in clients:
        get_ip = Process('host ' + host.address).run()
        ip = get_ip.stdout.strip().split(' ')[3]
        f.write('[' + host.address + ']\n address ' + ip +
                '\n use_node_name yes\n\n')
    f.close()

    Put([server], [server_conf], remote_location='/etc/').run()
    SshProcess('cd /etc && cp ' + server_conf.split('/')[-1] + ' munin.conf',
               server).run()
    Process('rm ' + server_conf).run()
def workflow(self, comb):
    """Compile one aevol binary for the given parameter combination."""
    comb_ok = False
    logger.info(slugify(comb) + ' starts to compile')
    try:
        export = "source /opt/intel/bin/compilervars.sh intel64; "

        src_directory = "/home/arrouan/workspace/aevol/git/world/aevol/"
        bin_directory = "/home/arrouan/workspace/aevol/compiled_binary/"

        configure_option = "--with-tracing --without-x"

        if comb['parallel'] == 'tbb':
            configure_option += " --with-tbb"

        if comb['blas'] == 'openblas':
            configure_option += " --with-blas"
        elif comb['blas'] == 'mkl':
            configure_option += " --with-mkl"
        elif comb['blas'] == 'atlas':
            configure_option += " --with-atlas"

        if comb['experiment'] == 'raevol':
            configure_option += " --with-raevol"

        if comb['compilator'] == 'intel':
            configure_option += " CXX=icc"

        full_bin_directory = (bin_directory + comb['experiment'] + '_' +
                              comb['compilator'] + '_' + comb['parallel'] +
                              '_' + comb['blas'])

        try:
            os.mkdir(full_bin_directory)
        except OSError:
            # The directory already exists: empty it before rebuilding
            for f in os.listdir(full_bin_directory):
                os.remove(full_bin_directory + "/" + f)

        p = Process(export + 'cd ' + src_directory +
                    '; autoreconf; ./configure ' + configure_option +
                    '; make clean; make; cp src/aevol_run ' +
                    full_bin_directory + '/; cp src/aevol_create ' +
                    full_bin_directory + '/')
        p.shell = True
        p.run()
        print(p.stdout)

        comb_ok = True
    finally:
        if comb_ok:
            self.sweeper.done(comb)
            logger.info(slugify(comb) + ' has been done')
        else:
            self.sweeper.cancel(comb)
            logger.warning(slugify(comb) + ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
def get_server_ip(host):
    """Get the server IP"""
    if isinstance(host, Host):
        host = host.address
    logger.debug('Retrieving IP from %s', style.host(host))
    get_ip = Process('host ' + host + ' |cut -d \' \' -f 4')
    get_ip.shell = True
    get_ip.run()
    ip = get_ip.stdout.strip()
    return ip
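# Minimal usage sketch for get_server_ip (assumption: run from a Grid'5000
# frontend where the name below resolves; the host name is a placeholder).
#
# server = Host('node-1.site.grid5000.fr')
# ip = get_server_ip(server)
# logger.info('Server IP is %s', ip)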
def clean_archi(self):
    """Delete all files related to an existing DIET architecture"""
    logger.info("Clean DIET architecture")
    process = Process("./dietg/clean_archi_diet.sh")
    process.run()
    process = Process("./dietg/clean.sh")
    process.run()
    process = Process("if [ -e ./tmp ]; then rm ./tmp; fi", shell=True)
    process.run()
def create_diet_architecture_files(self):
    logger.info("Create a DIET architecture")
    # Architecture without LA
    process = Process("./dietg/set_archi_diet_4.sh gridnodes " +
                      str(self.nodes_service[0]))
    process.run()

    MA_file = "./dietg/cfgs/MA1.cfg"
    SeD_file = ['./dietg/cfgs/server.cfg']

    logger.info("Create MA file")
    set_scheduler(MA_file, self.scheduler)

    logger.info("Create SeD files")
    for file2 in SeD_file:
        # print file2
        set_parallel_jobs(file2, self.concLimit)
def _enable_bridge(self, name='br0'):
    """We need a bridge to have automatic DHCP configuration for the VM."""
    logger.detail('Configuring the bridge')
    hosts_br = self._get_bridge(self.hosts)
    nobr_hosts = []
    for host, br in hosts_br.iteritems():
        if br is None:
            logger.debug('No bridge on host %s', style.host(host))
            nobr_hosts.append(host)
        elif br != name:
            logger.debug('Wrong bridge on host %s, destroying it',
                         style.host(host))
            SshProcess('ip link set ' + br + ' down ; brctl delbr ' + br,
                       host).run()
            nobr_hosts.append(host)
        else:
            logger.debug('Bridge %s is present on host %s',
                         style.emph(name), style.host(host))

    nobr_hosts = map(lambda x: x.address if isinstance(x, Host) else x,
                     nobr_hosts)

    if len(nobr_hosts) > 0:
        logger.debug('Creating bridge on %s', hosts_list(nobr_hosts))
        script = 'export br_if=`ip route |grep default |cut -f 5 -d " "`; \n' + \
            'ifdown $br_if ; \n' + \
            'sed -i "s/$br_if inet dhcp/$br_if inet manual/g" /etc/network/interfaces ; \n' + \
            'sed -i "s/auto $br_if//g" /etc/network/interfaces ; \n' + \
            'echo " " >> /etc/network/interfaces ; \n' + \
            'echo "auto ' + name + '" >> /etc/network/interfaces ; \n' + \
            'echo "iface ' + name + ' inet dhcp" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_ports $br_if" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_stp off" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_maxwait 0" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_fd 0" >> /etc/network/interfaces ; \n' + \
            'ifup ' + name
        fd, br_script = mkstemp(dir='/tmp/', prefix='create_br_')
        f = fdopen(fd, 'w')
        f.write(script)
        f.close()

        self.fact.get_fileput(nobr_hosts, [br_script]).run()
        self.fact.get_remote('nohup sh ' + br_script.split('/')[-1],
                             nobr_hosts).run()

        logger.debug('Waiting for network restart')
        if_up = False
        nmap_tries = 0
        while (not if_up) and nmap_tries < 20:
            sleep(20)
            nmap_tries += 1
            nmap = Process('nmap ' +
                           ' '.join([host for host in nobr_hosts]) +
                           ' -p 22').run()
            for line in nmap.stdout.split('\n'):
                if 'Nmap done' in line:
                    if_up = line.split()[2] == line.split()[5].replace('(', '')
        logger.debug('Network has been restarted')
    logger.detail('All hosts have the bridge %s', style.emph(name))
def _start_disk_copy(self, disks=None):
    """Copy the VM backing files to the hosts, skipping files whose
    checksum already matches the frontend copy."""
    disks_copy = []
    if not disks:
        disks = self.backing_files
    for bf in disks:
        logger.info('Treating ' + style.emph(bf))
        logger.debug("Checking frontend disk vs host disk")
        raw_disk = '/tmp/orig_' + bf.split('/')[-1]
        f_disk = Process('md5sum -b ' + bf).run()
        disk_hash = f_disk.stdout.split(' ')[0]
        cmd = 'if [ -f ' + raw_disk + ' ]; ' + \
              'then md5sum -b ' + raw_disk + '; fi'
        h_disk = self.fact.get_remote(cmd, self.hosts).run()
        disk_ok = True
        for p in h_disk.processes:
            if p.stdout.split(' ')[0] != disk_hash:
                disk_ok = False
                break
        if disk_ok:
            logger.info("Disk " + style.emph(bf) +
                        " is already present, skipping copy")
        else:
            disks_copy.append(self.fact.get_fileput(self.hosts, [bf],
                                                    remote_location="/tmp"))
    if len(disks_copy) > 0:
        self.copy_actions = ParallelActions(disks_copy).start()
    else:
        self.copy_actions = Remote('ls', self.hosts[0]).run()
def dnsmasq_server(server, clients=None, vms=None, dhcp=True):
    """Configure a DHCP server with dnsmasq

    :param server: host where the server will be installed

    :param clients: list of hosts that will be declared in dnsmasq

    :param vms: list of virtual machines
    """
    logger.debug('Installing and configuring a DNS/DHCP server on %s', server)

    test_running = Process('nmap ' + server + ' -p 53 | grep domain')
    test_running.shell = True
    test_running.run()
    if 'open' in test_running.stdout:
        logger.info('DNS server already running, updating configuration')
    else:
        cmd = 'killall dnsmasq; export DEBIAN_FRONTEND=noninteractive ; ' + \
              'apt-get update ; apt-get -y purge dnsmasq-base ; ' + \
              'apt-get install -t wheezy -o Dpkg::Options::="--force-confdef" ' + \
              '-o Dpkg::Options::="--force-confnew" ' + \
              '-y dnsmasq; echo 1 > /proc/sys/net/ipv4/ip_forward '
        SshProcess(cmd, server).run()

    sites = list(set([get_host_site(client) for client in clients
                      if get_host_site(client)] + [get_host_site(server)]))
    add_vms(vms, server)
    if clients:
        kill_dnsmasq = TaktukRemote('killall dnsmasq', clients)
        for p in kill_dnsmasq.processes:
            p.ignore_exit_code = p.nolog_exit_code = True
        kill_dnsmasq.run()
        resolv_conf(server, clients, sites)

    if dhcp:
        sysctl_conf(server, vms)
        dhcp_conf(server, vms, sites)

    logger.debug('Restarting service ...')
    cmd = 'service dnsmasq stop ; rm /var/lib/misc/dnsmasq.leases ; ' + \
          'service dnsmasq start'
    SshProcess(cmd, server).run()
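# Minimal usage sketch for dnsmasq_server (assumptions: executed from inside
# Grid'5000, host names are placeholders, and the vms dicts follow the
# {'id', 'ip', 'mac', 'host'} convention used elsewhere in this module).
#
# server = 'node-1.site.grid5000.fr'
# clients = ['node-2.site.grid5000.fr', 'node-3.site.grid5000.fr']
# vms = [{'id': 'vm-0', 'ip': '10.158.0.1', 'mac': '52:54:00:00:00:01',
#         'host': clients[0]}]
# dnsmasq_server(server, clients=clients, vms=vms, dhcp=True)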
def get_logs_from_server(self):
    distant_file = "/root/MA.stat"
    local_folder = "./results_" + str(self.oargrid_job_id) + "_" + \
                   self.scheduler + "/"
    nb_files = 0
    # Get(self.servers, [distant_file])
    for host in self.servers:
        local_file = local_folder + host + "_" + self.scheduler + "_SeD.stat"
        process = Process("scp root@" + host + ":" + distant_file +
                          " " + local_file)
        process.run()
        try:
            # If the file exists, count it
            with open(local_file):
                nb_files += 1
        except IOError:
            # The file does not exist
            pass
    return nb_files
def get_nb_tasks_server(self):
    distant_file = "/root/dietg/log/total.jobs"
    local_file = "./task_counter"
    # Get(self.servers, [distant_file])
    for host in self.servers:
        process = Process("scp root@" + host + ":" + distant_file +
                          " " + local_file)
        process.run()
        try:
            # If the file exists, parse the number of tasks
            with open(local_file):
                self.nb_tasks[host] = get_nb_tasks(local_file)
        except IOError:
            # The file does not exist
            self.nb_tasks[host] = 0
        try:
            os.remove(local_file)
        except OSError:
            pass
def resolv_conf(server, clients, sites):
    """Generate the resolv.conf pointing to the DNS server and push it
    on the clients."""
    fd, resolv = mkstemp(dir='/tmp/', prefix='resolv_')
    f = fdopen(fd, 'w')
    f.write('domain grid5000.fr\nsearch grid5000.fr ' +
            ' '.join([site + '.grid5000.fr' for site in sites]) +
            '\nnameserver ' + get_server_ip(server))
    f.close()
    TaktukPut(clients, [resolv], remote_location='/etc/').run()
    TaktukRemote('cd /etc && cp ' + resolv.split('/')[-1] + ' resolv.conf',
                 clients).run()
    Process('rm ' + resolv).run()
def wait_hosts_up(hosts, timeout=300):
    """Wait until all the given hosts answer on port 22 (SSH), or until
    timeout (in seconds) is reached.

    Return True if every host came up before the timeout."""
    down_hosts = [x.address if isinstance(x, Host) else x for x in hosts]
    fd, hosts_file = mkstemp(dir='/tmp/', prefix='hosts_')
    f = fdopen(fd, 'w')
    f.write('\n' + '\n'.join(down_hosts))
    f.close()
    timer = Timer()
    while len(down_hosts) > 0 and timer.elapsed() < timeout:
        nmap = Process("nmap -v -oG - -iL %s -p 22 |grep Host|grep Status"
                       % (hosts_file, ), shell=True).run()
        logger.debug('timer: %s \nnmap output: \n%s',
                     timer.elapsed(), nmap.stdout.strip())
        for line in nmap.stdout.strip().split('\n'):
            # Hostnames are reported between parentheses in the grepable output
            s = line.split()[2]
            host = s[s.find("(") + 1:s.find(")")]
            if host in down_hosts:
                logger.detail('%s is up', host)
                down_hosts.remove(host)
    Process('rm ' + hosts_file).run()
    sleep(3)
    return len(down_hosts) == 0
def add_vms(vms, server):
    """Generate the list of virtual machines and declare them in the
    /etc/hosts of the server."""
    logger.debug('Adding the VM in /etc/hosts ...')
    fd, vms_list = mkstemp(dir='/tmp/', prefix='vms_')
    f = fdopen(fd, 'w')
    f.write('\n' + '\n'.join([vm['ip'] + ' \t ' + vm['id'] for vm in vms]))
    f.close()
    Put([server], [vms_list], remote_location='/etc/').run()
    SshProcess('[ -f /etc/hosts.bak ] && cp /etc/hosts.bak /etc/hosts || ' +
               ' cp /etc/hosts /etc/hosts.bak', server).run()
    Remote('cat /etc/' + vms_list.split('/')[-1] + ' >> /etc/hosts',
           [server]).run()
    Process('rm ' + vms_list).run()
def sysctl_conf(server, vms):
    """Change the default value of net.ipv4.neigh.default.gc_thresh*
    to handle a large number of IPs"""
    val = int(2 ** ceil(log(len(vms), 2)))
    conf = "\nnet.ipv4.neigh.default.gc_thresh3 = " + str(3 * val) + \
           "\nnet.ipv4.neigh.default.gc_thresh2 = " + str(2 * val) + \
           "\nnet.ipv4.neigh.default.gc_thresh1 = " + str(val)
    fd, sysctl = mkstemp(dir='/tmp/', prefix='sysctl_')
    f = fdopen(fd, 'w')
    f.write(conf)
    f.close()
    Put([server], [sysctl], remote_location='/etc/').run()
    SshProcess('cd /etc && cat ' + sysctl.split('/')[-1] +
               ' >> sysctl.conf && sysctl -p', server).run()
    Process('rm ' + sysctl).run()
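# Worked example of the gc_thresh sizing above (pure arithmetic, no side
# effects): for 600 VMs, val = 2**ceil(log2(600)) = 1024, so the generated
# sysctl fragment sets gc_thresh1/2/3 to 1024, 2048 and 3072.
#
# from math import ceil, log
# vms = range(600)                        # placeholder list of 600 VMs
# val = int(2 ** ceil(log(len(vms), 2)))  # -> 1024
# print(val, 2 * val, 3 * val)            # -> 1024 2048 3072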
def import_from_kaenv(self, env, remote=None):
    """Import a kadeploy environment from a kadeploy database.

    The env string follows the ``name[@user][:version]`` convention."""
    env = re.match(
        r"^(?P<name>[-_.\w]+)(?:@(?P<user>[_.\w]+))?(?::(?P<version>[_.\w]+))?$",
        env).groupdict("")
    if env['user']:
        env['user'] = "******" + env['user']
    if env['version']:
        env['version'] = " --env-version " + env['version']
    kaenv_cmd = "kaenv3{user}{version} -p {name}".format(**env)
    if remote:
        remote = re.match(
            r"^(?:(?P<user>[-_.\w]+)@)?(?P<address>[-_.\w]+)(?::(?P<port>\d{1,5}))?$",
            remote).groupdict()
        p = SshProcess(kaenv_cmd, Host(**remote))
    else:
        p = Process(kaenv_cmd, shell=True)
    p.run()
    self.desc = yaml.load(p.stdout)
    return self
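# Minimal usage sketch for import_from_kaenv (assumptions: run where kaenv3
# is available; "MyEnv" stands for whatever class defines this method, and
# the environment name and remote frontend below are placeholders).
#
# desc = MyEnv().import_from_kaenv('debian11-nfs@someuser:2').desc
# desc_remote = MyEnv().import_from_kaenv('debian11-nfs',
#                                         remote='login@frontend.site').desc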
def dhcp_conf(server, vms, sites):
    """Generate the dnsmasq.conf with dhcp parameters and
    put it on the server"""
    logger.debug('Creating dnsmasq.conf')
    ip_mac = [(vm['ip'], vm['mac']) for vm in vms]
    dhcp_lease = 'dhcp-lease-max=10000\n'
    dhcp_range = 'dhcp-range=' + ip_mac[0][0] + ',' + \
                 ip_mac[len(vms) - 1][0] + ',12h\n'
    dhcp_router = 'dhcp-option=option:router,' + get_server_ip(server) + '\n'
    dhcp_hosts = '' + '\n'.join(['dhcp-host=' + ip_mac[i][1] + ',' +
                                 vms[i]['id'] + ',' + ip_mac[i][0]
                                 for i in range(len(vms))])
    dhcp_option = 'dhcp-option=option:domain-search,grid5000.fr,' + \
                  ','.join([site + '.grid5000.fr' for site in sites]) + '\n'
    fd, dnsmasq = mkstemp(dir='/tmp/', prefix='dnsmasq_')
    f = fdopen(fd, 'w')
    f.write(dhcp_lease + dhcp_range + dhcp_router + dhcp_hosts +
            '\n' + dhcp_option)
    f.close()
    Put([server], [dnsmasq], remote_location='/etc/').run()
    SshProcess('cd /etc && cp ' + dnsmasq.split('/')[-1] + ' dnsmasq.conf',
               server).run()
    Process('rm ' + dnsmasq).run()
def run(self):
    # Go to the result folder before everything
    os.chdir(self.result_dir)
    jobs = [(_jobID, _site)]
    # Get nodes
    nodes = get_oar_job_nodes(_jobID, _site)
    try:
        logger.info("Creating hostfiles for all combinations...")
        for nbr_node in _nbrNodes:
            hostfile_filename = self.result_dir + '/' + 'hostfile-' + nbr_node
            with open(hostfile_filename, 'w') as hostfile:
                for node in nodes[:int(nbr_node)]:
                    hostfile.write(node.address + '\n')

        spack_process = Process('spack install -v chameleon@trunk+starpu+fxt'
                                ' ^starpu@svn-trunk+fxt')
        spack_process.start()
        spack_process.wait()
        spack_process.kill()
    finally:
        logger.info("Delete job: {}".format(jobs))
        oardel(jobs)
def run(self):
    """Run the experiment"""
    already_configured = self.options.already_configured
    reservation_job_id = int(self.options.reservation_id) \
        if self.options.reservation_id is not None else None
    is_a_test = self.options.is_a_test

    if is_a_test:
        logger.warn('THIS IS A TEST! This run will use only a few '
                    'resources')

    # make the result folder writable for all
    os.chmod(self.result_dir, 0o777)

    # Import configuration
    with open(self.args[0]) as config_file:
        config = json.load(config_file)
    # backup configuration
    copy(self.args[0], self.result_dir)

    site = config["grid5000_site"]
    resources = config["resources"]
    nb_experiment_nodes = config["nb_experiment_nodes"]
    walltime = str(config["walltime"])
    env_name = config["kadeploy_env_name"]
    workloads = config["workloads"]

    # check if the workloads exist (suppose that the same NFS mount point
    # is present on the remote and the local environment)
    for workload_file in workloads:
        with open(workload_file):
            pass
        # copy the workload files to the result dir
        copy(workload_file, self.result_dir)

    # define the workloads parameters
    self.parameters = {'workload_filename': workloads}
    logger.info('Workloads: {}'.format(workloads))

    # define the iterator over the parameters combinations
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))

    # a previous run (using -c result_dir) may have skipped some combinations
    logger.info('Skipped parameters:' +
                '{}'.format(str(self.sweeper.get_skipped())))
    logger.info('Number of parameters combinations {}'.format(
        str(len(self.sweeper.get_remaining()))))
    logger.info('combinations {}'.format(str(self.sweeper.get_remaining())))

    if reservation_job_id is not None:
        jobs = [(reservation_job_id, site)]
    else:
        jobs = oarsub([(OarSubmission(resources=resources,
                                      job_type='deploy',
                                      walltime=walltime), site)])
    job_id, site = jobs[0]
    if job_id:
        try:
            logger.info("waiting job start %s on %s" % (job_id, site))
            wait_oar_job_start(job_id, site,
                               prediction_callback=prediction_callback)
            logger.info("getting nodes of %s on %s" % (job_id, site))
            nodes = get_oar_job_nodes(job_id, site)
            # sort the nodes
            nodes = sorted(nodes, key=lambda node: node.address)
            # get only the necessary nodes under the switch
            if nb_experiment_nodes > len(nodes):
                raise RuntimeError('The number of nodes in the reservation '
                                   '({}) does not match the requested '
                                   'resources ({})'.format(
                                       len(nodes), nb_experiment_nodes))
            nodes = nodes[:nb_experiment_nodes]
            logger.info("deploying nodes: {}".format(str(nodes)))
            deployed, undeployed = deploy(
                Deployment(nodes, env_name=env_name),
                check_deployed_command=already_configured)
            if undeployed:
                logger.warn("NOT deployed nodes: {}".format(str(undeployed)))
                raise RuntimeError('Deployment failed')

            if not already_configured:

                # install OAR
                install_cmd = "apt-get update; apt-get install -y "
                node_packages = "oar-node"
                logger.info("installing OAR nodes: {}".format(str(nodes[1:])))
                install_oar_nodes = Remote(
                    install_cmd + node_packages, nodes[1:],
                    connection_params={'user': '******'})
                install_oar_nodes.start()

                server_packages = ("oar-server oar-server-pgsql oar-user "
                                   "oar-user-pgsql postgresql python3-pip "
                                   "libjson-perl postgresql-server-dev-all")
                install_oar_sched_cmd = """
                mkdir -p /opt/oar_sched; \
                cd /opt/oar_sched; \
                git clone https://github.com/oar-team/oar3.git; \
                cd oar3; \
                git checkout dce942bebc2; \
                pip3 install -e .; \
                cd /usr/lib/oar/schedulers; \
                ln -s /usr/local/bin/kamelot; \
                pip3 install psycopg2
                """
                logger.info("installing OAR server node: {}".format(
                    str(nodes[0])))
                install_master = SshProcess(
                    install_cmd + server_packages + ";" +
                    install_oar_sched_cmd,
                    nodes[0],
                    connection_params={'user': '******'})
                install_master.run()
                install_oar_nodes.wait()

                if not install_master.ok:
                    Report(install_master)

                configure_oar_cmd = """
                sed -i \
                    -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                    -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                    -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                    -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                    -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                    -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                    -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                    -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                    /etc/oar/oar.conf
                """.format(result_dir=self.result_dir)
                configure_oar = Remote(configure_oar_cmd, nodes,
                                       connection_params={'user': '******'})
                configure_oar.run()
                logger.info("OAR is configured on all nodes")

                # Configure server
                create_db = "oar-database --create --db-is-local"
                config_oar_sched = ("oarnotify --remove-queue default;"
                                    "oarnotify --add-queue default,1,kamelot")
                start_oar = "systemctl start oar-server.service"
                logger.info("configuring OAR database: {}".format(
                    str(nodes[0])))
                config_master = SshProcess(
                    create_db + ";" + config_oar_sched + ";" + start_oar,
                    nodes[0],
                    connection_params={'user': '******'})
                config_master.run()

                # propagate SSH keys
                logger.info("configuring OAR SSH")
                oar_key = "/tmp/.ssh"
                Process('rm -rf ' + oar_key).run()
                Process(
                    'scp -o BatchMode=yes -o PasswordAuthentication=no '
                    '-o StrictHostKeyChecking=no '
                    '-o UserKnownHostsFile=/dev/null '
                    '-o ConnectTimeout=20 -rp -o User=root ' +
                    nodes[0].address + ":/var/lib/oar/.ssh"
                    ' ' + oar_key).run()
                # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key],
                #     connection_params={'user': '******'}).run()
                Put(nodes[1:], [oar_key], "/var/lib/oar/",
                    connection_params={'user': '******'}).run()

                add_resources_cmd = """
                oarproperty -a cpu || true; \
                oarproperty -a core || true; \
                oarproperty -c -a host || true; \
                oarproperty -a mem || true; \
                """
                for node in nodes[1:]:
                    add_resources_cmd = add_resources_cmd + \
                        "oarnodesetting -a -h {node} -p host={node} " \
                        "-p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(
                            node=node.address)

                add_resources = SshProcess(
                    add_resources_cmd, nodes[0],
                    connection_params={'user': '******'})
                add_resources.run()

                if add_resources.ok:
                    logger.info("oar is now configured!")
                else:
                    raise RuntimeError(
                        "error in the OAR configuration: Abort!")

            # TODO: back up the OAR configuration

            # Do the replay
            logger.info('beginning the replay')
            while len(self.sweeper.get_remaining()) > 0:
                combi = self.sweeper.get_next()
                workload_file = os.path.basename(combi['workload_filename'])
                oar_replay = SshProcess(
                    script_path + "/oar_replay.py " +
                    combi['workload_filename'] + " " +
                    self.result_dir + " oar_gant_" + workload_file,
                    nodes[0])
                oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                  workload_file + '.out')
                logger.info("replaying workload: {}".format(combi))
                oar_replay.run()
                if oar_replay.ok:
                    logger.info("Replay workload OK: {}".format(combi))
                    self.sweeper.done(combi)
                else:
                    logger.info("Replay workload NOT OK: {}".format(combi))
                    self.sweeper.cancel(combi)
                    raise RuntimeError("error in the OAR replay: Abort!")

        except:
            traceback.print_exc()
            ipdb.set_trace()

        finally:
            if is_a_test:
                ipdb.set_trace()
            if reservation_job_id is None:
                logger.info("delete job: {}".format(jobs))
                oardel(jobs)
def run(self):
    # Go to the result folder before everything
    os.chdir(self.result_dir)

    # STARPU INSTALLATION
    spack_spec = 'chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt'
    spack_command = 'spack install -v' + ' ' + spack_spec
    logger.info("Starting StarPU installation...")
    spack_process = Process(spack_command).start()
    spack_process.wait()
    logger.info("StarPU installation DONE...")
    checkProcess(spack_process)
    spack_process.kill()

    # STARPU DIRECTORY
    logger.info("Searching and going to StarPU installation directory...")
    starpu_location_process = Process('spack location -i ' +
                                      spack_spec).start()
    starpu_location_process.wait()
    checkProcess(starpu_location_process)
    starpu_cd_process = Process('cd ' + starpu_location_process.stdout +
                                '/lib/chameleon').start()
    starpu_cd_process.wait()
    checkProcess(starpu_cd_process)
    starpu_location_process.kill()
    starpu_cd_process.kill()

    # RUNNING EXPERIMENT
    logger.info("Starting StarPU experiment...")
    starpu_experiment_process = Process("""
export STARPU_WORKER_STATS=1
export STARPU_CALIBRATE=2
./timing/time_spotrf_tile --warmup --gpus=3 --threads=9 --nb=960 --ib=96 --n_range=48000:48000:9600
""")
    # create output file for StarPU
    starpu_experiment_process.stdout_handlers.append(self.result_dir + '/' +
                                                     'StarPU.out')
    starpu_experiment_process.start()
    starpu_experiment_process.wait()
    logger.info("StarPU experiment DONE...")
    checkProcess(starpu_experiment_process)
    starpu_experiment_process.kill()
def create_subdir(base_dir, sub_dir):
    dir_path = "{}/{}".format(base_dir, sub_dir)
    mkdir_cmd = "mkdir -p {}".format(dir_path)
    Process(mkdir_cmd).run()
    return dir_path
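# Minimal usage sketch for create_subdir (assumption: the base directory is
# writable by the current user; paths are placeholders).
#
# result_dir = create_subdir('/tmp/experiments', 'run_01')
# logger.info('Results will be stored in %s', result_dir)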
def wait_vms_have_started(vms, restart=True):
    """Scan port 22 on all vms, distributed on hosts"""
    # Create a file with the list of VM IPs
    fd, tmpfile = tempfile.mkstemp(prefix='vmips')
    f = fdopen(fd, 'w')
    for vm in vms:
        f.write(vm['ip'] + '\n')
    f.close()
    # Get the list of hosts
    hosts = list(set([vm['host'] for vm in vms]))
    hosts.sort()
    # Push the file on all hosts
    TaktukPut(hosts, [tmpfile]).run()
    logger.debug(pformat(hosts))
    # Split the nmap scan among the hosts
    n_vm_scan = ceil(len(vms) / len(hosts)) + 1
    cmds = []
    for i in range(len(hosts)):
        start = str(int(i * n_vm_scan))
        end = str(int((i + 1) * n_vm_scan))
        cmds.append("awk 'NR>=" + start + " && NR<" + end + "' " +
                    tmpfile.split('/')[-1] + " > nmap_file ; " +
                    "nmap -v -oG - -iL nmap_file -p 22")
    logger.debug('%s', pformat(cmds))
    nmap = TaktukRemote('{{cmds}}', hosts)
    nmap_tries = 0
    all_up = False
    started_vms = []
    old_started = started_vms[:]
    while (not all_up) and nmap_tries < 10:
        sleep(15)
        logger.detail('nmap_tries %s', nmap_tries)
        nmap.run()
        for p in nmap.processes:
            for line in p.stdout.split('\n'):
                if 'Status' in line:
                    split_line = line.split(' ')
                    ip = split_line[1]
                    state = split_line[3].strip()
                    if state == 'Up':
                        vm = [vm for vm in vms if vm['ip'] == ip]
                        if len(vm) > 0:
                            vm[0]['state'] = 'OK'
        started_vms = [vm for vm in vms if vm['state'] == 'OK']
        all_up = len(started_vms) == len(vms)
        if started_vms != old_started:
            old_started = started_vms
        else:
            if restart:
                restart_vms([vm for vm in vms if vm['state'] == 'KO'])
        nmap_tries += 1
        if nmap_tries == 1:
            activate_vms([vm for vm in vms if vm['state'] == 'KO'])
        if not all_up:
            logger.info(str(nmap_tries) + ': ' + str(len(started_vms)) +
                        '/' + str(len(vms)))
        nmap.reset()

    TaktukRemote('rm ' + tmpfile.split('/')[-1], hosts).run()
    Process('rm ' + tmpfile).run()
    if all_up:
        logger.info('All VMs have been started')
        return True
    else:
        logger.error('Not all VMs have been started')
        return False
def workflow(self, comb):
    """Compute one application launch using a given parameter group"""
    comb_ok = False
    try:
        # Generate configuration file needed by MPI processes
        logger.info("Generating assembly file...")
        py = comb['cores'] / comb['px']
        prepare = Process('cd %s && python %s %d %d %d %d %d %s app.lad' %
                          (self.workingPath, self.genLadScript,
                           comb['datasize'], comb['datasize'],
                           comb['datasize'], comb['px'], py,
                           comb['transposition']))
        prepare.shell = True
        prepare.run()

        # Generate the MPI host file
        mfile = self.generate_machine_file()

        # Start L2C
        lad = "./app.lad"
        logger.info("Computing...")
        res = Process("export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && "
                      "l2c_loader -M,-machinefile,%s --mpi -c %d %s" %
                      (self.workingPath, mfile, comb['cores'], lad))
        res.shell = True
        res.stdout_handlers.append(os.path.join(self.result_dir,
                                                slugify(comb) + '.out'))
        res.stdout_handlers.append(sys.stdout)
        res.stderr_handlers.append(os.path.join(self.result_dir,
                                                slugify(comb) + '.err'))
        res.stderr_handlers.append(sys.stderr)
        res.run()
        if not res.ok:
            logger.error('Bad L2C termination')
            raise Exception('Bad L2C termination')
        if len(res.stderr) > 0:
            # WARNING: happens when L2C cannot find the LAD file or
            # something similarly strange
            logger.warning('Not empty error output')

        # Clean configuration files
        logger.info("Removing assembly files...")
        res = Process('cd %s && rm -f app.lad*' % self.workingPath)
        res.shell = True
        res.run()

        comb_ok = True
    except Exception:
        pass
    finally:
        if comb_ok:
            self.sweeper.done(comb)
            logger.info(style.host(slugify(comb)) + ' has been done')
        else:
            self.sweeper.cancel(comb)
            logger.warning(style.host(slugify(comb)) + ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
    tmp_vms.append(p.host.address)
    processes.append(p.stdout.strip())
    vms_proc[p.host.address] = p.stdout.strip()

init_limit = TaktukRemote('cpulimit -p {{processes}} -l 1', tmp_vms)
for p in init_limit.processes:
    p.nolog_exit_code = p.ignore_exit_code = True
init_limit.start()

logger.info('Configuring events generator')
n_host = 0
f = open('hosts.list')
for line in f:
    n_host += 1
f.close()

sed_time = Process('sed -i "s/simulator.duration.*/simulator.duration = ' +
                   str(1000) + '/g"' +
                   ' load_events_generator/config/simulator.properties').run()
sed_nodes = Process('sed -i "s/nodes.number.*/nodes.number = ' +
                    str(n_host) + '/g"' +
                    ' load_events_generator/config/simulator.properties').run()
sed_vms = Process('sed -i "s/vm.number.*/vm.number = ' + str(len(vms)) +
                  '/g"' +
                  ' load_events_generator/config/simulator.properties').run()

logger.info('Generating events list')
gen_events = Process('cd load_events_generator ; ' +
                     'java -jar load_events_generator.jar vms.list ' +
                     '> ../events_load.xml')
gen_events.shell = True
gen_events.run()
def run(self):
    # Go to the result folder before everything
    os.chdir(self.result_dir)

    # OARSUB
    jobs = oarsub([(OarSubmission(resources='nodes=' + str(_nbrNodes),
                                  job_type='deploy',
                                  walltime=_walltime,
                                  sql_properties=_properties), _site)])
    job_id, site = jobs[0]

    try:
        # KADEPLOY
        logger.info("Waiting job start %s on %s" % (job_id, site))
        wait_oar_job_start(job_id, site,
                           prediction_callback=prediction_callback)
        logger.info("getting nodes of %s on %s" % (job_id, site))
        nodes = get_oar_job_nodes(job_id, site)

        deployed, undeployed = deploy(
            Deployment(nodes, env_name=env_name),
            check_deployed_command=already_configured)
        if undeployed:
            logger.warn("NOT deployed nodes: {}".format(str(undeployed)))
            raise RuntimeError('Deployment failed')

        # STARPU INSTALLATION
        spack_spec = 'chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt'
        spack_command = 'spack install -v' + ' ' + spack_spec
        logger.info("Starting StarPU installation...")
        spack_process = Process(spack_command).start()
        spack_process.wait()
        logger.info("StarPU installation DONE...")
        self.checkProcess(spack_process)
        spack_process.kill()

        # STARPU DIRECTORY
        logger.info("Searching and going to StarPU installation directory...")
        starpu_location_process = Process('spack location -i ' +
                                          spack_spec).start()
        starpu_location_process.wait()
        self.checkProcess(starpu_location_process)
        starpu_cd_process = Process('cd ' + starpu_location_process.stdout +
                                    '/lib/chameleon').start()
        starpu_cd_process.wait()
        self.checkProcess(starpu_cd_process)
        starpu_location_process.kill()
        starpu_cd_process.kill()

        # RUNNING EXPERIMENT
        logger.info("Starting StarPU experiment...")
        starpu_experiment_process = Process("""
export STARPU_WORKER_STATS=1
export STARPU_CALIBRATE=2
./timing/time_spotrf_tile --warmup --gpus=3 --threads=9 --nb=960 --ib=96 --n_range=48000:48000:9600
""")
        # create output file for StarPU
        starpu_experiment_process.stdout_handlers.append(self.result_dir +
                                                         '/' + 'StarPU.out')
        starpu_experiment_process.start()
        starpu_experiment_process.wait()
        logger.info("StarPU experiment DONE...")
        self.checkProcess(starpu_experiment_process)
        starpu_experiment_process.kill()
    finally:
        logger.info("Delete job: {}".format(jobs))
        oardel(jobs)
logger.info('Configuring events generator')
n_host = 0
f = open('hosts.list')
for line in f:
    n_host += 1
f.close()

sed_time = Process('sed -i "s/simulator.duration.*/simulator.duration = ' +
                   str(1000) + '/g"' +
                   ' load_events_generator/config/simulator.properties').run()
sed_nodes = Process('sed -i "s/nodes.number.*/nodes.number = ' +
                    str(n_host) + '/g"' +
                    ' load_events_generator/config/simulator.properties').run()
sed_vms = Process('sed -i "s/vm.number.*/vm.number = ' + str(len(vms)) +
                  '/g"' +
                  ' load_events_generator/config/simulator.properties').run()

logger.info('Generating events list')
gen_events = Process('cd load_events_generator ; ' +
                     'java -jar load_events_generator.jar vms.list ' +
                     '> ../events_load.xml')
gen_events.shell = True
gen_events.run()

tree = ET.parse('events_load.xml')
root = tree.getroot()
events = {}
for event in root.findall('./event'):
    events[int(round(float(event.get('time'))))] = {'vm': event.get('target'),
                                                    'load': event.get('value')}


def set_cpu_load(load, vm_ip, pid):
    """Use cpulimit to change process intensity on vm"""
    logger.info('kill cpu_limit on %s and set it to %s', vm_ip, load)
    kill_cpu_limit = SshProcess('ps aux| grep "cpulimit" | grep -v "grep" | awk \'{print $2}\' | xargs -r kill -9',
def run(self):
    csvr = RevisionsReader(csv_file)  # Launch CSV reader
    csvr_abstract = RevisionsReader(csv_file_abstract)
    os.chdir(self.result_dir)  # Go to result directory before everything
    while True:
        try:
            csvr.next()
            csvr_abstract.next()

            chameleon_name = csvr_abstract.name() + '_' + \
                csvr_abstract.chameleonBranch() + '_' + \
                csvr_abstract.chameleonRevision() + '_' + \
                csvr_abstract.command()
            starpu_name = csvr_abstract.name() + '_' + \
                csvr_abstract.starpuBranch() + '_' + \
                csvr_abstract.starpuRevision() + '_' + \
                csvr_abstract.command()
            global_name = csvr_abstract.name() + \
                '_chameleon_' + csvr_abstract.chameleonBranch() + '_' + \
                csvr_abstract.chameleonRevision() + \
                '_starpu_' + csvr_abstract.starpuBranch() + '_' + \
                csvr_abstract.starpuRevision() + \
                '_' + csvr_abstract.command()

            logger.info("Starting experiment %s" % (global_name))

            spack_spec = 'chameleon@' + chameleon_name + \
                ' +starpu+fxt ^starpu@' + starpu_name + ' +fxt'

            # FOLDER CREATION
            folder_name = 'chameleon_' + csvr_abstract.chameleonBranch() + \
                '_' + csvr_abstract.chameleonRevision() + \
                '_starpu_' + csvr_abstract.starpuBranch() + \
                '_' + csvr_abstract.starpuRevision()
            folder = self.result_dir + '/' + folder_name
            try:
                os.mkdir(folder, 0o764)
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise exc

            # STARPU INSTALLATION
            logger.info("Starting StarPU installation")
            spack_process = Process('spack -d install' + ' ' + spack_spec)
            if not os.path.isfile(folder + '/' + 'compil_' + folder_name):
                # create output file for StarPU installation
                spack_process.stdout_handlers.append(
                    folder + '/' + 'compil_' + folder_name)
            spack_process.start()
            spack_process.wait()
            logger.info("StarPU installation DONE")
            is_ok = self.checkProcess(spack_process)
            spack_process.kill()
            if not is_ok:
                continue  # stop this experiment

            # STARPU DIRECTORY
            logger.info("Searching and going to StarPU installation directory")
            starpu_location_process = Process('spack location -i' + ' ' +
                                              spack_spec).start()
            starpu_location_process.wait()
            is_ok = self.checkProcess(starpu_location_process)
            # remove trailing end of line
            starpu_path = starpu_location_process.stdout.replace("\n", "")
            starpu_cd = 'cd' + ' ' + starpu_path + '/lib/chameleon/'
            starpu_location_process.kill()
            if not is_ok:
                continue  # stop this experiment

            # RUNNING EXPERIMENT
            logger.info("Starting StarPU experiment")
            starpu_experiment_process = Process(starpu_cd + '\n' +
                                                csvr.command(), shell=True)
            # create output and error files for StarPU execution
            starpu_experiment_process.stdout_handlers.append(
                folder + '/' + 'stdout_' + global_name)
            starpu_experiment_process.stderr_handlers.append(
                folder + '/' + 'stderr_' + global_name)
            starpu_experiment_process.start()
            starpu_experiment_process.wait()
            logger.info("StarPU experiment DONE")
            is_ok = self.checkProcess(starpu_experiment_process)
            starpu_experiment_process.kill()
            if not is_ok:
                continue  # stop this experiment
        except StopIteration:
            break