def takeMetric(self, path, startTime, endTime,
               metrics=('cpu', 'mem', 'disk', 'swap', 'network')):
    """Collect sar metrics from every host of the cluster between
    startTime and endTime and store them in per-host text files."""
    # sar option for each supported metric
    sar_opts = {'cpu': 'u', 'mem': 'r', 'disk': 'dp',
                'swap': 'S', 'network': 'n DEV'}
    cmd_template_sar = ("sar -f /var/log/sysstat/sa* -{opt} "
                        "-s {startTime} -e {endTime}")
    for met in metrics:
        cmd = cmd_template_sar.format(opt=sar_opts[met],
                                      startTime=startTime,
                                      endTime=endTime)
        for host in self.cluster:
            hE = SshProcess(cmd, host, connection_params={'user': '******'})
            hE.run()
            stdMetric = host + '-' + met + '.txt'
            with open(os.path.join(path, stdMetric), "w") as sout:
                sout.write(hE.stdout)
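# A quick, locally runnable sketch of the sar command strings that
# takeMetric generates; the time window and metric subset are illustrative:
sar_opts = {'cpu': 'u', 'mem': 'r', 'disk': 'dp',
            'swap': 'S', 'network': 'n DEV'}
template = "sar -f /var/log/sysstat/sa* -{opt} -s {start} -e {end}"
for met in ('cpu', 'network'):
    print(template.format(opt=sar_opts[met],
                          start='09:00:00', end='10:00:00'))
# -> sar -f /var/log/sysstat/sa* -u -s 09:00:00 -e 10:00:00
# -> sar -f /var/log/sysstat/sa* -n DEV -s 09:00:00 -e 10:00:00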
def _munin_server(server, clients):
    """Install the munin monitoring service.

    Must be executed inside Grid'5000 to be able to resolve the server
    and clients IP.

    :param server: an execo.Host

    :param clients: a list of execo.Host
    """
    logger.info('Munin monitoring service installation, server = %s, '
                'clients = \n %s', server.address,
                [host.address for host in clients])

    logger.debug('Configuring munin server %s', style.host(server.address))
    cmd = 'export DEBIAN_FRONTEND=noninteractive ; ' \
          'apt-get update && apt-get install -y munin'
    SshProcess(cmd, server).run()

    logger.debug('Creating configuration files for server')
    fd, server_conf = mkstemp(dir='/tmp/', prefix='munin-nodes_')
    f = fdopen(fd, 'w')
    for host in clients:
        get_ip = Process('host ' + host.address).run()
        ip = get_ip.stdout.strip().split(' ')[3]
        f.write('[' + host.address + ']\n address ' + ip +
                '\n use_node_name yes\n\n')
    f.close()
    Put([server], [server_conf], remote_location='/etc/').run()
    SshProcess('cd /etc && cp ' + server_conf.split('/')[-1] +
               ' munin.conf', server).run()
    Process('rm ' + server_conf).run()
def run_xp(self):
    """Iterate over the parameter combinations and execute the bench"""
    while len(self.sweeper.get_remaining()) > 0:
        comb = self.sweeper.get_next()
        if comb['n_core'] > get_host_attributes(
                comb['cluster'] + '-1')['architecture']['smt_size'] \
                * self.n_nodes:
            self.sweeper.skip(comb)
            continue
        logger.info('Processing new combination %s' % (comb,))
        site = get_cluster_site(comb['cluster'])
        jobs = oarsub([(OarSubmission(
            resources="{cluster='" + comb['cluster'] + "'}/nodes=" +
                      str(self.n_nodes),
            job_type='allow_classic_ssh',
            walltime='0:10:00'), site)])
        if jobs[0][0]:
            try:
                wait_oar_job_start(*jobs[0])
                nodes = get_oar_job_nodes(*jobs[0])
                bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                    ",".join([node.address for node in nodes]),
                    comb['n_core'],
                    get_mpi_opts(comb['cluster']),
                    comb['size'],
                    comb['n_core'])
                lu_bench = SshProcess(bench_cmd, nodes[0])
                lu_bench.stdout_handlers.append(self.result_dir + '/' +
                                                slugify(comb) + '.out')
                lu_bench.run()
                if lu_bench.ok:
                    logger.info("comb ok: %s" % (comb,))
                    self.sweeper.done(comb)
                    continue
            finally:
                oardel(jobs)
        logger.info("comb NOT ok: %s" % (comb,))
        self.sweeper.cancel(comb)
def set_cpu_load(load, vm_ip, pid):
    """Use cpulimit to change process intensity on vm"""
    logger.info('kill cpu_limit on %s and set it to %s', vm_ip, load)
    SshProcess('ps aux | grep "cpulimit" | grep -v "grep" | '
               'awk \'{print $2}\' | xargs -r kill -9', vm_ip).run()
    start_cpu_limit = SshProcess(
        'cpulimit -p ' + str(pid) + ' -l ' + str(load), vm_ip)
    start_cpu_limit.nolog_exit_code = start_cpu_limit.ignore_exit_code = True
    start_cpu_limit.start()
def get_memory_and_cores(self, host):
    """Obtain the total available memory in MB and the number of cores of
    the given host. Supported systems include Linux and Mac OS X.

    On Linux it uses nproc and /proc/meminfo to obtain the information.
    On Mac OS X it uses system_profiler.

    Args:
      host (Host): The host to query.

    Returns:
      tuple of (int, int):
        A tuple containing the host available memory in MB and its
        number of cores.
    """
    linux_cmd = "nproc && " \
                "cat /proc/meminfo | grep MemTotal | awk '{ print $2,$3 }'"
    mac_cmd = "system_profiler SPHardwareDataType | grep Cores | " \
              "awk '{ print $NF }' && " \
              "system_profiler SPHardwareDataType | grep Memory | " \
              "awk '{ print $2,$3 }'"
    undef_str = "?"
    undef_cmd = "echo '" + undef_str + "'"
    command = 'if [ $(uname) == "Linux" ]; then %s; ' \
              'elif [ $(uname) == "Darwin" ]; then %s; ' \
              'else %s; fi' % (linux_cmd, mac_cmd, undef_cmd)

    proc = SshProcess(command, host)
    proc.run()
    out = proc.stdout.strip()

    if out == undef_str:
        return None
    else:
        units = {
            "kb": lambda x: int(x) // 1024,
            "mb": lambda x: int(x),
            "gb": lambda x: int(x) * 1024,
        }
        (cores_str, mem_str) = out.splitlines()
        cores = int(cores_str)
        (num, unit) = mem_str.split()
        mem = units[unit.lower()](num)
        return mem, cores
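# A minimal local check of the unit-conversion logic above, on a canned
# two-line output ("<cores>\n<MemTotal value> <unit>") like the Linux
# branch produces; the sample numbers are made up:
sample_out = "8\n16384256 kB"
units = {"kb": lambda x: int(x) // 1024,
         "mb": lambda x: int(x),
         "gb": lambda x: int(x) * 1024}
cores_str, mem_str = sample_out.splitlines()
num, unit = mem_str.split()
assert int(cores_str) == 8
assert units[unit.lower()](num) == 16000  # 16384256 kB // 1024 -> 16000 MB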
def bootstrap(self, tar_file):
    """Install Cassandra in all cluster nodes from the specified tar.gz
    file.

    Args:
      tar_file (str): The file containing Cassandra binaries.
    """
    # 0. Check that required packages are present
    required_packages = "openjdk-7-jre openjdk-7-jdk"
    check_packages = TaktukRemote("dpkg -s " + required_packages,
                                  self.hosts)
    for p in check_packages.processes:
        p.nolog_exit_code = p.nolog_error = True
    check_packages.run()
    if not check_packages.ok:
        logger.info("Packages not installed, trying to install")
        install_packages = TaktukRemote(
            "export DEBIAN_FRONTEND=noninteractive ; " +
            "apt-get update && apt-get install -y --force-yes " +
            required_packages, self.hosts).run()
        if not install_packages.ok:
            logger.error("Unable to install the packages")

    get_java_home = SshProcess('echo $(readlink -f /usr/bin/javac | '
                               'sed "s:/bin/javac::")', self.master)
    get_java_home.run()
    self.java_home = get_java_home.stdout.strip()

    logger.info("All required packages are present")

    # 1. Copy the Cassandra tar file and uncompress it
    logger.info("Copy " + tar_file + " to hosts and uncompress")
    rm_dirs = TaktukRemote("rm -rf " + self.base_dir +
                           " " + self.conf_dir +
                           " " + self.logs_dir,
                           self.hosts)
    put_tar = TaktukPut(self.hosts, [tar_file], "/tmp")
    tar_xf = TaktukRemote("tar xf /tmp/" + os.path.basename(tar_file) +
                          " -C /tmp", self.hosts)
    SequentialActions([rm_dirs, put_tar, tar_xf]).run()

    # 2. Move installation to base dir and create other dirs
    logger.info("Create installation directories")
    mv_base_dir = TaktukRemote(
        "mv /tmp/" + os.path.basename(tar_file).replace(".tar.gz", "") +
        " " + self.base_dir, self.hosts)
    mkdirs = TaktukRemote("mkdir -p " + self.conf_dir +
                          " && mkdir -p " + self.logs_dir,
                          self.hosts)
    chmods = TaktukRemote("chmod g+w " + self.base_dir +
                          " && chmod g+w " + self.conf_dir +
                          " && chmod g+w " + self.logs_dir,
                          self.hosts)
    SequentialActions([mv_base_dir, mkdirs, chmods]).run()
def get_server_iface(server):
    """Get the default network interface of the server"""
    logger.debug('Retrieving default interface from %s',
                 style.host(server.address))
    get_if = SshProcess('ip route | grep default | cut -d " " -f 5',
                        server).run()
    return get_if.stdout.strip()
def _enable_bridge(self, name='br0'):
    """We need a bridge to have automatic DHCP configuration for the VM."""
    logger.detail('Configuring the bridge')
    hosts_br = self._get_bridge(self.hosts)
    nobr_hosts = []
    for host, br in hosts_br.iteritems():
        if br is None:
            logger.debug('No bridge on host %s', style.host(host))
            nobr_hosts.append(host)
        elif br != name:
            logger.debug('Wrong bridge on host %s, destroying it',
                         style.host(host))
            SshProcess('ip link set ' + br + ' down ; brctl delbr ' + br,
                       host).run()
            nobr_hosts.append(host)
        else:
            logger.debug('Bridge %s is present on host %s',
                         style.emph(name), style.host(host))

    nobr_hosts = map(lambda x: x.address if isinstance(x, Host) else x,
                     nobr_hosts)

    if len(nobr_hosts) > 0:
        logger.debug('Creating bridge on %s', hosts_list(nobr_hosts))
        script = 'export br_if=`ip route |grep default |cut -f 5 -d " "`; \n' + \
            'ifdown $br_if ; \n' + \
            'sed -i "s/$br_if inet dhcp/$br_if inet manual/g" /etc/network/interfaces ; \n' + \
            'sed -i "s/auto $br_if//g" /etc/network/interfaces ; \n' + \
            'echo " " >> /etc/network/interfaces ; \n' + \
            'echo "auto ' + name + '" >> /etc/network/interfaces ; \n' + \
            'echo "iface ' + name + ' inet dhcp" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_ports $br_if" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_stp off" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_maxwait 0" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_fd 0" >> /etc/network/interfaces ; \n' + \
            'ifup ' + name
        fd, br_script = mkstemp(dir='/tmp/', prefix='create_br_')
        f = fdopen(fd, 'w')
        f.write(script)
        f.close()
        self.fact.get_fileput(nobr_hosts, [br_script]).run()
        self.fact.get_remote('nohup sh ' + br_script.split('/')[-1],
                             nobr_hosts).run()

        logger.debug('Waiting for network restart')
        if_up = False
        nmap_tries = 0
        while (not if_up) and nmap_tries < 20:
            sleep(20)
            nmap_tries += 1
            nmap = Process('nmap ' +
                           ' '.join([host for host in nobr_hosts]) +
                           ' -p 22').run()
            for line in nmap.stdout.split('\n'):
                if 'Nmap done' in line:
                    if_up = line.split()[2] == line.split()[5].replace('(', '')
        logger.debug('Network has been restarted')
    logger.detail('All hosts have the bridge %s', style.emph(name))
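# For reference, a locally runnable sketch of the /etc/network/interfaces
# stanza the script above appends; the bridge and interface names are
# illustrative ($br_if is resolved on the host at run time):
bridge_stanza = '\n'.join([
    'auto br0',
    'iface br0 inet dhcp',
    ' bridge_ports eth0',
    ' bridge_stp off',
    ' bridge_maxwait 0',
    ' bridge_fd 0'])
print(bridge_stanza)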
def dnsmasq_server(server, clients=None, vms=None, dhcp=True):
    """Configure a DHCP server with dnsmasq

    :param server: host where the server will be installed

    :param clients: list of hosts that will be declared in dnsmasq

    :param vms: list of virtual machines

    :param dhcp: whether to configure the DHCP part (default True)
    """
    logger.debug('Installing and configuring a DNS/DHCP server on %s',
                 server)
    if clients is None:
        clients = []

    test_running = Process('nmap ' + server + ' -p 53 | grep domain')
    test_running.shell = True
    test_running.run()
    if 'open' in test_running.stdout:
        logger.info('DNS server already running, updating configuration')
    else:
        cmd = 'killall dnsmasq; export DEBIAN_FRONTEND=noninteractive ; ' + \
              'apt-get update ; apt-get -y purge dnsmasq-base ; ' + \
              'apt-get install -t wheezy -o Dpkg::Options::="--force-confdef" ' + \
              '-o Dpkg::Options::="--force-confnew" ' + \
              '-y dnsmasq; echo 1 > /proc/sys/net/ipv4/ip_forward '
        SshProcess(cmd, server).run()

    sites = list(set([get_host_site(client) for client in clients
                      if get_host_site(client)] + [get_host_site(server)]))
    add_vms(vms, server)
    if clients:
        kill_dnsmasq = TaktukRemote('killall dnsmasq', clients)
        for p in kill_dnsmasq.processes:
            p.ignore_exit_code = p.nolog_exit_code = True
        kill_dnsmasq.run()
        resolv_conf(server, clients, sites)

    if dhcp:
        sysctl_conf(server, vms)
        dhcp_conf(server, vms, sites)

    logger.debug('Restarting service ...')
    cmd = 'service dnsmasq stop ; rm /var/lib/misc/dnsmasq.leases ; ' + \
          'service dnsmasq start'
    SshProcess(cmd, server).run()
def runMpi(cmd):
    """Run an MPI command on the master node and log its output locally.

    Relies on the module-level `master` host and `curPath` output
    directory.
    """
    act = SshProcess(cmd, master,
                     connection_params={'user': '******'}, shell=True)
    act.run()

    if not os.path.exists(curPath):
        os.makedirs(curPath)
    with open(os.path.join(curPath, "stdout.txt"), "a+") as sout, \
            open(os.path.join(curPath, "stderr.txt"), "w") as serr:
        sout.write(act.stdout)
        serr.write(act.stderr)
    return act.ok
def restart_vms(vms):
    """Start the VMs that are not already running on their host."""
    hosts = [vm['host'] for vm in vms]
    running_vms = list_vm(hosts)
    for vm in vms:
        if {'id': vm['id']} not in running_vms[vm['host']]:
            logger.info('%s has not been started on %s, starting it',
                        style.vm(vm['id']), style.host(vm['host']))
            SshProcess('virsh start ' + vm['id'], vm['host']).run()
def define_parameters(self):
    """Build the parameters and the n_core lists for each benchmark."""
    freqList = [2534000, 2000000, 1200000]
    n_nodes = float(len(self.cluster))
    max_core = SshProcess('cat /proc/cpuinfo | grep -i processor | wc -l',
                          self.cluster[0],
                          connection_params={'user': '******'}).run().stdout
    max_core = n_nodes * float(max_core)
    # despite its name, `even` holds the powers of two in (n_nodes, max_core)
    even = filter(lambda i: i > n_nodes,
                  list(takewhile(lambda i: i < max_core,
                                 (2 ** i for i in count(0, 1)))))
    # `powerTwo` holds the perfect squares in the same interval, as
    # required by the NPB bt/sp benchmarks
    powerTwo = filter(lambda i: i > n_nodes,
                      list(takewhile(lambda i: i < max_core,
                                     (i ** 2 for i in count(0, 1)))))

    # Define parameters
    self.parameters = {
        'Repeat': [1],
        "Freq": [2534000],
        "NPBclass": ['C'],
        "Benchmark": {
            # 'ft': {'n_core': even},
            # 'ep': {'n_core': even},
            # 'lu': {'n_core': even},
            # 'is': {'n_core': even},
            # 'sg': {'n_core': even},
            # 'bt': {'n_core': powerTwo},
            'sp': {'n_core': powerTwo}
        }
    }
    logger.info(self.parameters)

    # make all possible parameters object
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))
    logger.info('Number of parameters combinations %s',
                len(self.sweeper.get_remaining()))
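# The two candidate core-count lists above are built with count/takewhile;
# this locally runnable sketch shows what they yield for made-up values of
# 4 nodes and 32 total cores:
from itertools import count, takewhile
n_nodes, max_core = 4.0, 32.0
powers_of_two = [i for i in takewhile(lambda i: i < max_core,
                                      (2 ** i for i in count(0, 1)))
                 if i > n_nodes]
squares = [i for i in takewhile(lambda i: i < max_core,
                                (i ** 2 for i in count(0, 1)))
           if i > n_nodes]
print(powers_of_two)  # [8, 16]
print(squares)        # [9, 16, 25]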
def import_from_kaenv(self, env, remote=None):
    """Import a kadeploy environment from a kadeploy database."""
    env = re.match(
        r"^(?P<name>[-_.\w]+)(?:@(?P<user>[_.\w]+))?(?::(?P<version>[_.\w]+))?$",
        env).groupdict("")
    if env['user']:
        env['user'] = "******" + env['user']
    if env['version']:
        env['version'] = " --env-version " + env['version']
    kaenv_cmd = "kaenv3{user}{version} -p {name}".format(**env)
    if remote:
        remote = re.match(
            r"^(?:(?P<user>[-_.\w]+)@)?(?P<address>[-_.\w]+)(?::(?P<port>\d{1,5}))?$",
            remote).groupdict()
        p = SshProcess(kaenv_cmd, Host(**remote))
    else:
        p = Process(kaenv_cmd, shell=True)
    p.run()
    self.desc = yaml.load(p.stdout)
    return self
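# The environment specifier accepted above is "name[@user][:version]"; a
# locally runnable check of the regex on sample inputs (the environment
# names are made up):
import re
env_re = r"^(?P<name>[-_.\w]+)(?:@(?P<user>[_.\w]+))?(?::(?P<version>[_.\w]+))?$"
print(re.match(env_re, "wheezy-x64-base").groupdict(""))
# name='wheezy-x64-base', user='', version=''
print(re.match(env_re, "myenv@alice:3").groupdict(""))
# name='myenv', user='alice', version='3'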
def add_vms(vms, server):
    """Declare the virtual machines in the /etc/hosts file of the server."""
    logger.debug('Adding the VMs to /etc/hosts ...')
    fd, vms_list = mkstemp(dir='/tmp/', prefix='vms_')
    f = fdopen(fd, 'w')
    f.write('\n' + '\n'.join([vm['ip'] + ' \t ' + vm['id'] for vm in vms]))
    f.close()
    Put([server], [vms_list], remote_location='/etc/').run()
    SshProcess('[ -f /etc/hosts.bak ] && cp /etc/hosts.bak /etc/hosts || '
               ' cp /etc/hosts /etc/hosts.bak', server).run()
    Remote('cat /etc/' + vms_list.split('/')[-1] + ' >> /etc/hosts',
           [server]).run()
    Process('rm ' + vms_list).run()
def sysctl_conf(server, vms):
    """Change the default values of net.ipv4.neigh.default.gc_thresh*
    to handle a large number of IPs"""
    val = int(2 ** ceil(log(len(vms), 2)))
    conf = "\nnet.ipv4.neigh.default.gc_thresh3 = " + str(3 * val) + \
           "\nnet.ipv4.neigh.default.gc_thresh2 = " + str(2 * val) + \
           "\nnet.ipv4.neigh.default.gc_thresh1 = " + str(val)
    fd, sysctl = mkstemp(dir='/tmp/', prefix='sysctl_')
    f = fdopen(fd, 'w')
    f.write(conf)
    f.close()
    Put([server], [sysctl], remote_location='/etc/').run()
    SshProcess('cd /etc && cat ' + sysctl.split('/')[-1] +
               ' >> sysctl.conf && sysctl -p', server).run()
    Process('rm ' + sysctl).run()
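# The thresholds scale with the next power of two above the VM count; a
# locally runnable example for a made-up fleet of 300 VMs:
from math import ceil, log
n_vms = 300
val = int(2 ** ceil(log(n_vms, 2)))  # -> 512
print(val, 2 * val, 3 * val)         # gc_thresh1/2/3 = 512 1024 1536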
def get_cpu_topology(cluster, xpdir=None):
    """Determine the CPU topology of a cluster from virsh capabilities,
    caching the result as XML in xpdir."""
    logger.info('Determining the architecture of cluster ' +
                style.emph(cluster))
    root = None
    # Trying to read the topology from a cache file
    if xpdir:
        fname = xpdir + '/topo_' + cluster + '.xml'
        try:
            tree = parse(fname)
            root = tree.getroot()
        except Exception:
            logger.info('No cache file found, will reserve a node and '
                        'determine topology from virsh capabilities')
    if root is None:
        frontend = get_cluster_site(cluster)
        submission = OarSubmission(
            resources="{cluster='" + cluster + "'}/nodes=1",
            walltime="0:02:00",
            job_type="allow_classic_ssh")
        ((job_id, _), ) = oarsub([(submission, frontend)])
        wait_oar_job_start(job_id, frontend)
        host = get_oar_job_nodes(job_id, frontend)[0]
        capa = SshProcess(
            'virsh capabilities', host,
            connection_params={
                'user': default_frontend_connection_params['user']
            }).run()
        oardel([(job_id, frontend)])
        root = fromstring(capa.stdout)
        if xpdir is not None:
            tree = ElementTree(root)
            tree.write(fname)

    cpu_topology = []
    i_cell = 0
    for cell in root.findall('.//cell'):
        cpu_topology.append([])
        for cpu in cell.findall('.//cpu'):
            cpu_topology[i_cell].append(int(cpu.attrib['id']))
        i_cell += 1
    logger.info(pformat(cpu_topology))
    return cpu_topology
def dhcp_conf(server, vms, sites):
    """Generate a dnsmasq.conf with the DHCP parameters and put it on
    the server"""
    logger.debug('Creating dnsmasq.conf')
    ip_mac = [(vm['ip'], vm['mac']) for vm in vms]
    dhcp_lease = 'dhcp-lease-max=10000\n'
    dhcp_range = 'dhcp-range=' + ip_mac[0][0] + ',' + ip_mac[-1][0] + \
                 ',12h\n'
    dhcp_router = 'dhcp-option=option:router,' + get_server_ip(server) + '\n'
    # one dhcp-host=MAC,name,IP entry per VM
    dhcp_hosts = '\n'.join(['dhcp-host=' + ip_mac[i][1] + ',' +
                            vms[i]['id'] + ',' + ip_mac[i][0]
                            for i in range(len(vms))])
    dhcp_option = 'dhcp-option=option:domain-search,grid5000.fr,' + \
                  ','.join([site + '.grid5000.fr' for site in sites]) + '\n'
    fd, dnsmasq = mkstemp(dir='/tmp/', prefix='dnsmasq_')
    f = fdopen(fd, 'w')
    f.write(dhcp_lease + dhcp_range + dhcp_router + dhcp_hosts + '\n' +
            dhcp_option)
    f.close()
    Put([server], [dnsmasq], remote_location='/etc/').run()
    SshProcess('cd /etc && cp ' + dnsmasq.split('/')[-1] + ' dnsmasq.conf',
               server).run()
    Process('rm ' + dnsmasq).run()
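# A locally runnable sketch of the dnsmasq lines generated above, for two
# made-up VMs:
vms = [{'id': 'vm-1', 'ip': '10.0.0.10', 'mac': 'aa:bb:cc:dd:ee:01'},
       {'id': 'vm-2', 'ip': '10.0.0.11', 'mac': 'aa:bb:cc:dd:ee:02'}]
print('dhcp-range=' + vms[0]['ip'] + ',' + vms[-1]['ip'] + ',12h')
for vm in vms:
    print('dhcp-host=' + vm['mac'] + ',' + vm['id'] + ',' + vm['ip'])
# dhcp-range=10.0.0.10,10.0.0.11,12h
# dhcp-host=aa:bb:cc:dd:ee:01,vm-1,10.0.0.10
# dhcp-host=aa:bb:cc:dd:ee:02,vm-2,10.0.0.11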
def get_java_home(host):
    """Return the JAVA_HOME of the given host, derived from the location
    of the javac binary."""
    proc = SshProcess('echo $(readlink -f /usr/bin/javac | '
                      'sed "s:/bin/javac::")', host)
    proc.run()
    return proc.stdout.strip()
#                               walltime=walltime, job_type='deploy'),
#                 get_cluster_site(cluster))])
jobs = [(49083, get_cluster_site(cluster))]
if jobs[0][0]:
    wait_oar_job_start(jobs[0][0], jobs[0][1])
    logger.info('Job %s has started on %s, retrieving nodes',
                jobs[0][0], jobs[0][1])
    nodes = get_oar_job_nodes(jobs[0][0], jobs[0][1])
    logger.info('Deploying on node %s', nodes[0].address)
    deployed, undeployed = deploy(Deployment(nodes,
                                             env_name="wheezy-x64-base"))

logger.info('Installing fio')
cmd = 'export DEBIAN_FRONTEND=noninteractive ; apt-get update && ' + \
      'apt-get install -y --force-yes fio'
install = SshProcess(cmd, nodes[0]).run()

# column indices of each metric in fio's terse (minimal) output
col_number = {
    'common': {
        'output_version': 1,
        'fio_version': 2,
        'error': 5
    },
    'read': {
        'runtime': 9,
        'bw': 7,
        'bw_min': 42,
        'bw_max': 43,
        'lat': 40,
        'lat_min': 38,
        'lat_max': 39,
def is_job_running(self, job_id=None):
    """Return True if the given job is running according to qstat on the
    module-level `jobserver` host."""
    get_state = SshProcess('qstat -f ' + str(job_id), jobserver)
    # a non-zero exit code is expected for absent jobs: don't treat it as
    # an error, but still use it as the result
    get_state.nolog_exit_code = get_state.ignore_exit_code = True
    get_state.run()
    return get_state.exit_code == 0
def run(self):
    sweeper = self.create_paramsweeper()

    while True:
        comb = sweeper.get_next()
        if not comb:
            break
        comb_dir = self.result_dir + '/' + slugify(comb)
        if not os.path.isdir(comb_dir):
            os.mkdir(comb_dir)
        comb_file = comb_dir + '/trace'
        g5k_configuration['kadeploy3'] = comb['version']
        logger.info('Treating combination %s', pformat(comb))
        get_version = SshProcess(
            comb['version'] + ' -v', comb['site'],
            connection_params=default_frontend_connection_params).run()
        logger.info(get_version.stdout)

        resources = ""
        if comb['kavlan']:
            resources += "{type='kavlan'}/vlan=1+"
        resources += "nodes=" + str(comb['n_nodes'])
        sub = OarSubmission(resources=resources,
                            job_type='deploy',
                            walltime="0:30:00",
                            name='Kadeploy_Tests')
        logger.info('Performing submission of %s on site %s',
                    resources, comb['site'])
        jobs = oarsub([(sub, comb['site'])])

        if jobs[0][0]:
            try:
                logger.info('Waiting for job to start')
                wait_oar_job_start(jobs[0][0], jobs[0][1])
                hosts = get_oar_job_nodes(jobs[0][0], jobs[0][1])
                logger.info('Deployment of %s',
                            ' '.join([host.address for host in hosts]))
                kavlan = get_oar_job_kavlan(jobs[0][0], jobs[0][1])
                if kavlan:
                    logger.info('In kavlan %s', kavlan)
                deployment = Deployment(hosts, env_name=comb['env'],
                                        vlan=kavlan)
                deployed, undeployed = deploy(deployment,
                                              stdout_handlers=[comb_file],
                                              stderr_handlers=[comb_file])
            finally:
                logger.info('Destroying job %s on %s', str(jobs[0][0]),
                            jobs[0][1])
                oardel([(jobs[0][0], jobs[0][1])])
        else:
            # the submission failed: retry the combination later instead
            # of reading the (unset) deployment results
            logger.error('Job submission failed on %s', comb['site'])
            sweeper.cancel(comb)
            continue

        if len(undeployed) == 0:
            logger.info('%s is OK', slugify(comb))
        elif len(deployed) == 0:
            logger.error('%s is KO', slugify(comb))
        else:
            logger.warning('%s encountered problems with some hosts',
                           slugify(comb))
        sweeper.done(comb)
def run(self):
    """Run the experiment"""
    already_configured = self.options.already_configured
    reservation_job_id = int(self.options.reservation_id) \
        if self.options.reservation_id is not None else None
    is_a_test = self.options.is_a_test

    if is_a_test:
        logger.warn('THIS IS A TEST! This run will use only a few '
                    'resources')

    # make the result folder writable for all
    os.chmod(self.result_dir, 0o777)
    # Import configuration
    with open(self.args[0]) as config_file:
        config = json.load(config_file)
    # backup configuration
    copy(self.args[0], self.result_dir)

    site = config["grid5000_site"]
    resources = config["resources"]
    nb_experiment_nodes = config["nb_experiment_nodes"]
    walltime = str(config["walltime"])
    env_name = config["kadeploy_env_name"]
    workloads = config["workloads"]
    # check that the workloads exist (assumes that the same NFS mount
    # point is present on the remote and the local environment)
    for workload_file in workloads:
        with open(workload_file):
            pass
        # copy the workload files to the results dir
        copy(workload_file, self.result_dir)

    # define the workload parameters
    self.parameters = {'workload_filename': workloads}
    logger.info('Workloads: {}'.format(workloads))

    # define the iterator over the parameter combinations
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))

    # a previous run (using -c result_dir) may have skipped combinations
    logger.info('Skipped parameters: {}'.format(
        str(self.sweeper.get_skipped())))
    logger.info('Number of parameters combinations {}'.format(
        str(len(self.sweeper.get_remaining()))))
    logger.info('combinations {}'.format(str(
        self.sweeper.get_remaining())))

    if reservation_job_id is not None:
        jobs = [(reservation_job_id, site)]
    else:
        jobs = oarsub([(OarSubmission(resources=resources,
                                      job_type='deploy',
                                      walltime=walltime), site)])
    job_id, site = jobs[0]
    if job_id:
        try:
            logger.info("waiting job start %s on %s" % (job_id, site))
            wait_oar_job_start(job_id, site,
                               prediction_callback=prediction_callback)
            logger.info("getting nodes of %s on %s" % (job_id, site))
            nodes = get_oar_job_nodes(job_id, site)
            # sort the nodes
            nodes = sorted(nodes, key=lambda node: node.address)
            # get only the necessary nodes under the switch
            if nb_experiment_nodes > len(nodes):
                raise RuntimeError('The number of nodes given in the '
                                   'reservation ({}) does not match the '
                                   'requested resources '
                                   '({})'.format(len(nodes),
                                                 nb_experiment_nodes))
            nodes = nodes[:nb_experiment_nodes]
            logger.info("deploying nodes: {}".format(str(nodes)))
            deployed, undeployed = deploy(
                Deployment(nodes, env_name=env_name),
                check_deployed_command=already_configured)
            if undeployed:
                logger.warn("NOT deployed nodes: {}".format(
                    str(undeployed)))
                raise RuntimeError('Deployment failed')

            if not already_configured:
                # install OAR
                install_cmd = "apt-get update; apt-get install -y "
                node_packages = "oar-node"
                logger.info("installing OAR nodes: {}".format(
                    str(nodes[1:])))
                install_oar_nodes = Remote(
                    install_cmd + node_packages, nodes[1:],
                    connection_params={'user': '******'})
                install_oar_nodes.start()

                server_packages = ("oar-server oar-server-pgsql oar-user "
                                   "oar-user-pgsql postgresql python3-pip "
                                   "libjson-perl "
                                   "postgresql-server-dev-all")
                install_oar_sched_cmd = """
                mkdir -p /opt/oar_sched; \
                cd /opt/oar_sched; \
                git clone https://github.com/oar-team/oar3.git; \
                cd oar3; \
                git checkout dce942bebc2; \
                pip3 install -e .; \
                cd /usr/lib/oar/schedulers; \
                ln -s /usr/local/bin/kamelot; \
                pip3 install psycopg2
                """
                logger.info("installing OAR server node: {}".format(
                    str(nodes[0])))
                install_master = SshProcess(
                    install_cmd + server_packages + ";" +
                    install_oar_sched_cmd, nodes[0],
                    connection_params={'user': '******'})
                install_master.run()
                install_oar_nodes.wait()

                if not install_master.ok:
                    Report(install_master)

                configure_oar_cmd = """
                sed -i \
                    -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                    -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                    -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                    -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                    -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                    -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                    -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                    -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                    /etc/oar/oar.conf
                """.format(result_dir=self.result_dir)
                configure_oar = Remote(configure_oar_cmd, nodes,
                                       connection_params={'user': '******'})
                configure_oar.run()
                logger.info("OAR is configured on all nodes")

                # Configure server
                create_db = "oar-database --create --db-is-local"
                config_oar_sched = (
                    "oarnotify --remove-queue default;"
                    "oarnotify --add-queue default,1,kamelot")
                start_oar = "systemctl start oar-server.service"
                logger.info("configuring OAR database: {}".format(
                    str(nodes[0])))
                config_master = SshProcess(
                    create_db + ";" + config_oar_sched + ";" + start_oar,
                    nodes[0], connection_params={'user': '******'})
                config_master.run()

                # propagate SSH keys
                logger.info("configuring OAR SSH")
                oar_key = "/tmp/.ssh"
                Process('rm -rf ' + oar_key).run()
                Process(
                    'scp -o BatchMode=yes -o PasswordAuthentication=no '
                    '-o StrictHostKeyChecking=no '
                    '-o UserKnownHostsFile=/dev/null '
                    '-o ConnectTimeout=20 -rp -o User=root ' +
                    nodes[0].address + ":/var/lib/oar/.ssh "
                    + oar_key).run()
                # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key],
                #     connection_params={'user': '******'}).run()
                Put(nodes[1:], [oar_key], "/var/lib/oar/",
                    connection_params={'user': '******'}).run()

                add_resources_cmd = """
                oarproperty -a cpu || true; \
                oarproperty -a core || true; \
                oarproperty -c -a host || true; \
                oarproperty -a mem || true; \
                """
                for node in nodes[1:]:
                    add_resources_cmd = add_resources_cmd + \
                        "oarnodesetting -a -h {node} -p host={node} " \
                        "-p cpu=1 -p core=4 -p cpuset=0 " \
                        "-p mem=16; \\\n".format(node=node.address)

                add_resources = SshProcess(
                    add_resources_cmd, nodes[0],
                    connection_params={'user': '******'})
                add_resources.run()

                if add_resources.ok:
                    logger.info("oar is now configured!")
                else:
                    raise RuntimeError(
                        "error in the OAR configuration: Abort!")

            # TODO: back up the OAR configuration

            # Do the replay
            logger.info('beginning the replay')
            while len(self.sweeper.get_remaining()) > 0:
                combi = self.sweeper.get_next()
                workload_file = os.path.basename(
                    combi['workload_filename'])
                oar_replay = SshProcess(
                    script_path + "/oar_replay.py " +
                    combi['workload_filename'] + " " + self.result_dir +
                    " oar_gant_" + workload_file, nodes[0])
                oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                  workload_file + '.out')
                logger.info("replaying workload: {}".format(combi))
                oar_replay.run()
                if oar_replay.ok:
                    logger.info("Replay workload OK: {}".format(combi))
                    self.sweeper.done(combi)
                else:
                    logger.info("Replay workload NOT OK: {}".format(combi))
                    self.sweeper.cancel(combi)
                    raise RuntimeError("error in the OAR replay: Abort!")
        except:
            traceback.print_exc()
            ipdb.set_trace()
        finally:
            if is_a_test:
                ipdb.set_trace()
            if reservation_job_id is None:
                logger.info("delete job: {}".format(jobs))
                oardel(jobs)
def start(self):
    """Start MongoDB server."""
    self._check_initialization()
    logger.info("Starting MongoDB")

    if self.running:
        logger.warn("MongoDB was already started")
        return

    # Start nodes
    procs = []
    for h in self.hosts:
        mongo_command = (
            NUMA_PREFIX + " " + self.bin_dir + "/mongod "
            " --fork "
            " --config " + os.path.join(self.conf_dir, CONF_FILE) +
            " --bind_ip " + h.address +
            " --port " + str(self.md_port))
        logger.debug(mongo_command)
        proc = SshProcess(mongo_command, h)
        proc.start()
        procs.append(proc)

    finished_ok = True
    for p in procs:
        p.wait()
        if not p.finished_ok:
            finished_ok = False

    if not finished_ok:
        logger.warn("Error while starting MongoDB")
        return
    else:
        self.running = True

    # Start replication
    if self.do_replication:
        logger.info("Configuring replication")
        mongo_command = "rs.initiate();"
        mongo_command += ';'.join(
            'rs.add("' + h.address + ':' + str(self.md_port) + '")'
            for h in self.hosts)
        logger.debug(mongo_command)
        proc = TaktukRemote(self.bin_dir + "/mongo "
                            "--eval '" + mongo_command + "' " +
                            self.master.address,
                            [self.master])
        proc.run()
        if not proc.finished_ok:
            logger.warn("Not able to start replication")

    # Start sharding
    if self.do_sharding:
        if not self.initialized_sharding:
            logger.info("Configuring sharding")
            time.sleep(2)
            mongo_command = (
                'rs.initiate({'
                '_id : "%s",'
                'configsvr : true,'
                'members : [%s]})' % (
                    self.rs_name,
                    ",".join('{ _id : %d, host : "%s:%d" }' %
                             (_id, h.address, self.md_port)
                             for (_id, h) in enumerate(self.hosts))))
            logger.debug(mongo_command)
            proc = SshProcess(self.bin_dir + "/mongo " +
                              "--eval '" + mongo_command + "' " +
                              self.master.address,
                              self.master)
            proc.run()
            if proc.finished_ok:
                self.initialized_sharding = True
            else:
                logger.warn("Not able to configure sharding")

        logger.info("Starting sharding servers")
        mongo_command = (
            NUMA_PREFIX + " " + self.bin_dir + "/mongos"
            " --configdb " + self.rs_name + "/" +
            ",".join('%s:%d' % (h.address, self.md_port)
                     for h in self.hosts) +
            " --bind_ip " + self.master.address +
            " --port " + str(self.ms_port) +
            " --fork"
            " --logpath " + self.logs_dir + "/mongos.log"
            " --pidfilepath " + self.mongos_pid_file)
        logger.debug(mongo_command)
        start_ms = TaktukRemote(mongo_command, [self.master])
        start_ms.run()
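# A locally runnable sketch of the mongo shell snippet built above for the
# replication branch; the host names and port are made up:
hosts = ['node-1', 'node-2', 'node-3']
md_port = 27017
replica_cmd = "rs.initiate();" + ';'.join(
    'rs.add("%s:%d")' % (h, md_port) for h in hosts)
print(replica_cmd)
# rs.initiate();rs.add("node-1:27017");rs.add("node-2:27017");rs.add("node-3:27017")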