def _initialize_conf(self):
    """Merge locally-specified configuration files with default files
    from the distribution.

    Copies the mapred template to the live configuration file on every
    host, then delegates to the parent implementation.
    """
    template = os.path.join(self.conf_dir, MR_CONF_FILE + ".template ")
    target = os.path.join(self.conf_dir, MR_CONF_FILE)
    # the template path carries a trailing space so plain concatenation
    # yields "cp <template> <target>"
    Remote("cp " + template + target, self.hosts).run()
    super(HadoopV2Cluster, self)._initialize_conf()
def _initialize_conf(self):
    """Merge locally-specified configuration files with default files
    from the distribution, then let the parent class finish the merge."""
    copy_cmd = "cp {0}{1}".format(
        # note: the first path deliberately ends with ".template " (with a
        # trailing space) so the two joined paths form a two-argument cp
        os.path.join(self.conf_dir, MR_CONF_FILE + ".template "),
        os.path.join(self.conf_dir, MR_CONF_FILE))
    copy_action = Remote(copy_cmd, self.hosts)
    copy_action.run()
    super(HadoopV2Cluster, self)._initialize_conf()
def run_bench(output_folder, node):
    """Run the BOTS omp-tasks benchmark on *node* for both the debian
    and the hermitux builds.

    For each target a sub-directory of *output_folder* is created and the
    benchs.sh driver is invoked remotely from the bots benchmark tree.
    """
    for target in ("debian", "hermitux"):
        logger.info("Starting %s benchs..." % target)
        result_dir = create_subdir(output_folder, target)
        bench_cmd = ("../../tools/benchs.sh \"commands/omp-tasks/" + target
                     + "-omp-tasks\" " + result_dir + " 10 \"1 2 4 8 16\"")
        Remote('cd ./unikernel-tools/benchs/bots && ' + bench_cmd,
               node).run()
def bootstrap(self, tar_file):
    """Install Hadoop in all cluster nodes from the specified tar.gz file.

    Args:
      hadoop_tar_file (str): The file containing Hadoop binaries.
    """
    if super(HadoopV2Cluster, self).bootstrap(tar_file):
        # seed the mapred configuration from its distribution template;
        # the template path ends with a trailing space so concatenation
        # produces a two-argument cp command
        template_path = os.path.join(self.conf_dir, MR_CONF_FILE + ".template ")
        conf_path = os.path.join(self.conf_dir, MR_CONF_FILE)
        Remote("cp " + template_path + conf_path, self.hosts).run()
def bootstrap(self, tar_file):
    """Install Hadoop in all cluster nodes from the specified tar.gz file.

    Args:
      hadoop_tar_file (str): The file containing Hadoop binaries.
    """
    parent_ok = super(HadoopV2Cluster, self).bootstrap(tar_file)
    if parent_ok:
        # restore the default mapred configuration from its template
        # (first join ends in ".template " - the space separates cp's args)
        copy_conf = Remote(
            "cp {0}{1}".format(
                os.path.join(self.conf_dir, MR_CONF_FILE + ".template "),
                os.path.join(self.conf_dir, MR_CONF_FILE)),
            self.hosts)
        copy_conf.run()
def workflow(self, comb, host, comb_dir):
    """Run one parameter combination of the experiment on *host*.

    Generates the configuration remotely, runs the simulation and fetches
    the resulting CSV trace into *comb_dir*.  Whatever happens, the
    combination is reported back to the sweeper (done on success,
    canceled otherwise).

    :param comb: parameter combination (dict) taken from the sweeper
    :param host: host name to run the combination on
    :param comb_dir: local directory where result CSVs are collected
    """
    comb_ok = False
    thread_name = style.Thread(host.split('.')[0]) + ': '
    logger.info(thread_name + 'Starting combination ' + slugify(comb))
    try:
        logger.info(thread_name + 'Generate conf file')
        param_str = self.create_string(comb)
        Remote(
            "python /home/Work/sgcbntier/paasage_demo/xml_gen_execo.py --cb " +
            param_str, [host]).run()
        logger.info(thread_name + 'Run code')
        Remote(
            "cd /home/Work/sgcbntier/paasage_demo/ ; python run_all_execo.py --cb %s"
            % param_str, [host]).run()
        logger.info(thread_name + 'Get results')
        traceFile = "ntier_" + param_str
        get_results = Get([host], [
            "/home/Work/sgcbntier/paasage_demo/csv/REQTASK_" + traceFile +
            ".csv"
        ], local_location=comb_dir).run()
        for p in get_results.processes:
            if not p.ok:
                logger.error(
                    host +
                    ': Unable to retrieve the files for combination %s',
                    slugify(comb))
                # NOTE(review): exit() aborts the whole process from a
                # worker thread context - kept as-is.
                exit()
        comb_ok = True
    finally:
        # report the combination state back to the sweeper in all cases
        if comb_ok:
            self.sweeper.done(comb)
            logger.info(thread_name + ': ' + slugify(comb) +
                        ' has been done')
        else:
            self.sweeper.cancel(comb)
            logger.warning(thread_name + ': ' + slugify(comb) +
                           ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
def check_if_file_exists(file_name, nodes, connection_params):
    """Check if a file exists in a set of nodes.

    It returns the set of nodes that DON'T have the file.

    :param file_name: path of the file tested with ``test -e``
    :param nodes: hosts to probe
    :param connection_params: execo connection parameters
    :return: set of addresses of the nodes missing the file
    """
    probe = Remote(cmd="test -e " + file_name,
                   hosts=nodes,
                   connection_params=connection_params,
                   process_args={"nolog_exit_code": True})
    probe.run()
    # a process whose ok flag is not True means "test -e" failed there
    return set(p.host.address for p in probe.processes if p.ok is not True)
def upload_to_all_g5k_sites(self):
    """Upload the spark-bench sources to the frontend of every G5K site,
    wiping any previous checkout first."""
    conn = {'user': g5k_configuration.get("g5k_user")}
    # we add the .g5k suffix to be able to ssh to each site's frontend
    frontends = [site + ".g5k" for site in get_g5k_sites()]
    # delete any existent spark-bench directories
    Remote("rm -rf $HOME/spark-bench", frontends,
           connection_params=conn).run()
    # upload the source code to all sites
    Put(frontends, ["spark/spark-bench"], "~/",
        connection_params=conn).run()
def load_workload(self, from_node, workload, recordcount, threadcount,
                  fieldlength):
    """ Run a workload from the core workloads with the CassandraDB db wrapper
    :param from_node: from which node you want to run the workload. This is a set variable
    :param workload: the type of workload e.g. workloada, workloadb, workloadc ...
    :param recordcount: the total number of records to insert
    :param insertcount: the number of records to insert with this execution
    :param insertstart: from where to start inserting. Useful if we have different clients and each one
    will insert in different ranges e.g.[client1:0-10000,client2:10000-20000,...]
    :param threadcount: the number of threads for each client. Increase this to increase the load on the system
    """
    # we transform the set to a str with the format 'node1,node2,node3...'
    cassandra_nodes_str = ','.join(list(self.cassandra_nodes))
    # divide the number of records equally among clients
    # NOTE(review): relies on Python 2 integer division - TODO confirm
    # the file is Python 2 only (the prints elsewhere suggest so).
    insertcount = int(recordcount) / from_node.__len__()
    insertstart_steps = list(range(from_node.__len__()))
    # insertstart_values is referenced BY NAME inside the {{...}} execo
    # substitution in the command below - do not rename it.
    insertstart_values = map(lambda x: x * insertcount, insertstart_steps)
    # We load the data into the cassandra database
    # This could be changed to an execo generator expression like insertstart={{[x for x in insertstart_values]}}
    #"ycsb-0.12.0/bin/ycsb.sh load cassandra-cql -P ycsb-0.12.0/workloads/" + workload + " -p hosts=" + cassandra_nodes_str + " -p recordcount=" + recordcount + " -p insertstart={{[x for x in insertstart_values]}} -p insertcount=" + str(insertcount) + " -p threadcount=" + threadcount + " -p fieldlength=" + str(fieldlength)
    # NOTE(review): recordcount and threadcount are concatenated without
    # str() - callers presumably pass them as strings; verify.
    Remote(
        "ycsb-0.12.0/bin/ycsb.sh load cassandra-cql -P ycsb-0.12.0/workloads/"
        + workload + " -p hosts=" + cassandra_nodes_str + " -p recordcount="
        + recordcount +
        " -p insertstart={{[x for x in insertstart_values]}} -p insertcount="
        + str(insertcount) + " -p threadcount=" + threadcount +
        " -p fieldlength=" + str(fieldlength),
        hosts=from_node,
        connection_params=self.execo_conn_params,
        process_args={
            'stdout_handlers': [sys.stdout],
            'stderr_handlers': [sys.stderr]
        }).run(timeout=900)
def run_workload(self, iteration, res_dir, from_node, workload, recordcount,
                 threadcount, fieldlength, target):
    """Run a given workload. It receives an iteration parameter to be able
    to repeat the workload several times.

    :param iteration: the iteration number. Will be appended to the name
        of the file with the output
    :param from_node: from which nodes should we execute the benchmark
    :param workload: the type of workload
    :param threadcount: the number of threads
    """
    # 'node1,node2,node3...' list expected by the ycsb -p hosts option
    hosts_csv = ','.join(list(self.cassandra_nodes))
    # build the full ycsb invocation; {{{host}}} is substituted per host
    # by execo with the target host name
    ycsb_cmd = ("ycsb-0.12.0/bin/ycsb.sh run cassandra-cql"
                " -P ycsb-0.12.0/workloads/" + workload
                + " -p hosts=" + hosts_csv
                + " -p recordcount=" + recordcount
                + " -p threadcount=" + str(threadcount)
                + " -p fieldlength=" + str(fieldlength)
                + " -p exportfile=" + res_dir + "/output_" + workload
                + "_{{{host}}}_it" + str(iteration))
    Remote(ycsb_cmd,
           hosts=from_node,
           connection_params=self.execo_conn_params,
           process_args={
               'stdout_handlers': [sys.stdout],
               'stderr_handlers': [sys.stderr]
           }).run(timeout=900)
def submit(self, class_in_jar, class_params, jar, master, submit_conf,
           scheduler_options):
    """
    :param class_in_jar: the class we want to launch inside the jar
    :param class_params: the parameters expected by that class
    :param jar: the jar where the class is bundled
    :param master: the master that is going to take care of launching the
        app. ("yarn","spark:/192.168.0.1", etc..)
    :param submit_conf: a list of tuples with the form
        [["spark.executor.memory","2g"],["spark.executor.cores","1"]]
    :param scheduler_options: options that are only applicable to that
        resource manager e.g. (Mesos tags, yarn labels...)
    """
    # fall back to the cluster defaults when nothing was provided
    if master is None:
        master = self.default_master
    if scheduler_options is None:
        scheduler_options = ""
    scheduler_str = self.generate_scheduler_options(scheduler_options)
    conf_str = self.generate_conf(submit_conf)
    cmd = ("{submit} --class {cls} --master {master} --deploy-mode client "
           "{conf} {jar} {params} {sched}").format(
               submit=self.root_to_spark_submit,
               cls=class_in_jar,
               master=master,
               conf=conf_str,
               jar=jar,
               params=" ".join(class_params),
               sched=scheduler_str)
    submission = Remote(cmd,
                        hosts=self.master_node,
                        connection_params={
                            'user': g5k_configuration.get('g5k_user')
                        },
                        process_args={
                            'stdout_handlers': [sys.stdout],
                            'stderr_handlers': [sys.stderr]
                        })
    submission.run()
def reboot_hosts(hosts, timeout=300):
    """Reboot *hosts* and wait until they have gone down and come back up.

    :param hosts: the hosts to reboot
    :param timeout: seconds passed to the down/up wait helpers
    :return: False if the remote ``reboot`` command failed, True once the
        hosts went down and came back up
    """
    reboot = Remote('reboot', hosts).run()
    if not reboot.ok:
        return False
    wait_hosts_down(hosts, timeout)
    wait_hosts_up(hosts, timeout)
    # Fix: the original fell off the end and returned None on success,
    # which is falsy and therefore indistinguishable from a failure for
    # callers that test the return value.
    return True
def start(self):
    """Copy the unixbench archive to every benchmarking host, then launch
    the benchmark asynchronously.

    Each remote process gets a stdout handler that forwards the end of the
    benchmark output to ``self.callback``, tagged with ``host[:port]``.
    """
    logger.info('Put benchmarking files on hosts.')
    file_path = os.path.join(os.path.dirname(__file__),
                             '../resources/unixbench-5.1.3.tgz')
    Put(self.bench_list.values(), [file_path], "/tmp/").run()
    logger.info('Start benchmarking on ' + str(len(self.bench_list)) +
                ' hosts.')
    bench_install = Remote(
        'cd /tmp/ &&' +
        'tar xvfz unixbench-5.1.3.tgz &&' +
        'cd unixbench-5.1.3/ &&' +
        './Run arithmetic &&' +
        'cd ../ &&' +
        'rm -rf unixbench-5.1.3/ &&' +
        'rm -rf unixbench-5.1.3.tgz',
        self.bench_list.values())
    for p in bench_install.processes:
        # Fix: the original expression
        #   p.host.address + (':' + str(p.host.port)) if(p.host.port != None) else ""
        # bound the conditional around the WHOLE concatenation, so when
        # port was None the address was dropped and host became "".
        # The port suffix alone must be conditional; also use
        # "is not None" per PEP 8.
        host = p.host.address + (':' + str(p.host.port)
                                 if p.host.port is not None else "")
        p.stdout_handlers.append(
            end_forwarder_stdout_handler(host, self.callback))
    bench_install.start()
def add_vms(vms, server):
    """Register the given virtual machines in /etc/hosts on *server*.

    Writes an "ip \\t id" line per VM to a local temp file, uploads it to
    /etc/ on the server, backs up (or restores) /etc/hosts, appends the
    entries, and finally removes the local temp file.
    """
    logger.debug('Adding the VM in /etc/hosts ...')
    fd, vms_list = mkstemp(dir='/tmp/', prefix='vms_')
    tmp = fdopen(fd, 'w')
    entries = [vm['ip'] + ' \t ' + vm['id'] for vm in vms]
    tmp.write('\n' + '\n'.join(entries))
    tmp.close()
    Put([server], [vms_list], remote_location='/etc/').run()
    # restore the pristine backup if present, otherwise create it
    SshProcess(
        '[ -f /etc/hosts.bak ] && cp /etc/hosts.bak /etc/hosts || ' +
        ' cp /etc/hosts /etc/hosts.bak', server).run()
    Remote('cat /etc/' + vms_list.split('/')[-1] + ' >> /etc/hosts',
           [server]).run()
    # clean up the local temp file
    Process('rm ' + vms_list).run()
def install_docker(nodes):
    """Install docker-ce 17.03.1 on *nodes* from the official Debian
    jessie .deb package and smoke-test it with the hello-world container.

    :param nodes: the hosts on which docker is installed
    """
    docker_deb = "https://download.docker.com/linux/debian/dists/jessie/pool/stable/amd64/docker-ce_17.03.1~ce-0~debian-jessie_amd64.deb"
    conn = {'user': '******'}
    # download the debian package for docker.
    # Fix: the original used "2>1", which redirects stderr to a file
    # literally named "1"; "2>&1" is the intended duplication of stderr
    # onto stdout.
    Remote("wget {0} -O {1}/docker-ce.deb 2>&1".format(docker_deb,
                                                       wget_destination),
           hosts=nodes, connection_params=conn).run()
    # install the package
    Remote("dpkg -i {0}/docker-ce.deb".format(wget_destination),
           hosts=nodes, connection_params=conn).run()
    # pull in any dependencies dpkg could not resolve on its own
    Remote("apt-get -fy install",
           hosts=nodes, connection_params=conn).run()
    # smoke-test the installation
    Remote("docker run hello-world",
           hosts=nodes, connection_params=conn).run()
def __init__(self, install_nodes, execo_conn_params):
    """ This is the parent class for all the YSCB benchmarks. It needs as parameters the nodes where we want to
    install it and the execo_conn_params (e.g. vagrantg5k, g5k_user, etc...)
    :type install_nodes: set
    :type execo_conn_params: dict
    """
    self.install_nodes = install_nodes
    self.execo_conn_params = execo_conn_params
    # before downloading anything let's check if the files already exists in the home directory
    not_exist = check_if_file_exists("ycsb-0.12.0.tar.gz", install_nodes,
                                     self.execo_conn_params)  # a set
    if (not_exist.__len__() != 0):
        print "Downloading YSCB from Git for nodes: {0}".format(not_exist)
        # fetch the release tarball only on the nodes that are missing it
        Remote(
            "curl -O --location https://github.com/brianfrankcooper/YCSB/releases/download/0.12.0/ycsb-0.12.0.tar.gz ycsb-0.12.0.tar.gz",
            hosts=not_exist,
            connection_params=execo_conn_params).run()
    # NOTE(review): original indentation was lost; since the untar targets
    # install_nodes (not not_exist), it is assumed to run unconditionally
    # for every node - confirm against the upstream repository.
    print "Untar ycsb-0.12.0.tar.gz"
    Remote("tar xfvz ycsb-0.12.0.tar.gz",
           hosts=install_nodes,
           connection_params=execo_conn_params).run()
    print "YSCB installed in nodes: {0}".format(install_nodes)
    print "Remember to prepare the DB before running any workload. More info on https://github.com/brianfrankcooper/YCSB"
def prepare_bench(self):
    """bench configuration and compilation, copy binaries to frontends

    return True if preparation is ok
    """
    logger.info("preparation: configure and compile benchmark")
    # the involved sites. We will do the compilation on the first of these.
    sites = list(set(map(get_cluster_site, self.parameters['cluster'])))
    # generate the bench compilation configuration (one NPB "lu" entry per
    # size x core-count combination)
    bench_list = '\n'.join([
        'lu\t%s\t%s' % (size, n_core)
        for n_core in self.parameters['n_core']
        for size in self.parameters['size']
    ])
    # Reserving a node because compiling on the frontend is forbidden
    # and because we need mpif77
    jobs = oarsub([(OarSubmission(resources="nodes=1",
                                  job_type='allow_classic_ssh',
                                  walltime='0:10:00'), sites[0])])
    if jobs[0][0]:
        try:
            logger.info("copying bench archive to %s" % (sites[0],))
            copy_bench = Put([sites[0]], ['NPB3.3-MPI.tar.bz2']).run()
            logger.info("extracting bench archive on %s" % (sites[0],))
            extract_bench = Remote('tar -xjf NPB3.3-MPI.tar.bz2',
                                   [sites[0]]).run()
            logger.info("waiting job start %s" % (jobs[0],))
            wait_oar_job_start(*jobs[0], prediction_callback=pred_cb)
            logger.info("getting nodes of %s" % (jobs[0],))
            nodes = get_oar_job_nodes(*jobs[0])
            logger.info("configure bench compilation")
            conf_bench = Remote(
                'echo "%s" > ~/NPB3.3-MPI/config/suite.def' % bench_list,
                nodes).run()
            logger.info("compil bench")
            compilation = Remote('cd NPB3.3-MPI && make clean && make suite',
                                 nodes).run()
            logger.info("compil finished")
        except:
            logger.error("unable to compile bench")
            return False
        finally:
            # always release the compilation reservation
            oardel(jobs)
    # Copying binaries to all other frontends
    # NOTE(review): if the reservation failed (jobs[0][0] falsy), `nodes`
    # and `compilation` are unbound here and this raises NameError -
    # confirm the intended control flow upstream.
    frontends = sites[1:]
    rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI',
                   [get_host_site(nodes[0])] * len(frontends))
    rsync.run()
    return compilation.ok and rsync.ok
def setup_node(node):
    """Clone the unikernel-tools repository on *node* and build the BOTS
    benchmarks for both the debian and the hermitux targets.

    :return: True when both the clone and the build succeeded
    """
    cloned = Remote(
        'git clone https://github.com/p-jacquot/unikernel-tools',
        node).run()
    built = Remote(
        'cd ./unikernel-tools/benchs/bots && make debian && make hermitux',
        node).run()
    return cloned.ok and built.ok
def run_xp(self):
    """Iterate over the parameters and execute the bench"""
    master = self.cluster[0]
    opt = ''
    while len(self.sweeper.get_remaining()) > 0:
        # Take sweeper
        comb = self.sweeper.get_next()
        logger.info('Processing new combination %s' % (comb, ))
        try:
            # metric from linux sar tools, works with clock
            def takeMetric(path,
                           startTime,
                           endTime,
                           metric=['cpu', 'mem', 'disk', 'swap', 'network']):
                # NOTE(review): mutable default argument kept as-is; it is
                # never mutated here so the usual aliasing pitfall does
                # not bite.
                opt = ''
                cmd_template_sar = (
                    "sar -f /var/log/sysstat/sa* -{opt} -s {startTime} -e {endTime}"
                )
                for met in metric:
                    # map the metric name to the matching sar option
                    if met == 'cpu':
                        opt = 'u'
                    elif met == 'mem':
                        opt = 'r'
                    elif met == 'disk':
                        opt = 'dp'
                    elif met == 'swap':
                        opt = 'S'
                    elif met == 'network':
                        opt = 'n DEV'
                    cmd = cmd_template_sar.format(opt=opt,
                                                  startTime=startTime,
                                                  endTime=endTime)
                    # collect the sar output of every cluster host into
                    # one "<host>-<metric>.txt" file under *path*
                    for host in self.cluster:
                        hE = SshProcess(cmd,
                                        host,
                                        connection_params={'user': '******'})
                        hE.run()
                        stdMetric = host + '-' + met + '.txt'
                        with open(os.path.join(path, stdMetric), "w") as sout:
                            sout.write(hE.stdout)

            # Set CPU Freq and Policy according current combination
            cmd_template_Freq_Policy = ("cpufreq-set -r -g {policy}")
            cmd_template_Freq = ("cpufreq-set -r -f {freq}")
            if comb['Freq'] == 'OnDemand':
                cmd_freq_policy = cmd_template_Freq_Policy.format(
                    policy='ondemand')
                Remote(cmd_freq_policy, master,
                       connection_params={'user': '******'}).run()
            elif comb['Freq'] == 'conservative':
                cmd_freq_policy = cmd_template_Freq_Policy.format(
                    policy='conservative')
                Remote(cmd_freq_policy, master,
                       connection_params={'user': '******'}).run()
            else:
                # any other value is treated as a fixed frequency: switch
                # to the userspace governor, then pin the frequency
                cmd_freq_policy = cmd_template_Freq_Policy.format(
                    policy='userspace')
                Remote(cmd_freq_policy, master,
                       connection_params={'user': '******'}).run()
                cmd_freq = cmd_template_Freq.format(freq=comb['Freq'])
                Remote(cmd_freq, master,
                       connection_params={'user': '******'}).run()

            # build command
            src = 'source /opt/intel-performance-snapshoot/apsvars.sh'
            cmd_mpirun_template = (
                "mpirun {opt} -f /root/cluster.txt -np {pr1} aps -r '/tmp/log/' /tmp/NPB/npb-mpi/bin/{typeNPB}.{NPBclass}.{pr2}"
            )
            cmd_mpirun = cmd_mpirun_template.format(
                opt='',
                pr1=comb['n_core'],
                typeNPB=comb['Benchmark'],
                NPBclass=comb['NPBclass'],
                pr2=comb['n_core'])
            cmd = "{}; /tmp/NPB/bin/runMPI.sh '{}' '{}'".format(
                src, cmd_mpirun, slugify(comb))
            curPath = self.result_dir + slugify(comb)

            # run Mpi through execo remote SshProcess
            def runMpi(cmd):
                act = SshProcess(cmd,
                                 master,
                                 connection_params={'user': '******'},
                                 shell=True)
                act.run()
                # persist stdout/stderr of the run under the per-comb dir
                if not os.path.exists(curPath):
                    os.makedirs(curPath)
                with open(os.path.join(curPath, "stdout.txt"),
                          "a+") as sout, open(
                              os.path.join(curPath, "stderr.txt"),
                              "w") as serr:
                    sout.write(act.stdout)
                    serr.write(act.stderr)
                return act.ok

            # start clock and exec command in the master node
            time.sleep(5)
            startUnix = int(time.time())
            start24Hour = datetime.datetime.fromtimestamp(
                startUnix).strftime('%H:%M:%S')
            task1 = runMpi(cmd)
            endUnix = int(time.time())
            end24Hour = datetime.datetime.fromtimestamp(endUnix).strftime(
                '%H:%M:%S')
            time.sleep(5)
            with open(os.path.join(curPath, "executionTime.txt"),
                      "w") as sout:
                sout.write('ExecTime:{}\nStartDate:{}\nEndDate:{}\n'.format(
                    str(endUnix - startUnix), start24Hour, end24Hour))
            takeMetric(curPath, start24Hour, end24Hour,
                       ['cpu', 'mem', 'disk', 'swap', 'network'])
            # collect power from kWAPI: grid5000 infrastructure made tool
            for hostname in self.cluster:
                powerOut = '{}_power'.format(hostname)
                collect_metric(startUnix, endUnix, 'power', curPath,
                               self.site, powerOut, hostname)
            st = '/tmp/out/' + slugify(comb)
            intelAppPerf = str(st + '.html')
            # get the data from ['Application Performance Snapshot', 'Storage Performance Snapshot']
            # https://software.intel.com/en-us/performance-snapshot
            Get(master, [intelAppPerf],
                curPath,
                connection_params={'user': '******'}).run()
            if task1:
                logger.info("comb ok: %s" % (comb, ))
                self.sweeper.done(comb)
                continue
        except OSError as err:
            print("OS error: {0}".format(err))
        except ValueError:
            print("Could not convert data to an integer.")
        except:
            print("Unexpected error:", sys.exc_info()[0])
            raise
        # reached when the run failed or a handled exception occurred
        logger.info("comb NOT ok: %s" % (comb, ))
        self.sweeper.cancel(comb)
def run_bench(output_folder, node):
    """Run the rodinia benchmark suite on *node*, storing results in a
    "debian" sub-directory of *output_folder*.

    Fix: the original referenced ``debian_folder`` without ever defining
    it (NameError at runtime); the folder is now created with
    ``create_subdir`` exactly like the sibling BOTS ``run_bench``.
    """
    debian_folder = create_subdir(output_folder, "debian")
    debian_bench_command = "./benchs.sh {} 10 \"1 2 4 8 16\"".format(
        debian_folder)
    debian_bench = Remote(
        'cd ./unikernel-tools/benchs/rodinias && {}'.format(
            debian_bench_command), node).run()
def setup_node(node):
    """Clone the unikernel-tools repository on *node*.

    :return: True when the clone succeeded
    """
    clone_action = Remote(
        'git clone https://github.com/p-jacquot/unikernel-tools', node)
    return clone_action.run().ok
def run(self):
    """Main engine loop: reserve grid resources, deploy the hosts, then
    dispatch parameter combinations to worker threads (one slot per SMT
    core) until the sweep is exhausted or the job dies."""
    if self.options.oargrid_job_id:
        self.oargrid_job_id = self.options.oargrid_job_id
    else:
        self.oargrid_job_id = None
    try:
        # Creation of the main iterator which is used for the first control loop.
        self.define_parameters()
        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for the experiments
            if self.oargrid_job_id is None:
                self.make_reservation()
            # Wait that the job starts
            logger.info('Waiting that the job start')
            wait_oargrid_job_start(self.oargrid_job_id)
            # Retrieving the hosts and subnets parameters
            self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
            # Hosts deployment and configuration
            default_connection_params['user'] = '******'
            logger.info("Start hosts configuration")
            ex_log.setLevel('INFO')
            deployment = Deployment(
                hosts=self.hosts,
                env_file='/home/sirimie/env/mywheezy-x64-base.env')
            self.hosts, _ = deploy(deployment)
            # clean any leftovers from a previous run, then push the
            # experiment scripts and configuration to every host
            Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*",
                   self.hosts).run()
            Remote(
                "rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml",
                self.hosts).run()
            Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml",
                   self.hosts).run()
            Put(self.hosts, [
                "run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                "platform_aws.xml", "cloud_ec2.xml"
            ], remote_location="/home/Work/sgcbntier/paasage_demo/").run()
            logger.info("Done")
            if len(self.hosts) == 0:
                break
            # Initializing the resources and threads: one execution slot
            # per SMT core of every deployed host
            available_hosts = [
                host for host in self.hosts for i in range(
                    get_host_attributes(host)['architecture']['smt_size'])
            ]
            threads = {}
            # Creating the unique folder for storing the results
            comb_dir = self.result_dir + '/csv_results'
            if not os.path.exists(comb_dir):
                os.mkdir(comb_dir)
            # Checking that the job is running and not in Error
            while self.is_job_alive() or len(threads.keys()) > 0:
                job_is_dead = False
                # wait until enough slots are free, reclaiming slots from
                # finished worker threads
                while self.options.n_nodes > len(available_hosts):
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.append(tmp_threads[t]['host'])
                            del threads[t]
                    sleep(5)
                    if not self.is_job_alive():
                        job_is_dead = True
                        break
                if job_is_dead:
                    break
                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    # no combination left: drain the remaining workers
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break
                # take one slot and launch the combination on it
                host = available_hosts[0]
                available_hosts = available_hosts[1:]
                t = Thread(target=self.workflow,
                           args=(comb, host, comb_dir))
                threads[t] = {'host': host}
                t.daemon = True
                t.start()
            if not self.is_job_alive():
                job_is_dead = True
            if job_is_dead:
                # force a new reservation on the next outer iteration
                self.oargrid_job_id = None
    finally:
        if self.oargrid_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oargriddel([self.oargrid_job_id])
            else:
                logger.info('Keeping job alive for debugging')
def workflow(self, comb, host, comb_dir):
    """Run one RAevol combination on *host*.

    Either resumes a run found in the NFS backup directory (up to
    generation 300000) or generates a parameter file from the template,
    uploads it and starts a fresh aevol_create + aevol_run.  The sweeper
    is updated in the ``finally`` block whatever the outcome.

    :param comb: parameter combination (dict) from the sweeper
    :param host: host to run on (cluster name decides the process count)
    :param comb_dir: local directory for results (only logged here; the
        retrieval code is commented out)
    """
    comb_ok = False
    thread_name = style.Thread(str(host).split('.')[0]) + ': '
    logger.info(thread_name + 'Starting combination ' + slugify(comb))
    # per-cluster MPI process count, keyed on the Grid'5000 cluster name
    # embedded in the host name
    if 'parapluie' in str(host):
        nb_proc = 24
    elif 'paranoia' in str(host):
        nb_proc = 20
    elif 'parapide' in str(host):
        nb_proc = 8
    else:
        nb_proc = 16
    try:
        self.export = "source ~/aevol_binary/intel/linux/mkl/bin/mklvars.sh intel64; "
        bucketname = self.working_dir + '/raevol_5_mut_lat/' + slugify(
            comb) + '/'
        # make sure no stale run is still holding the node
        logger.info(thread_name + "Killing other RAevol")
        killa = Remote("killall -9 aevol_run", [host])
        for killp in killa.processes:
            killp.ignore_error = True
        killa.run()
        if os.path.isdir(bucketname) and os.path.exists(bucketname +
                                                        '/last_gener.txt'):
            # a backup exists: resume instead of restarting from scratch
            logger.info(thread_name + "Resuming AEVOL from NFS backup")
            gen_file = open(bucketname + '/last_gener.txt', 'r')
            last_gen = gen_file.read()
            if int(last_gen) < 300000:
                logger.info(thread_name + "Resuming AEVOL Run from " +
                            str(int(last_gen)))
                rem = Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p '
                    + str(nb_proc) + ' -e 300000 -r ' + last_gen +
                    ' >> aevol_run.log', [host],
                    process_args={
                        'default_stdout_handler': False,
                        'default_stderr_handler': False
                    }).run()
                if rem.ok:
                    comb_ok = True
            else:
                # already past the target generation: nothing to do
                comb_ok = True
        else:
            # fresh run: build the parameter file from the template
            Remote('mkdir -p ' + bucketname, [host]).run()
            param_file = '/home/jorouzaudcornabas/aevol_binary/aevol/execo/mut_lat/param_tmpl.in'
            logger.info(thread_name + 'Generate config file ' + param_file)
            f_template = open(param_file)
            fd, outfile = mkstemp(dir='/tmp/', prefix=slugify(comb) +
                                  '_param')
            f = os.fdopen(fd, 'w')
            for line in f_template:
                if 'CONFIGURE_ENVIRONMENT_VALUES' in line:
                    # expand the environment placeholder according to the
                    # combination's 'env' parameter
                    if comb['env'] == 'const':
                        line = line.replace('CONFIGURE_ENVIRONMENT_VALUES',
                                            'NB_ENVIRONMENTS 1')
                        f.write(line)
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.2 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.4 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.6 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.8 0.05' +
                                os.linesep)
                    elif comb['env'] == 'lat_3':
                        line = line.replace('CONFIGURE_ENVIRONMENT_VALUES',
                                            'NB_ENVIRONMENTS 2')
                        f.write(line)
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.2 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.4 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.6 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.8 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 2 0.5 0.2 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 2 0.5 0.4 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 2 0.5 0.65 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 2 0.5 0.8 0.05' +
                                os.linesep)
                    elif comb['env'] == 'lat_all':
                        line = line.replace('CONFIGURE_ENVIRONMENT_VALUES',
                                            'NB_ENVIRONMENTS 16')
                        f.write(line)
                        #const
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.2 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.4 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.6 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 1 0.5 0.8 0.05' +
                                os.linesep)
                        # 1
                        f.write('ENV_ADD_GAUSSIAN 2 0.5 0.25 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 2 0.5 0.4 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 2 0.5 0.6 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 2 0.5 0.8 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 3 0.5 0.2 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 3 0.5 0.45 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 3 0.5 0.6 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 3 0.5 0.8 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 4 0.5 0.2 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 4 0.5 0.4 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 4 0.5 0.65 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 4 0.5 0.8 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 5 0.5 0.2 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 5 0.5 0.4 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 5 0.5 0.6 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 5 0.5 0.85 0.05' +
                                os.linesep)
                        # 2
                        f.write('ENV_ADD_GAUSSIAN 6 0.5 0.25 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 6 0.5 0.45 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 6 0.5 0.6 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 6 0.5 0.8 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 7 0.5 0.25 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 7 0.5 0.4 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 7 0.5 0.65 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 7 0.5 0.8 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 8 0.5 0.25 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 8 0.5 0.4 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 8 0.5 0.6 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 8 0.5 0.85 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 9 0.5 0.2 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 9 0.5 0.45 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 9 0.5 0.65 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 9 0.5 0.8 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 10 0.5 0.2 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 10 0.5 0.45 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 10 0.5 0.6 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 10 0.5 0.85 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 11 0.5 0.2 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 11 0.5 0.4 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 11 0.5 0.65 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 11 0.5 0.85 0.05' +
                                os.linesep)
                        # 3
                        f.write('ENV_ADD_GAUSSIAN 12 0.5 0.25 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 12 0.5 0.45 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 12 0.5 0.65 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 12 0.5 0.8 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 13 0.5 0.25 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 13 0.5 0.45 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 13 0.5 0.6 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 13 0.5 0.85 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 14 0.5 0.25 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 14 0.5 0.4 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 14 0.5 0.65 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 14 0.5 0.85 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 15 0.5 0.2 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 15 0.5 0.45 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 15 0.5 0.65 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 15 0.5 0.85 0.05' +
                                os.linesep)
                        # 4
                        f.write('ENV_ADD_GAUSSIAN 16 0.5 0.25 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 16 0.5 0.45 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 16 0.5 0.65 0.05' +
                                os.linesep)
                        f.write('ENV_ADD_GAUSSIAN 16 0.5 0.85 0.05' +
                                os.linesep)
                elif 'CONFIGURE_SIGNAL_VALUES' in line:
                    # expand the signal placeholder according to 'env'
                    if comb['env'] == 'const':
                        line = line.replace('CONFIGURE_SIGNAL_VALUES', '')
                        f.write(line)
                    elif comb['env'] == 'lat_3':
                        line = line.replace(
                            'CONFIGURE_SIGNAL_VALUES',
                            'CREATE_SIGNAL h0 h0 h0 w0 m0 m1 m0 h1 h0 m0 h0 m1 h1 w0 h1 h0 m1 h1 m0 w0 w0 m0 w0 h0 h1 m1 w0 m0 m1 m0 w0 h1 h0 m0 h0 m1 h1 w0 h0 w0 m0 m1 m0 w0 h1 h0 w0 w0 h1'
                        )
                        f.write(line)
                        f.write('ENV_ADD_SIGNAL 2 1' + os.linesep)
                    elif comb['env'] == 'lat_all':
                        line = line.replace(
                            'CONFIGURE_SIGNAL_VALUES',
                            'CREATE_SIGNAL h0 w0 h1 m1 w0 h1 m0 h0 h1 w0 h0 m1 h1 h1 m1 m0 h0 w0 h1 m1 w0 h1 m0 h0 h1 w0 h0 m1 h1 h1 m1 m0 h1 m0 m1'
                        )
                        f.write(line)
                        f.write(
                            'CREATE_SIGNAL m0 h0 m1 h1 m1 w0 m0 m1 m0 h0 m1 h1 w0 h0 h0 h1 m1 m0 h1 w0 h1 h0 m1 h1 m0 w0 w0 m0 m1 w0 w0 h1 h0 w0 h1 h0 h0 m0 h0 w0 h0 m1 m0 w0 h1 w0 w0 h1 m0'
                            + os.linesep)
                        f.write(
                            'CREATE_SIGNAL h0 h0 h0 w0 m0 m1 m0 h1 h0 m0 h0 m1 h1 w0 h1 h0 m1 h1 m0 w0 w0 m0 w0 h0 h1 m1 w0 m0 m1 m0 w0 h1 h0 m0 h0 m1 h1 w0 h0 w0 m0 m1 m0 w0 h1 h0 w0 w0 h1'
                            + os.linesep)
                        f.write(
                            'CREATE_SIGNAL h1 h1 m0 w0 w0 h1 m1 h1 h1 m1 m0 w0 m1 m0 m0 w0 m0 h0 m0 h0 w0 h0 m0 h0 h1 m1 h0 h1 w0 h0 h1 m1 h1 m1 m0'
                            + os.linesep)
                        f.write('ENV_ADD_SIGNAL 2 1' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 3 2' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 4 3' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 5 4' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 6 1' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 6 2' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 7 1' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 7 3' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 8 1' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 8 4' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 9 2' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 9 3' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 10 2' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 10 4' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 11 3' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 11 4' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 12 1' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 12 2' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 12 3' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 13 1' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 13 2' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 13 4' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 14 1' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 14 3' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 14 4' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 15 2' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 15 3' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 15 4' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 16 1' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 16 2' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 16 3' + os.linesep)
                        f.write('ENV_ADD_SIGNAL 16 4' + os.linesep)
                else:
                    # plain template line: substitute the scalar params
                    line = line.replace('SEED_NUMBER', str(comb['seed']))
                    line = line.replace('MUTATION_RATE_VALUE',
                                        comb['mutation'])
                    line = line.replace('SELECTION_PRESSURE',
                                        str(comb['selection']))
                    f.write(line)
            f_template.close()
            f.close()
            # upload the generated parameter file, then start the run
            put_file = Put([host], [outfile],
                           remote_location=bucketname).run()
            if not put_file.ok:
                exit()
            os.remove(outfile)
            Remote(
                'cd ' + bucketname + '; cp ' + outfile.split('/')[-1] +
                ' param.in; cp /home/jorouzaudcornabas/aevol_binary/aevol/execo/mut_lat/binding_matrix.rae .',
                [host]).run()
            logger.info(thread_name + "Launching AEVOL Create")
            Remote(
                self.export + 'cd ' + bucketname +
                '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_create > aevol_create.log',
                [host],
                process_args={
                    'default_stdout_handler': False,
                    'default_stderr_handler': False
                }).run()
            logger.info(thread_name + "Launching AEVOL Run")
            rem = Remote(
                self.export + 'cd ' + bucketname +
                '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p '
                + str(nb_proc) + ' -n 300000 > aevol_run.log', [host],
                process_args={
                    'default_stdout_handler': False,
                    'default_stderr_handler': False
                }).run()
            if rem.ok:
                comb_ok = True
        logger.info(thread_name + 'Get results ' + comb_dir + "/" +
                    slugify(comb))
        # result retrieval kept commented out, as in the original:
        #try:
        #os.mkdir(comb_dir + "/" + slugify(comb))
        #except:
        #logger.warning(thread_name +
        #'%s already exists, removing existing files', comb_dir + "/" + slugify(comb))
        #shutil.rmtree(comb_dir+ "/" + slugify(comb))
        #try:
        #os.mkdir(comb_dir + "/" + slugify(comb))
        #except:
        #logger.warning(thread_name +
        #'%s already exists, recreating directory', comb_dir + "/" + slugify(comb))
        #get_results = Get([host], [bucketname+ "/aevol_create.log", bucketname+ "/aevol_run.log", bucketname+'/stats/'],
        #local_location=comb_dir + "/" + slugify(comb)).run()
        #for p in get_results.processes:
        #if not p.ok:
        #logger.error(thread_name +
        #': Unable to retrieve the files for combination %s',
        #slugify(comb))
        #exit()
    finally:
        # always report the combination state back to the sweeper
        if comb_ok:
            self.sweeper.done(comb)
            # shutil.rmtree(bucketname)
            logger.info(thread_name + ': ' + slugify(comb) +
                        ' has been done')
        else:
            self.sweeper.cancel(comb)
            logger.warning(thread_name + ': ' + slugify(comb) +
                           ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
def workflow(self, comb, host, comb_dir):
    """Run one parameter combination of the R-AEVOL experiment on *host*.

    Either resumes a run from the NFS backup found in the combination's
    bucket directory (when ``last_gener.txt`` exists there), or generates
    a fresh ``param.in`` from the local template, pushes it to the host
    and launches ``aevol_create`` then ``aevol_run`` remotely.

    Args:
        comb (dict): parameter combination; keys used here are 'seed',
            'fuzzy', 'move' and 'height'.
        host: remote host the combination is executed on.
        comb_dir (str): local directory where results would be collected
            (retrieval is currently disabled — see note below).
    """
    comb_ok = False
    thread_name = style.Thread(str(host).split('.')[0]) + ': '
    logger.info(thread_name + 'Starting combination ' + slugify(comb))

    try:
        # The Intel MKL environment must be sourced before every remote
        # AEVOL command.
        self.export = "source ~/aevol_binary/intel/linux/mkl/bin/mklvars.sh intel64; "

        bucketname = self.working_dir + '/raevol_5_mut_lat/' + slugify(
            comb) + '/'

        if os.path.isdir(bucketname) and os.path.exists(bucketname +
                                                        '/last_gener.txt'):
            # A previous run left a backup on NFS: resume instead of
            # restarting from scratch.
            logger.info(thread_name + "Resuming AEVOL from NFS backup")

            # FIX: context manager closes the generation file (the
            # original leaked the open file handle).
            with open(bucketname + '/last_gener.txt', 'r') as gen_file:
                last_gen = gen_file.read()

            if int(last_gen) < 500000:
                logger.info(thread_name + "Resuming AEVOL Run from " +
                            str(int(last_gen)))
                rem = Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p 16' +
                    ' -e 300000 -r ' + last_gen + ' >> aevol_run.log',
                    [host]).run()
                if rem.ok:
                    comb_ok = True
            else:
                # Already reached the final generation: nothing to do.
                comb_ok = True
        else:
            # Fresh run: create the bucket directory and instantiate the
            # parameter template for this combination.
            Remote('mkdir -p ' + bucketname, [host]).run()

            param_file = '/home/jorouzaudcornabas/aevol_binary/execo/mut_lat/param_tmpl.in'
            logger.info(thread_name + 'Generate config file ' + param_file)

            fd, outfile = mkstemp(dir='/tmp/', prefix=slugify(comb) + '_param')
            # FIX: context managers guarantee both the template and the
            # temp file are closed even if a substitution raises (the
            # original only closed them on the happy path).
            with open(param_file) as f_template, os.fdopen(fd, 'w') as f:
                for line in f_template:
                    line = line.replace('SEED_NUMBER', str(comb['seed']))
                    line = line.replace('FUZZY_VERSION', str(comb['fuzzy']))
                    if comb['move']:
                        line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.25')
                        line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.65')
                    else:
                        line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.2')
                        line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.6')
                    line = line.replace('GAUSSIAN_HEIGHT', str(comb['height']))
                    f.write(line)

            put_file = Put([host], [outfile], remote_location=bucketname).run()
            if not put_file.ok:
                # NOTE(review): aborts the whole process on a failed
                # transfer, exactly as the original code did.
                exit()
            os.remove(outfile)

            # Rename the uploaded temp file to the canonical param.in.
            Remote(
                'cd ' + bucketname + '; cp ' + outfile.split('/')[-1] +
                ' param.in', [host]).run()

            logger.info(thread_name + "Launching AEVOL Create")
            Remote(
                self.export + 'cd ' + bucketname +
                '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_create > aevol_create.log',
                [host]).run()

            logger.info(thread_name + "Launching AEVOL Run")
            rem = Remote(
                self.export + 'cd ' + bucketname +
                '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_run -p 16 -n 500000 > aevol_run.log',
                [host]).run()
            if rem.ok:
                comb_ok = True

            logger.info(thread_name + 'Get results ' + comb_dir + "/" +
                        slugify(comb))
            # Result retrieval (Get of aevol_create.log, aevol_run.log and
            # stats/ into comb_dir) is deliberately disabled; results stay
            # in the NFS bucket directory.
    finally:
        # Always record the combination's outcome in the sweeper so a
        # cancelled combination can be retried by a later run.
        if comb_ok:
            self.sweeper.done(comb)
            logger.info(thread_name + ': ' + slugify(comb) +
                        ' has been done')
        else:
            self.sweeper.cancel(comb)
            logger.warning(thread_name + ': ' + slugify(comb) +
                           ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
def run(self):
    """Run the experiment.

    Reads the JSON configuration passed as the first CLI argument,
    reserves (or reuses) an OAR job on a Grid'5000 site, deploys the
    nodes, installs and configures an OAR cluster on them (server on
    nodes[0], compute resources on nodes[1:]), then replays every
    configured workload through oar_replay.py, tracking progress with a
    ParamSweeper so interrupted campaigns can be resumed.
    """
    already_configured = self.options.already_configured
    # Reuse an existing reservation when an id was given on the CLI.
    reservation_job_id = int(self.options.reservation_id) \
        if self.options.reservation_id is not None else None
    is_a_test = self.options.is_a_test

    if is_a_test:
        logger.warn('THIS IS A TEST! This run will use only a few '
                    'resources')

    # make the result folder writable for all
    os.chmod(self.result_dir, 0o777)
    # Import configuration from the first positional argument (JSON).
    with open(self.args[0]) as config_file:
        config = json.load(config_file)
    # backup configuration
    copy(self.args[0], self.result_dir)

    site = config["grid5000_site"]
    resources = config["resources"]
    nb_experiment_nodes = config["nb_experiment_nodes"]
    walltime = str(config["walltime"])
    env_name = config["kadeploy_env_name"]
    workloads = config["workloads"]
    # Check that every workload file exists locally (assumes the same NFS
    # mount point is present on both the remote and local environments).
    for workload_file in workloads:
        with open(workload_file):
            pass
        # copy the workloads files to the results dir
        copy(workload_file, self.result_dir)

    # define the workloads parameters
    self.parameters = {
        'workload_filename': workloads
    }
    logger.info('Workloads: {}'.format(workloads))

    # Define the iterator over the parameter combinations; persisted in
    # result_dir/sweeps so a re-run (-c result_dir) skips finished combs.
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))

    # Due to a previous run (using -c result_dir) some combinations may
    # already be marked as skipped.
    logger.info('Skipped parameters:' +
                '{}'.format(str(self.sweeper.get_skipped())))
    logger.info('Number of parameters combinations {}'.format(
        str(len(self.sweeper.get_remaining()))))
    logger.info('combinations {}'.format(
        str(self.sweeper.get_remaining())))

    if reservation_job_id is not None:
        jobs = [(reservation_job_id, site)]
    else:
        # Submit a new deploy job on the requested site.
        jobs = oarsub([(OarSubmission(resources=resources,
                                      job_type='deploy',
                                      walltime=walltime), site)])
    job_id, site = jobs[0]
    if job_id:
        try:
            logger.info("waiting job start %s on %s" % (job_id, site))
            wait_oar_job_start(
                job_id, site, prediction_callback=prediction_callback)
            logger.info("getting nodes of %s on %s" % (job_id, site))
            nodes = get_oar_job_nodes(job_id, site)
            # sort the nodes
            nodes = sorted(nodes, key=lambda node: node.address)
            # get only the necessary nodes under the switch
            if nb_experiment_nodes > len(nodes):
                raise RuntimeError('The number of given node in the '
                                   'reservation ({}) do not match the '
                                   'requested resources '
                                   '({})'.format(len(nodes),
                                                 nb_experiment_nodes))
            nodes = nodes[:nb_experiment_nodes]
            logger.info("deploying nodes: {}".format(str(nodes)))
            deployed, undeployed = deploy(
                Deployment(nodes, env_name=env_name),
                check_deployed_command=already_configured)
            if undeployed:
                logger.warn(
                    "NOT deployed nodes: {}".format(str(undeployed)))
                raise RuntimeError('Deployement failed')

            if not already_configured:
                # install OAR: compute nodes in the background, server
                # synchronously, then join both.
                # NOTE(review): the ssh user appears redacted ('******')
                # — confirm the real account before running.
                install_cmd = "apt-get update; apt-get install -y "
                node_packages = "oar-node"
                logger.info(
                    "installing OAR nodes: {}".format(str(nodes[1:])))
                install_oar_nodes = Remote(
                    install_cmd + node_packages,
                    nodes[1:],
                    connection_params={'user': '******'})
                install_oar_nodes.start()

                server_packages = ("oar-server oar-server-pgsql oar-user "
                                   "oar-user-pgsql postgresql python3-pip "
                                   "libjson-perl postgresql-server-dev-all")
                # Install the oar3 kamelot scheduler from git at a pinned
                # commit and expose it to OAR's scheduler directory.
                install_oar_sched_cmd = """
                mkdir -p /opt/oar_sched; \
                cd /opt/oar_sched; \
                git clone https://github.com/oar-team/oar3.git; \
                cd oar3; \
                git checkout dce942bebc2; \
                pip3 install -e .; \
                cd /usr/lib/oar/schedulers; \
                ln -s /usr/local/bin/kamelot; \
                pip3 install psycopg2
                """
                logger.info("installing OAR server node: {}".format(str(nodes[0])))
                install_master = SshProcess(install_cmd + server_packages +
                                            ";" + install_oar_sched_cmd,
                                            nodes[0],
                                            connection_params={'user': '******'})
                install_master.run()
                install_oar_nodes.wait()

                if not install_master.ok:
                    # Best effort: report the failure but keep going.
                    Report(install_master)

                # Point oar.conf at the local Pg database, redirect the
                # log into result_dir, disable the cpuset resource
                # manager and phoenix (FINAUD) checks.
                configure_oar_cmd = """
                sed -i \
                    -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                    -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                    -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                    -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                    -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                    -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                    -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                    -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                    /etc/oar/oar.conf
                """.format(result_dir=self.result_dir)
                configure_oar = Remote(configure_oar_cmd,
                                       nodes,
                                       connection_params={'user': '******'})
                configure_oar.run()
                logger.info("OAR is configured on all nodes")

                # Configure server: create the DB and route the default
                # queue through the kamelot scheduler.
                create_db = "oar-database --create --db-is-local"
                config_oar_sched = ("oarnotify --remove-queue default;"
                                    "oarnotify --add-queue default,1,kamelot")
                start_oar = "systemctl start oar-server.service"
                logger.info(
                    "configuring OAR database: {}".format(str(nodes[0])))
                config_master = SshProcess(create_db + ";" +
                                           config_oar_sched + ";" + start_oar,
                                           nodes[0],
                                           connection_params={'user': '******'})
                config_master.run()

                # Propagate the server's oar SSH keys to the compute
                # nodes via a local staging copy in /tmp/.ssh.
                logger.info("configuring OAR SSH")
                oar_key = "/tmp/.ssh"
                Process('rm -rf ' + oar_key).run()
                Process('scp -o BatchMode=yes -o PasswordAuthentication=no '
                        '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                        '-o ConnectTimeout=20 -rp -o User=root ' +
                        nodes[0].address + ":/var/lib/oar/.ssh"
                        ' ' + oar_key).run()
                # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                Put(nodes[1:], [oar_key], "/var/lib/oar/",
                    connection_params={'user': '******'}).run()

                # Declare one host per compute node with a fixed fake
                # topology (1 cpu, 4 cores, 16 units of memory).
                add_resources_cmd = """
                oarproperty -a cpu || true; \
                oarproperty -a core || true; \
                oarproperty -c -a host || true; \
                oarproperty -a mem || true; \
                """
                for node in nodes[1:]:
                    add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(node=node.address)

                add_resources = SshProcess(add_resources_cmd,
                                           nodes[0],
                                           connection_params={'user': '******'})
                add_resources.run()

                if add_resources.ok:
                    logger.info("oar is now configured!")
                else:
                    raise RuntimeError("error in the OAR configuration: Abort!")

            # TODO: back up the OAR configuration

            # Do the replay: one oar_replay.py run per remaining
            # workload, stdout captured to <workload>.out in result_dir.
            logger.info('begining the replay')
            while len(self.sweeper.get_remaining()) > 0:
                combi = self.sweeper.get_next()
                workload_file = os.path.basename(combi['workload_filename'])
                oar_replay = SshProcess(script_path + "/oar_replay.py " +
                                        combi['workload_filename'] + " " +
                                        self.result_dir + " oar_gant_" +
                                        workload_file,
                                        nodes[0])
                oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                  workload_file + '.out')
                logger.info("replaying workload: {}".format(combi))
                oar_replay.run()
                if oar_replay.ok:
                    logger.info("Replay workload OK: {}".format(combi))
                    self.sweeper.done(combi)
                else:
                    logger.info("Replay workload NOT OK: {}".format(combi))
                    self.sweeper.cancel(combi)
                    raise RuntimeError("error in the OAR replay: Abort!")

        # NOTE(review): bare except — also catches KeyboardInterrupt;
        # deliberately drops into ipdb for interactive post-mortem.
        except:
            traceback.print_exc()
            ipdb.set_trace()

        finally:
            if is_a_test:
                ipdb.set_trace()
            # Only delete the job if we submitted it ourselves.
            if reservation_job_id is None:
                logger.info("delete job: {}".format(jobs))
                oardel(jobs)
def run(self):
    """Run the experiment.

    Reads the JSON configuration passed as the first CLI argument,
    reserves (or reuses) an OAR job on a Grid'5000 site, deploys the
    nodes, installs and configures an OAR cluster on them (server on
    nodes[0], compute resources on nodes[1:]), then replays every
    configured workload through oar_replay.py, tracking progress with a
    ParamSweeper so interrupted campaigns can be resumed.
    """
    already_configured = self.options.already_configured
    # Reuse an existing reservation when an id was given on the CLI.
    reservation_job_id = int(self.options.reservation_id) \
        if self.options.reservation_id is not None else None
    is_a_test = self.options.is_a_test

    if is_a_test:
        logger.warn('THIS IS A TEST! This run will use only a few '
                    'resources')

    # make the result folder writable for all
    os.chmod(self.result_dir, 0o777)
    # Import configuration from the first positional argument (JSON).
    with open(self.args[0]) as config_file:
        config = json.load(config_file)
    # backup configuration
    copy(self.args[0], self.result_dir)

    site = config["grid5000_site"]
    resources = config["resources"]
    nb_experiment_nodes = config["nb_experiment_nodes"]
    walltime = str(config["walltime"])
    env_name = config["kadeploy_env_name"]
    workloads = config["workloads"]
    # Check that every workload file exists locally (assumes the same NFS
    # mount point is present on both the remote and local environments).
    for workload_file in workloads:
        with open(workload_file):
            pass
        # copy the workloads files to the results dir
        copy(workload_file, self.result_dir)

    # define the workloads parameters
    self.parameters = {'workload_filename': workloads}
    logger.info('Workloads: {}'.format(workloads))

    # Define the iterator over the parameter combinations; persisted in
    # result_dir/sweeps so a re-run (-c result_dir) skips finished combs.
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))

    # Due to a previous run (using -c result_dir) some combinations may
    # already be marked as skipped.
    logger.info('Skipped parameters:' +
                '{}'.format(str(self.sweeper.get_skipped())))
    logger.info('Number of parameters combinations {}'.format(
        str(len(self.sweeper.get_remaining()))))
    logger.info('combinations {}'.format(str(
        self.sweeper.get_remaining())))

    if reservation_job_id is not None:
        jobs = [(reservation_job_id, site)]
    else:
        # Submit a new deploy job on the requested site.
        jobs = oarsub([(OarSubmission(resources=resources,
                                      job_type='deploy',
                                      walltime=walltime), site)])
    job_id, site = jobs[0]
    if job_id:
        try:
            logger.info("waiting job start %s on %s" % (job_id, site))
            wait_oar_job_start(job_id,
                               site,
                               prediction_callback=prediction_callback)
            logger.info("getting nodes of %s on %s" % (job_id, site))
            nodes = get_oar_job_nodes(job_id, site)
            # sort the nodes
            nodes = sorted(nodes, key=lambda node: node.address)
            # get only the necessary nodes under the switch
            if nb_experiment_nodes > len(nodes):
                raise RuntimeError('The number of given node in the '
                                   'reservation ({}) do not match the '
                                   'requested resources '
                                   '({})'.format(len(nodes),
                                                 nb_experiment_nodes))
            nodes = nodes[:nb_experiment_nodes]
            logger.info("deploying nodes: {}".format(str(nodes)))
            deployed, undeployed = deploy(
                Deployment(nodes, env_name=env_name),
                check_deployed_command=already_configured)
            if undeployed:
                logger.warn("NOT deployed nodes: {}".format(
                    str(undeployed)))
                raise RuntimeError('Deployement failed')

            if not already_configured:
                # install OAR: compute nodes in the background, server
                # synchronously, then join both.
                # NOTE(review): the ssh user appears redacted ('******')
                # — confirm the real account before running.
                install_cmd = "apt-get update; apt-get install -y "
                node_packages = "oar-node"
                logger.info("installing OAR nodes: {}".format(
                    str(nodes[1:])))
                install_oar_nodes = Remote(
                    install_cmd + node_packages,
                    nodes[1:],
                    connection_params={'user': '******'})
                install_oar_nodes.start()

                server_packages = (
                    "oar-server oar-server-pgsql oar-user "
                    "oar-user-pgsql postgresql python3-pip "
                    "libjson-perl postgresql-server-dev-all")
                # Install the oar3 kamelot scheduler from git at a pinned
                # commit and expose it to OAR's scheduler directory.
                install_oar_sched_cmd = """
                mkdir -p /opt/oar_sched; \
                cd /opt/oar_sched; \
                git clone https://github.com/oar-team/oar3.git; \
                cd oar3; \
                git checkout dce942bebc2; \
                pip3 install -e .; \
                cd /usr/lib/oar/schedulers; \
                ln -s /usr/local/bin/kamelot; \
                pip3 install psycopg2
                """
                logger.info("installing OAR server node: {}".format(
                    str(nodes[0])))
                install_master = SshProcess(
                    install_cmd + server_packages + ";" +
                    install_oar_sched_cmd,
                    nodes[0],
                    connection_params={'user': '******'})
                install_master.run()
                install_oar_nodes.wait()

                if not install_master.ok:
                    # Best effort: report the failure but keep going.
                    Report(install_master)

                # Point oar.conf at the local Pg database, redirect the
                # log into result_dir, disable the cpuset resource
                # manager and phoenix (FINAUD) checks.
                configure_oar_cmd = """
                sed -i \
                    -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                    -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                    -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                    -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                    -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                    -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                    -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                    -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                    /etc/oar/oar.conf
                """.format(result_dir=self.result_dir)
                configure_oar = Remote(configure_oar_cmd,
                                       nodes,
                                       connection_params={'user': '******'})
                configure_oar.run()
                logger.info("OAR is configured on all nodes")

                # Configure server: create the DB and route the default
                # queue through the kamelot scheduler.
                create_db = "oar-database --create --db-is-local"
                config_oar_sched = (
                    "oarnotify --remove-queue default;"
                    "oarnotify --add-queue default,1,kamelot")
                start_oar = "systemctl start oar-server.service"
                logger.info("configuring OAR database: {}".format(
                    str(nodes[0])))
                config_master = SshProcess(
                    create_db + ";" + config_oar_sched + ";" + start_oar,
                    nodes[0],
                    connection_params={'user': '******'})
                config_master.run()

                # Propagate the server's oar SSH keys to the compute
                # nodes via a local staging copy in /tmp/.ssh.
                logger.info("configuring OAR SSH")
                oar_key = "/tmp/.ssh"
                Process('rm -rf ' + oar_key).run()
                Process(
                    'scp -o BatchMode=yes -o PasswordAuthentication=no '
                    '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                    '-o ConnectTimeout=20 -rp -o User=root ' +
                    nodes[0].address + ":/var/lib/oar/.ssh"
                    ' ' + oar_key).run()
                # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                Put(nodes[1:], [oar_key],
                    "/var/lib/oar/",
                    connection_params={
                        'user': '******'
                    }).run()

                # Declare one host per compute node with a fixed fake
                # topology (1 cpu, 4 cores, 16 units of memory).
                add_resources_cmd = """
                oarproperty -a cpu || true; \
                oarproperty -a core || true; \
                oarproperty -c -a host || true; \
                oarproperty -a mem || true; \
                """
                for node in nodes[1:]:
                    add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(
                        node=node.address)

                add_resources = SshProcess(
                    add_resources_cmd,
                    nodes[0],
                    connection_params={'user': '******'})
                add_resources.run()

                if add_resources.ok:
                    logger.info("oar is now configured!")
                else:
                    raise RuntimeError(
                        "error in the OAR configuration: Abort!")

            # TODO: back up the OAR configuration

            # Do the replay: one oar_replay.py run per remaining
            # workload, stdout captured to <workload>.out in result_dir.
            logger.info('begining the replay')
            while len(self.sweeper.get_remaining()) > 0:
                combi = self.sweeper.get_next()
                workload_file = os.path.basename(
                    combi['workload_filename'])
                oar_replay = SshProcess(
                    script_path + "/oar_replay.py " +
                    combi['workload_filename'] + " " + self.result_dir +
                    " oar_gant_" + workload_file,
                    nodes[0])
                oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                  workload_file + '.out')
                logger.info("replaying workload: {}".format(combi))
                oar_replay.run()
                if oar_replay.ok:
                    logger.info("Replay workload OK: {}".format(combi))
                    self.sweeper.done(combi)
                else:
                    logger.info("Replay workload NOT OK: {}".format(combi))
                    self.sweeper.cancel(combi)
                    raise RuntimeError("error in the OAR replay: Abort!")

        # NOTE(review): bare except — also catches KeyboardInterrupt;
        # deliberately drops into ipdb for interactive post-mortem.
        except:
            traceback.print_exc()
            ipdb.set_trace()

        finally:
            if is_a_test:
                ipdb.set_trace()
            # Only delete the job if we submitted it ourselves.
            if reservation_job_id is None:
                logger.info("delete job: {}".format(jobs))
                oardel(jobs)