def create_paramsweeper(parameters, result_dir):
    """Generate an iterator over combination parameters

    This function initializes a `ParamSweeper` as an iterator over the
    possible parameters space (The dictionary of parameters space is
    created from the `define_parameters` function.). The detail
    information about the `ParamSweeper` can be found here:
    http://execo.gforge.inria.fr/doc/latest-stable/execo_engine.html#paramsweeper

    Parameters
    ----------
    parameters: dict
        a dictionary contains the parameters space
        key: str, the name of the experiment parameter
        value: list, a list of possible values for a parameter of the experiment
    result_dir: str
        the path to the result directory on the disk for `ParamSweeper`
        to persist the state of combinations

    Returns
    -------
    ParamSweeper
        an instance of the `ParamSweeper` object.
    """
    # Fix: pass lazy %-style arguments to the logger instead of eagerly
    # formatting with the % operator — the message is only rendered when
    # the level is enabled, and a '%' inside `parameters` cannot break it.
    logger.debug('Parameters:\n%s', parameters)
    sweeps = sweep(parameters)
    sweeper = ParamSweeper(os.path.join(result_dir, "sweeps"), sweeps)
    logger.info('-----> TOTAL COMBINATIONS: %s', len(sweeps))
    # Hoist the duplicated get_remaining() call.
    n_remaining = len(sweeper.get_remaining())
    if n_remaining < len(sweeps):
        logger.info('%s combinations remaining\n', n_remaining)
    return sweeper
def define_parameters(self):
    """Initialise ``self.sweeper`` from the XML configuration.

    Reads the experiment parameter space from ``conf.xml`` and persists
    the sweep state under ``<result_dir>/sweeps``.
    """
    params = self.get_parameters("conf.xml")
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(params))
    logger.info('Number of parameters combinations %s',
                len(self.sweeper.get_remaining()))
def create_paramsweeper(self):
    """Generate an iterator over combination parameters.

    Uses ``self.parameters`` when it is already defined, otherwise
    builds the space via :meth:`define_parameters`, then persists the
    sweep state under ``<result_dir>/sweeps``.
    """
    # Bug fix: the original only assigned `parameters` inside the
    # `if self.parameters is None` branch, so a pre-populated
    # self.parameters raised NameError on the next line.
    if self.parameters is None:
        parameters = self.define_parameters()
    else:
        parameters = self.parameters
    logger.detail(pformat(parameters))
    sweeps = sweep(parameters)
    # Typo fix: '% s' -> '%s' (the space flag is a no-op for %s).
    logger.info('%s combinations', len(sweeps))
    self.sweeper = ParamSweeper(path.join(self.result_dir, "sweeps"),
                                sweeps)
def define_parameters(self): nbNodes = len(self.cluster) # build parameters and make nbCore list per benchmark freqList = [2534000, 2000000, 1200000] n_nodes = float(len(self.cluster)) max_core = SshProcess('cat /proc/cpuinfo | grep -i processor |wc -l', self.cluster[0], connection_params={ 'user': '******' }).run().stdout max_core = n_nodes * float(max_core) even = filter( lambda i: i > n_nodes, list(takewhile(lambda i: i < max_core, (2**i for i in count(0, 1))))) powerTwo = filter( lambda i: i > n_nodes, list(takewhile(lambda i: i < max_core, (i**2 for i in count(0, 1))))) # Define parameters self.parameters = { 'Repeat': [1], "Freq": [2534000], "NPBclass": ['C'], "Benchmark": { # 'ft': { # 'n_core': even # }, # 'ep': { # 'n_core': even # }, # 'lu': { # 'n_core': even # }, # 'is': { # 'n_core': even # }, # 'sg': { # 'n_core': even # }, # 'bt': { # 'n_core': powerTwo # }, 'sp': { 'n_core': powerTwo } } } logger.info(self.parameters) # make all possible parameters object, self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweep(self.parameters)) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))
def create_sweeper(self):
    """Define the parameter space and return a sweeper."""
    # Physical parameter space explored by the simulations.
    space = {
        'RA': ['1.e5', '1.e6', '1.e7'],
        'RCMB': [2.],
        'KFe': [0.85, 0.9, 0.95, 0.99],
    }
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(space))
def define_parameters(self):
    """Initialise the sweeper over every build/runtime configuration."""
    space = {
        'blas': ['none', 'mkl', 'atlas', 'openblas'],
        'experiment': ['aevol', 'raevol'],
        'compilator': ['gcc', 'intel'],
        'parallel': ['openmp', 'tbb'],
    }
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(space))
    logger.info('Number of parameters combinations %s',
                len(self.sweeper.get_remaining()))
def create_paramsweeper(self):
    """Test all the sites, with or without a KaVLAN and for several env."""
    # Dimensions of the deployment test matrix.
    space = {}
    space["version"] = ['kadeploy3-dev', 'kadeploy3']
    space["kavlan"] = [True, False]
    space["site"] = get_g5k_sites()
    space["n_nodes"] = [1, 4, 10]
    space["env"] = ['wheezy-x64-base', 'wheezy-x64-prod', 'wheezy-x64-xen']
    logger.info('Defining parameters: %s', pformat(space))
    return ParamSweeper(self.result_dir + "/sweeper", sweep(space))
def define_parameters(self):
    """Create the iterator that contains the parameters to be explored."""
    self.parameters = dict(
        sizes=[100],
        zipf=[1],
        pop_keys=[100],
        min_size=[500, 1000],
        int_phases=[1, 2, 3, 4, 5, 10],
        iosf=[100],
    )
    logger.info(self.parameters)
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))
    logger.info('Number of parameters combinations %s',
                len(self.sweeper.get_remaining()))
def define_parameters(self):
    """ Define the parametters used by the L2C application """
    # Every cluster from the two sites, minus the excluded one.
    clusters = []
    for site in ['grenoble', 'nancy']:
        for cluster in get_site_clusters(site):
            if cluster != 'graphite':
                clusters.append(cluster)
    parameters = {
        'cluster': clusters,
        'cores': {i: {'px': expRange(1, i)} for i in expRange(4, 64)},
        'datasize': expRange(256, 256),
        'transposition': ['XYZ', 'XZY', 'YXZ', 'YZX', 'ZXY', 'ZYX'],
    }
    logger.info(pformat(parameters))
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(parameters))
    logger.info('Number of parameters combinations %s',
                len(self.sweeper.get_remaining()))
def define_parameters(self):
    """Build the sweeper over seeds, mutation rates, environments
    and selection strengths."""
    seeds = [51456165, 33263658, 7158785, 456847894,
             1223144, 878944, 121145, 3587842]
    space = {
        'seed': seeds,
        'mutation': ['5e-4', '1e-4', '5e-5', '5e-6'],
        'env': ['const', 'lat_3', 'lat_all'],
        'selection': [750, 2000, 4000],
    }
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(space))
    logger.info('Number of parameters combinations %s',
                len(self.sweeper.get_remaining()))
def run(self):
    """Benchmark Dropbox transfers over the size/interface sweep.

    For each combination, creates a file of the requested size, uploads
    then downloads it through the SDK, and appends the timings to
    ``<result_dir>/results.txt``. REST combinations are skipped.
    """
    # SECURITY NOTE(review): hard-coded OAuth token committed in source;
    # it should be loaded from a config file or environment variable.
    token = 'bRIJb9jp5igAAAAAAAAACc5QzQ619Vp0pYa2PdIrt0q2y0qFyJgwrKvtzuTp3Sz_'
    client = dropbox.client.DropboxClient(token)
    parameters = {'size': igeom(128, 2048, 5), 'db_if': ['rest', 'sdk']}
    combs = sweep(parameters)
    sweeper = ParamSweeper(self.result_dir + "/sweeps", combs)
    # `with` guarantees the results file is closed even if a transfer fails
    # (the original leaked the handle on any exception).
    with open(self.result_dir + '/results.txt', 'w') as f:
        while len(sweeper.get_remaining()) > 0:
            comb = sweeper.get_next()
            logger.info('Treating combination %s', pformat(comb))
            comb_dir = self.result_dir + '/' + slugify(comb)
            # Fix: only swallow "directory exists"; the original bare
            # `except: pass` hid every other error.
            try:
                os.mkdir(comb_dir)
            except OSError:
                pass
            fname = self.create_file(comb['size'])
            timer = Timer()
            if comb['db_if'] == 'sdk':
                self.upload_file_sdk(client, fname, fname.split('/')[-1])
                up_time = timer.elapsed()
                self.download_file_sdk(client, fname.split('/')[-1],
                                       comb_dir + fname.split('/')[-1])
                dl_time = timer.elapsed() - up_time
                sweeper.done(comb)
            elif comb['db_if'] == 'rest':
                logger.warning('REST interface not implemented')
                sweeper.skip(comb)
                continue
            os.remove(fname)
            f.write("%f %i %f %f \n" % (timer.start_date(), comb['size'],
                                        up_time, dl_time))
def define_parameters(self):
    """ Define the parametters used by the L2C application """
    cluster_list = [c
                    for site in ['grenoble', 'nancy']
                    for c in get_site_clusters(site)
                    if c != 'graphite']
    space = dict(
        cluster=cluster_list,
        cores={n: {'px': expRange(1, n)} for n in expRange(4, 64)},
        datasize=expRange(256, 256),
        transposition=['XYZ', 'XZY', 'YXZ', 'YZX', 'ZXY', 'ZYX'],
    )
    logger.info(pformat(space))
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(space))
    logger.info('Number of parameters combinations %s',
                len(self.sweeper.get_remaining()))
def define_parameters(self):
    """Create the iterator on the parameters combinations to be explored."""
    # fixed number of nodes
    self.n_nodes = 4
    # choose a list of clusters
    clusters = ['graphene', 'petitprince', 'edel', 'paradent', 'stremi']
    # compute the maximum number of cores among all clusters
    max_core = self.n_nodes * max(
        get_host_attributes(cluster + '-1')['architecture']['smt_size']
        for cluster in clusters)
    # Core counts to sweep: powers of two with at least one core per node
    # and strictly fewer than the total core budget.
    # Fix: the original used `filter(...)`, which on Python 3 is a one-shot
    # iterator — sweep() would see it empty on any second traversal. A list
    # comprehension yields the same values on both Python 2 and 3.
    n_cores = [i for i in takewhile(lambda i: i < max_core,
                                    (2 ** i for i in count(0, 1)))
               if i >= self.n_nodes]
    # define the parameters
    self.parameters = {
        'cluster': clusters,
        'n_core': n_cores,
        'size': ['A', 'B', 'C'],
    }
    logger.info(self.parameters)
    # define the iterator over the parameters combinations
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))
    # Lazy logging args instead of eager %-formatting.
    logger.info('Number of parameters combinations %s',
                len(self.sweeper.get_remaining()))
else: # we can afford more clients # based on our estimation a client sends 200msgs at full rate return cast_ratio_max * params["nbr_servers"] >= params[ "nbr_clients"] * 1000 * params["pause"] #Function to pass in parameter to ParamSweeper.get_next() #Give the illusion that the Set of params is sorted by nbr_clients def sort_params_by_nbr_clients(set): return sorted((list(set)), key=lambda k: k['nbr_clients']) if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) sweeps = sweep(PARAMETERS) sweeper = ParamSweeper( # Maybe puts the sweeper under the experimentation directory # This should be current/sweeps persistence_dir=os.path.join("%s/sweeps" % TEST_DIR), sweeps=sweeps, save_sweeps=True, name="test_case_1") #Get the next parameter in the set of all remaining params #This set is temporary viewed as sorted List with this filter function. params = sweeper.get_next(sort_params_by_nbr_clients) while params: if not accept(params): # skipping element # Note that the semantic of sweeper.skip is different
def run(self):
    """Run the experiment.

    Reserves (or reuses) a Grid'5000 deploy job, deploys the nodes,
    installs and configures OAR (server on nodes[0], workers on the
    rest) unless ``--already-configured``, then replays every workload
    listed in the JSON configuration through ``oar_replay.py``.
    """
    already_configured = self.options.already_configured
    reservation_job_id = int(self.options.reservation_id) \
        if self.options.reservation_id is not None else None
    is_a_test = self.options.is_a_test
    if is_a_test:
        logger.warn('THIS IS A TEST! This run will use only a few '
                    'resources')

    # make the result folder writable for all
    os.chmod(self.result_dir, 0o777)
    # Import configuration
    with open(self.args[0]) as config_file:
        config = json.load(config_file)
    # backup configuration
    copy(self.args[0], self.result_dir)

    site = config["grid5000_site"]
    resources = config["resources"]
    nb_experiment_nodes = config["nb_experiment_nodes"]
    walltime = str(config["walltime"])
    env_name = config["kadeploy_env_name"]
    workloads = config["workloads"]
    # check if workloads exists (Suppose that the same NFS mount point
    # is present on the remote and the local environment
    for workload_file in workloads:
        with open(workload_file):
            pass
        # copy the workloads files to the results dir
        copy(workload_file, self.result_dir)

    # define the workloads parameters
    self.parameters = {'workload_filename': workloads}
    logger.info('Workloads: {}'.format(workloads))

    # define the iterator over the parameters combinations
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))

    # Due to previous (using -c result_dir) run skip some combination
    logger.info('Skipped parameters:' +
                '{}'.format(str(self.sweeper.get_skipped())))
    logger.info('Number of parameters combinations {}'.format(
        str(len(self.sweeper.get_remaining()))))
    logger.info('combinations {}'.format(str(
        self.sweeper.get_remaining())))

    if reservation_job_id is not None:
        # Reuse an existing OAR reservation instead of submitting one.
        jobs = [(reservation_job_id, site)]
    else:
        jobs = oarsub([(OarSubmission(resources=resources,
                                      job_type='deploy',
                                      walltime=walltime), site)])
    job_id, site = jobs[0]
    if job_id:
        try:
            logger.info("waiting job start %s on %s" % (job_id, site))
            wait_oar_job_start(job_id, site,
                               prediction_callback=prediction_callback)
            logger.info("getting nodes of %s on %s" % (job_id, site))
            nodes = get_oar_job_nodes(job_id, site)
            # sort the nodes
            nodes = sorted(nodes, key=lambda node: node.address)
            # get only the necessary nodes under the switch
            if nb_experiment_nodes > len(nodes):
                raise RuntimeError('The number of given node in the '
                                   'reservation ({}) do not match the '
                                   'requested resources '
                                   '({})'.format(len(nodes),
                                                 nb_experiment_nodes))
            nodes = nodes[:nb_experiment_nodes]
            logger.info("deploying nodes: {}".format(str(nodes)))
            deployed, undeployed = deploy(
                Deployment(nodes, env_name=env_name),
                check_deployed_command=already_configured)
            if undeployed:
                logger.warn("NOT deployed nodes: {}".format(
                    str(undeployed)))
                raise RuntimeError('Deployement failed')

            if not already_configured:
                # install OAR
                install_cmd = "apt-get update; apt-get install -y "
                node_packages = "oar-node"
                logger.info("installing OAR nodes: {}".format(
                    str(nodes[1:])))
                # Worker installation runs asynchronously (start) while the
                # server install below runs synchronously (run).
                install_oar_nodes = Remote(
                    install_cmd + node_packages,
                    nodes[1:],
                    connection_params={'user': '******'})
                install_oar_nodes.start()

                server_packages = (
                    "oar-server oar-server-pgsql oar-user "
                    "oar-user-pgsql postgresql python3-pip "
                    "libjson-perl postgresql-server-dev-all")
                # Builds oar3 from a pinned git revision and links the
                # kamelot scheduler into OAR's scheduler directory.
                install_oar_sched_cmd = """
                mkdir -p /opt/oar_sched; \
                cd /opt/oar_sched; \
                git clone https://github.com/oar-team/oar3.git; \
                cd oar3; \
                git checkout dce942bebc2; \
                pip3 install -e .; \
                cd /usr/lib/oar/schedulers; \
                ln -s /usr/local/bin/kamelot; \
                pip3 install psycopg2
                """
                logger.info("installing OAR server node: {}".format(
                    str(nodes[0])))
                install_master = SshProcess(
                    install_cmd + server_packages + ";" +
                    install_oar_sched_cmd, nodes[0],
                    connection_params={'user': '******'})
                install_master.run()
                install_oar_nodes.wait()

                if not install_master.ok:
                    Report(install_master)

                # Rewrite /etc/oar/oar.conf in place: Postgres backend,
                # local server, verbose logging into result_dir, cpuset
                # management enabled, finaud disabled.
                configure_oar_cmd = """
                sed -i \
                    -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                    -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                    -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                    -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                    -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                    -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                    -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                    -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                    /etc/oar/oar.conf
                """.format(result_dir=self.result_dir)
                configure_oar = Remote(configure_oar_cmd, nodes,
                                       connection_params={'user': '******'})
                configure_oar.run()
                logger.info("OAR is configured on all nodes")

                # Configure server
                create_db = "oar-database --create --db-is-local"
                config_oar_sched = (
                    "oarnotify --remove-queue default;"
                    "oarnotify --add-queue default,1,kamelot")
                start_oar = "systemctl start oar-server.service"
                logger.info("configuring OAR database: {}".format(
                    str(nodes[0])))
                config_master = SshProcess(
                    create_db + ";" + config_oar_sched + ";" + start_oar,
                    nodes[0],
                    connection_params={'user': '******'})
                config_master.run()

                # propagate SSH keys
                logger.info("configuring OAR SSH")
                oar_key = "/tmp/.ssh"
                Process('rm -rf ' + oar_key).run()
                # Copy the oar user's SSH directory from the server to a
                # local staging dir, then push it to every worker.
                Process(
                    'scp -o BatchMode=yes -o PasswordAuthentication=no '
                    '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                    '-o ConnectTimeout=20 -rp -o User=root ' +
                    nodes[0].address + ":/var/lib/oar/.ssh"
                    ' ' + oar_key).run()
                # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                Put(nodes[1:], [oar_key], "/var/lib/oar/",
                    connection_params={
                        'user': '******'
                    }).run()

                # Declare properties then register each worker as an OAR
                # resource (host/cpu/core/cpuset/mem are hard-coded).
                add_resources_cmd = """
                oarproperty -a cpu || true; \
                oarproperty -a core || true; \
                oarproperty -c -a host || true; \
                oarproperty -a mem || true; \
                """
                for node in nodes[1:]:
                    add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(
                        node=node.address)

                add_resources = SshProcess(
                    add_resources_cmd, nodes[0],
                    connection_params={'user': '******'})
                add_resources.run()

                if add_resources.ok:
                    logger.info("oar is now configured!")
                else:
                    raise RuntimeError(
                        "error in the OAR configuration: Abort!")

            # TODO backup de la config de OAR

            # Do the replay
            logger.info('begining the replay')
            while len(self.sweeper.get_remaining()) > 0:
                combi = self.sweeper.get_next()
                workload_file = os.path.basename(
                    combi['workload_filename'])
                oar_replay = SshProcess(
                    script_path + "/oar_replay.py " +
                    combi['workload_filename'] + " " + self.result_dir +
                    " oar_gant_" + workload_file,
                    nodes[0])
                oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                  workload_file + '.out')
                logger.info("replaying workload: {}".format(combi))
                oar_replay.run()
                if oar_replay.ok:
                    logger.info("Replay workload OK: {}".format(combi))
                    self.sweeper.done(combi)
                else:
                    logger.info("Replay workload NOT OK: {}".format(combi))
                    self.sweeper.cancel(combi)
                    raise RuntimeError("error in the OAR replay: Abort!")
        except:
            # NOTE(review): bare except drops into the debugger on any
            # error (including KeyboardInterrupt) — deliberate here, but
            # worth confirming for unattended runs.
            traceback.print_exc()
            ipdb.set_trace()
        finally:
            if is_a_test:
                ipdb.set_trace()
            if reservation_job_id is None:
                # Only delete jobs this run submitted itself.
                logger.info("delete job: {}".format(jobs))
                oardel(jobs)
def run(self):
    """ run method from engine in order to do our workflow

    Benchmarks upload/download times against cloud storage providers
    (Amazon S3, Dropbox, Google Drive), writing one line per transfer to
    a per-round results file and inserting a document into MongoDB.
    NOTE(review): uses `long`, so this is Python 2 code.
    """
    mongo = ClientMongo()
    size = dict  # NOTE(review): placeholder (the *type* dict); always reassigned below
    if not self.options.file:
        if not self.options.only:
            # Sample sizes at 25% steps of the requested maximum.
            size = {
                1,
                long(self.options.size * 0.25),
                long(self.options.size * 0.5),
                long(self.options.size * 0.75),
                long(self.options.size)
            }
        else:
            size = {long(self.options.size)}
    else:
        if self.OnlyDownload:
            size = getFilSize(self.options.file)
        else:
            size = {0}
    drive = None
    if self.options.drive:
        drive = self.options.drive
    else:
        drive = self.drive
    interface = ['rest', 'sdk']
    parameters = {
        'size': size,
        'if': interface,
        'drive': drive,
        'transfert': self.transfert
    }
    p = None
    for n in range(0, int(self.options.ntest), 1):
        logger.info('---------------------')
        logger.info('Round %i', n + 1)
        combs = sweep(parameters)
        # Each round gets its own timestamped results directory/sweeper.
        date = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        pathResults = os.getcwd() + '/Results/Bench' + date
        sweeper = ParamSweeper(pathResults + "/sweeps", combs)
        f = open(pathResults + '/results.txt', 'w')
        while len(sweeper.get_remaining()) > 0:
            # sort the parameters
            for i in interface:
                for dr in drive:
                    for s in size:
                        # Pick the pending combination matching this
                        # (interface, drive, size) triple, if any.
                        comb = sweeper.get_next(filtr=lambda r: filter(
                            lambda x: x['drive'] == dr and x['size'] == s
                            and x['if'] == i, r))
                        if not comb:
                            continue
                        # start of the workflow
                        if comb['drive'] == 'amazon':
                            p = providerS3.ProviderS3()
                        elif comb['drive'] == 'dropbox':
                            p = providerDB.ProviderDB()
                        else:
                            p = providerGD.ProviderGD()
                        logger.info('Treating combination %s', pformat(comb))
                        comb_dir = pathResults + '/' + slugify(comb)
                        if not os.path.isdir(comb_dir):
                            os.mkdir(comb_dir)
                        if not self.options.file:
                            fname = self.create_file(comb['size'])
                        else:
                            fname = self.options.file
                        timer = Timer()
                        up_time = 0
                        dl_time = 0
                        start_date = datetime.datetime.now()
                        if comb['if'] == 'sdk':
                            if p.provider_name == "amazon":
                                # AMAZON
                                clientAmz = p.getConnexion()
                                if self.OnlyDownload:
                                    p.bucketKey += fname
                                else:
                                    p.bucketKey += '/' + fname
                                if comb['transfert'] == "upload" or comb[
                                        'transfert'] == 'upDown':
                                    p.upload_file_sdk(
                                        clientAmz.get_bucket(p.bucketName),
                                        p.bucketKey, fname)
                                    up_time = timer.elapsed()
                                if comb['transfert'] == "download" or comb[
                                        'transfert'] == 'upDown':
                                    p.download_file_sdk(
                                        clientAmz.get_bucket(p.bucketName),
                                        p.bucketKey,
                                        comb_dir + '/' + fname.split('/')[-1])
                                    dl_time = timer.elapsed() - up_time
                                if not self.OnlyDownload:
                                    p.delete_file_sdk(
                                        clientAmz.get_bucket(p.bucketName),
                                        p.bucketKey)
                            elif p.provider_name == "dropbox":
                                # DROPBOX
                                client = p.getToken()
                                if comb['transfert'] == "upload" or comb[
                                        'transfert'] == 'upDown':
                                    p.upload_file_sdk(
                                        client, fname, fname.split('/')[-1])
                                    up_time = timer.elapsed()
                                if comb['transfert'] == "download" or comb[
                                        'transfert'] == 'upDown':
                                    p.download_file_sdk(
                                        client, fname.split('/')[-1],
                                        comb_dir + '/' + fname.split('/')[-1])
                                    dl_time = timer.elapsed() - up_time
                                if not self.OnlyDownload:
                                    p.delete_file(client, fname.split('/')[-1])
                            elif p.provider_name == "googledrive":
                                # GOOGLEDRIVE
                                drive_service = p.getConnexion()
                                new_file = None
                                if comb['transfert'] == 'upload' or comb[
                                        'transfert'] == 'upDown':
                                    new_file = p.upload_file_sdk(
                                        drive_service, fname,
                                        fname.split('/')[-1], 'text/plain')
                                    up_time = timer.elapsed()
                                if comb['transfert'] == 'download' or comb[
                                        'transfert'] == 'upDown':
                                    p.download_file_sdk(
                                        drive_service, new_file,
                                        comb_dir + '/' + fname.split('/')[-1])
                                    dl_time = timer.elapsed() - up_time
                                if not self.OnlyDownload:
                                    p.delete_file_sdk(
                                        drive_service, new_file['id'])
                            sweeper.done(comb)
                        elif comb['if'] == 'rest':
                            logger.warning(
                                'REST interface not implemented')
                            sweeper.skip(comb)
                            if not self.OnlyDownload:
                                # logger.info('delete de '+fname)
                                if os.path.isfile(fname):
                                    os.remove(fname)
                                # delete only if rest is implmented
                                # os.remove(comb_dir + '/' + fname.split('/')[-1])
                            continue
                        if comb['transfert'] == "upload" or comb[
                                'transfert'] == "upDown":
                            f.write("%s %s %s %s %s %s %s %f %i %s %f\n" %
                                    (self.localisation['ip'],
                                     self.localisation['lat'],
                                     self.localisation['lon'],
                                     self.localisation['city'],
                                     self.localisation['country'],
                                     comb['drive'], comb['if'],
                                     timer.start_date(), comb['size'],
                                     "upload", up_time))
                            mongo.collection.insert({
                                'ip': self.localisation['ip'],
                                'latitude': self.localisation['lat'],
                                'longitude': self.localisation['lon'],
                                'city': self.localisation['city'],
                                'country': self.localisation['country'],
                                'drive': comb['drive'],
                                'interface': comb['if'],
                                'start_date': start_date,
                                'size': comb['size'],
                                'transfert': 'upload',
                                'time': up_time
                            })
                        if comb['transfert'] == "download" or comb[
                                'transfert'] == "upDown":
                            f.write("%s %s %s %s %s %s %s %f %i %s %f\n" %
                                    (self.localisation['ip'],
                                     self.localisation['lat'],
                                     self.localisation['lon'],
                                     self.localisation['city'],
                                     self.localisation['country'],
                                     comb['drive'], comb['if'],
                                     timer.start_date(), comb['size'],
                                     "download", dl_time))
                            mongo.collection.insert({
                                'ip': self.localisation['ip'],
                                'latitude': self.localisation['lat'],
                                'longitude': self.localisation['lon'],
                                'city': self.localisation['city'],
                                'country': self.localisation['country'],
                                'drive': comb['drive'],
                                'interface': comb['if'],
                                'start_date': start_date,
                                'size': comb['size'],
                                'transfert': 'download',
                                'time': dl_time
                            })
                        if not self.OnlyDownload:
                            # logger.info('delete de '+fname)
                            if os.path.isfile(fname):
                                os.remove(fname)
                            if os.path.isfile(comb_dir + '/' + fname):
                                os.remove(comb_dir + '/' +
                                          fname.split('/')[-1])
        f.close()
    # delete the Bench Folder
    os.rmdir(self.result_dir)
    logger.info("---------------------------------------")
    for t in check_Exp_database(self.options, self.localisation)['result']:
        logger.info(t)
def run(self):
    """Run the scalability experiment.

    For each n_clients value, reserves enough Grid'5000 nodes, launches
    the active-data server on the first node and TaktukRemote client
    processes on the remaining cores, then collects logs per
    combination. NOTE(review): Python 2 code (`print ""` statement).
    """
    # Defining experiment parameters
    self.parameters = {
        'n_clients': [400, 450, 500, 550, 600],
        'n_transitions': [10000]
    }
    cluster = 'griffon'
    sweeps = sweep(self.parameters)
    sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
    server_out_path = os.path.join(self.result_dir, "server.out")

    self._updateStat(sweeper.stats())

    # Loop on the number of nodes
    while True:
        # Taking the next parameter combinations
        comb = sweeper.get_next()
        if not comb:
            break

        # Performing the submission on G5K
        site = get_cluster_site(cluster)
        self._log("Output will go to " + self.result_dir)

        # One extra node hosts the server; clients are packed per core.
        n_nodes = int(math.ceil(
            float(comb['n_clients']) /
            EX5.get_host_attributes(cluster + '-1')['architecture']['smt_size'])) + 1
        self._log("Reserving {0} nodes on {1}".format(n_nodes, site))

        resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes)
        submission = EX5.OarSubmission(resources = resources,
                                       job_type = 'allow_classic_ssh',
                                       walltime ='00:10:00')

        job = EX5.oarsub([(submission, site)])
        self.__class__._job = job

        # Sometimes oarsub fails silently
        if job[0][0] is None:
            print("\nError: no job was created")
            sys.exit(1)

        # Wait for the job to start
        self._log("Waiting for job {0} to start...\n".format(
            BOLD_MAGENTA + str(job[0][0]) + NORMAL))
        EX5.wait_oar_job_start(job[0][0], job[0][1],
                               prediction_callback = prediction)
        nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1])

        # Deploying nodes
        #deployment = EX5.Deployment(hosts = nodes, env_file='path_to_env_file')
        #run_deploy = EX5.deploy(deployment)
        #nodes_deployed = run_deploy.hosts[0]

        # Copying active_data program on all deployed hosts
        EX.Put([nodes[0]], '../dist/active-data-lib-0.1.2.jar',
               connexion_params = {'user': '******'}).run()
        EX.Put([nodes[0]], '../server.policy',
               connexion_params = {'user': '******'}).run()

        # Loop on the number of requests per client process
        while True:
            # Split the nodes
            clients = nodes[1:]
            server = nodes[0]

            self._log("Running experiment with {0} nodes and {1} "
                      "transitions per client".format(len(clients),
                                                      comb['n_transitions']))

            # Launching Server on one node
            out_handler = FileOutputHandler(server_out_path)
            launch_server = EX.Remote('java -jar active-data-lib-0.1.2.jar',
                                      [server],
                                      stdout_handler = out_handler,
                                      stderr_handler = out_handler).start()
            self._log("Server started on " + server.address)
            time.sleep(2)

            # Launching clients
            rank=0  # NOTE(review): assigned but never used
            n_cores = EX5.get_host_attributes(
                clients[0])['architecture']['smt_size'];
            # NOTE(review): `nodes * n_cores` replicates the full node list
            # (server included) — confirm the server node is meant to host
            # client processes too.
            cores = nodes * n_cores
            cores = cores[0:comb['n_clients']] # Cut out the additional cores

            client_connection_params = {
                'taktuk_gateway': 'lyon.grid5000.fr',
                'host_rewrite_func': None
            }

            self._log("Launching {0} clients...".format(len(cores)))

            client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \
                "{0} {1} {2} {3} {4}".format(server.address, 1200,
                                             "{{range(len(cores))}}",
                                             len(cores),
                                             comb['n_transitions'])
            client_out_handler = FileOutputHandler(
                os.path.join(self.result_dir, "clients.out"))
            client_request = EX.TaktukRemote(client_cmd, cores,
                                             connexion_params = client_connection_params, \
                                             stdout_handler = client_out_handler,
                                             stderr_handler = client_out_handler)

            client_request.run()

            if not client_request.ok():
                # Some client failed, please panic
                self._log("One or more client process failed. "
                          "Enjoy reading their outputs.")
                self._log("OUTPUT STARTS -------------------------------------------------\n")
                for process in client_request.processes():
                    print("----- {0} returned {1}".format(
                        process.host().address, process.exit_code()))
                    if not process.stdout() == "":
                        print(GREEN + process.stdout() + NORMAL)
                    if not process.stderr() == "":
                        print(RED + process.stderr() + NORMAL)
                    print("")
                self._log("OUTPUT ENDS ---------------------------------------------------\n")
                sweeper.skip(comb)
                launch_server.kill()
                launch_server.wait()
            else:
                # Waiting for server to end
                launch_server.wait()

                # Getting log files
                distant_path = OUT_FILE_FORMAT.format(
                    len(cores), comb['n_transitions'])
                local_path = distant_path

                EX.Get([server], distant_path).run()
                EX.Local('mv ' + distant_path + ' ' +
                         os.path.join(self.result_dir, local_path)).run()

                EX.Get([server], 'client_*.out',
                       local_location = self.result_dir)
                EX.Remote('rm -f client_*.out', [server])

                self._log("Finishing experiment with {0} clients and {1} "
                          "transitions per client".format(
                              comb['n_clients'], comb['n_transitions']))
                sweeper.done(comb)

            # Continue with another combination sharing the same n_clients
            # (so the current reservation can be reused).
            sub_comb = sweeper.get_next (filtr =
                lambda r: filter(lambda s: s["n_clients"] == comb['n_clients'], r))
            self._updateStat(sweeper.stats())

            if not sub_comb:
                # Killing job
                EX5.oar.oardel(job)
                self.__class__._job = None
                break
            else:
                comb = sub_comb

    print ""
LOG.info(f"Running bench with {parameter} on {env['roles']}") @enostask() def backup(env=None): LOG.info(f"Running backup on {env['roles']}") @enostask() def destroy(env=None): LOG.info(f"Running destroy on {env['roles']}") # Iterate over a set of parameters parameters = {"param1": [1, 4], "param2": ["a", "b"]} sweeps = sweep(parameters) sweeper = ParamSweeper( persistence_dir=str(Path("sweeps")), sweeps=sweeps, save_sweeps=True ) parameter = sweeper.get_next() while parameter: try: deploy() bench(parameter) backup() sweeper.done(parameter) except Exception as e: traceback.print_exc() sweeper.skip(parameter) finally: destroy()
def campaign(broker, provider, conf, test, env):
    """Run the benchmark campaign named *test*.

    Loads the parameter space from the YAML *conf*, filters out
    combinations rejected by ``accept``, provisions the infrastructure
    through ``PROVIDERS[provider]`` and replays ``t.test_case_1`` for
    every remaining combination (smallest ``nbr_clients`` first).
    """
    # Build a filesystem-safe identifier from the sorted params.
    def generate_id(params):
        def clean(s):
            return str(s).replace("/", "_sl_") \
                .replace(":", "_sc_")

        return "-".join([
            "%s__%s" % (clean(k), clean(v))
            for k, v in sorted(params.items())
        ])

    # Heuristic client/server ratio filter for a combination.
    def accept(params):
        call_ratio_max = 3
        cast_ratio_max = 3
        call_type = params["call_type"]
        if params["nbr_servers"] > params["nbr_clients"]:
            return False
        if call_type == "rpc-call":
            if not params["pause"]:
                # maximum rate
                return call_ratio_max * params["nbr_servers"] >= params[
                    "nbr_clients"]
            else:
                # we can afford more clients
                # based on our estimation a client sends 200msgs at full rate
                return call_ratio_max * params["nbr_servers"] >= params[
                    "nbr_clients"] * 200 * params["pause"]
        else:
            if not params["pause"]:
                # maximum rate
                return cast_ratio_max * params["nbr_servers"] >= params[
                    "nbr_clients"]
            else:
                # we can afford more clients
                # based on our estimation a client sends 200msgs at full rate
                return cast_ratio_max * params["nbr_servers"] >= params[
                    "nbr_clients"] * 1000 * params["pause"]

    # Function to pass in parameter to ParamSweeper.get_next()
    # Give the illusion that the Set of params is sorted by nbr_clients
    def sort_params_by_nbr_clients(set):
        return sorted((list(set)), key=lambda k: k['nbr_clients'])

    # Dump each params in the backup dir
    def dump_param(params):
        if not os.path.exists("%s/params.json" % test):
            with open("%s/params.json" % test, 'w') as outfile:
                json.dump([], outfile)
        # Add the current params to the json
        with open("%s/params.json" % test, 'r') as outfile:
            all_params = json.load(outfile)
        all_params.append(params)
        with open("%s/params.json" % test, 'w') as outfile:
            json.dump(all_params, outfile)

    # Loading the conf
    config = {}
    with open(conf) as f:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # (PyYAML >= 5.1) and unsafe on untrusted input — confirm whether
        # yaml.safe_load would work for this config.
        config = yaml.load(f)
    parameters = config["campaign"][test]
    sweeps = sweep(parameters)
    filtered_sweeps = [param for param in sweeps if accept(param)]
    sweeper = ParamSweeper(
        # Maybe puts the sweeper under the experimentation directory
        # This should be current/sweeps
        persistence_dir=os.path.join("%s/sweeps" % test),
        sweeps=filtered_sweeps,
        save_sweeps=True,
        name=test)
    params = sweeper.get_next(sort_params_by_nbr_clients)
    PROVIDERS[provider](broker=broker, config=config, env=test)
    t.inventory()
    while params:
        # Refresh the backup_dir for this combination before running.
        params.pop("backup_dir", None)
        params.update({"backup_dir": generate_id(params)})
        t.prepare(broker=broker)
        t.test_case_1(**params)
        sweeper.done(params)
        dump_param(params)
        params = sweeper.get_next(sort_params_by_nbr_clients)
    t.destroy()
def run(self):
    """Run the experiment.

    Reads a JSON configuration file (first positional argument),
    reserves and deploys Grid'5000 nodes, installs and configures an
    OAR cluster on them (skipped with --already-configured), then
    replays every workload file through oar_replay.py, one sweeper
    combination at a time.  Results and logs go to self.result_dir.
    """
    already_configured = self.options.already_configured
    reservation_job_id = int(self.options.reservation_id) \
        if self.options.reservation_id is not None else None
    is_a_test = self.options.is_a_test

    if is_a_test:
        logger.warn('THIS IS A TEST! This run will use only a few '
                    'resources')

    # make the result folder writable for all
    os.chmod(self.result_dir, 0o777)
    # Import configuration
    with open(self.args[0]) as config_file:
        config = json.load(config_file)
    # backup configuration
    copy(self.args[0], self.result_dir)

    site = config["grid5000_site"]
    resources = config["resources"]
    nb_experiment_nodes = config["nb_experiment_nodes"]
    walltime = str(config["walltime"])
    env_name = config["kadeploy_env_name"]
    workloads = config["workloads"]
    # check if workloads exists (Suppose that the same NFS mount point
    # is present on the remote and the local environment
    for workload_file in workloads:
        # opening the file is the existence check; a missing workload
        # raises here, before any reservation is made
        with open(workload_file):
            pass
        # copy the workloads files to the results dir
        copy(workload_file, self.result_dir)

    # define the workloads parameters
    self.parameters = {
        'workload_filename': workloads
    }
    logger.info('Workloads: {}'.format(workloads))

    # define the iterator over the parameters combinations
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))

    # Due to previous (using -c result_dir) run skip some combination
    logger.info('Skipped parameters:' +
                '{}'.format(str(self.sweeper.get_skipped())))
    logger.info('Number of parameters combinations {}'.format(
        str(len(self.sweeper.get_remaining()))))
    logger.info('combinations {}'.format(
        str(self.sweeper.get_remaining())))

    if reservation_job_id is not None:
        # reuse an existing reservation instead of submitting a new one
        jobs = [(reservation_job_id, site)]
    else:
        jobs = oarsub([(OarSubmission(resources=resources,
                                      job_type='deploy',
                                      walltime=walltime), site)])
    job_id, site = jobs[0]
    if job_id:
        try:
            logger.info("waiting job start %s on %s" % (job_id, site))
            wait_oar_job_start(
                job_id, site, prediction_callback=prediction_callback)
            logger.info("getting nodes of %s on %s" % (job_id, site))
            nodes = get_oar_job_nodes(job_id, site)
            # sort the nodes
            nodes = sorted(nodes, key=lambda node: node.address)
            # get only the necessary nodes under the switch
            if nb_experiment_nodes > len(nodes):
                raise RuntimeError('The number of given node in the '
                                   'reservation ({}) do not match the '
                                   'requested resources '
                                   '({})'.format(len(nodes),
                                                 nb_experiment_nodes))
            nodes = nodes[:nb_experiment_nodes]
            logger.info("deploying nodes: {}".format(str(nodes)))
            deployed, undeployed = deploy(
                Deployment(nodes, env_name=env_name),
                check_deployed_command=already_configured)
            if undeployed:
                logger.warn(
                    "NOT deployed nodes: {}".format(str(undeployed)))
                raise RuntimeError('Deployement failed')

            if not already_configured:
                # install OAR
                # convention below: nodes[0] is the OAR server,
                # nodes[1:] are the compute nodes
                install_cmd = "apt-get update; apt-get install -y "
                node_packages = "oar-node"
                logger.info(
                    "installing OAR nodes: {}".format(str(nodes[1:])))
                # started asynchronously; waited on after the server install
                install_oar_nodes = Remote(
                    install_cmd + node_packages,
                    nodes[1:],
                    connection_params={'user': '******'})
                install_oar_nodes.start()

                server_packages = ("oar-server oar-server-pgsql oar-user "
                                   "oar-user-pgsql postgresql python3-pip "
                                   "libjson-perl postgresql-server-dev-all")
                # install the oar3 kamelot scheduler from a pinned commit
                install_oar_sched_cmd = """
                mkdir -p /opt/oar_sched; \
                cd /opt/oar_sched; \
                git clone https://github.com/oar-team/oar3.git; \
                cd oar3; \
                git checkout dce942bebc2; \
                pip3 install -e .; \
                cd /usr/lib/oar/schedulers; \
                ln -s /usr/local/bin/kamelot; \
                pip3 install psycopg2
                """
                logger.info("installing OAR server node: {}".format(str(nodes[0])))
                install_master = SshProcess(install_cmd + server_packages +
                                            ";" + install_oar_sched_cmd,
                                            nodes[0],
                                            connection_params={'user': '******'})
                install_master.run()
                install_oar_nodes.wait()

                if not install_master.ok:
                    # NOTE(review): failure is only reported, not fatal —
                    # presumably deliberate best-effort; confirm
                    Report(install_master)

                # rewrite /etc/oar/oar.conf in place: local Pg database,
                # local server, log into result_dir, disable the cpuset
                # DB field, enable CPUSET_PATH, disable finaud
                configure_oar_cmd = """
                sed -i \
                    -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                    -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                    -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                    -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                    -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                    -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                    -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                    -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                    /etc/oar/oar.conf
                """.format(result_dir=self.result_dir)
                configure_oar = Remote(configure_oar_cmd, nodes,
                                       connection_params={'user': '******'})
                configure_oar.run()
                logger.info("OAR is configured on all nodes")

                # Configure server
                create_db = "oar-database --create --db-is-local"
                # replace the default queue with one served by kamelot
                config_oar_sched = ("oarnotify --remove-queue default;"
                                    "oarnotify --add-queue default,1,kamelot")
                start_oar = "systemctl start oar-server.service"
                logger.info(
                    "configuring OAR database: {}".format(str(nodes[0])))
                config_master = SshProcess(create_db + ";" + config_oar_sched + ";" + start_oar,
                                           nodes[0],
                                           connection_params={'user': '******'})
                config_master.run()

                # propagate SSH keys
                # copy the oar user's keys from the server to every node
                # via a local staging directory
                logger.info("configuring OAR SSH")
                oar_key = "/tmp/.ssh"
                Process('rm -rf ' + oar_key).run()
                Process('scp -o BatchMode=yes -o PasswordAuthentication=no '
                        '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                        '-o ConnectTimeout=20 -rp -o User=root ' +
                        nodes[0].address + ":/var/lib/oar/.ssh"
                        ' ' + oar_key).run()
                # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                Put(nodes[1:], [oar_key], "/var/lib/oar/", connection_params={'user': '******'}).run()

                # declare OAR properties, then register each compute node
                # with a fixed fake topology (1 cpu / 4 cores / 16 mem)
                add_resources_cmd = """
                oarproperty -a cpu || true; \
                oarproperty -a core || true; \
                oarproperty -c -a host || true; \
                oarproperty -a mem || true; \
                """
                for node in nodes[1:]:
                    add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(node=node.address)

                add_resources = SshProcess(add_resources_cmd,
                                           nodes[0],
                                           connection_params={'user': '******'})
                add_resources.run()

                if add_resources.ok:
                    logger.info("oar is now configured!")
                else:
                    raise RuntimeError(
                        "error in the OAR configuration: Abort!")

            # TODO: back up the OAR configuration

            # Do the replay
            logger.info('begining the replay')
            while len(self.sweeper.get_remaining()) > 0:
                combi = self.sweeper.get_next()
                workload_file = os.path.basename(combi['workload_filename'])
                oar_replay = SshProcess(script_path + "/oar_replay.py " +
                                        combi['workload_filename'] + " " +
                                        self.result_dir + " oar_gant_" +
                                        workload_file,
                                        nodes[0])
                oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                  workload_file + '.out')
                logger.info("replaying workload: {}".format(combi))
                oar_replay.run()
                if oar_replay.ok:
                    logger.info("Replay workload OK: {}".format(combi))
                    self.sweeper.done(combi)
                else:
                    logger.info("Replay workload NOT OK: {}".format(combi))
                    # cancel puts the combination back for a later run
                    self.sweeper.cancel(combi)
                    raise RuntimeError("error in the OAR replay: Abort!")

        # NOTE(review): bare except + ipdb is a deliberate interactive
        # debugging hook for this experiment engine
        except:
            traceback.print_exc()
            ipdb.set_trace()
        finally:
            if is_a_test:
                ipdb.set_trace()
            if reservation_job_id is None:
                # only delete jobs we submitted ourselves, never a
                # reservation passed in by the user
                logger.info("delete job: {}".format(jobs))
                oardel(jobs)
def run(self):
    """Run the active-data scaling experiment.

    Sweeps over (n_clients, n_transitions) combinations: for each
    n_clients value it reserves enough Grid'5000 nodes on the 'griffon'
    cluster, copies the benchmark jar, starts one server and TaktukRemote
    client processes (one per core), then collects the output files into
    self.result_dir.  Progress is persisted with a ParamSweeper.
    """
    # Defining experiment parameters
    self.parameters = {
        'n_clients': [400, 450, 500, 550, 600],
        'n_transitions': [10000]
    }
    cluster = 'griffon'
    sweeps = sweep(self.parameters)
    sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
    server_out_path = os.path.join(self.result_dir, "server.out")

    self._updateStat(sweeper.stats())

    # Loop on the number of nodes
    while True:
        # Taking the next parameter combinations
        comb = sweeper.get_next()
        if not comb:
            break

        # Performing the submission on G5K
        site = get_cluster_site(cluster)
        self._log("Output will go to " + self.result_dir)

        # one core per client, plus one extra node for the server
        n_nodes = int(
            math.ceil(
                float(comb['n_clients']) / EX5.get_host_attributes(
                    cluster + '-1')['architecture']['smt_size'])) + 1
        self._log("Reserving {0} nodes on {1}".format(n_nodes, site))

        resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes)
        submission = EX5.OarSubmission(resources=resources,
                                       job_type='allow_classic_ssh',
                                       walltime='00:10:00')

        job = EX5.oarsub([(submission, site)])
        self.__class__._job = job

        # Sometimes oarsub fails silently
        if job[0][0] is None:
            print("\nError: no job was created")
            sys.exit(1)

        # Wait for the job to start
        self._log(
            "Waiting for job {0} to start...\n".format(BOLD_MAGENTA +
                                                       str(job[0][0]) +
                                                       NORMAL))
        EX5.wait_oar_job_start(job[0][0], job[0][1],
                               prediction_callback=prediction)
        nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1])

        # Deploying nodes
        #deployment = EX5.Deployment(hosts = nodes, env_file='path_to_env_file')
        #run_deploy = EX5.deploy(deployment)
        #nodes_deployed = run_deploy.hosts[0]

        # Copying active_data program on all deployed hosts
        EX.Put([nodes[0]], '../dist/active-data-lib-0.1.2.jar',
               connexion_params={'user': '******'}).run()
        EX.Put([nodes[0]], '../server.policy',
               connexion_params={'user': '******'}).run()

        # Loop on the number of requests per client process
        while True:
            # Split the nodes: first node is the server, the rest run
            # the client processes
            clients = nodes[1:]
            server = nodes[0]

            self._log(
                "Running experiment with {0} nodes and {1} transitions per client"
                .format(len(clients), comb['n_transitions']))

            # Launching Server on one node
            out_handler = FileOutputHandler(server_out_path)
            launch_server = EX.Remote(
                'java -jar active-data-lib-0.1.2.jar', [server],
                stdout_handler=out_handler,
                stderr_handler=out_handler).start()
            self._log("Server started on " + server.address)
            # give the server a moment to bind before clients connect
            time.sleep(2)

            # Launching clients
            n_cores = EX5.get_host_attributes(
                clients[0])['architecture']['smt_size']
            # NOTE(review): built from `nodes` (server included), not
            # `clients` — looks intentional to reach n_clients cores,
            # but verify the server node should also host clients
            cores = nodes * n_cores
            cores = cores[
                0:comb['n_clients']]  # Cut out the additional cores

            client_connection_params = {
                'taktuk_gateway': 'lyon.grid5000.fr',
                'host_rewrite_func': None
            }

            self._log("Launching {0} clients...".format(len(cores)))

            client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \
                "{0} {1} {2} {3} {4}".format(server.address, 1200, "{{range(len(cores))}}", len(cores), comb['n_transitions'])
            client_out_handler = FileOutputHandler(
                os.path.join(self.result_dir, "clients.out"))
            client_request = EX.TaktukRemote(client_cmd, cores, connexion_params = client_connection_params, \
                stdout_handler = client_out_handler, stderr_handler = client_out_handler)
            client_request.run()

            if not client_request.ok():
                # Some client failed, please panic
                self._log(
                    "One or more client process failed. Enjoy reading their outputs."
                )
                self._log(
                    "OUTPUT STARTS -------------------------------------------------\n"
                )
                for process in client_request.processes():
                    print("----- {0} returned {1}".format(
                        process.host().address, process.exit_code()))
                    if not process.stdout() == "":
                        print(GREEN + process.stdout() + NORMAL)
                    if not process.stderr() == "":
                        print(RED + process.stderr() + NORMAL)
                    print("")
                self._log(
                    "OUTPUT ENDS ---------------------------------------------------\n"
                )
                sweeper.skip(comb)
                launch_server.kill()
                launch_server.wait()
            else:
                # Waiting for server to end
                launch_server.wait()

                # Getting log files
                distant_path = OUT_FILE_FORMAT.format(
                    len(cores), comb['n_transitions'])
                local_path = distant_path

                EX.Get([server], distant_path).run()
                EX.Local('mv ' + distant_path + ' ' +
                         os.path.join(self.result_dir, local_path)).run()

                EX.Get([server], 'client_*.out',
                       local_location=self.result_dir)
                EX.Remote('rm -f client_*.out', [server])

                self._log(
                    "Finishing experiment with {0} clients and {1} transitions per client"
                    .format(comb['n_clients'], comb['n_transitions']))
                sweeper.done(comb)

            # next combination with the same n_clients, so the current
            # reservation can be reused
            sub_comb = sweeper.get_next(filtr=lambda r: filter(
                lambda s: s["n_clients"] == comb['n_clients'], r))
            self._updateStat(sweeper.stats())

            if not sub_comb:
                # Killing job
                EX5.oar.oardel(job)
                self.__class__._job = None
                break
            else:
                comb = sub_comb

    # fixed: was the Python 2 statement `print ""`, a syntax error in
    # Python 3 while the rest of this method uses print() calls
    print("")