def run_xp(self):
    """Iterate over the parameters and execute the bench"""
    while len(self.sweeper.get_remaining()) > 0:
        comb = self.sweeper.get_next()
        if comb['n_core'] > get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size'] * self.n_nodes:
            self.sweeper.skip(comb)
            continue
        logger.info('Processing new combination %s' % (comb,))
        site = get_cluster_site(comb['cluster'])
        jobs = oarsub([(OarSubmission(resources="{cluster='" + comb['cluster'] + "'}/nodes=" + str(self.n_nodes),
                                      job_type='allow_classic_ssh',
                                      walltime='0:10:00'),
                        site)])
        if jobs[0][0]:
            try:
                wait_oar_job_start(*jobs[0])
                nodes = get_oar_job_nodes(*jobs[0])
                bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                    ",".join([node.address for node in nodes]),
                    comb['n_core'],
                    get_mpi_opts(comb['cluster']),
                    comb['size'],
                    comb['n_core'])
                lu_bench = SshProcess(bench_cmd, nodes[0])
                lu_bench.stdout_handlers.append(self.result_dir + '/' + slugify(comb) + '.out')
                lu_bench.run()
                if lu_bench.ok:
                    logger.info("comb ok: %s" % (comb,))
                    self.sweeper.done(comb)
                    continue
            finally:
                oardel(jobs)
        logger.info("comb NOT ok: %s" % (comb,))
        self.sweeper.cancel(comb)
def get_cores_hosts(hosts):
    """Get the number of cores of a list of given hosts

    Parameters
    ----------
    hosts: list
        a list of hosts

    Returns
    -------
    dict
        key: str, name of host (e.g. econome-8.nantes.grid5000.fr)
        value: int, the number of cores
    """
    n_cores_hosts = dict()
    for host in hosts:
        host_name = host.split('.')[0]
        try:
            n_cores_hosts[host] = get_host_attributes(host_name)['architecture']['nb_cores']
            logger.info('Number of cores of [%s] = %s' % (host_name, n_cores_hosts[host]))
        except Exception as e:
            logger.error('Cannot get number of cores from host [%s]' % host_name)
            logger.error('Exception: %s' % e, exc_info=True)
    return n_cores_hosts
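# Hedged usage sketch (added for illustration, not part of the original snippet):
# one way get_cores_hosts above could be driven. The host names are hypothetical,
# and it assumes execo_g5k.get_host_attributes and the module's logger are set up
# as in the original file (e.g. when run from a Grid'5000 frontend).
example_hosts = ['econome-8.nantes.grid5000.fr', 'econome-9.nantes.grid5000.fr']
cores_per_host = get_cores_hosts(example_hosts)
print('total cores: %d' % sum(cores_per_host.values()))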
def get_ip(node):
    """return the ip of the given host"""
    host = get_host_attributes(node)
    num = 0
    # because with some clusters, the ip to use is not the first one
    # for example, change to 0,1 for lyon
    for i in host['network_adapters']:
        if 'ip' in i:
            return prefix + i['ip'].split('.')[3], i['ip']
def get_ip(node):
    """return the ip of the given host"""
    host = get_host_attributes(node)
    nip = 0
    # because with some clusters, the ip to use is not the first one
    # for example, change to 0,1 for lyon
    for i in host['network_adapters']:
        if 'ip' in i:
            if nip > len(i['ip']):
                return None
            return i['ip'][nip]
def is_snmp_available(args):
    """Tell whether an SNMP power sensor (PDU) is available for this node

    :param args: Script argument
    :return: True if available, False otherwise
    """
    try:
        data = get_host_attributes(args.node_name)
        data['sensors']['power']['via']['pdu']
    except KeyError:
        return False
    return True
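# Hedged usage sketch (illustration only): is_snmp_available above only needs an
# object carrying a node_name attribute, so a hypothetical argparse namespace is
# enough; 'taurus-7' is a made-up node name.
import argparse

example_args = argparse.Namespace(node_name='taurus-7')
if is_snmp_available(example_args):
    print('%s exposes a PDU power sensor' % example_args.node_name)
else:
    print('no PDU power sensor for %s' % example_args.node_name)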
def getwatt(node, from_ts, to_ts):
    """Get power values from Grid'5000 Lyon Wattmetre (requires Execo)

    :param node: Node name
    :param from_ts: Time from which metric is collected, as an integer Unix timestamp
    :param to_ts: Time until which metric is collected, as an integer Unix timestamp
    :return: A list of (timestamp, value) tuples.
    """
    import datetime
    import requests
    import gzip
    import time
    from execo_g5k import get_host_attributes

    watt = []
    node_wattmetre = get_host_attributes(node)['sensors']['power']['via']['pdu']
    for i in range(len(node_wattmetre)):
        node = node_wattmetre[i]
        tmp_watt = []
        for ts in range(int(from_ts), int(to_ts) + 3600, 3600):
            suffix = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%dT%H')
            if suffix != datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%dT%H'):
                suffix += ".gz"
            data = requests.get("http://wattmetre.lyon.grid5000.fr/data/"
                                + node['uid'] + "-log/power.csv." + suffix).content
            if suffix.endswith(".gz"):
                data = gzip.decompress(data)
            for l in str(data).split('\\n')[1:-1]:
                l = l.split(',')
                if l[3] == 'OK' and l[4 + node['port']] != '':
                    ts, value = (float(l[2]), float(l[4 + node['port']]))
                    if from_ts <= ts and ts <= to_ts:
                        tmp_watt.append((ts, value))
            if not suffix.endswith(".gz"):
                break
        watt.append(tmp_watt)
    if len(watt) > 1:
        for i in range(1, len(watt)):
            for j in range(min(len(watt[0]), len(watt[i]))):
                watt[0][j] = (watt[0][j][0], watt[0][j][1] + watt[i][j][1])
    return watt[0]
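# Hedged usage sketch (illustration only): fetching one hour of power samples
# with the getwatt variant above. 'taurus-7' is a hypothetical Lyon node, and the
# call assumes network access to wattmetre.lyon.grid5000.fr from where it runs.
import time as _time

_now = _time.time()
samples = getwatt('taurus-7', _now - 3600, _now)
print('%d samples, last: %s' % (len(samples), samples[-1] if samples else None))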
def get_switch(self, host):
    """Return the network switch to which the host is connected.

    Args:
      host (Host): The host to query.

    Return:
      str: The network switch to which the given host is connected.
    """
    nw_adapters = get_host_attributes(host)[u'network_adapters']
    for nwa in nw_adapters:
        if (u'network_address' in nwa and
                nwa[u'network_address'] == host.address):
            return nwa[u'switch']
def getwatt(node=None, from_ts=None, to_ts=None):
    """Get power values from Grid'5000 Lyon Wattmetre (requires Execo)

    :param node: Node name
    :param from_ts: Time from which metric is collected, as an integer Unix timestamp
    :param to_ts: Time until which metric is collected, as an integer Unix timestamp
    :return: A list of (timestamp, value) tuples.
    """
    if node is None:
        node = platform.node().split(".")[0]
    if to_ts is None:
        to_ts = time.time()
    if from_ts is None:
        from_ts = to_ts - 300

    watt = []
    host_attrs = get_host_attributes(node)
    node_wattmetre = host_attrs["sensors"]["power"]["via"]["pdu"][0]
    first_part_address = ("http://wattmetre.lyon.grid5000.fr/data/"
                          + node_wattmetre["uid"] + "-log/power.csv.")
    for ts in range(int(from_ts), int(to_ts) + 3600, 3600):
        suffix = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%dT%H")
        if suffix != datetime.datetime.fromtimestamp(time.time()).strftime("%Y-%m-%dT%H"):
            suffix += ".bz2"
        address = first_part_address + suffix
        print("getting file:", address)
        data = requests.get(address).content
        if suffix.endswith(".bz2"):
            data = bz2.decompress(data)
        for l in str(data).split("\\n")[1:-1]:
            l = l.split(",")
            if l[3] == "OK" and l[4 + node_wattmetre["port"]] != "":
                ts, value = (float(l[2]), float(l[4 + node_wattmetre["port"]]))
                if from_ts <= ts and ts <= to_ts:
                    watt.append((ts, value))
        if not suffix.endswith(".bz2"):
            break
    return watt
def get_nodes(self, comb):
    """Perform a submission for a given comb and retrieve the submission node list"""
    logger.info('Performing submission')
    n_core = get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size']
    submission = OarSubmission(resources="nodes=%d" % (max(1, comb['cores'] / n_core), ),
                               sql_properties="cluster='%s'" % comb['cluster'],
                               job_type="besteffort",
                               name="l2c_fft_eval")
    self.oar_job_id, self.frontend = oarsub([(submission, get_cluster_site(comb['cluster']))])[0]
    logger.info("Waiting for job start")
    wait_oar_job_start(self.oar_job_id, self.frontend)
    logger.info("Retrieving hosts list")
    nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
    self.hosts = [host for host in nodes for i in range(n_core)]
def _configure_servers(self, hosts=None):
    """Configure servers and host-dependent parameters.

    Args:
      hosts (list of Host, optional):
        The list of hosts to take into account in the configuration. If
        not specified, all the hosts of the Spark cluster are used. The
        first host of this list is always used as the reference.
    """
    if not hosts:
        hosts = self.hosts

    host_attrs = get_host_attributes(hosts[0])
    num_cores = host_attrs[u'architecture'][u'smt_size']
    total_memory_mb = (int(host_attrs[u'main_memory'][u'ram_size']) /
                       (1024 * 1024))
    memory_per_worker = int(0.75 * total_memory_mb)
    memory_per_task = int(memory_per_worker / num_cores)

    # Set memory for each worker
    command = "cat >> " + self.conf_dir + "/spark-env.sh << EOF\n"
    command += "SPARK_MASTER_PORT=" + str(self.port) + "\n"
    command += "SPARK_WORKER_MEMORY=" + str(memory_per_worker) + "m\n"
    command += "EOF\n"
    action = Remote(command, self.hosts)
    action.run()

    # Default parameters
    driver_mem = "1g"
    executor_mem = str(memory_per_task) + "m"

    with open(self.temp_conf_dir + "/spark-defaults.conf", "a") \
            as defaults_file:
        defaults_file.write("spark.executor.memory\t" + executor_mem + "\n")
        defaults_file.write("spark.driver.memory\t" + driver_mem + "\n")
        # defaults_file.write("spark.driver.maxResultSize\t1g\n")
        defaults_file.write("spark.logConf\ttrue\n")
        # defaults_file.write("spark.python.worker.memory\t512m")
        if self.evs_log_dir:
            defaults_file.write("spark.eventLog.enabled\ttrue\n")
            defaults_file.write("spark.eventLog.dir\t" + self.evs_log_dir + "\n")
def _configure_servers(self, hosts=None):
    """Configure servers and host-dependent parameters.

    Args:
      hosts (list of Host, optional):
        The list of hosts to take into account in the configuration. If
        not specified, all the hosts of the Spark cluster are used. The
        first host of this list is always used as the reference.
    """
    if not hosts:
        hosts = self.hosts

    host_attrs = get_host_attributes(hosts[0])
    num_cores = host_attrs[u'architecture'][u'smt_size']
    total_memory_mb = (int(host_attrs[u'main_memory'][u'ram_size']) /
                       (1024 * 1024))
    memory_per_worker = int(0.75 * total_memory_mb)
    memory_per_task = int(memory_per_worker / num_cores)

    # Set memory for each worker
    command = "cat >> " + self.conf_dir + "/spark-env.sh << EOF\n"
    command += "SPARK_MASTER_PORT=" + str(self.port) + "\n"
    command += "SPARK_WORKER_MEMORY=" + str(memory_per_worker) + "m\n"
    command += "EOF\n"
    action = Remote(command, self.hosts)
    action.run()

    # Default parameters
    driver_mem = "1g"
    executor_mem = str(memory_per_task) + "m"

    with open(self.temp_conf_dir + "/spark-defaults.conf", "a") \
            as defaults_file:
        defaults_file.write("spark.executor.memory\t" + executor_mem + "\n")
        defaults_file.write("spark.driver.memory\t" + driver_mem + "\n")
        # defaults_file.write("spark.driver.maxResultSize\t1g\n")
        defaults_file.write("spark.logConf\ttrue\n")
        # defaults_file.write("spark.python.worker.memory\t512m")
        defaults_file.write("spark.eventLog.enabled\ttrue\n")
        defaults_file.write("spark.eventLog.dir\t" + self.event_log_dir + "\n")
def parse_omegawatt(args):
    """
    source: https://gitlab.inria.fr/delamare/wattmetre-read/raw/master/tools/getwatt.py
    :param args: Script argument
    :return: A dict mapping each timestamp to the averaged power value.
    """
    watt = {}
    node_wattmetre = get_host_attributes(args.node_name)['sensors']['power']['via']['pdu'][0]
    from_ts = int(args.timestamp_start)
    to_ts = int(args.timestamp_stop)
    for ts in range(from_ts, to_ts + 3600, 3600):
        suffix = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%dT%H')
        if suffix != datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%dT%H'):
            suffix += ".gz"
        req = requests.get("http://wattmetre." + args.city_name + ".grid5000.fr/data/"
                           + node_wattmetre['uid'] + "-log/power.csv." + suffix)
        if req.status_code == 404:
            return watt
        data = req.content
        if suffix.endswith(".gz"):
            data = gzip.decompress(data)
        for l in str(data).split('\\n')[1:-1]:
            l = l.split(',')
            if l[3] == 'OK' and l[4 + node_wattmetre['port']] != '':
                ts, value = (int(np.round(float(l[2]))),
                             float(l[4 + node_wattmetre['port']]))
                if from_ts <= ts and ts <= to_ts:
                    if ts not in watt:
                        watt[ts] = [0, 0]
                    watt[ts][0] += value
                    watt[ts][1] += 1
        if not suffix.endswith(".gz"):
            break
    for ts, val in watt.items():
        watt[ts] = watt[ts][0] / watt[ts][1]
    return watt
def get_pdu_ip_and_port(args):
    """Return the PDU IP string and the port associated with the current node

    :param args: Script argument
    :return: List of PDUs with IP/port
    """
    data = get_host_attributes(args.node_name)

    # Get PDU names
    pdus_name = [pdu['uid'] for pdu in data['sensors']['power']['via']['pdu']]

    # Get PDU IP/port
    pdus_infos = []
    for pdu_name in pdus_name:
        port = None
        for pdu_info in data['sensors']['power']['via']['pdu']:
            if pdu_info['uid'] == pdu_name:
                port = pdu_info['port']
                break
        ip = socket.gethostbyname(pdu_name + "." + args.city_name + ".grid5000.fr")
        pdus_infos.append((pdu_name, ip, port))
    return pdus_infos
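# Hedged usage sketch (illustration only): resolving the PDU(s) feeding a node
# with get_pdu_ip_and_port above. node_name and city_name are made-up values, and
# socket / get_host_attributes are assumed importable as in the original module.
import argparse

example_args = argparse.Namespace(node_name='taurus-7', city_name='lyon')
for pdu_name, pdu_ip, pdu_port in get_pdu_ip_and_port(example_args):
    print('%s -> %s (port %s)' % (pdu_name, pdu_ip, pdu_port))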
def define_parameters(self):
    """Create the iterator on the parameters combinations to be explored"""
    # fixed number of nodes
    self.n_nodes = 4
    # choose a list of clusters
    clusters = ['graphene', 'petitprince', 'edel', 'paradent', 'stremi']
    # clusters = ['petitprince', 'paradent']

    # compute the maximum number of cores among all clusters
    max_core = self.n_nodes * max([
        get_host_attributes(cluster + '-1')['architecture']['smt_size']
        for cluster in clusters])

    # define the parameters
    self.parameters = {
        'cluster': clusters,
        'n_core': filter(lambda i: i >= self.n_nodes,
                         list(takewhile(lambda i: i < max_core,
                                        (2**i for i in count(0, 1))))),
        'size': ['A', 'B', 'C']
    }
    logger.info(self.parameters)

    # define the iterator over the parameters combinations
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))
    logger.info('Number of parameters combinations %s' % len(self.sweeper.get_remaining()))
def run(self):
    """Inherited method, put here the code for running the engine"""
    self.define_parameters()
    self.cluster = self.args[0]
    self.site = get_cluster_site(self.cluster)
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None

    try:
        # Creation of the main iterator which is used for the first control loop.
        # You need to have a method called define_parameters that returns a list
        # of parameter dicts.
        self.define_parameters()

        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for the experiments
            if job_is_dead or self.oar_job_id is None:
                self.make_reservation()
            # Retrieving the hosts and subnets parameters
            self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
            # Hosts deployment
            deployed, undeployed = deploy(
                Deployment(self.hosts, env_file="/home/mliroz/deploys/hadoop6.env"))
            logger.info("%i deployed, %i undeployed" % (len(deployed), len(undeployed)))
            if len(deployed) == 0:
                break
            # System configuration => look at the execo_g5k.topology module
            attr = get_host_attributes(self.cluster + '-1')

            ## SETUP FINISHED

            # Getting the next combination
            comb = self.sweeper.get_next()
            self.prepare_dataset(comb)
            self.xp(comb)
            # subloop over the combinations that have the same sizes
            while True:
                newcomb = self.sweeper.get_next(
                    lambda r: filter(lambda subcomb: subcomb['sizes'] == comb['sizes'], r))
                if newcomb:
                    try:
                        self.xp(newcomb)
                    except:
                        break
                else:
                    break

            if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                job_is_dead = True

    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')
def get_ip(node):
    """return the ip of the given host"""
    host = get_host_attributes(node)
    for i in host['network_adapters']:
        if 'ip' in i:
            return config['prefix'] + i['ip'].split('.')[3], i['ip']
def __init__(self, name, hosts):
    super(G5kPhysicalCluster, self).__init__(name, hosts)
    host_attrs = get_host_attributes(hosts[0])
    self._num_cores = host_attrs[u'architecture'][u'smt_size']
    self._memory = host_attrs[u'main_memory'][u'ram_size'] / (1024 * 1024)
def get_memory_and_cores(self, host):
    host_attrs = get_host_attributes(host)
    cores = host_attrs[u'architecture'][u'nb_cores']
    mem = host_attrs[u'main_memory'][u'ram_size'] / (1024 * 1024)
    return mem, cores
def run(self):
    # Defining experiment parameters
    self.parameters = {
        'n_clients': [400, 450, 500, 550, 600],
        'n_transitions': [10000]
    }
    cluster = 'griffon'
    sweeps = sweep(self.parameters)
    sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
    server_out_path = os.path.join(self.result_dir, "server.out")

    self._updateStat(sweeper.stats())

    # Loop on the number of nodes
    while True:
        # Taking the next parameter combinations
        comb = sweeper.get_next()
        if not comb:
            break

        # Performing the submission on G5K
        site = get_cluster_site(cluster)
        self._log("Output will go to " + self.result_dir)

        n_nodes = int(math.ceil(
            float(comb['n_clients']) /
            EX5.get_host_attributes(cluster + '-1')['architecture']['smt_size'])) + 1
        self._log("Reserving {0} nodes on {1}".format(n_nodes, site))

        resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes)
        submission = EX5.OarSubmission(resources=resources,
                                       job_type='allow_classic_ssh',
                                       walltime='00:10:00')

        job = EX5.oarsub([(submission, site)])
        self.__class__._job = job

        # Sometimes oarsub fails silently
        if job[0][0] is None:
            print("\nError: no job was created")
            sys.exit(1)

        # Wait for the job to start
        self._log("Waiting for job {0} to start...\n".format(BOLD_MAGENTA + str(job[0][0]) + NORMAL))
        EX5.wait_oar_job_start(job[0][0], job[0][1], prediction_callback=prediction)
        nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1])

        # Deploying nodes
        # deployment = EX5.Deployment(hosts=nodes, env_file='path_to_env_file')
        # run_deploy = EX5.deploy(deployment)
        # nodes_deployed = run_deploy.hosts[0]

        # Copying active_data program on all deployed hosts
        EX.Put([nodes[0]], '../dist/active-data-lib-0.1.2.jar',
               connexion_params={'user': '******'}).run()
        EX.Put([nodes[0]], '../server.policy',
               connexion_params={'user': '******'}).run()

        # Loop on the number of requests per client process
        while True:
            # Split the nodes
            clients = nodes[1:]
            server = nodes[0]

            self._log("Running experiment with {0} nodes and {1} transitions per client".format(
                len(clients), comb['n_transitions']))

            # Launching Server on one node
            out_handler = FileOutputHandler(server_out_path)
            launch_server = EX.Remote('java -jar active-data-lib-0.1.2.jar', [server],
                                      stdout_handler=out_handler,
                                      stderr_handler=out_handler).start()
            self._log("Server started on " + server.address)
            time.sleep(2)

            # Launching clients
            rank = 0
            n_cores = EX5.get_host_attributes(clients[0])['architecture']['smt_size']
            cores = nodes * n_cores
            cores = cores[0:comb['n_clients']]  # Cut out the additional cores
            client_connection_params = {
                'taktuk_gateway': 'lyon.grid5000.fr',
                'host_rewrite_func': None
            }

            self._log("Launching {0} clients...".format(len(cores)))

            client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \
                "{0} {1} {2} {3} {4}".format(server.address, 1200, "{{range(len(cores))}}",
                                             len(cores), comb['n_transitions'])
            client_out_handler = FileOutputHandler(os.path.join(self.result_dir, "clients.out"))
            client_request = EX.TaktukRemote(client_cmd, cores,
                                             connexion_params=client_connection_params,
                                             stdout_handler=client_out_handler,
                                             stderr_handler=client_out_handler)

            client_request.run()

            if not client_request.ok():
                # Some client failed, please panic
                self._log("One or more client process failed. Enjoy reading their outputs.")
                self._log("OUTPUT STARTS -------------------------------------------------\n")
                for process in client_request.processes():
                    print("----- {0} returned {1}".format(process.host().address,
                                                          process.exit_code()))
                    if not process.stdout() == "":
                        print(GREEN + process.stdout() + NORMAL)
                    if not process.stderr() == "":
                        print(RED + process.stderr() + NORMAL)
                    print("")
                self._log("OUTPUT ENDS ---------------------------------------------------\n")
                sweeper.skip(comb)
                launch_server.kill()
                launch_server.wait()
            else:
                # Waiting for server to end
                launch_server.wait()

                # Getting log files
                distant_path = OUT_FILE_FORMAT.format(len(cores), comb['n_transitions'])
                local_path = distant_path

                EX.Get([server], distant_path).run()
                EX.Local('mv ' + distant_path + ' ' + os.path.join(self.result_dir, local_path)).run()
                EX.Get([server], 'client_*.out', local_location=self.result_dir)
                EX.Remote('rm -f client_*.out', [server])

                self._log("Finishing experiment with {0} clients and {1} transitions per client".format(
                    comb['n_clients'], comb['n_transitions']))
                sweeper.done(comb)

            sub_comb = sweeper.get_next(filtr=lambda r: filter(
                lambda s: s["n_clients"] == comb['n_clients'], r))
            self._updateStat(sweeper.stats())

            if not sub_comb:
                # Killing job
                EX5.oar.oardel(job)
                self.__class__._job = None
                break
            else:
                comb = sub_comb

    print ""
def _configure_servers(self, hosts=None):
    """Configure servers and host-dependent parameters.

    Args:
      hosts (list of Host, optional):
        The list of hosts to take into account in the configuration. If
        not specified, all the hosts of the Hadoop cluster are used. The
        first host of this list is always used as the reference.
    """
    if not hosts:
        hosts = self.hosts

    host_attrs = get_host_attributes(hosts[0])
    num_cores = host_attrs[u'architecture'][u'smt_size']
    available_memory = (int(host_attrs[u'main_memory'][u'ram_size']) /
                        (1024 * 1024))
    total_memory_mb = min(available_memory - 2 * 1024,
                          int(0.75 * available_memory))
    mem_per_task_mb = total_memory_mb / (num_cores - 1)

    replace_in_xml_file(os.path.join(self.temp_conf_dir, CORE_CONF_FILE),
                        "fs.defaultFS",
                        "hdfs://" + self.master.address + ":" + str(self.hdfs_port) + "/",
                        True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, CORE_CONF_FILE),
                        "hadoop.tmp.dir", self.hadoop_temp_dir, True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, CORE_CONF_FILE),
                        "topology.script.file.name", self.conf_dir + "/topo.sh", True)

    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.framework.name", "yarn", True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.map.memory.mb", str(mem_per_task_mb), True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.map.java.opts", "-Xmx" + str(mem_per_task_mb) + "m", True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.map.cpu.vcores", "1", True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.reduce.memory.mb", str(mem_per_task_mb), True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.reduce.cpu.vcores", "1", True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.reduce.java.opts", "-Xmx" + str(mem_per_task_mb) + "m", True)

    replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                        "yarn.resourcemanager.hostname", self.master.address, True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                        "yarn.nodemanager.resource.memory-mb", str(total_memory_mb), True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                        "yarn.nodemanager.resource.cpu-vcores", str(num_cores - 1), True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                        "yarn.scheduler.maximum-allocation-mb", str(total_memory_mb), True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                        "yarn.nodemanager.aux-services", "mapreduce_shuffle", True)
def _configure_servers(self, hosts=None):
    """Configure servers and host-dependent parameters.

    Args:
      hosts (list of Host, optional):
        The list of hosts to take into account in the configuration. If
        not specified, all the hosts of the Hadoop cluster are used. The
        first host of this list is always used as the reference.
    """
    if not hosts:
        hosts = self.hosts

    host_attrs = get_host_attributes(hosts[0])
    num_cores = host_attrs[u'architecture'][u'smt_size']
    available_memory = (int(host_attrs[u'main_memory'][u'ram_size']) /
                        (1024 * 1024))
    total_memory_mb = min(available_memory - 2 * 1024,
                          int(0.75 * available_memory))
    mem_per_task_mb = total_memory_mb / (num_cores - 1)

    replace_in_xml_file(os.path.join(self.temp_conf_dir, CORE_CONF_FILE),
                        "fs.defaultFS",
                        "hdfs://" + self.master.address + ":" + str(self.hdfs_port) + "/",
                        True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, CORE_CONF_FILE),
                        "hadoop.tmp.dir", self.hadoop_temp_dir, True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, CORE_CONF_FILE),
                        "topology.script.file.name", self.conf_dir + "/topo.sh", True)

    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.framework.name", "yarn", True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.map.memory.mb", str(mem_per_task_mb), True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.map.java.opts", "-Xmx" + str(mem_per_task_mb) + "m", True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.map.cpu.vcores", "1", True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.reduce.memory.mb", str(mem_per_task_mb), True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.reduce.cpu.vcores", "1", True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                        "mapreduce.reduce.java.opts", "-Xmx" + str(mem_per_task_mb) + "m", True)

    replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                        "yarn.resourcemanager.address",
                        self.master.address + ":" + str(self.mapred_port), True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                        "yarn.nodemanager.resource.memory-mb", str(total_memory_mb), True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                        "yarn.nodemanager.resource.cpu-vcores", str(num_cores - 1), True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                        "yarn.scheduler.maximum-allocation-mb", str(total_memory_mb), True)
    replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                        "yarn.nodemanager.aux-services", "mapreduce_shuffle", True)
def run(self):
    """ """
    if self.options.oargrid_job_id:
        self.oargrid_job_id = self.options.oargrid_job_id
    else:
        self.oargrid_job_id = None

    try:
        # Creation of the main iterator which is used for the first control loop.
        self.define_parameters()

        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for the experiments
            if self.oargrid_job_id is None:
                self.make_reservation()
            # Wait for the job to start
            logger.info('Waiting that the job start')
            wait_oargrid_job_start(self.oargrid_job_id)
            # Retrieving the hosts and subnets parameters
            self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
            # Hosts deployment and configuration
            default_connection_params['user'] = '******'

            logger.info("Start hosts configuration")
            ex_log.setLevel('INFO')
            deployment = Deployment(
                hosts=self.hosts,
                env_file='/home/sirimie/env/mywheezy-x64-base.env')
            self.hosts, _ = deploy(deployment)

            Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*", self.hosts).run()
            Remote("rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml", self.hosts).run()
            Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml", self.hosts).run()

            Put(self.hosts,
                ["run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                 "platform_aws.xml", "cloud_ec2.xml"],
                remote_location="/home/Work/sgcbntier/paasage_demo/").run()
            logger.info("Done")

            if len(self.hosts) == 0:
                break

            # Initializing the resources and threads
            available_hosts = [
                host for host in self.hosts
                for i in range(get_host_attributes(host)['architecture']['smt_size'])
            ]
            threads = {}

            # Creating the unique folder for storing the results
            comb_dir = self.result_dir + '/csv_results'
            if not os.path.exists(comb_dir):
                os.mkdir(comb_dir)

            # Checking that the job is running and not in Error
            while self.is_job_alive() or len(threads.keys()) > 0:
                job_is_dead = False

                while self.options.n_nodes > len(available_hosts):
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.append(tmp_threads[t]['host'])
                            del threads[t]
                    sleep(5)
                    if not self.is_job_alive():
                        job_is_dead = True
                        break
                if job_is_dead:
                    break

                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break

                host = available_hosts[0]
                available_hosts = available_hosts[1:]

                t = Thread(target=self.workflow, args=(comb, host, comb_dir))
                threads[t] = {'host': host}
                t.daemon = True
                t.start()

            if not self.is_job_alive():
                job_is_dead = True

            if job_is_dead:
                self.oargrid_job_id = None

    finally:
        if self.oargrid_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oargriddel([self.oargrid_job_id])
            else:
                logger.info('Keeping job alive for debugging')