Example #1
 def run_xp(self):
     """Iterate over the parameters and execute the bench"""
     while len(self.sweeper.get_remaining()) > 0:
         comb = self.sweeper.get_next()
         if comb['n_core'] > get_host_attributes(comb['cluster']+'-1')['architecture']['smt_size'] * self.n_nodes: 
             self.sweeper.skip(comb)
             continue
         logger.info('Processing new combination %s' % (comb,))
         site = get_cluster_site(comb['cluster'])
         jobs = oarsub([(OarSubmission(resources = "{cluster='" + comb['cluster']+"'}/nodes=" + str(self.n_nodes),
                                       job_type = 'allow_classic_ssh', 
                                       walltime ='0:10:00'), 
                         site)])
         if jobs[0][0]:
             try:
                 wait_oar_job_start(*jobs[0])
                 nodes = get_oar_job_nodes(*jobs[0])
                 bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                     ",".join([node.address for node in nodes]),
                     comb['n_core'],
                     get_mpi_opts(comb['cluster']),
                     comb['size'],
                     comb['n_core'])
                 lu_bench = SshProcess(bench_cmd, nodes[0])
                 lu_bench.stdout_handlers.append(self.result_dir + '/' + slugify(comb) + '.out')
                 lu_bench.run()
                 if lu_bench.ok:
                     logger.info("comb ok: %s" % (comb,))
                     self.sweeper.done(comb)
                     continue
             finally:
                 oardel(jobs)
         logger.info("comb NOT ok: %s" % (comb,))
         self.sweeper.cancel(comb)
Example #2
def get_cores_hosts(hosts):
    """Get the number of cores of a list of given hosts

    Parameters
    ----------
    hosts: list
        a list of hosts

    Returns
    -------
    dict
        key: str, name of host (e.g. econome-8.nantes.grid5000.fr)
        value: int, the number of cores

    """

    n_cores_hosts = dict()
    for host in hosts:
        host_name = host.split('.')[0]
        try:
            n_cores_hosts[host] = get_host_attributes(
                host_name)['architecture']['nb_cores']
            logger.info('Number of cores of [%s] = %s' %
                        (host_name, n_cores_hosts[host]))
        except Exception as e:
            logger.error('Cannot get number of cores from host [%s]' %
                         host_name)
            logger.error('Exception: %s' % e, exc_info=True)
    return n_cores_hosts
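A minimal usage sketch for the helper above, assuming the execo_g5k import and a configured logger; the host names are illustrative:

from execo_g5k import get_host_attributes

hosts = ['econome-8.nantes.grid5000.fr', 'econome-9.nantes.grid5000.fr']
n_cores = get_cores_hosts(hosts)
print('total cores reserved:', sum(n_cores.values()))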
Example #3
def get_ip(node):
    """Return a (prefixed last octet, ip) tuple for the given host"""
    host = get_host_attributes(node)
    num = 0  # because with some clusters, the ip to use is not the first one
    # (for example, set num = 1 for lyon)
    adapters = [i for i in host['network_adapters'] if 'ip' in i]
    if num < len(adapters):
        ip = adapters[num]['ip']
        # 'prefix' must be defined elsewhere in the module
        return prefix + ip.split('.')[3], ip
Example #4
def get_ip(node):
    """Return the ip of the given host"""
    host = get_host_attributes(node)
    nip = 0  # because with some clusters, the ip to use is not the first one
    # (for example, set nip = 1 for lyon)
    ips = [i['ip'] for i in host['network_adapters'] if 'ip' in i]
    if nip >= len(ips):
        return None
    return ips[nip]
Example #5
def is_snmp_available(args):
    """
    Allow to know if SNMP-sensor is available for this node
    :param args: Script argument
    :return: True if available, False otherwise
    """
    try:
        data = get_host_attributes(args.node_name)
        # Raises KeyError if the node has no PDU power sensor
        data['sensors']['power']['via']['pdu']
    except KeyError:
        return False
    return True
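A hypothetical driver for the check above, assuming an argparse-style args object with a node_name attribute (the node name is illustrative):

import argparse

args = argparse.Namespace(node_name='ecotype-12')
if is_snmp_available(args):
    print('PDU power sensor available for', args.node_name)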
Example #6
def getwatt(node, from_ts, to_ts):
    """Get power values from Grid'5000 Lyon Wattmetre (requires Execo)

    :param node: Node name

    :param from_ts: Time from which metric is collected, as an integer Unix timestamp

    :param to_ts: Time until which metric is collected, as an integer Unix timestamp

    :return: A list of (timestamp, value) tuples.
    """

    import datetime
    import requests
    import gzip
    import time
    from execo_g5k import get_host_attributes

    watt = []
    node_wattmetre = get_host_attributes(
        node)['sensors']['power']['via']['pdu']
    for pdu in node_wattmetre:  # one entry per PDU feeding the node
        tmp_watt = []
        for ts in range(int(from_ts), int(to_ts) + 3600, 3600):
            suffix = datetime.datetime.fromtimestamp(ts).strftime(
                '%Y-%m-%dT%H')
            # Past hours are stored compressed; the current hour is not
            if suffix != datetime.datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%dT%H'):
                suffix += ".gz"
            data = requests.get("http://wattmetre.lyon.grid5000.fr/data/" +
                                pdu['uid'] + "-log/power.csv." +
                                suffix).content
            if suffix.endswith(".gz"):
                data = gzip.decompress(data)
            # Skip the CSV header and the trailing partial line
            for l in data.decode().split('\n')[1:-1]:
                l = l.split(',')
                if l[3] == 'OK' and l[4 + pdu['port']] != '':
                    ts, value = (float(l[2]), float(l[4 + pdu['port']]))
                    if from_ts <= ts <= to_ts:
                        tmp_watt.append((ts, value))
            if not suffix.endswith(".gz"):
                break
        watt.append(tmp_watt)

    # If several PDUs feed the node, sum their readings pairwise
    # (samples are aligned by index)
    if len(watt) > 1:
        for i in range(1, len(watt)):
            for j in range(min(len(watt[0]), len(watt[i]))):
                watt[0][j] = (watt[0][j][0], watt[0][j][1] + watt[i][j][1])

    return watt[0]
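A hypothetical call to the function above, collecting the last hour of samples for a Lyon node (the node name is illustrative):

import time

watt = getwatt('nova-1', time.time() - 3600, time.time())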
Example #7
    def get_switch(self, host):
        """Return the network switch to which the host is connected.

        Args:
          host (Host): The host to query.

        Return:
          str: The network switch to which the given host is connected.
        """
        nw_adapters = get_host_attributes(host)[u'network_adapters']
        for nwa in nw_adapters:
            if (u'network_address' in nwa and
                        nwa[u'network_address'] == host.address):
                return nwa[u'switch']
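A hedged usage sketch, assuming a topology object exposing this method and an execo Host whose address matches one of the node's network adapters; 'topology' and the host name are illustrative:

from execo import Host

switch = topology.get_switch(Host('griffon-1.nancy.grid5000.fr'))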
Example #8
def getwatt(node=None, from_ts=None, to_ts=None):
    """Get power values from Grid'5000 Lyon Wattmetre (requires Execo)

    :param node: Node name

    :param from_ts: Time from which metric is collected, as an integer Unix timestamp

    :param to_ts: Time until which metric is collected, as an integer Unix timestamp

    :return: A list of (timestamp, value) tuples.
    """

    if node is None:
        node = platform.node().split(".")[0]

    if to_ts is None:
        to_ts = time.time()

    if from_ts is None:
        from_ts = to_ts - 300

    watt = []
    host_attrs = get_host_attributes(node)
    node_wattmetre = host_attrs["sensors"]["power"]["via"]["pdu"][0]
    first_part_address = ("http://wattmetre.lyon.grid5000.fr/data/" +
                          node_wattmetre["uid"] + "-log/power.csv.")

    for ts in range(int(from_ts), int(to_ts) + 3600, 3600):
        suffix = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%dT%H")
        if suffix != datetime.datetime.fromtimestamp(
                time.time()).strftime("%Y-%m-%dT%H"):
            suffix += ".bz2"
        address = first_part_address + suffix
        print("getting file:", address)
        data = requests.get(address).content
        if suffix.endswith(".bz2"):
            data = bz2.decompress(data)
        # Skip the CSV header and the trailing partial line
        for l in data.decode().split("\n")[1:-1]:
            l = l.split(",")
            if l[3] == "OK" and l[4 + node_wattmetre["port"]] != "":
                ts, value = (float(l[2]), float(l[4 + node_wattmetre["port"]]))
                if from_ts <= ts <= to_ts:
                    watt.append((ts, value))
        if not suffix.endswith(".bz2"):
            break
    return watt
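With every argument defaulted, this variant reads the last five minutes of samples for the node the script runs on; a minimal usage sketch:

samples = getwatt()
if samples:
    print('mean power: %.1f W' % (sum(v for _, v in samples) / len(samples)))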
Example #9
 def get_nodes(self, comb):
     """
         Perform a submission for a given comb and 
         retrieve the submission node list
     """
     logger.info('Performing submission')
     n_core = get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size']
      submission = OarSubmission(resources="nodes=%d" % (max(1, comb['cores'] // n_core), ),
                sql_properties="cluster='%s'"%comb['cluster'],
                job_type="besteffort", 
                name="l2c_fft_eval")
     self.oar_job_id, self.frontend = oarsub([(submission, get_cluster_site(comb['cluster']))])[0]
     logger.info("Waiting for job start")
     wait_oar_job_start(self.oar_job_id, self.frontend)
     logger.info("Retrieving hosts list")
     nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
     self.hosts = [host for host in nodes for i in range(n_core)]
Example #10
    def _configure_servers(self, hosts=None):
        """Configure servers and host-dependant parameters.

           Args:
             hosts (list of Host, optional):
               The list of hosts to take into account in the configuration. If
               not specified, all the hosts of the Spark cluster are used. The
               first host of this list is always used as the reference.
        """

        if not hosts:
            hosts = self.hosts

        host_attrs = get_host_attributes(hosts[0])
        num_cores = host_attrs[u'architecture'][u'smt_size']
        total_memory_mb = (int(host_attrs[u'main_memory'][u'ram_size']) //
                           (1024 * 1024))
        memory_per_worker = int(0.75 * total_memory_mb)
        memory_per_task = int(memory_per_worker / num_cores)

        # Set memory for each worker
        command = "cat >> " + self.conf_dir + "/spark-env.sh << EOF\n"
        command += "SPARK_MASTER_PORT=" + str(self.port) + "\n"
        command += "SPARK_WORKER_MEMORY=" + str(memory_per_worker) + "m\n"
        command += "EOF\n"
        action = Remote(command, self.hosts)
        action.run()

        # Default parameters
        driver_mem = "1g"
        executor_mem = str(memory_per_task) + "m"

        with open(self.temp_conf_dir + "/spark-defaults.conf", "a") \
                as defaults_file:
            defaults_file.write("spark.executor.memory\t" + executor_mem + "\n")
            defaults_file.write("spark.driver.memory\t" + driver_mem + "\n")
            # defaults_file.write("spark.driver.maxResultSize\t1g\n")
            defaults_file.write("spark.logConf\ttrue\n")
            # defaults_file.write("spark.python.worker.memory\t512m")
            if self.evs_log_dir:
                defaults_file.write("spark.eventLog.enabled\ttrue\n")
                defaults_file.write("spark.eventLog.dir\t" +
                                    self.evs_log_dir + "\n")
Example #11
    def _configure_servers(self, hosts=None):
        """Configure servers and host-dependant parameters.

           Args:
             hosts (list of Host, optional):
               The list of hosts to take into account in the configuration. If
               not specified, all the hosts of the Spark cluster are used. The
               first host of this list is always used as the reference.
        """

        if not hosts:
            hosts = self.hosts

        host_attrs = get_host_attributes(hosts[0])
        num_cores = host_attrs[u'architecture'][u'smt_size']
        total_memory_mb = (int(host_attrs[u'main_memory'][u'ram_size']) //
                           (1024 * 1024))
        memory_per_worker = int(0.75 * total_memory_mb)
        memory_per_task = int(memory_per_worker / num_cores)

        # Set memory for each worker
        command = "cat >> " + self.conf_dir + "/spark-env.sh << EOF\n"
        command += "SPARK_MASTER_PORT=" + str(self.port) + "\n"
        command += "SPARK_WORKER_MEMORY=" + str(memory_per_worker) + "m\n"
        command += "EOF\n"
        action = Remote(command, self.hosts)
        action.run()

        # Default parameters
        driver_mem = "1g"
        executor_mem = str(memory_per_task) + "m"

        with open(self.temp_conf_dir + "/spark-defaults.conf", "a") \
                as defaults_file:
            defaults_file.write("spark.executor.memory\t" + executor_mem +
                                "\n")
            defaults_file.write("spark.driver.memory\t" + driver_mem + "\n")
            # defaults_file.write("spark.driver.maxResultSize\t1g\n")
            defaults_file.write("spark.logConf\ttrue\n")
            # defaults_file.write("spark.python.worker.memory\t512m")
            defaults_file.write("spark.eventLog.enabled\ttrue\n")
            defaults_file.write("spark.eventLog.dir\t" + self.event_log_dir +
                                "\n")
Example #12
def parse_omegawatt(args):
    """
    source: https://gitlab.inria.fr/delamare/wattmetre-read/raw/master/tools/getwatt.py
    :param args: Script argument
    :return: A dict mapping each timestamp to the mean power value across PDUs.
    """

    watt = {}
    node_wattmetre = get_host_attributes(
        args.node_name)['sensors']['power']['via']['pdu'][0]
    from_ts = int(args.timestamp_start)
    to_ts = int(args.timestamp_stop)

    for ts in range(from_ts, to_ts + 3600, 3600):
        suffix = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%dT%H')
        if suffix != datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y-%m-%dT%H'):
            suffix += ".gz"
        req = requests.get("http://wattmetre." + args.city_name +
                           ".grid5000.fr/data/" + node_wattmetre['uid'] +
                           "-log/power.csv." + suffix)
        # Stop fetching on a missing file; partial data is still averaged below
        if req.status_code == 404:
            break
        data = req.content
        if suffix.endswith(".gz"):
            data = gzip.decompress(data)
        # Skip the CSV header and the trailing partial line
        for l in data.decode().split('\n')[1:-1]:
            l = l.split(',')
            if l[3] == 'OK' and l[4 + node_wattmetre['port']] != '':
                ts, value = (int(np.round(float(l[2]))),
                             float(l[4 + node_wattmetre['port']]))
                if from_ts <= ts <= to_ts:
                    if ts not in watt:
                        watt[ts] = [0, 0]
                    watt[ts][0] += value
                    watt[ts][1] += 1
        if not suffix.endswith(".gz"):
            break

    # Average the accumulated (sum, count) pairs across PDUs
    for ts, val in watt.items():
        watt[ts] = val[0] / val[1]
    return watt
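A hypothetical post-processing step for the returned dict, producing a time-ordered series:

series = sorted(parse_omegawatt(args).items())  # [(ts, mean watt), ...]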
Example #13
 def get_nodes(self, comb):
     """
         Perform a submission for a given comb and 
         retrieve the submission node list
     """
     logger.info('Performing submission')
     n_core = get_host_attributes(comb['cluster'] +
                                  '-1')['architecture']['smt_size']
     submission = OarSubmission(
         resources="nodes=%d" % (max(1, comb['cores'] / n_core), ),
         sql_properties="cluster='%s'" % comb['cluster'],
         job_type="besteffort",
         name="l2c_fft_eval")
     self.oar_job_id, self.frontend = oarsub([
         (submission, get_cluster_site(comb['cluster']))
     ])[0]
     logger.info("Waiting for job start")
     wait_oar_job_start(self.oar_job_id, self.frontend)
     logger.info("Retrieving hosts list")
     nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
     self.hosts = [host for host in nodes for i in range(n_core)]
Example #14
def get_pdu_ip_and_port(args):
    """
    Return each PDU's IP address and the port associated with the current node
    :param args: Script argument
    :return: List of (PDU name, IP, port) tuples
    """
    data = get_host_attributes(args.node_name)

    # Resolve the IP address and port of each PDU feeding the node
    pdus_infos = []
    for pdu_info in data['sensors']['power']['via']['pdu']:
        pdu_name = pdu_info['uid']
        port = pdu_info['port']
        ip = socket.gethostbyname(pdu_name + "." + args.city_name +
                                  ".grid5000.fr")
        pdus_infos.append((pdu_name, ip, port))

    return pdus_infos
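A hypothetical driver for the helper above, assuming an argparse-style args object; the node and site names are illustrative:

import argparse

args = argparse.Namespace(node_name='ecotype-12', city_name='nantes')
for name, ip, port in get_pdu_ip_and_port(args):
    print('%s (%s), port %s' % (name, ip, port))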
Example #15
 def define_parameters(self):
     """Create the iterator on the parameters combinations to be explored"""
     # fixed number of nodes
     self.n_nodes = 4
     # choose a list of clusters
     clusters = ['graphene', 'petitprince', 'edel', 'paradent', 'stremi']
     #clusters = ['petitprince', 'paradent']
     # compute the maximum number of cores among all clusters
     max_core = self.n_nodes * max([
             get_host_attributes(cluster + '-1')['architecture']['smt_size']
             for cluster in clusters])
     # define the parameters
     self.parameters = {
         'cluster' : clusters,
          'n_core': [i for i in takewhile(lambda i: i < max_core,
                                          (2**j for j in count(0, 1)))
                     if i >= self.n_nodes],
         'size' : ['A', 'B', 'C']
         }
     logger.info(self.parameters)
     # define the iterator over the parameters combinations
     self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                 sweep(self.parameters))
     logger.info('Number of parameters combinations %s' % len(self.sweeper.get_remaining()))
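The ParamSweeper protocol used throughout these examples, as a minimal standalone sketch; the persistence directory and parameters are illustrative:

import os
from execo_engine import ParamSweeper, sweep

params = {'cluster': ['graphene'], 'size': ['A', 'B']}
sweeper = ParamSweeper(os.path.join('/tmp', 'sweeps'), sweep(params))
comb = sweeper.get_next()  # claim a pending combination
sweeper.done(comb)         # or sweeper.skip(comb) / sweeper.cancel(comb)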
Example #16
    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.cluster = self.args[0]
        self.site = get_cluster_site(self.cluster)
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
                # Hosts deployment
                deployed, undeployed = deploy(Deployment(self.hosts, 
                    env_file="/home/mliroz/deploys/hadoop6.env"))
                logger.info("%i deployed, %i undeployed" % (len(deployed), 
                                                            len(undeployed)))
                if len(deployed) == 0:
                    break
                # System configuration => look at the execo_g5k.topology module
                attr = get_host_attributes(self.cluster + '-1')
                
                ## SETUP FINISHED
                
                # Getting the next combination
                comb = self.sweeper.get_next()
                self.prepare_dataset(comb)
                self.xp(comb)
                # subloop over the combinations that have the same sizes
                while True:
                    newcomb = self.sweeper.get_next(lambda r: [
                        subcomb for subcomb in r
                        if subcomb['sizes'] == comb['sizes']])
                    if newcomb:
                        try:
                            self.xp(newcomb)
                        except Exception:
                            break
                    else:
                        break

                if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging') 
Example #17
def get_ip(node):
    """return the ip of the given host"""
    host = get_host_attributes(node)
    for i in host['network_adapters']:
        if 'ip' in i:
            return config['prefix'] + i['ip'].split('.')[3], i['ip']
Example #18
    def __init__(self, name, hosts):
        super(G5kPhysicalCluster, self).__init__(name, hosts)

        host_attrs = get_host_attributes(hosts[0])
        self._num_cores = host_attrs[u'architecture'][u'smt_size']
        self._memory = host_attrs[u'main_memory'][u'ram_size'] // (1024 * 1024)
Example #19
    def get_memory_and_cores(self, host):
        host_attrs = get_host_attributes(host)
        cores = host_attrs[u'architecture'][u'nb_cores']
        mem = host_attrs[u'main_memory'][u'ram_size'] // (1024 * 1024)

        return mem, cores
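Note that ram_size in the Grid'5000 reference API is expressed in bytes, so the floor division by 1024 * 1024 above yields MiB.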
Example #20
	def run(self):
		# Defining experiment parameters
		self.parameters = {
			'n_clients': [400, 450, 500, 550, 600],
			'n_transitions': [10000]
		}
		cluster = 'griffon'
		sweeps = sweep(self.parameters)
		sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
		server_out_path = os.path.join(self.result_dir, "server.out")
		
		self._updateStat(sweeper.stats())
		
		# Loop on the number of nodes
		while True:
			# Taking the next parameter combinations
			comb = sweeper.get_next()
			if not comb: break

			# Performing the submission on G5K
			site = get_cluster_site(cluster)
			self._log("Output will go to " + self.result_dir)
			
			n_nodes = int(math.ceil(float(comb['n_clients']) / EX5.get_host_attributes(cluster + '-1')['architecture']['smt_size'])) + 1
			self._log("Reserving {0} nodes on {1}".format(n_nodes, site))
			
			resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes)
			submission = EX5.OarSubmission(resources = resources, job_type = 'allow_classic_ssh', walltime ='00:10:00')
			
			job = EX5.oarsub([(submission, site)])
			self.__class__._job = job
			
			# Sometimes oarsub fails silently
			if job[0][0] is None:
				print("\nError: no job was created")
				sys.exit(1)
				
			# Wait for the job to start
			self._log("Waiting for job {0} to start...\n".format(BOLD_MAGENTA + str(job[0][0]) + NORMAL))
			EX5.wait_oar_job_start(job[0][0], job[0][1], prediction_callback = prediction)
			nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1])
			
			# Deploying nodes
			#deployment = EX5.Deployment(hosts = nodes, env_file='path_to_env_file')
			#run_deploy = EX5.deploy(deployment)
			#nodes_deployed = run_deploy.hosts[0]
			
			# Copying the active_data program to the first node
			EX.Put([nodes[0]], '../dist/active-data-lib-0.1.2.jar', connexion_params = {'user': '******'}).run()
			EX.Put([nodes[0]], '../server.policy', connexion_params = {'user': '******'}).run()
			
			# Loop on the number of requests per client process
			while True:
				# Split the nodes
				clients = nodes[1:]
				server = nodes[0] 
				
				self._log("Running experiment with {0} nodes and {1} transitions per client".format(len(clients), comb['n_transitions']))
				
				# Launching Server on one node
				out_handler = FileOutputHandler(server_out_path)
				launch_server = EX.Remote('java -jar active-data-lib-0.1.2.jar', [server], stdout_handler = out_handler, stderr_handler = out_handler).start()
				self._log("Server started on " + server.address)
				time.sleep(2)
				
				# Launching clients
				rank = 0
				n_cores = EX5.get_host_attributes(clients[0])['architecture']['smt_size']
				cores = nodes * n_cores
				cores = cores[0:comb['n_clients']] # Cut out the additional cores
				
				client_connection_params = {
						'taktuk_gateway': 'lyon.grid5000.fr',
						'host_rewrite_func': None
				}
				
				self._log("Launching {0} clients...".format(len(cores)))
				
				client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \
								"{0} {1} {2} {3} {4}".format(server.address, 1200, "{{range(len(cores))}}", len(cores), comb['n_transitions'])
				client_out_handler = FileOutputHandler(os.path.join(self.result_dir, "clients.out"))
				client_request = EX.TaktukRemote(client_cmd, cores, connexion_params = client_connection_params, \
									stdout_handler = client_out_handler, stderr_handler = client_out_handler)
				
				client_request.run()
				
				if not client_request.ok():
					# Some client failed, please panic
					self._log("One or more client process failed. Enjoy reading their outputs.")
					self._log("OUTPUT STARTS -------------------------------------------------\n")
					for process in client_request.processes():
						print("----- {0} returned {1}".format(process.host().address, process.exit_code()))
						if not process.stdout() == "": print(GREEN + process.stdout() + NORMAL)
						if not process.stderr() == "": print(RED + process.stderr() + NORMAL)
						print("")
					self._log("OUTPUT ENDS ---------------------------------------------------\n")
					sweeper.skip(comb)
					launch_server.kill()
					launch_server.wait()
				else:
					# Waiting for server to end
					launch_server.wait()
				
					# Getting log files
					distant_path = OUT_FILE_FORMAT.format(len(cores), comb['n_transitions'])
					local_path = distant_path
					
					EX.Get([server], distant_path).run()
					
					EX.Local('mv ' + distant_path + ' ' + os.path.join(self.result_dir, local_path)).run()
					
					EX.Get([server], 'client_*.out', local_location = self.result_dir)
					EX.Remote('rm -f client_*.out', [server])
					
					self._log("Finishing experiment with {0} clients and {1} transitions per client".format(comb['n_clients'], comb['n_transitions']))
					
					sweeper.done(comb)
					
				sub_comb = sweeper.get_next(filtr=lambda r: [s for s in r if s["n_clients"] == comb['n_clients']])
				self._updateStat(sweeper.stats())
				
				if not sub_comb: 
					# Killing job
					EX5.oar.oardel(job)
					self.__class__._job = None
					break
				else: 
					comb = sub_comb
		
		print ""
Example #21
    def _configure_servers(self, hosts=None):
        """Configure servers and host-dependant parameters.

           Args:
             hosts (list of Host, optional):
               The list of hosts to take into account in the configuration. If
               not specified, all the hosts of the Hadoop cluster are used. The
               first host of this list is always used as the reference.
        """

        if not hosts:
            hosts = self.hosts

        host_attrs = get_host_attributes(hosts[0])
        num_cores = host_attrs[u'architecture'][u'smt_size']
        available_memory = (int(host_attrs[u'main_memory'][u'ram_size']) //
                            (1024 * 1024))
        total_memory_mb = min(available_memory - 2 * 1024,
                              int(0.75 * available_memory))
        # Floor division keeps the values integral for the XML settings below
        mem_per_task_mb = total_memory_mb // (num_cores - 1)

        replace_in_xml_file(os.path.join(self.temp_conf_dir, CORE_CONF_FILE),
                            "fs.defaultFS",
                            "hdfs://" + self.master.address + ":" +
                                        str(self.hdfs_port) + "/",
                            True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, CORE_CONF_FILE),
                            "hadoop.tmp.dir",
                            self.hadoop_temp_dir, True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, CORE_CONF_FILE),
                            "topology.script.file.name",
                            self.conf_dir + "/topo.sh", True)

        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.framework.name", "yarn", True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.map.memory.mb",
                            str(mem_per_task_mb), True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.map.java.opts",
                            "-Xmx" + str(mem_per_task_mb) + "m", True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.map.cpu.vcores", "1", True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.reduce.memory.mb",
                            str(mem_per_task_mb), True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.reduce.cpu.vcores", "1", True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.reduce.java.opts",
                            "-Xmx" + str(mem_per_task_mb) + "m", True)

        replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                            "yarn.resourcemanager.hostname",
                            self.master.address, True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                            "yarn.nodemanager.resource.memory-mb",
                            str(total_memory_mb), True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                            "yarn.nodemanager.resource.cpu-vcores",
                            str(num_cores - 1), True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                            "yarn.scheduler.maximum-allocation-mb",
                            str(total_memory_mb), True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                            "yarn.nodemanager.aux-services",
                            "mapreduce_shuffle", True)
Example #22
    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.cluster = self.args[0]
        self.site = get_cluster_site(self.cluster)
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
                # Hosts deployment
                deployed, undeployed = deploy(
                    Deployment(self.hosts,
                               env_file="/home/mliroz/deploys/hadoop6.env"))
                logger.info("%i deployed, %i undeployed" %
                            (len(deployed), len(undeployed)))
                if len(deployed) == 0:
                    break
                # System configuration => look at the execo_g5k.topology module
                attr = get_host_attributes(self.cluster + '-1')

                ## SETUP FINISHED

                # Getting the next combination
                comb = self.sweeper.get_next()
                self.prepare_dataset(comb)
                self.xp(comb)
                # subloop over the combinations that have the same sizes
                while True:
                    newcomb = self.sweeper.get_next(lambda r: [
                        subcomb for subcomb in r
                        if subcomb['sizes'] == comb['sizes']])
                    if newcomb:
                        try:
                            self.xp(newcomb)
                        except Exception:
                            break
                    else:
                        break

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
Example #23
    def _configure_servers(self, hosts=None):
        """Configure servers and host-dependant parameters.

           Args:
             hosts (list of Host, optional):
               The list of hosts to take into account in the configuration. If
               not specified, all the hosts of the Hadoop cluster are used. The
               first host of this list is always used as the reference.
        """

        if not hosts:
            hosts = self.hosts

        host_attrs = get_host_attributes(hosts[0])
        num_cores = host_attrs[u'architecture'][u'smt_size']
        available_memory = (int(host_attrs[u'main_memory'][u'ram_size']) //
                            (1024 * 1024))
        total_memory_mb = min(available_memory - 2 * 1024,
                              int(0.75 * available_memory))
        # Floor division keeps the values integral for the XML settings below
        mem_per_task_mb = total_memory_mb // (num_cores - 1)

        replace_in_xml_file(
            os.path.join(self.temp_conf_dir, CORE_CONF_FILE), "fs.defaultFS",
            "hdfs://" + self.master.address + ":" + str(self.hdfs_port) + "/",
            True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, CORE_CONF_FILE),
                            "hadoop.tmp.dir", self.hadoop_temp_dir, True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, CORE_CONF_FILE),
                            "topology.script.file.name",
                            self.conf_dir + "/topo.sh", True)

        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.framework.name", "yarn", True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.map.memory.mb", str(mem_per_task_mb),
                            True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.map.java.opts",
                            "-Xmx" + str(mem_per_task_mb) + "m", True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.map.cpu.vcores", "1", True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.reduce.memory.mb", str(mem_per_task_mb),
                            True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.reduce.cpu.vcores", "1", True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, MR_CONF_FILE),
                            "mapreduce.reduce.java.opts",
                            "-Xmx" + str(mem_per_task_mb) + "m", True)

        replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                            "yarn.resourcemanager.address",
                            self.master.address + ":" + str(self.mapred_port),
                            True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                            "yarn.nodemanager.resource.memory-mb",
                            str(total_memory_mb), True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                            "yarn.nodemanager.resource.cpu-vcores",
                            str(num_cores - 1), True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                            "yarn.scheduler.maximum-allocation-mb",
                            str(total_memory_mb), True)
        replace_in_xml_file(os.path.join(self.temp_conf_dir, YARN_CONF_FILE),
                            "yarn.nodemanager.aux-services",
                            "mapreduce_shuffle", True)
Example #24
    def run(self):
        # Defining experiment parameters
        self.parameters = {
            'n_clients': [400, 450, 500, 550, 600],
            'n_transitions': [10000]
        }
        cluster = 'griffon'
        sweeps = sweep(self.parameters)
        sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
        server_out_path = os.path.join(self.result_dir, "server.out")

        self._updateStat(sweeper.stats())

        # Loop on the number of nodes
        while True:
            # Taking the next parameter combinations
            comb = sweeper.get_next()
            if not comb: break

            # Performing the submission on G5K
            site = get_cluster_site(cluster)
            self._log("Output will go to " + self.result_dir)

            n_nodes = int(
                math.ceil(
                    float(comb['n_clients']) / EX5.get_host_attributes(
                        cluster + '-1')['architecture']['smt_size'])) + 1
            self._log("Reserving {0} nodes on {1}".format(n_nodes, site))

            resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes)
            submission = EX5.OarSubmission(resources=resources,
                                           job_type='allow_classic_ssh',
                                           walltime='00:10:00')

            job = EX5.oarsub([(submission, site)])
            self.__class__._job = job

            # Sometimes oarsub fails silently
            if job[0][0] is None:
                print("\nError: no job was created")
                sys.exit(1)

            # Wait for the job to start
            self._log(
                "Waiting for job {0} to start...\n".format(BOLD_MAGENTA +
                                                           str(job[0][0]) +
                                                           NORMAL))
            EX5.wait_oar_job_start(job[0][0],
                                   job[0][1],
                                   prediction_callback=prediction)
            nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1])

            # Deploying nodes
            #deployment = EX5.Deployment(hosts = nodes, env_file='path_to_env_file')
            #run_deploy = EX5.deploy(deployment)
            #nodes_deployed = run_deploy.hosts[0]

            # Copying the active_data program to the first node
            EX.Put([nodes[0]],
                   '../dist/active-data-lib-0.1.2.jar',
                   connexion_params={
                       'user': '******'
                   }).run()
            EX.Put([nodes[0]],
                   '../server.policy',
                   connexion_params={
                       'user': '******'
                   }).run()

            # Loop on the number of requests per client process
            while True:
                # Split the nodes
                clients = nodes[1:]
                server = nodes[0]

                self._log(
                    "Running experiment with {0} nodes and {1} transitions per client"
                    .format(len(clients), comb['n_transitions']))

                # Launching Server on one node
                out_handler = FileOutputHandler(server_out_path)
                launch_server = EX.Remote(
                    'java -jar active-data-lib-0.1.2.jar', [server],
                    stdout_handler=out_handler,
                    stderr_handler=out_handler).start()
                self._log("Server started on " + server.address)
                time.sleep(2)

                # Launching clients
                rank = 0
                n_cores = EX5.get_host_attributes(
                    clients[0])['architecture']['smt_size']
                cores = nodes * n_cores
                cores = cores[
                    0:comb['n_clients']]  # Cut out the additional cores

                client_connection_params = {
                    'taktuk_gateway': 'lyon.grid5000.fr',
                    'host_rewrite_func': None
                }

                self._log("Launching {0} clients...".format(len(cores)))

                client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \
                    "{0} {1} {2} {3} {4}".format(server.address, 1200, "{{range(len(cores))}}", len(cores), comb['n_transitions'])
                client_out_handler = FileOutputHandler(
                    os.path.join(self.result_dir, "clients.out"))
                client_request = EX.TaktukRemote(client_cmd, cores, connexion_params = client_connection_params, \
                     stdout_handler = client_out_handler, stderr_handler = client_out_handler)

                client_request.run()

                if not client_request.ok():
                    # Some client failed, please panic
                    self._log(
                        "One or more client process failed. Enjoy reading their outputs."
                    )
                    self._log(
                        "OUTPUT STARTS -------------------------------------------------\n"
                    )
                    for process in client_request.processes():
                        print("----- {0} returned {1}".format(
                            process.host().address, process.exit_code()))
                        if not process.stdout() == "":
                            print(GREEN + process.stdout() + NORMAL)
                        if not process.stderr() == "":
                            print(RED + process.stderr() + NORMAL)
                        print("")
                    self._log(
                        "OUTPUT ENDS ---------------------------------------------------\n"
                    )
                    sweeper.skip(comb)
                    launch_server.kill()
                    launch_server.wait()
                else:
                    # Waiting for server to end
                    launch_server.wait()

                    # Getting log files
                    distant_path = OUT_FILE_FORMAT.format(
                        len(cores), comb['n_transitions'])
                    local_path = distant_path

                    EX.Get([server], distant_path).run()

                    EX.Local('mv ' + distant_path + ' ' +
                             os.path.join(self.result_dir, local_path)).run()

                    EX.Get([server],
                           'client_*.out',
                           local_location=self.result_dir)
                    EX.Remote('rm -f client_*.out', [server])

                    self._log(
                        "Finishing experiment with {0} clients and {1} transitions per client"
                        .format(comb['n_clients'], comb['n_transitions']))

                    sweeper.done(comb)

                sub_comb = sweeper.get_next(filtr=lambda r: [
                    s for s in r if s["n_clients"] == comb['n_clients']])
                self._updateStat(sweeper.stats())

                if not sub_comb:
                    # Killing job
                    EX5.oar.oardel(job)
                    self.__class__._job = None
                    break
                else:
                    comb = sub_comb

        print ""
Example #25
    def run(self):
        """ """
        if self.options.oargrid_job_id:
            self.oargrid_job_id = self.options.oargrid_job_id
        else:
            self.oargrid_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oargrid_job_id is None:
                    self.make_reservation()
                # Wait for the job to start
                logger.info('Waiting for the job to start')
                wait_oargrid_job_start(self.oargrid_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
                # Hosts deployment and configuration

                default_connection_params['user'] = '******'

                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')
                deployment = Deployment(
                    hosts=self.hosts,
                    env_file='/home/sirimie/env/mywheezy-x64-base.env')
                self.hosts, _ = deploy(deployment)

                Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*",
                       self.hosts).run()
                Remote(
                    "rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml",
                    self.hosts).run()
                Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml",
                       self.hosts).run()

                Put(self.hosts, [
                    "run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                    "platform_aws.xml", "cloud_ec2.xml"
                ],
                    remote_location="/home/Work/sgcbntier/paasage_demo/").run(
                    )
                logger.info("Done")

                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = [
                    host for host in self.hosts for i in range(
                        get_host_attributes(host)['architecture']['smt_size'])
                ]

                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/csv_results'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]

                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    self.oargrid_job_id = None

        finally:
            if self.oargrid_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oargriddel([self.oargrid_job_id])
                else:
                    logger.info('Keeping job alive for debugging')
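Example #25 calls an is_job_alive helper that is not shown. A minimal sketch of what it could look like, assuming the execo_g5k oargrid API (get_oargrid_job_oar_jobs and get_oar_job_info); this is an illustration, not the author's implementation:

from execo_g5k import get_oargrid_job_oar_jobs, get_oar_job_info

def is_job_alive(self):
    # Consider the job dead as soon as one OAR sub-job is in Error
    for job_id, site in get_oargrid_job_oar_jobs(self.oargrid_job_id):
        if get_oar_job_info(job_id, site).get('state') == 'Error':
            return False
    return True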