def _enable_bridge(self, name='br0'):
    """We need a bridge to have automatic DHCP configuration for the VM.

    Destroys any wrongly-named bridge found on the hosts, then creates
    the bridge `name` on every host lacking it and waits (via nmap on
    port 22) for the network to come back up.

    :param name: the name of the bridge to ensure on every host
    """
    logger.detail('Configuring the bridge')
    hosts_br = self._get_bridge(self.hosts)
    nobr_hosts = []
    for host, br in hosts_br.iteritems():
        if br is None:
            logger.debug('No bridge on host %s', style.host(host))
            nobr_hosts.append(host)
        elif br != name:
            logger.debug('Wrong bridge on host %s, destroying it',
                         style.host(host))
            SshProcess('ip link set ' + br + ' down ; brctl delbr ' + br,
                       host).run()
            nobr_hosts.append(host)
        else:
            # FIX: log the actual bridge name, not the literal string 'name'
            logger.debug('Bridge %s is present on host %s',
                         style.emph(name), style.host(host))
    nobr_hosts = map(lambda x: x.address if isinstance(x, Host) else x,
                     nobr_hosts)
    if len(nobr_hosts) > 0:
        logger.debug('Creating bridge on %s', hosts_list(nobr_hosts))
        # Rewrite /etc/network/interfaces so the default interface is
        # enslaved to the new bridge, which takes over DHCP.
        script = 'export br_if=`ip route |grep default |cut -f 5 -d " "`; \n' + \
            'ifdown $br_if ; \n' + \
            'sed -i "s/$br_if inet dhcp/$br_if inet manual/g" /etc/network/interfaces ; \n' + \
            'sed -i "s/auto $br_if//g" /etc/network/interfaces ; \n' + \
            'echo " " >> /etc/network/interfaces ; \n' + \
            'echo "auto ' + name + '" >> /etc/network/interfaces ; \n' + \
            'echo "iface ' + name + ' inet dhcp" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_ports $br_if" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_stp off" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_maxwait 0" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_fd 0" >> /etc/network/interfaces ; \n' + \
            'ifup ' + name
    fd, br_script = mkstemp(dir='/tmp/', prefix='create_br_')
    f = fdopen(fd, 'w')
    f.write(script)
    f.close()
    self.fact.get_fileput(nobr_hosts, [br_script]).run()
    self.fact.get_remote('nohup sh ' + br_script.split('/')[-1],
                         nobr_hosts).run()
    logger.debug('Waiting for network restart')
    if_up = False
    nmap_tries = 0
    # Poll with nmap until every host answers on port 22 again
    # (at most 20 tries, 20 s apart).
    while (not if_up) and nmap_tries < 20:
        sleep(20)
        nmap_tries += 1
        nmap = Process('nmap ' +
                       ' '.join([host for host in nobr_hosts]) +
                       ' -p 22').run()
        for line in nmap.stdout.split('\n'):
            if 'Nmap done' in line:
                # "Nmap done: X IP addresses (Y hosts up)" -> up when X == Y
                if_up = line.split()[2] == line.split()[5].replace('(', '')
    logger.debug('Network has been restarted')
    logger.detail('All hosts have the bridge %s', style.emph(name))
def _enable_bridge(self, name='br0'):
    """We need a bridge to have automatic DHCP configuration for the VM.

    Destroys any wrongly-named bridge found on the hosts, then creates
    the bridge `name` on every host lacking it and waits (via nmap on
    port 22) for the network to come back up.

    :param name: the name of the bridge to ensure on every host
    """
    logger.detail('Configuring the bridge')
    hosts_br = self._get_bridge(self.hosts)
    nobr_hosts = []
    for host, br in hosts_br.iteritems():
        if br is None:
            logger.debug('No bridge on host %s', style.host(host))
            nobr_hosts.append(host)
        elif br != name:
            logger.debug('Wrong bridge on host %s, destroying it',
                         style.host(host))
            SshProcess('ip link set ' + br + ' down ; brctl delbr ' + br,
                       host).run()
            nobr_hosts.append(host)
        else:
            # FIX: log the actual bridge name, not the literal string 'name'
            logger.debug('Bridge %s is present on host %s',
                         style.emph(name), style.host(host))
    nobr_hosts = map(lambda x: x.address if isinstance(x, Host) else x,
                     nobr_hosts)
    if len(nobr_hosts) > 0:
        logger.debug('Creating bridge on %s', hosts_list(nobr_hosts))
        # Rewrite /etc/network/interfaces so the default interface is
        # enslaved to the new bridge, which takes over DHCP.
        script = 'export br_if=`ip route |grep default |cut -f 5 -d " "`; \n' + \
            'ifdown $br_if ; \n' + \
            'sed -i "s/$br_if inet dhcp/$br_if inet manual/g" /etc/network/interfaces ; \n' + \
            'sed -i "s/auto $br_if//g" /etc/network/interfaces ; \n' + \
            'echo " " >> /etc/network/interfaces ; \n' + \
            'echo "auto ' + name + '" >> /etc/network/interfaces ; \n' + \
            'echo "iface ' + name + ' inet dhcp" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_ports $br_if" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_stp off" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_maxwait 0" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_fd 0" >> /etc/network/interfaces ; \n' + \
            'ifup ' + name
    fd, br_script = mkstemp(dir='/tmp/', prefix='create_br_')
    f = fdopen(fd, 'w')
    f.write(script)
    f.close()
    self.fact.get_fileput(nobr_hosts, [br_script]).run()
    self.fact.get_remote('nohup sh ' + br_script.split('/')[-1],
                         nobr_hosts).run()
    logger.debug('Waiting for network restart')
    if_up = False
    nmap_tries = 0
    # Poll with nmap until every host answers on port 22 again
    # (at most 20 tries, 20 s apart).
    while (not if_up) and nmap_tries < 20:
        sleep(20)
        nmap_tries += 1
        nmap = Process('nmap ' +
                       ' '.join([host for host in nobr_hosts]) +
                       ' -p 22').run()
        for line in nmap.stdout.split('\n'):
            if 'Nmap done' in line:
                # "Nmap done: X IP addresses (Y hosts up)" -> up when X == Y
                if_up = line.split()[2] == line.split()[5].replace('(', '')
    logger.debug('Network has been restarted')
    logger.detail('All hosts have the bridge %s', style.emph(name))
def __init__(self, hosts, topo_list=None, config_file=None): """Create a new Hadoop cluster with the given hosts and topology. Args: hosts (list of Host): The hosts to be assigned a topology. topo_list (list of str, optional): The racks to be assigned to each host. len(hosts) should be equal to len(topo_list). configFile (str, optional): The path of the config file to be used. """ # Load cluster properties config = ConfigParser(self.defaults) config.add_section("cluster") config.add_section("local") if config_file: config.readfp(open(config_file)) self.base_dir = config.get("cluster", "hadoop_base_dir") self.conf_dir = config.get("cluster", "hadoop_conf_dir") self.logs_dir = config.get("cluster", "hadoop_logs_dir") self.hadoop_temp_dir = config.get("cluster", "hadoop_temp_dir") self.hdfs_port = config.getint("cluster", "hdfs_port") self.mapred_port = config.getint("cluster", "mapred_port") self.local_base_conf_dir = config.get("local", "local_base_conf_dir") self.bin_dir = self.base_dir + "/bin" self.sbin_dir = self.base_dir + "/bin" # Configure master and slaves self.hosts = hosts self.master = hosts[0] # Create topology self.topology = HadoopTopology(hosts, topo_list) # Store cluster information self.host_clusters = {} for h in self.hosts: g5k_cluster = get_host_cluster(h) if g5k_cluster in self.host_clusters: self.host_clusters[g5k_cluster].append(h) else: self.host_clusters[g5k_cluster] = [h] # Create a string to display the topology t = {v: [] for v in self.topology.topology.values()} for key, value in self.topology.topology.iteritems(): t[value].append(key.address) log_topo = ', '.join([style.user2(k) + ': ' + ' '.join(map(lambda x: style.host(x.split('.')[0]), v)) for k, v in t.iteritems()]) logger.info("Hadoop cluster created with master %s, hosts %s and topology %s", style.host(self.master.address), ' '.join([style.host(h.address.split('.')[0]) for h in self.hosts]), log_topo)
def workflow(self, comb):
    """ Compute one application launch using a given parameter group.

    Generates the assembly (LAD) file, runs the L2C launcher through MPI
    with stdout/stderr captured to per-combination files, cleans up the
    generated files, then marks the combination done or canceled.

    :param comb: parameter dict with 'cores', 'px', 'datasize' and
        'transposition' keys
    """
    comb_ok = False
    try:
        # Generate configuration file needed by MPI processes
        logger.info("Generating assembly file...")
        py = comb['cores'] / comb['px']
        prepare = Process('cd %s && python %s %d %d %d %d %d %s app.lad' %
                          (self.workingPath, self.genLadScript,
                           comb['datasize'], comb['datasize'],
                           comb['datasize'], comb['px'], py,
                           comb['transposition']))
        prepare.shell = True
        prepare.run()
        # Generate the MPI host file
        mfile = self.generate_machine_file()
        # Start L2C
        lad = "./app.lad"
        logger.info("Computing...")
        res = Process("export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && l2c_loader -M,-machinefile,%s --mpi -c %d %s" % (self.workingPath, mfile, comb['cores'], lad))
        res.shell = True
        # Tee the launcher output to result files and to the console
        res.stdout_handlers.append(os.path.join(self.result_dir,
                                                slugify(comb) + '.out'))
        res.stdout_handlers.append(sys.stdout)
        res.stderr_handlers.append(os.path.join(self.result_dir,
                                                slugify(comb) + '.err'))
        res.stderr_handlers.append(sys.stderr)
        res.run()
        if not res.ok:
            logger.error('Bad L2C termination')
            raise Exception('Bad L2C termination')
        if len(res.stderr) > 0:
            # WARNING: when L2C cannot find the LAD file or something
            # strange like this
            logger.warning('Not empty error output')
        # Clean configuration files
        logger.info("Removing assembly files...")
        res = Process('cd %s && rm -f app.lad*' % self.workingPath)
        res.shell = True
        res.run()
        comb_ok = True
    except Exception:
        # FIX: the original swallowed the exception silently (`pass`),
        # making failures undiagnosable; log it. The combination is still
        # canceled in the finally block below.
        logger.exception('Workflow failed for %s', slugify(comb))
    finally:
        if comb_ok:
            self.sweeper.done(comb)
            logger.info(style.host(slugify(comb)) + ' has been done')
        else:
            self.sweeper.cancel(comb)
            logger.warning(style.host(slugify(comb)) + ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
def hosts_list(hosts, separator=' ', site=False):
    """Return a formatted string from a list of hosts.

    :param hosts: list of execo.Host or of host name strings
    :param separator: string inserted between formatted host names
    :param site: when True, keep the site part (first two dot-components)
        of the host name instead of only the short name
    """
    # FIX: the original deep-copied and sorted the list twice (once on
    # Host objects, once on the address strings inside the join); a single
    # sort of the extracted addresses yields the same result.
    addresses = [h.address if isinstance(h, Host) else h for h in hosts]
    addresses.sort()
    if site:
        return separator.join([style.host(a.split('.')[0] + '.' +
                                          a.split('.')[1])
                               for a in addresses])
    else:
        return separator.join([style.host(a.split('.')[0])
                               for a in addresses])
def hosts_list(hosts, separator=' ', site=False):
    """Return a formatted string from a list of hosts.

    :param hosts: list of execo.Host or of host name strings
    :param separator: string inserted between formatted host names
    :param site: when True, keep the site part (first two dot-components)
        of the host name instead of only the short name
    """
    # FIX: the original deep-copied and sorted the list twice (once on
    # Host objects, once on the address strings inside the join); a single
    # sort of the extracted addresses yields the same result.
    addresses = [h.address if isinstance(h, Host) else h for h in hosts]
    addresses.sort()
    if site:
        return separator.join([
            style.host(a.split('.')[0] + '.' + a.split('.')[1])
            for a in addresses
        ])
    else:
        return separator.join(
            [style.host(a.split('.')[0]) for a in addresses])
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an oargrid_job_id.
        If a file is used, each host should be in a different line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        # FIX: use a context manager so the file handle is closed promptly
        with open(hosts_input) as host_file:
            for line in host_file:
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the file_name is a number, we assume this is a oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    logger.debug('Hosts list: \n%s',
                 ' '.join(style.host(host.address.split('.')[0])
                          for host in hosts))
    return hosts
def _munin_server(server, clients):
    """Install the monitoring service munin. Must be executed inside
    Grid'5000 to be able to resolve the server and clients IP.

    :param server: a execo.Host

    :param clients: a list of execo.Hosts
    """
    logger.info(
        'Munin monitoring service installation, server = %s, clients = \n %s',
        server.address, [host.address for host in clients])
    # FIX: log the actual server name instead of the literal string 'server'
    logger.debug('Configuring munin server %s', style.host(server.address))
    cmd = 'export DEBIAN_MASTER=noninteractive ; apt-get update && apt-get install -y munin'
    SshProcess(cmd, server).run()
    logger.debug('Creating configuration files for server')
    fd, server_conf = mkstemp(dir='/tmp/', prefix='munin-nodes_')
    f = fdopen(fd, 'w')
    for host in clients:
        get_ip = Process('host ' + host.address).run()
        # Assumes `host` answers "<name> has address <ip>" so the IP is the
        # 4th whitespace-separated token — TODO confirm for multi-record
        # or IPv6 answers.
        ip = get_ip.stdout.strip().split(' ')[3]
        f.write('[' + host.address + ']\n address ' + ip +
                '\n use_node_name yes\n\n')
    f.close()
    Put([server], [server_conf], remote_location='/etc/').run()
    SshProcess('cd /etc && cp ' + server_conf.split('/')[-1] + ' munin.conf',
               server).run()
    Process('rm ' + server_conf).run()
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an oargrid_job_id.
        If a file is used, each host should be in a different line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        # FIX: use a context manager so the file handle is closed promptly
        with open(hosts_input) as host_file:
            for line in host_file:
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the file_name is a number, we assume this is a oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    logger.debug(
        'Hosts list: \n%s',
        ' '.join(style.host(host.address.split('.')[0]) for host in hosts))
    return hosts
def __force_clean(self):
    """Stop previous Spark processes (if any) and remove all remote
    files created by it."""
    spark_processes = ["Master", "Worker"]
    force_kill = False
    for h in self.hosts:
        proc = SshProcess("jps", h)
        proc.run()
        ids_to_kill = []
        for line in proc.stdout.splitlines():
            field = line.split()
            # FIX: jps may print lines with a pid but no class name;
            # guard the length so field[1] cannot raise IndexError.
            if len(field) > 1 and field[1] in spark_processes:
                ids_to_kill.append(field[0])
        if ids_to_kill:
            force_kill = True
            # Build " pid1 pid2 ..." to append to the kill command
            ids_to_kill_str = ""
            for pid in ids_to_kill:
                ids_to_kill_str += " " + pid
            logger.warn("Killing running Spark processes in host %s" %
                        style.host(h.address.split('.')[0]))
            proc = SshProcess("kill -9" + ids_to_kill_str, h)
            proc.run()
    if force_kill:
        logger.info(
            "Processes from previous hadoop deployments had to be killed")
    self.clean_logs()
def get_server_iface(server):
    """Return the name of the default network interface of the server.

    :param server: an execo.Host whose default route interface is queried
    """
    logger.debug('Retrieving default interface from %s',
                 style.host(server.address))
    default_if = SshProcess('ip route |grep default |cut -d " " -f 5',
                            server).run()
    return default_if.stdout.strip()
def add_host(self, host, data=None):
    """Add a host in the graph

    :param host: a string corresponding to the node name

    :param data: a dict containing the Grid'5000 host attributes"""
    if isinstance(host, Host):
        _host = get_host_shortname(host.address)
    else:
        _host = host
    if data:
        power = data['performance']['core_flops']
        cores = data['architecture']['nb_cores']
    else:
        power = 0
        cores = 0
    # FIX: fetch the adapters once instead of calling get_host_adapters
    # twice (for the length test and again for the loop).
    adapters = self.get_host_adapters(_host)
    if len(adapters) > 0:
        logger.debug('Adding %s', style.host(_host))
        self.add_node(_host, {'kind': 'node',
                              'power': power,
                              'cores': cores})
        for eq in adapters:
            if eq['mounted']:
                self.add_equip(eq['switch'], get_host_site(_host))
    else:
        logger.warning('Node %s has no valid network connection',
                       _host)
def add_host(self, host, data=None):
    """Add a host in the graph

    :param host: a string corresponding to the node name

    :param data: a dict containing the Grid'5000 host attributes"""
    if isinstance(host, Host):
        _host = get_host_shortname(host.address)
    else:
        _host = host
    if data:
        power = data['performance']['core_flops']
        cores = data['architecture']['nb_cores']
    else:
        power = 0
        cores = 0
    # FIX: fetch the adapters once instead of calling get_host_adapters
    # twice (for the emptiness test and again for the loop).
    adapters = self.get_host_adapters(_host)
    if len(adapters) == 0:
        logger.warning('Node %s has no valid network connection',
                       _host)
    logger.debug('Adding %s', style.host(_host))
    self.add_node(_host, {'kind': 'node',
                          'power': power,
                          'cores': cores})
    for eq in adapters:
        if eq['mounted']:
            self.add_equip(eq['switch'], get_host_site(_host))
    self._filter_equip_leaves()
def __init__(self, infile=None, resources=None, hosts=None, ip_mac=None, vlan=None, env_name=None, env_file=None, vms=None, distribution=None, outdir=None): """:param infile: an XML file that describe the topology of the deployment :param resources: a dict whose keys are Grid'5000 sites and values are dict, whose keys are hosts and ip_mac, where hosts is a list of execo.Host and ip_mac is a list of tuple (ip, mac). :param env_name: name of the Kadeploy environment :param env_file: path to the Kadeploy environment file :params vms: dict defining the virtual machines :params distribution: how to distribute the vms on the hosts (``round-robin`` , ``concentrated``, ``random``) :params outdir: directory to store the deployment files """ # set a factory for the deployment that use taktuk and chainput self.fact = ActionFactory(remote_tool=TAKTUK, fileput_tool=CHAINPUT, fileget_tool=TAKTUK) self.kavlan = None if not vlan else vlan self.kavlan_site = None if env_name is not None: self.env_file = None if ':' not in env_name: self.env_name, self.env_user = env_name, None else: self.env_user, self.env_name = env_name.split(':') else: if env_file is not None: self.env_name = None self.env_user = None self.env_file = env_file else: self.env_name = 'vm5k' self.env_user = '******' self.env_file = None if outdir: self.outdir = outdir else: self.outdir = 'vm5k_' + strftime("%Y%m%d_%H%M%S_%z") self.copy_actions = None self.state = Element('vm5k') self._define_elements(infile, resources, hosts, vms, ip_mac, distribution) logger.info('%s %s %s %s %s %s %s %s', len(self.sites), style.emph('sites'), len(self.clusters), style.user1('clusters'), len(self.hosts), style.host('hosts'), len(self.vms), style.vm('vms'))
def _munin_server(server, clients):
    """Install the monitoring service munin. Must be executed inside
    Grid'5000 to be able to resolve the server and clients IP.

    :param server: a execo.Host

    :param clients: a list of execo.Hosts
    """
    logger.info('Munin monitoring service installation, server = %s, clients = \n %s',
                server.address, [host.address for host in clients])
    # FIX: log the actual server name instead of the literal string 'server'
    logger.debug('Configuring munin server %s', style.host(server.address))
    cmd = 'export DEBIAN_MASTER=noninteractive ; apt-get update && apt-get install -y munin'
    SshProcess(cmd, server).run()
    logger.debug('Creating configuration files for server')
    fd, server_conf = mkstemp(dir='/tmp/', prefix='munin-nodes_')
    f = fdopen(fd, 'w')
    for host in clients:
        get_ip = Process('host '+host.address).run()
        # Assumes `host` answers "<name> has address <ip>" so the IP is the
        # 4th whitespace-separated token — TODO confirm for multi-record
        # or IPv6 answers.
        ip = get_ip.stdout.strip().split(' ')[3]
        f.write('['+host.address+']\n address '+ip+'\n use_node_name yes\n\n')
    f.close()
    Put([server], [server_conf], remote_location='/etc/').run()
    SshProcess('cd /etc && cp '+server_conf.split('/')[-1]+' munin.conf',
               server).run()
    Process('rm '+server_conf).run()
def run(self):
    """ Main engine method to perform the experiment.

    Iterates over the remaining parameter combinations: reserves nodes
    for a combination, runs its workflow, then greedily processes every
    other combination that can reuse the same submission (same cluster
    and same number of cores). The OAR job is deleted after each batch.
    """
    self.define_parameters()
    while len(self.sweeper.get_remaining()) > 0:
        # Getting the next combination
        comb = self.sweeper.get_next()
        logger.info(style.host(slugify(comb)) + ' has been started')
        self.get_nodes(comb)
        # If the job is broken, the program is stopped
        if get_oar_job_info(self.oar_job_id,
                            self.frontend)['state'] == 'Error':
            break
        try:
            self.workflow(comb)
            # Process all combinations that can use the same submission
            while True:
                # Find the next combination combinations that can use the
                # same submission
                subcomb = self.sweeper.get_next(lambda r: filter(
                    lambda x: x['cores'] == comb['cores']
                    and x['cluster'] == comb['cluster'], r))
                if not subcomb:
                    logger.info(
                        'No more combination for cluster=%s and cores=%s',
                        comb['cluster'], comb['cores'])
                    break
                else:
                    logger.info(
                        style.host(slugify(subcomb)) + ' has been started')
                    # Re-check the job state before each sub-run
                    if get_oar_job_info(self.oar_job_id,
                                        self.frontend)['state'] != 'Error':
                        self.workflow(subcomb)
                    else:
                        break
        # Whatever happens (errors, end of loop), the job is deleted
        finally:
            logger.info('Deleting job...')
            oardel([(self.oar_job_id, self.frontend)])
def rm_equip(self, equip):
    """Remove an equipment from the node"""
    logger.debug('Removing equip %s', style.host(equip))
    self.remove_node(equip)
    equip_kind = get_network_equipment_attributes(equip)['kind']
    if equip_kind == 'router':
        # Routers drag their linecard nodes (named after the router) along
        linecards = [n for n in self.nodes() if equip in n]
        logger.debug('Removing router linecard %s', ' '.join(linecards))
        self.remove_nodes_from(linecards)
def rm_host(self, host):
    """Remove the host from the graph"""
    logger.debug('Removing host %s', style.host(host))
    self.remove_node(host)
    # Drop any switch of the host that no longer serves other nodes
    for adapter in self.get_host_adapters(host):
        switch = adapter['switch']
        if self._equip_has_nodes(switch):
            continue
        logger.debug('Removing equip %s', switch)
        self.rm_equip(switch)
def restart_vms(vms):
    """Start every VM of `vms` that is not currently running on its host.

    :param vms: list of dicts with at least an 'id' (libvirt domain name)
        and a 'host' key
    """
    hosts = [vm['host'] for vm in vms]
    running_vms = list_vm(hosts)
    for vm in vms:
        # list_vm maps each host to a list of {'id': name} dicts of the
        # domains currently running there
        if {'id': vm['id']} not in running_vms[vm['host']]:
            logger.info('%s has not been started on %s, starting it',
                        style.vm(vm['id']), style.host(vm['host']))
            SshProcess('virsh start ' + vm['id'], vm['host']).run()
def get_server_ip(host):
    """Get the server IP"""
    hostname = host.address if isinstance(host, Host) else host
    logger.debug('Retrieving IP from %s', style.host(hostname))
    # Resolve via the `host` command and keep the 4th field (the address)
    resolver = Process('host ' + hostname + ' |cut -d \' \' -f 4')
    resolver.shell = True
    resolver.run()
    return resolver.stdout.strip()
def run(self):
    """ Main engine method to perform the experiment.

    Iterates over the remaining parameter combinations: reserves nodes
    for a combination, runs its workflow, then greedily processes every
    other combination that can reuse the same submission (same cluster
    and same number of cores). The OAR job is deleted after each batch.
    """
    self.define_parameters()
    while len(self.sweeper.get_remaining()) > 0:
        # Getting the next combination
        comb = self.sweeper.get_next()
        logger.info(style.host(slugify(comb)) + ' has been started')
        self.get_nodes(comb)
        # If the job is broken, the program is stopped
        if get_oar_job_info(self.oar_job_id,
                            self.frontend)['state'] == 'Error':
            break
        try:
            self.workflow(comb)
            # Process all combinations that can use the same submission
            while True:
                # Find the next combination combinations that can use the
                # same submission
                subcomb = self.sweeper.get_next(
                    lambda r: filter(lambda x: x['cores'] == comb['cores']
                                     and x['cluster'] == comb['cluster'],
                                     r))
                if not subcomb:
                    logger.info('No more combination for cluster=%s and cores=%s',
                                comb['cluster'], comb['cores'])
                    break
                else:
                    logger.info(style.host(slugify(subcomb)) +
                                ' has been started')
                    # Re-check the job state before each sub-run
                    if get_oar_job_info(self.oar_job_id,
                                        self.frontend)['state'] != 'Error':
                        self.workflow(subcomb)
                    else:
                        break
        # Whatever happens (errors, end of loop), the job is deleted
        finally:
            logger.info('Deleting job...')
            oardel([(self.oar_job_id, self.frontend)])
def setup_aptcacher_server(hosts, base_dir='/tmp/apt-cacher-ng'):
    """Install and configure apt-cacher on one server.

    :param hosts: hosts (names or execo.Host) that will run the proxy
    :param base_dir: directory that will hold the cache and the logs
    """
    hosts = map(Host, hosts)
    logger.info('Installing apt-cacher on %s',
                ','.join([style.host(host.address) for host in hosts]))
    logger.detail('Package')
    package = TaktukRemote('export DEBIAN_MASTER=noninteractive ; apt-get update ; ' +
                           'apt-get install -o Dpkg::Options::="--force-confdef" -o ' +
                           'Dpkg::Options::="--force-confnew" -y apt-cacher-ng',
                           hosts).run()
    if not package.ok:
        # FIX: the original passed a '%s' format with no argument;
        # supply the host list so the message is complete.
        logger.error('Unable to install apt-cacher-ng on %s',
                     ','.join([style.host(host.address) for host in hosts]))
        return
    logger.detail('Directory creation')
    log_dir = base_dir + '/log'
    cache_dir = base_dir + '/cache'
    mkdirs = TaktukRemote('mkdir -p ' + log_dir +
                          '; mkdir -p ' + cache_dir +
                          '; chown -R apt-cacher-ng:apt-cacher-ng ' + base_dir,
                          hosts).run()
    if not mkdirs.ok:
        logger.error('Unable to create the directories')
        return
    # Point the service at the custom dirs, change its port and set the
    # upstream proxy before restarting it.
    cmd = 'sed -i "s#/var/cache/apt-cacher-ng#' + cache_dir + \
        '#g" /etc/apt-cacher-ng/acng.conf ;' + \
        'sed -i "s#/var/log/apt-cacher-ng#' + log_dir + '#g" ' + \
        '/etc/apt-cacher-ng/acng.conf ;' + \
        'sed -i "s/3142/9999/g" /etc/apt-cacher-ng/acng.conf ; ' + \
        'sed -i "s?#Proxy: http://www-proxy.example.net:80?Proxy: ' + \
        'http://proxy:3128?g" /etc/apt-cacher-ng/acng.conf ; ' + \
        'service apt-cacher-ng restart'
    configure = TaktukRemote(cmd, hosts).run()
    if not configure.ok:
        logger.error('Unable to configure and restart the service')
        return
    logger.info('apt-cacher-ng up and running on %s',
                ','.join([style.host(host.address) for host in hosts]))
def setup_aptcacher_server(hosts, base_dir='/tmp/apt-cacher-ng'):
    """Install and configure apt-cacher on one server.

    :param hosts: hosts (names or execo.Host) that will run the proxy
    :param base_dir: directory that will hold the cache and the logs
    """
    hosts = map(Host, hosts)
    logger.info('Installing apt-cacher on %s',
                ','.join([style.host(host.address) for host in hosts]))
    logger.detail('Package')
    package = TaktukRemote(
        'export DEBIAN_MASTER=noninteractive ; apt-get update ; ' +
        'apt-get install -o Dpkg::Options::="--force-confdef" -o ' +
        'Dpkg::Options::="--force-confnew" -y apt-cacher-ng',
        hosts).run()
    if not package.ok:
        # FIX: the original passed a '%s' format with no argument;
        # supply the host list so the message is complete.
        logger.error('Unable to install apt-cacher-ng on %s',
                     ','.join([style.host(host.address) for host in hosts]))
        return
    logger.detail('Directory creation')
    log_dir = base_dir + '/log'
    cache_dir = base_dir + '/cache'
    mkdirs = TaktukRemote(
        'mkdir -p ' + log_dir + '; mkdir -p ' + cache_dir +
        '; chown -R apt-cacher-ng:apt-cacher-ng ' + base_dir,
        hosts).run()
    if not mkdirs.ok:
        logger.error('Unable to create the directories')
        return
    # Point the service at the custom dirs, change its port and set the
    # upstream proxy before restarting it.
    cmd = 'sed -i "s#/var/cache/apt-cacher-ng#' + cache_dir + \
        '#g" /etc/apt-cacher-ng/acng.conf ;' + \
        'sed -i "s#/var/log/apt-cacher-ng#' + log_dir + '#g" ' + \
        '/etc/apt-cacher-ng/acng.conf ;' + \
        'sed -i "s/3142/9999/g" /etc/apt-cacher-ng/acng.conf ; ' + \
        'sed -i "s?#Proxy: http://www-proxy.example.net:80?Proxy: ' + \
        'http://proxy:3128?g" /etc/apt-cacher-ng/acng.conf ; ' + \
        'service apt-cacher-ng restart'
    configure = TaktukRemote(cmd, hosts).run()
    if not configure.ok:
        logger.error('Unable to configure and restart the service')
        return
    logger.info('apt-cacher-ng up and running on %s',
                ','.join([style.host(host.address) for host in hosts]))
def __init__(self, hosts, config_file=None, sharding=True,
             replication=False):
    """Create a new MongoDB cluster with the given hosts.

    Args:
      hosts (list of Host):
        The hosts that conform the cluster. The first host becomes the
        master.
      config_file (str, optional):
        The path of the config file to be used.
      sharding (bool, optional):
        Whether sharding (mongos) should be enabled.
      replication (bool, optional):
        Whether replication should be enabled.
    """
    super(MongoDBCluster, self).__init__(hosts, config_file)
    # Cluster properties, keyed by the concrete cluster type
    ctype = self.get_cluster_type()
    self.data_dir = self.config.get("cluster", ctype + "_data_dir")
    self.logs_dir = self.config.get("cluster", ctype + "_logs_dir")
    self.md_port = self.config.getint("cluster", ctype + "_md_port")
    self.ms_port = self.config.getint("cluster", ctype + "_ms_port")
    self.bin_dir = self.base_dir + "/bin"
    self.conf_mandatory_files = [CONF_FILE]
    # Configure master
    self.master = hosts[0]
    self.do_sharding = sharding
    # Flipped to True once sharding has actually been initialized
    self.initialized_sharding = False
    self.mongos_pid_file = self.base_dir + "/mongos.pid"
    self.do_replication = replication
    # TODO: allow more fine-grained configuration of hosts: assign roles
    logger.info("MongoDB cluster created with master %s and hosts %s %s "
                "replication, %s sharding",
                style.host(self.master.address),
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]),
                "with" if self.do_replication else "without",
                "with" if self.do_sharding else "without")
def get_host_adapters(self, host):
    """Return the mountable network interfaces from a host.

    Returns an empty list when the host is unknown or its description
    is malformed.
    """
    try:
        if host in self.data['hosts']:
            return [m for m in self.data['hosts'][host]['network_adapters']
                    if 'switch' in m
                    and not m['management'] and m['mountable']
                    and m['switch'] and m['interface'] == 'Ethernet']
    except Exception:
        # FIX: narrowed from a bare `except:` (which would also swallow
        # KeyboardInterrupt/SystemExit).
        logger.warning("Wrong description for host %s" % style.host(host))
        # FIX: fetch the adapters defensively — the original re-indexed
        # self.data here and could raise again inside the handler.
        adapters = self.data['hosts'].get(host, {}).get('network_adapters')
        logger.debug("host's network_adapters = %s" % (adapters,))
    return []
def get_host_adapters(self, host):
    """Return the mountable network interfaces from a host.

    Returns an empty list when the host is unknown or its description
    is malformed.
    """
    try:
        if host in self.data['hosts']:
            return [
                m for m in self.data['hosts'][host]['network_adapters']
                if 'switch' in m
                and not m['management'] and m['mountable']
                and m['switch'] and m['interface'] == 'Ethernet'
            ]
    except Exception:
        # FIX: narrowed from a bare `except:` (which would also swallow
        # KeyboardInterrupt/SystemExit).
        logger.warning("Wrong description for host %s" % style.host(host))
        # FIX: fetch the adapters defensively — the original re-indexed
        # self.data here and could raise again inside the handler.
        adapters = self.data['hosts'].get(host, {}).get('network_adapters')
        logger.debug("host's network_adapters = %s" % (adapters,))
    return []
def get_hosts_list(self, hosts_str):
    """Generate a list of hosts from the given string.

    Args:
      hosts_str (str): The following options are supported

        - The path of the file containing the hosts to be used. Each host
          should be in a different line. Repeated hosts are pruned.
          Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

        - A comma-separated list of site:job_id

        - A comma-separated list of hosts.

        - An oargrid_job_id

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_str):
        # FIX: use a context manager so the file handle is closed promptly
        with open(hosts_str) as host_file:
            for line in host_file:
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
    elif ':' in hosts_str:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_str.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    elif "," in hosts_str:
        # We assume the string is a comma separated list of hosts
        for hstr in hosts_str.split(','):
            h = Host(hstr.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif hosts_str.isdigit():
        # If the file_name is a number, we assume this is a oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_str))
    else:
        # If not any of the previous, we assume is a single-host cluster
        # where the given input is the only host
        hosts = [Host(hosts_str.rstrip())]
    logger.debug('Hosts list: \n%s',
                 ' '.join(style.host(host.address.split('.')[0])
                          for host in hosts))
    return hosts
def configure_apt_proxy(vms):
    """Override apt proxy-guess with server as proxy"""
    # Group the VM IPs by the host they run on
    hosts_vms = {}
    for vm in vms:
        hosts_vms.setdefault(vm['host'], []).append(vm['ip'])
    actions = []
    for server, clients in hosts_vms.iteritems():
        server = Host(server)
        logger.detail('Configuring %s as APT proxy for %s',
                      style.host(server.address), ','.join(clients))
        actions.append(TaktukRemote(' echo \'Acquire::http::Proxy \"http://' +
                                    server.address +
                                    ':9999" ; \' > /etc/apt/apt.conf.d/proxy-guess',
                                    clients))
    ParallelActions(actions).run()
def configure_apt_proxy(vms):
    """Override apt proxy-guess with server as proxy"""
    # Group the VM IPs by the host they run on
    hosts_vms = {}
    for vm in vms:
        hosts_vms.setdefault(vm['host'], []).append(vm['ip'])
    actions = []
    for server, clients in hosts_vms.iteritems():
        server = Host(server)
        logger.detail('Configuring %s as APT proxy for %s',
                      style.host(server.address), ','.join(clients))
        proxy_cmd = (' echo \'Acquire::http::Proxy \"http://' +
                     server.address +
                     ':9999" ; \' > /etc/apt/apt.conf.d/proxy-guess')
        actions.append(TaktukRemote(proxy_cmd, clients))
    ParallelActions(actions).run()
def configure_service_node(self):
    """Setup automatically a DNS server to access virtual machines by id
    and also install a DHCP server if kavlan is used"""
    # DHCP is only needed when a kavlan is in use
    use_dhcp = True if self.kavlan else False
    service = 'DNS/DHCP' if use_dhcp else 'DNS'
    service_node = get_fastest_host(self.hosts)
    logger.info('Setting up %s on %s', style.emph(service),
                style.host(service_node.split('.')[0]))
    clients = list(self.hosts)
    clients.remove(service_node)
    dnsmasq_server(service_node, clients, self.vms, use_dhcp)
def _print_state_compact(self):
    """Display in a compact form the distribution of vms on hosts."""
    # dist: short host name -> {vm id: vm state}
    dist = {}
    # Width of the widest short name (host or vm) for column alignment
    max_len_host = 0
    for host in self.hosts:
        if len(host.split('.')[0]) > max_len_host:
            max_len_host = len(host.split('.')[0])
    for vm in self.vms:
        host = vm['host'].split('.')[0]
        if len(host) > max_len_host:
            max_len_host = len(host)
        if host not in dist.keys():
            dist[host] = {vm['id']: vm['state']}
        else:
            dist[host][vm['id']] = vm['state']
    log = ''
    # Sort hosts as (cluster, node number), e.g. "graphene-12"
    for host in sorted(self.hosts,
                       key=lambda x: (x.split('.')[0].split('-')[0],
                                      int(x.split('.')[0].split('-')[1]))):
        host = host.split('.')[0]
        if host not in dist:
            dist[host] = {}
        log += '\n' + style.host(host) + \
            ' '.ljust(max_len_host + 2 - len(host)) + \
            str(len(dist[host].keys())) + ' '
        # VM ids usually follow the same "name-number" scheme; fall back
        # to a plain sort when they don't (bare except kept deliberately).
        try:
            vms = sorted(dist[host].keys(),
                         key=lambda x: (x.split('.')[0].split('-')[0],
                                        int(x.split('.')[0].split('-')[1])))
        except:
            vms = sorted(dist[host].keys())
            pass
        # Render each VM with a state-dependent style
        for vm in vms:
            if dist[host][vm] == 'OK':
                log += style.OK(vm)
            elif dist[host][vm] == 'KO':
                log += style.KO(vm)
            else:
                log += style.Unknown(vm)
            log += ' '
    return log
def _make_reservation(self, site):
    """Make a new reservation.

    Finds a free slot for the configured cluster/walltime with funk,
    submits the corresponding deploy job with OAR and stores its id in
    self.job_id. Exits the program when no resources can be found.
    """
    elements = {self.config['cluster']: 1}
    logger.info('Finding slot for the experiment '
                '\nrally %s:1',
                style.host(self.config['cluster']).rjust(5))
    planning = funk.get_planning(elements)
    slots = funk.compute_slots(planning,
                               walltime=self.config['walltime'].encode(
                                   'ascii', 'ignore'),
                               excluded_elements=EXCLUDED_ELEMENTS)
    startdate, enddate, resources = funk.find_free_slot(slots, elements)
    resources = funk.distribute_hosts(resources, elements,
                                      EXCLUDED_ELEMENTS)
    if startdate is None:
        logger.error("Sorry, could not find the resources requested.")
        exit(4)
    jobs_specs = funk.get_jobs_specs(resources,
                                     name=self.options.job_name,
                                     excluded_elements=EXCLUDED_ELEMENTS)
    # FIX: leftover debug `print jobs_specs` replaced with a logger call
    # so the output goes through the logging system.
    logger.debug('Job specs: %s', jobs_specs)
    sub, site = jobs_specs[0]
    sub.additional_options = "-t deploy"
    sub.reservation_date = startdate
    sub.walltime = self.config['walltime'].encode('ascii', 'ignore')
    sub.name = self.options.job_name
    # Use the testing queue when the cluster provides one
    if 'testing' in EX5.get_cluster_attributes(
            self.config['cluster'])['queues']:
        sub.queue = 'testing'
    jobs = EX5.oarsub([(sub, site)])
    self.job_id = jobs[0][0]
    logger.info('Job %s will start at %s', style.emph(self.job_id),
                style.log_header(EX.time_utils.format_date(startdate)))
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an a comma separated
        list of hosts or an oargrid_job_id.
        If a file is used, each host should be in a different line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        # FIX: use a context manager so the file handle is closed promptly
        with open(hosts_input) as host_file:
            for line in host_file:
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
    elif ":" in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(","):
            site, job_id = job.split(":")
            hosts += get_oar_job_nodes(int(job_id), site)
    elif "," in hosts_input:
        # We assume the string is a comma separated list of hosts
        for hstr in hosts_input.split(","):
            h = Host(hstr.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif hosts_input.isdigit():
        # If the file_name is a number, we assume this is a oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    else:
        # If not any of the previous, we assume is a single-host cluster
        # where the given input is the only host
        hosts = [Host(hosts_input.rstrip())]
    logger.debug("Hosts list: \n%s",
                 " ".join(style.host(host.address.split(".")[0])
                          for host in hosts))
    return hosts
def __init__(self, hadoop_cluster, config_file=None):
    """Create a Mahout cluster on top of the given Hadoop cluster.

    Args:
      hadoop_cluster: the Hadoop cluster whose hosts Mahout will run on.
      config_file (str, optional): path of a config file overriding the
        defaults.
    """
    # Load cluster properties, optionally overridden by a user file.
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")
    if config_file:
        config.readfp(open(config_file))

    self.base_dir = config.get("cluster", "mahout_base_dir")
    self.conf_dir = config.get("cluster", "mahout_conf_dir")
    self.bin_dir = self.base_dir + "/bin"

    self.hc = hadoop_cluster

    # Report the short names of the hosts the cluster spans.
    short_names = [style.host(h.address.split('.')[0])
                   for h in self.hc.hosts]
    logger.info("Mahout cluster created in hosts %s",
                ' '.join(short_names))
def get_job_by_name(job_name, sites=None):
    """Look up a current OAR or oargrid job by its name.

    Args:
      job_name (str): the job name to search for.
      sites (list of str, optional): the sites to inspect; defaults to
        all Grid5000 sites.

    Return:
      (oargrid_job_id, None) when an oargrid job matches,
      (job_id, site) when a plain OAR job matches,
      (None, None) otherwise.
    """
    logger.detail('Looking for a job named %s', style.emph(job_name))
    if not sites:
        sites = get_g5k_sites()
    # Check oargrid jobs first: the name lives on their OAR sub-jobs.
    oargrid_jobs = get_current_oargrid_jobs()
    if len(oargrid_jobs) > 0:
        for g_job in oargrid_jobs:
            for job in get_oargrid_job_oar_jobs(g_job):
                info = get_oar_job_info(job[0], job[1])
                if info['name'] == job_name:
                    logger.info('Oargridjob %s found !',
                                style.emph(g_job))
                    return g_job, None
    # Fall back to plain OAR jobs on the selected sites.
    for job in get_current_oar_jobs(sites):
        info = get_oar_job_info(job[0], job[1])
        if info['name'] == job_name:
            logger.info('Job %s found on site %s !', style.emph(job[0]),
                        style.host(job[1]))
            return job
    return None, None
def __init__(self, hadoop_cluster, config_file=None):
    """Create a Mahout cluster linked to an existing Hadoop cluster.

    Args:
      hadoop_cluster: the Hadoop cluster providing the hosts.
      config_file (str, optional): path of a config file whose values
        override the defaults.
    """
    # Cluster properties come from the defaults plus the optional file.
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")
    if config_file:
        config.readfp(open(config_file))

    self.base_dir = config.get("cluster", "mahout_base_dir")
    self.conf_dir = config.get("cluster", "mahout_conf_dir")
    self.bin_dir = self.base_dir + "/bin"

    self.hc = hadoop_cluster

    # Log the short host names the cluster was created on.
    logger.info("Mahout cluster created in hosts %s",
                ' '.join(style.host(h.address.split('.')[0])
                         for h in self.hc.hosts))
def __init__(self, hadoop_cluster, config_file=None):
    """Create a new Hive cluster on top of a Hadoop cluster.

    Args:
      hadoop_cluster (HadoopCluster): The Hadoop cluster to link.
      config_file (str, optional): The path of the config file to be
        used.
    """
    # Load cluster properties
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")
    if config_file:
        config.readfp(open(config_file))

    self.base_dir = config.get("cluster", "hive_base_dir")
    self.conf_dir = config.get("cluster", "hive_conf_dir")
    self.logs_dir = config.get("cluster", "hive_logs_dir")
    self.warehouse_dir = config.get("cluster", "hive_warehouse_dir")
    self.metastore_dir = config.get("cluster", "hive_metastore_dir")
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
    self.bin_dir = self.base_dir + "/bin"

    # Initialize hosts
    self.hosts = hadoop_cluster.hosts
    self.master = hadoop_cluster.master

    # Store reference to Hadoop cluster
    self.hc = hadoop_cluster

    # BUG FIX: the optional suffix used to be attached with a ternary on
    # the *whole* format string, so with a falsy hadoop_cluster the base
    # message disappeared and the host argument was left dangling.
    # Conditionally append only the suffix, as the Spark cluster does.
    logger.info("Hive cluster created in hosts %s." +
                (" It is linked to a Hadoop cluster." if self.hc else ""),
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]))
def __init__(self, hadoop_cluster, config_file=None):
    """Create a new Hive cluster on top of a Hadoop cluster.

    Args:
      hadoop_cluster (HadoopCluster): The Hadoop cluster to link.
      config_file (str, optional): The path of the config file to be
        used.
    """
    # Load cluster properties
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")
    if config_file:
        config.readfp(open(config_file))

    self.base_dir = config.get("cluster", "hive_base_dir")
    self.conf_dir = config.get("cluster", "hive_conf_dir")
    self.logs_dir = config.get("cluster", "hive_logs_dir")
    self.warehouse_dir = config.get("cluster", "hive_warehouse_dir")
    self.metastore_dir = config.get("cluster", "hive_metastore_dir")
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
    self.bin_dir = self.base_dir + "/bin"

    # Initialize hosts
    self.hosts = hadoop_cluster.hosts
    self.master = hadoop_cluster.master

    # Store reference to Hadoop cluster
    self.hc = hadoop_cluster

    # BUG FIX: previously the ternary was applied to the entire format
    # string, dropping the base message (and leaving an unconsumed %s
    # argument) when self.hc was falsy.  Append only the optional suffix.
    logger.info("Hive cluster created in hosts %s." +
                (" It is linked to a Hadoop cluster." if self.hc else ""),
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]))
def __force_clean(self):
    """Stop previous Spark processes (if any) and remove all remote
    files created by it.

    Runs `jps` on every host, kills any JVM whose class name is a Spark
    daemon, then removes the logs.
    """
    spark_processes = ["Master", "Worker"]

    force_kill = False
    for h in self.hosts:
        proc = SshProcess("jps", h)
        proc.run()

        ids_to_kill = []
        for line in proc.stdout.splitlines():
            field = line.split()
            # ROBUSTNESS FIX: jps can emit a bare pid with no class name
            # (e.g. for some embedded JVMs); guard before indexing
            # field[1] to avoid an IndexError.
            if len(field) > 1 and field[1] in spark_processes:
                ids_to_kill.append(field[0])

        if ids_to_kill:
            force_kill = True
            ids_to_kill_str = ""
            for pid in ids_to_kill:
                ids_to_kill_str += " " + pid
            logger.warn(
                "Killing running Spark processes in host %s" %
                style.host(h.address.split('.')[0]))
            proc = SshProcess("kill -9" + ids_to_kill_str, h)
            proc.run()

    if force_kill:
        logger.info(
            "Processes from previous hadoop deployments had to be killed")

    self.clean_logs()
def get_hosts_list(self, hosts_str):
    """Generate a list of hosts from the given specification.

    Args:
      hosts_str (str): The following options are supported

        - The path of the file containing the hosts to be used. Each
          host should be in a different line. Repeated hosts are pruned.
          Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

        - A comma-separated list of hosts.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_str):
        # File input: one host per line, first occurrence wins.
        for line in open(hosts_str):
            host = Host(line.rstrip())
            if host not in hosts:
                hosts.append(host)
    elif "," in hosts_str:
        # Comma-separated list of host names, first occurrence wins.
        for name in hosts_str.split(','):
            host = Host(name.rstrip())
            if host not in hosts:
                hosts.append(host)
    else:
        # Otherwise: a single-host cluster whose only host is the input.
        hosts = [Host(hosts_str.rstrip())]
    logger.debug('Hosts list: \n%s',
                 ' '.join(style.host(host.address.split('.')[0])
                          for host in hosts))
    return hosts
def __init__(self, mode, config_file=None, hosts=None,
             hadoop_cluster=None):
    """Create a new Spark cluster. It can be created as a standalone
    cluster or on top of YARN.

    Args:
      mode (int): The cluster manager that is used (STANDALONE_MODE or
        YARN_MODE).
      config_file (str, optional): The path of the config file to be
        used.
      hosts (list of Host, optional): The hosts of the cluster
        (standalone operation).
      hadoop_cluster (HadoopCluster, optional): The Hadoop cluster to
        link.

    Raises:
      SparkException: when neither hosts nor a Hadoop cluster is given,
        or when YARN_MODE is selected without a Hadoop cluster.
    """
    # Load cluster properties.
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")
    if config_file:
        config.readfp(open(config_file))

    # Deployment properties.
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
    self.init_conf_dir = tempfile.mkdtemp("", "spark-init-", "/tmp")
    self.conf_mandatory_files = [SPARK_CONF_FILE]

    self.base_dir = config.get("cluster", "spark_base_dir")
    self.conf_dir = config.get("cluster", "spark_conf_dir")
    self.logs_dir = config.get("cluster", "spark_logs_dir")
    self.evs_log_dir = config.get("cluster", "spark_events_dir")
    self.work_dir = config.get("cluster", "spark_work_dir")
    self.port = config.getint("cluster", "spark_port")
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
    self.bin_dir = self.base_dir + "/bin"
    self.sbin_dir = self.base_dir + "/sbin"

    self.mode = mode
    self.java_home = None

    # Hosts come either directly or from the linked Hadoop cluster.
    if hosts:
        self.hosts = hosts
        self.master = hosts[0]
    elif hadoop_cluster:
        self.hosts = hadoop_cluster.hosts
        self.master = hadoop_cluster.master
    else:
        logger.error("Hosts in the cluster must be specified either"
                     "directly or indirectly through a Hadoop cluster.")
        raise SparkException("Hosts in the cluster must be specified "
                             "either directly or indirectly through a "
                             "Hadoop cluster.")

    # Store cluster information.
    self.hw = hw_manager.make_deployment_hardware()
    self.hw.add_hosts(self.hosts)
    self.master_cluster = self.hw.get_host_cluster(self.master)

    # Store reference to Hadoop cluster and check if mandatory.
    self.hc = hadoop_cluster
    if not self.hc and self.mode == YARN_MODE:
        logger.error("When using a YARN_MODE mode, a reference to the "
                     "Hadoop cluster should be provided.")
        raise SparkException("When using a YARN_MODE mode, a reference "
                             "to the Hadoop cluster should be provided")

    mode_text = ("in standalone mode" if self.mode == STANDALONE_MODE
                 else "on top of YARN")
    logger.info("Spark cluster created %s in hosts %s." +
                (" It is linked to a Hadoop cluster." if self.hc else ""),
                mode_text,
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]))
def __init__(self, hosts, topo_list=None, config_file=None):
    """Create a new Hadoop cluster with the given hosts and topology.

    Args:
      hosts (list of Host): The hosts to be assigned a topology.
      topo_list (list of str, optional): The racks to be assigned to
        each host. len(hosts) should be equal to len(topo_list).
      config_file (str, optional): The path of the config file to be
        used.
    """
    # Load properties.
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")
    if config_file:
        config.readfp(open(config_file))

    # Deployment properties.
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
    self.init_conf_dir = tempfile.mkdtemp("", "hadoop-init-", "/tmp")
    self.conf_mandatory_files = [CORE_CONF_FILE, HDFS_CONF_FILE,
                                 MR_CONF_FILE]

    # Node properties.
    self.base_dir = config.get("cluster", "hadoop_base_dir")
    self.conf_dir = config.get("cluster", "hadoop_conf_dir")
    self.logs_dir = config.get("cluster", "hadoop_logs_dir")
    self.hadoop_temp_dir = config.get("cluster", "hadoop_temp_dir")
    self.hdfs_port = config.getint("cluster", "hdfs_port")
    self.mapred_port = config.getint("cluster", "mapred_port")
    self.bin_dir = self.base_dir + "/bin"
    # NOTE(review): sbin_dir deliberately points at bin/ here (unlike
    # Spark) — presumably a Hadoop-1 layout with no sbin; confirm.
    self.sbin_dir = self.base_dir + "/bin"
    self.java_home = None

    # Configure master and slaves.
    self.hosts = list(hosts)
    self.master = self.hosts[0]

    # Create topology.
    self.topology = HadoopTopology(hosts, topo_list)

    # Store cluster information.
    self.hw = hw_manager.make_deployment_hardware()
    self.hw.add_hosts(self.hosts)
    self.master_cluster = self.hw.get_host_cluster(self.master)

    # Build a rack -> host addresses map just for the log message.
    racks = {rack: [] for rack in self.topology.topology.values()}
    for host, rack in self.topology.topology.iteritems():
        racks[rack].append(host.address)
    log_topo = ', '.join(
        [style.user2(rack) + ': ' +
         ' '.join(map(lambda a: style.host(a.split('.')[0]), addrs))
         for rack, addrs in racks.iteritems()])

    logger.info("Hadoop cluster created with master %s, hosts %s and "
                "topology %s",
                style.host(self.master.address),
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]),
                log_topo)
def __init__(self, mode, config_file=None, hosts=None,
             hadoop_cluster=None):
    """Create a new Spark cluster. It can be created as a standalone
    cluster or on top of YARN.

    Args:
      mode (int): The cluster manager that is used (STANDALONE_MODE or
        YARN_MODE).
      config_file (str, optional): The path of the config file to be
        used.
      hosts (list of Host, optional): The hosts of the cluster
        (standalone operation).
      hadoop_cluster (HadoopCluster, optional): The Hadoop cluster to
        link.

    Raises:
      SparkException: when neither hosts nor a Hadoop cluster is given,
        or when YARN_MODE is selected without a Hadoop cluster.
    """
    # Load cluster properties.
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")
    if config_file:
        config.readfp(open(config_file))

    # Deployment properties.
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
    self.init_conf_dir = tempfile.mkdtemp("", "spark-init-", "/tmp")
    self.conf_mandatory_files = [SPARK_CONF_FILE]

    self.base_dir = config.get("cluster", "spark_base_dir")
    self.conf_dir = config.get("cluster", "spark_conf_dir")
    self.logs_dir = config.get("cluster", "spark_logs_dir")
    self.evs_log_dir = config.get("cluster", "spark_events_dir")
    self.work_dir = config.get("cluster", "spark_work_dir")
    self.port = config.getint("cluster", "spark_port")
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
    self.bin_dir = self.base_dir + "/bin"
    self.sbin_dir = self.base_dir + "/sbin"

    self.mode = mode
    self.java_home = None

    # Hosts come either directly or from the linked Hadoop cluster.
    if hosts:
        self.hosts = hosts
        self.master = hosts[0]
    elif hadoop_cluster:
        self.hosts = hadoop_cluster.hosts
        self.master = hadoop_cluster.master
    else:
        logger.error("Hosts in the cluster must be specified either"
                     "directly or indirectly through a Hadoop cluster.")
        raise SparkException("Hosts in the cluster must be specified "
                             "either directly or indirectly through a "
                             "Hadoop cluster.")

    # Store cluster information.
    self.hw = G5kDeploymentHardware()
    self.hw.add_hosts(self.hosts)
    self.master_cluster = self.hw.get_cluster(
        get_host_cluster(self.master))

    # Store reference to Hadoop cluster and check if mandatory.
    self.hc = hadoop_cluster
    if not self.hc and self.mode == YARN_MODE:
        logger.error("When using a YARN_MODE mode, a reference to the "
                     "Hadoop cluster should be provided.")
        raise SparkException("When using a YARN_MODE mode, a reference "
                             "to the Hadoop cluster should be provided")

    mode_text = ("in standalone mode" if self.mode == STANDALONE_MODE
                 else "on top of YARN")
    logger.info("Spark cluster created %s in hosts %s." +
                (" It is linked to a Hadoop cluster." if self.hc else ""),
                mode_text,
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]))
def __init__(self, hosts, topo_list=None, config_file=None):
    """Create a new Hadoop cluster with the given hosts and topology.

    Args:
      hosts (list of Host): The hosts to be assigned a topology.
      topo_list (list of str, optional): The racks to be assigned to
        each host. len(hosts) should be equal to len(topo_list).
      config_file (str, optional): The path of the config file to be
        used.
    """
    # Load properties from the defaults plus the optional config file.
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")
    if config_file:
        config.readfp(open(config_file))

    # Deployment properties.
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
    self.init_conf_dir = tempfile.mkdtemp("", "hadoop-init-", "/tmp")
    self.conf_mandatory_files = [CORE_CONF_FILE,
                                 HDFS_CONF_FILE,
                                 MR_CONF_FILE]

    # Node properties.
    self.base_dir = config.get("cluster", "hadoop_base_dir")
    self.conf_dir = config.get("cluster", "hadoop_conf_dir")
    self.logs_dir = config.get("cluster", "hadoop_logs_dir")
    self.hadoop_temp_dir = config.get("cluster", "hadoop_temp_dir")
    self.hdfs_port = config.getint("cluster", "hdfs_port")
    self.mapred_port = config.getint("cluster", "mapred_port")
    self.bin_dir = self.base_dir + "/bin"
    # NOTE(review): sbin_dir is bin/ on purpose here, it seems (Hadoop-1
    # style layout without sbin) — confirm before "fixing".
    self.sbin_dir = self.base_dir + "/bin"
    self.java_home = None

    # Configure master and slaves: the first host is the master.
    self.hosts = list(hosts)
    self.master = self.hosts[0]

    # Create topology.
    self.topology = HadoopTopology(hosts, topo_list)

    # Store cluster information.
    self.hw = hw_manager.make_deployment_hardware()
    self.hw.add_hosts(self.hosts)
    self.master_cluster = self.hw.get_host_cluster(self.master)

    # Assemble a readable rack -> hosts description for logging.
    by_rack = {rack: [] for rack in self.topology.topology.values()}
    for host, rack in self.topology.topology.iteritems():
        by_rack[rack].append(host.address)
    log_topo = ', '.join(
        [style.user2(rack) + ': ' +
         ' '.join(map(lambda a: style.host(a.split('.')[0]), addrs))
         for rack, addrs in by_rack.iteritems()])

    logger.info("Hadoop cluster created with master %s, hosts %s and "
                "topology %s",
                style.host(self.master.address),
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]),
                log_topo)
#!/usr/bin/env python from execo_g5k.topology import g5k_graph, treemap from execo.log import logger, style from execo_g5k.oar import get_oar_job_nodes from execo_g5k.utils import hosts_list from networkx.algorithms.shortest_paths.generic import shortest_path from execo_g5k.api_utils import get_host_shortname from random import uniform jobs = [(1696863, 'grenoble'), (1502558, 'lille'), (74715, 'luxembourg')] logger.info('Retrieving hosts used for jobs %s', ', '.join([style.host(site) + ':' + style.emph(job_id) for job_id, site in jobs])) hosts = [get_host_shortname(h) for job_id, site in jobs for h in get_oar_job_nodes(job_id, site)] logger.info(hosts_list(hosts)) logger.info('Creating topological graph') g = g5k_graph(elements=hosts) i, j = int(uniform(1, len(hosts))), int(uniform(1, len(hosts))) path = shortest_path(g, hosts[i], hosts[j]) logger.info('Communication between %s and %s go through ' 'the following links: \n%s', style.host(hosts[i]), style.host(hosts[j]), ' -> '.join(path)) logger.info('Active links between nodes %s and %s are: \n%s', style.host(path[0]),
#!/usr/bin/env python
from execo_g5k.topology import g5k_graph, treemap
from execo.log import logger, style
from execo_g5k.oar import get_oar_job_nodes
from execo_g5k.utils import hosts_list
from networkx.algorithms.shortest_paths.generic import shortest_path
from execo_g5k.api_utils import get_host_shortname
from random import uniform

# The (job_id, site) pairs whose nodes we want in the topology graph.
jobs = [(1696863, 'grenoble'), (1502558, 'lille'), (74715, 'luxembourg')]

logger.info('Retrieving hosts used for jobs %s',
            ', '.join([style.host(site) + ':' + style.emph(job_id)
                       for job_id, site in jobs]))
hosts = [get_host_shortname(h)
         for job_id, site in jobs
         for h in get_oar_job_nodes(job_id, site)]
logger.info(hosts_list(hosts))

logger.info('Creating topological graph')
g = g5k_graph(elements=hosts)

# Pick two hosts at random and display the route between them.
i, j = int(uniform(1, len(hosts))), int(uniform(1, len(hosts)))
path = shortest_path(g, hosts[i], hosts[j])
logger.info('Communication between %s and %s go through '
            'the following links: \n%s',
            style.host(hosts[i]), style.host(hosts[j]),
            ' -> '.join(path))
def workflow(self, comb):
    """Compute one application launch using a given parameter group.

    Marks the combination as done in the sweeper on success, cancelled
    otherwise, and logs the number of remaining combinations.
    """
    comb_ok = False
    try:
        # Generate configuration file needed by MPI processes
        logger.info("Generating assembly file...")
        py = comb['cores'] / comb['px']
        prepare = Process('cd %s && python %s %d %d %d %d %d %s app.lad'
                          % (self.workingPath, self.genLadScript,
                             comb['datasize'], comb['datasize'],
                             comb['datasize'], comb['px'], py,
                             comb['transposition']))
        prepare.shell = True
        prepare.run()

        # Generate the MPI host file
        mfile = self.generate_machine_file()

        # Start L2C
        lad = "./app.lad"
        logger.info("Computing...")
        res = Process("export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && l2c_loader -M,-machinefile,%s --mpi -c %d %s" % (self.workingPath, mfile, comb['cores'], lad))
        res.shell = True
        res.stdout_handlers.append(os.path.join(self.result_dir,
                                                slugify(comb) + '.out'))
        res.stdout_handlers.append(sys.stdout)
        res.stderr_handlers.append(os.path.join(self.result_dir,
                                                slugify(comb) + '.err'))
        res.stderr_handlers.append(sys.stderr)
        res.run()
        if not res.ok:
            logger.error('Bad L2C termination')
            raise Exception('Bad L2C termination')
        # WARNING: when L2C cannot find the LAD file or something
        # strange like this
        if len(res.stderr) > 0:
            logger.warning('Not empty error output')

        # Clean configuration files
        logger.info("Removing assembly files...")
        res = Process('cd %s && rm -f app.lad*' % self.workingPath)
        res.shell = True
        res.run()

        comb_ok = True
    except Exception:
        # FIX: was a silent `pass`, which hid every failure.  Log the
        # traceback; the finally block still cancels the combination.
        logger.exception('Workflow failed for combination %s',
                         slugify(comb))
    finally:
        if comb_ok:
            self.sweeper.done(comb)
            logger.info(style.host(slugify(comb)) + ' has been done')
        else:
            self.sweeper.cancel(comb)
            logger.warning(style.host(slugify(comb)) +
                           ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))