def start_deploy_server(self):
    """Deploy the server with the given Kadeploy environment.  Returns a
    started Kadeployer task; the caller must wait for it and then call
    finish_deploy_server()."""
    # Sort hosts by name and take the first one: this is useful when
    # using a single reservation for all nodes, since we will always
    # pick the same host as server.
    self.server = sorted(g5k.get_oar_job_nodes(*self.server_job),
                         key=lambda node: node.address)[0]
    if os.path.isfile(self.args.server_env):
        deploy_opts = {"env_file": self.args.server_env}
    else:
        deploy_opts = {
            "env_name": self.args.server_env,
            "user": self.args.kadeploy_user
        }
    if self.multi_site():
        deploy_opts["vlan"] = self.global_vlan
        logger.debug(
            "Deploying environment '{}' on server {} in VLAN {}...".format(
                self.args.server_env, self.server.address, self.global_vlan))
    else:
        logger.debug("Deploying environment '{}' on server {}...".format(
            self.args.server_env, self.server.address))
    d = g5k.Deployment([self.server], **deploy_opts)
    return g5k.kadeploy.Kadeployer(d).start()

def start_all_vm(self):
    """Start VMs on the reserved machines, and return the associated task
    object.  This function returns immediately, but the caller has to
    wait for the VMs to be set up before using them.
    """
    assert len(self.vm_hosts) > 0
    (all_ip, all_mac) = zip(*self.subnet_ip_mac)
    self.vm_macs = all_mac[:self.args.nb_vm * len(self.vm_hosts)]
    self.vm_ips = all_ip[:self.args.nb_vm * len(self.vm_hosts)]
    logger.debug("VMs IP: {}".format(' '.join(self.vm_ips)))
    memory = self.args.memory
    nb_vm = self.args.nb_vm
    # For each physical host, build a list of MAC addresses to be used
    # for its VMs.
    macs_per_host = [
        self.vm_macs[i * nb_vm:(i + 1) * nb_vm]
        for i, host in enumerate(self.vm_hosts)
    ]
    # Double escaping is needed: after .format(), the script still contains
    # {{[' '.join(macs) for macs in macs_per_host]}}, an execo substitution
    # that is evaluated for each host.
    script = """\
for mac in {{{{[' '.join(macs) for macs in macs_per_host]}}}}
do
  iface=$(tunctl -b)
  brctl addif br0 "$iface"
  ip link set "$iface" up
  kvm -m {memory} -smp cores={cores},threads=1,sockets=1 -nographic -localtime -enable-kvm -drive file="{image}",if=virtio,media=disk -snapshot -net nic,model=virtio,macaddr="$mac" -net tap,ifname="$iface",script=no &
done
wait
""".format(memory=memory, cores=1, image=self.args.vm_image)
    vm_task = execo.Remote(script,
                           self.vm_hosts,
                           connection_params=self.server_conn_params,
                           name="Run VM on all hosts")
    return vm_task.start()

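# A standalone sketch of the MAC chunking above, with made-up MAC addresses
# (nb_vm = 2, two hosts).  Each host gets its own slice of the MAC list, and
# the {{...}} construct in the script is execo's per-host substitution, so
# each host's shell loop only sees its own MACs.
vm_macs = ['aa:00:00:00:00:0%d' % i for i in range(4)]
nb_vm = 2
macs_per_host = [vm_macs[i * nb_vm:(i + 1) * nb_vm] for i in range(2)]
assert macs_per_host == [['aa:00:00:00:00:00', 'aa:00:00:00:00:01'],
                         ['aa:00:00:00:00:02', 'aa:00:00:00:00:03']]
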
def serialize_cluster(cluster_type, cid, cluster_object):
    """Serialize the cluster object.  Also replace the linked Hadoop
    cluster if it exists.

    Args:
      cluster_type (str): The type of cluster to serialize.
      cid (int): The id of the cluster.
      cluster_object: The cluster to serialize.
    """

    fname = __get_cluster_file(cluster_type, cid)

    logger.debug("Serialize cluster (" + cluster_type + ") in " + fname)

    with open(fname, "wb") as c_file:
        pickle.dump(cluster_object, c_file)

    if cluster_type != HadoopCluster.get_cluster_type():
        hc_link_fname = __get_hc_link_file(cluster_type, cid)
        if os.path.exists(hc_link_fname):
            with open(hc_link_fname) as link_file:
                hc_id = int(link_file.readline())
            serialize_cluster(HadoopCluster.get_cluster_type(), hc_id,
                              cluster_object.hc)

def start_shell(self, node=None, mongos=True):
    """Open a MongoDB shell.

    Args:
      node (Host, optional): The host where the shell is to be started.
        If not provided, self.master is chosen.
      mongos (bool, optional): If sharding is enabled, connect to the
        mongos router instead of a mongod daemon.
    """

    self._check_initialization()

    if not node:
        node = self.master

    if mongos and self.do_sharding:
        port = self.ms_port
    else:
        port = self.md_port

    mongo_command = (self.bin_dir + "/mongo"
                     " --host " + node.address +
                     " --port " + str(port))

    logger.debug(mongo_command)

    call("ssh -t " + node.address + " " +
         NUMA_PREFIX + " " + mongo_command, shell=True)

def generate_hosts(hosts_input):
    """Generate a list of hosts from the given input.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id, or an oargrid_job_id.
        If a file is used, each host should be on a different line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        for line in open(hosts_input):
            h = Host(line.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the input is a number, we assume it is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    logger.debug('Hosts list: \n%s',
                 ' '.join(style.host(host.address.split('.')[0])
                          for host in hosts))
    return hosts

def start_deploy_vmhosts(self):
    hosts = g5k.get_oar_job_nodes(*self.vmhosts_job)
    if self.multi_site():
        self.vm_hosts = hosts
    else:
        # Take all but the first host
        self.vm_hosts = sorted(hosts, key=lambda node: node.address)[1:]
    if os.path.isfile(self.args.vmhosts_env):
        deploy_opts = {"env_file": self.args.vmhosts_env}
    else:
        deploy_opts = {
            "env_name": self.args.vmhosts_env,
            "user": self.args.vmhosts_kadeploy_user
        }
    if self.multi_site():
        deploy_opts["vlan"] = self.global_vlan
        logger.debug(
            "Deploying environment '{}' on {} VM hosts in VLAN {}...".format(
                self.args.vmhosts_env, len(self.vm_hosts), self.global_vlan))
    else:
        logger.debug("Deploying environment '{}' on {} VM hosts...".format(
            self.args.vmhosts_env, len(self.vm_hosts)))
    d = g5k.Deployment(self.vm_hosts, **deploy_opts)
    return g5k.kadeploy.Kadeployer(d).start()

def __init__(self):
    """Initialize the execo engine."""
    super(IsotopicBoxModel, self).__init__()
    self.init_plots()
    logger.info(style.log_header('\n\n Welcome to the '
                                 'human isotopic Box Model\n'))
    logger.debug(pformat(self.__dict__))

def prepare_global_vlan(self):
    vlans = g5k.get_oar_job_kavlan(*self.globalvlan_job)
    if len(vlans) > 0:
        self.global_vlan = vlans[0]
        logger.debug("Global VLAN ID: {}".format(self.global_vlan))
    else:
        logger.error("Could not reserve global VLAN")
        sys.exit(1)

def finish_deploy_server(self, deploy_process):
    deployed = deploy_process.deployed_hosts
    if len(deployed) == 0:
        logger.error("Could not deploy server")
        sys.exit(1)
    if self.multi_site():
        logger.debug("Deployed, transforming {} into {}".format(
            self.server.address,
            g5k.get_kavlan_host_name(self.server.address, self.global_vlan)))
        self.server.address = g5k.get_kavlan_host_name(
            self.server.address, self.global_vlan)

def finish_deploy_vmhosts(self, deploy_process):
    deployed = deploy_process.deployed_hosts
    if len(deployed) != len(self.vm_hosts):
        logger.error(
            "Could not deploy all VM hosts, only {}/{} deployed".format(
                len(deployed), len(self.vm_hosts)))
        sys.exit(1)
    if self.multi_site():
        logger.debug("Deployed, transforming VM host names to be able "
                     "to reach them in the new VLAN")
        for host in self.vm_hosts:
            host.address = g5k.get_kavlan_host_name(host.address,
                                                    self.global_vlan)

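# A minimal sketch of the intended asynchronous usage of the deploy helpers
# above ('exp' stands for a hypothetical instance of this engine); the run()
# method further down overlaps both deployments this way instead of doing
# them back to back.
def deploy_all_blocking(exp):
    vmhosts_proc = exp.start_deploy_vmhosts()  # started Kadeployer
    server_proc = exp.start_deploy_server()    # runs concurrently
    vmhosts_proc.wait()
    exp.finish_deploy_vmhosts(vmhosts_proc)    # checks and renames hosts
    server_proc.wait()
    exp.finish_deploy_server(server_proc)
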
def start_dns_server(self):
    resolver_params = {
        "resolver": self.args.resolver,
        "buffer_size": 4096,
        "nb_threads": self.args.server_threads,
        "max_tcp_clients_per_thread": self.args.resolver_slots_per_thread,
        # Only used by bind9: according to the documentation,
        # reserved-sockets can be at most maxsockets - 128.
        "maxsockets": self.args.resolver_slots_per_thread + 128,
        "mode": self.args.mode,
        "port": 853 if self.args.mode == 'tls' else 53,
    }
    max_clients = self.args.server_threads * self.args.resolver_slots_per_thread
    logger.debug(
        "{resolver} in {mode} mode using {nb_threads} threads, "
        "{max_tcp_clients_per_thread} max TCP/TLS clients per thread, "
        "{buffer_size}b buffer size".format(**resolver_params))
    logger.debug("Max TCP/TLS clients: {}".format(max_clients))
    if self.args.resolver == 'unbound':
        resolver_config = self.configure_unbound(resolver_params)
    elif self.args.resolver == 'bind9':
        resolver_config = self.configure_bind9(resolver_params)
    elif self.args.resolver == 'knot-resolver':
        resolver_config = self.configure_knot(resolver_params)
    execo.Remote(resolver_config, [self.server],
                 connection_params=self.server_conn_params,
                 name="Configure resolver").run()
    # Generate TLS key and self-signed certificate
    if self.args.mode == 'tls':
        generate_tls = ("openssl req -x509 -subj '/CN=localhost' -nodes "
                        "-newkey {} -keyout /tmp/resolver.key "
                        "-out /tmp/resolver.cert -days 365").format(
                            self.args.tls_keytype)
        execo.Remote(generate_tls, [self.server],
                     connection_params=self.server_conn_params,
                     name="Generate TLS key and cert").run()
    # Run resolver
    if self.args.resolver == 'unbound':
        resolver_cmd = "pkill unbound; sleep 3; /root/unbound/unbound -d -v -c /tmp/unbound.conf"
    elif self.args.resolver == 'bind9':
        resolver_cmd = "/root/bind9/bin/named/named -c /tmp/named.conf -g -n {nb_threads} -U {nb_threads} -S {maxsockets}"
    elif self.args.resolver == 'knot-resolver':
        resolver_cmd = "LD_LIBRARY_PATH=/usr/local/lib /usr/local/sbin/kresd -f {nb_threads} -c /tmp/knot-resolver.conf"
    task = execo.Remote(resolver_cmd.format(**resolver_params),
                        [self.server],
                        connection_params=self.server_conn_params,
                        name="Resolver process").start()
    return task

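# Worked example of the parameter arithmetic above (the numbers are made
# up): with 4 server threads and 8192 TCP slots per thread, bind9 is started
# with maxsockets = 8192 + 128, and the experiment expects at most 4 * 8192
# concurrent TCP/TLS clients.
nb_threads, slots_per_thread = 4, 8192
maxsockets = slots_per_thread + 128  # bind9: reserved-sockets <= maxsockets - 128
max_clients = nb_threads * slots_per_thread
assert (maxsockets, max_clients) == (8320, 32768)
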
def log_output(self, task, task_name, log_stdout=True, log_stderr=True):
    logger.debug("Logging stdout/stderr of task {} ({} processes)".format(
        task_name, len(task.processes)))
    for process_id, process in enumerate(task.processes):
        if len(task.processes) > 1:
            stdout_file = self.result_dir + "/{}_{}_stdout".format(
                task_name, process_id)
            stderr_file = self.result_dir + "/{}_{}_stderr".format(
                task_name, process_id)
        else:
            stdout_file = self.result_dir + "/{}_stdout".format(task_name)
            stderr_file = self.result_dir + "/{}_stderr".format(task_name)
        if log_stdout:
            with open(stdout_file, 'w') as stdout:
                stdout.write(process.stdout)
        if log_stderr:
            with open(stderr_file, 'w') as stderr:
                stderr.write(process.stderr)

def _get_nodes(self, starttime, endtime):
    planning = get_planning(elements=[self.cluster],
                            starttime=starttime,
                            endtime=endtime,
                            out_of_chart=self.options.outofchart)
    slots = compute_slots(planning, self.options.walltime)
    startdate = slots[0][0]
    i_slot = 0
    n_nodes = slots[i_slot][2][self.cluster]
    while n_nodes < self.n_nodes:
        logger.debug(slots[i_slot])
        startdate = slots[i_slot][0]
        n_nodes = slots[i_slot][2][self.cluster]
        i_slot += 1
        if i_slot == len(slots) - 1:
            return False, False
    return startdate, self.n_nodes

def _get_nodes(self, starttime, endtime):
    planning = get_planning(elements=[self.cluster],
                            starttime=starttime,
                            endtime=endtime,
                            out_of_chart=self.options.outofchart)
    slots = compute_slots(planning, self.options.walltime)
    startdate = slots[0][0]
    i_slot = 0
    n_nodes = slots[i_slot][2][self.cluster]
    logger.info("nodes %s in %s at %s", str(n_nodes), str(self.cluster),
                format_date(startdate))
    while n_nodes < self.options.n_nodes:
        logger.debug(slots[i_slot])
        startdate = slots[i_slot][0]
        n_nodes = slots[i_slot][2][self.cluster]
        i_slot += 1
        if i_slot == len(slots) - 1:
            return False, False
    return startdate, n_nodes

def generate_hosts(hosts_input):
    """Generate a list of hosts from the given input.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id, or a comma separated
        list of hosts, or an oargrid_job_id.  If a file is used, each
        host should be on a different line.  Repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        for line in open(hosts_input):
            h = Host(line.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif ":" in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(","):
            site, job_id = job.split(":")
            hosts += get_oar_job_nodes(int(job_id), site)
    elif "," in hosts_input:
        # We assume the string is a comma separated list of hosts
        for hstr in hosts_input.split(","):
            h = Host(hstr.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif hosts_input.isdigit():
        # If the input is a number, we assume it is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    else:
        # If none of the previous match, we assume this is a single-host
        # cluster where the given input is the only host
        hosts = [Host(hosts_input.rstrip())]
    logger.debug("Hosts list: \n%s",
                 " ".join(style.host(host.address.split(".")[0])
                          for host in hosts))
    return hosts

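# Hypothetical usage, exercising the explicit host list branch above (the
# host names are made up); the other branches need a hosts file or a live
# OAR/OARGRID job to resolve:
hosts = generate_hosts('node-1.site.fr,node-2.site.fr,node-1.site.fr')
assert [h.address for h in hosts] == ['node-1.site.fr', 'node-2.site.fr']
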
def reserve_global_vlan(self):
    """Reserve a global VLAN; only used for multi-site experiments
    (server not on the same site as the VMs)."""
    # TODO: integrate that into the "single job reservation" thing.
    # Existing job: look in all currently running jobs
    for (job_id, frontend) in g5k.get_current_oar_jobs():
        vlans = g5k.get_oar_job_kavlan(job_id, frontend)
        if len(vlans) > 0:
            logger.debug("Found existing Kavlan job {} (VLAN ID: {})".format(
                job_id, vlans[0]))
            self.globalvlan_job = (job_id, frontend)
            return
    # New job
    submission = g5k.OarSubmission(
        resources="{{type='kavlan-global'}}/vlan=1",
        name="VLAN {}".format(self.exp_id),
        reservation_date=self.args.start_date,
        walltime=self.args.walltime)
    [(jobid, site)] = g5k.oarsub([(submission, None)])
    self.globalvlan_job = (jobid, site)

def _get_nodes(self, starttime, endtime):
    planning = get_planning(elements=[self.cluster],
                            starttime=starttime,
                            endtime=endtime,
                            out_of_chart=self.options.outofchart)
    slots = compute_slots(planning, self.options.walltime)
    startdate = slots[0][0]
    i_slot = 0
    n_nodes = self.options.n_nodes * \
        (slots[i_slot][2][self.cluster] // self.options.n_nodes)
    while n_nodes < self.options.n_nodes:
        logger.debug(slots[i_slot])
        startdate = slots[i_slot][0]
        n_nodes = self.options.n_nodes * \
            (slots[i_slot][2][self.cluster] // self.options.n_nodes)
        i_slot += 1
        if i_slot == len(slots) - 1:
            return False, False
    logger.debug('Reserving %s nodes at %s', n_nodes, format_date(startdate))
    return startdate, 1

def deserialize_cluster(cluster_type, cid):
    """Return a cluster object from the given file.

    Args:
      cluster_type (str): The type of cluster to obtain.
      cid (int): The id of the cluster.

    Returns:
      The deserialized cluster object.
    """

    fname = __get_cluster_file(cluster_type, cid)

    logger.debug("Deserialize cluster from " + fname)

    with open(fname, "rb") as c_file:
        cluster_object = pickle.load(c_file)

    return cluster_object

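# A hedged round-trip sketch for serialize_cluster/deserialize_cluster
# above; 'my_cluster' and the id 42 are made up, and __get_cluster_file
# must resolve to a writable path for this to run:
#   serialize_cluster(HadoopCluster.get_cluster_type(), 42, my_cluster)
#   restored = deserialize_cluster(HadoopCluster.get_cluster_type(), 42)
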
def _get_nodes(self, starttime, endtime):
    planning = get_planning(elements=[self.cluster],
                            starttime=starttime,
                            endtime=endtime,
                            out_of_chart=self.options.outofchart)
    slots = compute_slots(planning, self.options.walltime)
    startdate = slots[0][0]
    i_slot = 0
    n_nodes = self.options.n_nodes * \
        (slots[i_slot][2][self.cluster] // self.options.n_nodes)
    while n_nodes < self.options.n_nodes:
        logger.debug(slots[i_slot])
        startdate = slots[i_slot][0]
        n_nodes = self.options.n_nodes * \
            (slots[i_slot][2][self.cluster] // self.options.n_nodes)
        i_slot += 1
        if i_slot == len(slots) - 1:
            return False, False
    logger.debug('Reserving %s nodes at %s', n_nodes, format_date(startdate))
    return startdate, n_nodes

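# Worked example of the rounding in the two variants above: asking for
# groups of n_nodes = 4 with 10 free nodes in the slot reserves the largest
# multiple of 4 that fits, i.e. 8; a slot with fewer than 4 free nodes
# yields 0, which keeps the while loop scanning later slots.
n_nodes_wanted, free_in_slot = 4, 10
n_nodes = n_nodes_wanted * (free_in_slot // n_nodes_wanted)
assert n_nodes == 8
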
def start(self):
    """Start MongoDB server."""

    self._check_initialization()

    logger.info("Starting MongoDB")

    if self.running:
        logger.warn("MongoDB was already started")
        return

    # Start nodes
    procs = []
    for h in self.hosts:
        mongo_command = (NUMA_PREFIX + " " +
                         self.bin_dir + "/mongod "
                         " --fork "
                         " --config " + os.path.join(self.conf_dir, CONF_FILE) +
                         " --bind_ip " + h.address +
                         " --port " + str(self.md_port))
        logger.debug(mongo_command)
        proc = SshProcess(mongo_command, h)
        proc.start()
        procs.append(proc)

    finished_ok = True
    for p in procs:
        p.wait()
        if not p.finished_ok:
            finished_ok = False

    if not finished_ok:
        logger.warn("Error while starting MongoDB")
        return
    else:
        self.running = True

    # Start replication
    if self.do_replication:
        logger.info("Configuring replication")
        mongo_command = "rs.initiate();"
        mongo_command += ';'.join(
            'rs.add("' + h.address + ':' + str(self.md_port) + '")'
            for h in self.hosts)
        logger.debug(mongo_command)
        proc = TaktukRemote(self.bin_dir + "/mongo "
                            "--eval '" + mongo_command + "' " +
                            self.master.address,
                            [self.master])
        proc.run()
        if not proc.finished_ok:
            logger.warn("Not able to start replication")

    # Start sharding
    if self.do_sharding:
        if not self.initialized_sharding:
            logger.info("Configuring sharding")
            time.sleep(2)
            mongo_command = (
                'rs.initiate({'
                '_id : "%s",'
                'configsvr : true,'
                'members : [%s]})' % (
                    self.rs_name,
                    ",".join('{ _id : %d, host : "%s:%d" }' %
                             (_id, h.address, self.md_port)
                             for (_id, h) in enumerate(self.hosts))
                )
            )
            logger.debug(mongo_command)
            proc = SshProcess(self.bin_dir + "/mongo " +
                              "--eval '" + mongo_command + "' " +
                              self.master.address,
                              self.master)
            proc.run()
            if proc.finished_ok:
                self.initialized_sharding = True
            else:
                logger.warn("Not able to configure sharding")

        logger.info("Starting sharding servers")
        mongo_command = (
            NUMA_PREFIX + " " +
            self.bin_dir + "/mongos"
            " --configdb " + self.rs_name + "/" +
            ",".join('%s:%d' % (h.address, self.md_port)
                     for h in self.hosts) +
            " --bind_ip " + self.master.address +
            " --port " + str(self.ms_port) +
            " --fork"
            " --logpath " + self.logs_dir + "/mongos.log"
            " --pidfilepath " + self.mongos_pid_file
        )
        logger.debug(mongo_command)
        start_ms = TaktukRemote(mongo_command, [self.master])
        start_ms.run()

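# Sketch of the sharding config document built above, for two hypothetical
# hosts, to make the string templating concrete (spacing simplified):
rs_name, md_port = 'rs0', 27017
addresses = ['h1.site.fr', 'h2.site.fr']
doc = ('rs.initiate({_id : "%s", configsvr : true, members : [%s]})'
       % (rs_name,
          ",".join('{ _id : %d, host : "%s:%d" }' % (i, a, md_port)
                   for i, a in enumerate(addresses))))
assert '{ _id : 1, host : "h2.site.fr:27017" }' in doc
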
def run(self):
    rtt_file = self.result_dir + "/rtt.csv"
    resolver = None
    client = 'tcpclient' if self.args.mode == 'tcp' else 'udpclient'
    try:
        logger.debug("Experiment ID: {}".format(self.exp_id))
        if self.multi_site():
            logger.info("Running in multi-site mode")
        if not self.multi_site():
            self.reserve_resources_singlejob()
            logger.debug("Waiting for OAR job to start...")
            g5k.wait_oar_job_start(*self.vmhosts_job)
        self.prepare_subnet()
        logger.debug("Prepared subnet")
        # Dependencies (besides the obvious ones):
        # - deploy_server depends on prepare_global_vlan
        # - prepare_server depends on deploy_server
        # - prepare_server depends on prepare_subnet
        # - prepare_vm depends on deploy_server
        if self.multi_site():
            self.reserve_global_vlan()
            logger.debug("Waiting for global VLAN job to start...")
            g5k.wait_oar_job_start(*self.globalvlan_job)
            self.prepare_global_vlan()
        self.log_experimental_conditions()
        logger.debug("Deploying VM hosts...")
        machines_deploy_process = self.start_deploy_vmhosts()
        logger.debug("Deploying server image...")
        server_deploy_process = self.start_deploy_server()
        machines_deploy_process.wait()
        logger.debug("Finishing deploying VM hosts...")
        self.finish_deploy_vmhosts(machines_deploy_process)
        logger.debug("Setting up VM hosts...")
        machines_setup_process = self.prepare_vmhosts()
        machines_setup_process.wait()
        logger.debug("VM hosts are set up.")
        server_deploy_process.wait()
        logger.debug("Finishing deploying server...")
        self.finish_deploy_server(server_deploy_process)
        logger.debug("Server is deployed.")
        self.vm_process = self.start_all_vm()
        # Ensure VMs are killed when we exit
        with self.vm_process:
            server_setup_process = self.prepare_server()
            self.wait_until_vm_ready()
            vm_setup_process = self.prepare_vm()
            server_setup_process.wait()
            self.log_output(server_setup_process, "server_setup_process")
            if not server_setup_process.ok:
                logger.error("Error while preparing server, please check "
                             "logs for 'server_setup_process'")
                raise Exception
            logger.debug("Prepared server: {}".format(self.server.address))
            vm_setup_process.wait()
            self.log_output(vm_setup_process, "vm_setup_process")
            if not vm_setup_process.ok:
                logger.error("Error while preparing VMs, please check "
                             "logs for 'vm_setup_process'")
                raise Exception
            logger.debug("Prepared VM")
            logger.info("Started {} VMs.".format(len(self.vm)))
            cpunetlog_vms = self.start_cpunetlog(self.vm)
            cpunetlog_server = self.start_cpunetlog(
                [self.server], self.server_conn_params)
            resolver = self.start_dns_server()
            logger.info("Started resolver ({}) on {}.".format(
                self.resolver_name, self.server.address))
            # Leave time for the resolver to start
            if self.args.resolver_slots_per_thread < 1000000:
                execo.sleep(15)
            else:
                execo.sleep(60)
            logger.info("Starting {} on all VMs...".format(client))
            clients = self.start_client_vm()
            clients.wait()
            logger.info("{} finished!".format(client))
            logger.info("Writing cpunetlog output to disk.")
            cpunetlog_server.kill().wait()
            cpunetlog_vms.kill().wait()
            self.log_output(cpunetlog_server, "cpunetlog_server")
            self.log_output(cpunetlog_vms, "cpunetlog_vms")
            logger.info("Writing {} results to disk.".format(client))
            self.log_output(clients, "clients", log_stdout=False)
            with open(rtt_file, 'w') as rtt_output:
                need_header = True
                rtt = csv.writer(rtt_output)
                for client_id, client in enumerate(clients.processes):
                    first_line = True
                    for line in client.stdout.splitlines():
                        # Skip anything that does not look like CSV
                        if ',' not in line:
                            continue
                        if need_header:
                            # Take CSV header from first client and add a column
                            data = line.split(",")
                            data.insert(0, "vm_id")
                            rtt.writerow(data)
                            need_header = False
                            first_line = False
                        elif first_line:
                            # Skip first line of subsequent clients
                            first_line = False
                        else:
                            # Add a column with the VM ID
                            data = line.split(",")
                            data.insert(0, client_id)
                            rtt.writerow(data)
    except Exception as e:
        logger.error("Exception raised: {}\n{}".format(e, format_exc()))
    finally:
        #self.kill_all_vm()
        if self.vm_process:
            self.vm_process.kill()
        if resolver:
            resolver.kill()
            logger.debug("Waiting for resolver to exit")
            resolver.wait()
            self.log_output(resolver, "resolver")
        if self.vm_process:
            logger.debug("Waiting for VM to exit")
            self.vm_process.wait()
            logger.info("Resolver and all VMs are shut down")
            self.log_output(self.vm_process, "vm_process")
            print(execo.Report([self.vm_process]).to_string())
            #for s in self.vm_process.processes:
            #    print("\n%s\nstdout:\n%s\nstderr:\n%s\n" % (s, s.stdout, s.stderr))
        g5k.oardel([self.vmhosts_job])

def run(self):
    """The main experimental workflow, as described in
    ``Using the Execo toolkit to perform ...``
    """
    self.force_options()

    # The argument is a cluster
    self.cluster = self.args[0]
    self.frontend = get_cluster_site(self.cluster)

    # Analyzing options
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None

    try:
        # Creation of the main iterator used for the first control loop.
        # You need to have a method called define_parameters that returns
        # a list of parameter dicts.
        self.create_paramsweeper()

        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for
            # the experiments
            if self.oar_job_id is None:
                self.make_reservation()
            # Retrieving the hosts and subnets parameters
            self.get_resources()
            # Hosts deployment and configuration
            if not self.options.no_hosts_setup:
                self.setup_hosts()
            if len(self.hosts) == 0:
                break

            # Initializing the resources and threads
            available_hosts = list(self.hosts)
            available_ip_mac = list(self.ip_mac)
            threads = {}

            # Checking that the job is running and not in Error
            while get_oar_job_info(self.oar_job_id,
                                   self.frontend)['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                job_is_dead = False
                while self.options.n_nodes > len(available_hosts):
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.extend(tmp_threads[t]['hosts'])
                            available_ip_mac.extend(tmp_threads[t]['ip_mac'])
                            del threads[t]
                    sleep(5)
                    if get_oar_job_info(self.oar_job_id,
                                        self.frontend)['state'] == 'Error':
                        job_is_dead = True
                        break
                if job_is_dead:
                    break

                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break

                used_hosts = available_hosts[0:self.options.n_nodes]
                available_hosts = available_hosts[self.options.n_nodes:]

                n_vm = self.comb_nvm(comb)
                used_ip_mac = available_ip_mac[0:n_vm]
                available_ip_mac = available_ip_mac[n_vm:]

                t = Thread(target=self.workflow,
                           args=(comb, used_hosts, used_ip_mac))
                threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                logger.debug('Threads: %s', len(threads))
                t.daemon = True
                t.start()

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True
            if job_is_dead:
                self.oar_job_id = None
    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')

def wait_until_vm_ready(self):
    prospective_vms = [execo.Host(ip, user='******') for ip in self.vm_ips]
    logger.debug('Waiting for {} VMs to become reachable...'.format(
        len(prospective_vms)))
    self.vm = check_hosts_up(prospective_vms, timeout=60)
    logger.debug('Result: {} VMs are reachable.'.format(len(self.vm)))

def run(self):
    """The main experimental workflow, as described in
    ``Using the Execo toolkit to perform ...``
    """
    self.force_options()

    # The argument is a cluster
    self.cluster = self.args[0]
    self.frontend = get_cluster_site(self.cluster)

    # Analyzing options
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None

    try:
        # Creation of the main iterator used for the first control loop.
        # You need to have a method called define_parameters that returns
        # a list of parameter dicts.
        self.create_paramsweeper()

        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for
            # the experiments
            if self.oar_job_id is None:
                self.make_reservation()
            # Retrieving the hosts and subnets parameters
            self.get_resources()
            # Hosts deployment and configuration
            if not self.options.no_hosts_setup:
                self.setup_hosts()
            if len(self.hosts) == 0:
                break

            # Initializing the resources and threads
            available_hosts = list(self.hosts)
            available_ip_mac = list(self.ip_mac)
            threads = {}

            # Checking that the job is running and not in Error
            while self.is_job_alive()['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                job_is_dead = False
                while self.options.n_nodes > len(available_hosts):
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.extend(tmp_threads[t]['hosts'])
                            available_ip_mac.extend(tmp_threads[t]['ip_mac'])
                            del threads[t]
                    sleep(5)
                    if self.is_job_alive()['state'] == 'Error':
                        job_is_dead = True
                        break
                if job_is_dead:
                    break

                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break

                used_hosts = available_hosts[0:self.options.n_nodes]
                available_hosts = available_hosts[self.options.n_nodes:]

                n_vm = self.comb_nvm(comb)
                used_ip_mac = available_ip_mac[0:n_vm]
                available_ip_mac = available_ip_mac[n_vm:]

                t = Thread(target=self.workflow,
                           args=(comb, used_hosts, used_ip_mac))
                threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                logger.debug('Threads: %s', len(threads))
                t.daemon = True
                t.start()

                if self.is_job_alive()['state'] == 'Error':
                    job_is_dead = True
            if job_is_dead:
                self.oar_job_id = None
    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')