Example #1
 def start_deploy_server(self):
     """Deploy the server with the given Kadeploy environment.  Blocks until
     deployment is done"""
     # Sort hosts by name and take the first one: this is useful when
     # using a single reservation for all nodes, since we will always
     # pick the same host as server.
     self.server = sorted(g5k.get_oar_job_nodes(*self.server_job),
                          key=lambda node: node.address)[0]
     if os.path.isfile(self.args.server_env):
         deploy_opts = {"env_file": self.args.server_env}
     else:
         deploy_opts = {
             "env_name": self.args.server_env,
             "user": self.args.kadeploy_user
         }
     if self.multi_site():
         deploy_opts["vlan"] = self.global_vlan
         logger.debug(
             "Deploying environment '{}' on server {} in VLAN {}...".format(
                 self.args.server_env, self.server.address,
                 self.global_vlan))
     else:
         logger.debug("Deploying environment '{}' on server {}...".format(
             self.args.server_env, self.server.address))
     d = g5k.Deployment([self.server], **deploy_opts)
     return g5k.kadeploy.Kadeployer(d).start()
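The Kadeployer returned here is an asynchronous execo task: deployment runs in the background until the caller collects it. A minimal usage sketch, mirroring the run() workflow of Example #25 (the `exp` object name is assumed):

    # start_deploy_server() returns an already-started Kadeployer task.
    deploy_task = exp.start_deploy_server()
    deploy_task.wait()                      # block until Kadeploy finishes
    exp.finish_deploy_server(deploy_task)   # check success, fix VLAN addresses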
Example #2
    def start_all_vm(self):
        """Starts VM on reserved machines, and returns the associated task
        object.  This function will return immediately, but the caller has to
        wait for the VM to be setup before using them.
        """
        assert (len(self.vm_hosts) > 0)
        (all_ip, all_mac) = zip(*self.subnet_ip_mac)
        self.vm_macs = all_mac[:self.args.nb_vm * len(self.vm_hosts)]
        self.vm_ips = all_ip[:self.args.nb_vm * len(self.vm_hosts)]
        logger.debug("VMs IP: {}".format(' '.join(self.vm_ips)))
        memory = self.args.memory
        nb_vm = self.args.nb_vm
        # For each physical host, build a list of MAC addresses to be used for its VMs
        macs_per_host = [
            self.vm_macs[i * nb_vm:(i + 1) * nb_vm]
            for i, host in enumerate(self.vm_hosts)
        ]
        # Double escaping is deliberate: after .format() the quadruple braces
        # become {{...}}, an execo substitution evaluated once per host
        script = """\
for mac in {{{{[' '.join(macs) for macs in macs_per_host]}}}}
do
  iface=$(tunctl -b)
  brctl addif br0 "$iface"
  ip link set "$iface" up
  kvm -m {memory} -smp cores={cores},threads=1,sockets=1 -nographic -localtime -enable-kvm -drive file="{image}",if=virtio,media=disk -snapshot -net nic,model=virtio,macaddr="$mac" -net tap,ifname="$iface",script=no &
done
wait
        """.format(memory=memory, cores=1, image=self.args.vm_image)
        vm_task = execo.Remote(script,
                               self.vm_hosts,
                               connection_params=self.server_conn_params,
                               name="Run VM on all hosts")
        return vm_task.start()
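The `{{...}}` left after `.format()` is an execo substitution pattern: as I understand execo's rules, the enclosed Python expression must evaluate to a sequence with one element per host, and each remote process receives the element at its host's index. A minimal sketch of the same mechanism (hostnames and values are placeholders):

    import execo

    hosts = [execo.Host('node-1'), execo.Host('node-2')]
    words = ['hello', 'world']
    # node-1 runs `echo hello`, node-2 runs `echo world`.
    execo.Remote('echo {{words}}', hosts).run()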
Example #3
def serialize_cluster(cluster_type, cid, cluster_object):
    """Serialize the cluster object. Replace also the linked Hadoop cluster if
    it exists.

    Args:
      cluster_type (str):
        The type of cluster to serialize.
      cid (int):
        The id of the cluster.
      cluster_object:
        The cluster to serialize.
    """

    fname = __get_cluster_file(cluster_type, cid)

    logger.debug("Serialize cluster (" + cluster_type + ") in " + fname)

    c_file = open(fname, "wb")
    pickle.dump(cluster_object, c_file)

    if cluster_type != HadoopCluster.get_cluster_type():
        hc_link_fname = __get_hc_link_file(cluster_type, cid)
        if os.path.exists(hc_link_fname):
            with open(hc_link_fname) as link_file:
                hc_id = int(link_file.readline())
            serialize_cluster(HadoopCluster.get_cluster_type(), hc_id, cluster_object.hc)
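Together with `deserialize_cluster` (Example #21), this gives a simple pickle-based persistence round trip; a usage sketch with made-up type and id values (the file path is resolved by the private `__get_cluster_file` helper):

    serialize_cluster('hadoop', 42, my_cluster)   # write the cluster to disk
    restored = deserialize_cluster('hadoop', 42)  # read it back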
Example #4
    def start_shell(self, node=None, mongos=True):
        """Open a MongoDB shell.

        Args:
          node (Host, optional):
            The host where the shell is to be started. If not provided,
            self.master is chosen.
          mongos (bool, optional):
            Whether to connect to the mongos process instead of mongod
            (only relevant when sharding is enabled).
        """

        self._check_initialization()

        if not node:
            node = self.master

        if mongos and self.do_sharding:
            port = self.ms_port
        else:
            port = self.md_port

        mongo_command = (
            self.bin_dir + "/mongo"
            " --host " + node.address +
            " --port " + str(port)
        )

        logger.debug(mongo_command)

        call("ssh -t " + node.address + " " +
             NUMA_PREFIX + " " + mongo_command,
             shell=True)
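Because the command is assembled by string concatenation and passed through a shell, any piece containing spaces would need careful quoting; a hedged alternative sketch using an argument list (shlex.split handles a NUMA_PREFIX that carries options):

    import shlex
    from subprocess import call

    # Same invocation without shell word-splitting pitfalls.
    call(['ssh', '-t', node.address]
         + shlex.split(NUMA_PREFIX)
         + shlex.split(mongo_command))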
Example #5
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an oargrid_job_id.
        If a file is used, each host should be in a different line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job,  $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        with open(hosts_input) as hosts_file:
            for line in hosts_file:
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the input is a number, we assume it is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    logger.debug('Hosts list: \n%s',
                 ' '.join(style.host(host.address.split('.')[0])
                          for host in hosts))
    return hosts
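All three accepted input forms go through this single entry point; a usage sketch (file path, sites, and job ids are invented):

    hosts = generate_hosts('/tmp/nodefile')           # file with one host per line
    hosts = generate_hosts('rennes:1234,nancy:5678')  # comma separated site:job_id list
    hosts = generate_hosts('4242')                    # plain number: an oargrid_job_id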
Example #8
 def start_deploy_vmhosts(self):
     hosts = g5k.get_oar_job_nodes(*self.vmhosts_job)
     if self.multi_site():
         self.vm_hosts = hosts
     else:
         # Take all but the first host
         self.vm_hosts = sorted(hosts, key=lambda node: node.address)[1:]
     if os.path.isfile(self.args.vmhosts_env):
         deploy_opts = {"env_file": self.args.vmhosts_env}
     else:
         deploy_opts = {
             "env_name": self.args.vmhosts_env,
             "user": self.args.vmhosts_kadeploy_user
         }
     if self.multi_site():
         deploy_opts["vlan"] = self.global_vlan
         logger.debug(
             "Deploying environment '{}' on {} VM hosts in VLAN {}...".
             format(self.args.vmhosts_env, len(self.vm_hosts),
                    self.global_vlan))
     else:
         logger.debug("Deploying environment '{}' on {} VM hosts...".format(
             self.args.vmhosts_env, len(self.vm_hosts)))
     d = g5k.Deployment(self.vm_hosts, **deploy_opts)
     return g5k.kadeploy.Kadeployer(d).start()
Example #9
 def __init__(self):
     """Initialize the execo engine"""
     super(IsotopicBoxModel, self).__init__()
     self.init_plots()
     logger.info(style.log_header('\n\n                 Welcome to the ' +
                                  'human isotopic Box Model\n'))
     logger.debug(pformat(self.__dict__))
Example #10
 def prepare_global_vlan(self):
     vlans = g5k.get_oar_job_kavlan(*self.globalvlan_job)
     if len(vlans) > 0:
         self.global_vlan = vlans[0]
         logger.debug("Global VLAN ID: {}".format(self.global_vlan))
     else:
         logger.error("Could not reserve global VLAN")
         sys.exit(1)
Example #11
 def finish_deploy_server(self, deploy_process):
     deployed = deploy_process.deployed_hosts
     if len(deployed) == 0:
         logger.error("Could not deploy server")
         sys.exit(1)
     if self.multi_site():
         logger.debug("Deployed, transforming {} into {}".format(
             self.server.address,
             g5k.get_kavlan_host_name(self.server.address,
                                      self.global_vlan)))
         self.server.address = g5k.get_kavlan_host_name(
             self.server.address, self.global_vlan)
Example #12
 def finish_deploy_vmhosts(self, deploy_process):
     deployed = deploy_process.deployed_hosts
     if len(deployed) != len(self.vm_hosts):
         logger.error(
             "Could not deploy all VM hosts, only {}/{} deployed".format(
                 len(deployed), len(self.vm_hosts)))
         sys.exit(1)
     if self.multi_site():
         logger.debug(
             "Deployed, renaming VM hosts so they can be reached in the new VLAN"
         )
         for host in self.vm_hosts:
             host.address = g5k.get_kavlan_host_name(
                 host.address, self.global_vlan)
Example #13
    def start_dns_server(self):
        resolver_params = {
            "resolver": self.args.resolver,
            "buffer_size": 4096,
            "nb_threads": self.args.server_threads,
            "max_tcp_clients_per_thread": self.args.resolver_slots_per_thread,
            # Only used by bind9: according to the documentation, reserved-sockets can be at most maxsockets - 128
            "maxsockets": self.args.resolver_slots_per_thread + 128,
            "mode": self.args.mode,
            "port": 853 if self.args.mode == 'tls' else 53,
        }
        max_clients = self.args.server_threads * self.args.resolver_slots_per_thread
        logger.debug(
            "{resolver} in {mode} mode using {nb_threads} threads, {max_tcp_clients_per_thread} max TCP/TLS clients per thread, {buffer_size}b buffer size"
            .format(**resolver_params))
        logger.debug("Max TCP/TLS clients: {}".format(max_clients))
        if self.args.resolver == 'unbound':
            resolver_config = self.configure_unbound(resolver_params)
        elif self.args.resolver == 'bind9':
            resolver_config = self.configure_bind9(resolver_params)
        elif self.args.resolver == 'knot-resolver':
            resolver_config = self.configure_knot(resolver_params)
        else:
            # Fail early instead of leaving resolver_config unbound below
            raise ValueError("Unknown resolver: {}".format(self.args.resolver))
        execo.Remote(resolver_config, [self.server],
                     connection_params=self.server_conn_params,
                     name="Configure resolver").run()
        # Generate TLS key and self-signed certificate
        if self.args.mode == 'tls':
            generate_tls = "openssl req -x509 -subj '/CN=localhost' -nodes -newkey {} -keyout /tmp/resolver.key -out /tmp/resolver.cert -days 365".format(
                self.args.tls_keytype)
            execo.Remote(generate_tls, [self.server],
                         connection_params=self.server_conn_params,
                         name="Generate TLS key and cert").run()
        # Run resolver
        if self.args.resolver == 'unbound':
            resolver_cmd = "pkill unbound; sleep 3; /root/unbound/unbound -d -v -c /tmp/unbound.conf"
        elif self.args.resolver == 'bind9':
            resolver_cmd = "/root/bind9/bin/named/named -c /tmp/named.conf -g -n {nb_threads} -U {nb_threads} -S {maxsockets}"
        elif self.args.resolver == 'knot-resolver':
            resolver_cmd = "LD_LIBRARY_PATH=/usr/local/lib /usr/local/sbin/kresd -f {nb_threads} -c /tmp/knot-resolver.conf"

        task = execo.Remote(resolver_cmd.format(**resolver_params),
                            [self.server],
                            connection_params=self.server_conn_params,
                            name="Resolver process").start()
        return task
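The two if/elif ladders over `self.args.resolver` (configuration and launch command) must stay in sync. One way to make that impossible to get wrong is a single dispatch table; a hypothetical refactoring sketch, with the method names and command strings reused from the code above (meant to live inside the method):

    resolvers = {
        'unbound': (self.configure_unbound,
                    'pkill unbound; sleep 3; /root/unbound/unbound -d -v -c /tmp/unbound.conf'),
        'bind9': (self.configure_bind9,
                  '/root/bind9/bin/named/named -c /tmp/named.conf -g -n {nb_threads} -U {nb_threads} -S {maxsockets}'),
        'knot-resolver': (self.configure_knot,
                          'LD_LIBRARY_PATH=/usr/local/lib /usr/local/sbin/kresd -f {nb_threads} -c /tmp/knot-resolver.conf'),
    }
    try:
        configure, cmd_template = resolvers[self.args.resolver]
    except KeyError:
        raise ValueError('Unknown resolver: {}'.format(self.args.resolver))
    resolver_config = configure(resolver_params)
    resolver_cmd = cmd_template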
Example #14
 def log_output(self, task, task_name, log_stdout=True, log_stderr=True):
     logger.debug("Logging stdout/stderr of task {} ({} processes)".format(
         task_name, len(task.processes)))
     for process_id, process in enumerate(task.processes):
         if len(task.processes) > 1:
             stdout_file = self.result_dir + "/{}_{}_stdout".format(
                 task_name, process_id)
             stderr_file = self.result_dir + "/{}_{}_stderr".format(
                 task_name, process_id)
         else:
             stdout_file = self.result_dir + "/{}_stdout".format(task_name)
             stderr_file = self.result_dir + "/{}_stderr".format(task_name)
         if log_stdout:
             with open(stdout_file, 'w') as stdout:
                 stdout.write(process.stdout)
         if log_stderr:
             with open(stderr_file, 'w') as stderr:
                 stderr.write(process.stderr)
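A typical call site, taken from the run() workflow in Example #25:

    self.log_output(server_setup_process, 'server_setup_process')
    # Large outputs can skip stdout and keep only stderr on disk:
    self.log_output(clients, 'clients', log_stdout=False)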
Example #15
    def _get_nodes(self, starttime, endtime):

        planning = get_planning(elements=[self.cluster],
                                starttime=starttime,
                                endtime=endtime,
                                out_of_chart=self.options.outofchart)
        slots = compute_slots(planning, self.options.walltime)
        startdate = slots[0][0]
        i_slot = 0
        n_nodes = slots[i_slot][2][self.cluster]
        while n_nodes < self.n_nodes:
            logger.debug(slots[i_slot])
            startdate = slots[i_slot][0]
            n_nodes = slots[i_slot][2][self.cluster]
            i_slot += 1
            if i_slot == len(slots) - 1:
                return False, False
        return startdate, self.n_nodes
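Each slot apparently carries (start, stop, {cluster: free_nodes}); the loop scans them in order until one offers enough nodes, bailing out with (False, False) when it runs out of slots. Note that the guard `i_slot == len(slots) - 1` returns before the final slot is ever examined; whether that is intentional is unclear. The same scan as a plain for-loop, a sketch under the slot-structure assumption above:

    def first_fitting_slot(slots, cluster, wanted):
        # Return the start date of the first slot with enough free nodes.
        for start, _stop, free in slots:
            if free[cluster] >= wanted:
                return start, wanted
        return False, False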
Example #16
 def _get_nodes(self, starttime, endtime):
     """ """
     planning = get_planning(elements=[self.cluster],
                             starttime=starttime,
                             endtime=endtime,
                             out_of_chart=self.options.outofchart)
     slots = compute_slots(planning, self.options.walltime)
     startdate = slots[0][0]
     i_slot = 0
     n_nodes = slots[i_slot][2][self.cluster]
     logger.info("nodes %s in %s at %s", str(n_nodes), str(self.cluster),
                 format_date(startdate))
     while n_nodes < self.options.n_nodes:
         logger.debug(slots[i_slot])
         startdate = slots[i_slot][0]
         n_nodes = slots[i_slot][2][self.cluster]
         i_slot += 1
         if i_slot == len(slots) - 1:
             return False, False
     return startdate, n_nodes
Example #18
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an a comma separated list
        of hosts or an oargrid_job_id.
        If a file is used, each host should be in a different line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        with open(hosts_input) as hosts_file:
            for line in hosts_file:
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
    elif ":" in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(","):
            site, job_id = job.split(":")
            hosts += get_oar_job_nodes(int(job_id), site)
    elif "," in hosts_input:
        # We assume the string is a comma separated list of hosts
        for hstr in hosts_input.split(","):
            h = Host(hstr.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif hosts_input.isdigit():
        # If the input is a number, we assume it is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    else:
        # If none of the previous forms match, we assume this is a single-host
        # cluster where the given input is the only host
        hosts = [Host(hosts_input.rstrip())]

    logger.debug("Hosts list: \n%s", " ".join(style.host(host.address.split(".")[0]) for host in hosts))
    return hosts
Example #19
 def reserve_global_vlan(self):
     """Global VLAN, only used for multi-site experiment (server not on the
     same site as the VM)"""
     # TODO: integrate that into the "single job reservation" thing.
     # Existing job, look in all currently running jobs
     for (job_id, frontend) in g5k.get_current_oar_jobs():
         vlans = g5k.get_oar_job_kavlan(job_id, frontend)
         if len(vlans) > 0:
             logger.debug(
                 "Found existing Kavlan job {} (VLAN ID: {})".format(
                     job_id, vlans[0]))
             self.globalvlan_job = (job_id, frontend)
             return
     # New job
     submission = g5k.OarSubmission(
         resources="{{type='kavlan-global'}}/vlan=1",
         name="VLAN {}".format(self.exp_id),
         reservation_date=self.args.start_date,
         walltime=self.args.walltime)
     [(jobid, site)] = g5k.oarsub([(submission, None)])
     self.globalvlan_job = (jobid, site)
Example #20
 def _get_nodes(self, starttime, endtime):
     """ """
     planning = get_planning(elements=[self.cluster],
                             starttime=starttime,
                             endtime=endtime,
                             out_of_chart=self.options.outofchart)
     slots = compute_slots(planning, self.options.walltime)
     startdate = slots[0][0]
     i_slot = 0
     n_nodes = self.options.n_nodes * \
             (slots[i_slot][2][self.cluster] // self.options.n_nodes)
     while n_nodes < self.options.n_nodes:
         logger.debug(slots[i_slot])
         startdate = slots[i_slot][0]
         n_nodes = self.options.n_nodes * \
             (slots[i_slot][2][self.cluster] // self.options.n_nodes)
         i_slot += 1
         if i_slot == len(slots) - 1:
             return False, False
     logger.debug('Reserving %s nodes at %s', n_nodes, format_date(startdate))
     return startdate, 1
Example #21
def deserialize_cluster(cluster_type, cid):
    """Return a cluster object from the given file.

    Args:
      cluster_type (str):
        The type of cluster to obtain.
      cid (int):
        The id of the cluster.

    Returns:
      The deserialized cluster object.
    """

    fname = __get_cluster_file(cluster_type, cid)

    logger.debug("Deserialize cluster from " + fname)

    with open(fname, "rb") as c_file:
        cluster_object = pickle.load(c_file)

    return cluster_object
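Since `pickle.load` will execute whatever the file encodes, these files must come from a trusted source; a defensive usage sketch (error handling is an addition, not part of the original, and type/id are placeholders):

    try:
        cluster = deserialize_cluster('hadoop', 42)
    except (IOError, pickle.UnpicklingError) as exc:
        logger.error('Could not restore cluster: %s', exc)
        cluster = None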
Example #23
 def _get_nodes(self, starttime, endtime):
     """ """
     planning = get_planning(elements=[self.cluster],
                             starttime=starttime,
                             endtime=endtime,
                             out_of_chart=self.options.outofchart)
     slots = compute_slots(planning, self.options.walltime)
     startdate = slots[0][0]
     i_slot = 0
     n_nodes = self.options.n_nodes * \
             (slots[i_slot][2][self.cluster] // self.options.n_nodes)
     while n_nodes < self.options.n_nodes:
         logger.debug(slots[i_slot])
         startdate = slots[i_slot][0]
         n_nodes = self.options.n_nodes * \
             (slots[i_slot][2][self.cluster] // self.options.n_nodes)
         i_slot += 1
         if i_slot == len(slots) - 1:
             return False, False
     logger.debug('Reserving %s nodes at %s', n_nodes,
                  format_date(startdate))
     return startdate, n_nodes
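The expression `n_nodes * (free // n_nodes)` rounds the free-node count down to a whole number of groups of the requested size; a worked example:

    wanted, free = 4, 11
    usable = wanted * (free // wanted)   # 11 // 4 == 2 full groups -> 8 nodes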
Example #24
    def start(self):
        """Start MongoDB server."""

        self._check_initialization()

        logger.info("Starting MongoDB")

        if self.running:
            logger.warn("MongoDB was already started")
            return

        # Start nodes
        procs = []
        for h in self.hosts:
            mongo_command = (NUMA_PREFIX + " " +
                             self.bin_dir + "/mongod "
                             " --fork "
                             " --config " + os.path.join(self.conf_dir,
                                                         CONF_FILE) +
                             " --bind_ip " + h.address +
                             " --port " + str(self.md_port))

            logger.debug(mongo_command)

            proc = SshProcess(mongo_command, h)
            proc.start()
            procs.append(proc)

        finished_ok = True
        for p in procs:
            p.wait()
            if not p.finished_ok:
                finished_ok = False

        if not finished_ok:
            logger.warn("Error while starting MongoDB")
            return
        else:
            self.running = True

        # Start replication
        if self.do_replication:
            logger.info("Configuring replication")
            mongo_command = "rs.initiate();"
            mongo_command += ';'.join(
                'rs.add("' + h.address + ':' + str(self.md_port) + '")'
                for h in self.hosts)

            logger.debug(mongo_command)

            proc = TaktukRemote(self.bin_dir + "/mongo "
                                "--eval '" + mongo_command + "' " +
                                self.master.address,
                                [self.master])
            proc.run()

            if not proc.finished_ok:
                logger.warn("Not able to start replication")

        if self.do_sharding:
            if not self.initialized_sharding:
                logger.info("Configuring sharding")
                time.sleep(2)
                mongo_command = (
                    'rs.initiate({'
                    '_id : "%s",'
                    'configsvr : true,'
                    'members : [%s]})' % (
                        self.rs_name,
                        ",".join('{ _id : %d, host : "%s:%d" }' %
                                 (_id, h.address, self.md_port)
                                 for (_id, h) in enumerate(self.hosts))
                    )
                )

                logger.debug(mongo_command)

                proc = SshProcess(self.bin_dir + "/mongo " +
                                  "--eval '" + mongo_command + "' " +
                                  self.master.address,
                                  self.master)
                proc.run()
                if proc.finished_ok:
                    self.initialized_sharding = True
                else:
                    logger.warn("Not able to configure sharding")

            logger.info("Starting sharding servers")
            mongo_command = (
                NUMA_PREFIX + " " +
                self.bin_dir + "/mongos"
                " --configdb " + self.rs_name + "/" +
                ",".join('%s:%d' % (h.address, self.md_port)
                         for h in self.hosts) +
                " --bind_ip " + self.master.address +
                " --port " + str(self.ms_port) +
                " --fork"
                " --logpath " + self.logs_dir + "/mongos.log"
                " --pidfilepath " + self.mongos_pid_file
            )

            logger.debug(mongo_command)

            start_ms = TaktukRemote(mongo_command, [self.master])
            start_ms.run()
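For concreteness, with two hypothetical hosts h1 and h2, an rs_name of "rs0" and md_port 27017, the sharding initialization command built above expands to the following JavaScript (wrapped here for readability):

    rs.initiate({_id : "rs0",configsvr : true,
                 members : [{ _id : 0, host : "h1:27017" },
                            { _id : 1, host : "h2:27017" }]})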
Example #25
    def run(self):
        rtt_file = self.result_dir + "/rtt.csv"
        resolver = None
        client = 'tcpclient' if self.args.mode == 'tcp' else 'udpclient'
        try:
            logger.debug("Experiment ID: {}".format(self.exp_id))
            if self.multi_site():
                logger.info("Running in multi-site mode")
            if not self.multi_site():
                self.reserve_resources_singlejob()
                logger.debug("Waiting for OAR job to start...")
                g5k.wait_oar_job_start(*self.vmhosts_job)
                self.prepare_subnet()
                logger.debug("Prepared subnet")
            # Dependencies (besides the obvious ones):
            # - deploy_server depends on prepare_global_vlan
            # - prepare_server depends on deploy_server
            # - prepare_server depends on prepare_subnet
            # - prepare_vm depends on deploy_server
            if self.multi_site():
                self.reserve_global_vlan()
                g5k.wait_oar_job_start(*self.globalvlan_job)
                logger.debug("Waiting for global VLAN job to start...")
                self.prepare_global_vlan()
            self.log_experimental_conditions()
            logger.debug("Deploying VM hosts...")
            machines_deploy_process = self.start_deploy_vmhosts()
            logger.debug("Deploying server image...")
            server_deploy_process = self.start_deploy_server()
            machines_deploy_process.wait()
            logger.debug("Finishing deploying VM hosts...")
            self.finish_deploy_vmhosts(machines_deploy_process)
            logger.debug("Setting up VM hosts...")
            machines_setup_process = self.prepare_vmhosts()
            machines_setup_process.wait()
            logger.debug("VM hosts are setup.")
            server_deploy_process.wait()
            logger.debug("Finishing deploying server...")
            self.finish_deploy_server(server_deploy_process)
            logger.debug("Server is deployed.")
            self.vm_process = self.start_all_vm()
            # Ensure VMs are killed when we exit
            with self.vm_process:
                server_setup_process = self.prepare_server()
                self.wait_until_vm_ready()
                vm_setup_process = self.prepare_vm()
                server_setup_process.wait()
                self.log_output(server_setup_process, "server_setup_process")
                if not server_setup_process.ok:
                    logger.error(
                        "Error while preparing server, please check logs for 'server_setup_process'"
                    )
                    raise Exception
                logger.debug("Prepared server: {}".format(self.server.address))
                vm_setup_process.wait()
                self.log_output(vm_setup_process, "vm_setup_process")
                if not vm_setup_process.ok:
                    logger.error(
                        "Error while preparing VMs, please check logs for 'vm_setup_process'"
                    )
                    raise Exception
                logger.debug("Prepared VM")
                logger.info("Started {} VMs.".format(len(self.vm)))
                cpunetlog_vms = self.start_cpunetlog(self.vm)
                cpunetlog_server = self.start_cpunetlog(
                    [self.server], self.server_conn_params)
                resolver = self.start_dns_server()
                logger.info("Started resolver ({}) on {}.".format(
                    self.resolver_name, self.server.address))
                # Leave time for resolver to start
                if self.args.resolver_slots_per_thread < 1000000:
                    execo.sleep(15)
                else:
                    execo.sleep(60)
                logger.info("Starting {} on all VMs...".format(client))
                clients = self.start_client_vm()
                clients.wait()
                logger.info("{} finished!".format(client))
                logger.info("Writing cpunetlog output to disk.")
                cpunetlog_server.kill().wait()
                cpunetlog_vms.kill().wait()
                self.log_output(cpunetlog_server, "cpunetlog_server")
                self.log_output(cpunetlog_vms, "cpunetlog_vms")
                logger.info("writing {} results to disk.".format(client))
                self.log_output(clients, "clients", log_stdout=False)
                with open(rtt_file, 'w') as rtt_output:
                    need_header = True
                    rtt = csv.writer(rtt_output)
                    for client_id, client in enumerate(clients.processes):
                        first_line = True
                        for line in client.stdout.splitlines():
                            # Skip anything that does not look like CSV
                            if ',' not in line:
                                continue
                            if need_header:
                                # Take CSV header from first client and add a column
                                data = line.split(",")
                                data.insert(0, "vm_id")
                                rtt.writerow(data)
                                need_header = False
                                first_line = False
                            elif first_line:
                                # Skip first line of subsequent clients
                                first_line = False
                            else:
                                # Add column with VM ID
                                data = line.split(",")
                                data.insert(0, client_id)
                                rtt.writerow(data)

        except Exception as e:
            logger.error("Exception raised: {}\n{}".format(e, format_exc()))
        finally:
            #self.kill_all_vm()
            if self.vm_process:
                self.vm_process.kill()
            if resolver:
                resolver.kill()
                logger.debug("Waiting for resolver to exit")
                resolver.wait()
                self.log_output(resolver, "resolver")
            if self.vm_process:
                logger.debug("Waiting for VM to exit")
                self.vm_process.wait()
                logger.info("Resolver and all VMs are shut down")
                self.log_output(self.vm_process, "vm_process")
                print(execo.Report([self.vm_process]).to_string())
            #for s in self.vm_process.processes:
            #    print("\n%s\nstdout:\n%s\nstderr:\n%s\n" % (s, s.stdout, s.stderr))
            g5k.oardel([self.vmhosts_job])
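The CSV-merging loop near the end is the subtle part: it keeps the header of the first client only, prepends a vm_id column, and silently drops the header line of every other client. The same logic as a standalone helper (a sketch, not part of the original code):

    import csv

    def merge_client_csv(processes, out_path):
        # Tag every data row with the index of the VM it came from.
        with open(out_path, 'w') as out:
            writer = csv.writer(out)
            need_header = True
            for vm_id, proc in enumerate(processes):
                first_csv_line = True
                for line in proc.stdout.splitlines():
                    if ',' not in line:          # skip non-CSV noise
                        continue
                    fields = line.split(',')
                    if need_header:              # header comes from the first client
                        writer.writerow(['vm_id'] + fields)
                        need_header = False
                    elif not first_csv_line:     # data row: prepend the VM id
                        writer.writerow([vm_id] + fields)
                    first_csv_line = False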
Example #26
    def run(self):
        """The main experimental workflow, as described in
        ``Using the Execo toolkit to perform ...``
        """
        self.force_options()

        # The argument is a cluster
        self.cluster = self.args[0]
        self.frontend = get_cluster_site(self.cluster)
        # Analyzing options
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Create the main iterator used by the first control loop.
            # You need to have a method called define_parameters that returns
            # a list of parameter dicts
            self.create_paramsweeper()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.get_resources()
                # Hosts deployment and configuration
                if not self.options.no_hosts_setup:
                    self.setup_hosts()
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = list(self.hosts)
                available_ip_mac = list(self.ip_mac)
                threads = {}

                # Checking that the job is running and not in Error
                while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.extend(tmp_threads[t]['hosts'])
                                available_ip_mac.extend(
                                    tmp_threads[t]['ip_mac'])
                                del threads[t]
                        sleep(5)
                        if get_oar_job_info(self.oar_job_id,
                                            self.frontend)['state'] == 'Error':
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    used_hosts = available_hosts[0:self.options.n_nodes]
                    available_hosts = available_hosts[self.options.n_nodes:]

                    n_vm = self.comb_nvm(comb)
                    used_ip_mac = available_ip_mac[0:n_vm]
                    available_ip_mac = available_ip_mac[n_vm:]

                    t = Thread(target=self.workflow,
                               args=(comb, used_hosts, used_ip_mac))
                    threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                    logger.debug('Threads: %s', len(threads))
                    t.daemon = True
                    t.start()

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')
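Stripped of job monitoring, the inner bookkeeping is a simple host pool: worker threads check hosts and IP/MAC pairs out when a combination starts, and the loop checks them back in once the thread dies. The core pattern as a sketch:

    def reclaim_finished(threads, free_hosts, free_ip_mac):
        # Return the resources of finished workers to the free lists.
        for t in list(threads):
            if not t.is_alive():
                free_hosts.extend(threads[t]['hosts'])
                free_ip_mac.extend(threads[t]['ip_mac'])
                del threads[t]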
Example #27
 def wait_until_vm_ready(self):
     prospective_vms = [execo.Host(ip, user='******') for ip in self.vm_ips]
     logger.debug('Waiting for {} VMs to become reachable...'.format(
         len(prospective_vms)))
     self.vm = check_hosts_up(prospective_vms, timeout=60)
     logger.debug('Result: {} VMs are reachable.'.format(len(self.vm)))
Example #28
    def run(self):
        """The main experimental workflow, as described in
        ``Using the Execo toolkit to perform ...``
        """
        self.force_options()

        # The argument is a cluster
        self.cluster = self.args[0]
        self.frontend = get_cluster_site(self.cluster)
        # Analyzing options
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Create the main iterator used by the first control loop.
            # You need to have a method called define_parameters that returns
            # a list of parameter dicts
            self.create_paramsweeper()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.get_resources()
                # Hosts deployment and configuration
                if not self.options.no_hosts_setup:
                    self.setup_hosts()
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = list(self.hosts)
                available_ip_mac = list(self.ip_mac)
                threads = {}

                # Checking that the job is running and not in Error
                while self.is_job_alive()['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                # while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \
                #     or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.extend(tmp_threads[t]['hosts'])
                                available_ip_mac.extend(tmp_threads[t]['ip_mac'])
                                del threads[t]
                        sleep(5)
                        if self.is_job_alive()['state'] == 'Error':
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    used_hosts = available_hosts[0:self.options.n_nodes]
                    available_hosts = available_hosts[self.options.n_nodes:]

                    n_vm = self.comb_nvm(comb)
                    used_ip_mac = available_ip_mac[0:n_vm]
                    available_ip_mac = available_ip_mac[n_vm:]

                    t = Thread(target=self.workflow,
                               args=(comb, used_hosts, used_ip_mac))
                    threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                    logger.debug('Threads: %s', len(threads))
                    t.daemon = True
                    t.start()

                # if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                if self.is_job_alive()['state'] == 'Error':
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')