Example #1
    def _enable_bridge(self, name='br0'):
        """We need a bridge to have automatic DHCP configuration for the VM."""
        logger.detail('Configuring the bridge')
        hosts_br = self._get_bridge(self.hosts)
        nobr_hosts = []
        for host, br in hosts_br.iteritems():
            if br is None:
                logger.debug('No bridge on host %s', style.host(host))
                nobr_hosts.append(host)
            elif br != name:
                logger.debug('Wrong bridge on host %s, destroying it',
                             style.host(host))
                SshProcess('ip link set ' + br + ' down ; brctl delbr ' + br,
                            host).run()
                nobr_hosts.append(host)
            else:
                logger.debug('Bridge %s is present on host %s',
                             style.emph(name), style.host(host))

        nobr_hosts = map(lambda x: x.address if isinstance(x, Host) else x, 
                         nobr_hosts)

        if len(nobr_hosts) > 0:
            logger.debug('Creating bridge on %s', hosts_list(nobr_hosts))
            script = 'export br_if=`ip route |grep default |cut -f 5 -d " "`; \n' + \
    'ifdown $br_if ; \n' + \
    'sed -i "s/$br_if inet dhcp/$br_if inet manual/g" /etc/network/interfaces ; \n' + \
    'sed -i "s/auto $br_if//g" /etc/network/interfaces ; \n' + \
    'echo " " >> /etc/network/interfaces ; \n' + \
    'echo "auto ' + name + '" >> /etc/network/interfaces ; \n' + \
    'echo "iface ' + name + ' inet dhcp" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_ports $br_if" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_stp off" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_maxwait 0" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_fd 0" >> /etc/network/interfaces ; \n' + \
    'ifup ' + name
            fd, br_script = mkstemp(dir='/tmp/', prefix='create_br_')
            f = fdopen(fd, 'w')
            f.write(script)
            f.close()

            self.fact.get_fileput(nobr_hosts, [br_script]).run()
            self.fact.get_remote('nohup sh ' + br_script.split('/')[-1],
                                 nobr_hosts).run()

            logger.debug('Waiting for network restart')
            if_up = False
            nmap_tries = 0
            while (not if_up) and nmap_tries < 20:
                sleep(20)
                nmap_tries += 1
                nmap = Process('nmap ' + ' '.join(nobr_hosts) +
                               ' -p 22').run()
                for line in nmap.stdout.split('\n'):
                    if 'Nmap done' in line:
                        if_up = line.split()[2] == line.split()[5].replace('(',
                                                                           '')
            logger.debug('Network has been restarted')
        logger.detail('All hosts have the bridge %s', style.emph(name))
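
The readiness check above hinges on nmap's summary line. A standalone sketch of that parsing (the sample line is illustrative, not captured output):

line = 'Nmap done: 4 IP addresses (4 hosts up) scanned in 2.05 seconds'
fields = line.split()
# fields[2] is the number of addresses scanned; fields[5] is '(N' where N is
# the number of hosts that answered, hence the stripped parenthesis
all_up = fields[2] == fields[5].replace('(', '')
print(all_up)  # True only when every scanned host answered on port 22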
Example #3
    def __init__(self, hosts, topo_list=None, config_file=None):
        """Create a new Hadoop cluster with the given hosts and topology.
        
        Args:
          hosts (list of Host):
            The hosts to be assigned a topology.
          topo_list (list of str, optional):
            The racks to be assigned to each host. len(hosts) should be equal to
            len(topo_list).
          config_file (str, optional):
            The path of the config file to be used.
        """

        # Load cluster properties
        config = ConfigParser(self.defaults)
        config.add_section("cluster")
        config.add_section("local")

        if config_file:
            config.readfp(open(config_file))

        self.base_dir = config.get("cluster", "hadoop_base_dir")
        self.conf_dir = config.get("cluster", "hadoop_conf_dir")
        self.logs_dir = config.get("cluster", "hadoop_logs_dir")
        self.hadoop_temp_dir = config.get("cluster", "hadoop_temp_dir")
        self.hdfs_port = config.getint("cluster", "hdfs_port")
        self.mapred_port = config.getint("cluster", "mapred_port")
        self.local_base_conf_dir = config.get("local", "local_base_conf_dir")

        self.bin_dir = self.base_dir + "/bin"
        self.sbin_dir = self.base_dir + "/bin"

        # Configure master and slaves
        self.hosts = hosts
        self.master = hosts[0]

        # Create topology
        self.topology = HadoopTopology(hosts, topo_list)
        # Store cluster information
        self.host_clusters = {}
        for h in self.hosts:
            g5k_cluster = get_host_cluster(h)
            if g5k_cluster in self.host_clusters:
                self.host_clusters[g5k_cluster].append(h)
            else:
                self.host_clusters[g5k_cluster] = [h]

        # Create a string to display the topology
        t = {v: [] for v in self.topology.topology.values()}
        for key, value in self.topology.topology.iteritems():
            t[value].append(key.address)
        log_topo = ', '.join([style.user2(k) + ': ' +
                              ' '.join(map(lambda x: style.host(x.split('.')[0]), v)) 
                              for k, v in t.iteritems()])
        
        logger.info("Hadoop cluster created with master %s, hosts %s and topology %s",
                    style.host(self.master.address), 
                    ' '.join([style.host(h.address.split('.')[0]) for h in self.hosts]),
                    log_topo)
Example #5
    def workflow(self, comb):
        """
            Compute one application launch 
            using a given parameter group
        """
        comb_ok = False
        try:
            # Generate configuration file needed by MPI processes
            logger.info("Generating assembly file...")
            py = comb['cores'] / comb['px']
            prepare = Process('cd %s && python %s %d %d %d %d %d %s app.lad' % 
                (self.workingPath, self.genLadScript, comb['datasize'], comb['datasize'], 
                    comb['datasize'], comb['px'], py, comb['transposition']))
            prepare.shell = True
            prepare.run()

            # Generate the MPI host file
            mfile = self.generate_machine_file()

            # Start L2C
            lad = "./app.lad"
            logger.info("Computing...")
            res = Process("export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && l2c_loader -M,-machinefile,%s --mpi -c %d %s" % (self.workingPath, mfile, comb['cores'], lad))
            res.shell = True
            res.stdout_handlers.append(os.path.join(self.result_dir, slugify(comb) + '.out'))
            res.stdout_handlers.append(sys.stdout)
            res.stderr_handlers.append(os.path.join(self.result_dir, slugify(comb) + '.err'))
            res.stderr_handlers.append(sys.stderr)
            res.run()
            if not res.ok:
                logger.error('Bad L2C termination')
                raise Exception('Bad L2C termination')
            # L2C may exit cleanly but still produce error output, e.g. when
            # it cannot find the LAD file
            if len(res.stderr) > 0:
                logger.warning('Not empty error output')

            # Clean configuration files
            logger.info("Removing assembly files...")
            res = Process('cd %s && rm -f app.lad*' % self.workingPath)
            res.shell = True
            res.run()
                
            comb_ok = True
        except Exception:
            logger.error('Workflow failed for %s', slugify(comb))
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(style.host(slugify(comb)) + ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(style.host(slugify(comb)) + ' has been canceled')
        
            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))
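
Result files above are named after slugify(comb). A quick sketch, assuming execo_engine's slugify (the exact output format may differ):

from execo_engine import slugify  # assumed import

comb = {'cores': 16, 'px': 4, 'datasize': 256, 'transposition': 'SEQ'}
print(slugify(comb))  # e.g. cores=16-datasize=256-px=4-transposition=SEQ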
Example #6
def hosts_list(hosts, separator=' ', site=False):
    """Return a formatted string from a list of hosts"""
    tmp_hosts = copy.deepcopy(sorted(hosts))
    for i, host in enumerate(tmp_hosts):
        if isinstance(host, Host):
            tmp_hosts[i] = host.address

    if site:
        return separator.join([style.host(host.split('.')[0] + '.'
                                          + host.split('.')[1])
                               for host in sorted(tmp_hosts)])
    else:
        return separator.join([style.host(host.split('.')[0])
                               for host in sorted(tmp_hosts)])
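
A usage sketch, assuming execo's Host (hostnames made up; the returned string is wrapped in terminal styling):

from execo import Host  # assumed import

hosts = [Host('griffon-2.nancy.grid5000.fr'),
         Host('griffon-1.nancy.grid5000.fr')]
print(hosts_list(hosts))             # e.g. griffon-1 griffon-2
print(hosts_list(hosts, site=True))  # e.g. griffon-1.nancy griffon-2.nancy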
Example #8
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        or a comma separated list of site:job_id or an oargrid_job_id.
        If a file is used, each host should be on a separate line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        for line in open(hosts_input):
            h = Host(line.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif ':' in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(','):
            site, job_id = job.split(':')
            hosts += get_oar_job_nodes(int(job_id), site)
    else:
        # If the input is a number, we assume it is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    logger.debug('Hosts list: \n%s',
                 ' '.join(style.host(host.address.split('.')[0])
                          for host in hosts))
    return hosts
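
A minimal sketch of the file-based branch, the only one that works outside a Grid'5000 job (path and hostnames made up):

with open('/tmp/nodes.txt', 'w') as f:
    f.write('parapluie-1.rennes.grid5000.fr\n'
            'parapluie-1.rennes.grid5000.fr\n'  # duplicate, pruned
            'parapluie-2.rennes.grid5000.fr\n')
hosts = generate_hosts('/tmp/nodes.txt')
print([h.address for h in hosts])  # the two unique hosts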
Example #9
def _munin_server(server, clients):
    """Install the monitoring service munin. Must be executed inside Grid'5000
    to be able to resolve the server and clients IP.

    :param server: an execo.Host

    :param clients: a list of execo.Host

    """
    logger.info(
        'Munin monitoring service installation, server = %s, clients = \n %s',
        server.address, [host.address for host in clients])

    logger.debug('Configuring munin server %s', style.host(server.address))
    cmd = 'export DEBIAN_FRONTEND=noninteractive ; apt-get update && apt-get install -y munin'
    SshProcess(cmd, server).run()

    logger.debug('Creating configuration files for server')
    fd, server_conf = mkstemp(dir='/tmp/', prefix='munin-nodes_')
    f = fdopen(fd, 'w')
    for host in clients:
        get_ip = Process('host ' + host.address).run()
        ip = get_ip.stdout.strip().split(' ')[3]
        f.write('[' + host.address + ']\n    address ' + ip +
                '\n   use_node_name yes\n\n')
    f.close()

    Put([server], [server_conf], remote_location='/etc/').run()
    SshProcess('cd /etc && cp ' + server_conf.split('/')[-1] + ' munin.conf',
               server).run()
    Process('rm ' + server_conf).run()
Example #11
    def __force_clean(self):
        """Stop previous Spark processes (if any) and remove all remote files
        created by it."""

        spark_processes = ["Master", "Worker"]

        force_kill = False
        for h in self.hosts:
            proc = SshProcess("jps", h)
            proc.run()

            ids_to_kill = []
            for line in proc.stdout.splitlines():
                field = line.split()
                if len(field) > 1 and field[1] in spark_processes:
                    ids_to_kill.append(field[0])

            if ids_to_kill:
                force_kill = True
                ids_to_kill_str = ""
                for pid in ids_to_kill:
                    ids_to_kill_str += " " + pid

                logger.warning("Killing running Spark processes in host %s" %
                               style.host(h.address.split('.')[0]))

                proc = SshProcess("kill -9" + ids_to_kill_str, h)
                proc.run()

        if force_kill:
            logger.info(
                "Processes from previous Spark deployments had to be killed")

        self.clean_logs()
Example #12
def get_server_iface(server):
    """Get the default network interface of the serve """
    logger.debug('Retrieving default interface from %s',
                 style.host(server.address))
    get_if = SshProcess('ip route |grep default |cut -d " " -f 5',
                        server).run()
    return get_if.stdout.strip()
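
The same pipeline can be tried locally without execo; a sketch:

import subprocess

iface = subprocess.check_output(
    'ip route |grep default |cut -d " " -f 5', shell=True).strip()
print(iface)  # e.g. eth0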
Example #13
    def add_host(self, host, data=None):
        """Add a host in the graph

        :param host: a string or execo Host corresponding to the node name

        :param data: a dict containing the Grid'5000 host attributes"""
        if isinstance(host, Host):
            _host = get_host_shortname(host.address)
        else:
            _host = host
        if data:
            power = data['performance']['core_flops']
            cores = data['architecture']['nb_cores']
        else:
            power = 0
            cores = 0

        if len(self.get_host_adapters(_host)) > 0:
            logger.debug('Adding %s', style.host(_host))
            self.add_node(_host, {'kind': 'node',
                                  'power': power,
                                  'cores': cores})
            for eq in self.get_host_adapters(_host):
                if eq['mounted']:
                    self.add_equip(eq['switch'], get_host_site(_host))
        else:
            logger.warning('Node %s has no valid network connection',
                           _host)
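
An illustrative slice of the data dict consumed above, following the Grid'5000 reference-API field names used in the code (numbers made up):

data = {'performance': {'core_flops': 4 * 10 ** 9},
        'architecture': {'nb_cores': 8}}
# graph.add_host('parapluie-1', data)  # hypothetical graph instance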
Example #14
    def add_host(self, host, data=None):
        """Add a host in the graph

        :param host: a string or execo Host corresponding to the node name

        :param data: a dict containing the Grid'5000 host attributes"""
        if isinstance(host, Host):
            _host = get_host_shortname(host.address)
        else:
            _host = host
        if data:
            power = data['performance']['core_flops']
            cores = data['architecture']['nb_cores']
        else:
            power = 0
            cores = 0

        if len(self.get_host_adapters(_host)) == 0:
            logger.warning('Node %s has no valid network connection', _host)
        logger.debug('Adding %s', style.host(_host))
        self.add_node(_host, {'kind': 'node', 'power': power, 'cores': cores})
        for eq in self.get_host_adapters(_host):
            if eq['mounted']:
                self.add_equip(eq['switch'], get_host_site(_host))
                self._filter_equip_leaves()
Example #15
    def __init__(self, infile=None, resources=None, hosts=None,
                 ip_mac=None, vlan=None,
                 env_name=None, env_file=None, vms=None,
                 distribution=None, outdir=None):
        """:param infile: an XML file that describe the topology of the
        deployment

        :param resources: a dict whose keys are Grid'5000 sites and values are
        dict, whose keys are hosts and ip_mac, where hosts is a list of
        execo.Host and ip_mac is a list of tuple (ip, mac).

        :param env_name: name of the Kadeploy environment

        :param env_file: path to the Kadeploy environment file

        :param vms: dict defining the virtual machines

        :param distribution: how to distribute the vms on the hosts
        (``round-robin`` , ``concentrated``, ``random``)

        :param outdir: directory to store the deployment files
        """
        # set a factory for the deployment that uses taktuk and chainput
        self.fact = ActionFactory(remote_tool=TAKTUK,
                                  fileput_tool=CHAINPUT,
                                  fileget_tool=TAKTUK)
        self.kavlan = None if not vlan else vlan
        self.kavlan_site = None
        if env_name is not None:
            self.env_file = None
            if ':' not in env_name:
                self.env_name, self.env_user = env_name, None
            else:
                self.env_user, self.env_name = env_name.split(':')
        else:
            if env_file is not None:
                self.env_name = None
                self.env_user = None
                self.env_file = env_file
            else:
                self.env_name = 'vm5k'
                self.env_user = '******'
                self.env_file = None

        if outdir:
            self.outdir = outdir
        else:
            self.outdir = 'vm5k_' + strftime("%Y%m%d_%H%M%S_%z")

        self.copy_actions = None

        self.state = Element('vm5k')
        self._define_elements(infile, resources, hosts, vms, ip_mac,
                              distribution)

        logger.info('%s %s %s %s %s %s %s %s',
                    len(self.sites), style.emph('sites'),
                    len(self.clusters), style.user1('clusters'),
                    len(self.hosts), style.host('hosts'),
                    len(self.vms), style.vm('vms'))
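
A sketch of the resources argument described in the docstring (all values made up):

from execo import Host  # assumed import

resources = {
    'rennes': {
        'hosts': [Host('parapluie-1.rennes.grid5000.fr')],
        'ip_mac': [('10.158.0.1', '00:16:3e:00:00:01')],
    },
}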
Example #18
    def run(self):
        """
            Main engine method to perform the experiment
        """
        self.define_parameters()

        while len(self.sweeper.get_remaining()) > 0:
            # Getting the next combination
            comb = self.sweeper.get_next()
            logger.info(style.host(slugify(comb)) + ' has been started')
            self.get_nodes(comb)

            # If the job is broken, the program is stopped
            if get_oar_job_info(self.oar_job_id,
                                self.frontend)['state'] == 'Error':
                break

            try:
                self.workflow(comb)

                # Process all combinations that can use the same submission
                while True:
                    # Find the next combination that can use the same submission
                    subcomb = self.sweeper.get_next(lambda r: filter(
                        lambda x: x['cores'] == comb['cores'] and x['cluster']
                        == comb['cluster'], r))

                    if not subcomb:
                        logger.info(
                            'No more combination for cluster=%s and cores=%s',
                            comb['cluster'], comb['cores'])
                        break
                    else:
                        logger.info(
                            style.host(slugify(subcomb)) + ' has been started')

                        if get_oar_job_info(self.oar_job_id,
                                            self.frontend)['state'] != 'Error':
                            self.workflow(subcomb)
                        else:
                            break

            # Whatever happens (errors, end of loop), the job is deleted
            finally:
                logger.info('Deleting job...')
                oardel([(self.oar_job_id, self.frontend)])
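
The loop above follows execo_engine's ParamSweeper pattern; a minimal self-contained sketch (persistence directory and parameters made up):

from execo_engine import ParamSweeper, sweep  # assumed imports

sweeper = ParamSweeper('/tmp/sweeps',
                       sweeps=sweep({'cores': [8, 16],
                                     'cluster': ['parapluie']}))
while len(sweeper.get_remaining()) > 0:
    comb = sweeper.get_next()
    try:
        # ... run the workflow for comb ...
        sweeper.done(comb)
    except Exception:
        sweeper.cancel(comb)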
Example #19
 def rm_equip(self, equip):
     """Remove an equipment from the node"""
     logger.debug('Removing equip %s', style.host(equip))
     self.remove_node(equip)
     if get_network_equipment_attributes(equip)['kind'] == 'router':
         lc_nodes = [x for x in self.nodes() if equip in x]
         logger.debug('Removing router linecard %s', ' '.join(lc_nodes))
         self.remove_nodes_from(lc_nodes)
Example #20
 def rm_host(self, host):
     """Remove the host from the graph"""
     logger.debug('Removing host %s', style.host(host))
     self.remove_node(host)
     for eq in self.get_host_adapters(host):
         if not self._equip_has_nodes(eq['switch']):
             logger.debug('Removing equip %s', eq['switch'])
             self.rm_equip(eq['switch'])
Example #23
def restart_vms(vms):
    """ """
    hosts = [vm['host'] for vm in vms]
    running_vms = list_vm(hosts)
    for vm in vms:
        if {'id': vm['id']} not in running_vms[vm['host']]:
            logger.info('%s has not been started on %s, starting it',
                        style.vm(vm['id']), style.host(vm['host']))
            SshProcess('virsh start ' + vm['id'], vm['host']).run()
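
A sketch of the data this function works on (values made up); list_vm is expected to map each host to the VMs found running on it:

vms = [{'id': 'vm-1', 'host': 'parapluie-1.rennes.grid5000.fr'}]
running_vms = {'parapluie-1.rennes.grid5000.fr': [{'id': 'vm-1'}]}
# the membership test that decides whether `virsh start` is issued
needs_start = {'id': vms[0]['id']} not in running_vms[vms[0]['host']]
print(needs_start)  # False: vm-1 is already running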
Example #25
def get_server_ip(host):
    """Get the server IP"""
    if isinstance(host, Host):
        host = host.address
    logger.debug('Retrieving IP from %s', style.host(host))
    get_ip = Process('host ' + host + ' |cut -d \' \' -f 4')
    get_ip.shell = True
    get_ip.run()
    ip = get_ip.stdout.strip()
    return ip
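
A standalone sketch of the parsing, without running the host command (sample output made up):

stdout = 'example.com has address 93.184.216.34\n'
print(stdout.strip().split(' ')[3])  # 93.184.216.34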
Example #28
def setup_aptcacher_server(hosts, base_dir='/tmp/apt-cacher-ng'):
    """Install and configure apt-cacher on one server"""
    hosts = map(Host, hosts)
    logger.info('Installing apt-cacher on %s',
                ','.join([style.host(host.address) for host in hosts]))
    logger.detail('Package')
    package = TaktukRemote('export DEBIAN_FRONTEND=noninteractive ; apt-get update ; ' +
                           'apt-get install -o Dpkg::Options::="--force-confdef" -o ' +
                           'Dpkg::Options::="--force-confnew" -y apt-cacher-ng',
                           hosts).run()
    if not package.ok:
        logger.error('Unable to install apt-cacher-ng on %s',
                     ','.join([style.host(host.address) for host in hosts]))
        return

    logger.detail('Directory creation')
    log_dir = base_dir + '/log'
    cache_dir = base_dir + '/cache'
    mkdirs = TaktukRemote('mkdir -p ' + log_dir + '; mkdir -p ' + cache_dir +
                          '; chown -R apt-cacher-ng:apt-cacher-ng ' + base_dir,
                          hosts).run()
    if not mkdirs.ok:
        logger.error('Unable to create the directories')
        return
    cmd = 'sed -i "s#/var/cache/apt-cacher-ng#' + cache_dir + \
          '#g" /etc/apt-cacher-ng/acng.conf ;' + \
          'sed -i "s#/var/log/apt-cacher-ng#' + log_dir + '#g" ' + \
          '/etc/apt-cacher-ng/acng.conf ;' + \
          'sed -i "s/3142/9999/g" /etc/apt-cacher-ng/acng.conf ; ' + \
          'sed -i "s?#Proxy: http://www-proxy.example.net:80?Proxy: ' + \
          'http://proxy:3128?g" /etc/apt-cacher-ng/acng.conf ; ' + \
          'service apt-cacher-ng restart'
    configure = TaktukRemote(cmd, hosts).run()
    if not configure.ok:
        logger.error('Unable to configure and restart the service')
        return

    logger.info('apt-cacher-ng up and running on %s',
                ','.join([style.host(host.address) for host in hosts]))
Example #30
    def __init__(self, hosts, config_file=None,
                 sharding=True, replication=False):
        """Create a new MongoDB cluster with the given hosts.

        Args:
          hosts (list of Host):
            The hosts that make up the cluster.
          config_file (str, optional):
            The path of the config file to be used.
        """

        super(MongoDBCluster, self).__init__(hosts, config_file)

        # Cluster properties
        ctype = self.get_cluster_type()
        self.data_dir = self.config.get("cluster", ctype + "_data_dir")
        self.logs_dir = self.config.get("cluster", ctype + "_logs_dir")
        self.md_port = self.config.getint("cluster", ctype + "_md_port")
        self.ms_port = self.config.getint("cluster", ctype + "_ms_port")
        self.bin_dir = self.base_dir + "/bin"
        self.conf_mandatory_files = [CONF_FILE]

        # Configure master
        self.master = hosts[0]

        self.do_sharding = sharding
        self.initialized_sharding = False
        self.mongos_pid_file = self.base_dir + "/mongos.pid"
        self.do_replication = replication
        # TODO: allow more fine-grained configuration of hosts: assign roles

        logger.info("MongoDB cluster created with master %s and hosts %s %s "
                    "replication, %s sharding",
                    style.host(self.master.address),
                    ' '.join([style.host(h.address.split('.')[0])
                              for h in self.hosts]),
                    "with" if self.do_replication else "without",
                    "with" if self.do_sharding else "without")
Example #31
 def get_host_adapters(self, host):
     """Return the mountable network interfaces from a host"""
     try:
         if host in self.data['hosts']:
             return [m for m in self.data['hosts'][host]['network_adapters']
                     if 'switch' in m
                     and not m['management']
                     and m['mountable']
                     and m['switch']
                     and m['interface'] == 'Ethernet']
     except Exception:
         logger.warning("Wrong description for host %s" % style.host(host))
         logger.debug("host's network_adapters = %s"
                      % (self.data['hosts'][host]['network_adapters'],))
     return []
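
The filter in isolation, applied to made-up adapter records:

adapters = [{'switch': 'gw-rennes', 'management': False,
             'mountable': True, 'interface': 'Ethernet'},
            {'switch': None, 'management': True,
             'mountable': False, 'interface': 'Ethernet'}]
good = [m for m in adapters
        if 'switch' in m and not m['management'] and m['mountable']
        and m['switch'] and m['interface'] == 'Ethernet']
print(len(good))  # 1: only the mountable non-management adapter with a switch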
Example #33
    def get_hosts_list(self, hosts_str):
        """Generate a list of hosts from the given string.

        Args:
          hosts_str (str): The following options are supported

            - The path of the file containing the hosts to be used. Each host
            should be on a separate line. Repeated hosts are pruned.
            Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

            - A comma-separated list of site:job_id

            - A comma-separated list of hosts.

            - An oargrid_job_id

        Return:
          list of Host: The list of hosts.
        """
        hosts = []
        if os.path.isfile(hosts_str):
            for line in open(hosts_str):
                h = Host(line.rstrip())
                if h not in hosts:
                    hosts.append(h)
        elif ':' in hosts_str:
            # We assume the string is a comma separated list of site:job_id
            for job in hosts_str.split(','):
                site, job_id = job.split(':')
                hosts += get_oar_job_nodes(int(job_id), site)
        elif "," in hosts_str:
            # We assume the string is a comma separated list of hosts
            for hstr in hosts_str.split(','):
                h = Host(hstr.rstrip())
                if h not in hosts:
                    hosts.append(h)
        elif hosts_str.isdigit():
            # If the input is a number, we assume it is an oargrid_job_id
            hosts = get_oargrid_job_nodes(int(hosts_str))
        else:
            # If none of the previous match, we assume a single-host cluster
            # where the given input is the only host
            hosts = [Host(hosts_str.rstrip())]

        logger.debug('Hosts list: \n%s',
                     ' '.join(style.host(host.address.split('.')[0])
                              for host in hosts))
        return hosts
Example #34
def configure_apt_proxy(vms):
    """Override apt proxy-guess with server as proxy"""
    hosts_vms = {}
    for vm in vms:
        if not vm['host'] in hosts_vms:
            hosts_vms[vm['host']] = []
        hosts_vms[vm['host']].append(vm['ip'])
    conf = []
    for server, clients in hosts_vms.iteritems():
        server = Host(server)
        logger.detail('Configuring %s as APT proxy for %s',
                      style.host(server.address), ','.join(clients))
        conf.append(TaktukRemote(' echo \'Acquire::http::Proxy \"http://' + 
                                 server.address + ':9999" ; \' > /etc/apt/apt.conf.d/proxy-guess', 
                                 clients))
    ParallelActions(conf).run()
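
The vm dicts only need 'host' and 'ip' here (values made up); note that the port matches the 9999 that setup_aptcacher_server writes into acng.conf:

vms = [{'host': 'parapluie-1.rennes.grid5000.fr', 'ip': '10.166.0.10'},
       {'host': 'parapluie-1.rennes.grid5000.fr', 'ip': '10.166.0.11'}]
# configure_apt_proxy(vms) would write on each VM:
#   Acquire::http::Proxy "http://parapluie-1.rennes.grid5000.fr:9999" ;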
Example #36
    def configure_service_node(self):
        """Setup automatically a DNS server to access virtual machines by id
        and also install a DHCP server if kavlan is used"""
        if self.kavlan:
            service = 'DNS/DHCP'
            dhcp = True
        else:
            service = 'DNS'
            dhcp = False

        service_node = get_fastest_host(self.hosts)
        logger.info('Setting up %s on %s', style.emph(service),
                    style.host(service_node.split('.')[0]))
        clients = list(self.hosts)
        clients.remove(service_node)

        dnsmasq_server(service_node, clients, self.vms, dhcp)
Example #38
    def _print_state_compact(self):
        """Display in a compact form the distribution of vms on hosts."""
        dist = {}
        max_len_host = 0
        for host in self.hosts:
            if len(host.split('.')[0]) > max_len_host:
                max_len_host = len(host.split('.')[0])

        for vm in self.vms:
            host = vm['host'].split('.')[0]
            if len(host) > max_len_host:
                max_len_host = len(host)
            if host not in dist.keys():
                dist[host] = {vm['id']: vm['state']}
            else:
                dist[host][vm['id']] = vm['state']
        log = ''
        for host in sorted(self.hosts,
                           key=lambda x: (x.split('.')[0].split('-')[0],
                                          int(x.split('.')[0].split('-')[1]))):
            host = host.split('.')[0]
            if host not in dist:
                dist[host] = {}

            log += '\n' + style.host(host) + ' '.ljust(max_len_host + 2 - len(host)) + \
                   str(len(dist[host].keys())) + ' '
            try:
                vms = sorted(dist[host].keys(),
                             key=lambda x:
                             (x.split('.')[0].split('-')[0],
                              int(x.split('.')[0].split('-')[1])))
            except (IndexError, ValueError):
                # fall back to a plain sort for ids that do not match the
                # name-number pattern
                vms = sorted(dist[host].keys())
            for vm in vms:
                if dist[host][vm] == 'OK':
                    log += style.OK(vm)
                elif dist[host][vm] == 'KO':
                    log += style.KO(vm)
                else:
                    log += style.Unknown(vm)
                log += ' '
        return log
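
The host sort key used above, in isolation: order by cluster name, then by numeric index, so parapluie-10 sorts after parapluie-2 (hostnames made up):

hosts = ['parapluie-10.rennes.grid5000.fr',
         'parapluie-2.rennes.grid5000.fr']
key = lambda x: (x.split('.')[0].split('-')[0],
                 int(x.split('.')[0].split('-')[1]))
print(sorted(hosts, key=key))  # parapluie-2 first, then parapluie-10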
Example #39
    def _make_reservation(self, site):
        """Make a new reservation"""

        elements = {self.config['cluster']: 1}
        logger.info('Finding slot for the experiment '
                    '\nrally %s:1',
                    style.host(self.config['cluster']).rjust(5))

        planning = funk.get_planning(elements)
        slots = funk.compute_slots(planning,
                                   walltime=self.config['walltime'].encode(
                                       'ascii', 'ignore'),
                                   excluded_elements=EXCLUDED_ELEMENTS)

        startdate, enddate, resources = funk.find_free_slot(slots, elements)
        resources = funk.distribute_hosts(resources, elements,
                                          EXCLUDED_ELEMENTS)

        if startdate is None:
            logger.error("Sorry, could not find the resources requested.")
            exit(4)

        jobs_specs = funk.get_jobs_specs(resources,
                                         name=self.options.job_name,
                                         excluded_elements=EXCLUDED_ELEMENTS)

        logger.debug('Job specs: %s', jobs_specs)

        sub, site = jobs_specs[0]
        sub.additional_options = "-t deploy"
        sub.reservation_date = startdate
        sub.walltime = self.config['walltime'].encode('ascii', 'ignore')
        sub.name = self.options.job_name

        if 'testing' in EX5.get_cluster_attributes(
                self.config['cluster'])['queues']:
            sub.queue = 'testing'

        jobs = EX5.oarsub([(sub, site)])
        self.job_id = jobs[0][0]
        logger.info('Job %s will start at %s', style.emph(self.job_id),
                    style.log_header(EX.time_utils.format_date(startdate)))
Example #40
def generate_hosts(hosts_input):
    """Generate a list of hosts from the given file.

    Args:
      hosts_input: The path of the file containing the hosts to be used,
        a comma separated list of site:job_id, a comma separated list of
        hosts, or an oargrid_job_id.
        If a file is used, each host should be in a different line.
        Repeated hosts are pruned.
        Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

    Return:
      list of Host: The list of hosts.
    """
    hosts = []
    if os.path.isfile(hosts_input):
        for line in open(hosts_input):
            h = Host(line.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif ":" in hosts_input:
        # We assume the string is a comma separated list of site:job_id
        for job in hosts_input.split(","):
            site, job_id = job.split(":")
            hosts += get_oar_job_nodes(int(job_id), site)
    elif "," in hosts_input:
        # We assume the string is a comma separated list of hosts
        for hstr in hosts_input.split(","):
            h = Host(hstr.rstrip())
            if h not in hosts:
                hosts.append(h)
    elif hosts_input.isdigit():
        # If the input is a number, we assume it is an oargrid_job_id
        hosts = get_oargrid_job_nodes(int(hosts_input))
    else:
        # If none of the previous match, we assume a single-host cluster where
        # the given input is the only host
        hosts = [Host(hosts_input.rstrip())]

    logger.debug("Hosts list: \n%s", " ".join(style.host(host.address.split(".")[0]) for host in hosts))
    return hosts
Example #41
    def __init__(self, hadoop_cluster, config_file=None):
        """Create a new Mahout cluster on top of the given Hadoop cluster."""

        # Load cluster properties
        config = ConfigParser(self.defaults)
        config.add_section("cluster")
        config.add_section("local")

        if config_file:
            config.readfp(open(config_file))

        self.base_dir = config.get("cluster", "mahout_base_dir")
        self.conf_dir = config.get("cluster", "mahout_conf_dir")

        self.bin_dir = self.base_dir + "/bin"

        self.hc = hadoop_cluster

        # Log the cluster layout
        logger.info("Mahout cluster created in hosts %s",
                    ' '.join([style.host(h.address.split('.')[0])
                              for h in self.hc.hosts]))
Example #42
def get_job_by_name(job_name, sites=None):
    """ """
    logger.detail('Looking for a job named %s', style.emph(job_name))
    if not sites:
        sites = get_g5k_sites()
    oargrid_jobs = get_current_oargrid_jobs()
    if len(oargrid_jobs) > 0:
        for g_job in oargrid_jobs:
            for job in get_oargrid_job_oar_jobs(g_job):
                info = get_oar_job_info(job[0], job[1])
                if info['name'] == job_name:
                    logger.info('Oargrid job %s found!', style.emph(g_job))
                    return g_job, None
    running_jobs = get_current_oar_jobs(sites)
    for job in running_jobs:
        info = get_oar_job_info(job[0], job[1])
        if info['name'] == job_name:
            logger.info('Job %s found on site %s!', style.emph(job[0]),
                        style.host(job[1]))
            return job
    return None, None
Example #44
    def __init__(self, hadoop_cluster, config_file=None):
        """Create a new Hive cluster. It can be created as a standalone
        cluster or on top of YARN.

        Args:
          hadoop_cluster (HadoopCluster, optional):
            The Hadoop cluster to link.
          config_file (str, optional):
            The path of the config file to be used.
        """

        # Load cluster properties
        config = ConfigParser(self.defaults)
        config.add_section("cluster")
        config.add_section("local")

        if config_file:
            config.readfp(open(config_file))

        self.base_dir = config.get("cluster", "hive_base_dir")
        self.conf_dir = config.get("cluster", "hive_conf_dir")
        self.logs_dir = config.get("cluster", "hive_logs_dir")
        self.warehouse_dir = config.get("cluster", "hive_warehouse_dir")
        self.metastore_dir = config.get("cluster", "hive_metastore_dir")
        self.local_base_conf_dir = config.get("local", "local_base_conf_dir")

        self.bin_dir = self.base_dir + "/bin"

        # Initialize hosts
        self.hosts = hadoop_cluster.hosts
        self.master = hadoop_cluster.master

        # Store reference to the Hadoop cluster
        self.hc = hadoop_cluster

        logger.info("Hive cluster created in hosts %s.%s",
                    ' '.join([style.host(h.address.split('.')[0])
                              for h in self.hosts]),
                    " It is linked to a Hadoop cluster." if self.hc else "")
Example #45
0
    def _print_state_compact(self):
        """Display in a compact form the distribution of vms on hosts."""
        dist = {}
        max_len_host = 0
        for host in self.hosts:
            if len(host.split('.')[0]) > max_len_host:
                max_len_host = len(host.split('.')[0])

        for vm in self.vms:
            host = vm['host'].split('.')[0]
            if len(host) > max_len_host:
                max_len_host = len(host)
            if host not in dist.keys():
                dist[host] = {vm['id']: vm['state']}
            else:
                dist[host][vm['id']] = vm['state']
        log = ''
        for host in sorted(self.hosts, key=lambda x: (x.split('.')[0].split('-')[0],
                                                      int(x.split('.')[0].split('-')[1]))):
            host = host.split('.')[0]
            if host not in dist:
                dist[host] = {}

            log += '\n' + style.host(host) + ' '.ljust(max_len_host + 2 - len(host)) + \
                   str(len(dist[host].keys())) + ' '
            try:
                vms = sorted(dist[host].keys(), key=lambda x: (x.split('.')[0].split('-')[0],
                                                               int(x.split('.')[0].split('-')[1])))
            except:
                vms = sorted(dist[host].keys())
                pass
            for vm in vms:
                if dist[host][vm] == 'OK':
                    log += style.OK(vm)
                elif dist[host][vm] == 'KO':
                    log += style.KO(vm)
                else:
                    log += style.Unknown(vm)
                log += ' '
        return log
Example #46
0
    def __init__(self, hadoop_cluster, config_file=None):
        """Create a new Hive cluster. It can be created as a standalone
        cluster or on top of YARN.

        Args:
          hadoop_cluster (HadoopCluster, optional):
            The Hadoop cluster to link.
          configFile (str, optional):
            The path of the config file to be used.
        """

        # Load cluster properties
        config = ConfigParser(self.defaults)
        config.add_section("cluster")
        config.add_section("local")

        if config_file:
            config.readfp(open(config_file))

        self.base_dir = config.get("cluster", "hive_base_dir")
        self.conf_dir = config.get("cluster", "hive_conf_dir")
        self.logs_dir = config.get("cluster", "hive_logs_dir")
        self.warehouse_dir = config.get("cluster", "hive_warehouse_dir")
        self.metastore_dir = config.get("cluster", "hive_metastore_dir")
        self.local_base_conf_dir = config.get("local", "local_base_conf_dir")

        self.bin_dir = self.base_dir + "/bin"

        # Initialize hosts
        self.hosts = hadoop_cluster.hosts
        self.master = hadoop_cluster.master

        # Store reference to Hadoop cluster and check if mandatory
        self.hc = hadoop_cluster

        logger.info("Hive cluster created in hosts %s."
                    " It is linked to a Hadoop cluster." if self.hc else "",
                    ' '.join([style.host(h.address.split('.')[0])
                              for h in self.hosts]))
Example #47
0
    def __force_clean(self):
        """Stop previous Spark processes (if any) and remove all remote files
        created by it."""

        spark_processes = [
            "Master",
            "Worker"
        ]

        force_kill = False
        for h in self.hosts:
            proc = SshProcess("jps", h)
            proc.run()

            ids_to_kill = []
            for line in proc.stdout.splitlines():
                field = line.split()
                if field[1] in spark_processes:
                    ids_to_kill.append(field[0])

            if ids_to_kill:
                force_kill = True
                ids_to_kill_str = ""
                for pid in ids_to_kill:
                    ids_to_kill_str += " " + pid

                logger.warn(
                    "Killing running Spark processes in host %s" %
                    style.host(h.address.split('.')[0]))

                proc = SshProcess("kill -9" + ids_to_kill_str, h)
                proc.run()

        if force_kill:
            logger.info(
                "Processes from previous hadoop deployments had to be killed")

        self.clean_logs()
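For reference, a minimal standalone sketch of the jps parsing above (the
sample output is made up; only lines whose second field names a Spark daemon
contribute a pid to the kill list):

    sample = '2101 Master\n2202 Worker\n2303 Jps'
    pids = [line.split()[0] for line in sample.splitlines()
            if line.split()[1] in ('Master', 'Worker')]
    # pids == ['2101', '2202']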
Example #48
    def get_hosts_list(self, hosts_str):
        """Generate a list of hosts from the given file.

        Args:
          hosts_str (str): The following options are supported

            - The path of a file containing the hosts to be used, one host
            per line. Repeated hosts are pruned.
            Hint: in a running Grid5000 job, $OAR_NODEFILE should be used.

            - A comma-separated list of hosts.

        Return:
          list of Host: The list of hosts.
        """
        hosts = []
        if os.path.isfile(hosts_str):
            with open(hosts_str) as f:
                for line in f:
                    h = Host(line.rstrip())
                    if h not in hosts:
                        hosts.append(h)
        elif "," in hosts_str:
            # We assume the string is a comma separated list of hosts
            for hstr in hosts_str.split(','):
                h = Host(hstr.rstrip())
                if h not in hosts:
                    hosts.append(h)
        else:
            # Otherwise we assume it is a single-host cluster and the given
            # string is the only host
            hosts = [Host(hosts_str.rstrip())]

        logger.debug('Hosts list: \n%s',
                     ' '.join(style.host(host.address.split('.')[0])
                              for host in hosts))
        return hosts
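A short usage sketch covering the three accepted forms (the receiver object
util, the paths and the host names are hypothetical):

    hosts = util.get_hosts_list('/var/lib/oar/my_nodefile')  # file, one host per line
    hosts = util.get_hosts_list('node-1,node-2,node-3')      # comma-separated list
    hosts = util.get_hosts_list('node-1')                    # single host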
Example #49
    def __init__(self,
                 mode,
                 config_file=None,
                 hosts=None,
                 hadoop_cluster=None):
        """Create a new Spark cluster. It can be created as a standalone
        cluster or on top of YARN.

        Args:
          mode (int):
            The cluster manager that is used (STANDALONE_MODE or YARN_MODE).
          config_file (str, optional):
            The path of the config file to be used.
          hosts (list of Host, optional):
            The hosts of the cluster (standalone operation).
          hadoop_cluster (HadoopCluster, optional):
            The Hadoop cluster to link.
        """

        # Load cluster properties
        config = ConfigParser(self.defaults)
        config.add_section("cluster")
        config.add_section("local")

        if config_file:
            with open(config_file) as f:
                config.readfp(f)

        # Deployment properties
        self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
        self.init_conf_dir = tempfile.mkdtemp("", "spark-init-", "/tmp")
        self.conf_mandatory_files = [SPARK_CONF_FILE]

        self.base_dir = config.get("cluster", "spark_base_dir")
        self.conf_dir = config.get("cluster", "spark_conf_dir")
        self.logs_dir = config.get("cluster", "spark_logs_dir")
        self.evs_log_dir = config.get("cluster", "spark_events_dir")
        self.work_dir = config.get("cluster", "spark_work_dir")
        self.port = config.getint("cluster", "spark_port")

        self.bin_dir = self.base_dir + "/bin"
        self.sbin_dir = self.base_dir + "/sbin"

        self.mode = mode

        self.java_home = None

        # Initialize hosts
        if hosts:
            self.hosts = hosts
            self.master = hosts[0]
        elif hadoop_cluster:
            self.hosts = hadoop_cluster.hosts
            self.master = hadoop_cluster.master
        else:
            logger.error("Hosts in the cluster must be specified either"
                         "directly or indirectly through a Hadoop cluster.")
            raise SparkException("Hosts in the cluster must be specified "
                                 "either directly or indirectly through a "
                                 "Hadoop cluster.")

        # Store cluster information
        self.hw = hw_manager.make_deployment_hardware()
        self.hw.add_hosts(self.hosts)
        self.master_cluster = self.hw.get_host_cluster(self.master)

        # Store reference to Hadoop cluster and check if mandatory
        self.hc = hadoop_cluster
        if not self.hc and self.mode == YARN_MODE:
            logger.error("When using a YARN_MODE mode, a reference to the "
                         "Hadoop cluster should be provided.")
            raise SparkException("When using a YARN_MODE mode, a reference "
                                 "to the Hadoop cluster should be provided")

        if self.mode == STANDALONE_MODE:
            mode_text = "in standalone mode"
        else:
            mode_text = "on top of YARN"
        logger.info(
            "Spark cluster created %s in hosts %s." +
            (" It is linked to a Hadoop cluster." if self.hc else ""),
            mode_text,
            ' '.join([style.host(h.address.split('.')[0])
                      for h in self.hosts]))
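A hedged usage sketch, assuming the class is named SparkCluster and that hc
is an already-created HadoopCluster:

    spark = SparkCluster(YARN_MODE, hadoop_cluster=hc)     # on top of YARN
    spark = SparkCluster(STANDALONE_MODE, hosts=hc.hosts)  # standalone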
Example #50
    def __init__(self, hosts, topo_list=None, config_file=None):
        """Create a new Hadoop cluster with the given hosts and topology.
        
        Args:
          hosts (list of Host):
            The hosts to be assigned a topology.
          topo_list (list of str, optional):
            The racks to be assigned to each host. len(hosts) should be equal
            to len(topo_list).
          config_file (str, optional):
            The path of the config file to be used.
        """

        # Load properties
        config = ConfigParser(self.defaults)
        config.add_section("cluster")
        config.add_section("local")

        if config_file:
            with open(config_file) as f:
                config.readfp(f)

        # Deployment properties
        self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
        self.init_conf_dir = tempfile.mkdtemp("", "hadoop-init-", "/tmp")
        self.conf_mandatory_files = [CORE_CONF_FILE,
                                     HDFS_CONF_FILE,
                                     MR_CONF_FILE]

        # Node properties
        self.base_dir = config.get("cluster", "hadoop_base_dir")
        self.conf_dir = config.get("cluster", "hadoop_conf_dir")
        self.logs_dir = config.get("cluster", "hadoop_logs_dir")
        self.hadoop_temp_dir = config.get("cluster", "hadoop_temp_dir")
        self.hdfs_port = config.getint("cluster", "hdfs_port")
        self.mapred_port = config.getint("cluster", "mapred_port")

        self.bin_dir = self.base_dir + "/bin"
        self.sbin_dir = self.base_dir + "/bin"  # Hadoop 1.x keeps its scripts in bin/ (no sbin/)

        self.java_home = None

        # Configure master and slaves
        self.hosts = list(hosts)
        self.master = self.hosts[0]

        # Create topology
        self.topology = HadoopTopology(hosts, topo_list)

        # Store cluster information
        self.hw = hw_manager.make_deployment_hardware()
        self.hw.add_hosts(self.hosts)
        self.master_cluster = self.hw.get_host_cluster(self.master)

        # Create a string to display the topology
        t = {v: [] for v in self.topology.topology.values()}
        for key, value in self.topology.topology.iteritems():
            t[value].append(key.address)
        log_topo = ', '.join([style.user2(k) + ': ' +
                              ' '.join(map(lambda x: style.host(x.split('.')[0]), v))
                              for k, v in t.iteritems()])
        
        logger.info("Hadoop cluster created with master %s, hosts %s and "
                    "topology %s",
                    style.host(self.master.address), 
                    ' '.join([style.host(h.address.split('.')[0])
                              for h in self.hosts]),
                    log_topo)
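A hedged usage sketch, assuming the class is named HadoopCluster and that
hosts is a list of execo Host objects (the rack name is hypothetical):

    hc = HadoopCluster(hosts, topo_list=['/rack-1'] * len(hosts))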
Example #51
    def __init__(self, mode, config_file=None, hosts=None,
                 hadoop_cluster=None):
        """Create a new Spark cluster. It can be created as a standalone
        cluster or on top of YARN.

        Args:
          mode (int):
            The cluster manager that is used (STANDALONE_MODE or YARN_MODE).
          config_file (str, optional):
            The path of the config file to be used.
          hosts (list of Host, optional):
            The hosts of the cluster (standalone operation).
          hadoop_cluster (HadoopCluster, optional):
            The Hadoop cluster to link.
        """

        # Load cluster properties
        config = ConfigParser(self.defaults)
        config.add_section("cluster")
        config.add_section("local")

        if config_file:
            with open(config_file) as f:
                config.readfp(f)

        # Deployment properties
        self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
        self.init_conf_dir = tempfile.mkdtemp("", "spark-init-", "/tmp")
        self.conf_mandatory_files = [SPARK_CONF_FILE]

        self.base_dir = config.get("cluster", "spark_base_dir")
        self.conf_dir = config.get("cluster", "spark_conf_dir")
        self.logs_dir = config.get("cluster", "spark_logs_dir")
        self.evs_log_dir = config.get("cluster", "spark_events_dir")
        self.work_dir = config.get("cluster", "spark_work_dir")
        self.port = config.getint("cluster", "spark_port")

        self.bin_dir = self.base_dir + "/bin"
        self.sbin_dir = self.base_dir + "/sbin"

        self.mode = mode

        self.java_home = None

        # Initialize hosts
        if hosts:
            self.hosts = hosts
            self.master = hosts[0]
        elif hadoop_cluster:
            self.hosts = hadoop_cluster.hosts
            self.master = hadoop_cluster.master
        else:
            logger.error("Hosts in the cluster must be specified either"
                         "directly or indirectly through a Hadoop cluster.")
            raise SparkException("Hosts in the cluster must be specified "
                                 "either directly or indirectly through a "
                                 "Hadoop cluster.")

        # Store cluster information
        self.hw = G5kDeploymentHardware()
        self.hw.add_hosts(self.hosts)
        self.master_cluster = self.hw.get_cluster(get_host_cluster(self.master))

        # Store reference to Hadoop cluster and check if mandatory
        self.hc = hadoop_cluster
        if not self.hc and self.mode == YARN_MODE:
            logger.error("When using a YARN_MODE mode, a reference to the "
                         "Hadoop cluster should be provided.")
            raise SparkException("When using a YARN_MODE mode, a reference "
                                 "to the Hadoop cluster should be provided")

        if self.mode == STANDALONE_MODE:
            mode_text = "in standalone mode"
        else:
            mode_text = "on top of YARN"
        logger.info("Spark cluster created %s in hosts %s." +
                    (" It is linked to a Hadoop cluster." if self.hc else ""),
                    mode_text,
                    ' '.join([style.host(h.address.split('.')[0])
                              for h in self.hosts]))
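A sketch of a config file this constructor would accept (INI syntax read by
ConfigParser; the keys are the ones queried above, the values are
hypothetical):

    [cluster]
    spark_base_dir = /opt/spark
    spark_conf_dir = /opt/spark/conf
    spark_logs_dir = /opt/spark/logs
    spark_events_dir = /opt/spark/events
    spark_work_dir = /opt/spark/work
    spark_port = 7077

    [local]
    local_base_conf_dir = ./conf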
Example #52
    def __init__(self, hosts, topo_list=None, config_file=None):
        """Create a new Hadoop cluster with the given hosts and topology.
        
        Args:
          hosts (list of Host):
            The hosts to be assigned a topology.
          topo_list (list of str, optional):
            The racks to be assigned to each host. len(hosts) should be equal
            to len(topo_list).
          config_file (str, optional):
            The path of the config file to be used.
        """

        # Load properties
        config = ConfigParser(self.defaults)
        config.add_section("cluster")
        config.add_section("local")

        if config_file:
            with open(config_file) as f:
                config.readfp(f)

        # Deployment properties
        self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
        self.init_conf_dir = tempfile.mkdtemp("", "hadoop-init-", "/tmp")
        self.conf_mandatory_files = [
            CORE_CONF_FILE, HDFS_CONF_FILE, MR_CONF_FILE
        ]

        # Node properties
        self.base_dir = config.get("cluster", "hadoop_base_dir")
        self.conf_dir = config.get("cluster", "hadoop_conf_dir")
        self.logs_dir = config.get("cluster", "hadoop_logs_dir")
        self.hadoop_temp_dir = config.get("cluster", "hadoop_temp_dir")
        self.hdfs_port = config.getint("cluster", "hdfs_port")
        self.mapred_port = config.getint("cluster", "mapred_port")

        self.bin_dir = self.base_dir + "/bin"
        self.sbin_dir = self.base_dir + "/bin"  # Hadoop 1.x keeps its scripts in bin/ (no sbin/)

        self.java_home = None

        # Configure master and slaves
        self.hosts = list(hosts)
        self.master = self.hosts[0]

        # Create topology
        self.topology = HadoopTopology(hosts, topo_list)

        # Store cluster information
        self.hw = hw_manager.make_deployment_hardware()
        self.hw.add_hosts(self.hosts)
        self.master_cluster = self.hw.get_host_cluster(self.master)

        # Create a string to display the topology
        t = {v: [] for v in self.topology.topology.values()}
        for key, value in self.topology.topology.iteritems():
            t[value].append(key.address)
        log_topo = ', '.join([
            style.user2(k) + ': ' +
            ' '.join(map(lambda x: style.host(x.split('.')[0]), v))
            for k, v in t.iteritems()
        ])

        logger.info(
            "Hadoop cluster created with master %s, hosts %s and "
            "topology %s", style.host(self.master.address),
            ' '.join([style.host(h.address.split('.')[0])
                      for h in self.hosts]), log_topo)
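Likewise, a sketch of a Hadoop config file accepted by this constructor (the
keys are the ones queried above, the values are hypothetical):

    [cluster]
    hadoop_base_dir = /opt/hadoop
    hadoop_conf_dir = /opt/hadoop/conf
    hadoop_logs_dir = /opt/hadoop/logs
    hadoop_temp_dir = /tmp/hadoop
    hdfs_port = 54310
    mapred_port = 54311

    [local]
    local_base_conf_dir = ./conf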
Example #53
#!/usr/bin/env python
from execo_g5k.topology import g5k_graph, treemap
from execo.log import logger, style
from execo_g5k.oar import get_oar_job_nodes
from execo_g5k.utils import hosts_list
from networkx.algorithms.shortest_paths.generic import shortest_path
from execo_g5k.api_utils import get_host_shortname
from random import randint

jobs = [(1696863, 'grenoble'), (1502558, 'lille'), (74715, 'luxembourg')]

logger.info('Retrieving hosts used for jobs %s',
            ', '.join([style.host(site) + ':' + style.emph(job_id)
                       for job_id, site in jobs]))
hosts = [get_host_shortname(h) for job_id, site in jobs
         for h in get_oar_job_nodes(job_id, site)]
logger.info(hosts_list(hosts))

logger.info('Creating topological graph')
g = g5k_graph(elements=hosts)

i, j = randint(0, len(hosts) - 1), randint(0, len(hosts) - 1)
path = shortest_path(g, hosts[i], hosts[j])
logger.info('Communication between %s and %s goes through '
            'the following links: \n%s',
            style.host(hosts[i]),
            style.host(hosts[j]),
            ' -> '.join(path))

logger.info('Active links between nodes %s and %s are: \n%s',
            style.host(path[0]),
            style.host(path[-1]),
            '\n'.join(['%s -> %s' % (path[n], path[n + 1])
                       for n in range(len(path) - 1)]))
Example #54
#!/usr/bin/env python
from execo_g5k.topology import g5k_graph, treemap
from execo.log import logger, style
from execo_g5k.oar import get_oar_job_nodes
from execo_g5k.utils import hosts_list
from networkx.algorithms.shortest_paths.generic import shortest_path
from execo_g5k.api_utils import get_host_shortname
from random import randint

jobs = [(1696863, 'grenoble'), (1502558, 'lille'), (74715, 'luxembourg')]

logger.info(
    'Retrieving hosts used for jobs %s', ', '.join([
        style.host(site) + ':' + style.emph(job_id) for job_id, site in jobs
    ]))
hosts = [
    get_host_shortname(h) for job_id, site in jobs
    for h in get_oar_job_nodes(job_id, site)
]
logger.info(hosts_list(hosts))

logger.info('Creating topological graph')
g = g5k_graph(elements=hosts)

i, j = randint(0, len(hosts) - 1), randint(0, len(hosts) - 1)
path = shortest_path(g, hosts[i], hosts[j])
logger.info(
    'Communication between %s and %s goes through '
    'the following links: \n%s', style.host(hosts[i]), style.host(hosts[j]),
    ' -> '.join(path))
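Both excerpts import treemap without using it in the shown code; a hedged
sketch of how it could render the same graph, assuming treemap returns a
matplotlib figure as in execo_g5k.topology:

    fig = treemap(g)
    fig.savefig('g5k_topology.png')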
Example #55
    def __init__(self,
                 infile=None,
                 resources=None,
                 hosts=None,
                 ip_mac=None,
                 vlan=None,
                 env_name=None,
                 env_file=None,
                 vms=None,
                 distribution=None,
                 outdir=None):
        """:param infile: an XML file that describe the topology of the
        deployment

        :param resources: a dict whose keys are Grid'5000 sites and values are
        dict, whose keys are hosts and ip_mac, where hosts is a list of
        execo.Host and ip_mac is a list of tuple (ip, mac).

        :param env_name: name of the Kadeploy environment

        :param env_file: path to the Kadeploy environment file

        :param vms: dict defining the virtual machines

        :param distribution: how to distribute the vms on the hosts
        (``round-robin``, ``concentrated``, ``random``)

        :param outdir: directory to store the deployment files
        """
        # Set a factory for the deployment that uses TakTuk and ChainPut
        self.fact = ActionFactory(remote_tool=TAKTUK,
                                  fileput_tool=CHAINPUT,
                                  fileget_tool=TAKTUK)
        self.kavlan = None if not vlan else vlan
        self.kavlan_site = None
        if env_name is not None:
            self.env_file = None
            if ':' not in env_name:
                self.env_name, self.env_user = env_name, None
            else:
                self.env_user, self.env_name = env_name.split(':')
        else:
            if env_file is not None:
                self.env_name = None
                self.env_user = None
                self.env_file = env_file
            else:
                self.env_name = 'vm5k'
                self.env_user = '******'
                self.env_file = None

        if outdir:
            self.outdir = outdir
        else:
            self.outdir = 'vm5k_' + strftime("%Y%m%d_%H%M%S_%z")

        self.copy_actions = None

        self.state = Element('vm5k')
        self._define_elements(infile, resources, hosts, vms, ip_mac,
                              distribution)

        logger.info('%s %s %s %s %s %s %s %s', len(self.sites),
                    style.emph('sites'), len(self.clusters),
                    style.user1('clusters'), len(self.hosts),
                    style.host('hosts'), len(self.vms), style.vm('vms'))
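A minimal sketch of the resources argument described in the docstring (the
site, host, ip and mac values are hypothetical):

    resources = {
        'rennes': {
            'hosts': [Host('paranoia-2.rennes.grid5000.fr')],
            'ip_mac': [('10.158.0.1', '00:16:3e:00:00:01')],
        },
    }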
Example #56
    def workflow(self, comb):
        """
            Compute one application launch 
            using a given parameter group
        """
        comb_ok = False
        try:
            # Generate configuration file needed by MPI processes
            logger.info("Generating assembly file...")
            py = comb['cores'] / comb['px']  # integer division (Python 2)
            prepare = Process(
                'cd %s && python %s %d %d %d %d %d %s app.lad' %
                (self.workingPath, self.genLadScript, comb['datasize'],
                 comb['datasize'], comb['datasize'], comb['px'], py,
                 comb['transposition']))
            prepare.shell = True
            prepare.run()

            # Generate the MPI host file
            mfile = self.generate_machine_file()

            # Start L2C
            lad = "./app.lad"
            logger.info("Computing...")
            res = Process(
                "export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && l2c_loader -M,-machinefile,%s --mpi -c %d %s"
                % (self.workingPath, mfile, comb['cores'], lad))
            res.shell = True
            res.stdout_handlers.append(
                os.path.join(self.result_dir,
                             slugify(comb) + '.out'))
            res.stdout_handlers.append(sys.stdout)
            res.stderr_handlers.append(
                os.path.join(self.result_dir,
                             slugify(comb) + '.err'))
            res.stderr_handlers.append(sys.stderr)
            res.run()
            if not res.ok:
                logger.error('Bad L2C termination')
                raise Exception('Bad L2C termination')
            # WARNING: happens e.g. when L2C cannot find the LAD file
            if len(res.stderr) > 0:
                logger.warning('Error output is not empty')

            # Clean configuration files
            logger.info("Removing assembly files...")
            res = Process('cd %s && rm -f app.lad*' % self.workingPath)
            res.shell = True
            res.run()

            comb_ok = True
        except Exception:
            logger.exception('Workflow failed for combination %s',
                             slugify(comb))
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(style.host(slugify(comb)) + ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(
                    style.host(slugify(comb)) + ' has been canceled')

            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))
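A minimal sketch of the comb parameter group this workflow consumes (the keys
are the ones read above; the values are hypothetical):

    comb = {'datasize': 256, 'cores': 64, 'px': 8, 'transposition': 'YX'}

With these values, py = 64 / 8 = 8 in the assembly-file generation step.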