Exemple #1
0
                def takeMetric(
                        path,
                        startTime,
                        endTime,
                        metric=['cpu', 'mem', 'disk', 'swap', 'network']):
                    opt = ''
                    cmd_template_sar = (
                        "sar -f /var/log/sysstat/sa* -{opt} -s {startTime} -e {endTime}"
                    )
                    for met in metric:
                        if met == 'cpu':
                            opt = 'u'
                        elif met == 'mem':
                            opt = 'r'
                        elif met == 'disk':
                            opt = 'dp'
                        elif met == 'swap':
                            opt = 'S'
                        elif met == 'network':
                            opt = 'n DEV'

                        cmd = cmd_template_sar.format(opt=opt,
                                                      startTime=startTime,
                                                      endTime=endTime)
                        for host in self.cluster:
                            hE = SshProcess(cmd,
                                            host,
                                            connection_params={'user': '******'})
                            hE.run()
                            stdMetric = host + '-' + met + '.txt'
                            with open(os.path.join(path, stdMetric),
                                      "w") as sout:
                                sout.write(hE.stdout)
Exemple #2
0
def _munin_server(server, clients):
    """Install the monitoring service munin. Must be executed inside Grid'5000
    to be able to resolve the server and clients IP.

    :param server: a execo.Host

    :param clients: a list of execo.Hosts

    :param plugins: a list of munin plugins

    """
    logger.info(
        'Munin monitoring service installation, server = %s, clients = \n %s',
        server.address, [host.address for host in clients])

    logger.debug('Configuring munin server %s', style.host('server'))
    cmd = 'export DEBIAN_MASTER=noninteractive ; apt-get update && apt-get install -y munin'
    inst_munin_server = SshProcess(cmd, server).run()

    logger.debug('Creating configuration files for server')
    fd, server_conf = mkstemp(dir='/tmp/', prefix='munin-nodes_')
    f = fdopen(fd, 'w')
    for host in clients:
        get_ip = Process('host ' + host.address).run()
        ip = get_ip.stdout.strip().split(' ')[3]
        f.write('[' + host.address + ']\n    address ' + ip +
                '\n   use_node_name yes\n\n')
    f.close()

    Put([server], [server_conf], remote_location='/etc/').run()
    SshProcess('cd /etc && cp ' + server_conf.split('/')[-1] + ' munin.conf',
               server).run()
    Process('rm ' + server_conf).run()
 def run_xp(self):
     """Iterate over the parameters and execute the bench"""
     while len(self.sweeper.get_remaining()) > 0:
         comb = self.sweeper.get_next()
         if comb['n_core'] > get_host_attributes(comb['cluster']+'-1')['architecture']['smt_size'] * self.n_nodes: 
             self.sweeper.skip(comb)
             continue
         logger.info('Processing new combination %s' % (comb,))
         site = get_cluster_site(comb['cluster'])
         jobs = oarsub([(OarSubmission(resources = "{cluster='" + comb['cluster']+"'}/nodes=" + str(self.n_nodes),
                                       job_type = 'allow_classic_ssh', 
                                       walltime ='0:10:00'), 
                         site)])
         if jobs[0][0]:
             try:
                 wait_oar_job_start(*jobs[0])
                 nodes = get_oar_job_nodes(*jobs[0])
                 bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                     ",".join([node.address for node in nodes]),
                     comb['n_core'],
                     get_mpi_opts(comb['cluster']),
                     comb['size'],
                     comb['n_core'])
                 lu_bench = SshProcess(bench_cmd, nodes[0])
                 lu_bench.stdout_handlers.append(self.result_dir + '/' + slugify(comb) + '.out')
                 lu_bench.run()
                 if lu_bench.ok:
                     logger.info("comb ok: %s" % (comb,))
                     self.sweeper.done(comb)
                     continue
             finally:
                 oardel(jobs)
         logger.info("comb NOT ok: %s" % (comb,))
         self.sweeper.cancel(comb)
Exemple #4
0
def set_cpu_load(load, vm_ip, pid):
    """Use cpulimit to change process intensity on vm"""
    logger.info('kill cpu_limit on %s and set it to %s', vm_ip, load)
    kill_cpu_limit = SshProcess('ps aux| grep "cpulimit" | grep -v "grep" | awk \'{print $2}\' | xargs -r kill -9',
                                vm_ip).run()
    start_cpu_limit = SshProcess('cpulimit -p ' + str(pid) + ' -l ' + str(load), vm_ip)
    start_cpu_limit.nolog_exit_code = start_cpu_limit.ignore_exit_code = True
    start_cpu_limit.start()
Exemple #5
0
def set_cpu_load(load, vm_ip, pid):
    """Use cpulimit to change process intensity on vm"""
    logger.info('kill cpu_limit on %s and set it to %s', vm_ip, load)
    kill_cpu_limit = SshProcess(
        'ps aux| grep "cpulimit" | grep -v "grep" | awk \'{print $2}\' | xargs -r kill -9',
        vm_ip).run()
    start_cpu_limit = SshProcess(
        'cpulimit -p ' + str(pid) + ' -l ' + str(load), vm_ip)
    start_cpu_limit.nolog_exit_code = start_cpu_limit.ignore_exit_code = True
    start_cpu_limit.start()
Exemple #6
0
    def get_memory_and_cores(self, host):
        """Obtain the total available memory in MB and number of cores of the
        given host.

        Supported systems include Linux and Max OS X.. In linux it uses nproc
        and /proc/meminfo to obtain the informtion. In Max OS X it uses
        system_profiler.

        Args:
          host (Host): The host to query.

        Return:
          tuple of (int, int): A tuple containing the host available memory in
            MB and its number of cores.
        """

        linux_cmd = "nproc && " \
                    "cat /proc/meminfo | grep MemTotal | awk '{ print $2,$3 }'"

        max_cmd = "system_profiler SPHardwareDataType | grep Cores | " \
                  "awk '{ print $NF }' && " \
                  "system_profiler SPHardwareDataType | grep Memory | " \
                  "awk '{ print $2,$3 }'"

        undef_str = "?"
        undef_cmd = "echo '" + undef_str + "'"

        command = 'if [ $(uname) == "Linux" ]; then %s; ' \
                  'elif [ $(uname) == "Darwin" ]; then %s; ' \
                  'else %s; fi' % \
                  (linux_cmd, max_cmd, undef_cmd)

        proc = SshProcess(command, host)
        proc.run()

        out = proc.stdout

        if out == undef_str:
            return None
        else:
            units = {
                "kb": lambda x: int(x) // 1024,
                "mb": lambda x: int(x),
                "gb": lambda x: int(x) * 1024,
            }

            (cores_str, mem_str) = out.splitlines()
            cores = int(cores_str)
            (num, unit) = mem_str.split()
            mem = units[unit.lower()](num)

        return mem, cores
Exemple #7
0
    def bootstrap(self, tar_file):
        """Install Cassandra in all cluster nodes from the specified tar.gz file.

        Args:
          tar_file (str):
            The file containing Cassandra binaries.
        """

        # 0. Check that required packages are present
        required_packages = "openjdk-7-jre openjdk-7-jdk"
        check_packages = TaktukRemote("dpkg -s " + required_packages, self.hosts)
        for p in check_packages.processes:
            p.nolog_exit_code = p.nolog_error = True
        check_packages.run()
        if not check_packages.ok:
            logger.info("Packages not installed, trying to install")
            install_packages = TaktukRemote(
                "export DEBIAN_MASTER=noninteractive ; "
                + "apt-get update && apt-get install -y --force-yes "
                + required_packages,
                self.hosts,
            ).run()
            if not install_packages.ok:
                logger.error("Unable to install the packages")

        get_java_home = SshProcess("echo $(readlink -f /usr/bin/javac | " 'sed "s:/bin/javac::")', self.master)
        get_java_home.run()
        self.java_home = get_java_home.stdout.strip()

        logger.info("All required packages are present")

        # 1. Copy hadoop tar file and uncompress
        logger.info("Copy " + tar_file + " to hosts and uncompress")
        rm_dirs = TaktukRemote("rm -rf " + self.base_dir + " " + self.conf_dir + " " + self.logs_dir, self.hosts)
        put_tar = TaktukPut(self.hosts, [tar_file], "/tmp")
        tar_xf = TaktukRemote("tar xf /tmp/" + os.path.basename(tar_file) + " -C /tmp", self.hosts)
        SequentialActions([rm_dirs, put_tar, tar_xf]).run()

        # 2. Move installation to base dir and create other dirs
        logger.info("Create installation directories")
        mv_base_dir = TaktukRemote(
            "mv /tmp/" + os.path.basename(tar_file).replace(".tar.gz", "") + " " + self.base_dir, self.hosts
        )
        mkdirs = TaktukRemote("mkdir -p " + self.conf_dir + " && mkdir -p " + self.logs_dir, self.hosts)
        chmods = TaktukRemote(
            "chmod g+w " + self.base_dir + " && chmod g+w " + self.conf_dir + " && chmod g+w " + self.logs_dir,
            self.hosts,
        )
        SequentialActions([mv_base_dir, mkdirs, chmods]).run()
Exemple #8
0
def get_server_iface(server):
    """Get the default network interface of the serve """
    logger.debug('Retrieving default interface from %s',
                 style.host(server.address))
    get_if = SshProcess('ip route |grep default |cut -d " " -f 5',
                        server).run()
    return get_if.stdout.strip()
Exemple #9
0
    def _enable_bridge(self, name='br0'):
        """We need a bridge to have automatic DHCP configuration for the VM."""
        logger.detail('Configuring the bridge')
        hosts_br = self._get_bridge(self.hosts)
        nobr_hosts = []
        for host, br in hosts_br.iteritems():
            if br is None:
                logger.debug('No bridge on host %s', style.host(host))
                nobr_hosts.append(host)
            elif br != name:
                logger.debug('Wrong bridge on host %s, destroying it',
                             style.host(host))
                SshProcess('ip link set ' + br + ' down ; brctl delbr ' + br,
                           host).run()
                nobr_hosts.append(host)
            else:
                logger.debug('Bridge %s is present on host %s',
                             style.emph('name'), style.host(host))

        nobr_hosts = map(lambda x: x.address
                         if isinstance(x, Host) else x, nobr_hosts)

        if len(nobr_hosts) > 0:
            logger.debug('Creating bridge on %s', hosts_list(nobr_hosts))
            script = 'export br_if=`ip route |grep default |cut -f 5 -d " "`; \n' + \
    'ifdown $br_if ; \n' + \
    'sed -i "s/$br_if inet dhcp/$br_if inet manual/g" /etc/network/interfaces ; \n' + \
    'sed -i "s/auto $br_if//g" /etc/network/interfaces ; \n' + \
    'echo " " >> /etc/network/interfaces ; \n' + \
    'echo "auto ' + name + '" >> /etc/network/interfaces ; \n' + \
    'echo "iface ' + name + ' inet dhcp" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_ports $br_if" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_stp off" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_maxwait 0" >> /etc/network/interfaces ; \n' + \
    'echo "  bridge_fd 0" >> /etc/network/interfaces ; \n' + \
    'ifup ' + name
            fd, br_script = mkstemp(dir='/tmp/', prefix='create_br_')
            f = fdopen(fd, 'w')
            f.write(script)
            f.close()

            self.fact.get_fileput(nobr_hosts, [br_script]).run()
            self.fact.get_remote('nohup sh ' + br_script.split('/')[-1],
                                 nobr_hosts).run()

            logger.debug('Waiting for network restart')
            if_up = False
            nmap_tries = 0
            while (not if_up) and nmap_tries < 20:
                sleep(20)
                nmap_tries += 1
                nmap = Process('nmap ' +
                               ' '.join([host for host in nobr_hosts]) +
                               ' -p 22').run()
                for line in nmap.stdout.split('\n'):
                    if 'Nmap done' in line:
                        if_up = line.split()[2] == line.split()[5].replace(
                            '(', '')
            logger.debug('Network has been restarted')
        logger.detail('All hosts have the bridge %s', style.emph(name))
Exemple #10
0
def dnsmasq_server(server, clients=None, vms=None, dhcp=True):
    """Configure a DHCP server with dnsmasq

    :param server: host where the server will be installed

    :param clients: list of hosts that will be declared in dnsmasq

    :param vms: list of virtual machines

    """
    logger.debug('Installing and configuring a DNS/DHCP server on %s', server)

    test_running = Process('nmap ' + server + ' -p 53 | grep domain')
    test_running.shell = True
    test_running.run()
    if 'open' in test_running.stdout:
        logger.info('DNS server already running, updating configuration')
    else:
        cmd = 'killall dnsmasq; export DEBIAN_MASTER=noninteractive ; ' + \
            'apt-get update ; apt-get -y purge dnsmasq-base ; ' + \
            'apt-get install -t wheezy -o Dpkg::Options::="--force-confdef" ' + \
            '-o Dpkg::Options::="--force-confnew" ' + \
            '-y dnsmasq; echo 1 > /proc/sys/net/ipv4/ip_forward '
        SshProcess(cmd, server).run()

    sites = list(
        set([
            get_host_site(client)
            for client in clients if get_host_site(client)
        ] + [get_host_site(server)]))
    add_vms(vms, server)
    if clients:
        kill_dnsmasq = TaktukRemote('killall dnsmasq', clients)
        for p in kill_dnsmasq.processes:
            p.ignore_exit_code = p.nolog_exit_code = True
        kill_dnsmasq.run()
        resolv_conf(server, clients, sites)

    if dhcp:
        sysctl_conf(server, vms)
        dhcp_conf(server, vms, sites)

    logger.debug('Restarting service ...')
    cmd = 'service dnsmasq stop ; rm /var/lib/misc/dnsmasq.leases ; ' + \
        'service dnsmasq start',
    SshProcess(cmd, server).run()
Exemple #11
0
                def runMpi(cmd):
                    act = SshProcess(cmd,
                                     master,
                                     connection_params={'user': '******'},
                                     shell=True)
                    act.run()

                    if not os.path.exists(curPath):
                        os.makedirs(curPath)

                    with open(os.path.join(curPath, "stdout.txt"),
                              "a+") as sout, open(
                                  os.path.join(curPath, "stderr.txt"),
                                  "w") as serr:
                        sout.write(act.stdout)
                        serr.write(act.stderr)
                    return act.ok
Exemple #12
0
def restart_vms(vms):
    """ """
    hosts = [vm['host'] for vm in vms]
    running_vms = list_vm(hosts)
    for vm in vms:
        if {'id': vm['id']} not in running_vms[vm['host']]:
            logger.info('%s has not been started on %s, starting it',
                        style.vm(vm['id']), style.host(vm['host']))
            SshProcess('virsh start ' + vm['id'], vm['host']).run()
Exemple #13
0
    def define_parameters(self):
        nbNodes = len(self.cluster)
        # build parameters and make nbCore list per benchmark
        freqList = [2534000, 2000000, 1200000]
        n_nodes = float(len(self.cluster))
        max_core = SshProcess('cat /proc/cpuinfo | grep -i processor |wc -l',
                              self.cluster[0],
                              connection_params={
                                  'user': '******'
                              }).run().stdout
        max_core = n_nodes * float(max_core)
        even = filter(
            lambda i: i > n_nodes,
            list(takewhile(lambda i: i < max_core,
                           (2**i for i in count(0, 1)))))
        powerTwo = filter(
            lambda i: i > n_nodes,
            list(takewhile(lambda i: i < max_core,
                           (i**2 for i in count(0, 1)))))

        # Define parameters
        self.parameters = {
            'Repeat': [1],
            "Freq": [2534000],
            "NPBclass": ['C'],
            "Benchmark": {
                # 'ft': {
                #     'n_core': even
                #     },
                # 'ep': {
                #     'n_core': even
                #     },
                # 'lu': {
                #     'n_core': even
                #     },
                # 'is': {
                #     'n_core': even
                #     },
                # 'sg': {
                #     'n_core': even
                #     },
                # 'bt': {
                #     'n_core': powerTwo
                #     },
                'sp': {
                    'n_core': powerTwo
                }
            }
        }

        logger.info(self.parameters)
        # make all possible parameters object,
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))
Exemple #14
0
 def import_from_kaenv(self, env, remote=None):
     """import a kadeploy environment from	a kadeploy database"""
     env = re.match(
         r"^(?P<name>[-_.\w]+)(?:@(?P<user>[_.\w]+))?(:?:(?P<version>[_.\w]+))?$",
         env).groupdict("")
     if env['user']:
         env['user'] = "******" + env['user']
     if env['version']:
         env['version'] = " --env-version " + env['version']
     kaenv_cmd = "kaenv3{user}{version} -p {name}".format(**env)
     if remote:
         remote = re.match(
             r"^(?:(?P<user>[-_.\w]+)@)?(?P<address>[-_.\w]+)(?::(?P<port>\d{1,5}))?$",
             remote).groupdict()
         p = SshProcess(kaenv_cmd, Host(**remote))
     else:
         p = Process(kaenv_cmd, shell=True)
     p.run()
     self.desc = yaml.load(p.stdout)
     return self
Exemple #15
0
def add_vms(vms, server):
    """Generate the list of virtual machines """
    logger.debug('Adding the VM in /etc/hosts ...')
    fd, vms_list = mkstemp(dir='/tmp/', prefix='vms_')
    f = fdopen(fd, 'w')
    f.write('\n' + '\n'.join([vm['ip'] + ' \t ' + vm['id'] for vm in vms]))
    f.close()
    Put([server], [vms_list], remote_location='/etc/').run()
    SshProcess(
        '[ -f /etc/hosts.bak ] && cp /etc/hosts.bak /etc/hosts || ' +
        ' cp /etc/hosts /etc/hosts.bak', server).run()
    Remote('cat /etc/' + vms_list.split('/')[-1] + ' >> /etc/hosts',
           [server]).run()
    Process('rm ' + vms_list).run()
Exemple #16
0
def sysctl_conf(server, vms):
    """Change the default value of net.ipv4.neigh.default.gc_thresh*
    to handle large number of IP"""
    val = int(2**ceil(log(len(vms), 2)))
    conf = "\nnet.ipv4.neigh.default.gc_thresh3 = " + str(3 * val) + \
        "\nnet.ipv4.neigh.default.gc_thresh2 = " + str(2 * val) + \
        "\nnet.ipv4.neigh.default.gc_thresh1 = " + str(val)
    fd, sysctl = mkstemp(dir='/tmp/', prefix='sysctl_')
    f = fdopen(fd, 'w')
    f.write(conf)
    f.close()
    Put([server], [sysctl], remote_location='/etc/').run()
    SshProcess(
        'cd /etc && cat ' + sysctl.split('/')[-1] +
        ' >> sysctl.conf && sysctl -p', server).run()
    Process('rm ' + sysctl).run()
Exemple #17
0
def get_cpu_topology(cluster, xpdir=None):
    """ """
    logger.info('Determining the architecture of cluster ' + \
                style.emph(cluster))
    root = None
    # Trying to reed topology from a directory
    if xpdir:
        fname = xpdir + '/topo_' + cluster + '.xml'
        try:
            tree = parse(fname)
            root = tree.getroot()
        except:
            logger.info('No cache file found, will reserve a node and ' + \
                        'determine topology from virsh capabilities')
            pass

    if root is None:
        frontend = get_cluster_site(cluster)
        submission = OarSubmission(resources="{cluster='" + cluster +
                                   "'}/nodes=1",
                                   walltime="0:02:00",
                                   job_type="allow_classic_ssh")
        ((job_id, _), ) = oarsub([(submission, frontend)])
        wait_oar_job_start(job_id, frontend)
        host = get_oar_job_nodes(job_id, frontend)[0]
        capa = SshProcess('virsh capabilities',
                          host,
                          connection_params={
                              'user':
                              default_frontend_connection_params['user']
                          }).run()
        oardel([(job_id, frontend)])
        root = fromstring(capa.stdout)
        if xpdir is not None:
            tree = ElementTree(root)
            tree.write(fname)

    cpu_topology = []
    i_cell = 0
    for cell in root.findall('.//cell'):
        cpu_topology.append([])
        for cpu in cell.findall('.//cpu'):
            cpu_topology[i_cell].append(int(cpu.attrib['id']))
        i_cell += 1
    logger.info(pformat(cpu_topology))
    return cpu_topology
Exemple #18
0
def dhcp_conf(server, vms, sites):
    """Generate the dnsmasq.conf with dhcp parameters and
    put it on the server"""
    logger.debug('Creating dnsmasq.conf')
    ip_mac = [(vm['ip'], vm['mac']) for vm in vms]
    dhcp_lease = 'dhcp-lease-max=10000\n'
    dhcp_range = 'dhcp-range=' + ip_mac[0][0] + ',' + ip_mac[len(vms) -
                                                             1][0] + ',12h\n'
    dhcp_router = 'dhcp-option=option:router,' + get_server_ip(server) + '\n'
    dhcp_hosts = '' + '\n'.join([
        'dhcp-host=' + ':' + ip_mac[i][1] + ',' + vms[i]['id'] + ',' +
        ip_mac[i][0] for i in range(len(vms))
    ])
    dhcp_option = 'dhcp-option=option:domain-search,grid5000.fr,' + \
        ','.join([site + '.grid5000.fr' for site in sites]) + '\n'
    fd, dnsmasq = mkstemp(dir='/tmp/', prefix='dnsmasq_')
    f = fdopen(fd, 'w')
    f.write(dhcp_lease + dhcp_range + dhcp_router + dhcp_hosts + '\n' +
            dhcp_option)
    f.close()
    Put([server], [dnsmasq], remote_location='/etc/').run()
    SshProcess('cd /etc && cp ' + dnsmasq.split('/')[-1] + ' dnsmasq.conf',
               server).run()
    Process('rm ' + dnsmasq).run()
Exemple #19
0
def get_java_home(host):
    proc = SshProcess('echo $(readlink -f /usr/bin/javac | '
                               'sed "s:/bin/javac::")', host)
    proc.run()
    return proc.stdout.strip()
Exemple #20
0
def get_java_home(host):
    proc = SshProcess(
        'echo $(readlink -f /usr/bin/javac | '
        'sed "s:/bin/javac::")', host)
    proc.run()
    return proc.stdout.strip()
#                  walltime=walltime, job_type='deploy'),
#                  get_cluster_site(cluster))])
jobs = [(49083, get_cluster_site(cluster))]
if jobs[0][0]:
    wait_oar_job_start(jobs[0][0], jobs[0][1])
    logger.info('Job %s has started on %s, retrieving nodes', jobs[0][0],
                jobs[0][1])
    nodes = get_oar_job_nodes(jobs[0][0], jobs[0][1])
    logger.info('Deploying on node %s', nodes[0].address)
    deployed, undeployed = deploy(Deployment(nodes,
                                             env_name="wheezy-x64-base"))

    logger.info('Installing fio')
    cmd = 'export DEBIAN_MASTER=noninteractive ; apt-get update && apt-get ' + \
          'install -y --force-yes fio'
    install = SshProcess(cmd, nodes[0]).run()

    col_number = {
        'common': {
            'output_version': 1,
            'fio_version': 2,
            'error': 5
        },
        'read': {
            'runtime': 9,
            'bw': 7,
            'bw_min': 42,
            'bw_max': 43,
            'lat': 40,
            'lat_min': 38,
            'lat_max': 39,
Exemple #22
0
 def is_job_running(self, job_id=None):
     """ """
     get_state = SshProcess('qstat -f ' + str(job_id), jobserver)
     get_state.ignore_exit_code = True
     get_state.run()
     return get_state.ok
    def run(self):
        sweeper = self.create_paramsweeper()

        while True:
            comb = sweeper.get_next()
            if not comb:
                break
            comb_dir = self.result_dir + '/' + slugify(comb)
            if not os.path.isdir(comb_dir):
                os.mkdir(comb_dir)
            comb_file = comb_dir + '/trace'
            g5k_configuration['kadeploy3'] = comb['version']
            logger.info('Treating combination %s', pformat(comb))
            get_version = SshProcess(
                comb['version'] + ' -v',
                comb['site'],
                connection_params=default_frontend_connection_params).run()
            logger.info(get_version.stdout)

            resources = ""
            if comb['kavlan']:
                resources += "{type='kavlan'}/vlan=1+"
            resources += "nodes=" + str(comb['n_nodes'])
            sub = OarSubmission(resources=resources,
                                job_type='deploy',
                                walltime="0:30:00",
                                name='Kadeploy_Tests')
            logger.info('Performing submission of %s on site %s', resources,
                        comb['site'])
            jobs = oarsub([(sub, comb['site'])])

            if jobs[0][0]:
                try:
                    logger.info('Waiting for job to start')
                    wait_oar_job_start(jobs[0][0], jobs[0][1])
                    hosts = get_oar_job_nodes(jobs[0][0], jobs[0][1])
                    logger.info('Deployment of %s',
                                ' '.join([host.address for host in hosts]))
                    kavlan = get_oar_job_kavlan(jobs[0][0], jobs[0][1])
                    if kavlan:
                        logger.info('In kavlan %s', kavlan)
                    deployment = Deployment(hosts,
                                            env_name=comb['env'],
                                            vlan=kavlan)
                    deployed, undeployed = deploy(deployment,
                                                  stdout_handlers=[comb_file],
                                                  stderr_handlers=[comb_file])

                finally:
                    logger.info('Destroying job %s on %s', str(jobs[0][0]),
                                jobs[0][1])
                    oardel([(jobs[0][0], jobs[0][1])])
            else:
                deployed = []

            if len(undeployed) == 0:
                logger.info('%s is OK', slugify(comb))
            elif len(deployed) == 0:
                logger.error('%s is KO', slugify(comb))
            else:
                logger.warning('%s encountered problems with some hosts',
                               slugify(comb))

            sweeper.done(comb)
    def run(self):
        """Run the experiment"""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check if workloads exists (Suppose that the same NFS mount point
        # is present on the remote and the local environment
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {'workload_filename': workloads}
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # Due to previous (using -c result_dir) run skip some combination
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))

        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(str(
            self.sweeper.get_remaining())))

        if reservation_job_id is not None:
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(job_id,
                                   site,
                                   prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of given node in the '
                                       'reservation ({}) do not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn("NOT deployed nodes: {}".format(
                        str(undeployed)))
                    raise RuntimeError('Deployement failed')

                if not already_configured:

                    # install OAR
                    install_cmd = "apt-get update; apt-get install -y "
                    node_packages = "oar-node"
                    logger.info("installing OAR nodes: {}".format(
                        str(nodes[1:])))
                    install_oar_nodes = Remote(
                        install_cmd + node_packages,
                        nodes[1:],
                        connection_params={'user': '******'})
                    install_oar_nodes.start()

                    server_packages = (
                        "oar-server oar-server-pgsql oar-user "
                        "oar-user-pgsql postgresql python3-pip "
                        "libjson-perl postgresql-server-dev-all")
                    install_oar_sched_cmd = """
                    mkdir -p /opt/oar_sched; \
                    cd /opt/oar_sched; \
                    git clone https://github.com/oar-team/oar3.git; \
                    cd oar3; \
                    git checkout dce942bebc2; \
                    pip3 install -e .; \
                    cd /usr/lib/oar/schedulers; \
                    ln -s /usr/local/bin/kamelot; \
                    pip3 install psycopg2
                    """
                    logger.info("installing OAR server node: {}".format(
                        str(nodes[0])))
                    install_master = SshProcess(
                        install_cmd + server_packages + ";" +
                        install_oar_sched_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    install_master.run()
                    install_oar_nodes.wait()

                    if not install_master.ok:
                        Report(install_master)

                    configure_oar_cmd = """
                    sed -i \
                        -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                        -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                        -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                        -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                        -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                        -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                        -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                        -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                        /etc/oar/oar.conf
                    """.format(result_dir=self.result_dir)
                    configure_oar = Remote(configure_oar_cmd,
                                           nodes,
                                           connection_params={'user': '******'})
                    configure_oar.run()
                    logger.info("OAR is configured on all nodes")

                    # Configure server
                    create_db = "oar-database --create --db-is-local"
                    config_oar_sched = (
                        "oarnotify --remove-queue default;"
                        "oarnotify --add-queue default,1,kamelot")
                    start_oar = "systemctl start oar-server.service"
                    logger.info("configuring OAR database: {}".format(
                        str(nodes[0])))
                    config_master = SshProcess(
                        create_db + ";" + config_oar_sched + ";" + start_oar,
                        nodes[0],
                        connection_params={'user': '******'})
                    config_master.run()

                    # propagate SSH keys
                    logger.info("configuring OAR SSH")
                    oar_key = "/tmp/.ssh"
                    Process('rm -rf ' + oar_key).run()
                    Process(
                        'scp -o BatchMode=yes -o PasswordAuthentication=no '
                        '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                        '-o ConnectTimeout=20 -rp -o User=root ' +
                        nodes[0].address + ":/var/lib/oar/.ssh"
                        ' ' + oar_key).run()
                    # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                    Put(nodes[1:], [oar_key],
                        "/var/lib/oar/",
                        connection_params={
                            'user': '******'
                        }).run()
                    add_resources_cmd = """
                    oarproperty -a cpu || true; \
                    oarproperty -a core || true; \
                    oarproperty -c -a host || true; \
                    oarproperty -a mem || true; \
                    """
                    for node in nodes[1:]:
                        add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(
                            node=node.address)

                    add_resources = SshProcess(
                        add_resources_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    add_resources.run()

                    if add_resources.ok:
                        logger.info("oar is now configured!")
                    else:
                        raise RuntimeError(
                            "error in the OAR configuration: Abort!")

                # TODO backup de la config de OAR

                # Do the replay
                logger.info('begining the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(
                        combi['workload_filename'])
                    oar_replay = SshProcess(
                        script_path + "/oar_replay.py " +
                        combi['workload_filename'] + " " + self.result_dir +
                        "  oar_gant_" + workload_file, nodes[0])
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")

            except:
                traceback.print_exc()
                ipdb.set_trace()

            finally:
                if is_a_test:
                    ipdb.set_trace()
                if reservation_job_id is None:
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
    def run(self):
        """Run the experiment"""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check if workloads exists (Suppose that the same NFS mount point
        # is present on the remote and the local environment
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {
            'workload_filename': workloads
        }
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # Due to previous (using -c result_dir) run skip some combination
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))

        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(
            str(self.sweeper.get_remaining())))

        if reservation_job_id is not None:
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(
                    job_id, site, prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of given node in the '
                                       'reservation ({}) do not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn(
                        "NOT deployed nodes: {}".format(str(undeployed)))
                    raise RuntimeError('Deployement failed')

                if not already_configured:

                    # install OAR
                    install_cmd = "apt-get update; apt-get install -y "
                    node_packages = "oar-node"
                    logger.info(
                        "installing OAR nodes: {}".format(str(nodes[1:])))
                    install_oar_nodes = Remote(
                        install_cmd + node_packages,
                        nodes[1:],
                        connection_params={'user': '******'})
                    install_oar_nodes.start()

                    server_packages = ("oar-server oar-server-pgsql oar-user "
                                       "oar-user-pgsql postgresql python3-pip "
                                       "libjson-perl postgresql-server-dev-all")
                    install_oar_sched_cmd = """
                    mkdir -p /opt/oar_sched; \
                    cd /opt/oar_sched; \
                    git clone https://github.com/oar-team/oar3.git; \
                    cd oar3; \
                    git checkout dce942bebc2; \
                    pip3 install -e .; \
                    cd /usr/lib/oar/schedulers; \
                    ln -s /usr/local/bin/kamelot; \
                    pip3 install psycopg2
                    """
                    logger.info("installing OAR server node: {}".format(str(nodes[0])))
                    install_master = SshProcess(install_cmd + server_packages +
                                                ";" + install_oar_sched_cmd, nodes[0],
                                                connection_params={'user': '******'})
                    install_master.run()
                    install_oar_nodes.wait()

                    if not install_master.ok:
                        Report(install_master)

                    configure_oar_cmd = """
                    sed -i \
                        -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                        -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                        -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                        -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                        -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                        -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                        -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                        -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                        /etc/oar/oar.conf
                    """.format(result_dir=self.result_dir)
                    configure_oar = Remote(configure_oar_cmd, nodes,
                                           connection_params={'user': '******'})
                    configure_oar.run()
                    logger.info("OAR is configured on all nodes")

                    # Configure server
                    create_db = "oar-database --create --db-is-local"
                    config_oar_sched = ("oarnotify --remove-queue default;"
                                        "oarnotify --add-queue default,1,kamelot")
                    start_oar = "systemctl start oar-server.service"
                    logger.info(
                        "configuring OAR database: {}".format(str(nodes[0])))
                    config_master = SshProcess(create_db + ";" + config_oar_sched + ";" + start_oar,
                                               nodes[0],
                                               connection_params={'user': '******'})
                    config_master.run()

                    # propagate SSH keys
                    logger.info("configuring OAR SSH")
                    oar_key = "/tmp/.ssh"
                    Process('rm -rf ' + oar_key).run()
                    Process('scp -o BatchMode=yes -o PasswordAuthentication=no '
                            '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                            '-o ConnectTimeout=20 -rp -o User=root ' +
                            nodes[0].address + ":/var/lib/oar/.ssh"
                            ' ' + oar_key).run()
                    # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                    Put(nodes[1:], [oar_key], "/var/lib/oar/", connection_params={'user': '******'}).run()
                    add_resources_cmd = """
                    oarproperty -a cpu || true; \
                    oarproperty -a core || true; \
                    oarproperty -c -a host || true; \
                    oarproperty -a mem || true; \
                    """
                    for node in nodes[1:]:
                        add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(node=node.address)

                    add_resources = SshProcess(add_resources_cmd, nodes[0],
                                               connection_params={'user': '******'})
                    add_resources.run()

                    if add_resources.ok:
                        logger.info("oar is now configured!")
                    else:
                        raise RuntimeError("error in the OAR configuration: Abort!")

                # TODO backup de la config de OAR

                # Do the replay
                logger.info('begining the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(combi['workload_filename'])
                    oar_replay = SshProcess(script_path + "/oar_replay.py " +
                                            combi['workload_filename'] + " " +
                                            self.result_dir + "  oar_gant_" +
                                            workload_file,
                                            nodes[0])
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")

            except:
                traceback.print_exc()
                ipdb.set_trace()

            finally:
                if is_a_test:
                    ipdb.set_trace()
                if reservation_job_id is None:
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
Exemple #26
0
    def start(self):
        """Start MongoDB server."""

        self._check_initialization()

        logger.info("Starting MongoDB")

        if self.running:
            logger.warn("MongoDB was already started")
            return

        # Start nodes
        procs = []
        for h in self.hosts:
            mongo_command = (NUMA_PREFIX + " " +
                             self.bin_dir + "/mongod "
                             " --fork "
                             " --config " + os.path.join(self.conf_dir,
                                                         CONF_FILE) +
                             " --bind_ip " + h.address +
                             " --port " + str(self.md_port))

            logger.debug(mongo_command)

            proc = SshProcess(mongo_command, h)
            proc.start()
            procs.append(proc)

        finished_ok = True
        for p in procs:
            p.wait()
            if not p.finished_ok:
                finished_ok = False

        if not finished_ok:
            logger.warn("Error while starting MongoDB")
            return
        else:
            self.running = True

        # Start replication
        if self.do_replication:
            logger.info("Configuring replication")
            mongo_command = "rs.initiate();"
            mongo_command += ';'.join(
                'rs.add("' + h.address + ':' + str(self.md_port) + '")'
                for h in self.hosts)

            logger.debug(mongo_command)

            proc = TaktukRemote(self.bin_dir + "/mongo "
                                "--eval '" + mongo_command + "' " +
                                self.master.address,
                                [self.master])
            proc.run()

            if not proc.finished_ok:
                logger.warn("Not able to start replication")

        if self.do_sharding:
            if not self.initialized_sharding:
                logger.info("Configuring sharding")
                time.sleep(2)
                mongo_command = (
                    'rs.initiate({'
                    '_id : "%s",'
                    'configsvr : true,'
                    'members : [%s]})' % (
                        self.rs_name,
                        ",".join('{ _id : %d, host : "%s:%d" }' %
                                 (_id, h.address, self.md_port)
                                 for (_id, h) in enumerate(self.hosts))
                    )
                )

                logger.debug(mongo_command)

                proc = SshProcess(self.bin_dir + "/mongo " +
                                  "--eval '" + mongo_command + "' " +
                                  self.master.address,
                                  self.master)
                proc.run()
                if proc.finished_ok:
                    self.initialized_sharding = True
                else:
                    logger.warn("Not able to configure sharding")

            logger.info("Starting sharding servers")
            mongo_command = (
                NUMA_PREFIX + " " +
                self.bin_dir + "/mongos"
                " --configdb " + self.rs_name + "/" +
                ",".join('%s:%d' % (h.address, self.md_port)
                         for h in self.hosts) +
                " --bind_ip " + self.master.address +
                " --port " + str(self.ms_port) +
                " --fork"
                " --logpath " + self.logs_dir + "/mongos.log"
                " --pidfilepath " + self.mongos_pid_file
            )

            logger.debug(mongo_command)

            start_ms = TaktukRemote(mongo_command, [self.master])
            start_ms.run()