Example #1
    def _initialize_conf(self):
        """Merge locally-specified configuration files with default files
        from the distribution"""

        action = Remote(
            "cp " + os.path.join(self.conf_dir, MR_CONF_FILE + ".template ") +
            os.path.join(self.conf_dir, MR_CONF_FILE), self.hosts)
        action.run()

        super(HadoopV2Cluster, self)._initialize_conf()
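All of these examples follow the same execo pattern: build an action (Remote, Put, Get, SshProcess) over a list of hosts, call run(), then inspect the action's .ok flag or its per-host processes. A minimal sketch of that pattern, with placeholder host names:

from execo import Remote

hosts = ['node-1.example.org', 'node-2.example.org']  # placeholder hosts
action = Remote('uname -a', hosts)  # build the remote command
action.run()                        # execute it on every host and wait
if not action.ok:                   # .ok is True only if every process succeeded
    for p in action.processes:      # per-host results
        if not p.ok:
            print('failed on ' + p.host.address)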
Example #2
    def _initialize_conf(self):
        """Merge locally-specified configuration files with default files
        from the distribution"""

        action = Remote("cp " + os.path.join(self.conf_dir,
                                             MR_CONF_FILE + ".template ") +
                        os.path.join(self.conf_dir, MR_CONF_FILE),
                        self.hosts)
        action.run()

        super(HadoopV2Cluster, self)._initialize_conf()
Example #3
def run_bench(output_folder, node):
    # debian part
    logger.info("Starting debian benchs...")
    debian_folder = create_subdir(output_folder, "debian")
    debian_bench_command = "../../tools/benchs.sh \"commands/omp-tasks/debian-omp-tasks\" {} 10 \"1 2 4 8 16\"".format(debian_folder)
    debian_bench = Remote('cd ./unikernel-tools/benchs/bots && {}'.format(debian_bench_command), node).run()

    # hermitux part
    logger.info("Starting hermitux benchs...")
    hermitux_folder = create_subdir(output_folder, "hermitux")
    hermitux_bench_command = "../../tools/benchs.sh \"commands/omp-tasks/hermitux-omp-tasks\" {} 10 \"1 2 4 8 16\"".format(hermitux_folder)
    hermitux_bench = Remote('cd ./unikernel-tools/benchs/bots && {}'.format(hermitux_bench_command), node).run()
Example #4
    def bootstrap(self, tar_file):
        """Install Hadoop in all cluster nodes from the specified tar.gz file.

        Args:
          hadoop_tar_file (str):
            The file containing Hadoop binaries.
        """

        if super(HadoopV2Cluster, self).bootstrap(tar_file):
            action = Remote(
                "cp " +
                os.path.join(self.conf_dir, MR_CONF_FILE + ".template ") +
                os.path.join(self.conf_dir, MR_CONF_FILE), self.hosts)
            action.run()
Example #5
    def bootstrap(self, tar_file):
        """Install Hadoop in all cluster nodes from the specified tar.gz file.

        Args:
          hadoop_tar_file (str):
            The file containing Hadoop binaries.
        """

        if super(HadoopV2Cluster, self).bootstrap(tar_file):
            action = Remote("cp " + os.path.join(self.conf_dir,
                                                 MR_CONF_FILE + ".template ") +
                            os.path.join(self.conf_dir, MR_CONF_FILE),
                            self.hosts)
            action.run()
Example #6
    def workflow(self, comb, host, comb_dir):
        """Run one parameter combination on a host and retrieve its results."""
        comb_ok = False
        thread_name = style.Thread(host.split('.')[0]) + ': '
        logger.info(thread_name + 'Starting combination ' + slugify(comb))

        try:
            logger.info(thread_name + 'Generate conf file')
            param_str = self.create_string(comb)

            Remote(
                "python /home/Work/sgcbntier/paasage_demo/xml_gen_execo.py --cb "
                + param_str, [host]).run()

            logger.info(thread_name + 'Run code')
            Remote(
                "cd /home/Work/sgcbntier/paasage_demo/ ; python run_all_execo.py --cb %s"
                % param_str, [host]).run()

            logger.info(thread_name + 'Get results')

            traceFile = "ntier_" + param_str
            get_results = Get([host],
                              ["/home/Work/sgcbntier/paasage_demo/csv/REQTASK_" +
                               traceFile + ".csv"],
                              local_location=comb_dir).run()

            for p in get_results.processes:
                if not p.ok:
                    logger.error(
                        host +
                        ': Unable to retrieve the files for combination %s',
                        slugify(comb))
                    exit()

            comb_ok = True
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(thread_name + slugify(comb) + ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(thread_name + slugify(comb) + ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
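Several of the longer examples (#6, #19, #22, #23) drive their combinations through execo_engine's ParamSweeper, whose get_next/done/cancel calls appear above. A minimal sketch of that loop, with a placeholder parameter space and a hypothetical do_one_combination helper:

from execo_engine import ParamSweeper, sweep

parameters = {'size': ['A', 'B'], 'n_core': [4, 8]}    # placeholder space
sweeper = ParamSweeper('./sweeps', sweep(parameters))  # persistent state dir
while len(sweeper.get_remaining()) > 0:
    comb = sweeper.get_next()
    try:
        ok = do_one_combination(comb)  # hypothetical worker returning a bool
    except Exception:
        ok = False
    if ok:
        sweeper.done(comb)    # mark the combination as finished
    else:
        sweeper.cancel(comb)  # requeue it for a later retry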
Example #7
def check_if_file_exists(file_name, nodes, connection_params):
    """
    Check if a file exists on a set of nodes and return the set of nodes
    that do NOT have the file.
    :param file_name: path of the file to test on each node
    :param nodes: the nodes on which to run the test
    :param connection_params: execo connection parameters
    :return: the set of addresses of the nodes where the file is missing
    """
    r = Remote(cmd="test -e " + file_name,
               hosts=nodes,
               connection_params=connection_params,
               process_args={"nolog_exit_code": True}
               )
    r.run()
    not_ok = [p for p in r.processes if not p.ok]
    return set(p.host.address for p in not_ok)
Example #8
 def upload_to_all_g5k_sites(self):
     sites = get_g5k_sites()
     sitesg5k = [s + ".g5k" for s in sites]  # add the .g5k suffix to be able to ssh to the frontend
     # delete any existing spark-bench directories
     Remote("rm -rf $HOME/spark-bench", sitesg5k,
            connection_params={'user': g5k_configuration.get("g5k_user")}).run()
     # upload the source code to all sites
     Put(sitesg5k, ["spark/spark-bench"], "~/",
         connection_params={'user': g5k_configuration.get("g5k_user")}).run()
Example #9
 def load_workload(self, from_node, workload, recordcount, threadcount,
                   fieldlength):
     """
     Run a workload from the core workloads with the CassandraDB db wrapper
     :param from_node: from which node you want to run the workload. This is a set variable
     :param workload: the type of workload e.g. workloada, workloadb, workloadc ...
     :param recordcount: the total number of records to insert
     :param insertcount: the number of records to insert with this execution
     :param insertstart: from where to start inserting. Useful if we have different clients and each one will
     insert in different ranges e.g.[client1:0-10000,client2:10000-20000,...]
     :param threadcount: the number of threads for each client. Increase this to increase the load on the system
     """
     # we transform the set to a str with the format 'node1,node2,node3...'
     cassandra_nodes_str = ','.join(list(self.cassandra_nodes))
     # divide the number of records equally among clients
     insertcount = int(recordcount) // len(from_node)
     insertstart_values = [i * insertcount for i in range(len(from_node))]
     # Load the data into the Cassandra database; the {{...}} pattern in the
     # command below is an execo substitution that gives each client its own
     # insertstart value.
     Remote(
         "ycsb-0.12.0/bin/ycsb.sh load cassandra-cql -P ycsb-0.12.0/workloads/"
         + workload + " -p hosts=" + cassandra_nodes_str +
         " -p recordcount=" + recordcount +
         " -p insertstart={{[x for x in insertstart_values]}} -p insertcount="
         + str(insertcount) + " -p threadcount=" + threadcount +
         " -p fieldlength=" + str(fieldlength),
         hosts=from_node,
         connection_params=self.execo_conn_params,
         process_args={
             'stdout_handlers': [sys.stdout],
             'stderr_handlers': [sys.stderr]
         }).run(timeout=900)
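The insertstart/insertcount arithmetic above simply splits the key space evenly across the client nodes. A standalone sketch with concrete placeholder numbers:

# With recordcount=20000 and two clients, each client inserts 10000
# records, starting at offsets 0 and 10000 respectively.
recordcount = 20000                                 # placeholder total
clients = ['client-1', 'client-2']                  # placeholder nodes
insertcount = recordcount // len(clients)           # 10000 records per client
insertstart_values = [i * insertcount for i in range(len(clients))]  # [0, 10000]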
Example #10
 def run_workload(self, iteration, res_dir, from_node, workload,
                  recordcount, threadcount, fieldlength, target):
     """
     Run a given workload. It receives an iteration parameter to be able to
     repeat the workload several times.
     :param iteration: the iteration number, appended to the name of the output file
     :param res_dir: the directory where the export file is written
     :param from_node: from which nodes should we execute the benchmark
     :param workload: the type of workload
     :param recordcount: the total number of records
     :param threadcount: the number of threads
     :param fieldlength: the length of each field
     :param target: the target number of operations per second (not used in the command below)
     """
     # we transform the set to a str with the format 'node1,node2,node3...'
     cassandra_nodes_str = ','.join(list(self.cassandra_nodes))
     # We run the workload
     Remote(
         "ycsb-0.12.0/bin/ycsb.sh run cassandra-cql -P ycsb-0.12.0/workloads/"
         + workload + " -p hosts=" + cassandra_nodes_str +
         " -p recordcount=" + recordcount + " -p threadcount=" +
         str(threadcount) + " -p fieldlength=" + str(fieldlength) +
         " -p exportfile=" + res_dir + "/output_" + workload +
         "_{{{host}}}_it" + str(iteration),
         hosts=from_node,
         connection_params=self.execo_conn_params,
         process_args={
             'stdout_handlers': [sys.stdout],
             'stderr_handlers': [sys.stderr]
         }).run(timeout=900)
Example #11
 def submit(self, class_in_jar, class_params, jar, master, submit_conf,
            scheduler_options):
     """
     :param class_in_jar: the class we want to launch inside the jar
     :param class_params: the parameters expected by that class
     :param jar: the jar where the class is bundled
     :param master: the master that is going to take care of launching the app ("yarn", "spark://192.168.0.1", etc.)
     :param submit_conf: a list of tuples with the form [["spark.executor.memory","2g"],["spark.executor.cores","1"]]
     :param scheduler_options: options that are only applicable to that resource manager, e.g. Mesos tags, YARN labels...
     """
     if master is None:
         master = self.default_master
     if scheduler_options is None:
         scheduler_options = ""
     scheduler_str = self.generate_scheduler_options(scheduler_options)
     conf_str = self.generate_conf(submit_conf)
     cmd = "{0} --class {1} --master {2} --deploy-mode client {3} {4} {5} {6}".format(
         self.root_to_spark_submit, class_in_jar, master, conf_str, jar,
         " ".join(class_params), scheduler_str)
     Remote(cmd,
            hosts=self.master_node,
            connection_params={
                'user': g5k_configuration.get('g5k_user')
            },
            process_args={
                'stdout_handlers': [sys.stdout],
                'stderr_handlers': [sys.stderr]
            }).run()
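A hedged usage sketch for this submit method; the instance, class, jar and parameter values below are all hypothetical:

# `cluster` stands for an instance of the class that defines submit().
cluster.submit(class_in_jar='org.apache.spark.examples.SparkPi',
               class_params=['100'],
               jar='examples/jars/spark-examples.jar',
               master='yarn',
               submit_conf=[['spark.executor.memory', '2g'],
                            ['spark.executor.cores', '1']],
               scheduler_options=None)  # treated as "" by submit()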
Example #12
def reboot_hosts(hosts, timeout=300):
    """ """
    reboot = Remote('reboot', hosts).run()
    if not reboot.ok:
        return False
    wait_hosts_down(hosts, timeout)
    wait_hosts_up(hosts, timeout)
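wait_hosts_down and wait_hosts_up are helpers not shown on this page; a minimal polling sketch of wait_hosts_up using only execo's Remote (the real helpers may well differ):

import time
from execo import Remote

def wait_hosts_up(hosts, timeout=300):
    # Retry a trivial remote command until every host answers or the
    # timeout expires; this is only a sketch of the helper used above.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if Remote('true', hosts).run().ok:
            return True
        time.sleep(10)
    return False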
Example #13
    def start(self):
        logger.info('Put benchmarking files on hosts.')
        file_path = os.path.join(os.path.dirname(__file__), '../resources/unixbench-5.1.3.tgz')
        bench_copy = Put(list(self.bench_list.values()), [file_path], "/tmp/").run()

        logger.info('Start benchmarking on ' + str(len(self.bench_list)) + ' hosts.')
        bench_install = Remote('cd /tmp/ && '
                               'tar xvfz unixbench-5.1.3.tgz && '
                               'cd unixbench-5.1.3/ && '
                               './Run arithmetic && '
                               'cd ../ && '
                               'rm -rf unixbench-5.1.3/ && '
                               'rm -rf unixbench-5.1.3.tgz',
                               list(self.bench_list.values()))

        for p in bench_install.processes:
            # append ':port' only when the host has an explicit port
            host = p.host.address + ((':' + str(p.host.port)) if p.host.port is not None else "")
            p.stdout_handlers.append(end_forwarder_stdout_handler(host, self.callback))

        bench_install.start()
Example #14
def add_vms(vms, server):
    """Generate the list of virtual machines """
    logger.debug('Adding the VM in /etc/hosts ...')
    fd, vms_list = mkstemp(dir='/tmp/', prefix='vms_')
    f = fdopen(fd, 'w')
    f.write('\n' + '\n'.join([vm['ip'] + ' \t ' + vm['id'] for vm in vms]))
    f.close()
    Put([server], [vms_list], remote_location='/etc/').run()
    # restore /etc/hosts from its backup if one exists, otherwise create the backup
    SshProcess(
        '[ -f /etc/hosts.bak ] && cp /etc/hosts.bak /etc/hosts || ' +
        ' cp /etc/hosts /etc/hosts.bak', server).run()
    Remote('cat /etc/' + vms_list.split('/')[-1] + ' >> /etc/hosts',
           [server]).run()
    Process('rm ' + vms_list).run()
Example #15
def install_docker(nodes):
    docker_deb = "https://download.docker.com/linux/debian/dists/jessie/pool/stable/amd64/docker-ce_17.03.1~ce-0~debian-jessie_amd64.deb"
    Remote("wget {0} -O {1}/docker-ce.deb 2>&1".format(docker_deb,
                                                       wget_destination),
           hosts=nodes,
           connection_params={
               'user': '******'
           }).run()  # download the Debian package for Docker
    Remote("dpkg -i {0}/docker-ce.deb".format(wget_destination),
           hosts=nodes,
           connection_params={
               'user': '******'
           }).run()  # install the package
    Remote("apt-get -fy install",
           hosts=nodes,
           connection_params={
               'user': '******'
           }).run()  # pull in any missing dependencies
    Remote("docker run hello-world",
           hosts=nodes,
           connection_params={
               'user': '******'
           }).run()  # verify the installation
Example #16
 def __init__(self, install_nodes, execo_conn_params):
     """
     This is the parent class for all the YCSB benchmarks. It needs as parameters the nodes where we want to install
     it and the execo_conn_params (e.g. vagrantg5k, g5k_user, etc...)
     :type install_nodes: set
     :type execo_conn_params: dict
     """
     self.install_nodes = install_nodes
     self.execo_conn_params = execo_conn_params
     # before downloading anything, check if the file already exists in the home directory
     not_exist = check_if_file_exists("ycsb-0.12.0.tar.gz", install_nodes,
                                      self.execo_conn_params)  # a set
     if len(not_exist) != 0:
         print("Downloading YCSB from GitHub for nodes: {0}".format(not_exist))
         Remote(
             "curl -O --location https://github.com/brianfrankcooper/YCSB/releases/download/0.12.0/ycsb-0.12.0.tar.gz",
             hosts=not_exist,
             connection_params=execo_conn_params).run()
     print("Untar ycsb-0.12.0.tar.gz")
     Remote("tar xfvz ycsb-0.12.0.tar.gz",
            hosts=install_nodes,
            connection_params=execo_conn_params).run()
     print("YCSB installed in nodes: {0}".format(install_nodes))
     print("Remember to prepare the DB before running any workload. More info on https://github.com/brianfrankcooper/YCSB")
Example #17
 def prepare_bench(self):
     """bench configuration and compilation, copy binaries to frontends

     return True if preparation is ok
     """
     logger.info("preparation: configure and compile benchmark")
     # the involved sites. We will do the compilation on the first of these.
     sites = list(set(map(get_cluster_site, self.parameters['cluster'])))
     # generate the bench compilation configuration
     bench_list = '\n'.join([ 'lu\t%s\t%s' % (size, n_core)
                              for n_core in self.parameters['n_core']
                              for size in self.parameters['size'] ])
     # Reserving a node because compiling on the frontend is forbidden
     # and because we need mpif77
     jobs = oarsub([(OarSubmission(resources = "nodes=1",
                                   job_type = 'allow_classic_ssh',
                                   walltime ='0:10:00'), sites[0])])
     if jobs[0][0]:
         try:
             logger.info("copying bench archive to %s" % (sites[0],))
             copy_bench = Put([sites[0]], ['NPB3.3-MPI.tar.bz2']).run()
             logger.info("extracting bench archive on %s" % (sites[0],))
             extract_bench = Remote('tar -xjf NPB3.3-MPI.tar.bz2', [sites[0]]).run()
             logger.info("waiting job start %s" % (jobs[0],))
             wait_oar_job_start(*jobs[0], prediction_callback = pred_cb)
             logger.info("getting nodes of %s" % (jobs[0],))
             nodes = get_oar_job_nodes(*jobs[0])
             logger.info("configure bench compilation")
             conf_bench = Remote('echo "%s" > ~/NPB3.3-MPI/config/suite.def' % bench_list, nodes).run()
             logger.info("compil bench")
             compilation = Remote('cd NPB3.3-MPI && make clean && make suite', nodes).run()
             logger.info("compil finished")
         except:
             logger.error("unable to compile bench")
             return False
         finally:
             oardel(jobs)
     # Copying binaries to all other frontends
     frontends = sites[1:]
     rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI',
                    [get_host_site(nodes[0])] * len(frontends))
     rsync.run()
     return compilation.ok and rsync.ok
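The {{frontends}} token in the rsync command above relies on execo's substitution mechanism: a {{...}} expression is evaluated in the caller's namespace and, when it yields a list, each of the action's processes receives the element matching its index. A hedged sketch of the same idiom with placeholder names:

# One rsync process per entry of `frontends`, all started from source_host.
frontends = ['frontend-a', 'frontend-b']  # placeholder frontends
source_host = 'node-0.example.org'        # placeholder source host
rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI',
               [source_host] * len(frontends))
rsync.run()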
Example #18
def setup_node(node):
    clone_bench = Remote('git clone https://github.com/p-jacquot/unikernel-tools', node).run()
    compile_bench = Remote('cd ./unikernel-tools/benchs/bots && make debian && make hermitux', node).run()
    return clone_bench.ok and compile_bench.ok
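Examples #3, #18, #20 and #21 come from the same unikernel-tools benchmark driver; a hedged sketch of how the pieces chain together (obtaining node and output_folder is elided):

# Hypothetical driver combining Example #18 with Example #3.
if setup_node(node):                # clone and compile the benchmarks
    run_bench(output_folder, node)  # then run the debian and hermitux benchs
else:
    logger.error('benchmark setup failed on %s', node)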
Example #19
    def run_xp(self):
        """Iterate over the parameters and execute the bench"""
        master = self.cluster[0]
        opt = ''
        while len(self.sweeper.get_remaining()) > 0:
            # Take sweeper
            comb = self.sweeper.get_next()

            logger.info('Processing new combination %s' % (comb, ))

            try:
                # metrics from the Linux sar tool, selected by wall-clock time window
                def takeMetric(
                        path,
                        startTime,
                        endTime,
                        metric=['cpu', 'mem', 'disk', 'swap', 'network']):
                    opt = ''
                    cmd_template_sar = (
                        "sar -f /var/log/sysstat/sa* -{opt} -s {startTime} -e {endTime}"
                    )
                    for met in metric:
                        if met == 'cpu':
                            opt = 'u'
                        elif met == 'mem':
                            opt = 'r'
                        elif met == 'disk':
                            opt = 'dp'
                        elif met == 'swap':
                            opt = 'S'
                        elif met == 'network':
                            opt = 'n DEV'

                        cmd = cmd_template_sar.format(opt=opt,
                                                      startTime=startTime,
                                                      endTime=endTime)
                        for host in self.cluster:
                            hE = SshProcess(cmd,
                                            host,
                                            connection_params={'user': '******'})
                            hE.run()
                            stdMetric = host + '-' + met + '.txt'
                            with open(os.path.join(path, stdMetric),
                                      "w") as sout:
                                sout.write(hE.stdout)

                # Set CPU frequency and governor according to the current combination
                cmd_template_Freq_Policy = ("cpufreq-set -r  -g {policy}")
                cmd_template_Freq = ("cpufreq-set -r -f {freq}")

                if comb['Freq'] == 'OnDemand':
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='ondemand')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={
                               'user': '******'
                           }).run()
                elif comb['Freq'] == 'conservative':
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='conservative')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={
                               'user': '******'
                           }).run()
                else:
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='userspace')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={
                               'user': '******'
                           }).run()
                    cmd_freq = cmd_template_Freq.format(freq=comb['Freq'])
                    Remote(cmd_freq,
                           master,
                           connection_params={
                               'user': '******'
                           }).run()

                # build command
                src = 'source /opt/intel-performance-snapshoot/apsvars.sh'
                cmd_mpirun_template = (
                    "mpirun {opt} -f /root/cluster.txt -np {pr1} aps -r '/tmp/log/' /tmp/NPB/npb-mpi/bin/{typeNPB}.{NPBclass}.{pr2}"
                )
                cmd_mpirun = cmd_mpirun_template.format(
                    opt='',
                    pr1=comb['n_core'],
                    typeNPB=comb['Benchmark'],
                    NPBclass=comb['NPBclass'],
                    pr2=comb['n_core'])
                cmd = "{}; /tmp/NPB/bin/runMPI.sh '{}' '{}'".format(
                    src, cmd_mpirun, slugify(comb))

                curPath = self.result_dir + slugify(comb)

                # run Mpi through execo remote SshProcess
                def runMpi(cmd):
                    act = SshProcess(cmd,
                                     master,
                                     connection_params={'user': '******'},
                                     shell=True)
                    act.run()

                    if not os.path.exists(curPath):
                        os.makedirs(curPath)

                    with open(os.path.join(curPath, "stdout.txt"),
                              "a+") as sout, open(
                                  os.path.join(curPath, "stderr.txt"),
                                  "w") as serr:
                        sout.write(act.stdout)
                        serr.write(act.stderr)
                    return act.ok

                # start clock and exec command in the master node
                time.sleep(5)
                startUnix = int(time.time())
                start24Hour = datetime.datetime.fromtimestamp(
                    startUnix).strftime('%H:%M:%S')

                task1 = runMpi(cmd)

                endUnix = int(time.time())
                end24Hour = datetime.datetime.fromtimestamp(endUnix).strftime(
                    '%H:%M:%S')
                time.sleep(5)

                with open(os.path.join(curPath, "executionTime.txt"),
                          "w") as sout:
                    sout.write(
                        'ExecTime:{}\nStartDate:{}\nEndDate:{}\n'.format(
                            str(endUnix - startUnix), start24Hour, end24Hour))

                takeMetric(curPath, start24Hour, end24Hour,
                           ['cpu', 'mem', 'disk', 'swap', 'network'])

                # collect power from kwapi, the Grid'5000 infrastructure measurement tool
                for hostname in self.cluster:
                    powerOut = '{}_power'.format(hostname)
                    collect_metric(startUnix, endUnix, 'power', curPath,
                                   self.site, powerOut, hostname)

                st = '/tmp/out/' + slugify(comb)
                intelAppPerf = str(st + '.html')

                # get the data from ['Application Performance Snapshot', 'Storage Performance Snapshot']
                # https://software.intel.com/en-us/performance-snapshot
                Get(master, [intelAppPerf],
                    curPath,
                    connection_params={
                        'user': '******'
                    }).run()
                if task1:
                    logger.info("comb ok: %s" % (comb, ))
                    self.sweeper.done(comb)
                    continue

            except OSError as err:
                print("OS error: {0}".format(err))
            except ValueError:
                print("Could not convert data to an integer.")
            except:
                print("Unexpected error:", sys.exc_info()[0])
                raise

            logger.info("comb NOT ok: %s" % (comb, ))
            self.sweeper.cancel(comb)
Example #20
def run_bench(output_folder, node):
    debian_bench_command = "./benchs.sh {} 10 \"1 2 4 8 16\"".format(output_folder)
    debian_bench = Remote('cd ./unikernel-tools/benchs/rodinias && {}'.format(debian_bench_command), node).run()
    return debian_bench.ok
Example #21
def setup_node(node):
    clone_bench = Remote('git clone https://github.com/p-jacquot/unikernel-tools', node).run()
    return clone_bench.ok
Example #22
    def run(self):
        """Make the reservation, deploy the hosts and run all the combinations."""
        if self.options.oargrid_job_id:
            self.oargrid_job_id = self.options.oargrid_job_id
        else:
            self.oargrid_job_id = None

        try:
            # Create the main iterator used for the first control loop.
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oargrid_job_id is None:
                    self.make_reservation()
                # Wait for the job to start
                logger.info('Waiting for the job to start')
                wait_oargrid_job_start(self.oargrid_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
                # Hosts deployment and configuration

                default_connection_params['user'] = '******'

                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')
                deployment = Deployment(
                    hosts=self.hosts,
                    env_file='/home/sirimie/env/mywheezy-x64-base.env')
                self.hosts, _ = deploy(deployment)

                Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*",
                       self.hosts).run()
                Remote(
                    "rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml",
                    self.hosts).run()
                Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml",
                       self.hosts).run()

                Put(self.hosts,
                    ["run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                     "platform_aws.xml", "cloud_ec2.xml"],
                    remote_location="/home/Work/sgcbntier/paasage_demo/").run()
                logger.info("Done")

                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = [
                    host for host in self.hosts for i in range(
                        get_host_attributes(host)['architecture']['smt_size'])
                ]

                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/csv_results'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]

                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    self.oargrid_job_id = None

        finally:
            if self.oargrid_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oargriddel([self.oargrid_job_id])
                else:
                    logger.info('Keeping job alive for debugging')
Example #23
    def workflow(self, comb, host, comb_dir):
        """ """
        comb_ok = False
        thread_name = style.Thread(str(host).split('.')[0]) + ': '
        logger.info(thread_name + 'Starting combination ' + slugify(comb))
        if 'parapluie' in str(host):
            nb_proc = 24
        elif 'paranoia' in str(host):
            nb_proc = 20
        elif 'parapide' in str(host):
            nb_proc = 8
        else:
            nb_proc = 16

        try:
            self.export = "source ~/aevol_binary/intel/linux/mkl/bin/mklvars.sh intel64; "

            bucketname = self.working_dir + '/raevol_5_mut_lat/' + slugify(
                comb) + '/'

            logger.info(thread_name + "Killing other RAevol")

            killa = Remote("killall -9 aevol_run", [host])
            for killp in killa.processes:
                killp.ignore_error = True
            killa.run()

            if os.path.isdir(bucketname) and os.path.exists(bucketname +
                                                            '/last_gener.txt'):
                logger.info(thread_name + "Resuming AEVOL from NFS backup")

                gen_file = open(bucketname + '/last_gener.txt', 'r')

                last_gen = gen_file.read()

                if int(last_gen) < 300000:
                    logger.info(thread_name + "Resuming AEVOL Run from " +
                                str(int(last_gen)))
                    rem = Remote(
                        self.export + 'cd ' + bucketname +
                        '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p '
                        + str(nb_proc) + ' -e 300000 -r ' + last_gen +
                        ' >> aevol_run.log', [host],
                        process_args={
                            'default_stdout_handler': False,
                            'default_stderr_handler': False
                        }).run()
                    if rem.ok:
                        comb_ok = True
                else:
                    comb_ok = True
            else:
                Remote('mkdir -p ' + bucketname, [host]).run()

                param_file = '/home/jorouzaudcornabas/aevol_binary/aevol/execo/mut_lat/param_tmpl.in'

                logger.info(thread_name + 'Generate config file ' + param_file)

                f_template = open(param_file)
                fd, outfile = mkstemp(dir='/tmp/',
                                      prefix=slugify(comb) + '_param')
                f = os.fdopen(fd, 'w')

                for line in f_template:
                    if 'CONFIGURE_ENVIRONMENT_VALUES' in line:
                        if comb['env'] == 'const':
                            line = line.replace('CONFIGURE_ENVIRONMENT_VALUES',
                                                'NB_ENVIRONMENTS 1')
                            f.write(line)
                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.2   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.4   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.6   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.8   0.05' +
                                    os.linesep)
                        elif comb['env'] == 'lat_3':
                            line = line.replace('CONFIGURE_ENVIRONMENT_VALUES',
                                                'NB_ENVIRONMENTS 2')
                            f.write(line)
                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.2   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.4   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.6   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.8   0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  2  0.5   0.2   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  2  0.5   0.4   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  2  0.5   0.65  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  2  0.5   0.8   0.05' +
                                    os.linesep)
                        elif comb['env'] == 'lat_all':
                            line = line.replace('CONFIGURE_ENVIRONMENT_VALUES',
                                                'NB_ENVIRONMENTS 16')
                            f.write(line)

                            #const

                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.2   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.4   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.6   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  1  0.5   0.8   0.05' +
                                    os.linesep)

                            # 1

                            f.write('ENV_ADD_GAUSSIAN  2  0.5   0.25  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  2  0.5   0.4   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  2  0.5   0.6   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  2  0.5   0.8   0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  3  0.5   0.2   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  3  0.5   0.45  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  3  0.5   0.6   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  3  0.5   0.8   0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  4  0.5   0.2   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  4  0.5   0.4   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  4  0.5   0.65  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  4  0.5   0.8   0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  5  0.5   0.2   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  5  0.5   0.4   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  5  0.5   0.6   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  5  0.5   0.85  0.05' +
                                    os.linesep)

                            # 2

                            f.write('ENV_ADD_GAUSSIAN  6  0.5   0.25  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  6  0.5   0.45  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  6  0.5   0.6   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  6  0.5   0.8   0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  7  0.5   0.25  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  7  0.5   0.4   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  7  0.5   0.65  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  7  0.5   0.8   0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  8  0.5   0.25  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  8  0.5   0.4   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  8  0.5   0.6   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  8  0.5   0.85  0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  9  0.5   0.2   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  9  0.5   0.45  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  9  0.5   0.65  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  9  0.5   0.8   0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  10  0.5   0.2   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  10  0.5   0.45  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  10  0.5   0.6   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  10  0.5   0.85  0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  11  0.5   0.2   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  11  0.5   0.4   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  11  0.5   0.65  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  11  0.5   0.85  0.05' +
                                    os.linesep)

                            # 3

                            f.write('ENV_ADD_GAUSSIAN  12  0.5   0.25  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  12  0.5   0.45  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  12  0.5   0.65  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  12  0.5   0.8   0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  13  0.5   0.25  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  13  0.5   0.45  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  13  0.5   0.6   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  13  0.5   0.85  0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  14  0.5   0.25  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  14  0.5   0.4   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  14  0.5   0.65  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  14  0.5   0.85  0.05' +
                                    os.linesep)

                            f.write('ENV_ADD_GAUSSIAN  15  0.5   0.2   0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  15  0.5   0.45  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  15  0.5   0.65  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  15  0.5   0.85  0.05' +
                                    os.linesep)

                            # 4

                            f.write('ENV_ADD_GAUSSIAN  16  0.5   0.25  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  16  0.5   0.45  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  16  0.5   0.65  0.05' +
                                    os.linesep)
                            f.write('ENV_ADD_GAUSSIAN  16  0.5   0.85  0.05' +
                                    os.linesep)
                    elif 'CONFIGURE_SIGNAL_VALUES' in line:
                        if comb['env'] == 'const':
                            line = line.replace('CONFIGURE_SIGNAL_VALUES', '')
                            f.write(line)

                        elif comb['env'] == 'lat_3':
                            line = line.replace(
                                'CONFIGURE_SIGNAL_VALUES',
                                'CREATE_SIGNAL h0 h0 h0 w0 m0 m1 m0 h1 h0 m0 h0 m1 h1 w0 h1 h0 m1 h1 m0 w0 w0 m0 w0 h0 h1 m1 w0 m0 m1 m0 w0 h1 h0 m0 h0 m1 h1 w0 h0 w0 m0 m1 m0 w0 h1 h0 w0 w0 h1'
                            )
                            f.write(line)

                            f.write('ENV_ADD_SIGNAL 2 1' + os.linesep)
                        elif comb['env'] == 'lat_all':
                            line = line.replace(
                                'CONFIGURE_SIGNAL_VALUES',
                                'CREATE_SIGNAL h0 w0 h1 m1 w0 h1 m0 h0 h1 w0 h0 m1 h1 h1 m1 m0 h0 w0 h1 m1 w0 h1 m0 h0 h1 w0 h0 m1 h1 h1 m1 m0 h1 m0 m1'
                            )
                            f.write(line)
                            f.write(
                                'CREATE_SIGNAL m0 h0 m1 h1 m1 w0 m0 m1 m0 h0 m1 h1 w0 h0 h0 h1 m1 m0 h1 w0 h1 h0 m1 h1 m0 w0 w0 m0 m1 w0 w0 h1 h0 w0 h1 h0 h0 m0 h0 w0 h0 m1 m0 w0 h1 w0 w0 h1 m0'
                                + os.linesep)
                            f.write(
                                'CREATE_SIGNAL h0 h0 h0 w0 m0 m1 m0 h1 h0 m0 h0 m1 h1 w0 h1 h0 m1 h1 m0 w0 w0 m0 w0 h0 h1 m1 w0 m0 m1 m0 w0 h1 h0 m0 h0 m1 h1 w0 h0 w0 m0 m1 m0 w0 h1 h0 w0 w0 h1'
                                + os.linesep)
                            f.write(
                                'CREATE_SIGNAL h1 h1 m0 w0 w0 h1 m1 h1 h1 m1 m0 w0 m1 m0 m0 w0 m0 h0 m0 h0 w0 h0 m0 h0 h1 m1 h0 h1 w0 h0 h1 m1 h1 m1 m0'
                                + os.linesep)

                            f.write('ENV_ADD_SIGNAL 2 1' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 3 2' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 4 3' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 5 4' + os.linesep)

                            f.write('ENV_ADD_SIGNAL 6 1' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 6 2' + os.linesep)

                            f.write('ENV_ADD_SIGNAL 7 1' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 7 3' + os.linesep)

                            f.write('ENV_ADD_SIGNAL 8 1' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 8 4' + os.linesep)

                            f.write('ENV_ADD_SIGNAL 9 2' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 9 3' + os.linesep)

                            f.write('ENV_ADD_SIGNAL 10 2' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 10 4' + os.linesep)

                            f.write('ENV_ADD_SIGNAL 11 3' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 11 4' + os.linesep)

                            f.write('ENV_ADD_SIGNAL 12 1' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 12 2' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 12 3' + os.linesep)

                            f.write('ENV_ADD_SIGNAL 13 1' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 13 2' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 13 4' + os.linesep)

                            f.write('ENV_ADD_SIGNAL 14 1' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 14 3' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 14 4' + os.linesep)

                            f.write('ENV_ADD_SIGNAL 15 2' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 15 3' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 15 4' + os.linesep)

                            f.write('ENV_ADD_SIGNAL 16 1' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 16 2' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 16 3' + os.linesep)
                            f.write('ENV_ADD_SIGNAL 16 4' + os.linesep)
                    else:
                        line = line.replace('SEED_NUMBER', str(comb['seed']))
                        line = line.replace('MUTATION_RATE_VALUE',
                                            comb['mutation'])
                        line = line.replace('SELECTION_PRESSURE',
                                            str(comb['selection']))
                        f.write(line)

                f_template.close()
                f.close()

                put_file = Put([host], [outfile],
                               remote_location=bucketname).run()
                if not put_file.ok:
                    exit()

                os.remove(outfile)

                Remote(
                    'cd ' + bucketname + '; cp ' + outfile.split('/')[-1] +
                    ' param.in; cp /home/jorouzaudcornabas/aevol_binary/aevol/execo/mut_lat/binding_matrix.rae .',
                    [host]).run()

                logger.info(thread_name + "Launching AEVOL Create")
                Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_create > aevol_create.log',
                    [host],
                    process_args={
                        'default_stdout_handler': False,
                        'default_stderr_handler': False
                    }).run()

                logger.info(thread_name + "Launching AEVOL Run")
                rem = Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p '
                    + str(nb_proc) + ' -n 300000 > aevol_run.log', [host],
                    process_args={
                        'default_stdout_handler': False,
                        'default_stderr_handler': False
                    }).run()
                if rem.ok:
                    comb_ok = True

            logger.info(thread_name + 'Get results ' + comb_dir + "/" +
                        slugify(comb))

        # try:
        #     os.mkdir(comb_dir + "/" + slugify(comb))
        # except:
        #     logger.warning(thread_name +
        #                    '%s already exists, removing existing files', comb_dir + "/" + slugify(comb))
        #     shutil.rmtree(comb_dir + "/" + slugify(comb))
        #     try:
        #         os.mkdir(comb_dir + "/" + slugify(comb))
        #     except:
        #         logger.warning(thread_name +
        #                        '%s already exists, recreating directory', comb_dir + "/" + slugify(comb))
        #
        # get_results = Get([host], [bucketname + "/aevol_create.log", bucketname + "/aevol_run.log", bucketname + '/stats/'],
        #                   local_location=comb_dir + "/" + slugify(comb)).run()
        #
        # for p in get_results.processes:
        #     if not p.ok:
        #         logger.error(thread_name +
        #                      ': Unable to retrieve the files for combination %s',
        #                      slugify(comb))
        #         exit()

        finally:
            if comb_ok:
                self.sweeper.done(comb)
                # shutil.rmtree(bucketname)
                logger.info(thread_name + slugify(comb) + ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(thread_name + slugify(comb) + ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
Example #24
    def workflow(self, comb, host, comb_dir):
        """Run one parameter combination on a host and retrieve its results."""
        comb_ok = False
        thread_name = style.Thread(str(host).split('.')[0]) + ': '
        logger.info(thread_name + 'Starting combination ' + slugify(comb))

        try:
            self.export = "source ~/aevol_binary/intel/linux/mkl/bin/mklvars.sh intel64; "

            bucketname = self.working_dir + '/raevol_5_mut_lat/' + slugify(
                comb) + '/'

            if os.path.isdir(bucketname) and os.path.exists(bucketname +
                                                            '/last_gener.txt'):
                logger.info(thread_name + "Resuming AEVOL from NFS backup")

                gen_file = open(bucketname + '/last_gener.txt', 'r')

                last_gen = gen_file.read()

                if int(last_gen) < 500000:
                    logger.info(thread_name + "Resuming AEVOL Run from " +
                                str(int(last_gen)))
                    rem = Remote(
                        self.export + 'cd ' + bucketname +
                        '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p 16'
                        + ' -e 300000 -r ' + last_gen + ' >> aevol_run.log',
                        [host]).run()
                    if rem.ok:
                        comb_ok = True
                else:
                    comb_ok = True
            else:
                Remote('mkdir -p ' + bucketname, [host]).run()

                param_file = '/home/jorouzaudcornabas/aevol_binary/execo/mut_lat/param_tmpl.in'

                logger.info(thread_name + 'Generate config file ' + param_file)

                f_template = open(param_file)
                fd, outfile = mkstemp(dir='/tmp/',
                                      prefix=slugify(comb) + '_param')
                f = os.fdopen(fd, 'w')

                for line in f_template:
                    line = line.replace('SEED_NUMBER', str(comb['seed']))
                    line = line.replace('FUZZY_VERSION', str(comb['fuzzy']))
                    if comb['move']:
                        line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.25')
                        line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.65')
                    else:
                        line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.2')
                        line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.6')
                    line = line.replace('GAUSSIAN_HEIGHT', str(comb['height']))
                    f.write(line)

                f_template.close()
                f.close()

                put_file = Put([host], [outfile],
                               remote_location=bucketname).run()
                if not put_file.ok:
                    exit()

                os.remove(outfile)

                Remote(
                    'cd ' + bucketname + '; cp ' + outfile.split('/')[-1] +
                    ' param.in', [host]).run()

                logger.info(thread_name + "Launching AEVOL Create")
                Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_create > aevol_create.log',
                    [host]).run()

                logger.info(thread_name + "Launching AEVOL Run")
                rem = Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_run -p 16 -n 500000 > aevol_run.log',
                    [host]).run()
                if rem.ok:
                    comb_ok = True

            logger.info(thread_name + 'Get results ' + comb_dir + "/" +
                        slugify(comb))

        # try:
        #     os.mkdir(comb_dir + "/" + slugify(comb))
        # except:
        #     logger.warning(thread_name +
        #                    '%s already exists, removing existing files', comb_dir + "/" + slugify(comb))
        #     shutil.rmtree(comb_dir + "/" + slugify(comb))
        #     try:
        #         os.mkdir(comb_dir + "/" + slugify(comb))
        #     except:
        #         logger.warning(thread_name +
        #                        '%s already exists, recreating directory', comb_dir + "/" + slugify(comb))
        #
        # get_results = Get([host], [bucketname + "/aevol_create.log", bucketname + "/aevol_run.log", bucketname + '/stats/'],
        #                   local_location=comb_dir + "/" + slugify(comb)).run()
        #
        # for p in get_results.processes:
        #     if not p.ok:
        #         logger.error(thread_name +
        #                      ': Unable to retrieve the files for combination %s',
        #                      slugify(comb))
        #         exit()

        finally:
            if comb_ok:
                self.sweeper.done(comb)
                # shutil.rmtree(bucketname)
                logger.info(thread_name + slugify(comb) + ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(thread_name + slugify(comb) + ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
Example #25
    def run(self):
        """Run the experiment"""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check that the workload files exist (suppose that the same NFS
        # mount point is present on the remote and the local environment)
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {
            'workload_filename': workloads
        }
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # a previous run (resumed with -c result_dir) may have already
        # skipped some combinations
        logger.info('Skipped parameters: {}'.format(
            str(self.sweeper.get_skipped())))

        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(
            str(self.sweeper.get_remaining())))

        if reservation_job_id is not None:
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(
                    job_id, site, prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes
                nodes = sorted(nodes, key=lambda node: node.address)
                # keep only the nodes needed for the experiment
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of nodes in the '
                                       'reservation ({}) does not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warning(
                        "NOT deployed nodes: {}".format(str(undeployed)))
                    raise RuntimeError('Deployment failed')

                if not already_configured:

                    # install OAR
                    install_cmd = "apt-get update; apt-get install -y "
                    node_packages = "oar-node"
                    logger.info(
                        "installing OAR nodes: {}".format(str(nodes[1:])))
                    install_oar_nodes = Remote(
                        install_cmd + node_packages,
                        nodes[1:],
                        connection_params={'user': '******'})
                    install_oar_nodes.start()
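                    # start() is non-blocking: the oar-node installs proceed
                    # in parallel with the server install below; wait() joins
                    # them once the server install has finished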

                    server_packages = ("oar-server oar-server-pgsql oar-user "
                                       "oar-user-pgsql postgresql python3-pip "
                                       "libjson-perl postgresql-server-dev-all")
                    install_oar_sched_cmd = """
                    mkdir -p /opt/oar_sched; \
                    cd /opt/oar_sched; \
                    git clone https://github.com/oar-team/oar3.git; \
                    cd oar3; \
                    git checkout dce942bebc2; \
                    pip3 install -e .; \
                    cd /usr/lib/oar/schedulers; \
                    ln -s /usr/local/bin/kamelot; \
                    pip3 install psycopg2
                    """
                    logger.info("installing OAR server node: {}".format(str(nodes[0])))
                    install_master = SshProcess(install_cmd + server_packages +
                                                ";" + install_oar_sched_cmd, nodes[0],
                                                connection_params={'user': '******'})
                    install_master.run()
                    install_oar_nodes.wait()

                    if not install_master.ok:
                        Report(install_master)

                    configure_oar_cmd = """
                    sed -i \
                        -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                        -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                        -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                        -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                        -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                        -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                        -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                        -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                        /etc/oar/oar.conf
                    """.format(result_dir=self.result_dir)
                    configure_oar = Remote(configure_oar_cmd, nodes,
                                           connection_params={'user': '******'})
                    configure_oar.run()
                    logger.info("OAR is configured on all nodes")

                    # Configure server
                    create_db = "oar-database --create --db-is-local"
                    config_oar_sched = ("oarnotify --remove-queue default;"
                                        "oarnotify --add-queue default,1,kamelot")
                    start_oar = "systemctl start oar-server.service"
                    logger.info(
                        "configuring OAR database: {}".format(str(nodes[0])))
                    config_master = SshProcess(create_db + ";" + config_oar_sched + ";" + start_oar,
                                               nodes[0],
                                               connection_params={'user': '******'})
                    config_master.run()

                    # propagate SSH keys
                    logger.info("configuring OAR SSH")
                    oar_key = "/tmp/.ssh"
                    Process('rm -rf ' + oar_key).run()
                    Process('scp -o BatchMode=yes -o PasswordAuthentication=no '
                            '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                            '-o ConnectTimeout=20 -rp -o User=root ' +
                            nodes[0].address + ":/var/lib/oar/.ssh"
                            ' ' + oar_key).run()
                    # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                    Put(nodes[1:], [oar_key], "/var/lib/oar/", connection_params={'user': '******'}).run()
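                    # the oar user's keys are fanned out from the server to
                    # every compute node so that passwordless inter-node
                    # connections (e.g. through oarsh) work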
                    add_resources_cmd = """
                    oarproperty -a cpu || true; \
                    oarproperty -a core || true; \
                    oarproperty -c -a host || true; \
                    oarproperty -a mem || true; \
                    """
                    for node in nodes[1:]:
                        add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(node=node.address)

                    add_resources = SshProcess(add_resources_cmd, nodes[0],
                                               connection_params={'user': '******'})
                    add_resources.run()

                    if add_resources.ok:
                        logger.info("oar is now configured!")
                    else:
                        raise RuntimeError("error in the OAR configuration: Abort!")

                # TODO: back up the OAR configuration

                # Do the replay
                logger.info('beginning the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(combi['workload_filename'])
                    oar_replay = SshProcess(script_path + "/oar_replay.py " +
                                            combi['workload_filename'] + " " +
                                            self.result_dir + "  oar_gant_" +
                                            workload_file,
                                            nodes[0])
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
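                    # appending a file name to stdout_handlers makes execo
                    # tee the remote stdout into that local file (one .out
                    # file per workload)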
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")

            except Exception:
                traceback.print_exc()
                ipdb.set_trace()

            finally:
                if is_a_test:
                    ipdb.set_trace()
                if reservation_job_id is None:
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
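

# A minimal sketch, not shown in this excerpt, of the prediction_callback
# passed to wait_oar_job_start above (assumes the module-level execo logger):
# execo invokes it with the predicted start timestamp of the OAR job, and
# logging that prediction is all this experiment needs.
import time

def prediction_callback(ts):
    # ts is assumed to be a unix timestamp of the predicted job start
    logger.info("job start prediction: %s", time.ctime(ts))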