Ejemplo n.º 1
0
def check_data(simfolder, host):
    '''
    Performs a check on existing data, ask for confirmation to overwrite it.

    :param string simfolder: relative path to simfolder
    :param string host: host nr
    :returns: True if no data found or if data can be overwritten, False otherwise
    '''
    conf = utils.get_main_conf(simfolder)
    hosts = utils.num_hosts(simfolder)
    remote_conf = utils.get_host_conf(simfolder)

    hostname = remote_conf.get("host%i" % host, "name")
    ssh_client = _get_ssh_client(remote_conf, host)
    if ssh_client:
        # first look if simulation directory exists yet
        host_path = remote_conf.get("host%i" % host, "path")
        sim_dir = utils.make_simdir_name(simfolder)
        host_path_dirs = ssh(ssh_client, 'cd %s; ls' % host_path)
        if not sim_dir in host_path_dirs:
            return True
        # if so, look in simulation folder for 'data' dir
        path = '%s/%s' % (host_path, sim_dir)
        dirs = ssh(ssh_client, 'cd %s/%s; ls' % (path, simfolder))
        if 'data' in [
                d for d in dirs.split('\n')
                if not d.startswith('[Nicessa]') and not d == ''
        ]:
            # check if 'data' dir has (non-hidden) content
            data = ssh(ssh_client, 'cd %s/%s/data; ls' % (path, simfolder))
            if len([f for f in data.split('\n') if not f == '' and not f.startswith('.')\
                                                   and not f.startswith('[Nicessa]')]) > 0:
                print '[Nicessa] On host %s, I found older log data (in %s/data). Remove? [y/N]' % (
                    hostname, path)
                if not raw_input().lower() == 'y':
                    return False
                else:
                    return True
            else:
                return True
        else:
            return True
    else:
        print "[Nicessa] Cannot make connection to host %d" % host
        return False
    return True
Ejemplo n.º 2
0
def kill_screens(simfolder):
    '''
    Kill all screens that currently run the main simulation or the specified set of simulations.

    :param string simfolder: relative path to simfolder
    '''
    conf = utils.get_main_conf(simfolder)
    hosts = utils.num_hosts(simfolder)
    remote_conf = utils.get_host_conf(simfolder)
    working_cpus_per_host = utils.working_cpus_per_host(simfolder)
    sims = ','.join(utils.get_subsimulation_names(conf))

    print '[Nicessa] Kill all screens running simulations (%s)? [y/N]' % sims
    if not raw_input().lower() == 'y':
        print '[Nicessa] I did nothing.'
        return

    print "[Nicessa] Killing screens on hosts: "
    sys.stdout.flush()
    for host in xrange(1, hosts + 1):
        if working_cpus_per_host[host] > 0:
            hostname = remote_conf.get("host%i" % host, "name")
            path = '%s/%s' % (remote_conf.get(
                "host%i" % host, "path"), utils.make_simdir_name(simfolder))
            print "%s (host-nr:%d, cpus:%d)  " % (hostname, host,
                                                  working_cpus_per_host[host]),
            sys.stdout.flush()
            ssh_client = _get_ssh_client(remote_conf, host)
            if ssh_client:
                killed = ssh(ssh_client, "ps -ef | grep '%s' | awk '{print $2}' | xargs kill -9"\
                                  % utils.make_simdir_name(simfolder),
                                  ignore=['usage: kill', 'kill ', 'No Sockets found', 'No such process'])
                time.sleep(1)
                ssh(ssh_client, 'screen -wipe')
                ssh(ssh_client, 'cd %s; rm -r *' % path)
                sys.stdout.flush()
    print '[Nicessa] Done.'
Ejemplo n.º 3
0
def run_remotely(simfolder, conf):
    '''
    Run the simulation on remote hosts. Reads parameterisation and remote setup from the conf-directory. 

    :param string simfolder: relative path to simfolder
    :param ConfigParser conf: main config
    :returns: True if successful, False otherwise
    '''
    folder = simfolder
    remote_conf = utils.get_host_conf(simfolder)
    num_hosts = utils.num_hosts(simfolder)
    working_cpus_per_host = utils.working_cpus_per_host(folder)
    ssh_clients = {}  # we login twice to each if it has work

    if not folder == ".":
        os.chdir(folder)

    logdir = 'deploy-logs'
    if not os.path.exists(logdir):
        os.mkdir(logdir)
    else:
        for logf in os.listdir(logdir):
            os.remove('%s/%s' % (logdir, logf))

    print "[Nicessa] Preparing hosts ..."
    for host in [
            h for h in xrange(1, num_hosts + 1) if working_cpus_per_host[h] > 0
    ]:
        ssh_clients[host] = _get_ssh_client(remote_conf, host)
        if ssh_clients[host] is None:
            print "[Nicessa] Cannot connect to host %d. Aborting ...  " % host
            return False
        if not remote_conf.has_section("host%i" % host):
            print "[Nicessa] Host %d is not configured!" % host
            return False

        path = '%s/%s' % (remote_conf.get(
            "host%i" % host, "path"), utils.make_simdir_name(simfolder))
        # ------------- clean host (we want to be sure to use fresh code)
        # do this on all hosts before anything is run (e.g. they could operate on the same home dir)
        if not check_data(simfolder, host):
            print '[Nicessa] Aborting.'
            return False
        cleaning = "mkdir -p %s/%s;" % (path, folder)
        togo = "data conf nicessa.conf %s bgscreen screener.py starter.py _nicessa_bundle.tar.gz" \
                % (conf.get('control', 'executable'))
        if remote_conf.has_option('code', 'files'):
            for f in [
                    f for f in remote_conf.get('code', 'files').split(',')
                    if f is not ""
            ]:
                togo += " %s/%s " % (simfolder, f.strip())
        if remote_conf.has_option('code', 'folders'):
            for f in [
                    f for f in remote_conf.get('code', 'folders').split(',')
                    if f is not ""
            ]:
                togo += " %s/%s " % (simfolder, f.strip())
        cleaning += "cd %s/%s; rm -r %s;" % (path, folder, togo)
        # make fresh dirs to config and log screens
        cleaning += 'mkdir -p screenrcs; rm -r screenrcs/*; mkdir -p screenlogs; rm -r screenlogs/*;'
        # clean old states, too - never know how the last run was finished (e.g. Ctrl-C)
        cleaning += clean_states(simfolder, host)
        ssh(ssh_clients[host], cleaning)

    used_hosts = 0
    for host in [
            h for h in xrange(1, num_hosts + 1) if working_cpus_per_host[h] > 0
    ]:
        # -------------  initialize each host
        # don't proceed if a host doesn't have work to do (TODO: maybe also don't clean before?)
        host_has_work = False
        if osp.exists("%s/conf/%d" % (folder, host)):
            host_has_work = True
            used_hosts += 1
        if not host_has_work:
            continue

        # ------------- start screen(s) on all of them
        # let him run the batch for this host in a background screen (for each simulation we might have)
        # TODO: remote works only with subsimulations right now? We should fix that
        screening = ""
        for cpu in xrange(1, working_cpus_per_host[host] + 1):
            if osp.exists("%s/conf/%d/%d" % (simfolder, host, cpu)):
                screen_name = utils.make_screen_name(simfolder, host, cpu)
                nice_level = utils.get_nice_level(simfolder, host)
                screening += "rm finished_%s; ./screener.py %s 'nice -n %d ./starter.py . %i %i; touch finished_%s;exit;'; " \
                    % (screen_name, screen_name, nice_level, host, cpu, screen_name)

        #  --- local file shuffling
        # all host-side screen calls go in a script file, so screener.py can quietly make sure they all start without me waiting
        cmd = open("cmd_%d" % host, 'w')
        cmd.write(screening)
        cmd.flush()
        cmd.close()
        Popen("chmod +x cmd_%d;" % host, shell=True).wait()
        needed = " cmd_%d" % host

        # send everything needed of the simulation to run batches to the host in one go
        needed += " conf"
        needed += " nicessa.conf"
        # user should specify to copy this in remote.conf, since  control->executable can
        # contain any command, not only a filename
        #needed += " %s" % (conf.get('control', 'executable'))
        if remote_conf.has_option('code', 'files'):
            for f in [
                    f for f in remote_conf.get('code', 'files').split(',')
                    if f is not ""
            ]:
                needed += " %s" % f
        if remote_conf.has_option('code', 'folders'):
            for f in [
                    f for f in remote_conf.get('code', 'folders').split(',')
                    if f is not ""
            ]:
                needed += " %s" % f
        # also some nicessa files
        pth = osp.join(osp.dirname(osp.abspath(__file__)))
        copied_here = []
        for filename in ['bgscreen', 'screener.py', 'starter.py']:
            Popen("cp %s/%s ." % (pth, filename), shell=True).wait()
            copied_here.append(filename)
            needed += " %s" % filename.split('/')[-1:][0]

        # put all we need in a tar.gz archive
        Popen("tar -cf _nicessa_bundle.tar %s; gzip -f _nicessa_bundle.tar;" %
              (needed),
              shell=True).wait()

        # ------------ here we actually connect and do all these things online
        path = '%s/%s' % (remote_conf.get(
            "host%i" % host, "path"), utils.make_simdir_name(simfolder))
        if ssh_clients[host] is None:
            return False
        try:
            print "[Nicessa] Running code on %s" % remote_conf.get(
                "host%i" % host, "name")
            scp_client = scp.SCPClient(ssh_clients[host]._transport)
            scp_client.put("_nicessa_bundle.tar.gz",
                           remote_path="%s/%s" % (path, folder))
            time.sleep(2)
            initializing = "cd %s/%s; tar -zxf _nicessa_bundle.tar.gz;" % (
                path, folder)
            log = open('%s/log%d' % (logdir, host), 'w')
            log.write(
                ssh(ssh_clients[host],
                    "%s ./cmd_%d; rm cmd_%d;" % (initializing, host, host)))
            log.flush()
            log.close()
        except scp.SCPException, e:
            print e
        ssh_clients[host].close()

        # ------------ clean locally
        os.remove("_nicessa_bundle.tar.gz")
        os.remove("cmd_%d" % (host))
        for c in copied_here:
            os.remove("%s" % c)
Ejemplo n.º 4
0
def get_results(simfolder, do_wait=True):
    '''
    Copy result logs from the remote host(s) if they are all available for the whole job.

    :param string simfolder: relative path to simfolder
    :param boolean do_wait: True if regular checks should be done until all data is available (default is True)
    '''
    print '*' * 80
    print "[Nicessa] Looking for results ... "

    remote_conf = utils.get_host_conf(simfolder)
    hosts_done = dict.fromkeys(xrange(1,
                                      utils.num_hosts(simfolder) + 1), False)
    working_cpus_per_host = utils.working_cpus_per_host(simfolder)
    for host in hosts_done.keys():
        working_cpus_per_host[host] = 0
        if os.path.exists('%s/conf/%d' % (simfolder, host)):
            working_cpus_per_host[host] = len(
                os.listdir('%s/conf/%d' % (simfolder, host)))
    all_done = False
    if remote_conf.has_option('communication', 'wait-for'):
        if do_wait:
            waiting = remote_conf.getint('communication', 'wait-for')
            print "[Nicessa] waiting for %d seconds ... " % waiting
            time.sleep(waiting)
    if remote_conf.has_option('communication', 'check-every'):
        check_interval = remote_conf.getint('communication', 'check-every')
    else:
        check_interval = 10
    first_time_done = False

    if remote_conf.has_option('communication', 'shared-home'):
        if remote_conf.getboolean('communication', 'shared-home'):
            # check only the first who ran CPUs
            for host in hosts_done.keys():
                if working_cpus_per_host[host] > 0:
                    hosts_done = {host: False}
                    break
    while not all_done:
        for host in hosts_done.keys():
            if working_cpus_per_host[host] == 0:
                hosts_done[host] = True
            if not hosts_done[host]:
                hostname = remote_conf.get("host%i" % host, "name")
                if first_time_done:
                    print ".",
                ssh_client = _get_ssh_client(remote_conf, host)
                if ssh_client:
                    try:
                        path = '%s/%s' % (remote_conf.get(
                            "host%i" % host,
                            "path"), utils.make_simdir_name(simfolder))
                        # check for status by looking for the marker files this host should generate
                        res = ssh(ssh_client,
                                  'cd %s/%s; ls' % (path, simfolder))
                        # TODO: this is no good when we get the results on a different computer than we started from
                        #relevant_subsims = [subsim for subsim in utils.get_subsimulation_names(conf) if osp.exists("%s/conf/%s/%s" % (simfolder, subsim, host))]
                        if 'data' in res and reduce(lambda x, y: x and y, \
                                             map(res.__contains__, ["finished_%s" % utils.make_screen_name(simfolder, host, cpu) for cpu in xrange(1, working_cpus_per_host[host]+1)])):
                            scp_client = scp.SCPClient(ssh_client._transport)
                            try:
                                print "[Nicessa] contacting %s - compressing ... " % hostname,
                                sys.stdout.flush()
                                ssh(
                                    ssh_client,
                                    'cd %s/%s; tar -cf data_%d.tar data/*; gzip -f data_%d.tar;'
                                    % (path, simfolder, host, host))
                                time.sleep(2)
                                print "copying ...",
                                sys.stdout.flush()
                                scp_client.get("%s/%s/data_%d.tar.gz" %
                                               (path, simfolder, host),
                                               local_path='%s' % simfolder)
                                os.chdir(simfolder)
                                Popen(
                                    "tar -zxf data_%d.tar.gz; rm data_%d.tar.gz"
                                    % (host, host),
                                    shell=True).wait()
                                if not simfolder == ".":
                                    for sf in simfolder.split("/"):
                                        if sf != '':
                                            os.chdir('..')
                            except OSError, e:
                                print e
                            hosts_done[host] = True
                            print "done."
                            ssh(
                                ssh_client, 'cd %s/%s; %s' %
                                (path, simfolder, clean_states(
                                    simfolder, host)))
                    except Exception, e:
                        print e
                    ssh_client.close()
                else:
                    print "cannot connect to %s " % host
                    pass  # keep on trying
                    #hosts_done[host] = True # can't connect, so don't keep on trying
        print "_",
        sys.stdout.flush()
        # all done now?
        all_done = True
        for host_done in hosts_done.values():
            all_done = all_done and host_done
        if not first_time_done and not all_done:
            print "[Nicessa] now checking every %d seconds ... " % check_interval
            first_time_done = True
        if not all_done:
            time.sleep(check_interval)
Ejemplo n.º 5
0
def check_states(simfolder):
    '''
    Performs a check on the status of the simulations.
    For this, it looks at the marker files a job creates when it is done and the names
    of currently running screens.
    It prints out the contents of the first search as finished and the second as still running.
    If no jobs are finished or running, it prints a message.

    :param string simfolder: relative path to simfolder
    :returns: True if successful, False otherwise
    '''
    conf = utils.get_main_conf(simfolder)
    hosts = utils.num_hosts(simfolder)
    working_cpus_per_host = utils.working_cpus_per_host(simfolder)
    finished = {}
    running = {}
    for host in [
            h for h in xrange(1, hosts + 1) if working_cpus_per_host[h] > 0
    ]:
        finished[host] = []
        running[host] = []
    remote_conf = utils.get_host_conf(simfolder)
    found_jobs = 0

    # TODO: if shared-home is set to 1, we could save us some time (only connect
    # to one host), but here, the development time does not justify the reward,
    # I think.
    print "[Nicessa] Checking hosts: ",
    sys.stdout.flush()
    for host in xrange(1, hosts + 1):
        if working_cpus_per_host[host] > 0:
            hostname = remote_conf.get("host%i" % host, "name")
            print "%s (host-nr:%d, cpus:%d)  " % (hostname, host,
                                                  working_cpus_per_host[host]),
            sys.stdout.flush()
            ssh_client = _get_ssh_client(remote_conf, host)
            if ssh_client:
                path = '%s/%s' % (remote_conf.get("host%i" % host, "path"),
                                  utils.make_simdir_name(simfolder))
                fin = ssh(ssh_client,
                          'cd %s/%s; ls finished_*;' % (path, simfolder),
                          ignore=['No such file'])
                run = ssh(ssh_client, 'screen -ls;')
                for cpu in xrange(1, working_cpus_per_host[host] + 1):
                    screen_name = utils.make_screen_name(simfolder, host, cpu)
                    if "finished_%s" % screen_name in fin:
                        finished[host].append(cpu)
                        found_jobs += 1
                    if screen_name in run:
                        running[host].append(cpu)
                        found_jobs += 1
            else:
                print "[Nicessa] Cannot make connection to host %d" % host
    print
    if found_jobs == 0:
        print '[Nicessa] Could not find any running or finished jobs for this simulation.\n\
                Maybe I am checking for the wrong set of simulations? In that case, please use the "--simulations" option together with "--check" (or "--results").'

    else:
        print "[Nicessa] Finished cpus:"
        for host in finished.keys():
            print " %.27s:\t%s" % (remote_conf.get(
                "host%i" % host, "name").ljust(24), str(finished[host]))
        print "[Nicessa] Still running cpus:"
        for host in running.keys():
            print " %.27s:\t%s" % (remote_conf.get(
                "host%i" % host, "name").ljust(24), str(running[host]))
    return True