Example #1
0
 def run_example(self):
     """Run this example end to end for ``self.project``.

     Prepares the project's working directory and queues, recreates an
     empty ``logfiles`` directory inside ``self.ex_dir``, and then runs,
     one after the other: the job-creation script, the recruiter (hiring
     ``self.num_workers`` workers) and the dispatcher (started with
     ``--end_when_jobs_are_done``).
     """
     project = self.project
     ensure_wdir(project=project)
     empty_queues(project=project)
     os.chdir(self.ex_dir)

     # always start from a fresh, empty log directory
     log_dir = 'logfiles'
     if os.path.exists(log_dir):
         rmtree(log_dir)
     os.mkdir(log_dir)

     create_cmd = './create_jobs.py {}'.format(project)
     call(create_cmd, shell=True)

     hire_cmd = 'fjd-recruiter --project {} hire {}'.format(
         project, self.num_workers)
     call(hire_cmd, shell=True)

     dispatch_cmd = 'fjd-dispatcher --project {} --end_when_jobs_are_done'.format(
         project)
     call(dispatch_cmd, shell=True)
Example #2
0
    def __init__(self, num_workers=1, project=None, local_only=False, curdir=''):
        """Set up the recruiter and work out which hosts it manages.

        :param num_workers: number of workers for the local host entry
                            (converted with ``int``).
        :param project: project name; any falsy value means ``'default'``.
        :param local_only: if true, ignore ``remote.conf`` even if it exists.
        :param curdir: stored unchanged on ``self.curdir``.

        Without a ``remote.conf`` in the working directory (or with
        ``local_only``), ``self.hosts`` holds a single ``localhost`` entry.
        Otherwise it is filled from the consecutive sections ``host1``,
        ``host2``, ... of that file; each needs a ``name`` and a ``workers``
        option, and sections lacking one of them are reported and skipped.
        """
        project = project or 'default'
        self.project = project
        self.curdir = curdir
        self.wdir = ensure_wdir(project)

        # default: everything runs on this machine
        self.hosts = [{'name': 'localhost', 'workers': int(num_workers)}]
        rc_loc = "{}/remote.conf".format(self.wdir)
        if not osp.exists(rc_loc) or local_only:
            return

        # a remote configuration replaces the local default entirely
        self.hosts = []
        remote_conf = configparser.ConfigParser()
        remote_conf.read(rc_loc)
        host_no = 1
        while True:
            hid = 'host{}'.format(host_no)
            if not remote_conf.has_section(hid):
                break  # sections are numbered consecutively; first gap ends the list
            complete = (remote_conf.has_option(hid, 'name')
                        and remote_conf.has_option(hid, 'workers'))
            if complete:
                self.hosts.append({
                    'name': remote_conf.get(hid, "name"),
                    'workers': remote_conf.getint(hid, "workers"),
                })
            else:
                print("[fjd-recruiter] Host section for {} is missing"
                      " name or workers option!".format(hid))
            host_no += 1

        if debug:
            host_names = [host['name'] for host in self.hosts]
            print("[fjd-recruiter] I am configured with hosts {}."
                  .format(','.join(host_names)))
Example #3
0
    def __init__(self, interval=.1, project=None):
        """Start a worker: announce it, then poll for jobs forever.

        :param interval: seconds to sleep between two polls for a job.
        :param project: project name; any falsy value means ``'default'``.

        The worker announces itself by touching
        ``<wdir>/workerqueue/<id>.worker``. Whenever
        ``self.next_job_on_pod()`` returns a job name, the file
        ``<wdir>/jobpod/<job>`` is executed (with ``nice -n 9``), then
        removed, and the worker announces itself again.

        This constructor does not return; it loops until the process ends.
        """
        if not project:
            project = 'default'
        self.wdir = ensure_wdir(project)
        self.start_up()

        # announce my presence
        self.id = self.mk_id()
        print('[fjd-worker] Started with ID {id}.'.format(id=self.id))
        subprocess.call('touch {wdir}/workerqueue/{id}.worker'\
                   .format(wdir=self.wdir, id=self.id), shell=True)

        # look for jobs
        while True:
            job = self.next_job_on_pod()
            if job:
                print('[fjd-worker] Worker {}: I found a job.'.format(self.id))
                # Check if job file is a config file (ini-style).
                # If it is, get executable from there, call it and pass it the config file.
                # If it is not, run the job file as a script.

                # We read the file first and close it, so no stale handle exists
                # in case the config parser raises.
                with open('{}/jobpod/{}'.format(self.wdir, job), 'r') as jobfile:
                    jobtxt = jobfile.read()
                # Compare the numeric version, not the version string
                # (string comparison misorders e.g. '10' and '3').
                if sys.version_info[0] < 3:
                    jobtxt = unicode(jobtxt)  # noqa: F821 (Python 2 only)
                ini_fp = io.StringIO(jobtxt)
                conf = configparser.RawConfigParser()
                # readfp() was removed in Python 3.12; read_file() is its
                # replacement. Fall back to readfp() on parsers lacking it.
                read_ini = getattr(conf, 'read_file', None) or conf.readfp
                try:
                    read_ini(ini_fp)  # this raises in case it is not an .ini file
                    exe = conf.get('fjd', 'executable')
                    cmd = 'nice -n {nice} {exe} {wdir}/jobpod/{job}; '\
                          .format(nice=9, exe=exe, wdir=self.wdir, job=job)
                except configparser.MissingSectionHeaderError:
                    # Not ini-style: treat the job file itself as the script.
                    # Only this error is caught; a missing [fjd] section or
                    # executable option in an ini file still propagates.
                    cmd = 'nice -n {nice} {wdir}/jobpod/{job}'.format(nice=9, wdir=self.wdir, job=job)
                subprocess.call(cmd, shell=True)
                print('[fjd-worker] Worker {}: Finished my job.'.format(self.id))
                # remove the job from pod (signaling it is done) + re-announce myself
                subprocess.call('rm {wdir}/jobpod/{job}; touch {wdir}/workerqueue/{id}.worker'\
                        .format(wdir=self.wdir, job=job, id=self.id), shell=True)
            time.sleep(interval)
Example #4
0
File: main.py Project: nhoening/fjd
    def __init__(self, exe, repeat=1, parameters=[], project=None, num_workers=0,
                callback=None, curdir=''):
        """Create job scripts for ``exe``, hire workers and start a dispatcher.

        :param exe: the command each job runs; the process exits with
                    status 2 if it is empty.
        :param repeat: how many identical jobs to create when no
                       parameters are given.
        :param parameters: list of parameter sets, one job per set. The
                           values of one set are separated by ``#``. The
                           n-th value replaces the placeholder ``$n`` in
                           ``exe`` if present; otherwise it is appended to
                           the command line. The list is never mutated.
        :param project: project name, passed on to the queue helpers, the
                        recruiter and the dispatcher.
        :param num_workers: number of workers to hire; 0 means "all CPUs
                            but one" (at least one). Capped at the CPU count.
        :param callback: passed on to the dispatcher.
        :param curdir: passed on to the recruiter.

        Only one of ``repeat`` (> 1) and ``parameters`` may be used; the
        process exits with status 2 otherwise.
        """
        if not exe:
            print('[fjd] Please specify an executable command (--exe).')
            sys.exit(2)
        parameters = list(parameters) if parameters else []
        if repeat > 1 and parameters:
            print('[fjd] Only one of --repeat and --parameters can be set at a time.')
            sys.exit(2)
        empty_queues(project=project)
        self.wdir = ensure_wdir(project)

        # Work out the command line of every job first ...
        if parameters:
            # A single parameter set counts, too (consistent with the
            # validation above, which rejects any non-empty list).
            commands = []
            for p in parameters:
                cur_exe = exe
                ext_params = []
                for j, param in enumerate(str(p).split('#'), start=1):
                    placeholder = '${}'.format(j)
                    if placeholder in cur_exe:
                        cur_exe = cur_exe.replace(placeholder, param)
                    else:
                        # no placeholder for this value: pass just this
                        # value (not the whole set) as an extra argument
                        ext_params.append(param)
                commands.append('{exe} {params}'.format(
                    exe=cur_exe, params=' '.join(ext_params)))
        else:
            commands = [exe] * repeat

        # ... then write one executable bash script per job into the queue.
        for i, command in enumerate(commands):
            job = '{}/jobqueue/job{}'.format(self.wdir, i)
            with open(job, 'w') as f:
                f.write('#!/bin/bash\n')
                f.write(command)
            os.chmod(job, 0o777)

        if num_workers == 0:
            # leave one CPU free, but never end up with zero workers
            num_workers = max(1, cpu_count() - 1)
        num_workers = min(num_workers, cpu_count())
        recruiter = Recruiter(num_workers=num_workers, project=project,
                              curdir=curdir)
        recruiter.hire()
        Dispatcher(project=project, callback=callback)
Example #5
0
    def __init__(self, interval=.1, project=None, end_when_jobs_are_done=True,
                 callback=None, status_only=False):
        """Dispatch queued jobs to free workers until all work is done.

        :param interval: seconds to sleep before each look at the queues.
        :param project: project name; any falsy value means ``'default'``.
        :param end_when_jobs_are_done: if true, fire the workers and stop
                                       once the job queue and the job pod
                                       are both empty.
        :param callback: run after the workers were fired; either a
                         callable (called without arguments) or a string
                         (run as a shell command).
        :param status_only: if true, print the current status once and
                            do not move any jobs.

        A SIGINT handler is installed that asks whether to fire all workers
        of the project and then exits with status 0.
        """
        if not project:
            project = 'default'
        self.wdir = ensure_wdir(project)
        self.start_up()

        if not status_only:
            print('[fjd-dispatcher] Started on project "{}".'.format(project))

        # The first parameter is named signum so that it does not shadow
        # the signal module; handlers are called positionally.
        def signal_handler(signum, frame):
            ''' gently exiting, e.g. when CTRL-C was pressed.  '''
            sys.stdout.write('\n[fjd-dispatcher] Received Exit signal. Exiting ...\n')
            print('[fjd-dispatcher] Should I fire all workers in project {}? [y|N]'\
                        .format(project))
            if input().lower() in ["y", "yes"]:
                Recruiter(project=project).fire()
            sys.exit(0)
        signal.signal(signal.SIGINT, signal_handler)

        do_work = True
        while do_work:
            time.sleep(interval)
            if status_only:  # just show info once, don't do anything else
                do_work = False
            jq = os.listdir('{}/jobqueue'.format(self.wdir))
            jp = os.listdir('{}/jobpod'.format(self.wdir))
            wq = os.listdir('{}/workerqueue'.format(self.wdir))
            self.sort_jobqueue(jq)
            num_workers = len(os.listdir('{}/screenrcs'.format(self.wdir)))
            if len(jq) > 0:  # more jobs waiting for workers
                sys.stdout.write("\r[fjd-dispatcher] {} job(s) waiting in the queue."\
                                 " Currently {} worker(s) out of {} are free ...                   "\
                       .format(len(jq), len(wq), num_workers))
                sys.stdout.flush()
                if not status_only:
                    for _ in range(min(len(jq), len(wq))):
                        worker = wq.pop()
                        job = jq.pop()
                        # Hand over the job: move it into the pod under the
                        # worker's queue-file name, then take the worker out
                        # of the queue.
                        os.rename('{wdir}/jobqueue/{j}'.format(wdir=self.wdir, j=job),
                                '{wdir}/jobpod/{w}'.format(wdir=self.wdir, w=worker))
                        os.remove('{wdir}/workerqueue/{w}'.format(wdir=self.wdir, w=worker))
            elif len(jp) > 0:  # some jobs are still running
                sys.stdout.write("\r[fjd-dispatcher] Job queue is empty. Waiting for remaining {} job(s) to finish ...                  ".format(len(jp)))
                sys.stdout.flush()
            else:  # all jobs are done
                sys.stdout.write("\r[fjd-dispatcher] Job queue is empty and all jobs have finished.                                     ")
                sys.stdout.flush()
                if end_when_jobs_are_done:
                    sys.stdout.write("\n")
                    Recruiter(project=project).fire()
                    do_work = False
                    if callback:
                        # callable() also accepts bound methods, partials and
                        # builtins, which a FunctionType check would reject.
                        if callable(callback):
                            callback()
                        elif isinstance(callback, str):
                            subprocess.call(callback, shell=True)
                        else:
                            print('[fjd-dispatcher] Cannot use callback function, as it is neither function nor string, but {}'.format(type(callback)))
            if status_only:
                print('')

        self.wrap_up()
Example #6
0
I make parameter configurations from four shuffled lists and let one job run 1000 parameter configurations (otherwise, the job queue becomes too large and it takes too long for fjd to regularly inspect it).
'''

import sys
import os
import itertools
import numpy as np
import random
from subprocess import call
from fjd import Dispatcher
from fjd.utils import ensure_wdir, empty_queues

# Clean up: remove PBS job files and logs of earlier runs, then make sure
# the 'brute' project has a working directory and empty queues.
for cleanup_cmd in ('rm pbsjobs/pbsjob*', 'rm brute*.log;'):
    call(cleanup_cmd, shell=True)
for prepare in (ensure_wdir, empty_queues):
    prepare(project='brute')

# start 80 workers on 10 PBS nodes (8 on each)
for node in range(1, 11, 1):
    pbsjob = '''# Shell for the job:
#PBS -S /bin/bash
# request 1 node, 8 cores
#PBS -lnodes=1:cores8
# job requires at most n hours wallclock time
#PBS -lwalltime=08:00:00

cd /home/nicolas/brute
fjd-recruiter --project brute hire 8
python -c "import time; time.sleep(16*60*60)"  # keep PBS job alive
'''