Example #1
class Client():
    """DrQueue client actions"""
    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()

        # enable tracking
        self.lbview.track = True

        # list of all available query keys
        self.all_task_query_keys = [
            'msg_id', 'header', 'content', 'buffers', 'submitted',
            'client_uuid', 'engine_uuid', 'started', 'completed',
            'resubmitted', 'result_header', 'result_content', 'result_buffers',
            'queue', 'pyin', 'pyout', 'pyerr', 'stdout', 'stderr'
        ]


    def job_run(self, job):
        """Create and queue tasks from job object"""

        # job_id from db is used as session name
        self.ip_client.session.session = str(job['_id'])

        # set owner of job
        self.ip_client.session.username = job['owner']

        # set number of retries for each task
        self.lbview.retries = job['retries']

        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] != None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids


        task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize']))
        ar = None
        for x in task_frames:

            # prepare script input
            env_dict = {
                'DRQUEUE_FRAME': x,
                'DRQUEUE_BLOCKSIZE': job['blocksize'],
                'DRQUEUE_ENDFRAME': job['endframe'],
                'DRQUEUE_SCENEFILE': job['scenefile']
            }

            # log filename
            if job['created_with'] == "DrQueueOnRails":
                # take job directory name
                env_dict['DRQUEUE_LOGFILE'] = job['scenefile'].split("/")[-2] + "-" + str(x) + "_" + str(x + job['blocksize'] -1) + ".log"
            else:
                # take job name
                env_dict['DRQUEUE_LOGFILE'] = job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] -1) + ".log"

            # optional elements
            if 'renderdir' in job:
                env_dict['DRQUEUE_RENDERDIR'] = job['renderdir']
            if 'projectdir' in job:
                env_dict['DRQUEUE_PROJECTDIR'] = job['projectdir']
            if 'configdir' in job:
                env_dict['DRQUEUE_CONFIGDIR'] = job['configdir']
            if 'imagefile' in job:
                env_dict['DRQUEUE_IMAGEFILE'] = job['imagefile']
            if 'precommand' in job:
                env_dict['DRQUEUE_PRECOMMAND'] = job['precommand']
            if 'renderer' in job:
                env_dict['DRQUEUE_RENDERER'] = job['renderer']
            if 'fileformat' in job:
                env_dict['DRQUEUE_FILEFORMAT'] = job['fileformat']
            if 'postcommand' in job:
                env_dict['DRQUEUE_POSTCOMMAND'] = job['postcommand']
            if 'viewcommand' in job:
                env_dict['DRQUEUE_VIEWCOMMAND'] = job['viewcommand']
            if 'worldfile' in job:
                env_dict['DRQUEUE_WORLDFILE'] = job['worldfile']
            if 'terrainfile' in job:
                env_dict['DRQUEUE_TERRAINFILE'] = job['terrainfile']
            if 'composition' in job:
                env_dict['DRQUEUE_COMPOSITION'] = job['composition']
            if 'camera' in job:
                env_dict['DRQUEUE_CAMERA'] = job['camera']
            if 'resx' in job:
                env_dict['DRQUEUE_RESX'] = job['resx']
            if 'resy' in job:
                env_dict['DRQUEUE_RESY'] = job['resy']
            if 'renderpass' in job:
                env_dict['DRQUEUE_RENDERPASS'] = job['renderpass']
            if 'rendertype' in job:
                env_dict['DRQUEUE_RENDERTYPE'] = job['rendertype']
            if 'fileextension' in job:
                env_dict['DRQUEUE_FILEEXTENSION'] = job['fileextension']
            if 'stepframe' in job:
                env_dict['DRQUEUE_STEPFRAME'] = job['stepframe']
            if 'custom_bucket' in job:
                env_dict['DRQUEUE_CUSTOM_BUCKET'] = job['custom_bucket']
            if 'bucketsize' in job:
                env_dict['DRQUEUE_BUCKETSIZE'] = job['bucketsize']
            if 'custom_lod' in job:
                env_dict['DRQUEUE_CUSTOM_LOD'] = job['custom_lod']
            if 'lod' in job:
                env_dict['DRQUEUE_LOD'] = job['lod']
            if 'custom_varyaa' in job:
                env_dict['DRQUEUE_CUSTOM_VARYAA'] = job['custom_varyaa']
            if 'varyaa' in job:
                env_dict['DRQUEUE_VARYAA'] = job['varyaa']
            if 'raytrace' in job:
                env_dict['DRQUEUE_RAYTRACE'] = job['raytrace']
            if 'antialias' in job:
                env_dict['DRQUEUE_ANTIALIAS'] = job['antialias']
            if 'custom_bdepth' in job:
                env_dict['DRQUEUE_CUSTOM_BDEPTH'] = job['custom_bdepth']
            if 'bdepth' in job:
                env_dict['DRQUEUE_BDEPTH'] = job['bdepth']
            if 'custom_zdepth' in job:
                env_dict['DRQUEUE_CUSTOM_ZDEPTH'] = job['custom_zdepth']
            if 'zdepth' in job:
                env_dict['DRQUEUE_ZDEPTH'] = job['zdepth']
            if 'custom_cracks' in job:
                env_dict['DRQUEUE_CUSTOM_CRACKS'] = job['custom_cracks']
            if 'cracks' in job:
                env_dict['DRQUEUE_CRACKS'] = job['cracks']
            if 'custom_quality' in job:
                env_dict['DRQUEUE_CUSTOM_QUALITY'] = job['custom_quality']
            if 'quality' in job:
                env_dict['DRQUEUE_QUALITY'] = job['quality']
            if 'custom_qfiner' in job:
                env_dict['DRQUEUE_CUSTOM_QFINER'] = job['custom_qfiner']
            if 'qfiner' in job:
                env_dict['DRQUEUE_QFINER'] = job['qfiner']
            if 'custom_smultiplier' in job:
                env_dict['DRQUEUE_CUSTOM_SMULTIPLIER'] = job['custom_smultiplier']
            if 'smultiplier' in job:
                env_dict['DRQUEUE_SMULTIPLIER'] = job['smultiplier']
            if 'custom_mpcache' in job:
                env_dict['DRQUEUE_CUSTOM_MPCACHE'] = job['custom_mpcache']
            if 'mpcache' in job:
                env_dict['DRQUEUE_MPCACHE'] = job['mpcache']
            if 'custom_smpolygon' in job:
                env_dict['DRQUEUE_CUSTOM_SMPOLYGON'] = job['custom_smpolygon']
            if 'smpolygon' in job:
                env_dict['DRQUEUE_SMPOLYGON'] = job['smpolygon']
            if 'custom_wh' in job:
                env_dict['DRQUEUE_CUSTOM_WH'] = job['custom_wh']
            if 'custom_type' in job:
                env_dict['DRQUEUE_CUSTOM_TYPE'] = job['custom_type']
            if 'ctype' in job:
                env_dict['DRQUEUE_CTYPE'] = job['ctype']
            if 'skipframes' in job:
                env_dict['DRQUEUE_SKIPFRAMES'] = job['skipframes']
            if 'custom_command' in job:
                env_dict['DRQUEUE_CUSTOM_COMMAND'] = job['custom_command']

            # set dependencies
            dep_dict = {}
            dep_dict['job_id'] = str(job['_id'])
            if ('os' in job['limits']) and (job['limits']['os'] != None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name' in job['limits']) and (job['limits']['pool_name'] != None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)

            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()

        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True


    def identify_computer(self, engine_id, cache_time, timeout=15):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db_by_engine_id(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine['created_at'] + cache_time):
            log.debug("Engine %i was found in DB and info is up-to-date." % engine_id)
            return engine
        # store new info
        else:
            if engine != None:
                log.debug("Engine %i was found in DB, but info needs to be updated." % engine_id)
            else:
                log.debug("Engine %i was not found in DB." % engine_id)
            # run command only on specific computer
            try:
                dview = self.ip_client[engine_id]
            except IndexError:
                log.debug("Engine with id %i unknown." % engine_id)
                # delete old entry from database
                DrQueueComputer.delete_from_db_by_engine_id(engine_id)
                log.debug("Engine with id %i deleted from database." % engine_id)
                new_engine = None
            else:
                # run command in async mode
                dview.block = False
                command = "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer()"
                ar = dview.execute(command)
                try:
                    # try to get results & wait until timeout
                    ar.get(timeout)
                except Exception:
                    if engine != None:
                        log.debug("Update request for engine %i timed out. Using old information from DB." % engine_id)
                        new_engine = engine
                    else:
                        log.debug("Information request for engine %i timed out." % engine_id)
                        new_engine = None
                else:
                    # get computer dict from engine namespace
                    new_engine = dview['engine']
                    # set to known engine_id
                    new_engine['engine_id'] = engine_id
                    # set creation time
                    new_engine['created_at'] = int(time.time())
                    # store entry in database
                    DrQueueComputer.store_db(new_engine)
            return new_engine


    def computer_set_pools(self, computer, pool_list):
        """add computer to list of pools"""
        # convert to string
        pool_str = ','.join(pool_list)
        # update environment variable on engine
        dview = self.ip_client[computer['engine_id']]
        dview.block = True
        command = "import os\nos.environ[\"DRQUEUE_POOL\"] = \"" + pool_str + "\""
        dview.execute(command)
        # update database entry
        computer['pools'] = pool_list
        DrQueueComputer.store_db(computer)
        log.debug("Engine " + str(computer['engine_id']) + " added to pools " + pool_str + ".")
        return computer


    def computer_get_pools(self, computer):
        """Return all pool names where computer is member."""
        return computer['pools']


    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar


    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()


    def query_job_by_id(self, job_id):
        """Query job by given id"""
        return DrQueueJob.query_db(job_id)


    def query_job_by_name(self, job_name):
        """Query job by given name"""
        return DrQueueJob.query_job_by_name(job_name)


    def query_job_tasks_left(self, job_id):
        """Query left frames of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] == None:
                left += 1
        return left


    def query_job_finish_time(self, job_id):
        """Query oldest finish time of all tasks."""
        job = self.query_job_by_id(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job ) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look if a later finish time exists
            if (task['completed'] != None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time


    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME."""
        if ('buffers' in task) and task['buffers'] != []:
            frame_nr = int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])
        else:
            frame_nr = 1
        return frame_nr


    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job. Sort by frame number."""
        task_list =  self.ip_client.db_query({'header.session' : str(job_id)}, keys=self.all_task_query_keys)
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list


    def query_interrupted_task_list(self, job_id):
        """Query a list of interrupted tasks of certain job. Sort by frame number."""
        job = self.query_job_by_id(job_id)
        task_list =  self.ip_client.db_query({'header.session' : str(job_id)}, keys=self.all_task_query_keys)
        interrupted_task_list = []

        for task in task_list:
            frame_nr = self.get_frame_nr(task)
            print("frame_nr: " + str(frame_nr))
            # log filename
            if job['renderer'] == "blender":
                filesearch = job['scenefile'] + str("%04d" % frame_nr) + ".???"
                found = glob.glob(filesearch)
                # file was found
                if len(found) > 0:
                    outputfile = found[0]
                    print("outputfile: "+ str(outputfile))
                    filesize = os.path.getsize(outputfile)
                    print(filesize)
                    # file exists, but is empty
                    if filesize == 0:
                        interrupted_task_list.append(task)
                # file was not found
                else:
                    outputfile = None
                    print("outputfile: "+ str(outputfile))
                    if (task['completed'] == None) and (task['started'] == None):
                        interrupted_task_list.append(task)
            else:
                raise ValueError("Only Blender renderer supported so far.")

        return interrupted_task_list


    def query_task(self, task_id):
        """Query a single task."""
        task = self.ip_client.db_query({'msg_id' : task_id }, keys=self.all_task_query_keys)[0]
        return task


    def query_computer_list(self):
        """Query a list of all computers."""
        return self.ip_client.ids


    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""

        # disable job
        self.job_disable(job_id)

        tasks = self.query_task_list(job_id)
        tasks_to_stop = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_content" in task) and (task["result_content"] != None) and (task["result_content"]["status"] == "ok"):
                print("  finished at " + str(task["completed"]))
            else:
                # get task stats of all computers
                stats = self.ip_client.queue_status('all', True)
                # check if task is already running on an engine
                found_on_engine = False
                for key,status in list(stats.items()):
                    if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                        # skip tasks which are already running on an engine
                        print("  not finished yet but already queued to engine. will leave it there.")
                        found_on_engine = True
                        break

                # if a task isn't already queued/running on an engine, it should be safe to abort it
                if found_on_engine == False:
                    print("  not finished yet. will abort.")
                    tasks_to_stop.append(task['msg_id'])

        if len(tasks_to_stop) > 0:
            try:
                self.ip_client.abort(tasks_to_stop)
            except Exception as e:
                print("ERROR: " + str(e))

        return True


    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not"""

        # disable job
        self.job_disable(job_id)

        tasks = self.query_task_list(job_id)
        running_engines = []
        tasks_to_stop = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key,status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            tasks_to_stop.append(task['msg_id'])
        # stop all matching tasks at once
        try:
            self.ip_client.abort(tasks_to_stop)
        except Exception as e:
            print("ERROR: " + str(e))

        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True


    def job_disable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = False
        DrQueueJob.update_db(job)
        return True


    def job_enable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = True
        DrQueueJob.update_db(job)
        return True


    def job_delete(self, job_id):
        """Delete job and all of it's tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_computer_list()
        error = False
        pending_tasks = []
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                # abort outstanding tasks which are already queued to engine
                print('aborting task ' + str(task['msg_id']))
                try:
                    self.ip_client.abort(task['msg_id'], engines)
                except Exception as e:
                    print('Error: ' + str(e))
                    error = True
            # purge all tasks which are not pending
            print('purging task ' + str(task['msg_id']))
            try:
                self.ip_client.purge_hub_results(task['msg_id'], engines)
            except Exception:
                print('Warning: ' + str(task['msg_id']) + ' is pending. Try to kill job before.')
                pending_tasks.append(task)
                error = True
        # delete job if no error occured
        if error == False:
            # delete job itself
            DrQueueJob.delete_from_db(job_id)
            return True
        else:
            return False
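
For reference, here is a minimal usage sketch for the Client class above. It assumes a running IPython controller with engines and the DrQueue package importable in this module; the job dict is hypothetical and only carries the keys that job_run reads (a real '_id' would normally come from the job database).

client = Client()
job = {
    '_id': 'hypothetical-job-id',   # normally assigned by the database
    'name': 'demo_render',
    'owner': 'artist',
    'retries': 1,
    'startframe': 1,
    'endframe': 10,
    'blocksize': 1,
    'scenefile': '/tmp/scene.blend',
    'renderer': 'blender',
    'created_with': 'manual',
    'limits': {'depend': None, 'os': None, 'minram': 0, 'mincores': 0, 'pool_name': None},
}
client.job_run(job)
print(client.query_job_tasks_left(job['_id']))
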
Example #2
def random_rc(profile, n_jobs, n_executed_jobs):
    """Return a view over some or all engines of an IPython cluster."""
    # signature assumed from the commented-out random_rc('net', -1, 10) call in __main__ below
    # print n_jobs, n_executed_jobs,
    rc = Client(profile=profile)
    n_clusters = len(rc)
    if n_executed_jobs == 0:
        n_executed_jobs = n_jobs
    elif n_executed_jobs < n_clusters:
        n_jobs = n_executed_jobs
    if n_jobs >= n_clusters:
        dview = rc[:]
    elif n_jobs == -1:
        dview = rc[:]
    elif n_jobs < n_clusters:
        dview = rc[list(np.random.permutation(n_clusters)[:n_executed_jobs])]
    # A = dview.queue_status()
    # print A.keys()
    return dview


if __name__ == "__main__":

    rc = Client(profile='net')
    A = rc.queue_status()
    for ii in range(len(rc)):
        print(A[ii])

    # dview = random_rc('net', -1, 10)
    # A = dview.queue_status()
    # print len(dview)
    # for ii in A.keys():
        # print A[ii]
Example #3
class ParallelCache(object):
    def __init__(self, cluster_profile=None, cachedir=None, purge=False,
                 idle_timeout=None, shutdown=False, retries=3, poll_interval=10,
                 verbose=5, **kwargs):
        self._purge = purge
        self._idle_timeout = idle_timeout
        self._shutdown = shutdown
        self._retries = retries
        self._poll_interval = poll_interval
        self._verbose = verbose
        self._execution_times = None
        if cluster_profile is not None:
            self._ip_client = Client(profile=cluster_profile, **kwargs)
        else:
            self._ip_client = None

        if cachedir is not None:
            self._memory = Memory(cachedir=cachedir, verbose=verbose)
        else:
            self._memory = None

    def map(self, f, *sequences, **kwargs):
        # make sure all sequences have the same length
        n_jobs = None
        my_seqs = []
        for ii, seq in enumerate(sequences):
            try:
                this_n_elems = len(seq)
                if n_jobs is None:
                    n_jobs = this_n_elems
                if this_n_elems != n_jobs:
                    raise ValueError('All sequences must have the same length, '
                                     'sequence at position %d has length %d'
                                     % (ii + 1, this_n_elems))
                my_seqs.append(seq)
            except TypeError:
                # we allow passing ints etc, convert them to a sequence
                my_seqs.append(repeat(seq))

        t_start = time.time()
        if self._ip_client is None:
            if self._verbose >= 1:
                tmp = 'without' if self._memory is None else 'with'
                print_('Running %d jobs locally %s caching..' % (n_jobs, tmp))
            out = list()
            my_fun = f if self._memory is None else self._memory.cache(f)
            for this_args in zip(*my_seqs):
                out.append(my_fun(*this_args, **kwargs))
        elif self._ip_client is not None and self._memory is None:
            if self._verbose >= 1:
                print('Running %d jobs on cluster without caching..' % n_jobs)
            out = [None] * n_jobs
            lbview = self._ip_client.load_balanced_view()
            tasks = list()
            for this_args in zip(*my_seqs):
                tasks.append(lbview.apply(f, *this_args, **kwargs))
            # wait for tasks to complete
            result_retrieved = [False] * len(tasks)
            execution_times = [None] * len(tasks)
            retry_no = np.zeros(len(tasks), dtype=int)
            last_print = 0
            last_idle_check = time.time()
            idle_times = {}
            while True:
                for ii, task in enumerate(tasks):
                    if not result_retrieved[ii] and task.ready():
                        if task.successful():
                            out[ii] = task.get()
                            execution_times[ii] = task.serial_time
                            result_retrieved[ii] = True
                        else:
                            # task failed for some reason, re-run it
                            if retry_no[ii] < self._retries:
                                if self._verbose > 3:
                                    print ('\nTask %d failed, re-running (%d / %d)'
                                           % (ii, retry_no[ii] + 1,
                                              self._retries))
                                this_args = list(zip(*my_seqs))[ii]
                                new_task = lbview.apply(f, *this_args, **kwargs)
                                tasks[ii] = new_task
                                retry_no[ii] += 1
                            else:
                                msg = ('\nTask %d failed %d times. Stopping'
                                       % (ii, self._retries + 1))
                                print(msg)
                                # this will throw an exception
                                task.get()
                                raise RuntimeError(msg)
                        if self._purge:
                            _purge_results(self._ip_client, task)

                n_completed = int(np.sum(result_retrieved))
                progress = n_completed / float(n_jobs)
                # print progress in 10% increments
                this_print = int(np.floor(progress * 10))
                if self._verbose >= 1 and this_print != last_print:
                    print_(' %d%%' % (100 * progress), end='')
                    last_print = this_print
                if n_completed == n_jobs:
                    # we are done!
                    print_('')  # newline
                    break
                if self._idle_timeout is not None and time.time() > last_idle_check + 30:
                    now = time.time()
                    queue = self._ip_client.queue_status()
                    shutdown_eids = []
                    for eid in self._ip_client.ids:
                        if eid not in queue:
                            continue
                        if queue[eid]['queue'] + queue[eid]['tasks'] == 0:
                            # engine is idle
                            idle_time = idle_times.get(eid, None)
                            if idle_time is None:
                                # mark engine as idle
                                idle_times[eid] = now
                                continue
                            if now - idle_time > self._idle_timeout:
                                # shut down engine
                                shutdown_eids.append(eid)
                        elif eid in idle_times:
                            # engine has started running again
                            del idle_times[eid]

                    if len(shutdown_eids) > 0:
                        if self._verbose > 0:
                            print('Shutting down engines:', shutdown_eids)
                        dv = self._ip_client.direct_view(shutdown_eids)
                        dv.shutdown()
                        for eid in shutdown_eids:
                            del idle_times[eid]
                    last_idle_check = now
                time.sleep(self._poll_interval)

            self._execution_times = execution_times
            if self._shutdown:
                self._shutdown_cluster()

        elif self._ip_client is not None and self._memory is not None:
            # now this is the interesting case..
            if self._verbose >= 1:
                print('Running %d jobs on cluster with caching..' % n_jobs)
            f_cache = self._memory.cache(f)
            lbview = None
            out = [None] * n_jobs
            execution_times = [None] * n_jobs
            task_info = list()

            n_cache = 0
            for ii, this_args in enumerate(zip(*my_seqs)):
                # get the cache directory
                out_dir, _ = f_cache.get_output_dir(*this_args, **kwargs)
                if op.exists(op.join(out_dir, 'output.pkl')):
                    out[ii] = f_cache.load_output(out_dir)
                    n_cache += 1
                    continue
                if lbview is None:
                    lbview = self._ip_client.load_balanced_view()
                task = lbview.apply(f, *this_args, **kwargs)
                task_info.append(dict(task=task, idx=ii, args=this_args))
            if self._verbose >= 1:
                print_('Loaded %d results from cache' % n_cache)

            # wait for tasks to complete
            last_print = 0
            last_idle_check = time.time()
            idle_times = {}
            result_retrieved = [False] * len(task_info)
            retry_no = np.zeros(len(task_info), dtype=int)
            failed_tasks = []
            while len(task_info) > 0:
                for ii, ti in enumerate(task_info):
                    if not result_retrieved[ii] and ti['task'].ready():
                        task = ti['task']
                        if task.successful():
                            this_out = task.get()
                            # cache the input and output
                            out_dir, _ = f_cache.get_output_dir(*ti['args'],
                                                                **kwargs)
                            f_cache._persist_output(this_out, out_dir)
                            f_cache._persist_input(out_dir, *ti['args'], **kwargs)
                            # insert result into output
                            out[ti['idx']] = this_out
                            execution_times[ti['idx']] = task.serial_time
                            result_retrieved[ii] = True
                        else:
                            if retry_no[ii] < self._retries:
                                if self._verbose > 3:
                                    print ('\nTask %d failed, re-running (%d / %d)'
                                           % (ii, retry_no[ii] + 1,
                                              self._retries))
                                new_task = lbview.apply(f, *ti['args'], **kwargs)
                                ti['task'] = new_task
                                retry_no[ii] += 1
                            else:
                                # task failed too many times, mark it as done
                                # but keep running
                                if self._verbose >= 1:
                                    print ('\nTask %d failed %d times.'
                                           % (ii, self._retries + 1))
                                failed_tasks.append(task)
                                result_retrieved[ii] = True

                    if self._purge:
                        _purge_results(self._ip_client, task)

                if self._idle_timeout is not None and time.time() > last_idle_check + 30:
                    now = time.time()
                    queue = self._ip_client.queue_status()
                    shutdown_eids = []
                    for eid in self._ip_client.ids:
                        if eid not in queue:
                            continue
                        if queue[eid]['queue'] + queue[eid]['tasks'] == 0:
                            # engine is idle
                            idle_time = idle_times.get(eid, None)
                            if idle_time is None:
                                # mark engine as idle
                                idle_times[eid] = now
                                continue
                            if now - idle_time > self._idle_timeout:
                                # shut down engine
                                shutdown_eids.append(eid)
                        elif eid in idle_times:
                            # engine has started running again
                            del idle_times[eid]

                    if len(shutdown_eids) > 0:
                        if self._verbose > 0:
                            print('Shutting down engines:', shutdown_eids)
                        dv = self._ip_client.direct_view(shutdown_eids)
                        dv.shutdown()
                        for eid in shutdown_eids:
                            del idle_times[eid]

                    last_idle_check = now

                n_completed = int(np.sum(result_retrieved))
                progress = n_completed / float(n_jobs - n_cache)
                # print progress in 10% increments
                this_print = int(np.floor(progress * 10))
                if self._verbose >= 1 and this_print != last_print:
                    print_(' %d%% ' % (100 * progress), end='')
                    last_print = this_print
                if n_completed == n_jobs - n_cache:
                    # we are done!
                    print_('')  # newline
                    break
                time.sleep(self._poll_interval)

            if self._shutdown:
                self._shutdown_cluster()

            if len(failed_tasks) > 0:
                msg = ''
                for task in failed_tasks[:5]:
                    try:
                        task.get()
                    except Exception as e:
                        msg += str(e)
                raise RuntimeError('%d tasks failed:\n %s'
                                   % (len(failed_tasks), msg))

            self._execution_times = execution_times
        else:
            raise RuntimeError('WTF?')

        if self._verbose >= 1:
            print_('Done (%0.1f seconds)' % (time.time() - t_start))

        return out

    def get_last_excecution_times(self):
        return self._execution_times

    def purge_results(self, f, *sequences, **kwargs):
        # make sure all sequences have the same length
        n_jobs = None
        my_seqs = []
        for ii, seq in enumerate(sequences):
            try:
                this_n_elems = len(seq)
                if n_jobs is None:
                    n_jobs = this_n_elems
                if this_n_elems != n_jobs:
                    raise ValueError('All sequences must have the same length, '
                                     'sequence at position %d has length %d'
                                     % (ii + 1, this_n_elems))
                my_seqs.append(seq)
            except TypeError:
                # we allow passing ints etc, convert them to a sequence
                my_seqs.append(repeat(seq))

        f_cache = self._memory.cache(f)
        n_deleted = 0
        for this_args in zip(*my_seqs):
            out_dir, _ = f_cache.get_output_dir(*this_args, **kwargs)
            if op.exists(out_dir):
                shutil.rmtree(out_dir)
                n_deleted += 1
        print('Purging cache: %d out of %d deleted' % (n_deleted, n_jobs))

    def _shutdown_cluster(self):
        # shut down all idle engines
        queue = self._ip_client.queue_status()
        shutdown_eids = []
        for eid in self._ip_client.ids:
            if eid not in queue:
                continue
            if queue[eid]['queue'] + queue[eid]['tasks'] == 0:
                shutdown_eids.append(eid)
        if len(shutdown_eids) > 0:
            if self._verbose > 0:
                print('Shutting down engines:', shutdown_eids)
            dv = self._ip_client.direct_view(shutdown_eids)
            dv.shutdown()
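
Below is a minimal usage sketch for ParallelCache, assuming joblib's Memory and the module's print_ helper are importable and using a hypothetical cache directory. With cluster_profile=None the map() call runs everything locally (with on-disk caching); passing an IPython profile name would instead dispatch the tasks to a cluster.

def square(x):
    return x * x

cache = ParallelCache(cluster_profile=None, cachedir='/tmp/pcache', verbose=1)
results = cache.map(square, range(8))
print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]
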
Example #4
class Client():
    """DrQueue client actions"""
    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()

        # enable tracking
        self.lbview.track = True


    def job_run(self, job):
        """Create and queue tasks from job object"""

        # check job name
        if job['name'] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job['name'])
            return False

        # save job in database
        job_id = DrQueueJob.store_db(job)

        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)

        # set owner of job
        self.ip_client.session.username = job['owner']

        # set number of retries for each task
        self.lbview.retries = job['retries']

        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] != None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids

        # check frame numbers
        if not (job['startframe'] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal or greater than 1.")
            return False
        if not (job['endframe'] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal or greater than 1.")
            return False
        if not (job['endframe'] >= job['startframe']):
            raise ValueError("Invalid value for endframe. Has be to equal or greater than startframe.")
            return False
        if job['endframe'] > job['startframe']:
            if not (job['endframe'] - job['startframe'] >= job['blocksize']):
                raise ValueError("Invalid value for blocksize. Has to be equal or lower than endframe-startframe.")
                return False
        if job['endframe'] == job['startframe']:
            if job['blocksize'] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal 1 if endframe equals startframe.")
                return False

        task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize']))
        ar = None
        for x in task_frames:
            # prepare script input
            env_dict = {
                'DRQUEUE_FRAME': x,
                'DRQUEUE_BLOCKSIZE': job['blocksize'],
                'DRQUEUE_ENDFRAME': job['endframe'],
                'DRQUEUE_SCENEFILE': job['scenefile'],
                'DRQUEUE_LOGFILE': job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] - 1) + ".log"
            }

            # optional elements
            if 'renderdir' in job:
                env_dict['DRQUEUE_RENDERDIR'] = job['renderdir']
            if 'projectdir' in job:
                env_dict['DRQUEUE_PROJECTDIR'] = job['projectdir']
            if 'configdir' in job:
                env_dict['DRQUEUE_CONFIGDIR'] = job['configdir']
            if 'imagefile' in job:
                env_dict['DRQUEUE_IMAGEFILE'] = job['imagefile']
            if 'precommand' in job:
                env_dict['DRQUEUE_PRECOMMAND'] = job['precommand']
            if 'renderer' in job:
                env_dict['DRQUEUE_RENDERER'] = job['renderer']
            if 'fileformat' in job:
                env_dict['DRQUEUE_FILEFORMAT'] = job['fileformat']
            if 'postcommand' in job:
                env_dict['DRQUEUE_POSTCOMMAND'] = job['postcommand']
            if 'viewcommand' in job:
                env_dict['DRQUEUE_VIEWCOMMAND'] = job['viewcommand']
            if 'worldfile' in job:
                env_dict['DRQUEUE_WORLDFILE'] = job['worldfile']
            if 'terrainfile' in job:
                env_dict['DRQUEUE_TERRAINFILE'] = job['terrainfile']
            if 'composition' in job:
                env_dict['DRQUEUE_COMPOSITION'] = job['composition']
            if 'camera' in job:
                env_dict['DRQUEUE_CAMERA'] = job['camera']
            if 'resx' in job:
                env_dict['DRQUEUE_RESX'] = job['resx']
            if 'resy' in job:
                env_dict['DRQUEUE_RESY'] = job['resy']
            if 'renderpass' in job:
                env_dict['DRQUEUE_RENDERPASS'] = job['renderpass']
            if 'rendertype' in job:
                env_dict['DRQUEUE_RENDERTYPE'] = job['rendertype']
            if 'fileextension' in job:
                env_dict['DRQUEUE_FILEEXTENSION'] = job['fileextension']
            if 'stepframe' in job:
                env_dict['DRQUEUE_STEPFRAME'] = job['stepframe']
            if 'custom_bucket' in job:
                env_dict['DRQUEUE_CUSTOM_BUCKET'] = job['custom_bucket']
            if 'bucketsize' in job:
                env_dict['DRQUEUE_BUCKETSIZE'] = job['bucketsize']
            if 'custom_lod' in job:
                env_dict['DRQUEUE_CUSTOM_LOD'] = job['custom_lod']
            if 'lod' in job:
                env_dict['DRQUEUE_LOD'] = job['lod']
            if 'custom_varyaa' in job:
                env_dict['DRQUEUE_CUSTOM_VARYAA'] = job['custom_varyaa']
            if 'varyaa' in job:
                env_dict['DRQUEUE_VARYAA'] = job['varyaa']
            if 'raytrace' in job:
                env_dict['DRQUEUE_RAYTRACE'] = job['raytrace']
            if 'antialias' in job:
                env_dict['DRQUEUE_ANTIALIAS'] = job['antialias']
            if 'custom_bdepth' in job:
                env_dict['DRQUEUE_CUSTOM_BDEPTH'] = job['custom_bdepth']
            if 'bdepth' in job:
                env_dict['DRQUEUE_BDEPTH'] = job['bdepth']
            if 'custom_zdepth' in job:
                env_dict['DRQUEUE_CUSTOM_ZDEPTH'] = job['custom_zdepth']
            if 'zdepth' in job:
                env_dict['DRQUEUE_ZDEPTH'] = job['zdepth']
            if 'custom_cracks' in job:
                env_dict['DRQUEUE_CUSTOM_CRACKS'] = job['custom_cracks']
            if 'cracks' in job:
                env_dict['DRQUEUE_CRACKS'] = job['cracks']
            if 'custom_quality' in job:
                env_dict['DRQUEUE_CUSTOM_QUALITY'] = job['custom_quality']
            if 'quality' in job:
                env_dict['DRQUEUE_QUALITY'] = job['quality']
            if 'custom_qfiner' in job:
                env_dict['DRQUEUE_CUSTOM_QFINER'] = job['custom_qfiner']
            if 'qfiner' in job:
                env_dict['DRQUEUE_QFINER'] = job['qfiner']
            if 'custom_smultiplier' in job:
                env_dict['DRQUEUE_CUSTOM_SMULTIPLIER'] = job['custom_smultiplier']
            if 'smultiplier' in job:
                env_dict['DRQUEUE_SMULTIPLIER'] = job['smultiplier']
            if 'custom_mpcache' in job:
                env_dict['DRQUEUE_CUSTOM_MPCACHE'] = job['custom_mpcache']
            if 'mpcache' in job:
                env_dict['DRQUEUE_MPCACHE'] = job['mpcache']
            if 'custom_smpolygon' in job:
                env_dict['DRQUEUE_CUSTOM_SMPOLYGON'] = job['custom_smpolygon']
            if 'smpolygon' in job:
                env_dict['DRQUEUE_SMPOLYGON'] = job['smpolygon']
            if 'custom_wh' in job:
                env_dict['DRQUEUE_CUSTOM_WH'] = job['custom_wh']
            if 'custom_type' in job:
                env_dict['DRQUEUE_CUSTOM_TYPE'] = job['custom_type']
            if 'ctype' in job:
                env_dict['DRQUEUE_CTYPE'] = job['ctype']
            if 'skipframes' in job:
                env_dict['DRQUEUE_SKIPFRAMES'] = job['skipframes']

            # set dependencies
            dep_dict = {}
            if ('os' in job['limits']) and (job['limits']['os'] != None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name' in job['limits']) and (job['limits']['pool_name'] != None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)

            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()

        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True


    def identify_computer(self, engine_id, cache_time):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine['date'] + cache_time):
            print("DEBUG: Engine %i was found in DB" % engine_id)
        # store new info
        else:
            print("DEBUG: Engine %i was not found in DB" % engine_id)
            # run command only on specific computer
            dview = self.ip_client[engine_id]
            dview.block = True
            dview.execute("import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer(" + str(engine_id) + ")")
            engine = dview['engine']
            engine['date'] = int(time.time())
            DrQueueComputer.store_db(engine)
        return engine


    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar


    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()


    def query_running_job_list(self):
        """Query a list of all running jobs"""
        jobs = DrQueueJob.query_job_list()
        running_jobs = []
        for job in jobs:
            if self.query_job_tasks_left(job['_id']) > 0:
                running_jobs.append(job)
        return running_jobs


    def query_jobname(self, task_id):
        """Query jobname from task id"""
        data = self.ip_client.db_query({"msg_id" : task_id})
        job_id = data[0]['header']['session']
        job = DrQueueJob.query_db(job_id)
        return job['name']


    def query_job(self, job_id):
        """Query job from id"""
        return DrQueueJob.query_db(job_id)


    def query_job_by_name(self, job_name):
        """Query job from name"""
        return DrQueueJob.query_job_by_name(job_name)


    def query_job_tasks_left(self, job_id):
        """Query left frames of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] == None:
                left += 1
        return left


    def query_job_finish_time(self, job_id):
        """Query oldest finish time of all tasks."""
        job = self.query_job(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job ) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look if a later finish time exists
            if (task['completed'] != None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time


    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME."""
        return int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])


    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job"""
        task_list =  self.ip_client.db_query({'header.session' : str(job_id)})
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list


    def query_task(self, task_id):
        """Query a single task"""
        task = self.ip_client.db_query({'msg_id' : task_id })[0]
        return task


    def query_engine_list(self):
        """Query a list of all engines"""
        return self.ip_client.ids


    def query_engines_of_pool(self, pool_name):
        """Return available engines of certain pool."""
        pool_computers = list(self.ip_client.ids)
        if pool_name != None:
            computers = DrQueueComputerPool.query_pool_members(pool_name)
            if computers == None:
                raise ValueError("Pool \"%s\" is not existing!" % pool_name)
                return False
            for comp in list(pool_computers):
                if not comp in computers:
                    pool_computers.remove(comp)
            if pool_computers == []:
                raise ValueError("No computer of pool %s is available!" % pool_name)
                return False
            print("DEBUG: matching pool: " + pool_name)
            print(pool_computers)
        return pool_computers


    def query_engines_of_os(self, os_name):
        """Return only engines running certain OS."""
        # run job only on matching os
        matching_os = list(self.ip_client.ids)
        if os_name != None:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                # os string has to contain os_name
                if not os_name in engine['os']:
                    matching_os.remove(engine_id)
            print("DEBUG: matching os: " + os_name)
            print(matching_os)
        return matching_os


    def query_engines_with_minram(self, minram):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching minram
        matching_minram = list(self.ip_client.ids)
        if minram > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine['memory'] < minram:
                    matching_minram.remove(engine_id)
            print("DEBUG: matching minram: " + str(minram))
            print(matching_minram)
        return matching_minram


    def query_engines_with_mincores(self, mincores):
        """Return only engines with at least mincores CPU cores."""
        # run job only on matching mincores
        matching_mincores = list(self.ip_client.ids)
        if mincores > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine['ncorescpu'] * engine['ncpus'] < mincores:
                    matching_mincores.remove(engine_id)
            print("DEBUG: matching mincores: " + str(mincores))
            print(matching_mincores)
        return matching_mincores


    def match_all_limits(self, os_list, minram_list, mincores_list, pool_list):
        """Match all limits for job."""
        tmp_list = []
        # build list with all list members
        tmp_list.extend(os_list)
        tmp_list.extend(minram_list)
        tmp_list.extend(mincores_list)
        tmp_list.extend(pool_list)
        # make entries unique
        tmp_list = set(tmp_list)
        tmp_list = list(tmp_list)
        matching_limits = []
        for entry in tmp_list:
            # look if entry is in all lists
            if (entry in os_list) and (entry in minram_list) and (entry in mincores_list) and (entry in pool_list):
                matching_limits.append(entry)
            else:
                print("DEBUG: %i isn't matching limits" % entry)
        print("DEBUG: matching limits:")
        print(matching_limits)
        if len(matching_limits) == 0:
            message = "No engine meets the requirements."
            print(message)
            raise Exception(message)
        elif len(matching_limits) > 0:
            # only run on matching engines
            self.lbview = self.ip_client.load_balanced_view(matching_limits)
        else:
            self.lbview = self.ip_client.load_balanced_view()


    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        tasks = self.query_task_list(job_id)
        # abort all queued tasks
        for task in tasks:
            self.ip_client.abort(task['msg_id'])
        return True


    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not"""
        tasks = self.query_task_list(job_id)
        running_engines = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key,status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            self.ip_client.abort(task['msg_id'])
        # restart all engines which still run a task
        running_engines = set(running_engines)
        return True


    def job_delete(self, job_id):
        """Delete job and all of it's tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_engine_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task['msg_id'])
            self.ip_client.purge_results(task['msg_id'])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True


    def task_continue(self, task_id):
        """Continue aborted or failed task"""
        task = self.query_task(task_id)
        # check if action is needed
        if (task['completed'] != None) and ((task['result_header']['status'] == "error") or (task['result_header']['status'] == "aborted")):
            self.task_requeue(task_id)
        return True


    def task_requeue(self, task_id):
        """Requeue task"""
        self.ip_client.resubmit(task_id)
        print("requeuing %s" % task_id)
        return True


    def job_continue(self, job_id):
        """Continue stopped job and all of it's tasks"""
        job = self.query_job(job_id)
        tasks = self.query_task_list(job_id)
        # continue tasks
        for task in tasks:
            self.task_continue(task['msg_id'])
        return True


    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job(job_id)
        tasks = self.query_task_list(job_id)
        # rerun tasks
        for task in tasks:
            self.task_requeue(task['msg_id'])
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)
        return True


    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task['completed'] == None:
                status_pending += 1
            else:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    # look for done tasks
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status


    def job_estimated_finish_time(self, job_id):
        """Calculate estimated finish time of job."""
        tasks = self.query_task_list(job_id)
        spent_times = []
        # get spent time for each finished task
        for task in tasks:
            if task['completed'] != None:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        timediff = task['completed'] - task['started']
                        spent_times.append(timediff)
        if len(spent_times) > 0:
            # calculate sum of spent time
            sum_times = datetime.timedelta(0)
            for spent in spent_times:
                sum_times += spent
            # calculate mean time for a single task
            meantime = sum_times / len(spent_times)
            # calculate estimated time left
            tasks_left = len(tasks) - len(spent_times)
            time_left = tasks_left * meantime
            # query job object
            job = self.query_job(job_id)
            # look if all tasks are already done
            if self.query_job_tasks_left(job_id) == 0:
                finish_time = self.query_job_finish_time(job_id)
            else:
                # calculate estimated finish time, use requeue time if available
                if ('requeue_time' in job ) and (job['requeue_time'] != False):
                    finish_time = job['requeue_time'] + time_left
                else:
                    finish_time = job['submit_time'] + time_left
        else:
            meantime = "unknown"
            time_left = "unknown"
            finish_time = "unknown"
        return meantime, time_left, finish_time


    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db(engine_id)
        # shutdown computer
        self.ip_client.shutdown(engine_id)
        return True


    def engine_restart(self, engine_id):
        """Restart a specific engine"""
        self.ip_client.shutdown(engine_id, True, False, True)
        return True
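
A minimal, hypothetical usage sketch for the methods above (the import path and the job_id value are illustrative assumptions, not part of the example):

# Hypothetical usage of the Client methods shown above.
from DrQueue import Client

client = Client()
job_id = "4f1d9c0e9f1a4b0c8d000001"  # placeholder job id string

# aggregate task states into a single job status string
print("job status: %s" % client.job_status(job_id))

# estimate when the job will finish, based on tasks completed so far
meantime, time_left, finish_time = client.job_estimated_finish_time(job_id)
print("mean: %s, left: %s, ETA: %s" % (meantime, time_left, finish_time))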
Example #5
class Client():
    """DrQueue client actions"""
    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()

        # enable tracking
        self.lbview.track = True

        # list of all available query keys
        self.all_task_query_keys = [
            'msg_id', 'header', 'content', 'buffers', 'submitted',
            'client_uuid', 'engine_uuid', 'started', 'completed',
            'resubmitted', 'result_header', 'result_content', 'result_buffers',
            'queue', 'pyin', 'pyout', 'pyerr', 'stdout', 'stderr'
        ]

    def job_run(self, job):
        """Create and queue tasks from job object"""

        # check job name
        if job['name'] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job['name'])
            return False

        # save job in database
        job_id = DrQueueJob.store_db(job)

        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)

        # set owner of job
        self.ip_client.session.username = job['owner']

        # set number of retries for each task
        self.lbview.retries = job['retries']

        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] != None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids

        # check frame numbers
        if not (job['startframe'] >= 1):
            raise ValueError(
                "Invalid value for startframe. Has to be equal or greater than 1."
            )
            return False
        if not (job['endframe'] >= 1):
            raise ValueError(
                "Invalid value for endframe. Has to be equal or greater than 1."
            )
            return False
        if not (job['endframe'] >= job['startframe']):
            raise ValueError(
                "Invalid value for endframe. Has be to equal or greater than startframe."
            )
            return False
        if job['endframe'] > job['startframe']:
            if not (job['endframe'] - job['startframe'] >= job['blocksize']):
                raise ValueError(
                    "Invalid value for blocksize. Has to be equal or lower than endframe-startframe."
                )
                return False
        if job['endframe'] == job['startframe']:
            if job['blocksize'] != 1:
                raise ValueError(
                    "Invalid value for blocksize. Has to be equal 1 if endframe equals startframe."
                )
                return False

        task_frames = list(
            range(job['startframe'], job['endframe'] + 1, job['blocksize']))
        ar = None
        for x in task_frames:

            # prepare script input
            env_dict = {
                'DRQUEUE_FRAME': x,
                'DRQUEUE_BLOCKSIZE': job['blocksize'],
                'DRQUEUE_ENDFRAME': job['endframe'],
                'DRQUEUE_SCENEFILE': job['scenefile']
            }

            # log filename
            if job['created_with'] == "DrQueueOnRails":
                # take job directory name
                env_dict['DRQUEUE_LOGFILE'] = job['scenefile'].split(
                    "/")[-2] + "-" + str(x) + "_" + str(x + job['blocksize'] -
                                                        1) + ".log"
            else:
                # take job name
                env_dict['DRQUEUE_LOGFILE'] = job['name'] + "-" + str(
                    x) + "_" + str(x + job['blocksize'] - 1) + ".log"

            # optional elements
            if 'renderdir' in job:
                env_dict['DRQUEUE_RENDERDIR'] = job['renderdir']
            if 'projectdir' in job:
                env_dict['DRQUEUE_PROJECTDIR'] = job['projectdir']
            if 'configdir' in job:
                env_dict['DRQUEUE_CONFIGDIR'] = job['configdir']
            if 'imagefile' in job:
                env_dict['DRQUEUE_IMAGEFILE'] = job['imagefile']
            if 'precommand' in job:
                env_dict['DRQUEUE_PRECOMMAND'] = job['precommand']
            if 'renderer' in job:
                env_dict['DRQUEUE_RENDERER'] = job['renderer']
            if 'fileformat' in job:
                env_dict['DRQUEUE_FILEFORMAT'] = job['fileformat']
            if 'postcommand' in job:
                env_dict['DRQUEUE_POSTCOMMAND'] = job['postcommand']
            if 'viewcommand' in job:
                env_dict['DRQUEUE_VIEWCOMMAND'] = job['viewcommand']
            if 'worldfile' in job:
                env_dict['DRQUEUE_WORLDFILE'] = job['worldfile']
            if 'terrainfile' in job:
                env_dict['DRQUEUE_TERRAINFILE'] = job['terrainfile']
            if 'composition' in job:
                env_dict['DRQUEUE_COMPOSITION'] = job['composition']
            if 'camera' in job:
                env_dict['DRQUEUE_CAMERA'] = job['camera']
            if 'resx' in job:
                env_dict['DRQUEUE_RESX'] = job['resx']
            if 'resy' in job:
                env_dict['DRQUEUE_RESY'] = job['resy']
            if 'renderpass' in job:
                env_dict['DRQUEUE_RENDERPASS'] = job['renderpass']
            if 'rendertype' in job:
                env_dict['DRQUEUE_RENDERTYPE'] = job['rendertype']
            if 'fileextension' in job:
                env_dict['DRQUEUE_FILEEXTENSION'] = job['fileextension']
            if 'stepframe' in job:
                env_dict['DRQUEUE_STEPFRAME'] = job['stepframe']
            if 'custom_bucket' in job:
                env_dict['DRQUEUE_CUSTOM_BUCKET'] = job['custom_bucket']
            if 'bucketsize' in job:
                env_dict['DRQUEUE_BUCKETSIZE'] = job['bucketsize']
            if 'custom_lod' in job:
                env_dict['DRQUEUE_CUSTOM_LOD'] = job['custom_lod']
            if 'lod' in job:
                env_dict['DRQUEUE_LOD'] = job['lod']
            if 'custom_varyaa' in job:
                env_dict['DRQUEUE_CUSTOM_VARYAA'] = job['custom_varyaa']
            if 'varyaa' in job:
                env_dict['DRQUEUE_VARYAA'] = job['varyaa']
            if 'raytrace' in job:
                env_dict['DRQUEUE_RAYTRACE'] = job['raytrace']
            if 'antialias' in job:
                env_dict['DRQUEUE_ANTIALIAS'] = job['antialias']
            if 'custom_bdepth' in job:
                env_dict['DRQUEUE_CUSTOM_BDEPTH'] = job['custom_bdepth']
            if 'bdepth' in job:
                env_dict['DRQUEUE_BDEPTH'] = job['bdepth']
            if 'custom_zdepth' in job:
                env_dict['DRQUEUE_CUSTOM_ZDEPTH'] = job['custom_zdepth']
            if 'zdepth' in job:
                env_dict['DRQUEUE_ZDEPTH'] = job['zdepth']
            if 'custom_cracks' in job:
                env_dict['DRQUEUE_CUSTOM_CRACKS'] = job['custom_cracks']
            if 'cracks' in job:
                env_dict['DRQUEUE_CRACKS'] = job['cracks']
            if 'custom_quality' in job:
                env_dict['DRQUEUE_CUSTOM_QUALITY'] = job['custom_quality']
            if 'quality' in job:
                env_dict['DRQUEUE_QUALITY'] = job['quality']
            if 'custom_qfiner' in job:
                env_dict['DRQUEUE_CUSTOM_QFINER'] = job['custom_qfiner']
            if 'qfiner' in job:
                env_dict['DRQUEUE_QFINER'] = job['qfiner']
            if 'custom_smultiplier' in job:
                env_dict['DRQUEUE_CUSTOM_SMULTIPLIER'] = job[
                    'custom_smultiplier']
            if 'smultiplier' in job:
                env_dict['DRQUEUE_SMULTIPLIER'] = job['smultiplier']
            if 'custom_mpcache' in job:
                env_dict['DRQUEUE_CUSTOM_MPCACHE'] = job['custom_mpcache']
            if 'mpcache' in job:
                env_dict['DRQUEUE_MPCACHE'] = job['mpcache']
            if 'custom_smpolygon' in job:
                env_dict['DRQUEUE_CUSTOM_SMPOLYGON'] = job['custom_smpolygon']
            if 'smpolygon' in job:
                env_dict['DRQUEUE_SMPOLYGON'] = job['smpolygon']
            if 'custom_wh' in job:
                env_dict['DRQUEUE_CUSTOM_WH'] = job['custom_wh']
            if 'custom_type' in job:
                env_dict['DRQUEUE_CUSTOM_TYPE'] = job['custom_type']
            if 'ctype' in job:
                env_dict['DRQUEUE_CTYPE'] = job['ctype']
            if 'skipframes' in job:
                env_dict['DRQUEUE_SKIPFRAMES'] = job['skipframes']
            if 'custom_command' in job:
                env_dict['DRQUEUE_CUSTOM_COMMAND'] = job['custom_command']

            # set dependencies
            dep_dict = {}
            dep_dict['job_id'] = job_id
            if ('os' in job['limits']) and (job['limits']['os'] != None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores'
                    in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name'
                    in job['limits']) and (job['limits']['pool_name'] != None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(
                DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)

            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script,
                                   env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()

        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'],
                                        job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True

    def identify_computer(self, engine_id, cache_time, timeout=15):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db_by_engine_id(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine['created_at'] + cache_time):
            print("DEBUG: Engine %i was found in DB and info is up-to-date." %
                  engine_id)
            return engine
        # store new info
        else:
            if engine != None:
                print(
                    "DEBUG: Engine %i was found in DB, but info needs to be updated."
                    % engine_id)
            else:
                print("DEBUG: Engine %i was not found in DB." % engine_id)
            # run command only on specific computer
            try:
                dview = self.ip_client[engine_id]
            except IndexError:
                print("DEBUG: Engine with id %i unknown." % engine_id)
                # delete old entry from database
                DrQueueComputer.delete_from_db_by_engine_id(engine_id)
                print("DEBUG: Engine with id %i deleted from database." %
                      engine_id)
                new_engine = None
            else:
                # run command in async mode
                dview.block = False
                command = "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer()"
                ar = dview.execute(command)
                try:
                    # try to get results & wait until timeout
                    ar.get(timeout)
                except Exception:
                    if engine != None:
                        print(
                            "DEBUG: Update request for engine %i timed out. Using old information from DB."
                            % engine_id)
                        new_engine = engine
                    else:
                        print(
                            "DEBUG: Information request for engine %i timed out."
                            % engine_id)
                        new_engine = None
                else:
                    # get computer dict from engine namespace
                    new_engine = dview['engine']
                    # set to known engine_id
                    new_engine['engine_id'] = engine_id
                    # set creation time
                    new_engine['created_at'] = int(time.time())
                    # store entry in database
                    DrQueueComputer.store_db(new_engine)
            return new_engine

    def computer_set_pools(self, computer, pool_list):
        """add computer to list of pools"""
        # convert to string
        pool_str = ','.join(pool_list)
        # update environment variable on engine
        dview = self.ip_client[computer['engine_id']]
        dview.block = True
        command = "import os\nos.environ[\"DRQUEUE_POOL\"] = \"" + pool_str + "\""
        dview.execute(command)
        # update database entry
        computer['pools'] = pool_list
        DrQueueComputer.store_db(computer)
        print("DEBUG: Engine " + str(computer['engine_id']) +
              " added to pools " + pool_str + ".")
        return computer

    def computer_get_pools(self, computer):
        """Return all pool names where computer is member."""
        return computer['pools']

    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar

    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()

    def query_job_by_id(self, job_id):
        """Query job by given id"""
        return DrQueueJob.query_db(job_id)

    def query_job_by_name(self, job_name):
        """Query job by given name"""
        return DrQueueJob.query_job_by_name(job_name)

    def query_job_tasks_left(self, job_id):
        """Query left frames of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] == None:
                left += 1
        return left

    def query_job_finish_time(self, job_id):
        """Query oldest finish time of all tasks."""
        job = self.query_job_by_id(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look if a later finish time exists
            if (task['completed'] != None) and (task['completed'] >
                                                finish_time):
                finish_time = task['completed']
        return finish_time

    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME."""
        if ('buffers' in task) and task['buffers'] != []:
            frame_nr = int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])
        else:
            frame_nr = 1
        return frame_nr

    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job. Sort by frame number."""
        task_list = self.ip_client.db_query({'header.session': str(job_id)},
                                            keys=self.all_task_query_keys)
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list

    def query_interrupted_task_list(self, job_id):
        """Query a list of interrupted tasks of certain job. Sort by frame number."""
        job = self.query_job_by_id(job_id)
        task_list = self.ip_client.db_query({'header.session': str(job_id)},
                                            keys=self.all_task_query_keys)
        interrupted_task_list = []

        for task in task_list:
            frame_nr = self.get_frame_nr(task)
            print("frame_nr: " + str(frame_nr))
            # log filename
            if job['renderer'] == "blender":
                filesearch = job['scenefile'] + str("%04d" % frame_nr) + ".???"
                found = glob.glob(filesearch)
                # file was found
                if len(found) > 0:
                    outputfile = found[0]
                    print("outputfile: " + str(outputfile))
                    filesize = os.path.getsize(outputfile)
                    print(filesize)
                    # file exists, but is empty
                    if filesize == 0:
                        interrupted_task_list.append(task)
                # file was not found
                else:
                    outputfile = None
                    print("outputfile: " + str(outputfile))
                    if (task['completed'] == None) and (task['started']
                                                        == None):
                        interrupted_task_list.append(task)
            else:
                raise ValueError("Only Blender renderer supported so far.")

        return interrupted_task_list

    def query_task(self, task_id):
        """Query a single task."""
        task = self.ip_client.db_query({'msg_id': task_id},
                                       keys=self.all_task_query_keys)[0]
        return task

    def query_computer_list(self):
        """Query a list of all computers."""
        return self.ip_client.ids

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""

        # disable job
        self.job_disable(job_id)

        tasks = self.query_task_list(job_id)
        tasks_to_stop = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_content"
                    in task) and (task["result_content"] != None) and (
                        task["result_content"]["status"] == "ok"):
                print("  finished at " + str(task["completed"]))
            else:
                # get task stats of all computers
                stats = self.ip_client.queue_status('all', True)
                # check if task is already running on an engine
                found_on_engine = False
                for key, status in list(stats.items()):
                    if ('tasks' in status) and (task['msg_id']
                                                in status['tasks']):
                        # skip tasks which are already running on an engine
                        print(
                            "  not finished yet but already queued to engine. will leave it there."
                        )
                        found_on_engine = True
                        break

                # if a task isn't already queued/running on an engine, it should be safe to abort it
                if found_on_engine == False:
                    print("  not finished yet. will abort.")
                    tasks_to_stop.append(task['msg_id'])

        if len(tasks_to_stop) > 0:
            try:
                self.ip_client.abort(tasks_to_stop)
            except Exception as e:
                print("ERROR: " + str(e))

        return True

    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not"""

        # disable job
        self.job_disable(job_id)

        tasks = self.query_task_list(job_id)
        running_engines = []
        tasks_to_stop = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key, status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            tasks_to_stop.append(task['msg_id'])
        # stop all matching tasks at once
        try:
            self.ip_client.abort(tasks_to_stop)
        except Exception as e:
            print("ERROR: " + str(e))

        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True

    def job_disable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = False
        DrQueueJob.update_db(job)
        return True

    def job_enable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = True
        DrQueueJob.update_db(job)
        return True

    def job_delete(self, job_id):
        """Delete job and all of it's tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_computer_list()
        error = False
        pending_tasks = []
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                # abort outstanding tasks which are already queued to engine
                print('aborting task ' + str(task['msg_id']))
                try:
                    self.ip_client.abort(task['msg_id'], engines)
                except Exception as e:
                    print('Error: ' + str(e))
                    error = True
            # purge all tasks which are not pending
            print('purging task ' + str(task['msg_id']))
            try:
                self.ip_client.purge_hub_results(task['msg_id'], engines)
            except Exception:
                print('Warning: ' + str(task['msg_id']) +
                      ' is pending. Try to kill job before.')
                pending_tasks.append(task)
                error = True
        # delete job if no error occured
        if error == False:
            # delete job itself
            DrQueueJob.delete_from_db(job_id)
            return True
        else:
            return False
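
An end-to-end sketch for this Client variant using a hand-written job dictionary. The field names only mirror the keys that job_run() reads; in the examples the job is stored via DrQueueJob, which in the real project would normally construct this object, so treat the plain dictionary below as an illustrative assumption:

# Hypothetical job submission with the Client above.
from DrQueue import Client

client = Client()
job = {
    'name': 'shot010_preview',
    'owner': 'artist01',
    'retries': 2,
    'created_with': 'commandline',
    'startframe': 1,
    'endframe': 100,
    'blocksize': 10,
    'scenefile': '/shared/projects/shot010/scene.blend',
    'renderer': 'blender',
    'send_email': False,
    'limits': {'depend': None, 'os': None, 'minram': 0,
               'mincores': 0, 'pool_name': None},
}
client.job_run(job)            # stores the job and queues one task per frame block
stored = client.query_job_by_name(job['name'])
print("tasks left: %i" % client.query_job_tasks_left(stored['_id']))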
Example #6
class Client:
    """DrQueue client actions"""

    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()

        # enable tracking
        self.lbview.track = True

    def job_run(self, job):
        """Create and queue tasks from job object"""

        # check job name
        if job["name"] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job["name"])
            return False

        # run job only on matching os
        os_list = self.query_engines_of_os(job["limits"]["os"])

        # run job only on matching minram
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])

        # run job only on matching mincores
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])

        # check pool members
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])

        # check limits
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)

        # save job in database
        job_id = DrQueueJob.store_db(job)

        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)

        # set owner of job
        self.ip_client.session.username = job["owner"]

        # set number of retries for each task
        self.lbview.retries = job["retries"]

        # depend on another job (its tasks)
        if ("depend" in job["limits"]) and (job["limits"]["depend"] != None):
            depend_job = self.query_job_by_name(job["limits"]["depend"])
            depend_tasks = self.query_task_list(depend_job["_id"])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task["msg_id"])
            self.lbview.after = task_ids

        # check frame numbers
        if not (job["startframe"] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal or greater than 1.")
            return False
        if not (job["endframe"] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal or greater than 1.")
            return False
        if not (job["endframe"] >= job["startframe"]):
            raise ValueError("Invalid value for endframe. Has be to equal or greater than startframe.")
            return False
        if job["endframe"] > job["startframe"]:
            if not (job["endframe"] - job["startframe"] >= job["blocksize"]):
                raise ValueError("Invalid value for blocksize. Has to be equal or lower than endframe-startframe.")
                return False
        if job["endframe"] == job["startframe"]:
            if job["blocksize"] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal 1 if endframe equals startframe.")
                return False

        task_frames = range(job["startframe"], job["endframe"] + 1, job["blocksize"])
        for x in task_frames:
            # prepare script input
            env_dict = {
                "DRQUEUE_FRAME": x,
                "DRQUEUE_BLOCKSIZE": job["blocksize"],
                "DRQUEUE_ENDFRAME": job["endframe"],
                "DRQUEUE_SCENEFILE": job["scenefile"],
                "DRQUEUE_LOGFILE": job["name"] + "-" + str(x) + "_" + str(x + job["blocksize"] - 1) + ".log",
            }

            # optional elements
            if "renderdir" in job:
                env_dict["DRQUEUE_RENDERDIR"] = job["renderdir"]
            if "projectdir" in job:
                env_dict["DRQUEUE_PROJECTDIR"] = job["projectdir"]
            if "configdir" in job:
                env_dict["DRQUEUE_CONFIGDIR"] = job["configdir"]
            if "imagefile" in job:
                env_dict["DRQUEUE_IMAGEFILE"] = job["imagefile"]
            if "precommand" in job:
                env_dict["DRQUEUE_PRECOMMAND"] = job["precommand"]
            if "renderer" in job:
                env_dict["DRQUEUE_RENDERER"] = job["renderer"]
            if "fileformat" in job:
                env_dict["DRQUEUE_FILEFORMAT"] = job["fileformat"]
            if "postcommand" in job:
                env_dict["DRQUEUE_POSTCOMMAND"] = job["postcommand"]
            if "viewcommand" in job:
                env_dict["DRQUEUE_VIEWCOMMAND"] = job["viewcommand"]
            if "worldfile" in job:
                env_dict["DRQUEUE_WORLDFILE"] = job["worldfile"]
            if "terrainfile" in job:
                env_dict["DRQUEUE_TERRAINFILE"] = job["terrainfile"]
            if "composition" in job:
                env_dict["DRQUEUE_COMPOSITION"] = job["composition"]
            if "camera" in job:
                env_dict["DRQUEUE_CAMERA"] = job["camera"]
            if "resx" in job:
                env_dict["DRQUEUE_RESX"] = job["resx"]
            if "resy" in job:
                env_dict["DRQUEUE_RESY"] = job["resy"]
            if "renderpass" in job:
                env_dict["DRQUEUE_RENDERPASS"] = job["renderpass"]
            if "rendertype" in job:
                env_dict["DRQUEUE_RENDERTYPE"] = job["rendertype"]
            if "fileextension" in job:
                env_dict["DRQUEUE_FILEEXTENSION"] = job["fileextension"]
            if "stepframe" in job:
                env_dict["DRQUEUE_STEPFRAME"] = job["stepframe"]
            if "custom_bucket" in job:
                env_dict["DRQUEUE_CUSTOM_BUCKET"] = job["custom_bucket"]
            if "bucketsize" in job:
                env_dict["DRQUEUE_BUCKETSIZE"] = job["bucketsize"]
            if "custom_lod" in job:
                env_dict["DRQUEUE_CUSTOM_LOD"] = job["custom_lod"]
            if "lod" in job:
                env_dict["DRQUEUE_LOD"] = job["lod"]
            if "custom_varyaa" in job:
                env_dict["DRQUEUE_CUSTOM_VARYAA"] = job["custom_varyaa"]
            if "varyaa" in job:
                env_dict["DRQUEUE_VARYAA"] = job["varyaa"]
            if "raytrace" in job:
                env_dict["DRQUEUE_RAYTRACE"] = job["raytrace"]
            if "antialias" in job:
                env_dict["DRQUEUE_ANTIALIAS"] = job["antialias"]
            if "custom_bdepth" in job:
                env_dict["DRQUEUE_CUSTOM_BDEPTH"] = job["custom_bdepth"]
            if "bdepth" in job:
                env_dict["DRQUEUE_BDEPTH"] = job["bdepth"]
            if "custom_zdepth" in job:
                env_dict["DRQUEUE_CUSTOM_ZDEPTH"] = job["custom_zdepth"]
            if "zdepth" in job:
                env_dict["DRQUEUE_ZDEPTH"] = job["zdepth"]
            if "custom_cracks" in job:
                env_dict["DRQUEUE_CUSTOM_CRACKS"] = job["custom_cracks"]
            if "cracks" in job:
                env_dict["DRQUEUE_CRACKS"] = job["cracks"]
            if "custom_quality" in job:
                env_dict["DRQUEUE_CUSTOM_QUALITY"] = job["custom_quality"]
            if "quality" in job:
                env_dict["DRQUEUE_QUALITY"] = job["quality"]
            if "custom_qfiner" in job:
                env_dict["DRQUEUE_CUSTOM_QFINER"] = job["custom_qfiner"]
            if "qfiner" in job:
                env_dict["DRQUEUE_QFINER"] = job["qfiner"]
            if "custom_smultiplier" in job:
                env_dict["DRQUEUE_CUSTOM_SMULTIPLIER"] = job["custom_smultiplier"]
            if "smultiplier" in job:
                env_dict["DRQUEUE_SMULTIPLIER"] = job["smultiplier"]
            if "custom_mpcache" in job:
                env_dict["DRQUEUE_CUSTOM_MPCACHE"] = job["custom_mpcache"]
            if "mpcache" in job:
                env_dict["DRQUEUE_MPCACHE"] = job["mpcache"]
            if "custom_smpolygon" in job:
                env_dict["DRQUEUE_CUSTOM_SMPOLYGON"] = job["custom_smpolygon"]
            if "smpolygon" in job:
                env_dict["DRQUEUE_SMPOLYGON"] = job["smpolygon"]
            if "custom_wh" in job:
                env_dict["DRQUEUE_CUSTOM_WH"] = job["custom_wh"]
            if "custom_type" in job:
                env_dict["DRQUEUE_CUSTOM_TYPE"] = job["custom_type"]
            if "ctype" in job:
                env_dict["DRQUEUE_CTYPE"] = job["ctype"]
            if "skipframes" in job:
                env_dict["DRQUEUE_SKIPFRAMES"] = job["skipframes"]

            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job["renderer"])
            ar = self.lbview.apply(DrQueue.run_script_with_env, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()
        return True

    def identify_computer(self, engine_id, cache_time):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine["date"] + cache_time):
            print ("DEBUG: Engine %i was found in DB" % engine_id)
        # store new info
        else:
            print ("DEBUG: Engine %i was not found in DB" % engine_id)
            # run command only on specific computer
            dview = self.ip_client[engine_id]
            dview.block = True
            dview.execute(
                "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer("
                + str(engine_id)
                + ")"
            )
            engine = dview["engine"]
            engine["date"] = int(time.time())
            DrQueueComputer.store_db(engine)
        return engine

    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar

    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()

    def query_running_job_list(self):
        """Query a list of all running jobs"""
        jobs = DrQueueJob.query_job_list()
        running_jobs = []
        for job in jobs:
            if self.query_job_tasks_left(job["_id"]) > 0:
                running_jobs.append(job)
        return running_jobs

    def query_jobname(self, task_id):
        """Query jobname from task id"""
        data = self.ip_client.db_query({"msg_id": task_id})
        job_id = data[0]["header"]["session"]
        job = DrQueueJob.query_db(job_id)
        return job.name

    def query_job(self, job_id):
        """Query job from id"""
        return DrQueueJob.query_db(job_id)

    def query_job_by_name(self, job_name):
        """Query job from name"""
        return DrQueueJob.query_job_by_name(job_name)

    def query_job_tasks_left(self, job_id):
        """Query left frames of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task["completed"] == None:
                left += 1
        return left

    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job"""
        return self.ip_client.db_query({"header.session": str(job_id)})

    def query_task(self, task_id):
        """Query a single task"""
        task = self.ip_client.db_query({"msg_id": task_id})[0]
        return task

    def query_engine_list(self):
        """Query a list of all engines"""
        return self.ip_client.ids

    def query_engines_of_pool(self, pool_name):
        """Return available engines of certain pool."""
        pool_computers = self.ip_client.ids
        if pool_name != None:
            computers = DrQueueComputerPool.query_pool_members(pool_name)
            if computers == None:
                raise ValueError('Pool "%s" does not exist!' % pool_name)
                return False
            for comp in list(pool_computers):
                if not comp in computers:
                    pool_computers.remove(comp)
            if pool_computers == []:
                raise ValueError("No computer of pool %s is available!" % pool_name)
                return False
            print ("DEBUG: matching pool: " + pool_name)
            print (pool_computers)
        return pool_computers

    def query_engines_of_os(self, os_name):
        """Return only engines running certain OS."""
        # run job only on matching os
        matching_os = self.ip_client.ids
        if os_name != None:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                # os string has to contain os_name
                if not os_name in engine["os"]:
                    matching_os.remove(engine_id)
            print ("DEBUG: matching os: " + os_name)
            print (matching_os)
        return matching_os

    def query_engines_with_minram(self, minram):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching minram
        matching_minram = self.ip_client.ids
        if minram > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["memory"] < minram:
                    matching_minram.remove(engine_id)
            print ("DEBUG: matching minram: " + str(minram))
            print (matching_minram)
        return matching_minram

    def query_engines_with_mincores(self, mincores):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching mincores
        matching_mincores = self.ip_client.ids
        if mincores > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["ncorescpu"] * engine["ncpus"] < mincores:
                    matching_mincores.remove(engine_id)
            print ("DEBUG: matching mincores: " + str(mincores))
            print (matching_mincores)
        return matching_mincores

    def match_all_limits(self, os_list, minram_list, mincores_list, pool_list):
        """Match all limits for job."""
        tmp_list = []
        # build list with all list members
        tmp_list.extend(os_list)
        tmp_list.extend(minram_list)
        tmp_list.extend(mincores_list)
        tmp_list.extend(pool_list)
        # make entries unique
        tmp_list = set(tmp_list)
        tmp_list = list(tmp_list)
        matching_limits = []
        for entry in tmp_list:
            # look if entry is in all lists
            if (entry in os_list) and (entry in minram_list) and (entry in mincores_list) and (entry in pool_list):
                matching_limits.append(entry)
            else:
                print ("DEBUG: %i isn't matching limits" % entry)
        print ("DEBUG: matching limits:")
        print (matching_limits)
        if len(matching_limits) == 0:
            message = "No engine meets the requirements."
            print (message)
            raise Exception(message)
        elif len(matching_limits) > 0:
            # only run on matching engines
            self.lbview = self.ip_client.load_balanced_view(matching_limits)
        else:
            self.lbview = self.ip_client.load_balanced_view()

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        tasks = self.query_task_list(job_id)
        # abort all queued tasks
        for task in tasks:
            self.ip_client.abort(task["msg_id"])
        return True

    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not"""
        tasks = self.query_task_list(job_id)
        running_engines = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status("all", True)
            # check if task is already running on an engine
            for key, status in stats.items():
                if ("tasks" in status) and (task["msg_id"] in status["tasks"]):
                    print "found"
                    running_engines.append(key)
            self.ip_client.abort(task["msg_id"])
        # restart all engines which still run a task
        running_engines = set(running_engines)
        print(list(running_engines))
        # for engine_id in running_engines:
        #    self.ip_client(engine_id)
        return True

    def job_delete(self, job_id):
        """Delete job and all of it's tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_engine_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task["msg_id"])
            self.ip_client.purge_results(task["msg_id"])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True

    def task_continue(self, task_id):
        """Continue aborted or failed task"""
        task = self.query_task(task_id)
        # check if action is needed
        if (task["completed"] != None) and (
            (task["result_header"]["status"] == "error") or (task["result_header"]["status"] == "aborted")
        ):
            self.task_requeue(task_id)
        return True

    def task_requeue(self, task_id):
        """Requeue task"""
        self.ip_client.resubmit(task_id)
        print "requeuing %s" % task_id
        return True

    def job_continue(self, job_id):
        """Continue stopped job and all of it's tasks"""
        job = self.query_job(job_id)
        # run job only on matching os
        os_list = self.query_engines_of_os(job["limits"]["os"])
        # run job only on matching minram
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        # run job only on matching mincores
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        # check pool members
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        # check limits
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)
        tasks = self.query_task_list(job_id)
        # continue tasks
        for task in tasks:
            self.task_continue(task["msg_id"])
        return True

    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job(job_id)
        # run job only on matching os
        os_list = self.query_engines_of_os(job["limits"]["os"])
        # run job only on matching minram
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        # run job only on matching mincores
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        # check pool members
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        # check limits
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)
        tasks = self.query_task_list(job_id)
        # rerun tasks
        for task in tasks:
            self.task_requeue(task["msg_id"])
        return True

    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task["completed"] == None:
                status_pending += 1
            else:
                if "result_header" in task.keys():
                    result_header = task["result_header"]
                    # look for done tasks
                    if ("status" in result_header.keys()) and (result_header["status"] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ("status" in result_header.keys()) and (result_header["status"] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ("status" in result_header.keys()) and (result_header["status"] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ("status" in result_header.keys()) and (result_header["status"] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status

    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db(engine_id)
        # shutdown computer
        self.ip_client.shutdown(engine_id)
        return True

    def engine_restart(self, engine_id):
        """Restart a specific engine"""
        self.ip_client.shutdown(engine_id, True, False, True)
        return True
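
match_all_limits() above keeps only the engines that appear in every per-limit candidate list. The same intersection can be written more compactly with sets; a standalone sketch of just that step (not part of the example):

# Standalone sketch of the limit-matching step: keep only engine ids
# that satisfy every individual limit (OS, RAM, cores, pool).
def matching_engines(os_list, minram_list, mincores_list, pool_list):
    matching = set(os_list) & set(minram_list) & set(mincores_list) & set(pool_list)
    return sorted(matching)

print(matching_engines([0, 1, 2, 3], [1, 2, 3], [0, 1, 3], [1, 3]))  # -> [1, 3]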
Example #7
class Grid(object):
    '''
    Responsible for running the QuanTrade runtime and communicating with drones

    It forks:
        - log.io for logs aggregation
        - dashboards for trading purpose
    And dynamically:
        - Remote rest_services for database wrappers
        - Drones to process remote calls
        - Glances servers and clients for resources monitoring

    It basically waits for new tasks to pop up (i.e. remote engines to appear),
    and forks trading processes on them according to their associated
    configuration. It can also create remote/local drones itself for classic
    cluster use.

    The object can be configured through ~/.quantrade/default.json.
    '''
    def __init__(self, configuration_path=CONFIG_PATH):
        log.info('Running Grid master, stop it with CTRL-C')

        # CTRL-C interception
        SignalManager()

        # Setup object configuration
        self._configure(configuration_path)

        # Team_dashboard web graphs
        self.dashboard = Dashboard()
        # Logs monitoring
        self.logio = LogIO(self.configuration['nodes'])

        # Nodes are physical machines of the cluster
        self.nodes = {
            ip: Node(ip, self.configuration['monitored'],
                     self.configuration['restful'])
            for ip in self.configuration['nodes']
        }

        self.processed_engines = []

    def _configure(self, configuration_path):
        '''
        Read and set configuration
        '''
        self.configuration = json.load(open(configuration_path, 'r'))['grid']
        #http://docs.fabfile.org/en/1.4.3/usage/env.html#full-list-of-env-vars
        #env.forward_agent = True
        #env.key_filename = [""]
        env.user = self.configuration['name']
        env.password = self.configuration['password']
        env.hosts = self.configuration['nodes']
        env.roledefs = {
            'local': ['127.0.0.1'],
            'controller': self.configuration['controller'],
            'nodes': self.configuration['nodes']
        }

    def deploy(self):
        '''
        Set up local ipcontroller
        '''
        log.info('Deploying grid trade-system')
        log.info('Activating local ipcontroller')
        execute(fab.activate_controller)

        # Main interface to drones
        self.engines = Client()

    def _is_idle(self, state):
        '''
        Check if there are pending tasks to do
        '''
        if 'queue' in state:
            return not state['queue']

        # Else, no information to answer
        return None

    def detect_drones(self):
        new_engines = []
        engines_status = self.engines.queue_status()
        #NOTE what is the use of status['unassigned'] ?
        for key, state in engines_status.iteritems():
            if key == 'unassigned':
                continue
            if (self._is_idle(state) and key not in self.processed_engines):
                self.processed_engines.append(key)
                new_engines.append(self.engines[key])

        self._dispatch_engines(new_engines)
        return len(new_engines)

    def _dispatch_engines(self, engines):
        for engine in engines:
            ip = engine.apply_sync(get_local_ip)
            log.info('New engine detected on {}'.format(ip))
            if ip not in self.nodes:
                log.info('New node connected')
                self.nodes[ip] = Node(ip, self.configuration['monitored'],
                                      self.configuration['restful'])

            self.nodes[ip].register_drone(engine.targets, engine)

            drone_name = self.nodes[ip].drones[engine.targets].name
            self.dashboard.add_description(remote_ip=ip, portfolio=drone_name)
            self.logio.add_description(drone_name, remote_ip=ip)

            log.info('Drone registered')

    def process(self, function, node_ip=None, drone_id=None):
        '''
        Process pending tasks on available and, optionally, explicitly provided drones
        '''
        processed_nodes = self.nodes.values()
        for node in processed_nodes:
            processed_drones = node.drones.values()
            #FIXME use self.engines.shutdown([1, 3]) instead of
            #non-functional drone.shutdown
            node.inspect_armada()
            for drone in processed_drones:
                drone.run(function)

    def fireup_dashboards(self):
        if self.configuration['logserver']:
            self.logio.build()
            self.logio.run()
            log.notice('Log.io available at http://192.168.0.12:28778')

        if self.configuration['dashboard']:
            self.dashboard.build()
            self.dashboard.run(public_ip=False)
            log.notice('Dashboard available at http://192.168.0.12:4000')
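
_configure() expects a 'grid' section in ~/.quantrade/default.json. The example does not show that file, so the snippet below is only a guess at its shape, limited to the keys that __init__(), _configure() and fireup_dashboards() actually read, with placeholder values:

# Hypothetical ~/.quantrade/default.json written from Python; values are placeholders.
import json
import os

config = {
    "grid": {
        "name": "quantrade",                       # becomes fabric's env.user
        "password": "secret",                      # becomes fabric's env.password
        "nodes": ["192.168.0.12", "192.168.0.17"],
        "controller": ["192.168.0.12"],
        "monitored": True,                         # passed to each Node
        "restful": True,                           # passed to each Node
        "logserver": True,                         # fireup_dashboards(): start log.io
        "dashboard": True,                         # fireup_dashboards(): start the dashboard
    }
}

with open(os.path.expanduser("~/.quantrade/default.json"), "w") as fd:
    json.dump(config, fd, indent=4)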
Example #8
class Grid(object):
    '''
    Responsible for running the QuanTrade runtime and communicating with drones

    It forks:
        - log.io for logs aggregation
        - dashboards for trading purpose
    And dynamically:
        - Remote rest_services for database wrappers
        - Drones to process remote calls
        - Glances servers and clients for resources monitoring

    It basically waits for new tasks to pop up (i.e. remote engines to appear),
    and forks trading processes on them according to their associated
    configuration. It can also create remote/local drones itself for classic
    cluster use.

    The object can be configured through ~/.quantrade/default.json.
    '''

    def __init__(self, configuration_path=CONFIG_PATH):
        log.info('Running Grid master, stop it with CTRL-C')

        # CTRL-C interception
        SignalManager()

        # Setup object configuration
        self._configure(configuration_path)

        # Team_dashboard web graphs
        self.dashboard = Dashboard()
        # Logs monitoring
        self.logio = LogIO(self.configuration['nodes'])

        # Nodes are physical machines of the cluster
        self.nodes = {ip: Node(ip, self.configuration['monitored'],
                               self.configuration['restful'])
                      for ip in self.configuration['nodes']}

        self.processed_engines = []

    def _configure(self, configuration_path):
        '''
        Read and set configuration
        '''
        self.configuration = json.load(open(configuration_path, 'r'))['grid']
        #http://docs.fabfile.org/en/1.4.3/usage/env.html#full-list-of-env-vars
        #env.forward_agent = True
        #env.key_filename = [""]
        env.user = self.configuration['name']
        env.password = self.configuration['password']
        env.hosts = self.configuration['nodes']
        env.roledefs = {
            'local': ['127.0.0.1'],
            'controller': self.configuration['controller'],
            'nodes': self.configuration['nodes']
        }

    def deploy(self):
        '''
        Set up local ipcontroller
        '''
        log.info('Deploying grid trade-system')
        log.info('Activating local ipcontroller')
        execute(fab.activate_controller)

        # Main interface to drones
        self.engines = Client()

    def _is_idle(self, state):
        '''
        Check if there are pending tasks to do
        '''
        if 'queue' in state:
            return not state['queue']

        # Else, no information to answer
        return None

    def detect_drones(self):
        new_engines = []
        engines_status = self.engines.queue_status()
        #NOTE what is the use of status['unassigned'] ?
        for key, state in engines_status.iteritems():
            if key == 'unassigned':
                continue
            if (self._is_idle(state)
                    and key not in self.processed_engines):
                self.processed_engines.append(key)
                new_engines.append(self.engines[key])

        self._dispatch_engines(new_engines)
        return len(new_engines)

    def _dispatch_engines(self, engines):
        for engine in engines:
            ip = engine.apply_sync(get_local_ip)
            log.info('New engine detected on {}'.format(ip))
            if ip not in self.nodes:
                log.info('New node connected')
                self.nodes[ip] = Node(ip, self.configuration['monitored'],
                                      self.configuration['restful'])

            self.nodes[ip].register_drone(engine.targets, engine)

            drone_name = self.nodes[ip].drones[engine.targets].name
            self.dashboard.add_description(remote_ip=ip, portfolio=drone_name)
            self.logio.add_description(drone_name, remote_ip=ip)

            log.info('Drone registered')

    def process(self, function, node_ip=None, drone_id=None):
        '''
        Process pending tasks on available and, optionally, explicitly provided drones
        '''
        processed_nodes = self.nodes.values()
        for node in processed_nodes:
            processed_drones = node.drones.values()
            #FIXME use self.engines.shutdown([1, 3]) instead of
            #non-functional drone.shutdown
            node.inspect_armada()
            for drone in processed_drones:
                drone.run(function)

    def fireup_dashboards(self):
        if self.configuration['logserver']:
            self.logio.build()
            self.logio.run()
            log.notice('Log.io available at http://192.168.0.12:28778')

        if self.configuration['dashboard']:
            self.dashboard.build()
            self.dashboard.run(public_ip=False)
            log.notice('Dashboard available at http://192.168.0.12:4000')
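
Neither Grid example shows how the object is driven once constructed. A hypothetical driver loop, with the polling interval and the trade_job callable as assumptions:

# Hypothetical driver loop for the Grid above.
import time

def trade_job():
    # placeholder for the function each drone should run
    pass

grid = Grid()
grid.deploy()              # start the local ipcontroller and connect the Client
grid.fireup_dashboards()   # optionally start log.io and the dashboard

while True:
    if grid.detect_drones():        # register any newly appeared engines
        grid.process(trade_job)     # run the job on every known drone
    time.sleep(10)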