class Client():
    """DrQueue client actions.

    Wraps an IPython parallel client: jobs are submitted as sets of
    frame tasks to a load-balanced view, and job/task/computer metadata
    is read from and written to the DrQueue database helpers
    (DrQueueJob / DrQueueComputer).
    """

    # Optional job keys; each maps to the environment variable
    # 'DRQUEUE_' + key.upper() when present in the job dict.
    OPTIONAL_JOB_KEYS = (
        'renderdir', 'projectdir', 'configdir', 'imagefile', 'precommand',
        'renderer', 'fileformat', 'postcommand', 'viewcommand', 'worldfile',
        'terrainfile', 'composition', 'camera', 'resx', 'resy', 'renderpass',
        'rendertype', 'fileextension', 'stepframe', 'custom_bucket',
        'bucketsize', 'custom_lod', 'lod', 'custom_varyaa', 'varyaa',
        'raytrace', 'antialias', 'custom_bdepth', 'bdepth', 'custom_zdepth',
        'zdepth', 'custom_cracks', 'cracks', 'custom_quality', 'quality',
        'custom_qfiner', 'qfiner', 'custom_smultiplier', 'smultiplier',
        'custom_mpcache', 'mpcache', 'custom_smpolygon', 'smpolygon',
        'custom_wh', 'custom_type', 'ctype', 'skipframes', 'custom_command',
    )

    def __init__(self):
        """Connect to the IPython controller and prepare a tracked
        load-balanced view.

        Raises:
            Exception: if the IPython controller cannot be reached.
        """
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()
        # enable tracking so wait_for_send() can be used on results
        self.lbview.track = True
        # list of all available query keys for task DB queries
        self.all_task_query_keys = ['msg_id', 'header', 'content', 'buffers',
                                    'submitted', 'client_uuid', 'engine_uuid',
                                    'started', 'completed', 'resubmitted',
                                    'result_header', 'result_content',
                                    'result_buffers', 'queue', 'pyin',
                                    'pyout', 'pyerr', 'stdout', 'stderr']

    def job_run(self, job):
        """Create and queue tasks from job object.

        One task is submitted per block of frames (startframe..endframe,
        stepping by blocksize); an optional email task is appended last.
        Returns True.
        """
        # job_id from db is used as session name
        self.ip_client.session.session = str(job['_id'])
        # set owner of job
        self.ip_client.session.username = job['owner']
        # set number of retries for each task
        self.lbview.retries = job['retries']
        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] is not None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            self.lbview.after = [task['msg_id'] for task in depend_tasks]

        task_frames = list(range(job['startframe'], job['endframe'] + 1,
                                 job['blocksize']))
        ar = None
        for x in task_frames:
            # prepare script input
            env_dict = {
                'DRQUEUE_FRAME': x,
                'DRQUEUE_BLOCKSIZE': job['blocksize'],
                'DRQUEUE_ENDFRAME': job['endframe'],
                'DRQUEUE_SCENEFILE': job['scenefile'],
            }
            # log filename
            frame_range = str(x) + "_" + str(x + job['blocksize'] - 1)
            if job['created_with'] == "DrQueueOnRails":
                # take job directory name
                env_dict['DRQUEUE_LOGFILE'] = job['scenefile'].split("/")[-2] \
                    + "-" + frame_range + ".log"
            else:
                # take job name
                env_dict['DRQUEUE_LOGFILE'] = job['name'] + "-" + frame_range \
                    + ".log"
            # optional elements: job key -> DRQUEUE_<KEY> env variable
            for key in self.OPTIONAL_JOB_KEYS:
                if key in job:
                    env_dict['DRQUEUE_' + key.upper()] = job[key]

            # set dependencies for engine selection
            dep_dict = {'job_id': str(job['_id'])}
            limits = job['limits']
            if ('os' in limits) and (limits['os'] is not None):
                dep_dict['os_name'] = limits['os']
            if ('minram' in limits) and (limits['minram'] > 0):
                dep_dict['minram'] = limits['minram']
            if ('mincores' in limits) and (limits['mincores'] > 0):
                dep_dict['mincores'] = limits['mincores']
            if ('pool_name' in limits) and (limits['pool_name'] is not None):
                dep_dict['pool_name'] = limits['pool_name']
            run_script_with_env_and_deps = dependent(
                DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)

            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps,
                                   render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()

        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'],
                                        job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True

    def identify_computer(self, engine_id, cache_time, timeout=15):
        """Gather information about computer.

        Returns the cached DB entry if younger than cache_time seconds;
        otherwise queries the engine itself (waiting up to timeout
        seconds) and stores the fresh info. Returns None if the engine
        is unknown and no usable info exists.
        """
        # look if engine info is already stored
        engine = DrQueueComputer.query_db_by_engine_id(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine is not None) and (now <= engine['created_at'] + cache_time):
            log.debug("Engine %i was found in DB and info is up-to-date." % engine_id)
            return engine
        # info missing or stale: fetch from the engine
        if engine is not None:
            log.debug("Engine %i was found in DB, but info needs to be updated." % engine_id)
        else:
            log.debug("Engine %i was not found in DB." % engine_id)
        # run command only on specific computer
        try:
            dview = self.ip_client[engine_id]
        except IndexError:
            log.debug("Engine with id %i unknown." % engine_id)
            # delete old entry from database
            DrQueueComputer.delete_from_db_by_engine_id(engine_id)
            log.debug("Engine with id %i deleted from database." % engine_id)
            new_engine = None
        else:
            # run command in async mode
            dview.block = False
            command = "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer()"
            ar = dview.execute(command)
            try:
                # try to get results & wait until timeout
                ar.get(timeout)
            except Exception:
                if engine is not None:
                    log.debug("Update request for engine %i timed out. Using old information from DB." % engine_id)
                    new_engine = engine
                else:
                    log.debug("Information request for engine %i timed out." % engine_id)
                    new_engine = None
            else:
                # get computer dict from engine namespace
                new_engine = dview['engine']
                # set to known engine_id
                new_engine['engine_id'] = engine_id
                # set creation time
                new_engine['created_at'] = int(time.time())
                # store entry in database
                DrQueueComputer.store_db(new_engine)
        return new_engine

    def computer_set_pools(self, computer, pool_list):
        """Add computer to list of pools.

        Updates the DRQUEUE_POOL environment variable on the engine and
        the DB entry; returns the updated computer dict.
        """
        # convert to string
        pool_str = ','.join(pool_list)
        # update environment variable on engine
        dview = self.ip_client[computer['engine_id']]
        dview.block = True
        command = "import os\nos.environ[\"DRQUEUE_POOL\"] = \"" + pool_str + "\""
        dview.execute(command)
        # update database entry
        computer['pools'] = pool_list
        DrQueueComputer.store_db(computer)
        log.debug("Engine " + str(computer['engine_id']) + " added to pools " + pool_str + ".")
        return computer

    def computer_get_pools(self, computer):
        """Return all pool names where computer is member."""
        return computer['pools']

    def task_wait(self, task_id):
        """Wait for task to finish; returns its AsyncResult."""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar

    def query_job_list(self):
        """Query a list of all jobs."""
        return DrQueueJob.query_job_list()

    def query_job_by_id(self, job_id):
        """Query job by given id."""
        return DrQueueJob.query_db(job_id)

    def query_job_by_name(self, job_name):
        """Query job by given name."""
        return DrQueueJob.query_job_by_name(job_name)

    def query_job_tasks_left(self, job_id):
        """Return the number of not-yet-completed tasks of a job."""
        left = 0
        for task in self.query_task_list(job_id):
            if task['completed'] is None:
                left += 1
        return left

    def query_job_finish_time(self, job_id):
        """Query oldest finish time of all tasks.

        Starts from the job's requeue time (if set) or submit time and
        advances it past every completed task's finish time.
        """
        job = self.query_job_by_id(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        for task in self.query_task_list(job_id):
            # look if older finish time exists
            if (task['completed'] is not None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time

    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME from a task's pickled env
        buffer; defaults to 1 when no buffers are present."""
        if ('buffers' in task) and task['buffers'] != []:
            # NOTE(review): buffer index 3 assumed to hold the pickled
            # env dict submitted by job_run
            return int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])
        return 1

    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job. Sort by frame number."""
        task_list = self.ip_client.db_query({'header.session': str(job_id)},
                                            keys=self.all_task_query_keys)
        return sorted(task_list, key=self.get_frame_nr)

    def query_interrupted_task_list(self, job_id):
        """Query a list of interrupted tasks of certain job. Sort by frame number.

        A task counts as interrupted when its output file exists but is
        empty, or when no output exists and the task never started.
        Raises ValueError for renderers other than Blender.
        """
        job = self.query_job_by_id(job_id)
        task_list = self.ip_client.db_query({'header.session': str(job_id)},
                                            keys=self.all_task_query_keys)
        interrupted_task_list = []
        for task in task_list:
            frame_nr = self.get_frame_nr(task)
            print("frame_nr: " + str(frame_nr))
            if job['renderer'] == "blender":
                # Blender writes <scenefile><frame, zero-padded>.<ext>
                filesearch = job['scenefile'] + str("%04d" % frame_nr) + ".???"
                found = glob.glob(filesearch)
                # file was found
                if len(found) > 0:
                    outputfile = found[0]
                    print("outputfile: " + str(outputfile))
                    filesize = os.path.getsize(outputfile)
                    print(filesize)
                    # file exists, but is empty
                    if filesize == 0:
                        interrupted_task_list.append(task)
                # file was not found
                else:
                    outputfile = None
                    print("outputfile: " + str(outputfile))
                    if (task['completed'] is None) and (task['started'] is None):
                        interrupted_task_list.append(task)
            else:
                raise ValueError("Only Blender renderer supported so far.")
        return interrupted_task_list

    def query_task(self, task_id):
        """Query a single task."""
        return self.ip_client.db_query({'msg_id': task_id},
                                       keys=self.all_task_query_keys)[0]

    def query_computer_list(self):
        """Query a list of all computers (engine ids)."""
        return self.ip_client.ids

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running."""
        # disable job
        self.job_disable(job_id)
        tasks = self.query_task_list(job_id)
        tasks_to_stop = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_content" in task) and (task["result_content"] is not None) \
                    and (task["result_content"]["status"] == "ok"):
                print(" finished at " + str(task["completed"]))
            else:
                # get task stats of all computers
                stats = self.ip_client.queue_status('all', True)
                # check if task is already running on an engine
                found_on_engine = False
                for key, status in list(stats.items()):
                    if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                        # skip tasks which are already running on an engine
                        print(" not finished yet but already queued to engine. will leave it there.")
                        found_on_engine = True
                        break
                # if a task isn't already queued/running on an engine,
                # it should be safe to abort it
                if found_on_engine == False:
                    print(" not finished yet. will abort.")
                    tasks_to_stop.append(task['msg_id'])
        if len(tasks_to_stop) > 0:
            try:
                self.ip_client.abort(tasks_to_stop)
            except Exception as e:
                print("ERROR: " + str(e))
        return True

    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not."""
        # disable job
        self.job_disable(job_id)
        tasks = self.query_task_list(job_id)
        running_engines = []
        tasks_to_stop = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key, status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            tasks_to_stop.append(task['msg_id'])
        # stop all matching tasks at once
        try:
            self.ip_client.abort(tasks_to_stop)
        except Exception as e:
            print("ERROR: " + str(e))
        # stop all engines which still run a task;
        # the slave wrapper will restart the engine
        for engine_id in set(running_engines):
            self.engine_stop(engine_id)
        return True

    def job_disable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = False
        DrQueueJob.update_db(job)
        return True

    def job_enable(self, job_id):
        """Enable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = True
        DrQueueJob.update_db(job)
        return True

    def job_delete(self, job_id):
        """Delete job and all of it's tasks.

        Returns True on success, False if any task could not be aborted
        or purged (the job is then left in the database).
        """
        tasks = self.query_task_list(job_id)
        engines = self.query_computer_list()
        error = False
        pending_tasks = []
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                # abort outstanding tasks which are already queued to engine
                print('aborting task ' + str(task['msg_id']))
                try:
                    self.ip_client.abort(task['msg_id'], engines)
                except Exception as e:
                    print('Error: ' + str(e))
                    error = True
            # purge all tasks which are not pending
            print('purging task ' + str(task['msg_id']))
            try:
                self.ip_client.purge_hub_results(task['msg_id'], engines)
            except Exception:
                print('Warning: ' + str(task['msg_id']) + ' is pending. Try to kill job before.')
                pending_tasks.append(task)
                error = True
        # delete job if no error occured
        if error == False:
            # delete job itself
            DrQueueJob.delete_from_db(job_id)
            return True
        else:
            return False
# print n_jobs, n_executed_jobs, rc = Client(profile=profile) n_clusters = len(rc) if n_executed_jobs == 0: n_executed_jobs = n_jobs elif n_executed_jobs < n_clusters: n_jobs = n_executed_jobs if n_jobs >= n_clusters: dview = rc[:] elif n_jobs == -1: dview = rc[:] elif n_jobs < n_clusters: dview = rc[list(np.random.permutation(n_clusters)[:n_executed_jobs])] # A = dview.queue_status() # print A.keys() return dview if __name__ == "__main__": rc = Client(profile='net') A = rc.queue_status() for ii in range(len(rc)): print A[ii] # dview = random_rc('net', -1, 10) # A = dview.queue_status() # print len(dview) # for ii in A.keys(): # print A[ii]
class ParallelCache(object):
    """Map a function over sequences, locally or on an IPython cluster,
    optionally caching results on disk via a joblib-style Memory.

    Parameters
    ----------
    cluster_profile : str | None
        IPython profile to connect to. If None, jobs run locally.
    cachedir : str | None
        Directory for the Memory cache. If None, no caching is done.
    purge : bool
        If True, purge task results from the hub after retrieval.
    idle_timeout : float | None
        Seconds after which idle engines are shut down (None: never).
    shutdown : bool
        If True, shut down idle engines after a cluster map() finishes.
    retries : int
        How often a failed task is re-submitted before giving up.
    poll_interval : float
        Seconds to sleep between polls for finished tasks.
    verbose : int
        Verbosity level (0 silences all progress output).
    """

    def __init__(self, cluster_profile=None, cachedir=None, purge=False,
                 idle_timeout=None, shutdown=False, retries=3,
                 poll_interval=10, verbose=5, **kwargs):
        self._purge = purge
        self._idle_timeout = idle_timeout
        self._shutdown = shutdown
        self._retries = retries
        self._poll_interval = poll_interval
        self._verbose = verbose
        self._execution_times = None
        if cluster_profile is not None:
            self._ip_client = Client(profile=cluster_profile, **kwargs)
        else:
            self._ip_client = None
        if cachedir is not None:
            self._memory = Memory(cachedir=cachedir, verbose=verbose)
        else:
            self._memory = None

    def _check_sequences(self, sequences):
        """Validate that all sized sequences share one length.

        Returns (n_jobs, my_seqs); scalars (no len()) are wrapped in
        itertools.repeat so they can be zipped with the real sequences.
        Raises ValueError on a length mismatch.
        """
        n_jobs = None
        my_seqs = []
        for ii, seq in enumerate(sequences):
            try:
                this_n_elems = len(seq)
                if n_jobs is None:
                    n_jobs = this_n_elems
                if this_n_elems != n_jobs:
                    raise ValueError(
                        'All sequences must have the same length, '
                        'sequence at position %d has length %d'
                        % (ii + 1, this_n_elems))
                my_seqs.append(seq)
            except TypeError:
                # we allow passing ints etc, convert them to a sequence
                my_seqs.append(repeat(seq))
        return n_jobs, my_seqs

    def _check_idle_engines(self, idle_times):
        """Shut down engines that were idle longer than idle_timeout.

        idle_times maps engine id -> time it was first seen idle; it is
        updated in place. Returns the current time so the caller can
        refresh its last-check timestamp.
        """
        now = time.time()
        queue = self._ip_client.queue_status()
        shutdown_eids = []
        for eid in self._ip_client.ids:
            if eid not in queue:
                continue
            if queue[eid]['queue'] + queue[eid]['tasks'] == 0:
                # engine is idle
                idle_time = idle_times.get(eid, None)
                if idle_time is None:
                    # mark engine as idle
                    idle_times[eid] = now
                    continue
                if now - idle_time > self._idle_timeout:
                    # shut down engine
                    shutdown_eids.append(eid)
            elif eid in idle_times:
                # engine has started running again
                del idle_times[eid]
        if len(shutdown_eids) > 0:
            if self._verbose > 0:
                print('Shuting-down engines: ', shutdown_eids)
            dv = self._ip_client.direct_view(shutdown_eids)
            dv.shutdown()
            for eid in shutdown_eids:
                del idle_times[eid]
        return now

    def map(self, f, *sequences, **kwargs):
        """Apply f element-wise over the zipped sequences.

        Returns the list of results. Depending on configuration the
        calls run locally or on the cluster, with or without disk
        caching. Raises RuntimeError when tasks fail repeatedly.
        """
        n_jobs, my_seqs = self._check_sequences(sequences)
        t_start = time.time()
        if self._ip_client is None:
            # run everything in this process, optionally through the cache
            if self._verbose >= 1:
                tmp = 'without' if self._memory is None else 'with'
                print_('Running %d jobs locally %s caching..' % (n_jobs, tmp))
            my_fun = f if self._memory is None else self._memory.cache(f)
            out = [my_fun(*this_args, **kwargs)
                   for this_args in zip(*my_seqs)]
        elif self._memory is None:
            # cluster without caching
            if self._verbose >= 1:
                print('Running %d jobs on cluster without caching..' % n_jobs)
            out = [None] * n_jobs
            lbview = self._ip_client.load_balanced_view()
            # materialize argument tuples once (zip() is a one-shot
            # iterator on Python 3); also needed for retries below
            arg_sets = list(zip(*my_seqs))
            tasks = [lbview.apply(f, *this_args, **kwargs)
                     for this_args in arg_sets]
            # wait for tasks to complete
            result_retrieved = [False] * len(tasks)
            execution_times = [None] * len(tasks)
            # dtype=int: the np.int alias was removed in NumPy >= 1.24
            retry_no = np.zeros(len(tasks), dtype=int)
            last_print = 0
            last_idle_check = time.time()
            idle_times = {}
            while True:
                for ii, task in enumerate(tasks):
                    if not result_retrieved[ii] and task.ready():
                        if task.successful():
                            out[ii] = task.get()
                            execution_times[ii] = task.serial_time
                            result_retrieved[ii] = True
                        elif retry_no[ii] < self._retries:
                            # task failed for some reason, re-run it
                            if self._verbose > 3:
                                print('\nTask %d failed, re-running (%d / %d)'
                                      % (ii, retry_no[ii] + 1, self._retries))
                            tasks[ii] = lbview.apply(f, *arg_sets[ii], **kwargs)
                            retry_no[ii] += 1
                        else:
                            msg = ('\nTask %d failed %d times. Stopping'
                                   % (ii, self._retries + 1))
                            print(msg)
                            # this will throw an exception
                            task.get()
                            raise RuntimeError(msg)
                        if self._purge:
                            _purge_results(self._ip_client, task)
                n_completed = int(np.sum(result_retrieved))
                progress = n_completed / float(n_jobs)
                # print progress in 10% increments
                this_print = int(np.floor(progress * 10))
                if self._verbose >= 1 and this_print != last_print:
                    print_(' %d%%' % (100 * progress), end='')
                    last_print = this_print
                if n_completed == n_jobs:
                    # we are done!
                    print_('')  # newline
                    break
                if (self._idle_timeout is not None
                        and time.time() > last_idle_check + 30):
                    last_idle_check = self._check_idle_engines(idle_times)
                time.sleep(self._poll_interval)
            self._execution_times = execution_times
            if self._shutdown:
                self._shutdown_cluster()
        elif self._memory is not None:
            # cluster with caching: only submit what is not cached yet
            if self._verbose >= 1:
                print('Running %d jobs on cluster with caching..' % n_jobs)
            f_cache = self._memory.cache(f)
            lbview = None
            out = [None] * n_jobs
            execution_times = [None] * n_jobs
            task_info = list()
            n_cache = 0
            for ii, this_args in enumerate(zip(*my_seqs)):
                # get the cache directory
                out_dir, _ = f_cache.get_output_dir(*this_args, **kwargs)
                if op.exists(op.join(out_dir, 'output.pkl')):
                    # cache hit: no task needed
                    out[ii] = f_cache.load_output(out_dir)
                    n_cache += 1
                    continue
                if lbview is None:
                    lbview = self._ip_client.load_balanced_view()
                task = lbview.apply(f, *this_args, **kwargs)
                task_info.append(dict(task=task, idx=ii, args=this_args))
            if self._verbose >= 1:
                print_('Loaded %d results from cache' % n_cache)
            # wait for tasks to complete
            last_print = 0
            last_idle_check = time.time()
            idle_times = {}
            result_retrieved = [False] * len(task_info)
            retry_no = np.zeros(len(task_info), dtype=int)
            failed_tasks = []
            while len(task_info) > 0:
                for ii, ti in enumerate(task_info):
                    if not result_retrieved[ii] and ti['task'].ready():
                        task = ti['task']
                        if task.successful():
                            this_out = task.get()
                            # cache the input and output
                            out_dir, _ = f_cache.get_output_dir(*ti['args'],
                                                                **kwargs)
                            f_cache._persist_output(this_out, out_dir)
                            f_cache._persist_input(out_dir, *ti['args'],
                                                   **kwargs)
                            # insert result into output
                            out[ti['idx']] = this_out
                            execution_times[ti['idx']] = task.serial_time
                            result_retrieved[ii] = True
                        elif retry_no[ii] < self._retries:
                            if self._verbose > 3:
                                print('\nTask %d failed, re-running (%d / %d)'
                                      % (ii, retry_no[ii] + 1, self._retries))
                            ti['task'] = lbview.apply(f, *ti['args'], **kwargs)
                            retry_no[ii] += 1
                        else:
                            # task failed too many times, mark it as done
                            # but keep running
                            if self._verbose >= 1:
                                print('\nTask %d failed %d times.'
                                      % (ii, self._retries + 1))
                            failed_tasks.append(task)
                            result_retrieved[ii] = True
                        if self._purge:
                            _purge_results(self._ip_client, task)
                if (self._idle_timeout is not None
                        and time.time() > last_idle_check + 30):
                    last_idle_check = self._check_idle_engines(idle_times)
                n_completed = int(np.sum(result_retrieved))
                progress = n_completed / float(n_jobs - n_cache)
                # print progress in 10% increments
                this_print = int(np.floor(progress * 10))
                if self._verbose >= 1 and this_print != last_print:
                    print_(' %d%% ' % (100 * progress), end='')
                    last_print = this_print
                if n_completed == n_jobs - n_cache:
                    # we are done!
                    print_('')  # newline
                    break
                time.sleep(self._poll_interval)
            if self._shutdown:
                self._shutdown_cluster()
            if len(failed_tasks) > 0:
                # surface the first few failure messages
                msg = ''
                for task in failed_tasks[:5]:
                    try:
                        task.get()
                    except Exception as e:
                        msg += str(e)
                raise RuntimeError('%d tasks failed:\n %s'
                                   % (len(failed_tasks), msg))
            self._execution_times = execution_times
        else:
            # unreachable: the three branches above are exhaustive
            raise RuntimeError('WTF?')
        if self._verbose >= 1:
            print_('Done (%0.1f seconds)' % (time.time() - t_start))
        return out

    def get_last_excecution_times(self):
        """Return the per-task serial execution times of the last
        cluster map() call, or None if map() ran locally / never ran.

        NOTE: the typo in the method name ("excecution") is kept for
        backward compatibility with existing callers.
        """
        return self._execution_times

    def purge_results(self, f, *sequences, **kwargs):
        """Delete the cache directories that f would use for these
        argument sequences; prints how many entries were removed."""
        n_jobs, my_seqs = self._check_sequences(sequences)
        f_cache = self._memory.cache(f)
        n_deleted = 0
        for this_args in zip(*my_seqs):
            out_dir, _ = f_cache.get_output_dir(*this_args, **kwargs)
            if op.exists(out_dir):
                shutil.rmtree(out_dir)
                n_deleted += 1
        print('Purging cache: %d out of %d deleted' % (n_deleted, n_jobs))

    def _shutdown_cluster(self):
        """Shut down all engines with no queued or running tasks."""
        queue = self._ip_client.queue_status()
        shutdown_eids = []
        for eid in self._ip_client.ids:
            if eid not in queue:
                continue
            if queue[eid]['queue'] + queue[eid]['tasks'] == 0:
                shutdown_eids.append(eid)
        if len(shutdown_eids) > 0:
            if self._verbose > 0:
                print('Shuting-down engines: ', shutdown_eids)
            dv = self._ip_client.direct_view(shutdown_eids)
            dv.shutdown()
class Client(): """DrQueue client actions""" def __init__(self): # initialize IPython try: self.ip_client = IPClient() except Exception: raise Exception("Could not connect to IPython controller.") self.lbview = self.ip_client.load_balanced_view() # enable tracking self.lbview.track = True def job_run(self, job): """Create and queue tasks from job object""" # check job name if job['name'] in DrQueueJob.query_jobnames(): raise ValueError("Job name %s is already used!" % job['name']) return False # save job in database job_id = DrQueueJob.store_db(job) # job_id from db is be used as session name self.ip_client.session.session = str(job_id) # set owner of job self.ip_client.session.username = job['owner'] # set number of retries for each task self.lbview.retries = job['retries'] # depend on another job (it's tasks) if ('depend' in job['limits']) and (job['limits']['depend'] != None): depend_job = self.query_job_by_name(job['limits']['depend']) depend_tasks = self.query_task_list(depend_job['_id']) task_ids = [] for task in depend_tasks: task_ids.append(task['msg_id']) self.lbview.after = task_ids # check frame numbers if not (job['startframe'] >= 1): raise ValueError("Invalid value for startframe. Has to be equal or greater than 1.") return False if not (job['endframe'] >= 1): raise ValueError("Invalid value for endframe. Has to be equal or greater than 1.") return False if not (job['endframe'] >= job['startframe']): raise ValueError("Invalid value for endframe. Has be to equal or greater than startframe.") return False if job['endframe'] > job['startframe']: if not (job['endframe'] - job['startframe'] >= job['blocksize']): raise ValueError("Invalid value for blocksize. Has to be equal or lower than endframe-startframe.") return False if job['endframe'] == job['startframe']: if job['blocksize'] != 1: raise ValueError("Invalid value for blocksize. 
Has to be equal 1 if endframe equals startframe.") return False task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize'])) ar = None for x in task_frames: # prepare script input env_dict = { 'DRQUEUE_FRAME' : x, 'DRQUEUE_BLOCKSIZE' : job['blocksize'], 'DRQUEUE_ENDFRAME' : job['endframe'], 'DRQUEUE_SCENEFILE' : job['scenefile'], 'DRQUEUE_LOGFILE' : job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] -1) + ".log" } # optional elements if 'renderdir' in job: env_dict['DRQUEUE_RENDERDIR'] = job['renderdir'] if 'projectdir' in job: env_dict['DRQUEUE_PROJECTDIR'] = job['projectdir'] if 'configdir' in job: env_dict['DRQUEUE_CONFIGDIR'] = job['configdir'] if 'imagefile' in job: env_dict['DRQUEUE_IMAGEFILE'] = job['imagefile'] if 'precommand' in job: env_dict['DRQUEUE_PRECOMMAND'] = job['precommand'] if 'renderer' in job: env_dict['DRQUEUE_RENDERER'] = job['renderer'] if 'fileformat' in job: env_dict['DRQUEUE_FILEFORMAT'] = job['fileformat'] if 'postcommand' in job: env_dict['DRQUEUE_POSTCOMMAND'] = job['postcommand'] if 'viewcommand' in job: env_dict['DRQUEUE_VIEWCOMMAND'] = job['viewcommand'] if 'worldfile' in job: env_dict['DRQUEUE_WORLDFILE'] = job['worldfile'] if 'terrainfile' in job: env_dict['DRQUEUE_TERRAINFILE'] = job['terrainfile'] if 'composition' in job: env_dict['DRQUEUE_COMPOSITION'] = job['composition'] if 'camera' in job: env_dict['DRQUEUE_CAMERA'] = job['camera'] if 'resx' in job: env_dict['DRQUEUE_RESX'] = job['resx'] if 'resy' in job: env_dict['DRQUEUE_RESY'] = job['resy'] if 'renderpass' in job: env_dict['DRQUEUE_RENDERPASS'] = job['renderpass'] if 'rendertype' in job: env_dict['DRQUEUE_RENDERTYPE'] = job['rendertype'] if 'fileextension' in job: env_dict['DRQUEUE_FILEEXTENSION'] = job['fileextension'] if 'stepframe' in job: env_dict['DRQUEUE_STEPFRAME'] = job['stepframe'] if 'custom_bucket' in job: env_dict['DRQUEUE_CUSTOM_BUCKET'] = job['custom_bucket'] if 'bucketsize' in job: env_dict['DRQUEUE_BUCKETSIZE'] = 
job['bucketsize'] if 'custom_lod' in job: env_dict['DRQUEUE_CUSTOM_LOD'] = job['custom_lod'] if 'lod' in job: env_dict['DRQUEUE_LOD'] = job['lod'] if 'custom_varyaa' in job: env_dict['DRQUEUE_CUSTOM_VARYAA'] = job['custom_varyaa'] if 'varyaa' in job: env_dict['DRQUEUE_VARYAA'] = job['varyaa'] if 'raytrace' in job: env_dict['DRQUEUE_RAYTRACE'] = job['raytrace'] if 'antialias' in job: env_dict['DRQUEUE_ANTIALIAS'] = job['antialias'] if 'custom_bdepth' in job: env_dict['DRQUEUE_CUSTOM_BDEPTH'] = job['custom_bdepth'] if 'bdepth' in job: env_dict['DRQUEUE_BDEPTH'] = job['bdepth'] if 'custom_zdepth' in job: env_dict['DRQUEUE_CUSTOM_ZDEPTH'] = job['custom_zdepth'] if 'zdepth' in job: env_dict['DRQUEUE_ZDEPTH'] = job['zdepth'] if 'custom_cracks' in job: env_dict['DRQUEUE_CUSTOM_CRACKS'] = job['custom_cracks'] if 'cracks' in job: env_dict['DRQUEUE_CRACKS'] = job['cracks'] if 'custom_quality' in job: env_dict['DRQUEUE_CUSTOM_QUALITY'] = job['custom_quality'] if 'quality' in job: env_dict['DRQUEUE_QUALITY'] = job['quality'] if 'custom_qfiner' in job: env_dict['DRQUEUE_CUSTOM_QFINER'] = job['custom_qfiner'] if 'qfiner' in job: env_dict['DRQUEUE_QFINER'] = job['qfiner'] if 'custom_smultiplier' in job: env_dict['DRQUEUE_CUSTOM_SMULTIPLIER'] = job['custom_smultiplier'] if 'smultiplier' in job: env_dict['DRQUEUE_SMULTIPLIER'] = job['smultiplier'] if 'custom_mpcache' in job: env_dict['DRQUEUE_CUSTOM_MPCACHE'] = job['custom_mpcache'] if 'mpcache' in job: env_dict['DRQUEUE_MPCACHE'] = job['mpcache'] if 'custom_smpolygon' in job: env_dict['DRQUEUE_CUSTOM_SMPOLYGON'] = job['custom_smpolygon'] if 'smpolygon' in job: env_dict['DRQUEUE_SMPOLYGON'] = job['smpolygon'] if 'custom_wh' in job: env_dict['DRQUEUE_CUSTOM_WH'] = job['custom_wh'] if 'custom_type' in job: env_dict['DRQUEUE_CUSTOM_TYPE'] = job['custom_type'] if 'ctype' in job: env_dict['DRQUEUE_CTYPE'] = job['ctype'] if 'skipframes' in job: env_dict['DRQUEUE_SKIPFRAMES'] = job['skipframes'] # set dependencies dep_dict = {} if ('os' 
in job['limits']) and (job['limits']['os'] != None): dep_dict['os_name'] = job['limits']['os'] if ('minram' in job['limits']) and (job['limits']['minram'] > 0): dep_dict['minram'] = job['limits']['minram'] if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0): dep_dict['mincores'] = job['limits']['mincores'] if ('pool_name' in job['limits']) and (job['limits']['pool_name'] != None): dep_dict['pool_name'] = job['limits']['pool_name'] run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict) # run task on cluster render_script = DrQueue.get_rendertemplate(job['renderer']) ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict) # wait for pyzmq send to complete communication (avoid race condition) ar.wait_for_send() # append email task behind last task if requested if ('send_email' in job) and (job['send_email'] == True): self.lbview.after = ar # run email task mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients']) # wait for pyzmq send to complete communication (avoid race condition) mail_ar.wait_for_send() return True def identify_computer(self, engine_id, cache_time): """Gather information about computer""" # look if engine info is already stored engine = DrQueueComputer.query_db(engine_id) now = int(time.time()) # check existence and age of info if (engine != None) and (now <= engine['date'] + cache_time): print("DEBUG: Engine %i was found in DB" % engine_id) # store new info else: print("DEBUG: Engine %i was not found in DB" % engine_id) # run command only on specific computer dview = self.ip_client[engine_id] dview.block = True dview.execute("import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer(" + str(engine_id) + ")") engine = dview['engine'] engine['date'] = int(time.time()) DrQueueComputer.store_db(engine) return engine def task_wait(self, task_id): """Wait for task to finish""" ar = 
self.ip_client.get_result(task_id) ar.wait_for_send() ar.wait() return ar def query_job_list(self): """Query a list of all jobs""" return DrQueueJob.query_job_list() def query_running_job_list(self): """Query a list of all running jobs""" jobs = DrQueueJob.query_job_list() running_jobs = [] for job in jobs: if self.query_job_tasks_left(job['_id']) > 0: running_jobs.append(job) return running_jobs def query_jobname(self, task_id): """Query jobname from task id""" data = self.ip_client.db_query({"msg_id" : task_id}) job_id = data[0]['header']['session'] job = DrQueueJob.query_db(job_id) return job.name def query_job(self, job_id): """Query job from id""" return DrQueueJob.query_db(job_id) def query_job_by_name(self, job_name): """Query job from name""" return DrQueueJob.query_job_by_name(job_name) def query_job_tasks_left(self, job_id): """Query left frames of job""" left = 0 tasks = self.query_task_list(job_id) for task in tasks: if task['completed'] == None: left += 1 return left def query_job_finish_time(self, job_id): """Query oldest finish time of all tasks.""" job = self.query_job(job_id) # use requeue time as starting point if available if ('requeue_time' in job ) and (job['requeue_time'] != False): finish_time = job['requeue_time'] else: finish_time = job['submit_time'] tasks = self.query_task_list(job_id) for task in tasks: # look if older finish time exists if (task['completed'] != None) and (task['completed'] > finish_time): finish_time = task['completed'] return finish_time def get_frame_nr(self, task): """Extract value of DRQUEUE_FRAME.""" return int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME']) def query_task_list(self, job_id): """Query a list of tasks objects of certain job""" task_list = self.ip_client.db_query({'header.session' : str(job_id)}) sorted_task_list = sorted(task_list, key=self.get_frame_nr) return sorted_task_list def query_task(self, task_id): """Query a single task""" task = self.ip_client.db_query({'msg_id' : task_id })[0] return 
task def query_engine_list(self): """Query a list of all engines""" return self.ip_client.ids def query_engines_of_pool(self, pool_name): """Return available engines of certain pool.""" pool_computers = self.ip_client.ids if pool_name != None: computers = DrQueueComputerPool.query_pool_members(pool_name) if computers == None: raise ValueError("Pool \"%s\" is not existing!" % pool_name) return False for comp in pool_computers: if not comp in computers: pool_computers.remove(comp) if pool_computers == []: raise ValueError("No computer of pool %s is available!" % pool_name) return False print("DEBUG: matching pool: " + pool_name) print(pool_computers) return pool_computers def query_engines_of_os(self, os_name): """Return only engines running certain OS.""" # run job only on matching os matching_os = self.ip_client.ids if os_name != None: for engine_id in self.ip_client.ids: engine = self.identify_computer(engine_id, 1000) # os string has to contain os_name if not os_name in engine['os']: matching_os.remove(engine_id) print("DEBUG: matching os: " + os_name) print(matching_os) return matching_os def query_engines_with_minram(self, minram): """Return only engines with at least minram GB RAM.""" # run job only on matching minram matching_minram = self.ip_client.ids if minram > 0: for engine_id in self.ip_client.ids: engine = self.identify_computer(engine_id, 1000) if engine['memory'] < minram: matching_minram.remove(engine_id) print("DEBUG: matching minram: " + str(minram)) print(matching_minram) return matching_minram def query_engines_with_mincores(self, mincores): """Return only engines with at least mincores CPU cores.""" # run job only on matching mincores matching_mincores = self.ip_client.ids if mincores > 0: for engine_id in self.ip_client.ids: engine = self.identify_computer(engine_id, 1000) if engine['ncorescpu'] * engine['ncpus'] < mincores: matching_mincores.remove(engine_id) print("DEBUG: matching mincores: " + str(mincores)) print(matching_mincores) return 
matching_mincores def match_all_limits(self, os_list, minram_list, mincores_list, pool_list): """Match all limits for job.""" tmp_list = [] # build list with all list members tmp_list.extend(os_list) tmp_list.extend(minram_list) tmp_list.extend(mincores_list) tmp_list.extend(pool_list) # make entries unique tmp_list = set(tmp_list) tmp_list = list(tmp_list) matching_limits = [] for entry in tmp_list: # look if entry is in all lists if (entry in os_list) and (entry in minram_list) and (entry in mincores_list) and (entry in pool_list): matching_limits.append(entry) else: print("DEBUG: %i isn't matching limits" % entry) print("DEBUG: matching limits:") print(matching_limits) if len(matching_limits) == 0: message = "No engine meets the requirements." print(message) raise Exception(message) elif len(matching_limits) > 0: # only run on matching engines self.lbview = self.ip_client.load_balanced_view(matching_limits) else: self.lbview = self.ip_client.load_balanced_view() def job_stop(self, job_id): """Stop job and all tasks which are not currently running""" tasks = self.query_task_list(job_id) # abort all queued tasks for task in tasks: self.ip_client.abort(task['msg_id']) return True def job_kill(self, job_id): """Stop job and all of it's tasks wether running or not""" tasks = self.query_task_list(job_id) running_engines = [] # abort all queued tasks for task in tasks: stats = self.ip_client.queue_status('all', True) # check if tasks is already running on an engine for key,status in list(stats.items()): if ('tasks' in status) and (task['msg_id'] in status['tasks']): running_engines.append(key) self.ip_client.abort(task['msg_id']) # restart all engines which still run a task running_engines = set(running_engines) return True def job_delete(self, job_id): """Delete job and all of it's tasks""" tasks = self.query_task_list(job_id) engines = self.query_engine_list() # abort and delete all queued tasks for task in tasks: if len(engines) > 0: 
self.ip_client.abort(task['msg_id']) self.ip_client.purge_results(task['msg_id']) # delete job itself DrQueueJob.delete_from_db(job_id) return True def task_continue(self, task_id): """Continue aborted or failed task""" task = self.query_task(task_id) # check if action is needed if (task['completed'] != None) and ((task['result_header']['status'] == "error") or (task['result_header']['status'] == "aborted")): self.task_requeue(task_id) return True def task_requeue(self, task_id): """Requeue task""" self.ip_client.resubmit(task_id) print("requeuing %s" % task_id) return True def job_continue(self, job_id): """Continue stopped job and all of it's tasks""" job = self.query_job(job_id) tasks = self.query_task_list(job_id) # continue tasks for task in tasks: self.task_continue(task['msg_id']) return True def job_rerun(self, job_id): """Run all tasks of job another time""" job = self.query_job(job_id) tasks = self.query_task_list(job_id) # rerun tasks for task in tasks: self.task_requeue(task['msg_id']) # set resubmit time job['requeue_time'] = datetime.datetime.now() DrQueueJob.update_db(job) return True def job_status(self, job_id): """Return status string of job""" tasks = self.query_task_list(job_id) status = None status_pending = 0 status_ok = 0 status_aborted = 0 status_resubmitted = 0 status_error = 0 status_unknown = 0 for task in tasks: # look for pending tasks if task['completed'] == None: status_pending += 1 else: if 'result_header' in list(task.keys()): result_header = task['result_header'] # look for done tasks if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"): status_ok += 1 # look for aborted tasks elif ('status' in list(result_header.keys())) and (result_header['status'] == "aborted"): status_aborted += 1 # look for done tasks elif ('status' in list(result_header.keys())) and (result_header['status'] == "resubmitted"): status_resubmitted += 1 # look for tasks with error elif ('status' in list(result_header.keys())) and 
(result_header['status'] == "error"): status_error += 1 else: status_unknown += 1 # if at least 1 task is ok, job status is ok if status_ok > 0: status = "ok" # if at least 1 task is pending, job status is pending if status_pending > 0: status = "pending" # if at least 1 task is aborted, job status is aborted if status_aborted > 0: status = "aborted" # if at least 1 task has an error, job status is error if status_error > 0: status = "error" return status def job_estimated_finish_time(self, job_id): """Calculate estimated finish time of job.""" tasks = self.query_task_list(job_id) spent_times = [] # get spent time for each finished task for task in tasks: if task['completed'] != None: if 'result_header' in list(task.keys()): result_header = task['result_header'] if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"): timediff = task['completed'] - task['started'] spent_times.append(timediff) if len(spent_times) > 0: # calculate sum of spent time sum_times = datetime.timedelta(0) for spent in spent_times: sum_times += spent # calcutate mean time for a single task meantime = sum_times / len(spent_times) # calculate estimated time left tasks_left = len(tasks) - len(spent_times) time_left = tasks_left * meantime # query job object job = self.query_job(job_id) # look if all tasks are already done if self.query_job_tasks_left(job_id) == 0: finish_time = self.query_job_finish_time(job_id) else: # calculate estimated finish time, use requeue time if available if ('requeue_time' in job ) and (job['requeue_time'] != False): finish_time = job['requeue_time'] + time_left else: finish_time = job['submit_time'] + time_left else: meantime = "unknown" time_left = "unknown" finish_time = "unknown" return meantime, time_left, finish_time def engine_stop(self, engine_id): """Stop a specific engine""" # delete computer information in db DrQueueComputer.delete_from_db(engine_id) # shutdown computer self.ip_client.shutdown(engine_id) return True def 
engine_restart(self, engine_id): """Restart a specific engine""" self.ip_client.shutdown(engine_id, True, False, True) return True
class Client():
    """DrQueue client actions"""

    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()
        # enable tracking
        self.lbview.track = True
        # list of all available query keys
        self.all_task_query_keys = ['msg_id', 'header', 'content', 'buffers',
            'submitted', 'client_uuid', 'engine_uuid', 'started', 'completed',
            'resubmitted', 'result_header', 'result_content', 'result_buffers',
            'queue', 'pyin', 'pyout', 'pyerr', 'stdout', 'stderr']

    def job_run(self, job):
        """Create and queue tasks from job object.

        Raises ValueError on duplicate job name or invalid frame range.
        """
        # check job name
        if job['name'] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job['name'])
        # save job in database
        job_id = DrQueueJob.store_db(job)
        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)
        # set owner of job
        self.ip_client.session.username = job['owner']
        # set number of retries for each task
        self.lbview.retries = job['retries']
        # depend on another job (it's tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] != None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids
        # check frame numbers
        if not (job['startframe'] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal or greater than 1.")
        if not (job['endframe'] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal or greater than 1.")
        if not (job['endframe'] >= job['startframe']):
            raise ValueError("Invalid value for endframe. Has be to equal or greater than startframe.")
        if job['endframe'] > job['startframe']:
            if not (job['endframe'] - job['startframe'] >= job['blocksize']):
                raise ValueError("Invalid value for blocksize. Has to be equal or lower than endframe-startframe.")
        if job['endframe'] == job['startframe']:
            if job['blocksize'] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal 1 if endframe equals startframe.")
        task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize']))
        ar = None
        for x in task_frames:
            # prepare script input
            env_dict = {
                'DRQUEUE_FRAME': x,
                'DRQUEUE_BLOCKSIZE': job['blocksize'],
                'DRQUEUE_ENDFRAME': job['endframe'],
                'DRQUEUE_SCENEFILE': job['scenefile']
            }
            # log filename
            if job['created_with'] == "DrQueueOnRails":
                # take job directory name
                env_dict['DRQUEUE_LOGFILE'] = job['scenefile'].split("/")[-2] + "-" + str(x) + "_" + str(x + job['blocksize'] - 1) + ".log"
            else:
                # take job name
                env_dict['DRQUEUE_LOGFILE'] = job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] - 1) + ".log"
            # optional job attributes: copy each key present in the job dict
            # into the task environment as DRQUEUE_<KEY> (uppercased); same
            # mapping the former explicit if-chain performed, one per key
            for key in ('renderdir', 'projectdir', 'configdir', 'imagefile',
                        'precommand', 'renderer', 'fileformat', 'postcommand',
                        'viewcommand', 'worldfile', 'terrainfile',
                        'composition', 'camera', 'resx', 'resy', 'renderpass',
                        'rendertype', 'fileextension', 'stepframe',
                        'custom_bucket', 'bucketsize', 'custom_lod', 'lod',
                        'custom_varyaa', 'varyaa', 'raytrace', 'antialias',
                        'custom_bdepth', 'bdepth', 'custom_zdepth', 'zdepth',
                        'custom_cracks', 'cracks', 'custom_quality', 'quality',
                        'custom_qfiner', 'qfiner', 'custom_smultiplier',
                        'smultiplier', 'custom_mpcache', 'mpcache',
                        'custom_smpolygon', 'smpolygon', 'custom_wh',
                        'custom_type', 'ctype', 'skipframes', 'custom_command'):
                if key in job:
                    env_dict['DRQUEUE_' + key.upper()] = job[key]
            # set dependencies
            dep_dict = {}
            dep_dict['job_id'] = job_id
            if ('os' in job['limits']) and (job['limits']['os'] != None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name' in job['limits']) and (job['limits']['pool_name'] != None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)
            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()
        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True

    def identify_computer(self, engine_id, cache_time, timeout=15):
        """Gather information about computer.

        Uses the cached DB entry when younger than cache_time seconds,
        otherwise queries the engine (waiting up to timeout seconds).
        Returns None when the engine is unknown or unreachable with no
        cached information available.
        """
        # look if engine info is already stored
        engine = DrQueueComputer.query_db_by_engine_id(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine['created_at'] + cache_time):
            print("DEBUG: Engine %i was found in DB and info is up-to-date." % engine_id)
            return engine
        # store new info
        else:
            if engine != None:
                print("DEBUG: Engine %i was found in DB, but info needs to be updated." % engine_id)
            else:
                print("DEBUG: Engine %i was not found in DB." % engine_id)
            # run command only on specific computer
            try:
                dview = self.ip_client[engine_id]
            except IndexError:
                print("DEBUG: Engine with id %i unknown." % engine_id)
                # delete old entry from database
                DrQueueComputer.delete_from_db_by_engine_id(engine_id)
                print("DEBUG: Engine with id %i deleted from database." % engine_id)
                new_engine = None
            else:
                # run command in async mode
                dview.block = False
                command = "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer()"
                ar = dview.execute(command)
                try:
                    # try to get results & wait until timeout
                    ar.get(timeout)
                except Exception:
                    if engine != None:
                        print("DEBUG: Update request for engine %i timed out. Using old information from DB." % engine_id)
                        new_engine = engine
                    else:
                        print("DEBUG: Information request for engine %i timed out." % engine_id)
                        new_engine = None
                else:
                    # get computer dict from engine namespace
                    new_engine = dview['engine']
                    # set to known engine_id
                    new_engine['engine_id'] = engine_id
                    # set creation time
                    new_engine['created_at'] = int(time.time())
                    # store entry in database
                    DrQueueComputer.store_db(new_engine)
            return new_engine

    def computer_set_pools(self, computer, pool_list):
        """Add computer to list of pools."""
        # convert to string
        pool_str = ','.join(pool_list)
        # update environment variable on engine
        dview = self.ip_client[computer['engine_id']]
        dview.block = True
        command = "import os\nos.environ[\"DRQUEUE_POOL\"] = \"" + pool_str + "\""
        dview.execute(command)
        # update database entry
        computer['pools'] = pool_list
        DrQueueComputer.store_db(computer)
        print("DEBUG: Engine " + str(computer['engine_id']) + " added to pools " + pool_str + ".")
        return computer

    def computer_get_pools(self, computer):
        """Return all pool names where computer is member."""
        return computer['pools']

    def task_wait(self, task_id):
        """Wait for task to finish and return its AsyncResult."""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar

    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()

    def query_job_by_id(self, job_id):
        """Query job by given id"""
        return DrQueueJob.query_db(job_id)

    def query_job_by_name(self, job_name):
        """Query job by given name"""
        return DrQueueJob.query_job_by_name(job_name)

    def query_job_tasks_left(self, job_id):
        """Query number of unfinished tasks (left frames) of job."""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] == None:
                left += 1
        return left

    def query_job_finish_time(self, job_id):
        """Query oldest finish time of all tasks."""
        job = self.query_job_by_id(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look if older finish time exists
            if (task['completed'] != None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time

    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME; defaults to 1 when no buffers exist."""
        if ('buffers' in task) and task['buffers'] != []:
            frame_nr = int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])
        else:
            frame_nr = 1
        return frame_nr

    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job. Sort by frame number."""
        task_list = self.ip_client.db_query({'header.session': str(job_id)}, keys=self.all_task_query_keys)
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list

    def query_interrupted_task_list(self, job_id):
        """Query a list of interrupted tasks of certain job. Sort by frame number.

        A task counts as interrupted when its output file exists but is
        empty, or when no output exists and the task never started.
        Raises ValueError for renderers other than Blender.
        """
        job = self.query_job_by_id(job_id)
        task_list = self.ip_client.db_query({'header.session': str(job_id)}, keys=self.all_task_query_keys)
        interrupted_task_list = []
        for task in task_list:
            frame_nr = self.get_frame_nr(task)
            print("frame_nr: " + str(frame_nr))
            if job['renderer'] == "blender":
                # Blender output: scenefile path + zero-padded frame number + extension
                filesearch = job['scenefile'] + str("%04d" % frame_nr) + ".???"
                found = glob.glob(filesearch)
                # file was found
                if len(found) > 0:
                    outputfile = found[0]
                    print("outputfile: " + str(outputfile))
                    filesize = os.path.getsize(outputfile)
                    print(filesize)
                    # file exists, but is empty
                    if filesize == 0:
                        interrupted_task_list.append(task)
                # file was not found
                else:
                    outputfile = None
                    print("outputfile: " + str(outputfile))
                    if (task['completed'] == None) and (task['started'] == None):
                        interrupted_task_list.append(task)
            else:
                raise ValueError("Only Blender renderer supported so far.")
        return interrupted_task_list

    def query_task(self, task_id):
        """Query a single task."""
        task = self.ip_client.db_query({'msg_id': task_id}, keys=self.all_task_query_keys)[0]
        return task

    def query_computer_list(self):
        """Query a list of all computers."""
        return self.ip_client.ids

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        # disable job
        self.job_disable(job_id)
        tasks = self.query_task_list(job_id)
        tasks_to_stop = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_content" in task) and (task["result_content"] != None) and (task["result_content"]["status"] == "ok"):
                print(" finished at " + str(task["completed"]))
            else:
                # get task stats of all computers
                stats = self.ip_client.queue_status('all', True)
                # check if task is already running on an engine
                found_on_engine = False
                for key, status in list(stats.items()):
                    if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                        # skip tasks which are already running on an engine
                        print(" not finished yet but already queued to engine. will leave it there.")
                        found_on_engine = True
                        break
                # if a task isn't already queued/running on an engine, it should be safe to abort it
                if found_on_engine == False:
                    print(" not finished yet. will abort.")
                    tasks_to_stop.append(task['msg_id'])
        if len(tasks_to_stop) > 0:
            try:
                self.ip_client.abort(tasks_to_stop)
            except Exception as e:
                print("ERROR: " + str(e))
        return True

    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not"""
        # disable job
        self.job_disable(job_id)
        tasks = self.query_task_list(job_id)
        running_engines = []
        tasks_to_stop = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key, status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            tasks_to_stop.append(task['msg_id'])
        # stop all matching tasks at once
        try:
            self.ip_client.abort(tasks_to_stop)
        except Exception as e:
            print("ERROR: " + str(e))
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True

    def job_disable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = False
        DrQueueJob.update_db(job)
        return True

    def job_enable(self, job_id):
        """Enable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = True
        DrQueueJob.update_db(job)
        return True

    def job_delete(self, job_id):
        """Delete job and all of it's tasks.

        Returns True on success, False when any task could not be
        aborted or purged (job is then kept in the database).
        """
        tasks = self.query_task_list(job_id)
        engines = self.query_computer_list()
        error = False
        pending_tasks = []
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                # abort outstanding tasks which are already queued to engine
                print('aborting task ' + str(task['msg_id']))
                try:
                    self.ip_client.abort(task['msg_id'], engines)
                except Exception as e:
                    print('Error: ' + str(e))
                    error = True
                # purge all tasks which are not pending
                print('purging task ' + str(task['msg_id']))
                try:
                    self.ip_client.purge_hub_results(task['msg_id'], engines)
                except Exception:
                    print('Warning: ' + str(task['msg_id']) + ' is pending. Try to kill job before.')
                    pending_tasks.append(task)
                    error = True
        # delete job if no error occured
        if error == False:
            # delete job itself
            DrQueueJob.delete_from_db(job_id)
            return True
        else:
            return False
class Client:
    """DrQueue client: submit render jobs and control tasks/engines
    through an IPython parallel cluster.
    """

    # job keys that are forwarded into the render script environment as
    # DRQUEUE_<KEY> when present on the job (order kept from original code)
    OPTIONAL_JOB_KEYS = (
        "renderdir", "projectdir", "configdir", "imagefile", "precommand",
        "renderer", "fileformat", "postcommand", "viewcommand", "worldfile",
        "terrainfile", "composition", "camera", "resx", "resy", "renderpass",
        "rendertype", "fileextension", "stepframe", "custom_bucket",
        "bucketsize", "custom_lod", "lod", "custom_varyaa", "varyaa",
        "raytrace", "antialias", "custom_bdepth", "bdepth", "custom_zdepth",
        "zdepth", "custom_cracks", "cracks", "custom_quality", "quality",
        "custom_qfiner", "qfiner", "custom_smultiplier", "smultiplier",
        "custom_mpcache", "mpcache", "custom_smpolygon", "smpolygon",
        "custom_wh", "custom_type", "ctype", "skipframes",
    )

    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()
        # enable tracking so wait_for_send() can be used on results
        self.lbview.track = True

    def _restrict_view_to_job_limits(self, job):
        """Restrict self.lbview to the engines matching all job limits
        (OS, minimum RAM, minimum cores, pool membership)."""
        os_list = self.query_engines_of_os(job["limits"]["os"])
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)

    @staticmethod
    def _validate_frame_range(job):
        """Raise ValueError if startframe/endframe/blocksize are inconsistent."""
        if not (job["startframe"] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal or greater than 1.")
        if not (job["endframe"] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal or greater than 1.")
        if not (job["endframe"] >= job["startframe"]):
            raise ValueError("Invalid value for endframe. Has to be equal or greater than startframe.")
        if job["endframe"] > job["startframe"]:
            if not (job["endframe"] - job["startframe"] >= job["blocksize"]):
                raise ValueError("Invalid value for blocksize. Has to be equal or lower than endframe-startframe.")
        if job["endframe"] == job["startframe"]:
            if job["blocksize"] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal 1 if endframe equals startframe.")

    def job_run(self, job):
        """Create and queue tasks from job object.

        Raises ValueError on duplicate job name or inconsistent frame
        range; returns True on success.
        """
        # check job name
        if job["name"] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job["name"])
        # run job only on engines matching all limits
        self._restrict_view_to_job_limits(job)
        # save job in database
        job_id = DrQueueJob.store_db(job)
        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)
        # set owner of job
        self.ip_client.session.username = job["owner"]
        # set number of retries for each task
        self.lbview.retries = job["retries"]
        # depend on another job (its tasks)
        if ("depend" in job["limits"]) and (job["limits"]["depend"] != None):
            depend_job = self.query_job_by_name(job["limits"]["depend"])
            depend_tasks = self.query_task_list(depend_job["_id"])
            self.lbview.after = [task["msg_id"] for task in depend_tasks]
        # check frame numbers
        self._validate_frame_range(job)
        task_frames = range(job["startframe"], job["endframe"] + 1, job["blocksize"])
        for x in task_frames:
            # prepare script input
            env_dict = {
                "DRQUEUE_FRAME": x,
                "DRQUEUE_BLOCKSIZE": job["blocksize"],
                "DRQUEUE_ENDFRAME": job["endframe"],
                "DRQUEUE_SCENEFILE": job["scenefile"],
                "DRQUEUE_LOGFILE": job["name"] + "-" + str(x) + "_" + str(x + job["blocksize"] - 1) + ".log",
            }
            # optional elements: DRQUEUE_<KEY> mirrors each present job key
            for key in self.OPTIONAL_JOB_KEYS:
                if key in job:
                    env_dict["DRQUEUE_" + key.upper()] = job[key]
            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job["renderer"])
            ar = self.lbview.apply(DrQueue.run_script_with_env, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()
        return True

    def identify_computer(self, engine_id, cache_time):
        """Gather information about the computer running an engine.

        Cached DB info is reused when younger than cache_time seconds;
        otherwise the engine is queried directly and the result stored.
        """
        # look if engine info is already stored
        engine = DrQueueComputer.query_db(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine["date"] + cache_time):
            print("DEBUG: Engine %i was found in DB" % engine_id)
        else:
            print("DEBUG: Engine %i was not found in DB" % engine_id)
            # run command only on specific computer
            dview = self.ip_client[engine_id]
            dview.block = True
            dview.execute("import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer(" + str(engine_id) + ")")
            engine = dview["engine"]
            engine["date"] = int(time.time())
            DrQueueComputer.store_db(engine)
        return engine

    def task_wait(self, task_id):
        """Wait for task to finish and return its AsyncResult."""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar

    def query_job_list(self):
        """Query a list of all jobs."""
        return DrQueueJob.query_job_list()

    def query_running_job_list(self):
        """Query a list of all jobs that still have unfinished tasks."""
        jobs = DrQueueJob.query_job_list()
        return [job for job in jobs if self.query_job_tasks_left(job["_id"]) > 0]

    def query_jobname(self, task_id):
        """Query jobname from task id."""
        data = self.ip_client.db_query({"msg_id": task_id})
        job_id = data[0]["header"]["session"]
        job = DrQueueJob.query_db(job_id)
        return job.name

    def query_job(self, job_id):
        """Query job from id."""
        return DrQueueJob.query_db(job_id)

    def query_job_by_name(self, job_name):
        """Query job from name."""
        return DrQueueJob.query_job_by_name(job_name)

    def query_job_tasks_left(self, job_id):
        """Return the number of not-yet-completed tasks of a job."""
        tasks = self.query_task_list(job_id)
        return sum(1 for task in tasks if task["completed"] == None)

    def query_task_list(self, job_id):
        """Query a list of task objects of certain job."""
        return self.ip_client.db_query({"header.session": str(job_id)})

    def query_task(self, task_id):
        """Query a single task."""
        return self.ip_client.db_query({"msg_id": task_id})[0]

    def query_engine_list(self):
        """Query a list of all engine ids."""
        return self.ip_client.ids

    def query_engines_of_pool(self, pool_name):
        """Return available engines of certain pool.

        Raises ValueError when the pool does not exist or no member of it
        is currently available.
        """
        # work on a copy so the client's id list is never mutated
        pool_computers = list(self.ip_client.ids)
        if pool_name != None:
            computers = DrQueueComputerPool.query_pool_members(pool_name)
            if computers == None:
                raise ValueError('Pool "%s" is not existing!' % pool_name)
            # bugfix: build a new list instead of removing from the list
            # while iterating it (which silently skipped elements)
            pool_computers = [comp for comp in pool_computers if comp in computers]
            if pool_computers == []:
                raise ValueError("No computer of pool %s is available!" % pool_name)
            print("DEBUG: matching pool: " + pool_name)
            print(pool_computers)
        return pool_computers

    def query_engines_of_os(self, os_name):
        """Return only engines running certain OS."""
        # run job only on matching os; copy so ip_client.ids stays untouched
        matching_os = list(self.ip_client.ids)
        if os_name != None:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                # os string has to contain os_name
                if not os_name in engine["os"]:
                    matching_os.remove(engine_id)
            print("DEBUG: matching os: " + os_name)
            print(matching_os)
        return matching_os

    def query_engines_with_minram(self, minram):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching minram; copy so ip_client.ids stays untouched
        matching_minram = list(self.ip_client.ids)
        if minram > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["memory"] < minram:
                    matching_minram.remove(engine_id)
            print("DEBUG: matching minram: " + str(minram))
            print(matching_minram)
        return matching_minram

    def query_engines_with_mincores(self, mincores):
        """Return only engines with at least mincores CPU cores."""
        # run job only on matching mincores; copy so ip_client.ids stays untouched
        matching_mincores = list(self.ip_client.ids)
        if mincores > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["ncorescpu"] * engine["ncpus"] < mincores:
                    matching_mincores.remove(engine_id)
            print("DEBUG: matching mincores: " + str(mincores))
            print(matching_mincores)
        return matching_mincores

    def match_all_limits(self, os_list, minram_list, mincores_list, pool_list):
        """Intersect all limit lists and point self.lbview at the engines
        that satisfy every limit; raise when no engine qualifies."""
        candidates = set(os_list) | set(minram_list) | set(mincores_list) | set(pool_list)
        matching_limits = []
        for entry in candidates:
            # look if entry is in all lists
            if (entry in os_list) and (entry in minram_list) and (entry in mincores_list) and (entry in pool_list):
                matching_limits.append(entry)
            else:
                print("DEBUG: %i isn't matching limits" % entry)
        print("DEBUG: matching limits:")
        print(matching_limits)
        if len(matching_limits) == 0:
            message = "No engine meets the requirements."
            print(message)
            raise Exception(message)
        # only run on matching engines
        self.lbview = self.ip_client.load_balanced_view(matching_limits)

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running."""
        tasks = self.query_task_list(job_id)
        # abort all queued tasks
        for task in tasks:
            self.ip_client.abort(task["msg_id"])
        return True

    def job_kill(self, job_id):
        """Stop job and all of its tasks whether running or not."""
        tasks = self.query_task_list(job_id)
        running_engines = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status("all", True)
            # check if task is already running on an engine
            for key, status in stats.items():
                if ("tasks" in status) and (task["msg_id"] in status["tasks"]):
                    print("found")
                    running_engines.append(key)
            self.ip_client.abort(task["msg_id"])
        # engines which still run a task
        # TODO: actually restart them (engine restart was never wired up here)
        running_engines = set(running_engines)
        print(list(running_engines))
        return True

    def job_delete(self, job_id):
        """Delete job and all of its tasks."""
        tasks = self.query_task_list(job_id)
        engines = self.query_engine_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task["msg_id"])
                self.ip_client.purge_results(task["msg_id"])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True

    def task_continue(self, task_id):
        """Continue aborted or failed task by requeuing it."""
        task = self.query_task(task_id)
        # check if action is needed
        if (task["completed"] != None) and ((task["result_header"]["status"] == "error") or (task["result_header"]["status"] == "aborted")):
            self.task_requeue(task_id)
        return True

    def task_requeue(self, task_id):
        """Requeue task."""
        self.ip_client.resubmit(task_id)
        print("requeuing %s" % task_id)
        return True

    def job_continue(self, job_id):
        """Continue stopped job and all of its tasks."""
        job = self.query_job(job_id)
        # re-match engine limits before requeuing
        self._restrict_view_to_job_limits(job)
        tasks = self.query_task_list(job_id)
        # continue tasks
        for task in tasks:
            self.task_continue(task["msg_id"])
        return True

    def job_rerun(self, job_id):
        """Run all tasks of job another time."""
        job = self.query_job(job_id)
        # re-match engine limits before requeuing
        self._restrict_view_to_job_limits(job)
        tasks = self.query_task_list(job_id)
        # rerun tasks
        for task in tasks:
            self.task_requeue(task["msg_id"])
        return True

    def job_status(self, job_id):
        """Return status string of job ("ok", "pending", "aborted",
        "error") or None if no task category matched.
        """
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        # bugfix: was referenced below without ever being initialized
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task["completed"] == None:
                status_pending += 1
            else:
                if "result_header" in task:
                    task_status = task["result_header"].get("status")
                    # look for done tasks
                    if task_status == "ok":
                        status_ok += 1
                    # look for aborted tasks
                    elif task_status == "aborted":
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif task_status == "resubmitted":
                        status_resubmitted += 1
                    # look for tasks with error
                    elif task_status == "error":
                        status_error += 1
                    else:
                        status_unknown += 1
        # precedence (lowest to highest): ok < pending < aborted < error
        if status_ok > 0:
            status = "ok"
        if status_pending > 0:
            status = "pending"
        if status_aborted > 0:
            status = "aborted"
        if status_error > 0:
            status = "error"
        return status

    def engine_stop(self, engine_id):
        """Stop a specific engine."""
        # delete computer information in db
        DrQueueComputer.delete_from_db(engine_id)
        # shutdown computer
        self.ip_client.shutdown(engine_id)
        return True

    def engine_restart(self, engine_id):
        """Restart a specific engine."""
        self.ip_client.shutdown(engine_id, True, False, True)
        return True
class Grid(object):
    '''
    Master process of the QuanTrade runtime: owns the communication with
    remote drones and forks the support services around them.

    Forked at startup:
      - log.io for logs aggregation
      - dashboards for trading purpose

    Forked dynamically:
      - Remote rest_services for database wrappers
      - Drones to process remote calls
      - Glances servers and clients for ressources monitoring

    The grid waits for new tasks to pop (i.e. remote engines to appear)
    and forks trading processes on them according to their associated
    configuration. It can also spawn remote/local drones itself for
    classic cluster usage. Configured through ~/.quantrade/default.json.
    '''

    def __init__(self, configuration_path=CONFIG_PATH):
        log.info('Running Grid master, stop it with CTRL-C')
        # intercept CTRL-C
        SignalManager()
        # load configuration and prime fabric's env
        self._configure(configuration_path)
        # Team_dashboard web graphs
        self.dashboard = Dashboard()
        # logs monitoring
        self.logio = LogIO(self.configuration['nodes'])
        # nodes are the physical machines of the cluster, keyed by ip
        monitored_flag = self.configuration['monitored']
        restful_flag = self.configuration['restful']
        self.nodes = {}
        for node_ip in self.configuration['nodes']:
            self.nodes[node_ip] = Node(node_ip, monitored_flag, restful_flag)
        self.processed_engines = []

    def _configure(self, configuration_path):
        ''' Read the grid section of the configuration file and propagate
        credentials and host lists to fabric's env. '''
        raw_config = open(configuration_path, 'r')
        self.configuration = json.load(raw_config)['grid']
        # http://docs.fabfile.org/en/1.4.3/usage/env.html#full-list-of-env-vars
        # env.forward_agent = True
        # env.key_filename = [""]
        env.user = self.configuration['name']
        env.password = self.configuration['password']
        env.hosts = self.configuration['nodes']
        env.roledefs = {
            'local': ['127.0.0.1'],
            'controller': self.configuration['controller'],
            'nodes': self.configuration['nodes'],
        }

    def deploy(self):
        ''' Activate the local ipcontroller and connect to it. '''
        log.info('Deploying grid trade-system')
        log.info('Activating local ipcontroller')
        execute(fab.activate_controller)
        # main interface to drones
        self.engines = Client()

    def _is_idle(self, state):
        ''' Whether the engine state reports an empty queue; None when
        no queue information is available. '''
        return (not state['queue']) if 'queue' in state else None

    def detect_drones(self):
        ''' Register every idle, not-yet-processed engine and return how
        many new ones were found. '''
        fresh_engines = []
        engines_status = self.engines.queue_status()
        # NOTE what is the use of status['unassigned'] ?
        for engine_key, engine_state in engines_status.iteritems():
            if engine_key == 'unassigned':
                continue
            if self._is_idle(engine_state) and engine_key not in self.processed_engines:
                self.processed_engines.append(engine_key)
                fresh_engines.append(self.engines[engine_key])
        self._dispatch_engines(fresh_engines)
        return len(fresh_engines)

    def _dispatch_engines(self, engines):
        ''' Attach each new engine to its node (creating the node on first
        sight) and advertise the drone on the dashboards. '''
        for engine in engines:
            engine_ip = engine.apply_sync(get_local_ip)
            log.info('New engine detected on {}'.format(engine_ip))
            if engine_ip not in self.nodes:
                log.info('New node connected')
                self.nodes[engine_ip] = Node(engine_ip,
                                             self.configuration['monitored'],
                                             self.configuration['restful'])
            node = self.nodes[engine_ip]
            node.register_drone(engine.targets, engine)
            drone_name = node.drones[engine.targets].name
            self.dashboard.add_description(remote_ip=engine_ip, portfolio=drone_name)
            self.logio.add_description(drone_name, remote_ip=engine_ip)
            log.info('Drone registered')

    def process(self, function, node_ip=None, drone_id=None):
        ''' Process pending tasks on available, and eventually provided,
        drones '''
        # FIXME use self.engines.shutdown([1, 3]) instead of
        # non-functionnal drone.shutdown
        for node in self.nodes.values():
            node_drones = node.drones.values()
            node.inspect_armada()
            for drone in node_drones:
                drone.run(function)

    def fireup_dashboards(self):
        ''' Start the log server and/or the web dashboard when enabled. '''
        if self.configuration['logserver']:
            self.logio.build()
            self.logio.run()
            log.notice('Log.io available at http://192.168.0.12:28778')
        if self.configuration['dashboard']:
            self.dashboard.build()
            self.dashboard.run(public_ip=False)
            log.notice('Dashboard available at http://192.168.0.12:4000')
# NOTE(review): this is a second, token-identical definition of Grid; it
# rebinds the name and silently shadows the earlier definition in this file.
# One of the two copies should probably be removed.
class Grid(object):
    '''
    Responsible to run QuanTrade runtime and communicate with drones
    It forks:
      - log.io for logs aggregation
      - dashboards for trading purpose
    And dynamically:
      - Remote rest_services for database wrappers
      - Drones to process remote calls
      - Glances servers and clients for ressources monitoring
    It basically waits for new tasks to pop (ie remote engines to appear),
    and fork trading processes on them according to their associated
    configuration. It can as well create by itself remote/local drones for
    classic cluster purpose.
    The object can be configured through ~/.quantrade/default.json.
    '''
    def __init__(self, configuration_path=CONFIG_PATH):
        log.info('Running Grid master, stop it with CTRL-C')
        # CTRL-C interception
        SignalManager()
        # Setup object configuration
        self._configure(configuration_path)
        # Team_dashboard web graphs
        self.dashboard = Dashboard()
        # Logs monitoring
        self.logio = LogIO(self.configuration['nodes'])
        # Nodes are physical machines of the cluster
        self.nodes = {ip: Node(ip,
                               self.configuration['monitored'],
                               self.configuration['restful'])
                      for ip in self.configuration['nodes']}
        # engine keys already wired to a node/drone (see detect_drones)
        self.processed_engines = []

    def _configure(self, configuration_path):
        ''' Read and set configuration '''
        self.configuration = json.load(open(configuration_path, 'r'))['grid']
        #http://docs.fabfile.org/en/1.4.3/usage/env.html#full-list-of-env-vars
        #env.forward_agent = True
        #env.key_filename = [""]
        env.user = self.configuration['name']
        env.password = self.configuration['password']
        env.hosts = self.configuration['nodes']
        env.roledefs = {
            'local': ['127.0.0.1'],
            'controller': self.configuration['controller'],
            'nodes': self.configuration['nodes']
        }

    def deploy(self):
        ''' Set up local ipcontroller '''
        log.info('Deploying grid trade-system')
        log.info('Activating local ipcontroller')
        execute(fab.activate_controller)
        # Main interface to drones
        self.engines = Client()

    def _is_idle(self, state):
        ''' Check if there is pending tasks to do '''
        if 'queue' in state:
            return not state['queue']
        # Else, no informations to answer
        return None

    def detect_drones(self):
        # returns the number of newly discovered idle engines
        new_engines = []
        engines_status = self.engines.queue_status()
        #NOTE what is the use of status['unassigned'] ?
        # NOTE(review): iteritems() is Python 2 only
        for key, state in engines_status.iteritems():
            if key == 'unassigned':
                continue
            if (self._is_idle(state) and key not in self.processed_engines):
                self.processed_engines.append(key)
                new_engines.append(self.engines[key])
        self._dispatch_engines(new_engines)
        return len(new_engines)

    def _dispatch_engines(self, engines):
        # register each engine with its node and announce it on the dashboards
        for engine in engines:
            ip = engine.apply_sync(get_local_ip)
            log.info('New engine detected on {}'.format(ip))
            if ip not in self.nodes:
                log.info('New node connected')
                self.nodes[ip] = Node(ip,
                                      self.configuration['monitored'],
                                      self.configuration['restful'])
            self.nodes[ip].register_drone(engine.targets, engine)
            drone_name = self.nodes[ip].drones[engine.targets].name
            self.dashboard.add_description(remote_ip=ip, portfolio=drone_name)
            self.logio.add_description(drone_name, remote_ip=ip)
            log.info('Drone registered')

    def process(self, function, node_ip=None, drone_id=None):
        '''
        Process pending tasks on available, and eventually provided, drones
        '''
        processed_nodes = self.nodes.values()
        for node in processed_nodes:
            processed_drones = node.drones.values()
            #FIXME use self.engines.shutdown([1, 3]) insteand of
            #non-functionnal drone.shutdown
            node.inspect_armada()
            for drone in processed_drones:
                drone.run(function)

    def fireup_dashboards(self):
        # start log aggregation and/or the web dashboard when configured
        if self.configuration['logserver']:
            self.logio.build()
            self.logio.run()
            log.notice('Log.io available at http://192.168.0.12:28778')
        if self.configuration['dashboard']:
            self.dashboard.build()
            self.dashboard.run(public_ip=False)
            log.notice('Dashboard available at http://192.168.0.12:4000')