# Imports assumed for this excerpt (the exact module layout of the DrQueue
# package is not shown here):
import datetime
import glob
import os
import pickle
import time

from IPython.parallel import Client as IPClient
from IPython.parallel import dependent

import DrQueue
from DrQueue import Job as DrQueueJob
from DrQueue import Computer as DrQueueComputer


class Client(object):
    """DrQueue client actions"""

    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()
        # enable tracking
        self.lbview.track = True
        # list of all available query keys
        self.all_task_query_keys = ['msg_id', 'header', 'content', 'buffers',
            'submitted', 'client_uuid', 'engine_uuid', 'started', 'completed',
            'resubmitted', 'result_header', 'result_content', 'result_buffers',
            'queue', 'pyin', 'pyout', 'pyerr', 'stdout', 'stderr']

    def job_run(self, job):
        """Create and queue tasks from job object"""
        # check job name
        if job['name'] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job['name'])
        # save job in database
        job_id = DrQueueJob.store_db(job)
        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)
        # set owner of job
        self.ip_client.session.username = job['owner']
        # set number of retries for each task
        self.lbview.retries = job['retries']
        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] is not None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids
        # check frame numbers
        if not (job['startframe'] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= job['startframe']):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than startframe.")
        if job['endframe'] > job['startframe']:
            if not (job['endframe'] - job['startframe'] >= job['blocksize']):
                raise ValueError("Invalid value for blocksize. Has to be equal to or lower than endframe-startframe.")
        if job['endframe'] == job['startframe']:
            if job['blocksize'] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal to 1 if endframe equals startframe.")
        task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize']))
        ar = None
        for x in task_frames:
            # prepare script input
            env_dict = {
                'DRQUEUE_FRAME': x,
                'DRQUEUE_BLOCKSIZE': job['blocksize'],
                'DRQUEUE_ENDFRAME': job['endframe'],
                'DRQUEUE_SCENEFILE': job['scenefile']
            }
            # log filename
            if job['created_with'] == "DrQueueOnRails":
                # take job directory name
                env_dict['DRQUEUE_LOGFILE'] = job['scenefile'].split("/")[-2] + "-" + str(x) + "_" + str(x + job['blocksize'] - 1) + ".log"
            else:
                # take job name
                env_dict['DRQUEUE_LOGFILE'] = job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] - 1) + ".log"
            # optional job settings; each one maps to a DRQUEUE_<KEY> environment variable
            optional_keys = ['renderdir', 'projectdir', 'configdir', 'imagefile',
                'precommand', 'renderer', 'fileformat', 'postcommand',
                'viewcommand', 'worldfile', 'terrainfile', 'composition',
                'camera', 'resx', 'resy', 'renderpass', 'rendertype',
                'fileextension', 'stepframe', 'custom_bucket', 'bucketsize',
                'custom_lod', 'lod', 'custom_varyaa', 'varyaa', 'raytrace',
                'antialias', 'custom_bdepth', 'bdepth', 'custom_zdepth',
                'zdepth', 'custom_cracks', 'cracks', 'custom_quality',
                'quality', 'custom_qfiner', 'qfiner', 'custom_smultiplier',
                'smultiplier', 'custom_mpcache', 'mpcache', 'custom_smpolygon',
                'smpolygon', 'custom_wh', 'custom_type', 'ctype', 'skipframes']
            for key in optional_keys:
                if key in job:
                    env_dict['DRQUEUE_' + key.upper()] = job[key]
            # set dependencies
            dep_dict = {}
            dep_dict['job_id'] = job_id
            if ('os' in job['limits']) and (job['limits']['os'] is not None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name' in job['limits']) and (job['limits']['pool_name'] is not None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)
            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()
        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True

    def identify_computer(self, engine_id, cache_time, timeout=15):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db_by_engine_id(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine is not None) and (now <= engine['created_at'] + cache_time):
            print("DEBUG: Engine %i was found in DB and info is up-to-date." % engine_id)
            return engine
        # store new info
        else:
            if engine is not None:
                print("DEBUG: Engine %i was found in DB, but info needs to be updated." % engine_id)
            else:
                print("DEBUG: Engine %i was not found in DB." % engine_id)
            # run command only on specific computer
            try:
                dview = self.ip_client[engine_id]
            except IndexError:
                print("DEBUG: Engine with id %i unknown." % engine_id)
                # delete old entry from database
                DrQueueComputer.delete_from_db_by_engine_id(engine_id)
                print("DEBUG: Engine with id %i deleted from database." % engine_id)
                new_engine = None
            else:
                # run command in async mode
                dview.block = False
                command = "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer()"
                ar = dview.execute(command)
                try:
                    # try to get results & wait until timeout
                    ar.get(timeout)
                except Exception:
                    if engine is not None:
                        print("DEBUG: Update request for engine %i timed out. Using old information from DB." % engine_id)
                        new_engine = engine
                    else:
                        print("DEBUG: Information request for engine %i timed out." % engine_id)
                        new_engine = None
                else:
                    # get computer dict from engine namespace
                    new_engine = dview['engine']
                    # set to known engine_id
                    new_engine['engine_id'] = engine_id
                    # set creation time
                    new_engine['created_at'] = int(time.time())
                    # store entry in database
                    DrQueueComputer.store_db(new_engine)
            return new_engine

    def computer_set_pools(self, computer, pool_list):
        """Add computer to list of pools"""
        # convert to string
        pool_str = ','.join(pool_list)
        # update environment variable on engine
        dview = self.ip_client[computer['engine_id']]
        dview.block = True
        command = "import os\nos.environ[\"DRQUEUE_POOL\"] = \"" + pool_str + "\""
        dview.execute(command)
        # update database entry
        computer['pools'] = pool_list
        DrQueueComputer.store_db(computer)
        print("DEBUG: Engine " + str(computer['engine_id']) + " added to pools " + pool_str + ".")
        return computer

    def computer_get_pools(self, computer):
        """Return all pool names where computer is member."""
        return computer['pools']

    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar

    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()

    def query_job_by_id(self, job_id):
        """Query job by given id"""
        return DrQueueJob.query_db(job_id)

    def query_job_by_name(self, job_name):
        """Query job by given name"""
        return DrQueueJob.query_job_by_name(job_name)

    def query_job_tasks_left(self, job_id):
        """Query number of unfinished tasks of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] is None:
                left += 1
        return left

    def query_job_finish_time(self, job_id):
        """Query latest finish time of all tasks."""
        job = self.query_job_by_id(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look if a later finish time exists
            if (task['completed'] is not None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time

    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME."""
        if ('buffers' in task) and task['buffers'] != []:
            frame_nr = int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])
        else:
            frame_nr = 1
        return frame_nr

    def query_task_list(self, job_id):
        """Query a list of task objects of a certain job. Sort by frame number."""
        task_list = self.ip_client.db_query({'header.session': str(job_id)}, keys=self.all_task_query_keys)
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list

    def query_interrupted_task_list(self, job_id):
        """Query a list of interrupted tasks of a certain job. Sort by frame number."""
        job = self.query_job_by_id(job_id)
        task_list = self.ip_client.db_query({'header.session': str(job_id)}, keys=self.all_task_query_keys)
        interrupted_task_list = []
        for task in task_list:
            frame_nr = self.get_frame_nr(task)
            print("frame_nr: " + str(frame_nr))
            # search for output file of this frame
            if job['renderer'] == "blender":
                filesearch = job['scenefile'] + ("%04d" % frame_nr) + ".???"
                found = glob.glob(filesearch)
                # file was found
                if len(found) > 0:
                    outputfile = found[0]
                    print("outputfile: " + str(outputfile))
                    filesize = os.path.getsize(outputfile)
                    print(filesize)
                    # file exists, but is empty
                    if filesize == 0:
                        interrupted_task_list.append(task)
                # file was not found
                else:
                    outputfile = None
                    print("outputfile: " + str(outputfile))
                    if (task['completed'] is None) and (task['started'] is None):
                        interrupted_task_list.append(task)
            else:
                raise ValueError("Only the Blender renderer is supported so far.")
        return interrupted_task_list

    def query_task(self, task_id):
        """Query a single task."""
        task = self.ip_client.db_query({'msg_id': task_id}, keys=self.all_task_query_keys)[0]
        return task

    def query_computer_list(self):
        """Query a list of all computers."""
        return self.ip_client.ids

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        # disable job
        self.job_disable(job_id)
        tasks = self.query_task_list(job_id)
        tasks_to_stop = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_header" in task) and (task["result_header"] is not None) and (task["result_header"]["status"] == "ok"):
                print(" finished at " + str(task["completed"]))
            else:
                # get task stats of all computers
                stats = self.ip_client.queue_status('all', True)
                # check if task is already running on an engine
                found_on_engine = False
                for key, status in list(stats.items()):
                    if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                        # skip tasks which are already running on an engine
                        print(" not finished yet but already queued to engine. will leave it there.")
                        found_on_engine = True
                        break
                # if a task isn't already queued/running on an engine, it should be safe to abort it
                if found_on_engine == False:
                    print(" not finished yet. will abort.")
                    tasks_to_stop.append(task['msg_id'])
        if len(tasks_to_stop) > 0:
            try:
                self.ip_client.abort(tasks_to_stop)
            except Exception as e:
                print("ERROR: " + str(e))
        return True

    def job_kill(self, job_id):
        """Stop job and all of its tasks whether running or not"""
        # disable job
        self.job_disable(job_id)
        tasks = self.query_task_list(job_id)
        running_engines = []
        tasks_to_stop = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key, status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            tasks_to_stop.append(task['msg_id'])
        # stop all matching tasks at once
        try:
            self.ip_client.abort(tasks_to_stop)
        except Exception as e:
            print("ERROR: " + str(e))
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True

    def job_disable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = False
        DrQueueJob.update_db(job)
        return True

    def job_enable(self, job_id):
        """Enable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = True
        DrQueueJob.update_db(job)
        return True

    def job_delete(self, job_id):
        """Delete job and all of its tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_computer_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task['msg_id'])
            self.ip_client.purge_results(task['msg_id'])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True

    def job_continue(self, job_id):
        """Continue stopped job and all of its tasks"""
        job = self.query_job_by_id(job_id)
        # enable job
        self.job_enable(job_id)
        tasks = self.query_task_list(job_id)
        tasks_to_resubmit = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_header" in task) and (task["result_header"] is not None) and (task["result_header"]["status"] == "ok"):
                print(" finished at " + str(task["completed"]))
            else:
                print(" not finished yet. will resubmit.")
                tasks_to_resubmit.append(task["msg_id"])
        if len(tasks_to_resubmit) > 0:
            # resubmit all matching msg_ids at once
            try:
                async_results = self.ip_client.resubmit(tasks_to_resubmit)
            except Exception as e:
                print("ERROR: " + str(e))
            # IPython seems to give out new msg_ids instead of re-using the old ones
            for msg_id in async_results.msg_ids:
                print("got new msg_id: " + msg_id)
            # delete old tasks which now have a resubmitted clone
            try:
                self.ip_client.purge_results(tasks_to_resubmit)
            except Exception as e:
                print("ERROR: " + str(e))
        return True

    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job_by_id(job_id)
        # enable job
        job['enabled'] = True
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)
        tasks = self.query_task_list(job_id)
        tasks_to_resubmit = []
        # get all msg_ids of job
        for task in tasks:
            tasks_to_resubmit.append(task["msg_id"])
        # resubmit all msg_ids at once
        try:
            async_results = self.ip_client.resubmit(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))
        # IPython seems to give out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)
        # delete old tasks which now have a resubmitted clone
        try:
            self.ip_client.purge_results(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))
        # kickstart all computers
        running_engines = []
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key, status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True

    def task_rerun(self, task_id):
        """Run task another time"""
        task = self.query_task(task_id)
        # resubmit msg_id of task
        try:
            async_results = self.ip_client.resubmit(task["msg_id"])
        except Exception as e:
            print("ERROR: " + str(e))
        # IPython seems to give out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)
        # delete old task which now has a resubmitted clone
        try:
            self.ip_client.purge_results(task["msg_id"])
        except Exception as e:
            print("ERROR: " + str(e))
        # kickstart all computers
        running_engines = []
        stats = self.ip_client.queue_status('all', True)
        # check if task is already running on an engine
        for key, status in list(stats.items()):
            if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True

    def job_rerun_interrupted_tasks(self, job_id):
        """Run interrupted tasks of job another time"""
        job = self.query_job_by_id(job_id)
        # enable job
        job['enabled'] = True
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)
        tasks = self.query_interrupted_task_list(job_id)
        if len(tasks) == 0:
            return True
        tasks_to_resubmit = []
        # get all msg_ids of job
        for task in tasks:
            tasks_to_resubmit.append(task["msg_id"])
        # resubmit all msg_ids at once
        try:
            async_results = self.ip_client.resubmit(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))
        # IPython seems to give out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)
        # delete old tasks which now have a resubmitted clone
        try:
            self.ip_client.purge_results(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))
        # kickstart all computers
        running_engines = []
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key, status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True

    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task['completed'] is None:
                status_pending += 1
            else:
                if 'result_content' in list(task.keys()):
                    result_content = task['result_content']
                    # look for done tasks
                    if ('status' in list(result_content.keys())) and (result_content['status'] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task has unknown status, job status is unknown
        if status_unknown > 0:
            status = "unknown"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status

    def job_estimated_finish_time(self, job_id):
        """Calculate estimated finish time of job."""
        tasks = self.query_task_list(job_id)
        spent_times = []
        # get spent time for each finished task
        for task in tasks:
            if task['completed'] is not None:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        timediff = task['completed'] - task['started']
                        spent_times.append(timediff)
        if len(spent_times) > 0:
            # calculate sum of spent time
            sum_times = datetime.timedelta(0)
            for spent in spent_times:
                sum_times += spent
            # calculate mean time for a single task
            sum_times_secs = sum_times.days * 86400 + sum_times.seconds
            meantime_secs = sum_times_secs / len(spent_times)
            meantime = datetime.timedelta(0, meantime_secs)
            # calculate estimated time left
            tasks_left = len(tasks) - len(spent_times)
            time_left = tasks_left * meantime
            # query job object
            job = self.query_job_by_id(job_id)
            # look if all tasks are already done
            if self.query_job_tasks_left(job_id) == 0:
                finish_time = self.query_job_finish_time(job_id)
            else:
                # calculate estimated finish time, use requeue time if available
                if ('requeue_time' in job) and (job['requeue_time'] != False):
                    finish_time = job['requeue_time'] + time_left
                else:
                    finish_time = job['submit_time'] + time_left
        else:
            meantime = "unknown"
            time_left = "unknown"
            finish_time = "unknown"
        return meantime, time_left, finish_time

    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db_by_engine_id(engine_id)
        # stop the engine
        try:
            self.ip_client.shutdown(engine_id, False, False, True)
        except Exception:
            return False
        return True
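# A minimal usage sketch of the Client class above, not part of the original
# module. The import alias and the job dict below are assumptions based on
# the keys that job_run() reads; the real DrQueue package presumably builds
# this dict through its Job class instead.

from DrQueue import Client as DrQueueClient

client = DrQueueClient()

# hypothetical job dict with the fields job_run() expects
job = {
    'name': 'testjob',
    'owner': 'alice',
    'renderer': 'blender',
    'scenefile': '/srv/scenes/shot01.blend',
    'startframe': 1,
    'endframe': 100,
    'blocksize': 10,
    'retries': 2,
    'created_with': 'manual',
    'limits': {},
}

client.job_run(job)

# look the job up again to get its database id, then print its status
stored_job = client.query_job_by_name('testjob')
print(client.job_status(stored_job['_id']))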
# Follow the instructions under the header "Using ipcluster in mpiexec/mpirun mode":
# https://ipython.org/ipython-doc/2/parallel/parallel_process.html#parallel-process
#
# Then, in a normal bash console, start a cluster with 4 engines:
#   ipcluster start --profile=mpi -n 4

from IPython.parallel import Client

c = Client(profile='mpi')
view = c[:]
view.activate()  # enable magics

# run the test script on all engines and inspect the MPI rank of each one
view.run('TestIPythonConsole.py')
view['rank']

# If the output is all zeros, the IPython cluster has not been set up properly.

# To shut down the cluster, run the following:
c.shutdown(hub=True)

# For distributed computing, read the following:
# http://stackoverflow.com/questions/33614100/setting-up-a-distributed-ipython-ipyparallel-mpi-cluster
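# The snippet above assumes that TestIPythonConsole.py defines a variable
# named `rank` on every engine. A minimal sketch of such a script, assuming
# mpi4py is installed on all engines (the file content is hypothetical, not
# shown in the original):

from mpi4py import MPI

# each engine stores its own MPI rank; with a working MPI cluster the
# ranks 0..N-1 are spread across the engines instead of being all zero
rank = MPI.COMM_WORLD.Get_rank()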
# Imports assumed for this excerpt (Python 2). qt_available, QtCore, QtGui,
# choose_profile and vistrails_root_directory are expected to come from the
# surrounding VisTrails "parallelflow" package and are not shown here.
import os
import subprocess
import sys
import time

from IPython.parallel import Client, error
from IPython.utils.path import get_ipython_dir, locate_profile


class EngineManager(object):
    def __init__(self):
        self.profile = None
        self.started_controller = None
        self.started_engines = set()
        self._client = None

    def _select_profile(self):
        # See IPython.core.profileapp:list_profile_in()
        profiles = []
        for filename in os.listdir(get_ipython_dir()):
            if filename.startswith('profile_'):
                profiles.append(filename[8:])

        if profiles == ['default'] and not qt_available:
            self.profile = 'default'
        elif not qt_available:
            raise ValueError("'default' IPython profile does not exist "
                             "and PyQt4 is not available")
        else:
            self.profile = choose_profile(profiles)

    def ensure_controller(self, connect_only=False):
        """Make sure a controller is available, else start a local one.
        """
        if self._client:
            return self._client

        if self.profile is None:
            self._select_profile()
        if self.profile is None:
            return None
        print "parallelflow: using IPython profile %r" % self.profile

        try:
            self._client = Client(profile=self.profile)
            print "parallelflow: connected to controller"
            return self._client
        except error.TimeoutError:
            print "parallelflow: timeout when connecting to controller"
            if connect_only:
                start_ctrl = False
            elif qt_available:
                res = QtGui.QMessageBox.question(
                    None,
                    "Start controller",
                    "Unable to connect to the configured IPython "
                    "controller. Do you want to start one?",
                    QtGui.QMessageBox.Yes | QtGui.QMessageBox.No)
                start_ctrl = res == QtGui.QMessageBox.Yes
            else:
                start_ctrl = True
        except IOError:
            print "parallelflow: didn't find a controller to connect to"
            if connect_only:
                start_ctrl = False
            elif qt_available:
                res = QtGui.QMessageBox.question(
                    None,
                    "Start controller",
                    "No controller is configured in this IPython profile. "
                    "Do you want to start one?",
                    QtGui.QMessageBox.Yes | QtGui.QMessageBox.No)
                start_ctrl = res == QtGui.QMessageBox.Yes
            else:
                start_ctrl = True

        if start_ctrl:
            ctrl_pid = os.path.join(locate_profile(self.profile),
                                    'pid', 'ipcontroller.pid')
            if os.path.exists(ctrl_pid):
                os.remove(ctrl_pid)
            print "parallelflow: starting controller"
            proc, code = self.start_process(
                lambda: os.path.exists(ctrl_pid),
                sys.executable,
                '-m', 'IPython.parallel.apps.ipcontrollerapp',
                '--profile=%s' % self.profile)
            if code is not None:
                if qt_available:
                    QtGui.QMessageBox.critical(
                        None,
                        "Error",
                        "Controller exited with code %d" % code)
                print ("parallelflow: controller process exited with "
                       "code %d" % code)
                return None
            else:
                self.started_controller = proc
                print "parallelflow: controller started, connecting"
                self._client = Client(profile=self.profile)
                return self._client
        return None

    @staticmethod
    def start_process(condition, *args):
        """Executes a file and waits for a condition.
        """
        prev_dir = os.getcwd()
        os.chdir(os.path.join(vistrails_root_directory(), os.path.pardir))
        try:
            p = subprocess.Popen(args)
        finally:
            os.chdir(prev_dir)
        if condition is None:
            return p, None
        else:
            while True:
                time.sleep(0.5)
                if condition():
                    return p, None
                res = p.poll()
                if res is not None:
                    return None, res

    def start_engines(self, nb=None, prompt="Number of engines to start"):
        """Start some engines locally
        """
        c = self.ensure_controller()
        if c is None:
            if qt_available:
                QtGui.QMessageBox.warning(
                    None,
                    "No controller",
                    "Can't start engines: couldn't connect to a "
                    "controller")
            print "parallelflow: no controller, not starting engines"
        else:
            if not nb and qt_available:
                nb, res = QtGui.QInputDialog.getInt(
                    None,
                    "Start engines",
                    prompt,
                    1,      # value
                    1,      # min
                    16)     # max
                if not res:
                    return
            elif nb is None:
                nb = 1
            print "parallelflow: about to start %d engines" % nb
            if qt_available:
                bar = QtGui.QProgressDialog("Starting engines...",
                                            None,
                                            0, nb)
                def progress(n):
                    bar.setValue(n)
                bar.show()
            else:
                def progress(n):
                    pass
            progress(0)
            init_engines = set(c.ids)
            # Start the processes
            starting = set()
            for i in xrange(nb):
                proc, res = self.start_process(
                    None,
                    sys.executable,
                    '-m', 'IPython.parallel.apps.ipengineapp',
                    '--profile=%s' % self.profile)
                starting.add(proc)
            # Wait for each one to either fail or connect
            failed = []
            connected = 0
            while connected < len(starting):
                connected = len(set(c.ids) - init_engines)
                progress(len(failed) + connected)
                time.sleep(0.5)
                for p in list(starting):
                    res = p.poll()
                    if res is not None:
                        failed.append(res)
                        starting.remove(p)
            if failed:
                nb_failed = len(failed)
                if nb_failed > 3:
                    failed = "%s, ..." % (', '.join('%d' % f for f in failed))
                else:
                    failed = ', '.join('%d' % f for f in failed)
                if qt_available:
                    QtGui.QMessageBox.critical(
                        None,
                        "Error",
                        "%d engine(s) exited with codes: %s" % (
                            nb_failed, failed))
                print "parallelflow: %d engine(s) exited with codes: %s" % (
                    nb_failed, failed)
            self.started_engines.update(starting)
            if qt_available:
                bar.hide()
                bar.deleteLater()
            print "parallelflow: %d engines started" % nb

    def info(self):
        """Show some information on the cluster.
        """
        client = self.ensure_controller(connect_only=True)

        print "----- IPython information -----"
        print "profile: %s" % self.profile
        connected = client is not None
        print "connected to controller: %s" % ("yes" if connected else "no")
        st_ctrl = (self.started_controller is not None and
                   self.started_controller.poll() is None)
        print "controller started from VisTrails: %s" % (
            "running" if st_ctrl else "no")
        st_engines = sum(1 for p in self.started_engines if p.poll() is None)
        print "engines started from VisTrails: %d" % st_engines
        if client is not None:
            nb_engines = len(client.ids)
        else:
            nb_engines = None
        print "total engines in cluster: %s" % (
            nb_engines if nb_engines is not None else "(unknown)")
        if connected and client.ids:
            dview = client[:]
            with dview.sync_imports():
                import os
                import platform
                import socket
            engines = dview.apply_async(
                eval,
                '(os.getpid(), platform.system(), socket.getfqdn())'
            ).get_dict()
            engines = sorted(
                engines.items(),
                key=lambda (ip_id, (pid, system, fqdn)): (fqdn, ip_id))
            print "engines:"
            print "\tid\tsystem\tPID\tnode FQDN"
            print "\t--\t------\t---\t---------"
            for ip_id, (pid, system, fqdn) in engines:
                print "\t%d\t%s\t%d\t%s" % (ip_id, system, pid, fqdn)
            print ""

        if qt_available:
            dialog = QtGui.QDialog()
            layout = QtGui.QVBoxLayout()
            form = QtGui.QFormLayout()
            form.addRow("Profile:", QtGui.QLabel(self.profile))
            form.addRow("Connected:",
                        QtGui.QLabel("yes" if connected else "no"))
            form.addRow("Controller started from VisTrails:",
                        QtGui.QLabel("running" if st_ctrl else "no"))
            form.addRow("Engines started from VisTrails:",
                        QtGui.QLabel(str(st_engines)))
            form.addRow("Total engines in cluster:",
                        QtGui.QLabel(str(nb_engines)
                                     if nb_engines is not None
                                     else "(unknown)"))
            layout.addLayout(form)
            if connected and client.ids:
                tree = QtGui.QTreeWidget()
                tree.setHeaderHidden(False)
                tree.setHeaderLabels(["IPython id", "PID", "System type"])
                engine_tree = dict()
                for ip_id, (pid, system, fqdn) in engines:
                    engine_tree.setdefault(fqdn, []).append(
                        (ip_id, pid, system))
                for fqdn, info in engine_tree.iteritems():
                    node = QtGui.QTreeWidgetItem([fqdn])
                    tree.addTopLevelItem(node)
                    tree.setFirstItemColumnSpanned(node, True)
                    for ip_id, pid, system in info:
                        node.addChild(QtGui.QTreeWidgetItem(
                            [str(ip_id), str(pid), system]))
                for i in xrange(tree.columnCount()):
                    tree.resizeColumnToContents(i)
                tree.expandAll()
                layout.addWidget(tree)
            ok = QtGui.QPushButton("Ok")
            QtCore.QObject.connect(ok, QtCore.SIGNAL('clicked()'),
                                   dialog, QtCore.SLOT('accept()'))
            layout.addWidget(ok, 1, QtCore.Qt.AlignHCenter)
            dialog.setLayout(layout)
            dialog.exec_()

    def change_profile(self):
        self.cleanup()

        old_profile = self.profile
        self._select_profile()
        if not self.profile:
            self.profile = old_profile

        if self.profile != old_profile:
            # Here, the processes that were started but the user didn't want
            # to clean up are abandoned
            # They will continue running but later cleanups won't ask for
            # these ones
            self.started_engines = set()
            self.started_controller = None

    def cleanup(self):
        """Shut down the started processes (with user confirmation).
        """
        engines = sum(1 for p in self.started_engines if p.poll() is None)
        ctrl = (self.started_controller is not None and
                self.started_controller.poll() is None)
        print ("parallelflow: cleanup: %s, %d engines running" % (
               "controller running" if ctrl else "no controller",
               engines))
        hub_shutdown = False

        if ctrl:
            if qt_available:
                res = QtGui.QMessageBox.question(
                    None,
                    "Shutdown controller",
                    "The controller is still running. Do you want to stop "
                    "it?",
                    QtGui.QMessageBox.Yes,
                    QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                if self._client is not None:
                    self._client.shutdown(targets='all', restart=False,
                                          hub=True, block=False)
                    hub_shutdown = True
                    print "parallelflow: requested hub shutdown"
                else:
                    # only terminate the controller process if it is still
                    # running
                    if self.started_controller.poll() is None:
                        self.started_controller.terminate()
                        self.started_controller.wait()
                        print "parallelflow: controller terminated"
                self.started_controller = None

        if engines > 0 and not hub_shutdown:
            if qt_available:
                if self._client is not None:
                    total = " (among %d total)" % len(self._client.ids)
                else:
                    total = ''
                res = QtGui.QMessageBox.question(
                    None,
                    "Shutdown engines",
                    "%d engines started here%s are still "
                    "running. Do you want to stop them?" % (engines, total),
                    QtGui.QMessageBox.Yes,
                    QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                for engine in self.started_engines:
                    # only terminate engines that are still running
                    if engine.poll() is None:
                        engine.terminate()
                        engine.wait()
                print ("parallelflow: %d engines terminated" %
                       len(self.started_engines))
                self.started_engines = set()

        if self._client is not None:
            print "parallelflow: closing client"
            self._client.close()
            self._client = None

    def shutdown_cluster(self):
        """Use the client to request a shutdown of the whole cluster.
        """
        client = self.ensure_controller(connect_only=True)
        if client is None:
            if qt_available:
                QtGui.QMessageBox.information(
                    None,
                    "Couldn't connect",
                    "Couldn't connect to a controller. Is the cluster "
                    "down already?")
            print ("parallelflow: shutdown_cluster requested, but could "
                   "not connect to a controller")
            return
        if qt_available:
            res = QtGui.QMessageBox.question(
                None,
                "Shutdown cluster",
                "This will use the client connection to request the hub "
                "and every engine to shutdown. Continue?",
                QtGui.QMessageBox.Ok,
                QtGui.QMessageBox.Cancel)
            if res != QtGui.QMessageBox.Ok:
                return
        self._client.shutdown(targets='all', restart=False, hub=True,
                              block=False)
        print "parallelflow: cluster shutdown requested"
        self._client = None
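# A minimal usage sketch of EngineManager, not part of the original module
# (Python 2, like the class above). The flow mirrors what the surrounding
# package presumably does: connect to or start a controller, start local
# engines, show cluster information, then clean up what was started.

manager = EngineManager()

client = manager.ensure_controller()    # connects, or offers to start one
if client is not None:
    manager.start_engines(nb=2)         # start two local engines
    manager.info()                      # print/display cluster information
    manager.cleanup()                   # shut down what we started here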
# Imports assumed for this earlier revision of the client (same DrQueue
# package layout assumed as above):
import datetime
import pickle
import time

from IPython.parallel import Client as IPClient
from IPython.parallel import dependent

import DrQueue
from DrQueue import Job as DrQueueJob
from DrQueue import Computer as DrQueueComputer
from DrQueue import ComputerPool as DrQueueComputerPool


class Client(object):
    """DrQueue client actions"""

    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()
        # enable tracking
        self.lbview.track = True

    def job_run(self, job):
        """Create and queue tasks from job object"""
        # check job name
        if job['name'] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job['name'])
        # save job in database
        job_id = DrQueueJob.store_db(job)
        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)
        # set owner of job
        self.ip_client.session.username = job['owner']
        # set number of retries for each task
        self.lbview.retries = job['retries']
        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] is not None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids
        # check frame numbers
        if not (job['startframe'] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= job['startframe']):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than startframe.")
        if job['endframe'] > job['startframe']:
            if not (job['endframe'] - job['startframe'] >= job['blocksize']):
                raise ValueError("Invalid value for blocksize. Has to be equal to or lower than endframe-startframe.")
        if job['endframe'] == job['startframe']:
            if job['blocksize'] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal to 1 if endframe equals startframe.")
        task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize']))
        ar = None
        for x in task_frames:
            # prepare script input
            env_dict = {
                'DRQUEUE_FRAME': x,
                'DRQUEUE_BLOCKSIZE': job['blocksize'],
                'DRQUEUE_ENDFRAME': job['endframe'],
                'DRQUEUE_SCENEFILE': job['scenefile'],
                'DRQUEUE_LOGFILE': job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] - 1) + ".log"
            }
            # optional job settings; each one maps to a DRQUEUE_<KEY> environment variable
            optional_keys = ['renderdir', 'projectdir', 'configdir', 'imagefile',
                'precommand', 'renderer', 'fileformat', 'postcommand',
                'viewcommand', 'worldfile', 'terrainfile', 'composition',
                'camera', 'resx', 'resy', 'renderpass', 'rendertype',
                'fileextension', 'stepframe', 'custom_bucket', 'bucketsize',
                'custom_lod', 'lod', 'custom_varyaa', 'varyaa', 'raytrace',
                'antialias', 'custom_bdepth', 'bdepth', 'custom_zdepth',
                'zdepth', 'custom_cracks', 'cracks', 'custom_quality',
                'quality', 'custom_qfiner', 'qfiner', 'custom_smultiplier',
                'smultiplier', 'custom_mpcache', 'mpcache', 'custom_smpolygon',
                'smpolygon', 'custom_wh', 'custom_type', 'ctype', 'skipframes']
            for key in optional_keys:
                if key in job:
                    env_dict['DRQUEUE_' + key.upper()] = job[key]
            # set dependencies
            dep_dict = {}
            if ('os' in job['limits']) and (job['limits']['os'] is not None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name' in job['limits']) and (job['limits']['pool_name'] is not None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)
            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()
        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True

    def identify_computer(self, engine_id, cache_time):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine is not None) and (now <= engine['date'] + cache_time):
            print("DEBUG: Engine %i was found in DB" % engine_id)
        # store new info
        else:
            print("DEBUG: Engine %i was not found in DB" % engine_id)
            # run command only on specific computer
            dview = self.ip_client[engine_id]
            dview.block = True
            dview.execute("import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer(" + str(engine_id) + ")")
            engine = dview['engine']
            engine['date'] = int(time.time())
            DrQueueComputer.store_db(engine)
        return engine

    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar

    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()

    def query_running_job_list(self):
        """Query a list of all running jobs"""
        jobs = DrQueueJob.query_job_list()
        running_jobs = []
        for job in jobs:
            if self.query_job_tasks_left(job['_id']) > 0:
                running_jobs.append(job)
        return running_jobs

    def query_jobname(self, task_id):
        """Query jobname from task id"""
        data = self.ip_client.db_query({"msg_id": task_id})
        job_id = data[0]['header']['session']
        job = DrQueueJob.query_db(job_id)
        return job.name

    def query_job(self, job_id):
        """Query job from id"""
        return DrQueueJob.query_db(job_id)

    def query_job_by_name(self, job_name):
        """Query job from name"""
        return DrQueueJob.query_job_by_name(job_name)

    def query_job_tasks_left(self, job_id):
        """Query number of unfinished tasks of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] is None:
                left += 1
        return left

    def query_job_finish_time(self, job_id):
        """Query latest finish time of all tasks."""
        job = self.query_job(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look if a later finish time exists
            if (task['completed'] is not None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time

    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME."""
        return int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])

    def query_task_list(self, job_id):
        """Query a list of task objects of a certain job. Sort by frame number."""
        task_list = self.ip_client.db_query({'header.session': str(job_id)})
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list

    def query_task(self, task_id):
        """Query a single task"""
        task = self.ip_client.db_query({'msg_id': task_id})[0]
        return task

    def query_engine_list(self):
        """Query a list of all engines"""
        return self.ip_client.ids

    def query_engines_of_pool(self, pool_name):
        """Return available engines of certain pool."""
        pool_computers = self.ip_client.ids
        if pool_name is not None:
            computers = DrQueueComputerPool.query_pool_members(pool_name)
            if computers is None:
                raise ValueError("Pool \"%s\" does not exist!" % pool_name)
            # iterate over a copy because entries are removed while looping
            for comp in list(pool_computers):
                if not comp in computers:
                    pool_computers.remove(comp)
            if pool_computers == []:
                raise ValueError("No computer of pool %s is available!" % pool_name)
            print("DEBUG: matching pool: " + pool_name)
            print(pool_computers)
        return pool_computers

    def query_engines_of_os(self, os_name):
        """Return only engines running certain OS."""
        # run job only on matching os
        matching_os = self.ip_client.ids
        if os_name is not None:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                # os string has to contain os_name
                if not os_name in engine['os']:
                    matching_os.remove(engine_id)
            print("DEBUG: matching os: " + os_name)
            print(matching_os)
        return matching_os

    def query_engines_with_minram(self, minram):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching minram
        matching_minram = self.ip_client.ids
        if minram > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine['memory'] < minram:
                    matching_minram.remove(engine_id)
            print("DEBUG: matching minram: " + str(minram))
            print(matching_minram)
        return matching_minram

    def query_engines_with_mincores(self, mincores):
        """Return only engines with at least mincores CPU cores."""
        # run job only on matching mincores
        matching_mincores = self.ip_client.ids
        if mincores > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine['ncorescpu'] * engine['ncpus'] < mincores:
                    matching_mincores.remove(engine_id)
            print("DEBUG: matching mincores: " + str(mincores))
            print(matching_mincores)
        return matching_mincores

    def match_all_limits(self, os_list, minram_list, mincores_list, pool_list):
        """Match all limits for job."""
        tmp_list = []
        # build list with all list members
        tmp_list.extend(os_list)
        tmp_list.extend(minram_list)
        tmp_list.extend(mincores_list)
        tmp_list.extend(pool_list)
        # make entries unique
        tmp_list = set(tmp_list)
        tmp_list = list(tmp_list)
        matching_limits = []
        for entry in tmp_list:
            # look if entry is in all lists
            if (entry in os_list) and (entry in minram_list) and (entry in mincores_list) and (entry in pool_list):
                matching_limits.append(entry)
            else:
                print("DEBUG: %i isn't matching limits" % entry)
        print("DEBUG: matching limits:")
        print(matching_limits)
        if len(matching_limits) == 0:
            message = "No engine meets the requirements."
            print(message)
            raise Exception(message)
        elif len(matching_limits) > 0:
            # only run on matching engines
            self.lbview = self.ip_client.load_balanced_view(matching_limits)
        else:
            self.lbview = self.ip_client.load_balanced_view()

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        tasks = self.query_task_list(job_id)
        # abort all queued tasks
        for task in tasks:
            self.ip_client.abort(task['msg_id'])
        return True

    def job_kill(self, job_id):
        """Stop job and all of its tasks whether running or not"""
        tasks = self.query_task_list(job_id)
        running_engines = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key, status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            self.ip_client.abort(task['msg_id'])
        # restart all engines which still run a task
        running_engines = set(running_engines)
        return True

    def job_delete(self, job_id):
        """Delete job and all of its tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_engine_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task['msg_id'])
            self.ip_client.purge_results(task['msg_id'])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True

    def task_continue(self, task_id):
        """Continue aborted or failed task"""
        task = self.query_task(task_id)
        # check if action is needed
        if (task['completed'] is not None) and ((task['result_header']['status'] == "error") or (task['result_header']['status'] == "aborted")):
            self.task_requeue(task_id)
        return True

    def task_requeue(self, task_id):
        """Requeue task"""
        self.ip_client.resubmit(task_id)
        print("requeuing %s" % task_id)
        return True

    def job_continue(self, job_id):
        """Continue stopped job and all of its tasks"""
        job = self.query_job(job_id)
        tasks = self.query_task_list(job_id)
        # continue tasks
        for task in tasks:
            self.task_continue(task['msg_id'])
        return True

    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job(job_id)
        tasks = self.query_task_list(job_id)
        # rerun tasks
        for task in tasks:
            self.task_requeue(task['msg_id'])
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)
        return True

    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task['completed'] is None:
                status_pending += 1
            else:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    # look for done tasks
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status

    def job_estimated_finish_time(self, job_id):
        """Calculate estimated finish time of job."""
        tasks = self.query_task_list(job_id)
        spent_times = []
        # get spent time for each finished task
        for task in tasks:
            if task['completed'] is not None:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        timediff = task['completed'] - task['started']
                        spent_times.append(timediff)
        if len(spent_times) > 0:
            # calculate sum of spent time
            sum_times = datetime.timedelta(0)
            for spent in spent_times:
                sum_times += spent
            # calculate mean time for a single task
            meantime = sum_times / len(spent_times)
            # calculate estimated time left
            tasks_left = len(tasks) - len(spent_times)
            time_left = tasks_left * meantime
            # query job object
            job = self.query_job(job_id)
            # look if all tasks are already done
            if self.query_job_tasks_left(job_id) == 0:
                finish_time = self.query_job_finish_time(job_id)
            else:
                # calculate estimated finish time, use requeue time if available
                if ('requeue_time' in job) and (job['requeue_time'] != False):
                    finish_time = job['requeue_time'] + time_left
                else:
                    finish_time = job['submit_time'] + time_left
        else:
            meantime = "unknown"
            time_left = "unknown"
            finish_time = "unknown"
        return meantime, time_left, finish_time

    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db(engine_id)
        # shutdown computer
        self.ip_client.shutdown(engine_id)
        return True

    def engine_restart(self, engine_id):
        """Restart a specific engine"""
        self.ip_client.shutdown(engine_id, True, False, True)
        return True
class EngineManager(object): def __init__(self): self.profile = None self.started_controller = None self.started_engines = set() self._client = None def _select_profile(self): # See IPython.core.profileapp:list_profile_in() profiles = [] for filename in os.listdir(get_ipython_dir()): if filename.startswith('profile_'): profiles.append(filename[8:]) if profiles == ['default'] and not qt_available: self.profile = 'default' elif not qt_available: raise ValueError("'default' IPython profile does not exist " "and PyQt4 is not available") else: self.profile = choose_profile(profiles) def ensure_controller(self, connect_only=False): """Make sure a controller is available, else start a local one. """ if self._client: return self._client if self.profile is None: self._select_profile() if self.profile is None: return None print "parallelflow: using IPython profile %r" % self.profile try: self._client = Client(profile=self.profile) print "parallelflow: connected to controller" return self._client except error.TimeoutError: print "parallelflow: timeout when connecting to controller" if connect_only: start_ctrl = False elif qt_available: res = QtGui.QMessageBox.question( None, "Start controller", "Unable to connect to the configured IPython " "controller. Do you want to start one?", QtGui.QMessageBox.Yes | QtGui.QMessageBox.No) start_ctrl = res == QtGui.QMessageBox.Yes else: start_ctrl = True except IOError: print "parallelflow: didn't find a controller to connect to" if connect_only: start_ctrl = False elif qt_available: res = QtGui.QMessageBox.question( None, "Start controller", "No controller is configured in this IPython profile. " "Do you want to start one?", QtGui.QMessageBox.Yes | QtGui.QMessageBox.No) start_ctrl = res == QtGui.QMessageBox.Yes else: start_ctrl = True if start_ctrl: ctrl_pid = os.path.join( locate_profile(self.profile), 'pid', 'ipcontroller.pid') if os.path.exists(ctrl_pid): os.remove(ctrl_pid) print "parallelflow: starting controller" proc, code = self.start_process( lambda: os.path.exists(ctrl_pid), sys.executable, '-m', 'IPython.parallel.apps.ipcontrollerapp', '--profile=%s' % self.profile) if code is not None: if qt_available: QtGui.QMessageBox.critical( None, "Error", "Controller exited with code %d" % code) print ("parallelflow: controller process exited with " "code %d" % code) return None else: self.started_controller = proc print "parallelflow: controller started, connecting" self._client = Client(profile=self.profile) return self._client return None @staticmethod def start_process(condition, *args): """Executes a file and waits for a condition. 
""" prev_dir = os.getcwd() os.chdir(os.path.join(vistrails_root_directory(), os.path.pardir)) try: p = subprocess.Popen(args) finally: os.chdir(prev_dir) if condition is None: return p, None else: while True: time.sleep(0.5) if condition(): return p, None res = p.poll() if res is not None: return None, res def start_engines(self, nb=None, prompt="Number of engines to start"): """Start some engines locally """ c = self.ensure_controller() if c is None: if qt_available: QtGui.QMessageBox.warning( None, "No controller", "Can't start engines: couldn't connect to a " "controller") print "parallelflow: no controller, not starting engines" else: if not nb and qt_available: nb, res = QtGui.QInputDialog.getInt( None, "Start engines", prompt, 1, # value 1, # min 16) # max if not res: return elif nb is None: nb = 1 print "parallelflow: about to start %d engines" % nb if qt_available: bar = QtGui.QProgressDialog( "Starting engines...", None, 0, nb) def progress(n): bar.setValue(n) bar.show() else: def progress(n): pass progress(0) init_engines = set(c.ids) # Start the processes starting = set() for i in xrange(nb): proc, res = self.start_process( None, sys.executable, '-m', 'IPython.parallel.apps.ipengineapp', '--profile=%s' % self.profile) starting.add(proc) # Wait for each one to either fail or connect failed = [] connected = 0 while connected < len(starting): connected = len(set(c.ids) - init_engines) progress(len(failed) + connected) time.sleep(0.5) for p in list(starting): res = p.poll() if res is not None: failed.append(res) starting.remove(p) if failed: nb_failed = len(failed) if nb_failed > 3: failed = "%s, ..." % (', '.join('%d' % f for f in failed)) else: failed = ', '.join('%d' % f for f in failed) if qt_available: QtGui.QMessageBox.critical( None, "Error", "%d engine(s) exited with codes: %s" % ( nb_failed, failed)) print "parallelflow: %d engine(s) exited with codes: %s" % ( nb_failed, failed) self.started_engines.update(starting) if qt_available: bar.hide() bar.deleteLater() print "parallelflow: %d engines started" % (i + 1) def info(self): """Show some information on the cluster. 
""" client = self.ensure_controller(connect_only=True) print "----- IPython information -----" print "profile: %s" % self.profile connected = client is not None print "connected to controller: %s" % ( "yes" if connected else "no") st_ctrl = (self.started_controller is not None and self.started_controller.poll() is None) print "controller started from VisTrails: %s" % ( "running" if st_ctrl else "no") st_engines = sum(1 for p in self.started_engines if p.poll() is None) print "engines started from VisTrails: %d" % st_engines if client is not None: nb_engines = len(client.ids) else: nb_engines = None print "total engines in cluster: %s" % ( nb_engines if nb_engines is not None else "(unknown)") if connected and client.ids: dview = client[:] with dview.sync_imports(): import os import platform import socket engines = dview.apply_async( eval, '(os.getpid(), platform.system(), socket.getfqdn())' ).get_dict() engines = sorted( engines.items(), key=lambda (ip_id, (pid, system, fqdn)): (fqdn, ip_id)) print "engines:" print "\tid\tsystem\tPID\tnode FQDN" print "\t--\t------\t---\t---------" for ip_id, (pid, system, fqdn) in engines: print "\t%d\t%s\t%d\t%s" % (ip_id, system, pid, fqdn) print "" if qt_available: dialog = QtGui.QDialog() layout = QtGui.QVBoxLayout() form = QtGui.QFormLayout() form.addRow( "Profile:", QtGui.QLabel(self.profile)) form.addRow( "Connected:", QtGui.QLabel("yes" if connected else "no")) form.addRow( "Controller started from VisTrails:", QtGui.QLabel("running" if st_ctrl else "no")) form.addRow( "Engines started from VisTrails:", QtGui.QLabel(str(st_engines))) form.addRow( "Total engines in cluster:", QtGui.QLabel(str(nb_engines) if nb_engines is not None else "(unknown)")) layout.addLayout(form) if connected and client.ids: tree = QtGui.QTreeWidget() tree.setHeaderHidden(False) tree.setHeaderLabels(["IPython id", "PID", "System type"]) engine_tree = dict() for ip_id, (pid, system, fqdn) in engines: engine_tree.setdefault(fqdn, []).append( (ip_id, pid, system)) for fqdn, info in engine_tree.iteritems(): node = QtGui.QTreeWidgetItem([fqdn]) tree.addTopLevelItem(node) tree.setFirstItemColumnSpanned(node, True) for ip_id, pid, system in info: node.addChild(QtGui.QTreeWidgetItem([ str(ip_id), str(pid), system])) for i in xrange(tree.columnCount()): tree.resizeColumnToContents(i) tree.expandAll() layout.addWidget(tree) ok = QtGui.QPushButton("Ok") QtCore.QObject.connect(ok, QtCore.SIGNAL('clicked()'), dialog, QtCore.SLOT('accept()')) layout.addWidget(ok, 1, QtCore.Qt.AlignHCenter) dialog.setLayout(layout) dialog.exec_() def change_profile(self): self.cleanup() old_profile = self.profile self._select_profile() if not self.profile: self.profile = old_profile if self.profile != old_profile: # Here, the processes that were started but the user didn't want to # clean up are abandonned # They will continue running but later cleanups won't ask for these # ones self.started_engines = set() self.started_controller = None def cleanup(self): """Shut down the started processes (with user confirmation). """ engines = sum(1 for p in self.started_engines if p.poll() is None) ctrl = (self.started_controller is not None and self.started_controller.poll() is None) print ("parallelflow: cleanup: %s, %d engines running" % ( "controller running" if ctrl else "no controller", engines)) hub_shutdown = False if ctrl: if qt_available: res = QtGui.QMessageBox.question( None, "Shutdown controller", "The controller is still running. 
    def cleanup(self):
        """Shut down the started processes (with user confirmation).
        """
        engines = sum(1 for p in self.started_engines
                      if p.poll() is None)
        ctrl = (self.started_controller is not None and
                self.started_controller.poll() is None)
        print ("parallelflow: cleanup: %s, %d engines running" % (
               "controller running" if ctrl else "no controller",
               engines))
        hub_shutdown = False

        if ctrl:
            if qt_available:
                res = QtGui.QMessageBox.question(
                        None,
                        "Shutdown controller",
                        "The controller is still running. Do you want to "
                        "stop it?",
                        QtGui.QMessageBox.Yes,
                        QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                if self._client is not None:
                    self._client.shutdown(
                            targets='all',
                            restart=False,
                            hub=True,
                            block=False)
                    hub_shutdown = True
                    print "parallelflow: requested hub shutdown"
                else:
                    # only terminate the controller if it is still alive
                    # (poll() returns None while the child is running)
                    if self.started_controller.poll() is None:
                        self.started_controller.terminate()
                        self.started_controller.wait()
                        print "parallelflow: controller terminated"
                self.started_controller = None

        if engines > 0 and not hub_shutdown:
            if qt_available:
                if self._client is not None:
                    total = " (among %d total)" % len(self._client.ids)
                else:
                    total = ''
                res = QtGui.QMessageBox.question(
                        None,
                        "Shutdown engines",
                        "%d engines started here%s are still "
                        "running. Do you want to stop them?" % (
                        engines, total),
                        QtGui.QMessageBox.Yes,
                        QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                for engine in self.started_engines:
                    # same polarity as above: terminate engines that are
                    # still alive, not the ones that already exited
                    if engine.poll() is None:
                        engine.terminate()
                        engine.wait()
                print ("parallelflow: %d engines terminated" %
                       len(self.started_engines))
                self.started_engines = set()

        if self._client is not None:
            print "parallelflow: closing client"
            self._client.close()
            self._client = None

    def shutdown_cluster(self):
        """Use the client to request a shutdown of the whole cluster.
        """
        client = self.ensure_controller(connect_only=True)
        if client is None:
            if qt_available:
                QtGui.QMessageBox.information(
                        None,
                        "Couldn't connect",
                        "Couldn't connect to a controller. Is the cluster "
                        "down already?")
            print ("parallelflow: shutdown_cluster requested, but could "
                   "not connect to a controller")
            return
        if qt_available:
            res = QtGui.QMessageBox.question(
                    None,
                    "Shutdown cluster",
                    "This will use the client connection to request the hub "
                    "and every engine to shutdown. Continue?",
                    QtGui.QMessageBox.Ok,
                    QtGui.QMessageBox.Cancel)
            if res != QtGui.QMessageBox.Ok:
                return
        self._client.shutdown(
                targets='all',
                restart=False,
                hub=True,
                block=False)
        print "parallelflow: cluster shutdown requested"
        self._client = None
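# A hedged, headless usage sketch of EngineManager (assumes the VisTrails
# helpers above are importable and that an IPython profile exists; with
# qt_available false, every confirmation prompt defaults to "yes"):
def _example_engine_manager_session():
    mgr = EngineManager()
    client = mgr.ensure_controller()    # connect, or spawn ipcontroller locally
    if client is not None:
        mgr.start_engines(nb=2)         # spawn two local ipengine processes
        mgr.info()                      # print a profile/engine summary
        mgr.cleanup()                   # terminate what we started


# A hedged sketch of the per-engine identification query used by info():
# evaluate a small expression on every engine and collect a dict mapping
# engine id to (pid, system, fqdn).
def _example_engine_map(client):
    dview = client[:]
    with dview.sync_imports():
        import os
        import platform
        import socket
    return dview.apply_async(
            eval,
            '(os.getpid(), platform.system(), socket.getfqdn())').get_dict()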
# DrQueue client actions (a second variant of the Client class; Python 2).
# Assumed imports, not shown in this listing: time, DrQueue and its helpers
# (DrQueueJob, DrQueueComputer, DrQueueComputerPool), and the
# IPython.parallel Client as IPClient.


class Client:
    """DrQueue client actions"""
    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()
        # enable tracking
        self.lbview.track = True

    def job_run(self, job):
        """Create and queue tasks from job object"""
        # check job name
        if job["name"] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job["name"])
        # run job only on matching os
        os_list = self.query_engines_of_os(job["limits"]["os"])
        # run job only on matching minram
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        # run job only on matching mincores
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        # check pool members
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        # check limits
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)
        # save job in database
        job_id = DrQueueJob.store_db(job)
        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)
        # set owner of job
        self.ip_client.session.username = job["owner"]
        # set number of retries for each task
        self.lbview.retries = job["retries"]
        # depend on another job (its tasks)
        if ("depend" in job["limits"]) and (job["limits"]["depend"] is not None):
            depend_job = self.query_job_by_name(job["limits"]["depend"])
            depend_tasks = self.query_task_list(depend_job["_id"])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task["msg_id"])
            self.lbview.after = task_ids
        # check frame numbers
        if not (job["startframe"] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal to or greater than 1.")
        if not (job["endframe"] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than 1.")
        if not (job["endframe"] >= job["startframe"]):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than startframe.")
        if job["endframe"] > job["startframe"]:
            if not (job["endframe"] - job["startframe"] >= job["blocksize"]):
                raise ValueError("Invalid value for blocksize. Has to be equal to or lower than endframe-startframe.")
        if job["endframe"] == job["startframe"]:
            if job["blocksize"] != 1:
                raise ValueError("Invalid value for blocksize. Has to equal 1 if endframe equals startframe.")
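        # Worked example of the blocking arithmetic below: with startframe=1,
        # endframe=100 and blocksize=25, range(1, 101, 25) yields the task
        # start frames [1, 26, 51, 76]; each task covers blocksize frames,
        # so the last task runs frames 76..100.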
        task_frames = range(job["startframe"], job["endframe"] + 1, job["blocksize"])
        for x in task_frames:
            # prepare script input
            env_dict = {
                "DRQUEUE_FRAME": x,
                "DRQUEUE_BLOCKSIZE": job["blocksize"],
                "DRQUEUE_ENDFRAME": job["endframe"],
                "DRQUEUE_SCENEFILE": job["scenefile"],
                "DRQUEUE_LOGFILE": job["name"] + "-" + str(x) + "_"
                                   + str(x + job["blocksize"] - 1) + ".log",
            }
            # optional elements: each key present in the job dict is exported
            # as the corresponding DRQUEUE_* environment variable
            optional_keys = (
                "renderdir", "projectdir", "configdir", "imagefile",
                "precommand", "renderer", "fileformat", "postcommand",
                "viewcommand", "worldfile", "terrainfile", "composition",
                "camera", "resx", "resy", "renderpass", "rendertype",
                "fileextension", "stepframe", "custom_bucket", "bucketsize",
                "custom_lod", "lod", "custom_varyaa", "varyaa", "raytrace",
                "antialias", "custom_bdepth", "bdepth", "custom_zdepth",
                "zdepth", "custom_cracks", "cracks", "custom_quality",
                "quality", "custom_qfiner", "qfiner", "custom_smultiplier",
                "smultiplier", "custom_mpcache", "mpcache", "custom_smpolygon",
                "smpolygon", "custom_wh", "custom_type", "ctype", "skipframes",
            )
            for key in optional_keys:
                if key in job:
                    env_dict["DRQUEUE_" + key.upper()] = job[key]
            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job["renderer"])
            ar = self.lbview.apply(DrQueue.run_script_with_env, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()
        return True
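    # A hedged sketch of what a render template consumes on the engine side:
    # run_script_with_env() executes the template with the DRQUEUE_* values
    # from env_dict in its environment, so a minimal template could look
    # like this (illustrative only, not an actual DrQueue template):
    #
    #     import os
    #     frame = int(os.environ["DRQUEUE_FRAME"])
    #     block = int(os.environ["DRQUEUE_BLOCKSIZE"])
    #     endframe = int(os.environ["DRQUEUE_ENDFRAME"])
    #     scene = os.environ["DRQUEUE_SCENEFILE"]
    #     # render frames x .. min(x+block-1, endframe) of the scene file
    #     os.system("render -s %d -e %d %s"
    #               % (frame, min(frame + block - 1, endframe), scene))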
job["custom_smpolygon"] if "smpolygon" in job: env_dict["DRQUEUE_SMPOLYGON"] = job["smpolygon"] if "custom_wh" in job: env_dict["DRQUEUE_CUSTOM_WH"] = job["custom_wh"] if "custom_type" in job: env_dict["DRQUEUE_CUSTOM_TYPE"] = job["custom_type"] if "ctype" in job: env_dict["DRQUEUE_CTYPE"] = job["ctype"] if "skipframes" in job: env_dict["DRQUEUE_SKIPFRAMES"] = job["skipframes"] # run task on cluster render_script = DrQueue.get_rendertemplate(job["renderer"]) ar = self.lbview.apply(DrQueue.run_script_with_env, render_script, env_dict) # wait for pyzmq send to complete communication (avoid race condition) ar.wait_for_send() return True def identify_computer(self, engine_id, cache_time): """Gather information about computer""" # look if engine info is already stored engine = DrQueueComputer.query_db(engine_id) now = int(time.time()) # check existence and age of info if (engine != None) and (now <= engine["date"] + cache_time): print ("DEBUG: Engine %i was found in DB" % engine_id) # store new info else: print ("DEBUG: Engine %i was not found in DB" % engine_id) # run command only on specific computer dview = self.ip_client[engine_id] dview.block = True dview.execute( "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer(" + str(engine_id) + ")" ) engine = dview["engine"] engine["date"] = int(time.time()) DrQueueComputer.store_db(engine) return engine def task_wait(self, task_id): """Wait for task to finish""" ar = self.ip_client.get_result(task_id) ar.wait_for_send() ar.wait() return ar def query_job_list(self): """Query a list of all jobs""" return DrQueueJob.query_job_list() def query_running_job_list(self): """Query a list of all running jobs""" jobs = DrQueueJob.query_job_list() running_jobs = [] for job in jobs: if self.query_job_tasks_left(job["_id"]) > 0: running_jobs.append(job) return running_jobs def query_jobname(self, task_id): """Query jobname from task id""" data = self.ip_client.db_query({"msg_id": task_id}) job_id = data[0]["header"]["session"] job = DrQueueJob.query_db(job_id) return job.name def query_job(self, job_id): """Query job from id""" return DrQueueJob.query_db(job_id) def query_job_by_name(self, job_name): """Query job from name""" return DrQueueJob.query_job_by_name(job_name) def query_job_tasks_left(self, job_id): """Query left frames of job""" left = 0 tasks = self.query_task_list(job_id) for task in tasks: if task["completed"] == None: left += 1 return left def query_task_list(self, job_id): """Query a list of tasks objects of certain job""" return self.ip_client.db_query({"header.session": str(job_id)}) def query_task(self, task_id): """Query a single task""" task = self.ip_client.db_query({"msg_id": task_id})[0] return task def query_engine_list(self): """Query a list of all engines""" return self.ip_client.ids def query_engines_of_pool(self, pool_name): """Return available engines of certain pool.""" pool_computers = self.ip_client.ids if pool_name != None: computers = DrQueueComputerPool.query_pool_members(pool_name) if computers == None: raise ValueError('Pool "%s" is not existing!' % pool_name) return False for comp in pool_computers: if not comp in computers: pool_computers.remove(comp) if pool_computers == []: raise ValueError("No computer of pool %s is available!" 
    def query_engines_of_os(self, os_name):
        """Return only engines running a certain OS."""
        # run job only on matching os; filter a copy of the id list so the
        # iteration isn't disturbed by removals
        matching_os = list(self.ip_client.ids)
        if os_name is not None:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                # os string has to contain os_name
                if os_name not in engine["os"]:
                    matching_os.remove(engine_id)
            print ("DEBUG: matching os: " + os_name)
            print (matching_os)
        return matching_os

    def query_engines_with_minram(self, minram):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching minram
        matching_minram = list(self.ip_client.ids)
        if minram > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["memory"] < minram:
                    matching_minram.remove(engine_id)
            print ("DEBUG: matching minram: " + str(minram))
            print (matching_minram)
        return matching_minram

    def query_engines_with_mincores(self, mincores):
        """Return only engines with at least mincores CPU cores."""
        # run job only on matching mincores
        matching_mincores = list(self.ip_client.ids)
        if mincores > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["ncorescpu"] * engine["ncpus"] < mincores:
                    matching_mincores.remove(engine_id)
            print ("DEBUG: matching mincores: " + str(mincores))
            print (matching_mincores)
        return matching_mincores

    def match_all_limits(self, os_list, minram_list, mincores_list, pool_list):
        """Match all limits for job."""
        # build one list with all members, then make the entries unique
        tmp_list = []
        tmp_list.extend(os_list)
        tmp_list.extend(minram_list)
        tmp_list.extend(mincores_list)
        tmp_list.extend(pool_list)
        tmp_list = list(set(tmp_list))
        matching_limits = []
        for entry in tmp_list:
            # keep an entry only if it appears in all four lists
            if (entry in os_list) and (entry in minram_list) and \
               (entry in mincores_list) and (entry in pool_list):
                matching_limits.append(entry)
            else:
                print ("DEBUG: %i isn't matching limits" % entry)
        print ("DEBUG: matching limits:")
        print (matching_limits)
        if len(matching_limits) == 0:
            message = "No engine meets the requirements."
            print (message)
            raise Exception(message)
        # only run on matching engines
        self.lbview = self.ip_client.load_balanced_view(matching_limits)
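    # match_all_limits() amounts to a set intersection of the four candidate
    # lists; an equivalent one-liner (illustrative only):
    #
    #     matching_limits = list(set(os_list) & set(minram_list)
    #                            & set(mincores_list) & set(pool_list))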
    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        tasks = self.query_task_list(job_id)
        # abort all queued tasks
        for task in tasks:
            self.ip_client.abort(task["msg_id"])
        return True

    def job_kill(self, job_id):
        """Stop job and all of its tasks, whether running or not"""
        tasks = self.query_task_list(job_id)
        running_engines = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status("all", True)
            # check if task is already running on an engine
            for key, status in stats.items():
                if ("tasks" in status) and (task["msg_id"] in status["tasks"]):
                    print "DEBUG: task %s is running on engine %s" % (
                            task["msg_id"], key)
                    running_engines.append(key)
            self.ip_client.abort(task["msg_id"])
        # restart all engines which still run a task
        running_engines = set(running_engines)
        print list(running_engines)
        # for engine_id in running_engines:
        #     self.ip_client(engine_id)
        return True

    def job_delete(self, job_id):
        """Delete job and all of its tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_engine_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task["msg_id"])
                self.ip_client.purge_results(task["msg_id"])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True

    def task_continue(self, task_id):
        """Continue aborted or failed task"""
        task = self.query_task(task_id)
        # check if action is needed
        if (task["completed"] is not None) and (
                (task["result_header"]["status"] == "error") or
                (task["result_header"]["status"] == "aborted")):
            self.task_requeue(task_id)
        return True

    def task_requeue(self, task_id):
        """Requeue task"""
        self.ip_client.resubmit(task_id)
        print "requeuing %s" % task_id
        return True

    def job_continue(self, job_id):
        """Continue stopped job and all of its tasks"""
        job = self.query_job(job_id)
        # re-check all limits and point lbview at the matching engines
        os_list = self.query_engines_of_os(job["limits"]["os"])
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)
        tasks = self.query_task_list(job_id)
        # continue tasks
        for task in tasks:
            self.task_continue(task["msg_id"])
        return True

    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job(job_id)
        # re-check all limits and point lbview at the matching engines
        os_list = self.query_engines_of_os(job["limits"]["os"])
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)
        tasks = self.query_task_list(job_id)
        # rerun tasks
        for task in tasks:
            self.task_requeue(task["msg_id"])
        return True
    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task["completed"] is None:
                status_pending += 1
            elif "result_header" in task.keys():
                result_header = task["result_header"]
                # look for done tasks
                if ("status" in result_header.keys()) and (result_header["status"] == "ok"):
                    status_ok += 1
                # look for aborted tasks
                elif ("status" in result_header.keys()) and (result_header["status"] == "aborted"):
                    status_aborted += 1
                # look for resubmitted tasks
                elif ("status" in result_header.keys()) and (result_header["status"] == "resubmitted"):
                    status_resubmitted += 1
                # look for tasks with error
                elif ("status" in result_header.keys()) and (result_header["status"] == "error"):
                    status_error += 1
                else:
                    status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status

    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db(engine_id)
        # shutdown engine
        self.ip_client.shutdown(engine_id)
        return True

    def engine_restart(self, engine_id):
        """Restart a specific engine"""
        # shutdown(targets, restart, hub, block)
        self.ip_client.shutdown(engine_id, True, False, True)
        return True
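# A hedged end-to-end sketch of submitting a job through this Client. The
# dictionary fields mirror the keys read by job_run() above; the values, the
# scene path and the job name are purely illustrative.
def _example_submit_job():
    client = Client()
    job = {
        "name": "example_job",
        "owner": "alice",
        "retries": 2,
        "startframe": 1,
        "endframe": 100,
        "blocksize": 25,
        "renderer": "blender",
        "scenefile": "/mnt/projects/example/scene.blend",
        "limits": {"os": None, "minram": 0, "mincores": 0,
                   "pool": None, "depend": None},
    }
    client.job_run(job)
    # poll the job until no tasks are left, then report its final status
    job_id = client.query_job_by_name("example_job")["_id"]
    while client.query_job_tasks_left(job_id) > 0:
        time.sleep(5)
    print client.job_status(job_id)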
# BCK: a bluelet-based scheduler that drives IPython.parallel engines over a
# multiprocessing pipe. Assumed external dependencies, not shown in this
# listing: the bluelet coroutine library, the Engine wrapper class,
# assign_gpuid_to_engines(), and the Ready message type.
class BCK:
    def __init__(self, pipe, profile, pager_queue,
                 jobs_idle, jobs_executing, jobs_finished):
        from IPython.parallel import Client
        self.jobs_idle, self.jobs_executing, self.jobs_finished = \
                jobs_idle, jobs_executing, jobs_finished
        self.pipe, self.profile = pipe, profile
        self.client = Client(profile=profile)
        self.engines_idle = [Engine(self.client[id], jobs_executing)
                             for id in self.client.ids]
        assign_gpuid_to_engines(self.engines_idle)
        self.engines_executing = []
        self.run = True
        self.run_scheduling = False
        self.pager_queue = pager_queue
        global _pager_queue
        _pager_queue = self.pager_queue

    def app(self):
        print('client: %d engines, with ids %s are up'
              % (len(self.client.ids), self.client.ids))
        for engine in self.engines_idle:
            print('id %d on %s' % (engine.id, engine.hostname))
        self.pipe.send(Ready('all systems are a go'))
        yield bluelet.call(self.scheduler())

    def bluelet(self):
        bluelet.run(self.app())

    def scheduler(self):
        while self.run:
            if not self.pipe.poll():
                yield bluelet.null()
                if self.run_scheduling:
                    if len(self.engines_idle) > 0 and len(self.jobs_idle) > 0:
                        yield bluelet.spawn(self.schedule_job())
            else:
                # dispatch a command tuple received over the pipe to the
                # method of the same name, e.g. ('follow', job_id)
                recv = self.pipe.recv()
                BCK.__dict__[recv[0]](self, *recv[1:])
        yield bluelet.end()

    def schedule_job(self):
        # move an idle engine to the executing list and hand it an idle job
        unlucky = self.engines_idle.pop()
        self.engines_executing.append(unlucky)
        _, lucky = self.jobs_idle.popitem()
        lucky.engine_id = unlucky.id  # assignment must happen before reinsertion
        self.jobs_executing[lucky.id] = lucky
        yield bluelet.call(unlucky.start_job(lucky.id))
        # the job is done: move it to the finished map and recycle the engine
        self.jobs_finished[lucky.id] = self.jobs_executing[lucky.id]
        del self.jobs_executing[lucky.id]
        unlucky.executing_job = None
        self.engines_executing.remove(unlucky)
        self.engines_idle.append(unlucky)
        yield bluelet.end()

    def stop_monitor(self, ack=True):
        self.run = False
        for engine in self.engines_executing:
            engine.apply(BCK.remote_command, 'stop_process')
            print('%s stopped' % engine)
        if ack:
            self.pipe.send('stop_monitor')

    def shutdown_all(self):
        self.stop_monitor(ack=False)
        self.client.shutdown(hub=True)
        self.pipe.send('shutdown_all')

    def list_engines(self):
        pr = '''
--- executing ---
%s
--- idle ---
%s''' % ('\n'.join(map(str, self.engines_executing)),
         '\n'.join(map(str, self.engines_idle)))
        print(pr)
        self.pipe.send('list_engines')

    def start_scheduling(self):
        self.status_report(ack=False)
        self.run_scheduling = True
        self.pipe.send('start_scheduling')

    def stop_scheduling(self):
        self.status_report(ack=False)
        self.run_scheduling = False
        self.pipe.send('stop_scheduling')

    def status_report(self, ack=True):
        print('%d executing job(s)' % len(self.jobs_executing))
        print('%d finished job(s)' % len(self.jobs_finished))
        print('%d idle job(s)' % len(self.jobs_idle))
        if ack:
            self.pipe.send('status_report')

    def follow(self, id):
        if id in self.jobs_executing:
            while not self.pager_queue.empty():
                self.pager_queue.get()
            for line in self.jobs_executing[id].output_queue:
                self.pager_queue.put(line)
            self.pager_queue.activate(id)
            self.pipe.send('follow')
        elif id in self.jobs_finished:
            while not self.pager_queue.empty():
                self.pager_queue.get()
            for line in self.jobs_finished[id].output_queue:
                self.pager_queue.put(line)
            self.pipe.send('follow')
        else:
            raise ValueError('follow: unknown job id %r' % id)

    def unfollow(self, id):
        self.pager_queue.deactivate()
        self.pipe.send('unfollow')

    def remove(self, id):
        if id in self.jobs_executing:
            job = self.jobs_executing[id]
            for engine in self.engines_executing:
                if job.engine_id == engine.id:
                    engine.apply(BCK.remote_command, 'stop_process')
                    break
        self.pipe.send('remove')

    ############################################################################
    # REMOTE

    @staticmethod
    def start_job(command, wdir, udir):
        from subprocess import Popen, PIPE
        import os
        global job
        os.chdir(wdir)
        if udir:
            try:
                os.mkdir(udir)
            except OSError:
                pass
            os.chdir(udir)
        print('pwd: %s' % os.getcwd())
        # job = Popen(command.split(' '), shell=True, stdout=PIPE)
        # (it can only be run through a driver script this way)
        job = Popen(command, shell=True, stdout=PIPE)

    @staticmethod
    def remote_command(command):
        # assumes start_job() already created the 'job' global in this
        # engine's namespace
        global job
        if job is None:
            return 0, b''  # raise ValueError("'job' cannot be None")
        if command == 'stop_process':
            job.kill()
            ret = job.wait()
            job = None
            return ret
        if command == 'relay_stdout':
            return job.poll(), job.stdout.readline()
        else:
            raise ValueError('Wrong command <%s>' % command)

    @staticmethod
    def remote_system_command(cmd):
        import subprocess
        p = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE)
        # read() rather than readall(): the file object returned by Popen
        # has no readall() method
        return p.stdout.read().strip().decode()
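# A hedged sketch of the driver-side protocol for the REMOTE helpers above:
# start a process on one engine, then poll 'relay_stdout' until the process
# exits. The engine object is assumed to expose apply() like the Engine
# wrapper used in __init__ (not defined in this listing).
def _example_relay(engine):
    engine.apply(BCK.start_job, 'echo hello', '/tmp', None)
    while True:
        returncode, line = engine.apply(BCK.remote_command, 'relay_stdout')
        if line:
            print(line)
        if returncode is not None:
            # the child exited; any remaining buffered output is dropped
            # in this simplified sketch
            break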