class Client(object):
    """DrQueue client actions"""

    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()
        # enable tracking
        self.lbview.track = True
        # list of all available query keys
        self.all_task_query_keys = ['msg_id', 'header', 'content', 'buffers',
            'submitted', 'client_uuid', 'engine_uuid', 'started', 'completed',
            'resubmitted', 'result_header', 'result_content', 'result_buffers',
            'queue', 'pyin', 'pyout', 'pyerr', 'stdout', 'stderr']

    def job_run(self, job):
        """Create and queue tasks from job object"""
        # check job name
        if job['name'] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job['name'])
        # save job in database
        job_id = DrQueueJob.store_db(job)
        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)
        # set owner of job
        self.ip_client.session.username = job['owner']
        # set number of retries for each task
        self.lbview.retries = job['retries']
        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] is not None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids
        # check frame numbers
        if job['startframe'] < 1:
            raise ValueError("Invalid value for startframe. Has to be equal or greater than 1.")
        if job['endframe'] < 1:
            raise ValueError("Invalid value for endframe. Has to be equal or greater than 1.")
        if job['endframe'] < job['startframe']:
            raise ValueError("Invalid value for endframe. Has to be equal or greater than startframe.")
        if job['endframe'] > job['startframe']:
            if job['endframe'] - job['startframe'] < job['blocksize']:
                raise ValueError("Invalid value for blocksize. Has to be equal or lower than endframe-startframe.")
        if job['endframe'] == job['startframe']:
            if job['blocksize'] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal to 1 if endframe equals startframe.")
        task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize']))
        ar = None
        for x in task_frames:
            # prepare script input
            env_dict = {
                'DRQUEUE_FRAME': x,
                'DRQUEUE_BLOCKSIZE': job['blocksize'],
                'DRQUEUE_ENDFRAME': job['endframe'],
                'DRQUEUE_SCENEFILE': job['scenefile']
            }
            # log filename
            if job['created_with'] == "DrQueueOnRails":
                # take job directory name
                env_dict['DRQUEUE_LOGFILE'] = job['scenefile'].split("/")[-2] + "-" + str(x) + "_" + str(x + job['blocksize'] - 1) + ".log"
            else:
                # take job name
                env_dict['DRQUEUE_LOGFILE'] = job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] - 1) + ".log"
            # optional job keys are passed through as DRQUEUE_* environment variables
            optional_keys = [
                'renderdir', 'projectdir', 'configdir', 'imagefile', 'precommand',
                'renderer', 'fileformat', 'postcommand', 'viewcommand', 'worldfile',
                'terrainfile', 'composition', 'camera', 'resx', 'resy', 'renderpass',
                'rendertype', 'fileextension', 'stepframe', 'custom_bucket',
                'bucketsize', 'custom_lod', 'lod', 'custom_varyaa', 'varyaa',
                'raytrace', 'antialias', 'custom_bdepth', 'bdepth', 'custom_zdepth',
                'zdepth', 'custom_cracks', 'cracks', 'custom_quality', 'quality',
                'custom_qfiner', 'qfiner', 'custom_smultiplier', 'smultiplier',
                'custom_mpcache', 'mpcache', 'custom_smpolygon', 'smpolygon',
                'custom_wh', 'custom_type', 'ctype', 'skipframes'
            ]
            for key in optional_keys:
                if key in job:
                    env_dict['DRQUEUE_' + key.upper()] = job[key]
            # set dependencies
            dep_dict = {}
            dep_dict['job_id'] = job_id
            if ('os' in job['limits']) and (job['limits']['os'] is not None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name' in job['limits']) and (job['limits']['pool_name'] is not None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)
            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()
        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True

    def identify_computer(self, engine_id, cache_time, timeout=15):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db_by_engine_id(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine is not None) and (now <= engine['created_at'] + cache_time):
            print("DEBUG: Engine %i was found in DB and info is up-to-date." % engine_id)
            return engine
        # store new info
        else:
            if engine is not None:
                print("DEBUG: Engine %i was found in DB, but info needs to be updated." % engine_id)
            else:
                print("DEBUG: Engine %i was not found in DB." % engine_id)
            # run command only on specific computer
            try:
                dview = self.ip_client[engine_id]
            except IndexError:
                print("DEBUG: Engine with id %i unknown." % engine_id)
                # delete old entry from database
                DrQueueComputer.delete_from_db_by_engine_id(engine_id)
                print("DEBUG: Engine with id %i deleted from database." % engine_id)
                new_engine = None
            else:
                # run command in async mode
                dview.block = False
                command = "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer()"
                ar = dview.execute(command)
                try:
                    # try to get results & wait until timeout
                    ar.get(timeout)
                except Exception:
                    if engine is not None:
                        print("DEBUG: Update request for engine %i timed out. Using old information from DB." % engine_id)
                        new_engine = engine
                    else:
                        print("DEBUG: Information request for engine %i timed out." % engine_id)
                        new_engine = None
                else:
                    # get computer dict from engine namespace
                    new_engine = dview['engine']
                    # set to known engine_id
                    new_engine['engine_id'] = engine_id
                    # set creation time
                    new_engine['created_at'] = int(time.time())
                    # store entry in database
                    DrQueueComputer.store_db(new_engine)
            return new_engine

    def computer_set_pools(self, computer, pool_list):
        """Add computer to list of pools"""
        # convert to string
        pool_str = ','.join(pool_list)
        # update environment variable on engine
        dview = self.ip_client[computer['engine_id']]
        dview.block = True
        command = "import os\nos.environ[\"DRQUEUE_POOL\"] = \"" + pool_str + "\""
        dview.execute(command)
        # update database entry
        computer['pools'] = pool_list
        DrQueueComputer.store_db(computer)
        print("DEBUG: Engine " + str(computer['engine_id']) + " added to pools " + pool_str + ".")
        return computer

    def computer_get_pools(self, computer):
        """Return all pool names where computer is member."""
        return computer['pools']

    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar

    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()

    def query_job_by_id(self, job_id):
        """Query job by given id"""
        return DrQueueJob.query_db(job_id)

    def query_job_by_name(self, job_name):
        """Query job by given name"""
        return DrQueueJob.query_job_by_name(job_name)

    def query_job_tasks_left(self, job_id):
        """Query number of unfinished tasks of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] is None:
                left += 1
        return left

    def query_job_finish_time(self, job_id):
        """Query the latest finish time of all tasks."""
        job = self.query_job_by_id(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look if a later finish time exists
            if (task['completed'] is not None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time

    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME."""
        if ('buffers' in task) and (task['buffers'] != []):
            frame_nr = int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])
        else:
            frame_nr = 1
        return frame_nr

    def query_task_list(self, job_id):
        """Query a list of task objects of a certain job, sorted by frame number."""
        task_list = self.ip_client.db_query({'header.session': str(job_id)}, keys=self.all_task_query_keys)
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list

    def query_interrupted_task_list(self, job_id):
        """Query a list of interrupted tasks of a certain job, sorted by frame number."""
        job = self.query_job_by_id(job_id)
        task_list = self.ip_client.db_query({'header.session': str(job_id)}, keys=self.all_task_query_keys)
        interrupted_task_list = []
        for task in task_list:
            frame_nr = self.get_frame_nr(task)
            print("frame_nr: " + str(frame_nr))
            # look for the rendered output file of this frame
            if job['renderer'] == "blender":
                filesearch = job['scenefile'] + str("%04d" % frame_nr) + ".???"
                found = glob.glob(filesearch)
                # file was found
                if len(found) > 0:
                    outputfile = found[0]
                    print("outputfile: " + str(outputfile))
                    filesize = os.path.getsize(outputfile)
                    print(filesize)
                    # file exists, but is empty
                    if filesize == 0:
                        interrupted_task_list.append(task)
                # file was not found
                else:
                    outputfile = None
                    print("outputfile: " + str(outputfile))
                    if (task['completed'] is None) and (task['started'] is None):
                        interrupted_task_list.append(task)
            else:
                raise ValueError("Only Blender renderer supported so far.")
        return interrupted_task_list

    def query_task(self, task_id):
        """Query a single task."""
        task = self.ip_client.db_query({'msg_id': task_id}, keys=self.all_task_query_keys)[0]
        return task

    def query_computer_list(self):
        """Query a list of all computers."""
        return self.ip_client.ids

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        # disable job
        self.job_disable(job_id)
        tasks = self.query_task_list(job_id)
        tasks_to_stop = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_header" in task) and (task["result_header"] is not None) and (task["result_header"]["status"] == "ok"):
                print(" finished at " + str(task["completed"]))
            else:
                # get task stats of all computers
                stats = self.ip_client.queue_status('all', True)
                # check if task is already running on an engine
                found_on_engine = False
                for key, status in list(stats.items()):
                    if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                        # skip tasks which are already running on an engine
                        print(" not finished yet but already queued to engine. will leave it there.")
                        found_on_engine = True
                        break
                # if a task isn't already queued/running on an engine, it should be safe to abort it
                if not found_on_engine:
                    print(" not finished yet. will abort.")
                    tasks_to_stop.append(task['msg_id'])
        if len(tasks_to_stop) > 0:
            try:
                self.ip_client.abort(tasks_to_stop)
            except Exception as e:
                print("ERROR: " + str(e))
        return True

    def job_kill(self, job_id):
        """Stop job and all of its tasks whether running or not"""
        # disable job
        self.job_disable(job_id)
        tasks = self.query_task_list(job_id)
        running_engines = []
        tasks_to_stop = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key, status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            tasks_to_stop.append(task['msg_id'])
        # stop all matching tasks at once
        try:
            self.ip_client.abort(tasks_to_stop)
        except Exception as e:
            print("ERROR: " + str(e))
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True

    def job_disable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = False
        DrQueueJob.update_db(job)
        return True

    def job_enable(self, job_id):
        """Enable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = True
        DrQueueJob.update_db(job)
        return True

    def job_delete(self, job_id):
        """Delete job and all of its tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_computer_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task['msg_id'])
            self.ip_client.purge_results(task['msg_id'])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True

    def job_continue(self, job_id):
        """Continue stopped job and all of its tasks"""
        job = self.query_job_by_id(job_id)
        # enable job
        self.job_enable(job_id)
        tasks = self.query_task_list(job_id)
        tasks_to_resubmit = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_header" in task) and (task["result_header"] is not None) and (task["result_header"]["status"] == "ok"):
                print(" finished at " + str(task["completed"]))
            else:
                print(" not finished yet. will resubmit.")
                tasks_to_resubmit.append(task["msg_id"])
        if len(tasks_to_resubmit) > 0:
            # resubmit all matching msg_ids at once
            try:
                async_results = self.ip_client.resubmit(tasks_to_resubmit)
            except Exception as e:
                print("ERROR: " + str(e))
            # IPython hands out new msg_ids instead of re-using the old ones
            for msg_id in async_results.msg_ids:
                print("got new msg_id: " + msg_id)
            # delete old tasks which now have a resubmitted clone
            try:
                self.ip_client.purge_results(tasks_to_resubmit)
            except Exception as e:
                print("ERROR: " + str(e))
        return True

    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job_by_id(job_id)
        # enable job
        job['enabled'] = True
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)
        tasks = self.query_task_list(job_id)
        tasks_to_resubmit = []
        # get all msg_ids of job
        for task in tasks:
            tasks_to_resubmit.append(task["msg_id"])
        # resubmit all msg_ids at once
        try:
            async_results = self.ip_client.resubmit(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))
        # IPython hands out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)
        # delete old tasks which now have a resubmitted clone
        try:
            self.ip_client.purge_results(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))
        # kickstart all computers
        running_engines = []
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key, status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True

    def task_rerun(self, task_id):
        """Run task another time"""
        task = self.query_task(task_id)
        #print(task)
        # enable job
        #job['enabled'] = True
        # set resubmit time
        #job['requeue_time'] = datetime.datetime.now()
        #DrQueueJob.update_db(job)
        # resubmit msg_id of task
        try:
            async_results = self.ip_client.resubmit(task["msg_id"])
        except Exception as e:
            print("ERROR: " + str(e))
        # IPython hands out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)
        # delete old task which now has a resubmitted clone
        try:
            self.ip_client.purge_results(task["msg_id"])
        except Exception as e:
            print("ERROR: " + str(e))
        # kickstart all computers
        running_engines = []
        stats = self.ip_client.queue_status('all', True)
        # check if task is already running on an engine
        for key, status in list(stats.items()):
            if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True

    def job_rerun_interrupted_tasks(self, job_id):
        """Run interrupted tasks of job another time"""
        job = self.query_job_by_id(job_id)
        # enable job
        job['enabled'] = True
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)
        tasks = self.query_interrupted_task_list(job_id)
        if len(tasks) == 0:
            return True
        tasks_to_resubmit = []
        # get all msg_ids of job
        for task in tasks:
            tasks_to_resubmit.append(task["msg_id"])
        # resubmit all msg_ids at once
        try:
            async_results = self.ip_client.resubmit(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))
        # IPython hands out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)
        # delete old tasks which now have a resubmitted clone
        try:
            self.ip_client.purge_results(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))
        # kickstart all computers
        running_engines = []
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key, status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True

    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task['completed'] is None:
                status_pending += 1
            else:
                if 'result_content' in list(task.keys()):
                    result_content = task['result_content']
                    # look for done tasks
                    if ('status' in list(result_content.keys())) and (result_content['status'] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task has unknown status, job status is unknown
        if status_unknown > 0:
            status = "unknown"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status

    def job_estimated_finish_time(self, job_id):
        """Calculate estimated finish time of job."""
        tasks = self.query_task_list(job_id)
        spent_times = []
        # get spent time for each finished task
        for task in tasks:
            if task['completed'] is not None:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        timediff = task['completed'] - task['started']
                        spent_times.append(timediff)
        if len(spent_times) > 0:
            # calculate sum of spent time
            sum_times = datetime.timedelta(0)
            for spent in spent_times:
                sum_times += spent
            # calculate mean time for a single task
            sum_times_secs = sum_times.days * 86400 + sum_times.seconds
            meantime_secs = sum_times_secs / len(spent_times)
            meantime = datetime.timedelta(0, meantime_secs)
            # calculate estimated time left
            tasks_left = len(tasks) - len(spent_times)
            time_left = tasks_left * meantime
            # query job object
            job = self.query_job_by_id(job_id)
            # look if all tasks are already done
            if self.query_job_tasks_left(job_id) == 0:
                finish_time = self.query_job_finish_time(job_id)
            else:
                # calculate estimated finish time, use requeue time if available
                if ('requeue_time' in job) and (job['requeue_time'] != False):
                    finish_time = job['requeue_time'] + time_left
                else:
                    finish_time = job['submit_time'] + time_left
        else:
            meantime = "unknown"
            time_left = "unknown"
            finish_time = "unknown"
        return meantime, time_left, finish_time

    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db_by_engine_id(engine_id)
        # we stop the engine
        try:
            self.ip_client.shutdown(engine_id, False, False, True)
        except Exception:
            return False
        return True
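# --- Usage sketch (not part of the original source): how the Client above
# might be driven end to end. The job dict mirrors the keys read by
# job_run(); the concrete values are illustrative assumptions.
if __name__ == "__main__":
    client = Client()
    job = {
        'name': 'testjob', 'owner': 'renderuser', 'retries': 2,
        'startframe': 1, 'endframe': 10, 'blocksize': 1,
        'renderer': 'blender', 'scenefile': '/tmp/scene.blend',
        'created_with': 'commandline', 'send_email': False, 'limits': {},
    }
    if client.job_run(job):
        job_id = client.query_job_by_name('testjob')['_id']
        print(client.job_status(job_id))
        print(client.job_estimated_finish_time(job_id))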
import csv
import gc
import time

# NOTE: fragment. The surrounding script defines rc (an IPython.parallel
# Client), dview, clean_wrapper (a parallel map wrapper), plus namefiles,
# namefile_path, fieldnames and block_size earlier on. The def line and loop
# header of gen_chunks are reconstructed (assumed); the fragment began
# mid-function.
def gen_chunks(reader, chunksize):
    """Yield successive chunks of rows from a reader."""
    chunk = []
    for i, line in enumerate(reader):
        if i % chunksize == 0 and i > 0:
            yield chunk
            del chunk[:]
        chunk.append(line)
    yield chunk

for f in namefiles:
    print(f)
    total_chunks = 0
    full_path = namefile_path + "/" + f
    full_output_path = full_path + ".namestd"
    output_conn = open(full_output_path, "wt")
    output_writer = csv.DictWriter(output_conn, fieldnames=fieldnames)
    with open(full_path, "rt") as namefile:
        reader = csv.DictReader(namefile, fieldnames=fieldnames)
        for process_chunk in gen_chunks(reader, chunksize=block_size):
            t0 = time.time()
            out = clean_wrapper.map(process_chunk)
            output_writer.writerows(out)
            t1 = time.time()
            total_chunks += 1
            del out[:]
            print(total_chunks, total_chunks * block_size, (t1 - t0) / block_size)
            if total_chunks % 10 == 0 and total_chunks > 0:
                ## Clean out cached objects on the clients
                rc.purge_results(targets=rc.ids)
                dview.results.clear()
                rc.results.clear()
                gc.collect()
    output_conn.close()
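# --- Quick sanity check for the chunker above (not from the original source;
# meant to be run in isolation, no cluster needed). Chunks of 3 over 8 items
# should yield [0, 1, 2], [3, 4, 5], [6, 7].
if __name__ == "__main__":
    for piece in gen_chunks(range(8), chunksize=3):
        print(piece)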
class Client(object):
    """DrQueue client actions"""

    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()
        # enable tracking
        self.lbview.track = True

    def job_run(self, job):
        """Create and queue tasks from job object"""
        # check job name
        if job['name'] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job['name'])
        # save job in database
        job_id = DrQueueJob.store_db(job)
        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)
        # set owner of job
        self.ip_client.session.username = job['owner']
        # set number of retries for each task
        self.lbview.retries = job['retries']
        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] is not None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids
        # check frame numbers
        if job['startframe'] < 1:
            raise ValueError("Invalid value for startframe. Has to be equal or greater than 1.")
        if job['endframe'] < 1:
            raise ValueError("Invalid value for endframe. Has to be equal or greater than 1.")
        if job['endframe'] < job['startframe']:
            raise ValueError("Invalid value for endframe. Has to be equal or greater than startframe.")
        if job['endframe'] > job['startframe']:
            if job['endframe'] - job['startframe'] < job['blocksize']:
                raise ValueError("Invalid value for blocksize. Has to be equal or lower than endframe-startframe.")
        if job['endframe'] == job['startframe']:
            if job['blocksize'] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal to 1 if endframe equals startframe.")
        task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize']))
        ar = None
        for x in task_frames:
            # prepare script input
            env_dict = {
                'DRQUEUE_FRAME': x,
                'DRQUEUE_BLOCKSIZE': job['blocksize'],
                'DRQUEUE_ENDFRAME': job['endframe'],
                'DRQUEUE_SCENEFILE': job['scenefile'],
                'DRQUEUE_LOGFILE': job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] - 1) + ".log"
            }
            # optional job keys are passed through as DRQUEUE_* environment variables
            optional_keys = [
                'renderdir', 'projectdir', 'configdir', 'imagefile', 'precommand',
                'renderer', 'fileformat', 'postcommand', 'viewcommand', 'worldfile',
                'terrainfile', 'composition', 'camera', 'resx', 'resy', 'renderpass',
                'rendertype', 'fileextension', 'stepframe', 'custom_bucket',
                'bucketsize', 'custom_lod', 'lod', 'custom_varyaa', 'varyaa',
                'raytrace', 'antialias', 'custom_bdepth', 'bdepth', 'custom_zdepth',
                'zdepth', 'custom_cracks', 'cracks', 'custom_quality', 'quality',
                'custom_qfiner', 'qfiner', 'custom_smultiplier', 'smultiplier',
                'custom_mpcache', 'mpcache', 'custom_smpolygon', 'smpolygon',
                'custom_wh', 'custom_type', 'ctype', 'skipframes'
            ]
            for key in optional_keys:
                if key in job:
                    env_dict['DRQUEUE_' + key.upper()] = job[key]
            # set dependencies
            dep_dict = {}
            if ('os' in job['limits']) and (job['limits']['os'] is not None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name' in job['limits']) and (job['limits']['pool_name'] is not None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)
            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()
        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True

    def identify_computer(self, engine_id, cache_time):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine is not None) and (now <= engine['date'] + cache_time):
            print("DEBUG: Engine %i was found in DB" % engine_id)
        # store new info
        else:
            print("DEBUG: Engine %i was not found in DB" % engine_id)
            # run command only on specific computer
            dview = self.ip_client[engine_id]
            dview.block = True
            dview.execute("import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer(" + str(engine_id) + ")")
            engine = dview['engine']
            engine['date'] = int(time.time())
            DrQueueComputer.store_db(engine)
        return engine

    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar

    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()

    def query_running_job_list(self):
        """Query a list of all running jobs"""
        jobs = DrQueueJob.query_job_list()
        running_jobs = []
        for job in jobs:
            if self.query_job_tasks_left(job['_id']) > 0:
                running_jobs.append(job)
        return running_jobs

    def query_jobname(self, task_id):
        """Query jobname from task id"""
        data = self.ip_client.db_query({"msg_id": task_id})
        job_id = data[0]['header']['session']
        job = DrQueueJob.query_db(job_id)
        return job.name

    def query_job(self, job_id):
        """Query job from id"""
        return DrQueueJob.query_db(job_id)

    def query_job_by_name(self, job_name):
        """Query job from name"""
        return DrQueueJob.query_job_by_name(job_name)

    def query_job_tasks_left(self, job_id):
        """Query number of unfinished tasks of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] is None:
                left += 1
        return left

    def query_job_finish_time(self, job_id):
        """Query the latest finish time of all tasks."""
        job = self.query_job(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look if a later finish time exists
            if (task['completed'] is not None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time

    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME."""
        return int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])

    def query_task_list(self, job_id):
        """Query a list of task objects of a certain job, sorted by frame number."""
        task_list = self.ip_client.db_query({'header.session': str(job_id)})
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list

    def query_task(self, task_id):
        """Query a single task"""
        task = self.ip_client.db_query({'msg_id': task_id})[0]
        return task

    def query_engine_list(self):
        """Query a list of all engines"""
        return self.ip_client.ids

    def query_engines_of_pool(self, pool_name):
        """Return available engines of certain pool."""
        pool_computers = list(self.ip_client.ids)
        if pool_name is not None:
            computers = DrQueueComputerPool.query_pool_members(pool_name)
            if computers is None:
                raise ValueError("Pool \"%s\" does not exist!" % pool_name)
            # iterate over a copy so removing entries is safe
            for comp in list(pool_computers):
                if comp not in computers:
                    pool_computers.remove(comp)
            if pool_computers == []:
                raise ValueError("No computer of pool %s is available!" % pool_name)
            print("DEBUG: matching pool: " + pool_name)
            print(pool_computers)
        return pool_computers

    def query_engines_of_os(self, os_name):
        """Return only engines running certain OS."""
        # run job only on matching os
        matching_os = list(self.ip_client.ids)
        if os_name is not None:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                # os string has to contain os_name
                if os_name not in engine['os']:
                    matching_os.remove(engine_id)
            print("DEBUG: matching os: " + os_name)
            print(matching_os)
        return matching_os

    def query_engines_with_minram(self, minram):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching minram
        matching_minram = list(self.ip_client.ids)
        if minram > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine['memory'] < minram:
                    matching_minram.remove(engine_id)
            print("DEBUG: matching minram: " + str(minram))
            print(matching_minram)
        return matching_minram

    def query_engines_with_mincores(self, mincores):
        """Return only engines with at least mincores CPU cores."""
        # run job only on matching mincores
        matching_mincores = list(self.ip_client.ids)
        if mincores > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine['ncorescpu'] * engine['ncpus'] < mincores:
                    matching_mincores.remove(engine_id)
            print("DEBUG: matching mincores: " + str(mincores))
            print(matching_mincores)
        return matching_mincores

    def match_all_limits(self, os_list, minram_list, mincores_list, pool_list):
        """Match all limits for job."""
        tmp_list = []
        # build list with all list members
        tmp_list.extend(os_list)
        tmp_list.extend(minram_list)
        tmp_list.extend(mincores_list)
        tmp_list.extend(pool_list)
        # make entries unique
        tmp_list = list(set(tmp_list))
        matching_limits = []
        for entry in tmp_list:
            # look if entry is in all lists
            if (entry in os_list) and (entry in minram_list) and (entry in mincores_list) and (entry in pool_list):
                matching_limits.append(entry)
            else:
                print("DEBUG: %i isn't matching limits" % entry)
        print("DEBUG: matching limits:")
        print(matching_limits)
        if len(matching_limits) == 0:
            message = "No engine meets the requirements."
            print(message)
            raise Exception(message)
        elif len(matching_limits) > 0:
            # only run on matching engines
            self.lbview = self.ip_client.load_balanced_view(matching_limits)
        else:
            self.lbview = self.ip_client.load_balanced_view()

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        tasks = self.query_task_list(job_id)
        # abort all queued tasks
        for task in tasks:
            self.ip_client.abort(task['msg_id'])
        return True

    def job_kill(self, job_id):
        """Stop job and all of its tasks whether running or not"""
        tasks = self.query_task_list(job_id)
        running_engines = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key, status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            self.ip_client.abort(task['msg_id'])
        # restart all engines which still run a task
        running_engines = set(running_engines)
        return True

    def job_delete(self, job_id):
        """Delete job and all of its tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_engine_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task['msg_id'])
            self.ip_client.purge_results(task['msg_id'])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True

    def task_continue(self, task_id):
        """Continue aborted or failed task"""
        task = self.query_task(task_id)
        # check if action is needed
        if (task['completed'] is not None) and ((task['result_header']['status'] == "error") or (task['result_header']['status'] == "aborted")):
            self.task_requeue(task_id)
        return True

    def task_requeue(self, task_id):
        """Requeue task"""
        self.ip_client.resubmit(task_id)
        print("requeuing %s" % task_id)
        return True

    def job_continue(self, job_id):
        """Continue stopped job and all of its tasks"""
        job = self.query_job(job_id)
        tasks = self.query_task_list(job_id)
        # continue tasks
        for task in tasks:
            self.task_continue(task['msg_id'])
        return True

    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job(job_id)
        tasks = self.query_task_list(job_id)
        # rerun tasks
        for task in tasks:
            self.task_requeue(task['msg_id'])
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)
        return True

    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task['completed'] is None:
                status_pending += 1
            else:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    # look for done tasks
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status

    def job_estimated_finish_time(self, job_id):
        """Calculate estimated finish time of job."""
        tasks = self.query_task_list(job_id)
        spent_times = []
        # get spent time for each finished task
        for task in tasks:
            if task['completed'] is not None:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        timediff = task['completed'] - task['started']
                        spent_times.append(timediff)
        if len(spent_times) > 0:
            # calculate sum of spent time
            sum_times = datetime.timedelta(0)
            for spent in spent_times:
                sum_times += spent
            # calculate mean time for a single task
            meantime = sum_times / len(spent_times)
            # calculate estimated time left
            tasks_left = len(tasks) - len(spent_times)
            time_left = tasks_left * meantime
            # query job object
            job = self.query_job(job_id)
            # look if all tasks are already done
            if self.query_job_tasks_left(job_id) == 0:
                finish_time = self.query_job_finish_time(job_id)
            else:
                # calculate estimated finish time, use requeue time if available
                if ('requeue_time' in job) and (job['requeue_time'] != False):
                    finish_time = job['requeue_time'] + time_left
                else:
                    finish_time = job['submit_time'] + time_left
        else:
            meantime = "unknown"
            time_left = "unknown"
            finish_time = "unknown"
        return meantime, time_left, finish_time

    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db(engine_id)
        # shutdown computer
        self.ip_client.shutdown(engine_id)
        return True

    def engine_restart(self, engine_id):
        """Restart a specific engine"""
        self.ip_client.shutdown(engine_id, True, False, True)
        return True
def run(self):
    if self.session.get_client() == None:
        self.mylog.error("Not connected to a cluster.")
        return False
    # workaround for an IPython bug which makes everything slow:
    # create a new client, use it and delete it
    c = Client(profile='ssh')
    jcmd = self.session.opts.get_opt('jcmd')
    if jcmd == 'purge':
        num = 0
        query = c.db_query({'completed': {'$ne': None}}, ['msg_id'])
        for q in query:
            result = c.get_result(q['msg_id']).get()
            # filter on SB, node, task
            if self._check_result(result):
                num += 1
                c.purge_results(q['msg_id'])
        mylogger.userinfo(self.mylog, str(num) + " cluster's hub results deleted.")
    elif jcmd == 'list':
        num = 0
        # query the hub DB for all the finished tasks and get IDs
        query = c.db_query({'completed': {'$ne': None}}, ['msg_id', 'completed', 'started'])
        # search for interesting results and print them
        for q in query:
            result = c.get_result(q['msg_id']).get()
            # filter on SB, node, task
            if self._check_result(result):
                # skip results without error if wanted
                if self.session.opts.get_opt('onlyerr') and result['err'] == '':
                    continue
                num += 1
                header = {'Task': result['task'], 'Node': result['node'],
                          'SB': result['SB'],
                          'Completed': q['completed'].replace(microsecond=0),
                          'Started': q['started'].replace(microsecond=0),
                          'Exec time': q['completed'].replace(microsecond=0) - q['started'].replace(microsecond=0)}
                data = {'Std Output': result['out'], 'Std Error': result['err'],
                        'Command': result['command']}
                print_jobs(header, data, self.session.opts.get_opt('lines'))
        mylogger.userinfo(self.mylog, str(num) + " processes listed.")
    elif jcmd == 'running':
        num_r = 0
        num_q = 0
        # TODO: it should be "started" not "submitted", unfortunately IPython does not set it
        query = c.db_query({'completed': None}, ['buffers', 'engine_uuid', 'submitted'])
        for q in query:
            # unpack the buffer of the sent jobs to obtain the arguments
            null, com, args = unpack_apply_message(q['buffers'])
            # filter on SB, node, task
            if self._check_result({'node': args['node'], 'SB': args['SB'], 'task': args['task']}):
                if q['engine_uuid'] == None:
                    if self.session.opts.get_opt('queue') == False:
                        continue
                    q['msg_id'] = q['msg_id'] + " (queue)"
                    num_q += 1
                else:
                    num_r += 1
                header = {'Msg_id': q['msg_id'], 'Task': args['task'],
                          'Node': args['node'], 'SB': args['SB'],
                          'Started': q['submitted'].replace(microsecond=0),
                          'Extime': datetime.datetime.now().replace(microsecond=0) - q['submitted'].replace(microsecond=0)}
                data = {'Command': com[0]}
                print_jobs(header, data, self.session.opts.get_opt('lines'))
        mylogger.userinfo(self.mylog, "Processes running: " + str(num_r) + ". In queue: " + str(num_q) + ".")
    elif jcmd == 'kill':
        print("TBI")
        # TODO: add a resubmit option to resubmit all tasks that failed
        # http://ipython.org/ipython-doc/stable/parallel/parallel_task.html
    del c
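# --- Hedged sketch (not part of the original source): the short-lived-client
# cleanup pattern used in run() above, factored into a standalone helper.
# The 'ssh' profile name comes from run(); everything else is an assumption.
def purge_finished_results(profile='ssh'):
    c = Client(profile=profile)
    # fetch msg_ids of all finished tasks from the hub database
    finished = c.db_query({'completed': {'$ne': None}}, ['msg_id'])
    # purge their cached results in one call, then drop the client
    c.purge_results([q['msg_id'] for q in finished])
    del c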
class Load_balanced_view(object):
    """Class that initialises an IPython.parallel load_balanced_view,
    performing some checks. It also allows executing python commands on all
    engines, submitting pieces of code, and printing a progress log. A
    cleanup function is provided.
    """

    def __init__(self, client=None, profile='default'):
        """Start a load_balanced_view from IPython.parallel.
        If a client is not given, checks are run to see if ipcluster exists
        and engines are running. If neither is the case, the computation is
        switched to serial. Otherwise the client and the load balanced view
        are initialised.

        Parameters
        ----------
        *client*: an IPython parallel client
            if *None* a new object is created
        *profile*: IPython profile. Used if *client* is *None*
        """
        self.do_parallel = True  # everything ok
        try:
            # create or adopt the client
            if client is not None:
                self.c = client
            else:
                self.c = Client(profile=profile)
            self.engines_id = self.c.ids  # get the ids of the engines
            self.dview = self.c[:]
            self.lbview = self.c.load_balanced_view()  # load balanced view
        except ImportError:  # if the import fails
            print("""IPython.parallel.Client cannot be imported. Make sure to have IPython version > 0.11 installed.""")
            self.do_parallel = self._continue_serial()
        except error.NoEnginesRegistered:  # if no engines are running
            print("""The IPython cluster has not been started. Start it before executing the code, e.g. 'ipcluster start --n=4'.""")
            self.do_parallel = self._continue_serial()

    def _continue_serial(self):
        """Ask if the user wants to continue in serial mode or quit"""
        import io_custom as sio
        message = "Do you want to continue in serial mode"
        if sio.yes_or_not(message, 'y'):
            return False  # disable the parallel computation
        else:
            exit()

    def is_parallel_enabled(self):
        """Return *True* if the initialisation went fine, otherwise *False*

        output
        ------
        *parallel*: bool
            *True* if the parallel environment has been set up without
            problems, *False* otherwise
        """
        return self.do_parallel

    def exec_on_engine(self, code, block=True):
        """Execute the given code on all engines

        Parameters
        ----------
        code: string or list of strings
            command(s) to execute on all the nodes. Thought for short tasks,
            like importing modules
        block: bool
            whether or not to wait until done to return. default: True
        """
        # six: Python 2 and 3 compatibility library
        from six import string_types  # appropriate string type
        if isinstance(code, string_types):  # if it's a string
            code = [code, ]  # convert to list
        # execute the required commands
        # (better to do in block mode, avoids errors if command is slow)
        for te in code:
            try:
                self.dview.execute(te, block=block)
            except error.CompositeError as e:
                # if an error occurs, print a single one, not one per engine
                e.raise_exception()

    def push(self, variables):
        """Wrapper around dview.push(dict):
        push a list of variables to the ipython engines

        Parameters
        ----------
        variables: dictionary
            dictionary of variables
        """
        self.dview.push(variables)

    def apply(self, f, *args, **kwargs):
        """Wrapper around 'lbview.apply(self, f, *args, **kwargs)'

        Docstring:
        calls f(*args, **kwargs) on remote engines, returning the result.
        This method sets all apply flags via this View's attributes.
        if self.block is False:
            returns AsyncResult
        else:
            returns actual result of f(*args, **kwargs)
        """
        return self.lbview.apply(f, *args, **kwargs)

    def get_queue_status(self):
        """Get the status of the queue"""
        return self.lbview.queue_status()

    def advancement_jobs(self, jobs, update=30, init_status=None):
        """Print the advancement of the jobs in the queue.
        This function returns when all jobs are finished.

        Parameters
        ----------
        jobs: list of AsyncResult objects
            list of jobs submitted to the task scheduler
        update: float or int
            update the status every 'update' seconds. If negative, only the
            initial and final status are written
        init_status: dict
            dictionary returned from load_balanced_view.queue_status(). If
            given, the number of jobs per processor is printed
        """
        import numpy as np
        tot_jobs = len(jobs)
        # start message
        print("Starting {0} jobs using {1} engines".format(tot_jobs, len(self.engines_id)))
        if update > 0:  # if: advancement status
            import io_custom as sio
            while not self.wait(jobs=jobs, timeout=update):
                status = self.get_queue_status()
                # get the number of running jobs
                totrunning = np.sum([status[i]['tasks'] for i in self.engines_id])
                tot_torun = status['unassigned']
                already_run = tot_jobs - (totrunning + tot_torun)
                percentage_run = already_run / float(tot_jobs)
                # print the status message
                message = "{0:.1%} done. {1} finished, {2} running, {3} pending.".format(percentage_run, already_run, totrunning, tot_torun)
                sio.printer(message)
            # end while not self.wait( ... )
            sio.printer("Finished")
        else:  # else if: advancement status
            self.wait(jobs=jobs)  # wait until it finishes
            print("Finished")
        # end if: advancement status
        print("")
        # if details about the jobs per processor are wanted
        if init_status is not None:
            final_status = self.get_queue_status()  # get the final status
            print("{0:<5}: # processes".format("id"))
            for i in self.engines_id:
                print("{0:<5}: {1}".format(i, final_status[i]['completed'] - init_status[i]['completed']))
    # end def advancement_jobs( ... )

    def wait(self, jobs=None, timeout=-1):
        """Wrapper around lbview.wait(self, jobs=None, timeout=-1):
        waits on one or more 'jobs', for up to 'timeout' seconds.

        Parameters
        ----------
        jobs : int, str, or list of ints and/or strs, or one or more
               AsyncResult objects
            ints are indices to self.history
            strs are msg_ids
            default: wait on all outstanding messages
        timeout : float
            a time in seconds, after which to give up. default is -1,
            which means no timeout

        Returns
        -------
        True : when all msg_ids are done
        False : timeout reached, some msg_ids still outstanding
        """
        return self.lbview.wait(jobs=jobs, timeout=timeout)

    def clear_cache(self):
        """Clear the cache of the parallel computation to avoid memory overload.
        from: http://mail.scipy.org/pipermail/ipython-user/2012-December/011874.html
        check if something like this will be implemented eventually
        """
        self.c.purge_results('all')  # clears controller
        self.c.results.clear()
        self.c.metadata.clear()
        self.dview.results.clear()
        self.lbview.results.clear()
        assert not self.c.outstanding, "don't clear history when tasks are outstanding"
        self.c.history = []
        self.dview.history = []
        self.lbview.history = []
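# --- Hedged usage sketch (not part of the original source) for
# Load_balanced_view; assumes a running ipcluster, e.g. 'ipcluster start --n=4'.
def _square(x):
    return x * x

if __name__ == "__main__":
    lbv = Load_balanced_view()
    if lbv.is_parallel_enabled():
        lbv.exec_on_engine("import math")      # run an import on every engine
        init_status = lbv.get_queue_status()   # snapshot for per-engine counts
        jobs = [lbv.apply(_square, i) for i in range(100)]
        lbv.advancement_jobs(jobs, update=10, init_status=init_status)
        lbv.clear_cache()                      # free hub/client memory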
class Client: """DrQueue client actions""" def __init__(self): # initialize IPython try: self.ip_client = IPClient() except Exception: raise Exception("Could not connect to IPython controller.") self.lbview = self.ip_client.load_balanced_view() # enable tracking self.lbview.track = True def job_run(self, job): """Create and queue tasks from job object""" # check job name if job["name"] in DrQueueJob.query_jobnames(): raise ValueError("Job name %s is already used!" % job["name"]) return False # run job only on matching os os_list = self.query_engines_of_os(job["limits"]["os"]) # run job only on matching minram minram_list = self.query_engines_with_minram(job["limits"]["minram"]) # run job only on matching mincores mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"]) # check pool members pool_list = self.query_engines_of_pool(job["limits"]["pool"]) # check limits self.match_all_limits(os_list, minram_list, mincores_list, pool_list) # save job in database job_id = DrQueueJob.store_db(job) # job_id from db is be used as session name self.ip_client.session.session = str(job_id) # set owner of job self.ip_client.session.username = job["owner"] # set number of retries for each task self.lbview.retries = job["retries"] # depend on another job (it's tasks) if ("depend" in job["limits"]) and (job["limits"]["depend"] != None): depend_job = self.query_job_by_name(job["limits"]["depend"]) depend_tasks = self.query_task_list(depend_job["_id"]) task_ids = [] for task in depend_tasks: task_ids.append(task["msg_id"]) self.lbview.after = task_ids # check frame numbers if not (job["startframe"] >= 1): raise ValueError("Invalid value for startframe. Has to be equal or greater than 1.") return False if not (job["endframe"] >= 1): raise ValueError("Invalid value for endframe. Has to be equal or greater than 1.") return False if not (job["endframe"] >= job["startframe"]): raise ValueError("Invalid value for endframe. Has be to equal or greater than startframe.") return False if job["endframe"] > job["startframe"]: if not (job["endframe"] - job["startframe"] >= job["blocksize"]): raise ValueError("Invalid value for blocksize. Has to be equal or lower than endframe-startframe.") return False if job["endframe"] == job["startframe"]: if job["blocksize"] != 1: raise ValueError("Invalid value for blocksize. 
Has to be equal 1 if endframe equals startframe.") return False task_frames = range(job["startframe"], job["endframe"] + 1, job["blocksize"]) for x in task_frames: # prepare script input env_dict = { "DRQUEUE_FRAME": x, "DRQUEUE_BLOCKSIZE": job["blocksize"], "DRQUEUE_ENDFRAME": job["endframe"], "DRQUEUE_SCENEFILE": job["scenefile"], "DRQUEUE_LOGFILE": job["name"] + "-" + str(x) + "_" + str(x + job["blocksize"] - 1) + ".log", } # optional elements if "renderdir" in job: env_dict["DRQUEUE_RENDERDIR"] = job["renderdir"] if "projectdir" in job: env_dict["DRQUEUE_PROJECTDIR"] = job["projectdir"] if "configdir" in job: env_dict["DRQUEUE_CONFIGDIR"] = job["configdir"] if "imagefile" in job: env_dict["DRQUEUE_IMAGEFILE"] = job["imagefile"] if "precommand" in job: env_dict["DRQUEUE_PRECOMMAND"] = job["precommand"] if "renderer" in job: env_dict["DRQUEUE_RENDERER"] = job["renderer"] if "fileformat" in job: env_dict["DRQUEUE_FILEFORMAT"] = job["fileformat"] if "postcommand" in job: env_dict["DRQUEUE_POSTCOMMAND"] = job["postcommand"] if "viewcommand" in job: env_dict["DRQUEUE_VIEWCOMMAND"] = job["viewcommand"] if "worldfile" in job: env_dict["DRQUEUE_WORLDFILE"] = job["worldfile"] if "terrainfile" in job: env_dict["DRQUEUE_TERRAINFILE"] = job["terrainfile"] if "composition" in job: env_dict["DRQUEUE_COMPOSITION"] = job["composition"] if "camera" in job: env_dict["DRQUEUE_CAMERA"] = job["camera"] if "resx" in job: env_dict["DRQUEUE_RESX"] = job["resx"] if "resy" in job: env_dict["DRQUEUE_RESY"] = job["resy"] if "renderpass" in job: env_dict["DRQUEUE_RENDERPASS"] = job["renderpass"] if "rendertype" in job: env_dict["DRQUEUE_RENDERTYPE"] = job["rendertype"] if "fileextension" in job: env_dict["DRQUEUE_FILEEXTENSION"] = job["fileextension"] if "stepframe" in job: env_dict["DRQUEUE_STEPFRAME"] = job["stepframe"] if "custom_bucket" in job: env_dict["DRQUEUE_CUSTOM_BUCKET"] = job["custom_bucket"] if "bucketsize" in job: env_dict["DRQUEUE_BUCKETSIZE"] = job["bucketsize"] if "custom_lod" in job: env_dict["DRQUEUE_CUSTOM_LOD"] = job["custom_lod"] if "lod" in job: env_dict["DRQUEUE_LOD"] = job["lod"] if "custom_varyaa" in job: env_dict["DRQUEUE_CUSTOM_VARYAA"] = job["custom_varyaa"] if "varyaa" in job: env_dict["DRQUEUE_VARYAA"] = job["varyaa"] if "raytrace" in job: env_dict["DRQUEUE_RAYTRACE"] = job["raytrace"] if "antialias" in job: env_dict["DRQUEUE_ANTIALIAS"] = job["antialias"] if "custom_bdepth" in job: env_dict["DRQUEUE_CUSTOM_BDEPTH"] = job["custom_bdepth"] if "bdepth" in job: env_dict["DRQUEUE_BDEPTH"] = job["bdepth"] if "custom_zdepth" in job: env_dict["DRQUEUE_CUSTOM_ZDEPTH"] = job["custom_zdepth"] if "zdepth" in job: env_dict["DRQUEUE_ZDEPTH"] = job["zdepth"] if "custom_cracks" in job: env_dict["DRQUEUE_CUSTOM_CRACKS"] = job["custom_cracks"] if "cracks" in job: env_dict["DRQUEUE_CRACKS"] = job["cracks"] if "custom_quality" in job: env_dict["DRQUEUE_CUSTOM_QUALITY"] = job["custom_quality"] if "quality" in job: env_dict["DRQUEUE_QUALITY"] = job["quality"] if "custom_qfiner" in job: env_dict["DRQUEUE_CUSTOM_QFINER"] = job["custom_qfiner"] if "qfiner" in job: env_dict["DRQUEUE_QFINER"] = job["qfiner"] if "custom_smultiplier" in job: env_dict["DRQUEUE_CUSTOM_SMULTIPLIER"] = job["custom_smultiplier"] if "smultiplier" in job: env_dict["DRQUEUE_SMULTIPLIER"] = job["smultiplier"] if "custom_mpcache" in job: env_dict["DRQUEUE_CUSTOM_MPCACHE"] = job["custom_mpcache"] if "mpcache" in job: env_dict["DRQUEUE_MPCACHE"] = job["mpcache"] if "custom_smpolygon" in job: env_dict["DRQUEUE_CUSTOM_SMPOLYGON"] = 
job["custom_smpolygon"] if "smpolygon" in job: env_dict["DRQUEUE_SMPOLYGON"] = job["smpolygon"] if "custom_wh" in job: env_dict["DRQUEUE_CUSTOM_WH"] = job["custom_wh"] if "custom_type" in job: env_dict["DRQUEUE_CUSTOM_TYPE"] = job["custom_type"] if "ctype" in job: env_dict["DRQUEUE_CTYPE"] = job["ctype"] if "skipframes" in job: env_dict["DRQUEUE_SKIPFRAMES"] = job["skipframes"] # run task on cluster render_script = DrQueue.get_rendertemplate(job["renderer"]) ar = self.lbview.apply(DrQueue.run_script_with_env, render_script, env_dict) # wait for pyzmq send to complete communication (avoid race condition) ar.wait_for_send() return True def identify_computer(self, engine_id, cache_time): """Gather information about computer""" # look if engine info is already stored engine = DrQueueComputer.query_db(engine_id) now = int(time.time()) # check existence and age of info if (engine != None) and (now <= engine["date"] + cache_time): print ("DEBUG: Engine %i was found in DB" % engine_id) # store new info else: print ("DEBUG: Engine %i was not found in DB" % engine_id) # run command only on specific computer dview = self.ip_client[engine_id] dview.block = True dview.execute( "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer(" + str(engine_id) + ")" ) engine = dview["engine"] engine["date"] = int(time.time()) DrQueueComputer.store_db(engine) return engine def task_wait(self, task_id): """Wait for task to finish""" ar = self.ip_client.get_result(task_id) ar.wait_for_send() ar.wait() return ar def query_job_list(self): """Query a list of all jobs""" return DrQueueJob.query_job_list() def query_running_job_list(self): """Query a list of all running jobs""" jobs = DrQueueJob.query_job_list() running_jobs = [] for job in jobs: if self.query_job_tasks_left(job["_id"]) > 0: running_jobs.append(job) return running_jobs def query_jobname(self, task_id): """Query jobname from task id""" data = self.ip_client.db_query({"msg_id": task_id}) job_id = data[0]["header"]["session"] job = DrQueueJob.query_db(job_id) return job.name def query_job(self, job_id): """Query job from id""" return DrQueueJob.query_db(job_id) def query_job_by_name(self, job_name): """Query job from name""" return DrQueueJob.query_job_by_name(job_name) def query_job_tasks_left(self, job_id): """Query left frames of job""" left = 0 tasks = self.query_task_list(job_id) for task in tasks: if task["completed"] == None: left += 1 return left def query_task_list(self, job_id): """Query a list of tasks objects of certain job""" return self.ip_client.db_query({"header.session": str(job_id)}) def query_task(self, task_id): """Query a single task""" task = self.ip_client.db_query({"msg_id": task_id})[0] return task def query_engine_list(self): """Query a list of all engines""" return self.ip_client.ids def query_engines_of_pool(self, pool_name): """Return available engines of certain pool.""" pool_computers = self.ip_client.ids if pool_name != None: computers = DrQueueComputerPool.query_pool_members(pool_name) if computers == None: raise ValueError('Pool "%s" is not existing!' % pool_name) return False for comp in pool_computers: if not comp in computers: pool_computers.remove(comp) if pool_computers == []: raise ValueError("No computer of pool %s is available!" 
            print("DEBUG: matching pool: " + pool_name)
            print(pool_computers)
        return pool_computers

    def query_engines_of_os(self, os_name):
        """Return only engines running a certain OS."""
        # run job only on matching os
        matching_os = list(self.ip_client.ids)
        if os_name is not None:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                # os string has to contain os_name
                if os_name not in engine["os"]:
                    matching_os.remove(engine_id)
            print("DEBUG: matching os: " + os_name)
            print(matching_os)
        return matching_os

    def query_engines_with_minram(self, minram):
        """Return only engines with at least minram GB of RAM."""
        # run job only on matching minram
        matching_minram = list(self.ip_client.ids)
        if minram > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["memory"] < minram:
                    matching_minram.remove(engine_id)
            print("DEBUG: matching minram: " + str(minram))
            print(matching_minram)
        return matching_minram

    def query_engines_with_mincores(self, mincores):
        """Return only engines with at least mincores CPU cores."""
        # run job only on matching mincores
        matching_mincores = list(self.ip_client.ids)
        if mincores > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["ncorescpu"] * engine["ncpus"] < mincores:
                    matching_mincores.remove(engine_id)
            print("DEBUG: matching mincores: " + str(mincores))
            print(matching_mincores)
        return matching_mincores

    def match_all_limits(self, os_list, minram_list, mincores_list, pool_list):
        """Match all limits for job and configure a fitting load-balanced view."""
        # build list with all list members and make entries unique
        tmp_list = list(set(os_list + minram_list + mincores_list + pool_list))
        matching_limits = []
        for entry in tmp_list:
            # keep entry only if it is in all lists
            if (entry in os_list) and (entry in minram_list) and (entry in mincores_list) and (entry in pool_list):
                matching_limits.append(entry)
            else:
                print("DEBUG: %i isn't matching limits" % entry)
        print("DEBUG: matching limits:")
        print(matching_limits)
        if len(matching_limits) == 0:
            message = "No engine meets the requirements."
            print(message)
            raise Exception(message)
        else:
            # only run on matching engines
            self.lbview = self.ip_client.load_balanced_view(matching_limits)

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        tasks = self.query_task_list(job_id)
        # abort all queued tasks
        for task in tasks:
            self.ip_client.abort(task["msg_id"])
        return True

    def job_kill(self, job_id):
        """Stop job and all of its tasks whether running or not"""
        tasks = self.query_task_list(job_id)
        running_engines = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status("all", True)
            # check if task is already running on an engine
            for key, status in stats.items():
                if ("tasks" in status) and (task["msg_id"] in status["tasks"]):
                    print("found")
                    running_engines.append(key)
            self.ip_client.abort(task["msg_id"])
        # restart all engines which still run a task
        running_engines = set(running_engines)
        print(list(running_engines))
        # for engine_id in running_engines:
        #     self.ip_client(engine_id)
        return True

    def job_delete(self, job_id):
        """Delete job and all of its tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_engine_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task["msg_id"])
            self.ip_client.purge_results(task["msg_id"])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True

    def task_continue(self, task_id):
        """Continue aborted or failed task"""
        task = self.query_task(task_id)
        # check if action is needed
        if (task["completed"] is not None) and (task["result_header"]["status"] in ("error", "aborted")):
            self.task_requeue(task_id)
        return True

    def task_requeue(self, task_id):
        """Requeue task"""
        self.ip_client.resubmit(task_id)
        print("requeuing %s" % task_id)
        return True

    def job_continue(self, job_id):
        """Continue stopped job and all of its tasks"""
        job = self.query_job(job_id)
        # match engines against job limits (os, minram, mincores, pool)
        os_list = self.query_engines_of_os(job["limits"]["os"])
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)
        tasks = self.query_task_list(job_id)
        # continue tasks
        for task in tasks:
            self.task_continue(task["msg_id"])
        return True

    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job(job_id)
        # match engines against job limits (os, minram, mincores, pool)
        os_list = self.query_engines_of_os(job["limits"]["os"])
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)
        tasks = self.query_task_list(job_id)
        # rerun tasks
        for task in tasks:
            self.task_requeue(task["msg_id"])
        return True

    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task["completed"] is None:
                status_pending += 1
            else:
                if "result_header" in task.keys():
                    result_header = task["result_header"]
                    task_status = result_header.get("status")
                    # look for done tasks
                    if task_status == "ok":
                        status_ok += 1
                    # look for aborted tasks
                    elif task_status == "aborted":
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif task_status == "resubmitted":
                        status_resubmitted += 1
                    # look for tasks with error
                    elif task_status == "error":
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status

    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db(engine_id)
        # shut down engine
        self.ip_client.shutdown(engine_id)
        return True

    def engine_restart(self, engine_id):
        """Restart a specific engine"""
        self.ip_client.shutdown(engine_id, True, False, True)
        return True
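

# --- Usage sketch (not part of the original class) ---
# A minimal example of driving the client API above, assuming a running
# ipcontroller with engines attached. A plain dict stands in for the job
# object here; its field names follow the keys job_run() actually reads,
# and the scene path is hypothetical.
if __name__ == "__main__":
    client = Client()
    job = {
        "name": "demo_render",
        "owner": "alice",
        "retries": 2,
        "startframe": 1,
        "endframe": 100,
        "blocksize": 10,
        "scenefile": "/shared/scenes/demo.blend",  # hypothetical path
        "renderer": "blender",
        "limits": {"os": None, "minram": 0, "mincores": 0,
                   "pool": None, "depend": None},
    }
    if client.job_run(job):
        stored = client.query_job_by_name("demo_render")
        print("tasks left:", client.query_job_tasks_left(stored["_id"]))
        print("status:", client.job_status(stored["_id"]))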
"--profile", dest="client_profile", default="unissh", action="store_const", help="the profile to use for ipython.parallel", ) options, args = opt_parser.parse_args() # START: create remote evaluators and a few (or one) special one for # # generating new points logger.info("init") from IPython.parallel import Client, require c = Client(profile=options.client_profile) c.clear() # clears remote engines c.purge_results("all") # all results are memorized in the hub if len(c.ids) < 2: raise Exception("I need at least 2 clients.") nbGens = min(1, len(c.ids) - 1) generators = c.load_balanced_view(c.ids[:nbGens]) evaluators = c.load_balanced_view(c.ids[nbGens:]) # MAX number of tasks in total MAX = 5000 # length of test data, sent over the wire DIMSIZE = 10 # when adding machines, this is the number of additional tasks # beyond the number of free machines new_extra = DIMSIZE
# assumed module-level imports for this fragment: datetime,
# IPython.parallel.Client and IPython.parallel.util.unpack_apply_message
def run(self):
    if self.session.get_client() is None:
        self.mylog.error("Not connected to a cluster.")
        return False
    # workaround for an IPython bug which makes everything slow:
    # create a new client, use it and delete it
    c = Client(profile='ssh')
    jcmd = self.session.opts.get_opt('jcmd')

    if jcmd == 'purge':
        num = 0
        query = c.db_query({'completed': {'$ne': None}}, ['msg_id'])
        for q in query:
            result = c.get_result(q['msg_id']).get()
            # filter on SB, node, task
            if self._check_result(result):
                num += 1
                c.purge_results(q['msg_id'])
        mylogger.userinfo(self.mylog, str(num) + " cluster's hub results deleted.")

    elif jcmd == 'list':
        num = 0
        # query the hub DB for all the finished tasks and get IDs
        query = c.db_query({'completed': {'$ne': None}}, ['msg_id', 'completed', 'started'])
        # search for interesting results and print them
        for q in query:
            result = c.get_result(q['msg_id']).get()
            # filter on SB, node, task
            if self._check_result(result):
                # skip results without error if wanted
                if self.session.opts.get_opt('onlyerr') and result['err'] == '':
                    continue
                num += 1
                header = {'Task': result['task'], 'Node': result['node'],
                          'SB': result['SB'],
                          'Completed': q['completed'].replace(microsecond=0),
                          'Started': q['started'].replace(microsecond=0),
                          'Exec time': q['completed'].replace(microsecond=0) - q['started'].replace(microsecond=0)}
                data = {'Std Output': result['out'], 'Std Error': result['err'],
                        'Command': result['command']}
                print_jobs(header, data, self.session.opts.get_opt('lines'))
        mylogger.userinfo(self.mylog, str(num) + " processes listed.")

    elif jcmd == 'running':
        num_r = 0
        num_q = 0
        # TODO: it should be "Started" not "submitted", unfortunately ipython does not set it
        query = c.db_query({'completed': None}, ['buffers', 'engine_uuid', 'submitted'])
        for q in query:
            # unpack the buffer of the sent jobs to obtain the arguments
            null, com, args = unpack_apply_message(q['buffers'])
            # filter on SB, node, task
            if self._check_result({'node': args['node'], 'SB': args['SB'], 'task': args['task']}):
                if q['engine_uuid'] is None:
                    if not self.session.opts.get_opt('queue'):
                        continue
                    q['msg_id'] = q['msg_id'] + " (queue)"
                    num_q += 1
                else:
                    num_r += 1
                header = {'Msg_id': q['msg_id'], 'Task': args['task'], 'Node': args['node'], 'SB': args['SB'],
                          'Started': q['submitted'].replace(microsecond=0),
                          'Extime': datetime.datetime.now().replace(microsecond=0) - q['submitted'].replace(microsecond=0)}
                data = {'Command': com[0]}
                print_jobs(header, data, self.session.opts.get_opt('lines'))
        mylogger.userinfo(self.mylog, "Processes running: " + str(num_r) + ". In queue: " + str(num_q) + ".")

    elif jcmd == 'kill':
        print("TBI")
        # TODO: add a resubmit option to resubmit all tasks that failed
        # http://ipython.org/ipython-doc/stable/parallel/parallel_task.html

    del c
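

# --- Usage sketch (not part of the original method) ---
# The hub-database query pattern used in run(), shown standalone. Assumes a
# running ipcontroller with a persistent task database; the 'ssh' profile
# name is taken from the method above.
from IPython.parallel import Client

c = Client(profile='ssh')

# finished tasks carry a non-null 'completed' timestamp ...
finished = c.db_query({'completed': {'$ne': None}},
                      ['msg_id', 'completed', 'started'])
for rec in finished:
    print(rec['msg_id'], "ran for", rec['completed'] - rec['started'])

# ... and can be dropped from the hub once inspected
for rec in finished:
    c.purge_results(rec['msg_id'])

# tasks still pending have completed == None
pending = c.db_query({'completed': None}, ['msg_id', 'submitted', 'engine_uuid'])
print(len(pending), "tasks still pending")
del c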