def manage_tasks(self, tasks):
    """Receive completed tasks from the clients and update the tasks file.

    A helper thread running ``self._taskq_worker`` is started; for as long
    as that thread is alive, finished tasks are pulled from ``self.doneq()``,
    merged into ``tasks`` and progress is logged. When a tasks path is
    configured (``tasks_pth`` in ``self.params()``), ``tasks`` is written to
    disk at most every 10 seconds and once more when the loop ends.
    Afterwards the method waits on ``self.activeq()`` and closes the three
    queues.

    Args:
        tasks (dict): Tasks, keyed by task name. Each entry is updated in
            place with the corresponding task received from the done queue.

    """
    t0 = Time.time()
    tasks_pth = path(self.params().get("tasks_pth", ""))
    n_tasks = len(tasks)
    # Tasks no longer in the task queue count as already done.
    n_done0 = n_tasks - self.taskq().qsize()
    n_done = n_done0
    # Float division, consistent with the per-task report below; a single
    # `%` because str.format() does not treat `%` as an escape character.
    util.info("[Server] Tasks queued. {}/{} ({:.1f}%) complete.".format(
        n_done, n_tasks, float(n_done) / n_tasks * 100.))
    save_time = Time.time()

    # Set up a thread that joins self.taskq and only returns once
    # all the tasks have completed. The while loop continues as
    # long as taskq_thread is alive.
    # NOTE(review): the accessor `self.taskq` itself (not `self.taskq()`)
    # is handed to the worker -- confirm that is what _taskq_worker expects.
    taskq_thread = Thread(name="taskq", target=self._taskq_worker,
                          args=(self.taskq,))
    taskq_thread.start()
    while taskq_thread.is_alive():
        # Wait for a done task to arrive.
        task = self.doneq().get()

        # Update the master tasks dict.
        task_name = task["task_name"]
        tasks[task_name].update(task)
        if tasks_pth and save_time.delta() > 10.:
            # Save tasks to disk, at most once every 10 seconds.
            update_tasks_file(tasks_pth, tasks, overwrite=True)
            save_time = Time.time()

        # Report progress.
        n_done += 1
        percent = float(n_done) / n_tasks * 100.
        dt = t0.delta()
        # n_done was just incremented, so the divisor is at least 1.
        time_per_task = dt / float(n_done - n_done0)
        n_left = n_tasks - n_done
        t_left = Time(time_per_task * n_left)
        util.info("[Server] Task `{}` complete:\n\t\t {}/{} ({:.2f}%) {} "
                  "\n\t\t Time left: {}.".format(
                      task_name, n_done, n_tasks, percent, dt, t_left))

    # Save tasks to disk one last time (only if a tasks file is configured,
    # matching the guard used inside the loop).
    if tasks_pth:
        update_tasks_file(tasks_pth, tasks, overwrite=True)

    # Wait for the clients to d/c.
    clients = []
    while not self.activeq().empty():
        clients.append(self.activeq().get())
    util.debug("[Server] Waiting for clients to disconnect:\n\t{}.".format(
        "\n\t".join(clients)))
    self.activeq().join()
    time.sleep(0.5)

    self.activeq().close()
    self.taskq().close()
    self.doneq().close()
    util.info("[Server] Tasks completed.")
def _worker_process(self, finish, job, taskq, doneq, max_run_retries=3,
                    max_send_retries=10):
    """Run a set of jobs.

    Calls ``job.setup()``, then repeatedly takes a task from ``taskq`` and
    runs it with ``job.run`` until the queue is empty or ``finish`` is set.
    Results are sent with ``send_result``; finished tasks are put on
    ``doneq``, unfinished ones are put back on ``taskq``. Finally
    ``job.teardown()`` is called and the process exits via ``sys.exit``.

    Args:
        finish: Event-like object; when ``finish.is_set()`` is true after a
            task, the loop stops and the exit code is 100.
        job: Object providing ``setup()``, ``run(task)`` and ``teardown()``.
        taskq: Queue of pending tasks.
        doneq: Queue on which completed tasks are reported.
        max_run_retries (int): Number of ``EOFError`` failures of
            ``job.run`` tolerated (counted across all tasks handled by this
            process) before the error is re-raised.
        max_send_retries (int): Passed to ``send_result`` as ``retries``.

    Raises:
        EOFError: If ``job.run`` fails with ``EOFError`` after
            ``max_run_retries`` retries have been used.
        SystemExit: Always, on normal completion (code 0, or 100 if
            ``finish`` was set).

    """
    job.setup()
    run_retries = 0
    exitcode = 0

    # Loop over tasks.
    while not taskq.empty():
        # Pop task off queue.
        task = taskq.get()
        success = False
        t_start = Time.time()
        try:
            progress, tmp_fid = job.run(task)
        except EOFError as err:
            # Report what went wrong and retry. A plain EOFError has no
            # `msg` attribute, so fall back to the exception itself rather
            # than crashing the handler with AttributeError.
            util.debug("[Client] {}".format(getattr(err, "msg", err)))
            msg = "[Client] {}/{} failed run retries, {{}}".format(
                run_retries, max_run_retries)
            if run_retries < max_run_retries:
                run_retries += 1
                util.debug(msg.format("retrying..."))
            else:
                util.debug(msg.format("exiting."))
                # Bare raise keeps the original traceback intact.
                raise
        else:
            # Send results.
            sent = send_result(tmp_fid, task, retries=max_send_retries)
            # NOTE(review): assumes the task is only reported as done when
            # the result was sent (or saving is disabled); otherwise it is
            # re-queued by the `finally` clause below -- confirm.
            if sent or not self.save:
                # Mark simulation as complete.
                task["complete"] = True
                # Task is done.
                doneq.put(task)
                success = True
            # Report progress.
            progress.task = (taskq.qsize(), t_start.delta())
            progress.report()
        finally:
            if not success:
                # Task did not complete successfully: put it back in taskq.
                taskq.put(task)
            # Balance the get() above so a join() on taskq can return.
            taskq.task_done()

        if finish.is_set():
            exitcode = 100
            break

    job.teardown()
    util.info("[Client] Process complete: {}.".format(ProcLabel()))
    sys.exit(exitcode)