def on_task_done(self, task, errorcode):
    """Handle completion of `task` (success or failure).

    Archives the task's stdout/stderr, reports failure if the error code is
    non-zero, stages final output, releases dependency references, frees the
    host's cores, detaches transfer event handlers, and reschedules.

    :param task: the finished task object.
    :param errorcode: the task's exit status; > 0 means the task failed.
    """
    # move stdout and stderr to shared storage
    srcs = ' '.join([os.path.join(self.local_dir, task.name + '.stdout'), os.path.join(self.local_dir, task.name + '.stderr')])
    dst = os.path.join(self.environment.config_dir, 'jobs', self.environment.job_id)
    remote('mv {0} {1}'.format(srcs, dst), task.host)
    if errorcode > 0:
        # report the failure and ask the scheduler to wind down; NOTE(review):
        # cleanup below still runs after stop() — presumably intentional so
        # references/cores are released even for a failed task.
        expl = 'task {0} stopped with non-zero error code {1}'
        self.reporter.report(reporting.TASK_FAILED, explanation=expl.format(task.name, errorcode), task=task.name)
        self.scheduler.errorcode = 1
        self.scheduler.stop()
    # if this task is the final task, we should copy its output files to
    # the workflow directory.
    if task.name in self.workflow.target_names or task.checkpoint:
        task.move_output(self.workflow.working_dir)
    # decrease references for all dependencies of this task. Cleanup will
    # automatically be run for the dependency if its reference count is 0.
    for _, dependency in task.dependencies:
        if not dependency.can_execute:
            continue
        dependency.references -= 1
        if dependency.references == 0:
            self.cleanup(dependency)
    # decrease the task's own reference count, to avoid others preemptively
    # doing a cleanup while it was still finishing.
    task.references -= 1
    if task.references == 0:
        self.cleanup(task)
    # figure out where this task was run and increment the number of cores
    # available on the host, since the job is now done.
    self.environment.nodes[task.host] += task.cores
    self.running.discard(task)
    # detach transfer event handlers so this object no longer receives
    # notifications from the finished task.
    task.transfer_started -= self.on_transfer_started
    task.transfer_success -= self.on_transfer_success
    task.transfer_failed -= self.on_transfer_failed
    self.reporter.report(reporting.TASK_COMPLETED, task=task.name)
    logging.info('task done: %s', task.name)
    # reschedule now that we know that a task has finished
    self.schedule_tasks()
def move_output(self, working_dir):
    """Copy every output file of this task from its execution host into
    the workflow working directory, emitting transfer events as it goes.

    :param working_dir: workflow working directory the outputs belong under.
    """
    for out_file in self.output:
        # path of the output relative to the workflow working directory,
        # which mirrors its layout under the task's local working directory
        rel = os.path.relpath(out_file, working_dir)
        host = self.host
        remote_path = os.path.join(self.local_wd, rel)

        # ensure the destination directory exists before copying
        local('mkdir -p {0}'.format(os.path.dirname(out_file)))

        source = host + ':' + remote_path
        self.transfer_started(task=self.name, source=source,
                              destination=out_file)

        # copy the file into the workflow working directory and signal
        # success or failure to any listeners
        status = remote('cp {0} {1}'.format(remote_path, out_file), host)
        if status != 0:
            expl = 'could not copy file {0} to {1}'.format(source, out_file)
            self.transfer_failed(task=self.name, source=source,
                                 destination=out_file, explanation=expl)
        else:
            self.transfer_success(task=self.name, source=source,
                                  destination=out_file)
def get_input(self):
    """Stage every input file this task depends on into its local working
    directory.

    For each (file, dependency) pair, the transfer strategy is:
    - same host, not checkpointed: hard link (no copy);
    - checkpointed dependency: plain `cp` on the source host (falling back
      to remote storage when the dependency's host is gone);
    - different hosts: `scp` between nodes.

    Raises SystemExit(1) when a checkpointed dependency's output cannot be
    found anywhere.
    """
    for in_file, dependency in self.dependencies:
        # build path to file on remote source and destination
        relpath = os.path.relpath(in_file, self.working_dir)
        src_path = os.path.join(dependency.local_wd, relpath)
        dst_path = os.path.join(self.local_wd, relpath)

        # figure out source and destination hosts
        src_host = dependency.host
        dst_host = self.host

        # if the dependency was checkpointed, we check if the file exists
        # on the node at which it was generated. If not, we should fetch
        # the file from remote storage.
        if dependency.checkpoint and not dependency.host:
            src_host = dst_host
            src_path = in_file
            # BUG FIX: exit statuses are never negative, so the original
            # `< 0` test could never fire; and `os.exit` does not exist
            # (it would raise AttributeError) — use SystemExit instead.
            if local('stat {0} &> /dev/null'.format(src_path)) != 0:
                logging.error('output of checkpointed dependency does not '
                              'exist at %s -- halting', src_path)
                raise SystemExit(1)

        remote('mkdir -p {0}'.format(os.path.dirname(dst_path)), dst_host)

        # if the source host is the same as the destination host, we won't
        # copy any files, but just make a hardlink to the source file.
        if src_host == dst_host and not dependency.checkpoint:
            self.transfer_started(task=self.name, source=src_path,
                                  destination=dst_path)
            errorcode = remote('ln {0} {1}'.format(src_path, dst_path),
                               src_host)
            if errorcode > 0:
                expl = 'could not hard link {0} to {1}'.format(src_path,
                                                              dst_path)
                self.transfer_failed(task=self.name, source=src_path,
                                     destination=dst_path, explanation=expl)
            else:
                self.transfer_success(task=self.name, source=src_path,
                                      destination=dst_path)
        elif dependency.checkpoint:
            # checkpointed output: copy on the source host itself
            self.transfer_started(task=self.name, source=src_path,
                                  destination=dst_path)
            errorcode = remote('cp {0} {1}'.format(src_path, dst_path),
                               src_host)
            if errorcode > 0:
                expl = 'could not copy file {0} to {1}'.format(src_path,
                                                              dst_path)
                self.transfer_failed(task=self.name, source=src_path,
                                     destination=dst_path, explanation=expl)
            else:
                self.transfer_success(task=self.name, source=src_path,
                                      destination=dst_path)
        else:
            # hosts differ: copy across nodes with scp.
            # NOTE(review): arcfour is a weak, deprecated cipher chosen
            # here presumably for speed — confirm modern OpenSSH still
            # accepts it on the cluster.
            src = ''.join([src_host, ':', src_path])
            dst = ''.join([dst_host, ':', dst_path])
            self.transfer_started(task=self.name, source=src,
                                  destination=dst)
            if local('scp -c arcfour {0} {1}'.format(src, dst)) == 0:
                self.transfer_success(task=self.name, source=src,
                                      destination=dst)
            else:
                expl = 'could not copy file {0} to {1}'.format(src, dst)
                self.transfer_failed(task=self.name, source=src,
                                     destination=dst, explanation=expl)
def cleanup(self, task):
    """Delete the task's working directory on its execution host.

    :param task: task whose local working directory should be removed;
                 a task with no host is skipped.
    """
    # nothing to clean up if the task never ran on a host
    if not task.host:
        return
    remote('rm -rf {0}'.format(task.local_wd), task.host)