def test_resubmit_failed(cleanup):
    """Submit jobs, forge a failure for the first result, then verify resubmission recovers it."""
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0

    aggregator = ResultsAggregator.load(OUTPUT)
    all_results = aggregator.get_results_unsafe()
    assert all_results
    for item in all_results:
        assert item.return_code == 0

    # Overwrite the first result in the aggregator with a failing return code.
    first = all_results[0]
    all_results[0] = Result(first.name, 1, first.status, first.exec_time_s, first.completion_time)
    aggregator._write_results(all_results)

    # Patch the serialized results file so the summary agrees with the forged failure.
    results_path = os.path.join(OUTPUT, RESULTS_FILE)
    data = load_data(results_path)
    data["results"][0]["return_code"] = 1
    data["results_summary"]["num_failed"] = 1
    data["results_summary"]["num_successful"] -= 1
    dump_data(data, results_path)

    summary = ResultsSummary(OUTPUT)
    assert summary.get_failed_results()[0].name == "1"

    # Resubmit; all jobs should now report success.
    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS
def _complete(self):
    """Record a finished job: log its output-directory size and append its result.

    Reads the subprocess return code, emits a bytes-consumed event for the
    job's output directory, and appends a ``Result`` to the aggregator file.
    """
    ret = self._pipe.returncode
    exec_time_s = time.time() - self._start_time
    # NOTE: the original computed a sanitized ``job_filename`` (replacing
    # "/", "\\", ":" with "-") but never used it; that dead code is removed.
    status = "finished"
    # Measure how much disk the job's output consumed and log it as an event.
    output_dir = os.path.join(self._output, JOBS_OUTPUT_DIR, self._job.name)
    bytes_consumed = get_directory_size_bytes(output_dir)
    event = StructuredLogEvent(
        source=self._job.name,
        category=EVENT_CATEGORY_RESOURCE_UTIL,
        name=EVENT_NAME_BYTES_CONSUMED,
        message="job output directory size",
        bytes_consumed=bytes_consumed,
    )
    log_event(event)
    result = Result(self._job.name, ret, status, exec_time_s)
    ResultsAggregator.append(self._results_filename, result)
    logger.info(
        "Job %s completed return_code=%s exec_time_s=%s", self._job.name, ret, exec_time_s
    )
def cancel(self):
    """Mark this job canceled and append a CANCELED result to the aggregator."""
    self._return_code = 1
    self._is_complete = True
    canceled = Result(self._job.name, self._return_code, JobCompletionStatus.CANCELED, 0.0)
    ResultsAggregator.append(self._output, canceled, batch_id=self._batch_id)
    logger.info("Canceled job %s", self._job.name)
def _submit(self, verbose):
    """Execute each pipeline stage in order, persisting status after every step.

    Raises ExecutionError if any stage's submit command returns nonzero.
    """
    for stage in self._stages:
        # Expose the current stage id to child processes via the environment.
        os.environ["JADE_PIPELINE_STAGE_ID"] = str(self._cur_stage_id)
        self._status_info["stages"].append(
            {
                "stage_id": self._cur_stage_id,
                "output_directory": self.get_stage_output_path(
                    self._output, self._cur_stage_id
                ),
            }
        )
        self._dump_status()
        self._run_auto_config(stage)

        command = self._make_submit_cmd(stage, verbose)
        start = time.time()
        return_code = run_command(command)
        end = time.time()
        stage_result = Result(
            str(self._cur_stage_id), return_code, "finished", end - start, end
        )
        self._status_info["stages"][-1]["result"] = serialize_result(stage_result)
        self._dump_status()
        if return_code != 0:
            raise ExecutionError(f"stage {self._cur_stage_id} failed")
        self._cur_stage_id += 1

    logger.info("Finished execution pipeline")
def _cancel_job(self, job, aggregator):
    """Cancel *job* because a job it was blocked by failed; return the CANCELED result."""
    job.state = JobState.DONE
    job.blocked_by.clear()
    canceled = Result(job.name, 1, JobCompletionStatus.CANCELED, 0)
    aggregator.append_result(canceled)
    logger.info("Canceled job %s because one of its blocking jobs failed.", job.name)
    return canceled
def cancel(self):
    """Mark this job canceled; only the manager node records the result."""
    self._return_code = 1
    self._is_complete = True
    if not self._is_manager_node:
        # Non-manager nodes of a multi-node job must not write a result.
        logger.info("Canceled job %s on non-manager node", self._job.name)
        return
    canceled = Result(
        self._job.name,
        self._return_code,
        JobCompletionStatus.CANCELED,
        0.0,
        hpc_job_id=self._hpc_job_id,
    )
    ResultsAggregator.append(self._output, canceled, batch_id=self._batch_id)
    logger.info("Canceled job %s", self._job.name)
def _complete(self):
    """Record this job's completion (manager node only).

    Logs the job's output-directory size as a resource-utilization event and
    appends a FINISHED result with the HPC job id to the aggregator.
    """
    self._return_code = self._pipe.returncode
    exec_time_s = time.time() - self._start_time
    if not self._is_manager_node:
        # Multi-node job: avoid recording completion more than once.
        logger.info(
            "Job %s completed on non-manager node return_code=%s exec_time_s=%s",
            self._job.name,
            self._return_code,
            exec_time_s,
        )
        return

    job_output = self._output / JOBS_OUTPUT_DIR / self._job.name
    size_bytes = get_directory_size_bytes(job_output)
    log_event(
        StructuredLogEvent(
            source=self._job.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="job output directory size",
            bytes_consumed=size_bytes,
        )
    )
    completed = Result(
        self._job.name,
        self._return_code,
        JobCompletionStatus.FINISHED,
        exec_time_s,
        hpc_job_id=self._hpc_job_id,
    )
    ResultsAggregator.append(self._output, completed, batch_id=self._batch_id)
    logger.info(
        "Job %s completed return_code=%s exec_time_s=%s hpc_job_id=%s",
        self._job.name,
        self._return_code,
        exec_time_s,
        self._hpc_job_id,
    )
def create_result(index):
    """Build a ``Result`` whose name, return code, and exec time derive from *index*."""
    name = str(index)
    exec_time = index + 1.0
    return Result(name, index, "finished", exec_time, hpc_job_id=None)