Ejemplo n.º 1
0
def raise_child_failure_error_fn(name, child_error_file=""):
    """Always raise a ``ChildFailedError`` for ``name`` with one rank-0 failure.

    Args:
        name: Forwarded as the first argument of ``ChildFailedError``.
        child_error_file: Path handed to ``ProcessFailure`` as ``error_file``.
            When non-empty, a ``SentinelError("foobar")`` is first passed to
            ``_write_error`` together with this path.

    Raises:
        ChildFailedError: Unconditionally, with the failure mapping
            ``{0: <ProcessFailure>}``.
    """
    # Only touch the error file when the caller actually supplied one.
    if child_error_file:
        sentinel = SentinelError("foobar")
        _write_error(sentinel, child_error_file)

    rank0_failure = ProcessFailure(
        local_rank=0,
        pid=997,
        exitcode=1,
        error_file=child_error_file,
    )
    failures_by_rank = {0: rank0_failure}
    raise ChildFailedError(name, failures_by_rank)
Ejemplo n.º 2
0
def monres(state: WorkerState):
    """Build a canned ``RunResult`` matching the given worker ``state``.

    * ``SUCCEEDED``: return value ``0`` for rank 0 and no failures.
    * ``UNHEALTHY`` / ``FAILED``: no return values and a single rank-0
      ``ProcessFailure`` (pid 999, exit code 1, error file ``"<none>"``).
    * any other state: a ``RunResult`` carrying only the state.
    """
    if state == WorkerState.SUCCEEDED:
        return RunResult(state=state, return_values={0: 0}, failures={})

    failed_states = {WorkerState.UNHEALTHY, WorkerState.FAILED}
    if state in failed_states:
        rank0_failure = ProcessFailure(
            local_rank=0,
            pid=999,
            exitcode=1,
            error_file="<none>",
        )
        return RunResult(state=state, return_values={}, failures={0: rank0_failure})

    return RunResult(state=state)
Ejemplo n.º 3
0
 def test_failure_incorrect_reply_file(self):
     """Constructing ``ProcessFailure`` from an unrecognized error file must raise.

     The error file is filled with JSON that holds only an unknown key, then
     the test asserts that building a ``ProcessFailure`` pointing at that file
     raises some ``Exception``.
     """
     unexpected_payload = {"unknown_key": "unknown_value"}
     with open(self.test_error_file, "w") as error_fp:
         json.dump(unexpected_payload, error_fp)

     with self.assertRaises(Exception):
         ProcessFailure(
             local_rank=0,
             pid=997,
             exitcode=1,
             error_file=self.test_error_file,
         )
Ejemplo n.º 4
0
 def failure_with_error_file(self, exception):
     """Record ``exception`` through ``ErrorHandler`` and return a ``ProcessFailure``.

     ``TORCHELASTIC_ERROR_FILE`` is temporarily set to ``self.test_error_file``
     in ``os.environ`` while ``ErrorHandler().record_exception`` runs
     (presumably so the handler writes to that file; confirm against
     ``ErrorHandler``). The returned failure references the same file.
     """
     env_override = {"TORCHELASTIC_ERROR_FILE": self.test_error_file}
     with mock.patch.dict(os.environ, env_override):
         handler = ErrorHandler()
         handler.record_exception(exception)

     return ProcessFailure(
         local_rank=0,
         pid=997,
         exitcode=1,
         error_file=self.test_error_file,
     )
Ejemplo n.º 5
0
 def test_process_mast_error_format(self):
     """A flat ``message``/``timestamp`` error file is read by ``ProcessFailure``.

     Writes ``{"message": ..., "timestamp": "10"}`` to the error file and
     checks that the resulting failure exposes the message unchanged and the
     timestamp as the integer ``10``.
     """
     flat_error = {"message": "test error message", "timestamp": "10"}
     with open(self.test_error_file, "w") as error_fp:
         json.dump(flat_error, error_fp)

     failure = ProcessFailure(
         local_rank=0,
         pid=997,
         exitcode=1,
         error_file=self.test_error_file,
     )

     self.assertEqual("test error message", failure.message)
     self.assertEqual(10, failure.timestamp)
Ejemplo n.º 6
0
def raise_child_failure_error_fn(name, child_error_file=""):
    """Always raise a ``ChildFailedError`` for ``name`` with one rank-0 failure.

    Args:
        name: Forwarded as the first argument of ``ChildFailedError``.
        child_error_file: Path handed to ``ProcessFailure`` as ``error_file``.
            When non-empty, a ``SentinelError("foobar")`` is recorded through
            ``ErrorHandler`` while ``TORCHELASTIC_ERROR_FILE`` is patched to
            this path in ``os.environ``.

    Raises:
        ChildFailedError: Unconditionally, with the failure mapping
            ``{0: <ProcessFailure>}``.
    """
    # Only record the sentinel when the caller actually supplied a file.
    if child_error_file:
        env_override = {"TORCHELASTIC_ERROR_FILE": child_error_file}
        with mock.patch.dict(os.environ, env_override):
            handler = ErrorHandler()
            handler.record_exception(SentinelError("foobar"))

    rank0_failure = ProcessFailure(
        local_rank=0,
        pid=997,
        exitcode=1,
        error_file=child_error_file,
    )
    failures_by_rank = {0: rank0_failure}
    raise ChildFailedError(name, failures_by_rank)
Ejemplo n.º 7
0
    def _poll(self) -> Optional[RunProcsResult]:
        """Poll every still-running subprocess once and summarize the outcome.

        Returns:
            ``None`` while at least one process is still running and no
            failure has been recorded. Otherwise ``self.close()`` is called
            and a ``RunProcsResult`` is returned: with ``failures`` populated
            when any process exited non-zero, or with a ``None`` return value
            per local rank when all exited with code 0.
        """
        finished_ranks = set()
        for rank in self._running_local_ranks:
            handler = self.subprocess_handlers[rank]
            code = handler.proc.poll()
            if code is None:
                # Process has not exited yet.
                continue

            finished_ranks.add(rank)
            if code == 0:
                # Succeeded; nothing to record.
                continue

            # Non-zero exit code: failed or signaled.
            self._failures[rank] = ProcessFailure(
                local_rank=rank,
                pid=handler.proc.pid,
                exitcode=code,
                error_file=self.error_files[rank],
            )

        self._running_local_ranks.difference_update(finished_ranks)

        # Keep waiting only while procs are still running and none has failed.
        if self._running_local_ranks and not self._failures:
            return None

        # ALL procs are finished or ANY have failed: terminate what is left.
        self.close()
        result = RunProcsResult(
            failures=self._failures,
            stdouts=self.stdouts,
            stderrs=self.stderrs,
        )

        if not result.is_failed():
            # Populate return with dummy values. This provides consistency
            # with MultiprocessingHandler.
            result.return_values = dict.fromkeys(range(self.nprocs))
            return result

        # Report the failure with the smallest timestamp.
        first_failure = min(result.failures.values(),
                            key=lambda failure: failure.timestamp)
        log.error(
            f"failed (exitcode: {first_failure.exitcode})"
            f" local_rank: {first_failure.local_rank} (pid: {first_failure.pid})"
            f" of binary: {self.entrypoint}")
        return result
Ejemplo n.º 8
0
 def failure_without_error_file(self, exitcode):
     """Return a rank-0 ``ProcessFailure`` (pid 997) with the given ``exitcode``.

     The ``error_file`` is the fixed name ``"ignored.json"``; this helper does
     not create or write that file.
     """
     failure = ProcessFailure(
         local_rank=0,
         pid=997,
         exitcode=exitcode,
         error_file="ignored.json",
     )
     return failure
Ejemplo n.º 9
0
 def failure_with_error_file(self, exception):
     """Write ``exception`` to the test error file and return a ``ProcessFailure``.

     ``exception`` is passed to ``_write_error`` together with
     ``self.test_error_file``; the returned rank-0 failure (pid 997, exit
     code 1) references the same file.
     """
     error_path = self.test_error_file
     _write_error(exception, error_path)

     failure = ProcessFailure(
         local_rank=0,
         pid=997,
         exitcode=1,
         error_file=self.test_error_file,
     )
     return failure
Ejemplo n.º 10
0
    def _poll(self) -> Optional[RunProcsResult]:
        """Check the worker processes once and report completion or failure.

        Returns:
            ``None`` when ``self._is_done()`` is false and no worker exception
            was raised by the join. A ``RunProcsResult`` carrying
            ``return_values`` once all workers are done. A ``RunProcsResult``
            carrying a single ``ProcessFailure`` (keyed by the failed local
            rank) when the join raises ``mp.ProcessRaisedException`` or
            ``mp.ProcessExitedException``.
        """
        assert self._pc is not None  # assertion for mypy type checker

        try:
            # torch.mp.ProcessContext Throws an Exception if some/all of
            # worker processes failed
            # timeout < 0 checks worker status and return immediately
            # Join will never return success since we use synchronize.Event to wait
            # for all processes to finish.
            self._pc.join(-1)

            # IMPORTANT: we use multiprocessing.Queue to carry worker return values
            # back to the parent, the worker process will wait before terminating
            # until all the buffered items are fed by the feeder thread to the underlying
            # pipe. Hence to prevent deadlocks on large return values,
            # we opportunistically try queue.get on each join call
            # See: https://docs.python.org/2/library/multiprocessing.html#all-platforms
            for local_rank in range(0, self.nprocs):
                return_queue = self._ret_vals[local_rank]
                if not return_queue.empty():
                    # save the return values temporarily into a member var
                    self._return_values[local_rank] = return_queue.get()

            if self._is_done():
                # we should ALWAYS have ALL the return values when all the processes are done
                self._worker_finished_event.set()
                # Wait until all processes are finished. At this point workers finished executing
                # user function
                self._pc.join()
                # Check the collected return values against nprocs before reporting success.
                _validate_full_rank(self._return_values, self.nprocs,
                                    "return_value queue")
                self.close()
                return RunProcsResult(
                    return_values=self._return_values,
                    stdouts=self.stdouts,
                    stderrs=self.stderrs,
                )
            else:
                return None
        except (mp.ProcessRaisedException, mp.ProcessExitedException) as e:
            # The exception carries the index of the worker that failed.
            failed_local_rank = e.error_index

            # entrypoint for MultiprocessContext will always be a Callable
            fn_name = self.entrypoint.__qualname__  # type: ignore[union-attr]
            failed_proc = self._pc.processes[failed_local_rank]
            error_filepath = self.error_files[failed_local_rank]

            log.error(
                f"failed (exitcode: {failed_proc.exitcode})"
                f" local_rank: {failed_local_rank} (pid: {e.pid})"
                f" of fn: {fn_name} (start_method: {self.start_method})",
                exc_info=True,
            )

            # Close the context before reporting the single failure.
            self.close()
            return RunProcsResult(
                failures={
                    failed_local_rank:
                    ProcessFailure(
                        local_rank=failed_local_rank,
                        pid=e.pid,
                        exitcode=failed_proc.exitcode,
                        error_file=error_filepath,
                    )
                },
                stdouts=self.stdouts,
                stderrs=self.stderrs,
            )