def raise_child_failure_error_fn(name, child_error_file=""):
    """Raise a ``ChildFailedError`` for ``name`` carrying one failed local rank.

    Args:
        name: Forwarded as the first argument of ``ChildFailedError``.
        child_error_file: Path recorded on the ``ProcessFailure``. When it is
            non-empty, ``SentinelError("foobar")`` is first passed to
            ``_write_error`` together with this path.

    Raises:
        ChildFailedError: Always; its failure mapping holds a single
            ``ProcessFailure`` keyed by rank ``0``.
    """
    has_error_file = bool(child_error_file)
    if has_error_file:
        _write_error(SentinelError("foobar"), child_error_file)

    failure = ProcessFailure(
        local_rank=0,
        pid=997,
        exitcode=1,
        error_file=child_error_file,
    )
    failures_by_rank = {0: failure}
    raise ChildFailedError(name, failures_by_rank)
def test_dump_error_file_overwrite_existing(self):
    """Check ``dump_error_file`` when the destination error file already exists.

    Both a destination and a source error file are written up front; with
    ``TORCHELASTIC_ERROR_FILE`` pointing at the destination, the two files
    must compare equal after ``dump_error_file(src)`` runs.
    """
    src_error_file = os.path.join(self.test_dir, "src_error.json")
    dst_error_file = os.path.join(self.test_dir, "dst_error.json")

    # Pre-populate the destination so the handler finds an existing file.
    _write_error(RuntimeError("foo"), dst_error_file)
    _write_error(RuntimeError("bar"), src_error_file)

    env_override = {"TORCHELASTIC_ERROR_FILE": dst_error_file}
    with patch.dict(os.environ, env_override):
        handler = ErrorHandler()
        handler.dump_error_file(src_error_file)
        files_match = filecmp.cmp(src_error_file, dst_error_file)
        self.assertTrue(files_match)
def test_copy_error_file(self):
    """Exercise ``copy_error_file`` with and without an env override.

    First, with ``TORCHELASTIC_ERROR_FILE`` set to ``self.test_error_file``,
    the source file and ``self.test_error_file`` must compare equal after the
    copy. Then the copy is invoked again under a ``patch.dict`` that adds no
    variables; that second call only has to complete without raising.
    """
    src_error_file = os.path.join(self.test_dir, "src_error.json")
    _write_error(RuntimeError("foobar"), src_error_file)

    env_with_target = {"TORCHELASTIC_ERROR_FILE": self.test_error_file}
    with patch.dict(os.environ, env_with_target):
        handler = ErrorHandler()
        handler.copy_error_file(src_error_file)
        copied_ok = filecmp.cmp(src_error_file, self.test_error_file)
        self.assertTrue(copied_ok)

    # No extra variables are patched in here; no assertion follows the call.
    with patch.dict(os.environ, {}):
        handler = ErrorHandler()
        handler.copy_error_file(src_error_file)
def test_get_failures(self, log_mock):
    """Check that each ``ProcessFailure`` picks up its own ``time.time`` value.

    ``time.time`` is patched to return 3, 2, 1 on successive calls, and three
    failures are built in order: two with error files written beforehand and
    one pointing at ``"no_exist.json"``. Their ``timestamp`` attributes are
    expected to be 3, 2 and 1 respectively.

    ``log_mock`` is accepted but not used in the body.
    """
    first_error_path = os.path.join(self.test_dir, "error0.json")
    second_error_path = os.path.join(self.test_dir, "error1.json")

    with mock.patch("time.time", side_effect=[3, 2, 1]):
        _write_error(RuntimeError("error 0"), first_error_path)
        _write_error(RuntimeError("error 1"), second_error_path)

        # Construction order matters: the patched clock hands out 3, 2, 1
        # in call order.
        failures = (
            ProcessFailure(
                local_rank=0, pid=997, exitcode=1, error_file=first_error_path
            ),
            ProcessFailure(
                local_rank=1, pid=998, exitcode=3, error_file=second_error_path
            ),
            ProcessFailure(
                local_rank=2, pid=999, exitcode=15, error_file="no_exist.json"
            ),
        )

        for expected_ts, failure in zip((3, 2, 1), failures):
            self.assertEqual(expected_ts, failure.timestamp)
def failure_with_error_file(self, exception):
    """Build a ``ProcessFailure`` whose error file is ``self.test_error_file``.

    Args:
        exception: Passed to ``_write_error`` along with
            ``self.test_error_file`` before the failure is created.

    Returns:
        A ``ProcessFailure`` for local rank 0 (pid 997, exit code 1) that
        references ``self.test_error_file``.
    """
    error_path = self.test_error_file
    _write_error(exception, error_path)
    return ProcessFailure(
        local_rank=0,
        pid=997,
        exitcode=1,
        error_file=error_path,
    )