def test_job_bulk_resubmit(driver, state, monkeypatch):
    """Check that ``driver.bulk_resubmit`` resets only the jobs it is given.

    Three jobs are prepared: the first is marked FAILED and has its output
    directory deleted, the other two are bulk-submitted through a mocked
    ``sbatch`` and then marked COMPLETED with a stdout file on disk.  A fourth
    COMPLETED job is deliberately left out of the resubmit call and must not
    end up in CREATED.
    """
    failing_command = "echo 'begin'; sleep 0.2 ; echo 'end' ; exit 1"
    root = Folder.get_root()

    jobs = [driver.create_job(command=failing_command, folder=root) for _ in range(3)]
    already_failed = jobs[0]

    # Bystander job that is never handed to bulk_resubmit.
    other_job = driver.create_job(command=failing_command, folder=root)
    other_job.status = Job.Status.COMPLETED
    other_job.save()

    already_failed.status = Job.Status.FAILED
    already_failed.save()

    sbatch = Mock(side_effect=[1, 2, 3])
    monkeypatch.setattr(driver.slurm, "sbatch", sbatch)
    driver.bulk_submit(jobs[1:])
    assert sbatch.call_count == 2

    for finished in jobs[1:]:
        finished.status = Job.Status.COMPLETED
        with open(finished.data["stdout"], "w") as stdout_file:
            stdout_file.write("hurz")
        finished.save()

    shutil.rmtree(already_failed.data["output_dir"])

    # Keep the driver from really submitting; os.remove still does its real
    # work through ``wraps`` while recording the calls.
    submit = Mock()
    remove = Mock(wraps=os.remove)
    makedirs = Mock()
    with monkeypatch.context() as patcher:
        for patch_args in (
            (driver, "submit", submit),
            (driver.slurm, "sacct", Mock(return_value=[])),
            (driver, "bulk_kill", Mock(side_effect=RuntimeError)),
            ("os.remove", remove),
            ("os.makedirs", makedirs),
        ):
            patcher.setattr(*patch_args)
        driver.bulk_resubmit(jobs)

    assert submit.call_count == len(jobs)
    expected_removals = [call(job.data["stdout"]) for job in jobs[1:]]
    remove.assert_has_calls(expected_removals, any_order=True)
    expected_makedirs = [call(job.data["output_dir"]) for job in jobs[1:]]
    makedirs.assert_has_calls(expected_makedirs, any_order=True)

    for resubmitted in jobs:
        resubmitted.reload()
        assert resubmitted.status == Job.Status.CREATED

    # Regression guard: at one point every job was reset to CREATED, including
    # jobs that were not part of the resubmit call.
    other_job.reload()
    assert other_job.status != Job.Status.CREATED
def test_bulk_sync_status(driver, state, monkeypatch):
    """Check that ``driver.bulk_sync_status`` applies the states reported by sacct.

    Fifteen jobs are bulk-submitted through a mocked ``sbatch`` that hands out
    batch ids 1..15.  ``driver.slurm._sacct`` is then mocked twice: first to
    report every job as RUNNING, then to report the first six as COMPLETED and
    the remainder as FAILED.  After each sync the job statuses must match and
    ``_sacct`` must have been queried exactly once with the expected arguments.
    """

    def sacct_row(batch_id, slurm_state):
        # One pipe-separated row: id, state, exit code, then four empty fields.
        return "|".join([str(batch_id), slurm_state, "0:0"] + [""] * 4)

    def assert_single_query(sacct_mock, queried_jobs):
        # The driver must ask for all given jobs in a single call.
        sacct_mock.assert_called_once_with(
            jobs=",".join(job.batch_job_id for job in queried_jobs),
            format="JobID,State,ExitCode,Submit,Start,End,NodeList",
            noheader=True,
            parsable2=True,
            starttime=ANY,
            _iter=True,
        )

    root = Folder.get_root()
    jobs = [
        driver.create_job(folder=root, command=f"sleep 0.1; echo 'JOB{index}'")
        for index in range(15)
    ]
    assert len(jobs) == 15
    for created in jobs:
        assert created.status == Job.Status.CREATED

    sbatch = Mock(side_effect=list(range(1, len(jobs) + 1)))
    monkeypatch.setattr(driver.slurm, "sbatch", sbatch)
    driver.bulk_submit(jobs)

    # First pass: pretend every job is running.
    running_rows = [
        sacct_row(batch_id, "RUNNING") for batch_id in range(1, len(jobs) + 1)
    ]
    sacct = Mock(return_value=running_rows)
    monkeypatch.setattr(driver.slurm, "_sacct", sacct)
    jobs = driver.bulk_sync_status(jobs)
    assert_single_query(sacct, jobs)
    for running in jobs:
        assert running.status == Job.Status.RUNNING

    # Second pass: batch ids 1..6 completed, everything after that failed.
    finished_rows = [
        sacct_row(batch_id, "COMPLETED" if batch_id <= 6 else "FAILED")
        for batch_id in range(1, len(jobs) + 1)
    ]
    sacct = Mock(return_value=finished_rows)
    monkeypatch.setattr(driver.slurm, "_sacct", sacct)
    jobs = driver.bulk_sync_status(jobs)
    assert_single_query(sacct, jobs)

    for completed in jobs[:6]:
        assert completed.status == Job.Status.COMPLETED
    for failed in jobs[6:]:
        assert failed.status == Job.Status.FAILED
def test_resubmit_job(driver, state, monkeypatch):
    """Walk a single job through submit and two resubmissions.

    ``driver.resubmit`` is expected to raise ``InvalidJobStatus`` while the job
    stays SUBMITTED (sacct reports nothing, or syncing is disabled), and to
    submit again with a fresh batch job id once sacct reports the job as
    FAILED, even when killing the old job raises.
    """

    def install_sbatch(batch_id):
        # Replace slurm.sbatch with a fresh mock that returns ``batch_id``.
        mock = Mock(return_value=batch_id)
        monkeypatch.setattr(driver.slurm, "sbatch", mock)
        return mock

    def report_failed(job):
        # Make sacct report ``job``'s current batch job id as FAILED.
        item = SlurmAccountingItem(job.batch_job_id, Job.Status.FAILED, 0, {})
        monkeypatch.setattr(driver.slurm, "sacct", Mock(return_value=[item]))

    root = Folder.get_root()
    j1 = driver.create_job(command="sleep 1", folder=root)
    assert j1.status == Job.Status.CREATED

    first_batch_id = 5_207_375
    sbatch = install_sbatch(first_batch_id)
    driver.submit(j1)
    sbatch.assert_called_once_with(j1)
    assert j1.status == Job.Status.SUBMITTED
    assert j1.batch_job_id == str(first_batch_id)

    # sacct reports nothing, so the job stays SUBMITTED and resubmit is refused.
    monkeypatch.setattr(driver.slurm, "sacct", Mock(return_value=[]))
    with pytest.raises(InvalidJobStatus):
        driver.resubmit(j1)

    report_failed(j1)
    second_batch_id = 42
    sbatch = install_sbatch(second_batch_id)

    with monkeypatch.context() as patcher:
        # Killing the job errors out, but the resubmission goes ahead anyway.
        patcher.setattr(driver, "kill", Mock(side_effect=RuntimeError()))
        patcher.setattr("os.path.exists", Mock(side_effect=[True, False, False]))
        patcher.setattr("os.remove", Mock())
        j1 = driver.resubmit(j1)

    sbatch.assert_called_once()
    assert j1.status == Job.Status.SUBMITTED
    # The resubmission hands out a new batch job id.
    assert j1.batch_job_id == str(second_batch_id)

    with monkeypatch.context() as patcher:
        # With syncing disabled the job stays SUBMITTED, which is not accepted.
        patcher.setattr(driver, "sync_status", Mock())
        with pytest.raises(InvalidJobStatus):
            driver.resubmit(j1)

    # With syncing back on, sacct reports the job as FAILED.
    report_failed(j1)
    third_batch_id = 99
    sbatch = install_sbatch(third_batch_id)

    j1 = driver.resubmit(j1)
    sbatch.assert_called_once()
    assert j1.status == Job.Status.SUBMITTED
    assert j1.batch_job_id == str(third_batch_id)