def test_timeout_unhandled(self):
    '''with timeout handling disabled, jobs are marked FAILED'''
    remote_dir = tempfile.TemporaryDirectory(prefix="remote")
    remote_path = os.path.join(remote_dir.name, 'side0.dat')
    with open(remote_path, 'w') as fp:
        fp.write('9\n')

    # Same as the previous test, but square.py sleeps long enough to time out
    job = create_job(name='square_testjob2', app='square',
                     args='side0.dat --sleep 5',
                     url_in=f'local:{remote_dir.name}',
                     stage_out_files='square*',
                     url_out=f'local:{remote_dir.name}',
                     auto_timeout_retry=False)

    # Job reaches the RUNNING state and then times out
    success = util.run_launcher_until_state(job, 'RUNNING')
    self.assertTrue(success)

    # On termination, the actively running job is marked RUN_TIMEOUT
    def check():
        job.refresh_from_db()
        return job.state == 'RUN_TIMEOUT'

    success = util.poll_until_returns_true(check, timeout=12)
    self.assertEqual(job.state, 'RUN_TIMEOUT')

    # If we run the launcher again, it will pick up the timed-out job.
    # But without timeout handling, it fails
    success = util.run_launcher_until_state(job, 'FAILED')
    self.assertTrue(success)

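# Illustrative sketch only: `util.poll_until_returns_true` is assumed to come
# from the local test utilities module and is not defined in this file. The
# real helper may differ; this hypothetical version just shows the contract
# the tests above rely on (repeatedly call a zero-argument `check` callable
# until it returns True or the timeout expires, returning the final result).
def _example_poll_until_returns_true(check, period=1.0, timeout=12.0):
    import time  # local import keeps this sketch self-contained
    start = time.time()
    result = check()
    while not result and time.time() - start < timeout:
        time.sleep(period)
        result = check()
    return result
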
def test_error_unhandled(self):
    '''test unhandled return code from app'''
    remote_dir = tempfile.TemporaryDirectory(prefix="remote")
    remote_path = os.path.join(remote_dir.name, 'side0.dat')
    with open(remote_path, 'w') as fp:
        fp.write('9\n')

    # Same as previous test, but square.py returns nonzero
    job = create_job(name='square_testjob2', app='square',
                     args='side0.dat --retcode 1',
                     url_in=f'local:{remote_dir.name}',
                     stage_out_files='square*',
                     url_out=f'local:{remote_dir.name}')
    self.assertEqual(job.application_args, 'side0.dat --retcode 1')
    self.assertEqual(BalsamJob.objects.all().count(), 1)

    # The job is marked FAILED due to the unhandled nonzero return code
    success = util.run_launcher_until_state(job, 'FAILED')
    self.assertTrue(success)

    # (But the application actually ran and printed its result correctly)
    result = float(job.read_file_in_workdir('square.dat'))
    self.assertEqual(result, 81.0)

    preproc_out_contents = job.read_file_in_workdir('preprocess.log')
    jobid_line = [l for l in preproc_out_contents.split('\n') if 'jobid' in l][0]
    self.assertIn(str(job.pk), jobid_line)

def test_static(self):
    '''test normal processing of a pre-defined DAG'''
    NUM_SIDES, NUM_RANKS = 3, 2
    pre = (self.apps['make_sides'].default_preprocess
           + f' {NUM_SIDES} {NUM_RANKS}')
    parent = create_job(name='make_sides', app='make_sides', preproc=pre)

    # Each side length is mapped to a square area in a set of mapping jobs.
    # These 3 "square_jobs" all have the same parent make_sides, but each
    # takes a different input file
    square_jobs = {
        i: create_job(name=f'square{i}', app='square',
                      args=f'side{i}.dat', input_files=f'side{i}.dat')
        for i in range(NUM_SIDES)
    }
    for job in square_jobs.values():
        job.set_parents([parent])

    # The final reduce job depends on all the square jobs: all square.dat
    # files will be staged in and final results staged out to a remote
    # directory
    remote_dir = tempfile.TemporaryDirectory(prefix="remote")
    reduce_job = create_job(name='reduce', app='reduce',
                            input_files="square*.dat*",
                            url_out=f'local:{remote_dir.name}',
                            stage_out_files='summary*.dat reduce.out')
    reduce_job.set_parents(square_jobs.values())

    # Run the entire DAG until finished
    success = util.run_launcher_until_state(reduce_job, 'JOB_FINISHED',
                                            timeout=180.0)
    self.assertTrue(success)
    for job in (parent, *square_jobs.values(), reduce_job):
        job.refresh_from_db()
        self.assertEqual(job.state, 'JOB_FINISHED')

    # Double-check the calculation result, thereby testing the flow of data
    workdir = parent.working_directory
    files = (os.path.join(workdir, f"side{i}.dat") for i in range(NUM_SIDES))
    sides = [float(open(f).read()) for f in files]
    self.assertTrue(all(0.5 <= s <= 5.0 for s in sides))
    expected_result = sum(s**2 for s in sides)

    resultpath = os.path.join(remote_dir.name, 'reduce.out')
    result = open(resultpath).read()
    self.assertIn('Total area:', result)
    result_line = [l for l in result.split('\n') if 'Total area:' in l][0]
    result = float(result_line.split()[-1])
    self.assertAlmostEqual(result, expected_result)

def test_child_timeout(self):
    '''timeout handling in a DAG'''
    # Same DAG triplet as above: one parent with 2 children A & B
    NUM_SIDES, NUM_RANKS = 2, 1
    pre = (self.apps['make_sides'].default_preprocess
           + f' {NUM_SIDES} {NUM_RANKS}')
    parent = create_job(name='make_sides', app='make_sides', preproc=pre)
    chA = create_job(name='square0', app='square', args='side0.dat',
                     input_files='side0.dat')
    chB = create_job(name='square1', app='square',
                     args='side1.dat --sleep 30',
                     input_files='side1.dat', post_timeout_handler=True)
    chA.set_parents([parent])
    chB.set_parents([parent])

    # Run until A finishes, but B will still be hanging
    def check():
        chA.refresh_from_db()
        chB.refresh_from_db()
        return chA.state == 'JOB_FINISHED' and chB.state == 'RUNNING'

    success = util.run_launcher_until(check)
    self.assertTrue(success)

    # Give the launcher time to clean up and mark B as timed out
    def check():
        chB.refresh_from_db()
        return chB.state == 'RUN_TIMEOUT'

    success = util.poll_until_returns_true(check, timeout=12)
    self.assertEqual(chB.state, 'RUN_TIMEOUT')

    # Since B has a timeout handler, when we re-run the launcher,
    # it is handled gracefully
    success = util.run_launcher_until_state(chB, 'JOB_FINISHED')

    parent.refresh_from_db()
    chA.refresh_from_db()
    self.assertEqual(parent.state, 'JOB_FINISHED')
    self.assertEqual(chA.state, 'JOB_FINISHED')
    self.assertEqual(chB.state, 'JOB_FINISHED')

    # The data flow was correct
    self.triplet_data_check(parent, chA, chB)

    # The post-processor in fact handled the timeout
    self.assertIn('recognized timeout',
                  chB.read_file_in_workdir('postprocess.log'))

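# Illustrative sketch only: `util.run_launcher_until_state(job, state)` is
# assumed here to be a thin wrapper over `util.run_launcher_until`, which the
# tests call directly with a check callable (and optional timeout). The real
# utility may differ; the default timeout below is a made-up value.
def _example_run_launcher_until_state(job, state, timeout=60.0):
    def check():
        job.refresh_from_db()
        return job.state == state
    return util.run_launcher_until(check, timeout=timeout)
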
def test_timeout_post_handler(self):
    '''test postprocess handling option for timed-out jobs'''
    remote_dir = tempfile.TemporaryDirectory(prefix="remote")
    remote_path = os.path.join(remote_dir.name, 'side0.dat')
    with open(remote_path, 'w') as fp:
        fp.write('9\n')

    # Same as the previous test, but square.py sleeps long enough to time out
    job = create_job(name='square_testjob2', app='square',
                     args='side0.dat --sleep 5',
                     url_in=f'local:{remote_dir.name}',
                     stage_out_files='square*',
                     url_out=f'local:{remote_dir.name}',
                     post_timeout_handler=True)

    # Job reaches the RUNNING state and then times out
    success = util.run_launcher_until_state(job, 'RUNNING')
    self.assertTrue(success)

    # On termination, the actively running job is marked RUN_TIMEOUT
    def check():
        job.refresh_from_db()
        return job.state == 'RUN_TIMEOUT'

    success = util.poll_until_returns_true(check, timeout=12)
    self.assertEqual(job.state, 'RUN_TIMEOUT')

    # If we run the launcher again, it will pick up the timed-out job
    success = util.run_launcher_until_state(job, 'JOB_FINISHED')
    self.assertTrue(success)
    self.assertNotIn('RESTART_READY', job.state_history)
    self.assertIn('handled timeout in square_post', job.state_history)

    # The postprocessor handled the timeout; it did not restart
    post_contents = job.read_file_in_workdir('postprocess.log')
    self.assertIn('Invoked to handle RUN_TIMEOUT', post_contents)
    self.assertIn('recognized timeout', post_contents)

def test_kill_during_execution_serial(self):
    '''Serial job running in mpi_ensemble is properly terminated'''
    killer_job = create_job(name="killer", app="killer", args="when-running")
    slow_job = create_job(name="slow_job", app="slow", args="30")

    success = util.run_launcher_until_state(killer_job, 'JOB_FINISHED')
    self.assertTrue(success)

    slow_job.refresh_from_db()
    self.assertEqual(slow_job.state, "USER_KILLED")

    stdout = slow_job.read_file_in_workdir('slow_job.out')
    self.assertIn("Sleeping for a long time", stdout)
    self.assertIn("RUNNING", slow_job.state_history)
    self.assertIn("USER_KILLED", slow_job.state_history)
    self.assertNotIn("RUN_DONE", slow_job.state_history)

def test_kill_during_preprocess(self):
    '''Job killed while pre-processing is properly marked'''
    killer_job = create_job(name="killer", app="killer")
    slow_job = create_job(name="slow_job", app="slow",
                          preproc=self.slow_name, args="30")

    success = util.run_launcher_until_state(killer_job, 'JOB_FINISHED')
    self.assertTrue(success)

    slow_job.refresh_from_db()
    self.assertEqual(slow_job.state, "USER_KILLED")

    preproc_out = slow_job.read_file_in_workdir('preprocess.log')
    self.assertIn("Sleeping for a long time", preproc_out)
    self.assertNotIn("RUN_DONE", slow_job.state_history)
    self.assertIn("STAGED_IN", slow_job.state_history)

def test_kill_during_execution_mpi(self):
    '''Parallel MPIRunner job is properly terminated'''
    launcherInfo = util.launcher_info()
    if len(launcherInfo.workerGroup.workers) < 2:
        self.skipTest("Need at least 2 workers to run this test")

    killer_job = create_job(name="killer", app="killer")
    slow_job = create_job(name="slow_job", app="slow",
                          ranks_per_node=2, args="30 parallel")

    success = util.run_launcher_until_state(killer_job, 'JOB_FINISHED')
    self.assertTrue(success)

    slow_job.refresh_from_db()
    self.assertEqual(slow_job.state, "USER_KILLED")

    stdout = slow_job.read_file_in_workdir('slow_job.out')
    self.assertIn("Rank 0 Sleeping for a long time", stdout)
    self.assertIn("Rank 1 Sleeping for a long time", stdout)
    self.assertIn("RUNNING", slow_job.state_history)
    self.assertIn("USER_KILLED", slow_job.state_history)
    self.assertNotIn("RUN_DONE", slow_job.state_history)

def test_error_handled(self):
    '''test postprocessor-handled nonzero return code'''
    remote_dir = tempfile.TemporaryDirectory(prefix="remote")
    remote_path = os.path.join(remote_dir.name, 'side0.dat')
    with open(remote_path, 'w') as fp:
        fp.write('9\n')

    # Same as previous test, but square.py returns nonzero
    job = create_job(name='square_testjob2', app='square',
                     args='side0.dat --retcode 1',
                     url_in=f'local:{remote_dir.name}',
                     stage_out_files='square*',
                     url_out=f'local:{remote_dir.name}',
                     post_error_handler=True)
    self.assertEqual(job.application_args, 'side0.dat --retcode 1')
    self.assertEqual(BalsamJob.objects.all().count(), 1)

    # The job finished successfully despite a nonzero return code
    success = util.run_launcher_until_state(job, 'JOB_FINISHED')
    self.assertTrue(success)

    # Make sure that at some point it was marked with RUN_ERROR
    self.assertIn('RUN_ERROR', job.state_history)

    # It was saved by the postprocessor:
    self.assertIn('handled error in square_post', job.state_history)

    # We can also check the postprocessor stdout:
    post_contents = job.read_file_in_workdir('postprocess.log')
    self.assertIn("recognized error", post_contents)
    self.assertIn("Invoked to handle RUN_ERROR", post_contents)

    # job id sanity check
    jobid_line = [l for l in post_contents.split('\n') if 'jobid' in l][0]
    self.assertIn(str(job.pk), jobid_line)

def test_dynamic(self):
    '''test dynamic generation of child jobs'''
    # The parent will create between 4 and 8 child jobs in the course
    # of its post-processing step:
    NUM_SIDES, NUM_RANKS = random.randint(4, 8), 1
    pre = (self.apps['make_sides'].default_preprocess
           + f' {NUM_SIDES} {NUM_RANKS}')
    post = self.apps['make_sides'].default_postprocess + ' --dynamic-spawn'
    parent = create_job(name='make_sides', app='make_sides',
                        preproc=pre, postproc=post)

    # The final reduce job will depend on all these spawned child jobs, but
    # they do not exist yet! We will allow these dependencies to be
    # established dynamically; for now the reduce step just depends on the
    # top-level parent of the tree.
    remote_dir = tempfile.TemporaryDirectory(prefix="remote")
    reduce_job = create_job(name='sum_squares', app='reduce',
                            input_files="square*.dat*",
                            url_out=f'local:{remote_dir.name}',
                            stage_out_files='summary?.dat *.out')
    reduce_job.set_parents([parent])

    # Run the entire DAG until finished
    success = util.run_launcher_until_state(reduce_job, 'JOB_FINISHED',
                                            timeout=200.0)
    self.assertTrue(success)
    for job in BalsamJob.objects.all():
        self.assertEqual(job.state, 'JOB_FINISHED')

    # Double-check the calculation result, thereby testing the flow of data
    workdir = parent.working_directory
    files = (os.path.join(workdir, f"side{i}.dat") for i in range(NUM_SIDES))
    sides = [float(open(f).read()) for f in files]
    self.assertTrue(all(0.5 <= s <= 5.0 for s in sides))
    expected_result = sum(s**2 for s in sides)

    resultpath = os.path.join(remote_dir.name, 'sum_squares.out')
    result = open(resultpath).read()
    self.assertIn('Total area:', result)
    result_line = [l for l in result.split('\n') if 'Total area:' in l][0]
    result = float(result_line.split()[-1])
    self.assertAlmostEqual(result, expected_result)

    # Checking the post-processor log, we see that those jobs were actually
    # spawned.
    post_contents = parent.read_file_in_workdir('postprocess.log')
    for i in range(NUM_SIDES):
        self.assertIn(f'spawned square{i} job', post_contents)

    # The spawned jobs' state histories confirm this.
    square_jobs = BalsamJob.objects.filter(name__startswith='square')
    self.assertEqual(square_jobs.count(), NUM_SIDES)
    for job in square_jobs:
        self.assertIn(f'spawned by {parent.cute_id}', job.state_history)

    # Make sure that the correct number of dependencies were created for
    # the reduce job: one for each dynamically spawned job (plus the
    # original parent)
    self.assertEqual(reduce_job.get_parents().count(), NUM_SIDES + 1)

def test_parent_timeout(self):
    '''timeout handling (with rescue job) in a DAG'''
    # Same DAG triplet as above: one parent with 2 children A & B
    NUM_SIDES, NUM_RANKS = 2, 1
    pre = (self.apps['make_sides'].default_preprocess
           + f' {NUM_SIDES} {NUM_RANKS}')
    parent = create_job(name='make_sides', app='make_sides', preproc=pre,
                        args='--sleep 30', post_timeout_handler=True)
    chA = create_job(name='square0', app='square', args='side0.dat',
                     input_files='side0.dat')
    chB = create_job(name='square1', app='square', args='side1.dat',
                     input_files='side1.dat')
    chA.set_parents([parent])
    chB.set_parents([parent])

    # We run the launcher and kill it once the parent starts running
    success = util.run_launcher_until_state(parent, 'RUNNING')
    self.assertTrue(success)

    # Parent timed out
    def check():
        parent.refresh_from_db()
        return parent.state == 'RUN_TIMEOUT'

    success = util.poll_until_returns_true(check, timeout=12)
    self.assertTrue(success)

    parent.refresh_from_db()
    chA.refresh_from_db()
    chB.refresh_from_db()
    self.assertEqual(parent.state, 'RUN_TIMEOUT')
    self.assertEqual(chA.state, 'AWAITING_PARENTS')
    self.assertEqual(chB.state, 'AWAITING_PARENTS')

    # On re-run, everything finishes okay
    def check():
        chA.refresh_from_db()
        chB.refresh_from_db()
        return chA.state == 'JOB_FINISHED' and chB.state == 'JOB_FINISHED'

    success = util.run_launcher_until(check, timeout=120)

    parent.refresh_from_db()
    chA.refresh_from_db()
    chB.refresh_from_db()
    self.assertEqual(parent.state, 'JOB_FINISHED')
    self.assertEqual(chA.state, 'JOB_FINISHED')
    self.assertEqual(chB.state, 'JOB_FINISHED')

    # What happened: a rescue job was created by the timeout handler and
    # ran in the second launcher invocation
    jobs = BalsamJob.objects.all()
    self.assertEqual(jobs.count(), 4)

    # This rescue job was made to be the parent of A and B
    rescue_job = chB.get_parents().first()
    self.assertEqual(rescue_job.state, 'JOB_FINISHED')

    # The job state history shows how this happened:
    self.assertIn(f'spawned by {parent.cute_id}', rescue_job.state_history)
    self.assertIn(f'spawned rescue job {rescue_job.cute_id}',
                  parent.state_history)

    # It happened during the post-processing step:
    post_log = parent.read_file_in_workdir('postprocess.log')
    self.assertIn('Creating rescue job', post_log)

    # Data flow correct:
    self.triplet_data_check(rescue_job, chA, chB)

def test_normal(self):
    '''normal processing of a single job'''
    # A mock "remote" data source has a file side0.dat
    # This file contains the side length of a square: 9
    remote_dir = tempfile.TemporaryDirectory(prefix="remote")
    remote_path = os.path.join(remote_dir.name, 'side0.dat')
    with open(remote_path, 'w') as fp:
        fp.write('9\n')

    job = create_job(name='square_testjob', app='square',
                     url_in=f'local:{remote_dir.name}',
                     stage_out_files='square*',
                     url_out=f'local:{remote_dir.name}',
                     args='')

    # Sanity check test case isolation
    self.assertEqual(job.state, 'CREATED')
    self.assertEqual(job.application_args, '')
    self.assertEqual(BalsamJob.objects.all().count(), 1)

    # Run the launcher and make sure that the job gets carried all the way
    # through to completion
    success = util.run_launcher_until_state(job, 'JOB_FINISHED')
    self.assertTrue(success)

    # The job staged in the remote side0.dat file; it contains "9"
    staged_in_file_contents = job.read_file_in_workdir('side0.dat')
    self.assertIn('9\n', staged_in_file_contents)

    # Preprocess script actually ran:
    preproc_out_contents = job.read_file_in_workdir('preprocess.log')

    # Preprocess inherited the correct job from the environment:
    jobid_line = [l for l in preproc_out_contents.split('\n') if 'jobid' in l][0]
    self.assertIn(str(job.pk), jobid_line)

    # Preprocess recognized the side0.dat file
    # and altered the job application_args accordingly:
    self.assertIn('set square.py input to side0.dat', preproc_out_contents)
    self.assertIn('side0.dat', job.application_args)

    # Application stdout was written to the job's .out file
    app_stdout = job.read_file_in_workdir('square_testjob.out')
    self.assertIn("Hello from square", app_stdout)

    # The square.py app wrote its result to square.dat
    app_outfile = job.read_file_in_workdir('square.dat')

    # The result of squaring 9 is 81
    result = float(app_outfile)
    self.assertEqual(result, 81.0)

    # The job finished normally, so square_post.py just said hello
    post_contents = job.read_file_in_workdir('postprocess.log')
    jobid_line = [l for l in post_contents.split('\n') if 'jobid' in l][0]
    self.assertIn(str(job.pk), jobid_line)
    self.assertIn('hello from square_post', post_contents)

    # After stage-out, the remote directory contains two new files that
    # matched the pattern square*: square.dat and square_testjob.out
    remote_square = os.path.join(remote_dir.name, 'square.dat')
    remote_stdout = os.path.join(remote_dir.name, 'square_testjob.out')
    self.assertTrue(os.path.exists(remote_square))
    self.assertTrue(os.path.exists(remote_stdout))

    result_remote = float(open(remote_square).read())
    self.assertEqual(result_remote, 81.0)
    self.assertIn("Hello from square", open(remote_stdout).read())