def wait_for_deletion(client, timeout_secs):
    """Wait for job deletion to complete.

    Polls list_jobs() every 2 seconds until it reports no jobs, or until
    timeout_secs have elapsed.

    Args:
        client: Peloton API client handed to the StatelessJob wrappers.
        timeout_secs: Maximum number of seconds to keep polling.

    Raises:
        AssertionError: if jobs still exist once the deadline passes.
    """
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        try:
            jobs = [
                StatelessJob(job_id=s.job_id.value, client=client)
                for s in list_jobs()
            ]
            if not jobs:
                return
            time.sleep(2)
        except grpc.RpcError as e:
            # Catch "not-found" error here because QueryJobs endpoint does
            # two db queries in sequence: "QueryJobs" and "GetUpdate".
            # However, when we delete a job, updates are deleted first,
            # there is a slight chance QueryJobs will fail to query the
            # update, returning "not-found" error.
            if e.code() == grpc.StatusCode.NOT_FOUND:
                time.sleep(2)
                continue
            raise
    # Raise explicitly rather than `assert False`: asserts are stripped
    # when Python runs with -O, which would make a timeout fall through
    # and return as if the wait had succeeded. AssertionError is kept so
    # existing callers/test harnesses see the same exception type.
    raise AssertionError("timed out waiting for jobs to be deleted")
def stop_jobs(client):
    """Terminate all batch jobs and stateless jobs via the Peloton API.

    Fetches job summaries from every resource pool, then stops each job
    and blocks until it reaches a terminal state.
    """
    for summary in list_jobs():
        stateless_job = StatelessJob(
            client=client, job_id=summary.job_id.value
        )
        stateless_job.config.max_retry_attempts = 100
        stateless_job.stop()
        stateless_job.wait_for_terminated()
def cleanup_jobs(client, timeout_secs=20):
    """Delete all currently running jobs via the Peloton API.

    Issues a force-delete for every listed job, then blocks until
    wait_for_deletion confirms they are gone (or times out).
    """
    for summary in list_jobs():
        job = StatelessJob(job_id=summary.job_id.value, client=client)
        job.delete(force_delete=True)
    wait_for_deletion(client, timeout_secs)
def test__create_job(stateless_job):
    """Create a stateless job and verify ListJobs / ListPods report it."""
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # ensure ListJobs lists the job
    summaries = list_jobs()
    assert len(summaries) > 0
    matches = [
        s for s in summaries if s.job_id.value == stateless_job.job_id
    ]
    assert matches[0] is not None
    job_summary = matches[0]

    # ensure ListPods lists all the pods of the job
    pod_summaries = stateless_job.list_pods()
    assert len(pod_summaries) == job_summary.instance_count
def test__get_job_update_details__filter_non_update_workflow(client):
    """ test getJobUpdateDetails endpoint for filtering non-update workflows """
    req1 = get_job_update_request("test_dc_labrat_large_job.yaml")
    req1.settings.updateGroupSize = 10
    req2 = get_job_update_request("test_dc_labrat_large_job_diff_labels.yaml")
    req2.settings.updateGroupSize = 10

    # start a regular update
    job_key = start_job_update(
        client, req1, "start job update test/dc/labrat_large_job"
    )

    # trigger an unexpected restart through peloton api
    job_summaries = list_jobs()
    assert len(job_summaries) == 1
    restarted_job = StatelessJob(job_id=job_summaries[0].job_id.value)
    restarted_job.restart(batch_size=10)
    # wait for restart
    restarted_job.wait_for_workflow_state(goal_state="SUCCEEDED")

    # start a new update
    start_job_update(client, req2, "start job update test/dc/labrat_large_job")

    # verify getJobUpdateDetails response
    res = client.get_job_update_details(
        None, api.JobUpdateQuery(role=job_key.role)
    )
    assert len(res.detailsList) == 2
    for idx, detail in enumerate(res.detailsList):
        if idx == 0:
            # newest update: initial state must carry task metadata
            assert len(detail.update.instructions.initialState) > 0
            for initial in detail.update.instructions.initialState:
                assert initial.task.metadata, 'Expect metadata to be present'
        else:
            assert len(detail.update.instructions.initialState) == 0
def _list_jobs():
    """Return a StatelessJob wrapper for every job summary from list_jobs()."""
    wrapped = []
    for summary in list_jobs():
        wrapped.append(StatelessJob(job_id=summary.job_id.value))
    return wrapped