def _stop_gke_job(j: Job) -> bool:
  '''stops a running gke job

  see:
  https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#delete-job-v1-batch

  Args:
    j: job to stop

  Returns:
    True on success, False otherwise
  '''
  cluster_name = j.details['cluster_name']
  job_name = get_gke_job_name(j)
  cluster = get_job_cluster(j)

  if cluster is None:
    logging.error(f'unable to connect to cluster {cluster_name}, '
                  f'so unable to delete job {job_name}')
    return False

  status = cluster.delete_job(job_name=job_name)

  # gke deletes the job completely, so we can't query its status later;
  # thus if the request went through ok, we mark the job as stopped
  if status:
    j.status = JobStatus.STOPPED

  return status
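# For reference, a minimal sketch of what a cluster's delete_job call can map
# to in the official `kubernetes` Python client. This is an illustration under
# stated assumptions, not the project's actual Cluster.delete_job: the helper
# name, namespace default, and propagation policy are all hypothetical choices.
from kubernetes import client


def _delete_k8s_job_sketch(batch_api: client.BatchV1Api,
                           job_name: str,
                           namespace: str = 'default') -> bool:
  '''deletes the named k8s batch job, returning True on success'''
  try:
    # 'Foreground' propagation deletes the job's pods along with the job
    batch_api.delete_namespaced_job(
        name=job_name,
        namespace=namespace,
        body=client.V1DeleteOptions(propagation_policy='Foreground'),
    )
    return True
  except client.rest.ApiException:
    return False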
def submit_job(
    self,
    job_spec: JobSpec,
    name: str,
    labels: Optional[Dict[str, str]] = None,
) -> Optional[Job]:
  '''submits a job to the cluster based on the given job spec'''
  v1job = self.create_v1job(job_spec=job_spec, name=name, labels=labels)
  submitted = self.submit_v1job(v1job)
  container = job_spec.spec['template']['spec']['containers'][0]['image']

  if submitted is not None:
    details = {
        'cluster_name': self.name,
        'project_id': self.project_id,
        'cluster_zone': self.zone,
        'job': ApiClient().sanitize_for_serialization(submitted),
    }

    return Job(
        spec=job_spec,
        container=container,
        details=details,
        status=JobStatus.SUBMITTED,
    )

  return None
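# Hypothetical usage sketch showing submit_job's return contract: a Job in
# SUBMITTED state on success, None on failure. `cluster` stands in for an
# already-connected cluster instance and `spec` for a JobSpec built elsewhere;
# the helper name and label values are assumptions, not part of this module.
def _submit_one_sketch(cluster, spec: JobSpec) -> Optional[Job]:
  '''illustrates checking submit_job's optional return value'''
  job = cluster.submit_job(job_spec=spec,
                           name='training-run-0',
                           labels={'experiment': 'baseline'})
  if job is None:
    logging.error('job submission failed')
  else:
    logging.info(f'submitted job with status {job.status.name}')
  return job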
def update_job_status(j: Job) -> JobStatus:
  '''updates and returns job status

  Returns:
    current status for this job
  '''
  if j.status is not None and j.status.is_terminal():
    return j.status

  # local jobs have their final status recorded at execution time
  if j.spec.platform == Platform.LOCAL:
    return j.status

  if j.spec.platform == Platform.CAIP:
    j.status = get_caip_job_status(j)
    return j.status

  if j.spec.platform == Platform.GKE:
    j.status = get_gke_job_status(j)
    return j.status

  assert False, f"can't get job status for platform {j.spec.platform.name}"
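# Callers that need a final outcome can poll update_job_status until the job
# reaches a terminal state. A minimal sketch; the helper name and the poll
# interval are assumptions, not part of this module.
import time


def _wait_for_job_sketch(j: Job, poll_interval_sec: float = 30.0) -> JobStatus:
  '''polls job status until the job reaches a terminal state'''
  status = update_job_status(j)
  while not status.is_terminal():
    time.sleep(poll_interval_sec)
    status = update_job_status(j)
  return status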
def test_job(engine: Engine):
  args = ['a', 4]
  kwargs = {'k0': 0, 'k1': 'xyz'}
  job_spec = {'a': 2, 'b': [0, 1, 2], 'c': {'x': 1, 'y': 'foo'}}
  container_spec = {
      'nogpu': True,
      'cloud_key': '/path/to/key.json',
      'image_tag': None,
      'dir': ['/extra/path0', '/extra/path2'],
      'base_dir': '/home/foo',
      'module': 'train.py'
  }

  # test basic job creation
  with session_scope(engine) as session:
    xg = ExperimentGroup()
    c = ContainerSpec.get_or_create(session=session, spec=container_spec)
    e = Experiment.get_or_create(
        xgroup=xg,
        container_spec=c,
        args=args,
        kwargs=kwargs,
    )
    jspec = JobSpec.get_or_create(
        experiment=e,
        spec=job_spec,
        platform=Platform.TEST,
    )
    job = Job(spec=jspec, container='container0', details={'job_id': 123})
    # adding the experiment also persists the attached job spec and job
    # via relationship cascades, as the queries below verify
    session.add(e)

  # test job persistence
  with session_scope(engine) as session:
    j = session.query(Job).all()
    assert len(j) == 1

    j = j[0]
    assert j.container == 'container0'
    assert j.experiment.args == args
    assert j.experiment.kwargs == kwargs
    assert j.spec.spec == job_spec
    assert j.details['job_id'] == 123
def execute_jobs(
    job_specs: List[JobSpec],
    dry_run: bool = False,
    caliban_config: Optional[Dict[str, Any]] = None,
):
  '''executes a sequence of jobs based on job specs

  Args:
    job_specs: specifications for jobs to be executed
    dry_run: if True, only print what would be done
    caliban_config: caliban configuration data
  '''
  caliban_config = caliban_config or {}

  with ut.tqdm_logging() as orig_stream:
    pbar = tqdm.tqdm(logged_job_specs(job_specs),
                     file=orig_stream,
                     total=len(job_specs),
                     ascii=True,
                     unit="experiment",
                     desc="Executing")
    for idx, job_spec in enumerate(pbar, 1):
      command = job_spec.spec['command']
      logging.info(f'Running command: {" ".join(command)}')
      if not dry_run:
        _, ret_code = ufs.capture_stdout(command, "", ut.TqdmFile(sys.stderr))
      else:
        ret_code = 0
      j = Job(spec=job_spec,
              container=job_spec.spec['container'],
              details={'ret_code': ret_code},
              status=(JobStatus.SUCCEEDED
                      if ret_code == 0 else JobStatus.FAILED))
      local_callback(idx=idx, job=j)

  if dry_run:
    logging.info(
        t.yellow(f'\nTo build your image and execute these jobs, '
                 f'run your command again without {c.DRY_RUN_FLAG}\n'))

  return None
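# Hypothetical call site: with dry_run=True the loop logs each command and
# records a zero return code without executing anything, so the same specs can
# be previewed before a real run. `specs` is an assumed list of JobSpec rows
# loaded elsewhere, not a name defined in this module.
#
#   execute_jobs(job_specs=specs, dry_run=True)   # preview only
#   execute_jobs(job_specs=specs, dry_run=False)  # run each command in order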