Ejemplo n.º 1
0
def _stop_gke_job(j: Job) -> bool:
  '''stops a running gke job

  see:
  https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#delete-job-v1-batch

  Args:
  j: job to stop

  Returns:
  True on success, False otherwise
  '''

  cluster_name = j.details['cluster_name']
  job_name = get_gke_job_name(j)

  # bail out early if we cannot reach the cluster that owns this job
  cluster = get_job_cluster(j)
  if cluster is None:
    logging.error(f'unable to connect to cluster {cluster_name}, '
                  f'so unable to delete job {job_name}')
    return False

  ok = cluster.delete_job(job_name=job_name)
  if not ok:
    return ok

  # gke deletes the job completely, so we can't then query its status later
  # thus if the request went through ok, then we mark as stopped
  j.status = JobStatus.STOPPED
  return ok
Ejemplo n.º 2
0
    def submit_job(
        self,
        job_spec: JobSpec,
        name: str,
        labels: Optional[Dict[str, str]] = None,
    ) -> Optional[Job]:
        '''submits a job to the cluster based on the given job spec

        Returns a SUBMITTED Job record on success, None if submission failed.
        '''

        v1job = self.create_v1job(job_spec=job_spec, name=name, labels=labels)
        submitted = self.submit_v1job(v1job)
        image = job_spec.spec['template']['spec']['containers'][0]['image']

        if submitted is None:
            return None

        # record enough cluster metadata to locate this job later
        return Job(
            spec=job_spec,
            container=image,
            details={
                'cluster_name': self.name,
                'project_id': self.project_id,
                'cluster_zone': self.zone,
                'job': ApiClient().sanitize_for_serialization(submitted),
            },
            status=JobStatus.SUBMITTED,
        )
Ejemplo n.º 3
0
def update_job_status(j: Job) -> JobStatus:
  '''updates and returns job status

  Args:
  j: job whose status should be refreshed

  Returns:
  current status for this job
  '''

  # terminal states never change, so skip any remote query
  if j.status is not None and j.status.is_terminal():
    return j.status

  # local jobs have no backing service to poll; status is whatever we have
  if j.spec.platform == Platform.LOCAL:
    return j.status

  if j.spec.platform == Platform.CAIP:
    j.status = get_caip_job_status(j)
    return j.status

  if j.spec.platform == Platform.GKE:
    j.status = get_gke_job_status(j)
    return j.status

  # fix: original message lacked the f prefix (so nothing interpolated) and
  # referenced j.platform, which does not exist — platform lives on j.spec
  assert False, f"can't get job status for platform {j.spec.platform.name}"
Ejemplo n.º 4
0
def test_job(engine: Engine):
    '''exercises basic Job creation and round-trip persistence'''

    exp_args = ['a', 4]
    exp_kwargs = {'k0': 0, 'k1': 'xyz'}
    spec_data = {'a': 2, 'b': [0, 1, 2], 'c': {'x': 1, 'y': 'foo'}}
    container_data = {
        'nogpu': True,
        'cloud_key': '/path/to/key.json',
        'image_tag': None,
        'dir': ['/extra/path0', '/extra/path2'],
        'base_dir': '/home/foo',
        'module': 'train.py'
    }

    # test basic job creation
    with session_scope(engine) as session:
        group = ExperimentGroup()
        cspec = ContainerSpec.get_or_create(session=session,
                                            spec=container_data)
        experiment = Experiment.get_or_create(
            xgroup=group,
            container_spec=cspec,
            args=exp_args,
            kwargs=exp_kwargs,
        )
        jspec = JobSpec.get_or_create(
            experiment=experiment,
            spec=spec_data,
            platform=Platform.TEST,
        )
        job = Job(spec=jspec, container='container0', details={'job_id': 123})
        session.add(experiment)

    # test job persistence
    with session_scope(engine) as session:
        stored = session.query(Job).all()
        assert len(stored) == 1

        persisted = stored[0]
        assert persisted.container == 'container0'
        assert persisted.experiment.args == exp_args
        assert persisted.experiment.kwargs == exp_kwargs
        assert persisted.spec.spec == spec_data
        assert persisted.details['job_id'] == 123
Ejemplo n.º 5
0
def execute_jobs(
    job_specs: Iterable[JobSpec],
    dry_run: bool = False,
    caliban_config: Optional[Dict[str, Any]] = None,
):
    '''executes a sequence of jobs based on job specs

    Args:
    job_specs: specifications for jobs to be executed
    dry_run: if True, only print what would be done
    caliban_config: caliban configuration data
    '''
    caliban_config = caliban_config or {}

    # fix: the parameter is an Iterable, but len() below requires a sized
    # collection — materialize once so generators work and tqdm gets a total
    job_specs = list(job_specs)

    with ut.tqdm_logging() as orig_stream:
        pbar = tqdm.tqdm(logged_job_specs(job_specs),
                         file=orig_stream,
                         total=len(job_specs),
                         ascii=True,
                         unit="experiment",
                         desc="Executing")
        for idx, job_spec in enumerate(pbar, 1):
            command = job_spec.spec['command']
            logging.info(f'Running command: {" ".join(command)}')
            if not dry_run:
                _, ret_code = ufs.capture_stdout(command, "",
                                                 ut.TqdmFile(sys.stderr))
            else:
                ret_code = 0
            # record one Job per executed spec, marking success by exit code
            j = Job(spec=job_spec,
                    container=job_spec.spec['container'],
                    details={'ret_code': ret_code},
                    status=JobStatus.SUCCEEDED
                    if ret_code == 0 else JobStatus.FAILED)
            local_callback(idx=idx, job=j)

    if dry_run:
        logging.info(
            t.yellow(f'\nTo build your image and execute these jobs, '
                     f'run your command again without {c.DRY_RUN_FLAG}\n'))

    return None