def validate_spec(session) -> ContainerSpec:
  '''asserts that exactly one ContainerSpec row exists in the backing store,
  that it matches the expected spec and user, and returns it

  NOTE(review): `spec` is a free variable here -- this helper appears to
  depend on an enclosing scope (likely a test fixture) that defines the
  expected spec dict; confirm before reusing elsewhere.  `current_user()` is
  a module-level helper that detects the active username.

  Args:
    session: sqlalchemy session to query

  Returns:
    the single ContainerSpec row
  '''
  # fetch all rows; there must be exactly one for the store to be valid
  s = session.query(ContainerSpec).all()
  assert len(s) == 1
  s = s[0]
  # the stored spec and user must match the expected values
  assert s.spec == spec
  assert s.user == current_user()
  return s
def generate_name(
    cls,
    user: Optional[str] = None,
    date: Optional[datetime] = None,
) -> str:
  '''Generate a default name for an experiment group.

  Falls back to the auto-detected username and the current local time for
  any argument that is not supplied.

  Args:
    user: username to embed in the name; auto-detected when falsy
    date: timestamp to embed in the name; current time when falsy

  Returns:
    a name of the form '<user>-xgroup-<YYYY-mm-dd-HH-MM-SS>'
  '''
  if not user:
    user = current_user()
  if not date:
    date = datetime.now().astimezone()
  timestamp = date.strftime("%Y-%m-%d-%H-%M-%S")
  return '{}-xgroup-{}'.format(user, timestamp)
def stop(args: Dict[str, Any]) -> None:
  '''executes the `caliban stop` cli command

  Queries the backing store for SUBMITTED/RUNNING jobs (optionally limited
  to an experiment group), re-checks their live status, shows them, and --
  after user confirmation -- requests cancellation of each one.

  Args:
    args: parsed cli args; reads 'xgroup' (optional experiment-group name
      filter) and 'dry_run' (if True, only print what would be stopped)
  '''
  # fix: removed unused local `user = current_user()` and dropped f-string
  # prefixes from messages with no placeholders
  xgroup = args.get('xgroup')
  dry_run = args.get('dry_run', False)

  with session_scope(get_sql_engine()) as session:
    running_jobs = session.query(Job).join(Experiment).join(
        ExperimentGroup).filter(
            or_(Job.status == JobStatus.SUBMITTED,
                Job.status == JobStatus.RUNNING))
    if xgroup is not None:
      running_jobs = running_jobs.filter(ExperimentGroup.name == xgroup)
    running_jobs = running_jobs.all()

    if len(running_jobs) == 0:
      logging.info('no running jobs found')
      return

    # this is necessary to filter out jobs that have finished but whose status
    # has not yet been updated in the backing store
    running_jobs = list(
        filter(
            lambda x: update_job_status(x) in
            [JobStatus.SUBMITTED, JobStatus.RUNNING], running_jobs))

    logging.info('the following jobs would be stopped:')
    for j in running_jobs:
      logging.info(_experiment_command_str(j.experiment))
      logging.info(f' job {_job_str(j)}')

    if dry_run:
      logging.info('to actually stop these jobs, re-run the command without '
                   'the --dry_run flag')
      return

    # make sure
    if not user_verify(f'do you wish to stop these {len(running_jobs)} jobs?',
                       False):
      return

    for j in running_jobs:
      logging.info(f'stopping job: {_job_str(j)}')
      stop_job(j)

    logging.info(
        'requested job cancellation, please be patient as it may take '
        'a short while for this status change to be reflected in the '
        'gcp dashboard or from the `caliban status` command.')
def get_status(args: Dict[str, Any]) -> None:
  '''executes the `caliban status` cli command

  Args:
    args: parsed cli args; reads 'xgroup', 'max_jobs', and 'user'
  '''
  requested_user = args.get('user')
  user = requested_user if requested_user else current_user()
  max_jobs = args.get('max_jobs')
  xgroup = args.get('xgroup')

  # with an explicit experiment group, show that group; otherwise show the
  # user's recent jobs
  if xgroup is not None:
    _display_xgroup(xgroup, user, max_jobs)
  else:
    _display_recent_jobs(user, max_jobs)
def __init__(
    self,
    name: Optional[str] = None,
    user: Optional[str] = None,
):
  '''ExperimentGroup

  Args:
    name: name for this experiment group; auto-generated when None
    user: username; auto-detected when None
  '''
  owner = user if user else current_user()
  creation_time = datetime.now().astimezone()
  self.user = owner
  self.created = creation_time
  # the generated default name embeds the owner and creation timestamp
  self.name = name if name else self.generate_name(owner, creation_time)
def __init__(
    self,
    spec: Dict[str, Any],
    user: Optional[str] = None,
):
  '''ContainerSpec

  Args:
    spec: dictionary containing docker container creation parameters
    user: username; automatically detected when None
  '''
  self.user = user if user else current_user()
  # store with sorted keys so equal specs compare/serialize identically
  self.spec = sorted_dict(spec)
  self.created = datetime.now().astimezone()
def stop(args: Dict[str, Any]) -> None:
  '''executes the `caliban stop` cli command

  Confirms with the user up-front, then queries the backing store for
  SUBMITTED/RUNNING jobs (optionally limited to an experiment group), shows
  them, and requests cancellation of each one.

  Args:
    args: parsed cli args; reads 'xgroup' (optional experiment-group name
      filter) and 'dry_run' (if True, only print what would be stopped)
  '''
  # fix: removed unused local `user = current_user()` and dropped f-string
  # prefixes from messages with no placeholders
  xgroup = args.get('xgroup')
  dry_run = args.get('dry_run', False)

  # querying and stopping jobs can take a long time, especially on CAIP,
  # so we check with the user up-front rather than waiting for our full
  # query to return
  if (not dry_run and not user_verify(
      'Warning: this will potentially stop many jobs, do you wish to continue?',
      False)):
    return

  with session_scope(get_sql_engine()) as session:
    running_jobs = session.query(Job).join(Experiment).join(
        ExperimentGroup).filter(
            or_(Job.status == JobStatus.SUBMITTED,
                Job.status == JobStatus.RUNNING))
    if xgroup is not None:
      running_jobs = running_jobs.filter(ExperimentGroup.name == xgroup)
    running_jobs = running_jobs.all()

    if len(running_jobs) == 0:
      logging.info('no running jobs found')
      return

    logging.info('the following jobs will be stopped:')
    for j in running_jobs:
      logging.info(_experiment_command_str(j.experiment))
      logging.info(f' job {_job_str(j)}')

    if dry_run:
      logging.info('to actually stop these jobs, re-run the command without '
                   'the --dry_run flag')
      return

    for j in running_jobs:
      logging.info(f'stopping job: {_job_str(j)}')
      stop_job(j)

    logging.info(
        'requested job cancellation, please be patient as it may take '
        'a short while for this status change to be reflected in the '
        'gcp dashboard or from the `caliban status` command.')
def __init__(
    self,
    spec: JobSpec,
    container: str,
    details: Dict[str, Any],
    status: Optional[JobStatus] = JobStatus.SUBMITTED,
    user: Optional[str] = None,
):
  '''Job

  Args:
    spec: job spec this job executes; the new job registers itself with
      both the spec and the spec's experiment
    container: container id for this job
    details: job- and platform-specific details for job
    status: initial status for this job
    user: user who created this job; auto-detected when None
  '''
  self.created = datetime.now().astimezone()
  self.container = container
  # sorted for stable serialization; 'metadata' is reserved by sqlalchemy
  self.details = sorted_dict(details)
  self.status = status
  self.user = user if user else current_user()
  # link this job into its spec and the owning experiment
  spec.jobs.append(self)
  spec.experiment.jobs.append(self)
def submit_ml_job(job_mode: conf.JobMode,
                  docker_args: Dict[str, Any],
                  region: ct.Region,
                  project_id: str,
                  credentials_path: Optional[str] = None,
                  dry_run: bool = False,
                  job_name: Optional[str] = None,
                  machine_type: Optional[ct.MachineType] = None,
                  gpu_spec: Optional[ct.GPUSpec] = None,
                  tpu_spec: Optional[ct.TPUSpec] = None,
                  image_tag: Optional[str] = None,
                  labels: Optional[Dict[str, str]] = None,
                  experiment_config: Optional[conf.ExpConf] = None,
                  script_args: Optional[List[str]] = None,
                  request_retries: Optional[int] = None,
                  xgroup: Optional[str] = None) -> None:
  """Top level function in the module. This function:

  - builds an image using the supplied docker_args, in either CPU or GPU mode
  - pushes that image to the Cloud Container Repository of the supplied
    project_id
  - generates a sequence of 'JobSpec' instances, one for every combination in
    the supplied experiment_config, and
  - batch-submits all jobs to AI Platform

  Keyword args:
  - job_mode: caliban.config.JobMode.
  - docker_args: these arguments are passed through to
    caliban.docker.build_image.
  - region: the region to use for AI Platform job submission. Different
    regions support different GPUs.
  - project_id: GCloud project ID for container storage and job submission.
  - credentials_path: explicit path to a service account JSON file, if it
    exists.
  - dry_run: if True, no actual jobs will be submitted and docker won't
    actually build; logging side effects will show the user what will happen
    without dry_run=True.
  - job_name: optional custom name. This is applied as a label to every job,
    and used as a prefix for all jobIds submitted to Cloud.
  - machine_type: the machine type to allocate for each job. Must be one
    supported by Cloud.
  - gpu_spec: if None and job_mode is GPU, defaults to a standard single GPU.
    Else, configures the count and type of GPUs to attach to the machine that
    runs each job.
  - tpu_spec: if None, defaults to no TPU attached. Else, configures the
    count and type of TPUs to attach to the machine that runs each job.
  - image_tag: optional explicit tag of a Container-Registry-available Docker
    container. If supplied, submit_ml_job will skip the docker build and push
    phases and use this image_tag directly.
  - labels: dictionary of KV pairs to apply to each job. User args will also
    be applied as labels, plus a few default labels supplied by Caliban.
  - experiment_config: dict of string to list, boolean, string or int. Any
    lists will trigger a cartesian product out with the rest of the config. A
    job will be submitted for every combination of parameters in the
    experiment config.
  - script_args: these are extra arguments that will be passed to every job
    executed, in addition to the arguments created by expanding out the
    experiment config.
  - request_retries: the number of times to retry each request if it fails
    for a timeout or a rate limiting request.
  - xgroup: experiment group for this submission, if None a new group will
    be created
  """
  # normalize all optional arguments to concrete defaults
  if script_args is None:
    script_args = []

  if job_name is None:
    job_name = "caliban_{}".format(u.current_user())

  # GPU mode without an explicit spec gets a single P100 by default
  if job_mode == conf.JobMode.GPU and gpu_spec is None:
    gpu_spec = ct.GPUSpec(ct.GPU.P100, 1)

  if machine_type is None:
    machine_type = conf.DEFAULT_MACHINE_TYPE[job_mode]

  if experiment_config is None:
    experiment_config = {}

  if labels is None:
    labels = {}

  if request_retries is None:
    request_retries = 10

  # dry runs use an in-memory store so nothing is persisted to the real db
  engine = get_mem_engine() if dry_run else get_sql_engine()

  with session_scope(engine) as session:
    container_spec = generate_container_spec(session, docker_args, image_tag)

    # no explicit tag means we build/push and compute the tag ourselves
    if image_tag is None:
      image_tag = generate_image_tag(project_id, docker_args, dry_run=dry_run)

    # one experiment per combination in the experiment config
    experiments = create_experiments(
        session=session,
        container_spec=container_spec,
        script_args=script_args,
        experiment_config=experiment_config,
        xgroup=xgroup,
    )

    specs = build_job_specs(
        job_name=job_name,
        image_tag=image_tag,
        region=region,
        machine_type=machine_type,
        experiments=experiments,
        user_labels=labels,
        gpu_spec=gpu_spec,
        tpu_spec=tpu_spec,
    )

    if dry_run:
      return execute_dry_run(specs)

    try:
      submit_job_specs(
          specs=specs,
          project_id=project_id,
          credentials_path=credentials_path,
          num_specs=len(experiments),
          request_retries=request_retries,
      )
    except Exception as e:
      # best-effort: log the failure but still persist whatever was
      # submitted before the error
      logging.error(f'exception: {e}')
      session.commit()  # commit here, otherwise will be rolled back

    logging.info("")
    logging.info(
        t.green("Visit {} to see the status of all jobs.".format(
            job_url(project_id, ''))))
    logging.info("")
def _dockerfile_template(
    job_mode: c.JobMode,
    workdir: Optional[str] = None,
    base_image_fn: Optional[Callable[[c.JobMode], str]] = None,
    package: Optional[Union[List, u.Package]] = None,
    requirements_path: Optional[str] = None,
    conda_env_path: Optional[str] = None,
    setup_extras: Optional[List[str]] = None,
    adc_path: Optional[str] = None,
    credentials_path: Optional[str] = None,
    jupyter_version: Optional[str] = None,
    inject_notebook: NotebookInstall = NotebookInstall.none,
    shell: Optional[Shell] = None,
    extra_dirs: Optional[List[str]] = None,
    caliban_config: Optional[Dict[str, Any]] = None) -> str:
  """Returns a Dockerfile that builds on a local CPU or GPU base image
  (depending on the value of job_mode) to create a container that:

  - installs any dependency specified in a requirements.txt file living at
    requirements_path, a conda environment at conda_env_path, or any
    dependencies in a setup.py file, including extra dependencies, if
    setup_extras isn't None
  - injects gcloud credentials into the container, so Cloud interaction works
    just like it does locally
  - potentially installs a custom shell, or jupyterlab for notebook support
  - copies all source needed by the main module specified by package, and
    potentially injects an entrypoint that, on run, will run that main module

  Most functions that call _dockerfile_template pass along any kwargs that
  they receive. It should be enough to add kwargs here, then rely on that
  mechanism to pass them along, vs adding kwargs all the way down the call
  chain.

  Supply a custom base_image_fn (function from job_mode -> image ID) to
  inject more complex Docker commands into the Caliban environments by, for
  example, building your own image on top of the TF base images, then using
  that.
  """
  # the container user mirrors the host user's uid/gid so that mounted
  # volumes keep sane ownership
  uid = os.getuid()
  gid = os.getgid()
  username = u.current_user()

  if isinstance(package, list):
    package = u.Package(*package)

  if workdir is None:
    workdir = DEFAULT_WORKDIR

  if base_image_fn is None:
    base_image_fn = base_image_id

  base_image = base_image_fn(job_mode)

  dockerfile = """
FROM {base_image}

# Create the same group we're using on the host machine.
RUN [ $(getent group {gid}) ] || groupadd --gid {gid} {gid}

# Create the user by name. --no-log-init guards against a crash with large user
# IDs.
RUN useradd --no-log-init --no-create-home -u {uid} -g {gid} --shell /bin/bash {username}

# The directory is created by root. This sets permissions so that any user can
# access the folder.
RUN mkdir -m 777 {workdir} {creds_dir} {c_home}

ENV HOME={c_home}

WORKDIR {workdir}

USER {uid}:{gid}
""".format_map({
      "base_image": base_image,
      "username": username,
      "uid": uid,
      "gid": gid,
      "workdir": workdir,
      "c_home": container_home(),
      "creds_dir": CREDS_DIR
  })
  # each helper below appends its own Dockerfile fragment in order:
  # credentials, dependencies, optional notebook, extra dirs, apt packages,
  # and finally the package entrypoint
  dockerfile += _credentials_entries(uid,
                                     gid,
                                     adc_path=adc_path,
                                     credentials_path=credentials_path)

  dockerfile += _dependency_entries(workdir,
                                    uid,
                                    gid,
                                    requirements_path=requirements_path,
                                    conda_env_path=conda_env_path,
                                    setup_extras=setup_extras)

  if inject_notebook.value != 'none':
    install_lab = inject_notebook == NotebookInstall.lab
    dockerfile += _notebook_entries(lab=install_lab, version=jupyter_version)

  if extra_dirs is not None:
    dockerfile += _extra_dir_entries(workdir, uid, gid, extra_dirs)

  dockerfile += _custom_packages(uid,
                                 gid,
                                 packages=c.apt_packages(
                                     caliban_config, job_mode),
                                 shell=shell)

  if package is not None:
    # The actual entrypoint and final copied code.
    dockerfile += _package_entries(workdir, uid, gid, package)

  return dockerfile
def container_home():
  """Returns the path of the home directory inside the generated container."""
  username = u.current_user()
  return "/home/{}".format(username)
def resubmit(args: Dict[str, Any]) -> None:
  '''executes the `caliban resubmit` command

  Looks up jobs for the given experiment group, rebuilds their containers,
  creates fresh job specs pointing at the new images, and resubmits them
  grouped by platform.

  Args:
    args: parsed cli args; reads 'xgroup' (required), 'dry_run', 'all_jobs',
      'project_id', and 'cloud_key'
  '''
  user = current_user()
  xgroup = args.get('xgroup')
  dry_run = args.get('dry_run', False)
  all_jobs = args.get('all_jobs', False)
  project_id = args.get('project_id')
  creds_file = args.get('cloud_key')
  # NOTE(review): always True here, so the `else` branch building
  # image_id_map from existing containers is currently dead code --
  # presumably a placeholder for a future --no-rebuild flag; confirm.
  rebuild = True

  if xgroup is None:
    logging.error(f'you must specify an experiment group for this command')
    return

  with session_scope(get_sql_engine()) as session:
    jobs = _get_resubmit_jobs(
        session=session,
        xgroup=xgroup,
        user=user,
        all_jobs=all_jobs,
    )
    if jobs is None:
      return

    # if we have CAIP or GKE jobs, then we need to have a project_id
    project_id = _get_resubmit_project_id(jobs, project_id, creds_file)

    # show what would be done
    logging.info(f'the following jobs would be resubmitted:')
    for j in jobs:
      logging.info(_experiment_command_str(j.experiment))
      logging.info(f' job {_job_str(j)}')

    if dry_run:
      logging.info(f'to actually resubmit these jobs, run this command again '
                   f'without the --dry_run flag')
      return

    # make sure
    if not user_verify(f'do you wish to resubmit these {len(jobs)} jobs?',
                       False):
      return

    # rebuild all containers first
    if rebuild:
      logging.info(f'rebuilding containers...')
      image_id_map = _rebuild_containers(jobs, project_id=project_id)
    else:
      image_id_map = {j: j.container for j in jobs}

    # create new job specs
    job_specs = [
        replace_job_spec_image(spec=j.spec, image_id=image_id_map[j])
        for j in jobs
    ]

    # submit jobs, grouped by platform
    for platform in [Platform.CAIP, Platform.GKE, Platform.LOCAL]:
      pspecs = list(filter(lambda x: x.platform == platform, job_specs))
      try:
        submit_job_specs(
            specs=pspecs,
            platform=platform,
            project_id=project_id,
            credentials_path=creds_file,
        )
      except Exception as e:
        # best-effort: persist what succeeded before bailing out
        session.commit()  # avoid rollback
        logging.error(f'there was an error submitting some jobs')
        return
def _mlflow_job_name(index: int, user: str = None) -> str: user = user or u.current_user() timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') return f'{user}-{timestamp}-{index}'