Code Example #1
File: test_history.py Project: sagravat/caliban
def validate_spec(session) -> ContainerSpec:
    s = session.query(ContainerSpec).all()
    assert len(s) == 1
    s = s[0]
    assert s.spec == spec
    assert s.user == current_user()
    return s
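Every example on this page calls current_user(), whose definition is not shown
here. For reference, a minimal sketch of such a helper, assuming it simply
defers to the standard library (the actual caliban implementation may differ):

import getpass

def current_user() -> str:
    # Hypothetical sketch: getpass.getuser() consults the LOGNAME, USER,
    # LNAME and USERNAME environment variables, then the password database.
    return getpass.getuser()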
Code Example #2
File: types.py Project: johnynek/caliban
 def generate_name(
     cls,
     user: Optional[str] = None,
     date: Optional[datetime] = None,
 ) -> str:
   '''generate a default name for an experiment group'''
   user = user or current_user()
   date = date or datetime.now().astimezone()
    return f'{user}-xgroup-{date.strftime("%Y-%m-%d-%H-%M-%S")}'
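A quick usage sketch (the user name and date are made up; example #5 below
shows ExperimentGroup calling this classmethod):

from datetime import datetime

# Passing explicit values bypasses the current_user()/datetime.now() fallbacks,
# making the result deterministic:
name = ExperimentGroup.generate_name(
    user='alice',
    date=datetime(2020, 5, 1, 12, 30, 0).astimezone(),
)
assert name == 'alice-xgroup-2020-05-01-12-30-00'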
Code Example #3
def stop(args: Dict[str, Any]) -> None:
    '''executes the `caliban stop` cli command'''

    user = current_user()
    xgroup = args.get('xgroup')
    dry_run = args.get('dry_run', False)

    with session_scope(get_sql_engine()) as session:
        running_jobs = session.query(Job).join(Experiment).join(
            ExperimentGroup).filter(
                or_(Job.status == JobStatus.SUBMITTED,
                    Job.status == JobStatus.RUNNING))

        if xgroup is not None:
            running_jobs = running_jobs.filter(ExperimentGroup.name == xgroup)

        running_jobs = running_jobs.all()

        if len(running_jobs) == 0:
            logging.info('no running jobs found')
            return

        # this is necessary to filter out jobs that have finished but whose status
        # has not yet been updated in the backing store
        running_jobs = list(
            filter(
                lambda x: update_job_status(x) in
                [JobStatus.SUBMITTED, JobStatus.RUNNING], running_jobs))

        logging.info('the following jobs would be stopped:')
        for j in running_jobs:
            logging.info(_experiment_command_str(j.experiment))
            logging.info(f'    job {_job_str(j)}')

        if dry_run:
            logging.info(
                'to actually stop these jobs, re-run the command without '
                'the --dry_run flag')
            return

        # confirm with the user before actually stopping anything
        if not user_verify(
                f'do you wish to stop these {len(running_jobs)} jobs?', False):
            return

        for j in running_jobs:
            logging.info(f'stopping job: {_job_str(j)}')
            stop_job(j)

        logging.info(
            'requested job cancellation, please be patient as it may take '
            'a short while for this status change to be reflected in the '
            'gcp dashboard or from the `caliban status` command.')
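A hypothetical invocation of this handler with a hand-built args dict (the
argparse layer would normally construct it):

# Preview which jobs in experiment group 'mnist-sweep' would be stopped:
stop({'xgroup': 'mnist-sweep', 'dry_run': True})

# Actually stop them (prompts for confirmation first):
stop({'xgroup': 'mnist-sweep'})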
Code Example #4
def get_status(args: Dict[str, Any]) -> None:
    '''executes the `caliban status` cli command'''

    xgroup = args.get('xgroup')
    max_jobs = args.get('max_jobs')
    user = args.get('user') or current_user()

    if xgroup is None:
        _display_recent_jobs(user, max_jobs)
    else:
        _display_xgroup(xgroup, user, max_jobs)
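Again, a hypothetical call with a hand-built args dict; a missing or None
'user' key falls back to current_user():

# The five most recent jobs for the current user:
get_status({'xgroup': None, 'max_jobs': 5, 'user': None})

# Jobs in a specific experiment group, for another user:
get_status({'xgroup': 'mnist-sweep', 'max_jobs': 10, 'user': 'alice'})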
Code Example #5
File: types.py Project: johnynek/caliban
  def __init__(
      self,
      name: Optional[str] = None,
      user: Optional[str] = None,
  ):
    '''ExperimentGroup

    name: name for this experiment group, if None, a name is auto-generated
    user: username, if None then user is auto-detected
    '''

    self.user = user or current_user()
    self.created = datetime.now().astimezone()
    self.name = name or self.generate_name(self.user, self.created)
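Since both fields fall back to auto-detection, the zero-argument form works
(sketch; the exact timestamp varies):

xg = ExperimentGroup()
# xg.user == current_user()
# xg.name == f'{xg.user}-xgroup-<creation timestamp>', via generate_name above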
Code Example #6
File: types.py Project: johnynek/caliban
  def __init__(
      self,
      spec: Dict[str, Any],
      user: Optional[str] = None,
  ):
    '''ContainerSpec

    Args:
    spec: dictionary containing docker container creation parameters
    user: username, if None then user is automatically detected
    '''
    self.user = user or current_user()
    self.spec = sorted_dict(spec)
    self.created = datetime.now().astimezone()
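The sorted_dict helper is not shown on this page; presumably it normalizes key
order so that equivalent specs serialize and compare identically. A minimal
sketch under that assumption:

from typing import Any, Dict

def sorted_dict(d: Dict[str, Any]) -> Dict[str, Any]:
    # Hypothetical sketch: copy d with keys in sorted order, recursing into
    # nested dicts, so two specs with the same contents are stored identically.
    return {
        k: sorted_dict(v) if isinstance(v, dict) else v
        for k, v in sorted(d.items())
    }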
Code Example #7
def stop(args: Dict[str, Any]) -> None:
  '''executes the `caliban stop` cli command'''

  user = current_user()
  xgroup = args.get('xgroup')
  dry_run = args.get('dry_run', False)

  # querying and stopping jobs can take a long time, especially on CAIP,
  # so we check with the user up-front rather than waiting for our full
  # query to return
  if (not dry_run and not user_verify(
      'Warning: this will potentially stop many jobs, do you wish to continue?',
      False)):
    return

  with session_scope(get_sql_engine()) as session:
    running_jobs = session.query(Job).join(Experiment).join(
        ExperimentGroup).filter(
            or_(Job.status == JobStatus.SUBMITTED,
                Job.status == JobStatus.RUNNING))

    if xgroup is not None:
      running_jobs = running_jobs.filter(ExperimentGroup.name == xgroup)

    running_jobs = running_jobs.all()

    if len(running_jobs) == 0:
      logging.info('no running jobs found')
      return

    logging.info('the following jobs will be stopped:')
    for j in running_jobs:
      logging.info(_experiment_command_str(j.experiment))
      logging.info(f'    job {_job_str(j)}')

    if dry_run:
      logging.info('to actually stop these jobs, re-run the command without '
                   'the --dry_run flag')
      return

    for j in running_jobs:
      logging.info(f'stopping job: {_job_str(j)}')
      stop_job(j)

    logging.info(
        'requested job cancellation, please be patient as it may take '
        'a short while for this status change to be reflected in the '
        'gcp dashboard or from the `caliban status` command.')
Code Example #8
File: types.py Project: johnynek/caliban
  def __init__(
      self,
      spec: JobSpec,
      container: str,
      details: Dict[str, Any],
      status: Optional[JobStatus] = JobStatus.SUBMITTED,
      user: Optional[str] = None,
  ):
    '''Job

    spec: job spec
    container: container id for this job
    details: job- and platform-specific details for job
    status: initial status for this job
    user: user who created this job, if None will be auto-detected
    '''
    self.created = datetime.now().astimezone()
    self.container = container
    self.details = sorted_dict(details)  # 'metadata' is reserved by sqlalchemy
    self.status = status
    self.user = user or current_user()

    spec.jobs.append(self)
    spec.experiment.jobs.append(self)
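A hypothetical construction, assuming a JobSpec instance spec already linked to
an Experiment (both defined elsewhere in types.py):

job = Job(
    spec=spec,
    container='gcr.io/my-project/my-image:latest',
    details={'platform': 'CAIP'},
)
# job.user falls back to current_user(); __init__ appends the new job to both
# spec.jobs and spec.experiment.jobs.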
Code Example #9
def submit_ml_job(job_mode: conf.JobMode,
                  docker_args: Dict[str, Any],
                  region: ct.Region,
                  project_id: str,
                  credentials_path: Optional[str] = None,
                  dry_run: bool = False,
                  job_name: Optional[str] = None,
                  machine_type: Optional[ct.MachineType] = None,
                  gpu_spec: Optional[ct.GPUSpec] = None,
                  tpu_spec: Optional[ct.TPUSpec] = None,
                  image_tag: Optional[str] = None,
                  labels: Optional[Dict[str, str]] = None,
                  experiment_config: Optional[conf.ExpConf] = None,
                  script_args: Optional[List[str]] = None,
                  request_retries: Optional[int] = None,
                  xgroup: Optional[str] = None) -> None:
    """Top level function in the module. This function:

  - builds an image using the supplied docker_args, in either CPU or GPU mode
  - pushes that image to the Cloud Container Repository of the supplied
    project_id
  - generates a sequence of 'JobSpec' instances, one for every combination in
    the supplied experiment_config, and
  - batch-submits all jobs to AI Platform

  Keyword args:

  - job_mode: caliban.config.JobMode.
  - docker_args: these arguments are passed through to
    caliban.docker.build_image.
  - region: the region to use for AI Platform job submission. Different regions
    support different GPUs.
  - project_id: GCloud project ID for container storage and job submission.
  - credentials_path: explicit path to a service account JSON file, if it exists.
  - dry_run: if True, no actual jobs will be submitted and docker won't
    actually build; logging side effects will show the user what will happen
    without dry_run=True.
  - job_name: optional custom name. This is applied as a label to every job,
    and used as a prefix for all jobIds submitted to Cloud.
  - machine_type: the machine type to allocate for each job. Must be one
    supported by Cloud.
  - gpu_spec: if None and job_mode is GPU, defaults to a standard single GPU.
    Else, configures the count and type of GPUs to attach to the machine that
    runs each job.
  - tpu_spec: if None, defaults to no TPU attached. Else, configures the count
    and type of TPUs to attach to the machine that runs each job.
  - image_tag: optional explicit tag of a Container-Registry-available Docker
    container. If supplied, submit_ml_job will skip the docker build and push
    phases and use this image_tag directly.
  - labels: dictionary of KV pairs to apply to each job. User args will also be
    applied as labels, plus a few default labels supplied by Caliban.
  - experiment_config: dict of string to list, boolean, string or int. Any
    lists will trigger a cartesian product out with the rest of the config. A
    job will be submitted for every combination of parameters in the experiment
    config.
  - script_args: these are extra arguments that will be passed to every job
    executed, in addition to the arguments created by expanding out the
    experiment config.
  - request_retries: the number of times to retry each request if it fails for
    a timeout or a rate limiting request.
  - xgroup: experiment group for this submission, if None a new group will
    be created
  """
    if script_args is None:
        script_args = []

    if job_name is None:
        job_name = "caliban_{}".format(u.current_user())

    if job_mode == conf.JobMode.GPU and gpu_spec is None:
        gpu_spec = ct.GPUSpec(ct.GPU.P100, 1)

    if machine_type is None:
        machine_type = conf.DEFAULT_MACHINE_TYPE[job_mode]

    if experiment_config is None:
        experiment_config = {}

    if labels is None:
        labels = {}

    if request_retries is None:
        request_retries = 10

    engine = get_mem_engine() if dry_run else get_sql_engine()

    with session_scope(engine) as session:
        container_spec = generate_container_spec(session, docker_args,
                                                 image_tag)

        if image_tag is None:
            image_tag = generate_image_tag(project_id,
                                           docker_args,
                                           dry_run=dry_run)

        experiments = create_experiments(
            session=session,
            container_spec=container_spec,
            script_args=script_args,
            experiment_config=experiment_config,
            xgroup=xgroup,
        )

        specs = build_job_specs(
            job_name=job_name,
            image_tag=image_tag,
            region=region,
            machine_type=machine_type,
            experiments=experiments,
            user_labels=labels,
            gpu_spec=gpu_spec,
            tpu_spec=tpu_spec,
        )

        if dry_run:
            return execute_dry_run(specs)

        try:
            submit_job_specs(
                specs=specs,
                project_id=project_id,
                credentials_path=credentials_path,
                num_specs=len(experiments),
                request_retries=request_retries,
            )
        except Exception as e:
            logging.error(f'exception: {e}')
            session.commit()  # commit here, otherwise will be rolled back

        logging.info("")
        logging.info(
            t.green("Visit {} to see the status of all jobs.".format(
                job_url(project_id, ''))))
        logging.info("")
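A minimal hypothetical call, sweeping one hyperparameter and previewing with
dry_run (the region value is assumed to be parsed elsewhere, e.g. from CLI
flags or user config):

submit_ml_job(
    job_mode=conf.JobMode.CPU,
    docker_args={},
    region=region,  # a ct.Region value, assumed obtained from config
    project_id='my-gcp-project',
    experiment_config={'lr': [0.01, 0.001], 'batch_size': 64},
    dry_run=True,  # log what would happen without building or submitting
)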
Code Example #10
def _dockerfile_template(
        job_mode: c.JobMode,
        workdir: Optional[str] = None,
        base_image_fn: Optional[Callable[[c.JobMode], str]] = None,
        package: Optional[Union[List, u.Package]] = None,
        requirements_path: Optional[str] = None,
        conda_env_path: Optional[str] = None,
        setup_extras: Optional[List[str]] = None,
        adc_path: Optional[str] = None,
        credentials_path: Optional[str] = None,
        jupyter_version: Optional[str] = None,
        inject_notebook: NotebookInstall = NotebookInstall.none,
        shell: Optional[Shell] = None,
        extra_dirs: Optional[List[str]] = None,
        caliban_config: Optional[Dict[str, Any]] = None) -> str:
    """Returns a Dockerfile that builds on a local CPU or GPU base image (depending
  on the value of job_mode) to create a container that:

  - installs any dependency specified in a requirements.txt file living at
    requirements_path, a conda environment at conda_env_path, or any
    dependencies in a setup.py file, including extra dependencies, if
    setup_extras isn't None
  - injects gcloud credentials into the container, so Cloud interaction works
    just like it does locally
  - potentially installs a custom shell, or jupyterlab for notebook support
  - copies all source needed by the main module specified by package, and
    potentially injects an entrypoint that, on run, will run that main module

  Most functions that call _dockerfile_template pass along any kwargs that they
  receive. It should be enough to add kwargs here, then rely on that mechanism
  to pass them along, vs adding kwargs all the way down the call chain.

  Supply a custom base_image_fn (function from job_mode -> image ID) to inject
  more complex Docker commands into the Caliban environments by, for example,
  building your own image on top of the TF base images, then using that.

  """
    uid = os.getuid()
    gid = os.getgid()
    username = u.current_user()

    if isinstance(package, list):
        package = u.Package(*package)

    if workdir is None:
        workdir = DEFAULT_WORKDIR

    if base_image_fn is None:
        base_image_fn = base_image_id

    base_image = base_image_fn(job_mode)

    dockerfile = """
FROM {base_image}

# Create the same group we're using on the host machine.
RUN [ $(getent group {gid}) ] || groupadd --gid {gid} {gid}

# Create the user by name. --no-log-init guards against a crash with large user
# IDs.
RUN useradd --no-log-init --no-create-home -u {uid} -g {gid} --shell /bin/bash {username}

# The directory is created by root. This sets permissions so that any user can
# access the folder.
RUN mkdir -m 777 {workdir} {creds_dir} {c_home}

ENV HOME={c_home}

WORKDIR {workdir}

USER {uid}:{gid}
""".format_map({
        "base_image": base_image,
        "username": username,
        "uid": uid,
        "gid": gid,
        "workdir": workdir,
        "c_home": container_home(),
        "creds_dir": CREDS_DIR
    })
    dockerfile += _credentials_entries(uid,
                                       gid,
                                       adc_path=adc_path,
                                       credentials_path=credentials_path)

    dockerfile += _dependency_entries(workdir,
                                      uid,
                                      gid,
                                      requirements_path=requirements_path,
                                      conda_env_path=conda_env_path,
                                      setup_extras=setup_extras)

    if inject_notebook.value != 'none':
        install_lab = inject_notebook == NotebookInstall.lab
        dockerfile += _notebook_entries(lab=install_lab,
                                        version=jupyter_version)

    if extra_dirs is not None:
        dockerfile += _extra_dir_entries(workdir, uid, gid, extra_dirs)

    dockerfile += _custom_packages(uid,
                                   gid,
                                   packages=c.apt_packages(
                                       caliban_config, job_mode),
                                   shell=shell)

    if package is not None:
        # The actual entrypoint and final copied code.
        dockerfile += _package_entries(workdir, uid, gid, package)

    return dockerfile
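A quick sketch of direct use: render a minimal CPU Dockerfile (no package, so
no entrypoint is injected) and print it for inspection:

dockerfile = _dockerfile_template(c.JobMode.CPU,
                                  requirements_path='requirements.txt')
print(dockerfile)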
Code Example #11
def container_home():
    """Returns the location of the home directory inside the generated
  container.

  """
    return "/home/{}".format(u.current_user())
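For example (illustrative):

# With current_user() == 'alice':
# container_home() == '/home/alice', matching the user created in the
# Dockerfile template above.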
Code Example #12
def resubmit(args: Dict[str, Any]) -> None:
    '''executes the `caliban resubmit` command'''

    user = current_user()
    xgroup = args.get('xgroup')
    dry_run = args.get('dry_run', False)
    all_jobs = args.get('all_jobs', False)
    project_id = args.get('project_id')
    creds_file = args.get('cloud_key')
    rebuild = True

    if xgroup is None:
        logging.error('you must specify an experiment group for this command')
        return

    with session_scope(get_sql_engine()) as session:
        jobs = _get_resubmit_jobs(
            session=session,
            xgroup=xgroup,
            user=user,
            all_jobs=all_jobs,
        )

        if jobs is None:
            return

        # if we have CAIP or GKE jobs, then we need to have a project_id
        project_id = _get_resubmit_project_id(jobs, project_id, creds_file)

        # show what would be done
        logging.info('the following jobs would be resubmitted:')
        for j in jobs:
            logging.info(_experiment_command_str(j.experiment))
            logging.info(f'  job {_job_str(j)}')

        if dry_run:
            logging.info(
                'to actually resubmit these jobs, run this command again '
                'without the --dry_run flag')
            return

        # confirm with the user before resubmitting anything
        if not user_verify(f'do you wish to resubmit these {len(jobs)} jobs?',
                           False):
            return

        # rebuild all containers first
        if rebuild:
            logging.info('rebuilding containers...')
            image_id_map = _rebuild_containers(jobs, project_id=project_id)
        else:
            image_id_map = {j: j.container for j in jobs}

        # create new job specs
        job_specs = [
            replace_job_spec_image(spec=j.spec, image_id=image_id_map[j])
            for j in jobs
        ]

        # submit jobs, grouped by platform
        for platform in [Platform.CAIP, Platform.GKE, Platform.LOCAL]:
            pspecs = list(filter(lambda x: x.platform == platform, job_specs))
            try:
                submit_job_specs(
                    specs=pspecs,
                    platform=platform,
                    project_id=project_id,
                    credentials_path=creds_file,
                )
            except Exception as e:
                session.commit()  # avoid rollback
                logging.error(f'there was an error submitting some jobs: {e}')

        return
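A hypothetical call with a hand-built args dict, previewing first:

resubmit({
    'xgroup': 'mnist-sweep',
    'dry_run': True,
    'all_jobs': False,  # assumption: only stopped/failed jobs unless set
    'project_id': 'my-gcp-project',
    'cloud_key': None,
})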
Code Example #13
File: metrics.py Project: dylanking42/caliban
def _mlflow_job_name(index: int, user: Optional[str] = None) -> str:
    user = user or u.current_user()
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    return f'{user}-{timestamp}-{index}'
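A worked example (values are illustrative):

# With current_user() == 'alice' at 2020-05-01 12:30:00:
# _mlflow_job_name(3) == 'alice-2020-05-01-12-30-00-3'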