Exemple #1
0
def run_experiments(job_mode: c.JobMode,
                    run_args: Optional[List[str]] = None,
                    script_args: Optional[List[str]] = None,
                    image_id: Optional[str] = None,
                    dry_run: bool = False,
                    experiment_config: Optional[ce.ExpConf] = None,
                    xgroup: Optional[str] = None,
                    **build_image_kwargs) -> None:
    """Runs one local job per experiment, building the image first if needed.

    Unless an image_id is supplied, an image is built from the supplied
    **build_image_kwargs (plus job_mode). A job spec targeting Platform.LOCAL
    is then created for every experiment and the whole batch is handed to
    execute_jobs.

    Keyword args:

    - job_mode: c.JobMode.
    - run_args: extra arguments to supply to `docker run` after our defaults.
    - script_args: extra arguments to supply to the entrypoint. (You can
      override the default container entrypoint by supplying a new one inside
      run_args.)
    - image_id: ID of the image to run. Supplying this will skip an image
      build.
    - dry_run: if True, no actual jobs will be executed and docker won't
      actually build; logging side effects will show the user what will
      happen without dry_run=True.
    - experiment_config: dict of string to list, boolean, string or int. Any
      lists will trigger a cartesian product out with the rest of the config.
      A job will be executed for every combination of parameters in the
      experiment config.
    - xgroup: experiment group, forwarded unchanged to create_experiments.

    Any extra kwargs supplied are passed through to build_image.

    An exception raised by execute_jobs is logged instead of propagated, and
    the session is committed explicitly in that case.
    """
    run_args = [] if run_args is None else run_args
    script_args = [] if script_args is None else script_args
    experiment_config = {} if experiment_config is None else experiment_config

    # Image-build arguments: the pass-through kwargs plus the job mode.
    docker_args = dict(build_image_kwargs)
    docker_args['job_mode'] = job_mode

    # Dry runs get their own engine (presumably in-memory -- confirm against
    # get_mem_engine) so that they do not go through get_sql_engine.
    if dry_run:
        engine = get_mem_engine()
    else:
        engine = get_sql_engine()

    with session_scope(engine) as session:
        # The container spec is generated with the caller-supplied image_id,
        # i.e. before any placeholder or freshly built id is substituted.
        container_spec = generate_container_spec(session, docker_args,
                                                 image_id)

        if image_id is None and dry_run:
            logging.info("Dry run - skipping actual 'docker build'.")
            image_id = 'dry_run_tag'
        elif image_id is None:
            image_id = b.build_image(**docker_args)

        experiments = create_experiments(
            session=session,
            container_spec=container_spec,
            script_args=script_args,
            experiment_config=experiment_config,
            xgroup=xgroup,
        )

        # One local job spec per experiment.
        job_specs = []
        for experiment in experiments:
            spec_dict = _create_job_spec_dict(
                experiment=experiment,
                job_mode=job_mode,
                run_args=run_args,
                image_id=image_id,
            )
            job_specs.append(
                JobSpec.get_or_create(
                    experiment=experiment,
                    spec=spec_dict,
                    platform=Platform.LOCAL,
                ))

        try:
            execute_jobs(job_specs=job_specs, dry_run=dry_run)
        except Exception as err:
            logging.error(f'exception: {err}')
            session.commit()  # commit here, otherwise will be rolled back
Exemple #2
0
def submit_ml_job(
    job_mode: conf.JobMode,
    docker_args: Dict[str, Any],
    region: ct.Region,
    project_id: str,
    credentials_path: Optional[str] = None,
    dry_run: bool = False,
    job_name: Optional[str] = None,
    machine_type: Optional[ct.MachineType] = None,
    gpu_spec: Optional[ct.GPUSpec] = None,
    tpu_spec: Optional[ct.TPUSpec] = None,
    image_tag: Optional[str] = None,
    labels: Optional[Dict[str, str]] = None,
    experiment_config: Optional[ce.ExpConf] = None,
    script_args: Optional[List[str]] = None,
    request_retries: Optional[int] = None,
    xgroup: Optional[str] = None,
) -> None:
  """Top-level entry point for submitting jobs to AI Platform.

  In order, this function:

  - builds an image from the supplied docker_args, in either CPU or GPU mode,
    and pushes it to the Cloud Container Repository of the supplied
    project_id (skipped when image_tag is given)
  - generates a sequence of 'JobSpec' instances, one for every combination in
    the supplied experiment_config
  - batch-submits all jobs to AI Platform

  Keyword args:

  - job_mode: caliban.config.JobMode.
  - docker_args: arguments passed through to
    caliban.docker.build.build_image. The optional 'caliban_config' entry is
    also read here and forwarded to build_job_specs.
  - region: the region to use for AI Platform job submission. Different
    regions support different GPUs.
  - project_id: GCloud project ID for container storage and job submission.
  - credentials_path: explicit path to a service account JSON file, if it
    exists.
  - dry_run: if True, no actual jobs will be submitted and docker won't
    actually build; logging side effects will show the user what will happen
    without dry_run=True.
  - job_name: optional custom name. This is applied as a label to every job,
    and used as a prefix for all jobIds submitted to Cloud. Defaults to
    "caliban_" followed by the current user.
  - machine_type: the machine type to allocate for each job. Must be one
    supported by Cloud. Defaults to conf.DEFAULT_MACHINE_TYPE[job_mode].
  - gpu_spec: if None and job_mode is GPU, defaults to a single P100. Else,
    configures the count and type of GPUs to attach to the machine that runs
    each job.
  - tpu_spec: if None, defaults to no TPU attached. Else, configures the
    count and type of TPUs to attach to the machine that runs each job.
  - image_tag: optional explicit tag of a Container-Registry-available Docker
    container. If supplied, submit_ml_job will skip the docker build and push
    phases and use this image_tag directly.
  - labels: dictionary of KV pairs to apply to each job. User args will also
    be applied as labels, plus a few default labels supplied by Caliban.
  - experiment_config: dict of string to list, boolean, string or int. Any
    lists will trigger a cartesian product out with the rest of the config. A
    job will be submitted for every combination of parameters in the
    experiment config.
  - script_args: extra arguments that will be passed to every job executed,
    in addition to the arguments created by expanding out the experiment
    config.
  - request_retries: the number of times to retry each request if it fails
    for a timeout or a rate limiting request. Defaults to 10.
  - xgroup: experiment group for this submission, if None a new group will
    be created.
  """
  # Fill in defaults for everything the caller left unset.
  script_args = [] if script_args is None else script_args

  if job_name is None:
    job_name = "caliban_{}".format(u.current_user())

  if gpu_spec is None and job_mode == conf.JobMode.GPU:
    gpu_spec = ct.GPUSpec(ct.GPU.P100, 1)

  if machine_type is None:
    machine_type = conf.DEFAULT_MACHINE_TYPE[job_mode]

  experiment_config = {} if experiment_config is None else experiment_config
  labels = {} if labels is None else labels
  request_retries = 10 if request_retries is None else request_retries

  caliban_config = docker_args.get('caliban_config', {})

  # Dry runs get their own engine (presumably in-memory -- confirm against
  # get_mem_engine) so that they do not go through get_sql_engine.
  if dry_run:
    engine = get_mem_engine()
  else:
    engine = get_sql_engine()

  with session_scope(engine) as session:
    # The container spec is generated with the caller-supplied image_tag,
    # before a tag is generated for the missing case.
    container_spec = generate_container_spec(session, docker_args, image_tag)

    if image_tag is None:
      image_tag = generate_image_tag(project_id, docker_args, dry_run=dry_run)

    experiments = create_experiments(
        session=session,
        container_spec=container_spec,
        script_args=script_args,
        experiment_config=experiment_config,
        xgroup=xgroup,
    )

    job_spec_kwargs = dict(
        job_name=job_name,
        image_tag=image_tag,
        region=region,
        machine_type=machine_type,
        experiments=experiments,
        user_labels=labels,
        gpu_spec=gpu_spec,
        tpu_spec=tpu_spec,
        caliban_config=caliban_config,
    )
    specs = build_job_specs(**job_spec_kwargs)

    if dry_run:
      return execute_dry_run(specs)

    try:
      submit_job_specs(
          specs=specs,
          project_id=project_id,
          credentials_path=credentials_path,
          num_specs=len(experiments),
          request_retries=request_retries,
      )
    except Exception as err:
      logging.error(f'exception: {err}')
      logging.error(f'{traceback.format_exc()}')
      session.commit()  # commit here, otherwise will be rolled back

    # The status link is logged whether or not submission raised.
    logging.info("")
    status_url = job_url(project_id, '')
    status_msg = "Visit {} to see the status of all jobs.".format(status_url)
    logging.info(t.green(status_msg))
    logging.info("")
Exemple #3
0
def _job_submit(args: dict, cluster: Cluster) -> None:
  """Submits job(s) to a cluster, one per experiment.

  The accelerator request (gpu/tpu) is validated against the cluster first;
  any validation failure aborts the submission by returning early. Depending
  on the arguments, the resulting job specs are then logged (dry run),
  exported to a file, or submitted to the cluster.

  Args:
  args: argument dictionary. The keys 'dry_run', 'module' and
    'nonpreemptible' are required; 'docker_run_args', 'name', 'gpu_spec',
    'tpu_spec', 'tpu_driver', 'nonpreemptible_tpu', 'min_cpu', 'min_mem',
    'experiment_config', 'xgroup', 'image_tag', 'export' and 'label' are
    read if present.
  cluster: cluster instance

  Errors are reported through logging; an exception raised while submitting a
  job is logged, the session is committed, and the remaining jobs are not
  submitted.
  """

  script_args = conf.extract_script_args(args)
  job_mode = cli.resolve_job_mode(args)
  docker_args = cli.generate_docker_args(job_mode, args)
  # NOTE(review): docker_run_args is read but not used anywhere below --
  # confirm whether it should be forwarded to the job specs.
  docker_run_args = args.get('docker_run_args', []) or []
  dry_run = args['dry_run']
  package = args['module']
  job_name = _generate_job_name(args.get('name'))
  gpu_spec = args.get('gpu_spec')
  preemptible = not args['nonpreemptible']
  min_cpu = args.get('min_cpu')
  min_mem = args.get('min_mem')
  experiment_config = args.get('experiment_config') or [{}]
  xgroup = args.get('xgroup')
  image_tag = args.get('image_tag')
  export = args.get('export')

  labels = args.get('label')
  if labels is not None:
    labels = dict(cu.sanitize_labels(labels))

  # Arguments to internally build the image required to submit to Cloud.
  docker_m = {'job_mode': job_mode, 'package': package, **docker_args}

  # --------------------------------------------------------------------------
  # validate gpu spec
  if job_mode == conf.JobMode.GPU and gpu_spec is None:
    gpu_spec = k.DEFAULT_GPU_SPEC

  if not cluster.validate_gpu_spec(gpu_spec):
    return

  # --------------------------------------------------------------------------
  # validate tpu spec and driver
  tpu_spec = args.get('tpu_spec')
  preemptible_tpu = not args.get('nonpreemptible_tpu')
  tpu_driver = args.get('tpu_driver')

  if tpu_spec is not None:
    available_tpu = cluster.get_tpu_types()
    if available_tpu is None:
      logging.error('error getting valid tpu types for cluster')
      return

    if tpu_spec not in available_tpu:
      logging.error('invalid tpu spec, cluster supports:')
      for tpu in available_tpu:
        logging.info('{}x{}'.format(tpu.count, tpu.tpu.name))
      return

    if not cluster.validate_tpu_driver(tpu_driver):
      logging.error('error: unsupported tpu driver {}'.format(tpu_driver))
      logging.info('supported tpu drivers for this cluster:')
      for driver in cluster.get_tpu_drivers():
        logging.info('  {}'.format(driver))
      return

  # Resource minimums fall back to defaults that differ between cpu-only and
  # accelerated jobs.
  if tpu_spec is None and gpu_spec is None:  # cpu-only job
    min_cpu = min_cpu or k.DEFAULT_MIN_CPU_CPU
    min_mem = min_mem or k.DEFAULT_MIN_MEM_CPU
  else:  # gpu/tpu-accelerated job
    min_cpu = min_cpu or k.DEFAULT_MIN_CPU_ACCEL
    min_mem = min_mem or k.DEFAULT_MIN_MEM_ACCEL

  # convert accelerator spec
  accel_spec = Cluster.convert_accel_spec(gpu_spec, tpu_spec)
  if accel_spec is None:
    return

  accel, accel_count = accel_spec

  # --------------------------------------------------------------------------
  # Dry runs get their own engine (presumably in-memory -- confirm against
  # get_mem_engine) so that they do not go through get_sql_engine.
  engine = get_mem_engine() if dry_run else get_sql_engine()

  with session_scope(engine) as session:
    # The container spec is generated with the caller-supplied image_tag,
    # before a tag is generated for the missing case.
    container_spec = generate_container_spec(session, docker_m, image_tag)

    if image_tag is None:
      image_tag = generate_image_tag(cluster.project_id, docker_m, dry_run)

    experiments = create_experiments(
        session=session,
        container_spec=container_spec,
        script_args=script_args,
        experiment_config=experiment_config,
        xgroup=xgroup,
    )

    specs = list(
        cluster.create_simple_experiment_job_specs(
            name=util.sanitize_job_name(job_name),
            image=image_tag,
            min_cpu=min_cpu,
            min_mem=min_mem,
            experiments=experiments,
            args=script_args,
            accelerator=accel,
            accelerator_count=accel_count,
            preemptible=preemptible,
            preemptible_tpu=preemptible_tpu,
            tpu_driver=tpu_driver))

    # just a dry run
    if dry_run:
      logging.info('jobs that would be submitted:')
      for spec in specs:
        logging.info(f'\n{json.dumps(spec.spec, indent=2)}')
      return

    # export jobs to file
    if export is not None:
      exported = _export_jobs(
          export,
          cluster.create_v1jobs(specs, job_name, labels),
      )
      if not exported:
        # Reported through logging, like every other error in this function.
        logging.error('error exporting jobs to {}'.format(export))
      return

    for spec in specs:
      try:
        cluster.submit_job(job_spec=spec, name=job_name, labels=labels)
      except Exception as e:
        logging.error(f'exception: {e}')
        session.commit()  # commit here, otherwise will be rolled back
        return

  # --------------------------------------------------------------------------
  logging.info(f'jobs submitted, visit {cluster.dashboard_url()} to monitor')
Exemple #4
0
from sqlalchemy.engine.base import Engine

import pytest  # type: ignore
from caliban.history.types import (ContainerSpec, Experiment, ExperimentGroup,
                                   Job, JobSpec, Platform)
from caliban.history.util import get_mem_engine, session_scope
from caliban.util import current_user

# https://mypy.readthedocs.io/en/latest/running_mypy.html#missing-imports

# we create and exit session scopes here to test persistence


# ----------------------------------------------------------------------------
@pytest.mark.parametrize('engine', [get_mem_engine()])
def test_container_spec(engine: Engine):

  spec = {
      'nogpu': True,
      'cloud_key': '/path/to/key.json',
      'image_tag': None,
      'dir': ['/extra/path0', '/extra/path2'],
      'base_dir': '/home/foo',
      'module': 'train.py'
  }

  def validate_spec(session) -> ContainerSpec:
    s = session.query(ContainerSpec).all()
    assert len(s) == 1
    s = s[0]