Example #1
0
def _create_job_spec_dict(
    experiment: Experiment,
    job_mode: c.JobMode,
    image_id: str,
    index: int,
    caliban_config: Dict[str, Any],
    run_args: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Builds the job spec dictionary describing one local job.

    The result maps 'command' to the full command list (run command,
    terminal environment flags, image id, launcher args, experiment args)
    and 'container' to the supplied image id.

    """
    # PYTHONUNBUFFERED is required so that stderr and stdout come out of the
    # container in the order in which they were written.
    env_flags = ["-e", "PYTHONUNBUFFERED=1"] + window_size_env_cmds()
    docker_prefix = _run_cmd(job_mode, run_args) + env_flags + [image_id]

    group_name = experiment.xgroup.name
    gpu_flag = str(job_mode == c.JobMode.GPU).lower()
    run_tags = {
        um.GPU_ENABLED_TAG: gpu_flag,
        um.TPU_ENABLED_TAG: 'false',
        um.DOCKER_IMAGE_TAG: image_id,
        um.PLATFORM_TAG: Platform.LOCAL.value,
    }
    launcher_flags = um.mlflow_args(
        caliban_config=caliban_config,
        experiment_name=group_name,
        index=index,
        tags=run_tags,
    )

    experiment_flags = ce.experiment_to_args(experiment.kwargs,
                                             experiment.args)

    # The experiment's own arguments have to come last; otherwise the
    # launcher cannot pass them through.
    full_command = docker_prefix + launcher_flags + experiment_flags

    return {'command': full_command, 'container': image_id}
Example #2
0
def _print_flags(expanded: List[ce.Experiment]) -> None:
    """Prints one line per experiment in the supplied expansion list; each
    line holds that experiment's flags joined by single spaces.

    """
    for experiment in expanded:
        flag_line = ' '.join(ce.experiment_to_args(experiment))
        print(flag_line)
Example #3
0
def log_job_spec_instance(job_spec: JobSpec, i: int) -> JobSpec:
    """Logs the experiment arguments of the supplied job spec as a side
    effect, labelled with the job number `i`, and hands the job spec back
    unchanged.

    """
    experiment = job_spec.experiment
    experiment_args = ce.experiment_to_args(experiment.kwargs, experiment.args)

    # Empty record first, then the job line itself.
    logging.info("")
    message = "Job {} - Experiment args: {}".format(
        i, t.yellow(str(experiment_args)))
    logging.info(message)

    return job_spec
Example #4
0
def local_callback(idx: int, job: Job) -> None:
    """Logging feedback for a locally executed job.

    A job whose status is SUCCEEDED gets a success message. Any other status
    is logged as an error showing the job's return code, followed by the
    script args of the failing job.

    """
    if job.status == JobStatus.SUCCEEDED:
        logging.info(t.green(f'Job {idx} succeeded!'))
        return

    failure_msg = f'Job {idx} failed with return code {job.details["ret_code"]}.'
    logging.error(t.red(failure_msg))

    experiment = job.spec.experiment
    args = ce.experiment_to_args(experiment.kwargs, experiment.args)
    logging.error(t.red(f'Failing args for job {idx}: {args}'))
Example #5
0
def _create_job_spec_dict(
    experiment: Experiment,
    job_mode: c.JobMode,
    image_id: str,
    run_args: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Builds the job spec dictionary describing one local job.

    The result maps 'command' to the full command list (run command,
    terminal environment flags, image id, experiment args) and 'container'
    to the supplied image id.

    """
    # PYTHONUNBUFFERED is required so that stderr and stdout come out of the
    # container in the order in which they were written.
    env_flags = ["-e", "PYTHONUNBUFFERED=1"] + window_size_env_cmds()
    docker_prefix = _run_cmd(job_mode, run_args) + env_flags + [image_id]

    experiment_flags = ce.experiment_to_args(experiment.kwargs,
                                             experiment.args)

    return {
        'command': docker_prefix + experiment_flags,
        'container': image_id,
    }
Example #6
0
def _job_specs(
    job_name: str,
    training_input: Dict[str, Any],
    labels: Dict[str, str],
    experiments: Iterable[ht.Experiment],
    caliban_config: Optional[Dict[str, Any]] = None,
) -> Iterable[ht.JobSpec]:
  """Generator yielding one JobSpec per entry in the supplied experiments.

  job_name, training_input and labels are shared by every JobSpec produced;
  the job index (counted from 1) and the "args" entry of the training input
  are computed separately for each experiment.

  This is lower-level than build_job_specs below.

  """
  caliban_config = caliban_config or {}

  for job_index, experiment in enumerate(experiments, 1):
    run_tags = {
        um.PLATFORM_TAG: ht.Platform.CAIP.value,
        **labels,
    }
    launcher_flags = um.mlflow_args(
        caliban_config=caliban_config,
        experiment_name=experiment.xgroup.name,
        index=job_index,
        tags=run_tags,
    )

    experiment_flags = ce.experiment_to_args(experiment.kwargs,
                                             experiment.args)

    # The experiment's own arguments have to come last; otherwise the
    # launcher cannot pass them through.
    job_input = {**training_input, "args": launcher_flags + experiment_flags}

    yield _job_spec(job_name=job_name,
                    idx=job_index,
                    training_input=job_input,
                    labels=labels,
                    experiment=experiment)
Example #7
0
def _job_specs(
    job_name: str,
    training_input: Dict[str, Any],
    labels: Dict[str, str],
    experiments: Iterable[ht.Experiment],
) -> Iterable[ht.JobSpec]:
    """Generator yielding one JobSpec per entry in the supplied experiments.

    job_name, training_input and labels are shared by every JobSpec
    produced; the job index (counted from 1) and the "args" entry of the
    training input are computed separately for each experiment.

    This is lower-level than build_job_specs below.

    """
    for job_index, experiment in enumerate(experiments, 1):
        experiment_flags = ce.experiment_to_args(experiment.kwargs,
                                                 experiment.args)
        job_input = {**training_input, "args": experiment_flags}

        yield _job_spec(job_name=job_name,
                        idx=job_index,
                        training_input=job_input,
                        labels=labels,
                        experiment=experiment)
Example #8
0
  def create_simple_job_spec(
      self,
      experiment: Experiment,
      name: str,
      image: str,
      min_cpu: int,
      min_mem: int,
      command: Optional[List[str]] = None,
      env: Optional[Dict[str, str]] = None,
      accelerator: Optional[Accelerator] = None,
      accelerator_count: int = 1,
      namespace: str = k.DEFAULT_NAMESPACE,
      machine_type: Optional[MachineType] = None,
      preemptible: bool = True,
      preemptible_tpu: bool = True,
      tpu_driver: str = k.DEFAULT_TPU_DRIVER) -> Optional[JobSpec]:
    """creates a simple kubernetes job (1 container, 1 pod) JobSpec for this cluster

    The container arguments are not passed in directly; they are derived
    from the experiment's kwargs and args via ce.experiment_to_args.

    Args:
    experiment: experiment whose kwargs/args become the container args, and
      which is associated with the resulting JobSpec
    name: job name (also used as the container name)
    image: container image url (gcr.io/...)
    min_cpu: minimum cpu needed, in milli-cpu
    min_mem: minimum memory needed, in MB
    command: command to execute, None = container entrypoint
    env: environment vars for container, None = no extra variables
    accelerator: accelerator type, None=cpu only
    accelerator_count: accelerator count
    namespace: kubernetes namespace (not referenced in this method's body)
    machine_type: machine type, None=default for mode (cpu/gpu)
    preemptible: use preemptible instance
    preemptible_tpu: use preemptible tpus
    tpu_driver: tpu driver to use

    Returns:
    The result of JobSpec.get_or_create for the serialized kubernetes job
    spec on the GKE platform.
    """

    # A None default avoids sharing one mutable dict across all calls.
    env = {} if env is None else env

    args = ce.experiment_to_args(experiment.kwargs, experiment.args)

    # ------------------------------------------------------------------------
    # container

    # tpu/gpu resources
    container_resources = V1ResourceRequirements(
        requests=Cluster.container_requests(min_cpu, min_mem),
        limits=Cluster.container_limits(
            accelerator,
            accelerator_count,
            preemptible_tpu,
        ),
    )

    # Loop variables are deliberately not named `k`, which is the module alias
    # used for the defaults in this method's signature.
    container_env = [
        V1EnvVar(name=key, value=value) for key, value in env.items()
    ]

    # this is a simple 1-container, 1-pod job, so we just name the
    # container the same thing (minus the generated suffix) as the job itself
    container = V1Container(
        name=name,
        image=image,
        command=command,
        args=args,
        resources=container_resources,
        env=container_env,
        image_pull_policy='Always',
    )

    # ------------------------------------------------------------------------
    # template

    # todo: should we support anything other than a 'never' restart policy?
    # see this for discussion
    # https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#pod-backoff-failure-policy

    tolerations = Cluster.tolerations(preemptible=preemptible)

    # backoff count plus 'OnFailure' may be correct here
    template_spec = V1PodSpec(
        restart_policy='Never',
        containers=[container],
        tolerations=tolerations,
        node_selector=Cluster.node_selector(
            preemptible=preemptible,
            machine_type=machine_type,
            accelerator=accelerator,
        ),
        host_ipc=True,
    )

    template = V1PodTemplateSpec(
        metadata=Cluster.template_metadata(
            accelerator=accelerator,
            tpu_driver=tpu_driver,
        ),
        spec=template_spec,
    )

    # ------------------------------------------------------------------------
    # job
    job_spec = V1JobSpec(template=template, backoff_limit=4)

    # The kubernetes model object is converted to plain serializable data
    # before being handed to JobSpec.get_or_create.
    return JobSpec.get_or_create(
        experiment=experiment,
        spec=ApiClient().sanitize_for_serialization(job_spec),
        platform=Platform.GKE,
    )