def _print_flags(expanded: List[c.Experiment]) -> None:
  """Print the flags associated with each experiment in the supplied
  expansion list."""
  for m in expanded:
    flags = c.experiment_to_args(m)
    print(' '.join(flags))
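
# For reference, a minimal, hypothetical stand-in for c.experiment_to_args,
# sketching the expansion _print_flags relies on; this is an assumption about
# its behavior (kwargs become '--key value' pairs, positional args pass
# through), not the real implementation.
def _example_experiment_to_args(kwargs: Dict[str, Any],
                                args: List[str]) -> List[str]:
  flags: List[str] = []
  for key, value in kwargs.items():
    # each kwarg becomes a '--key value' pair
    flags.extend([f'--{key}', str(value)])
  # positional args are appended verbatim
  return flags + args

# _example_experiment_to_args({'lr': 0.01, 'batch_size': 64}, ['--verbose'])
# => ['--lr', '0.01', '--batch_size', '64', '--verbose']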
def log_job_spec_instance(job_spec: JobSpec, i: int) -> JobSpec:
  """Logs the experiment args for the supplied job spec as a side effect, then
  returns the input job spec unchanged."""
  args = c.experiment_to_args(job_spec.experiment.kwargs,
                              job_spec.experiment.args)
  logging.info("")
  logging.info("Job {} - Experiment args: {}".format(i, t.yellow(str(args))))
  return job_spec
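
# Worked example (hypothetical values): for i=1 and an experiment with
# kwargs={'lr': 0.01} and no positional args, the function above would emit a
# blank info line followed by one shaped like (yellow via t.yellow):
#
#   Job 1 - Experiment args: ['--lr', '0.01']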
def _create_job_spec_dict(
    experiment: Experiment,
    job_mode: c.JobMode,
    image_id: str,
    run_args: Optional[List[str]] = None,
) -> Dict[str, Any]:
  """Creates a job spec dictionary for a local job."""
  base_cmd = _run_cmd(job_mode, run_args) + [image_id]
  command = base_cmd + c.experiment_to_args(experiment.kwargs, experiment.args)
  return {'command': command, 'container': image_id}
def _create_job_spec_dict(
    experiment: Experiment,
    job_mode: c.JobMode,
    image_id: str,
    run_args: Optional[List[str]] = None,
) -> Dict[str, Any]:
  """Creates a job spec dictionary for a local job."""
  # Without the unbuffered environment variable, stderr and stdout won't be
  # emitted in the proper order from inside the container.
  terminal_cmds = ["-e", "PYTHONUNBUFFERED=1"] + window_size_env_cmds()
  base_cmd = _run_cmd(job_mode, run_args) + terminal_cmds + [image_id]
  command = base_cmd + c.experiment_to_args(experiment.kwargs, experiment.args)
  return {'command': command, 'container': image_id}
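
# Hedged sketch of the returned dict: assuming _run_cmd resolves to
# ['docker', 'run'] for a CPU job and window_size_env_cmds() returns []
# (both assumptions), an experiment with kwargs={'lr': 0.01} and
# image_id='abc123' would produce:
_example_local_spec = {
    'command': [
        'docker', 'run',             # _run_cmd(job_mode, run_args)
        '-e', 'PYTHONUNBUFFERED=1',  # terminal_cmds
        'abc123',                    # image_id
        '--lr', '0.01',              # c.experiment_to_args(...)
    ],
    'container': 'abc123',
}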
def local_callback(idx: int, job: Job) -> None:
  """Provides logging feedback for jobs run locally.

  If the return code is 0, logs success; otherwise, logs the failure as an
  error along with the script args that caused it.
  """
  if job.status == JobStatus.SUCCEEDED:
    logging.info(t.green(f'Job {idx} succeeded!'))
  else:
    logging.error(
        t.red(f'Job {idx} failed with return code {job.details["ret_code"]}.'))
    args = c.experiment_to_args(job.spec.experiment.kwargs,
                                job.spec.experiment.args)
    logging.error(t.red(f'Failing args for job {idx}: {args}'))
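
# Minimal usage sketch; the completed jobs list is hypothetical, since the
# execution loop that produces it lives elsewhere in the codebase.
def _example_report(jobs: List[Job]) -> None:
  for idx, job in enumerate(jobs, 1):
    local_callback(idx, job)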
def _job_specs(
    job_name: str,
    training_input: Dict[str, Any],
    labels: Dict[str, str],
    experiments: Iterable[ht.Experiment],
) -> Iterable[ht.JobSpec]:
  """Returns a generator that yields a JobSpec instance for every possible
  combination of parameters in the supplied experiment config.

  All other arguments parametrize every JobSpec that's generated; labels,
  arguments and job id change for each JobSpec.

  This is lower-level than build_job_specs below.
  """
  for idx, m in enumerate(experiments, 1):
    args = conf.experiment_to_args(m.kwargs, m.args)
    yield _job_spec(job_name=job_name,
                    idx=idx,
                    training_input={
                        **training_input,
                        "args": args
                    },
                    labels=labels,
                    experiment=m)
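
# Hedged usage sketch; job_name, training_input and labels are hypothetical
# values. For experiments expanded from e.g. {'lr': [0.01, 0.1]}, the two
# yielded specs differ only in their 'args' entry (['--lr', '0.01'] vs
# ['--lr', '0.1']) and the job index baked into the id.
def _example_specs(experiments: Iterable[ht.Experiment]) -> List[ht.JobSpec]:
  return list(
      _job_specs(job_name='demo',
                 training_input={'scaleTier': 'BASIC'},
                 labels={'env': 'demo'},
                 experiments=experiments))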
def create_simple_job_spec(
    self,
    experiment: Experiment,
    name: str,
    image: str,
    min_cpu: int,
    min_mem: int,
    command: Optional[List[str]] = None,
    env: Dict[str, str] = {},
    accelerator: Optional[Accelerator] = None,
    accelerator_count: int = 1,
    namespace: str = k.DEFAULT_NAMESPACE,
    machine_type: Optional[MachineType] = None,
    preemptible: bool = True,
    preemptible_tpu: bool = True,
    tpu_driver: str = k.DEFAULT_TPU_DRIVER) -> Optional[JobSpec]:
  """Creates a simple kubernetes job (1 container, 1 pod) JobSpec for this
  cluster.

  Args:
    experiment: experiment whose kwargs/args become the container args
    name: job name
    image: container image url (gcr.io/...)
    min_cpu: minimum cpu needed, in milli-cpu
    min_mem: minimum memory needed, in MB
    command: command to execute, None = container entrypoint
    env: environment vars for container
    accelerator: accelerator type, None=cpu only
    accelerator_count: accelerator count
    namespace: kubernetes namespace
    machine_type: machine type, None=default for mode (cpu/gpu)
    preemptible: use preemptible instance
    preemptible_tpu: use preemptible tpus
    tpu_driver: tpu driver to use

  Returns:
    JobSpec on success, None otherwise
  """
  args = conf.experiment_to_args(experiment.kwargs, experiment.args)

  # --------------------------------------------------------------------------
  # container

  # tpu/gpu resources
  container_resources = V1ResourceRequirements(
      requests=Cluster.container_requests(min_cpu, min_mem),
      limits=Cluster.container_limits(
          accelerator,
          accelerator_count,
          preemptible_tpu,
      ),
  )

  container_env = [V1EnvVar(name=k, value=v) for k, v in env.items()]

  # this is a simple 1-container, 1-pod job, so we just name the container the
  # same thing (minus the generated suffix) as the job itself
  container = V1Container(
      name=name,
      image=image,
      command=command,
      args=args,
      resources=container_resources,
      env=container_env,
      image_pull_policy='Always',
  )

  # --------------------------------------------------------------------------
  # template

  # todo: should we support anything other than a 'never' restart policy?
  # see this for discussion:
  # https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#pod-backoff-failure-policy
  tolerations = Cluster.tolerations(preemptible=preemptible)

  # backoff count plus 'OnFailure' may be correct here
  template_spec = V1PodSpec(
      restart_policy='Never',
      containers=[container],
      tolerations=tolerations,
      node_selector=Cluster.node_selector(
          preemptible=preemptible,
          machine_type=machine_type,
          accelerator=accelerator,
      ),
      host_ipc=True,
  )

  template = V1PodTemplateSpec(
      metadata=Cluster.template_metadata(
          accelerator=accelerator,
          tpu_driver=tpu_driver,
      ),
      spec=template_spec,
  )

  # --------------------------------------------------------------------------
  # job
  job_spec = V1JobSpec(template=template, backoff_limit=4)

  return JobSpec.get_or_create(
      experiment=experiment,
      spec=ApiClient().sanitize_for_serialization(job_spec),
      platform=Platform.GKE,
  )
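
# Hedged usage sketch: the cluster instance, experiment, and image URL are
# hypothetical, for illustration only.
def _example_simple_spec(cluster: 'Cluster',
                         experiment: Experiment) -> Optional[JobSpec]:
  return cluster.create_simple_job_spec(
      experiment=experiment,
      name='caliban-test',
      image='gcr.io/my-project/my-image:latest',
      min_cpu=1500,      # 1.5 vCPU, in milli-cpu
      min_mem=4096,      # 4096 MB
      accelerator=None,  # cpu-only job
  )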