Example #1
def _print_flags(expanded: List[c.Experiment]) -> None:
  """Print the flags associated with each experiment in the supplied expansion
  list.

  """
  for m in expanded:
    flags = c.experiment_to_args(m)
    print(' '.join(flags))
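
Across these examples the helper is called both with a whole experiment (as here) and with (kwargs, args) pairs (Examples #2 onward). The sketch below illustrates the latter shape; it is an illustration of the flag-flattening idea, not caliban's actual implementation, and the exact flag format is an assumption.

from typing import Any, Dict, List, Optional

def experiment_to_args_sketch(kwargs: Dict[str, Any],
                              base: Optional[List[str]] = None) -> List[str]:
    # Hypothetical stand-in: flatten {'lr': 0.01} into ['--lr', '0.01'].
    args = list(base or [])
    for k, v in kwargs.items():
        if isinstance(v, bool):
            if v:
                args.append(f'--{k}')  # a true boolean becomes a bare flag
        else:
            args.extend([f'--{k}', str(v)])
    return args

# experiment_to_args_sketch({'lr': 0.01, 'epochs': 10})
# => ['--lr', '0.01', '--epochs', '10']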
Example #2
def log_job_spec_instance(job_spec: JobSpec, i: int) -> JobSpec:
    """Prints logging as a side effect for the supplied sequence of job specs
  generated from an experiment definition; returns the input job spec.

  """
    args = c.experiment_to_args(job_spec.experiment.kwargs,
                                job_spec.experiment.args)
    logging.info("")
    logging.info("Job {} - Experiment args: {}".format(i, t.yellow(str(args))))
    return job_spec
Example #3
def _create_job_spec_dict(
    experiment: Experiment,
    job_mode: c.JobMode,
    image_id: str,
    run_args: Optional[List[str]] = None,
) -> Dict[str, Any]:
    '''creates a job spec dictionary for a local job'''
    base_cmd = _run_cmd(job_mode, run_args) + [image_id]
    command = base_cmd + c.experiment_to_args(experiment.kwargs,
                                              experiment.args)
    return {'command': command, 'container': image_id}
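
For reference, a hedged sketch of what the `_run_cmd` helper assumed above might return; the real helper in caliban may differ, and the GPU-mode handling here is an assumption.

from typing import List, Optional

def _run_cmd_sketch(job_mode: str,
                    run_args: Optional[List[str]] = None) -> List[str]:
    # Assumed behavior: build the base `docker run` invocation and add the
    # nvidia runtime for GPU jobs. 'GPU' stands in for c.JobMode.GPU.
    base = ['docker', 'run']
    if job_mode == 'GPU':
        base += ['--runtime', 'nvidia']
    return base + list(run_args or [])

# _run_cmd_sketch('GPU', ['--rm'])
# => ['docker', 'run', '--runtime', 'nvidia', '--rm']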
Example #4
def _create_job_spec_dict(
    experiment: Experiment,
    job_mode: c.JobMode,
    image_id: str,
    run_args: Optional[List[str]] = None,
) -> Dict[str, Any]:
    '''creates a job spec dictionary for a local job'''

    # Without the unbuffered environment variable, stderr and stdout won't be
    # emitted in the proper order from inside the container.
    terminal_cmds = ["-e" "PYTHONUNBUFFERED=1"] + window_size_env_cmds()

    base_cmd = _run_cmd(job_mode, run_args) + terminal_cmds + [image_id]
    command = base_cmd + c.experiment_to_args(experiment.kwargs,
                                              experiment.args)
    return {'command': command, 'container': image_id}
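
To make the assembly concrete, here is how the final command list might compose; every concrete value below is hypothetical, and window_size_env_cmds() is assumed to inject COLUMNS/LINES-style environment flags.

base_cmd = ['docker', 'run']                   # assumed _run_cmd output
terminal_cmds = ['-e', 'PYTHONUNBUFFERED=1',   # plus assumed window-size flags
                 '-e', 'COLUMNS=80', '-e', 'LINES=24']
image_id = 'abc123'                            # hypothetical image id
experiment_args = ['--lr', '0.01']             # assumed flag expansion

command = base_cmd + terminal_cmds + [image_id] + experiment_args
# => ['docker', 'run', '-e', 'PYTHONUNBUFFERED=1', '-e', 'COLUMNS=80',
#     '-e', 'LINES=24', 'abc123', '--lr', '0.01']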
Example #5
def local_callback(idx: int, job: Job) -> None:
    """Provides logging feedback for jobs run locally. If the return code is 0,
  logs success; else, logs the failure as an error and logs the script args
  that provided the failure.

  """
    if job.status == JobStatus.SUCCEEDED:
        logging.info(t.green(f'Job {idx} succeeded!'))
    else:
        logging.error(
            t.red(
                f'Job {idx} failed with return code {job.details["ret_code"]}.'
            ))
        args = c.experiment_to_args(job.spec.experiment.kwargs,
                                    job.spec.experiment.args)
        logging.error(t.red(f'Failing args for job {idx}: {args}'))
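
A small self-contained demo of the shape the failure branch above expects, using simple stand-ins for caliban's Job/JobStatus types (the real classes surely differ).

from enum import Enum
from types import SimpleNamespace

class JobStatus(Enum):  # stand-in for caliban's JobStatus
    SUCCEEDED = 'succeeded'
    FAILED = 'failed'

failed_job = SimpleNamespace(
    status=JobStatus.FAILED,
    details={'ret_code': 1},
    spec=SimpleNamespace(experiment=SimpleNamespace(kwargs={'lr': 0.5},
                                                    args=[])))
# local_callback(1, failed_job) would take the error branch, logging the
# return code and the failing args for job 1.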
Example #6
def _job_specs(
    job_name: str,
    training_input: Dict[str, Any],
    labels: Dict[str, str],
    experiments: Iterable[ht.Experiment],
) -> Iterable[ht.JobSpec]:
    """Returns a generator that yields a JobSpec instance for every possible
  combination of parameters in the supplied experiment config.

  All other arguments parametrize every JobSpec that's generated; labels,
  arguments, and job id change for each JobSpec.

  This is lower-level than build_job_specs below.

  """
    for idx, m in enumerate(experiments, 1):
        args = conf.experiment_to_args(m.kwargs, m.args)
        yield _job_spec(job_name=job_name,
                        idx=idx,
                        training_input={
                            **training_input, "args": args
                        },
                        labels=labels,
                        experiment=m)
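
A minimal sketch of the kind of expansion that could produce the `experiments` iterable above, taking the cartesian product of list-valued config entries. This illustrates the technique only; it is not caliban's actual expansion code.

import itertools
from typing import Any, Dict, Iterable

def expand_config_sketch(config: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
    # List-valued entries fan out; scalar entries are held fixed.
    keys = sorted(config)
    values = [config[k] if isinstance(config[k], list) else [config[k]]
              for k in keys]
    for combo in itertools.product(*values):
        yield dict(zip(keys, combo))

# list(expand_config_sketch({'lr': [0.01, 0.1], 'epochs': 10}))
# => [{'epochs': 10, 'lr': 0.01}, {'epochs': 10, 'lr': 0.1}]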
Example #7
    def create_simple_job_spec(
            self,
            experiment: Experiment,
            name: str,
            image: str,
            min_cpu: int,
            min_mem: int,
            command: Optional[List[str]] = None,
            env: Dict[str, str] = {},
            accelerator: Optional[Accelerator] = None,
            accelerator_count: int = 1,
            namespace: str = k.DEFAULT_NAMESPACE,
            machine_type: Optional[MachineType] = None,
            preemptible: bool = True,
            preemptible_tpu: bool = True,
            tpu_driver: str = k.DEFAULT_TPU_DRIVER) -> Optional[JobSpec]:
        """creates a simple kubernetes job (1 container, 1 pod) JobSpec for this cluster

    Args:
    experiment: experiment whose kwargs/args are converted into container args
    name: job name
    image: container image url (gcr.io/...)
    min_cpu: minimum cpu needed, in milli-cpu
    min_mem: minimum memory needed, in MB
    command: command to execute, None = container entrypoint
    env: environment vars for container
    accelerator: accelerator type, None=cpu only
    accelerator_count: accelerator count
    namespace: kubernetes namespace
    machine_type: machine type, None=default for mode (cpu/gpu)
    preemptible: use preemptible instance
    preemptible_tpu: use preemptible tpus
    tpu_driver: tpu driver to use

    Returns:
    JobSpec on success, None otherwise
    """

        args = conf.experiment_to_args(experiment.kwargs, experiment.args)

        # ------------------------------------------------------------------------
        # container

        # tpu/gpu resources
        container_resources = V1ResourceRequirements(
            requests=Cluster.container_requests(min_cpu, min_mem),
            limits=Cluster.container_limits(
                accelerator,
                accelerator_count,
                preemptible_tpu,
            ),
        )

        container_env = [V1EnvVar(name=k, value=v) for k, v in env.items()]

        # this is a simple 1-container, 1-pod job, so we just name the
        # container the same thing (minus the generated suffix) as the job itself
        container = V1Container(
            name=name,
            image=image,
            command=command,
            args=args,
            resources=container_resources,
            env=container_env,
            image_pull_policy='Always',
        )

        # ------------------------------------------------------------------------
        # template

        # todo: should we support anything other than a 'never' restart policy?
        # see this for discussion
        # https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#pod-backoff-failure-policy

        tolerations = Cluster.tolerations(preemptible=preemptible)

        # backoff count plus 'OnFailure' may be correct here
        template_spec = V1PodSpec(
            restart_policy='Never',
            containers=[container],
            tolerations=tolerations,
            node_selector=Cluster.node_selector(
                preemptible=preemptible,
                machine_type=machine_type,
                accelerator=accelerator,
            ),
            host_ipc=True,
        )

        template = V1PodTemplateSpec(
            metadata=Cluster.template_metadata(
                accelerator=accelerator,
                tpu_driver=tpu_driver,
            ),
            spec=template_spec,
        )

        # ------------------------------------------------------------------------
        # job
        job_spec = V1JobSpec(template=template, backoff_limit=4)

        return JobSpec.get_or_create(
            experiment=experiment,
            spec=ApiClient().sanitize_for_serialization(job_spec),
            platform=Platform.GKE,
        )
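
A hypothetical call site for the method above; the cluster object, image URL, and resource numbers are assumptions for illustration only.

import logging

# `cluster` is assumed to be an already-connected Cluster instance and
# `experiment` an Experiment produced by config expansion.
spec = cluster.create_simple_job_spec(
    experiment=experiment,
    name='mnist-train',
    image='gcr.io/my-project/mnist:latest',
    min_cpu=1500,                  # milli-cpu
    min_mem=4096,                  # MB
    command=['python', 'train.py'],
    accelerator=None,              # CPU-only job
)
if spec is None:
    logging.error('job spec creation failed')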