Example #1
def _create_job_spec_dict(
    experiment: Experiment,
    job_mode: c.JobMode,
    image_id: str,
    index: int,
    caliban_config: Dict[str, Any],
    run_args: Optional[List[str]] = None,
) -> Dict[str, Any]:
    '''creates a job spec dictionary for a local job'''

    # Without the unbuffered environment variable, stderr and stdout won't be
    # emitted in the proper order from inside the container.
    terminal_cmds = ["-e", "PYTHONUNBUFFERED=1"] + window_size_env_cmds()

    base_cmd = _run_cmd(job_mode, run_args) + terminal_cmds + [image_id]

    launcher_args = um.mlflow_args(
        caliban_config=caliban_config,
        experiment_name=experiment.xgroup.name,
        index=index,
        tags={
            um.GPU_ENABLED_TAG: str(job_mode == c.JobMode.GPU).lower(),
            um.TPU_ENABLED_TAG: 'false',
            um.DOCKER_IMAGE_TAG: image_id,
            um.PLATFORM_TAG: Platform.LOCAL.value,
        },
    )

    cmd_args = ce.experiment_to_args(experiment.kwargs, experiment.args)

    # cmd args *must* be last in order for the launcher to pass them through
    command = base_cmd + launcher_args + cmd_args

    return {'command': command, 'container': image_id}
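For reference, here is the shape of the returned spec with illustrative placeholder values (the image id, flags, and experiment args below are made up; _run_cmd and window_size_env_cmds come from the surrounding module):

# Hypothetical result of _create_job_spec_dict; all values are illustrative.
example_spec = {
    'command': [
        # _run_cmd(job_mode, run_args): e.g. a docker run invocation
        'docker', 'run',
        # terminal_cmds: unbuffered output plus window-size env flags
        '-e', 'PYTHONUNBUFFERED=1',
        # the container image to run
        'abc1234',
        # launcher args from um.mlflow_args (may be empty)
        '--caliban_config', '{"env": {}}',
        # experiment args, always last so the launcher passes them through
        '--learning_rate', '0.01',
    ],
    'container': 'abc1234',
}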
Example #2
def _job_specs(
    job_name: str,
    training_input: Dict[str, Any],
    labels: Dict[str, str],
    experiments: Iterable[ht.Experiment],
    caliban_config: Optional[Dict[str, Any]] = None,
) -> Iterable[ht.JobSpec]:
  """Returns a generator that yields a JobSpec instance for every possible
  combination of parameters in the supplied experiment config.

  All other arguments parametrize every JobSpec that's generated; labels,
  arguments and job id change for each JobSpec.

  This is lower-level than build_job_specs below.

  """
  caliban_config = caliban_config or {}

  for idx, m in enumerate(experiments, 1):

    launcher_args = um.mlflow_args(
        caliban_config=caliban_config,
        experiment_name=m.xgroup.name,
        index=idx,
        tags={
            um.PLATFORM_TAG: ht.Platform.CAIP.value,
            **labels,
        },
    )

    cmd_args = ce.experiment_to_args(m.kwargs, m.args)

    # cmd args *must* be last in order for the launcher to pass them through
    args = launcher_args + cmd_args

    yield _job_spec(job_name=job_name,
                    idx=idx,
                    training_input={
                        **training_input, "args": args
                    },
                    labels=labels,
                    experiment=m)
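All of these call sites flatten an experiment into CLI arguments via ce.experiment_to_args. A minimal sketch of that flattening, assuming kwargs simply map flag names to values (this is an illustration, not the actual caliban implementation, which may treat booleans or lists specially):

from typing import Any, Dict, List

def experiment_to_args_sketch(kwargs: Dict[str, Any],
                              args: List[str]) -> List[str]:
  # Each kwarg becomes a '--key value' pair; positional args follow.
  flat: List[str] = []
  for k, v in kwargs.items():
    flat.extend([f'--{k}', str(v)])
  return flat + args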
Example #3
def test_mlflow_args():
    '''verifies that we generate the dynamic args for the caliban launcher
    script for mlflow integration'''

    # test case when caliban_config has no mlflow configuration
    cfg = {
        'caliban_config': {
            'base_image': 'gcr.io/a/b'
        },
        'experiment_name': 'foo',
        'index': 42,
        'tags': {
            'a': 'x',
            'b': 7
        },
    }

    mlflow_args = um.mlflow_args(**cfg)
    assert len(mlflow_args) == 0

    # test case when caliban_config has empty mlflow configuration
    cfg = {
        'caliban_config': {
            'base_image': 'gcr.io/a/b',
            'mlflow_config': None
        },
        'experiment_name': 'foo',
        'index': 42,
        'tags': {
            'a': 'x',
            'b': 7
        },
    }

    mlflow_args = um.mlflow_args(**cfg)
    assert len(mlflow_args) == 0

    # test case when caliban_config has mlflow configuration
    cfg = {
        'caliban_config': {
            'mlflow_config': {}
        },
        'experiment_name': 'foo',
        'index': 42,
        'tags': {
            'a': 'x',
            'b': 7
        },
    }

    mlflow_args = um.mlflow_args(**cfg)

    assert len(mlflow_args) == 2
    assert mlflow_args[0] == '--caliban_config'

    # make sure that config is json dict
    arg_dict = json.loads(mlflow_args[1])
    assert isinstance(arg_dict, dict)

    assert 'env' in arg_dict
    env_vars = arg_dict['env']
    assert 'MLFLOW_EXPERIMENT_NAME' in env_vars
    assert env_vars['MLFLOW_EXPERIMENT_NAME'] == cfg['experiment_name']

    assert 'MLFLOW_RUN_NAME' in env_vars
    assert isinstance(env_vars['MLFLOW_RUN_NAME'], str)

    for k, v in cfg['tags'].items():
        k_e = f'ENVVAR_{k}'
        assert k_e in env_vars
        assert env_vars[k_e] == v
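The assertions above pin down the observable contract of um.mlflow_args. A reconstruction inferred purely from this test (the run-name scheme is an assumption; the real implementation may differ):

import json
import uuid
from typing import Any, Dict, List

def mlflow_args_sketch(caliban_config: Dict[str, Any],
                       experiment_name: str,
                       index: int,
                       tags: Dict[str, Any]) -> List[str]:
    # No launcher args unless an mlflow_config section is configured;
    # this covers both a missing key and an explicit None.
    if caliban_config.get('mlflow_config') is None:
        return []
    env = {
        'MLFLOW_EXPERIMENT_NAME': experiment_name,
        # the test only requires a string; a uuid-based name is one choice
        'MLFLOW_RUN_NAME': f'run-{index}-{uuid.uuid4().hex}',
        # tags are forwarded as ENVVAR_-prefixed entries
        **{f'ENVVAR_{k}': v for k, v in tags.items()},
    }
    # The launcher receives the whole config as one json argument.
    return ['--caliban_config', json.dumps({'env': env})]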
Example #4
    def create_simple_job_spec(
        self,
        experiment: Experiment,
        name: str,
        image: str,
        min_cpu: int,
        min_mem: int,
        index: int,
        command: Optional[List[str]] = None,
        env: Optional[Dict[str, str]] = None,
        accelerator: Optional[Accelerator] = None,
        accelerator_count: int = 1,
        namespace: str = k.DEFAULT_NAMESPACE,
        machine_type: Optional[MachineType] = None,
        preemptible: bool = True,
        preemptible_tpu: bool = True,
        tpu_driver: str = k.DEFAULT_TPU_DRIVER,
        labels: Optional[Dict[str, str]] = None,
        caliban_config: Optional[Dict[str, Any]] = None,
    ) -> Optional[JobSpec]:
        """creates a simple kubernetes job (1 container, 1 pod) JobSpec for this cluster

    Args:
    name: job name
    image: container image url (gcr.io/...)
    min_cpu: minimum cpu needed, in milli-cpu
    min_mem: minimum memory needed, in MB
    command: command to execute, None = container entrypoint
    args: args to pass to command
    env: environment vars for container
    accelerator: accelerator type, None=cpu only
    accelerator_count: accelerator count
    namespace: kubernetes namespace
    machine_type: machine type, None=default for mode (cpu/gpu)
    preemptible: use preemptible instance
    preemptible_tpu: use preemptible tpus
    tpu_driver: tpu driver to use
    labels: user labels to set
    caliban_config: caliban configuration dictionary

    Returns:
    JobSpec on success, None otherwise
    """

        caliban_config = caliban_config or {}
        labels = labels or {}
        env = env or {}

        launcher_args = um.mlflow_args(
            caliban_config=caliban_config,
            experiment_name=experiment.xgroup.name,
            index=index,
            tags={
                um.PLATFORM_TAG: Platform.GKE.value,
                **labels,
            },
        )

        cmd_args = ce.experiment_to_args(experiment.kwargs, experiment.args)

        # cmd args *must* be last in order for the launcher to pass them through
        args = launcher_args + cmd_args

        # ------------------------------------------------------------------------
        # container

        # tpu/gpu resources
        container_resources = V1ResourceRequirements(
            requests=Cluster.container_requests(min_cpu, min_mem),
            limits=Cluster.container_limits(
                accelerator,
                accelerator_count,
                preemptible_tpu,
            ),
        )

        container_env = [V1EnvVar(name=k, value=v) for k, v in env.items()]

        # this is a simple 1-container, 1-pod job, so we just name the
        # container the same thing (minus the generated suffix) as the job itself
        container = V1Container(
            name=name,
            image=image,
            command=command,
            args=args,
            resources=container_resources,
            env=container_env,
            image_pull_policy='Always',
        )

        # ------------------------------------------------------------------------
        # template

        # todo: should we support anything other than a 'never' restart policy?
        # see this for discussion
        # https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#pod-backoff-failure-policy

        tolerations = Cluster.tolerations(preemptible=preemptible)

        # backoff count plus 'OnFailure' may be correct here
        template_spec = V1PodSpec(
            restart_policy='Never',
            containers=[container],
            tolerations=tolerations,
            node_selector=Cluster.node_selector(
                preemptible=preemptible,
                machine_type=machine_type,
                accelerator=accelerator,
            ),
            host_ipc=True,
        )

        template = V1PodTemplateSpec(
            metadata=Cluster.template_metadata(
                accelerator=accelerator,
                tpu_driver=tpu_driver,
            ),
            spec=template_spec,
        )

        # ------------------------------------------------------------------------
        # job
        job_spec = V1JobSpec(template=template, backoff_limit=4)

        return JobSpec.get_or_create(
            experiment=experiment,
            spec=ApiClient().sanitize_for_serialization(job_spec),
            platform=Platform.GKE,
        )
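Per the docstring, min_cpu is given in milli-cpu and min_mem in MB; in kubernetes resource terms those translate to quantity strings. A hypothetical helper showing the mapping that Cluster.container_requests presumably performs (an assumption for illustration, not the cluster code itself):

from typing import Dict

def container_requests_sketch(min_cpu: int, min_mem: int) -> Dict[str, str]:
    # kubernetes cpu quantities use the 'm' (millicore) suffix;
    # 'M' denotes decimal megabytes for memory quantities.
    return {'cpu': f'{min_cpu}m', 'memory': f'{min_mem}M'}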