Example #1
def test_job_spec(engine: Engine):

    job_spec = {'a': 2, 'b': [0, 1, 2], 'c': {'x': 1, 'y': 'foo'}}
    container_spec = {
        'nogpu': True,
        'cloud_key': '/path/to/key.json',
        'image_tag': None,
        'dir': ['/extra/path0', '/extra/path2'],
        'base_dir': '/home/foo',
        'module': 'train.py'
    }

    def validate_spec(session) -> JobSpec:
        s = session.query(JobSpec).all()
        assert len(s) == 1
        s = s[0]
        assert s.platform == Platform.LOCAL
        assert s.spec == job_spec
        return s

    # test basic creation
    with session_scope(engine) as session:
        xg = ExperimentGroup.get_or_create(session=session)
        c = ContainerSpec.get_or_create(session=session, spec=container_spec)
        e = Experiment.get_or_create(xgroup=xg, container_spec=c)
        j = JobSpec.get_or_create(
            experiment=e,
            spec=job_spec,
            platform=Platform.LOCAL,
        )
        session.add(xg)

    # test basic persistence, then add duplicate
    with session_scope(engine) as session:
        s = validate_spec(session)

        session.add(
            JobSpec.get_or_create(
                experiment=s.experiment,
                spec=job_spec,
                platform=Platform.LOCAL,
            ))

    # test get_or_create, then create new spec
    with session_scope(engine) as session:
        s = validate_spec(session)

        session.add(
            JobSpec.get_or_create(
                experiment=s.experiment,
                spec=job_spec,
                platform=Platform.CAIP,
            ))

    # verify that the new spec was persisted
    with session_scope(engine) as session:
        s = session.query(JobSpec).all()
        assert len(s) == 2
        assert s[0].spec == s[1].spec
        assert s[0].platform != s[1].platform
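
The tests above and below rely on session_scope to commit or roll back each block. A minimal sketch of that pattern, assuming SQLAlchemy; this is not necessarily the project's actual implementation:

from contextlib import contextmanager
from sqlalchemy.orm import sessionmaker

@contextmanager
def session_scope(engine):
    # open a session bound to the engine, commit on success, roll back on error
    session = sessionmaker(bind=engine)()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()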
Example #2
def test_job(engine: Engine):

    args = ['a', 4]
    kwargs = {'k0': 0, 'k1': 'xyz'}
    job_spec = {'a': 2, 'b': [0, 1, 2], 'c': {'x': 1, 'y': 'foo'}}
    container_spec = {
        'nogpu': True,
        'cloud_key': '/path/to/key.json',
        'image_tag': None,
        'dir': ['/extra/path0', '/extra/path2'],
        'base_dir': '/home/foo',
        'module': 'train.py'
    }

    # test basic job creation
    with session_scope(engine) as session:

        xg = ExperimentGroup()
        c = ContainerSpec.get_or_create(session=session, spec=container_spec)
        e = Experiment.get_or_create(
            xgroup=xg,
            container_spec=c,
            args=args,
            kwargs=kwargs,
        )

        jspec = JobSpec.get_or_create(
            experiment=e,
            spec=job_spec,
            platform=Platform.TEST,
        )

        job = Job(spec=jspec, container='container0', details={'job_id': 123})
        session.add(e)

    # test job persistence
    with session_scope(engine) as session:
        j = session.query(Job).all()
        assert len(j) == 1
        j = j[0]
        assert j.container == 'container0'
        assert j.experiment.args == args
        assert j.experiment.kwargs == kwargs
        assert j.spec.spec == job_spec
        assert j.details['job_id'] == 123
Example #3
def replace_caip_job_spec_image(spec: JobSpec, image_id: str) -> JobSpec:
  '''generates a new JobSpec based on an existing one, but replacing the
  image id

  Args:
  spec: job spec used as basis
  image_id: new image id

  Returns:
  new JobSpec
  '''

  new_spec = deepcopy(spec.spec)
  new_spec['trainingInput']['masterConfig']['imageUri'] = image_id

  return JobSpec.get_or_create(experiment=spec.experiment,
                               spec=new_spec,
                               platform=Platform.CAIP)
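
For reference, a standalone sketch of the key replacement above on a hypothetical CAIP-style spec dict; the nested keys match what the function assumes, the values are made up:

from copy import deepcopy

old_spec = {'trainingInput': {'masterConfig': {'imageUri': 'gcr.io/proj/img:v1'}}}
new_spec = deepcopy(old_spec)
new_spec['trainingInput']['masterConfig']['imageUri'] = 'gcr.io/proj/img:v2'

# deepcopy leaves the original spec untouched
assert old_spec['trainingInput']['masterConfig']['imageUri'] == 'gcr.io/proj/img:v1'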
Example #4
def replace_gke_job_spec_image(spec: JobSpec, image_id: str) -> JobSpec:
  '''generates a new JobSpec based on an existing one, but replacing the
  image id

  Args:
  spec: job spec used as basis
  image_id: new image id

  Returns:
  new JobSpec
  '''

  new_spec = deepcopy(spec.spec)
  for container in new_spec['template']['spec']['containers']:
    container['image'] = image_id

  return JobSpec.get_or_create(
      experiment=spec.experiment,
      spec=new_spec,
      platform=Platform.GKE,
  )
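
The same idea on a hypothetical GKE-style spec with more than one container: every container in the pod template receives the new image (values below are illustrative):

from copy import deepcopy

old_spec = {'template': {'spec': {'containers': [{'image': 'img:v1'},
                                                 {'image': 'img:v1'}]}}}
new_spec = deepcopy(old_spec)
for container in new_spec['template']['spec']['containers']:
    container['image'] = 'img:v2'

assert all(c['image'] == 'img:v2' for c in new_spec['template']['spec']['containers'])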
Example #5
def replace_local_job_spec_image(spec: JobSpec, image_id: str) -> JobSpec:
  '''generates a new JobSpec based on an existing one, but replacing the
  image id

  Args:
  spec: job spec used as basis
  image_id: new image id

  Returns:
  new JobSpec
  '''

  old_image = spec.spec['container']
  old_cmd = spec.spec['command']
  new_cmd = list(map(lambda x: x if x != old_image else image_id, old_cmd))

  return JobSpec.get_or_create(
      experiment=spec.experiment,
      spec={
          'command': new_cmd,
          'container': image_id,
      },
      platform=Platform.LOCAL,
  )
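
In the local case the image id appears both as the 'container' value and inside the command list, so the function rewrites both. An illustrative run of the same substitution on a hypothetical local spec:

old_spec = {'container': 'img:v1',
            'command': ['docker', 'run', 'img:v1', '--epochs', '10']}
image_id = 'img:v2'

# replace every occurrence of the old image in the command list
new_cmd = [x if x != old_spec['container'] else image_id for x in old_spec['command']]
# new_cmd == ['docker', 'run', 'img:v2', '--epochs', '10']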
Example #6
def run_experiments(job_mode: c.JobMode,
                    run_args: Optional[List[str]] = None,
                    script_args: Optional[List[str]] = None,
                    image_id: Optional[str] = None,
                    dry_run: bool = False,
                    experiment_config: Optional[ce.ExpConf] = None,
                    xgroup: Optional[str] = None,
                    **build_image_kwargs) -> None:
    """Builds an image using the supplied **build_image_kwargs and calls `docker
  run` on the resulting image using sensible defaults.

  Keyword args:

  - job_mode: c.JobMode.

  - run_args: extra arguments to supply to `docker run` after our defaults.
  - script_args: extra arguments to supply to the entrypoint. (You can
    override the default container entrypoint by supplying a new one inside
    run_args.)
  - image_id: ID of the image to run. Supplying this will skip an image build.
  - experiment_config: dict of string to list, boolean, string or int. Any
    lists will trigger a cartesian product out with the rest of the config. A
    job will be executed for every combination of parameters in the experiment
    config.
  - dry_run: if True, no jobs are executed and docker doesn't actually build;
    logging side effects show the user what would happen if dry_run were not
    set.
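  - xgroup: experiment group name under which the generated experiments are
    created (passed through to create_experiments).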

  Any extra kwargs supplied are passed through to build_image.
  """
    if run_args is None:
        run_args = []

    if script_args is None:
        script_args = []

    if experiment_config is None:
        experiment_config = {}

    docker_args = {k: v for k, v in build_image_kwargs.items()}
    docker_args['job_mode'] = job_mode

    engine = get_mem_engine() if dry_run else get_sql_engine()

    with session_scope(engine) as session:
        container_spec = generate_container_spec(session, docker_args,
                                                 image_id)

        if image_id is None:
            if dry_run:
                logging.info("Dry run - skipping actual 'docker build'.")
                image_id = 'dry_run_tag'
            else:
                image_id = b.build_image(**docker_args)

        experiments = create_experiments(
            session=session,
            container_spec=container_spec,
            script_args=script_args,
            experiment_config=experiment_config,
            xgroup=xgroup,
        )

        job_specs = [
            JobSpec.get_or_create(
                experiment=x,
                spec=_create_job_spec_dict(
                    experiment=x,
                    job_mode=job_mode,
                    run_args=run_args,
                    image_id=image_id,
                ),
                platform=Platform.LOCAL,
            ) for x in experiments
        ]

        try:
            execute_jobs(job_specs=job_specs, dry_run=dry_run)
        except Exception as e:
            logging.error(f'exception: {e}')
            session.commit()  # commit here, otherwise will be rolled back
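
A sketch of the cartesian-product behaviour the docstring describes for experiment_config; the real expansion happens inside create_experiments, and the config keys here are invented for illustration:

import itertools

config = {'lr': [0.01, 0.001], 'batch_size': 32, 'optimizer': ['adam', 'sgd']}
keys = list(config)
# wrap scalar values so every key contributes exactly one axis to the product
value_lists = [v if isinstance(v, list) else [v] for v in config.values()]
combos = [dict(zip(keys, vals)) for vals in itertools.product(*value_lists)]

# combos contains 4 dicts: every (lr, optimizer) pair, each with batch_size=32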
Example #7
    def create_simple_job_spec(
            self,
            experiment: Experiment,
            name: str,
            image: str,
            min_cpu: int,
            min_mem: int,
            command: Optional[List[str]] = None,
            env: Dict[str, str] = {},
            accelerator: Optional[Accelerator] = None,
            accelerator_count: int = 1,
            namespace: str = k.DEFAULT_NAMESPACE,
            machine_type: Optional[MachineType] = None,
            preemptible: bool = True,
            preemptible_tpu: bool = True,
            tpu_driver: str = k.DEFAULT_TPU_DRIVER) -> Optional[JobSpec]:
        """creates a simple kubernetes job (1 container, 1 pod) JobSpec for this cluster

    Args:
    experiment: experiment whose args/kwargs are converted into the container args
    name: job name
    image: container image url (gcr.io/...)
    min_cpu: minimum cpu needed, in milli-cpu
    min_mem: minimum memory needed, in MB
    command: command to execute, None = container entrypoint
    env: environment vars for container
    accelerator: accelerator type, None=cpu only
    accelerator_count: accelerator count
    namespace: kubernetes namespace
    machine_type: machine type, None=default for mode (cpu/gpu)
    preemptible: use preemptible instance
    preemptible_tpu: use preemptible tpus
    tpu_driver: tpu driver to use

    Returns:
    JobSpec on success, None otherwise
    """

        args = conf.experiment_to_args(experiment.kwargs, experiment.args)

        # ------------------------------------------------------------------------
        # container

        # tpu/gpu resources
        container_resources = V1ResourceRequirements(
            requests=Cluster.container_requests(min_cpu, min_mem),
            limits=Cluster.container_limits(
                accelerator,
                accelerator_count,
                preemptible_tpu,
            ),
        )

        container_env = [V1EnvVar(name=k, value=v) for k, v in env.items()]

        # this is a simple 1-container, 1-pod job, so we just name the
        # container the same thing (minus the generated suffix) as the job itself
        container = V1Container(
            name=name,
            image=image,
            command=command,
            args=args,
            resources=container_resources,
            env=container_env,
            image_pull_policy='Always',
        )

        # ------------------------------------------------------------------------
        # template

        # todo: should we support anything other than a 'never' restart policy?
        # see this for discussion
        # https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#pod-backoff-failure-policy

        tolerations = Cluster.tolerations(preemptible=preemptible)

        # backoff count plus 'OnFailure' may be correct here
        template_spec = V1PodSpec(
            restart_policy='Never',
            containers=[container],
            tolerations=tolerations,
            node_selector=Cluster.node_selector(
                preemptible=preemptible,
                machine_type=machine_type,
                accelerator=accelerator,
            ),
            host_ipc=True,
        )

        template = V1PodTemplateSpec(
            metadata=Cluster.template_metadata(
                accelerator=accelerator,
                tpu_driver=tpu_driver,
            ),
            spec=template_spec,
        )

        # ------------------------------------------------------------------------
        # job
        job_spec = V1JobSpec(template=template, backoff_limit=4)

        return JobSpec.get_or_create(
            experiment=experiment,
            spec=ApiClient().sanitize_for_serialization(job_spec),
            platform=Platform.GKE,
        )
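
The final step above converts the kubernetes client model into a plain dict before storing it as the JobSpec's spec. A small standalone sketch of that conversion, assuming the official kubernetes Python client is installed; names and values are illustrative:

from kubernetes.client import ApiClient, V1Container

container = V1Container(name='demo',
                        image='gcr.io/proj/img:latest',
                        image_pull_policy='Always')

# sanitize_for_serialization turns the model into JSON-friendly primitives,
# e.g. {'name': 'demo', 'image': 'gcr.io/proj/img:latest', 'imagePullPolicy': 'Always'}
as_dict = ApiClient().sanitize_for_serialization(container)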