Beispiel #1
0
def test_container_spec(engine: Engine):
    '''ContainerSpec creation, persistence and its link to experiments'''

    expected = {
        'nogpu': True,
        'cloud_key': '/path/to/key.json',
        'image_tag': None,
        'dir': ['/extra/path0', '/extra/path2'],
        'base_dir': '/home/foo',
        'module': 'train.py',
    }

    def validate_spec(session) -> ContainerSpec:
        '''return the only stored ContainerSpec after checking its contents'''
        stored = session.query(ContainerSpec).all()
        assert len(stored) == 1
        found = stored[0]
        assert found.spec == expected
        assert found.user == current_user()
        return found

    # create a container spec and hand it to the session
    with session_scope(engine) as session:
        session.add(ContainerSpec.get_or_create(session=session,
                                                spec=expected))

    # the spec must be readable from a new session; hang an experiment off it
    with session_scope(engine) as session:
        cspec = validate_spec(session)
        group = ExperimentGroup()
        experiment = Experiment.get_or_create(xgroup=group,
                                              container_spec=cspec)

    # the spec must list exactly that experiment, which must point back at it
    with session_scope(engine) as session:
        cspec = validate_spec(session)
        assert len(cspec.experiments) == 1
        assert cspec.experiments[0].container_spec.id == cspec.id
Beispiel #2
0
def test_experiment_group(engine: Engine):
    '''ExperimentGroup creation, persistence and get_or_create behavior'''

    def verify_xg(session):
        '''return the only stored ExperimentGroup, asserting there is one'''
        groups = session.query(ExperimentGroup).all()
        assert len(groups) == 1
        return groups[0]

    # create a group without a name and hand it to the session
    with session_scope(engine) as session:
        session.add(ExperimentGroup.get_or_create(session=session))

    # reference point: the first group is older, later groups are newer
    test_timestamp = datetime.now()

    # the group must be stored and predate the timestamp; request it again
    with session_scope(engine) as session:
        first = verify_xg(session)
        assert first.created < test_timestamp
        duplicate = ExperimentGroup.get_or_create(session=session)

    # still a single group after the repeated request; now add a named one
    with session_scope(engine) as session:
        verify_xg(session)
        named = ExperimentGroup.get_or_create(session=session,
                                              name='new-xgroup')
        session.add(named)

    # only the named group was created after the timestamp
    with session_scope(engine) as session:
        recent = session.query(ExperimentGroup).filter(
            ExperimentGroup.created > test_timestamp).all()
        assert len(recent) == 1
        assert recent[0].name == 'new-xgroup'
Beispiel #3
0
def test_job_spec(engine: Engine):
    '''JobSpec creation, persistence and get_or_create behavior'''

    job_spec = {'a': 2, 'b': [0, 1, 2], 'c': {'x': 1, 'y': 'foo'}}
    container_spec = {
        'nogpu': True,
        'cloud_key': '/path/to/key.json',
        'image_tag': None,
        'dir': ['/extra/path0', '/extra/path2'],
        'base_dir': '/home/foo',
        'module': 'train.py',
    }

    def validate_spec(session) -> JobSpec:
        '''return the only stored JobSpec after checking its contents'''
        stored = session.query(JobSpec).all()
        assert len(stored) == 1
        found = stored[0]
        assert found.platform == Platform.LOCAL
        assert found.spec == job_spec
        return found

    def request_spec(experiment, platform) -> JobSpec:
        '''get_or_create a JobSpec carrying job_spec for the given platform'''
        return JobSpec.get_or_create(
            experiment=experiment,
            spec=job_spec,
            platform=platform,
        )

    # build the group -> experiment -> job spec chain and add the group
    with session_scope(engine) as session:
        group = ExperimentGroup.get_or_create(session=session)
        cspec = ContainerSpec.get_or_create(session=session,
                                            spec=container_spec)
        experiment = Experiment.get_or_create(xgroup=group,
                                              container_spec=cspec)
        local_spec = request_spec(experiment, Platform.LOCAL)
        session.add(group)

    # the job spec must be stored; request an identical one
    with session_scope(engine) as session:
        found = validate_spec(session)
        session.add(request_spec(found.experiment, Platform.LOCAL))

    # still a single job spec; now request one for a different platform
    with session_scope(engine) as session:
        found = validate_spec(session)
        session.add(request_spec(found.experiment, Platform.CAIP))

    # two job specs now exist, same spec dict but different platforms
    with session_scope(engine) as session:
        stored = session.query(JobSpec).all()
        assert len(stored) == 2
        first, second = stored[0], stored[1]
        assert first.spec == second.spec
        assert first.platform != second.platform
Beispiel #4
0
def test_experiment(engine: Engine):
    '''Experiment creation, persistence and get_or_create behavior'''

    container_spec = {
        'nogpu': True,
        'cloud_key': '/path/to/key.json',
        'image_tag': None,
        'dir': ['/extra/path0', '/extra/path2'],
        'base_dir': '/home/foo',
        'module': 'train.py',
    }
    exp_args = ['arg0', '3', 'arg1']
    exp_kwargs = {'k0': 1, 'k1': 's'}

    # create an experiment in a named group and add the group
    with session_scope(engine) as session:
        group = ExperimentGroup(name='foo-xgroup')
        cspec = ContainerSpec.get_or_create(session=session,
                                            spec=container_spec)
        # fresh copies so every call receives its own list/dict objects
        created = Experiment.get_or_create(
            xgroup=group,
            container_spec=cspec,
            args=list(exp_args),
            kwargs=dict(exp_kwargs),
        )
        session.add(group)

    # the experiment must be stored intact; request an identical one
    with session_scope(engine) as session:
        stored = session.query(Experiment).all()
        assert len(stored) == 1
        found = stored[0]
        assert found.args == exp_args
        assert found.kwargs == exp_kwargs
        assert found.xgroup.name == 'foo-xgroup'
        assert found.container_spec.spec == container_spec

        session.add(
            Experiment.get_or_create(
                xgroup=found.xgroup,
                container_spec=found.container_spec,
                args=list(exp_args),
                kwargs=dict(exp_kwargs),
            ))

    # the identical request must not have produced a second experiment
    with session_scope(engine) as session:
        stored = session.query(Experiment).all()
        assert len(stored) == 1
        assert stored[0].container_spec.spec == container_spec
Beispiel #5
0
def _display_recent_jobs(
    user: str,
    max_jobs: Optional[int] = None,
) -> None:
    '''log the most recently created jobs for the given user

    Args:
    user: only jobs whose user field equals this value are considered
    max_jobs: maximum number of jobs to retrieve and display across all
      experiment groups; None selects _DEFAULT_STATUS_MAX_JOBS, and zero or a
      negative value displays every job
    '''

    limit = max(0, _DEFAULT_STATUS_MAX_JOBS if max_jobs is None else max_jobs)

    with session_scope(get_sql_engine()) as session:
        query = session.query(Job).filter(Job.user == user)
        query = query.order_by(Job.created.desc())

        # a limit of zero means "no limit"
        if limit > 0:
            query = query.limit(limit)

        # the query yields newest first; flip it so the newest comes last
        jobs = list(reversed(query.all()))

        if not jobs:
            logging.info(f'No recent jobs found for user {user}.')
            return

        header = (f'most recent {limit} jobs for user {user}:\n'
                  if limit > 0 else f'all jobs for user {user}:\n')
        logging.info(header)

        _display_jobs_hierarchy(jobs=jobs)
Beispiel #6
0
def test_job(engine: Engine):
    '''Job creation and persistence, reached through its experiment'''

    args = ['a', 4]
    kwargs = {'k0': 0, 'k1': 'xyz'}
    job_spec = {'a': 2, 'b': [0, 1, 2], 'c': {'x': 1, 'y': 'foo'}}
    container_spec = {
        'nogpu': True,
        'cloud_key': '/path/to/key.json',
        'image_tag': None,
        'dir': ['/extra/path0', '/extra/path2'],
        'base_dir': '/home/foo',
        'module': 'train.py',
    }

    # build experiment -> job spec -> job, adding only the experiment
    with session_scope(engine) as session:
        group = ExperimentGroup()
        cspec = ContainerSpec.get_or_create(session=session,
                                            spec=container_spec)
        experiment = Experiment.get_or_create(
            xgroup=group,
            container_spec=cspec,
            args=args,
            kwargs=kwargs,
        )
        jspec = JobSpec.get_or_create(
            experiment=experiment,
            spec=job_spec,
            platform=Platform.TEST,
        )
        job = Job(spec=jspec, container='container0', details={'job_id': 123})
        session.add(experiment)

    # exactly one job must be stored, with every field intact
    with session_scope(engine) as session:
        stored = session.query(Job).all()
        assert len(stored) == 1
        found = stored[0]
        assert found.container == 'container0'
        assert found.experiment.args == args
        assert found.experiment.kwargs == kwargs
        assert found.spec.spec == job_spec
        assert found.details['job_id'] == 123
Beispiel #7
0
def stop(args: Dict[str, Any]) -> None:
    '''executes the `caliban stop` cli command

    Looks up jobs stored as SUBMITTED or RUNNING (optionally restricted to a
    single experiment group), refreshes their status, lists the ones that are
    still active and, once the user confirms, requests that each be stopped.

    Args:
    args: argument dictionary; the keys read here are 'xgroup' (experiment
      group name, may be absent) and 'dry_run' (only list the jobs, defaults
      to False)
    '''

    # NOTE(review): user is never used below, so the query is not restricted
    # to the current user's jobs - confirm whether that is intended
    user = current_user()
    xgroup = args.get('xgroup')
    dry_run = args.get('dry_run', False)

    active_states = [JobStatus.SUBMITTED, JobStatus.RUNNING]

    with session_scope(get_sql_engine()) as session:
        query = session.query(Job).join(Experiment).join(
            ExperimentGroup).filter(
                or_(Job.status == JobStatus.SUBMITTED,
                    Job.status == JobStatus.RUNNING))

        if xgroup is not None:
            query = query.filter(ExperimentGroup.name == xgroup)

        running_jobs = query.all()

        if not running_jobs:
            logging.info('no running jobs found')
            return

        # this is necessary to filter out jobs that have finished but whose
        # status has not yet been updated in the backing store
        running_jobs = [
            j for j in running_jobs if update_job_status(j) in active_states
        ]

        # every candidate may turn out to be finished once refreshed; without
        # this check the user would be asked to confirm stopping zero jobs
        if not running_jobs:
            logging.info('no running jobs found')
            return

        logging.info('the following jobs would be stopped:')
        for j in running_jobs:
            logging.info(_experiment_command_str(j.experiment))
            logging.info(f'    job {_job_str(j)}')

        if dry_run:
            logging.info('to actually stop these jobs, re-run the command '
                         'without the --dry_run flag')
            return

        # make sure
        if not user_verify(
                f'do you wish to stop these {len(running_jobs)} jobs?', False):
            return

        for j in running_jobs:
            logging.info(f'stopping job: {_job_str(j)}')
            stop_job(j)

        logging.info(
            'requested job cancellation, please be patient as it may take '
            'a short while for this status change to be reflected in the '
            'gcp dashboard or from the `caliban status` command.')
Beispiel #8
0
def _display_xgroup(
    xgroup: str,
    user: str,
    max_jobs: Optional[int] = None,
) -> None:
    '''log the experiments and jobs of one experiment group

    Experiments are listed under the container spec they use, with container
    specs ordered by id.

    Args:
    xgroup: name of the experiment group to display
    user: the group must belong to this user
    max_jobs: how many jobs to display for each experiment, taken from the
      end of the experiment's job list; None displays a single job, and zero
      or a negative value displays all of them
    '''

    jobs_per_experiment = max(0, 1 if max_jobs is None else max_jobs)

    with session_scope(get_sql_engine()) as session:
        xg = (session.query(ExperimentGroup)
              .filter(ExperimentGroup.name == xgroup)
              .filter(ExperimentGroup.user == user)
              .first())

        if xg is None:
            logging.info(f'xgroup {xgroup} not found')
            return

        # distinct container specs used by this group, in id order
        container_specs = sorted(
            {e.container_spec for e in xg.experiments},
            key=lambda spec: spec.id,
        )

        logging.info(f'xgroup {xg.name}:')
        for cs in container_specs:
            logging.info(f'docker config {_container_spec_str(cs)}')

            matching = (e for e in xg.experiments
                        if e.container_spec.id == cs.id)
            for e in matching:
                logging.info(
                    f'  experiment id {e.id}: {_experiment_command_str(e)}')

                if len(e.jobs) == 0:
                    logging.info(f'    no jobs found')
                    continue

                # with a count of zero the slice is [-0:], i.e. the full list
                for j in e.jobs[-jobs_per_experiment:]:
                    logging.info(f'    job {_job_str(j)}')
Beispiel #9
0
def submit_ml_job(job_mode: conf.JobMode,
                  docker_args: Dict[str, Any],
                  region: ct.Region,
                  project_id: str,
                  credentials_path: Optional[str] = None,
                  dry_run: bool = False,
                  job_name: Optional[str] = None,
                  machine_type: Optional[ct.MachineType] = None,
                  gpu_spec: Optional[ct.GPUSpec] = None,
                  tpu_spec: Optional[ct.TPUSpec] = None,
                  image_tag: Optional[str] = None,
                  labels: Optional[Dict[str, str]] = None,
                  experiment_config: Optional[conf.ExpConf] = None,
                  script_args: Optional[List[str]] = None,
                  request_retries: Optional[int] = None,
                  xgroup: Optional[str] = None) -> None:
    """Main entry point of the module: submits a batch of AI Platform jobs.

    In order, this function:

    - generates a container spec from docker_args and image_tag
    - obtains an image tag via generate_image_tag, unless one was supplied
    - creates the experiments described by experiment_config and script_args
    - builds one job spec per experiment
    - hands the specs to execute_dry_run when dry_run is set, otherwise
      submits them via submit_job_specs

    An exception raised during submission is logged rather than re-raised, and
    the session is committed explicitly at that point.

    Keyword args:

    - job_mode: caliban.config.JobMode.
    - docker_args: arguments describing the image to build; passed through to
      caliban.docker.build_image.
    - region: region used for AI Platform job submission. Different regions
      support different GPUs.
    - project_id: GCloud project ID for container storage and job submission.
    - credentials_path: explicit path to a service account JSON file, if any.
    - dry_run: if True nothing is submitted and docker does not actually
      build; logging shows what would happen otherwise. An in-memory engine is
      used in place of the sql engine in this mode.
    - job_name: optional custom name, applied as a label to every job and used
      as a prefix for all jobIds submitted to Cloud. Defaults to "caliban_"
      followed by the current user.
    - machine_type: machine type to allocate for each job; must be one
      supported by Cloud. Defaults to conf.DEFAULT_MACHINE_TYPE[job_mode].
    - gpu_spec: count and type of GPUs to attach to the machine running each
      job. If None and job_mode is GPU, a single P100 is used.
    - tpu_spec: count and type of TPUs to attach to the machine running each
      job. If None, no TPU is attached.
    - image_tag: optional explicit tag of a Container-Registry-available Docker
      container. When supplied, the docker build and push phases are skipped
      and this tag is used directly.
    - labels: dictionary of KV pairs to apply to each job. User args are also
      applied as labels, plus a few default labels supplied by Caliban.
    - experiment_config: dict of string to list, boolean, string or int. Any
      lists trigger a cartesian product with the rest of the config; one job
      is submitted for every resulting combination of parameters.
    - script_args: extra arguments passed to every job, in addition to the
      arguments created by expanding the experiment config.
    - request_retries: number of times to retry each request if it fails for
      a timeout or a rate limiting request. Defaults to 10.
    - xgroup: experiment group for this submission; if None a new group will
      be created.
    """
    # fill in defaults for everything the caller left unspecified
    script_args = [] if script_args is None else script_args

    if job_name is None:
        job_name = f"caliban_{u.current_user()}"

    if job_mode == conf.JobMode.GPU and gpu_spec is None:
        gpu_spec = ct.GPUSpec(ct.GPU.P100, 1)

    if machine_type is None:
        machine_type = conf.DEFAULT_MACHINE_TYPE[job_mode]

    experiment_config = {} if experiment_config is None else experiment_config
    labels = {} if labels is None else labels
    request_retries = 10 if request_retries is None else request_retries

    # dry runs work against the in-memory engine instead of the sql engine
    engine = get_mem_engine() if dry_run else get_sql_engine()

    with session_scope(engine) as session:
        # generated before image_tag is possibly replaced below
        container_spec = generate_container_spec(session, docker_args,
                                                 image_tag)

        if image_tag is None:
            image_tag = generate_image_tag(project_id,
                                           docker_args,
                                           dry_run=dry_run)

        experiments = create_experiments(
            session=session,
            container_spec=container_spec,
            script_args=script_args,
            experiment_config=experiment_config,
            xgroup=xgroup,
        )

        specs = build_job_specs(
            job_name=job_name,
            image_tag=image_tag,
            region=region,
            machine_type=machine_type,
            experiments=experiments,
            user_labels=labels,
            gpu_spec=gpu_spec,
            tpu_spec=tpu_spec,
        )

        if dry_run:
            return execute_dry_run(specs)

        try:
            submit_job_specs(
                specs=specs,
                project_id=project_id,
                credentials_path=credentials_path,
                num_specs=len(experiments),
                request_retries=request_retries,
            )
        except Exception as e:
            logging.error(f'exception: {e}')
            session.commit()  # commit here, otherwise will be rolled back

        logging.info("")
        status_url = job_url(project_id, '')
        logging.info(
            t.green(f"Visit {status_url} to see the status of all jobs."))
        logging.info("")
Beispiel #10
0
def _job_submit(args: dict, cluster: Cluster) -> None:
    """submits job(s) to cluster

    The requested gpu/tpu configuration is validated against the cluster
    first; on any validation failure the function returns without submitting.
    Experiments and job specs are then created and, depending on args, the
    specs are logged (dry run), exported to a file, or submitted one by one.

    Args:
    args: argument dictionary
    cluster: cluster instance
    """

    script_args = conf.extract_script_args(args)
    job_mode = cli.resolve_job_mode(args)
    docker_args = cli.generate_docker_args(job_mode, args)
    docker_run_args = args.get('docker_run_args', []) or []
    dry_run = args['dry_run']
    package = args['module']
    job_name = _generate_job_name(args.get('name'))
    gpu_spec = args.get('gpu_spec')
    preemptible = not args['nonpreemptible']
    min_cpu = args.get('min_cpu')
    min_mem = args.get('min_mem')
    experiment_config = args.get('experiment_config') or [{}]
    xgroup = args.get('xgroup')
    image_tag = args.get('image_tag')
    export = args.get('export', None)

    labels = args.get('label')
    if labels is not None:
        labels = dict(u.sanitize_labels(labels))

    # Arguments to internally build the image required to submit to Cloud;
    # entries of docker_args take precedence over the two defaults.
    docker_m = {'job_mode': job_mode, 'package': package}
    docker_m.update(docker_args)

    # --------------------------------------------------------------------------
    # validate gpu spec
    if job_mode == conf.JobMode.GPU and gpu_spec is None:
        gpu_spec = k.DEFAULT_GPU_SPEC

    if not cluster.validate_gpu_spec(gpu_spec):
        return

    # --------------------------------------------------------------------------
    # validate tpu spec and driver
    tpu_spec = args.get('tpu_spec')
    preemptible_tpu = not args.get('nonpreemptible_tpu')
    tpu_driver = args.get('tpu_driver')

    if tpu_spec is not None:
        supported_tpus = cluster.get_tpu_types()
        if supported_tpus is None:
            logging.error('error getting valid tpu types for cluster')
            return

        if tpu_spec not in supported_tpus:
            logging.error('invalid tpu spec, cluster supports:')
            for tpu in supported_tpus:
                logging.info(f'{tpu.count}x{tpu.tpu.name}')
            return

        if not cluster.validate_tpu_driver(tpu_driver):
            logging.error(f'error: unsupported tpu driver {tpu_driver}')
            logging.info('supported tpu drivers for this cluster:')
            for driver in cluster.get_tpu_drivers():
                logging.info(f'  {driver}')
            return

    # --------------------------------------------------------------------------
    # resource minimums: explicit values win, otherwise the default depends on
    # whether any accelerator was requested
    cpu_only = tpu_spec is None and gpu_spec is None
    min_cpu = min_cpu or (k.DEFAULT_MIN_CPU_CPU
                          if cpu_only else k.DEFAULT_MIN_CPU_ACCEL)
    min_mem = min_mem or (k.DEFAULT_MIN_MEM_CPU
                          if cpu_only else k.DEFAULT_MIN_MEM_ACCEL)

    # convert accelerator spec
    accel_spec = Cluster.convert_accel_spec(gpu_spec, tpu_spec)
    if accel_spec is None:
        return

    accel, accel_count = accel_spec

    # --------------------------------------------------------------------------
    # dry runs work against the in-memory engine instead of the sql engine
    engine = get_mem_engine() if dry_run else get_sql_engine()

    with session_scope(engine) as session:
        # generated before image_tag is possibly replaced below
        container_spec = generate_container_spec(session, docker_m, image_tag)

        if image_tag is None:
            image_tag = generate_image_tag(cluster.project_id, docker_m,
                                           dry_run)

        experiments = create_experiments(
            session=session,
            container_spec=container_spec,
            script_args=script_args,
            experiment_config=experiment_config,
            xgroup=xgroup,
        )

        specs = list(
            cluster.create_simple_experiment_job_specs(
                name=utils.sanitize_job_name(job_name),
                image=image_tag,
                min_cpu=min_cpu,
                min_mem=min_mem,
                experiments=experiments,
                args=script_args,
                accelerator=accel,
                accelerator_count=accel_count,
                preemptible=preemptible,
                preemptible_tpu=preemptible_tpu,
                tpu_driver=tpu_driver))

        # dry run: show the specs and stop
        if dry_run:
            logging.info('jobs that would be submitted:')
            for spec in specs:
                logging.info(f'\n{json.dumps(spec.spec, indent=2)}')
            return

        # export: write the jobs to a file instead of submitting them
        if export is not None:
            exported = _export_jobs(
                export,
                cluster.create_v1jobs(specs, job_name, labels),
            )
            if not exported:
                print(f'error exporting jobs to {export}')
            return

        # submit one by one, giving up at the first failure
        for spec in specs:
            try:
                cluster.submit_job(job_spec=spec, name=job_name, labels=labels)
            except Exception as e:
                logging.error(f'exception: {e}')
                session.commit()  # commit here, otherwise will be rolled back
                return

    # --------------------------------------------------------------------------
    logging.info(f'jobs submitted, visit {cluster.dashboard_url()} to monitor')
Beispiel #11
0
def run_experiments(job_mode: c.JobMode,
                    run_args: Optional[List[str]] = None,
                    script_args: Optional[List[str]] = None,
                    image_id: Optional[str] = None,
                    dry_run: bool = False,
                    experiment_config: Optional[c.ExpConf] = None,
                    xgroup: Optional[str] = None,
                    **build_image_kwargs) -> None:
    """Builds an image from **build_image_kwargs (unless image_id is given)
    and executes one local job per experiment on it.

    An exception raised while executing the jobs is logged rather than
    re-raised, and the session is committed explicitly at that point.

    Keyword args:

    - job_mode: c.JobMode.
    - run_args: extra arguments to supply to `docker run` after our defaults.
    - script_args: extra arguments to supply to the entrypoint. (You can
      override the default container entrypoint by supplying a new one inside
      run_args.)
    - image_id: ID of the image to run. Supplying this will skip an image
      build.
    - dry_run: if True, no actual jobs will be executed and docker won't
      actually build; logging side effects will show the user what will happen
      without dry_run=True. An in-memory engine is used in place of the sql
      engine in this mode.
    - experiment_config: dict of string to list, boolean, string or int. Any
      lists will trigger a cartesian product out with the rest of the config.
      A job will be executed for every combination of parameters in the
      experiment config.
    - xgroup: passed through to create_experiments.

    any extra kwargs supplied are passed through to build_image.
    """
    run_args = [] if run_args is None else run_args
    script_args = [] if script_args is None else script_args
    experiment_config = {} if experiment_config is None else experiment_config

    # copy of the build kwargs, extended with the job mode
    docker_args = {**build_image_kwargs, 'job_mode': job_mode}

    engine = get_mem_engine() if dry_run else get_sql_engine()

    with session_scope(engine) as session:
        # generated before image_id is possibly replaced below
        container_spec = generate_container_spec(session, docker_args,
                                                 image_id)

        if image_id is None:
            if dry_run:
                logging.info("Dry run - skipping actual 'docker build'.")
                image_id = 'dry_run_tag'
            else:
                image_id = build_image(**docker_args)

        experiments = create_experiments(
            session=session,
            container_spec=container_spec,
            script_args=script_args,
            experiment_config=experiment_config,
            xgroup=xgroup,
        )

        # one local job spec per experiment
        job_specs = []
        for experiment in experiments:
            spec_dict = _create_job_spec_dict(
                experiment=experiment,
                job_mode=job_mode,
                run_args=run_args,
                image_id=image_id,
            )
            job_specs.append(
                JobSpec.get_or_create(
                    experiment=experiment,
                    spec=spec_dict,
                    platform=Platform.LOCAL,
                ))

        try:
            execute_jobs(job_specs=job_specs, dry_run=dry_run)
        except Exception as e:
            logging.error(f'exception: {e}')
            session.commit()  # commit here, otherwise will be rolled back
Beispiel #12
0
def resubmit(args: Dict[str, Any]) -> None:
    '''executes the `caliban resubmit` command

    Selects the jobs to resubmit for an experiment group, lists them, and
    after confirmation rebuilds their containers and submits new job specs,
    one platform at a time. A failure on one platform is logged and does not
    prevent the remaining platforms from being attempted.

    Args:
    args: argument dictionary; the keys read here are 'xgroup' (required
      experiment group name), 'dry_run' (only list the jobs, defaults to
      False), 'all_jobs' (defaults to False), 'project_id' and 'cloud_key'
    '''

    user = current_user()
    xgroup = args.get('xgroup')
    dry_run = args.get('dry_run', False)
    all_jobs = args.get('all_jobs', False)
    project_id = args.get('project_id')
    creds_file = args.get('cloud_key')
    rebuild = True

    if xgroup is None:
        logging.error('you must specify an experiment group for this command')
        return

    with session_scope(get_sql_engine()) as session:
        jobs = _get_resubmit_jobs(
            session=session,
            xgroup=xgroup,
            user=user,
            all_jobs=all_jobs,
        )

        if jobs is None:
            return

        # if we have CAIP or GKE jobs, then we need to have a project_id
        project_id = _get_resubmit_project_id(jobs, project_id, creds_file)

        # show what would be done
        logging.info('the following jobs would be resubmitted:')
        for j in jobs:
            logging.info(_experiment_command_str(j.experiment))
            logging.info(f'  job {_job_str(j)}')

        if dry_run:
            logging.info('to actually resubmit these jobs, run this command '
                         'again without the --dry_run flag')
            return

        # make sure
        if not user_verify(f'do you wish to resubmit these {len(jobs)} jobs?',
                           False):
            return

        # rebuild all containers first
        if rebuild:
            logging.info('rebuilding containers...')
            image_id_map = _rebuild_containers(jobs, project_id=project_id)
        else:
            image_id_map = {j: j.container for j in jobs}

        # create new job specs
        job_specs = [
            replace_job_spec_image(spec=j.spec, image_id=image_id_map[j])
            for j in jobs
        ]

        # submit jobs, grouped by platform
        for platform in [Platform.CAIP, Platform.GKE, Platform.LOCAL]:
            pspecs = [s for s in job_specs if s.platform == platform]
            try:
                submit_job_specs(
                    specs=pspecs,
                    platform=platform,
                    project_id=project_id,
                    credentials_path=creds_file,
                )
            except Exception as e:
                # report the cause instead of discarding it, so that a failed
                # submission can be diagnosed from the log
                logging.error(f'exception: {e}')
                session.commit()  # avoid rollback
                logging.error('there was an error submitting some jobs')