def test_container_spec(engine: Engine):
    """Check ContainerSpec creation, persistence and its link to experiments."""
    expected_spec = {
        'nogpu': True,
        'cloud_key': '/path/to/key.json',
        'image_tag': None,
        'dir': ['/extra/path0', '/extra/path2'],
        'base_dir': '/home/foo',
        'module': 'train.py',
    }

    def validate_spec(session) -> ContainerSpec:
        """Assert exactly one ContainerSpec is stored, verify it, return it."""
        stored = session.query(ContainerSpec).all()
        assert len(stored) == 1
        only = stored[0]
        assert only.spec == expected_spec
        assert only.user == current_user()
        return only

    # basic creation
    with session_scope(engine) as session:
        created = ContainerSpec.get_or_create(
            session=session,
            spec=expected_spec,
        )
        session.add(created)

    # test persistence, then create experiment
    with session_scope(engine) as session:
        persisted = validate_spec(session)
        group = ExperimentGroup()
        # not added to the session explicitly; the last stage checks that it
        # is reachable through the container spec
        experiment = Experiment.get_or_create(
            xgroup=group,
            container_spec=persisted,
        )

    # test experiment parent-child relationship
    with session_scope(engine) as session:
        persisted = validate_spec(session)
        assert len(persisted.experiments) == 1
        assert persisted.experiments[0].container_spec.id == persisted.id
def test_experiment_group(engine: Engine):
    """Check ExperimentGroup creation, get_or_create reuse and time filtering."""

    def verify_xg(session):
        """Assert exactly one ExperimentGroup is stored and return it."""
        groups = session.query(ExperimentGroup).all()
        assert len(groups) == 1
        return groups[0]

    # basic creation
    with session_scope(engine) as session:
        first_group = ExperimentGroup.get_or_create(session=session)
        session.add(first_group)

    # reference point: the first group was created before this, the named
    # group further down is created after it
    test_timestamp = datetime.now()

    # test experiment group addition/persistence, test duplicate
    with session_scope(engine) as session:
        stored = verify_xg(session)
        assert stored.created < test_timestamp
        duplicate = ExperimentGroup.get_or_create(session=session)

    # test get_or_create, then create new xg
    with session_scope(engine) as session:
        stored = verify_xg(session)
        named_group = ExperimentGroup.get_or_create(
            session=session,
            name='new-xgroup',
        )
        session.add(named_group)

    # test getting recent experiment groups
    with session_scope(engine) as session:
        recent_query = session.query(ExperimentGroup).filter(
            ExperimentGroup.created > test_timestamp)
        recent = recent_query.all()
        assert len(recent) == 1
        assert recent[0].name == 'new-xgroup'
def test_job_spec(engine: Engine):
    """Check JobSpec creation, persistence and get_or_create de-duplication."""
    job_spec = {'a': 2, 'b': [0, 1, 2], 'c': {'x': 1, 'y': 'foo'}}
    container_spec = {
        'nogpu': True,
        'cloud_key': '/path/to/key.json',
        'image_tag': None,
        'dir': ['/extra/path0', '/extra/path2'],
        'base_dir': '/home/foo',
        'module': 'train.py',
    }

    def validate_spec(session) -> JobSpec:
        """Assert exactly one LOCAL JobSpec is stored, verify it, return it."""
        stored = session.query(JobSpec).all()
        assert len(stored) == 1
        only = stored[0]
        assert only.platform == Platform.LOCAL
        assert only.spec == job_spec
        return only

    def spec_for(experiment, platform):
        """Get or create a JobSpec with the shared spec dict."""
        return JobSpec.get_or_create(
            experiment=experiment,
            spec=job_spec,
            platform=platform,
        )

    # test basic creation
    with session_scope(engine) as session:
        group = ExperimentGroup.get_or_create(session=session)
        cspec = ContainerSpec.get_or_create(
            session=session,
            spec=container_spec,
        )
        experiment = Experiment.get_or_create(
            xgroup=group,
            container_spec=cspec,
        )
        created = spec_for(experiment, Platform.LOCAL)
        # only the group is added explicitly
        session.add(group)

    # test basic persistence, then add duplicate
    with session_scope(engine) as session:
        persisted = validate_spec(session)
        session.add(spec_for(persisted.experiment, Platform.LOCAL))

    # test get_or_create, then create new spec
    with session_scope(engine) as session:
        persisted = validate_spec(session)
        session.add(spec_for(persisted.experiment, Platform.CAIP))

    # verify that new spec was persisted
    with session_scope(engine) as session:
        all_specs = session.query(JobSpec).all()
        assert len(all_specs) == 2
        assert all_specs[0].spec == all_specs[1].spec
        assert all_specs[0].platform != all_specs[1].platform
def test_experiment(engine: Engine):
    """Check Experiment creation, persistence and get_or_create de-duplication."""
    container_spec = {
        'nogpu': True,
        'cloud_key': '/path/to/key.json',
        'image_tag': None,
        'dir': ['/extra/path0', '/extra/path2'],
        'base_dir': '/home/foo',
        'module': 'train.py',
    }

    with session_scope(engine) as session:
        group = ExperimentGroup(name='foo-xgroup')
        cspec = ContainerSpec.get_or_create(
            session=session,
            spec=container_spec,
        )
        experiment = Experiment.get_or_create(
            xgroup=group,
            container_spec=cspec,
            args=['arg0', '3', 'arg1'],
            kwargs={'k0': 1, 'k1': 's'},
        )
        # only the group is added explicitly
        session.add(group)

    # check basic persistence, then create duplicate experiment
    with session_scope(engine) as session:
        stored = session.query(Experiment).all()
        assert len(stored) == 1
        persisted = stored[0]
        assert persisted.args == ['arg0', '3', 'arg1']
        assert persisted.kwargs == {'k0': 1, 'k1': 's'}
        assert persisted.xgroup.name == 'foo-xgroup'
        assert persisted.container_spec.spec == container_spec

        duplicate = Experiment.get_or_create(
            xgroup=persisted.xgroup,
            container_spec=persisted.container_spec,
            args=['arg0', '3', 'arg1'],
            kwargs={'k0': 1, 'k1': 's'},
        )
        session.add(duplicate)

    # test that get_or_create worked as desired
    with session_scope(engine) as session:
        stored = session.query(Experiment).all()
        assert len(stored) == 1
        assert stored[0].container_spec.spec == container_spec
def _display_recent_jobs(
    user: str,
    max_jobs: Optional[int] = None,
) -> None:
    '''log the most recent jobs belonging to the given user

    Args:
    user: user name matched against Job.user
    max_jobs: cap on the number of jobs retrieved across all experiment
      groups; None selects _DEFAULT_STATUS_MAX_JOBS, and zero or a negative
      value removes the cap
    '''
    max_jobs = _DEFAULT_STATUS_MAX_JOBS if max_jobs is None else max_jobs
    max_jobs = max(0, max_jobs)
    limited = max_jobs > 0

    with session_scope(get_sql_engine()) as session:
        query = session.query(Job).filter(Job.user == user)
        query = query.order_by(Job.created.desc())
        if limited:
            query = query.limit(max_jobs)

        # rows are fetched newest-first, then flipped so the newest comes last
        recent_jobs = query.all()
        recent_jobs.reverse()

        if not recent_jobs:
            logging.info(f'No recent jobs found for user {user}.')
            return

        if limited:
            logging.info(f'most recent {max_jobs} jobs for user {user}:\n')
        else:
            logging.info(f'all jobs for user {user}:\n')

        _display_jobs_hierarchy(jobs=recent_jobs)
def test_job(engine: Engine):
    """Check that a Job and the objects it hangs off are persisted together."""
    args = ['a', 4]
    kwargs = {'k0': 0, 'k1': 'xyz'}
    job_spec = {'a': 2, 'b': [0, 1, 2], 'c': {'x': 1, 'y': 'foo'}}
    container_spec = {
        'nogpu': True,
        'cloud_key': '/path/to/key.json',
        'image_tag': None,
        'dir': ['/extra/path0', '/extra/path2'],
        'base_dir': '/home/foo',
        'module': 'train.py',
    }

    # test basic job creation
    with session_scope(engine) as session:
        group = ExperimentGroup()
        cspec = ContainerSpec.get_or_create(
            session=session,
            spec=container_spec,
        )
        experiment = Experiment.get_or_create(
            xgroup=group,
            container_spec=cspec,
            args=args,
            kwargs=kwargs,
        )
        jspec = JobSpec.get_or_create(
            experiment=experiment,
            spec=job_spec,
            platform=Platform.TEST,
        )
        job = Job(
            spec=jspec,
            container='container0',
            details={'job_id': 123},
        )
        # only the experiment is added explicitly
        session.add(experiment)

    # test job persistence
    with session_scope(engine) as session:
        stored = session.query(Job).all()
        assert len(stored) == 1
        persisted = stored[0]
        assert persisted.container == 'container0'
        assert persisted.experiment.args == args
        assert persisted.experiment.kwargs == kwargs
        assert persisted.spec.spec == job_spec
        assert persisted.details['job_id'] == 123
def stop(args: Dict[str, Any]) -> None:
    '''executes the `caliban stop` cli command

    Finds jobs whose stored status is SUBMITTED or RUNNING, refreshes each
    status, lists the jobs that are still active, and after confirmation
    requests that each one be stopped.

    Args:
    args: cli argument dictionary; 'xgroup' optionally restricts the search
      to one experiment group name, 'dry_run' only lists the jobs
    '''
    # NOTE(review): the current user is looked up but the query below is not
    # restricted to it -- confirm whether jobs should be filtered by Job.user
    user = current_user()
    xgroup = args.get('xgroup')
    dry_run = args.get('dry_run', False)
    active_states = (JobStatus.SUBMITTED, JobStatus.RUNNING)

    with session_scope(get_sql_engine()) as session:
        running_jobs = session.query(Job).join(Experiment).join(
            ExperimentGroup).filter(
                or_(Job.status == JobStatus.SUBMITTED,
                    Job.status == JobStatus.RUNNING))

        if xgroup is not None:
            running_jobs = running_jobs.filter(ExperimentGroup.name == xgroup)

        running_jobs = running_jobs.all()

        if not running_jobs:
            logging.info('no running jobs found')
            return

        # this is necessary to filter out jobs that have finished but whose
        # status has not yet been updated in the backing store
        running_jobs = [
            j for j in running_jobs if update_job_status(j) in active_states
        ]

        # every candidate may turn out to be finished once refreshed; bail
        # out instead of listing nothing and asking to stop 0 jobs
        if not running_jobs:
            logging.info('no running jobs found')
            return

        logging.info('the following jobs would be stopped:')
        for j in running_jobs:
            logging.info(_experiment_command_str(j.experiment))
            logging.info(f' job {_job_str(j)}')

        if dry_run:
            logging.info(
                'to actually stop these jobs, re-run the command without '
                'the --dry_run flag')
            return

        # make sure
        if not user_verify(
                f'do you wish to stop these {len(running_jobs)} jobs?', False):
            return

        for j in running_jobs:
            logging.info(f'stopping job: {_job_str(j)}')
            stop_job(j)

        logging.info(
            'requested job cancellation, please be patient as it may take '
            'a short while for this status change to be reflected in the '
            'gcp dashboard or from the `caliban status` command.')
def _display_xgroup(
    xgroup: str,
    user: str,
    max_jobs: Optional[int] = None,
) -> None:
    '''log the experiments and jobs of one experiment group for a user

    Output is grouped by container spec (ordered by id); under each spec come
    its experiments and, for each experiment, its trailing jobs.

    Args:
    xgroup: experiment group name
    user: user name matched against ExperimentGroup.user
    max_jobs: how many trailing jobs to show per experiment; None means 1
    '''
    # by default only the last job of each experiment is shown
    max_jobs = 1 if max_jobs is None else max_jobs
    max_jobs = max(0, max_jobs)

    with session_scope(get_sql_engine()) as session:
        query = session.query(ExperimentGroup)
        query = query.filter(ExperimentGroup.name == xgroup)
        query = query.filter(ExperimentGroup.user == user)
        xg = query.first()

        if xg is None:
            logging.info(f'xgroup {xgroup} not found')
            return

        unique_specs = {exp.container_spec for exp in xg.experiments}
        container_specs = sorted(unique_specs, key=lambda spec: spec.id)

        logging.info(f'xgroup {xg.name}:')
        for cs in container_specs:
            logging.info(f'docker config {_container_spec_str(cs)}')
            for e in xg.experiments:
                if e.container_spec.id == cs.id:
                    logging.info(
                        f' experiment id {e.id}: {_experiment_command_str(e)}')
                    if not e.jobs:
                        logging.info(f' no jobs found')
                        continue
                    # with max_jobs == 0 the slice [-0:] is the whole list,
                    # so every job is shown
                    for j in e.jobs[-max_jobs:]:
                        logging.info(f' job {_job_str(j)}')
def submit_ml_job(job_mode: conf.JobMode,
                  docker_args: Dict[str, Any],
                  region: ct.Region,
                  project_id: str,
                  credentials_path: Optional[str] = None,
                  dry_run: bool = False,
                  job_name: Optional[str] = None,
                  machine_type: Optional[ct.MachineType] = None,
                  gpu_spec: Optional[ct.GPUSpec] = None,
                  tpu_spec: Optional[ct.TPUSpec] = None,
                  image_tag: Optional[str] = None,
                  labels: Optional[Dict[str, str]] = None,
                  experiment_config: Optional[conf.ExpConf] = None,
                  script_args: Optional[List[str]] = None,
                  request_retries: Optional[int] = None,
                  xgroup: Optional[str] = None) -> None:
    """Top level entry point for submitting jobs to AI Platform.

    The function:

    - records a container spec for docker_args / image_tag
    - obtains an image tag via generate_image_tag unless one was supplied
    - creates the experiments described by experiment_config
    - builds one job spec per experiment, and
    - submits all job specs (or only reports them when dry_run is set)

    Keyword args:

    - job_mode: caliban.config.JobMode.
    - docker_args: arguments describing the image to build; passed to
      generate_container_spec and generate_image_tag.
    - region: the region to use for AI Platform job submission. Different
      regions support different GPUs.
    - project_id: GCloud project ID for container storage and job submission.
    - credentials_path: explicit path to a service account JSON file, if it
      exists.
    - dry_run: if True, no actual jobs will be submitted and docker won't
      actually build; logging side effects show the user what would happen.
      An in-memory engine is used instead of the sql engine.
    - job_name: optional custom name, defaults to "caliban_<current user>".
      Applied as a label to every job and used as a prefix for all jobIds
      submitted to Cloud.
    - machine_type: the machine type to allocate for each job, defaults to
      conf.DEFAULT_MACHINE_TYPE[job_mode]. Must be one supported by Cloud.
    - gpu_spec: if None and job_mode is GPU, defaults to a single P100.
      Otherwise configures the count and type of GPUs attached to the machine
      that runs each job.
    - tpu_spec: if None, no TPU is attached. Otherwise configures the count
      and type of TPUs attached to the machine that runs each job.
    - image_tag: optional explicit tag of a Container-Registry-available
      Docker container. If supplied, the docker build and push phases are
      skipped and this image_tag is used directly.
    - labels: dictionary of KV pairs to apply to each job. User args will
      also be applied as labels, plus a few default labels supplied by
      Caliban.
    - experiment_config: dict of string to list, boolean, string or int. Any
      lists trigger a cartesian product with the rest of the config; a job is
      submitted for every combination of parameters.
    - script_args: extra arguments passed to every job executed, in addition
      to the arguments created by expanding the experiment config.
    - request_retries: number of times to retry each request if it fails for
      a timeout or rate limiting, defaults to 10.
    - xgroup: experiment group for this submission, if None a new group will
      be created.
    """
    script_args = [] if script_args is None else script_args

    if job_name is None:
        job_name = "caliban_{}".format(u.current_user())

    if job_mode == conf.JobMode.GPU and gpu_spec is None:
        gpu_spec = ct.GPUSpec(ct.GPU.P100, 1)

    if machine_type is None:
        machine_type = conf.DEFAULT_MACHINE_TYPE[job_mode]

    experiment_config = {} if experiment_config is None else experiment_config
    labels = {} if labels is None else labels
    request_retries = 10 if request_retries is None else request_retries

    # a dry run must leave the persistent store untouched
    if dry_run:
        engine = get_mem_engine()
    else:
        engine = get_sql_engine()

    with session_scope(engine) as session:
        # the container spec is recorded with the tag exactly as supplied
        # (possibly None), before a tag is generated
        container_spec = generate_container_spec(
            session,
            docker_args,
            image_tag,
        )

        if image_tag is None:
            image_tag = generate_image_tag(
                project_id,
                docker_args,
                dry_run=dry_run,
            )

        experiments = create_experiments(
            session=session,
            container_spec=container_spec,
            script_args=script_args,
            experiment_config=experiment_config,
            xgroup=xgroup,
        )

        specs = build_job_specs(
            job_name=job_name,
            image_tag=image_tag,
            region=region,
            machine_type=machine_type,
            experiments=experiments,
            user_labels=labels,
            gpu_spec=gpu_spec,
            tpu_spec=tpu_spec,
        )

        if dry_run:
            return execute_dry_run(specs)

        try:
            submit_job_specs(
                specs=specs,
                project_id=project_id,
                credentials_path=credentials_path,
                num_specs=len(experiments),
                request_retries=request_retries,
            )
        except Exception as e:
            logging.error(f'exception: {e}')
            session.commit()  # commit here, otherwise will be rolled back

        status_url = job_url(project_id, '')
        logging.info("")
        logging.info(
            t.green("Visit {} to see the status of all jobs.".format(
                status_url)))
        logging.info("")
def _job_submit(args: dict, cluster: Cluster) -> None:
    """submits job(s) to cluster

    Validates the requested gpu/tpu configuration against the cluster, creates
    the experiments, builds one job spec per experiment and then either logs
    them (dry run), exports them to a file, or submits them.

    Args:
    args: argument dictionary
    cluster: cluster instance
    """
    script_args = conf.extract_script_args(args)
    job_mode = cli.resolve_job_mode(args)
    docker_args = cli.generate_docker_args(job_mode, args)
    docker_run_args = args.get('docker_run_args', []) or []  # currently unused
    dry_run = args['dry_run']
    package = args['module']
    job_name = _generate_job_name(args.get('name'))
    gpu_spec = args.get('gpu_spec')
    preemptible = not args['nonpreemptible']
    min_cpu = args.get('min_cpu')
    min_mem = args.get('min_mem')
    experiment_config = args.get('experiment_config') or [{}]
    xgroup = args.get('xgroup')
    image_tag = args.get('image_tag')
    export = args.get('export')

    labels = args.get('label')
    if labels is not None:
        labels = dict(u.sanitize_labels(labels))

    # Arguments to internally build the image required to submit to Cloud.
    docker_m = {'job_mode': job_mode, 'package': package, **docker_args}

    # --------------------------------------------------------------------------
    # validate gpu spec
    if gpu_spec is None and job_mode == conf.JobMode.GPU:
        gpu_spec = k.DEFAULT_GPU_SPEC

    if not cluster.validate_gpu_spec(gpu_spec):
        return

    # --------------------------------------------------------------------------
    # validate tpu spec and driver
    tpu_spec = args.get('tpu_spec')
    preemptible_tpu = not args.get('nonpreemptible_tpu')
    tpu_driver = args.get('tpu_driver')

    if tpu_spec is not None:
        available_tpu = cluster.get_tpu_types()
        if available_tpu is None:
            logging.error('error getting valid tpu types for cluster')
            return

        if tpu_spec not in available_tpu:
            logging.error('invalid tpu spec, cluster supports:')
            for tpu_type in available_tpu:
                logging.info('{}x{}'.format(tpu_type.count, tpu_type.tpu.name))
            return

        if not cluster.validate_tpu_driver(tpu_driver):
            logging.error(
                'error: unsupported tpu driver {}'.format(tpu_driver))
            logging.info('supported tpu drivers for this cluster:')
            for driver in cluster.get_tpu_drivers():
                logging.info(' {}'.format(driver))
            return

    # resource floors depend on whether any accelerator is attached
    cpu_only = tpu_spec is None and gpu_spec is None
    if cpu_only:
        min_cpu = min_cpu or k.DEFAULT_MIN_CPU_CPU
        min_mem = min_mem or k.DEFAULT_MIN_MEM_CPU
    else:
        min_cpu = min_cpu or k.DEFAULT_MIN_CPU_ACCEL
        min_mem = min_mem or k.DEFAULT_MIN_MEM_ACCEL

    # convert accelerator spec
    accel_spec = Cluster.convert_accel_spec(gpu_spec, tpu_spec)
    if accel_spec is None:
        return

    accel, accel_count = accel_spec

    # --------------------------------------------------------------------------
    # a dry run must leave the persistent store untouched
    if dry_run:
        engine = get_mem_engine()
    else:
        engine = get_sql_engine()

    with session_scope(engine) as session:
        # the container spec is recorded with the tag exactly as supplied
        # (possibly None), before a tag is generated
        container_spec = generate_container_spec(session, docker_m, image_tag)

        if image_tag is None:
            image_tag = generate_image_tag(
                cluster.project_id,
                docker_m,
                dry_run,
            )

        experiments = create_experiments(
            session=session,
            container_spec=container_spec,
            script_args=script_args,
            experiment_config=experiment_config,
            xgroup=xgroup,
        )

        specs = list(
            cluster.create_simple_experiment_job_specs(
                name=utils.sanitize_job_name(job_name),
                image=image_tag,
                min_cpu=min_cpu,
                min_mem=min_mem,
                experiments=experiments,
                args=script_args,
                accelerator=accel,
                accelerator_count=accel_count,
                preemptible=preemptible,
                preemptible_tpu=preemptible_tpu,
                tpu_driver=tpu_driver,
            ))

        # just a dry run
        if dry_run:
            logging.info('jobs that would be submitted:')
            for job_spec in specs:
                logging.info(f'\n{json.dumps(job_spec.spec, indent=2)}')
            return

        # export jobs to file; nothing is submitted in this mode
        if export is not None:
            v1jobs = cluster.create_v1jobs(specs, job_name, labels)
            exported = _export_jobs(export, v1jobs)
            if not exported:
                print('error exporting jobs to {}'.format(export))
            return

        for job_spec in specs:
            try:
                cluster.submit_job(
                    job_spec=job_spec,
                    name=job_name,
                    labels=labels,
                )
            except Exception as e:
                logging.error(f'exception: {e}')
                session.commit()  # commit here, otherwise will be rolled back
                return

    # --------------------------------------------------------------------------
    logging.info(f'jobs submitted, visit {cluster.dashboard_url()} to monitor')
def run_experiments(job_mode: c.JobMode,
                    run_args: Optional[List[str]] = None,
                    script_args: Optional[List[str]] = None,
                    image_id: Optional[str] = None,
                    dry_run: bool = False,
                    experiment_config: Optional[c.ExpConf] = None,
                    xgroup: Optional[str] = None,
                    **build_image_kwargs) -> None:
    """Builds an image from **build_image_kwargs (unless image_id is given)
    and executes one local job per experiment on it.

    Keyword args:

    - job_mode: c.JobMode.
    - run_args: extra arguments to supply to `docker run` after our defaults.
    - script_args: extra arguments to supply to the entrypoint. (You can
      override the default container entrypoint by supplying a new one inside
      run_args.)
    - image_id: ID of the image to run. Supplying this will skip an image
      build.
    - dry_run: if True, no actual jobs will be executed and docker won't
      actually build; logging side effects show the user what would happen.
      An in-memory engine is used instead of the sql engine.
    - experiment_config: dict of string to list, boolean, string or int. Any
      lists trigger a cartesian product with the rest of the config; a job is
      executed for every combination of parameters.
    - xgroup: experiment group name, forwarded to create_experiments.

    any extra kwargs supplied are passed through to build_image.
    """
    run_args = [] if run_args is None else run_args
    script_args = [] if script_args is None else script_args
    experiment_config = {} if experiment_config is None else experiment_config

    # copy the kwargs so that adding job_mode does not touch the caller's dict
    docker_args = dict(build_image_kwargs)
    docker_args['job_mode'] = job_mode

    # a dry run must leave the persistent store untouched
    if dry_run:
        engine = get_mem_engine()
    else:
        engine = get_sql_engine()

    with session_scope(engine) as session:
        # the container spec is recorded with the image id exactly as
        # supplied (possibly None), before any build happens
        container_spec = generate_container_spec(
            session,
            docker_args,
            image_id,
        )

        if image_id is None and dry_run:
            logging.info("Dry run - skipping actual 'docker build'.")
            image_id = 'dry_run_tag'
        elif image_id is None:
            image_id = build_image(**docker_args)

        experiments = create_experiments(
            session=session,
            container_spec=container_spec,
            script_args=script_args,
            experiment_config=experiment_config,
            xgroup=xgroup,
        )

        job_specs = []
        for experiment in experiments:
            spec_dict = _create_job_spec_dict(
                experiment=experiment,
                job_mode=job_mode,
                run_args=run_args,
                image_id=image_id,
            )
            job_specs.append(
                JobSpec.get_or_create(
                    experiment=experiment,
                    spec=spec_dict,
                    platform=Platform.LOCAL,
                ))

        try:
            execute_jobs(job_specs=job_specs, dry_run=dry_run)
        except Exception as e:
            logging.error(f'exception: {e}')
            session.commit()  # commit here, otherwise will be rolled back
def resubmit(args: Dict[str, Any]) -> None:
    '''executes the `caliban resubmit` command

    Looks up the jobs to resubmit for the given experiment group, lists them,
    and after confirmation rebuilds their containers and submits new job
    specs, one platform at a time.

    Args:
    args: cli argument dictionary; reads 'xgroup' (required), 'dry_run',
      'all_jobs', 'project_id' and 'cloud_key'
    '''
    user = current_user()
    xgroup = args.get('xgroup')
    dry_run = args.get('dry_run', False)
    all_jobs = args.get('all_jobs', False)
    project_id = args.get('project_id')
    creds_file = args.get('cloud_key')
    # containers are always rebuilt; the non-rebuild branch below is kept as
    # a switch but is never taken with this value
    rebuild = True

    if xgroup is None:
        logging.error('you must specify an experiment group for this command')
        return

    with session_scope(get_sql_engine()) as session:
        jobs = _get_resubmit_jobs(
            session=session,
            xgroup=xgroup,
            user=user,
            all_jobs=all_jobs,
        )

        if jobs is None:
            return

        # if we have CAIP or GKE jobs, then we need to have a project_id
        project_id = _get_resubmit_project_id(jobs, project_id, creds_file)

        # show what would be done
        logging.info('the following jobs would be resubmitted:')
        for j in jobs:
            logging.info(_experiment_command_str(j.experiment))
            logging.info(f' job {_job_str(j)}')

        if dry_run:
            logging.info(
                'to actually resubmit these jobs, run this command again '
                'without the --dry_run flag')
            return

        # make sure
        if not user_verify(
                f'do you wish to resubmit these {len(jobs)} jobs?', False):
            return

        # rebuild all containers first
        if rebuild:
            logging.info('rebuilding containers...')
            image_id_map = _rebuild_containers(jobs, project_id=project_id)
        else:
            image_id_map = {j: j.container for j in jobs}

        # create new job specs
        job_specs = [
            replace_job_spec_image(spec=j.spec, image_id=image_id_map[j])
            for j in jobs
        ]

        # submit jobs, grouped by platform
        for platform in [Platform.CAIP, Platform.GKE, Platform.LOCAL]:
            pspecs = [s for s in job_specs if s.platform == platform]
            try:
                submit_job_specs(
                    specs=pspecs,
                    platform=platform,
                    project_id=project_id,
                    credentials_path=creds_file,
                )
            except Exception as e:
                session.commit()  # avoid rollback
                # report the cause as well, not only that something failed
                logging.error(f'exception: {e}')
                logging.error('there was an error submitting some jobs')