def run_app(arg_input):
  """Main function to run the Caliban app.

  Accepts a Namespace-type output of an argparse argument parser.
  """
  args = vars(arg_input)
  script_args = c.extract_script_args(args)

  command = args["command"]

  if command == "cluster":
    return gke.cli.run_cli_command(args)

  job_mode = cli.resolve_job_mode(args)

  docker_args = cli.generate_docker_args(job_mode, args)
  docker_run_args = args.get("docker_run_args", [])

  if command == "shell":
    mount_home = not args['bare']
    image_id = args.get("image_id")
    dlvm = args.get("dlvm")
    shell = args['shell']
    docker.run_interactive(job_mode,
                           dlvm=dlvm,
                           image_id=image_id,
                           run_args=docker_run_args,
                           mount_home=mount_home,
                           shell=shell,
                           **docker_args)

  elif command == "notebook":
    port = args.get("port")
    lab = args.get("lab")
    dlvm = args.get("dlvm")
    version = args.get("jupyter_version")
    mount_home = not args['bare']
    docker.run_notebook(job_mode,
                        dlvm=dlvm,
                        port=port,
                        lab=lab,
                        version=version,
                        run_args=docker_run_args,
                        mount_home=mount_home,
                        **docker_args)

  elif command == "build":
    package = args["module"]
    docker.build_image(job_mode, package=package, **docker_args)

  elif command == 'status':
    caliban.history.cli.get_status(args)

  elif command == 'stop':
    caliban.history.cli.stop(args)

  elif command == 'resubmit':
    caliban.history.cli.resubmit(args)

  elif command == "run":
    dry_run = args["dry_run"]
    package = args["module"]
    image_id = args.get("image_id")
    dlvm = args.get("dlvm")
    exp_config = args.get("experiment_config")
    xgroup = args.get('xgroup')

    docker.run_experiments(job_mode,
                           run_args=docker_run_args,
                           script_args=script_args,
                           image_id=image_id,
                           dlvm=dlvm,
                           experiment_config=exp_config,
                           dry_run=dry_run,
                           package=package,
                           xgroup=xgroup,
                           **docker_args)

  elif command == "cloud":
    project_id = c.extract_project_id(args)
    region = c.extract_region(args)
    cloud_key = c.extract_cloud_key(args)

    dry_run = args["dry_run"]
    package = args["module"]
    job_name = args.get("name")
    gpu_spec = args.get("gpu_spec")
    tpu_spec = args.get("tpu_spec")
    image_tag = args.get("image_tag")
    machine_type = args.get("machine_type")
    dlvm = args.get("dlvm")
    exp_config = args.get("experiment_config")
    labels = u.sanitize_labels(args.get("label") or [])
    xgroup = args.get('xgroup')

    # Arguments to internally build the image required to submit to Cloud.
    docker_m = {"job_mode": job_mode, "package": package, **docker_args}

    cloud.submit_ml_job(
        job_mode=job_mode,
        docker_args=docker_m,
        region=region,
        project_id=project_id,
        credentials_path=cloud_key,
        dry_run=dry_run,
        job_name=job_name,
        dlvm=dlvm,
        machine_type=machine_type,
        gpu_spec=gpu_spec,
        tpu_spec=tpu_spec,
        image_tag=image_tag,
        labels=labels,
        script_args=script_args,
        experiment_config=exp_config,
        xgroup=xgroup,
    )
  else:
    logging.info("Unknown command: {}".format(command))
    sys.exit(1)
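
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of Caliban itself: run_app expects the
# Namespace produced by an argparse parser, and `vars()` converts it into the
# plain dict that the command dispatch above reads. The tiny parser below is
# a hypothetical stand-in for Caliban's real CLI parser.
def _demo_namespace_to_dict():
  import argparse

  parser = argparse.ArgumentParser(prog="caliban")
  subparsers = parser.add_subparsers(dest="command")
  shell = subparsers.add_parser("shell")
  shell.add_argument("--bare", action="store_true")

  ns = parser.parse_args(["shell", "--bare"])
  # run_app(ns) would then see args == {"command": "shell", "bare": True}.
  assert vars(ns) == {"command": "shell", "bare": True}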
def _job_submit(args: dict, cluster: Cluster) -> None:
  """submits job(s) to cluster

  Args:
      args: argument dictionary
      cluster: cluster instance
  """
  script_args = conf.extract_script_args(args)
  job_mode = cli.resolve_job_mode(args)
  docker_args = cli.generate_docker_args(job_mode, args)
  docker_run_args = args.get('docker_run_args', []) or []
  dry_run = args['dry_run']
  package = args['module']
  job_name = _generate_job_name(args.get('name'))
  gpu_spec = args.get('gpu_spec')
  preemptible = not args['nonpreemptible']
  min_cpu = args.get('min_cpu')
  min_mem = args.get('min_mem')
  experiment_config = args.get('experiment_config') or [{}]
  xgroup = args.get('xgroup')
  image_tag = args.get('image_tag')
  export = args.get('export', None)

  labels = args.get('label')
  if labels is not None:
    labels = dict(cu.sanitize_labels(args.get('label')))

  # Arguments to internally build the image required to submit to Cloud.
  docker_m = {'job_mode': job_mode, 'package': package, **docker_args}

  # --------------------------------------------------------------------------
  # validate gpu spec
  if job_mode == conf.JobMode.GPU and gpu_spec is None:
    gpu_spec = k.DEFAULT_GPU_SPEC

  if not cluster.validate_gpu_spec(gpu_spec):
    return

  # --------------------------------------------------------------------------
  # validate tpu spec and driver
  tpu_spec = args.get('tpu_spec')
  preemptible_tpu = not args.get('nonpreemptible_tpu')
  tpu_driver = args.get('tpu_driver')

  if tpu_spec is not None:
    available_tpu = cluster.get_tpu_types()
    if available_tpu is None:
      logging.error('error getting valid tpu types for cluster')
      return

    if tpu_spec not in available_tpu:
      logging.error('invalid tpu spec, cluster supports:')
      for t in available_tpu:
        logging.info('{}x{}'.format(t.count, t.tpu.name))
      return

    if not cluster.validate_tpu_driver(tpu_driver):
      logging.error('error: unsupported tpu driver {}'.format(tpu_driver))
      logging.info('supported tpu drivers for this cluster:')
      for d in cluster.get_tpu_drivers():
        logging.info(' {}'.format(d))
      return

  if tpu_spec is None and gpu_spec is None:  # cpu-only job
    min_cpu = min_cpu or k.DEFAULT_MIN_CPU_CPU
    min_mem = min_mem or k.DEFAULT_MIN_MEM_CPU
  else:  # gpu/tpu-accelerated job
    min_cpu = min_cpu or k.DEFAULT_MIN_CPU_ACCEL
    min_mem = min_mem or k.DEFAULT_MIN_MEM_ACCEL

  # convert accelerator spec
  accel_spec = Cluster.convert_accel_spec(gpu_spec, tpu_spec)
  if accel_spec is None:
    return

  accel, accel_count = accel_spec

  # --------------------------------------------------------------------------
  engine = get_mem_engine() if dry_run else get_sql_engine()

  with session_scope(engine) as session:
    container_spec = generate_container_spec(session, docker_m, image_tag)

    if image_tag is None:
      image_tag = generate_image_tag(cluster.project_id, docker_m, dry_run)

    experiments = create_experiments(
        session=session,
        container_spec=container_spec,
        script_args=script_args,
        experiment_config=experiment_config,
        xgroup=xgroup,
    )

    specs = list(
        cluster.create_simple_experiment_job_specs(
            name=util.sanitize_job_name(job_name),
            image=image_tag,
            min_cpu=min_cpu,
            min_mem=min_mem,
            experiments=experiments,
            args=script_args,
            accelerator=accel,
            accelerator_count=accel_count,
            preemptible=preemptible,
            preemptible_tpu=preemptible_tpu,
            tpu_driver=tpu_driver))

    # just a dry run
    if dry_run:
      logging.info('jobs that would be submitted:')
      for s in specs:
        logging.info(f'\n{json.dumps(s.spec, indent=2)}')
      return

    # export jobs to file
    if export is not None:
      if not _export_jobs(
          export,
          cluster.create_v1jobs(specs, job_name, labels),
      ):
        print('error exporting jobs to {}'.format(export))
      return

    for s in specs:
      try:
        cluster.submit_job(job_spec=s, name=job_name, labels=labels)
      except Exception as e:
        logging.error(f'exception: {e}')
        session.commit()  # commit here, otherwise will be rolled back
        return

  # --------------------------------------------------------------------------
  logging.info(f'jobs submitted, visit {cluster.dashboard_url()} to monitor')

  return
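
# ---------------------------------------------------------------------------
# For context, a sketch of the conventional SQLAlchemy `session_scope`
# pattern that _job_submit relies on above; Caliban's actual helper may
# differ in detail. Under this pattern the scope commits on a clean exit and
# rolls back when the block raises, which is why the except branch above
# commits explicitly before returning: it makes the work recorded so far
# durable regardless of how the scope unwinds.
from contextlib import contextmanager

from sqlalchemy.orm import sessionmaker


@contextmanager
def _session_scope_sketch(engine):
  """Provide a transactional scope around a series of operations."""
  session = sessionmaker(bind=engine)()
  try:
    yield session
    session.commit()  # commit when the `with` block exits cleanly
  except Exception:
    session.rollback()  # undo partial work if the block raised
    raise
  finally:
    session.close()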