def _cluster_create(args: dict, project_id: str, creds: Credentials) -> None:
    """Create a gke cluster, or only log the creation request on a dry run.

    Args:
        args: commandline args; reads 'dry_run', 'cluster_name', 'zone',
            'release_channel' and 'single_zone'
        project_id: project in which to create cluster
        creds: credentials to use
    """
    is_dry_run = args['dry_run']
    # an empty/None cluster name falls back to the module default
    name = args['cluster_name'] or k.DEFAULT_CLUSTER_NAME
    zone = args['zone']
    monitor_url = utils.dashboard_cluster_url(name, zone, project_id)
    channel = args['release_channel']
    one_zone = args['single_zone']

    # ----------------------------------------------------------------------
    # the api client and the request are built first so that a dry run can
    # log the exact request without creating anything
    client = googleapiclient.discovery.build(
        'container',
        k.CLUSTER_API_VERSION,
        credentials=creds,
        cache_discovery=False,
    )
    if client is None:
        logging.error('error building cluster client')
        return

    create_request = Cluster.create_request(
        client,
        creds,
        name,
        project_id,
        zone,
        channel,
        one_zone,
    )
    if create_request is None:
        logging.error('error creating cluster creation request')
        return

    if is_dry_run:
        request_body = json.loads(create_request.body)
        logging.info('request:\n{}'.format(pp.pformat(request_body)))
        return

    # ----------------------------------------------------------------------
    # clusters may already exist in this project; stop unless the check says
    # that creation should proceed
    should_create = _check_for_existing_cluster(name, project_id, creds)
    if not should_create:
        return

    progress_msg = 'creating cluster {} in project {} in {}...'.format(
        name, project_id, zone)
    logging.info(progress_msg)
    logging.info('please be patient, this may take several minutes')
    monitor_msg = 'visit {} to monitor cluster creation progress'.format(
        monitor_url)
    logging.info(monitor_msg)

    # ----------------------------------------------------------------------
    # create the cluster
    Cluster.create(client, creds, create_request, project_id)
def submit_job_specs(
    args: Dict[str, Any],
    cluster: Cluster,
) -> None:
    """Submit every job spec found under args['specs'] to the cluster.

    Args:
        args: dictionary of args; job specs are read from the 'specs' key
        cluster: cluster instance the jobs are submitted to
    """
    # a missing (or None) 'specs' entry means there is nothing to submit,
    # instead of a TypeError from iterating over None
    job_specs = args.get('specs') or []

    for job_spec in job_specs:
        # the job name is taken from the first container of the pod template
        containers = job_spec.spec['template']['spec']['containers']
        cluster.submit_job(job_spec=job_spec, name=containers[0]['name'])
def _check_for_existing_cluster(cluster_name: str, project_id: str,
                                creds: Credentials) -> bool:
    """Check for existing clusters and confirm new cluster creation with user.

    Args:
        cluster_name: name of cluster to create
        project_id: project id
        creds: credentials

    Returns:
        True if cluster creation should proceed, False otherwise
    """
    clusters = Cluster.list(project_id=project_id, creds=creds)

    # Other callers in this module guard against a None listing, so do the
    # same here instead of failing in len(None). Without a listing a name
    # clash cannot be ruled out, so creation does not proceed.
    if clusters is None:
        logging.error(
            'error listing clusters for project {}'.format(project_id))
        return False

    # nothing exists yet: no clash possible, no confirmation needed
    if len(clusters) == 0:
        return True

    if cluster_name in clusters:
        logging.error('cluster {} already exists'.format(cluster_name))
        return False

    # other clusters exist: show them and let the user decide
    logging.info('{} clusters already exist for this project:'.format(
        len(clusters)))
    for c in clusters:
        logging.info(c)

    return utils.user_verify('Do you really want to create a new cluster?',
                             default=False)
def _node_pool_ls(args: dict, cluster: Cluster) -> None:
    """Log the node pools of a cluster as a fixed-width table.

    Args:
        args: commandline args (not read by this function)
        cluster: cluster instance whose node pools are listed
    """
    pools = cluster.node_pools()
    if pools is None:
        return

    if len(pools) == 0:
        logging.info('no node pools found')
        return

    row_format = '%-20s%-20s%-40s%-20s'
    logging.info(row_format, 'NAME', 'MACHINE TYPE', 'ACCELERATORS',
                 'MAX NODES')

    for pool in pools:
        # one 'type(count)' entry per accelerator, comma-separated
        accel_entries = []
        for a in pool.config.accelerators:
            accel_entries.append(
                '%s(%d)' % (a.accelerator_type, a.accelerator_count))

        row = (
            pool.name,
            pool.config.machine_type,
            ','.join(accel_entries),
            pool.autoscaling.max_node_count,
        )
        logging.info(row_format % row)
def _cluster_ls(args: dict, project_id: str, creds: Credentials) -> None:
    """List clusters, or look up a single cluster by name.

    When args carries a 'cluster_name', only that cluster is looked up and
    reported; otherwise every cluster in the listing is logged.

    Args:
        args: commandline args; 'cluster_name' is optional
        project_id: list clusters in the project
        creds: credentials to use
    """
    clusters = Cluster.list(project_id=project_id, creds=creds)
    if clusters is None:
        return

    cluster_name = args.get('cluster_name')
    if cluster_name is not None:
        if cluster_name not in clusters:
            logging.error('cluster {} not found'.format(cluster_name))
            return

        # finding the cluster is the success path, so it is reported at
        # info level like the rest of the listing output, not as an error
        logging.info(cluster_name)
        return

    logging.info('{} clusters found'.format(len(clusters)))
    for c in clusters:
        logging.info(c)
def _cluster_delete(args: dict, cluster: Cluster) -> None:
    """Delete the given cluster after the user confirms.

    Args:
        args: commandline args (not read by this function)
        cluster: cluster to delete

    Returns:
        None
    """
    prompt = 'Are you sure you want to delete {}?'.format(cluster.name)

    # the confirmation defaults to "no"; without a yes nothing is deleted
    confirmed = utils.user_verify(prompt, default=False)
    if not confirmed:
        return

    cluster.delete()
def get_job_cluster(j: Job) -> Optional[Cluster]:
    """Look up the cluster referenced by a Job object's details.

    Args:
        j: job whose 'cluster_name', 'project_id' and 'cluster_zone' details
            identify the cluster

    Returns:
        the result of Cluster.get for those details (using the default
        credentials), or None when the job's platform is not GKE
    """
    if j.spec.platform != Platform.GKE:
        return None

    details = j.details
    cluster_name = details['cluster_name']
    project_id = details['project_id']
    cluster_zone = details['cluster_zone']

    return Cluster.get(
        name=cluster_name,
        project_id=project_id,
        zone=cluster_zone,
        creds=default_credentials().credentials,
    )
def wrapper(args: dict,
            project_id: str,
            creds: Credentials,
            zone: str = k.ZONE_DEFAULT):
    """Resolve the cluster named in args and hand it to fn.

    NOTE(review): fn is a free variable that is not defined in this block;
    presumably it is the function wrapped by an enclosing decorator - confirm.

    Args:
        args: commandline args; 'cluster_name' selects the cluster
        project_id: project id passed to the cluster lookup
        creds: credentials passed to the cluster lookup
        zone: zone passed to the cluster lookup

    Returns:
        the result of fn(args, cluster=cluster), or None when the lookup
        yields a falsy value
    """
    cluster = Cluster.get(
        name=args.get('cluster_name'),
        project_id=project_id,
        zone=zone,
        creds=creds,
    )

    # no cluster resolved: fn is not called at all
    if not cluster:
        return None

    return fn(args, cluster=cluster)
def _job_submit_file(args: dict, cluster: Cluster) -> None:
    """Submit a gke job from a k8s yaml/json file.

    Args:
        args: commandline args; reads 'job_file' and 'dry_run'
        cluster: cluster instance the job is submitted to
    """
    path = args['job_file']

    parsed_spec = utils.parse_job_file(path)
    if parsed_spec is None:
        logging.error('error parsing job file {}'.format(path))
        return

    # a dry run only shows the parsed job, nothing is submitted
    if args['dry_run']:
        pretty_spec = pp.pformat(parsed_spec)
        logging.info('job to submit:\n{}'.format(pretty_spec))
        return

    submitted = cluster.submit_v1job(job=parsed_spec)
    if submitted is None:
        pretty_spec = pp.pformat(parsed_spec)
        logging.error('error submitting job:\n{}'.format(pretty_spec))
        return

    job_url = cluster.job_dashboard_url(submitted)
    logging.info('submitted job: {}'.format(job_url))
def _pod_ls(args: dict, cluster: Cluster):
    """Log the number of pods in the given cluster, then each pod's name.

    Args:
        args: commandline args (not read by this function)
        cluster: list pods in this cluster
    """
    pod_list = cluster.pods()

    # a None result is skipped silently; nothing is logged here for it
    if pod_list is not None:
        pod_count = len(pod_list)
        logging.info('{} pods found'.format(pod_count))
        for pod in pod_list:
            logging.info(pod.metadata.name)
def _job_ls(args: dict, cluster: Cluster):
    """Log the number of jobs in the given cluster, then each job's name.

    Args:
        args: commandline args (not read by this function)
        cluster: lists jobs from this cluster
    """
    job_list = cluster.jobs()

    # a None result is skipped silently; nothing is logged here for it
    if job_list is not None:
        job_count = len(job_list)
        logging.info('{} jobs found'.format(job_count))
        for job in job_list:
            logging.info(job.metadata.name)
def _job_submit(args: dict, cluster: Cluster) -> None:
    """Build job specs from the commandline args and submit them to cluster.

    After validating the accelerator arguments against the cluster, the job
    specs are created and then either logged (dry run), exported to a file
    (when 'export' is set), or submitted to the cluster.

    Args:
        args: argument dictionary
        cluster: cluster instance
    """
    script_args = conf.extract_script_args(args)
    job_mode = cli.resolve_job_mode(args)
    docker_args = cli.generate_docker_args(job_mode, args)
    dry_run = args['dry_run']
    package = args['module']
    job_name = _generate_job_name(args.get('name'))
    gpu_spec = args.get('gpu_spec')
    preemptible = not args['nonpreemptible']
    min_cpu = args.get('min_cpu')
    min_mem = args.get('min_mem')
    # with no experiment config, fall back to a single empty config
    experiment_config = args.get('experiment_config') or [{}]
    xgroup = args.get('xgroup')
    image_tag = args.get('image_tag')
    export = args.get('export')

    labels = args.get('label')
    if labels is not None:
        labels = dict(u.sanitize_labels(labels))

    # Arguments to internally build the image required to submit to Cloud.
    docker_m = {'job_mode': job_mode, 'package': package, **docker_args}

    # ----------------------------------------------------------------------
    # validate gpu spec
    if job_mode == conf.JobMode.GPU and gpu_spec is None:
        gpu_spec = k.DEFAULT_GPU_SPEC

    if not cluster.validate_gpu_spec(gpu_spec):
        return

    # ----------------------------------------------------------------------
    # validate tpu spec and driver
    tpu_spec = args.get('tpu_spec')
    preemptible_tpu = not args.get('nonpreemptible_tpu')
    tpu_driver = args.get('tpu_driver')

    if tpu_spec is not None:
        available_tpu = cluster.get_tpu_types()
        if available_tpu is None:
            logging.error('error getting valid tpu types for cluster')
            return

        if tpu_spec not in available_tpu:
            logging.error('invalid tpu spec, cluster supports:')
            for t in available_tpu:
                logging.info('{}x{}'.format(t.count, t.tpu.name))
            return

        if not cluster.validate_tpu_driver(tpu_driver):
            logging.error(
                'error: unsupported tpu driver {}'.format(tpu_driver))
            logging.info('supported tpu drivers for this cluster:')
            for d in cluster.get_tpu_drivers():
                logging.info(' {}'.format(d))
            return

    # explicit min_cpu/min_mem values win; the defaults depend on whether
    # the job uses an accelerator
    if tpu_spec is None and gpu_spec is None:  # cpu-only job
        min_cpu = min_cpu or k.DEFAULT_MIN_CPU_CPU
        min_mem = min_mem or k.DEFAULT_MIN_MEM_CPU
    else:  # gpu/tpu-accelerated job
        min_cpu = min_cpu or k.DEFAULT_MIN_CPU_ACCEL
        min_mem = min_mem or k.DEFAULT_MIN_MEM_ACCEL

    # convert accelerator spec
    accel_spec = Cluster.convert_accel_spec(gpu_spec, tpu_spec)
    if accel_spec is None:
        return

    accel, accel_count = accel_spec

    # ----------------------------------------------------------------------
    # a dry run uses the in-memory engine instead of the sql engine
    engine = get_mem_engine() if dry_run else get_sql_engine()

    with session_scope(engine) as session:
        container_spec = generate_container_spec(session, docker_m, image_tag)

        if image_tag is None:
            image_tag = generate_image_tag(cluster.project_id, docker_m,
                                           dry_run)

        experiments = create_experiments(
            session=session,
            container_spec=container_spec,
            script_args=script_args,
            experiment_config=experiment_config,
            xgroup=xgroup,
        )

        specs = list(
            cluster.create_simple_experiment_job_specs(
                name=utils.sanitize_job_name(job_name),
                image=image_tag,
                min_cpu=min_cpu,
                min_mem=min_mem,
                experiments=experiments,
                args=script_args,
                accelerator=accel,
                accelerator_count=accel_count,
                preemptible=preemptible,
                preemptible_tpu=preemptible_tpu,
                tpu_driver=tpu_driver))

        # just a dry run
        if dry_run:
            logging.info('jobs that would be submitted:')
            for s in specs:
                logging.info(f'\n{json.dumps(s.spec, indent=2)}')
            return

        # export jobs to file; exporting replaces submission
        if export is not None:
            exported = _export_jobs(
                export,
                cluster.create_v1jobs(specs, job_name, labels),
            )
            if not exported:
                # reported through logging, like every other error here
                logging.error('error exporting jobs to {}'.format(export))
            return

        for s in specs:
            try:
                cluster.submit_job(job_spec=s, name=job_name, labels=labels)
            except Exception as e:
                # best effort: the first failing submission stops the loop
                logging.error(f'exception: {e}')
                session.commit()  # commit here, otherwise will be rolled back
                return

    # ----------------------------------------------------------------------
    logging.info(f'jobs submitted, visit {cluster.dashboard_url()} to monitor')