Example #1
0
def _cluster_create(args: dict, project_id: str, creds: Credentials) -> None:
    """Creates a GKE cluster.

    Builds a cluster-creation request from the commandline args. On a dry run
    the request body is logged and nothing is created. Otherwise
    _check_for_existing_cluster decides whether creation should proceed, and
    if so the cluster is created.

    Args:
      args: commandline args; reads 'dry_run', 'cluster_name', 'zone',
        'release_channel' and 'single_zone'
      project_id: project in which to create cluster
      creds: credentials to use
    """
    dry_run = args['dry_run']
    # fall back to the default name when no (or an empty) name is supplied
    cluster_name = args['cluster_name'] or k.DEFAULT_CLUSTER_NAME
    zone = args['zone']
    dashboard_url = util.dashboard_cluster_url(cluster_name, zone, project_id)
    release_channel = args['release_channel']
    single_zone = args['single_zone']

    # --------------------------------------------------------------------------
    cluster_client = googleapiclient.discovery.build('container',
                                                     k.CLUSTER_API_VERSION,
                                                     credentials=creds,
                                                     cache_discovery=False)

    if cluster_client is None:
        logging.error('error building cluster client')
        return

    request = Cluster.create_request(cluster_client, creds, cluster_name,
                                     project_id, zone, release_channel,
                                     single_zone)

    if request is None:
        logging.error('error creating cluster creation request')
        return

    # a dry run only shows the request that would have been sent
    if dry_run:
        logging.info('request:\n%s', pp.pformat(json.loads(request.body)))
        return

    # --------------------------------------------------------------------------
    # see if cluster(s) already exist, and if so, check with the user before
    # creating another
    if not _check_for_existing_cluster(cluster_name, project_id, creds):
        return

    logging.info('creating cluster %s in project %s in %s...', cluster_name,
                 project_id, zone)
    logging.info('please be patient, this may take several minutes')
    logging.info('visit %s to monitor cluster creation progress',
                 dashboard_url)

    # --------------------------------------------------------------------------
    # create the cluster; the returned value is not needed here
    Cluster.create(cluster_client, creds, request, project_id)
Example #2
0
def submit_job_specs(
    args: Dict[str, Any],
    cluster: Cluster,
) -> None:
  """Submits every job spec found in args to the cluster.

  Each job is named after the first container in its spec's pod template.
  A missing, None, or empty 'specs' entry submits nothing.

  Args:
    args: dictionary of args; job specs are read from the 'specs' key
    cluster: cluster instance used to submit each job
  """
  # 'specs' may be absent or None; iterate an empty list rather than crash
  job_specs = args.get('specs') or []

  for s in job_specs:
    name = s.spec['template']['spec']['containers'][0]['name']
    cluster.submit_job(job_spec=s, name=name)
Example #3
0
def _check_for_existing_cluster(cluster_name: str, project_id: str,
                                creds: Credentials) -> bool:
  """Checks for existing clusters and confirms new cluster creation with user.

  Creation proceeds without prompting when the project has no clusters. It is
  refused when the cluster list cannot be obtained or when a cluster with the
  same name already exists. Otherwise the existing clusters are logged and
  the user is asked to confirm.

  Args:
    cluster_name: name of cluster to create
    project_id: project id
    creds: credentials

  Returns:
    True if cluster creation should proceed, False otherwise
  """
  clusters = Cluster.list(project_id=project_id, creds=creds)

  # _cluster_ls treats a None result from Cluster.list as a failure, so guard
  # against it here too instead of crashing on len(None)
  if clusters is None:
    logging.error('error listing clusters for project {}'.format(project_id))
    return False

  if len(clusters) == 0:
    return True

  if cluster_name in clusters:
    logging.error('cluster {} already exists'.format(cluster_name))
    return False

  logging.info('{} clusters already exist for this project:'.format(
      len(clusters)))
  for c in clusters:
    logging.info(c)

  return util.user_verify('Do you really want to create a new cluster?',
                          default=False)
Example #4
0
def _node_pool_ls(args: dict, cluster: Cluster) -> None:
  """Logs a table of the cluster's node pools.

  A header row is logged, then one row per node pool with its name, machine
  type, accelerators (comma-separated 'type(count)' entries) and autoscaling
  max node count. Nothing is logged when the node pool query returns None.

  Args:
    args: commandline args (not read here)
    cluster: lists node pools in this cluster instance
  """
  pools = cluster.node_pools()
  if pools is None:
    return

  if not len(pools):
    logging.info('no node pools found')
    return

  row_format = '%-20s%-20s%-40s%-20s'
  logging.info(row_format, 'NAME', 'MACHINE TYPE', 'ACCELERATORS', 'MAX NODES')

  for pool in pools:
    config = pool.config
    accelerators = ','.join(
        '%s(%d)' % (a.accelerator_type, a.accelerator_count)
        for a in config.accelerators)
    row = (pool.name, config.machine_type, accelerators,
           pool.autoscaling.max_node_count)
    logging.info(row_format % row)
Example #5
0
def _cluster_ls(args: dict, project_id: str, creds: Credentials) -> None:
  """Lists clusters in a project, or confirms that a named cluster exists.

  When args carries a 'cluster_name', only that name is looked up: it is
  logged if present in the project's cluster list, and an error is logged if
  it is not. Without a name, the cluster count and every cluster are logged.
  Nothing is logged when the cluster list query returns None.

  Args:
    args: commandline args; optionally reads 'cluster_name'
    project_id: list clusters in the project
    creds: credentials to use
  """
  clusters = Cluster.list(project_id=project_id, creds=creds)

  if clusters is None:
    return

  cluster_name = args.get('cluster_name')

  if cluster_name is not None:
    if cluster_name not in clusters:
      logging.error('cluster {} not found'.format(cluster_name))
      return
    # the cluster was found: this is the requested listing, not an error
    logging.info(cluster_name)
    return

  logging.info('{} clusters found'.format(len(clusters)))
  for c in clusters:
    logging.info(c)
Example #6
0
def _cluster_delete(args: dict, cluster: Cluster) -> None:
  """Deletes the given cluster once the user has confirmed.

  The confirmation prompt defaults to "no"; without confirmation the cluster
  is left untouched.

  Args:
    args: commandline args (not read here)
    cluster: cluster to delete

  Returns:
    None
  """
  prompt = 'Are you sure you want to delete {}?'.format(cluster.name)
  confirmed = util.user_verify(prompt, default=False)

  if not confirmed:
    return

  cluster.delete()
Example #7
0
def get_job_cluster(j: Job) -> Optional[Cluster]:
    '''Looks up the cluster for a GKE job.

    The cluster name, project id and zone are read from the job's details,
    and the lookup uses the default credentials.

    Args:
      j: job whose cluster is wanted

    Returns:
      result of Cluster.get for the job's cluster, or None when the job's
      platform is not GKE
    '''
    if j.spec.platform != Platform.GKE:
        return None

    name = j.details['cluster_name']
    project_id = j.details['project_id']
    zone = j.details['cluster_zone']
    creds = default_credentials().credentials

    return Cluster.get(name=name,
                       project_id=project_id,
                       zone=zone,
                       creds=creds)
Example #8
0
  def wrapper(args: dict,
              project_id: str,
              creds: Credentials,
              zone: str = k.ZONE_DEFAULT):
    """Resolves the cluster named in args and hands it to the wrapped fn.

    Args:
      args: commandline args; 'cluster_name' selects the cluster
      project_id: project id passed to the cluster lookup
      creds: credentials passed to the cluster lookup
      zone: zone passed to the cluster lookup

    Returns:
      result of fn(args, cluster=cluster), or None when the lookup yields a
      falsy cluster. NOTE(review): fn comes from the enclosing scope, which
      is not visible here.
    """
    name = args.get('cluster_name')
    cluster = Cluster.get(name=name,
                          project_id=project_id,
                          zone=zone,
                          creds=creds)

    if not cluster:
      return None

    return fn(args, cluster=cluster)
Example #9
0
def _job_submit_file(args: dict, cluster: Cluster) -> None:
  """Submits a gke job described by a k8s yaml/json file.

  The file named by args['job_file'] is parsed; on a dry run the parsed job
  is only logged. Otherwise it is submitted to the cluster and the job's
  dashboard url is logged. Parse and submit failures are logged as errors.

  Args:
    args: commandline args; reads 'job_file' and 'dry_run'
    cluster: cluster to submit the job to
  """
  path = args['job_file']
  parsed = util.parse_job_file(path)

  if parsed is None:
    logging.error('error parsing job file {}'.format(path))
    return

  if args['dry_run']:
    pretty = pp.pformat(parsed)
    logging.info('job to submit:\n{}'.format(pretty))
    return

  submitted = cluster.submit_v1job(job=parsed)

  if submitted is None:
    pretty = pp.pformat(parsed)
    logging.error('error submitting job:\n{}'.format(pretty))
    return

  url = cluster.job_dashboard_url(submitted)
  logging.info('submitted job: {}'.format(url))
Example #10
0
def _pod_ls(args: dict, cluster: Cluster):
  """Logs the pods of the given cluster.

  Logs the number of pods, then each pod's metadata name. Nothing is logged
  when the pod query returns None.

  Args:
    args: commandline args (not read here)
    cluster: list pods in this cluster
  """
  found = cluster.pods()

  if found is not None:
    count = len(found)
    logging.info('{} pods found'.format(count))

    for pod in found:
      logging.info(pod.metadata.name)
Example #11
0
def _job_ls(args: dict, cluster: Cluster):
  """Logs the jobs of the given cluster.

  Logs the number of jobs, then each job's metadata name. Nothing is logged
  when the job query returns None.

  Args:
    args: commandline args (not read here)
    cluster: lists jobs from this cluster
  """
  found = cluster.jobs()

  if found is not None:
    count = len(found)
    logging.info('{} jobs found'.format(count))

    for job in found:
      logging.info(job.metadata.name)
Example #12
0
def _job_submit(args: dict, cluster: Cluster) -> None:
  """Submits job(s) to cluster.

  Validates the requested gpu/tpu resources against the cluster, builds one
  job spec per experiment, and then does one of three things: logs the specs
  (dry run), exports the jobs to a file ('export' given), or submits the
  jobs to the cluster.

  Args:
    args: argument dictionary; requires 'dry_run', 'module' and
      'nonpreemptible', and optionally reads 'name', 'gpu_spec', 'tpu_spec',
      'tpu_driver', 'nonpreemptible_tpu', 'min_cpu', 'min_mem',
      'experiment_config', 'xgroup', 'image_tag', 'export' and 'label'
    cluster: cluster instance
  """

  script_args = conf.extract_script_args(args)
  job_mode = cli.resolve_job_mode(args)
  docker_args = cli.generate_docker_args(job_mode, args)
  dry_run = args['dry_run']
  package = args['module']
  job_name = _generate_job_name(args.get('name'))
  gpu_spec = args.get('gpu_spec')
  preemptible = not args['nonpreemptible']
  min_cpu = args.get('min_cpu')
  min_mem = args.get('min_mem')
  # default to a single, empty experiment configuration
  experiment_config = args.get('experiment_config') or [{}]
  xgroup = args.get('xgroup')
  image_tag = args.get('image_tag')
  export = args.get('export')

  labels = args.get('label')
  if labels is not None:
    labels = dict(cu.sanitize_labels(labels))

  # Arguments to internally build the image required to submit to Cloud.
  docker_m = {'job_mode': job_mode, 'package': package, **docker_args}

  # --------------------------------------------------------------------------
  # validate gpu spec
  if job_mode == conf.JobMode.GPU and gpu_spec is None:
    gpu_spec = k.DEFAULT_GPU_SPEC

  if not cluster.validate_gpu_spec(gpu_spec):
    return

  # --------------------------------------------------------------------------
  # validate tpu spec and driver
  tpu_spec = args.get('tpu_spec')
  preemptible_tpu = not args.get('nonpreemptible_tpu')
  tpu_driver = args.get('tpu_driver')

  if tpu_spec is not None:
    available_tpu = cluster.get_tpu_types()
    if available_tpu is None:
      logging.error('error getting valid tpu types for cluster')
      return

    if tpu_spec not in available_tpu:
      logging.error('invalid tpu spec, cluster supports:')
      for t in available_tpu:
        logging.info('{}x{}'.format(t.count, t.tpu.name))
      return

    if not cluster.validate_tpu_driver(tpu_driver):
      logging.error('error: unsupported tpu driver {}'.format(tpu_driver))
      logging.info('supported tpu drivers for this cluster:')
      for d in cluster.get_tpu_drivers():
        logging.info('  {}'.format(d))
      return

  # user-supplied minimums win; otherwise the defaults depend on whether the
  # job uses an accelerator
  if tpu_spec is None and gpu_spec is None:  # cpu-only job
    min_cpu = min_cpu or k.DEFAULT_MIN_CPU_CPU
    min_mem = min_mem or k.DEFAULT_MIN_MEM_CPU
  else:  # gpu/tpu-accelerated job
    min_cpu = min_cpu or k.DEFAULT_MIN_CPU_ACCEL
    min_mem = min_mem or k.DEFAULT_MIN_MEM_ACCEL

  # convert accelerator spec
  accel_spec = Cluster.convert_accel_spec(gpu_spec, tpu_spec)
  if accel_spec is None:
    return

  accel, accel_count = accel_spec

  # --------------------------------------------------------------------------
  # dry runs use the in-memory engine instead of the sql engine
  engine = get_mem_engine() if dry_run else get_sql_engine()

  with session_scope(engine) as session:
    container_spec = generate_container_spec(session, docker_m, image_tag)

    if image_tag is None:
      image_tag = generate_image_tag(cluster.project_id, docker_m, dry_run)

    experiments = create_experiments(
        session=session,
        container_spec=container_spec,
        script_args=script_args,
        experiment_config=experiment_config,
        xgroup=xgroup,
    )

    specs = list(
        cluster.create_simple_experiment_job_specs(
            name=util.sanitize_job_name(job_name),
            image=image_tag,
            min_cpu=min_cpu,
            min_mem=min_mem,
            experiments=experiments,
            args=script_args,
            accelerator=accel,
            accelerator_count=accel_count,
            preemptible=preemptible,
            preemptible_tpu=preemptible_tpu,
            tpu_driver=tpu_driver))

    # just a dry run
    if dry_run:
      logging.info('jobs that would be submitted:')
      for s in specs:
        logging.info(f'\n{json.dumps(s.spec, indent=2)}')
      return

    # export jobs to file
    if export is not None:
      if not _export_jobs(
          export,
          cluster.create_v1jobs(specs, job_name, labels),
      ):
        logging.error('error exporting jobs to {}'.format(export))
      return

    for s in specs:
      try:
        cluster.submit_job(job_spec=s, name=job_name, labels=labels)
      except Exception as e:
        logging.error(f'exception: {e}')
        session.commit()  # commit here, otherwise will be rolled back
        return

  # --------------------------------------------------------------------------
  logging.info(f'jobs submitted, visit {cluster.dashboard_url()} to monitor')