Code example #1
def test_sanitize_labels_second_noop(pairs):
    """Test that passing the output of sanitize_labels back into the function
  returns its input. Sanitizing a set of sanitized kv pairs should have no
  effect.

  """
    once = u.sanitize_labels(pairs)
    twice = u.sanitize_labels(once)
    assert once == twice
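
For readers who want a concrete, non-property-based version of the same check, the snippet below calls sanitize_labels directly on a small list of pairs and verifies that a second pass changes nothing. It assumes caliban is installed and that sanitize_labels is importable from caliban.util (the most likely target of the `u` alias above); the exact sanitized values depend on caliban's rules.

# Concrete illustration of the idempotence property tested above.
# Assumption: sanitize_labels is importable from caliban.util (the `u` alias).
from caliban.util import sanitize_labels

pairs = [("Learning_Rate", "0.01"), ("BATCH size", "64")]
once = sanitize_labels(pairs)
twice = sanitize_labels(once)

# Sanitizing already-sanitized labels should be a no-op.
assert once == twice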
Code example #2
def _job_spec(
    job_name: str,
    idx: int,
    training_input: Dict[str, Any],
    labels: Dict[str, str],
    experiment: ht.Experiment,
) -> ht.JobSpec:
  """Returns the final object required by the Google AI Platform training job
  submission endpoint.

  """
  job_id = f'{job_name}_{idx}'
  job_args = training_input.get("args")
  return ht.JobSpec.get_or_create(
      experiment=experiment,
      spec={
          "jobId": job_id,
          "trainingInput": training_input,
          "labels": {
              **cu.sanitize_labels(labels),
              **cu.script_args_to_labels(job_args)
          }
      },
      platform=ht.Platform.CAIP,
  )
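
The "labels" entry above merges the user-supplied labels with labels derived from the script arguments via a standard dict merge. Because of Python's merge semantics, the right-hand mapping (the script-arg labels) wins when both contain the same key. A minimal, self-contained sketch of that behavior, with illustrative values:

# Plain-Python sketch of the dict merge used for the "labels" field above:
# on a key collision, the right-hand (script-args) mapping takes precedence.
user_labels = {"team": "research", "stage": "dev"}
arg_labels = {"stage": "prod", "lr": "0-01"}  # e.g. derived from script args

merged = {**user_labels, **arg_labels}
assert merged == {"team": "research", "stage": "prod", "lr": "0-01"}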
Code example #3
def test_sanitize_labels(pairs):
    """Test that any input we could possibly be provided, as long as it parses into
  kv pairs, will only make it into a dict of labels if it's properly
  sanitized.

  Checks that the functions works for dicts OR for lists of pairs.

  """
    for k, v in u.sanitize_labels(pairs).items():
        assert_valid_key_label(k)
        assert_valid_label(v)
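
The assert_valid_key_label and assert_valid_label helpers are defined in the test suite and are not shown here. As a hedged sketch of what they check: GCP / AI Platform resource labels are typically limited to lowercase letters, digits, underscores and dashes, at most 63 characters, with keys starting with a letter. Something like the following captures that shape; the authoritative rules live in the caliban test helpers and the GCP documentation, not in this snippet.

# Hedged sketch of the label constraints the assertions above enforce.
import re

LABEL_KEY_RE = re.compile(r"^[a-z][a-z0-9_-]{0,62}$")
LABEL_VALUE_RE = re.compile(r"^[a-z0-9_-]{0,63}$")

def is_valid_key(k: str) -> bool:
    return bool(LABEL_KEY_RE.match(k))

def is_valid_value(v: str) -> bool:
    return bool(LABEL_VALUE_RE.match(v))

assert is_valid_key("learning-rate")
assert not is_valid_key("Learning Rate")  # uppercase and spaces are rejected
assert is_valid_value("0-01")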
Code example #4
File: cli.py Project: tgtn007/caliban
def _job_submit(args: dict, cluster: Cluster) -> None:
  """submits job(s) to cluster

  Args:
  args: argument dictionary
  cluster: cluster instance
  """

  script_args = conf.extract_script_args(args)
  job_mode = cli.resolve_job_mode(args)
  docker_args = cli.generate_docker_args(job_mode, args)
  docker_run_args = args.get('docker_run_args', []) or []
  dry_run = args['dry_run']
  package = args['module']
  job_name = _generate_job_name(args.get('name'))
  gpu_spec = args.get('gpu_spec')
  preemptible = not args['nonpreemptible']
  min_cpu = args.get('min_cpu')
  min_mem = args.get('min_mem')
  experiment_config = args.get('experiment_config') or [{}]
  xgroup = args.get('xgroup')
  image_tag = args.get('image_tag')
  export = args.get('export', None)

  labels = args.get('label')
  if labels is not None:
    labels = dict(cu.sanitize_labels(args.get('label')))

  # Arguments to internally build the image required to submit to Cloud.
  docker_m = {'job_mode': job_mode, 'package': package, **docker_args}

  # --------------------------------------------------------------------------
  # validate gpu spec
  if job_mode == conf.JobMode.GPU and gpu_spec is None:
    gpu_spec = k.DEFAULT_GPU_SPEC

  if not cluster.validate_gpu_spec(gpu_spec):
    return

  # --------------------------------------------------------------------------
  # validate tpu spec and driver
  tpu_spec = args.get('tpu_spec')
  preemptible_tpu = not args.get('nonpreemptible_tpu')
  tpu_driver = args.get('tpu_driver')

  if tpu_spec is not None:
    available_tpu = cluster.get_tpu_types()
    if available_tpu is None:
      logging.error('error getting valid tpu types for cluster')
      return

    if tpu_spec not in available_tpu:
      logging.error('invalid tpu spec, cluster supports:')
      for t in available_tpu:
        logging.info('{}x{}'.format(t.count, t.tpu.name))
      return

    if not cluster.validate_tpu_driver(tpu_driver):
      logging.error('error: unsupported tpu driver {}'.format(tpu_driver))
      logging.info('supported tpu drivers for this cluster:')
      for d in cluster.get_tpu_drivers():
        logging.info('  {}'.format(d))
      return

  if tpu_spec is None and gpu_spec is None:  # cpu-only job
    min_cpu = min_cpu or k.DEFAULT_MIN_CPU_CPU
    min_mem = min_mem or k.DEFAULT_MIN_MEM_CPU
  else:  # gpu/tpu-accelerated job
    min_cpu = min_cpu or k.DEFAULT_MIN_CPU_ACCEL
    min_mem = min_mem or k.DEFAULT_MIN_MEM_ACCEL

  # convert accelerator spec
  accel_spec = Cluster.convert_accel_spec(gpu_spec, tpu_spec)
  if accel_spec is None:
    return

  accel, accel_count = accel_spec

  # --------------------------------------------------------------------------
  engine = get_mem_engine() if dry_run else get_sql_engine()

  with session_scope(engine) as session:
    container_spec = generate_container_spec(session, docker_m, image_tag)

    if image_tag is None:
      image_tag = generate_image_tag(cluster.project_id, docker_m, dry_run)

    experiments = create_experiments(
        session=session,
        container_spec=container_spec,
        script_args=script_args,
        experiment_config=experiment_config,
        xgroup=xgroup,
    )

    specs = list(
        cluster.create_simple_experiment_job_specs(
            name=util.sanitize_job_name(job_name),
            image=image_tag,
            min_cpu=min_cpu,
            min_mem=min_mem,
            experiments=experiments,
            args=script_args,
            accelerator=accel,
            accelerator_count=accel_count,
            preemptible=preemptible,
            preemptible_tpu=preemptible_tpu,
            tpu_driver=tpu_driver))

    # just a dry run
    if dry_run:
      logging.info('jobs that would be submitted:')
      for s in specs:
        logging.info(f'\n{json.dumps(s.spec, indent=2)}')
      return

    # export jobs to file
    if export is not None:
      if not _export_jobs(
          export,
          cluster.create_v1jobs(specs, job_name, labels),
      ):
        print('error exporting jobs to {}'.format(export))
      return

    for s in specs:
      try:
        cluster.submit_job(job_spec=s, name=job_name, labels=labels)
      except Exception as e:
        logging.error(f'exception: {e}')
        session.commit()  # commit here, otherwise will be rolled back
        return

  # --------------------------------------------------------------------------
  logging.info(f'jobs submitted, visit {cluster.dashboard_url()} to monitor')

  return
Code example #5
def test_sanitize_labels_kill_empty():
    """Keys that are sanitized to the empty string should NOT make it through."""
    assert {} == u.sanitize_labels([["--!!", "face"]])
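
The contract exercised here is that a key which sanitizes down to the empty string is dropped entirely, rather than emitted with an empty key. The following is a hypothetical re-implementation of just that dropping rule, for illustration only; it is not caliban's actual sanitizer.

# Hypothetical sketch of the "drop empty keys" rule exercised by the test above.
import re

def sanitize_key(k: str) -> str:
    # keep lowercase letters, digits and dashes; strip everything else
    return re.sub(r"[^a-z0-9-]", "", k.lower().replace("_", "-")).strip("-")

def sanitize_pairs(pairs):
    out = {}
    for k, v in pairs:
        key = sanitize_key(k)
        if key:  # keys that sanitize to "" never make it into the result
            out[key] = v
    return out

assert sanitize_pairs([["--!!", "face"]]) == {}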
Code example #6
File: main.py Project: satishjasthi/caliban
def run_app(arg_input):
    """Main function to run the Caliban app. Accepts a Namespace-type output of an
  argparse argument parser.

  """
    args = vars(arg_input)
    script_args = c.extract_script_args(args)

    command = args["command"]

    if command == "cluster":
        return gke.cli.run_cli_command(args)

    job_mode = cli.resolve_job_mode(args)
    docker_args = cli.generate_docker_args(job_mode, args)
    docker_run_args = args.get("docker_run_args", [])

    if command == "shell":
        mount_home = not args['bare']
        image_id = args.get("image_id")
        shell = args['shell']
        ps.run_interactive(job_mode,
                           image_id=image_id,
                           run_args=docker_run_args,
                           mount_home=mount_home,
                           shell=shell,
                           **docker_args)

    elif command == "notebook":
        port = args.get("port")
        lab = args.get("lab")
        version = args.get("jupyter_version")
        mount_home = not args['bare']
        pn.run_notebook(job_mode,
                        port=port,
                        lab=lab,
                        version=version,
                        run_args=docker_run_args,
                        mount_home=mount_home,
                        **docker_args)

    elif command == "build":
        package = args["module"]
        b.build_image(job_mode, package=package, **docker_args)

    elif command == 'status':
        caliban.history.cli.get_status(args)

    elif command == 'stop':
        caliban.history.cli.stop(args)

    elif command == 'resubmit':
        caliban.history.cli.resubmit(args)

    elif command == "run":
        dry_run = args["dry_run"]
        package = args["module"]
        image_id = args.get("image_id")
        exp_config = args.get("experiment_config")
        xgroup = args.get('xgroup')

        pr.run_experiments(job_mode,
                           run_args=docker_run_args,
                           script_args=script_args,
                           image_id=image_id,
                           experiment_config=exp_config,
                           dry_run=dry_run,
                           package=package,
                           xgroup=xgroup,
                           **docker_args)

    elif command == "cloud":
        project_id = c.extract_project_id(args)
        region = c.extract_region(args)
        cloud_key = c.extract_cloud_key(args)

        dry_run = args["dry_run"]
        package = args["module"]
        job_name = args.get("name")
        gpu_spec = args.get("gpu_spec")
        tpu_spec = args.get("tpu_spec")
        image_tag = args.get("image_tag")
        machine_type = args.get("machine_type")
        exp_config = args.get("experiment_config")
        labels = cu.sanitize_labels(args.get("label") or [])
        xgroup = args.get('xgroup')

        # Arguments to internally build the image required to submit to Cloud.
        docker_m = {"job_mode": job_mode, "package": package, **docker_args}

        cloud.submit_ml_job(
            job_mode=job_mode,
            docker_args=docker_m,
            region=region,
            project_id=project_id,
            credentials_path=cloud_key,
            dry_run=dry_run,
            job_name=job_name,
            machine_type=machine_type,
            gpu_spec=gpu_spec,
            tpu_spec=tpu_spec,
            image_tag=image_tag,
            labels=labels,
            script_args=script_args,
            experiment_config=exp_config,
            xgroup=xgroup,
        )
    else:
        logging.info("Unknown command: {}".format(command))
        sys.exit(1)