예제 #1
0
파일: cli.py 프로젝트: sagravat/caliban
def submit_job_specs(
    args: Dict[str, Any],
    cluster: Cluster,
) -> None:
    """Submits every job spec found in the arguments to the cluster.

    Each job is named after the first container listed in its spec template.

    Args:
      args: dictionary of args; the job specs are read from the 'specs' key.
        A missing or empty entry means there is nothing to submit.
      cluster: cluster instance that receives the jobs
    """
    # `args.get` yields None when 'specs' is absent; fall back to an empty
    # list so the loop is a no-op instead of raising TypeError.
    job_specs = args.get('specs') or []

    for job_spec in job_specs:
        name = job_spec.spec['template']['spec']['containers'][0]['name']
        cluster.submit_job(job_spec=job_spec, name=name)
예제 #2
0
파일: cli.py 프로젝트: sagravat/caliban
def _job_submit(args: dict, cluster: Cluster) -> None:
    """Builds job specs from the arguments and submits them to the cluster.

    Depending on the arguments the generated jobs are only logged (dry run),
    exported to a file, or submitted to the cluster. Any failed gpu/tpu
    validation returns early, before any experiment or job spec is created.

    Args:
      args: argument dictionary
      cluster: cluster instance
    """

    script_args = conf.extract_script_args(args)
    job_mode = cli.resolve_job_mode(args)
    docker_args = cli.generate_docker_args(job_mode, args)
    dry_run = args['dry_run']
    package = args['module']
    job_name = _generate_job_name(args.get('name'))
    gpu_spec = args.get('gpu_spec')
    preemptible = not args['nonpreemptible']
    min_cpu = args.get('min_cpu')
    min_mem = args.get('min_mem')
    # an absent or empty experiment config falls back to a single empty config
    experiment_config = args.get('experiment_config') or [{}]
    xgroup = args.get('xgroup')
    image_tag = args.get('image_tag')
    export = args.get('export')

    labels = args.get('label')
    if labels is not None:
        labels = dict(u.sanitize_labels(labels))

    # Arguments to internally build the image required to submit to Cloud.
    docker_m = {'job_mode': job_mode, 'package': package, **docker_args}

    # --------------------------------------------------------------------------
    # validate gpu spec
    if job_mode == conf.JobMode.GPU and gpu_spec is None:
        gpu_spec = k.DEFAULT_GPU_SPEC

    if not cluster.validate_gpu_spec(gpu_spec):
        return

    # --------------------------------------------------------------------------
    # validate tpu spec and driver
    tpu_spec = args.get('tpu_spec')
    preemptible_tpu = not args.get('nonpreemptible_tpu')
    tpu_driver = args.get('tpu_driver')

    if tpu_spec is not None:
        available_tpu = cluster.get_tpu_types()
        if available_tpu is None:
            logging.error('error getting valid tpu types for cluster')
            return

        if tpu_spec not in available_tpu:
            logging.error('invalid tpu spec, cluster supports:')
            for t in available_tpu:
                logging.info(f'{t.count}x{t.tpu.name}')
            return

        if not cluster.validate_tpu_driver(tpu_driver):
            logging.error(f'error: unsupported tpu driver {tpu_driver}')
            logging.info('supported tpu drivers for this cluster:')
            for d in cluster.get_tpu_drivers():
                logging.info(f'  {d}')
            return

    # explicit min_cpu/min_mem arguments win; otherwise the default depends
    # on whether an accelerator was requested
    if tpu_spec is None and gpu_spec is None:  # cpu-only job
        min_cpu = min_cpu or k.DEFAULT_MIN_CPU_CPU
        min_mem = min_mem or k.DEFAULT_MIN_MEM_CPU
    else:  # gpu/tpu-accelerated job
        min_cpu = min_cpu or k.DEFAULT_MIN_CPU_ACCEL
        min_mem = min_mem or k.DEFAULT_MIN_MEM_ACCEL

    # convert accelerator spec
    accel_spec = Cluster.convert_accel_spec(gpu_spec, tpu_spec)
    if accel_spec is None:
        return

    accel, accel_count = accel_spec

    # --------------------------------------------------------------------------
    # dry runs use get_mem_engine() (presumably in-memory -- confirm) rather
    # than the sql engine
    engine = get_mem_engine() if dry_run else get_sql_engine()

    with session_scope(engine) as session:
        container_spec = generate_container_spec(session, docker_m, image_tag)

        # only generate an image tag when the caller did not supply one
        if image_tag is None:
            image_tag = generate_image_tag(cluster.project_id, docker_m,
                                           dry_run)

        experiments = create_experiments(
            session=session,
            container_spec=container_spec,
            script_args=script_args,
            experiment_config=experiment_config,
            xgroup=xgroup,
        )

        specs = list(
            cluster.create_simple_experiment_job_specs(
                name=utils.sanitize_job_name(job_name),
                image=image_tag,
                min_cpu=min_cpu,
                min_mem=min_mem,
                experiments=experiments,
                args=script_args,
                accelerator=accel,
                accelerator_count=accel_count,
                preemptible=preemptible,
                preemptible_tpu=preemptible_tpu,
                tpu_driver=tpu_driver))

        # just a dry run
        if dry_run:
            logging.info('jobs that would be submitted:')
            for s in specs:
                logging.info(f'\n{json.dumps(s.spec, indent=2)}')
            return

        # export jobs to file instead of submitting them
        if export is not None:
            exported = _export_jobs(
                export,
                cluster.create_v1jobs(specs, job_name, labels),
            )
            if not exported:
                # reported through logging like every other error here
                logging.error(f'error exporting jobs to {export}')
            return

        for s in specs:
            try:
                cluster.submit_job(job_spec=s, name=job_name, labels=labels)
            except Exception as e:
                logging.error(f'exception: {e}')
                session.commit()  # commit here, otherwise will be rolled back
                return

    # --------------------------------------------------------------------------
    logging.info(f'jobs submitted, visit {cluster.dashboard_url()} to monitor')