Example #1
 def test_service_backend_bucket_parameter(self):
     backend = ServiceBackend(bucket=HAIL_TEST_GCS_BUCKET)
     b = Batch(backend=backend)
     j1 = b.new_job()
     j1.command(f'echo hello > {j1.ofile}')
     j2 = b.new_job()
     j2.command(f'cat {j1.ofile}')
     b.run()
Example #2
 def test_service_backend_remote_tempdir_with_no_trailing_slash(self):
     backend = ServiceBackend(
         remote_tmpdir=f'gs://{HAIL_TEST_GCS_BUCKET}/temporary-files')
     b = Batch(backend=backend)
     j1 = b.new_job()
     j1.command(f'echo hello > {j1.ofile}')
     j2 = b.new_job()
     j2.command(f'cat {j1.ofile}')
     b.run()
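Example #2's name refers to the absence of a trailing slash in `remote_tmpdir`; the trailing-slash case is presumably exercised by a separate test. A hypothetical companion sketch would differ only in the `remote_tmpdir` value:

 def test_service_backend_remote_tempdir_with_trailing_slash(self):
     # Hypothetical counterpart: same two-job pipeline, but remote_tmpdir ends in '/'.
     backend = ServiceBackend(
         remote_tmpdir=f'gs://{HAIL_TEST_GCS_BUCKET}/temporary-files/')
     b = Batch(backend=backend)
     j1 = b.new_job()
     j1.command(f'echo hello > {j1.ofile}')
     j2 = b.new_job()
     j2.command(f'cat {j1.ofile}')
     b.run()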
Example #3
 def test_large_command(self):
     backend = ServiceBackend(
         remote_tmpdir=f'gs://{HAIL_TEST_GCS_BUCKET}/temporary-files')
     b = Batch(backend=backend)
     j1 = b.new_job()
     long_str = secrets.token_urlsafe(15 * 1024)
     j1.command(f'echo "{long_str}"')
     b.run()
Example #4
 def test_big_batch_which_uses_slow_path(self):
     backend = ServiceBackend(
         remote_tmpdir=f'{self.remote_tmpdir}/temporary-files')
     b = Batch(backend=backend)
     # 8 * 256 * 1024 = 2 MiB > 1 MiB max bunch size
     for i in range(8):
         j1 = b.new_job()
         long_str = secrets.token_urlsafe(256 * 1024)
         j1.command(f'echo "{long_str}"')
     batch = b.run()
     assert not batch.submission_info.used_fast_create
     batch_status = batch.status()
     assert batch_status['state'] == 'success', str(batch.debug_info())
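The comment in Example #4 points at the 1 MiB bunch-size cutoff that pushes a large batch onto the slow creation path. As a hypothetical counterpart (not taken from the test suite), a batch with a single small job should stay under that limit and report `used_fast_create` as true:

 def test_small_batch_which_uses_fast_path(self):
     # Hypothetical counterpart to the test above: one small job stays well
     # under the 1 MiB bunch limit, so the fast creation path should be taken.
     backend = ServiceBackend(
         remote_tmpdir=f'{self.remote_tmpdir}/temporary-files')
     b = Batch(backend=backend)
     j = b.new_job()
     j.command('echo hello')
     batch = b.run()
     assert batch.submission_info.used_fast_create
     batch_status = batch.status()
     assert batch_status['state'] == 'success', str(batch.debug_info())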
Example #5
def _add_submit_job(
    batch: hb.Batch,
    cluster_id: str,
    script: str,
    pyfiles: Optional[List[str]] = None,
    job_name: Optional[str] = None,
    cluster_name: Optional[str] = None,
) -> hb.batch.job.Job:
    """
    Returns a job that submits a script to the Dataproc cluster
    specified by `cluster_id`. It's the user's responsibility to start and stop
    that cluster with the `start_cluster` and `stop_cluster` functions
    """
    job_name_prefix = f'{job_name}: s' if job_name else 'S'
    job_name = f'{job_name_prefix}ubmit to Dataproc cluster'
    if cluster_name:
        job_name += f' "{cluster_name}"'

    main_job = batch.new_job(name=job_name)
    main_job.image(DATAPROC_IMAGE)
    main_job.command(GCLOUD_ACTIVATE_AUTH)
    main_job.command(GCLOUD_PROJECT)
    main_job.command(DATAPROC_REGION)

    # Clone the repository to pass scripts to the cluster.
    prepare_git_job(
        job=main_job,
        repo_name=get_repo_name_from_remote(get_git_default_remote()),
        commit=get_git_commit_ref_of_current_repository(),
    )
    cwd = get_relative_path_from_git_root()
    if cwd:
        main_job.command(f'cd {quote(cwd)}')

    if pyfiles:
        main_job.command(f'mkdir {PYFILES_DIR}')
        main_job.command(f'cp -r {" ".join(pyfiles)} {PYFILES_DIR}')
        main_job.command(f'cd {PYFILES_DIR}')
        main_job.command(f'zip -r {PYFILES_ZIP} .')
        main_job.command('cd -')

    main_job.command(
        'hailctl dataproc submit '
        + (f'--pyfiles {PYFILES_DIR}/{PYFILES_ZIP} ' if pyfiles else '')
        + f'{cluster_id} {script}'
    )
    return main_job
Example #6
def _add_stop_job(
    batch: hb.Batch,
    cluster_id: str,
    job_name: Optional[str] = None,
    cluster_name: Optional[str] = None,
) -> hb.batch.job.Job:
    """
    Returns a job that stops the Dataproc cluster specified by `cluster_id`
    """
    job_name_prefix = f'{job_name}: s' if job_name else 'S'
    job_name = f'{job_name_prefix}top Dataproc cluster'
    if cluster_name:
        job_name += f' "{cluster_name}"'

    stop_job = batch.new_job(name=job_name)
    stop_job.always_run()  # Always clean up.
    stop_job.image(DATAPROC_IMAGE)
    stop_job.command(GCLOUD_ACTIVATE_AUTH)
    stop_job.command(GCLOUD_PROJECT)
    stop_job.command(DATAPROC_REGION)
    stop_job.command(f'hailctl dataproc stop {cluster_id}')

    return stop_job
Example #7
def _add_start_job(  # pylint: disable=too-many-arguments
    batch: hb.Batch,
    max_age: str,
    num_workers: int = 2,
    num_secondary_workers: int = 0,
    autoscaling_policy: Optional[str] = None,
    worker_machine_type: Optional[str] = None,  # e.g. 'n1-highmem-8'
    worker_boot_disk_size: Optional[int] = None,  # in GB
    secondary_worker_boot_disk_size: Optional[int] = None,  # in GB
    packages: Optional[List[str]] = None,
    init: Optional[List[str]] = None,
    vep: Optional[str] = None,
    requester_pays_allow_all: bool = False,
    cluster_name: Optional[str] = None,
    job_name: Optional[str] = None,
    scopes: Optional[List[str]] = None,
    labels: Optional[Dict[str, str]] = None,
) -> Tuple[hb.batch.job.Job, str]:
    """
    Returns a Batch job which starts a Dataproc cluster, and the name of the cluster.
    The user is responsible for stopping the cluster.

    See the `hailctl` tool for information on the keyword parameters.
    """
    cluster_id = f'dp-{uuid.uuid4().hex[:20]}'

    job_name_prefix = f'{job_name}: s' if job_name else 'S'
    job_name = f'{job_name_prefix}tart Dataproc cluster'
    if cluster_name:
        job_name += f' "{cluster_name}"'
        cluster_name = re.sub(r'[^a-zA-Z0-9]+', '-', cluster_name.lower())
        # Cluster id can't be longer than 49 characters
        cluster_id = f'{cluster_id}-{cluster_name}'[:49]

    if labels is None:
        labels = {}
    labels['compute-category'] = 'dataproc'
    labels_formatted = ','.join(f'{key}={value}'
                                for key, value in labels.items())

    start_job = batch.new_job(name=job_name)
    start_job.image(DATAPROC_IMAGE)
    start_job.command(GCLOUD_ACTIVATE_AUTH)
    start_job.command(GCLOUD_PROJECT)
    start_job.command(DATAPROC_REGION)

    # The spark-env property can be used to set environment variables in jobs that run
    # on the Dataproc cluster. We propagate some currently set environment variables
    # this way.
    spark_env = []
    for env_var in 'DATASET', 'ACCESS_LEVEL', 'OUTPUT':
        value = os.getenv(env_var)
        assert value, f'environment variable "{env_var}" is not set'
        spark_env.append(f'spark-env:{env_var}={value}')

    # Note that the options and their values must be separated by an equal sign.
    # Using a space will break some options like --label
    start_job_command = [
        'hailctl dataproc start',
        f'--service-account=dataproc-{os.getenv("ACCESS_LEVEL")}@{os.getenv("DATASET_GCP_PROJECT")}.iam.gserviceaccount.com',
        f'--max-age={max_age}',
        f'--num-workers={num_workers}',
        f'--num-secondary-workers={num_secondary_workers}',
        f'--properties="{",".join(spark_env)}"',
        f'--labels={labels_formatted}',
    ]
    if worker_machine_type:
        start_job_command.append(
            f'--worker-machine-type={worker_machine_type}')
    if worker_boot_disk_size:
        start_job_command.append(
            f'--worker-boot-disk-size={worker_boot_disk_size}')
    if secondary_worker_boot_disk_size:
        start_job_command.append(
            f'--secondary-worker-boot-disk-size={secondary_worker_boot_disk_size}'
        )
    if packages:
        start_job_command.append(f'--packages={",".join(packages)}')
    if init:
        start_job_command.append(f'--init={",".join(init)}')
    if vep:
        start_job_command.append(f'--vep={vep}')
    if requester_pays_allow_all:
        start_job_command.append('--requester-pays-allow-all')
    if scopes:
        start_job_command.append(f'--scopes={",".join(scopes)}')

    if autoscaling_policy:
        start_job_command.append(f'--autoscaling-policy={autoscaling_policy}')

    start_job_command.append(cluster_id)

    start_job.command(' '.join(start_job_command))
    return start_job, cluster_id
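A minimal end-to-end sketch of how the three helpers above could be wired together, based on their docstrings. It calls the private `_add_*` functions directly rather than the `start_cluster`/`stop_cluster` wrappers mentioned in Example #5; the billing project, bucket, and script path are placeholders, and the explicit `depends_on` calls are an assumption about how the start -> submit -> stop ordering would be enforced:

# Hypothetical usage sketch, not part of the library. Assumes the DATASET,
# ACCESS_LEVEL, OUTPUT and DATASET_GCP_PROJECT environment variables required
# by _add_start_job are set, and that hb is hailtop.batch.
backend = hb.ServiceBackend(
    billing_project='my-billing-project',        # placeholder
    remote_tmpdir='gs://my-bucket/batch-tmp/',   # placeholder
)
b = hb.Batch(name='dataproc-pipeline', backend=backend)

start_job, cluster_id = _add_start_job(
    batch=b,
    max_age='2h',
    cluster_name='my analysis',
)
submit_job = _add_submit_job(
    batch=b,
    cluster_id=cluster_id,
    script='query.py',        # placeholder script path
    cluster_name='my analysis',
)
stop_job = _add_stop_job(
    batch=b,
    cluster_id=cluster_id,
    cluster_name='my analysis',
)

# Enforce start -> submit -> stop ordering. The stop job is marked always_run()
# inside _add_stop_job, so the cluster is torn down even if the script fails.
submit_job.depends_on(start_job)
stop_job.depends_on(submit_job)

b.run()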