def test_large_command(self):
    backend = ServiceBackend(remote_tmpdir=f'gs://{HAIL_TEST_GCS_BUCKET}/temporary-files')
    b = Batch(backend=backend)
    j1 = b.new_job()
    long_str = secrets.token_urlsafe(15 * 1024)
    j1.command(f'echo "{long_str}"')
    b.run()
def test_big_batch_which_uses_slow_path(self):
    backend = ServiceBackend(remote_tmpdir=f'{self.remote_tmpdir}/temporary-files')
    b = Batch(backend=backend)
    # 8 * 256 * 1024 = 2 MiB > 1 MiB max bunch size
    for i in range(8):
        j1 = b.new_job()
        long_str = secrets.token_urlsafe(256 * 1024)
        j1.command(f'echo "{long_str}"')
    batch = b.run()
    assert not batch.submission_info.used_fast_create
    batch_status = batch.status()
    assert batch_status['state'] == 'success', str(batch.debug_info())
def batch(self):
    return Batch(backend=self.backend,
                 default_image='google/cloud-sdk:237.0.0-alpine',
                 attributes={'foo': 'a', 'bar': 'b'})
def batch(self, requester_pays_project=None):
    return Batch(backend=self.backend,
                 default_image='google/cloud-sdk:237.0.0-alpine',
                 attributes={'foo': 'a', 'bar': 'b'},
                 requester_pays_project=requester_pays_project)
def batch(self, requester_pays_project=None, default_python_image=None,
          cancel_after_n_failures=None):
    return Batch(backend=self.backend,
                 default_image=DOCKER_ROOT_IMAGE,
                 attributes={'foo': 'a', 'bar': 'b'},
                 requester_pays_project=requester_pays_project,
                 default_python_image=default_python_image,
                 cancel_after_n_failures=cancel_after_n_failures)
def test_service_backend_bucket_parameter(self):
    backend = ServiceBackend(bucket=HAIL_TEST_GCS_BUCKET)
    b = Batch(backend=backend)
    j1 = b.new_job()
    j1.command(f'echo hello > {j1.ofile}')
    j2 = b.new_job()
    j2.command(f'cat {j1.ofile}')
    b.run()
def test_service_backend_remote_tempdir_with_trailing_slash(self):
    backend = ServiceBackend(remote_tmpdir=f'gs://{HAIL_TEST_GCS_BUCKET}/temporary-files/')
    b = Batch(backend=backend)
    j1 = b.new_job()
    j1.command(f'echo hello > {j1.ofile}')
    j2 = b.new_job()
    j2.command(f'cat {j1.ofile}')
    b.run()
def _add_submit_job(
    batch: hb.Batch,
    cluster_id: str,
    script: str,
    pyfiles: Optional[List[str]] = None,
    job_name: Optional[str] = None,
    cluster_name: Optional[str] = None,
) -> hb.batch.job.Job:
    """
    Returns a job that submits a script to the Dataproc cluster specified by
    `cluster_id`. It's the user's responsibility to start and stop that cluster
    with the `start_cluster` and `stop_cluster` functions.
    """
    job_name_prefix = f'{job_name}: s' if job_name else 'S'
    job_name = f'{job_name_prefix}ubmit to Dataproc cluster'
    if cluster_name:
        job_name += f' "{cluster_name}"'

    main_job = batch.new_job(name=job_name)
    main_job.image(DATAPROC_IMAGE)
    main_job.command(GCLOUD_ACTIVATE_AUTH)
    main_job.command(GCLOUD_PROJECT)
    main_job.command(DATAPROC_REGION)

    # Clone the repository to pass scripts to the cluster.
    prepare_git_job(
        job=main_job,
        repo_name=get_repo_name_from_remote(get_git_default_remote()),
        commit=get_git_commit_ref_of_current_repository(),
    )
    cwd = get_relative_path_from_git_root()
    if cwd:
        main_job.command(f'cd {quote(cwd)}')

    if pyfiles:
        main_job.command(f'mkdir {PYFILES_DIR}')
        main_job.command(f'cp -r {" ".join(pyfiles)} {PYFILES_DIR}')
        main_job.command(f'cd {PYFILES_DIR}')
        main_job.command(f'zip -r {PYFILES_ZIP} .')
        main_job.command('cd -')

    main_job.command(
        'hailctl dataproc submit '
        + (f'--pyfiles {PYFILES_DIR}/{PYFILES_ZIP} ' if pyfiles else '')
        + f'{cluster_id} {script}'
    )
    return main_job
def _add_stop_job(
    batch: hb.Batch,
    cluster_id: str,
    job_name: Optional[str] = None,
    cluster_name: Optional[str] = None,
) -> hb.batch.job.Job:
    """
    Returns a job that stops the Dataproc cluster specified by `cluster_id`.
    """
    job_name_prefix = f'{job_name}: s' if job_name else 'S'
    job_name = f'{job_name_prefix}top Dataproc cluster'
    if cluster_name:
        job_name += f' "{cluster_name}"'

    stop_job = batch.new_job(name=job_name)
    stop_job.always_run()  # Always clean up.
    stop_job.image(DATAPROC_IMAGE)
    stop_job.command(GCLOUD_ACTIVATE_AUTH)
    stop_job.command(GCLOUD_PROJECT)
    stop_job.command(DATAPROC_REGION)
    stop_job.command(f'hailctl dataproc stop {cluster_id}')
    return stop_job
def test_backend_context_manager(self):
    with LocalBackend() as backend:
        b = Batch(backend=backend)
        b.run()
def batch(self, requester_pays_project=None):
    return Batch(backend=LocalBackend(),
                 requester_pays_project=requester_pays_project)
def _add_start_job(  # pylint: disable=too-many-arguments
    batch: hb.Batch,
    max_age: str,
    num_workers: int = 2,
    num_secondary_workers: int = 0,
    autoscaling_policy: Optional[str] = None,
    worker_machine_type: Optional[str] = None,  # e.g. 'n1-highmem-8'
    worker_boot_disk_size: Optional[int] = None,  # in GB
    secondary_worker_boot_disk_size: Optional[int] = None,  # in GB
    packages: Optional[List[str]] = None,
    init: Optional[List[str]] = None,
    vep: Optional[str] = None,
    requester_pays_allow_all: bool = False,
    cluster_name: Optional[str] = None,
    job_name: Optional[str] = None,
    scopes: Optional[List[str]] = None,
    labels: Optional[Dict[str, str]] = None,
) -> Tuple[hb.batch.job.Job, str]:
    """
    Returns a Batch job which starts a Dataproc cluster, and the name of the cluster.
    The user is responsible for stopping the cluster.

    See the `hailctl` tool for information on the keyword parameters.
    """
    cluster_id = f'dp-{uuid.uuid4().hex[:20]}'

    job_name_prefix = f'{job_name}: s' if job_name else 'S'
    job_name = f'{job_name_prefix}tart Dataproc cluster'
    if cluster_name:
        job_name += f' "{cluster_name}"'
        cluster_name = re.sub(r'[^a-zA-Z0-9]+', '-', cluster_name.lower())
        # Cluster id can't be longer than 49 characters.
        cluster_id = f'{cluster_id}-{cluster_name}'[:49]

    if labels is None:
        labels = {}
    labels['compute-category'] = 'dataproc'
    labels_formatted = ','.join(f'{key}={value}' for key, value in labels.items())

    start_job = batch.new_job(name=job_name)
    start_job.image(DATAPROC_IMAGE)
    start_job.command(GCLOUD_ACTIVATE_AUTH)
    start_job.command(GCLOUD_PROJECT)
    start_job.command(DATAPROC_REGION)

    # The spark-env property can be used to set environment variables in jobs that run
    # on the Dataproc cluster. We propagate some currently set environment variables
    # this way.
    spark_env = []
    for env_var in 'DATASET', 'ACCESS_LEVEL', 'OUTPUT':
        value = os.getenv(env_var)
        assert value, f'environment variable "{env_var}" is not set'
        spark_env.append(f'spark-env:{env_var}={value}')

    # Note that the options and their values must be separated by an equal sign.
    # Using a space will break some options like --label.
    start_job_command = [
        'hailctl dataproc start',
        f'--service-account=dataproc-{os.getenv("ACCESS_LEVEL")}@{os.getenv("DATASET_GCP_PROJECT")}.iam.gserviceaccount.com',
        f'--max-age={max_age}',
        f'--num-workers={num_workers}',
        f'--num-secondary-workers={num_secondary_workers}',
        f'--properties="{",".join(spark_env)}"',
        f'--labels={labels_formatted}',
    ]
    if worker_machine_type:
        start_job_command.append(f'--worker-machine-type={worker_machine_type}')
    if worker_boot_disk_size:
        start_job_command.append(f'--worker-boot-disk-size={worker_boot_disk_size}')
    if secondary_worker_boot_disk_size:
        start_job_command.append(
            f'--secondary-worker-boot-disk-size={secondary_worker_boot_disk_size}'
        )
    if packages:
        start_job_command.append(f'--packages={",".join(packages)}')
    if init:
        start_job_command.append(f'--init={",".join(init)}')
    if vep:
        start_job_command.append(f'--vep={vep}')
    if requester_pays_allow_all:
        start_job_command.append('--requester-pays-allow-all')
    if scopes:
        start_job_command.append(f'--scopes={",".join(scopes)}')
    if autoscaling_policy:
        start_job_command.append(f'--autoscaling-policy={autoscaling_policy}')

    start_job_command.append(cluster_id)
    start_job.command(' '.join(start_job_command))
    return start_job, cluster_id
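# Illustrative only: a minimal sketch of how the three helpers above are meant to be
# chained, assuming an existing `hb.Batch` instance `b`. The script path 'main.py' and
# `max_age='2h'` are hypothetical values, not part of this module.
#
#   start_job, cluster_id = _add_start_job(batch=b, max_age='2h', num_workers=2)
#   submit_job = _add_submit_job(batch=b, cluster_id=cluster_id, script='main.py')
#   stop_job = _add_stop_job(batch=b, cluster_id=cluster_id)
#   submit_job.depends_on(start_job)
#   stop_job.depends_on(submit_job)  # stop_job.always_run() still cleans up on failure
#   b.run()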
def batch(self):
    return Batch(backend=LocalBackend())
def batch(self, requester_pays_project=None):
    return Batch(backend=self.backend,
                 default_image=DOCKER_ROOT_IMAGE,
                 attributes={'foo': 'a', 'bar': 'b'},
                 requester_pays_project=requester_pays_project)