def _run(self, pipeline, dry_run, verbose, delete_scratch_on_exit):  # pylint: disable=R0915
    """Compile the pipeline into a Batch job DAG, submit it, wait for it to finish,
    and raise a PipelineException describing any failed jobs."""
    build_dag_start = time.time()

    bucket = self._batch_client.bucket
    subdir_name = 'pipeline-{}'.format(uuid.uuid4().hex[:12])

    remote_tmpdir = f'gs://{bucket}/pipeline/{subdir_name}'
    local_tmpdir = f'/io/pipeline/{subdir_name}'

    default_image = 'ubuntu:latest'

    attributes = pipeline.attributes
    if pipeline.name is not None:
        attributes['name'] = pipeline.name

    batch = self._batch_client.create_batch(attributes=attributes)

    n_jobs_submitted = 0
    used_remote_tmpdir = False

    task_to_job_mapping = {}
    jobs_to_command = {}
    commands = []

    bash_flags = 'set -e' + ('x' if verbose else '') + '; '
    activate_service_account = 'gcloud -q auth activate-service-account ' \
                               '--key-file=/gsa-key/privateKeyData'

    # Helpers mapping resources to (src, dest) pairs for the copy steps.
    def copy_input(r):
        if isinstance(r, InputResourceFile):
            return [(r._input_path, r._get_path(local_tmpdir))]
        assert isinstance(r, TaskResourceFile)
        return [(r._get_path(remote_tmpdir), r._get_path(local_tmpdir))]

    def copy_internal_output(r):
        assert isinstance(r, TaskResourceFile)
        return [(r._get_path(local_tmpdir), r._get_path(remote_tmpdir))]

    def copy_external_output(r):
        if isinstance(r, InputResourceFile):
            return [(r._input_path, dest) for dest in r._output_paths]
        assert isinstance(r, TaskResourceFile)
        return [(r._get_path(local_tmpdir), dest) for dest in r._output_paths]

    # Copy input files that were also requested as external outputs directly
    # to their destinations in a dedicated job.
    write_external_inputs = [x for r in pipeline._input_resources
                             for x in copy_external_output(r)]
    if write_external_inputs:
        def _cp(src, dst):
            return f'gsutil -m cp -R {src} {dst}'

        write_cmd = bash_flags + activate_service_account + ' && ' + \
            ' && '.join([_cp(*files) for files in write_external_inputs])

        if dry_run:
            commands.append(write_cmd)
        else:
            j = batch.create_job(image='google/cloud-sdk:237.0.0-alpine',
                                 command=['/bin/bash', '-c', write_cmd],
                                 attributes={'name': 'write_external_inputs'})
            jobs_to_command[j] = write_cmd
            n_jobs_submitted += 1

    # Create one job per task.
    for task in pipeline._tasks:
        inputs = [x for r in task._inputs for x in copy_input(r)]

        outputs = [x for r in task._internal_outputs for x in copy_internal_output(r)]
        if outputs:
            used_remote_tmpdir = True
        outputs += [x for r in task._external_outputs for x in copy_external_output(r)]

        resource_defs = [r._declare(directory=local_tmpdir) for r in task._mentioned]

        if task._image is None:
            if verbose:
                print(f"Using image '{default_image}' since no image was specified.")
        make_local_tmpdir = f'mkdir -p {local_tmpdir}/{task._uid}/; '
        defs = ('; '.join(resource_defs) + '; ') if resource_defs else ''
        task_command = [cmd.strip() for cmd in task._command]

        cmd = bash_flags + make_local_tmpdir + defs + " && ".join(task_command)

        if dry_run:
            commands.append(cmd)
            continue

        parents = [task_to_job_mapping[t] for t in task._dependencies]

        attributes = {'task_uid': task._uid}
        if task.name:
            attributes['name'] = task.name
        attributes.update(task.attributes)

        resources = {}
        if task._cpu:
            resources['cpu'] = task._cpu
        if task._memory:
            resources['memory'] = task._memory

        j = batch.create_job(image=task._image if task._image else default_image,
                             command=['/bin/bash', '-c', cmd],
                             parents=parents,
                             attributes=attributes,
                             resources=resources,
                             input_files=inputs if len(inputs) > 0 else None,
                             output_files=outputs if len(outputs) > 0 else None,
                             pvc_size=task._storage)
        n_jobs_submitted += 1

        task_to_job_mapping[task] = j
        jobs_to_command[j] = cmd

    if dry_run:
        print("\n\n".join(commands))
        return

    # Clean up the remote scratch directory once all other jobs have finished.
    if delete_scratch_on_exit and used_remote_tmpdir:
        parents = list(jobs_to_command.keys())
        rm_cmd = f'gsutil -m rm -r {remote_tmpdir}'
        cmd = bash_flags + f'{activate_service_account} && {rm_cmd}'
        j = batch.create_job(image='google/cloud-sdk:237.0.0-alpine',
                             command=['/bin/bash', '-c', cmd],
                             parents=parents,
                             attributes={'name': 'remove_tmpdir'},
                             always_run=True)
        jobs_to_command[j] = cmd
        n_jobs_submitted += 1

    if verbose:
        print(f'Built DAG with {n_jobs_submitted} jobs in '
              f'{round(time.time() - build_dag_start, 3)} seconds.')

    submit_batch_start = time.time()
    batch = batch.submit()

    jobs_to_command = {j.id: cmd for j, cmd in jobs_to_command.items()}

    if verbose:
        print(f'Submitted batch {batch.id} with {n_jobs_submitted} jobs in '
              f'{round(time.time() - submit_batch_start, 3)} seconds:')
        for jid, cmd in jobs_to_command.items():
            print(f'{jid}: {cmd}')

    status = batch.wait()

    if status['state'] == 'success':
        print('Pipeline completed successfully!')
        return

    # Collect the jobs that finished with a nonzero exit code and report them.
    failed_jobs = [(j, Job.exit_code(j)) for j in status['jobs']]
    failed_jobs = [((j['batch_id'], j['job_id']), ec)
                   for j, ec in failed_jobs
                   if ec is not None and ec != 0]

    fail_msg = ''
    for jid, ec in failed_jobs:
        job = self._batch_client.get_job(*jid)
        log = job.log()
        name = job.status()['attributes'].get('name', None)
        fail_msg += (f"Job {jid} failed with exit code {ec}:\n"
                     f"  Task name:\t{name}\n"
                     f"  Command:\t{jobs_to_command[jid]}\n"
                     f"  Log:\t{log}\n")

    raise PipelineException(fail_msg)
def batch_status_exit_codes(batch_status):
    return [Job._get_exit_codes(j) for j in batch_status['jobs']]
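
# Illustrative usage only (a sketch, not part of the pipeline API): given the
# same batch status dict that `_run` obtains from `batch.wait()`, this helper
# collects the per-job exit codes for post-hoc inspection, e.g.
#
#     status = batch.wait()
#     exit_codes = batch_status_exit_codes(status)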