def define_output(self):
    # Declare GVCF output filename
    randomer = Platform.generate_unique_id()
    gvcf = self.generate_unique_file_name(extension="{0}.g.vcf".format(randomer))
    self.add_output("gvcf", gvcf)
    # Declare GVCF index output filename
    gvcf_idx = self.generate_unique_file_name(extension="{0}.g.vcf.idx".format(randomer))
    self.add_output("gvcf_idx", gvcf_idx)
def define_output(self):
    # Generate a unique random string to append to the file name
    randomer = Platform.generate_unique_id()
    # Declare VCF output filename
    vcf = self.generate_unique_file_name(extension=f'{randomer}.vcf.gz')
    self.add_output("vcf_gz", vcf)
    # Declare VCF index output filename
    vcf_idx = self.generate_unique_file_name(extension=f'{randomer}.vcf.gz.tbi')
    self.add_output("vcf_idx", vcf_idx)
def mkdir(self, dir_path, job_name=None, log=False, wait=False, **kwargs):
    # Make a directory if it doesn't already exist
    cmd_generator = StorageHelper.__get_storage_cmd_generator(dir_path)
    cmd = cmd_generator.mkdir(dir_path)
    job_name = "mkdir_%s" % Platform.generate_unique_id() if job_name is None else job_name

    # Optionally add logging
    cmd = "%s !LOG3!" % cmd if log else cmd

    # Run command and return job name
    self.proc.run(job_name, cmd, **kwargs)
    if wait:
        self.proc.wait_process(job_name)
    return job_name
def define_output(self):
    # Generate a unique random string to append to the file name
    randomer = Platform.generate_unique_id()

    # Generate unique file names based on the output mode set for HaplotypeCaller
    if self.get_argument("output_type") == "gvcf":
        # Declare GVCF output filename
        gvcf = self.generate_unique_file_name(extension="{0}.g.vcf".format(randomer))
        self.add_output("gvcf", gvcf)
        # Declare GVCF index output filename
        gvcf_idx = self.generate_unique_file_name(extension="{0}.g.vcf.idx".format(randomer))
        self.add_output("gvcf_idx", gvcf_idx)
    else:
        # Declare VCF output filename
        vcf = self.generate_unique_file_name(extension="{0}.vcf".format(randomer))
        self.add_output("vcf", vcf)
        # Declare VCF index output filename
        vcf_idx = self.generate_unique_file_name(extension="{0}.vcf.idx".format(randomer))
        self.add_output("vcf_idx", vcf_idx)
def rm(self, path, job_name=None, log=True, wait=False, **kwargs):
    # Delete file from file system
    # Log the deletion unless otherwise specified
    cmd_generator = StorageHelper.__get_storage_cmd_generator(path)
    cmd = cmd_generator.rm(path)
    job_name = "rm_%s" % Platform.generate_unique_id() if job_name is None else job_name

    # Optionally add logging
    cmd = "%s !LOG3!" % cmd if log else cmd

    # Run command and return job name
    self.proc.run(job_name, cmd, **kwargs)
    if wait:
        self.proc.wait_process(job_name)
    return job_name
def __init__(self, wrk_dir, tmp_output_dir, final_output_dir):
    # wrk_dir: folder where input/output files will be generated by the task
    # tmp_output_dir: folder where temporary final output will be saved until all tasks are finished
    # final_output_dir: folder where final output files will be saved
    self.workspace = {
        "wrk": wrk_dir,
        "tmp_output": tmp_output_dir,
        "final_output": final_output_dir
    }

    # Define wrk/final log directories
    self.workspace["wrk_log"] = os.path.join(wrk_dir, "log/")
    self.workspace["final_log"] = os.path.join(final_output_dir, "log/")

    # Standardize directory paths
    for dir_type, dir_path in self.workspace.items():
        self.workspace[dir_type] = Platform.standardize_dir(dir_path)
def path_exists(self, path, job_name=None, **kwargs):
    # Return True if the path exists, False otherwise
    cmd_generator = StorageHelper.__get_storage_cmd_generator(path)
    cmd = cmd_generator.ls(path)

    # Run command
    job_name = "check_exists_%s" % Platform.generate_unique_id() if job_name is None else job_name
    self.proc.run(job_name, cmd, quiet_failure=False, **kwargs)

    # Wait for cmd to finish and get output
    try:
        self.proc.wait_process(job_name)
        return True
    except RuntimeError as e:
        if str(e) != "":
            logging.debug("StorageHelper error for %s:\n%s" % (job_name, e))
        return False
    except:
        logging.error("Unable to check path existence: %s" % path)
        raise
def get_file_size(self, path, job_name=None, **kwargs):
    # Return file size in gigabytes
    cmd_generator = StorageHelper.__get_storage_cmd_generator(path)
    cmd = cmd_generator.get_file_size(path)

    # Run command
    job_name = "get_size_%s" % Platform.generate_unique_id() if job_name is None else job_name
    self.proc.run(job_name, cmd, **kwargs)

    # Wait for cmd to finish and get output
    try:
        # Try to return file size in gigabytes
        out, err = self.proc.wait_process(job_name)
        # Iterate over all files if multiple files are listed (can happen with wildcards)
        file_sizes = [int(x.split()[0]) for x in out.split("\n") if x != ""]
        # Sum the sizes and convert from bytes to gigabytes
        return sum(file_sizes) / (1024 ** 3.0)
    except BaseException as e:
        logging.error("Unable to get file size: %s" % path)
        if str(e) != "":
            logging.error("Received the following msg:\n%s" % e)
        raise
def mv(self, src_path, dest_path, job_name=None, log=True, wait=False, **kwargs):
    # Transfer file or dir from src_path to dest_path
    # Log the transfer unless otherwise specified
    cmd_generator = StorageHelper.__get_storage_cmd_generator(src_path, dest_path)
    cmd = cmd_generator.mv(src_path, dest_path)
    job_name = "mv_%s" % Platform.generate_unique_id() if job_name is None else job_name

    # Optionally add logging
    cmd = "%s !LOG3!" % cmd if log else cmd

    # Run command and return job name
    self.proc.run(job_name, cmd, **kwargs)
    if wait:
        self.proc.wait_process(job_name)
    return job_name
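# Hedged usage sketch (an assumption, not part of the original source): how the
# StorageHelper job-based API above is typically driven. "storage_helper" and the
# paths are hypothetical placeholders; each call returns a job name that can be
# awaited through the same process handler the helper uses internally.
def _example_stage_and_clean(storage_helper):
    # Create the working directory and block until it exists
    storage_helper.mkdir("/data/wrk/", wait=True)
    # Start an asynchronous, logged transfer and wait for it explicitly
    mv_job = storage_helper.mv("gs://bucket/sample.bam", "/data/wrk/")
    storage_helper.proc.wait_process(mv_job)
    # Inspect the staged file, then remove it once it is no longer needed
    if storage_helper.path_exists("/data/wrk/sample.bam"):
        size_gb = storage_helper.get_file_size("/data/wrk/sample.bam")
        logging.debug("Staged %.2f GB" % size_gb)
    storage_helper.rm("/data/wrk/sample.bam", wait=True)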
def define_output(self):
    # Declare compressed VCF output filename
    randomer = Platform.generate_unique_id()
    vcf_gz = self.generate_unique_file_name(extension="{0}.vcf.gz".format(randomer))
    self.add_output("vcf_gz", vcf_gz)
def __create_job_def(self, post_processing_required=False):
    # initialize the job def body
    self.inst_name = self.name
    if self.job_count > 1:
        self.inst_name = self.inst_name + '-' + str(self.job_count)
    self.job_count += 1
    job_def = client.V1Job(kind="Job")
    job_def.metadata = client.V1ObjectMeta(namespace=self.namespace, name=self.inst_name)

    # initialize job pieces
    self.job_containers = []
    volume_mounts = []
    volumes = []
    containers = []
    init_containers = []
    env_variables = []

    if not self.volume_name:
        # use the task name so it can be used across multiple jobs
        self.volume_name = self.name + '-pd'

    # build volume mounts
    volume_mounts = []
    volume_mounts.append(
        client.V1VolumeMount(mount_path=self.wrk_dir, name=self.volume_name))

    cpu_request_max = self.nodepool_info['max_cpu'] - self.cpu_reserve
    mem_request_max = self.nodepool_info['max_mem'] - self.mem_reserve

    # define resource limits/requests
    resource_def = client.V1ResourceRequirements(
        limits={'cpu': cpu_request_max, 'memory': str(mem_request_max) + 'G'},
        requests={'cpu': cpu_request_max * .8, 'memory': str(mem_request_max - 1) + 'G'})

    # update script task with job info
    if self.script_task:
        self.script_task.cpu_request = cpu_request_max * .8
        self.script_task.cpu_max = cpu_request_max
        self.script_task.memory_request = mem_request_max - 1
        self.script_task.memory_max = mem_request_max
        self.script_task.instance_name = self.inst_name
        self.script_task.force_standard = not self.preemptible
        self.script_task.pool_name = str(self.node_label)
        self.script_task.instance_type = str(self.nodepool_info["inst_type"])

    # place the job in the appropriate node pool
    node_label_dict = {'poolName': str(self.node_label)}

    # build volumes
    volumes.append(
        client.V1Volume(
            name=self.volume_name,
            persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
                claim_name=self.pvc_name)))

    # incorporate configured persistent volumes if associated with the current task
    if self.extra_persistent_volumes:
        for pv in self.extra_persistent_volumes:
            if pv['task_prefix'] in self.name:
                claim_name = pv["pvc_name"]
                if 'dynamic' in pv and pv['dynamic']:
                    claim_name = claim_name[:57] + '-' + Platform.generate_unique_id(id_len=5)

                # need to add the extra persistent volume
                volume_mounts.append(
                    client.V1VolumeMount(mount_path=pv["path"],
                                         name=pv['volume_name'],
                                         read_only=pv['read_only']))
                volumes.append(
                    client.V1Volume(
                        name=pv['volume_name'],
                        persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
                            claim_name=claim_name)))

                # specify volumes for script task
                if self.script_task:
                    if 'dynamic' in pv and pv['dynamic']:
                        self.script_task.extra_volumes.append({
                            "path": pv["path"],
                            "name": pv["volume_name"],
                            "storage": pv["size"],
                            "read_only": pv["read_only"],
                            "claim_name": claim_name,
                            "command": pv["copy_command"],
                            "dynamic": True
                        })
                    else:
                        self.script_task.extra_volumes.append({
                            "path": pv["path"],
                            "name": pv["volume_name"],
                            "read_only": pv["read_only"],
                            "claim_name": claim_name,
                            "dynamic": False
                        })

    # incorporate configured secrets
    if self.gcp_secret_configured:
        volume_mounts.append(
            client.V1VolumeMount(mount_path="/etc/cloud_conductor/gcp.json",
                                 sub_path="gcp.json",
                                 name="secret-volume",
                                 read_only=True))
        volumes.append(
            client.V1Volume(
                name="secret-volume",
                secret=client.V1SecretVolumeSource(
                    secret_name="cloud-conductor-config",
                    items=[client.V1KeyToPath(key="gcp_json", path="gcp.json")])))
        env_variables.append(
            client.V1EnvVar(name='GOOGLE_APPLICATION_CREDENTIALS',
                            value='/etc/cloud_conductor/gcp.json'))
        env_variables.append(
            client.V1EnvVar(name='RCLONE_CONFIG_GS_TYPE', value='google cloud storage'))
        env_variables.append(
            client.V1EnvVar(name='RCLONE_CONFIG_GS_SERVICE_ACCOUNT_FILE',
                            value='$GOOGLE_APPLICATION_CREDENTIALS'))
        env_variables.append(
            client.V1EnvVar(name='RCLONE_CONFIG_GS_OBJECT_ACL', value='projectPrivate'))
        env_variables.append(
            client.V1EnvVar(name='RCLONE_CONFIG_GS_BUCKET_ACL', value='projectPrivate'))

    if self.aws_secret_configured:
        env_variables.append(
            client.V1EnvVar(
                name='AWS_ACCESS_KEY_ID',
                value_from=client.V1EnvVarSource(
                    secret_key_ref=client.V1SecretKeySelector(
                        name='cloud-conductor-config', key='aws_id'))))
        env_variables.append(
            client.V1EnvVar(
                name='AWS_SECRET_ACCESS_KEY',
                value_from=client.V1EnvVarSource(
                    secret_key_ref=client.V1SecretKeySelector(
                        name='cloud-conductor-config', key='aws_access'))))
        env_variables.append(
            client.V1EnvVar(name='RCLONE_CONFIG_S3_TYPE', value='s3'))
        env_variables.append(
            client.V1EnvVar(
                name='RCLONE_CONFIG_S3_ACCESS_KEY_ID',
                value_from=client.V1EnvVarSource(
                    secret_key_ref=client.V1SecretKeySelector(
                        name='cloud-conductor-config', key='aws_id'))))
        env_variables.append(
            client.V1EnvVar(
                name='RCLONE_CONFIG_S3_SECRET_ACCESS_KEY',
                value_from=client.V1EnvVarSource(
                    secret_key_ref=client.V1SecretKeySelector(
                        name='cloud-conductor-config', key='aws_access'))))

    storage_image = 'gcr.io/cloud-builders/gsutil'
    storage_tasks = ['mkdir_', 'grant_']
    container_name_list = []

    for k, v in self.processes.items():
        # if the process is for storage (i.e. mkdir, etc.), run it in the storage image
        entrypoint = ["/bin/bash", "-c"]
        if any(x in k for x in storage_tasks) or not v['docker_image']:
            container_image = storage_image
        else:
            container_image = v['docker_image']
            if v['docker_entrypoint'] is not None and v['original_cmd'].find(v['docker_entrypoint']) == -1:
                v['original_cmd'] = v['docker_entrypoint'] + ' ' + v['original_cmd']
            if 'rclone' in container_image:
                v['original_cmd'] = v['original_cmd'].replace("|&", "2>&1 |")
                entrypoint = ["/bin/sh", "-c"]

        args = v['original_cmd']
        if not isinstance(args, list):
            args = [v['original_cmd'].replace("sudo ", "")]
        args = " && ".join(args)
        args = args.replace("\n", " ")
        args = args.replace("java.io.tmpdir=/tmp/", "java.io.tmpdir=/data/tmp/")
        if "awk " in args:
            args = re.sub("'\"'\"'", "'", args)
        if "gsutil" in args:
            args = "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS && sleep 10; " + args

        # add in pipe error handling
        # if "copy_input" in k or "copy_output" in k:
        #     args = "set -o pipefail && " + args

        logging.debug(f"({self.name}) Command for task {k} is : {args}")

        # format the container name and roll call to logging
        container_name = k.replace("_", "-").replace(".", "-").lower()
        formatted_container_name = container_name[:57] + '-' + Platform.generate_unique_id(id_len=5)
        while formatted_container_name in container_name_list:
            # make sure all container names are unique
            formatted_container_name = container_name[:57] + '-' + Platform.generate_unique_id(id_len=5)
        container_name_list.append(formatted_container_name)

        # args = f">&2 echo STARTING TASK {container_name} && " + args

        containers.append(
            client.V1Container(
                # lifecycle=client.V1Lifecycle(post_start=post_start_handler),
                image=container_image,
                command=entrypoint,
                args=[args],
                name=formatted_container_name,
                volume_mounts=volume_mounts,
                env=env_variables,
                resources=resource_def,
                image_pull_policy='IfNotPresent'))

        if self.script_task and container_name not in self.script_task.commands:
            self.script_task.commands[container_name] = {
                "name": formatted_container_name,
                "docker_image": container_image,
                "entrypoint": entrypoint,
                "args": [args]
            }

    job_spec = dict(backoff_limit=self.default_num_cmd_retries)
    self.job_containers = containers

    # Run jobs in order using init_containers
    # See https://kubernetes.io/docs/concepts/workloads/pods/init-containers/
    if len(containers) > 1:
        init_containers = containers[:-1]
        containers = [containers[-1]]
    else:
        init_containers = None

    # define the pod spec
    job_template = client.V1PodTemplateSpec()
    job_labels = {}
    job_labels[self.inst_name] = 'CC-Job'
    # add annotation to prevent autoscaler from killing nodes running jobs
    annotations = {'cluster-autoscaler.kubernetes.io/safe-to-evict': 'false'}
    job_template.metadata = client.V1ObjectMeta(labels=job_labels, annotations=annotations)
    job_template.spec = client.V1PodSpec(
        init_containers=init_containers,
        containers=containers,
        volumes=volumes,
        restart_policy='Never',
        termination_grace_period_seconds=self.termination_seconds,
        node_selector=node_label_dict)

    job_def.spec = client.V1JobSpec(template=job_template, **job_spec)

    if self.script_task:
        self.script_task.num_retries = self.default_num_cmd_retries
        for k, v in job_labels.items():
            self.script_task.labels.append({"key": k, "value": v})
        for k, v in annotations.items():
            self.script_task.annotations.append({"key": k, "value": v})

    return job_def
def load_input(self, inputs):
    # List of jobs that have been started in the process of loading input
    job_names = []

    # Pull docker image if necessary
    if self.docker_image is not None:
        docker_image_name = self.docker_image.get_image_name().split("/")[0]
        docker_image_name = docker_image_name.replace(":", "_")
        job_name = "docker_pull_%s" % docker_image_name
        self.docker_helper.pull(self.docker_image.get_image_name(), job_name=job_name)
        job_names.append(job_name)

    # Load input files
    # Inputs: list containing remote files, local files, and docker images
    src_seen = []
    dest_seen = []
    count = 1
    batch_size = 5
    loading_counter = 0
    for task_input in inputs:

        # Don't transfer local files
        if ":" not in task_input.get_path():
            continue

        # Directory where input will be transferred
        dest_dir = self.workspace.get_wrk_dir()

        # Input filename after transfer (None = same as src)
        dest_filename = None

        # Case: transfer file into wrk directory if it's not already there
        if task_input.get_transferrable_path() not in src_seen:
            # Get name of file that's going to be transferred
            src_path = task_input.get_transferrable_path()
            job_name = "load_input_%s_%s_%s" % (self.task_id, task_input.get_type(), count)
            logging.debug("Input path: %s, transfer path: %s" % (task_input.get_path(), src_path))

            # Generate complete transfer path
            dest_path = os.path.join(dest_dir, task_input.filename)

            # Check to see if transferring file would overwrite existing file
            if dest_path in dest_seen:
                # Add unique tag to destination filename to prevent overwrite
                if task_input.sample_name is not None:
                    dest_filename = "{0}_{1}".format(task_input.sample_name, task_input.filename)
                else:
                    dest_filename = "{0}_{1}".format(Platform.generate_unique_id(), task_input.filename)
                logging.debug("Changing filename from '{0}' to '{1}'.".format(task_input.filename, dest_filename))
                dest_path = os.path.join(dest_dir, dest_filename)
            else:
                dest_filename = None
                dest_path = dest_dir

            # Log the final destination path
            logging.debug("Destination: {0}".format(dest_path))

            # Move file to dest_path
            self.storage_helper.mv(src_path=src_path, dest_path=dest_path, job_name=job_name)
            loading_counter += 1

            # Add transfer path to list of remote paths that have been transferred to local workspace
            src_seen.append(src_path)
            count += 1
            job_names.append(job_name)

            # If loading_counter reaches batch_size, clear out the queue
            if loading_counter >= batch_size:
                logging.debug("Batch size reached on task {0}".format(self.task_id))
                # Wait for all processes to finish
                while len(job_names):
                    self.processor.wait_process(job_names.pop())
                loading_counter = 0

        # Update path after transferring to wrk directory and add to list of files in working directory
        task_input.update_path(new_dir=dest_dir, new_filename=dest_filename)
        dest_seen.append(task_input.get_path())
        logging.debug("Updated path: %s" % task_input.get_path())

    # Wait for all processes to finish
    for job_name in job_names:
        self.processor.wait_process(job_name)

    # Recursively give every permission to all files we just added
    logging.info("(%s) Final workspace perm. update for task '%s'..." % (self.processor.name, self.task_id))
    self.__grant_workspace_perms(job_name="grant_final_wrkspace_perms")