Example #1
    def define_output(self):
        # Declare GVCF output filename
        randomer = Platform.generate_unique_id()
        gvcf = self.generate_unique_file_name(extension="{0}.g.vcf".format(randomer))
        self.add_output("gvcf", gvcf)
        # Declare GVCF index output filename
        gvcf_idx = self.generate_unique_file_name(extension="{0}.g.vcf.idx".format(randomer))
        self.add_output("gvcf_idx", gvcf_idx)
Example #2
    def define_output(self):

        # Generate randomer string to attach with file name
        randomer = Platform.generate_unique_id()

        # Declare VCF output file
        vcf = self.generate_unique_file_name(extension=f'{randomer}.vcf.gz')
        self.add_output("vcf_gz", vcf)

        # Declare VCF index output filename
        vcf_idx = self.generate_unique_file_name(
            extension=f'{randomer}.vcf.gz.tbi')
        self.add_output("vcf_idx", vcf_idx)
Example #3
    def mkdir(self, dir_path, job_name=None, log=False, wait=False, **kwargs):
        # Make a directory if it doesn't already exist
        cmd_generator = StorageHelper.__get_storage_cmd_generator(dir_path)
        cmd = cmd_generator.mkdir(dir_path)

        job_name = "mkdir_%s" % Platform.generate_unique_id(
        ) if job_name is None else job_name

        # Optionally add logging
        cmd = "%s !LOG3!" % cmd if log else cmd

        # Run command and return job name
        self.proc.run(job_name, cmd, **kwargs)
        if wait:
            self.proc.wait_process(job_name)
        return job_name
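A minimal usage sketch, assuming `storage_helper` is an initialized StorageHelper wired to a processor that can run shell commands (names and paths are illustrative):

    # Create a directory, log the command, and block until it finishes
    job = storage_helper.mkdir("/data/output/", log=True, wait=True)
    # `job` holds the generated job name, e.g. "mkdir_1a2b3c4d5e"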
Example #4
    def define_output(self):
        # Generate unique ID to attach to output file names
        randomer = Platform.generate_unique_id()
        # Generate unique file names based on the output mode set for HaplotypeCaller
        if self.get_argument("output_type") == "gvcf":
            # Declare GVCF output filename
            gvcf = self.generate_unique_file_name(extension="{0}.g.vcf".format(randomer))
            self.add_output("gvcf", gvcf)
            # Declare GVCF index output filename
            gvcf_idx = self.generate_unique_file_name(extension="{0}.g.vcf.idx".format(randomer))
            self.add_output("gvcf_idx", gvcf_idx)
        else:
            # Declare VCF output filename
            vcf = self.generate_unique_file_name(extension="{0}.vcf".format(randomer))
            self.add_output("vcf", vcf)
            # Declare VCF index output filename
            vcf_idx = self.generate_unique_file_name(extension="{0}.vcf.idx".format(randomer))
            self.add_output("vcf_idx", vcf_idx)
Example #5
    def rm(self, path, job_name=None, log=True, wait=False, **kwargs):
        # Delete file from file system
        # Log the transfer unless otherwise specified
        cmd_generator = StorageHelper.__get_storage_cmd_generator(path)
        cmd = cmd_generator.rm(path)

        job_name = "rm_%s" % Platform.generate_unique_id(
        ) if job_name is None else job_name

        # Optionally add logging
        cmd = "%s !LOG3!" % cmd if log else cmd

        # Run command and return job name
        self.proc.run(job_name, cmd, **kwargs)
        if wait:
            self.proc.wait_process(job_name)
        return job_name
Example #6
    def __init__(self, wrk_dir, tmp_output_dir, final_output_dir):
        # Work dir: Folder where input/output files will be generated by task
        # tmp_output_dir: Folder where temporary final output will be saved until all tasks are finished
        # final_output_dir: Folder where final output files will be saved
        self.workspace = {
            "wrk": wrk_dir,
            "tmp_output": tmp_output_dir,
            "final_output": final_output_dir
        }

        # Define wrk/final log directories
        self.workspace["wrk_log"] = os.path.join(wrk_dir, "log/")
        self.workspace["final_log"] = os.path.join(final_output_dir, "log/")

        # Standardize directory paths
        for dir_type, dir_path in self.workspace.items():
            dir_path = Platform.standardize_dir(dir_path)
            self.workspace[dir_type] = dir_path
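The final loop assumes `Platform.standardize_dir()` normalizes each directory path; a minimal sketch of that assumed behavior (for example, guaranteeing exactly one trailing slash):

    def standardize_dir(dir_path):
        # Hypothetical stand-in for Platform.standardize_dir()
        return dir_path.rstrip("/") + "/"

    standardize_dir("/data/wrk")     # -> "/data/wrk/"
    standardize_dir("/data/wrk///")  # -> "/data/wrk/"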
Example #7
    def path_exists(self, path, job_name=None, **kwargs):
        # Return true if file exists, false otherwise
        cmd_generator = StorageHelper.__get_storage_cmd_generator(path)
        cmd = cmd_generator.ls(path)

        # Run command and return job name
        job_name = "check_exists_%s" % Platform.generate_unique_id(
        ) if job_name is None else job_name
        self.proc.run(job_name, cmd, quiet_failure=False, **kwargs)

        # Wait for cmd to finish and get output
        try:
            self.proc.wait_process(job_name)
            return True
        except RuntimeError as e:
            if str(e) != "":
                logging.debug("StorageHelper error for %s:\n%s" %
                              (job_name, e))
            return False
        except:
            logging.error("Unable to check path existence: %s" % path)
            raise
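A minimal usage sketch, assuming `storage_helper` is an initialized StorageHelper (the bucket path is illustrative):

    import logging

    # True/False depending on whether the listing command succeeds
    if not storage_helper.path_exists("gs://my-bucket/sample.bam"):
        logging.error("Input file is missing!")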
Example #8
    def get_file_size(self, path, job_name=None, **kwargs):
        # Return file size in gigabytes
        cmd_generator = StorageHelper.__get_storage_cmd_generator(path)
        cmd = cmd_generator.get_file_size(path)

        # Run command and return job name
        job_name = "get_size_%s" % Platform.generate_unique_id(
        ) if job_name is None else job_name
        self.proc.run(job_name, cmd, **kwargs)

        # Wait for cmd to finish and get output
        try:
            # Try to return file size in gigabytes
            out, err = self.proc.wait_process(job_name)
            # Iterate over all files if multiple files (can happen if wildcard)
            file_sizes = [int(x.split()[0]) for x in out.split("\n") if x != ""]
            # Sum the sizes and convert from bytes to gigabytes (1 GB = 1024**3 bytes here)
            return sum(file_sizes) / (1024**3.0)

        except BaseException as e:
            logging.error("Unable to get file size: %s" % path)
            if str(e) != "":
                logging.error("Received the following msg:\n%s" % e)
            raise
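For reference, the conversion above treats 1024**3 bytes as one gigabyte; a quick worked example with two files listed in a `du`-style output:

    out = "1073741824 /data/a.bam\n536870912 /data/b.bam\n"
    file_sizes = [int(x.split()[0]) for x in out.split("\n") if x != ""]
    print(sum(file_sizes) / (1024**3.0))  # 1.5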
Example #9
    def mv(self,
           src_path,
           dest_path,
           job_name=None,
           log=True,
           wait=False,
           **kwargs):
        # Transfer file or dir from src_path to dest_path
        # Log the transfer unless otherwise specified
        cmd_generator = StorageHelper.__get_storage_cmd_generator(
            src_path, dest_path)
        cmd = cmd_generator.mv(src_path, dest_path)

        job_name = "mv_%s" % Platform.generate_unique_id(
        ) if job_name is None else job_name

        # Optionally add logging
        cmd = "%s !LOG3!" % cmd if log else cmd

        # Run command and return job name
        self.proc.run(job_name, cmd, **kwargs)
        if wait:
            self.proc.wait_process(job_name)
        return job_name
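A minimal usage sketch, assuming `storage_helper` is an initialized StorageHelper that exposes its processor as `proc` (as in the methods above); sources and destination are illustrative:

    # Launch several transfers, then wait on their job names
    jobs = []
    for src in ["gs://bucket/a.fastq.gz", "gs://bucket/b.fastq.gz"]:
        jobs.append(storage_helper.mv(src_path=src, dest_path="/data/wrk/"))
    for job in jobs:
        storage_helper.proc.wait_process(job)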
Example #10
    def define_output(self):

        randomer = Platform.generate_unique_id()
        vcf_gz = self.generate_unique_file_name(extension="{0}.vcf.gz".format(randomer))
        self.add_output("vcf_gz", vcf_gz)
Example #11
    def __create_job_def(self, post_processing_required=False):
        # initialize the job def body
        self.inst_name = self.name
        if self.job_count > 1:
            self.inst_name = self.inst_name + '-' + str(self.job_count)
        self.job_count += 1
        job_def = client.V1Job(kind="Job")
        job_def.metadata = client.V1ObjectMeta(namespace=self.namespace,
                                               name=self.inst_name)

        # initialize job pieces
        self.job_containers = []
        volume_mounts = []
        volumes = []
        containers = []
        init_containers = []
        env_variables = []

        if not self.volume_name:
            # use the task name so it can be used across multiple jobs
            self.volume_name = self.name + '-pd'

        # build volume mounts
        volume_mounts = []
        volume_mounts.append(
            client.V1VolumeMount(mount_path=self.wrk_dir,
                                 name=self.volume_name))

        cpu_request_max = self.nodepool_info['max_cpu'] - self.cpu_reserve
        mem_request_max = self.nodepool_info['max_mem'] - self.mem_reserve

        # define resource limits/requests
        resource_def = client.V1ResourceRequirements(
            limits={
                'cpu': cpu_request_max,
                'memory': str(mem_request_max) + 'G'
            },
            requests={
                'cpu': cpu_request_max * .8,
                'memory': str(mem_request_max - 1) + 'G'
            })

        # update script task with job info
        if self.script_task:
            self.script_task.cpu_request = cpu_request_max * .8
            self.script_task.cpu_max = cpu_request_max
            self.script_task.memory_request = mem_request_max - 1
            self.script_task.memory_max = mem_request_max
            self.script_task.instance_name = self.inst_name
            self.script_task.force_standard = not self.preemptible
            self.script_task.pool_name = str(self.node_label)
            self.script_task.instance_type = str(
                self.nodepool_info["inst_type"])

        # place the job in the appropriate node pool
        node_label_dict = {'poolName': str(self.node_label)}

        # build volumes
        volumes.append(
            client.V1Volume(
                name=self.volume_name,
                persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
                    claim_name=self.pvc_name)))

        # incorporate configured persistent volumes if associated with the current task
        if self.extra_persistent_volumes:
            for pv in self.extra_persistent_volumes:
                if pv['task_prefix'] in self.name:
                    claim_name = pv["pvc_name"]
                    if 'dynamic' in pv and pv['dynamic']:
                        claim_name = claim_name[:57] + '-' + Platform.generate_unique_id(id_len=5)
                    # need to add the extra persistent volume
                    volume_mounts.append(
                        client.V1VolumeMount(mount_path=pv["path"],
                                             name=pv['volume_name'],
                                             read_only=pv['read_only']))
                    volumes.append(
                        client.V1Volume(
                            name=pv['volume_name'],
                            persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
                                claim_name=claim_name)))

                    # specify volumes for script task
                    if self.script_task:
                        if 'dynamic' in pv and pv['dynamic']:
                            self.script_task.extra_volumes.append({
                                "path": pv["path"],
                                "name": pv["volume_name"],
                                "storage": pv["size"],
                                "read_only": pv["read_only"],
                                "claim_name": claim_name,
                                "command": pv["copy_command"],
                                "dynamic": True
                            })
                        else:
                            self.script_task.extra_volumes.append({
                                "path": pv["path"],
                                "name": pv["volume_name"],
                                "read_only": pv["read_only"],
                                "claim_name": claim_name,
                                "dynamic": False
                            })

        # incorporate configured secrets
        if self.gcp_secret_configured:
            volume_mounts.append(
                client.V1VolumeMount(
                    mount_path="/etc/cloud_conductor/gcp.json",
                    sub_path="gcp.json",
                    name="secret-volume",
                    read_only=True))
            volumes.append(
                client.V1Volume(name="secret-volume",
                                secret=client.V1SecretVolumeSource(
                                    secret_name="cloud-conductor-config",
                                    items=[
                                        client.V1KeyToPath(key="gcp_json",
                                                           path="gcp.json")
                                    ])))
            env_variables.append(
                client.V1EnvVar(name='GOOGLE_APPLICATION_CREDENTIALS',
                                value='/etc/cloud_conductor/gcp.json'))
            env_variables.append(
                client.V1EnvVar(name='RCLONE_CONFIG_GS_TYPE',
                                value='google cloud storage'))
            env_variables.append(
                client.V1EnvVar(name='RCLONE_CONFIG_GS_SERVICE_ACCOUNT_FILE',
                                value='$GOOGLE_APPLICATION_CREDENTIALS'))
            env_variables.append(
                client.V1EnvVar(name='RCLONE_CONFIG_GS_OBJECT_ACL',
                                value='projectPrivate'))
            env_variables.append(
                client.V1EnvVar(name='RCLONE_CONFIG_GS_BUCKET_ACL',
                                value='projectPrivate'))

        if self.aws_secret_configured:
            env_variables.append(
                client.V1EnvVar(
                    name='AWS_ACCESS_KEY_ID',
                    value_from=client.V1EnvVarSource(
                        secret_key_ref=client.V1SecretKeySelector(
                            name='cloud-conductor-config', key='aws_id'))))
            env_variables.append(
                client.V1EnvVar(
                    name='AWS_SECRET_ACCESS_KEY',
                    value_from=client.V1EnvVarSource(
                        secret_key_ref=client.V1SecretKeySelector(
                            name='cloud-conductor-config', key='aws_access'))))
            env_variables.append(
                client.V1EnvVar(name='RCLONE_CONFIG_S3_TYPE', value='s3'))
            env_variables.append(
                client.V1EnvVar(
                    name='RCLONE_CONFIG_S3_ACCESS_KEY_ID',
                    value_from=client.V1EnvVarSource(
                        secret_key_ref=client.V1SecretKeySelector(
                            name='cloud-conductor-config', key='aws_id'))))
            env_variables.append(
                client.V1EnvVar(
                    name='RCLONE_CONFIG_S3_SECRET_ACCESS_KEY',
                    value_from=client.V1EnvVarSource(
                        secret_key_ref=client.V1SecretKeySelector(
                            name='cloud-conductor-config', key='aws_access'))))

        storage_image = 'gcr.io/cloud-builders/gsutil'
        storage_tasks = ['mkdir_', 'grant_']
        container_name_list = []

        for k, v in self.processes.items():
            # if the process is for storage (i.e. mkdir, etc.)
            entrypoint = ["/bin/bash", "-c"]
            if any(x in k for x in storage_tasks) or not v['docker_image']:
                container_image = storage_image
            else:
                container_image = v['docker_image']
                if v['docker_entrypoint'] is not None and v['original_cmd'].find(v['docker_entrypoint']) == -1:
                    v['original_cmd'] = v['docker_entrypoint'] + ' ' + v['original_cmd']
                if 'rclone' in container_image:
                    v['original_cmd'] = v['original_cmd'].replace(
                        "|&", "2>&1 |")
                    entrypoint = ["/bin/sh", "-c"]
            args = v['original_cmd']
            if not isinstance(args, list):
                args = [v['original_cmd'].replace("sudo ", "")]
            args = " && ".join(args)
            args = args.replace("\n", " ")
            args = args.replace("java.io.tmpdir=/tmp/",
                                "java.io.tmpdir=/data/tmp/")

            if "awk " in args:
                args = re.sub("'\"'\"'", "'", args)

            if "gsutil" in args:
                args = "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS && sleep 10; " + args

            # add in pipe error handling
            # if "copy_input" in k or "copy_output" in k:
            #     args = "set -o pipefail && " + args

            logging.debug(f"({self.name}) Command for task {k} is : {args}")

            # format the container name and roll call to logging
            container_name = k.replace("_", "-").replace(".", "-").lower()
            formatted_container_name = container_name[:57] + '-' + Platform.generate_unique_id(id_len=5)
            while formatted_container_name in container_name_list:
                # make sure all container names are unique
                formatted_container_name = container_name[:57] + '-' + Platform.generate_unique_id(id_len=5)
            container_name_list.append(formatted_container_name)

            # args = f">&2 echo STARTING TASK {container_name} && " + args

            containers.append(
                client.V1Container(
                    # lifecycle=client.V1Lifecycle(post_start=post_start_handler),
                    image=container_image,
                    command=entrypoint,
                    args=[args],
                    name=formatted_container_name,
                    volume_mounts=volume_mounts,
                    env=env_variables,
                    resources=resource_def,
                    image_pull_policy='IfNotPresent'))

            if self.script_task and container_name not in self.script_task.commands:
                self.script_task.commands[container_name] = ({
                    "name": formatted_container_name,
                    "docker_image": container_image,
                    "entrypoint": entrypoint,
                    "args": [args]
                })

        job_spec = dict(backoff_limit=self.default_num_cmd_retries)

        self.job_containers = containers

        # Run jobs in order using init_containers
        # See https://kubernetes.io/docs/concepts/workloads/pods/init-containers/
        if len(containers) > 1:
            init_containers = containers[:-1]
            containers = [containers[-1]]
        else:
            init_containers = None

        # define the pod spec
        job_template = client.V1PodTemplateSpec()
        job_labels = {}
        job_labels[self.inst_name] = 'CC-Job'
        # add annotation to prevent autoscaler from killing nodes running jobs
        annotations = {
            'cluster-autoscaler.kubernetes.io/safe-to-evict': 'false'
        }
        job_template.metadata = client.V1ObjectMeta(labels=job_labels,
                                                    annotations=annotations)
        job_template.spec = client.V1PodSpec(
            init_containers=init_containers,
            containers=containers,
            volumes=volumes,
            restart_policy='Never',
            termination_grace_period_seconds=self.termination_seconds,
            node_selector=node_label_dict)

        job_def.spec = client.V1JobSpec(template=job_template, **job_spec)

        if self.script_task:
            self.script_task.num_retries = self.default_num_cmd_retries
            for k, v in job_labels.items():
                self.script_task.labels.append({"key": k, "value": v})
            for k, v in annotations.items():
                self.script_task.annotations.append({"key": k, "value": v})

        return job_def
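The returned `V1Job` can then be submitted with the official Kubernetes Python client; a minimal sketch under the assumption that kubeconfig (or in-cluster) credentials are available:

    from kubernetes import client, config

    def submit_job(job_def, namespace):
        # Illustrative helper (not from the source): submit a V1Job built by __create_job_def()
        config.load_kube_config()  # or config.load_incluster_config() inside a pod
        return client.BatchV1Api().create_namespaced_job(namespace=namespace, body=job_def)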
Example #12
    def load_input(self, inputs):

        # List of jobs that have been started in process of loading input
        job_names = []

        # Pull docker image if necessary
        if self.docker_image is not None:
            docker_image_name = self.docker_image.get_image_name().split("/")[0]
            docker_image_name = docker_image_name.replace(":", "_")
            job_name = "docker_pull_%s" % docker_image_name
            self.docker_helper.pull(self.docker_image.get_image_name(),
                                    job_name=job_name)
            job_names.append(job_name)

        # Load input files
        # Inputs: list containing remote files, local files, and docker images
        src_seen = []
        dest_seen = []
        count = 1
        batch_size = 5
        loading_counter = 0
        for task_input in inputs:

            # Don't transfer local files
            if ":" not in task_input.get_path():
                continue

            # Directory where input will be transferred
            dest_dir = self.workspace.get_wrk_dir()

            # Input filename after transfer (None = same as src)
            dest_filename = None

            # Case: Transfer file into wrk directory if it's not already there
            if task_input.get_transferrable_path() not in src_seen:

                # Get name of file that's going to be transferred
                src_path = task_input.get_transferrable_path()
                job_name = "load_input_%s_%s_%s" % (
                    self.task_id, task_input.get_type(), count)
                logging.debug("Input path: %s, transfer path: %s" %
                              (task_input.get_path(), src_path))

                # Generate complete transfer path
                dest_path = os.path.join(dest_dir, task_input.filename)

                # Check to see if transferring file would overwrite existing file
                if dest_path in dest_seen:
                    # Add unique tag to destination filename to prevent overwrite
                    if task_input.sample_name is not None:
                        dest_filename = "{0}_{1}".format(
                            task_input.sample_name, task_input.filename)
                    else:
                        dest_filename = "{0}_{1}".format(
                            Platform.generate_unique_id(), dest_filename)
                    logging.debug(
                        "Changing filename from '{0}' to '{1}'.".format(
                            task_input.filename, dest_filename))
                    dest_path = os.path.join(dest_dir, dest_filename)
                else:
                    dest_filename = None
                    dest_path = dest_dir

                # Show the final log file
                logging.debug("Destination: {0}".format(dest_path))

                # Move file to dest_path
                self.storage_helper.mv(src_path=src_path,
                                       dest_path=dest_path,
                                       job_name=job_name)
                loading_counter += 1

                # Add transfer path to list of remote paths that have been transferred to local workspace
                src_seen.append(src_path)
                count += 1
                job_names.append(job_name)

                # If loading_counter is batch_size, clear out queue
                if loading_counter >= batch_size:
                    logging.debug("Batch size reached on task {0}".format(
                        self.task_id))
                    # Wait for all processes to finish
                    while len(job_names):
                        self.processor.wait_process(job_names.pop())
                    loading_counter = 0

            # Update path after transferring to wrk directory and add to list of files in working directory
            task_input.update_path(new_dir=dest_dir,
                                   new_filename=dest_filename)
            dest_seen.append(task_input.get_path())
            logging.debug("Updated path: %s" % task_input.get_path())

        # Wait for all processes to finish
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Recursively give every permission to all files we just added
        logging.info("(%s) Final workspace perm. update for task '%s'..." %
                     (self.processor.name, self.task_id))
        self.__grant_workspace_perms(job_name="grant_final_wrkspace_perms")
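The batching logic above can be illustrated in isolation; a sketch of the pattern under the assumption that the storage helper exposes its processor as `proc` (all names are hypothetical):

    def transfer_in_batches(storage_helper, sources, dest_dir, batch_size=5):
        # Launch transfers and drain the queue whenever `batch_size` jobs are in flight
        job_names = []
        for i, src in enumerate(sources):
            job = "load_input_%d" % i
            storage_helper.mv(src_path=src, dest_path=dest_dir, job_name=job)
            job_names.append(job)
            if len(job_names) >= batch_size:
                while job_names:
                    storage_helper.proc.wait_process(job_names.pop())
        # Wait for any remaining transfers
        for job in job_names:
            storage_helper.proc.wait_process(job)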