Ejemplo n.º 1
0
class ModuleExecutor(object):
    def __init__(self, task_id, processor, workspace, docker_image=None):
        self.task_id = task_id
        self.processor = processor
        self.workspace = workspace
        self.storage_helper = StorageHelper(self.processor)
        self.docker_helper = DockerHelper(self.processor)
        self.docker_image = docker_image

    def load_input(self, inputs):

        if self.processor.get_status() is Processor.OFF:
            # Create processor if it's off
            logging.info("Creating processor '%s' for task '%s'!" %
                         (self.processor.get_name(), self.task_id))
            self.processor.create()

        # Create workspace directory structure
        self.__create_workspace()

        # List of jobs that have been started in process of loading input
        job_names = []

        # Pull docker image if necessary
        if self.docker_image is not None:
            docker_image_name = self.docker_image.get_image_name().split(
                "/")[0]
            docker_image_name = docker_image_name.replace(":", "_")
            job_name = "docker_pull_%s" % docker_image_name
            self.docker_helper.pull(self.docker_image.get_image_name(),
                                    job_name=job_name)
            job_names.append(job_name)

        # Load input files
        # Inputs: list containing remote files, local files, and docker images
        seen = []
        count = 1
        for task_input in inputs:

            # Case: Transfer file into wrk directory if its not already there
            if task_input.get_transferrable_path() not in seen:

                # Transfer file to workspace directory
                src_path = task_input.get_transferrable_path()
                job_name = "load_input_%s_%s_%s" % (
                    self.task_id, task_input.get_type(), count)
                logging.debug("Input path: %s, transfer path: %s" %
                              (task_input.get_path(), src_path))
                self.storage_helper.mv(src_path=src_path,
                                       dest_path=self.workspace.get_wrk_dir(),
                                       job_name=job_name)

                # Add transfer path to list of remote paths that have been transferred to local workspace
                seen.append(src_path)
                count += 1
                job_names.append(job_name)

            # Update path after transferring to wrk directory
            task_input.update_path(new_dir=self.workspace.get_wrk_dir())
            logging.debug("Updated path: %s" % task_input.get_path())

        # Wait for all processes to finish
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Recursively give every permission to all files we just added
        logging.info("(%s) Final workspace perm. update for task '%s'..." %
                     (self.processor.name, self.task_id))
        self.__grant_workspace_perms(job_name="grant_final_wrkspace_perms")

    def run(self, cmd):
        # Job name
        job_name = self.task_id
        # Get name of docker image where command should be run (if any)
        docker_image_name = None if self.docker_image is None else self.docker_image.get_image_name(
        )
        # Begin running job and return stdout, stderr after job has finished running
        self.processor.run(job_name, cmd, docker_image=docker_image_name)
        return self.processor.wait_process(job_name)

    def save_output(self, outputs, final_output_types):
        # Return output files to workspace output dir

        # Get workspace places for output files
        final_output_dir = self.workspace.get_output_dir()
        tmp_output_dir = self.workspace.get_tmp_output_dir()
        count = 1
        job_names = []

        for output_file in outputs:
            if output_file.get_type() in final_output_types:
                dest_dir = final_output_dir
            else:
                dest_dir = tmp_output_dir

            # Calculate output file size
            job_name = "get_size_%s_%s_%s" % (self.task_id,
                                              output_file.get_type(), count)
            file_size = self.storage_helper.get_file_size(
                output_file.get_path(), job_name=job_name)
            output_file.set_size(file_size)

            # Transfer to correct output directory
            job_name = "save_output_%s_%s_%s" % (self.task_id,
                                                 output_file.get_type(), count)
            curr_path = output_file.get_transferrable_path()
            self.storage_helper.mv(curr_path, dest_dir, job_name=job_name)

            # Update path of output file to reflect new location
            job_names.append(job_name)
            output_file.update_path(new_dir=dest_dir)
            count += 1

        # Wait for transfers to complete
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Wait for output files to finish transferring
        self.processor.wait()

    def save_logs(self):
        # Move log files to final output log directory
        log_files = os.path.join(self.workspace.get_wrk_log_dir(), "*")
        final_log_dir = self.workspace.get_final_log_dir()
        self.storage_helper.mv(log_files,
                               final_log_dir,
                               job_name="return_logs",
                               log=False,
                               wait=True)

    def __create_workspace(self):
        # Create all directories specified in task workspace

        logging.info("(%s) Creating workspace for task '%s'..." %
                     (self.processor.name, self.task_id))
        for dir_type, dir_obj in self.workspace.get_workspace().iteritems():
            self.storage_helper.mkdir(dir_obj,
                                      job_name="mkdir_%s" % dir_type,
                                      wait=True)

        # Set processor wrk, log directories
        self.processor.set_wrk_dir(self.workspace.get_wrk_dir())
        self.processor.set_log_dir(self.workspace.get_wrk_log_dir())

        # Give everyone all the permissions on working directory
        logging.info("(%s) Updating workspace permissions..." %
                     self.processor.name)
        self.__grant_workspace_perms(job_name="grant_initial_wrkspace_perms")

        # Wait for all the above commands to complete
        logging.info("(%s) Successfully created workspace for task '%s'!" %
                     (self.processor.name, self.task_id))

    def __grant_workspace_perms(self, job_name):
        cmd = "sudo chmod -R 777 %s" % self.workspace.get_wrk_dir()
        self.processor.run(job_name=job_name, cmd=cmd)
        self.processor.wait_process(job_name)
Ejemplo n.º 2
0
try:
    sh = StorageHelper(proc)
    sh.mkdir("/home/alex_waldrop_jr/test/", wait=True)
    sh.mkdir("/home/gap/log/", wait=True)
    sh.mkdir("gs://derp_test/mkdir_test_mofo_2/", wait=True)
    proc.run("perms_gap", "sudo chmod -R 777 /home/gap/")
    proc.run("perms_awal", "sudo chmod -R 777 /home/alex_waldrop_jr/")
    proc.wait()
    print "local exists: %s" % sh.path_exists("/home/alex_waldrop_jr/test/")
    print "local exists: %s" % sh.path_exists("/home/gap/log/")
    print "cloud exists: %s" % sh.path_exists(
        "gs://derp_test/mkdir_test_mofo_2/")
    print "bad exists: %s" % sh.path_exists("/home/aasdasdfk")
    sh.mv("gs://derp_test/dummy.txt",
          "/home/alex_waldrop_jr/test/",
          log=False,
          wait=True)
    sh.mv("/home/alex_waldrop_jr/test/dummy.txt",
          "/home/alex_waldrop_jr/test/whoops_i_win.txt",
          log=False,
          wait=True)
    sh.mv("/home/alex_waldrop_jr/test/whoops_i_win.txt",
          "gs://derp_test/mkdir_test_mofo_2/",
          log=False,
          wait=True)
    print "good file exists: %s" % sh.path_exists(
        "/home/alex_waldrop_jr/test/dummy.txt")
    print "DNA file size: %s" % sh.get_file_size(
        "gs://davelab_data/ref/hg19/DNA")
    print "Dummy file size: %s" % sh.get_file_size(
        "/home/alex_waldrop_jr/test/whoops_i_win.txt")
Ejemplo n.º 3
0
class ModuleExecutor(object):
    def __init__(self, task_id, processor, workspace, docker_image=None):
        self.task_id = task_id
        self.processor = processor
        self.workspace = workspace
        self.storage_helper = StorageHelper(self.processor)
        self.docker_helper = DockerHelper(self.processor)
        self.docker_image = docker_image

        # Create workspace directory structure
        self.__create_workspace()

    def load_input(self, inputs):

        # List of jobs that have been started in process of loading input
        job_names = []

        # Pull docker image if necessary
        if self.docker_image is not None:
            docker_image_name = self.docker_image.get_image_name().split(
                "/")[0]
            docker_image_name = docker_image_name.replace(":", "_")
            job_name = "docker_pull_%s" % docker_image_name
            self.docker_helper.pull(self.docker_image.get_image_name(),
                                    job_name=job_name)
            job_names.append(job_name)

        # Load input files
        # Inputs: list containing remote files, local files, and docker images
        src_seen = []
        dest_seen = []
        count = 1
        batch_size = 5
        loading_counter = 0
        for task_input in inputs:

            # Don't transfer local files
            if ":" not in task_input.get_path():
                continue

            # Directory where input will be transferred
            dest_dir = self.workspace.get_wrk_dir()

            # Input filename after transfer (None = same as src)
            dest_filename = None

            # Case: Transfer file into wrk directory if its not already there
            if task_input.get_transferrable_path() not in src_seen:

                # Get name of file that's going to be transferred
                src_path = task_input.get_transferrable_path()
                job_name = "load_input_%s_%s_%s" % (
                    self.task_id, task_input.get_type(), count)
                logging.debug("Input path: %s, transfer path: %s" %
                              (task_input.get_path(), src_path))

                # Generate complete transfer path
                dest_path = os.path.join(dest_dir, task_input.filename)

                # Check to see if transferring file would overwrite existing file
                if dest_path in dest_seen:
                    # Add unique tag to destination filename to prevent overwrite
                    if task_input.sample_name is not None:
                        dest_filename = "{0}_{1}".format(
                            task_input.sample_name, task_input.filename)
                    else:
                        dest_filename = "{0}_{1}".format(
                            Platform.generate_unique_id(), dest_filename)
                    logging.debug(
                        "Changing filename from '{0}' to '{1}'.".format(
                            task_input.filename, dest_filename))
                    dest_path = os.path.join(dest_dir, dest_filename)
                else:
                    dest_filename = None
                    dest_path = dest_dir

                # Show the final log file
                logging.debug("Destination: {0}".format(dest_path))

                # Move file to dest_path
                self.storage_helper.mv(src_path=src_path,
                                       dest_path=dest_path,
                                       job_name=job_name)
                loading_counter += 1

                # Add transfer path to list of remote paths that have been transferred to local workspace
                src_seen.append(src_path)
                count += 1
                job_names.append(job_name)

                # If loading_counter is batch_size, clear out queue
                if loading_counter >= batch_size:
                    logging.debug("Batch size reached on task {0}".format(
                        self.task_id))
                    # Wait for all processes to finish
                    while len(job_names):
                        self.processor.wait_process(job_names.pop())
                    loading_counter = 0

            # Update path after transferring to wrk directory and add to list of files in working directory
            task_input.update_path(new_dir=dest_dir,
                                   new_filename=dest_filename)
            dest_seen.append(task_input.get_path())
            logging.debug("Updated path: %s" % task_input.get_path())

        # Wait for all processes to finish
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Recursively give every permission to all files we just added
        logging.info("(%s) Final workspace perm. update for task '%s'..." %
                     (self.processor.name, self.task_id))
        self.__grant_workspace_perms(job_name="grant_final_wrkspace_perms")

    def run(self, cmd, job_name=None):

        # Check or create job name
        if job_name is None:
            job_name = self.task_id

        # Get name of docker image where command should be run (if any)
        docker_image_name = None if self.docker_image is None else self.docker_image.get_image_name(
        )

        # Begin running job and return stdout, stderr after job has finished running
        self.processor.run(job_name, cmd, docker_image=docker_image_name)
        return self.processor.wait_process(job_name)

    def save_output(self, outputs, final_output_types):
        # Return output files to workspace output dir

        # Get workspace places for output files
        final_output_dir = self.workspace.get_output_dir()
        tmp_output_dir = self.workspace.get_tmp_output_dir()
        count = 1
        job_names = []

        # List of output file paths. We create this list to ensure the files are not being overwritten
        output_filepaths = []

        for output_file in outputs:
            if output_file.get_type() in final_output_types:
                dest_dir = final_output_dir
            else:
                dest_dir = tmp_output_dir

            # Calculate output file size
            job_name = "get_size_%s_%s_%s" % (self.task_id,
                                              output_file.get_type(), count)
            file_size = self.storage_helper.get_file_size(
                output_file.get_path(), job_name=job_name)
            output_file.set_size(file_size)

            # Check if there already exists a file with the same name on the bucket
            destination_path = "{0}/{1}/".format(dest_dir.rstrip("/"),
                                                 output_file.get_filename())
            if destination_path in output_filepaths:

                # Change the destination directory for a new subdirectory
                dest_dir = "{0}/{1}/".format(dest_dir.rstrip("/"),
                                             len(output_filepaths))

                # Regenerate the destination path
                new_destination_path = "{0}/{1}".format(
                    dest_dir.rstrip("/"), output_file.get_filename())

                # Add the new path to the output file paths
                output_filepaths.append(new_destination_path)

            else:
                # Just add the new path to the list of output file paths
                output_filepaths.append(destination_path)

            # Transfer to correct output directory
            job_name = "save_output_%s_%s_%s" % (self.task_id,
                                                 output_file.get_type(), count)
            curr_path = output_file.get_transferrable_path()
            self.storage_helper.mv(curr_path, dest_dir, job_name=job_name)

            # Update path of output file to reflect new location
            job_names.append(job_name)
            output_file.update_path(new_dir=dest_dir)
            logging.debug(
                "(%s) Transferring file '%s' from old path '%s' to new path '%s' ('%s')"
                %
                (self.task_id, output_file.get_type(), curr_path,
                 output_file.get_path(), output_file.get_transferrable_path()))

            count += 1

        # Wait for transfers to complete
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Wait for output files to finish transferring
        self.processor.wait()

    def save_logs(self):
        # Move log files to final output log directory
        log_files = os.path.join(self.workspace.get_wrk_log_dir(), "*")
        final_log_dir = self.workspace.get_final_log_dir()
        self.storage_helper.mv(log_files,
                               final_log_dir,
                               job_name="return_logs",
                               log=False,
                               wait=True)

    def __create_workspace(self):
        # Create all directories specified in task workspace

        logging.info("(%s) Creating workspace for task '%s'..." %
                     (self.processor.name, self.task_id))
        for dir_type, dir_obj in self.workspace.get_workspace().items():
            self.storage_helper.mkdir(dir_obj,
                                      job_name="mkdir_%s" % dir_type,
                                      wait=True)

        # Set processor wrk, log directories
        self.processor.set_wrk_dir(self.workspace.get_wrk_dir())
        self.processor.set_wrk_out_dir(self.workspace.get_wrk_out_dir())
        self.processor.set_log_dir(self.workspace.get_wrk_log_dir())

        # Give everyone all the permissions on working directory
        logging.info("(%s) Updating workspace permissions..." %
                     self.processor.name)
        self.__grant_workspace_perms(job_name="grant_initial_wrkspace_perms")

        # Wait for all the above commands to complete
        logging.info("(%s) Successfully created workspace for task '%s'!" %
                     (self.processor.name, self.task_id))

    def __grant_workspace_perms(self, job_name):
        cmd = "sudo chmod -R 777 %s" % self.workspace.get_wrk_dir()
        self.processor.run(job_name=job_name, cmd=cmd)
        self.processor.wait_process(job_name)