Example #1
    def __init__(self, task_id, processor, workspace, docker_image=None):
        self.task_id = task_id
        self.processor = processor
        self.workspace = workspace
        self.storage_helper = StorageHelper(self.processor)
        self.docker_helper = DockerHelper(self.processor)
        self.docker_image = docker_image
Example #2
    def __init__(self, task_id, processor, workspace, docker_image=None):
        self.task_id = task_id
        self.processor = processor
        self.workspace = workspace
        self.storage_helper = StorageHelper(self.processor)
        self.docker_helper = DockerHelper(self.processor)
        self.docker_image = docker_image

        # Create workspace directory structure
        self.__create_workspace()
Example #3
    def validate(self):

        # Assume all validations are working
        has_errors = False

        # Validate the sample set
        sample_validator = SampleValidator(self.sample_data)
        has_errors = sample_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Sample sheet validated!")

        # Validate the graph
        graph_validator = GraphValidator(self.graph, self.resource_kit, self.sample_data)
        has_errors = graph_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Graph validated!")

        # Validate the platform
        self.platform.validate()

        # Stop the pipeline before launching if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Create helper processor and storage/docker helpers for checking input files
        self.helper_processor   = self.platform.get_helper_processor()
        self.helper_processor.create()

        self.storage_helper     = StorageHelper(self.helper_processor)
        self.docker_helper      = DockerHelper(self.helper_processor)

        # Validate all pipeline inputs can be found on platform
        input_validator = InputValidator(self.resource_kit, self.sample_data, self.storage_helper, self.docker_helper)
        has_errors = input_validator.validate() or has_errors

        # Stop the pipeline if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Validate that pipeline workspace can be created
        workspace = self.datastore.get_task_workspace()
        for dir_type, dir_path in workspace.get_workspace().items():
            self.storage_helper.mkdir(dir_path=str(dir_path), job_name="mkdir_%s" % dir_type, wait=True)
        logging.info("CloudCounductor run validated! Beginning pipeline execution.")
Example #4
    def __init__(self,
                 task_id,
                 processor,
                 final_output_dir,
                 final_tmp_dir,
                 docker_image=None):
        self.task_id = task_id
        self.processor = processor
        self.storage_helper = StorageHelper(self.processor)
        self.docker_helper = DockerHelper(self.processor)
        self.docker_image = docker_image

        self.final_output_dir = final_output_dir
        self.final_tmp_dir = final_tmp_dir

        # Create workspace directory structure
        self.__create_workspace()
Example #5
    def validate(self):

        # Assume all validations are working
        has_errors = False

        # Validate the sample set
        sample_validator = SampleValidator(self.sample_data)
        has_errors = sample_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Sample sheet validated!")

        # Validate the graph
        graph_validator = GraphValidator(self.graph, self.resource_kit,
                                         self.sample_data)
        has_errors = graph_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Graph validated!")

        # Validate the platform
        self.platform.validate()

        # Stop the pipeline before launching if there are any errors
        if has_errors:
            raise SystemError(
                "One or more errors have been encountered during validation. "
                "See the above logs for more information")

        # Create storage/docker helpers for checking input files
        self.storage_helper = StorageHelper(None)
        self.docker_helper = DockerHelper(None)

        # Validate all pipeline inputs can be found on platform
        input_validator = InputValidator(self.resource_kit, self.sample_data,
                                         self.storage_helper,
                                         self.docker_helper)
        has_errors = input_validator.validate() or has_errors

        # Stop the pipeline if there are any errors
        if has_errors:
            raise SystemError(
                "One or more errors have been encountered during validation. "
                "See the above logs for more information")

        logging.info(
            "CloudConductor run validated! Beginning pipeline execution.")
Example #6
rk = ResourceKit(rk_good)

ss_file = "/home/alex/Desktop/projects/gap/test/ss.json"
ss = SampleSet(ss_file)

proc = GoogleStandardProcessor(
    "test-proc-4",
    4,
    12,
    75,
    zone="us-east1-c",
    service_acct="*****@*****.**",
    boot_disk_size=75,
    disk_image="davelab-image-docker")

sh = StorageHelper(proc)
dh = DockerHelper(proc)
iv = InputValidator(rk, ss, sh, dh)

try:
    proc.create()
    proc.__configure_SSH()
    print "We validatin'"
    print iv.validate()
    print "We done validatin'"

finally:
    proc.destroy(wait=False)

exit(0)
Example #7
class ModuleExecutor(object):
    def __init__(self, task_id, processor, workspace, docker_image=None):
        self.task_id = task_id
        self.processor = processor
        self.workspace = workspace
        self.storage_helper = StorageHelper(self.processor)
        self.docker_helper = DockerHelper(self.processor)
        self.docker_image = docker_image

    def load_input(self, inputs):

        if self.processor.get_status() is Processor.OFF:
            # Create processor if it's off
            logging.info("Creating processor '%s' for task '%s'!" %
                         (self.processor.get_name(), self.task_id))
            self.processor.create()

        # Create workspace directory structure
        self.__create_workspace()

        # List of jobs that have been started in process of loading input
        job_names = []

        # Pull docker image if necessary
        if self.docker_image is not None:
            docker_image_name = self.docker_image.get_image_name().split(
                "/")[0]
            docker_image_name = docker_image_name.replace(":", "_")
            job_name = "docker_pull_%s" % docker_image_name
            self.docker_helper.pull(self.docker_image.get_image_name(),
                                    job_name=job_name)
            job_names.append(job_name)

        # Load input files
        # Inputs: list containing remote files, local files, and docker images
        seen = []
        count = 1
        for task_input in inputs:

            # Case: Transfer file into wrk directory if it's not already there
            if task_input.get_transferrable_path() not in seen:

                # Transfer file to workspace directory
                src_path = task_input.get_transferrable_path()
                job_name = "load_input_%s_%s_%s" % (
                    self.task_id, task_input.get_type(), count)
                logging.debug("Input path: %s, transfer path: %s" %
                              (task_input.get_path(), src_path))
                self.storage_helper.mv(src_path=src_path,
                                       dest_path=self.workspace.get_wrk_dir(),
                                       job_name=job_name)

                # Add transfer path to list of remote paths that have been transferred to local workspace
                seen.append(src_path)
                count += 1
                job_names.append(job_name)

            # Update path after transferring to wrk directory
            task_input.update_path(new_dir=self.workspace.get_wrk_dir())
            logging.debug("Updated path: %s" % task_input.get_path())

        # Wait for all processes to finish
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Recursively give every permission to all files we just added
        logging.info("(%s) Final workspace perm. update for task '%s'..." %
                     (self.processor.name, self.task_id))
        self.__grant_workspace_perms(job_name="grant_final_wrkspace_perms")

    def run(self, cmd):
        # Job name
        job_name = self.task_id
        # Get name of docker image where command should be run (if any)
        docker_image_name = None if self.docker_image is None else self.docker_image.get_image_name()
        # Begin running job and return stdout, stderr after job has finished running
        self.processor.run(job_name, cmd, docker_image=docker_image_name)
        return self.processor.wait_process(job_name)

    def save_output(self, outputs, final_output_types):
        # Return output files to workspace output dir

        # Get workspace places for output files
        final_output_dir = self.workspace.get_output_dir()
        tmp_output_dir = self.workspace.get_tmp_output_dir()
        count = 1
        job_names = []

        for output_file in outputs:
            if output_file.get_type() in final_output_types:
                dest_dir = final_output_dir
            else:
                dest_dir = tmp_output_dir

            # Calculate output file size
            job_name = "get_size_%s_%s_%s" % (self.task_id,
                                              output_file.get_type(), count)
            file_size = self.storage_helper.get_file_size(
                output_file.get_path(), job_name=job_name)
            output_file.set_size(file_size)

            # Transfer to correct output directory
            job_name = "save_output_%s_%s_%s" % (self.task_id,
                                                 output_file.get_type(), count)
            curr_path = output_file.get_transferrable_path()
            self.storage_helper.mv(curr_path, dest_dir, job_name=job_name)

            # Update path of output file to reflect new location
            job_names.append(job_name)
            output_file.update_path(new_dir=dest_dir)
            count += 1

        # Wait for transfers to complete
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Wait for output files to finish transferring
        self.processor.wait()

    def save_logs(self):
        # Move log files to final output log directory
        log_files = os.path.join(self.workspace.get_wrk_log_dir(), "*")
        final_log_dir = self.workspace.get_final_log_dir()
        self.storage_helper.mv(log_files,
                               final_log_dir,
                               job_name="return_logs",
                               log=False,
                               wait=True)

    def __create_workspace(self):
        # Create all directories specified in task workspace

        logging.info("(%s) Creating workspace for task '%s'..." %
                     (self.processor.name, self.task_id))
        for dir_type, dir_obj in self.workspace.get_workspace().items():
            self.storage_helper.mkdir(dir_obj,
                                      job_name="mkdir_%s" % dir_type,
                                      wait=True)

        # Set processor wrk, log directories
        self.processor.set_wrk_dir(self.workspace.get_wrk_dir())
        self.processor.set_log_dir(self.workspace.get_wrk_log_dir())

        # Give everyone all the permissions on working directory
        logging.info("(%s) Updating workspace permissions..." %
                     self.processor.name)
        self.__grant_workspace_perms(job_name="grant_initial_wrkspace_perms")

        # Wait for all the above commands to complete
        logging.info("(%s) Successfully created workspace for task '%s'!" %
                     (self.processor.name, self.task_id))

    def __grant_workspace_perms(self, job_name):
        cmd = "sudo chmod -R 777 %s" % self.workspace.get_wrk_dir()
        self.processor.run(job_name=job_name, cmd=cmd)
        self.processor.wait_process(job_name)
Example #8
class ModuleExecutor(object):
    def __init__(self, task_id, processor, workspace, docker_image=None):
        self.task_id = task_id
        self.processor = processor
        self.workspace = workspace
        self.storage_helper = StorageHelper(self.processor)
        self.docker_helper = DockerHelper(self.processor)
        self.docker_image = docker_image

        # Create workspace directory structure
        self.__create_workspace()

    def load_input(self, inputs):

        # List of jobs that have been started in process of loading input
        job_names = []

        # Pull docker image if necessary
        if self.docker_image is not None:
            docker_image_name = self.docker_image.get_image_name().split(
                "/")[0]
            docker_image_name = docker_image_name.replace(":", "_")
            job_name = "docker_pull_%s" % docker_image_name
            self.docker_helper.pull(self.docker_image.get_image_name(),
                                    job_name=job_name)
            job_names.append(job_name)

        # Load input files
        # Inputs: list containing remote files, local files, and docker images
        src_seen = []
        dest_seen = []
        count = 1
        batch_size = 5
        loading_counter = 0
        for task_input in inputs:

            # Don't transfer local files
            if ":" not in task_input.get_path():
                continue

            # Directory where input will be transferred
            dest_dir = self.workspace.get_wrk_dir()

            # Input filename after transfer (None = same as src)
            dest_filename = None

            # Case: Transfer file into wrk directory if it's not already there
            if task_input.get_transferrable_path() not in src_seen:

                # Get name of file that's going to be transferred
                src_path = task_input.get_transferrable_path()
                job_name = "load_input_%s_%s_%s" % (
                    self.task_id, task_input.get_type(), count)
                logging.debug("Input path: %s, transfer path: %s" %
                              (task_input.get_path(), src_path))

                # Generate complete transfer path
                dest_path = os.path.join(dest_dir, task_input.filename)

                # Check to see if transferring file would overwrite existing file
                if dest_path in dest_seen:
                    # Add unique tag to destination filename to prevent overwrite
                    if task_input.sample_name is not None:
                        dest_filename = "{0}_{1}".format(
                            task_input.sample_name, task_input.filename)
                    else:
                        dest_filename = "{0}_{1}".format(
                            Platform.generate_unique_id(), dest_filename)
                    logging.debug(
                        "Changing filename from '{0}' to '{1}'.".format(
                            task_input.filename, dest_filename))
                    dest_path = os.path.join(dest_dir, dest_filename)
                else:
                    dest_filename = None
                    dest_path = dest_dir

                # Log the final destination path
                logging.debug("Destination: {0}".format(dest_path))

                # Move file to dest_path
                self.storage_helper.mv(src_path=src_path,
                                       dest_path=dest_path,
                                       job_name=job_name)
                loading_counter += 1

                # Add transfer path to list of remote paths that have been transferred to local workspace
                src_seen.append(src_path)
                count += 1
                job_names.append(job_name)

                # If loading_counter has reached batch_size, clear out the queue
                if loading_counter >= batch_size:
                    logging.debug("Batch size reached on task {0}".format(
                        self.task_id))
                    # Wait for all processes to finish
                    while len(job_names):
                        self.processor.wait_process(job_names.pop())
                    loading_counter = 0

            # Update path after transferring to wrk directory and add to list of files in working directory
            task_input.update_path(new_dir=dest_dir,
                                   new_filename=dest_filename)
            dest_seen.append(task_input.get_path())
            logging.debug("Updated path: %s" % task_input.get_path())

        # Wait for all processes to finish
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Recursively give every permission to all files we just added
        logging.info("(%s) Final workspace perm. update for task '%s'..." %
                     (self.processor.name, self.task_id))
        self.__grant_workspace_perms(job_name="grant_final_wrkspace_perms")

    def run(self, cmd, job_name=None):

        # Check or create job name
        if job_name is None:
            job_name = self.task_id

        # Get name of docker image where command should be run (if any)
        docker_image_name = None if self.docker_image is None else self.docker_image.get_image_name()

        # Begin running job and return stdout, stderr after job has finished running
        self.processor.run(job_name, cmd, docker_image=docker_image_name)
        return self.processor.wait_process(job_name)

    def save_output(self, outputs, final_output_types):
        # Return output files to workspace output dir

        # Get workspace places for output files
        final_output_dir = self.workspace.get_output_dir()
        tmp_output_dir = self.workspace.get_tmp_output_dir()
        count = 1
        job_names = []

        # List of output file paths. We create this list to ensure the files are not being overwritten
        output_filepaths = []

        for output_file in outputs:
            if output_file.get_type() in final_output_types:
                dest_dir = final_output_dir
            else:
                dest_dir = tmp_output_dir

            # Calculate output file size
            job_name = "get_size_%s_%s_%s" % (self.task_id,
                                              output_file.get_type(), count)
            file_size = self.storage_helper.get_file_size(
                output_file.get_path(), job_name=job_name)
            output_file.set_size(file_size)

            # Check if there already exists a file with the same name on the bucket
            destination_path = "{0}/{1}/".format(dest_dir.rstrip("/"),
                                                 output_file.get_filename())
            if destination_path in output_filepaths:

                # Change the destination directory to a new, numbered subdirectory
                dest_dir = "{0}/{1}/".format(dest_dir.rstrip("/"),
                                             len(output_filepaths))

                # Regenerate the destination path
                new_destination_path = "{0}/{1}".format(
                    dest_dir.rstrip("/"), output_file.get_filename())

                # Add the new path to the output file paths
                output_filepaths.append(new_destination_path)

            else:
                # Just add the new path to the list of output file paths
                output_filepaths.append(destination_path)

            # Transfer to correct output directory
            job_name = "save_output_%s_%s_%s" % (self.task_id,
                                                 output_file.get_type(), count)
            curr_path = output_file.get_transferrable_path()
            self.storage_helper.mv(curr_path, dest_dir, job_name=job_name)

            # Update path of output file to reflect new location
            job_names.append(job_name)
            output_file.update_path(new_dir=dest_dir)
            logging.debug(
                "(%s) Transferring file '%s' from old path '%s' to new path '%s' ('%s')"
                %
                (self.task_id, output_file.get_type(), curr_path,
                 output_file.get_path(), output_file.get_transferrable_path()))

            count += 1

        # Wait for transfers to complete
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Wait for output files to finish transferring
        self.processor.wait()

    def save_logs(self):
        # Move log files to final output log directory
        log_files = os.path.join(self.workspace.get_wrk_log_dir(), "*")
        final_log_dir = self.workspace.get_final_log_dir()
        self.storage_helper.mv(log_files,
                               final_log_dir,
                               job_name="return_logs",
                               log=False,
                               wait=True)

    def __create_workspace(self):
        # Create all directories specified in task workspace

        logging.info("(%s) Creating workspace for task '%s'..." %
                     (self.processor.name, self.task_id))
        for dir_type, dir_obj in self.workspace.get_workspace().items():
            self.storage_helper.mkdir(dir_obj,
                                      job_name="mkdir_%s" % dir_type,
                                      wait=True)

        # Set processor wrk, log directories
        self.processor.set_wrk_dir(self.workspace.get_wrk_dir())
        self.processor.set_wrk_out_dir(self.workspace.get_wrk_out_dir())
        self.processor.set_log_dir(self.workspace.get_wrk_log_dir())

        # Give everyone all the permissions on working directory
        logging.info("(%s) Updating workspace permissions..." %
                     self.processor.name)
        self.__grant_workspace_perms(job_name="grant_initial_wrkspace_perms")

        # Wait for all the above commands to complete
        logging.info("(%s) Successfully created workspace for task '%s'!" %
                     (self.processor.name, self.task_id))

    def __grant_workspace_perms(self, job_name):
        cmd = "sudo chmod -R 777 %s" % self.workspace.get_wrk_dir()
        self.processor.run(job_name=job_name, cmd=cmd)
        self.processor.wait_process(job_name)
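A note on usage: ModuleExecutor (Examples #7 and #8) is driven entirely through its public load_input, run, save_output and save_logs methods. The sketch below shows one plausible call sequence under stated assumptions; task_processor, task_workspace, task_inputs, task_outputs, FINAL_OUTPUT_TYPES and the alignment command are hypothetical placeholders supplied by the surrounding framework, not part of the examples above.

# Hypothetical driver for ModuleExecutor (illustrative sketch only)
executor = ModuleExecutor(task_id="align_sample_1",
                          processor=task_processor,      # assumed Processor instance
                          workspace=task_workspace,      # assumed task workspace object
                          docker_image=None)

# Stage inputs into the working directory, run the command,
# then persist outputs and logs
executor.load_input(task_inputs)
out, err = executor.run("bwa mem -t 4 ref.fa sample_1.fastq > sample_1.sam")
executor.save_output(task_outputs, final_output_types=FINAL_OUTPUT_TYPES)
executor.save_logs()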
Example #9
class GAPipeline(object):

    def __init__(self, pipeline_id,
                 graph_config,
                 resource_kit_config,
                 sample_data_config,
                 platform_config,
                 platform_module,
                 final_output_dir):

        # GAP run id
        self.pipeline_id    = pipeline_id

        # Paths to config files
        self.__graph_config         = graph_config
        self.__res_kit_config       = resource_kit_config
        self.__sample_set_config    = sample_data_config
        self.__platform_config      = platform_config

        # Name of platform class where tasks will be executed
        self.__plat_module          = platform_module

        # Final output directory where output is saved
        self.__final_output_dir     = final_output_dir

        # Obtain pipeline name and append to final output dir

        self.graph          = None
        self.resource_kit   = None
        self.sample_data    = None
        self.platform       = None

        # Create datastore from pipeline components
        self.datastore      = None

        # Task scheduler for running jobs
        self.scheduler = None

        # Helper processor for handling platform operations
        self.helper_processor   = None
        self.storage_helper     = None
        self.docker_helper      = None

    def load(self):

        # Load resource kit
        self.resource_kit = ResourceKit(self.__res_kit_config)

        # Load the sample data
        self.sample_data = SampleSet(self.__sample_set_config)

        # Load the graph
        self.graph = Graph(self.__graph_config)

        # Load platform
        plat_module     = importlib.import_module(self.__plat_module)
        plat_class      = plat_module.__dict__[self.__plat_module]
        self.platform   = plat_class(self.pipeline_id, self.__platform_config, self.__final_output_dir)

        # Create datastore and scheduler
        self.datastore = Datastore(self.graph, self.resource_kit, self.sample_data, self.platform)
        self.scheduler = Scheduler(self.graph, self.datastore, self.platform)

    def validate(self):

        # Assume all validations are working
        has_errors = False

        # Validate the sample set
        sample_validator = SampleValidator(self.sample_data)
        has_errors = sample_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Sample sheet validated!")

        # Validate the graph
        graph_validator = GraphValidator(self.graph, self.resource_kit, self.sample_data)
        has_errors = graph_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Graph validated!")

        # Validate the platform
        self.platform.validate()

        # Stop the pipeline before launching if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Create helper processor and storage/docker helpers for checking input files
        self.helper_processor   = self.platform.get_helper_processor()
        self.helper_processor.create()

        self.storage_helper     = StorageHelper(self.helper_processor)
        self.docker_helper      = DockerHelper(self.helper_processor)

        # Validate all pipeline inputs can be found on platform
        input_validator = InputValidator(self.resource_kit, self.sample_data, self.storage_helper, self.docker_helper)
        has_errors = input_validator.validate() or has_errors

        # Stop the pipeline if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Validate that pipeline workspace can be created
        workspace = self.datastore.get_task_workspace()
        for dir_type, dir_path in workspace.get_workspace().items():
            self.storage_helper.mkdir(dir_path=str(dir_path), job_name="mkdir_%s" % dir_type, wait=True)
        logging.info("CloudCounductor run validated! Beginning pipeline execution.")

    def run(self, rm_tmp_output_on_success=True):
        # Run until all tasks are complete
        self.scheduler.run()

        # Remove temporary output on success
        if rm_tmp_output_on_success:
            workspace = self.datastore.get_task_workspace()
            try:
                self.storage_helper.rm(path=workspace.get_tmp_output_dir(), job_name="rm_tmp_output", wait=True)
            except BaseException as e:
                logging.error("Unable to remove tmp output directory: %s" % workspace.get_tmp_output_dir())
                if str(e) != "":
                    logging.error("Received the following err message:\n%s" % e)

    def save_progress(self):
        pass

    def publish_report(self, err=False, err_msg=None, git_version=None):
        # Create and publish GAP pipeline report
        try:
            report = self.__make_pipeline_report(err, err_msg, git_version)
            if self.platform is not None:
                self.platform.publish_report(report)
        except BaseException as e:
            logging.error("Unable to publish report!")
            if str(e) != "":
                logging.error("Received the following message:\n%s" % e)
            raise

    def clean_up(self):
        # Destroy the helper processor if it exists
        if self.helper_processor is not None:
            try:
                logging.debug("Destroying helper processor...")
                self.helper_processor.destroy(wait=False)
            except BaseException as e:
                logging.error("Unable to destroy helper processor '%s'!" % self.helper_processor.get_name())
                if str(e) != "":
                    logging.error("Received the follwoing err message:\n%s" % e)

        # Cleaning up the platform (let the platform decide what that means)
        if self.platform is not None:
            self.platform.clean_up()

    def __make_pipeline_report(self, err, err_msg, git_version):

        # Create a pipeline report that summarizes features of pipeline
        report = GAPReport(self.pipeline_id, err, err_msg, git_version)

        # Register helper runtime data
        if self.helper_processor is not None:
            report.set_start_time(self.helper_processor.get_start_time())
            report.set_total_runtime(self.helper_processor.get_runtime())
            report.register_task(task_name="Helper",
                                 start_time=self.helper_processor.get_start_time(),
                                 run_time=self.helper_processor.get_runtime(),
                                 cost=self.helper_processor.compute_cost())

        # Register runtime data for pipeline tasks
        if self.scheduler is not None:
            task_workers = self.scheduler.get_task_workers()
            for task_name, task_worker in task_workers.items():

                # Register data about task runtime
                task        = task_worker.get_task()
                run_time    = task_worker.get_runtime()
                cost        = task_worker.get_cost()
                start_time  = task_worker.get_start_time()
                cmd         = task_worker.get_cmd()
                task_data   = {"parent_task" : task_name.split(".")[0]}
                report.register_task(task_name=task_name,
                                     start_time=start_time,
                                     run_time=run_time,
                                     cost=cost,
                                     cmd=cmd,
                                     task_data=task_data)

                # Register data about task output files
                if task.is_complete():
                    output_files = self.datastore.get_task_output_files(task_id=task_name)
                    for output_file in output_files:
                        file_type       = output_file.get_type()
                        file_path       = output_file.get_path()
                        is_final_output = file_type in task.get_final_output_keys()
                        file_size       = output_file.get_size()
                        if is_final_output or err:
                            # Only declare output files if file is final output file
                            # OR file is temporary output file but pipeline failed
                            report.register_output_file(task_name, file_type, file_path, file_size, is_final_output)

        return report
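Example #9 shows the full GAPipeline orchestrator. A plausible top-level driver, sketched below under stated assumptions, follows the load, validate, run lifecycle and always publishes a report and cleans up, even on failure. All config paths, the platform module name and the output bucket URL are hypothetical placeholders.

# Hypothetical GAPipeline driver (illustrative sketch only)
pipeline = GAPipeline(pipeline_id="run_001",
                      graph_config="graph.config",
                      resource_kit_config="resource_kit.config",
                      sample_data_config="samples.json",
                      platform_config="platform.config",
                      platform_module="GooglePlatform",
                      final_output_dir="gs://my-bucket/run_001/")

err, err_msg = False, None
try:
    pipeline.load()
    pipeline.validate()
    pipeline.run()
except BaseException as e:
    err, err_msg = True, str(e)
    raise
finally:
    # Always publish the run report and release platform resources
    pipeline.publish_report(err=err, err_msg=err_msg)
    pipeline.clean_up()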
Example #10
class GAPipeline(object):

    def __init__(self, pipeline_id,
                 graph_config,
                 resource_kit_config,
                 sample_data_config,
                 platform_config,
                 platform_module,
                 final_output_dir):

        # GAP run id
        self.pipeline_id    = pipeline_id

        # Paths to config files
        self.__graph_config         = graph_config
        self.__res_kit_config       = resource_kit_config
        self.__sample_set_config    = sample_data_config
        self.__platform_config      = platform_config

        # Name of platform class where tasks will be executed
        self.__plat_module          = platform_module

        # Final output directory where output is saved
        self.__final_output_dir     = final_output_dir

        # Obtain pipeline name and append to final output dir

        self.graph          = None
        self.resource_kit   = None
        self.sample_data    = None
        self.platform       = None

        # Create datastore from pipeline components
        self.datastore      = None

        # Task scheduler for running jobs
        self.scheduler = None

        # Helper processor for handling platform operations
        self.helper_processor   = None
        self.storage_helper     = None
        self.docker_helper      = None

    def load(self):

        # Load resource kit
        self.resource_kit = ResourceKit(self.__res_kit_config)

        # Load the sample data
        self.sample_data = SampleSet(self.__sample_set_config)

        # Load the graph
        self.graph = Graph(self.__graph_config)

        # Load platform
        plat_module     = importlib.import_module(self.__plat_module)
        plat_class      = plat_module.__dict__[self.__plat_module]
        self.platform   = plat_class(self.pipeline_id, self.__platform_config, self.__final_output_dir)

        # Create datastore and scheduler
        self.datastore = Datastore(self.graph, self.resource_kit, self.sample_data, self.platform)
        self.scheduler = Scheduler(self.graph, self.datastore, self.platform)

    def validate(self):

        # Assume all validations are working
        has_errors = False

        # Validate the sample set
        sample_validator = SampleValidator(self.sample_data)
        has_errors = sample_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Sample sheet validated!")

        # Validate the graph
        graph_validator = GraphValidator(self.graph, self.resource_kit, self.sample_data)
        has_errors = graph_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Graph validated!")

        # Validate the platform
        self.platform.validate()

        # Stop the pipeline before launching if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Create helper processor and storage/docker helpers for checking input files
        self.helper_processor   = self.platform.get_helper_processor()
        self.helper_processor.create()

        self.storage_helper     = StorageHelper(self.helper_processor)
        self.docker_helper      = DockerHelper(self.helper_processor)

        # Validate all pipeline inputs can be found on platform
        input_validator = InputValidator(self.resource_kit, self.sample_data, self.storage_helper, self.docker_helper)
        has_errors = input_validator.validate() or has_errors

        # Stop the pipeline if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Validate that pipeline workspace can be created
        workspace = self.datastore.get_task_workspace()
        for dir_type, dir_path in workspace.get_workspace().items():
            self.storage_helper.mkdir(dir_path=str(dir_path), job_name="mkdir_%s" % dir_type, wait=True)
        logging.info("GAP run validated! Beginning pipeline execution.")

    def run(self, rm_tmp_output_on_success=True):
        # Run until all tasks are complete
        self.scheduler.run()

        # Remove temporary output on success
        if rm_tmp_output_on_success:
            workspace = self.datastore.get_task_workspace()
            try:
                self.storage_helper.rm(path=workspace.get_tmp_output_dir(), job_name="rm_tmp_output", wait=True)
            except BaseException as e:
                logging.error("Unable to remove tmp output directory: %s" % workspace.get_tmp_output_dir())
                if str(e) != "":
                    logging.error("Received the following err message:\n%s" % e)