# Test cycle checking algorithm
graph_file = "/home/alex/Desktop/projects/gap/test/test_runs/simple_success/graph.config"
graph = Graph(graph_file)
# print(graph)

rk_file = "/home/alex/Desktop/projects/gap/test/test_runs/simple_success/rk.config"
rk = ResourceKit(rk_file)

ss_file = "/home/alex/Desktop/projects/gap/test/test_runs/simple_success/ss.json"
ss = SampleSet(ss_file)

plat_file = "/home/alex/Desktop/projects/gap/test/test_runs/simple_success/plat.config"
plat = GooglePlatform("test_plat", plat_file, "gs://derp_test/dis_a_test_3/")

dstore = Datastore(graph, rk, ss, plat)

# Test graph splitting
t1 = graph.tasks["tool1_FASTQC"].module
dstore.set_task_input_args("tool1_FASTQC")
print(t1.get_arguments())
print(t1.get_argument("java"))

input_files = dstore.get_task_input_files("tool1_FASTQC")
print(input_files)
for input_file in input_files:
    print("Before: %s" % input_file.get_path())
    input_file.update_path(new_dir="/data/tool1_FASTQC/")
    print("After: %s" % input_file.get_path())
class GAPipeline(object):

    def __init__(self, pipeline_id, graph_config, resource_kit_config, sample_data_config,
                 platform_config, platform_module, final_output_dir):

        # GAP run id
        self.pipeline_id = pipeline_id

        # Paths to config files
        self.__graph_config = graph_config
        self.__res_kit_config = resource_kit_config
        self.__sample_set_config = sample_data_config
        self.__platform_config = platform_config

        # Name of platform class where tasks will be executed
        self.__plat_module = platform_module

        # Final output directory where output is saved
        self.__final_output_dir = final_output_dir

        # Pipeline components (initialized in load())
        self.graph = None
        self.resource_kit = None
        self.sample_data = None
        self.platform = None

        # Create datastore from pipeline components
        self.datastore = None

        # Task scheduler for running jobs
        self.scheduler = None

        # Helper processor for handling platform operations
        self.helper_processor = None
        self.storage_helper = None
        self.docker_helper = None

    def load(self):
        # Load resource kit
        self.resource_kit = ResourceKit(self.__res_kit_config)

        # Load the sample data
        self.sample_data = SampleSet(self.__sample_set_config)

        # Load the graph
        self.graph = Graph(self.__graph_config)

        # Load platform
        plat_module = importlib.import_module(self.__plat_module)
        plat_class = plat_module.__dict__[self.__plat_module]
        self.platform = plat_class(self.pipeline_id, self.__platform_config, self.__final_output_dir)

        # Create datastore and scheduler
        self.datastore = Datastore(self.graph, self.resource_kit, self.sample_data, self.platform)
        self.scheduler = Scheduler(self.graph, self.datastore, self.platform)

    def validate(self):
        # Assume no validation errors until a validator reports one
        has_errors = False

        # Validate the sample set
        sample_validator = SampleValidator(self.sample_data)
        has_errors = sample_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Sample sheet validated!")

        # Validate the graph
        graph_validator = GraphValidator(self.graph, self.resource_kit, self.sample_data)
        has_errors = graph_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Graph validated!")

        # Validate the platform
        self.platform.validate()

        # Stop the pipeline before launching if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Create helper processor and storage/docker helpers for checking input files
        self.helper_processor = self.platform.get_helper_processor()
        self.helper_processor.create()
        self.storage_helper = StorageHelper(self.helper_processor)
        self.docker_helper = DockerHelper(self.helper_processor)

        # Validate all pipeline inputs can be found on platform
        input_validator = InputValidator(self.resource_kit, self.sample_data,
                                         self.storage_helper, self.docker_helper)
        has_errors = input_validator.validate() or has_errors

        # Stop the pipeline if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Validate that pipeline workspace can be created
        workspace = self.datastore.get_task_workspace()
        for dir_type, dir_path in workspace.get_workspace().items():
            self.storage_helper.mkdir(dir_path=str(dir_path), job_name="mkdir_%s" % dir_type, wait=True)

        logging.info("CloudConductor run validated! Beginning pipeline execution.")

    def run(self, rm_tmp_output_on_success=True):
        # Run until all tasks are complete
        self.scheduler.run()

        # Remove temporary output on success
        if rm_tmp_output_on_success:
            workspace = self.datastore.get_task_workspace()
            try:
                self.storage_helper.rm(path=workspace.get_tmp_output_dir(),
                                       job_name="rm_tmp_output",
                                       wait=True)
            except BaseException as e:
                logging.error("Unable to remove tmp output directory: %s" % workspace.get_tmp_output_dir())
                if str(e) != "":
                    logging.error("Received the following err message:\n%s" % e)

    def save_progress(self):
        pass

    def publish_report(self, err=False, err_msg=None, git_version=None):
        # Create and publish GAP pipeline report
        try:
            report = self.__make_pipeline_report(err, err_msg, git_version)
            if self.platform is not None:
                self.platform.publish_report(report)
        except BaseException as e:
            logging.error("Unable to publish report!")
            if str(e) != "":
                logging.error("Received the following message:\n%s" % e)
            raise

    def clean_up(self):
        # Destroy the helper processor if it exists
        if self.helper_processor is not None:
            try:
                logging.debug("Destroying helper processor...")
                self.helper_processor.destroy(wait=False)
            except BaseException as e:
                logging.error("Unable to destroy helper processor '%s'!" % self.helper_processor.get_name())
                if str(e) != "":
                    logging.error("Received the following err message:\n%s" % e)

        # Clean up the platform (let the platform decide what that means)
        if self.platform is not None:
            self.platform.clean_up()

    def __make_pipeline_report(self, err, err_msg, git_version):
        # Create a pipeline report that summarizes features of pipeline
        report = GAPReport(self.pipeline_id, err, err_msg, git_version)

        # Register helper runtime data
        if self.helper_processor is not None:
            report.set_start_time(self.helper_processor.get_start_time())
            report.set_total_runtime(self.helper_processor.get_runtime())
            report.register_task(task_name="Helper",
                                 start_time=self.helper_processor.get_start_time(),
                                 run_time=self.helper_processor.get_runtime(),
                                 cost=self.helper_processor.compute_cost())

        # Register runtime data for pipeline tasks
        if self.scheduler is not None:
            task_workers = self.scheduler.get_task_workers()
            for task_name, task_worker in task_workers.items():
                # Register data about task runtime
                task = task_worker.get_task()
                run_time = task_worker.get_runtime()
                cost = task_worker.get_cost()
                start_time = task_worker.get_start_time()
                cmd = task_worker.get_cmd()
                task_data = {"parent_task": task_name.split(".")[0]}
                report.register_task(task_name=task_name,
                                     start_time=start_time,
                                     run_time=run_time,
                                     cost=cost,
                                     cmd=cmd,
                                     task_data=task_data)

                # Register data about task output files
                if task.is_complete():
                    output_files = self.datastore.get_task_output_files(task_id=task_name)
                    for output_file in output_files:
                        file_type = output_file.get_type()
                        file_path = output_file.get_path()
                        is_final_output = file_type in task.get_final_output_keys()
                        file_size = output_file.get_size()
                        if is_final_output or err:
                            # Only declare output files if file is final output file
                            # OR file is temporary output file but pipeline failed
                            report.register_output_file(task_name, file_type, file_path,
                                                        file_size, is_final_output)
        return report
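
# Illustrative usage sketch (not part of the pipeline itself): the expected lifecycle of a
# GAPipeline instance is load -> validate -> run -> publish_report -> clean_up. The run id,
# config paths, and output bucket below are hypothetical placeholders, and "GooglePlatform"
# is assumed to name a platform module/class resolvable by importlib (as in the test script
# above).
#
#   pipeline = GAPipeline(pipeline_id="example_run",
#                         graph_config="graph.config",
#                         resource_kit_config="rk.config",
#                         sample_data_config="ss.json",
#                         platform_config="plat.config",
#                         platform_module="GooglePlatform",
#                         final_output_dir="gs://example_bucket/example_run/")
#   pipeline.load()
#   try:
#       pipeline.validate()
#       pipeline.run()
#       pipeline.publish_report()
#   except BaseException as e:
#       pipeline.publish_report(err=True, err_msg=str(e))
#       raise
#   finally:
#       pipeline.clean_up()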