"test-proc", 1, 1, zone="us-east1-b", service_acct="*****@*****.**", boot_disk_size=75, disk_image="davelab-image-latest") proc.set_log_dir("/home/gap/log/") proc.create() print "We READY TO RUN!" try: sh = StorageHelper(proc) sh.mkdir("/home/alex_waldrop_jr/test/", wait=True) sh.mkdir("/home/gap/log/", wait=True) sh.mkdir("gs://derp_test/mkdir_test_mofo_2/", wait=True) proc.run("perms_gap", "sudo chmod -R 777 /home/gap/") proc.run("perms_awal", "sudo chmod -R 777 /home/alex_waldrop_jr/") proc.wait() print "local exists: %s" % sh.path_exists("/home/alex_waldrop_jr/test/") print "local exists: %s" % sh.path_exists("/home/gap/log/") print "cloud exists: %s" % sh.path_exists( "gs://derp_test/mkdir_test_mofo_2/") print "bad exists: %s" % sh.path_exists("/home/aasdasdfk") sh.mv("gs://derp_test/dummy.txt", "/home/alex_waldrop_jr/test/", log=False, wait=True) sh.mv("/home/alex_waldrop_jr/test/dummy.txt",
class ModuleExecutor(object):

    def __init__(self, task_id, processor, workspace, docker_image=None):
        self.task_id = task_id
        self.processor = processor
        self.workspace = workspace
        self.storage_helper = StorageHelper(self.processor)
        self.docker_helper = DockerHelper(self.processor)
        self.docker_image = docker_image

    def load_input(self, inputs):

        if self.processor.get_status() is Processor.OFF:
            # Create processor if it's off
            logging.info("Creating processor '%s' for task '%s'!" %
                         (self.processor.get_name(), self.task_id))
            self.processor.create()

        # Create workspace directory structure
        self.__create_workspace()

        # List of jobs that have been started in process of loading input
        job_names = []

        # Pull docker image if necessary
        if self.docker_image is not None:
            docker_image_name = self.docker_image.get_image_name().split("/")[0]
            docker_image_name = docker_image_name.replace(":", "_")
            job_name = "docker_pull_%s" % docker_image_name
            self.docker_helper.pull(self.docker_image.get_image_name(), job_name=job_name)
            job_names.append(job_name)

        # Load input files
        # Inputs: list containing remote files, local files, and docker images
        seen = []
        count = 1
        for task_input in inputs:

            # Case: Transfer file into wrk directory if it's not already there
            if task_input.get_transferrable_path() not in seen:

                # Transfer file to workspace directory
                src_path = task_input.get_transferrable_path()
                job_name = "load_input_%s_%s_%s" % (self.task_id, task_input.get_type(), count)

                logging.debug("Input path: %s, transfer path: %s" % (task_input.get_path(), src_path))

                self.storage_helper.mv(src_path=src_path,
                                       dest_path=self.workspace.get_wrk_dir(),
                                       job_name=job_name)

                # Add transfer path to list of remote paths that have been transferred to local workspace
                seen.append(src_path)
                count += 1
                job_names.append(job_name)

            # Update path after transferring to wrk directory
            task_input.update_path(new_dir=self.workspace.get_wrk_dir())
            logging.debug("Updated path: %s" % task_input.get_path())

        # Wait for all processes to finish
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Recursively give every permission to all files we just added
        logging.info("(%s) Final workspace perm. update for task '%s'..." %
                     (self.processor.name, self.task_id))
        self.__grant_workspace_perms(job_name="grant_final_wrkspace_perms")

    def run(self, cmd):

        # Job name
        job_name = self.task_id

        # Get name of docker image where command should be run (if any)
        docker_image_name = None if self.docker_image is None else self.docker_image.get_image_name()

        # Begin running job and return stdout, stderr after job has finished running
        self.processor.run(job_name, cmd, docker_image=docker_image_name)
        return self.processor.wait_process(job_name)

    def save_output(self, outputs, final_output_types):
        # Return output files to workspace output dir

        # Get workspace places for output files
        final_output_dir = self.workspace.get_output_dir()
        tmp_output_dir = self.workspace.get_tmp_output_dir()

        count = 1
        job_names = []
        for output_file in outputs:
            if output_file.get_type() in final_output_types:
                dest_dir = final_output_dir
            else:
                dest_dir = tmp_output_dir

            # Calculate output file size
            job_name = "get_size_%s_%s_%s" % (self.task_id, output_file.get_type(), count)
            file_size = self.storage_helper.get_file_size(output_file.get_path(), job_name=job_name)
            output_file.set_size(file_size)

            # Transfer to correct output directory
            job_name = "save_output_%s_%s_%s" % (self.task_id, output_file.get_type(), count)
            curr_path = output_file.get_transferrable_path()
            self.storage_helper.mv(curr_path, dest_dir, job_name=job_name)

            # Update path of output file to reflect new location
            job_names.append(job_name)
            output_file.update_path(new_dir=dest_dir)
            count += 1

        # Wait for transfers to complete
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Wait for output files to finish transferring
        self.processor.wait()

    def save_logs(self):
        # Move log files to final output log directory
        log_files = os.path.join(self.workspace.get_wrk_log_dir(), "*")
        final_log_dir = self.workspace.get_final_log_dir()
        self.storage_helper.mv(log_files, final_log_dir,
                               job_name="return_logs",
                               log=False, wait=True)

    def __create_workspace(self):
        # Create all directories specified in task workspace
        logging.info("(%s) Creating workspace for task '%s'..." %
                     (self.processor.name, self.task_id))

        for dir_type, dir_obj in self.workspace.get_workspace().iteritems():
            self.storage_helper.mkdir(dir_obj, job_name="mkdir_%s" % dir_type, wait=True)

        # Set processor wrk, log directories
        self.processor.set_wrk_dir(self.workspace.get_wrk_dir())
        self.processor.set_log_dir(self.workspace.get_wrk_log_dir())

        # Give everyone all the permissions on working directory
        logging.info("(%s) Updating workspace permissions..." % self.processor.name)
        self.__grant_workspace_perms(job_name="grant_initial_wrkspace_perms")

        # Wait for all the above commands to complete
        logging.info("(%s) Successfully created workspace for task '%s'!" %
                     (self.processor.name, self.task_id))

    def __grant_workspace_perms(self, job_name):
        cmd = "sudo chmod -R 777 %s" % self.workspace.get_wrk_dir()
        self.processor.run(job_name=job_name, cmd=cmd)
        self.processor.wait_process(job_name)
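# Illustrative sketch (not part of the original source): how a task runner
# might drive the ModuleExecutor above. The 'task' accessors shown here
# (get_ID, get_inputs, get_command, get_output_files, get_final_output_keys)
# are assumed for illustration and may differ from the real Task interface.
#
# executor = ModuleExecutor(task_id=task.get_ID(),
#                           processor=processor,
#                           workspace=workspace,
#                           docker_image=docker_image)
# executor.load_input(task.get_inputs())       # stage inputs into the wrk dir
# out, err = executor.run(task.get_command())  # blocks until the job finishes
# executor.save_output(task.get_output_files(), task.get_final_output_keys())
# executor.save_logs()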
class GAPipeline(object):

    def __init__(self, pipeline_id,
                 graph_config,
                 resource_kit_config,
                 sample_data_config,
                 platform_config,
                 platform_module,
                 final_output_dir):

        # GAP run id
        self.pipeline_id = pipeline_id

        # Paths to config files
        self.__graph_config = graph_config
        self.__res_kit_config = resource_kit_config
        self.__sample_set_config = sample_data_config
        self.__platform_config = platform_config

        # Name of platform class where tasks will be executed
        self.__plat_module = platform_module

        # Final output directory where output is saved
        self.__final_output_dir = final_output_dir

        # Obtain pipeline name and append to final output dir
        self.graph = None
        self.resource_kit = None
        self.sample_data = None
        self.platform = None

        # Create datastore from pipeline components
        self.datastore = None

        # Task scheduler for running jobs
        self.scheduler = None

        # Helper processor for handling platform operations
        self.helper_processor = None
        self.storage_helper = None
        self.docker_helper = None

    def load(self):
        # Load resource kit
        self.resource_kit = ResourceKit(self.__res_kit_config)

        # Load the sample data
        self.sample_data = SampleSet(self.__sample_set_config)

        # Load the graph
        self.graph = Graph(self.__graph_config)

        # Load platform
        plat_module = importlib.import_module(self.__plat_module)
        plat_class = plat_module.__dict__[self.__plat_module]
        self.platform = plat_class(self.pipeline_id, self.__platform_config, self.__final_output_dir)

        # Create datastore and scheduler
        self.datastore = Datastore(self.graph, self.resource_kit, self.sample_data, self.platform)
        self.scheduler = Scheduler(self.graph, self.datastore, self.platform)

    def validate(self):
        # Assume all validations are working
        has_errors = False

        # Validate the sample set
        sample_validator = SampleValidator(self.sample_data)
        has_errors = sample_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Sample sheet validated!")

        # Validate the graph
        graph_validator = GraphValidator(self.graph, self.resource_kit, self.sample_data)
        has_errors = graph_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Graph validated!")

        # Validate the platform
        self.platform.validate()

        # Stop the pipeline before launching if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Create helper processor and storage/docker helpers for checking input files
        self.helper_processor = self.platform.get_helper_processor()
        self.helper_processor.create()

        self.storage_helper = StorageHelper(self.helper_processor)
        self.docker_helper = DockerHelper(self.helper_processor)

        # Validate all pipeline inputs can be found on platform
        input_validator = InputValidator(self.resource_kit, self.sample_data,
                                         self.storage_helper, self.docker_helper)
        has_errors = input_validator.validate() or has_errors

        # Stop the pipeline if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Validate that pipeline workspace can be created
        workspace = self.datastore.get_task_workspace()
        for dir_type, dir_path in workspace.get_workspace().items():
            self.storage_helper.mkdir(dir_path=str(dir_path), job_name="mkdir_%s" % dir_type, wait=True)

        logging.info("CloudConductor run validated! Beginning pipeline execution.")

    def run(self, rm_tmp_output_on_success=True):
        # Run until all tasks are complete
        self.scheduler.run()

        # Remove temporary output on success
        if rm_tmp_output_on_success:
            workspace = self.datastore.get_task_workspace()
            try:
                self.storage_helper.rm(path=workspace.get_tmp_output_dir(),
                                       job_name="rm_tmp_output",
                                       wait=True)
            except BaseException as e:
                logging.error("Unable to remove tmp output directory: %s" %
                              workspace.get_tmp_output_dir())
                if str(e) != "":
                    logging.error("Received the following err message:\n%s" % e)

    def save_progress(self):
        pass

    def publish_report(self, err=False, err_msg=None, git_version=None):
        # Create and publish GAP pipeline report
        try:
            report = self.__make_pipeline_report(err, err_msg, git_version)
            if self.platform is not None:
                self.platform.publish_report(report)
        except BaseException as e:
            logging.error("Unable to publish report!")
            if str(e) != "":
                logging.error("Received the following message:\n%s" % e)
            raise

    def clean_up(self):
        # Destroy the helper processor if it exists
        if self.helper_processor is not None:
            try:
                logging.debug("Destroying helper processor...")
                self.helper_processor.destroy(wait=False)
            except BaseException as e:
                logging.error("Unable to destroy helper processor '%s'!" %
                              self.helper_processor.get_name())
                if str(e) != "":
                    logging.error("Received the following err message:\n%s" % e)

        # Clean up the platform (let the platform decide what that means)
        if self.platform is not None:
            self.platform.clean_up()

    def __make_pipeline_report(self, err, err_msg, git_version):
        # Create a pipeline report that summarizes features of pipeline
        report = GAPReport(self.pipeline_id, err, err_msg, git_version)

        # Register helper runtime data
        if self.helper_processor is not None:
            report.set_start_time(self.helper_processor.get_start_time())
            report.set_total_runtime(self.helper_processor.get_runtime())
            report.register_task(task_name="Helper",
                                 start_time=self.helper_processor.get_start_time(),
                                 run_time=self.helper_processor.get_runtime(),
                                 cost=self.helper_processor.compute_cost())

        # Register runtime data for pipeline tasks
        if self.scheduler is not None:
            task_workers = self.scheduler.get_task_workers()
            for task_name, task_worker in task_workers.items():

                # Register data about task runtime
                task = task_worker.get_task()
                run_time = task_worker.get_runtime()
                cost = task_worker.get_cost()
                start_time = task_worker.get_start_time()
                cmd = task_worker.get_cmd()
                task_data = {"parent_task": task_name.split(".")[0]}
                report.register_task(task_name=task_name,
                                     start_time=start_time,
                                     run_time=run_time,
                                     cost=cost,
                                     cmd=cmd,
                                     task_data=task_data)

                # Register data about task output files
                if task.is_complete():
                    output_files = self.datastore.get_task_output_files(task_id=task_name)
                    for output_file in output_files:
                        file_type = output_file.get_type()
                        file_path = output_file.get_path()
                        is_final_output = file_type in task.get_final_output_keys()
                        file_size = output_file.get_size()
                        if is_final_output or err:
                            # Only declare output files if file is final output file
                            # OR file is temporary output file but pipeline failed
                            report.register_output_file(task_name, file_type, file_path,
                                                        file_size, is_final_output)

        return report
class ModuleExecutor(object):

    def __init__(self, task_id, processor, workspace, docker_image=None):
        self.task_id = task_id
        self.processor = processor
        self.workspace = workspace
        self.storage_helper = StorageHelper(self.processor)
        self.docker_helper = DockerHelper(self.processor)
        self.docker_image = docker_image

        # Create workspace directory structure
        self.__create_workspace()

    def load_input(self, inputs):

        # List of jobs that have been started in process of loading input
        job_names = []

        # Pull docker image if necessary
        if self.docker_image is not None:
            docker_image_name = self.docker_image.get_image_name().split("/")[0]
            docker_image_name = docker_image_name.replace(":", "_")
            job_name = "docker_pull_%s" % docker_image_name
            self.docker_helper.pull(self.docker_image.get_image_name(), job_name=job_name)
            job_names.append(job_name)

        # Load input files
        # Inputs: list containing remote files, local files, and docker images
        src_seen = []
        dest_seen = []
        count = 1

        batch_size = 5
        loading_counter = 0

        for task_input in inputs:

            # Don't transfer local files
            if ":" not in task_input.get_path():
                continue

            # Directory where input will be transferred
            dest_dir = self.workspace.get_wrk_dir()

            # Input filename after transfer (None = same as src)
            dest_filename = None

            # Case: Transfer file into wrk directory if it's not already there
            if task_input.get_transferrable_path() not in src_seen:

                # Get name of file that's going to be transferred
                src_path = task_input.get_transferrable_path()
                job_name = "load_input_%s_%s_%s" % (self.task_id, task_input.get_type(), count)
                logging.debug("Input path: %s, transfer path: %s" % (task_input.get_path(), src_path))

                # Generate complete transfer path
                dest_path = os.path.join(dest_dir, task_input.filename)

                # Check to see if transferring file would overwrite existing file
                if dest_path in dest_seen:
                    # Add unique tag to destination filename to prevent overwrite
                    if task_input.sample_name is not None:
                        dest_filename = "{0}_{1}".format(task_input.sample_name, task_input.filename)
                    else:
                        dest_filename = "{0}_{1}".format(Platform.generate_unique_id(), task_input.filename)
                    logging.debug("Changing filename from '{0}' to '{1}'.".format(task_input.filename, dest_filename))
                    dest_path = os.path.join(dest_dir, dest_filename)
                else:
                    dest_filename = None
                    dest_path = dest_dir

                # Log the final destination path
                logging.debug("Destination: {0}".format(dest_path))

                # Move file to dest_path
                self.storage_helper.mv(src_path=src_path,
                                       dest_path=dest_path,
                                       job_name=job_name)
                loading_counter += 1

                # Add transfer path to list of remote paths that have been transferred to local workspace
                src_seen.append(src_path)
                count += 1
                job_names.append(job_name)

                # If loading_counter has reached batch_size, clear out the queue
                if loading_counter >= batch_size:
                    logging.debug("Batch size reached on task {0}".format(self.task_id))

                    # Wait for all processes to finish
                    while len(job_names):
                        self.processor.wait_process(job_names.pop())

                    loading_counter = 0

            # Update path after transferring to wrk directory and add to list of files in working directory
            task_input.update_path(new_dir=dest_dir, new_filename=dest_filename)
            dest_seen.append(task_input.get_path())
            logging.debug("Updated path: %s" % task_input.get_path())

        # Wait for all processes to finish
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Recursively give every permission to all files we just added
        logging.info("(%s) Final workspace perm. update for task '%s'..." %
                     (self.processor.name, self.task_id))
        self.__grant_workspace_perms(job_name="grant_final_wrkspace_perms")

    def run(self, cmd, job_name=None):

        # Check or create job name
        if job_name is None:
            job_name = self.task_id

        # Get name of docker image where command should be run (if any)
        docker_image_name = None if self.docker_image is None else self.docker_image.get_image_name()

        # Begin running job and return stdout, stderr after job has finished running
        self.processor.run(job_name, cmd, docker_image=docker_image_name)
        return self.processor.wait_process(job_name)

    def save_output(self, outputs, final_output_types):
        # Return output files to workspace output dir

        # Get workspace places for output files
        final_output_dir = self.workspace.get_output_dir()
        tmp_output_dir = self.workspace.get_tmp_output_dir()

        count = 1
        job_names = []

        # List of output file paths. We create this list to ensure the files are not being overwritten
        output_filepaths = []

        for output_file in outputs:
            if output_file.get_type() in final_output_types:
                dest_dir = final_output_dir
            else:
                dest_dir = tmp_output_dir

            # Calculate output file size
            job_name = "get_size_%s_%s_%s" % (self.task_id, output_file.get_type(), count)
            file_size = self.storage_helper.get_file_size(output_file.get_path(), job_name=job_name)
            output_file.set_size(file_size)

            # Check if there already exists a file with the same name on the bucket
            destination_path = "{0}/{1}/".format(dest_dir.rstrip("/"), output_file.get_filename())
            if destination_path in output_filepaths:

                # Change the destination directory to a new subdirectory
                dest_dir = "{0}/{1}/".format(dest_dir.rstrip("/"), len(output_filepaths))

                # Regenerate the destination path
                new_destination_path = "{0}/{1}".format(dest_dir.rstrip("/"), output_file.get_filename())

                # Add the new path to the output file paths
                output_filepaths.append(new_destination_path)

            else:
                # Just add the new path to the list of output file paths
                output_filepaths.append(destination_path)

            # Transfer to correct output directory
            job_name = "save_output_%s_%s_%s" % (self.task_id, output_file.get_type(), count)
            curr_path = output_file.get_transferrable_path()
            self.storage_helper.mv(curr_path, dest_dir, job_name=job_name)

            # Update path of output file to reflect new location
            job_names.append(job_name)
            output_file.update_path(new_dir=dest_dir)
            logging.debug("(%s) Transferring file '%s' from old path '%s' to new path '%s' ('%s')" %
                          (self.task_id, output_file.get_type(), curr_path,
                           output_file.get_path(), output_file.get_transferrable_path()))
            count += 1

        # Wait for transfers to complete
        for job_name in job_names:
            self.processor.wait_process(job_name)

        # Wait for output files to finish transferring
        self.processor.wait()

    def save_logs(self):
        # Move log files to final output log directory
        log_files = os.path.join(self.workspace.get_wrk_log_dir(), "*")
        final_log_dir = self.workspace.get_final_log_dir()
        self.storage_helper.mv(log_files, final_log_dir,
                               job_name="return_logs",
                               log=False, wait=True)

    def __create_workspace(self):
        # Create all directories specified in task workspace
        logging.info("(%s) Creating workspace for task '%s'..." %
                     (self.processor.name, self.task_id))

        for dir_type, dir_obj in self.workspace.get_workspace().items():
            self.storage_helper.mkdir(dir_obj, job_name="mkdir_%s" % dir_type, wait=True)

        # Set processor wrk, log directories
        self.processor.set_wrk_dir(self.workspace.get_wrk_dir())
        self.processor.set_wrk_out_dir(self.workspace.get_wrk_out_dir())
        self.processor.set_log_dir(self.workspace.get_wrk_log_dir())

        # Give everyone all the permissions on working directory
        logging.info("(%s) Updating workspace permissions..." % self.processor.name)
        self.__grant_workspace_perms(job_name="grant_initial_wrkspace_perms")

        # Wait for all the above commands to complete
        logging.info("(%s) Successfully created workspace for task '%s'!" %
                     (self.processor.name, self.task_id))

    def __grant_workspace_perms(self, job_name):
        cmd = "sudo chmod -R 777 %s" % self.workspace.get_wrk_dir()
        self.processor.run(job_name=job_name, cmd=cmd)
        self.processor.wait_process(job_name)
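# Illustrative sketch (not part of the original source): the filename
# de-duplication rule used by load_input() above, pulled out as a standalone
# helper so it can be exercised in isolation. uuid4 stands in for
# Platform.generate_unique_id(); the function name and signature are assumed.
import os
import uuid

def resolve_dest_filename(dest_dir, filename, dest_seen, sample_name=None):
    """Return (dest_path, dest_filename) such that an incoming file never
    overwrites a file already staged in dest_dir (tracked in dest_seen)."""
    dest_path = os.path.join(dest_dir, filename)
    if dest_path not in dest_seen:
        # No collision: keep the original filename and transfer into dest_dir
        return dest_dir, None
    # Collision: prefix with the sample name if available, else a unique id
    prefix = sample_name if sample_name is not None else uuid.uuid4().hex[:8]
    dest_filename = "{0}_{1}".format(prefix, filename)
    return os.path.join(dest_dir, dest_filename), dest_filename

# Example:
#   seen = ["/data/wrk/R1.fastq.gz"]
#   resolve_dest_filename("/data/wrk", "R1.fastq.gz", seen, sample_name="s1")
#   -> ("/data/wrk/s1_R1.fastq.gz", "s1_R1.fastq.gz")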
class GAPipeline(object):

    def __init__(self, pipeline_id,
                 graph_config,
                 resource_kit_config,
                 sample_data_config,
                 platform_config,
                 platform_module,
                 final_output_dir):

        # GAP run id
        self.pipeline_id = pipeline_id

        # Paths to config files
        self.__graph_config = graph_config
        self.__res_kit_config = resource_kit_config
        self.__sample_set_config = sample_data_config
        self.__platform_config = platform_config

        # Name of platform class where tasks will be executed
        self.__plat_module = platform_module

        # Final output directory where output is saved
        self.__final_output_dir = final_output_dir

        # Obtain pipeline name and append to final output dir
        self.graph = None
        self.resource_kit = None
        self.sample_data = None
        self.platform = None

        # Create datastore from pipeline components
        self.datastore = None

        # Task scheduler for running jobs
        self.scheduler = None

        # Helper processor for handling platform operations
        self.helper_processor = None
        self.storage_helper = None
        self.docker_helper = None

    def load(self):
        # Load resource kit
        self.resource_kit = ResourceKit(self.__res_kit_config)

        # Load the sample data
        self.sample_data = SampleSet(self.__sample_set_config)

        # Load the graph
        self.graph = Graph(self.__graph_config)

        # Load platform
        plat_module = importlib.import_module(self.__plat_module)
        plat_class = plat_module.__dict__[self.__plat_module]
        self.platform = plat_class(self.pipeline_id, self.__platform_config, self.__final_output_dir)

        # Create datastore and scheduler
        self.datastore = Datastore(self.graph, self.resource_kit, self.sample_data, self.platform)
        self.scheduler = Scheduler(self.graph, self.datastore, self.platform)

    def validate(self):
        # Assume all validations are working
        has_errors = False

        # Validate the sample set
        sample_validator = SampleValidator(self.sample_data)
        has_errors = sample_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Sample sheet validated!")

        # Validate the graph
        graph_validator = GraphValidator(self.graph, self.resource_kit, self.sample_data)
        has_errors = graph_validator.validate() or has_errors
        if not has_errors:
            logging.debug("Graph validated!")

        # Validate the platform
        self.platform.validate()

        # Stop the pipeline before launching if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Create helper processor and storage/docker helpers for checking input files
        self.helper_processor = self.platform.get_helper_processor()
        self.helper_processor.create()

        self.storage_helper = StorageHelper(self.helper_processor)
        self.docker_helper = DockerHelper(self.helper_processor)

        # Validate all pipeline inputs can be found on platform
        input_validator = InputValidator(self.resource_kit, self.sample_data,
                                         self.storage_helper, self.docker_helper)
        has_errors = input_validator.validate() or has_errors

        # Stop the pipeline if there are any errors
        if has_errors:
            raise SystemError("One or more errors have been encountered during validation. "
                              "See the above logs for more information")

        # Validate that pipeline workspace can be created
        workspace = self.datastore.get_task_workspace()
        for dir_type, dir_path in workspace.get_workspace().iteritems():
            self.storage_helper.mkdir(dir_path=str(dir_path), job_name="mkdir_%s" % dir_type, wait=True)

        logging.info("GAP run validated! Beginning pipeline execution.")

    def run(self, rm_tmp_output_on_success=True):
        # Run until all tasks are complete
        self.scheduler.run()

        # Remove temporary output on success
        if rm_tmp_output_on_success:
            workspace = self.datastore.get_task_workspace()
            try:
                self.storage_helper.rm(path=workspace.get_tmp_output_dir(),
                                       job_name="rm_tmp_output",
                                       wait=True)
            except BaseException, e:
                logging.error("Unable to remove tmp output directory: %s" %
                              workspace.get_tmp_output_dir())
                if e.message != "":
                    logging.error("Received the following err message:\n%s" % e.message)