def execute(ssh_client, project_name, analysis_steps, s3_input_files_address,
            sample_list, group_name, s3_output_files_address, email):
    yaml_file = project_name + ".yaml"

    global log_dir
    log_dir = log_dir.format(project_name)

    print("making the yaml file ...")
    YamlFileMaker.make_yaml_file(yaml_file, project_name, analysis_steps,
                                 s3_input_files_address, sample_list, group_name,
                                 s3_output_files_address, "hg19", "NA")

    print("copying yaml files to remote master node...")
    ConnectionManager.copy_file(ssh_client, yaml_file, workspace + "yaml_examples")
    os.remove(yaml_file)

    #if not email == "":
    print("executing pipeline...")
    ConnectionManager.execute_command(
        ssh_client,
        "qsub -o /dev/null -e /dev/null " + workspace + "scripts/run.sh " +
        workspace + "yaml_examples/" + yaml_file + " " + log_dir + " " + "WGSPipeline.py")

def cat_script(ssh_client, scripts_dict, pipeline, workflow, script_name):
    """
    Returns tuple: (specificity of script, string representation of given script contents)
        Specificity in ["Workflow Specific", "Pipeline Specific", "All Pipelines"]

    Must specify which workflow and pipeline the script is in.

    input:
        ssh_client: a paramiko SSHClient obj
        scripts_dict: scripts dictionary described in get_scripts_dict function
        pipeline: name of a supported target pipeline
        workflow: name of a supported target workflow
        script_name: name of target script including .sh extension
    """
    if script_name in scripts_dict[pipeline][workflow]:
        return "Workflow Specific", ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/scripts/{}/{}/{}".format(
                pipeline, workflow, script_name))
    elif script_name in scripts_dict[pipeline]["All Workflows"]:
        return "Pipeline Specific", ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/scripts/{}/{}".format(
                pipeline, script_name))
    elif script_name in scripts_dict["All Pipelines"]:
        return "All Pipelines", ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/scripts/{}".format(script_name))
    else:
        return "This script isn't called in the specified Pipeline/Workflow", ""

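# Hedged usage sketch (not part of the original module): cat_script expects the nested
# dictionary returned by get_scripts_dict. The pipeline, workflow, and script names
# below are hypothetical placeholders, and ssh_client must already be a connected
# paramiko SSHClient.
def _example_show_script(ssh_client):
    scripts_dict = get_scripts_dict(ssh_client)
    specificity, contents = cat_script(ssh_client, scripts_dict,
                                       "RNASeq", "star_gatk", "fastqc.sh")
    print(specificity)  # one of the three specificity labels, or an explanatory message
    print(contents)     # raw shell script text, or "" if the script was not found
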
def get_steps_calling_script(ssh_client, scripts_dict, script_name):
    """
    Gets which steps call the given shell script.
    Does so by parsing workflow config yamls on the cluster.
    Returns str that summarizes that information

    input:
        ssh_client: a paramiko SSHClient obj
        scripts_dict: scripts dictionary described in get_scripts_dict function
        script_name: name of target script including .sh extension
    """
    tools_conf = yaml.load(
        ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/config/tools.yaml"))

    #keys=pipeline names, values=list of workflows in that pipeline
    #excludes "All Pipelines" key from original scripts_dict
    pipe_work_dict = {
        pipeline: get_workflows_in_pipeline(scripts_dict, pipeline)
        for pipeline in get_all_pipeline_names(scripts_dict)
    }

    configs = []
    for pipeline in pipe_work_dict:
        for workflow in pipe_work_dict[pipeline]:
            configs.append(
                yaml.load(
                    ConnectionManager.execute_command(
                        ssh_client,
                        "cat /shared/workspace/Pipelines/config/{0}/{0}_{1}.yaml"
                        .format(pipeline, workflow))))

    result = "{} called from:\n".format(script_name)
    script_name = script_name.replace(".sh", "")

    for conf in configs:
        for step in conf:
            if step == "steps":
                continue

            dirs = conf[step]["script_path"].split("/")

            #if length of dirs is 1, the script path contains only the script name,
            #    so the script is shared by all Pipelines
            #if length of dirs is 2, the script path has pipeline and script name,
            #    so the script is shared by all workflows of a specific pipeline
            #if length of dirs is 3, the script path has pipeline, workflow, and script name,
            #    so the script belongs to a specific workflow of a specific pipeline
            in_strings = [
                "in all Pipelines", "in all Workflows in {}",
                "in the {} {} workflow"
            ]
            in_string = in_strings[len(dirs) - 1].format(*dirs[:-1])

            if dirs[-1] == script_name:
                result += "{} {}\n".format(step, in_string)
                if len(dirs) != 3:
                    return result

    return result

def check_step_failed(ssh_client, pipeline, workflow, project_name, job_name):
    ConnectionManager.execute_command(
        ssh_client,
        "touch /shared/workspace/logs/{}/{}/{}/*/status.log".format(
            pipeline, workflow, project_name))

    status_log_checker = "ls /shared/workspace/logs/{}/{}/{}/*/status.log | xargs grep \"{}.*failed\" | wc -l"

    #reports if step finished and failed or finished and passed
    if int(ConnectionManager.execute_command(
            ssh_client,
            status_log_checker.format(pipeline, workflow, project_name, job_name))):
        return True

    return False

def edit_step_tools_config(ssh_client, new_step_tools_conf, step_name):
    tools_conf = yaml.load(
        ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/config/tools.yaml"))

    tools_conf[step_name] = new_step_tools_conf

    with open("tools.yaml", "w+") as f:
        f.write(yaml.dump(tools_conf, default_flow_style=False))

    ConnectionManager.execute_command(
        ssh_client,
        "mv -n /shared/workspace/Pipelines/config/tools.yaml /shared/workspace/Pipelines/config/tools.yaml.BACKUP"
    )

    ConnectionManager.copy_file(
        ssh_client, "{}/tools.yaml".format(os.getcwd()),
        "/shared/workspace/Pipelines/config/tools.yaml")

def check_status(ssh_client, job_name):
    print("checking processing status")
    qstat = ConnectionManager.execute_command(ssh_client, "qstat")
    job_ids = QstatParser.get_job_ids(qstat)
    job_details = [
        ConnectionManager.execute_command(ssh_client, "qstat -j %s" % x[0])
        for x in job_ids
    ]
    job_info = [job_ids[x] + [job_details[x]] for x in range(len(job_ids))]

    global log_dir
    logs = ConnectionManager.list_dir(ssh_client, log_dir)

    QstatParser.parse_qstat(job_info, job_name, logs)

def get_pipeline_workflow_config(ssh_client, pipeline, workflow):
    specific_confs_dict = literal_eval(
        ConnectionManager.execute_command(
            ssh_client,
            "python /shared/workspace/Pipelines/util/GetAllSpecificConfs.py"))

    return yaml.load(specific_confs_dict[pipeline][workflow])

def display_software_config(ssh_client):
    """Display the software config file as html"""
    show_script(
        ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/config/software.conf"))

def upload_script(ssh_client, pipeline, workflow, script_name):
    script_path_cluster = "/shared/workspace/Pipelines/scripts/"

    if pipeline == "all":
        script_path_cluster += script_name
    elif workflow == "all":
        script_path_cluster += "{}/{}".format(pipeline, script_name)
    else:
        script_path_cluster += "{}/{}/{}".format(pipeline, workflow, script_name)

    ConnectionManager.execute_command(
        ssh_client, "mv -n {0} {0}.BACKUP".format(script_path_cluster))

    ConnectionManager.copy_file(ssh_client,
                                "{}/{}".format(os.getcwd(), script_name),
                                script_path_cluster)

def execute(ssh_client, project_name, analysis_steps, s3_input_files_address,
            sample_list, group_name, s3_output_files_address):
    yaml_file = project_name + ".yaml"

    print("making the yaml file...")
    YamlFileMaker.make_yaml_file(yaml_file, project_name, analysis_steps,
                                 s3_input_files_address, sample_list, group_name,
                                 s3_output_files_address, "hg19", "NA")

    print("copying yaml file to remote master node...")
    ConnectionManager.copy_file(ssh_client, yaml_file, workspace + "yaml_examples")

    ## Remove the local yaml file
    os.remove(yaml_file)

    print("executing pipeline...")
    ConnectionManager.execute_command(ssh_client,
                                      "sh " + workspace + "run.sh " +
                                      workspace + "yaml_examples/" + yaml_file)

def execute(ssh_client, project_name, analysis_steps, s3_input_files_address,
            sample_list, group_list, s3_output_files_address):
    yaml_file = project_name + ".yaml"

    print("making the yaml file...")
    YamlFileMaker.make_yaml_file(yaml_file, project_name, analysis_steps,
                                 s3_input_files_address, sample_list, group_list,
                                 s3_output_files_address, "hg19", "NA")

    print("copying yaml file to remote master node...")
    ConnectionManager.copy_file(ssh_client, yaml_file, workspace + "yaml_examples")

    ## Remove the local yaml file
    os.remove(yaml_file)

    print("executing pipeline...")
    ConnectionManager.execute_command(ssh_client,
                                      "sh " + workspace + "run.sh " +
                                      workspace + "yaml_examples/" + yaml_file)

def edit_step_specific_config(ssh_client, pipeline, workflow, new_extra_bash_args, step_name):
    conf_file_name = "{}_{}.yaml".format(pipeline, workflow)

    spec_conf = yaml.load(
        ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/config/{}/{}".format(
                pipeline, conf_file_name)))

    spec_conf[step_name] = new_extra_bash_args

    with open(conf_file_name, "w+") as f:
        f.write(yaml.dump(spec_conf, default_flow_style=False))

    ConnectionManager.execute_command(
        ssh_client,
        "mv -n /shared/workspace/Pipelines/config/{0}/{1} /shared/workspace/Pipelines/config/{0}/{1}.BACKUP"
        .format(pipeline, conf_file_name))

    ConnectionManager.copy_file(
        ssh_client, "{}/{}".format(os.getcwd(), conf_file_name),
        "/shared/workspace/Pipelines/config/{}/{}".format(pipeline, conf_file_name))

def get_software_dict(ssh_client):
    """
    Returns a dictionary representing all software on the cluster.
    Uses cluster's GetSoftware.py.

    The dictionary has installed software names as keys and a list of the
    installed versions as values.

    This dictionary is the core data structure behind the
    "Check Software on Cluster" section in the CirrusAddons notebook

    input:
        ssh_client: a paramiko SSHClient obj
    """
    return literal_eval(
        ConnectionManager.execute_command(
            ssh_client, "python /shared/workspace/Pipelines/util/GetSoftware.py"))

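# Hedged sketch (not part of the original module): the mapping returned by
# get_software_dict might look roughly like the literal below. The tool names and
# version numbers here are hypothetical; the real contents depend on what is
# installed on the cluster.
_EXAMPLE_SOFTWARE_DICT = {
    "bwa": ["0.7.12"],
    "samtools": ["1.3.1", "1.9"],
    "fastqc": ["0.11.5"],
}
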
def get_step_config_dicts(ssh_client, scripts_dict, step_name):
    tools_conf = yaml.load(
        ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/config/tools.yaml"))
    step_tools_conf = tools_conf[step_name]

    specific_confs_dict = literal_eval(
        ConnectionManager.execute_command(
            ssh_client,
            "python /shared/workspace/Pipelines/util/GetAllSpecificConfs.py"))

    step_spec_conf = {}
    for pipeline in specific_confs_dict:
        for workflow in specific_confs_dict[pipeline]:
            curr_conf = yaml.load(specific_confs_dict[pipeline][workflow])
            if step_name in curr_conf:
                step_spec_conf.update({
                    (pipeline, workflow): curr_conf[step_name]
                })

    return step_tools_conf, step_spec_conf

def get_scripts_dict(ssh_client):
    """
    Returns a dictionary representing all scripts on the cluster.
    Uses cluster's GetScripts.py.

    The dictionary has supported Pipelines and "All Pipelines" as keys and a
    second dict as values. The inner dictionaries have supported workflows for
    that pipeline and "All Workflows" as keys. The values for each is a list of
    shell script names.

    This dictionary is the core data structure behind the
    "Check Scripts on Cluster" section in the CirrusAddons notebook

    input:
        ssh_client: a paramiko SSHClient obj
    """
    return literal_eval(
        ConnectionManager.execute_command(
            ssh_client, "python /shared/workspace/Pipelines/util/GetScripts.py"))

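# Hedged sketch (not part of the original module): a rough picture of the nested
# structure returned by get_scripts_dict, using hypothetical pipeline, workflow, and
# script names. Representing the "All Pipelines" entry as a flat list is an assumption
# based on how cat_script performs membership checks against it.
_EXAMPLE_SCRIPTS_DICT = {
    "All Pipelines": ["multiqc.sh"],
    "RNASeq": {
        "All Workflows": ["fastqc.sh"],
        "star_gatk": ["align.sh", "variant_calling.sh"],
    },
}
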
def check_processing_status(ssh_client): print "checking processing status" ConnectionManager.execute_command(ssh_client, "cat " + workspace + "nohup.out")
def check_host_status(ssh_client):
    print("checking qhost status")
    ConnectionManager.execute_command(ssh_client, "qhost")

def check_jobs_status(ssh_client):
    print("checking jobs status")
    ConnectionManager.execute_command(ssh_client, "qstat")

def check_processing_status(ssh_client): print "checking processing status" ConnectionManager.execute_command(ssh_client, "cat " + workspace + "/nohup.out")
def check_status(ssh_client, step_name, pipeline, workflow, project_name,
                 analysis_steps, verbose=False):
    print("checking status of jobs...\n")

    spec_yaml = ConnectionManager.execute_command(
        ssh_client,
        "cat /shared/workspace/Pipelines/config/{}/{}_{}.yaml".format(
            pipeline, pipeline, workflow))
    spec_yaml = yaml.load(spec_yaml)

    possible_steps = get_possible_steps(analysis_steps, spec_yaml)
    if verbose:
        print("Your project will go through the following steps:\n\t{}\n".format(
            ", ".join(possible_steps)))

    all_possible_job_dict = get_job_names("all", possible_steps, spec_yaml)
    job_dict = get_job_names(step_name, possible_steps, spec_yaml)

    qstat = ConnectionManager.execute_command(ssh_client, "qstat")
    current_job = get_current_job(ssh_client, qstat)

    if qstat:
        split_qstat = qstat.splitlines()[2:]
    else:
        split_qstat = []

    curr_time = datetime.datetime.utcnow()  #for time running in verbose output
    status_conv = {"qw": "queued", "r": "running", "dr": "being deleted",
                   "t": "being transferred"}

    possible_steps = [all_possible_job_dict[x] for x in possible_steps if not x == "done"]
    possible_steps.append("done")

    is_done = True
    for step, job_name in job_dict.items():
        if verbose:
            print("The {} step calls the {} script on the cluster".format(step, job_name))

        qstat_j = get_qstat_j(ssh_client, job_name)
        if "Following jobs do not exist" in qstat_j:  #only happens when qstat -j job_name fails
            if not job_name in possible_steps:
                print("The {} step was not specified as a step in analysis_steps".format(step))
                is_done = is_done and True
            elif possible_steps.index(current_job) < possible_steps.index(job_name):
                print("The {} step has not started yet.".format(step))
                is_done = False
            elif check_step_failed(ssh_client, pipeline, workflow, project_name, job_name):
                print("The {} step has finished running, but has failed".format(step))
                print("\tPlease check the logs")
                is_done = is_done and True
            elif check_step_passed(ssh_client, pipeline, workflow, project_name, job_name):
                print("The {} step has finished running without failure".format(step))
                is_done = is_done and True
            else:
                print("The {} step has not started yet.".format(step))
                is_done = False
        else:  #job must be in qstat
            print("The {} step is being executed".format(step))

            num_jobs = {x: 0 for x in status_conv.keys()}
            job_specifics = defaultdict(list)
            for line in split_qstat:
                line = line.split()
                line_id = line[0]

                #get job name from qstat -j, otherwise job name could be longer than qstat allows
                line_job = re.search(r"job_name:\s+(.*)\n",
                                     get_qstat_j(ssh_client, line_id)).group(1)
                if not line_job == job_name:
                    continue

                line_status = line[4]
                num_jobs[line_status] += 1

                month, day, year = map(int, line[5].split("/"))
                hour, minute, second = map(int, line[6].split(":"))
                start_time = datetime.datetime(year, month, day, hour, minute, second)

                line_num_cores = line[-1]
                job_specifics[line_status].append(
                    "\tone is currently {} using {} core(s) and was submitted {} ago".format(
                        status_conv[line_status], line_num_cores,
                        _format_timedelta(curr_time - start_time)))

            step_info = {stat: (num, job_specifics[stat]) for stat, num in num_jobs.items()}
            for stat, info_tuple in step_info.items():
                num, details = info_tuple
                if num == 0:
                    continue
                print("There are {} instances of the {} step currently {}".format(
                    num, step, status_conv[stat]))
                if verbose:
                    for det in details:
                        print(det)

        print()

    if current_job == "done" and is_done:
        print("\nYour pipeline has finished")
    print()

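# Hedged usage sketch (not part of the original module): how check_status might be
# invoked from a monitoring notebook cell. Every argument value is a hypothetical
# placeholder; ssh_client must be a connected paramiko SSHClient, and passing "all"
# as step_name is an assumption based on how get_job_names is called above.
def _example_check_project_status(ssh_client):
    check_status(ssh_client,
                 step_name="all",
                 pipeline="RNASeq",
                 workflow="star_gatk",
                 project_name="demo_project",
                 analysis_steps={"fastqc", "align"},
                 verbose=True)
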
def execute(pipeline, ssh_client, project_name, workflow, analysis_steps,
            s3_input_files_address, sample_list, group_list,
            s3_output_files_address, genome, style, pairs_list):
    """Executes a pipeline.

    The main local-side function for executing a pipeline with all user inputs
    to the jupyter notebook. Calls the run.sh shell script on the cluster head
    node using nohup after creating a yaml file summarizing user input and
    uploading that file to the cluster.

    Args:
        pipeline: name of the pipeline to be run, supported pipelines can be found in CirrusAddons notebook
        ssh_client: a paramiko SSHClient object that connects to the cluster where analysis is run
        project_name: name of the current project, <project_name>.yaml contains all user input to notebook
        workflow: name of the workflow to be run, supported workflows can be found in CirrusAddons notebook
        analysis_steps: set of analysis steps to be run, supported steps can be found in pipeline's notebook
        s3_input_files_address: s3 bucket containing all fastq files for project
        sample_list: list of dictionaries with sample info for each sample
        group_list: list of all groups, shares indices with sample_list (sample_list[0] is in group[0], etc)
        s3_output_files_address: root s3 bucket where analysis results should be uploaded
        genome: reference genome to be used, supported genomes can be found in pipeline's notebook
        style: only for ChIPSeq homer workflow, can be "factor" or "histone"
        pairs_list: dictionary with keys=normal samples, values=experimental samples
            for ChIPSeq the keys=ChIP samples, values=corresponding input regularization samples

    Returns:
        None
    """
    yaml_file = project_name + ".yaml"

    if s3_output_files_address.endswith("/"):
        s3_output_files_address = s3_output_files_address[:-1]
    if s3_input_files_address.endswith("/"):
        s3_input_files_address = s3_input_files_address[:-1]

    logs_dir = "/shared/workspace/logs/{}/{}/{}".format(pipeline, workflow, project_name)

    print("making the yaml file...")
    YamlFileMaker.make_yaml_file(yaml_file, pipeline, project_name, workflow,
                                 analysis_steps, s3_input_files_address,
                                 sample_list, group_list,
                                 s3_output_files_address, genome, style,
                                 pairs_list)

    print("copying yaml file to remote master node...")

    # Make sure remote directory exists
    remote_dir = workspace + "yaml_files/" + pipeline + "/" + workflow
    ssh_client.exec_command("mkdir -p " + remote_dir)

    ConnectionManager.copy_file(
        ssh_client, yaml_file,
        "{}yaml_files/{}/{}".format(workspace, pipeline, workflow))

    # Remove the local yaml file
    os.remove(yaml_file)

    print("executing pipeline...")
    ConnectionManager.execute_command(
        ssh_client,
        "nohup bash " + workspace + "scripts/run.sh " +
        workspace + "yaml_files/{}/{}/{} ".format(pipeline, workflow, yaml_file) +
        logs_dir + " " + pipeline + "_" + workflow)

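# Hedged usage sketch (not part of the original module): a minimal call to execute.
# All concrete values (pipeline, workflow, S3 paths, sample metadata keys, genome)
# are hypothetical placeholders; real values come from the pipeline's notebook, and
# ssh_client must be a connected paramiko SSHClient.
def _example_run_pipeline(ssh_client):
    execute(pipeline="RNASeq",
            ssh_client=ssh_client,
            project_name="demo_project",
            workflow="star_gatk",
            analysis_steps={"fastqc", "align"},
            s3_input_files_address="s3://my-bucket/demo_project/input",
            sample_list=[{"filename": "sample1.fastq.gz"}],  # dict keys are hypothetical
            group_list=["group1"],
            s3_output_files_address="s3://my-bucket/demo_project/output",
            genome="hg19",
            style="NA",
            pairs_list={})
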
def stop_pipeline(ssh_client):
    ConnectionManager.execute_command(
        ssh_client, "bash /shared/workspace/Pipelines/util/stop_pipeline.sh")

def restore_backups(ssh_client):
    ConnectionManager.execute_command(
        ssh_client, "python /shared/workspace/Pipelines/util/RestoreBackups.py")

def get_qstat_j(ssh_client, job_name):
    return ConnectionManager.execute_command(ssh_client, "qstat -j {}".format(job_name))