def execute(ssh_client, project_name, analysis_steps, s3_input_files_address,
            sample_list, group_name, s3_output_files_address, email):
    yaml_file = project_name + ".yaml"

    global log_dir
    log_dir = log_dir.format(project_name)

    print("making the yaml file ...")
    YamlFileMaker.make_yaml_file(yaml_file, project_name, analysis_steps,
                                 s3_input_files_address, sample_list,
                                 group_name, s3_output_files_address, "hg19",
                                 "NA")

    print("copying yaml files to remote master node...")
    ConnectionManager.copy_file(ssh_client, yaml_file,
                                workspace + "yaml_examples")
    os.remove(yaml_file)

    #if not email == "":

    print("executing pipeline...")
    ConnectionManager.execute_command(
        ssh_client, "qsub -o /dev/null -e /dev/null " + workspace +
        "scripts/run.sh " + workspace + "yaml_examples/" + yaml_file + "  " +
        log_dir + " " + "WGSPipeline.py")
Example #2
def cat_script(ssh_client, scripts_dict, pipeline, workflow, script_name):
    """
    Returns tuple:
        (specificity of script, string representation of given script contents)
    Specificity in ["Workflow Specific", "Pipeline Specific", "All Pipelines"]
    Must specify which workflow and pipeline the script is in.

    input:
        ssh_client: a paramiko SSHClient obj
        scripts_dict: scripts dictionary described in get_scripts_dict function
        pipeline: name of a supported target pipeline
        workflow: name of a supported target workflow
        script_name: name of target script including .sh extension
    """
    if script_name in scripts_dict[pipeline][workflow]:
        return "Workflow Specific", ConnectionManager.execute_command(
            ssh_client,
            "cat /shared/workspace/Pipelines/scripts/{}/{}/{}".format(
                pipeline, workflow, script_name))
    elif script_name in scripts_dict[pipeline]["All Workflows"]:
        return "Pipeline Specific", ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/scripts/{}/{}".format(
                pipeline, script_name))
    elif script_name in scripts_dict["All Pipelines"]:
        return "All Pipelines", ConnectionManager.execute_command(
            ssh_client,
            "cat /shared/workspace/Pipelines/scripts/{}".format(script_name))
    else:
        return "This script isn't called in the specified Pipeline/Workflow", ""
Example #3
def get_steps_calling_script(ssh_client, scripts_dict, script_name):
    """
    Gets which steps call the given shell script. Does so by parsing workflow config yamls on the cluster. 
    Returns str that summarizes that information

    input:
        ssh_client: a paramiko SSHClient obj
        scripts_dict: scripts dictionary described in get_scripts_dict function
        script_name: name of target script including .sh extension
    """
    tools_conf = yaml.load(
        ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/config/tools.yaml"))

    #keys=pipeline names, values=list of workflows in that pipeline
    #excludes "All Pipelines" key from original scripts_dict
    pipe_work_dict = {
        pipeline: get_workflows_in_pipeline(scripts_dict, pipeline)
        for pipeline in get_all_pipeline_names(scripts_dict)
    }

    configs = []
    for pipeline in pipe_work_dict:
        for workflow in pipe_work_dict[pipeline]:
            configs.append(
                yaml.load(
                    ConnectionManager.execute_command(
                        ssh_client,
                        "cat /shared/workspace/Pipelines/config/{0}/{0}_{1}.yaml"
                        .format(pipeline, workflow))))

    result = "{} called from:\n".format(script_name)
    script_name = script_name.replace(".sh", "")

    for conf in configs:
        for step in conf:
            if step == "steps":
                continue

            dirs = conf[step]["script_path"].split("/")

            #if len(dirs) is 1, the script path is just the script name
            #   therefore the script is shared by all Pipelines
            #if len(dirs) is 2, the script path has a pipeline and the script name
            #   therefore the script is shared by all workflows of that pipeline
            #if len(dirs) is 3, the script path has pipeline, workflow, and script name
            #   therefore the script belongs to a specific workflow of a specific pipeline
            in_strings = [
                "in all Pipelines", "in all Workflows in {}",
                "in the {} {} workflow"
            ]
            in_string = in_strings[len(dirs) - 1].format(*dirs[:-1])

            if dirs[-1] == script_name:
                result += "{} {}\n".format(step, in_string)
                if len(dirs) != 3:
                    return result

    return result
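
The comment block above keys the wording off the depth of script_path; a standalone sketch with made-up paths shows the same indexing into in_strings:

# Standalone sketch of the script_path depth logic (made-up example paths).
in_strings = ["in all Pipelines", "in all Workflows in {}", "in the {} {} workflow"]
for path in ["trim.sh", "DNASeq/align.sh", "DNASeq/bwa_gatk/variant_call.sh"]:
    dirs = path.split("/")
    # 1 component -> shared by every pipeline, 2 -> pipeline-wide, 3 -> workflow-specific
    print(dirs[-1] + ":", in_strings[len(dirs) - 1].format(*dirs[:-1]))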
Example #4
def check_step_failed(ssh_client, pipeline, workflow, project_name, job_name):
    ConnectionManager.execute_command(ssh_client, "touch /shared/workspace/logs/{}/{}/{}/*/status.log".format(pipeline,workflow,project_name,job_name))
    status_log_checker = "ls /shared/workspace/logs/{}/{}/{}/*/status.log | xargs grep \"{}.*failed\" | wc -l"

    #reports if step finished and failed or finished and passed
    if int(ConnectionManager.execute_command(ssh_client,
        status_log_checker.format(pipeline,workflow,project_name,job_name))):
        return True

    return False
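
A hedged usage sketch; the pipeline, workflow, project, and job names are placeholders, and the call only makes sense against a cluster whose status.log files follow the "<job> ... failed" convention grepped for above:

# Usage sketch (placeholder names; requires an open ssh_client).
if check_step_failed(ssh_client, "DNASeq", "bwa_gatk", "my_project", "align"):
    print("The align step failed; check the logs under /shared/workspace/logs")
else:
    print("No failure recorded for the align step")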
Example #5
def edit_step_tools_config(ssh_client, new_step_tools_conf, step_name):
    tools_conf = yaml.load(
        ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/config/tools.yaml"))
    tools_conf[step_name] = new_step_tools_conf
    with open("tools.yaml", "w+") as f:
        f.write(yaml.dump(tools_conf, default_flow_style=False))
    ConnectionManager.execute_command(
        ssh_client,
        "mv -n /shared/workspace/Pipelines/config/tools.yaml /shared/workspace/Pipelines/config/tools.yaml.BACKUP"
    )
    ConnectionManager.copy_file(
        ssh_client, "{}/tools.yaml".format(os.getcwd()),
        "/shared/workspace/Pipelines/config/tools.yaml")
def check_status(ssh_client, job_name):
    print("checking processing status")
    qstat = ConnectionManager.execute_command(ssh_client, "qstat")

    job_ids = QstatParser.get_job_ids(qstat)
    job_details = [
        ConnectionManager.execute_command(ssh_client, "qstat -j %s" % x[0])
        for x in job_ids
    ]

    job_info = [job_ids[x] + [job_details[x]] for x in range(len(job_ids))]

    global log_dir
    logs = ConnectionManager.list_dir(ssh_client, log_dir)

    QstatParser.parse_qstat(job_info, job_name, logs)
Example #7
def get_pipeline_workflow_config(ssh_client, pipeline, workflow):
    specific_confs_dict = literal_eval(
        ConnectionManager.execute_command(
            ssh_client,
            "python /shared/workspace/Pipelines/util/GetAllSpecificConfs.py"))

    return yaml.load(specific_confs_dict[pipeline][workflow])
Example #8
def display_software_config(ssh_client):
    """Display the software config file as html"""

    show_script(
        ConnectionManager.execute_command(
            ssh_client,
            "cat /shared/workspace/Pipelines/config/software.conf"))
Example #9
def upload_script(ssh_client, pipeline, workflow, script_name):
    script_path_cluster = "/shared/workspace/Pipelines/scripts/"

    if pipeline == "all":
        script_path_cluster += script_name
    elif workflow == "all":
        script_path_cluster += "{}/{}".format(pipeline, script_name)
    else:
        script_path_cluster += "{}/{}/{}".format(pipeline, workflow,
                                                 script_name)

    ConnectionManager.execute_command(
        ssh_client, "mv -n {0} {0}.BACKUP".format(script_path_cluster))
    ConnectionManager.copy_file(ssh_client,
                                "{}/{}".format(os.getcwd(), script_name),
                                script_path_cluster)
def execute(ssh_client, project_name, analysis_steps, s3_input_files_address,
                   sample_list, group_name, s3_output_files_address):
    yaml_file = project_name + ".yaml"

    print "making the yaml file..."
    YamlFileMaker.make_yaml_file(yaml_file, project_name, analysis_steps, s3_input_files_address,
                   sample_list, group_name, s3_output_files_address, "hg19", "NA")

    print "copying yaml file to remote master node..."
    ConnectionManager.copy_file(ssh_client, yaml_file, workspace + "yaml_examples")

    ## Remove the local yaml file
    os.remove(yaml_file)

    print "executing pipeline..."
    ConnectionManager.execute_command(ssh_client, "sh " + workspace + "run.sh "
                                      + workspace + "yaml_examples/" + yaml_file)
def execute(ssh_client, project_name, analysis_steps, s3_input_files_address,
                       sample_list, group_list, s3_output_files_address):
    yaml_file = project_name + ".yaml"

    print "making the yaml file..."
    YamlFileMaker.make_yaml_file(yaml_file, project_name, analysis_steps, s3_input_files_address,
                   sample_list, group_list, s3_output_files_address, "hg19", "NA")

    print "copying yaml file to remote master node..."
    ConnectionManager.copy_file(ssh_client, yaml_file, workspace + "yaml_examples")

    ## Remove the local yaml file
    os.remove(yaml_file)

    print "executing pipeline..."
    ConnectionManager.execute_command(ssh_client, "sh " + workspace + "run.sh "
                                      + workspace + "yaml_examples/" + yaml_file)
Example #12
def edit_step_specific_config(ssh_client, pipeline, workflow,
                              new_extra_bash_args, step_name):
    conf_file_name = "{}_{}.yaml".format(pipeline, workflow)
    spec_conf = yaml.load(
        ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/config/{}/{}".format(
                pipeline, conf_file_name)))
    spec_conf[step_name] = new_extra_bash_args

    with open(conf_file_name, "w+") as f:
        f.write(yaml.dump(spec_conf, default_flow_style=False))
    ConnectionManager.execute_command(
        ssh_client,
        "mv -n /shared/workspace/Pipelines/config/{0}/{1} /shared/workspace/Pipelines/config/{0}/{1}.BACKUP"
        .format(pipeline, conf_file_name))
    ConnectionManager.copy_file(
        ssh_client, "{}/{}".format(os.getcwd(), conf_file_name),
        "/shared/workspace/Pipelines/config/{}/{}".format(
            pipeline, conf_file_name))
Example #13
def get_software_dict(ssh_client):
    """
    Returns a dictionary representing all software on the cluster. Uses cluster's GetSoftware.py.
    The dictionary has installed software names as keys and a list of the installed versions as values
    This dictionary is the core data structure behind the "Check Software on Cluster" section in the
    CirrusAddons notebook

    input:
        ssh_client: a paramiko SSHClient obj
    """
    return literal_eval(
        ConnectionManager.execute_command(
            ssh_client,
            "python /shared/workspace/Pipelines/util/GetSoftware.py"))
Example #14
def get_step_config_dicts(ssh_client, scripts_dict, step_name):
    tools_conf = yaml.load(
        ConnectionManager.execute_command(
            ssh_client, "cat /shared/workspace/Pipelines/config/tools.yaml"))
    step_tools_conf = tools_conf[step_name]

    specific_confs_dict = literal_eval(
        ConnectionManager.execute_command(
            ssh_client,
            "python /shared/workspace/Pipelines/util/GetAllSpecificConfs.py"))

    step_spec_conf = {}

    for pipeline in specific_confs_dict:
        for workflow in specific_confs_dict[pipeline]:
            curr_conf = yaml.load(specific_confs_dict[pipeline][workflow])
            if step_name in curr_conf:
                step_spec_conf.update({
                    (pipeline, workflow):
                    curr_conf[step_name]
                })

    return step_tools_conf, step_spec_conf
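
A hedged usage sketch for the two returned dictionaries; the step name is a placeholder, and step_spec_conf is keyed by (pipeline, workflow) tuples as built above:

# Usage sketch (placeholder step name; assumes ssh_client and scripts_dict already exist).
tools_conf, spec_conf = get_step_config_dicts(ssh_client, scripts_dict, "align")
print("tools.yaml entry for the step:", tools_conf)
for (pipeline, workflow), extra_args in spec_conf.items():
    print("{} / {} extra args: {}".format(pipeline, workflow, extra_args))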
Example #15
def get_scripts_dict(ssh_client):
    """
    Returns a dictionary representing all scripts on the cluster. Uses cluster's GetScripts.py.
    The dictionary has supported Pipelines and "All Pipelines" as keys and a second dict as values.
        The inner dictionaries have supported workflows for that pipeline and "All Workflows" as keys.
        The value for each is a list of shell script names
    This dictionary is the core data structure behind the "Check Scripts on Cluster" section in the
    CirrusAddons notebook

    input:
        ssh_client: a paramiko SSHClient obj
    """
    return literal_eval(
        ConnectionManager.execute_command(
            ssh_client,
            "python /shared/workspace/Pipelines/util/GetScripts.py"))
def check_processing_status(ssh_client):
    print "checking processing status"
    ConnectionManager.execute_command(ssh_client,
                                      "cat " + workspace + "nohup.out")
def check_host_status(ssh_client):
    print "checking qhost status"
    ConnectionManager.execute_command(ssh_client, "qhost")
def check_jobs_status(ssh_client):
    print "checking jobs status"
    ConnectionManager.execute_command(ssh_client, "qstat")
def check_processing_status(ssh_client):
    print "checking processing status"
    ConnectionManager.execute_command(ssh_client, "cat " + workspace + "/nohup.out")
Example #20
def check_status(ssh_client, step_name, pipeline, workflow, project_name,analysis_steps,verbose=False):
    print("checking status of jobs...\n")
    spec_yaml = ConnectionManager.execute_command(ssh_client, 
            "cat /shared/workspace/Pipelines/config/{}/{}_{}.yaml".format(pipeline, pipeline, workflow))
    spec_yaml = yaml.load(spec_yaml)
    
    possible_steps = get_possible_steps(analysis_steps, spec_yaml)

    if verbose:
        print("Your project will go through the following steps:\n\t{}\n".format(
            ", ".join(possible_steps)))

    all_possible_job_dict = get_job_names("all", possible_steps, spec_yaml)
    job_dict = get_job_names(step_name, possible_steps, spec_yaml)

    qstat = ConnectionManager.execute_command(ssh_client, "qstat")
    current_job = get_current_job(ssh_client, qstat)

    if qstat:
        split_qstat = qstat.splitlines()[2:]
    else:
        split_qstat = []

    curr_time = datetime.datetime.utcnow() #for time running in verbose output
    status_conv = {"qw":"queued", "r":"running", "dr":"being deleted", "t":"being transferred"}
    possible_steps = [all_possible_job_dict[x] for x in possible_steps if not x == "done"]
    possible_steps.append("done")
    is_done = True

    for step, job_name in job_dict.items():
        if verbose:
            print("The {} step calls the {} script on the cluster".format(step, job_name))

        qstat_j = get_qstat_j(ssh_client, job_name)

        if "Following jobs do not exist" in qstat_j: #only happens when qstat -j job_name fails
            if job_name not in possible_steps:
                print("The {} step was not specified as a step in analysis_steps".format(step))
                is_done = is_done and True
            elif possible_steps.index(current_job) < possible_steps.index(job_name):
                print("The {} step has not started yet.".format(step))
                is_done = False
            elif check_step_failed(ssh_client, pipeline, workflow, project_name, job_name):
                print("The {} step has finished running, but has failed".format(step))
                print("\tPlease check the logs")
                is_done = is_done and True
            elif check_step_passed(ssh_client, pipeline, workflow, project_name, job_name):
                print("The {} step has finished running without failure".format(step))
                is_done = is_done and True
            else:
                print("The {} step has not started yet.".format(step))
                is_done = False
        else: #job must be in qstat
            print("The {} step is being executed".format(step))

            num_jobs = {x:0 for x in status_conv.keys()}
            job_specifics = defaultdict(list)

            for line in split_qstat:
                line = line.split()
                line_id = line[0]

                #get the job name from qstat -j, since qstat itself may truncate long job names
                line_job = re.search(r"job_name:\s+(.*)\n",
                                     get_qstat_j(ssh_client, job_name)).group(1)
                if line_job != job_name:
                    continue

                line_status = line[4]
                num_jobs[line_status] += 1

                month,day,year = map(int,line[5].split("/"))
                hour,minute,second = map(int,line[6].split(":"))
                start_time = datetime.datetime(year,month,day,hour,minute,second)

                line_num_cores = line[-1]
                job_specifics[line_status].append("\tone is currently {} using {} core(s) and was submitted {} ago".format(
                    status_conv[line_status], line_num_cores, _format_timedelta(curr_time-start_time)))

            step_info = {stat:(num,job_specifics[stat]) for stat,num in num_jobs.items()}

            for stat,info_tuple in step_info.items():
                num, details = info_tuple
                if num == 0:
                    continue
                print("There are {} instances of the {} step currently {}".format(num, step, status_conv[stat]))
                if verbose:
                    for det in details:
                        print(det)
        print()

    if current_job == "done" and is_done:
        print("\nYour pipeline has finished")
    print()
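
A hedged call sketch for check_status; every name below is a placeholder, and analysis_steps stands in for the step set the notebook collects from the user:

# Usage sketch (placeholder names; requires an open ssh_client).
analysis_steps = {"fastqc", "trim", "align"}  # hypothetical step names
check_status(ssh_client, "all", "DNASeq", "bwa_gatk",   # "all" mirrors the internal call above
             "my_project", analysis_steps, verbose=True)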
Example #21
def execute(pipeline, ssh_client, project_name, workflow, analysis_steps,
            s3_input_files_address, sample_list, group_list,
            s3_output_files_address, genome, style, pairs_list):
    """Executes a pipeline.

    The main local side function for executing a pipeline with all user inputs to jupyter notebook.
    Calls the run.sh shell script on the cluster head node using nohup after creating
    a yaml file summarizing user input and uploading that file to the cluster.

    Args:
        pipeline: name of the pipeline to be run, supported pipelines can be found in CirrusAddons notebook
        ssh_client: a paramiko SSHClient object that connects to the cluster where analysis is run
        project_name: name of the current project, <project_name>.yaml contains all user input to notebook
        workflow: name of the workflow to be run, supported workflows can be found in CirrusAddons notebook
        analysis_steps: set of analysis steps to be run, supported steps can be found in pipeline's notebook
        s3_input_files_address: s3 bucket containing all fastq files for project
        sample_list: list of dictionaries with sample info for each sample
        group_list: list of all groups, shares indices with sample_list (sample_list[0] is in group_list[0], etc.)
        s3_output_files_address: root s3 bucket where analysis results should be uploaded
        genome: reference genome to be used, supported genomes can be found in pipeline's notebook
        style: only for ChIPSeq homer workflow, can be "factor" or "histone"
        pairs_list: dictionary with keys=normal samples, values=experimental samples
            for ChIPSeq the keys=ChIP samples, values=corresponding input regularization samples

    Returns:
        None
    """
    yaml_file = project_name + ".yaml"

    if s3_output_files_address.endswith("/"):
        s3_output_files_address = s3_output_files_address[:-1]
    if s3_input_files_address.endswith("/"):
        s3_input_files_address = s3_input_files_address[:-1]

    logs_dir = "/shared/workspace/logs/{}/{}/{}".format(
        pipeline, workflow, project_name)

    print("making the yaml file...")
    YamlFileMaker.make_yaml_file(yaml_file, pipeline, project_name, workflow,
                                 analysis_steps, s3_input_files_address,
                                 sample_list, group_list,
                                 s3_output_files_address, genome, style,
                                 pairs_list)

    print("copying yaml file to remote master node...")

    # Make sure remote directory exists
    remote_dir = workspace + "yaml_files/" + pipeline + "/" + workflow
    ssh_client.exec_command("mkdir -p " + remote_dir)

    ConnectionManager.copy_file(
        ssh_client, yaml_file,
        "{}yaml_files/{}/{}".format(workspace, pipeline, workflow))

    # Remove the local yaml file
    os.remove(yaml_file)

    print("executing pipeline...")

    ConnectionManager.execute_command(
        ssh_client, "nohup bash " + workspace + "scripts/run.sh " + workspace +
        "yaml_files/{}/{}/{} ".format(pipeline, workflow, yaml_file) +
        logs_dir + " " + pipeline + "_" + workflow)
Example #22
def stop_pipeline(ssh_client):
    ConnectionManager.execute_command(
        ssh_client, "bash /shared/workspace/Pipelines/util/stop_pipeline.sh")
Example #23
def restore_backups(ssh_client):
    ConnectionManager.execute_command(
        ssh_client,
        "python /shared/workspace/Pipelines/util/RestoreBackups.py")
def check_jobs_status(ssh_client):
    print "checking jobs status"
    ConnectionManager.execute_command(ssh_client, "qstat")
def check_host_status(ssh_client):
    print "checking qhost status"
    ConnectionManager.execute_command(ssh_client, "qhost")
Example #26
def get_qstat_j(ssh_client, job_name):
    return ConnectionManager.execute_command(ssh_client, "qstat -j {}".format(job_name))