Example 1
    def test_create_sbatch_header(self):
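        # Note: 17 cores are requested here but the expected header contains
        # "-n 16"; create_sbatch_header presumably caps the core count at 16.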
        got_header = utils.create_sbatch_header('slurm_project_id',
                                                'slurm_queue', 17,
                                                'slurm_time', 'job_name',
                                                'slurm_out_log',
                                                'slurm_err_log')
        expected_header = """#!/bin/bash -l

#SBATCH -A slurm_project_id
#SBATCH -p slurm_queue
#SBATCH -n 16
#SBATCH -t slurm_time
#SBATCH -J job_name
#SBATCH -o slurm_out_log
#SBATCH -e slurm_err_log
"""
        self.assertEqual(got_header, expected_header)
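
For context, here is a minimal sketch of a create_sbatch_header implementation that would satisfy this test. The cap at 16 cores is an assumption inferred from the test above (17 requested, "-n 16" expected); it is not confirmed by the excerpt itself.

def create_sbatch_header(slurm_project_id, slurm_queue, num_cores, slurm_time,
                         job_name, slurm_out_log, slurm_err_log):
    """Build the #SBATCH header for a SLURM job script (sketch only)."""
    # Assumption: the core count is capped at 16 (one full node), which is why
    # the test passes 17 but expects "-n 16" in the generated header.
    if num_cores > 16:
        num_cores = 16
    return ("#!/bin/bash -l\n"
            "\n"
            "#SBATCH -A {project_id}\n"
            "#SBATCH -p {queue}\n"
            "#SBATCH -n {cores}\n"
            "#SBATCH -t {time}\n"
            "#SBATCH -J {job_name}\n"
            "#SBATCH -o {out_log}\n"
            "#SBATCH -e {err_log}\n").format(project_id=slurm_project_id,
                                             queue=slurm_queue,
                                             cores=num_cores,
                                             time=slurm_time,
                                             job_name=job_name,
                                             out_log=slurm_out_log,
                                             err_log=slurm_err_log)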
Example 2
def sbatch_piper_sample(command_line_list, workflow_name, project, sample,
                        libprep=None, restart_finished_jobs=False, 
                        config=None, config_file_path=None):
    """sbatch a piper sample-level workflow.

    :param list command_line_list: The list of command lines to execute (in order)
    :param str workflow_name: The name of the workflow to execute
    :param NGIProject project: The NGIProject
    :param NGISample sample: The NGISample
    :param libprep: The library prep to restrict the analysis to (optional)
    :param bool restart_finished_jobs: Restart jobs that have already finished (optional)
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)
    """
    job_identifier = "{}-{}-{}".format(project.project_id, sample, workflow_name)
    # Paths to the various data directories
    project_dirname = project.dirname
    sample_dirname = sample.dirname
    perm_analysis_dir = os.path.join(project.base_path, "ANALYSIS", project_dirname, "piper_ngi")
    scratch_analysis_dir = os.path.join("$SNIC_TMP/ANALYSIS/", project_dirname, "piper_ngi")
    scratch_aln_dir = os.path.join(scratch_analysis_dir, "01_raw_alignments")
    scratch_qc_dir = os.path.join(scratch_analysis_dir, "02_preliminary_alignment_qc")
    # Ensure that the analysis dir exists
    safe_makedir(perm_analysis_dir)
    try:
        slurm_project_id = config["environment"]["project_id"]
    except KeyError:
        raise RuntimeError('No SLURM project id specified in configuration file '
                           'for job "{}"'.format(job_identifier))
    slurm_queue = config.get("slurm", {}).get("queue") or "core"
    num_cores = config.get("slurm", {}).get("cores") or 8
    slurm_time = config.get("piper", {}).get("job_walltime", {}).get(workflow_name) or "4-00:00:00"
    slurm_out_log = os.path.join(perm_analysis_dir, "logs", "{}_sbatch.out".format(job_identifier))
    slurm_err_log = os.path.join(perm_analysis_dir, "logs", "{}_sbatch.err".format(job_identifier))
    for log_file in slurm_out_log, slurm_err_log:
        rotate_file(log_file)
    sbatch_text = create_sbatch_header(slurm_project_id=slurm_project_id,
                                       slurm_queue=slurm_queue,
                                       num_cores=num_cores,
                                       slurm_time=slurm_time,
                                       job_name="piper_{}".format(job_identifier),
                                       slurm_out_log=slurm_out_log,
                                       slurm_err_log=slurm_err_log)
    sbatch_text_list = sbatch_text.split("\n")
    sbatch_extra_params = config.get("slurm", {}).get("extra_params", {})
    for param, value in sbatch_extra_params.iteritems():
        sbatch_text_list.append("#SBATCH {} {}\n\n".format(param, value))
    modules_to_load = config.get("piper", {}).get("load_modules", [])
    if modules_to_load:
        sbatch_text_list.append("\n# Load required modules for Piper")
        for module_name in modules_to_load:
            sbatch_text_list.append("module load {}".format(module_name))

    project, src_aln_files, src_alnqc_files = \
        collect_files_for_sample_analysis(project, sample,
                                          restart_finished_jobs)

    # Fastq files to copy
    fastq_src_dst_list = []
    directories_to_create = set()
    for sample in project:
        for libprep in sample:
            for seqrun in libprep:
                project_specific_path = os.path.join(project.dirname,
                                                     sample.dirname,
                                                     libprep.dirname,
                                                     seqrun.dirname)
                directories_to_create.add(os.path.join("$SNIC_TMP/DATA/", project_specific_path))
                for fastq in seqrun.fastq_files:
                    src_file = os.path.join(project.base_path, "DATA", project_specific_path, fastq)
                    dst_file = os.path.join("$SNIC_TMP/DATA/", project_specific_path, fastq)
                    fastq_src_dst_list.append([src_file, dst_file])

    sbatch_text_list.append("echo -ne '\\n\\nCopying fastq files at '")
    sbatch_text_list.append("date")
    if fastq_src_dst_list:
        for directory in directories_to_create:
            sbatch_text_list.append("mkdir -p {}".format(directory))
        for src_file, dst_file in fastq_src_dst_list:
            sbatch_text_list.append("rsync -rptoDLv {} {}".format(src_file, dst_file))
    else:
        raise ValueError(('No valid fastq files available to process for '
                          'project/sample {}/{}'.format(project, sample)))

    # BAM files / Alignment QC files
    input_files_list = [src_aln_files, src_alnqc_files]
    output_dirs_list = [scratch_aln_dir, scratch_qc_dir]
    echo_text_list = ["Copying any pre-existing alignment files",
                      "Copying any pre-existing alignment qc files"]
    for echo_text, input_files, output_dir in zip(echo_text_list, input_files_list, output_dirs_list):
        if input_files:
            sbatch_text_list.append("echo -ne '\\n\\n{}' at ".format(echo_text))
            sbatch_text_list.append("date")
            sbatch_text_list.append("mkdir -p {}".format(output_dir))
            sbatch_text_list.append(("rsync -rptoDLv {input_files} "
                                     "{output_directory}/").format(input_files=" ".join(input_files),
                                                                  output_directory=output_dir))
    sbatch_text_list.append("echo -ne '\\n\\nExecuting command lines at '")
    sbatch_text_list.append("date")
    sbatch_text_list.append("# Run the actual commands")
    for command_line in command_line_list:
        sbatch_text_list.append(command_line)


    piper_status_file = create_exit_code_file_path(workflow_subtask=workflow_name,
                                                   project_base_path=project.base_path,
                                                   project_name=project.dirname,
                                                   project_id=project.project_id,
                                                   sample_id=sample.name)
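    # $? below captures the exit status of the last Piper command line executed
    # above; it is checked later when the piper status file is written.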
    sbatch_text_list.append("\nPIPER_RETURN_CODE=$?")
    #sbatch_text_list.append("if [[ $PIPER_RETURN_CODE == 0 ]]")
    #sbatch_text_list.append("then")
    sbatch_text_list.append("echo -ne '\\n\\nCopying back the resulting analysis files at '")
    sbatch_text_list.append("date")
    sbatch_text_list.append("mkdir -p {}".format(perm_analysis_dir))
    sbatch_text_list.append("rsync -rptoDLv {}/ {}/".format(scratch_analysis_dir, perm_analysis_dir))
    sbatch_text_list.append("\nRSYNC_RETURN_CODE=$?")
    #sbatch_text_list.append("else")
    #sbatch_text_list.append("  echo -e '\\n\\nPiper job failed'")
    #sbatch_text_list.append("fi")

    # Record job completion status: 0 = Piper and the final rsync both succeeded,
    # 1 = the rsync succeeded but Piper failed, 2 = copying the results back failed
    sbatch_text_list.append("if [[ $RSYNC_RETURN_CODE == 0 ]]")
    sbatch_text_list.append("then")
    sbatch_text_list.append("  if [[ $PIPER_RETURN_CODE == 0 ]]")
    sbatch_text_list.append("  then")
    sbatch_text_list.append("    echo '0'> {}".format(piper_status_file))
    sbatch_text_list.append("  else")
    sbatch_text_list.append("    echo '1'> {}".format(piper_status_file))
    sbatch_text_list.append("  fi")
    sbatch_text_list.append("else")
    sbatch_text_list.append("  echo '2'> {}".format(piper_status_file))
    sbatch_text_list.append("fi")

    # Write the sbatch file
    sbatch_dir = os.path.join(perm_analysis_dir, "sbatch")
    safe_makedir(sbatch_dir)
    sbatch_outfile = os.path.join(sbatch_dir, "{}.sbatch".format(job_identifier))
    rotate_file(sbatch_outfile)
    with open(sbatch_outfile, 'w') as f:
        f.write("\n".join(sbatch_text_list))
    LOG.info("Queueing sbatch file {} for job {}".format(sbatch_outfile, job_identifier))
    # Queue the sbatch file
    p_handle = execute_command_line("sbatch {}".format(sbatch_outfile),
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
    p_out, p_err = p_handle.communicate()
    try:
        slurm_job_id = re.match(r'Submitted batch job (\d+)', p_out).groups()[0]
    except AttributeError:
        raise RuntimeError('Could not submit sbatch job for workflow "{}": '
                           '{}'.format(job_identifier, p_err))
    # Detail which seqruns we've started analyzing so we can update statuses later
    record_analysis_details(project, job_identifier)
    return int(slurm_job_id)
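
The configuration dictionary drives most of the SLURM parameters above. Below is a sketch of the config shape this function reads; the key names mirror the config.get(...) calls in the code, while every value is a placeholder rather than a real or recommended setting.

# Key names taken from the config lookups in sbatch_piper_sample; all values
# here are placeholders.
config = {
    "environment": {
        "project_id": "my_slurm_project",    # required; a missing key raises RuntimeError
    },
    "slurm": {
        "queue": "core",                     # optional, defaults to "core"
        "cores": 8,                          # optional, defaults to 8 here (16 in Example 3)
        "extra_params": {"--qos": "myqos"},  # optional, rendered as "#SBATCH <param> <value>"
    },
    "piper": {
        "job_walltime": {"my_workflow": "4-00:00:00"},  # keyed by workflow name
        "load_modules": ["mymodule/1.0"],    # optional, rendered as "module load <name>"
    },
}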
Example 3
def sbatch_piper_sample(command_line_list,
                        workflow_name,
                        project,
                        sample,
                        libprep=None,
                        restart_finished_jobs=False,
                        files_to_copy=None,
                        config=None,
                        config_file_path=None):
    """sbatch a piper sample-level workflow.

    :param list command_line_list: The list of command lines to execute (in order)
    :param str workflow_name: The name of the workflow to execute
    :param NGIProject project: The NGIProject
    :param NGISample sample: The NGISample
    :param libprep: The library prep to restrict the analysis to (optional)
    :param bool restart_finished_jobs: Restart jobs that have already finished (optional)
    :param list files_to_copy: Pre-existing analysis files to copy to scratch; collected automatically if not given (optional)
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)
    """
    job_identifier = "{}-{}-{}".format(project.project_id, sample,
                                       workflow_name)
    # Paths to the various data directories
    project_dirname = project.dirname
    perm_analysis_dir = os.path.join(project.base_path, "ANALYSIS",
                                     project_dirname, "piper_ngi", "")
    scratch_analysis_dir = os.path.join("$SNIC_TMP/ANALYSIS/", project_dirname,
                                        "piper_ngi", "")
    # Ensure that the analysis dir exists
    safe_makedir(perm_analysis_dir)
    try:
        slurm_project_id = config["environment"]["project_id"]
    except KeyError:
        raise RuntimeError(
            'No SLURM project id specified in configuration file '
            'for job "{}"'.format(job_identifier))
    slurm_queue = config.get("slurm", {}).get("queue") or "core"
    num_cores = config.get("slurm", {}).get("cores") or 16
    slurm_time = config.get("piper", {}).get(
        "job_walltime", {}).get(workflow_name) or "4-00:00:00"
    slurm_out_log = os.path.join(perm_analysis_dir, "logs",
                                 "{}_sbatch.out".format(job_identifier))
    slurm_err_log = os.path.join(perm_analysis_dir, "logs",
                                 "{}_sbatch.err".format(job_identifier))
    for log_file in slurm_out_log, slurm_err_log:
        rotate_file(log_file)
    sbatch_text = create_sbatch_header(
        slurm_project_id=slurm_project_id,
        slurm_queue=slurm_queue,
        num_cores=num_cores,
        slurm_time=slurm_time,
        job_name="piper_{}".format(job_identifier),
        slurm_out_log=slurm_out_log,
        slurm_err_log=slurm_err_log)
    sbatch_text_list = sbatch_text.split("\n")
    sbatch_extra_params = config.get("slurm", {}).get("extra_params", {})
    for param, value in sbatch_extra_params.iteritems():
        sbatch_text_list.append("#SBATCH {} {}\n\n".format(param, value))
    modules_to_load = config.get("piper", {}).get("load_modules", [])
    if modules_to_load:
        sbatch_text_list.append("\n# Load required modules for Piper")
        for module_name in modules_to_load:
            sbatch_text_list.append("module load {}".format(module_name))

    if not files_to_copy:
        project, files_to_copy = \
            collect_files_for_sample_analysis(project, sample, restart_finished_jobs)

    # Fastq files to copy
    fastq_src_dst_list = []
    directories_to_create = set()
    for libprep in sample:
        for seqrun in libprep:
            project_specific_path = os.path.join(project.dirname,
                                                 sample.dirname,
                                                 libprep.dirname,
                                                 seqrun.dirname)
            directories_to_create.add(
                os.path.join("$SNIC_TMP/DATA/", project_specific_path))
            for fastq in seqrun.fastq_files:
                src_file = os.path.join(project.base_path, "DATA",
                                        project_specific_path, fastq)
                dst_file = os.path.join("$SNIC_TMP/DATA/",
                                        project_specific_path, fastq)
                fastq_src_dst_list.append([src_file, dst_file])

    sbatch_text_list.append("echo -ne '\\n\\nCopying fastq files at '")
    sbatch_text_list.append("date")
    if fastq_src_dst_list:
        for directory in directories_to_create:
            sbatch_text_list.append("mkdir -p {}".format(directory))
        for src_file, dst_file in fastq_src_dst_list:
            sbatch_text_list.append("rsync -rptoDLv {} {}".format(
                src_file, dst_file))
    else:
        raise ValueError(('No valid fastq files available to process for '
                          'project/sample {}/{}'.format(project, sample)))

    # Pre-existing analysis files
    if files_to_copy:
        sbatch_text_list.append(
            "echo -ne '\\n\\nCopying pre-existing analysis files at '")
        sbatch_text_list.append("date")

        sbatch_text_list.append("if [ ! -d {output directory} ]; then")
        sbatch_text_list.append("mkdir {output directory} ")
        sbatch_text_list.append("fi")
        sbatch_text_list.append(("rsync -rptoDLv {input_files} "
                                 "{output_directory}/").format(
                                     input_files=" ".join(files_to_copy),
                                     output_directory=scratch_analysis_dir))
        # Delete pre-existing analysis files after copy
        sbatch_text_list.append(
            "echo -ne '\\n\\nDeleting pre-existing analysis files at '")
        sbatch_text_list.append("date")
        sbatch_text_list.append(
            "rm -rf {input_files}".format(input_files=" ".join(files_to_copy)))

    sbatch_text_list.append("echo -ne '\\n\\nExecuting command lines at '")
    sbatch_text_list.append("date")
    sbatch_text_list.append("# Run the actual commands")
    for command_line in command_line_list:
        sbatch_text_list.append(command_line)

    piper_status_file = create_exit_code_file_path(
        workflow_subtask=workflow_name,
        project_base_path=project.base_path,
        project_name=project.dirname,
        project_id=project.project_id,
        sample_id=sample.name)
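    # $? below captures the exit status of the last Piper command line executed
    # above; it is checked later when the piper status file is written.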
    sbatch_text_list.append("\nPIPER_RETURN_CODE=$?")

    # Precalculate md5sums of the key result files (the checksums run in the
    # background; 'wait' below blocks until all of them have finished)
    sbatch_text_list.append(
        'MD5FILES="$SNIC_TMP/ANALYSIS/{}/piper_ngi/05_processed_alignments/*{}*.bam'
        .format(project.project_id, sample.name))
    sbatch_text_list.append(
        '$SNIC_TMP/ANALYSIS/{}/piper_ngi/05_processed_alignments/*.table'.
        format(project.project_id))
    sbatch_text_list.append(
        '$SNIC_TMP/ANALYSIS/{}/piper_ngi/07_variant_calls/*{}*.genomic.vcf.gz'.
        format(project.project_id, sample.name))
    sbatch_text_list.append(
        '$SNIC_TMP/ANALYSIS/{}/piper_ngi/07_variant_calls/*{}*.annotated.vcf.gz"'
        .format(project.project_id, sample.name))
    sbatch_text_list.append('for f in $MD5FILES')
    sbatch_text_list.append('do')
    sbatch_text_list.append("    md5sum $f | awk '{printf $1}' > $f.md5 &")
    sbatch_text_list.append('done')
    sbatch_text_list.append('wait')

    # Copy the resulting analysis files back from scratch
    sbatch_text_list.append(
        "echo -ne '\\n\\nCopying back the resulting analysis files at '")
    sbatch_text_list.append("date")
    sbatch_text_list.append("mkdir -p {}".format(perm_analysis_dir))
    sbatch_text_list.append("rsync -rptoDLv {}/ {}/".format(
        scratch_analysis_dir, perm_analysis_dir))
    sbatch_text_list.append("\nRSYNC_RETURN_CODE=$?")

    # Record job completion status: 0 = Piper and the final rsync both succeeded,
    # 1 = the rsync succeeded but Piper failed, 2 = copying the results back failed
    sbatch_text_list.append("if [[ $RSYNC_RETURN_CODE == 0 ]]")
    sbatch_text_list.append("then")
    sbatch_text_list.append("  if [[ $PIPER_RETURN_CODE == 0 ]]")
    sbatch_text_list.append("  then")
    sbatch_text_list.append("    echo '0'> {}".format(piper_status_file))
    sbatch_text_list.append("  else")
    sbatch_text_list.append("    echo '1'> {}".format(piper_status_file))
    sbatch_text_list.append("  fi")
    sbatch_text_list.append("else")
    sbatch_text_list.append("  echo '2'> {}".format(piper_status_file))
    sbatch_text_list.append("fi")

    # Write the sbatch file
    sbatch_dir = os.path.join(perm_analysis_dir, "sbatch")
    safe_makedir(sbatch_dir)
    sbatch_outfile = os.path.join(sbatch_dir,
                                  "{}.sbatch".format(job_identifier))
    rotate_file(sbatch_outfile)
    with open(sbatch_outfile, 'w') as f:
        f.write("\n".join(sbatch_text_list))
    LOG.info("Queueing sbatch file {} for job {}".format(
        sbatch_outfile, job_identifier))
    # Queue the sbatch file
    p_handle = execute_command_line("sbatch {}".format(sbatch_outfile),
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
    p_out, p_err = p_handle.communicate()
    try:
        slurm_job_id = re.match(r'Submitted batch job (\d+)',
                                p_out).groups()[0]
    except AttributeError:
        raise RuntimeError('Could not submit sbatch job for workflow "{}": '
                           '{}'.format(job_identifier, p_err))
    # Detail which seqruns we've started analyzing so we can update statuses later
    record_analysis_details(project, job_identifier)
    return int(slurm_job_id)
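
Both versions record the job outcome by echoing a single digit into piper_status_file: 0 when Piper and the final rsync both succeed, 1 when the copy back succeeds but Piper fails, and 2 when the copy back itself fails. Below is a minimal sketch of how a caller might interpret that file; the helper is hypothetical and not part of the pipeline.

def read_piper_exit_code(piper_status_file):
    """Hypothetical helper: map the recorded status digit to a description."""
    meanings = {
        0: "Piper finished and the results were copied back successfully",
        1: "the results were copied back, but Piper itself failed",
        2: "copying the results back from scratch failed",
    }
    with open(piper_status_file) as f:
        code = int(f.read().strip())
    return code, meanings.get(code, "unknown status")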