Example #1
def rotate_previous_analysis(project_obj):
    """Rotates the files from the existing analysis starting at 03_merged_aligments"""
    project_dir_path = os.path.join(project_obj.base_path, "ANALYSIS",
                                    project_obj.project_id, "piper_ngi")
    for sample in project_obj:
        # P123_456 is renamed by Piper to P123-456
        piper_sample_name = sample.name.replace("_", "-", 1)
        sample_files = glob.glob(os.path.join(project_dir_path, "0[3-9]_*", "{}.*".format(piper_sample_name)))
        if sample_files:
            LOG.info('Rotating files for sample {} under {} to '
                     '"previous_analyses" folder'.format(sample, project_dir_path))
            current_datetime = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S:%f")
            for sample_file in sample_files:
                # This will be the project_dir_path, so I guess I'm just being paranoid
                common_prefix = os.path.commonprefix([os.path.abspath(project_dir_path),
                                                      os.path.abspath(sample_file)])
                # This part of the directory tree we need to recreate under previous_analyses
                # So e.g. with
                #       /proj/a2015001/Y.Mom_15_01/01_raw_alignments/P123_456.bam
                # we'd get
                #       01_raw_alignments/P123_456.bam
                # and we'd then create
                #       /proj/a2015001/Y.Mom_15_01/previous_analyses/2015-02-19_16:24:12:640314/01_raw_alignments/
                # and move the file to this directory.
                leaf_path = os.path.relpath(sample_file, common_prefix)
                leaf_base, filename = os.path.split(leaf_path)
                previous_analysis_dirpath = os.path.join(common_prefix,
                                                         "previous_analyses",
                                                         current_datetime,
                                                         leaf_base)
                safe_makedir(previous_analysis_dirpath, mode=0o2770)
                LOG.debug("Moving file {} to directory {}".format(sample_file,
                                                                  previous_analysis_dirpath))
                shutil.move(sample_file, previous_analysis_dirpath)
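Every example on this page calls a safe_makedir helper that is never shown. A minimal sketch of what it presumably does, judging from the call sites (the exist-tolerant behaviour and the optional mode argument are assumptions, not the actual ngi_pipeline implementation):

import os

def safe_makedir(dname, mode=0o777):
    # Create the directory (and any missing parents) unless it already
    # exists, then return the path. Assumed behaviour based on usage above.
    if not os.path.isdir(dname):
        os.makedirs(dname, mode)
    return dname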
Example #2
def write_batch_job(analysis_object,
                    reference,
                    fastq_dir_path,
                    config=None,
                    config_file_path=None):
    analysis_path = os.path.join(analysis_object.project.base_path, "ANALYSIS",
                                 analysis_object.project.project_id, 'rna_ngi')
    sbatch_dir_path = os.path.join(analysis_path, 'sbatch')
    safe_makedir(sbatch_dir_path)
    sbatch_file_path = os.path.join(sbatch_dir_path, 'rna_ngi.sh')
    fastq_glob_path = os.path.join(fastq_dir_path, '*_R{1,2}_*.fastq.gz')
    main_nextflow_path = config['analysis']['best_practice_analysis'][
        'RNA-seq']['ngi_nf_path']
    nf_conf = config['analysis']['best_practice_analysis']['RNA-seq'][
        '{}_ngi_conf'.format(analysis_object.sequencing_facility)]
    analysis_log_path = os.path.join(analysis_path, 'nextflow_output.log')
    exit_code_path = os.path.join(analysis_path, 'nextflow_exit_code.out')
    LOG.info("Writing sbatch file to {}".format(sbatch_file_path))
    with open(sbatch_file_path, 'w') as sb:
        sb.write("#!/bin/bash\n\n")
        sb.write("cd {an_path}\n".format(an_path=analysis_path))
        sb.write("> {ex_path}\n".format(ex_path=exit_code_path))
        sb.write(
            "nextflow {ngi_rna_nf} --reads '{fastq_glob}' --genome '{ref}' -c {nf_conf} --outdir {an_path} &> {out_log}\n"
            .format(ngi_rna_nf=main_nextflow_path,
                    fastq_glob=fastq_glob_path,
                    ref=reference,
                    nf_conf=nf_conf,
                    an_path=analysis_path,
                    out_log=analysis_log_path))
        sb.write("echo $? > {ex_path}\n".format(ex_path=exit_code_path))
    LOG.info("NextFlow output will be logged at {}".format(analysis_log_path))
    return sbatch_file_path
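For illustration, with hypothetical values for the paths and reference, the rna_ngi.sh file written above would come out roughly like this:

#!/bin/bash

cd /proj/a2015001/ANALYSIS/P123/rna_ngi
> /proj/a2015001/ANALYSIS/P123/rna_ngi/nextflow_exit_code.out
nextflow /sw/ngi/main.nf --reads '/proj/a2015001/ANALYSIS/P123/rna_ngi/fastqs/*_R{1,2}_*.fastq.gz' --genome 'GRCh38' -c /sw/ngi/sthlm_ngi.conf --outdir /proj/a2015001/ANALYSIS/P123/rna_ngi &> /proj/a2015001/ANALYSIS/P123/rna_ngi/nextflow_output.log
echo $? > /proj/a2015001/ANALYSIS/P123/rna_ngi/nextflow_exit_code.out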
Example #3
def rotate_previous_analysis(project_obj):
    """Rotates the files from the existing analysis starting at 03_merged_aligments"""
    project_dir_path = os.path.join(project_obj.base_path, "ANALYSIS", project_obj.project_id, "piper_ngi")
    for sample in project_obj:
        # P123_456 is renamed by Piper to P123-456
        piper_sample_name = sample.name.replace("_", "-", 1)
        sample_files = glob.glob(os.path.join(project_dir_path, "0[3-9]_*", "{}.*".format(piper_sample_name)))
        if sample_files:
            LOG.info('Rotating files for sample {} under {} to '
                     '"previous_analyses" folder'.format(sample, project_dir_path))
            current_datetime = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S:%f")
            for sample_file in sample_files:
                # This will be the project_dir_path, so I guess I'm just being paranoid
                common_prefix = os.path.commonprefix([os.path.abspath(project_dir_path),
                                                      os.path.abspath(sample_file)])
                # This part of the directory tree we need to recreate under previous_analyses
                # So e.g. with
                #       /proj/a2015001/Y.Mom_15_01/01_raw_alignments/P123_456.bam
                # we'd get
                #       01_raw_alignments/P123_456.bam
                # and we'd then create
                #       /proj/a2015001/Y.Mom_15_01/previous_analyses/2015-02-19_16:24:12:640314/01_raw_alignments/
                # and move the file to this directory.
                leaf_path = os.path.relpath(sample_file, common_prefix)
                leaf_base, filename = os.path.split(leaf_path)
                previous_analysis_dirpath = os.path.join(common_prefix, "previous_analyses", current_datetime, leaf_base)
                safe_makedir(previous_analysis_dirpath, mode=0o2770)
                LOG.debug("Moving file {} to directory {}".format(sample_file, previous_analysis_dirpath))
                shutil.move(sample_file, previous_analysis_dirpath)
Example #4
def run_multiqc(base_path, project_id, project_name, wait=False):

    project_path = os.path.join(base_path, 'ANALYSIS', project_id)
    result_path = os.path.join(base_path, 'ANALYSIS', project_id, 'multiqc')
    safe_makedir(result_path)
    command = ['multiqc', project_path, '-o', result_path, '-i', project_name, '-n', project_name, '-q', '-f']
    multiqc_stdout = ''
    multiqc_stderr = ''
    try:
        # if multiqc is already running, kill it first
        ps_command = ["ps", "ux"]
        # decode so the substring test below compares str with str under Python 3
        pcs = subprocess.check_output(ps_command).decode()
        for line in pcs.splitlines():
            if " ".join(command) in line:
                os.kill(int(line.split()[1]), 9)  # SIGKILL

        # then run multiqc
        handle = execute_command_line(command)
        if wait:
            (multiqc_stdout, multiqc_stderr) = handle.communicate()
            if multiqc_stdout or multiqc_stderr:
                combined_output = "{}\n{}".format(multiqc_stdout, multiqc_stderr)
                raise Exception(combined_output)

    except:
        raise
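execute_command_line is used throughout these examples but never defined here. From the call sites (it returns an object exposing .communicate() and .pid, and accepts shell, cwd, stdout and stderr), it is presumably a thin wrapper around subprocess.Popen, along the lines of this sketch:

import shlex
import subprocess

def execute_command_line(cl, shell=False, cwd=None, stdout=None, stderr=None):
    # Split a string command line into an argument list unless it is meant
    # to run through the shell; behaviour assumed from the call sites above.
    if isinstance(cl, str) and not shell:
        cl = shlex.split(cl)
    return subprocess.Popen(cl, shell=shell, cwd=cwd, stdout=stdout, stderr=stderr)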
Example #5

def build_piper_cl(project, workflow_name, setup_xml_path, exit_code_path,
                   config, genotype_file=None, exec_mode="local", generate_bqsr_bam=False):
    """Determine which workflow to run for a project and build the appropriate command line.
    :param NGIProject project: The project object to analyze.
    :param str workflow_name: The name of the workflow to execute (e.g. "dna_alignonly")
    :param str setup_xml_path: The path to the Piper setup XML file for this project
    :param str exit_code_path: The path to the file to which the exit code for this cl will be written
    :param dict config: The (parsed) configuration file for this machine/environment.
    :param str genotype_file: The path to the genotype file (only relevant for genotype workflow)
    :param str exec_mode: "local" or "sbatch"
    :param bool generate_bqsr_bam: Whether Piper should also produce a BQSR-recalibrated bam

    :returns: The command line(s) to execute for this workflow.
    :rtype: list
    :raises ValueError: If a required configuration value is missing.
    """
    if exec_mode == "sbatch":
        output_dir = os.path.join("$SNIC_TMP/ANALYSIS/", project.dirname, 'piper_ngi')
        # Can't create these directories ahead of time of course
    elif exec_mode == "local":
        output_dir = os.path.join(project.base_path, "ANALYSIS", project.dirname, 'piper_ngi')
        safe_makedir(output_dir)
    else:
        raise ValueError('"exec_mode" must be one of "local", "sbatch" (value '
                         'was "{}")'.format(exec_mode))

    # Global Piper configuration
    piper_rootdir = config.get("piper", {}).get("path_to_piper_rootdir")
    piper_global_config_path = \
                    (os.environ.get("PIPER_GLOB_CONF_XML") or
                     config.get("piper", {}).get("path_to_piper_globalconfig") or
                     (os.path.join(piper_rootdir, "globalConfig.xml") if
                     piper_rootdir else None))
    if not piper_global_config_path:
        raise ValueError('Could not find Piper global configuration file in config '
                         'file, as environment variable ("PIPER_GLOB_CONF_XML"), '
                         'or in Piper root directory.')

    # QScripts directory
    try:
        piper_qscripts_dir = (os.environ.get("PIPER_QSCRIPTS_DIR") or
                              config['piper']['path_to_piper_qscripts'])
    except KeyError:
        raise ValueError('Could not find Piper QScripts directory in config file or '
                         'as environment variable ("PIPER_QSCRIPTS_DIR").')

    # Build Piper cl
    LOG.info('Building workflow command line(s) for project "{}" / workflow '
             '"{}"'.format(project, workflow_name))
    cl = workflows.return_cl_for_workflow(workflow_name=workflow_name,
                                          qscripts_dir_path=piper_qscripts_dir,
                                          setup_xml_path=setup_xml_path,
                                          genotype_file=genotype_file,
                                          global_config_path=piper_global_config_path,
                                          output_dir=output_dir,
                                          exec_mode=exec_mode,
                                          generate_bqsr_bam=generate_bqsr_bam)
    # Blank out the file if it already exists
    safe_makedir(os.path.dirname(exit_code_path))
    open(exit_code_path, 'w').close()
    return cl 
Example #6
def build_piper_cl(project,
                   workflow_name,
                   setup_xml_path,
                   exit_code_path,
                   config,
                   genotype_file=None,
                   exec_mode="local",
                   generate_bqsr_bam=False):
    """Determine which workflow to run for a project and build the appropriate command line.
    :param NGIProject project: The project object to analyze.
    :param str workflow_name: The name of the workflow to execute (e.g. "dna_alignonly")
    :param str setup_xml_path: The path to the Piper setup XML file for this project
    :param str exit_code_path: The path to the file to which the exit code for this cl will be written
    :param dict config: The (parsed) configuration file for this machine/environment.
    :param str genotype_file: The path to the genotype file (only relevant for genotype workflow)
    :param str exec_mode: "local" or "sbatch"
    :param bool generate_bqsr_bam: Whether Piper should also produce a BQSR-recalibrated bam

    :returns: The command line(s) to execute for this workflow.
    :rtype: list
    :raises ValueError: If a required configuration value is missing.
    """
    if exec_mode == "sbatch":
        output_dir = os.path.join("$SNIC_TMP/ANALYSIS/", project.dirname,
                                  'piper_ngi')
        # Can't create these directories ahead of time of course
    elif exec_mode == "local":
        output_dir = os.path.join(project.base_path, "ANALYSIS",
                                  project.dirname, 'piper_ngi')
        safe_makedir(output_dir)
    else:
        raise ValueError('"exec_mode" must be one of "local", "sbatch" (value '
                         'was "{}")'.format(exec_mode))

    # Global Piper configuration
    piper_rootdir = config.get("piper", {}).get("path_to_piper_rootdir")

    # QScripts directory
    try:
        piper_qscripts_dir = (os.environ.get("PIPER_QSCRIPTS_DIR")
                              or os.environ.get("PIPER_QSCRIPTS")
                              or config['piper']['path_to_piper_qscripts'])
    except KeyError:
        raise ValueError(
            'Could not find Piper QScripts directory in config file or '
            'as environment variable ("PIPER_QSCRIPTS_DIR").')

    # Build Piper cl
    LOG.info('Building workflow command line(s) for project "{}" / workflow '
             '"{}"'.format(project, workflow_name))
    cl = workflows.return_cl_for_workflow(workflow_name=workflow_name,
                                          qscripts_dir_path=piper_qscripts_dir,
                                          setup_xml_path=setup_xml_path,
                                          genotype_file=genotype_file,
                                          output_dir=output_dir,
                                          exec_mode=exec_mode,
                                          generate_bqsr_bam=generate_bqsr_bam)
    # Blank out the file if it already exists
    safe_makedir(os.path.dirname(exit_code_path))
    open(exit_code_path, 'w').close()
    return cl
Example #7
def preprocess_analysis(analysis_object, fastq_files):
    analysis_path = os.path.join(analysis_object.project.base_path, "ANALYSIS", analysis_object.project.project_id, 'rna_ngi')
    safe_makedir(analysis_path)
    convenience_dir_path = os.path.join(analysis_path, 'fastqs')
    safe_makedir(convenience_dir_path)
    LOG.info("cleaning subfolder {}".format(convenience_dir_path))
    for link in glob.glob(os.path.join(convenience_dir_path, '*')):
        os.unlink(link)
    merge_fastq_files(convenience_dir_path, fastq_files)
    return convenience_dir_path
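merge_fastq_files is not shown on this page. Since preprocess_analysis clears the convenience directory of links before calling it, a plausible minimal implementation simply symlinks each fastq file into that directory (an assumption; the real helper may also concatenate files for samples split across runs):

import os

def merge_fastq_files(dest_dir, fastq_files):
    # Symlink each fastq into dest_dir under its basename (assumed behaviour).
    for fastq_file in fastq_files:
        os.symlink(os.path.abspath(fastq_file),
                   os.path.join(dest_dir, os.path.basename(fastq_file)))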
Example #8

def workflow_fastqc(input_files, output_dir, config):
    """The constructor of the FastQC command line.

    :param list input_files: The list of fastq files to analyze (may be 2D for read pairs)
    :param str output_dir: The path to the desired output directory (will be created)
    :param dict config: The parsed system/pipeline configuration file

    :returns: A list of command lines to be executed in the order given
    :rtype: list
    :raises ValueError: If the FastQC path is not given or is not on PATH
    """
    # Get the path to the fastqc command
    fastqc_path = config.get("paths", {}).get("fastqc")
    if not fastqc_path:
        if find_on_path("fastqc", config):
            LOG.info("fastqc found on PATH")
            fastqc_path = "fastqc"
        else:
            raise ValueError('Path to FastQC could not be found and it is not '
                             'available on PATH; cannot proceed with FastQC '
                             'workflow.')

    fastq_files = flatten(input_files) # FastQC cares not for your "read pairs"
    # Verify that we in fact need to run this on these files
    fastqc_output_file_tmpls = ("{}_fastqc.zip", "{}_fastqc.html")
    fastq_to_analyze = fastq_to_be_analysed(fastq_files, output_dir, fastqc_output_file_tmpls)
    # Construct the command lines
    num_threads = config.get("qc", {}).get("fastqc", {}).get("threads") or 1
    cl_list = []
    # fastqc commands
    for fastq_file_pair in fastq_to_analyze:
        # When building the fastqc command, softlink the fastq file being
        # processed into the qc_ngi folder, taking care to avoid name
        # collisions (i.e. the same sample run on two different flowcells but
        # on the same lane number). Run fastqc on the softlink and delete the
        # softlink straight away.
        fastq_file_original = fastq_file_pair[0]
        fastq_file_softlinked = fastq_file_pair[1]
        # add the softlink command
        cl_list.append('ln -s {original_file} {renamed_fastq_file}'.format(original_file=fastq_file_original,
                                                                           renamed_fastq_file=fastq_file_softlinked))
        # now the fastqc command (one per file)
        cl_list.append('{fastqc_path} -t {num_threads} -o {output_dir} '
                       '{fastq_files}'.format(output_dir=output_dir,
                                              fastqc_path=fastqc_path,
                                              num_threads=num_threads,
                                              fastq_files=fastq_file_softlinked))
        # remove the link to the fastq file
        cl_list.append('rm {renamed_fastq_file}'.format(renamed_fastq_file=fastq_file_softlinked))
    if cl_list:
        safe_makedir(output_dir)  # fastqc requires the output folder to exist, and the softlinks are created there
        # Module loading
        modules_to_load = get_all_modules_for_workflow("fastqc", config)
        mod_list = [ "module load {}".format(module) for module in modules_to_load ]
        if mod_list:
            cl_list = mod_list + cl_list
    if not cl_list:
        LOG.info("FastQC analysis not needed or input files were invalid.")
    return cl_list
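With a single fastq file to analyze and one module configured, the list returned above would look roughly like this (paths and module name hypothetical):

['module load FastQC/0.11.9',
 'ln -s /data/P123/P123_456_R1.fastq.gz /out/qc_ngi/P123_456_FCA_L001_R1.fastq.gz',
 'fastqc -t 1 -o /out/qc_ngi /out/qc_ngi/P123_456_FCA_L001_R1.fastq.gz',
 'rm /out/qc_ngi/P123_456_FCA_L001_R1.fastq.gz']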
Example #9
def preprocess_analysis(analysis_object, fastq_files):
    analysis_path = os.path.join(analysis_object.project.base_path, "ANALYSIS",
                                 analysis_object.project.project_id, 'rna_ngi')
    safe_makedir(analysis_path)
    convenience_dir_path = os.path.join(analysis_path, 'fastqs')
    safe_makedir(convenience_dir_path)
    LOG.info("cleaning subfolder {}".format(convenience_dir_path))
    for link in glob.glob(os.path.join(convenience_dir_path, '*')):
        os.unlink(link)
    merge_fastq_files(convenience_dir_path, fastq_files)
    return convenience_dir_path
Example #10
def create_sbatch_file(cl_list, project, sample, config):
    project_analysis_path = os.path.join(project.base_path, "ANALYSIS",
                                         project.project_id, "qc_ngi")
    log_dir_path = os.path.join(project_analysis_path, "logs")
    sbatch_dir_path = os.path.join(project_analysis_path, "sbatch")
    job_label = "{}-{}".format(project.project_id, sample)
    sbatch_file_path = os.path.join(sbatch_dir_path,
                                    "{}.sbatch".format(job_label))
    safe_makedir(log_dir_path)
    safe_makedir(sbatch_dir_path)
    # sbatch parameters
    try:
        slurm_project_id = config["environment"]["project_id"]
    except KeyError:
        raise RuntimeError(
            'No SLURM project id specified in configuration file '
            'for job "{}"'.format(job_label))
    slurm_queue = config.get("slurm", {}).get("queue") or "core"
    num_cores = config.get("slurm", {}).get("cores") or 16
    slurm_time = config.get("qc", {}).get("job_walltime", {}) or "3-00:00:00"
    slurm_out_log = os.path.join(log_dir_path,
                                 "{}_sbatch.out".format(job_label))
    slurm_err_log = os.path.join(log_dir_path,
                                 "{}_sbatch.err".format(job_label))
    for log_file in slurm_out_log, slurm_err_log:
        rotate_file(log_file)
    sbatch_text = SBATCH_HEADER.format(slurm_project_id=slurm_project_id,
                                       slurm_queue=slurm_queue,
                                       num_cores=num_cores,
                                       slurm_time=slurm_time,
                                       job_name="qc_{}".format(job_label),
                                       slurm_out_log=slurm_out_log,
                                       slurm_err_log=slurm_err_log)
    sbatch_text_list = sbatch_text.split("\n")
    sbatch_extra_params = config.get("slurm", {}).get("extra_params", {})
    for param, value in sbatch_extra_params.items():
        sbatch_text_list.append("#SBATCH {} {}\n\n".format(param, value))
    sbatch_text_list.append("echo -ne '\\n\\nExecuting command lines at '")
    sbatch_text_list.append("date")
    # Note that because these programs have such small output,
    # we're writing results directly to permanent storage and thus
    # it is not necessary to copy results back from anywhere
    sbatch_text_list.append("# Run the actual commands")
    for command_line_sublist in cl_list:
        for command_line in command_line_sublist:
            sbatch_text_list.append(command_line)
    sbatch_text_list.append("echo -ne '\\n\\nFinished execution at '")
    sbatch_text_list.append("date")
    rotate_file(sbatch_file_path)
    LOG.info("Writing sbatch file to {}".format(sbatch_file_path))
    with open(sbatch_file_path, 'w') as f:
        f.write("\n".join(sbatch_text_list))
    return sbatch_file_path
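SBATCH_HEADER is referenced but not defined on this page. Judging purely from the keyword arguments passed to .format() above, it is a template along these lines (a reconstruction, not the actual constant):

SBATCH_HEADER = """#!/bin/bash -l

#SBATCH -A {slurm_project_id}
#SBATCH -p {slurm_queue}
#SBATCH -n {num_cores}
#SBATCH -t {slurm_time}
#SBATCH -J {job_name}
#SBATCH -o {slurm_out_log}
#SBATCH -e {slurm_err_log}"""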
Example #11

 def test_do_links(self):
     src_tmp_dir = tempfile.mkdtemp()
     dst_tmp_dir = os.path.join(src_tmp_dir, "dst")
     safe_makedir(dst_tmp_dir)
     src_file_path = os.path.join(src_tmp_dir, "file1.txt")
     dst_file_path = os.path.join(dst_tmp_dir, "file1.txt")
     open(src_file_path, "w").close()
     do_hardlink([src_file_path], dst_tmp_dir)
     assert filecmp.cmp(src_file_path, dst_file_path)
     os.remove(dst_file_path)
     do_symlink([src_file_path], dst_tmp_dir)
     assert filecmp.cmp(src_file_path, dst_file_path)
Example #12

 def test_do_links(self):
     src_tmp_dir = tempfile.mkdtemp()
     dst_tmp_dir = os.path.join(src_tmp_dir, 'dst')
     safe_makedir(dst_tmp_dir)
     src_file_path = os.path.join(src_tmp_dir, 'file1.txt')
     dst_file_path = os.path.join(dst_tmp_dir, 'file1.txt')
     open(src_file_path, 'w').close()
     do_hardlink([src_file_path], dst_tmp_dir)
     assert filecmp.cmp(src_file_path, dst_file_path)
     os.remove(dst_file_path)
     do_symlink([src_file_path], dst_tmp_dir)
     assert filecmp.cmp(src_file_path, dst_file_path)
Example #13
 def test_do_links(self):
     src_tmp_dir = tempfile.mkdtemp()
     dst_tmp_dir = os.path.join(src_tmp_dir, 'dst')
     safe_makedir(dst_tmp_dir)
     src_file_path = os.path.join(src_tmp_dir, 'file1.txt')
     dst_file_path = os.path.join(dst_tmp_dir, 'file1.txt')
     open(src_file_path, 'w').close()
     do_hardlink([src_file_path], dst_tmp_dir)
     assert filecmp.cmp(src_file_path, dst_file_path)
     os.remove(dst_file_path)
     do_symlink([src_file_path], dst_tmp_dir)
     assert filecmp.cmp(src_file_path, dst_file_path)
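The do_hardlink and do_symlink helpers exercised by these tests are not shown; minimal implementations consistent with the assertions would be (a sketch, not the actual helpers):

import os

def do_hardlink(src_files, dst_dir):
    # Hard-link each source file into dst_dir under its basename.
    for src in src_files:
        os.link(src, os.path.join(dst_dir, os.path.basename(src)))

def do_symlink(src_files, dst_dir):
    # Symlink each source file into dst_dir under its basename.
    for src in src_files:
        os.symlink(os.path.abspath(src), os.path.join(dst_dir, os.path.basename(src)))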
Example #14
def create_sbatch_file(cl_list, project, sample, config):
    project_analysis_path = os.path.join(project.base_path,
                                         "ANALYSIS",
                                         project.project_id,
                                         "qc_ngi")
    log_dir_path = os.path.join(project_analysis_path, "logs")
    sbatch_dir_path = os.path.join(project_analysis_path, "sbatch")
    job_label = "{}-{}".format(project.project_id, sample)
    sbatch_file_path = os.path.join(sbatch_dir_path, "{}.sbatch".format(job_label))
    safe_makedir(log_dir_path)
    safe_makedir(sbatch_dir_path)
    # sbatch parameters
    try:
        slurm_project_id = config["environment"]["project_id"]
    except KeyError:
        raise RuntimeError('No SLURM project id specified in configuration file '
                           'for job "{}"'.format(job_label))
    slurm_queue = config.get("slurm", {}).get("queue") or "core"
    num_cores = config.get("slurm", {}).get("cores") or 16
    slurm_time = config.get("qc", {}).get("job_walltime", {}) or "1-00:00:00"
    slurm_out_log = os.path.join(log_dir_path, "{}_sbatch.out".format(job_label))
    slurm_err_log = os.path.join(log_dir_path, "{}_sbatch.err".format(job_label))
    for log_file in slurm_out_log, slurm_err_log:
        rotate_file(log_file)
    sbatch_text = SBATCH_HEADER.format(slurm_project_id=slurm_project_id,
                                       slurm_queue=slurm_queue,
                                       num_cores=num_cores,
                                       slurm_time=slurm_time,
                                       job_name="qc_{}".format(job_label),
                                       slurm_out_log=slurm_out_log,
                                       slurm_err_log=slurm_err_log)
    sbatch_text_list = sbatch_text.split("\n")
    sbatch_extra_params = config.get("slurm", {}).get("extra_params", {})
    for param, value in sbatch_extra_params.items():
        sbatch_text_list.append("#SBATCH {} {}\n\n".format(param, value))
    sbatch_text_list.append("echo -ne '\\n\\nExecuting command lines at '")
    sbatch_text_list.append("date")
    # Note that because these programs have such small output,
    # we're writing results directly to permanent storage and thus
    # it is not necessary to copy results back from anywhere
    sbatch_text_list.append("# Run the actual commands")
    for command_line_sublist in cl_list:
        for command_line in command_line_sublist:
            sbatch_text_list.append(command_line)
    sbatch_text_list.append("echo -ne '\\n\\nFinished execution at '")
    sbatch_text_list.append("date")
    rotate_file(sbatch_file_path)
    LOG.info("Writing sbatch file to {}".format(sbatch_file_path))
    with open(sbatch_file_path, 'w') as f:
        f.write("\n".join(sbatch_text_list))
    return sbatch_file_path
Example #15
def analyze(project, sample, config=None, config_file_path=None):
    """The main entry point for the qc pipeline."""
    LOG.info("Processing project/sample {}/{}".format(project, sample))

    # Two paths diverged in a yellow wood
    project_analysis_path = os.path.join(project.base_path,
                                         "ANALYSIS",
                                         project.project_id,
                                         "qc_ngi")
    # and sorry I could not travel both
    sample_analysis_path = os.path.join(project_analysis_path, sample.name)
    # and be one traveler, long I stood
    log_dir_path = os.path.join(project_analysis_path, "logs")
    # and looked down one as far as I could
    safe_makedir(sample_analysis_path)
    # To where it bent in the undergrowth
    safe_makedir(log_dir_path)
    # I need to go to sleep

    fastq_files_to_process = []
    # I suppose I -should- have quoted the other one
    src_fastq_base = os.path.join(project.base_path, "DATA",
                                  project.project_id, sample.name)
    # Whose woods these are I think I know
    for libprep in sample:
        # His house is in the village though
        for seqrun in libprep:
            # He will not see me stopping here
            for fastq_file in seqrun:
                # To watch
                path_to_src_fastq = os.path.join(src_fastq_base,
                                                 libprep.name,
                                                 seqrun.name,
                                                 fastq_file)
                # his woods
                fastq_files_to_process.append(path_to_src_fastq)
    # fill up
    paired_fastq_files = find_fastq_read_pairs(fastq_files_to_process).values()
    # with snow
    qc_cl_list = return_cls_for_workflow("qc", paired_fastq_files, sample_analysis_path)

    sbatch_file_path = create_sbatch_file(qc_cl_list, project, sample, config)
    try:
        slurm_job_id = queue_sbatch_file(sbatch_file_path) 
    except RuntimeError as e:
        LOG.error('Failed to queue qc sbatch file for project/sample '
                  '"{}"/"{}"!'.format(project, sample))
    else:
        LOG.info('Queued qc sbatch file for project/sample '
                 '"{}"/"{}": slurm job id {}'.format(project, sample, slurm_job_id))
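queue_sbatch_file is not included on this page; a sketch consistent with how it is used (returns the slurm job id, raises RuntimeError on failure) might look like this:

import re
import subprocess

def queue_sbatch_file(sbatch_file_path):
    # Submit the sbatch file and parse the job id from sbatch's stdout
    # ("Submitted batch job <id>"); assumed behaviour, not the real helper.
    proc = subprocess.Popen(["sbatch", sbatch_file_path],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    match = re.search(r"Submitted batch job (\d+)", stdout.decode())
    if not match:
        raise RuntimeError("Could not queue sbatch file {}: {}".format(
            sbatch_file_path, stderr.decode().strip()))
    return int(match.group(1))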
Example #16
def analyze(project, sample, quiet=False, config=None, config_file_path=None):
    """The main entry point for the qc pipeline."""
    ## TODO implement "quiet" feature
    ## TODO implement mailing on failure
    LOG.info("Launching qc analysis for project/sample {}/{}".format(
        project, sample))

    project_analysis_path = os.path.join(project.base_path, "ANALYSIS",
                                         project.project_id, "qc_ngi")
    sample_analysis_path = os.path.join(project_analysis_path, sample.name)
    log_dir_path = os.path.join(project_analysis_path, "logs")
    safe_makedir(sample_analysis_path)
    safe_makedir(log_dir_path)

    fastq_files_to_process = []
    src_fastq_base = os.path.join(project.base_path, "DATA",
                                  project.project_id, sample.name)
    for libprep in sample:
        for seqrun in libprep:
            for fastq_file in seqrun:
                path_to_src_fastq = os.path.join(src_fastq_base, libprep.name,
                                                 seqrun.name, fastq_file)
                fastq_files_to_process.append(path_to_src_fastq)
    paired_fastq_files = list(
        find_fastq_read_pairs(fastq_files_to_process).values())
    qc_cl_list = return_cls_for_workflow("qc", paired_fastq_files,
                                         sample_analysis_path)

    sbatch_file_path = create_sbatch_file(qc_cl_list, project, sample, config)
    try:
        slurm_job_id = queue_sbatch_file(sbatch_file_path)
    except RuntimeError as e:
        LOG.error('Failed to queue qc sbatch file for project/sample '
                  '"{}"/"{}"!'.format(project, sample))
    else:
        LOG.info('Queued qc sbatch file for project/sample '
                 '"{}"/"{}": slurm job id {}'.format(project, sample,
                                                     slurm_job_id))
        slurm_jobid_file = os.path.join(
            log_dir_path, "{}-{}.slurmjobid".format(project.project_id,
                                                    sample))
        LOG.info('Writing slurm job id "{}" to file "{}"'.format(
            slurm_job_id, slurm_jobid_file))
        try:
            with open(slurm_jobid_file, 'w') as f:
                f.write("{}\n".format(slurm_job_id))
        except IOError as e:
            LOG.warning('Could not write slurm job id for project/sample '
                        '{}/{} to file "{}" ({})'.format(
                            project, sample, slurm_jobid_file, e))
Example #17
def analyze(project, sample, quiet=False, config=None, config_file_path=None):
    """The main entry point for the qc pipeline."""
    ## TODO implement "quiet" feature
    ## TODO implement mailing on failure
    LOG.info("Launching qc analysis for project/sample {}/{}".format(project, sample))

    project_analysis_path = os.path.join(project.base_path,
                                         "ANALYSIS",
                                         project.project_id,
                                         "qc_ngi")
    sample_analysis_path = os.path.join(project_analysis_path, sample.name)
    log_dir_path = os.path.join(project_analysis_path, "logs")
    safe_makedir(sample_analysis_path)
    safe_makedir(log_dir_path)

    fastq_files_to_process = []
    src_fastq_base = os.path.join(project.base_path, "DATA",
                                  project.project_id, sample.name)
    for libprep in sample:
        for seqrun in libprep:
            for fastq_file in seqrun:
                path_to_src_fastq = os.path.join(src_fastq_base,
                                                 libprep.name,
                                                 seqrun.name,
                                                 fastq_file)
                fastq_files_to_process.append(path_to_src_fastq)
    paired_fastq_files = find_fastq_read_pairs(fastq_files_to_process).values()
    qc_cl_list = return_cls_for_workflow("qc", paired_fastq_files, sample_analysis_path)

    sbatch_file_path = create_sbatch_file(qc_cl_list, project, sample, config)
    try:
        slurm_job_id = queue_sbatch_file(sbatch_file_path)
    except RuntimeError as e:
        LOG.error('Failed to queue qc sbatch file for project/sample '
                  '"{}"/"{}"!'.format(project, sample))
    else:
        LOG.info('Queued qc sbatch file for project/sample '
                 '"{}"/"{}": slurm job id {}'.format(project, sample, slurm_job_id))
        slurm_jobid_file = os.path.join(log_dir_path,
                                        "{}-{}.slurmjobid".format(project.project_id,
                                                                  sample))
        LOG.info('Writing slurm job id "{}" to file "{}"'.format(slurm_job_id,
                                                                 slurm_jobid_file))
        try:
            with open(slurm_jobid_file, 'w') as f:
                f.write("{}\n".format(slurm_job_id))
        except IOError as e:
            LOG.warning('Could not write slurm job id for project/sample '
                        '{}/{} to file "{}" ({})'.format(project, sample,
                                                         slurm_jobid_file, e))
Example #18
def analyze(project, sample, config=None, config_file_path=None):
    """The main entry point for the qc pipeline."""
    LOG.info("Processing project/sample {}/{}".format(project, sample))

    # Two paths diverged in a yellow wood
    project_analysis_path = os.path.join(project.base_path, "ANALYSIS",
                                         project.project_id, "qc_ngi")
    # and sorry I could not travel both
    sample_analysis_path = os.path.join(project_analysis_path, sample.name)
    # and be one traveler, long I stood
    log_dir_path = os.path.join(project_analysis_path, "logs")
    # and looked down one as far as I could
    safe_makedir(sample_analysis_path)
    # To where it bent in the undergrowth
    safe_makedir(log_dir_path)
    # I need to go to sleep

    fastq_files_to_process = []
    # I suppose I -should- have quoted the other one
    src_fastq_base = os.path.join(project.base_path, "DATA",
                                  project.project_id, sample.name)
    # Whose woods these are I think I know
    for libprep in sample:
        # His house is in the village though
        for seqrun in libprep:
            # He will not see me stopping here
            for fastq_file in seqrun:
                # To watch
                path_to_src_fastq = os.path.join(src_fastq_base, libprep.name,
                                                 seqrun.name, fastq_file)
                # his woods
                fastq_files_to_process.append(path_to_src_fastq)
    # fill up
    paired_fastq_files = find_fastq_read_pairs(fastq_files_to_process).values()
    # with snow
    qc_cl_list = return_cls_for_workflow("qc", paired_fastq_files,
                                         sample_analysis_path)

    sbatch_file_path = create_sbatch_file(qc_cl_list, project, sample, config)
    try:
        slurm_job_id = queue_sbatch_file(sbatch_file_path)
    except RuntimeError as e:
        LOG.error('Failed to queue qc sbatch file for project/sample '
                  '"{}"/"{}"!'.format(project, sample))
    else:
        LOG.info('Queued qc sbatch file for project/sample '
                 '"{}"/"{}": slurm job id {}'.format(project, sample,
                                                     slurm_job_id))
Example #19
    def _slurm_script_from_command_line(
            self,
            command_line,
            working_dir,
            exit_code_path,
            job_name):
        """
        Create a SLURM script ready for submission based on the supplied command line and the parameters in this
        SlurmConnector instance.

        The created SLURM script will contain a job header as created by this SlurmConnector. The exit code of the
        command to be executed as a SLURM job will be written to a file. This file will be truncated or created once
        the job starts. The SLURM script will include the current time, so the file name should be unique between
        subsequent calls to this function.

        :param command_line: command line to execute in the SLURM job, formatted as a string
        :param working_dir: the directory in which to create the SLURM script (expected to exist)
        :param exit_code_path: path to the file where the exit code from the command should be stored
        :param job_name: the job name to use for the SLURM submission
        :return: the path to the created SLURM script
        """
        # create the script in the passed working directory
        slurm_script_dir = os.path.join(working_dir, "sbatch")
        safe_makedir(slurm_script_dir)
        slurm_script = os.path.join(
            slurm_script_dir, "{}.{}.sbatch".format(job_name, datetime.datetime.now().strftime("%s")))
        slurm_stdout = "{}.out".format(slurm_script)
        slurm_stderr = "{}.err".format(slurm_script)

        with open(slurm_script, "w") as fh:
            fh.write(
                SlurmConnector.JOB_HEADER_TEMPLATE.format(
                    slurm_job_name=job_name,
                    slurm_stdout=slurm_stdout,
                    slurm_stderr=slurm_stderr,
                    slurm_working_directory=self.slurm_parameters.get("slurm_working_directory", working_dir),
                    **self.slurm_parameters))
            # append any extra SLURM arguments passed
            for slurm_extra_arg in self.slurm_parameters.get("slurm_extra_args", []):
                fh.write("#SBATCH {}\n".format(slurm_extra_arg))

            fh.write("\necho \"\" > \"{}\"\n".format(exit_code_path))
            fh.write("{}\n".format(command_line))
            fh.write("echo \"$?\" > \"{}\"\n".format(exit_code_path))

        return slurm_script
Example #20

    def _slurm_script_from_command_line(
            self,
            command_line,
            working_dir,
            exit_code_path,
            job_name):
        """
        Create a SLURM script ready for submission based on the supplied command line and the parameters in this
        SlurmConnector instance.

        The created SLURM script will contain a job header as created by this SlurmConnector. The exit code of the
        command to be executed as a SLURM job will be written to a file. This file will be truncated or created once
        the job starts. The SLURM script will include the current time, so the file name should be unique between
        subsequent calls to this function.

        :param command_line: command line to execute in the SLURM job, formatted as a string
        :param working_dir: the directory in which to create the SLURM script (expected to exist)
        :param exit_code_path: path to the file where the exit code from the command should be stored
        :param job_name: the job name to use for the SLURM submission
        :return: the path to the created SLURM script
        """
        # create the script in the passed working directory
        slurm_script_dir = os.path.join(working_dir, "sbatch")
        safe_makedir(slurm_script_dir)
        slurm_script = os.path.join(
            slurm_script_dir, "{}.{}.sbatch".format(job_name, datetime.datetime.now().strftime("%s")))
        slurm_stdout = "{}.out".format(slurm_script)
        slurm_stderr = "{}.err".format(slurm_script)

        with open(slurm_script, "w") as fh:
            fh.write(
                SlurmConnector.JOB_HEADER_TEMPLATE.format(
                    slurm_job_name=job_name,
                    slurm_stdout=slurm_stdout,
                    slurm_stderr=slurm_stderr,
                    slurm_working_directory=self.slurm_parameters.get("slurm_working_directory", working_dir),
                    **self.slurm_parameters))
            # append any extra SLURM arguments passed
            for slurm_extra_arg in self.slurm_parameters.get("slurm_extra_args", []):
                fh.write("#SBATCH {}\n".format(slurm_extra_arg))

            fh.write("\necho \"\" > \"{}\"\n".format(exit_code_path))
            fh.write("{}\n".format(command_line))
            fh.write("echo \"$?\" > \"{}\"\n".format(exit_code_path))

        return slurm_script
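SlurmConnector.JOB_HEADER_TEMPLATE is not shown. From the keyword arguments supplied to .format() above it must contain at least the placeholders below; any account, partition or walltime directives would come from the **self.slurm_parameters expansion (this is a reconstruction under that assumption):

JOB_HEADER_TEMPLATE = """#!/bin/bash -l

#SBATCH -J {slurm_job_name}
#SBATCH -o {slurm_stdout}
#SBATCH -e {slurm_stderr}
#SBATCH -D {slurm_working_directory}"""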
Example #21
def run_multiqc(base_path, project_id, project_name):

    project_path=os.path.join(base_path, 'ANALYSIS', project_id)
    result_path=os.path.join(base_path, 'ANALYSIS', project_id, 'multiqc')
    safe_makedir(result_path)
    command=['multiqc', project_path, '-o', result_path, '-i', project_name, '-n', project_name, '-q', '-f']
    multiqc_stdout=''
    multiqc_stderr=''
    try:
        handle=execute_command_line(command)
        (multiqc_stdout, multiqc_stderr)=handle.communicate()
        if multiqc_stdout or multiqc_stderr:
            combined_output="{}\n{}".format(multiqc_stdout, multiqc_stderr)
            raise Exception(combined_output)

    except:
        raise
Example #22
def record_analysis_details(project, job_identifier):
    """Write a yaml file enumerating exactly which fastq files we've started
    analyzing.
    """
    output_file_path = os.path.join(project.base_path, "ANALYSIS",
                                    project.dirname, "piper_ngi", "logs",
                                    "{}.files".format(job_identifier))
    analysis_dict = {}
    proj_dict = analysis_dict[project.dirname] = {}
    for sample in project:
        samp_dict = proj_dict[sample.name] = {}
        for libprep in sample:
            lib_dict = samp_dict[libprep.name] = {}
            for seqrun in libprep:
                lib_dict[seqrun.name] = seqrun.fastq_files
    rotate_file(output_file_path)
    safe_makedir(os.path.dirname(output_file_path))
    with open(output_file_path, 'w') as f:
        f.write(yaml.dump(analysis_dict))
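For a project with a single sample, libprep and seqrun, the .files YAML written above would look something like this (all names hypothetical):

Y.Mom_15_01:
  P123_456:
    A:
      150219_ST-E00201_0055_AH00AAAA:
      - P123_456_S1_L001_R1_001.fastq.gz
      - P123_456_S1_L001_R2_001.fastq.gz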
Example #23

    def execute_process(self, command_line, working_dir=None, **extra_args):
        """
        Execute the supplied command line. If the working directory that should be used does not exist, it will be
        created.

        :param command_line: command line to be executed, can be a string or a list
        :param working_dir: directory to use as working directory when executing the command line. Default is to use the
        current working directory used by this ProcessConnector. Will be created if it does not exist
        :param extra_args: any additional parameters passed will be ignored
        :return: the process id (pid) of the launched process
        """
        working_dir = working_dir or self.cwd
        safe_makedir(working_dir)
        with chdir(working_dir):
            try:
                proc = execute_command_line(command_line, shell=False, cwd=working_dir)
                return proc.pid
            except RuntimeError:
                raise
Example #24
    def execute_process(self, command_line, working_dir=None, **extra_args):
        """
        Execute the supplied command line. If the working directory that should be used does not exist, it will be
        created.

        :param command_line: command line to be executed, can be a string or a list
        :param working_dir: directory to use as working directory when executing the command line. Default is to use the
        current working directory used by this ProcessConnector. Will be created if it does not exist
        :param extra_args: any additional parameters passed will be ignored
        :return: the process id (pid) of the launched process
        """
        working_dir = working_dir or self.cwd
        safe_makedir(working_dir)
        with chdir(working_dir):
            try:
                proc = execute_command_line(command_line, shell=False, cwd=working_dir)
                return proc.pid
            except RuntimeError:
                raise
Example #25
def write_batch_job(analysis_object, reference, fastq_dir_path, config=None, config_file_path=None):
    analysis_path = os.path.join(analysis_object.project.base_path, "ANALYSIS", analysis_object.project.project_id, 'rna_ngi')
    sbatch_dir_path = os.path.join(analysis_path, 'sbatch')
    safe_makedir(sbatch_dir_path)
    sbatch_file_path = os.path.join(sbatch_dir_path, 'rna_ngi.sh')
    fastq_glob_path = os.path.join(fastq_dir_path, '*_R{1,2}_*.fastq.gz')
    main_nextflow_path = config['analysis']['best_practice_analysis']['RNA-seq']['ngi_nf_path']
    nf_conf = config['analysis']['best_practice_analysis']['RNA-seq']['{}_ngi_conf'.format(analysis_object.sequencing_facility)]
    analysis_log_path = os.path.join(analysis_path, 'nextflow_output.log')
    exit_code_path = os.path.join(analysis_path, 'nextflow_exit_code.out')
    LOG.info("Writing sbatch file to {}".format(sbatch_file_path))
    with open(sbatch_file_path, 'w') as sb:
        sb.write("#!/bin/bash\n\n")
        sb.write("cd {an_path}\n".format(an_path=analysis_path))
        sb.write("> {ex_path}\n".format(ex_path=exit_code_path))
        sb.write("nextflow {ngi_rna_nf} --reads '{fastq_glob}' --genome '{ref}' -c {nf_conf} --outdir {an_path} &> {out_log}\n".format(
            ngi_rna_nf=main_nextflow_path, fastq_glob=fastq_glob_path, ref=reference, nf_conf=nf_conf, an_path=analysis_path, out_log=analysis_log_path))
        sb.write("echo $? > {ex_path}\n".format(ex_path=exit_code_path))
    LOG.info("NextFlow output will be logged at {}".format(analysis_log_path))
    return sbatch_file_path
Example #26
    def create_tsv_file(self, analysis_sample):
        """
        Create a tsv file containing the information needed by Sarek for starting the analysis. Will decide the path to
        the tsv file based on the sample information. If the path does not exist, it will be created.

        :raises: a SampleNotValidForAnalysisError if no libpreps or seqruns for the sample were eligible for analysis
        :param analysis_sample: a SarekAnalysisSample object representing the sample to create the tsv file for
        :return: the path to the created tsv file
        """
        rows = self.generate_tsv_file_contents(analysis_sample)
        if not rows:
            raise SampleNotValidForAnalysisError(
                analysis_sample.projectid, analysis_sample.sampleid,
                "no libpreps or seqruns to analyze")

        tsv_file = analysis_sample.sample_analysis_tsv_file()
        safe_makedir(os.path.dirname(tsv_file))
        with open(tsv_file, "w") as fh:
            writer = csv.writer(fh, dialect=csv.excel_tab)
            writer.writerows(rows)
        return tsv_file
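The rows returned by generate_tsv_file_contents are written tab-separated for Sarek. Assuming they follow Sarek's documented subject/sex/status/sample/lane/fastq1/fastq2 input layout, a single paired-end lane would produce a row like this (values hypothetical, columns tab-separated in the real file):

P123_456  XX  0  P123_456  1  /path/to/P123_456_R1.fastq.gz  /path/to/P123_456_R2.fastq.gz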
Example #27
    def execute_process(self, command_line, working_dir=None, exit_code_path=None, job_name=None):
        """
        Wrap the supplied command line in a SLURM script and submit it to the job queue.

        :param command_line: command line to execute in the SLURM job, formatted as a string
        :param working_dir: the directory in which to create the SLURM script and use as working directory for the job.
        If it does not already exist, it will be created.
        :param exit_code_path: path to the file where the exit code from the command should be stored. If not specified,
        the exit code will be sent to /dev/null
        :param job_name: the job name to use when submitting to the cluster. If not specified, it will be constructed
        from the command line
        :return: the slurm job id
        """
        exit_code_path = exit_code_path or os.devnull
        job_name = job_name or command_line.replace(" ", "_")[0:20]
        # create the working dir if it does not exist already
        working_dir = working_dir or self.cwd
        safe_makedir(working_dir)
        with chdir(working_dir):
            slurm_script = self._slurm_script_from_command_line(
                command_line,
                working_dir,
                exit_code_path,
                job_name)
            # submit the sbatch file
            sbatch_command_line = "sbatch {}".format(slurm_script)
            proc = execute_command_line(
                sbatch_command_line,
                shell=False,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            stdout, stderr = proc.communicate()
            try:
                # parse the slurm job id from the sbatch stdout
                slurm_job_id = re.match(r'Submitted batch job (\d+)', stdout).groups()[0]
                return slurm_job_id
            except AttributeError:
                raise RuntimeError(
                    'Could not submit sbatch job for workflow "{}": {}'.format(job_name, stderr))
Example #28
def run_multiqc(base_path, project_id, project_name, wait=False):

    project_path = os.path.join(base_path, 'ANALYSIS', project_id)
    result_path = os.path.join(base_path, 'ANALYSIS', project_id, 'multiqc')
    safe_makedir(result_path)
    command = [
        'multiqc', project_path, '-o', result_path, '-i', project_name, '-n',
        project_name, '-q', '-f'
    ]
    multiqc_stdout = ''
    multiqc_stderr = ''
    try:
        handle = execute_command_line(command)
        if wait:
            (multiqc_stdout, multiqc_stderr) = handle.communicate()
            if multiqc_stdout or multiqc_stderr:
                combined_output = "{}\n{}".format(multiqc_stdout,
                                                  multiqc_stderr)
                raise Exception(combined_output)

    except:
        raise
Example #29

    def create_tsv_file(self, analysis_sample):
        """
        Create a tsv file containing the information needed by Sarek for starting the analysis. Will decide the path to
        the tsv file based on the sample information. If the path does not exist, it will be created.

        :raises: a SampleNotValidForAnalysisError if no libpreps or seqruns for the sample were eligible for analysis
        :param analysis_sample: a SarekAnalysisSample object representing the sample to create the tsv file for
        :return: the path to the created tsv file
        """
        rows = self.generate_tsv_file_contents(analysis_sample)
        if not rows:
            raise SampleNotValidForAnalysisError(
                analysis_sample.projectid,
                analysis_sample.sampleid,
                "no libpreps or seqruns to analyze")

        tsv_file = analysis_sample.sample_analysis_tsv_file()
        safe_makedir(os.path.dirname(tsv_file))
        with open(tsv_file, "w") as fh:
            writer = csv.writer(fh, dialect=csv.excel_tab)
            writer.writerows(rows)
        return tsv_file
Example #30

    def execute_process(self, command_line, working_dir=None, exit_code_path=None, job_name=None):
        """
        Wrap the supplied command line in a SLURM script and submit it to the job queue.

        :param command_line: command line to execute in the SLURM job, formatted as a string
        :param working_dir: the directory in which to create the SLURM script and use as working directory for the job.
        If it does not already exist, it will be created.
        :param exit_code_path: path to the file where the exit code from the command should be stored. If not specified,
        the exit code will be sent to /dev/null
        :param job_name: the job name to use when submitting to the cluster. If not specified, it will be constructed
        from the command line
        :return: the slurm job id
        """
        exit_code_path = exit_code_path or os.devnull
        job_name = job_name or command_line.replace(" ", "_")[0:20]
        # create the working dir if it does not exist already
        working_dir = working_dir or self.cwd
        safe_makedir(working_dir)
        with chdir(working_dir):
            slurm_script = self._slurm_script_from_command_line(
                command_line,
                working_dir,
                exit_code_path,
                job_name)
            # submit the sbatch file
            sbatch_command_line = "sbatch {}".format(slurm_script)
            proc = execute_command_line(
                sbatch_command_line,
                shell=False,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            stdout, stderr = proc.communicate()
            try:
                # parse the slurm job id from the sbatch stdout
                slurm_job_id = re.match(r'Submitted batch job (\d+)', stdout).groups()[0]
                return slurm_job_id
            except AttributeError:
                raise RuntimeError(
                    'Could not submit sbatch job for workflow "{}": {}'.format(job_name, stderr))
Example #31

 def test_safe_makedir_dirtree(self):
     dir_tree = os.path.join(self.tmp_dir, "first", "second", "third")
     safe_makedir(dir_tree)
     assert os.path.exists(dir_tree)
Example #32
def workflow_fastqc(input_files, output_dir, config):
    """The constructor of the FastQC command line.

    :param list input_files: The list of fastq files to analyze (may be 2D for read pairs)
    :param str output_dir: The path to the desired output directory (will be created)
    :param dict config: The parsed system/pipeline configuration file

    :returns: A list of command lines to be executed in the order given
    :rtype: list
    :raises ValueError: If the FastQC path is not given or is not on PATH
    """
    # Get the path to the fastqc command
    fastqc_path = config.get("paths", {}).get("fastqc")
    if not fastqc_path:
        if find_on_path("fastqc", config):
            LOG.info("fastqc found on PATH")
            fastqc_path = "fastqc"
        else:
            raise ValueError('Path to FastQC could not be found and it is not '
                             'available on PATH; cannot proceed with FastQC '
                             'workflow.')

    fastq_files = flatten(input_files) # FastQC cares not for your "read pairs"
    # Verify that we in fact need to run this on these files
    fastqc_output_file_tmpls = ("{}_fastqc.zip", "{}_fastqc.html")
    fastq_to_analyze = set()
    for fastq_file in fastq_files:
        # Get the basename without extensions (.fastq, .fastq.gz)
        m = re.match(r'([\w-]+)\.fastq', os.path.basename(fastq_file))
        if not m:
            # fastq file name doesn't match expected pattern -- just process it
            fastq_to_analyze.add(fastq_file)
            continue
        else:
            fastq_file_base = m.groups()[0]
        for fastqc_output_file_tmpl in fastqc_output_file_tmpls:
            fastqc_output_file = \
                    os.path.join(output_dir, fastqc_output_file_tmpl.format(fastq_file_base))
            if not os.path.exists(fastqc_output_file):
                # Output file doesn't exist
                fastq_to_analyze.add(fastq_file)
            elif os.path.getctime(fastq_file) > os.path.getctime(fastqc_output_file):
                # Input file changed more recently than the output file
                fastq_to_analyze.add(fastq_file)

    num_threads = config.get("qc", {}).get("fastqc", {}).get("threads") or 1
    # Construct the command lines
    cl_list = []
    if fastq_to_analyze:
        safe_makedir(output_dir)
        # Module loading
        modules_to_load = get_all_modules_for_workflow("fastqc", config)
        for module in modules_to_load:
            cl_list.append("module load {}".format(module))
        # Execute fastqc
        cl_list.append('{fastqc_path} -t {num_threads} -o {output_dir} '
                       '{fastq_files}'.format(output_dir=output_dir,
                                              fastqc_path=fastqc_path,
                                              num_threads=num_threads,
                                              fastq_files=" ".join(fastq_to_analyze)))
    if not cl_list:
        LOG.info("FastQC analysis not needed or input files were invalid.")
    return cl_list
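flatten, used by both workflow_fastqc variants to collapse read-pair sublists into a flat file list, is not defined on this page; a one-level flatten consistent with that use would be (a sketch):

def flatten(nested):
    # Collapse one level of nesting, passing non-list items through unchanged.
    flat = []
    for item in nested:
        if isinstance(item, (list, tuple)):
            flat.extend(item)
        else:
            flat.append(item)
    return flat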
Example #33
def setup_analysis_directory_structure(fc_dir,
                                       projects_to_analyze,
                                       restrict_to_projects=None,
                                       restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None,
                                       config_file_path=None):
    """
    Copy and sort files from their CASAVA-demultiplexed flowcell structure
    into their respective project/sample/libPrep/FCIDs. This collects samples
    split across multiple flowcells.

    :param str fc_dir: The directory created by CASAVA for this flowcell.
    :param dict config: The parsed configuration file.
    :param set projects_to_analyze: A dict (of Project objects, or empty)
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param list restrict_to_projects: Specific projects within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively

    :returns: A dict of NGIProject objects that need to be run through the analysis pipeline
    :rtype: dict

    :raises KeyError: If a required configuration key is not available.
    """
    LOG.info(
        "Setting up analysis for demultiplexed data in source folder \"{}\"".
        format(fc_dir))
    if not restrict_to_projects: restrict_to_projects = []
    if not restrict_to_samples: restrict_to_samples = []
    config["quiet"] = quiet  # Hack because this is sometimes entered from a script
    # Check the flowcell path to establish which group owns it
    pattern = r".+({}|{})/.+".format(config["analysis"]["sthlm_root"],
                                     config["analysis"]["upps_root"])
    matches = re.match(pattern, fc_dir)
    if matches:
        flowcell_uppnexid = matches.group(1)
    else:
        LOG.error(
            "Cannot guess which project (sthlm/uppsala) the flowcell {} "
            "belongs to".format(fc_dir))
        raise RuntimeError("Could not determine the owner of flowcell "
                           "{}".format(fc_dir))

    analysis_top_dir = os.path.abspath(
        os.path.join(config["analysis"]["base_root"], flowcell_uppnexid,
                     config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError:
        LOG.error(
            'Error: Analysis top directory {} does not exist and could not '
            'be created.'.format(analysis_top_dir))
        raise
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(
        analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(
            fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warning(
            "No projects found in specified flowcell directory \"{}\"".format(
                fc_dir))

    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        samplesheet_path = fc_dir_structure.get("samplesheet_path")

        # parse the samplesheet and get the expected sample numbers assigned by bcl2fastq
        samplesheet_sample_numbers = get_sample_numbers_from_samplesheet(
            samplesheet_path) if samplesheet_path else None

        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warning(
                'Could not retrieve project id from Charon (record missing?). '
                'Using project name ("{}") as project id '
                '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
                                    project_id not in restrict_to_projects:
            LOG.debug(
                "Skipping project {} (not in restrict_to_projects)".format(
                    project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                            project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                               project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            if not project_dir == project_sl_dir and \
               not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
               not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name,
                                     dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name,
                                      ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files: safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name,
                                                dirname=sample_name)
            # Get the Library Prep ID for each file
            pattern = re.compile(".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")
            fastq_files = list(filter(pattern.match, sample.get('files', [])))
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to use assignment from SampleSheet
                samplesheet_sample = match_fastq_sample_number_to_samplesheet(
                    fq_file, samplesheet_sample_numbers, project_id)
                if samplesheet_sample is not None and \
                        samplesheet_sample[6] is not None:
                    libprep_name = samplesheet_sample[6]
                else:
                    LOG.debug(
                        'Unable to determine library prep from samplesheet '
                        'file; trying to determine it from Charon')
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(
                            project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(
                            libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        libpreps = charon_session.sample_get_libpreps(
                            project_id, sample_name).get('libpreps')
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but only one '
                                'library prep is present in Charon ("{}"). Using '
                                'this as the library prep.'.format(
                                    project_name, sample_name, fc_full_id,
                                    fq_file, libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but a fallback '
                                'libprep value of "{}" was supplied -- using this '
                                'value.'.format(project_name, sample_name,
                                                fc_full_id, fq_file,
                                                libprep_name))
                        else:
                            error_text = (
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon. Skipping '
                                'analysis.'.format(project_name, sample_name,
                                                   fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files: safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files: safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [
                            os.path.join(src_sample_dir, fastq_file)
                            for fastq_file in seqrun_obj.fastq_files
                        ]
                        seqrun_dst_dir = os.path.join(project_obj.base_path,
                                                      "DATA",
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info(
                            "Symlinking fastq files from {} to {}...".format(
                                src_sample_dir, seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = (
                                'Could not symlink files for project/sample/'
                                'libprep/seqrun {}/{}/{}/{}'.format(
                                    project_obj, sample_obj, libprep_obj,
                                    seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
    return projects_to_analyze
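For orientation, a hypothetical invocation of the function above; every path and config value here is illustrative only and not taken from any real pipeline configuration:

# Hypothetical usage sketch; all paths and values are illustrative.
parsed_config = {
    "analysis": {
        "sthlm_root": "a2015001",   # matched against the flowcell path
        "upps_root": "a2015002",
        "base_root": "/proj",
        "top_dir": "nobackup/NGI",
    },
}
projects = setup_analysis_directory_structure(
    fc_dir="/proj/a2015001/incoming/150219_ST-E00201_0123_AHFLOWCELLX",
    projects_to_analyze={},
    create_files=False,   # parse only; do not touch the filesystem
    config=parsed_config)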
Ejemplo n.º 34
0
 def test_safe_makedir_singledir(self):
     # Should test that this doesn't overwrite an existing dir as well
     single_dir = os.path.join(self.tmp_dir, "single_directory")
     safe_makedir(single_dir)
     assert(os.path.exists(single_dir))
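The safe_makedir function exercised by this test is defined elsewhere in the package. A minimal sketch consistent with how it is called throughout these examples (optional mode argument, no error if the directory already exists); the real implementation may differ:

import os

def safe_makedir(dname, mode=0o2770):
    # Create the directory (and any parents) only if it does not already
    # exist. Minimal sketch of the assumed helper; the real implementation
    # may handle races and permissions differently.
    if not os.path.isdir(dname):
        os.makedirs(dname, mode)
    return dname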
Ejemplo n.º 35
0
def workflow_fastq_screen(input_files, output_dir, config):
    # Get the path to the fastq_screen command
    fastq_screen_path = config.get("paths", {}).get("fastq_screen")
    if not fastq_screen_path:
        if find_on_path("fastq_screen", config):
            LOG.info("fastq_screen found on PATH")
            fastq_screen_path = "fastq_screen"
        else:
            raise ValueError(
                'Path to fastq_screen could not be found and it is not '
                'available on PATH; cannot proceed with fastq_screen '
                'workflow.')
    fastq_screen_config_path = config.get("qc", {}).get("fastq_screen",
                                                        {}).get("config_path")
    # We probably should have the path to the fastq_screen config file written down somewhere
    if not fastq_screen_config_path:
        LOG.warning('Path to fastq_screen config file not specified; assuming '
                    'it is in the same directory as the fastq_screen binary, '
                    'which may not hold on all systems.')
    else:
        try:
            open(fastq_screen_config_path, 'r').close()
        except IOError as e:
            raise ValueError(
                'Error when accessing fastq_screen configuration '
                'file as specified in pipeline config: "{}" (path '
                'given was {})'.format(e, fastq_screen_config_path))

    num_threads = config.get("qc", {}).get("fastq_screen",
                                           {}).get("threads") or 1
    subsample_reads = config.get("qc", {}).get("fastq_screen",
                                               {}).get("subsample_reads")

    # Determine which files need processing
    fastq_screen_output_file_tmpls = ("{}_screen.png", "{}_screen.txt")
    fastq_to_analyze = set()
    for elt in input_files:
        # This may be a read pair
        if isinstance(elt, list):
            # Changing list to tuple so we can use it in the set() (lists aren't hashable)
            elt = tuple(elt)
            # fastq_screen uses the name of the first read of the pair for output files
            fastq_file = elt[0]
        else:
            fastq_file = elt
        for fastq_screen_output_file_tmpl in fastq_screen_output_file_tmpls:
            fastq_screen_output_file = \
                    os.path.join(output_dir, fastq_screen_output_file_tmpl.format(fastq_file))
            if not os.path.exists(fastq_screen_output_file):
                # Output file doesn't exist
                fastq_to_analyze.add(elt)
            elif os.path.getmtime(fastq_file) > os.path.getmtime(
                    fastq_screen_output_file):
                # Input file modified more recently than output file
                fastq_to_analyze.add(elt)

    # Construct the command lines
    cl_list = []
    # fastq_screen commands
    for elt in fastq_to_analyze:
        cl = fastq_screen_path
        cl += " --aligner bowtie2"
        cl += " --outdir {}".format(output_dir)
        if subsample_reads: cl += " --subset {}".format(subsample_reads)
        if num_threads: cl += " --threads {}".format(num_threads)
        if fastq_screen_config_path:
            cl += " --conf {}".format(fastq_screen_config_path)
        if isinstance(elt, (list, tuple)):
            if len(elt) == 2:
                # Read pair; run fastq_screen on these together
                cl += (" --paired {}".format(" ".join(elt)))
            elif len(elt) == 1:
                cl += " " + elt[0]
            else:
                LOG.error('Files passed as list but more than two elements; '
                          'not a read pair? Skipping. ({})'.format(
                              " ".join(elt)))
                continue
        elif isinstance(elt, basestring):  # str or unicode (Python 2)
            cl += " " + elt
        else:
            LOG.error("Ignoring your weird input (not a string, not a list).")
            continue
        cl_list.append(cl)
    if cl_list:
        safe_makedir(output_dir)
        # Module loading
        modules_to_load = get_all_modules_for_workflow("fastq_screen", config)
        mod_list = [
            "module load {}".format(module) for module in modules_to_load
        ]
        if mod_list:
            cl_list = mod_list + cl_list
    else:
        LOG.info(
            "fastq_screen analysis not needed or input files were invalid.")
    return cl_list
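A hypothetical way to consume the returned command lines, writing them into a shell script much as the sbatch examples below do; the config values and file names are illustrative only:

# Hypothetical usage sketch; all values are illustrative.
config = {"paths": {"fastq_screen": "/usr/local/bin/fastq_screen"},
          "qc": {"fastq_screen": {"threads": 4, "subsample_reads": 200000}}}
input_files = [["P123_456_R1.fastq.gz", "P123_456_R2.fastq.gz"]]
cl_list = workflow_fastq_screen(input_files, "qc_ngi/fastq_screen", config)
with open("fastq_screen_commands.sh", "w") as f:
    f.write("#!/bin/bash\n")
    f.write("\n".join(cl_list) + "\n")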
Ejemplo n.º 36
0
def sbatch_piper_sample(command_line_list,
                        workflow_name,
                        project,
                        sample,
                        libprep=None,
                        restart_finished_jobs=False,
                        files_to_copy=None,
                        config=None,
                        config_file_path=None):
    """sbatch a piper sample-level workflow.

    :param list command_line_list: The list of command lines to execute (in order)
    :param str workflow_name: The name of the workflow to execute
    :param NGIProject project: The NGIProject
    :param NGISample sample: The NGISample
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)
    """
    job_identifier = "{}-{}-{}".format(project.project_id, sample,
                                       workflow_name)
    # Paths to the various data directories
    project_dirname = project.dirname
    perm_analysis_dir = os.path.join(project.base_path, "ANALYSIS",
                                     project_dirname, "piper_ngi", "")
    scratch_analysis_dir = os.path.join("$SNIC_TMP/ANALYSIS/", project_dirname,
                                        "piper_ngi", "")
    #ensure that the analysis dir exists
    safe_makedir(perm_analysis_dir)
    try:
        slurm_project_id = config["environment"]["project_id"]
    except KeyError:
        raise RuntimeError(
            'No SLURM project id specified in configuration file '
            'for job "{}"'.format(job_identifier))
    slurm_queue = config.get("slurm", {}).get("queue") or "core"
    num_cores = config.get("slurm", {}).get("cores") or 16
    slurm_time = config.get("piper", {}).get(
        "job_walltime", {}).get(workflow_name) or "4-00:00:00"
    slurm_out_log = os.path.join(perm_analysis_dir, "logs",
                                 "{}_sbatch.out".format(job_identifier))
    slurm_err_log = os.path.join(perm_analysis_dir, "logs",
                                 "{}_sbatch.err".format(job_identifier))
    for log_file in slurm_out_log, slurm_err_log:
        rotate_file(log_file)
    sbatch_text = create_sbatch_header(
        slurm_project_id=slurm_project_id,
        slurm_queue=slurm_queue,
        num_cores=num_cores,
        slurm_time=slurm_time,
        job_name="piper_{}".format(job_identifier),
        slurm_out_log=slurm_out_log,
        slurm_err_log=slurm_err_log)
    sbatch_text_list = sbatch_text.split("\n")
    sbatch_extra_params = config.get("slurm", {}).get("extra_params", {})
    for param, value in sbatch_extra_params.items():
        sbatch_text_list.append("#SBATCH {} {}\n\n".format(param, value))
    modules_to_load = config.get("piper", {}).get("load_modules", [])
    if modules_to_load:
        sbatch_text_list.append("\n# Load required modules for Piper")
        for module_name in modules_to_load:
            sbatch_text_list.append("module load {}".format(module_name))

    if not files_to_copy:
        project, files_to_copy = \
            collect_files_for_sample_analysis(project, sample, restart_finished_jobs)

    # Fastq files to copy
    fastq_src_dst_list = []
    directories_to_create = set()
    for libprep in sample:
        for seqrun in libprep:
            project_specific_path = os.path.join(project.dirname,
                                                 sample.dirname,
                                                 libprep.dirname,
                                                 seqrun.dirname)
            directories_to_create.add(
                os.path.join("$SNIC_TMP/DATA/", project_specific_path))
            for fastq in seqrun.fastq_files:
                src_file = os.path.join(project.base_path, "DATA",
                                        project_specific_path, fastq)
                dst_file = os.path.join("$SNIC_TMP/DATA/",
                                        project_specific_path, fastq)
                fastq_src_dst_list.append([src_file, dst_file])

    sbatch_text_list.append("echo -ne '\\n\\nCopying fastq files at '")
    sbatch_text_list.append("date")
    if fastq_src_dst_list:
        for directory in directories_to_create:
            sbatch_text_list.append("mkdir -p {}".format(directory))
        for src_file, dst_file in fastq_src_dst_list:
            sbatch_text_list.append("rsync -rptoDLv {} {}".format(
                src_file, dst_file))
    else:
        raise ValueError(('No valid fastq files available to process for '
                          'project/sample {}/{}'.format(project, sample)))

    # Pre-existing analysis files
    if files_to_copy:
        sbatch_text_list.append(
            "echo -ne '\\n\\nCopying pre-existing analysis files at '")
        sbatch_text_list.append("date")

        sbatch_text_list.append("if [ ! -d {output directory} ]; then")
        sbatch_text_list.append("mkdir {output directory} ")
        sbatch_text_list.append("fi")
        sbatch_text_list.append(("rsync -rptoDLv {input_files} "
                                 "{output_directory}/").format(
                                     input_files=" ".join(files_to_copy),
                                     output_directory=scratch_analysis_dir))
        # Delete pre-existing analysis files after copy
        sbatch_text_list.append(
            "echo -ne '\\n\\nDeleting pre-existing analysis files at '")
        sbatch_text_list.append("date")
        sbatch_text_list.append(
            "rm -rf {input_files}".format(input_files=" ".join(files_to_copy)))

    sbatch_text_list.append("echo -ne '\\n\\nExecuting command lines at '")
    sbatch_text_list.append("date")
    sbatch_text_list.append("# Run the actual commands")
    for command_line in command_line_list:
        sbatch_text_list.append(command_line)

    piper_status_file = create_exit_code_file_path(
        workflow_subtask=workflow_name,
        project_base_path=project.base_path,
        project_name=project.dirname,
        project_id=project.project_id,
        sample_id=sample.name)
    sbatch_text_list.append("\nPIPER_RETURN_CODE=$?")

    # Precalculate md5sums
    sbatch_text_list.append(
        'MD5FILES="$SNIC_TMP/ANALYSIS/{}/piper_ngi/05_processed_alignments/*{}*.bam'
        .format(project.project_id, sample.name))
    sbatch_text_list.append(
        '$SNIC_TMP/ANALYSIS/{}/piper_ngi/05_processed_alignments/*.table'.
        format(project.project_id))
    sbatch_text_list.append(
        '$SNIC_TMP/ANALYSIS/{}/piper_ngi/07_variant_calls/*{}*.genomic.vcf.gz'.
        format(project.project_id, sample.name))
    sbatch_text_list.append(
        '$SNIC_TMP/ANALYSIS/{}/piper_ngi/07_variant_calls/*{}*.annotated.vcf.gz"'
        .format(project.project_id, sample.name))
    sbatch_text_list.append('for f in $MD5FILES')
    sbatch_text_list.append('do')
    sbatch_text_list.append("    md5sum $f | awk '{printf $1}' > $f.md5 &")
    sbatch_text_list.append('done')
    sbatch_text_list.append('wait')

    #Copying back files
    sbatch_text_list.append(
        "echo -ne '\\n\\nCopying back the resulting analysis files at '")
    sbatch_text_list.append("date")
    sbatch_text_list.append("mkdir -p {}".format(perm_analysis_dir))
    sbatch_text_list.append("rsync -rptoDLv {}/ {}/".format(
        scratch_analysis_dir, perm_analysis_dir))
    sbatch_text_list.append("\nRSYNC_RETURN_CODE=$?")

    # Record job completion status
    sbatch_text_list.append("if [[ $RSYNC_RETURN_CODE == 0 ]]")
    sbatch_text_list.append("then")
    sbatch_text_list.append("  if [[ $PIPER_RETURN_CODE == 0 ]]")
    sbatch_text_list.append("  then")
    sbatch_text_list.append("    echo '0'> {}".format(piper_status_file))
    sbatch_text_list.append("  else")
    sbatch_text_list.append("    echo '1'> {}".format(piper_status_file))
    sbatch_text_list.append("  fi")
    sbatch_text_list.append("else")
    sbatch_text_list.append("  echo '2'> {}".format(piper_status_file))
    sbatch_text_list.append("fi")

    # Write the sbatch file
    sbatch_dir = os.path.join(perm_analysis_dir, "sbatch")
    safe_makedir(sbatch_dir)
    sbatch_outfile = os.path.join(sbatch_dir,
                                  "{}.sbatch".format(job_identifier))
    rotate_file(sbatch_outfile)
    with open(sbatch_outfile, 'w') as f:
        f.write("\n".join(sbatch_text_list))
    LOG.info("Queueing sbatch file {} for job {}".format(
        sbatch_outfile, job_identifier))
    # Queue the sbatch file
    p_handle = execute_command_line("sbatch {}".format(sbatch_outfile),
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
    p_out, p_err = p_handle.communicate()
    try:
        slurm_job_id = re.match(r'Submitted batch job (\d+)',
                                p_out).groups()[0]
    except AttributeError:
        raise RuntimeError('Could not submit sbatch job for workflow "{}": '
                           '{}'.format(job_identifier, p_err))
    # Detail which seqruns we've started analyzing so we can update statuses later
    record_analysis_details(project, job_identifier)
    return int(slurm_job_id)
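create_sbatch_header is imported from elsewhere in the pipeline. A minimal sketch of a header builder matching the keyword arguments used above; the real helper may emit additional directives:

def create_sbatch_header(slurm_project_id, slurm_queue, num_cores, slurm_time,
                         job_name, slurm_out_log, slurm_err_log):
    # Return the #SBATCH preamble as a single string. Minimal sketch only.
    return ("#!/bin/bash -l\n"
            "#SBATCH -A {}\n"
            "#SBATCH -p {}\n"
            "#SBATCH -n {}\n"
            "#SBATCH -t {}\n"
            "#SBATCH -J {}\n"
            "#SBATCH -o {}\n"
            "#SBATCH -e {}").format(slurm_project_id, slurm_queue, num_cores,
                                    slurm_time, job_name, slurm_out_log,
                                    slurm_err_log)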
Ejemplo n.º 37
0
def workflow_fastq_screen(input_files, output_dir, config):
    # Get the path to the fastq_screen command
    fastq_screen_path = config.get("paths", {}).get("fastq_screen")
    if not fastq_screen_path:
        if find_on_path("fastq_screen", config):
            LOG.info("fastq_screen found on PATH")
            fastq_screen_path = "fastq_screen"
        else:
            raise ValueError('Path to fastq_screen could not be found and it is not '
                             'available on PATH; cannot proceed with fastq_screen '
                             'workflow.')
    fastq_screen_config_path = config.get("qc", {}).get("fastq_screen", {}).get("config_path")
    # We probably should have the path to the fastq_screen config file written down somewhere
    if not fastq_screen_config_path:
        LOG.warning('Path to fastq_screen config file not specified; assuming '
                    'it is in the same directory as the fastq_screen binary, '
                    'which may not hold on all systems.')
    else:
        try:
            open(fastq_screen_config_path, 'r').close()
        except IOError as e:
            raise ValueError('Error when accessing fastq_screen configuration '
                             'file as specified in pipeline config: "{}" (path '
                             'given was {})'.format(e, fastq_screen_config_path))

    num_threads = config.get("qc", {}).get("fastq_screen", {}).get("threads") or 1
    subsample_reads = config.get("qc", {}).get("fastq_screen", {}).get("subsample_reads")

    # Determine which files need processing
    fastq_files = flatten(input_files)  # From version 1.5, fastq_screen no longer treats read pairs specially
    # Verify that we in fact need to run this on these files
    fastq_screen_output_file_tmpls = ["{}_screen.txt"]
    fastq_to_analyze = fastq_to_be_analysed(fastq_files, output_dir, fastq_screen_output_file_tmpls)
    # Construct the command lines
    cl_list = []
    # fastq_screen commands
    for fastq_file_pair in fastq_to_analyze:
        # When building the fastq_screen command, softlink the processed fastq
        # file into the qc_ngi folder to avoid name collisions (e.g., the same
        # sample run on two different flowcells but on the same lane number).
        # Run fastq_screen on the softlink and delete the softlink straight away.
        fastq_file_original   = fastq_file_pair[0]
        fastq_file_softlinked = fastq_file_pair[1]
        #add the command
        cl_list.append('ln -s {original_file} {renamed_fastq_file}'.format(original_file=fastq_file_original,
                                                                            renamed_fastq_file=fastq_file_softlinked))
        #now the fastq_screen command (one per file)
        cl = fastq_screen_path
        cl += " --aligner bowtie2"
        cl += " --outdir {}".format(output_dir)
        if subsample_reads: cl += " --subset {}".format(subsample_reads)
        if num_threads: cl += " --threads {}".format(num_threads)
        if fastq_screen_config_path: cl += " --conf {}".format(fastq_screen_config_path)
        cl += " {}".format(fastq_file_softlinked)
        cl_list.append(cl)
        #remove the link to the fastq file
        cl_list.append('rm {renamed_fastq_file}'.format(renamed_fastq_file=fastq_file_softlinked))
    if cl_list:
        safe_makedir(output_dir)
        # Module loading
        modules_to_load = get_all_modules_for_workflow("fastq_screen", config)
        mod_list = [ "module load {}".format(module) for module in modules_to_load ]
        if mod_list:
            cl_list = mod_list + cl_list
    else:
        LOG.info("fastq_screen analysis not needed or input files were invalid.")
    return cl_list
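fastq_to_be_analysed is not shown on this page. From its call sites it takes the fastq files, the output directory and the output-name templates, and returns (original, softlink) path pairs for files whose QC output is missing or stale. A minimal sketch under those assumptions; the real softlink naming scheme may differ:

import os

def fastq_to_be_analysed(fastq_files, output_dir, output_file_tmpls):
    to_analyze = []
    for fastq_file in fastq_files:
        # Hypothetical collision-avoiding link name: fold the last path
        # components (e.g. the flowcell directory) into the file name
        link_name = "_".join(os.path.normpath(fastq_file).split(os.sep)[-2:])
        softlink = os.path.join(output_dir, link_name)
        base = link_name.split(".fastq")[0]
        for tmpl in output_file_tmpls:
            output_file = os.path.join(output_dir, tmpl.format(base))
            if (not os.path.exists(output_file) or
                    os.path.getmtime(fastq_file) > os.path.getmtime(output_file)):
                to_analyze.append((fastq_file, softlink))
                break
    return to_analyze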
Ejemplo n.º 38
0
def build_setup_xml(project, sample, workflow, local_scratch_mode, config):
    """Build the setup.xml file for each project using the CLI-interface of
    Piper's SetupFileCreator.

    :param NGIProject project: The project to be converted.
    :param NGISample sample: the sample object
    :param str workflow: The name of the workflow to be executed
    :param bool local_scratch_mode: Whether the job will be run in scratch or permanent storage
    :param dict config: The (parsed) configuration file for this machine/environment.

    :raises ValueError: If a required configuration file value is missing
    :raises RuntimeError: If the setupFileCreator returns non-zero
    """
    LOG.info('Building Piper setup.xml file for project "{}" '
             'sample "{}"'.format(project, sample.name))

    if local_scratch_mode:
        project_top_level_dir = os.path.join("$SNIC_TMP/DATA/", project.dirname)
        analysis_dir = os.path.join("$SNIC_TMP/ANALYSIS/", project.dirname, "piper_ngi")
        # Can't create these directories ahead of time of course
    else:
        project_top_level_dir = os.path.join(project.base_path, "DATA", project.dirname)
        analysis_dir = os.path.join(project.base_path, "ANALYSIS", project.dirname, "piper_ngi")
        safe_makedir(analysis_dir)

    cl_args = {'project': project.dirname}
    try:
        charon_session = CharonSession()
        charon_project = charon_session.project_get(project.project_id)
        cl_args["sequencing_center"] = charon_project["sequencing_facility"]
    except (KeyError, CharonError) as e:
        LOG.warning('Could not determine sequencing center from Charon ({}); setting to "Unknown".'.format(e))
        cl_args["sequencing_center"] = "Unknown"
    cl_args["sequencing_tech"] = "Illumina"
    slurm_qos = config.get("slurm", {}).get("extra_params", {}).get("--qos")
    if slurm_qos:
        cl_args["qos"] = slurm_qos

    # TODO Eventually this will be loaded from e.g. Charon
    reference_genome = 'GRCh37'
    try:
        cl_args["reference_path"] = config['supported_genomes'][reference_genome]
        cl_args["uppmax_proj"] = config['environment']['project_id']
    except KeyError as e:
        error_msg = ("Could not load required information from "
                     "configuration file and cannot continue with project {}: "
                     "value \"{}\" missing".format(project, e.message))
        raise ValueError(error_msg)

    try:
        cl_args["sfc_binary"] = config['piper']['path_to_setupfilecreator']
    except KeyError:
        cl_args["sfc_binary"] = "setupFileCreator" # Assume setupFileCreator is on path

    # setup XML file is always stored in permanent analysis directory
    output_xml_filepath = os.path.join(project.base_path, "ANALYSIS",
                                       project.dirname, "piper_ngi", "setup_xml_files",
                                       "{}-{}-{}-setup.xml".format(project, sample, workflow))
    safe_makedir(os.path.dirname(output_xml_filepath))
    cl_args["output_xml_filepath"] = output_xml_filepath
    setupfilecreator_cl = ("{sfc_binary} "
                           "--output {output_xml_filepath} "
                           "--project_name {project} "
                           "--sequencing_platform {sequencing_tech} "
                           "--sequencing_center {sequencing_center} "
                           "--uppnex_project_id {uppmax_proj} "
                           "--reference {reference_path}").format(**cl_args)
    if "qos" in cl_args:
        setupfilecreator_cl += " --qos {qos}".format(**cl_args)
    for samp in project:
        for libprep in samp:
            for seqrun in libprep:
                sample_run_directory = os.path.join(project_top_level_dir, sample.dirname,
                                                    libprep.dirname, seqrun.dirname)
                for fastq_file_name in seqrun.fastq_files:
                    fastq_file = os.path.join(sample_run_directory, fastq_file_name)
                    setupfilecreator_cl += " --input_fastq {}".format(fastq_file)
    return (setupfilecreator_cl, output_xml_filepath)
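A hypothetical follow-up showing how the returned tuple might be used, assuming project, sample and config objects as above and that execute_command_line returns a Popen-style handle (as its use in the sbatch examples on this page suggests); the workflow name is illustrative:

import subprocess

setup_cl, setup_xml_path = build_setup_xml(project, sample, "merge_process_variantcall",
                                           local_scratch_mode=False, config=config)
p_handle = execute_command_line(setup_cl, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
p_out, p_err = p_handle.communicate()
if p_handle.returncode != 0:
    raise RuntimeError('setupFileCreator failed: {}'.format(p_err))
# setup_xml_path is then passed on to the piper command builder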
Ejemplo n.º 39
0
def workflow_fastqc(input_files, output_dir, config):
    """The constructor of the FastQC command line.

    :param list input_files: The list of fastq files to analyze (may be 2D for read pairs)
    :param str output_dir: The path to the desired output directory (will be created)
    :param dict config: The parsed system/pipeline configuration file

    :returns: A list of command lines to be executed in the order given
    :rtype: list
    :raises ValueError: If the FastQC path is not given or is not on PATH
    """
    # Get the path to the fastqc command
    fastqc_path = config.get("paths", {}).get("fastqc")
    if not fastqc_path:
        if find_on_path("fastqc", config):
            LOG.info("fastqc found on PATH")
            fastqc_path = "fastqc"
        else:
            raise ValueError('Path to FastQC could not be found and it is not '
                             'available on PATH; cannot proceed with FastQC '
                             'workflow.')

    fastq_files = flatten(
        input_files)  # FastQC cares not for your "read pairs"
    # Verify that we in fact need to run this on these files
    fastqc_output_file_tmpls = ("{}_fastqc.zip", "{}_fastqc.html")
    fastq_to_analyze = set()
    for fastq_file in fastq_files:
        # Get the basename without extensions (.fastq, .fastq.gz)
        m = re.match(r'([\w-]+)\.fastq', os.path.basename(fastq_file))
        if not m:
            # fastq file name doesn't match expected pattern -- just process it
            fastq_to_analyze.add(fastq_file)
            continue
        else:
            fastq_file_base = m.groups()[0]
        for fastqc_output_file_tmpl in fastqc_output_file_tmpls:
            fastqc_output_file = \
                    os.path.join(output_dir, fastqc_output_file_tmpl.format(fastq_file_base))
            if not os.path.exists(fastqc_output_file):
                # Output file doesn't exist
                fastq_to_analyze.add(fastq_file)
            elif os.path.getmtime(fastq_file) > os.path.getmtime(
                    fastqc_output_file):
                # Input file modified more recently than output file
                fastq_to_analyze.add(fastq_file)

    num_threads = config.get("qc", {}).get("fastqc", {}).get("threads") or 1
    # Construct the command lines
    cl_list = []
    if fastq_to_analyze:
        safe_makedir(output_dir)
        # Module loading
        modules_to_load = get_all_modules_for_workflow("fastqc", config)
        for module in modules_to_load:
            cl_list.append("module load {}".format(module))
        # Execute fastqc
        cl_list.append('{fastqc_path} -t {num_threads} -o {output_dir} '
                       '{fastq_files}'.format(
                           output_dir=output_dir,
                           fastqc_path=fastqc_path,
                           num_threads=num_threads,
                           fastq_files=" ".join(fastq_to_analyze)))
    if not cl_list:
        LOG.info("FastQC analysis not needed or input files were invalid.")
    return cl_list
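find_on_path, used by the guard at the top of this function, is defined elsewhere. A minimal sketch that checks $PATH directly; the real helper may also consult the environment modules listed in config:

import os

def find_on_path(binary_name, config=None):
    # Return True if binary_name is executable in some $PATH directory.
    for path_dir in os.environ.get("PATH", "").split(os.pathsep):
        candidate = os.path.join(path_dir, binary_name)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return True
    return False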
Ejemplo n.º 40
0
def workflow_fastqc(input_files, output_dir, config):
    """The constructor of the FastQC command line.

    :param list input_files: The list of fastq files to analyze (may be 2D for read pairs)
    :param str output_dir: The path to the desired output directory (will be created)
    :param dict config: The parsed system/pipeline configuration file

    :returns: A list of command lines to be executed in the order given
    :rtype: list
    :raises ValueError: If the FastQC path is not given or is not on PATH
    """
    # Get the path to the fastqc command
    fastqc_path = config.get("paths", {}).get("fastqc")
    if not fastqc_path:
        if find_on_path("fastqc", config):
            LOG.info("fastqc found on PATH")
            fastqc_path = "fastqc"
        else:
            raise ValueError('Path to FastQC could not be found and it is not '
                             'available on PATH; cannot proceed with FastQC '
                             'workflow.')

    fastq_files = flatten(
        input_files)  # FastQC cares not for your "read pairs"
    # Verify that we in fact need to run this on these files
    fastqc_output_file_tmpls = ("{}_fastqc.zip", "{}_fastqc.html")
    fastq_to_analyze = fastq_to_be_analysed(fastq_files, output_dir,
                                            fastqc_output_file_tmpls)
    # Construct the command lines
    num_threads = config.get("qc", {}).get("fastqc", {}).get("threads") or 1
    cl_list = []
    # fastqc commands
    for fastq_file_pair in fastq_to_analyze:
        # When building the fastqc command, softlink the processed fastq file
        # into the qc_ngi folder to avoid name collisions (e.g., the same sample
        # run on two different flowcells but on the same lane number). Run fastqc
        # on the softlink and delete the softlink straight away.
        fastq_file_original = fastq_file_pair[0]
        fastq_file_softlinked = fastq_file_pair[1]
        #add the command
        cl_list.append('ln -s {original_file} {renamed_fastq_file}'.format(
            original_file=fastq_file_original,
            renamed_fastq_file=fastq_file_softlinked))
        # Now the fastqc command (one per file)
        cl_list.append('{fastqc_path} -t {num_threads} -o {output_dir} '
                       '{fastq_files}'.format(
                           output_dir=output_dir,
                           fastqc_path=fastqc_path,
                           num_threads=num_threads,
                           fastq_files=fastq_file_softlinked))
        #remove the link to the fastq file
        cl_list.append('rm {renamed_fastq_file}'.format(
            renamed_fastq_file=fastq_file_softlinked))
    if cl_list:
        # Create the output folder up front: fastqc expects it to exist and
        # the softlinks above are created inside it
        safe_makedir(output_dir)
        # Module loading
        modules_to_load = get_all_modules_for_workflow("fastqc", config)
        mod_list = [
            "module load {}".format(module) for module in modules_to_load
        ]
        if mod_list:
            cl_list = mod_list + cl_list
    if not cl_list:
        LOG.info("FastQC analysis not needed or input files were invalid.")
    return cl_list
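The softlink dance above exists because the same sample sequenced on two flowcells can produce identically named fastq files, and FastQC derives its output names from the input file name. An illustrative (not the pipeline's actual) way to build a collision-free link name by folding the flowcell directory into the file name:

import os

# Hypothetical path; the layout mirrors the DATA tree built earlier on this page
fastq = "DATA/P123/P123_456/A.libprep/150219_AHFCX/P123_456_S1_L001_R1_001.fastq.gz"
parts = os.path.normpath(fastq).split(os.sep)
link_name = "{}_{}".format(parts[-2], parts[-1])
print(link_name)  # 150219_AHFCX_P123_456_S1_L001_R1_001.fastq.gz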
Ejemplo n.º 41
0
 def test_safe_makedir_dirtree(self):
     dir_tree = os.path.join(self.tmp_dir, "first", "second", "third")
     safe_makedir(dir_tree)
     assert(os.path.exists(dir_tree))
Ejemplo n.º 42
0
 def test_safe_makedir_singledir(self):
     # Should test that this doesn't overwrite an existing dir as well
     single_dir = os.path.join(self.tmp_dir, "single_directory")
     safe_makedir(single_dir)
     assert(os.path.exists(single_dir))
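These two tests assume a fixture providing self.tmp_dir. A hypothetical minimal harness they could live in:

import shutil
import tempfile
import unittest

class TestSafeMakedir(unittest.TestCase):
    # Hypothetical fixture implied by the tests above

    def setUp(self):
        self.tmp_dir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.tmp_dir)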
Ejemplo n.º 43
0
def build_setup_xml(project, sample, local_scratch_mode, config):
    """Build the setup.xml file for each project using the CLI-interface of
    Piper's SetupFileCreator.

    :param NGIProject project: The project to be converted.
    :param NGISample sample: the sample object
    :param bool local_scratch_mode: Whether the job will be run in scratch or permanent storage
    :param dict config: The (parsed) configuration file for this machine/environment.

    :raises ValueError: If a required configuration file value is missing
    :raises RuntimeError: If the setupFileCreator returns non-zero
    """
    LOG.info('Building Piper setup.xml file for project "{}" '
             'sample "{}"'.format(project, sample.name))

    if local_scratch_mode:
        project_top_level_dir = os.path.join("$SNIC_TMP/DATA/",
                                             project.dirname)
        analysis_dir = os.path.join("$SNIC_TMP/ANALYSIS/", project.dirname,
                                    "piper_ngi")
        # Can't create these directories ahead of time of course
    else:
        project_top_level_dir = os.path.join(project.base_path, "DATA",
                                             project.dirname)
        analysis_dir = os.path.join(project.base_path, "ANALYSIS",
                                    project.dirname, "piper_ngi")
        safe_makedir(analysis_dir, 0o770)
    ## TODO handle this elsewhere
    #safe_makedir(os.path.join(analysis_dir, "logs"))

    cl_args = {'project': project.dirname}
    cl_args["sequencing_center"] = "NGI"
    cl_args["sequencing_tech"] = "Illumina"
    ## TODO load these from (ngi_pipeline) config file
    cl_args["qos"] = "seqver"

    # Eventually this will be loaded from e.g. Charon
    reference_genome = 'GRCh37'
    try:
        cl_args["reference_path"] = config['supported_genomes'][
            reference_genome]
        cl_args["uppmax_proj"] = config['environment']['project_id']
    except KeyError as e:
        error_msg = ("Could not load required information from "
                     "configuration file and cannot continue with project {}: "
                     "value \"{}\" missing".format(project, e.message))
        raise ValueError(error_msg)

    try:
        cl_args["sfc_binary"] = config['piper']['path_to_setupfilecreator']
    except KeyError:
        cl_args["sfc_binary"] = "setupFileCreator"  # Assume setupFileCreator is on PATH

    # setup XML file is always stored in permanent analysis directory
    output_xml_filepath = os.path.join(
        project.base_path, "ANALYSIS", project.dirname, "piper_ngi",
        "setup_xml_files", "{}-{}-setup.xml".format(project, sample))
    safe_makedir(os.path.dirname(output_xml_filepath))
    cl_args["output_xml_filepath"] = output_xml_filepath
    setupfilecreator_cl = ("{sfc_binary} "
                           "--output {output_xml_filepath} "
                           "--project_name {project} "
                           "--sequencing_platform {sequencing_tech} "
                           "--sequencing_center {sequencing_center} "
                           "--uppnex_project_id {uppmax_proj} "
                           "--reference {reference_path} "
                           "--qos {qos}").format(**cl_args)
    for libprep in sample:
        for seqrun in libprep:
            sample_run_directory = os.path.join(project_top_level_dir,
                                                sample.dirname,
                                                libprep.dirname,
                                                seqrun.dirname)
            for fastq_file_name in seqrun.fastq_files:
                fastq_file = os.path.join(sample_run_directory,
                                          fastq_file_name)
                setupfilecreator_cl += " --input_fastq {}".format(fastq_file)
    return (setupfilecreator_cl, output_xml_filepath)
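For reference, an illustrative configuration fragment covering the keys this function reads; the values are site-specific and purely hypothetical:

config = {
    "supported_genomes": {"GRCh37": "/sw/data/reference/GRCh37.fasta"},
    "environment": {"project_id": "b2015999"},
    "piper": {"path_to_setupfilecreator": "/opt/piper/bin/setupFileCreator"},
}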
Ejemplo n.º 44
0
def sbatch_piper_sample(command_line_list, workflow_name, project, sample,
                        libprep=None, restart_finished_jobs=False, 
                        config=None, config_file_path=None):
    """sbatch a piper sample-level workflow.

    :param list command_line_list: The list of command lines to execute (in order)
    :param str workflow_name: The name of the workflow to execute
    :param NGIProject project: The NGIProject
    :param NGISample sample: The NGISample
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)
    """
    job_identifier = "{}-{}-{}".format(project.project_id, sample, workflow_name)
    # Paths to the various data directories
    project_dirname = project.dirname
    sample_dirname = sample.dirname
    perm_analysis_dir = os.path.join(project.base_path, "ANALYSIS", project_dirname, "piper_ngi")
    scratch_analysis_dir = os.path.join("$SNIC_TMP/ANALYSIS/", project_dirname, "piper_ngi")
    scratch_aln_dir = os.path.join(scratch_analysis_dir, "01_raw_alignments")
    scratch_qc_dir = os.path.join(scratch_analysis_dir, "02_preliminary_alignment_qc")
    #ensure that the analysis dir exists
    safe_makedir(perm_analysis_dir)
    try:
        slurm_project_id = config["environment"]["project_id"]
    except KeyError:
        raise RuntimeError('No SLURM project id specified in configuration file '
                           'for job "{}"'.format(job_identifier))
    slurm_queue = config.get("slurm", {}).get("queue") or "core"
    num_cores = config.get("slurm", {}).get("cores") or 8
    slurm_time = config.get("piper", {}).get("job_walltime", {}).get("workflow_name") or "4-00:00:00"
    slurm_out_log = os.path.join(perm_analysis_dir, "logs", "{}_sbatch.out".format(job_identifier))
    slurm_err_log = os.path.join(perm_analysis_dir, "logs", "{}_sbatch.err".format(job_identifier))
    for log_file in slurm_out_log, slurm_err_log:
        rotate_file(log_file)
    sbatch_text = create_sbatch_header(slurm_project_id=slurm_project_id,
                                       slurm_queue=slurm_queue,
                                       num_cores=num_cores,
                                       slurm_time=slurm_time,
                                       job_name="piper_{}".format(job_identifier),
                                       slurm_out_log=slurm_out_log,
                                       slurm_err_log=slurm_err_log)
    sbatch_text_list = sbatch_text.split("\n")
    sbatch_extra_params = config.get("slurm", {}).get("extra_params", {})
    for param, value in sbatch_extra_params.items():
        sbatch_text_list.append("#SBATCH {} {}\n\n".format(param, value))
    modules_to_load = config.get("piper", {}).get("load_modules", [])
    if modules_to_load:
        sbatch_text_list.append("\n# Load required modules for Piper")
        for module_name in modules_to_load:
            sbatch_text_list.append("module load {}".format(module_name))

    project, src_aln_files, src_alnqc_files = \
            collect_files_for_sample_analysis(project, sample, 
                                                restart_finished_jobs)

    # Fastq files to copy
    fastq_src_dst_list = []
    directories_to_create = set()
    for samp in project:  # distinct name so the `sample` argument is not shadowed
        for libprep in samp:
            for seqrun in libprep:
                project_specific_path = os.path.join(project.dirname,
                                                     samp.dirname,
                                                     libprep.dirname,
                                                     seqrun.dirname)
                directories_to_create.add(os.path.join("$SNIC_TMP/DATA/", project_specific_path))
                for fastq in seqrun.fastq_files:
                    src_file = os.path.join(project.base_path, "DATA", project_specific_path, fastq)
                    dst_file = os.path.join("$SNIC_TMP/DATA/", project_specific_path, fastq)
                    fastq_src_dst_list.append([src_file, dst_file])

    sbatch_text_list.append("echo -ne '\\n\\nCopying fastq files at '")
    sbatch_text_list.append("date")
    if fastq_src_dst_list:
        for directory in directories_to_create:
            sbatch_text_list.append("mkdir -p {}".format(directory))
        for src_file, dst_file in fastq_src_dst_list:
            sbatch_text_list.append("rsync -rptoDLv {} {}".format(src_file, dst_file))
    else:
        raise ValueError(('No valid fastq files available to process for '
                          'project/sample {}/{}'.format(project, sample)))

    # BAM files / Alignment QC files
    input_files_list = [ src_aln_files, src_alnqc_files ]
    output_dirs_list = [ scratch_aln_dir, scratch_qc_dir ]
    echo_text_list = ["Copying any pre-existing alignment files",
                      "Copying any pre-existing alignment qc files"]
    for echo_text, input_files, output_dir in zip(echo_text_list, input_files_list, output_dirs_list):
        if input_files:
            sbatch_text_list.append("echo -ne '\\n\\n{}' at ".format(echo_text))
            sbatch_text_list.append("date")
            sbatch_text_list.append("mkdir -p {}".format(output_dir))
            sbatch_text_list.append(("rsync -rptoDLv {input_files} "
                                     "{output_directory}/").format(input_files=" ".join(input_files),
                                                                  output_directory=output_dir))
    sbatch_text_list.append("echo -ne '\\n\\nExecuting command lines at '")
    sbatch_text_list.append("date")
    sbatch_text_list.append("# Run the actual commands")
    for command_line in command_line_list:
        sbatch_text_list.append(command_line)


    piper_status_file = create_exit_code_file_path(workflow_subtask=workflow_name,
                                                   project_base_path=project.base_path,
                                                   project_name=project.dirname,
                                                   project_id=project.project_id,
                                                   sample_id=sample.name)
    sbatch_text_list.append("\nPIPER_RETURN_CODE=$?")
    #sbatch_text_list.append("if [[ $PIPER_RETURN_CODE == 0 ]]")
    #sbatch_text_list.append("then")
    sbatch_text_list.append("echo -ne '\\n\\nCopying back the resulting analysis files at '")
    sbatch_text_list.append("date")
    sbatch_text_list.append("mkdir -p {}".format(perm_analysis_dir))
    sbatch_text_list.append("rsync -rptoDLv {}/ {}/".format(scratch_analysis_dir, perm_analysis_dir))
    sbatch_text_list.append("\nRSYNC_RETURN_CODE=$?")
    #sbatch_text_list.append("else")
    #sbatch_text_list.append("  echo -e '\\n\\nPiper job failed'")
    #sbatch_text_list.append("fi")

    # Record job completion status
    sbatch_text_list.append("if [[ $RSYNC_RETURN_CODE == 0 ]]")
    sbatch_text_list.append("then")
    sbatch_text_list.append("  if [[ $PIPER_RETURN_CODE == 0 ]]")
    sbatch_text_list.append("  then")
    sbatch_text_list.append("    echo '0'> {}".format(piper_status_file))
    sbatch_text_list.append("  else")
    sbatch_text_list.append("    echo '1'> {}".format(piper_status_file))
    sbatch_text_list.append("  fi")
    sbatch_text_list.append("else")
    sbatch_text_list.append("  echo '2'> {}".format(piper_status_file))
    sbatch_text_list.append("fi")

    # Write the sbatch file
    sbatch_dir = os.path.join(perm_analysis_dir, "sbatch")
    safe_makedir(sbatch_dir)
    sbatch_outfile = os.path.join(sbatch_dir, "{}.sbatch".format(job_identifier))
    rotate_file(sbatch_outfile)
    with open(sbatch_outfile, 'w') as f:
        f.write("\n".join(sbatch_text_list))
    LOG.info("Queueing sbatch file {} for job {}".format(sbatch_outfile, job_identifier))
    # Queue the sbatch file
    p_handle = execute_command_line("sbatch {}".format(sbatch_outfile),
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
    p_out, p_err = p_handle.communicate()
    try:
        slurm_job_id = re.match(r'Submitted batch job (\d+)', p_out).groups()[0]
    except AttributeError:
        raise RuntimeError('Could not submit sbatch job for workflow "{}": '
                           '{}'.format(job_identifier, p_err))
    # Detail which seqruns we've started analyzing so we can update statuses later
    record_analysis_details(project, job_identifier)
    return int(slurm_job_id)
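rotate_file, called on the logs and the sbatch file above, is defined elsewhere in the package. A minimal sketch of the assumed behavior (move an existing file aside so a fresh one can be written); the real naming scheme may differ:

import datetime
import os

def rotate_file(file_path):
    # If the file exists, rename it with a timestamp suffix instead of
    # overwriting it. Minimal sketch of the assumed helper.
    if os.path.exists(file_path):
        stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
        os.rename(file_path, "{}.rotated.{}".format(file_path, stamp))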
Ejemplo n.º 45
0
def workflow_fastq_screen(input_files, output_dir, config):
    # Get the path to the fastq_screen command
    fastq_screen_path = config.get("paths", {}).get("fastq_screen")
    if not fastq_screen_path:
        if find_on_path("fastq_screen", config):
            LOG.info("fastq_screen found on PATH")
            fastq_screen_path = "fastq_screen"
        else:
            raise ValueError(
                'Path to fastq_screen could not be found and it is not '
                'available on PATH; cannot proceed with fastq_screen '
                'workflow.')
    fastq_screen_config_path = config.get("qc", {}).get("fastq_screen",
                                                        {}).get("config_path")
    # We probably should have the path to the fastq_screen config file written down somewhere
    if not fastq_screen_config_path:
        LOG.warning('Path to fastq_screen config file not specified; assuming '
                    'it is in the same directory as the fastq_screen binary, '
                    'which may not hold on all systems.')
    else:
        try:
            open(fastq_screen_config_path, 'r').close()
        except IOError as e:
            raise ValueError(
                'Error when accessing fastq_screen configuration '
                'file as specified in pipeline config: "{}" (path '
                'given was {})'.format(e, fastq_screen_config_path))

    num_threads = config.get("qc", {}).get("fastq_screen",
                                           {}).get("threads") or 1
    subsample_reads = config.get("qc", {}).get("fastq_screen",
                                               {}).get("subsample_reads")

    # Determine which files need processing
    # From version 1.5, fastq_screen no longer treats read pairs specially
    fastq_files = flatten(input_files)
    # Verify that we in fact need to run this on these files
    fastq_screen_output_file_tmpls = ["{}_screen.txt"]
    fastq_to_analyze = fastq_to_be_analysed(fastq_files, output_dir,
                                            fastq_screen_output_file_tmpls)
    # Construct the command lines
    cl_list = []
    # fastq_screen commands
    for fastq_file_pair in fastq_to_analyze:
        # When building the fastq_screen command, softlink the processed fastq
        # file into the qc_ngi folder to avoid name collisions (e.g., the same
        # sample run on two different flowcells but on the same lane number).
        # Run fastq_screen on the softlink and delete the softlink straight away.
        fastq_file_original = fastq_file_pair[0]
        fastq_file_softlinked = fastq_file_pair[1]
        #add the command
        cl_list.append('ln -s {original_file} {renamed_fastq_file}'.format(
            original_file=fastq_file_original,
            renamed_fastq_file=fastq_file_softlinked))
        #now the fastq_screen command (one per file)
        cl = fastq_screen_path
        cl += " --aligner bowtie2"
        cl += " --outdir {}".format(output_dir)
        if subsample_reads: cl += " --subset {}".format(subsample_reads)
        if num_threads: cl += " --threads {}".format(num_threads)
        if fastq_screen_config_path:
            cl += " --conf {}".format(fastq_screen_config_path)
        cl += " {}".format(fastq_file_softlinked)
        cl_list.append(cl)
        # Remove the symlink to the fastq file
        cl_list.append('rm {renamed_fastq_file}'.format(
            renamed_fastq_file=fastq_file_softlinked))
    if cl_list:
        safe_makedir(output_dir)
        # Module loading
        modules_to_load = get_all_modules_for_workflow("fastq_screen", config)
        mod_list = [
            "module load {}".format(module) for module in modules_to_load
        ]
        if mod_list:
            cl_list = mod_list + cl_list
    else:
        LOG.info(
            "fastq_screen analysis not needed or input files were invalid.")
    return cl_list
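Called on its own, the function above might be driven like this sketch; it depends on module-level helpers (find_on_path, flatten, fastq_to_be_analysed, get_all_modules_for_workflow), so it only runs inside that module. The config layout is inferred from the .get() chains in the code, and every path is hypothetical:
config = {
    "paths": {"fastq_screen": "/opt/fastq_screen/fastq_screen"},
    "qc": {"fastq_screen": {"config_path": "/opt/fastq_screen/fastq_screen.conf",
                            "threads": 4,
                            "subsample_reads": 200000}},
}
# Nested lists (read pairs) are unrolled by flatten() before processing
cl_list = workflow_fastq_screen(
    input_files=[["/data/P123_456_L001_R1.fastq.gz",
                  "/data/P123_456_L001_R2.fastq.gz"]],
    output_dir="/analysis/qc_ngi/P123_456/fastq_screen",
    config=config)
# Each entry is a plain shell command (ln -s, fastq_screen, rm); joined with
# newlines they form the body of a batch script.
print("\n".join(cl_list))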
Example No. 46
def workflow_fastq_screen(input_files, output_dir, config):
    # Get the path to the fastq_screen command
    fastq_screen_path = config.get("paths", {}).get("fastq_screen")
    if not fastq_screen_path:
        if find_on_path("fastq_screen", config):
            LOG.info("fastq_screen found on PATH")
            fastq_screen_path = "fastq_screen"
        else:
            raise ValueError('Path to fastq_screen could not be found and it is not '
                             'available on PATH; cannot proceed with fastq_screen '
                             'workflow.')
    fastq_screen_config_path = config.get("qc", {}).get("fastq_screen", {}).get("config_path")
    # We probably should have the path to the fastq_screen config file written down somewhere
    if not fastq_screen_config_path:
        LOG.warn('Path to fastq_screen config file not specified; assuming '
                 'it is in the same directory as the fastq_screen binary, '
                 'even though I think this is probably a fairly bad '
                 'assumption to make. You\'re in charge, whatever.')
    else:
        try:
            open(fastq_screen_config_path, 'r').close()
        except IOError as e:
            raise ValueError('Error when accessing fastq_screen configuration '
                             'file as specified in pipeline config: "{}" (path '
                             'given was {})'.format(e, fastq_screen_config_path))

    num_threads = config.get("qc", {}).get("fastq_screen", {}).get("threads") or 1
    subsample_reads = config.get("qc", {}).get("fastq_screen", {}).get("subsample_reads")

    # Determine which files need processing
    fastq_screen_output_file_tmpls = ("{}_screen.png", "{}_screen.txt")
    fastq_to_analyze = set()
    for elt in input_files:
        # This may be a read pair
        if isinstance(elt, list):
            # Changing list to tuple so we can use it in the set() (lists aren't hashable)
            elt = tuple(elt)
            # fastq_screen uses the name of the first read of the pair for output files
            fastq_file = elt[0]
        else:
            fastq_file = elt
        for fastq_screen_output_file_tmpl in fastq_screen_output_file_tmpls:
            fastq_screen_output_file = \
                    os.path.join(output_dir, fastq_screen_output_file_tmpl.format(fastq_file))
            if not os.path.exists(fastq_screen_output_file):
                # Output file doesn't exist
                fastq_to_analyze.add(elt)
            elif os.path.getmtime(fastq_file) > os.path.getmtime(fastq_screen_output_file):
                # Input file modified more recently than output file
                fastq_to_analyze.add(elt)

    # Construct the command lines
    cl_list = []
    # fastq_screen commands
    for elt in fastq_to_analyze:
        cl = fastq_screen_path
        cl += " --aligner bowtie2"
        cl += " --outdir {}".format(output_dir)
        if subsample_reads: cl += " --subset {}".format(subsample_reads)
        if num_threads: cl += " --threads {}".format(num_threads)
        if fastq_screen_config_path: cl += " --conf {}".format(fastq_screen_config_path)
        if isinstance(elt, (list, tuple)):
            if len(elt) == 2:
                # Read pair; run fastq_screen on these together
                cl += (" --paired {}".format(" ".join(elt)))
            elif len(elt) == 1:
                cl += " " + elt[0]
            else:
                LOG.error('Files passed as list but more than two elements; '
                          'not a read pair? Skipping. ({})'.format(" ".join(elt)))
                continue
        elif isinstance(elt, basestring):
            cl += " " + elt
        else:
            LOG.error("Ignoring unrecognized input (not a string, not a list).")
            continue
        cl_list.append(cl)
    if cl_list:
        safe_makedir(output_dir)
        # Module loading
        modules_to_load = get_all_modules_for_workflow("fastq_screen", config)
        mod_list = [ "module load {}".format(module) for module in modules_to_load ]
        if mod_list:
            cl_list = mod_list + cl_list
    else:
        LOG.info("fastq_screen analysis not needed or input files were invalid.")
    return cl_list
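The staleness test in the middle of this variant (rerun if the output is missing, or if the input is newer than the output) is general enough to factor out. A small standalone sketch; the helper name is ours, not from the source:
import os

def needs_update(input_path, output_path):
    """Return True if the output is missing or older than its input."""
    if not os.path.exists(output_path):
        return True
    return os.path.getmtime(input_path) > os.path.getmtime(output_path)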
Example No. 47
def test_safe_makedir_dirtree(self):
    dir_tree = os.path.join(self.tmp_dir, 'first', 'second', 'third')
    safe_makedir(dir_tree)
    assert os.path.exists(dir_tree)
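safe_makedir itself does not appear in this listing; an implementation consistent with how it is called throughout (idempotent, optional mode argument) might look like this sketch:
import errno
import os

def safe_makedir(dname, mode=0o2770):
    """Create the directory (and any missing parents) if it does not exist;
    tolerate a concurrent creation race."""
    try:
        os.makedirs(dname, mode)
    except OSError as e:
        # Only swallow "already exists"; re-raise anything else
        if e.errno != errno.EEXIST:
            raise
    return dname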
Example No. 48
def setup_analysis_directory_structure(fc_dir, projects_to_analyze,
                                       restrict_to_projects=None, restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None, config_file_path=None):
    """
    Copy and sort files from their CASAVA-demultiplexed flowcell structure
    into their respective project/sample/libPrep/FCIDs. This collects samples
    split across multiple flowcells.

    :param str fc_dir: The directory created by CASAVA for this flowcell.
    :param dict config: The parsed configuration file.
    :param dict projects_to_analyze: A dict mapping project directories to NGIProject objects (may be empty)
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param list restrict_to_projects: Specific projects within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively

    :returns: A dict of NGIProject objects (keyed by project directory) that need to be run through the analysis pipeline
    :rtype: dict

    :raises KeyError: If a required configuration key is not available.
    """
    LOG.info("Setting up analysis for demultiplexed data in source folder \"{}\"".format(fc_dir))
    if not restrict_to_projects: restrict_to_projects = []
    if not restrict_to_samples: restrict_to_samples = []
    config["quiet"] = quiet # Hack because I enter here from a script sometimes
    pattern="(.+(?:{}|{}))\/.+".format(config["analysis"]["sthlm_root"], config["analysis"]["upps_root"])
    matches=re.match(pattern, fc_dir)
    if matches:
        flowcell_root=matches.group(1)
    else:
        LOG.error("cannot guess which project the flowcell {} belongs to".format(fc_dir))
        raise RuntimeError

    analysis_top_dir = os.path.abspath(os.path.join(flowcell_root, config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError as e:
        LOG.error('Error: Analysis top directory {} does not exist and could not '
                  'be created: {}'.format(analysis_top_dir, e))
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warn("No projects found in specified flowcell directory \"{}\"".format(fc_dir))
    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        samplesheet_path = fc_dir_structure.get("samplesheet_path")
        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warn('Could not retrieve project id from Charon (record missing?). '
                     'Using project name ("{}") as project id '
                     '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
                                    project_id not in restrict_to_projects:
            LOG.debug("Skipping project {} (not in restrict_to_projects)".format(project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            if not project_dir == project_sl_dir and \
               not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
               not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name, dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name, ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files: safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)
            # Get the Library Prep ID for each file
            pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")
            fastq_files = filter(pattern.match, sample.get('files', []))
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to parse from SampleSheet
                try:
                    if not samplesheet_path: raise ValueError("No samplesheet path available")
                    lane_num = re.match(r'[\w-]+_L\d{2}(\d)_\w+', fq_file).groups()[0]
                    libprep_name = determine_library_prep_from_samplesheet(samplesheet_path,
                                                                           project_original_name,
                                                                           sample_name,
                                                                           lane_num)
                except (AttributeError, IndexError, ValueError) as e:
                    LOG.debug('Unable to determine library prep from sample sheet file '
                              '("{}"); will try to determine it from Charon'.format(e))
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        libpreps = charon_session.sample_get_libpreps(project_id, sample_name).get('libpreps')
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warn('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                     'has no libprep information in Charon, but only one '
                                     'library prep is present in Charon ("{}"). Using '
                                     'this as the library prep.'.format(project_name,
                                                                        sample_name,
                                                                        fc_full_id,
                                                                        fq_file,
                                                                        libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warn('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                     'has no libprep information in Charon, but a fallback '
                                     'libprep value of "{}" was supplied -- using this '
                                     'value.'.format(project_name,
                                                     sample_name,
                                                     fc_full_id,
                                                     fq_file,
                                                     fallback_libprep))
                        else:
                            error_text = ('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                          'has no libprep information in Charon. Skipping '
                                          'analysis.'.format(project_name, sample_name,
                                                             fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files: safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files: safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [os.path.join(src_sample_dir, fastq_file) for
                                           fastq_file in seqrun_obj.fastq_files]
                        seqrun_dst_dir = os.path.join(project_obj.base_path, "DATA",
                                                      project_obj.dirname, sample_obj.dirname,
                                                      libprep_obj.dirname, seqrun_obj.dirname)
                        LOG.info("Symlinking fastq files from {} to {}...".format(src_sample_dir, seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = ('Could not symlink files for project/sample/'
                                          'libprep/seqrun {}/{}/{}/{}'.format(project_obj,
                                                                              sample_obj,
                                                                              libprep_obj,
                                                                              seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
    return projects_to_analyze
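A hedged driver sketch for the function above; the flowcell path and project id are hypothetical, config and LOG come from the surrounding module, and the returned dict maps project directories to NGIProject objects:
projects_to_analyze = setup_analysis_directory_structure(
    fc_dir="/srv/illumina/150219_ST-E00201_0059_BH2LTJCCXX",
    projects_to_analyze={},
    restrict_to_projects=["P123"],
    create_files=True,
    config=config)
for project_obj in projects_to_analyze.values():
    LOG.info("Project {} is ready for analysis".format(project_obj))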
Example No. 49
def build_setup_xml(project, config, sample=None, libprep_id=None, seqrun_id=None):
    """Build the setup.xml file for each project using the CLI-interface of
    Piper's SetupFileCreator.

    :param NGIProject project: The project to be converted.
    :param dict config: The (parsed) configuration file for this machine/environment.
    :param NGISample sample: The sample object
    :param str libprep_id: The id of the library prep
    :param str seqrun_id: The flowcell identifier

    :returns: None; sets the setup_xml_path and analysis_dir attributes on the project object.
    """

    if not seqrun_id:
        LOG.info('Building Piper setup.xml file for project "{}" '
                 'sample "{}"'.format(project, sample.name))
    else:
        LOG.info('Building Piper setup.xml file for project "{}" '
                 'sample "{}", libprep "{}", seqrun "{}"'.format(project, sample,
                                                                 libprep_id, seqrun_id))

    project_top_level_dir = os.path.join(project.base_path, "DATA", project.dirname)
    analysis_dir = os.path.join(project.base_path, "ANALYSIS", project.dirname)
    safe_makedir(analysis_dir, 0o770)
    safe_makedir(os.path.join(analysis_dir, "logs"))
    cl_args = {'project': project.name}
    # Load needed data from database
    try:
        # Information we need from the database:
        # - species / reference genome that should be used (hg19, mm9)
        # - analysis workflows to run (QC, DNA alignment, RNA alignment, variant calling, etc.)
        # - adapters to be trimmed (?)
        ## <open connection to project database>
        #reference_genome = proj_db.get('species')
        reference_genome = 'GRCh37'
        # sequencing_center = proj_db.get('Sequencing Center')
        cl_args["sequencing_center"] = "NGI"
    except Exception:
        ## Handle database connection failures here once we actually try to connect to it
        pass

    # Load needed data from configuration file
    try:
        cl_args["reference_path"] = config['supported_genomes'][reference_genome]
        cl_args["uppmax_proj"] = config['environment']['project_id']
    except KeyError as e:
        error_msg = ("Could not load required information from "
                     "configuration file and cannot continue with project {}: "
                     "value \"{}\" missing".format(project, e.message))
        raise ValueError(error_msg)

    try:
        cl_args["sfc_binary"] = config['piper']['path_to_setupfilecreator']
    except KeyError:
        # Assume setupFileCreator is on path
        cl_args["sfc_binary"] = "setupFileCreator"


    if not seqrun_id:
        output_xml_filepath = os.path.join(analysis_dir,
                                           "{}-{}-setup.xml".format(project, sample.name))
    else:
        output_xml_filepath = os.path.join(analysis_dir,
                                           "{}-{}-{}_setup.xml".format(project, sample.name, seqrun_id))

    cl_args["output_xml_filepath"]  = output_xml_filepath
    cl_args["sequencing_tech"]      = "Illumina"
    cl_args["qos"] = "seqver"
    setupfilecreator_cl = ("{sfc_binary} "
                           "--output {output_xml_filepath} "
                           "--project_name {project} "
                           "--sequencing_platform {sequencing_tech} "
                           "--sequencing_center {sequencing_center} "
                           "--uppnex_project_id {uppmax_proj} "
                           "--reference {reference_path} "
                           "--qos {qos}".format(**cl_args))
    # NOTE: Here I am assuming a particular dir structure; it would be wiser
    # to change the object type and have an Uppsala project.
    if not seqrun_id:
        # If seqrun_id is None, create a sample-level setup XML
        for libprep in sample:
            for seqrun in libprep:
                sample_run_directory = os.path.join(project_top_level_dir, sample.dirname, libprep.name, seqrun.name)
                for fastq_file_name in os.listdir(sample_run_directory):
                    # MARIO: I am not a big fan of this; IGN objects need to be
                    # created from the file system in order to avoid these things.
                    fastq_file = os.path.join(sample_run_directory, fastq_file_name)
                    setupfilecreator_cl += " --input_fastq {}".format(fastq_file)
    else:
        # Create an XML file for this specific sample run
        sample_run_directory = os.path.join(project_top_level_dir, sample.dirname, libprep_id, seqrun_id)
        for fastq_file_name in sample.libpreps[libprep_id].seqruns[seqrun_id].fastq_files:
            fastq_file = os.path.join(sample_run_directory, fastq_file_name)
            setupfilecreator_cl += " --input_fastq {}".format(fastq_file)

    try:
        LOG.info("Executing command line: {}".format(setupfilecreator_cl))
        subprocess.check_call(shlex.split(setupfilecreator_cl))
        project.setup_xml_path = output_xml_filepath
        project.analysis_dir   = analysis_dir
    except (subprocess.CalledProcessError, OSError, ValueError) as e:
        error_msg = ("Unable to produce setup XML file for project {}; "
                     "skipping project analysis. "
                     "Error is: \"{}\". .".format(project, e))
        raise RuntimeError(error_msg)
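On success the function sets setup_xml_path and analysis_dir on the project object for downstream Piper calls. A hedged usage sketch; the project/sample objects and the seqrun identifier are hypothetical:
try:
    build_setup_xml(project_obj, config, sample=sample_obj,
                    libprep_id="A",
                    seqrun_id="150219_ST-E00201_0059_BH2LTJCCXX")
    LOG.info("Wrote setup XML to {}".format(project_obj.setup_xml_path))
except RuntimeError as e:
    LOG.error("Setup XML creation failed: {}".format(e))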