Esempio n. 1
0
def test_flatten():
    """Check that ``flatten`` turns a partly nested list into a flat sequence.

    The input mixes two-element sub-lists with a bare string. The result is
    compared against the complete expected list so that missing or surplus
    elements are detected, not only mismatching ones.
    """
    nested_list = [['A', 'B'],
                   ['C', 'D'],
                   'E'
                   ]
    expected_list = ['A', 'B', 'C', 'D', 'E']
    # Materialise the result before comparing: iterating with zip() stops at
    # the shorter sequence, so a truncated or over-long result would pass.
    flattened_list = list(flatten(nested_list))
    assert flattened_list == expected_list
def workflow_fastqc(input_files, output_dir, config):
    """Build the shell command lines needed to run FastQC.

    Every fastq file selected by ``fastq_to_be_analysed`` contributes three
    commands: create a soft link, run FastQC on the link, remove the link.

    :param list input_files: The fastq files to analyze (may be nested, e.g. read pairs)
    :param str output_dir: The FastQC output directory (handed to safe_makedir when there is work to do)
    :param dict config: The parsed system/pipeline configuration file

    :returns: Command lines to execute in the order given (empty if nothing to run)
    :rtype: list
    :raises ValueError: If no FastQC path is configured and it is not on PATH
    """
    # Resolve the executable: configured path first, PATH lookup second
    fastqc_path = config.get("paths", {}).get("fastqc")
    if not fastqc_path:
        if not find_on_path("fastqc", config):
            raise ValueError('Path to FastQC could not be found and it is not '
                             'available on PATH; cannot proceed with FastQC '
                             'workflow.')
        LOG.info("fastqc found on PATH")
        fastqc_path = "fastqc"

    # Read pairs are irrelevant to FastQC, so work on the flat file list
    flat_fastqs = flatten(input_files)
    # Files whose outputs are considered up to date are filtered out here
    expected_output_tmpls = ("{}_fastqc.zip", "{}_fastqc.html")
    pending = fastq_to_be_analysed(flat_fastqs, output_dir, expected_output_tmpls)
    threads = config.get("qc", {}).get("fastqc", {}).get("threads") or 1

    link_tmpl = 'ln -s {original_file} {renamed_fastq_file}'
    fastqc_tmpl = '{fastqc_path} -t {num_threads} -o {output_dir} {fastq_files}'
    unlink_tmpl = 'rm {renamed_fastq_file}'

    commands = []
    for entry in pending:
        # Each entry is indexed as (original fastq, soft link name). The link
        # name avoids collisions between same-named files from different
        # flowcells; it only lives for the duration of the FastQC run.
        source_fastq = entry[0]
        linked_fastq = entry[1]
        commands.append(link_tmpl.format(original_file=source_fastq,
                                         renamed_fastq_file=linked_fastq))
        commands.append(fastqc_tmpl.format(fastqc_path=fastqc_path,
                                           num_threads=threads,
                                           output_dir=output_dir,
                                           fastq_files=linked_fastq))
        commands.append(unlink_tmpl.format(renamed_fastq_file=linked_fastq))

    if not commands:
        LOG.info("FastQC analysis not needed or input files were invalid.")
        return commands

    # The output folder must exist up front: both FastQC and the links need it
    safe_makedir(output_dir)
    module_commands = ["module load {}".format(name)
                       for name in get_all_modules_for_workflow("fastqc", config)]
    if module_commands:
        commands = module_commands + commands
    return commands
def workflow_qc(input_files, output_dir, config):
    """Collect the command lines of all generic QC workflows.

    The ``workflow_fastqc`` and ``workflow_fastq_screen`` functions are looked
    up in this module and each is called with its own sub-directory of
    ``output_dir``. A workflow raising ValueError is logged and skipped.

    :param list input_files: The fastq files to process (may be nested)
    :param str output_dir: Parent directory for the per-workflow output folders
    :param dict config: The parsed system/pipeline configuration file

    :returns: One list of command lines per workflow, plus an md5 command
              pair when ``qc.make_md5`` is set in the config
    :rtype: list
    """
    this_module = sys.modules[__name__]
    command_groups = []
    for name in ("fastqc", "fastq_screen"):
        builder_name = "workflow_{}".format(name)
        try:
            builder = getattr(this_module, builder_name)
            workflow_dir = os.path.join(output_dir, name)
            commands = builder(input_files, workflow_dir, config)
            command_groups.append(commands)
        except ValueError as e:
            message = ('Could not create command line for workflow '
                       '"{}" ({})').format(name, e)
            LOG.error(message)

    # Optionally emit shell commands writing an .md5 file next to each fastq
    make_md5 = config.get("qc", {}).get("make_md5", False)
    if make_md5:
        all_fastqs = " ".join(flatten(input_files))
        md5_fls = 'files_for_md5="{}"'.format(all_fastqs)
        md5_cmd = "for fl in $files_for_md5; do md5sum $fl | awk '{printf $1}' > ${fl}.md5; done"
        command_groups.append([md5_fls, md5_cmd])
    return command_groups
Esempio n. 4
0
def workflow_qc(input_files, output_dir, config):
    """Assemble the commands for the generic QC step (FastQC and fastq_screen).

    For each workflow name the matching ``workflow_<name>`` function of this
    module is resolved and invoked with ``<output_dir>/<name>`` as its output
    directory. ValueErrors from a workflow are logged; the loop then goes on.

    :returns: A list holding the command list of each successful workflow,
              followed by ``[md5_fls, md5_cmd]`` if ``qc.make_md5`` is truthy
    :rtype: list
    """
    error_tmpl = 'Could not create command line for workflow "{}" ({})'
    collected = []
    for wf in ("fastqc", "fastq_screen"):
        fn_name = "workflow_{}".format(wf)
        try:
            wf_function = getattr(sys.modules[__name__], fn_name)
            collected.append(
                wf_function(input_files, os.path.join(output_dir, wf), config))
        except ValueError as e:
            LOG.error(error_tmpl.format(wf, e))

    qc_settings = config.get("qc", {})
    if not qc_settings.get("make_md5", False):
        return collected

    # md5 requested: one variable assignment plus one shell loop over it
    md5_fls = 'files_for_md5="{}"'.format(" ".join(flatten(input_files)))
    md5_cmd = "for fl in $files_for_md5; do md5sum $fl | awk '{printf $1}' > ${fl}.md5; done"
    collected.append([md5_fls, md5_cmd])
    return collected
Esempio n. 5
0
def workflow_fastqc(input_files, output_dir, config):
    """Build the command lines for a single FastQC run over all input files.

    :param list input_files: The fastq files to analyze (may be nested, e.g. read pairs)
    :param str output_dir: The FastQC output directory (created by an emitted ``mkdir -p``)
    :param dict config: The parsed system/pipeline configuration file

    :returns: Module-load commands, the mkdir command and the FastQC command, in that order
    :rtype: list
    :raises ValueError: If no FastQC path is configured and it is not on PATH
    """
    # Prefer the configured executable, fall back to whatever is on PATH
    fastqc_path = config.get("paths", {}).get("fastqc")
    if not fastqc_path:
        if not find_on_path("fastqc", config):
            raise ValueError('Path to FastQC could not be found and it is not '
                             'available on PATH; cannot proceed with FastQC '
                             'workflow.')
        LOG.info("fastqc found on PATH")
        fastqc_path = "fastqc"

    threads = config.get("qc", {}).get("fastqc", {}).get("threads") or 1
    # FastQC treats every file independently, so pairing is dropped
    flat_fastqs = flatten(input_files)

    # Environment modules go first so the tool is available when invoked
    commands = ["module load {}".format(name)
                for name in get_all_modules_for_workflow("fastqc", config)]
    commands.append('mkdir -p {output_dir}'.format(output_dir=output_dir))
    fastqc_tmpl = '{fastqc_path} -t {num_threads} -o {output_dir} {fastq_files}'
    commands.append(fastqc_tmpl.format(fastqc_path=fastqc_path,
                                       num_threads=threads,
                                       output_dir=output_dir,
                                       fastq_files=" ".join(flat_fastqs)))
    return commands
Esempio n. 6
0
def workflow_fastqc(input_files, output_dir, config):
    """Create the list of shell commands that runs FastQC on the input files.

    :param list input_files: The fastq files to analyze (may be nested, e.g. read pairs)
    :param str output_dir: The FastQC output directory (a ``mkdir -p`` for it is emitted)
    :param dict config: The parsed system/pipeline configuration file

    :returns: Command lines to be executed in the order given
    :rtype: list
    :raises ValueError: If no FastQC path is configured and it is not on PATH
    """
    configured_path = config.get("paths", {}).get("fastqc")
    if configured_path:
        fastqc_path = configured_path
    elif find_on_path("fastqc", config):
        LOG.info("fastqc found on PATH")
        fastqc_path = "fastqc"
    else:
        raise ValueError('Path to FastQC could not be found and it is not '
                         'available on PATH; cannot proceed with FastQC '
                         'workflow.')

    num_threads = config.get("qc", {}).get("fastqc", {}).get("threads") or 1
    # Read pairs are flattened: all files go onto one FastQC command line
    all_fastqs = flatten(input_files)

    load_cmds = []
    for module_name in get_all_modules_for_workflow("fastqc", config):
        load_cmds.append("module load {}".format(module_name))

    mkdir_cmd = 'mkdir -p {output_dir}'.format(output_dir=output_dir)
    fastqc_cmd = ('{fastqc_path} -t {num_threads} -o {output_dir} '
                  '{fastq_files}').format(fastqc_path=fastqc_path,
                                          num_threads=num_threads,
                                          output_dir=output_dir,
                                          fastq_files=" ".join(all_fastqs))
    return load_cmds + [mkdir_cmd, fastqc_cmd]
def workflow_fastq_screen(input_files, output_dir, config):
    """Build the shell command lines needed to run fastq_screen.

    Every fastq file selected by ``fastq_to_be_analysed`` contributes three
    commands: create a soft link, run fastq_screen on the link, remove the link.

    :param list input_files: The fastq files to screen (may be nested, e.g. read pairs)
    :param str output_dir: The fastq_screen output directory (handed to safe_makedir when there is work to do)
    :param dict config: The parsed system/pipeline configuration file

    :returns: Command lines to execute in the order given (empty if nothing to run)
    :rtype: list
    :raises ValueError: If fastq_screen cannot be located, or the configured
                        fastq_screen config file cannot be opened
    """
    # Resolve the executable: configured path first, PATH lookup second
    fastq_screen_path = config.get("paths", {}).get("fastq_screen")
    if not fastq_screen_path:
        if not find_on_path("fastq_screen", config):
            raise ValueError('Path to fastq_screen could not be found and it is not '
                             'available on PATH; cannot proceed with fastq_screen '
                             'workflow.')
        LOG.info("fastq_screen found on PATH")
        fastq_screen_path = "fastq_screen"

    fastq_screen_config_path = config.get("qc", {}).get("fastq_screen", {}).get("config_path")
    if fastq_screen_config_path:
        # Fail early if the configured file cannot be opened for reading
        try:
            with open(fastq_screen_config_path, 'r'):
                pass
        except IOError as e:
            raise ValueError('Error when accessing fastq_screen configuration '
                             'file as specified in pipeline config: "{}" (path '
                             'given was {})'.format(e, fastq_screen_config_path))
    else:
        LOG.warning('Path to fastq_screen config file not specified; assuming '
                 'it is in the same directory as the fastq_screen binary, '
                 'even though I think this is probably a fairly bad '
                 'assumption to make. You\'re in charge, whatever.')

    screen_settings = config.get("qc", {}).get("fastq_screen", {})
    num_threads = screen_settings.get("threads") or 1
    subsample_reads = config.get("qc", {}).get("fastq_screen", {}).get("subsample_reads")

    # Files are handled one by one, so read pairing is dropped
    flat_fastqs = flatten(input_files)
    # Files whose outputs are considered up to date are filtered out here
    expected_output_tmpls = ["{}_screen.txt"]
    pending = fastq_to_be_analysed(flat_fastqs, output_dir, expected_output_tmpls)

    commands = []
    for entry in pending:
        # Each entry is indexed as (original fastq, soft link name). The link
        # name avoids collisions between same-named files from different
        # flowcells; it only lives for the duration of the run.
        source_fastq = entry[0]
        linked_fastq = entry[1]
        commands.append('ln -s {original_file} {renamed_fastq_file}'.format(
            original_file=source_fastq, renamed_fastq_file=linked_fastq))

        # Assemble the fastq_screen call from its optional pieces
        parts = [fastq_screen_path,
                 "--aligner bowtie2",
                 "--outdir {}".format(output_dir)]
        if subsample_reads:
            parts.append("--subset {}".format(subsample_reads))
        if num_threads:
            parts.append("--threads {}".format(num_threads))
        if fastq_screen_config_path:
            parts.append("--conf {}".format(fastq_screen_config_path))
        parts.append("{}".format(linked_fastq))
        commands.append(" ".join(parts))

        commands.append('rm {renamed_fastq_file}'.format(renamed_fastq_file=linked_fastq))

    if not commands:
        LOG.info("fastq_screen analysis not needed or input files were invalid.")
        return commands

    safe_makedir(output_dir)
    module_commands = ["module load {}".format(name)
                       for name in get_all_modules_for_workflow("fastq_screen", config)]
    if module_commands:
        commands = module_commands + commands
    return commands
Esempio n. 8
0
def workflow_fastqc(input_files, output_dir, config):
    """The constructor of the FastQC command line.

    Only fastq files that lack an expected FastQC output in ``output_dir``,
    or whose change time is newer than that output, are put on the command
    line. Files whose name does not look like ``<name>.fastq...`` are always
    processed.

    :param list input_files: The list of fastq files to analyze (may be 2D for read pairs)
    :param str output_dir: The path to the desired output directory (handed to safe_makedir when there is work to do)
    :param dict config: The parsed system/pipeline configuration file

    :returns: A list of command lines to be executed in the order given (empty if nothing to run)
    :rtype: list
    :raises ValueError: If the FastQC path is not given or is not on PATH
    """
    # Get the path to the fastqc command
    fastqc_path = config.get("paths", {}).get("fastqc")
    if not fastqc_path:
        if find_on_path("fastqc", config):
            LOG.info("fastqc found on PATH")
            fastqc_path = "fastqc"
        else:
            raise ValueError('Path to FastQC could not be found and it is not '
                             'available on PATH; cannot proceed with FastQC '
                             'workflow.')

    fastq_files = flatten(input_files) # FastQC cares not for your "read pairs"
    # Verify that we in fact need to run this on these files
    fastqc_output_file_tmpls = ("{}_fastqc.zip", "{}_fastqc.html")
    fastq_to_analyze = set()
    for fastq_file in fastq_files:
        # Get the basename without extensions (.fastq, .fastq.gz). The dot is
        # escaped so that only a literal ".fastq" suffix counts as a match.
        m = re.match(r'([\w-]+)\.fastq', os.path.basename(fastq_file))
        if not m:
            # fastq file name doesn't match expected pattern -- just process it
            fastq_to_analyze.add(fastq_file)
            continue
        fastq_file_base = m.group(1)
        for fastqc_output_file_tmpl in fastqc_output_file_tmpls:
            fastqc_output_file = os.path.join(
                output_dir, fastqc_output_file_tmpl.format(fastq_file_base))
            # Re-run when the output is missing or older than the input.
            # NOTE(review): getctime is the metadata-change time on Unix, not
            # the content modification time -- confirm getmtime is not wanted.
            if (not os.path.exists(fastqc_output_file)
                    or os.path.getctime(fastq_file) > os.path.getctime(fastqc_output_file)):
                fastq_to_analyze.add(fastq_file)

    num_threads = config.get("qc", {}).get("fastqc", {}).get("threads") or 1
    # Construct the command lines
    cl_list = []
    if fastq_to_analyze:
        safe_makedir(output_dir)
        # Module loading
        modules_to_load = get_all_modules_for_workflow("fastqc", config)
        for module in modules_to_load:
            cl_list.append("module load {}".format(module))
        # Execute fastqc; the set is sorted so the emitted command line is
        # the same from run to run
        cl_list.append('{fastqc_path} -t {num_threads} -o {output_dir} '
                       '{fastq_files}'.format(output_dir=output_dir,
                                              fastqc_path=fastqc_path,
                                              num_threads=num_threads,
                                              fastq_files=" ".join(sorted(fastq_to_analyze))))
    if not cl_list:
        LOG.info("FastQC analysis not needed or input files were invalid.")
    return cl_list
Esempio n. 9
0
def workflow_fastqc(input_files, output_dir, config):
    """The constructor of the FastQC command line.

    A fastq file is scheduled when one of its expected FastQC outputs is
    missing from ``output_dir`` or has an older change time than the fastq
    file itself. Files whose name does not match ``<name>.fastq...`` are
    scheduled unconditionally.

    :param list input_files: The list of fastq files to analyze (may be 2D for read pairs)
    :param str output_dir: The path to the desired output directory (handed to safe_makedir when there is work to do)
    :param dict config: The parsed system/pipeline configuration file

    :returns: A list of command lines to be executed in the order given (empty if nothing to run)
    :rtype: list
    :raises ValueError: If the FastQC path is not given or is not on PATH
    """
    # Get the path to the fastqc command
    fastqc_path = config.get("paths", {}).get("fastqc")
    if not fastqc_path:
        if find_on_path("fastqc", config):
            LOG.info("fastqc found on PATH")
            fastqc_path = "fastqc"
        else:
            raise ValueError('Path to FastQC could not be found and it is not '
                             'available on PATH; cannot proceed with FastQC '
                             'workflow.')

    fastq_files = flatten(
        input_files)  # FastQC cares not for your "read pairs"
    # Verify that we in fact need to run this on these files
    fastqc_output_file_tmpls = ("{}_fastqc.zip", "{}_fastqc.html")
    fastq_to_analyze = set()
    for fastq_file in fastq_files:
        # Get the basename without extensions (.fastq, .fastq.gz). An
        # unescaped "." would match any character, so it is escaped here.
        m = re.match(r'([\w-]+)\.fastq', os.path.basename(fastq_file))
        if not m:
            # fastq file name doesn't match expected pattern -- just process it
            fastq_to_analyze.add(fastq_file)
            continue
        fastq_file_base = m.group(1)
        for fastqc_output_file_tmpl in fastqc_output_file_tmpls:
            fastqc_output_file = os.path.join(
                output_dir, fastqc_output_file_tmpl.format(fastq_file_base))
            if not os.path.exists(fastqc_output_file):
                # Output file doesn't exist
                fastq_to_analyze.add(fastq_file)
            elif os.path.getctime(fastq_file) > os.path.getctime(
                    fastqc_output_file):
                # Input file changed more recently than output file.
                # NOTE(review): getctime is the metadata-change time on Unix;
                # confirm getmtime is not what was intended.
                fastq_to_analyze.add(fastq_file)

    num_threads = config.get("qc", {}).get("fastqc", {}).get("threads") or 1
    # Construct the command lines
    cl_list = []
    if fastq_to_analyze:
        safe_makedir(output_dir)
        # Module loading
        modules_to_load = get_all_modules_for_workflow("fastqc", config)
        for module in modules_to_load:
            cl_list.append("module load {}".format(module))
        # Execute fastqc. Set iteration order is arbitrary, so the files are
        # sorted to keep the generated command reproducible.
        cl_list.append('{fastqc_path} -t {num_threads} -o {output_dir} '
                       '{fastq_files}'.format(
                           output_dir=output_dir,
                           fastqc_path=fastqc_path,
                           num_threads=num_threads,
                           fastq_files=" ".join(sorted(fastq_to_analyze))))
    if not cl_list:
        LOG.info("FastQC analysis not needed or input files were invalid.")
    return cl_list
Esempio n. 10
0
def workflow_fastqc(input_files, output_dir, config):
    """Produce the ordered shell commands for running FastQC file by file.

    For each file returned by ``fastq_to_be_analysed`` a soft link is created,
    FastQC is run on the link, and the link is removed again.

    :param list input_files: The fastq files to analyze (may be nested, e.g. read pairs)
    :param str output_dir: The FastQC output directory (handed to safe_makedir when there is work to do)
    :param dict config: The parsed system/pipeline configuration file

    :returns: Command lines to execute in the order given (empty if nothing to run)
    :rtype: list
    :raises ValueError: If no FastQC path is configured and it is not on PATH
    """
    configured_path = config.get("paths", {}).get("fastqc")
    if configured_path:
        fastqc_path = configured_path
    elif find_on_path("fastqc", config):
        LOG.info("fastqc found on PATH")
        fastqc_path = "fastqc"
    else:
        raise ValueError('Path to FastQC could not be found and it is not '
                         'available on PATH; cannot proceed with FastQC '
                         'workflow.')

    # Pairing is meaningless to FastQC: flatten, then keep only the files
    # that fastq_to_be_analysed reports as needing a run
    all_fastqs = flatten(input_files)
    output_tmpls = ("{}_fastqc.zip", "{}_fastqc.html")
    todo = fastq_to_be_analysed(all_fastqs, output_dir, output_tmpls)
    num_threads = config.get("qc", {}).get("fastqc", {}).get("threads") or 1

    def commands_for(original, link):
        """Return the link / fastqc / unlink triple for one fastq file."""
        return [
            'ln -s {original_file} {renamed_fastq_file}'.format(
                original_file=original, renamed_fastq_file=link),
            '{fastqc_path} -t {num_threads} -o {output_dir} '
            '{fastq_files}'.format(fastqc_path=fastqc_path,
                                   num_threads=num_threads,
                                   output_dir=output_dir,
                                   fastq_files=link),
            'rm {renamed_fastq_file}'.format(renamed_fastq_file=link),
        ]

    cl_list = []
    for item in todo:
        # item[0] is the original fastq, item[1] the collision-free link name
        cl_list += commands_for(item[0], item[1])

    if cl_list:
        # FastQC expects the folder to exist, and so do the soft links
        safe_makedir(output_dir)
        load_cmds = []
        for module_name in get_all_modules_for_workflow("fastqc", config):
            load_cmds.append("module load {}".format(module_name))
        if load_cmds:
            cl_list = load_cmds + cl_list
    else:
        LOG.info("FastQC analysis not needed or input files were invalid.")
    return cl_list
Esempio n. 11
0
def workflow_fastq_screen(input_files, output_dir, config):
    """The constructor of the fastq_screen command lines.

    For each file returned by ``fastq_to_be_analysed`` a soft link is created,
    fastq_screen is run on the link, and the link is removed again.

    :param list input_files: The list of fastq files to screen (may be 2D for read pairs)
    :param str output_dir: The path to the desired output directory (handed to safe_makedir when there is work to do)
    :param dict config: The parsed system/pipeline configuration file

    :returns: A list of command lines to be executed in the order given (empty if nothing to run)
    :rtype: list
    :raises ValueError: If fastq_screen cannot be located, or the configured
                        fastq_screen config file cannot be opened
    """
    # Get the path to the fastq_screen command
    fastq_screen_path = config.get("paths", {}).get("fastq_screen")
    if not fastq_screen_path:
        if find_on_path("fastq_screen", config):
            LOG.info("fastq_screen found on PATH")
            fastq_screen_path = "fastq_screen"
        else:
            raise ValueError(
                'Path to fastq_screen could not be found and it is not '
                'available on PATH; cannot proceed with fastq_screen '
                'workflow.')
    fastq_screen_config_path = config.get("qc", {}).get("fastq_screen",
                                                        {}).get("config_path")
    # We probably should have the path to the fastq_screen config file written down somewhere
    if not fastq_screen_config_path:
        # Logger.warn is a deprecated alias; warning() is the supported name
        LOG.warning('Path to fastq_screen config file not specified; assuming '
                    'it is in the same directory as the fastq_screen binary, '
                    'even though I think this is probably a fairly bad '
                    'assumption to make. You\'re in charge, whatever.')
    else:
        # Fail early if the configured file cannot be opened for reading
        try:
            with open(fastq_screen_config_path, 'r'):
                pass
        except IOError as e:
            raise ValueError(
                'Error when accessing fastq_screen configuration '
                'file as specified in pipeline config: "{}" (path '
                'given was {})'.format(e, fastq_screen_config_path))

    num_threads = config.get("qc", {}).get("fastq_screen",
                                           {}).get("threads") or 1
    subsample_reads = config.get("qc", {}).get("fastq_screen",
                                               {}).get("subsample_reads")

    # Determine which files need processing
    fastq_files = flatten(
        input_files
    )  # Fastq_screen cares not for your "read pairs" anymore from version 1.5
    # Verify that we in fact need to run this on these files
    fastq_screen_output_file_tmpls = ["{}_screen.txt"]
    fastq_to_analyze = fastq_to_be_analysed(fastq_files, output_dir,
                                            fastq_screen_output_file_tmpls)
    # Construct the command lines
    cl_list = []
    # fastq_screen commands
    for fastq_file_pair in fastq_to_analyze:
        # Soft link the fastq file under a name that cannot collide (e.g. the
        # same sample run in two different flowcells on the same lane number),
        # run fastq_screen on the link and delete the link straight away.
        fastq_file_original = fastq_file_pair[0]
        fastq_file_softlinked = fastq_file_pair[1]
        # add the link command
        cl_list.append('ln -s {original_file} {renamed_fastq_file}'.format(
            original_file=fastq_file_original,
            renamed_fastq_file=fastq_file_softlinked))
        # now the fastq_screen command (one per file)
        cl = fastq_screen_path
        cl += " --aligner bowtie2"
        cl += " --outdir {}".format(output_dir)
        if subsample_reads:
            cl += " --subset {}".format(subsample_reads)
        if num_threads:
            cl += " --threads {}".format(num_threads)
        if fastq_screen_config_path:
            cl += " --conf {}".format(fastq_screen_config_path)
        cl += " {}".format(fastq_file_softlinked)
        cl_list.append(cl)
        # remove the link to the fastq file
        cl_list.append('rm {renamed_fastq_file}'.format(
            renamed_fastq_file=fastq_file_softlinked))
    if cl_list:
        safe_makedir(output_dir)
        # Module loading
        modules_to_load = get_all_modules_for_workflow("fastq_screen", config)
        mod_list = [
            "module load {}".format(module) for module in modules_to_load
        ]
        if mod_list:
            cl_list = mod_list + cl_list
    else:
        LOG.info(
            "fastq_screen analysis not needed or input files were invalid.")
    return cl_list