def fastq_join(forward_fname,
               reverse_fname,
               output_file,
               reorder_to=None,
               options=dict()):
    """Workflow function for joining (aka stitching) paired-end fastq
    files with ea-utils' ``fastq-join``. If the ``drop_unpaired``
    option is set to True, unpaired forward reads are concatenated to
    the joined fastq file.

    :param forward_fname: String; file name for the forward reads.

    :param reverse_fname: String; file name for the reverse reads.

    :param output_file: String; file name for the finished, joined reads.
   
    :param reorder_to: String; file name to reorder sequences against

    :param options: Dictionary; interpreted as command line options to
    be passed to the wrapped fastq-join program.  No - or -- flags
    are necessary; the correct - or -- flags are inferred based on
    the length of the option.  For boolean options, use the key/value
    pattern of { "my-option": "" }.

    External Dependencies
      - fastq-join: Part of ea-utils 1.1.2-806 'https://drive.google.com/folderview?id=0B7KhouP0YeRAOTFWWGVFYkFSQjg&usp=sharing'

    """

    drop_unpaired = options.pop('drop_unpaired', False)

    default_opts = {"o": output_file}
    default_opts.update(options)
    opts = dict_to_cmd_opts(default_opts)

    cmd = "fastq-join " + opts + " " + forward_fname + " " + reverse_fname

    if '%' in output_file:
        renamed_output = output_file.replace("%", "join")
    else:
        renamed_output = output_file + "join"

    actions = [cmd]
    unpaired_forward = renamed_output.replace("join", "un1")
    if not drop_unpaired and reorder_to:
        actions.append(
            "sequence_re-pair -f fastq -t fastq -b %s %s %s > %s" %
            (reorder_to, renamed_output, unpaired_forward, output_file))
    elif not drop_unpaired and not reorder_to:
        actions.append("cat {} {} > {}".format(unpaired_forward,
                                               renamed_output, output_file))
    else:
        actions.append("mv %s %s" % (renamed_output, output_file))

    return {
        "name": "fastq_join: " + output_file,
        "actions": actions,
        "file_dep": [forward_fname, reverse_fname],
        "targets": [output_file]
    }
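# The dict_to_cmd_opts helper used throughout these workflows is not shown in
# this listing. Below is a hypothetical minimal sketch, consistent with the
# docstring convention above (flag style inferred from key length, an empty
# string marking a boolean flag); the real helper's quoting and separator
# rules may differ.
def dict_to_cmd_opts_sketch(opts):
    parts = []
    for key, value in opts.items():
        dash = "-" if len(key) == 1 else "--"  # one char -> "-", longer -> "--"
        if value == "":                        # boolean flag: {"my-option": ""}
            parts.append(dash + key)
        else:
            parts.append("%s%s %s" % (dash, key, value))
    return " ".join(parts)
# e.g. dict_to_cmd_opts_sketch({"o": "out.fastq", "drop-unpaired": ""})
# -> "-o out.fastq --drop-unpaired" (ordering may vary)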
def featureCounts(input_sams, output_table, options=dict()):
    """Workflow to count reads mapped to genomic features with the
    subread package's ``featureCounts``. Empty or missing input sam
    files are skipped; if none remain, an empty output table is
    written."""
    opts = {
        "a": settings.workflows.subread.annotations,
    }
    opts.update(options)
    opts['o'] = output_table

    cmd = ("featureCounts" + " " + dict_to_cmd_opts(opts) + " ")

    def run():
        files = [
            f for f in input_sams
            if os.path.exists(f) and os.stat(f).st_size > 0
        ]
        if files:
            return CmdAction(cmd + " ".join(files), verbose=True).execute()
        else:
            open(output_table, 'w').close()

    return {
        "name": "featureCounts: " + output_table,
        "file_dep": input_sams,
        "targets": [output_table],
        "actions": [run]
    }
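# Hypothetical invocation (file names invented for illustration): zero-length
# or missing sam inputs are filtered out at run time, and an empty counts
# table is written when nothing survives upstream filtering.
task = featureCounts(["sampleA.sam", "sampleB.sam"], "counts.tsv",
                     options={"T": "4"})  # featureCounts -T: counting threads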
def pick_denovo_otus(fasta_in,
                     otutab_out,
                     keep_tempfiles=False,
                     strand="plus",
                     log_file=None,
                     resume=False,
                     quiet=False,
                     chimera_standard=None,
                     truncate_opts={},
                     derep_opts={},
                     sort_opts={},
                     cluster_opts={},
                     chimera_opts={},
                     map_opts={}):
    """Workflow to pick de novo OTUs with the wrapped
    ``usearch_denovo_otus`` script. Per-stage options (truncate,
    dereplicate, sort, cluster, chimera-check, map) are forwarded as
    quoted key=value strings."""

    opts = dict(input=fasta_in, output=otutab_out, print_cmd=True)
    if 'db' in chimera_opts:
        s = chimera_opts['db']
    elif chimera_standard:
        s = chimera_standard
    else:
        s = settings.workflows.usearch.chimera_gold_standard
    opts['chimera_standard'] = s
    opts['tmp_dir'] = otutab_out + "_usearch"
    kvopts = [("truncate_opts", truncate_opts), ("derep_opts", derep_opts),
              ("sort_opts", sort_opts), ("cluster_opts", cluster_opts),
              ("chimera_opts", chimera_opts), ("map_opts", map_opts)]
    for name, value in kvopts:
        if value:
            s = " ".join('='.join(pair) for pair in value.iteritems())
            opts[name] = "'" + s + "'"

    if not log_file:
        log_file = opts['tmp_dir'] + ".log"
    opts['log_file'] = log_file

    cmd = "usearch_denovo_otus " + dict_to_cmd_opts(opts)
    targets = [otutab_out, join(opts['tmp_dir'], "nonchimeric.fa"), log_file]

    def _run():
        ret = CmdAction(cmd).execute()
        if ret is None or not issubclass(type(ret), Exception):
            if not keep_tempfiles:
                for f in os.listdir(opts['tmp_dir']):
                    if f != "nonchimeric.fa" and \
                       os.path.isfile(join(opts['tmp_dir'], f)):
                        os.remove(join(opts['tmp_dir'], f))
        else:
            for t in targets:
                if not os.path.exists(t):
                    open(t, 'w').close()
        return ret

    return {
        "name": "usearch_pick_denovo_otus: " + otutab_out,
        "actions": [run],
        "file_dep": [fasta_in],
        "targets": targets,
        "title": usearch_rusage([fasta_in])
    }
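# Hypothetical invocation: each per-stage dictionary is flattened into a
# single quoted key=value string, so cluster_opts={"id": "0.97"} is rendered
# as --cluster_opts='id=0.97' on the usearch_denovo_otus command line.
task = pick_denovo_otus("seqs.fa", "otu_table.txt",
                        cluster_opts={"id": "0.97"})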
def align(maybe_paired_fastq, output_sam, options=dict()):
    """Workflow to map reads with ``subread-align``. Accepts either a
    single fastq file name or a two-item tuple/list of paired-end file
    names; empty input files produce an empty output sam."""
    opts = {
        "unique": "",
        "hamming": "",
        "index": settings.workflows.subread.index
    }
    opts.update(options)
    opts['output'] = output_sam
    if type(maybe_paired_fastq) in (tuple, list):
        opts['read'], opts['read2'] = maybe_paired_fastq
        deps = maybe_paired_fastq
    else:
        opts['read'] = maybe_paired_fastq
        deps = [maybe_paired_fastq]

    cmd = "subread-align " + dict_to_cmd_opts(opts)

    def run():
        if any(os.stat(f).st_size < 1 for f in deps):
            open(output_sam, 'w').close()
        else:
            return CmdAction(cmd, verbose=True).execute()

    return {
        "name": "subread_align: " + output_sam,
        "actions": [run],
        "file_dep": deps,
        "targets": [output_sam]
    }
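# Hypothetical invocation: a two-item tuple or list is split into the
# read/read2 options for paired-end alignment; a bare string is single-end.
task = align(("reads_R1.fastq", "reads_R2.fastq"), "aligned.sam")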
def sort(input_bam,
         output_prefix,
         memory_level="768M",
         num_threads=1,
         **kwargs):
    """Sort a bam file by sequence name with samtools.
    
    :param input_bam: String; file name of input bam file.

    :param output_prefix: String; file name of output sorted bam file
    without the .bam suffix

    :keyword memory_level: String; K/M/G human readable amount of ram
    to give each sorting thread

    :keyword num_threads: Int or string; number of threads to use when
    sorting.

    """

    output_file = output_prefix + ".bam"

    opts = {'n': "", '@': str(num_threads), 'm': memory_level}

    opts.update(kwargs)
    cmd = ("samtools sort " + " " + dict_to_cmd_opts(opts) + " " + input_bam +
           " " + output_prefix)

    return {
        "name": "samtools.sort: " + output_file,
        "file_dep": [input_bam],
        "actions": [cmd],
        "targets": [output_file]
    }
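# Hypothetical invocation: note the pre-1.0 samtools calling convention
# (input file, then output prefix); the task target is output_prefix + ".bam".
task = sort("aligned.bam", "aligned.nsorted", memory_level="1G", num_threads=4)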
def sequence_pair(seqfname1,
                  seqfname2,
                  outfname1="/dev/null",
                  outfname2="/dev/null",
                  from_format=None,
                  format_to="fastq",
                  options=dict()):
    """Workflow to pair mates from two sequence files with the
    ``sequence_pair`` script, converting between sequence formats as
    needed."""

    extra_options = dict_to_cmd_opts(options)
    targets = [
        target for target in [outfname1, outfname2] if target != "/dev/null"
    ]

    if not from_format:
        # guess from an input file: `targets` may be empty when both
        # outputs are /dev/null, and output files may not exist yet
        from_format = guess_seq_filetype(seqfname1)

    pair_cmd = ("sequence_pair"
                " -f {from_format} -t {format_to}"
                " -1 {r1out} -2 {r2out} ").format(from_format=from_format,
                                                  format_to=format_to,
                                                  r1out=outfname1,
                                                  r2out=outfname2)
    pair_cmd += extra_options + " {} {}".format(seqfname1, seqfname2)

    return {
        "name": "sequence_pair: %s %s" % (outfname1, outfname2),
        "actions": [pair_cmd],
        "file_dep": [seqfname1, seqfname2],
        "targets": targets
    }
# Fragment: the inner action of a PCoA plotting workflow; it closes over
# pcoa_cmd, default_opts, and pcl_fname, which are defined in the enclosing
# (unshown) workflow function.
def run(pcoa_cmd=pcoa_cmd):
    if default_opts['meta'] is True or not default_opts['meta']:
        default_opts['meta'] = last_meta_name(pcl_fname)
    if default_opts['id'] is True or not default_opts['id']:
        default_opts['id'] = sample_id(pcl_fname)
    pcoa_cmd += dict_to_cmd_opts(default_opts)
    pcoa_cmd += " " + pcl_fname + " "
    return CmdAction(pcoa_cmd, verbose=True).execute()
def demultiplex(map_fname, fasta_fname, qual_fname, output_fname,
                qiime_opts={}):
    """Workflow to demultiplex a barcoded set of 16S sequences from a
    single run. This workflow wraps the qiime split_libraries.py
    script. For information on what the split_libraries.py script
    does, check out the qiime documentation:
    - http://qiime.org/tutorials/tutorial.html#assign-samples-to-multiplex-reads
    - http://qiime.org/scripts/split_libraries.html

    :param map_fname: String; File path location of the map.txt metadata file
    :param fasta_fname: String; File path to the input, multiplex, fasta files
    :param qual_fname: String; File path to the qual file corresponding 
                       to ``fasta_fname``.
    :param output_fname: String; File path to where the demultiplexed reads 
                         will be saved in fasta format.
    :keyword qiime_opts: Dictionary; A dictionary of command line options to
                         be passed to the wrapped split_libraries.py script. 
                         No - or -- flags are necessary; the correct - or --
                         flags are inferred based on the length of the option.
                         For boolean options, use the key/value pattern 
                         of { "my-option": "" }.

    External dependencies:
      - Qiime 1.8.0: https://github.com/qiime/qiime

    """
    
    
    output_dir, output_basename = os.path.split(output_fname)
    opts = dict_to_cmd_opts(qiime_opts)
    
    cmd = ("split_libraries.py"+
           " --map="+map_fname+
           " --fasta="+fasta_fname+
           " --qual="+qual_fname+
           " --dir-prefix="+output_dir+
           " "+opts)

    actions = [cmd]
    if output_basename != "seqs.fna":
        default_out = os.path.join(output_dir, "seqs.fna")
        actions.append("mv '%s' '%s'"%(default_out, output_fname))
    
    return {
        "name": "demultiplex:"+fasta_fname,
        "actions": actions,
        "file_dep": [map_fname, fasta_fname, qual_fname],
        "targets": [output_fname],
        "title": lambda t: t.name+" Estimated time=%.2f"%(
            sum(os.stat(f).st_size for f in t.file_dep)/1024./1024/5)
    }
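# Hypothetical invocation: the task's title estimates run time as total input
# size in MB divided by 5 (units unspecified in the code).
task = demultiplex("map.txt", "seqs.fasta", "seqs.qual", "demux/seqs.fna")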
def assign_taxonomy(in_fasta, out_dir, qiime_opts={}):

    name = rmext(os.path.basename(in_fasta))+"_tax_assignments.txt"
    taxonomy_out = os.path.join(out_dir, name)
    
    default_opts = dict([
        ("r", settings.workflows.sixteen.otu_refseq),
        ("t", settings.workflows.sixteen.otu_taxonomy),
    ]+list(qiime_opts.items()))

    cmd = ("assign_taxonomy.py -i "+in_fasta+" -o "+out_dir+
           " "+dict_to_cmd_opts(default_opts))

    return { "name"     : "assign_taxonomy: "+taxonomy_out,
             "targets"  : [taxonomy_out],
             "actions"  : [cmd],
             "file_dep" : [default_opts['r'], default_opts['t'], in_fasta] }
def bowtie2_align(infiles_list, output_file, **opts):
    """Workflow to use bowtie2 to map a list of input sequence files
    against a bowtie2 database. Additional keyword options are used
    directly as bowtie2 command-line flags.

    :param infiles_list: List of strings; File paths to input search
                         queries as sequences in fastq format
    :param output_file: String; File path to the search results, in 
                        sam format.
    :keyword reference_db: String; File path to the bowtie2 reference 
                           db basename. Fed immediately into bowtie2's
                           -x option.
    :keyword threads: String or int; Number of threads to use when 
                      performing the mapping. Uses bowtie2's -p option.


    External dependencies:
      - Bowtie2 2.2.1: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml

    Resource utilization:
      - Ram: 2.0-3.0G
      - CPU: 1 core; > 1 core depending on 'threads' option
    """

    all_opts = { # defaults in here
        "reference_db": settings.workflows.alignment.kegg_bowtie2_db,
        "threads": 2,
    }
    all_opts.update(opts)

    cmd = ("bowtie2 " + " -x " + all_opts.pop('reference_db') + " -p " +
           str(all_opts.pop('threads')) + " -U " + ",".join(infiles_list) +
           " --no-head" + " --very-sensitive" + " " +
           dict_to_cmd_opts(all_opts) + " > " + output_file)

    return {
        "name": "bowtie2_align:" + output_file,
        "actions": [cmd],
        "file_dep": infiles_list,
        "targets": [output_file]
    }
def stacked_bar_chart(biom_fname, output_dir, qiime_opts=dict()):
    """Workflow to produce stacked bar charts of biom-formatted taxonomic
    profiles using QIIME's `summarize_taxa_through_plots.py`.

    :param biom_fname: String; the file name of a single biom-formatted otu 
                       table or taxonomic profile to be visualized.
    :param output_dir: String; the full path to a directory wherein the 
                       summary plots and charts will be placed
    :keyword qiime_opts: Dictionary; A dictionary of command line options to be
                         passed to the wrapped 
                         summarize_taxa_through_plots.py script. No - or -- 
                         flags are necessary; the correct - or -- flags are 
                         inferred based on the length of the option. For 
                         boolean options, use the key/value pattern of 
                         { "my-option": "" }.

    External dependencies
      - Qiime 1.8.0: https://github.com/qiime/qiime-deploy

    """

    cmd = ("summarize_taxa_through_plots.py "
           "-i {} -o {} ".format(biom_fname, output_dir))

    default_opts = {"force": True}
    default_opts.update(qiime_opts)

    opts = dict_to_cmd_opts(default_opts)
    cmd += opts

    target = os.path.join(output_dir, addtag(os.path.basename(biom_fname),
                                             "L2"))

    yield {
        "name": "stacked_bar_chart: " + output_dir,
        "actions": [cmd],
        "file_dep": [biom_fname],
        "targets": [target]
    }
def to_bam(input_sam, output_bam, threads=1, **kwargs):
    """Workflow to convert sam to bam with ``samtools view``; extra
    keywords are passed through as samtools options."""

    kwargs['@'] = kwargs.get("@", threads)
    opts = dict([("b", ""), ("o", output_bam)] + list(kwargs.iteritems()))

    cmd = ("samtools view " + " " + dict_to_cmd_opts(opts) + " " + input_sam)

    def _perfhint(task):
        threads = int(opts.get("@", 1))
        mem = 400  # MB
        size_mb = os.stat(first(task.file_dep)).st_size / 1024. / 1024.
        rate = 1800.  # MB/clock min
        time = 20 + (size_mb / rate)
        return ("{n} Estimated mem={mem:.0f} "
                "time={time:.0f} threads={threads:.0f}").format(
                    n=task.name, mem=mem, time=time, threads=threads)

    return {
        "name": "samtools.to_bam: " + output_bam,
        "file_dep": [input_sam],
        "targets": [output_bam],
        "actions": [cmd],
        "title": _perfhint
    }
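# Hypothetical invocation: threads is forwarded as samtools' -@ option. With
# _perfhint's constants, a 1.8 GB input sam (~1843 MB) estimates
# time = 20 + 1843/1800, roughly 21 clock minutes.
task = to_bam("sample.sam", "sample.bam", threads=4)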
def pick_otus_closed_ref(in_fasta,
                         out_biom,
                         out_tsv=None,
                         non_chimeric_otu_seqs=None,
                         denovo_otu_txt=None,
                         sample_metadata_fname=None,
                         taxonomy_fname=None,
                         ref_fasta=None,
                         keep_tempfiles=False,
                         strand='plus',
                         chimera_standard=None,
                         log_file=None,
                         resume=False,
                         quiet=False,
                         tmp_folder=None,
                         usearch_closed_opts={},
                         denovo_opts={}):
    """Workflow to pick closed-reference OTUs with the wrapped
    ``uclust_closed_otus`` script, then convert the resulting tsv OTU
    table to biom format with taxonomy metadata."""

    if not tmp_folder:
        tmp_folder = in_fasta + "_usearch"
    if not log_file:
        log_file = in_fasta + "_usearch.log"
    if not taxonomy_fname:
        taxonomy_fname = settings.workflows.sixteen.otu_taxonomy
    if not ref_fasta:
        ref_fasta = settings.workflows.usearch.otu_db
    if not out_tsv:
        out_tsv = out_biom + ".tsv"

    if 'db' in denovo_opts.get('chimera_opts', {}):
        chimera = denovo_opts['chimera_opts']['db']
    elif 'chimera_standard' in denovo_opts:
        chimera = denovo_opts['chimera_standard']
    elif chimera_standard:
        chimera = chimera_standard
    else:
        chimera = settings.workflows.usearch.chimera_gold_standard

    opts = dict(input=in_fasta,
                output=out_tsv,
                taxonomy=taxonomy_fname,
                reference=ref_fasta,
                strand=strand,
                chimera_standard=chimera,
                quiet=quiet,
                print_cmd=True,
                log_file=log_file,
                denovo_otu_table=denovo_otu_txt,
                resume=resume,
                keep_tempfiles=True,
                tmp_dir=tmp_folder,
                otu_sequences=non_chimeric_otu_seqs)

    kvopts = list(denovo_opts.items()) + [("closed_opts", usearch_closed_opts)]
    for name, value in kvopts:
        if value:
            s = " ".join('='.join(pair) for pair in value.iteritems())
            opts[name] = "'" + s + "'"

    usearch_cmd = "uclust_closed_otus " + dict_to_cmd_opts(opts)

    biom_cmd = ("biom convert -i " + out_tsv + " -o " + out_biom +
                " --table-type='OTU Table' --process-obs-metadata=taxonomy" +
                " --output-metadata-id=taxonomy")

    if sample_metadata_fname:
        biom_cmd += " --sample-metadata-fp=" + sample_metadata_fname

    def _run():
        ret = CmdAction(usearch_cmd).execute()
        if ret is None or not issubclass(type(ret), Exception):
            ret = CmdAction(biom_cmd).execute()
            if not keep_tempfiles:
                for f in os.listdir(opts['tmp_dir']):
                    if f != "nonchimeric.fa" and \
                       os.path.isfile(join(opts['tmp_dir'], f)):
                        os.remove(join(opts['tmp_dir'], f))
        else:
            for t in targets:
                if not os.path.exists(t):
                    open(t, 'w').close()
        return ret

    file_dep = [in_fasta, taxonomy_fname, ref_fasta, chimera]
    targets = [
        out_biom, out_tsv,
        join(opts['tmp_dir'], "nonchimeric.fa"), log_file
    ]
    yield {
        "name": "usearch_pick_otus_closed_ref: " + out_biom,
        "targets": targets,
        "actions": [_run],
        "file_dep": file_dep,
        "title": usearch_rusage(file_dep)
    }
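# Because this workflow yields its task dict rather than returning it, a
# hypothetical caller unpacks the one-item generator:
task, = pick_otus_closed_ref("seqs.fa", "otu_table.biom")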
def humann2(seqfile_in, output_dir, scratch=None, **opts):
    """Workflow to find pathway and gene lists grouped by organism from
    raw whole genome shotgun reads.

    Additional keywords are interpreted as command line options to be
    passed to the wrapped humann2 executable.  No - or -- flags are
    necessary; the correct - or -- flags are inferred based on the
    length of the option.  For boolean options, use the key/value
    pattern of { "my-option": "" }.

    :param seqfile_in: String; Paths to file to be fed into HUMAnN2.

    :param output_dir: String; Directory path to where a HUMAnN2
      deposits its results.

    External dependencies:

      - `HUMAnN2 v0.1.9 <https://bitbucket.org/biobakery/humann2>`_


    Resource utilization: 

      - Ram: 4-6G
      - Time: 1 hr

    """

    default_opts = {
        "input": seqfile_in,
        "output": os.path.abspath(output_dir),
        "o-log": os.path.join(output_dir, "humann2_log.txt"),
        "memory-use": "minimum",
        "log-level": "INFO",
        "remove-temp-output": True,
        "output-format": "tsv"
    }
    default_opts.update(opts)

    suffix = default_opts['output-format']

    def _join(s):
        base, _ = os.path.splitext(seqfile_in)
        base = base + '.' + suffix
        return new_file(addtag(base, s), basedir=default_opts['output'])

    targets = map(_join, ("genefamilies", "pathcoverage", "pathabundance"))

    if scratch:
        old_out = default_opts['output']
        default_opts.pop('output', None)
        dbs = _get_humann2_dbs(default_opts.get("chocophlan", None),
                               default_opts.get("uniref", None))
        default_opts.pop('chocophlan', None), default_opts.pop('uniref', None)
        cmd = "humann2 " + dict_to_cmd_opts(default_opts, longsep=" ")
        actions = [
            """ tdir=$(mktemp -d -p {sdir});
                cd ${{tdir}}; 
                mkdir -pv ${{tdir}}/dbs;
                cp -rv {dbs} ${{tdir}}/dbs/;
                {humann2} --output ${{tdir}} \
                          --chocophlan ${{tdir}}/dbs/chocophlan \
                          --uniref ${{tdir}}/dbs/uniref;
                mv -iv ${{tdir}}/*.* {final_out};
                rm -rvf ${{tdir}};
            """.format(sdir=scratch,
                       dbs=" ".join(dbs),
                       humann2=cmd,
                       final_out=old_out)
        ]
    else:
        actions = ["humann2 " + dict_to_cmd_opts(default_opts, longsep=" ")]

    def _perfhint(task):
        threads = int(default_opts.get('threads', 1))
        insize = os.stat(seqfile_in).st_size
        # estimated number of millions of reads for a fastq input file
        est_reads = insize / 4 / 10e5
        return "{n} Estimated mem={mem:.0f} time={time:.0f}, threads={threads:.0f}".format(
            n=task.name,
            mem=(750 + (3.5 * log(est_reads))),
            time=(3.5 + ((2 * est_reads) / threads)) * 60,
            threads=threads)

    return {
        "name": "humann2:" + output_dir,
        "file_dep": [seqfile_in],
        "targets": targets,
        "actions": actions,
        "title": _perfhint
    }
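# Hypothetical invocation: extra keywords become humann2 flags rendered with
# a space separator (longsep=" "), e.g. threads=8 becomes "--threads 8".
task = humann2("sample.fastq", "humann2_out", threads=8)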
def knead_data(infiles, output_basestr, scratch=None, **opts):
    """Workflow to sanitize host data and otherwise quality filter
    metagenomic reads. Input sequences are mapped against a host
    database using bowtie2; any sequences that map back to the host
    database are discarded.

    Additional keywords are interpreted as command line options to be
    passed to the wrapped kneaddata executable.  No - or -- flags are
    necessary; the correct - or -- flags are inferred based on the
    length of the option.  For boolean options, use the key/value
    pattern of { "my-option": "" }.

    :param infiles: Iterable of strings; File path to the input
      sequences. Should be either a one-length or two-length
      iterable. Two-length iterables are treated as paired-end data.

    :param output_basestr: String; Path to the directory and base
      filename where the output cleaned sequences will be saved.

    In default_opts, the key "reference-db" refers to the location of
    the database that contains the settings for the workflows.  The
    location can be passed either on the command line or by editing
    "input/_options/decontaminate.txt" in the skeleton method.  Refer to
    http://huttenhower.sph.harvard.edu/docs/anadama/your_own_pipeline.html
    for more information.

    External dependencies:
      - `kneaddata <https://bitbucket.org/biobakery/kneaddata>`_
      - `bowtie2 <http://bowtie-bio.sourceforge.net/index.shtml>`_

    Resource utilization:
      - RAM: 4 G

    """

    path, base = os.path.split(output_basestr)
    default_opts = {
        "output-prefix": base,
        "output": output_basestr + "_knead",
        "reference-db": settings.workflows.knead.reference_db,
    }
    default_opts.update(opts)

    db_bases = map(os.path.basename, default_opts['reference-db'])

    def _targets(nums=[None]):
        outdir = default_opts['output']
        prefix = default_opts['output-prefix']
        yield os.path.join(outdir, prefix + ".fastq")
        for num in nums:
            for db_base in db_bases:
                to_join = [prefix, db_base, num, "contam.fastq"]
                n = "_".join(filter(bool, to_join))
                yield os.path.join(outdir, n)

    def _perfhint(task):
        threads = int(default_opts.get('threads', 1))
        insize = sum(os.stat(f).st_size for f in infiles)
        dbsize = sum(
            os.stat(f).st_size for pat in default_opts['reference-db']
            for f in glob(pat + "*"))
        return "{n} Estimated mem={mem:.0f} time={time:.0f}, threads={threads}".format(
            n=task.name,
            mem=dbsize / 1024 / 1024 + (1500),
            time=60 + (insize / 9e8 / (threads)),
            threads=threads)

    if type(infiles) in (unicode, str):
        infiles_list = [infiles]
    else:
        infiles_list = list(infiles)

    if len(infiles_list) > 1:
        one, two = infiles_list
        default_opts['input'] = one
        default_opts['input2'] = two
        targets = list(_targets(nums=[1, 2]))
    else:
        default_opts['input'] = infiles_list[0]
        targets = list(_targets())

    if scratch:
        db_patterns = " ".join(s + "*" for s in default_opts['reference-db'])
        db_printf_cmds = " ".join([
            '--reference-db "${{tdir}}/dbs/{}"'.format(db) for db in db_bases
        ])
        refs = default_opts.pop("reference-db", None)
        knead = "kneaddata " + dict_to_cmd_opts(default_opts)
        if refs:
            default_opts['reference-db'] = refs
        cmd = """ tdir=$(mktemp -d -p {sdir});
                  cd ${{tdir}};
                  mkdir -pv ${{tdir}}/dbs;
                  cp {db_patterns} ${{tdir}}/dbs/;
                  {knead} {db_printf_cmds};
                  rm -rvf ${{tdir}};
        """.format(sdir=scratch,
                   db_patterns=db_patterns,
                   knead=knead,
                   db_printf_cmds=db_printf_cmds)
    else:
        cmd = "kneaddata " + dict_to_cmd_opts(default_opts)

    return {
        "name": "kneaddata:" + output_basestr,
        "targets": targets,
        "file_dep": infiles_list,
        "actions": [cmd],
        "title": _perfhint,
    }
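# Hypothetical paired-end invocation: a two-item list maps onto kneaddata's
# input/input2 options, and the per-database contaminant targets gain
# _1/_2 tags in their file names.
task = knead_data(["sample_R1.fastq", "sample_R2.fastq"], "sample_clean")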
def metaphlan2(files_list, scratch=None, **opts):
    """Workflow to perform taxonomic profiling from whole metagenome
    shotgun sequences. Additional keyword options are used directly as
    metaphlan2.py command-line flags.

    :param files_list: List of strings; File paths to input sequences,
                       in fastq format.
    
    External dependencies
      - Metaphlan2 @tip: https://bitbucket.org/biobakery/metaphlan2

    Resource utilization:
      - Ram: 1.5-3.0G

    """
    def_base = opts.get("output_file") or files_list[0]
    all_opts = {
        'bt2_ps': 'very-sensitive',
        'bowtie2db': settings.workflows.metaphlan2.bowtie2db,
        'mpa_pkl': settings.workflows.metaphlan2.mpa_pkl,
        "bowtie2out": new_file(addext(def_base, "bowtie2out.txt")),
        "output_file": new_file(addext(def_base, "metaphlan2"))
    }
    all_opts.update(opts)

    if 'input_type' not in all_opts:
        guessed = guess_seq_filetype(files_list[0])
        if guessed not in ('fasta', 'fastq'):
            raise ValueError("Need sequences in fasta or fastq format, "
                             "or provide keyword 'input_type'")
        all_opts['input_type'] = biopython_to_metaphlan[guessed]

    targets = [all_opts['output_file'], all_opts['bowtie2out']]
    if 'biom' in opts:
        targets.append(opts['biom'])

    cmd = starters.cat(files_list, guess_from=files_list[0])
    if scratch:
        db, pkl = all_opts['bowtie2db'], all_opts['mpa_pkl']
        all_opts.pop('bowtie2db', None), all_opts.pop('mpa_pkl', None)
        dbbase, pklbase = map(os.path.basename, (db, pkl))
        cmd += (" | metaphlan2.py" + " " + dict_to_cmd_opts(all_opts))
        actions = [
            """ tdir=$(mktemp -d -p {sdir});
                cd ${{tdir}};
                mkdir -pv ${{tdir}}/dbs;
                cp {db}* {pkl} ${{tdir}}/dbs;
                {cmd} --mpa_pkl ${{tdir}}/dbs/{pklbase} \
                      --bowtie2db ${{tdir}}/dbs/{dbbase};
                rm -rvf ${{tdir}};
            """.format(sdir=scratch,
                       pkl=pkl,
                       db=db,
                       cmd=cmd,
                       pklbase=pklbase,
                       dbbase=dbbase)
        ]
    else:
        cmd += (" | metaphlan2.py" + " " + dict_to_cmd_opts(all_opts))
        actions = [cmd]

    def _perfhint(task):
        threads = int(all_opts.get('nproc', 1))
        insize = sum(os.stat(f).st_size for f in files_list)
        return "{n} Estimated mem={mem:.0f} time={time:.0f}, threads={threads:.0f}".format(
            n=task.name,
            mem=1.5 * 1024,
            time=15 + (insize / 1.2e9 / (threads)),
            threads=threads)

    return dict(
        name="metaphlan2:" + all_opts['output_file'],
        actions=actions,
        file_dep=files_list,
        targets=targets,
        title=_perfhint,
    )
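# Hypothetical invocation: nproc is forwarded to metaphlan2.py, and the input
# files are concatenated by starters.cat and streamed in on stdin.
task = metaphlan2(["sampleA.fastq", "sampleB.fastq"], nproc=4)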
def pick_otus_open_ref(input_fname, output_dir, verbose=None, qiime_opts={}):
    """Workflow to perform open-reference OTU picking. Similar to
    closed-reference OTU picking, this workflow generates a
    biom-formatted OTU table from demultiplexed 16S reads. This
    workflow (in general terms) wraps qiime's
    pick_open_reference_otus.py, which itself wraps either uclust or
    usearch. Note that uclust and usearch require a fairly large
    memory footprint (1.5-2.0G in some cases).

    :param input_fname: String; File path to the input,
                        fasta-formatted 16S sequences
    :param output_dir: String; Path to the directory where the output OTU 
                       table will be saved as 'otu_table.biom'. Other 
                       qiime-specific logs will go there, too.
    :keyword verbose: Boolean: set to true to print the commands that are 
                      run as they are run
    :keyword qiime_opts: Dictionary; A dictionary of command line options to
                         be passed to the wrapped pick_open_reference_otus.py
                         script. No - or -- flags are necessary; the correct
                         - or -- flags are inferred based on the length of
                         the option. For boolean options, use the key/value
                         pattern of { "my-option": "" }.

    External dependencies:
      - Qiime 1.8.0: https://github.com/qiime/qiime-deploy
      - USEARCH: (only if using the usearch option) 
        http://www.drive5.com/usearch/

    Resource utilization:
      - RAM: >1.5 G

    """

    output_fname = new_file("otu_table.biom", basedir=output_dir)
    revcomp_fname = new_file(
        "revcomp.fna", basedir=os.path.dirname(input_fname))

    verbose = settings.workflows.verbose if verbose is None else verbose

    default_opts = {
        "reference_fp": settings.workflows.sixteen.otu_refseq
    }
    default_opts.update(qiime_opts)
    opts = dict_to_cmd_opts(default_opts)

    cmd = (" pick_open_reference_otus.py"+
           " --input_fp={}"+
           " --output_dir="+output_dir+
           " -f"+
           " "+opts)

    revcomp_cmd = ("sequence_convert"+
                   " --format=fasta"+
                   " --to=fasta "+
                   " -r"+
                   " "+input_fname+
                   " > "+revcomp_fname)

    def run(targets):
        strategies.backup(
            (CmdAction(cmd.format(input_fname), verbose=verbose),
             strategies.Group(
                 CmdAction(revcomp_cmd),
                 CmdAction(cmd.format(revcomp_fname), verbose=verbose))),
            extra_conditions=[
                lambda ret, output_fname: os.stat(output_fname).st_size == 0
            ],
            output_fname=output_fname
        )

    return {
        "name": "pick_otus_open_ref:"+input_fname,
        "actions": [run],
        "targets": [output_fname],
        "file_dep": [input_fname],
    }
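# strategies.backup (from anadama, not shown here) runs the first action and
# falls back to the grouped reverse-complement retry when the first action
# fails or an extra condition reports an empty otu_table.biom. A hypothetical
# minimal version of that control flow:
def backup_sketch(primary, fallback, failed):
    ret = primary()            # try the preferred action first
    if failed(ret):
        ret = fallback()       # only then run the backup action
    return ret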
def demultiplex_illumina(fastq_fnames, barcode_fnames, map_fname, output_fname,
                         verbose=True, qiime_opts={}):
    """Workflow to demultiplex Illumina fastq reads with qiime's
    split_libraries_fastq.py, retrying with reverse-complemented
    barcodes if the first attempt produces no sequences."""

    output_dir, output_basename = os.path.split(output_fname)
    default_opts = {
        "i": ",".join(fastq_fnames),
        "b": ",".join(barcode_fnames),
        "m": map_fname,
        "o": output_dir
    }
    default_opts.update(qiime_opts)
    opts = dict_to_cmd_opts(default_opts)
    
    cmd = "split_libraries_fastq.py "

    revcomp_map_fname = new_file(addtag(map_fname, "revcomp"),
                                 basedir=output_dir)
    revcomp_opts = default_opts.copy()
    revcomp_opts['m'] = revcomp_map_fname
    revcomp_opts = dict_to_cmd_opts(revcomp_opts)
    def _revcomp():
        from anadama.util import deserialize_map_file, serialize_map_file
        from Bio.Seq import Seq

        def _reverse(sample):
            seq = Seq(sample.BarcodeSequence).reverse_complement()
            return sample._replace(BarcodeSequence=str(seq))

        with open(map_fname) as from_map:
            from_samples = deserialize_map_file(from_map)
            serialize_map_file(
                ( _reverse(s) for s in from_samples ),
                revcomp_map_fname
            )


    default_out = os.path.join(output_dir, "seqs.fna")
    output_missing = lambda *args, **kwargs: (
        not os.path.exists(default_out)
        or os.stat(default_out).st_size <= 1
    )

    def run():
        return strategies.backup(
            (CmdAction(cmd+opts, verbose=verbose),
             strategies.Group(
                 PythonAction(_revcomp),
                 CmdAction(cmd+revcomp_opts,verbose=verbose))),
            extra_conditions=[output_missing]
        )


    actions = [run]
    if output_basename != "seqs.fna":
        actions.append("mv '%s' '%s'"%(default_out, output_fname))

    return {
        "name": "demultiplex_illumina:"+output_fname,
        "actions": actions,
        "file_dep": list(fastq_fnames) + list(barcode_fnames) + [map_fname],
        "targets": [output_fname],
        "title": lambda t: t.name+" Estimated time=%.2f"%(
            sum(os.stat(f).st_size for f in t.file_dep)/1024./1024/5)
    }