Beispiel #1
0
    def _configure(self):
        """Yield the tasks that convert, align and count the raw sequences.

        Tasks are produced in stages: conversion of the raw inputs to fastq,
        alignment of the paired fastq files, alignment of the unpaired fastq
        files, and feature counting for every alignment. While running, the
        ``paired_fastq_files``, ``unpaired_fastq_files``, ``align_sams`` and
        ``count_tables`` lists on the instance are appended to.
        """
        # Pair inference is opt-in; a missing 'infer_pairs' section is treated
        # like 'infer' being switched off instead of raising KeyError.
        if self.options.get('infer_pairs', dict()).get('infer'):
            paired, notpaired = infer_pairs(self.raw_seq_files)
            self.raw_seq_files = paired + notpaired

        maybe_tasks = list()
        for maybe_pair in self.raw_seq_files:
            # isinstance (rather than an exact type match) also accepts
            # subclasses of tuple and list.
            if isinstance(maybe_pair, (tuple, list)):
                pair, tasks = maybe_convert_to_fastq(maybe_pair,
                                                     self.products_dir)
                self.paired_fastq_files.append(pair)
                maybe_tasks.extend(tasks)
            elif util.guess_seq_filetype(maybe_pair) == 'bam':
                prefix = util.new_file(util.rmext(basename(maybe_pair)),
                                       basedir=self.products_dir)
                t = samtools.to_paired_fastq(maybe_pair, prefix)
                # The first two targets are recorded as the pair, the third
                # as the unpaired file.
                paired, single = t['targets'][:2], t['targets'][2]
                self.paired_fastq_files.append(paired)
                self.unpaired_fastq_files.append(single)
                maybe_tasks.append(t)
            else:
                single, tasks = maybe_convert_to_fastq([maybe_pair],
                                                       self.products_dir)
                self.unpaired_fastq_files.append(single[0])
                maybe_tasks.extend(tasks)

        # Conversion tasks are emitted before any alignment task.
        for task in maybe_tasks:
            yield task

        for pair in self.paired_fastq_files:
            # One alignment per pair, named after the first file of the pair.
            align_sam = util.new_file(_to_merged(basename(pair[0]),
                                                 tag="align"),
                                      basedir=self.products_dir)
            align_sam += ".sam"
            self.align_sams.append(align_sam)
            yield subread.align(pair, align_sam,
                                self.options.get('subread_align', dict()))

        for single in self.unpaired_fastq_files:
            align_sam = util.new_file(util.addtag(basename(single), "align"),
                                      basedir=self.products_dir)
            align_sam += ".sam"
            self.align_sams.append(align_sam)
            yield subread.align(single, align_sam,
                                self.options.get('subread_align', dict()))

        for align_sam in self.align_sams:
            count_table = util.new_file(util.addtag(basename(align_sam),
                                                    "count"),
                                        basedir=self.products_dir)
            self.count_tables.append(count_table)
            yield subread.featureCounts([align_sam], count_table,
                                        self.options.get(
                                            'featureCounts', dict()))
Beispiel #2
0
 def _actually_du():
     """Download one archive, re-upload its members, and clean up.

     Reads its inputs (``url``, ``local_dir``, ``local_cached``,
     ``remote_size``, ``namespace`` and the dcc/ncbi credentials) from the
     enclosing scope. After the upload loop a ``<archive>.<namespace>.complete``
     marker file listing the name and size of every uploaded file is written,
     then the extracted files are removed on a best-effort basis.

     :raises Exception: when the download reports failure.
     """
     srv, remote_path = parse_fasp_url(url)
     bn = basename(remote_path)
     local_file = join(local_dir, bn)
     # The download is skipped when a cached copy of the expected size exists.
     skip = (bn in local_cached
             and os.stat(local_file).st_size == remote_size)
     if not skip:
         ret = asp.download_file(srv, dcc_user, dcc_pw, remote_path,
                                 local_dir)
         if not ret:
             raise Exception("Download failed: " + url)
     to_rm, files_to_upload = untar(local_file)
     # Tag every extracted file with the namespace before uploading it.
     for i, f in enumerate(files_to_upload):
         new_f = addtag(f, namespace)
         os.rename(f, new_f)
         files_to_upload[i] = new_f
     names_sizes = [(basename(f), os.stat(f).st_size)
                    for f in files_to_upload]
     for f in files_to_upload:
         # NOTE(review): the upload result is not checked, unlike the
         # download above -- confirm whether a failed upload should abort
         # before the ".complete" marker is written.
         asp.upload_file(ncbi_srv,
                         ncbi_user,
                         None,
                         f,
                         ncbi_path,
                         keyfile=ncbi_keyfile)
     with open(local_file + "." + namespace + ".complete", 'w') as marker:
         for name_size in names_sizes:
             marker.write("\t".join(map(str, name_size)) + "\n")
     # Walk the removal list in reverse order; failures are reported on
     # stderr and do not abort the cleanup.
     for f in reversed(to_rm):
         try:
             if os.path.isdir(f):
                 os.rmdir(f)
             else:
                 os.remove(f)
         except OSError:
             sys.stderr.write("Unable to remove " + f + "\n")
Beispiel #3
0
    def split_454_style(self, seqfiles_to_split):
        """Build the per-sample map, fastq-split and demultiplex tasks.

        ``self.sample_metadata`` is sorted and grouped by its first item; for
        each group a map-writing task, a fastq-split task and a demultiplex
        task are created.

        :param seqfiles_to_split: sequence files; the subset belonging to each
                                  sample group is selected with
                                  ``self._filter_files_for_sample``.
        :returns: a ``(demuxed, tasks)`` tuple: the demultiplexed output file
                  names and the list of created tasks.
        """
        tasks, demuxed = list(), list()
        self.sample_metadata = sorted(self.sample_metadata, key=firstitem)

        # Separate 'qiime_opts' from the other demultiplex options once, on a
        # copy: popping from the shared options inside the loop would hand
        # the configured qiime_opts to the first sample only, and would
        # raise KeyError when no 'demultiplex' section is configured.
        demux_opts = dict(self.options.get('demultiplex', dict()))
        base_qiime_opts = demux_opts.pop('qiime_opts', {})

        for sample_id, sample_group in groupby(self.sample_metadata,
                                               firstitem):
            sample_dir = join(self.products_dir, sample_id)
            # groupby hands out a one-shot iterator; it is used several times
            # below, so materialize it.
            sample_group = list(sample_group)
            map_fname = util.new_file("map.txt", basedir=sample_dir)
            tasks.append(
                sixteen.write_map(sample_group, sample_dir,
                                  **self.options.get('write_map', dict())))

            files_list = self._filter_files_for_sample(seqfiles_to_split,
                                                       sample_group)
            fasta_fname = util.new_file(sample_id + ".fa", basedir=sample_dir)
            qual_fname = util.new_file(sample_id + ".qual", basedir=sample_dir)
            tasks.append(
                general.fastq_split(files_list, fasta_fname, qual_fname,
                                    **self.options.get('fastq_split', dict())))

            # Each sample gets its own copy so that a barcode type determined
            # for one sample never leaks into the next.
            qiime_opts = dict(base_qiime_opts)
            if 'barcode-type' not in qiime_opts:
                qiime_opts['barcode-type'] = _determine_barcode_type(
                    sample_group)
            demuxed_fname = util.addtag(fasta_fname, "demuxed")
            tasks.append(
                sixteen.demultiplex(map_fname,
                                    fasta_fname,
                                    qual_fname,
                                    demuxed_fname,
                                    qiime_opts=qiime_opts,
                                    **demux_opts))
            demuxed.append(demuxed_fname)

        return demuxed, tasks
Beispiel #4
0
 def _process_raw_demuxed_fastq_files(self):
     """Yield one filter task per raw demultiplexed fastq file.

     After each task has been yielded, the name of its filtered output is
     appended to ``self.demuxed_fasta_files``.
     """
     for fname in self.raw_demuxed_fastq_files:
         filtered_fname = util.addtag(fname, "filtered")
         # Work on a copy so the per-file 'mangle_to' value is not written
         # back into the shared 'fastq_filter' options.
         opts = dict(self.options.get('fastq_filter', {}))
         # 'mangle_to' is the first field of the first sample matched to
         # this file.
         opts['mangle_to'] = self._filter_samples_for_file(
             self.sample_metadata, fname)[0][0]
         yield usearch.filter(fname, filtered_fname, **opts)
         self.demuxed_fasta_files.append(filtered_fname)
Beispiel #5
0
    def _configure(self):
        """Yield the OTU-table merging, conversion and plotting tasks.

        When ``self.otu_tables`` is non-empty the tables are merged and
        sample metadata is added to the merged table. Every table in
        ``self.merged_otu_tables`` then gets a stacked bar chart task and is
        converted, through two tsv intermediates, into a pcl file. Finally a
        PCoA plot task is yielded for every file in ``self.pcl_files``.
        """
        if self.otu_tables:
            merged_file = util.new_file(
                util.addtag(self.otu_tables[0], "merged"),
                basedir=self.products_dir)
            yield sixteen.merge_otu_tables(self.otu_tables, name=merged_file)

            annotated_biom = util.addtag(merged_file, "meta")
            sample_metadata = self._get_or_create_sample_metadata()
            yield biom.add_metadata(merged_file, annotated_biom,
                                    sample_metadata)
            self.merged_otu_tables.append(annotated_biom)

        for biom_table in self.merged_otu_tables:
            chart_dir = util.new_file(biom_table + "_barcharts",
                                      basedir=self.products_dir)
            chart_opts = self.options.get('stacked_bar_chart', {})
            yield visualization.stacked_bar_chart(biom_table, chart_dir,
                                                  **chart_opts)

            # biom -> tsv -> maaslin-tagged tsv -> pcl merged with metadata
            raw_tsv = biom_table + ".tsv"
            yield association.biom_to_tsv(biom_table, raw_tsv)

            maaslin_tsv = util.addtag(raw_tsv, 'maaslin')
            yield association.qiime_to_maaslin(raw_tsv, maaslin_tsv)

            pcl_path = biom_table + ".pcl"
            yield association.merge_otu_metadata(
                maaslin_tsv,
                self._get_or_create_sample_metadata(),
                pcl_path)
            self.pcl_files.append(pcl_path)

        for pcl_path in self.pcl_files:
            plot_opts = self.options.get('breadcrumbs_pcoa_plot', {})
            yield visualization.breadcrumbs_pcoa_plot(
                pcl_path,
                pcl_path + "_pcoa_plot.png",
                CoordinatesMatrix=pcl_path + "_pcoa_coords.txt",
                **plot_opts)
Beispiel #6
0
def maybe_stitch(maybe_pairs,
                 products_dir,
                 barcode_files=None,
                 drop_unpaired=False):
    """Create the tasks that join paired sequence files into single files.

    :param maybe_pairs: sequence inputs; ``split_pairs`` separates them into
                        pairs and singles.
    :param products_dir: directory passed as ``basedir`` for new files.
    :keyword barcode_files: optional list of barcode files; after sorting
                            they are matched up positionally with the sorted
                            pairs. Defaults to no barcode files.
    :keyword drop_unpaired: passed to ``general.fastq_join`` as the
                            ``drop_unpaired`` option. When true and a barcode
                            file is present, an additional task writes a
                            "filtered" barcode file.
    :returns: a ``(singles, barcodes, tasks)`` tuple. Without any pairs the
              singles and the given barcode files are returned unchanged
              along with an empty task list.
    """
    # A fresh list per call: the early return below hands this object back
    # to the caller, so a shared default could be mutated between calls.
    if barcode_files is None:
        barcode_files = list()

    pairs, singles = split_pairs(maybe_pairs)
    tasks = list()
    barcodes = list()

    if not pairs:
        return singles, barcode_files, tasks

    pairs = sorted(pairs, key=firstitem)
    barcode_files = sorted(barcode_files)
    # NOTE(review): izip_longest pads the shorter side with None. A missing
    # barcode is handled below; more barcode files than pairs would pass
    # None as the pair -- confirm callers never do that.
    for pair, maybe_barcode in izip_longest(pairs, barcode_files):
        (forward,
         reverse), maybe_tasks = maybe_convert_to_fastq(pair, products_dir)
        tasks.extend(maybe_tasks)
        output = util.new_file(_to_merged(forward), basedir=products_dir)
        singles.append(output)
        if maybe_barcode and drop_unpaired:
            tasks.append(
                general.fastq_join(forward,
                                   reverse,
                                   output,
                                   options={'drop_unpaired': drop_unpaired}))
            filtered_barcode = util.new_file(util.addtag(
                maybe_barcode, "filtered"),
                                             basedir=products_dir)
            pairtask = general.sequence_pair(maybe_barcode,
                                             output,
                                             outfname1=filtered_barcode,
                                             options={"inner_join": "right"})
            barcodes.append(filtered_barcode)
            tasks.append(pairtask)
        else:
            tasks.append(
                general.fastq_join(forward, reverse, output, maybe_barcode,
                                   {'drop_unpaired': drop_unpaired}))
            barcodes.append(maybe_barcode)

    return singles, barcodes, tasks
Beispiel #7
0
def stacked_bar_chart(biom_fname, output_dir, qiime_opts=dict()):
    """Workflow producing stacked bar charts from a biom-formatted
    taxonomic profile with QIIME's `summarize_taxa_through_plots.py`.

    :param biom_fname: String; file name of the single biom-formatted OTU
                       table or taxonomic profile to visualize.
    :param output_dir: String; directory passed to the script as its output
                       location.
    :keyword qiime_opts: Dictionary; extra command line options for the
                         wrapped summarize_taxa_through_plots.py script,
                         given without their leading - or -- (these are
                         added when the options are rendered). For boolean
                         options use the key/value pattern
                         { "my-option": "" }. The "force" option is
                         switched on unless overridden here.

    External dependencies
      - Qiime 1.8.0: https://github.com/qiime/qiime-deploy

    """

    command = "summarize_taxa_through_plots.py -i {} -o {} ".format(
        biom_fname, output_dir)

    # Caller-supplied options take precedence over the built-in default.
    script_opts = {"force": True}
    script_opts.update(qiime_opts)
    command += dict_to_cmd_opts(script_opts)

    # The declared target is the "L2"-tagged copy of the input's base name
    # inside the output directory.
    tagged_name = addtag(os.path.basename(biom_fname), "L2")
    target = os.path.join(output_dir, tagged_name)

    task = {
        "name": "stacked_bar_chart: " + output_dir,
        "actions": [command],
        "file_dep": [biom_fname],
        "targets": [target]
    }
    yield task
Beispiel #8
0
 def _drop_unknown():
     """Filter the OTU table in ``file`` down to the known OTU ids.

     The set of known ids is the first tab-separated column of the gzipped
     file ``_copy_fname``. The filtered table is written to a temporary
     file; the original is then renamed with an "unfiltered" tag and the
     filtered table takes its place. ``file``, ``_copy_fname`` and
     ``addtag`` come from the enclosing scope.
     """
     import os
     import gzip
     import json
     from biom.table import DenseOTUTable
     from biom.parse import (
         OBS_META_TYPES,
         parse_biom_table,
         parse_classic_table_to_rich_table
     )
     # Close the gzip handle once the ids are collected.
     with gzip.open(_copy_fname) as copy_numbers:
         idx = set(row.strip().split('\t')[0] for row in copy_numbers)
     filter_func = lambda a, otu_id, c: str(otu_id) in idx
     tmpfile = file+"_tmp.biom"
     with open(file) as f, open(tmpfile, 'w') as f_out:
         try:
             table = parse_biom_table(f)
         except Exception:
             # Any parse failure falls back to the classic table format.
             # The failed attempt may already have consumed the handle, so
             # rewind before parsing again.
             f.seek(0)
             table = parse_classic_table_to_rich_table(
                 f, None, None, OBS_META_TYPES['taxonomy'], DenseOTUTable)
         table = table.filterObservations(filter_func)
         json.dump( table.getBiomFormatObject("AnADAMA"), f_out )
     os.rename(file, addtag(file, "unfiltered"))
     os.rename(tmpfile, file)
Beispiel #9
0
 def _join(s):
     """Return a new file name in the configured output directory, built
     from ``seqfile_in`` with its extension replaced by ``suffix`` and
     tagged with ``s``.
     """
     stem = os.path.splitext(seqfile_in)[0]
     tagged = addtag(stem + '.' + suffix, s)
     return new_file(tagged, basedir=default_opts['output'])
Beispiel #10
0
def picrust(file, output_dir=None, verbose=True, **opts):
    """Workflow to predict metagenome functional content from 16S OTU tables.

    :param file: String; input OTU table.
    :keyword output_dir: String; base directory for the normalized and
                         predicted output files (default:None)
    :keyword verbose: Boolean; passed on to every command action
                      (default:True)
    :keyword tab_in: Boolean; True if the input is a tabulated 
                     file (default:0)
    :keyword tab_out: Boolean; True if the output file is to be
                      tabulated (default:False)
    :keyword gg_version: String; the greengenes version to be used
                         (default:most recent version)
    :keyword t: String; option to use a different type of prediction
                   (default:KO)
    :keyword with_confidence: Boolean; Set to True to output confidence 
                              intervals (default:0)
    :keyword custom: String; specify a file containing a custom trait to 
                     predict metagenomes
    :keyword drop_unknown: Boolean; when True, the input table is first
                           filtered down to the OTU ids listed in the
                           configured copy number file (default:True)
    :returns: a task dictionary with the keys name, actions, file_dep,
              targets and title.

    External Dependencies:
      - PICRUSt: Version 1.0.0, 
        http://picrust.github.io/picrust/install.html#install

    """
    norm_out = new_file(addtag(file, "normalized_otus"), basedir=output_dir)
    predict_out = new_file(addtag(file, "picrust"), basedir=output_dir)

    all_opts = { 'tab_in'          : 0,  'tab_out' : 0,
                 'gg_version'      : '', 't'       : '', 
                 'with_confidence' : 0,  'custom'  : '',
                 'drop_unknown'    : True}
    all_opts.update(opts)
    drop_unknown = all_opts.pop("drop_unknown", True)

    _copy_fname = settings.workflows.picrust.copy_number
    def _drop_unknown():
        """Replace ``file`` with a copy holding only the known OTU ids; the
        original is kept under an "unfiltered" tag."""
        import os
        import gzip
        import json
        from biom.table import DenseOTUTable
        from biom.parse import (
            OBS_META_TYPES,
            parse_biom_table,
            parse_classic_table_to_rich_table
        )
        # Known ids are the first tab-separated column of the copy number
        # file; close the gzip handle once they are collected.
        with gzip.open(_copy_fname) as copy_numbers:
            idx = set(row.strip().split('\t')[0] for row in copy_numbers)
        filter_func = lambda a, otu_id, c: str(otu_id) in idx
        tmpfile = file+"_tmp.biom"
        with open(file) as f, open(tmpfile, 'w') as f_out:
            try:
                table = parse_biom_table(f)
            except Exception:
                # Any parse failure falls back to the classic table format.
                # The failed attempt may already have consumed the handle,
                # so rewind before parsing again.
                f.seek(0)
                table = parse_classic_table_to_rich_table(
                    f, None, None, OBS_META_TYPES['taxonomy'], DenseOTUTable)
            table = table.filterObservations(filter_func)
            json.dump( table.getBiomFormatObject("AnADAMA"), f_out )
        os.rename(file, addtag(file, "unfiltered"))
        os.rename(tmpfile, file)


    # Both commands keep a %s placeholder for their input file, which is
    # filled in when the actions are built inside run().
    cmd1 = ("normalize_by_copy_number.py "
            + "-i %s"
            + " -o " + norm_out)
    if all_opts['gg_version']:
        cmd1 += " -g " + all_opts['gg_version']
    if all_opts['tab_in']:
        cmd1 += " -f"

    cmd2 = ("predict_metagenomes.py "
            + "-i %s"
            + " -o " + predict_out)
    if all_opts['gg_version']:
        cmd2 += " -g " + all_opts['gg_version']
    if all_opts['tab_out']:
        cmd2 += " -f"
    if all_opts['t']:
        cmd2 += " -t " + all_opts['t']
    if all_opts['with_confidence']:
        cmd2 += " --with_confidence"
    if all_opts['custom']:
        cmd2 += " -c " + all_opts['custom']


    converted = addtag(file, "json")
    format_cmd = CmdAction('biom convert --table-type="OTU table"'
                           ' --header-key taxonomy --to-json'
                           ' -i {} -o {} '.format(file, converted),
                           verbose=verbose)
    def run(targets):
        # try to run without converting to json, if that fails,
        # convert first, then run on the json-converted biom file
        if os.stat(file).st_size < 1:
            # An empty input yields empty targets without running PICRUSt.
            for target in targets:
                open(target, "w").close()
            return True

        return strategies.backup(
            (strategies.Group(CmdAction(cmd1%(file), verbose=verbose),
                              CmdAction(cmd2%(norm_out), verbose=verbose)),
             strategies.Group(format_cmd,
                              CmdAction(cmd1%(converted), verbose=verbose),
                              CmdAction(cmd2%(norm_out), verbose=verbose)))
        )
             
    actions = [run]
    if drop_unknown:
        actions = [_drop_unknown, run]

    def _rusage(task):
        # Task title with memory and time estimates that grow with the size
        # of the first file dependency.
        msg = task.name+" Estimated mem={mem} time={time} threads=1"
        s = os.stat(list(task.file_dep)[0]).st_size
        return msg.format(
            mem=100+(s/1024.),
            time=100+(s*2.5e-4)
        )
        
    return dict(
        name = "picrust:"+predict_out,
        actions = actions,
        file_dep = [file],
        targets = [predict_out, norm_out],
        title = _rusage,
    )
Beispiel #11
0
def demultiplex_illumina(fastq_fnames, barcode_fnames, map_fname, output_fname,
                         verbose=True, qiime_opts={}):
    """Workflow to demultiplex fastq files with `split_libraries_fastq.py`.

    Two command lines are prepared: one using the map file as given and one
    using a copy of the map whose barcode sequences are reverse
    complemented. Both are handed to ``strategies.backup`` (presumably tried
    in that order -- confirm against ``strategies``).

    :param fastq_fnames: sequence of fastq file names, joined with commas
                         for the ``i`` option.
    :param barcode_fnames: sequence of barcode file names, joined with
                           commas for the ``b`` option.
    :param map_fname: String; the map file name (``m`` option).
    :param output_fname: String; the desired output file. Its directory is
                         used as the ``o`` option; when its base name is not
                         "seqs.fna" a final ``mv`` action moves the result
                         there.
    :keyword verbose: Boolean; passed on to every command action.
    :keyword qiime_opts: Dictionary; extra options that override the ones
                         derived from the arguments above.
    :returns: a task dictionary with the keys name, actions, file_dep,
              targets and title.
    """

    output_dir, output_basename = os.path.split(output_fname)
    forward_opts = {
        "i": ",".join(fastq_fnames),
        "b": ",".join(barcode_fnames),
        "m": map_fname,
        "o": output_dir
    }
    forward_opts.update(qiime_opts)
    forward_args = dict_to_cmd_opts(forward_opts)

    cmd = "split_libraries_fastq.py "

    # Same options, but pointing at the reverse-complemented map file.
    revcomp_map_fname = new_file(addtag(map_fname, "revcomp"),
                                 basedir=output_dir)
    revcomp_opts = dict(forward_opts)
    revcomp_opts['m'] = revcomp_map_fname
    revcomp_args = dict_to_cmd_opts(revcomp_opts)

    def _revcomp():
        from anadama.util import deserialize_map_file, serialize_map_file
        from Bio.Seq import Seq

        def _flip_barcode(sample):
            flipped = Seq(sample.BarcodeSequence).reverse_complement()
            return sample._replace(BarcodeSequence=str(flipped))

        with open(map_fname) as map_handle:
            samples = deserialize_map_file(map_handle)
            serialize_map_file(
                (_flip_barcode(sample) for sample in samples),
                revcomp_map_fname
            )

    default_out = os.path.join(output_dir, "seqs.fna")

    def _output_missing(*args, **kwargs):
        # True while the default output is absent or at most one byte long.
        return not (os.path.exists(default_out)
                    and os.stat(default_out).st_size > 1)

    def run():
        first_try = CmdAction(cmd + forward_args, verbose=verbose)
        second_try = strategies.Group(
            PythonAction(_revcomp),
            CmdAction(cmd + revcomp_args, verbose=verbose))
        return strategies.backup(
            (first_try, second_try),
            extra_conditions=[_output_missing]
        )

    actions = [run]
    if output_basename != "seqs.fna":
        actions.append("mv '%s' '%s'"%(default_out, output_fname))

    def _title(t):
        # Estimate derived from the combined size of the file dependencies.
        total_size = sum(os.stat(f).st_size for f in t.file_dep)
        return t.name + " Estimated time=%.2f" % (total_size/1024./1024/5)

    return {
        "name": "demultiplex_illumina:"+output_fname,
        "actions": actions,
        "file_dep": list(fastq_fnames) + list(barcode_fnames) + [map_fname],
        "targets": [output_fname],
        "title": _title
    }