Example 1
    def split_454_style(self, seqfiles_to_split):
        tasks, demuxed = list(), list()
        self.sample_metadata = sorted(self.sample_metadata, key=firstitem)
        for sample_id, sample_group in groupby(self.sample_metadata,
                                               firstitem):
            sample_dir = join(self.products_dir, sample_id)
            sample_group = list(sample_group)
            map_fname = util.new_file("map.txt", basedir=sample_dir)
            tasks.append(
                sixteen.write_map(sample_group, sample_dir,
                                  **self.options.get('write_map', dict())))

            files_list = self._filter_files_for_sample(seqfiles_to_split,
                                                       sample_group)
            fasta_fname = util.new_file(sample_id + ".fa", basedir=sample_dir)
            qual_fname = util.new_file(sample_id + ".qual", basedir=sample_dir)
            tasks.append(
                general.fastq_split(files_list, fasta_fname, qual_fname,
                                    **self.options.get('fastq_split', dict())))

            qiime_opts = self.options['demultiplex'].pop('qiime_opts', {})
            if 'barcode-type' not in qiime_opts:
                qiime_opts['barcode-type'] = _determine_barcode_type(
                    sample_group)
            demuxed_fname = util.addtag(fasta_fname, "demuxed")
            tasks.append(
                sixteen.demultiplex(map_fname,
                                    fasta_fname,
                                    qual_fname,
                                    demuxed_fname,
                                    qiime_opts=qiime_opts,
                                    **self.options.get('demultiplex', dict())))
            demuxed.append(demuxed_fname)

        return demuxed, tasks
Example 2
    def _configure(self):
        if self.options['infer_pairs'].get('infer'):
            paired, notpaired = infer_pairs(self.raw_seq_files)
            self.raw_seq_files = paired + notpaired

        maybe_tasks = list()
        for maybe_pair in self.raw_seq_files:
            is_pair = isinstance(maybe_pair, (tuple, list))
            if is_pair:
                pair, tasks = maybe_convert_to_fastq(maybe_pair,
                                                     self.products_dir)
                self.paired_fastq_files.append(pair)
                maybe_tasks.extend(tasks)
            elif util.guess_seq_filetype(maybe_pair) == 'bam':
                prefix = util.new_file(util.rmext(basename(maybe_pair)),
                                       basedir=self.products_dir)
                t = samtools.to_paired_fastq(maybe_pair, prefix)
                paired, single = t['targets'][:2], t['targets'][2]
                self.paired_fastq_files.append(paired)
                self.unpaired_fastq_files.append(single)
                maybe_tasks.append(t)
            else:
                single, tasks = maybe_convert_to_fastq([maybe_pair],
                                                       self.products_dir)
                self.unpaired_fastq_files.append(single[0])
                maybe_tasks.extend(tasks)

        for task in maybe_tasks:
            yield task

        for pair in self.paired_fastq_files:
            align_sam = util.new_file(_to_merged(basename(pair[0]),
                                                 tag="align"),
                                      basedir=self.products_dir)
            align_sam += ".sam"
            self.align_sams.append(align_sam)
            yield subread.align(pair, align_sam,
                                self.options.get('subread_align', dict()))

        for single in self.unpaired_fastq_files:
            align_sam = util.new_file(util.addtag(basename(single), "align"),
                                      basedir=self.products_dir)
            align_sam += ".sam"
            self.align_sams.append(align_sam)
            yield subread.align(single, align_sam,
                                self.options.get('subread_align', dict()))

        for align_sam in self.align_sams:
            count_table = util.new_file(util.addtag(basename(align_sam),
                                                    "count"),
                                        basedir=self.products_dir)
            self.count_tables.append(count_table)
            yield subread.featureCounts([align_sam], count_table,
                                        self.options.get(
                                            'featureCounts', dict()))
Example 3
def group_by_sampleid(large_fastas, output_dir, sample_ids):
    output_fnames = [
        new_file(s + "_demuxed.fa", basedir=output_dir) for s in sample_ids
    ]

    if type(large_fastas) is str:
        large_fastas = [large_fastas]

    def _run():
        import contextlib
        from Bio import SeqIO
        from itertools import chain
        records = chain.from_iterable(
            SeqIO.parse(fname, "fasta") for fname in large_fastas)

        files = dict([(s_id, open(f, 'w'))
                      for f, s_id in zip(output_fnames, sample_ids)])
        with contextlib.nested(*list(files.values())):
            for rec in records:
                sample_id = "_".join(rec.id.split("_")[:-2])
                SeqIO.write(rec, files[sample_id], "fasta")

    return {
        "name": "group_by_sampleid: " + large_fastas[0],
        "actions": [_run],
        "targets": output_fnames,
        "file_dep": large_fastas
    }
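
A hedged usage sketch (file and sample names are hypothetical). Record ids in the input fasta are assumed to carry two trailing underscore-delimited fields after the sample id (e.g. "sampleA_ACGT_12345", matching the barcode-tagged sample ids written by write_map in Example 10), since the sample id is recovered with rec.id.split("_")[:-2]:

    # hypothetical inputs: one merged demultiplexed fasta, two samples
    task = group_by_sampleid(["all_samples_demuxed.fa"],
                             "by_sampleid",
                             ["sampleA", "sampleB"])
    # an AnADAMA/doit-style runner executes task["actions"] to write
    # by_sampleid/sampleA_demuxed.fa and by_sampleid/sampleB_demuxed.fa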
Example 4
def maybe_stitch(maybe_pairs,
                 products_dir,
                 barcode_files=list(),
                 drop_unpaired=False):
    pairs, singles = split_pairs(maybe_pairs)
    tasks = list()
    barcodes = list()

    if not pairs:
        return singles, barcode_files, tasks

    pairs = sorted(pairs, key=firstitem)
    barcode_files = sorted(barcode_files)
    for pair, maybe_barcode in izip_longest(pairs, barcode_files):
        (forward, reverse), maybe_tasks = maybe_convert_to_fastq(
            pair, products_dir)
        tasks.extend(maybe_tasks)
        output = util.new_file(_to_merged(forward), basedir=products_dir)
        singles.append(output)
        if maybe_barcode and drop_unpaired:
            tasks.append(
                general.fastq_join(forward,
                                   reverse,
                                   output,
                                   options={'drop_unpaired': drop_unpaired}))
            filtered_barcode = util.new_file(
                util.addtag(maybe_barcode, "filtered"),
                basedir=products_dir)
            pairtask = general.sequence_pair(maybe_barcode,
                                             output,
                                             outfname1=filtered_barcode,
                                             options={"inner_join": "right"})
            barcodes.append(filtered_barcode)
            tasks.append(pairtask)
        else:
            tasks.append(
                general.fastq_join(forward, reverse, output, maybe_barcode,
                                   {'drop_unpaired': drop_unpaired}))
            barcodes.append(maybe_barcode)

    return singles, barcodes, tasks
Example 5
    def split_illumina_style(self, seqfiles_to_split, barcode_seqfiles):
        demuxed, tasks = list(), list()
        bcode_pairs = zip(seqfiles_to_split, barcode_seqfiles)

        options = self.options.get("demultiplex_illumina", dict())
        if 'barcode_type' not in options:
            options['barcode_type'] = _determine_barcode_type(
                self.sample_metadata)

        do_groupby = options.pop("group_by_sampleid", False)
        for seqfile, bcode_file in bcode_pairs:
            sample_dir = join(self.products_dir, basename(seqfile) + "_split")

            map_fname = util.new_file("map.txt", basedir=sample_dir)
            sample_group = self._filter_samples_for_file(
                self.sample_metadata,
                seqfile,
                key=lambda val: val.Run_accession)
            tasks.append(
                sixteen.write_map(sample_group, sample_dir,
                                  **self.options.get('write_map', dict())))

            outfile = util.new_file(util.rmext(basename(seqfile)) +
                                    "_demuxed.fna",
                                    basedir=sample_dir)
            tasks.append(
                sixteen.demultiplex_illumina([seqfile], [bcode_file],
                                             map_fname,
                                             outfile,
                                             qiime_opts=options))
            demuxed.append(outfile)

        if do_groupby:
            output_dir = join(self.products_dir, "demuxed_by-sampleid")
            # note: sample_group here comes from the last iteration of the
            # loop above; the sample ids are assumed shared across files
            sample_ids = [s[0] for s in sample_group]
            task_dict = general.group_by_sampleid(demuxed, output_dir,
                                                  sample_ids)
            demuxed = task_dict['targets']
            tasks.append(task_dict)

        return demuxed, tasks
Example 6
def maybe_convert_to_fastq(fnames, products_dir):
    new_fnames, tasks = list(), list()
    for f in fnames:
        guess = util.guess_seq_filetype(f)
        if guess != "fastq" or util.is_compressed(f):
            fastq_file = util.new_file(f + ".fastq", basedir=products_dir)
            new_fnames.append(fastq_file)
            tasks.append(general.sequence_convert([f], fastq_file))
        else:
            new_fnames.append(f)

    return new_fnames, tasks
Example 7
    def _configure(self):
        if self.otu_tables:
            merged_name = util.addtag(self.otu_tables[0], "merged")
            merged_file = util.new_file(merged_name, basedir=self.products_dir)
            yield sixteen.merge_otu_tables(
                self.otu_tables,
                name=merged_file
            )
            meta_biom_name = util.addtag(merged_file, "meta")
            yield biom.add_metadata(
                merged_file, meta_biom_name, 
                self._get_or_create_sample_metadata()
            )
            self.merged_otu_tables.append(meta_biom_name)

        for otu_table in self.merged_otu_tables:
            barchart_path = util.new_file(
                otu_table+"_barcharts", basedir=self.products_dir)
            yield visualization.stacked_bar_chart(
                otu_table, barchart_path,
                **self.options.get('stacked_bar_chart', {}))

            tsv_filename = otu_table+".tsv"
            yield association.biom_to_tsv(otu_table, tsv_filename)
            nice_tsv_filename = util.addtag(tsv_filename, 'maaslin')
            yield association.qiime_to_maaslin(tsv_filename, nice_tsv_filename)
            pcl_filename = otu_table+".pcl"
            yield association.merge_otu_metadata(
                nice_tsv_filename, 
                self._get_or_create_sample_metadata(),
                pcl_filename
            )
            self.pcl_files.append(pcl_filename)

        for pcl_file in self.pcl_files:
            yield visualization.breadcrumbs_pcoa_plot(
                pcl_file, pcl_file+"_pcoa_plot.png",
                CoordinatesMatrix = pcl_file+"_pcoa_coords.txt",
                **self.options.get('breadcrumbs_pcoa_plot', {})
            )
Example 8
    def _configure(self):
        yield self._handle_raw_seqs_and_demultiplex()

        # merge all of the demultiplexed files into a single file
        merged_fasta = util.new_file("all_samples.fa",
                                     basedir=self.products_dir)
        yield general.cat(self.demuxed_fasta_files, merged_fasta)

        otu_table_biom = util.new_file("all_samples_otu_tax.biom",
                                       basedir=self.products_dir)
        otu_table_tsv = util.new_file("all_samples_otu_tax.tsv",
                                      basedir=self.products_dir)
        # run closed reference picking
        yield pick_otus_closed_ref(merged_fasta,
                                   otu_table_biom,
                                   out_tsv=otu_table_tsv,
                                   **self.options.get('pick_otus_closed_ref',
                                                      dict()))

        # infer genes and pathways with picrust
        yield sixteen.picrust(otu_table_biom,
                              **self.options.get('picrust', dict()))
Example 9
def maybe_concatenate(maybe_pairs, products_dir):
    pairs, singles = split_pairs(maybe_pairs)
    tasks = list()

    if not pairs:
        return singles, tasks

    for pair in pairs:
        catted_fname = util.new_file(
            _to_merged(pair[0], tag="cat", strip_ext=False),
            basedir=products_dir)

        simply_cat = all(util.guess_seq_filetype(s) in ('fastq', 'fasta')
                         for s in pair)
        if simply_cat:
            tasks.append(general.cat(pair, catted_fname))
        else:
            tasks.append(general.sequence_convert(pair, catted_fname))

        singles.append(catted_fname)

    return singles, tasks
Example 10
def write_map(sample_group, sample_dir):
    """Workflow to write a new map.txt file from a list of samples. The
    resultant map.txt file is always named 'map.txt' and is placed in
    the ``sample_dir`` directory.

    :param sample_group: List of namedtuples; a list of samples as
                         deserialized by anadama.util.deserialize_map_file
    :param sample_dir: String; directory path indicating where to write
                       the map.txt file

    """

    map_fname = new_file("map.txt", basedir=sample_dir)

    def _write(targets):
        with open(map_fname, 'w') as map_file:
            # print the headers first
            print >> map_file, "#"+"\t".join(sample_group[0]._fields)

            for _, samples_bycode in itertools.groupby(
                    sample_group, operator.attrgetter("BarcodeSequence")):
                # get the first (hopefully only) sample from the samples
                # grouped by ID then barcode. Ignore any other samples
                # under the same ID for the same barcode
                sample = samples_bycode.next()
                bcode = sample.BarcodeSequence
                # uniq-ify to make qiime happy
                sample = list(sample)
                sample[0] += "_" + bcode
                print >> map_file, "\t".join(sample)

    return {
        "name": "write_map:"+map_fname,
        "actions": [_write],
        "targets": [map_fname],
        "title": lambda t: t.name+" Estimated mem=200 time=5 threads=1"
    }
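
A minimal usage sketch, assuming this module's imports (new_file, itertools, operator) are in scope; the namedtuple and its field values are hypothetical stand-ins for records deserialized by anadama.util.deserialize_map_file:

    from collections import namedtuple

    # hypothetical sample records; field names are illustrative
    Sample = namedtuple("Sample", ["SampleID", "BarcodeSequence"])
    group = [Sample("sampleA", "ACGT"), Sample("sampleA", "ACGT")]

    task = write_map(group, "/tmp/sampleA")
    # task["targets"] names /tmp/sampleA/map.txt; the _write action emits
    # a header row plus one uniq-ified row per barcode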
Example 11
def metaphlan2(files_list, scratch=None, **opts):
    """Workflow to perform taxonomic profiling from whole metagenome
    shotgun sequences. Additional keyword options are used directly as
    bowtie2 command-line flags.

    :param files_list: List of strings; File paths to input sequences,
                       in fastq format.
    
    External dependencies
      - Metaphlan2 @tip: https://bitbucket.org/biobakery/metaphlan2

    Resource utilization:
      - Ram: 1.5-3.0G

    """
    def_base = opts.get("output_file") or files_list[0]
    all_opts = {
        'bt2_ps': 'very-sensitive',
        'bowtie2db': settings.workflows.metaphlan2.bowtie2db,
        'mpa_pkl': settings.workflows.metaphlan2.mpa_pkl,
        "bowtie2out": new_file(addext(def_base, "bowtie2out.txt")),
        "output_file": new_file(addext(def_base, "metaphlan2"))
    }
    all_opts.update(opts)

    if 'input_type' not in all_opts:
        guessed = guess_seq_filetype(files_list[0])
        if guessed not in ('fasta', 'fastq'):
            raise ValueError("Need sequences in fasta or fastq format, "
                             "or provide keyword 'input_type'")
        all_opts['input_type'] = biopython_to_metaphlan[guessed]

    targets = [all_opts['output_file'], all_opts['bowtie2out']]
    if 'biom' in opts:
        targets.append(opts['biom'])

    cmd = starters.cat(files_list, guess_from=files_list[0])
    if scratch:
        db, pkl = all_opts['bowtie2db'], all_opts['mpa_pkl']
        all_opts.pop('bowtie2db', None), all_opts.pop('mpa_pkl', None)
        dbbase, pklbase = map(os.path.basename, (db, pkl))
        cmd += (" | metaphlan2.py" + " " + dict_to_cmd_opts(all_opts))
        actions = [
            """ tdir=$(mktemp -d -p {sdir});
                cd ${{tdir}};
                mkdir -pv ${{tdir}}/dbs;
                cp {db}* {pkl} ${{tdir}}/dbs;
                {cmd} --mpa_pkl ${{tdir}}/dbs/{pklbase} \
                      --bowtie2db ${{tdir}}/dbs/{dbbase};
                rm -rvf ${{tdir}};
            """.format(sdir=scratch,
                       pkl=pkl,
                       db=db,
                       cmd=cmd,
                       pklbase=pklbase,
                       dbbase=dbbase)
        ]
    else:
        cmd += (" | metaphlan2.py" + " " + dict_to_cmd_opts(all_opts))
        actions = [cmd]

    def _perfhint(task):
        threads = int(all_opts.get('nproc', 1))
        insize = sum(os.stat(f).st_size for f in files_list)
        return "{n} Estimated mem={mem:.0f} time={time:.0f}, threads={threads:.0f}".format(
            n=task.name,
            mem=1.5 * 1024,
            time=15 + (insize / 1.2e9 / (threads)),
            threads=threads)

    return dict(
        name="metaphlan2:" + all_opts['output_file'],
        actions=actions,
        file_dep=files_list,
        targets=targets,
        title=_perfhint,
    )
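
A short usage sketch, assuming settings.workflows.metaphlan2 points at a valid database; file names are hypothetical, and extra keywords (here nproc) are passed through to metaphlan2.py:

    task = metaphlan2(["sampleA.fastq", "sampleB.fastq"], nproc=4)
    # default targets are derived from the first input file
    # (sampleA.fastq.metaphlan2 and sampleA.fastq.bowtie2out.txt);
    # pass output_file=... to override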
Example 12
# Nested helper excerpted from its enclosing workflow; it closes over
# ``seqfile_in``, ``suffix``, and ``default_opts`` defined there.
def _join(s):
    file, ext = os.path.splitext(seqfile_in)
    file = file + '.' + suffix
    return new_file(addtag(file, s), basedir=default_opts['output'])
Example 13
def picrust(file, output_dir=None, verbose=True, **opts):
    """Workflow to predict metagenome functional content from 16S OTU tables.

    :param file: String; input OTU table.
    :keyword tab_in: Boolean; True if the input is a tabulated
                     file (default: False)
    :keyword tab_out: Boolean; True if the output file is to be
                      tabulated (default: False)
    :keyword gg_version: String; the greengenes version to be used
                         (default: most recent version)
    :keyword t: String; option to use a different type of prediction
                (default: KO)
    :keyword with_confidence: Boolean; set to True to output confidence
                              intervals (default: False)
    :keyword custom: String; specify a file containing a custom trait to
                     predict metagenomes
    :keyword drop_unknown: Boolean; when True, first drop OTUs absent
                           from the PICRUSt copy-number table
                           (default: True)

    External dependencies:
      - PICRUSt: Version 1.0.0,
        http://picrust.github.io/picrust/install.html#install

    """
    norm_out = new_file(addtag(file, "normalized_otus"), basedir=output_dir)
    predict_out = new_file(addtag(file, "picrust"), basedir=output_dir)

    all_opts = { 'tab_in'          : 0,  'tab_out' : 0,
                 'gg_version'      : '', 't'       : '', 
                 'with_confidence' : 0,  'custom'  : '',
                 'drop_unknown'    : True}
    all_opts.update(opts)
    drop_unknown = all_opts.pop("drop_unknown", True)

    _copy_fname = settings.workflows.picrust.copy_number
    def _drop_unknown():
        import os
        import gzip
        import json
        from biom.table import DenseOTUTable
        from biom.parse import (
            OBS_META_TYPES,
            parse_biom_table,
            parse_classic_table_to_rich_table
        )
        idx = set([ row.strip().split('\t')[0]
                    for row in gzip.open(_copy_fname) ])
        filter_func = lambda a, otu_id, c: str(otu_id) in idx
        tmpfile = file+"_tmp.biom"
        with open(file) as f, open(tmpfile, 'w') as f_out:
            try:
                table = parse_biom_table(f)
            except Exception as e:
                table = parse_classic_table_to_rich_table(
                    f, None, None, OBS_META_TYPES['taxonomy'], DenseOTUTable)
            table = table.filterObservations(filter_func)
            json.dump( table.getBiomFormatObject("AnADAMA"), f_out )
        os.rename(file, addtag(file, "unfiltered"))
        os.rename(tmpfile, file)

    cmd1 = ("normalize_by_copy_number.py "
            + "-i %s"
            + " -o " + norm_out)
    if all_opts['gg_version']:
        cmd1 += " -g " + all_opts['gg_version']
    if all_opts['tab_in']:
        cmd1 += " -f"

    cmd2 = ("predict_metagenomes.py "
            + "-i %s"
            + " -o " + predict_out)
    if all_opts['gg_version']:
        cmd2 += " -g " + all_opts['gg_version']
    if all_opts['tab_out']:
        cmd2 += " -f"
    if all_opts['t']:
        cmd2 += " -t " + all_opts['t']
    if all_opts['with_confidence']:
        cmd2 += " --with_confidence"
    if all_opts['custom']:
        cmd2 += " -c " + all_opts['custom']

    converted = addtag(file, "json")
    format_cmd = CmdAction('biom convert --table-type="OTU table"'
                           ' --header-key taxonomy --to-json'
                           ' -i {} -o {} '.format(file, converted),
                           verbose=verbose)
    def run(targets):
        # try to run without converting to json, if that fails,
        # convert first, then run on the json-converted biom file
        if os.stat(file).st_size < 1:
            for target in targets:
                open(target, "w").close()
            return True

        return strategies.backup(
            (strategies.Group(CmdAction(cmd1%(file), verbose=verbose),
                              CmdAction(cmd2%(norm_out), verbose=verbose)),
             strategies.Group(format_cmd,
                              CmdAction(cmd1%(converted), verbose=verbose),
                              CmdAction(cmd2%(norm_out), verbose=verbose)))
        )
             
    actions = [run]
    if drop_unknown:
        actions = [_drop_unknown, run]

    def _rusage(task):
        msg = task.name+" Estimated mem={mem} time={time} threads=1"
        s = os.stat(list(task.file_dep)[0]).st_size
        return msg.format(
            mem=100+(s/1024.),
            time=100+(s*2.5e-4)
        )
        
    return dict(
        name = "picrust:"+predict_out,
        actions = actions,
        file_dep = [file],
        targets = [predict_out, norm_out],
        title = _rusage,
    )
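
A hedged usage sketch; the input path is hypothetical, and PICRUSt plus the copy-number table from settings must be available for the actions to run:

    task = picrust("all_samples_otu_tax.biom",
                   output_dir="products",
                   with_confidence=1,  # appends --with_confidence to cmd2
                   drop_unknown=True)  # pre-filter OTUs missing from the
                                       # copy-number table
    # targets are the "normalized_otus"- and "picrust"-tagged files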
Example 14
def pick_otus_open_ref(input_fname, output_dir, verbose=None, qiime_opts={}):
    """Workflow to perform open-reference OTU picking. Similar to
    closed-reference OTU picking, this workflow generates a
    biom-formatted OTU table from demultiplexed 16S reads. This
    workflow (in general terms) wraps qiime's
    pick_open_reference_otus.py, which itself wraps either uclust or
    usearch. Note that uclust and usearch require a fairly large
    memory footprint (1.5-2.0G in some cases).

    :param input_fname: String; File path to the input,
                        fasta-formatted 16S sequences
    :param output_dir: String; Path to the directory where the output OTU 
                       table will be saved as 'otu_table.biom'. Other 
                       qiime-specific logs will go there, too.
    :keyword verbose: Boolean; set to True to print the commands that are
                      run as they are run
    :keyword qiime_opts: Dictionary; a dictionary of command line options
                         to be passed to the wrapped
                         pick_open_reference_otus.py script. No - or --
                         flags are necessary; the correct - or -- flags are
                         inferred based on the length of the option. For
                         boolean options, use the key/value pattern
                         of { "my-option": "" }.

    External dependencies:
      - Qiime 1.8.0: https://github.com/qiime/qiime-deploy
      - USEARCH: (only if using the usearch option) 
        http://www.drive5.com/usearch/

    Resource utilization:
      - RAM: >1.5 G

    """

    output_fname = new_file("otu_table.biom", basedir=output_dir)
    revcomp_fname = new_file(
        "revcomp.fna", basedir=os.path.dirname(input_fname))

    verbose = settings.workflows.verbose if verbose is None else verbose

    default_opts = {
        "reference_fp": settings.workflows.sixteen.otu_refseq
    }
    default_opts.update(qiime_opts)
    opts = dict_to_cmd_opts(default_opts)

    cmd = (" pick_open_reference_otus.py"+
           " --input_fp={}"+
           " --output_dir="+output_dir+
           " -f"+
           " "+opts)

    revcomp_cmd = ("sequence_convert"+
                   " --format=fasta"+
                   " --to=fasta "+
                   " -r"+
                   " "+input_fname+
                   " > "+revcomp_fname)

    def run(targets):
        # return the strategy's result so a failed backup chain marks
        # the task as failed (cf. picrust's run above)
        return strategies.backup(
            (CmdAction(cmd.format(input_fname), verbose=verbose),
             strategies.Group(
                 CmdAction(revcomp_cmd),
                 CmdAction(cmd.format(revcomp_fname), verbose=verbose))),
            extra_conditions=[
                lambda ret, output_fname: os.stat(output_fname).st_size == 0
            ],
            output_fname=output_fname
        )

    return {
        "name": "pick_otus_open_ref:"+input_fname,
        "actions": [run],
        "targets": [output_fname],
        "file_dep": [input_fname],
    }
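
A minimal usage sketch (paths are hypothetical). Per the docstring, boolean qiime options use the { "option": "" } pattern; the flag below is illustrative:

    task = pick_otus_open_ref(
        "all_samples_demuxed.fna", "otus_open_ref",
        qiime_opts={"suppress_align_and_tree": ""})  # boolean flag
    # if the first pass yields an empty otu_table.biom, the backup
    # strategy reverse-complements the input and retries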
Example 15
def demultiplex_illumina(fastq_fnames, barcode_fnames, map_fname, output_fname,
                         verbose=True, qiime_opts={}):

    output_dir, output_basename = os.path.split(output_fname)
    default_opts = {
        "i": ",".join(fastq_fnames),
        "b": ",".join(barcode_fnames),
        "m": map_fname,
        "o": output_dir
    }
    default_opts.update(qiime_opts)
    opts = dict_to_cmd_opts(default_opts)
    
    cmd = "split_libraries_fastq.py "

    revcomp_map_fname = new_file(addtag(map_fname, "revcomp"),
                                 basedir=output_dir)
    revcomp_opts = default_opts.copy()
    revcomp_opts['m'] = revcomp_map_fname
    revcomp_opts = dict_to_cmd_opts(revcomp_opts)
    def _revcomp():
        from anadama.util import deserialize_map_file, serialize_map_file
        from Bio.Seq import Seq

        def _reverse(sample):
            seq = Seq(sample.BarcodeSequence).reverse_complement()
            return sample._replace(BarcodeSequence=str(seq))

        with open(map_fname) as from_map:
            from_samples = deserialize_map_file(from_map)
            serialize_map_file(
                ( _reverse(s) for s in from_samples ),
                revcomp_map_fname
            )

    default_out = os.path.join(output_dir, "seqs.fna")
    output_exists = lambda *args, **kwargs: (
        not os.path.exists(default_out)
        or not os.stat(default_out).st_size > 1
    )

    def run():
        return strategies.backup(
            (CmdAction(cmd+opts, verbose=verbose),
             strategies.Group(
                 PythonAction(_revcomp),
                 CmdAction(cmd+revcomp_opts,verbose=verbose))),
            extra_conditions=[output_exists]
        )

    actions = [run]
    if output_basename != "seqs.fna":
        actions.append("mv '%s' '%s'"%(default_out, output_fname))

    return {
        "name": "demultiplex_illumina:"+output_fname,
        "actions": actions,
        "file_dep": list(fastq_fnames) + list(barcode_fnames) + [map_fname],
        "targets": [output_fname],
        "title": lambda t: t.name+" Estimated time=%.2f"%(
            sum(os.stat(f).st_size for f in t.file_dep)/1024./1024/5)
    }
Example 16
# Nested helper excerpted from its enclosing workflow: appends an
# extraction task and returns the decompressed file's new path under
# ``products_dir``.
def _decompress(f):
    unz_f = os.path.splitext(f)[0]
    unz_f = util.new_file(os.path.basename(unz_f), basedir=products_dir)
    tasks.append(general.extract(f, unz_f))
    return unz_f
Example 17
def upload(files_16s, files_wgs, sub_fname, ready_fname, keyfile, remote_path,
           remote_srv, user, products_dir):
    """Upload raw sequence files and xml.

    :param keyfile: String; absolute filepath to private SSH keyfile for
    access to NCBI's submission server

    :param remote_path: String; the directory on the NCBI submission
    server where to upload data. If unset, the remote_path is
    automatically determined.

    :param remote_srv: String; TLD of NCBI's submission server

    :param user: String; username used to access NCBI's submission server

    """

    to_upload = [
        f for f in list(files_16s) + list(files_wgs)
        if not f.endswith(".complete")
    ]
    ssh_session = ssh.SSHConnection(user, remote_srv, keyfile, remote_path)
    uptodate = [ssh_session.uptodate]

    def _upload(local_fname, complete_fname, blithely=False):
        def _u():
            ret = asp.upload_file(remote_srv,
                                  user,
                                  None,
                                  local_fname,
                                  remote_path,
                                  keyfile=keyfile)
            if blithely or ret:
                open(complete_fname, 'w').close()
            return blithely or ret  # return True if blithely is True

        return _u

    complete_fnames = [
        new_file(f + ".complete", basedir=products_dir) for f in to_upload
    ]
    for f, complete_fname in zip(to_upload, complete_fnames):
        yield {
            "name": "upload: " + basename(f),
            "actions": [_upload(f, complete_fname)],
            "file_dep": [f],
            "uptodate": uptodate,
            "targets": [complete_fname]
        }

    yield {
        "name": "upload: " + basename(sub_fname),
        "actions": [_upload(sub_fname, sub_fname + ".complete")],
        "file_dep": complete_fnames,
        "targets": [sub_fname + ".complete"]
    }

    yield {
        "name": "upload: " + basename(ready_fname),
        "actions": [_upload(ready_fname, ready_fname + ".complete", True)],
        "file_dep": complete_fnames + [sub_fname + ".complete"],
        "targets": [ready_fname + ".complete"]
    }
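
Because upload is a generator of task dicts, callers collect its tasks; a hedged sketch with hypothetical paths and hostname:

    tasks = list(upload(
        files_16s=["seqs_16s.fastq"], files_wgs=["seqs_wgs.fastq"],
        sub_fname="submission.xml", ready_fname="submit.ready",
        keyfile="/home/user/.ssh/ncbi_key",  # hypothetical
        remote_path="uploads/project1",      # hypothetical
        remote_srv="upload.ncbi.example",    # hypothetical hostname
        user="ncbi_user", products_dir="products"))
    # one task per raw file, then one for the xml, and last the ready
    # flag; the final two depend on the earlier ".complete" targets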
Example 18
    def _configure(self):
        for attr in self.sequence_attrs:
            seq_set = getattr(self, attr)

            if self.options['infer_pairs'].get('infer'):
                paired, notpaired = infer_pairs(seq_set)
                seq_set = paired + notpaired

            seq_set, maybe_tasks = maybe_concatenate(seq_set, self.products_dir)
            setattr(self, attr, seq_set)
            for t in maybe_tasks:
                yield t

        for file_ in self.raw_seq_files:
            if util.guess_seq_filetype(file_) != "fastq":
                fastq_file = util.new_file( basename(file_)+"_filtered.fastq",
                                            basedir=self.products_dir )
                yield general.sequence_convert(
                    [file_], fastq_file, 
                    **self.options.get('sequence_convert', dict())
                )
            else:
                fastq_file = file_
            self.intermediate_fastq_files.append(fastq_file)

        for fastq_file in self.intermediate_fastq_files:
            name_base = util.new_file(util.rmext(fastq_file, all=True),
                                      basedir=self.products_dir)
            task_dict = next(wgs.knead_data(
                [fastq_file], name_base,
                **self.options.get('decontaminate', {})
            ))
            decontaminated_fastq = task_dict['targets'][0]
            self.decontaminated_fastq_files.append(decontaminated_fastq)
            yield task_dict

        for d_fastq in self.decontaminated_fastq_files:
            metaphlan_file = util.new_file(
                basename(d_fastq)+".metaphlan2.tsv",
                basedir=self.products_dir )
            otu_table = metaphlan_file.replace('.tsv', '.biom')
            yield wgs.metaphlan2(
                [d_fastq], output_file=metaphlan_file,
                biom=otu_table,
                # first index is for first item in list of samples
                # second index is to get the sample id from the sample
                sample_id=self._filter_samples_for_file(self.sample_metadata,
                                                        d_fastq)[0][0],
                input_type="multifastq",
                **self.options.get('metaphlan2', dict())
            )
            self.metaphlan_results.append(metaphlan_file)
            self.otu_tables.append(otu_table)

            # Finally, run HUMAnN on each decontaminated fastq
            humann_output_dir = util.new_file(
                util.rmext(basename(d_fastq), all=True)+"_humann",
                basedir=self.products_dir
            )
            yield wgs.humann2(d_fastq, humann_output_dir,
                              **self.options.get('humann', dict()))