Example #1
0
class Coves(CommandLineApplication):
    """Application controller for Coves.

    Coves computes the score of each whole sequence individually and prints
    the scores. It can be used to spot sequences which, according to the
    model, do not belong to the same structural consensus: sequences that
    do not fit the model get negative scores.

    Supported options:
    -a          : show all pairs, not just Watson-Crick
    -g <gcfrac> : set expected background GC composition (default 0.5)
    -m          : mountain representation of structural alignment
    -s          : secondary structure string representation of
                  structural alignment (created with Value=True here)
    """

    _command = 'coves'
    _input_handler = '_input_as_string'

    _parameters = {
        '-a': FlagParameter(Name='a', Prefix='-'),
        '-g': ValuedParameter(Name='g', Prefix='-', Delimiter=' '),
        '-m': FlagParameter(Name='m', Prefix='-'),
        '-s': FlagParameter(Name='s', Prefix='-', Value=True),
    }

    def _input_as_string(self, filename):
        """Return the model name and the input argument, space separated.

        The model name is ``filename`` with '.cm' appended; the second part
        is whatever the parent class' ``_input_as_string`` returns for
        ``filename``.
        """
        model_name = filename + '.cm'
        sequence_arg = super(Coves, self)._input_as_string(filename)
        return ' '.join((model_name, sequence_arg))
Example #2
0
class Contrafold(CommandLineApplication):
    """Application controller for CONTRAfold v1.0.

    Declares two prefix-less flags, 'predict' and 'train'; 'predict' is
    created with Value=True.
    """

    _command = 'contrafold'
    _input_handler = '_input_as_string'

    _parameters = {
        'predict': FlagParameter(Name='predict', Prefix='', Value=True),
        'train': FlagParameter(Name='train', Prefix=''),
    }
Example #3
0
class Sfffile(CommandLineApplication):
    """Simple application controller for the sfffile program."""

    _command = 'sfffile'
    _input_handler = '_input_as_path'

    _options = {
        # output filepath
        '-o': ValuedParameter('-', 'o', Delimiter=' '),
        # file of accession numbers to be included
        '-i': ValuedParameter('-', 'i', Delimiter=' '),
        # file of accession numbers to be excluded
        '-e': ValuedParameter('-', 'e', Delimiter=' '),
        # file of custom trim points
        '-t': ValuedParameter('-', 't', Delimiter=' '),
        # number of cycles in output sff
        '-c': ValuedParameter('-', 'c', Delimiter=' '),
        # shortcut for -c 42
        '-gs20': FlagParameter('-', 'gs20'),
        # shortcut for -c 100
        '-gsflx': FlagParameter('-', 'gsflx'),
        # split multiplexed reads
        '-s': ValuedParameter('-', 's', Delimiter=' '),
        # custom MID configuration file
        '-mcf': ValuedParameter('-', 'mcf', Delimiter=' '),
        # prevent propagation of sff index
        '-nmft': FlagParameter('-', 'nmft'),
    }
    # _parameters is a separate dict holding the same entries as _options
    _parameters = dict(_options)

    def _get_result_paths(self, data):
        """Return a dict with the resulting SFF file under the key 'sff'.

        The path is the value of '-o' when that parameter is on, otherwise
        the literal '454Reads.sff'.

        Portability note carried over from the original author: because
        cogent.app.util.CommandLineAppResult opens output files in text
        mode, this may not be portable for Windows users. A more portable
        approach is to skip the app controller results and give the output
        SFF filepath explicitly via '-o'.
        """
        out_param = self.Parameters['-o']
        sff_path = out_param.Value if out_param.isOn() else '454Reads.sff'
        return {'sff': ResultPath(sff_path)}

    def _accept_exit_status(self, exit_status):
        """Return True only when sfffile exited with status 0."""
        return exit_status == 0
Example #4
0
    def test_ne(self):
        """FlagParameter: ne functions as expected """
        base = FlagParameter(Prefix='-', Name='a', Value=True)
        twin = FlagParameter(Prefix='-', Name='a', Value=True)
        unset = FlagParameter(Prefix='-', Name='a')
        other_name = FlagParameter(Prefix='-', Name='i', Value=True)
        other_prefix = FlagParameter(Prefix='--', Name='a', Value=True)

        # two parameters built from identical arguments are not unequal
        assert not base != twin

        # each of these pairs differs in Name, Prefix and/or Value
        differing_pairs = (
            (base, unset),
            (base, other_name),
            (base, other_prefix),
            (unset, other_name),
            (unset, other_prefix),
            (other_name, other_prefix),
        )
        for left, right in differing_pairs:
            assert left != right
Example #5
0
class foldalign(CommandLineApplication):
    """Application controller for the foldalign RNA secondary structure
    prediction application.
    """

    _command = 'foldalign'
    _input_handler = '_input_as_string'

    _parameters = {
        '-max_length':
        ValuedParameter(Name='max_length', Prefix='-', Delimiter=' '),
        '-max_diff':
        ValuedParameter(Name='max_diff', Prefix='-', Delimiter=' '),
        '-score_matrix':
        ValuedParameter(Name='score_matrix', Prefix='-', Delimiter=' '),
        '-format':
        ValuedParameter(Name='format', Prefix='-', Delimiter=' '),
        '-plot_score':
        FlagParameter(Name='plot_score', Prefix='-'),
        '-global':
        FlagParameter(Name='global', Prefix='-'),
        '-summary':
        FlagParameter(Name='summary', Prefix='-'),
    }
Example #6
0
class RNAsubopt(CommandLineApplication):
    """Application controller for RNAsubopt (in the Vienna RNA package)

    Manual on:
    http://www.tbi.univie.ac.at/~ivo/RNA/RNAsubopt.html
    http://bioweb.pasteur.fr/docs/man/man/RNAsubopt.1.html

    Parameters with default values:
        -e: 1 (range)
        -T: 37 (temperature)
        -d: 2 (dangling ends as in partition function folding)

    The default input handler is '_input_as_lines'; both handlers defined
    here prefix the parent handler's result with '<' so the application
    reads it on stdin.
    StdErr is suppressed by default (_suppress_stderr = True), but this can
    be overridden in an instance.
    """
    _command = 'RNAsubopt'
    _input_handler = '_input_as_lines'
    _suppress_stderr = True

    _parameters = {
        '-p': ValuedParameter(Name='p', Prefix='-', Delimiter=' '),
        '-C': FlagParameter(Name='C', Prefix='-'),
        '-e': ValuedParameter(Name='e', Prefix='-', Delimiter=' ', Value=1),
        '-ep': ValuedParameter(Name='ep', Prefix='-', Delimiter=' '),
        '-s': FlagParameter(Name='s', Prefix='-'),
        '-lodos': FlagParameter(Name='lodos', Prefix='-'),
        '-T': ValuedParameter(Name='T', Prefix='-', Delimiter=' ', Value=37),
        '-4': FlagParameter(Name=4, Prefix='-'),
        '-d': MixedParameter(Name='d', Prefix='-', Delimiter='', Value=2),
        '-noGU': FlagParameter(Name='noGU', Prefix='-'),
        '-noCloseGU': FlagParameter(Name='noCloseGU', Prefix='-'),
        '-P': ValuedParameter(Name='P', Prefix='-', Delimiter=' '),
        '-logML': FlagParameter(Name='logML', Prefix='-'),
        '-nsp': ValuedParameter(Name='nsp', Prefix='-', Delimiter=' '),
        '-noLP': FlagParameter(Name='noLP', Prefix='-'),
    }

    # human-readable aliases for some of the parameters above
    _synonyms = {
        'Temperature': '-T',
        'Temp': '-T',
        'EnergyRange': '-e',
        'Sort': '-s',
    }

    def _input_as_path(self, filename):
        """Return '<' followed by the parent handler's path, as a string.

        The parent result is passed through str() before the '<' is
        prepended.
        NOTE(review): the original docstring said the result includes
        quotes to handle file names containing spaces; that would come
        from the parent handler's return type, which is not visible here.
        """
        redirected = str(super(RNAsubopt, self)._input_as_path(filename))
        return '<' + redirected

    def _input_as_lines(self, data):
        """Return '<' followed by the parent handler's result, as a string.

        The parent result is passed through str() before the '<' is
        prepended.
        NOTE(review): presumably the parent writes ``data`` to a temporary
        file and returns its (quoted) name - confirm against the base class.
        """
        redirected = str(super(RNAsubopt, self)._input_as_lines(data))
        return '<' + redirected
Example #7
0
 def test_init_defaults(self):
     """FlagParameter: init functions as expected with default values"""
     flag = FlagParameter(Prefix='-', Name='a')
     # attribute name -> value expected when only Name and Prefix are given
     expectations = (
         ('Name', 'a'),
         ('Prefix', '-'),
         ('Value', False),
         ('Delimiter', None),
         ('Quote', None),
         ('Id', '-a'),
     )
     for attribute, expected in expectations:
         self.assertEqual(getattr(flag, attribute), expected)
Example #8
0
 def test_init(self):
     """FlagParameter: init functions as expected """
     flag = FlagParameter(Prefix='-', Name='a', Value=42)
     # attribute name -> value expected after explicit initialisation
     expectations = (
         ('Name', 'a'),
         ('Prefix', '-'),
         ('Value', 42),
         ('Delimiter', None),
         ('Quote', None),
         ('Id', '-a'),
     )
     for attribute, expected in expectations:
         self.assertEqual(getattr(flag, attribute), expected)
Example #9
0
class Stride(CommandLineApplication):
    """Application controller for stride."""

    _command = "stride"
    _input_handler = '_input_as_multiline_string'

    _parameters = {
        # report hydrogen bonds
        '-h': FlagParameter(Name='h', Prefix='-', Value=False),
        # calculate contact order
        '-k': FlagParameter(Name='k', Prefix='-', Value=False),
        # print out the contact map only
        '-w': FlagParameter(Name='w', Prefix='-', Value=False),
        # write output as file
        '-f': ValuedParameter(Name='f', Prefix='-', Delimiter=''),
        # write output as molscript file
        '-m': ValuedParameter(Name='m', Prefix='-', Delimiter=''),
    }

    def _input_as_multiline_string(self, data):
        """Serialise ``data`` with PDBWriter and hand the text to the parent.

        ``data`` is written into an in-memory buffer by PDBWriter; the
        buffer contents are then passed to the parent class'
        ``_input_as_multiline_string``. This allows entities to be fed to
        stride.
        """
        buffer_ = StringIO()
        PDBWriter(buffer_, data)
        buffer_.seek(0)  # rewind so read() returns everything written
        pdb_text = buffer_.read()
        return super(Stride, self)._input_as_multiline_string(pdb_text)

    def _align_out_filename(self):
        """Return the absolute '-f' output filename, or None if '-f' is off."""
        out_param = self.Parameters['-f']
        if not out_param.isOn():
            return None
        return self._absolute(str(out_param.Value))

    def _get_result_paths(self, data):
        """Return {'File': ResultPath} when '-f' is on, else an empty dict."""
        if not self.Parameters['-f'].isOn():
            return {}
        out_name = self._align_out_filename()
        return {'File': ResultPath(Path=out_name, IsWritten=True)}
Example #10
0
 def setUp(self):
     """Build one parameter of each kind and three Parameters objects."""
     self.fp = FlagParameter(Name='d', Prefix='-')
     self.vp = ValuedParameter(Prefix='-', Name='p', Value=[1])
     self.mp = MixedParameter(Name='k', Prefix='--', Delimiter=' ')
     # map every parameter by its Id
     self.all_params = dict(
         (param.Id, param) for param in (self.fp, self.vp, self.mp))
     self._synonyms = {'Pino': '-p', 'K': 'k'}
     # p1: empty, p2: parameters only, p3: parameters plus synonyms
     self.p1 = Parameters()
     self.p2 = Parameters(self.all_params)
     self.p3 = Parameters(self.all_params, self._synonyms)
Example #11
0
class Covee(CommandLineApplication):
    """Application controller for Covee.

    Covee emits a consensus structure prediction for the family.

    Supported options:
     -a        : annotate all pairs, not just canonical ones
     -b        : emit single most probable sequence (created with
                 Value=True here)
     -l        : print as mountain landscape
     -s <seed> : set seed for random()
    EXPERIMENTAL OPTIONS:
     -L        : calculate expected length distributions for states
    """

    _command = 'covee'
    _input_handler = '_input_as_string'

    _parameters = {
        '-a': FlagParameter(Name='a', Prefix='-'),
        '-b': FlagParameter(Name='b', Prefix='-', Value=True),
        '-l': FlagParameter(Name='l', Prefix='-'),
        '-s': ValuedParameter(Name='s', Prefix='-', Delimiter=' '),
        '-L': FlagParameter(Name='L', Prefix='-'),
    }

    def _input_as_string(self, filename):
        """Return the model name and the input argument, space separated.

        The model name is ``filename`` with '.cm' appended; the second part
        is the parent class' ``_input_as_string`` result for ``filename``.
        """
        model_name = filename + '.cm'
        sequence_arg = super(Covee, self)._input_as_string(filename)
        return ' '.join((model_name, sequence_arg))

    def _input_as_lines(self, data):
        """Return ``data + '.cm'`` directly followed by the parent's result.

        NOTE(review): unlike ``_input_as_string`` the two parts are joined
        without a separator, and ``data + '.cm'`` only works when ``data``
        supports concatenation with a string - confirm this is intended.
        """
        model_name = data + '.cm'
        lines_arg = super(Covee, self)._input_as_lines(data)
        return ''.join((model_name, lines_arg))
Example #12
0
class Covea(CommandLineApplication):
    """Application controller for Covea.

    Options listed by the original author:
     -a             : annotate all base pairs, not just canonical ones
     -h             : print short help and version info
     -o <outfile>   : write alignment to <outfile> in SELEX format
     -s <scorefile> : save individual alignment scores to <scorefile>

     Experimental options:
     -S             : use small-memory variant of alignment algorithm

    Of these, -a, -o, -s and -S are declared in ``_parameters``.
    """

    _command = 'covea'
    _input_handler = '_input_as_string'

    _parameters = {
        '-a': FlagParameter(Name='a', Prefix='-'),
        '-o': ValuedParameter(Name='o', Prefix='-', Delimiter=' '),
        '-s': ValuedParameter(Name='s', Prefix='-', Delimiter=' '),
        '-S': FlagParameter(Name='S', Prefix='-'),
    }

    def _input_as_string(self, filename):
        """Return the model name and the input argument, space separated.

        The model name is ``filename`` with '.cm' appended; the second part
        is the parent class' ``_input_as_string`` result for ``filename``.
        """
        model_name = filename + '.cm'
        sequence_arg = super(Covea, self)._input_as_string(filename)
        return ' '.join((model_name, sequence_arg))

    def _input_as_lines(self, data):
        """Return ``data + '.cm'`` directly followed by the parent's result.

        NOTE(review): unlike ``_input_as_string`` the two parts are joined
        without a separator, and ``data + '.cm'`` only works when ``data``
        supports concatenation with a string - confirm this is intended.
        """
        model_name = data + '.cm'
        lines_arg = super(Covea, self)._input_as_lines(data)
        return ''.join((model_name, lines_arg))
Example #13
0
class PknotsRG(CommandLineApplication):
    """Application controller for PknotsRG v1.2 application

    Input: plain sequence

    pknotsRG is a tool for thermodynamic folding of RNA secondary
    structures, including the class of canonical simple recursive
    pseudoknots.

    Options:
    -m         Use mfe strategy
    -f         Use enf strategy
    -l         Use loc strategy
    -s         Show suboptimals
    -u         no dangling bases (implies -s)
    -o         no suboptimals inside pknots (implies -s -l)
    -e <value> Set energy range for suboptimals (kcal/mole)
    -c <value> Set energy range for suboptimals (%) [10]
    -n <value> Set npp-value [0.3]
    -p <value> Set pkinit-value [9]
    -k <value> Set maximal pknot-length
    """
    _command = 'pknotsRG-1.2-i386-linux-static'
    _input_handler = '_input_as_string'

    _parameters = {
        '-m': FlagParameter(Name='m', Prefix='-'),
        '-f': FlagParameter(Name='f', Prefix='-'),
        '-l': FlagParameter(Name='l', Prefix='-'),
        '-s': FlagParameter(Name='s', Prefix='-'),
        '-u': FlagParameter(Name='u', Prefix='-'),
        '-o': FlagParameter(Name='o', Prefix='-'),
        '-e': ValuedParameter(Name='e', Prefix='-', Delimiter=' '),
        '-c': ValuedParameter(Name='c', Prefix='-', Delimiter=' '),
        '-n': ValuedParameter(Name='n', Prefix='-', Delimiter=' '),
        '-p': ValuedParameter(Name='p', Prefix='-', Delimiter=' '),
        '-k': ValuedParameter(Name='k', Prefix='-', Delimiter=' '),
    }

    def _input_as_string(self, filename):
        """Return '<' + the parent handler's result, redirecting to stdin."""
        target = super(PknotsRG, self)._input_as_string(filename)
        return ''.join(('<', target))

    def _input_as_lines(self, data):
        """Return '<' + the parent handler's result, redirecting to stdin."""
        target = super(PknotsRG, self)._input_as_lines(data)
        return ''.join(('<', target))
Example #14
0
class RNAfold(CommandLineApplication):
    """Application controller for RNAfold (in the Vienna RNA package)

    Manual on:
    http://www.tbi.univie.ac.at/~ivo/RNA/RNAfold.html
    http://bioweb.pasteur.fr/docs/man/man/RNAfold.1.html

    Parameters with default values:
        -T: 37 (temperature)
        -d: 1 (only unpaired bases in dangling ends)
        -S: 1.07 (scale)

    The default input handler is '_input_as_lines'; both handlers defined
    here prefix the parent handler's result with '<' so the application
    reads it on stdin.
    StdErr is suppressed by default, but can be overruled in an instance.
    """
    _parameters = {
        '-p': MixedParameter(Prefix='-', Name='p', Delimiter='', Value=False),
        '-C': FlagParameter(Prefix='-', Name='C'),
        '-T': ValuedParameter(Prefix='-', Name='T', Value=37, Delimiter=' '),
        '-4': FlagParameter(Prefix='-', Name=4),
        '-d': MixedParameter(Prefix='-', Name='d', Delimiter='', Value=1),
        '-noLP': FlagParameter(Prefix='-', Name='noLP'),
        '-noGU': FlagParameter(Prefix='-', Name='noGU'),
        '-noCloseGU': FlagParameter(Prefix='-', Name='noCloseGU'),
        '-e': ValuedParameter(Prefix='-', Name='e', Delimiter=' '),
        '-P': ValuedParameter(Prefix='-', Name='P', Delimiter=' '),
        '-nsp': ValuedParameter(Prefix='-', Name='nsp', Delimiter=' '),
        '-S': ValuedParameter(Prefix='-', Name='S', Value=1.07,
                              Delimiter=' '),
    }
    _synonyms = {'Temperature': '-T', 'Temp': '-T', 'Scale': '-S'}
    _command = 'RNAfold'
    _input_handler = '_input_as_lines'
    _suppress_stderr = True

    def _input_as_path(self, filename):
        """Return '<' + str(parent result) to redirect the file to stdin."""
        return ''.join(
            ['<', str(super(RNAfold, self)._input_as_path(filename))])

    def _input_as_lines(self, data):
        """Return '<' + str(parent result) to redirect the input to stdin."""
        return ''.join(
            ['<', str(super(RNAfold, self)._input_as_lines(data))])

    def _get_result_paths(self, data):
        """Specifies the paths of output files generated by the application

        data: the data the instance of the application is called on; either
            a list of lines or, if it is not a list, a path to a file whose
            lines are read.

        Returns a dict mapping result names to ResultPath objects.

        RNAfold can produce two additional output files:
            a secondary structure graph. Default name: rna.ps (key 'SS')
            a dot plot of the base pairing matrix. Default name: dot.ps
            (key 'DP'); only marked as written when '-p' is on.
        The default names are used for unnamed sequences, i.e. when there
            are more non-header lines than '>' header lines. Paths are
            built by prefixing self.WorkingDir.
        You can make a sequence named by inserting a line '>name' above it
            in your input file (or list of sequences). The ss and dp files
            for named sequences are registered as name_ss.ps and
            name_dp.ps under the keys name + '_ss' and name + '_dp'.
        """
        if not isinstance(data, list):
            # data is a file path: read it and close the handle right away
            # instead of leaving that to the garbage collector
            with open(data) as infile:
                data = infile.readlines()

        dot_plot_on = self.Parameters['-p'].isOn()
        result = {}
        name_counter = 0
        seq_counter = 0
        for item in data:
            if item.startswith('>'):
                name_counter += 1
                name = item.strip('>\n')
                result[name + '_ss'] = ResultPath(
                    Path=self.WorkingDir + name + '_ss.ps')
                result[name + '_dp'] = ResultPath(
                    Path=self.WorkingDir + name + '_dp.ps',
                    IsWritten=dot_plot_on)
            else:
                # every line that is not a header counts as a sequence line
                seq_counter += 1

        # the default-named files only appear for unnamed sequences
        has_unnamed = seq_counter - name_counter > 0
        # Secondary Structure
        result['SS'] = ResultPath(Path=self.WorkingDir + 'rna.ps',
                                  IsWritten=has_unnamed)
        # DotPlot
        result['DP'] = ResultPath(Path=self.WorkingDir + 'dot.ps',
                                  IsWritten=(dot_plot_on and has_unnamed))
        return result
Example #15
0
class Blat(CommandLineApplication):
    """BLAT generic application controller.

    The database, query and output paths are positional arguments. They are
    supplied as a three-item list (see ``_input_as_list``), stored on the
    instance, and spliced into the command line by ``_get_base_command``.
    """

    _command = 'blat'
    _input_handler = "_input_as_list"

    # values accepted for -t, -q, -mask/-qMask/-repeats and -out
    _database_types = ['dna', 'prot', 'dnax']
    _query_types = ['dna', 'rna', 'prot', 'dnax', 'rnax']
    _mask_types = ['lower', 'upper', 'out', 'file.out']
    _out_types = [
        'psl', 'pslx', 'axt', 'maf', 'sim4', 'wublast', 'blast', 'blast8',
        'blast9'
    ]
    # allowed (database type, query type) pairs
    _valid_combinations = [('dna', 'dna'), ('dna', 'rna'), ('prot', 'prot'),
                           ('dnax', 'prot'), ('dnax', 'dnax'),
                           ('dnax', 'rnax')]
    # positional arguments, filled in by _input_as_list
    _database = None
    _query = None
    _output = None

    _parameters = {
        # database type (dna, prot, or dnax, where dnax is DNA sequence
        # translated in six frames to protein
        '-t':
        ValuedParameter('-', Delimiter='=', Name='t'),

        # query type (dna, rna, prot, dnax, rnax, where rnax is DNA sequence
        # translated in three frames to protein
        '-q':
        ValuedParameter('-', Delimiter='=', Name='q'),

        # Use overused tile file N.ooc, and N should correspond to the tileSize
        '-ooc':
        ValuedParameter('-', Delimiter='=', Name='ooc', IsPath=True),

        # Sets the size of a match that triggers an alignment
        '-tileSize':
        ValuedParameter('-', Delimiter='=', Name='tileSize'),

        # Spacing between tiles.
        '-stepSize':
        ValuedParameter('-', Delimiter='=', Name='stepSize'),

        # If set to 1, allows one mismatch in the tile and still triggers
        # an alignment.
        '-oneOff':
        ValuedParameter('-', Delimiter='=', Name='oneOff'),

        # sets the number of tile matches
        '-minMatch':
        ValuedParameter('-', Delimiter='=', Name='minMatch'),

        # sets the minimum score
        '-minScore':
        ValuedParameter('-', Delimiter='=', Name='minScore'),

        # sets the minimum sequence identity in percent
        '-minIdentity':
        ValuedParameter('-', Delimiter='=', Name='minIdentity'),

        # sets the size of the maximum gap between tiles in a clump
        '-maxGap':
        ValuedParameter('-', Delimiter='=', Name='maxGap'),

        # make an overused tile file. Target needs to be complete genome.
        '-makeOoc':
        ValuedParameter('-', Delimiter='=', Name='makeOoc', IsPath=True),

        # sets the number of repetitions of a tile allowed before it is marked
        # as overused
        '-repMatch':
        ValuedParameter('-', Delimiter='=', Name='repMatch'),

        # mask out repeats.  Alignments won't be started in masked region but
        # may extend through it in nucleotide searches.  Masked areas are
        # ignored entirely in protein or translated searches.  Types are:
        # lower, upper, out, file.out (file.out - mask database according to
        # RepeatMasker file.out
        '-mask':
        ValuedParameter('-', Delimiter='=', Name='mask'),

        # Mask out repeats in query sequence.  similar to -mask but for query
        # rather than target sequence
        '-qMask':
        ValuedParameter('-', Delimiter='=', Name='qMask'),

        # repeat bases will not be masked in any way, but matches in
        # repeat areas will be reported separately from matches in other
        # areas in the psl output
        '-repeats':
        ValuedParameter('-', Delimiter='=', Name='repeats'),

        # minimum percent divergence of repeats to allow them to be unmasked
        '-minRepDivergence':
        ValuedParameter('-', Delimiter='=', Name='minRepDivergence'),

        # output dot every N sequences to show program's progress
        '-dots':
        ValuedParameter('-', Delimiter='=', Name='dots'),

        # controls output file format.  One of:
        # psl - Default.  Tab separated format, no sequence
        # pslx - Tab separated format with sequence
        # axt - blastz-associated axt format
        # maf - multiz-associated maf format
        # sim4 - similar to sim4 format
        # wublast - similar to wublast format
        # blast - similar to NCBI blast format
        # blast8- NCBI blast tabular format
        # blast9 - NCBI blast tabular format with comments
        '-out':
        ValuedParameter('-', Delimiter='=', Name='out'),

        # sets maximum intron size
        '-maxIntron':
        ValuedParameter('-', Delimiter='=', Name='maxIntron'),

        # suppress column headers in psl output
        '-noHead':
        FlagParameter('-', Name='noHead'),

        # trim leading poly-T
        '-trimT':
        FlagParameter('-', Name='trimT'),

        # do not trim trailing poly-A
        '-noTrimA':
        FlagParameter('-', Name='noTrimA'),

        # Remove poly-A tail from qSize as well as alignments in psl output
        '-trimHardA':
        FlagParameter('-', Name='trimHardA'),

        # run for fast DNA/DNA remapping - not allowing introns,
        # requiring high %ID
        '-fastMap':
        FlagParameter('-', Name='fastMap'),

        # for high quality mRNAs, look harder for small initial and terminal
        # exons
        '-fine':
        FlagParameter('-', Name='fine'),

        # Allows extension of alignment through large blocks of N's
        '-extendThroughN':
        FlagParameter('-', Name='extendThroughN')
    }

    def _get_result_paths(self, data):
        """Return {'output': ResultPath} for the third item of ``data``.

        ``data`` is the same list given to ``_input_as_list``; its third
        element is the output file path.
        """
        return {'output': ResultPath(data[2], IsWritten=True)}

    def _get_base_command(self):
        """Build the command that is run when the app controller is called.

        The command is: ``cd <WorkingDir>;`` the command name, the database
        and query positional arguments, the sorted non-empty parameter
        strings and, when set, the output path. Empty parts are dropped and
        the rest joined with ``self._command_delimiter``.

        Raises ApplicationError if ``_command`` is None.
        """
        cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
        if self._command is None:
            raise ApplicationError('_command has not been set.')
        # parameters that are off render as an empty string and are skipped
        parameters = sorted(
            [str(x) for x in self.Parameters.values() if str(x)])

        command_parts = [
            cd_command,
            self._command,
            self._database,  # Positional argument
            self._query,  # Positional argument
        ]
        command_parts += parameters
        if self._output:
            command_parts.append(self._output.Path)  # Positional

        return (self._command_delimiter.join(filter(None,
                                                    command_parts)).strip())

    BaseCommand = property(_get_base_command)

    def _check_allowed_value(self, flag, allowed, label,
                             allowed_label='Allowed values'):
        """Raise ApplicationError if ``flag`` is on with a value not allowed.

        flag: key into self.Parameters, e.g. '-mask'
        allowed: list of accepted string values
        label: noun used in the message ("Invalid <label> type ...")
        allowed_label: heading placed before the list of accepted values
        """
        param = self.Parameters[flag]
        if param.isOn() and param.Value not in allowed:
            raise ApplicationError(
                "Invalid %s type %s\n%s: %s\n" %
                (label, param.Value, allowed_label, ', '.join(allowed)))

    def _input_as_list(self, data):
        '''Takes the positional arguments as input in a list.

        The list input here should be [query_file_path, database_file_path,
        output_file_path]. All three must be absolute paths. They are
        stored on the instance for use by ``_get_base_command``.

        After storing the paths, the parameters that only accept a fixed
        set of values (-t, -q, -mask, -qMask, -repeats, -out) and the
        database/query type combination are validated.

        Returns an empty string: the positional arguments are placed on the
        command line by ``_get_base_command`` rather than appended as input.

        Raises ApplicationError for a relative path or an invalid value.
        '''
        query, database, output = data
        if (not isabs(database)) \
                or (not isabs(query)) \
                or (not isabs(output)):
            raise ApplicationError("Only absolute paths allowed.\n%s" %
                                   ', '.join(data))

        self._database = FilePath(database)
        self._query = FilePath(query)
        self._output = ResultPath(output, IsWritten=True)

        # check combination of database and query type
        db_type = self.Parameters['-t']
        query_type = self.Parameters['-q']
        if db_type.isOn() and query_type.isOn() and \
                (db_type.Value, query_type.Value) not in \
                self._valid_combinations:
            raise ApplicationError(
                "Invalid combination of database and query "
                "types ('%s', '%s').\n"
                "Must be one of: %s\n" %
                (db_type.Value, query_type.Value,
                 repr(self._valid_combinations)))

        # check parameters that can only take a particular set of values;
        # the order matches the order in which errors were reported before
        self._check_allowed_value('-t', self._database_types, 'database')
        self._check_allowed_value('-q', self._query_types, 'query')
        # the mask message has always used a capital 'V' in its heading
        self._check_allowed_value('-mask', self._mask_types, 'mask',
                                  allowed_label='Allowed Values')
        self._check_allowed_value('-qMask', self._mask_types, 'qMask')
        self._check_allowed_value('-repeats', self._mask_types, 'repeat')
        self._check_allowed_value('-out', self._out_types, 'output')

        return ''
Example #16
0
class MpiBlast(Blast):
    """mpiblast application controller - Prototype """

    _mpi_options = {
        # Produces verbose debugging output for each node, optionally logs the
        # output to a file
        '--debug':
        ValuedParameter('--', Name='debug', Delimiter='='),

        # Set the scheduler process' MPI Rank (default is 1). Because the
        # scheduler uses very little CPU it can be useful to force the
        # scheduler to run on the same physical machine as the writer (rank 0).
        '--scheduler-rank':
        ValuedParameter('--', Name='scheduler-rank', Delimiter='='),

        # Print the Altschul. et. al. 1997 paper reference instead of the
        # mpiBLAST paper reference. With this option mpiblast output is nearly
        # identical to NCBI-BLAST output.
        '--altschul-reference':
        FlagParameter(Prefix='--', Name='altschul-reference'),

        # Removes the local copy of the database from each node before
        # terminating execution
        '--removedb':
        FlagParameter(Prefix='--', Name='removedb'),

        # Sets the method of copying files that each worker will use.
        #  Default = "cp"
        # * cp : use standard file system "cp" command.
        #        Additional option is --concurrent.
        # * rcp : use rsh "rcp" command. Additional option is --concurrent.
        # * scp : use ssh "scp" command. Additional option is --concurrent.
        # * mpi : use MPI_Send/MPI_Recv to copy files.
        #         Additional option is --mpi-size.
        # * none : do not copy files,instead use shared storage as local storage
        '--copy-via':
        ValuedParameter('--', Name='copy-via', Delimiter='='),

        # set the number of concurrent accesses to shared storage. Default = 1
        '--concurrent':
        ValuedParameter('--', Name='concurrent', Delimiter='='),

        # in bytes, set the maximum buffer size that MPI will use to send data
        # when transferring files. Default = 65536
        '--mpi-size':
        ValuedParameter('--', Name='mpi-size', Delimiter='='),

        # set whether file locking should be used to manage local fragment
        # lists. Defaults to off. When --concurrency > 1 defaults to on
        # [on|off]
        '--lock':
        ValuedParameter('--', Name='lock', Delimiter='='),

        # When set, the writer will use the database on shared storage for
        # sequence lookup. Can drastically reduce overhead for some blastn
        # searches.
        '--disable-mpi-db':
        FlagParameter(Prefix='--', Name='disable-mpi-db'),

        # Under unix, sets the nice value for each mpiblast process.
        '--nice':
        ValuedParameter('--', Name='nice', Delimiter='='),

        # configuration file handed to mpiblast (set from the config_file
        # argument of __init__)
        '--config-file':
        ValuedParameter('--', Name='config-file', Delimiter='='),

        # Experimental. When set, mpiblast will read the output file and
        # attempt to continue a previously aborted run where it left off
        '--resume-run':
        FlagParameter(Prefix='--', Name='resume-run'),

        # print the mpiBLAST version
        '--version':
        FlagParameter(Prefix='--', Name='version'),
    }

    # the regular blastall options are accepted as well
    _mpi_options.update(BLASTALL_OPTIONS)

    def __init__(self,
                 blast_mat_root=None,
                 params=None,
                 mpiblast_root="/usr/local/bin/",
                 local_root="/var/scratch/mpiblastdata/",
                 shared_root="/quicksand/hamady/data/blast/mpidb/",
                 config_file="/quicksand2/downloads2/mpiblast/mpiblast.conf",
                 num_db_frags=40,
                 InputHandler=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 WorkingDir=None,
                 HALT_EXEC=False):
        """Initialize mpiblast.

        params: optional mapping of parameter ids to values; it is copied,
            never modified in place.
        mpiblast_root: directory prefix placed directly before 'mpiblast'
            in the command.
        local_root, shared_root: exported as the Local and Shared
            environment variables via extra_env.
        config_file: when truthy, passed on as the '--config-file'
            parameter.
        num_db_frags: mpirun is started with num_db_frags + 2 processes.

        The remaining arguments are forwarded unchanged to Blast.__init__.
        """
        if config_file:
            # params defaults to None, so build a fresh dict before adding
            # the config file; copying also leaves the caller's dict intact
            params = dict(params) if params is not None else {}
            params["--config-file"] = config_file
        super(MpiBlast, self).__init__(
            self._mpi_options,
            "mpirun -np %d %smpiblast" % ((num_db_frags + 2), mpiblast_root),
            blast_mat_root=blast_mat_root,
            extra_env="export Local=%s; export Shared=%s;" %
            (local_root, shared_root),
            params=params,
            InputHandler=InputHandler,
            SuppressStderr=SuppressStderr,
            SuppressStdout=SuppressStdout,
            WorkingDir=WorkingDir,
            HALT_EXEC=HALT_EXEC)
Example #17
0
class Dialign(CommandLineApplication):
    """Application controller for the ``dialign2-2`` command.

    Every supported command-line switch is declared in ``_options``; the
    comment above each entry reproduces the program's own description.
    """

    _options = {
        # -afc: create an additional output file "*.afc" containing data of
        # all fragments considered for alignment.
        # WARNING: this file can be HUGE!
        '-afc': FlagParameter('-', Name='afc'),

        # -afc_v: like "-afc" but verbose, fragments are explicitly printed.
        # WARNING: this file can be EVEN BIGGER!
        '-afc_v': FlagParameter('-', Name='afc_v'),

        # -anc: anchored alignment. Requires a file <seq_file>.anc
        # containing anchor points.
        '-anc': FlagParameter('-', Name='anc'),

        # -cs: if segments are translated, not only the `Watson strand'
        # but also the `Crick strand' is looked at.
        '-cs': FlagParameter('-', Name='cs'),

        # -cw: additional output file in CLUSTAL W format.
        '-cw': FlagParameter('-', Name='cw'),

        # -ds: `dna alignment speed up' - non-translated nucleic acid
        # fragments are taken into account only if they start with at least
        # two matches. Speeds up DNA alignment at the expense of sensitivity.
        '-ds': FlagParameter('-', Name='ds'),

        # -fa: additional output file in FASTA format.
        '-fa': FlagParameter('-', Name='fa'),

        # -ff: create file *.frg containing information about all fragments
        # that are part of the respective optimal pairwise alignments, plus
        # information about consistency in the multiple alignment.
        '-ff': FlagParameter('-', Name='ff'),

        # -fn <out_file>: output files are named <out_file>.<extension>.
        '-fn': ValuedParameter('-', Name='fn', Delimiter=' ', IsPath=True),

        # -fop: create file *.fop containing coordinates of all fragments
        # that are part of the respective pairwise alignments.
        '-fop': FlagParameter('-', Name='fop'),

        # -fsm: create file *.fsm containing coordinates of all fragments
        # that are part of the final alignment.
        '-fsm': FlagParameter('-', Name='fsm'),

        # -iw: overlap weights switched off (by default, overlap weights are
        # used if up to 35 sequences are aligned). This option speeds up the
        # alignment but may lead to reduced alignment quality.
        '-iw': FlagParameter('-', Name='iw'),

        # -lgs: `long genomic sequences' - combines the following options:
        # -ma, -thr 2, -lmax 30, -smin 8, -nta, -ff, -fop, -ff, -cs, -ds, -pst
        '-lgs': FlagParameter('-', Name='lgs'),

        # -lgs_t: like "-lgs" but with all segment pairs assessed at the
        # peptide level (rather than 'mixed alignments' as with the "-lgs"
        # option). Therefore faster than -lgs but not very sensitive for
        # non-coding regions.
        '-lgs_t': FlagParameter('-', Name='lgs_t'),

        # -lmax <x>: maximum fragment length = x (default: x = 40, or
        # x = 120 for `translated' fragments). Shorter x speeds up the
        # program but may affect alignment quality.
        '-lmax': ValuedParameter('-', Name='lmax', Delimiter=' '),

        # -lo: (Long Output) additional file *.log with information about
        # fragments selected for pairwise alignment and about consistency in
        # the multi-alignment procedure.
        '-lo': FlagParameter('-', Name='lo'),

        # -ma: `mixed alignments' consisting of P-fragments and N-fragments
        # if nucleic acid sequences are aligned.
        '-ma': FlagParameter('-', Name='ma'),

        # -mask: residues not belonging to selected fragments are replaced by
        # `*' characters in the output alignment (rather than being printed
        # in lower-case characters).
        '-mask': FlagParameter('-', Name='mask'),

        # -mat: create file *mat with substitution counts derived from the
        # fragments that have been selected for alignment.
        '-mat': FlagParameter('-', Name='mat'),

        # -mat_thr <t>: like "-mat" but only fragments with weight score > t
        # are considered.
        '-mat_thr': ValuedParameter('-', Name='mat_thr', Delimiter=' '),

        # -max_link: "maximum linkage" clustering used to construct the
        # sequence tree (instead of UPGMA).
        '-max_link': FlagParameter('-', Name='max_link'),

        # -min_link: "minimum linkage" clustering used.
        '-min_link': FlagParameter('-', Name='min_link'),

        # -mot: "motif" option.
        '-mot': FlagParameter('-', Name='mot'),

        # -msf: separate output file in MSF format.
        '-msf': FlagParameter('-', Name='msf'),

        # -n: input sequences are nucleic acid sequences. No translation of
        # fragments.
        '-n': FlagParameter('-', Name='n'),

        # -nt: input sequences are nucleic acid sequences and `nucleic acid
        # segments' are translated to `peptide segments'.
        '-nt': FlagParameter('-', Name='nt'),

        # -nta: `no textual alignment' - textual alignment suppressed. This
        # option makes sense if other output files are of interest, e.g. the
        # fragment files created with -ff, -fop, -fsm or -lo.
        '-nta': FlagParameter('-', Name='nta'),

        # -o: fast version, resulting alignments may be slightly different.
        '-o': FlagParameter('-', Name='o'),

        # -ow: overlap weights enforced (by default, overlap weights are used
        # only if up to 35 sequences are aligned since calculating overlap
        # weights is time consuming). Warning: overlap weights generally
        # improve alignment quality but the running time increases in the
        # order O(n^4) with the number of sequences. This is why, by default,
        # overlap weights are used only for sequence sets with < 35 sequences.
        '-ow': FlagParameter('-', Name='ow'),

        # -pst: "print status". Creates and updates a file *.sta with
        # information about the current status of the program run. This
        # option is recommended if large data sets are aligned since it
        # allows the user to estimate the remaining running time.
        '-pst': FlagParameter('-', Name='pst'),

        # -smin <x>: minimum similarity value for the first residue pair (or
        # codon pair) in fragments. Speeds up protein alignment or alignment
        # of translated DNA fragments at the expense of sensitivity.
        '-smin': ValuedParameter('-', Name='smin', Delimiter=' '),

        # -stars <x>: maximum number of `*' characters indicating degree of
        # local similarity among sequences. By default, no stars are used
        # but numbers between 0 and 9, instead.
        '-stars': ValuedParameter('-', Name='stars', Delimiter=' '),

        # -stdo: results written to standard output.
        '-stdo': FlagParameter('-', Name='stdo'),

        # -ta: standard textual alignment printed (overrides suppression of
        # textual alignments in special options, e.g. -lgs).
        '-ta': FlagParameter('-', Name='ta'),

        # -thr <x>: threshold T = x.
        '-thr': ValuedParameter('-', Name='thr', Delimiter=' '),

        # -xfr: "exclude fragments" - list of fragments can be specified
        # that are NOT considered for pairwise alignment.
        '-xfr': FlagParameter('-', Name='xfr'),
    }

    # Separate dict object holding the same parameter instances as _options.
    _parameters = dict(_options)
    _command = "dialign2-2"

    def _input_as_seqs(self, data):
        """Turn a sequence collection into FASTA lines labelled 1, 2, 3, ...

        The labelled lines are handed to ``_input_as_lines`` and its return
        value is passed back to the caller.
        """
        fasta_lines = []
        for number, seq in enumerate(data, 1):
            fasta_lines.extend(('>' + str(number), seq))
        return self._input_as_lines(fasta_lines)

    def _align_out_filename(self):
        """Return the ``-fn`` value run through ``self._absolute``.

        Raises ValueError when ``-fn`` is not switched on.
        """
        out_param = self.Parameters['-fn']
        if not out_param.isOn():
            raise ValueError("No output file specified.")
        return self._absolute(str(out_param.Value))

    def _get_result_paths(self, data):
        """Map 'Align' to the output path, or return {} when -fn is off."""
        if not self.Parameters['-fn'].isOn():
            return {}
        align_path = ResultPath(Path=self._align_out_filename(),
                                IsWritten=True)
        return {'Align': align_path}

    def getHelp(self):
        """Return the Dialign help text (currently a single newline)."""
        return "\n"
Example #18
0
class Raxml(CommandLineApplication):
    """RAxML application controller.

    Declares the ``raxmlHPC`` command-line options and maps the files named
    after the ``-n`` run name (``RAxML_<type>.<name>``) to result paths.
    """

    deprecated('class',
               'cogent.app.raxml.Raxml',
               'cogent.app.raxml_v730.Raxml',
               '1.6')

    _options = {

        # Specify a column weight file name to assign individual weights to
        # each column of the alignment. Those weights must be integers
        # separated by any number and type of whitespaces within a separate
        # file, see file "example_weights" for an example.
        '-a': ValuedParameter('-', Name='a', Delimiter=' '),

        # Specify an integer number (random seed) for bootstrapping
        '-b': ValuedParameter('-', Name='b', Delimiter=' '),

        # Specify number of distinct rate categories for raxml when
        # ModelOfEvolution is set to GTRCAT or HKY85CAT.
        # Individual per-site rates are categorized into numberOfCategories
        # rate categories to accelerate computations. (Default = 50)
        '-c': ValuedParameter('-', Name='c', Delimiter=' ', Value=50),

        # This option allows you to start the RAxML search with a complete
        # random starting tree instead of the default Maximum Parsimony
        # Starting tree. On smaller datasets (around 100-200 taxa) it has
        # been observed that this might sometimes yield topologies of distinct
        # local likelihood maxima which better correspond to empirical
        # expectations.
        '-d': FlagParameter('-', Name='d'),

        # This allows you to specify up to which likelihood difference.
        # Default is 0.1 log likelihood units, author recommends 1 or 2 to
        # rapidly evaluate different trees.
        '-e': ValuedParameter('-', Name='e', Delimiter=' ', Value=0.1),

        # select search algorithm:
        #   d for normal hill-climbing search (Default)
        #     when -f option is omitted this algorithm will be used
        #   o old (slower) algorithm from v. 2.1.3
        #   c (check) just tests whether it can read the alignment
        #   e (evaluate) to optimize model+branch lengths for given input tree
        #   b (bipartition) draws bipartitions
        #   s (split) splits into individual genes, provided with model file
        '-f': ValuedParameter('-', Name='f', Delimiter=' ', Value="d"),

        # select grouping file name: allows incomplete multifurcating
        # constraint tree in newick format -- resolves multifurcations
        # randomly, adds other taxa using parsimony insertion
        '-g': ValuedParameter('-', Name='g', Delimiter=' '),

        # prints help and exits
        '-h': FlagParameter('-', Name='h'),

        # allows initial rearrangement to be constrained, e.g. 10 means
        # insertion will not be more than 10 nodes away from original.
        # default is to pick a "good" setting.
        '-i': ValuedParameter('-', Name='i', Delimiter=' '),

        # writes checkpoints (off by default)
        '-j': FlagParameter('-', Name='j'),

        # specifies that RAxML will optimize model parameters (for GTRMIX and
        # GTRGAMMA) as well as calculating likelihoods for bootstrapped trees.
        '-k': FlagParameter('-', Name='k'),

        # Model of Nucleotide Substitution:
        # -m GTRGAMMA: GTR + Optimization of substitution rates + Gamma
        # -m GTRCAT: GTR + Optimization of substitution rates + Optimization
        #    of site-specific evolutionary rates which are categorized into
        #    numberOfCategories distinct rate categories for greater
        #    computational efficiency
        # -m GTRMIX: Searches for GTRCAT, then switches to GTRGAMMA
        #
        # Amino Acid Models
        # matrixName (see below): DAYHOFF, DCMUT, JTT, MTREV, WAG, RTREV,
        # CPREV, VT, BLOSUM62, MTMAM, GTR.
        # F means use empirical nucleotide frequencies (append to string)
        # -m PROTCATmatrixName[F]: uses site-specific rate categories
        # -m PROTGAMMAmatrixName[F]: uses Gamma
        # -m PROTMIXmatrixName[F]: switches between gamma and cat models
        # e.g. -m PROTCATBLOSUM62F would use protcat with BLOSUM62 and
        # empirical frequencies
        '-m': ValuedParameter('-', Name='m', Delimiter=' '),

        # Specifies the name of the output file.
        '-n': ValuedParameter('-', Name='n', Delimiter=' '),

        # Specifies the name of the outgroup (or outgroups: comma-delimited,
        # no spaces, should be monophyletic).
        '-o': ValuedParameter('-', Name='o', Delimiter=' '),

        # Specified MultipleModel file name, in format:
        #    gene1 = 1-500
        #    gene2 = 501-1000
        #    (note: ranges can also be discontiguous, e.g. 1-100, 200-300,
        #     or can specify codon ranges as e.g. 1-100/3, 2-100/3, 3-100/3))
        '-q': ValuedParameter('-', Name='q', Delimiter=' '),

        # Name of the working directory where RAxML-V will write its output
        # files.
        '-w': ValuedParameter('-', Name='w', Delimiter=' '),

        # Constraint file name: allows a bifurcating Newick tree to be passed
        # in as a constraint file, other taxa will be added by parsimony.
        '-r': ValuedParameter('-', Name='r', Delimiter=' '),

        # Specify a random number seed for the parsimony inferences. This
        # allows you to reproduce your results and will help me debug the
        # program. This option HAS NO EFFECT in the parallel MPI version
        '-p': ValuedParameter('-', Name='p', Delimiter=' '),

        # specify the name of the alignment data file, in relaxed PHYLIP
        # format.
        '-s': ValuedParameter('-', Name='s', Delimiter=' '),

        # Specify a user starting tree file name in Newick format
        '-t': ValuedParameter('-', Name='t', Delimiter=' '),

        # Print the version
        '-v': FlagParameter('-', Name='v'),

        # Compute only randomized starting parsimony tree with RAxML, do not
        # optimize an ML analysis of the tree
        '-y': FlagParameter('-', Name='y'),

        # Multiple tree file, for use with -f b (to draw bipartitions onto the
        # common tree specified with -t)
        '-z': ValuedParameter('-', Name='z', Delimiter=' '),

        # Specifies number of runs on distinct starting trees.
        '-#': ValuedParameter('-', Name='#', Delimiter=' '),

        # Specify an integer number (random seed) to turn on rapid
        # bootstrapping
        '-x': ValuedParameter('-', Name='x', Delimiter=' ')
    }

    _parameters = {}
    _parameters.update(_options)
    _command = "raxmlHPC"
    # Output files are named RAxML_<type>.<run name>
    _out_format = "RAxML_%s.%s"

    def _format_output(self, outfile_name, out_type):
        """Prepend the proper output prefix to the output filename.

        The name is made absolute first; only its last "/"-separated
        component is rewritten to ``RAxML_<out_type>.<name>``.
        """
        outfile_name = self._absolute(outfile_name)
        outparts = outfile_name.split("/")
        outparts[-1] = self._out_format % (out_type, outparts[-1])

        return '/'.join(outparts)

    def _input_as_seqs(self, data):
        """Label the sequences 1, 2, 3, ... and pass them on as FASTA lines."""
        lines = []
        for i, s in enumerate(data):
            # will number the sequences 1,2,3,etc.
            lines.append(''.join(['>', str(i + 1)]))
            lines.append(s)
        return self._input_as_lines(lines)

    def _input_as_lines(self, data):
        """Turn ``-s`` on with the result of the parent's line handler.

        Returns the empty string; the input is supplied through ``-s``.
        """
        if data:
            self.Parameters['-s']\
                .on(super(Raxml, self)._input_as_lines(data))
        return ''

    def _input_as_string(self, data):
        """Makes data the value of the ``-s`` (alignment file) parameter.

        This method returns the empty string. The parameter will be printed
        automatically once set.
        """
        if data:
            # The original code addressed '-in', which is not a declared
            # option and raised KeyError; the alignment file option is '-s',
            # as used by the other input handlers of this class.
            self.Parameters['-s'].on(str(data))
        return ''

    def _input_as_multiline_string(self, data):
        """Turn ``-s`` on with the result of the parent's string handler.

        Returns the empty string; the input is supplied through ``-s``.
        """
        if data:
            self.Parameters['-s']\
                .on(super(Raxml, self)._input_as_multiline_string(data))
        return ''

    def _absolute(self, path):
        """Return path unchanged if absolute, else prefixed with a directory.

        The prefix is the ``-w`` value when that option is on, otherwise
        ``self.WorkingDir``.
        """
        path = FilePath(path)
        if isabs(path):
            return path
        # NOTE(review): plain concatenation -- this assumes the directory
        # value already ends with a path separator; confirm for -w values.
        elif self.Parameters['-w'].isOn():
            return self.Parameters['-w'].Value + path
        else:
            return self.WorkingDir + path

    def _log_out_filename(self):
        """Return the "log" output path; ValueError if -n is not set."""
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), "log")
        else:
            raise ValueError("No output file specified.")

    def _info_out_filename(self):
        """Return the "info" output path; ValueError if -n is not set."""
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value),
                                       "info")
        else:
            raise ValueError("No output file specified.")

    def _parsimony_tree_out_filename(self):
        """Return the "parsimonyTree" output path; ValueError without -n."""
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value),
                                       "parsimonyTree")
        else:
            raise ValueError("No output file specified.")

    def _result_tree_out_filename(self):
        """Return the "result" output path; ValueError if -n is not set."""
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value),
                                       "result")
        else:
            raise ValueError("No output file specified.")

    def _result_bootstrap_out_filename(self):
        """Return the "bootstrap" output path; ValueError if -n is not set."""
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value),
                                       "bootstrap")
        else:
            raise ValueError("No output file specified")

    def _checkpoint_out_filenames(self):
        """Return the checkpoint files found in the output directory.

        RAxML generates many checkpoint files, so the output directory
        (the ``-w`` value if set, otherwise ``self.WorkingDir``) is listed
        and every file whose name contains both the ``-n`` run name and
        "checkpoint" is collected. Sub-directories are not searched.

        Raises ValueError when ``-n`` is not switched on.
        """
        out_filenames = []
        if self.Parameters['-n'].isOn():
            out_name = str(self.Parameters['-n'].Value)
            walk_root = self.WorkingDir
            if self.Parameters['-w'].isOn():
                walk_root = str(self.Parameters['-w'].Value)
            for dpath, dnames, dfiles in walk(walk_root):
                if dpath == walk_root:
                    for gen_file in dfiles:
                        if out_name in gen_file and "checkpoint" in gen_file:
                            # NOTE(review): assumes walk_root ends with a
                            # path separator -- confirm for -w values.
                            out_filenames.append(walk_root + gen_file)
                    # Only the top-level directory is of interest.
                    break

        else:
            raise ValueError("No output file specified.")
        return out_filenames

    def _get_result_paths(self, data):
        """Return a dict of ResultPath objects for the files RAxML writes.

        'Info' is always present. With ``-k`` on, 'Bootstrap' is added;
        otherwise 'Log', 'ParsimonyTree' and 'Result' are added. Each
        checkpoint file is added as 'Checkpoint<N>', where N is the integer
        after the last "." of its name.

        Raises ValueError when ``-n`` is not set or when a checkpoint file
        name does not end in an integer.
        """
        result = {}
        result['Info'] = ResultPath(Path=self._info_out_filename(),
                                    IsWritten=True)
        if self.Parameters['-k'].isOn():
            result['Bootstrap'] = ResultPath(
                Path=self._result_bootstrap_out_filename(),
                IsWritten=True)
        else:
            result['Log'] = ResultPath(Path=self._log_out_filename(),
                                       IsWritten=True)
            result['ParsimonyTree'] = ResultPath(
                Path=self._parsimony_tree_out_filename(),
                IsWritten=True)
            result['Result'] = ResultPath(
                Path=self._result_tree_out_filename(),
                IsWritten=True)

        for checkpoint_file in self._checkpoint_out_filenames():
            suffix = checkpoint_file.split(".")[-1]
            try:
                checkpoint_num = int(suffix)
            except ValueError:
                # The message used to be raised with its "%s" placeholder
                # left unfilled; name the offending file.
                raise ValueError(
                    "%s does not appear to be a valid checkpoint file"
                    % checkpoint_file)
            result['Checkpoint%d' % checkpoint_num] = ResultPath(
                Path=checkpoint_file,
                IsWritten=True)

        return result
Example #19
0
class Msms(CommandLineApplication):
    """Application controller for MSMS. The default input is a ``Entity``
    instance.

    Supported parameters:

      - probe_radius float : probe sphere radius, [1.5]
      - density float      : surface points density, [1.0]
      - hdensity float     : surface points high density, [3.0]
      - surface <tses,ases>: triangulated or Analytical SES, [tses]
      - no_area            : turns off the analytical surface area computation
      - noh                : ignore atoms with radius 1.2
      - no_rest_on_pbr     : no restart if pb. during triangulation
      - no_rest            : no restart if pb. are encountered
      - if filename        : sphere input file
      - of filename        : output for triangulated surface
      - af filename        : area file
      - no_header          : do not add comment line to the output
      - free_vertices      : turns on computation for isolated RS vertices
      - all_components     : compute all the surfaces components
    """

    _parameters = {
        #  -probe_radius float : probe sphere radius, [1.5]
        '-probe_radius':
        ValuedParameter(Prefix='-', Name='probe_radius', Delimiter=' '),
        #  -density float      : surface points density, [1.0]
        '-density':
        ValuedParameter(Prefix='-', Name='density', Delimiter=' '),
        #  -hdensity float     : surface points high density, [3.0]
        '-hdensity':
        ValuedParameter(Prefix='-', Name='hdensity', Delimiter=' '),
        #  -surface <tses,ases>: triangulated or Analytical SES, [tses]
        '-surface':
        ValuedParameter(Prefix='-', Name='surface', Delimiter=' '),
        #  -no_area            : turns off the analytical surface area computation
        '-no_area':
        FlagParameter(Prefix='-', Name='no_area', Value=False),
        #  -noh                : ignore atoms with radius 1.2
        '-noh':
        FlagParameter(Prefix='-', Name='noh', Value=False),
        #  -no_rest_on_pbr     : no restart if pb. during triangulation
        '-no_rest_on_pbr':
        FlagParameter(Prefix='-', Name='no_rest_on_pbr', Value=False),
        #  -no_rest            : no restart if pb. are encountered
        '-no_rest':
        FlagParameter(Prefix='-', Name='no_rest', Value=False),
        #  -if filename        : sphere input file
        '-if':
        ValuedParameter(Prefix='-', Name='if', Delimiter=' ', IsPath=True),
        #  -of filename        : output for triangulated surface
        '-of':
        ValuedParameter(Prefix='-', Name='of', Delimiter=' ', IsPath=True),
        #  -af filename        : area file
        '-af':
        ValuedParameter(Prefix='-', Name='af', Delimiter=' ', IsPath=True),
        #  -no_header          : do not add comment line to the output
        '-no_header':
        FlagParameter(Prefix='-', Name='no_header', Value=False),
        #  -free_vertices      : turns on computation for isolated RS vertices
        '-free_vertices':
        FlagParameter(Prefix='-', Name='free_vertices', Value=False),
        #  -all_components     : compute all the surfaces components
        '-all_components':
        FlagParameter(Prefix='-', Name='all_components', Value=False),
        #######################
        # Options of the program that are not exposed here:
        #  -one_cavity #atoms at1 [at2][at3] : Compute the surface for an
        #  internal cavity for which at least one atom is specified
        #######################
        #  -socketName servicename : socket connection from a client
        #  -socketPort portNumber : socket connection from a client
        #  -xdr                : use xdr encoding over socket
        #  -sinetd             : inetd server connection
    }

    _command = "msms"
    _input_handler = '_input_from_entity'

    def _input_from_entity(self, data):
        """Feed an entity to msms through a temporary XYZR file.

        When ``data`` is true, it is written with ``XYZRNWriter`` to a new
        temporary file whose path is stored in ``self._input_filename`` and
        used as the value of ``-if``, ``-of`` and ``-af``. When ``data`` is
        false, those three parameters must already be switched on.

        Returns the empty string; the input is supplied through ``-if``.
        Raises ValueError if any of -if, -of, -af is off afterwards.
        """
        if data:
            # Create the temporary file; only its name is needed here.
            fd, self._input_filename = tempfile.mkstemp()
            os.close(fd)
            # Write the XYZR data; the with-block closes the handle even if
            # the writer raises.
            with open(self._input_filename, 'wb') as fh:
                XYZRNWriter(fh, data)
            self.Parameters['-if'].on(self._input_filename)
            # msms appends .vert and .face to the -of value
            self.Parameters['-of'].on(self._input_filename)
            # msms appends .area to the -af value
            self.Parameters['-af'].on(self._input_filename)
        if (not self.Parameters['-if'].isOn()) or \
           (not self.Parameters['-of'].isOn()) or \
           (not self.Parameters['-af'].isOn()):
            raise ValueError('All of -if, -of and -af have to be specified.')
        return ""

    def _get_result_paths(self, data):
        """Return ResultPath objects for the files derived from -of and -af.

        'VertFile' and 'FaceFile' are the ``-of`` value plus ".vert" and
        ".face"; 'AreaFile' (the ``-af`` value plus ".area") is included
        only when ``-no_area`` has a false value.
        """
        result = {}
        surface_base = self.Parameters['-of'].Value
        result['VertFile'] = ResultPath(Path=surface_base + '.vert',
                                        IsWritten=True)
        result['FaceFile'] = ResultPath(Path=surface_base + '.face',
                                        IsWritten=True)
        if not self.Parameters['-no_area'].Value:
            area_file = self.Parameters['-af'].Value + '.area'
            result['AreaFile'] = ResultPath(Path=area_file, IsWritten=True)
        return result
Example #20
0
class Raxml(CommandLineApplication):
    """RAxML application controller"""

    _options = {

        # Specify a column weight file name to assign individual weights to
        # each column of the alignment. Those weights must be integers
        # separated by any number and type of whitespace within a separate
        # file, see file "example_weights" for an example.
        '-a': ValuedParameter('-', Name='a', Delimiter=' '),

        # Specify one of the secondary structure substitution models implemented
        # in RAxML. The same nomenclature as in the PHASE manual is used,
        # available models:  S6A, S6B, S6C, S6D, S6E, S7A, S7B, S7C, S7D, S7E,
        # S7F, S16, S16A, S16B
        # DEFAULT: 16-state GTR model (S16)
        '-A': ValuedParameter('-', Name='A', Delimiter=' '),

        #  Specify an integer number (random seed) for bootstrapping
        '-b': ValuedParameter('-', Name='b', Delimiter=' '),

        # specify a floating point number between 0.0 and 1.0 that will be used
        # as cutoff threshold for the MR-based bootstopping criteria. The
        # recommended setting is 0.03.
        '-B': ValuedParameter('-', Name='B', Delimiter=' '),

        # Specify number of distinct rate categories for raxml when
        # ModelOfEvolution is set to GTRCAT or HKY85CAT.
        # Individual per-site rates are categorized into numberOfCategories
        # rate categories to accelerate computations. (Default = 50)
        '-c': ValuedParameter('-', Name='c', Delimiter=' '),

        # Conduct model parameter optimization on gappy, partitioned multi-gene
        # alignments with per-partition branch length estimates (-M enabled)
        # using the fast method with pointer meshes described in:
        # Stamatakis and Ott: "Efficient computation of the phylogenetic
        # likelihood function on multi-gene alignments and multi-core
        # processors"
        # WARNING: We can not conduct useful tree searches using this method
        # yet! Does not work with Pthreads version.
        '-C': ValuedParameter('-', Name='C', Delimiter=' '),

        # This option allows you to start the RAxML search with a complete
        # random starting tree instead of the default Maximum Parsimony
        # Starting tree. On smaller datasets (around 100-200 taxa) it has
        # been observed that this might sometimes yield topologies of distinct
        # local likelihood maxima which better correspond to empirical
        # expectations.
        '-d': FlagParameter('-', Name='d'),

        # ML search convergence criterion. This will break off ML searches if
        # the relative Robinson-Foulds distance between the trees obtained from
        # two consecutive lazy SPR cycles is smaller or equal to 1%. Usage
        # recommended for very large datasets in terms of taxa. On trees with
        # more than 500 taxa this will yield execution time improvements of
        # approximately 50% While yielding only slightly worse trees.
        # DEFAULT: OFF
        '-D': ValuedParameter('-', Name='D'),

        # This allows you to specify up to which likelihood difference the
        # model parameters will be optimized. Default is 0.1 log likelihood
        # units, author recommends 1 or 2 to rapidly evaluate different trees.
        '-e': ValuedParameter('-', Name='e', Delimiter=' '),

        # specify an exclude file name, that contains a specification of
        # alignment positions you wish to exclude. Format is similar to Nexus,
        # the file shall contain entries like "100-200 300-400", to exclude a
        # single column write, e.g., "100-100", if you use a mixed model, an
        # appropriately adapted model file will be written.
        '-E': ValuedParameter('-', Name='E', Delimiter=' '),

        # select search algorithm:
        #   a rapid Bootstrap analysis and search for best-scoring ML tree in
        #       one program run
        #   A compute marginal ancestral states on a ROOTED reference tree
        #       provided with "-t" - ONLY IN 7.3.0
        #   b draw bipartition information on a tree provided with "-t" based on
        #       multiple trees (e.g., from a bootstrap) in a file specified by
        #       "-z"
        #   c check if the alignment can be properly read by RAxML
        #   d for normal hill-climbing search (Default)
        #     when -f option is omitted this algorithm will be used
        #   e optimize model+branch lengths for given input tree under
        #       GAMMA/GAMMAI only
        #   E execute very fast experimental tree search, at present only for
        #       testing
        #   F execute fast experimental tree search, at present only for testing
        #   g compute per site log Likelihoods for one or more trees passed via
        #       "-z" and write them to a file that can be read by CONSEL
        #       WARNING: does not print likelihoods in the original column order
        #   h compute log likelihood test (SH-test) between best tree passed via
        #       "-t" and a bunch of other trees passed via "-z"
        #   i EXPERIMENTAL do not use for real tree inferences: conducts a
        #       single cycle of fast lazy SPR moves on a given input tree, to be
        #       used in combination with -C and -M
        #   I EXPERIMENTAL do not use for real tree inferences: conducts a
        #       single cycle of thorough lazy SPR moves on a given input tree,
        #       to be used in combination with -C and -M
        #   j generate a bunch of bootstrapped alignment files from an original
        #       alignment file. You need to specify a seed with "-b" and the
        #       number of replicates with "-#"
        # following "J" is for version 7.2.8
        #   J Compute SH-like support values on a given tree passed via "-t".
        #   m compare bipartitions between two bunches of trees passed via "-t"
        #       and "-z" respectively. This will return the Pearson correlation
        #       between all bipartitions found in the two tree files. A file
        #       called RAxML_bipartitionFrequencies.outputFileName will be
        #       printed that contains the pair-wise bipartition frequencies of
        #       the two sets
        #   n compute the log likelihood score of all trees contained in a tree
        #       file provided by "-z" under GAMMA or GAMMA+P-Invar
        #   o old (slower) algorithm from v. 2.1.3
        #   p perform pure stepwise MP addition of new sequences to an
        #       incomplete starting tree and exit
        #   r compute pairwise Robinson-Foulds (RF) distances between all pairs
        #       of trees in a tree file passed via "-z" if the trees have node
        #       labels represented as integer support values the program will
        #       also compute two flavors of the weighted Robinson-Foulds (WRF)
        #       distance
        # following "R" is for version 7.2.8
        #   R compute rogue taxa using new statistical method based on the
        #       evolutionary placement algorithm
        #       WARNING: this is experimental code - DEPRECATED IN 7.3.0
        #   s (split) splits into individual genes, provided with model file
        # following "S" is for version 7.2.8
        #   S compute site-specific placement bias using a leave one out test
        #       inspired by the evolutionary placement algorithm
        #   t do randomized tree searches on one fixed starting tree
        #   u execute morphological weight calibration using maximum likelihood,
        #       this will return a weight vector. you need to provide a
        #       morphological alignment and a reference tree via "-t"
        #   U execute morphological weight calibration using parsimony, this
        #       will return a weight vector. you need to provide a morphological
        #       alignment and a reference tree via "-t" - DEPRECATED IN 7.3.0
        #   v classify a bunch of environmental sequences into a reference tree
        #       using the slow heuristics without dynamic alignment you will
        #       need to start RAxML with a non-comprehensive reference tree and
        #       an alignment containing all sequences (reference + query)
        #   w compute ELW test on a bunch of trees passed via "-z"
        #   x compute pair-wise ML distances, ML model parameters will be
        #       estimated on an MP starting tree or a user-defined tree passed
        #       via "-t", only allowed for GAMMA-based models of rate
        #       heterogeneity
        #   y classify a bunch of environmental sequences into a reference tree
        #       using the fast heuristics without dynamic alignment you will
        #       need to start RAxML with a non-comprehensive reference tree and
        #       an alignment containing all sequences (reference + query)
        '-f': ValuedParameter('-', Name='f', Delimiter=' ', Value="d"),

        # enable ML tree searches under CAT model for very large trees without
        # switching to GAMMA in the end (saves memory). This option can also be
        # used with the GAMMA models in order to avoid the thorough optimization
        # of the best-scoring ML tree in the end.
        # DEFAULT: OFF
        '-F': FlagParameter('-', Name='F'),

        # select grouping file name: allows incomplete multifurcating constraint
        # tree in newick format -- resolves multifurcations randomly, adds
        # other taxa using parsimony insertion
        '-g': ValuedParameter('-', Name='g', Delimiter=' '),

        # enable the ML-based evolutionary placement algorithm heuristics by
        # specifying a threshold value (fraction of insertion branches to be
        # evaluated using slow insertions under ML).
        '-G': FlagParameter('-', Name='G'),

        # prints help and exits
        '-h': FlagParameter('-', Name='h'),

        # enable the MP-based evolutionary placement algorithm heuristics
        # by specifying a threshold value (fraction of insertion branches to be
        # evaluated using slow insertions under ML) - DEPRECATED IN 7.3.0
        #'-H':ValuedParameter('-', Name='H',Delimiter=' '),

        # allows initial rearrangement to be constrained, e.g. 10 means
        # insertion will not be more than 10 nodes away from original.
        # default is to pick a "good" setting.
        '-i': ValuedParameter('-', Name='i', Delimiter=' '),

        # a posteriori bootstopping analysis. Use:
        #   "-I autoFC" for the frequency-based criterion
        #   "-I autoMR" for the majority-rule consensus tree criterion
        #   "-I autoMRE" for the extended majority-rule consensus tree criterion
        #   "-I autoMRE_IGN" for metrics similar to MRE, but include
        #       bipartitions under the threshold whether they are compatible
        #       or not. This emulates MRE but is faster to compute.
        #   You also need to pass a tree file containing several bootstrap
        #   replicates via "-z"
        '-I': ValuedParameter('-', Name='I', Delimiter=' '),

        # writes checkpoints (off by default)
        '-j': FlagParameter('-', Name='j'),

        # Compute majority rule consensus tree with "-J MR" or extended majority
        # rule consensus tree with "-J MRE" or strict consensus tree with "-J
        # STRICT" You will need to provide a tree file containing several
        # UNROOTED trees via "-z"
        '-J': ValuedParameter('-', Name='J', Delimiter=' '),

        #specifies that RAxML will optimize model parameters (for GTRMIX and
        # GTRGAMMA) as well as calculating likelihoods for bootstrapped trees.
        '-k': FlagParameter('-', Name='k'),

        # Specify one of the multi-state substitution models (max 32 states)
        # implemented in RAxML. Available models are: ORDERED, MK, GTR
        '-K': ValuedParameter('-', Name='K', Delimiter=' '),

        # Model of Binary (Morphological), Nucleotide, Multi-State, or Amino
        #   Acid Substitution::
        # BINARY:
        #   -m BINCAT : Optimization of site-specific evolutionary rates which
        #       are categorized into numberOfCategories distinct rate categories
        #       for greater computational efficiency. Final tree might be
        #       evaluated automatically under BINGAMMA, depending on the tree
        #       search option
        #   -m BINCATI : Optimization of site-specific evolutionary rates which
        #       are categorized into numberOfCategories distinct rate categories
        #       for greater computational efficiency. Final tree might be
        #       evaluated automatically under BINGAMMAI, depending on the tree
        #       search option
        #   -m BINGAMMA : GAMMA model of rate heterogeneity (alpha parameter
        #       will be estimated)
        #   -m BINGAMMAI : Same as BINGAMMA, but with estimate of proportion of
        #       invariable sites
        # NUCLEOTIDES
        #   -m GTRCAT: GTR + Optimization of substitution rates +  Optimization
        #       of site-specific evolutionary rates which are categorized into
        #       numberOfCategories distinct rate categories for greater
        #       computational efficiency
        #   -m GTRCAT_FLOAT : Same as above but uses single-precision floating
        #       point arithmetic instead of double-precision. Usage only
        #       recommended for testing, the code will run slower, but can save
        #       almost 50% of memory. If you have problems with phylogenomic
        #       datasets and large memory requirements you may give it a shot.
        #       Keep in mind that numerical stability seems to be okay but needs
        #       further testing. - DEPRECATED IN 7.3.0
        #   -m GTRCATI : GTR + Optimization of substitution rates + Optimization
        #       of site-specific evolutionary rates which are categorized into
        #       numberOfCategories distinct rate categories for greater
        #       computational efficiency.  Final tree might be evaluated under
        #       GTRGAMMAI, depending on the tree search option
        #   -m GTRGAMMA: GTR + Optimization of substitution rates + Gamma
        #   -m GTRGAMMA_FLOAT : Same as GTRGAMMA, but also with
        #       single-precision arithmetics, same cautionary notes as for
        #       GTRCAT_FLOAT apply. - DEPRECATED IN 7.3.0
        #   -m GTRGAMMAI : Same as GTRGAMMA, but with estimate of proportion of
        #       invariable sites
        # MULTI-STATE:
        #   -m MULTICAT : Optimization of site-specific evolutionary rates which
        #       are categorized into numberOfCategories distinct rate categories
        #       for greater computational efficiency. Final tree might be
        #       evaluated automatically under MULTIGAMMA, depending on the tree
        #       search option
        #   -m MULTICATI : Optimization of site-specific evolutionary rates
        #       which are categorized into numberOfCategories distinct rate
        #       categories for greater computational efficiency. Final tree
        #       might be evaluated automatically under MULTIGAMMAI, depending on
        #       the tree search option
        #   -m MULTIGAMMA : GAMMA model of rate heterogeneity (alpha parameter
        #       will be estimated)
        #   -m MULTIGAMMAI : Same as MULTIGAMMA, but with estimate of proportion
        #       of invariable sites
        # You can use up to 32 distinct character states to encode multi-state
        # regions, they must be used in the following order: 0, 1, 2, 3, 4, 5,
        # 6, 7, 8, 9, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S,
        # T, U, V i.e., if you have 6 distinct character states you would use 0,
        # 1, 2, 3, 4, 5 to encode these. The substitution model for the
        # multi-state regions can be selected via the "-K" option
        # Amino Acid Models:
        #   -m PROTCATmatrixName[F] : specified AA matrix + Optimization of
        #       substitution rates + Optimization of site-specific evolutionary
        #       rates which are categorized into numberOfCategories distinct
        #       rate categories for greater computational efficiency.   Final
        #       tree might be evaluated automatically under
        #       PROTGAMMAmatrixName[f], depending on the tree search option
        #   -m PROTCATmatrixName[F]_FLOAT : PROTCAT with single precision
        #       arithmetics, same cautionary notes as for GTRCAT_FLOAT apply
        #       - DEPRECATED IN 7.3.0
        #   -m PROTCATImatrixName[F] : specified AA matrix + Optimization of
        #       substitution rates + Optimization of site-specific
        #       evolutionary rates which are categorized into numberOfCategories
        #       distinct rate categories for greater computational efficiency.
        #       Final tree might be evaluated automatically under
        #       PROTGAMMAImatrixName[f], depending on the tree search option
        #   -m PROTGAMMAmatrixName[F] : specified AA matrix + Optimization of
        #       substitution rates + GAMMA model of rate heterogeneity (alpha
        #       parameter will be estimated)
        #   -m PROTGAMMAmatrixName[F]_FLOAT : PROTGAMMA with single precision
        #       arithmetics, same cautionary notes as for GTRCAT_FLOAT apply
        #       - DEPRECATED IN 7.3.0
        #   -m PROTGAMMAImatrixName[F] : Same as PROTGAMMAmatrixName[F], but
        #       with estimate of proportion of invariable sites
        # Available AA substitution models: DAYHOFF, DCMUT, JTT, MTREV, WAG,
        # RTREV, CPREV, VT, BLOSUM62, MTMAM, LG, GTR. With the optional "F"
        # appendix you can specify if you want to use empirical base frequencies
        # Please note that for mixed models you can in addition specify the
        # per-gene AA model in the mixed model file (see manual for details).
        # Also note that if you estimate AA GTR parameters on a partitioned
        # dataset, they will be linked (estimated jointly) across all partitions
        # to avoid over-parametrization
        '-m': ValuedParameter('-', Name='m', Delimiter=' '),

        # Switch on estimation of individual per-partition branch lengths. Only
        # has effect when used in combination with "-q". Branch lengths for
        # individual partitions will be printed to separate files. A weighted
        # average of the branch lengths is computed by using the respective
        # partition lengths.
        # DEFAULT: OFF
        '-M': FlagParameter('-', Name='M'),

        # Specifies the name of the output file.
        '-n': ValuedParameter('-', Name='n', Delimiter=' '),

        # Specifies the name of the outgroup (or outgroups: comma-delimited,
        # no spaces, should be monophyletic).
        '-o': ValuedParameter('-', Name='o', Delimiter=' '),

        # Enable checkpointing using the dmtcp library available at
        # http://dmtcp.sourceforge.net/. This only works if you call the program
        # preceded by the command "dmtcp_checkpoint" and if you compile a
        # dedicated binary using the appropriate Makefile. With "-O" you can
        # specify the interval between checkpoints in seconds.
        # DEFAULT: 3600.0 seconds - DEPRECATED IN 7.3.0
        #'-O':ValuedParameter('-',Name='O',Delimiter=' ',Value=3600.0),

        # Specify a random number seed for the parsimony inferences. This allows
        # you to reproduce your results and will help me debug the program.
        '-p': ValuedParameter('-', Name='p', Delimiter=' '),

        # Specify the file name of a user-defined AA (Protein) substitution
        # model. This file must contain 420 entries, the first 400 being the AA
        # substitution rates (this must be a symmetric matrix) and the last 20
        # are the empirical base frequencies
        '-P': ValuedParameter('-', Name='P', Delimiter=' '),

        # Specified MultipleModel file name, in format:
        #    gene1 = 1-500
        #    gene2 = 501-1000
        #    (note: ranges can also be discontiguous, e.g. 1-100, 200-300,
        #     or can specify codon ranges as e.g. 1-100/3, 2-100/3, 3-100/3))
        '-q': ValuedParameter('-', Name='q', Delimiter=' '),

        # THE FOLLOWING "Q" is DEPRECATED IN 7.2.8
        # Turn on computation of SH-like support values on tree.
        # DEFAULT: OFF
        '-Q': FlagParameter('-', Name='Q'),

        # Constraint file name: allows a bifurcating Newick tree to be passed
        # in as a constraint file, other taxa will be added by parsimony.
        '-r': ValuedParameter('-', Name='r', Delimiter=' '),

        # THE FOLLOWING "R" is IN 7.2.8
        # Specify the file name of a binary model parameter file that has
        # previously been generated with RAxML using the -f e tree evaluation
        # option. The file name should be:  RAxML_binaryModelParameters.runID
        '-R': ValuedParameter('-', Name='R', Delimiter=' '),

        # specify the name of the alignment data file, in relaxed PHYLIP
        # format.
        '-s': ValuedParameter('-', Name='s', Delimiter=' '),

        # Specify the name of a secondary structure file. The file can contain
        # "." for alignment columns that do not form part of a stem and
        # characters "()<>[]{}" to define stem regions and pseudoknots
        '-S': ValuedParameter('-', Name='S', Delimiter=' '),

        # Specify a user starting tree file name in Newick format
        '-t': ValuedParameter('-', Name='t', Delimiter=' '),

        # PTHREADS VERSION ONLY! Specify the number of threads you want to run.
        # Make sure to set "-T" to at most the number of CPUs you have on your
        # machine, otherwise, there will be a huge performance decrease!
        '-T': ValuedParameter('-', Name='T', Delimiter=' '),

        # THE FOLLOWING "U" is IN 7.2.8
        # Try to save memory by using SEV-based implementation for gap columns
        # on large gappy alignments
        # WARNING: this will only work for DNA under GTRGAMMA and is still in an
        # experimental state.
        '-U': ValuedParameter('-', Name='U', Delimiter=' '),

        # Print the version
        '-v': FlagParameter('-', Name='v'),

        # Name of the working directory where RAxML-V will write its output
        # files.
        '-w': ValuedParameter('-', Name='w', Delimiter=' '),

        # THE FOLLOWING "W" is IN 7.2.8
        # Sliding window size for leave-one-out site-specific placement bias
        # algorithm only effective when used in combination with "-f S"
        #   DEFAULT: 100 sites
        '-W': ValuedParameter('-', Name='W', Delimiter=' '),

        # Specify an integer number (random seed) and turn on rapid
        # bootstrapping. CAUTION: unlike in version 7.0.4 RAxML will conduct
        # rapid BS replicates under the model of rate heterogeneity you
        # specified via "-m" and not by default under CAT
        '-x': ValuedParameter('-', Name='x', Delimiter=' '),

        # EXPERIMENTAL OPTION: This option will do a per-site estimate of
        # protein substitution models by looping over all given, fixed models
        # LG, WAG, JTT, etc and using their respective base frequencies to
        # independently assign a prot subst. model to each site via ML
        # optimization. At present this option only works with the GTR+GAMMA
        # model, unpartitioned datasets, and in the sequential version only.
        #   DEFAULT: OFF
        '-X': FlagParameter('-', Name='X'),

        # Compute only randomized starting parsimony tree with RAxML, do not
        # optimize an ML analysis of the tree
        '-y': FlagParameter('-', Name='y'),

        # Do a more thorough parsimony tree search using a parsimony ratchet and
        # exit. Specify the number of ratchet searches via "-#" or "-N". This
        # has just been implemented for completeness, if you want a fast MP
        # implementation use TNT
        # DEFAULT: OFF - DEPRECATED IN 7.3.0
        #'-Y':FlagParameter('-', Name='Y'),

        # Multiple tree file, for use with -f b (to draw bipartitions onto the
        # common tree specified with -t)
        '-z': ValuedParameter('-', Name='z', Delimiter=' '),

        # Specifies number of runs on distinct starting trees.
        '-#': ValuedParameter('-', Name='#', Delimiter=' ', Value=1),

        # Specifies number of runs on distinct starting trees.
        '-N': ValuedParameter('-', Name='N', Delimiter=' '),
    }

    # Parameter table of the controller: a shallow copy of the option
    # definitions above (the parameter objects themselves are shared).
    _parameters = dict(_options)
    # Name of the executable to run.
    _command = "raxmlHPC"
    # Template for output file names: RAxML_<output type>.<run name>
    _out_format = "RAxML_%s.%s"

    def _format_output(self, outfile_name, out_type):
        """Return the path of the RAxML output of type out_type.

        outfile_name is first resolved with self._absolute. Only the last
        "/"-separated component is rewritten, using self._out_format with
        (out_type, component); any leading directories are kept as they are.
        """
        pieces = self._absolute(outfile_name).rsplit("/", 1)
        pieces[-1] = self._out_format % (out_type, pieces[-1])
        return "/".join(pieces)

    def _input_as_seqs(self, data):
        """Label each sequence with its 1-based position and pass them on.

        Every item of data is preceded by a '>N' line, where N numbers the
        items 1, 2, 3, ... The combined list goes to self._input_as_lines,
        whose return value is returned unchanged.
        """
        labelled = []
        for number, seq in enumerate(data, 1):
            labelled.extend(['>' + str(number), seq])
        return self._input_as_lines(labelled)

    def _input_as_lines(self, data):
        """Turn -s on with the parent handler's result for data.

        Truthy data is handed to the parent class's _input_as_lines and
        whatever that returns becomes the value of the -s parameter. Falsy
        data leaves the parameters untouched. Always returns ''.
        """
        if not data:
            return ''
        alignment_param = self.Parameters['-s']
        alignment_param.on(super(Raxml, self)._input_as_lines(data))
        return ''

    def _input_as_string(self, data):
        """Make data the value of the -s (alignment file name) parameter.

        This method returns the empty string. The parameter will be printed
        automatically once set.

        Falsy data leaves the parameters untouched.
        """
        if data:
            # RAxML has no '-in' option and none is defined in _options, so
            # the former lookup of '-in' raised KeyError. The alignment file
            # is passed with -s, as in the other input handlers of this
            # class.
            self.Parameters['-s'].on(str(data))
        return ''

    def _input_as_multiline_string(self, data):
        """Turn -s on with the parent handler's result for data.

        Truthy data is handed to the parent class's
        _input_as_multiline_string and whatever that returns becomes the
        value of the -s parameter. Falsy data leaves the parameters
        untouched. Always returns ''.
        """
        if not data:
            return ''
        alignment_param = self.Parameters['-s']
        alignment_param.on(
            super(Raxml, self)._input_as_multiline_string(data))
        return ''

    def _absolute(self, path):
        """Return path as is when absolute, else prefixed with a base dir.

        The base is the value of -w when that parameter is on, otherwise
        self.WorkingDir. Base and path are combined with plain ``+``; no
        separator is inserted between them.
        """
        path = FilePath(path)
        if isabs(path):
            return path
        if self.Parameters['-w'].isOn():
            base = self.Parameters['-w'].Value
        else:
            base = self.WorkingDir
        return base + path

    def _log_out_filename(self):
        """Return the path of the "log" output (RAxML_log.<-n value>).

        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value), "log")

    def _info_out_filename(self):
        """Return the path of the "info" output (RAxML_info.<-n value>).

        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value), "info")

    def _parsimony_tree_out_filename(self):
        """Return the path of the "parsimonyTree" output file.

        The file name is RAxML_parsimonyTree.<-n value>.
        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value), "parsimonyTree")

    # added for tree-insertion
    def _originallabelled_tree_out_filename(self):
        """Return the path of the "originalLabelledTree" output file.

        The file name is RAxML_originalLabelledTree.<-n value>.
        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value),
                                   "originalLabelledTree")

    # added for tree-insertion
    def _labelled_tree_out_filename(self):
        """Return the path of the "labelledTree" output file.

        The file name is RAxML_labelledTree.<-n value>.
        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value), "labelledTree")

    # added for tree-insertion
    def _classification_out_filename(self):
        """Return the path of the "classification" output file.

        The file name is RAxML_classification.<-n value>.
        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value), "classification")

    # added for tree-insertion
    def _classificationlikelihoodweights_out_filename(self):
        """Return the path of the "classificationLikelihoodWeights" output.

        The file name is RAxML_classificationLikelihoodWeights.<-n value>.
        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value),
                                   "classificationLikelihoodWeights")

    # added for tree-insertion
    def _best_tree_out_filename(self):
        """Return the path of the "bestTree" output file.

        The file name is RAxML_bestTree.<-n value>.
        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value), "bestTree")

    # added for tree-insertion
    def _entropy_out_filename(self):
        """Return the path of the "entropy" output file.

        The file name is RAxML_entropy.<-n value>.
        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value), "entropy")

    # added for tree-insertion
    def _json_out_filename(self):
        """Return the path of the "portableTree" output file.

        The file name is RAxML_portableTree.<-n value>; no extension is
        added here.
        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value), "portableTree")

    # added for tree-insertion
    def _parsimony_out_filename(self):
        """Return the path of the "equallyParsimoniousPlacements" output.

        The file name is RAxML_equallyParsimoniousPlacements.<-n value>.
        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value),
                                   "equallyParsimoniousPlacements")

    def _result_tree_out_filename(self):
        """Return the path of the "result" output file.

        The file name is RAxML_result.<-n value>.
        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified.")
        return self._format_output(str(run_name.Value), "result")

    def _result_bootstrap_out_filename(self):
        """Return the path of the "bootstrap" output file.

        The file name is RAxML_bootstrap.<-n value>.
        Raises ValueError when the -n parameter is not on.
        """
        run_name = self.Parameters['-n']
        if not run_name.isOn():
            raise ValueError("No output file specified")
        return self._format_output(str(run_name.Value), "bootstrap")

    def _checkpoint_out_filenames(self):
        """Collect the names of the checkpoint files RAxML has written.

        RAxML can generate a large number of checkpoint files, so the output
        directory is listed to find them. Only the top level of that
        directory is inspected: the -w value when -w is on, otherwise
        self.WorkingDir. A file is reported when its name contains both the
        -n value and the word "checkpoint".

        Returns a list of directory + file name concatenations (no separator
        is inserted). Raises ValueError when the -n parameter is not on.
        """
        if not self.Parameters['-n'].isOn():
            raise ValueError("No output file specified.")

        run_name = str(self.Parameters['-n'].Value)
        walk_root = self.WorkingDir
        if self.Parameters['-w'].isOn():
            walk_root = str(self.Parameters['-w'].Value)

        found = []
        for dpath, _dnames, dfiles in walk(walk_root):
            if dpath != walk_root:
                continue
            found.extend(
                walk_root + fname for fname in dfiles
                if run_name in fname and "checkpoint" in fname)
            # The top-level listing has been handled; do not descend.
            break
        return found

    def _handle_app_result_build_failure(self, out, err, exit_status,
                                         result_paths):
        """Report that RAxML did not produce its expected output files.

        Always raises ApplicationError. The message includes the contents of
        err (the captured standard error, read with err.read()) when they
        can be read; otherwise a generic message is used. out, exit_status
        and result_paths are accepted for interface compatibility and are
        not used.
        """
        # Only the read is guarded. Raising inside the try block would be
        # caught by its own handler and the informative message would be
        # replaced by the generic one.
        try:
            err_text = err.read()
        except Exception:
            raise ApplicationError('RAxML failed to run properly.')
        raise ApplicationError('RAxML failed to produce an output file due to the following error: \n\n%s ' \
         % err_text)

    def _get_result_paths(self, data):
        """Map result names to the files this RAxML run is expected to write.

        'Info' is always present. The remaining entries depend on the
        parameters:

        * -k on:  'Bootstrap'
        * -f v:   'Classification', 'ClassificationLikelihoodWeights',
                  'OriginalLabelledTree', 'Result' (the labelled tree),
                  'entropy' and 'json' (portable tree + '.jplace')
        * -f y:   'Parsimony', 'OriginalLabelledTree' and 'json'
        * else:   'Log', 'ParsimonyTree', 'Result' and 'besttree'

        Every file returned by self._checkpoint_out_filenames is added as
        'Checkpoint<N>', where N is the integer after the last '.' of its
        name. ``data`` is not used.

        Raises ValueError when -n is not on (from the file name helpers) or
        when a checkpoint file name does not end in an integer.
        """
        def written(path):
            # All RAxML outputs are registered as written files.
            return ResultPath(Path=path, IsWritten=True)

        result = {'Info': written(self._info_out_filename())}
        algorithm = self.Parameters["-f"].Value

        if self.Parameters['-k'].isOn():
            result['Bootstrap'] = written(
                self._result_bootstrap_out_filename())
        elif algorithm == 'v':
            # these were added to handle the results from tree-insertion
            result['Classification'] = written(
                self._classification_out_filename())
            result['ClassificationLikelihoodWeights'] = written(
                self._classificationlikelihoodweights_out_filename())
            result['OriginalLabelledTree'] = written(
                self._originallabelled_tree_out_filename())
            result['Result'] = written(self._labelled_tree_out_filename())
            result['entropy'] = written(self._entropy_out_filename())
            result['json'] = written(self._json_out_filename() + '.jplace')
        elif algorithm == 'y':
            # these were added to handle the results from tree-insertion
            result['Parsimony'] = written(self._parsimony_out_filename())
            result['OriginalLabelledTree'] = written(
                self._originallabelled_tree_out_filename())
            result['json'] = written(self._json_out_filename() + '.jplace')
        else:
            result['Log'] = written(self._log_out_filename())
            result['ParsimonyTree'] = written(
                self._parsimony_tree_out_filename())
            result['Result'] = written(self._result_tree_out_filename())
            result['besttree'] = written(self._best_tree_out_filename())

        for checkpoint_file in self._checkpoint_out_filenames():
            suffix = checkpoint_file.split(".")[-1]
            try:
                checkpoint_num = int(suffix)
            except ValueError:
                # Name the offending file; the placeholder used to be left
                # uninterpolated in the message.
                raise ValueError(
                    "%s does not appear to be a valid checkpoint file"
                    % checkpoint_file)
            result['Checkpoint%d' % checkpoint_num] = written(checkpoint_file)

        return result
Example #21
0
class RNAshapes(CommandLineApplication):
    """Application controller for RNAshapes application

    Options:
    -h          Display this information
    -H <option> Display detailed information on <option>
    -v          Show version

    Sequence analysis modes:
    -a          Shape folding (standard mode)
    -s          Complete suboptimal folding
    -p          Shape probabilities
    -q          Shape probabilities (including shreps)
    -P <value>  Shape probabilities for mfe-best shapes
    -i <value>  Sampling with <value> iterations
    -C          Consensus shapes (RNAcast)

    Additional modes (use with any of the above):
    -r          Calculate structure probabilities
    -w <value>  Specify window size
    -W <value>  Specify window position increment (use with -w) [1]
    -m <shape>  Match shape (use with -a, -s, -p, -q or -C)

    Analysis control:
    -e <value>  Set energy range (kcal/mol)
    -c <value>  Set energy range (%) [10]
    -t <value>  Specify shape type (1-5) [5]
    -F <value>  Set probability cutoff filter (use with -p, -q or -P)
    -T <value>  Set probability output filter (use with -p, -q or -P)
    -M <value>  Set maximum loop length [30]  (use -M n for unrestricted)
    -l          Allow lonely base pairs
    -u          Ignore unstable structures (use with -a, -s or -C)

    Input/Output:
    -o <value>  Specify output type (1-4,f) [2]
    -O <string> Specify output format string
    -S <value>  Specify output width for structures
    -# <value>  Print only the first <value> results
    -g <value>  Generate structure graphs for first <value> structures
    -L          Highlight upper case characters in structure graphs
    -N          Do not include additional information in graph output file
    -f <file>   Read input from <file>
    -B          Show progress bar (use with -p, -q or -P)
    -z          Enable colors (in interactive mode: disable colors)
    -Z          Enable colors for dotbracket and shape strings
    -D <string> Convert dotbracket-string to shape (choose type with -t)

    Only a subset of the options listed above is exposed through
    ``_parameters``. The input file is always handed to the program through
    the -f parameter (see the ``_input_as_*`` handlers).
    """

    # Sequence analysis modes. Standard shape folding (-a) is the only mode
    # switched on by default; the others are alternatives that a caller
    # enables explicitly.
    _sequence_analysis = {
        '-a': FlagParameter(Prefix='-', Name='a', Value=True),
        '-s': FlagParameter(Prefix='-', Name='s', Value=False),
        '-p': FlagParameter(Prefix='-', Name='p', Value=False),
        '-q': FlagParameter(Prefix='-', Name='q', Value=False),
        '-P': ValuedParameter(Prefix='-', Name='P', Value=None,
                              Delimiter=' '),
        '-i': ValuedParameter(Prefix='-', Name='i', Value=None,
                              Delimiter=' '),
        # Consensus shapes is a separate mode, so it starts off like the
        # other non-standard modes.
        '-C': FlagParameter(Prefix='-', Name='C', Value=False)}

    _additional_modes = {
        '-r': FlagParameter(Prefix='-', Name='r', Value=False)}

    _analysis_control = {
        '-e': ValuedParameter(Prefix='-', Name='e', Value=None,
                              Delimiter=' '),
        # NOTE: the controller sets the energy range to 20, whereas the
        # option summary in the class docstring lists 10 for -c.
        '-c': ValuedParameter(Prefix='-', Name='c', Value=20,
                              Delimiter=' '),
        '-t': ValuedParameter(Prefix='-', Name='t', Value=None,
                              Delimiter=' '),
        '-F': ValuedParameter(Prefix='-', Name='F', Value=None,
                              Delimiter=' '),
        '-T': ValuedParameter(Prefix='-', Name='T', Value=None,
                              Delimiter=' '),
        '-M': ValuedParameter(Prefix='-', Name='M', Value=None,
                              Delimiter=' '),
        # -l and -u take no value (see the option summary), so they are
        # plain switches rather than valued parameters.
        '-l': FlagParameter(Prefix='-', Name='l', Value=False),
        '-u': FlagParameter(Prefix='-', Name='u', Value=False)}

    _input_output = {
        '-o': ValuedParameter(Prefix='-', Name='o', Value=None,
                              Delimiter=' '),
        '-S': ValuedParameter(Prefix='-', Name='S', Value=None,
                              Delimiter=' '),
        '-#': ValuedParameter(Prefix='-', Name='#', Value=None,
                              Delimiter=' '),
        '-g': ValuedParameter(Prefix='-', Name='g', Value=None,
                              Delimiter=' '),
        '-L': FlagParameter(Prefix='-', Name='L', Value=False),
        '-N': FlagParameter(Prefix='-', Name='N', Value=False),
        '-f': ValuedParameter(Prefix='-', Name='f', Delimiter=' '),
        '-B': FlagParameter(Prefix='-', Name='B', Value=False),
        '-z': FlagParameter(Prefix='-', Name='z', Value=False),
        '-Z': FlagParameter(Prefix='-', Name='Z', Value=False)}

    # The full parameter table is the union of the four groups above.
    _parameters = {}
    _parameters.update(_sequence_analysis)
    _parameters.update(_additional_modes)
    _parameters.update(_analysis_control)
    _parameters.update(_input_output)

    _command = 'RNAshapes'
    _input_handler = '_input_as_string'

    def _input_as_lines(self, data):
        """Makes data the value of the -f parameter.

        data is passed through the parent class's ``_input_as_lines`` and
        the value it returns is stored in -f. Nothing is done when data is
        empty. Always returns the empty string, because the input reaches
        the command line through the parameter instead.
        """
        if data:
            self.Parameters['-f']\
                .on(super(RNAshapes, self)._input_as_lines(data))
        return ''

    def _input_as_string(self, data):
        """Makes data the value of the -f parameter.

        This method returns the empty string. The parameter will be printed
        automatically once set. Nothing is set when data is empty.
        """
        if data:
            self.Parameters['-f'].on(data)
        return ''
Example #22
0
class BWA_bwasw(BWA):
    """Application controller for the "bwasw" subcommand of bwa.

    Input files are supplied by key. The recognised keys, in the order
    given by ``_input_order``, are: prefix, query_fasta, _query_fasta_2.
    A key that starts with an underscore names an optional input.
    """

    # the subcommand for bwasw
    _subcommand = 'bwasw'

    # input file keys beginning with _ are optional inputs
    _input_order = ['prefix', 'query_fasta', '_query_fasta_2']

    _parameters = {
        # -a: score of a match [1]
        '-a': ValuedParameter(Prefix='-', Name='a', Delimiter=' '),

        # -b: mismatch penalty [3]
        '-b': ValuedParameter(Prefix='-', Name='b', Delimiter=' '),

        # -q: gap open penalty [5]
        '-q': ValuedParameter(Prefix='-', Name='q', Delimiter=' '),

        # -r: gap extension penalty
        '-r': ValuedParameter(Prefix='-', Name='r', Delimiter=' '),

        # -m: mask level [0.50]
        '-m': ValuedParameter(Prefix='-', Name='m', Delimiter=' '),

        # -t: number of threads in the multi-threading mode [1]
        '-t': ValuedParameter(Prefix='-', Name='t', Delimiter=' '),

        # -f: file that receives the results instead of stdout
        '-f': ValuedParameter(Prefix='-', Name='f', Delimiter=' '),

        # -w: band width in the banded alignment [33]
        '-w': ValuedParameter(Prefix='-', Name='w', Delimiter=' '),

        # -T: minimum score threshold divided by a [30]
        '-T': ValuedParameter(Prefix='-', Name='T', Delimiter=' '),

        # -c: coefficient for threshold adjustment according to query
        # length. For an l-long query, a hit is retained when it reaches
        # a*max{T,c*log(l)}. [5.5]
        '-c': ValuedParameter(Prefix='-', Name='c', Delimiter=' '),

        # -z: Z-best heuristics; a higher value increases accuracy at the
        # cost of speed [1]
        '-z': ValuedParameter(Prefix='-', Name='z', Delimiter=' '),

        # -s: maximum SA interval size for initiating a seed; a higher
        # value increases accuracy at the cost of speed [3]
        '-s': ValuedParameter(Prefix='-', Name='s', Delimiter=' '),

        # -N: minimum number of seeds supporting the resultant alignment
        # to trigger reverse alignment [5]
        '-N': ValuedParameter(Prefix='-', Name='N', Delimiter=' '),

        # -H: use hard clipping instead of soft clipping in SAM output
        '-H': FlagParameter(Prefix='-', Name='H'),

        # -M: mark multi-part alignments as secondary
        '-M': FlagParameter(Prefix='-', Name='M'),

        # -S: skip Smith-Waterman read pairing
        '-S': FlagParameter(Prefix='-', Name='S'),

        # -I: ignore pairs with insert >= INT when inferring the size
        # distribution [20000]
        '-I': ValuedParameter(Prefix='-', Name='I', Delimiter=' ')
    }

    # Validators keyed by parameter: floats first, then ints, then the
    # output path, which must be absolute.
    _valid_arguments = dict.fromkeys(['-c', '-m'], is_float)
    _valid_arguments.update(dict.fromkeys(
        ['-a', '-b', '-q', '-r', '-t', '-w', '-T', '-z', '-s', '-N', '-I'],
        is_int))
    _valid_arguments['-f'] = isabs

    def _get_result_paths(self, data):
        """Return the result paths of a bwa bwasw run.

        A bwa bwasw run has a single output, a .sam file, located at the
        value of the -f parameter. It is stored under the key 'output'.
        """
        sam_path = self.Parameters['-f'].Value
        return {'output': ResultPath(Path=sam_path, IsWritten=True)}
Example #23
0
class BWA_aln(BWA):
    """Application controller for the "aln" subcommand of bwa.

    Input files are supplied by key. The recognised keys, in the order
    given by ``_input_order``, are: prefix, fastq_in.
    """

    # the subcommand for bwa aln
    _subcommand = 'aln'

    # input file keys beginning with _ are optional inputs
    _input_order = ['prefix', 'fastq_in']

    _parameters = {
        # -n: max #diff (int) or missing prob under 0.02 err rate (float)
        # [0.04]
        '-n': ValuedParameter(Prefix='-', Name='n', Delimiter=' '),

        # -o: maximum number or fraction of gap opens [1]
        '-o': ValuedParameter(Prefix='-', Name='o', Delimiter=' '),

        # -e: maximum number of gap extensions, -1 for disabling long
        # gaps [-1]
        '-e': ValuedParameter(Prefix='-', Name='e', Delimiter=' '),

        # -i: do not put an indel within bp towards the ends [5]
        '-i': ValuedParameter(Prefix='-', Name='i', Delimiter=' '),

        # -d: maximum occurrences for extending a long deletion [10]
        '-d': ValuedParameter(Prefix='-', Name='d', Delimiter=' '),

        # -l: seed length [32]
        '-l': ValuedParameter(Prefix='-', Name='l', Delimiter=' '),

        # -k: maximum differences in the seed [2]
        '-k': ValuedParameter(Prefix='-', Name='k', Delimiter=' '),

        # -m: maximum entries in the queue [2000000]
        '-m': ValuedParameter(Prefix='-', Name='m', Delimiter=' '),

        # -t: number of threads [1]
        '-t': ValuedParameter(Prefix='-', Name='t', Delimiter=' '),

        # -M: mismatch penalty [3]
        '-M': ValuedParameter(Prefix='-', Name='M', Delimiter=' '),

        # -O: gap open penalty [11]
        '-O': ValuedParameter(Prefix='-', Name='O', Delimiter=' '),

        # -E: gap extension penalty [4]
        '-E': ValuedParameter(Prefix='-', Name='E', Delimiter=' '),

        # -R: stop searching when there are > equally best hits [30]
        '-R': ValuedParameter(Prefix='-', Name='R', Delimiter=' '),

        # -q: quality threshold for read trimming down to 35bp [0]
        '-q': ValuedParameter(Prefix='-', Name='q', Delimiter=' '),

        # -f: file that receives the output instead of stdout
        '-f': ValuedParameter(Prefix='-', Name='f', Delimiter=' '),

        # -B: length of barcode
        '-B': ValuedParameter(Prefix='-', Name='B', Delimiter=' '),

        # -L: log-scaled gap penalty for long deletions
        '-L': FlagParameter(Prefix='-', Name='L'),

        # -N: non-iterative mode: search for all n-difference hits (slow)
        '-N': FlagParameter(Prefix='-', Name='N'),

        # -I: the input is in the Illumina 1.3+ FASTQ-like format
        '-I': FlagParameter(Prefix='-', Name='I'),

        # -b: the input read file is in the BAM format
        '-b': FlagParameter(Prefix='-', Name='b'),

        # -0: use single-end reads only (effective with -b)
        '-0': FlagParameter(Prefix='-', Name='0'),

        # -1: use the 1st read in a pair (effective with -b)
        '-1': FlagParameter(Prefix='-', Name='1'),

        # -2: use the 2nd read in a pair (effective with -b)
        '-2': FlagParameter(Prefix='-', Name='2'),

        # -Y: filter Casava-filtered sequences
        '-Y': FlagParameter(Prefix='-', Name='Y')
    }

    # Validators keyed by parameter: -n is checked as a decimal number,
    # the listed options as integers, and -f as an absolute file path.
    _valid_arguments = {'-n': is_float}
    _valid_arguments.update(dict.fromkeys(
        ['-o', '-e', '-i', '-d', '-l', '-k', '-m', '-t', '-M', '-O', '-E',
         '-R', '-q', '-B'],
        is_int))
    _valid_arguments['-f'] = isabs

    def _get_result_paths(self, data):
        """Return the result paths of a bwa aln run.

        A bwa aln run has a single output, a .sai file, located at the
        value of the -f parameter. It is stored under the key 'output'.
        """
        sai_path = self.Parameters['-f'].Value
        return {'output': ResultPath(Path=sai_path, IsWritten=True)}
Example #24
0
class SeqPrep(CommandLineApplication):
    """SeqPrep application controller for joining paired-end reads"""
    _command = 'SeqPrep'
    _parameters = {
        # Required Arguments
        # -f <first read input fastq filename>
        # -r <second read input fastq filename>
        # -1 <first read output fastq filename>
        # -2 <second read output fastq filename>
        '-f': ValuedParameter(Prefix='-', Delimiter=' ', Name='f'),
        '-r': ValuedParameter(Prefix='-', Delimiter=' ', Name='r'),
        '-1': ValuedParameter(Prefix='-', Delimiter=' ', Name='1'),
        '-2': ValuedParameter(Prefix='-', Delimiter=' ', Name='2'),

        # General Arguments (Optional):
        # -3 <first read discarded fastq filename>
        # -4 <second read discarded fastq filename>
        # -h Display this help message and exit (also works with no args)
        # -6 Input sequence is in phred+64 rather than phred+33 format, the
        #    output will still be phred+33
        # -q <Quality score cutoff for mismatches to be counted in overlap;
        #    default = 13>
        # -L <Minimum length of a trimmed or merged read to print it;
        #    default = 30>
        '-3': ValuedParameter(Prefix='-', Delimiter=' ', Name='3'),
        '-4': ValuedParameter(Prefix='-', Delimiter=' ', Name='4'),
        '-h': FlagParameter(Prefix='-', Name='h'),
        '-6': FlagParameter(Prefix='-', Name='6'),
        '-q': ValuedParameter(Prefix='-', Delimiter=' ', Name='q'),
        '-L': ValuedParameter(Prefix='-', Delimiter=' ', Name='L'),

        # Arguments for Adapter/Primer Trimming (Optional):
        # -A <forward read primer/adapter sequence to trim as it would
        #   appear at the end of a read (recommend about 20bp of this)
        #   (should validate by grepping a file);
        #   default (genomic non-multiplexed adapter1) = AGATCGGAAGAGCGGTTCAG>
        # -B <reverse read primer/adapter sequence to trim as it would
        #   appear at the end of a read (recommend about 20bp of this)
        #   (should validate by grepping a file);
        #   default (genomic non-multiplexed adapter2) = AGATCGGAAGAGCGTCGTGT>
        # -O <minimum overall base pair overlap with adapter sequence to trim;
        #   default = 10>
        # -M <maximum fraction of good quality mismatching bases for
        #   primer/adapter overlap; default = 0.020000>
        # -N <minimum fraction of matching bases for primer/adapter overlap;
        #   default = 0.870000>
        # -b <adapter alignment band-width; default = 50>
        # -Q <adapter alignment gap-open; default = 8>
        # -t <adapter alignment gap-extension; default = 2>
        # -e <adapter alignment gap-end; default = 2>
        # -Z <adapter alignment minimum local alignment score cutoff
        #   [roughly (2*num_hits) - (num_gaps*gap_open) - (num_gaps*gap_close) -
        #   (gap_len*gap_extend) - (2*num_mismatches)]; default = 26>
        # -w <read alignment band-width; default = 50>
        # -W <read alignment gap-open; default = 26>
        # -p <read alignment gap-extension; default = 9>
        # -P <read alignment gap-end; default = 5>
        # -X <read alignment maximum fraction gap cutoff; default = 0.125000>
        '-A': ValuedParameter(Prefix='-', Delimiter=' ', Name='A'),
        '-B': ValuedParameter(Prefix='-', Delimiter=' ', Name='B'),
        '-O': ValuedParameter(Prefix='-', Delimiter=' ', Name='O'),
        '-M': ValuedParameter(Prefix='-', Delimiter=' ', Name='M'),
        '-N': ValuedParameter(Prefix='-', Delimiter=' ', Name='N'),
        '-b': ValuedParameter(Prefix='-', Delimiter=' ', Name='b'),
        '-Q': ValuedParameter(Prefix='-', Delimiter=' ', Name='Q'),
        '-t': ValuedParameter(Prefix='-', Delimiter=' ', Name='t'),
        '-e': ValuedParameter(Prefix='-', Delimiter=' ', Name='e'),
        '-Z': ValuedParameter(Prefix='-', Delimiter=' ', Name='Z'),
        '-w': ValuedParameter(Prefix='-', Delimiter=' ', Name='w'),
        '-W': ValuedParameter(Prefix='-', Delimiter=' ', Name='W'),
        '-p': ValuedParameter(Prefix='-', Delimiter=' ', Name='p'),
        '-P': ValuedParameter(Prefix='-', Delimiter=' ', Name='P'),
        '-X': ValuedParameter(Prefix='-', Delimiter=' ', Name='X'),

        # Optional Arguments for Merging:
        # -y <maximum quality score in output ((phred 33) default = ']' )>
        # -g <print overhang when adapters are present and stripped (use this
        #   if reads are different length)>
        # -s <perform merging and output the merged reads to this file>
        # -E <write pretty alignments to this file for visual Examination>
        # -x <max number of pretty alignments to write (if -E provided);
        #   default = 10000>
        # -o <minimum overall base pair overlap to merge two reads;
        #   default = 15>
        # -m <maximum fraction of good quality mismatching bases to overlap
        #   reads; default = 0.020000>
        # -n <minimum fraction of matching bases to overlap reads;
        #   default = 0.900000>
        '-y': ValuedParameter(Prefix='-', Delimiter=' ', Name='y'),
        # The flag name must be 'g'; with Name='y' switching -g on would
        # put a bare "-y" on the command line instead.
        '-g': FlagParameter(Prefix='-', Name='g'),
        '-s': ValuedParameter(Prefix='-', Delimiter=' ', Name='s'),
        '-E': ValuedParameter(Prefix='-', Delimiter=' ', Name='E'),
        '-x': ValuedParameter(Prefix='-', Delimiter=' ', Name='x'),
        '-o': ValuedParameter(Prefix='-', Delimiter=' ', Name='o'),
        '-m': ValuedParameter(Prefix='-', Delimiter=' ', Name='m'),
        '-n': ValuedParameter(Prefix='-', Delimiter=' ', Name='n')
    }

    def _required_out_path(self, flag, error_message):
        """Returns the value of an output-path parameter as an absolute path.

        flag: key of the parameter in self.Parameters (e.g. '-1').
        error_message: text of the ValueError raised when the parameter is
            not switched on.
        """
        parameter = self.Parameters[flag]
        if not parameter.isOn():
            raise ValueError(error_message)
        return self._absolute(str(parameter.Value))

    def _unassembled_reads1_out_file_name(self):
        """Checks file name is set for reads1 output.
           Returns absolute path. Raises ValueError if -1 is not set."""
        return self._required_out_path(
            '-1', "No reads1 (flag: -1) output path specified")

    def _unassembled_reads2_out_file_name(self):
        """Checks if file name is set for reads2 output.
           Returns absolute path. Raises ValueError if -2 is not set."""
        return self._required_out_path(
            '-2', "No reads2 (flag -2) output path specified")

    def _discarded_reads1_out_file_name(self):
        """Checks if file name is set for discarded reads1 output.
           Returns absolute path. Raises ValueError if -3 is not set."""
        return self._required_out_path(
            '-3', "No discarded-reads1 (flag -3) output path specified")

    def _discarded_reads2_out_file_name(self):
        """Checks if file name is set for discarded reads2 output.
           Returns absolute path. Raises ValueError if -4 is not set."""
        return self._required_out_path(
            '-4', "No discarded-reads2 (flag -4) output path specified")

    def _assembled_out_file_name(self):
        """Checks file name is set for assembled output.
           Returns absolute path. Raises ValueError if -s is not set."""
        return self._required_out_path(
            '-s', "No assembled-reads (flag -s) output path specified")

    def _pretty_alignment_out_file_name(self):
        """Checks file name is set for pretty alignment output.
           Returns absolute path. Raises ValueError if -E is not set."""
        return self._required_out_path(
            '-E', "No pretty-alignment (flag -E) output path specified")

    def _get_result_paths(self, data):
        """Captures SeqPrep output.

        Returns a dict of ResultPath objects. 'UnassembledReads1' and
        'UnassembledReads2' are always included, so a ValueError is raised
        when -1 or -2 is not set. 'Assembled', 'Reads1Discarded',
        'Reads2Discarded' and 'PrettyAlignments' are included only when the
        corresponding parameter (-s, -3, -4, -E) is switched on.
        """
        result = {}

        # Always output:
        result['UnassembledReads1'] = ResultPath(
            Path=self._unassembled_reads1_out_file_name(), IsWritten=True)
        result['UnassembledReads2'] = ResultPath(
            Path=self._unassembled_reads2_out_file_name(), IsWritten=True)

        # Optional outputs: (parameter, result key, path getter). Each one is
        # reported only when its parameter has been switched on.
        optional_outputs = (
            ('-s', 'Assembled', self._assembled_out_file_name),
            ('-3', 'Reads1Discarded', self._discarded_reads1_out_file_name),
            ('-4', 'Reads2Discarded', self._discarded_reads2_out_file_name),
            ('-E', 'PrettyAlignments', self._pretty_alignment_out_file_name),
        )
        for flag, key, path_getter in optional_outputs:
            if self.Parameters[flag].isOn():
                result[key] = ResultPath(Path=path_getter(), IsWritten=True)

        return result

    def getHelp(self):
        """seqprep help"""
        help_str = """
        For basic help, type the following at the command line:
            'SeqPrep -h'

        Website:
            https://github.com/jstjohn/SeqPrep
        """
        return help_str
Example #25
0
class BWA_index(BWA):
    """Application controller for the "index" subcommand of bwa.

    Input files are supplied by key; the only recognised key is fasta_in.
    """

    # the subcommand for bwa index
    _subcommand = "index"

    # Position-specific arguments are written into the base command in
    # this order. Input file keys beginning with _ are optional inputs.
    _input_order = ['fasta_in']

    _parameters = {
        # -a: which algorithm to use.
        #
        # is
        #   IS linear-time algorithm for constructing suffix array. It
        #   requires 5.37N memory where N is the size of the database. IS
        #   is moderately fast, but does not work with database larger
        #   than 2GB. IS is the default algorithm due to its simplicity.
        #   The current codes for IS algorithm are reimplemented by Yuta
        #   Mori.
        #
        # bwtsw
        #   Algorithm implemented in BWT-SW. This method works with the
        #   whole human genome, but it does not work with database smaller
        #   than 10MB and it is usually slower than IS.
        #
        # DEFAULTs to auto-select (based on input fasta file size)
        '-a': ValuedParameter(Prefix='-', Name='a', Delimiter=' '),

        # -p: prefix for the output index.
        # DEFAULTs to the base name of the input fasta file
        '-p': ValuedParameter(Prefix='-', Name='p', Delimiter=' '),

        # -6: index files named as <in.fasta>.64.* instead of <in.fasta>.*
        '-6': FlagParameter(Prefix='-', Name='6')
    }

    # -a accepts only one of two values. -p lets the user choose a prefix,
    # which for our purposes should be an absolute path.
    _valid_arguments = {
        '-a': lambda x: x in ('is', 'bwtsw'),
        '-p': isabs,
    }

    def _get_result_paths(self, data):
        """Return the result paths of a bwa index run.

        bwa index writes 5 files when the index is created. Their common
        filename prefix is the input fasta path (data['fasta_in']) unless
        the -p option is on, in which case its value is used. The returned
        dictionary is keyed by the 5 extensions, period included:

        .amb
        .ann
        .bwt
        .pac
        .sa
        """
        prefix_param = self.Parameters['-p']
        if prefix_param.isOn():
            index_prefix = prefix_param.Value
        else:
            index_prefix = data['fasta_in']

        extensions = ('.amb', '.ann', '.bwt', '.pac', '.sa')
        return dict(
            (ext, ResultPath(Path=index_prefix + ext, IsWritten=True))
            for ext in extensions)
Example #26
0
class Detrender(CommandLineApplication):
    """Application controller for detrending ordination coordinates.

    The command that is built starts R and feeds it the script named by
    ``_r_script`` on standard input (see ``_get_base_command``).
    """

    _command = 'R'
    _r_script = 'detrend.r'
    _parameters = {
        # -i: input PCoA file
        '-i': ValuedParameter(Prefix='-', Name='i', Delimiter=' ',
                              IsPath=True),
        # -m: metadata table
        '-m': ValuedParameter(Prefix='-', Name='m', Delimiter=' ',
                              IsPath=True),
        # -c: gradient variable column header name
        '-c': ValuedParameter(Prefix='-', Name='c', Delimiter=' '),
        # -o: output directory
        '-o': ValuedParameter(Prefix='-', Name='o', Delimiter=' ',
                              IsPath=True),
        # -r: use pre-rotation for optimal alignment with known gradient
        '-r': FlagParameter(Prefix='-', Name='r'),
    }
    _input_handler = '_input_as_parameter'
    _suppress_stdout = False
    _suppress_stderr = False

    def _input_as_parameter(self, data):
        """Store data (the input filepath) in the -i parameter.

        Returns the empty string.
        """
        self.Parameters['-i'].on(data)
        return ''

    def _get_result_paths(self, data):
        """Build the dict of result filepaths.

        All three paths live in the output directory given by -o. The
        'summary' entry is marked as written only when -c is on.
        """
        # Read the output dir from self.Parameters so that it has already
        # been cast to a FilePath.
        out_dir = self.Parameters['-o'].Value

        return {
            # the before/after coords plot
            'plot': ResultPath(
                Path=join(out_dir, 'PCoA_vs_projection.pdf'),
                IsWritten=True),
            # the detrended coords file
            'coords': ResultPath(
                Path=join(out_dir, 'detrended_pcoa.txt'),
                IsWritten=True),
            # the summary file
            'summary': ResultPath(
                Path=join(out_dir, 'summary.txt'),
                IsWritten=self.Parameters['-c'].isOn()),
        }

    def _get_R_script_dir(self):
        """Return the path to the qiime R source directory."""
        return join(get_qiime_project_dir(), 'qiime', 'support_files', 'R')

    def _get_R_script_path(self):
        """Return the path to the R script to be executed."""
        script_dir = self._get_R_script_dir()
        return join(script_dir, self._r_script)

    # Overridden to add R-specific command-line arguments. The command has
    # the form:
    # cd <WorkingDir>; R --slave --no-restore --args
    #     --source_dir <R script dir> <parameters> < <R script path>
    def _get_base_command(self):
        """Return the base command plus R command-line options.

        Adapted from RDP Classifier app controller.
        """
        change_dir = 'cd ' + str(self.WorkingDir) + ';'
        r_invocation = self._commandline_join(
            ['R', '--slave', '--no-restore', '--args'])
        source_dir = self._commandline_join(
            ['--source_dir', self._get_R_script_dir()])
        script_args = self._commandline_join(
            [self.Parameters[name] for name in self._parameters])

        # The R script itself is redirected into R's standard input.
        full_command = self._commandline_join([
            change_dir,
            r_invocation,
            source_dir,
            script_args,
            '<',
            self._get_R_script_path(),
        ])
        return full_command.strip()

    BaseCommand = property(_get_base_command)

    def _commandline_join(self, tokens):
        """Format a list of tokens as a shell command.

        Each token is converted with str(); tokens whose text is empty are
        dropped and the rest are joined with self._command_delimiter.

        Taken from RDP Classifier app controller.
        """
        pieces = [text for text in map(str, tokens) if text]
        return self._command_delimiter.join(pieces).strip()

    def _accept_exit_status(self, exit_status):
        """Return True only when the exit status equals 0."""
        return exit_status == 0
Example #27
0
class BWA_sampe(BWA):
    """Application controller for the "sampe" subcommand of bwa.

    Input files are supplied by key. The recognised keys, in the order
    given by ``_input_order``, are: prefix, sai1_in, sai2_in, fastq1_in,
    fastq2_in.
    """

    # the subcommand for sampe
    _subcommand = 'sampe'

    # input file keys beginning with _ are optional inputs
    _input_order = ['prefix', 'sai1_in', 'sai2_in', 'fastq1_in', 'fastq2_in']

    _parameters = {
        # -a: maximum insert size for a read pair to be considered being
        # mapped properly
        '-a': ValuedParameter(Prefix='-', Name='a', Delimiter=' '),

        # -o: maximum occurrences of a read for pairing
        '-o': ValuedParameter(Prefix='-', Name='o', Delimiter=' '),

        # -P: load the entire FM-index into memory to reduce disk
        # operations
        '-P': FlagParameter(Prefix='-', Name='P'),

        # -n: maximum hits to output for paired reads [3]
        '-n': ValuedParameter(Prefix='-', Name='n', Delimiter=' '),

        # -N: maximum hits to output for discordant pairs [10]
        '-N': ValuedParameter(Prefix='-', Name='N', Delimiter=' '),

        # -f: file that receives the output instead of stdout
        '-f': ValuedParameter(Prefix='-', Name='f', Delimiter=' '),

        # -r: read group, in a format like '@RG\tID:foo\tSM:bar'
        '-r': ValuedParameter(Prefix='-', Name='r', Delimiter=' '),

        # -s: disable Smith-Waterman for the unmapped mate
        '-s': FlagParameter(Prefix='-', Name='s'),

        # -c: prior of chimeric rate (lower bound) [1.0e-05]
        '-c': ValuedParameter(Prefix='-', Name='c', Delimiter=' '),

        # -A: disable insert size estimate (force -s)
        '-A': FlagParameter(Prefix='-', Name='A')
    }

    # Validators keyed by parameter: -c is checked as a float, the listed
    # options as ints, and -f as an absolute file path.
    _valid_arguments = {'-c': is_float}
    _valid_arguments.update(dict.fromkeys(['-a', '-o', '-n', '-N'], is_int))
    _valid_arguments['-f'] = isabs

    def _get_result_paths(self, data):
        """Return the result paths of a bwa sampe run.

        A bwa sampe run has a single output, a .sam file, located at the
        value of the -f parameter. It is stored under the key 'output'.
        """
        sam_path = self.Parameters['-f'].Value
        return {'output': ResultPath(Path=sam_path, IsWritten=True)}
Example #28
0
class Pplacer(CommandLineApplication):
    """Application controller for pplacer.

    Declares the pplacer command-line options, forces temporary input
    files to carry a .fasta suffix, and reports the .jplace file written
    to the directory given by --out-dir as the result.
    """

    _command = 'pplacer'
    _input_handler = '_input_as_multiline_string'
    _parameters = {
        # -c Specify the path to the reference package.
        '-c': ValuedParameter(Prefix='-', Name='c', Delimiter=' ',
                              IsPath=True),

        # -t Specify the reference tree filename.
        '-t': ValuedParameter(Prefix='-', Name='t', Delimiter=' ',
                              IsPath=True),

        # -r Specify the reference alignment filename.
        '-r': ValuedParameter(Prefix='-', Name='r', Delimiter=' ',
                              IsPath=True),

        # -s Supply a phyml stats.txt or a RAxML info file giving the
        #    model parameters.
        '-s': ValuedParameter(Prefix='-', Name='s', Delimiter=' ',
                              IsPath=True),

        # -d Specify the directory containing the reference information.
        '-d': ValuedParameter(Prefix='-', Name='d', Delimiter=' ',
                              IsPath=True),

        # -p Calculate posterior probabilities.
        '-p': FlagParameter(Prefix='-', Name='p'),

        # -m Substitution model. Protein: LG, WAG, or JTT. Nucleotides: GTR.
        '-m': ValuedParameter(Prefix='-', Name='m', Delimiter=' '),

        # --model-freqs Use model frequencies instead of reference
        #               alignment frequencies.
        '--model-freqs': FlagParameter(Prefix='--', Name='model-freqs'),

        # --gamma-cats Number of categories for discrete gamma model.
        '--gamma-cats': ValuedParameter(Prefix='--', Name='gamma-cats',
                                        Delimiter=' '),

        # --gamma-alpha Specify the shape parameter for a discrete gamma
        #               model.
        '--gamma-alpha': ValuedParameter(Prefix='--', Name='gamma-alpha',
                                         Delimiter=' '),

        # --ml-tolerance 1st stage branch len optimization tolerance
        #                (2nd stage to 1e-5). Default: 0.01.
        '--ml-tolerance': ValuedParameter(Prefix='--', Name='ml-tolerance',
                                          Delimiter=' '),

        # --pp-rel-err Relative error for the posterior probability
        #              calculation. Default is 0.01.
        '--pp-rel-err': ValuedParameter(Prefix='--', Name='pp-rel-err',
                                        Delimiter=' '),

        # --unif-prior Use a uniform prior rather than exponential.
        '--unif-prior': FlagParameter(Prefix='--', Name='unif-prior'),

        # --start-pend Starting pendant branch length. Default is 0.1.
        '--start-pend': ValuedParameter(Prefix='--', Name='start-pend',
                                        Delimiter=' '),

        # --max-pend Set the maximum ML pendant branch length. Default is 2.
        '--max-pend': ValuedParameter(Prefix='--', Name='max-pend',
                                      Delimiter=' '),

        # --max-strikes Maximum number of strikes for baseball.
        #               0 -> no ball playing. Default is 6.
        '--max-strikes': ValuedParameter(Prefix='--', Name='max-strikes',
                                         Delimiter=' '),

        # --strike-box Set the size of the strike box in log likelihood
        #              units. Default is 3.
        '--strike-box': ValuedParameter(Prefix='--', Name='strike-box',
                                        Delimiter=' '),

        # --max-pitches Set the maximum number of pitches for baseball.
        #               Default is 40.
        '--max-pitches': ValuedParameter(Prefix='--', Name='max-pitches',
                                         Delimiter=' '),

        # --fantasy Desired likelihood cutoff for fantasy baseball mode.
        #           0 -> no fantasy.
        '--fantasy': ValuedParameter(Prefix='--', Name='fantasy',
                                     Delimiter=' '),

        # --fantasy-frac Fraction of fragments to use when running fantasy
        #                baseball. Default is 0.1.
        '--fantasy-frac': ValuedParameter(Prefix='--', Name='fantasy-frac',
                                          Delimiter=' '),

        # --write-masked Write alignment masked to the region without gaps
        #                in the query.
        '--write-masked': FlagParameter(Prefix='--', Name='write-masked'),

        # --verbosity Set verbosity level. 0 is silent, and 2 is quite a
        #             lot. Default is 1.
        '--verbosity': ValuedParameter(Prefix='--', Name='verbosity',
                                       Delimiter=' '),

        # --unfriendly Do not run friend finder pre-analysis.
        '--unfriendly': FlagParameter(Prefix='--', Name='unfriendly'),

        # --out-dir Specify the directory to write place files to.
        '--out-dir': ValuedParameter(Prefix='--', Name='out-dir',
                                     Delimiter=' ', IsPath=True),

        # --pretend Only check out the files then report. Do not run the
        #           analysis.
        '--pretend': FlagParameter(Prefix='--', Name='pretend'),

        # --csv Make a CSV file with the results.
        '--csv': FlagParameter(Prefix='--', Name='csv'),

        # --old-format Make an old-format placefile with the results.
        '--old-format': FlagParameter(Prefix='--', Name='old-format'),

        # --diagnostic Write file describing the 'diagnostic' mutations for
        #              various clades.
        '--diagnostic': FlagParameter(Prefix='--', Name='diagnostic'),

        # --check-like Write out the likelihood of the reference tree,
        #              calculated two ways.
        '--check-like': FlagParameter(Prefix='--', Name='check-like'),

        # --version Write out the version number and exit.
        '--version': FlagParameter(Prefix='--', Name='version'),

        # --help Display this list of options.
        '--help': FlagParameter(Prefix='--', Name='help'),
    }

    def getTmpFilename(self, tmp_dir="/tmp", prefix='tmp', suffix='.fasta',
                       include_class_id=False, result_constructor=FilePath):
        """Return a temporary filename that ends in .fasta by default.

        pplacer requires the suffix of its input to be .fasta, so this
        override only changes the default suffix; every argument is passed
        through unchanged to the parent implementation.
        """
        parent = super(Pplacer, self)
        return parent.getTmpFilename(
            tmp_dir=tmp_dir,
            prefix=prefix,
            suffix=suffix,
            include_class_id=include_class_id,
            result_constructor=result_constructor)

    def _handle_app_result_build_failure(self, out, err, exit_status,
                                         result_paths):
        """Raise ApplicationError when the result files were not produced.

        The contents of ``out`` are embedded in the error message; the
        remaining arguments are not used.
        """
        message = ('Pplacer failed to produce an output file due to the '
                   'following error: \n\n%s ' % out.read())
        raise ApplicationError(message)

    def _get_result_paths(self, data):
        """Return the result paths: a single 'json' entry.

        The path is the --out-dir directory joined with the base name of
        the input file, its extension replaced by '.jplace'.
        """
        out_dir = self.Parameters['--out-dir'].Value
        input_name = split(self._input_filename)[-1]
        stem = splitext(input_name)[0]
        return {'json': ResultPath(Path=join(out_dir, stem + '.jplace'))}
Example #29
0
class Uclust(CommandLineApplication):
    """Application controller for uclust.

    Input is supplied as a mapping of per-run parameters (see
    ``_input_as_parameters``); the files named by --output, --uc and
    --fastapairs are reported as results when those parameters are on.
    """

    _command = 'uclust'
    _input_handler = '_input_as_parameters'
    _parameters = {
        # Fasta input file for merge-sort function
        '--mergesort': ValuedParameter('--', Name='mergesort', Delimiter=' ',
                                       IsPath=True),

        # Output file, used by several different functions
        '--output': ValuedParameter('--', Name='output', Delimiter=' ',
                                    IsPath=True),

        # Sets temp directory for uclust to create temp fasta file
        '--tmpdir': ValuedParameter('--', Name='tmpdir', Delimiter=' ',
                                    IsPath=True),

        # input filename, fasta format
        '--input': ValuedParameter('--', Name='input', Delimiter=' ',
                                   IsPath=True),

        # Output filename will be in uclust (.uc) format
        # Output cluster file, required parameter
        '--uc': ValuedParameter('--', Name='uc', Delimiter=' ',
                                IsPath=True),

        # ID percent for OTU, by default is 97%
        '--id': ValuedParameter('--', Name='id', Delimiter=' ', IsPath=False),

        # Enable reverse comparison (i.e. norev disabled); with it on,
        # memory usage is expected to double for uclust
        '--rev': FlagParameter('--', Name='rev'),

        # 'library' file -- a reference of sequences representing
        # pre-existing clusters
        '--lib': ValuedParameter('--', Name='lib', Delimiter=' ',
                                 IsPath=True),

        # only compare sequences to the library file, don't add new clusters
        # for sequences which don't hit the library
        '--libonly': FlagParameter('--', Name='libonly'),

        # Maximum hits before quitting search (default 1, 0=infinity).
        '--maxaccepts': ValuedParameter('--', Name='maxaccepts',
                                        Delimiter=' '),

        # Maximum rejects before quitting search (default 8, 0=infinity).
        '--maxrejects': ValuedParameter('--', Name='maxrejects',
                                        Delimiter=' '),

        # Target nr. of common words (default 8, 0=don't step)
        '--stepwords': ValuedParameter('--', Name='stepwords', Delimiter=' '),

        # Word length for windex (default 5 aa.s, 8 nuc.s).
        '--w': ValuedParameter('--', Name='w', Delimiter=' '),

        # output fp for pairwise aligned sequences
        '--fastapairs': ValuedParameter('--', Name='fastapairs',
                                        Delimiter=' ', IsPath=True),

        # input filename, .uc format
        '--uc2clstr': ValuedParameter('--', Name='uc2clstr', Delimiter=' ',
                                      IsPath=True),

        # Don't assume input is sorted by length (default assume sorted).
        '--usersort': FlagParameter('--', Name='usersort'),

        # Same as --maxrejects 0 --nowordcountreject.
        # comes with a performance hit.
        '--exact': FlagParameter('--', Name='exact'),

        # Same as --maxrejects 0 --maxaccepts 0 --nowordcountreject --
        # comes with a performance hit.
        '--optimal': FlagParameter('--', Name='optimal'),

        '--stable_sort': FlagParameter('--', Name='stable_sort'),

        # From uclust help:
        # Write all accepts to .uc file (default top hit/no match only).
        '--allhits': FlagParameter('--', Name='allhits'),
    }

    _suppress_stdout = False
    _suppress_stderr = False

    def _input_as_parameters(self, data):
        """Apply the per-run parameters in ``data`` and return ''.

        data -- mapping of parameter name to value.  Only '--input',
            '--uc', '--fastapairs', '--uc2clstr', '--output' and
            '--mergesort' may be given.

        Every allowed parameter is first switched off, then switched on
        with the supplied value if it is present in ``data``, so settings
        never leak from one run into the next.

        Raises ApplicationError if ``data`` contains any other key.
        """
        # The list of values which can be passed on a per-run basis
        allowed_values = ['--input', '--uc', '--fastapairs',
                          '--uc2clstr', '--output', '--mergesort']

        unsupported_parameters = set(data) - set(allowed_values)
        if unsupported_parameters:
            # Call-form raise is valid on both Python 2 and Python 3; the
            # names are sorted so the message is deterministic.
            raise ApplicationError(
                "Unsupported parameter(s) passed when calling uclust: %s" %
                ' '.join(sorted(unsupported_parameters)))

        for v in allowed_values:
            # turn the parameter off so subsequent runs are not
            # affected by parameter settings from previous runs
            self.Parameters[v].off()
            if v in data:
                # turn the parameter on if specified by the user
                self.Parameters[v].on(data[v])

        return ''

    def _get_result_paths(self, data):
        """Return the result paths for this run.

        Keys are 'Output' (--output), 'ClusterFile' (--uc) and
        'PairwiseAlignments' (--fastapairs); each entry is marked as
        written only when the corresponding parameter is on.
        """
        result = {}

        result['Output'] = ResultPath(
            Path=self.Parameters['--output'].Value,
            IsWritten=self.Parameters['--output'].isOn())

        result['ClusterFile'] = ResultPath(
            Path=self.Parameters['--uc'].Value,
            IsWritten=self.Parameters['--uc'].isOn())

        result['PairwiseAlignments'] = ResultPath(
            Path=self.Parameters['--fastapairs'].Value,
            IsWritten=self.Parameters['--fastapairs'].isOn())

        return result

    def _accept_exit_status(self, exit_status):
        """Return True only when ``exit_status`` is 0.

        uclust can seg fault and still generate a parsable .uc file,
        so we explicitly check the exit status.
        """
        return exit_status == 0

    def getHelp(self):
        """Return a help string that points to the uclust documentation."""
        help_str = """
        UCLUST is hosted at:
        http://www.drive5.com/uclust/

        The following papers should be cited if this resource is used:

        Paper pending. Check with Robert Edgar who is writing the paper
        for uclust as of March 2010.  Cite the above URL for the time being.
        """
        return help_str
Example #30
0
class PandaSeq(CommandLineApplication):
    """pandaseq application controller for joining paired-end reads """
    _command = 'pandaseq'
    # Command-line options understood by pandaseq, keyed by option name.
    _parameters = {
        # pandaseq 2.4 <*****@*****.**>
        # Usage: pandaseq -f forward.fastq -r reverse.fastq [-6] [-a] [-B]
        #    [-C module1 -C module2 ...] [-d flags] [-F] [-j] [-L maxlen]
        #    [-l minlen] [-N] [-o minoverlap] [-p forwardprimer]
        #    [-q reverseprimer] [-T threads] [-t threshold] > assembled.fastq

        # -6  Use PHRED+64 (CASAVA 1.3-1.7) instead of PHRED+33 (CASAVA 1.8+).
        '-6': FlagParameter(Prefix='-', Name='6'),

        # -a  Strip the primers after assembly, rather than before.
        '-a': FlagParameter(Prefix='-', Name='a'),

        # -B  Allow unbarcoded sequences (try this for BADID errors).
        '-B': FlagParameter(Prefix='-', Name='B'),

        # -C module   Load a sequence validation module.
        # The option takes the module as its argument (see the usage line
        # above), so it has to be a valued parameter: a bare flag could
        # never carry the module name.
        '-C': ValuedParameter(Prefix='-', Delimiter=' ', Name='C'),

        # -d flags    Control the logging messages. Capital to enable; small
        #             to disable.
        #    (R)econstruction detail.
        #    Sequence (b)uilding information.
        #    (F)ile processing.
        #    (k)-mer table construction.
        #    Show every (m)ismatch.
        #    Optional (s)tatistics.
        '-d': ValuedParameter(Prefix='-', Delimiter=' ', Name='d'),

        # -f  Input FASTQ file containing forward reads.
        '-f': ValuedParameter(Prefix='-', Delimiter=' ', Name='f'),

        # -F  Output FASTQ instead of FASTA.
        '-F': FlagParameter(Prefix='-', Name='F'),

        # -j  Input files are bzipped.
        '-j': FlagParameter(Prefix='-', Name='j'),

        # -k kmers    The number of k-mers in the table.
        '-k': ValuedParameter(Prefix='-', Delimiter=' ', Name='k'),

        # -L maxlen   Maximum length for a sequence
        '-L': ValuedParameter(Prefix='-', Delimiter=' ', Name='L'),

        # -l minlen   Minimum length for a sequence
        '-l': ValuedParameter(Prefix='-', Delimiter=' ', Name='l'),

        # -N  Eliminate all sequences with unknown nucleotides in the output.
        '-N': FlagParameter(Prefix='-', Name='N'),

        # -o minoverlap   Minimum overlap between forward and reverse reads
        #                 (default = 1)
        '-o': ValuedParameter(Prefix='-', Delimiter=' ', Name='o'),

        # -p  Forward primer sequence or number of bases to be removed.
        '-p': ValuedParameter(Prefix='-', Delimiter=' ', Name='p'),

        # -q  Reverse primer sequence or number of bases to be removed.
        '-q': ValuedParameter(Prefix='-', Delimiter=' ', Name='q'),

        # -r  Input FASTQ file containing reverse reads.
        '-r': ValuedParameter(Prefix='-', Delimiter=' ', Name='r'),

        # -T thread   Run with a number of parallel threads.
        '-T': ValuedParameter(Prefix='-', Delimiter=' ', Name='T'),

        # -t  The minimum probability that a sequence must have to match a
        #     primer. (default = 6.000000e-01)
        '-t': ValuedParameter(Prefix='-', Delimiter=' ', Name='t'),
    }

    # No _get_results_path needed as all results (the merged paired-ends)
    # are sent to STDOUT.

    def getHelp(self):
        """pandaseq help"""
        help_Str = """