Example #1
class hybrid_ss_min(CommandLineApplication):
    """Application controller for hybrid_ss_min application

    Computes a minimum energy folding for an RNA or DNA sequence
    """

    #not all options supported here!
    #left out: -n,-d,-r,-f,-I,-q,-c,-b and all obscure options
    _parameters = {
        't': ValuedParameter(Prefix='-', Name='t', Value=None, Delimiter=' '),
        'T': ValuedParameter(Prefix='-', Name='T', Value=None, Delimiter=' '),
        'i': ValuedParameter(Prefix='-', Name='i', Value=None, Delimiter=' '),
        's': ValuedParameter(Prefix='-', Name='s', Value=None, Delimiter=' '),
        'o': ValuedParameter(Prefix='-', Name='o', Value=None, Delimiter=' '),
        'N': ValuedParameter(Prefix='-', Name='N', Value=None, Delimiter=' '),
        'M': ValuedParameter(Prefix='-', Name='M', Value=None, Delimiter=' '),
        'p': ValuedParameter(Prefix='-', Name='p', Value=None, Delimiter=' '),
        'E': FlagParameter(Prefix='-', Name='E'),
        'F': ValuedParameter(Prefix='-', Name='F', Value=None, Delimiter=' '),
        'm': ValuedParameter(Prefix='-', Name='m', Value=None, Delimiter=' ')
    }

    _command = 'hybrid-ss-min'
    _input_handler = '_input_as_string'

    def _get_result_paths(self, data):
        """Return a dict of ResultPath objects representing all possible output

    This dictionary will have keys based
    on the name that you'd like to access the file by in the 
    CommandLineAppResult object that will be created, and the values
    which are ResultPath objects."""

        result = {}
        #UNAfold default values
        start_tmp = 37
        step = 1
        end_tmp = 38

        if isinstance(data, list):
            filename = self._input_filename.split('/')[-1]
        else:
            filename = data.split('/')[-1]

        result['ct']= \
            ResultPath(Path=(self.WorkingDir+filename+'.ct'))
        result['dG'] = \
            ResultPath(Path=(self.WorkingDir+filename+'.dG'))
        result['run'] = \
            ResultPath(Path=(self.WorkingDir+filename+'.run'))

        #if the temperature interval is not the default, there will be more
        #output files: one .plot/.ext pair per temperature
        if self.Parameters['t'].Value is not None:
            start_tmp = self.Parameters['t'].Value
        if self.Parameters['T'].Value is not None:
            end_tmp = self.Parameters['T'].Value + 1
        if self.Parameters['i'].Value is not None:
            step = self.Parameters['i'].Value
        for i in range(start_tmp, end_tmp, step):
            temp = str(i)
            result['plot_%d' % i] = \
                ResultPath(Path=(self.WorkingDir+filename+'.'+temp+'.plot'))
            result['ext_%d' % i] = \
                ResultPath(Path=(self.WorkingDir+filename+'.'+temp+'.ext'))

        return result
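
A minimal standalone sketch of the temperature-range naming logic above, independent of the PyCogent framework (the directory and filename below are hypothetical); it shows which .plot/.ext files a run with given -t/-T/-i values is expected to produce:

import os

def expected_unafold_outputs(working_dir, filename, t_start=37, t_end=38, t_step=1):
    # Mirrors _get_result_paths: one .plot and one .ext file per temperature,
    # with t_end already incremented past the -T value (end_tmp = T + 1 above).
    paths = {}
    for temp in range(t_start, t_end, t_step):
        paths['plot_%d' % temp] = os.path.join(working_dir, '%s.%d.plot' % (filename, temp))
        paths['ext_%d' % temp] = os.path.join(working_dir, '%s.%d.ext' % (filename, temp))
    return paths

# e.g. -t 37 -T 40 -i 1 is expected to write .plot/.ext files for 37, 38, 39 and 40
print(expected_unafold_outputs('/tmp', 'seq.fasta', t_start=37, t_end=41))
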
Example #2
class Rtax(CommandLineApplication):
    """ Rtax ApplicationController

    """

    _command = 'rtax'
    _input_handler = '_input_as_parameters'
    _parameters = {
        # -r a reference database in FASTA format
        '-r':ValuedParameter('-',Name='r',Delimiter=' ', IsPath=True),

        # -t a taxonomy file with sequence IDs matching the reference database
        '-t':ValuedParameter('-',Name='t',Delimiter=' ', IsPath=True),

        # -a a FASTA file containing query sequences (single-ended, read 1, or paired-end delimited)
        '-a':ValuedParameter('-',Name='a',Delimiter=' ', IsPath=True),

        # -b a FASTA file containing query sequences (read 2, with matching IDs)
        '-b':ValuedParameter('-',Name='b',Delimiter=' ', IsPath=True),

        # -l a text file containing sequence IDs to process, one per line
        '-l':ValuedParameter('-',Name='l',Delimiter=' ', IsPath=True),

        # -d a delimiter separating the two reads when provided in a single file
        '-d':ValuedParameter('-',Name='d',Delimiter=' ', IsPath=False, Quote="\""),

        # -i a regular expression used to select part of the fasta header to use as the sequence id.
        '-i':ValuedParameter('-',Name='i',Delimiter=' ', IsPath=False, Quote="'"),

        # -o output file name for classification assignment
        '-o': ValuedParameter('-', Name='o', Delimiter=' ', IsPath=True),

        # -m temporary directory
        '-m': ValuedParameter('-', Name='m', Delimiter=' ', IsPath=True),

        # -f allow fallback from paired-end to single-ended classification when one read is missing
        '-f':FlagParameter(Prefix='-',Name='f'),

        # -g do not allow fallback from paired-end to single-ended classification when one read is too generic
        '-g':FlagParameter(Prefix='-',Name='g')
    }

    _suppress_stdout = False
    _suppress_stderr = False

    #def __init__(self):
    #    super().__init__()...
    #    usearch_command = "usearch"
    #    if not (exists(usearch_command) or app_path(usearch_command)):
    #        raise ApplicationNotFoundError,\
    #            "Cannot find %s. Is it installed? Is it in your path?"\
    #            % usearch_command


    def _input_as_parameters(self,data):
        """ Set the input path (a fasta filepath)
        """
        # The list of values which can be passed on a per-run basis
        allowed_values = ['-r','-t','-a','-b','-l','-d','-i','-o','-m','-f','-g']

        unsupported_parameters = set(data.keys()) - set(allowed_values)
        if unsupported_parameters:
            raise ApplicationError,\
             "Unsupported parameter(s) passed when calling rtax: %s" %\
              ' '.join(unsupported_parameters)

        for v in allowed_values:
            # turn the parameter off so subsequent runs are not
            # affected by parameter settings from previous runs
            self.Parameters[v].off()
            if v in data:
                # turn the parameter on if specified by the user
                self.Parameters[v].on(data[v])

        return ''

    def _get_result_paths(self,data):
        """ Return a dict of ResultPath objects representing all possible output
        """
        assignment_fp = str(self.Parameters['-o'].Value).strip('"')
        if not os.path.isabs(assignment_fp):
            assignment_fp = os.path.relpath(assignment_fp, self.WorkingDir)
        return {'Assignments': ResultPath(assignment_fp, IsWritten=True)}



    def _accept_exit_status(self,exit_status):
        """ Test for acceptable exit status

            uclust can seg fault and still generate a parsable .uc file
            so we explicitly check the exit status

        """
        return exit_status == 0

    def getHelp(self):
        """Method that points to documentation"""
        help_str =\
        """
        RTAX is hosted at:
        http://dev.davidsoergel.com/rtax/

        The following paper should be cited if this resource is used:

        Soergel D.A.W., Dey N., Knight R., and Brenner S.E.  2012.
        Selection of primers for optimal taxonomic classification
        of environmental 16S rRNA gene sequences.  ISME J (6), 1440-1444
        """
        return help_str
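
The reset-then-set pattern in _input_as_parameters (turn every allowed parameter off, then back on only for values supplied in this run) can be sketched on its own; the _Param class below is a hypothetical stand-in for the framework's parameter objects, not the real API:

class _Param(object):
    """Tiny stand-in for a PyCogent/burrito parameter object (illustration only)."""
    def __init__(self):
        self.value = None
        self.is_on = False
    def off(self):
        self.is_on, self.value = False, None
    def on(self, value):
        self.is_on, self.value = True, value

def apply_run_parameters(parameters, data, allowed):
    # Reject anything not in the allowed list, then reset every allowed
    # parameter and turn on only those supplied for this run.
    unsupported = set(data) - set(allowed)
    if unsupported:
        raise ValueError("Unsupported parameter(s): %s" % ' '.join(sorted(unsupported)))
    for flag in allowed:
        parameters[flag].off()
        if flag in data:
            parameters[flag].on(data[flag])

params = {'-r': _Param(), '-t': _Param()}
apply_run_parameters(params, {'-r': 'refs.fasta'}, ['-r', '-t'])
print('%s %s' % (params['-r'].value, params['-t'].is_on))   # refs.fasta False
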
Example #3
class FastTree(CommandLineApplication):
    """FastTree application Controller"""

    _command = 'FastTree'
    _input_handler = '_input_as_multiline_string'
    _parameters = {
        '-quiet': FlagParameter('-', Name='quiet'),
        '-boot': ValuedParameter('-', Delimiter=' ', Name='boot'),
        '-seed': ValuedParameter('-', Delimiter=' ', Name='seed'),
        '-nni': ValuedParameter('-', Delimiter=' ', Name='nni'),
        '-slow': FlagParameter('-', Name='slow'),
        '-fastest': FlagParameter('-', Name='fastest'),
        '-top': FlagParameter('-', Name='top'),
        '-notop': FlagParameter('-', Name='notop'),
        '-topm': ValuedParameter('-', Delimiter=' ', Name='topm'),
        '-close': ValuedParameter('-', Delimiter=' ', Name='close'),
        '-refresh': ValuedParameter('-', Delimiter=' ', Name='refresh'),
        '-matrix': ValuedParameter('-', Delimiter=' ', Name='matrix'),
        '-nomatrix': FlagParameter('-', Name='nomatrix'),
        '-nj': FlagParameter('-', Name='nj'),
        '-bionj': FlagParameter('-', Name='bionj'),
        '-nt': FlagParameter('-', Name='nt'),
        '-n': ValuedParameter('-', Delimiter=' ', Name='n')
    }

    #FastTree [-quiet] [-boot 1000] [-seed 1253] [-nni 10] [-slow | -fastest]
    #      [-top | -notop] [-topm 1.0 [-close 0.75] [-refresh 0.8]]
    #      [-matrix Matrix | -nomatrix] [-nj | -bionj]
    #      [-nt] [-n 100] [alignment] > newick_tree

    def __call__(self, data=None, remove_tmp=True):
        """Run the application with the specified kwargs on data
        
            data: anything that can be cast into a string or written out to
                a file. Usually either a list of things or a single string or 
                number. input_handler will be called on this data before it 
                is passed as part of the command-line argument, so by creating
                your own input handlers you can customize what kind of data
                you want your application to accept

            remove_tmp: if True, removes tmp files

            NOTE: Override of the base class to handle redirected output
        """
        input_handler = self.InputHandler
        suppress_stderr = self.SuppressStderr

        outfile = self.getTmpFilename(self.TmpDir)
        self._outfile = outfile

        if suppress_stderr:
            errfile = FilePath('/dev/null')
        else:
            errfile = FilePath(self.getTmpFilename(self.TmpDir))
        if data is None:
            input_arg = ''
        else:
            input_arg = getattr(self, input_handler)(data)

        # Build up the command, consisting of a BaseCommand followed by
        # input and output (file) specifications
        command = self._command_delimiter.join(filter(None,\
            [self.BaseCommand,str(input_arg),'>',str(outfile),'2>',\
                str(errfile)]))
        if self.HaltExec:
            raise AssertionError, "Halted exec with command:\n" + command
        # The return value of system() is a 16-bit wait status: the low byte
        # holds the signal that killed the process and the high byte holds the
        # exit status. Right-shift by 8 bits to keep only the exit status.
        exit_status = system(command) >> 8

        # Determine if an error should be raised due to the exit status of
        # the application
        if not self._accept_exit_status(exit_status):
            raise ApplicationError, \
             'Unacceptable application exit status: %s, command: %s'\
                % (str(exit_status),command)

        out = open(outfile, "r")

        err = None
        if not suppress_stderr:
            err = open(errfile, "r")

        result =  CommandLineAppResult(out,err,exit_status,\
            result_paths=self._get_result_paths(data))

        # Clean up the input file if one was created
        if remove_tmp:
            if self._input_filename:
                remove(self._input_filename)
                self._input_filename = None

        return result

    def _get_result_paths(self, data):
        result = {}
        result['Tree'] = ResultPath(Path=self._outfile)
        return result
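
The override above decodes the wait status returned by os.system(); a quick check of that decoding on a POSIX system (the shift and os.WEXITSTATUS should agree):

import os

# On POSIX, os.system() returns a wait()-style status: the exit code lives in
# the high byte, so status >> 8 recovers it, as in the __call__ override above.
status = os.system("exit 3")
print(status >> 8)                # 3
print(os.WEXITSTATUS(status))     # 3 (POSIX-only helper for the same decoding)
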
Example #4
class Dialign(CommandLineApplication):
    """Dialign application controller"""
    
    _options ={
        # -afc            Creates additional output file "*.afc" containing data of
        #                 all fragments considered for alignment
        #                 WARNING: this file can be HUGE !
        '-afc':FlagParameter(Prefix='-',Name='afc'),
        # -afc_v          like "-afc" but verbose: fragments are explicitly printed
        #                 WARNING: this file can be EVEN BIGGER !
        '-afc_v':FlagParameter(Prefix='-',Name='afc_v'),
        # -anc            Anchored alignment. Requires a file <seq_file>.anc
        #                 containing anchor points.
        '-anc':FlagParameter(Prefix='-',Name='anc'),
        # -cs             if segments are translated, not only the `Watson strand'
        #                 but also the `Crick strand' is looked at.
        '-cs':FlagParameter(Prefix='-',Name='cs'),
        # -cw             additional output file in CLUSTAL W format.
        '-cw':FlagParameter(Prefix='-',Name='cw'),
        # -ds             `dna alignment speed up' - non-translated nucleic acid
        #                 fragments are taken into account only if they start with
        #                 at least two matches. Speeds up DNA alignment at the expense
        #                 of sensitivity.
        '-ds':FlagParameter(Prefix='-',Name='ds'),
        # -fa             additional output file in FASTA format.
        '-fa':FlagParameter(Prefix='-',Name='fa'),
        # -ff             Creates file *.frg containing information about all
        #                 fragments that are part of the respective optimal pairwise
        #                 alignments plus information about consistency in the multiple
        #                 alignment
        '-ff':FlagParameter(Prefix='-',Name='ff'),
        # -fn <out_file>  output files are named <out_file>.<extension> .
        '-fn':ValuedParameter('-',Name='fn',Delimiter=' ', IsPath=True),
        #
        #
        # -fop            Creates file *.fop containing coordinates of all fragments
        #                 that are part of the respective pairwise alignments.
        '-fop':FlagParameter(Prefix='-',Name='fop'),
        # -fsm            Creates file *.fsm containing coordinates of all fragments
        #                 that are part of the final alignment
        '-fsm':FlagParameter(Prefix='-',Name='fsm'),
        # -iw             overlap weights switched off (by default, overlap weights are
        #                 used if up to 35 sequences are aligned). This option
        #                 speeds up the alignment but may lead to reduced alignment
        #                 quality.
        '-iw':FlagParameter(Prefix='-',Name='iw'),
        # -lgs            `long genomic sequences' - combines the following options:
        #                 -ma, -thr 2, -lmax 30, -smin 8, -nta, -ff,
        #                 -fop, -ff, -cs, -ds, -pst
        '-lgs':FlagParameter(Prefix='-',Name='lgs'),
        # -lgs_t          Like "-lgs" but with all segment pairs assessed at the
        #                 peptide level (rather than 'mixed alignments' as with the
        #                 "-lgs" option). Therefore faster than -lgs but not very
        #                 sensitive for non-coding regions.
        '-lgs_t':FlagParameter(Prefix='-',Name='lgs_t'),
        # -lmax <x>       maximum fragment length = x  (default: x = 40 or x = 120
        #                 for `translated' fragments). Shorter x speeds up the program
        #                 but may affect alignment quality.
        '-lmax':ValuedParameter('-',Name='lmax',Delimiter=' '),
        # -lo             (Long Output) Additional file *.log with information about
        #                 fragments selected for pairwise alignment and about
        #                 consistency in the multi-alignment procedure
        '-lo':FlagParameter(Prefix='-',Name='lo'),
        # -ma             `mixed alignments' consisting of P-fragments and N-fragments
        #                 if nucleic acid sequences are aligned.
        '-ma':FlagParameter(Prefix='-',Name='ma'),
        # -mask           residues not belonging to selected fragments are replaced
        #                 by `*' characters in output alignment (rather than being
        #                 printed in lower-case characters)
        '-mask':FlagParameter(Prefix='-',Name='mask'),
        # -mat            Creates file *mat with substitution counts derived from the
        #                 fragments that have been selected for alignment
        '-mat':FlagParameter(Prefix='-',Name='mat'),
        # -mat_thr <t>    Like "-mat" but only fragments with weight score > t
        #                 are considered
        '-mat_thr':ValuedParameter('-',Name='mat_thr',Delimiter=' '),
        # -max_link       "maximum linkage" clustering used to construct sequence tree
        #                 (instead of UPGMA).
        '-max_link':FlagParameter(Prefix='-',Name='max_link'),
        # -min_link       "minimum linkage" clustering used.
        '-min_link':FlagParameter(Prefix='-',Name='min_link'),
        #
        # -mot            "motif" option.
        '-mot':FlagParameter(Prefix='-',Name='mot'),
        # -msf            separate output file in MSF format.
        '-msf':FlagParameter(Prefix='-',Name='msf'),
        # -n              input sequences are nucleic acid sequences. No translation
        #                 of fragments.
        '-n':FlagParameter(Prefix='-',Name='n'),
        # -nt             input sequences are nucleic acid sequences and `nucleic acid
        #                 segments' are translated to `peptide segments'.
        '-nt':FlagParameter(Prefix='-',Name='nt'),
        # -nta            `no textual alignment' - textual alignment suppressed. This
        #                 option makes sense if other output files are of interest --
        #                 e.g. the fragment files created with -ff, -fop, -fsm or -lo
        '-nta':FlagParameter(Prefix='-',Name='nta'),
        # -o              fast version, resulting alignments may be slightly different.
        '-o':FlagParameter(Prefix='-',Name='o'),
        #
        # -ow             overlap weights enforced (By default, overlap weights are
        #                 used only if up to 35 sequences are aligned since calculating
        #                 overlap weights is time consuming). Warning: overlap weights
        #                 generally improve alignment quality but the running time
        #                 increases in the order O(n^4) with the number of sequences.
        #                 This is why, by default, overlap weights are used only for
        #                 sequence sets with < 35 sequences.
        '-ow':FlagParameter(Prefix='-',Name='ow'),
        # -pst            "print status". Creates and updates a file *.sta with
        #                 information about the current status of the program run.
        #                 This option is recommended if large data sets are aligned
        #                 since it allows the user to estimate the remaining running
        #                 time.
        '-pst':FlagParameter(Prefix='-',Name='pst'),
        # -smin <x>       minimum similarity value for first residue pair (or codon
        #                 pair) in fragments. Speeds up protein alignment or alignment
        #                 of translated DNA fragments at the expense of sensitivity.
        '-smin':ValuedParameter('-',Name='smin',Delimiter=' '),
        # -stars <x>      maximum number of `*' characters indicating degree of
        #                 local similarity among sequences. By default, no stars
        #                 are used but numbers between 0 and 9, instead.
        '-stars':ValuedParameter('-',Name='stars',Delimiter=' '),
        # -stdo           Results written to standard output.
        '-stdo':FlagParameter(Prefix='-',Name='stdo'),
        # -ta             standard textual alignment printed (overrides suppression
        #                 of textual alignments in special options, e.g. -lgs)
        '-ta':FlagParameter(Prefix='-',Name='ta'),
        # -thr <x>        Threshold T = x.
        '-thr':ValuedParameter('-',Name='thr',Delimiter=' '),
        # -xfr            "exclude fragments" - list of fragments can be specified
        #                 that are NOT considered for pairwise alignment
        '-xfr':FlagParameter(Prefix='-',Name='xfr'),
    
    }
    
    _parameters = {}
    _parameters.update(_options)
    _command = "dialign2-2"
    
    def _input_as_seqs(self,data):
        lines = []
        for i,s in enumerate(data):
            #will number the sequences 1,2,3,etc.
            lines.append(''.join(['>',str(i+1)]))
            lines.append(s)
        return self._input_as_lines(lines)
    
    def _align_out_filename(self):
        
        if self.Parameters['-fn'].isOn():
            aln_filename = self._absolute(str(self.Parameters['-fn'].Value))
        else:
            raise ValueError, "No output file specified."
        return aln_filename
    
    def _get_result_paths(self,data):
        
        result = {}
        if self.Parameters['-fn'].isOn():
            out_name = self._align_out_filename()
            result['Align'] = ResultPath(Path=out_name,IsWritten=True)
        return result
    
    def getHelp(self):
        """Dialign help"""
        
        help_str = """
"""
        return help_str
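
_input_as_seqs simply renumbers the sequences 1, 2, 3, ... before writing them out as FASTA lines; a self-contained version of that numbering (the helper name is hypothetical):

def seqs_to_numbered_fasta(seqs):
    # Number the sequences 1,2,3,... exactly as _input_as_seqs does above.
    lines = []
    for i, s in enumerate(seqs):
        lines.append('>%d' % (i + 1))
        lines.append(s)
    return '\n'.join(lines) + '\n'

print(seqs_to_numbered_fasta(['ACGTACGT', 'GGCCTTAA']))
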
Example #5
class CD_HIT(CommandLineApplication):
    """cd-hit Application Controller

    Use this version of CD-HIT if your MolType is PROTEIN
    """

    _command = 'cd-hit'
    _input_handler = '_input_as_multiline_string'
    _parameters = {
        # input input filename in fasta format, required
        '-i': ValuedParameter('-', Name='i', Delimiter=' ', IsPath=True),

        # output filename, required
        '-o': ValuedParameter('-', Name='o', Delimiter=' ', IsPath=True),

        # sequence identity threshold, default 0.9
        # this is the default cd-hit's "global sequence identity" calc'd as :
        # number of identical amino acids in alignment
        # divided by the full length of the shorter sequence
        '-c': ValuedParameter('-', Name='c', Delimiter=' '),

        # use global sequence identity, default 1
        # if set to 0, then use local sequence identity, calculated as:
        # number of identical amino acids in alignment
        # divided by the length of the alignment
        # NOTE!!! don't use -G 0 unless you use alignment coverage controls
        # see options -aL, -AL, -aS, -AS
        '-G': ValuedParameter('-', Name='G', Delimiter=' '),

        # band_width of alignment, default 20
        '-b': ValuedParameter('-', Name='b', Delimiter=' '),

        # max available memory (Mbyte), default 400
        '-M': ValuedParameter('-', Name='M', Delimiter=' '),

        # word_length, default 8, see user's guide for choosing it
        '-n': ValuedParameter('-', Name='n', Delimiter=' '),

        # length of throw_away_sequences, default 10
        '-l': ValuedParameter('-', Name='l', Delimiter=' '),

        # tolerance for redundance, default 2
        '-t': ValuedParameter('-', Name='t', Delimiter=' '),

        # length of description in .clstr file, default 20
        # if set to 0, it takes the fasta defline and stops at first space
        '-d': ValuedParameter('-', Name='d', Delimiter=' '),

        # length difference cutoff, default 0.0
        # if set to 0.9, the shorter sequences need to be
        # at least 90% length of the representative of the cluster
        '-s': ValuedParameter('-', Name='s', Delimiter=' '),

        # length difference cutoff in amino acid, default 999999
        # if set to 60, the length difference between the shorter sequences
        # and the representative of the cluster can not be bigger than 60
        '-S': ValuedParameter('-', Name='S', Delimiter=' '),

        # alignment coverage for the longer sequence, default 0.0
        # if set to 0.9, the alignment must cover 90% of the sequence
        '-aL': ValuedParameter('-', Name='aL', Delimiter=' '),

        # alignment coverage control for the longer sequence, default 99999999
        # if set to 60, and the length of the sequence is 400,
        # then the alignment must be >= 340 (400-60) residues
        '-AL': ValuedParameter('-', Name='AL', Delimiter=' '),

        # alignment coverage for the shorter sequence, default 0.0
        # if set to 0.9, the alignment must cover 90% of the sequence
        '-aS': ValuedParameter('-', Name='aS', Delimiter=' '),

        # alignment coverage control for the shorter sequence, default 99999999
        # if set to 60, and the length of the sequence is 400,
        # then the alignment must be >= 340 (400-60) residues
        '-AS': ValuedParameter('-', Name='AS', Delimiter=' '),

        # 1 or 0, default 0, by default, sequences are stored in RAM
        # if set to 1, sequence are stored on hard drive
        # it is recommended to use -B 1 for huge databases
        '-B': ValuedParameter('-', Name='B', Delimiter=' '),

        # 1 or 0, default 0
        # if set to 1, print alignment overlap in .clstr file
        '-p': ValuedParameter('-', Name='p', Delimiter=' '),

        # 1 or 0, default 0
        # by cd-hit's default algorithm, a sequence is clustered to the first
        # cluster that meets the threshold (fast mode). If set to 1, the program
        # will cluster it into the most similar cluster that meets the threshold
        # (accurate but slow mode);
        # either 1 or 0 won't change the representatives of the final clusters
        '-g': ValuedParameter('-', Name='g', Delimiter=' '),

        # print this help
        '-h': ValuedParameter('-', Name='h', Delimiter=' ')
    }
    _synonyms = {'Similarity': '-c'}

    def getHelp(self):
        """Method that points to documentation"""
        help_str =\
        """
        CD-HIT is hosted as an open source project at:
        http://www.bioinformatics.org/cd-hit/

        The following papers should be cited if this resource is used:

        "Clustering of highly homologous sequences to reduce the size of large
        protein databases", Weizhong Li, Lukasz Jaroszewski & Adam Godzik,
        Bioinformatics, (2001) 17:282-283

        "Tolerating some redundancy significantly speeds up clustering of large
        protein databases", Weizhong Li, Lukasz Jaroszewski & Adam Godzik,
        Bioinformatics, (2002) 18:77-82
        """
        return help_str

    def _input_as_multiline_string(self, data):
        """Writes data to tempfile and sets -i parameter

        data -- list of lines
        """
        if data:
            self.Parameters['-i']\
                    .on(super(CD_HIT,self)._input_as_multiline_string(data))
        return ''

    def _input_as_lines(self, data):
        """Writes data to tempfile and sets -i parameter

        data -- list of lines, ready to be written to file
        """
        if data:
            self.Parameters['-i']\
                    .on(super(CD_HIT,self)._input_as_lines(data))
        return ''

    def _input_as_seqs(self, data):
        """Creates a list of seqs to pass to _input_as_lines

        data -- list like object of sequences
        """
        lines = []
        for i, s in enumerate(data):
            # will number the sequences 1,2,3, etc...
            lines.append(''.join(['>', str(i + 1)]))
            lines.append(s)
        return self._input_as_lines(lines)

    def _input_as_string(self, data):
        """Makes data the value of a specific parameter"""
        if data:
            self.Parameters['-i'].on(str(data))
        return ''

    def _get_seqs_outfile(self):
        """Returns the absolute path to the seqs outfile"""
        if self.Parameters['-o'].isOn():
            return self.Parameters['-o'].Value
        else:
            raise ValueError, "No output file specified"

    def _get_clstr_outfile(self):
        """Returns the absolute path to the clstr outfile"""
        if self.Parameters['-o'].isOn():
            return ''.join([self.Parameters['-o'].Value, '.clstr'])
        else:
            raise ValueError, "No output file specified"

    def _get_result_paths(self, data):
        """Return dict of {key: ResultPath}"""
        result = {}
        result['FASTA'] = ResultPath(Path=self._get_seqs_outfile())
        result['CLSTR'] = ResultPath(Path=self._get_clstr_outfile())
        return result
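
cd-hit writes two outputs for a single -o value: the representative sequences and a companion .clstr report, which is what _get_seqs_outfile and _get_clstr_outfile encode. A minimal sketch of that pairing (the output prefix below is hypothetical):

def cd_hit_output_pair(output_fp):
    # Representative FASTA goes to -o itself; the cluster report gets a '.clstr' suffix.
    return {'FASTA': output_fp, 'CLSTR': output_fp + '.clstr'}

paths = cd_hit_output_pair('/tmp/proteins_90')
print(paths['FASTA'])   # /tmp/proteins_90
print(paths['CLSTR'])   # /tmp/proteins_90.clstr
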
Example #6
class Mafft(CommandLineApplication):
    """Mafft application controller"""
    
    
    _options ={
    # Algorithm
    
    # Automatically selects an appropriate strategy from L-INS-i, FFT-NS-i
    # and FFT-NS-2, according to data size. Default: off (always FFT-NS-2)
    '--auto':FlagParameter(Prefix='--',Name='auto'),\

    # Distance is calculated based on the number of shared 6mers. Default: on
    '--6merpair':FlagParameter(Prefix='--',Name='6merpair'),\

    # All pairwise alignments are computed with the Needleman-Wunsch algorithm.
    # More accurate but slower than --6merpair. Suitable for a set of globally
    # alignable sequences. Applicable to up to ~200 sequences. A combination
    # with --maxiterate 1000 is recommended (G-INS-i). Default: off 
    # (6mer distance is used)
    '--globalpair':FlagParameter(Prefix='--',Name='globalpair'),\

    # All pairwise alignments are computed with the Smith-Waterman algorithm.
    # More accurate but slower than --6merpair. Suitable for a set of locally
    # alignable sequences. Applicable to up to ~200 sequences. A combination
    # with --maxiterate 1000 is recommended (L-INS-i). Default: off
    # (6mer distance is used)
    '--localpair':FlagParameter(Prefix='--',Name='localpair'),\

    # All pairwise alignments are computed with a local algorithm with the
    # generalized affine gap cost (Altschul 1998). More accurate but slower than 
    # --6merpair. Suitable when large internal gaps are expected. Applicable to
    # up to ~200 sequences. A combination with --maxiterate 1000 is recommended
    # (E-INS-i). Default: off (6mer distance is used)
    '--genafpair':FlagParameter(Prefix='--',Name='genafpair'),\

    # All pairwise alignments are computed with FASTA (Pearson and Lipman 1988). 
    # FASTA is required. Default: off (6mer distance is used)
    '--fastapair':FlagParameter(Prefix='--',Name='fastapair'),\

    # Weighting factor for the consistency term calculated from pairwise
    # alignments. Valid when either of --globalpair, --localpair, --genafpair,
    # --fastapair or --blastpair is selected. Default: 2.7
    '--weighti':ValuedParameter(Prefix='--',Name='weighti',Delimiter=' '),\

    # Guide tree is built number times in the progressive stage. Valid with 6mer 
    # distance. Default: 2
    '--retree':ValuedParameter(Prefix='--',Name='retree',Delimiter=' '),\

    # number cycles of iterative refinement are performed. Default: 0
    '--maxiterate':ValuedParameter(Prefix='--',Name='maxiterate',\
        Delimiter=' '),\
  
    # Use FFT approximation in group-to-group alignment. Default: on
    '--fft':FlagParameter(Prefix='--',Name='fft'),\
    
    # Do not use FFT approximation in group-to-group alignment. Default: off
    '--nofft':FlagParameter(Prefix='--',Name='nofft'),\

    #Alignment score is not checked in the iterative refinement stage. Default:
    # off (score is checked)
    '--noscore':FlagParameter(Prefix='--',Name='noscore'),\

    # Use the Myers-Miller (1988) algorithm. Default: automatically turned on 
    # when the alignment length exceeds 10,000 (aa/nt).
    '--memsave':FlagParameter(Prefix='--',Name='memsave'),\

    # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with the
    # 6mer distance. Recommended when a large number (> ~10,000) of sequences
    # are input. Default: off
    '--parttree':FlagParameter(Prefix='--',Name='parttree'),\

    # The PartTree algorithm is used with distances based on DP. Slightly more
    # accurate and slower than --parttree. Recommended when a large number
    # (> ~10,000) of sequences are input. Default: off
    '--dpparttree':FlagParameter(Prefix='--',Name='dpparttree'),\

    # The PartTree algorithm is used with distances based on FASTA. Slightly
    # more accurate and slower than --parttree. Recommended when a large number
    # (> ~10,000) of sequences are input. FASTA is required. Default: off
    '--fastaparttree':FlagParameter(Prefix='--',Name='fastaparttree'),\

    # The number of partitions in the PartTree algorithm. Default: 50
    '--partsize':ValuedParameter(Prefix='--',Name='partsize',Delimiter=' '),\

    # Do not make alignment larger than number sequences. Valid only with the
    # --*parttree options. Default: the number of input sequences
    '--groupsize':ValuedParameter(Prefix='--',Name='groupsize',Delimiter=' '),\
 
    # Parameter

    # Gap opening penalty at group-to-group alignment. Default: 1.53
    '--op':ValuedParameter(Prefix='--',Name='op',Delimiter=' '),\

    # Offset value, which works like gap extension penalty, for group-to-group
    # alignment. Default: 0.123
    '--ep':ValuedParameter(Prefix='--',Name='ep',Delimiter=' '),\

    # Gap opening penalty at local pairwise alignment. Valid when the
    # --localpair or --genafpair option is selected. Default: -2.00
    '--lop':ValuedParameter(Prefix='--',Name='lop',Delimiter=' '),\

    # Offset value at local pairwise alignment. Valid when the --localpair or 
    # --genafpair option is selected. Default: 0.1
    '--lep':ValuedParameter(Prefix='--',Name='lep',Delimiter=' '),\

    # Gap extension penalty at local pairwise alignment. Valid when the
    # --localpair or --genafpair option is selected. Default: -0.1
    '--lexp':ValuedParameter(Prefix='--',Name='lexp',Delimiter=' '),\

    # Gap opening penalty to skip the alignment. Valid when the --genafpair
    # option is selected. Default: -6.00
    '--LOP':ValuedParameter(Prefix='--',Name='LOP',Delimiter=' '),\

    # Gap extension penalty to skip the alignment. Valid when the --genafpair
    # option is selected. Default: 0.00
    '--LEXP':ValuedParameter(Prefix='--',Name='LEXP',Delimiter=' '),\

    # BLOSUM number matrix (Henikoff and Henikoff 1992) is used. number=30, 45,
    # 62 or 80. Default: 62
    '--bl':ValuedParameter(Prefix='--',Name='bl',Delimiter=' '),\

    # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
    # Default: BLOSUM62
    '--jtt':ValuedParameter(Prefix='--',Name='jtt',Delimiter=' '),\

    # Transmembrane PAM number (Jones et al. 1994) matrix is used. number>0.
    # Default: BLOSUM62
    '--tm':ValuedParameter(Prefix='--',Name='tm',Delimiter=' '),\

    # Use a user-defined AA scoring matrix. The format of matrixfile is the same 
    # to that of BLAST. Ignored when nucleotide sequences are input.
    # Default: BLOSUM62
    '--aamatrix':ValuedParameter(Prefix='--',Name='aamatrix',Delimiter=' '),\

    # Incorporate the AA/nuc composition information into the scoring matrix.
    # Default: off
    '--fmodel':FlagParameter(Prefix='--',Name='fmodel'),\
    
    # Output

    # Output format: clustal format. Default: off (fasta format)
    '--clustalout':FlagParameter(Prefix='--',Name='clustalout'),\

    # Output order: same as input. Default: on
    '--inputorder':FlagParameter(Prefix='--',Name='inputorder'),\

    # Output order: aligned. Default: off (inputorder)
    '--reorder':FlagParameter(Prefix='--',Name='reorder'),\

    # Guide tree is output to the input.tree file. Default: off
    '--treeout':FlagParameter(Prefix='--',Name='treeout'),\

    # Do not report progress. Default: off
    '--quiet':FlagParameter(Prefix='--',Name='quiet'),\

    # Input

    # Assume the sequences are nucleotide. Default: auto
    '--nuc':FlagParameter(Prefix='--',Name='nuc'),\

    # Assume the sequences are amino acid. Default: auto
    '--amino':FlagParameter(Prefix='--',Name='amino'),\

    # Seed alignments given in alignment_n (fasta format) are aligned with
    # sequences in input. The alignment within every seed is preserved.
    '--seed':ValuedParameter(Prefix='--',Name='seed',Delimiter=' '),\
    }
    
    _parameters = {}
    _parameters.update(_options)
    _command = "mafft"
    _suppress_stderr=True
    
    def _input_as_seqs(self,data):
        lines = []
        for i,s in enumerate(data):
            #will number the sequences 1,2,3,etc.
            lines.append(''.join(['>',str(i+1)]))
            lines.append(s)
        return self._input_as_lines(lines)
    
    def _tree_out_filename(self):
        if self.Parameters['--treeout'].isOn():
            tree_filename = self._absolute(str(self._input_filename))+'.tree'
        else:
            raise ValueError, "No tree output file specified."
        return tree_filename
    
    def _tempfile_as_multiline_string(self, data):
        """Write a multiline string to a temp file and return the filename.

            data: a multiline string to be written to a file.

           * Note: the result will be the filename as a FilePath object 
            (which is a string subclass).

        """
        filename = FilePath(self.getTmpFilename(self.TmpDir))
        data_file = open(filename,'w')
        data_file.write(data)
        data_file.close()
        return filename
        
    def getHelp(self):
        """Method that points to the Mafft documentation."""
        
        help_str = \
        """
        See Mafft documentation at:
        http://align.bmr.kyushu-u.ac.jp/mafft/software/manual/manual.html
        """
        return help_str
    
    def _get_result_paths(self,data):
        result = {}
        if self.Parameters['--treeout'].isOn():
            out_name = self._tree_out_filename()
            result['Tree'] = ResultPath(Path=out_name,IsWritten=True)
        return result
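
The option comments above describe MAFFT's usual presets (G-INS-i, L-INS-i and E-INS-i as --globalpair/--localpair/--genafpair combined with --maxiterate 1000). A hedged sketch that assembles the corresponding argument lists; the preset names are just labels taken from those comments:

MAFFT_PRESETS = {
    'FFT-NS-2': [],                                      # default progressive strategy
    'G-INS-i': ['--globalpair', '--maxiterate', '1000'],
    'L-INS-i': ['--localpair', '--maxiterate', '1000'],
    'E-INS-i': ['--genafpair', '--maxiterate', '1000'],
}

def mafft_argv(preset, input_fp):
    # Build the argv a shell-free call (e.g. subprocess) would use for this preset.
    return ['mafft'] + MAFFT_PRESETS[preset] + [input_fp]

print(' '.join(mafft_argv('L-INS-i', 'seqs.fasta')))
# mafft --localpair --maxiterate 1000 seqs.fasta
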
Example #7
class Knetfold(CommandLineApplication):
    """Application controller for Knetfold v1.4.4b application"""

    _parameters = {
        '-i': ValuedParameter(Prefix='-', Name='i', Delimiter=' '),
        '-n': ValuedParameter(Prefix='-', Name='n', Delimiter=' '),
        '-d': ValuedParameter(Prefix='-', Name='d', Delimiter=' '),
        '-m': ValuedParameter(Prefix='-', Name='m', Delimiter=' '),
        '-q': ValuedParameter(Prefix='-', Name='q', Delimiter=' '),
        '-o': ValuedParameter(Prefix='-', Name='o', Delimiter=' '),
        '-r': ValuedParameter(Prefix='-', Name='r', Delimiter=' '),
        '-f': ValuedParameter(Prefix='-', Name='f', Delimiter=' '),
        '-h': ValuedParameter(Prefix='-', Name='h', Delimiter=' '),
    }

    _command = 'knetfold.pl'
    _input_handler = '_input_as_string'

    def _input_as_lines(self, data):
        """
        Infile needs to be set with parameter -i
        """
        filename = self._input_filename = self.getTmpFilename(self.WorkingDir)
        data_file = open(filename, 'w')
        data_to_file = '\n'.join([str(d).strip('\n') for d in data])
        data_file.write(data_to_file)
        data_file.write('\n')
        data_file.close()

        #set input flag and give it the input filename
        self.Parameters['-i'].on(filename)
        return ''

    def _input_as_string(self, data):
        """Makes data the value of a specific parameter
        
        This method returns the empty string. The parameter will be printed
        automatically once set.
        """
        if data:
            self.Parameters['-i'].on(data)
        return ''

    def _get_result_paths(self, data):
        """
        Adds output files to the resultpath
        """
        result = {}

        if isinstance(data, list):
            filename = self._input_filename.split('/')[-1]
        else:
            filename = (data.split('/')[-1]).split('.')[0]
        #output files created in extensions list
        extensions = ['ct', 'coll', 'sec', 'fasta', 'pdf']
        #file = '%s%s%s' % (self.WorkingDir,filename,'_knet')
        file = ''.join([self.WorkingDir, filename, '_knet'])
        for ext in extensions:
            try:
                path = '%s.%s' % (file, ext)
                f = open(path)
                f.close()
                result[ext]=\
                    ResultPath(Path=(path))
            except IOError:
                pass

        number = 0
        #Unknown number of mx files, try/except to find all
        #file = '%s%s%s%d' % (self.WorkingDir,filename,'_knet.mx',number)
        file_base = ''.join([self.WorkingDir, filename, '_knet.mx'])
        while (True):
            try:
                file = file_base + str(number)
                f = open(file)
                result['%s%d' % ('mx', number)] = ResultPath(Path=(file))
                f.close()
                number += 1
            except IOError:  #No more mx files
                break
        return result
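
The while/try loop above probes for knetfold's numbered matrix files until an open fails; the same discovery can be written with os.path.exists (a sketch, using a hypothetical base path):

import os

def find_mx_files(base_fp):
    # Collect <base>0, <base>1, ... until the next numbered file is missing,
    # matching the mx-file loop in _get_result_paths above.
    files, number = [], 0
    while os.path.exists('%s%d' % (base_fp, number)):
        files.append('%s%d' % (base_fp, number))
        number += 1
    return files

print(find_mx_files('/tmp/seqs_knet.mx'))   # [] unless such files exist
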
Example #8
class cdbyank(CommandLineApplication):
    """cdbyank application controller"""

    _options ={
        # -a <key> the sequence name (accession) for a fasta record to be
        # retrieved; if not given, a list of accessions is expected
        # at stdin
        '-a':ValuedParameter('-',Name='a', Delimiter=' '),

        # -d <fasta_file> is the fasta file to pull records from; 
        # if not specified, cdbyank will look in the same directory
        # where <index_file> resides, for a file with the same name
        # but without the ".cidx" suffix
        '-d':ValuedParameter('-', Name='d', Delimiter=' '),

        # -o the records found are written to file <outfile> instead of stdout
        '-o':ValuedParameter('-', Name='o', Delimiter=' '),

        # -x allows retrieval of multiple records per key, if the indexed 
        # database had records with the same key (non-unique keys);
        # (without -x only one record for a given key is retrieved)
        '-x':FlagParameter('-', Name='x'),

        # -i case insensitive query (expects the <index_file> to have been 
        # created with cdbfasta -i option)
        '-i':FlagParameter('-', Name='i'),

        # -Q output the query key surrounded by character '%' before the
        # corresponding record
        '-Q':FlagParameter('-', Name='Q'),

        # -q same as -Q but use character <char> instead of '%'
        '-q':ValuedParameter('-', Name='q', Delimiter=' '),

        # -w enable warnings (sent to stderr) when a key is not found
        '-w':FlagParameter('-', Name='w'),

        # -F pulls only the defline for each record (discard the sequence)
        '-F':FlagParameter('-', Name='F'),

        # -P only displays the position(s) (file offset) within the 
        # database file, for the requested record(s)
        '-P':FlagParameter('-', Name='P'),

        # -R sequence range extraction: expects the input <key(s)> to have 
        # the format: '<seq_name> <start> <end>'
        # and pulls only the specified sequence range
        '-R':ValuedParameter('-', Name='R', Delimiter=' '),

        # -z decompress the entire file <dbfasta.cdbz>
        # (assumes it was built using cdbfasta with '-z' option)
        '-z':ValuedParameter('-', Name='z', Delimiter=' '),

        # -v show version number and exit
        '-v':FlagParameter('-', Name='v'),

        ###
        # Index file statistics (no database file needed):
        # -n display the number of records indexed
        '-n':FlagParameter('-', Name='n'),

        # -l list all keys stored in <index_file>
        '-l':FlagParameter('-', Name='l'),

        # -s display indexing summary info
        '-s':FlagParameter('-', Name='s')
    }

    _parameters = {}
    _parameters.update(_options)
    _command = "cdbyank"
    _input_file = ""
    _queries = []

    def _input_as_string(self, data):
        """File path for an index"""
        if not data:
            raise ValueError, "Expected a file!"
        if not path.exists(data):
            raise ValueError, "Index doesn't exist: %s" % data

        self._input_file = data
        return ""
       
    def _get_result_paths(self, data):
        if self.Parameters['-v'].isOn():
            return {}

        output = {}
        if self.Parameters['-o'].isOn():
            output['seqs'] = ResultPath(self.Parameters['-o'].Value)
        return output
   
    def _get_base_command(self):
        """Yay for positional arguments..."""
        command_parts = []
        cd_command = ''.join(['cd ',str(self.WorkingDir),';'])
        if self._command is None:
            raise ApplicationError, '_command has not been set.'
        command = self._command
        parameters = sorted([str(x) for x in self.Parameters.values() 
                            if str(x)])

        synonyms = self._synonyms

        if self._queries:
            bulk_query = 'echo "%s" | ' % " ".join(self._queries)
        else:
            bulk_query = ""

        command_parts.append(cd_command)
        command_parts.append(bulk_query)
        command_parts.append(command)
        command_parts.append(self._input_file) # Positional argument
        command_parts += parameters

        return self._command_delimiter.join(filter(None,command_parts)).strip()

    BaseCommand = property(_get_base_command)

    def setQueries(self, queries):
        """Sets queries"""
        self._queries = queries
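
A sketch of the command string _get_base_command assembles: the working-directory change, an optional echo of the queries piped to stdin, the cdbyank executable, the positional index file, then any flags (the paths below are hypothetical):

def cdbyank_command(working_dir, index_fp, queries=(), flags=()):
    parts = ['cd %s;' % working_dir]
    if queries:
        # Queries are fed to cdbyank on stdin via echo, as in _get_base_command.
        parts.append('echo "%s" |' % ' '.join(queries))
    parts.append('cdbyank')
    parts.append(index_fp)           # positional argument: the .cidx index
    parts.extend(flags)
    return ' '.join(parts)

print(cdbyank_command('/tmp', 'seqs.fasta.cidx', queries=['seq1', 'seq2'], flags=['-F']))
# cd /tmp; echo "seq1 seq2" | cdbyank seqs.fasta.cidx -F
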
Example #9
class cdbfasta(CommandLineApplication):
    """cdbfasta application controller"""

    _options ={
        # -o the index file will be named <index_file>; if not given,
        # the index filename is database name plus the suffix '.cidx'
        '-o':ValuedParameter('-',Name='o',Delimiter=' '),
        
        # -r <record_delimiter> a string of characters at the beginning of line
        # marking the start of a record (default: '>')
        '-r':ValuedParameter('-',Name='r',Delimiter=' '),

        # -Q treat input as fastq format, i.e. with '@' as record delimiter
        # and with records expected to have at least 4 lines
        '-Q':FlagParameter(Prefix='-',Name='Q'),

        # -z database is compressed into the file <compressed_db>
        # before indexing (<fastafile> can be "-" or "stdin" 
        # in order to get the input records from stdin)
        '-z':ValuedParameter('-',Name='z',Delimiter=' '),

        # -s strip extraneous characters from *around* the space delimited
        # tokens, for the multikey options below (-m,-n,-f);
        # Default <stripendchars> set is: '",`.(){}/[]!:;~|><+-
        '-s':ValuedParameter('-',Name='s',Delimiter=' '),

        # -m ("multi-key" option) create hash entries pointing to 
        # the same record for all tokens found in
        # the defline
        '-m':FlagParameter('-',Name='m'),

        # -n <numkeys> same as -m, but only takes the first <numkeys>
        # tokens from the defline
        '-n':ValuedParameter('-',Name='n',Delimiter=' '),

        # -f indexes *space* delimited tokens (fields) in the defline as given
        # by LIST of fields or fields ranges (the same syntax as UNIX 'cut')
        '-f':ValuedParameter('-',Name='f',Delimiter=''),

        # -w <stopwordslist> exclude from indexing all the words found
        # in the file <stopwordslist> (for options -m, -n and -k)
        '-w':ValuedParameter('-',Name='w',Delimiter=' '),

        # -i do case insensitive indexing (i.e. create additional keys for 
        # all-lowercase tokens used for indexing from the defline)
        '-i':FlagParameter('-',Name='i'),

        # -c for deflines in the format: db1|accession1|db2|accession2|...,
        # only the first db-accession pair ('db1|accession1') is taken as key
        '-c':FlagParameter('-',Name='c'),

        # -C like -c, but also subsequent db|accession constructs are indexed,
        # along with the full (default) token; additionally,
        # all nrdb concatenated accessions found in the defline 
        # are parsed and stored (assuming 0x01 or '^|^' as separators)
        '-C':FlagParameter('-', Name='C'),

        # -a accession mode: like -C option, but indexes the 'accession'
        # part for all 'db|accession' constructs found
        '-a':FlagParameter('-', Name='a'),

        # -A like -a and -C together (both accessions and 'db|accession'
        # constructs are used as keys)
        '-A':FlagParameter('-', Name='A'),

        # -v show program version and exit
        '-v':FlagParameter('-', Name='v')
        } 

    _parameters = {}
    _parameters.update(_options)
    _command = "cdbfasta"
    _input_file = ""

    def _input_as_string(self, data):
        """Index a single file"""
        if not data:
            raise ValueError, "Expected a file!"
        if not path.exists(data):
            raise ValueError, "File to index doesn't exist: %s" % data

        self._input_file = data
        return ""

    def _get_result_paths(self,data):
        if self.Parameters['-v'].isOn():
            return {}

        output = {}
        if self.Parameters['-o'].isOn():
            output['cidx'] = ResultPath(self.Parameters['-o'].Value)
        else:
            output['cidx'] = ResultPath(self._input_file + '.cidx')
        return output
    
    def _get_base_command(self):
        """Yay for positional arguments..."""
        command_parts = []
        cd_command = ''.join(['cd ',str(self.WorkingDir),';'])
        if self._command is None:
            raise ApplicationError, '_command has not been set.'
        command = self._command
        parameters = sorted([str(x) for x in self.Parameters.values() 
                            if str(x)])

        synonyms = self._synonyms

        command_parts.append(cd_command)
        command_parts.append(command)
        command_parts.append(self._input_file) # Positional argument
        command_parts += parameters

        return self._command_delimiter.join(filter(None,command_parts)).strip()

    BaseCommand = property(_get_base_command)
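
_get_result_paths above expects the index at the -o value when one is given, otherwise next to the input with a .cidx suffix; a one-function sketch of that rule:

def cdbfasta_index_path(fasta_fp, o_value=None):
    # cdbfasta names the index <fasta>.cidx unless -o overrides it.
    return o_value if o_value is not None else fasta_fp + '.cidx'

print(cdbfasta_index_path('db.fasta'))             # db.fasta.cidx
print(cdbfasta_index_path('db.fasta', 'my.cidx'))  # my.cidx
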
Example #10
class RdpTrainer(RdpClassifier):
    _input_handler = '_input_as_lines'
    TrainingClass = 'edu.msu.cme.rdp.classifier.train.ClassifierTraineeMaker'
    PropertiesFile = 'RdpClassifier.properties'

    _parameters = {
        'taxonomy_file': ValuedParameter(None, None, IsPath=True),
        'model_output_dir': ValuedParameter(None, None, IsPath=True),
        'training_set_id': ValuedParameter(None, None, Value='1'),
        'taxonomy_version': ValuedParameter(None, None, Value='version1'),
        'modification_info': ValuedParameter(None, None, Value='cogent'),
    }
    _jvm_parameters = {
        # Maximum heap size for JVM.
        '-Xmx': ValuedParameter('-', Name='Xmx', Delimiter='', Value='1000m'),
    }
    _parameters.update(_jvm_parameters)

    def _get_base_command(self):
        """Returns the base command plus command-line options.

        Handles everything up to and including the classpath.  The
        positional training parameters are added by the
        _input_handler_decorator method.
        """
        cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
        jvm_command = "java"
        jvm_args = self._commandline_join(
            [self.Parameters[k] for k in self._jvm_parameters])
        cp_args = '-cp "%s" %s' % (self._get_jar_fp(), self.TrainingClass)

        command_parts = [cd_command, jvm_command, jvm_args, cp_args]
        return self._commandline_join(command_parts).strip()

    BaseCommand = property(_get_base_command)

    def _set_input_handler(self, method_name):
        """Stores the selected input handler in a private attribute.
        """
        self.__InputHandler = method_name

    def _get_input_handler(self):
        """Returns decorator that wraps the requested input handler.
        """
        return '_input_handler_decorator'

    InputHandler = property(_get_input_handler, _set_input_handler)

    @property
    def ModelDir(self):
        """Absolute FilePath to the training output directory.
        """
        model_dir = self.Parameters['model_output_dir'].Value
        absolute_model_dir = os.path.abspath(model_dir)
        return FilePath(absolute_model_dir)

    def _input_handler_decorator(self, data):
        """Adds positional parameters to selected input_handler's results.
        """
        input_handler = getattr(self, self.__InputHandler)
        input_parts = [
            self.Parameters['taxonomy_file'],
            input_handler(data),
            self.Parameters['training_set_id'],
            self.Parameters['taxonomy_version'],
            self.Parameters['modification_info'],
            self.ModelDir,
        ]
        return self._commandline_join(input_parts)

    def _get_result_paths(self, output_dir):
        """Return a dict of output files.
        """
        # Only include the properties file here. Add the other result
        # paths in the __call__ method, so we can catch errors if an
        # output file is not written.
        self._write_properties_file()
        properties_fp = os.path.join(self.ModelDir, self.PropertiesFile)
        result_paths = {
            'properties': ResultPath(
                properties_fp,
                IsWritten=True,
            )
        }
        return result_paths

    def _write_properties_file(self):
        """Write an RDP training properties file manually.
        """
        # The properties file specifies the names of the files in the
        # training directory.  We use the example properties file
        # directly from the rdp_classifier distribution, which lists
        # the default set of files created by the application.  We
        # must write this file manually after generating the
        # training data.
        properties_fp = os.path.join(self.ModelDir, self.PropertiesFile)
        properties_file = open(properties_fp, 'w')
        properties_file.write(
            "# Sample ResourceBundle properties file\n"
            "bergeyTree=bergeyTrainingTree.xml\n"
            "probabilityList=genus_wordConditionalProbList.txt\n"
            "probabilityIndex=wordConditionalProbIndexArr.txt\n"
            "wordPrior=logWordPrior.txt\n"
            "classifierVersion=Naive Bayesian rRNA Classifier Version 1.0, "
            "November 2003\n")
        properties_file.close()

    def __call__(self, data=None, remove_tmp=True):
        """Run the application with the specified kwargs on data

        data: anything that can be cast into a string or written out
          to a file. Usually either a list of things or a single
          string or number. input_handler will be called on this data
          before it is passed as part of the command-line argument, so
          by creating your own input handlers you can customize what
          kind of data you want your application to accept

        remove_tmp: if True, removes tmp files
        """
        result = super(RdpClassifier, self).__call__(data=data,
                                                     remove_tmp=remove_tmp)
        training_files = {
            'bergeyTree': 'bergeyTrainingTree.xml',
            'probabilityList': 'genus_wordConditionalProbList.txt',
            'probabilityIndex': 'wordConditionalProbIndexArr.txt',
            'wordPrior': 'logWordPrior.txt',
        }
        for key, training_fn in sorted(training_files.items()):
            training_fp = os.path.join(self.ModelDir, training_fn)
            if not os.path.exists(training_fp):
                exception_msg = (
                    "Training output file %s not found.  This may "
                    "happen if an error occurred during the RDP training "
                    "process.  More details may be available in the "
                    "standard error, printed below.\n\n" % training_fp)
                stderr_msg = result["StdErr"].read()
                result["StdErr"].seek(0)
                raise ApplicationError(exception_msg + stderr_msg)
            # Not in try/except clause because we already know the
            # file exists. Failure would be truly exceptional, and we
            # want to maintain the original exception in that case.
            result[key] = open(training_fp)
        return result
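
__call__ above verifies that the four training files named in the properties file were actually written before opening them; a standalone sketch of that check (the model directory below is hypothetical):

import os

def missing_training_files(model_dir):
    # The same four files the RdpTrainer properties file and __call__ refer to.
    expected = ['bergeyTrainingTree.xml',
                'genus_wordConditionalProbList.txt',
                'wordConditionalProbIndexArr.txt',
                'logWordPrior.txt']
    return [fn for fn in expected
            if not os.path.exists(os.path.join(model_dir, fn))]

print(missing_training_files('/tmp/rdp_model'))   # lists whatever is absent
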
Example #11
class RdpClassifier(CommandLineApplication):
    """RDP Classifier application controller

    The RDP Classifier program is distributed as a java archive (.jar)
    file.  If the file 'rdp_classifier-2.2.jar' is not found in the
    current directory, the app controller uses the JAR file specified
    by the environment variable RDP_JAR_PATH.  If this variable is not
    set, and 'rdp_classifier-2.2.jar' is not found in the current
    directory, the application controller raises an
    ApplicationNotFoundError.

    The RDP Classifier often requires memory in excess of Java's
    default 64M. To correct this situation, the authors recommend
    increasing the maximum heap size for the java virtual machine.  An
    option '-Xmx' (default 1000M) is provided for this purpose.
    Details on this option may be found at
    http://java.sun.com/j2se/1.5.0/docs/tooldocs/solaris/java.html

    The classifier may optionally use a custom training set.  The full
    path to the training set may be provided in the option
    '-training-data'.
    """
    _input_handler = '_input_as_lines'
    _command = "rdp_classifier-2.2.jar"
    _options = {
        # output file name for classification assignment
        '-o': ValuedParameter('-', Name='o', Delimiter=' ', IsPath=True),
        # a property file contains the mapping of the training
        # files. Note: the training files and the property file should
        # be in the same directory. The default property file is set
        # to data/classifier/rRNAClassifier.properties.
        '-t': ValuedParameter('-', Name='t', Delimiter=' ', IsPath=True),
        # all tab delimited output format: [allrank|fixrank|db].
        # Default is allrank.
        #
        #   allrank: outputs the results for all ranks applied for
        #   each sequence: seqname, orientation, taxon name, rank,
        #   conf, ...
        #
        #   fixrank: only outputs the results for fixed ranks in
        #   order: no rank, domain, phylum, class, order, family,
        #   genus
        #
        #   db: outputs the seqname, trainset_no, tax_id, conf. This
        #   is good for storing in a database
        '-f': ValuedParameter('-', Name='f', Delimiter=' '),
    }

    # The following are available in the attributes JvmParameters,
    # JarParameters, and PositionalParameters

    _jvm_synonyms = {}
    _jvm_parameters = {
        # Maximum heap size for JVM.
        '-Xmx': ValuedParameter('-', Name='Xmx', Delimiter='', Value='1000m'),
    }

    _parameters = {}
    _parameters.update(_options)
    _parameters.update(_jvm_parameters)

    def getHelp(self):
        """Returns documentation string"""
        # Summary paragraph copied from rdp_classifier-2.0, which is
        # licensed under the GPL 2.0 and Copyright 2008 Michigan State
        # University Board of Trustees
        help_str = """\
        usage: ClassifierCmd [-f <arg>] [-o <arg>] [-q <arg>] [-t <arg>]

        -f,--format <arg> all tab delimited output format:
        [allrank|fixrank|db]. Default is allrank.

            allrank: outputs the results for all ranks applied for each
            sequence: seqname, orientation, taxon name, rank, conf, ...

            fixrank: only outputs the results for fixed ranks in order:
            no rank, domain, phylum, class, order, family, genus

            db: outputs the seqname, trainset_no, tax_id, conf. This is
            good for storing in a database

        -o,--outputFile <arg> output file name for classification
        assignment

        -q,--queryFile <arg> query file contains sequences in one of
        the following formats: Fasta, Genbank and EMBL

        -t,--train_propfile <arg> a property file contains the mapping
        of the training files.
        
        Note: the training files and the property file should be in
        the same directory. The default property file is set to
        data/classifier/rRNAClassifier.properties."""
        return help_str

    def _accept_exit_status(self, status):
        """Returns false if an error occurred in execution
        """
        return (status == 0)

    def _error_on_missing_application(self, params):
        """Raise an ApplicationNotFoundError if the app is not accessible

        In this case, checks for the java runtime and the RDP jar file.
        """
        if not (os.path.exists('java') or app_path('java')):
            raise ApplicationNotFoundError(
                "Cannot find java runtime. Is it installed? Is it in your "
                "path?")
        jar_fp = self._get_jar_fp()
        if jar_fp is None:
            raise ApplicationNotFoundError(
                "JAR file not found in current directory and the RDP_JAR_PATH "
                "environment variable is not set.  Please set RDP_JAR_PATH to "
                "the full pathname of the JAR file.")
        if not os.path.exists(jar_fp):
            raise ApplicationNotFoundError("JAR file %s does not exist." %
                                           jar_fp)

    def _get_jar_fp(self):
        """Returns the full path to the JAR file.

        If the JAR file cannot be found in the current directory and
        the environment variable RDP_JAR_PATH is not set, returns
        None.
        """
        # handles case where the jar file is in the current working directory
        if os.path.exists(self._command):
            return self._command
        # handles the case where the user has specified the location via
        # an environment variable
        elif 'RDP_JAR_PATH' in environ:
            return getenv('RDP_JAR_PATH')
        else:
            return None

    # Overridden to pull out JVM-specific command-line arguments.
    def _get_base_command(self):
        """Returns the base command plus command-line options.

        Does not include input file, output file, and training set.
        """
        cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
        jvm_command = "java"
        jvm_arguments = self._commandline_join(
            [self.Parameters[k] for k in self._jvm_parameters])
        jar_arguments = '-jar "%s"' % self._get_jar_fp()
        rdp_arguments = self._commandline_join(
            [self.Parameters[k] for k in self._options])

        command_parts = [
            cd_command, jvm_command, jvm_arguments, jar_arguments,
            rdp_arguments, '-q'
        ]
        return self._commandline_join(command_parts).strip()

    BaseCommand = property(_get_base_command)

    def _commandline_join(self, tokens):
        """Formats a list of tokens as a shell command
 
        This seems to be a repeated pattern; may be useful in
        superclass.
        """
        commands = filter(None, map(str, tokens))
        return self._command_delimiter.join(commands).strip()

    def _get_result_paths(self, data):
        """ Return a dict of ResultPath objects representing all possible output
        """
        assignment_fp = str(self.Parameters['-o'].Value).strip('"')
        if not os.path.isabs(assignment_fp):
            assignment_fp = os.path.relpath(assignment_fp, self.WorkingDir)
        return {'Assignments': ResultPath(assignment_fp, IsWritten=True)}
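
A minimal usage sketch for the classifier controller above, assuming the JAR is reachable via RDP_JAR_PATH and that the query file below exists; all paths are placeholders:

import os

os.environ['RDP_JAR_PATH'] = '/opt/rdp/rdp_classifier-2.2.jar'  # assumed install location

rdp = RdpClassifier()
rdp.Parameters['-Xmx'].on('4000m')              # raise the JVM heap above the 1000m default
rdp.Parameters['-f'].on('fixrank')              # fixed-rank output format
rdp.Parameters['-o'].on('rdp_assignments.txt')  # classification output file

query_lines = open('queries.fasta').readlines() # hypothetical query file
rdp_result = rdp(query_lines)                   # routed through _input_as_lines to '-q <tmpfile>'
assignments = rdp_result['Assignments'].read()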
Example No. 12
class Mfold(CommandLineApplication):
    """Application controller for mfold 3.2 application"""

    #Not all parameters included!
    #skipped: NA_CONC,MG_CONC,LB_FR,ROT_ANG,START,STOP,REUSE
    _parameters = {
        'LC':
        ValuedParameter(Prefix='', Name='LC=', Value=None, Delimiter=''),
        'T':
        ValuedParameter(Prefix='', Name='T=', Value=None, Delimiter=''),
        'P':
        ValuedParameter(Prefix='', Name='P=', Value=None, Delimiter=''),
        'MAXBP':
        ValuedParameter(Prefix='', Name='MAXBP=', Value=None, Delimiter=''),
        'MAX':
        ValuedParameter(Prefix='', Name='MAX=', Value=30, Delimiter=''),
        'MAX_LP':
        ValuedParameter(Prefix='', Name='MAX_LP=', Value=None, Delimiter=''),
        'MAX_AS':
        ValuedParameter(Prefix='', Name='MAX_AS=', Value=None, Delimiter=''),
        'MODE':
        ValuedParameter(Prefix='', Name='MODE=', Value=None, Delimiter=''),
    }

    _command = 'mfold'
    _input_handler = '_input_as_string'

    def _input_as_string(self, filename):
        """
        mfold doesn't take full paths, so a tmp file is created in the working
        dir for mfold to read.
        """
        nr = choice(list(range(150)))
        input_file = open(filename).readlines()
        filename = self._input_filename = 'mfold_in%d.txt' % nr
        data_file = open(filename, 'w')
        data_to_file = '\n'.join([str(d).strip('\n') for d in input_file])
        data_file.write(data_to_file)
        data_file.close()
        data = '='.join(['SEQ', filename])
        return data

    def _input_as_lines(self, data):
        """
        Uses a short, locally generated tmp filename, since weird truncation
        of the automatically generated filename sometimes occurred.
        """
        nr = choice(list(range(150)))
        filename = self._input_filename = 'mfold_in%d.txt' % nr
        data_file = open(filename, 'w')
        data_to_file = '\n'.join([str(d).strip('\n') for d in data])
        data_file.write(data_to_file)
        data_file.close()
        return '='.join(['SEQ', filename])

    def _get_result_paths(self, data):
        """Return a dict of ResultPath objects representing all possible output
        """
        result = {}
        itr = self.Parameters['MAX'].Value
        if itr is None:
            itr = 30

        filename = self._input_filename.split('/')[-1]
        for i in range(1, itr + 1):
            try:
                ct = self.WorkingDir + filename + '_' + str(i) + '.ct'
                f = open(ct)
                f.close()
                result['ct'+str(i)] =\
                    ResultPath(Path=ct)

                pdf = self.WorkingDir + filename + '_' + str(i) + '.pdf'
                f = open(pdf)
                f.close()
                result['pdf'+str(i)] =\
                    ResultPath(Path=pdf)
            except IOError:
                pass
        result['ct_all'] =\
            ResultPath(Path=(self.WorkingDir+filename+'.ct'))

        name = self.WorkingDir + filename
        #output files
        files = [
            'log', 'ann', 'h-num', 'det', 'pnt', 'sav', 'ss-count',
            '-local.seq', 'rnaml', 'out', 'plot', 'ps', '_1.ps', '_1.ss'
        ]
        for f in files:
            if f == '-local.seq':
                file = ''.join([name, f])
            elif f.startswith('_1'):
                file = ''.join([name, f])
            else:
                file = '.'.join([name, f])
            result['%s' % f] = ResultPath(Path=file)

        return result
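
A minimal usage sketch for the Mfold controller above; the sequence file is a placeholder, and the default _input_as_string handler copies it into a local mfold_in<N>.txt file before the run:

mfold = Mfold()
mfold.Parameters['T'].on('37')                  # folding temperature, rendered as T=37
mfold.Parameters['MAX'].on(10)                  # at most 10 foldings, rendered as MAX=10
mfold_result = mfold('my_sequence.fasta')       # hypothetical single-sequence input file
all_structures = mfold_result['ct_all'].read()  # concatenated .ct output
first_ct = mfold_result.get('ct1')              # per-folding files, present only when written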
Example No. 13
class ChimeraSlayer(CommandLineApplication):
    """ ChimeraSlayer ApplicationController

    """

    _command = 'ChimeraSlayer.pl'
    _input_handler = '_input_as_parameters'
    _parameters = {

        # multi-fasta file containing query sequences in alignment format
        '--query_NAST':
        ValuedParameter('--', Name='query_NAST', Delimiter=' ', IsPath=True),
        '--db_NAST':
        ValuedParameter('--', Name='db_NAST', Delimiter=' ', IsPath=True),
        '--db_FASTA':
        ValuedParameter('--', Name='db_FASTA', Delimiter=' ', IsPath=True),
        '--exec_dir':
        ValuedParameter('--', Name='exec_dir', Delimiter=' ', IsPath=True),
        '-R':
        ValuedParameter('-', Name='R', Delimiter=' ')
    }

    _suppress_stdout = False
    _suppress_stderr = False

    def _input_as_parameters(self, data):
        """ Set the input paths (a NAST aligned fasta filepath)
        """
        # The list of values which can be passed on a per-run basis
        allowed_values = ['--query_NAST', '--db_NAST', '--db_FASTA', '-R']

        unsupported_parameters = set(data.keys()) - set(allowed_values)
        if unsupported_parameters:
            raise ApplicationError(
                "Unsupported parameter(s) passed when calling ChimeraSlayer: %s"
                % ' '.join(unsupported_parameters))

        # turn the allowed parameters on if they were passed for this run
        for v in allowed_values:
            if v in data:
                self.Parameters[v].on(data[v])

        return ''

    def _get_result_paths(self, data):
        """ Set the result paths """

        result = {}

        inp_file_name = str(self.Parameters['--query_NAST'].Value)
        inp_file_name = inp_file_name.rstrip('"')
        inp_file_name = inp_file_name.lstrip('"')

        exec_dir = self.Parameters['--exec_dir']
        if exec_dir.isOn():
            exec_dir = str(exec_dir.Value)
            exec_dir = exec_dir.lstrip('"')
            exec_dir = exec_dir.rstrip('"')

            if inp_file_name[0] == '/':
                # path is already absolute
                pass
            else:
                inp_file_name = exec_dir + "/" + inp_file_name

        if not exists(inp_file_name + ".CPS.CPC"):
            raise ApplicationError("Calling ChimeraSlayer failed.")

        result['CPS'] = ResultPath(Path=inp_file_name + ".CPS.CPC",
                                   IsWritten=True)
        return result

    def remove_intermediate_files(self):
        """Remove all intermediate files."""

        # tmp files are written in the current dir,
        # app controller always jumps into dir specified via exec_dir
        # Note: blast intermediates are not removed
        exec_dir = str(self.Parameters['--exec_dir'].Value)
        inp_file_name = str(self.Parameters['--query_NAST'].Value)

        exec_dir = exec_dir.rstrip('"')
        exec_dir = exec_dir.lstrip('"')

        inp_file_name = inp_file_name.rstrip('"')
        inp_file_name = inp_file_name.lstrip('"')

        tmp_suffixes = [
            ".CPS", ".CPS.CPC", ".CPS_RENAST", ".CPS_RENAST.cidx",
            ".CPS.CPC.wTaxons", ".cidx"
        ]
        cs_tmp_files = [
            exec_dir + '/' + inp_file_name + x for x in tmp_suffixes
        ]
        remove_files(cs_tmp_files, error_on_missing=False)

        db_param = self.Parameters['--db_NAST']
        if db_param.isOn():
            nast_db_name = str(db_param.Value)
            nast_db_name = nast_db_name.rstrip('"')
            nast_db_name = nast_db_name.lstrip('"')

            # Better do not remove this file since other ChimeraSlayer
            # instances running on the same ref set might use this file
            # Should be rather deleted in the calling function
#            remove_files([nast_db_name + ".cidx"],
#                         error_on_missing=False)

        fasta_param = self.Parameters['--db_FASTA']
        if fasta_param.isOn():
            fasta_name = str(fasta_param.Value)
            fasta_name = fasta_name.rstrip('"')
            fasta_name = fasta_name.lstrip('"')

            blast_db_files = [
                fasta_name + x for x in [".nsq", ".nin", ".nhr", ".cidx"]
            ]
            remove_files(blast_db_files, error_on_missing=False)

    def getHelp(self):
        """Method that points to documentation"""
        help_str =\
            """##########################################################################################
#
#  Required:
#
#    --query_NAST      multi-fasta file containing query sequences in alignment format
#
#  Common opts:
#
#    --db_NAST        db in NAST format
#    --db_FASTA       db in fasta format (megablast formatted)
#
#
#    -n       number of top matching database sequences to compare to (default 15)
#    -R       min divergence ratio default: 1.007
#    -P       min percent identity among matching sequences (default: 90)
#
#  ## parameters to tune ChimeraParentSelector:
#
#  Scoring parameters:
#   -M match score   (default: +5)
#   -N mismatch penalty  (default: -4)
#   -Q min query coverage by matching database sequence (default: 70)
#   -T maximum traverses of the multiple alignment  (default: 1)

#
#  ## parameters to tune ChimeraPhyloChecker:
#
#
#    --windowSize                default 50
#    --windowStep                default 5
#    --minBS      minimum bootstrap support for calling chimera (default: 90)
#    -S           percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10)
#    --num_parents_test       number of potential parents to test for chimeras (default: 3)
#    --MAX_CHIMERA_PARENT_PER_ID    Chimera/Parent alignments with perID above this are considered non-chimeras (default 100; turned off)
#
#  ## misc opts
#
#   --printFinalAlignments          shows alignment between query sequence and pair of candidate chimera parents
#   --printCSalignments             print ChimeraSlayer alignments in ChimeraSlayer output
#   --exec_dir                      chdir to here before running
#
#########################################################################################
        """
        return help_str
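
A minimal usage sketch for the ChimeraSlayer controller above. Because the input handler is _input_as_parameters, the per-run files are passed as a dict; all file names below are placeholders that are assumed to exist:

cs = ChimeraSlayer()
cs.Parameters['--exec_dir'].on('.')             # run in the current directory
cs_result = cs({'--query_NAST': 'queries_NAST.fasta',
                '--db_NAST': 'ref_NAST.fasta',
                '--db_FASTA': 'ref.fasta'})
chimera_report = cs_result['CPS'].read()        # the <query>.CPS.CPC output
cs.remove_intermediate_files()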
Example No. 14
class Raxml(CommandLineApplication):
    """RAxML application controller"""

    _options ={

        # Specify a column weight file name to assign individual weights to 
        # each column of the alignment. Those weights must be integers 
        # separated by any number and type of whitespaces within a separate 
        # file, see file "example_weights" for an example.
        '-a':ValuedParameter('-',Name='a',Delimiter=' '),

        # Specify one of the secondary structure substitution models implemented
        # in RAxML. The same nomenclature as in the PHASE manual is used, 
        # available models:  S6A, S6B, S6C, S6D, S6E, S7A, S7B, S7C, S7D, S7E, 
        # S7F, S16, S16A, S16B
        # DEFAULT: 16-state GTR model (S16)
        '-A':ValuedParameter('-',Name='A',Delimiter=' '),
        
        #  Specify an integer number (random seed) for bootstrapping
        '-b':ValuedParameter('-',Name='b',Delimiter=' '),
        
        # specify a floating point number between 0.0 and 1.0 that will be used 
        # as cutoff threshold for the MR-based bootstopping criteria. The 
        # recommended setting is 0.03.
        '-B':ValuedParameter('-',Name='B',Delimiter=' '),
        
        # Specify number of distinct rate categories for raxml when 
        # ModelOfEvolution is set to GTRCAT or HKY85CAT.
        # Individual per-site rates are categorized into numberOfCategories 
        # rate categories to accelerate computations. (Default = 50)
        '-c':ValuedParameter('-',Name='c',Delimiter=' '),

        # Conduct model parameter optimization on gappy, partitioned multi-gene 
        # alignments with per-partition branch length estimates (-M enabled) 
        # using the fast method with pointer meshes described in:
        # Stamatakis and Ott: "Efficient computation of the phylogenetic 
        # likelihood function on multi-gene alignments and multi-core 
        # processors"
        # WARNING: We can not conduct useful tree searches using this method 
        # yet! Does not work with Pthreads version.
        '-C':ValuedParameter('-',Name='C',Delimiter=' '),          

        # This option allows you to start the RAxML search with a complete 
        # random starting tree instead of the default Maximum Parsimony 
        # Starting tree. On smaller datasets (around 100-200 taxa) it has 
        # been observed that this might sometimes yield topologies of distinct 
        # local likelihood maxima which better correspond to empirical 
        # expectations. 
        '-d':FlagParameter('-',Name='d'),

        # ML search convergence criterion. This will break off ML searches if 
        # the relative Robinson-Foulds distance between the trees obtained from 
        # two consecutive lazy SPR cycles is smaller or equal to 1%. Usage 
        # recommended for very large datasets in terms of taxa. On trees with 
        # more than 500 taxa this will yield execution time improvements of 
        # approximately 50% while yielding only slightly worse trees.
        # DEFAULT: OFF
        '-D':ValuedParameter('-',Name='D'), 

        # Set the model optimization precision in log likelihood units, i.e.,
        # up to which likelihood difference the optimization is carried out.
        # Default is 0.1 log likelihood units; the author recommends 1 or 2 to
        # rapidly evaluate different trees.
        '-e':ValuedParameter('-',Name='e',Delimiter=' '),
        
        # specify an exclude file name, that contains a specification of 
        # alignment positions you wish to exclude. Format is similar to Nexus, 
        # the file shall contain entries like "100-200 300-400", to exclude a 
        # single column write, e.g., "100-100", if you use a mixed model, an 
        # appropriately adapted model file will be written.
        '-E':ValuedParameter('-',Name='E',Delimiter=' '),

        # select search algorithm: 
        #   a rapid Bootstrap analysis and search for best-scoring ML tree in 
        #       one program run
        #   A compute marginal ancestral states on a ROOTED reference tree
        #       provided with "t" - ONLY IN 7.3.0
        #   b draw bipartition information on a tree provided with "-t" based on 
        #       multiple trees (e.g., from a bootstrap) in a file specified by 
        #       "-z"
        #   c check if the alignment can be properly read by RAxML
        #   d for normal hill-climbing search (Default)
        #     when -f option is omitted this algorithm will be used
        #   e optimize model+branch lengths for given input tree under 
        #       GAMMA/GAMMAI only
        #   E execute very fast experimental tree search, at present only for 
        #       testing
        #   F execute fast experimental tree search, at present only for testing
        #   g compute per site log Likelihoods for one or more trees passed via
        #       "-z" and write them to a file that can be read by CONSEL
        #       WARNING: does not print likelihoods in the original column order
        #   h compute log likelihood test (SH-test) between best tree passed via 
        #       "-t" and a bunch of other trees passed via "-z" 
        #   i EXPERIMENTAL do not use for real tree inferences: conducts a 
        #       single cycle of fast lazy SPR moves on a given input tree, to be 
        #       used in combination with -C and -M 
        #   I EXPERIMENTAL do not use for real tree inferences: conducts a 
        #       single cycle of thorough lazy SPR moves on a given input tree,  
        #       to be used in combination with -C and -M 
        #   j generate a bunch of bootstrapped alignment files from an original 
        #       alignment file. You need to specify a seed with "-b" and the 
        #       number of replicates with "-#" 
        # following "J" is for version 7.2.8
        #   J Compute SH-like support values on a given tree passed via "-t".
        #   m compare bipartitions between two bunches of trees passed via "-t" 
        #       and "-z" respectively. This will return the Pearson correlation 
        #       between all bipartitions found in the two tree files. A file 
        #       called RAxML_bipartitionFrequencies.outputFileName will be 
        #       printed that contains the pair-wise bipartition frequencies of 
        #       the two sets
        #   n compute the log likelihood score of all trees contained in a tree 
        #       file provided by "-z" under GAMMA or GAMMA+P-Invar
        #   o old (slower) algorithm from v. 2.1.3
        #   p perform pure stepwise MP addition of new sequences to an 
        #       incomplete starting tree and exit
        #   r compute pairwise Robinson-Foulds (RF) distances between all pairs 
        #       of trees in a tree file passed via "-z" if the trees have node 
        #       labels represented as integer support values the program will 
        #       also compute two flavors of the weighted Robinson-Foulds (WRF)
        #       distance
        # following "R" is for version 7.2.8
        #   R compute rogue taxa using new statistical method based on the
        #       evolutionary placement algorithm
        #       WARNING: this is experimental code - DEPRECATED IN 7.3.0
        #   s (split) splits into individual genes, provided with model file
        # following "S" is for version 7.2.8
        #   S compute site-specific placement bias using a leave one out test
        #       inspired by the evolutionary placement algorithm
        #   t do randomized tree searches on one fixed starting tree
        #   u execute morphological weight calibration using maximum likelihood, 
        #       this will return a weight vector. you need to provide a 
        #       morphological alignment and a reference tree via "-t" 
        #   U execute morphological weight calibration using parsimony, this 
        #       will return a weight vector. you need to provide a morphological 
        #       alignment and a reference tree via "-t" - DEPRECATED IN 7.3.0
        #   v classify a bunch of environmental sequences into a reference tree 
        #       using the slow heuristics without dynamic alignment you will 
        #       need to start RAxML with a non-comprehensive reference tree and 
        #       an alignment containing all sequences (reference + query)
        #   w compute ELW test on a bunch of trees passed via "-z" 
        #   x compute pair-wise ML distances, ML model parameters will be 
        #       estimated on an MP starting tree or a user-defined tree passed 
        #       via "-t", only allowed for GAMMA-based models of rate 
        #       heterogeneity
        #   y classify a bunch of environmental sequences into a reference tree 
        #       using the fast heuristics without dynamic alignment you will 
        #       need to start RAxML with a non-comprehensive reference tree and 
        #       an alignment containing all sequences (reference + query)
        '-f':ValuedParameter('-',Name='f',Delimiter=' ', Value="d"),

        # enable ML tree searches under CAT model for very large trees without 
        # switching to GAMMA in the end (saves memory). This option can also be 
        # used with the GAMMA models in order to avoid the thorough optimization 
        # of the best-scoring ML tree in the end.
        # DEFAULT: OFF
        '-F':FlagParameter('-',Name='F'),
        
        # select grouping file name: allows incomplete multifurcating constraint
        # tree in newick format -- resolves multifurcations randomly, adds
        # other taxa using parsimony insertion
        '-g':ValuedParameter('-', Name='g',Delimiter=' '),

        # enable the ML-based evolutionary placement algorithm heuristics by 
        # specifying a threshold value (fraction of insertion branches to be 
        # evaluated using slow insertions under ML).
        '-G':FlagParameter('-', Name='G'),

        # prints help and exits
        '-h':FlagParameter('-', Name='h'),

        # enable the MP-based evolutionary placement algorithm heuristics
        # by specifying a threshold value (fraction of insertion branches to be 
        # evaluated using slow insertions under ML) - DEPRECATED IN 7.3.0
        #'-H':ValuedParameter('-', Name='H',Delimiter=' '),
        
        # allows initial rearrangement to be constrained, e.g. 10 means
        # insertion will not be more than 10 nodes away from original.
        # default is to pick a "good" setting.
        '-i':ValuedParameter('-', Name='i', Delimiter=' '),

        # a posteriori bootstopping analysis. Use:
        #   "-I autoFC" for the frequency-based criterion
        #   "-I autoMR" for the majority-rule consensus tree criterion
        #   "-I autoMRE" for the extended majority-rule consensus tree criterion
        #   "-I autoMRE_IGN" for metrics similar to MRE, but include 
        #       bipartitions under the threshold whether they are compatible
        #       or not. This emulates MRE but is faster to compute.
        #   You also need to pass a tree file containing several bootstrap 
        #   replicates via "-z"
        '-I':ValuedParameter('-', Name='I', Delimiter=' '),
        
        # writes checkpoints (off by default)
        '-j':FlagParameter('-', Name='j'),

        # Compute majority rule consensus tree with "-J MR" or extended majority 
        # rule consensus tree with "-J MRE" or strict consensus tree with "-J 
        # STRICT" You will need to provide a tree file containing several 
        # UNROOTED trees via "-z"
        '-J':ValuedParameter('-', Name='J', Delimiter=' '),
        
        #specifies that RAxML will optimize model parameters (for GTRMIX and
        # GTRGAMMA) as well as calculating likelihoods for bootstrapped trees.
        '-k':FlagParameter('-', Name='k'),

        # Specify one of the multi-state substitution models (max 32 states) 
        # implemented in RAxML. Available models are: ORDERED, MK, GTR
        '-K':ValuedParameter('-', Name='K', Delimiter=' '),
        
        # Model of Binary (Morphological), Nucleotide, Multi-State, or Amino 
        #   Acid Substitution::
        # BINARY:
        #   -m BINCAT : Optimization of site-specific evolutionary rates which 
        #       are categorized into numberOfCategories distinct rate categories 
        #       for greater computational efficiency. Final tree might be 
        #       evaluated automatically under BINGAMMA, depending on the tree 
        #       search option
        #   -m BINCATI : Optimization of site-specific evolutionary rates which 
        #       are categorized into numberOfCategories distinct rate categories    
        #       for greater computational efficiency. Final tree might be 
        #       evaluated automatically under BINGAMMAI, depending on the tree 
        #       search option 
        #   -m BINGAMMA : GAMMA model of rate heterogeneity (alpha parameter 
        #       will be estimated)
        #   -m BINGAMMAI : Same as BINGAMMA, but with estimate of proportion of 
        #       invariable sites
        # NUCLEOTIDES
        #   -m GTRCAT: GTR + Optimization of substitution rates +  Optimization 
        #       of site-specific evolutionary rates which are categorized into 
        #       numberOfCategories distinct rate categories for greater 
        #       computational efficiency
        #   -m GTRCAT_FLOAT : Same as above but uses single-precision floating 
        #       point arithmetics instead of double-precision. Usage only 
        #       recommended for testing, the code will run slower, but can save 
        #       almost 50% of memory. If you have problems with phylogenomic 
        #       datasets and large memory requirements you may give it a shot. 
        #       Keep in mind that numerical stability seems to be okay but needs 
        #       further testing. - DEPRECATED IN 7.3.0
        #   -m GTRCATI : GTR + Optimization of substitution rates + Optimization 
        #       of site-specific evolutionary rates which are categorized into 
        #       numberOfCategories distinct rate categories for greater 
        #       computational efficiency.  Final tree might be evaluated under 
        #       GTRGAMMAI, depending on the tree search option
        #   -m GTRGAMMA: GTR + Optimization of substitution rates + Gamma
        #   -m GTRGAMMA_FLOAT : Same as GTRGAMMA, but also with 
        #       single-precision arithmetics, same cautionary notes as for  
        #       GTRCAT_FLOAT apply. - DEPRECATED IN 7.3.0
        #   -m GTRGAMMAI : Same as GTRGAMMA, but with estimate of proportion of 
        #       invariable sites 
        # MULTI-STATE:
        #   -m MULTICAT : Optimization of site-specific evolutionary rates which 
        #       are categorized into numberOfCategories distinct rate categories 
        #       for greater computational efficiency. Final tree might be 
        #       evaluated automatically under MULTIGAMMA, depending on the tree 
        #       search option
        #   -m MULTICATI : Optimization of site-specific evolutionary rates 
        #       which are categorized into numberOfCategories distinct rate 
        #       categories for greater computational efficiency. Final tree 
        #       might be evaluated automatically under MULTIGAMMAI, depending on 
        #       the tree search option 
        #   -m MULTIGAMMA : GAMMA model of rate heterogeneity (alpha parameter 
        #       will be estimated)
        #   -m MULTIGAMMAI : Same as MULTIGAMMA, but with estimate of proportion 
        #       of invariable sites
        # You can use up to 32 distinct character states to encode multi-state
        # regions, they must be used in the following order: 0, 1, 2, 3, 4, 5, 
        # 6, 7, 8, 9, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, 
        # T, U, V i.e., if you have 6 distinct character states you would use 0, 
        # 1, 2, 3, 4, 5 to encode these. The substitution model for the
        # multi-state regions can be selected via the "-K" option
        # Amino Acid Models:
        #   -m PROTCATmatrixName[F] : specified AA matrix + Optimization of 
        #       substitution rates + Optimization of site-specific evolutionary 
        #       rates which are categorized into numberOfCategories distinct 
        #       rate categories for greater computational efficiency.   Final
        #       tree might be evaluated automatically under 
        #       PROTGAMMAmatrixName[f], depending on the tree search option
        #   -m PROTCATmatrixName[F]_FLOAT : PROTCAT with single precision 
        #       arithmetics, same cautionary notes as for GTRCAT_FLOAT apply
        #       - DEPRECATED IN 7.3.0
        #   -m PROTCATImatrixName[F] : specified AA matrix + Optimization of 
        #       substitution rates + Optimization of site-specific
        #       evolutionary rates which are categorized into numberOfCategories 
        #       distinct rate categories for greater computational efficiency.   
        #       Final tree might be evaluated automatically under 
        #       PROTGAMMAImatrixName[f], depending on the tree search option
        #   -m PROTGAMMAmatrixName[F] : specified AA matrix + Optimization of 
        #       substitution rates + GAMMA model of rate heterogeneity (alpha 
        #       parameter will be estimated)
        #   -m PROTGAMMAmatrixName[F]_FLOAT : PROTGAMMA with single precision 
        #       arithmetics, same cautionary notes as for GTRCAT_FLOAT apply
        #       - DEPRECATED IN 7.3.0
        #   -m PROTGAMMAImatrixName[F] : Same as PROTGAMMAmatrixName[F], but 
        #       with estimate of proportion of invariable sites 
        # Available AA substitution models: DAYHOFF, DCMUT, JTT, MTREV, WAG, 
        # RTREV, CPREV, VT, BLOSUM62, MTMAM, LG, GTR. With the optional "F" 
        # appendix you can specify if you want to use empirical base frequencies
        # Please note that for mixed models you can in addition specify the 
        # per-gene AA model in the mixed model file (see manual for details). 
        # Also note that if you estimate AA GTR parameters on a partitioned
        # dataset, they will be linked (estimated jointly) across all partitions 
        # to avoid over-parametrization
        '-m':ValuedParameter('-',Name='m',Delimiter=' '),

        # Switch on estimation of individual per-partition branch lengths. Only 
        # has effect when used in combination with "-q". Branch lengths for 
        # individual partitions will be printed to separate files. A weighted 
        # average of the branch lengths is computed by using the respective 
        # partition lengths. 
        # DEFAULT: OFF
        '-M':FlagParameter('-',Name='M'),
        
        # Specifies the name of the output file.
        '-n':ValuedParameter('-',Name='n',Delimiter=' '),

        # Specifies the name of the outgroup (or outgroups: comma-delimited,
        # no spaces, should be monophyletic).
        '-o':ValuedParameter('-',Name='o',Delimiter=' '),

        # Enable checkpointing using the dmtcp library available at 
        # http://dmtcp.sourceforge.net/. This only works if you call the program 
        # preceded by the command "dmtcp_checkpoint" and if you compile a 
        # dedicated binary using the appropriate Makefile. With "-O" you can 
        # specify the interval between checkpoints in seconds.
        # DEFAULT: 3600.0 seconds - DEPRECATED IN 7.3.0
        #'-O':ValuedParameter('-',Name='O',Delimiter=' ',Value=3600.0),

        # Specify a random number seed for the parsimony inferences. This allows 
        # you to reproduce your results and will help me debug the program.
        '-p':ValuedParameter('-',Name='p',Delimiter=' '),
        
        # Specify the file name of a user-defined AA (Protein) substitution 
        # model. This file must contain 420 entries, the first 400 being the AA 
        # substitution rates (this must be a symmetric matrix) and the last 20 
        # are the empirical base frequencies
        '-P':ValuedParameter('-',Name='P',Delimiter=' '),

        # Specified MultipleModel file name, in format:
        #    gene1 = 1-500
        #    gene2 = 501-1000
        #    (note: ranges can also be discontiguous, e.g. 1-100, 200-300,
        #     or can specify codon ranges as e.g. 1-100/3, 2-100/3, 3-100/3))
        '-q':ValuedParameter('-', Name='q', Delimiter=' '),

        # THE FOLLOWING "Q" is DEPRECATED IN 7.2.8
        # Turn on computation of SH-like support values on tree.
        # DEFAULT: OFF
        '-Q':FlagParameter('-', Name='Q'),
        
        # Constraint file name: allows a bifurcating Newick tree to be passed
        # in as a constraint file, other taxa will be added by parsimony.
        '-r':ValuedParameter('-',Name='r',Delimiter=' '),
        
        # THE FOLLOWING "R" is IN 7.2.8 
        # Specify the file name of a binary model parameter file that has
        # previously been generated with RAxML using the -f e tree evaluation
        # option. The file name should be:  RAxML_binaryModelParameters.runID
        '-R':ValuedParameter('-',Name='R',Delimiter=' '),
        
        # specify the name of the alignment data file, in relaxed PHYLIP
        # format.
        '-s':ValuedParameter('-',Name='s',Delimiter=' '),

        # Specify the name of a secondary structure file. The file can contain 
        # "." for alignment columns that do not form part of a stem and 
        # characters "()<>[]{}" to define stem regions and pseudoknots
        '-S':ValuedParameter('-',Name='S',Delimiter=' '),
        
        # Specify a user starting tree file name in Newick format
        '-t':ValuedParameter('-',Name='t',Delimiter=' '),

        # PTHREADS VERSION ONLY! Specify the number of threads you want to run.
        # Make sure to set "-T" to at most the number of CPUs you have on your 
        # machine, otherwise, there will be a huge performance decrease! 
        '-T':ValuedParameter('-',Name='T',Delimiter=' '),
        
        # THE FOLLOWING "U" is IN 7.2.8 
        # Try to save memory by using SEV-based implementation for gap columns
        # on large gappy alignments
        # WARNING: this will only work for DNA under GTRGAMMA and is still in an
        # experimental state.
        '-U':ValuedParameter('-',Name='U',Delimiter=' '),
        
        # Print the version
        '-v':FlagParameter('-',Name='v'),

        # Name of the working directory where RAxML-V will write its output 
        # files.
        '-w':ValuedParameter('-',Name='w',Delimiter=' '),

        # THE FOLLOWING "W" is IN 7.2.8
        # Sliding window size for leave-one-out site-specific placement bias
        # algorithm only effective when used in combination with "-f S" 
        #   DEFAULT: 100 sites
        '-W':ValuedParameter('-',Name='W',Delimiter=' '),
        
        # Specify an integer number (random seed) and turn on rapid 
        # bootstrapping. CAUTION: unlike in version 7.0.4 RAxML will conduct 
        # rapid BS replicates under the model of rate heterogeneity you 
        # specified via "-m" and not by default under CAT
        '-x':ValuedParameter('-',Name='x',Delimiter=' '),
        
        # EXPERIMENTAL OPTION: This option will do a per-site estimate of
        # protein substitution models by looping over all given, fixed models
        # LG, WAG, JTT, etc and using their respective base frequencies to
        # independently assign a prot subst. model to each site via ML
        # optimization. At present this option only works with the GTR+GAMMA
        # model, unpartitioned datasets, and in the sequential version only.
        #   DEFAULT: OFF
        '-X':FlagParameter('-', Name='X'),

        # Compute only randomized starting parsimony tree with RAxML, do not
        # optimize an ML analysis of the tree
        '-y':FlagParameter('-', Name='y'),

        # Do a more thorough parsimony tree search using a parsimony ratchet and 
        # exit. Specify the number of ratchet searches via "-#" or "-N". This 
        # has just been implemented for completeness, if you want a fast MP 
        # implementation use TNT
        # DEFAULT: OFF - DEPRECATED IN 7.3.0
        #'-Y':FlagParameter('-', Name='Y'),

        # Multiple tree file, for use with -f b (to draw bipartitions onto the
        # common tree specified with -t)
        '-z':ValuedParameter('-', Name='z', Delimiter=' '),

        # Specifies number of runs on distinct starting trees.
        '-#':ValuedParameter('-', Name='#', Delimiter=' ',Value=1),

        # Specifies number of runs on distinct starting trees.
        '-N':ValuedParameter('-', Name='N', Delimiter=' '),

    }

    _parameters = {}
    _parameters.update(_options)
    _command = "raxmlHPC"
    _out_format = "RAxML_%s.%s"

    def _format_output(self, outfile_name, out_type):
        """ Prepend proper output prefix to output filename """

        outfile_name = self._absolute(outfile_name)
        outparts = outfile_name.split("/") 
        outparts[-1] = self._out_format % (out_type, outparts[-1] )

        return '/'.join(outparts)

    def _input_as_seqs(self,data):
        lines = []
        for i,s in enumerate(data):
            #will number the sequences 1,2,3,etc.
            lines.append(''.join(['>',str(i+1)]))
            lines.append(s)
        return self._input_as_lines(lines)

    def _input_as_lines(self,data):
        if data:
            self.Parameters['-s']\
                .on(super(Raxml,self)._input_as_lines(data))
        return ''

    def _input_as_string(self,data):
        """Makes data the value of a specific parameter
     
        This method returns the empty string. The parameter will be printed
        automatically once set.
        """
        if data:
            self.Parameters['-s'].on(str(data))
        return ''

    def _input_as_multiline_string(self, data):
        if data:
            self.Parameters['-s']\
                .on(super(Raxml,self)._input_as_multiline_string(data))
        return ''
   
    def _absolute(self,path):
        path = FilePath(path)
        if isabs(path):
            return path
        elif self.Parameters['-w'].isOn():
            return self.Parameters['-w'].Value + path
        else:
            return self.WorkingDir + path

    def _log_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), "log")
        else:
            raise ValueError, "No output file specified." 

    def _info_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), "info")
        else:
            raise ValueError, "No output file specified." 

    def _parsimony_tree_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), \
                                            "parsimonyTree")
        else:
            raise ValueError, "No output file specified." 
    
    # added for tree-insertion
    def _originallabelled_tree_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), \
                                            "originalLabelledTree")
        else:
            raise ValueError, "No output file specified."
    
    # added for tree-insertion
    def _labelled_tree_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), \
                                            "labelledTree")
        else:
            raise ValueError, "No output file specified."

    # added for tree-insertion
    def _classification_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), \
                                            "classification")
        else:
            raise ValueError, "No output file specified."
    
    # added for tree-insertion
    def _classificationlikelihoodweights_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), \
                                            "classificationLikelihoodWeights")
        else:
            raise ValueError, "No output file specified."
    
    # added for tree-insertion
    def _best_tree_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), \
                                            "bestTree")
        else:
            raise ValueError, "No output file specified."

    # added for tree-insertion
    def _entropy_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), \
                                            "entropy")
        else:
            raise ValueError, "No output file specified."

    # added for tree-insertion
    def _json_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), \
                                            "portableTree")
        else:
            raise ValueError, "No output file specified."
    
    # added for tree-insertion
    def _parsimony_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), \
                                            "equallyParsimoniousPlacements")
        else:
            raise ValueError, "No output file specified."
            
    def _result_tree_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), \
                                            "result")
        else:
            raise ValueError, "No output file specified." 

    def _result_bootstrap_out_filename(self):
        if self.Parameters['-n'].isOn():
            return self._format_output(str(self.Parameters['-n'].Value), \
                                            "bootstrap")
        else:
            raise ValueError, "No output file specified"

    def _checkpoint_out_filenames(self):
        """
        RAxML generates a large number of checkpoint files, so we need to
        walk the directory to collect the names of all of them.
        """
        out_filenames = []
        if self.Parameters['-n'].isOn():
            out_name = str(self.Parameters['-n'].Value)
            walk_root = self.WorkingDir
            if self.Parameters['-w'].isOn(): 
                walk_root = str(self.Parameters['-w'].Value)
            for tup in walk(walk_root):
                dpath, dnames, dfiles = tup
                if dpath == walk_root:
                    for gen_file in dfiles:
                        if out_name in gen_file and "checkpoint" in gen_file:
                            out_filenames.append(walk_root + gen_file)
                    break

        else:
            raise ValueError, "No output file specified." 
        return out_filenames

    def _handle_app_result_build_failure(self,out,err,exit_status,result_paths):
        """ Catch the error when files are not produced """

        try:
            error_msg = ('RAxML failed to produce an output file due to the '
                         'following error: \n\n%s' % err.read())
        except Exception:
            error_msg = 'RAxML failed to run properly.'
        raise ApplicationError(error_msg)

    def _get_result_paths(self,data):

        result = {}
        result['Info'] = ResultPath(Path=self._info_out_filename(),
                                            IsWritten=True)
        if self.Parameters['-k'].isOn():
            result['Bootstrap'] = ResultPath(
                            Path=self._result_bootstrap_out_filename(),
                            IsWritten=True)
        elif self.Parameters["-f"].Value == 'v':
            #these were added to handle the results from tree-insertion
            result['Classification'] = ResultPath(
                Path=self._classification_out_filename(),
                IsWritten=True)
            result['ClassificationLikelihoodWeights'] = ResultPath(  
                Path=self._classificationlikelihoodweights_out_filename(),
                IsWritten=True)
            result['OriginalLabelledTree'] = ResultPath(  
                Path=self._originallabelled_tree_out_filename(),
                IsWritten=True)
            result['Result'] = ResultPath(
                Path=self._labelled_tree_out_filename(),IsWritten=True)
            result['entropy'] = ResultPath(
                Path=self._entropy_out_filename(),IsWritten=True)
            result['json'] = ResultPath(
                Path=self._json_out_filename()+'.jplace',IsWritten=True)
        elif self.Parameters["-f"].Value == 'y':
            #these were added to handle the results from tree-insertion
            
            result['Parsimony'] = ResultPath(  
                Path=self._parsimony_out_filename(),
                IsWritten=True)
            result['OriginalLabelledTree'] = ResultPath(  
                Path=self._originallabelled_tree_out_filename(),
                IsWritten=True)
            result['json'] = ResultPath(
                Path=self._json_out_filename()+'.jplace',IsWritten=True)
        else:
            result['Log'] = ResultPath(Path=self._log_out_filename(),
                                            IsWritten=True)
            result['ParsimonyTree'] = ResultPath(
                                      Path=self._parsimony_tree_out_filename(),
                                      IsWritten=True)
            result['Result'] = ResultPath(
                            Path=self._result_tree_out_filename(),
                            IsWritten=True)
            #
            result['besttree'] = ResultPath(
                            Path=self._best_tree_out_filename(),
                            IsWritten=True)
        
        for checkpoint_file in self._checkpoint_out_filenames():
            checkpoint_num = checkpoint_file.split(".")[-1]
            try:
                checkpoint_num = int(checkpoint_num)
            except Exception:
                raise ValueError(
                    "%s does not appear to be a valid checkpoint file" %
                    checkpoint_file)
            result['Checkpoint%d' % checkpoint_num] = ResultPath(
                        Path=checkpoint_file,
                        IsWritten=True)
 
        return result
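
A minimal usage sketch for the Raxml controller above, running the default hill-climbing search (-f d). The alignment path is a placeholder and is assigned directly to '-s', so no data needs to be passed to the call itself:

raxml = Raxml()
raxml.Parameters['-m'].on('GTRGAMMA')       # nucleotide substitution model
raxml.Parameters['-s'].on('alignment.phy')  # relaxed PHYLIP alignment (placeholder path)
raxml.Parameters['-n'].on('run1')           # run name; outputs become RAxML_*.run1
raxml.Parameters['-p'].on(12345)            # parsimony random seed
raxml_result = raxml()                      # '-s' is already set, so no call data is needed
best_tree = raxml_result['Result'].read()   # RAxML_result.run1
run_info = raxml_result['Info'].read()      # RAxML_info.run1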
Example No. 15
class Clearcut(CommandLineApplication):
    """ clearcut application controller 
   
    The parameters are organized by function to give some idea of how the 
    program works. However, no restrictions are put on any combinations 
    of parameters. Misuse of parameters can lead to errors or otherwise
    strange results.
    """
    #General options.
    _general = {\
        # --verbose.  More Output. (Default:OFF)

        '-v':FlagParameter('-',Name='v'),
        # --quiet.  Silent operation. (Default: ON)
        '-q':FlagParameter('-',Name='q',Value=True),
        # --seed=<seed>.  Explicitly set the PRNG seed to a specific value.
        '-s':ValuedParameter('-',Name='s',Delimiter='='),
        # --norandom.  Attempt joins deterministically.  (Default: OFF)
        '-r':FlagParameter('-',Name='r'),
        # --shuffle.  Randomly shuffle the distance matrix.  (Default: OFF)
        '-S':FlagParameter('-',Name='S'),
        #--neighbor.  Use traditional Neighbor-Joining algorithm. (Default: OFF)
        '-N':FlagParameter('-',Name='N'),

        }

    # Input file is distance matrix or alignment.  Default expects distance
    # matrix.  Output file is tree created by clearcut.
    _input = {\
        # --in=<infilename>.  Input file

        '--in':ValuedParameter('--',Name='in',Delimiter='=',IsPath=True),
        # --stdin.  Read input from STDIN.
        '-I':FlagParameter('-',Name='I'),
        # --distance.  Input file is a distance matrix. (Default: ON)
        '-d':FlagParameter('-',Name='d',Value=True),
        # --alignment.  Input file is a set of aligned sequences.
        #     (Default: OFF)
        '-a':FlagParameter('-',Name='a'),
        # --DNA.  Input alignment are DNA sequences.
        '-D':FlagParameter('-',Name='D'),
        # --protein.  Input alignment are protein sequences.
        '-P':FlagParameter('-',Name='P'),
        }

    #Correction model for computing distance matrix (Default: NO Correction):
    _correction={\
        # --jukes.  Use Jukes-Cantor correction for computing distance matrix.

        '-j':FlagParameter('-',Name='j'),
        # --kimura.  Use Kimura correction for distance matrix.
        '-k':FlagParameter('-',Name='k'),

        }

    _output={\
        # --out=<outfilename>.  Output file

        '--out':ValuedParameter('--',Name='out',Delimiter='=',IsPath=True),
        # --stdout.  Output tree to STDOUT.
        '-O':FlagParameter('-',Name='O'),
        # --matrixout=<file> Output distance matrix to specified file.
        '-m':ValuedParameter('-',Name='m',Delimiter='='),
        # --ntrees=<n>.  Output n trees.  (Default: 1)
        '-n':ValuedParameter('-',Name='n',Delimiter='='),
        # --expblen.  Exponential notation for branch lengths. (Default: OFF)
        '-e':FlagParameter('-',Name='e'),
        # --expdist.  Exponential notation in distance output. (Default: OFF)
        '-E':FlagParameter('-',Name='E'),

        }

    #NOT SUPPORTED
    #'-h':FlagParameter('-','h'),       #Help
    #'-V':FlagParameter('-','V'),       #Version

    _parameters = {}
    _parameters.update(_general)
    _parameters.update(_input)
    _parameters.update(_correction)
    _parameters.update(_output)

    _command = 'clearcut'

    def getHelp(self):
        """Method that points to the Clearcut documentation."""
        help_str =\
        """
        See Clearcut homepage at:
        http://bioinformatics.hungry.com/clearcut/
        """
        return help_str

    def _input_as_multiline_string(self, data):
        """Writes data to tempfile and sets -infile parameter

        data -- list of lines
        """
        if data:
            self.Parameters['--in']\
                .on(super(Clearcut,self)._input_as_multiline_string(data))
        return ''

    def _input_as_lines(self, data):
        """Writes data to tempfile and sets -infile parameter

        data -- list of lines, ready to be written to file
        """
        if data:
            self.Parameters['--in']\
                .on(super(Clearcut,self)._input_as_lines(data))
        return ''

    def _input_as_seqs(self, data):
        """writes sequences to tempfile and sets -infile parameter

        data -- list of sequences

        Adds numbering to the sequences: >1, >2, etc.
        """
        lines = []
        for i, s in enumerate(data):
            #will number the sequences 1,2,3,etc.
            lines.append(''.join(['>', str(i + 1)]))
            lines.append(s)
        return self._input_as_lines(lines)

    def _input_as_string(self, data):
        """Makes data the value of a specific parameter
    
        This method returns the empty string. The parameter will be printed
        automatically once set.
        """
        if data:
            self.Parameters['--in'].on(data)
        return ''

    def _tree_filename(self):
        """Return the name of the file that will contain the output tree."""
        if self.Parameters['--out'].isOn():
            tree_filename = self._absolute(self.Parameters['--out'].Value)
        else:
            raise ValueError("No tree output file specified.")
        return tree_filename

    def _get_result_paths(self, data):
        """Return dict of {key: ResultPath}
        """
        result = {}
        if self.Parameters['--out'].isOn():
            out_name = self._tree_filename()
            result['Tree'] = ResultPath(Path=out_name, IsWritten=True)
        return result
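
A minimal usage sketch for the Clearcut controller above, switching the input mode from the default distance matrix to a DNA alignment; the alignment path is a placeholder:

clearcut = Clearcut()
clearcut.Parameters['-d'].off()            # not a distance matrix...
clearcut.Parameters['-a'].on()             # ...but an alignment
clearcut.Parameters['-D'].on()             # of DNA sequences
clearcut.Parameters['--out'].on('clearcut_tree.nwk')
clearcut_result = clearcut('aligned_seqs.fasta')  # routed via _input_as_string to --in=<path>
newick_tree = clearcut_result['Tree'].read()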
Example No. 16
class Pplacer(CommandLineApplication):
    """pplacer Application Controller
    """

    _command = 'pplacer'
    _input_handler = '_input_as_multiline_string'
    _parameters = {
        # -c Specify the path to the reference package.
        '-c':
        ValuedParameter('-', Name='c', Delimiter=' ', IsPath=True),

        # -t Specify the reference tree filename.
        '-t':
        ValuedParameter('-', Name='t', Delimiter=' ', IsPath=True),

        # -r Specify the reference alignment filename.
        '-r':
        ValuedParameter('-', Name='r', Delimiter=' ', IsPath=True),

        # -s Supply a phyml stats.txt or a RAxML info file giving the model parameters.
        '-s':
        ValuedParameter('-', Name='s', Delimiter=' ', IsPath=True),

        # -d Specify the directory containing the reference information.
        '-d':
        ValuedParameter('-', Name='d', Delimiter=' ', IsPath=True),

        # -p Calculate posterior probabilities.
        '-p':
        FlagParameter('-', Name='p'),

        # -m Substitution model. Protein models: LG, WAG, or JTT. Nucleotide model: GTR.
        '-m':
        ValuedParameter('-', Name='m', Delimiter=' '),

        # --model-freqs Use model frequencies instead of reference alignment frequencies.
        '--model-freqs':
        FlagParameter('--', Name='model-freqs'),

        # --gamma-cats Number of categories for discrete gamma model.
        '--gamma-cats':
        ValuedParameter('--', Name='gamma-cats', Delimiter=' '),

        # --gamma-alpha Specify the shape parameter for a discrete gamma model.
        '--gamma-alpha':
        ValuedParameter('--', Name='gamma-alpha', Delimiter=' '),

        # --ml-tolerance 1st stage branch len optimization tolerance (2nd stage to 1e-5). Default: 0.01.
        '--ml-tolerance':
        ValuedParameter('--', Name='ml-tolerance', Delimiter=' '),

        # --pp-rel-err Relative error for the posterior probability calculation. Default is 0.01.
        '--pp-rel-err':
        ValuedParameter('--', Name='pp-rel-err', Delimiter=' '),

        # --unif-prior Use a uniform prior rather than exponential.
        '--unif-prior':
        FlagParameter('--', Name='unif-prior'),

        # --start-pend Starting pendant branch length. Default is 0.1.
        '--start-pend':
        ValuedParameter('--', Name='start-pend', Delimiter=' '),

        # --max-pend Set the maximum ML pendant branch length. Default is 2.
        '--max-pend':
        ValuedParameter('--', Name='max-pend', Delimiter=' '),

        # --max-strikes Maximum number of strikes for baseball. 0 -> no ball playing. Default is 6.
        '--max-strikes':
        ValuedParameter('--', Name='max-strikes', Delimiter=' '),

        # --strike-box Set the size of the strike box in log likelihood units. Default is 3.
        '--strike-box':
        ValuedParameter('--', Name='strike-box', Delimiter=' '),

        # --max-pitches Set the maximum number of pitches for baseball. Default is 40.
        '--max-pitches':
        ValuedParameter('--', Name='max-pitches', Delimiter=' '),

        # --fantasy Desired likelihood cutoff for fantasy baseball mode. 0 -> no fantasy.
        '--fantasy':
        ValuedParameter('--', Name='fantasy', Delimiter=' '),

        # --fantasy-frac Fraction of fragments to use when running fantasy baseball. Default is 0.1.
        '--fantasy-frac':
        ValuedParameter('--', Name='fantasy-frac', Delimiter=' '),

        # --write-masked Write alignment masked to the region without gaps in the query.
        '--write-masked':
        FlagParameter('--', Name='write-masked'),

        # --verbosity Set verbosity level. 0 is silent, and 2 is quite a lot. Default is 1.
        '--verbosity':
        ValuedParameter('--', Name='verbosity', Delimiter=' '),

        # --unfriendly Do not run friend finder pre-analysis.
        '--unfriendly':
        FlagParameter('--', Name='unfriendly'),

        # --out-dir Specify the directory to write place files to.
        '--out-dir':
        ValuedParameter('--', Name='out-dir', Delimiter=' ', IsPath=True),

        # --pretend Only check out the files then report. Do not run the analysis.
        '--pretend':
        FlagParameter('--', Name='pretend'),

        # --csv Make a CSV file with the results.
        '--csv':
        FlagParameter('--', Name='csv'),

        # --old-format Make an old-format placefile with the results.
        '--old-format':
        FlagParameter('--', Name='old-format'),

        # --diagnostic Write file describing the 'diagnostic' mutations for various clades.
        '--diagnostic':
        FlagParameter('--', Name='diagnostic'),

        # --check-like Write out the likelihood of the reference tree, calculated two ways.
        '--check-like':
        FlagParameter('--', Name='check-like'),

        # --version Write out the version number and exit.
        '--version':
        FlagParameter('--', Name='version'),

        # --help  Display this list of options
        '--help':
        FlagParameter('--', Name='help'),
    }

    def getTmpFilename(self, tmp_dir="/tmp",prefix='tmp',suffix='.fasta',\
           include_class_id=False,result_constructor=FilePath):
        """ Define Tmp filename to contain .fasta suffix, since pplacer requires
            the suffix to be .fasta """

        return super(Pplacer, self).getTmpFilename(
            tmp_dir=tmp_dir,
            prefix=prefix,
            suffix=suffix,
            include_class_id=include_class_id,
            result_constructor=result_constructor)

    def _handle_app_result_build_failure(self, out, err, exit_status,
                                         result_paths):
        """ Catch the error when files are not produced """
        raise ApplicationError(
            'Pplacer failed to produce an output file due to the following '
            'error: \n\n%s ' % out.read())

    def _get_result_paths(self, data):
        """ Define the output filepaths """
        output_dir = self.Parameters['--out-dir'].Value
        result = {}
        result['json'] = ResultPath(Path=join(output_dir,
                                splitext(split(self._input_filename)[-1])[0] + \
                                '.jplace'))
        return result
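
# A hedged sketch of driving the Pplacer controller above (not part of the
# original module). The reference package path, query alignment and output
# directory are placeholders; the 'json' result key follows _get_result_paths,
# and the getTmpFilename override above guarantees the .fasta suffix pplacer
# expects for the temp input file.
query_aln = '>query1\nACGT-ACGTACGT\n'           # FASTA-formatted query alignment

pplacer = Pplacer(WorkingDir='/tmp/')
pplacer.Parameters['-c'].on('/tmp/my_refpkg')    # placeholder reference package
pplacer.Parameters['--out-dir'].on('/tmp')       # directory for the .jplace file

result = pplacer(query_aln)                      # uses _input_as_multiline_string
placements = result['json'].read()               # contents of the .jplace placement file
result.cleanUp()
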
Example No. 17
0
class CMfinder(CommandLineApplication):
    """The application controller for CMfinder 0.2 application


    Options:
    -b               Do not use BLAST search to locate anchors
    -v               Verbose. Print running information, and save intermediate
                     results
    -c <number>      The maximum number of candidates in each sequence. Default
                     40. No bigger than 100.
    -m <number>      The minimum length of candidates. Default 30
    -M <number>      The maximum length of candidates. Default 100
    -n <number>      The maximum number of output motifs. Default 3
    -f <number>      The fraction of the sequences expected to contain the
                     motif. Default 0.80
    -s <number>      The number of stem-loops in the motif
    -h               Show help


    """
    #-n default is 3, set to 3 because of resultpath concerns
    _parameters = {
        '-b': FlagParameter(Prefix='-', Name='b', Value=True),
        '-v': FlagParameter(Prefix='-', Name='v'),
        '-c': ValuedParameter(Prefix='-', Name='c', Value=None, Delimiter=' '),
        '-m': ValuedParameter(Prefix='-', Name='m', Value=None, Delimiter=' '),
        '-M': ValuedParameter(Prefix='-', Name='M', Value=None, Delimiter=' '),
        '-n': ValuedParameter(Prefix='-', Name='n', Value=3, Delimiter=' '),
        '-f': ValuedParameter(Prefix='-', Name='f', Value=None, Delimiter=' '),
        '-s': ValuedParameter(Prefix='-', Name='s', Value=None, Delimiter=' ')
    }
    _command = 'cmfinder.pl'
    _input_handler = '_input_as_string'

    def _get_result_paths(self, data):
        """Specifies the paths of output files generated by the application
        
        data: the data the instance the application is called on
        
        CMfinder produces it's output in two files .align and .motif
        it also prints an output to sdtout.

        """
        result = {}
        if not isinstance(data, list):
            inputPath = str(data)
        else:
            inputPath = self._input_filename
        itr = self.Parameters['-n'].Value
        for i in range(itr):
            nr = str(i + 1)
            try:  #unknown nr of output files
                f = open(
                    (inputPath + '.motif.h1.' + nr))  #if exists add to path
                f.close()
                result[('cm_'+nr)] =\
                    ResultPath(Path=(inputPath+'.cm.h1.'+nr))
                result[('motif_'+nr)] =\
                    ResultPath(Path=(inputPath+'.motif.h1.'+nr))

            except IOError:  # else no more outputs
                break
        if self._input_filename is not None:
            result['_input_filename'] = ResultPath(self._input_filename)

        if isfile(self.WorkingDir + 'latest.cm'):
            result['latest'] =\
                ResultPath(Path=(self.WorkingDir+'latest.cm'))
        else:
            pass

        return result
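
# Illustrative only (not part of the original module): running the CMfinder
# controller above on an existing FASTA file. The input path is a placeholder;
# the result keys cm_1/motif_1, cm_2/motif_2, ... exist only for motifs that
# were actually produced, because _get_result_paths above probes for the files.
cmfinder = CMfinder(WorkingDir='/tmp/')
cmfinder.Parameters['-n'].on(3)                  # at most three motifs (the default)

result = cmfinder('/tmp/candidate_seqs.fasta')   # placeholder input path
found_motifs = [key for key in ('motif_1', 'motif_2', 'motif_3') if key in result]
result.cleanUp()
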
Example No. 18
0
class SeqPrep(CommandLineApplication):
    """SeqPrep application controller for joining paired-end reads"""
    _command = 'SeqPrep'
    _parameters = {
        # Required Arguments
        # -f <first read input fastq filename>
        # -r <second read input fastq filename>
        # -1 <first read output fastq filename>
        # -2 <second read output fastq filename>
        '-f': ValuedParameter(Prefix='-', Delimiter=' ', Name='f'),
        '-r': ValuedParameter(Prefix='-', Delimiter=' ', Name='r'),
        '-1': ValuedParameter(Prefix='-', Delimiter=' ', Name='1'),
        '-2': ValuedParameter(Prefix='-', Delimiter=' ', Name='2'),

        # General Arguments (Optional):
        # -3 <first read discarded fastq filename>
        # -4 <second read discarded fastq filename>
        # -h Display this help message and exit (also works with no args)
        # -6 Input sequence is in phred+64 rather than phred+33 format, the
        #    output will still be phred+33
        # -q <Quality score cutoff for mismatches to be counted in overlap; default = 13>
        # -L <Minimum length of a trimmed or merged read to print it; default = 30>
        '-3': ValuedParameter(Prefix='-', Delimiter=' ', Name='3'),
        '-4': ValuedParameter(Prefix='-', Delimiter=' ', Name='4'),
        '-h': FlagParameter(Prefix='-', Name='h'),
        '-6': FlagParameter(Prefix='-', Name='6'),
        '-q': ValuedParameter(Prefix='-', Delimiter=' ', Name='q'),
        '-L': ValuedParameter(Prefix='-', Delimiter=' ', Name='L'),

        # Arguments for Adapter/Primer Trimming (Optional):
        # -A <forward read primer/adapter sequence to trim as it would appear at the
        #   end of a read (recommend about 20bp of this)
        #	(should validate by grepping a file);
        #   default (genomic non-multiplexed adapter1) = AGATCGGAAGAGCGGTTCAG>
        # -B <reverse read primer/adapter sequence to trim as it would appear at the
        #   end of a read (recommend about 20bp of this)
        #	(should validate by grepping a file);
        #   default (genomic non-multiplexed adapter2) = AGATCGGAAGAGCGTCGTGT>
        # -O <minimum overall base pair overlap with adapter sequence to trim;
        #   default = 10>
        # -M <maximum fraction of good quality mismatching bases for primer/adapter
        #    overlap; default = 0.020000>
        # -N <minimum fraction of matching bases for primer/adapter overlap;
        #   default = 0.870000>
        # -b <adapter alignment band-width; default = 50>
        # -Q <adapter alignment gap-open; default = 8>
        # -t <adapter alignment gap-extension; default = 2>
        # -e <adapter alignment gap-end; default = 2>
        # -Z <adapter alignment minimum local alignment score cutoff
        #   [roughly (2*num_hits) - (num_gaps*gap_open) - (num_gaps*gap_close) -
        #   (gap_len*gap_extend) - (2*num_mismatches)]; default = 26>
        # -w <read alignment band-width; default = 50>
        # -W <read alignment gap-open; default = 26>
        # -p <read alignment gap-extension; default = 9>
        # -P <read alignment gap-end; default = 5>
        # -X <read alignment maximum fraction gap cutoff; default = 0.125000>
        '-A': ValuedParameter(Prefix='-', Delimiter=' ', Name='A'),
        '-B': ValuedParameter(Prefix='-', Delimiter=' ', Name='B'),
        '-O': ValuedParameter(Prefix='-', Delimiter=' ', Name='O'),
        '-M': ValuedParameter(Prefix='-', Delimiter=' ', Name='M'),
        '-N': ValuedParameter(Prefix='-', Delimiter=' ', Name='N'),
        '-b': ValuedParameter(Prefix='-', Delimiter=' ', Name='b'),
        '-Q': ValuedParameter(Prefix='-', Delimiter=' ', Name='Q'),
        '-t': ValuedParameter(Prefix='-', Delimiter=' ', Name='t'),
        '-e': ValuedParameter(Prefix='-', Delimiter=' ', Name='e'),
        '-Z': ValuedParameter(Prefix='-', Delimiter=' ', Name='Z'),
        '-w': ValuedParameter(Prefix='-', Delimiter=' ', Name='w'),
        '-W': ValuedParameter(Prefix='-', Delimiter=' ', Name='W'),
        '-p': ValuedParameter(Prefix='-', Delimiter=' ', Name='p'),
        '-P': ValuedParameter(Prefix='-', Delimiter=' ', Name='P'),
        '-X': ValuedParameter(Prefix='-', Delimiter=' ', Name='X'),

        # Optional Arguments for Merging:
        # -y <maximum quality score in output ((phred 33) default = ']' )>
        # -g <print overhang when adapters are present and stripped (use this if
        #   reads are different length)>
        # -s <perform merging and output the merged reads to this file>
        # -E <write pretty alignments to this file for visual Examination>
        # -x <max number of pretty alignments to write (if -E provided);
        #   default = 10000>
        # -o <minimum overall base pair overlap to merge two reads; default = 15>
        # -m <maximum fraction of good quality mismatching bases to overlap reads;
        #   default = 0.020000>
        # -n <minimum fraction of matching bases to overlap reads;
        #   default = 0.900000>
        '-y': ValuedParameter(Prefix='-', Delimiter=' ', Name='y'),
        '-g': FlagParameter(Prefix='-', Name='g'),
        '-s': ValuedParameter(Prefix='-', Delimiter=' ', Name='s'),
        '-E': ValuedParameter(Prefix='-', Delimiter=' ', Name='E'),
        '-x': ValuedParameter(Prefix='-', Delimiter=' ', Name='x'),
        '-o': ValuedParameter(Prefix='-', Delimiter=' ', Name='o'),
        '-m': ValuedParameter(Prefix='-', Delimiter=' ', Name='m'),
        '-n': ValuedParameter(Prefix='-', Delimiter=' ', Name='n')
    }

    def _unassembled_reads1_out_file_name(self):
        """Checks file name is set for reads1 output. 
           Returns absolute path."""
        if self.Parameters['-1'].isOn():
            unassembled_reads1 = self._absolute(
                str(self.Parameters['-1'].Value))
        else:
            raise ValueError("No reads1 (flag: -1) output path specified")
        return unassembled_reads1

    def _unassembled_reads2_out_file_name(self):
        """Checks if file name is set for reads2 output. 
           Returns absolute path."""
        if self.Parameters['-2'].isOn():
            unassembled_reads2 = self._absolute(
                str(self.Parameters['-2'].Value))
        else:
            raise ValueError("No reads2 (flag -2) output path specified")
        return unassembled_reads2

    def _discarded_reads1_out_file_name(self):
        """Checks if file name is set for discarded reads1 output. 
           Returns absolute path."""
        if self.Parameters['-3'].isOn():
            discarded_reads1 = self._absolute(str(self.Parameters['-3'].Value))
        else:
            raise ValueError(
                "No discarded-reads1 (flag -3) output path specified")
        return discarded_reads1

    def _discarded_reads2_out_file_name(self):
        """Checks if file name is set for discarded reads2 output. 
           Returns absolute path."""
        if self.Parameters['-4'].isOn():
            discarded_reads2 = self._absolute(str(self.Parameters['-4'].Value))
        else:
            raise ValueError(
                "No discarded-reads2 (flag -4) output path specified")
        return discarded_reads2

    def _assembled_out_file_name(self):
        """Checks file name is set for assembled output. 
           Returns absolute path."""
        if self.Parameters['-s'].isOn():
            assembled_reads = self._absolute(str(self.Parameters['-s'].Value))
        else:
            raise ValueError(
                "No assembled-reads (flag -s) output path specified")
        return assembled_reads

    def _pretty_alignment_out_file_name(self):
        """Checks file name is set for pretty alignment output. 
           Returns absolute path."""
        if self.Parameters['-E'].isOn():
            pretty_alignment = self._absolute(str(self.Parameters['-E'].Value))
        else:
            raise ValueError(
                "No pretty-=alignment (flag -E) output path specified")
        return pretty_alignment

    def _get_result_paths(self, data):
        """Captures SeqPrep output.
        
        """
        result = {}

        # Always output:
        result['UnassembledReads1'] = ResultPath(
            Path=self._unassembled_reads1_out_file_name(), IsWritten=True)
        result['UnassembledReads2'] = ResultPath(
            Path=self._unassembled_reads2_out_file_name(), IsWritten=True)

        # optional output, so we check for each
        # check for assembled reads file
        if self.Parameters['-s'].isOn():
            result['Assembled'] = ResultPath(
                Path=self._assembled_out_file_name(), IsWritten=True)

        # check for discarded (unassembled) reads1 file
        if self.Parameters['-3'].isOn():
            result['Reads1Discarded'] = ResultPath(
                Path=self._discarded_reads1_out_file_name(), IsWritten=True)

        # check for discarded (unassembled) reads2 file
        if self.Parameters['-4'].isOn():
            result['Reads2Discarded'] = ResultPath(
                Path=self._discarded_reads2_out_file_name(), IsWritten=True)

        # check for pretty-alignment file
        if self.Parameters['-E'].isOn():
            result['PrettyAlignments'] = ResultPath(
                Path=self._pretty_alignment_out_file_name(), IsWritten=True)

        return result

    def getHelp(self):
        """seqprep help"""
        help_str = """
        For basic help, type the following at the command line:
            'SeqPrep -h'

        Website:
            https://github.com/jstjohn/SeqPrep
        """
        return help_str
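
# A hypothetical SeqPrep run joining paired-end reads (not part of the
# original module); every path below is a placeholder. The -f/-r inputs and
# -1/-2 outputs are required, and -s adds the optional 'Assembled' result key.
seqprep = SeqPrep(WorkingDir='/tmp/')
seqprep.Parameters['-f'].on('/tmp/reads_R1.fastq')       # forward reads
seqprep.Parameters['-r'].on('/tmp/reads_R2.fastq')       # reverse reads
seqprep.Parameters['-1'].on('unassembled_R1.fastq.gz')   # required outputs
seqprep.Parameters['-2'].on('unassembled_R2.fastq.gz')
seqprep.Parameters['-s'].on('assembled.fastq.gz')        # optional merged reads

result = seqprep()                        # SeqPrep takes all inputs via parameters
merged_path = result['Assembled'].name    # gzipped FASTQ of merged read pairs
result.cleanUp()
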
Example No. 19
0
class PandaSeq(CommandLineApplication):
    """pandaseq application controller for joining paired-end reads """
    _command = 'pandaseq'
    _parameters = {
        # pandaseq 2.4 <*****@*****.**>
        # Usage: pandaseq -f forward.fastq -r reverse.fastq [-6] [-a] [-B]
        #    [-C module1 -C module2 ...] [-d flags] [-F] [-j] [-L maxlen]
        #    [-l minlen] [-N] [-o minoverlap] [-p forwardprimer]
        #    [-q reverseprimer] [-T threads] [-t threshold] > assembled.fastq

        # -6  Use PHRED+64 (CASAVA 1.3-1.7) instead of PHRED+33 (CASAVA 1.8+).
        '-6': FlagParameter(Prefix='-', Name='6'),

        # -a  Strip the primers after assembly, rather than before.
        '-a': FlagParameter(Prefix='-', Name='a'),

        # -B  Allow unbarcoded sequences (try this for BADID errors).
        '-B': FlagParameter(Prefix='-', Name='B'),

        # -C module   Load a sequence validation module.
        '-C': FlagParameter(Prefix='-', Name='C'),

        # -d flags    Control the logging messages. Capital to enable; small to disable.
        #    (R)econstruction detail.
        #    Sequence (b)uilding information.
        #    (F)ile processing.
        #    (k)-mer table construction.
        #    Show every (m)ismatch.
        #    Optional (s)tatistics.
        '-d': ValuedParameter(Prefix='-', Delimiter=' ', Name='d'),

        # -f  Input FASTQ file containing forward reads.
        '-f': ValuedParameter(Prefix='-', Delimiter=' ', Name='f'),

        # -F  Output FASTQ instead of FASTA.
        '-F': FlagParameter(Prefix='-', Name='F'),

        # -j  Input files are bzipped.
        '-j': FlagParameter(Prefix='-', Name='j'),

        # -k kmers    The number of k-mers in the table.
        '-k': ValuedParameter(Prefix='-', Delimiter=' ', Name='k'),

        # -L maxlen   Maximum length for a sequence
        '-L': ValuedParameter(Prefix='-', Delimiter=' ', Name='L'),

        # -l minlen   Minimum length for a sequence
        '-l': ValuedParameter(Prefix='-', Delimiter=' ', Name='l'),

        # -N  Eliminate all sequences with unknown nucleotides in the output.
        '-N': FlagParameter(Prefix='-', Name='N'),

        # -o minoverlap   Minimum overlap between forward and reverse reads (default = 1)
        '-o': ValuedParameter(Prefix='-', Delimiter=' ', Name='o'),

        # -p  Forward primer sequence or number of bases to be removed.
        '-p': ValuedParameter(Prefix='-', Delimiter=' ', Name='p'),

        # -q  Reverse primer sequence or number of bases to be removed.
        '-q': ValuedParameter(Prefix='-', Delimiter=' ', Name='q'),

        # -r  Input FASTQ file containing reverse reads.
        '-r': ValuedParameter(Prefix='-', Delimiter=' ', Name='r'),

        # -T thread   Run with a number of parallel threads.
        '-T': ValuedParameter(Prefix='-', Delimiter=' ', Name='T'),

        # -t  The minimum probability that a sequence must have to match a primer.
        #     (default = 6.000000e-01)
        '-t': ValuedParameter(Prefix='-', Delimiter=' ', Name='t'),
    }

    # No _get_result_paths needed as all results (the merged paired-ends)
    # are sent to STDOUT.

    def getHelp(self):
        """pandaseq help"""
        help_str = """
        For basic help, type the following at the command line:
            'pandaseq -h'
        """
        return help_str
Example No. 20
0
class BWA_index(BWA):
    """Controls the "index" subcommand of the bwa application.
    
    Valid input keys are: fasta_in
    """

    # the subcommand for bwa index
    _subcommand = "index"

    _parameters = {
        # which algorithm to use.
        # is
        # IS linear-time algorithm for constructing suffix array. It requires
        # 5.37N memory where N is the size of the database. IS is moderately
        # fast, but does not work with database larger than 2GB. IS is the
        # default algorithm due to its simplicity. The current codes for IS
        # algorithm are reimplemented by Yuta Mori.
        #
        # bwtsw
        # Algorithm implemented in BWT-SW. This method works with the whole
        # human genome, but it does not work with database smaller than 10MB
        # and it is usually slower than IS.
        #
        # DEFAULTs to auto-select (based on input fasta file size)
        '-a': ValuedParameter('-', Delimiter=' ', Name='a'),

        # prefix for the output index.
        # DEFAULTs to the base name of the input fasta file
        '-p': ValuedParameter('-', Delimiter=' ', Name='p'),

        # index files named as <in.fasta>.64.* instead of <in.fasta>.*
        '-6': FlagParameter('-', Name='6')
    }

    # The -a option can take only one of two possible values, and
    # the -p option allows the user to specify a prefix; for our purposes,
    # this prefix should be an absolute path
    _valid_arguments = {'-a': lambda x: x in ['is', 'bwtsw'], '-p': isabs}

    # For the position specific arguments, this is the order that they will
    # be written in the base command
    # input file keys beginning with _ are optional inputs
    _input_order = ['fasta_in']

    def _get_result_paths(self, data):
        """Gets the results for a run of bwa index.

        bwa index outputs 5 files when the index is created. The filename
        prefix will be the same as the input fasta, unless overridden with
        the -p option, and the 5 extensions are listed below:

        .amb
        .ann
        .bwt
        .pac
        .sa

        and these extensions (including the period) are the keys to the
        dictionary that is returned.
        """

        # determine the names of the files. The name will be the same as the
        # input fasta file unless overridden with the -p option
        if self.Parameters['-p'].isOn():
            prefix = self.Parameters['-p'].Value
        else:
            prefix = data['fasta_in']

        # the 5 output file suffixes
        suffixes = ['.amb', '.ann', '.bwt', '.pac', '.sa']
        out_files = {}
        for suffix in suffixes:
            out_files[suffix] = ResultPath(prefix + suffix, IsWritten=True)

        return out_files
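
# Illustrative only (not part of the original module): indexing a reference
# with the controller above. It assumes the BWA base class (defined elsewhere
# in this module) accepts a dict keyed by the entries in _input_order, which
# is what _get_result_paths above expects; the path is a placeholder.
bwa_index = BWA_index(WorkingDir='/tmp/')
result = bwa_index({'fasta_in': '/tmp/reference.fasta'})

# the five index files are keyed by their extensions
index_paths = dict((suffix, result[suffix].name)
                   for suffix in ('.amb', '.ann', '.bwt', '.pac', '.sa'))
result.cleanUp()
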
Example No. 21
0
class RNAalifold(CommandLineApplication):
    """Application controller for RNAalifold application

    reads aligned RNA sequences from stdin or file.aln and calculates
    their minimum free energy (mfe) structure,  partition  function
    (pf)  and  base pairing probability matrix.

OPTIONS
       -cv <float>
              Set  the weight of the covariance term in the energy function to
              factor. Default is 1.


       -nc <float>
              Set the penalty for non-compatible sequences in  the  covariance
              term of the energy function to factor. Default is 1.

       -E     Score pairs with endgaps same as gap-gap pairs.

       -mis   Output \"most informative sequence\" instead of simple consensus:
              For each column of the alignment output the set  of  nucleotides
              with frequency greater than average in IUPAC notation.

       -p     Calculate  the  partition  function and base pairing probability
              matrix in addition to the mfe structure. Default is  calculation
              of mfe structure only.

       -noLP  Produce structures without lonely pairs (helices of length 1).
              In the mfe case structures with lonely pairs are strictly
              forbidden. For partition function folding this disallows pairs
              that can only occur isolated. Setting this option provides a
              significant speedup.

       The -T, -d, -4, -noGU, -noCloseGU, -e, -P, -nsp, options should work as
       in RNAfold

       If using -C, constraints will be read from stdin; the alignment has to
       be given as a filename on the command line.

       For more info see respective man pages. 

    """

    _parameters = {
        '-cv': ValuedParameter(Prefix='-', Name='cv', Delimiter=' '),
        '-nc': ValuedParameter(Prefix='-', Name='nc', Delimiter=' '),
        '-E': FlagParameter(Prefix='-', Name='E'),
        '-mis': FlagParameter(Prefix='-', Name='mis'),
        '-noLP': FlagParameter(Prefix='-', Name='noLP'),
        '-T': ValuedParameter(Prefix='-', Name='T', Value=37, Delimiter=' '),
        '-4': FlagParameter(Prefix='-', Name='4'),
        '-d': MixedParameter(Prefix='-', Name='d', Delimiter=''),
        '-noGU': FlagParameter(Prefix='-', Name='noGU'),
        '-noCloseGU': FlagParameter(Prefix='-', Name='noCloseGU'),
        '-e': ValuedParameter(Prefix='-', Name='e', Delimiter=' '),
        '-P': ValuedParameter(Prefix='-', Name='P', Delimiter=' '),
        '-nsp': ValuedParameter(Prefix='-', Name='nsp', Delimiter=' '),
        '-C': FlagParameter(Prefix='-', Name='C')
    }

    _synonyms = {'Temperature': '-T', 'Temp': '-T', 'EnergyRange': '-e'}

    _command = 'RNAalifold'
    _input_handler = '_input_as_string'

    def _get_result_paths(self, data):
        """Specify the paths of the output files generated by the application

        You always get back: StdOut, StdErr, and ExitStatus.
        In addition RNAalifold writes a file: alirna.ps. It seems that this
            file is always written (no exceptions found so far).
        The documentation says the application can produce a dotplot
            (alidot.ps), but it is unclear when this file is produced, and
            thus it is not added to the results dictionary.
        """

        result = {}

        result['SS'] = ResultPath(Path=self.WorkingDir+'alirna.ps',\
            IsWritten=True)

        return result
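
# A minimal, hypothetical run of the RNAalifold controller above on an
# existing alignment file (placeholder path). The consensus structure is read
# from stdout and the structure drawing from the 'SS' (alirna.ps) result key.
rnaalifold = RNAalifold(WorkingDir='/tmp/')
rnaalifold.Parameters['-T'].on(37)            # folding temperature (default shown above)

result = rnaalifold('/tmp/alignment.aln')     # passed through _input_as_string
consensus = result['StdOut'].read()           # consensus structure and energies
structure_plot = result['SS'].name            # path to alirna.ps
result.cleanUp()
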
Example No. 22
0
class BWA_aln(BWA):
    """Controls the "aln" subcommand of the bwa application.
    
    Valid input keys are: prefix, fastq_in 
    """
    _parameters = {
        # max #diff (int) or missing prob under 0.02 err rate (float) [0.04]
        '-n': ValuedParameter('-', Delimiter=' ', Name='n'),
        #maximum number or fraction of gap opens [1]
        '-o': ValuedParameter('-', Delimiter=' ', Name='o'),

        #maximum number of gap extensions, -1 for disabling long gaps [-1]
        '-e': ValuedParameter('-', Delimiter=' ', Name='e'),

        #do not put an indel within bp towards the ends [5]
        '-i': ValuedParameter('-', Delimiter=' ', Name='i'),

        #maximum occurrences for extending a long deletion [10]
        '-d': ValuedParameter('-', Delimiter=' ', Name='d'),

        #seed length [32]
        '-l': ValuedParameter('-', Delimiter=' ', Name='l'),

        #maximum differences in the seed [2]
        '-k': ValuedParameter('-', Delimiter=' ', Name='k'),

        #maximum entries in the queue [2000000]
        '-m': ValuedParameter('-', Delimiter=' ', Name='m'),

        #number of threads [1]
        '-t': ValuedParameter('-', Delimiter=' ', Name='t'),

        #mismatch penalty [3]
        '-M': ValuedParameter('-', Delimiter=' ', Name='M'),

        #gap open penalty [11]
        '-O': ValuedParameter('-', Delimiter=' ', Name='O'),

        #gap extension penalty [4]
        '-E': ValuedParameter('-', Delimiter=' ', Name='E'),

        #stop searching when there are > equally best hits [30]
        '-R': ValuedParameter('-', Delimiter=' ', Name='R'),

        #quality threshold for read trimming down to 35bp [0]
        '-q': ValuedParameter('-', Delimiter=' ', Name='q'),

        #file to write output to instead of stdout
        '-f': ValuedParameter('-', Delimiter=' ', Name='f'),

        #length of barcode
        '-B': ValuedParameter('-', Delimiter=' ', Name='B'),

        #log-scaled gap penalty for long deletions
        '-L': FlagParameter('-', Name='L'),

        #non-iterative mode: search for all n-difference hits (slooow)
        '-N': FlagParameter('-', Name='N'),

        #the input is in the Illumina 1.3+ FASTQ-like format
        '-I': FlagParameter('-', Name='I'),

        #the input read file is in the BAM format
        '-b': FlagParameter('-', Name='b'),

        #use single-end reads only (effective with -b)
        '-0': FlagParameter('-', Name='0'),

        #use the 1st read in a pair (effective with -b)
        '-1': FlagParameter('-', Name='1'),

        #use the 2nd read in a pair (effective with -b)
        '-2': FlagParameter('-', Name='2'),

        #filter Casava-filtered sequences
        '-Y': FlagParameter('-', Name='Y')
    }

    # the subcommand for bwa aln
    _subcommand = 'aln'

    _valid_arguments = {
        # check to see if this is decimal numbers
        '-n': is_float,

        # check to see if these are integers
        '-o': is_int,
        '-e': is_int,
        '-i': is_int,
        '-d': is_int,
        '-l': is_int,
        '-k': is_int,
        '-m': is_int,
        '-t': is_int,
        '-M': is_int,
        '-O': is_int,
        '-E': is_int,
        '-R': is_int,
        '-q': is_int,
        '-B': is_int,

        # check to see if this is an absolute file path
        '-f': isabs
    }

    # input file keys beginning with _ are optional inputs
    _input_order = ['prefix', 'fastq_in']

    def _get_result_paths(self, data):
        """Gets the result file for a bwa aln run.

        There is only one output file of a bwa aln run, a .sai file
        and it can be retrieved with the key 'output'.
        """
        return {
            'output': ResultPath(self.Parameters['-f'].Value, IsWritten=True)
        }
Example No. 23
0
class Mothur(CommandLineApplication):
    """Mothur application controller
    """
    _options = {
        # Clustering algorithm.  Choices are furthest, nearest, and
        # average
        'method':
        ValuedParameter(Name='method',
                        Value='furthest',
                        Delimiter='=',
                        Prefix=''),
        # Cutoff distance for the distance matrix
        'cutoff':
        ValuedParameter(Name='cutoff', Value=None, Delimiter='=', Prefix=''),
        # Precision for reporting pairwise distances (e.g. 100 rounds to 0.01)
        'precision':
        ValuedParameter(Name='precision', Value=None, Delimiter='=',
                        Prefix=''),
    }
    _parameters = {}
    _parameters.update(_options)
    _input_handler = '_input_as_multiline_string'
    _command = 'mothur'

    def __init__(self,
                 params=None,
                 InputHandler=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 WorkingDir=None,
                 TmpDir='/tmp',
                 TmpNameLen=20,
                 HALT_EXEC=False):
        """Initialize a Mothur application controller

            params: a dictionary mapping the Parameter id or synonym to its
                value (or None for FlagParameters or MixedParameters in flag
                mode) for Parameters that should be turned on
            InputHandler: this is the method to be run on data when it is
                passed into call. This should be a string containing the
                method name. The default is _input_as_string which casts data
                to a string before appending it to the command line argument
            SuppressStderr: if set to True, will route standard error to
                /dev/null, False by default
            SuppressStdout: if set to True, will route standard out to
                /dev/null, False by default
            WorkingDir: the directory where you want the application to run,
                default is the current working directory, but is useful to
                change in cases where the program being run creates output
                to its current working directory and you either don't want
                it to end up where you are running the program, or the user
                running the script doesn't have write access to the current
                working directory
                WARNING: WorkingDir MUST be an absolute path!
            TmpDir: the directory where temp files will be created, /tmp
                by default
            TmpNameLen: the length of the temp file name
            HALT_EXEC: if True, raises exception w/ command output just
                before execution, doesn't clean up temp files. Default False.
        """
        super(Mothur, self).__init__(params=params,
                                     InputHandler=InputHandler,
                                     SuppressStderr=SuppressStderr,
                                     SuppressStdout=SuppressStdout,
                                     WorkingDir='',
                                     TmpDir='',
                                     TmpNameLen=TmpNameLen,
                                     HALT_EXEC=HALT_EXEC)
        # Prevent self.WorkingDir from being explicitly cast as a
        # FilePath object.  This behavior does not seem necessary in
        # the parent's __init__() method, since the casting is
        # repeated in _set_WorkingDir().
        if WorkingDir is not None:
            working_dir = WorkingDir
        else:
            working_dir = self._working_dir or getcwd()
        self.WorkingDir = working_dir
        self.TmpDir = TmpDir

    @staticmethod
    def getHelp():
        """Returns link to online manual"""
        help = ('See manual, available on the MOTHUR wiki:\n'
                'http://schloss.micro.umass.edu/mothur/')
        return help

    def __call__(self, data=None, remove_tmp=True):
        """Run the application with the specified kwargs on data

            data: anything that can be cast into a string or written out to
                a file. Usually either a list of things or a single string or
                number. input_handler will be called on this data before it
                is passed as part of the command-line argument, so by creating
                your own input handlers you can customize what kind of data
                you want your application to accept

            remove_tmp: if True, removes tmp files
        """
        # Process the input data.  Input filepath is stored in
        # self._input_filename
        getattr(self, self.InputHandler)(data)

        if self.SuppressStdout:
            outfile = None
        else:
            outfile = open(self.getTmpFilename(self.TmpDir), 'w')
        if self.SuppressStderr:
            errfile = None
        else:
            errfile = open(self.getTmpFilename(self.TmpDir), 'w')

        args = [self._command, self._compile_mothur_script()]
        process = Popen(args,
                        stdout=outfile,
                        stderr=errfile,
                        cwd=self.WorkingDir)
        exit_status = process.wait()
        if not self._accept_exit_status(exit_status):
            raise ApplicationError(
                'Unacceptable application exit status: %s, command: %s' %
                (exit_status, args))

        if outfile is not None:
            outfile.seek(0)
        if errfile is not None:
            errfile.seek(0)
        result = CommandLineAppResult(outfile,
                                      errfile,
                                      exit_status,
                                      result_paths=self._get_result_paths())

        # Clean up the input file if one was created
        if remove_tmp:
            if self._input_filename:
                remove(self._input_filename)
                self._input_filename = None

        return result

    def _accept_exit_status(self, status):
        return int(status) == 0

    def _compile_mothur_script(self):
        """Returns a Mothur batch script as a string"""
        def format_opts(*opts):
            """Formats a series of options for a Mothur script"""
            return ', '.join(filter(None, map(str, opts)))

        vars = {
            'in':
            self._input_filename,
            'unique':
            self._derive_unique_path(),
            'dist':
            self._derive_dist_path(),
            'names':
            self._derive_names_path(),
            'cluster_opts':
            format_opts(
                self.Parameters['method'],
                self.Parameters['cutoff'],
                self.Parameters['precision'],
            ),
        }
        script = ('#'
                  'unique.seqs(fasta=%(in)s); '
                  'dist.seqs(fasta=%(unique)s); '
                  'read.dist(column=%(dist)s, name=%(names)s); '
                  'cluster(%(cluster_opts)s)' % vars)
        return script

    def _get_result_paths(self):
        paths = {
            'distance matrix': self._derive_dist_path(),
            'otu list': self._derive_list_path(),
            'rank abundance': self._derive_rank_abundance_path(),
            'species abundance': self._derive_species_abundance_path(),
            'unique names': self._derive_names_path(),
            'unique seqs': self._derive_unique_path(),
            'log': self._derive_log_path(),
        }
        return dict([(k, ResultPath(v)) for (k, v) in paths.items()])

    # Methods to derive/guess output pathnames produced by MOTHUR.
    # TODO: test for input files that do not have a filetype extension

    def _derive_log_path(self):
        """Guess logfile path produced by Mothur

        This method checks the working directory for log files
        generated by Mothur.  It will raise an ApplicationError if no
        log file can be found.

        Mothur generates log files named in a nondeterministic way,
        using the current time.  We return the log file with the most
        recent time, although this may lead to incorrect log file
        detection if you are running many instances of mothur
        simultaneously.
        """
        filenames = listdir(self.WorkingDir)
        lognames = [
            x for x in filenames if re.match("^mothur\.\d+\.logfile$", x)
        ]
        if not lognames:
            raise ApplicationError(
                'No log file detected in directory %s. Contents: \n\t%s' %
                (self.WorkingDir, '\n\t'.join(filenames)))
        most_recent_logname = sorted(lognames, reverse=True)[0]
        return path.join(self.WorkingDir, most_recent_logname)

    def _derive_unique_path(self):
        """Guess unique sequences path produced by Mothur"""
        base, ext = path.splitext(self._input_filename)
        return '%s.unique%s' % (base, ext)

    def _derive_dist_path(self):
        """Guess distance matrix path produced by Mothur"""
        base, ext = path.splitext(self._input_filename)
        return '%s.unique.dist' % base

    def _derive_names_path(self):
        """Guess unique names file path produced by Mothur"""
        base, ext = path.splitext(self._input_filename)
        return '%s.names' % base

    def __get_method_abbrev(self):
        """Abbreviated form of clustering method parameter.

        Used to guess output filenames for MOTHUR.
        """
        abbrevs = {
            'furthest': 'fn',
            'nearest': 'nn',
            'average': 'an',
        }
        if self.Parameters['method'].isOn():
            method = self.Parameters['method'].Value
        else:
            method = self.Parameters['method'].Default
        return abbrevs[method]

    def _derive_list_path(self):
        """Guess otu list file path produced by Mothur"""
        base, ext = path.splitext(self._input_filename)
        return '%s.unique.%s.list' % (base, self.__get_method_abbrev())

    def _derive_rank_abundance_path(self):
        """Guess rank abundance file path produced by Mothur"""
        base, ext = path.splitext(self._input_filename)
        return '%s.unique.%s.rabund' % (base, self.__get_method_abbrev())

    def _derive_species_abundance_path(self):
        """Guess species abundance file path produced by Mothur"""
        base, ext = path.splitext(self._input_filename)
        return '%s.unique.%s.sabund' % (base, self.__get_method_abbrev())

    def getTmpFilename(self, tmp_dir='/tmp', prefix='tmp', suffix='.txt'):
        """Returns a temporary filename

        Similar interface to tempfile.mktmp()
        """
        # Override to change default constructor to str(). FilePath
        # objects muck up the Mothur script.
        return super(Mothur, self).getTmpFilename(tmp_dir=tmp_dir,
                                                  prefix=prefix,
                                                  suffix=suffix,
                                                  result_constructor=str)

    # Temporary input file needs to be in the working directory, so we
    # override all input handlers.

    def _input_as_multiline_string(self, data):
        """Write multiline string to temp file, return filename

        data: a multiline string to be written to a file.
        """
        self._input_filename = self.getTmpFilename(suffix='.fasta')
        with open(self._input_filename, 'w') as f:
            f.write(data)
        return self._input_filename

    def _input_as_lines(self, data):
        """Write sequence of lines to temp file, return filename

        data: a sequence to be written to a file, each element of the
            sequence will compose a line in the file

        * Note: '\n' will be stripped off the end of each sequence
            element before writing to a file in order to avoid
            multiple new lines accidentally be written to a file
        """
        self._input_filename = self.getTmpFilename(suffix='.fasta')
        with open(self._input_filename, 'w') as f:
            # Use lazy iteration instead of list comprehension to
            # prevent reading entire file into memory
            for line in data:
                f.write(str(line).strip('\n'))
                f.write('\n')
        return self._input_filename

    def _input_as_path(self, data):
        """Copys the provided file to WorkingDir and returns the new filename

        data: path or filename
        """
        self._input_filename = self.getTmpFilename(suffix='.fasta')
        copyfile(data, self._input_filename)
        return self._input_filename

    def _input_as_paths(self, data):
        raise NotImplementedError('Not applicable for MOTHUR controller.')

    def _input_as_string(self, data):
        raise NotImplementedError('Not applicable for MOTHUR controller.')

    # FilePath objects muck up the Mothur script, so we override the
    # property methods for self.WorkingDir

    def _get_WorkingDir(self):
        """Gets the working directory"""
        return self._curr_working_dir

    def _set_WorkingDir(self, path):
        """Sets the working directory
        """
        self._curr_working_dir = path
        try:
            mkdir(self.WorkingDir)
        except OSError:
            # Directory already exists
            pass

    WorkingDir = property(_get_WorkingDir, _set_WorkingDir)
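
# Illustrative only (not part of the original module): what a run with the
# Mothur controller above looks like. With a generated temp input such as
# /tmp/tmpXXXX.fasta and the default 'furthest' method, _compile_mothur_script
# returns a one-line batch script roughly like
#
#   #unique.seqs(fasta=/tmp/tmpXXXX.fasta); dist.seqs(fasta=/tmp/tmpXXXX.unique.fasta);
#   read.dist(column=/tmp/tmpXXXX.unique.dist, name=/tmp/tmpXXXX.names); cluster(method=furthest)
#
# The FASTA data and working directory below are placeholders.
mothur = Mothur(params={'method': 'furthest'}, WorkingDir='/tmp/mothur_run/')
result = mothur('>1\nACGT\n>2\nACGA\n')       # FASTA passed as a multiline string
otu_list = result['otu list'].read()          # .list file guessed by _derive_list_path
result.cleanUp()
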
Example No. 24
0
class BWA_sampe(BWA):
    """Controls the "sampe" subcommand of the bwa application.
    
    Valid input keys are: prefix, sai1_in, sai2_in, fastq1_in,
    fastq2_in
    """
    _parameters = {
        # Maximum insert size for a read pair to be considered being mapped
        # properly
        '-a': ValuedParameter('-', Delimiter=' ', Name='a'),

        # Maximum occurrences of a read for pairing
        '-o': ValuedParameter('-', Delimiter=' ', Name='o'),

        # Load the entire FM-index into memory to reduce disk operations
        '-P': FlagParameter('-', Name='P'),

        # maximum hits to output for paired reads [3]
        '-n': ValuedParameter('-', Delimiter=' ', Name='n'),

        # maximum hits to output for discordant pairs [10]
        '-N': ValuedParameter('-', Delimiter=' ', Name='N'),

        #file to write output to instead of stdout
        '-f': ValuedParameter('-', Delimiter=' ', Name='f'),

        # Specify the read group in a format like '@RG\tID:foo\tSM:bar'
        '-r': ValuedParameter('-', Delimiter=' ', Name='r'),

        # disable Smith-Waterman for the unmapped mate
        '-s': FlagParameter('-', Name='s'),

        # prior of chimeric rate (lower bound) [1.0e-05]
        '-c': ValuedParameter('-', Delimiter=' ', Name='c'),

        # disable insert size estimate (force -s)
        '-A': FlagParameter('-', Name='A')
    }

    # the subcommand for sampe
    _subcommand = 'sampe'

    _valid_arguments = {
        # make sure this is a float
        '-c': is_float,

        # make sure these are all ints
        '-a': is_int,
        '-o': is_int,
        '-n': is_int,
        '-N': is_int,

        # check to see if this is an absolute file path
        '-f': isabs
    }

    # input file keys beginning with _ are optional inputs
    _input_order = ['prefix', 'sai1_in', 'sai2_in', 'fastq1_in', 'fastq2_in']

    def _get_result_paths(self, data):
        """Gets the result file for a bwa sampe run.

        There is only one output file of a bwa sampe run, a .sam file,
        and it can be retrieved with the key 'output'.
        """
        return {
            'output': ResultPath(self.Parameters['-f'].Value, IsWritten=True)
        }
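
# A hypothetical paired-end run with the BWA_sampe controller above (not part
# of the original module); all paths are placeholders, and -f must be absolute
# per _valid_arguments. The input dict keys follow _input_order.
sampe = BWA_sampe(WorkingDir='/tmp/')
sampe.Parameters['-f'].on('/tmp/aligned.sam')     # SAM output instead of stdout

result = sampe({
    'prefix': '/tmp/reference.fasta',             # same prefix given to bwa index
    'sai1_in': '/tmp/reads_R1.sai',               # alignments from two bwa aln runs
    'sai2_in': '/tmp/reads_R2.sai',
    'fastq1_in': '/tmp/reads_R1.fastq',
    'fastq2_in': '/tmp/reads_R2.fastq',
})
sam_path = result['output'].name
result.cleanUp()
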
Example No. 25
0
class Covet(CommandLineApplication):
    """Application controller for Covet

    Generate new models, by training them on example sequences.

    where options are:
     -a <alignfile>  : make starting model from alignment
     -A <filename>   : save alignments to filename.1, etc., for animation
     -b <backupfile> : each iteration, back up curr model to <backupfile>
     -f              : use flat text save formats, portable but clumsy
     -G <GOP>        : gap-open prob 0 < gop < 1 for random alignment generation
     -h              : print short help and version info
     -i <cm file>    : take start model from <cm file>
     -m              : do maximum likelihood model construction (slow!)
     -p <prior file> : use prior in <file>; default is Laplace plus-one
     -s <seed>       : set random() seed
     -X <GEX>        : gap-extend prob 0 < gex < 1 for random alignment generation

    """

    _parameters = {
        '-a': ValuedParameter(Prefix='-', Name='a', Delimiter=' '),
        '-A': ValuedParameter(Prefix='-', Name='A', Delimiter=' '),
        '-b': ValuedParameter(Prefix='-', Name='b', Delimiter=' '),
        '-f': FlagParameter(Prefix='-', Name='f'),
        '-G': ValuedParameter(Prefix='-', Name='G', Delimiter=' '),
        '-i': ValuedParameter(Prefix='-', Name='i', Delimiter=' '),
        '-m': FlagParameter(Prefix='-', Name='m'),
        '-p': ValuedParameter(Prefix='-', Name='p', Delimiter=' '),
        '-s': ValuedParameter(Prefix='-', Name='s', Delimiter=' '),
        '-X': ValuedParameter(Prefix='-', Name='X', Delimiter=' ')
    }

    _command = 'covet'
    _input_handler = '_input_as_string'

    def _input_as_string(self, filename):
        """Returns 'modelname' and 'filename' to redirect input to stdin"""
        return ' '.join(
            [filename + '.cm',
             super(Covet, self)._input_as_string(filename)])

    def _input_as_lines(self, data):
        """Returns 'temp_filename to redirect input to stdin"""
        filename = self._input_filename = self.getTmpFilename(self.WorkingDir)
        data_file = open(filename, 'w')
        data_to_file = '\n'.join([str(d).strip('\n') for d in data])
        data_file.write(data_to_file)
        data_file.write('\n')  #must end with new line
        data_file.close()
        return ' '.join([filename + '.cm', filename])

    def _get_result_paths(self, data):
        """Specifies the paths of output files generated by the application
        
        data: the data the instance of the application is called on
        
        CMfinder produces it's output in two files .align and .motif
        it also prints an output to sdtout.

        """
        result = {}
        if not isinstance(data, list):
            inputPath = data
        else:
            inputPath = self._input_filename

        result['cm'] =\
              ResultPath(Path=(inputPath+'.cm'))

        if self._input_filename is not None:
            result['_input_filename'] = ResultPath(self._input_filename)

        return result
Example No. 26
0
class BWA_bwasw(BWA):
    """Controls the "bwasw" subcommand of the bwa application.
    
    Valid input keys are: prefix, query_fasta, _query_fasta2
    input keys beginning with an underscore are optional.
    """
    _parameters = {
        #Score of a match [1]
        '-a': ValuedParameter('-', Delimiter=' ', Name='a'),

        #Mismatch penalty [3]
        '-b': ValuedParameter('-', Delimiter=' ', Name='b'),

        #Gap open penalty [5]
        '-q': ValuedParameter('-', Delimiter=' ', Name='q'),

        #Gap  extension  penalty.
        '-r': ValuedParameter('-', Delimiter=' ', Name='r'),

        # mask level [0.50]
        '-m': ValuedParameter('-', Delimiter=' ', Name='m'),

        #Number of threads in the multi-threading mode [1]
        '-t': ValuedParameter('-', Delimiter=' ', Name='t'),

        # file to output results to instead of stdout
        '-f': ValuedParameter('-', Delimiter=' ', Name='f'),

        #Band width in the banded alignment [33]
        '-w': ValuedParameter('-', Delimiter=' ', Name='w'),

        #Minimum score threshold divided by a [30]
        '-T': ValuedParameter('-', Delimiter=' ', Name='T'),

        #Coefficient  for  threshold  adjustment  according  to query length.
        #Given an l-long query, the threshold for a hit to be retained is
        #a*max{T,c*log(l)}. [5.5]
        '-c': ValuedParameter('-', Delimiter=' ', Name='c'),

        #Z-best heuristics. Higher -z increases accuracy at the cost
        #of speed. [1]
        '-z': ValuedParameter('-', Delimiter=' ', Name='z'),

        #Maximum SA interval size for initiating a seed. Higher -s increases
        #accuracy at the cost of speed. [3]
        '-s': ValuedParameter('-', Delimiter=' ', Name='s'),

        #Minimum  number  of  seeds  supporting  the  resultant alignment to
        #trigger reverse alignment. [5]
        '-N': ValuedParameter('-', Delimiter=' ', Name='N'),

        # in SAM output, use hard clipping instead of soft clipping
        '-H': FlagParameter('-', Name='H'),

        # mark multi-part alignments as secondary
        '-M': FlagParameter('-', Name='M'),

        # skip Smith-Waterman read pairing
        '-S': FlagParameter('-', Name='S'),

        # ignore pairs with insert >= INT for inferring the size of distr
        # [20000]
        '-I': ValuedParameter('-', Delimiter=' ', Name='I')
    }

    # the subcommand for bwasw
    _subcommand = 'bwasw'

    # input file keys beginning with _ are optional inputs
    _input_order = ['prefix', 'query_fasta', '_query_fasta_2']

    _valid_arguments = {
        # Make sure this is a float
        '-c': is_float,
        '-m': is_float,

        # Make sure these are ints
        '-a': is_int,
        '-b': is_int,
        '-q': is_int,
        '-r': is_int,
        '-t': is_int,
        '-w': is_int,
        '-T': is_int,
        '-z': is_int,
        '-s': is_int,
        '-N': is_int,
        '-I': is_int,

        # make sure this is an absolute path
        '-f': isabs
    }

    def _get_result_paths(self, data):
        """Gets the result file for a bwa bwasw run.

        There is only one output file of a bwa bwasw run, a .sam file,
        and it can be retrieved with the key 'output'.
        """
        return {
            'output': ResultPath(self.Parameters['-f'].Value, IsWritten=True)
        }
Example No. 27
0
class RnaView(CommandLineApplication):
    """ The Application controller for the RnaView application 

        There are two known issues with this applications controller
         which are being addressed: 
         (1) The first is that it doesn't handle
         the functionality where -a is passed followed by a filename and
         a float. This will be addressed shortly. 
         (2) The second problem is that under
         some mysterious circumstances, rnaview (the actual application,
         not via cogent) writes extra output files. This seems to be a bug
         in rnaview, and cogent therefore doesn't clean them up. One time 
         when this occurs is when there are spaces in filenames or paths.
         Contact Greg ([email protected]) for some example output
         illustrating this issue. Users are warned to look for extra 
         output files either showing up in the working directory or 
         the temp directory (default: /tmp).

    """

    # The functionality necessary for use of the -a parameter is still
    # under development and is not ready for use
    _parameters = {\
        '-p':FlagParameter(Prefix='-',Name='p'),\
        '-v':FlagParameter(Prefix='-',Name='v'),\
        '-c':ValuedParameter(Prefix='-',Name='c',Delimiter=' '),\
        '-a':FlagParameter(Prefix='-',Name='a'),\
        '-x':FlagParameter(Prefix='-',Name='x')}
    _command = 'rnaview'
    _input_handler = '_input_as_path'

    ### Everything above is necessary for sub-class, code below is for cases
    ### where files are written (ie. data goes to places other than stdout and
    ### stderr) Complexity increases with the amount of variability in the
    ### file name. For rnaview the naming of the files is quite complex, hence
    ### the large amount of code necessary.

    def _get_result_paths(self, data):
        # There are two possibilities for what data will be, if -a has been
        # specified, data will be a file containing a space-delimited list of
        # pdb files. If -a has not been specified data will be the name of a
        # single pdb file to act on
        result = {}
        if self.Parameters['-a'].isOff():
            # If we have created a temp file containing data we need that
            # temp file name
            if self._input_filename:
                file_prefix = self._get_pdb_filename(self._input_filename)
                out_path = self._get_out_path(self._input_filename)
            # Otherwise we will just be passing a filename as data
            else:
                file_prefix = self._get_pdb_filename(data)
                out_path = self._get_out_path(data)

            result.update(\
             self._construct_result_file_set(file_prefix=file_prefix,\
                out_path=out_path))
        else:
            inputs = data.split(' ')
            f = open(inputs[0])
            pdb_files = f.read().split(' ')
            f.close()
            for p in pdb_files:
                file_prefix = self._get_pdb_filename(p)
                key_prefix = ''.join([file_prefix, '_'])
                result.update(\
                 self._construct_result_file_set(key_prefix=key_prefix,\
                 file_prefix=file_prefix,out_path=self._get_out_path(p)))

        return result

    def _construct_result_file_set(self, key_prefix='', file_prefix='',\
        out_path=''):
        result = {}
        result['bp_stats'] = ResultPath(Path=\
            ''.join([self.WorkingDir,'/base_pair_statistics.out']))
        result[''.join([key_prefix,'base_pairs'])] =\
            ResultPath(Path=''.join([out_path,file_prefix,'.out']))
        result[''.join([key_prefix,'ps'])] =\
            ResultPath(Path=''.join([out_path,file_prefix,'.ps']),\
            IsWritten=self.Parameters['-p'].isOn())
        result[''.join([key_prefix,'vrml'])] =\
            ResultPath(Path=''.join([out_path,file_prefix,'.wrl']),\
            IsWritten=self.Parameters['-v'].isOn())
        result[''.join([key_prefix,'xml'])] =\
            ResultPath(Path=''.join([out_path,file_prefix,'.xml']),\
            IsWritten=self.Parameters['-x'].isOn())
        result['best_pair'] = ResultPath(Path=\
            ''.join([self.WorkingDir,'/best_pair.out']),\
            IsWritten=self.Parameters['-a'].isOn())
        result['pattern_tmp'] = ResultPath(Path=\
            ''.join([self.WorkingDir,'/pattern_tmp.out']),\
            IsWritten=self.Parameters['-a'].isOn())
        result[''.join([key_prefix,'patt'])] =\
            ResultPath(Path=''.join([out_path,file_prefix,'_patt.out']),\
            IsWritten=self.Parameters['-a'].isOn())
        result[''.join([key_prefix,'patt_tmp'])] =\
            ResultPath(Path=''.join([out_path,file_prefix,'_patt_tmp.out']),\
            IsWritten=self.Parameters['-a'].isOn())
        result[''.join([key_prefix,'sort.out'])] =\
            ResultPath(Path=''.join([out_path,file_prefix,'_sort.out']),\
            IsWritten=self.Parameters['-a'].isOn())

        return result

    def _accept_exit_status(self, exit_status):
        "Return False if exit_status is not zero " ""
        if exit_status != 0:
            return False
        return True

    def _get_pdb_filename(self, word):
        """Returns the file prefix of the _input_filename.
        
        If the file is an NMR file, Rnaview creates a new PDB file containing
        only the used model. This file is the original input filename plus
        '_nmr.pdb'. The resulting .out file containing the base pairs uses
        that prefix, e.g. xxx.ent_nmr.pdb.out

        Since NMR and X-RAY files cannot be reliably distinguished by
        filename, we have to open the file and check the EXPDTA record for
        'NMR' or look for a MODEL record.
        """
        nmr = False
        f = open(word)
        for line in f:
            if line.startswith('EXPDTA') and 'NMR' in line:
                nmr = True
                break
            if line.startswith('MODEL'):
                nmr = True
                break
        f.close()
        start_index = word.rfind('/')
        if nmr:
            return word[start_index + 1:].strip() + '_nmr.pdb'
        return word[start_index + 1:].strip()

    def _get_out_path(self, word):
        end_index = word.rfind('/')
        if end_index >= 0:
            return word[0:end_index + 1].strip()
        return ''
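
# A minimal usage sketch for the rnaview controller above. It assumes the
# class is exposed under a name such as RnaView (the class statement falls
# outside this excerpt) and follows the standard cogent.app.util convention in
# which calling the controller returns a dict-like CommandLineAppResult keyed
# by the names built in _get_result_paths; the PDB path is hypothetical.
rnaview_app = RnaView(WorkingDir='/tmp/')  # class name is an assumption
rnaview_app.Parameters['-p'].on()          # also request the PostScript drawing
res = rnaview_app('/tmp/1EHZ.pdb')         # -a is off, so data is one pdb path
base_pairs = res['base_pairs'].read()      # /tmp/1EHZ.pdb.out written by rnaview
drawing = res['ps'].read()                 # /tmp/1EHZ.pdb.ps, present because -p is on
res.cleanUp()                              # remove the generated output files
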
Ejemplo n.º 28
0
class Muscle(CommandLineApplication):
    """Muscle application controller"""

    _options = {
        # Minimum spacing between anchor columns. [Integer]
        '-anchorspacing':
        ValuedParameter('-', Name='anchorspacing', Delimiter=' '),
        # Center parameter. Should be negative [Float]
        '-center':
        ValuedParameter('-', Name='center', Delimiter=' '),

        # Clustering method. cluster1 is used in iterations 1
        # and 2, cluster2 in later iterations.
        '-cluster1':
        ValuedParameter('-', Name='cluster1', Delimiter=' '),
        '-cluster2':
        ValuedParameter('-', Name='cluster2', Delimiter=' '),

        # Minimum length of diagonal.
        '-diaglength':
        ValuedParameter('-', Name='diaglength', Delimiter=' '),

        # Discard this many positions at ends of diagonal.
        '-diagmargin':
        ValuedParameter('-', Name='diagmargin', Delimiter=' '),

        # Distance measure for iteration 1.
        '-distance1':
        ValuedParameter('-', Name='distance1', Delimiter=' '),

        # Distance measure for iterations 2, 3 ...
        '-distance2':
        ValuedParameter('-', Name='distance2', Delimiter=' '),

        # The gap open score. Must be negative.
        '-gapopen':
        ValuedParameter('-', Name='gapopen', Delimiter=' '),

        # Window size for determining whether a region is hydrophobic.
        '-hydro':
        ValuedParameter('-', Name='hydro', Delimiter=' '),

        # Multiplier for gap open/close penalties in hydrophobic regions.
        '-hydrofactor':
        ValuedParameter('-', Name='hydrofactor', Delimiter=' '),

        # Where to find the input sequences.
        '-in':
        ValuedParameter('-', Name='in', Delimiter=' ', Quote="\""),
        '-in1':
        ValuedParameter('-', Name='in1', Delimiter=' ', Quote="\""),
        '-in2':
        ValuedParameter('-', Name='in2', Delimiter=' ', Quote="\""),

        # Log file name (delete existing file).
        '-log':
        ValuedParameter('-', Name='log', Delimiter=' '),

        # Log file name (append to existing file).
        '-loga':
        ValuedParameter('-', Name='loga', Delimiter=' '),

        # Maximum distance between two diagonals that allows them to merge
        # into one diagonal.
        '-maxdiagbreak':
        ValuedParameter('-', Name='maxdiagbreak', Delimiter=' '),

        # Maximum time to run in hours. The actual time may exceed the
        # requested limit by a few minutes. Decimals are allowed, so 1.5
        # means one hour and 30 minutes.
        '-maxhours':
        ValuedParameter('-', Name='maxhours', Delimiter=' '),

        # Maximum number of iterations.
        '-maxiters':
        ValuedParameter('-', Name='maxiters', Delimiter=' '),

        # Maximum memory in Mb
        '-maxmb':
        ValuedParameter('-', Name='maxmb', Delimiter=' '),

        # Maximum number of new trees to build in iteration 2.
        '-maxtrees':
        ValuedParameter('-', Name='maxtrees', Delimiter=' '),

        # Minimum score a column must have to be an anchor.
        '-minbestcolscore':
        ValuedParameter('-', Name='minbestcolscore', Delimiter=' '),

        # Minimum smoothed score a column must have to be an anchor.
        '-minsmoothscore':
        ValuedParameter('-', Name='minsmoothscore', Delimiter=' '),

        # Objective score used by tree dependent refinement.
        # sp=sum-of-pairs score.
        # spf=sum-of-pairs score (dimer approximation)
        # spm=sp for < 100 seqs, otherwise spf
        # dp=dynamic programming score.
        # ps=average profile-sequence score.
        # xp=cross profile score.
        '-objscore':
        ValuedParameter('-', Name='objscore', Delimiter=' '),

        # Where to write the alignment.
        '-out':
        ValuedParameter('-', Name='out', Delimiter=' ', Quote="\""),

        # Where to write the file in phylip sequential format (v3.6 only).
        '-physout':
        ValuedParameter('-', Name='physout', Delimiter=' '),

        # Where to write the file in phylip interleaved format (v3.6 only).
        '-phyiout':
        ValuedParameter('-', Name='phyiout', Delimiter=' '),

        # Set to profile for aligning two alignments and adding seqs to an
        # existing alignment
        '-profile':
        FlagParameter(Prefix='-', Name='profile'),

        # Method used to root the tree; root1 is used in iterations 1 and 2,
        # root2 in later iterations.
        '-root1':
        ValuedParameter('-', Name='root1', Delimiter=' '),
        '-root2':
        ValuedParameter('-', Name='root2', Delimiter=' '),

        # Sequence type.
        '-seqtype':
        ValuedParameter('-', Name='seqtype', Delimiter=' '),

        # Maximum value of column score for smoothing purposes.
        '-smoothscoreceil':
        ValuedParameter('-', Name='smoothscoreceil', Delimiter=' '),

        # Constant used in UPGMB clustering. Determines the relative fraction
        # of average linkage (SUEFF) vs. nearest-neighbor linkage (1 - SUEFF).
        '-SUEFF':
        ValuedParameter('-', Name='SUEFF', Delimiter=' '),

        # Save tree produced in first or second iteration to given file in
        # Newick (Phylip-compatible) format.
        '-tree1':
        ValuedParameter('-', Name='tree1', Delimiter=' ', Quote="\""),
        '-tree2':
        ValuedParameter('-', Name='tree2', Delimiter=' ', Quote="\""),

        # Sequence weighting scheme.
        # weight1 is used in iterations 1 and 2.
        # weight2 is used for tree-dependent refinement.
        # none=all sequences have equal weight.
        # henikoff=Henikoff & Henikoff weighting scheme.
        # henikoffpb=Modified Henikoff scheme as used in PSI-BLAST.
        # clustalw=CLUSTALW method.
        # threeway=Gotoh three-way method.
        '-weight1':
        ValuedParameter('-', Name='weight1', Delimiter=' '),
        '-weight2':
        ValuedParameter('-', Name='weight2', Delimiter=' '),

        # Use anchor optimization in tree dependent refinement iterations
        '-anchors':
        FlagParameter(Prefix='-', Name='anchors'),

        # Write output in CLUSTALW format (default is FASTA).
        '-clw':
        FlagParameter(Prefix='-', Name='clw'),

        # Cluster sequences
        '-clusteronly':
        FlagParameter(Prefix='-', Name='clusteronly'),
        # neighborjoining is "unrecognized"
        #'-neighborjoining':FlagParameter(Prefix='-',Name='neighborjoining'),

        # Write output in CLUSTALW format with the "CLUSTAL W (1.81)" header
        # rather than the MUSCLE version. This is useful when a post-processing
        # step is picky about the file header.
        '-clwstrict':
        FlagParameter(Prefix='-', Name='clwstrict'),

        # Do not catch exceptions.
        '-core':
        FlagParameter(Prefix='-', Name='core'),

        # Write output in FASTA format. Alternatives include -clw,
        # -clwstrict, -msf and -html.
        '-fasta':
        FlagParameter(Prefix='-', Name='fasta'),

        # Group similar sequences together in the output. This is the default.
        # See also -stable.
        '-group':
        FlagParameter(Prefix='-', Name='group'),

        # Write output in HTML format (default is FASTA).
        '-html':
        FlagParameter(Prefix='-', Name='html'),

        # Use log-expectation profile score (VTML240). Alternatives are to use
        # -sp or -sv. This is the default for amino acid sequences.
        '-le':
        FlagParameter(Prefix='-', Name='le'),

        # Write output in MSF format (default is FASTA).
        '-msf':
        FlagParameter(Prefix='-', Name='msf'),

        # Disable anchor optimization. Default is -anchors.
        '-noanchors':
        FlagParameter(Prefix='-', Name='noanchors'),

        # Catch exceptions and give an error message if possible.
        '-nocore':
        FlagParameter(Prefix='-', Name='nocore'),

        # Do not display progress messages.
        '-quiet':
        FlagParameter(Prefix='-', Name='quiet'),

        # Input file is already aligned, skip first two iterations and begin
        # tree dependent refinement.
        '-refine':
        FlagParameter(Prefix='-', Name='refine'),

        # Use sum-of-pairs protein profile score (PAM200). Default is -le.
        '-sp':
        FlagParameter(Prefix='-', Name='sp'),

        # Use sum-of-pairs nucleotide profile score (BLASTZ parameters). This
        # is the only option for nucleotides, and is therefore the default.
        '-spn':
        FlagParameter(Prefix='-', Name='spn'),

        # Preserve input order of sequences in output file. Default is to group
        # sequences by similarity (-group).
        '-stable':
        FlagParameter(Prefix='-', Name='stable'),

        # Use sum-of-pairs profile score (VTML240). Default is -le.
        '-sv':
        FlagParameter(Prefix='-', Name='sv'),

        # Diagonal optimization
        '-diags':
        FlagParameter(Prefix='-', Name='diags'),
        '-diags1':
        FlagParameter(Prefix='-', Name='diags1'),
        '-diags2':
        FlagParameter(Prefix='-', Name='diags2'),

        # Terminal gaps penalized with full penalty.
        # [1] Not fully supported in this version.
        '-termgapsfull':
        FlagParameter(Prefix='-', Name='termgapsfull'),

        # Terminal gaps penalized with half penalty.
        # [1] Not fully supported in this version.
        '-termgapshalf':
        FlagParameter(Prefix='-', Name='termgapshalf'),

        # Terminal gaps penalized with half penalty if gap relative to
        # longer sequence, otherwise with full penalty.
        # [1] Not fully supported in this version.
        '-termgapshalflonger':
        FlagParameter(Prefix='-', Name='termgapshalflonger'),

        # Write parameter settings and progress messages to log file.
        '-verbose':
        FlagParameter(Prefix='-', Name='verbose'),

        # Write version string to stdout and exit.
        '-version':
        FlagParameter(Prefix='-', Name='version'),
    }

    _parameters = {}
    _parameters.update(_options)
    _command = "muscle"

    def _input_as_seqs(self, data):
        lines = []
        for i, s in enumerate(data):
            #will number the sequences 1,2,3,etc.
            lines.append(''.join(['>', str(i + 1)]))
            lines.append(s)
        return self._input_as_lines(lines)

    def _input_as_lines(self, data):
        if data:
            self.Parameters['-in']\
                .on(super(Muscle,self)._input_as_lines(data))

        return ''

    def _input_as_string(self, data):
        """Makes data the value of a specific parameter
        
        This method returns the empty string. The parameter will be printed
        automatically once set.
        """
        if data:
            self.Parameters['-in'].on(str(data))
        return ''

    def _input_as_multiline_string(self, data):
        if data:
            self.Parameters['-in']\
                .on(super(Muscle,self)._input_as_multiline_string(data))
        return ''

    def _input_as_multifile(self, data):
        """For use with the -profile option

        This input handler expects data to be a tuple containing two
        filenames. Index 0 will be set to -in1 and index 1 to -in2
        """
        if data:
            try:
                filename1, filename2 = data
            except (TypeError, ValueError):
                raise ValueError, "Expected two filenames"

            self.Parameters['-in'].off()
            self.Parameters['-in1'].on(filename1)
            self.Parameters['-in2'].on(filename2)
        return ''

    def _align_out_filename(self):

        if self.Parameters['-out'].isOn():
            aln_filename = self._absolute(str(self.Parameters['-out'].Value))
        else:
            raise ValueError, "No output file specified."
        return aln_filename

    def _tree1_out_filename(self):

        if self.Parameters['-tree1'].isOn():
            tree_filename = self._absolute(str(
                self.Parameters['-tree1'].Value))
        else:
            raise ValueError, "No tree output file specified."
        return tree_filename

    def _tree2_out_filename(self):

        if self.Parameters['-tree2'].isOn():
            tree_filename = self._absolute(str(
                self.Parameters['-tree2'].Value))
        else:
            raise ValueError, "No tree output file specified."
        return tree_filename

    def _get_result_paths(self, data):

        result = {}
        if self.Parameters['-out'].isOn():
            out_name = self._align_out_filename()
            result['MuscleOut'] = ResultPath(Path=out_name, IsWritten=True)
        if self.Parameters['-tree1'].isOn():
            out_name = self._tree1_out_filename()
            result['Tree1Out'] = ResultPath(Path=out_name, IsWritten=True)
        if self.Parameters['-tree2'].isOn():
            out_name = self._tree2_out_filename()
            result['Tree2Out'] = ResultPath(Path=out_name, IsWritten=True)
        return result

    def getHelp(self):
        """Muscle help"""

        help_str = """
"""
        return help_str
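
# A minimal usage sketch for the Muscle controller above, assuming the
# standard cogent.app.util calling convention (the controller is called with
# the input data and returns a dict-like CommandLineAppResult); the sequences
# and output path are invented for illustration. Profile alignment of two
# existing alignments would instead turn on -profile and use the
# _input_as_multifile handler.
muscle = Muscle(InputHandler='_input_as_lines', WorkingDir='/tmp/')
muscle.Parameters['-out'].on('/tmp/seqs_aligned.fasta')  # hypothetical path
seqs = ['>seq1', 'ACGTACGTTAGGCCTTAA', '>seq2', 'ACGTACGTTAGCCTTAA']
res = muscle(seqs)             # the handler writes a temp FASTA file and sets -in
aln = res['MuscleOut'].read()  # alignment written to the -out path
res.cleanUp()
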
Ejemplo n.º 29
0
class Dotur(CommandLineApplication):
    """Dotur application controller.
    """
    # Options:
    _options = {\
        # -i:      Number of iterations (default = 1000)
        '-i':ValuedParameter('-',Name='i',Delimiter=' '),\
        # -c:      Clustering method - (f) furthest neighbor, (n) nearest
        #          neighbor, (a) average neighbor (default = f)
        '-c':ValuedParameter('-',Name='c',Delimiter=' '),\
        # -p:      Precision of distances for output; increasing it can
        #          dramatically lengthen execution times - 10, 100, 1000,
        #          10000 (default = 100)
        '-p':ValuedParameter('-',Name='p',Delimiter=' '),\
        # -l:      Input file is lower triangular (default = square matrix)
        '-l':FlagParameter('-',Name='l'),\
        # -r:      Calculates rarefaction curves for each parameter; can
        #          dramatically lengthen execution times. A simple
        #          rarefaction curve is always calculated.
        '-r':FlagParameter('-',Name='r'),\
        # -stop:   Stops clustering when the cutoff has been reached.
        '-stop':FlagParameter('-',Name='stop'),\
        # -wrep:   Samples with replacement.
        '-wrep':FlagParameter('-',Name='wrep'),\
        # -jumble: Jumbles the order of the distance matrix.
        '-jumble':FlagParameter('-',Name='jumble'),\
        # -sim:    Converts similarity scores to distances (D = 1 - S).
        '-sim':FlagParameter('-',Name='sim'),
        }

    _parameters = {}
    _parameters.update(_options)
    _input_handler = '_input_as_multiline_string'
    _command = 'dotur'

    def getHelp(self):
        """Method that points to the DOTUR documentation."""
        help_str =\
        """
        See DOTUR Documentation page at:
        http://schloss.micro.umass.edu/software/dotur/documentation.html
        """
        return help_str

    def _input_as_multiline_string(self, data):
        """Write a multiline string to a temp file and return the filename.

            data: a multiline string to be written to a file.

           * Note: the result will be the filename as a FilePath object 
            (which is a string subclass).

        """
        filename = self._input_filename = \
            FilePath(self.getTmpFilename(self.WorkingDir))
        data_file = open(filename, 'w')
        data_file.write(data)
        data_file.close()
        return filename

    def _get_cluster_method(self):
        """Returns cluster method as string.
        """
        if self.Parameters['-c'].isOn():
            cluster_method = self._absolute(str(\
                self.Parameters['-c'].Value))+'n'
        else:
            # f (furthest neighbor) is default
            cluster_method = 'fn'

        return cluster_method

    def _get_result_paths(self, data):
        """Return dict of {key: ResultPath}
        
            - NOTE: Only putting a few files on the results path.  Add more
                here if needed.
        """
        result = {}
        out_name = self._input_filename.split('.txt')[0]
        cluster_method = self._get_cluster_method()
        #only care about Otu, List and Rank, can add others later.
        result['Otu'] = ResultPath(Path=out_name + '.%s.otu' %
                                   (cluster_method))
        result['List'] = ResultPath(Path=out_name + '.%s.list' %
                                    (cluster_method))
        result['Rank'] = ResultPath(Path=out_name + '.%s.rank' %
                                    (cluster_method))
        result['Rarefaction'] = \
            ResultPath(Path=out_name+'.%s.rarefaction'%(cluster_method))
        return result
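
# A minimal usage sketch for the Dotur controller above, assuming the standard
# cogent.app.util calling convention; the toy square distance matrix is
# invented for illustration (use -l for lower-triangular input). With -c left
# off, output files carry the default 'fn' (furthest neighbor) suffix.
dotur = Dotur(WorkingDir='/tmp/')
dist_matrix = '\n'.join([
    '    3',
    'seq1    0.000 0.100 0.300',
    'seq2    0.100 0.000 0.200',
    'seq3    0.300 0.200 0.000',
])
res = dotur(dist_matrix)       # the handler writes the matrix to a temp .txt file
otus = res['Otu'].read()       # <tmpfile>.fn.otu
clusters = res['List'].read()  # <tmpfile>.fn.list
res.cleanUp()
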
Ejemplo n.º 30
0
class Blat(CommandLineApplication):
    """BLAT generic application controller"""

    _command = 'blat'
    _input_handler = "_input_as_list"

    _database_types = ['dna', 'prot', 'dnax']
    _query_types = ['dna', 'rna', 'prot', 'dnax', 'rnax']
    _mask_types = ['lower', 'upper', 'out', 'file.out']
    _out_types = [
        'psl', 'pslx', 'axt', 'maf', 'sim4', 'wublast', 'blast', 'blast8',
        'blast9'
    ]
    _valid_combinations = [('dna', 'dna'), ('dna', 'rna'), ('prot', 'prot'),
                           ('dnax', 'prot'), ('dnax', 'dnax'),
                           ('dnax', 'rnax')]
    _database = None
    _query = None
    _output = None

    _parameters = {
        # database type (dna, prot, or dnax, where dnax is DNA sequence
        # translated in six frames to protein)
        '-t':
        ValuedParameter('-', Delimiter='=', Name='t'),

        # query type (dna, rna, prot, dnax, rnax, where rnax is DNA sequence
        # translated in three frames to protein)
        '-q':
        ValuedParameter('-', Delimiter='=', Name='q'),

        # Use overused tile file N.ooc, and N should correspond to the tileSize
        '-ooc':
        ValuedParameter('-', Delimiter='=', Name='ooc', IsPath=True),

        # Sets the size of a match that triggers an alignment
        '-tileSize':
        ValuedParameter('-', Delimiter='=', Name='tileSize'),

        # Spacing between tiles.
        '-stepSize':
        ValuedParameter('-', Delimiter='=', Name='stepSize'),

        # If set to 1, allows one mismatch in the tile and still triggers
        # an alignment.
        '-oneOff':
        ValuedParameter('-', Delimiter='=', Name='oneOff'),

        # sets the number of tile matches
        '-minMatch':
        ValuedParameter('-', Delimiter='=', Name='minMatch'),

        #sets the minimum score
        '-minScore':
        ValuedParameter('-', Delimiter='=', Name='minScore'),

        # sets the minimum sequence identity in percent
        '-minIdentity':
        ValuedParameter('-', Delimiter='=', Name='minIdentity'),

        # sets the size of the maximum gap between tiles in a clump
        '-maxGap':
        ValuedParameter('-', Delimiter='=', Name='maxGap'),

        # make an overused tile file. Target needs to be complete genome.
        '-makeOoc':
        ValuedParameter('-', Delimiter='=', Name='makeOoc', IsPath=True),

        # sets the number of repetitions of a tile allowed before it is marked
        # as overused
        '-repMatch':
        ValuedParameter('-', Delimiter='=', Name='repMatch'),

        # mask out repeats.  Alignments won't be started in masked regions but
        # may extend through them in nucleotide searches.  Masked areas are
        # ignored entirely in protein or translated searches.  Types are:
        # lower, upper, out, file.out (file.out - mask database according to
        # RepeatMasker file.out)
        '-mask':
        ValuedParameter('-', Delimiter='=', Name='mask'),

        # Mask out repeats in query sequence.  similar to -mask but for query
        # rather than target sequence
        '-qMask':
        ValuedParameter('-', Delimiter='=', Name='qMask'),

        # repeat bases will not be masked in any way, but matches in
        # repeat areas will be reported separately from matches in other
        # areas in the psl output
        '-repeats':
        ValuedParameter('-', Delimiter='=', Name='repeats'),

        # minimum percent divergence of repeats to allow them to be unmasked
        '-minRepDivergence':
        ValuedParameter('-', Delimiter='=', Name='minRepDivergence'),

        # output dot every N sequences to show program's progress
        '-dots':
        ValuedParameter('-', Delimiter='=', Name='dots'),

        # controls output file format.  One of:
        # psl - Default.  Tab separated format, no sequence
        # pslx - Tab separated format with sequence
        # axt - blastz-associated axt format
        # maf - multiz-associated maf format
        # sim4 - similar to sim4 format
        # wublast - similar to wublast format
        # blast - similar to NCBI blast format
        # blast8 - NCBI blast tabular format
        # blast9 - NCBI blast tabular format with comments
        '-out':
        ValuedParameter('-', Delimiter='=', Name='out'),

        # sets maximum intron size
        '-maxIntron':
        ValuedParameter('-', Delimiter='=', Name='maxIntron'),

        # suppress column headers in psl output
        '-noHead':
        FlagParameter('-', Name='noHead'),

        # trim leading poly-T
        '-trimT':
        FlagParameter('-', Name='trimT'),

        # do not trim trailing poly-A
        '-noTrimA':
        FlagParameter('-', Name='noTrimA'),

        # Remove poly-A tail from qSize as well as alignments in psl output
        '-trimHardA':
        FlagParameter('-', Name='trimHardA'),

        # run for fast DNA/DNA remapping - not allowing introns,
        # requiring high %ID
        '-fastMap':
        FlagParameter('-', Name='fastMap'),

        # for high quality mRNAs, look harder for small initial and terminal
        # exons
        '-fine':
        FlagParameter('-', Name='fine'),

        # Allows extension of alignment through large blocks of N's
        '-extendThroughN':
        FlagParameter('-', Name='extendThroughN')
    }

    def _get_result_paths(self, data):
        """Returns the file location for result output
        """

        return {'output': ResultPath(data[2], IsWritten=True)}

    def _get_base_command(self):
        """Gets the command that will be run when the app controller is
        called.
        """
        command_parts = []
        cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
        if self._command is None:
            raise ApplicationError, '_command has not been set.'
        command = self._command
        parameters = sorted(
            [str(x) for x in self.Parameters.values() if str(x)])

        synonyms = self._synonyms

        command_parts.append(cd_command)
        command_parts.append(command)
        command_parts.append(self._database)  # Positional argument
        command_parts.append(self._query)  # Positional argument
        command_parts += parameters
        if self._output: command_parts.append(self._output.Path)  # Positional

        return self._command_delimiter.join(filter(None,
                                                   command_parts)).strip()

    BaseCommand = property(_get_base_command)

    def _input_as_list(self, data):
        '''Takes the positional arguments as input in a list.
        
        The list input here should be [query_file_path, database_file_path, 
        output_file_path]'''
        query, database, output = data
        if (not isabs(database)) \
          or (not isabs(query)) \
          or (not isabs(output)):
            raise ApplicationError, "Only absolute paths allowed.\n%s" %\
                                    ', '.join(data)

        self._database = FilePath(database)
        self._query = FilePath(query)
        self._output = ResultPath(output, IsWritten=True)

        ## check parameters that can only take a particular set of values
        # check combination of database and query types
        if self.Parameters['-t'].isOn() and self.Parameters['-q'].isOn() and \
         (self.Parameters['-t'].Value, self.Parameters['-q'].Value) not in \
         self._valid_combinations:
            error_message = "Invalid combination of database and query " + \
                            "types ('%s', '%s').\n" % \
                            (self.Paramters['-t'].Value,
                            self.Parameters['-q'].Value)

            error_message += "Must be one of: %s\n" % \
                             repr(self._valid_combinations)

            raise ApplicationError(error_message)

        # check database type
        if self.Parameters['-t'].isOn() and \
         self.Parameters['-t'].Value not in self._database_types:
            error_message = "Invalid database type %s\n" % \
                            self.Parameters['-t'].Value

            error_message += "Allowed values: %s\n" % \
                             ', '.join(self._database_types)

            raise ApplicationError(error_message)

        # check query type
        if self.Parameters['-q'].isOn() and \
         self.Parameters['-q'].Value not in self._query_types:
            error_message = "Invalid query type %s\n" % \
                            self.Parameters['-q'].Value

            error_message += "Allowed values: %s\n" % \
                            ', '.join(self._query_types)

            raise ApplicationError(error_message)

        # check mask type
        if self.Parameters['-mask'].isOn() and \
         self.Parameters['-mask'].Value not in self._mask_types:
            error_message = "Invalid mask type %s\n" % \
                            self.Parameters['-mask']

            error_message += "Allowed Values: %s\n" % \
                            ', '.join(self._mask_types)

            raise ApplicationError(error_message)

        # check qmask type
        if self.Parameters['-qMask'].isOn() and \
         self.Parameters['-qMask'].Value not in self._mask_types:
            error_message = "Invalid qMask type %s\n" % \
                            self.Parameters['-qMask'].Value

            error_message += "Allowed values: %s\n" % \
                             ', '.join(self._mask_types)

            raise ApplicationError(error_message)

        # check repeat type
        if self.Parameters['-repeats'].isOn() and \
         self.Parameters['-repeats'].Value not in self._mask_types:
            error_message = "Invalid repeat type %s\n" % \
                            self.Parameters['-repeat'].Value

            error_message += "Allowed values: %s\n" % \
                             ', '.join(self._mask_types)

            raise ApplicationError(error_message)

        # check output format
        if self.Parameters['-out'].isOn() and \
         self.Parameters['-out'].Value not in self._out_types:
            error_message = "Invalid output type %s\n" % \
                            self.Parameters['-out']

            error_message += "Allowed values: %s\n" % \
                             ', '.join(self._out_types)

            raise ApplicationError(error_message)

        return ''
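
# A minimal usage sketch for the Blat controller above, assuming the standard
# cogent.app.util calling convention; the three absolute paths are
# hypothetical, and _input_as_list expects them in the order
# [query, database, output].
blat = Blat()
blat.Parameters['-t'].on('dna')       # database type
blat.Parameters['-q'].on('dna')       # query type
blat.Parameters['-out'].on('blast9')  # tabular output with comment lines
res = blat(['/data/query.fasta',      # query sequences (hypothetical path)
            '/data/reference.fasta',  # database to search against
            '/data/hits.blast9'])     # where blat writes its results
hits = res['output'].read()
res.cleanUp()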