예제 #1
0
def check_flowgram_ali_exe():
    """Check that a working FlowgramAligner executable is available.

    The executable name comes from get_flowgram_ali_exe(). It is first looked
    up with which(), then invoked with ``-h``; the call must exit with status
    0 and its combined stdout/stderr must start with "Usage".

    Returns:
        True when both checks pass.

    Raises:
        ApplicationNotFoundError: the executable is not reachable via PATH.
        ApplicationError: the help call exits non-zero or prints something
            that does not start with "Usage".
    """
    ali_exe = get_flowgram_ali_exe()

    if which(ali_exe) is None:
        raise ApplicationNotFoundError("The alignment program %s is not "
                                       "accessible via the PATH environment "
                                       "variable." % ali_exe)

    # test if its callable and actually works
    command = "%s -h" % ali_exe
    proc = Popen(command,
                 shell=True,
                 universal_newlines=True,
                 stdout=PIPE,
                 stderr=STDOUT)

    # communicate() drains the pipe while waiting for the child. Calling
    # wait() first and reading afterwards can deadlock once the child has
    # written enough output to fill the OS pipe buffer.
    result, _ = proc.communicate()

    if proc.returncode != 0:
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact an executable."
            % ali_exe)

    # check that the help string looks correct
    if not result.startswith("Usage"):
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact an executable."
            % ali_exe)
    return True
예제 #2
0
    def _handle_app_result_build_failure(self, out, err, exit_status,
                                         result_paths):
        """Raise an ApplicationError when expected files were not produced.

        Only ``err`` is used: its contents are appended to the error message.
        ``out``, ``exit_status`` and ``result_paths`` are accepted but ignored.

        Raises:
            ApplicationError: always. The message carries the text read from
                ``err``; if ``err`` cannot be read (for example it is None),
                a generic message is used instead.
        """
        # Only the read is guarded. Previously the detailed error was raised
        # inside the try block and immediately swallowed by a bare except,
        # so callers never saw the stderr text.
        try:
            details = err.read()
        except Exception:
            raise ApplicationError('RAxML failed to run properly.')

        raise ApplicationError('RAxML failed to produce an output file due to the following error: \n\n%s ' \
         % details)
예제 #3
0
    def _set_command_line_parameters(self, data):
        """Complete and normalise the command line parameters in ``data``.

        ``data`` is modified in place and also returned. Parameters listed in
        ``self._parameter_order`` that are absent from ``data`` take their
        value from ``self._data``, unless they are required, in which case an
        ApplicationError is raised. The mol_type, char_priors, sub_matrix and
        output_path entries then get special handling.

        Raises:
            ApplicationError: a required parameter is missing, or
                single_pair_only == 1 without both pos1 and pos2 set.
        """
        for name in self._parameter_order:
            if name not in data:
                if name in self._required_parameters:
                    raise ApplicationError("Required parameter %s missing." %
                                           name)
                data[name] = self._data[name]
            # Potential path parameters are run through the lines input
            # handler (presumably writing them to disk -- confirm against
            # _input_as_lines); values it rejects with TypeError are kept
            # unchanged.
            if name in self._potential_paths:
                try:
                    data[name] = self._input_as_lines(data[name])
                except TypeError:
                    pass

        if data['single_pair_only'] == 1:
            if not data['pos1'] or not data['pos2']:
                raise ApplicationError(
                    "Must specify pos1 and pos2 if single_pair_only == 1.")

        # Translate the user supplied mol_type through the lookup table.
        mol_type = self._mol_type_lookup[str(data['mol_type']).lower()]
        data['mol_type'] = mol_type
        char_order = self._char_order[mol_type]

        # The remaining defaults depend on mol_type, so they cannot be
        # resolved in the generic loop above.
        if not data['char_priors']:
            data['char_priors'] = self._default_priors[mol_type]
        prior_lines = self._input_as_gctmpca_char_priors(
            data['char_priors'], char_order)
        data['char_priors'] = self._input_as_lines(prior_lines)

        if data['sub_matrix']:
            rate_lines = self._input_as_gctmpca_rate_matrix(
                data['sub_matrix'], char_order)
            data['sub_matrix'] = self._input_as_lines(rate_lines)
        else:
            data['sub_matrix'] = self._input_as_multiline_string(
                self._default_sub_matrix[mol_type])

        if not data['output_path']:
            data['output_path'] = self._input_as_path(self.getTmpFilename())
        return data
예제 #4
0
    def _get_base_command(self):
        """ Returns the full command string 

            input_arg: the argument to the command which represents the input 
                to the program, this will be a string, either 
                representing input or a filename to get input from
        """
        command_part1 = []
        command_part2 = []
        # Append a change directory to the beginning of the command to change
        # to self.WorkingDir before running the command
        cd_command = ''.join(['cd ', self.WorkingDir, ';'])
        if self._command1 is None:
            raise ApplicationError('_command has not been set.')

        parameters = self.Parameters
        command1 = self._command1
        command2 = self._command2

        command_part1.append(cd_command)
        command_part1.append(command1)
        command_part1.append(''.join(['2> ', self.WorkingDir, 'ShapesStderr']))

        command_part2.append(command2)

        command_part2.append(
            self._command_delimiter.join([
                _f for _f in (list(map(str, list(parameters.values())))) if _f
            ]))

        return self._command_delimiter.join(command_part1).strip(),\
               self._command_delimiter.join(command_part2).strip()
예제 #5
0
    def _get_result_paths(self, data):
        """Locate the ``.CPS.CPC`` result file for the current query.

        The query path is taken from the ``--query_NAST`` parameter with
        surrounding double quotes removed. When ``--exec_dir`` is on and the
        query path does not start with '/', the path is resolved under that
        directory. ``data`` is not used.

        Returns:
            dict with one key, 'CPS', mapped to a ResultPath for
            ``<query path>.CPS.CPC``.

        Raises:
            ApplicationError: the expected result file does not exist.
        """
        query_fp = str(self.Parameters['--query_NAST'].Value).strip('"')

        exec_dir_param = self.Parameters['--exec_dir']
        if exec_dir_param.isOn():
            exec_dir = str(exec_dir_param.Value).strip('"')
            # A leading '/' means the path is already absolute.
            if query_fp[0] != '/':
                query_fp = exec_dir + "/" + query_fp

        cps_fp = query_fp + ".CPS.CPC"
        if not exists(cps_fp):
            raise ApplicationError("Calling ChimeraSlayer failed.")

        return {'CPS': ResultPath(Path=cps_fp, IsWritten=True)}
예제 #6
0
    def _get_base_command(self):
        """ Returns the full command string

        Overridden here because there are positional arguments (specifically
        the input and output files).
        """
        command_parts = []
        # Append a change directory to the beginning of the command to change
        # to self.WorkingDir before running the command
        # WorkingDir should be in quotes -- filenames might contain spaces
        cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
        if self._command is None:
            raise ApplicationError('_command has not been set.')
        command = self._command
        # also make sure there's a subcommand!
        if self._subcommand is None:
            raise ApplicationError('_subcommand has not been set.')
        subcommand = self._subcommand
        # sorting makes testing easier, since the options will be written out
        # in alphabetical order. Could of course use option parsing scripts
        # in cogent for this, but this works as well.
        parameters = sorted(
            [str(x) for x in self.Parameters.values() if str(x)])
        synonyms = self._synonyms

        command_parts.append(cd_command)
        command_parts.append(command)
        # add in subcommand
        command_parts.append(subcommand)
        command_parts += parameters
        # add in the positional arguments in the correct order
        for k in self._input_order:
            # this check is necessary to account for optional positional
            # arguments, such as the mate file for bwa bwasw
            # Note that the input handler will ensure that all required
            # parameters have valid values
            if k in self._input:
                command_parts.append(self._input[k])

        return self._command_delimiter.join(command_parts).strip()
예제 #7
0
    def __call__(self, data=None, remove_tmp=True):
        """Run the application on ``data`` and return its result object.

        data: anything that can be cast into a string or written out to
            a file. Usually either a list of things or a single string or
            number. The input handler named by self.InputHandler is called
            on this data first; it stores the input file path in
            self._input_filename.

        remove_tmp: if True, removes the temporary input file afterwards.

        Returns a CommandLineAppResult built from the captured stdout and
        stderr files (None when suppressed), the exit status and the result
        paths.

        Raises ApplicationError when the exit status is not accepted by
        self._accept_exit_status.
        """
        # Process the input data.  Input filepath is stored in
        # self._input_filename
        getattr(self, self.InputHandler)(data)

        # The capture files are opened 'w+' rather than 'w': they are rewound
        # below and handed to the result object for reading, which a
        # write-only handle cannot do.
        if self.SuppressStdout:
            outfile = None
        else:
            outfile = open(self.getTmpFilename(self.TmpDir), 'w+')
        if self.SuppressStderr:
            errfile = None
        else:
            errfile = open(self.getTmpFilename(self.TmpDir), 'w+')

        args = [self._command, self._compile_mothur_script()]
        try:
            process = Popen(args,
                            stdout=outfile,
                            stderr=errfile,
                            cwd=self.WorkingDir)
            exit_status = process.wait()
            if not self._accept_exit_status(exit_status):
                raise ApplicationError(
                    'Unacceptable application exit status: %s, command: %s' % \
                        (exit_status, args))
        except Exception:
            # Nobody receives the handles on failure, so close them here
            # instead of leaking them.
            for handle in (outfile, errfile):
                if handle is not None:
                    handle.close()
            raise

        # Rewind so consumers read the captured output from the start.
        if outfile is not None:
            outfile.seek(0)
        if errfile is not None:
            errfile.seek(0)
        result = CommandLineAppResult(outfile,
                                      errfile,
                                      exit_status,
                                      result_paths=self._get_result_paths())

        # Clean up the input file if one was created
        if remove_tmp:
            if self._input_filename:
                remove(self._input_filename)
                self._input_filename = None

        return result
예제 #8
0
    def _input_as_parameters(self, data):
        """ Set the input paths (a NAST aligned fasta filepath)
        """
        # The list of values which can be passed on a per-run basis
        allowed_values = ['--query_NAST', '--db_NAST', '--db_FASTA', '-R']

        unsupported_parameters = set(data.keys()) - set(allowed_values)
        if unsupported_parameters:
            raise ApplicationError(
                "Unsupported parameter(s) passed when calling ChimeraSlayer: %s"
                % ' '.join(unsupported_parameters))

        return ''
예제 #9
0
    def _input_as_dict(self, data):
        """Takes dictionary that sets input and output files.

        Valid keys for the dictionary are specified in the subclasses. File
        paths must be absolute.
        """
        # clear self._input; ready to receive new input and output files
        self._input = {}
        # Check that the arguments to the
        # subcommand-specific parameters are valid
        self.check_arguments()

        # Ensure that we have all required input (file I/O)
        for k in self._input_order:
            # N.B.: optional positional arguments begin with underscore (_)!
            # (e.g., see _mate_in for bwa bwasw)
            if k[0] != '_' and k not in data:
                raise ApplicationError("Missing required input %s" % k)

        # Set values for input and output files
        for k in data:
            # check for unexpected keys in the dict
            if k not in self._input_order:
                error_message = "Invalid input arguments (%s)\n" % k
                error_message += "Valid keys are: %s" % repr(self._input_order)
                raise ApplicationError(error_message + '\n')

            # check for absolute paths
            if not isabs(data[k][0]):
                raise ApplicationError("Only absolute paths allowed.\n%s" %
                                       repr(data))
            self._input[k] = data[k]

        # if there is a -f option to specify an output file, force the user to
        # use it (otherwise things to to stdout)
        if '-f' in self.Parameters and not self.Parameters['-f'].isOn():
            raise ApplicationError("Please specify an output file with -f")

        return ''
예제 #10
0
파일: bwa.py 프로젝트: rob-knight/qiime
    def check_arguments(self):
        """Sanity check the arguments passed in.

        For every parameter that is turned on and has an entry in the
        _valid_arguments dictionary (specified in the subclasses), the
        boolean function stored there is called with the parameter's Value.

        Raises:
            ApplicationError: a validator returns a false value.
        """
        # items() instead of iteritems(): the latter exists only on
        # Python 2 dicts, while items() works on both major versions.
        for name, param in self.Parameters.items():
            if not param.isOn():
                continue
            if name not in self._valid_arguments:
                continue
            if not self._valid_arguments[name](param.Value):
                error_message = 'Invalid argument (%s) ' % param.Value
                error_message += 'for parameter %s\n' % name
                raise ApplicationError(error_message)
예제 #11
0
    def setUp(self):
        """Create the temporary files and directory used by the tests.

        Creates an output directory, a fasta input file inside it, and two
        named temporary files holding the id-to-taxonomy map and the
        reference sequences. It then checks that RDP_JAR_PATH names a file
        whose basename contains '2.2' and starts a 60 second timeout.

        Raises:
            ApplicationError: RDP_JAR_PATH is unset or its basename does not
                contain '2.2'.
        """
        self.files_to_remove = []
        self.dirs_to_remove = []

        tmp_dir = get_qiime_temp_dir()
        self.test_out = get_tmp_filename(
            tmp_dir=tmp_dir,
            prefix='qiime_parallel_taxonomy_assigner_tests_',
            suffix='',
            result_constructor=str)
        self.dirs_to_remove.append(self.test_out)
        create_dir(self.test_out)

        # Temporary input file
        self.tmp_seq_filepath = get_tmp_filename(
            tmp_dir=self.test_out,
            prefix='qiime_parallel_taxonomy_assigner_tests_input',
            suffix='.fasta')
        # The with block closes the handle even if the write fails.
        with open(self.tmp_seq_filepath, 'w') as seq_file:
            seq_file.write(rdp_test_seqs)
        self.files_to_remove.append(self.tmp_seq_filepath)

        self.id_to_taxonomy_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
            suffix='.txt',
            dir=tmp_dir)
        self.id_to_taxonomy_file.write(rdp_id_to_taxonomy)
        # Rewind so the content is flushed and readable from the start.
        self.id_to_taxonomy_file.seek(0)

        self.reference_seqs_file = NamedTemporaryFile(
            prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
            suffix='.fasta',
            dir=tmp_dir)
        self.reference_seqs_file.write(rdp_reference_seqs)
        self.reference_seqs_file.seek(0)

        # getenv returns None when the variable is unset. Guard it so the
        # intended ApplicationError is raised instead of a TypeError from
        # basename(None).
        jar_fp = getenv('RDP_JAR_PATH')
        if jar_fp is None or '2.2' not in basename(jar_fp):
            raise ApplicationError(
                "RDP_JAR_PATH does not point to version 2.2 of the "
                "RDP Classifier.")

        initiate_timeout(60)
예제 #12
0
    def _get_jar_fp(self):
        """Returns the full path to the JAR file.

        Raises an ApplicationError if the JAR file cannot be
        found in the (1) current directory or (2) the path specified
        in the RDP_JAR_PATH environment variable.
        """
        # handles case where the jar file is in the current working directory
        if exists(self._command):
            return self._command
        # handles the case where the user has specified the location via
        # an environment variable
        elif 'RDP_JAR_PATH' in environ:
            return getenv('RDP_JAR_PATH')
        # error otherwise
        else:
            raise ApplicationError("$RDP_JAR_PATH is not set -- this must be set to use the"+\
             " RDP classifier application controller.")
예제 #13
0
    def _input_as_parameters(self,data):
        """ Set the input path (a fasta filepath)
        """
        # The list of values which can be passed on a per-run basis
        allowed_values = ['-r','-t','-a','-b','-l','-d','i','-o','-m','-v','-f', '-g']

        unsupported_parameters = set(data.keys()) - set(allowed_values)
        if unsupported_parameters:
            raise ApplicationError("Unsupported parameter(s) passed when calling rtax: %s" %\
              ' '.join(unsupported_parameters))

        for v in allowed_values:
            # turn the parameter off so subsequent runs are not
            # affected by parameter settings from previous runs
            self.Parameters[v].off()
            if v in data:
                # turn the parameter on if specified by the user
                self.Parameters[v].on(data[v])

        return ''
예제 #14
0
    def _input_as_parameters(self, data):
        """ Set the input path (a fasta filepath)
        """
        # The list of values which can be passed on a per-run basis
        allowed_values = ['--input','--uc','--fastapairs',\
                           '--uc2clstr','--output','--mergesort']

        unsupported_parameters = set(data.keys()) - set(allowed_values)
        if unsupported_parameters:
            raise ApplicationError("Unsupported parameter(s) passed when calling uclust: %s" %\
              ' '.join(unsupported_parameters))

        for v in allowed_values:
            # turn the parameter off so subsequent runs are not
            # affected by parameter settings from previous runs
            self.Parameters[v].off()
            if v in data:
                # turn the parameter on if specified by the user
                self.Parameters[v].on(data[v])

        return ''
예제 #15
0
파일: mothur.py 프로젝트: yatisht/pycogent
    def _derive_log_path(self):
        """Guess logfile path produced by Mothur

        This method checks the working directory for log files
        generated by Mothur.  It will raise an ApplicationError if no
        log file can be found.

        Mothur generates log files named in a nondeterministic way,
        using the current time.  We return the log file with the most
        recent time, although this may lead to incorrect log file
        detection if you are running many instances of mothur
        simultaneously.
        """
        filenames = listdir(self.WorkingDir)
        lognames = [x for x in filenames if re.match("^mothur\.\d+\.logfile$", x)]
        if not lognames:
            raise ApplicationError(
                'No log file detected in directory %s. Contents: \n\t%s' % (
                    input_dir, '\n\t'.join(possible_logfiles)))
        most_recent_logname = sorted(lognames, reverse=True)[0]
        return path.join(self.WorkingDir, most_recent_logname)
예제 #16
0
def assign_dna_reads_to_dna_database(query_fasta_fp,
                                     database_fasta_fp,
                                     output_fp,
                                     params=None):
    """Assign DNA reads to a database fasta of DNA sequences.

    Thin wrapper around assign_reads_to_database that pins both the
    database type ('-t') and the query type ('-q') to 'dna'. Every other
    setting is forwarded from ``params`` unchanged.

    query_fasta_fp: path to the query fasta file containing DNA sequences.
    database_fasta_fp: path to the database fasta file containing DNA
                      sequences.
    output_fp: path where the output file will be generated.
    params: optional dict of parameter settings to use instead of the
            defaults. It must not contain '-t' or '-q'.

    Returns whatever assign_reads_to_database returns (per the original
    documentation an open file object in blast9 format -- confirm against
    that function).

    Raises ApplicationError if ``params`` tries to set '-t' or '-q'.
    """
    user_params = {} if params is None else params

    # The database and query types are fixed by this wrapper; refuse any
    # attempt to override them.
    if '-t' in user_params or '-q' in user_params:
        raise ApplicationError("Cannot change database or query types when " +
                               "using assign_dna_reads_to_dna_database. " +
                               "Use assign_reads_to_database instead.\n")

    merged_params = {'-t': 'dna', '-q': 'dna'}
    merged_params.update(user_params)

    return assign_reads_to_database(query_fasta_fp, database_fasta_fp,
                                    output_fp, merged_params)
예제 #17
0
    def _get_base_command(self):
        """Returns the base command plus command-line options.

        Does not include input file, output file, and training set.
        """
        # Necessary? Preserve for consistency.
        if self._command is None:
            raise ApplicationError('_command has not been set.')

        # Append a change directory to the beginning of the command to change
        # to self.WorkingDir before running the command
        # WorkingDir should be in quotes -- filenames might contain spaces
        cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])

        jvm_command = "java"
        jvm_arguments = self._commandline_join(
            list(self.JvmParameters.values()))
        jar_arguments = '-jar "%s"' % self._get_jar_fp()

        result = self._commandline_join(
            [cd_command, jvm_command, jvm_arguments, jar_arguments])
        return result
예제 #18
0
    def _get_base_command(self):
        """Gets the command that will be run when the app controller is
        called.
        """
        command_parts = []
        cd_command = ''.join(['cd ',str(self.WorkingDir),';'])
        if self._command is None:
            raise ApplicationError('_command has not been set.')
        command = self._command
        parameters = sorted([str(x) for x in list(self.Parameters.values()) 
                            if str(x)])

        synonyms = self._synonyms

        command_parts.append(cd_command)
        command_parts.append(command)
        command_parts.append(self._database) # Positional argument
        command_parts.append(self._query) # Positional argument
        command_parts += parameters
        if self._output: command_parts.append(self._output.Path) # Positional

        return self._command_delimiter.join([_f for _f in command_parts if _f]).strip()
예제 #19
0
    def _get_base_command(self):
        """ Returns the full command string

            Overides the __call__ function in util.py becasue of the special
            circumstance surrounding the command line input.

            input_arg: the argument to the command which represents the input 
                to the program, this will be a string, either 
                representing input or a filename to get input from
        """
        command_part1 = []
        command_part2 = []
        # Append a change directory to the beginning of the command to change
        # to self.WorkingDir before running the command
        cd_command = ''.join(['cd ', self.WorkingDir, ';'])
        if self._command is None:
            raise ApplicationError('_command has not been set.')
        command = self._command

        command_part1.append(cd_command)
        command_part1.append(command)

        lista = [self.Parameters['-alignment'],\
                  self.Parameters['-M'],\
                  self.Parameters['-gap_cost'],\
                  self.Parameters['-max_structures'],\
                  self.Parameters['-max_percent_diff'],\
                  self.Parameters['-bp_window'],\
                  self.Parameters['-align_window'],\
                  self.Parameters['-single_bp_inserts']]

        command_part2.append(
            self._command_delimiter.join(
                [_f for _f in (list(map(str, lista))) if _f]))

        return self._command_delimiter.join(command_part1).strip(),\
               self._command_delimiter.join(command_part2).strip()
예제 #20
0
    def __call__(self, data=None, remove_tmp=True):
        """Run the application, then attach the training output files.

        data: anything that can be cast into a string or written out
          to a file. It is forwarded unchanged to the parent class's
          __call__, which applies the input handler.

        remove_tmp: if True, removes tmp files (forwarded to the parent).

        Returns the parent's result object with four extra entries, each an
        open file handle on a training output file under self.ModelDir.

        Raises ApplicationError, including the captured standard error
        text, if any of the expected training files is missing.
        """
        result = super(RdpClassifier, self).__call__(data=data, remove_tmp=remove_tmp)

        # (result key, file name) pairs, listed in sorted key order.
        expected_outputs = (
            ('bergeyTree', 'bergeyTrainingTree.xml'),
            ('probabilityIndex', 'wordConditionalProbIndexArr.txt'),
            ('probabilityList', 'genus_wordConditionalProbList.txt'),
            ('wordPrior', 'logWordPrior.txt'),
        )
        for result_key, file_name in expected_outputs:
            model_fp = os.path.join(self.ModelDir, file_name)
            if os.path.exists(model_fp):
                # Deliberately not wrapped in try/except: the file is known
                # to exist, so a failure here is exceptional and the
                # original exception should surface.
                result[result_key] = open(model_fp)
                continue

            missing_msg = (
                "Training output file %s not found.  This may "
                "happen if an error occurred during the RDP training "
                "process.  More details may be available in the "
                "standard error, printed below.\n\n" % model_fp
                )
            stderr_text = result["StdErr"].read()
            # Rewind so the stream can still be read by others.
            result["StdErr"].seek(0)
            raise ApplicationError(missing_msg + stderr_text)
        return result
예제 #21
0
def filter_with_flowgram(id,
                         flowgram,
                         flowgrams,
                         header,
                         ids,
                         num_flows,
                         bestscores,
                         log_fh,
                         outdir="/tmp/",
                         threshold=3.75,
                         num_cpus=32,
                         fast_method=True,
                         on_cluster=False,
                         mapping=None,
                         spread=[],
                         verbose=False,
                         pair_id_thresh=0.97,
                         client_sockets=[],
                         error_profile=DENOISER_DATA_DIR +
                         'FLX_error_profile.dat'):
    """Filter all flowgrams against one master flowgram and cluster matches.

    id: The flowgram identifier of the master flowgram of this round

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterator containing the flowgrams to be filtered

    header: a valid sff.txt header

    ids: marks the active flowgrams, i.e. flowgrams that are unclustered.
         Entries of clustered flowgrams are deleted from it.

    num_flows: Number of flows remaining in the current round

    bestscores: dictionary that stores for each unclustered flowgram the best
                score it has to one of the centroids previously seen
                and the id of the centroid. Used in the second denoising phase.
                Updated in place.

    log_fh: writable handle; receives one progress line when verbose is True

    outdir: directory where intermediate and result files go

    threshold: Filtering threshold; scores below it join the cluster

    num_cpus: number of cpus to run on, if on_cluster == True

    fast_method: Boolean value for fast denoising with lots of memory

    on_cluster: Boolean flag for local vs cluster

    mapping: the current cluster mapping. Updated in place: members of
             absorbed clusters are moved under ``id``.

    spread: worker processing throughput (forwarded to the cluster routine)

    verbose: if True, write a progress line to log_fh

    pair_id_thresh: flowgrams whose pair id is at least this value join the
                    cluster regardless of their score

    client_sockets: forwarded to the cluster routine when on_cluster is True

    error_profile: Path to error profile *.dat file


    Implementation detail:
    The iterator behind 'flowgrams' is big and thus we want to keep its traversals
    at a minimum. The naive implementation of this filter function would traverse the
    iterator once to create the input file for the alignment routine, then a second
    time to do the actual filtering. To get rid of the second run through the iterator,
    we keep a list (in fact a dict) of active 'ids' and do the filtering only in the next
    round. A cleaner but still fast solution would be great, as this definitely poses a
    pitfall for future modifications.

    Returns a tuple (flowgrams, count): the flowgrams object handed back by
    the distance routine and the number of flowgrams still unclustered.

    Raises ApplicationError if the bookkeeping in ids/bestscores disagrees
    with the number of unclustered flowgrams.
    """
    if verbose:
        log_fh.write("Filtering with %s: %d flowgrams\n" % (id, num_flows))

    # set up the flowgram storage
    if (not fast_method):
        fc = FlowgramContainerFile(header, outdir)
    else:
        fc = FlowgramContainerArray()

    # calculate distance scores
    if on_cluster:
        (scores, names, flowgrams) =\
            get_flowgram_distances_on_cluster(
                id, flowgram, flowgrams, fc, ids, num_cpus,
                num_flows, spread=spread, client_sockets=client_sockets)
    else:
        (scores, names, flowgrams) =\
            get_flowgram_distances(
                id, flowgram, flowgrams, fc, ids, outdir=outdir,
                error_profile=error_profile)

    # shortcut for non-matching flowgrams
    # any() replaces len(filter(...)): on Python 3 filter() returns an
    # iterator, so taking its len() raised a TypeError.
    has_survivors = any(
        a_b[0] < threshold or a_b[1] >= pair_id_thresh for a_b in scores)
    if not has_survivors:
        # put it in its own cluster
        # and remove it from any further searches
        if (id in bestscores):
            del (bestscores[id])
        del (ids[id])
        return (flowgrams, num_flows - 1)

    # Do the filtering
    non_clustered_ctr = 0
    for ((score, pair_id), name) in zip(scores, names):
        if (score < threshold or name == id or pair_id >= pair_id_thresh):
            # make sure the original flowgram gets into this cluster
            del (ids[name])
            if (name in bestscores):
                del (bestscores[name])
            if (id != name):
                # update the mapping information
                mapping[id].extend(mapping[name])
                mapping[id].append(name)
                # delete the old cluster from the mapping
                del (mapping[name])
        else:
            non_clustered_ctr += 1
            # keep track of the best match of this guy to any centroid
            if (name not in bestscores or score < bestscores[name][1]):
                bestscores[name] = (id, score)

    # Some extra safety that we are not missing anything
    if (len(ids) != non_clustered_ctr or len(bestscores) != non_clustered_ctr):
        raise ApplicationError("filterWithFlowgram failed")

    return (flowgrams, non_clustered_ctr)
예제 #22
0
파일: bwa.py 프로젝트: rob-knight/qiime
def assign_reads_to_database(query, database_fasta, out_path, params=None):
    """Assign a set of query sequences to a reference database using BWA.

    query: absolute file path to the query sequences (passed to bwa bwasw
           as the query fasta, or to bwa aln / bwa samse as the fastq input)
    database_fasta: absolute file path to the reference database; a BWA
                    index is built from it under a temporary prefix
    out_path: absolute file path of the file to be output
    params: dict of BWA specific parameters. The dict passed in is copied,
            never modified.
            * Specify which algorithm to use (bwa-short or bwasw) using the
            dict key "algorithm"
            * if algorithm is bwasw, specify params for the bwa bwasw
            subcommand
            * if algorithm is bwa-short, specify params for the bwa samse
            subcommand
            * if algorithm is bwa-short, must also specify params to use with
            bwa aln, which is used to get the sai file necessary to run samse.
            bwa aln params should be passed in using dict key "aln_params" and
            the associated value should be a dict of params for the bwa aln
            subcommand
            * if a temporary directory is not specified in params using dict
            key "temp_dir", it will be assumed to be /tmp

    Raises ApplicationError if "algorithm" is missing or unrecognized, or if
    algorithm is bwa-short and "aln_params" is missing.

    This method returns an open file object (SAM format).
    """
    # Work on a private copy: '-f' and 'temp_dir' are written below and must
    # not leak into the dict owned by the caller.
    params = {} if params is None else dict(params)

    # set the output path
    params['-f'] = out_path

    # if the algorithm is not specified in the params dict, or the algorithm
    # is not recognized, raise an exception
    if 'algorithm' not in params:
        raise ApplicationError("Must specify which algorithm to use " +
                               "('bwa-short' or 'bwasw')")
    algorithm = params['algorithm']
    if algorithm not in ('bwa-short', 'bwasw'):
        raise ApplicationError('Unknown algorithm "%s". ' % algorithm +
                               "Please enter either 'bwa-short' or 'bwasw'.")

    # if the temp directory is not specified, assume /tmp
    params.setdefault('temp_dir', '/tmp')

    # if the algorithm is bwa-short, we must use bwa aln to get an sai file
    # before calling bwa samse on that sai file, so we need to know how to
    # run bwa aln. Therefore, we must ensure there's an entry containing
    # those parameters
    if algorithm == 'bwa-short' and 'aln_params' not in params:
        raise ApplicationError("With bwa-short, need to specify a key " +
                               "'aln_params' and its value, a " +
                               "dictionary to pass to bwa aln, since " +
                               "bwa aln is an intermediate step when " +
                               "doing bwa-short.")

    # "algorithm", "temp_dir" and "aln_params" are directives for this
    # function, not options of any bwa subcommand, so strip them out
    own_keys = ('algorithm', 'temp_dir', 'aln_params')
    subcommand_params = {k: v for k, v in params.items()
                         if k not in own_keys}

    # build index from database_fasta
    # get a temporary file name that is not in use
    index_prefix = get_tmp_filename(tmp_dir=params['temp_dir'], suffix='',
                                    result_constructor=str)

    create_bwa_index_from_fasta_file(database_fasta, {'-p': index_prefix})

    if algorithm == 'bwasw':
        # bwasw is simple: instantiate the controller and set the files
        bwa = BWA_bwasw(params=subcommand_params)
        files = {'prefix': index_prefix, 'query_fasta': query}
    else:
        # bwa-short: first call bwa aln to get the sai file needed for
        # samse, using the aln_params we ensured we had above
        bwa_aln = BWA_aln(params=params['aln_params'])
        aln_files = {'prefix': index_prefix, 'fastq_in': query}
        sai_file_path = bwa_aln(aln_files)['output'].name

        # then use that sai file to run samse
        bwa = BWA_samse(params=subcommand_params)
        files = {
            'prefix': index_prefix,
            'sai_in': sai_file_path,
            'fastq_in': query
        }

    # run whichever app controller we decided was correct on the files
    # we set up; both return a SAM file, so return that
    result = bwa(files)
    return result['output']
예제 #23
0
 def _handle_app_result_build_failure(self, out, err, exit_status,
                                      result_paths):
     """Raise an ApplicationError when no output file was produced.

     out, exit_status, result_paths: accepted for interface compatibility;
         not used here.
     err: file-like object whose contents are included in the error
         message, or None.

     Always raises ApplicationError.
     """
     # Without this guard a missing stderr stream (err is None) would raise
     # AttributeError and hide the real failure from the caller.
     details = err.read() if err is not None else ''
     raise ApplicationError('ParsInsert failed to produce an output file due to the following error: \n\n%s ' \
      % details)
예제 #24
0
def assign_dna_reads_to_protein_database(query_fasta_fp,
                                         database_fasta_fp,
                                         output_fp,
                                         temp_dir="/tmp",
                                         params=None):
    """Assign DNA reads to a database fasta of protein sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed. A temporary
    file must be written containing the translated sequences from the input
    query fasta file because BLAT cannot do this automatically.

    query_fasta_fp: absolute path to the query fasta file containing DNA
                   sequences.
    database_fasta_fp: absolute path to the database fasta file containing
                      protein sequences.
    output_fp: absolute path where the output file will be generated.
    temp_dir: optional. Change the location where the translated sequences
              will be written before being used as the query. Defaults to
              /tmp.
    params: optional. dict containing parameter settings to be used
                  instead of default values. Cannot change database or query
                  file types from protein and dna, respectively. The key
                  'genetic_code' selects the entry of GeneticCodes used for
                  translation (default 1) and is not forwarded. The dict
                  passed in is copied, never modified.

    Raises ApplicationError if temp_dir is not absolute or if params tries
    to set '-t' or '-q'.

    This method returns an open file object. The output format
    defaults to blast9 and should be parsable by the PyCogent BLAST parsers.
    """
    # Work on a copy: 'genetic_code' is removed below and that must not be
    # visible in the caller's dict.
    params = {} if params is None else dict(params)

    my_params = {'-t': 'prot', '-q': 'prot'}

    # make sure temp_dir specifies an absolute path
    if not isabs(temp_dir):
        raise ApplicationError("temp_dir must be an absolute path.")

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError(
            "Cannot change database or query types "
            "when using assign_dna_reads_to_protein_database. Use "
            "assign_reads_to_database instead.")

    my_genetic_code = GeneticCodes[params.pop('genetic_code', 1)]
    my_params.update(params)

    # get six-frame translation of the input DNA sequences and write them to
    # temporary file.
    tmp = get_tmp_filename(tmp_dir=temp_dir, result_constructor=str)
    frames = [1, 2, 3, -1, -2, -3]
    entry = '>{seq_id}_frame_{frame}\n{trans}\n'

    completed = False
    try:
        with open(tmp, 'w') as tmp_out, open(query_fasta_fp) as query_fh:
            for label, sequence in MinimalFastaParser(query_fh):
                seq_id = label.split()[0]

                s = DNA.makeSequence(sequence)
                translations = dict(zip(frames,
                                        my_genetic_code.sixframes(s)))

                # entries are written in ascending frame order (-3 .. 3)
                for frame, translation in sorted(translations.items()):
                    tmp_out.write(entry.format(seq_id=seq_id, frame=frame,
                                               trans=translation))

        result = assign_reads_to_database(tmp,
                                          database_fasta_fp,
                                          output_fp,
                                          params=my_params)
        completed = True
    finally:
        # Always drop the temporary query file. If something already went
        # wrong, a failing remove() must not mask that original error.
        try:
            remove(tmp)
        except OSError:
            if completed:
                raise

    return result
예제 #25
0
def denoise_seqs(sff_fps,
                 fasta_fp,
                 tmpoutdir,
                 preprocess_fp=None,
                 cluster=False,
                 num_cpus=1,
                 squeeze=True,
                 percent_id=0.97,
                 bail=1,
                 primer="",
                 low_cutoff=3.75,
                 high_cutoff=4.5,
                 log_fp="denoiser.log",
                 low_memory=False,
                 verbose=False,
                 error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                 max_num_rounds=None,
                 titanium=False,
                 checkpoint_fp=None):
    """The main routine to denoise flowgrams.

    Runs preprocessing (unless preprocessed data is supplied), greedy
    clustering and secondary clustering, then stores the clusters and the
    mapping in tmpoutdir.

    sff_fps: list of paths to the input sff files
    fasta_fp: fasta file path handed to the preprocessing step
    tmpoutdir: directory receiving the log file and the results
    preprocess_fp: location of already preprocessed data; when given the
                   preprocessing step is skipped. Required together with
                   checkpoint_fp.
    cluster: if True, preprocessing and clustering run on a cluster
    num_cpus: forwarded to greedy_clustering
    squeeze, primer: forwarded to the preprocessing step
    percent_id: pair identity threshold for greedy_clustering
    bail: forwarded to greedy_clustering as bail_out
    low_cutoff: threshold used by greedy_clustering
    high_cutoff: threshold used by secondary_clustering
    log_fp: name of the log file, created inside tmpoutdir
    low_memory: if True, greedy_clustering runs with fast_method=False
    verbose: if True, a log file is written
    error_profile: error profile path forwarded to greedy_clustering
    max_num_rounds: forwarded to greedy_clustering
    titanium: if True, error_profile, low_cutoff and high_cutoff are
              replaced by the Titanium settings
    checkpoint_fp: checkpoint to resume greedy_clustering from

    Raises ApplicationError if checkpoint_fp is given without preprocess_fp.
    """

    # abort if binary is missing
    check_flowgram_ali_exe()

    log_path = tmpoutdir + "/" + log_fp
    # buffering is switched off (third argument) for the log file
    log_fh = open(log_path, "w", 0) if verbose else None

    # The try/finally guarantees the log handle is released on every exit
    # path, including errors raised by any of the phases below.
    try:
        # overwrite settings if titanium is set
        # This flag is only used from qiime. Remove after qiime integration
        if titanium:
            error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
            low_cutoff = 4
            high_cutoff = 5

        if verbose:
            log_fh.write("Denoiser version: %s\n" % __version__)
            log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
            log_fh.write("Fasta file: %s\n" % fasta_fp)
            log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
            if checkpoint_fp:
                log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
            log_fh.write("Primer sequence: %s\n" % primer)
            log_fh.write("Running on cluster: %s\n" % cluster)
            log_fh.write("Num CPUs: %d\n" % num_cpus)
            log_fh.write("Squeeze Seqs: %s\n" % squeeze)
            log_fh.write("tmpdir: %s\n" % tmpoutdir)
            log_fh.write("percent_id threshold: %.2f\n" % percent_id)
            log_fh.write(
                "Minimal sequence coverage for first phase: %d\n" % bail)
            log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
            log_fh.write("High cut-off: %.2f\n" % high_cutoff)
            log_fh.write("Error profile: %s\n" % error_profile)
            log_fh.write(
                "Maximal number of iteration: %s\n\n" % max_num_rounds)

        # here we go ...
        # Phase I - clean up and truncate input sff
        if checkpoint_fp and not preprocess_fp:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")

        if preprocess_fp:
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        elif cluster:
            preprocess_on_cluster(sff_fps,
                                  log_fp,
                                  fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir,
                                  verbose=verbose,
                                  squeeze=squeeze,
                                  primer=primer)
            (deprefixed_sff_fp, l, mapping,
             seqs) = read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, l, mapping, seqs) = \
                preprocess(
                    sff_fps, log_fh, fasta_fp=fasta_fp, out_fp=tmpoutdir,
                    verbose=verbose, squeeze=squeeze, primer=primer)

        # preprocessor writes into same file, so better jump to end of file
        # (only done when not resuming from a checkpoint)
        if verbose and not checkpoint_fp:
            log_fh.close()
            log_fh = open(log_path, "a", 0)

        # phase II:
        # use prefix map based clustering as initial centroids and greedily
        # add flowgrams to clusters with a low threshold
        (new_sff_file, bestscores, mapping) = \
            greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir, l,
                              log_fh, num_cpus=num_cpus, on_cluster=cluster,
                              bail_out=bail, pair_id_thresh=percent_id,
                              threshold=low_cutoff, verbose=verbose,
                              fast_method=not low_memory,
                              error_profile=error_profile,
                              max_num_rounds=max_num_rounds,
                              checkpoint_fp=checkpoint_fp)

        # phase III:
        # Assign seqs to nearest existing centroid with high threshold
        secondary_clustering(new_sff_file,
                             mapping,
                             bestscores,
                             log_fh,
                             verbose=verbose,
                             threshold=high_cutoff)
        remove(new_sff_file)
        if verbose:
            log_fh.write("Finished clustering\n")
            log_fh.write("Writing Clusters\n")
            log_fh.write(make_stats(mapping) + "\n")
        store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
        store_mapping(mapping, tmpoutdir, "denoiser")
    finally:
        if log_fh is not None:
            log_fh.close()
예제 #26
0
    def _input_as_list(self, data):
        '''Takes the positional arguments as input in a list.

        The list input here should be [query_file_path, database_file_path,
        output_file_path]. All three must be absolute paths; they are stored
        on the instance as self._query, self._database and self._output.

        Also validates every parameter that only accepts a fixed set of
        values (-t, -q, -mask, -qMask, -repeats, -out) and the combination
        of database and query type.

        Raises ApplicationError on a relative path or an invalid value.
        Returns an empty string.'''
        query, database, output = data
        if not (isabs(database) and isabs(query) and isabs(output)):
            raise ApplicationError("Only absolute paths allowed.\n%s" %
                                   ', '.join(data))

        self._database = FilePath(database)
        self._query = FilePath(query)
        self._output = ResultPath(output, IsWritten=True)

        params = self.Parameters

        # check combination of database and query type
        db_type = params['-t']
        query_type = params['-q']
        if db_type.isOn() and query_type.isOn() and \
                (db_type.Value, query_type.Value) not in \
                self._valid_combinations:
            error_message = "Invalid combination of database and query " + \
                            "types ('%s', '%s').\n" % \
                            (db_type.Value, query_type.Value)

            error_message += "Must be one of: %s\n" % \
                             repr(self._valid_combinations)

            raise ApplicationError(error_message)

        # Parameters that can only take a particular set of values, checked
        # in this order: (flag, message for the bad value, message listing
        # the allowed values, allowed values).
        restricted = (
            ('-t', "Invalid database type %s\n", "Allowed values: %s\n",
             self._database_types),
            ('-q', "Invalid query type %s\n", "Allowed values: %s\n",
             self._query_types),
            ('-mask', "Invalid mask type %s\n", "Allowed Values: %s\n",
             self._mask_types),
            ('-qMask', "Invalid qMask type %s\n", "Allowed values: %s\n",
             self._mask_types),
            ('-repeats', "Invalid repeat type %s\n", "Allowed values: %s\n",
             self._mask_types),
            ('-out', "Invalid output type %s\n", "Allowed values: %s\n",
             self._out_types),
        )
        for flag, invalid_msg, allowed_msg, allowed in restricted:
            parameter = params[flag]
            if parameter.isOn() and parameter.Value not in allowed:
                # always report the offending value itself, looked up under
                # the same flag that was just tested
                error_message = invalid_msg % parameter.Value
                error_message += allowed_msg % ', '.join(allowed)
                raise ApplicationError(error_message)

        return ''
예제 #27
0
def get_clusters_from_fasta_filepath(fasta_filepath,
                                     original_fasta_path,
                                     percent_ID=0.97,
                                     max_accepts=1,
                                     max_rejects=8,
                                     stepwords=8,
                                     word_length=8,
                                     optimal=False,
                                     exact=False,
                                     suppress_sort=False,
                                     output_dir=None,
                                     enable_rev_strand_matching=False,
                                     subject_fasta_filepath=None,
                                     suppress_new_clusters=False,
                                     return_cluster_maps=False,
                                     stable_sort=False,
                                     save_uc_files=True,
                                     HALT_EXEC=False):
    """ Main convenience wrapper for using uclust to generate cluster files

    A source fasta file is required for the fasta_filepath.  Unless
    suppress_sort is set, it is first sorted with uclust and the sorted
    temporary file is removed again once clustering has finished.  The
    (sorted) fasta file is then clustered into a file in the uclust (.uc)
    format, which is parsed with clusters_from_uc_file.

    If save_uc_files is True the .uc file is kept at the path derived from
    output_dir and original_fasta_path; otherwise the files created by the
    clustering step are cleaned up before returning.

    The percent_ID parameter specifies the percent identity for a clusters,
    i.e., if 99% were the parameter, all sequences that were 99% identical
    would be grouped as a cluster.

    The remaining keyword arguments are forwarded unchanged to
    uclust_cluster_from_sorted_fasta_filepath.

    Returns (clusters, failures, seeds).  clusters is the mapping produced
    by clusters_from_uc_file when return_cluster_maps is True, otherwise a
    list of that mapping's values.

    Raises ApplicationNotFoundError or ApplicationError if running uclust
    fails.
    """

    # Create readable intermediate filenames if they are to be kept
    if output_dir and not output_dir.endswith('/'):
        output_dir += '/'

    if save_uc_files:
        uc_save_filepath = get_output_filepaths(output_dir,
                                                original_fasta_path)
    else:
        uc_save_filepath = None

    # Error check in case any app controller fails
    files_to_remove = []
    try:
        if suppress_sort:
            sorted_fasta_filepath = fasta_filepath
        else:
            # Sort fasta input file from largest to smallest sequence
            sort_fasta = uclust_fasta_sort_from_filepath(
                fasta_filepath, output_filepath=None)

            # Get sorted fasta name from application wrapper
            sorted_fasta_filepath = sort_fasta['Output'].name
            files_to_remove.append(sorted_fasta_filepath)

        # Generate uclust cluster file (.uc format)
        uclust_cluster = uclust_cluster_from_sorted_fasta_filepath(
            sorted_fasta_filepath,
            uc_save_filepath,
            percent_ID=percent_ID,
            max_accepts=max_accepts,
            max_rejects=max_rejects,
            stepwords=stepwords,
            word_length=word_length,
            optimal=optimal,
            exact=exact,
            suppress_sort=suppress_sort,
            enable_rev_strand_matching=enable_rev_strand_matching,
            subject_fasta_filepath=subject_fasta_filepath,
            suppress_new_clusters=suppress_new_clusters,
            stable_sort=stable_sort,
            HALT_EXEC=HALT_EXEC)
    except ApplicationNotFoundError:
        # Handled before ApplicationError: if the not-found error derives
        # from ApplicationError, the broader handler would otherwise shadow
        # this more specific message.
        raise ApplicationNotFoundError(
            'uclust not found, is it properly installed?')
    except ApplicationError:
        raise ApplicationError(
            'Error running uclust. Possible causes are '
            'unsupported version (current supported version is v1.2.22) is installed or '
            'improperly formatted input file was provided')
    finally:
        # the temporary sorted fasta is not needed any more, whatever the
        # outcome was
        remove_files(files_to_remove)

    # Get list of lists for each cluster
    clusters, failures, seeds = \
        clusters_from_uc_file(uclust_cluster['ClusterFile'])

    # Remove temp files unless user specifies output filepath
    if not save_uc_files:
        uclust_cluster.cleanUp()

    if return_cluster_maps:
        return clusters, failures, seeds
    else:
        return list(clusters.values()), failures, seeds
예제 #28
0
    def __call__(self, data=None):
        """Execute the wrapped application on data and return its result.

        This variant places the input argument between the two halves of
        BaseCommand instead of after the whole command.

        data: anything the configured input handler accepts (commonly a
        list, a single string or a number). When data is not None, the
        handler named by self.InputHandler is applied to it and its return
        value is spliced into the command line; None means no input
        argument.

        Returns a CommandLineAppResult holding the opened stdout/stderr
        files (None for a suppressed stream), the exit status and the
        result paths.  Raises AssertionError if HaltExec is set and
        ApplicationError if the exit status is not accepted.
        """
        handler_name = self.InputHandler
        quiet_stdout = self.SuppressStdout
        quiet_stderr = self.SuppressStderr

        # Suppressed streams are thrown away, others go to temporary files.
        stdout_path = ('/dev/null' if quiet_stdout
                       else self.getTmpFilename(self.WorkingDir))
        stderr_path = ('/dev/null' if quiet_stderr
                       else self.getTmpFilename(self.WorkingDir))
        input_arg = '' if data is None else getattr(self, handler_name)(data)

        # Assemble the shell command: first half of the base command, the
        # input, the second half, then the redirections. Empty pieces are
        # left out.
        first, second = self.BaseCommand
        pieces = [first, input_arg, second, '>', stdout_path, '2>',
                  stderr_path]
        command = self._command_delimiter.join(filter(None, pieces))

        if self.HaltExec:
            raise AssertionError("Halted exec with command:\n" + command)

        # system() reports a 16-bit value: the low byte carries the signal
        # number, the high byte the exit status. Shift the signal byte away.
        exit_status = system(command) >> 8

        # Refuse exit statuses the application does not accept
        if not self._accept_exit_status(exit_status):
            raise ApplicationError(
                'Unacceptable application exit status: %s, command: %s'
                % (str(exit_status), command))

        # Hand back open handles for every stream that was captured
        out = None if quiet_stdout else open(stdout_path, "r")
        err = None if quiet_stderr else open(stderr_path, "r")

        result = CommandLineAppResult(
            out, err, exit_status,
            result_paths=self._get_result_paths(data))

        # Delete any input files that were created for this run
        created_inputs = self._input_filename
        if created_inputs:
            for input_path in created_inputs:
                remove(input_path)
            self._input_filename = None

        return result
예제 #29
0
    def _train_with_rdp_files(self,
                              training_seqs_file,
                              taxonomy_file,
                              model_output_dir,
                              remove_tmp=True):
        """Creates a set of training data for the RDP Classifier

            training_seqs_file: A pre-classified set of training
                sequences, in fasta-like format.  Each sequence must
                be labelled with an identifier (no spaces) and an
                assigned lineage (taxa separated by ';'). Example of
                a valid label: ">seq1 ROOT;Ph1;Fam1;G1;"

            taxonomy_file: A File-like object that specifies a
                taxonomic hierarchy. Each line in the file must
                contain a '*'-separated list of the following items:
                Taxon ID, Taxon Name, Parent Taxon ID, Depth, and
                Rank.  IDs should have an integer format.  Example of
                a valid line: "1*Bacteria*0*0*domain"

            model_output_dir: Directory in which to store training data.

            remove_tmp: if True, removes the files returned by the input
                handler for taxonomy_file and training_seqs_file

        Returns a CommandLineAppResult. Raises AssertionError if HaltExec
        is set and ApplicationError if the exit status is not accepted.

        To use the resulting model with the RdpClassifier, set
        '-training_data' to the following path: model_output_dir +
        RdpClassifier.PropertiesFile
        """
        # Three extra pieces of information are required to create
        # training data.  Unless we want built-in support for
        # versioned training sets, these may be set to sensible
        # defaults.
        training_set_id = '1'
        taxonomy_version = 'version1'
        modification_info = 'cogent'

        # The properties file specifies the names of the files in the
        # training directory.  We use the example properties file
        # directly from the rdp_classifier distribution, which lists
        # the default set of files created by the application.  We
        # must write this file explicitly after generating the
        # training data.
        properties = (
            "# Sample ResourceBundle properties file\n"
            "bergeyTree=bergeyTrainingTree.xml\n"
            "probabilityList=genus_wordConditionalProbList.txt\n"
            "probabilityIndex=wordConditionalProbIndexArr.txt\n"
            "wordPrior=logWordPrior.txt\n"
            "classifierVersion=Naive Bayesian rRNA Classifier Version 1.0, November 2003\n"
        )

        input_handler = self.InputHandler
        suppress_stdout = self.SuppressStdout
        suppress_stderr = self.SuppressStderr
        # Both redirect targets are FilePath objects, so stdout is handled
        # exactly like stderr when the command line is joined.
        if suppress_stdout:
            outfile = FilePath('/dev/null')
        else:
            outfile = FilePath(self.getTmpFilename(self.TmpDir))
        if suppress_stderr:
            errfile = FilePath('/dev/null')
        else:
            errfile = FilePath(self.getTmpFilename(self.TmpDir))

        input_handler_function = getattr(self, input_handler)
        taxonomy_filename = input_handler_function(taxonomy_file)
        training_seqs_filename = input_handler_function(training_seqs_file)

        # Build up the command, consisting of a BaseCommand followed
        # by input and output (file) specifications

        # Example from rdp_classifier/sampledata/README:
        # java -Xmx400m -cp rdp_classifier-2.0.jar
        # edu/msu/cme/rdp/classifier/train/ClassifierTraineeMaker
        # mydata/mytaxon.txt mydata/mytrainseq.fasta 1 version1 test
        # mydata
        command = self._commandline_join([
            self.BaseCommand, taxonomy_filename, training_seqs_filename,
            training_set_id, taxonomy_version, modification_info,
            model_output_dir, '>', outfile, '2>', errfile
        ])

        if self.HaltExec:
            raise AssertionError("Halted exec with command:\n" + command)
        # The return value of system is a 16-bit number containing the signal
        # number that killed the process, and then the exit status.
        # We only want to keep the exit status so do a right bitwise shift to
        # get rid of the signal number byte
        exit_status = system(command) >> 8

        # Determine if error should be raised due to exit status of
        # application
        if not self._accept_exit_status(exit_status):
            raise ApplicationError('Unacceptable application exit status: %s, command: %s'\
                % (str(exit_status),command))

        # must write properties file to output directory manually
        properties_fp = path.join(model_output_dir, self.PropertiesFile)
        with open(properties_fp, 'w') as properties_file:
            properties_file.write(properties)

        # open the stdout and stderr if not being suppressed
        out = None
        if not suppress_stdout:
            out = open(outfile, "r")
        err = None
        if not suppress_stderr:
            err = open(errfile, "r")

        result = CommandLineAppResult(
            out,
            err,
            exit_status,
            result_paths=self._get_result_paths(model_output_dir))

        # Clean up the input files
        if remove_tmp:
            remove(taxonomy_filename)
            remove(training_seqs_filename)

        return result
예제 #30
0
    def __call__(self, data=None, remove_tmp=True):
        """Execute the wrapped application on data and return its result.

            data: anything the configured input handler accepts (commonly
                a list, a single string or a number). When data is not
                None, the handler named by self.InputHandler is applied to
                it and its return value becomes the input argument of the
                command line; None means no input argument.

            remove_tmp: if True, the input file recorded in
                self._input_filename (if any) is removed afterwards

        The command writes the assignments to a temporary file that is
        exposed in the result under the key 'Assignments'.

        Returns a CommandLineAppResult. Raises AssertionError if HaltExec
        is set and ApplicationError if the exit status is not accepted.
        """
        handler_name = self.InputHandler
        quiet_stdout = self.SuppressStdout
        quiet_stderr = self.SuppressStderr

        assignment_fp = FilePath(self.getTmpFilename(self.TmpDir))
        # Suppressed streams are thrown away, others go to temporary files.
        stdout_target = '/dev/null' if quiet_stdout \
            else self.getTmpFilename(self.TmpDir)
        outfile = FilePath(stdout_target)
        stderr_target = '/dev/null' if quiet_stderr \
            else self.getTmpFilename(self.TmpDir)
        errfile = FilePath(stderr_target)

        input_arg = '' if data is None else getattr(self, handler_name)(data)

        training_data = self.PositionalParameters['-training-data']

        # Assemble the shell command: base command, input, assignment
        # output, training data, then the stream redirections.
        command_parts = [self.BaseCommand, input_arg, assignment_fp,
                         training_data, '>', outfile, '2>', errfile]
        command = self._commandline_join(command_parts)

        if self.HaltExec:
            raise AssertionError("Halted exec with command:\n" + command)

        # system() reports a 16-bit value: the low byte carries the signal
        # number, the high byte the exit status. Shift the signal byte away.
        exit_status = system(command) >> 8

        # Refuse exit statuses the application does not accept
        if not self._accept_exit_status(exit_status):
            raise ApplicationError(
                'Unacceptable application exit status: %s, command: %s'
                % (str(exit_status), command))

        # Hand back open handles for every stream that was captured
        out = None if quiet_stdout else open(outfile, "r")
        err = None if quiet_stderr else open(errfile, "r")

        result_paths = self._get_result_paths(data)
        result_paths['Assignments'] = ResultPath(assignment_fp)
        result = CommandLineAppResult(out,
                                      err,
                                      exit_status,
                                      result_paths=result_paths)

        # Delete the input file if one was created for this run
        if remove_tmp and self._input_filename:
            remove(self._input_filename)
            self._input_filename = None

        return result