def check_flowgram_ali_exe():
    """Check if we have a working FlowgramAligner.

    The executable name comes from get_flowgram_ali_exe(). It is first
    looked up with which() and then called with ``-h``; the call must exit
    with status 0 and print a help text that starts with "Usage".

    Returns:
        True if all checks pass.

    Raises:
        ApplicationNotFoundError: which() cannot find the executable.
        ApplicationError: the call exits with a non-zero status, or the
            combined stdout/stderr output does not start with "Usage".
    """
    ali_exe = get_flowgram_ali_exe()
    if which(ali_exe) is None:
        raise ApplicationNotFoundError("The alignment program %s is not "
                                       "accessible via the PATH environment "
                                       "variable." % ali_exe)

    # test if its callable and actually works
    command = "%s -h" % ali_exe
    proc = Popen(command, shell=True, universal_newlines=True,
                 stdout=PIPE, stderr=STDOUT)
    # communicate() drains the pipe while waiting for the child. Calling
    # wait() first and reading afterwards can deadlock once the child has
    # filled the OS pipe buffer.
    result, _ = proc.communicate()

    if proc.returncode != 0:
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact an executable." %
            ali_exe)

    # check that the help string looks correct
    if not result.startswith("Usage"):
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact an executable." %
            ali_exe)
    return True
def _handle_app_result_build_failure(self, out, err, exit_status, result_paths):
    """Catch the error when files are not produced.

    Always raises ApplicationError. When the standard error handle can be
    read, its content is included in the message; otherwise a generic
    message is used.

    out, exit_status and result_paths are accepted for interface
    compatibility and are not used here.
    """
    # Only the read is guarded. Previously the detailed ApplicationError was
    # raised inside a try block with a bare except, so it was always
    # swallowed and replaced by the generic message.
    try:
        error_text = err.read()
    except Exception:
        raise ApplicationError('RAxML failed to run properly.')

    raise ApplicationError('RAxML failed to produce an output file due to the following error: \n\n%s '
                           % error_text)
def _set_command_line_parameters(self, data):
    """Get the right setting for each command line parameter.

    data is a dict keyed by parameter name. It is completed and converted
    in place and also returned.

    Raises ApplicationError if a required parameter is missing, or if
    single_pair_only is 1 and pos1/pos2 are not both set.
    """
    # This function could be cleaned up.

    # Every parameter ends up with either the caller's value or the stored
    # default; required parameters have no default to fall back on.
    for name in self._parameter_order:
        if name not in data:
            if name in self._required_parameters:
                raise ApplicationError("Required parameter %s missing." % name)
            data[name] = self._data[name]
        # Write necessary files to disk -- need to modify this so paths
        # to existing files can be passed in.
        if name in self._potential_paths:
            try:
                data[name] = self._input_as_lines(data[name])
            except TypeError:
                # the value is left untouched when it cannot be written out
                pass

    single_pair_requested = data['single_pair_only'] == 1
    if single_pair_requested and not (data['pos1'] and data['pos2']):
        raise ApplicationError(
            "Must specify pos1 and pos2 if single_pair_only == 1.")

    # Make sure the MolType is in the correct format (i.e., 1 or 0)
    mol_type = self._mol_type_lookup[str(data['mol_type']).lower()]
    data['mol_type'] = mol_type
    char_order = self._char_order[mol_type]

    # The remaining defaults depend on the molecule type, which is why they
    # are resolved here and not in the loop above.
    if not data['char_priors']:
        data['char_priors'] = self._default_priors[mol_type]
    formatted_priors = self._input_as_gctmpca_char_priors(
        data['char_priors'], char_order)
    data['char_priors'] = self._input_as_lines(formatted_priors)

    if data['sub_matrix']:
        formatted_matrix = self._input_as_gctmpca_rate_matrix(
            data['sub_matrix'], char_order)
        data['sub_matrix'] = self._input_as_lines(formatted_matrix)
    else:
        data['sub_matrix'] = self._input_as_multiline_string(
            self._default_sub_matrix[mol_type])

    if not data['output_path']:
        data['output_path'] = self._input_as_path(self.getTmpFilename())
    return data
def _get_base_command(self):
    """Return the two command strings that make up the full call.

    The first string changes into self.WorkingDir, runs self._command1 and
    redirects its stderr to a 'ShapesStderr' file under the working
    directory. The second string is self._command2 followed by every
    parameter whose string form is non-empty.

    Raises ApplicationError if self._command1 is None.
    """
    # Prefix the first command with a change into the working directory.
    cd_command = ''.join(['cd ', self.WorkingDir, ';'])
    if self._command1 is None:
        raise ApplicationError('_command has not been set.')

    parameters = self.Parameters
    first_command = self._command1
    second_command = self._command2
    delimiter = self._command_delimiter

    stderr_redirect = ''.join(['2> ', self.WorkingDir, 'ShapesStderr'])
    first_pieces = [cd_command, first_command, stderr_redirect]

    # parameters that are switched off render as '' and are left out
    rendered = [str(value) for value in parameters.values()]
    option_string = delimiter.join([text for text in rendered if text])
    second_pieces = [second_command, option_string]

    return (delimiter.join(first_pieces).strip(),
            delimiter.join(second_pieces).strip())
def _get_result_paths(self, data):
    """Set the result paths.

    Builds the expected '<query>.CPS.CPC' output path from the
    --query_NAST parameter and returns it under the key 'CPS'. When
    --exec_dir is on and the query path does not start with '/', the query
    path is taken relative to the exec dir.

    Raises ApplicationError if the expected output file does not exist.
    data is not used.
    """
    # surrounding double quotes are removed from both parameter values
    query_path = str(self.Parameters['--query_NAST'].Value).strip('"')

    exec_dir_param = self.Parameters['--exec_dir']
    if exec_dir_param.isOn():
        exec_dir = str(exec_dir_param.Value).strip('"')
        # an absolute query path is used unchanged
        if query_path[0] != '/':
            query_path = exec_dir + "/" + query_path

    cps_path = query_path + ".CPS.CPC"
    if not exists(cps_path):
        raise ApplicationError("Calling ChimeraSlayer failed.")

    return {'CPS': ResultPath(Path=cps_path, IsWritten=True)}
def _get_base_command(self):
    """Return the full command string.

    Overridden here because there are positional arguments (specifically
    the input and output files). The result has the form:
    cd <WorkingDir>; <command> <subcommand> <sorted options> <positionals>

    Raises ApplicationError if self._command or self._subcommand is None.
    """
    # Prefix the command with a change into the working directory.
    # WorkingDir should be in quotes -- filenames might contain spaces
    change_dir = 'cd %s;' % str(self.WorkingDir)

    if self._command is None:
        raise ApplicationError('_command has not been set.')
    command = self._command

    # also make sure there's a subcommand!
    if self._subcommand is None:
        raise ApplicationError('_subcommand has not been set.')
    subcommand = self._subcommand

    # Options are written in alphabetical order, which makes testing
    # easier. Parameters whose string form is empty are skipped.
    rendered = [str(option) for option in self.Parameters.values()]
    options = sorted(text for text in rendered if text)

    # looked up as in the original implementation; the value is unused
    self._synonyms

    # Optional positional arguments (such as the mate file for bwa bwasw)
    # may be absent from self._input. The input handler is responsible for
    # making sure all required ones are present.
    positionals = [self._input[key]
                   for key in self._input_order if key in self._input]

    pieces = [change_dir, command, subcommand] + options + positionals
    return self._command_delimiter.join(pieces).strip()
def __call__(self, data=None, remove_tmp=True):
    """Run the application with the specified kwargs on data.

    data: anything that can be cast into a string or written out to a
        file. Usually either a list of things or a single string or
        number. input_handler will be called on this data before it is
        passed as part of the command-line argument, so by creating your
        own input handlers you can customize what kind of data you want
        your application to accept.
    remove_tmp: if True, removes tmp files.

    Returns a CommandLineAppResult built from the captured stdout and
    stderr files, the exit status and self._get_result_paths().

    Raises ApplicationError if self._accept_exit_status() rejects the exit
    status.
    """
    # Process the input data. Input filepath is stored in
    # self._input_filename
    getattr(self, self.InputHandler)(data)

    # 'w+' rather than 'w': the handles are rewound and handed to the
    # result object below, so they have to be readable as well.
    if self.SuppressStdout:
        outfile = None
    else:
        outfile = open(self.getTmpFilename(self.TmpDir), 'w+')
    if self.SuppressStderr:
        errfile = None
    else:
        errfile = open(self.getTmpFilename(self.TmpDir), 'w+')

    try:
        args = [self._command, self._compile_mothur_script()]
        process = Popen(
            args, stdout=outfile, stderr=errfile, cwd=self.WorkingDir)
        exit_status = process.wait()
        if not self._accept_exit_status(exit_status):
            raise ApplicationError(
                'Unacceptable application exit status: %s, command: %s' %
                (exit_status, args))
    except Exception:
        # nobody receives the handles when the run fails, so release them
        for handle in (outfile, errfile):
            if handle is not None:
                handle.close()
        raise

    # rewind so the result object reads the output from the start
    if outfile is not None:
        outfile.seek(0)
    if errfile is not None:
        errfile.seek(0)
    result = CommandLineAppResult(outfile, errfile, exit_status,
                                  result_paths=self._get_result_paths())

    # Clean up the input file if one was created
    if remove_tmp:
        if self._input_filename:
            remove(self._input_filename)
            self._input_filename = None

    return result
def _input_as_parameters(self, data):
    """Set the input paths (a NAST aligned fasta filepath).

    Only checks that every key of data is a parameter that may be passed
    on a per-run basis. No parameter is changed here.

    Returns '' on success; raises ApplicationError listing any unsupported
    keys.
    """
    # The list of values which can be passed on a per-run basis
    allowed_values = ('--query_NAST', '--db_NAST', '--db_FASTA', '-R')

    unsupported_parameters = set(data.keys()).difference(allowed_values)
    if not unsupported_parameters:
        return ''

    raise ApplicationError(
        "Unsupported parameter(s) passed when calling ChimeraSlayer: %s" %
        ' '.join(unsupported_parameters))
def _input_as_dict(self, data):
    """Take a dictionary that sets input and output files.

    Valid keys for the dictionary are specified in the subclasses
    (self._input_order). File paths must be absolute.

    The accepted entries are stored in self._input. Returns ''.

    Raises ApplicationError for a missing required key, an unexpected key,
    a non-absolute path, or when a '-f' parameter exists but is off.
    """
    # start from a clean slate for this run's input and output files
    self._input = {}

    # validate the arguments of the subcommand-specific parameters
    self.check_arguments()

    # Ensure that we have all required input (file I/O).
    # N.B.: optional positional arguments begin with underscore (_)!
    # (e.g., see _mate_in for bwa bwasw)
    for name in self._input_order:
        is_required = name[0] != '_'
        if is_required and name not in data:
            raise ApplicationError("Missing required input %s" % name)

    # Set values for input and output files
    for key in data:
        # check for unexpected keys in the dict
        if key not in self._input_order:
            raise ApplicationError(
                ("Invalid input arguments (%s)\n" % key) +
                ("Valid keys are: %s" % repr(self._input_order)) + '\n')

        value = data[key]
        # check for absolute paths
        # NOTE(review): only the first element of the value is passed to
        # isabs() -- confirm this is intended
        if not isabs(value[0]):
            raise ApplicationError("Only absolute paths allowed.\n%s" %
                                   repr(data))
        self._input[key] = value

    # if there is a -f option to specify an output file, force the user to
    # use it (otherwise things go to stdout)
    if '-f' in self.Parameters and not self.Parameters['-f'].isOn():
        raise ApplicationError("Please specify an output file with -f")

    return ''
def check_arguments(self):
    """Sanity check the arguments passed in.

    Uses the boolean functions specified in the subclasses in the
    _valid_arguments dictionary to determine if an argument is valid or
    invalid. Only parameters that are switched on and have a validator
    are checked.

    Raises ApplicationError for the first invalid value found.
    """
    # items() instead of iteritems(): the latter does not exist on
    # Python 3 dictionaries, while items() works on both major versions.
    for name, parameter in self.Parameters.items():
        if not parameter.isOn():
            continue
        if name not in self._valid_arguments:
            continue
        if not self._valid_arguments[name](parameter.Value):
            error_message = 'Invalid argument (%s) ' % parameter.Value
            error_message += 'for parameter %s\n' % name
            raise ApplicationError(error_message)
def setUp(self):
    """Create the temporary directory and input files used by the tests.

    Sets self.test_out (output directory), self.tmp_seq_filepath (query
    sequences), self.id_to_taxonomy_file and self.reference_seqs_file, and
    records what has to be removed in self.files_to_remove and
    self.dirs_to_remove. A 60 second timeout is started at the end.

    Raises ApplicationError if RDP_JAR_PATH is unset or its basename does
    not contain '2.2'.
    """
    # Check the precondition before anything is written to disk, so that a
    # misconfigured environment leaves no temporary files behind. An unset
    # variable is reported the same way as a wrong version (it used to
    # crash inside basename(None)).
    jar_fp = getenv('RDP_JAR_PATH')
    if jar_fp is None or '2.2' not in basename(jar_fp):
        raise ApplicationError(
            "RDP_JAR_PATH does not point to version 2.2 of the "
            "RDP Classifier.")

    self.files_to_remove = []
    self.dirs_to_remove = []

    tmp_dir = get_qiime_temp_dir()
    self.test_out = get_tmp_filename(
        tmp_dir=tmp_dir,
        prefix='qiime_parallel_taxonomy_assigner_tests_',
        suffix='',
        result_constructor=str)
    self.dirs_to_remove.append(self.test_out)
    create_dir(self.test_out)

    # Temporary input file
    self.tmp_seq_filepath = get_tmp_filename(
        tmp_dir=self.test_out,
        prefix='qiime_parallel_taxonomy_assigner_tests_input',
        suffix='.fasta')
    with open(self.tmp_seq_filepath, 'w') as seq_file:
        seq_file.write(rdp_test_seqs)
    self.files_to_remove.append(self.tmp_seq_filepath)

    # Both files are rewound after writing so they can be read from the
    # start through the same handle.
    self.id_to_taxonomy_file = NamedTemporaryFile(
        prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
        suffix='.txt',
        dir=tmp_dir)
    self.id_to_taxonomy_file.write(rdp_id_to_taxonomy)
    self.id_to_taxonomy_file.seek(0)

    self.reference_seqs_file = NamedTemporaryFile(
        prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
        suffix='.fasta',
        dir=tmp_dir)
    self.reference_seqs_file.write(rdp_reference_seqs)
    self.reference_seqs_file.seek(0)

    initiate_timeout(60)
def _get_jar_fp(self):
    """Return the full path to the JAR file.

    Lookup order:
      1. self._command, if a file of that name exists (e.g. the jar is in
         the current working directory)
      2. the value of the RDP_JAR_PATH environment variable

    Raises an ApplicationError if neither is available.
    """
    local_jar = self._command
    if exists(local_jar):
        return local_jar

    # the user may have specified the location via the environment
    if 'RDP_JAR_PATH' in environ:
        return getenv('RDP_JAR_PATH')

    raise ApplicationError("$RDP_JAR_PATH is not set -- this must be set to use the" +
                           " RDP classifier application controller.")
def _input_as_parameters(self, data):
    """Set the input path (a fasta filepath).

    data maps parameter flags to values for this run. Every allowed
    parameter is first switched off, so settings from previous runs do not
    carry over, and then switched on again with the value from data if one
    was given.

    Returns '' on success; raises ApplicationError listing any unsupported
    keys.
    """
    # The list of values which can be passed on a per-run basis.
    # '-i' used to be spelled 'i' here, which rejected a user-supplied
    # '-i' and made the reset loop look up a parameter named 'i'.
    allowed_values = ['-r', '-t', '-a', '-b', '-l', '-d', '-i', '-o', '-m',
                      '-v', '-f', '-g']

    unsupported_parameters = set(data.keys()) - set(allowed_values)
    if unsupported_parameters:
        raise ApplicationError("Unsupported parameter(s) passed when calling rtax: %s" %
                               ' '.join(unsupported_parameters))

    for v in allowed_values:
        # turn the parameter off so subsequent runs are not
        # affected by parameter settings from previous runs
        self.Parameters[v].off()
        if v in data:
            # turn the parameter on if specified by the user
            self.Parameters[v].on(data[v])

    return ''
def _input_as_parameters(self, data):
    """Set the input path (a fasta filepath).

    data maps parameter flags to values for this run. Each allowed
    parameter is reset and then switched on with the value from data if
    one was given.

    Returns '' on success; raises ApplicationError listing any unsupported
    keys.
    """
    # The list of values which can be passed on a per-run basis
    allowed_values = ('--input', '--uc', '--fastapairs',
                      '--uc2clstr', '--output', '--mergesort')

    unsupported_parameters = set(data.keys()).difference(allowed_values)
    if unsupported_parameters:
        raise ApplicationError("Unsupported parameter(s) passed when calling uclust: %s" %
                               ' '.join(unsupported_parameters))

    for flag in allowed_values:
        parameter = self.Parameters[flag]
        # switch off first so settings from a previous run do not leak
        # into this one
        parameter.off()
        if flag in data:
            # the user supplied a value for this run
            parameter.on(data[flag])

    return ''
def _derive_log_path(self):
    """Guess logfile path produced by Mothur.

    This method checks the working directory for log files generated by
    Mothur. It will raise an ApplicationError if no log file can be found.

    Mothur generates log files named in a nondeterministic way, using the
    current time. We return the log file with the most recent time,
    although this may lead to incorrect log file detection if you are
    running many instances of mothur simultaneously.

    Returns the path of the chosen log file inside self.WorkingDir.
    """
    filenames = listdir(self.WorkingDir)
    lognames = [
        x for x in filenames if re.match(r"^mothur\.\d+\.logfile$", x)]
    if not lognames:
        # The message used to reference two undefined names, which turned
        # this error into a NameError. It now reports the directory that
        # was searched and what was found in it.
        raise ApplicationError(
            'No log file detected in directory %s. Contents: \n\t%s' % (
                self.WorkingDir, '\n\t'.join(filenames)))
    # names are compared as strings; the largest one is taken as the most
    # recent log file
    most_recent_logname = max(lognames)
    return path.join(self.WorkingDir, most_recent_logname)
def assign_dna_reads_to_dna_database(query_fasta_fp, database_fasta_fp,
                                     output_fp, params=None):
    """Assign DNA reads to a database fasta of DNA sequences.

    Wraps assign_reads_to_database with the database type ('-t') and the
    query type ('-q') both fixed to 'dna'. All other parameters keep their
    defaults unless overridden through params.

    query_fasta_fp: absolute path to the query fasta file containing DNA
        sequences.
    database_fasta_fp: absolute path to the database fasta file containing
        DNA sequences.
    output_fp: absolute path where the output file will be generated.
    params: optional. dict containing parameter settings to be used
        instead of default values. Must not contain '-t' or '-q'.

    Returns whatever assign_reads_to_database returns (an open file
    object). The output format defaults to blast9 and should be parsable
    by the PyCogent BLAST parsers.

    Raises ApplicationError if params tries to set '-t' or '-q'.
    """
    user_params = {} if params is None else params

    # the database and query types are fixed by this wrapper
    if '-t' in user_params or '-q' in user_params:
        raise ApplicationError("Cannot change database or query types when " +
                               "using assign_dna_reads_to_dna_database. " +
                               "Use assign_reads_to_database instead.\n")

    # user settings are layered on top of the fixed types
    my_params = {'-t': 'dna', '-q': 'dna'}
    my_params.update(user_params)

    return assign_reads_to_database(query_fasta_fp, database_fasta_fp,
                                    output_fp, my_params)
def _get_base_command(self):
    """Return the base command plus command-line options.

    The result changes into self.WorkingDir and starts java with the JVM
    parameters and the jar reported by self._get_jar_fp(). It does not
    include input file, output file, and training set.

    Raises ApplicationError if self._command is None.
    """
    # Necessary? Preserve for consistency.
    if self._command is None:
        raise ApplicationError('_command has not been set.')

    # Prefix the command with a change into the working directory.
    # WorkingDir should be in quotes -- filenames might contain spaces
    pieces = [''.join(['cd ', str(self.WorkingDir), ';']), "java"]
    pieces.append(self._commandline_join(list(self.JvmParameters.values())))
    pieces.append('-jar "%s"' % self._get_jar_fp())
    return self._commandline_join(pieces)
def _get_base_command(self):
    """Get the command that will be run when the app controller is called.

    Layout: cd <WorkingDir>; <command> <database> <query> <sorted options>
    [<output path>]. Empty pieces are dropped before joining.

    Raises ApplicationError if self._command is None.
    """
    change_dir = ''.join(['cd ', str(self.WorkingDir), ';'])

    if self._command is None:
        raise ApplicationError('_command has not been set.')
    command = self._command

    # options are emitted in sorted order; parameters whose string form is
    # empty are skipped
    rendered = [str(option) for option in self.Parameters.values()]
    options = sorted(text for text in rendered if text)

    # looked up as in the original implementation; the value is unused
    self._synonyms

    # database and query are positional arguments that come before the
    # options
    pieces = [change_dir, command, self._database, self._query]
    pieces.extend(options)
    if self._output:
        # the output path is the trailing positional argument
        pieces.append(self._output.Path)

    non_empty = [piece for piece in pieces if piece]
    return self._command_delimiter.join(non_empty).strip()
def _get_base_command(self):
    """Return the full command as a pair of strings.

    Overrides the default behaviour because of the special circumstance
    surrounding the command line input: the options are rendered in a
    fixed order and returned separately from the command itself.

    Returns (first, second): first changes into self.WorkingDir and names
    the command, second holds the non-empty options in their fixed order.

    Raises ApplicationError if self._command is None.
    """
    # Prefix the command with a change into the working directory.
    cd_command = ''.join(['cd ', self.WorkingDir, ';'])
    if self._command is None:
        raise ApplicationError('_command has not been set.')
    delimiter = self._command_delimiter

    # the options are emitted in exactly this order
    ordered_names = ('-alignment', '-M', '-gap_cost', '-max_structures',
                     '-max_percent_diff', '-bp_window', '-align_window',
                     '-single_bp_inserts')
    ordered_parameters = [self.Parameters[name] for name in ordered_names]
    rendered = [str(parameter) for parameter in ordered_parameters]
    # parameters with an empty string form are left out
    option_string = delimiter.join([text for text in rendered if text])

    first = delimiter.join([cd_command, self._command]).strip()
    second = delimiter.join([option_string]).strip()
    return first, second
def __call__(self, data=None, remove_tmp=True):
    """Run the application and attach the training output files.

    data and remove_tmp are passed unchanged to the parent class's
    __call__:

    data: anything that can be cast into a string or written out to a
        file. Usually either a list of things or a single string or
        number. input_handler will be called on this data before it is
        passed as part of the command-line argument.
    remove_tmp: if True, removes tmp files.

    After the run, each expected training file in self.ModelDir is opened
    and stored in the result under its key ('bergeyTree',
    'probabilityIndex', 'probabilityList', 'wordPrior').

    Raises ApplicationError, including the captured standard error, if
    one of the training files does not exist.
    """
    result = super(RdpClassifier, self).__call__(
        data=data, remove_tmp=remove_tmp)

    training_files = {
        'bergeyTree': 'bergeyTrainingTree.xml',
        'probabilityList': 'genus_wordConditionalProbList.txt',
        'probabilityIndex': 'wordConditionalProbIndexArr.txt',
        'wordPrior': 'logWordPrior.txt',
    }

    # files are processed in key order
    for key in sorted(training_files):
        training_fp = os.path.join(self.ModelDir, training_files[key])

        if os.path.exists(training_fp):
            # Not in try/except clause because we already know the
            # file exists. Failure would be truly exceptional, and we
            # want to maintain the original exception in that case.
            result[key] = open(training_fp)
            continue

        exception_msg = (
            "Training output file %s not found.  This may "
            "happen if an error occurred during the RDP training "
            "process.  More details may be available in the "
            "standard error, printed below.\n\n" % training_fp
        )
        stderr_msg = result["StdErr"].read()
        # rewind so the standard error stays readable from the start
        result["StdErr"].seek(0)
        raise ApplicationError(exception_msg + stderr_msg)

    return result
def filter_with_flowgram(id, flowgram, flowgrams, header, ids, num_flows,
                         bestscores, log_fh, outdir="/tmp/", threshold=3.75,
                         num_cpus=32, fast_method=True, on_cluster=False,
                         mapping=None, spread=[], verbose=False,
                         pair_id_thresh=0.97, client_sockets=[],
                         error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat'):
    """Filter all files in flows_filename with flowgram and split according
    to threshold.

    id: The flowgram identifier of the master flowgram of this round

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterator containing the flowgrams to be filtered

    header: a valid sff.txt header

    ids: this list marks the active flowgrams, i.e. flowgrams that are
         unclustered

    num_flows: Number of flows remaining in the current round

    bestscores: dictionary that stores for each unclustered flowgram the
                best score it has to to one of the centroids previously
                seen and the id of the centroid. Used in the second
                denoising phase.

    log_fh: open log file handle, written to if verbose is set

    outdir: directory where intermediate and result files go

    threshold: Filtering threshold

    num_cpus: number of cpus to run on, if on_cluster == True

    fast_method: Boolean value for fast denoising with lots of memory

    on_cluster: Boolean flag for local vs cluster

    mapping: the current cluster mapping

    spread: worker processing throughput

    pair_id_thresh: a flowgram is also clustered when its pairwise
                    identity is at least this value

    client_sockets: passed through to the cluster distance routine

    error_profile: Path to error profile *.dat file

    Implementation detail:
    The iterator behind 'flowgrams' is big and thus we want to keep its
    traversals at a minimum. The naive implementation of this filter
    function would traverse the iterator once to create the input file for
    the alignment routine, then a second time to do the actual filtering.
    To get rid of the second run through the iterator, we keep a list (in
    fact a dict) of active 'ids' and do the filtering only in the next
    round. A cleaner but still fast solution would be great, as this
    definitly poses a pitfall for future modifications.

    ids, bestscores and mapping are modified in place.

    Returns a tuple (flowgrams, n): the flowgrams object handed back by the
    distance routine and the number of flowgrams that are still
    unclustered.

    Raises ApplicationError if the bookkeeping in ids and bestscores does
    not match the number of unclustered flowgrams.

    NOTE(review): spread and client_sockets have mutable default values.
    This function only passes them on; confirm the callees do not modify
    them.
    """
    if verbose:
        log_fh.write("Filtering with %s: %d flowgrams\n" % (id, num_flows))

    # set up the flowgram storage
    if not fast_method:
        fc = FlowgramContainerFile(header, outdir)
    else:
        fc = FlowgramContainerArray()

    # calculate distance scores
    if on_cluster:
        (scores, names, flowgrams) = \
            get_flowgram_distances_on_cluster(
                id, flowgram, flowgrams, fc, ids, num_cpus,
                num_flows, spread=spread, client_sockets=client_sockets)
    else:
        (scores, names, flowgrams) = \
            get_flowgram_distances(
                id, flowgram, flowgrams, fc, ids, outdir=outdir,
                error_profile=error_profile)

    # shortcut for non-matching flowgrams
    # any() instead of len(filter(...)): filter() returns an iterator on
    # Python 3, which has no len().
    has_survivor = any(
        pair[0] < threshold or pair[1] >= pair_id_thresh for pair in scores)
    if not has_survivor:
        # put it in its own cluster
        # and remove it from any further searches
        if id in bestscores:
            del bestscores[id]
        del ids[id]
        return (flowgrams, num_flows - 1)

    # Do the filtering
    non_clustered_ctr = 0
    for ((score, pair_id), name) in zip(scores, names):
        if (score < threshold or name == id or pair_id >= pair_id_thresh):
            # make sure the original flowgram gets into this cluster
            del ids[name]
            if name in bestscores:
                del bestscores[name]
            if id != name:
                # update the mapping information
                mapping[id].extend(mapping[name])
                mapping[id].append(name)
                # delete the old cluster from the mapping
                del mapping[name]
        else:
            non_clustered_ctr += 1
            # keep track of the best match of this guy to any centroid
            if (name not in bestscores or score < bestscores[name][1]):
                bestscores[name] = (id, score)

    # Some extra safety that we are not missing anything
    if (len(ids) != non_clustered_ctr
            or len(bestscores) != non_clustered_ctr):
        raise ApplicationError("filterWithFlowgram failed")

    return (flowgrams, non_clustered_ctr)
def assign_reads_to_database(query, database_fasta, out_path, params=None):
    """Assign a set of query sequences to a reference database.

    query: absolute file path to query sequences
    database_fasta: absolute file path to the reference database
    out_path: absolute file path of the file to be output
    params: dict of BWA specific parameters.
            * Specify which algorithm to use (bwa-short or bwasw) using the
            dict key "algorithm"
            * if algorithm is bwasw, specify params for the bwa bwasw
            subcommand
            * if algorithm is bwa-short, specify params for the bwa samse
            subcommand
            * if algorithm is bwa-short, must also specify params to use
            with bwa aln, which is used to get the sai file necessary to
            run samse. bwa aln params should be passed in using dict key
            "aln_params" and the associated value should be a dict of
            params for the bwa aln subcommand
            * if a temporary directory is not specified in params using
            dict key "temp_dir", it will be assumed to be /tmp

    The params dict passed in is not modified.

    This method returns an open file object (SAM format).

    Raises ApplicationError if the algorithm is missing or unknown, or if
    "aln_params" is missing for bwa-short.
    """
    # Work on a copy: the output path and temp dir are added below and
    # must not leak into the caller's dict.
    params = {} if params is None else dict(params)

    # set the output path
    params['-f'] = out_path

    # if the algorithm is not specified in the params dict, or the
    # algorithm is not recognized, raise an exception
    if 'algorithm' not in params:
        raise ApplicationError("Must specify which algorithm to use " +
                               "('bwa-short' or 'bwasw')")
    elif params['algorithm'] not in ('bwa-short', 'bwasw'):
        raise ApplicationError('Unknown algorithm "%s". ' %
                               params['algorithm'] +
                               "Please enter either 'bwa-short' or 'bwasw'.")

    # if the temp directory is not specified, assume /tmp
    params.setdefault('temp_dir', '/tmp')

    # if the algorithm is bwa-short, we must use bwa aln to get an sai
    # file before calling bwa samse on that sai file, so we need to know
    # how to run bwa aln. Therefore, we must ensure there's an entry
    # containing those parameters
    if params['algorithm'] == 'bwa-short':
        if 'aln_params' not in params:
            raise ApplicationError("With bwa-short, need to specify a key " +
                                   "'aln_params' and its value, a " +
                                   "dictionary to pass to bwa aln, since " +
                                   "bwa aln is an intermediate step when " +
                                   "doing bwa-short.")

    # "algorithm", "temp_dir" and "aln_params" are not meant for any of
    # the subcommands, so pass on everything except these.
    # (items() works on Python 2 and 3; iteritems() is Python 2 only.)
    subcommand_params = dict(
        (k, v) for k, v in params.items()
        if k not in ('algorithm', 'temp_dir', 'aln_params'))

    # build index from database_fasta
    # get a temporary file name that is not in use
    index_prefix = get_tmp_filename(tmp_dir=params['temp_dir'], suffix='',
                                    result_constructor=str)

    create_bwa_index_from_fasta_file(database_fasta, {'-p': index_prefix})

    # if the algorithm is bwasw, things are pretty simple. Just
    # instantiate the proper controller and set the files
    if params['algorithm'] == 'bwasw':
        bwa = BWA_bwasw(params=subcommand_params)
        files = {'prefix': index_prefix, 'query_fasta': query}

    # if the algorithm is bwa-short, it's not so simple
    elif params['algorithm'] == 'bwa-short':
        # we have to call bwa_aln to get the sai file needed for samse
        # use the aln_params we ensured we had above
        bwa_aln = BWA_aln(params=params['aln_params'])
        aln_files = {'prefix': index_prefix, 'fastq_in': query}
        # get the path to the sai file
        sai_file_path = bwa_aln(aln_files)['output'].name
        # we will use that sai file to run samse
        bwa = BWA_samse(params=subcommand_params)
        files = {
            'prefix': index_prefix,
            'sai_in': sai_file_path,
            'fastq_in': query
        }

    # run which ever app controller we decided was correct on the files
    # we set up
    result = bwa(files)

    # they both return a SAM file, so return that
    return result['output']
def _handle_app_result_build_failure(self, out, err, exit_status, result_paths):
    """Catch the error when files are not produced.

    Always raises ApplicationError; the message contains whatever can be
    read from err. out, exit_status and result_paths are not used.
    """
    message = ('ParsInsert failed to produce an output file due to the following error: \n\n%s '
               % err.read())
    raise ApplicationError(message)
def assign_dna_reads_to_protein_database(query_fasta_fp, database_fasta_fp,
                                         output_fp, temp_dir="/tmp",
                                         params=None):
    """Assign DNA reads to a database fasta of protein sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed. A temporary
    file must be written containing the translated sequences from the
    input query fasta file because BLAT cannot do this automatically.

    query_fasta_fp: absolute path to the query fasta file containing DNA
        sequences.
    database_fasta_fp: absolute path to the database fasta file containing
        protein sequences.
    output_fp: absolute path where the output file will be generated.
    temp_dir: optional. Change the location where the translated sequences
        will be written before being used as the query. Defaults to /tmp.
    params: optional. dict containing parameter settings to be used
        instead of default values. Cannot change database or query file
        types from protein and dna, respectively. An optional
        'genetic_code' entry selects the genetic code used for the
        translation (default 1). The dict is not modified.

    This method returns an open file object. The output format defaults to
    blast9 and should be parsable by the PyCogent BLAST parsers.

    Raises ApplicationError if temp_dir is not absolute or if params tries
    to set '-t' or '-q'.
    """
    # Work on a copy so removing 'genetic_code' below does not alter the
    # caller's dict.
    params = {} if params is None else dict(params)

    # the query is translated before the search, so both sides are protein
    my_params = {'-t': 'prot', '-q': 'prot'}

    # make sure temp_dir specifies an absolute path
    if not isabs(temp_dir):
        raise ApplicationError("temp_dir must be an absolute path.")

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError(
            "Cannot change database or query types "
            "when using assign_dna_reads_to_protein_database. Use "
            "assign_reads_to_database instead.")

    # 'genetic_code' is consumed here and not passed on to the search
    my_genetic_code = GeneticCodes[params.pop('genetic_code', 1)]

    my_params.update(params)

    # get six-frame translation of the input DNA sequences and write them
    # to temporary file.
    frames = [1, 2, 3, -1, -2, -3]
    tmp = get_tmp_filename(tmp_dir=temp_dir, result_constructor=str)
    tmp_out = open(tmp, 'w')
    try:
        with tmp_out:
            with open(query_fasta_fp) as query_file:
                for label, sequence in MinimalFastaParser(query_file):
                    seq_id = label.split()[0]

                    s = DNA.makeSequence(sequence)
                    translations = my_genetic_code.sixframes(s)
                    # entries are written in ascending frame order
                    for frame, translation in sorted(
                            zip(frames, translations)):
                        entry = '>{seq_id}_frame_{frame}\n{trans}\n'
                        entry = entry.format(seq_id=seq_id, frame=frame,
                                             trans=translation)
                        tmp_out.write(entry)

        result = assign_reads_to_database(tmp, database_fasta_fp, output_fp,
                                          params=my_params)
    finally:
        # the translated query is removed even when something above fails
        remove(tmp)

    return result
def denoise_seqs(sff_fps, fasta_fp, tmpoutdir, preprocess_fp=None,
                 cluster=False, num_cpus=1, squeeze=True, percent_id=0.97,
                 bail=1, primer="", low_cutoff=3.75, high_cutoff=4.5,
                 log_fp="denoiser.log", low_memory=False, verbose=False,
                 error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                 max_num_rounds=None, titanium=False, checkpoint_fp=None):
    """The main routine to denoise flowgrams.

    Runs three phases:
      I.   preprocessing of the sff input, or loading of already
           preprocessed data from preprocess_fp
      II.  greedy clustering with the low cut-off (low_cutoff)
      III. secondary clustering with the high cut-off (high_cutoff)
    and finally stores the clusters and the mapping in tmpoutdir.

    If verbose is set, a log is written to log_fp inside tmpoutdir. If
    titanium is set, the error profile and both cut-offs are overridden.
    Resuming from checkpoint_fp requires preprocess_fp.

    Raises ApplicationError if checkpoint_fp is given without
    preprocess_fp. check_flowgram_ali_exe() may raise if the aligner
    binary is missing.
    """
    # abort if binary is missing
    check_flowgram_ali_exe()

    if verbose:
        # switch of buffering for log file
        # NOTE(review): buffering=0 together with a text mode is only
        # accepted by Python 2.
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)
    else:
        log_fh = None

    # The try/finally makes sure the log file handle is closed on every
    # exit path; it used to be left open.
    try:
        # overwrite settings if titanium is set
        # This flag is only used from qiime. Remove after qiime integration
        if titanium:
            error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
            low_cutoff = 4
            high_cutoff = 5

        if verbose:
            log_fh.write("Denoiser version: %s\n" % __version__)
            log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
            log_fh.write("Fasta file: %s\n" % fasta_fp)
            log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
            if checkpoint_fp:
                log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
            log_fh.write("Primer sequence: %s\n" % primer)
            log_fh.write("Running on cluster: %s\n" % cluster)
            log_fh.write("Num CPUs: %d\n" % num_cpus)
            log_fh.write("Squeeze Seqs: %s\n" % squeeze)
            log_fh.write("tmpdir: %s\n" % tmpoutdir)
            log_fh.write("percent_id threshold: %.2f\n" % percent_id)
            log_fh.write(
                "Minimal sequence coverage for first phase: %d\n" % bail)
            log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
            log_fh.write("High cut-off: %.2f\n" % high_cutoff)
            log_fh.write("Error profile: %s\n" % error_profile)
            log_fh.write("Maximal number of iteration: %s\n\n" %
                         max_num_rounds)

        # here we go ...
        # Phase I - clean up and truncate input sff
        if checkpoint_fp:
            if preprocess_fp:
                # skip preprocessing as we should have data
                # we already have preprocessed data, so use it
                (deprefixed_sff_fp, l, mapping, seqs) = \
                    read_preprocessed_data(preprocess_fp)
            else:
                raise ApplicationError(
                    "Resuming from checkpoint requires --preprocess option")
        else:
            if preprocess_fp:
                # we already have preprocessed data, so use it
                (deprefixed_sff_fp, l, mapping, seqs) = \
                    read_preprocessed_data(preprocess_fp)
            elif cluster:
                preprocess_on_cluster(sff_fps, log_fp, fasta_fp=fasta_fp,
                                      out_fp=tmpoutdir, verbose=verbose,
                                      squeeze=squeeze, primer=primer)
                (deprefixed_sff_fp, l, mapping, seqs) = \
                    read_preprocessed_data(tmpoutdir)
            else:
                (deprefixed_sff_fp, l, mapping, seqs) = \
                    preprocess(
                        sff_fps, log_fh, fasta_fp=fasta_fp, out_fp=tmpoutdir,
                        verbose=verbose, squeeze=squeeze, primer=primer)

        # preprocessor writes into same file, so better jump to end of file
        if verbose:
            log_fh.close()
            log_fh = open(tmpoutdir + "/" + log_fp, "a", 0)

        # phase II:
        # use prefix map based clustering as initial centroids and greedily
        # add flowgrams to clusters with a low threshold
        (new_sff_file, bestscores, mapping) = \
            greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir, l,
                              log_fh, num_cpus=num_cpus, on_cluster=cluster,
                              bail_out=bail, pair_id_thresh=percent_id,
                              threshold=low_cutoff, verbose=verbose,
                              fast_method=not low_memory,
                              error_profile=error_profile,
                              max_num_rounds=max_num_rounds,
                              checkpoint_fp=checkpoint_fp)

        # phase III:
        # Assign seqs to nearest existing centroid with high threshold
        secondary_clustering(new_sff_file, mapping, bestscores, log_fh,
                             verbose=verbose, threshold=high_cutoff)
        remove(new_sff_file)
        if verbose:
            log_fh.write("Finished clustering\n")
            log_fh.write("Writing Clusters\n")
            log_fh.write(make_stats(mapping) + "\n")
        store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
        store_mapping(mapping, tmpoutdir, "denoiser")
    finally:
        # closing a handle that is already closed is harmless
        if log_fh is not None:
            log_fh.close()
def _input_as_list(self, data):
    """Take the positional arguments as input in a list.

    data: [query_file_path, database_file_path, output_file_path]; all
        three paths must be absolute.

    Stores the paths on the instance (self._query, self._database,
    self._output) and validates every switched-on parameter that only
    accepts a fixed set of values.

    Returns an empty string, as nothing has to be added to the command
    line by this handler.

    Raises ApplicationError if a path is not absolute or if a parameter
    holds a value outside its allowed set.
    """
    query, database, output = data
    if (not isabs(database)) \
            or (not isabs(query)) \
            or (not isabs(output)):
        raise ApplicationError("Only absolute paths allowed.\n%s" %
                               ', '.join(data))

    self._database = FilePath(database)
    self._query = FilePath(query)
    self._output = ResultPath(output, IsWritten=True)

    # check parameters that can only take a particular set of values
    # check combination of database and query type
    db_param = self.Parameters['-t']
    query_param = self.Parameters['-q']
    if db_param.isOn() and query_param.isOn() and \
            (db_param.Value, query_param.Value) not in \
            self._valid_combinations:
        error_message = "Invalid combination of database and query " + \
            "types ('%s', '%s').\n" % (db_param.Value, query_param.Value)
        error_message += "Must be one of: %s\n" % \
            repr(self._valid_combinations)
        raise ApplicationError(error_message)

    # One row per single-valued check, evaluated in this order:
    # (flag, attribute holding the allowed values,
    #  message for the offending value, message listing allowed values).
    # The attribute is looked up by name so that it is only touched when
    # the corresponding parameter is actually switched on.
    value_checks = (
        ('-t', '_database_types',
         "Invalid database type %s\n", "Allowed values: %s\n"),
        ('-q', '_query_types',
         "Invalid query type %s\n", "Allowed values: %s\n"),
        ('-mask', '_mask_types',
         "Invalid mask type %s\n", "Allowed Values: %s\n"),
        ('-qMask', '_mask_types',
         "Invalid qMask type %s\n", "Allowed values: %s\n"),
        ('-repeats', '_mask_types',
         "Invalid repeat type %s\n", "Allowed values: %s\n"),
        ('-out', '_out_types',
         "Invalid output type %s\n", "Allowed values: %s\n"),
    )
    for flag, allowed_attr, invalid_fmt, allowed_fmt in value_checks:
        param = self.Parameters[flag]
        if not param.isOn():
            continue
        allowed = getattr(self, allowed_attr)
        if param.Value not in allowed:
            # Always report the parameter's value (not the parameter
            # object) and look it up under the same flag that was checked.
            error_message = invalid_fmt % param.Value
            error_message += allowed_fmt % ', '.join(allowed)
            raise ApplicationError(error_message)

    return ''
def get_clusters_from_fasta_filepath(
        fasta_filepath, original_fasta_path, percent_ID=0.97, max_accepts=1,
        max_rejects=8, stepwords=8, word_length=8, optimal=False,
        exact=False, suppress_sort=False, output_dir=None,
        enable_rev_strand_matching=False, subject_fasta_filepath=None,
        suppress_new_clusters=False, return_cluster_maps=False,
        stable_sort=False, save_uc_files=True, HALT_EXEC=False):
    """Convenience wrapper: sort a fasta file, cluster it with uclust and
    parse the resulting .uc file.

    Unless suppress_sort is set, fasta_filepath is first sorted with
    uclust_fasta_sort_from_filepath; the sorted (or original) file is then
    clustered with uclust_cluster_from_sorted_fasta_filepath and the
    cluster file is parsed with clusters_from_uc_file.

    When save_uc_files is True the .uc path is derived from output_dir and
    original_fasta_path via get_output_filepaths; otherwise no path is
    passed on and the clustering result is cleaned up before returning.
    The temporary sorted fasta file is always removed.

    percent_ID and the remaining keyword arguments are handed to the
    clustering call unchanged.

    Returns (clusters, failures, seeds). clusters is the mapping produced
    by clusters_from_uc_file if return_cluster_maps is True, otherwise a
    list of its values.

    Raises ApplicationError or ApplicationNotFoundError (with a
    uclust-specific message) when one of the app controllers fails.
    """
    if output_dir and not output_dir.endswith('/'):
        output_dir += '/'

    # Only name a persistent .uc file when the caller wants to keep it
    uc_save_filepath = None
    if save_uc_files:
        uc_save_filepath = get_output_filepaths(output_dir,
                                                original_fasta_path)

    # Temporary files that must go away whether or not uclust succeeds
    temp_files = []
    try:
        if suppress_sort:
            sorted_fasta_filepath = fasta_filepath
        else:
            # Sort from largest to smallest sequence; no explicit output
            # path is given, the wrapper reports the name it used
            sorted_result = uclust_fasta_sort_from_filepath(
                fasta_filepath, output_filepath=None)
            sorted_fasta_filepath = sorted_result['Output'].name
            temp_files.append(sorted_fasta_filepath)

        # Generate uclust cluster file (.uc format)
        uclust_cluster = uclust_cluster_from_sorted_fasta_filepath(
            sorted_fasta_filepath,
            uc_save_filepath,
            percent_ID=percent_ID,
            max_accepts=max_accepts,
            max_rejects=max_rejects,
            stepwords=stepwords,
            word_length=word_length,
            optimal=optimal,
            exact=exact,
            suppress_sort=suppress_sort,
            enable_rev_strand_matching=enable_rev_strand_matching,
            subject_fasta_filepath=subject_fasta_filepath,
            suppress_new_clusters=suppress_new_clusters,
            stable_sort=stable_sort,
            HALT_EXEC=HALT_EXEC)

        remove_files(temp_files)
    except ApplicationError:
        remove_files(temp_files)
        raise ApplicationError(
            'Error running uclust. Possible causes are '
            'unsupported version (current supported version is v1.2.22) is installed or '
            'improperly formatted input file was provided')
    except ApplicationNotFoundError:
        remove_files(temp_files)
        raise ApplicationNotFoundError(
            'uclust not found, is it properly installed?')

    # Parse the cluster file into clusters, failures and seeds
    clusters, failures, seeds = \
        clusters_from_uc_file(uclust_cluster['ClusterFile'])

    # Remove temp files unless the caller asked to keep the .uc file
    if not save_uc_files:
        uclust_cluster.cleanUp()

    if not return_cluster_maps:
        clusters = list(clusters.values())
    return clusters, failures, seeds
def __call__(self, data=None):
    """Run the application on data and return a CommandLineAppResult.

    Overrides the __call__ function in util.py because of the special
    circumstance surrounding the command line input: BaseCommand is a
    pair and the input argument is placed between its two parts.

    data: anything that can be cast into a string or written out to a
        file; usually a list of things or a single string or number.
        The handler named by self.InputHandler is applied to data and
        its result becomes part of the command line. With data=None no
        input argument is added.

    Raises AssertionError (carrying the command) if self.HaltExec is set
    and ApplicationError if the exit status is not accepted.
    """
    handler_name = self.InputHandler
    suppress_stdout = self.SuppressStdout
    suppress_stderr = self.SuppressStderr

    # Suppressed streams are discarded, the others are captured in
    # temporary files inside the working directory
    outfile = '/dev/null' if suppress_stdout \
        else self.getTmpFilename(self.WorkingDir)
    errfile = '/dev/null' if suppress_stderr \
        else self.getTmpFilename(self.WorkingDir)

    input_arg = '' if data is None else getattr(self, handler_name)(data)

    # Command layout: first half of BaseCommand, the input, second half
    # of BaseCommand, then the redirections; empty pieces are dropped
    first, second = self.BaseCommand
    pieces = [first, input_arg, second, '>', outfile, '2>', errfile]
    command = self._command_delimiter.join(
        [piece for piece in pieces if piece])

    if self.HaltExec:
        raise AssertionError("Halted exec with command:\n" + command)

    # system() returns a 16-bit number: the signal number that killed the
    # process followed by the exit status. Shifting right by one byte
    # keeps only the exit status.
    exit_status = system(command) >> 8

    # Determine if an error should be raised due to the exit status
    if not self._accept_exit_status(exit_status):
        raise ApplicationError('Unacceptable application exit status: %s, command: %s'
                               % (str(exit_status), command))

    # Hand back open handles only for the streams that were captured
    out = None if suppress_stdout else open(outfile, "r")
    err = None if suppress_stderr else open(errfile, "r")

    result = CommandLineAppResult(
        out, err, exit_status, result_paths=self._get_result_paths(data))

    # Clean up the input files if any were created
    if self._input_filename:
        for input_fp in self._input_filename:
            remove(input_fp)
        self._input_filename = None

    return result
def _train_with_rdp_files(self, training_seqs_file, taxonomy_file,
                          model_output_dir, remove_tmp=True):
    """Create a set of training data for the RDP Classifier.

    training_seqs_file: A pre-classified set of training sequences, in
        fasta-like format. Each sequence must be labelled with an
        identifier (no spaces) and an assigned lineage (taxa separated
        by ';'). Example of a valid label: ">seq1 ROOT;Ph1;Fam1;G1;"

    taxonomy_file: A file-like object that specifies a taxonomic
        hierarchy. Each line in the file must contain a '*'-separated
        list of the following items: Taxon ID, Taxon Name, Parent Taxon
        ID, Depth, and Rank. IDs should have an integer format. Example
        of a valid line: "1*Bacteria*0*0*domain"

    model_output_dir: Directory in which to store training data.

    remove_tmp: if True, removes the temporary input files.

    Returns a CommandLineAppResult for the training run.

    Raises AssertionError (carrying the command) if self.HaltExec is set
    and ApplicationError if the exit status is not accepted.

    To use the resulting model with the RdpClassifier, set
    '-training_data' to the following path: model_output_dir +
    RdpClassifier.PropertiesFile
    """
    # Three extra pieces of information are required to create
    # training data. Unless we want built-in support for versioned
    # training sets, these may be set to sensible defaults.
    training_set_id = '1'
    taxonomy_version = 'version1'
    modification_info = 'cogent'

    # The properties file specifies the names of the files in the
    # training directory. We use the example properties file directly
    # from the rdp_classifier distribution, which lists the default set
    # of files created by the application. We must write this file
    # explicitly after generating the training data.
    properties = (
        "# Sample ResourceBundle properties file\n"
        "bergeyTree=bergeyTrainingTree.xml\n"
        "probabilityList=genus_wordConditionalProbList.txt\n"
        "probabilityIndex=wordConditionalProbIndexArr.txt\n"
        "wordPrior=logWordPrior.txt\n"
        "classifierVersion=Naive Bayesian rRNA Classifier Version 1.0, November 2003\n"
    )

    input_handler = self.InputHandler
    suppress_stdout = self.SuppressStdout
    suppress_stderr = self.SuppressStderr

    if suppress_stdout:
        outfile = FilePath('/dev/null')
    else:
        # Wrapped in FilePath like errfile below, so both redirection
        # targets are handled the same way when the command is joined
        outfile = FilePath(self.getTmpFilename(self.TmpDir))
    if suppress_stderr:
        errfile = FilePath('/dev/null')
    else:
        errfile = FilePath(self.getTmpFilename(self.TmpDir))

    input_handler_function = getattr(self, input_handler)
    taxonomy_filename = input_handler_function(taxonomy_file)
    training_seqs_filename = input_handler_function(training_seqs_file)

    # Build up the command, consisting of a BaseCommand followed
    # by input and output (file) specifications
    # Example from rdp_classifier/sampledata/README:
    # java -Xmx400m -cp rdp_classifier-2.0.jar
    # edu/msu/cme/rdp/classifier/train/ClassifierTraineeMaker
    # mydata/mytaxon.txt mydata/mytrainseq.fasta 1 version1 test
    # mydata
    command = self._commandline_join([
        self.BaseCommand,
        taxonomy_filename,
        training_seqs_filename,
        training_set_id,
        taxonomy_version,
        modification_info,
        model_output_dir,
        '>', outfile,
        '2>', errfile,
    ])

    if self.HaltExec:
        raise AssertionError("Halted exec with command:\n" + command)

    # system() returns a 16-bit number: the signal number that killed the
    # process followed by the exit status. Shifting right by one byte
    # keeps only the exit status.
    exit_status = system(command) >> 8

    # Determine if an error should be raised due to the exit status
    if not self._accept_exit_status(exit_status):
        raise ApplicationError('Unacceptable application exit status: %s, command: %s'
                               % (str(exit_status), command))

    # must write properties file to output directory manually; the
    # context manager closes the handle even if the write fails
    properties_fp = path.join(model_output_dir, self.PropertiesFile)
    with open(properties_fp, 'w') as properties_file:
        properties_file.write(properties)

    # open the stdout and stderr if not being suppressed
    out = None
    if not suppress_stdout:
        out = open(outfile, "r")
    err = None
    if not suppress_stderr:
        err = open(errfile, "r")

    result = CommandLineAppResult(
        out, err, exit_status,
        result_paths=self._get_result_paths(model_output_dir))

    # Clean up the input files
    if remove_tmp:
        remove(taxonomy_filename)
        remove(training_seqs_filename)

    return result
def __call__(self, data=None, remove_tmp=True):
    """Run the application on data and return a CommandLineAppResult.

    data: anything that can be cast into a string or written out to a
        file; usually a list of things or a single string or number.
        The handler named by self.InputHandler is applied to data and
        its result becomes part of the command line. With data=None no
        input argument is added.

    remove_tmp: if True, removes the temporary input file.

    The result carries an additional 'Assignments' path pointing at the
    temporary assignment file named on the command line.

    Raises AssertionError (carrying the command) if self.HaltExec is set
    and ApplicationError if the exit status is not accepted.
    """
    handler_name = self.InputHandler
    suppress_stdout = self.SuppressStdout
    suppress_stderr = self.SuppressStderr

    # Temporary file named on the command line right after the input
    assignment_fp = FilePath(self.getTmpFilename(self.TmpDir))

    # Suppressed streams are discarded, the others are captured in
    # temporary files
    outfile = FilePath(
        '/dev/null' if suppress_stdout
        else self.getTmpFilename(self.TmpDir))
    errfile = FilePath(
        '/dev/null' if suppress_stderr
        else self.getTmpFilename(self.TmpDir))

    input_arg = '' if data is None else getattr(self, handler_name)(data)

    training_data = self.PositionalParameters['-training-data']

    # Build up the command, consisting of a BaseCommand followed by
    # input and output (file) specifications
    command_parts = [
        self.BaseCommand,
        input_arg,
        assignment_fp,
        training_data,
        '>', outfile,
        '2>', errfile,
    ]
    command = self._commandline_join(command_parts)

    if self.HaltExec:
        raise AssertionError("Halted exec with command:\n" + command)

    # system() returns a 16-bit number: the signal number that killed the
    # process followed by the exit status. Shifting right by one byte
    # keeps only the exit status.
    exit_status = system(command) >> 8

    # Determine if an error should be raised due to the exit status
    if not self._accept_exit_status(exit_status):
        raise ApplicationError('Unacceptable application exit status: %s, command: %s'
                               % (str(exit_status), command))

    # Hand back open handles only for the streams that were captured
    out = None if suppress_stdout else open(outfile, "r")
    err = None if suppress_stderr else open(errfile, "r")

    result_paths = self._get_result_paths(data)
    result_paths['Assignments'] = ResultPath(assignment_fp)
    result = CommandLineAppResult(
        out, err, exit_status, result_paths=result_paths)

    # Clean up the input file if one was created
    if remove_tmp and self._input_filename:
        remove(self._input_filename)
        self._input_filename = None

    return result