def check_flowgram_ali_exe():
    """Check if we have a working FlowgramAligner"""

    ali_exe = get_flowgram_ali_exe()

    if which(ali_exe) is None:
        raise ApplicationNotFoundError("The alignment program %s is not "
                                       "accessible via the PATH environment "
                                       "variable." % ali_exe)

    # test if it's callable and actually works
    command = "%s -h" % ali_exe
    proc = Popen(command, shell=True, universal_newlines=True,
                 stdout=PIPE, stderr=STDOUT)

    if (proc.wait() != 0):
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact an executable."
            % ali_exe)

    result = proc.stdout.read()
    # check that the help string looks correct
    if (not result.startswith("Usage")):
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact an executable."
            % ali_exe)

    return True

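# Illustrative usage of the check above (a minimal sketch; assumes the
# ApplicationNotFoundError/ApplicationError exceptions are importable from the
# surrounding package and that `sys` has been imported):
#
#     import sys
#     try:
#         check_flowgram_ali_exe()
#     except (ApplicationNotFoundError, ApplicationError) as e:
#         sys.exit("FlowgramAligner check failed: %s" % e)
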
def _get_result_paths(self, data):
    """ Set the result paths """

    result = {}

    inp_file_name = str(self.Parameters['--query_NAST'].Value)
    inp_file_name = inp_file_name.rstrip('"')
    inp_file_name = inp_file_name.lstrip('"')

    exec_dir = self.Parameters['--exec_dir']
    if exec_dir.isOn():
        exec_dir = str(exec_dir.Value)
        exec_dir = exec_dir.lstrip('"')
        exec_dir = exec_dir.rstrip('"')

        if inp_file_name[0] == '/':
            # path is already absolute
            pass
        else:
            inp_file_name = exec_dir + "/" + inp_file_name

    if not exists(inp_file_name + ".CPS.CPC"):
        raise ApplicationError("Calling ChimeraSlayer failed.")

    result['CPS'] = ResultPath(Path=inp_file_name + ".CPS.CPC",
                               IsWritten=True)
    return result

def _get_base_command(self):
    if self._subcommand is None:
        raise ApplicationError('_subcommand has not been set.')

    # prevent appending the subcommand multiple times
    if not self._command.endswith(self._subcommand):
        self._command = self._command_delimiter.join(
            [self._command, self._subcommand])

    return super()._get_base_command()

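# A minimal sketch of the guard above (names and values are hypothetical,
# for illustration only):
#
#     self._command = "cli"            # base command
#     self._command_delimiter = " "
#     self._subcommand = "align"
#     # first call  -> self._command becomes "cli align"
#     # second call -> the endswith() check is True, so nothing is appended
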
def main():
    """run denoiser on input flowgrams"""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sff_files = opts.sff_fps

    for f in sff_files:
        if (not exists(f)):
            option_parser.error(('Flowgram file path does not exist:\n %s \n'
                                 'Pass a valid one via -i.') % f)

    outdir = opts.output_dir
    create_dir(outdir, fail_on_exist=not opts.force)
    log_fh = None

    if (not (opts.primer or opts.map_fname)):
        raise ApplicationError("Either mapping file or primer required")

    # Read primer from metadata file if not set on command line
    if not opts.primer:
        mapping_data, header, comments = \
            parse_mapping_file(open(opts.map_fname, "U"))

        index = header.index("LinkerPrimerSequence")
        all_primers = set(array(mapping_data)[:, index])

        if len(all_primers) != 1:
            raise ValueError(
                "Currently only data sets with one primer are allowed.\n"
                "Make separate mapping files with only one primer, re-run split_libraries and\n"
                "denoise with each split_library output separately.")
        primer = list(all_primers)[0]
        last_char = primer[-1]
        if (last_char not in "ACGT"):
            raise ValueError("We currently do not support primers with "
                             "degenerate bases at their 3' end.")
    else:
        primer = opts.primer

    centroids, cluster_mapping = fast_denoiser(opts.sff_fps, opts.fasta_fp,
                                               outdir, opts.num_cpus, primer,
                                               titanium=opts.titanium)

    # store the cluster mapping and the centroid sequences
    result_otu_path = '%s/denoised_clusters.txt' % outdir
    of = open(result_otu_path, 'w')
    for i, cluster in cluster_mapping.iteritems():
        of.write('%s\t%s\n' % (str(i), '\t'.join(cluster)))
    of.close()

    result_fasta_path = '%s/denoised_seqs.fasta' % outdir
    oh = open(result_fasta_path, 'w')
    write_Fasta_from_name_seq_pairs(centroids, oh)

def _input_as_parameters(self, data):
    """ Set the input paths (a NAST aligned fasta filepath) """
    # The list of values which can be passed on a per-run basis
    allowed_values = ['--query_NAST', '--db_NAST', '--db_FASTA', '-R']

    unsupported_parameters = set(data.keys()) - set(allowed_values)
    if unsupported_parameters:
        raise ApplicationError(
            "Unsupported parameter(s) passed when calling ChimeraSlayer: %s" %
            ' '.join(unsupported_parameters))

    return ''

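# Illustrative example of the guard above (hypothetical input; assumes `data`
# is the per-run parameter dict handed to the application controller):
#
#     data = {'--query_NAST': 'seqs_aligned.fasta', '--foo': 'bar'}
#     # set(data.keys()) - set(allowed_values) == {'--foo'}, so an
#     # ApplicationError is raised naming the unsupported '--foo' parameter.
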
def setUp(self):
    """ """
    self.files_to_remove = []
    self.dirs_to_remove = []

    tmp_dir = get_qiime_temp_dir()
    self.test_out = mkdtemp(
        dir=tmp_dir,
        prefix='qiime_parallel_taxonomy_assigner_tests_',
        suffix='')
    self.dirs_to_remove.append(self.test_out)

    # Temporary input file
    fd, self.tmp_seq_filepath = mkstemp(
        dir=self.test_out,
        prefix='qiime_parallel_taxonomy_assigner_tests_input',
        suffix='.fasta')
    close(fd)
    seq_file = open(self.tmp_seq_filepath, 'w')
    seq_file.write(rdp_test_seqs)
    seq_file.close()
    self.files_to_remove.append(self.tmp_seq_filepath)

    self.id_to_taxonomy_file = NamedTemporaryFile(
        prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
        suffix='.txt', dir=tmp_dir)
    self.id_to_taxonomy_file.write(rdp_id_to_taxonomy)
    self.id_to_taxonomy_file.seek(0)

    self.reference_seqs_file = NamedTemporaryFile(
        prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
        suffix='.fasta', dir=tmp_dir)
    self.reference_seqs_file.write(rdp_reference_seqs)
    self.reference_seqs_file.seek(0)

    jar_fp = getenv('RDP_JAR_PATH')
    jar_basename = basename(jar_fp)
    if '2.2' not in jar_basename:
        raise ApplicationError(
            "RDP_JAR_PATH does not point to version 2.2 of the "
            "RDP Classifier.")

    initiate_timeout(60)

def denoise_seqs(sff_fps, fasta_fp, tmpoutdir, preprocess_fp=None,
                 cluster=False, num_cpus=1, squeeze=True, percent_id=0.97,
                 bail=1, primer="", low_cutoff=3.75, high_cutoff=4.5,
                 log_fp="denoiser.log", low_memory=False, verbose=False,
                 error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                 max_num_rounds=None, titanium=False, checkpoint_fp=None):
    """The main routine to denoise flowgrams"""

    # abort if binary is missing
    check_flowgram_ali_exe()

    if verbose:
        # switch off buffering for log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)
    else:
        log_fh = None

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
        if checkpoint_fp:
            log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
        log_fh.write("Primer sequence: %s\n" % primer)
        log_fh.write("Running on cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
        log_fh.write("High cut-off: %.2f\n" % high_cutoff)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iterations: %s\n\n" % max_num_rounds)

    # here we go ...
    # Phase I - clean up and truncate input sff
    if (checkpoint_fp):
        if (preprocess_fp):
            # skip preprocessing; we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(preprocess_fp)
        else:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")
    else:
        if (preprocess_fp):
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(preprocess_fp)
        elif (cluster):
            preprocess_on_cluster(sff_fps, log_fp, fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir, verbose=verbose,
                                  squeeze=squeeze, primer=primer)
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, l, mapping, seqs) = \
                preprocess(sff_fps, log_fh, fasta_fp=fasta_fp,
                           out_fp=tmpoutdir, verbose=verbose,
                           squeeze=squeeze, primer=primer)

    # the preprocessor writes into the same log file, so jump to its end
    if verbose:
        log_fh.close()
        log_fh = open(tmpoutdir + "/" + log_fp, "a", 0)

    # Phase II:
    # use prefix map based clustering as initial centroids and greedily
    # add flowgrams to clusters with a low threshold
    (new_sff_file, bestscores, mapping) = \
        greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir, l,
                          log_fh, num_cpus=num_cpus, on_cluster=cluster,
                          bail_out=bail, pair_id_thresh=percent_id,
                          threshold=low_cutoff, verbose=verbose,
                          fast_method=not low_memory,
                          error_profile=error_profile,
                          max_num_rounds=max_num_rounds,
                          checkpoint_fp=checkpoint_fp)

    # Phase III:
    # Assign seqs to nearest existing centroid with high threshold
    secondary_clustering(new_sff_file, mapping, bestscores, log_fh,
                         verbose=verbose, threshold=high_cutoff)
    remove(new_sff_file)

    if (verbose):
        log_fh.write("Finished clustering\n")
        log_fh.write("Writing Clusters\n")
        log_fh.write(make_stats(mapping) + "\n")

    store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
    store_mapping(mapping, tmpoutdir, "denoiser")

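# Illustrative call of denoise_seqs (a sketch only; the file paths and primer
# sequence are hypothetical, and all other settings fall back to the keyword
# defaults shown above):
#
#     denoise_seqs(["run1.sff.txt"], "seqs.fna", "/tmp/denoiser_out",
#                  primer="CATGCTGCCTCCCGTAGGAGT", verbose=True)
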
def filter_with_flowgram(id, flowgram, flowgrams, header, ids, num_flows,
                         bestscores, log_fh, outdir="/tmp/", threshold=3.75,
                         num_cpus=32, fast_method=True, on_cluster=False,
                         mapping=None, spread=[], verbose=False,
                         pair_id_thresh=0.97, client_sockets=[],
                         error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat'):
    """Filter all files in flows_filename with flowgram and split according to threshold.

    id: The flowgram identifier of the master flowgram of this round

    flowgram: This flowgram is used to filter all the other flowgrams

    flowgrams: iterator containing the flowgrams to be filtered

    header: a valid sff.txt header

    ids: this list marks the active flowgrams, i.e. flowgrams that are unclustered

    num_flows: Number of flows remaining in the current round

    bestscores: dictionary that stores, for each unclustered flowgram, the best
                score it has to one of the previously seen centroids and the id
                of that centroid. Used in the second denoising phase.

    outdir: directory where intermediate and result files go

    threshold: Filtering threshold

    num_cpus: number of cpus to run on, if on_cluster == True

    fast_method: Boolean value for fast denoising with lots of memory

    on_cluster: Boolean flag for local vs cluster

    mapping: the current cluster mapping

    spread: worker processing throughput

    error_profile: Path to error profile *.dat file

    Implementation detail:
    The iterator behind 'flowgrams' is big and thus we want to keep its
    traversals to a minimum. The naive implementation of this filter function
    would traverse the iterator once to create the input file for the
    alignment routine, then a second time to do the actual filtering. To get
    rid of the second run through the iterator, we keep a list (in fact a
    dict) of active 'ids' and do the filtering only in the next round. A
    cleaner but still fast solution would be great, as this definitely poses a
    pitfall for future modifications.

    Returns the filename of the file containing all non-filtered flows and the
    number of flows.
    """
    if verbose:
        log_fh.write("Filtering with %s: %d flowgrams\n" % (id, num_flows))

    # set up the flowgram storage
    if (not fast_method):
        fc = FlowgramContainerFile(header, outdir)
    else:
        fc = FlowgramContainerArray()

    # calculate distance scores
    if on_cluster:
        (scores, names, flowgrams) = \
            get_flowgram_distances_on_cluster(
                id, flowgram, flowgrams, fc, ids, num_cpus, num_flows,
                spread=spread, client_sockets=client_sockets)
    else:
        (scores, names, flowgrams) = \
            get_flowgram_distances(
                id, flowgram, flowgrams, fc, ids, outdir=outdir,
                error_profile=error_profile)

    # shortcut for non-matching flowgrams
    survivors = filter(
        lambda a_b: a_b[0] < threshold or a_b[1] >= pair_id_thresh, scores)
    if (len(survivors) == 0):
        # put it in its own cluster
        # and remove it from any further searches
        if (id in bestscores):
            del (bestscores[id])
        del (ids[id])
        return (flowgrams, num_flows - 1)

    # Do the filtering
    non_clustered_ctr = 0
    for ((score, pair_id), name) in zip(scores, names):
        if (score < threshold or name == id or pair_id >= pair_id_thresh):
            # make sure the original flowgram gets into this cluster
            del (ids[name])
            if (name in bestscores):
                del (bestscores[name])
            if (id != name):
                # update the mapping information
                mapping[id].extend(mapping[name])
                mapping[id].append(name)
                # delete the old cluster from the mapping
                del (mapping[name])
        else:
            non_clustered_ctr += 1
            # keep track of the best match of this flowgram to any centroid
            if (name not in bestscores or score < bestscores[name][1]):
                bestscores[name] = (id, score)

    # Some extra safety that we are not missing anything
    if (len(ids) != non_clustered_ctr or
            len(bestscores) != non_clustered_ctr):
        raise ApplicationError("filterWithFlowgram failed")

    return (flowgrams, non_clustered_ctr)

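# Small sketch of the 'survivors' shortcut above (hypothetical numbers):
# with threshold=3.75 and pair_id_thresh=0.97,
#
#     scores = [(5.2, 0.80), (1.3, 0.99)]
#
# the first tuple fails both tests and is filtered out, while the second
# survives via its low score (and also via its pair id), so the master
# flowgram is not placed into a singleton cluster.
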