def prefix_filter_flowgrams(flowgrams, squeeze=False):
    """Filters flowgrams by common prefixes.

    flowgrams: iterable source of flowgrams
    squeeze: if True, collapse each poly-X run to a single X before comparing

    Returns a tuple (num_clusters, num_original_seqs, prefix_mapping).
    """
    # Lazily pull (name, sequence) pairs out of the flowgrams; generator
    # expressions keep memory flat for large inputs.
    if squeeze:
        named_seqs = ((f.Name, squeeze_seq(str(f.toSeq(truncate=True))))
                      for f in flowgrams)
    else:
        named_seqs = ((f.Name, str(f.toSeq(truncate=True)))
                      for f in flowgrams)

    # Cluster sequences that are prefixes of one another.
    mapping = build_prefix_map(named_seqs)
    num_clusters = len(mapping)
    # Each value lists the cluster's members minus its representative key,
    # so the original sequence count adds one per cluster.
    num_orig_seqs = num_clusters + sum(len(members)
                                       for members in mapping.values())
    return (num_clusters, num_orig_seqs, mapping)
def prefix_filter_flowgrams(flowgrams, squeeze=False):
    """Filters flowgrams by common prefixes.

    flowgrams: iterable source of flowgrams
    squeeze: if True, collapse each poly-X run to a single X before comparing

    Returns a tuple (num_clusters, num_original_seqs, prefix_mapping).
    """
    # Choose the (name, sequence) extractor once, then map it lazily over
    # the flowgram stream — more memory-efficient than materializing a list.
    if squeeze:
        extract = lambda f: (f.Name, squeeze_seq(str(f.toSeq(truncate=True))))
    else:
        extract = lambda f: (f.Name, str(f.toSeq(truncate=True)))

    # Group sequences that share a prefix relationship.
    mapping = build_prefix_map(imap(extract, flowgrams))

    l = len(mapping)
    # mapping values exclude the representative key, hence the "+ l".
    orig_l = l + sum(len(members) for members in mapping.values())
    return (l, orig_l, mapping)
def __call__(self, seq_path, result_path=None, log_path=None):
    """Returns dict mapping {otu_id:[seq_ids]} for each otu.

    Parameters:
    seq_path: path to file of sequences
    result_path: path to file of results. If specified,
    dumps the result to the desired path instead of returning it.
    log_path: path to log, which includes dump of params.
    """
    log_lines = []

    # FIX: the sequence file was previously opened and never closed;
    # the with-block guarantees the handle is released. build_prefix_map
    # must consume the lazy iterator before the file closes.
    with open(seq_path) as seq_file:
        # Get the appropriate sequence iterator
        if self.Params['Reverse']:
            # Reverse the sequences prior to building the prefix map.
            # This effectively creates a suffix map.
            # Also removes descriptions from seq identifier lines
            seqs = imap(lambda s: (s[0].split()[0], s[1][::-1]),
                        MinimalFastaParser(seq_file))
            log_lines.append(
                'Seqs reversed for suffix mapping (rather than prefix mapping).')
        else:
            # remove descriptions from seq identifier lines
            seqs = imap(lambda s: (s[0].split()[0], s[1]),
                        MinimalFastaParser(seq_file))

        # Build the mapping
        mapping = build_prefix_map(seqs)

    log_lines.append('Num OTUs: %d' % len(mapping))

    if result_path:
        # if the user provided a result_path, write the results to file
        # with one tab-separated line per cluster.
        # FIX: the output file is now closed even if a write raises.
        with open(result_path, 'w') as of:
            for i, (otu_id, members) in enumerate(mapping.iteritems()):
                of.write('%s\t%s\n' % (i, '\t'.join([otu_id] + members)))
        result = None
        log_lines.append('Result path: %s' % result_path)
    else:
        # if the user did not provide a result_path, store
        # the clusters in a dict of {otu_id:[seq_ids]}, where
        # otu_id is arbitrary
        # add key to cluster_members (values exclude the representative)
        for key in mapping.keys():
            mapping[key].append(key)
        result = dict(enumerate(mapping.values()))
        log_lines.append('Result path: None, returned as dict.')

    if log_path:
        # if the user provided a log file path, log the run.
        # FIX: the log file was previously never closed; data could be
        # lost in buffers until interpreter exit.
        with open(log_path, 'w') as log_file:
            log_lines = [str(self)] + log_lines
            log_file.write('\n'.join(log_lines))

    # return the result (note this is None if the data was
    # written to file)
    return result
def __call__(self, seq_path, result_path=None, log_path=None):
    """Returns dict mapping {otu_id:[seq_ids]} for each otu.

    Parameters:
    seq_path: path to file of sequences
    result_path: path to file of results. If specified,
    dumps the result to the desired path instead of returning it.
    log_path: path to log, which includes dump of params.
    """
    log_lines = []

    # FIX: seq_path was opened without ever being closed; use a context
    # manager and consume the lazy parser before the handle closes.
    with open(seq_path) as seq_file:
        # Get the appropriate sequence iterator
        if self.Params["Reverse"]:
            # Reverse the sequences prior to building the prefix map.
            # This effectively creates a suffix map.
            # Also removes descriptions from seq identifier lines
            seqs = imap(lambda s: (s[0].split()[0], s[1][::-1]),
                        MinimalFastaParser(seq_file))
            log_lines.append(
                "Seqs reversed for suffix mapping (rather than prefix mapping).")
        else:
            # remove descriptions from seq identifier lines
            seqs = imap(lambda s: (s[0].split()[0], s[1]),
                        MinimalFastaParser(seq_file))

        # Build the mapping
        mapping = build_prefix_map(seqs)

    log_lines.append("Num OTUs: %d" % len(mapping))

    if result_path:
        # if the user provided a result_path, write the results to file
        # with one tab-separated line per cluster.
        # FIX: closing is now exception-safe instead of a bare close().
        with open(result_path, "w") as of:
            for i, (otu_id, members) in enumerate(mapping.iteritems()):
                of.write("%s\t%s\n" % (i, "\t".join([otu_id] + members)))
        result = None
        log_lines.append("Result path: %s" % result_path)
    else:
        # if the user did not provide a result_path, store
        # the clusters in a dict of {otu_id:[seq_ids]}, where
        # otu_id is arbitrary
        # add key to cluster_members (values exclude the representative)
        for key in mapping.keys():
            mapping[key].append(key)
        result = dict(enumerate(mapping.values()))
        log_lines.append("Result path: None, returned as dict.")

    if log_path:
        # if the user provided a log file path, log the run.
        # FIX: the log file handle was previously leaked (never closed).
        with open(log_path, "w") as log_file:
            log_lines = [str(self)] + log_lines
            log_file.write("\n".join(log_lines))

    # return the result (note this is None if the data was
    # written to file)
    return result
def _prefilter_with_trie(self, seq_path):
    """Collapse exact-prefix duplicates in seq_path before clustering.

    seq_path: path to a FASTA file of input sequences.

    Returns (filtered_seqs, mapping) where filtered_seqs is a list of
    (label, seq) pairs for the cluster representatives and mapping is
    {representative_label: [member_labels...]} (including the key itself).
    """
    # FIX: the Py2-only tuple-parameter lambda `lambda (a, b): ...` is
    # replaced by an equivalent generator expression, and both file
    # handles (previously leaked) are closed via with-blocks.

    # get the prefix map; truncate labels to the first whitespace token
    with open(seq_path) as fh:
        mapping = build_prefix_map(
            (label.split()[0], seq)
            for label, seq in MinimalFastaParser(fh))

    # make each representative a member of its own cluster
    for key in mapping.keys():
        mapping[key].append(key)

    # collect the representative seqs on a second pass over the file
    filtered_seqs = []
    with open(seq_path) as fh:
        for label, seq in MinimalFastaParser(fh):
            label = label.split()[0]
            if label in mapping:
                filtered_seqs.append((label, seq))
    return filtered_seqs, mapping
def _prefilter_with_trie(self, seq_path):
    """Collapse exact-prefix duplicates in seq_path before clustering.

    seq_path: path to a FASTA file of input sequences.

    Returns (filtered_seqs, mapping) where filtered_seqs is a list of
    (label, seq) pairs for the cluster representatives and mapping is
    {representative_label: [member_labels...]} (including the key itself).
    """
    # FIX: replaced the Py3-incompatible tuple-parameter lambda with a
    # plain generator expression (identical behavior), and closed both
    # previously-leaked file handles with with-blocks.

    # first pass: build the prefix map over whitespace-truncated labels
    with open(seq_path) as fasta:
        truncated = ((header.split()[0], sequence)
                     for header, sequence in MinimalFastaParser(fasta))
        mapping = build_prefix_map(truncated)

    # representatives also count as members of their own cluster
    for key in mapping.keys():
        mapping[key].append(key)

    # second pass: keep only the representative sequences
    filtered_seqs = []
    with open(seq_path) as fasta:
        for header, sequence in MinimalFastaParser(fasta):
            header = header.split()[0]
            if header in mapping:
                filtered_seqs.append((header, sequence))
    return filtered_seqs, mapping