def main():
    """Print symbol distributions for every record of the Greengenes file.

    Logging is configured to write INFO-level messages to 'log.log'
    (the file is overwritten on each run).  For every (label, sequence)
    record of the masked Greengenes alignment a
    [label, distribution, shannon(distribution)] entry is collected, and the
    complete list is printed once the file has been read.
    """
    from cogent.parse.fasta import MinimalFastaParser

    greengenes_filename = os.path.expanduser(
        '~/Data/greengenes/sequences_16S_gg_2011_1.sel4cni.inf.aln.masked.fasta'
    )
    logging.basicConfig(level='INFO',
                        format='%(levelname)s: %(message)s',
                        filename='log.log',
                        filemode='w')

    def describe(label, seq):
        # the distribution is computed once and reused for the shannon() call
        dist = distribution(seq, RNA.Alphabet)
        return [label, dist, shannon(dist)]

    with open(greengenes_filename) as greengenes:
        distributions = [describe(label, seq)
                         for label, seq in MinimalFastaParser(greengenes)]
    print(distributions)
def read_preprocessed_data(out_fp="/tmp/"):
    """Read data of a previous preprocessing run.

    out_fp: output directory of previous preprocess run.
            Supposed to contain two files:
              - prefix_dereplicated.fasta
              - prefix_mapping.txt

    Returns a 4-tuple:
      (out_fp + "/prefix_dereplicated.sff.txt", len(mapping), mapping, seqs)
    The first element is only a path built by string concatenation; the
    file is not opened here.  seqs maps the part of each fasta label that
    precedes the first ':' to the record's sequence.
    """
    # fasta headers look like "> id: count"; only the id part is used as key
    with open(out_fp + "/prefix_dereplicated.fasta") as fasta_f:
        seqs = dict((label.split(':')[0], seq)
                    for label, seq in MinimalFastaParser(fasta_f))

    # both input files are closed as soon as they have been parsed
    with open(out_fp + "/prefix_mapping.txt") as mapping_f:
        mapping = read_denoiser_mapping(mapping_f)

    return (out_fp + "/prefix_dereplicated.sff.txt", len(mapping), mapping,
            seqs)
def __call__(self, seq_path=None, result_path=None, log_path=None):
    """Returns a dict mapping {seq_id:(taxonomy, confidence)} for each seq

    Keep in mind, "confidence" is only done for consistency and in fact
    all assignments will have a score of 0 because a method for determining
    confidence is not currently implemented.

    Parameters:
    seq_path: path to file of sequences. The sequences themselves are
        never actually used, but they are needed for their ids.
    result_path: path to file of results. If specified, the results are
        additionally written to this path as tab-separated
        "seq_id, lineage, confidence" lines.  The results dict is
        returned in either case.
    log_path: path to log, which should include dump of params.
    """
    # initialize the logger
    logger = self._get_logger(log_path)
    logger.info(str(self))

    with open(seq_path, 'U') as f:
        seqs = dict(MinimalFastaParser(f))

    # the taxonomy and tree files are closed as soon as they are parsed
    with open(self.Params['id_to_taxonomy_fp']) as taxonomy_f:
        consensus_map = tax2tree.prep_consensus(taxonomy_f, seqs.keys())

    # the second tab-separated field of the first consensus line seeds the
    # rank order
    seed_con = consensus_map[0].strip().split('\t')[1]
    determine_rank_order(seed_con)

    tipnames_map = load_consensus_map(consensus_map, False)
    with open(self.Params['tree_fp']) as tree_f:
        tree = load_tree(tree_f, tipnames_map)

    results = tax2tree.generate_constrings(tree, tipnames_map)
    results = tax2tree.clean_output(results, seqs.keys())

    if result_path:
        # if the user provided a result_path, write the
        # results to file
        with open(result_path, 'w') as f:
            for seq_id, (lineage, confidence) in results.items():
                f.write('%s\t%s\t%s\n' % (seq_id, lineage, confidence))
        logger.info('Result path: %s' % result_path)

    return results
def main():
    """Write the fasta records of the selected samples to a new fasta file.

    Options are read through parse_command_line_parameters(**script_info).
    Without a mapping file, opts.sample_ids is a comma-separated list of
    sample ids.  With a mapping file, opts.sample_ids is handed to
    parse_metadata_state_descriptions() and the matching sample ids are
    looked up in the mapping data.

    Raises ValueError if a mapping-file query matches no sample.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    negate = opts.negate
    # keep the raw option value: it is needed for the error message below
    sample_id_query = opts.sample_ids
    mapping_fp = opts.mapping_fp
    input_fasta_fp = opts.input_fasta_fp
    output_fasta_fp = opts.output_fasta_fp

    if not mapping_fp:
        sample_ids = sample_id_query.split(',')
    else:
        map_data, map_header, map_comments = parse_mapping_file(mapping_fp)
        sample_ids = get_sample_ids(
            map_data, map_header,
            parse_metadata_state_descriptions(sample_id_query))
        if len(sample_ids) == 0:
            # the message used to reference an undefined name, which turned
            # this error into a NameError
            raise ValueError(
                "No samples match the search criteria: %s" % sample_id_query)

    if opts.verbose:
        # This is useful when using the --valid_states feature so you can
        # find out if a search query didn't work as you expected before a
        # lot of time is spent
        print("Extracting samples: %s" % ', '.join(sample_ids))

    try:
        input_fasta_f = open(input_fasta_fp)
    except IOError:
        option_parser.error('Cannot open %s. Does it exist? Do you have read access?'%\
             input_fasta_fp)
        # kept from the original as a fallback should error() ever return
        exit(1)

    with input_fasta_f:
        try:
            output_fasta_f = open(output_fasta_fp, 'w')
        except IOError:
            option_parser.error("Cannot open %s. Does path exist? Do you have write access?" %\
                 output_fasta_fp)
            exit(1)

        # both handles are closed even if extraction fails half way through
        with output_fasta_f:
            seqs = MinimalFastaParser(input_fasta_f)
            for r in extract_seqs_by_sample_id(seqs, sample_ids, negate):
                output_fasta_f.write('>%s\n%s\n' % r)
def fix_abundance_labels(output_consensus_fp, filtered_consensus_fp):
    """ Rewrites each fasta label as "<first label token>;size=<value>"

    The value is whatever follows the first 'size=' in the original label,
    with every ';' removed from it.

    output_consensus_fp: consensus filepath with abundance data
    filtered_consensus_fp: output filepath name

    Raises IndexError if a label does not contain 'size='.
    """
    # context managers make sure both files are closed even when a label
    # cannot be parsed
    with open(output_consensus_fp, "U") as consensus_f:
        with open(filtered_consensus_fp, "w") as filtered_f:
            for label, seq in MinimalFastaParser(consensus_f):
                fasta_label = label.split()[0]
                size = "size=" + label.split('size=')[1].replace(';', '')
                final_label = "%s;%s" % (fasta_label, size)
                filtered_f.write(">%s\n%s\n" % (final_label, seq))
def get_aligned_muscle(seq1, seq2):
    """Returns aligned sequences and frac_same using MUSCLE.

    The two sequences are aligned through muscle_seqs() (working directory
    /tmp, output written to a temporary filename).  The return value is
    (seq1_aligned, seq2_aligned, frac_same), where frac_same is the number
    of identical alignment columns divided by the length of the shorter
    input sequence.

    This needs to be moved to the muscle app controller
    """
    tmp_out = get_tmp_filename()
    muscle_result = muscle_seqs([seq1, seq2],
                                add_seq_names=True,
                                WorkingDir="/tmp",
                                out_filename=tmp_out)
    # exactly two records are expected; anything else raises ValueError
    first_record, second_record = list(
        MinimalFastaParser(muscle_result['MuscleOut'].read()))
    muscle_result.cleanUp()
    del muscle_result

    # take the sequence part of each record and drop its first character
    aligned1 = first_record[1][1:]
    aligned2 = second_record[1][1:]

    matching_columns = (array(aligned1, 'c') == array(aligned2, 'c')).sum(0)
    # NOTE(review): with integer operands this is floor division under
    # Python 2 unless the module enables true division -- confirm.
    frac_same = matching_columns / min(len(seq1), len(seq2))
    return aligned1, aligned2, frac_same
def write_combined_fasta(fasta_name_to_sample_id,
                         fasta_files,
                         output_dir=".",
                         counter=0):
    """ Writes combined, enumerated fasta file

    Every record of every input file is written to
    <output_dir>/combined_seqs.fna with the label
    "<SampleID>_<counter> <original label>"; counter is incremented after
    each record and keeps running across input files.

    fasta_name_to_sample_id: dict of fasta file name to SampleID
    fasta_files: list of filepaths to iterate through
    output_dir: output directory to write combined file to
    counter: Starting number to enumerate sequences with
    """
    combined_fp = join(output_dir + "/", "combined_seqs.fna")

    # the output file and each input file are closed deterministically
    with open(combined_fp, "w") as combined_file_out:
        for curr_fasta in fasta_files:
            with open(curr_fasta, "U") as fasta_f:
                for label, seq in MinimalFastaParser(fasta_f):
                    # SampleID is looked up by the file's base name
                    combined_file_out.write(">%s_%d %s\n" %\
                     (fasta_name_to_sample_id[basename(curr_fasta)],
                      counter, label))
                    combined_file_out.write("%s\n" % seq)
                    counter += 1
def test_call_alt_input_types(self):
    """BlastTaxonAssigner.__call__ accepts either seqs or a seq path"""
    assigner_params = {
        'reference_seqs_filepath': self.reference_seqs_fp,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
    }
    assigner = BlastTaxonAssigner(assigner_params)

    # calling with neither seqs nor a seq path must fail
    self.assertRaises(AssertionError, assigner)

    # a list of (seq_id, seq) pairs is accepted via the seqs keyword
    seq_pairs = list(MinimalFastaParser(open(self.input_seqs_fp)))
    from_pairs = assigner(seqs=seq_pairs)
    self.assertEqual(from_pairs, self.expected1)

    # a filepath is accepted positionally
    from_path = assigner(self.input_seqs_fp)
    self.assertEqual(from_path, self.expected1)

    # both input styles give the same answer
    self.assertEqual(assigner(seqs=seq_pairs),
                     assigner(self.input_seqs_fp))
def kegg_fasta_to_codon_list(lines):
    """Builds one CodonUsage object per record of a KEGG-format FASTA file.

    The first whitespace-separated label field must have the form
    "<species abbreviation>:<gene id>".  If more fields follow, a second
    field ending in ';' is stored (without the ';') as the gene name and
    the remaining fields become the description; otherwise all fields
    after the first form the description.  The collected info is passed as
    Info and also copied onto the object's __dict__.
    """
    codon_usages = []
    for label, seq in MinimalFastaParser(lines):
        seq = seq.upper()
        fields = label.split()

        info = {}
        info['SpeciesAbbreviation'], info['GeneId'] = fields[0].split(':')

        annotation = fields[1:]
        if annotation:
            if annotation[0].endswith(';'):
                # gene label, e.g. "abc;"
                info['Gene'] = annotation[0][:-1]
                info['Description'] = ' '.join(annotation[1:])
            else:
                info['Description'] = ' '.join(annotation)

        usage = CodonUsage(seq_to_codon_dict(seq), Info=info)
        usage.__dict__.update(info)
        codon_usages.append(usage)
    return codon_usages
def cdhit_from_seqs(seqs, moltype, params=None):
    """Returns the CD-HIT results given seqs

    seqs    : dict like collection of sequences
    moltype : cogent.core.moltype object
    params  : cd-hit parameters; if '-o' is missing it is set (in the
              passed dict) to a temporary filename

    NOTE: This method will call CD_HIT if moltype is PROTEIN,
        CD_HIT_EST if moltype is RNA/DNA, and raise ValueError if any
        other moltype is passed.
    """
    # keys are not remapped. Tested against seq_ids of 100char length
    seqs = SequenceCollection(seqs, MolType=moltype)

    # make sure there is a params dict with an output argument
    if params is None:
        params = {}
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    working_dir = get_tmp_filename()

    # pick the controller that matches the molecule type
    if moltype is PROTEIN:
        controller = CD_HIT
    elif moltype is RNA or moltype is DNA:
        controller = CD_HIT_EST
    else:
        raise ValueError("Moltype must be either PROTEIN, RNA, or DNA")
    app = controller(WorkingDir=working_dir, params=params)

    # run and parse the clustered fasta output
    res = app(seqs.toFasta())
    new_seqs = dict(MinimalFastaParser(res['FASTA'].readlines()))

    # clean up result files, the working directory and the .bak.clstr file
    res.cleanUp()
    shutil.rmtree(working_dir)
    remove(params['-o'] + '.bak.clstr')

    return SequenceCollection(new_seqs, MolType=moltype)
def main():
    """Print "<filepath> <number of fasta records>" for every input file.

    opts.input_fps is a comma-separated list of glob patterns.  A file that
    cannot be opened is skipped; unless opts.suppress_errors is set, the
    path and the error are printed first.
    """
    option_parser, opts, args =\
      parse_command_line_parameters(**script_info)
    suppress_errors = opts.suppress_errors

    input_fps = []
    for input_fp in opts.input_fps.split(','):
        input_fps.extend(glob(input_fp))

    for input_fp in input_fps:
        try:
            input_f = open(input_fp, 'U')
        except IOError as e:
            if not suppress_errors:
                print("%s %s" % (input_fp, e))
            # Always move on to the next file: falling through would parse
            # an undefined handle or re-count the previous file.
            continue

        with input_f:
            seq_count = sum(1 for _ in MinimalFastaParser(input_f))
        print("%s %s" % (input_fp, seq_count))
def align_unaligned_seqs(seqs, moltype, params=None):
    """Aligns seqs with the Muscle app controller and returns an Alignment.

    seqs: SequenceCollection object, or data that can be used to build one.
    moltype: a MolType object.  DNA, RNA, or PROTEIN.
    params: dict of parameters to pass in to the Muscle app controller.
        Its '-out' entry is set to a temporary filename.

    Sequence ids are replaced by the abbreviated ids of getIntMap() for the
    Muscle run and mapped back to the original ids in the result.
    """
    if not params:
        params = {}

    # abbreviated-id view of the input sequences
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    int_map, int_keys = seq_collection.getIntMap()
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Muscle writes its output to a temporary file
    params.update({'-out': get_tmp_filename()})
    app = Muscle(InputHandler='_input_as_multiline_string', params=params)
    res = app(int_map.toFasta())

    # parse the alignment and restore the original ids
    aligned_by_int_id = dict(MinimalFastaParser(res['MuscleOut'].readlines()))
    aligned_by_orig_id = {}
    for int_id in aligned_by_int_id:
        aligned_by_orig_id[int_keys[int_id]] = aligned_by_int_id[int_id]
    new_alignment = Alignment(aligned_by_orig_id, MolType=moltype)

    # remove the temporary result files
    res.cleanUp()
    return new_alignment
# test_main: end-to-end regression test for the denoiser script.
# It builds a shell command that runs denoiser.py with --force, -o self.test_dir
# and -i .../denoiser/TestData/denoiser_test_set.sff.txt, runs it through Popen
# (shell=True, stderr merged into stdout) and then checks three output files:
#   - centroids.fasta must equal `expected`
#   - singletons.fasta must parse to exactly 6 fasta records
#   - denoiser_mapping.txt must equal `expected_map`
# NOTE(review): result paths are built as self.result_dir + "<file name>" with
# no separator, so self.test_dir presumably ends with '/' -- confirm in setUp.
# NOTE(review): the command output is read into `result` but never inspected.
def test_main(self): """Denoiser should always give same result on test data""" expected = """>FS8APND01D3TW3 | cluster size: 94 CTCCCGTAGGAGTCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCC """ expected_map = """FS8APND01EWRS4: FS8APND01DXG45: FS8APND01D3TW3:\tFS8APND01CSXFN\tFS8APND01DQ8MX\tFS8APND01DY7QW\tFS8APND01B5QNI\tFS8APND01CQ6OG\tFS8APND01C7IGN\tFS8APND01DHSGH\tFS8APND01DJ17E\tFS8APND01CUXOA\tFS8APND01EUTYG\tFS8APND01EKK7T\tFS8APND01D582W\tFS8APND01B5GWU\tFS8APND01D7N2A\tFS8APND01BJGHZ\tFS8APND01D6DYZ\tFS8APND01C6ZIM\tFS8APND01D2X6Y\tFS8APND01BUYCE\tFS8APND01BNUEY\tFS8APND01DKLOE\tFS8APND01C24PP\tFS8APND01EBWQX\tFS8APND01ELDYW\tFS8APND01B0GCS\tFS8APND01D4QXI\tFS8APND01EMYD9\tFS8APND01EA2SK\tFS8APND01DZOSO\tFS8APND01DHYAZ\tFS8APND01C7UD9\tFS8APND01BTZFV\tFS8APND01CR78R\tFS8APND01B39IE\tFS8APND01ECVC0\tFS8APND01DM3PL\tFS8APND01DELWS\tFS8APND01CIEK8\tFS8APND01D7ZOZ\tFS8APND01CZSAI\tFS8APND01DYOVR\tFS8APND01BX9XY\tFS8APND01DEWJA\tFS8APND01BEKIW\tFS8APND01DCKB9\tFS8APND01EEYIS\tFS8APND01DDKEA\tFS8APND01DSZLO\tFS8APND01C6EBC\tFS8APND01EE15M\tFS8APND01ELO9B\tFS8APND01C58QY\tFS8APND01DONCG\tFS8APND01DVXX2\tFS8APND01BL5YT\tFS8APND01BIL2V\tFS8APND01EBSYQ\tFS8APND01CCX8R\tFS8APND01B2YCJ\tFS8APND01B1JG4\tFS8APND01DJ024\tFS8APND01BIJY0\tFS8APND01CIA4G\tFS8APND01DV74M\tFS8APND01ECAX5\tFS8APND01DC3TZ\tFS8APND01EJVO6\tFS8APND01D4VFG\tFS8APND01DYYYO\tFS8APND01D1EDD\tFS8APND01DQUOT\tFS8APND01A2NSJ\tFS8APND01DDC8I\tFS8APND01BP1T2\tFS8APND01DPY6U\tFS8APND01CIQGV\tFS8APND01BPUT8\tFS8APND01BDNH4\tFS8APND01DOZDN\tFS8APND01DS866\tFS8APND01DGS2J\tFS8APND01EDK32\tFS8APND01EPA0T\tFS8APND01CK3JM\tFS8APND01BKLWW\tFS8APND01DV0BO\tFS8APND01DPNXE\tFS8APND01B7LUA\tFS8APND01BTTE2\tFS8APND01CKO4X\tFS8APND01DGGBY\tFS8APND01C4NHX\tFS8APND01DYPQN FS8APND01BSTVP: FS8APND01EFK0W: FS8APND01DCIOO: 
FS8APND01CKOMZ: """ command = " ".join([ "%s/denoiser.py" % get_qiime_scripts_dir(), "--force", "-o", self.test_dir, "-i", "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt" % PROJECT_HOME ]) result = Popen(command,shell=True,universal_newlines=True,\ stdout=PIPE,stderr=STDOUT).stdout.read() self.result_dir = self.test_dir observed = "".join(list(open(self.result_dir + "centroids.fasta"))) self.assertEqual(observed, expected) self.assertEqual( len( list( MinimalFastaParser( open(self.result_dir + "singletons.fasta")))), 6) observed = "".join(list(open(self.result_dir + "denoiser_mapping.txt"))) self.assertEqual(observed, expected_map)
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=''):
    """ Split infile into files with seqs_per_file sequences in each

    infile: list of fasta lines or open file object
    seqs_per_file: the number of sequences to include in each file
    outfile_prefix: string used to create output filepath - output filepaths
     are <outfile_prefix>.<i>.fasta where i runs from 0 to number of
     output files
    working_dir: directory to prepend to temp filepaths (defaults to
     empty string -- files written to cwd)

    List of output filepaths is returned; it is empty if infile contains
    no sequences.

    Raises ValueError if seqs_per_file is not positive.
    """
    if seqs_per_file <= 0:
        raise ValueError("seqs_per_file must be > 0!")

    seq_counter = 0
    out_files = []
    # stays None until the first record is seen, so empty input is safe
    current_out_file = None

    if working_dir and not working_dir.endswith('/'):
        working_dir += '/'
        create_dir(working_dir)

    try:
        for seq_id, seq in MinimalFastaParser(infile):
            if seq_counter == 0:
                # start a new chunk file, numbered by the files so far
                current_out_fp = '%s%s.%d.fasta' \
                  % (working_dir, outfile_prefix, len(out_files))
                current_out_file = open(current_out_fp, 'w')
                out_files.append(current_out_fp)
            current_out_file.write('>%s\n%s\n' % (seq_id, seq))
            seq_counter += 1

            if seq_counter == seqs_per_file:
                current_out_file.close()
                seq_counter = 0
    finally:
        # close a partially filled last chunk (also on errors)
        if current_out_file is not None and not current_out_file.closed:
            current_out_file.close()

    return out_files
def test_insert_sequences_into_tree(self):
    """insert_sequences_into_tree returns the expected tree"""
    # reference, tree and stats inputs; output goes to /tmp
    params = {}
    params["-r"] = self.refseq_fname
    params["-t"] = self.tree_fname
    params["-s"] = self.stats_fname
    params["--out-dir"] = "/tmp"

    query_records = MinimalFastaParser(StringIO(QUERY_SEQS))
    seqs, align_map = Alignment(query_records).toPhylip()

    tree = insert_sequences_into_tree(seqs,
                                      DNA,
                                      params=params,
                                      write_log=False)

    # rename tips back to query names
    for tip in tree.tips():
        if tip.Name in align_map:
            tip.Name = align_map[tip.Name]

    observed_newick = tree.getNewick(with_distances=True)
    self.assertEqual(observed_newick, RESULT_TREE)
def test_gt_bracket_in_seq(self):
    """MinimalFastaParser handles alternate finder function

        A custom finder that groups lines in pairs lets the parser cope
        with "sequences" that themselves start with a > symbol, as happens
        when the MinimalFastaParser is (ab)used to parse fasta-like
        sequence quality files.
    """
    oneseq_w_gt = '>abc\n>CAG\n'.split('\n')

    def get_two_line_records(infile):
        # yield consecutive lines as (first, second) pairs; a trailing
        # unpaired line is dropped
        pending = None
        for line in infile:
            if pending is not None:
                yield (pending, line)
                pending = None
            else:
                pending = line

    records = list(
        MinimalFastaParser(oneseq_w_gt, finder=get_two_line_records))
    self.assertEqual(len(records), 1)
    only_record = records[0]
    self.assertEqual(only_record, ('abc', '>CAG'))
def compute_sample_novelty(table_fs, rep_set_f, verbose=False):
    """Summarize, per sample, the observations missing from a reference set.

    table_fs: iterable of inputs accepted by parse_biom_table
    rep_set_f: fasta lines/file of the reference set; the first
        whitespace-separated token of every label is a reference id
    verbose: if True, print a progress line after each processed table

    An observation is "novel" when its id is not among the reference ids.
    Counts of novel and known observations are accumulated per sample id
    over all tables.

    Returns a list of (sample_id, number_of_distinct_novel_observations,
    percent_novel_seqs) tuples sorted by the second element, largest first.
    percent_novel_seqs is 100 * novel / (novel + known), and 0.0 for a
    sample whose total count is zero.
    """
    # build the reference id set once; it is the same for every table
    ref_otus = set(seq_id.split()[0]
                   for seq_id, _ in MinimalFastaParser(rep_set_f))

    # {sample_id: [novel_count, known_count, [novel_obs_ids]]}
    sample_novelty = defaultdict(lambda: [0, 0, []])
    tables_processed = 0

    for table_f in table_fs:
        table = parse_biom_table(table_f)
        novel_obs = set(table.ObservationIds) - ref_otus

        for counts, obs_id, _ in table.iterObservations():
            if obs_id in novel_obs:
                for sid, count in zip(table.SampleIds, counts):
                    if count > 0:
                        sample_novelty[sid][0] += count
                        sample_novelty[sid][2].append(obs_id)
            else:
                for sid, count in zip(table.SampleIds, counts):
                    sample_novelty[sid][1] += count

        tables_processed += 1
        if verbose:
            print("Processed %d table(s)." % tables_processed)

    results = []
    for sid, (novel_count, known_count, novel_obs_ids) in \
            sample_novelty.items():
        total_count = known_count + novel_count
        if total_count:
            # float() avoids floor division of integer counts on Python 2
            percent_novel_seqs = (float(novel_count) / total_count) * 100
        else:
            # a sample with only zero counts has no novel sequences
            percent_novel_seqs = 0.0

        # Create a set first in case a sample in multiple tables has the same
        # novel observations.
        num_new_obs = len(set(novel_obs_ids))
        results.append((sid, num_new_obs, percent_novel_seqs))

    return sorted(results, reverse=True, key=itemgetter(1))
def parse_fasta(lines):
    """lightweight parser for KEGG FASTA format sequences

    Yields one tab-joined string per record: the fields produced by
    kegg_label_fields(label), then the sequence, then a newline (which is
    itself preceded by a tab, as it is joined like any other field).
    """
    for label, seq in MinimalFastaParser(lines):
        columns = list(kegg_label_fields(label))
        columns.append(seq)
        columns.append("\n")
        yield '\t'.join(columns)
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            threads=1.0,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking

    Runs identify_chimeras_usearch61 on the input (once, or once per
    SampleID when split_by_sampleid is set), then writes
    identify_chimeric_seqs.log, chimeras.txt and non_chimeras.txt to
    output_dir and removes the files collected for removal.

    input_seqs_fp:  filepath of input fasta file.
    output_dir:  output directory
    reference_seqs_fp:  fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates:  Suppress retention of .uc and log
     files.
    suppress_usearch61_ref:  Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo:  Suppress usearch61 de novo chimera detection.
    split_by_sampleid:  Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to reduce the number of false positives
     (and also sensitivity).
    usearch61_xn:  Weight of "no" vote.  Increasing this value tends to
     reduce the number of false positives (and also sensitivity).
    usearch61_dn:  Pseudo-count prior for "no" votes. (n). Increasing this
     value tends to reduce the number of false positives (and also
     sensitivity).
    usearch61_mindiffs:  Minimum number of diffs in a segment. Increasing
     this value tends to reduce the number of false positives while
     reducing sensitivity to very low-divergence chimeras.
    usearch61_mindiv:  Minimum divergence, i.e. 100% - identity between the
     query and closest reference database sequence. Expressed as a
     percentage, so the default is 0.8%, which allows chimeras that are up
     to 99.2% similar to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera
     comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61, increasing allows
     more sensitivity at a cost of speed
    verbose: print progress information and pass it on to the detection step
    threads: Specify number of threads used per core per CPU
    HALT_EXEC=application controller option to halt execution and print
     command
    """

    # Need to cluster sequences de novo first to get 1. abundance
    # information and 2 consensus sequence for each cluster.  Using
    # dereplication followed by clustering does not appear to automatically
    # update complete cluster size, will directly cluster raw seqs with the
    # small_mem clustering option.  This means without additional parsing
    # steps to recalculate actual cluster sizes, the sizeorder option can't
    # be used for de novo clustering and downstream chimera detection.

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)
    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    log_lines = {
        'denovo_chimeras': 0,
        'denovo_non_chimeras': 0,
        'ref_chimeras': 0,
        'ref_non_chimeras': 0
    }

    def _identify(fasta_fp, files_to_remove, log_lines):
        # single place for the long positional call shared by both modes
        return identify_chimeras_usearch61(
            fasta_fp, output_dir, reference_seqs_fp,
            suppress_usearch61_intermediates, suppress_usearch61_ref,
            suppress_usearch61_denovo, non_chimeras_retention,
            usearch61_minh, usearch61_xn, usearch61_dn, usearch61_mindiffs,
            usearch61_mindiv, usearch61_abundance_skew,
            percent_id_usearch61, minlen, word_length, max_accepts,
            max_rejects, files_to_remove, HALT_EXEC, log_lines, verbose,
            threads)

    if split_by_sampleid:
        if verbose:
            print("Splitting fasta according to SampleID...")
        # the input handle is closed even if splitting fails
        with open(input_seqs_fp, "U") as full_seqs:
            sep_fastas =\
             split_fasta_on_sample_ids_to_files(
                 MinimalFastaParser(full_seqs), output_dir)

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\
             _identify(curr_fasta, files_to_remove, log_lines)

            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras
    else:
        chimeras, non_chimeras, files_to_remove, log_lines =\
         _identify(input_seqs_fp, files_to_remove, log_lines)

    # write log, non chimeras, chimeras.
    write_usearch61_log(
        log_fp, input_seqs_fp, output_dir, reference_seqs_fp,
        suppress_usearch61_intermediates, suppress_usearch61_ref,
        suppress_usearch61_denovo, split_by_sampleid,
        non_chimeras_retention, usearch61_minh, usearch61_xn, usearch61_dn,
        usearch61_mindiffs, usearch61_mindiv, usearch61_abundance_skew,
        percent_id_usearch61, minlen, word_length, max_accepts, max_rejects,
        HALT_EXEC, log_lines)

    with open(chimeras_fp, "w") as chimeras_f:
        for curr_chimera in chimeras:
            chimeras_f.write("%s\n" % curr_chimera)
    with open(non_chimeras_fp, "w") as non_chimeras_f:
        for curr_non_chimera in non_chimeras:
            non_chimeras_f.write("%s\n" % curr_non_chimera)

    remove_files(files_to_remove)
def get_chimeras_from_Nast_aligned(seqs_fp,
                                   ref_db_aligned_fp=None,
                                   ref_db_fasta_fp=None,
                                   HALT_EXEC=False,
                                   min_div_ratio=None,
                                   keep_intermediates=False):
    """Run ChimeraSlayer on seqs_fp and return the parsed CPS result.

    seqs_fp:  a filepath with the seqs to check in the file
    ref_db_aligned_fp: fp to (pynast) aligned reference sequences
    ref_db_fasta_fp: same seqs as above, just unaligned. Will be computed
     on the fly (and removed afterwards unless keep_intermediates is set)
     if not provided.
    HALT_EXEC: stop execution if true
    min_div_ratio: passed to ChimeraSlayer App as '-R' when not None
    keep_intermediates: if False, intermediate files of the app and the
     files created here are removed after the run

    If neither reference path is given, no database parameters are passed
    and ChimeraSlayer uses its default db.
    """
    files_to_remove = []

    # might come in as FilePath object with quotes
    seqs_fp = str(seqs_fp).strip('"')

    seqs_dir, new_seqs_fp = split(seqs_fp)

    # if fp is in current dir, we fake a dir change
    if seqs_dir == "":
        seqs_dir = "./"

    # Chimera Slayer puts some temp files in current dir and some in dir
    # of input file; use exec_dir to change to dir of input file, so to
    # have all tmp files in one place
    params = {'--query_NAST': new_seqs_fp, '--exec_dir': seqs_dir}

    # Otherwise the default db is used, whose relative position to the
    # ChimeraSlayer binary is hardcoded.
    if ref_db_aligned_fp is not None or ref_db_fasta_fp is not None:
        if not ref_db_fasta_fp:
            # make degapped reference file; the aligned file is closed as
            # soon as it has been converted
            with open(ref_db_aligned_fp) as aligned_f:
                ref_db_fasta_fp = write_degapped_fasta_to_file(
                    MinimalFastaParser(aligned_f))
            files_to_remove.append(ref_db_fasta_fp)
        # use user db
        params.update({
            '--db_NAST': abspath(ref_db_aligned_fp),
            '--db_FASTA': abspath(ref_db_fasta_fp)
        })

    if min_div_ratio is not None:
        params.update({'-R': min_div_ratio})

    app = ChimeraSlayer(params=params, HALT_EXEC=HALT_EXEC)
    app_results = app()

    # this is a FilePath object in case of success.
    # How can we test for failure here?
    chimeras = parse_CPS_file((app_results['CPS']))
    if not keep_intermediates:
        app.remove_intermediate_files()
        remove_files(files_to_remove)

    return chimeras
def get_seqs_to_keep_lookup_from_prefix(fasta_f, prefix):
    """Return {seq_id: None} for every fasta label starting with prefix.

    fasta_f: fasta lines or open fasta file, parsed by MinimalFastaParser
    prefix: string the full label has to start with
    """
    lookup = {}
    for seq_id, _ in MinimalFastaParser(fasta_f):
        if seq_id.startswith(prefix):
            lookup[seq_id] = None
    return lookup
def assign_taxonomy(data,
                    min_confidence=0.80,
                    output_fp=None,
                    training_data_fp=None,
                    max_memory=None):
    """ Assign taxonomy to each sequence in data with the RDP classifier

        data: open fasta file object or list of fasta lines
        min_confidence: minimum support threshold to assign taxonomy to a
         sequence
        output_fp: path to write output; if not provided, result will be
         returned in a dict of {seq_id:(taxonomy_assignment,confidence)}
        training_data_fp: if not None, passed to the classifier as
         '-training-data'
        max_memory: if not None, passed to the classifier as '-Xmx'

        When output_fp is given, one "seq_id<TAB>taxonomy<TAB>confidence"
        line is written per sequence and None is returned.
    """
    data = list(data)

    # build a map of seq identifiers as the RDP classifier doesn't
    # preserve these perfectly
    identifier_lookup = {}
    for seq_id, seq in MinimalFastaParser(data):
        identifier_lookup[seq_id.split()[0]] = seq_id

    # build the classifier object
    app = RdpClassifier20()
    if max_memory is not None:
        app.Parameters['-Xmx'].on(max_memory)
    if training_data_fp is not None:
        app.Parameters['-training-data'].on(training_data_fp)

    # apply the rdp app controller
    rdp_result = app('\n'.join(data))
    # grab assignment output
    result_lines = rdp_result['Assignments']

    results = {}

    # ShortSequenceException messages are written to stdout
    # Tag these ID's as unassignable
    record_id_pattern = re.compile(r'recordID=(\S+)')
    for line in rdp_result['StdOut']:
        if line.startswith('ShortSequenceException'):
            matchobj = record_id_pattern.search(line)
            if matchobj:
                rdp_id = matchobj.group(1)
                orig_id = identifier_lookup[rdp_id]
                results[orig_id] = ('Unassignable', 1.0)

    # iterate over the identifier, assignment strings (this is a bit
    # of an abuse of the MinimalFastaParser, as these are not truly
    # fasta lines)
    for identifier, assignment_str in MinimalFastaParser(result_lines):
        # get the original identifier from the one in the rdp result
        identifier = identifier_lookup[\
         identifier[:identifier.index('reverse=')].strip()]
        # the assignments we're confident in (i.e., the ones with a
        # confidence of at least min_confidence)
        confident_assignments = []
        # confidence of the last assignment that was accepted
        lowest_confidence = 0.0
        # the string alternates "assignment;confidence;assignment;..."
        assignment_fields = assignment_str.split(';')
        for i in range(0, len(assignment_fields), 2):
            assignment = assignment_fields[i]
            try:
                assignment_confidence = float(assignment_fields[i + 1])
            except IndexError:
                # trailing assignment without a confidence value
                break
            if assignment_confidence >= min_confidence:
                confident_assignments.append(assignment.strip())
                lowest_confidence = assignment_confidence
            else:
                # first assignment below the threshold ends the lineage
                break

        # store the identifier, the semi-colon-separated assignments, and the
        # confidence for the last assignment
        results[identifier] = \
         (';'.join(confident_assignments), lowest_confidence)

    if output_fp:
        # NOTE(review): on Python 2 open() raises IOError, which this
        # handler does not catch -- confirm whether that is intended.
        try:
            output_file = open(output_fp, 'w')
        except OSError:
            raise OSError(
                "Can't open output file for writing: %s" % output_fp)
        # the file is closed even if writing a line fails
        with output_file:
            for seq_id, values in results.items():
                output_file.write('%s\t%s\t%1.3f\n' %
                                  (seq_id, values[0], values[1]))
        return None
    else:
        return results
def process_silva(seqs, tax_out, seq_out):
    """Split fasta records into a taxonomy table and a cleaned fasta file.

    seqs: fasta lines or open fasta file, parsed by MinimalFastaParser
    tax_out: writable object receiving "<new header><TAB><taxonomy>" lines
    seq_out: writable object receiving the records as fasta, using the new
     header from parse_label() and the sequence returned by parse_seq()
    """
    for label, seq in MinimalFastaParser(seqs):
        new_header, taxonomy = parse_label(label)
        fixed_seq = parse_seq(seq)

        tax_line = '\t'.join((new_header, taxonomy)) + '\n'
        fasta_record = '\n'.join(('>' + new_header, fixed_seq)) + '\n'

        tax_out.write(tax_line)
        seq_out.write(fasta_record)
def assign_dna_reads_to_protein_database(query_fasta_fp,
                                         database_fasta_fp,
                                         output_fp,
                                         temp_dir="/tmp",
                                         params=None):
    """Assign DNA reads to a database fasta of protein sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed. A temporary
    file must be written containing the translated sequences from the input
    query fasta file because BLAT cannot do this automatically.

    query_fasta_fp: absolute path to the query fasta file containing DNA
                   sequences.
    database_fasta_fp: absolute path to the database fasta file containing
                      protein sequences.
    output_fp: absolute path where the output file will be generated.
    temp_dir: optional. Change the location where the translated sequences
              will be written before being used as the query. Defaults to
              /tmp.
    params: optional. dict containing parameter settings to be used
            instead of default values. Cannot change database or query
            file types from protein and dna, respectively.  A
            'genetic_code' entry selects the entry of GeneticCodes used
            for translation (default 1) and is not passed on.  The dict
            is not modified.

    Returns whatever assign_reads_to_database returns (documented in the
    original as an open file object; the output format defaults to blast9
    and should be parsable by the PyCogent BLAST parsers).

    Raises ApplicationError if temp_dir is not absolute or if params
    contains '-t' or '-q'.
    """
    if params is None:
        params = {}

    # both are 'prot': the DNA query is translated before it is searched
    my_params = {'-t': 'prot', '-q': 'prot'}

    # make sure temp_dir specifies an absolute path
    if not isabs(temp_dir):
        raise ApplicationError("temp_dir must be an absolute path.")

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError(
            "Cannot change database or query types "
            "when using assign_dna_reads_to_protein_database. Use "
            "assign_reads_to_database instead.")

    # work on a copy so that the caller's dict is left untouched
    user_params = dict(params)
    my_genetic_code = GeneticCodes[user_params.pop('genetic_code', 1)]
    my_params.update(user_params)

    # get six-frame translation of the input DNA sequences and write them to
    # temporary file.
    tmp = get_tmp_filename(tmp_dir=temp_dir, result_constructor=str)
    frames = [1, 2, 3, -1, -2, -3]
    entry = '>{seq_id}_frame_{frame}\n{trans}\n'

    try:
        with open(tmp, 'w') as tmp_out:
            with open(query_fasta_fp) as query_f:
                for label, sequence in MinimalFastaParser(query_f):
                    seq_id = label.split()[0]

                    s = DNA.makeSequence(sequence)
                    translations = dict(
                        zip(frames, my_genetic_code.sixframes(s)))

                    # written in ascending frame order: -3 ... 3
                    for frame, translation in sorted(translations.items()):
                        tmp_out.write(entry.format(seq_id=seq_id,
                                                   frame=frame,
                                                   trans=translation))

        result = assign_reads_to_database(tmp,
                                          database_fasta_fp,
                                          output_fp,
                                          params=my_params)
    except Exception:
        # best-effort removal of the temporary file; the original error
        # must not be masked by a failing cleanup
        try:
            remove(tmp)
        except OSError:
            pass
        raise

    remove(tmp)

    return result
def get_hits_data(primer, primer_id, fasta_fp, tp_len, last_base_mm, tp_mm,
                  non_tp_mm, tp_gap, non_tp_gap):
    """ Finds mismatches, gaps, scores for primer/seqs sets

    Returns a list of lines of hits data for writing to the output hits
    file, and a list of lists containing the mismatches, gaps, and weighted
    scores for writing a histogram file.

    primer: current primer (DNA.Sequence object)
    primer_id: current primer name
    fasta_fp: current open fasta filepath object to test primers against;
        its .name attribute is used for the titles and header lines
    tp_len: three prime length
    last_base_mm: penalty for last base mismatch
    tp_mm: three prime mismatch penalty
    non_tp_mm: non three prime mismatch penalty
    tp_gap: penalty for three prime gaps
    non_tp_gap: penalty for non three prime gaps

    The second return value is the list
    [non_tp_mm_data, tp_mm_data, non_tp_gap_data, tp_gap_data,
     weighted_score_data, last_base_mm_data, figure_title,
     weighted_score_subtext].
    """
    fasta_name = basename(fasta_fp.name)

    # Contains header, parameters, comments for the output hits file.
    # The explanatory note and the column header are a single list element.
    hits_lines = [
        "# Primer: %s 5'-%s-3'" % (primer.Name, primer),
        '# Input fasta file: %s' % fasta_name,
        '# Parameters',
        '# 3\' length: %d' % tp_len,
        '# non 3\' mismatch penalty: %1.2f per mismatch' % non_tp_mm,
        '# 3\' mismatch penalty: %1.2f per mismatch' % tp_mm,
        '# last base mismatch penalty: %1.2f' % last_base_mm,
        '# non 3\' gap penalty: %1.2f per gap' % non_tp_gap,
        '# 3\' gap penalty: %1.2f per gap' % tp_gap,
        ('# Note - seq hit and primer hit are the best local pairwise alignment '
         'results for a given sequence and primer pair. A gap in seq hit '
         'represents a '
         'deletion in the sequence, whereas a gap in the primer hit signifies '
         'an insertion in the target sequence.\n#\n'
         '# seq ID, seq hit, primer hit, hit start position, non 3\' mismatches, '
         '3\' mismatches (except last base), last base mismatch, '
         'non 3\' gaps, 3\' gaps, overall weighted score, '
         'hits sequence end '),
    ]

    # get primer length to test for hitting sequence end
    primer_len = len(primer)

    # Calculate range of GC content, accounting for degeneracies.
    # float() keeps these as fractions even when the module does not enable
    # true division; plain int / int would truncate to 0 or 1.
    min_gc = sum([primer.count(c) for c in 'GCS']) / float(primer_len)
    max_gc = (sum([primer.count(c) for c in 'GCSNRYKMBDHV']) /
              float(primer_len))

    # Put together strings for text in output summary graphs
    degen_gc_content = '%s; Degeneracy: %d; GC content %.2f - %.2f' % \
        (primer_id, primer.possibilities(), min_gc, max_gc)
    primer_title = '\n5\'-%s-3\'' % str(primer)
    seq_collection_title = '\nSequences tested: ' + fasta_name
    figure_title = degen_gc_content + primer_title + seq_collection_title

    # Weighted score strings for the bottom of the histogram, following
    # weighted score results.
    tp_len_title = '3\' length: %d nucleotides' % tp_len
    weighted_score_info = (
        "\nWeighted score = non-3' mismatches * "
        "%1.2f + 3' mismatches * %1.2f + non 3' gaps * %1.2f + 3\' gaps * %1.2f"
        % (non_tp_mm, tp_mm, non_tp_gap, tp_gap))
    last_base_info = (
        '\nAn additional %1.2f penalty is assigned if the ' % last_base_mm +
        'final 3\' base mismatches')
    rounded_clause = ('\nWeighted score is rounded to the nearest whole '
                      'number in this graphical display')
    weighted_score_subtext = (tp_len_title + weighted_score_info +
                              last_base_info + rounded_clause)

    # Set upper limit for purpose of displaying data on histograms
    max_mm = 5
    max_gaps = 5
    max_weighted_score = 5.0

    non_tp_mm_data = []
    tp_mm_data = []
    non_tp_gap_data = []
    tp_gap_data = []
    weighted_score_data = []
    last_base_mm_data = []

    primer_seq = primer_to_match_query(primer)

    for label, seq in MinimalFastaParser(fasta_fp):
        primer_hit, target_hit, hit_start = \
            local_align_primer_seq(primer_seq, seq)

        # Get score, numbers of gaps/mismatches
        (weighted_score, non_tp_gaps, tp_gaps, non_tp_mismatches,
         tp_mismatches, last_base_mismatches) = score_primer(
            primer, primer_hit, target_hit, tp_len, last_base_mm, tp_mm,
            non_tp_mm, tp_gap, non_tp_gap)

        # Append data to lists for generating histograms.
        # Values are capped at the display limit for readability of the
        # output histogram; the uncapped values still go to hits_lines.
        non_tp_mm_data.append(min(non_tp_mismatches, max_mm))
        tp_mm_data.append(min(tp_mismatches, max_mm))
        non_tp_gap_data.append(min(non_tp_gaps, max_gaps))
        tp_gap_data.append(min(tp_gaps, max_gaps))
        if weighted_score <= max_weighted_score:
            # Round-trip through a string to keep two decimal places.
            weighted_score_data.append(float('%2.2f' % weighted_score))
        else:
            weighted_score_data.append(max_weighted_score)
        last_base_mm_data.append(1 if last_base_mismatches else 0)

        # Determine if primer hits sequence end
        # Difficult to use this in scoring, but can be parsed out if one wants
        # to determine if primer sequences were left in fasta sequences
        hits_sequence_end = hits_seq_end(seq, hit_start, primer_len)

        # Append hit info for output hits file data
        # Label is split to just contain fasta ID
        hit_fields = [label.split()[0], target_hit, primer_hit, hit_start,
                      non_tp_mismatches, tp_mismatches,
                      bool(last_base_mismatches), non_tp_gaps, tp_gaps,
                      weighted_score, hits_sequence_end]
        hits_lines.append(','.join(map(str, hit_fields)))

    # Make list of all histogram data lists so only one data item being
    # passed around
    hist_data = [non_tp_mm_data, tp_mm_data, non_tp_gap_data, tp_gap_data,
                 weighted_score_data, last_base_mm_data, figure_title,
                 weighted_score_subtext]

    return hits_lines, hist_data
def setUp(self):
    """Build the alignment fixture stored on self.seqs."""
    # test_seqs is split on whitespace and the resulting tokens are fed to
    # the parser as the lines of a FASTA file.
    fasta_lines = test_seqs.split()
    label_to_seq = dict(MinimalFastaParser(fasta_lines))
    self.seqs = Alignment(label_to_seq)
def get_seqs_to_keep_lookup_from_fasta_file(fasta_f):
    """Return the set of sequence ids found in the fasta file.

    fasta_f: fasta input handed to MinimalFastaParser.

    Each id is the first whitespace-delimited token of a record's label.
    """
    seq_ids = set()
    for label, _ in MinimalFastaParser(fasta_f):
        seq_ids.add(label.split()[0])
    return seq_ids
def main():
    """Filter the OTUs of a BIOM table and write the filtered table.

    Options are read with parse_command_line_parameters(**script_info).
    Invalid option combinations are reported through option_parser.error:
    a min_count_fraction outside [0, 1], both min_count and
    min_count_fraction given, or no filtering criterion at all.

    When min_count_fraction is positive, min_count is derived from it as a
    fraction of the table's total count, and the total and the derived
    min_count are printed.

    If an exclusion file is given, its ids are removed from the set of OTU
    ids to keep. Files ending in .fasta or .fna are parsed as fasta (first
    token of each label); any other file is read line by line and the first
    tab-separated field of each line is used.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")

    min_samples = opts.min_samples
    max_samples = opts.max_samples

    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    filtering_requested = (min_count != 0 or
                           min_count_fraction != 0 or
                           not isinf(max_count) or
                           otu_ids_to_exclude_fp is not None or
                           min_samples != 0 or
                           not isinf(max_samples))
    if not filtering_requested:
        option_parser.error(
            "No filtering requested. Must provide either "
            "min counts, max counts, min samples, max samples, min_count_fraction, "
            "or exclude_fp (or some combination of those).")

    with open(input_fp, 'U') as input_f:
        otu_table = parse_biom_table(input_f)

    if min_count_fraction > 0:
        total_count = otu_table.sum()
        min_count = total_count * min_count_fraction
        print("%s %s" % (total_count, min_count))

    otu_ids_to_keep = set(otu_table.ObservationIds)

    if otu_ids_to_exclude_fp:
        with open(otu_ids_to_exclude_fp, 'U') as exclude_f:
            if otu_ids_to_exclude_fp.endswith(('.fasta', '.fna')):
                otu_ids_to_exclude = set(
                    id_.strip().split()[0]
                    for id_, seq in MinimalFastaParser(exclude_f))
            else:
                otu_ids_to_exclude = set(
                    l.strip().split('\t')[0] for l in exclude_f)
        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table,
                                                    otu_ids_to_keep,
                                                    min_count,
                                                    max_count,
                                                    min_samples,
                                                    max_samples,
                                                    negate_ids_to_exclude)

    # Format first and open the output last, so that a failure while
    # filtering or formatting does not leave a truncated output file behind.
    formatted_table = format_biom_table(filtered_otu_table)
    with open(output_fp, 'w') as output_f:
        output_f.write(formatted_table)
def assign_taxonomy(
        data, min_confidence=0.80, output_fp=None, training_data_fp=None,
        fixrank=True, max_memory=None, tmp_dir=None):
    """Assign taxonomy to each sequence in data with the RDP classifier

    data: open fasta file object or list of fasta lines
    min_confidence: minimum support threshold to assign taxonomy to a
        sequence; passed to get_rdp_lineage
    output_fp: path to write output; if not provided, result will be
        returned in a dict of {seq_id:(taxonomy_assignment,confidence)}
    training_data_fp: if not None, turned on as the classifier's '-t'
        parameter
    fixrank: selects the value of the classifier's '-f' parameter:
        'fixrank' when true, 'allrank' otherwise
    max_memory: if not None, turned on as the classifier's '-Xmx' parameter
    tmp_dir: if not None, used as the classifier's TmpDir and as the
        directory of the temporary assignments file

    When output_fp is given, one tab-separated line
    (seq_id, lineage, confidence) is written per sequence and None is
    returned.
    """
    # Going to iterate through this twice in succession, best to force
    # evaluation now
    data = list(data)

    # RDP classifier doesn't preserve identifiers with spaces
    # Use lookup table
    seq_id_lookup = {}
    for seq_id, seq in MinimalFastaParser(data):
        seq_id_lookup[seq_id.split()[0]] = seq_id

    app_kwargs = {}
    if tmp_dir is not None:
        app_kwargs['TmpDir'] = tmp_dir
    app = RdpClassifier(**app_kwargs)

    if max_memory is not None:
        app.Parameters['-Xmx'].on(max_memory)

    # Only the name is handed to the classifier. The object stays referenced
    # for the rest of this function because a NamedTemporaryFile is deleted
    # as soon as it is closed.
    temp_output_file = tempfile.NamedTemporaryFile(
        prefix='RdpAssignments_', suffix='.txt', dir=tmp_dir)
    app.Parameters['-o'].on(temp_output_file.name)

    if training_data_fp is not None:
        app.Parameters['-t'].on(training_data_fp)

    if fixrank:
        app.Parameters['-f'].on('fixrank')
    else:
        app.Parameters['-f'].on('allrank')

    app_result = app(data)

    assignments = {}

    # ShortSequenceException messages are written to stdout
    # Tag these ID's as unassignable
    for line in app_result['StdOut']:
        excep = parse_rdp_exception(line)
        if excep is not None:
            _, rdp_id = excep
            orig_id = seq_id_lookup[rdp_id]
            assignments[orig_id] = ('Unassignable', 1.0)

    for line in app_result['Assignments']:
        rdp_id, direction, taxa = parse_rdp_assignment(line)
        # Drop the leading "Root" entry so it does not appear in the lineage.
        if taxa[0][0] == "Root":
            taxa = taxa[1:]
        orig_id = seq_id_lookup[rdp_id]
        lineage, confidence = get_rdp_lineage(taxa, min_confidence)
        if lineage:
            assignments[orig_id] = (';'.join(lineage), confidence)
        else:
            assignments[orig_id] = ('Unclassified', 1.0)

    if not output_fp:
        return assignments

    # NOTE(review): on Python 2, open() raises IOError, which this handler
    # does not intercept -- confirm whether IOError should be caught too.
    try:
        output_file = open(output_fp, 'w')
    except OSError:
        raise OSError("Can't open output file for writing: %s" % output_fp)

    # The with block closes the file even if a write fails part-way.
    with output_file:
        for seq_id, (lineage, confidence) in assignments.items():
            output_file.write(
                '%s\t%s\t%1.3f\n' % (seq_id, lineage, confidence))
    return None
def filter_fasta_fp(input_seqs_fp, output_seqs_fp, seqs_to_keep, negate=False):
    """Filter a fasta file to include only sequences listed in seqs_to_keep

    input_seqs_fp: path of the fasta file to read
    output_seqs_fp: path of the fasta file to write
    seqs_to_keep: passed through to filter_fasta
    negate: passed through to filter_fasta

    Returns whatever filter_fasta returns. Both files are closed when this
    function exits, including when filter_fasta raises.
    """
    # NOTE(review): assumes filter_fasta has finished reading and writing by
    # the time it returns -- confirm it is not a lazy generator.
    with open(input_seqs_fp, 'U') as input_f:
        with open(output_seqs_fp, 'w') as output_f:
            input_seqs = MinimalFastaParser(input_f)
            return filter_fasta(input_seqs, output_f, seqs_to_keep, negate)