def setUp(self):
    # create a list of files to cleanup
    self._paths_to_clean_up = []
    self._dirs_to_clean_up = []

    # load query seqs
    self.seqs = Alignment(parse_fasta(QUERY_SEQS.split()))

    # generate temp filename
    tmp_dir = '/tmp'
    self.outfile = get_tmp_filename(tmp_dir)

    # create and write out reference sequence file
    self.outfasta = splitext(self.outfile)[0] + '.fasta'
    fastaout = open(self.outfasta, 'w')
    fastaout.write(REF_SEQS)
    fastaout.close()
    self._paths_to_clean_up.append(self.outfasta)

    # create and write out starting tree file
    self.outtree = splitext(self.outfile)[0] + '.tree'
    treeout = open(self.outtree, 'w')
    treeout.write(REF_TREE)
    treeout.close()
    self._paths_to_clean_up.append(self.outtree)
def cluster_seqs(seqs,
                 neighbor_join=False,
                 params={},
                 add_seq_names=True,
                 WorkingDir=tempfile.gettempdir(),
                 SuppressStderr=None,
                 SuppressStdout=None,
                 max_chars=1000000,
                 max_hours=1.0,
                 constructor=PhyloNode,
                 clean_up=True
                 ):
    """Muscle cluster list of sequences.

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.

    Addl docs coming soon
    """
    num_seqs = len(seqs)
    if num_seqs < 2:
        raise ValueError, "Muscle requires 2 or more sequences to cluster."

    num_chars = sum(map(len, seqs))
    if num_chars > max_chars:
        params["-maxiters"] = 2
        params["-diags1"] = True
        params["-sv"] = True
        #params["-distance1"] = "kmer6_6"
        #params["-distance1"] = "kmer20_3"
        #params["-distance1"] = "kbit20_3"
        print "lots of chars, using fast align", num_chars

    params["-maxhours"] = max_hours
    #params["-maxiters"] = 10

    #cluster_type = "upgmb"
    #if neighbor_join:
    #    cluster_type = "neighborjoining"

    params["-clusteronly"] = True
    params["-tree1"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                             params=params,
                             add_seq_names=add_seq_names,
                             WorkingDir=WorkingDir,
                             SuppressStderr=SuppressStderr,
                             SuppressStdout=SuppressStdout)

    tree = DndParser(muscle_res["Tree1Out"], constructor=constructor)

    if clean_up:
        muscle_res.cleanUp()
    return tree
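# Usage sketch (illustrative, not part of the original module): clusters a few
# short in-memory sequence strings with cluster_seqs above. The sequence data is
# an assumption; a working muscle binary on the PATH is required.
def _example_cluster_seqs():
    seqs = ['ACGTACGTGCTAGCTA',
            'ACGTACGTGCTAGCTT',
            'ACGAACGTGCTAGCTA']
    tree = cluster_seqs(seqs, params={})
    # the return value is a PhyloNode, so it can be written back out as newick
    print tree.getNewick(with_distances=True)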
def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params={}):
    """Returns a tree from Alignment object aln.

    aln: an xxx.Alignment object, or data that can be used to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: best_tree support is currently not implemented

    params: dict of parameters to pass in to the RAxML app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if the
    tree fails.
    """
    if best_tree:
        raise NotImplementedError

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            #params["-m"] = 'GTRMIX'
            # in version 7.2.3, GTRMIX is no longer supported; the docs say
            # GTRCAT behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html)
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError, "Moltype must be either DNA, RNA, or PROTEIN"

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    params["-p"] = randint(1, 100000)
    params["-x"] = randint(1, 100000)

    ih = '_input_as_multiline_string'

    raxml_app = Raxml(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)

    raxml_result = raxml_app(seqs)

    tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

    for node in tree.tips():
        node.Name = align_map[node.Name]

    raxml_result.cleanUp()

    return tree
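# Usage sketch (illustrative, not from the source): builds a RAxML tree from a
# tiny aligned dict. The sequences and labels are assumptions; RAxML must be
# installed and /tmp must be writable, since the function above hard-codes "-w".
def _example_build_raxml_tree():
    aligned = {'seq1': 'ACGT-ACGTACGT',
               'seq2': 'ACGTTACGTACGT',
               'seq3': 'ACGTTACGTACGA',
               'seq4': 'ACCTTACGTACGA'}
    tree = build_tree_from_alignment(aligned, moltype=DNA, params={})
    # tips carry the original names, restored from the phylip relabelling
    print sorted(tip.Name for tip in tree.tips())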
def aln_tree_seqs(seqs,
                  input_handler=None,
                  tree_type='neighborjoining',
                  params={},
                  add_seq_names=True,
                  WorkingDir=tempfile.gettempdir(),
                  SuppressStderr=None,
                  SuppressStdout=None,
                  max_hours=5.0,
                  constructor=PhyloNode,
                  clean_up=True
                  ):
    """Muscle align sequences and report tree from iteration2.

    Unlike cluster_seqs, this returns tree2, the tree made during the second
    muscle iteration (it should be more accurate than the cluster from the
    first iteration, which is built quickly from k-mer words).

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    tree_type: can be either neighborjoining (default) or upgmb for UPGMA
    clean_up: When true, will clean up output files
    """
    params["-maxhours"] = max_hours
    if tree_type:
        params["-cluster2"] = tree_type
    params["-tree2"] = get_tmp_filename(WorkingDir)
    params["-out"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                             input_handler=input_handler,
                             params=params,
                             add_seq_names=add_seq_names,
                             WorkingDir=WorkingDir,
                             SuppressStderr=SuppressStderr,
                             SuppressStdout=SuppressStdout)
    tree = DndParser(muscle_res["Tree2Out"], constructor=constructor)
    aln = [line for line in muscle_res["MuscleOut"]]

    if clean_up:
        muscle_res.cleanUp()
    return tree, aln
def raxml_alignment(align_obj,
                    raxml_model="GTRCAT",
                    params={},
                    SuppressStderr=True,
                    SuppressStdout=True):
    """Run raxml on alignment object

    align_obj: Alignment object
    params: you can set any params except -w and -n

    returns: tuple (phylonode,
                    parsimonyphylonode,
                    log likelihood,
                    total exec time)
    """
    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-m"] = raxml_model
    params["-p"] = randint(1, 100000)
    ih = '_input_as_multiline_string'
    seqs, align_map = align_obj.toPhylip()

    #print params["-n"]

    # set up command
    raxml_app = Raxml(
        params=params,
        InputHandler=ih,
        WorkingDir=None,
        SuppressStderr=SuppressStderr,
        SuppressStdout=SuppressStdout)

    # run raxml
    ra = raxml_app(seqs)

    # generate tree
    tree_node = DndParser(ra["Result"])

    # generate parsimony tree
    parsimony_tree_node = DndParser(ra["ParsimonyTree"])

    # extract log likelihood from log file
    log_file = ra["Log"]
    total_exec_time = exec_time = log_likelihood = 0.0
    for line in log_file:
        exec_time, log_likelihood = map(float, line.split())
        total_exec_time += exec_time

    # remove output files
    ra.cleanUp()

    return tree_node, parsimony_tree_node, log_likelihood, total_exec_time
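# Usage sketch (illustrative): runs raxml_alignment on an Alignment built from a
# small aligned dict and unpacks the four return values. The sequence data is an
# assumption; a RAxML binary and a writable /tmp are required.
def _example_raxml_alignment():
    align_obj = Alignment({'a': 'ACGTACGTACGTA',
                           'b': 'ACGTACGTACGTT',
                           'c': 'ACCTACGTACGTA',
                           'd': 'ACCTACGAACGTA'})
    tree, parsimony_tree, lnl, secs = raxml_alignment(align_obj)
    print "best-scoring ML tree:", tree.getNewick()
    print "log likelihood: %f, runtime: %f s" % (lnl, secs)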
def test_insert_sequences_into_tree(self):
    """Inserts sequences into Tree using params - test handles tree-insertion"""

    # generate temp filename for output
    outfname = splitext(get_tmp_filename('/tmp/'))[0]

    # create starting tree
    outtreefname = outfname + '.tre'
    outtree = open(outtreefname, 'w')
    outtree.write(REF_TREE)
    outtree.close()

    # set params for tree-insertion
    params = {}
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-f"] = 'v'
    #params["-G"] = '0.25'
    params["-t"] = outtreefname
    params["-m"] = 'GTRGAMMA'

    aln_ref_query = get_align_for_phylip(StringIO(PHYLIP_FILE_DNA_REF_QUERY))
    aln = Alignment(aln_ref_query)
    seqs, align_map = aln.toPhylip()

    tree = insert_sequences_into_tree(seqs, DNA, params=params,
                                      write_log=False)

    for node in tree.tips():
        removed_query_str = re.sub('QUERY___', '', str(node.Name))
        new_node_name = re.sub('___\d+', '', str(removed_query_str))
        if new_node_name in align_map:
            node.Name = align_map[new_node_name]

    self.assertTrue(isinstance(tree, PhyloNode))
    self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
    self.assertEqual(len(tree.tips()), 7)
    self.assertRaises(NotImplementedError, build_tree_from_alignment,
                      self.align1, RNA, True)

    remove(outtreefname)
def insert_sequences_into_tree(seqs, moltype, params={}, write_log=True):
    """Insert sequences into Tree.

    seqs: aligned reference and query sequences as a multiline string (e.g.
        phylip), passed directly to RAxML.

    moltype: cogent.core.moltype.MolType object

    params: dict of parameters to pass in to the RAxML app controller.

    The result will be a tree.
    """
    ih = '_input_as_multiline_string'

    raxml_app = Raxml(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=False,
                      SuppressStdout=False,
                      HALT_EXEC=False)

    raxml_result = raxml_app(seqs)

    # write a log file
    if write_log:
        log_fp = join(params["-w"],
                      'log_raxml_' + split(get_tmp_filename())[-1])
        log_file = open(log_fp, 'w')
        log_file.write(raxml_result['StdOut'].read())
        log_file.close()

    '''
    # getting setup since parsimony doesn't output tree..only jplace, however
    # it is currently corrupt

    # use guppy to convert json file into a placement tree
    guppy_params = {'tog': None}

    new_tree = build_tree_from_json_using_params(raxml_result['json'].name,
                                                 output_dir=params["-w"],
                                                 params=guppy_params)
    '''

    # get tree from 'Result Names'
    new_tree = raxml_result['Result'].readlines()
    filtered_tree = re.sub('\[I\d+\]', '', str(new_tree))
    tree = DndParser(filtered_tree, constructor=PhyloNode)

    raxml_result.cleanUp()

    return tree
def insert_sequences_into_tree(aln, moltype, params={}, write_log=True):
    """Insert sequences into a tree using pplacer.

    aln: an xxx.Alignment object, or data that can be used to build one.

    moltype: cogent.core.moltype.MolType object

    params: dict of parameters to pass in to the pplacer app controller.

    The result will be a cogent.core.tree.PhyloNode object (the placement
    tree), or None if the tree fails.
    """
    # convert aln to phy since seq_names need fixed to run through pplacer
    new_aln = get_align_for_phylip(StringIO(aln))

    # convert aln to fasta in case it is not already a fasta file
    aln2 = Alignment(new_aln)
    seqs = aln2.toFasta()

    ih = '_input_as_multiline_string'

    pplacer_app = Pplacer(params=params,
                          InputHandler=ih,
                          WorkingDir=None,
                          SuppressStderr=False,
                          SuppressStdout=False)

    pplacer_result = pplacer_app(seqs)

    # write a log file
    if write_log:
        log_fp = join(params["--out-dir"],
                      'log_pplacer_' + split(get_tmp_filename())[-1])
        log_file = open(log_fp, 'w')
        log_file.write(pplacer_result['StdOut'].read())
        log_file.close()

    # use guppy to convert json file into a placement tree
    guppy_params = {'tog': None}

    new_tree = build_tree_from_json_using_params(pplacer_result['json'].name,
                                                 output_dir=params['--out-dir'],
                                                 params=guppy_params)

    pplacer_result.cleanUp()

    return new_tree
def build_tree_from_distance_matrix(matrix, best_tree=False, params={},
                                    working_dir='/tmp'):
    """Returns a tree from a distance matrix.

    matrix: a square Dict2D object (cogent.util.dict2d)

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if the
    tree fails.
    """
    params['--out'] = get_tmp_filename(working_dir)

    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(InputHandler='_input_as_multiline_string', params=params,
                   WorkingDir=working_dir, SuppressStdout=True,
                   SuppressStderr=True)
    #Turn off input as alignment
    app.Parameters['-a'].off()
    #Input is a distance matrix
    app.Parameters['-d'].on()

    if best_tree:
        app.Parameters['-N'].on()

    # Turn the dict2d object into the expected input format
    matrix_input, int_keys = _matrix_input_from_dict2d(matrix)

    # Collect result
    result = app(matrix_input)

    # Build tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)

    # reassign to original names
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del(app, result, params)

    return tree
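# Usage sketch (illustrative): feeds a small symmetric distance matrix, wrapped
# in a Dict2D, to build_tree_from_distance_matrix. The import path and the
# distances are assumptions; a clearcut binary must be on the PATH.
def _example_tree_from_distances():
    from cogent.util.dict2d import Dict2D
    dists = {'a': {'a': 0.0, 'b': 0.2, 'c': 0.7, 'd': 0.8},
             'b': {'a': 0.2, 'b': 0.0, 'c': 0.7, 'd': 0.8},
             'c': {'a': 0.7, 'b': 0.7, 'c': 0.0, 'd': 0.3},
             'd': {'a': 0.8, 'b': 0.8, 'c': 0.3, 'd': 0.0}}
    tree = build_tree_from_distance_matrix(Dict2D(dists), best_tree=True)
    print tree.getNewick(with_distances=True)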
def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: unsupported

    params: dict of parameters to pass in to the Muscle app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if the
    tree fails.
    """
    # Create instance of app controller, enable tree, disable alignment
    app = Muscle(InputHandler='_input_as_multiline_string', params=params,
                 WorkingDir=tempfile.gettempdir())

    app.Parameters['-clusteronly'].on()
    app.Parameters['-tree1'].on(get_tmp_filename(app.WorkingDir))
    app.Parameters['-seqtype'].on(moltype.label)

    seq_collection = SequenceCollection(aln, MolType=moltype)

    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree1Out'].read(), constructor=PhyloNode)
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    # Clean up
    result.cleanUp()
    del(seq_collection, app, result)

    return tree
def align_unaligned_seqs(seqs, moltype=DNA, params=None):
    """Returns an Alignment object from seqs.

    seqs: SequenceCollection object, or data that can be used to build one.

    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Muscle app controller.

    Result will be an Alignment object.
    """
    if not params:
        params = {}
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    #get temporary filename
    params.update({'-out': get_tmp_filename()})

    #Create Muscle app.
    app = Muscle(InputHandler='_input_as_multiline_string',
                 params=params,
                 WorkingDir=tempfile.gettempdir())

    #Get results using int_map as input to app
    res = app(int_map.toFasta())

    #Get alignment as dict out of results
    alignment = dict(parse_fasta(res['MuscleOut']))

    #Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[int_keys[k]] = v

    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment, MolType=moltype)

    #Clean up
    res.cleanUp()
    del(seq_collection, int_map, int_keys, app, res, alignment, params)

    return new_alignment
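# Usage sketch (illustrative): aligns three unaligned DNA sequences with
# align_unaligned_seqs and prints the result as FASTA. The sequence data is an
# assumption; a muscle binary is required.
def _example_align_unaligned_seqs():
    seqs = {'s1': 'ACGTACGTGAACCGT',
            's2': 'ACGTACGTGACCGT',
            's3': 'ACGTAGGTGAACCGT'}
    aln = align_unaligned_seqs(seqs, moltype=DNA)
    print aln.toFasta()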
def setUp(self):
    '''setup the files for testing pplacer'''

    # create a list of files to cleanup
    self._paths_to_clean_up = []
    self._dirs_to_clean_up = []

    # get a tmp filename to use
    basename = splitext(get_tmp_filename())[0]

    # create and write out RAxML stats file
    self.stats_fname = basename + '.stats'
    stats_out = open(self.stats_fname, 'w')
    stats_out.write(RAXML_STATS)
    stats_out.close()
    self._paths_to_clean_up.append(self.stats_fname)

    # create and write out reference sequence file
    self.refseq_fname = basename + '_refseqs.fasta'
    refseq_out = open(self.refseq_fname, 'w')
    refseq_out.write(REF_SEQS)
    refseq_out.close()
    self._paths_to_clean_up.append(self.refseq_fname)

    # create and write out query sequence file
    self.query_fname = basename + '_queryseqs.fasta'
    query_out = open(self.query_fname, 'w')
    query_out.write(QUERY_SEQS)
    query_out.close()
    self._paths_to_clean_up.append(self.query_fname)

    # create and write out starting tree file
    self.tree_fname = basename + '.tre'
    tree_out = open(self.tree_fname, 'w')
    tree_out.write(REF_TREE)
    tree_out.close()
    self._paths_to_clean_up.append(self.tree_fname)
def align_two_alignments(aln1, aln2, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
    used to build them.

    params: dict of parameters to pass in to the Muscle app controller.
    """
    if not params:
        params = {}

    #create SequenceCollection object from aln1
    aln1_collection = SequenceCollection(aln1)
    #Create mapping between abbreviated IDs and full IDs
    aln1_int_map, aln1_int_keys = aln1_collection.getIntMap(prefix='aln1_')
    #Create SequenceCollection from int_map.
    aln1_int_map = SequenceCollection(aln1_int_map)

    #create SequenceCollection object from aln2
    aln2_collection = SequenceCollection(aln2)
    #Create mapping between abbreviated IDs and full IDs
    aln2_int_map, aln2_int_keys = aln2_collection.getIntMap(prefix='aln2_')
    #Create SequenceCollection from int_map.
    aln2_int_map = SequenceCollection(aln2_int_map)

    #set output and profile options
    params.update({'-out': get_tmp_filename(), '-profile': True})

    #save aln1 to tmp file
    aln1_filename = get_tmp_filename()
    aln1_out = open(aln1_filename, 'w')
    aln1_out.write(aln1_int_map.toFasta())
    aln1_out.close()

    #save aln2 to tmp file
    aln2_filename = get_tmp_filename()
    aln2_out = open(aln2_filename, 'w')
    aln2_out.write(aln2_int_map.toFasta())
    aln2_out.close()

    #Create Muscle app and get results
    app = Muscle(InputHandler='_input_as_multifile', params=params,
                 WorkingDir=tempfile.gettempdir())
    res = app((aln1_filename, aln2_filename))

    #Get alignment as dict out of results
    alignment = dict(parse_fasta(res['MuscleOut']))

    #Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        if k in aln1_int_keys:
            new_alignment[aln1_int_keys[k]] = v
        else:
            new_alignment[aln2_int_keys[k]] = v

    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment)

    #Clean up
    res.cleanUp()
    del(aln1_collection, aln1_int_map, aln1_int_keys)
    del(aln2_collection, aln2_int_map, aln2_int_keys)
    del(app, res, alignment, params)
    remove(aln1_filename)
    remove(aln2_filename)

    return new_alignment
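# Usage sketch (illustrative): profile-aligns two small pre-made alignments with
# align_two_alignments. The sequences are assumptions; a muscle binary is
# required.
def _example_align_two_alignments():
    aln1 = {'a1': 'ACGT-ACGT', 'a2': 'ACGTTACGT'}
    aln2 = {'b1': 'ACGTAC', 'b2': 'AC-TAC'}
    merged = align_two_alignments(aln1, aln2)
    print merged.toFasta()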
def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params={},
                              working_dir='/tmp'):
    """Returns a tree from Alignment object aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one.
        -  Clearcut only accepts aligned sequences.  Alignment object used to
        handle unaligned sequences.

    moltype: a cogent.core.moltype object.
        - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8
        gives incorrect results if RNA is passed in.  'U' is treated as an
        incorrect character and is excluded from distance calculations.

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clearcut app controller.

    The result will be a cogent.core.tree.PhyloNode object, or None if the
    tree fails.
    """
    params['--out'] = get_tmp_filename(working_dir)

    # Create instance of app controller, enable tree, disable alignment
    app = Clearcut(InputHandler='_input_as_multiline_string', params=params,
                   WorkingDir=working_dir, SuppressStdout=True,
                   SuppressStderr=True)
    #Input is an alignment
    app.Parameters['-a'].on()
    #Turn off input as distance matrix
    app.Parameters['-d'].off()

    #If moltype = RNA, we must convert to DNA.
    if moltype == RNA:
        moltype = DNA

    if best_tree:
        app.Parameters['-N'].on()

    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()

    # Setup mapping.  Clearcut clips identifiers.  We will need to remap them.
    # Clearcut only accepts aligned sequences.  Let Alignment object handle
    # unaligned sequences.
    seq_aln = Alignment(aln, MolType=moltype)
    #get int mapping
    int_map, int_keys = seq_aln.getIntMap()
    #create new Alignment object with int_map
    int_map = Alignment(int_map)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del(seq_aln, app, result, int_map, int_keys, params)

    return tree
def assign_taxonomy(dataPath, reference_sequences_fp, id_to_taxonomy_fp,
                    read_1_seqs_fp, read_2_seqs_fp, single_ok=False,
                    no_single_ok_generic=False, header_id_regex=None,
                    read_id_regex="\S+\s+(\S+)",
                    amplicon_id_regex="(\S+)\s+(\S+?)\/",
                    output_fp=None, log_path=None, HALT_EXEC=False,
                    base_tmp_dir='/tmp'):
    """Assign taxonomy to each sequence in data with the RTAX classifier

        # data: open fasta file object or list of fasta lines
        dataPath: path to a fasta file
        output_fp: path to write output; if not provided, result will be
            returned in a dict of {seq_id:(taxonomy_assignment,confidence)}
    """
    usearch_command = "usearch"
    if not (exists(usearch_command) or app_path(usearch_command)):
        raise ApplicationNotFoundError,\
            "Cannot find %s. Is it installed? Is it in your path?"\
            % usearch_command

    my_tmp_dir = get_tmp_filename(tmp_dir=base_tmp_dir, prefix='rtax_',
                                  suffix='', result_constructor=str)
    os.makedirs(my_tmp_dir)

    try:
        # RTAX classifier doesn't necessarily preserve identifiers;
        # it reports back only the id extracted as $1 using header_id_regex.
        # Since rtax takes the original unclustered sequence files as input,
        # the usual case is that the regex extracts the amplicon ID from the
        # second field.

        # Use lookup table
        read_1_id_to_orig_id = {}
        # OTU clustering produces ">clusterID read_1_id"
        readIdExtractor = re.compile(read_id_regex)
        data = open(dataPath, 'r')
        for seq_id, seq in parse_fasta(data):
            # apply the regex
            extract = readIdExtractor.match(seq_id)
            if extract is None:
                stderr.write("Matched no ID with read_id_regex " +
                             read_id_regex + " in '" + seq_id +
                             "' from file " + dataPath + "\n")
            else:
                read_1_id_to_orig_id[extract.group(1)] = seq_id
                #stderr.write(extract.group(1) + " => " + seq_id + "\n")
            #seq_id_lookup[seq_id.split()[1]] = seq_id
        data.close()

        # make list of amplicon IDs to pass to RTAX
        id_list_fp = open(my_tmp_dir + "/ampliconIdsToClassify", "w")

        # Establish mapping of amplicon IDs to read_1 IDs; simultaneously
        # write the amplicon ID file for those IDs found in the input mapping
        # above.
        amplicon_to_read_1_id = {}
        # split_libraries produces ">read_1_id ampliconID/1 ..."
        ampliconIdExtractor = re.compile(amplicon_id_regex)
        read_1_data = open(read_1_seqs_fp, 'r')
        for seq_id, seq in parse_fasta(read_1_data):
            # apply the regex
            extract = ampliconIdExtractor.match(seq_id)
            if extract is None:
                stderr.write("Matched no ID with amplicon_id_regex " +
                             amplicon_id_regex + " in '" + seq_id +
                             "' from file " + read_1_seqs_fp + "\n")
            else:
                read_1_id = extract.group(1)
                amplicon_id = extract.group(2)
                try:
                    amplicon_to_read_1_id[amplicon_id] = read_1_id
                    # verify that the id is valid
                    bogus = read_1_id_to_orig_id[read_1_id]
                    id_list_fp.write('%s\n' % (amplicon_id))
                except KeyError:
                    pass
        read_1_data.close()

        id_list_fp.close()

        app = Rtax(HALT_EXEC=HALT_EXEC)

        temp_output_file = tempfile.NamedTemporaryFile(
            prefix='RtaxAssignments_', suffix='.txt')
        app.Parameters['-o'].on(temp_output_file.name)
        app.Parameters['-r'].on(reference_sequences_fp)
        app.Parameters['-t'].on(id_to_taxonomy_fp)
        # app.Parameters['-d'].on(delimiter)
        app.Parameters['-l'].on(id_list_fp.name)  # these are amplicon IDs
        app.Parameters['-a'].on(read_1_seqs_fp)
        if read_2_seqs_fp is not None:
            app.Parameters['-b'].on(read_2_seqs_fp)
        app.Parameters['-i'].on(header_id_regex)
        app.Parameters['-m'].on(my_tmp_dir)
        if single_ok:
            app.Parameters['-f'].on()
        if no_single_ok_generic:
            app.Parameters['-g'].on()
        #app.Parameters['-v'].on()

        app_result = app()

        if log_path:
            f = open(log_path, 'a')
            errString = ''.join(app_result['StdErr'].readlines()) + '\n'
            f.write(errString)
            f.close()

        assignments = {}

        # restore original sequence IDs with spaces
        for line in app_result['Assignments']:
            toks = line.strip().split('\t')
            rtax_id = toks.pop(0)
            if len(toks):
                bestpcid = toks.pop(0)  # ignored
            lineage = toks

            # RTAX does not provide a measure of confidence.  We could pass
            # one in, based on the choice of primers, or even look it up on
            # the fly in the tables from the "optimal primers" paper; but it
            # would be the same for every query sequence anyway.
            # We could also return bestpcid, but that's not the same thing as
            # confidence.
            confidence = 1.0

            read_1_id = amplicon_to_read_1_id[rtax_id]
            orig_id = read_1_id_to_orig_id[read_1_id]
            if lineage:
                assignments[orig_id] = (';'.join(lineage), confidence)
            else:
                assignments[orig_id] = ('Unclassified', 1.0)

        if output_fp:
            try:
                output_file = open(output_fp, 'w')
            except OSError:
                raise OSError("Can't open output file for writing: %s"
                              % output_fp)
            for seq_id, assignment in assignments.items():
                lineage, confidence = assignment
                output_file.write('%s\t%s\t%1.3f\n'
                                  % (seq_id, lineage, confidence))
            output_file.close()
            return None
        else:
            return assignments
    finally:
        try:
            rmtree(my_tmp_dir)
        except OSError:
            pass
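# Usage sketch (illustrative, with made-up file paths): classifies clustered
# reads against a reference database using assign_taxonomy above. Every path
# below is a placeholder assumption; rtax and usearch must be installed for
# this to run.
def _example_assign_taxonomy():
    assignments = assign_taxonomy(
        dataPath='rep_set.fna',                   # clustered representative reads
        reference_sequences_fp='ref_seqs.fasta',  # reference sequences
        id_to_taxonomy_fp='id_to_taxonomy.txt',   # tab-separated id -> lineage map
        read_1_seqs_fp='read_1.fna',              # original unclustered forward reads
        read_2_seqs_fp=None,                      # single-ended mode
        single_ok=True)
    for seq_id, (lineage, confidence) in assignments.items():
        print '%s\t%s\t%1.3f' % (seq_id, lineage, confidence)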