def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Build a tree from an alignment with Clustalw.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: cogent.core.moltype.MolType object; must be DNA, RNA or PROTEIN.
    best_tree: if True (default: False), bootstrapping is enabled
        (-bootstrap 1000, a random -seed, -bootlabels "nodes") for every one
        of those options not already present in params; otherwise -tree is
        switched on.
    params: dict of parameters to pass in to the Clustal app controller.
        When given, "-type" is stored in it in place.

    Returns the tree parsed by DndParser (PhyloNode nodes) with the tip
    names mapped back to the original sequence identifiers.

    Raises ValueError if moltype is not DNA, RNA or PROTEIN.
    """
    # Create instance of app controller and disable alignment.
    app = Clustalw(InputHandler="_input_as_multiline_string", params=params,
                   WorkingDir="/tmp")
    app.Parameters["-align"].off()

    if params is None:
        params = {}

    # NOTE(review): "-type" is written into params only after the controller
    # was constructed from it; whether Clustalw still sees the value depends
    # on the controller implementation -- confirm.
    if moltype == DNA or moltype == RNA:
        params["-type"] = "d"
    elif moltype == PROTEIN:
        params["-type"] = "p"
    else:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise ValueError("moltype must be DNA, RNA, or PROTEIN")

    if best_tree:
        # Only fill in the bootstrap settings the caller did not supply.
        if "-bootstrap" not in params:
            app.Parameters["-bootstrap"].on(1000)
        if "-seed" not in params:
            app.Parameters["-seed"].on(randint(0, 1000))
        if "-bootlabels" not in params:
            app.Parameters["-bootlabels"].on("nodes")
    else:
        app.Parameters["-tree"].on()

    # Clustalw clips identifiers, so it is fed the abbreviated names from
    # getIntMap() and the tips are renamed afterwards.
    seq_collection = SequenceCollection(aln)
    int_map, int_keys = seq_collection.getIntMap()
    int_map = SequenceCollection(int_map)

    result = app(int_map.toFasta())
    try:
        tree = DndParser(result["Tree"].read(), constructor=PhyloNode)
        for node in tree.tips():
            node.Name = int_keys[node.Name]
    finally:
        # Always call cleanUp(), even when parsing or renaming fails.
        result.cleanUp()
    return tree
def align_unaligned_seqs(seqs, moltype, params=None):
    """Align unaligned sequences with Clustalw and return an Alignment.

    seqs: cogent.core.alignment.SequenceCollection object, or data that can
        be used to build one.
    moltype: a MolType object. DNA, RNA, or PROTEIN.
    params: dict of parameters to pass in to the Clustal app controller.

    The sequences are submitted under the abbreviated IDs produced by
    getIntMap(); the returned cogent.core.alignment.Alignment is keyed by
    the original IDs again.
    """
    # Abbreviated-ID copy of the input plus the lookup back to full IDs.
    unaligned = SequenceCollection(seqs, MolType=moltype)
    short_seqs, full_ids = unaligned.getIntMap()
    short_seqs = SequenceCollection(short_seqs, MolType=moltype)

    # Run Clustalw on the FASTA rendering of the abbreviated collection.
    clustal = Clustalw(InputHandler='_input_as_multiline_string',
                       params=params)
    output = clustal(short_seqs.toFasta())

    # Parse the alignment, then restore the original IDs.
    parsed = dict(ClustalParser(output['Align'].readlines()))
    restored = dict((full_ids[short_id], aligned_seq)
                    for short_id, aligned_seq in parsed.items())
    result = Alignment(restored, MolType=moltype)

    output.cleanUp()
    return result
def align_unaligned_seqs(seqs, moltype, params=None):
    """Return a Clustalw alignment of seqs as an Alignment object.

    seqs: cogent.core.alignment.SequenceCollection object, or data that can
        be used to build one.
    moltype: a MolType object. DNA, RNA, or PROTEIN.
    params: dict of parameters to pass in to the Clustal app controller.

    Result is a cogent.core.alignment.Alignment object whose names are the
    original sequence IDs.
    """
    collection = SequenceCollection(seqs, MolType=moltype)

    # getIntMap() yields the sequences under abbreviated IDs and a mapping
    # from each abbreviated ID back to the full one.
    abbreviated, id_lookup = collection.getIntMap()
    clustal_input = SequenceCollection(abbreviated, MolType=moltype)

    runner = Clustalw(InputHandler="_input_as_multiline_string",
                      params=params)
    run_result = runner(clustal_input.toFasta())

    aligned_by_short_id = dict(ClustalParser(run_result["Align"].readlines()))

    # Re-key the aligned sequences by their original IDs.
    aligned_by_full_id = {}
    for short_id in aligned_by_short_id:
        aligned_by_full_id[id_lookup[short_id]] = aligned_by_short_id[short_id]

    final_alignment = Alignment(aligned_by_full_id, MolType=moltype)

    run_result.cleanUp()
    return final_alignment
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Build a tree from an alignment with FastTree.

    aln: alignment object; its getIntMap() method is used directly.
    moltype: DNA or RNA sets "-nt" to True, PROTEIN sets it to False.
    best_tree: if True, "-slow" is set to True.
    params: dict of parameters for the FastTree app controller. It is copied
        before the options above are added, so the caller's dict is left
        untouched and settings such as "-slow" cannot leak into later calls
        that reuse it.

    Returns the tree parsed by DndParser (PhyloNode nodes) with tip names
    mapped back to the original sequence identifiers.

    Raises ValueError if moltype is not DNA, RNA or PROTEIN.
    """
    # Work on a private copy so per-call options never reach the caller.
    params = {} if params is None else dict(params)

    if moltype == DNA or moltype == RNA:
        params["-nt"] = True
    elif moltype == PROTEIN:
        params["-nt"] = False
    else:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise ValueError(
            "FastTree does not support moltype: %s" % moltype.label)

    if best_tree:
        params["-slow"] = True

    # Create mapping between abbreviated IDs and full IDs.
    int_map, int_keys = aln.getIntMap()
    # The abbreviated-ID collection is what gets written out as FASTA.
    int_map = SequenceCollection(int_map, MolType=moltype)

    app = FastTree(params=params)
    result = app(int_map.toFasta())
    tree = DndParser(result["Tree"].read(), constructor=PhyloNode)

    # Remap tip names to the original identifiers.
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    return tree
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Return a FastTree tree built from the alignment aln.

    aln: alignment object; its getIntMap() method is used directly.
    moltype: DNA or RNA sets '-nt' to True, PROTEIN sets it to False.
    best_tree: if True, '-slow' is set to True.
    params: dict of parameters for the FastTree app controller. A copy is
        modified, not the dict passed in, so '-nt' / '-slow' from one call
        do not persist in a dict the caller reuses.

    The tree is parsed by DndParser with PhyloNode nodes and its tips carry
    the original sequence identifiers.

    Raises ValueError if moltype is not DNA, RNA or PROTEIN.
    """
    # Never write per-call options into the caller's dict.
    params = dict(params) if params is not None else {}

    if moltype == DNA or moltype == RNA:
        params['-nt'] = True
    elif moltype == PROTEIN:
        params['-nt'] = False
    else:
        # Call form of raise is valid on both Python 2 and Python 3.
        message = "FastTree does not support moltype: %s" % moltype.label
        raise ValueError(message)

    if best_tree:
        params['-slow'] = True

    # Create mapping between abbreviated IDs and full IDs.
    int_map, int_keys = aln.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    app = FastTree(params=params)
    result = app(int_map.toFasta())
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)

    # Remap tip names to the original identifiers.
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    return tree
def add_seqs_to_alignment(seqs, aln, moltype, params=None):
    """Add seqs to an existing alignment using Clustalw profiles.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that
        can be used to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: MolType used to build the collections and the result.
    params: dict of parameters to pass in to the Clustal app controller.

    Returns an Alignment keyed by the original identifiers of both inputs.
    """
    # New sequences under abbreviated IDs, plus the lookup back to full IDs.
    new_seqs = SequenceCollection(seqs, MolType=moltype)
    short_seqs, id_lookup = new_seqs.getIntMap()
    short_seqs = SequenceCollection(short_seqs, MolType=moltype)

    # Same for the existing alignment; the "seqn_" prefix is passed so its
    # abbreviated IDs are generated separately from those of the sequences.
    aln = Alignment(aln, MolType=moltype)
    short_aln, aln_lookup = aln.getIntMap(prefix="seqn_")
    short_aln = Alignment(short_aln, MolType=moltype)

    # One lookup table covering both inputs.
    id_lookup.update(aln_lookup)

    # Create Clustalw app.
    clustal = Clustalw(InputHandler="_input_as_multiline_string",
                       params=params, SuppressStderr=True)
    clustal.Parameters["-align"].off()
    clustal.Parameters["-infile"].off()
    clustal.Parameters["-sequences"].on()

    # Existing alignment goes in as profile1, new sequences as profile2;
    # each is written through the controller's temp-file helper.
    profile1_file = clustal._tempfile_as_multiline_string(short_aln.toFasta())
    clustal.Parameters["-profile1"].on(profile1_file)
    profile2_file = clustal._tempfile_as_multiline_string(short_seqs.toFasta())
    clustal.Parameters["-profile2"].on(profile2_file)

    # Input comes from the profile parameters, so the app takes no data.
    output = clustal()

    parsed = dict(ClustalParser(output["Align"].readlines()))
    restored = dict((id_lookup[short_id], aligned_seq)
                    for short_id, aligned_seq in parsed.items())
    merged = Alignment(restored, MolType=moltype)

    # Clean up the result and both profile temp files.
    output.cleanUp()
    remove(clustal.Parameters["-profile1"].Value)
    remove(clustal.Parameters["-profile2"].Value)
    return merged
def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None):
    """Build a Clustalw tree with bootstrap support values from aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    seed: an integer seed value; used only when the controller's -seed
        parameter is still off. Defaults to a random integer between 0 and
        1000.
    num_trees: an integer, number of trees to bootstrap against; used only
        when -bootstrap is still off. Defaults to 1000.
    params: dict of parameters to pass in to the Clustal app controller.

    Returns the tree parsed by DndParser (PhyloNode nodes) with tip names
    mapped back to the original sequence identifiers.
    """
    # Controller with alignment and plain tree building disabled.
    clustal = Clustalw(InputHandler='_input_as_multiline_string',
                       params=params, WorkingDir='/tmp')
    options = clustal.Parameters
    options['-align'].off()
    options['-tree'].off()

    # Fill in only the bootstrap settings that are not switched on already.
    if options['-bootstrap'].isOff():
        options['-bootstrap'].on(1000 if num_trees is None else num_trees)

    if options['-seed'].isOff():
        options['-seed'].on(randint(0, 1000) if seed is None else seed)

    if options['-bootlabels'].isOff():
        options['-bootlabels'].on("node")

    # Clustalw clips identifiers, so it runs on abbreviated names that are
    # mapped back once the tree has been parsed.
    short_seqs, full_names = SequenceCollection(aln).getIntMap()
    short_seqs = SequenceCollection(short_seqs)

    output = clustal(short_seqs.toFasta())

    tree = DndParser(output['Tree'].read(), constructor=PhyloNode)
    for tip in tree.tips():
        tip.Name = full_names[tip.Name]

    output.cleanUp()
    return tree
def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None):
    """Return a bootstrapped Clustalw tree for the alignment aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    seed: an integer seed value. Applied only if -seed is off after the
        controller is built; a random integer between 0 and 1000 is used
        when it is None.
    num_trees: an integer, number of trees to bootstrap against. Applied
        only if -bootstrap is off after the controller is built; 1000 is
        used when it is None.
    params: dict of parameters to pass in to the Clustal app controller.

    The result is parsed by DndParser with PhyloNode nodes, and tips are
    renamed to the original sequence identifiers.
    """
    runner = Clustalw(InputHandler='_input_as_multiline_string',
                      params=params, WorkingDir='/tmp')

    # Neither alignment nor plain tree building is wanted.
    for switched_off in ('-align', '-tree'):
        runner.Parameters[switched_off].off()

    bootstrap = runner.Parameters['-bootstrap']
    if bootstrap.isOff():
        if num_trees is None:
            num_trees = 1000
        bootstrap.on(num_trees)

    seed_option = runner.Parameters['-seed']
    if seed_option.isOff():
        if seed is None:
            seed = randint(0, 1000)
        seed_option.on(seed)

    bootlabels = runner.Parameters['-bootlabels']
    if bootlabels.isOff():
        bootlabels.on("node")

    # Setup mapping. Clustalw clips identifiers, so they are remapped after
    # the run.
    collection = SequenceCollection(aln)
    abbreviated, original_names = collection.getIntMap()
    abbreviated = SequenceCollection(abbreviated)

    run_result = runner(abbreviated.toFasta())

    bootstrapped = DndParser(run_result['Tree'].read(), constructor=PhyloNode)
    for leaf in bootstrapped.tips():
        leaf.Name = original_names[leaf.Name]

    run_result.cleanUp()
    return bootstrapped
def cdhit_clusters_from_seqs(seqs, moltype, params=None):
    """Return the CD-HIT clusters of seqs as lists of original sequence IDs.

    seqs: dict-like collection of sequences.
    moltype: cogent.core.moltype object. PROTEIN runs CD_HIT; RNA and DNA
        run CD_HIT_EST.
    params: dict of cd-hit parameters. If it has no '-o' entry, a temporary
        filename is stored under '-o' (in place when a dict was passed).

    Raises ValueError if moltype is not PROTEIN, RNA or DNA.
    """
    seqs = SequenceCollection(seqs, MolType=moltype)

    # cd-hit runs on abbreviated IDs; int_keys maps them back to full IDs.
    int_map, int_keys = seqs.getIntMap()
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Set up params and make sure the output argument is set.
    if params is None:
        params = {}
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    # Call the correct version of cd-hit based on moltype.
    working_dir = get_tmp_filename()
    if moltype is PROTEIN:
        app = CD_HIT(WorkingDir=working_dir, params=params)
    elif moltype is RNA or moltype is DNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    else:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise ValueError("Moltype must be either PROTEIN, RNA, or DNA")

    # Run and translate every cluster member back to its original ID.
    res = app(int_map.toFasta())
    clusters = parse_cdhit_clstr_file(res['CLSTR'].readlines())
    remapped_clusters = [[int_keys[seq_id] for seq_id in cluster]
                         for cluster in clusters]

    # Perform cleanup: result, working directory and the '.bak.clstr' file
    # that sits next to the '-o' output path.
    res.cleanUp()
    shutil.rmtree(working_dir)
    remove(params['-o'] + '.bak.clstr')

    return remapped_clusters
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Build a tree from an alignment using Muscle's clustering.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: cogent.core.moltype.MolType object; its label is passed to
        Muscle as -seqtype.
    best_tree: unsupported (ignored).
    params: dict of parameters to pass in to the Muscle app controller.

    Returns the tree parsed by DndParser (PhyloNode nodes) with tip names
    mapped back to the original sequence identifiers.
    """
    # Controller configured to cluster and write the first tree to a
    # temporary file inside its working directory.
    muscle = Muscle(InputHandler='_input_as_multiline_string',
                    params=params, WorkingDir='/tmp')
    muscle.Parameters['-cluster'].on()
    tree_path = get_tmp_filename(muscle.WorkingDir)
    muscle.Parameters['-tree1'].on(tree_path)
    muscle.Parameters['-seqtype'].on(moltype.label)

    # Abbreviated-ID copy of the input plus the lookup back to full IDs.
    collection = SequenceCollection(aln, MolType=moltype)
    short_seqs, full_names = collection.getIntMap()
    short_seqs = SequenceCollection(short_seqs, MolType=moltype)

    output = muscle(short_seqs.toFasta())

    # Parse the tree and restore the original tip names.
    tree = DndParser(output['Tree1Out'].read(), constructor=PhyloNode)
    for leaf in tree.tips():
        leaf.Name = full_names[leaf.Name]

    output.cleanUp()
    return tree
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Return a tree for aln built by Muscle in cluster-only mode.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: cogent.core.moltype.MolType object; its label is passed to
        Muscle as -seqtype.
    best_tree: unsupported (ignored).
    params: dict of parameters to pass in to the Muscle app controller.

    The tree is parsed by DndParser with PhyloNode nodes, and its tips are
    renamed to the original sequence identifiers.
    """
    runner = Muscle(InputHandler='_input_as_multiline_string',
                    params=params, WorkingDir='/tmp')
    options = runner.Parameters
    options['-clusteronly'].on()
    # The first tree is written to a temp file in the working directory.
    options['-tree1'].on(get_tmp_filename(runner.WorkingDir))
    options['-seqtype'].on(moltype.label)

    # Create mapping between abbreviated IDs and full IDs, and the
    # abbreviated collection that is actually handed to Muscle.
    sequences = SequenceCollection(aln, MolType=moltype)
    abbreviated, original_names = sequences.getIntMap()
    abbreviated = SequenceCollection(abbreviated, MolType=moltype)

    run_result = runner(abbreviated.toFasta())

    clustered = DndParser(run_result['Tree1Out'].read(),
                          constructor=PhyloNode)
    for tip in clustered.tips():
        tip.Name = original_names[tip.Name]

    run_result.cleanUp()
    return clustered
def align_unaligned_seqs(seqs,moltype,params=None,accurate=False):
    """Align unaligned sequences with Mafft and return an Alignment.

    seqs: SequenceCollection object, or data that can be used to build one.
    moltype: MolType object; its upper-cased label is looked up in
        MOLTYPE_MAP to pick the Mafft option to switch on.
    params: dict of parameters to pass in to the Mafft app controller.
    accurate: if True, turns on --globalpair and sets --maxiterate to 1000
        (more accurate alignment, sacrificing performance).
    """
    # Abbreviated-ID copy of the input plus the lookup back to full IDs.
    unaligned = SequenceCollection(seqs, MolType=moltype)
    short_seqs, full_ids = unaligned.getIntMap()
    short_seqs = SequenceCollection(short_seqs, MolType=moltype)

    mafft = Mafft(InputHandler='_input_as_multiline_string', params=params)

    # Turn on the option matching the molecule type.
    moltype_option = MOLTYPE_MAP[moltype.label.upper()]
    mafft.Parameters[moltype_option].on()

    # Do not report progress.
    mafft.Parameters['--quiet'].on()

    if accurate:
        mafft.Parameters['--globalpair'].on()
        mafft.Parameters['--maxiterate'].Value = 1000

    output = mafft(short_seqs.toFasta())

    # The alignment comes back as FASTA on stdout.
    parsed = dict(MinimalFastaParser(output['StdOut'].readlines()))

    # Restore the original IDs.
    restored = {}
    for short_id, aligned_seq in list(parsed.items()):
        restored[full_ids[short_id]] = aligned_seq
    result = Alignment(restored, MolType=moltype)

    output.cleanUp()
    return result
def align_unaligned_seqs(seqs,moltype,params=None,accurate=False):
    """Return a Mafft alignment of seqs as an Alignment object.

    seqs: SequenceCollection object, or data that can be used to build one.
    moltype: MolType object; its upper-cased label is looked up in
        MOLTYPE_MAP to pick the Mafft option to switch on.
    params: dict of parameters to pass in to the Mafft app controller.
    accurate: if True, turns on --globalpair and sets --maxiterate to 1000
        (more accurate alignment, sacrificing performance).
    """
    collection = SequenceCollection(seqs, MolType=moltype)

    # getIntMap() yields the sequences under abbreviated IDs and a mapping
    # from each abbreviated ID back to the full one.
    abbreviated, id_lookup = collection.getIntMap()
    abbreviated = SequenceCollection(abbreviated, MolType=moltype)

    runner = Mafft(InputHandler='_input_as_multiline_string', params=params)
    options = runner.Parameters

    # Turn on correct moltype and silence progress reporting.
    options[MOLTYPE_MAP[moltype.label.upper()]].on()
    options['--quiet'].on()

    if accurate:
        options['--globalpair'].on()
        options['--maxiterate'].Value = 1000

    run_result = runner(abbreviated.toFasta())

    # Parse the FASTA written to stdout and re-key it by original ID.
    aligned_by_short_id = dict(parse_fasta(run_result['StdOut']))
    aligned_by_full_id = dict(
        (id_lookup[short_id], aligned_seq)
        for short_id, aligned_seq in aligned_by_short_id.items())
    final_alignment = Alignment(aligned_by_full_id, MolType=moltype)

    run_result.cleanUp()
    return final_alignment
def cdhit_from_seqs(seqs, moltype, params=None):
    """Run CD-HIT on seqs and return the sequences from its FASTA output.

    seqs: dict-like collection of sequences.
    moltype: cogent.core.moltype object. PROTEIN runs CD_HIT; RNA and DNA
        run CD_HIT_EST.
    params: dict of cd-hit parameters. If it has no '-o' entry, a temporary
        filename is stored under '-o' (in place when a dict was passed).

    Returns a SequenceCollection built from the 'FASTA' result.

    Raises ValueError if moltype is not PROTEIN, RNA or DNA.
    """
    # Keys are not remapped. Tested against seq_ids of 100char length.
    seqs = SequenceCollection(seqs, MolType=moltype)

    # Set up params and make sure the output argument is set.
    if params is None:
        params = {}
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    # Call the correct version of cd-hit based on moltype.
    working_dir = get_tmp_filename()
    if moltype is PROTEIN:
        app = CD_HIT(WorkingDir=working_dir, params=params)
    elif moltype is RNA or moltype is DNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    else:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise ValueError("Moltype must be either PROTEIN, RNA, or DNA")

    # Grab result.
    res = app(seqs.toFasta())
    new_seqs = dict(MinimalFastaParser(res['FASTA'].readlines()))

    # Perform cleanup: result, working directory and the '.bak.clstr' file
    # that sits next to the '-o' output path.
    res.cleanUp()
    shutil.rmtree(working_dir)
    remove(params['-o'] + '.bak.clstr')

    return SequenceCollection(new_seqs, MolType=moltype)
def align_unaligned_seqs(seqs, moltype, params=None):
    """Align unaligned sequences with Muscle and return an Alignment.

    seqs: SequenceCollection object, or data that can be used to build one.
    moltype: a MolType object. DNA, RNA, or PROTEIN.
    params: dict of parameters to pass in to the Muscle app controller.
        NOTE: a non-empty dict is updated in place with a temporary '-out'
        filename.

    Result is an Alignment object keyed by the original sequence IDs.
    """
    if not params:
        params = {}

    # Abbreviated-ID copy of the input plus the lookup back to full IDs.
    unaligned = SequenceCollection(seqs, MolType=moltype)
    short_seqs, full_ids = unaligned.getIntMap()
    short_seqs = SequenceCollection(short_seqs, MolType=moltype)

    # Muscle writes its alignment to a temporary output file.
    params['-out'] = get_tmp_filename()

    muscle = Muscle(InputHandler='_input_as_multiline_string', params=params)
    output = muscle(short_seqs.toFasta())

    # Parse the FASTA output, then restore the original IDs.
    parsed = dict(MinimalFastaParser(output['MuscleOut'].readlines()))
    restored = dict((full_ids[short_id], aligned_seq)
                    for short_id, aligned_seq in parsed.items())
    result = Alignment(restored, MolType=moltype)

    output.cleanUp()
    return result
def align_unaligned_seqs(seqs, moltype, params=None):
    """Return a Muscle alignment of seqs as an Alignment object.

    seqs: SequenceCollection object, or data that can be used to build one.
    moltype: a MolType object. DNA, RNA, or PROTEIN.
    params: dict of parameters to pass in to the Muscle app controller.
        NOTE: a non-empty dict is updated in place with a temporary '-out'
        filename.

    Result is an Alignment object whose names are the original IDs.
    """
    if not params:
        params = {}

    collection = SequenceCollection(seqs, MolType=moltype)

    # getIntMap() yields the sequences under abbreviated IDs and a mapping
    # from each abbreviated ID back to the full one.
    abbreviated, id_lookup = collection.getIntMap()
    abbreviated = SequenceCollection(abbreviated, MolType=moltype)

    # Get temporary filename for Muscle's output.
    out_path = get_tmp_filename()
    params.update({'-out': out_path})

    runner = Muscle(InputHandler='_input_as_multiline_string',
                    params=params)
    run_result = runner(abbreviated.toFasta())

    aligned_by_short_id = dict(
        MinimalFastaParser(run_result['MuscleOut'].readlines()))

    # Re-key the aligned sequences by their original IDs.
    aligned_by_full_id = {}
    for short_id in aligned_by_short_id:
        aligned_by_full_id[id_lookup[short_id]] = aligned_by_short_id[short_id]
    final_alignment = Alignment(aligned_by_full_id, MolType=moltype)

    run_result.cleanUp()
    return final_alignment
def create_locarnap_alignment(seqs, moltype, struct=False, params=None):
    """Return mlocarna (locarna-p) results for an unaligned collection.

    seqs: a SequenceCollection object or something that behaves like one.
    moltype: cogent.core.moltype object.
    struct: if True, also return the structure string collected from the
        probabilistic alignment file.
    params: dict of parameters to pass in to the MLocarna app controller.

    Returns the Alignment, or the tuple (Alignment, structure) when struct
    is True.
    """
    # Construct SequenceCollection object.
    seqs = SequenceCollection(seqs, MolType=moltype)

    # mlocarna runs on abbreviated IDs; int_keys maps them back.
    int_map, int_keys = seqs.getIntMap()
    int_map = SequenceCollection(int_map, MolType=moltype)

    app = MLocarna(InputHandler='_input_as_multiline_string', params=params)

    # Temporary directory that receives all mlocarna files.
    mlocarna_dir = get_tmp_filename(suffix='')
    app.Parameters['--tgtdir'].on(mlocarna_dir)

    # Set parameters to run locarna-p.
    app.Parameters['--write-structure'].on()
    app.Parameters['--probabilistic'].on()
    app.Parameters['--consistency-transformation'].on()

    res = app(int_map.toFasta())

    # Get the structure from the results if necessary.
    if struct:
        structure_parts = []
        newstrline = True
        # The with-block closes the file before the target directory is
        # removed below; the handle used to be left open.
        # NOTE(review): mode 'U' is kept for Python 2 universal newlines;
        # it is not accepted by recent Python 3 releases -- confirm target.
        with open(res['ProbabilisticAlignment'].name, 'U') as structfile:
            for line in structfile:
                line = line.strip()
                # Structure lines (--write-structure) start with "." or "(".
                if line.startswith(('.', '(')):
                    # The structure is written both above and below each
                    # alignment block, so keep every other structure line.
                    if newstrline:
                        structure_parts.append(line)
                    newstrline = not newstrline
        structure = "".join(structure_parts)

    aligned = dict(ClustalParser(res['ClustalAlignment']))

    # Map back to the original IDs; names missing from int_keys are kept.
    new_alignment = dict((int_keys.get(k, k), v) for k, v in aligned.items())
    new_alignment = Alignment(new_alignment, MolType=moltype)

    # Clean up after MLocarna.
    res.cleanUp()
    shutil.rmtree(mlocarna_dir)

    # Output alignment and structure if asked for, else just the alignment.
    if struct:
        return new_alignment, structure
    return new_alignment
def add_seqs_to_alignment(seqs, aln, params=None):
    """Add seqs to an existing alignment with Muscle's profile mode.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that
        can be used to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    params: dict of parameters to pass in to the Muscle app controller. It
        is copied before '-out' and '-profile' are added, so the caller's
        dict is not modified.

    Returns an Alignment keyed by the original identifiers of both inputs.
    """
    # Private copy: '-profile' must not leak into a dict the caller reuses.
    params = dict(params) if params else {}

    # Abbreviated IDs for the new sequences ('seq_' prefix) ...
    seqs_collection = SequenceCollection(seqs)
    seqs_int_map, seqs_int_keys = seqs_collection.getIntMap(prefix='seq_')
    seqs_int_map = SequenceCollection(seqs_int_map)

    # ... and for the existing alignment ('aln_' prefix).
    aln_collection = SequenceCollection(aln)
    aln_int_map, aln_int_keys = aln_collection.getIntMap(prefix='aln_')
    aln_int_map = SequenceCollection(aln_int_map)

    # Set output and profile options.
    params.update({'-out': get_tmp_filename(), '-profile': True})

    # Save both inputs to temp files; the with-blocks close the handles
    # even if a write fails.
    seqs_filename = get_tmp_filename()
    with open(seqs_filename, 'w') as seqs_out:
        seqs_out.write(seqs_int_map.toFasta())

    aln_filename = get_tmp_filename()
    with open(aln_filename, 'w') as aln_out:
        aln_out.write(aln_int_map.toFasta())

    try:
        # Create Muscle app and get results.
        app = Muscle(InputHandler='_input_as_multifile', params=params,
                     WorkingDir=tempfile.gettempdir())
        res = app((aln_filename, seqs_filename))
        try:
            alignment = dict(parse_fasta(res['MuscleOut']))

            # Map every name back through whichever lookup issued it.
            new_alignment = {}
            for k, v in alignment.items():
                if k in seqs_int_keys:
                    new_alignment[seqs_int_keys[k]] = v
                else:
                    new_alignment[aln_int_keys[k]] = v

            new_alignment = Alignment(new_alignment)
        finally:
            res.cleanUp()
    finally:
        # The input temp files are removed whether or not the run worked.
        remove(seqs_filename)
        remove(aln_filename)

    return new_alignment
def align_two_alignments(aln1, aln2, params=None):
    """Align two existing alignments to each other with Muscle.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
        used to build them.
    params: dict of parameters to pass in to the Muscle app controller. It
        is copied before '-out' and '-profile' are added, so the caller's
        dict is not modified.

    Returns an Alignment keyed by the original identifiers of both inputs.
    """
    # Private copy: '-profile' must not leak into a dict the caller reuses.
    params = dict(params) if params else {}

    # Abbreviated IDs for the first alignment ('aln1_' prefix) ...
    aln1_collection = SequenceCollection(aln1)
    aln1_int_map, aln1_int_keys = aln1_collection.getIntMap(prefix='aln1_')
    aln1_int_map = SequenceCollection(aln1_int_map)

    # ... and for the second one ('aln2_' prefix).
    aln2_collection = SequenceCollection(aln2)
    aln2_int_map, aln2_int_keys = aln2_collection.getIntMap(prefix='aln2_')
    aln2_int_map = SequenceCollection(aln2_int_map)

    # Set output and profile options.
    params.update({'-out': get_tmp_filename(), '-profile': True})

    # Save both alignments to temp files; the with-blocks close the handles
    # even if a write fails.
    aln1_filename = get_tmp_filename()
    with open(aln1_filename, 'w') as aln1_out:
        aln1_out.write(aln1_int_map.toFasta())

    aln2_filename = get_tmp_filename()
    with open(aln2_filename, 'w') as aln2_out:
        aln2_out.write(aln2_int_map.toFasta())

    try:
        # Create Muscle app and get results.
        app = Muscle(InputHandler='_input_as_multifile', params=params)
        res = app((aln1_filename, aln2_filename))
        try:
            alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))

            # Map every name back through whichever lookup issued it.
            new_alignment = {}
            for k, v in alignment.items():
                if k in aln1_int_keys:
                    new_alignment[aln1_int_keys[k]] = v
                else:
                    new_alignment[aln2_int_keys[k]] = v

            new_alignment = Alignment(new_alignment)
        finally:
            res.cleanUp()
    finally:
        # The input temp files are removed whether or not the run worked.
        remove(aln1_filename)
        remove(aln2_filename)

    return new_alignment
def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
    """Add seqs to an existing alignment using Mafft's seed option.

    seqs: a cogent.core.sequence.Sequence object, or data that can be used
        to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: MolType object; its upper-cased label is looked up in
        MOLTYPE_MAP to pick the Mafft option to switch on.
    params: dict of parameters to pass in to the Mafft app controller.
    accurate: if True, turns on --globalpair and sets --maxiterate to 1000
        (more accurate alignment, sacrificing performance).

    Returns an Alignment keyed by the original identifiers of both inputs.
    """
    # New sequences under abbreviated IDs, plus the lookup back to full IDs.
    new_seqs = SequenceCollection(seqs, MolType=moltype)
    short_seqs, id_lookup = new_seqs.getIntMap()
    short_seqs = SequenceCollection(short_seqs, MolType=moltype)

    # Same for the existing alignment, using the 'seqn_' prefix.
    aln = Alignment(aln, MolType=moltype)
    short_aln, aln_lookup = aln.getIntMap(prefix='seqn_')
    short_aln = Alignment(short_aln, MolType=moltype)

    # One lookup table covering both inputs.
    id_lookup.update(aln_lookup)

    mafft = Mafft(InputHandler='_input_as_multiline_string',
                  params=params, SuppressStderr=True)

    # Turn on correct moltype.
    moltype_option = MOLTYPE_MAP[moltype.label.upper()]
    mafft.Parameters[moltype_option].on()

    # Do not report progress.
    mafft.Parameters['--quiet'].on()

    # The existing alignment is written to a temp file and used as seed.
    seed_file = mafft._tempfile_as_multiline_string(short_aln.toFasta())
    mafft.Parameters['--seed'].on(seed_file)

    if accurate:
        mafft.Parameters['--globalpair'].on()
        mafft.Parameters['--maxiterate'].Value = 1000

    output = mafft(short_seqs.toFasta())

    parsed = dict(parse_fasta(output['StdOut']))

    # Strip any '_seed_' marker from the returned names before looking up
    # the original ID (presumably added by Mafft to seed sequences).
    restored = dict((id_lookup[name.replace('_seed_', '')], aligned_seq)
                    for name, aligned_seq in parsed.items())
    merged = Alignment(restored, MolType=moltype)

    # Clean up the result and the seed temp file.
    output.cleanUp()
    remove(mafft.Parameters['--seed'].Value)
    return merged
def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
    """Return an Alignment of seqs added to aln, computed by Mafft.

    seqs: a cogent.core.sequence.Sequence object, or data that can be used
        to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: MolType object; its upper-cased label is looked up in
        MOLTYPE_MAP to pick the Mafft option to switch on.
    params: dict of parameters to pass in to the Mafft app controller.
    accurate: if True, turns on --globalpair and sets --maxiterate to 1000
        (more accurate alignment, sacrificing performance).

    The existing alignment is passed to Mafft as its --seed alignment.
    """
    incoming = SequenceCollection(seqs, MolType=moltype)

    # getIntMap() yields the sequences under abbreviated IDs and a mapping
    # from each abbreviated ID back to the full one.
    abbreviated_seqs, original_ids = incoming.getIntMap()
    abbreviated_seqs = SequenceCollection(abbreviated_seqs, MolType=moltype)

    aln = Alignment(aln, MolType=moltype)
    abbreviated_aln, aln_original_ids = aln.getIntMap(prefix='seqn_')
    abbreviated_aln = Alignment(abbreviated_aln, MolType=moltype)

    # Fold the alignment's IDs into the same lookup table.
    original_ids.update(aln_original_ids)

    runner = Mafft(InputHandler='_input_as_multiline_string',
                   params=params, SuppressStderr=True)
    options = runner.Parameters

    # Turn on correct moltype and silence progress reporting.
    options[MOLTYPE_MAP[moltype.label.upper()]].on()
    options['--quiet'].on()

    # Add the abbreviated alignment as seed alignment via a temp file.
    options['--seed'].on(
        runner._tempfile_as_multiline_string(abbreviated_aln.toFasta()))

    if accurate:
        options['--globalpair'].on()
        options['--maxiterate'].Value = 1000

    run_result = runner(abbreviated_seqs.toFasta())

    aligned = dict(MinimalFastaParser(run_result['StdOut'].readlines()))

    # Names are looked up with any '_seed_' marker removed.
    remapped = {}
    for name, aligned_seq in list(aligned.items()):
        short_id = name.replace('_seed_', '')
        remapped[original_ids[short_id]] = aligned_seq
    final_alignment = Alignment(remapped, MolType=moltype)

    # Clean up the result and the seed temp file.
    run_result.cleanUp()
    remove(options['--seed'].Value)
    return final_alignment
def align_two_alignments(aln1, aln2, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
    used to build them.

    params: dict of parameters to pass in to the Muscle app controller.
    The dict is copied, so the caller's dict is never modified. The '-out'
    and '-profile' entries are always set by this function.

    The temporary input files and the app results are cleaned up even when
    running Muscle or parsing its output raises.
    """
    #Work on a copy so '-out'/'-profile' do not leak into the caller's dict
    params = dict(params) if params else {}

    #create SequenceCollection object from aln1
    aln1_collection = SequenceCollection(aln1)
    #Create mapping between abbreviated IDs and full IDs
    aln1_int_map, aln1_int_keys = aln1_collection.getIntMap(prefix='aln1_')
    #Create SequenceCollection from int_map.
    aln1_int_map = SequenceCollection(aln1_int_map)

    #create SequenceCollection object from aln2
    aln2_collection = SequenceCollection(aln2)
    #Create mapping between abbreviated IDs and full IDs
    aln2_int_map, aln2_int_keys = aln2_collection.getIntMap(prefix='aln2_')
    #Create SequenceCollection from int_map.
    aln2_int_map = SequenceCollection(aln2_int_map)

    #set output and profile options
    params.update({'-out': get_tmp_filename(), '-profile': True})

    #Paths are recorded only once their file exists, so the cleanup below
    #never tries to remove a file that was not created.
    tmp_filenames = []
    res = None
    try:
        #save aln1 and aln2 to tmp files, in that order
        for int_map in (aln1_int_map, aln2_int_map):
            tmp_filename = get_tmp_filename()
            with open(tmp_filename, 'w') as tmp_out:
                tmp_filenames.append(tmp_filename)
                tmp_out.write(int_map.toFasta())

        #Create Muscle app and get results
        app = Muscle(InputHandler='_input_as_multifile', params=params)
        res = app(tuple(tmp_filenames))

        #Get alignment as dict out of results
        alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))

        #Make new dict mapping original IDs; the 'aln1_'/'aln2_' prefixes
        #keep the two key sets apart.
        new_alignment = {}
        for k, v in alignment.items():
            if k in aln1_int_keys:
                new_alignment[aln1_int_keys[k]] = v
            else:
                new_alignment[aln2_int_keys[k]] = v

        #Create an Alignment object from alignment dict
        new_alignment = Alignment(new_alignment)
    finally:
        #Clean up
        if res is not None:
            res.cleanUp()
        for tmp_filename in tmp_filenames:
            remove(tmp_filename)

    return new_alignment
def add_seqs_to_alignment(seqs, aln, moltype, params=None):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that
    can be used to build one.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one

    moltype: MolType object used to build the collections and the result.

    params: dict of parameters to pass in to the Clustal app controller.

    The temporary profile files and the app results are cleaned up even
    when running Clustalw or parsing its output raises.
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    seq_int_map = SequenceCollection(seq_int_map, MolType=moltype)

    #create Alignment object from aln
    aln = Alignment(aln, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
    #Create Alignment from int_map.
    aln_int_map = Alignment(aln_int_map, MolType=moltype)

    #Update seq_int_keys with aln_int_keys so one dict maps every short ID
    seq_int_keys.update(aln_int_keys)

    #Create Clustalw app.
    app = Clustalw(InputHandler='_input_as_multiline_string',
                   params=params, SuppressStderr=True)
    #Switch from the align action to the sequences action; input comes
    #from the two profile files, not from -infile.
    app.Parameters['-align'].off()
    app.Parameters['-infile'].off()
    app.Parameters['-sequences'].on()

    #Paths are recorded as soon as each temp file is written so that the
    #cleanup below removes exactly the files that exist.
    tmp_profiles = []
    res = None
    try:
        #Add aln_int_map as profile1
        profile1 = app._tempfile_as_multiline_string(aln_int_map.toFasta())
        tmp_profiles.append(profile1)
        app.Parameters['-profile1'].on(profile1)

        #Add seq_int_map as profile2
        profile2 = app._tempfile_as_multiline_string(seq_int_map.toFasta())
        tmp_profiles.append(profile2)
        app.Parameters['-profile2'].on(profile2)

        #Get results; all input is supplied through the profile parameters
        res = app()

        #Get alignment as dict out of results
        alignment = dict(ClustalParser(res['Align'].readlines()))

        #Make new dict mapping original IDs
        new_alignment = {}
        for k, v in alignment.items():
            new_alignment[seq_int_keys[k]] = v

        #Create an Alignment object from alignment dict
        new_alignment = Alignment(new_alignment, MolType=moltype)
    finally:
        #Clean up
        if res is not None:
            res.cleanUp()
        for tmp_profile in tmp_profiles:
            remove(tmp_profile)

    return new_alignment
def create_locarnap_alignment(seqs, moltype, struct=False, params=None):
    """Returns mlocarna results given an unaligned SequenceCollection.

    - seqs: A SequenceCollection object or something that behaves like one.
    - moltype: cogent.core.moltype object.
    - struct: Boolean whether or not to also output vienna structure string
    - params: dict of parameters to pass in to the MLocarna app controller.

    Returns the Alignment, or an (Alignment, structure string) tuple when
    struct is True.
    """
    #Construct SequenceCollection object.
    seqs = SequenceCollection(seqs, MolType=moltype)

    #need to make int map.
    int_map, int_keys = seqs.getIntMap()
    #construct SequenceCollection object from int map to use functionality
    int_map = SequenceCollection(int_map, MolType=moltype)

    #Create application.
    app = MLocarna(InputHandler='_input_as_multiline_string', params=params)

    #Get temporary directory to write all mlocarna files.
    mlocarna_dir = get_tmp_filename(suffix='')
    app.Parameters['--tgtdir'].on(mlocarna_dir)

    #set parameters to run locarna-p
    app.Parameters['--write-structure'].on()
    app.Parameters['--probabilistic'].on()
    app.Parameters['--consistency-transformation'].on()

    res = app(int_map.toFasta())

    try:
        #get the structure from the results if necessary
        if struct:
            structure_parts = []
            newstrline = True
            structfile = open(res['ProbabilisticAlignment'].name, 'U')
            try:
                for line in structfile:
                    line = line.strip()
                    #read in structure lines of alignment (--write-structure)
                    if len(line) > 0 and (line[0] == "." or line[0] == "("):
                        #only append if new structure aspect, since struct is
                        #written both above and below blocks in alignment
                        if newstrline:
                            structure_parts.append(line)
                        #every structure line flips the flag, so only every
                        #other one is kept
                        newstrline = not newstrline
            finally:
                #the handle is opened here, so it is closed here
                structfile.close()
            structure = "".join(structure_parts)

        aligned = dict(ClustalParser(res['ClustalAlignment']))

        #Make new dict mapping original IDs; labels without a mapping are
        #kept unchanged.
        new_alignment = {}
        for k, v in aligned.items():
            new_alignment[int_keys.get(k, k)] = v

        #Create an Alignment object from alignment dict
        new_alignment = Alignment(new_alignment, MolType=moltype)
    finally:
        #Clean up after MlocARNA
        res.cleanUp()
        shutil.rmtree(mlocarna_dir)

    #output alignment and structure if asked for, else output just alignment
    if struct:
        return new_alignment, structure
    else:
        return new_alignment
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    moltype: cogent.core.moltype.MolType object. Must be DNA, RNA or
    PROTEIN; it determines the '-type' parameter ('d' or 'p') and takes
    precedence over any '-type' entry in params.

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree (bootstrap instead of plain tree).

    params: dict of parameters to pass in to the Clustal app controller.
    The dict is copied, so the caller's dict is never modified.

    The result will be an cogent.core.tree.PhyloNode object.

    Raises ValueError if moltype is not DNA, RNA, or PROTEIN.
    """
    #Work on a copy of params (empty dict if None).
    params = {} if params is None else dict(params)

    #'-type' must be in params before the app is created: the app is built
    #from params, so entries added afterwards would not reach it.
    if moltype == DNA or moltype == RNA:
        params['-type'] = 'd'
    elif moltype == PROTEIN:
        params['-type'] = 'p'
    else:
        raise ValueError("moltype must be DNA, RNA, or PROTEIN")

    # Create instance of app controller, enable tree, disable alignment
    app = Clustalw(InputHandler='_input_as_multiline_string', params=params,
                   WorkingDir='/tmp')
    app.Parameters['-align'].off()

    # best_tree -> bootstrap; defaults apply only where the caller did not
    # supply a value.
    if best_tree:
        if '-bootstrap' not in params:
            app.Parameters['-bootstrap'].on(1000)
        if '-seed' not in params:
            app.Parameters['-seed'].on(randint(0, 1000))
        if '-bootlabels' not in params:
            app.Parameters['-bootlabels'].on('nodes')
    else:
        app.Parameters['-tree'].on()

    # Setup mapping. Clustalw clips identifiers. We will need to remap them.
    seq_collection = SequenceCollection(aln)
    int_map, int_keys = seq_collection.getIntMap()
    int_map = SequenceCollection(int_map)

    # Collect result
    result = app(int_map.toFasta())

    try:
        # Build tree
        tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
        # Restore the original names on the tips
        for node in tree.tips():
            node.Name = int_keys[node.Name]
    finally:
        # Clean up
        result.cleanUp()

    return tree