def align_unaligned_seqs(seqs, moltype, params=None):
    """Align a set of unaligned sequences with Clustalw.

    seqs: cogent.core.alignment.SequenceCollection object, or data that
        can be used to build one.
    moltype: a MolType object. DNA, RNA, or PROTEIN.
    params: dict of parameters to pass in to the Clustalw app controller.

    Returns a cogent.core.alignment.Alignment whose sequences carry the
    original IDs.
    """
    # The app is fed abbreviated IDs; id_lookup maps each one back to the
    # original ID so the result can be relabelled afterwards.
    short_seqs, id_lookup = SequenceCollection(seqs, MolType=moltype).getIntMap()
    fasta_text = SequenceCollection(short_seqs, MolType=moltype).toFasta()

    clustalw = Clustalw(InputHandler="_input_as_multiline_string", params=params)
    run_result = clustalw(fasta_text)

    # Parse the aligned records, then swap the abbreviated IDs back.
    parsed = dict(ClustalParser(run_result["Align"].readlines()))
    relabelled = dict(
        (id_lookup[short_id], aligned) for short_id, aligned in parsed.items()
    )
    aligned_seqs = Alignment(relabelled, MolType=moltype)

    # Clean up the app result once the alignment has been built.
    run_result.cleanUp()
    return aligned_seqs
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Build a tree from an alignment using FastTree.

    aln: alignment object providing getIntMap().
    moltype: DNA, RNA or PROTEIN; decides the value of the '-nt' option.
    best_tree: if True, the '-slow' option is switched on.
    params: dict of extra parameters for the FastTree app controller.
        It is copied, so the caller's dict is never modified.

    Returns the tree parsed with the PhyloNode constructor, with tip names
    mapped back to the original sequence IDs.

    Raises ValueError if moltype is not DNA, RNA or PROTEIN.
    """
    # Work on a private copy: the options set below must not leak into a
    # dict the caller may reuse for another call.
    if params is None:
        params = {}
    else:
        params = dict(params)

    if moltype == DNA or moltype == RNA:
        params['-nt'] = True
    elif moltype == PROTEIN:
        params['-nt'] = False
    else:
        raise ValueError(
            "FastTree does not support moltype: %s" % moltype.label)

    if best_tree:
        params['-slow'] = True

    # Create mapping between abbreviated IDs and full IDs, then rebuild a
    # collection keyed by the abbreviated IDs to feed to the app.
    int_map, int_keys = aln.getIntMap()
    int_map = SequenceCollection(int_map, MolType=moltype)

    app = FastTree(params=params)
    result = app(int_map.toFasta())
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    # The tree text has been read in full and the result object never
    # leaves this function, so clean it up here.
    result.cleanUp()

    # Remap tip names back to the original IDs.
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    return tree
def test_seqs_to_flows(self):
    """seqs_to_flows should return the expected flowgrams, both with
    default arguments and with explicit probs and bin_size."""
    seq_data = [('a', 'ATCGT'), ('b', 'ACCCAG'), ('c', 'GTAATG')]
    collection = SequenceCollection(seq_data)
    # Both calls below are expected to yield these same flowgram strings.
    expected = [
        '0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0',
        '0.0 1.0 3.0 0.0 0.0 1.0 0.0 1.0',
        '0.0 0.0 0.0 1.0 1.0 2.0 0.0 0.0 1.0 0.0 0.0 1.0',
    ]

    # Default arguments.
    flows = seqs_to_flows(collection.items())
    assert isinstance(flows, FlowgramCollection)
    for flow, wanted in zip(flows, expected):
        self.assertEqual(flow, wanted)

    # Explicit probs and bin_size.
    probs = {
        0: [1.0, 0, 0, 0, 0],
        1: [0, 1.0, 0, 0, 0],
        2: [0, 0, 1.0, 0, 0],
        3: [0, 0, 0, 1.0, 0],
    }
    flows = seqs_to_flows(collection.items(), probs=probs, bin_size=1.0)
    assert isinstance(flows, FlowgramCollection)
    for flow, wanted in zip(flows, expected):
        self.assertEqual(flow, wanted)
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Build a tree from an alignment using FastTree.

    aln: alignment object providing getIntMap().
    moltype: DNA, RNA or PROTEIN; decides the value of the "-nt" option.
    best_tree: if True, the "-slow" option is switched on.
    params: dict of extra parameters for the FastTree app controller.
        A copy is used internally; the caller's dict is left untouched.

    Returns the tree parsed with the PhyloNode constructor, with tip names
    mapped back to the original sequence IDs.

    Raises ValueError if moltype is not DNA, RNA or PROTEIN.
    """
    # Copy so that the "-nt"/"-slow" entries added below do not show up in
    # the dict owned by the caller.
    app_params = {} if params is None else dict(params)

    if moltype == DNA or moltype == RNA:
        app_params["-nt"] = True
    elif moltype == PROTEIN:
        app_params["-nt"] = False
    else:
        raise ValueError("FastTree does not support moltype: %s" % moltype.label)

    if best_tree:
        app_params["-slow"] = True

    # Create mapping between abbreviated IDs and full IDs.
    int_map, int_keys = aln.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    app = FastTree(params=app_params)
    result = app(int_map.toFasta())
    tree = DndParser(result["Tree"].read(), constructor=PhyloNode)
    # The result object is local to this function and fully read; clean it
    # up rather than leaving it behind.
    result.cleanUp()

    # Remap tip names.
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    return tree
def test_seqs_to_flows(self):
    """seqs_to_flows should build the expected FlowgramCollection from a
    list of seqs, with and without explicit probs."""
    a = SequenceCollection([("a", "ATCGT"), ("b", "ACCCAG"), ("c", "GTAATG")])
    flowgram_strings = (
        "0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0",
        "0.0 1.0 3.0 0.0 0.0 1.0 0.0 1.0",
        "0.0 0.0 0.0 1.0 1.0 2.0 0.0 0.0 1.0 0.0 0.0 1.0",
    )

    def check(result):
        # Shared verification: collection type, then each flowgram in order.
        assert isinstance(result, FlowgramCollection)
        for obs, exp in zip(result, flowgram_strings):
            self.assertEqual(obs, exp)

    check(seqs_to_flows(a.items()))

    probs = {}
    probs[0] = [1.0, 0, 0, 0, 0]
    probs[1] = [0, 1.0, 0, 0, 0]
    probs[2] = [0, 0, 1.0, 0, 0]
    probs[3] = [0, 0, 0, 1.0, 0]
    check(seqs_to_flows(a.items(), probs=probs, bin_size=1.0))
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Build a tree from an alignment using Clustalw.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: cogent.core.moltype.MolType object; DNA or RNA selects
        "-type" "d", PROTEIN selects "p".
    best_tree: if True (default: False), the bootstrap options
        ("-bootstrap", "-seed", "-bootlabels") are enabled for every key not
        already present in params; otherwise "-tree" is enabled.
    params: dict of parameters to pass in to the Clustalw app controller.
        It is only read, never modified.

    Returns the tree parsed with the PhyloNode constructor, with tip names
    mapped back to the original sequence IDs.

    Raises ValueError if moltype is not DNA, RNA or PROTEIN.
    """
    if moltype == DNA or moltype == RNA:
        seq_type = "d"
    elif moltype == PROTEIN:
        seq_type = "p"
    else:
        raise ValueError("moltype must be DNA, RNA, or PROTEIN")

    # Create instance of app controller, enable tree, disable alignment.
    app = Clustalw(InputHandler="_input_as_multiline_string", params=params, WorkingDir="/tmp")
    app.Parameters["-align"].off()
    # Set the sequence type on the app itself. Writing it into `params` at
    # this point (as was done before) is too late: the app has already been
    # built from that dict, and when params is None the value is simply lost.
    app.Parameters["-type"].on(seq_type)

    user_params = params or {}

    # best_tree -> bootstrap
    if best_tree:
        if "-bootstrap" not in user_params:
            app.Parameters["-bootstrap"].on(1000)
        if "-seed" not in user_params:
            app.Parameters["-seed"].on(randint(0, 1000))
        if "-bootlabels" not in user_params:
            # NOTE(review): verify "nodes" against the values Clustalw
            # accepts for -bootlabels.
            app.Parameters["-bootlabels"].on("nodes")
    else:
        app.Parameters["-tree"].on()

    # Setup mapping. Clustalw clips identifiers. We will need to remap them.
    int_map, int_keys = SequenceCollection(aln).getIntMap()
    int_map = SequenceCollection(int_map)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree and restore the original tip names.
    tree = DndParser(result["Tree"].read(), constructor=PhyloNode)
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()

    return tree
def add_seqs_to_alignment(seqs, aln, moltype, params=None):
    """Add sequences to an existing alignment using Clustalw profiles.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that
        can be used to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: MolType used to build every collection and the result.
    params: dict of parameters to pass in to the Clustalw app controller.

    Returns an Alignment keyed by the original IDs of both inputs.
    """
    # New sequences: abbreviated IDs for the app, id_lookup to undo it.
    new_short, id_lookup = SequenceCollection(seqs, MolType=moltype).getIntMap()
    new_short = SequenceCollection(new_short, MolType=moltype)

    # Existing alignment: same idea, with a distinct "seqn_" prefix.
    existing = Alignment(aln, MolType=moltype)
    existing_short, existing_lookup = existing.getIntMap(prefix="seqn_")
    existing_short = Alignment(existing_short, MolType=moltype)

    # One lookup table covering the IDs of both inputs.
    id_lookup.update(existing_lookup)

    # Create Clustalw app.
    clustalw = Clustalw(
        InputHandler="_input_as_multiline_string",
        params=params,
        SuppressStderr=True,
    )
    clustalw.Parameters["-align"].off()
    clustalw.Parameters["-infile"].off()
    clustalw.Parameters["-sequences"].on()

    # The existing alignment goes in as profile1, the new sequences as
    # profile2; each is written to a temp file by the app controller.
    profile1_path = clustalw._tempfile_as_multiline_string(existing_short.toFasta())
    clustalw.Parameters["-profile1"].on(profile1_path)
    profile2_path = clustalw._tempfile_as_multiline_string(new_short.toFasta())
    clustalw.Parameters["-profile2"].on(profile2_path)

    # All input is supplied through the profile parameters.
    run_result = clustalw()

    # Parse the output and restore the original IDs.
    parsed = dict(ClustalParser(run_result["Align"].readlines()))
    relabelled = {}
    for short_id in parsed:
        relabelled[id_lookup[short_id]] = parsed[short_id]
    combined = Alignment(relabelled, MolType=moltype)

    # Clean up the result and both profile temp files.
    run_result.cleanUp()
    remove(clustalw.Parameters["-profile1"].Value)
    remove(clustalw.Parameters["-profile2"].Value)

    return combined
def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None):
    """Build a tree with bootstrap support values using Clustalw.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    seed: an integer seed value; only applied when '-seed' is still off
        after params were applied. If None, a random integer between 0 and
        1000 is drawn.
    num_trees: an integer, number of trees to bootstrap against; only
        applied when '-bootstrap' is still off. Defaults to 1000.
    params: dict of parameters to pass in to the Clustalw app controller.

    Returns the tree parsed with the PhyloNode constructor, with tip names
    mapped back to the original sequence IDs.
    """
    # Create the controller; alignment and plain tree building are disabled.
    clustalw = Clustalw(
        InputHandler='_input_as_multiline_string',
        params=params,
        WorkingDir='/tmp',
    )
    clustalw.Parameters['-align'].off()
    clustalw.Parameters['-tree'].off()

    # Fill in bootstrap settings that params did not already turn on.
    if clustalw.Parameters['-bootstrap'].isOff():
        clustalw.Parameters['-bootstrap'].on(
            1000 if num_trees is None else num_trees)

    if clustalw.Parameters['-seed'].isOff():
        if seed is None:
            seed = randint(0, 1000)
        clustalw.Parameters['-seed'].on(seed)

    if clustalw.Parameters['-bootlabels'].isOff():
        clustalw.Parameters['-bootlabels'].on("node")

    # Clustalw clips identifiers, so run it on abbreviated IDs and keep the
    # lookup table needed to restore the originals.
    short_seqs, id_lookup = SequenceCollection(aln).getIntMap()
    run_result = clustalw(SequenceCollection(short_seqs).toFasta())

    # Build tree and restore tip names.
    tree = DndParser(run_result['Tree'].read(), constructor=PhyloNode)
    for tip in tree.tips():
        tip.Name = id_lookup[tip.Name]

    # Clean up
    run_result.cleanUp()
    return tree
def clustal_from_alignment(aln, interleave_len=None):
    """Return the alignment as a string in Clustal format.

    aln: can be an Alignment object or a dict.
    interleave_len: sequence line width. None (default) writes every
        sequence on a single line.

    Sequences are written in aln.RowOrder when aln has that attribute,
    otherwise in sorted key order.

    Returns '' when aln is empty.

    Raises ValueError if the sequences are not all the same length, or if
    interleave_len is given but smaller than 1.
    """
    if not aln:
        return ''

    # A width of zero would never advance through the alignment (endless
    # loop); a negative width is meaningless. Reject both up front.
    if interleave_len is not None and interleave_len < 1:
        raise ValueError("interleave_len must be a positive integer.")

    # Get seq output order. Only a missing attribute triggers the fallback;
    # any other error is allowed to surface.
    try:
        order = aln.RowOrder
    except AttributeError:
        order = sorted(aln.keys())

    seqs = SequenceCollection(aln)
    if seqs.isRagged():
        raise ValueError(
            "Sequences in alignment are not all the same length." +
            "Cannot generate Clustal format.")

    aln_len = seqs.SeqLen
    # Every label is padded to the longest label plus four spaces.
    max_spaces = max(len(label) for label in seqs.Names) + 4
    ordered_seqs = [seqs.NamedSeqs[label] for label in order]

    def format_row(label, seq_text):
        return "%s%s%s" % (label, ' ' * (max_spaces - len(label)), seq_text)

    clustal_list = ["CLUSTAL\n"]
    if interleave_len is None:
        clustal_list.extend(
            [format_row(x, y) for x, y in zip(order, ordered_seqs)])
        clustal_list.append("")
    else:
        # One block of rows per window, each block followed by a blank line.
        for start in range(0, aln_len, interleave_len):
            stop = start + interleave_len
            clustal_list.extend(
                [format_row(x, y[start:stop])
                 for x, y in zip(order, ordered_seqs)])
            clustal_list.append("")

    return '\n'.join(clustal_list)
def load_from_clustal(data, seq_constructor=Sequence, strict=True):
    """Load Clustal-formatted data into an Alignment or SequenceCollection.

    data: input handed to ClustalParser.
    seq_constructor: callable applied to every parsed sequence.
    strict: passed through to ClustalParser.

    Returns an Alignment (MolType=BYTES) when at least one record was
    parsed and all sequences have the same length; otherwise a
    SequenceCollection (MolType=BYTES).
    """
    records = []
    for name, seq in ClustalParser(data, strict):
        records.append((name, seq_constructor(seq)))

    seq_lengths = [len(seq) for _, seq in records]
    same_length = bool(seq_lengths) and max(seq_lengths) == min(seq_lengths)

    if not same_length:
        return SequenceCollection(records, MolType=BYTES)
    return Alignment(records, MolType=BYTES)
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Build a tree from an alignment using Muscle.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: cogent.core.moltype.MolType object; its label is passed to
        Muscle as '-seqtype'.
    best_tree: unsupported (ignored).
    params: dict of parameters to pass in to the Muscle app controller.

    Returns the tree parsed with the PhyloNode constructor, with tip names
    mapped back to the original sequence IDs.
    """
    # Create the app controller and switch on cluster-only mode, writing
    # the first tree to a temp file inside the app's working directory.
    muscle = Muscle(
        InputHandler='_input_as_multiline_string',
        params=params,
        WorkingDir='/tmp',
    )
    muscle.Parameters['-clusteronly'].on()
    tree_path = get_tmp_filename(muscle.WorkingDir)
    muscle.Parameters['-tree1'].on(tree_path)
    muscle.Parameters['-seqtype'].on(moltype.label)

    # Run on abbreviated IDs; id_lookup maps them back to the full IDs.
    short_seqs, id_lookup = SequenceCollection(aln, MolType=moltype).getIntMap()
    fasta_text = SequenceCollection(short_seqs, MolType=moltype).toFasta()

    # Collect result
    run_result = muscle(fasta_text)

    # Build tree and restore the original tip names.
    tree = DndParser(run_result['Tree1Out'].read(), constructor=PhyloNode)
    for leaf in tree.tips():
        leaf.Name = id_lookup[leaf.Name]

    # Clean up
    run_result.cleanUp()
    return tree
def align_unaligned_seqs(seqs, moltype, params=None, accurate=False):
    """Align unaligned sequences with Mafft.

    seqs: data from which a SequenceCollection can be built.
    moltype: MolType object; its upper-cased label is looked up in
        MOLTYPE_MAP to pick the app parameter that is switched on.
    params: dict of parameters to pass in to the Mafft app controller.
    accurate: if True, '--globalpair' is switched on and '--maxiterate' is
        set to 1000 (more accurate alignment, sacrificing performance).

    Returns an Alignment whose sequences carry the original IDs.
    """
    # Abbreviated IDs go to the app; id_lookup restores the originals.
    short_seqs, id_lookup = SequenceCollection(seqs, MolType=moltype).getIntMap()
    app_input = SequenceCollection(short_seqs, MolType=moltype)

    # Create Mafft app.
    mafft = Mafft(InputHandler='_input_as_multiline_string', params=params)

    # Turn on correct moltype.
    mafft.Parameters[MOLTYPE_MAP[moltype.label.upper()]].on()

    # Do not report progress.
    mafft.Parameters['--quiet'].on()

    if accurate:
        mafft.Parameters['--globalpair'].on()
        mafft.Parameters['--maxiterate'].Value = 1000

    run_result = mafft(app_input.toFasta())

    # Parse the FASTA written to stdout and restore the original IDs.
    parsed = dict(parse_fasta(run_result['StdOut']))
    relabelled = dict(
        (id_lookup[short_id], aligned) for short_id, aligned in parsed.items())
    aligned_seqs = Alignment(relabelled, MolType=moltype)

    # Clean up
    run_result.cleanUp()
    return aligned_seqs
def cdhit_from_seqs(seqs, moltype, params=None):
    """Return the CD-HIT results given seqs.

    seqs: dict like collection of sequences
    moltype: cogent.core.moltype object
    params: cd-hit parameters. The dict is copied, so the caller's dict is
        never modified (the '-o' output argument is filled in on the copy).

    Returns a SequenceCollection built from the app's 'FASTA' output.

    NOTE: This method will call CD_HIT if moltype is PROTEIN, CD_HIT_EST
    if moltype is RNA/DNA, and raise ValueError if any other moltype is
    passed.
    """
    # keys are not remapped. Tested against seq_ids of 100char length
    seqs = SequenceCollection(seqs, MolType=moltype)

    # Set up params on a private copy and make sure the output argument is
    # set; a temp filename written into the caller's dict would otherwise
    # be silently reused on their next call.
    if params is None:
        params = {}
    else:
        params = dict(params)
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    # Call the correct version of cd-hit based on moltype.
    working_dir = get_tmp_filename()
    if moltype is PROTEIN:
        app = CD_HIT(WorkingDir=working_dir, params=params)
    elif moltype is RNA or moltype is DNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    else:
        raise ValueError("Moltype must be either PROTEIN, RNA, or DNA")

    # grab result
    res = app(seqs.toFasta())
    new_seqs = dict(MinimalFastaParser(res['FASTA'].readlines()))

    # Perform cleanup: the result, the working directory, and the
    # '.bak.clstr' file found next to the output path.
    res.cleanUp()
    shutil.rmtree(working_dir)
    remove(params['-o'] + '.bak.clstr')

    return SequenceCollection(new_seqs, MolType=moltype)
def getSeqCollection(self, feature_types=None, where_feature=None):
    """Return the members' unaligned sequences as a DNA SequenceCollection.

    When feature_types is given, each member's sequence is obtained from
    member.getAnnotatedSeq(feature_types, where_feature); otherwise
    member.Seq is used. Members whose sequence is None are left out.
    """
    named_seqs = []
    for member in self.Members:
        if not feature_types:
            seq = member.Seq
        else:
            seq = member.getAnnotatedSeq(feature_types, where_feature)
        if seq is not None:
            named_seqs.append((seq.Name, seq))
    return SequenceCollection(data=named_seqs, MolType=DNA)
def align_unaligned_seqs(seqs, moltype, params=None):
    """Run Clustalw on unaligned sequences and return the alignment.

    seqs: cogent.core.alignment.SequenceCollection object, or data that
        can be used to build one.
    moltype: a MolType object. DNA, RNA, or PROTEIN.
    params: dict of parameters to pass in to the Clustalw app controller.

    Result will be a cogent.core.alignment.Alignment object keyed by the
    original sequence IDs.
    """
    collection = SequenceCollection(seqs, MolType=moltype)
    # abbreviated holds the sequences under short IDs; full_ids maps each
    # short ID back to the ID it replaced.
    abbreviated, full_ids = collection.getIntMap()
    app_input = SequenceCollection(abbreviated, MolType=moltype)

    aligner = Clustalw(params=params, InputHandler='_input_as_multiline_string')
    result = aligner(app_input.toFasta())

    # Read the alignment out of the result and put the full IDs back.
    records = dict(ClustalParser(result['Align'].readlines()))
    by_full_id = {}
    for short_id in records:
        by_full_id[full_ids[short_id]] = records[short_id]
    final_alignment = Alignment(by_full_id, MolType=moltype)

    # Clean up
    result.cleanUp()
    return final_alignment
def align_unaligned_seqs(seqs, moltype, params=None):
    """Return an Alignment object built by running Muscle on seqs.

    seqs: SequenceCollection object, or data that can be used to build one.
    moltype: a MolType object. DNA, RNA, or PROTEIN.
    params: dict of parameters to pass in to the Muscle app controller.
        The dict is copied; '-out' is always set to a fresh temp filename
        on the copy, so the caller's dict is never modified.

    Result will be an Alignment object keyed by the original sequence IDs.
    """
    # Private copy: the temp '-out' filename added below must not end up in
    # a dict the caller may reuse.
    if params:
        params = dict(params)
    else:
        params = {}

    # Create SequenceCollection object from seqs.
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs.
    int_map, int_keys = seq_collection.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Get temporary filename for the app output.
    params['-out'] = get_tmp_filename()

    # Create Muscle app and run it on the abbreviated-ID sequences.
    app = Muscle(InputHandler='_input_as_multiline_string', params=params)
    res = app(int_map.toFasta())

    # Get alignment as dict out of results.
    alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))

    # Make new dict mapping original IDs.
    new_alignment = dict(
        (int_keys[k], v) for k, v in alignment.items())

    # Create an Alignment object from alignment dict.
    new_alignment = Alignment(new_alignment, MolType=moltype)

    # Clean up
    res.cleanUp()
    return new_alignment
def test_seqs_to_flows(self):
    """seqs_to_flows should convert seqs (optionally with probs and
    bin_size) into a FlowgramCollection with the expected flowgrams."""
    a = SequenceCollection([('a', 'ATCGT'), ('b', 'ACCCAG'), ('c', 'GTAATG')])
    expected_flows = [
        '0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0',
        '0.0 1.0 3.0 0.0 0.0 1.0 0.0 1.0',
        '0.0 0.0 0.0 1.0 1.0 2.0 0.0 0.0 1.0 0.0 0.0 1.0',
    ]
    probs = {0: [1.0, 0, 0, 0, 0],
             1: [0, 1.0, 0, 0, 0],
             2: [0, 0, 1.0, 0, 0],
             3: [0, 0, 0, 1.0, 0]}

    # First without extra arguments, then with explicit probs/bin_size;
    # the expected flowgrams are the same for both calls.
    for extra_args in ({}, {'probs': probs, 'bin_size': 1.0}):
        flows = seqs_to_flows(a.items(), **extra_args)
        assert isinstance(flows, FlowgramCollection)
        for f, i in zip(flows, expected_flows):
            self.assertEqual(f, i)
def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None):
    """Return a tree from aln with bootstrap support values (Clustalw).

    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    seed: an integer, seed value to use. Ignored when params already
        turned '-seed' on; a random integer between 0 and 1000 is used
        when it is None.
    num_trees: an integer, number of trees to bootstrap against. Ignored
        when params already turned '-bootstrap' on; 1000 when None.
    params: dict of parameters to pass in to the Clustalw app controller.

    The result is the tree parsed with the PhyloNode constructor; its tip
    names are the original sequence IDs.
    """
    # Create instance of controller; enable bootstrap, disable alignment
    # and plain tree.
    app = Clustalw(InputHandler='_input_as_multiline_string',
                   params=params,
                   WorkingDir='/tmp')
    for switched_off in ('-align', '-tree'):
        app.Parameters[switched_off].off()

    bootstrap_param = app.Parameters['-bootstrap']
    if bootstrap_param.isOff():
        if num_trees is None:
            num_trees = 1000
        bootstrap_param.on(num_trees)

    seed_param = app.Parameters['-seed']
    if seed_param.isOff():
        if seed is None:
            seed = randint(0, 1000)
        seed_param.on(seed)

    labels_param = app.Parameters['-bootlabels']
    if labels_param.isOff():
        labels_param.on("node")

    # Setup mapping. Clustalw clips identifiers. We will need to remap them.
    abbreviated, full_ids = SequenceCollection(aln).getIntMap()
    app_input = SequenceCollection(abbreviated)

    # Collect result
    result = app(app_input.toFasta())

    # Build tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    for tip in tree.tips():
        tip.Name = full_ids[tip.Name]

    # Clean up
    result.cleanUp()
    return tree
def LoadSeqs(filename=None, format=None, data=None, moltype=None, name=None, aligned=True, label_to_name=None, parser_kw={}, constructor_kw={}, **kw):
    """Initialize an alignment or collection of sequences.

    Arguments:
        - filename: name of the sequence file. When given, data must be
          None and the sequences are read with FromFilenameParser.
        - format: format of the sequence file; must be None when loading
          from data. (Presumably inferred from the filename suffix by the
          parser when None -- confirm against FromFilenameParser.)
        - data: explicit provision of sequences; required when filename is
          None.
        - moltype: the MolType, eg DNA, PROTEIN
        - name: passed to the constructor as Name.
        - aligned: if callable, it is used as the constructor (e.g. can
          pass in DenseAlignment or CodonAlignment). Otherwise a true value
          results in an Alignment and a false value in a
          SequenceCollection.
        - label_to_name: function for converting original name into another
          name, handed to the constructor. For example, to keep only the
          label of a FASTA header:
              label_to_name=lambda x: x.split()[0]
          To look up names in a dict:
              label_to_name = lambda x: d.get(x, default_name)
        - parser_kw: keyword arguments for FromFilenameParser.
        - constructor_kw: extra keyword arguments for the constructor.
        - kw: must be empty when loading from data.
    """
    if filename is not None:
        assert data is None, (filename, data)
        data = list(FromFilenameParser(filename, format, **parser_kw))
    else:
        assert data is not None
        assert format is None
        assert not kw, kw

    # the following is a temp hack until we have the load API sorted out.
    if not aligned:
        # generic case: return SequenceCollection
        return SequenceCollection(data, MolType=moltype, Name=name,
                                  label_to_name=label_to_name,
                                  **constructor_kw)

    # aligned is either a constructor to call, or just a true flag meaning
    # "use Alignment".
    if hasattr(aligned, '__call__'):
        build = aligned
    else:
        build = Alignment
    return build(data=data, MolType=moltype, Name=name,
                 label_to_name=label_to_name, **constructor_kw)
def cdhit_clusters_from_seqs(seqs, moltype, params=None):
    """Return the CD-HIT clusters given seqs.

    seqs: dict like collection of sequences
    moltype: cogent.core.moltype object
    params: cd-hit parameters. The dict is copied, so the caller's dict is
        never modified (the '-o' output argument is filled in on the copy).

    Returns a list of clusters, each a list of the original sequence IDs.

    NOTE: This method will call CD_HIT if moltype is PROTEIN, CD_HIT_EST
    if moltype is RNA/DNA, and raise ValueError if any other moltype is
    passed.
    """
    seqs = SequenceCollection(seqs, MolType=moltype)
    # The app is run on abbreviated IDs; int_keys maps them back to the
    # full IDs when the clusters are read.
    int_map, int_keys = seqs.getIntMap()
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Set up params on a private copy and make sure the output argument is
    # set; a temp filename written into the caller's dict would otherwise
    # be silently reused on their next call.
    if params is None:
        params = {}
    else:
        params = dict(params)
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    # Call the correct version of cd-hit based on moltype.
    working_dir = get_tmp_filename()
    if moltype is PROTEIN:
        app = CD_HIT(WorkingDir=working_dir, params=params)
    elif moltype is RNA or moltype is DNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    else:
        raise ValueError("Moltype must be either PROTEIN, RNA, or DNA")

    # grab result
    res = app(int_map.toFasta())
    clusters = parse_cdhit_clstr_file(res['CLSTR'].readlines())

    # Translate every cluster member back to its original ID.
    remapped_clusters = [[int_keys[i] for i in c] for c in clusters]

    # Perform cleanup: the result, the working directory, and the
    # '.bak.clstr' file found next to the output path.
    res.cleanUp()
    shutil.rmtree(working_dir)
    remove(params['-o'] + '.bak.clstr')

    return remapped_clusters
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Return a tree built by Muscle from the sequences in aln.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: cogent.core.moltype.MolType object; its label is handed to
        Muscle as '-seqtype'.
    best_tree: unsupported (ignored).
    params: dict of parameters to pass in to the Muscle app controller.

    The result is the tree parsed with the PhyloNode constructor; its tip
    names are the original sequence IDs.
    """
    # Create instance of app controller with clustering switched on; the
    # first tree is written to a temp file in the app's working directory.
    app = Muscle(params=params,
                 InputHandler='_input_as_multiline_string',
                 WorkingDir='/tmp')
    app.Parameters['-cluster'].on()
    app.Parameters['-tree1'].on(get_tmp_filename(app.WorkingDir))
    app.Parameters['-seqtype'].on(moltype.label)

    # Create mapping between abbreviated IDs and full IDs, and the
    # abbreviated-ID collection that is actually sent to the app.
    collection = SequenceCollection(aln, MolType=moltype)
    abbreviated, full_ids = collection.getIntMap()
    app_input = SequenceCollection(abbreviated, MolType=moltype)

    # Collect result
    outcome = app(app_input.toFasta())

    # Build tree, then rename tips back to the full IDs.
    tree = DndParser(outcome['Tree1Out'].read(), constructor=PhyloNode)
    for tip in tree.tips():
        tip.Name = full_ids[tip.Name]

    # Clean up
    outcome.cleanUp()
    return tree
def align_unaligned_seqs(seqs, moltype, params=None, accurate=False):
    """Return an Alignment produced by running Mafft on seqs.

    seqs: data from which a SequenceCollection can be built.
    moltype: MolType object; MOLTYPE_MAP translates its upper-cased label
        into the app parameter that gets switched on.
    params: dict of parameters to pass in to the Mafft app controller.
    accurate: if True, '--globalpair' is switched on and '--maxiterate' is
        set to 1000 (more accurate alignment, sacrificing performance).

    The returned Alignment is keyed by the original sequence IDs.
    """
    collection = SequenceCollection(seqs, MolType=moltype)
    # Abbreviated IDs for the app, plus the table to translate them back.
    abbreviated, full_ids = collection.getIntMap()
    app_input = SequenceCollection(abbreviated, MolType=moltype)

    # Create Mafft app.
    app = Mafft(params=params, InputHandler='_input_as_multiline_string')

    # Turn on correct moltype.
    moltype_flag = MOLTYPE_MAP[moltype.label.upper()]
    app.Parameters[moltype_flag].on()

    # Do not report progress.
    app.Parameters['--quiet'].on()

    if accurate:
        app.Parameters['--globalpair'].on()
        app.Parameters['--maxiterate'].Value = 1000

    # Get results using the abbreviated-ID sequences as input to app.
    outcome = app(app_input.toFasta())

    # Get alignment as dict out of results.
    records = dict(MinimalFastaParser(outcome['StdOut'].readlines()))

    # Make new dict mapping original IDs.
    by_full_id = {}
    for short_id, aligned in list(records.items()):
        by_full_id[full_ids[short_id]] = aligned

    final_alignment = Alignment(by_full_id, MolType=moltype)

    # Clean up
    outcome.cleanUp()
    return final_alignment
def align_unaligned_seqs(seqs, moltype, params=None):
    """Return an Alignment object built by running Muscle on seqs.

    seqs: SequenceCollection object, or data that can be used to build one.
    moltype: a MolType object. DNA, RNA, or PROTEIN.
    params: dict of parameters to pass in to the Muscle app controller.
        A copy is used internally and '-out' is always set to a fresh temp
        filename on that copy; the caller's dict is left untouched.

    Result will be an Alignment object keyed by the original sequence IDs.
    """
    # Copy first so that the temp '-out' filename set below does not leak
    # into the caller's dict.
    app_params = dict(params) if params else {}

    # Create SequenceCollection object from seqs.
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs.
    int_map, int_keys = seq_collection.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Get temporary filename for the app output.
    app_params['-out'] = get_tmp_filename()

    # Create Muscle app.
    app = Muscle(InputHandler='_input_as_multiline_string', params=app_params)

    # Get results using int_map as input to app.
    res = app(int_map.toFasta())

    # Get alignment as dict out of results.
    alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))

    # Make new dict mapping original IDs.
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[int_keys[k]] = v

    # Create an Alignment object from alignment dict.
    new_alignment = Alignment(new_alignment, MolType=moltype)

    # Clean up
    res.cleanUp()
    return new_alignment
def create_locarnap_alignment(seqs, moltype, struct=False, params=None):
    """Return mlocarna results given an unaligned SequenceCollection.

    - seqs: A SequenceCollection object or something that behaves like one.
    - moltype: cogent.core.moltype object.
    - struct: Boolean whether or not to also output the structure string
      collected from the probabilistic alignment file.
    - params: dict of parameters to pass in to the MLocarna app controller.

    Returns the Alignment, or the tuple (Alignment, structure) when struct
    is true.
    """
    # Construct SequenceCollection object.
    seqs = SequenceCollection(seqs, MolType=moltype)

    # The app is run on an int map; int_keys translates the IDs back.
    int_map, int_keys = seqs.getIntMap()
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Create application.
    app = MLocarna(InputHandler='_input_as_multiline_string', params=params)

    # Get temporary directory to write all mlocarna files.
    mlocarna_dir = get_tmp_filename(suffix='')
    app.Parameters['--tgtdir'].on(mlocarna_dir)

    # Set parameters to run locarna-p.
    app.Parameters['--write-structure'].on()
    app.Parameters['--probabilistic'].on()
    app.Parameters['--consistency-transformation'].on()

    res = app(int_map.toFasta())

    # Get the structure from the results if necessary.
    if struct:
        # NOTE(review): 'U' (universal newlines) is a Python 2 era mode; it
        # was removed in Python 3.11.
        structfile = open(res['ProbabilisticAlignment'].name, 'U')
        try:
            pieces = []
            take_next = True
            for line in structfile:
                line = line.strip()
                # Structure lines (--write-structure) start with '.' or '('.
                if line and line[:1] in ('.', '('):
                    # The structure is written both above and below each
                    # alignment block, so only every other structure line
                    # is kept.
                    if take_next:
                        pieces.append(line)
                    take_next = not take_next
        finally:
            # Always release the handle, even if reading fails.
            structfile.close()
        structure = "".join(pieces)

    aligned = dict(ClustalParser(res['ClustalAlignment']))

    # Make new dict mapping original IDs; IDs without a mapping are kept.
    new_alignment = {}
    for k, v in aligned.items():
        new_alignment[int_keys.get(k, k)] = v

    # Create an Alignment object from alignment dict.
    new_alignment = Alignment(new_alignment, MolType=moltype)

    # Clean up after MlocARNA.
    res.cleanUp()
    shutil.rmtree(mlocarna_dir)

    # Output alignment and structure if asked for, else just the alignment.
    if struct:
        return new_alignment, structure
    return new_alignment
def align_two_alignments(aln1, aln2, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
        used to build them.
    params: dict of parameters to pass in to the Muscle app controller. The
        dict is copied, so the caller's object is left untouched.
    """
    # Copy before adding '-out'/'-profile' so the caller's dict is not mutated.
    params = dict(params) if params else {}

    # Abbreviated IDs get a per-alignment prefix; the key dicts are used
    # below to map the output labels back to the original IDs.
    aln1_int_map, aln1_int_keys = \
        SequenceCollection(aln1).getIntMap(prefix='aln1_')
    aln1_int_map = SequenceCollection(aln1_int_map)
    aln2_int_map, aln2_int_keys = \
        SequenceCollection(aln2).getIntMap(prefix='aln2_')
    aln2_int_map = SequenceCollection(aln2_int_map)

    # Set output and profile options.
    params.update({'-out': get_tmp_filename(), '-profile': True})

    tmp_paths = []
    try:
        # Save both alignments to tmp files (aln1 first, then aln2).
        for collection in (aln1_int_map, aln2_int_map):
            path = get_tmp_filename()
            with open(path, 'w') as handle:
                # Registered once the file exists so it is always removed.
                tmp_paths.append(path)
                handle.write(collection.toFasta())

        # Create Muscle app and get results.
        app = Muscle(InputHandler='_input_as_multifile', params=params)
        res = app(tuple(tmp_paths))
        try:
            alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))
        finally:
            res.cleanUp()
    finally:
        for path in tmp_paths:
            remove(path)

    # Make new dict mapping original IDs.
    new_alignment = {}
    for k, v in alignment.items():
        if k in aln1_int_keys:
            new_alignment[aln1_int_keys[k]] = v
        else:
            new_alignment[aln2_int_keys[k]] = v

    return Alignment(new_alignment)
def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
    """Aligns new sequences against an existing alignment using Mafft.

    seqs: a cogent.core.sequence.Sequence object, or data that can be used
        to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: MolType object; its upper-cased label is looked up in
        MOLTYPE_MAP to choose the Mafft parameter that is switched on.
    params: dict of parameters to pass in to the Mafft app controller.
    accurate: if True, turns on --globalpair and sets --maxiterate to 1000.

    Returns an Alignment keyed by the original sequence names.
    """
    # New sequences, re-keyed with abbreviated IDs.
    short_seqs, id_lookup = \
        SequenceCollection(seqs, MolType=moltype).getIntMap()
    short_seqs = SequenceCollection(short_seqs, MolType=moltype)

    # Existing alignment, re-keyed with 'seqn_'-prefixed abbreviated IDs.
    aln = Alignment(aln, MolType=moltype)
    short_aln, aln_lookup = aln.getIntMap(prefix='seqn_')
    short_aln = Alignment(short_aln, MolType=moltype)

    # A single lookup table covers both inputs.
    id_lookup.update(aln_lookup)

    mafft = Mafft(InputHandler='_input_as_multiline_string',
                  params=params, SuppressStderr=True)
    # Turn on correct moltype and silence progress reporting.
    mafft.Parameters[MOLTYPE_MAP[moltype.label.upper()]].on()
    mafft.Parameters['--quiet'].on()
    # The existing alignment is handed over as a seed alignment file.
    seed_file = mafft._tempfile_as_multiline_string(short_aln.toFasta())
    mafft.Parameters['--seed'].on(seed_file)
    # More accurate alignment, sacrificing performance.
    if accurate:
        mafft.Parameters['--globalpair'].on()
        mafft.Parameters['--maxiterate'].Value = 1000

    result = mafft(short_seqs.toFasta())
    parsed = dict(MinimalFastaParser(result['StdOut'].readlines()))

    # Output labels may carry a '_seed_' marker (presumably added for seed
    # sequences); strip it before looking up the original ID.
    renamed = dict((id_lookup[label.replace('_seed_', '')], seq)
                   for label, seq in parsed.items())
    new_alignment = Alignment(renamed, MolType=moltype)

    # Clean up result files and the seed tmp file.
    result.cleanUp()
    remove(mafft.Parameters['--seed'].Value)
    return new_alignment
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None, cmbuild_params=None, cmalign_params=None):
    """Aligns the sequences in seq_path to the template alignment.

    seq_path: path to a FASTA file with the candidate sequences.
    result_path: if given, the alignment is written there in FASTA format
        and None is returned.
    log_path: if given, the cmbuild/cmalign parameters used are written
        there.
    failure_path: accepted but not used by this method.
    cmbuild_params, cmalign_params: optional dicts of extra parameters.
        They are copied, so the dicts passed in are not modified.

    Reads self.Params['template_filepath'] and self.Params['moltype'].
    Returns an Alignment keyed by the original sequence IDs when
    result_path is None. Raises ValueError when parsing the template
    raises RecordError.
    """
    log_params = []

    # Load candidate sequences; the handle is closed once parsing is done.
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = dict(MinimalFastaParser(seq_file))

    # Load template sequences.
    try:
        with open(self.Params['template_filepath'], 'U') as template_file:
            info, template_alignment, struct = list(MinimalRfamParser(
                template_file, seq_constructor=ChangedSequence))[0]
    except RecordError:
        raise ValueError("Template alignment must be in Stockholm format with corresponding secondary structure annotation when using InfernalAligner.")

    moltype = self.Params['moltype']

    # Need to make separate mapping for unaligned sequences.
    unaligned = SequenceCollection(candidate_sequences, MolType=moltype)
    int_map, int_keys = unaligned.getIntMap(prefix='unaligned_')
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Turn on --gapthresh option in cmbuild to force alignment to full model.
    # Work on copies so the caller's dicts are left untouched.
    cmbuild_params = dict(cmbuild_params or {})
    cmbuild_params.update({'--gapthresh': 1.0})
    log_params.append('cmbuild parameters:')
    log_params.append(str(cmbuild_params))

    # Turn on --sub option in Infernal, since we know the unaligned sequences
    # are fragments. Also turn on --gapthresh to use the same gapthresh as
    # was used to build the model.
    cmalign_params = dict(cmalign_params or {})
    cmalign_params.update({'--sub': True, '--gapthresh': 1.0})
    log_params.append('cmalign parameters:')
    log_params.append(str(cmalign_params))

    # Align sequences to alignment including alignment gaps.
    aligned, struct_string = cmalign_from_alignment(
        aln=template_alignment,
        structure_string=struct,
        seqs=int_map,
        moltype=moltype,
        include_aln=True,
        params=cmalign_params,
        cmbuild_params=cmbuild_params)

    # Pull out the candidate sequences from the full alignment and restore
    # their original IDs.
    aligned_dict = aligned.NamedSeqs
    infernal_aligned = {}
    for key in int_map.Names:
        infernal_aligned[int_keys.get(key, key)] = aligned_dict[key]
    infernal_aligned = Alignment(infernal_aligned, MolType=moltype)

    if log_path is not None:
        with open(log_path, 'w') as log_file:
            log_file.write('\n'.join(log_params))

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(infernal_aligned.toFasta())
        return None
    return infernal_aligned
def align_two_alignments(aln1, aln2, params=None):
    """Returns an Alignment object from two existing Alignments.

    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
        used to build them.
    params: dict of parameters to pass in to the Muscle app controller. A
        copy is used internally; the dict passed in is not modified.
    """
    # '-out' and '-profile' are added below, so never touch the caller's dict.
    params = dict(params) if params else {}

    # Create mapping between abbreviated IDs and full IDs for aln1.
    aln1_collection = SequenceCollection(aln1)
    aln1_int_map, aln1_int_keys = aln1_collection.getIntMap(prefix='aln1_')
    aln1_int_map = SequenceCollection(aln1_int_map)

    # Create mapping between abbreviated IDs and full IDs for aln2.
    aln2_collection = SequenceCollection(aln2)
    aln2_int_map, aln2_int_keys = aln2_collection.getIntMap(prefix='aln2_')
    aln2_int_map = SequenceCollection(aln2_int_map)

    # Set output and profile options.
    params.update({'-out': get_tmp_filename(), '-profile': True})

    written = []
    try:
        # Save aln1, then aln2, to tmp files.
        for int_map in (aln1_int_map, aln2_int_map):
            filename = get_tmp_filename()
            with open(filename, 'w') as out:
                # Track the file as soon as it exists so that the finally
                # clause removes it whatever happens next.
                written.append(filename)
                out.write(int_map.toFasta())

        # Create Muscle app and get results.
        app = Muscle(InputHandler='_input_as_multifile', params=params)
        res = app(tuple(written))
        try:
            # Get alignment as dict out of results.
            alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))
        finally:
            res.cleanUp()
    finally:
        for filename in written:
            remove(filename)

    # Make new dict mapping original IDs; a label not found among the aln1
    # keys is looked up among the aln2 keys.
    new_alignment = {}
    for k, v in alignment.items():
        if k in aln1_int_keys:
            new_alignment[aln1_int_keys[k]] = v
        else:
            new_alignment[aln2_int_keys[k]] = v

    # Create an Alignment object from alignment dict.
    return Alignment(new_alignment)
def add_seqs_to_alignment(seqs, aln, moltype, params=None):
    """Aligns new sequences against an existing alignment using Clustalw.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that
        can be used to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: MolType object used to build the collections and the result.
    params: dict of parameters to pass in to the Clustal app controller.

    Returns an Alignment keyed by the original sequence names.
    """
    # New sequences, re-keyed with abbreviated IDs.
    short_seqs, id_lookup = \
        SequenceCollection(seqs, MolType=moltype).getIntMap()
    short_seqs = SequenceCollection(short_seqs, MolType=moltype)

    # Existing alignment, re-keyed with 'seqn_'-prefixed abbreviated IDs.
    aln = Alignment(aln, MolType=moltype)
    short_aln, aln_lookup = aln.getIntMap(prefix='seqn_')
    short_aln = Alignment(short_aln, MolType=moltype)

    # A single lookup table covers both inputs.
    id_lookup.update(aln_lookup)

    # Create Clustalw app; input is supplied through the profile parameters.
    clustal = Clustalw(InputHandler='_input_as_multiline_string',
                       params=params, SuppressStderr=True)
    for flag in ('-align', '-infile'):
        clustal.Parameters[flag].off()
    clustal.Parameters['-sequences'].on()

    # profile1 is the existing alignment, profile2 the new sequences; each
    # is written to its own tmp file.
    profiles = (('-profile1', short_aln), ('-profile2', short_seqs))
    for flag, collection in profiles:
        tmp_path = clustal._tempfile_as_multiline_string(collection.toFasta())
        clustal.Parameters[flag].on(tmp_path)

    result = clustal()
    parsed = dict(ClustalParser(result['Align'].readlines()))

    # Restore the original IDs.
    renamed = dict((id_lookup[label], seq) for label, seq in parsed.items())
    new_alignment = Alignment(renamed, MolType=moltype)

    # Clean up result files and both profile tmp files.
    result.cleanUp()
    for flag, _ in profiles:
        remove(clustal.Parameters[flag].Value)
    return new_alignment
def stockholm_from_alignment(aln, interleave_len=None, GC_annotation=None):
    """Returns a string in Stockholm format.

    - aln: can be an Alignment object or a dict.
    - interleave_len: sequence line width. Only available if sequences are
        aligned. Must be a positive integer when given.
    - GC_annotation: dict containing Per-column annotation {<tag>:<s>},
        added to Stockholm file in the following format: #=GC <tag> <s>
        - <s> is an aligned text line of annotation type <tag>.
        - #=GC lines are associated with a sequence alignment block;
        - <s> is aligned to the residues in the alignment block, and has
            the same length as the rest of the block.
        #=GC lines are placed at the end of each block.

    Returns '' for an empty aln. Raises ValueError if interleave_len is not
    positive, if the sequences differ in length, or if a GC annotation does
    not have the alignment length.
    """
    if not aln:
        return ''

    # A step of zero (or less) would never advance the block loop below.
    if interleave_len is not None and interleave_len <= 0:
        raise ValueError("interleave_len must be a positive integer.")

    # Get seq output order: RowOrder when available, else sorted keys.
    try:
        order = aln.RowOrder
    except AttributeError:
        order = sorted(aln.keys())

    seqs = SequenceCollection(aln)
    stockholm_list = ["# STOCKHOLM 1.0\n"]

    if seqs.isRagged():
        raise ValueError(
            "Sequences in alignment are not all the same length. "
            "Cannot generate Stockholm format.")

    aln_len = seqs.SeqLen
    # Get all labels.
    labels = copy(seqs.Names)
    # Get ordered seqs.
    ordered_seqs = [seqs.NamedSeqs[label] for label in order]

    GC_annotation_list = []
    if GC_annotation is not None:
        GC_annotation_list = [(k, GC_annotation[k])
                              for k in sorted(GC_annotation.keys())]
        for k, v in GC_annotation_list:
            if len(v) != aln_len:
                raise ValueError(
                    "GC annotation %s is not same length as alignment. "
                    "Cannot generate Stockholm format." % (k))
        # The '#=GC <tag>' labels take part in the padding computation.
        labels.extend(['#=GC ' + k for k, _ in GC_annotation_list])

    # Pad every label to the longest label plus four spaces.
    max_spaces = max([len(l) for l in labels]) + 4

    if interleave_len is not None:
        curr_ix = 0
        while curr_ix < aln_len:
            end_ix = curr_ix + interleave_len
            stockholm_list.extend(
                ["%s%s%s" % (x, ' ' * (max_spaces - len(x)), y[curr_ix:end_ix])
                 for x, y in zip(order, ordered_seqs)])
            # The '#=GC ' prefix is 5 characters, hence the '- 5'.
            stockholm_list.extend(
                ["#=GC %s%s%s" % (x, ' ' * (max_spaces - len(x) - 5),
                                  y[curr_ix:end_ix])
                 for x, y in GC_annotation_list])
            stockholm_list.append("")
            curr_ix = end_ix
    else:
        stockholm_list.extend(
            ["%s%s%s" % (x, ' ' * (max_spaces - len(x)), y)
             for x, y in zip(order, ordered_seqs)])
        stockholm_list.extend(
            ["#=GC %s%s%s" % (x, ' ' * (max_spaces - len(x) - 5), y)
             for x, y in GC_annotation_list])
        stockholm_list.append("")

    return '\n'.join(stockholm_list) + '//'
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: cogent.core.moltype.MolType object. DNA and RNA select
        '-type' 'd', PROTEIN selects '-type' 'p'.
    best_tree: if True (default:False), bootstraps the tree: '-bootstrap'
        (1000), '-seed' (random int in 0..1000) and '-bootlabels' ('nodes')
        are turned on unless the caller supplied them in params. If False,
        '-tree' is turned on instead.
    params: dict of parameters to pass in to the Clustal app controller.
        The dict is not modified.

    The result will be an cogent.core.tree.PhyloNode object whose tip names
    are the original sequence names. Raises ValueError for any other
    moltype.
    """
    # Create instance of app controller, enable tree, disable alignment.
    app = Clustalw(InputHandler='_input_as_multiline_string', params=params,
                   WorkingDir='/tmp')
    app.Parameters['-align'].off()

    if moltype == DNA or moltype == RNA:
        seq_type = 'd'
    elif moltype == PROTEIN:
        seq_type = 'p'
    else:
        raise ValueError("moltype must be DNA, RNA, or PROTEIN")
    # Set the type on the app itself: the app has already been constructed,
    # so writing '-type' into the params dict at this point would not be
    # seen by it (and would alter the caller's dict).
    app.Parameters['-type'].on(seq_type)

    user_params = params if params is not None else {}

    # best_tree -> bootstrap; explicit user settings are left alone.
    if best_tree:
        if '-bootstrap' not in user_params:
            app.Parameters['-bootstrap'].on(1000)
        if '-seed' not in user_params:
            app.Parameters['-seed'].on(randint(0, 1000))
        if '-bootlabels' not in user_params:
            app.Parameters['-bootlabels'].on('nodes')
    else:
        app.Parameters['-tree'].on()

    # Setup mapping. Clustalw clips identifiers. We will need to remap them.
    seq_collection = SequenceCollection(aln)
    int_map, int_keys = seq_collection.getIntMap()
    int_map = SequenceCollection(int_map)

    # Collect result and build tree; result files are removed even when
    # parsing fails.
    result = app(int_map.toFasta())
    try:
        tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    finally:
        result.cleanUp()

    # Restore the original names on the tips.
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    return tree
def stockholm_from_alignment(aln, interleave_len=None, GC_annotation=None):
    """Returns a string in Stockholm format.

    - aln: can be an Alignment object or a dict.
    - interleave_len: sequence line width. Only available if sequences are
        aligned. Must be a positive integer when given.
    - GC_annotation: dict containing Per-column annotation {<tag>:<s>},
        added to Stockholm file in the following format: #=GC <tag> <s>
        - <s> is an aligned text line of annotation type <tag>.
        - #=GC lines are associated with a sequence alignment block;
        - <s> is aligned to the residues in the alignment block, and has
            the same length as the rest of the block.
        #=GC lines are placed at the end of each block.

    Returns "" for an empty aln. Raises ValueError if interleave_len is not
    positive, if the sequences differ in length, or if a GC annotation does
    not have the alignment length.
    """
    if not aln:
        return ""

    # With a non-positive width the block loop below would never terminate.
    if interleave_len is not None and interleave_len <= 0:
        raise ValueError("interleave_len must be a positive integer.")

    # Get seq output order: RowOrder when available, else sorted keys.
    try:
        order = aln.RowOrder
    except AttributeError:
        order = sorted(aln.keys())

    seqs = SequenceCollection(aln)
    if seqs.isRagged():
        raise ValueError(
            "Sequences in alignment are not all the same length. "
            "Cannot generate Stockholm format."
        )
    aln_len = seqs.SeqLen

    # Get all labels.
    labels = copy(seqs.Names)
    # Get ordered seqs.
    ordered_seqs = [seqs.NamedSeqs[label] for label in order]

    GC_annotation_list = []
    if GC_annotation is not None:
        GC_annotation_list = [(k, GC_annotation[k]) for k in sorted(GC_annotation.keys())]
        for k, v in GC_annotation_list:
            if len(v) != aln_len:
                raise ValueError(
                    "GC annotation %s is not same length as alignment. "
                    "Cannot generate Stockholm format." % (k)
                )
        # Add GC_annotation to list of labels so it counts towards padding.
        labels.extend(["#=GC " + k for k, _ in GC_annotation_list])

    # Every label is padded to the longest label plus four spaces.
    max_spaces = max([len(l) for l in labels]) + 4

    def seq_line(name, residues):
        return "%s%s%s" % (name, " " * (max_spaces - len(name)), residues)

    def gc_line(tag, annotation):
        # The "#=GC " prefix is 5 characters, hence the "- 5".
        return "#=GC %s%s%s" % (tag, " " * (max_spaces - len(tag) - 5), annotation)

    stockholm_list = ["# STOCKHOLM 1.0\n"]
    if interleave_len is not None:
        curr_ix = 0
        while curr_ix < aln_len:
            end_ix = curr_ix + interleave_len
            stockholm_list.extend([seq_line(x, y[curr_ix:end_ix]) for x, y in zip(order, ordered_seqs)])
            stockholm_list.extend([gc_line(x, y[curr_ix:end_ix]) for x, y in GC_annotation_list])
            stockholm_list.append("")
            curr_ix = end_ix
    else:
        stockholm_list.extend([seq_line(x, y) for x, y in zip(order, ordered_seqs)])
        stockholm_list.extend([gc_line(x, y) for x, y in GC_annotation_list])
        stockholm_list.append("")

    return "\n".join(stockholm_list) + "//"
def add_seqs_to_alignment(seqs, aln, params=None):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that
        can be used to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    params: dict of parameters to pass in to the Muscle app controller. The
        dict is copied, so the caller's object is left untouched.
    """
    # Copy before adding '-out'/'-profile' so the caller's dict is not mutated.
    params = dict(params) if params else {}

    # Create mapping between abbreviated IDs and full IDs for seqs.
    seqs_int_map, seqs_int_keys = \
        SequenceCollection(seqs).getIntMap(prefix='seq_')
    seqs_int_map = SequenceCollection(seqs_int_map)

    # Create mapping between abbreviated IDs and full IDs for aln.
    aln_int_map, aln_int_keys = \
        SequenceCollection(aln).getIntMap(prefix='aln_')
    aln_int_map = SequenceCollection(aln_int_map)

    # Set output and profile options.
    params.update({'-out': get_tmp_filename(), '-profile': True})

    tmp_paths = []
    try:
        # Save seqs, then aln, to tmp files.
        for collection in (seqs_int_map, aln_int_map):
            path = get_tmp_filename()
            with open(path, 'w') as handle:
                # Registered once the file exists so it is always removed.
                tmp_paths.append(path)
                handle.write(collection.toFasta())
        seqs_filename, aln_filename = tmp_paths

        # Create Muscle app and get results; the alignment file goes first.
        app = Muscle(InputHandler='_input_as_multifile', params=params,
                     WorkingDir=tempfile.gettempdir())
        res = app((aln_filename, seqs_filename))
        try:
            # Get alignment as dict out of results.
            alignment = dict(parse_fasta(res['MuscleOut']))
        finally:
            res.cleanUp()
    finally:
        for path in tmp_paths:
            remove(path)

    # Make new dict mapping original IDs; a label not found among the seqs
    # keys is looked up among the aln keys.
    new_alignment = {}
    for k, v in alignment.items():
        if k in seqs_int_keys:
            new_alignment[seqs_int_keys[k]] = v
        else:
            new_alignment[aln_int_keys[k]] = v

    # Create an Alignment object from alignment dict.
    return Alignment(new_alignment)
def toSequenceCollection(self, Bases = False):
    """Returns a SequenceCollection built from the flows' sequences.

    Each flow in self.NamedFlows is converted with toSeq(Bases=Bases),
    following the order of self.Names.
    """
    named_flows = self.NamedFlows
    converted = []
    for name in self.Names:
        converted.append(named_flows[name].toSeq(Bases = Bases))
    return SequenceCollection(converted)
def create_locarnap_alignment(seqs, moltype, struct=False, params=None):
    """Returns mlocarna results given an unaligned SequenceCollection.

    - seqs: A SequenceCollection object or something that behaves like one.
    - moltype: cogent.core.moltype object.
    - struct: Boolean whether or not to also output vienna structure string.
    - params: dict of parameters to pass in to the MLocarna app controller.

    Returns an Alignment keyed by the original sequence names, or the tuple
    (Alignment, structure string) when struct is True.
    """
    # Construct SequenceCollection object.
    seqs = SequenceCollection(seqs, MolType=moltype)
    # Need to make int map; int_keys maps abbreviated IDs back to originals.
    int_map, int_keys = seqs.getIntMap()
    # Construct SequenceCollection object from int map to use functionality.
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Create application.
    app = MLocarna(InputHandler='_input_as_multiline_string', params=params)
    # Get temporary directory to write all mlocarna files.
    mlocarna_dir = get_tmp_filename(suffix='')
    app.Parameters['--tgtdir'].on(mlocarna_dir)
    # Set parameters to run locarna-p.
    for flag in ('--write-structure', '--probabilistic',
                 '--consistency-transformation'):
        app.Parameters[flag].on()

    res = app(int_map.toFasta())
    try:
        # Get the structure from the results if necessary.
        if struct:
            pieces = []
            keep_next = True
            # The context manager closes the file before the cleanup below.
            with open(res['ProbabilisticAlignment'].name, 'U') as structfile:
                for line in structfile:
                    line = line.strip()
                    # Structure lines (--write-structure) start with '.' or '('.
                    if not line or line[0] not in ('.', '('):
                        continue
                    # Only append every other structure line, since the
                    # structure is written both above and below each block.
                    if keep_next:
                        pieces.append(line)
                    keep_next = not keep_next
            structure = ''.join(pieces)

        aligned = dict(ClustalParser(res['ClustalAlignment']))
        # Make new dict mapping original IDs; unknown labels stay as they are.
        new_alignment = {}
        for k, v in aligned.items():
            new_alignment[int_keys.get(k, k)] = v
        # Create an Alignment object from alignment dict.
        new_alignment = Alignment(new_alignment, MolType=moltype)
    finally:
        # Clean up after MlocARNA, also when reading the results failed.
        res.cleanUp()
        shutil.rmtree(mlocarna_dir)

    # Output alignment and structure if asked for, else just the alignment.
    if struct:
        return new_alignment, structure
    return new_alignment
def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
    """Aligns new sequences against an existing alignment using Mafft.

    seqs: a cogent.core.sequence.Sequence object, or data that can be used
        to build one.
    aln: a cogent.core.alignment.Alignment object, or data that can be used
        to build one.
    moltype: MolType object; its upper-cased label is looked up in
        MOLTYPE_MAP to choose the Mafft parameter that is switched on.
    params: dict of parameters to pass in to the Mafft app controller.
    accurate: if True, turns on --globalpair and sets --maxiterate to 1000.

    Returns an Alignment keyed by the original sequence names.
    """
    # New sequences under abbreviated IDs.
    unaligned = SequenceCollection(seqs, MolType=moltype)
    unaligned_map, name_for = unaligned.getIntMap()
    unaligned_map = SequenceCollection(unaligned_map, MolType=moltype)

    # Existing alignment under 'seqn_'-prefixed abbreviated IDs.
    aln = Alignment(aln, MolType=moltype)
    seed_map, seed_names = aln.getIntMap(prefix='seqn_')
    seed_map = Alignment(seed_map, MolType=moltype)

    # Merge both ID tables into one.
    name_for.update(seed_names)

    controller = Mafft(InputHandler='_input_as_multiline_string',
                       params=params, SuppressStderr=True)
    # Turn on correct moltype.
    moltype_flag = MOLTYPE_MAP[moltype.label.upper()]
    controller.Parameters[moltype_flag].on()
    # Do not report progress.
    controller.Parameters['--quiet'].on()
    # Add the existing alignment as seed alignment (written to a tmp file).
    seed_path = controller._tempfile_as_multiline_string(seed_map.toFasta())
    controller.Parameters['--seed'].on(seed_path)
    # More accurate alignment, sacrificing performance.
    if accurate:
        controller.Parameters['--globalpair'].on()
        controller.Parameters['--maxiterate'].Value = 1000

    output = controller(unaligned_map.toFasta())
    records = dict(parse_fasta(output['StdOut']))

    # Output labels may carry a '_seed_' marker (presumably added for seed
    # sequences); it is removed before the original ID is looked up.
    restored = {}
    for label, seq in records.items():
        restored[name_for[label.replace('_seed_', '')]] = seq
    new_alignment = Alignment(restored, MolType=moltype)

    # Clean up result files and the seed tmp file.
    output.cleanUp()
    remove(controller.Parameters['--seed'].Value)
    return new_alignment