def group_to_reference(fulldict, reference, nonref, structscore, norefseq=False):
    """Merge each non-reference structure group into its best reference group.

    For every structure in ``nonref`` the structures in ``reference`` are
    scanned in order. A reference is considered when ``score_structures``
    rates the pair at or below the current structure threshold (initially
    ``structscore``). Among those, the reference whose degapped majority
    consensus aligns to the query consensus with the highest pairwise
    score (which must exceed 0) wins, and the threshold is tightened to
    that reference's structure score.

    Arguments:
        - fulldict: dict mapping structure -> alignment. Modified in
          place: the winning reference's entry is replaced by the merged
          alignment and the merged structure's entry is removed.
        - reference: iterable of structures (keys of fulldict) that can
          absorb other groups.
        - nonref: iterable of structures (keys of fulldict) to place.
        - structscore: initial structure-score threshold.
        - norefseq: if True the alignments carry no "refseq" sequence and
          merged groups are always realigned from scratch.

    Returns the tuple (fulldict, nogroup) where nogroup lists the
    structures of ``nonref`` for which no reference qualified.
    """
    nogroup = []
    score_structures = ScoreStructures()

    def consensus_of(name):
        # degapped majority consensus of the alignment stored under name
        consensus = fulldict[name].majorityConsensus()
        return RnaSequence(''.join(consensus).replace('-', ''))

    def merge_on_refseq(target, source):
        # addFromReferenceAln hands back the merged alignment instead of
        # updating target, so the result has to be kept. Fall back to
        # target for an implementation that works in place (returns None).
        merged = target.addFromReferenceAln(source)
        if merged is None:
            return target
        return merged

    for currstruct in nonref:
        strscore = structscore
        seqscore = 0
        bestref = ""
        seq1 = consensus_of(currstruct)
        for teststruct in reference:
            holdscore = score_structures(currstruct, teststruct)
            if holdscore <= strscore:
                seq2 = consensus_of(teststruct)
                #compare alignment score. Higher is better.
                aln, alnscore = classic_align_pairwise(
                    seq1, seq2, alnscores, -10, -10, False,
                    return_score=True)
                if alnscore > seqscore:
                    strscore = holdscore
                    seqscore = alnscore
                    bestref = teststruct

        if bestref == "":
            nogroup.append(currstruct)
            continue

        if norefseq:
            #realign all sequences since no refseq available
            combinedseqs = fulldict[bestref].degap().addSeqs(
                fulldict[currstruct].degap())
            fulldict[bestref] = align_unaligned_seqs(
                combinedseqs, RNA, params={"-maxiters": 2, "-diags": True})
            # NOTE(review): this continue also skips the
            # fulldict.pop(currstruct) below, so in norefseq mode the
            # merged group stays in fulldict under its own key -- confirm
            # that this is intended.
            continue

        #combine the two alignments into one using the reference sequence
        #as guide. The guide refseq must be ungapped to do this without
        #realigning, hence the checks.
        if not fulldict[currstruct].getGappedSeq("refseq").isGapped():
            fulldict[bestref] = merge_on_refseq(fulldict[bestref],
                                                fulldict[currstruct])
        elif not fulldict[bestref].getGappedSeq("refseq").isGapped():
            fulldict[bestref] = merge_on_refseq(fulldict[currstruct],
                                                fulldict[bestref])
        else:
            #realign all sequences since both refseqs have gaps
            #hacky but it works, need to fix later: dropping "refseq" from
            #one Names list keeps a single copy of it in the combined set
            fulldict[bestref].Names.remove("refseq")
            combinedseqs = fulldict[bestref].degap().addSeqs(
                fulldict[currstruct].degap())
            fulldict[bestref] = align_unaligned_seqs(combinedseqs, RNA)
            #move refseq back to the first position
            fulldict[bestref].Names.remove("refseq")
            fulldict[bestref].Names.insert(0, "refseq")
        fulldict.pop(currstruct)

    score_structures.end()
    return fulldict, nogroup
def group_denovo(fulldict, keys, structscore, norefseq=False):
    """Merge structure groups with each other, without a reference set.

    Each structure in ``keys`` is compared with the structures that come
    after it in ``keys``. A later structure is considered when
    ``score_structures`` rates the pair at or below the current structure
    threshold (initially ``structscore``). Among those, the one whose
    degapped majority consensus aligns to the query consensus with the
    highest pairwise score (which must exceed 0) wins, and the threshold
    is tightened to that structure's score. The query group is then
    merged into the winner.

    Arguments:
        - fulldict: dict mapping structure -> alignment. Modified in
          place: the winner's entry is replaced by the merged alignment
          and the merged structure's entry is removed.
        - keys: list of structures (keys of fulldict). Modified in place:
          merged structures are removed from it.
        - structscore: initial structure-score threshold.
        - norefseq: if True the alignments carry no "refseq" sequence and
          merged groups are always realigned from scratch.

    Returns the tuple (fulldict, keys).
    """
    topop = []
    score_structures = ScoreStructures()

    def consensus_of(name):
        # degapped majority consensus of the alignment stored under name
        consensus = fulldict[name].majorityConsensus()
        return RnaSequence(''.join(consensus).replace('-', ''))

    def merge_on_refseq(target, source):
        # addFromReferenceAln hands back the merged alignment instead of
        # updating target, so the result has to be kept. Fall back to
        # target for an implementation that works in place (returns None).
        merged = target.addFromReferenceAln(source)
        if merged is None:
            return target
        return merged

    for pos, currstruct in enumerate(keys):
        strscore = structscore
        seqscore = 0
        bestref = ""
        seq1 = consensus_of(currstruct)
        # keys is only shrunk after this loop, so a slice sees the same
        # later structures an index walk would
        for teststruct in keys[pos + 1:]:
            holdscore = score_structures(currstruct, teststruct)
            if holdscore <= strscore:
                seq2 = consensus_of(teststruct)
                #compare alignment score. Higher is better.
                aln, alnscore = classic_align_pairwise(
                    seq1, seq2, alnscores, -10, -10, False,
                    return_score=True)
                if alnscore > seqscore:
                    strscore = holdscore
                    seqscore = alnscore
                    bestref = teststruct

        if bestref == "":
            continue

        if norefseq:
            #realign all sequences since no refseq available
            combinedseqs = fulldict[bestref].degap().addSeqs(
                fulldict[currstruct].degap())
            fulldict[bestref] = align_unaligned_seqs(combinedseqs, RNA)
            # NOTE(review): this continue also skips the pop/topop
            # bookkeeping below, so in norefseq mode the merged group
            # stays in both fulldict and keys -- confirm that this is
            # intended.
            continue

        #the guide refseq must be ungapped to merge without realigning
        if not fulldict[currstruct].getGappedSeq("refseq").isGapped():
            fulldict[bestref] = merge_on_refseq(fulldict[bestref],
                                                fulldict[currstruct])
        elif not fulldict[bestref].getGappedSeq("refseq").isGapped():
            fulldict[bestref] = merge_on_refseq(fulldict[currstruct],
                                                fulldict[bestref])
        else:
            #realign all sequences since both refseqs have gaps
            #hacky but it works, need to fix later: dropping "refseq" from
            #one Names list keeps a single copy of it in the combined set
            fulldict[bestref].Names.remove("refseq")
            combinedseqs = fulldict[bestref].degap().addSeqs(
                fulldict[currstruct].degap())
            fulldict[bestref] = align_unaligned_seqs(combinedseqs, RNA)
            #move refseq back to the first position
            fulldict[bestref].Names.remove("refseq")
            fulldict[bestref].Names.insert(0, "refseq")
        fulldict.pop(currstruct)
        topop.append(pos)

    # positions were collected in ascending order; remove from the back so
    # the remaining indices stay valid
    for pos in reversed(topop):
        keys.pop(pos)
    score_structures.end()
    return fulldict, keys
def align_order_seqs(seqs, params, outfolder, num, prefix="group_"):
    """Align seqs and write them as FASTA, ordered by sequence count.

    The output file is "%s%s%i.fna" % (outfolder, prefix, num); outfolder
    is joined by plain string formatting, so it must already end with a
    path separator. Nothing is done if that file already exists.

    Arguments:
        - seqs: unaligned sequences, passed to align_unaligned_seqs.
        - params: aligner parameters, passed to align_unaligned_seqs.
        - outfolder: folder prefix of the output file.
        - num: integer used in the output file name.
        - prefix: file name prefix placed before num.

    Returns None. This is best effort: any error during alignment or
    writing is printed with its traceback and not re-raised.
    """
    outpath = "%s%s%i.fna" % (outfolder, prefix, num)
    if exists(outpath):
        return
    try:
        aln = align_unaligned_seqs(seqs, RNA, params=params)
        # names with the highest count_seqs value come first
        aln.Names.sort(reverse=True, key=count_seqs)
        with open(outpath, 'w') as fout:
            fout.write(aln.toFasta() + "\n")
    except Exception:
        # format_exc() reads the active exception itself; its only
        # positional argument is a frame limit, not the exception
        print("align_order_seqs ERROR: ", format_exc())
def bayesfold(seqsin, temperature=37, params=None):
    '''Aligns sequences and returns the alignment with its BayesFold structure.

    Arguments:
        - seqsin: sequences in LoadSeqs readable format.
        - temperature: folding temperature, handed to BayesFold as a string.
        - params: aligner parameters for align_unaligned_seqs; an empty
          dict is used when None.

    Returns the tuple (aln, struct), where struct is the second
    whitespace-separated token of the BayesFold structures output.
    If anything fails the error is printed and None is returned instead,
    so callers that unpack the result must check for None first.
    '''
    try:
        if params is None:
            params = {}
        aln = align_unaligned_seqs(seqsin, RNA, params=params)
        # a real list, not a lazy map object, on every Python version
        seqstrings = [str(seq) for seq in aln.iterSeqs()]
        bayesinput = BayesInputWrapper(aln.getSeqNames(), seqstrings,
                                       str(temperature))
        bayescalc = BayesCalculation(bayesinput)
        bayescalc.run()
        struct = str(bayescalc.Alignment.Structures).split()[1]
        del bayescalc
        del bayesinput
        return aln, struct
    except Exception as e:
        # same bytes as the two-item print statement: label, space, error
        print("BAYESFOLD ERROR: " + " " + str(e))
        return None
def bayesfold(seqsin):
    '''Runs BayesFold on a set of sequences in MinimalFastaParser format
    [(header, seq), (header, seq)] and returns alignment and structure.

    The sequences are aligned with align_unaligned_seqs and folded at a
    fixed temperature of 37. The structure returned is the first line of
    the structure alignment output that contains "..".

    Returns the tuple (aln, struct).
    Raises RuntimeError if no line of the output contains "..".
    '''
    temperature = 37
    aln = align_unaligned_seqs(seqsin, RNA)
    seqs = [str(item) for item in aln.Seqs]
    sequences = RNAAlignment(sequences=seqs)
    structures = sequences.fold(temperature, 2, 100)
    structalign = str(
        RNAStructureAlignment(sequences, structures, temperature)).split("\n")
    for line in structalign:
        if ".." in line:
            return aln, line
    # without this the function would fail on an unbound local instead
    raise RuntimeError(
        "BAYESFOLD ERROR: no structure line found in BayesFold output")
def bayesfold(seqsin, temperature=37, params=None, align=True):
    '''Returns the alignment of seqsin and its most likely BayesFold structure.

    Arguments:
        - seqsin: sequences in LoadSeqs readable format.
        - temperature: folding temperature, handed to BayesFold as a string.
        - params: aligner parameters for align_unaligned_seqs; an empty
          dict is used when None. Ignored when align is False.
        - align: if True the sequences are aligned first; if False they
          are loaded as an already aligned set.

    Returns the tuple (aln, struct), where struct is the second
    whitespace-separated token of the BayesFold structures output.
    Raises RuntimeError carrying the label "BAYESFOLD ERROR: " and the
    formatted traceback if anything fails; the same text is printed first.
    '''
    try:
        if params is None:
            params = {}
        if align:
            aln = align_unaligned_seqs(seqsin, RNA, params=params)
        else:
            aln = LoadSeqs(data=seqsin, moltype=RNA, aligned=True)
        # a real list, not a lazy map object, on every Python version
        seqstrings = [str(seq) for seq in aln.iterSeqs()]
        bayesinput = BayesInputWrapper(aln.getSeqNames(), seqstrings,
                                       str(temperature))
        bayescalc = BayesCalculation(bayesinput)
        bayescalc.run()
        struct = str(bayescalc.Alignment.Structures).split()[1]
        del bayescalc
        del bayesinput
        return aln, struct
    except Exception:
        # format_exc() reads the active exception itself; its only
        # positional argument is a frame limit, not the exception
        trace = format_exc()
        # same bytes as the two-item print statement: label, space, trace
        print("BAYESFOLD ERROR: " + " " + trace)
        raise RuntimeError("BAYESFOLD ERROR: ", trace)
def test_align_unaligned_seqs(self):
    """align_unaligned_seqs output for seqs1 should match align1 as FASTA"""
    alignment = align_unaligned_seqs(self.seqs1, RNA)
    observed = alignment.toFasta()
    self.assertEqual(observed, align1)
#load in all headers for sequences in the clade and match them to their sequence
outprefix = folderout + "/" + basenames
rawseqs = []
tips = []
with open(argv[1], 'rU') as listin:
    with open(outprefix + "-seqs.fasta", 'w') as fileout:
        #first line of the list file is not a sequence entry
        listin.readline()
        for line in listin:
            if not line.strip():
                #tolerate blank lines, e.g. at the end of the file
                continue
            header = line.split()[0]
            fileout.write(''.join([">", header, "\n", seqs[header], "\n"]))
            rawseqs.append((header, seqs[header]))
            tips.append(header)

print("Aligning seqs using muscle with -diags")
#seqs is rebound here from the header lookup to the loaded sequences
seqs = LoadSeqs(data=rawseqs, moltype=RNA, aligned=False)
aln = align_unaligned_seqs(seqs, RNA, {"-diags": True})
with open(outprefix + "-seqsaligned.fasta", 'w') as fileout:
    fileout.write(str(aln))

print("Folding sequences")
#get subtree of the clade being folded to pass to PPfold
tr = LoadTree(argv[3])
sub_tree = tr.getSubTree(tips, keep_root=True)
#both subtree files are closed (and so flushed) before PPfold is set up
with open(outprefix + "-subtreeDistances.nwk", 'w') as filesubtree:
    filesubtree.write(sub_tree.getNewick(with_distances=True))
with open(outprefix + "-subtree.nwk", 'w') as filesubtree:
    filesubtree.write(sub_tree.getNewick(with_distances=False))

#call PPfold with aligned sequences and subtree
args = ["java", "-jar", PPFOLDDIR + "PPfold.jar",
        outprefix + "-seqsaligned.fasta", "--outputd", folderout]
else: print "Clusters previously folded" clusters.clear() del clusters if not exists(otufolder + "clusters_aln.fasta"): startaln = time() if not args.nr: #add refseq and align all sequences for later seq/struct comparisons refin = open(args.f + "refseq.fasta") crap, refseq = MinimalFastaParser(refin).next() refin.close() for struct in structgroups: if not args.nr: structgroups[struct].append(("refseq", refseq)) structgroups[struct] = align_unaligned_seqs(structgroups[struct], RNA) if not args.nr: #hacky way to make sure refseq first, will need to fix later structgroups[struct].Names.remove("refseq") structgroups[struct].Names.insert(0, "refseq") #write that shit to a file to save time if rerun needed cout = open(otufolder + "clusters_aln.fasta", 'w') for struct in structgroups: cout.write(">%s\n%s\n" % ("newcluster", struct)) cout.write(structgroups[struct].toFasta() + "\n") cout.close() print "Aligned all clusters: " + str((time() - startaln)/60) + " min" else: cin = open(otufolder + "clusters_aln.fasta") currclust = [] currstruct = ""
def test_align_unaligned_seqs(self):
    """align_unaligned_seqs output for seqs1 should match align1 as FASTA"""
    alignment = align_unaligned_seqs(self.seqs1, RNA)
    observed = alignment.toFasta()
    self.assertEqual(observed, align1)
print c[0] + "\t" + str(c[1]) print str(len(clusters)) + " clusters" #print that shit to file clustfiles = [] for cnum,cinfo in enumerate(countarray): if not exists(''.join([argv[2],"cluster", str(cnum), "_", str(cinfo[1]), ".txt"])): clusters[cinfo[0]].sort(reverse=True,key=lambda count: int(count[0].split('_')[1])) cout = open(''.join([argv[2],"cluster", str(cnum), "_", str(cinfo[1]), ".txt"]), 'w') for seq in clusters[cinfo[0]]: cout.write(">%s\n%s\n" % seq) cout.close() clustfiles.append(''.join([argv[2],"cluster", str(cnum), "_", str(cinfo[1]), ".txt"])) for cfile in clustfiles: print cfile seqs = LoadSeqs(cfile, moltype=RNA, format='fasta') aln = align_unaligned_seqs(seqs,RNA) cout = open(cfile[:cfile.rfind(".")] + "_aln.txt", 'w') cout.write(aln.toFasta()) cout.close() fin = open(cfile[:cfile.rfind(".")] + "_aln.txt") seqs = read_seq_data(fin) fin.close() data = LogoData.from_seqs(seqs) options = LogoOptions() options.title = "cluster logo" format = LogoFormat(data, options) fout = open(cfile[:cfile.rfind(".")] + ".eps", 'w') eps_formatter(data, format, fout) fout.close()
def phyl_tree():
    """Print a neighbour-joining tree for the module-level distances d.

    Reads the global d at call time and prints to whatever sys.stdout
    currently is: a blank separator first, then the ASCII drawing.
    """
    nj_tree = nj.nj(d.getPairwiseDistances())
    print("\n\n")
    print(nj_tree.asciiArt())


# NOTE(review): each section below rebinds sys.stdout to a new file and
# never restores it, so everything printed afterwards (anything the later
# run() calls may print included) lands in the most recently opened file
# -- confirm this is intended.

# cytochrome c: protein distances (JTT92) and tree
al = LoadSeqs("cytc.fasta", moltype=PROTEIN, interleaved=False)
d = distance.EstimateDistances(al, submodel=JTT92())
d.run()
sys.stdout = open("cytc distances.txt", "w")
print(d)
phyl_tree()

# mitochondrial DNA: nucleotide distances (JC69) and tree
al = LoadSeqs("mtdna.fasta", moltype=DNA, interleaved=True, aligned=False)
d = distance.EstimateDistances(al, submodel=JC69())
d.run()
sys.stdout = open("mtdna distances.txt", "w")
print(d)
phyl_tree()

# cytochrome b: sequences are unaligned, so align them before estimating
seqs = LoadSeqs("cytb.fasta", moltype=PROTEIN, aligned=False)
al = align_unaligned_seqs(seqs, PROTEIN)
dcalc = distance.EstimateDistances(al, submodel=JTT92())
dcalc.run(show_progress=True)
# here d holds the pairwise distances themselves, not the calculator,
# which is why the tree is built inline instead of through phyl_tree()
d = dcalc.getPairwiseDistances()
tree = nj.nj(d)
sys.stdout = open("cytb distances.txt", "w")
print(dcalc)
print('\n\n')
print(tree.asciiArt())
#phyl_tree()