Ejemplo n.º 1
0
def group_to_reference(fulldict, reference, nonref, structscore, norefseq=False):
    """Fold each non-reference structure group into its best reference group.

    For every structure in ``nonref``, each structure in ``reference`` is
    scored with ScoreStructures. A reference is a candidate only when its
    structure score is <= the current threshold (initially ``structscore``).
    Among candidates, the one whose degapped majority-consensus sequence gives
    the highest classic_align_pairwise score (which must be > 0) wins.
    Accepting a candidate tightens the threshold to that candidate's
    structure score.

    When a winner is found the two alignments are merged under the winner's
    key and the non-reference key is removed from ``fulldict``; otherwise the
    structure is reported in the returned ``nogroup`` list.

    Args:
        fulldict: dict mapping structure -> alignment; modified in place.
        reference: iterable of structure keys used as merge targets.
        nonref: iterable of structure keys to try to merge.
        structscore: starting upper bound for an acceptable structure score.
        norefseq: True when the alignments hold no "refseq" entry, in which
            case merged groups are always realigned from scratch.

    Returns:
        Tuple ``(fulldict, nogroup)`` where ``nogroup`` lists the ``nonref``
        keys that could not be merged into any reference.
    """
    nogroup = []
    score_structures = ScoreStructures()
    try:
        for currstruct in nonref:
            strscore = structscore
            seqscore = 0
            bestref = ""
            #remove gaps from majority and set as seq1
            seq = fulldict[currstruct].majorityConsensus()
            seq1 = RnaSequence(''.join(seq).replace('-', ''))
            for teststruct in reference:
                holdscore = score_structures(currstruct, teststruct)
                if holdscore > strscore:
                    continue
                #remove gaps from majority and set as seq2
                seq = fulldict[teststruct].majorityConsensus()
                seq2 = RnaSequence(''.join(seq).replace('-', ''))
                #compare alignment score. Higher is better.
                _, alnscore = classic_align_pairwise(seq1, seq2, alnscores, -10, -10, False, return_score=True)
                if alnscore > seqscore:
                    strscore = holdscore
                    seqscore = alnscore
                    bestref = teststruct
            if bestref == "":
                nogroup.append(currstruct)
                continue
            #combine the two alignments into one alignment using reference sequence as guide
            #refseq must be ungapped to do this without realigning, hence the checks
            if norefseq:
                #realign all sequences since no refseq available
                combinedseqs = fulldict[bestref].degap().addSeqs(fulldict[currstruct].degap())
                fulldict[bestref] = align_unaligned_seqs(combinedseqs, RNA, params={"-maxiters": 2, "-diags": True})
            #if one refseq is gapless, can easily combine them without realigning
            elif not fulldict[currstruct].getGappedSeq("refseq").isGapped():
                fulldict[bestref].addFromReferenceAln(fulldict[currstruct])
            elif not fulldict[bestref].getGappedSeq("refseq").isGapped():
                fulldict[currstruct].addFromReferenceAln(fulldict[bestref])
                fulldict[bestref] = fulldict[currstruct]
            else:
                #realign all sequences since both refseqs have gaps
                #HACK: drop bestref's refseq so only currstruct's copy is
                #carried into the combined set, then move it back to the front
                fulldict[bestref].Names.remove("refseq")
                combinedseqs = fulldict[bestref].degap().addSeqs(fulldict[currstruct].degap())
                fulldict[bestref] = align_unaligned_seqs(combinedseqs, RNA)
                fulldict[bestref].Names.remove("refseq")
                fulldict[bestref].Names.insert(0, "refseq")
            #currstruct's sequences now live under bestref in every branch
            #(including norefseq), so the stale entry must always be dropped
            fulldict.pop(currstruct)
    finally:
        #always shut the scorer down, even if scoring or aligning raised
        score_structures.end()
    return fulldict, nogroup
Ejemplo n.º 2
0
def group_denovo(fulldict, keys, structscore, norefseq=False):
    """Merge structure groups with each other, without a fixed reference set.

    Each structure in ``keys`` is compared only against the structures that
    come after it in the list. A later structure is a candidate when its
    ScoreStructures score is <= the current threshold (initially
    ``structscore``). Among candidates, the one whose degapped
    majority-consensus sequence gives the highest classic_align_pairwise
    score (which must be > 0) wins, and accepting it tightens the threshold
    to its structure score.

    A structure that finds a winner is merged into the winner's alignment,
    removed from ``fulldict`` and removed from ``keys``.

    Args:
        fulldict: dict mapping structure -> alignment; modified in place.
        keys: list of structure keys to group; modified in place.
        structscore: starting upper bound for an acceptable structure score.
        norefseq: True when the alignments hold no "refseq" entry, in which
            case merged groups are always realigned from scratch.

    Returns:
        Tuple ``(fulldict, keys)`` after merging.
    """
    topop = []
    score_structures = ScoreStructures()
    try:
        for pos, currstruct in enumerate(keys):
            strscore = structscore
            seqscore = 0
            bestref = ""
            #remove gaps from majority and set as seq1
            seq = fulldict[currstruct].majorityConsensus()
            seq1 = RnaSequence(''.join(seq).replace('-', ''))
            for secpos in range(pos + 1, len(keys)):
                candidate = keys[secpos]
                holdscore = score_structures(currstruct, candidate)
                if holdscore > strscore:
                    continue
                #remove gaps from majority and set as seq2
                seq = fulldict[candidate].majorityConsensus()
                seq2 = RnaSequence(''.join(seq).replace('-', ''))
                #compare alignment score. Higher is better.
                _, alnscore = classic_align_pairwise(seq1, seq2, alnscores, -10, -10, False, return_score=True)
                if alnscore > seqscore:
                    strscore = holdscore
                    seqscore = alnscore
                    bestref = candidate
            if bestref == "":
                continue
            if norefseq:
                #realign all sequences since no refseq available
                combinedseqs = fulldict[bestref].degap().addSeqs(fulldict[currstruct].degap())
                fulldict[bestref] = align_unaligned_seqs(combinedseqs, RNA)
            #if one refseq is gapless, can combine them without realigning
            elif not fulldict[currstruct].getGappedSeq("refseq").isGapped():
                fulldict[bestref].addFromReferenceAln(fulldict[currstruct])
            elif not fulldict[bestref].getGappedSeq("refseq").isGapped():
                fulldict[currstruct].addFromReferenceAln(fulldict[bestref])
                fulldict[bestref] = fulldict[currstruct]
            else:
                #realign all sequences since both refseqs have gaps
                #HACK: drop bestref's refseq so only currstruct's copy is
                #carried into the combined set, then move it back to the front
                fulldict[bestref].Names.remove("refseq")
                combinedseqs = fulldict[bestref].degap().addSeqs(fulldict[currstruct].degap())
                fulldict[bestref] = align_unaligned_seqs(combinedseqs, RNA)
                fulldict[bestref].Names.remove("refseq")
                fulldict[bestref].Names.insert(0, "refseq")
            #currstruct's sequences now live under bestref in every branch
            #(including norefseq), so retire it from both the dict and keys
            fulldict.pop(currstruct)
            topop.append(pos)
    finally:
        #always shut the scorer down, even if scoring or aligning raised
        score_structures.end()
    #positions were collected in ascending order; pop from the back so the
    #remaining indices stay valid
    for pos in reversed(topop):
        keys.pop(pos)
    return fulldict, keys
Ejemplo n.º 3
0
def align_order_seqs(seqs, params, outfolder, num, prefix="group_"):
    """Align ``seqs`` and write the alignment to a FASTA file.

    The output path is ``outfolder + prefix + str(num) + ".fna"``; the parts
    are concatenated as-is, so ``outfolder`` must carry its own trailing
    separator. If that file already exists the function returns immediately
    without aligning.

    Before writing, the alignment's names are sorted in descending order of
    ``count_seqs(name)``.

    Args:
        seqs: sequences accepted by align_unaligned_seqs.
        params: parameter dict forwarded to align_unaligned_seqs.
        outfolder: output directory prefix (including trailing separator).
        num: integer group number used in the file name.
        prefix: file-name prefix placed before ``num``.

    Returns:
        None. Any exception raised while aligning or writing is printed with
        its traceback and swallowed (best-effort behaviour).
    """
    outpath = "%s%s%i.fna" % (outfolder, prefix, num)
    if exists(outpath):
        return
    try:
        aln = align_unaligned_seqs(seqs, RNA, params=params)
        aln.Names.sort(reverse=True, key=count_seqs)
        with open(outpath, 'w') as fout:
            fout.write(aln.toFasta() + "\n")
    except Exception:
        # format_exc() reads the active exception itself; its only positional
        # argument is a frame limit, so passing the exception object breaks it
        print("align_order_seqs ERROR: ", format_exc())
Ejemplo n.º 4
0
def bayesfold(seqsin, temperature=37, params=None):
    '''Takes in sequences in LoadSeqs readable format and returns
    most likely structure from bayesfold.

    seqsin is aligned with align_unaligned_seqs (using params, or an empty
    dict when params is None), the aligned sequences are handed to
    BayesCalculation, and the second whitespace-separated token of the
    resulting Structures string is taken as the structure.

    Returns (aln, struct) on success. If anything raises, the error is
    printed and None is returned, so callers that unpack the result must
    check for None first.'''
    try:
        if params is None:
            params = {}
        aln = align_unaligned_seqs(seqsin, RNA, params=params)
        # build a real list so the wrapper gets the same type on Python 2 and 3
        seqstrings = [str(seq) for seq in aln.iterSeqs()]
        bayesinput = BayesInputWrapper(aln.getSeqNames(),
            seqstrings, str(temperature))
        bayescalc = BayesCalculation(bayesinput)
        bayescalc.run()
        struct = str(bayescalc.Alignment.Structures).split()[1]
        # drop the calculation objects before returning
        del bayescalc
        del bayesinput
        return aln, struct
    except Exception as e:
        # same text the Python 2 print statement produced, valid on 2 and 3
        print("BAYESFOLD ERROR:  %s" % (e,))
def bayesfold(seqsin, temperature=37):
    '''Runs BayesFold on a set of sequences in MinimalFastaParser format
    [(header, seq), (header, seq)] and returns alignment and structure.

    The sequences are aligned with align_unaligned_seqs, folded via
    RNAAlignment.fold at the given temperature, and the first line of the
    RNAStructureAlignment text that contains ".." is returned as the
    structure.

    temperature defaults to 37, the value that used to be hard-coded.

    Returns (aln, struct). Raises RuntimeError if no line of the structure
    alignment contains "..".'''
    aln = align_unaligned_seqs(seqsin, RNA)
    seqs = [str(item) for item in aln.Seqs]
    sequences = RNAAlignment(sequences=seqs)
    # NOTE(review): the meaning of the 2 and 100 arguments is not visible
    # here - confirm against RNAAlignment.fold
    structures = sequences.fold(temperature, 2, 100)
    structalign = str(RNAStructureAlignment(sequences, structures, temperature)).split("\n")
    for line in structalign:
        if ".." in line:
            return aln, line
    # previously this fell through to an UnboundLocalError on 'struct'
    raise RuntimeError("BAYESFOLD ERROR: no structure line found in output")
Ejemplo n.º 6
0
def bayesfold(seqsin, temperature=37, params=None, align=True):
    '''Takes in sequences in LoadSeqs readable format and returns
    most likely structure from bayesfold.

    When align is True, seqsin is aligned with align_unaligned_seqs (using
    params, or an empty dict when params is None). When align is False,
    seqsin is loaded as an already-aligned RNA alignment via LoadSeqs.
    The aligned sequences are handed to BayesCalculation, and the second
    whitespace-separated token of the resulting Structures string is taken
    as the structure.

    Returns (aln, struct). On any failure the traceback is printed and a
    RuntimeError carrying ("BAYESFOLD ERROR: ", traceback_text) is raised.'''
    try:
        if params is None:
            params = {}
        if align:
            aln = align_unaligned_seqs(seqsin, RNA, params=params)
        else:
            aln = LoadSeqs(data=seqsin, moltype=RNA, aligned=True)
        # build a real list so the wrapper gets the same type on Python 2 and 3
        seqstrings = [str(seq) for seq in aln.iterSeqs()]
        bayesinput = BayesInputWrapper(aln.getSeqNames(),
                                       seqstrings,
                                       str(temperature))
        bayescalc = BayesCalculation(bayesinput)
        bayescalc.run()
        struct = str(bayescalc.Alignment.Structures).split()[1]
        # drop the calculation objects before returning
        del bayescalc
        del bayesinput
        return aln, struct
    except Exception:
        # format_exc() reads the active exception itself; its only positional
        # argument is a frame limit, so passing the exception object breaks it
        details = format_exc()
        print("BAYESFOLD ERROR:  " + details)
        raise RuntimeError("BAYESFOLD ERROR: ", details)
Ejemplo n.º 7
0
 def test_align_unaligned_seqs(self):
     """Aligning self.seqs1 as RNA should produce the align1 FASTA text"""
     aligned = align_unaligned_seqs(self.seqs1, RNA)
     observed = aligned.toFasta()
     self.assertEqual(observed, align1)
Ejemplo n.º 8
0
    #load in all headers for sequences in the clade and match them to their sequence
    # argv[1] is a listing whose first line is skipped and whose remaining
    # lines start with a sequence header; 'seqs' is indexed by that header
    # here and is defined outside this excerpt.
    # NOTE(review): listin is not closed anywhere in this excerpt.
    listin = open(argv[1], 'rU')
    fileout = open(folderout + "/" + basenames + "-seqs.fasta", 'w')
    listin.readline()
    rawseqs = []
    tips = []
    for line in listin:
        header = line.split()[0]
        fileout.write(''.join([">", header, "\n", seqs[header], "\n"]))
        rawseqs.append((header, seqs[header]))
        tips.append(header)
    fileout.close()

    print "Aligning seqs using muscle with -diags"
    # from here on 'seqs' is rebound to the loaded (unaligned) collection
    seqs = LoadSeqs(data=rawseqs, moltype=RNA, aligned=False)
    aln = align_unaligned_seqs(seqs, RNA, {"-diags": True})
    fileout = open(folderout + "/" + basenames + "-seqsaligned.fasta", 'w')
    fileout.write(str(aln))
    fileout.close()

    print "Folding sequences"
    #get subtree of the clade being folded to pass to PPfold
    tr = LoadTree(argv[3])
    sub_tree = tr.getSubTree(tips, keep_root=True)
    # the subtree is written twice: once with branch lengths, once without
    filesubtree = open(folderout + "/" + basenames + "-subtreeDistances.nwk", 'w')
    filesubtree.write(sub_tree.getNewick(with_distances=True))
    filesubtree.close()
    filesubtree = open(folderout + "/" + basenames + "-subtree.nwk", 'w')
    filesubtree.write(sub_tree.getNewick(with_distances=False))
    # NOTE(review): this second filesubtree handle is not closed within this
    # excerpt - confirm it is closed before PPfold runs
    #call PPfold with aligned sequences and subtree
    args = ["java", "-jar", PPFOLDDIR + "PPfold.jar", folderout + "/" + basenames + "-seqsaligned.fasta", "--outputd", folderout]
    # NOTE(review): the 'if' this 'else' belongs to is outside this excerpt
    else:
        print "Clusters previously folded"

    # 'clusters' is no longer needed from here on; empty it and drop the name
    clusters.clear()
    del clusters
    # align every structure group once and cache the result on disk; the
    # cached file is reopened instead when it already exists
    if not exists(otufolder + "clusters_aln.fasta"):
        startaln = time()
        if not args.nr:
            #add refseq and align all sequences for later seq/struct comparisons
            refin = open(args.f + "refseq.fasta")
            # only the first record of the refseq file is used; its header is discarded
            crap, refseq = MinimalFastaParser(refin).next()
            refin.close()
        for struct in structgroups:
            if not args.nr:
                structgroups[struct].append(("refseq", refseq))
            # the list of (header, seq) pairs is replaced by its alignment
            structgroups[struct] = align_unaligned_seqs(structgroups[struct], RNA)
            if not args.nr:
                #hacky way to make sure refseq first, will need to fix later
                structgroups[struct].Names.remove("refseq")
                structgroups[struct].Names.insert(0, "refseq")
        #write the alignments to a file to save time if a rerun is needed
        cout = open(otufolder + "clusters_aln.fasta", 'w')
        for struct in structgroups:
            # each cluster starts with a ">newcluster" record whose body is
            # the structure string, followed by the cluster's FASTA alignment
            cout.write(">%s\n%s\n" % ("newcluster", struct))
            cout.write(structgroups[struct].toFasta() + "\n")
        cout.close()
        print "Aligned all clusters: " + str((time() - startaln)/60) + " min"
    else:
        # cached alignments exist; the code that parses them continues past
        # the end of this excerpt
        cin = open(otufolder + "clusters_aln.fasta")
        currclust = []
        currstruct = ""
Ejemplo n.º 10
0
 def test_align_unaligned_seqs(self):
     """Aligning self.seqs1 as RNA should produce the align1 FASTA text"""
     aligned = align_unaligned_seqs(self.seqs1, RNA)
     observed = aligned.toFasta()
     self.assertEqual(observed, align1)
Ejemplo n.º 11
0
        # NOTE(review): the loop that binds 'c' starts outside this excerpt
        print c[0] + "\t" + str(c[1])

    print str(len(clusters)) + " clusters"

    #write each cluster to its own file
    # file name: <argv[2]>cluster<index>_<cinfo[1]>.txt; existing files are
    # not rewritten but are still queued for alignment below
    clustfiles = []
    for cnum,cinfo in enumerate(countarray):
        if not exists(''.join([argv[2],"cluster", str(cnum), "_", str(cinfo[1]), ".txt"])):
            # entries are (header, seq) pairs; order them descending by the
            # integer that follows the first '_' in the header
            clusters[cinfo[0]].sort(reverse=True,key=lambda count: int(count[0].split('_')[1]))
            cout = open(''.join([argv[2],"cluster", str(cnum), "_", str(cinfo[1]), ".txt"]), 'w')
            for seq in clusters[cinfo[0]]:
                cout.write(">%s\n%s\n" % seq)
            cout.close()
        clustfiles.append(''.join([argv[2],"cluster", str(cnum), "_", str(cinfo[1]), ".txt"]))
    # for every cluster file: align it, save the alignment next to it as
    # <name>_aln.txt, then render a sequence logo to <name>.eps
    for cfile in clustfiles:
        print cfile
        seqs = LoadSeqs(cfile, moltype=RNA, format='fasta')
        aln = align_unaligned_seqs(seqs,RNA)
        cout = open(cfile[:cfile.rfind(".")] + "_aln.txt", 'w')
        cout.write(aln.toFasta())
        cout.close()
        # the alignment is read back from disk through read_seq_data, which
        # rebinds 'seqs' to the logo library's sequence data
        fin = open(cfile[:cfile.rfind(".")] + "_aln.txt")
        seqs = read_seq_data(fin) 
        fin.close()
        data = LogoData.from_seqs(seqs)
        options = LogoOptions()
        options.title = "cluster logo"
        # NOTE(review): 'format' shadows the builtin of the same name
        format = LogoFormat(data, options)
        fout = open(cfile[:cfile.rfind(".")] + ".eps", 'w')
        eps_formatter(data, format, fout)
        fout.close()
def phyl_tree():
    """Print a neighbour-joining tree built from the module-level ``d``.

    Reads the global ``d``, builds a tree with nj.nj from
    ``d.getPairwiseDistances()``, then prints a blank separator followed by
    the tree's ASCII rendering. Output goes to whatever ``sys.stdout``
    currently points at.
    """
    mytree = nj.nj(d.getPairwiseDistances())
    # single-argument print(...) emits the same text on Python 2 and 3
    print("\n\n")
    print(mytree.asciiArt())

# Three runs follow, one per input file. Each estimates pairwise distances,
# rebinds sys.stdout to a report file, and prints the distances followed by a
# neighbour-joining tree. Statement order matters: everything printed after a
# sys.stdout rebinding lands in that file until the next rebinding.
# NOTE(review): the report files are never closed explicitly and sys.stdout
# is never restored.

# Run 1: cytc.fasta, protein, JTT92 substitution model
al = LoadSeqs("cytc.fasta", moltype=PROTEIN, interleaved=False)
d = distance.EstimateDistances(al, submodel = JTT92())
d.run()
sys.stdout = open("cytc distances.txt", "w")
print d
# phyl_tree() reads the global 'd' set above
phyl_tree()

# Run 2: mtdna.fasta, DNA, JC69 substitution model
al = LoadSeqs("mtdna.fasta", moltype=DNA, interleaved=True, aligned=False)
d = distance.EstimateDistances(al, submodel = JC69())
d.run()
sys.stdout = open("mtdna distances.txt", "w")
print d
phyl_tree()

# Run 3: cytb.fasta, protein; loaded unaligned, so it is aligned first
seqs = LoadSeqs("cytb.fasta", moltype=PROTEIN, aligned=False)
al = align_unaligned_seqs(seqs,PROTEIN)
dcalc = distance.EstimateDistances(al, submodel = JTT92())
# NOTE(review): sys.stdout still points at "mtdna distances.txt" here, so any
# progress output written to stdout would end up in that file - confirm
dcalc.run(show_progress = True)
# here 'd' holds the pairwise distances rather than the calculator, so
# phyl_tree() (which calls d.getPairwiseDistances()) cannot be reused and
# the tree is built inline
d = dcalc.getPairwiseDistances()
tree=nj.nj(d)
sys.stdout = open("cytb distances.txt", "w")
print dcalc
print '\n\n'
print tree.asciiArt()
#phyl_tree()