def memefiles2tamo(files, tamoname):
    """Convert AlignAce (.ace) / MEME (.meme) output files into one TAMO file.

    Loads each file's motifs, fills in missing enrichment statistics using a
    ProbeSet, and saves all motifs to `tamoname`.

    Globals:
        probefile/PROBESET -- probe set used for statistics; a 'YEAST'
            ProbeSet is created when no probefile was supplied.
        fsafile -- optional explicit FASTA path overriding Fasta.find().
    """
    global probefile, PROBESET, fsafile
    motifs = []
    for filename in files:
        print ">>>SDFSD>F ",filename
        if re.search('\.ace$',filename):
            mdobject = AlignAce.AlignAce(filename)
            # Derive a FASTA name from the .ace name if none was recorded.
            if not mdobject.fastafile: mdobject.fastafile=filename.replace('.ace','.fsa')
        elif re.search('\.meme.*$',filename):
            mdobject = Meme.Meme(filename)
            # Strip any single-char infix (e.g. ".1.meme") before mapping to .fsa.
            if not mdobject.fastafile: mdobject.fastafile=re.sub('\..\.meme','.meme',filename).replace('.meme','.fsa')
        motifs.extend(mdobject.motifs)
    # NOTE(review): everything below uses the LAST loop iteration's mdobject
    # and filename -- this assumes all input files share one FASTA; confirm.
    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    # Register the loaded sequences with the probe set.
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq
    for motif in motifs:
        # Compute each statistic only if it still holds its "unset" sentinel
        # (1 for pvalue/church, None for the others).
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v')
        if motif.frac == None: motif.frac = PROBESET.frac(motif,probes,'v',0.7)
        if re.search('\.meme$',filename):
            # MEME supplies an E-value; store it as a -log10 MAP-like score.
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        if 0 and (motif.CRA == None):
            # Dead code (guarded by "0 and"): consensus ROC-AUC computation.
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass
    # Sort the last file's motif list (MEME by p-value, otherwise by church
    # score); note the SAVED list is `motifs`, which is not re-sorted here.
    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))
    MotifTools.save_motifs(motifs,tamoname)
def tamo2tamo(file, outname): global probefile, PROBESET, fsafile motifs = MotifTools.load(file) if fsafile: fsaname = fsafile else: fsaname = find_fsa(file) print '# FSA ', fsaname fsaD = MotifMetrics.fasta2seqs(fsaname, 'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) #for key,seq in fsaD.items(): # PROBESET.probes[key] = seq print "# %d motifs" % len(motifs) for motif in motifs: #motif.pvalue, motif.church = 1,1 #Comment this! if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif, probes, 'v') if motif.church == 1: motif.church = PROBESET.church(motif, probes, 'v') #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc == None: motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v') #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') if motif.frac == None: motif.frac = PROBESET.frac(motif, probes, 'v', 0.7) if motif.numbound == 0: matching = PROBESET.matching_ids(motif, [], factor=0.7) matchbound = [x for x in matching if x in probes] motif.numbound = len(probes) motif.nummotif = len(matching) motif.numboundmotif = len(matchbound) if 0 and motif.CRA == None: try: pass CRA, Cfrac = PROBESET.cons_ROC_AUC(motif, probes, 'v', tuple='YES') motif.CRA = CRA motif.Cfrac = Cfrac except: pass MotifTools.save_motifs(motifs, outname)
def tamo2tamo(file, outname):
    """Load a TAMO motif file, fill in missing statistics, and re-save.

    NOTE(review): this is a second, near-identical definition of
    tamo2tamo; being defined later in the module, it shadows the earlier
    one. The two should probably be deduplicated.
    """
    global probefile, PROBESET, fsafile
    motifs = MotifTools.load(file)
    # Explicit FASTA file wins; otherwise derive one from the input name.
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = find_fsa(file)
    print '# FSA ',fsaname
    fsaD = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq
    print "# %d motifs"%len(motifs)
    for motif in motifs:
        #motif.pvalue, motif.church = 1,1   #Comment this!
        # Compute each statistic only if it still holds its "unset" sentinel.
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v')
        if motif.frac == None: motif.frac = PROBESET.frac(motif,probes,'v',0.7)
        if motif.numbound == 0:
            # Count probes matching the motif and how many are bound probes.
            matching = PROBESET.matching_ids(motif,[],factor=0.7)
            matchbound = [x for x in matching if x in probes]
            motif.numbound = len(probes)
            motif.nummotif = len(matching)
            motif.numboundmotif = len(matchbound)
        if 0 and motif.CRA == None:
            # Dead code (guarded by "0 and"): consensus ROC-AUC computation.
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass
    MotifTools.save_motifs(motifs,outname)
def ace2tamo(filename, tamoname): global probefile, PROBESET if re.search('\.ace$',filename): mdobject = AlignAce.AlignAce(filename) elif re.search('\.meme$',filename): mdobject = Meme.Meme(filename) fsaname = find_fsa(mdobject.fastafile) fsaD = MotifMetrics.fasta2seqs(fsaname,'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('HUMAN_250') #PROBESET= pick_genome(fsaname) for key,seq in fsaD.items(): PROBESET.probes[key] = seq for motif in mdobject.motifs: if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v') if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v') if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v') if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') if re.search('\.meme$',filename): motif.MAP = -math.log(motif.evalue)/math.log(10) sys.stdout.flush() i = 0 for motif in mdobject.motifs: motif.seednum = i ; i=i+1 kmers = motif.bogus_kmers(100) motif.maxscore = -100 scores = [motif.scan(kmer)[2][0] for kmer in kmers] print Arith.avestd(scores) if re.search('\.meme$',filename): mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue)) else: mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church)) MotifTools.save_motifs(mdobject.motifs,tamoname)
def motifs2tamo(motifs, outname): global probefile, PROBESET fsaname = find_fsa(outname) fsaD = MotifMetrics.fasta2seqs(fsaname,'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) #for key,seq in fsaD.items(): # PROBESET.probes[key] = seq print "# %d motifs"%len(motifs) for motif in motifs: if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v') if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v') if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v') if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') MotifTools.save_motifs(motifs,outname)
# Convert a MEME text output file into a TAMO motif file.
# Each MOTIF section's letter-probability matrix is scaled back into an
# approximate integer count matrix using its nsites value.
filename = sys.argv[1]
motif_list = open(filename).read().split('\nMOTIF')[1:]
tamo_list = []
motif_counter = 1
# BUGFIX: the MEME header reads
#   "letter-probability matrix: alength= 4 w= <width> nsites= <n> E= ..."
# The previous pattern "(w= [0-9]+)" captured the motif WIDTH, not the
# number of contributing sites, so counts were scaled by the wrong value.
nsites_pat = re.compile("(nsites= [0-9]+)")
for motif in motif_list:
    m_info1, m_info2 = motif.split('letter-probability matrix: ')
    # The matrix section ends at the 80-dash separator line.
    m_mat = m_info2.split('--------------------------------------------------------------------------------', 1)[0]
    m_mat_header, m_prob_mat = m_mat.split('\n', 1)
    nsites = int(nsites_pat.findall(m_mat_header)[0].split('= ')[1])
    count_mat = []
    for count in m_prob_mat.split('\n')[:-1]:
        sites = [float(i) for i in count.split()]
        # Scale each probability by nsites to approximate integer counts
        # (truncation toward zero, matching the original behavior).
        count_dict = {'A': int(sites[0] * nsites),
                      'C': int(sites[1] * nsites),
                      'G': int(sites[2] * nsites),
                      'T': int(sites[3] * nsites)}
        count_mat.append(count_dict)
    m = MotifTools.Motif_from_counts(count_mat)
    m.source = "Motif%s | %s" % (motif_counter, m_mat_header)
    tamo_list.append(m)
    motif_counter += 1
MotifTools.save_motifs(tamo_list, "MEME_motifs_%s.tamo" % filename.split('.')[0])
# Sort on log'd pVals testMotifs.sort(key=lambda x: x[1]) comboMotifs = [] for i in range(0,int(len(testMotifs)*0.2)): simMotifs = getKmersWithOneMisMtch(testMotifs[i][0],testMotifs) alndMotifs = alignSimilarMotifs([x[0] for x in simMotifs]) #for m in simMotifs: #print m[0].oneletter comboMotifs.append(MotifTools.sum(alndMotifs,[-x[1] for x in simMotifs])) # -x[1] to convert neg logs to pos weights print len(comboMotifs) t2 = time.time() oFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.gte2x.5-16mers.shfSeq.3.gGEMS.tmo' pFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.gte2x.5-16mers.shfSeq.3.gGEMS.pkl' MotifTools.save_motifs(comboMotifs,oFile,kmer_count=60) pFile = open(pFile, 'w') cPickle.dump(comboMotifs,pFile) t3 = time.time() print 'Calculations took %.3f min.\nWriting/Pickling took %.3f min.' % ((float(t2)-t1)/60, (float(t3)-t2)/60) None
# Create a general list with all the motifs from all algorithms genlist = [] genlist.extend(seederlist) genlist.extend(memelist) genlist.extend(weederlist) # Perform clustering on the general list of motifs clusterinf = clusterinfo(genlist) averages = clusteravg(genlist,clusterinf) # Trim the final average list # averages = trim(averages,0.5) # print clusterinf # Save new list of cluster averages MotifTools.save_motifs(averages,tamooutput) # WEBLOGO IMAGE GENERATION # Generate giflogos of all average motifs for index in range(len(averages)): cluster = 'Cluster ' + str(index + 1) clustergif = argv[1] + '/other/cluster' + str(index + 1) averages[index].giflogo(clustergif,title=cluster,scale=2) # SUMMARY REPORT # Determine location of the markdown file for the summary report reportout = open(str(argv[1] + '/final/' + listname + '_cluster_report.md'), 'w') # Write the header of the report rundate = date.today() header = "# Summary report for `" + listname + "`\nThis analysis was run on: " + str(rundate) + \
def memefiles2tamo(files, tamoname):
    """Convert AlignAce (.ace) / MEME (.meme) output files into one TAMO file.

    NOTE(review): a second definition of memefiles2tamo exists in this
    module; this variant enables E_site/MNCP and the consensus ROC-AUC
    ("if 1 and ...") computations, and does not consult the fsafile global.
    """
    global probefile, PROBESET
    motifs = []
    for filename in files:
        print ">>>SDFSD>F ", filename
        if re.search('\.ace$', filename):
            mdobject = AlignAce.AlignAce(filename)
            # Derive a FASTA name from the .ace name if none was recorded.
            if not mdobject.fastafile:
                mdobject.fastafile = filename.replace('.ace', '.fsa')
        elif re.search('\.meme.*$', filename):
            mdobject = Meme.Meme(filename)
            # Strip any single-char infix (e.g. ".1.meme") before mapping to .fsa.
            if not mdobject.fastafile:
                mdobject.fastafile = re.sub('\..\.meme', '.meme', filename).replace('.meme', '.fsa')
        motifs.extend(mdobject.motifs)
    # NOTE(review): everything below uses the LAST loop iteration's mdobject
    # and filename -- this assumes all input files share one FASTA; confirm.
    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    # Register the loaded sequences with the probe set.
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq
    for motif in motifs:
        # Compute each statistic only if it still holds its "unset" sentinel
        # (1 for pvalue/church, None for the others).
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1: motif.church = PROBESET.church(motif, probes, 'v')
        if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None: motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif, probes, 'v')
        if motif.frac == None: motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if re.search('\.meme$', filename):
            # MEME supplies an E-value; store it as a -log10 MAP-like score.
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        if 1 and (motif.CRA == None):
            # Best-effort consensus ROC-AUC; failures are deliberately ignored.
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif, probes, 'v', tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass
    # Sort the last file's motif list (MEME by p-value, otherwise by church
    # score); note the SAVED list is `motifs`, which is not re-sorted here.
    if re.search('\.meme$', filename):
        mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church))
    MotifTools.save_motifs(motifs, tamoname)
from gusPyCode.MDAP_proj.MDAP_defs import alignAndCombineMotifs from TAMO import MotifTools Motif = MotifTools.Motif outFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6PlusCombos.motifs.stdThresh.tmo' m = MotifTools.load('/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6.motifs.stdThresh.tmo') w = [5.8952, 5.6523, 5.0585, 4.9788, 4.9678, 4.7688] toTmo = [] toTmo.append(alignAndCombineMotifs([m[0],m[1]],[w[0],w[1]])) toTmo.append(alignAndCombineMotifs([m[0],m[4]],[w[0],w[4]])) toTmo.append(alignAndCombineMotifs([m[1],m[4]],[w[1],w[4]])) toTmo.append(alignAndCombineMotifs([m[2],m[3]],[w[2],w[3]])) toTmo.append(alignAndCombineMotifs([m[2],m[5]],[w[2],w[5]])) for e in toTmo: print e.oneletter MotifTools.save_motifs(m+toTmo,outFile) None
from gusPyCode.MDAP_proj.MDAP_defs import loadMotifsFromOutFile from TAMO import MotifTools mdOutFiles = ['/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/RandSplitFastas/AceResults/Clus2_247gene_0.8_Apr16_14-46-33.ace.1.txt', '/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/RandSplitFastas/AceResults/Clus2_247gene_0.8_Apr16_14-46-33.ace.2.txt', '/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/RandSplitFastas/AceResults/Clus2_247gene_0.8_Apr16_14-46-33.ace.3.txt'] for mdFile in mdOutFiles: motifs = loadMotifsFromOutFile(mdFile,'list') # ['Meme', 'AlignAce', 'MDscan', 'Weeder','list'] MotifTools.save_motifs(motifs,mdFile+'.tmo') print 'Done.'
# generate PWM for indicated motifs
#module load TAMO (on hpc)
import sys
from TAMO import MotifTools

# Input file (argv[1]): one motif text per line; '#' lines are comments.
motif = []
input = open(sys.argv[1], 'r')
for line in input:
    if line.startswith("#"):
        continue
    line = line.strip()   # removes trailing \n / \r and surrounding spaces
    if line:              # skip blank lines (previously "" reached Motif_from_text)
        motif.append(line)
input.close()
print (motif)

# Build a TAMO Motif (PWM) from each motif string and save them all.
pw = [MotifTools.Motif_from_text(text) for text in motif]
MotifTools.save_motifs(pw, sys.argv[1] + '.tamo')
def run_CV(self,models):
    """Run self.cv_level-fold cross-validation over the candidate models.

    For each trial, foreground/background probe sets are chosen (randomly
    when self.randomize is set, otherwise via self.undersample) and split
    into cv_level groups; each model is refined and scored through
    self.train_classifier().  In the non-randomized case the best
    parameters are used to retrain each model on the full data via
    self.train_final(), matches are printed, and the refined motifs are
    saved to self.motif_file.

    Returns:
        (self.best_motif, min) -- the refined motifs and the lowest mean
        cross-validation error found for the last hypothesis.
    """
    num_bg = len(self.all_probes.keys())
    num_fg = len(self.fg_seqs.keys())
    # Merge the foreground sequences into the full probe dictionary.
    for key in self.fg_seqs.keys():
        self.all_probes[key] = self.fg_seqs[key]
    self.probes = self.all_probes.keys()
    # Randomized runs repeat 50 times to build an error distribution.
    if (self.randomize):
        trials = 50
    else:
        trials = 1
    self.models = models
    std = 0.0
    mean = 0.0
    sum_sq = 0.0
    sum_mean = 0.0
    for trial in range(trials):
        if (self.randomize):
            # Random fg/bg selection; background capped at 10x foreground.
            N = num_bg/num_fg
            if (N>10):
                N = 10
            fg = []
            bg = []
            total = len(self.probes)
            # Sample foreground probes without replacement.
            while(len(fg)<num_fg):
                sp = random.randint(0,(total-1))
                if (not(fg.count(self.probes[sp])>0)):
                    fg.append(self.probes[sp])
            # Sample background probes disjoint from fg, without replacement.
            while(len(bg)<(N*num_fg)):
                sp = random.randint(0,(total-1))
                if ( (not(bg.count(self.probes[sp])>0)) and (not(fg.count(self.probes[sp])>0)) ):
                    bg.append(self.probes[sp])
        else:
            #select a random under-sampled set of background sequences
            N = num_bg/num_fg
            (N, bg) = self.undersample(N)
            num_bg = N*num_fg
            fg = self.fg_seqs.keys()
        fg_group_size = len(fg)/self.cv_level #determine foreground group size
        bg_group_size = len(bg)/self.cv_level #determine background group size
        #separate sequences into groups
        fg_groups = {}
        bg_groups = {}
        temp_fg = []
        temp_bg = []
        for a in fg:
            temp_fg.append(a)
        for b in bg:
            temp_bg.append(b)
        for i in range(self.cv_level):
            fg_groups[i] = []
            bg_groups[i] = []
            if (i==(self.cv_level-1)):
                # Last group takes whatever remains (absorbs rounding).
                fg_groups[i] = temp_fg[0:]
                bg_groups[i] = temp_bg[0:]
            else:
                for j in range(fg_group_size):
                    entry = temp_fg[random.randint(0,len(temp_fg)-1)]
                    temp_fg.remove(entry)
                    fg_groups[i].append(entry)
                for j in range(bg_group_size):
                    entry = temp_bg[random.randint(0,len(temp_bg)-1)]
                    temp_bg.remove(entry)
                    bg_groups[i].append(entry)
        ####################################################################################
        #for each seed we run EM to train a probability model, calculate the maximum LLR for
        #each input sequence in a group, train a SVM classifier on the training data, and
        #calculate classification error on the test set.  We repeat this for each cv-group
        #and determine the mean cross-validation error for each hypothesis.
        ####################################################################################
        self.classification_errors = {}
        self.refined_motifs = {}
        classifier = {}
        self.best_motif = []
        num_models = len(self.models)
        for k in range(num_models):
            self.best_motif.append(None)
            (self.refined_motifs[k], self.classification_errors[k]) = self.train_classifier(k, fg_groups, bg_groups, N, trial)
            #print out some information
            best_c = 0
            best_beta = 0.0
            min = 1.0    # NOTE(review): shadows the builtin min() within this method
            mean = 0.0
            # Pick the (beta, C-index) pair with the lowest mean cv error.
            for beta in self.classification_errors[k].keys():
                for j in range(len(self.classification_errors[k][beta][0])):
                    mean = 0.0
                    for i in range(self.cv_level):
                        mean = mean + self.classification_errors[k][beta][i][j]
                    mean = mean/self.cv_level
                    if (mean<min):
                        min = mean
                        best_c = j
                        best_beta = beta
            if (self.randomize):
                # Accumulate mean/variance of the error across random trials.
                sum_sq = sum_sq + (min*min)
                sum_mean = sum_mean + min
                cv_mean = sum_mean/(trial+1)
                if (trial>0):
                    try:
                        std = math.sqrt((sum_sq - (sum_mean*sum_mean)/(trial+1))/(trial))
                    except:
                        std = 0.0
                    stddev_err = 0.71*std/math.sqrt(trial+1)
            else:
                # Retrain on the full fg/bg using the best beta found above.
                (self.best_motif[k], classifier[k], fn) = self.train_final(k, fg, bg, N, best_beta)
                print "\r\r---------------New Hypothesis---------------"
                print "Unrefined Hypothesis: %s"%(self.models[k])
                print "Refined Hypothesis: %s Optimal Beta: %f"%(self.best_motif[k], best_beta/(best_beta+1.0))
                print "Mean %i-fold cv error: %f"%(self.cv_level,min)
                LLR_thresh = (classifier[k][2]*classifier[k][1]/classifier[k][0] - classifier[k][3])
                print "LLR match threshold: %f True positives: %i False Negatives: %i"%(LLR_thresh/self.best_motif[k].maxscore, (len(fg)-fn), fn)
                # Record provenance and tuned parameters on the refined motif.
                self.best_motif[k].source = self.models[k].source
                if (self.family!=''):
                    self.best_motif[k].family = self.family
                self.best_motif[k].dataset = self.datafiles[0]
                self.best_motif[k].bgfile = self.datafiles[1]
                self.best_motif[k].beta = best_beta/(best_beta+1.0)
                self.best_motif[k].match_thresh = LLR_thresh
                self.best_motif[k].cverror = min
                print "Motif matches in positive input set:"
                best_pssm = MDsupport.Motif2c_PSSM(self.best_motif[k])
                for seq in fg:
                    sites = []
                    matches = best_pssm.matchstarts(self.all_probes[seq],LLR_thresh)
                    if (matches):
                        line = seq + '------> '
                        for match in matches:
                            entry = str(match) + ': ' + self.all_probes[seq][match:match+self.best_motif[k].width] + ' '
                            line = line + entry
                        print line
    if (self.randomize):
        print "Mean: %f Std.Dev: %f Error: %f Percent error: %f"%(cv_mean,std,stddev_err,(stddev_err/std))
    else:
        MotifTools.save_motifs(self.best_motif, self.motif_file)
    return((self.best_motif, min))
motiflist.pop(blankindex)

# Replace each raw matrix (a list of tab-separated strings) in motiflist
# with a list of per-position {nucleotide: probability} dictionaries.
for idx, raw in enumerate(motiflist):
    # Width = number of value columns on the first data row.
    width = len(raw[1].strip().split('\t')) - 1
    columns = [{} for _ in range(width)]
    # Rows 1-4 each hold one nucleotide: its label then its probabilities.
    for row in range(1, 5):
        fields = raw[row].strip().split('\t')
        base = fields[0]
        values = fields[1:]
        for pos in range(width):
            columns[pos][base] = float(values[pos])
    motiflist[idx] = columns
#print motiflist

# TAMO CONVERSION
# Convert the dictionary matrices into TAMO motifs and save them.
tamomotifs = [MotifTools.Motif_from_counts(matrix[:], beta=0.01, bg=bkgrddict)
              for matrix in motiflist]
MotifTools.save_motifs(tamomotifs, output)