コード例 #1
ファイル: Background.py プロジェクト: adamlabadorf/TAMO
def main():
    """Print pseudocounted n-mer frequencies (widths 1-6) for a FASTA file.

    The FASTA file is named by ``sys.argv[1]``.  For every width ``w`` each
    n-mer returned by ``permute(w)`` starts with a pseudocount of 1; every
    count reported by ``MotifTools.top_nmers`` is then added to both the
    n-mer and its reverse complement.  A header line and one
    ``nmer  frequency`` line per n-mer are printed for each width.
    """
    seqsD = Fasta.load(sys.argv[1])
    seqs = seqsD.values()
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  # Pseudo count
            total = total + 1
        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc] = nmersD[rc] + count
                total = total + 2 * count
            except KeyError:
                # The n-mer is not one of the permute(w) keys: leave it out.
                pass
        print("# freq in %s (total %d with pseudocounts)" % (sys.argv[1], total))
        # Sorted so the report is deterministic rather than in dict order.
        for nmer in sorted(nmersD):
            print("%-7s %20.17f" % (nmer, float(nmersD[nmer]) / total))
コード例 #2
def info2seeds(N,infofile,probefile,species='YEAST'):
    """Score candidate n-mers for enrichment and print the best ones.

    N         -- n-mer width; when false the sequences themselves are scored.
    infofile  -- FASTA file read with ``Fasta.seqs``.
    probefile -- file handed to ``ProbeSet.ids_from_file``.
    species   -- name handed to ``ProbeSet``.

    At most the first 1000 entries of ``MotifTools.top_nmers`` are scored, and
    only purely alphabetic candidates get a p-value.
    """
    G    = ProbeSet(species)
    IDs  = G.ids_from_file(probefile)
    Q    = EM.theMarkovBackground.zeroth()
    seqs = Fasta.seqs(infofile)
    if not N:
        nmers = seqs
    else:
        nmers = MotifTools.top_nmers(N,seqs)
        if len(nmers) > 1000: nmers = nmers[0:1000]
    # The two values must be passed as a tuple; len() takes one argument.
    print("Scoring enrichment of %d nmers from %s"%(len(nmers),infofile))
    nmers_scoresT = []
    for nmer in nmers:
        if nmer.isalpha():
            p = G.p_value(nmer,IDs,'') #'verbose'
            nmers_scoresT.append((nmer,p))
    # Ascending p-value: the most significant candidates come first.
    nmers_scoresT.sort(key=lambda pair: pair[1])
    last = min(20,len(nmers_scoresT))
    models = []
    for i in range(last):
        seq = nmers_scoresT[i][0]
        m = MotifTools.Motif('',Q)
        # NOTE(review): in this excerpt `seq` is never applied to `m`, `m` is
        # never added to `models`, and nothing is returned.  The steps that
        # seed and collect the motifs appear to be missing -- restore them
        # from upstream rather than guessing the parameters here.
    for tup in nmers_scoresT[0:40]:
        print(tup)
コード例 #3
ファイル: Background.py プロジェクト: malhamdoosh/abseqPy
def main():
    """Print pseudocounted n-mer frequencies (widths 1-6) for a FASTA file.

    The FASTA file is named by ``sys.argv[1]``.  For every width ``w`` each
    n-mer returned by ``permute(w)`` starts with a pseudocount of 1; every
    count reported by ``MotifTools.top_nmers`` is then added to both the
    n-mer and its reverse complement.  A header line and one
    ``nmer  frequency`` line per n-mer are printed for each width.
    """
    seqsD = Fasta.load(sys.argv[1])
    seqs = seqsD.values()
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  # Pseudo count
            total = total + 1
        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc] = nmersD[rc] + count
                total = total + 2 * count
            except KeyError:
                # The n-mer is not one of the permute(w) keys: leave it out.
                pass
        print("# freq in %s (total %d with pseudocounts)" % (sys.argv[1], total))
        # Sorted so the report is deterministic rather than in dict order.
        for nmer in sorted(nmersD):
            print("%-7s %20.17f" % (nmer, float(nmersD[nmer]) / total))
コード例 #4
ファイル: MarkovBackground.py プロジェクト: xguse/gusPyProj
def main(fastafile, outDirectory):  # !! 1/2/09 AD added 'fastafile' var and changed 'if __name__' as way to call this from script.
    """Write pseudocounted n-mer frequencies (widths 1-6) to a ``.freq`` file.

    fastafile    -- FASTA file loaded with ``Fasta.load``.
    outDirectory -- directory that receives ``<basename of fastafile>.freq``.

    For every width each n-mer returned by ``permute(w)`` starts with a
    pseudocount of 1; every count reported by ``MotifTools.top_nmers`` is
    added to both the n-mer and its reverse complement.
    """
    seqsD = Fasta.load(fastafile)
    seqs = seqsD.values()
    basename = fastafile.split('/')[-1]
    output = []
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  # Pseudo count
            total = total + 1
        for nmer, count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc] = nmersD[rc] + count
                total = total + 2 * count
            except KeyError:
                # The n-mer is not one of the permute(w) keys: leave it out.
                pass
        # AD 02-27-09 added a '\n' to make file look right
        output.append("# freq in %s (total %d with pseudocounts)\n" % (basename, total))
        for nmer in sorted(nmersD):
            output.append("%-7s %20.17f\n" % (nmer, float(nmersD[nmer]) / total))
    # Write once, after every width is collected; the file used to be
    # reopened and rewritten on each pass and was never closed.
    outPath = '%s/%s.freq' % (outDirectory, basename)
    with open(outPath, 'w') as outFile:
        outFile.writelines(output)
コード例 #5
def loadmotif(infile, trimstart=0, trimend=0):
    """Load a motif from ``infile`` in one of three text layouts.

    The layout is chosen from the first line returned by ``loadlist(infile)``:

    * the header ``A C G T`` (tab separated): every following line is one
      position with four tab-separated numbers, read as counts;
    * a line starting with A, C, G or T: the lines are passed to
      ``MotifTools.Motif`` unchanged;
    * anything else: whitespace-separated integer rows, where rows 0-3 are
      the A, C, G and T counts and each column is one position.

    trimstart / trimend -- number of leading / trailing entries dropped.
    They are applied to the first two layouts only.
    """
    from TAMO import MotifTools
    lines = loadlist(infile)
    if lines[0] == "A\tC\tG\tT":
        ma = []
        for l in lines[1:]:
            p = l.split("\t")
            ma.append({
                'A': float(p[0]),
                'C': float(p[1]),
                'G': float(p[2]),
                'T': float(p[3])
            })
        # A slice ending at -0 would be empty, hence the special case.
        if trimend == 0: ma = ma[trimstart:]
        else: ma = ma[trimstart:-trimend]
        return MotifTools.Motif_from_counts(ma)
    elif lines[0][0] in 'ACGT':
        if trimend == 0: lines = lines[trimstart:]
        else: lines = lines[trimstart:-trimend]
        return MotifTools.Motif(lines)
    else:
        na = []
        for line in lines:
            na.append(list(map(int, line.split())))
        ma = []
        for i in range(len(na[0])):
            ma.append({
                'A': na[0][i],
                'C': na[1][i],
                'G': na[2][i],
                'T': na[3][i]
            })
        return MotifTools.Motif_from_counts(ma)
コード例 #6
def tamo2tamo(file, outname):
    """Fill in missing statistics for the motifs of a TAMO file and save them.

    file    -- TAMO motif file read with ``MotifTools.load``.
    outname -- path the updated motifs are written to.

    The FASTA file is the module-level ``fsafile`` when set, otherwise
    ``find_fsa(file)``; its ids are used as the bound probes.  A statistic is
    only computed while its stored value is still 1 (pvalue, church), None
    (ROC_auc, frac) or 0 (numbound).
    """
    global probefile, PROBESET, fsafile

    motifs = MotifTools.load(file)
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = find_fsa(file)

    print('# FSA  %s' % fsaname)
    fsaD = MotifMetrics.fasta2seqs(fsaname, 'want_dict')
    probes = list(fsaD.keys())
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')

    print("# %d motifs" % len(motifs))
    bound = set(probes)  # constant-time membership for the overlap count
    for motif in motifs:
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        if motif.ROC_auc is None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.frac is None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if motif.numbound == 0:
            matching = PROBESET.matching_ids(motif, [], factor=0.7)
            matchbound = [x for x in matching if x in bound]
            motif.numbound = len(probes)
            motif.nummotif = len(matching)
            motif.numboundmotif = len(matchbound)
        # E_site, E_chi2, E_seq, MNCP and the conservation ROC AUC
        # (PROBESET.cons_ROC_AUC -> motif.CRA / motif.Cfrac) were already
        # switched off in the original and are not computed here.

    MotifTools.save_motifs(motifs, outname)
コード例 #7
ファイル: memeset2tamo.py プロジェクト: xguse/customTAMO
def memefiles2tamo(files, tamoname):
    # Load AlignACE (.ace) / MEME (.meme*) result files and fill in motif
    # statistics whose stored value is still 1 (pvalue, church) or None.
    # NOTE(review): this excerpt looks incomplete -- the `if` that ends the
    # file loop has no body, `motifs` is never filled, an `except` below has
    # no matching `try`, and `tamoname` is never used.  Restore the missing
    # lines from upstream before relying on this function.
    global probefile, PROBESET, fsafile
    motifs = []
    for filename in files:
        print ">>>SDFSD>F ",filename
        if   re.search('\.ace$',filename):
            mdobject = AlignAce.AlignAce(filename)
            # Fall back to a FASTA file named after the .ace file.
            if not mdobject.fastafile: mdobject.fastafile=filename.replace('.ace','.fsa')
        elif re.search('\.meme.*$',filename):
            mdobject = Meme.Meme(filename)
            # NOTE(review): the fallback assignment for MEME files is missing here.
            if not mdobject.fastafile:

    #fsaname = find_fsa(mdobject.fastafile)
    # Only the last file's `mdobject` / `filename` are visible from here on.
    print mdobject.fastafile
    if fsafile: fsaname = fsafile
    else:       fsaname = Fasta.find(mdobject.fastafile)
    fsaD    = Fasta.load(fsaname)
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    # Register every loaded sequence with the probe set.
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if motif.frac   == None: motif.frac   = PROBESET.frac(motif,probes,'v',0.7)
        if re.search('\.meme$',filename):
            # MAP is -log10 of the E-value.
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        # `if 0` keeps the conservation ROC AUC switched off.
        if 0 and (motif.CRA == None):
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass

    # NOTE(review): two consecutive sorts; an `else:` between them appears to
    # be missing (p-value order for MEME, church order otherwise) -- confirm.
    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))

コード例 #8
def tamo2tf(TAMO_file):
    '''Converts TAMO files to the TRANSFAC format

    The motifs are read with MotifTools.txt2motifs and written to
    "<TAMO_file>.tf".  Each motif gets a DE line (its source, or
    "<file name>_<n>" when the source is empty) followed by one row per
    position: the position index and, when the motif has logP values,
    int(100 * 2**logP) for A, C, G and T.
    '''
    ml = MotifTools.txt2motifs(TAMO_file)
    TAMO_file_name = TAMO_file.split("/")[-1]
    ACGT = ["A", "C", "G", "T"]
    with open("%s.tf" % (TAMO_file), "w") as oup:
        for n, m in enumerate(ml, 1):
            if m.source == "":
                oup.write("DE\t%s_%s\t%s_%s\n" %
                          (TAMO_file_name, n, TAMO_file_name, n))
            else:
                oup.write("DE\t%s\t%s\n" % (m.source, m.source))
            for i in range(m.width):
                oup.write("%s\t" % i)
                for letter in ACGT:
                    if m.logP:
                        Pij = pow(2.0, m.logP[i][letter])
                        oup.write("%s\t" % int(Pij * 100))
                # End the row; without this every position ran into the next
                # record on a single line.
                # NOTE(review): any record separator lines of the original
                # are not visible in this excerpt -- confirm against upstream.
                oup.write("\n")
コード例 #9
ファイル: MotifCompare.py プロジェクト: adamlabadorf/TAMO
def averagemotifs(motifs,ovlp=2,template=None,DFUNC=negcommonbitsrange,VERBOSE=1,prop=''):
    """Align ``motifs`` to a template and return their combination.

    motifs   -- motif objects; each one gets ``offset`` and ``rc`` set.
    ovlp     -- not used by this function (OVLP(template, m) is used instead).
    template -- motif to align against; defaults to the motif selected by
                ``centroididx(computeDmat(motifs))``.
    DFUNC    -- distance function forwarded to ``minshortestoverhangdiff``.
    VERBOSE  -- when true, print every aligned motif with its distance to
                the result.
    prop     -- optional attribute name whose value is printed as well.

    Returns the value of ``MotifTools.sum`` over the aligned motifs.
    """
    if not template:
        Dmat = computeDmat(motifs)
        idx  = centroididx(Dmat)
        template = motifs[idx]

    for m in motifs:
        off, rc = minshortestoverhangdiff(template,m,OVLP(template,m),'want_offset',DFUNC=DFUNC)
        m.offset = off
        m.rc     = rc
    # Most negative offset and right-most end define the common frame.
    minpos = -min(m.offset for m in motifs)
    maxpos = max(m.offset + m.width for m in motifs) + minpos
    pmotifs = []
    for m in motifs:
        if m.rc: _m = m.revcomp()
        else   : _m = m
        leftpad  = minpos + m.offset
        rightpad = maxpos - (leftpad + m.width)
        # The re-framed motif has to be collected, otherwise nothing is summed.
        pmotifs.append(_m[-leftpad,_m.width+rightpad])
    AVE = MotifTools.sum(pmotifs,[])
    if VERBOSE:
        for m in pmotifs:
            d = minshortestoverhangdiff(AVE,m,OVLP(AVE,m),DFUNC=DFUNC)
            fields = ['%s   %5.3f'%(m.oneletter,d)]
            if 'key' in m.__dict__: fields.append(str(m.key))
            if prop and prop in m.__dict__: fields.append(str(m.__dict__[prop]))
            print(' '.join(fields))
        print('-'*m.width)
    return AVE
コード例 #10
ファイル: EM.py プロジェクト: adamlabadorf/TAMO
    def __init__(self,seed_seqs, all_seqs, width = 6, verbose = ''):
        """Set up the search state and its default parameters.

        seed_seqs -- sequences to be scanned for seeds.
        all_seqs  -- stored as ``self.seqs``.
        width     -- seed width; when 0 the seed sequences themselves are
                     used, each paired with its index.
        verbose   -- stored as ``self.verbose``.
        """
        self.seed_seqs  = seed_seqs #Sequences to be scanned for seeds
        self.seqs       = all_seqs
        self.candidates = []
        self.models     = []      #Set directly or computed from seed_seqs
        self.width      = width
        self.verbose    = verbose
        if width:
            self.goodwmersT = MotifTools.top_nmers(self.width,self.seed_seqs,1,"")
        else:
            # No width given: pair every seed sequence with its position.
            self.goodwmersT = list(zip(self.seed_seqs,range(len(self.seed_seqs))))
        self.bgprob     = {'A': 0.31, 'C': .19, 'G': .19, 'T': .31}
        self.beta       = 0.001
        self.deltamin   = 1e-3
        self.probes     = []
        self.method     = "ZOOPS" # OOPS or ZOOPS )
        self.param      = {}
        self.gapflank   = 0
        self.gapweight  = 0.2
        self.seedbeta   = 0.02
        self.joint      = 1

        # A module-level Markov background, when set, replaces the defaults.
        global theMarkovBackground
        if theMarkovBackground:
            self.bgprob = theMarkovBackground.zeroth()

コード例 #11
ファイル: UPGMA.py プロジェクト: malhamdoosh/abseqPy
def parse_opts():
    """Parse ``sys.argv`` into GLOBALS and install the distance function.

    Options:
      -m FILE       motifs, loaded with ``MotifTools.txt2motifs``
      --dfunc NAME  distance metric: 'NCB' or an attribute of MotifCompare
      -d VALUE      sets the module-level DMAX

    ``usage()`` is called when no options are given or the metric name is
    unknown.
    """
    global GLOBALS
    global DFUNC, DMAX
    # '-d' is handled below, so it has to be declared here; getopt marks
    # long options that take a value with '=', not ':'.
    short_opts = 'm:d:'
    long_opts  = ['dfunc=']
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError as err:
        print(err)
        opts, args = [], []   # fall through to the usage() call below
    if not opts: usage()

    GLOBALS['args'] = args
    GLOBALS['motifs'] = []
    DFUNCtxt = None
    for opt,value in opts:
        if opt == '-m':                   GLOBALS['motifs'] = MotifTools.txt2motifs(value)
        if opt == '--dfunc':              DFUNCtxt = value
        if opt == '-d':                   DMAX     = float(value)

    # Deal with DFUNC and DMAX
    _DFUNC = None
    if DFUNCtxt == 'NCB':
        _DFUNC = MotifCompare.negcommonbits
    elif DFUNCtxt:
        # Attribute lookup instead of exec: no code is run from the
        # command line and an unknown name is reported cleanly.
        _DFUNC = getattr(MotifCompare, DFUNCtxt, None)
        if _DFUNC is None:
            usage("No such distance metric: %s"%DFUNCtxt)
    if _DFUNC:  set_dfunc(_DFUNC,DMAX)
コード例 #12
ファイル: tamo2tamo.py プロジェクト: adamlabadorf/TAMO
def tamo2tamo(file, outname):
    """Fill in missing statistics for the motifs of a TAMO file and save them.

    file    -- TAMO motif file read with ``MotifTools.load``.
    outname -- path the updated motifs are written to.

    The FASTA file is the module-level ``fsafile`` when set, otherwise
    ``find_fsa(file)``; its ids are used as the bound probes.  A statistic is
    only computed while its stored value is still 1 (pvalue, church), None
    (ROC_auc, frac) or 0 (numbound).
    """
    global probefile, PROBESET, fsafile
    motifs  = MotifTools.load(file)
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = find_fsa(file)

    print('# FSA  %s' % fsaname)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = list(fsaD.keys())
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')

    print("# %d motifs"%len(motifs))
    bound = set(probes)  # constant-time membership for the overlap count
    for motif in motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.ROC_auc is None: motif.ROC_auc = PROBESET.ROC_AUC(motif,probes,'v')
        if motif.frac is None: motif.frac = PROBESET.frac(motif,probes,'v',0.7)
        if motif.numbound == 0:
            matching            = PROBESET.matching_ids(motif,[],factor=0.7)
            matchbound          = [x for x in matching if x in bound]
            motif.numbound      = len(probes)
            motif.nummotif      = len(matching)
            motif.numboundmotif = len(matchbound)
        # E_site, E_chi2, E_seq, MNCP and the conservation ROC AUC
        # (PROBESET.cons_ROC_AUC -> motif.CRA / motif.Cfrac) were already
        # switched off in the original and are not computed here.

    # NOTE(review): the save step is not visible in this excerpt; `outname`
    # was otherwise unused, so it is written here -- confirm against upstream.
    MotifTools.save_motifs(motifs, outname)
コード例 #13
def pick_chunk_score(wdir, TAMO_file, target, genome):
    '''Trims the motifs of a cluster and saves its top motif.

    This script takes in the TAMO file from the motifs in a single cluster. It
    trims the low-information ends from each motif. It then identifies the
    motif that is most significantly represented in the target genes in your
    genome. If no motif is significantly represented, then a blank top motif
    file is created.

    The result is written to "<TAMO_file>.TOP".
    '''
    # NOTE(review): "cd" runs in a child shell, so this does not change the
    # working directory of this process; os.chdir(wdir) may be what was meant.
    os.system("cd %s" % wdir)

    script_dir = '/'.join(os.path.abspath(__file__).split('/')
                          [:-1])  # path to pcc_merge_CC.py script

    # step 1 trim tamo to eliminate low information flanking sequence
    trim_motif(TAMO_file, 0.1)

    # step 2 "Group Specificity Score" from the Church lab
    # python MotifMetrics.py [Genes of interest] -genome [FASTA of promoter sequence] -t [Trimmed TAMO of cluster motifs]
    # MotifMetrics.py checks if the motifs appear disproportionately in the
    # targets compared to the rest of the genes.
    os.system(
        "python %s/MotifMetrics.py %s -genome %s -t %s_0.1.trim -spec > %s_0.1.trim_Cout"
        % (script_dir, target, genome, TAMO_file, TAMO_file))

    # Gets the motif that is most significantly represented in your target genes
    # Returns "None" if none of the motifs has a p-value above 0.001.
    topm = parse_out_pcs("%s_0.1.trim_Cout" % TAMO_file)
    print("topm %s" % (topm,))

    # Writes the top motif to its own file.
    if topm != "None":

        newdic = {}
        ml = MotifTools.txt2motifs("%s_0.1.trim" % TAMO_file)

        for m in ml:

            if m.oneletter == topm:
                newdic[m.oneletter] = m

        save_motifs(newdic.values(), "%s.TOP" % TAMO_file)
        os.system("rm %s_0.1.trim" % TAMO_file)
        os.system("rm %s_0.1.trim_Cout" % TAMO_file)

    # Writes a blank document if there was no top motif.
    else:
        open("%s.TOP" % TAMO_file, "w").close()
コード例 #14
def ace2tamo(filename, tamoname):
    """Score and order the motifs of an AlignACE (.ace) or MEME (.meme) file.

    filename -- result file; the extension selects the parser.
    tamoname -- NOTE(review): not used in this excerpt; the step that writes
                the motifs out appears to be missing -- confirm upstream.

    Statistics are only computed while their stored value is still 1
    (pvalue, church) or None.  MEME motifs additionally get ``MAP`` set to
    -log10 of their E-value.  Raises ValueError for any other extension.
    """
    global probefile, PROBESET
    is_meme = bool(re.search(r'\.meme$', filename))
    if re.search(r'\.ace$', filename):
        mdobject = AlignAce.AlignAce(filename)
    elif is_meme:
        mdobject = Meme.Meme(filename)
    else:
        # This case used to fall through and fail on an unbound `mdobject`.
        raise ValueError("expected a .ace or .meme file, got %r" % (filename,))

    fsaname = find_fsa(mdobject.fastafile)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = list(fsaD.keys())
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('HUMAN_250')
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in mdobject.motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.E_site is None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        if motif.E_seq is None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc is None: motif.ROC_auc = PROBESET.ROC_AUC(motif,probes,'v')
        if motif.MNCP is None: motif.MNCP = PROBESET.MNCP(motif,probes,'v')
        if is_meme:
            motif.MAP = -math.log(motif.evalue)/math.log(10)

    for i, motif in enumerate(mdobject.motifs):
        motif.seednum = i
        kmers = motif.bogus_kmers(100)
        motif.maxscore = -100
        scores = [motif.scan(kmer)[2][0] for kmer in kmers]
        print(Arith.avestd(scores))

    # NOTE(review): the excerpt showed both sorts under the `if`; an `else:`
    # between them appears to have been lost -- confirm against upstream.
    if is_meme:
        mdobject.motifs.sort(key=lambda m: m.pvalue)
    else:
        mdobject.motifs.sort(key=lambda m: m.church)

コード例 #15
ファイル: ace2tamo.py プロジェクト: xguse/customTAMO
def ace2tamo(filename, tamoname):
    """Score and order the motifs of an AlignACE (.ace) or MEME (.meme) file.

    filename -- result file; the extension selects the parser.
    tamoname -- NOTE(review): not used in this excerpt; the step that writes
                the motifs out appears to be missing -- confirm upstream.

    Statistics are only computed while their stored value is still 1
    (pvalue, church) or None.  MEME motifs additionally get ``MAP`` set to
    -log10 of their E-value.  Raises ValueError for any other extension.
    """
    global probefile, PROBESET
    is_meme = bool(re.search(r'\.meme$', filename))
    if re.search(r'\.ace$', filename):
        mdobject = AlignAce.AlignAce(filename)
    elif is_meme:
        mdobject = Meme.Meme(filename)
    else:
        # This case used to fall through and fail on an unbound `mdobject`.
        raise ValueError("expected a .ace or .meme file, got %r" % (filename,))

    fsaname = find_fsa(mdobject.fastafile)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = list(fsaD.keys())
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('HUMAN_250')
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in mdobject.motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.E_site is None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        if motif.E_seq is None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc is None: motif.ROC_auc = PROBESET.ROC_AUC(motif,probes,'v')
        if motif.MNCP is None: motif.MNCP = PROBESET.MNCP(motif,probes,'v')
        if is_meme:
            motif.MAP = -math.log(motif.evalue)/math.log(10)

    for i, motif in enumerate(mdobject.motifs):
        motif.seednum = i
        kmers = motif.bogus_kmers(100)
        motif.maxscore = -100
        scores = [motif.scan(kmer)[2][0] for kmer in kmers]
        print(Arith.avestd(scores))

    # NOTE(review): the excerpt showed both sorts under the `if`; an `else:`
    # between them appears to have been lost -- confirm against upstream.
    if is_meme:
        mdobject.motifs.sort(key=lambda m: m.pvalue)
    else:
        mdobject.motifs.sort(key=lambda m: m.church)

コード例 #16
def trim_motif(TAMO_file, cut=0.4):
    '''Trims the motifs in TAMO_file, eliminating low-information flanks.

    Each motif loaded from TAMO_file is replaced by motif.trimmed(cut) and
    the results are saved to "<TAMO_file>_<cut>.trim".
    '''

    testmotifs = MotifTools.load(TAMO_file)
    outname = TAMO_file + "_" + str(cut) + ".trim"

    # Collect the trimmed motifs; previously the list stayed empty and an
    # empty file was saved.
    new_mlist = [motif.trimmed(cut) for motif in testmotifs]
    save_motifs(new_mlist, outname)
コード例 #17
def TAMO_split(TAMO_file, motifs_per_file=190):
    '''This function splits a TAMO into smaller files for create_cc

    Files are named "<TAMO_file>_n0" ... "<TAMO_file>_n<total>".  The last
    one holds the remainder and is written even when it is empty.

    Returns the number of full-size files, which is also the index of the
    final remainder file.
    '''
    ml = MotifTools.txt2motifs(TAMO_file)
    # Convert once so the file count and the slices use the same integer.
    by = int(motifs_per_file)
    total = len(ml) // by  # Total number of full TAMOs to generate
    for i in range(total):
        print(i)
        print('%s %s' % (i * by + by, TAMO_file + '_n%s' % i))
        save_motifs(ml[i * by:i * by + by], TAMO_file + '_n%s' % i)
    print('%s %s %s' % (total * by, len(ml), TAMO_file + '_n%s' % (total)))
    save_motifs(ml[total * by:len(ml)], TAMO_file + '_n%s' % (total))
    return (total)
コード例 #18
def opentamo(fileloc):
    """
    Opens a tamo file with MotifTools.load and returns the list of motifs,
    except when loading raises IOError (for example because the input file
    doesn't exist), in which case it returns an empty list.

    Has 1 argument:
    - fileloc: a string with the location of the file
    """
    try:
        return MotifTools.load(fileloc)
    except IOError:
        return []
コード例 #19
ファイル: infoana.py プロジェクト: xtina/ExamiNDR
def Reduce_Nmers(Info):
    """Print the most frequent 5-mers among the representative sequences.

    Representative sequences come from ``ReduceInfo2seqs`` (top three 6-mers
    per group).  They are listed five per line, then each of the eight most
    frequent 5-mers is printed with its count and share of all 5-mer counts
    and compared against every entry of ``Info.query['bsites']``.
    """
    print('COMPUTING Nmers ....')
    mseqs = ReduceInfo2seqs(Info,70, lambda L: MotifTools.top_nmers(6,L)[0:3])
    print("Combining representative sequences...: ")
    # Five sequences per row; the last row is terminated as well.
    for start in range(0, len(mseqs), 5):
        print(' '.join('\t%s' % s for s in mseqs[start:start + 5]))

    top_seq_pairs = MotifTools.top_nmers(5,mseqs,1)
    total_nmers = 0
    for (nmer,count) in top_seq_pairs:
        total_nmers = total_nmers + count
    for (nmer,count) in top_seq_pairs[0:8]:
        # NOTE(review): the percentage argument was cut off in this excerpt;
        # it is rebuilt from total_nmers -- confirm against upstream.
        print("RESULT: %s\t%2d (%5.2f%%) occurences:  "%(nmer,count,
                                                         100.0*count/total_nmers))
        for bsite in Info.query['bsites']:
            seq = bsite.cleantxt()
            (score,s1,s2) = MotifTools.compare_seqs(nmer,seq)
            print('   %s vs %s %4.2f correct'%(s1,s2,score))
コード例 #20
def tamofile2motifs(filename):
    """Parse a TAMO text file into a list of motifs.

    A line starting with 'Log-odds m' opens a new motif: the next line gives
    the width and the four lines after it the per-letter values.  Later
    'Motif ', 'Threshold:', 'Source:', 'Gamma:' and 'Evalue' lines set
    attributes on the most recent motif.  'Seed ' lines and the
    'Using ... as seeds' line are used to attach ``seedtxt`` and
    ``seedfile`` afterwards.

    Returns the list of motifs.
    """
    with open(filename,'r') as FID:
        lines = FID.readlines()
    motifs   = []
    seedD    = {}
    seedfile = ''
    for i in range(len(lines)):
        line = lines[i]
        if line.startswith('Log-odds m'):
            w = len(lines[i+1].split())-1
            # One dict per position, filled letter by letter below.
            ll = [{} for pos in range(w)]
            for j in range(0,4):
                toks = lines[i+j+2].split()
                L = toks[0][1]
                for pos in range(w):
                    ll[pos][L] = float(toks[pos+1])
            m = MotifTools.Motif_from_ll(ll)
            # The handlers below all update motifs[-1].
            motifs.append(m)
        if line.startswith('Motif '):
            toks =  line.split()
            motifs[-1].nseqs    = float(re.sub(r'[\(\)]','',toks[3]))
            motifs[-1].totalbits= float(toks[5])
            motifs[-1].MAP      = float(toks[7])
            motifs[-1].seeddist = float(toks[9])
            motifs[-1].seednum  = int(toks[10][0:-1])
            motifs[-1].pvalue   = math.pow(10,-float(toks[12]))
            if 'ch:' in toks:
                motifs[-1].church = math.pow(10,-float(toks[14]))
        if line.startswith('Threshold:'):
            toks =  line.split()
            motifs[-1].threshold= float(toks[1])
        if line.startswith('Seed '):
            toks = line.split()
            seed_id = int(toks[1][0:-1])  #'10:' -> '10'
            seedD[seed_id] = toks[2]
        if line.startswith('Source:'):
            motifs[-1].source = line[7:].strip()
        if line.startswith('Gamma:'):
            motifs[-1].gamma = float(line[6:])
        if line.startswith('Evalue'):
            motifs[-1].evalue = float(line[7:].strip())
        if line.find('Using')>=0 and line.find('as seeds')>=0:
            # e.g. "#Using all (132) motifs in SLT_081503.seeds as seeds:"
            seedfile = line.split()[-3]
    for motif in motifs:
        if seedfile: motif.seedfile = seedfile
        # A motif without a 'Motif ' line has no seednum; skip the lookup.
        seednum = getattr(motif, 'seednum', None)
        if seednum in seedD:
            motif.seedtxt = seedD[seednum]
    return motifs
コード例 #21
ファイル: kellis2tamo.py プロジェクト: malhamdoosh/abseqPy
def motifs2tamo(motifs, outname):
    """Fill in the statistics that are still unset on ``motifs``.

    motifs  -- motif objects, updated in place.
    outname -- used to locate the FASTA file through ``find_fsa``.

    A statistic is only computed while its stored value is still 1 (pvalue,
    church) or None (E_site, E_seq, ROC_auc, MNCP).

    NOTE(review): no save step is visible in this excerpt; confirm against
    upstream whether the motifs are meant to be written to ``outname``.
    """
    global probefile, PROBESET
    fsaname = find_fsa(outname)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = list(fsaD.keys())
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')

    print("# %d motifs"%len(motifs))
    for motif in motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.E_site is None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        if motif.E_seq is None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc is None: motif.ROC_auc = PROBESET.ROC_AUC(motif,probes,'v')
        if motif.MNCP is None: motif.MNCP = PROBESET.MNCP(motif,probes,'v')
コード例 #22
def parse_block(name, block):
    """Build a motif from the count rows of one block.

    name  -- stored as the motif's ``source``.
    block -- iterable of text lines; after whitespace splitting, fields 1-4
             of each line are read as the A, C, G and T counts (field 0 is
             not used).

    Returns the motif built by ``MotifTools.Motif_from_counts``.
    """
    mat = []
    ACGT = {"A": 1, "C": 2, "G": 3, "T": 4}
    for i in block:
        L = i.strip().split()
        D = {'A': 0, 'C': 0, 'T': 0, 'G': 0}
        for j in ACGT.keys():
            D[j] = float(L[ACGT[j]])
        # Keep the row; without this the motif was built from an empty matrix.
        mat.append(D)
    m = MotifTools.Motif_from_counts(mat)
    m.source = name
    return m
コード例 #23
def test():
    """Build a motif from the text 'GGTTTCAT' and validate it twice.

    The motif is checked with ``validate`` against "STE12" and "FKH2", and
    the beta value is printed together with both results.
    """
    # Other beta values of interest; only 1.0 is exercised below.
    betalist = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 4.0]
    for beta in [1.0]:
        site = MotifTools.Motif()
        site.compute_from_text('GGTTTCAT', beta)  # STE12 binding site
        print(site)
        print("Against Ste12:")
        ste12_result = validate(site, "STE12", 'V', 'T')
        print("Against Fkh2:")
        fkh2_result = validate(site, "FKH2", 'V', 'T')
        print("%s %s %s" % (beta, ste12_result, fkh2_result))
コード例 #24
ファイル: EM.py プロジェクト: adamlabadorf/TAMO
 def freq_from_seqs_old(self,seqs):
     """Fill ``self.F`` with n-mer frequencies (widths 1-5) from ``seqs``.

     Each n-mer and its reverse complement get the same frequency: the
     n-mer's share of all counts for a palindrome, half of that share
     otherwise.  ``self.nmers_by_size[depth]`` lists the n-mers seen.
     """
     self.highestorder = 4
     for depth in range(1,6):
         nmersT = MotifTools.top_nmers(depth, seqs, "TUPLES")
         self.nmers_by_size[depth] = [nmer for nmer,count in nmersT]
         total = 0
         for nmer,count in nmersT:
             total = total + count
         for nmer,count in nmersT:
             rc = MotifTools.revcomplement(nmer)
             if nmer == rc:                       #correct top_nmers
                 f   = float(count)/total         #palindrome count
             else:
                 f   = float(count)/total/2
             self.F[nmer] = f
             self.F[rc]   = f
     for depth in range(0):                       #For debugging
         total = 0
         for k in self.F.keys():
             if len(k) == depth:
                 total = total + self.F[k]
                 print("%s %s" % (k, self.F[k]))
         print("%s %s" % (depth,total))
コード例 #25
ファイル: MDAP_defs.py プロジェクト: xguse/gusPyProj
def alignAndCombineMotifs(motifs, weights):
    """Align motifs in order of increasing |weight| and combine them.

    The motifs are paired with their weights, ordered by absolute weight,
    aligned with ``alignSimilarMotifs`` (minoverlap=4) and combined with
    ``MotifTools.sum``, using the negated weights in the same order.
    """
    ranked = sorted(zip(motifs, weights), key=lambda pair: abs(pair[1]))
    ordered_motifs = [pair[0] for pair in ranked]
    aligned = alignSimilarMotifs(ordered_motifs, minoverlap=4)
    negated_weights = [-pair[1] for pair in ranked]
    return MotifTools.sum(aligned, negated_weights)
コード例 #26
def combine_distance_matrix_for_2(wdir, TAMO_file_1, TAMO_file_2):
    '''Combines matrices made from two TAMO files.

    This script is used to create the final matrix after all jobs from
    create_cc_for_2 are complete.

    One "paste" command is built per chunk of TAMO_file_1 (chunks of 100
    motifs) and a final "cat" command joins the results into
    "<TAMO_file_1>-<TAMO_file_2>.dm".

    NOTE(review): in this excerpt the commands are only printed; the calls
    that execute them appear to be missing -- confirm against upstream.
    '''
    ml_1 = MotifTools.txt2motifs(TAMO_file_1)
    ml_2 = MotifTools.txt2motifs(TAMO_file_2)

    # Floor division keeps these usable as range() bounds on Python 2 and 3.
    n_split_1 = len(ml_1) // 100
    n_split_2 = len(ml_2) // 100

    print("%s %s" % (n_split_1, len(ml_1)))
    print(n_split_2)

    # Change to the working directory.
    # NOTE(review): "cd" runs in a child shell and does not affect this
    # process; os.chdir(wdir) may be what was meant.
    os.system("cd %s" % wdir)

    # This loop will paste together matrices
    for i in range(n_split_1 + 1):
        parts = ["%s_n%s-%s_n%s.dm" % (TAMO_file_1, i, TAMO_file_2, j)
                 for j in range(n_split_2 + 1)]
        com = "paste " + " ".join(parts) + " > distance_%s" % i
        print(com)

    chunks = ["distance_%s" % i for i in range(n_split_1 + 1)]
    com = "cat " + " ".join(chunks) + " > %s-%s.dm" % (TAMO_file_1, TAMO_file_2)

    print(com)
コード例 #27
ファイル: tamo2table.py プロジェクト: malhamdoosh/abseqPy
def parse_opts():
    """Parse ``sys.argv`` into GLOBALS.

    Options:
      -m FILE                motifs, loaded with ``MotifTools.txt2motifs``
      -g FILE, --genome FILE stored as GLOBALS['genomefile']
      --top N                stored as GLOBALS['top'] (int)

    ``usage()`` is called when no options are given or parsing fails.
    """
    global GLOBALS
    short_opts = 'm:g:'
    long_opts  = ['genome=','top=']
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError as err:
        # Report the actual problem, then fall through to usage() below
        # instead of failing on an unbound `opts`.
        print(err)
        opts, args = [], []
    if not opts: usage()

    GLOBALS['args'] = args
    for opt,value in opts:
        if opt == '-m':                GLOBALS['motifs']     = MotifTools.txt2motifs(value)
        if opt in ['-g', '--genome']:  GLOBALS['genomefile'] = value
        if opt == '--top':             GLOBALS['top']        = int(value)
コード例 #28
ファイル: TAMO_Motif.py プロジェクト: shwetabhandare/PySG
def Read_Dreme_PSSM(lines):
	# Build a motif from PSSM text: `lines` is split on newlines and each
	# line on whitespace; `pwm` is then passed through MotifTools.toDict and
	# MotifTools.Motif_from_counts.
	# NOTE(review): the body of the inner loop is missing in this excerpt, so
	# nothing is ever added to `vals` or `pwm`.  Presumably each item was
	# converted to a number and each row appended to `pwm` -- confirm upstream.
	pwm = []
	# `name` is not used in the visible code.
	name = "Dreme Motif";

	vals = []
	for line in lines.split('\n'):
		for item in line.split():
		# Start a fresh row for the next line.
		vals = [];
	#print pwm

	m = MotifTools.toDict(pwm)
	motif = MotifTools.Motif_from_counts(m)
	return motif;
コード例 #29
ファイル: EM.py プロジェクト: adamlabadorf/TAMO
 def study_seqs(self,seqs):
     """Fill ``self.D`` with log2 n-mer frequencies (widths 1-5) from ``seqs``.

     A palindromic n-mer gets log2 of its share of all counts.  Any other
     n-mer and its reverse complement both get log2 of half that share.
     """
     for depth in range(1,6):
         nmersT = MotifTools.top_nmers(depth, seqs, "TUPLES")
         total = 0
         for nmer,count in nmersT:
             total = total + count
         for nmer,count in nmersT:
             f   = math.log(float(count)/total)/math.log(2)
             f_2 = math.log(0.5 * float(count)/total)/math.log(2)
             rc = MotifTools.revcomplement(nmer)
             if rc != nmer:
                 self.D[nmer] = f_2
                 self.D[rc]   = f_2
             else:
                 # Palindrome: the full share belongs to the one n-mer.
                 self.D[nmer] = f
     for depth in range(0):                       # disabled debugging output
         total = 0
         for k in self.D.keys():
             if len(k) == depth:
                 total = total + pow(2,self.D[k])
                 print("%s %s" % (k, pow(2,self.D[k])))
         print("%s %s" % (depth,total))
     self.highestorder = 5
コード例 #30
ファイル: EM.py プロジェクト: adamlabadorf/TAMO
 def freq_from_seqs(self,seqs):
    """Fill ``self.F`` with pseudocounted n-mer frequencies (widths 1-6).

    Every n-mer returned by ``permute(w)`` starts with a pseudocount of 1;
    each count reported by ``MotifTools.top_nmers`` is added to both the
    n-mer and its reverse complement.  ``self.nmers_by_size[w]`` receives a
    copy of the ``permute(w)`` list.
    """
    self.highestorder = 6
    for w in range(1,7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w,seqs,'with counts','purge Ns')
        self.nmers_by_size[w] = allnmers[:]
        nmersD = {}
        total = 0.0
        for nmer in allnmers: #Pseudo count
            nmersD[nmer] = 1
            total = total + 1
        for nmer,count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc]   = nmersD[rc]   + count
                total = total + 2*count
            except KeyError:
                # The n-mer is not one of the permute(w) keys: leave it out.
                pass
        for nmer in nmersD.keys():
            rc = MotifTools.revcomplement(nmer)
            f  = nmersD[nmer]/total
            self.F[nmer] = f
            self.F[rc]   = f
コード例 #31
ファイル: TAMO_Motif.py プロジェクト: shwetabhandare/PySG
def Make_PWM_Motif(filename, motifBackGround=""):
	"""Read a PWM file and return it as a motif.

	The name and matrix come from Read_PWM(filename); the matrix goes through
	MotifTools.toDict and MotifTools.Motif_from_ll, and the name is stored as
	the motif's ``source``.  ``motifBackGround`` is accepted but not used.
	"""
	pwm_name, pwm_rows = Read_PWM(filename)
	pwm_motif = MotifTools.Motif_from_ll(MotifTools.toDict(pwm_rows))
	pwm_motif.source = pwm_name
	return pwm_motif
コード例 #32
def averagemotifs(motifs,
                  ovlp=2,
                  template=None,
                  DFUNC=negcommonbitsrange,
                  VERBOSE=1,
                  prop=''):
    """Align ``motifs`` to a template and return their combination.

    motifs   -- motif objects; each one gets ``offset`` and ``rc`` set.
    ovlp     -- not used by this function (OVLP(template, m) is used instead).
    template -- motif to align against; defaults to the motif selected by
                ``centroididx(computeDmat(motifs))``.
    DFUNC    -- distance function forwarded to ``minshortestoverhangdiff``.
    VERBOSE  -- when true, print every aligned motif with its distance to
                the result.
    prop     -- optional attribute name whose value is printed as well.

    Returns the value of ``MotifTools.sum`` over the aligned motifs.
    """
    if not template:
        Dmat = computeDmat(motifs)
        idx = centroididx(Dmat)
        template = motifs[idx]

    for m in motifs:
        off, rc = minshortestoverhangdiff(template,
                                          m,
                                          OVLP(template, m),
                                          'want_offset',
                                          DFUNC=DFUNC)
        m.offset = off
        m.rc = rc
    # Most negative offset and right-most end define the common frame.
    minpos = -min(m.offset for m in motifs)
    maxpos = max(m.offset + m.width for m in motifs) + minpos
    pmotifs = []
    for m in motifs:
        if m.rc: _m = m.revcomp()
        else: _m = m
        leftpad = minpos + m.offset
        rightpad = maxpos - (leftpad + m.width)
        # The re-framed motif has to be collected, otherwise nothing is summed.
        pmotifs.append(_m[-leftpad, _m.width + rightpad])
    AVE = MotifTools.sum(pmotifs, [])
    if VERBOSE:
        for m in pmotifs:
            d = minshortestoverhangdiff(AVE, m, OVLP(AVE, m), DFUNC=DFUNC)
            fields = ['%s   %5.3f' % (m.oneletter, d)]
            if 'key' in m.__dict__: fields.append(str(m.key))
            if prop and prop in m.__dict__: fields.append(str(m.__dict__[prop]))
            print(' '.join(fields))
        print('-' * m.width)
    return AVE
コード例 #33
def showdiffXvert(motif, seq, OVLP_FCN=None, DIFF_FCN=None):
    '''Print the best alignment between a motif and a sequence.

    The function converts the sequence to a Motif, computes the D
    of the best alignment, and prints the alignment that generated
    that D.

    motif    -- reference motif; its ``background`` is reused for the
                motif built from ``seq``.
    seq      -- text handed to MotifTools.Motif_from_text.
    OVLP_FCN -- optional callable (A, B) -> overlap value passed on to
                minshortestoverhangdiff; defaults to
                min(min(A.width, B.width) - 1, 7).
    DIFF_FCN -- forwarded unchanged as the DFUNC keyword argument.

    Returns the D value computed for the pair.
    '''
    MSOdiff = minshortestoverhangdiff
    if not OVLP_FCN: OVLP_FCN = lambda A, B: min(min(A.width, B.width) - 1, 7)
    bg = motif.background
    other = MotifTools.Motif_from_text(seq, bg=bg)
    ovlp = OVLP_FCN(motif, other)
    diff = MSOdiff(motif, other, ovlp, DFUNC=DIFF_FCN)
    # Same comparison again, this time asking for the offset and the
    # reverse-complement flag instead of the score.
    offset, rcflag = MSOdiff(motif, other, ovlp, 'want_offset', DFUNC=DIFF_FCN)
    m = other.revcomp() if rcflag else other
    # Single parenthesised argument: prints the same text under the Python 2
    # print statement used throughout this file and under print().
    print('MSOdiff:  %8.4f %s%s%s' % (diff, ' ' * 15, motif.oneletter,
                                      ' ' * (30 - motif.width)))
    print('          %8s %s%s%s' % (' ', ' ' * (15 + offset), m.oneletter,
                                    ' ' * (30 - offset - other.width)))
    return diff
コード例 #34
ファイル: kellis2tamo.py プロジェクト: malhamdoosh/abseqPy
def main():
    '''Group motifs from a comma-separated table by experiment and name an output per group.

    The table is read from sys.argv[1]; each row is split on commas into
    (experiment, motif text, score, source).

    NOTE(review): this listing appears to be missing lines -- nothing visible
    writes the motifs out, and the final print refers to `filename`, which is
    never bound here.  Confirm against the full source.
    '''
    fsa_fcn = up_and_no_N


    FID = open(sys.argv[1])
    tokss = [x.strip().split(',') for x in FID.readlines()]

    # D maps experiment name -> list of Motif objects seen for it.
    D = {}
    for expt,motif,score,source in tokss:
        print expt,motif
        # Rows whose first field is 'Category' (presumably the header) and
        # rows whose motif field is 'x' are skipped.
        if expt == 'Category': continue
        if motif == 'x': continue
        motif = MotifTools.Motif_from_text(motif)
        motif.kellis = float(score)
        motif.source = source
        # Append to the experiment's list, creating it on first use.
        try: D[expt].append(motif)
        except: D[expt] = [motif]

    for expt,motifs in D.items():
        root = expt
        ext  = 'cons'
        if root[0:3] == 'Rnd':
            # num is whatever follows the last underscore in the name.
            num = re.sub('.*_','',root)
            # NOTE(review): as listed, both substitutions run back to back for
            # one-character numbers; an elif/else line was probably dropped.
            if len(num) == 1:
                root = re.sub('_','_00',root)
                root = re.sub('_','_0',root)
            root = re.sub('Rnd','random_',root)
        outname = '%s.t%s'%(root,ext)
        print '%-18s  --> %s'%(root,outname)
            # NOTE(review): orphaned error branch -- its try/except is not in the listing.
            print "Error: Could not convert %s [[ %s ]]"%(
                filename, outname)
コード例 #35
def memefiles2tamo(files, tamoname):
    '''Load motif-finder output files, fill in missing motif statistics, and save to tamoname.

    files    -- iterable of file names; names ending in '.ace' are opened with
                AlignAce.AlignAce, names containing '.meme' with Meme.Meme.
    tamoname -- destination passed to MotifTools.save_motifs.

    Uses the module-level globals `probefile` and `PROBESET`.

    NOTE(review): this listing appears to be missing lines -- nothing visible
    adds anything to `motifs`, and the cons_ROC_AUC call below is never closed.
    '''
    global probefile, PROBESET

    motifs = []
    for filename in files:
        print ">>>SDFSD>F ", filename
        if re.search('\.ace$', filename):
            mdobject = AlignAce.AlignAce(filename)
            # Fall back to a .fsa name derived from the input file name.
            if not mdobject.fastafile:
                mdobject.fastafile = filename.replace('.ace', '.fsa')
        elif re.search('\.meme.*$', filename):
            mdobject = Meme.Meme(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = re.sub('\..\.meme', '.meme',
                                            filename).replace('.meme', '.fsa')

    # From here on `mdobject` and `filename` are whatever the last loop
    # iteration left bound.
    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    # Copy every loaded sequence into the probe set under its FASTA key.
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq

    # Compute each statistic only while it still holds its placeholder value
    # (1 or None, as tested below).
    for motif in motifs:
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        if motif.E_site == None:
            motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif, probes, 'v')
        if motif.frac == None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        # MAP is set to -log10 of the motif's evalue for .meme input.
        if re.search('\.meme$', filename):
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        if 1 and (motif.CRA == None):
                # NOTE(review): call is cut off in the listing (no closing arguments).
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,
                motif.CRA = CRA
                motif.Cfrac = Cfrac

    # NOTE(review): two sorts in a row -- the second decides the final order;
    # an else: between them was probably dropped from the listing.
    if re.search('\.meme$', filename):
        mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
        mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church))

    MotifTools.save_motifs(motifs, tamoname)
コード例 #36
ファイル: EM.py プロジェクト: adamlabadorf/TAMO
    def all_Wmers(self,N,seq):
        '''Compute a background log-probability term for every width-N window of seq.

        For each start position i, the window seq[i:i+N] is removed from the
        whole-sequence background score (Fbg) and a masked per-base term is
        added back for the positions where self.mask is 0.

        N   -- window width.
        seq -- sequence; it is sliced and passed to MotifTools.revcomplement
               and to theMarkovBackground.logbackground.

        NOTE(review): the listing never stores logQtot and has no return
        statement, while forw, rev, Rbg and W are assigned but unused; the
        lines that collect and return the per-window values look to have been
        dropped.  Confirm against the full source.
        '''
        forw = []
        rev  = []
        seqrc = MotifTools.revcomplement(seq)
        Mlh = theMarkovBackground.highestorder
        Mlb = theMarkovBackground.logbackground
        MCP = theMarkovBackground.CP
        Fbg = Mlb(seq)
        Rbg = Mlb(seqrc)
        # Inverted mask: 1 where self.mask is 0.  Built as a list because it is
        # indexed below.
        nmask = [1 - x for x in self.mask]

        # ?? QUESTION: Is it sensible to compute the background probabilities
        # this way?
        # 1) BG of complementary strand is taken as equal to primary strand.
        # 2) Letters inside the motif window are not used for conditional probabilities.
        #    As a result, the calculation essentially breaks down to the log probability the
        #    background emits the sequence to the left of the window plus the log probability
        #    the background emits the sequence to the right.
        # 3) I've worked out an efficient way to compute this by
        #    a) Compute the background probability for the entire probe/sequence
        #    b) (Quick) Compute logQdiff below
        #    c) Subtract

        for i in range(len(seq)-N+1):
            subseq = seq[i:i+N]

            # Build Wmer information
            #Wtmp        = Wmer(subseq)
            left        = seq[0:i]
            right       = seq[i+N:]
            #Wtmp.lflank = left
            #Wtmp.rflank = right
            #if i==0: Wtmp.src    = seq
            #Wtmp.srcQ   = Fbg
            #Wtmp.i      = i

            # This is the fast way: score only the window plus up to Mlh bases
            # of context on each side, minus the two context pieces alone.
            # NOTE(review): if Mlh is 0, left[-Mlh:] is the whole left flank,
            # not an empty string -- confirm that is intended.
            logQdiff = Mlb(left[-Mlh:] + subseq + right[0:Mlh]) - Mlb(left[-Mlh:]) - Mlb(right[0:Mlh])
            logQtot = Fbg - logQdiff

            # Add a bit back for intervening bases in the "gap"
            gapbg = 0
            for p in range(N):
                gapbg = gapbg + MCP[subseq[p]] * nmask[p]
            logQtot = logQtot + gapbg

            # Build Wmer-reverse complement information
            #Wtmprc = Wmer(Wtmp.rc)
            #Wtmprc.lflank = seqrc[0:-(i+N)]  #Check this in case it is ever necessary
            #if i!=0:
            #    Wtmprc.rflank = seqrc[-i:]   #Necessary [11-12-02]
            #    Wtmprc.rflank = ''
            #Wtmprc.logQtot = Wtmp.logQtot
            #Wtmprc.srcQ    = Wtmp.srcQ
            #Wtmprc.i       = i
        W = []
        #seq.c_wmerbgs = MDsupport.list2double(map(lambda x: x.logQtot, W))
コード例 #37
# NOTE(review): this listing of merge_runs appears to be missing lines: the
# docstring is never closed, the `if TAMO_file == '' ...` test and both
# `for j in cl_dic[i]:` loops have no body, the "top motif" calls have lost
# their first line, and the single-motif `else:` branches are gone.  The
# comments added below describe only what is visible.
def merge_runs(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''This script is used to merge motifs with the PCC matrix of all motifs.
    The script was originally written by Cheng Zou, and then converted to a 
    function by Alex Seddon.

    print "Here are the parameters you specified in this run "
    print "-tamo        %s" % TAMO_file
    print "-wdir        %s" % wdir
    print "-h        height to cut the tree, %s" % height
    print "-distance    %s" % distance
    print "-ancestor    %s" % ancestor
    print "-target    %s" % target
    print "-genome    %s" % genome

    if TAMO_file == '' or wdir == '':

    # NOTE(review): os.system runs the command in a subshell, so this `cd`
    # cannot change the working directory of this Python process.
    os.system("cd %s" % wdir)


    # This code was in the original clustering script. It has been taken out
    # because the processes involved take too long and have been replaced by
    # the matrix creation scripts and the run_UPGMA script.
    #if distance==0:
    #    os.system("python /mnt/home/seddonal/scripts/5_motif_merging/3.calculate_distance_matrix.py   -i %s --dfunc pccrange" % TAMO_file)
    #os.system("R --vanilla --slave --args %s.dm  %s< /mnt/home/seddonal/scripts/5_motif_merging/UPGMA_final.R> %s.Rout" % (TAMO_file,height,TAMO_file))

    cl_dic = {}
    n = 0

    # The file, TAMO_file.dm_UPGMA_Cl_0.05, is inorder of the motifs that appear
    # in the TAMO_file. If two motifs have the same number, they are considered
    # a part of the same cluster.
    # This loop pulls the clustering information out of this file and creats
    # the dictionary cl_dic = {cluster_index:{motif_index:'1'}}
    for line in open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r"):

        # Gets the clusterindex of this motif
        cl = line.strip()

        # Adds the cluster index if it has not been
        if not cl_dic.has_key(cl):
            cl_dic[cl] = {}

        cl_dic[cl][n] = "1"  # Adds the motif to that cluster
        n += 1  # Increases the motif index for the next motif

    #print cl_dic

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    # I think I can divide up this portion of the code to create a series
    print ancestor, ancestor == 0
    if ancestor == 0:

        # This loop Looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():

            print i, cl_dic[i]

            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:

                # Adds all of the motifs in the cluster to an object called
                # mlist.
                mlist = []
                for j in cl_dic[i]:

                # Saves these motifs to there own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))

                # I am fairly certain that this process of converting to TF and
                # then returning it to TAMO format is only for keeping the names
                # consistent. I need to verify this suspicion
                tamo2tf("%s_sub_%s.tm" % (TAMO_file, i))
                os.system("cat  %s_sub_%s.tm.tf > %s_sub_%s_sum.tm.tf" %
                          (TAMO_file, i, TAMO_file, i))
                tf2tamo("%s_sub_%s_sum.tm.tf" % (TAMO_file, i))

                # Gets the top motif in the cluster.
                                 '%s_sub_%s_sum.tm.tf.tm' % (TAMO_file, i),
                                 target, genome)

                # Removes the files that were created.
                os.system("rm  %s_sub_%s_sum.tm.tf.tm" % (TAMO_file, i))
                os.system("rm %s_sub_%s_sum.tm.tf" % (TAMO_file, i))
                os.system("rm -R %s_sub_%s.tm.tf_ST*" % (TAMO_file, i))

            # If there is only one motif in the cluster, it leaves it alone,
            # And adds it to old
                key = cl_dic[i].keys()[0]

    if ancestor == 1:

        # This loop Looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():

            print i, cl_dic[i]

            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:

                # Adds all of the motifs in the cluster to an object called
                # mlist.
                mlist = []
                for j in cl_dic[i]:

                # Saves these motifs to there own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))

                # Merges the motifs in the same cluster using STAMP
                tamo2tf("%s_sub_%s.tm" % (TAMO_file, i))

                # Gets the JASPER motifs that best match the motifs from within
                # the cluster.
                    "STAMP -tf  %s_sub_%s.tm.tf  -sd /home/chengzou/bin/STAMP/ScoreDists/JaspRand_PCC_SWU.scores  \
                 -go  1000 -ge 1000 -cc PCC -align SWU -out %s_sub_%s.tm.tf_STout -chp > %s_sub_%s.tm.tf_STout.log"
                    % (TAMO_file, i, TAMO_file, i, TAMO_file, i))
                parse_out_STAMP(TAMO_file, i)

                # combines the JASPER motifs with the cluster motif and then
                # converts them all to one TAMO file
                    "cat  %s_sub_%s.tm.tf %s_sub_%s.tm.tf_SToutFBP.txt.mod %s_sub_%s.tm.tf_STout_tree_clusters.txt > %s_sub_%s_sum.tm.tf"
                    % (TAMO_file, i, TAMO_file, i, TAMO_file, i, TAMO_file, i))
                tf2tamo("%s_sub_%s_sum.tm.tf" % (TAMO_file, i))

                # Gets the top motif within the TAMO file.
                                 '%s_sub_%s_sum.tm.tf.tm' % (TAMO_file, i),
                                 target, genome)

                # Removes any files created in the processing.
                os.system("rm  %s_sub_%s_sum.tm.tf.tm" % (TAMO_file, i))
                os.system("rm %s_sub_%s_sum.tm.tf" % (TAMO_file, i))
                os.system("rm -R %s_sub_%s.tm.tf_ST*" % (TAMO_file, i))
                key = cl_dic[i].keys()[0]

    # Combine together the top motifs from every
    # cluster into <TAMO_file>_sub_new.tm, save the single-member motifs in
    # <TAMO_file>_sub_old.tm, and concatenate both into <TAMO_file>_P1.tm.
    os.system("cat %s_sub_*_sum.tm.tf.tm.TOP > %s_sub_new.tm" %
              (TAMO_file, TAMO_file))
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
    os.system("cat %s_sub_old.tm %s_sub_new.tm > %s_P1.tm" %
              (TAMO_file, TAMO_file, TAMO_file))
コード例 #38
# Script fragment: derives file names from the directory given as argv[1],
# opens two temporary output files, and loads the cluster motifs.
# NOTE(review): this listing appears to be missing lines (an else:, the write
# of the symbol string, and the body of the final loop).

# The gene-list name is the last path component of argv[1].
genelist = argv[1].split('/')[-1]
allclusters = argv[1] + '/' + genelist + '_allclusters.tamo'
#print genelist
oneletters = argv[1] + '/other/' + genelist + '_oneletter.tmp'
symbols = argv[1] + '/other/' + genelist + '_symbols.tmp'

# Open output files for writing
# (the path variables are rebound to the open file objects)
oneletters = open(oneletters, 'w')
symbols = open(symbols, 'w')

# Define output variables
oneletterlist = []
symbolstring = '1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ*+.,:;!'

# Open list
motiflist = MotifTools.load(allclusters)

# Try to verify the initial list is not too long
if len(motiflist) > len(symbolstring):
    # If the list is too long, raise an exception so that the program quits
    raise ValueError("The cluster list is too long for sitemap.py")
# If the list is not too long, adjust the symbols string to the appropriate length
# NOTE(review): the else: that this indented line belongs to is not in the listing.
    symbolstring = symbolstring[:len(motiflist)]

# Save symbol string in the symbols file and close that file

# Add oneletter summaries to the list
for num in range(len(motiflist)):
コード例 #39
ファイル: EM.py プロジェクト: adamlabadorf/TAMO
    def EM_Cstart(self):
        '''Run self.EM_C on every seed model in self.models and report timings.

        Reads self.verbose, self.param, self.seqs, self.probes and
        self.models.  Wall-clock time is accumulated per phase in `timings`
        and printed at the end when verbose is set.

        NOTE(review): this listing appears to be missing lines -- the Probe
        objects built in the first loop are not stored, c_logZs_set is never
        appended to, `if newModel == None:` has no body, one print call is
        never closed, and the candidate C is not saved anywhere visible.
        '''
        verbose = self.verbose
        if verbose:
            print "Seeding models..."

        #Initialize parameters
        if not self.param.has_key('gamma'): self.param['gamma'] = 0.2
        timings = {'Probes':0, 'Background':0, 'C EM':0, 'Post':0}
        _time = time.time()

        for seq in self.seqs:
            P = Probe(seq)

        _time2 = time.time(); timings['Probes'] = _time2-_time; _time = _time2

        if verbose: print "Optimizing candidates by EM."
        if verbose: sys.stdout.flush()

        # Background values are cached per model width so that models of the
        # same width reuse them.
        c_logZ_sets = {}
        for Model,i in zip(self.models,range(len(self.models))):
            width = Model.width

            if not c_logZ_sets.has_key(width):
                c_logZs_set = []
                if verbose: print "#%s   |%s|"%(' '*28,'-'*len(self.seqs))
                if verbose: sys.stdout.flush()
                if verbose: print "Computing background (width %2d)  "%width,
                for P in self.probes:
                    # One dot per probe as a progress indicator.
                    if verbose: sys.stdout.write('.')
                    if verbose: sys.stdout.flush()
                    logZs = self.all_Wmers(width,P)
                    c_logZs = MDsupport.list2double(logZs)
                    #P.c_wmerbgs = MDsupport.list2double(map(lambda x: x.logQtot, Wlist))
                c_logZ_sets[width] = c_logZs_set
                if verbose: print

            # Attach the cached per-probe background values for this width.
            c_logZ_set = c_logZ_sets[width]
            for P,c_logZs in zip(self.probes,c_logZ_set):
                P.c_wmerbgs = c_logZs
            _time2 = time.time()
            timings['Background'] = timings['Background'] +_time2-_time
            _time = _time2

            '''Perform EM'''
            _time  = time.time()
            newModel = self.EM_C(Model, self.probes)
            _time2 = time.time(); timings['C EM'] = timings['C EM'] + _time2-_time; _time = _time2

            #print "cLL: ",newModel.joint
            #print "pLL: ",self.compute_joint(newModel,Wmers_by_seq)

            '''Was there a problem?'''
            if newModel == None:

            '''Set various things in PSSM'''
            seeddist = MotifTools.infomaskdiff(newModel,Model)
            print '%s ----> %s'%(Model,newModel)
            print "Seed %2d: %s  -->  %s  mask:%9.5f  infoMask:%9.5f d:%9.5f"%(
                i, Model, newModel,
                MotifTools.infomaskdiff(newModel,Model), #order is important
            # Carry the seed's provenance over to the optimized model.
            if Model.seedtxt: newModel.seedtxt = Model.seedtxt
            if Model.source:  newModel.source  = Model.source
            newModel.seeddist = seeddist
            newModel.seednum  = i
            print newModel

            '''Set various things in Candidate (like a wrapper for PSSM)'''
            C = MotifCandidate()
            C.pssm = newModel.copy()
            #C.wmers = self.best_by_Z(Wmers_by_seq)
            # 20 values obtained by calling newModel.emit() repeatedly.
            C.wmers  = [newModel.emit() for junk in range(20)]
            #C._update()  #MAJOR REMOVAL????????? DBG 10-14-03
            C.pssm = newModel.copy()  
            _time2 = time.time(); timings['Post'] = timings['Post']+_time2-_time;_time = _time2

        '''Print Timing Information'''
        if verbose:
            print "# Timing Information"
            # _t is the total over all phases, used for the percentage column.
            _t = 0
            for timing in timings.keys():
                _t = _t + timings[timing]
            for timing in timings.keys():
                print "# %12s %f  %f%%"%(timing,timings[timing],timings[timing]*100/_t)
コード例 #40
ファイル: GEMSlikePWM.py プロジェクト: xguse/gusPyProj
# Turn every (kmer, p-value) record into (TAMO Motif, log10 p-value), in place
for i, rec in enumerate(testMotifs):
    testMotifs[i] = (Motif(rec[0]), numpy.log10(float(rec[1])))
# Order ascending by the log10 p-value
testMotifs.sort(key=lambda pair: pair[1])

comboMotifs = []

# Use each motif in the leading 20% of the sorted list as a seed
for i in range(int(len(testMotifs) * 0.2)):
    seedMotif = testMotifs[i][0]
    simMotifs = getKmersWithOneMisMtch(seedMotif, testMotifs)
    alndMotifs = alignSimilarMotifs([hit[0] for hit in simMotifs])
    # Negating the log p-values turns them into positive weights
    weights = [-hit[1] for hit in simMotifs]
    comboMotifs.append(MotifTools.sum(alndMotifs, weights))
    print(len(comboMotifs))

t2 = time.time()

oFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.gte2x.5-16mers.shfSeq.3.gGEMS.tmo'
pFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.gte2x.5-16mers.shfSeq.3.gGEMS.pkl'

# The path variable is rebound to the open file object
pFile = open(pFile, 'w')
t3 = time.time()
calcMinutes = (float(t2) - t1) / 60
writeMinutes = (float(t3) - t2) / 60
print('Calculations took %.3f min.\nWriting/Pickling took %.3f min.' % (calcMinutes, writeMinutes))

コード例 #41
ファイル: EM.py プロジェクト: adamlabadorf/TAMO
 def has_wmer(self,wmer):
     # Tests whether the w-mer, either as given or as its reverse complement,
     # is in self.wmers.
     # NOTE(review): the body of the `if` is not part of this listing.
     rc = MotifTools.revcomplement(wmer)
     if (wmer in self.wmers) or (rc in self.wmers):
コード例 #42
ファイル: Sitemap.py プロジェクト: xguse/customTAMO
def main():
    '''Scan FASTA sequences with a set of motifs and print a text map of the hits.

    Command-line options are read with getopt (-f, -m, -n, -L, -t, -a, -S, -i).

    NOTE(review): this listing appears to be missing lines -- the try: around
    getopt, the bodies of several if/for statements, the else: branches of the
    label fallbacks, and the code that fills hits_by_motif / probehits.
    '''
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:i:", ["help", "output="])  # AD added 'i'
    except getopt.GetoptError:
    if not opts:

    # Echo the command line.
    print "#" + ' '.join(sys.argv)
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.75) # AD changed thresh val to 0.75 from 0.7
    ambigs = []

    # Scale factor applied to sequence lengths and hit start positions when
    # drawing the map (overridable with -S).
    scale   = 50.0 / 1000.0
    motifs = []
    for opt, value in opts:
        #print opt, value
        if   opt ==  '-f':  fastafile = value
        elif opt ==  '-m':  motifs.extend(MotifTools.txt2motifs(value))
        elif opt ==  '-n':  motifnums = [int(x) for x in value.split(',')]
        elif opt ==  '-L':  labels    = list(value)
        elif opt ==  '-t':  thresh    = float(value)
        elif opt ==  '-a':  ambigs.extend(value.split(','))
        elif opt ==  '-S':  scale     = float(value)
        elif opt ==  '-i':  motiffile = value  # AD added this option to ACTUALLY supply the tamo motif file at the command-line.  The code to deal with motiffiles already existed. There was just no code for User to supply one.
    probes = Fasta.load(fastafile)
    if motiffile:
        for f in motiffile.split(','):      # AD added this to allow supplying multiple tamo files at the prompt like you can supply multiple motifs
    if ambigs:
        for ambig in ambigs:
            motifs.append( MotifTools.Motif_from_text(ambig,0.1) )
    # Default to every loaded motif when -n was not given.
    if not motifnums:  motifnums = range(len(motifs))
    print '# %d: %s'%(len(motifs),motifnums)
    # Print a legend line per selected motif: label, motif, score cutoff.
    for i in range(len(motifnums)):
        motif = motifs[motifnums[i]]
        if labels and i < len(labels):
            txt = labels[i]
            txt = '%d'%i
        print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh)

    probehits = {}
    for key in probes.keys():
        hits_by_motif = []
        save_flag     = 0
        # Sequences containing any of the letters B, D, H or U are skipped.
        if re.search('[BDHU]',probes[key]): continue
        for num in motifnums:
            # NOTE(review): the threshold uses `motif`, which is whatever the
            # legend loop above left bound, not motifs[num] -- confirm intended.
            result = motifs[num].scan(probes[key],thresh*motif.maxscore)
            if result[0]:
                save_flag = 1
        if save_flag:

    #scale   = .1
    maxw = 40
    for key in probehits.keys():
        # `a` is the drawn line: one '-' per scaled base plus 10 spaces of slack.
        l       = len(probes[key])
        a       = list('-'* int(scale*l) )
        a.extend( list(' '*10 ) )
        desc    = []
        matches = probehits[key]
        for i in range(len(matches)):
            if matches[i]:
                subseqs,endpoints,scores = matches[i]
                for idx in range(len(subseqs)):
                    start,stop = endpoints[idx]
                    subseq     = subseqs[idx]
                    score      = scores[idx]
                    if labels and (i<len(labels)): ID = labels[i]
                    else                         : ID = '%d'%i
                    desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score))
                    start = int(start*scale)
                    # Mark the hit in the first still-unmarked ('-') cells
                    # among the 10 positions starting at the scaled start.
                    for offset in range(10):
                        if a[start+offset] == '-':
                            if labels and (i < len(labels)):
                                a[start+offset] = labels[i]
                                a[start+offset] = '%d'%i
        print '%-14s %s'%(key,''.join(a)),
        print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc])
    print "Found matches in %d of %d input probes"%(len(probehits),len(probes))
コード例 #43
def main():
    '''Command-line driver: seed and run EM on a FASTA file, then score and print the motifs.

    sys.argv[1] is the FASTA file; an all-digit sys.argv[2] is taken as the
    motif width.  Remaining options are matched token by token (see the usage
    text below and the parsing loop).

    NOTE(review): this listing appears to be missing many lines -- e.g. the
    exit after the usage text, the `-TF` option branch, the bodies of
    `if bgfile:` and of the seed-collection steps, several else: branches,
    and the quotes around three descriptive lines ("Set everything up...",
    "Compute some metrics", "Print out all motifs...").
    '''
    if len(sys.argv) < 2:
        print "Usage: %s <fasta_file> [width = None ] [options]"%(re.sub('^.*/','',sys.argv[0]))
        print "Options include:"
        print ""
        print " EM Parameters:"
        print "                  -beta    [0.01]   Beta for pseudocounts"
        print "                  -seedbeta[0.02]   Beta for pseudocounts for seeds from text"
        print "                  -gamma   [0.2]    Gamma (fraction of sequences)"
        print "                  -delta   [0.001]  Convergence criteria"
        print " "
        print " Seeds (not actually proper priors)"
        print "                  -prior            Seqences or motifs for seeds (may be repeated)"
        print "                  -top N   [0]      Include w-mers in top N probes"
        print "                  -gap    string    sample gapped motifs"
#       print "                  -TF               Seed with (all) TRANSFAC PSSMs (buggy)"
        print "                  -kmerseeds        Use kmers with best enrichment score as seeds for EM"
        print "                  -pad              add NN..NN to seed"
        print " "
        print " Genome / Background model "
        print "                  -human (250,1000) Use Human Background model"
        print "                  -g genome.fsa     Use specicied Fasta file as background (searches first for matching frequency file)"
#       print "                  -Y2K, -Y5C        Use Yeast Upstream Intergenic regions (2000, 500)"
#       print "                  -B                Use Bacterial Orfs"
        print " " 
        print "Examples:"
        print " %s t.fsa 5 -prior GGGTA -prior AAAAAC "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC"
        print " %s t.fsa 5 -info CUP9.info -gamma 0.5 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with Enriched seeds in CUP9.info, with"
        print "   Gamma expectation of 50% of all probes"
        print " %s t.fsa -prior MCM1_5.tamo:0 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 0th motif of the file MCM1_5.tamo"
        print "   as a seed"
    fastafile = sys.argv[1]

    #Echo the command line
    print "#" + ' '.join(map(lambda x: re.sub(' ','\ ',x), sys.argv))

    # NOTE(review): sys.argv[2] is read unconditionally; with only a FASTA
    # file on the command line this raises IndexError.
    if sys.argv[2].isdigit():
        width = sys.argv[2]
    else: width = None
    # Defaults; an empty string means "leave the EM object's own value".
    algorithm = ''
    beta      = ''
    seedbeta  = ''
    deltamin  = ''
    gamma     = 0.2
    infofile  = ''
    seedmodels= []
    species   = 'YEAST'
    valid_tfs = [] #NOT USED
    gapped_syl= None
    gapflank  = 0
    gapweight = 0.2
    enrichfact= 0.7
    pmax      = 0  #False
    TFSEEDS   = 0
    TFMids    = []
    pad       = None
    bgfile    = None

    seed_count = 0   #Default: Take the top 0
    seed_s     = []  #Initialize seq array

    '''Parse command-line arguments'''
    # Every token is compared against the known flags; flags that take a
    # value read it from the following token, sys.argv[i+1].
    for tok,i in zip(sys.argv,xrange(len(sys.argv))):
        if   tok == '-top'   :   seed_count = int(sys.argv[i+1])
        elif tok == '-greedy':   algorithm  = "GREEDY"
        elif tok == '-prior' :   seed_s.append(sys.argv[i+1])
        elif tok == '-beta'  :   beta       = float(sys.argv[i+1])
        elif tok == '-seedbeta': seedbeta   = float(sys.argv[i+1])
        elif tok == '-gamma' :   gamma      = float(sys.argv[i+1])
        elif tok == '-delta' :   deltamin   = float(sys.argv[i+1])
        elif tok == '-kmerseeds' :   infofile   = 1
        elif tok == '-valid' :   valid_tfs.append(sys.argv[i+1]) #NOT USED
        elif tok == '-w'     :   width      = sys.argv[i+1]
        elif tok == '-width' :   width      = sys.argv[i+1]
        elif tok == '-gap'   :   gapped_syl = sys.argv[i+1]
        elif tok == '-gapflank' :gapflank   = int(sys.argv[i+1])
        elif tok == '-gapweight':gapweight  = float(sys.argv[i+1])
        elif tok == '-enrichfact':enrichfact= float(sys.argv[i+1])
        elif tok == '-pmax'  :   pmax       = 1
        elif tok == '-Y2K'   :   species    = "YEAST_2000_UP"
        elif tok == '-Y5C'   :   species    = "YEAST_500_UP"
        elif tok == '-B'     :   species    = "BAC_ORF"
        elif tok == '-Ch22'  :   species    = "Ch22"
        elif tok == '-genome':   species    = sys.argv[i+1]
        elif tok == '-pad'   :   pad        = "TRUE"
        elif tok == '-bgfile':   bgfile     = sys.argv[i+1]
            # NOTE(review): the elif these three lines belong to is not in the listing.
            TFSEEDS = 1
            for j in range(i+1,len(sys.argv)):
                if re.match('M0',sys.argv[j]):
        elif tok == '-human' :
            # An all-digit token after -human becomes a '_<digits>' suffix.
            _s = ''
            if sys.argv[i+1].isdigit(): _s = '_'+sys.argv[i+1]
            else:                       _s = ''
            species    = 'HUMAN'+_s

    # -kmerseeds only set a flag; the FASTA file itself is used as the source.
    if infofile: infofile = fastafile

    if bgfile:
    elif not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.theMarkovBackground = EM.Zeroth()

    fsaD     = Fasta.load(fastafile)
    seqs     = fsaD.values()
    probes   = fsaD.keys()
    all_seqs = seqs

    if infofile and width=='info':
        width = info2width(infofile)
    elif width != None:
        width = int(width)

    #Alternate source of seeds
    if infofile:
        if 1 or width:
            print 'Error: need to specify motif width w/ .info file'
    #Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)

    #Should we get seeds from TRANSFAC?
        tf = []
        D  = tfmats()
        if not TFMids:
            keys = D.keys()
            keys = []
            for TFMid in TFMids:
                for key in D.keys():
                    if key[0:6] == TFMid:
        for key in keys:
            m = D[key]
            # Seed number: first word of the key with its leading 'M' and zeros removed.
            m.seednum = int(re.sub('M0*','',key.split()[0]))
            m.seedtxt = '%-24s %s'%(m,key)
        tf.sort(lambda x,y: cmp(x.seednum,y.seednum))

    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        gapped_priors = map(lambda x:'N'+x+'N', gapped_priors)

    if pad:
        print '# Padding models with NN-m-NN'
        newmodels = []
        # NOTE(review): left is built from '@' but right from 'N' -- confirm
        # the asymmetry is intended.
        left  = MotifTools.Motif_from_text('@')
        right = MotifTools.Motif_from_text('N')
        for m in seedmodels:
            newmodels.append(left + m + right)
            print left + m + right
        seedmodels = newmodels

    Set everything up and GO!!
    global theEM
    theEM = EM.EM(seed_s,[],width,"VERBOSE")
    # Only override EM settings that were given on the command line.
    if beta:     theEM.beta     = beta
    if deltamin: theEM.deltamin = deltamin
    if seedbeta: theEM.seedbeta = seedbeta
    theEM.param['gamma']        = gamma
    theEM.models    = seedmodels
    theEM.gapflank  = gapflank
    theEM.gapweight = gapweight
    theEM.EM_Cstart()    #GO!!

    #print "#Sorting candidates"
    #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP))

    Compute some metrics
    print "#Loading Genome %s"%species ; sys.stdout.flush()
    Genome = ProbeSet(species,enrichfact)
    ids    = Genome.ids_from_file(fastafile)
    for C in theEM.candidates:
        # NOTE(review): as listed, both scoring variants run when pmax is
        # unset; an else: before best_p_value was probably dropped.
        if not pmax:
            C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose')
            C.pssm.church = Genome.church(C.pssm,ids)
            C.pssm.frac   = Genome.frac(C.pssm,probes,None,0.7)
            (p,frac) = Genome.best_p_value(C.pssm,ids)
            C.pssm.pvalue    = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print "Bests:",p,frac

        # Counts stored on the motif: input probes, ids returned by
        # matching_ids, and the overlap of the two.
        matching             = Genome.matching_ids(C.pssm,[],factor=0.7)
        matchbound           = [x for x in matching if x in probes]
        C.pssm.numbound      = len(probes)
        C.pssm.nummotif      = len(matching)
        C.pssm.numboundmotif = len(matchbound)

    Print out all motifs (sorted by Enrichment) in an AlignACE-like form

    # Ascending by pvalue.
    theEM.candidates.sort(lambda x,y: cmp(x.pssm.pvalue,y.pssm.pvalue))
    for C,i in zip(theEM.candidates,range(len(theEM.candidates))):
        C.pssm.maxscore = -100  #May have side effects.  Recompute when done
        if C.pssm.valid:  #NOT USED
            _t = C.pssm.valid
            if not _t[0]:
                vstring = "(--- %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
                vstring = "(HIT %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
            vstring = ''
        C.pssm._maxscore()     #Recomputed

        #Antiquated stuff  -- Remove !!
        print "Log-odds matrix for Motif %3d %s"%(i,C)
        print "Sequence Logo"
        #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9))
        m = C.pssm
        if not m.__dict__.has_key('gamma'):  m.gamma = None #Kludge to deal w/ old shelves
        if m.seedtxt:     print "Seed: %3d %s"%(i,m.seedtxt)
        if m.source:      print "Source: ",m.source
        if m.gamma:       print "Gamma: %7.5f"%m.gamma
        if m.threshold:   print "Threshold: %5.2f"%m.threshold
        #if C.pssm.seedtxt:
        #    print 'Seed  %3d %-25s'%(i,C.pssm.seedtxt)
        # Prefix vstring with |log10(church)| when a church value is present.
        if C.pssm.church != None: vstring = 'ch: %5.2f  %s'%(
            math.fabs(math.log(C.pssm.church)/math.log(10)), vstring)
        print "Motif %3d %-25s  nlog(p): %6.3f  %s"%(i,C,-math.log(C.pssm.pvalue)/math.log(10),vstring)
        if C.pssm.threshold:
            print "Threshold: %6.3f  %4.1f%%"%(
                C.pssm.threshold, 100.0*C.pssm.threshold/C.pssm.maxscore)

        C.pssm.maxscore = -1e100  #May have side effects.  Recompute when done
        for seq in C.wmers:
            print seq,i,C.pssm.scan(seq)[2][0]
        C.pssm._maxscore()      #Recomputed
        # Row of '*' as wide as the last w-mer printed above.
        print '*'*len(seq)
        print "MAP Score: %f"%C.MAP
    sys.exit(0) #Avoid ridiculous python cleanup times
コード例 #44
def merge_runs_cc(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''Write one motif-merging job command per multi-motif cluster.

    Reads the cluster assignment file "<TAMO_file>.dm_UPGMA_Cl_<height>",
    which holds one cluster label per line in the same order as the motifs
    in TAMO_file; motifs sharing a label belong to the same cluster.  Every
    cluster with more than one motif is saved to
    "<TAMO_file>_sub_<cluster>.tm" and a pcc_merge_CC.py command line for it
    is written to the file "merge_runs_cc".  Motifs that are the only member
    of their cluster are saved together in "<TAMO_file>_sub_old.tm".

    The script was originally written by Cheng Zou, and then converted to a
    function by Alex Seddon.

    Parameters:
        TAMO_file -- name of the TAMO motif file; must not be empty.
        wdir      -- working directory, used in the "-t" path of each command;
                     must not be empty.
        height    -- height at which the tree was cut (suffix of the cluster
                     file name).
        distance  -- unused; kept for interface compatibility (the distance
                     matrix is produced by separate scripts).
        ancestor  -- 0 writes "merge_runs_no_ancestor" commands, 1 writes
                     "merge_runs_ancestor" commands; any other value writes
                     no commands and leaves the singleton list empty.
        target, genome -- passed through to pcc_merge_CC.py.

    Raises:
        ValueError -- if TAMO_file or wdir is an empty string.
    '''
    print("Here are the parameters you specified in this run ")
    print("-tamo        %s" % TAMO_file)
    print("-wdir        %s" % wdir)
    print("-h        height to cut the tree, %s" % height)
    print("-ancestor    %s" % ancestor)
    print("-target    %s" % target)
    print("-genome    %s" % genome)

    if TAMO_file == '' or wdir == '':
        # NOTE(review): the original body of this check is not available;
        # aborting with an explicit error is assumed to match its intent.
        raise ValueError("TAMO_file and wdir must both be specified")

    # NOTE(review): this runs "cd" in a child shell and does NOT change the
    # working directory of this process, so the relative paths below still
    # resolve against the caller's directory.  Use os.chdir(wdir) if a real
    # directory change is intended.
    os.system("cd %s" % wdir)

    # Directory that holds this script (and pcc_merge_CC.py).
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # cl_dic = {cluster_label: {motif_index: '1'}}, built from the cluster
    # file whose n-th line is the cluster label of the n-th motif.
    cl_dic = {}
    with open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r") as cl_file:
        for n, line in enumerate(cl_file):
            cl_dic.setdefault(line.strip(), {})[n] = "1"

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    print("%s %s" % (ancestor, ancestor == 0))

    # The two modes differ only in the command line written per cluster.
    if ancestor == 0:
        command = ('module load TAMO; python %s/pcc_merge_CC.py merge_runs_no_ancestor -t %s/%s -i %s -target %s -genome %s\n')
    elif ancestor == 1:
        command = ('module load TAMO; module load STAMPmotif; python %s/pcc_merge_CC.py merge_runs_ancestor -t %s/%s -i %s -target %s -genome %s\n')
    else:
        command = None

    with open('merge_runs_cc', 'w') as cc_output:
        if command is not None:
            for i, members in cl_dic.items():
                print("%s %s" % (i, members))

                if len(members) > 1:
                    # Several motifs in the cluster: save them to their own
                    # TAMO file and queue a merge command for that file.
                    mlist = [ml[j] for j in members]
                    save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))
                    cc_output.write(
                        command
                        % (script_dir, wdir, TAMO_file, i, target, genome))
                else:
                    # A single motif is left alone and collected in old.
                    key = list(members)[0]
                    old.append(ml[key])

    # Combine together the motifs that are in their own cluster.
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
コード例 #45
def combine_distance_matrix(wdir, TAMO_file):
    '''Combines the PCC score matrices and outputs them as a single matrix.

    The motifs of TAMO_file are handled in chunks of 100.  For every chunk i
    a self-comparison file "<TAMO_file>_n<i>.dm" is expected to exist.  The
    function writes placeholder files filled with "-" for the chunk pairs
    (i, j) with j < i, copies each self-comparison file to the pair name
    "<TAMO_file>_n<i>-<TAMO_file>_n<i>.dm", pastes each row of chunk files
    into "distance_<i>", concatenates those into "<TAMO_file>.dm" and finally
    removes double tabs from that file.

    Originally written by Cheng Zou, and converted to a function by
    Alex Seddon.

    Parameters:
        wdir      -- working directory (see the note on the "cd" call below).
        TAMO_file -- name of the TAMO motif file whose matrices are combined.
    '''
    ml = MotifTools.txt2motifs(TAMO_file)
    n_split = len(ml) // 100  # floor division: index of the last chunk

    # NOTE(review): "cd" in a child shell does not change this process's
    # working directory; all paths below are relative to the caller's cwd.
    os.system("cd %s" % wdir)

    # Number of lines in each chunk's self-comparison matrix.
    lendic = {}
    for i in range(n_split + 1):
        lendic[i] = line_count("%s_n%s.dm" % (TAMO_file, i))
    print(lendic)

    # Placeholder files keep the combined matrix square: file (i, j) gets
    # lendic[i] rows of lendic[j] tab-separated "-" entries.
    for i in range(n_split + 1):
        for j in range(0, i):
            print("%s %s" % (lendic[j], lendic[i]))
            blank_row = "%s\n" % "\t".join(["-"] * lendic[j])
            with open("%s_n%s-%s_n%s.dm" % (TAMO_file, i, TAMO_file, j),
                      "w") as oup:
                for _ in range(lendic[i]):
                    oup.write(blank_row)

    # Copy each self-comparison file to the pair-style name so the paste
    # step below can address every chunk file uniformly.
    for i in range(n_split + 1):
        os.system("cp %s_n%s.dm %s_n%s-%s_n%s.dm" %
                  (TAMO_file, i, TAMO_file, i, TAMO_file, i))

    # Paste the chunk files of row i side by side into distance_<i>.
    for i in range(n_split + 1):
        com = "paste "
        for j in range(n_split + 1):
            com += "%s_n%s-%s_n%s.dm " % (TAMO_file, i, TAMO_file, j)
        com += "> distance_%s" % i
        print(com)
        # NOTE(review): the execution of the built command was missing from
        # this copy; without it no distance_<i> file is ever produced.
        os.system(com)

    # Concatenate all the row files into the final matrix.
    com = "cat "
    for i in range(n_split + 1):
        com += "distance_%s " % i
    com += "> %s.dm" % TAMO_file

    print(com)
    os.system(com)

    # Ad hoc clean-up of double tabs in the combined matrix.
    remove_double_tabs("%s.dm" % TAMO_file)
コード例 #46
This opens a general TAMO cluster list and writes **TO STANDARD OUT** the probability matrices of all
items in it, separated by a line with the name of each cluster. *It is best used in a
bash pipeline where standard output can be redirected into a file.*

Has 1 argument:
- motiflist: a TAMO motif list that will be output

- A series of strings that represent the probability matrices of all motifs in the input list

Author: Hector Galvez

from sys import argv
from TAMO import MotifTools

# Load the motif list named by the first command-line argument
motiflist = MotifTools.load(argv[1])

# Print one header line per motif; clusters are numbered from 1
for cluster_no in range(1, len(motiflist) + 1):
    print('>Cluster_%d' % cluster_no)

コード例 #47
# Create a general list with all the motifs from all algorithms
# NOTE(review): nothing visible here adds motifs to genlist before it is
# clustered; the statements that populate it appear to be missing.
genlist = []

# Perform clustering on the general list of motifs
clusterinf = clusterinfo(genlist)
averages = clusteravg(genlist,clusterinf)

# Trim the final average list
# averages = trim(averages,0.5)
# print clusterinf

# Save new list of cluster averages

# Generate giflogos of all average motifs
# NOTE(review): the loop only builds a label and an output path per average
# motif (numbered from 1); no call that renders the logo is visible here.
for index in range(len(averages)):
    cluster = 'Cluster ' + str(index + 1)
    clustergif = argv[1] + '/other/cluster' + str(index + 1)

# Determine location of the markdown file for the summary report
# (opened for writing under <argv[1]>/final/; listname is defined elsewhere)
reportout = open(str(argv[1] + '/final/' + listname + '_cluster_report.md'), 'w')

# Write the header of the report
rundate = date.today()
header = "# Summary report for `" + listname + "`\nThis analysis was run on: " + str(rundate) + \
コード例 #48
ファイル: TAMO_Motif.py プロジェクト: shwetabhandare/PySG
def GetKmerFromMotifFromPWM(pwm, seq):
	"""Return ``bestscanseq(seq)`` of the motif built from pwm.

	pwm is converted with MotifTools.toDict and the result is turned into a
	motif with MotifTools.Motif_from_counts before seq is scanned.
	"""
	counts = MotifTools.toDict(pwm)
	return MotifTools.Motif_from_counts(counts).bestscanseq(seq)
コード例 #49
# Compare motifs in tamo format

from   TAMO              import MotifTools
from   TAMO.MotifMetrics import ProbeSet
from   TAMO.Clustering   import MotifCompare
from   TAMO.Clustering   import Kmedoids
import sys
import pickle
import pprint

# Command-line arguments: motif file to annotate, then the reference TF file
file_unknown = sys.argv[1]# Unknown
file_tfbs = sys.argv[2]# TF db
motifs_unknown = MotifTools.load(file_unknown) 
motifs_tfbs = MotifTools.load(file_tfbs) 

# NOTE(review): match_dict and tf_list are created but never filled in the
# visible code; the comparison statements appear to be missing.
match_dict = {}
# Visit every (unknown motif, reference motif) pair
for unknown in motifs_unknown:
  tf_list = []
  for tfbs in motifs_tfbs:
    #print "Comparing motifs:"
    #print "    %s  vs  %s" % (unknown.source, tfbs.source)
    #print "    Unknown motif ( %s ) vs TFBS ( %s ) " % (unknown, tfbs)
    # As written, an empty list is created and printed for each pair
    joined_motifs = []
    print joined_motifs
コード例 #50
ファイル: MotifCompare.py プロジェクト: adamlabadorf/TAMO
def probOvlp(A,B,thresh=0.7,verbose=None):
    if A.width >= B.width:
        Wide, Narrow = A, B
        Wide, Narrow = B, A

    RC = MotifTools.revcomplement
    if 1:
        newWide  = Wide[-1,Wide.width+1]
        if Wide.__dict__.has_key('bestWide'):
            bestWide = Wide.bestWide
            bestWideD = {}
            for x in newWide.bestseqs(thresh*newWide.maxscore):
                bestWideD[x] = 1
            for x in bestWideD.keys():
                bestWideD[RC(x)] = 1
            Wide.bestWide = bestWideD.keys()
            bestWide = Wide.bestWide
        Wide = newWide
        if Narrow.__dict__.has_key('bestNarrow'):
            bestNarrow = Narrow.bestNarrow
            bestNarrowD = {}
            for x in Narrow.bestseqs(thresh*Narrow.maxscore):
                bestNarrowD[x] = 1
            for x in bestNarrowD.keys():
                bestNarrowD[RC(x)] = 1
            bestNarrow = bestNarrowD.keys()
            Narrow.bestNarrow = bestNarrow
    #bestWide   = [x[1] for x in Wide.bestseqs  (thresh*Wide.maxscore)  ]
    #bestNarrow = [x[1] for x in Narrow.bestseqs(thresh*Narrow.maxscore)]

    countNarrow = len(bestNarrow)
    countWide   = len(bestWide)

    numtotal    = math.pow(4,Wide.width)
    fudgefactor = math.pow(4,Wide.width - Narrow.width)

    bestWideTups = [(x,MotifTools.revcomplement(x)) for x in bestWide]

    countBoth = 0
    for i in range(len(bestNarrow)):
        m_narrow = bestNarrow[i]
        delj = []

        for j in range(len(bestWideTups)):
            if (bestWideTups[j][0].find(m_narrow) >= 0) or (bestWideTups[j][1].find(m_narrow) >= 0):
                countBoth += 1

        delj.reverse()  #Chew in from the back
        for j in delj:

    if verbose: print '%10d %10d %10d %10d | %10d  %5d '%(
        countWide, numtotal, countNarrow *fudgefactor , countBoth , countNarrow, Wide.width - Narrow.width),
    p = Arith.hypgeomsummore(countWide,                 #Num Interesting
                             numtotal,                  #All k-mers
                             countNarrow * fudgefactor, #Number picked
                             countBoth                ) #Number found
    return p
コード例 #51

import os,sys,string
from   TAMO              import MotifTools
from   TAMO.seq          import Fasta
from   TAMO.MotifMetrics import ProbeSet

# Usage: <script> <probe set> <gene id file> <motif file>
# Prints the motifs whose Church score and p-value for the gene set are both
# at or below the cut-offs defined below.
promoters = ProbeSet(sys.argv[1])

# One gene id per line; splitlines() keeps the last id even when the file
# does not end with a newline.
with open(sys.argv[2]) as geneset_file:
  geneset_ids = geneset_file.read().splitlines()

# Keep only the ids that are present in the probe set (dictionary lookup
# instead of scanning the key list for every id).
prom_ids = promoters.probes.keys()
match_ids = [gene_id for gene_id in geneset_ids if gene_id in promoters.probes]

motifs = MotifTools.load(sys.argv[3])
church = 0.05
rocauc = 0.1  # not applied while the ROC AUC computation below is disabled
pvalue = 0.05

# The header lists exactly the columns printed per motif (the ROC AUC column
# is omitted because that value is not computed).
print("Name\tMotif\tChurch\tP-value")
for m in motifs:
  m.church   = promoters.church  (m, match_ids)
#  m.ROC_auc  = promoters.ROC_AUC (m, match_ids)
  m.pvalue   = promoters.p_value (m, match_ids)
  if m.church <= church and m.pvalue <= pvalue:
    print("%s\t%s\t%s\t%s" % (m.source, m, m.church, m.pvalue))

コード例 #52
from gusPyCode.MDAP_proj.MDAP_defs import alignAndCombineMotifs
from TAMO import MotifTools

# Short alias for the TAMO motif class
Motif = MotifTools.Motif

# Destination path for the motifs collected in toTmo
# NOTE(review): no statement that writes to outFile is visible here.
outFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6PlusCombos.motifs.stdThresh.tmo'

# Motifs loaded from the "top6" file
m = MotifTools.load('/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6.motifs.stdThresh.tmo')
# NOTE(review): the list literal below is truncated (no closing bracket);
# the remaining values are missing from this copy.
w = [5.8952,

toTmo = []

for e in toTmo:
    print e.oneletter

コード例 #53
# Convert the motifs of a MEME text output file (first command-line argument)
# into TAMO motifs and save them to "MEME_motifs_<name>.tamo".
filename = sys.argv[1]
with open(filename) as meme_file:
  motif_list = meme_file.read().split('\nMOTIF')[1:]
tamo_list = []
motif_counter = 1
# The matrix header line looks like "alength= 4 w= 8 nsites= 20 E= ...".
# The site count is needed to scale probabilities back to counts; the
# previous pattern captured the "w=" (width) field instead.
nsites_pat = re.compile(r"nsites= ([0-9]+)")

for motif in motif_list:
  m_info1, m_info2 = motif.split('letter-probability matrix: ')
  # Keep only the text up to the dashed separator that ends the matrix.
  m_mat = m_info2.split('--------------------------------------------------------------------------------', 1)[0]
  m_mat_header, m_prob_mat = m_mat.split('\n', 1) 
  nsites = int(nsites_pat.findall(m_mat_header)[0])
  count_pos = m_prob_mat.split('\n')[:-1]
  count_mat = []
  for count in count_pos:
    if not count.strip():
      continue  # skip blank lines instead of failing on them
    sites = [float(i) for i in count.split()]
    # Round before converting: plain int() truncates products such as
    # 0.29 * 100 = 28.999... down to 28.
    count_dict = {'A': int(round(sites[0] * nsites)),
                  'C': int(round(sites[1] * nsites)),
                  'G': int(round(sites[2] * nsites)),
                  'T': int(round(sites[3] * nsites))}
    count_mat.append(count_dict)
  m = MotifTools.Motif_from_counts(count_mat)
  m.source = "Motif%s | %s" % (motif_counter, m_mat_header)
  tamo_list.append(m)
  motif_counter += 1  
MotifTools.save_motifs(tamo_list, "MEME_motifs_%s.tamo" % filename.split('.')[0])

コード例 #54
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    """Write a matrix of normalized best motif scores per sequence.

    Every motif loaded from `motif` is background-adjusted by subtracting
    log2 of the zeroth-order background frequency from each log-likelihood
    entry, then scored against every upper-cased sequence of the FASTA file
    `fsa`.  Each best score is rescaled to (score - min) / (max - min) using
    the adjusted motif's minscore and maxscore.  The resulting
    (number of motifs) x (number of sequences) array is saved to `outfile`
    with three decimals.

    Parameters:
        fsa     -- FASTA file with the sequences to score.
        motif   -- motif file readable by MotifTools.load.
        outfile -- path of the text matrix written with np.savetxt.
        genome  -- 'hg18' selects the human background file name, anything
                   else the mouse one.

    NOTE(review): the log-likelihood entries of the loaded motifs are
    adjusted in place.
    """
    # NOTE(review): markov is selected but not used by the code below; the
    # background actually applied comes from EM.theMarkovBackground.
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    #Load motif and background adjust PSSM
    m = MotifTools.load(motif)
    bg = EM.theMarkovBackground.zeroth()
    F = Fasta.load(fsa, key_func=lambda x: x)
    # Upper-case once instead of once per motif.
    seqs = [seq.upper() for seq in F.values()]
    SCORES = np.zeros((len(m), len(seqs)), dtype='float')
    log2 = math.log(2.0)

    for i, M in enumerate(m):
        ll = M.logP
        for pos in ll:
            for letter in pos.keys():
                pos[letter] = pos[letter] - math.log(bg[letter]) / log2
        AM = MotifTools.Motif_from_ll(ll)

        mi, ma = AM.minscore, AM.maxscore

        # Best score of the motif in each sequence, rescaled to the motif's
        # own score range.
        for j, seq_fwd in enumerate(seqs):
            max_score = AM.bestscore(seq_fwd)
            SCORES[i, j] = (max_score - mi) / (ma - mi)

    np.savetxt(outfile, SCORES, fmt='%1.3f')