Esempio n. 1
0
def trim_motif(TAMO_file, cut=0.4):
    '''Save a trimmed copy of every motif found in TAMO_file.

    Each motif loaded from TAMO_file is replaced by the result of its
    trimmed(cut) method, and the resulting list is written to a new file
    named "<TAMO_file>_<cut>.trim".

    Has 2 arguments:
    - TAMO_file: a string with the location of the TAMO motif file
    - cut: the value handed to each motif's trimmed() method (default 0.4)
    '''
    loaded = MotifTools.load(TAMO_file)
    out_name = "_".join((TAMO_file, str(cut))) + ".trim"

    trimmed = [motif.trimmed(cut) for motif in loaded]
    save_motifs(trimmed, out_name)
def opentamo(fileloc):
    '''
    Load the motifs stored in a tamo file.

    Has 1 argument:
    - fileloc: a string with the location of the file

    Returns the list produced by MotifTools.load, or an empty list when the
    load raises IOError (for example because the file does not exist).
    '''
    try:
        motifs = MotifTools.load(fileloc)
    except IOError:
        motifs = []
    return motifs
Esempio n. 3
0
def tamo2tamo(file, outname):
    global probefile, PROBESET, fsafile

    motifs = MotifTools.load(file)
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = find_fsa(file)

    print '# FSA ', fsaname
    fsaD = MotifMetrics.fasta2seqs(fsaname, 'want_dict')
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq

    print "# %d motifs" % len(motifs)
    for motif in motifs:
        #motif.pvalue, motif.church = 1,1  #Comment this!
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        #if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if motif.frac == None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if motif.numbound == 0:
            matching = PROBESET.matching_ids(motif, [], factor=0.7)
            matchbound = [x for x in matching if x in probes]
            motif.numbound = len(probes)
            motif.nummotif = len(matching)
            motif.numboundmotif = len(matchbound)
        if 0 and motif.CRA == None:
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,
                                                   probes,
                                                   'v',
                                                   tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except:
                pass

    MotifTools.save_motifs(motifs, outname)
Esempio n. 4
0
def tamo2tamo(file, outname):
    global probefile, PROBESET, fsafile
    
    motifs  = MotifTools.load(file)
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = find_fsa(file)

    print '# FSA ',fsaname
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq

    print "# %d motifs"%len(motifs)
    for motif in motifs:
        #motif.pvalue, motif.church = 1,1  #Comment this!
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if motif.frac   == None: motif.frac   = PROBESET.frac(motif,probes,'v',0.7)
        if motif.numbound == 0:
            matching            = PROBESET.matching_ids(motif,[],factor=0.7)
            matchbound          = [x for x in matching if x in probes]
            motif.numbound      = len(probes)
            motif.nummotif      = len(matching)
            motif.numboundmotif = len(matchbound)
        if 0 and motif.CRA    == None:
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass
        
    MotifTools.save_motifs(motifs,outname)
Esempio n. 5
0
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:i:", ["help", "output="])  # AD added 'i'
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)
        

    print "#" + ' '.join(sys.argv)
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.75) # AD changed thresh val to 0.75 from 0.7
    ambigs = []

    scale   = 50.0 / 1000.0
    
    motifs = []
    for opt, value in opts:
        #print opt, value
        if   opt ==  '-f':  fastafile = value
        elif opt ==  '-m':  motifs.extend(MotifTools.txt2motifs(value))
        elif opt ==  '-n':  motifnums = [int(x) for x in value.split(',')]
        elif opt ==  '-L':  labels    = list(value)
        elif opt ==  '-t':  thresh    = float(value)
        elif opt ==  '-a':  ambigs.extend(value.split(','))
        elif opt ==  '-S':  scale     = float(value)
        elif opt ==  '-i':  motiffile = value  # AD added this option to ACTUALLY supply the tamo motif file at the command-line.  The code to deal with motiffiles already existed. There was just no code for User to supply one.
        
    probes = Fasta.load(fastafile)
    
    if motiffile:
        for f in motiffile.split(','):      # AD added this to allow supplying multiple tamo files at the prompt like you can supply multiple motifs
            motifs.extend(MotifTools.load(f))
    if ambigs:
        for ambig in ambigs:
            motifs.append( MotifTools.Motif_from_text(ambig,0.1) )
    if not motifnums:  motifnums = range(len(motifs))
    print '# %d: %s'%(len(motifs),motifnums)
    for i in range(len(motifnums)):
        motif = motifs[motifnums[i]]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d'%i
        print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh)

    probehits = {}
    for key in probes.keys():
        hits_by_motif = []
        save_flag     = 0
        if re.search('[BDHU]',probes[key]): continue
        for num in motifnums:
            result = motifs[num].scan(probes[key],thresh*motif.maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key]=hits_by_motif

    #scale   = .1
    maxw = 40
    for key in probehits.keys():
        l       = len(probes[key])
        a       = list('-'* int(scale*l) )
        a.extend( list(' '*10 ) )
        desc    = []
        matches = probehits[key]
        for i in range(len(matches)):
            if matches[i]:
                subseqs,endpoints,scores = matches[i]
                for idx in range(len(subseqs)):
                    start,stop = endpoints[idx]
                    subseq     = subseqs[idx]
                    score      = scores[idx]
                    if labels and (i<len(labels)): ID = labels[i]
                    else                         : ID = '%d'%i
                    desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score))
                    start = int(start*scale)
                    for offset in range(10):
                        if a[start+offset] == '-':
                            if labels and (i < len(labels)):
                                a[start+offset] = labels[i]
                            else:
                                a[start+offset] = '%d'%i
                            break
        print '%-14s %s'%(key,''.join(a)),
        print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc])
        
    print
    print "Found matches in %d of %d input probes"%(len(probehits),len(probes))
Esempio n. 6
0
#!/usr/bin/python
'''
This opens a general TAMO cluster list and outputs **TO STANDARD OUT** the probability matrices of all
items there. Separated by a line with the name of each cluster. *It is recommended to be used in a
bash pipeline where the standard out can be written into a file.*  

Has 1 argument: 
- motiflist: a TAMO motif list that will be output

Returns: 
- A series of strings that represent the probability matrices of all motifs in the input list

Author: Hector Galvez
'''

from sys import argv
from TAMO import MotifTools

# Open list
motiflist = MotifTools.load(argv[1])

# Start printing information for each motif
for num in range(len(motiflist)):
    print '>Cluster_' + str(num + 1)
    motiflist[num]._print_p()


import os,sys,string
from   TAMO              import MotifTools
from   TAMO.seq          import Fasta
from   TAMO.MotifMetrics import ProbeSet

promoters = ProbeSet(sys.argv[1])
geneset_ids = open(sys.argv[2]).read().split('\n')[:-1]
match_ids = []
prom_ids = promoters.probes.keys()
for id in geneset_ids:
  if id in prom_ids:
    match_ids.append(id)

motifs = MotifTools.load(sys.argv[3])
church = 0.05
rocauc = 0.1
pvalue = 0.05

print "Name\tMotif\tChurch\tRoc-auc\tP-value"
for m in motifs:
  m.church   = promoters.church  (m, match_ids)
#  m.ROC_auc  = promoters.ROC_AUC (m, match_ids)
  m.pvalue   = promoters.p_value (m, match_ids)
  if m.church <= church and m.pvalue <= pvalue:
    print "%s\t%s\t%s\t%s" %\
    (m.source, m, m.church, m.pvalue) 


Esempio n. 8
0
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    '''Score every sequence against every motif and save the score matrix.

    Arguments:
    - fsa: location of the FASTA file with the sequences to score
    - motif: location of the TAMO motif file
    - outfile: location the score matrix is written to with np.savetxt
    - genome: 'hg18' selects the hg18 promoter Markov background file; any
      other value (default 'mm9') selects the Mouse one

    Each motif's log-probability matrix is adjusted by subtracting the log2
    of the zeroth-order background frequency of each letter. Entry [i, j]
    of the saved matrix is, for motif i and sequence j (in the order of
    Fasta.load(...).values()),
        (bestscore - minscore) / (maxscore - minscore)
    written with the '%1.3f' format.
    '''
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    m = MotifTools.load(motif)

    # The background depends only on `markov`, so it is loaded once here
    # instead of being re-read from disk for every motif.
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()

    F = Fasta.load(fsa, key_func=lambda x: x)
    # Upper-case each sequence once rather than once per motif.
    seqs = [seq.upper() for seq in F.values()]
    SCORES = np.zeros((len(m), len(seqs)), dtype='float')

    for i, M in enumerate(m):
        # NOTE: the adjustment is applied in place to the loaded motif's
        # logP entries, as before.
        ll = M.logP
        for pos in ll:
            for letter in pos.keys():
                pos[letter] = pos[letter] - math.log(
                    bg[letter]) / math.log(2.0)
        AM = MotifTools.Motif_from_ll(ll)
        mi, ma = AM.minscore, AM.maxscore

        for j, seq_fwd in enumerate(seqs):
            max_score = AM.bestscore(seq_fwd)
            SCORES[i, j] = (max_score - mi) / (ma - mi)

    np.savetxt(outfile, SCORES, fmt='%1.3f')
#
# Compare motifs in tamo format
#

from   TAMO              import MotifTools
from   TAMO.MotifMetrics import ProbeSet
from   TAMO.Clustering   import MotifCompare
from   TAMO.Clustering   import Kmedoids
import sys
import pickle
import pprint


file_unknown = sys.argv[1]# Unknown
file_tfbs = sys.argv[2]# TF db
motifs_unknown = MotifTools.load(file_unknown) 
motifs_tfbs = MotifTools.load(file_tfbs) 

match_dict = {}
for unknown in motifs_unknown:
  tf_list = []
  for tfbs in motifs_tfbs:
    #print 
    #print "Comparing motifs:"
    #print "    %s  vs  %s" % (unknown.source, tfbs.source)
    #print "    Unknown motif ( %s ) vs TFBS ( %s ) " % (unknown, tfbs)
    #print
    joined_motifs = []
    joined_motifs.append(unknown)
    joined_motifs.append(tfbs)
    print joined_motifs
Esempio n. 10
0
from gusPyCode.MDAP_proj.MDAP_defs import alignAndCombineMotifs
from TAMO import MotifTools

Motif = MotifTools.Motif

outFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6PlusCombos.motifs.stdThresh.tmo'

m = MotifTools.load('/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6.motifs.stdThresh.tmo')
w = [5.8952,
     5.6523,
     5.0585,
     4.9788,
     4.9678,
     4.7688]

toTmo = []
toTmo.append(alignAndCombineMotifs([m[0],m[1]],[w[0],w[1]]))
toTmo.append(alignAndCombineMotifs([m[0],m[4]],[w[0],w[4]]))
toTmo.append(alignAndCombineMotifs([m[1],m[4]],[w[1],w[4]]))
toTmo.append(alignAndCombineMotifs([m[2],m[3]],[w[2],w[3]]))
toTmo.append(alignAndCombineMotifs([m[2],m[5]],[w[2],w[5]]))


for e in toTmo:
    print e.oneletter

MotifTools.save_motifs(m+toTmo,outFile)    
    
None
Esempio n. 11
0
# argv[1] is a directory path; its last component is used as the gene list
# name from which all file names below are built.
genelist = argv[1].split('/')[-1]
# Input: <dir>/<genelist>_allclusters.tamo
allclusters = argv[1] + '/' + genelist + '_allclusters.tamo'
#print genelist
# Outputs: two temporary files under <dir>/other/
oneletters = argv[1] + '/other/' + genelist + '_oneletter.tmp'
symbols = argv[1] + '/other/' + genelist + '_symbols.tmp'

# Open output files for writing
# (from here on both names refer to the open file objects, not the paths)
# NOTE(review): the files are opened before the length check below, so a
# ValueError there leaves both files created and unclosed -- confirm this
# is acceptable.
oneletters = open(oneletters, 'w')
symbols = open(symbols, 'w')

# Define output variables
oneletterlist = []
# One candidate symbol per cluster; its length caps the number of clusters
symbolstring = '1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ*+.,:;!'

# Open list
motiflist = MotifTools.load(allclusters)

# Try to verify the initial list is not too long
if len(motiflist) > len(symbolstring):
    # If the list is too long, raise an exception so that the program quits
    raise ValueError("The cluster list is too long for sitemap.py")
# If the list is not too long, adjust the symbols string to the appropriate length
else:
    symbolstring = symbolstring[:len(motiflist)]

# Save symbol string in the symbols file and close that file
symbols.write(symbolstring)
symbols.close()

# Add oneletter summaries to the list
for num in range(len(motiflist)):
Esempio n. 12
0
- Requires the clustering.py script to be in the same directory

Author: Hector Galvez
"""

from sys import argv,exit # To be able to parse arguments in the command line
from clustering import clusterinfo,clusteravg # Import clustering functions from clustering.py script
# Import all relevant TAMO modules
from TAMO import MotifTools
from TAMO import Clustering
from TAMO.Clustering import MotifCompare
from TAMO.Clustering import Kmedoids

try:
    # Import the motif lists from the tamo file provided as the first argument
    inputlist = MotifTools.load(argv[1])
except IOError:
    print "Couldn't find %s, moving on..." % str(argv[1].split('/')[-2] + '/' + argv[1].split('/')[-1])   
    exit() 

# Create the name of a new TAMO file for clustered motifs
genelist = str(argv[1].split('/')[-3])
output = argv[1].rstrip(genelist + '.tamo') + genelist + '_clusters.tamo'

# Create output information
clusterinf = clusterinfo(inputlist)
averages = clusteravg(inputlist,clusterinf)

# Save new list of cluster averages
MotifTools.save_motifs(averages,output)
Esempio n. 13
0
def main():
    ##########################################################################################
    #THEME.py: THEME module for performing cross-validated hypothesis testing on transcription
    #factor binding data.
    #Usage: python THEME.py foreground_fasta_file (file path) background_fasta_file (file path)
    #hypothesis_index (integer)  -fse hypothesis_file (file path) -markov markov_background (file path)
    #-motif_file output_file (file path) -cv fold cross-validation (integer)
    ##########################################################################################

    if (len(sys.argv)<4):
        print "Usage: THEME.py foreground.fsa background.fsa hypotheses.txt"
        sys.exit(1)

    fg_file = sys.argv[1]           #get fasta file with foreground sequences
    bg_file = sys.argv[2]           #get fasta file with background sequences
    test_indices = sys.argv[3]      #colon separated indices into fse file
    cv_level = 2                    #default 2-fold cross-validation
    refine = 1
    randomize = 0
    beta = 0.0
    delta = 0.001
    motif_file = 'dummy.out'
    dump_categories_to_file = 0
    test_family = ''
    
    #read in any command line options
    for arg, i in zip(sys.argv,range(len(sys.argv))):
        if (arg == '-cv'):
            cv_level = int(sys.argv[i+1])
        if (arg == '-markov'):
            markov_file = sys.argv[i+1]
        if (arg == '-fse'):
            fse_file = sys.argv[i+1]
        if (arg == '-norefine'):
            refine = 0
        if (arg == '-beta'):
            beta = float(sys.argv[i+1])
        if (arg == '-delta'):
            delta = float(sys.argv[i+1])
        if (arg == '-randomization'):
            randomize = 1
        if (arg == '-motif_file'):
            motif_file = sys.argv[i+1]
        if (arg == '-dump'):
            dump_categories_to_file = 1
        if (arg == '-family'):
            test_family = family
    FH = open(motif_file, 'w')
    FH.write("******THEME Motif Output******")
    FH.close()
    
    random.seed()

    cross_val = THEME(fg_file, bg_file, cv_level, markov_file)
    if ((beta>0.0)and(beta<1.0)) : cross_val.beta = beta/(1-beta)
    cross_val.delta = delta
    cross_val.refine = refine
    cross_val.randomize = randomize
    cross_val.motif_file = motif_file
    if (test_family): cross_val.family = test_family
    if (dump_categories_to_file):
        cross_val.dump = 1

    ###################################################################################
    #get seed sequences that will be tested
    ###################################################################################
    models = []
    fses = MotifTools.load(fse_file)
    if (test_indices=='all'):
        indices = range(len(fses))
    else:
        indices = []
        ivals = test_indices.split(':')
        for v in ivals:
            indices.append(int(v))
    for i in indices:
        ll = fses[i].logP
        bg = EM.theMarkovBackground.zeroth()
        for pos in ll:
            for letter in pos.keys():
                pos[letter] = pos[letter] - math.log(bg[letter])/math.log(2.0)
        adj_bg_model = MotifTools.Motif_from_ll(ll)
        adj_bg_model.source = fses[i].source
        models.append(adj_bg_model)
        
    (m, err) = cross_val.run_CV(models)
Esempio n. 14
0
def motif_matrix(fsa,motif,outfile,genome='mm9'):
    '''Score every sequence against every motif and save the score matrix.

    Arguments:
    - fsa: location of the FASTA file with the sequences to score
    - motif: location of the TAMO motif file
    - outfile: location the score matrix is written to with np.savetxt
    - genome: 'hg18' selects the hg18 promoter Markov background file; any
      other value (default 'mm9') selects the Mouse one

    Each motif's log-probability matrix is adjusted by subtracting the log2
    of the zeroth-order background frequency of each letter. Entry [i, j]
    of the saved matrix is, for motif i and sequence j (in the order of
    Fasta.load(...).values()),
        (bestscore - minscore) / (maxscore - minscore)
    written with the '%1.3f' format.
    '''
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    m = MotifTools.load(motif)

    # The background depends only on `markov`, so it is loaded once here
    # instead of being re-read from disk for every motif.
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()

    F = Fasta.load(fsa, key_func=lambda x: x)
    # Upper-case each sequence once rather than once per motif.
    seqs = [seq.upper() for seq in F.values()]
    SCORES = np.zeros((len(m), len(seqs)), dtype='float')

    for i, M in enumerate(m):
        # NOTE: the adjustment is applied in place to the loaded motif's
        # logP entries, as before.
        ll = M.logP
        for pos in ll:
            for letter in pos.keys():
                pos[letter] = pos[letter] - math.log(bg[letter])/math.log(2.0)
        AM = MotifTools.Motif_from_ll(ll)
        mi, ma = AM.minscore, AM.maxscore

        for j, seq_fwd in enumerate(seqs):
            max_score = AM.bestscore(seq_fwd)
            SCORES[i, j] = (max_score - mi) / (ma - mi)

    np.savetxt(outfile, SCORES, fmt='%1.3f')