Esempio n. 1
0
def main():
    seqsD = Fasta.load(sys.argv[1])
    seqs  = seqsD.values()
    for w in range(1,7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w,seqs,'with counts','purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1 #Pseudo count
            total = total + 1
        for nmer,count in nmersT[:]:
            try: 
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc]   = nmersD[rc]   + count
                total = total + 2*count
            except KeyError:
                pass
        _t = nmersD.keys()
        _t.sort()
        print "# freq in %s (total %d with pseudocounts)"%(sys.argv[1],total)
        for nmer in _t:
            print "%-7s %20.17f"%(nmer,float(nmersD[nmer]) / total)
        sys.stdout.flush()
Esempio n. 2
0
def info2seeds(N,infofile,probefile,species='YEAST'):
    G    = ProbeSet(species)
    IDs  = G.ids_from_file(probefile)
    Q    = EM.theMarkovBackground.zeroth()
 
    seqs = Fasta.seqs(infofile)
    
    if not N:
        nmers = seqs
    else:
        nmers= MotifTools.top_nmers(N,seqs)
        if len(nmers) > 1000: nmers = nmers[0:1000]
        
    print "Scoring enrichment of %d nmers from %s"%len(nmers,infofile)
    sys.stdout.flush()
    
    nmers_scoresT = []
    for nmer in nmers:
        if nmer.isalpha():
            p = G.p_value(nmer,IDs,'') #'verbose'
            nmers_scoresT.append((nmer,p))
    nmers_scoresT.sort(lambda x,y: cmp(x[1],y[1]))
    last = min(20,len(nmers_scoresT))
    models = []
    for i in range(last):
        seq = nmers_scoresT[i][0]
        m = MotifTools.Motif('',Q)
        m.compute_from_text(seq,0.1)
        models.append(m)
    for tup in nmers_scoresT[0:40]:
        print tup
    return(models)
Esempio n. 3
0
def main():
    seqsD = Fasta.load(sys.argv[1])
    seqs = seqsD.values()
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  #Pseudo count
            total = total + 1
        for nmer, count in nmersT[:]:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc] = nmersD[rc] + count
                total = total + 2 * count
            except KeyError:
                pass
        _t = nmersD.keys()
        _t.sort()
        print "# freq in %s (total %d with pseudocounts)" % (sys.argv[1],
                                                             total)
        for nmer in _t:
            print "%-7s %20.17f" % (nmer, float(nmersD[nmer]) / total)
        sys.stdout.flush()
Esempio n. 4
0
def main(fastafile, outDirectory):
    """Write pseudocounted n-mer frequencies of a FASTA file to a .freq file.

    Parameters:
    - fastafile: path of the FASTA file to analyse (added 1/2/09 by AD so the
      function can be called from another script).
    - outDirectory: directory that receives ``<basename of fastafile>.freq``.

    For widths 1-6, every n-mer from ``permute`` gets a pseudocount of 1 and
    the counts from ``MotifTools.top_nmers`` are added to the n-mer and its
    reverse complement.  Each width contributes a header line followed by one
    line per n-mer.
    """
    seqs = Fasta.load(fastafile).values()
    basename = fastafile.split('/')[-1]

    output = []
    for w in range(1,7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w,seqs,'with counts','purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1 #Pseudo count
            total = total + 1
        for nmer,count in nmersT:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc]   = nmersD[rc]   + count
                total = total + 2*count
            except KeyError:
                pass
        # Lines carry their own '\n' (AD 02-27-09) so the file looks right.
        output.append("# freq in %s (total %d with pseudocounts)\n"%(basename,total))
        for nmer in sorted(nmersD):
            output.append("%-7s %20.17f\n"%(nmer,float(nmersD[nmer]) / total))

    # Fixed: the original reopened and rewrote the file on every pass of the
    # width loop and never closed it.  Write once, and always close.
    outFile = open('%s/%s.freq' % (outDirectory, basename), 'w')
    try:
        outFile.writelines(output)
    finally:
        outFile.close()
Esempio n. 5
0
def loadmotif(infile, trimstart=0, trimend=0):
    """Load a motif from ``infile``, choosing the parser from its first line.

    Three layouts are recognised:
    - first line is the header "A<TAB>C<TAB>G<TAB>T": the remaining lines are
      tab-separated per-position values, passed to ``Motif_from_counts``;
    - first line starts with one of A, C, G, T: the lines themselves are
      handed to ``MotifTools.Motif``;
    - anything else: whitespace-separated integers, with rows 0-3 used as the
      A, C, G and T values of each column, passed to ``Motif_from_counts``.

    ``trimstart`` / ``trimend`` drop that many entries from the start / end in
    the first two layouts.  NOTE: the third layout does not apply them.
    """
    from TAMO import MotifTools

    def _trim(rows):
        # trimend == 0 needs its own case: rows[a:-0] would be empty.
        if trimend == 0:
            return rows[trimstart:]
        return rows[trimstart:-trimend]

    lines = loadlist(infile)
    first = lines[0]

    if first == "A\tC\tG\tT":
        positions = []
        for row in lines[1:]:
            fields = row.split("\t")
            positions.append({
                'A': float(fields[0]),
                'C': float(fields[1]),
                'G': float(fields[2]),
                'T': float(fields[3]),
            })
        return MotifTools.Motif_from_counts(_trim(positions))

    if first[0] in 'ACGT':
        return MotifTools.Motif(_trim(lines))

    table = [list(map(int, row.split())) for row in lines]
    positions = [
        {
            'A': table[0][col],
            'C': table[1][col],
            'G': table[2][col],
            'T': table[3][col],
        }
        for col in range(len(table[0]))
    ]
    return MotifTools.Motif_from_counts(positions)
Esempio n. 6
0
def tamo2tamo(file, outname):
    global probefile, PROBESET, fsafile

    motifs = MotifTools.load(file)
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = find_fsa(file)

    print '# FSA ', fsaname
    fsaD = MotifMetrics.fasta2seqs(fsaname, 'want_dict')
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq

    print "# %d motifs" % len(motifs)
    for motif in motifs:
        #motif.pvalue, motif.church = 1,1  #Comment this!
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        #if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if motif.frac == None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if motif.numbound == 0:
            matching = PROBESET.matching_ids(motif, [], factor=0.7)
            matchbound = [x for x in matching if x in probes]
            motif.numbound = len(probes)
            motif.nummotif = len(matching)
            motif.numboundmotif = len(matchbound)
        if 0 and motif.CRA == None:
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,
                                                   probes,
                                                   'v',
                                                   tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except:
                pass

    MotifTools.save_motifs(motifs, outname)
Esempio n. 7
0
def memefiles2tamo(files, tamoname):
    global probefile, PROBESET, fsafile
    
    motifs = []
    for filename in files:
        print ">>>SDFSD>F ",filename
        if   re.search('\.ace$',filename):
            mdobject = AlignAce.AlignAce(filename)
            if not mdobject.fastafile: mdobject.fastafile=filename.replace('.ace','.fsa')
        elif re.search('\.meme.*$',filename):
            mdobject = Meme.Meme(filename)
            if not mdobject.fastafile:
                mdobject.fastafile=re.sub('\..\.meme','.meme',filename).replace('.meme','.fsa')
        motifs.extend(mdobject.motifs)

    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    if fsafile: fsaname = fsafile
    else:       fsaname = Fasta.find(mdobject.fastafile)
    fsaD    = Fasta.load(fsaname)
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if motif.frac   == None: motif.frac   = PROBESET.frac(motif,probes,'v',0.7)
        if re.search('\.meme$',filename):
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        if 0 and (motif.CRA == None):
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass

    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))

    MotifTools.save_motifs(motifs,tamoname)
Esempio n. 8
0
def tamo2tf(TAMO_file):
    '''Convert a TAMO motif file to TRANSFAC format.

    The result is written to "<TAMO_file>.tf".  Each motif gets a "DE" line
    (its source, or "<file name>_<1-based index>" when the source is empty),
    one row per position holding the position number and, when the motif has
    a logP matrix, int(100 * 2**logP) for A, C, G and T, and a closing "XX".
    '''
    motif_list = MotifTools.txt2motifs(TAMO_file)
    short_name = TAMO_file.split("/")[-1]
    bases = ("A", "C", "G", "T")

    out = open("%s.tf" % (TAMO_file), "w")
    for index, motif in enumerate(motif_list):
        if motif.source == "":
            label = "%s_%s" % (short_name, index + 1)
        else:
            label = motif.source
        out.write("DE\t%s\t%s\n" % (label, label))

        for pos in range(motif.width):
            cells = ["%s\t" % pos]
            for base in bases:
                if motif.logP:
                    prob = pow(2.0, motif.logP[pos][base])
                    cells.append("%s\t" % int(prob * 100))
            cells.append("\n")
            out.write("".join(cells))
        out.write("XX\n")
    out.close()
Esempio n. 9
0
def averagemotifs(motifs,ovlp=2,template=None,DFUNC=negcommonbitsrange,VERBOSE=1,prop=''):
    if not template: 
        Dmat = computeDmat(motifs)
        idx  = centroididx(Dmat)
        template = motifs[idx]

    for m in motifs:
        off, rc = minshortestoverhangdiff(template,m,OVLP(template,m),'want_offset',DFUNC=DFUNC)
        m.offset = off
        m.rc     = rc
        #Find most negative offset
    offsets = [m.offset for m in motifs]             ; offsets.sort()
    maxposs = [(m.offset + m.width) for m in motifs] ; maxposs.sort()
    minpos = -offsets[0]
    maxpos = maxposs[-1] + minpos
    pmotifs = []
    for m in motifs:
        if m.rc: _m = m.revcomp()
        else   : _m = m
        leftpad  = minpos + m.offset
        rightpad = maxpos - (leftpad + m.width)
        padded   = _m[-leftpad,_m.width+rightpad]
        #print '%s%s%s\t%s'%('*'*leftpad,_m.oneletter,'*'*rightpad,padded)
        pmotifs.append(padded)
    AVE = MotifTools.sum(pmotifs,[])
    if VERBOSE:
        for m in pmotifs:
            d = minshortestoverhangdiff(AVE,m,OVLP(AVE,m),DFUNC=DFUNC)
            print '%s   %5.3f'%(m.oneletter,d),
            if m.__dict__.has_key('key'): print m.key,
            if prop and m.__dict__.has_key(prop): print m.__dict__[prop],
            print
        print '-'*m.width
    return AVE
Esempio n. 10
0
    def __init__(self,seed_seqs, all_seqs, width = 6, verbose = ''):
        """Store the input sequences and set the default parameters.

        Parameters:
        - seed_seqs: sequences to be scanned for seeds.
        - all_seqs: stored as ``self.seqs``.
        - width: n-mer width given to ``MotifTools.top_nmers``; when false,
          each seed sequence is paired with its index instead.
        - verbose: stored as ``self.verbose``.
        """
        self.seed_seqs  = seed_seqs #Sequences to be scanned for seeds
        self.seqs       = all_seqs
        self.candidates = []
        self.models     = []      #Set directly or computed from seed_seqs
        self.width      = width
        self.verbose    = verbose
        if width:
            self.goodwmersT = MotifTools.top_nmers(self.width,self.seed_seqs,1,"")
        else:
            # No width: pair every seed sequence with its position in the list.
            self.goodwmersT = zip(self.seed_seqs,range(len(self.seed_seqs)))
        # Default background, replaced below when a global Markov background is set.
        self.bgprob     = {'A': 0.31, 'C': .19, 'G': .19, 'T': .31}
        self.beta       = 0.001
        self.deltamin   = 1e-3
        self.probes     = []
        self.method     = "ZOOPS" # either "OOPS" or "ZOOPS"
        self.param      = {}
        self.gapflank   = 0
        self.gapweight  = 0.2
        self.seedbeta   = 0.02
        self.joint      = 1

        global theMarkovBackground
        if theMarkovBackground:
            # Use the zeroth() values of the module-level background instead.
            self.bgprob = theMarkovBackground.zeroth()

        '''DELETE
Esempio n. 11
0
def parse_opts():
    global GLOBALS
    global DFUNC, DMAX
    short_opts = 'm:'
    long_opts  = ['dfunc:']
    try:   opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError:
        print getopt.GetoptError.__dict__
        usage()
    if not opts: usage()

    GLOBALS['args'] = args
    GLOBALS['motifs'] = []
    DFUNCtxt = None
    for opt,value in opts:
        if opt == '-m':                   GLOBALS['motifs'] = MotifTools.txt2motifs(value)
        if opt == '--dfunc':              DFUNCtxt = value
        if opt == '-d':                   DMAX     = float(value)

    # Deal with DFUNC and DMAX
    if DFUNCtxt == 'NCB':
        _DFUNC = MotifCompare.negcommonbits
    elif DFUNCtxt:
        try:
            exec ("_DFUNC = MotifCompare.%s"%DFUNCtxt)
        except:
            usage("No such distance metric: %s"%DFUNCtxt)
    if _DFUNC:  set_dfunc(_DFUNC,DMAX)
Esempio n. 12
0
def tamo2tamo(file, outname):
    global probefile, PROBESET, fsafile
    
    motifs  = MotifTools.load(file)
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = find_fsa(file)

    print '# FSA ',fsaname
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq

    print "# %d motifs"%len(motifs)
    for motif in motifs:
        #motif.pvalue, motif.church = 1,1  #Comment this!
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if motif.frac   == None: motif.frac   = PROBESET.frac(motif,probes,'v',0.7)
        if motif.numbound == 0:
            matching            = PROBESET.matching_ids(motif,[],factor=0.7)
            matchbound          = [x for x in matching if x in probes]
            motif.numbound      = len(probes)
            motif.nummotif      = len(matching)
            motif.numboundmotif = len(matchbound)
        if 0 and motif.CRA    == None:
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass
        
    MotifTools.save_motifs(motifs,outname)
Esempio n. 13
0
def pick_chunk_score(wdir, TAMO_file, target, genome):
    '''Trims and returns the top motif in a cluster.
    
    This script takes in the TAMO file from the motifs in a single cluster. It
    trims the low-information ends from each motifs. It then indentifies the
    motif that is most significantly represented in the target genes in your
    genome. If no motif is significantly represented, then a blank top motif
    file is created.
    '''
    os.system("cd %s" % wdir)
    os.chdir(wdir)

    script_dir = '/'.join(os.path.abspath(__file__).split('/')
                          [:-1])  # path to pcc_merge_CC.py script

    ##
    # step 1 trim tamo to eliminate low information flanking sequence
    trim_motif(TAMO_file, 0.1)

    ##
    # step 2 Group Specificity Score" from the Church lab
    # python MotifMetrics.py [Genes of interest] -genome [FASTA of promoter sequence] -t [Trimmed TAMO of cluster motifs]
    # MotifMetrics.py checks if the motifs appear disproportionatly to the
    # targets compared to the rest of the genes.
    os.system(
        "python %s/MotifMetrics.py %s -genome %s -t %s_0.1.trim -spec > %s_0.1.trim_Cout"
        % (script_dir, target, genome, TAMO_file, TAMO_file))

    ##
    # Gets the motif that is most significantly represented in your target genes
    # Returns "None" if none of the motifs has a p-value above 0.001.
    topm = parse_out_pcs("%s_0.1.trim_Cout" % TAMO_file)
    print "topm", topm

    ##
    # Writes the top motif to its own directory.
    if topm != "None":

        newdic = {}
        ml = MotifTools.txt2motifs("%s_0.1.trim" % TAMO_file)

        for m in ml:

            if m.oneletter == topm:
                newdic[m.oneletter] = m

        save_motifs(newdic.values(), "%s.TOP" % TAMO_file)
        os.system("rm %s_0.1.trim" % TAMO_file)
        os.system("rm %s_0.1.trim_Cout" % TAMO_file)

    ##
    # Writes a blank document if there was no top motif.
    else:
        oup = open("%s.TOP" % TAMO_file, "w")
        oup.close()
Esempio n. 14
0
def ace2tamo(filename, tamoname):
    global probefile, PROBESET
    if   re.search('\.ace$',filename):
        mdobject = AlignAce.AlignAce(filename)
    elif re.search('\.meme$',filename):
        mdobject = Meme.Meme(filename)

    fsaname = find_fsa(mdobject.fastafile)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('HUMAN_250')
        #PROBESET= pick_genome(fsaname)
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in mdobject.motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if re.search('\.meme$',filename):
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        sys.stdout.flush()

    i = 0
    for motif in mdobject.motifs:
        motif.seednum = i ; i=i+1
        kmers = motif.bogus_kmers(100)
        motif.maxscore = -100
        scores = [motif.scan(kmer)[2][0] for kmer in kmers]
        print Arith.avestd(scores)

    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))

    MotifTools.save_motifs(mdobject.motifs,tamoname)
Esempio n. 15
0
def ace2tamo(filename, tamoname):
    global probefile, PROBESET
    if   re.search('\.ace$',filename):
        mdobject = AlignAce.AlignAce(filename)
    elif re.search('\.meme$',filename):
        mdobject = Meme.Meme(filename)

    fsaname = find_fsa(mdobject.fastafile)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('HUMAN_250')
        #PROBESET= pick_genome(fsaname)
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in mdobject.motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if re.search('\.meme$',filename):
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        sys.stdout.flush()

    i = 0
    for motif in mdobject.motifs:
        motif.seednum = i ; i=i+1
        kmers = motif.bogus_kmers(100)
        motif.maxscore = -100
        scores = [motif.scan(kmer)[2][0] for kmer in kmers]
        print Arith.avestd(scores)

    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))

    MotifTools.save_motifs(mdobject.motifs,tamoname)
Esempio n. 16
0
def trim_motif(TAMO_file, cut=0.4):
    '''Save trimmed copies of the motifs in TAMO_file.

    Every motif loaded from TAMO_file is replaced by motif.trimmed(cut) and
    the resulting list is saved as "<TAMO_file>_<cut>.trim".
    '''
    out_path = TAMO_file + "_" + str(cut) + ".trim"
    trimmed = [motif.trimmed(cut) for motif in MotifTools.load(TAMO_file)]
    save_motifs(trimmed, out_path)
Esempio n. 17
0
def TAMO_split(TAMO_file, motifs_per_file=190):
    '''This function splits a TAMO into smaller files for create_cc'''
    ml = MotifTools.txt2motifs(TAMO_file)
    total = len(ml) / int(motifs_per_file)  # Total number of TAMOs to generate
    by = motifs_per_file
    for i in range(total):
        print i
        print i * by + by, TAMO_file + '_n%s' % i
        save_motifs(ml[i * by:i * by + by], TAMO_file + '_n%s' % i)
    print total * by, len(ml), TAMO_file + '_n%s' % (total)
    save_motifs(ml[total * by:len(ml)], TAMO_file + '_n%s' % (total))
    return (total)
Esempio n. 18
0
def opentamo(fileloc):
    '''
    Load the motifs of a TAMO file with MotifTools.load.

    Argument:
    - fileloc: a string with the location of the file

    Returns the list of motifs, or an empty list when loading raises
    IOError (for example because the file does not exist).
    '''
    try:
        motifs = MotifTools.load(fileloc)
    except IOError:
        motifs = []
    return motifs
Esempio n. 19
0
def Reduce_Nmers(Info):
    print 'COMPUTING Nmers ....'
    mseqs = ReduceInfo2seqs(Info,70, lambda L: MotifTools.top_nmers(6,L)[0:3])
    print "Combining representative sequences...: "
    for i in range(len(mseqs)):
        i = i + 1
        print '\t%s'%mseqs[i-1],
        if (i%5 == 0): print
    print 

    top_seq_pairs = MotifTools.top_nmers(5,mseqs,1)
    total_nmers = 0
    for (mner,count) in top_seq_pairs:
        total_nmers = total_nmers + count
    for (nmer,count) in top_seq_pairs[0:8]:
        print "RESULT: %s\t%2d (%5.2f%%) occurences:  "%(nmer,count,
                                                         100*float(count)/total_nmers),
        for bsite in Info.query['bsites']:
            seq = bsite.cleantxt()
            (max,s1,s2) = MotifTools.compare_seqs(nmer,seq)
            print '   %s vs %s %4.2f correct'%(s1,s2,max)
Esempio n. 20
0
def tamofile2motifs(filename):
    """Parse the text of a TAMO output file into a list of motifs.

    Lines are recognised by their leading characters.  A 'Log-odds matrix'
    line starts a new motif, built with ``MotifTools.Motif_from_ll`` from the
    four matrix rows two lines further down.  'Motif', 'Threshold:',
    'Source:', 'Gamma:' and 'Evalue' lines set attributes on the most recent
    motif, 'Seed' lines are collected by number, and a line containing both
    'Using' and 'as seeds' names the seed file.  After the scan each motif
    receives ``seedfile`` (when one was found) and ``seedtxt`` (when a seed
    with its ``seednum`` was listed).

    Returns the list of motifs, which is empty when no matrix line is found.
    """
    def _begins(text, label, n):
        # Compare only the first n characters, as the file format expects.
        return text[0:n] == label[0:n]

    handle = open(filename,'r')
    lines = handle.readlines()
    handle.close()

    motifs = []
    seeds_by_num = {}
    seedfile = ''
    for idx, line in enumerate(lines):
        if _begins(line, 'Log-odds matrix', 10):
            # The next line is a header; its field count minus one is the width.
            width = len(lines[idx+1].split())-1
            columns = [{} for _ in range(width)]
            for row in range(0,4):
                toks = lines[idx+row+2].split()
                letter = toks[0][1]
                for pos in range(width):
                    columns[pos][letter] = float(toks[pos+1])
            motifs.append(MotifTools.Motif_from_ll(columns))
        if _begins(line, 'Motif ', 6):
            toks = line.split()
            motifs[-1].nseqs    = float(re.sub('[\(\)]','',toks[3]))
            motifs[-1].totalbits= float(toks[5])
            motifs[-1].MAP      = float(toks[7])
            motifs[-1].seeddist = float(toks[9])
            motifs[-1].seednum  = int(toks[10][0:-1])
            motifs[-1].pvalue   = math.pow(10,-float(toks[12]))
            if 'ch:' in toks:
                motifs[-1].church = math.pow(10,-float(toks[14]))
        if _begins(line, 'Threshold: ', 10):
            toks = line.split()
            motifs[-1].threshold= float(toks[1])
        if _begins(line, 'Seed ', 5):
            toks = line.split()
            number = int(toks[1][0:-1])  #'10:' -> '10'
            seeds_by_num[number] = toks[2]
        if _begins(line, 'Source: ', 7):
            motifs[-1].source = line[7:].strip()
        if _begins(line, 'Gamma: ', 6):
            motifs[-1].gamma = float(line[6:])
        if _begins(line, 'Evalue: ', 6):
            motifs[-1].evalue = float(line[7:].strip())
        if line.find('Using')>=0 and line.find('as seeds')>=0:
            # e.g. "#Using all (132) motifs in SLT_081503.seeds as seeds:"
            seedfile = line.split()[-3]

    for motif in motifs:
        if seedfile: motif.seedfile = seedfile
        seednum = motif.seednum
        if seednum in seeds_by_num:
            motif.seedtxt = seeds_by_num[seednum]
    return(motifs)
Esempio n. 21
0
def motifs2tamo(motifs, outname):
    global probefile, PROBESET
    
    fsaname = find_fsa(outname)
    fsaD    = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq

    print "# %d motifs"%len(motifs)
    for motif in motifs:
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
    MotifTools.save_motifs(motifs,outname)
Esempio n. 22
0
def parse_block(name, block):
    """Build a motif from a block of whitespace-separated matrix lines.

    Parameters:
    - name: stored as the ``source`` attribute of the motif.
    - block: iterable of text lines; in each line fields 1-4 (0-based) are
      read as the A, C, G and T values and field 0 is ignored.

    Returns the motif created by ``MotifTools.Motif_from_counts``.
    """
    field_of = {"A": 1, "C": 2, "G": 3, "T": 4}
    rows = []
    for raw in block:
        fields = raw.strip().split()
        counts = dict.fromkeys(('A', 'C', 'T', 'G'), 0)
        for base, col in field_of.items():
            counts[base] = float(fields[col])
        rows.append(counts)
    motif = MotifTools.Motif_from_counts(rows)
    motif.source = name
    return motif
Esempio n. 23
0
def test():
    motifs = []
    betalist = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 4.0]
    for beta in [1.0]:
        m = MotifTools.Motif()
        m.compute_from_text('GGTTTCAT', beta)  #STE12 binding site
        print m
        m._print_ll()
        print "Against Ste12:"
        match = validate(m, "STE12", 'V', 'T')
        print "Against Fkh2:"
        fmatch = validate(m, "FKH2", 'V', 'T')
        print beta, match, fmatch
Esempio n. 24
0
 def freq_from_seqs_old(self,seqs):
     self.highestorder = 4
     for depth in range(1,6):
         nmersT = MotifTools.top_nmers(depth, seqs, "TUPLES")
         self.nmers_by_size[depth] = map(lambda x:x[0],nmersT)
         total = 0
         for nmer,count in nmersT:
             total = total + count
         for nmer,count in nmersT:
             rc = MotifTools.revcomplement(nmer)
             if nmer == rc:                       #correct top_nmers 
                 f   = float(count)/total         #palindrome count
             else:
                 f   = float(count)/total/2
             self.F[nmer] = f
             self.F[rc]   = f
     for depth in range(0):                       #For debugging
         total = 0
         for k in self.F.keys():
             if len(k) == depth:
                 total = total + self.F[k]
                 print k, self.F[k]
         print depth,total
Esempio n. 25
0
def alignAndCombineMotifs(motifs, weights):
    """Align motifs in order of decreasing |weight| and sum them.

    The (motif, weight) pairs are sorted by ascending absolute weight and
    then reversed.  The motifs are aligned in that order with
    ``alignSimilarMotifs(minoverlap=4)`` and combined by ``MotifTools.sum``,
    using the negated weights in the same order.
    """
    ranked = sorted(zip(motifs, weights), key=lambda pair: abs(pair[1]))
    # Reverse after sorting (rather than reverse=True) so that ties end up
    # in the same order as before.
    ranked.reverse()

    aligned = alignSimilarMotifs([pair[0] for pair in ranked], minoverlap=4)
    negated = [-pair[1] for pair in ranked]
    return MotifTools.sum(aligned, negated)
Esempio n. 26
0
def combine_distance_matrix_for_2(wdir, TAMO_file_1, TAMO_file_2):
    '''Combines matricies made from two TAMO files.
    
    This script is used to create the final matrix after all jobs from 
    create_cc_for_2 are complete.
    '''

    ml_1 = MotifTools.txt2motifs(TAMO_file_1)
    ml_2 = MotifTools.txt2motifs(TAMO_file_2)

    n_split_1 = len(ml_1) / 100
    n_split_2 = len(ml_2) / 100

    print n_split_1, len(ml_1)
    print n_split_2

    # Change to the working directory.
    os.system("cd %s" % wdir)
    os.chdir(wdir)

    # This loop will paste together matricies
    for i in range(n_split_1 + 1):
        com = "paste "
        for j in range(n_split_2 + 1):
            com += "%s_n%s-%s_n%s.dm " % (TAMO_file_1, i, TAMO_file_2, j)
        com += "> distance_%s" % i
        print com
        os.system(com)

    #
    com = "cat "
    for i in range(n_split_1 + 1):
        com += "distance_%s " % i
    com += "> %s-%s.dm" % (TAMO_file_1, TAMO_file_2)

    print com
    os.system(com)
Esempio n. 27
0
def parse_opts():
    global GLOBALS
    short_opts = 'm:g:'
    long_opts  = ['genome=','top=']
    try:   opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError:
        print getopt.GetoptError.__dict__
        usage()
    if not opts: usage()

    GLOBALS['args'] = args
    for opt,value in opts:
        if opt == '-m':                GLOBALS['motifs']     = MotifTools.txt2motifs(value)
        if opt in ['-g', '--genome']:  GLOBALS['genomefile'] = value
        if opt == '--top':             GLOBALS['top']        = int(value)
Esempio n. 28
0
def Read_Dreme_PSSM(lines):
    """Build a motif from the text of a DREME matrix.

    ``lines`` is a single string.  It is split on newlines and every line is
    turned into a row of floats (a line without fields gives an empty row).
    The rows are converted with ``MotifTools.toDict`` and passed to
    ``MotifTools.Motif_from_counts``, whose result is returned.
    """
    rows = [[float(item) for item in line.split()]
            for line in lines.split('\n')]
    return MotifTools.Motif_from_counts(MotifTools.toDict(rows))
Esempio n. 29
0
 def study_seqs(self,seqs):
     """Fill self.D with log2 n-mer frequencies (widths 1-5) from seqs.

     For each width, the counts from MotifTools.top_nmers are normalised by
     their total.  A palindromic n-mer stores log2(count/total).  Any other
     n-mer and its reverse complement both store log2(0.5*count/total).
     self.highestorder is set to 5.

     Relative to the original, a reverse-complement call whose result was
     never used, a logarithm computed but discarded on every pass, and a
     debugging loop over range(0) were removed; the stored values are
     unchanged.
     """
     log_of_2 = math.log(2)
     for depth in range(1,6):
         nmersT = MotifTools.top_nmers(depth, seqs, "TUPLES")
         total = 0
         for nmer,count in nmersT:
             total = total + count
         for nmer,count in nmersT:
             rc = MotifTools.revcomplement(nmer)
             if rc != nmer:
                 # Split the observed mass evenly between the two strands.
                 f_2 = math.log(0.5 * float(count)/total)/log_of_2
                 self.D[nmer] = f_2
                 self.D[rc]   = f_2
             else:
                 self.D[nmer] = math.log(float(count)/total)/log_of_2
     self.highestorder = 5
Esempio n. 30
0
 def freq_from_seqs(self,seqs):
     """Fill self.F with pseudocounted n-mer frequencies (widths 1-6).

     For each width, every n-mer from permute() starts with a pseudocount of
     1 and a copy of that list is stored in self.nmers_by_size.  Counts from
     MotifTools.top_nmers are added to the n-mer and to its reverse
     complement.  Each n-mer's share of the total is then stored in self.F
     for the n-mer and for its reverse complement.  self.highestorder is set
     to 6.
     """
     self.highestorder = 6
     for width in range(1,7):
         words = permute(width)
         counted = MotifTools.top_nmers(width,seqs,'with counts','purge Ns')
         self.nmers_by_size[width] = words[:]

         tally = {}
         total = 0.0
         for word in words:
             tally[word] = 1  # pseudocount
             total += 1

         for word, hits in counted:
             try:
                 partner = MotifTools.revcomplement(word)
                 tally[word] += hits
                 tally[partner] += hits
                 total += 2*hits
             except KeyError:
                 # Keys missing from the pseudocount table are skipped.
                 pass

         for word in tally.keys():
             partner = MotifTools.revcomplement(word)
             share = tally[word]/total
             self.F[word] = share
             self.F[partner] = share
Esempio n. 31
0
def Make_PWM_Motif(filename, motifBackGround=""):
	"""Build a motif from the PWM stored in *filename*.

	The file is parsed with ``Read_PWM``, which returns a name and a matrix.
	The matrix goes through ``MotifTools.toDict`` and then
	``MotifTools.Motif_from_ll``; the name is stored on the result as
	``source``.  ``motifBackGround`` is accepted but not used here.
	"""
	pwm_name, matrix = Read_PWM(filename)
	result = MotifTools.Motif_from_ll(MotifTools.toDict(matrix))
	result.source = pwm_name
	return result
Esempio n. 32
0
def averagemotifs(motifs,
                  ovlp=2,
                  template=None,
                  DFUNC=negcommonbitsrange,
                  VERBOSE=1,
                  prop=''):
    if not template:
        Dmat = computeDmat(motifs)
        idx = centroididx(Dmat)
        template = motifs[idx]

    for m in motifs:
        off, rc = minshortestoverhangdiff(template,
                                          m,
                                          OVLP(template, m),
                                          'want_offset',
                                          DFUNC=DFUNC)
        m.offset = off
        m.rc = rc
        #Find most negative offset
    offsets = [m.offset for m in motifs]
    offsets.sort()
    maxposs = [(m.offset + m.width) for m in motifs]
    maxposs.sort()
    minpos = -offsets[0]
    maxpos = maxposs[-1] + minpos
    pmotifs = []
    for m in motifs:
        if m.rc: _m = m.revcomp()
        else: _m = m
        leftpad = minpos + m.offset
        rightpad = maxpos - (leftpad + m.width)
        padded = _m[-leftpad, _m.width + rightpad]
        #print '%s%s%s\t%s'%('*'*leftpad,_m.oneletter,'*'*rightpad,padded)
        pmotifs.append(padded)
    AVE = MotifTools.sum(pmotifs, [])
    if VERBOSE:
        for m in pmotifs:
            d = minshortestoverhangdiff(AVE, m, OVLP(AVE, m), DFUNC=DFUNC)
            print '%s   %5.3f' % (m.oneletter, d),
            if m.__dict__.has_key('key'): print m.key,
            if prop and m.__dict__.has_key(prop): print m.__dict__[prop],
            print
        print '-' * m.width
    return AVE
Esempio n. 33
0
def showdiffXvert(motif, seq, OVLP_FCN=None, DIFF_FCN=None):
    '''
    The funtion converts the sequence to a Motif, computes the D
    of the best alignment, and prints the alignment that generated
    that D.
    '''
    MSOdiff = minshortestoverhangdiff
    if not OVLP_FCN: OVLP_FCN = lambda A, B: min(min(A.width, B.width) - 1, 7)
    bg = motif.background
    other = MotifTools.Motif_from_text(seq, bg=bg)
    ovlp = OVLP_FCN(motif, other)
    diff = MSOdiff(motif, other, ovlp, DFUNC=DIFF_FCN)
    offset, rcflag = MSOdiff(motif, other, ovlp, 'want_offset', DFUNC=DIFF_FCN)
    if rcflag: m = other.revcomp()
    else: m = other
    print 'MSOdiff:  %8.4f %s%s%s' % (diff, ' ' * 15, motif.oneletter, ' ' *
                                      (30 - motif.width))
    print '          %8s %s%s%s' % (' ', ' ' *
                                    (15 + offset), m.oneletter, ' ' *
                                    (30 - offset - other.width))
    return diff
Esempio n. 34
0
def main():
    fsa_fcn = up_and_no_N

    parse()

    FID = open(sys.argv[1])
    tokss = [x.strip().split(',') for x in FID.readlines()]
    FID.close()

    D = {}
    for expt,motif,score,source in tokss:
        print expt,motif
        if expt == 'Category': continue
        if motif == 'x': continue
        motif = MotifTools.Motif_from_text(motif)
        motif.kellis = float(score)
        motif.source = source
        try: D[expt].append(motif)
        except: D[expt] = [motif]

    for expt,motifs in D.items():
        root = expt
        ext  = 'cons'
        if root[0:3] == 'Rnd':
            num = re.sub('.*_','',root)
            if len(num) == 1:
                root = re.sub('_','_00',root)
            else:
                root = re.sub('_','_0',root)
            root = re.sub('Rnd','random_',root)
        outname = '%s.t%s'%(root,ext)
        print '%-18s  --> %s'%(root,outname)
        sys.stdout.flush()
        motifs2tamo(motifs,outname)
        try: 
            pass
            #tamo2tamo(filename,outname)
        except:
            print "Error: Could not convert %s [[ %s ]]"%(
                filename, outname)
Esempio n. 35
0
def memefiles2tamo(files, tamoname):
    global probefile, PROBESET

    motifs = []
    for filename in files:
        print ">>>SDFSD>F ", filename
        if re.search('\.ace$', filename):
            mdobject = AlignAce.AlignAce(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = filename.replace('.ace', '.fsa')
        elif re.search('\.meme.*$', filename):
            mdobject = Meme.Meme(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = re.sub('\..\.meme', '.meme',
                                            filename).replace('.meme', '.fsa')
        motifs.extend(mdobject.motifs)

    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in motifs:
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        if motif.E_site == None:
            motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif, probes, 'v')
        if motif.frac == None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if re.search('\.meme$', filename):
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        if 1 and (motif.CRA == None):
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,
                                                   probes,
                                                   'v',
                                                   tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except:
                pass

    if re.search('\.meme$', filename):
        mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church))

    MotifTools.save_motifs(motifs, tamoname)
Esempio n. 36
0
    def all_Wmers(self,N,seq):
        """Return background log-scores for every width-N window of *seq*.

        For the window starting at i the score is

            logbackground(seq)
              - [ logbackground(L + window + R) - logbackground(L) - logbackground(R) ]
              + sum over p of CP[window[p]] * (1 - self.mask[p])

        where L and R are the (up to ``highestorder``) letters flanking the
        window.  That is, the window's own contribution is removed from the
        whole-sequence background score, and a term is added back for the
        positions whose mask entry is not 1 (the "gap").

        The background of the complementary strand is taken as equal to that
        of the primary strand, so the returned list holds the
        len(seq)-N+1 forward scores followed by an identical copy.
        """
        Mlh = theMarkovBackground.highestorder
        Mlb = theMarkovBackground.logbackground
        MCP = theMarkovBackground.CP
        Fbg = Mlb(seq)                      # whole-sequence background score
        nmask = [1 - x for x in self.mask]  # weight of each position in the gap term

        forw = []
        for i in range(len(seq)-N+1):
            subseq = seq[i:i+N]
            lflank = seq[0:i][-Mlh:]
            rflank = seq[i+N:][0:Mlh]

            # Fast way: only the window plus its flanks needs re-scoring.
            logQdiff = Mlb(lflank + subseq + rflank) - Mlb(lflank) - Mlb(rflank)
            logQtot = Fbg - logQdiff

            # Add a bit back for intervening bases in the "gap".
            gapbg = 0
            for p in range(N):
                gapbg = gapbg + MCP[subseq[p]] * nmask[p]

            forw.append(logQtot + gapbg)

        # Reverse-strand scores are defined to equal the forward ones.
        return forw + forw
Esempio n. 37
0
def merge_runs(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''This script is used to merge motifs with the PCC matrix of all motifs.
    
    The script was originally written by Cheng Zou, and then converted to a 
    function by Alex Seddon.
    '''

    print "Here are the parameters you specified in this run "
    print "-tamo        %s" % TAMO_file
    print "-wdir        %s" % wdir
    print "-h        height to cut the tree, %s" % height
    print "-distance    %s" % distance
    print "-ancestor    %s" % ancestor
    print "-target    %s" % target
    print "-genome    %s" % genome

    if TAMO_file == '' or wdir == '':
        help()

    os.system("cd %s" % wdir)

    os.chdir(wdir)

    # This code was in the original clustering script. It has been taken out
    # because the processes involved take too long and have been replaced by
    # the matrix creation scripts and the run_UPGMA script.
    #if distance==0:
    #    os.system("python /mnt/home/seddonal/scripts/5_motif_merging/3.calculate_distance_matrix.py   -i %s --dfunc pccrange" % TAMO_file)
    #os.system("R --vanilla --slave --args %s.dm  %s< /mnt/home/seddonal/scripts/5_motif_merging/UPGMA_final.R> %s.Rout" % (TAMO_file,height,TAMO_file))

    cl_dic = {}
    n = 0

    # The file, TAMO_file.dm_UPGMA_Cl_0.05, is inorder of the motifs that appear
    # in the TAMO_file. If two motifs have the same number, they are considered
    # a part of the same cluster.
    # This loop pulls the clustering information out of this file and creats
    # the dictionary cl_dic = {cluster_index:{motif_index:'1'}}
    for line in open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r"):

        # Gets the clusterindex of this motif
        cl = line.strip()

        # Adds the cluster index if it has not been
        if not cl_dic.has_key(cl):
            cl_dic[cl] = {}

        cl_dic[cl][n] = "1"  # Adds the motif to that cluster
        n += 1  # Increases the motif index for the next motif

    #print cl_dic

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    # I think I can divide up this portion of the code to create a series
    print ancestor, ancestor == 0
    if ancestor == 0:

        # This loop Looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():

            print i, cl_dic[i]

            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:

                # Adds all of the motifs in the cluster to an object called
                # mlist.
                mlist = []
                for j in cl_dic[i]:
                    mlist.append(ml[j])

                # Saves these motifs to there own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))

                # I am fairly certain that this process of converting to TF and
                # then returning it to TAMO format is only for keeping the names
                # consistent. I need to verify this suspicion
                tamo2tf("%s_sub_%s.tm" % (TAMO_file, i))
                os.system("cat  %s_sub_%s.tm.tf > %s_sub_%s_sum.tm.tf" %
                          (TAMO_file, i, TAMO_file, i))
                tf2tamo("%s_sub_%s_sum.tm.tf" % (TAMO_file, i))

                # Gets the top motif in the cluster.
                pick_chunk_score(wdir,
                                 '%s_sub_%s_sum.tm.tf.tm' % (TAMO_file, i),
                                 target, genome)

                # Removes the files that were created.
                os.system("rm  %s_sub_%s_sum.tm.tf.tm" % (TAMO_file, i))
                os.system("rm %s_sub_%s_sum.tm.tf" % (TAMO_file, i))
                os.system("rm -R %s_sub_%s.tm.tf_ST*" % (TAMO_file, i))

            # If there is only one motif in the cluster, it leaves it alone,
            # And adds it to old
            else:
                key = cl_dic[i].keys()[0]
                old.append(ml[key])

    if ancestor == 1:

        # This loop Looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():

            print i, cl_dic[i]

            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:

                # Adds all of the motifs in the cluster to an object called
                # mlist.
                mlist = []
                for j in cl_dic[i]:
                    mlist.append(ml[j])

                # Saves these motifs to there own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))

                # Merges the motifs in the same cluster using STAMP
                tamo2tf("%s_sub_%s.tm" % (TAMO_file, i))

                # Gets the JASPER motifs that best match the motifs from within
                # the cluster.
                os.system(
                    "STAMP -tf  %s_sub_%s.tm.tf  -sd /home/chengzou/bin/STAMP/ScoreDists/JaspRand_PCC_SWU.scores  \
                 -go  1000 -ge 1000 -cc PCC -align SWU -out %s_sub_%s.tm.tf_STout -chp > %s_sub_%s.tm.tf_STout.log"
                    % (TAMO_file, i, TAMO_file, i, TAMO_file, i))
                parse_out_STAMP(TAMO_file, i)

                # combines the JASPER motifs with the cluster motif and then
                # converts them all to one TAMO file
                os.system(
                    "cat  %s_sub_%s.tm.tf %s_sub_%s.tm.tf_SToutFBP.txt.mod %s_sub_%s.tm.tf_STout_tree_clusters.txt > %s_sub_%s_sum.tm.tf"
                    % (TAMO_file, i, TAMO_file, i, TAMO_file, i, TAMO_file, i))
                tf2tamo("%s_sub_%s_sum.tm.tf" % (TAMO_file, i))

                # Gets the top motif within the TAMO file.
                pick_chunk_score(wdir,
                                 '%s_sub_%s_sum.tm.tf.tm' % (TAMO_file, i),
                                 target, genome)

                # Removes any files created in the processing.
                os.system("rm  %s_sub_%s_sum.tm.tf.tm" % (TAMO_file, i))
                os.system("rm %s_sub_%s_sum.tm.tf" % (TAMO_file, i))
                os.system("rm -R %s_sub_%s.tm.tf_ST*" % (TAMO_file, i))
            else:
                key = cl_dic[i].keys()[0]
                old.append(ml[key])

    # Combine together the top motifs from every
    os.system("cat %s_sub_*_sum.tm.tf.tm.TOP > %s_sub_new.tm" %
              (TAMO_file, TAMO_file))
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
    os.system("cat %s_sub_old.tm %s_sub_new.tm > %s_P1.tm" %
              (TAMO_file, TAMO_file, TAMO_file))
Esempio n. 38
0
# The last path component of the directory given on the command line names
# the gene list; the cluster file lives directly inside that directory.
genelist = argv[1].split('/')[-1]
allclusters = '%s/%s_allclusters.tamo' % (argv[1], genelist)
#print genelist

# Open output files for writing (both live in the "other" sub-directory)
oneletters = open('%s/other/%s_oneletter.tmp' % (argv[1], genelist), 'w')
symbols = open('%s/other/%s_symbols.tmp' % (argv[1], genelist), 'w')

# Define output variables
oneletterlist = []
symbolstring = '1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ*+.,:;!'

# Open list
motiflist = MotifTools.load(allclusters)

# One symbol is needed per motif: quit if there are more motifs than symbols
if len(motiflist) > len(symbolstring):
    raise ValueError("The cluster list is too long for sitemap.py")
# Otherwise keep only as many symbols as there are motifs
symbolstring = symbolstring[:len(motiflist)]

# Save symbol string in the symbols file and close that file
symbols.write(symbolstring)
symbols.close()

# Add oneletter summaries to the list
for num in range(len(motiflist)):
Esempio n. 39
0
    def EM_Cstart(self):
        """Seed the models, optimize each one with ``self.EM_C`` and collect candidates.

        Steps, in order:
          1. ``self.seed_models()`` is called and ``self.param['gamma']``
             defaults to 0.2 when absent.
          2. Every sequence in ``self.seqs`` is wrapped in a ``Probe`` and
             appended to ``self.probes``.
          3. For each model in ``self.models``: the mask is recomputed for the
             model width, per-probe background scores for that width are
             computed once (``self.all_Wmers`` + ``MDsupport.list2double``)
             and attached to each probe as ``c_wmerbgs``, then ``self.EM_C``
             is run.  A ``None`` result is skipped; otherwise the new model is
             annotated (seedtxt, source, seeddist, seednum), printed, and
             wrapped in a ``MotifCandidate`` appended to ``self.candidates``.
          4. When ``self.verbose`` is set, accumulated wall-clock timings per
             phase are printed.

        Progress lines for each seed are printed regardless of ``verbose``.
        """
        verbose = self.verbose
        if verbose:
            print "Seeding models..."
            sys.stdout.flush()
        self.seed_models()

        #Initialize parameters
        if not self.param.has_key('gamma'): self.param['gamma'] = 0.2
        # Accumulated seconds per phase; _time marks the start of the current phase.
        timings = {'Probes':0, 'Background':0, 'C EM':0, 'Post':0}
        _time = time.time()

        for seq in self.seqs:
            P = Probe(seq)
            self.probes.append(P)


        _time2 = time.time(); timings['Probes'] = _time2-_time; _time = _time2

        if verbose: print "Optimizing candidates by EM."
        if verbose: sys.stdout.flush()

        # Cache of per-probe background score sets, keyed by model width.
        c_logZ_sets = {}
        for Model,i in zip(self.models,range(len(self.models))):
            width = Model.width

            self.calcmask(width)
              
            # Background scores depend only on the width, so compute them
            # the first time a width is seen and reuse them afterwards.
            if not c_logZ_sets.has_key(width):
                c_logZs_set = []
                if verbose: print "#%s   |%s|"%(' '*28,'-'*len(self.seqs))
                if verbose: sys.stdout.flush()
                if verbose: print "Computing background (width %2d)  "%width,
                for P in self.probes:
                    if verbose: sys.stdout.write('.')
                    if verbose: sys.stdout.flush()
                    logZs = self.all_Wmers(width,P)
                    # NOTE(review): presumably converts the list to a C double array - confirm in MDsupport
                    c_logZs = MDsupport.list2double(logZs)
                    c_logZs_set.append(c_logZs)
                    #P.c_wmerbgs = MDsupport.list2double(map(lambda x: x.logQtot, Wlist))
                c_logZ_sets[width] = c_logZs_set
                if verbose: print

            # Attach the score set for this width to each probe.
            c_logZ_set = c_logZ_sets[width]
            for P,c_logZs in zip(self.probes,c_logZ_set):
                P.c_wmerbgs = c_logZs
                
            _time2 = time.time()
            timings['Background'] = timings['Background'] +_time2-_time
            _time = _time2


            '''Perform EM'''
            _time  = time.time()
            newModel = self.EM_C(Model, self.probes)
            _time2 = time.time(); timings['C EM'] = timings['C EM'] + _time2-_time; _time = _time2

            #print "cLL: ",newModel.joint
            #print "pLL: ",self.compute_joint(newModel,Wmers_by_seq)

            '''Was there a problem?'''
            if newModel == None:
                continue


            '''Set various things in PSSM'''
            #Distance(s)
            seeddist = MotifTools.infomaskdiff(newModel,Model)
            print '%s ----> %s'%(Model,newModel)
            print "Seed %2d: %s  -->  %s  mask:%9.5f  infoMask:%9.5f d:%9.5f"%(
                i, Model, newModel,
                MotifTools.maskdiff(newModel,Model),
                MotifTools.infomaskdiff(newModel,Model), #order is important
                Model-newModel)
            #Seed
            if Model.seedtxt: newModel.seedtxt = Model.seedtxt
            if Model.source:  newModel.source  = Model.source
            
            #newModel.denoise()
            newModel.seeddist = seeddist
            newModel.seednum  = i
            print newModel
            newModel._print_p()
            newModel._print_ll()

            '''Set various things in Candidate (like a wrapper for PSSM)'''
            C = MotifCandidate()
            C.pssm = newModel.copy()
            #C.wmers = self.best_by_Z(Wmers_by_seq)
            # Twenty sequences emitted by the new model stand in for its w-mers.
            C.wmers  = [newModel.emit() for junk in range(20)]
            #C._update()  #MAJOR REMOVAL????????? DBG 10-14-03
            #C.MAPpurge()
            C.pssm = newModel.copy()  
            self.candidates.append(C)
            _time2 = time.time(); timings['Post'] = timings['Post']+_time2-_time;_time = _time2

        '''Print Timing Information'''
        if verbose:
            print "# Timing Information"
            # _t is the total over all phases, used for the percentage column.
            _t = 0
            for timing in timings.keys():
                _t = _t + timings[timing]
            for timing in timings.keys():
                print "# %12s %f  %f%%"%(timing,timings[timing],timings[timing]*100/_t)
Esempio n. 40
0
# TAMOify kmers and logify pVals
for i in range(len(testMotifs)):
    testMotifs[i] = (Motif(testMotifs[i][0]),numpy.log10(float(testMotifs[i][1])))
    
# Sort on log'd pVals
testMotifs.sort(key=lambda x: x[1])

comboMotifs = []

for i in range(0,int(len(testMotifs)*0.2)):
    simMotifs  = getKmersWithOneMisMtch(testMotifs[i][0],testMotifs) 
    alndMotifs = alignSimilarMotifs([x[0] for x in simMotifs])
    #for m in simMotifs:
        #print m[0].oneletter
    comboMotifs.append(MotifTools.sum(alndMotifs,[-x[1] for x in simMotifs])) # -x[1] to convert neg logs to pos weights
    print len(comboMotifs)

t2 = time.time()    

oFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.gte2x.5-16mers.shfSeq.3.gGEMS.tmo'
pFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.gte2x.5-16mers.shfSeq.3.gGEMS.pkl'
MotifTools.save_motifs(comboMotifs,oFile,kmer_count=60)

pFile = open(pFile, 'w')
cPickle.dump(comboMotifs,pFile)
t3 = time.time()    
print 'Calculations took %.3f min.\nWriting/Pickling took %.3f min.' % ((float(t2)-t1)/60, (float(t3)-t2)/60) 
    

Esempio n. 41
0
 def has_wmer(self,wmer):
     """Return 1 if *wmer* or its reverse complement is in ``self.wmers``, else 0."""
     partner = MotifTools.revcomplement(wmer)
     known = self.wmers
     return int(wmer in known or partner in known)
Esempio n. 42
0
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:i:", ["help", "output="])  # AD added 'i'
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)
        

    print "#" + ' '.join(sys.argv)
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.75) # AD changed thresh val to 0.75 from 0.7
    ambigs = []

    scale   = 50.0 / 1000.0
    
    motifs = []
    for opt, value in opts:
        #print opt, value
        if   opt ==  '-f':  fastafile = value
        elif opt ==  '-m':  motifs.extend(MotifTools.txt2motifs(value))
        elif opt ==  '-n':  motifnums = [int(x) for x in value.split(',')]
        elif opt ==  '-L':  labels    = list(value)
        elif opt ==  '-t':  thresh    = float(value)
        elif opt ==  '-a':  ambigs.extend(value.split(','))
        elif opt ==  '-S':  scale     = float(value)
        elif opt ==  '-i':  motiffile = value  # AD added this option to ACTUALLY supply the tamo motif file at the command-line.  The code to deal with motiffiles already existed. There was just no code for User to supply one.
        
    probes = Fasta.load(fastafile)
    
    if motiffile:
        for f in motiffile.split(','):      # AD added this to allow supplying multiple tamo files at the prompt like you can supply multiple motifs
            motifs.extend(MotifTools.load(f))
    if ambigs:
        for ambig in ambigs:
            motifs.append( MotifTools.Motif_from_text(ambig,0.1) )
    if not motifnums:  motifnums = range(len(motifs))
    print '# %d: %s'%(len(motifs),motifnums)
    for i in range(len(motifnums)):
        motif = motifs[motifnums[i]]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d'%i
        print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh)

    probehits = {}
    for key in probes.keys():
        hits_by_motif = []
        save_flag     = 0
        if re.search('[BDHU]',probes[key]): continue
        for num in motifnums:
            result = motifs[num].scan(probes[key],thresh*motif.maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key]=hits_by_motif

    #scale   = .1
    maxw = 40
    for key in probehits.keys():
        l       = len(probes[key])
        a       = list('-'* int(scale*l) )
        a.extend( list(' '*10 ) )
        desc    = []
        matches = probehits[key]
        for i in range(len(matches)):
            if matches[i]:
                subseqs,endpoints,scores = matches[i]
                for idx in range(len(subseqs)):
                    start,stop = endpoints[idx]
                    subseq     = subseqs[idx]
                    score      = scores[idx]
                    if labels and (i<len(labels)): ID = labels[i]
                    else                         : ID = '%d'%i
                    desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score))
                    start = int(start*scale)
                    for offset in range(10):
                        if a[start+offset] == '-':
                            if labels and (i < len(labels)):
                                a[start+offset] = labels[i]
                            else:
                                a[start+offset] = '%d'%i
                            break
        print '%-14s %s'%(key,''.join(a)),
        print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc])
        
    print
    print "Found matches in %d of %d input probes"%(len(probehits),len(probes))
Esempio n. 43
0
def main():
    if len(sys.argv) < 2:
        print "Usage: %s <fasta_file> [width = None ] [options]"%(re.sub('^.*/','',sys.argv[0]))
        print "Options include:"
        print ""
        print " EM Parameters:"
        print "                  -beta    [0.01]   Beta for pseudocounts"
        print "                  -seedbeta[0.02]   Beta for pseudocounts for seeds from text"
        print "                  -gamma   [0.2]    Gamma (fraction of sequences)"
        print "                  -delta   [0.001]  Convergence criteria"
        print " "
        print " Seeds (not actually proper priors)"
        print "                  -prior            Seqences or motifs for seeds (may be repeated)"
        print "                  -top N   [0]      Include w-mers in top N probes"
        print "                  -gap    string    sample gapped motifs"
#       print "                  -TF               Seed with (all) TRANSFAC PSSMs (buggy)"
        print "                  -kmerseeds        Use kmers with best enrichment score as seeds for EM"
        print "                  -pad              add NN..NN to seed"
        print " "
        print " Genome / Background model "
        print "                  -human (250,1000) Use Human Background model"
        print "                  -g genome.fsa     Use specicied Fasta file as background (searches first for matching frequency file)"
#       print "                  -Y2K, -Y5C        Use Yeast Upstream Intergenic regions (2000, 500)"
#       print "                  -B                Use Bacterial Orfs"
        print " " 
        print "Examples:"
        print " %s t.fsa 5 -prior GGGTA -prior AAAAAC "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC"
        print 
        print " %s t.fsa 5 -info CUP9.info -gamma 0.5 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with Enriched seeds in CUP9.info, with"
        print "   Gamma expectation of 50% of all probes"
        print 
        print " %s t.fsa -prior MCM1_5.tamo:0 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 0th motif of the file MCM1_5.tamo"
        print "   as a seed"
        print 
        sys.exit(1)
    fastafile = sys.argv[1]

    #Echo the command line
    print "#" + ' '.join(map(lambda x: re.sub(' ','\ ',x), sys.argv))

    if sys.argv[2].isdigit():
        width = sys.argv[2]
    else: width = None
    
    algorithm = ''
    beta      = ''
    seedbeta  = ''
    deltamin  = ''
    gamma     = 0.2
    infofile  = ''
    seedmodels= []
    species   = 'YEAST'
    valid_tfs = [] #NOT USED
    gapped_syl= None
    gapflank  = 0
    gapweight = 0.2
    enrichfact= 0.7
    pmax      = 0  #False
    TFSEEDS   = 0
    TFMids    = []
    pad       = None
    bgfile    = None

    seed_count = 0   #Default: Take the top 0
    seed_s     = []  #Initialize seq array

    '''Parse command-line arguments'''
    for tok,i in zip(sys.argv,xrange(len(sys.argv))):
        if   tok == '-top'   :   seed_count = int(sys.argv[i+1])
        elif tok == '-greedy':   algorithm  = "GREEDY"
        elif tok == '-prior' :   seed_s.append(sys.argv[i+1])
        elif tok == '-beta'  :   beta       = float(sys.argv[i+1])
        elif tok == '-seedbeta': seedbeta   = float(sys.argv[i+1])
        elif tok == '-gamma' :   gamma      = float(sys.argv[i+1])
        elif tok == '-delta' :   deltamin   = float(sys.argv[i+1])
        elif tok == '-kmerseeds' :   infofile   = 1
        elif tok == '-valid' :   valid_tfs.append(sys.argv[i+1]) #NOT USED
        elif tok == '-w'     :   width      = sys.argv[i+1]
        elif tok == '-width' :   width      = sys.argv[i+1]
        elif tok == '-gap'   :   gapped_syl = sys.argv[i+1]
        elif tok == '-gapflank' :gapflank   = int(sys.argv[i+1])
        elif tok == '-gapweight':gapweight  = float(sys.argv[i+1])
        elif tok == '-enrichfact':enrichfact= float(sys.argv[i+1])
        elif tok == '-pmax'  :   pmax       = 1
        elif tok == '-Y2K'   :   species    = "YEAST_2000_UP"
        elif tok == '-Y5C'   :   species    = "YEAST_500_UP"
        elif tok == '-B'     :   species    = "BAC_ORF"
        elif tok == '-Ch22'  :   species    = "Ch22"
        elif tok == '-genome':   species    = sys.argv[i+1]
        elif tok == '-pad'   :   pad        = "TRUE"
        elif tok == '-bgfile':   bgfile     = sys.argv[i+1]
        elif tok == '-TF'    :  #NOT USED (TRANSFAC NOT SUPPLIED WITH DISTRIBUTION)
            TFSEEDS = 1
            for j in range(i+1,len(sys.argv)):
                if re.match('M0',sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human' :
            _s = ''
            if sys.argv[i+1].isdigit(): _s = '_'+sys.argv[i+1]
            else:                       _s = ''
            species    = 'HUMAN'+_s

    if infofile: infofile = fastafile

    if bgfile:
        EM.loadMarkovBackground(bgfile)
    elif not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(species)
    else:
        EM.theMarkovBackground = EM.Zeroth()

    fsaD     = Fasta.load(fastafile)
    Fasta.delN(fsaD)
    seqs     = fsaD.values()
    probes   = fsaD.keys()
    all_seqs = seqs
    seed_s.extend(seqs[0:min(seed_count,len(seqs))])

    if infofile and width=='info':
        width = info2width(infofile)
    elif width != None:
        width = int(width)

    #Alternate source of seeds
    if infofile:
        if 1 or width:
            seedmodels.extend(info2seeds(width,infofile,fastafile,species))
        else:
            print 'Error: need to specify motif width w/ .info file'
    
    #Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)

    #Should we get seeds from TRANSFAC?
    if TFSEEDS: #NOT USED
        tf = []
        D  = tfmats()
        if not TFMids:
            keys = D.keys()
        else:
            keys = []
            for TFMid in TFMids:
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*','',key.split()[0]))
            m.seedtxt = '%-24s %s'%(m,key)
            tf.append(m)
        tf.sort(lambda x,y: cmp(x.seednum,y.seednum))
        seedmodels.extend(tf)
        #seedmodels.append(tf[33])

    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        gapped_priors = map(lambda x:'N'+x+'N', gapped_priors)
        seed_s.extend(gapped_priors)

    if pad:
        print '# Padding models with NN-m-NN'
        newmodels = []
        left  = MotifTools.Motif_from_text('@')
        right = MotifTools.Motif_from_text('N')
        for m in seedmodels:
            newmodels.append(left + m + right)
            print left + m + right
        seedmodels = newmodels

    '''
    Set everything up and GO!!
    '''
    global theEM
    theEM = EM.EM(seed_s,[],width,"VERBOSE")
    if beta:     theEM.beta     = beta
    if deltamin: theEM.deltamin = deltamin
    if seedbeta: theEM.seedbeta = seedbeta
    theEM.param['gamma']        = gamma
    theEM.seqs.extend(all_seqs)
    theEM.models    = seedmodels
    theEM.gapflank  = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart()    #GO!!

    #print "#Sorting candidates"
    #sys.stdout.flush()
    #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP))


    '''
    Compute some metrics
    '''
    print "#Loading Genome %s"%species ; sys.stdout.flush()
    Genome = ProbeSet(species,enrichfact)
    ids    = Genome.ids_from_file(fastafile)
    
    for C in theEM.candidates:
        if not pmax:
            C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose')
            C.pssm.church = Genome.church(C.pssm,ids)
            C.pssm.frac   = Genome.frac(C.pssm,probes,None,0.7)
        else:
            (p,frac) = Genome.best_p_value(C.pssm,ids)
            C.pssm.pvalue    = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print "Bests:",p,frac

        matching             = Genome.matching_ids(C.pssm,[],factor=0.7)
        matchbound           = [x for x in matching if x in probes]
        C.pssm.numbound      = len(probes)
        C.pssm.nummotif      = len(matching)
        C.pssm.numboundmotif = len(matchbound)
        sys.stdout.flush()

    
    '''
    Print out all motifs (sorted by Enrichment) in an AlignACE-like form
    '''

    theEM.candidates.sort(lambda x,y: cmp(x.pssm.pvalue,y.pssm.pvalue))
    for C,i in zip(theEM.candidates,range(len(theEM.candidates))):
        C.pssm.maxscore = -100  #May have side effects.  Recompute when done
        if C.pssm.valid:  #NOT USED
            _t = C.pssm.valid
            if not _t[0]:
                vstring = "(--- %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
            else:
                vstring = "(HIT %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
        else:
            vstring = ''
        C.pssm._maxscore()     #Recomputed

        MotifTools.print_motif(C.pssm,20,i)
        sys.stdout.flush()
        continue
    
        #Antiquated stuff  -- Remove !!
        print "Log-odds matrix for Motif %3d %s"%(i,C)
        C.pssm._print_ll()
        print "Sequence Logo"
        C.pssm._print_bits()
        flush()
        #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9))
        flush()
        m = C.pssm
        if not m.__dict__.has_key('gamma'):  m.gamma = None #Kludge to deal w/ old shelves
        if m.seedtxt:     print "Seed: %3d %s"%(i,m.seedtxt)
        if m.source:      print "Source: ",m.source
        if m.gamma:       print "Gamma: %7.5f"%m.gamma
        if m.threshold:   print "Threshold: %5.2f"%m.threshold
        #if C.pssm.seedtxt:
        #    print 'Seed  %3d %-25s'%(i,C.pssm.seedtxt)
        if C.pssm.church != None: vstring = 'ch: %5.2f  %s'%(
            math.fabs(math.log(C.pssm.church)/math.log(10)), vstring)
        print "Motif %3d %-25s  nlog(p): %6.3f  %s"%(i,C,-math.log(C.pssm.pvalue)/math.log(10),vstring)
        if C.pssm.threshold:
            print "Threshold: %6.3f  %4.1f%%"%(
                C.pssm.threshold, 100.0*C.pssm.threshold/C.pssm.maxscore)
            

        C.pssm.maxscore = -1e100  #May have side effects.  Recompute when done
        for seq in C.wmers:
            print seq,i,C.pssm.scan(seq)[2][0]
        C.pssm._maxscore()      #Recomputed
        print '*'*len(seq)
        print "MAP Score: %f"%C.MAP
        sys.stdout.flush()
    sys.stdout.flush()
    sys.exit(0) #Avoid ridiculous python cleanup times
Esempio n. 44
0
def merge_runs_cc(TAMO_file, wdir, height, distance, ancestor, target, genome):
    '''This script is used to merge motifs with the PCC matrix of all motifs.
    
    The script was originally written by Cheng Zou, and then converted to a 
    function by Alex Seddon.
    '''

    print "Here are the parameters you specified in this run "
    print "-tamo        %s" % TAMO_file
    print "-wdir        %s" % wdir
    print "-h        height to cut the tree, %s" % height
    print "-ancestor    %s" % ancestor
    print "-target    %s" % target
    print "-genome    %s" % genome

    if TAMO_file == '' or wdir == '':
        help()

    os.system("cd %s" % wdir)

    os.chdir(wdir)

    # Get the directory where the script is located.
    script_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])

    # This code was in the original clustering script. It has been taken out
    # because the processes involved take too long and have been taken up by
    # the matrrix creation scripts and the run_UPGMA script.
    #if distance==0:
    #    os.system("python /mnt/home/seddonal/gil scottscripts/5_motif_merging/3.calculate_distance_matrix.py   -i %s --dfunc pccrange" % TAMO_file)
    #os.system("R --vanilla --slave --args %s.dm  %s< /mnt/home/seddonal/scripts/5_motif_merging/UPGMA_final.R> %s.Rout" % (TAMO_file,height,TAMO_file))

    cl_dic = {}
    n = 0

    # The file, TAMO_file.dm_UPGMA_Cl_0.05, is inorder of the motifs that appear
    # in the TAMO_file. If two motifs have the same number, they are considered
    # a part of the same cluster.
    # This loop pulls the clustering information out of this file and creats
    # the dictionary cl_dic = {cluster_index:{motif_index:'1'}}
    for line in open("%s.dm_UPGMA_Cl_%s" % (TAMO_file, height), "r"):

        # Gets the clusterindex of this motif
        cl = line.strip()

        # Adds the cluster index if it has not been
        if not cl_dic.has_key(cl):
            cl_dic[cl] = {}

        cl_dic[cl][n] = "1"  # Adds the motif to that cluster
        n += 1  # Increases the motif index for the next motif

    #print cl_dic

    ml = MotifTools.txt2motifs(TAMO_file)
    old = []  # List of motifs that are the sole members of a cluster.

    # I think I can divide up this portion of the code to create a series
    print ancestor, ancestor == 0

    cc_output = open('merge_runs_cc', 'w')

    if ancestor == 0:

        # This loop Looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():

            print i, cl_dic[i]

            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:

                # Adds all of the motifs in the cluster to an object called
                # mlist.
                mlist = []
                for j in cl_dic[i]:
                    mlist.append(ml[j])

                # Saves these motifs to there own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))

                cc_output.write(
                    'module load TAMO; python %s/pcc_merge_CC.py merge_runs_no_ancestor -t %s/%s -i %s -target %s -genome %s\n'
                    % (script_dir, wdir, TAMO_file, i, target, genome))

            # If there is only one motif in the cluster, it leaves it alone,
            # And adds it to old
            else:
                key = cl_dic[i].keys()[0]
                old.append(ml[key])

    if ancestor == 1:

        # This loop Looks at each cluster and attempts to merge the motifs
        # in the cluster if there are multiple motifs.
        for i in cl_dic.keys():

            print i, cl_dic[i]

            # If there are multiple motifs in the cluster, it merges the motifs
            if len(cl_dic[i]) > 1:

                # Adds all of the motifs in the cluster to an object called
                # mlist.
                mlist = []
                for j in cl_dic[i]:
                    mlist.append(ml[j])

                # Saves these motifs to there own TAMO file.
                save_motifs(mlist, "%s_sub_%s.tm" % (TAMO_file, i))

                cc_output.write(
                    'module load TAMO; module load STAMPmotif; python %s/pcc_merge_CC.py merge_runs_ancestor -t %s/%s -i %s -target %s -genome %s\n'
                    % (script_dir, wdir, TAMO_file, i, target, genome))

            else:
                key = cl_dic[i].keys()[0]
                old.append(ml[key])

    # Combine together the motifs that are in there own cluster.
    #os.system("cat %s_sub_*_sum.tm.tf.tm.TOP > %s_sub_new.tm" % (TAMO_file,TAMO_file))
    save_motifs(old, "%s_sub_old.tm" % (TAMO_file))
Esempio n. 45
0
def combine_distance_matrix(wdir, TAMO_file):
    '''Combines the PCC score matricies and outputs them as a single matrix.
    
    Originaly written by Cheng Zou, and converted to a function by Alex Seddon.
    '''
    ml = MotifTools.txt2motifs(TAMO_file)
    n_split = len(ml) / 100
    ##
    # Change to the working directory.
    os.system("cd %s" % wdir)
    os.chdir(wdir)
    #
    ##
    ##
    # The following loop keeps counts the number of lines in the each of the
    # PCC matricies for a comparison of a TAMO file with itself.
    lendic = {}  # Dictionary with the length of PCC matricies.
    for i in range(n_split + 1):
        lendic[i] = line_count("%s_n%s.dm" % (TAMO_file, i))
    print lendic
    #
    ##
    ##
    # This loop creates files with blanks. The files are used to ensure that
    # the PCC-distance matrix is square. The blank files will be created to take
    # the place of files that would have been left blank
    for i in range(n_split + 1):
        for j in range(0, i):
            # open the file to add blanks
            oup = open("%s_n%s-%s_n%s.dm" % (TAMO_file, i, TAMO_file, j), "w")
            print lendic[j], lendic[i]
            list = []
            # Add a number of "-" to the list equal to the number of lines in
            # the self comparison files.
            for y in range(lendic[j]):
                list.append("-")
            for x in range(lendic[i]):
                oup.write("%s\n" % "\t".join(list))
            oup.close()
    #
    ##

    ##
    # Creates a copy of the self comparison file so that it can be easily picked
    # out by the function.
    for i in range(n_split + 1):
        os.system("cp %s_n%s.dm %s_n%s-%s_n%s.dm" %
                  (TAMO_file, i, TAMO_file, i, TAMO_file, i))
    #
    ##

    ##
    # This loop will look at each
    for i in range(n_split + 1):
        com = "paste "
        for j in range(n_split + 1):
            com += "%s_n%s-%s_n%s.dm " % (TAMO_file, i, TAMO_file, j)
        com += "> distance_%s" % i
        print com
        os.system(com)

    com = "cat "
    for i in range(n_split + 1):
        com += "distance_%s " % i
    com += "> %s.dm" % TAMO_file

    print com
    # Concatonate all the matricies
    os.system(com)
    # My embarisingly ad hoc way of removing double tabs
    remove_double_tabs("%s.dm" % TAMO_file)
Esempio n. 46
0
#!/usr/bin/python
'''
This opens a general TAMO cluster list and writes **TO STANDARD OUT** the probability matrices of all
the motifs in it, each preceded by a line with the name of its cluster. *It is recommended to use it in a
bash pipeline where standard out can be redirected into a file.*

Has 1 argument:
- motiflist: a TAMO motif list that will be output

Returns:
- A series of strings that represent the probability matrices of all motifs in the input list

Author: Hector Galvez
'''

from sys import argv
from TAMO import MotifTools

# Open list
motiflist = MotifTools.load(argv[1])

# Start printing information for each motif
for num in range(len(motiflist)):
    print '>Cluster_' + str(num + 1)
    motiflist[num]._print_p()

Esempio n. 47
0
# Pool the motifs of every algorithm into one general list
genlist = []
for algorithm_motifs in (seederlist, memelist, weederlist):
    genlist.extend(algorithm_motifs)

# Cluster the pooled motifs, then average each cluster
clusterinf = clusterinfo(genlist)
averages = clusteravg(genlist, clusterinf)

# Trimming of the final average list is disabled
# averages = trim(averages,0.5)
# print clusterinf

# Store the cluster averages as a new motif list
MotifTools.save_motifs(averages, tamooutput)

# WEBLOGO IMAGE GENERATION
# One gif logo per average motif, numbered from 1
for index, average in enumerate(averages):
    number = str(index + 1)
    cluster = 'Cluster ' + number
    clustergif = argv[1] + '/other/cluster' + number
    average.giflogo(clustergif, title=cluster, scale=2)

# SUMMARY REPORT
# Determine location of the markdown file for the summary report
reportout = open(str(argv[1] + '/final/' + listname + '_cluster_report.md'), 'w')

# Write the header of the report
rundate = date.today()
header = "# Summary report for `" + listname + "`\nThis analysis was run on: " + str(rundate) + \
Esempio n. 48
0
def GetKmerFromMotifFromPWM(pwm, seq):
    """Return ``bestscanseq(seq)`` of the motif built from *pwm*.

    *pwm* is converted with ``MotifTools.toDict`` and the result is handed to
    ``MotifTools.Motif_from_counts`` to build the motif that scans *seq*.
    """
    counts = MotifTools.toDict(pwm)
    return MotifTools.Motif_from_counts(counts).bestscanseq(seq)
#
# Compare motifs in tamo format
#

from   TAMO              import MotifTools
from   TAMO.MotifMetrics import ProbeSet
from   TAMO.Clustering   import MotifCompare
from   TAMO.Clustering   import Kmedoids
import sys
import pickle
import pprint


file_unknown = sys.argv[1]# Unknown
file_tfbs = sys.argv[2]# TF db
motifs_unknown = MotifTools.load(file_unknown) 
motifs_tfbs = MotifTools.load(file_tfbs) 

match_dict = {}
for unknown in motifs_unknown:
  tf_list = []
  for tfbs in motifs_tfbs:
    #print 
    #print "Comparing motifs:"
    #print "    %s  vs  %s" % (unknown.source, tfbs.source)
    #print "    Unknown motif ( %s ) vs TFBS ( %s ) " % (unknown, tfbs)
    #print
    joined_motifs = []
    joined_motifs.append(unknown)
    joined_motifs.append(tfbs)
    print joined_motifs
Esempio n. 50
0
def probOvlp(A,B,thresh=0.7,verbose=None):
    if A.width >= B.width:
        Wide, Narrow = A, B
    else:
        Wide, Narrow = B, A

    RC = MotifTools.revcomplement
    if 1:
        newWide  = Wide[-1,Wide.width+1]
        if Wide.__dict__.has_key('bestWide'):
            bestWide = Wide.bestWide
        else:
            bestWideD = {}
            for x in newWide.bestseqs(thresh*newWide.maxscore):
                bestWideD[x] = 1
            for x in bestWideD.keys():
                bestWideD[RC(x)] = 1
            Wide.bestWide = bestWideD.keys()
            bestWide = Wide.bestWide
        Wide = newWide
    
        if Narrow.__dict__.has_key('bestNarrow'):
            bestNarrow = Narrow.bestNarrow
        else:
            bestNarrowD = {}
            for x in Narrow.bestseqs(thresh*Narrow.maxscore):
                bestNarrowD[x] = 1
            for x in bestNarrowD.keys():
                bestNarrowD[RC(x)] = 1
            bestNarrow = bestNarrowD.keys()
            Narrow.bestNarrow = bestNarrow
        
    #bestWide   = [x[1] for x in Wide.bestseqs  (thresh*Wide.maxscore)  ]
    #bestNarrow = [x[1] for x in Narrow.bestseqs(thresh*Narrow.maxscore)]

    countNarrow = len(bestNarrow)
    countWide   = len(bestWide)

    numtotal    = math.pow(4,Wide.width)
    fudgefactor = math.pow(4,Wide.width - Narrow.width)

    bestWideTups = [(x,MotifTools.revcomplement(x)) for x in bestWide]

    countBoth = 0
    for i in range(len(bestNarrow)):
        m_narrow = bestNarrow[i]
        delj = []

        for j in range(len(bestWideTups)):
            if (bestWideTups[j][0].find(m_narrow) >= 0) or (bestWideTups[j][1].find(m_narrow) >= 0):
                countBoth += 1
                delj.append(j)

        delj.reverse()  #Chew in from the back
        for j in delj:
            del(bestWideTups[j])


    if verbose: print '%10d %10d %10d %10d | %10d  %5d '%(
        countWide, numtotal, countNarrow *fudgefactor , countBoth , countNarrow, Wide.width - Narrow.width),
    
    p = Arith.hypgeomsummore(countWide,                 #Num Interesting
                             numtotal,                  #All k-mers
                             countNarrow * fudgefactor, #Number picked
                             countBoth                ) #Number found
    return p

import os,sys,string
from   TAMO              import MotifTools
from   TAMO.seq          import Fasta
from   TAMO.MotifMetrics import ProbeSet

promoters = ProbeSet(sys.argv[1])
geneset_ids = open(sys.argv[2]).read().split('\n')[:-1]
match_ids = []
prom_ids = promoters.probes.keys()
for id in geneset_ids:
  if id in prom_ids:
    match_ids.append(id)

motifs = MotifTools.load(sys.argv[3])
church = 0.05
rocauc = 0.1
pvalue = 0.05

print "Name\tMotif\tChurch\tRoc-auc\tP-value"
for m in motifs:
  m.church   = promoters.church  (m, match_ids)
#  m.ROC_auc  = promoters.ROC_AUC (m, match_ids)
  m.pvalue   = promoters.p_value (m, match_ids)
  if m.church <= church and m.pvalue <= pvalue:
    print "%s\t%s\t%s\t%s" %\
    (m.source, m, m.church, m.pvalue) 


Esempio n. 52
0
from gusPyCode.MDAP_proj.MDAP_defs import alignAndCombineMotifs
from TAMO import MotifTools

Motif = MotifTools.Motif

outFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6PlusCombos.motifs.stdThresh.tmo'

m = MotifTools.load('/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6.motifs.stdThresh.tmo')
w = [5.8952,
     5.6523,
     5.0585,
     4.9788,
     4.9678,
     4.7688]

toTmo = []
toTmo.append(alignAndCombineMotifs([m[0],m[1]],[w[0],w[1]]))
toTmo.append(alignAndCombineMotifs([m[0],m[4]],[w[0],w[4]]))
toTmo.append(alignAndCombineMotifs([m[1],m[4]],[w[1],w[4]]))
toTmo.append(alignAndCombineMotifs([m[2],m[3]],[w[2],w[3]]))
toTmo.append(alignAndCombineMotifs([m[2],m[5]],[w[2],w[5]]))


for e in toTmo:
    print e.oneletter

MotifTools.save_motifs(m+toTmo,outFile)    
    
None
Esempio n. 53
0
# Convert the motifs of a MEME output file (first command-line argument)
# into TAMO motifs and save them as "MEME_motifs_<name>.tamo".
filename = sys.argv[1]
with open(filename) as meme_file:
    motif_list = meme_file.read().split('\nMOTIF')[1:]
tamo_list = []
motif_counter = 1

# The matrix header looks like "alength= 4 w= 8 nsites= 20 E= ...".  The
# probabilities are turned into counts with the number of sites (nsites),
# not with the motif width (w).
nsites_pat = re.compile(r"nsites=\s*([0-9]+)")

for motif in motif_list:
  m_info1, m_info2 = motif.split('letter-probability matrix: ')
  # The matrix ends at the first line of 80 dashes
  m_mat = m_info2.split('--------------------------------------------------------------------------------', 1)[0]
  m_mat_header, m_prob_mat = m_mat.split('\n', 1)
  nsites_found = nsites_pat.search(m_mat_header)
  if nsites_found is None:
    raise ValueError("no 'nsites=' field in matrix header: %r" % m_mat_header)
  nsites = int(nsites_found.group(1))
  count_mat = []
  for count in m_prob_mat.split('\n')[:-1]:
    sites = [float(i) for i in count.split()]
    if not sites:
      continue  # blank line inside the matrix section
    # round() before int(): products such as 0.29 * 100 come out as
    # 28.999... and would be truncated to 28
    count_dict = {'A': int(round(sites[0] * nsites)),
                  'C': int(round(sites[1] * nsites)),
                  'G': int(round(sites[2] * nsites)),
                  'T': int(round(sites[3] * nsites))}
    count_mat.append(count_dict)
  m = MotifTools.Motif_from_counts(count_mat)
  m.source = "Motif%s | %s" % (motif_counter, m_mat_header)
  tamo_list.append(m)
  motif_counter += 1

MotifTools.save_motifs(tamo_list, "MEME_motifs_%s.tamo" % filename.split('.')[0])

Esempio n. 54
0
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    """Score every sequence of a Fasta file against every motif and save the matrix.

    For each motif loaded from *motif*, log2 of the zeroth-order background
    frequency of each letter is subtracted from the motif's logP values (the
    motif's logP is modified in place) and a new motif is built from the
    result with ``MotifTools.Motif_from_ll``.  Each upper-cased sequence is
    then scored with ``bestscore`` and rescaled to
    ``(score - minscore) / (maxscore - minscore)``.

    The scores are written to *outfile* with ``np.savetxt`` using the format
    ``%1.3f``: one row per motif, one column per sequence, the sequences in
    the order returned by ``Fasta.load(...).values()``.

    Parameters:
      fsa      Fasta file with the sequences to score
      motif    motif file, loaded with ``MotifTools.load``
      outfile  path of the text file that receives the score matrix
      genome   'hg18' selects the hg18 promoter Markov file; any other value
               selects the mouse Markov file
    """
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    m = MotifTools.load(motif)

    # The background is the same for every motif, so the Markov file is
    # loaded once here and not again inside the loop.
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()

    F = Fasta.load(fsa, key_func=lambda x: x)
    # Upper-case each sequence once; every motif scores the same strings
    seqs = [seq.upper() for seq in F.values()]
    SCORES = np.zeros((len(m), len(seqs)), dtype='float')

    for i, M in enumerate(m):
        # Background-adjust the log-probabilities (in place)
        ll = M.logP
        for pos in ll:
            for letter in pos.keys():
                pos[letter] = pos[letter] - math.log(
                    bg[letter]) / math.log(2.0)
        AM = MotifTools.Motif_from_ll(ll)
        mi, ma = AM.minscore, AM.maxscore

        # Best score of the motif on each sequence, rescaled by the motif's
        # own score range
        for j, seq_fwd in enumerate(seqs):
            SCORES[i, j] = (AM.bestscore(seq_fwd) - mi) / (ma - mi)

    np.savetxt(outfile, SCORES, fmt='%1.3f')